;*****************************************************************************
;* pixel.asm: x86 pixel metrics
;*****************************************************************************
;* Copyright (C) 2003-2013 x264 project
;*
;* Authors: Loren Merritt
;*          Holger Lubitz
;*          Laurent Aimar
;*          Alex Izvorski
;*          Fiona Glaser
;*          Oskar Arvidsson
;*          Min Chen
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32
hmul_8p:   times 8 db 1
           times 4 db 1, -1
           times 8 db 1
           times 4 db 1, -1
hmul_4p:   times 4 db 1, 1, 1, 1, 1, -1, 1, -1
mask_10:   times 4 dw 0, -1
mask_1100: times 2 dd 0, -1
hmul_8w:   times 4 dw 1
           times 2 dw 1, -1
           times 4 dw 1
           times 2 dw 1, -1

ALIGN 32
transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15

sw_f0: dq 0xfff0, 0
pd_f0: times 4 dd 0xffff0000

SECTION .text

cextern pb_0
cextern pb_1
cextern pw_1
cextern pw_8
cextern pw_16
cextern pw_32
cextern pw_00ff
cextern pw_ppppmmmm
cextern pw_ppmmppmm
cextern pw_pmpmpmpm
cextern pw_pmmpzzzz
cextern pd_1
cextern popcnt_table
cextern pd_2
cextern hmul_16p
cextern pb_movemask
cextern pb_movemask_32
cextern pw_pixel_max

;=============================================================================
; SATD
;=============================================================================

%macro JDUP 2
%if cpuflag(sse4)
    ; just use shufps on anything post conroe
    shufps %1, %2, 0
%elif cpuflag(ssse3) && notcpuflag(atom)
    ; join 2x 32 bit and duplicate them
    ; emulating shufps is faster on conroe
    punpcklqdq %1, %2
    movsldup %1, %1
%else
    ; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
    punpckldq %1, %2
%endif
%endmacro
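
; Editor's reference model (sketch only, not part of the build): every
; routine in this section computes SATD, the sum of absolute values of the
; 2-D 4-point Hadamard transform of the pixel differences, halved to match
; the C reference these functions are checked against.  In C, assuming a
; `pixel` typedef (uint8_t or uint16_t depending on HIGH_BIT_DEPTH) and
; abs() from <stdlib.h>:
;
;   static int satd_4x4_ref(const pixel *p1, intptr_t s1,
;                           const pixel *p2, intptr_t s2)
;   {
;       int d[4][4], t[4], sum = 0;
;       for (int i = 0; i < 4; i++)
;           for (int j = 0; j < 4; j++)
;               d[i][j] = p1[i*s1 + j] - p2[i*s2 + j];
;       for (int i = 0; i < 4; i++) {          /* horizontal transform */
;           t[0] = d[i][0] + d[i][1]; t[1] = d[i][0] - d[i][1];
;           t[2] = d[i][2] + d[i][3]; t[3] = d[i][2] - d[i][3];
;           d[i][0] = t[0] + t[2]; d[i][2] = t[0] - t[2];
;           d[i][1] = t[1] + t[3]; d[i][3] = t[1] - t[3];
;       }
;       for (int j = 0; j < 4; j++) {          /* vertical transform + |.| */
;           t[0] = d[0][j] + d[1][j]; t[1] = d[0][j] - d[1][j];
;           t[2] = d[2][j] + d[3][j]; t[3] = d[2][j] - d[3][j];
;           sum += abs(t[0] + t[2]) + abs(t[0] - t[2])
;                + abs(t[1] + t[3]) + abs(t[1] - t[3]);
;       }
;       return sum >> 1;
;   }
;
; Larger blocks are sums of 4x4 (or 8x4) transforms.  The asm gets the
; final >>1 for free: the last sumsub+abs stage is replaced by amax, using
; |a+b| + |a-b| == 2*max(|a|,|b|).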

%macro HSUMSUB 5
    pmaddubsw m%2, m%5
    pmaddubsw m%1, m%5
    pmaddubsw m%4, m%5
    pmaddubsw m%3, m%5
%endmacro

%macro DIFF_UNPACK_SSE2 5
    punpcklbw m%1, m%5
    punpcklbw m%2, m%5
    punpcklbw m%3, m%5
    punpcklbw m%4, m%5
    psubw m%1, m%2
    psubw m%3, m%4
%endmacro

%macro DIFF_SUMSUB_SSSE3 5
    HSUMSUB %1, %2, %3, %4, %5
    psubw m%1, m%2
    psubw m%3, m%4
%endmacro

%macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer
    movd %1, %3
    movd %2, %4
    JDUP %1, %2
%endmacro

%macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer
    movddup m%3, %6
    movddup m%4, %8
    movddup m%1, %5
    movddup m%2, %7
%endmacro

%macro LOAD_DUP_4x8P_PENRYN 8
    ; penryn and nehalem run punpcklqdq and movddup in different units
    movh m%3, %6
    movh m%4, %8
    punpcklqdq m%3, m%3
    movddup m%1, %5
    punpcklqdq m%4, m%4
    movddup m%2, %7
%endmacro

%macro LOAD_SUMSUB_8x2P 9
    LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro

%macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
    LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
    LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
%if %10
    lea %8, [%8+4*r1]
    lea %9, [%9+4*r3]
%endif
%endmacro

%macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
    movddup m%1, [%7]
    movddup m%2, [%7+8]
    mova m%4, [%6]
    movddup m%3, m%4
    punpckhqdq m%4, m%4
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro

%macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
    movu m%4, [%7]
    mova m%2, [%6]
    DEINTB %1, %2, %3, %4, %5
    psubw m%1, m%3
    psubw m%2, m%4
    SUMSUB_BA w, %1, %2, %3
%endmacro

%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none ; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp]
    LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
    LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
    LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
    LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
%endmacro

%macro LOAD_SUMSUB_16x2P_AVX2 9 ; 2*dst, 2*tmp, mul, 4*ptr
    vbroadcasti128 m%1, [%6]
    vbroadcasti128 m%3, [%7]
    vbroadcasti128 m%2, [%8]
    vbroadcasti128 m%4, [%9]
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro

%macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
    LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3
    LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5
%if %10
    lea %8, [%8+4*r1]
    lea %9, [%9+4*r3]
%endif
%endmacro

%macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer
    mova xm%3, %6
    mova xm%4, %8
    mova xm%1, %5
    mova xm%2, %7
    vpermq m%3, m%3, q0011
    vpermq m%4, m%4, q0011
    vpermq m%1, m%1, q0011
    vpermq m%2, m%2, q0011
%endmacro

%macro LOAD_SUMSUB8_16x2P_AVX2 9 ; 2*dst, 2*tmp, mul, 4*ptr
    LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro
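
; Editor's note on the hmul_* constants used by HSUMSUB/DIFF_SUMSUB_SSSE3:
; once LOAD_DUP_* has duplicated a pixel row pair into both halves of a
; register, pmaddubsw against +1,+1 byte pairs in one half and +1,-1 pairs
; in the other performs the first horizontal Hadamard butterfly while the
; data is still packed as bytes.  Scalar model of one 8-byte lane group
; [a0 a1 a2 a3 a0 a1 a2 a3] against hmul_4p bytes [1 1 1 1 1 -1 1 -1]
; (hypothetical helper, sketch only):
;
;   static void pmaddubsw_hmul4(const uint8_t a[4], int16_t out[4])
;   {
;       out[0] = a[0] + a[1];   /* +1,+1 pair */
;       out[1] = a[2] + a[3];   /* +1,+1 pair */
;       out[2] = a[0] - a[1];   /* +1,-1 pair */
;       out[3] = a[2] - a[3];   /* +1,-1 pair */
;   }
;
; Because the transform is linear, the psubw of the two planes' results in
; DIFF_SUMSUB_SSSE3 equals transforming the differences, fusing the
; subtraction and the first butterfly stage into two pmaddubsw + one psubw.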

%macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
    LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
    LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
%if %10
    lea %8, [%8+4*r1]
    lea %9, [%9+4*r3]
%endif
%endmacro

; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
; clobber: m3..m7
; out: %1 = satd
%macro SATD_4x4_MMX 3
    %xdefine %%n n%1
    %assign offset %2*SIZEOF_PIXEL
    LOAD_DIFF m4, m3, none, [r0+     offset], [r2+     offset]
    LOAD_DIFF m5, m3, none, [r0+  r1+offset], [r2+  r3+offset]
    LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset]
    LOAD_DIFF m7, m3, none, [r0+  r4+offset], [r2+  r5+offset]
%if %3
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
%endif
    HADAMARD4_2D 4, 5, 6, 7, 3, %%n
    paddw m4, m6
;%if HIGH_BIT_DEPTH && (BIT_DEPTH == 12)
;    pxor m5, m5
;    punpcklwd m6, m4, m5
;    punpckhwd m4, m5
;    paddd m4, m6
;%endif
    SWAP %%n, 4
%endmacro

; in: %1 = horizontal if 0, vertical if 1
%macro SATD_8x4_SSE 8-9
%if %1
    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
%else
    HADAMARD4_V %2, %3, %4, %5, %6
    ; doing the abs first is a slight advantage
    ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
    ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
    HADAMARD 1, max, %2, %4, %6, %7
%endif
%ifnidn %9, swap
  %if (BIT_DEPTH == 12)
    pxor m%6, m%6
    punpcklwd m%7, m%2, m%6
    punpckhwd m%2, m%6
    paddd m%8, m%7
    paddd m%8, m%2
  %else
    paddw m%8, m%2
  %endif
%else
    SWAP %8, %2
  %if (BIT_DEPTH == 12)
    pxor m%6, m%6
    punpcklwd m%7, m%8, m%6
    punpckhwd m%8, m%6
    paddd m%8, m%7
  %endif
%endif
%if %1
  %if (BIT_DEPTH == 12)
    pxor m%6, m%6
    punpcklwd m%7, m%4, m%6
    punpckhwd m%4, m%6
    paddd m%8, m%7
    paddd m%8, m%4
  %else
    paddw m%8, m%4
  %endif
%else
    HADAMARD 1, max, %3, %5, %6, %7
  %if (BIT_DEPTH == 12)
    pxor m%6, m%6
    punpcklwd m%7, m%3, m%6
    punpckhwd m%3, m%6
    paddd m%8, m%7
    paddd m%8, m%3
  %else
    paddw m%8, m%3
  %endif
%endif
%endmacro

%macro SATD_8x4_1_SSE 10
%if %1
    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
%else
    HADAMARD4_V %2, %3, %4, %5, %6
    ; doing the abs first is a slight advantage
    ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
    ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
    HADAMARD 1, max, %2, %4, %6, %7
%endif
    pxor m%10, m%10
    punpcklwd m%9, m%2, m%10
    paddd m%8, m%9
    punpckhwd m%9, m%2, m%10
    paddd m%8, m%9
%if %1
    pxor m%10, m%10
    punpcklwd m%9, m%4, m%10
    paddd m%8, m%9
    punpckhwd m%9, m%4, m%10
    paddd m%8, m%9
%else
    HADAMARD 1, max, %3, %5, %6, %7
    pxor m%10, m%10
    punpcklwd m%9, m%3, m%10
    paddd m%8, m%9
    punpckhwd m%9, m%3, m%10
    paddd m%8, m%9
%endif
%endmacro

%macro SATD_START_MMX 0
    FIX_STRIDES r1, r3
    lea r4, [3*r1] ; 3*stride1
    lea r5, [3*r3] ; 3*stride2
%endmacro

%macro SATD_END_MMX 0
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
    movd eax, m0
%else ; !HIGH_BIT_DEPTH
    pshufw m1, m0, q1032
    paddw m0, m1
    pshufw m1, m0, q2301
    paddw m0, m1
    movd eax, m0
    and eax, 0xffff
%endif ; HIGH_BIT_DEPTH
    EMMS
    RET
%endmacro

; FIXME avoid the spilling of regs to hold 3*stride.
; for small blocks on x86_32, modify pixel pointer instead.
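
; Editor's note on the BIT_DEPTH == 12 paths above: a 12-bit difference can
; reach +/-4095, so a single 4x4 Hadamard coefficient is bounded by
; 16*4095 = 65520, which no longer fits a signed 16-bit word; accumulating
; several transforms overflows even unsigned words.  Hence the
; punpcklwd/punpckhwd + paddd widening to dwords before accumulation, while
; 8- and 10-bit builds can accumulate small partials as words (paddw) and
; defer widening to the final HADDUW.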
;-----------------------------------------------------------------------------
; int pixel_satd_4x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal pixel_satd_4x4, 4,6
    SATD_START_MMX
    SATD_4x4_MMX m0, 0, 0
    SATD_END_MMX

%macro SATD_START_SSE2 2-3 0
    FIX_STRIDES r1, r3
%if HIGH_BIT_DEPTH && %3
    pxor %2, %2
%elif cpuflag(ssse3) && notcpuflag(atom)
%if mmsize==32
    mova %2, [hmul_16p]
%else
    mova %2, [hmul_8p]
%endif
%endif
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor %1, %1
%endmacro

%macro SATD_END_SSE2 1-2
%if HIGH_BIT_DEPTH
  %if BIT_DEPTH == 12
    HADDD %1, xm0
  %else ; BIT_DEPTH == 12
    HADDUW %1, xm0
  %endif ; BIT_DEPTH == 12
  %if %0 == 2
    paddd %1, %2
  %endif
%else
    HADDW %1, xm7
%endif
    movd eax, %1
    RET
%endmacro

%macro SATD_ACCUM 3
%if HIGH_BIT_DEPTH
    HADDUW %1, %2
    paddd %3, %1
    pxor %1, %1
%endif
%endmacro

%macro BACKUP_POINTERS 0
%if ARCH_X86_64
%if WIN64
    PUSH r7
%endif
    mov r6, r0
    mov r7, r2
%endif
%endmacro

%macro RESTORE_AND_INC_POINTERS 0
%if ARCH_X86_64
    lea r0, [r6+8*SIZEOF_PIXEL]
    lea r2, [r7+8*SIZEOF_PIXEL]
%if WIN64
    POP r7
%endif
%else
    mov r0, r0mp
    mov r2, r2mp
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
%endif
%endmacro

%macro SATD_4x8_SSE 3-4
%if HIGH_BIT_DEPTH
    movh m0, [r0+0*r1]
    movh m4, [r2+0*r3]
    movh m1, [r0+1*r1]
    movh m5, [r2+1*r3]
    movhps m0, [r0+4*r1]
    movhps m4, [r2+4*r3]
    movh m2, [r0+2*r1]
    movh m6, [r2+2*r3]
    psubw m0, m4
    movh m3, [r0+r4]
    movh m4, [r2+r5]
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
    movhps m1, [r0+1*r1]
    movhps m5, [r2+1*r3]
    movhps m2, [r0+2*r1]
    movhps m6, [r2+2*r3]
    psubw m1, m5
    movhps m3, [r0+r4]
    movhps m4, [r2+r5]
    psubw m2, m6
    psubw m3, m4
%else ; !HIGH_BIT_DEPTH
    movd m4, [r2]
    movd m5, [r2+r3]
    movd m6, [r2+2*r3]
    add r2, r5
    movd m0, [r0]
    movd m1, [r0+r1]
    movd m2, [r0+2*r1]
    add r0, r4
    movd m3, [r2+r3]
    JDUP m4, m3
    movd m3, [r0+r1]
    JDUP m0, m3
    movd m3, [r2+2*r3]
    JDUP m5, m3
    movd m3, [r0+2*r1]
    JDUP m1, m3
%if %1==0 && %2==1
    mova m3, [hmul_4p]
    DIFFOP 0, 4, 1, 5, 3
%else
    DIFFOP 0, 4, 1, 5, 7
%endif
    movd m5, [r2]
    add r2, r5
    movd m3, [r0]
    add r0, r4
    movd m4, [r2]
    JDUP m6, m4
    movd m4, [r0]
    JDUP m2, m4
    movd m4, [r2+r3]
    JDUP m5, m4
    movd m4, [r0+r1]
    JDUP m3, m4
%if %1==0 && %2==1
    mova m4, [hmul_4p]
    DIFFOP 2, 6, 3, 5, 4
%else
    DIFFOP 2, 6, 3, 5, 7
%endif
%endif ; HIGH_BIT_DEPTH
%if %0 == 4
    SATD_8x4_1_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3, %4
%else
    SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
%endif
%endmacro

;-----------------------------------------------------------------------------
; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 0
%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)

%if cpuflag(ssse3) && (vertical==0 || HIGH_BIT_DEPTH)
cglobal pixel_satd_4x4, 4, 6, 6
    SATD_START_MMX
    mova m4, [hmul_4p]
    LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
    LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
    LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
    LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
    HADAMARD 0, sumsub, 0, 1, 2, 3
    HADAMARD 4, sumsub, 0, 1, 2, 3
    HADAMARD 1, amax, 0, 1, 2, 3
    HADDW m0, m1
    movd eax, m0
    RET
%endif

cglobal pixel_satd_4x8, 4, 6, 8
    SATD_START_MMX
%if vertical==0
    mova m7, [hmul_4p]
%endif
    SATD_4x8_SSE vertical, 0, swap
%if BIT_DEPTH == 12
    HADDD m7, m1
%else
    HADDUW m7, m1
%endif
    movd eax, m7
    RET
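
; Editor's note: SATD_4x8_SSE's third argument selects the accumulation mode
; used by the callers below: 'swap' moves the fresh 4x8 result into the m7
; accumulator (first block), 'add' accumulates subsequent blocks, roughly:
;   acc  = satd_4x8(p1, p2);                 /* swap: initialize */
;   acc += satd_4x8(p1 + 8*s1, p2 + 8*s2);   /* add:  accumulate */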

cglobal pixel_satd_4x16, 4, 6, 8
    SATD_START_MMX
%if vertical==0
    mova m7, [hmul_4p]
%endif
    SATD_4x8_SSE vertical, 0, swap
    lea r0, [r0+r1*2*SIZEOF_PIXEL]
    lea r2, [r2+r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
%if BIT_DEPTH == 12
    HADDD m7, m1
%else
    HADDUW m7, m1
%endif
    movd eax, m7
    RET

cglobal pixel_satd_8x8_internal
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
%%pixel_satd_8x4_internal:
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
    ret

cglobal pixel_satd_8x8_internal2
%if WIN64
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13
%%pixel_satd_8x4_internal2:
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13
%else
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5
%%pixel_satd_8x4_internal2:
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5
%endif
    ret

; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
%if HIGH_BIT_DEPTH == 0 && (WIN64 || UNIX64) && notcpuflag(avx)

cglobal pixel_satd_16x4_internal2
    LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
    lea r2, [r2+4*r3]
    lea r0, [r0+4*r1]
    SATD_8x4_1_SSE 0, 0, 1, 2, 3, 6, 11, 10, 12, 13
    SATD_8x4_1_SSE 0, 4, 8, 5, 9, 6, 3, 10, 12, 13
    ret

cglobal pixel_satd_16x4, 4,6,14
    SATD_START_SSE2 m10, m7
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    HADDD m10, m0
    movd eax, m10
    RET

cglobal pixel_satd_16x8, 4,6,14
    SATD_START_SSE2 m10, m7
%if vertical
    mova m7, [pw_00ff]
%endif
    jmp %%pixel_satd_16x8_internal

cglobal pixel_satd_16x12, 4,6,14
    SATD_START_SSE2 m10, m7
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    jmp %%pixel_satd_16x8_internal

cglobal pixel_satd_16x32, 4,6,14
    SATD_START_SSE2 m10, m7
%if vertical
    mova m7, [pw_00ff]
%endif
%rep 6
    call pixel_satd_16x4_internal2
%endrep
    jmp %%pixel_satd_16x8_internal

cglobal pixel_satd_16x64, 4,6,14
    SATD_START_SSE2 m10, m7
%if vertical
    mova m7, [pw_00ff]
%endif
%rep 14
    call pixel_satd_16x4_internal2
%endrep
    jmp %%pixel_satd_16x8_internal

cglobal pixel_satd_16x16, 4,6,14
    SATD_START_SSE2 m10, m7
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
%%pixel_satd_16x8_internal:
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    HADDD m10, m0
    movd eax, m10
    RET

cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
    call pixel_satd_16x4_internal2
    call pixel_satd_16x4_internal2
    HADDD m10, m0
    movd eax, m10
    RET
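
; Editor's sketch (hypothetical helper names, not part of the build) of the
; unrolled wide-block pattern used from here on: r6/r7 keep the row-0 base
; pointers, each 16-wide column is walked 4 rows per call to
; pixel_satd_16x4_internal2 (which advances r0/r2 itself) while m10
; accumulates dwords until the final HADDD:
;
;   static int satd_WxH(const pixel *p1, intptr_t s1,
;                       const pixel *p2, intptr_t s2, int W, int H)
;   {
;       int sum = 0;
;       for (int x = 0; x < W; x += 16)       /* lea r0,[r6+x] etc. */
;           for (int y = 0; y < H; y += 4)    /* one internal2 call */
;               sum += satd_16x4(p1 + y*s1 + x, s1, p2 + y*s2 + x, s2);
;       return sum;
;   }
;
; The functions below are this double loop fully unrolled (with %rep).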

cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
%rep 4
    call pixel_satd_16x4_internal2
%endrep
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
%rep 4
    call pixel_satd_16x4_internal2
%endrep
    HADDD m10, m0
    movd eax, m10
    RET

cglobal pixel_satd_32x24, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
%rep 6
    call pixel_satd_16x4_internal2
%endrep
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
%rep 6
    call pixel_satd_16x4_internal2
%endrep
    HADDD m10, m0
    movd eax, m10
    RET

cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
%rep 8
    call pixel_satd_16x4_internal2
%endrep
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
%rep 8
    call pixel_satd_16x4_internal2
%endrep
    HADDD m10, m0
    movd eax, m10
    RET

cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
%rep 16
    call pixel_satd_16x4_internal2
%endrep
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
%rep 16
    call pixel_satd_16x4_internal2
%endrep
    HADDD m10, m0
    movd eax, m10
    RET

cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
%rep 16
    call pixel_satd_16x4_internal2
%endrep
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
%rep 16
    call pixel_satd_16x4_internal2
%endrep
    lea r0, [r6 + 32]
    lea r2, [r7 + 32]
%rep 16
    call pixel_satd_16x4_internal2
%endrep
    HADDD m10, m0
    movd eax, m10
    RET

cglobal pixel_satd_64x16, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
%rep 4
    call pixel_satd_16x4_internal2
%endrep
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
%rep 4
    call pixel_satd_16x4_internal2
%endrep
    lea r0, [r6 + 32]
    lea r2, [r7 + 32]
%rep 4
    call pixel_satd_16x4_internal2
%endrep
    lea r0, [r6 + 48]
    lea r2, [r7 + 48]
%rep 4
    call pixel_satd_16x4_internal2
%endrep
    HADDD m10, m0
    movd eax, m10
    RET

cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
%rep 8
    call pixel_satd_16x4_internal2
%endrep
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
%rep 8
    call pixel_satd_16x4_internal2
%endrep
    lea r0, [r6 + 32]
    lea r2, [r7 + 32]
%rep 8
    call pixel_satd_16x4_internal2
%endrep
    lea r0, [r6 + 48]
    lea r2, [r7 + 48]
%rep 8
    call pixel_satd_16x4_internal2
%endrep
    HADDD m10, m0
    movd eax, m10
    RET

cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
%rep 12
    call pixel_satd_16x4_internal2
%endrep
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
%rep 12
    call pixel_satd_16x4_internal2
%endrep
    lea r0, [r6 + 32]
    lea r2, [r7 + 32]
%rep 12
    call pixel_satd_16x4_internal2
%endrep
    lea r0, [r6 + 48]
    lea r2, [r7 + 48]
%rep 12
    call pixel_satd_16x4_internal2
%endrep
    HADDD m10, m0
    movd eax, m10
    RET

cglobal pixel_satd_64x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
    SATD_START_SSE2 m10, m7
    mov r6, r0
    mov r7, r2
%if vertical
    mova m7, [pw_00ff]
%endif
%rep 16
    call pixel_satd_16x4_internal2
%endrep
    lea r0, [r6 + 16]
    lea r2, [r7 + 16]
%rep 16
    call pixel_satd_16x4_internal2
%endrep
    lea r0, [r6 + 32]
    lea r2, [r7 + 32]
%rep 16
    call pixel_satd_16x4_internal2
%endrep
    lea r0, [r6 + 48]
    lea r2, [r7 + 48]
%rep 16
    call pixel_satd_16x4_internal2
%endrep
    HADDD m10, m0
    movd eax, m10
    RET

%else
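
; Editor's note: this branch (HIGH_BIT_DEPTH, x86-32, or AVX builds) covers
; the same block sizes with 8-wide columns of pixel_satd_8x8_internal2
; instead.  On WIN64 the two base pointers stay in r6/r7 (these cglobals
; reserve 8 GPRs); the other variants request only 7 GPRs (so they also
; work on x86-32) and spill the pix2 base to the stack slot reserved by
; '0-gprsize', reloading it before each column.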

%if WIN64
cglobal pixel_satd_16x24, 4,8,14 ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov r7, r2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    HADDD m6, m0
    movd eax, m6
    RET
%else
cglobal pixel_satd_16x24, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov [rsp], r2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    HADDD m6, m0
    movd eax, m6
    RET
%endif

%if WIN64
cglobal pixel_satd_32x48, 4,8,14 ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov r7, r2
%rep 6
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
%rep 6
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    lea r2, [r7 + 16*SIZEOF_PIXEL]
%rep 6
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    lea r2, [r7 + 24*SIZEOF_PIXEL]
%rep 6
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET
%else
cglobal pixel_satd_32x48, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov [rsp], r2
%rep 6
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
%rep 6
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 16*SIZEOF_PIXEL
%rep 6
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 24*SIZEOF_PIXEL
%rep 6
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET
%endif

%if WIN64
cglobal pixel_satd_24x64, 4,8,14 ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov r7, r2
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    lea r2, [r7 + 16*SIZEOF_PIXEL]
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET
%else
cglobal pixel_satd_24x64, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov [rsp], r2
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 16*SIZEOF_PIXEL
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET
%endif

%if WIN64
cglobal pixel_satd_8x64, 4,8,14 ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov r7, r2
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET
%else
cglobal pixel_satd_8x64, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov [rsp], r2
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET
%endif

%if WIN64
cglobal pixel_satd_8x12, 4,8,14 ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov r7, r2
    call pixel_satd_8x8_internal2
    call %%pixel_satd_8x4_internal2
    pxor m7, m7
    movhlps m7, m6
    paddd m6, m7
    pshufd m7, m6, 1
    paddd m6, m7
    movd eax, m6
    RET
%else
cglobal pixel_satd_8x12, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov [rsp], r2
    call pixel_satd_8x8_internal2
    call %%pixel_satd_8x4_internal2
    HADDD m6, m0
    movd eax, m6
    RET
%endif
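
; Editor's note: the 12x32 sizes below have no 12-wide kernel; they are
; stitched from three 4-wide columns of stacked SATD_4x8_SSE blocks,
; re-seeding the column bases from r6 and r7/[rsp] at +4 and +8 pixels.
; High-bit-depth builds accumulate dwords in m7 (HADDD); 8-bit builds sum
; words and finish with HADDW.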

%if HIGH_BIT_DEPTH
%if WIN64
cglobal pixel_satd_12x32, 4,8,8 ;if WIN64 && cpuflag(avx)
    SATD_START_MMX
    mov r6, r0
    mov r7, r2
    pxor m7, m7
    SATD_4x8_SSE vertical, 0, 4, 5
%rep 3
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
%endrep
    lea r0, [r6 + 4*SIZEOF_PIXEL]
    lea r2, [r7 + 4*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
%rep 3
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
%endrep
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
%rep 3
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
%endrep
    HADDD m7, m0
    movd eax, m7
    RET
%else
cglobal pixel_satd_12x32, 4,7,8,0-gprsize
    SATD_START_MMX
    mov r6, r0
    mov [rsp], r2
    pxor m7, m7
    SATD_4x8_SSE vertical, 0, 4, 5
%rep 3
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
%endrep
    lea r0, [r6 + 4*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 4*SIZEOF_PIXEL
    SATD_4x8_SSE vertical, 1, 4, 5
%rep 3
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
%endrep
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
    SATD_4x8_SSE vertical, 1, 4, 5
%rep 3
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
%endrep
    HADDD m7, m0
    movd eax, m7
    RET
%endif
%else ;HIGH_BIT_DEPTH
%if WIN64
cglobal pixel_satd_12x32, 4,8,8 ;if WIN64 && cpuflag(avx)
    SATD_START_MMX
    mov r6, r0
    mov r7, r2
%if vertical==0
    mova m7, [hmul_4p]
%endif
    SATD_4x8_SSE vertical, 0, swap
%rep 3
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
%endrep
    lea r0, [r6 + 4*SIZEOF_PIXEL]
    lea r2, [r7 + 4*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
%rep 3
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
%endrep
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
%rep 3
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
%endrep
    HADDW m7, m1
    movd eax, m7
    RET
%else
cglobal pixel_satd_12x32, 4,7,8,0-gprsize
    SATD_START_MMX
    mov r6, r0
    mov [rsp], r2
%if vertical==0
    mova m7, [hmul_4p]
%endif
    SATD_4x8_SSE vertical, 0, swap
%rep 3
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
%endrep
    lea r0, [r6 + 4*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 4*SIZEOF_PIXEL
    SATD_4x8_SSE vertical, 1, add
%rep 3
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
%endrep
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
    SATD_4x8_SSE vertical, 1, add
%rep 3
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
%endrep
    HADDW m7, m1
    movd eax, m7
    RET
%endif
%endif

%if HIGH_BIT_DEPTH
%if WIN64
cglobal pixel_satd_4x32, 4,8,8 ;if WIN64 && cpuflag(avx)
    SATD_START_MMX
    mov r6, r0
    mov r7, r2
    pxor m7, m7
    SATD_4x8_SSE vertical, 0, 4, 5
%rep 3
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
%endrep
    HADDD m7, m0
    movd eax, m7
    RET
%else
cglobal pixel_satd_4x32, 4,7,8,0-gprsize
    SATD_START_MMX
    mov r6, r0
    mov [rsp], r2
    pxor m7, m7
    SATD_4x8_SSE vertical, 0, 4, 5
%rep 3
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
%endrep
    pxor m1, m1
    movhlps m1, m7
    paddd m7, m1
    pshufd m1, m7, 1
    paddd m7, m1
    movd eax, m7
    RET
%endif
%else
%if WIN64
cglobal pixel_satd_4x32, 4,8,8 ;if WIN64 && cpuflag(avx)
    SATD_START_MMX
    mov r6, r0
    mov r7, r2
%if vertical==0
    mova m7, [hmul_4p]
%endif
    SATD_4x8_SSE vertical, 0, swap
%rep 3
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
%endrep
    HADDW m7, m1
    movd eax, m7
    RET
%else
cglobal pixel_satd_4x32, 4,7,8,0-gprsize
    SATD_START_MMX
    mov r6, r0
    mov [rsp], r2
%if vertical==0
    mova m7, [hmul_4p]
%endif
    SATD_4x8_SSE vertical, 0, swap
%rep 3
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
%endrep
    HADDW m7, m1
    movd eax, m7
    RET
%endif
%endif

%if WIN64
cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov r7, r2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    lea r2, [r7 + 16*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    lea r2, [r7 + 24*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    HADDD m6, m0
    movd eax, m6
    RET
%else
cglobal pixel_satd_32x8, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov [rsp], r2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 16*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 24*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    HADDD m6, m0
    movd eax, m6
    RET
%endif

%if WIN64
cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov r7, r2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    lea r2, [r7 + 16*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    lea r2, [r7 + 24*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    HADDD m6, m0
    movd eax, m6
    RET
%else
cglobal pixel_satd_32x16, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov [rsp], r2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 16*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 24*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    HADDD m6, m0
    movd eax, m6
    RET
%endif

%if WIN64
cglobal pixel_satd_32x24, 4,8,14 ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov r7, r2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    lea r2, [r7 + 16*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    lea r2, [r7 + 24*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    HADDD m6, m0
    movd eax, m6
    RET
%else
cglobal pixel_satd_32x24, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov [rsp], r2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 16*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 24*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    HADDD m6, m0
    movd eax, m6
    RET
%endif

%if WIN64
cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov r7, r2
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    lea r2, [r7 + 16*SIZEOF_PIXEL]
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    lea r2, [r7 + 24*SIZEOF_PIXEL]
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET
%else
cglobal pixel_satd_32x32, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov [rsp], r2
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 16*SIZEOF_PIXEL
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 24*SIZEOF_PIXEL
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET
%endif

%if WIN64
cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov r7, r2
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    lea r2, [r7 + 16*SIZEOF_PIXEL]
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    lea r2, [r7 + 24*SIZEOF_PIXEL]
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET
%else
cglobal pixel_satd_32x64, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov [rsp], r2
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 16*SIZEOF_PIXEL
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 24*SIZEOF_PIXEL
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET
%endif

%if WIN64
cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov r7, r2
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    lea r2, [r7 + 16*SIZEOF_PIXEL]
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    lea r2, [r7 + 24*SIZEOF_PIXEL]
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 32*SIZEOF_PIXEL]
    lea r2, [r7 + 32*SIZEOF_PIXEL]
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 40*SIZEOF_PIXEL]
    lea r2, [r7 + 40*SIZEOF_PIXEL]
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET
%else
cglobal pixel_satd_48x64, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov [rsp], r2
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 16*SIZEOF_PIXEL
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 24*SIZEOF_PIXEL
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 32*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 32*SIZEOF_PIXEL
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 40*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 40*SIZEOF_PIXEL
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET
%endif

%if WIN64
cglobal pixel_satd_64x16, 4,8,14 ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov r7, r2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    lea r2, [r7 + 16*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    lea r2, [r7 + 24*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 32*SIZEOF_PIXEL]
    lea r2, [r7 + 32*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 40*SIZEOF_PIXEL]
    lea r2, [r7 + 40*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 48*SIZEOF_PIXEL]
    lea r2, [r7 + 48*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 56*SIZEOF_PIXEL]
    lea r2, [r7 + 56*SIZEOF_PIXEL]
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    HADDD m6, m0
    movd eax, m6
    RET
%else
cglobal pixel_satd_64x16, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov [rsp], r2
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 16*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 24*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 32*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 32*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 40*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 40*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 48*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 48*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    lea r0, [r6 + 56*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 56*SIZEOF_PIXEL
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    HADDD m6, m0
    movd eax, m6
    RET
%endif

%if WIN64
cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov r7, r2
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    lea r2, [r7 + 16*SIZEOF_PIXEL]
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    lea r2, [r7 + 24*SIZEOF_PIXEL]
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 32*SIZEOF_PIXEL]
    lea r2, [r7 + 32*SIZEOF_PIXEL]
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 40*SIZEOF_PIXEL]
    lea r2, [r7 + 40*SIZEOF_PIXEL]
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 48*SIZEOF_PIXEL]
    lea r2, [r7 + 48*SIZEOF_PIXEL]
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 56*SIZEOF_PIXEL]
    lea r2, [r7 + 56*SIZEOF_PIXEL]
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET
%else
cglobal pixel_satd_64x32, 4,7,8,0-gprsize ;if !WIN64
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov [rsp], r2
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 16*SIZEOF_PIXEL
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 24*SIZEOF_PIXEL
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 32*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 32*SIZEOF_PIXEL
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 40*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 40*SIZEOF_PIXEL
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 48*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 48*SIZEOF_PIXEL
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 56*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 56*SIZEOF_PIXEL
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET
%endif

%if WIN64
cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && cpuflag(avx)
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov r7, r2
%rep 6
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
%rep 6
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    lea r2, [r7 + 16*SIZEOF_PIXEL]
%rep 6
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 24*SIZEOF_PIXEL]
    lea r2, [r7 + 24*SIZEOF_PIXEL]
%rep 6
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 32*SIZEOF_PIXEL]
    lea r2, [r7 + 32*SIZEOF_PIXEL]
%rep 6
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 40*SIZEOF_PIXEL]
    lea r2, [r7 + 40*SIZEOF_PIXEL]
%rep 6
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 48*SIZEOF_PIXEL]
    lea r2, [r7 + 48*SIZEOF_PIXEL]
%rep 6
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 56*SIZEOF_PIXEL]
    lea r2, [r7 + 56*SIZEOF_PIXEL]
%rep 6
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET
%else
cglobal pixel_satd_64x48, 4,7,8,0-gprsize ;if !WIN64
!WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] mov r2, [rsp] add r2, 16*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] mov r2, [rsp] add r2, 24*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 32*SIZEOF_PIXEL] mov r2, [rsp] add r2, 32*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 40*SIZEOF_PIXEL] mov r2, [rsp] add r2, 40*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 48*SIZEOF_PIXEL] mov r2, [rsp] add r2, 48*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 56*SIZEOF_PIXEL] mov r2, [rsp] add r2, 56*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %endif %if WIN64 cglobal pixel_satd_64x64, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] lea r2, [r7 + 16*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] lea r2, [r7 + 24*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 32*SIZEOF_PIXEL] lea r2, [r7 + 32*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call 
pixel_satd_8x8_internal2 lea r0, [r6 + 40*SIZEOF_PIXEL] lea r2, [r7 + 40*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 48*SIZEOF_PIXEL] lea r2, [r7 + 48*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 56*SIZEOF_PIXEL] lea r2, [r7 + 56*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %else cglobal pixel_satd_64x64, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] mov r2, [rsp] add r2, 16*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] mov r2, [rsp] add r2, 24*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 32*SIZEOF_PIXEL] mov r2, [rsp] add r2, 32*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 40*SIZEOF_PIXEL] mov r2, [rsp] add r2, 40*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 48*SIZEOF_PIXEL] mov r2, [rsp] add r2, 48*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 56*SIZEOF_PIXEL] mov r2, [rsp] add r2, 56*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 HADDD m6, m0 movd eax, m6 RET %endif %if WIN64 cglobal pixel_satd_16x4, 4,6,14 %else 
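; The 16xN SATD variants below work on two 8-pixel-wide columns:
; BACKUP_POINTERS saves the src/ref pointers, each call to the 8x8 (or 8x4)
; internal routine accumulates another 8 (or 4) rows into m6, and
; RESTORE_AND_INC_POINTERS steps to the second column.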
cglobal pixel_satd_16x4, 4,6,8
%endif
    SATD_START_SSE2 m6, m7
    BACKUP_POINTERS
    call %%pixel_satd_8x4_internal2
    RESTORE_AND_INC_POINTERS
    call %%pixel_satd_8x4_internal2
    HADDD m6, m0
    movd eax, m6
    RET

%if WIN64
cglobal pixel_satd_16x8, 4,6,14
%else
cglobal pixel_satd_16x8, 4,6,8
%endif
    SATD_START_SSE2 m6, m7
    BACKUP_POINTERS
    call pixel_satd_8x8_internal2
    RESTORE_AND_INC_POINTERS
    call pixel_satd_8x8_internal2
    HADDD m6, m0
    movd eax, m6
    RET

%if WIN64
cglobal pixel_satd_16x12, 4,6,14
%else
cglobal pixel_satd_16x12, 4,6,8
%endif
    SATD_START_SSE2 m6, m7, 1
    BACKUP_POINTERS
    call pixel_satd_8x8_internal2
    call %%pixel_satd_8x4_internal2
    RESTORE_AND_INC_POINTERS
    call pixel_satd_8x8_internal2
    call %%pixel_satd_8x4_internal2
    HADDD m6, m0
    movd eax, m6
    RET

%if WIN64
cglobal pixel_satd_16x16, 4,6,14
%else
cglobal pixel_satd_16x16, 4,6,8
%endif
    SATD_START_SSE2 m6, m7, 1
    BACKUP_POINTERS
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    RESTORE_AND_INC_POINTERS
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    HADDD m6, m0
    movd eax, m6
    RET

%if WIN64
cglobal pixel_satd_16x32, 4,6,14
%else
cglobal pixel_satd_16x32, 4,6,8
%endif
    SATD_START_SSE2 m6, m7, 1
    BACKUP_POINTERS
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    RESTORE_AND_INC_POINTERS
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET

%if WIN64
cglobal pixel_satd_16x64, 4,6,14
%else
cglobal pixel_satd_16x64, 4,6,8
%endif
    SATD_START_SSE2 m6, m7, 1
    BACKUP_POINTERS
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    RESTORE_AND_INC_POINTERS
%rep 8
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET
%endif

%if HIGH_BIT_DEPTH
%if WIN64
cglobal pixel_satd_12x16, 4,8,8
    SATD_START_MMX
    mov r6, r0
    mov r7, r2
    pxor m7, m7
    SATD_4x8_SSE vertical, 0, 4, 5
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r6 + 4*SIZEOF_PIXEL]
    lea r2, [r7 + 4*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    HADDD m7, m0
    movd eax, m7
    RET
%else
cglobal pixel_satd_12x16, 4,7,8,0-gprsize
    SATD_START_MMX
    mov r6, r0
    mov [rsp], r2
    pxor m7, m7
    SATD_4x8_SSE vertical, 0, 4, 5
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r6 + 4*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 4*SIZEOF_PIXEL
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
    SATD_4x8_SSE vertical, 1, 4, 5
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, 4, 5
    HADDD m7, m0
    movd eax, m7
    RET
%endif
%else ;HIGH_BIT_DEPTH
%if WIN64
cglobal pixel_satd_12x16, 4,8,8
    SATD_START_MMX
    mov r6, r0
    mov r7, r2
%if vertical==0
    mova m7, [hmul_4p]
%endif
    SATD_4x8_SSE vertical, 0, swap
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r6 + 4*SIZEOF_PIXEL]
    lea r2, [r7 + 4*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    HADDW m7, m1
    movd eax, m7
    RET
%else
cglobal pixel_satd_12x16, 4,7,8,0-gprsize
    SATD_START_MMX
    mov r6, r0
    mov [rsp], r2
%if vertical==0
    mova m7, [hmul_4p]
%endif
    SATD_4x8_SSE vertical, 0, swap
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r6 + 4*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 4*SIZEOF_PIXEL
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
    SATD_4x8_SSE vertical, 1, add
    lea r0, [r0 + r1*2*SIZEOF_PIXEL]
    lea r2, [r2 + r3*2*SIZEOF_PIXEL]
    SATD_4x8_SSE vertical, 1, add
    HADDW m7, m1
    movd eax, m7
    RET
%endif
%endif

%if WIN64
cglobal pixel_satd_24x32, 4,8,14
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov r7, r2
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    lea r2, [r7 + 8*SIZEOF_PIXEL]
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    lea r2, [r7 + 16*SIZEOF_PIXEL]
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET
%else
cglobal pixel_satd_24x32, 4,7,8,0-gprsize
    SATD_START_SSE2 m6, m7
    mov r6, r0
    mov [rsp], r2
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 8*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 8*SIZEOF_PIXEL
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    lea r0, [r6 + 16*SIZEOF_PIXEL]
    mov r2, [rsp]
    add r2, 16*SIZEOF_PIXEL
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET
%endif ;WIN64

%if WIN64
cglobal pixel_satd_8x32, 4,6,14
%else
cglobal pixel_satd_8x32, 4,6,8
%endif
    SATD_START_SSE2 m6, m7
%if vertical
    mova m7, [pw_00ff]
%endif
%rep 4
    call pixel_satd_8x8_internal2
%endrep
    HADDD m6, m0
    movd eax, m6
    RET

%if WIN64
cglobal pixel_satd_8x16, 4,6,14
%else
cglobal pixel_satd_8x16, 4,6,8
%endif
    SATD_START_SSE2 m6, m7
    call pixel_satd_8x8_internal2
    call pixel_satd_8x8_internal2
    HADDD m6, m0
    movd eax, m6
    RET

cglobal pixel_satd_8x8, 4,6,8
    SATD_START_SSE2 m6, m7
    call pixel_satd_8x8_internal
    SATD_END_SSE2 m6

%if WIN64
cglobal pixel_satd_8x4, 4,6,14
%else
cglobal pixel_satd_8x4, 4,6,8
%endif
    SATD_START_SSE2 m6, m7
    call %%pixel_satd_8x4_internal2
    SATD_END_SSE2 m6
%endmacro ; SATDS_SSE2

;=============================================================================
; SA8D
;=============================================================================
%macro SA8D_INTER 0
%if ARCH_X86_64
    %define lh m10
    %define rh m0
%else
    %define lh m0
    %define rh [esp+48]
%endif
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
    paddd lh, rh
%else
    paddusw lh, rh
%endif ; HIGH_BIT_DEPTH
%endmacro

%macro SA8D_8x8 0
    call pixel_sa8d_8x8_internal
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%else
    HADDW m0, m1
%endif ; HIGH_BIT_DEPTH
    paddd m0, [pd_1]
    psrld m0, 1
    paddd m12, m0
%endmacro

%macro SA8D_16x16 0
    call pixel_sa8d_8x8_internal ; pix[0]
    add r2, 8*SIZEOF_PIXEL
    add r0, 8*SIZEOF_PIXEL
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova m10, m0
    call pixel_sa8d_8x8_internal ; pix[8]
    lea r2, [r2+8*r3]
    lea r0, [r0+8*r1]
    SA8D_INTER
    call pixel_sa8d_8x8_internal ; pix[8*stride+8]
    sub r2, 8*SIZEOF_PIXEL
    sub r0, 8*SIZEOF_PIXEL
    SA8D_INTER
    call pixel_sa8d_8x8_internal ; pix[8*stride]
    SA8D_INTER
    SWAP 0, 10
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    paddd m0, [pd_1]
    psrld m0, 1
    paddd m12, m0
%endmacro

%macro AVG_16x16 0
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d
%endmacro

%macro SA8D 0
; sse2 doesn't seem to like the horizontal way of doing things
%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sa8d_8x8_internal
    lea r6, [r0+4*r1]
    lea r7, [r2+4*r3]
    LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
    LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7
%if vertical
    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
%else ; non-sse2
    HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11
%endif
    paddw m0, m1
    paddw m0, m2
    paddw m0, m8
    SAVE_MM_PERMUTATION
    ret

cglobal pixel_sa8d_8x8, 4,8,12
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    call pixel_sa8d_8x8_internal
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%else
    HADDW m0, m1
%endif ; HIGH_BIT_DEPTH
    movd eax, m0
    add eax, 1
    shr eax, 1
    RET

cglobal pixel_sa8d_16x16, 4,8,12
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    call pixel_sa8d_8x8_internal ; pix[0]
    add r2, 8*SIZEOF_PIXEL
    add r0, 8*SIZEOF_PIXEL
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova m10, m0
    call pixel_sa8d_8x8_internal ; pix[8]
    lea r2, [r2+8*r3]
    lea r0, [r0+8*r1]
    SA8D_INTER
    call pixel_sa8d_8x8_internal ; pix[8*stride+8]
    sub r2, 8*SIZEOF_PIXEL
    sub r0, 8*SIZEOF_PIXEL
    SA8D_INTER
    call pixel_sa8d_8x8_internal ; pix[8*stride]
    SA8D_INTER
    SWAP 0, 10
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd eax, m0
    add eax, 1
    shr eax, 1
    RET

cglobal pixel_sa8d_8x16, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_8x8
    lea r0, [r0 + 8*r1]
    lea r2, [r2 + 8*r3]
    SA8D_8x8
    movd eax, m12
    RET

cglobal pixel_sa8d_8x32, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_8x8
%rep 3
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
%endrep
    movd eax, m12
    RET

cglobal pixel_sa8d_16x8, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_8x8
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
    movd eax, m12
    RET

cglobal pixel_sa8d_16x32, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    movd eax, m12
    RET

cglobal pixel_sa8d_16x64, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
%rep 3
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
%endrep
    movd eax, m12
    RET

cglobal pixel_sa8d_24x32, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_8x8
%rep 2
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
%endrep
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
%rep 2
    sub r0, 8*SIZEOF_PIXEL
    sub r2, 8*SIZEOF_PIXEL
    SA8D_8x8
%endrep
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
%rep 2
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
%endrep
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
%rep 2
    sub r0, 8*SIZEOF_PIXEL
    sub r2, 8*SIZEOF_PIXEL
    SA8D_8x8
%endrep
    movd eax, m12
    RET

cglobal pixel_sa8d_32x8, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_8x8
%rep 3
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
%endrep
    movd eax, m12
    RET

cglobal pixel_sa8d_32x16, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    movd eax, m12
    RET

cglobal pixel_sa8d_32x24, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_8x8
%rep 3
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
%endrep
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
%rep 3
    sub r0, 8*SIZEOF_PIXEL
    sub r2, 8*SIZEOF_PIXEL
    SA8D_8x8
%endrep
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    SA8D_8x8
%rep 3
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_8x8
%endrep
    movd eax, m12
    RET

cglobal pixel_sa8d_32x32, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    movd eax, m12
    RET

cglobal pixel_sa8d_32x64, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
    movd eax, m12
    RET

cglobal pixel_sa8d_48x64, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
%rep 2
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
%endrep
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
%rep 2
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
%endrep
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
%rep 2
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
%endrep
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
%rep 2
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
%endrep
    movd eax, m12
    RET

cglobal pixel_sa8d_64x16, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
%rep 3
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
%endrep
    movd eax, m12
    RET

cglobal pixel_sa8d_64x32, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
%rep 3
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
%endrep
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
%rep 3
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
%endrep
    movd eax, m12
    RET

cglobal pixel_sa8d_64x48, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
%rep 3
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
%endrep
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
%rep 3
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
%endrep
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
%rep 3
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
%endrep
    movd eax, m12
    RET

cglobal pixel_sa8d_64x64, 4,8,13
    FIX_STRIDES r1, r3
    lea r4, [3*r1]
    lea r5, [3*r3]
    pxor m12, m12
%if vertical == 0
    mova m7, [hmul_8p]
%endif
    SA8D_16x16
%rep 3
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
%endrep
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
%rep 3
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
%endrep
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
%rep 3
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    add r2, 16*SIZEOF_PIXEL
    add r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
%endrep
    lea r0, [r0+8*r1]
    lea r2, [r2+8*r3]
    SA8D_16x16
%rep 3
    lea r4, [8*r1]
    lea r5, [8*r3]
    sub r0, r4
    sub r2, r5
    sub r2, 16*SIZEOF_PIXEL
    sub r0, 16*SIZEOF_PIXEL
    lea r4, [3*r1]
    lea r5, [3*r3]
    SA8D_16x16
%endrep
    movd eax, m12
    RET

%else ; ARCH_X86_32
%if mmsize == 16
cglobal pixel_sa8d_8x8_internal
    %define spill0 [esp+4]
    %define spill1 [esp+20]
    %define spill2 [esp+36]
%if vertical
    LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
    HADAMARD4_2D 0, 1, 2, 3, 4
    movdqa spill0, m3
    LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
    HADAMARD4_2D 4, 5, 6, 7, 3
    HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
    movdqa m3, spill0
    paddw m0, m1
    HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
%else ; mmsize == 8
    mova m7, [hmul_8p]
    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
    ; could do first HADAMARD4_V here to save spilling later
    ; surprisingly, not a win on conroe or even p4
    mova spill0, m2
    mova spill1, m3
    mova spill2, m1
    SWAP 1, 7
    LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
    HADAMARD4_V 4, 5, 6, 7, 3
    mova m1, spill2
    mova m2, spill0
    mova m3, spill1
    mova spill0, m6
    mova spill1, m7
    HADAMARD4_V 0, 1, 2, 3, 7
    SUMSUB_BADC w, 0, 4, 1, 5, 7
    HADAMARD 2, sumsub, 0, 4, 7, 6
    HADAMARD 2, sumsub, 1, 5, 7, 6
    HADAMARD 1, amax, 0, 4, 7, 6
    HADAMARD 1, amax, 1, 5, 7, 6
    mova m6, spill0
    mova m7, spill1
    paddw m0, m1
    SUMSUB_BADC w, 2, 6, 3, 7, 4
    HADAMARD 2, sumsub, 2, 6, 4, 5
    HADAMARD 2, sumsub, 3, 7, 4, 5
    HADAMARD 1, amax, 2, 6, 4, 5
    HADAMARD 1, amax, 3, 7, 4, 5
%endif ; sse2/non-sse2
    paddw m0, m2
    paddw m0, m3
    SAVE_MM_PERMUTATION
    ret
%endif ; ifndef mmx2

cglobal pixel_sa8d_8x8_internal2
    %define spill0 [esp+4]
    LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
    HADAMARD4_2D 0, 1, 2, 3, 4
    movdqa spill0, m3
    LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
    HADAMARD4_2D 4, 5, 6, 7, 3
    HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
    movdqa m3, spill0
    paddw m0, m1
    HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
    paddw m0, m2
    paddw m0, m3
    SAVE_MM_PERMUTATION
    ret

cglobal pixel_sa8d_8x8, 4,7
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 48
    lea r4, [3*r1]
    lea r5, [3*r3]
    call pixel_sa8d_8x8_internal
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%else
    HADDW m0, m1
%endif ; HIGH_BIT_DEPTH
    movd eax, m0
    add eax, 1
    shr eax, 1
    mov esp, r6
    RET

cglobal pixel_sa8d_16x16, 4,7
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64
    lea r4, [3*r1]
    lea r5, [3*r3]
    call pixel_sa8d_8x8_internal
%if mmsize == 8
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
%endif
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal
%if mmsize == 8
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
%else
    SA8D_INTER
%endif
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal
%if HIGH_BIT_DEPTH
    SA8D_INTER
%else ; !HIGH_BIT_DEPTH
    paddusw m0, [esp+64-mmsize]
%if mmsize == 16
    HADDUW m0, m1
%else
    mova m2, [esp+48]
    pxor m7, m7
    mova m1, m0
    mova m3, m2
    punpcklwd m0, m7
    punpckhwd m1, m7
    punpcklwd m2, m7
    punpckhwd m3, m7
    paddd m0, m1
    paddd m2, m3
    paddd m0, m2
    HADDD m0, m1
%endif
%endif ; HIGH_BIT_DEPTH
    movd eax, m0
    add eax, 1
    shr eax, 1
    mov esp, r6
    RET

cglobal pixel_sa8d_8x16, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64
    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

cglobal pixel_sa8d_8x32, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64
    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
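    ; x86-32 has no spare GPR to carry the running total across these calls,
    ; so the partial SA8D sum is kept in the stack slot at [esp+36]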
    mov dword [esp+36], r4d
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

cglobal pixel_sa8d_16x8, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64
    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

cglobal pixel_sa8d_16x32, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64
    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [rsp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

cglobal pixel_sa8d_16x64, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64
    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [rsp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    mov [r6+20], r0
    mov [r6+28], r2
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    mov [r6+20], r0
    mov [r6+28], r2
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
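    ; high bit depth: the packed word sums can exceed 16 bits, so they are
    ; widened to dwords before any further accumulation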
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    mov [r6+20], r0
    mov [r6+28], r2
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    mov [r6+20], r0
    mov [r6+28], r2
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

cglobal pixel_sa8d_24x32, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64
    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d
%assign %%x 8
%rep 2
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, %%x*SIZEOF_PIXEL
    add r2, %%x*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d
%assign %%x %%x+8
%endrep
%rep 2
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    mov [r6+20], r0
    mov [r6+28], r2
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d
%assign %%x 8
%rep 2
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, %%x*SIZEOF_PIXEL
    add r2, %%x*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d
%assign %%x %%x+8
%endrep
%endrep
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    mov [r6+20], r0
    mov [r6+28], r2
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 16*SIZEOF_PIXEL
    add r2, 16*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

cglobal pixel_sa8d_32x8, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64
    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d
%assign %%x 8
%rep 2
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, %%x*SIZEOF_PIXEL
    add r2, %%x*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d
%assign %%x %%x+8
%endrep
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 24*SIZEOF_PIXEL
    add r2, 24*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

cglobal pixel_sa8d_32x16, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64
    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [rsp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 16*SIZEOF_PIXEL
    add r2, 16*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 24*SIZEOF_PIXEL
    add r2, 24*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

cglobal pixel_sa8d_32x24, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64
    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d
%assign %%x 8
%rep 3
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, %%x*SIZEOF_PIXEL
    add r2, %%x*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d
%assign %%x %%x+8
%endrep
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    mov [r6+20], r0
    mov [r6+28], r2
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d
%assign %%x 8
%rep 3
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, %%x*SIZEOF_PIXEL
    add r2, %%x*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d
%assign %%x %%x+8
%endrep
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    mov [r6+20], r0
    mov [r6+28], r2
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d
%assign %%x 8
%rep 2
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, %%x*SIZEOF_PIXEL
    add r2, %%x*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov dword [esp+36], r4d
%assign %%x %%x+8
%endrep
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 24*SIZEOF_PIXEL
    add r2, 24*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
    HADDUW m0, m1
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

cglobal pixel_sa8d_32x32, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64
    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [rsp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 16*SIZEOF_PIXEL
    add r2, 16*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 24*SIZEOF_PIXEL
    add r2, 24*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    add r0, 16*SIZEOF_PIXEL
    add r2, 16*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    add r0, 24*SIZEOF_PIXEL
    add r2, 24*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
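    ; at 8-bit depth the accumulator is still packed words at this point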
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

cglobal pixel_sa8d_32x64, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64
    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [rsp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 16*SIZEOF_PIXEL
    add r2, 16*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 24*SIZEOF_PIXEL
    add r2, 24*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16
%rep 2
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    mov [r6+20], r0
    mov [r6+28], r2
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 16*SIZEOF_PIXEL
    add r2, 16*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 24*SIZEOF_PIXEL
    add r2, 24*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16
%endrep
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    mov [r6+20], r0
    mov [r6+28], r2
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 16*SIZEOF_PIXEL
    add r2, 16*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 24*SIZEOF_PIXEL
    add r2, 24*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

cglobal pixel_sa8d_48x64, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64
    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [rsp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d
%assign %%x 16
%rep 2
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, %%x*SIZEOF_PIXEL
    add r2, %%x*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, (%%x+8)*SIZEOF_PIXEL
    add r2, (%%x+8)*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16
%assign %%x %%x+16
%endrep
%rep 2
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    mov [r6+20], r0
    mov [r6+28], r2
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16
%assign %%x 16
%rep 2
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, %%x*SIZEOF_PIXEL
    add r2, %%x*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, (%%x+8)*SIZEOF_PIXEL
    add r2, (%%x+8)*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16
%assign %%x %%x+16
%endrep
%endrep
    mov r0, [r6+20]
    mov r2, [r6+28]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    lea r0, [r0 + r1*8]
    lea r2, [r2 + r3*8]
    mov [r6+20], r0
    mov [r6+28], r2
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 16*SIZEOF_PIXEL
    add r2, 16*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 24*SIZEOF_PIXEL
    add r2, 24*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 32*SIZEOF_PIXEL
    add r2, 32*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 40*SIZEOF_PIXEL
    add r2, 40*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

cglobal pixel_sa8d_64x16, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64
    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [rsp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 16*SIZEOF_PIXEL
    add r2, 16*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 24*SIZEOF_PIXEL
    add r2, 24*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 32*SIZEOF_PIXEL
    add r2, 32*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 40*SIZEOF_PIXEL
    add r2, 40*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    AVG_16x16
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 48*SIZEOF_PIXEL
    add r2, 48*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 56*SIZEOF_PIXEL
    add r2, 56*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+64-mmsize], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    add r4d, dword [esp+36]
    mov eax, r4d
    mov esp, r6
    RET

cglobal pixel_sa8d_64x32, 4,7,8
    FIX_STRIDES r1, r3
    mov r6, esp
    and esp, ~15
    sub esp, 64
    lea r4, [r1 + 2*r1]
    lea r5, [r3 + 2*r3]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [rsp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 8*SIZEOF_PIXEL
    add r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
    SA8D_INTER
%if HIGH_BIT_DEPTH == 0
    HADDUW m0, m1
%endif
    movd r4d, m0
    add r4d, 1
    shr r4d, 1
    mov dword [esp+36], r4d
    mov r0, [r6+20]
    mov r2, [r6+28]
    add r0, 16*SIZEOF_PIXEL
    add r2, 16*SIZEOF_PIXEL
    lea r4, [r1 + 2*r1]
    call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
    HADDUW m0, m1
%endif
    mova [esp+48], m0
    call pixel_sa8d_8x8_internal2
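    ; SA8D_INTER below folds the partial sum saved at [esp+48] into m0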
SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 48*SIZEOF_PIXEL add r2, 48*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 56*SIZEOF_PIXEL add r2, 56*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov eax, r4d mov esp, r6 RET cglobal pixel_sa8d_64x48, 4,7,8 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [rsp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 48*SIZEOF_PIXEL add r2, 48*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 56*SIZEOF_PIXEL add r2, 56*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call 
pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 48*SIZEOF_PIXEL add r2, 48*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 56*SIZEOF_PIXEL add r2, 56*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 48*SIZEOF_PIXEL add r2, 48*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 56*SIZEOF_PIXEL add r2, 56*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov eax, r4d mov esp, r6 RET cglobal pixel_sa8d_64x64, 4,7,8 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 lea r4, [r1 + 2*r1] lea r5, [r3 + 2*r3] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [rsp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] 
mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 mov dword [esp+36], r4d mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 48*SIZEOF_PIXEL add r2, 48*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 56*SIZEOF_PIXEL add r2, 56*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 48*SIZEOF_PIXEL add r2, 48*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 56*SIZEOF_PIXEL add r2, 56*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 
%if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 48*SIZEOF_PIXEL add r2, 48*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 56*SIZEOF_PIXEL add r2, 56*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] lea r0, [r0 + r1*8] lea r2, [r2 + r3*8] mov [r6+20], r0 mov [r6+28], r2 lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 16*SIZEOF_PIXEL add r2, 16*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 24*SIZEOF_PIXEL add r2, 24*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 32*SIZEOF_PIXEL add r2, 32*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 40*SIZEOF_PIXEL add r2, 40*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 AVG_16x16 mov r0, [r6+20] mov r2, [r6+28] add r0, 48*SIZEOF_PIXEL add r2, 48*SIZEOF_PIXEL lea r4, [r1 + 2*r1] call pixel_sa8d_8x8_internal2 %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+48], m0 mov r0, [r6+20] mov r2, [r6+28] add r0, 56*SIZEOF_PIXEL add r2, 56*SIZEOF_PIXEL call pixel_sa8d_8x8_internal2 SA8D_INTER mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal2 SA8D_INTER %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd r4d, m0 add r4d, 1 shr r4d, 1 add r4d, dword [esp+36] mov eax, r4d mov esp, r6 RET %endif ; !ARCH_X86_64 
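; For reference, the 32-bit sa8d_NxM variants above all follow the same
; reduction: 8x8 SA8D partials are combined per 16x16 block, each 16x16 sum
; is rounded as (sum + 1) >> 1 (the AVG_16x16 / "add r4d, 1; shr r4d, 1"
; sequences), and the rounded partials are accumulated. A minimal C sketch of
; that tiling, assuming a hypothetical scalar sa8d_8x8() reference (not the
; project's actual C code):
;
;   static int sa8d_NxM(const pixel *pix1, intptr_t stride1,
;                       const pixel *pix2, intptr_t stride2, int w, int h)
;   {
;       int cost = 0;
;       for (int y = 0; y < h; y += 16)
;           for (int x = 0; x < w; x += 16)
;           {
;               int sum = 0; /* one 16x16 block = four 8x8 SA8D calls */
;               for (int j = 0; j < 16; j += 8)
;                   for (int i = 0; i < 16; i += 8)
;                       sum += sa8d_8x8(pix1 + (y + j) * stride1 + x + i, stride1,
;                                       pix2 + (y + j) * stride2 + x + i, stride2);
;               cost += (sum + 1) >> 1; /* per-16x16 rounding */
;           }
;       return cost;
;   }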
%endmacro ; SA8D

;=============================================================================
; INTRA SATD
;=============================================================================
%define TRANS TRANS_SSE2
%define DIFFOP DIFF_UNPACK_SSE2
%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
%define LOAD_SUMSUB_16P  LOAD_SUMSUB_16P_SSE2
%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
%define movdqu movups
%define punpcklqdq movlhps
INIT_XMM sse2
SA8D
SATDS_SSE2

%if HIGH_BIT_DEPTH == 0
INIT_XMM ssse3,atom
SATDS_SSE2
SA8D
%endif

%define DIFFOP DIFF_SUMSUB_SSSE3
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
%if HIGH_BIT_DEPTH == 0
%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
%define LOAD_SUMSUB_16P  LOAD_SUMSUB_16P_SSSE3
%endif
INIT_XMM ssse3
SATDS_SSE2
SA8D

%undef movdqa ; nehalem doesn't like movaps
%undef movdqu ; movups
%undef punpcklqdq ; or movlhps

%define TRANS TRANS_SSE4
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
INIT_XMM sse4
SATDS_SSE2
SA8D

; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
; it's effectively free.
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
INIT_XMM avx
SATDS_SSE2
SA8D

%define TRANS TRANS_XOP
INIT_XMM xop
SATDS_SSE2
SA8D

%if HIGH_BIT_DEPTH == 0
%define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2
%define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2
%define TRANS TRANS_SSE4

%macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul
    movq   xm%1, [r0]
    movq   xm%3, [r2]
    movq   xm%2, [r0+r1]
    movq   xm%4, [r2+r3]
    vinserti128 m%1, m%1, [r0+4*r1], 1
    vinserti128 m%3, m%3, [r2+4*r3], 1
    vinserti128 m%2, m%2, [r0+r4], 1
    vinserti128 m%4, m%4, [r2+r5], 1
    punpcklqdq m%1, m%1
    punpcklqdq m%3, m%3
    punpcklqdq m%2, m%2
    punpcklqdq m%4, m%4
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7
    lea    r0, [r0+2*r1]
    lea    r2, [r2+2*r3]

    movq   xm%3, [r0]
    movq   xm%5, [r2]
    movq   xm%4, [r0+r1]
    movq   xm%6, [r2+r3]
    vinserti128 m%3, m%3, [r0+4*r1], 1
    vinserti128 m%5, m%5, [r2+4*r3], 1
    vinserti128 m%4, m%4, [r0+r4], 1
    vinserti128 m%6, m%6, [r2+r5], 1
    punpcklqdq m%3, m%3
    punpcklqdq m%5, m%5
    punpcklqdq m%4, m%4
    punpcklqdq m%6, m%6
    DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7
%endmacro

%macro SATD_START_AVX2 2-3 0
    FIX_STRIDES r1, r3
%if %3
    mova    %2, [hmul_8p]
    lea     r4, [5*r1]
    lea     r5, [5*r3]
%else
    mova    %2, [hmul_16p]
    lea     r4, [3*r1]
    lea     r5, [3*r3]
%endif
    pxor    %1, %1
%endmacro

%define TRANS TRANS_SSE4
INIT_YMM avx2
cglobal pixel_satd_16x8_internal
    LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
    SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
    LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0
    SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
    ret

cglobal pixel_satd_16x16, 4,6,8
    SATD_START_AVX2 m6, m7
    call pixel_satd_16x8_internal
    lea  r0, [r0+4*r1]
    lea  r2, [r2+4*r3]
    call pixel_satd_16x8_internal
    vextracti128 xm0, m6, 1
    paddw xm0, xm6
    SATD_END_SSE2 xm0
    RET

cglobal pixel_satd_16x8, 4,6,8
    SATD_START_AVX2 m6, m7
    call pixel_satd_16x8_internal
    vextracti128 xm0, m6, 1
    paddw xm0, xm6
    SATD_END_SSE2 xm0
    RET

cglobal pixel_satd_8x8_internal
    LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
    SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
    ret

cglobal pixel_satd_8x16, 4,6,8
    SATD_START_AVX2 m6, m7, 1
    call pixel_satd_8x8_internal
    lea  r0, [r0+2*r1]
    lea  r2, [r2+2*r3]
    lea  r0, [r0+4*r1]
    lea  r2, [r2+4*r3]
    call pixel_satd_8x8_internal
    vextracti128 xm0, m6, 1
    paddw xm0, xm6
    SATD_END_SSE2 xm0
    RET

cglobal pixel_satd_8x8, 4,6,8
    SATD_START_AVX2 m6, m7, 1
    call pixel_satd_8x8_internal
    vextracti128 xm0, m6, 1
    paddw xm0, xm6
    SATD_END_SSE2 xm0
    RET

cglobal pixel_sa8d_8x8_internal
    LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
    HADAMARD4_V 0, 1, 2, 3, 4
    HADAMARD 8, sumsub, 0, 1, 4, 5
    HADAMARD 8, sumsub, 2, 3, 4, 5
    HADAMARD 2, sumsub, 0, 1, 4, 5
    HADAMARD 2, sumsub, 2, 3, 4, 5
    HADAMARD 1, amax, 0, 1, 4, 5
    HADAMARD 1, amax, 2, 3, 4, 5
    paddw m6, m0
    paddw m6, m2
    ret

cglobal pixel_sa8d_8x8, 4,6,8
    SATD_START_AVX2 m6, m7, 1
    call pixel_sa8d_8x8_internal
    vextracti128 xm1, m6, 1
    paddw xm6, xm1
    HADDW xm6, xm1
    movd eax, xm6
    add  eax, 1
    shr  eax, 1
    RET

cglobal pixel_sa8d_16x16, 4,6,8
    SATD_START_AVX2 m6, m7, 1

    call pixel_sa8d_8x8_internal ; pix[0]

    sub  r0, r1
    sub  r0, r1
    add  r0, 8*SIZEOF_PIXEL
    sub  r2, r3
    sub  r2, r3
    add  r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal ; pix[8]

    add  r0, r4
    add  r0, r1
    add  r2, r5
    add  r2, r3
    call pixel_sa8d_8x8_internal ; pix[8*stride+8]

    sub  r0, r1
    sub  r0, r1
    sub  r0, 8*SIZEOF_PIXEL
    sub  r2, r3
    sub  r2, r3
    sub  r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal ; pix[8*stride]

    ; TODO: analyze Dynamic Range
    vextracti128 xm0, m6, 1
    paddusw xm6, xm0
    HADDUW xm6, xm0
    movd eax, xm6
    add  eax, 1
    shr  eax, 1
    RET

cglobal pixel_sa8d_16x16_internal
    call pixel_sa8d_8x8_internal ; pix[0]

    sub  r0, r1
    sub  r0, r1
    add  r0, 8*SIZEOF_PIXEL
    sub  r2, r3
    sub  r2, r3
    add  r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal ; pix[8]

    add  r0, r4
    add  r0, r1
    add  r2, r5
    add  r2, r3
    call pixel_sa8d_8x8_internal ; pix[8*stride+8]

    sub  r0, r1
    sub  r0, r1
    sub  r0, 8*SIZEOF_PIXEL
    sub  r2, r3
    sub  r2, r3
    sub  r2, 8*SIZEOF_PIXEL
    call pixel_sa8d_8x8_internal ; pix[8*stride]

    ; TODO: analyze Dynamic Range
    vextracti128 xm0, m6, 1
    paddusw xm6, xm0
    HADDUW xm6, xm0
    movd eax, xm6
    add  eax, 1
    shr  eax, 1
    ret

%if ARCH_X86_64
cglobal pixel_sa8d_32x32, 4,8,8
    ; TODO: r6 is rax on x64 platforms, so we use it directly
    SATD_START_AVX2 m6, m7, 1
    xor  r7d, r7d

    call pixel_sa8d_16x16_internal ; [0]
    pxor m6, m6
    add  r7d, eax

    add  r0, r4
    add  r0, r1
    add  r2, r5
    add  r2, r3
    call pixel_sa8d_16x16_internal ; [2]
    pxor m6, m6
    add  r7d, eax

    lea  eax, [r4 * 5 - 16]
    sub  r0, rax
    sub  r0, r1
    lea  eax, [r5 * 5 - 16]
    sub  r2, rax
    sub  r2, r3
    call pixel_sa8d_16x16_internal ; [1]
    pxor m6, m6
    add  r7d, eax

    add  r0, r4
    add  r0, r1
    add  r2, r5
    add  r2, r3
    call pixel_sa8d_16x16_internal ; [3]
    add  eax, r7d
    RET
%endif ; ARCH_X86_64=1
%endif ; HIGH_BIT_DEPTH

; Input 10bit, Output 8bit
;------------------------------------------------------------------------------------------------------------------------
;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
;------------------------------------------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal downShift_16, 7,7,3
    movd    m0, r6d        ; m0 = shift
    add     r1, r1
    dec     r5d
.loopH:
    xor     r6, r6
.loopW:
    movu    m1, [r0 + r6 * 2]
    movu    m2, [r0 + r6 * 2 + 16]
    psrlw   m1, m0
    psrlw   m2, m0
    packuswb m1, m2
    movu    [r2 + r6], m1

    add     r6, 16
    cmp     r6d, r4d
    jl      .loopW

    ; move to next row
    add     r0, r1
    add     r2, r3
    dec     r5d
    jnz     .loopH

; process the last row of every frame (to handle widths that are not a multiple of 16)
.loop16:
    movu    m1, [r0]
    movu    m2, [r0 + 16]
    psrlw   m1, m0
    psrlw   m2, m0
    packuswb m1, m2
    movu    [r2], m1

    add     r0, 2 * mmsize
    add     r2, mmsize
    sub     r4d, 16
    jz      .end
    cmp     r4d, 15
    jg      .loop16

    cmp     r4d, 8
    jl      .process4
    movu    m1, [r0]
    psrlw   m1, m0
    packuswb m1, m1
    movh    [r2], m1

    add     r0, mmsize
    add     r2, 8
    sub     r4d, 8
    jz      .end
.process4:
    cmp     r4d, 4
    jl      .process2
    movh    m1, [r0]
    psrlw   m1, m0
    packuswb m1, m1
    movd    [r2], m1

    add     r0, 8
    add     r2, 4
    sub     r4d, 4
    jz      .end
.process2:
    cmp     r4d, 2
    jl      .process1
    movd    m1, [r0]
    psrlw   m1, m0
    packuswb m1, m1
    movd    r6, m1
    mov     [r2], r6w

    add     r0, 4
    add     r2, 2
    sub     r4d, 2
    jz      .end
.process1:
    movd    m1, [r0]
    psrlw   m1, m0
    packuswb m1, m1
    movd    r3, m1
    mov     [r2], r3b
.end:
    RET
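; The downShift_16 kernel above (and its AVX2 twin below) drops bit depth by a
; logical right shift followed by unsigned byte saturation (packuswb). A
; minimal C sketch of the per-pixel operation, assuming element (not byte)
; strides; the 'mask' parameter of the prototype is unused here, as in the asm:
;
;   static void downShift_16_c(const uint16_t *src, intptr_t srcStride,
;                              uint8_t *dst, intptr_t dstStride,
;                              int width, int height, int shift)
;   {
;       for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
;           for (int x = 0; x < width; x++)
;           {
;               int v = src[x] >> shift;                  /* psrlw */
;               dst[x] = (uint8_t)(v > 255 ? 255 : v);    /* packuswb saturates */
;           }
;   }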
; Input 10bit, Output 8bit
;-------------------------------------------------------------------------------------------------------------------------------------
;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
;-------------------------------------------------------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal downShift_16, 6,7,3
    movd    xm0, r6m       ; m0 = shift
    add     r1d, r1d
    dec     r5d
.loopH:
    xor     r6, r6
.loopW:
    movu    m1, [r0 + r6 * 2 +  0]
    movu    m2, [r0 + r6 * 2 + 32]
    vpsrlw  m1, xm0
    vpsrlw  m2, xm0
    packuswb m1, m2
    vpermq  m1, m1, 11011000b
    movu    [r2 + r6], m1

    add     r6d, mmsize
    cmp     r6d, r4d
    jl      .loopW

    ; move to next row
    add     r0, r1
    add     r2, r3
    dec     r5d
    jnz     .loopH

; process the last row of every frame (to handle widths that are not a multiple of 32)
    mov     r6d, r4d
    and     r4d, 31
    shr     r6d, 5
.loop32:
    movu    m1, [r0]
    movu    m2, [r0 + 32]
    psrlw   m1, xm0
    psrlw   m2, xm0
    packuswb m1, m2
    vpermq  m1, m1, 11011000b
    movu    [r2], m1

    add     r0, 2*mmsize
    add     r2, mmsize
    dec     r6d
    jnz     .loop32

    cmp     r4d, 16
    jl      .process8
    movu    m1, [r0]
    psrlw   m1, xm0
    packuswb m1, m1
    vpermq  m1, m1, 10001000b
    movu    [r2], xm1

    add     r0, mmsize
    add     r2, 16
    sub     r4d, 16
    jz      .end
.process8:
    cmp     r4d, 8
    jl      .process4
    movu    m1, [r0]
    psrlw   m1, xm0
    packuswb m1, m1
    movq    [r2], xm1

    add     r0, 16
    add     r2, 8
    sub     r4d, 8
    jz      .end
.process4:
    cmp     r4d, 4
    jl      .process2
    movq    xm1, [r0]
    psrlw   m1, xm0
    packuswb m1, m1
    movd    [r2], xm1

    add     r0, 8
    add     r2, 4
    sub     r4d, 4
    jz      .end
.process2:
    cmp     r4d, 2
    jl      .process1
    movd    xm1, [r0]
    psrlw   m1, xm0
    packuswb m1, m1
    movd    r6d, xm1
    mov     [r2], r6w

    add     r0, 4
    add     r2, 2
    sub     r4d, 2
    jz      .end
.process1:
    movd    xm1, [r0]
    psrlw   m1, xm0
    packuswb m1, m1
    movd    r3d, xm1
    mov     [r2], r3b
.end:
    RET

; Input 8bit, Output 10bit
;---------------------------------------------------------------------------------------------------------------------
;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
;---------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal upShift_8, 6,7,3
    movd    xm2, r6m
    add     r3d, r3d
    dec     r5d
.loopH:
    xor     r6, r6
.loopW:
    pmovzxbw m0, [r0 + r6]
    pmovzxbw m1, [r0 + r6 + mmsize/2]
    psllw   m0, m2
    psllw   m1, m2
    movu    [r2 + r6 * 2], m0
    movu    [r2 + r6 * 2 + mmsize], m1

    add     r6d, mmsize
    cmp     r6d, r4d
    jl      .loopW

    ; move to next row
    add     r0, r1
    add     r2, r3
    dec     r5d
    jg      .loopH

; process the last row of every frame (to handle widths that are not a multiple of 16)
    mov     r1d, (mmsize/2 - 1)
    and     r1d, r4d
    sub     r1, mmsize/2
    ; NOTE: the width must be greater than or equal to 8
    shr     r4d, 3         ; log2(mmsize/2)
.loopW8:
    pmovzxbw m0, [r0]
    psllw   m0, m2
    movu    [r2], m0
    add     r0, mmsize/2
    add     r2, mmsize
    dec     r4d
    jg      .loopW8

    ; Mac OS X can't read beyond the array bound, so roll back a few bytes
    pmovzxbw m0, [r0 + r1]
    psllw   m0, m2
    movu    [r2 + r1 * 2], m0
    RET
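; A minimal C sketch of upShift_8 (zero-extend bytes, then shift left), useful
; as a scalar cross-check for the SSE4 version above and the AVX2 version
; below; strides are assumed to be in elements:
;
;   static void upShift_8_c(const uint8_t *src, intptr_t srcStride,
;                           uint16_t *dst, intptr_t dstStride,
;                           int width, int height, int shift)
;   {
;       for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
;           for (int x = 0; x < width; x++)
;               dst[x] = (uint16_t)(src[x] << shift); /* pmovzxbw + psllw */
;   }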
;---------------------------------------------------------------------------------------------------------------------
;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
;---------------------------------------------------------------------------------------------------------------------
%if ARCH_X86_64
INIT_YMM avx2
cglobal upShift_8, 6,7,3
    movd    xm2, r6m
    add     r3d, r3d
    dec     r5d
.loopH:
    xor     r6, r6
.loopW:
    pmovzxbw m0, [r0 + r6]
    pmovzxbw m1, [r0 + r6 + mmsize/2]
    psllw   m0, xm2
    psllw   m1, xm2
    movu    [r2 + r6 * 2], m0
    movu    [r2 + r6 * 2 + mmsize], m1

    add     r6d, mmsize
    cmp     r6d, r4d
    jl      .loopW

    ; move to next row
    add     r0, r1
    add     r2, r3
    dec     r5d
    jg      .loopH

; process the last row of every frame (to handle widths that are not a multiple of 32)
    mov     r1d, (mmsize/2 - 1)
    and     r1d, r4d
    sub     r1, mmsize/2
    ; NOTE: the width must be greater than or equal to 16
    shr     r4d, 4         ; log2(mmsize/2)
.loopW16:
    pmovzxbw m0, [r0]
    psllw   m0, xm2
    movu    [r2], m0
    add     r0, mmsize/2
    add     r2, mmsize
    dec     r4d
    jg      .loopW16

    ; Mac OS X can't read beyond the array bound, so roll back a few bytes
    pmovzxbw m0, [r0 + r1]
    psllw   m0, xm2
    movu    [r2 + r1 * 2], m0
    RET
%endif

%macro ABSD2 6 ; dst1, dst2, src1, src2, tmp, tmp
%if cpuflag(ssse3)
    pabsd   %1, %3
    pabsd   %2, %4
%elifidn %1, %3
    pxor    %5, %5
    pxor    %6, %6
    psubd   %5, %1
    psubd   %6, %2
    pmaxsd  %1, %5
    pmaxsd  %2, %6
%else
    pxor    %1, %1
    pxor    %2, %2
    psubd   %1, %3
    psubd   %2, %4
    pmaxsd  %1, %3
    pmaxsd  %2, %4
%endif
%endmacro

; Input 10bit, Output 12bit
;------------------------------------------------------------------------------------------------------------------------
;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
;------------------------------------------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal upShift_16, 6,7,4
    movd    m0, r6m        ; m0 = shift
    mova    m3, [pw_pixel_max]
    FIX_STRIDES r1d, r3d
    dec     r5d
.loopH:
    xor     r6d, r6d
.loopW:
    movu    m1, [r0 + r6 * SIZEOF_PIXEL]
    movu    m2, [r0 + r6 * SIZEOF_PIXEL + mmsize]
    psllw   m1, m0
    psllw   m2, m0
    ; TODO: if the input is always valid, the two masking instructions below can be removed
    pand    m1, m3
    pand    m2, m3
    movu    [r2 + r6 * SIZEOF_PIXEL], m1
    movu    [r2 + r6 * SIZEOF_PIXEL + mmsize], m2

    add     r6, mmsize * 2 / SIZEOF_PIXEL
    cmp     r6d, r4d
    jl      .loopW

    ; move to next row
    add     r0, r1
    add     r2, r3
    dec     r5d
    jnz     .loopH

; process the last row of every frame (to handle widths that are not a multiple of 16)
.loop16:
    movu    m1, [r0]
    movu    m2, [r0 + mmsize]
    psllw   m1, m0
    psllw   m2, m0
    pand    m1, m3
    pand    m2, m3
    movu    [r2], m1
    movu    [r2 + mmsize], m2

    add     r0, 2 * mmsize
    add     r2, 2 * mmsize
    sub     r4d, 16
    jz      .end
    jg      .loop16

    cmp     r4d, 8
    jl      .process4
    movu    m1, [r0]
    psllw   m1, m0
    pand    m1, m3
    movu    [r2], m1

    add     r0, mmsize
    add     r2, mmsize
    sub     r4d, 8
    jz      .end
.process4:
    cmp     r4d, 4
    jl      .process2
    movh    m1, [r0]
    psllw   m1, m0
    pand    m1, m3
    movh    [r2], m1

    add     r0, 8
    add     r2, 8
    sub     r4d, 4
    jz      .end
.process2:
    cmp     r4d, 2
    jl      .process1
    movd    m1, [r0]
    psllw   m1, m0
    pand    m1, m3
    movd    [r2], m1

    add     r0, 4
    add     r2, 4
    sub     r4d, 2
    jz      .end
.process1:
    movd    m1, [r0]
    psllw   m1, m0
    pand    m1, m3
    movd    r3, m1
    mov     [r2], r3w
.end:
    RET

; Input 10bit, Output 12bit
;-------------------------------------------------------------------------------------------------------------------------------------
;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
;-------------------------------------------------------------------------------------------------------------------------------------
; TODO: NO TEST CODE!
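; Since the AVX2 path below is flagged as untested, a minimal C sketch of
; upShift_16 is useful as a baseline: shift left, then clamp to the valid
; pixel range the same way the asm masks with pw_pixel_max. Strides are
; assumed to be in elements and 'mask' is assumed to be (1 << bitDepth) - 1:
;
;   static void upShift_16_c(const uint16_t *src, intptr_t srcStride,
;                            uint16_t *dst, intptr_t dstStride,
;                            int width, int height, int shift, uint16_t mask)
;   {
;       for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
;           for (int x = 0; x < width; x++)
;               dst[x] = (uint16_t)((src[x] << shift) & mask); /* psllw + pand */
;   }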
INIT_YMM avx2 cglobal upShift_16, 6,7,4 movd xm0, r6m ; m0 = shift vbroadcasti128 m3, [pw_pixel_max] FIX_STRIDES r1d, r3d dec r5d .loopH: xor r6d, r6d .loopW: movu m1, [r0 + r6 * SIZEOF_PIXEL] movu m2, [r0 + r6 * SIZEOF_PIXEL + mmsize] psllw m1, xm0 psllw m2, xm0 pand m1, m3 pand m2, m3 movu [r2 + r6 * SIZEOF_PIXEL], m1 movu [r2 + r6 * SIZEOF_PIXEL + mmsize], m2 add r6, mmsize * 2 / SIZEOF_PIXEL cmp r6d, r4d jl .loopW ; move to next row add r0, r1 add r2, r3 dec r5d jnz .loopH ; processing last row of every frame [To handle width which not a multiple of 32] mov r6d, r4d and r4d, 31 shr r6d, 5 .loop32: movu m1, [r0] movu m2, [r0 + mmsize] psllw m1, xm0 psllw m2, xm0 pand m1, m3 pand m2, m3 movu [r2], m1 movu [r2 + mmsize], m2 add r0, 2*mmsize add r2, 2*mmsize dec r6d jnz .loop32 cmp r4d, 16 jl .process8 movu m1, [r0] psllw m1, xm0 pand m1, m3 movu [r2], m1 add r0, mmsize add r2, mmsize sub r4d, 16 jz .end .process8: cmp r4d, 8 jl .process4 movu xm1, [r0] psllw xm1, xm0 pand xm1, xm3 movu [r2], xm1 add r0, 16 add r2, 16 sub r4d, 8 jz .end .process4: cmp r4d, 4 jl .process2 movq xm1,[r0] psllw xm1, xm0 pand xm1, xm3 movq [r2], xm1 add r0, 8 add r2, 8 sub r4d, 4 jz .end .process2: cmp r4d, 2 jl .process1 movd xm1, [r0] psllw xm1, xm0 pand xm1, xm3 movd [r2], xm1 add r0, 4 add r2, 4 sub r4d, 2 jz .end .process1: movd xm1, [r0] psllw xm1, xm0 pand xm1, xm3 movd r3d, xm1 mov [r2], r3w .end: RET ;--------------------------------------------------------------------------------------------------------------------- ;int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride) ;--------------------------------------------------------------------------------------------------------------------- INIT_XMM sse4 cglobal psyCost_pp_4x4, 4, 5, 8 %if HIGH_BIT_DEPTH FIX_STRIDES r1, r3 lea r4, [3 * r1] movddup m0, [r0] movddup m1, [r0 + r1] movddup m2, [r0 + r1 * 2] movddup m3, [r0 + r4] mova m4, [hmul_8w] pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m4 pmaddwd m3, m4 paddd m5, m0, m1 paddd m5, m2 paddd m5, m3 psrldq m4, m5, 4 paddd m5, m4 psrld m5, 2 SUMSUB_BA d, 0, 1, 4 SUMSUB_BA d, 2, 3, 4 SUMSUB_BA d, 0, 2, 4 SUMSUB_BA d, 1, 3, 4 %define ORDER unord TRANS q, ORDER, 0, 2, 4, 6 TRANS q, ORDER, 1, 3, 4, 6 ABSD2 m0, m2, m0, m2, m4, m6 pmaxsd m0, m2 ABSD2 m1, m3, m1, m3, m4, m6 pmaxsd m1, m3 paddd m0, m1 movhlps m1, m0 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 psubd m7, m0, m5 lea r4, [3 * r3] movddup m0, [r2] movddup m1, [r2 + r3] movddup m2, [r2 + r3 * 2] movddup m3, [r2 + r4] mova m4, [hmul_8w] pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m4 pmaddwd m3, m4 paddd m5, m0, m1 paddd m5, m2 paddd m5, m3 psrldq m4, m5, 4 paddd m5, m4 psrld m5, 2 SUMSUB_BA d, 0, 1, 4 SUMSUB_BA d, 2, 3, 4 SUMSUB_BA d, 0, 2, 4 SUMSUB_BA d, 1, 3, 4 %define ORDER unord TRANS q, ORDER, 0, 2, 4, 6 TRANS q, ORDER, 1, 3, 4, 6 ABSD2 m0, m2, m0, m2, m4, m6 pmaxsd m0, m2 ABSD2 m1, m3, m1, m3, m4, m6 pmaxsd m1, m3 paddd m0, m1 movhlps m1, m0 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 psubd m0, m5 psubd m7, m0 pabsd m0, m7 movd eax, m0 %else ; !HIGH_BIT_DEPTH lea r4, [3 * r1] movd m0, [r0] movd m1, [r0 + r1] movd m2, [r0 + r1 * 2] movd m3, [r0 + r4] shufps m0, m1, 0 shufps m2, m3, 0 mova m4, [hmul_4p] pmaddubsw m0, m4 pmaddubsw m2, m4 paddw m5, m0, m2 movhlps m4, m5 paddw m5, m4 pmaddwd m5, [pw_1] psrld m5, 2 HADAMARD 0, sumsub, 0, 2, 1, 3 HADAMARD 4, sumsub, 0, 2, 1, 3 HADAMARD 1, amax, 0, 2, 1, 3 HADDW m0, m2 psubd m6, m0, m5 lea r4, [3 * r3] movd m0, [r2] movd m1, [r2 + r3] movd m2, [r2 + r3 * 2] movd m3, [r2 + r4] 
shufps m0, m1, 0 shufps m2, m3, 0 mova m4, [hmul_4p] pmaddubsw m0, m4 pmaddubsw m2, m4 paddw m5, m0, m2 movhlps m4, m5 paddw m5, m4 pmaddwd m5, [pw_1] psrld m5, 2 HADAMARD 0, sumsub, 0, 2, 1, 3 HADAMARD 4, sumsub, 0, 2, 1, 3 HADAMARD 1, amax, 0, 2, 1, 3 HADDW m0, m2 psubd m0, m5 psubd m6, m0 pabsd m0, m6 movd eax, m0 %endif ; HIGH_BIT_DEPTH RET %if ARCH_X86_64 INIT_XMM sse4 cglobal psyCost_pp_8x8, 4, 6, 13 %if HIGH_BIT_DEPTH FIX_STRIDES r1, r3 lea r4, [3 * r1] pxor m10, m10 movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] movu m3, [r0 + r4] lea r5, [r0 + r1 * 4] movu m4, [r5] movu m5, [r5 + r1] movu m6, [r5 + r1 * 2] movu m7, [r5 + r4] paddw m8, m0, m1 paddw m8, m2 paddw m8, m3 paddw m8, m4 paddw m8, m5 paddw m8, m6 paddw m8, m7 pmaddwd m8, [pw_1] movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax paddd m0, m1 paddd m0, m2 paddd m0, m3 HADDUW m0, m1 paddd m0, [pd_1] psrld m0, 1 psubd m10, m0, m8 lea r4, [3 * r3] movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + r3 * 2] movu m3, [r2 + r4] lea r5, [r2 + r3 * 4] movu m4, [r5] movu m5, [r5 + r3] movu m6, [r5 + r3 * 2] movu m7, [r5 + r4] paddw m8, m0, m1 paddw m8, m2 paddw m8, m3 paddw m8, m4 paddw m8, m5 paddw m8, m6 paddw m8, m7 pmaddwd m8, [pw_1] movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax paddd m0, m1 paddd m0, m2 paddd m0, m3 HADDUW m0, m1 paddd m0, [pd_1] psrld m0, 1 psubd m0, m8 psubd m10, m0 pabsd m0, m10 movd eax, m0 %else ; !HIGH_BIT_DEPTH lea r4, [3 * r1] mova m8, [hmul_8p] movddup m0, [r0] movddup m1, [r0 + r1] movddup m2, [r0 + r1 * 2] movddup m3, [r0 + r4] lea r5, [r0 + r1 * 4] movddup m4, [r5] movddup m5, [r5 + r1] movddup m6, [r5 + r1 * 2] movddup m7, [r5 + r4] pmaddubsw m0, m8 pmaddubsw m1, m8 pmaddubsw m2, m8 pmaddubsw m3, m8 pmaddubsw m4, m8 pmaddubsw m5, m8 pmaddubsw m6, m8 pmaddubsw m7, m8 paddw m11, m0, m1 paddw m11, m2 paddw m11, m3 paddw m11, m4 paddw m11, m5 paddw m11, m6 paddw m11, m7 pmaddwd m11, [pw_1] psrldq m10, m11, 4 paddd m11, m10 psrld m11, 2 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 10 paddw m0, m1 paddw m0, m2 paddw m0, m3 HADDW m0, m1 paddd m0, [pd_1] psrld m0, 1 psubd m12, m0, m11 lea r4, [3 * r3] movddup m0, [r2] movddup m1, [r2 + r3] movddup m2, [r2 + r3 * 2] movddup m3, [r2 + r4] lea r5, [r2 + r3 * 4] movddup m4, [r5] movddup m5, [r5 + r3] movddup m6, [r5 + r3 * 2] movddup m7, [r5 + r4] pmaddubsw m0, m8 pmaddubsw m1, m8 pmaddubsw m2, m8 pmaddubsw m3, m8 pmaddubsw m4, m8 pmaddubsw m5, m8 pmaddubsw m6, m8 pmaddubsw m7, m8 paddw m11, m0, m1 paddw m11, m2 paddw m11, m3 paddw m11, m4 paddw m11, m5 paddw m11, m6 paddw m11, m7 pmaddwd m11, [pw_1] psrldq m10, m11, 4 paddd m11, m10 psrld m11, 2 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 10 paddw m0, m1 paddw m0, m2 paddw m0, m3 HADDW m0, m1 paddd m0, [pd_1] psrld m0, 1 psubd m0, m11 psubd m12, m0 pabsd m0, m12 movd eax, m0 %endif ; HIGH_BIT_DEPTH RET %endif %if ARCH_X86_64 %if HIGH_BIT_DEPTH INIT_XMM sse4 cglobal psyCost_pp_16x16, 4, 9, 14 FIX_STRIDES r1, r3 lea r4, [3 * r1] lea r8, [3 * r3] mova m12, [pw_1] mova m13, [pd_1] pxor m11, m11 mov r7d, 2 .loopH: mov r6d, 2 .loopW: pxor m10, m10 movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] movu m3, [r0 + r4] lea r5, [r0 + r1 * 4] movu m4, [r5] movu m5, [r5 + r1] movu m6, [r5 + r1 * 2] movu m7, [r5 + r4] paddw m8, m0, m1 paddw m8, m2 paddw m8, m3 paddw m8, m4 paddw m8, m5 paddw m8, m6 paddw m8, m7 pmaddwd m8, m12 movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, 
m9 psrld m8, 2 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax paddd m0, m1 paddd m0, m2 paddd m0, m3 HADDUW m0, m1 paddd m0, m13 psrld m0, 1 psubd m10, m0, m8 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + r3 * 2] movu m3, [r2 + r8] lea r5, [r2 + r3 * 4] movu m4, [r5] movu m5, [r5 + r3] movu m6, [r5 + r3 * 2] movu m7, [r5 + r8] paddw m8, m0, m1 paddw m8, m2 paddw m8, m3 paddw m8, m4 paddw m8, m5 paddw m8, m6 paddw m8, m7 pmaddwd m8, m12 movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax paddd m0, m1 paddd m0, m2 paddd m0, m3 HADDUW m0, m1 paddd m0, m13 psrld m0, 1 psubd m0, m8 psubd m10, m0 pabsd m0, m10 paddd m11, m0 add r0, 16 add r2, 16 dec r6d jnz .loopW lea r0, [r0 + r1 * 8 - 32] lea r2, [r2 + r3 * 8 - 32] dec r7d jnz .loopH movd eax, m11 RET %else ; !HIGH_BIT_DEPTH INIT_XMM sse4 cglobal psyCost_pp_16x16, 4, 9, 15 lea r4, [3 * r1] lea r8, [3 * r3] mova m8, [hmul_8p] mova m10, [pw_1] mova m14, [pd_1] pxor m13, m13 mov r7d, 2 .loopH: mov r6d, 2 .loopW: pxor m12, m12 movddup m0, [r0] movddup m1, [r0 + r1] movddup m2, [r0 + r1 * 2] movddup m3, [r0 + r4] lea r5, [r0 + r1 * 4] movddup m4, [r5] movddup m5, [r5 + r1] movddup m6, [r5 + r1 * 2] movddup m7, [r5 + r4] pmaddubsw m0, m8 pmaddubsw m1, m8 pmaddubsw m2, m8 pmaddubsw m3, m8 pmaddubsw m4, m8 pmaddubsw m5, m8 pmaddubsw m6, m8 pmaddubsw m7, m8 paddw m11, m0, m1 paddw m11, m2 paddw m11, m3 paddw m11, m4 paddw m11, m5 paddw m11, m6 paddw m11, m7 pmaddwd m11, m10 psrldq m9, m11, 4 paddd m11, m9 psrld m11, 2 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 paddw m0, m1 paddw m0, m2 paddw m0, m3 HADDW m0, m1 paddd m0, m14 psrld m0, 1 psubd m12, m0, m11 movddup m0, [r2] movddup m1, [r2 + r3] movddup m2, [r2 + r3 * 2] movddup m3, [r2 + r8] lea r5, [r2 + r3 * 4] movddup m4, [r5] movddup m5, [r5 + r3] movddup m6, [r5 + r3 * 2] movddup m7, [r5 + r8] pmaddubsw m0, m8 pmaddubsw m1, m8 pmaddubsw m2, m8 pmaddubsw m3, m8 pmaddubsw m4, m8 pmaddubsw m5, m8 pmaddubsw m6, m8 pmaddubsw m7, m8 paddw m11, m0, m1 paddw m11, m2 paddw m11, m3 paddw m11, m4 paddw m11, m5 paddw m11, m6 paddw m11, m7 pmaddwd m11, m10 psrldq m9, m11, 4 paddd m11, m9 psrld m11, 2 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 paddw m0, m1 paddw m0, m2 paddw m0, m3 HADDW m0, m1 paddd m0, m14 psrld m0, 1 psubd m0, m11 psubd m12, m0 pabsd m0, m12 paddd m13, m0 add r0, 8 add r2, 8 dec r6d jnz .loopW lea r0, [r0 + r1 * 8 - 16] lea r2, [r2 + r3 * 8 - 16] dec r7d jnz .loopH movd eax, m13 RET %endif ; HIGH_BIT_DEPTH %endif %if ARCH_X86_64 %if HIGH_BIT_DEPTH INIT_XMM sse4 cglobal psyCost_pp_32x32, 4, 9, 14 FIX_STRIDES r1, r3 lea r4, [3 * r1] lea r8, [3 * r3] mova m12, [pw_1] mova m13, [pd_1] pxor m11, m11 mov r7d, 4 .loopH: mov r6d, 4 .loopW: pxor m10, m10 movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] movu m3, [r0 + r4] lea r5, [r0 + r1 * 4] movu m4, [r5] movu m5, [r5 + r1] movu m6, [r5 + r1 * 2] movu m7, [r5 + r4] paddw m8, m0, m1 paddw m8, m2 paddw m8, m3 paddw m8, m4 paddw m8, m5 paddw m8, m6 paddw m8, m7 pmaddwd m8, m12 movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax paddd m0, m1 paddd m0, m2 paddd m0, m3 HADDUW m0, m1 paddd m0, m13 psrld m0, 1 psubd m10, m0, m8 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + r3 * 2] movu m3, [r2 + r8] lea r5, [r2 + r3 * 4] movu m4, [r5] movu m5, [r5 + r3] movu m6, [r5 + r3 * 2] movu m7, [r5 + r8] paddw m8, m0, m1 paddw m8, m2 paddw m8, m3 paddw m8, m4 paddw m8, m5 paddw m8, m6 paddw m8, m7 pmaddwd m8, m12 movhlps m9, m8 
paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax paddd m0, m1 paddd m0, m2 paddd m0, m3 HADDUW m0, m1 paddd m0, m13 psrld m0, 1 psubd m0, m8 psubd m10, m0 pabsd m0, m10 paddd m11, m0 add r0, 16 add r2, 16 dec r6d jnz .loopW lea r0, [r0 + r1 * 8 - 64] lea r2, [r2 + r3 * 8 - 64] dec r7d jnz .loopH movd eax, m11 RET %else ; !HIGH_BIT_DEPTH INIT_XMM sse4 cglobal psyCost_pp_32x32, 4, 9, 15 lea r4, [3 * r1] lea r8, [3 * r3] mova m8, [hmul_8p] mova m10, [pw_1] mova m14, [pd_1] pxor m13, m13 mov r7d, 4 .loopH: mov r6d, 4 .loopW: pxor m12, m12 movddup m0, [r0] movddup m1, [r0 + r1] movddup m2, [r0 + r1 * 2] movddup m3, [r0 + r4] lea r5, [r0 + r1 * 4] movddup m4, [r5] movddup m5, [r5 + r1] movddup m6, [r5 + r1 * 2] movddup m7, [r5 + r4] pmaddubsw m0, m8 pmaddubsw m1, m8 pmaddubsw m2, m8 pmaddubsw m3, m8 pmaddubsw m4, m8 pmaddubsw m5, m8 pmaddubsw m6, m8 pmaddubsw m7, m8 paddw m11, m0, m1 paddw m11, m2 paddw m11, m3 paddw m11, m4 paddw m11, m5 paddw m11, m6 paddw m11, m7 pmaddwd m11, m10 psrldq m9, m11, 4 paddd m11, m9 psrld m11, 2 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 paddw m0, m1 paddw m0, m2 paddw m0, m3 HADDW m0, m1 paddd m0, m14 psrld m0, 1 psubd m12, m0, m11 movddup m0, [r2] movddup m1, [r2 + r3] movddup m2, [r2 + r3 * 2] movddup m3, [r2 + r8] lea r5, [r2 + r3 * 4] movddup m4, [r5] movddup m5, [r5 + r3] movddup m6, [r5 + r3 * 2] movddup m7, [r5 + r8] pmaddubsw m0, m8 pmaddubsw m1, m8 pmaddubsw m2, m8 pmaddubsw m3, m8 pmaddubsw m4, m8 pmaddubsw m5, m8 pmaddubsw m6, m8 pmaddubsw m7, m8 paddw m11, m0, m1 paddw m11, m2 paddw m11, m3 paddw m11, m4 paddw m11, m5 paddw m11, m6 paddw m11, m7 pmaddwd m11, m10 psrldq m9, m11, 4 paddd m11, m9 psrld m11, 2 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 paddw m0, m1 paddw m0, m2 paddw m0, m3 HADDW m0, m1 paddd m0, m14 psrld m0, 1 psubd m0, m11 psubd m12, m0 pabsd m0, m12 paddd m13, m0 add r0, 8 add r2, 8 dec r6d jnz .loopW lea r0, [r0 + r1 * 8 - 32] lea r2, [r2 + r3 * 8 - 32] dec r7d jnz .loopH movd eax, m13 RET %endif ; HIGH_BIT_DEPTH %endif %if ARCH_X86_64 %if HIGH_BIT_DEPTH INIT_XMM sse4 cglobal psyCost_pp_64x64, 4, 9, 14 FIX_STRIDES r1, r3 lea r4, [3 * r1] lea r8, [3 * r3] mova m12, [pw_1] mova m13, [pd_1] pxor m11, m11 mov r7d, 8 .loopH: mov r6d, 8 .loopW: pxor m10, m10 movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] movu m3, [r0 + r4] lea r5, [r0 + r1 * 4] movu m4, [r5] movu m5, [r5 + r1] movu m6, [r5 + r1 * 2] movu m7, [r5 + r4] paddw m8, m0, m1 paddw m8, m2 paddw m8, m3 paddw m8, m4 paddw m8, m5 paddw m8, m6 paddw m8, m7 pmaddwd m8, m12 movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax paddd m0, m1 paddd m0, m2 paddd m0, m3 HADDUW m0, m1 paddd m0, m13 psrld m0, 1 psubd m10, m0, m8 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + r3 * 2] movu m3, [r2 + r8] lea r5, [r2 + r3 * 4] movu m4, [r5] movu m5, [r5 + r3] movu m6, [r5 + r3 * 2] movu m7, [r5 + r8] paddw m8, m0, m1 paddw m8, m2 paddw m8, m3 paddw m8, m4 paddw m8, m5 paddw m8, m6 paddw m8, m7 pmaddwd m8, m12 movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax paddd m0, m1 paddd m0, m2 paddd m0, m3 HADDUW m0, m1 paddd m0, m13 psrld m0, 1 psubd m0, m8 psubd m10, m0 pabsd m0, m10 paddd m11, m0 add r0, 16 add r2, 16 dec r6d jnz .loopW lea r0, [r0 + r1 * 8 - 128] lea r2, [r2 + r3 * 8 - 128] dec r7d jnz .loopH movd eax, m11 RET %else ; !HIGH_BIT_DEPTH INIT_XMM sse4 cglobal psyCost_pp_64x64, 4, 9, 15 lea r4, [3 * 
r1] lea r8, [3 * r3] mova m8, [hmul_8p] mova m10, [pw_1] mova m14, [pd_1] pxor m13, m13 mov r7d, 8 .loopH: mov r6d, 8 .loopW: pxor m12, m12 movddup m0, [r0] movddup m1, [r0 + r1] movddup m2, [r0 + r1 * 2] movddup m3, [r0 + r4] lea r5, [r0 + r1 * 4] movddup m4, [r5] movddup m5, [r5 + r1] movddup m6, [r5 + r1 * 2] movddup m7, [r5 + r4] pmaddubsw m0, m8 pmaddubsw m1, m8 pmaddubsw m2, m8 pmaddubsw m3, m8 pmaddubsw m4, m8 pmaddubsw m5, m8 pmaddubsw m6, m8 pmaddubsw m7, m8 paddw m11, m0, m1 paddw m11, m2 paddw m11, m3 paddw m11, m4 paddw m11, m5 paddw m11, m6 paddw m11, m7 pmaddwd m11, m10 psrldq m9, m11, 4 paddd m11, m9 psrld m11, 2 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 paddw m0, m1 paddw m0, m2 paddw m0, m3 HADDW m0, m1 paddd m0, m14 psrld m0, 1 psubd m12, m0, m11 movddup m0, [r2] movddup m1, [r2 + r3] movddup m2, [r2 + r3 * 2] movddup m3, [r2 + r8] lea r5, [r2 + r3 * 4] movddup m4, [r5] movddup m5, [r5 + r3] movddup m6, [r5 + r3 * 2] movddup m7, [r5 + r8] pmaddubsw m0, m8 pmaddubsw m1, m8 pmaddubsw m2, m8 pmaddubsw m3, m8 pmaddubsw m4, m8 pmaddubsw m5, m8 pmaddubsw m6, m8 pmaddubsw m7, m8 paddw m11, m0, m1 paddw m11, m2 paddw m11, m3 paddw m11, m4 paddw m11, m5 paddw m11, m6 paddw m11, m7 pmaddwd m11, m10 psrldq m9, m11, 4 paddd m11, m9 psrld m11, 2 HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 paddw m0, m1 paddw m0, m2 paddw m0, m3 HADDW m0, m1 paddd m0, m14 psrld m0, 1 psubd m0, m11 psubd m12, m0 pabsd m0, m12 paddd m13, m0 add r0, 8 add r2, 8 dec r6d jnz .loopW lea r0, [r0 + r1 * 8 - 64] lea r2, [r2 + r3 * 8 - 64] dec r7d jnz .loopH movd eax, m13 RET %endif ; HIGH_BIT_DEPTH %endif INIT_YMM avx2 %if HIGH_BIT_DEPTH cglobal psyCost_pp_4x4, 4, 5, 6 add r1d, r1d add r3d, r3d lea r4, [r1 * 3] movddup xm0, [r0] movddup xm1, [r0 + r1] movddup xm2, [r0 + r1 * 2] movddup xm3, [r0 + r4] lea r4, [r3 * 3] movddup xm4, [r2] movddup xm5, [r2 + r3] vinserti128 m0, m0, xm4, 1 vinserti128 m1, m1, xm5, 1 movddup xm4, [r2 + r3 * 2] movddup xm5, [r2 + r4] vinserti128 m2, m2, xm4, 1 vinserti128 m3, m3, xm5, 1 mova m4, [hmul_8w] pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m4 pmaddwd m3, m4 paddd m5, m0, m1 paddd m4, m2, m3 paddd m5, m4 psrldq m4, m5, 4 paddd m5, m4 psrld m5, 2 mova m4, m0 paddd m0, m1 psubd m1, m4 mova m4, m2 paddd m2, m3 psubd m3, m4 mova m4, m0 paddd m0, m2 psubd m2, m4 mova m4, m1 paddd m1, m3 psubd m3, m4 movaps m4, m0 vshufps m4, m4, m2, 11011101b vshufps m0, m0, m2, 10001000b movaps m2, m1 vshufps m2, m2, m3, 11011101b vshufps m1, m1, m3, 10001000b pabsd m0, m0 pabsd m4, m4 pmaxsd m0, m4 pabsd m1, m1 pabsd m2, m2 pmaxsd m1, m2 paddd m0, m1 vpermq m1, m0, 11110101b paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 psubd m0, m5 vextracti128 xm1, m0, 1 psubd xm1, xm0 pabsd xm1, xm1 movd eax, xm1 RET %else ; !HIGH_BIT_DEPTH cglobal psyCost_pp_4x4, 4, 5, 6 lea r4, [3 * r1] movd xm0, [r0] movd xm1, [r0 + r1] movd xm2, [r0 + r1 * 2] movd xm3, [r0 + r4] vshufps xm0, xm1, 0 vshufps xm2, xm3, 0 lea r4, [3 * r3] movd xm1, [r2] movd xm3, [r2 + r3] movd xm4, [r2 + r3 * 2] movd xm5, [r2 + r4] vshufps xm1, xm3, 0 vshufps xm4, xm5, 0 vinserti128 m0, m0, xm1, 1 vinserti128 m2, m2, xm4, 1 mova m4, [hmul_4p] pmaddubsw m0, m4 pmaddubsw m2, m4 paddw m5, m0, m2 mova m1, m5 psrldq m4, m5, 8 paddw m5, m4 pmaddwd m5, [pw_1] psrld m5, 2 vpsubw m2, m2, m0 vpunpckhqdq m0, m1, m2 vpunpcklqdq m1, m1, m2 vpaddw m2, m1, m0 vpsubw m0, m0, m1 vpblendw m1, m2, m0, 10101010b vpslld m0, m0, 10h vpsrld m2, m2, 10h vpor m0, m0, m2 vpabsw m1, m1 vpabsw m0, m0 vpmaxsw m1, m1, m0 vpmaddwd m1, m1, [pw_1] psrldq m2, m1, 8 paddd m1, 
m2 psrldq m3, m1, 4 paddd m1, m3 psubd m1, m5 vextracti128 xm2, m1, 1 psubd m1, m2 pabsd m1, m1 movd eax, xm1 RET %endif %macro PSY_PP_8x8 0 movddup m0, [r0 + r1 * 0] movddup m1, [r0 + r1 * 1] movddup m2, [r0 + r1 * 2] movddup m3, [r0 + r4 * 1] lea r5, [r0 + r1 * 4] movddup m4, [r2 + r3 * 0] movddup m5, [r2 + r3 * 1] movddup m6, [r2 + r3 * 2] movddup m7, [r2 + r7 * 1] lea r6, [r2 + r3 * 4] vinserti128 m0, m0, xm4, 1 vinserti128 m1, m1, xm5, 1 vinserti128 m2, m2, xm6, 1 vinserti128 m3, m3, xm7, 1 movddup m4, [r5 + r1 * 0] movddup m5, [r5 + r1 * 1] movddup m6, [r5 + r1 * 2] movddup m7, [r5 + r4 * 1] movddup m9, [r6 + r3 * 0] movddup m10, [r6 + r3 * 1] movddup m11, [r6 + r3 * 2] movddup m12, [r6 + r7 * 1] vinserti128 m4, m4, xm9, 1 vinserti128 m5, m5, xm10, 1 vinserti128 m6, m6, xm11, 1 vinserti128 m7, m7, xm12, 1 pmaddubsw m0, m8 pmaddubsw m1, m8 pmaddubsw m2, m8 pmaddubsw m3, m8 pmaddubsw m4, m8 pmaddubsw m5, m8 pmaddubsw m6, m8 pmaddubsw m7, m8 paddw m11, m0, m1 paddw m11, m2 paddw m11, m3 paddw m11, m4 paddw m11, m5 paddw m11, m6 paddw m11, m7 pmaddwd m11, [pw_1] psrldq m10, m11, 4 paddd m11, m10 psrld m11, 2 mova m9, m0 paddw m0, m1 ; m0+m1 psubw m1, m9 ; m1-m0 mova m9, m2 paddw m2, m3 ; m2+m3 psubw m3, m9 ; m3-m2 mova m9, m0 paddw m0, m2 ; m0+m1+m2+m3 psubw m2, m9 ; m2+m3-m0+m1 mova m9, m1 paddw m1, m3 ; m1-m0+m3-m2 psubw m3, m9 ; m3-m2-m1-m0 movdqa m9, m4 paddw m4, m5 ; m4+m5 psubw m5, m9 ; m5-m4 movdqa m9, m6 paddw m6, m7 ; m6+m7 psubw m7, m9 ; m7-m6 movdqa m9, m4 paddw m4, m6 ; m4+m5+m6+m7 psubw m6, m9 ; m6+m7-m4+m5 movdqa m9, m5 paddw m5, m7 ; m5-m4+m7-m6 psubw m7, m9 ; m7-m6-m5-m4 movdqa m9, m0 paddw m0, m4 ; (m0+m1+m2+m3)+(m4+m5+m6+m7) psubw m4, m9 ; (m4+m5+m6+m7)-(m0+m1+m2+m3) movdqa m9, m1 paddw m1, m5 ; (m1-m0+m3-m2)+(m5-m4+m7-m6) psubw m5, m9 ; (m5-m4+m7-m6)-(m1-m0+m3-m2) mova m9, m0 vshufps m9, m9, m4, 11011101b vshufps m0, m0, m4, 10001000b movdqa m4, m0 paddw m0, m9 ; (a0 + a4) + (a4 - a0) psubw m9, m4 ; (a0 + a4) - (a4 - a0) == (a0 + a4) + (a0 - a4) movaps m4, m1 vshufps m4, m4, m5, 11011101b vshufps m1, m1, m5, 10001000b movdqa m5, m1 paddw m1, m4 psubw m4, m5 movdqa m5, m2 paddw m2, m6 psubw m6, m5 movdqa m5, m3 paddw m3, m7 psubw m7, m5 movaps m5, m2 vshufps m5, m5, m6, 11011101b vshufps m2, m2, m6, 10001000b movdqa m6, m2 paddw m2, m5 psubw m5, m6 movaps m6, m3 vshufps m6, m6, m7, 11011101b vshufps m3, m3, m7, 10001000b movdqa m7, m3 paddw m3, m6 psubw m6, m7 movdqa m7, m0 pblendw m0, m9, 10101010b pslld m9, 10h psrld m7, 10h por m9, m7 pabsw m0, m0 pabsw m9, m9 pmaxsw m0, m9 movdqa m7, m1 pblendw m1, m4, 10101010b pslld m4, 10h psrld m7, 10h por m4, m7 pabsw m1, m1 pabsw m4, m4 pmaxsw m1, m4 movdqa m7, m2 pblendw m2, m5, 10101010b pslld m5, 10h psrld m7, 10h por m5, m7 pabsw m2, m2 pabsw m5, m5 pmaxsw m2, m5 mova m7, m3 pblendw m3, m6, 10101010b pslld m6, 10h psrld m7, 10h por m6, m7 pabsw m3, m3 pabsw m6, m6 pmaxsw m3, m6 paddw m0, m1 paddw m0, m2 paddw m0, m3 pmaddwd m0, [pw_1] psrldq m1, m0, 8 paddd m0, m1 pshuflw m1, m0, 00001110b paddd m0, m1 paddd m0, [pd_1] psrld m0, 1 psubd m0, m11 vextracti128 xm1, m0, 1 psubd m0, m1 pabsd m0, m0 %endmacro %macro PSY_PP_8x8_AVX2 0 lea r4, [r1 * 3] movu xm0, [r0] movu xm1, [r0 + r1] movu xm2, [r0 + r1 * 2] movu xm3, [r0 + r4] lea r5, [r0 + r1 * 4] movu xm4, [r5] movu xm5, [r5 + r1] movu xm6, [r5 + r1 * 2] movu xm7, [r5 + r4] lea r4, [r3 * 3] vinserti128 m0, m0, [r2], 1 vinserti128 m1, m1, [r2 + r3], 1 vinserti128 m2, m2, [r2 + r3 * 2], 1 vinserti128 m3, m3, [r2 + r4], 1 lea r5, [r2 + r3 * 4] vinserti128 m4, m4, [r5], 1 
vinserti128 m5, m5, [r5 + r3], 1 vinserti128 m6, m6, [r5 + r3 * 2], 1 vinserti128 m7, m7, [r5 + r4], 1 paddw m8, m0, m1 paddw m8, m2 paddw m8, m3 paddw m8, m4 paddw m8, m5 paddw m8, m6 paddw m8, m7 pmaddwd m8, [pw_1] psrldq m9, m8, 8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 psubw m9, m1, m0 paddw m0, m1 psubw m1, m3, m2 paddw m2, m3 punpckhwd m3, m0, m9 punpcklwd m0, m9 psubw m9, m3, m0 paddw m0, m3 punpckhwd m3, m2, m1 punpcklwd m2, m1 psubw m10, m3, m2 paddw m2, m3 psubw m3, m5, m4 paddw m4, m5 psubw m5, m7, m6 paddw m6, m7 punpckhwd m1, m4, m3 punpcklwd m4, m3 psubw m7, m1, m4 paddw m4, m1 punpckhwd m3, m6, m5 punpcklwd m6, m5 psubw m1, m3, m6 paddw m6, m3 psubw m3, m2, m0 paddw m0, m2 psubw m2, m10, m9 paddw m9, m10 punpckhdq m5, m0, m3 punpckldq m0, m3 psubw m10, m5, m0 paddw m0, m5 punpckhdq m3, m9, m2 punpckldq m9, m2 psubw m5, m3, m9 paddw m9, m3 psubw m3, m6, m4 paddw m4, m6 psubw m6, m1, m7 paddw m7, m1 punpckhdq m2, m4, m3 punpckldq m4, m3 psubw m1, m2, m4 paddw m4, m2 punpckhdq m3, m7, m6 punpckldq m7, m6 psubw m2, m3, m7 paddw m7, m3 psubw m3, m4, m0 paddw m0, m4 psubw m4, m1, m10 paddw m10, m1 punpckhqdq m6, m0, m3 punpcklqdq m0, m3 pabsw m0, m0 pabsw m6, m6 pmaxsw m0, m6 punpckhqdq m3, m10, m4 punpcklqdq m10, m4 pabsw m10, m10 pabsw m3, m3 pmaxsw m10, m3 psubw m3, m7, m9 paddw m9, m7 psubw m7, m2, m5 paddw m5, m2 punpckhqdq m4, m9, m3 punpcklqdq m9, m3 pabsw m9, m9 pabsw m4, m4 pmaxsw m9, m4 punpckhqdq m3, m5, m7 punpcklqdq m5, m7 pabsw m5, m5 pabsw m3, m3 pmaxsw m5, m3 paddd m0, m9 paddd m0, m10 paddd m0, m5 psrld m9, m0, 16 pslld m0, 16 psrld m0, 16 paddd m0, m9 psrldq m9, m0, 8 paddd m0, m9 psrldq m9, m0, 4 paddd m0, m9 paddd m0, [pd_1] psrld m0, 1 psubd m0, m8 vextracti128 xm1, m0, 1 psubd xm1, xm0 pabsd xm1, xm1 %endmacro %if ARCH_X86_64 %if HIGH_BIT_DEPTH cglobal psyCost_pp_8x8, 4, 8, 11 add r1d, r1d add r3d, r3d PSY_PP_8x8_AVX2 movd eax, xm1 RET %else ; !HIGH_BIT_DEPTH INIT_YMM avx2 cglobal psyCost_pp_8x8, 4, 8, 13 lea r4, [3 * r1] lea r7, [3 * r3] mova m8, [hmul_8p] PSY_PP_8x8 movd eax, xm0 RET %endif %endif %if ARCH_X86_64 INIT_YMM avx2 %if HIGH_BIT_DEPTH cglobal psyCost_pp_16x16, 4, 10, 12 add r1d, r1d add r3d, r3d pxor m11, m11 mov r8d, 2 .loopH: mov r9d, 2 .loopW: PSY_PP_8x8_AVX2 paddd xm11, xm1 add r0, 16 add r2, 16 dec r9d jnz .loopW lea r0, [r0 + r1 * 8 - 32] lea r2, [r2 + r3 * 8 - 32] dec r8d jnz .loopH movd eax, xm11 RET %else ; !HIGH_BIT_DEPTH cglobal psyCost_pp_16x16, 4, 10, 14 lea r4, [3 * r1] lea r7, [3 * r3] mova m8, [hmul_8p] pxor m13, m13 mov r8d, 2 .loopH: mov r9d, 2 .loopW: PSY_PP_8x8 paddd m13, m0 add r0, 8 add r2, 8 dec r9d jnz .loopW lea r0, [r0 + r1 * 8 - 16] lea r2, [r2 + r3 * 8 - 16] dec r8d jnz .loopH movd eax, xm13 RET %endif %endif %if ARCH_X86_64 INIT_YMM avx2 %if HIGH_BIT_DEPTH cglobal psyCost_pp_32x32, 4, 10, 12 add r1d, r1d add r3d, r3d pxor m11, m11 mov r8d, 4 .loopH: mov r9d, 4 .loopW: PSY_PP_8x8_AVX2 paddd xm11, xm1 add r0, 16 add r2, 16 dec r9d jnz .loopW lea r0, [r0 + r1 * 8 - 64] lea r2, [r2 + r3 * 8 - 64] dec r8d jnz .loopH movd eax, xm11 RET %else ; !HIGH_BIT_DEPTH cglobal psyCost_pp_32x32, 4, 10, 14 lea r4, [3 * r1] lea r7, [3 * r3] mova m8, [hmul_8p] pxor m13, m13 mov r8d, 4 .loopH: mov r9d, 4 .loopW: PSY_PP_8x8 paddd m13, m0 add r0, 8 add r2, 8 dec r9d jnz .loopW lea r0, [r0 + r1 * 8 - 32] lea r2, [r2 + r3 * 8 - 32] dec r8d jnz .loopH movd eax, xm13 RET %endif %endif %if ARCH_X86_64 INIT_YMM avx2 %if HIGH_BIT_DEPTH cglobal psyCost_pp_64x64, 4, 10, 12 add r1d, r1d add r3d, r3d pxor m11, m11 mov r8d, 8 .loopH: mov 
r9d, 8 .loopW: PSY_PP_8x8_AVX2 paddd xm11, xm1 add r0, 16 add r2, 16 dec r9d jnz .loopW lea r0, [r0 + r1 * 8 - 128] lea r2, [r2 + r3 * 8 - 128] dec r8d jnz .loopH movd eax, xm11 RET %else ; !HIGH_BIT_DEPTH cglobal psyCost_pp_64x64, 4, 10, 14 lea r4, [3 * r1] lea r7, [3 * r3] mova m8, [hmul_8p] pxor m13, m13 mov r8d, 8 .loopH: mov r9d, 8 .loopW: PSY_PP_8x8 paddd m13, m0 add r0, 8 add r2, 8 dec r9d jnz .loopW lea r0, [r0 + r1 * 8 - 64] lea r2, [r2 + r3 * 8 - 64] dec r8d jnz .loopH movd eax, xm13 RET %endif %endif ;--------------------------------------------------------------------------------------------------------------------- ;int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride) ;--------------------------------------------------------------------------------------------------------------------- INIT_XMM sse4 cglobal psyCost_ss_4x4, 4, 5, 8 add r1, r1 lea r4, [3 * r1] movddup m0, [r0] movddup m1, [r0 + r1] movddup m2, [r0 + r1 * 2] movddup m3, [r0 + r4] pabsw m4, m0 pabsw m5, m1 paddw m5, m4 pabsw m4, m2 paddw m5, m4 pabsw m4, m3 paddw m5, m4 pmaddwd m5, [pw_1] psrldq m4, m5, 4 paddd m5, m4 psrld m6, m5, 2 mova m4, [hmul_8w] pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m4 pmaddwd m3, m4 psrldq m4, m0, 4 psubd m5, m0, m4 paddd m0, m4 shufps m0, m5, 10001000b psrldq m4, m1, 4 psubd m5, m1, m4 paddd m1, m4 shufps m1, m5, 10001000b psrldq m4, m2, 4 psubd m5, m2, m4 paddd m2, m4 shufps m2, m5, 10001000b psrldq m4, m3, 4 psubd m5, m3, m4 paddd m3, m4 shufps m3, m5, 10001000b mova m4, m0 paddd m0, m1 psubd m1, m4 mova m4, m2 paddd m2, m3 psubd m3, m4 mova m4, m0 paddd m0, m2 psubd m2, m4 mova m4, m1 paddd m1, m3 psubd m3, m4 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 paddd m0, m2 paddd m1, m3 paddd m0, m1 movhlps m1, m0 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 psrld m0, 1 psubd m7, m0, m6 add r3, r3 lea r4, [3 * r3] movddup m0, [r2] movddup m1, [r2 + r3] movddup m2, [r2 + r3 * 2] movddup m3, [r2 + r4] pabsw m4, m0 pabsw m5, m1 paddw m5, m4 pabsw m4, m2 paddw m5, m4 pabsw m4, m3 paddw m5, m4 pmaddwd m5, [pw_1] psrldq m4, m5, 4 paddd m5, m4 psrld m6, m5, 2 mova m4, [hmul_8w] pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m4 pmaddwd m3, m4 psrldq m4, m0, 4 psubd m5, m0, m4 paddd m0, m4 shufps m0, m5, 10001000b psrldq m4, m1, 4 psubd m5, m1, m4 paddd m1, m4 shufps m1, m5, 10001000b psrldq m4, m2, 4 psubd m5, m2, m4 paddd m2, m4 shufps m2, m5, 10001000b psrldq m4, m3, 4 psubd m5, m3, m4 paddd m3, m4 shufps m3, m5, 10001000b mova m4, m0 paddd m0, m1 psubd m1, m4 mova m4, m2 paddd m2, m3 psubd m3, m4 mova m4, m0 paddd m0, m2 psubd m2, m4 mova m4, m1 paddd m1, m3 psubd m3, m4 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 paddd m0, m2 paddd m1, m3 paddd m0, m1 movhlps m1, m0 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 psrld m0, 1 psubd m0, m6 psubd m7, m0 pabsd m0, m7 movd eax, m0 RET %if ARCH_X86_64 INIT_XMM sse4 cglobal psyCost_ss_8x8, 4, 6, 15 mova m13, [pw_pmpmpmpm] mova m14, [pw_1] add r1, r1 add r3, r3 lea r4, [3 * r1] movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] movu m3, [r0 + r4] lea r5, [r0 + r1 * 4] movu m4, [r5] movu m5, [r5 + r1] movu m6, [r5 + r1 * 2] movu m7, [r5 + r4] pabsw m8, m0 pabsw m9, m1 paddw m8, m9 pabsw m10, m2 pabsw m11, m3 paddw m10, m11 paddw m8, m10 pabsw m9, m4 pabsw m10, m5 paddw m9, m10 pabsw m11, m6 pabsw m12, m7 paddw m11, m12 paddw m9, m11 paddw m8, m9 movhlps m9, m8 pmovzxwd m8, m8 pmovzxwd m9, m9 paddd m8, m9 movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 pmaddwd m0, m13 pmaddwd m1, 
m13 pmaddwd m2, m13 pmaddwd m3, m13 psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 pmaddwd m4, m13 pmaddwd m5, m13 pmaddwd m6, m13 pmaddwd m7, m13 psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m11, m0, m7 movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] movu m3, [r0 + r4] pmaddwd m0, m14 pmaddwd m1, m14 pmaddwd m2, m14 pmaddwd m3, m14 psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 movu m4, [r5] movu m5, [r5 + r1] movu m6, [r5 + r1 * 2] movu m7, [r5 + r4] pmaddwd m4, m14 pmaddwd m5, m14 pmaddwd m6, m14 pmaddwd m7, m14 psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 
paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m0, m7 paddd m0, m11 movhlps m1, m0 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 paddd m0, [pd_2] psrld m0, 2 psubd m12, m0, m8 lea r4, [3 * r3] movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + r3 * 2] movu m3, [r2 + r4] lea r5, [r2 + r3 * 4] movu m4, [r5] movu m5, [r5 + r3] movu m6, [r5 + r3 * 2] movu m7, [r5 + r4] pabsw m8, m0 pabsw m9, m1 paddw m8, m9 pabsw m10, m2 pabsw m11, m3 paddw m10, m11 paddw m8, m10 pabsw m9, m4 pabsw m10, m5 paddw m9, m10 pabsw m11, m6 pabsw m10, m7 paddw m11, m10 paddw m9, m11 paddw m8, m9 movhlps m9, m8 pmovzxwd m8, m8 pmovzxwd m9, m9 paddd m8, m9 movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 pmaddwd m0, m13 pmaddwd m1, m13 pmaddwd m2, m13 pmaddwd m3, m13 psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 pmaddwd m4, m13 pmaddwd m5, m13 pmaddwd m6, m13 pmaddwd m7, m13 psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m11, m0, m7 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + r3 * 2] movu m3, [r2 + r4] pmaddwd m0, m14 pmaddwd m1, m14 pmaddwd m2, m14 pmaddwd m3, m14 psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 movu m4, [r5] movu m5, [r5 + r3] movu m6, [r5 + r3 * 2] movu m7, [r5 + r4] pmaddwd m4, m14 pmaddwd m5, m14 pmaddwd m6, m14 pmaddwd m7, m14 psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m4, 4 psubd 
m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m0, m7 paddd m0, m11 movhlps m1, m0 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 paddd m0, [pd_2] psrld m0, 2 psubd m0, m8 psubd m12, m0 pabsd m0, m12 movd eax, m0 RET %endif %macro psy_cost_ss 0 movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] movu m3, [r0 + r4] lea r5, [r0 + r1 * 4] movu m4, [r5] movu m5, [r5 + r1] movu m6, [r5 + r1 * 2] movu m7, [r5 + r4] pabsw m8, m0 pabsw m9, m1 paddw m8, m9 pabsw m10, m2 pabsw m11, m3 paddw m10, m11 paddw m8, m10 pabsw m9, m4 pabsw m10, m5 paddw m9, m10 pabsw m11, m6 pabsw m12, m7 paddw m11, m12 paddw m9, m11 paddw m8, m9 movhlps m9, m8 pmovzxwd m8, m8 pmovzxwd m9, m9 paddd m8, m9 movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 pmaddwd m0, m13 pmaddwd m1, m13 pmaddwd m2, m13 pmaddwd m3, m13 psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 pmaddwd m4, m13 pmaddwd m5, m13 pmaddwd m6, m13 pmaddwd m7, m13 psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m11, m0, m7 movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] movu m3, [r0 + r4] pmaddwd m0, m14 pmaddwd m1, m14 pmaddwd m2, m14 pmaddwd m3, m14 psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m0, 4 psubd 
m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 movu m4, [r5] movu m5, [r5 + r1] movu m6, [r5 + r1 * 2] movu m7, [r5 + r4] pmaddwd m4, m14 pmaddwd m5, m14 pmaddwd m6, m14 pmaddwd m7, m14 psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m0, m7 paddd m0, m11 movhlps m1, m0 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 paddd m0, [pd_2] psrld m0, 2 psubd m12, m0, m8 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + r3 * 2] movu m3, [r2 + r6] lea r5, [r2 + r3 * 4] movu m4, [r5] movu m5, [r5 + r3] movu m6, [r5 + r3 * 2] movu m7, [r5 + r6] pabsw m8, m0 pabsw m9, m1 paddw m8, m9 pabsw m10, m2 pabsw m11, m3 paddw m10, m11 paddw m8, m10 pabsw m9, m4 pabsw m10, m5 paddw m9, m10 pabsw m11, m6 pabsw m10, m7 paddw m11, m10 paddw m9, m11 paddw m8, m9 movhlps m9, m8 pmovzxwd m8, m8 pmovzxwd m9, m9 paddd m8, m9 movhlps m9, m8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 pmaddwd m0, m13 pmaddwd m1, m13 pmaddwd m2, m13 pmaddwd m3, m13 psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 pmaddwd m4, m13 pmaddwd m5, m13 pmaddwd m6, m13 pmaddwd m7, m13 psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b 
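    ; same two-stage butterfly on the last row (m7): each psrldq/psubd/paddd/
    ; shufps quartet maps adjacent dword pairs to [a0+a1, a2+a3, a0-a1, a2-a3]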
psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m11, m0, m7 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + r3 * 2] movu m3, [r2 + r6] pmaddwd m0, m14 pmaddwd m1, m14 pmaddwd m2, m14 pmaddwd m3, m14 psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 shufps m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 shufps m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 shufps m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 shufps m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 movu m4, [r5] movu m5, [r5 + r3] movu m6, [r5 + r3 * 2] movu m7, [r5 + r6] pmaddwd m4, m14 pmaddwd m5, m14 pmaddwd m6, m14 pmaddwd m7, m14 psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 shufps m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 shufps m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 shufps m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 shufps m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m0, m7 paddd m0, m11 movhlps m1, m0 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 paddd m0, [pd_2] psrld m0, 2 psubd m0, m8 psubd m12, m0 pabsd m0, m12 paddd m15, m0 %endmacro %if ARCH_X86_64 INIT_XMM sse4 cglobal psyCost_ss_16x16, 4, 9, 16 mova m13, [pw_pmpmpmpm] mova m14, [pw_1] add r1, r1 add r3, r3 lea r4, [3 * r1] lea r6, [3 * r3] pxor m15, m15 mov r7d, 2 .loopH: mov r8d, 2 .loopW: psy_cost_ss add r0, 16 add r2, 16 dec r8d jnz .loopW lea r0, [r0 + r1 * 8 - 32] lea r2, [r2 + r3 * 8 - 32] dec r7d jnz .loopH movd eax, m15 RET %endif %if ARCH_X86_64 INIT_XMM sse4 cglobal psyCost_ss_32x32, 4, 9, 16 mova m13, [pw_pmpmpmpm] mova m14, [pw_1] add r1, r1 add r3, r3 lea r4, [3 * r1] lea r6, [3 * r3] pxor m15, m15 mov r7d, 4 .loopH: mov r8d, 4 .loopW: psy_cost_ss add r0, 16 add r2, 16 dec r8d jnz .loopW lea r0, [r0 + r1 * 8 - 64] lea r2, [r2 + r3 * 8 - 64] dec r7d jnz .loopH movd eax, m15 RET %endif %if ARCH_X86_64 INIT_XMM sse4 cglobal psyCost_ss_64x64, 4, 9, 16 mova m13, [pw_pmpmpmpm] mova m14, [pw_1] add r1, r1 add r3, r3 lea r4, [3 * r1] lea r6, [3 * r3] pxor m15, m15 mov r7d, 8 .loopH: mov r8d, 8 .loopW: psy_cost_ss add r0, 
16 add r2, 16 dec r8d jnz .loopW lea r0, [r0 + r1 * 8 - 128] lea r2, [r2 + r3 * 8 - 128] dec r7d jnz .loopH movd eax, m15 RET %endif INIT_YMM avx2 cglobal psyCost_ss_4x4, 4, 5, 8 add r1, r1 add r3, r3 lea r4, [3 * r1] movddup m0, [r0] movddup m1, [r0 + r1] movddup m2, [r0 + r1 * 2] movddup m3, [r0 + r4] lea r4, [3 * r3] movddup m4, [r2] movddup m5, [r2 + r3] movddup m6, [r2 + r3 * 2] movddup m7, [r2 + r4] vinserti128 m0, m0, xm4, 1 vinserti128 m1, m1, xm5, 1 vinserti128 m2, m2, xm6, 1 vinserti128 m3, m3, xm7, 1 pabsw m4, m0 pabsw m5, m1 paddw m5, m4 pabsw m4, m2 paddw m5, m4 pabsw m4, m3 paddw m5, m4 pmaddwd m5, [pw_1] psrldq m4, m5, 4 paddd m5, m4 psrld m6, m5, 2 mova m4, [hmul_8w] pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m4 pmaddwd m3, m4 psrldq m4, m0, 4 psubd m5, m0, m4 paddd m0, m4 shufps m0, m0, m5, 10001000b psrldq m4, m1, 4 psubd m5, m1, m4 paddd m1, m4 shufps m1, m1, m5, 10001000b psrldq m4, m2, 4 psubd m5, m2, m4 paddd m2, m4 shufps m2, m2, m5, 10001000b psrldq m4, m3, 4 psubd m5, m3, m4 paddd m3, m4 shufps m3, m3, m5, 10001000b mova m4, m0 paddd m0, m1 psubd m1, m4 mova m4, m2 paddd m2, m3 psubd m3, m4 mova m4, m0 paddd m0, m2 psubd m2, m4 mova m4, m1 paddd m1, m3 psubd m3, m4 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 paddd m0, m2 paddd m1, m3 paddd m0, m1 psrldq m1, m0, 8 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 psrld m0, 1 psubd m0, m6 vextracti128 xm1, m0, 1 psubd m0, m1 pabsd m0, m0 movd eax, xm0 RET %macro PSY_SS_8x8 0 lea r4, [3 * r1] lea r6, [r0 + r1 * 4] movu xm0, [r0] movu xm1, [r0 + r1] movu xm2, [r0 + r1 * 2] movu xm3, [r0 + r4] movu xm4, [r6] movu xm5, [r6 + r1] movu xm6, [r6 + r1 * 2] movu xm7, [r6 + r4] lea r4, [3 * r3] lea r6, [r2 + r3 * 4] movu xm8, [r2] movu xm9, [r2 + r3] movu xm10, [r2 + r3 * 2] movu xm11, [r2 + r4] vinserti128 m0, m0, xm8, 1 vinserti128 m1, m1, xm9, 1 vinserti128 m2, m2, xm10, 1 vinserti128 m3, m3, xm11, 1 movu xm8, [r6] movu xm9, [r6 + r3] movu xm10, [r6 + r3 * 2] movu xm11, [r6 + r4] vinserti128 m4, m4, xm8, 1 vinserti128 m5, m5, xm9, 1 vinserti128 m6, m6, xm10, 1 vinserti128 m7, m7, xm11, 1 ;; store on stack to use later mova [rsp + 0 * mmsize], m0 mova [rsp + 1 * mmsize], m1 mova [rsp + 2 * mmsize], m2 mova [rsp + 3 * mmsize], m3 mova [rsp + 4 * mmsize], m4 mova [rsp + 5 * mmsize], m5 mova [rsp + 6 * mmsize], m6 mova [rsp + 7 * mmsize], m7 pabsw m8, m0 pabsw m9, m1 paddw m8, m9 pabsw m10, m2 pabsw m11, m3 paddw m10, m11 paddw m8, m10 pabsw m9, m4 pabsw m10, m5 paddw m9, m10 pabsw m11, m6 pabsw m10, m7 paddw m11, m10 paddw m9, m11 paddw m8, m9 psrldq m9, m8, 8 vextracti128 xm10, m8, 1 vextracti128 xm11, m9, 1 vpmovzxwd m8, xm8 vpmovzxwd m9, xm9 vpmovzxwd m10, xm10 vpmovzxwd m11, xm11 vinserti128 m8, m8, xm10, 1 vinserti128 m9, m9, xm11, 1 paddd m8, m9 psrldq m9, m8, 8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 psrld m8, 2 ; sad_4x4 pmaddwd m0, m13 pmaddwd m1, m13 pmaddwd m2, m13 pmaddwd m3, m13 psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 vshufps m0, m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 vshufps m0, m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 vshufps m1, m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 vshufps m1, m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 vshufps m2, m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 vshufps m2, m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 vshufps m3, m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 vshufps m3, m3, m10, 10001000b SUMSUB_BA d, 
0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 pmaddwd m4, m13 pmaddwd m5, m13 pmaddwd m6, m13 pmaddwd m7, m13 psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 vshufps m4, m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 vshufps m4, m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 vshufps m5, m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 vshufps m5, m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 vshufps m6, m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 vshufps m6, m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 vshufps m7, m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 vshufps m7, m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m11, m0, m7 pmaddwd m0, m12, [rsp + 0 * mmsize] pmaddwd m1, m12, [rsp + 1 * mmsize] pmaddwd m2, m12, [rsp + 2 * mmsize] pmaddwd m3, m12, [rsp + 3 * mmsize] psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 vshufps m0, m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 vshufps m0, m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 vshufps m1, m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 vshufps m1, m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 vshufps m2, m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 vshufps m2, m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 vshufps m3, m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 vshufps m3, m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 pmaddwd m4, m12, [rsp + 4 * mmsize] pmaddwd m5, m12, [rsp + 5 * mmsize] pmaddwd m6, m12, [rsp + 6 * mmsize] pmaddwd m7, m12, [rsp + 7 * mmsize] psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 vshufps m4, m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 vshufps m4, m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 vshufps m5, m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 vshufps m5, m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 vshufps m6, m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 vshufps m6, m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 vshufps m7, m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 vshufps m7, m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 SUMSUB_BA d, 4, 6, 9 SUMSUB_BA d, 5, 7, 9 SUMSUB_BA d, 0, 4, 9 SUMSUB_BA d, 1, 5, 9 SUMSUB_BA d, 2, 6, 9 SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 pabsd m5, m5 pabsd m6, m6 pabsd m7, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd m5, m4 paddd m0, m5 paddd m7, m6 paddd m0, m7 paddd m0, m11 psrldq m1, m0, 8 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 paddd m0, [pd_2] psrld m0, 2 psubd m0, m8 vextracti128 xm1, m0, 1 psubd m0, m1 pabsd m0, m0 %endmacro %if ARCH_X86_64 INIT_YMM avx2 cglobal psyCost_ss_8x8, 4, 7, 14 ; NOTE: align stack to 64 bytes, so all of local data in same cache line mov r5, rsp sub rsp, 8*mmsize and rsp, ~63 mova m12, [pw_1] mova m13, [pw_pmpmpmpm] add r1, r1 add r3, r3 
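    ; The PSY_SS_8x8 expansion below evaluates both 8x8 int16 blocks at once
    ; (the r0 block in the low xmm lanes, the r2 block in the high lanes) and
    ; leaves |energy(pix0) - energy(pix1)| in xm0. A scalar sketch of the same
    ; arithmetic in illustrative C (a hand-written model, not the x265 C
    ; reference; the coefficient order differs, but the |coeff| sum does not):
    ;
    ;   #include <stdint.h>
    ;   #include <stdlib.h>
    ;
    ;   static void hadamard8_1d(int32_t a[8])
    ;   {
    ;       for (int s = 1; s < 8; s <<= 1)          /* 3 butterfly stages */
    ;           for (int i = 0; i < 8; i += 2 * s)
    ;               for (int j = i; j < i + s; j++) {
    ;                   int32_t u = a[j], v = a[j + s];
    ;                   a[j]     = u + v;
    ;                   a[j + s] = u - v;
    ;               }
    ;   }
    ;
    ;   static int energy_ss_8x8(const int16_t *blk, intptr_t stride)
    ;   {
    ;       int32_t t[8][8];
    ;       uint32_t sa8d = 0, sad = 0;
    ;       for (int i = 0; i < 8; i++) {            /* transform the rows */
    ;           for (int j = 0; j < 8; j++) {
    ;               t[i][j] = blk[i * stride + j];
    ;               sad += abs(t[i][j]);
    ;           }
    ;           hadamard8_1d(t[i]);
    ;       }
    ;       for (int j = 0; j < 8; j++) {            /* then the columns */
    ;           int32_t c[8];
    ;           for (int i = 0; i < 8; i++) c[i] = t[i][j];
    ;           hadamard8_1d(c);
    ;           for (int i = 0; i < 8; i++) sa8d += abs(c[i]);
    ;       }
    ;       return (int)(((sa8d + 2) >> 2) - (sad >> 2));
    ;   }
    ;
    ;   /* psyCost_ss_8x8(src, ss, rec, rs) == abs(energy_ss_8x8(src, ss) -
    ;    *                                         energy_ss_8x8(rec, rs)) */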
PSY_SS_8x8 movd eax, xm0 mov rsp, r5 RET %endif %if ARCH_X86_64 INIT_YMM avx2 cglobal psyCost_ss_16x16, 4, 9, 15 ; NOTE: align stack to 64 bytes, so all of local data in same cache line mov r5, rsp sub rsp, 8*mmsize and rsp, ~63 mova m12, [pw_1] mova m13, [pw_pmpmpmpm] add r1, r1 add r3, r3 pxor m14, m14 mov r7d, 2 .loopH: mov r8d, 2 .loopW: PSY_SS_8x8 paddd m14, m0 add r0, 16 add r2, 16 dec r8d jnz .loopW lea r0, [r0 + r1 * 8 - 32] lea r2, [r2 + r3 * 8 - 32] dec r7d jnz .loopH movd eax, xm14 mov rsp, r5 RET %endif %if ARCH_X86_64 INIT_YMM avx2 cglobal psyCost_ss_32x32, 4, 9, 15 ; NOTE: align stack to 64 bytes, so all of local data in same cache line mov r5, rsp sub rsp, 8*mmsize and rsp, ~63 mova m12, [pw_1] mova m13, [pw_pmpmpmpm] add r1, r1 add r3, r3 pxor m14, m14 mov r7d, 4 .loopH: mov r8d, 4 .loopW: PSY_SS_8x8 paddd m14, m0 add r0, 16 add r2, 16 dec r8d jnz .loopW lea r0, [r0 + r1 * 8 - 64] lea r2, [r2 + r3 * 8 - 64] dec r7d jnz .loopH movd eax, xm14 mov rsp, r5 RET %endif %if ARCH_X86_64 INIT_YMM avx2 cglobal psyCost_ss_64x64, 4, 9, 15 ; NOTE: align stack to 64 bytes, so all of local data in same cache line mov r5, rsp sub rsp, 8*mmsize and rsp, ~63 mova m12, [pw_1] mova m13, [pw_pmpmpmpm] add r1, r1 add r3, r3 pxor m14, m14 mov r7d, 8 .loopH: mov r8d, 8 .loopW: PSY_SS_8x8 paddd m14, m0 add r0, 16 add r2, 16 dec r8d jnz .loopW lea r0, [r0 + r1 * 8 - 128] lea r2, [r2 + r3 * 8 - 128] dec r7d jnz .loopH movd eax, xm14 mov rsp, r5 RET %endif ;;--------------------------------------------------------------- ;; SATD AVX2 ;; int pixel_satd(const pixel*, intptr_t, const pixel*, intptr_t) ;;--------------------------------------------------------------- ;; r0 - pix0 ;; r1 - pix0Stride ;; r2 - pix1 ;; r3 - pix1Stride %if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 INIT_YMM avx2 cglobal calc_satd_16x8 ; function to compute satd cost for 16 columns, 8 rows pxor m6, m6 vbroadcasti128 m0, [r0] vbroadcasti128 m4, [r2] vbroadcasti128 m1, [r0 + r1] vbroadcasti128 m5, [r2 + r3] pmaddubsw m4, m7 pmaddubsw m0, m7 pmaddubsw m5, m7 pmaddubsw m1, m7 psubw m0, m4 psubw m1, m5 vbroadcasti128 m2, [r0 + r1 * 2] vbroadcasti128 m4, [r2 + r3 * 2] vbroadcasti128 m3, [r0 + r4] vbroadcasti128 m5, [r2 + r5] pmaddubsw m4, m7 pmaddubsw m2, m7 pmaddubsw m5, m7 pmaddubsw m3, m7 psubw m2, m4 psubw m3, m5 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] paddw m4, m0, m1 psubw m1, m1, m0 paddw m0, m2, m3 psubw m3, m2 paddw m2, m4, m0 psubw m0, m4 paddw m4, m1, m3 psubw m3, m1 pabsw m2, m2 pabsw m0, m0 pabsw m4, m4 pabsw m3, m3 pblendw m1, m2, m0, 10101010b pslld m0, 16 psrld m2, 16 por m0, m2 pmaxsw m1, m0 paddw m6, m1 pblendw m2, m4, m3, 10101010b pslld m3, 16 psrld m4, 16 por m3, m4 pmaxsw m2, m3 paddw m6, m2 vbroadcasti128 m1, [r0] vbroadcasti128 m4, [r2] vbroadcasti128 m2, [r0 + r1] vbroadcasti128 m5, [r2 + r3] pmaddubsw m4, m7 pmaddubsw m1, m7 pmaddubsw m5, m7 pmaddubsw m2, m7 psubw m1, m4 psubw m2, m5 vbroadcasti128 m0, [r0 + r1 * 2] vbroadcasti128 m4, [r2 + r3 * 2] vbroadcasti128 m3, [r0 + r4] vbroadcasti128 m5, [r2 + r5] lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] pmaddubsw m4, m7 pmaddubsw m0, m7 pmaddubsw m5, m7 pmaddubsw m3, m7 psubw m0, m4 psubw m3, m5 paddw m4, m1, m2 psubw m2, m1 paddw m1, m0, m3 psubw m3, m0 paddw m0, m4, m1 psubw m1, m4 paddw m4, m2, m3 psubw m3, m2 pabsw m0, m0 pabsw m1, m1 pabsw m4, m4 pabsw m3, m3 pblendw m2, m0, m1, 10101010b pslld m1, 16 psrld m0, 16 por m1, m0 pmaxsw m2, m1 paddw m6, m2 pblendw m0, m4, m3, 10101010b pslld m3, 16 psrld m4, 16 por m3, m4 pmaxsw m0, m3 paddw m6, m0 vextracti128 
xm0, m6, 1 pmovzxwd m6, xm6 pmovzxwd m0, xm0 paddd m8, m6 paddd m9, m0 ret cglobal calc_satd_16x4 ; function to compute satd cost for 16 columns, 4 rows pxor m6, m6 vbroadcasti128 m0, [r0] vbroadcasti128 m4, [r2] vbroadcasti128 m1, [r0 + r1] vbroadcasti128 m5, [r2 + r3] pmaddubsw m4, m7 pmaddubsw m0, m7 pmaddubsw m5, m7 pmaddubsw m1, m7 psubw m0, m4 psubw m1, m5 vbroadcasti128 m2, [r0 + r1 * 2] vbroadcasti128 m4, [r2 + r3 * 2] vbroadcasti128 m3, [r0 + r4] vbroadcasti128 m5, [r2 + r5] pmaddubsw m4, m7 pmaddubsw m2, m7 pmaddubsw m5, m7 pmaddubsw m3, m7 psubw m2, m4 psubw m3, m5 paddw m4, m0, m1 psubw m1, m1, m0 paddw m0, m2, m3 psubw m3, m2 paddw m2, m4, m0 psubw m0, m4 paddw m4, m1, m3 psubw m3, m1 pabsw m2, m2 pabsw m0, m0 pabsw m4, m4 pabsw m3, m3 pblendw m1, m2, m0, 10101010b pslld m0, 16 psrld m2, 16 por m0, m2 pmaxsw m1, m0 paddw m6, m1 pblendw m2, m4, m3, 10101010b pslld m3, 16 psrld m4, 16 por m3, m4 pmaxsw m2, m3 paddw m6, m2 vextracti128 xm0, m6, 1 pmovzxwd m6, xm6 pmovzxwd m0, xm0 paddd m8, m6 paddd m9, m0 ret cglobal pixel_satd_16x4, 4,6,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 call calc_satd_16x4 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_16x12, 4,6,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 call calc_satd_16x8 call calc_satd_16x4 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_16x32, 4,6,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_16x64, 4,6,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_32x8, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_32x16, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_32x24, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 call calc_satd_16x8 call 
calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_32x32, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_32x64, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_48x64, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_64x16, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 48] lea r2, [r7 + 48] call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_64x32, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 48] lea r2, [r7 + 48] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_64x48, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, 
[3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 48] lea r2, [r7 + 48] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET cglobal pixel_satd_64x64, 4,8,10 ; if WIN64 && cpuflag(avx2) mova m7, [hmul_16p] lea r4, [3 * r1] lea r5, [3 * r3] pxor m8, m8 pxor m9, m9 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 16] lea r2, [r7 + 16] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 48] lea r2, [r7 + 48] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 paddd m8, m9 vextracti128 xm0, m8, 1 paddd xm0, xm8 movhlps xm1, xm0 paddd xm0, xm1 pshuflw xm1, xm0, q0032 paddd xm0, xm1 movd eax, xm0 RET %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 %if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1 INIT_YMM avx2 cglobal calc_satd_16x8 ; function to compute satd cost for 16 columns, 8 rows ; rows 0-3 movu m0, [r0] movu m4, [r2] psubw m0, m4 movu m1, [r0 + r1] movu m5, [r2 + r3] psubw m1, m5 movu m2, [r0 + r1 * 2] movu m4, [r2 + r3 * 2] psubw m2, m4 movu m3, [r0 + r4] movu m5, [r2 + r5] psubw m3, m5 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] paddw m4, m0, m1 psubw m1, m0 paddw m0, m2, m3 psubw m3, m2 punpckhwd m2, m4, m1 punpcklwd m4, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 paddw m3, m4, m0 psubw m0, m4 paddw m4, m2, m1 psubw m1, m2 punpckhdq m2, m3, m0 punpckldq m3, m0 paddw m0, m3, m2 psubw m2, m3 punpckhdq m3, m4, m1 punpckldq m4, m1 paddw m1, m4, m3 psubw m3, m4 punpckhqdq m4, m0, m1 punpcklqdq m0, m1 pabsw m0, m0 pabsw m4, m4 pmaxsw m0, m0, m4 punpckhqdq m1, m2, m3 punpcklqdq m2, m3 pabsw m2, m2 pabsw m1, m1 pmaxsw m2, m1 pxor m7, m7 mova m1, m0 punpcklwd m1, m7 paddd m6, m1 mova m1, m0 punpckhwd m1, m7 paddd m6, m1 pxor m7, m7 mova m1, m2 punpcklwd m1, m7 paddd m6, m1 mova m1, m2 punpckhwd m1, m7 paddd m6, m1 ; rows 4-7 movu m0, [r0] movu m4, [r2] psubw m0, m4 movu m1, [r0 + r1] movu m5, [r2 + r3] psubw m1, m5 movu m2, [r0 + r1 * 2] movu m4, [r2 + r3 * 2] psubw m2, m4 movu m3, [r0 + r4] movu m5, [r2 + r5] psubw m3, m5 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] paddw m4, m0, m1 psubw m1, m0 paddw m0, m2, m3 psubw m3, m2 punpckhwd m2, m4, m1 punpcklwd m4, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 paddw m3, m4, m0 psubw m0, m4 paddw m4, m2, m1 psubw m1, m2 punpckhdq m2, m3, m0 punpckldq m3, m0 paddw m0, m3, m2 psubw m2, m3 punpckhdq m3, m4, m1 punpckldq m4, m1 paddw m1, m4, 
m3 psubw m3, m4 punpckhqdq m4, m0, m1 punpcklqdq m0, m1 pabsw m0, m0 pabsw m4, m4 pmaxsw m0, m0, m4 punpckhqdq m1, m2, m3 punpcklqdq m2, m3 pabsw m2, m2 pabsw m1, m1 pmaxsw m2, m1 pxor m7, m7 mova m1, m0 punpcklwd m1, m7 paddd m6, m1 mova m1, m0 punpckhwd m1, m7 paddd m6, m1 pxor m7, m7 mova m1, m2 punpcklwd m1, m7 paddd m6, m1 mova m1, m2 punpckhwd m1, m7 paddd m6, m1 ret cglobal calc_satd_16x4 ; function to compute satd cost for 16 columns, 4 rows ; rows 0-3 movu m0, [r0] movu m4, [r2] psubw m0, m4 movu m1, [r0 + r1] movu m5, [r2 + r3] psubw m1, m5 movu m2, [r0 + r1 * 2] movu m4, [r2 + r3 * 2] psubw m2, m4 movu m3, [r0 + r4] movu m5, [r2 + r5] psubw m3, m5 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] paddw m4, m0, m1 psubw m1, m0 paddw m0, m2, m3 psubw m3, m2 punpckhwd m2, m4, m1 punpcklwd m4, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 paddw m3, m4, m0 psubw m0, m4 paddw m4, m2, m1 psubw m1, m2 punpckhdq m2, m3, m0 punpckldq m3, m0 paddw m0, m3, m2 psubw m2, m3 punpckhdq m3, m4, m1 punpckldq m4, m1 paddw m1, m4, m3 psubw m3, m4 punpckhqdq m4, m0, m1 punpcklqdq m0, m1 pabsw m0, m0 pabsw m4, m4 pmaxsw m0, m0, m4 punpckhqdq m1, m2, m3 punpcklqdq m2, m3 pabsw m2, m2 pabsw m1, m1 pmaxsw m2, m1 pxor m7, m7 mova m1, m0 punpcklwd m1, m7 paddd m6, m1 mova m1, m0 punpckhwd m1, m7 paddd m6, m1 pxor m7, m7 mova m1, m2 punpcklwd m1, m7 paddd m6, m1 mova m1, m2 punpckhwd m1, m7 paddd m6, m1 ret cglobal pixel_satd_16x4, 4,6,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 call calc_satd_16x4 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_16x8, 4,6,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_16x12, 4,6,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 call calc_satd_16x8 call calc_satd_16x4 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_16x16, 4,6,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_16x32, 4,6,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_16x64, 4,6,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_32x8, 4,8,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 mov r6, r0 mov r7, r2 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal 
pixel_satd_32x16, 4,8,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_32x24, 4,8,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_32x32, 4,8,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_32x64, 4,8,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_48x64, 4,8,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 64] lea r2, [r7 + 64] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_64x16, 4,8,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 64] lea r2, [r7 + 64] call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 96] lea r2, [r7 + 96] call calc_satd_16x8 call calc_satd_16x8 vextracti128 xm7, m6, 1 paddd xm6, xm7 pxor xm7, xm7 movhlps xm7, xm6 paddd xm6, xm7 pshufd xm7, xm6, 1 paddd xm6, xm7 movd eax, xm6 RET cglobal pixel_satd_64x32, 4,8,8 add r1d, r1d add r3d, r3d lea r4, [3 * r1] lea r5, [3 * r3] pxor m6, m6 mov r6, r0 mov r7, r2 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 32] lea r2, [r7 + 32] call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 call calc_satd_16x8 lea r0, [r6 + 64] 
    lea r2, [r7 + 64]
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    lea r0, [r6 + 96]
    lea r2, [r7 + 96]
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    vextracti128 xm7, m6, 1
    paddd xm6, xm7
    pxor xm7, xm7
    movhlps xm7, xm6
    paddd xm6, xm7
    pshufd xm7, xm6, 1
    paddd xm6, xm7
    movd eax, xm6
    RET

cglobal pixel_satd_64x48, 4,8,8
    add r1d, r1d
    add r3d, r3d
    lea r4, [3 * r1]
    lea r5, [3 * r3]
    pxor m6, m6
    mov r6, r0
    mov r7, r2
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    lea r0, [r6 + 32]
    lea r2, [r7 + 32]
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    lea r0, [r6 + 64]
    lea r2, [r7 + 64]
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    lea r0, [r6 + 96]
    lea r2, [r7 + 96]
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    vextracti128 xm7, m6, 1
    paddd xm6, xm7
    pxor xm7, xm7
    movhlps xm7, xm6
    paddd xm6, xm7
    pshufd xm7, xm6, 1
    paddd xm6, xm7
    movd eax, xm6
    RET

cglobal pixel_satd_64x64, 4,8,8
    add r1d, r1d
    add r3d, r3d
    lea r4, [3 * r1]
    lea r5, [3 * r3]
    pxor m6, m6
    mov r6, r0
    mov r7, r2
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    lea r0, [r6 + 32]
    lea r2, [r7 + 32]
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    lea r0, [r6 + 64]
    lea r2, [r7 + 64]
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    lea r0, [r6 + 96]
    lea r2, [r7 + 96]
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    call calc_satd_16x8
    vextracti128 xm7, m6, 1
    paddd xm6, xm7
    pxor xm7, xm7
    movhlps xm7, xm6
    paddd xm6, xm7
    pshufd xm7, xm6, 1
    paddd xm6, xm7
    movd eax, xm6
    RET
%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1

;-------------------------------------------------------------------------------------------------------------------------------------
; pixel planeClipAndMax(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix)
;-------------------------------------------------------------------------------------------------------------------------------------
%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
INIT_YMM avx2
cglobal planeClipAndMax, 5,7,8
    movd xm0, r5m
    vpbroadcastb m0, xm0                ; m0 = [min]
    vpbroadcastb m1, r6m                ; m1 = [max]
    pxor m2, m2                         ; m2 = sumLuma
    pxor m3, m3                         ; m3 = maxLumaLevel
    pxor m4, m4                         ; m4 = zero

    ; build the mask for the partial (width % mmsize) pixels
    mov r5d, r2d
    and r2d, ~(mmsize - 1)
    sub r5d, r2d
    lea r6, [pb_movemask_32 + mmsize]
    sub r6, r5
    movu m5, [r6]                       ; m5 = mask for the last partial columns

.loopH:
    lea r5d, [r2 - mmsize]

.loopW:
    movu m6, [r0 + r5]
    pmaxub m6, m0
    pminub m6, m1
    movu [r0 + r5], m6                  ; store back
    pmaxub m3, m6                       ; update maxLumaLevel
    psadbw m6, m4
    paddq m2, m6
    sub r5d, mmsize
    jge .loopW

    ; partial pixels
    movu m7, [r0 + r2]
    pmaxub m6, m7, m0
    pminub m6, m1
    pand m7, m5                         ; keep the invalid (unchanged) pixels
    pandn m6, m5, m6                    ; clear invalid pixels
    por m7, m6                          ; combine valid & invalid pixels
    movu [r0 + r2], m7                  ; store back
    pmaxub m3, m6                       ; update maxLumaLevel
    psadbw m6, m4
    paddq m2, m6

.next:
    add r0, r1
    dec r3d
    jg .loopH

    ; sumLuma
    vextracti128 xm0, m2, 1
    paddq xm0, xm2
    movhlps xm1, xm0
    paddq xm0, xm1
    movq [r4], xm0

    ; maxLumaLevel
    vextracti128 xm0, m3, 1
    pmaxub xm0, xm3
    movhlps xm3, xm0
    pmaxub xm0, xm3
    pmovzxbw xm0, xm0
    pxor xm0, [pb_movemask + 16]
    phminposuw xm0, xm0
    movd eax, xm0
    not al
    movzx eax, al
    RET
%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
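; For reference, the kernel above is behaviorally equivalent to the scalar C
; sketch below (a hand-written model with illustrative names, not the actual
; x265 C reference). Note how the asm recovers the byte maximum: it widens the
; per-lane maxima with pmovzxbw, complements them (pxor with all-ones words),
; takes the unsigned word minimum with phminposuw, then complements the low
; byte back (not al).
;
;   #include <stdint.h>
;
;   typedef uint8_t pixel;                       /* HIGH_BIT_DEPTH == 0 */
;
;   static pixel planeClipAndMax_ref(pixel *src, intptr_t stride, int width,
;                                    int height, uint64_t *outsum,
;                                    const pixel minPix, const pixel maxPix)
;   {
;       pixel maxLumaLevel = 0;
;       uint64_t sumLuma = 0;
;       for (int r = 0; r < height; r++, src += stride) {
;           for (int c = 0; c < width; c++) {
;               pixel v = src[c];                /* clip in place */
;               if (v < minPix) v = minPix;
;               if (v > maxPix) v = maxPix;
;               src[c] = v;
;               if (v > maxLumaLevel)            /* track the plane maximum */
;                   maxLumaLevel = v;
;               sumLuma += v;                    /* accumulate the luma sum */
;           }
;       }
;       *outsum = sumLuma;
;       return maxLumaLevel;
;   }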