;*****************************************************************************
;* pixel.asm: x86 pixel metrics
;*****************************************************************************
;* Copyright (C) 2003-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Holger Lubitz <holger@lubitz.org>
;* Laurent Aimar <fenrir@via.ecp.fr>
;* Alex Izvorski <aizvorksi@gmail.com>
;* Fiona Glaser <fiona@x264.com>
;* Oskar Arvidsson <oskar@irock.se>
;* Min Chen <chenm003@163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA 32
hmul_8p: times 8 db 1
times 4 db 1, -1
times 8 db 1
times 4 db 1, -1
hmul_4p: times 4 db 1, 1, 1, 1, 1, -1, 1, -1
mask_10: times 4 dw 0, -1
mask_1100: times 2 dd 0, -1
hmul_8w: times 4 dw 1
times 2 dw 1, -1
times 4 dw 1
times 2 dw 1, -1
ALIGN 32
transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
sw_f0: dq 0xfff0, 0
pd_f0: times 4 dd 0xffff0000
SECTION .text
cextern pb_0
cextern pb_1
cextern pw_1
cextern pw_8
cextern pw_16
cextern pw_32
cextern pw_00ff
cextern pw_ppppmmmm
cextern pw_ppmmppmm
cextern pw_pmpmpmpm
cextern pw_pmmpzzzz
cextern pd_1
cextern popcnt_table
cextern pd_2
cextern hmul_16p
cextern pb_movemask
cextern pb_movemask_32
cextern pw_pixel_max
;=============================================================================
; SATD
;=============================================================================
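; Sum of Absolute Transformed Differences: the residual of a block is run
; through a 4x4 Hadamard transform and the absolute values of the transform
; coefficients are summed (then halved, per the x264/x265 satd convention).
; As a rough reference for what the kernels below compute, a minimal C
; sketch of the 4x4 case (satd_4x4_ref is a hypothetical name; needs
; <stdint.h>/<stdlib.h>; the asm reaches the same result with packed
; sum/difference tricks rather than this scalar form):
;
;   static int satd_4x4_ref(const uint8_t *p1, intptr_t s1,
;                           const uint8_t *p2, intptr_t s2)
;   {
;       int d[4][4], t[4][4], sum = 0;
;       for (int i = 0; i < 4; i++)             // residual
;           for (int j = 0; j < 4; j++)
;               d[i][j] = p1[i * s1 + j] - p2[i * s2 + j];
;       for (int i = 0; i < 4; i++) {           // horizontal 4-pt Hadamard
;           int a = d[i][0] + d[i][1], b = d[i][0] - d[i][1];
;           int c = d[i][2] + d[i][3], e = d[i][2] - d[i][3];
;           t[i][0] = a + c; t[i][1] = b + e;
;           t[i][2] = a - c; t[i][3] = b - e;
;       }
;       for (int j = 0; j < 4; j++) {           // vertical pass + abs-sum
;           int a = t[0][j] + t[1][j], b = t[0][j] - t[1][j];
;           int c = t[2][j] + t[3][j], e = t[2][j] - t[3][j];
;           sum += abs(a + c) + abs(b + e) + abs(a - c) + abs(b - e);
;       }
;       return sum >> 1;
;   }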
%macro JDUP 2
%if cpuflag(sse4)
; just use shufps on anything post conroe
shufps %1, %2, 0
%elif cpuflag(ssse3) && notcpuflag(atom)
; join 2x 32 bit and duplicate them
; emulating shufps is faster on conroe
punpcklqdq %1, %2
movsldup %1, %1
%else
; no dup needed: the sse2 path zero-extends to words and does the full h_2d
punpckldq %1, %2
%endif
%endmacro
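; e.g. with dwords m0 = {a0,a1,a2,a3} and m1 = {b0,b1,b2,b3}, JDUP m0, m1
; leaves m0 = {a0,a0,b0,b0} on the sse4 and ssse3 paths; the sse2 path only
; interleaves to {a0,b0,a1,b1}, since its satd variant works on
; zero-extended words and never needs the duplicate.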
%macro HSUMSUB 5
pmaddubsw m%2, m%5
pmaddubsw m%1, m%5
pmaddubsw m%4, m%5
pmaddubsw m%3, m%5
%endmacro
%macro DIFF_UNPACK_SSE2 5
punpcklbw m%1, m%5
punpcklbw m%2, m%5
punpcklbw m%3, m%5
punpcklbw m%4, m%5
psubw m%1, m%2
psubw m%3, m%4
%endmacro
%macro DIFF_SUMSUB_SSSE3 5
HSUMSUB %1, %2, %3, %4, %5
psubw m%1, m%2
psubw m%3, m%4
%endmacro
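; the hmul_* constants let pmaddubsw do double duty: each register holds a
; pixel row duplicated into both halves, and multiplying by {1,1,...} in
; one half and {1,-1,...} in the other produces adjacent-pixel sums and
; differences, i.e. the first horizontal butterfly of the Hadamard
; transform, in one instruction per source. since the transform is linear,
; the pix1-pix2 subtraction can then be done on the transformed words with
; psubw.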
%macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer
movd %1, %3
movd %2, %4
JDUP %1, %2
%endmacro
%macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer
movddup m%3, %6
movddup m%4, %8
movddup m%1, %5
movddup m%2, %7
%endmacro
%macro LOAD_DUP_4x8P_PENRYN 8
; penryn and nehalem run punpcklqdq and movddup in different units
movh m%3, %6
movh m%4, %8
punpcklqdq m%3, m%3
movddup m%1, %5
punpcklqdq m%4, m%4
movddup m%2, %7
%endmacro
%macro LOAD_SUMSUB_8x2P 9
LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro
%macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
%if %10
lea %8, [%8+4*r1]
lea %9, [%9+4*r3]
%endif
%endmacro
%macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
movddup m%1, [%7]
movddup m%2, [%7+8]
mova m%4, [%6]
movddup m%3, m%4
punpckhqdq m%4, m%4
DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro
%macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
movu m%4, [%7]
mova m%2, [%6]
DEINTB %1, %2, %3, %4, %5
psubw m%1, m%3
psubw m%2, m%4
SUMSUB_BA w, %1, %2, %3
%endmacro
%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
; 8x dest, 1x tmp, 1x mul, [2* ptr], [2nd tmp]
LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
%endmacro
%macro LOAD_SUMSUB_16x2P_AVX2 9
; 2*dst, 2*tmp, mul, 4*ptr
vbroadcasti128 m%1, [%6]
vbroadcasti128 m%3, [%7]
vbroadcasti128 m%2, [%8]
vbroadcasti128 m%4, [%9]
DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro
%macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3
LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5
%if %10
lea %8, [%8+4*r1]
lea %9, [%9+4*r3]
%endif
%endmacro
%macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer
mova xm%3, %6
mova xm%4, %8
mova xm%1, %5
mova xm%2, %7
vpermq m%3, m%3, q0011
vpermq m%4, m%4, q0011
vpermq m%1, m%1, q0011
vpermq m%2, m%2, q0011
%endmacro
%macro LOAD_SUMSUB8_16x2P_AVX2 9
; 2*dst, 2*tmp, mul, 4*ptr
LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9
DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro
%macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
%if %10
lea %8, [%8+4*r1]
lea %9, [%9+4*r3]
%endif
%endmacro
; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
; clobber: m3..m7
; out: %1 = satd
%macro SATD_4x4_MMX 3
%xdefine %%n n%1
%assign offset %2*SIZEOF_PIXEL
LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset]
LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset]
LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset]
LOAD_DIFF m7, m3, none, [r0+ r4+offset], [r2+ r5+offset]
%if %3
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endif
HADAMARD4_2D 4, 5, 6, 7, 3, %%n
paddw m4, m6
;%if HIGH_BIT_DEPTH && (BIT_DEPTH == 12)
; pxor m5, m5
; punpcklwd m6, m4, m5
; punpckhwd m4, m5
; paddd m4, m6
;%endif
SWAP %%n, 4
%endmacro
; in: %1 = horizontal if 0, vertical if 1
%macro SATD_8x4_SSE 8-9
%if %1
HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
%else
HADAMARD4_V %2, %3, %4, %5, %6
; doing the abs first is a slight advantage
ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
HADAMARD 1, max, %2, %4, %6, %7
%endif
%ifnidn %9, swap
%if (BIT_DEPTH == 12)
pxor m%6, m%6
punpcklwd m%7, m%2, m%6
punpckhwd m%2, m%6
paddd m%8, m%7
paddd m%8, m%2
%else
paddw m%8, m%2
%endif
%else
SWAP %8, %2
%if (BIT_DEPTH == 12)
pxor m%6, m%6
punpcklwd m%7, m%8, m%6
punpckhwd m%8, m%6
paddd m%8, m%7
%endif
%endif
%if %1
%if (BIT_DEPTH == 12)
pxor m%6, m%6
punpcklwd m%7, m%4, m%6
punpckhwd m%4, m%6
paddd m%8, m%7
paddd m%8, m%4
%else
paddw m%8, m%4
%endif
%else
HADAMARD 1, max, %3, %5, %6, %7
%if (BIT_DEPTH == 12)
pxor m%6, m%6
punpcklwd m%7, m%3, m%6
punpckhwd m%3, m%6
paddd m%8, m%7
paddd m%8, m%3
%else
paddw m%8, m%3
%endif
%endif
%endmacro
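; on the BIT_DEPTH == 12 branches above: 12-bit residuals leave too little
; headroom to keep accumulating transformed rows in 16-bit lanes, so each
; partial result is widened against a zero register (punpcklwd/punpckhwd)
; and summed with paddd, while 8/10-bit depths get away with the cheaper
; paddw accumulation.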
%macro SATD_8x4_1_SSE 10
%if %1
HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
%else
HADAMARD4_V %2, %3, %4, %5, %6
; doing the abs first is a slight advantage
ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
HADAMARD 1, max, %2, %4, %6, %7
%endif
pxor m%10, m%10
punpcklwd m%9, m%2, m%10
paddd m%8, m%9
punpckhwd m%9, m%2, m%10
paddd m%8, m%9
%if %1
pxor m%10, m%10
punpcklwd m%9, m%4, m%10
paddd m%8, m%9
punpckhwd m%9, m%4, m%10
paddd m%8, m%9
%else
HADAMARD 1, max, %3, %5, %6, %7
pxor m%10, m%10
punpcklwd m%9, m%3, m%10
paddd m%8, m%9
punpckhwd m%9, m%3, m%10
paddd m%8, m%9
%endif
%endmacro
%macro SATD_START_MMX 0
FIX_STRIDES r1, r3
lea r4, [3*r1] ; 3*stride1
lea r5, [3*r3] ; 3*stride2
%endmacro
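; parking 3*stride in r4/r5 lets each 4-row group be addressed directly as
; [r0], [r0+r1], [r0+2*r1], [r0+r4], so moving to the next group costs one
; lea by 4*r1 instead of three pointer updates.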
%macro SATD_END_MMX 0
%if HIGH_BIT_DEPTH
HADDUW m0, m1
movd eax, m0
%else ; !HIGH_BIT_DEPTH
pshufw m1, m0, q1032
paddw m0, m1
pshufw m1, m0, q2301
paddw m0, m1
movd eax, m0
and eax, 0xffff
%endif ; HIGH_BIT_DEPTH
EMMS
RET
%endmacro
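; the !HIGH_BIT_DEPTH tail above is a horizontal add: the q1032 and q2301
; shuffles fold the four word sums pairwise, leaving the total in every
; word, and the final and-mask isolates it in eax.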
; FIXME avoid the spilling of regs to hold 3*stride.
; for small blocks on x86_32, modify pixel pointer instead.
;-----------------------------------------------------------------------------
; int pixel_satd_4x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal pixel_satd_4x4, 4,6
SATD_START_MMX
SATD_4x4_MMX m0, 0, 0
SATD_END_MMX
%macro SATD_START_SSE2 2-3 0
FIX_STRIDES r1, r3
%if HIGH_BIT_DEPTH && %3
pxor %2, %2
%elif cpuflag(ssse3) && notcpuflag(atom)
%if mmsize==32
mova %2, [hmul_16p]
%else
mova %2, [hmul_8p]
%endif
%endif
lea r4, [3*r1]
lea r5, [3*r3]
pxor %1, %1
%endmacro
%macro SATD_END_SSE2 1-2
%if HIGH_BIT_DEPTH
%if BIT_DEPTH == 12
HADDD %1, xm0
%else ; BIT_DEPTH == 12
HADDUW %1, xm0
%endif ; BIT_DEPTH == 12
%if %0 == 2
paddd %1, %2
%endif
%else
HADDW %1, xm7
%endif
movd eax, %1
RET
%endmacro
%macro SATD_ACCUM 3
%if HIGH_BIT_DEPTH
HADDUW %1, %2
paddd %3, %1
pxor %1, %1
%endif
%endmacro
%macro BACKUP_POINTERS 0
%if ARCH_X86_64
%if WIN64
PUSH r7
%endif
mov r6, r0
mov r7, r2
%endif
%endmacro
%macro RESTORE_AND_INC_POINTERS 0
%if ARCH_X86_64
lea r0, [r6+8*SIZEOF_PIXEL]
lea r2, [r7+8*SIZEOF_PIXEL]
%if WIN64
POP r7
%endif
%else
mov r0, r0mp
mov r2, r2mp
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
%endif
%endmacro
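; BACKUP/RESTORE split a 16-wide satd into two 8-wide column passes: on
; x86_64 the original pixel pointers are parked in r6/r7 and reloaded 8
; pixels to the right for the second pass, while x86_32 refetches them from
; the stack arguments instead.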
%macro SATD_4x8_SSE 3-4
%if HIGH_BIT_DEPTH
movh m0, [r0+0*r1]
movh m4, [r2+0*r3]
movh m1, [r0+1*r1]
movh m5, [r2+1*r3]
movhps m0, [r0+4*r1]
movhps m4, [r2+4*r3]
movh m2, [r0+2*r1]
movh m6, [r2+2*r3]
psubw m0, m4
movh m3, [r0+r4]
movh m4, [r2+r5]
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
movhps m1, [r0+1*r1]
movhps m5, [r2+1*r3]
movhps m2, [r0+2*r1]
movhps m6, [r2+2*r3]
psubw m1, m5
movhps m3, [r0+r4]
movhps m4, [r2+r5]
psubw m2, m6
psubw m3, m4
%else ; !HIGH_BIT_DEPTH
movd m4, [r2]
movd m5, [r2+r3]
movd m6, [r2+2*r3]
add r2, r5
movd m0, [r0]
movd m1, [r0+r1]
movd m2, [r0+2*r1]
add r0, r4
movd m3, [r2+r3]
JDUP m4, m3
movd m3, [r0+r1]
JDUP m0, m3
movd m3, [r2+2*r3]
JDUP m5, m3
movd m3, [r0+2*r1]
JDUP m1, m3
%if %1==0 && %2==1
mova m3, [hmul_4p]
DIFFOP 0, 4, 1, 5, 3
%else
DIFFOP 0, 4, 1, 5, 7
%endif
movd m5, [r2]
add r2, r5
movd m3, [r0]
add r0, r4
movd m4, [r2]
JDUP m6, m4
movd m4, [r0]
JDUP m2, m4
movd m4, [r2+r3]
JDUP m5, m4
movd m4, [r0+r1]
JDUP m3, m4
%if %1==0 && %2==1
mova m4, [hmul_4p]
DIFFOP 2, 6, 3, 5, 4
%else
DIFFOP 2, 6, 3, 5, 7
%endif
%endif ; HIGH_BIT_DEPTH
%if %0 == 4
SATD_8x4_1_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3, %4
%else
SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
%endif
%endmacro
;-----------------------------------------------------------------------------
; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 0
%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
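; vertical==1 whenever the pmaddubsw load trick is unavailable (plain sse2,
; atom, or high bit depth): rows are differenced without any folded
; transform stage, so both Hadamard passes must run here instead of the
; first horizontal stage coming for free with the load.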
%if cpuflag(ssse3) && (vertical==0 || HIGH_BIT_DEPTH)
cglobal pixel_satd_4x4, 4, 6, 6
SATD_START_MMX
mova m4, [hmul_4p]
LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
HADAMARD 0, sumsub, 0, 1, 2, 3
HADAMARD 4, sumsub, 0, 1, 2, 3
HADAMARD 1, amax, 0, 1, 2, 3
HADDW m0, m1
movd eax, m0
RET
%endif
cglobal pixel_satd_4x8, 4, 6, 8
SATD_START_MMX
%if vertical==0
mova m7, [hmul_4p]
%endif
SATD_4x8_SSE vertical, 0, swap
%if BIT_DEPTH == 12
HADDD m7, m1
%else
HADDUW m7, m1
%endif
movd eax, m7
RET
cglobal pixel_satd_4x16, 4, 6, 8
SATD_START_MMX
%if vertical==0
mova m7, [hmul_4p]
%endif
SATD_4x8_SSE vertical, 0, swap
lea r0, [r0+r1*2*SIZEOF_PIXEL]
lea r2, [r2+r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
%if BIT_DEPTH == 12
HADDD m7, m1
%else
HADDUW m7, m1
%endif
movd eax, m7
RET
cglobal pixel_satd_8x8_internal
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
%%pixel_satd_8x4_internal:
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
ret
cglobal pixel_satd_8x8_internal2
%if WIN64
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13
%%pixel_satd_8x4_internal2:
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13
%else
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5
%%pixel_satd_8x4_internal2:
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5
%endif
ret
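; the *_internal2 helpers keep a running dword satd in an accumulator
; (m6 for the 8-wide helper, m10 for the 16-wide one below), widening with
; spare temps (m12/m13 on WIN64), so the size-specific entry points just
; chain calls down each column and reduce once with HADDD at the end.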
; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
%if HIGH_BIT_DEPTH == 0 && (WIN64 || UNIX64) && notcpuflag(avx)
cglobal pixel_satd_16x4_internal2
LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
lea r2, [r2+4*r3]
lea r0, [r0+4*r1]
SATD_8x4_1_SSE 0, 0, 1, 2, 3, 6, 11, 10, 12, 13
SATD_8x4_1_SSE 0, 4, 8, 5, 9, 6, 3, 10, 12, 13
ret
cglobal pixel_satd_16x4, 4,6,14
SATD_START_SSE2 m10, m7
%if vertical
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal2
HADDD m10, m0
movd eax, m10
RET
cglobal pixel_satd_16x8, 4,6,14
SATD_START_SSE2 m10, m7
%if vertical
mova m7, [pw_00ff]
%endif
jmp %%pixel_satd_16x8_internal
cglobal pixel_satd_16x12, 4,6,14
SATD_START_SSE2 m10, m7
%if vertical
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal2
jmp %%pixel_satd_16x8_internal
cglobal pixel_satd_16x32, 4,6,14
SATD_START_SSE2 m10, m7
%if vertical
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
jmp %%pixel_satd_16x8_internal
cglobal pixel_satd_16x64, 4,6,14
SATD_START_SSE2 m10, m7
%if vertical
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
jmp %%pixel_satd_16x8_internal
cglobal pixel_satd_16x16, 4,6,14
SATD_START_SSE2 m10, m7
%if vertical
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
%%pixel_satd_16x8_internal:
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
HADDD m10, m0
movd eax, m10
RET
cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && notcpuflag(avx)
SATD_START_SSE2 m10, m7
mov r6, r0
mov r7, r2
%if vertical
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
HADDD m10, m0
movd eax, m10
RET
cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && notcpuflag(avx)
SATD_START_SSE2 m10, m7
mov r6, r0
mov r7, r2
%if vertical
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
HADDD m10, m0
movd eax, m10
RET
cglobal pixel_satd_32x24, 4,8,14 ;if WIN64 && notcpuflag(avx)
SATD_START_SSE2 m10, m7
mov r6, r0
mov r7, r2
%if vertical
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
HADDD m10, m0
movd eax, m10
RET
cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && notcpuflag(avx)
SATD_START_SSE2 m10, m7
mov r6, r0
mov r7, r2
%if vertical
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
HADDD m10, m0
movd eax, m10
RET
cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
SATD_START_SSE2 m10, m7
mov r6, r0
mov r7, r2
%if vertical
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
HADDD m10, m0
movd eax, m10
RET
cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
SATD_START_SSE2 m10, m7
mov r6, r0
mov r7, r2
%if vertical
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
HADDD m10, m0
movd eax, m10
RET
cglobal pixel_satd_64x16, 4,8,14 ;if WIN64 && notcpuflag(avx)
SATD_START_SSE2 m10, m7
mov r6, r0
mov r7, r2
%if vertical
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
lea r0, [r6 + 48]
lea r2, [r7 + 48]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
HADDD m10, m0
movd eax, m10
RET
cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && notcpuflag(avx)
SATD_START_SSE2 m10, m7
mov r6, r0
mov r7, r2
%if vertical
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
lea r0, [r6 + 48]
lea r2, [r7 + 48]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
HADDD m10, m0
movd eax, m10
RET
cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && notcpuflag(avx)
SATD_START_SSE2 m10, m7
mov r6, r0
mov r7, r2
%if vertical
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
lea r0, [r6 + 48]
lea r2, [r7 + 48]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
HADDD m10, m0
movd eax, m10
RET
cglobal pixel_satd_64x64, 4,8,14 ;if WIN64 && notcpuflag(avx)
SATD_START_SSE2 m10, m7
mov r6, r0
mov r7, r2
%if vertical
mova m7, [pw_00ff]
%endif
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
lea r0, [r6 + 48]
lea r2, [r7 + 48]
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
call pixel_satd_16x4_internal2
HADDD m10, m0
movd eax, m10
RET
%else
%if WIN64
cglobal pixel_satd_16x24, 4,8,14 ;if WIN64 && cpuflag(avx)
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%else
cglobal pixel_satd_16x24, 4,7,8,0-gprsize ;if !WIN64
SATD_START_SSE2 m6, m7
mov r6, r0
mov [rsp], r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 8*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%endif
%if WIN64
cglobal pixel_satd_32x48, 4,8,14 ;if WIN64 && cpuflag(avx)
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
lea r2, [r7 + 16*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%else
cglobal pixel_satd_32x48, 4,7,8,0-gprsize ;if !WIN64
SATD_START_SSE2 m6, m7
mov r6, r0
mov [rsp], r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 8*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 16*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%endif
%if WIN64
cglobal pixel_satd_24x64, 4,8,14 ;if WIN64 && cpuflag(avx)
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
lea r2, [r7 + 16*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%else
cglobal pixel_satd_24x64, 4,7,8,0-gprsize ;if !WIN64
SATD_START_SSE2 m6, m7
mov r6, r0
mov [rsp], r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 8*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 16*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%endif
%if WIN64
cglobal pixel_satd_8x64, 4,8,14 ;if WIN64 && cpuflag(avx)
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%else
cglobal pixel_satd_8x64, 4,7,8,0-gprsize ;if !WIN64
SATD_START_SSE2 m6, m7
mov r6, r0
mov [rsp], r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%endif
%if WIN64
cglobal pixel_satd_8x12, 4,8,14 ;if WIN64 && cpuflag(avx)
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
call pixel_satd_8x8_internal2
call %%pixel_satd_8x4_internal2
pxor m7, m7
movhlps m7, m6
paddd m6, m7
pshufd m7, m6, 1
paddd m6, m7
movd eax, m6
RET
%else
cglobal pixel_satd_8x12, 4,7,8,0-gprsize ;if !WIN64
SATD_START_SSE2 m6, m7
mov r6, r0
mov [rsp], r2
call pixel_satd_8x8_internal2
call %%pixel_satd_8x4_internal2
HADDD m6, m0
movd eax, m6
RET
%endif
%if HIGH_BIT_DEPTH
%if WIN64
cglobal pixel_satd_12x32, 4,8,8 ;if WIN64 && cpuflag(avx)
SATD_START_MMX
mov r6, r0
mov r7, r2
pxor m7, m7
SATD_4x8_SSE vertical, 0, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r6 + 4*SIZEOF_PIXEL]
lea r2, [r7 + 4*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
HADDD m7, m0
movd eax, m7
RET
%else
cglobal pixel_satd_12x32, 4,7,8,0-gprsize
SATD_START_MMX
mov r6, r0
mov [rsp], r2
pxor m7, m7
SATD_4x8_SSE vertical, 0, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r6 + 4*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 4*SIZEOF_PIXEL
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 8*SIZEOF_PIXEL
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
HADDD m7, m0
movd eax, m7
RET
%endif
%else ;HIGH_BIT_DEPTH
%if WIN64
cglobal pixel_satd_12x32, 4,8,8 ;if WIN64 && cpuflag(avx)
SATD_START_MMX
mov r6, r0
mov r7, r2
%if vertical==0
mova m7, [hmul_4p]
%endif
SATD_4x8_SSE vertical, 0, swap
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r6 + 4*SIZEOF_PIXEL]
lea r2, [r7 + 4*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
HADDW m7, m1
movd eax, m7
RET
%else
cglobal pixel_satd_12x32, 4,7,8,0-gprsize
SATD_START_MMX
mov r6, r0
mov [rsp], r2
%if vertical==0
mova m7, [hmul_4p]
%endif
SATD_4x8_SSE vertical, 0, swap
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r6 + 4*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 4*SIZEOF_PIXEL
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 8*SIZEOF_PIXEL
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
HADDW m7, m1
movd eax, m7
RET
%endif
%endif
%if HIGH_BIT_DEPTH
%if WIN64
cglobal pixel_satd_4x32, 4,8,8 ;if WIN64 && cpuflag(avx)
SATD_START_MMX
mov r6, r0
mov r7, r2
pxor m7, m7
SATD_4x8_SSE vertical, 0, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
HADDD m7, m0
movd eax, m7
RET
%else
cglobal pixel_satd_4x32, 4,7,8,0-gprsize
SATD_START_MMX
mov r6, r0
mov [rsp], r2
pxor m7, m7
SATD_4x8_SSE vertical, 0, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
pxor m1, m1
movhlps m1, m7
paddd m7, m1
pshufd m1, m7, 1
paddd m7, m1
movd eax, m7
RET
%endif
%else
%if WIN64
cglobal pixel_satd_4x32, 4,8,8 ;if WIN64 && cpuflag(avx)
SATD_START_MMX
mov r6, r0
mov r7, r2
%if vertical==0
mova m7, [hmul_4p]
%endif
SATD_4x8_SSE vertical, 0, swap
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
HADDW m7, m1
movd eax, m7
RET
%else
cglobal pixel_satd_4x32, 4,7,8,0-gprsize
SATD_START_MMX
mov r6, r0
mov [rsp], r2
%if vertical==0
mova m7, [hmul_4p]
%endif
SATD_4x8_SSE vertical, 0, swap
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
HADDW m7, m1
movd eax, m7
RET
%endif
%endif
%if WIN64
cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && cpuflag(avx)
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
lea r2, [r7 + 16*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%else
cglobal pixel_satd_32x8, 4,7,8,0-gprsize ;if !WIN64
SATD_START_SSE2 m6, m7
mov r6, r0
mov [rsp], r2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 8*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 16*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%endif
%if WIN64
cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && cpuflag(avx)
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
lea r2, [r7 + 16*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%else
cglobal pixel_satd_32x16, 4,7,8,0-gprsize ;if !WIN64
SATD_START_SSE2 m6, m7
mov r6, r0
mov [rsp], r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 8*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 16*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%endif
%if WIN64
cglobal pixel_satd_32x24, 4,8,14 ;if WIN64 && cpuflag(avx)
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
lea r2, [r7 + 16*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%else
cglobal pixel_satd_32x24, 4,7,8,0-gprsize ;if !WIN64
SATD_START_SSE2 m6, m7
mov r6, r0
mov [rsp], r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 8*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 16*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%endif
%if WIN64
cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && cpuflag(avx)
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
lea r2, [r7 + 16*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%else
cglobal pixel_satd_32x32, 4,7,8,0-gprsize ;if !WIN64
SATD_START_SSE2 m6, m7
mov r6, r0
mov [rsp], r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 8*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 16*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%endif
%if WIN64
cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && cpuflag(avx)
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
lea r2, [r7 + 16*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%else
cglobal pixel_satd_32x64, 4,7,8,0-gprsize ;if !WIN64
SATD_START_SSE2 m6, m7
mov r6, r0
mov [rsp], r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 8*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 16*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%endif
%if WIN64
cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && cpuflag(avx)
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
lea r2, [r7 + 16*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 32*SIZEOF_PIXEL]
lea r2, [r7 + 32*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 40*SIZEOF_PIXEL]
lea r2, [r7 + 40*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%else
cglobal pixel_satd_48x64, 4,7,8,0-gprsize ;if !WIN64
SATD_START_SSE2 m6, m7
mov r6, r0
mov [rsp], r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2,8*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
add r2,16*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
add r2,24*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 32*SIZEOF_PIXEL]
mov r2, [rsp]
add r2,32*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 40*SIZEOF_PIXEL]
mov r2, [rsp]
add r2,40*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%endif
%if WIN64
cglobal pixel_satd_64x16, 4,8,14 ;if WIN64 && cpuflag(avx)
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
lea r2, [r7 + 16*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 32*SIZEOF_PIXEL]
lea r2, [r7 + 32*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 40*SIZEOF_PIXEL]
lea r2, [r7 + 40*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 48*SIZEOF_PIXEL]
lea r2, [r7 + 48*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 56*SIZEOF_PIXEL]
lea r2, [r7 + 56*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%else
cglobal pixel_satd_64x16, 4,7,8,0-gprsize ;if !WIN64
SATD_START_SSE2 m6, m7
mov r6, r0
mov [rsp], r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2,8*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
add r2,16*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
add r2,24*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 32*SIZEOF_PIXEL]
mov r2, [rsp]
add r2,32*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 40*SIZEOF_PIXEL]
mov r2, [rsp]
add r2,40*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 48*SIZEOF_PIXEL]
mov r2, [rsp]
add r2,48*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 56*SIZEOF_PIXEL]
mov r2, [rsp]
add r2,56*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%endif
%if WIN64
cglobal pixel_satd_64x32, 4,8,14 ;if WIN64 && cpuflag(avx)
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
lea r2, [r7 + 16*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 32*SIZEOF_PIXEL]
lea r2, [r7 + 32*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 40*SIZEOF_PIXEL]
lea r2, [r7 + 40*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 48*SIZEOF_PIXEL]
lea r2, [r7 + 48*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 56*SIZEOF_PIXEL]
lea r2, [r7 + 56*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%else
cglobal pixel_satd_64x32, 4,7,8,0-gprsize ;if !WIN64
SATD_START_SSE2 m6, m7
mov r6, r0
mov [rsp], r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 8*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 16*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 32*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 32*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 40*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 40*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 48*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 48*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 56*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 56*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%endif
%if WIN64
cglobal pixel_satd_64x48, 4,8,14 ;if WIN64 && cpuflag(avx)
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
lea r2, [r7 + 16*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 32*SIZEOF_PIXEL]
lea r2, [r7 + 32*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 40*SIZEOF_PIXEL]
lea r2, [r7 + 40*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 48*SIZEOF_PIXEL]
lea r2, [r7 + 48*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 56*SIZEOF_PIXEL]
lea r2, [r7 + 56*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%else
cglobal pixel_satd_64x48, 4,7,8,0-gprsize ;if !WIN64
SATD_START_SSE2 m6, m7
mov r6, r0
mov [rsp], r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 8*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 16*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 32*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 32*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 40*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 40*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 48*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 48*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 56*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 56*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%endif
%if WIN64
cglobal pixel_satd_64x64, 4,8,14 ;if WIN64
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
lea r2, [r7 + 16*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
lea r2, [r7 + 24*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 32*SIZEOF_PIXEL]
lea r2, [r7 + 32*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 40*SIZEOF_PIXEL]
lea r2, [r7 + 40*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 48*SIZEOF_PIXEL]
lea r2, [r7 + 48*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 56*SIZEOF_PIXEL]
lea r2, [r7 + 56*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%else
cglobal pixel_satd_64x64, 4,7,8,0-gprsize ;if !WIN64
SATD_START_SSE2 m6, m7
mov r6, r0
mov [rsp], r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 8*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 16*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 24*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 24*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 32*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 32*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 40*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 40*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 48*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 48*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 56*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 56*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%endif
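; 16xN satd uses two 8-wide columns: BACKUP_POINTERS saves the block origin
; and RESTORE_AND_INC_POINTERS restores it advanced 8 pixels to the right.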
%if WIN64
cglobal pixel_satd_16x4, 4,6,14
%else
cglobal pixel_satd_16x4, 4,6,8
%endif
SATD_START_SSE2 m6, m7
BACKUP_POINTERS
call %%pixel_satd_8x4_internal2
RESTORE_AND_INC_POINTERS
call %%pixel_satd_8x4_internal2
HADDD m6, m0
movd eax, m6
RET
%if WIN64
cglobal pixel_satd_16x8, 4,6,14
%else
cglobal pixel_satd_16x8, 4,6,8
%endif
SATD_START_SSE2 m6, m7
BACKUP_POINTERS
call pixel_satd_8x8_internal2
RESTORE_AND_INC_POINTERS
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%if WIN64
cglobal pixel_satd_16x12, 4,6,14
%else
cglobal pixel_satd_16x12, 4,6,8
%endif
SATD_START_SSE2 m6, m7, 1
BACKUP_POINTERS
call pixel_satd_8x8_internal2
call %%pixel_satd_8x4_internal2
RESTORE_AND_INC_POINTERS
call pixel_satd_8x8_internal2
call %%pixel_satd_8x4_internal2
HADDD m6, m0
movd eax, m6
RET
%if WIN64
cglobal pixel_satd_16x16, 4,6,14
%else
cglobal pixel_satd_16x16, 4,6,8
%endif
SATD_START_SSE2 m6, m7, 1
BACKUP_POINTERS
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
RESTORE_AND_INC_POINTERS
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%if WIN64
cglobal pixel_satd_16x32, 4,6,14
%else
cglobal pixel_satd_16x32, 4,6,8
%endif
SATD_START_SSE2 m6, m7, 1
BACKUP_POINTERS
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
RESTORE_AND_INC_POINTERS
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%if WIN64
cglobal pixel_satd_16x64, 4,6,14
%else
cglobal pixel_satd_16x64, 4,6,8
%endif
SATD_START_SSE2 m6, m7, 1
BACKUP_POINTERS
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
RESTORE_AND_INC_POINTERS
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%endif
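; 12x16 satd is built from 4x8 tiles: three 4-wide columns, two tiles each.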
%if HIGH_BIT_DEPTH
%if WIN64
cglobal pixel_satd_12x16, 4,8,8
SATD_START_MMX
mov r6, r0
mov r7, r2
pxor m7, m7
SATD_4x8_SSE vertical, 0, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r6 + 4*SIZEOF_PIXEL]
lea r2, [r7 + 4*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
HADDD m7, m0
movd eax, m7
RET
%else
cglobal pixel_satd_12x16, 4,7,8,0-gprsize
SATD_START_MMX
mov r6, r0
mov [rsp], r2
pxor m7, m7
SATD_4x8_SSE vertical, 0, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r6 + 4*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 4*SIZEOF_PIXEL
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 8*SIZEOF_PIXEL
SATD_4x8_SSE vertical, 1, 4, 5
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, 4, 5
HADDD m7, m0
movd eax, m7
RET
%endif
%else ;HIGH_BIT_DEPTH
%if WIN64
cglobal pixel_satd_12x16, 4,8,8
SATD_START_MMX
mov r6, r0
mov r7, r2
%if vertical == 0
mova m7, [hmul_4p]
%endif
SATD_4x8_SSE vertical, 0, swap
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r6 + 4*SIZEOF_PIXEL]
lea r2, [r7 + 4*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
HADDW m7, m1
movd eax, m7
RET
%else
cglobal pixel_satd_12x16, 4,7,8,0-gprsize
SATD_START_MMX
mov r6, r0
mov [rsp], r2
%if vertical == 0
mova m7, [hmul_4p]
%endif
SATD_4x8_SSE vertical, 0, swap
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r6 + 4*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 4*SIZEOF_PIXEL
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 8*SIZEOF_PIXEL
SATD_4x8_SSE vertical, 1, add
lea r0, [r0 + r1*2*SIZEOF_PIXEL]
lea r2, [r2 + r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
HADDW m7, m1
movd eax, m7
RET
%endif
%endif
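; 24x32 satd: three 8-wide columns of four 8x8 blocks each.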
%if WIN64
cglobal pixel_satd_24x32, 4,8,14
SATD_START_SSE2 m6, m7
mov r6, r0
mov r7, r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
lea r2, [r7 + 8*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
lea r2, [r7 + 16*SIZEOF_PIXEL]
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%else
cglobal pixel_satd_24x32, 4,7,8,0-gprsize
SATD_START_SSE2 m6, m7
mov r6, r0
mov [rsp], r2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 8*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 8*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
lea r0, [r6 + 16*SIZEOF_PIXEL]
mov r2, [rsp]
add r2, 16*SIZEOF_PIXEL
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%endif ;WIN64
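; the 8-wide satd sizes need no pointer juggling: the internal calls simply
; walk straight down the block.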
%if WIN64
cglobal pixel_satd_8x32, 4,6,14
%else
cglobal pixel_satd_8x32, 4,6,8
%endif
SATD_START_SSE2 m6, m7
%if vertical
mova m7, [pw_00ff]
%endif
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
%if WIN64
cglobal pixel_satd_8x16, 4,6,14
%else
cglobal pixel_satd_8x16, 4,6,8
%endif
SATD_START_SSE2 m6, m7
call pixel_satd_8x8_internal2
call pixel_satd_8x8_internal2
HADDD m6, m0
movd eax, m6
RET
cglobal pixel_satd_8x8, 4,6,8
SATD_START_SSE2 m6, m7
call pixel_satd_8x8_internal
SATD_END_SSE2 m6
%if WIN64
cglobal pixel_satd_8x4, 4,6,14
%else
cglobal pixel_satd_8x4, 4,6,8
%endif
SATD_START_SSE2 m6, m7
call %%pixel_satd_8x4_internal2
SATD_END_SSE2 m6
%endmacro ; SATDS_SSE2
;=============================================================================
; SA8D
;=============================================================================
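; sa8d sums the absolute coefficients of an 8x8 Hadamard transform of the
; difference block; all larger sizes are tiled from 8x8 kernel calls.
; SA8D_INTER folds the result of the latest 8x8 call into the running sum
; (m10 on x86-64, a stack slot on x86-32); high bit depth widens the word
; sums to dwords first so they cannot overflow.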
%macro SA8D_INTER 0
%if ARCH_X86_64
%define lh m10
%define rh m0
%else
%define lh m0
%define rh [esp+48]
%endif
%if HIGH_BIT_DEPTH
HADDUW m0, m1
paddd lh, rh
%else
paddusw lh, rh
%endif ; HIGH_BIT_DEPTH
%endmacro
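; one 8x8 tile: run the kernel, reduce m0 to a scalar sum, round with
; (sum + 1) >> 1 and accumulate into m12.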
%macro SA8D_8x8 0
call pixel_sa8d_8x8_internal
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%else
HADDW m0, m1
%endif ; HIGH_BIT_DEPTH
paddd m0, [pd_1]
psrld m0, 1
paddd m12, m0
%endmacro
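; 16x16 as four 8x8 kernel calls walking right, down, then back left across
; the tile; partial sums collect via SA8D_INTER and the rounded total is
; added to m12.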
%macro SA8D_16x16 0
call pixel_sa8d_8x8_internal ; pix[0]
add r2, 8*SIZEOF_PIXEL
add r0, 8*SIZEOF_PIXEL
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova m10, m0
call pixel_sa8d_8x8_internal ; pix[8]
lea r2, [r2+8*r3]
lea r0, [r0+8*r1]
SA8D_INTER
call pixel_sa8d_8x8_internal ; pix[8*stride+8]
sub r2, 8*SIZEOF_PIXEL
sub r0, 8*SIZEOF_PIXEL
SA8D_INTER
call pixel_sa8d_8x8_internal ; pix[8*stride]
SA8D_INTER
SWAP 0, 10
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
paddd m0, [pd_1]
psrld m0, 1
paddd m12, m0
%endmacro
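; x86-32 helper: finish the current 16x16 partial sum, round it and add it
; to the running total kept at [esp+36].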
%macro AVG_16x16 0
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
%endmacro
%macro SA8D 0
; sse2 doesn't seem to like the horizontal way of doing things
%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sa8d_8x8_internal
lea r6, [r0+4*r1]
lea r7, [r2+4*r3]
LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7
%if vertical
HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
%else ; non-sse2
HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11
%endif
paddw m0, m1
paddw m0, m2
paddw m0, m8
SAVE_MM_PERMUTATION
ret
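; exported 8x8 entry: reduce the kernel sum and apply the sa8d rounding,
; (sum + 1) >> 1.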
cglobal pixel_sa8d_8x8, 4,8,12
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
%if vertical == 0
mova m7, [hmul_8p]
%endif
call pixel_sa8d_8x8_internal
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%else
HADDW m0, m1
%endif ; HIGH_BIT_DEPTH
movd eax, m0
add eax, 1
shr eax, 1
RET
cglobal pixel_sa8d_16x16, 4,8,12
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
%if vertical == 0
mova m7, [hmul_8p]
%endif
call pixel_sa8d_8x8_internal ; pix[0]
add r2, 8*SIZEOF_PIXEL
add r0, 8*SIZEOF_PIXEL
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova m10, m0
call pixel_sa8d_8x8_internal ; pix[8]
lea r2, [r2+8*r3]
lea r0, [r0+8*r1]
SA8D_INTER
call pixel_sa8d_8x8_internal ; pix[8*stride+8]
sub r2, 8*SIZEOF_PIXEL
sub r0, 8*SIZEOF_PIXEL
SA8D_INTER
call pixel_sa8d_8x8_internal ; pix[8*stride]
SA8D_INTER
SWAP 0, 10
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd eax, m0
add eax, 1
shr eax, 1
RET
cglobal pixel_sa8d_8x16, 4,8,13
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
pxor m12, m12
%if vertical == 0
mova m7, [hmul_8p]
%endif
SA8D_8x8
lea r0, [r0 + 8*r1]
lea r2, [r2 + 8*r3]
SA8D_8x8
movd eax, m12
RET
cglobal pixel_sa8d_8x32, 4,8,13
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
pxor m12, m12
%if vertical == 0
mova m7, [hmul_8p]
%endif
SA8D_8x8
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
SA8D_8x8
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
SA8D_8x8
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
SA8D_8x8
movd eax, m12
RET
cglobal pixel_sa8d_16x8, 4,8,13
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
pxor m12, m12
%if vertical == 0
mova m7, [hmul_8p]
%endif
SA8D_8x8
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
SA8D_8x8
movd eax, m12
RET
cglobal pixel_sa8d_16x32, 4,8,13
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
pxor m12, m12
%if vertical == 0
mova m7, [hmul_8p]
%endif
SA8D_16x16
lea r0, [r0+8*r1]
lea r2, [r2+8*r3]
SA8D_16x16
movd eax, m12
RET
cglobal pixel_sa8d_16x64, 4,8,13
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
pxor m12, m12
%if vertical == 0
mova m7, [hmul_8p]
%endif
SA8D_16x16
lea r0, [r0+8*r1]
lea r2, [r2+8*r3]
SA8D_16x16
lea r0, [r0+8*r1]
lea r2, [r2+8*r3]
SA8D_16x16
lea r0, [r0+8*r1]
lea r2, [r2+8*r3]
SA8D_16x16
movd eax, m12
RET
cglobal pixel_sa8d_24x32, 4,8,13
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
pxor m12, m12
%if vertical == 0
mova m7, [hmul_8p]
%endif
SA8D_8x8
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
SA8D_8x8
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
SA8D_8x8
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
SA8D_8x8
sub r0, 8*SIZEOF_PIXEL
sub r2, 8*SIZEOF_PIXEL
SA8D_8x8
sub r0, 8*SIZEOF_PIXEL
sub r2, 8*SIZEOF_PIXEL
SA8D_8x8
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
SA8D_8x8
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
SA8D_8x8
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
SA8D_8x8
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
SA8D_8x8
sub r0, 8*SIZEOF_PIXEL
sub r2, 8*SIZEOF_PIXEL
SA8D_8x8
sub r0, 8*SIZEOF_PIXEL
sub r2, 8*SIZEOF_PIXEL
SA8D_8x8
movd eax, m12
RET
cglobal pixel_sa8d_32x8, 4,8,13
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
pxor m12, m12
%if vertical == 0
mova m7, [hmul_8p]
%endif
SA8D_8x8
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
SA8D_8x8
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
SA8D_8x8
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
SA8D_8x8
movd eax, m12
RET
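; for the larger sizes r4/r5 are briefly reused as 8*stride to step back up
; eight rows before moving across, then restored to 3*stride (which the
; kernel expects) before the next call.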
cglobal pixel_sa8d_32x16, 4,8,13
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
pxor m12, m12
%if vertical == 0
mova m7, [hmul_8p]
%endif
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
movd eax, m12
RET
cglobal pixel_sa8d_32x24, 4,8,13
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
pxor m12, m12
%if vertical == 0
mova m7, [hmul_8p]
%endif
SA8D_8x8
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
SA8D_8x8
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
SA8D_8x8
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
SA8D_8x8
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
SA8D_8x8
sub r0, 8*SIZEOF_PIXEL
sub r2, 8*SIZEOF_PIXEL
SA8D_8x8
sub r0, 8*SIZEOF_PIXEL
sub r2, 8*SIZEOF_PIXEL
SA8D_8x8
sub r0, 8*SIZEOF_PIXEL
sub r2, 8*SIZEOF_PIXEL
SA8D_8x8
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
SA8D_8x8
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
SA8D_8x8
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
SA8D_8x8
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
SA8D_8x8
movd eax, m12
RET
cglobal pixel_sa8d_32x32, 4,8,13
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
pxor m12, m12
%if vertical == 0
mova m7, [hmul_8p]
%endif
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r0, [r0+8*r1]
lea r2, [r2+8*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
sub r2, 16*SIZEOF_PIXEL
sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
movd eax, m12
RET
cglobal pixel_sa8d_32x64, 4,8,13
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
pxor m12, m12
%if vertical == 0
mova m7, [hmul_8p]
%endif
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r0, [r0+8*r1]
lea r2, [r2+8*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
sub r2, 16*SIZEOF_PIXEL
sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r0, [r0+8*r1]
lea r2, [r2+8*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r0, [r0+8*r1]
lea r2, [r2+8*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
sub r2, 16*SIZEOF_PIXEL
sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
movd eax, m12
RET
cglobal pixel_sa8d_48x64, 4,8,13
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
pxor m12, m12
%if vertical == 0
mova m7, [hmul_8p]
%endif
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r0, [r0+8*r1]
lea r2, [r2+8*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
sub r2, 16*SIZEOF_PIXEL
sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
sub r2, 16*SIZEOF_PIXEL
sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r0, [r0+8*r1]
lea r2, [r2+8*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r0, [r0+8*r1]
lea r2, [r2+8*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
sub r2, 16*SIZEOF_PIXEL
sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
sub r2, 16*SIZEOF_PIXEL
sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
movd eax, m12
RET
cglobal pixel_sa8d_64x16, 4,8,13
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
pxor m12, m12
%if vertical == 0
mova m7, [hmul_8p]
%endif
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
movd eax, m12
RET
cglobal pixel_sa8d_64x32, 4,8,13
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
pxor m12, m12
%if vertical == 0
mova m7, [hmul_8p]
%endif
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r0, [r0+8*r1]
lea r2, [r2+8*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
sub r2, 16*SIZEOF_PIXEL
sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
sub r2, 16*SIZEOF_PIXEL
sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
sub r2, 16*SIZEOF_PIXEL
sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
movd eax, m12
RET
cglobal pixel_sa8d_64x48, 4,8,13
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
pxor m12, m12
%if vertical == 0
mova m7, [hmul_8p]
%endif
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r0, [r0+8*r1]
lea r2, [r2+8*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
sub r2, 16*SIZEOF_PIXEL
sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
sub r2, 16*SIZEOF_PIXEL
sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
sub r2, 16*SIZEOF_PIXEL
sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r0, [r0+8*r1]
lea r2, [r2+8*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
movd eax, m12
RET
cglobal pixel_sa8d_64x64, 4,8,13
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
pxor m12, m12
%if vertical == 0
mova m7, [hmul_8p]
%endif
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r0, [r0+8*r1]
lea r2, [r2+8*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
sub r2, 16*SIZEOF_PIXEL
sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
sub r2, 16*SIZEOF_PIXEL
sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
sub r2, 16*SIZEOF_PIXEL
sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r0, [r0+8*r1]
lea r2, [r2+8*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
add r2, 16*SIZEOF_PIXEL
add r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r0, [r0+8*r1]
lea r2, [r2+8*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
sub r2, 16*SIZEOF_PIXEL
sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
sub r2, 16*SIZEOF_PIXEL
sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
lea r4, [8*r1]
lea r5, [8*r3]
sub r0, r4
sub r2, r5
sub r2, 16*SIZEOF_PIXEL
sub r0, 16*SIZEOF_PIXEL
lea r4, [3*r1]
lea r5, [3*r3]
SA8D_16x16
movd eax, m12
RET
%else ; ARCH_X86_32
%if mmsize == 16
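; x86-32 kernel: with only 8 xmm registers available, intermediate rows are
; spilled to the aligned scratch area below esp (spill0..spill2).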
cglobal pixel_sa8d_8x8_internal
%define spill0 [esp+4]
%define spill1 [esp+20]
%define spill2 [esp+36]
%if vertical
LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
HADAMARD4_2D 0, 1, 2, 3, 4
movdqa spill0, m3
LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
HADAMARD4_2D 4, 5, 6, 7, 3
HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
movdqa m3, spill0
paddw m0, m1
HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
%else ; non-sse2
mova m7, [hmul_8p]
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
; could do first HADAMARD4_V here to save spilling later
; surprisingly, not a win on conroe or even p4
mova spill0, m2
mova spill1, m3
mova spill2, m1
SWAP 1, 7
LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
HADAMARD4_V 4, 5, 6, 7, 3
mova m1, spill2
mova m2, spill0
mova m3, spill1
mova spill0, m6
mova spill1, m7
HADAMARD4_V 0, 1, 2, 3, 7
SUMSUB_BADC w, 0, 4, 1, 5, 7
HADAMARD 2, sumsub, 0, 4, 7, 6
HADAMARD 2, sumsub, 1, 5, 7, 6
HADAMARD 1, amax, 0, 4, 7, 6
HADAMARD 1, amax, 1, 5, 7, 6
mova m6, spill0
mova m7, spill1
paddw m0, m1
SUMSUB_BADC w, 2, 6, 3, 7, 4
HADAMARD 2, sumsub, 2, 6, 4, 5
HADAMARD 2, sumsub, 3, 7, 4, 5
HADAMARD 1, amax, 2, 6, 4, 5
HADAMARD 1, amax, 3, 7, 4, 5
%endif ; sse2/non-sse2
paddw m0, m2
paddw m0, m3
SAVE_MM_PERMUTATION
ret
%endif ; mmsize == 16
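; standalone sse2 (vertical) kernel used by the larger x86-32 routines
; below; it always takes the LOAD_DIFF path regardless of the vertical
; define.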
cglobal pixel_sa8d_8x8_internal2
%define spill0 [esp+4]
LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
HADAMARD4_2D 0, 1, 2, 3, 4
movdqa spill0, m3
LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
HADAMARD4_2D 4, 5, 6, 7, 3
HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
movdqa m3, spill0
paddw m0, m1
HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
paddw m0, m2
paddw m0, m3
SAVE_MM_PERMUTATION
ret
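; the x86-32 entry points align esp by hand (r6 keeps the old value) and
; reload the pixel pointers from the caller's stack ([r6+20], [r6+28])
; when moving to a new tile.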
cglobal pixel_sa8d_8x8, 4,7
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 48
lea r4, [3*r1]
lea r5, [3*r3]
call pixel_sa8d_8x8_internal
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%else
HADDW m0, m1
%endif ; HIGH_BIT_DEPTH
movd eax, m0
add eax, 1
shr eax, 1
mov esp, r6
RET
cglobal pixel_sa8d_16x16, 4,7
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 64
lea r4, [3*r1]
lea r5, [3*r3]
call pixel_sa8d_8x8_internal
%if mmsize == 8
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endif
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal
%if mmsize == 8
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%else
SA8D_INTER
%endif
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal
%if HIGH_BIT_DEPTH
SA8D_INTER
%else ; !HIGH_BIT_DEPTH
paddusw m0, [esp+64-mmsize]
%if mmsize == 16
HADDUW m0, m1
%else
mova m2, [esp+48]
pxor m7, m7
mova m1, m0
mova m3, m2
punpcklwd m0, m7
punpckhwd m1, m7
punpcklwd m2, m7
punpckhwd m3, m7
paddd m0, m1
paddd m2, m3
paddd m0, m2
HADDD m0, m1
%endif
%endif ; HIGH_BIT_DEPTH
movd eax, m0
add eax, 1
shr eax, 1
mov esp, r6
RET
cglobal pixel_sa8d_8x16, 4,7,8
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 64
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov eax, r4d
mov esp, r6
RET
cglobal pixel_sa8d_8x32, 4,7,8
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 64
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov eax, r4d
mov esp, r6
RET
cglobal pixel_sa8d_16x8, 4,7,8
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 64
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov eax, r4d
mov esp, r6
RET
cglobal pixel_sa8d_16x32, 4,7,8
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 64
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov eax, r4d
mov esp, r6
RET
cglobal pixel_sa8d_16x64, 4,7,8
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 64
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov eax, r4d
mov esp, r6
RET
cglobal pixel_sa8d_24x32, 4,7,8
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 64
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov eax, r4d
mov esp, r6
RET
cglobal pixel_sa8d_32x8, 4,7,8
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 64
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov eax, r4d
mov esp, r6
RET
cglobal pixel_sa8d_32x16, 4,7,8
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 64
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov eax, r4d
mov esp, r6
RET
cglobal pixel_sa8d_32x24, 4,7,8
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 64
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
HADDUW m0, m1
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov eax, r4d
mov esp, r6
RET
cglobal pixel_sa8d_32x32, 4,7,8
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 64
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov eax, r4d
mov esp, r6
RET
cglobal pixel_sa8d_32x64, 4,7,8
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 64
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov eax, r4d
mov esp, r6
RET
cglobal pixel_sa8d_48x64, 4,7,8
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 64
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 32*SIZEOF_PIXEL
add r2, 32*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 40*SIZEOF_PIXEL
add r2, 40*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 32*SIZEOF_PIXEL
add r2, 32*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 40*SIZEOF_PIXEL
add r2, 40*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 32*SIZEOF_PIXEL
add r2, 32*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 40*SIZEOF_PIXEL
add r2, 40*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 32*SIZEOF_PIXEL
add r2, 32*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 40*SIZEOF_PIXEL
add r2, 40*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
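; final sa8d for this half: halve the accumulated Hadamard sum with rounding,
; then add the rounded subtotal saved earlier at [esp+36]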
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov eax, r4d
mov esp, r6
RET
cglobal pixel_sa8d_64x16, 4,7,8
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 64
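; x86-32 frame layout (a note inferred from the code below): r6 preserves the
; caller's esp, [r6+20]/[r6+28] reload the pix1/pix2 arguments, [esp+36] holds
; the rounded subtotal of already-finished blocks, and [esp+48] /
; [esp+64-mmsize] are spill slots for vector partial sums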
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 32*SIZEOF_PIXEL
add r2, 32*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 40*SIZEOF_PIXEL
add r2, 40*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 48*SIZEOF_PIXEL
add r2, 48*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 56*SIZEOF_PIXEL
add r2, 56*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov eax, r4d
mov esp, r6
RET
cglobal pixel_sa8d_64x32, 4,7,8
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 64
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 32*SIZEOF_PIXEL
add r2, 32*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 40*SIZEOF_PIXEL
add r2, 40*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 48*SIZEOF_PIXEL
add r2, 48*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 56*SIZEOF_PIXEL
add r2, 56*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 32*SIZEOF_PIXEL
add r2, 32*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 40*SIZEOF_PIXEL
add r2, 40*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 48*SIZEOF_PIXEL
add r2, 48*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 56*SIZEOF_PIXEL
add r2, 56*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov eax, r4d
mov esp, r6
RET
cglobal pixel_sa8d_64x48, 4,7,8
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 64
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 32*SIZEOF_PIXEL
add r2, 32*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 40*SIZEOF_PIXEL
add r2, 40*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 48*SIZEOF_PIXEL
add r2, 48*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 56*SIZEOF_PIXEL
add r2, 56*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 32*SIZEOF_PIXEL
add r2, 32*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 40*SIZEOF_PIXEL
add r2, 40*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 48*SIZEOF_PIXEL
add r2, 48*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 56*SIZEOF_PIXEL
add r2, 56*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 32*SIZEOF_PIXEL
add r2, 32*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 40*SIZEOF_PIXEL
add r2, 40*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 48*SIZEOF_PIXEL
add r2, 48*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 56*SIZEOF_PIXEL
add r2, 56*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov eax, r4d
mov esp, r6
RET
cglobal pixel_sa8d_64x64, 4,7,8
FIX_STRIDES r1, r3
mov r6, esp
and esp, ~15
sub esp, 64
lea r4, [r1 + 2*r1]
lea r5, [r3 + 2*r3]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
mov dword [esp+36], r4d
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 32*SIZEOF_PIXEL
add r2, 32*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 40*SIZEOF_PIXEL
add r2, 40*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 48*SIZEOF_PIXEL
add r2, 48*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 56*SIZEOF_PIXEL
add r2, 56*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 32*SIZEOF_PIXEL
add r2, 32*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 40*SIZEOF_PIXEL
add r2, 40*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 48*SIZEOF_PIXEL
add r2, 48*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 56*SIZEOF_PIXEL
add r2, 56*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 32*SIZEOF_PIXEL
add r2, 32*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 40*SIZEOF_PIXEL
add r2, 40*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 48*SIZEOF_PIXEL
add r2, 48*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 56*SIZEOF_PIXEL
add r2, 56*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
lea r0, [r0 + r1*8]
lea r2, [r2 + r3*8]
mov [r6+20], r0
mov [r6+28], r2
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 8*SIZEOF_PIXEL
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 16*SIZEOF_PIXEL
add r2, 16*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 24*SIZEOF_PIXEL
add r2, 24*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 32*SIZEOF_PIXEL
add r2, 32*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 40*SIZEOF_PIXEL
add r2, 40*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
AVG_16x16
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 48*SIZEOF_PIXEL
add r2, 48*SIZEOF_PIXEL
lea r4, [r1 + 2*r1]
call pixel_sa8d_8x8_internal2
%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+48], m0
mov r0, [r6+20]
mov r2, [r6+28]
add r0, 56*SIZEOF_PIXEL
add r2, 56*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal2
SA8D_INTER
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal2
SA8D_INTER
%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd r4d, m0
add r4d, 1
shr r4d, 1
add r4d, dword [esp+36]
mov eax, r4d
mov esp, r6
RET
%endif ; !ARCH_X86_64
%endmacro ; SA8D
;=============================================================================
; INTRA SATD
;=============================================================================
%define TRANS TRANS_SSE2
%define DIFFOP DIFF_UNPACK_SSE2
%define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2
%define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
%define movdqu movups
%define punpcklqdq movlhps
INIT_XMM sse2
SA8D
SATDS_SSE2
%if HIGH_BIT_DEPTH == 0
INIT_XMM ssse3,atom
SATDS_SSE2
SA8D
%endif
%define DIFFOP DIFF_SUMSUB_SSSE3
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
%if HIGH_BIT_DEPTH == 0
%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
%endif
INIT_XMM ssse3
SATDS_SSE2
SA8D
%undef movdqa ; nehalem doesn't like movaps
%undef movdqu ; movups
%undef punpcklqdq ; or movlhps
%define TRANS TRANS_SSE4
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
INIT_XMM sse4
SATDS_SSE2
SA8D
; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
; it's effectively free.
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
INIT_XMM avx
SATDS_SSE2
SA8D
%define TRANS TRANS_XOP
INIT_XMM xop
SATDS_SSE2
SA8D
%if HIGH_BIT_DEPTH == 0
%define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2
%define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2
%define TRANS TRANS_SSE4
%macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul
movq xm%1, [r0]
movq xm%3, [r2]
movq xm%2, [r0+r1]
movq xm%4, [r2+r3]
vinserti128 m%1, m%1, [r0+4*r1], 1
vinserti128 m%3, m%3, [r2+4*r3], 1
vinserti128 m%2, m%2, [r0+r4], 1
vinserti128 m%4, m%4, [r2+r5], 1
punpcklqdq m%1, m%1
punpcklqdq m%3, m%3
punpcklqdq m%2, m%2
punpcklqdq m%4, m%4
DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
movq xm%3, [r0]
movq xm%5, [r2]
movq xm%4, [r0+r1]
movq xm%6, [r2+r3]
vinserti128 m%3, m%3, [r0+4*r1], 1
vinserti128 m%5, m%5, [r2+4*r3], 1
vinserti128 m%4, m%4, [r0+r4], 1
vinserti128 m%6, m%6, [r2+r5], 1
punpcklqdq m%3, m%3
punpcklqdq m%5, m%5
punpcklqdq m%4, m%4
punpcklqdq m%6, m%6
DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7
%endmacro
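; common avx2 satd/sa8d prologue: %1 = accumulator (zeroed), %2 = hmul
; constant; %3 nonzero selects the 8-pixel-wide setup (hmul_8p, r4/r5 =
; 5*stride) instead of the 16-pixel-wide one (hmul_16p, r4/r5 = 3*stride)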
%macro SATD_START_AVX2 2-3 0
FIX_STRIDES r1, r3
%if %3
mova %2, [hmul_8p]
lea r4, [5*r1]
lea r5, [5*r3]
%else
mova %2, [hmul_16p]
lea r4, [3*r1]
lea r5, [3*r3]
%endif
pxor %1, %1
%endmacro
%define TRANS TRANS_SSE4
INIT_YMM avx2
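; sums the word satd of a 16x8 block into m6, four rows per pass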
cglobal pixel_satd_16x8_internal
LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0
SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
ret
cglobal pixel_satd_16x16, 4,6,8
SATD_START_AVX2 m6, m7
call pixel_satd_16x8_internal
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
pixel_satd_16x8_internal2:
call pixel_satd_16x8_internal
vextracti128 xm0, m6, 1
paddw xm0, xm6
SATD_END_SSE2 xm0
RET
cglobal pixel_satd_16x8, 4,6,8
SATD_START_AVX2 m6, m7
jmp pixel_satd_16x8_internal2
cglobal pixel_satd_8x8_internal
LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
ret
cglobal pixel_satd_8x16, 4,6,8
SATD_START_AVX2 m6, m7, 1
call pixel_satd_8x8_internal
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
call pixel_satd_8x8_internal
vextracti128 xm0, m6, 1
paddw xm0, xm6
SATD_END_SSE2 xm0
RET
cglobal pixel_satd_8x8, 4,6,8
SATD_START_AVX2 m6, m7, 1
call pixel_satd_8x8_internal
vextracti128 xm0, m6, 1
paddw xm0, xm6
SATD_END_SSE2 xm0
RET
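; 8x8 sa8d kernel: row pairs are packed into ymm registers, pushed through an
; 8x8 Hadamard (vertical butterflies, then horizontal passes), and the
; per-coefficient absolute maxima are accumulated into m6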
cglobal pixel_sa8d_8x8_internal
LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
HADAMARD4_V 0, 1, 2, 3, 4
HADAMARD 8, sumsub, 0, 1, 4, 5
HADAMARD 8, sumsub, 2, 3, 4, 5
HADAMARD 2, sumsub, 0, 1, 4, 5
HADAMARD 2, sumsub, 2, 3, 4, 5
HADAMARD 1, amax, 0, 1, 4, 5
HADAMARD 1, amax, 2, 3, 4, 5
paddw m6, m0
paddw m6, m2
ret
cglobal pixel_sa8d_8x8, 4,6,8
SATD_START_AVX2 m6, m7, 1
call pixel_sa8d_8x8_internal
vextracti128 xm1, m6, 1
paddw xm6, xm1
HADDW xm6, xm1
movd eax, xm6
add eax, 1
shr eax, 1
RET
cglobal pixel_sa8d_16x16, 4,6,8
SATD_START_AVX2 m6, m7, 1
call pixel_sa8d_8x8_internal ; pix[0]
sub r0, r1
sub r0, r1
add r0, 8*SIZEOF_PIXEL
sub r2, r3
sub r2, r3
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal ; pix[8]
add r0, r4
add r0, r1
add r2, r5
add r2, r3
call pixel_sa8d_8x8_internal ; pix[8*stride+8]
sub r0, r1
sub r0, r1
sub r0, 8*SIZEOF_PIXEL
sub r2, r3
sub r2, r3
sub r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal ; pix[8*stride]
; TODO: analyze Dynamic Range
vextracti128 xm0, m6, 1
paddusw xm6, xm0
HADDUW xm6, xm0
movd eax, xm6
add eax, 1
shr eax, 1
RET
cglobal pixel_sa8d_16x16_internal
call pixel_sa8d_8x8_internal ; pix[0]
sub r0, r1
sub r0, r1
add r0, 8*SIZEOF_PIXEL
sub r2, r3
sub r2, r3
add r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal ; pix[8]
add r0, r4
add r0, r1
add r2, r5
add r2, r3
call pixel_sa8d_8x8_internal ; pix[8*stride+8]
sub r0, r1
sub r0, r1
sub r0, 8*SIZEOF_PIXEL
sub r2, r3
sub r2, r3
sub r2, 8*SIZEOF_PIXEL
call pixel_sa8d_8x8_internal ; pix[8*stride]
; TODO: analyze Dynamic Range
vextracti128 xm0, m6, 1
paddusw xm6, xm0
HADDUW xm6, xm0
movd eax, xm6
add eax, 1
shr eax, 1
ret
%if ARCH_X86_64
cglobal pixel_sa8d_32x32, 4,8,8
; NOTE: r6 aliases rax on x86-64, so the accumulator lives in r7d and the
; internal's return value is read directly from eax
SATD_START_AVX2 m6, m7, 1
xor r7d, r7d
call pixel_sa8d_16x16_internal ; [0]
pxor m6, m6
add r7d, eax
add r0, r4
add r0, r1
add r2, r5
add r2, r3
call pixel_sa8d_16x16_internal ; [2]
pxor m6, m6
add r7d, eax
lea eax, [r4 * 5 - 16]
sub r0, rax
sub r0, r1
lea eax, [r5 * 5 - 16]
sub r2, rax
sub r2, r3
call pixel_sa8d_16x16_internal ; [1]
pxor m6, m6
add r7d, eax
add r0, r4
add r0, r1
add r2, r5
add r2, r3
call pixel_sa8d_16x16_internal ; [3]
add eax, r7d
RET
%endif ; ARCH_X86_64=1
%endif ; HIGH_BIT_DEPTH
; Input 10bit, Output 8bit
;------------------------------------------------------------------------------------------------------------------------
;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
;------------------------------------------------------------------------------------------------------------------------
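; A minimal C sketch of what this kernel computes (the _c name and loop shape
; are illustrative, not the exact x265 reference model):
;
;   void downShift_16_c(const uint16_t *src, intptr_t srcStride,
;                       uint8_t *dst, intptr_t dstStride,
;                       int width, int height, int shift, uint16_t mask)
;   {
;       for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
;           for (int x = 0; x < width; x++)
;               dst[x] = (uint8_t)((src[x] >> shift) & mask);
;   }
;
; The SIMD code below relies on packuswb's unsigned saturation and never
; applies the mask argument explicitly.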
INIT_XMM sse2
cglobal downShift_16, 7,7,3
movd m0, r6d ; m0 = shift
add r1, r1
dec r5d
.loopH:
xor r6, r6
.loopW:
movu m1, [r0 + r6 * 2]
movu m2, [r0 + r6 * 2 + 16]
psrlw m1, m0
psrlw m2, m0
packuswb m1, m2
movu [r2 + r6], m1
add r6, 16
cmp r6d, r4d
jl .loopW
; move to next row
add r0, r1
add r2, r3
dec r5d
jnz .loopH
; process the last row of each frame (handles widths that are not a multiple of 16)
.loop16:
movu m1, [r0]
movu m2, [r0 + 16]
psrlw m1, m0
psrlw m2, m0
packuswb m1, m2
movu [r2], m1
add r0, 2 * mmsize
add r2, mmsize
sub r4d, 16
jz .end
cmp r4d, 15
jg .loop16
cmp r4d, 8
jl .process4
movu m1, [r0]
psrlw m1, m0
packuswb m1, m1
movh [r2], m1
add r0, mmsize
add r2, 8
sub r4d, 8
jz .end
.process4:
cmp r4d, 4
jl .process2
movh m1,[r0]
psrlw m1, m0
packuswb m1, m1
movd [r2], m1
add r0, 8
add r2, 4
sub r4d, 4
jz .end
.process2:
cmp r4d, 2
jl .process1
movd m1, [r0]
psrlw m1, m0
packuswb m1, m1
movd r6, m1
mov [r2], r6w
add r0, 4
add r2, 2
sub r4d, 2
jz .end
.process1:
movd m1, [r0]
psrlw m1, m0
packuswb m1, m1
movd r3, m1
mov [r2], r3b
.end:
RET
; Input 10bit, Output 8bit
;-------------------------------------------------------------------------------------------------------------------------------------
;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
;-------------------------------------------------------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal downShift_16, 6,7,3
movd xm0, r6m ; m0 = shift
add r1d, r1d
dec r5d
.loopH:
xor r6, r6
.loopW:
movu m1, [r0 + r6 * 2 + 0]
movu m2, [r0 + r6 * 2 + 32]
vpsrlw m1, xm0
vpsrlw m2, xm0
packuswb m1, m2
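; packuswb interleaves within each 128-bit lane; vpermq with 11011000b (0xD8)
; restores linear byte order across the two lanes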
vpermq m1, m1, 11011000b
movu [r2 + r6], m1
add r6d, mmsize
cmp r6d, r4d
jl .loopW
; move to next row
add r0, r1
add r2, r3
dec r5d
jnz .loopH
; process the last row of each frame (handles widths that are not a multiple of 32)
mov r6d, r4d
and r4d, 31
shr r6d, 5
.loop32:
movu m1, [r0]
movu m2, [r0 + 32]
psrlw m1, xm0
psrlw m2, xm0
packuswb m1, m2
vpermq m1, m1, 11011000b
movu [r2], m1
add r0, 2*mmsize
add r2, mmsize
dec r6d
jnz .loop32
cmp r4d, 16
jl .process8
movu m1, [r0]
psrlw m1, xm0
packuswb m1, m1
vpermq m1, m1, 10001000b
movu [r2], xm1
add r0, mmsize
add r2, 16
sub r4d, 16
jz .end
.process8:
cmp r4d, 8
jl .process4
movu m1, [r0]
psrlw m1, xm0
packuswb m1, m1
movq [r2], xm1
add r0, 16
add r2, 8
sub r4d, 8
jz .end
.process4:
cmp r4d, 4
jl .process2
movq xm1,[r0]
psrlw m1, xm0
packuswb m1, m1
movd [r2], xm1
add r0, 8
add r2, 4
sub r4d, 4
jz .end
.process2:
cmp r4d, 2
jl .process1
movd xm1, [r0]
psrlw m1, xm0
packuswb m1, m1
movd r6d, xm1
mov [r2], r6w
add r0, 4
add r2, 2
sub r4d, 2
jz .end
.process1:
movd xm1, [r0]
psrlw m1, xm0
packuswb m1, m1
movd r3d, xm1
mov [r2], r3b
.end:
RET
; Input 8bit, Output 10bit
;---------------------------------------------------------------------------------------------------------------------
;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
;---------------------------------------------------------------------------------------------------------------------
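; A minimal C sketch of the intended behaviour (illustrative only):
;
;   void upShift_8_c(const uint8_t *src, intptr_t srcStride,
;                    uint16_t *dst, intptr_t dstStride,
;                    int width, int height, int shift)
;   {
;       for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
;           for (int x = 0; x < width; x++)
;               dst[x] = (uint16_t)(src[x] << shift);
;   }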
INIT_XMM sse4
cglobal upShift_8, 6,7,3
movd xm2, r6m
add r3d, r3d
dec r5d
.loopH:
xor r6, r6
.loopW:
pmovzxbw m0,[r0 + r6]
pmovzxbw m1,[r0 + r6 + mmsize/2]
psllw m0, m2
psllw m1, m2
movu [r2 + r6 * 2], m0
movu [r2 + r6 * 2 + mmsize], m1
add r6d, mmsize
cmp r6d, r4d
jl .loopW
; move to next row
add r0, r1
add r2, r3
dec r5d
jg .loopH
; process the last row of each frame (handles widths that are not a multiple of 16)
mov r1d, (mmsize/2 - 1)
and r1d, r4d
sub r1, mmsize/2
; NOTE: width must be greater than or equal to 8
shr r4d, 3 ; log2(mmsize/2) -- 8 input pixels per iteration
.loopW8:
pmovzxbw m0,[r0]
psllw m0, m2
movu [r2], m0
add r0, mmsize/2
add r2, mmsize
dec r4d
jg .loopW8
; Mac OS X faults on reads past the array bound, so roll back a few bytes and redo an overlapping vector
pmovzxbw m0,[r0 + r1]
psllw m0, m2
movu [r2 + r1 * 2], m0
RET
;---------------------------------------------------------------------------------------------------------------------
;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
;---------------------------------------------------------------------------------------------------------------------
%if ARCH_X86_64
INIT_YMM avx2
cglobal upShift_8, 6,7,3
movd xm2, r6m
add r3d, r3d
dec r5d
.loopH:
xor r6, r6
.loopW:
pmovzxbw m0,[r0 + r6]
pmovzxbw m1,[r0 + r6 + mmsize/2]
psllw m0, xm2
psllw m1, xm2
movu [r2 + r6 * 2], m0
movu [r2 + r6 * 2 + mmsize], m1
add r6d, mmsize
cmp r6d, r4d
jl .loopW
; move to next row
add r0, r1
add r2, r3
dec r5d
jg .loopH
; process the last row of each frame (handles widths that are not a multiple of 32)
mov r1d, (mmsize/2 - 1)
and r1d, r4d
sub r1, mmsize/2
; NOTE: width must be greater than or equal to 16
shr r4d, 4 ; log2(mmsize/2) -- 16 input pixels per iteration
.loopW16:
pmovzxbw m0,[r0]
psllw m0, xm2
movu [r2], m0
add r0, mmsize/2
add r2, mmsize
dec r4d
jg .loopW16
; Mac OS X faults on reads past the array bound, so roll back a few bytes and redo an overlapping vector
pmovzxbw m0,[r0 + r1]
psllw m0, xm2
movu [r2 + r1 * 2], m0
RET
%endif
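; packed absolute value of two dword vectors: pabsd where ssse3 is available,
; otherwise max(x, 0 - x); note the fallback relies on pmaxsd (sse4.1)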
%macro ABSD2 6 ; dst1, dst2, src1, src2, tmp, tmp
%if cpuflag(ssse3)
pabsd %1, %3
pabsd %2, %4
%elifidn %1, %3
pxor %5, %5
pxor %6, %6
psubd %5, %1
psubd %6, %2
pmaxsd %1, %5
pmaxsd %2, %6
%else
pxor %1, %1
pxor %2, %2
psubd %1, %3
psubd %2, %4
pmaxsd %1, %3
pmaxsd %2, %4
%endif
%endmacro
; Input 10bit, Output 12bit
;------------------------------------------------------------------------------------------------------------------------
;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
;------------------------------------------------------------------------------------------------------------------------
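; A minimal C sketch of the intended behaviour (illustrative only); here the
; mask clips the widened samples to the valid pixel range:
;
;   void upShift_16_c(const uint16_t *src, intptr_t srcStride,
;                     uint16_t *dst, intptr_t dstStride,
;                     int width, int height, int shift, uint16_t mask)
;   {
;       for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
;           for (int x = 0; x < width; x++)
;               dst[x] = (uint16_t)((src[x] << shift) & mask);
;   }
;
; The SIMD code uses [pw_pixel_max] as the mask rather than the mask argument.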
INIT_XMM sse2
cglobal upShift_16, 6,7,4
movd m0, r6m ; m0 = shift
mova m3, [pw_pixel_max]
FIX_STRIDES r1d, r3d
dec r5d
.loopH:
xor r6d, r6d
.loopW:
movu m1, [r0 + r6 * SIZEOF_PIXEL]
movu m2, [r0 + r6 * SIZEOF_PIXEL + mmsize]
psllw m1, m0
psllw m2, m0
; TODO: if the input is guaranteed valid, the two masking instructions below can be removed.
pand m1, m3
pand m2, m3
movu [r2 + r6 * SIZEOF_PIXEL], m1
movu [r2 + r6 * SIZEOF_PIXEL + mmsize], m2
add r6, mmsize * 2 / SIZEOF_PIXEL
cmp r6d, r4d
jl .loopW
; move to next row
add r0, r1
add r2, r3
dec r5d
jnz .loopH
; process the last row of each frame (handles widths that are not a multiple of 16)
.loop16:
movu m1, [r0]
movu m2, [r0 + mmsize]
psllw m1, m0
psllw m2, m0
pand m1, m3
pand m2, m3
movu [r2], m1
movu [r2 + mmsize], m2
add r0, 2 * mmsize
add r2, 2 * mmsize
sub r4d, 16
jz .end
jg .loop16
cmp r4d, 8
jl .process4
movu m1, [r0]
psllw m1, m0
pand m1, m3
movu [r2], m1
add r0, mmsize
add r2, mmsize
sub r4d, 8
jz .end
.process4:
cmp r4d, 4
jl .process2
movh m1,[r0]
psllw m1, m0
pand m1, m3
movh [r2], m1
add r0, 8
add r2, 8
sub r4d, 4
jz .end
.process2:
cmp r4d, 2
jl .process1
movd m1, [r0]
psllw m1, m0
pand m1, m3
movd [r2], m1
add r0, 4
add r2, 4
sub r4d, 2
jz .end
.process1:
movd m1, [r0]
psllw m1, m0
pand m1, m3
movd r3, m1
mov [r2], r3w
.end:
RET
; Input 10bit, Output 12bit
;-------------------------------------------------------------------------------------------------------------------------------------
;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
;-------------------------------------------------------------------------------------------------------------------------------------
; TODO: no test coverage yet!
INIT_YMM avx2
cglobal upShift_16, 6,7,4
movd xm0, r6m ; m0 = shift
vbroadcasti128 m3, [pw_pixel_max]
FIX_STRIDES r1d, r3d
dec r5d
.loopH:
xor r6d, r6d
.loopW:
movu m1, [r0 + r6 * SIZEOF_PIXEL]
movu m2, [r0 + r6 * SIZEOF_PIXEL + mmsize]
psllw m1, xm0
psllw m2, xm0
pand m1, m3
pand m2, m3
movu [r2 + r6 * SIZEOF_PIXEL], m1
movu [r2 + r6 * SIZEOF_PIXEL + mmsize], m2
add r6, mmsize * 2 / SIZEOF_PIXEL
cmp r6d, r4d
jl .loopW
; move to next row
add r0, r1
add r2, r3
dec r5d
jnz .loopH
; process the last row of each frame (handles widths that are not a multiple of 32)
mov r6d, r4d
and r4d, 31
shr r6d, 5
.loop32:
movu m1, [r0]
movu m2, [r0 + mmsize]
psllw m1, xm0
psllw m2, xm0
pand m1, m3
pand m2, m3
movu [r2], m1
movu [r2 + mmsize], m2
add r0, 2*mmsize
add r2, 2*mmsize
dec r6d
jnz .loop32
cmp r4d, 16
jl .process8
movu m1, [r0]
psllw m1, xm0
pand m1, m3
movu [r2], m1
add r0, mmsize
add r2, mmsize
sub r4d, 16
jz .end
.process8:
cmp r4d, 8
jl .process4
movu xm1, [r0]
psllw xm1, xm0
pand xm1, xm3
movu [r2], xm1
add r0, 16
add r2, 16
sub r4d, 8
jz .end
.process4:
cmp r4d, 4
jl .process2
movq xm1,[r0]
psllw xm1, xm0
pand xm1, xm3
movq [r2], xm1
add r0, 8
add r2, 8
sub r4d, 4
jz .end
.process2:
cmp r4d, 2
jl .process1
movd xm1, [r0]
psllw xm1, xm0
pand xm1, xm3
movd [r2], xm1
add r0, 4
add r2, 4
sub r4d, 2
jz .end
.process1:
movd xm1, [r0]
psllw xm1, xm0
pand xm1, xm3
movd r3d, xm1
mov [r2], r3w
.end:
RET
;---------------------------------------------------------------------------------------------------------------------
;int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
;---------------------------------------------------------------------------------------------------------------------
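; psy cost compares the "AC energy" of the source and recon blocks, where a
; block's energy is its Hadamard sum minus its DC component. A hedged C sketch
; of the 4x4 case (satd4x4/sad4x4 are assumed helpers measured against an
; all-zero block):
;
;   int psyCost_pp_4x4_c(const pixel *source, intptr_t sstride,
;                        const pixel *recon, intptr_t rstride)
;   {
;       int srcEnergy = satd4x4(source, sstride) - (sad4x4(source, sstride) >> 2);
;       int recEnergy = satd4x4(recon,  rstride) - (sad4x4(recon,  rstride) >> 2);
;       return abs(srcEnergy - recEnergy);
;   }
;
; The larger sizes sum |srcEnergy - recEnergy| over 8x8 sub-blocks, using sa8d
; in place of satd.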
INIT_XMM sse4
cglobal psyCost_pp_4x4, 4, 5, 8
%if HIGH_BIT_DEPTH
FIX_STRIDES r1, r3
lea r4, [3 * r1]
movddup m0, [r0]
movddup m1, [r0 + r1]
movddup m2, [r0 + r1 * 2]
movddup m3, [r0 + r4]
mova m4, [hmul_8w]
pmaddwd m0, m4
pmaddwd m1, m4
pmaddwd m2, m4
pmaddwd m3, m4
paddd m5, m0, m1
paddd m5, m2
paddd m5, m3
psrldq m4, m5, 4
paddd m5, m4
psrld m5, 2
SUMSUB_BA d, 0, 1, 4
SUMSUB_BA d, 2, 3, 4
SUMSUB_BA d, 0, 2, 4
SUMSUB_BA d, 1, 3, 4
%define ORDER unord
TRANS q, ORDER, 0, 2, 4, 6
TRANS q, ORDER, 1, 3, 4, 6
ABSD2 m0, m2, m0, m2, m4, m6
pmaxsd m0, m2
ABSD2 m1, m3, m1, m3, m4, m6
pmaxsd m1, m3
paddd m0, m1
movhlps m1, m0
paddd m0, m1
psrldq m1, m0, 4
paddd m0, m1
psubd m7, m0, m5
lea r4, [3 * r3]
movddup m0, [r2]
movddup m1, [r2 + r3]
movddup m2, [r2 + r3 * 2]
movddup m3, [r2 + r4]
mova m4, [hmul_8w]
pmaddwd m0, m4
pmaddwd m1, m4
pmaddwd m2, m4
pmaddwd m3, m4
paddd m5, m0, m1
paddd m5, m2
paddd m5, m3
psrldq m4, m5, 4
paddd m5, m4
psrld m5, 2
SUMSUB_BA d, 0, 1, 4
SUMSUB_BA d, 2, 3, 4
SUMSUB_BA d, 0, 2, 4
SUMSUB_BA d, 1, 3, 4
%define ORDER unord
TRANS q, ORDER, 0, 2, 4, 6
TRANS q, ORDER, 1, 3, 4, 6
ABSD2 m0, m2, m0, m2, m4, m6
pmaxsd m0, m2
ABSD2 m1, m3, m1, m3, m4, m6
pmaxsd m1, m3
paddd m0, m1
movhlps m1, m0
paddd m0, m1
psrldq m1, m0, 4
paddd m0, m1
psubd m0, m5
psubd m7, m0
pabsd m0, m7
movd eax, m0
%else ; !HIGH_BIT_DEPTH
lea r4, [3 * r1]
movd m0, [r0]
movd m1, [r0 + r1]
movd m2, [r0 + r1 * 2]
movd m3, [r0 + r4]
shufps m0, m1, 0
shufps m2, m3, 0
mova m4, [hmul_4p]
pmaddubsw m0, m4
pmaddubsw m2, m4
paddw m5, m0, m2
movhlps m4, m5
paddw m5, m4
pmaddwd m5, [pw_1]
psrld m5, 2
HADAMARD 0, sumsub, 0, 2, 1, 3
HADAMARD 4, sumsub, 0, 2, 1, 3
HADAMARD 1, amax, 0, 2, 1, 3
HADDW m0, m2
psubd m6, m0, m5
lea r4, [3 * r3]
movd m0, [r2]
movd m1, [r2 + r3]
movd m2, [r2 + r3 * 2]
movd m3, [r2 + r4]
shufps m0, m1, 0
shufps m2, m3, 0
mova m4, [hmul_4p]
pmaddubsw m0, m4
pmaddubsw m2, m4
paddw m5, m0, m2
movhlps m4, m5
paddw m5, m4
pmaddwd m5, [pw_1]
psrld m5, 2
HADAMARD 0, sumsub, 0, 2, 1, 3
HADAMARD 4, sumsub, 0, 2, 1, 3
HADAMARD 1, amax, 0, 2, 1, 3
HADDW m0, m2
psubd m0, m5
psubd m6, m0
pabsd m0, m6
movd eax, m0
%endif ; HIGH_BIT_DEPTH
RET
%if ARCH_X86_64
INIT_XMM sse4
cglobal psyCost_pp_8x8, 4, 6, 13
%if HIGH_BIT_DEPTH
FIX_STRIDES r1, r3
lea r4, [3 * r1]
pxor m10, m10
movu m0, [r0]
movu m1, [r0 + r1]
movu m2, [r0 + r1 * 2]
movu m3, [r0 + r4]
lea r5, [r0 + r1 * 4]
movu m4, [r5]
movu m5, [r5 + r1]
movu m6, [r5 + r1 * 2]
movu m7, [r5 + r4]
paddw m8, m0, m1
paddw m8, m2
paddw m8, m3
paddw m8, m4
paddw m8, m5
paddw m8, m6
paddw m8, m7
pmaddwd m8, [pw_1]
movhlps m9, m8
paddd m8, m9
psrldq m9, m8, 4
paddd m8, m9
psrld m8, 2
HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
paddd m0, m1
paddd m0, m2
paddd m0, m3
HADDUW m0, m1
paddd m0, [pd_1]
psrld m0, 1
psubd m10, m0, m8
lea r4, [3 * r3]
movu m0, [r2]
movu m1, [r2 + r3]
movu m2, [r2 + r3 * 2]
movu m3, [r2 + r4]
lea r5, [r2 + r3 * 4]
movu m4, [r5]
movu m5, [r5 + r3]
movu m6, [r5 + r3 * 2]
movu m7, [r5 + r4]
paddw m8, m0, m1
paddw m8, m2
paddw m8, m3
paddw m8, m4
paddw m8, m5
paddw m8, m6
paddw m8, m7
pmaddwd m8, [pw_1]
movhlps m9, m8
paddd m8, m9
psrldq m9, m8, 4
paddd m8, m9
psrld m8, 2
HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
paddd m0, m1
paddd m0, m2
paddd m0, m3
HADDUW m0, m1
paddd m0, [pd_1]
psrld m0, 1
psubd m0, m8
psubd m10, m0
pabsd m0, m10
movd eax, m0
%else ; !HIGH_BIT_DEPTH
lea r4, [3 * r1]
mova m8, [hmul_8p]
movddup m0, [r0]
movddup m1, [r0 + r1]
movddup m2, [r0 + r1 * 2]
movddup m3, [r0 + r4]
lea r5, [r0 + r1 * 4]
movddup m4, [r5]
movddup m5, [r5 + r1]
movddup m6, [r5 + r1 * 2]
movddup m7, [r5 + r4]
pmaddubsw m0, m8
pmaddubsw m1, m8
pmaddubsw m2, m8
pmaddubsw m3, m8
pmaddubsw m4, m8
pmaddubsw m5, m8
pmaddubsw m6, m8
pmaddubsw m7, m8
paddw m11, m0, m1
paddw m11, m2
paddw m11, m3
paddw m11, m4
paddw m11, m5
paddw m11, m6
paddw m11, m7
pmaddwd m11, [pw_1]
psrldq m10, m11, 4
paddd m11, m10
psrld m11, 2
HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
paddw m0, m1
paddw m0, m2
paddw m0, m3
HADDW m0, m1
paddd m0, [pd_1]
psrld m0, 1
psubd m12, m0, m11
lea r4, [3 * r3]
movddup m0, [r2]
movddup m1, [r2 + r3]
movddup m2, [r2 + r3 * 2]
movddup m3, [r2 + r4]
lea r5, [r2 + r3 * 4]
movddup m4, [r5]
movddup m5, [r5 + r3]
movddup m6, [r5 + r3 * 2]
movddup m7, [r5 + r4]
pmaddubsw m0, m8
pmaddubsw m1, m8
pmaddubsw m2, m8
pmaddubsw m3, m8
pmaddubsw m4, m8
pmaddubsw m5, m8
pmaddubsw m6, m8
pmaddubsw m7, m8
paddw m11, m0, m1
paddw m11, m2
paddw m11, m3
paddw m11, m4
paddw m11, m5
paddw m11, m6
paddw m11, m7
pmaddwd m11, [pw_1]
psrldq m10, m11, 4
paddd m11, m10
psrld m11, 2
HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
paddw m0, m1
paddw m0, m2
paddw m0, m3
HADDW m0, m1
paddd m0, [pd_1]
psrld m0, 1
psubd m0, m11
psubd m12, m0
pabsd m0, m12
movd eax, m0
%endif ; HIGH_BIT_DEPTH
RET
%endif
%if ARCH_X86_64
%if HIGH_BIT_DEPTH
INIT_XMM sse4
cglobal psyCost_pp_16x16, 4, 9, 14
FIX_STRIDES r1, r3
lea r4, [3 * r1]
lea r8, [3 * r3]
mova m12, [pw_1]
mova m13, [pd_1]
pxor m11, m11
mov r7d, 2
.loopH:
mov r6d, 2
.loopW:
pxor m10, m10
movu m0, [r0]
movu m1, [r0 + r1]
movu m2, [r0 + r1 * 2]
movu m3, [r0 + r4]
lea r5, [r0 + r1 * 4]
movu m4, [r5]
movu m5, [r5 + r1]
movu m6, [r5 + r1 * 2]
movu m7, [r5 + r4]
paddw m8, m0, m1
paddw m8, m2
paddw m8, m3
paddw m8, m4
paddw m8, m5
paddw m8, m6
paddw m8, m7
pmaddwd m8, m12
movhlps m9, m8
paddd m8, m9
psrldq m9, m8, 4
paddd m8, m9
psrld m8, 2
HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
paddd m0, m1
paddd m0, m2
paddd m0, m3
HADDUW m0, m1
paddd m0, m13
psrld m0, 1
psubd m10, m0, m8
movu m0, [r2]
movu m1, [r2 + r3]
movu m2, [r2 + r3 * 2]
movu m3, [r2 + r8]
lea r5, [r2 + r3 * 4]
movu m4, [r5]
movu m5, [r5 + r3]
movu m6, [r5 + r3 * 2]
movu m7, [r5 + r8]
paddw m8, m0, m1
paddw m8, m2
paddw m8, m3
paddw m8, m4
paddw m8, m5
paddw m8, m6
paddw m8, m7
pmaddwd m8, m12
movhlps m9, m8
paddd m8, m9
psrldq m9, m8, 4
paddd m8, m9
psrld m8, 2
HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
paddd m0, m1
paddd m0, m2
paddd m0, m3
HADDUW m0, m1
paddd m0, m13
psrld m0, 1
psubd m0, m8
psubd m10, m0
pabsd m0, m10
paddd m11, m0
add r0, 16
add r2, 16
dec r6d
jnz .loopW
lea r0, [r0 + r1 * 8 - 32]
lea r2, [r2 + r3 * 8 - 32]
dec r7d
jnz .loopH
movd eax, m11
RET
%else ; !HIGH_BIT_DEPTH
INIT_XMM sse4
cglobal psyCost_pp_16x16, 4, 9, 15
lea r4, [3 * r1]
lea r8, [3 * r3]
mova m8, [hmul_8p]
mova m10, [pw_1]
mova m14, [pd_1]
pxor m13, m13
mov r7d, 2
.loopH:
mov r6d, 2
.loopW:
pxor m12, m12
movddup m0, [r0]
movddup m1, [r0 + r1]
movddup m2, [r0 + r1 * 2]
movddup m3, [r0 + r4]
lea r5, [r0 + r1 * 4]
movddup m4, [r5]
movddup m5, [r5 + r1]
movddup m6, [r5 + r1 * 2]
movddup m7, [r5 + r4]
pmaddubsw m0, m8
pmaddubsw m1, m8
pmaddubsw m2, m8
pmaddubsw m3, m8
pmaddubsw m4, m8
pmaddubsw m5, m8
pmaddubsw m6, m8
pmaddubsw m7, m8
paddw m11, m0, m1
paddw m11, m2
paddw m11, m3
paddw m11, m4
paddw m11, m5
paddw m11, m6
paddw m11, m7
pmaddwd m11, m10
psrldq m9, m11, 4
paddd m11, m9
psrld m11, 2
HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
paddw m0, m1
paddw m0, m2
paddw m0, m3
HADDW m0, m1
paddd m0, m14
psrld m0, 1
psubd m12, m0, m11
movddup m0, [r2]
movddup m1, [r2 + r3]
movddup m2, [r2 + r3 * 2]
movddup m3, [r2 + r8]
lea r5, [r2 + r3 * 4]
movddup m4, [r5]
movddup m5, [r5 + r3]
movddup m6, [r5 + r3 * 2]
movddup m7, [r5 + r8]
pmaddubsw m0, m8
pmaddubsw m1, m8
pmaddubsw m2, m8
pmaddubsw m3, m8
pmaddubsw m4, m8
pmaddubsw m5, m8
pmaddubsw m6, m8
pmaddubsw m7, m8
paddw m11, m0, m1
paddw m11, m2
paddw m11, m3
paddw m11, m4
paddw m11, m5
paddw m11, m6
paddw m11, m7
pmaddwd m11, m10
psrldq m9, m11, 4
paddd m11, m9
psrld m11, 2
HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
paddw m0, m1
paddw m0, m2
paddw m0, m3
HADDW m0, m1
paddd m0, m14
psrld m0, 1
psubd m0, m11
psubd m12, m0
pabsd m0, m12
paddd m13, m0
add r0, 8
add r2, 8
dec r6d
jnz .loopW
lea r0, [r0 + r1 * 8 - 16]
lea r2, [r2 + r3 * 8 - 16]
dec r7d
jnz .loopH
movd eax, m13
RET
%endif ; HIGH_BIT_DEPTH
%endif
%if ARCH_X86_64
%if HIGH_BIT_DEPTH
INIT_XMM sse4
cglobal psyCost_pp_32x32, 4, 9, 14
FIX_STRIDES r1, r3
lea r4, [3 * r1]
lea r8, [3 * r3]
mova m12, [pw_1]
mova m13, [pd_1]
pxor m11, m11
mov r7d, 4
.loopH:
mov r6d, 4
.loopW:
pxor m10, m10
movu m0, [r0]
movu m1, [r0 + r1]
movu m2, [r0 + r1 * 2]
movu m3, [r0 + r4]
lea r5, [r0 + r1 * 4]
movu m4, [r5]
movu m5, [r5 + r1]
movu m6, [r5 + r1 * 2]
movu m7, [r5 + r4]
paddw m8, m0, m1
paddw m8, m2
paddw m8, m3
paddw m8, m4
paddw m8, m5
paddw m8, m6
paddw m8, m7
pmaddwd m8, m12
movhlps m9, m8
paddd m8, m9
psrldq m9, m8, 4
paddd m8, m9
psrld m8, 2
HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
paddd m0, m1
paddd m0, m2
paddd m0, m3
HADDUW m0, m1
paddd m0, m13
psrld m0, 1
psubd m10, m0, m8
movu m0, [r2]
movu m1, [r2 + r3]
movu m2, [r2 + r3 * 2]
movu m3, [r2 + r8]
lea r5, [r2 + r3 * 4]
movu m4, [r5]
movu m5, [r5 + r3]
movu m6, [r5 + r3 * 2]
movu m7, [r5 + r8]
paddw m8, m0, m1
paddw m8, m2
paddw m8, m3
paddw m8, m4
paddw m8, m5
paddw m8, m6
paddw m8, m7
pmaddwd m8, m12
movhlps m9, m8
paddd m8, m9
psrldq m9, m8, 4
paddd m8, m9
psrld m8, 2
HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
paddd m0, m1
paddd m0, m2
paddd m0, m3
HADDUW m0, m1
paddd m0, m13
psrld m0, 1
psubd m0, m8
psubd m10, m0
pabsd m0, m10
paddd m11, m0
add r0, 16
add r2, 16
dec r6d
jnz .loopW
lea r0, [r0 + r1 * 8 - 64]
lea r2, [r2 + r3 * 8 - 64]
dec r7d
jnz .loopH
movd eax, m11
RET
%else ; !HIGH_BIT_DEPTH
INIT_XMM sse4
cglobal psyCost_pp_32x32, 4, 9, 15
lea r4, [3 * r1]
lea r8, [3 * r3]
mova m8, [hmul_8p]
mova m10, [pw_1]
mova m14, [pd_1]
pxor m13, m13
mov r7d, 4
.loopH:
mov r6d, 4
.loopW:
pxor m12, m12
movddup m0, [r0]
movddup m1, [r0 + r1]
movddup m2, [r0 + r1 * 2]
movddup m3, [r0 + r4]
lea r5, [r0 + r1 * 4]
movddup m4, [r5]
movddup m5, [r5 + r1]
movddup m6, [r5 + r1 * 2]
movddup m7, [r5 + r4]
pmaddubsw m0, m8
pmaddubsw m1, m8
pmaddubsw m2, m8
pmaddubsw m3, m8
pmaddubsw m4, m8
pmaddubsw m5, m8
pmaddubsw m6, m8
pmaddubsw m7, m8
paddw m11, m0, m1
paddw m11, m2
paddw m11, m3
paddw m11, m4
paddw m11, m5
paddw m11, m6
paddw m11, m7
pmaddwd m11, m10
psrldq m9, m11, 4
paddd m11, m9
psrld m11, 2
HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
paddw m0, m1
paddw m0, m2
paddw m0, m3
HADDW m0, m1
paddd m0, m14
psrld m0, 1
psubd m12, m0, m11
movddup m0, [r2]
movddup m1, [r2 + r3]
movddup m2, [r2 + r3 * 2]
movddup m3, [r2 + r8]
lea r5, [r2 + r3 * 4]
movddup m4, [r5]
movddup m5, [r5 + r3]
movddup m6, [r5 + r3 * 2]
movddup m7, [r5 + r8]
pmaddubsw m0, m8
pmaddubsw m1, m8
pmaddubsw m2, m8
pmaddubsw m3, m8
pmaddubsw m4, m8
pmaddubsw m5, m8
pmaddubsw m6, m8
pmaddubsw m7, m8
paddw m11, m0, m1
paddw m11, m2
paddw m11, m3
paddw m11, m4
paddw m11, m5
paddw m11, m6
paddw m11, m7
pmaddwd m11, m10
psrldq m9, m11, 4
paddd m11, m9
psrld m11, 2
HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
paddw m0, m1
paddw m0, m2
paddw m0, m3
HADDW m0, m1
paddd m0, m14
psrld m0, 1
psubd m0, m11
psubd m12, m0
pabsd m0, m12
paddd m13, m0
add r0, 8
add r2, 8
dec r6d
jnz .loopW
lea r0, [r0 + r1 * 8 - 32]
lea r2, [r2 + r3 * 8 - 32]
dec r7d
jnz .loopH
movd eax, m13
RET
%endif ; HIGH_BIT_DEPTH
%endif
%if ARCH_X86_64
%if HIGH_BIT_DEPTH
INIT_XMM sse4
cglobal psyCost_pp_64x64, 4, 9, 14
FIX_STRIDES r1, r3
lea r4, [3 * r1]
lea r8, [3 * r3]
mova m12, [pw_1]
mova m13, [pd_1]
pxor m11, m11
mov r7d, 8
.loopH:
mov r6d, 8
.loopW:
pxor m10, m10
movu m0, [r0]
movu m1, [r0 + r1]
movu m2, [r0 + r1 * 2]
movu m3, [r0 + r4]
lea r5, [r0 + r1 * 4]
movu m4, [r5]
movu m5, [r5 + r1]
movu m6, [r5 + r1 * 2]
movu m7, [r5 + r4]
paddw m8, m0, m1
paddw m8, m2
paddw m8, m3
paddw m8, m4
paddw m8, m5
paddw m8, m6
paddw m8, m7
pmaddwd m8, m12
movhlps m9, m8
paddd m8, m9
psrldq m9, m8, 4
paddd m8, m9
psrld m8, 2
HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
paddd m0, m1
paddd m0, m2
paddd m0, m3
HADDUW m0, m1
paddd m0, m13
psrld m0, 1
psubd m10, m0, m8
movu m0, [r2]
movu m1, [r2 + r3]
movu m2, [r2 + r3 * 2]
movu m3, [r2 + r8]
lea r5, [r2 + r3 * 4]
movu m4, [r5]
movu m5, [r5 + r3]
movu m6, [r5 + r3 * 2]
movu m7, [r5 + r8]
paddw m8, m0, m1
paddw m8, m2
paddw m8, m3
paddw m8, m4
paddw m8, m5
paddw m8, m6
paddw m8, m7
pmaddwd m8, m12
movhlps m9, m8
paddd m8, m9
psrldq m9, m8, 4
paddd m8, m9
psrld m8, 2
HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
paddd m0, m1
paddd m0, m2
paddd m0, m3
HADDUW m0, m1
paddd m0, m13
psrld m0, 1
psubd m0, m8
psubd m10, m0
pabsd m0, m10
paddd m11, m0
add r0, 16
add r2, 16
dec r6d
jnz .loopW
lea r0, [r0 + r1 * 8 - 128]
lea r2, [r2 + r3 * 8 - 128]
dec r7d
jnz .loopH
movd eax, m11
RET
%else ; !HIGH_BIT_DEPTH
INIT_XMM sse4
cglobal psyCost_pp_64x64, 4, 9, 15
lea r4, [3 * r1]
lea r8, [3 * r3]
mova m8, [hmul_8p]
mova m10, [pw_1]
mova m14, [pd_1]
pxor m13, m13
mov r7d, 8
.loopH:
mov r6d, 8
.loopW:
pxor m12, m12
movddup m0, [r0]
movddup m1, [r0 + r1]
movddup m2, [r0 + r1 * 2]
movddup m3, [r0 + r4]
lea r5, [r0 + r1 * 4]
movddup m4, [r5]
movddup m5, [r5 + r1]
movddup m6, [r5 + r1 * 2]
movddup m7, [r5 + r4]
pmaddubsw m0, m8
pmaddubsw m1, m8
pmaddubsw m2, m8
pmaddubsw m3, m8
pmaddubsw m4, m8
pmaddubsw m5, m8
pmaddubsw m6, m8
pmaddubsw m7, m8
paddw m11, m0, m1
paddw m11, m2
paddw m11, m3
paddw m11, m4
paddw m11, m5
paddw m11, m6
paddw m11, m7
pmaddwd m11, m10
psrldq m9, m11, 4
paddd m11, m9
psrld m11, 2
HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
paddw m0, m1
paddw m0, m2
paddw m0, m3
HADDW m0, m1
paddd m0, m14
psrld m0, 1
psubd m12, m0, m11
movddup m0, [r2]
movddup m1, [r2 + r3]
movddup m2, [r2 + r3 * 2]
movddup m3, [r2 + r8]
lea r5, [r2 + r3 * 4]
movddup m4, [r5]
movddup m5, [r5 + r3]
movddup m6, [r5 + r3 * 2]
movddup m7, [r5 + r8]
pmaddubsw m0, m8
pmaddubsw m1, m8
pmaddubsw m2, m8
pmaddubsw m3, m8
pmaddubsw m4, m8
pmaddubsw m5, m8
pmaddubsw m6, m8
pmaddubsw m7, m8
paddw m11, m0, m1
paddw m11, m2
paddw m11, m3
paddw m11, m4
paddw m11, m5
paddw m11, m6
paddw m11, m7
pmaddwd m11, m10
psrldq m9, m11, 4
paddd m11, m9
psrld m11, 2
HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9
paddw m0, m1
paddw m0, m2
paddw m0, m3
HADDW m0, m1
paddd m0, m14
psrld m0, 1
psubd m0, m11
psubd m12, m0
pabsd m0, m12
paddd m13, m0
add r0, 8
add r2, 8
dec r6d
jnz .loopW
lea r0, [r0 + r1 * 8 - 64]
lea r2, [r2 + r3 * 8 - 64]
dec r7d
jnz .loopH
movd eax, m13
RET
%endif ; HIGH_BIT_DEPTH
%endif
INIT_YMM avx2
%if HIGH_BIT_DEPTH
cglobal psyCost_pp_4x4, 4, 5, 6
add r1d, r1d
add r3d, r3d
lea r4, [r1 * 3]
movddup xm0, [r0]
movddup xm1, [r0 + r1]
movddup xm2, [r0 + r1 * 2]
movddup xm3, [r0 + r4]
lea r4, [r3 * 3]
movddup xm4, [r2]
movddup xm5, [r2 + r3]
vinserti128 m0, m0, xm4, 1
vinserti128 m1, m1, xm5, 1
movddup xm4, [r2 + r3 * 2]
movddup xm5, [r2 + r4]
vinserti128 m2, m2, xm4, 1
vinserti128 m3, m3, xm5, 1
mova m4, [hmul_8w]
pmaddwd m0, m4
pmaddwd m1, m4
pmaddwd m2, m4
pmaddwd m3, m4
paddd m5, m0, m1
paddd m4, m2, m3
paddd m5, m4
psrldq m4, m5, 4
paddd m5, m4
psrld m5, 2
mova m4, m0
paddd m0, m1
psubd m1, m4
mova m4, m2
paddd m2, m3
psubd m3, m4
mova m4, m0
paddd m0, m2
psubd m2, m4
mova m4, m1
paddd m1, m3
psubd m3, m4
movaps m4, m0
vshufps m4, m4, m2, 11011101b
vshufps m0, m0, m2, 10001000b
movaps m2, m1
vshufps m2, m2, m3, 11011101b
vshufps m1, m1, m3, 10001000b
pabsd m0, m0
pabsd m4, m4
pmaxsd m0, m4
pabsd m1, m1
pabsd m2, m2
pmaxsd m1, m2
paddd m0, m1
vpermq m1, m0, 11110101b
paddd m0, m1
psrldq m1, m0, 4
paddd m0, m1
psubd m0, m5
vextracti128 xm1, m0, 1
psubd xm1, xm0
pabsd xm1, xm1
movd eax, xm1
RET
%else ; !HIGH_BIT_DEPTH
cglobal psyCost_pp_4x4, 4, 5, 6
lea r4, [3 * r1]
movd xm0, [r0]
movd xm1, [r0 + r1]
movd xm2, [r0 + r1 * 2]
movd xm3, [r0 + r4]
vshufps xm0, xm1, 0
vshufps xm2, xm3, 0
lea r4, [3 * r3]
movd xm1, [r2]
movd xm3, [r2 + r3]
movd xm4, [r2 + r3 * 2]
movd xm5, [r2 + r4]
vshufps xm1, xm3, 0
vshufps xm4, xm5, 0
vinserti128 m0, m0, xm1, 1
vinserti128 m2, m2, xm4, 1
mova m4, [hmul_4p]
pmaddubsw m0, m4
pmaddubsw m2, m4
paddw m5, m0, m2
mova m1, m5
psrldq m4, m5, 8
paddw m5, m4
pmaddwd m5, [pw_1]
psrld m5, 2
vpsubw m2, m2, m0
vpunpckhqdq m0, m1, m2
vpunpcklqdq m1, m1, m2
vpaddw m2, m1, m0
vpsubw m0, m0, m1
vpblendw m1, m2, m0, 10101010b
vpslld m0, m0, 10h
vpsrld m2, m2, 10h
vpor m0, m0, m2
vpabsw m1, m1
vpabsw m0, m0
vpmaxsw m1, m1, m0
vpmaddwd m1, m1, [pw_1]
psrldq m2, m1, 8
paddd m1, m2
psrldq m3, m1, 4
paddd m1, m3
psubd m1, m5
vextracti128 xm2, m1, 1
psubd m1, m2
pabsd m1, m1
movd eax, xm1
RET
%endif
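; 8-bit 8x8 psy cost for one source/recon pair; expects hmul_8p in m8 and
; r4/r7 = 3*stride, and leaves |energy(source) - energy(recon)| in m0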
%macro PSY_PP_8x8 0
movddup m0, [r0 + r1 * 0]
movddup m1, [r0 + r1 * 1]
movddup m2, [r0 + r1 * 2]
movddup m3, [r0 + r4 * 1]
lea r5, [r0 + r1 * 4]
movddup m4, [r2 + r3 * 0]
movddup m5, [r2 + r3 * 1]
movddup m6, [r2 + r3 * 2]
movddup m7, [r2 + r7 * 1]
lea r6, [r2 + r3 * 4]
vinserti128 m0, m0, xm4, 1
vinserti128 m1, m1, xm5, 1
vinserti128 m2, m2, xm6, 1
vinserti128 m3, m3, xm7, 1
movddup m4, [r5 + r1 * 0]
movddup m5, [r5 + r1 * 1]
movddup m6, [r5 + r1 * 2]
movddup m7, [r5 + r4 * 1]
movddup m9, [r6 + r3 * 0]
movddup m10, [r6 + r3 * 1]
movddup m11, [r6 + r3 * 2]
movddup m12, [r6 + r7 * 1]
vinserti128 m4, m4, xm9, 1
vinserti128 m5, m5, xm10, 1
vinserti128 m6, m6, xm11, 1
vinserti128 m7, m7, xm12, 1
pmaddubsw m0, m8
pmaddubsw m1, m8
pmaddubsw m2, m8
pmaddubsw m3, m8
pmaddubsw m4, m8
pmaddubsw m5, m8
pmaddubsw m6, m8
pmaddubsw m7, m8
paddw m11, m0, m1
paddw m11, m2
paddw m11, m3
paddw m11, m4
paddw m11, m5
paddw m11, m6
paddw m11, m7
pmaddwd m11, [pw_1]
psrldq m10, m11, 4
paddd m11, m10
psrld m11, 2
mova m9, m0
paddw m0, m1 ; m0+m1
psubw m1, m9 ; m1-m0
mova m9, m2
paddw m2, m3 ; m2+m3
psubw m3, m9 ; m3-m2
mova m9, m0
paddw m0, m2 ; m0+m1+m2+m3
psubw m2, m9 ; (m2+m3)-(m0+m1)
mova m9, m1
paddw m1, m3 ; m1-m0+m3-m2
psubw m3, m9 ; (m3-m2)-(m1-m0)
movdqa m9, m4
paddw m4, m5 ; m4+m5
psubw m5, m9 ; m5-m4
movdqa m9, m6
paddw m6, m7 ; m6+m7
psubw m7, m9 ; m7-m6
movdqa m9, m4
paddw m4, m6 ; m4+m5+m6+m7
psubw m6, m9 ; (m6+m7)-(m4+m5)
movdqa m9, m5
paddw m5, m7 ; m5-m4+m7-m6
psubw m7, m9 ; (m7-m6)-(m5-m4)
movdqa m9, m0
paddw m0, m4 ; (m0+m1+m2+m3)+(m4+m5+m6+m7)
psubw m4, m9 ; (m4+m5+m6+m7)-(m0+m1+m2+m3)
movdqa m9, m1
paddw m1, m5 ; (m1-m0+m3-m2)+(m5-m4+m7-m6)
psubw m5, m9 ; (m5-m4+m7-m6)-(m1-m0+m3-m2)
mova m9, m0
vshufps m9, m9, m4, 11011101b
vshufps m0, m0, m4, 10001000b
movdqa m4, m0
paddw m0, m9 ; (a0 + a4) + (a4 - a0)
psubw m9, m4 ; (a0 + a4) - (a4 - a0) == (a0 + a4) + (a0 - a4)
movaps m4, m1
vshufps m4, m4, m5, 11011101b
vshufps m1, m1, m5, 10001000b
movdqa m5, m1
paddw m1, m4
psubw m4, m5
movdqa m5, m2
paddw m2, m6
psubw m6, m5
movdqa m5, m3
paddw m3, m7
psubw m7, m5
movaps m5, m2
vshufps m5, m5, m6, 11011101b
vshufps m2, m2, m6, 10001000b
movdqa m6, m2
paddw m2, m5
psubw m5, m6
movaps m6, m3
vshufps m6, m6, m7, 11011101b
vshufps m3, m3, m7, 10001000b
movdqa m7, m3
paddw m3, m6
psubw m6, m7
movdqa m7, m0
pblendw m0, m9, 10101010b
pslld m9, 10h
psrld m7, 10h
por m9, m7
pabsw m0, m0
pabsw m9, m9
pmaxsw m0, m9
movdqa m7, m1
pblendw m1, m4, 10101010b
pslld m4, 10h
psrld m7, 10h
por m4, m7
pabsw m1, m1
pabsw m4, m4
pmaxsw m1, m4
movdqa m7, m2
pblendw m2, m5, 10101010b
pslld m5, 10h
psrld m7, 10h
por m5, m7
pabsw m2, m2
pabsw m5, m5
pmaxsw m2, m5
mova m7, m3
pblendw m3, m6, 10101010b
pslld m6, 10h
psrld m7, 10h
por m6, m7
pabsw m3, m3
pabsw m6, m6
pmaxsw m3, m6
paddw m0, m1
paddw m0, m2
paddw m0, m3
pmaddwd m0, [pw_1]
psrldq m1, m0, 8
paddd m0, m1
pshuflw m1, m0, 00001110b
paddd m0, m1
paddd m0, [pd_1]
psrld m0, 1
psubd m0, m11
vextracti128 xm1, m0, 1
psubd m0, m1
pabsd m0, m0
%endmacro
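; high-bit-depth 8x8 psy cost: source rows fill the low 128-bit lanes and
; recon rows the high lanes, so a single pass transforms both; the absolute
; energy difference ends up in xm1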
%macro PSY_PP_8x8_AVX2 0
lea r4, [r1 * 3]
movu xm0, [r0]
movu xm1, [r0 + r1]
movu xm2, [r0 + r1 * 2]
movu xm3, [r0 + r4]
lea r5, [r0 + r1 * 4]
movu xm4, [r5]
movu xm5, [r5 + r1]
movu xm6, [r5 + r1 * 2]
movu xm7, [r5 + r4]
lea r4, [r3 * 3]
vinserti128 m0, m0, [r2], 1
vinserti128 m1, m1, [r2 + r3], 1
vinserti128 m2, m2, [r2 + r3 * 2], 1
vinserti128 m3, m3, [r2 + r4], 1
lea r5, [r2 + r3 * 4]
vinserti128 m4, m4, [r5], 1
vinserti128 m5, m5, [r5 + r3], 1
vinserti128 m6, m6, [r5 + r3 * 2], 1
vinserti128 m7, m7, [r5 + r4], 1
paddw m8, m0, m1
paddw m8, m2
paddw m8, m3
paddw m8, m4
paddw m8, m5
paddw m8, m6
paddw m8, m7
pmaddwd m8, [pw_1]
psrldq m9, m8, 8
paddd m8, m9
psrldq m9, m8, 4
paddd m8, m9
psrld m8, 2
psubw m9, m1, m0
paddw m0, m1
psubw m1, m3, m2
paddw m2, m3
punpckhwd m3, m0, m9
punpcklwd m0, m9
psubw m9, m3, m0
paddw m0, m3
punpckhwd m3, m2, m1
punpcklwd m2, m1
psubw m10, m3, m2
paddw m2, m3
psubw m3, m5, m4
paddw m4, m5
psubw m5, m7, m6
paddw m6, m7
punpckhwd m1, m4, m3
punpcklwd m4, m3
psubw m7, m1, m4
paddw m4, m1
punpckhwd m3, m6, m5
punpcklwd m6, m5
psubw m1, m3, m6
paddw m6, m3
psubw m3, m2, m0
paddw m0, m2
psubw m2, m10, m9
paddw m9, m10
punpckhdq m5, m0, m3
punpckldq m0, m3
psubw m10, m5, m0
paddw m0, m5
punpckhdq m3, m9, m2
punpckldq m9, m2
psubw m5, m3, m9
paddw m9, m3
psubw m3, m6, m4
paddw m4, m6
psubw m6, m1, m7
paddw m7, m1
punpckhdq m2, m4, m3
punpckldq m4, m3
psubw m1, m2, m4
paddw m4, m2
punpckhdq m3, m7, m6
punpckldq m7, m6
psubw m2, m3, m7
paddw m7, m3
psubw m3, m4, m0
paddw m0, m4
psubw m4, m1, m10
paddw m10, m1
punpckhqdq m6, m0, m3
punpcklqdq m0, m3
pabsw m0, m0
pabsw m6, m6
pmaxsw m0, m6
punpckhqdq m3, m10, m4
punpcklqdq m10, m4
pabsw m10, m10
pabsw m3, m3
pmaxsw m10, m3
psubw m3, m7, m9
paddw m9, m7
psubw m7, m2, m5
paddw m5, m2
punpckhqdq m4, m9, m3
punpcklqdq m9, m3
pabsw m9, m9
pabsw m4, m4
pmaxsw m9, m4
punpckhqdq m3, m5, m7
punpcklqdq m5, m7
pabsw m5, m5
pabsw m3, m3
pmaxsw m5, m3
paddd m0, m9
paddd m0, m10
paddd m0, m5
psrld m9, m0, 16
pslld m0, 16
psrld m0, 16
paddd m0, m9
psrldq m9, m0, 8
paddd m0, m9
psrldq m9, m0, 4
paddd m0, m9
paddd m0, [pd_1]
psrld m0, 1
psubd m0, m8
vextracti128 xm1, m0, 1
psubd xm1, xm0
pabsd xm1, xm1
%endmacro
%if ARCH_X86_64
%if HIGH_BIT_DEPTH
cglobal psyCost_pp_8x8, 4, 8, 11
add r1d, r1d
add r3d, r3d
PSY_PP_8x8_AVX2
movd eax, xm1
RET
%else ; !HIGH_BIT_DEPTH
INIT_YMM avx2
cglobal psyCost_pp_8x8, 4, 8, 13
lea r4, [3 * r1]
lea r7, [3 * r3]
mova m8, [hmul_8p]
PSY_PP_8x8
movd eax, xm0
RET
%endif
%endif
%if ARCH_X86_64
INIT_YMM avx2
%if HIGH_BIT_DEPTH
cglobal psyCost_pp_16x16, 4, 10, 12
add r1d, r1d
add r3d, r3d
pxor m11, m11
mov r8d, 2
.loopH:
mov r9d, 2
.loopW:
PSY_PP_8x8_AVX2
paddd xm11, xm1
add r0, 16
add r2, 16
dec r9d
jnz .loopW
lea r0, [r0 + r1 * 8 - 32]
lea r2, [r2 + r3 * 8 - 32]
dec r8d
jnz .loopH
movd eax, xm11
RET
%else ; !HIGH_BIT_DEPTH
cglobal psyCost_pp_16x16, 4, 10, 14
lea r4, [3 * r1]
lea r7, [3 * r3]
mova m8, [hmul_8p]
pxor m13, m13
mov r8d, 2
.loopH:
mov r9d, 2
.loopW:
PSY_PP_8x8
paddd m13, m0
add r0, 8
add r2, 8
dec r9d
jnz .loopW
lea r0, [r0 + r1 * 8 - 16]
lea r2, [r2 + r3 * 8 - 16]
dec r8d
jnz .loopH
movd eax, xm13
RET
%endif
%endif
%if ARCH_X86_64
INIT_YMM avx2
%if HIGH_BIT_DEPTH
cglobal psyCost_pp_32x32, 4, 10, 12
add r1d, r1d
add r3d, r3d
pxor m11, m11
mov r8d, 4
.loopH:
mov r9d, 4
.loopW:
PSY_PP_8x8_AVX2
paddd xm11, xm1
add r0, 16
add r2, 16
dec r9d
jnz .loopW
lea r0, [r0 + r1 * 8 - 64]
lea r2, [r2 + r3 * 8 - 64]
dec r8d
jnz .loopH
movd eax, xm11
RET
%else ; !HIGH_BIT_DEPTH
cglobal psyCost_pp_32x32, 4, 10, 14
lea r4, [3 * r1]
lea r7, [3 * r3]
mova m8, [hmul_8p]
pxor m13, m13
mov r8d, 4
.loopH:
mov r9d, 4
.loopW:
PSY_PP_8x8
paddd m13, m0
add r0, 8
add r2, 8
dec r9d
jnz .loopW
lea r0, [r0 + r1 * 8 - 32]
lea r2, [r2 + r3 * 8 - 32]
dec r8d
jnz .loopH
movd eax, xm13
RET
%endif
%endif
%if ARCH_X86_64
INIT_YMM avx2
%if HIGH_BIT_DEPTH
cglobal psyCost_pp_64x64, 4, 10, 12
add r1d, r1d
add r3d, r3d
pxor m11, m11
mov r8d, 8
.loopH:
mov r9d, 8
.loopW:
PSY_PP_8x8_AVX2
paddd xm11, xm1
add r0, 16
add r2, 16
dec r9d
jnz .loopW
lea r0, [r0 + r1 * 8 - 128]
lea r2, [r2 + r3 * 8 - 128]
dec r8d
jnz .loopH
movd eax, xm11
RET
%else ; !HIGH_BIT_DEPTH
cglobal psyCost_pp_64x64, 4, 10, 14
lea r4, [3 * r1]
lea r7, [3 * r3]
mova m8, [hmul_8p]
pxor m13, m13
mov r8d, 8
.loopH:
mov r9d, 8
.loopW:
PSY_PP_8x8
paddd m13, m0
add r0, 8
add r2, 8
dec r9d
jnz .loopW
lea r0, [r0 + r1 * 8 - 64]
lea r2, [r2 + r3 * 8 - 64]
dec r8d
jnz .loopH
movd eax, xm13
RET
%endif
%endif
;---------------------------------------------------------------------------------------------------------------------
;int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
;---------------------------------------------------------------------------------------------------------------------
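; As in psyCost_pp, each block's energy is its zero-referenced SA8D/SATD minus
; the DC term (SAD >> 2), and the cost is |energy(source) - energy(recon)|;
; the inputs here are int16_t residuals, so both strides are doubled
; (add r1, r1 / add r3, r3) before any addressing.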
INIT_XMM sse4
cglobal psyCost_ss_4x4, 4, 5, 8
add r1, r1
lea r4, [3 * r1]
movddup m0, [r0]
movddup m1, [r0 + r1]
movddup m2, [r0 + r1 * 2]
movddup m3, [r0 + r4]
pabsw m4, m0
pabsw m5, m1
paddw m5, m4
pabsw m4, m2
paddw m5, m4
pabsw m4, m3
paddw m5, m4
pmaddwd m5, [pw_1]
psrldq m4, m5, 4
paddd m5, m4
psrld m6, m5, 2
mova m4, [hmul_8w]
pmaddwd m0, m4
pmaddwd m1, m4
pmaddwd m2, m4
pmaddwd m3, m4
psrldq m4, m0, 4
psubd m5, m0, m4
paddd m0, m4
shufps m0, m5, 10001000b
psrldq m4, m1, 4
psubd m5, m1, m4
paddd m1, m4
shufps m1, m5, 10001000b
psrldq m4, m2, 4
psubd m5, m2, m4
paddd m2, m4
shufps m2, m5, 10001000b
psrldq m4, m3, 4
psubd m5, m3, m4
paddd m3, m4
shufps m3, m5, 10001000b
mova m4, m0
paddd m0, m1
psubd m1, m4
mova m4, m2
paddd m2, m3
psubd m3, m4
mova m4, m0
paddd m0, m2
psubd m2, m4
mova m4, m1
paddd m1, m3
psubd m3, m4
pabsd m0, m0
pabsd m2, m2
pabsd m1, m1
pabsd m3, m3
paddd m0, m2
paddd m1, m3
paddd m0, m1
movhlps m1, m0
paddd m0, m1
psrldq m1, m0, 4
paddd m0, m1
psrld m0, 1
psubd m7, m0, m6
add r3, r3
lea r4, [3 * r3]
movddup m0, [r2]
movddup m1, [r2 + r3]
movddup m2, [r2 + r3 * 2]
movddup m3, [r2 + r4]
pabsw m4, m0
pabsw m5, m1
paddw m5, m4
pabsw m4, m2
paddw m5, m4
pabsw m4, m3
paddw m5, m4
pmaddwd m5, [pw_1]
psrldq m4, m5, 4
paddd m5, m4
psrld m6, m5, 2
mova m4, [hmul_8w]
pmaddwd m0, m4
pmaddwd m1, m4
pmaddwd m2, m4
pmaddwd m3, m4
psrldq m4, m0, 4
psubd m5, m0, m4
paddd m0, m4
shufps m0, m5, 10001000b
psrldq m4, m1, 4
psubd m5, m1, m4
paddd m1, m4
shufps m1, m5, 10001000b
psrldq m4, m2, 4
psubd m5, m2, m4
paddd m2, m4
shufps m2, m5, 10001000b
psrldq m4, m3, 4
psubd m5, m3, m4
paddd m3, m4
shufps m3, m5, 10001000b
mova m4, m0
paddd m0, m1
psubd m1, m4
mova m4, m2
paddd m2, m3
psubd m3, m4
mova m4, m0
paddd m0, m2
psubd m2, m4
mova m4, m1
paddd m1, m3
psubd m3, m4
pabsd m0, m0
pabsd m2, m2
pabsd m1, m1
pabsd m3, m3
paddd m0, m2
paddd m1, m3
paddd m0, m1
movhlps m1, m0
paddd m0, m1
psrldq m1, m0, 4
paddd m0, m1
psrld m0, 1
psubd m0, m6
psubd m7, m0
pabsd m0, m7
movd eax, m0
RET
%if ARCH_X86_64
INIT_XMM sse4
cglobal psyCost_ss_8x8, 4, 6, 15
mova m13, [pw_pmpmpmpm]
mova m14, [pw_1]
add r1, r1
add r3, r3
lea r4, [3 * r1]
movu m0, [r0]
movu m1, [r0 + r1]
movu m2, [r0 + r1 * 2]
movu m3, [r0 + r4]
lea r5, [r0 + r1 * 4]
movu m4, [r5]
movu m5, [r5 + r1]
movu m6, [r5 + r1 * 2]
movu m7, [r5 + r4]
pabsw m8, m0
pabsw m9, m1
paddw m8, m9
pabsw m10, m2
pabsw m11, m3
paddw m10, m11
paddw m8, m10
pabsw m9, m4
pabsw m10, m5
paddw m9, m10
pabsw m11, m6
pabsw m12, m7
paddw m11, m12
paddw m9, m11
paddw m8, m9
movhlps m9, m8
pmovzxwd m8, m8
pmovzxwd m9, m9
paddd m8, m9
movhlps m9, m8
paddd m8, m9
psrldq m9, m8, 4
paddd m8, m9
psrld m8, 2
pmaddwd m0, m13
pmaddwd m1, m13
pmaddwd m2, m13
pmaddwd m3, m13
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
shufps m0, m10, 10001000b
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
shufps m0, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
shufps m1, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
shufps m1, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
shufps m2, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
shufps m2, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
shufps m3, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
shufps m3, m10, 10001000b
SUMSUB_BA d, 0, 1, 9
SUMSUB_BA d, 2, 3, 9
SUMSUB_BA d, 0, 2, 9
SUMSUB_BA d, 1, 3, 9
pmaddwd m4, m13
pmaddwd m5, m13
pmaddwd m6, m13
pmaddwd m7, m13
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
shufps m4, m10, 10001000b
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
shufps m4, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
shufps m5, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
shufps m5, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
shufps m6, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
shufps m6, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
shufps m7, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
shufps m7, m10, 10001000b
SUMSUB_BA d, 4, 5, 9
SUMSUB_BA d, 6, 7, 9
SUMSUB_BA d, 4, 6, 9
SUMSUB_BA d, 5, 7, 9
SUMSUB_BA d, 0, 4, 9
SUMSUB_BA d, 1, 5, 9
SUMSUB_BA d, 2, 6, 9
SUMSUB_BA d, 3, 7, 9
pabsd m0, m0
pabsd m2, m2
pabsd m1, m1
pabsd m3, m3
pabsd m4, m4
pabsd m5, m5
pabsd m6, m6
pabsd m7, m7
paddd m0, m2
paddd m1, m3
paddd m0, m1
paddd m5, m4
paddd m0, m5
paddd m7, m6
paddd m11, m0, m7
movu m0, [r0]
movu m1, [r0 + r1]
movu m2, [r0 + r1 * 2]
movu m3, [r0 + r4]
pmaddwd m0, m14
pmaddwd m1, m14
pmaddwd m2, m14
pmaddwd m3, m14
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
shufps m0, m10, 10001000b
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
shufps m0, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
shufps m1, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
shufps m1, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
shufps m2, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
shufps m2, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
shufps m3, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
shufps m3, m10, 10001000b
SUMSUB_BA d, 0, 1, 9
SUMSUB_BA d, 2, 3, 9
SUMSUB_BA d, 0, 2, 9
SUMSUB_BA d, 1, 3, 9
movu m4, [r5]
movu m5, [r5 + r1]
movu m6, [r5 + r1 * 2]
movu m7, [r5 + r4]
pmaddwd m4, m14
pmaddwd m5, m14
pmaddwd m6, m14
pmaddwd m7, m14
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
shufps m4, m10, 10001000b
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
shufps m4, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
shufps m5, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
shufps m5, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
shufps m6, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
shufps m6, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
shufps m7, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
shufps m7, m10, 10001000b
SUMSUB_BA d, 4, 5, 9
SUMSUB_BA d, 6, 7, 9
SUMSUB_BA d, 4, 6, 9
SUMSUB_BA d, 5, 7, 9
SUMSUB_BA d, 0, 4, 9
SUMSUB_BA d, 1, 5, 9
SUMSUB_BA d, 2, 6, 9
SUMSUB_BA d, 3, 7, 9
pabsd m0, m0
pabsd m2, m2
pabsd m1, m1
pabsd m3, m3
pabsd m4, m4
pabsd m5, m5
pabsd m6, m6
pabsd m7, m7
paddd m0, m2
paddd m1, m3
paddd m0, m1
paddd m5, m4
paddd m0, m5
paddd m7, m6
paddd m0, m7
paddd m0, m11
movhlps m1, m0
paddd m0, m1
psrldq m1, m0, 4
paddd m0, m1
paddd m0, [pd_2]
psrld m0, 2
psubd m12, m0, m8
lea r4, [3 * r3]
movu m0, [r2]
movu m1, [r2 + r3]
movu m2, [r2 + r3 * 2]
movu m3, [r2 + r4]
lea r5, [r2 + r3 * 4]
movu m4, [r5]
movu m5, [r5 + r3]
movu m6, [r5 + r3 * 2]
movu m7, [r5 + r4]
pabsw m8, m0
pabsw m9, m1
paddw m8, m9
pabsw m10, m2
pabsw m11, m3
paddw m10, m11
paddw m8, m10
pabsw m9, m4
pabsw m10, m5
paddw m9, m10
pabsw m11, m6
pabsw m10, m7
paddw m11, m10
paddw m9, m11
paddw m8, m9
movhlps m9, m8
pmovzxwd m8, m8
pmovzxwd m9, m9
paddd m8, m9
movhlps m9, m8
paddd m8, m9
psrldq m9, m8, 4
paddd m8, m9
psrld m8, 2
pmaddwd m0, m13
pmaddwd m1, m13
pmaddwd m2, m13
pmaddwd m3, m13
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
shufps m0, m10, 10001000b
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
shufps m0, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
shufps m1, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
shufps m1, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
shufps m2, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
shufps m2, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
shufps m3, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
shufps m3, m10, 10001000b
SUMSUB_BA d, 0, 1, 9
SUMSUB_BA d, 2, 3, 9
SUMSUB_BA d, 0, 2, 9
SUMSUB_BA d, 1, 3, 9
pmaddwd m4, m13
pmaddwd m5, m13
pmaddwd m6, m13
pmaddwd m7, m13
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
shufps m4, m10, 10001000b
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
shufps m4, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
shufps m5, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
shufps m5, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
shufps m6, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
shufps m6, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
shufps m7, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
shufps m7, m10, 10001000b
SUMSUB_BA d, 4, 5, 9
SUMSUB_BA d, 6, 7, 9
SUMSUB_BA d, 4, 6, 9
SUMSUB_BA d, 5, 7, 9
SUMSUB_BA d, 0, 4, 9
SUMSUB_BA d, 1, 5, 9
SUMSUB_BA d, 2, 6, 9
SUMSUB_BA d, 3, 7, 9
pabsd m0, m0
pabsd m2, m2
pabsd m1, m1
pabsd m3, m3
pabsd m4, m4
pabsd m5, m5
pabsd m6, m6
pabsd m7, m7
paddd m0, m2
paddd m1, m3
paddd m0, m1
paddd m5, m4
paddd m0, m5
paddd m7, m6
paddd m11, m0, m7
movu m0, [r2]
movu m1, [r2 + r3]
movu m2, [r2 + r3 * 2]
movu m3, [r2 + r4]
pmaddwd m0, m14
pmaddwd m1, m14
pmaddwd m2, m14
pmaddwd m3, m14
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
shufps m0, m10, 10001000b
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
shufps m0, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
shufps m1, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
shufps m1, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
shufps m2, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
shufps m2, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
shufps m3, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
shufps m3, m10, 10001000b
SUMSUB_BA d, 0, 1, 9
SUMSUB_BA d, 2, 3, 9
SUMSUB_BA d, 0, 2, 9
SUMSUB_BA d, 1, 3, 9
movu m4, [r5]
movu m5, [r5 + r3]
movu m6, [r5 + r3 * 2]
movu m7, [r5 + r4]
pmaddwd m4, m14
pmaddwd m5, m14
pmaddwd m6, m14
pmaddwd m7, m14
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
shufps m4, m10, 10001000b
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
shufps m4, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
shufps m5, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
shufps m5, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
shufps m6, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
shufps m6, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
shufps m7, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
shufps m7, m10, 10001000b
SUMSUB_BA d, 4, 5, 9
SUMSUB_BA d, 6, 7, 9
SUMSUB_BA d, 4, 6, 9
SUMSUB_BA d, 5, 7, 9
SUMSUB_BA d, 0, 4, 9
SUMSUB_BA d, 1, 5, 9
SUMSUB_BA d, 2, 6, 9
SUMSUB_BA d, 3, 7, 9
pabsd m0, m0
pabsd m2, m2
pabsd m1, m1
pabsd m3, m3
pabsd m4, m4
pabsd m5, m5
pabsd m6, m6
pabsd m7, m7
paddd m0, m2
paddd m1, m3
paddd m0, m1
paddd m5, m4
paddd m0, m5
paddd m7, m6
paddd m0, m7
paddd m0, m11
movhlps m1, m0
paddd m0, m1
psrldq m1, m0, 4
paddd m0, m1
paddd m0, [pd_2]
psrld m0, 2
psubd m0, m8
psubd m12, m0
pabsd m0, m12
movd eax, m0
RET
%endif
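; psy_cost_ss: one 8x8 block pair of the sse4 psyCost_ss loops below.
; Register contract set up by the callers: m13 = [pw_pmpmpmpm], m14 = [pw_1],
; r4 = 3 * r1, r6 = 3 * r3, m15 = running sum. The macro adds
; |energy(source) - energy(recon)| into m15 and leaves r0/r2 untouched.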
%macro psy_cost_ss 0
movu m0, [r0]
movu m1, [r0 + r1]
movu m2, [r0 + r1 * 2]
movu m3, [r0 + r4]
lea r5, [r0 + r1 * 4]
movu m4, [r5]
movu m5, [r5 + r1]
movu m6, [r5 + r1 * 2]
movu m7, [r5 + r4]
pabsw m8, m0
pabsw m9, m1
paddw m8, m9
pabsw m10, m2
pabsw m11, m3
paddw m10, m11
paddw m8, m10
pabsw m9, m4
pabsw m10, m5
paddw m9, m10
pabsw m11, m6
pabsw m12, m7
paddw m11, m12
paddw m9, m11
paddw m8, m9
movhlps m9, m8
pmovzxwd m8, m8
pmovzxwd m9, m9
paddd m8, m9
movhlps m9, m8
paddd m8, m9
psrldq m9, m8, 4
paddd m8, m9
psrld m8, 2
pmaddwd m0, m13
pmaddwd m1, m13
pmaddwd m2, m13
pmaddwd m3, m13
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
shufps m0, m10, 10001000b
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
shufps m0, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
shufps m1, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
shufps m1, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
shufps m2, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
shufps m2, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
shufps m3, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
shufps m3, m10, 10001000b
SUMSUB_BA d, 0, 1, 9
SUMSUB_BA d, 2, 3, 9
SUMSUB_BA d, 0, 2, 9
SUMSUB_BA d, 1, 3, 9
pmaddwd m4, m13
pmaddwd m5, m13
pmaddwd m6, m13
pmaddwd m7, m13
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
shufps m4, m10, 10001000b
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
shufps m4, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
shufps m5, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
shufps m5, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
shufps m6, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
shufps m6, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
shufps m7, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
shufps m7, m10, 10001000b
SUMSUB_BA d, 4, 5, 9
SUMSUB_BA d, 6, 7, 9
SUMSUB_BA d, 4, 6, 9
SUMSUB_BA d, 5, 7, 9
SUMSUB_BA d, 0, 4, 9
SUMSUB_BA d, 1, 5, 9
SUMSUB_BA d, 2, 6, 9
SUMSUB_BA d, 3, 7, 9
pabsd m0, m0
pabsd m2, m2
pabsd m1, m1
pabsd m3, m3
pabsd m4, m4
pabsd m5, m5
pabsd m6, m6
pabsd m7, m7
paddd m0, m2
paddd m1, m3
paddd m0, m1
paddd m5, m4
paddd m0, m5
paddd m7, m6
paddd m11, m0, m7
movu m0, [r0]
movu m1, [r0 + r1]
movu m2, [r0 + r1 * 2]
movu m3, [r0 + r4]
pmaddwd m0, m14
pmaddwd m1, m14
pmaddwd m2, m14
pmaddwd m3, m14
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
shufps m0, m10, 10001000b
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
shufps m0, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
shufps m1, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
shufps m1, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
shufps m2, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
shufps m2, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
shufps m3, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
shufps m3, m10, 10001000b
SUMSUB_BA d, 0, 1, 9
SUMSUB_BA d, 2, 3, 9
SUMSUB_BA d, 0, 2, 9
SUMSUB_BA d, 1, 3, 9
movu m4, [r5]
movu m5, [r5 + r1]
movu m6, [r5 + r1 * 2]
movu m7, [r5 + r4]
pmaddwd m4, m14
pmaddwd m5, m14
pmaddwd m6, m14
pmaddwd m7, m14
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
shufps m4, m10, 10001000b
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
shufps m4, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
shufps m5, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
shufps m5, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
shufps m6, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
shufps m6, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
shufps m7, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
shufps m7, m10, 10001000b
SUMSUB_BA d, 4, 5, 9
SUMSUB_BA d, 6, 7, 9
SUMSUB_BA d, 4, 6, 9
SUMSUB_BA d, 5, 7, 9
SUMSUB_BA d, 0, 4, 9
SUMSUB_BA d, 1, 5, 9
SUMSUB_BA d, 2, 6, 9
SUMSUB_BA d, 3, 7, 9
pabsd m0, m0
pabsd m2, m2
pabsd m1, m1
pabsd m3, m3
pabsd m4, m4
pabsd m5, m5
pabsd m6, m6
pabsd m7, m7
paddd m0, m2
paddd m1, m3
paddd m0, m1
paddd m5, m4
paddd m0, m5
paddd m7, m6
paddd m0, m7
paddd m0, m11
movhlps m1, m0
paddd m0, m1
psrldq m1, m0, 4
paddd m0, m1
paddd m0, [pd_2]
psrld m0, 2
psubd m12, m0, m8
movu m0, [r2]
movu m1, [r2 + r3]
movu m2, [r2 + r3 * 2]
movu m3, [r2 + r6]
lea r5, [r2 + r3 * 4]
movu m4, [r5]
movu m5, [r5 + r3]
movu m6, [r5 + r3 * 2]
movu m7, [r5 + r6]
pabsw m8, m0
pabsw m9, m1
paddw m8, m9
pabsw m10, m2
pabsw m11, m3
paddw m10, m11
paddw m8, m10
pabsw m9, m4
pabsw m10, m5
paddw m9, m10
pabsw m11, m6
pabsw m10, m7
paddw m11, m10
paddw m9, m11
paddw m8, m9
movhlps m9, m8
pmovzxwd m8, m8
pmovzxwd m9, m9
paddd m8, m9
movhlps m9, m8
paddd m8, m9
psrldq m9, m8, 4
paddd m8, m9
psrld m8, 2
pmaddwd m0, m13
pmaddwd m1, m13
pmaddwd m2, m13
pmaddwd m3, m13
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
shufps m0, m10, 10001000b
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
shufps m0, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
shufps m1, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
shufps m1, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
shufps m2, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
shufps m2, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
shufps m3, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
shufps m3, m10, 10001000b
SUMSUB_BA d, 0, 1, 9
SUMSUB_BA d, 2, 3, 9
SUMSUB_BA d, 0, 2, 9
SUMSUB_BA d, 1, 3, 9
pmaddwd m4, m13
pmaddwd m5, m13
pmaddwd m6, m13
pmaddwd m7, m13
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
shufps m4, m10, 10001000b
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
shufps m4, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
shufps m5, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
shufps m5, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
shufps m6, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
shufps m6, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
shufps m7, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
shufps m7, m10, 10001000b
SUMSUB_BA d, 4, 5, 9
SUMSUB_BA d, 6, 7, 9
SUMSUB_BA d, 4, 6, 9
SUMSUB_BA d, 5, 7, 9
SUMSUB_BA d, 0, 4, 9
SUMSUB_BA d, 1, 5, 9
SUMSUB_BA d, 2, 6, 9
SUMSUB_BA d, 3, 7, 9
pabsd m0, m0
pabsd m2, m2
pabsd m1, m1
pabsd m3, m3
pabsd m4, m4
pabsd m5, m5
pabsd m6, m6
pabsd m7, m7
paddd m0, m2
paddd m1, m3
paddd m0, m1
paddd m5, m4
paddd m0, m5
paddd m7, m6
paddd m11, m0, m7
movu m0, [r2]
movu m1, [r2 + r3]
movu m2, [r2 + r3 * 2]
movu m3, [r2 + r6]
pmaddwd m0, m14
pmaddwd m1, m14
pmaddwd m2, m14
pmaddwd m3, m14
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
shufps m0, m10, 10001000b
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
shufps m0, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
shufps m1, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
shufps m1, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
shufps m2, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
shufps m2, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
shufps m3, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
shufps m3, m10, 10001000b
SUMSUB_BA d, 0, 1, 9
SUMSUB_BA d, 2, 3, 9
SUMSUB_BA d, 0, 2, 9
SUMSUB_BA d, 1, 3, 9
movu m4, [r5]
movu m5, [r5 + r3]
movu m6, [r5 + r3 * 2]
movu m7, [r5 + r6]
pmaddwd m4, m14
pmaddwd m5, m14
pmaddwd m6, m14
pmaddwd m7, m14
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
shufps m4, m10, 10001000b
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
shufps m4, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
shufps m5, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
shufps m5, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
shufps m6, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
shufps m6, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
shufps m7, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
shufps m7, m10, 10001000b
SUMSUB_BA d, 4, 5, 9
SUMSUB_BA d, 6, 7, 9
SUMSUB_BA d, 4, 6, 9
SUMSUB_BA d, 5, 7, 9
SUMSUB_BA d, 0, 4, 9
SUMSUB_BA d, 1, 5, 9
SUMSUB_BA d, 2, 6, 9
SUMSUB_BA d, 3, 7, 9
pabsd m0, m0
pabsd m2, m2
pabsd m1, m1
pabsd m3, m3
pabsd m4, m4
pabsd m5, m5
pabsd m6, m6
pabsd m7, m7
paddd m0, m2
paddd m1, m3
paddd m0, m1
paddd m5, m4
paddd m0, m5
paddd m7, m6
paddd m0, m7
paddd m0, m11
movhlps m1, m0
paddd m0, m1
psrldq m1, m0, 4
paddd m0, m1
paddd m0, [pd_2]
psrld m0, 2
psubd m0, m8
psubd m12, m0
pabsd m0, m12
paddd m15, m0
%endmacro
%if ARCH_X86_64
INIT_XMM sse4
cglobal psyCost_ss_16x16, 4, 9, 16
mova m13, [pw_pmpmpmpm]
mova m14, [pw_1]
add r1, r1
add r3, r3
lea r4, [3 * r1]
lea r6, [3 * r3]
pxor m15, m15
mov r7d, 2
.loopH:
mov r8d, 2
.loopW:
psy_cost_ss
add r0, 16
add r2, 16
dec r8d
jnz .loopW
lea r0, [r0 + r1 * 8 - 32]
lea r2, [r2 + r3 * 8 - 32]
dec r7d
jnz .loopH
movd eax, m15
RET
%endif
%if ARCH_X86_64
INIT_XMM sse4
cglobal psyCost_ss_32x32, 4, 9, 16
mova m13, [pw_pmpmpmpm]
mova m14, [pw_1]
add r1, r1
add r3, r3
lea r4, [3 * r1]
lea r6, [3 * r3]
pxor m15, m15
mov r7d, 4
.loopH:
mov r8d, 4
.loopW:
psy_cost_ss
add r0, 16
add r2, 16
dec r8d
jnz .loopW
lea r0, [r0 + r1 * 8 - 64]
lea r2, [r2 + r3 * 8 - 64]
dec r7d
jnz .loopH
movd eax, m15
RET
%endif
%if ARCH_X86_64
INIT_XMM sse4
cglobal psyCost_ss_64x64, 4, 9, 16
mova m13, [pw_pmpmpmpm]
mova m14, [pw_1]
add r1, r1
add r3, r3
lea r4, [3 * r1]
lea r6, [3 * r3]
pxor m15, m15
mov r7d, 8
.loopH:
mov r8d, 8
.loopW:
psy_cost_ss
add r0, 16
add r2, 16
dec r8d
jnz .loopW
lea r0, [r0 + r1 * 8 - 128]
lea r2, [r2 + r3 * 8 - 128]
dec r7d
jnz .loopH
movd eax, m15
RET
%endif
INIT_YMM avx2
cglobal psyCost_ss_4x4, 4, 5, 8
add r1, r1
add r3, r3
lea r4, [3 * r1]
movddup m0, [r0]
movddup m1, [r0 + r1]
movddup m2, [r0 + r1 * 2]
movddup m3, [r0 + r4]
lea r4, [3 * r3]
movddup m4, [r2]
movddup m5, [r2 + r3]
movddup m6, [r2 + r3 * 2]
movddup m7, [r2 + r4]
vinserti128 m0, m0, xm4, 1
vinserti128 m1, m1, xm5, 1
vinserti128 m2, m2, xm6, 1
vinserti128 m3, m3, xm7, 1
pabsw m4, m0
pabsw m5, m1
paddw m5, m4
pabsw m4, m2
paddw m5, m4
pabsw m4, m3
paddw m5, m4
pmaddwd m5, [pw_1]
psrldq m4, m5, 4
paddd m5, m4
psrld m6, m5, 2
mova m4, [hmul_8w]
pmaddwd m0, m4
pmaddwd m1, m4
pmaddwd m2, m4
pmaddwd m3, m4
psrldq m4, m0, 4
psubd m5, m0, m4
paddd m0, m4
shufps m0, m0, m5, 10001000b
psrldq m4, m1, 4
psubd m5, m1, m4
paddd m1, m4
shufps m1, m1, m5, 10001000b
psrldq m4, m2, 4
psubd m5, m2, m4
paddd m2, m4
shufps m2, m2, m5, 10001000b
psrldq m4, m3, 4
psubd m5, m3, m4
paddd m3, m4
shufps m3, m3, m5, 10001000b
mova m4, m0
paddd m0, m1
psubd m1, m4
mova m4, m2
paddd m2, m3
psubd m3, m4
mova m4, m0
paddd m0, m2
psubd m2, m4
mova m4, m1
paddd m1, m3
psubd m3, m4
pabsd m0, m0
pabsd m2, m2
pabsd m1, m1
pabsd m3, m3
paddd m0, m2
paddd m1, m3
paddd m0, m1
psrldq m1, m0, 8
paddd m0, m1
psrldq m1, m0, 4
paddd m0, m1
psrld m0, 1
psubd m0, m6
vextracti128 xm1, m0, 1
psubd m0, m1
pabsd m0, m0
movd eax, xm0
RET
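; PSY_SS_8x8: AVX2 variant of the 8x8 psy_cost_ss step. Source rows occupy
; the low xmm lanes and recon rows the high lanes; the eight row vectors are
; spilled to the 64-byte-aligned stack for the second (pw_1) pass. Register
; contract: m12 = [pw_1], m13 = [pw_pmpmpmpm]; the result
; |energy(source) - energy(recon)| ends up in the low dword of xm0.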
%macro PSY_SS_8x8 0
lea r4, [3 * r1]
lea r6, [r0 + r1 * 4]
movu xm0, [r0]
movu xm1, [r0 + r1]
movu xm2, [r0 + r1 * 2]
movu xm3, [r0 + r4]
movu xm4, [r6]
movu xm5, [r6 + r1]
movu xm6, [r6 + r1 * 2]
movu xm7, [r6 + r4]
lea r4, [3 * r3]
lea r6, [r2 + r3 * 4]
movu xm8, [r2]
movu xm9, [r2 + r3]
movu xm10, [r2 + r3 * 2]
movu xm11, [r2 + r4]
vinserti128 m0, m0, xm8, 1
vinserti128 m1, m1, xm9, 1
vinserti128 m2, m2, xm10, 1
vinserti128 m3, m3, xm11, 1
movu xm8, [r6]
movu xm9, [r6 + r3]
movu xm10, [r6 + r3 * 2]
movu xm11, [r6 + r4]
vinserti128 m4, m4, xm8, 1
vinserti128 m5, m5, xm9, 1
vinserti128 m6, m6, xm10, 1
vinserti128 m7, m7, xm11, 1
; store the eight row vectors on the stack for the second pass
mova [rsp + 0 * mmsize], m0
mova [rsp + 1 * mmsize], m1
mova [rsp + 2 * mmsize], m2
mova [rsp + 3 * mmsize], m3
mova [rsp + 4 * mmsize], m4
mova [rsp + 5 * mmsize], m5
mova [rsp + 6 * mmsize], m6
mova [rsp + 7 * mmsize], m7
pabsw m8, m0
pabsw m9, m1
paddw m8, m9
pabsw m10, m2
pabsw m11, m3
paddw m10, m11
paddw m8, m10
pabsw m9, m4
pabsw m10, m5
paddw m9, m10
pabsw m11, m6
pabsw m10, m7
paddw m11, m10
paddw m9, m11
paddw m8, m9
psrldq m9, m8, 8
vextracti128 xm10, m8, 1
vextracti128 xm11, m9, 1
vpmovzxwd m8, xm8
vpmovzxwd m9, xm9
vpmovzxwd m10, xm10
vpmovzxwd m11, xm11
vinserti128 m8, m8, xm10, 1
vinserti128 m9, m9, xm11, 1
paddd m8, m9
psrldq m9, m8, 8
paddd m8, m9
psrldq m9, m8, 4
paddd m8, m9
psrld m8, 2 ; per-lane SAD of the 8x8 block >> 2 (source | recon)
pmaddwd m0, m13
pmaddwd m1, m13
pmaddwd m2, m13
pmaddwd m3, m13
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
vshufps m0, m0, m10, 10001000b
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
vshufps m0, m0, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
vshufps m1, m1, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
vshufps m1, m1, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
vshufps m2, m2, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
vshufps m2, m2, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
vshufps m3, m3, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
vshufps m3, m3, m10, 10001000b
SUMSUB_BA d, 0, 1, 9
SUMSUB_BA d, 2, 3, 9
SUMSUB_BA d, 0, 2, 9
SUMSUB_BA d, 1, 3, 9
pmaddwd m4, m13
pmaddwd m5, m13
pmaddwd m6, m13
pmaddwd m7, m13
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
vshufps m4, m4, m10, 10001000b
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
vshufps m4, m4, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
vshufps m5, m5, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
vshufps m5, m5, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
vshufps m6, m6, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
vshufps m6, m6, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
vshufps m7, m7, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
vshufps m7, m7, m10, 10001000b
SUMSUB_BA d, 4, 5, 9
SUMSUB_BA d, 6, 7, 9
SUMSUB_BA d, 4, 6, 9
SUMSUB_BA d, 5, 7, 9
SUMSUB_BA d, 0, 4, 9
SUMSUB_BA d, 1, 5, 9
SUMSUB_BA d, 2, 6, 9
SUMSUB_BA d, 3, 7, 9
pabsd m0, m0
pabsd m2, m2
pabsd m1, m1
pabsd m3, m3
pabsd m4, m4
pabsd m5, m5
pabsd m6, m6
pabsd m7, m7
paddd m0, m2
paddd m1, m3
paddd m0, m1
paddd m5, m4
paddd m0, m5
paddd m7, m6
paddd m11, m0, m7
pmaddwd m0, m12, [rsp + 0 * mmsize]
pmaddwd m1, m12, [rsp + 1 * mmsize]
pmaddwd m2, m12, [rsp + 2 * mmsize]
pmaddwd m3, m12, [rsp + 3 * mmsize]
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
vshufps m0, m0, m10, 10001000b
psrldq m9, m0, 4
psubd m10, m0, m9
paddd m0, m9
vshufps m0, m0, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
vshufps m1, m1, m10, 10001000b
psrldq m9, m1, 4
psubd m10, m1, m9
paddd m1, m9
vshufps m1, m1, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
vshufps m2, m2, m10, 10001000b
psrldq m9, m2, 4
psubd m10, m2, m9
paddd m2, m9
vshufps m2, m2, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
vshufps m3, m3, m10, 10001000b
psrldq m9, m3, 4
psubd m10, m3, m9
paddd m3, m9
vshufps m3, m3, m10, 10001000b
SUMSUB_BA d, 0, 1, 9
SUMSUB_BA d, 2, 3, 9
SUMSUB_BA d, 0, 2, 9
SUMSUB_BA d, 1, 3, 9
pmaddwd m4, m12, [rsp + 4 * mmsize]
pmaddwd m5, m12, [rsp + 5 * mmsize]
pmaddwd m6, m12, [rsp + 6 * mmsize]
pmaddwd m7, m12, [rsp + 7 * mmsize]
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
vshufps m4, m4, m10, 10001000b
psrldq m9, m4, 4
psubd m10, m4, m9
paddd m4, m9
vshufps m4, m4, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
vshufps m5, m5, m10, 10001000b
psrldq m9, m5, 4
psubd m10, m5, m9
paddd m5, m9
vshufps m5, m5, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
vshufps m6, m6, m10, 10001000b
psrldq m9, m6, 4
psubd m10, m6, m9
paddd m6, m9
vshufps m6, m6, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
vshufps m7, m7, m10, 10001000b
psrldq m9, m7, 4
psubd m10, m7, m9
paddd m7, m9
vshufps m7, m7, m10, 10001000b
SUMSUB_BA d, 4, 5, 9
SUMSUB_BA d, 6, 7, 9
SUMSUB_BA d, 4, 6, 9
SUMSUB_BA d, 5, 7, 9
SUMSUB_BA d, 0, 4, 9
SUMSUB_BA d, 1, 5, 9
SUMSUB_BA d, 2, 6, 9
SUMSUB_BA d, 3, 7, 9
pabsd m0, m0
pabsd m2, m2
pabsd m1, m1
pabsd m3, m3
pabsd m4, m4
pabsd m5, m5
pabsd m6, m6
pabsd m7, m7
paddd m0, m2
paddd m1, m3
paddd m0, m1
paddd m5, m4
paddd m0, m5
paddd m7, m6
paddd m0, m7
paddd m0, m11
psrldq m1, m0, 8
paddd m0, m1
psrldq m1, m0, 4
paddd m0, m1
paddd m0, [pd_2]
psrld m0, 2
psubd m0, m8
vextracti128 xm1, m0, 1
psubd m0, m1
pabsd m0, m0
%endmacro
%if ARCH_X86_64
INIT_YMM avx2
cglobal psyCost_ss_8x8, 4, 7, 14
; NOTE: align the stack to 64 bytes so the local data is cache-line aligned
mov r5, rsp
sub rsp, 8*mmsize
and rsp, ~63
mova m12, [pw_1]
mova m13, [pw_pmpmpmpm]
add r1, r1
add r3, r3
PSY_SS_8x8
movd eax, xm0
mov rsp, r5
RET
%endif
%if ARCH_X86_64
INIT_YMM avx2
cglobal psyCost_ss_16x16, 4, 9, 15
; NOTE: align the stack to 64 bytes so the local data is cache-line aligned
mov r5, rsp
sub rsp, 8*mmsize
and rsp, ~63
mova m12, [pw_1]
mova m13, [pw_pmpmpmpm]
add r1, r1
add r3, r3
pxor m14, m14
mov r7d, 2
.loopH:
mov r8d, 2
.loopW:
PSY_SS_8x8
paddd m14, m0
add r0, 16
add r2, 16
dec r8d
jnz .loopW
lea r0, [r0 + r1 * 8 - 32]
lea r2, [r2 + r3 * 8 - 32]
dec r7d
jnz .loopH
movd eax, xm14
mov rsp, r5
RET
%endif
%if ARCH_X86_64
INIT_YMM avx2
cglobal psyCost_ss_32x32, 4, 9, 15
; NOTE: align the stack to 64 bytes so the local data is cache-line aligned
mov r5, rsp
sub rsp, 8*mmsize
and rsp, ~63
mova m12, [pw_1]
mova m13, [pw_pmpmpmpm]
add r1, r1
add r3, r3
pxor m14, m14
mov r7d, 4
.loopH:
mov r8d, 4
.loopW:
PSY_SS_8x8
paddd m14, m0
add r0, 16
add r2, 16
dec r8d
jnz .loopW
lea r0, [r0 + r1 * 8 - 64]
lea r2, [r2 + r3 * 8 - 64]
dec r7d
jnz .loopH
movd eax, xm14
mov rsp, r5
RET
%endif
%if ARCH_X86_64
INIT_YMM avx2
cglobal psyCost_ss_64x64, 4, 9, 15
; NOTE: align the stack to 64 bytes so the local data is cache-line aligned
mov r5, rsp
sub rsp, 8*mmsize
and rsp, ~63
mova m12, [pw_1]
mova m13, [pw_pmpmpmpm]
add r1, r1
add r3, r3
pxor m14, m14
mov r7d, 8
.loopH:
mov r8d, 8
.loopW:
PSY_SS_8x8
paddd m14, m0
add r0, 16
add r2, 16
dec r8d
jnz .loopW
lea r0, [r0 + r1 * 8 - 128]
lea r2, [r2 + r3 * 8 - 128]
dec r7d
jnz .loopH
movd eax, xm14
mov rsp, r5
RET
%endif
;;---------------------------------------------------------------
;; SATD AVX2
;; int pixel_satd(const pixel*, intptr_t, const pixel*, intptr_t)
;;---------------------------------------------------------------
;; r0 - pix0
;; r1 - pix0Stride
;; r2 - pix1
;; r3 - pix1Stride
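; A scalar model of the 4x4 SATD primitive these helpers tile (a hedged
; reference sketch; the AVX2 code below runs the same butterflies on
; 16-column strips, with the first stage folded into pmaddubsw):
;
;   static int satd_4x4(const pixel* pix1, intptr_t s1, const pixel* pix2, intptr_t s2)
;   {
;       int d[4][4], sum = 0;
;       for (int i = 0; i < 4; i++, pix1 += s1, pix2 += s2)
;       {
;           int t0 = pix1[0] - pix2[0], t1 = pix1[1] - pix2[1];
;           int t2 = pix1[2] - pix2[2], t3 = pix1[3] - pix2[3];
;           int s01 = t0 + t1, d01 = t0 - t1, s23 = t2 + t3, d23 = t2 - t3;
;           d[i][0] = s01 + s23; d[i][1] = s01 - s23;   /* horizontal pass */
;           d[i][2] = d01 + d23; d[i][3] = d01 - d23;
;       }
;       for (int j = 0; j < 4; j++)
;       {
;           int s01 = d[0][j] + d[1][j], d01 = d[0][j] - d[1][j];
;           int s23 = d[2][j] + d[3][j], d23 = d[2][j] - d[3][j];
;           sum += abs(s01 + s23) + abs(s01 - s23)      /* vertical pass */
;                + abs(d01 + d23) + abs(d01 - d23);
;       }
;       return sum >> 1;
;   }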
%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
INIT_YMM avx2
cglobal calc_satd_16x8 ; function to compute satd cost for 16 columns, 8 rows
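; dual-lane trick: vbroadcasti128 places the same 16 pixels in both lanes,
; and pmaddubsw against hmul_16p (all-ones pattern in the low lane, +1/-1
; pairs in the high lane) yields pairwise sums below and pairwise differences
; above, i.e. the first Hadamard stage comes free with the unpacking.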
pxor m6, m6
vbroadcasti128 m0, [r0]
vbroadcasti128 m4, [r2]
vbroadcasti128 m1, [r0 + r1]
vbroadcasti128 m5, [r2 + r3]
pmaddubsw m4, m7
pmaddubsw m0, m7
pmaddubsw m5, m7
pmaddubsw m1, m7
psubw m0, m4
psubw m1, m5
vbroadcasti128 m2, [r0 + r1 * 2]
vbroadcasti128 m4, [r2 + r3 * 2]
vbroadcasti128 m3, [r0 + r4]
vbroadcasti128 m5, [r2 + r5]
pmaddubsw m4, m7
pmaddubsw m2, m7
pmaddubsw m5, m7
pmaddubsw m3, m7
psubw m2, m4
psubw m3, m5
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
paddw m4, m0, m1
psubw m1, m1, m0
paddw m0, m2, m3
psubw m3, m2
paddw m2, m4, m0
psubw m0, m4
paddw m4, m1, m3
psubw m3, m1
pabsw m2, m2
pabsw m0, m0
pabsw m4, m4
pabsw m3, m3
pblendw m1, m2, m0, 10101010b
pslld m0, 16
psrld m2, 16
por m0, m2
pmaxsw m1, m0
paddw m6, m1
pblendw m2, m4, m3, 10101010b
pslld m3, 16
psrld m4, 16
por m3, m4
pmaxsw m2, m3
paddw m6, m2
vbroadcasti128 m1, [r0]
vbroadcasti128 m4, [r2]
vbroadcasti128 m2, [r0 + r1]
vbroadcasti128 m5, [r2 + r3]
pmaddubsw m4, m7
pmaddubsw m1, m7
pmaddubsw m5, m7
pmaddubsw m2, m7
psubw m1, m4
psubw m2, m5
vbroadcasti128 m0, [r0 + r1 * 2]
vbroadcasti128 m4, [r2 + r3 * 2]
vbroadcasti128 m3, [r0 + r4]
vbroadcasti128 m5, [r2 + r5]
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
pmaddubsw m4, m7
pmaddubsw m0, m7
pmaddubsw m5, m7
pmaddubsw m3, m7
psubw m0, m4
psubw m3, m5
paddw m4, m1, m2
psubw m2, m1
paddw m1, m0, m3
psubw m3, m0
paddw m0, m4, m1
psubw m1, m4
paddw m4, m2, m3
psubw m3, m2
pabsw m0, m0
pabsw m1, m1
pabsw m4, m4
pabsw m3, m3
pblendw m2, m0, m1, 10101010b
pslld m1, 16
psrld m0, 16
por m1, m0
pmaxsw m2, m1
paddw m6, m2
pblendw m0, m4, m3, 10101010b
pslld m3, 16
psrld m4, 16
por m3, m4
pmaxsw m0, m3
paddw m6, m0
vextracti128 xm0, m6, 1
pmovzxwd m6, xm6
pmovzxwd m0, xm0
paddd m8, m6
paddd m9, m0
ret
cglobal calc_satd_16x4 ; function to compute satd cost for 16 columns, 4 rows
pxor m6, m6
vbroadcasti128 m0, [r0]
vbroadcasti128 m4, [r2]
vbroadcasti128 m1, [r0 + r1]
vbroadcasti128 m5, [r2 + r3]
pmaddubsw m4, m7
pmaddubsw m0, m7
pmaddubsw m5, m7
pmaddubsw m1, m7
psubw m0, m4
psubw m1, m5
vbroadcasti128 m2, [r0 + r1 * 2]
vbroadcasti128 m4, [r2 + r3 * 2]
vbroadcasti128 m3, [r0 + r4]
vbroadcasti128 m5, [r2 + r5]
pmaddubsw m4, m7
pmaddubsw m2, m7
pmaddubsw m5, m7
pmaddubsw m3, m7
psubw m2, m4
psubw m3, m5
paddw m4, m0, m1
psubw m1, m1, m0
paddw m0, m2, m3
psubw m3, m2
paddw m2, m4, m0
psubw m0, m4
paddw m4, m1, m3
psubw m3, m1
pabsw m2, m2
pabsw m0, m0
pabsw m4, m4
pabsw m3, m3
pblendw m1, m2, m0, 10101010b
pslld m0, 16
psrld m2, 16
por m0, m2
pmaxsw m1, m0
paddw m6, m1
pblendw m2, m4, m3, 10101010b
pslld m3, 16
psrld m4, 16
por m3, m4
pmaxsw m2, m3
paddw m6, m2
vextracti128 xm0, m6, 1
pmovzxwd m6, xm6
pmovzxwd m0, xm0
paddd m8, m6
paddd m9, m0
ret
cglobal pixel_satd_16x4, 4,6,10 ; if WIN64 && cpuflag(avx2)
mova m7, [hmul_16p]
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m8, m8
pxor m9, m9
call calc_satd_16x4
paddd m8, m9
vextracti128 xm0, m8, 1
paddd xm0, xm8
movhlps xm1, xm0
paddd xm0, xm1
pshuflw xm1, xm0, q0032
paddd xm0, xm1
movd eax, xm0
RET
cglobal pixel_satd_16x12, 4,6,10 ; if WIN64 && cpuflag(avx2)
mova m7, [hmul_16p]
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m8, m8
pxor m9, m9
call calc_satd_16x8
call calc_satd_16x4
paddd m8, m9
vextracti128 xm0, m8, 1
paddd xm0, xm8
movhlps xm1, xm0
paddd xm0, xm1
pshuflw xm1, xm0, q0032
paddd xm0, xm1
movd eax, xm0
RET
cglobal pixel_satd_16x32, 4,6,10 ; if WIN64 && cpuflag(avx2)
mova m7, [hmul_16p]
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m8, m8
pxor m9, m9
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
paddd m8, m9
vextracti128 xm0, m8, 1
paddd xm0, xm8
movhlps xm1, xm0
paddd xm0, xm1
pshuflw xm1, xm0, q0032
paddd xm0, xm1
movd eax, xm0
RET
cglobal pixel_satd_16x64, 4,6,10 ; if WIN64 && cpuflag(avx2)
mova m7, [hmul_16p]
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m8, m8
pxor m9, m9
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
paddd m8, m9
vextracti128 xm0, m8, 1
paddd xm0, xm8
movhlps xm1, xm0
paddd xm0, xm1
pshuflw xm1, xm0, q0032
paddd xm0, xm1
movd eax, xm0
RET
cglobal pixel_satd_32x8, 4,8,10 ; if WIN64 && cpuflag(avx2)
mova m7, [hmul_16p]
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m8, m8
pxor m9, m9
mov r6, r0
mov r7, r2
call calc_satd_16x8
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call calc_satd_16x8
paddd m8, m9
vextracti128 xm0, m8, 1
paddd xm0, xm8
movhlps xm1, xm0
paddd xm0, xm1
pshuflw xm1, xm0, q0032
paddd xm0, xm1
movd eax, xm0
RET
cglobal pixel_satd_32x16, 4,8,10 ; if WIN64 && cpuflag(avx2)
mova m7, [hmul_16p]
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m8, m8
pxor m9, m9
mov r6, r0
mov r7, r2
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call calc_satd_16x8
call calc_satd_16x8
paddd m8, m9
vextracti128 xm0, m8, 1
paddd xm0, xm8
movhlps xm1, xm0
paddd xm0, xm1
pshuflw xm1, xm0, q0032
paddd xm0, xm1
movd eax, xm0
RET
cglobal pixel_satd_32x24, 4,8,10 ; if WIN64 && cpuflag(avx2)
mova m7, [hmul_16p]
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m8, m8
pxor m9, m9
mov r6, r0
mov r7, r2
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
paddd m8, m9
vextracti128 xm0, m8, 1
paddd xm0, xm8
movhlps xm1, xm0
paddd xm0, xm1
pshuflw xm1, xm0, q0032
paddd xm0, xm1
movd eax, xm0
RET
cglobal pixel_satd_32x32, 4,8,10 ; if WIN64 && cpuflag(avx2)
mova m7, [hmul_16p]
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m8, m8
pxor m9, m9
mov r6, r0
mov r7, r2
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
paddd m8, m9
vextracti128 xm0, m8, 1
paddd xm0, xm8
movhlps xm1, xm0
paddd xm0, xm1
pshuflw xm1, xm0, q0032
paddd xm0, xm1
movd eax, xm0
RET
cglobal pixel_satd_32x64, 4,8,10 ; if WIN64 && cpuflag(avx2)
mova m7, [hmul_16p]
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m8, m8
pxor m9, m9
mov r6, r0
mov r7, r2
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
paddd m8, m9
vextracti128 xm0, m8, 1
paddd xm0, xm8
movhlps xm1, xm0
paddd xm0, xm1
pshuflw xm1, xm0, q0032
paddd xm0, xm1
movd eax, xm0
RET
cglobal pixel_satd_48x64, 4,8,10 ; if WIN64 && cpuflag(avx2)
mova m7, [hmul_16p]
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m8, m8
pxor m9, m9
mov r6, r0
mov r7, r2
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
paddd m8, m9
vextracti128 xm0, m8, 1
paddd xm0, xm8
movhlps xm1, xm0
paddd xm0, xm1
pshuflw xm1, xm0, q0032
paddd xm0, xm1
movd eax, xm0
RET
cglobal pixel_satd_64x16, 4,8,10 ; if WIN64 && cpuflag(avx2)
mova m7, [hmul_16p]
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m8, m8
pxor m9, m9
mov r6, r0
mov r7, r2
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 48]
lea r2, [r7 + 48]
call calc_satd_16x8
call calc_satd_16x8
paddd m8, m9
vextracti128 xm0, m8, 1
paddd xm0, xm8
movhlps xm1, xm0
paddd xm0, xm1
pshuflw xm1, xm0, q0032
paddd xm0, xm1
movd eax, xm0
RET
cglobal pixel_satd_64x32, 4,8,10 ; if WIN64 && cpuflag(avx2)
mova m7, [hmul_16p]
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m8, m8
pxor m9, m9
mov r6, r0
mov r7, r2
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 48]
lea r2, [r7 + 48]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
paddd m8, m9
vextracti128 xm0, m8, 1
paddd xm0, xm8
movhlps xm1, xm0
paddd xm0, xm1
pshuflw xm1, xm0, q0032
paddd xm0, xm1
movd eax, xm0
RET
cglobal pixel_satd_64x48, 4,8,10 ; if WIN64 && cpuflag(avx2)
mova m7, [hmul_16p]
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m8, m8
pxor m9, m9
mov r6, r0
mov r7, r2
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 48]
lea r2, [r7 + 48]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
paddd m8, m9
vextracti128 xm0, m8, 1
paddd xm0, xm8
movhlps xm1, xm0
paddd xm0, xm1
pshuflw xm1, xm0, q0032
paddd xm0, xm1
movd eax, xm0
RET
cglobal pixel_satd_64x64, 4,8,10 ; if WIN64 && cpuflag(avx2)
mova m7, [hmul_16p]
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m8, m8
pxor m9, m9
mov r6, r0
mov r7, r2
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 16]
lea r2, [r7 + 16]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 48]
lea r2, [r7 + 48]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
paddd m8, m9
vextracti128 xm0, m8, 1
paddd xm0, xm8
movhlps xm1, xm0
paddd xm0, xm1
pshuflw xm1, xm0, q0032
paddd xm0, xm1
movd eax, xm0
RET
%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
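; 16-bit (HIGH_BIT_DEPTH) variants: each row is a full ymm vector of 16
; coefficients, the transpose is done with punpck* instead of the pblendw
; trick, and partial sums are widened to dwords into m6 every four rows so
; the 16-bit lanes cannot overflow.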
%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
INIT_YMM avx2
cglobal calc_satd_16x8 ; function to compute satd cost for 16 columns, 8 rows
; rows 0-3
movu m0, [r0]
movu m4, [r2]
psubw m0, m4
movu m1, [r0 + r1]
movu m5, [r2 + r3]
psubw m1, m5
movu m2, [r0 + r1 * 2]
movu m4, [r2 + r3 * 2]
psubw m2, m4
movu m3, [r0 + r4]
movu m5, [r2 + r5]
psubw m3, m5
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
paddw m4, m0, m1
psubw m1, m0
paddw m0, m2, m3
psubw m3, m2
punpckhwd m2, m4, m1
punpcklwd m4, m1
punpckhwd m1, m0, m3
punpcklwd m0, m3
paddw m3, m4, m0
psubw m0, m4
paddw m4, m2, m1
psubw m1, m2
punpckhdq m2, m3, m0
punpckldq m3, m0
paddw m0, m3, m2
psubw m2, m3
punpckhdq m3, m4, m1
punpckldq m4, m1
paddw m1, m4, m3
psubw m3, m4
punpckhqdq m4, m0, m1
punpcklqdq m0, m1
pabsw m0, m0
pabsw m4, m4
pmaxsw m0, m0, m4
punpckhqdq m1, m2, m3
punpcklqdq m2, m3
pabsw m2, m2
pabsw m1, m1
pmaxsw m2, m1
pxor m7, m7
mova m1, m0
punpcklwd m1, m7
paddd m6, m1
mova m1, m0
punpckhwd m1, m7
paddd m6, m1
pxor m7, m7
mova m1, m2
punpcklwd m1, m7
paddd m6, m1
mova m1, m2
punpckhwd m1, m7
paddd m6, m1
; rows 4-7
movu m0, [r0]
movu m4, [r2]
psubw m0, m4
movu m1, [r0 + r1]
movu m5, [r2 + r3]
psubw m1, m5
movu m2, [r0 + r1 * 2]
movu m4, [r2 + r3 * 2]
psubw m2, m4
movu m3, [r0 + r4]
movu m5, [r2 + r5]
psubw m3, m5
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
paddw m4, m0, m1
psubw m1, m0
paddw m0, m2, m3
psubw m3, m2
punpckhwd m2, m4, m1
punpcklwd m4, m1
punpckhwd m1, m0, m3
punpcklwd m0, m3
paddw m3, m4, m0
psubw m0, m4
paddw m4, m2, m1
psubw m1, m2
punpckhdq m2, m3, m0
punpckldq m3, m0
paddw m0, m3, m2
psubw m2, m3
punpckhdq m3, m4, m1
punpckldq m4, m1
paddw m1, m4, m3
psubw m3, m4
punpckhqdq m4, m0, m1
punpcklqdq m0, m1
pabsw m0, m0
pabsw m4, m4
pmaxsw m0, m0, m4
punpckhqdq m1, m2, m3
punpcklqdq m2, m3
pabsw m2, m2
pabsw m1, m1
pmaxsw m2, m1
pxor m7, m7
mova m1, m0
punpcklwd m1, m7
paddd m6, m1
mova m1, m0
punpckhwd m1, m7
paddd m6, m1
pxor m7, m7
mova m1, m2
punpcklwd m1, m7
paddd m6, m1
mova m1, m2
punpckhwd m1, m7
paddd m6, m1
ret
cglobal calc_satd_16x4 ; function to compute satd cost for 16 columns, 4 rows
; rows 0-3
movu m0, [r0]
movu m4, [r2]
psubw m0, m4
movu m1, [r0 + r1]
movu m5, [r2 + r3]
psubw m1, m5
movu m2, [r0 + r1 * 2]
movu m4, [r2 + r3 * 2]
psubw m2, m4
movu m3, [r0 + r4]
movu m5, [r2 + r5]
psubw m3, m5
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r3 * 4]
paddw m4, m0, m1
psubw m1, m0
paddw m0, m2, m3
psubw m3, m2
punpckhwd m2, m4, m1
punpcklwd m4, m1
punpckhwd m1, m0, m3
punpcklwd m0, m3
paddw m3, m4, m0
psubw m0, m4
paddw m4, m2, m1
psubw m1, m2
punpckhdq m2, m3, m0
punpckldq m3, m0
paddw m0, m3, m2
psubw m2, m3
punpckhdq m3, m4, m1
punpckldq m4, m1
paddw m1, m4, m3
psubw m3, m4
punpckhqdq m4, m0, m1
punpcklqdq m0, m1
pabsw m0, m0
pabsw m4, m4
pmaxsw m0, m0, m4
punpckhqdq m1, m2, m3
punpcklqdq m2, m3
pabsw m2, m2
pabsw m1, m1
pmaxsw m2, m1
pxor m7, m7
mova m1, m0
punpcklwd m1, m7
paddd m6, m1
mova m1, m0
punpckhwd m1, m7
paddd m6, m1
pxor m7, m7
mova m1, m2
punpcklwd m1, m7
paddd m6, m1
mova m1, m2
punpckhwd m1, m7
paddd m6, m1
ret
cglobal pixel_satd_16x4, 4,6,8
add r1d, r1d
add r3d, r3d
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
call calc_satd_16x4
vextracti128 xm7, m6, 1
paddd xm6, xm7
pxor xm7, xm7
movhlps xm7, xm6
paddd xm6, xm7
pshufd xm7, xm6, 1
paddd xm6, xm7
movd eax, xm6
RET
cglobal pixel_satd_16x8, 4,6,8
add r1d, r1d
add r3d, r3d
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
call calc_satd_16x8
vextracti128 xm7, m6, 1
paddd xm6, xm7
pxor xm7, xm7
movhlps xm7, xm6
paddd xm6, xm7
pshufd xm7, xm6, 1
paddd xm6, xm7
movd eax, xm6
RET
cglobal pixel_satd_16x12, 4,6,8
add r1d, r1d
add r3d, r3d
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
call calc_satd_16x8
call calc_satd_16x4
vextracti128 xm7, m6, 1
paddd xm6, xm7
pxor xm7, xm7
movhlps xm7, xm6
paddd xm6, xm7
pshufd xm7, xm6, 1
paddd xm6, xm7
movd eax, xm6
RET
cglobal pixel_satd_16x16, 4,6,8
add r1d, r1d
add r3d, r3d
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
call calc_satd_16x8
call calc_satd_16x8
vextracti128 xm7, m6, 1
paddd xm6, xm7
pxor xm7, xm7
movhlps xm7, xm6
paddd xm6, xm7
pshufd xm7, xm6, 1
paddd xm6, xm7
movd eax, xm6
RET
cglobal pixel_satd_16x32, 4,6,8
add r1d, r1d
add r3d, r3d
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
vextracti128 xm7, m6, 1
paddd xm6, xm7
pxor xm7, xm7
movhlps xm7, xm6
paddd xm6, xm7
pshufd xm7, xm6, 1
paddd xm6, xm7
movd eax, xm6
RET
cglobal pixel_satd_16x64, 4,6,8
add r1d, r1d
add r3d, r3d
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
vextracti128 xm7, m6, 1
paddd xm6, xm7
pxor xm7, xm7
movhlps xm7, xm6
paddd xm6, xm7
pshufd xm7, xm6, 1
paddd xm6, xm7
movd eax, xm6
RET
cglobal pixel_satd_32x8, 4,8,8
add r1d, r1d
add r3d, r3d
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
mov r6, r0
mov r7, r2
call calc_satd_16x8
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call calc_satd_16x8
vextracti128 xm7, m6, 1
paddd xm6, xm7
pxor xm7, xm7
movhlps xm7, xm6
paddd xm6, xm7
pshufd xm7, xm6, 1
paddd xm6, xm7
movd eax, xm6
RET
cglobal pixel_satd_32x16, 4,8,8
add r1d, r1d
add r3d, r3d
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
mov r6, r0
mov r7, r2
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call calc_satd_16x8
call calc_satd_16x8
vextracti128 xm7, m6, 1
paddd xm6, xm7
pxor xm7, xm7
movhlps xm7, xm6
paddd xm6, xm7
pshufd xm7, xm6, 1
paddd xm6, xm7
movd eax, xm6
RET
cglobal pixel_satd_32x24, 4,8,8
add r1d, r1d
add r3d, r3d
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
mov r6, r0
mov r7, r2
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
vextracti128 xm7, m6, 1
paddd xm6, xm7
pxor xm7, xm7
movhlps xm7, xm6
paddd xm6, xm7
pshufd xm7, xm6, 1
paddd xm6, xm7
movd eax, xm6
RET
cglobal pixel_satd_32x32, 4,8,8
add r1d, r1d
add r3d, r3d
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
mov r6, r0
mov r7, r2
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
vextracti128 xm7, m6, 1
paddd xm6, xm7
pxor xm7, xm7
movhlps xm7, xm6
paddd xm6, xm7
pshufd xm7, xm6, 1
paddd xm6, xm7
movd eax, xm6
RET
cglobal pixel_satd_32x64, 4,8,8
add r1d, r1d
add r3d, r3d
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
mov r6, r0
mov r7, r2
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
vextracti128 xm7, m6, 1
paddd xm6, xm7
pxor xm7, xm7
movhlps xm7, xm6
paddd xm6, xm7
pshufd xm7, xm6, 1
paddd xm6, xm7
movd eax, xm6
RET
cglobal pixel_satd_48x64, 4,8,8
add r1d, r1d
add r3d, r3d
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
mov r6, r0
mov r7, r2
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 64]
lea r2, [r7 + 64]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
vextracti128 xm7, m6, 1
paddd xm6, xm7
pxor xm7, xm7
movhlps xm7, xm6
paddd xm6, xm7
pshufd xm7, xm6, 1
paddd xm6, xm7
movd eax, xm6
RET
cglobal pixel_satd_64x16, 4,8,8
add r1d, r1d
add r3d, r3d
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
mov r6, r0
mov r7, r2
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 64]
lea r2, [r7 + 64]
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 96]
lea r2, [r7 + 96]
call calc_satd_16x8
call calc_satd_16x8
vextracti128 xm7, m6, 1
paddd xm6, xm7
pxor xm7, xm7
movhlps xm7, xm6
paddd xm6, xm7
pshufd xm7, xm6, 1
paddd xm6, xm7
movd eax, xm6
RET
cglobal pixel_satd_64x32, 4,8,8
add r1d, r1d
add r3d, r3d
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
mov r6, r0
mov r7, r2
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 64]
lea r2, [r7 + 64]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 96]
lea r2, [r7 + 96]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
vextracti128 xm7, m6, 1
paddd xm6, xm7
pxor xm7, xm7
movhlps xm7, xm6
paddd xm6, xm7
pshufd xm7, xm6, 1
paddd xm6, xm7
movd eax, xm6
RET
cglobal pixel_satd_64x48, 4,8,8
add r1d, r1d
add r3d, r3d
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
mov r6, r0
mov r7, r2
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 64]
lea r2, [r7 + 64]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 96]
lea r2, [r7 + 96]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
vextracti128 xm7, m6, 1
paddd xm6, xm7
pxor xm7, xm7
movhlps xm7, xm6
paddd xm6, xm7
pshufd xm7, xm6, 1
paddd xm6, xm7
movd eax, xm6
RET
cglobal pixel_satd_64x64, 4,8,8
add r1d, r1d
add r3d, r3d
lea r4, [3 * r1]
lea r5, [3 * r3]
pxor m6, m6
mov r6, r0
mov r7, r2
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 32]
lea r2, [r7 + 32]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 64]
lea r2, [r7 + 64]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
lea r0, [r6 + 96]
lea r2, [r7 + 96]
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
call calc_satd_16x8
vextracti128 xm7, m6, 1
paddd xm6, xm7
pxor xm7, xm7
movhlps xm7, xm6
paddd xm6, xm7
pshufd xm7, xm6, 1
paddd xm6, xm7
movd eax, xm6
RET
%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
;-------------------------------------------------------------------------------------------------------------------------------------
; pixel planeClipAndMax(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix)
;-------------------------------------------------------------------------------------------------------------------------------------
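; A scalar sketch of the contract (a hedged reconstruction from the code
; below: clip the plane in place, return the post-clip maximum, and store
; the sum of the clipped pixels through outsum):
;
;   pixel planeClipAndMax(pixel* src, intptr_t stride, int width, int height,
;                         uint64_t* outsum, const pixel minPix, const pixel maxPix)
;   {
;       pixel maxLumaLevel = 0;
;       uint64_t sumLuma = 0;
;       for (int r = 0; r < height; r++, src += stride)
;           for (int c = 0; c < width; c++)
;           {
;               src[c] = x265_clip3(minPix, maxPix, src[c]);   /* clip in place  */
;               maxLumaLevel = X265_MAX(src[c], maxLumaLevel); /* track maximum  */
;               sumLuma += src[c];                             /* accumulate sum */
;           }
;       *outsum = sumLuma;
;       return maxLumaLevel;
;   }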
%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
INIT_YMM avx2
cglobal planeClipAndMax, 5,7,8
movd xm0, r5m
vpbroadcastb m0, xm0 ; m0 = [min]
vpbroadcastb m1, r6m ; m1 = [max]
pxor m2, m2 ; m2 = sumLuma
pxor m3, m3 ; m3 = maxLumaLevel
pxor m4, m4 ; m4 = zero
; build a mask for the partial (tail) pixels of each row
mov r5d, r2d
and r2d, ~(mmsize - 1)
sub r5d, r2d
lea r6, [pb_movemask_32 + mmsize]
sub r6, r5
movu m5, [r6] ; m5 = mask for the last partial columns
.loopH:
lea r5d, [r2 - mmsize]
.loopW:
movu m6, [r0 + r5]
pmaxub m6, m0
pminub m6, m1
movu [r0 + r5], m6 ; store back
pmaxub m3, m6 ; update maxLumaLevel
psadbw m6, m4
paddq m2, m6
sub r5d, mmsize
jge .loopW
; partial pixels
movu m7, [r0 + r2]
pmaxub m6, m7, m0
pminub m6, m1
pand m7, m5 ; keep the invalid/unchanged pixels
pandn m6, m5, m6 ; clear invalid pixels
por m7, m6 ; combine valid & invalid pixels
movu [r0 + r2], m7 ; store back
pmaxub m3, m6 ; update maxLumaLevel
psadbw m6, m4
paddq m2, m6
.next:
add r0, r1
dec r3d
jg .loopH
; sumLuma
vextracti128 xm0, m2, 1
paddq xm0, xm2
movhlps xm1, xm0
paddq xm0, xm1
movq [r4], xm0
; maxLumaLevel
vextracti128 xm0, m3, 1
pmaxub xm0, xm3
movhlps xm3, xm0
pmaxub xm0, xm3
pmovzxbw xm0, xm0
pxor xm0, [pb_movemask + 16]
phminposuw xm0, xm0
movd eax, xm0
not al
movzx eax, al
RET
%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0