forked from mirror/libbpg
2760 lines
68 KiB
NASM
2760 lines
68 KiB
NASM
;*****************************************************************************
|
|
;* ssd-a.asm: x86 ssd functions
|
|
;*****************************************************************************
|
|
;* Copyright (C) 2003-2013 x264 project
|
|
;*
|
|
;* Authors: Loren Merritt <lorenm@u.washington.edu>
|
|
;* Fiona Glaser <fiona@x264.com>
|
|
;* Laurent Aimar <fenrir@via.ecp.fr>
|
|
;* Alex Izvorski <aizvorksi@gmail.com>
|
|
;*
|
|
;* This program is free software; you can redistribute it and/or modify
|
|
;* it under the terms of the GNU General Public License as published by
|
|
;* the Free Software Foundation; either version 2 of the License, or
|
|
;* (at your option) any later version.
|
|
;*
|
|
;* This program is distributed in the hope that it will be useful,
|
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
;* GNU General Public License for more details.
|
|
;*
|
|
;* You should have received a copy of the GNU General Public License
|
|
;* along with this program; if not, write to the Free Software
|
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
|
;*
|
|
;* This program is also available under a commercial proprietary license.
|
|
;* For more information, contact us at license @ x265.com.
|
|
;*****************************************************************************
|
|
|
|
%include "x86inc.asm"
|
|
%include "x86util.asm"
|
|
|
|
SECTION_RODATA 32
|
|
|
|
SECTION .text
|
|
|
|
cextern pw_00ff
|
|
cextern hsub_mul
|
|
|
|
;=============================================================================
|
|
; SSD
|
|
;=============================================================================
|
|
|
|
%if HIGH_BIT_DEPTH
|
|
;-----------------------------------------------------------------------------
|
|
; int pixel_ssd_ss_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t )
|
|
;-----------------------------------------------------------------------------
|
|
;-----------------------------------------------------------------------------
; SSD_ONE width, height
; Emits pixel_ssd_ss_<width>x<height>: sum of squared differences between two
; blocks of 16-bit samples.
;   r0 / r1 : first block pointer / stride
;   r2 / r3 : second block pointer / stride
;   r4d     : loop counter (only when more than one pass is needed)
;   r5 / r6 : 3*stride for each block (one-vector-per-row layout only)
;   m0      : running dword partial sums
; Every pass handles four vectors per block; how they map onto rows depends on
; mmsize relative to the row size (2*width bytes).
;-----------------------------------------------------------------------------
%macro SSD_ONE 2
cglobal pixel_ssd_ss_%1x%2, 4,7,8
    FIX_STRIDES r1, r3
%if mmsize == %1*2                      ; one vector per row: 4 rows per pass
    %define offset0_1 r1
    %define offset0_2 r1*2
    %define offset0_3 r5
    %define offset1_1 r3
    %define offset1_2 r3*2
    %define offset1_3 r6
    lea         r5, [3*r1]
    lea         r6, [3*r3]
%elif mmsize == %1                      ; two vectors per row: 2 rows per pass
    %define offset0_1 mmsize
    %define offset0_2 r1
    %define offset0_3 r1+mmsize
    %define offset1_1 mmsize
    %define offset1_2 r3
    %define offset1_3 r3+mmsize
%elif mmsize == %1/2                    ; four vectors per row: 1 row per pass
    %define offset0_1 mmsize
    %define offset0_2 mmsize*2
    %define offset0_3 mmsize*3
    %define offset1_1 mmsize
    %define offset1_2 mmsize*2
    %define offset1_3 mmsize*3
%endif
%assign %%n %2/(2*mmsize/%1)            ; number of passes over the block
%if %%n > 1
    mov         r4d, %%n
%endif
    pxor        m0, m0
.loop:
    movu        m1, [r0]
    movu        m2, [r0+offset0_1]
    movu        m3, [r0+offset0_2]
    movu        m4, [r0+offset0_3]
    movu        m6, [r2]
    movu        m7, [r2+offset1_1]
    psubw       m1, m6
    psubw       m2, m7
    movu        m6, [r2+offset1_2]
    movu        m7, [r2+offset1_3]
    psubw       m3, m6
    psubw       m4, m7
%if %%n > 1
    ; step both pointers by the rows consumed in this pass
    lea         r0, [r0+r1*(%2/%%n)]
    lea         r2, [r2+r3*(%2/%%n)]
%endif
    pmaddwd     m1, m1                  ; square and add adjacent word pairs
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    pmaddwd     m4, m4
    paddd       m1, m2
    paddd       m3, m4
    paddd       m0, m1
    paddd       m0, m3
%if %%n > 1
    dec         r4d
    jg .loop
%endif

%if BIT_DEPTH == 12 && mmsize == 16
    ; Zero-extend the four dword partial sums to qwords before the final
    ; horizontal add, so the total is formed in 64 bits.
    mova        m5, m0
    pxor        m6, m6
    punpckldq   m0, m6
    punpckhdq   m5, m6
    paddq       m0, m5
    movhlps     m5, m0
    paddq       m0, m5
    ; Name the return register directly instead of going through an x86inc
    ; rN alias, so the result does not depend on the register numbering.
    movq        rax, xm0
%else
    HADDD       m0, m5
    movd        eax, xm0
%endif
%ifidn movu,movq ; detect MMX
    EMMS
%endif
    RET
%endmacro
|
|
|
|
;-----------------------------------------------------------------------------
; SSD_TWO_ROW width, row0, row1
; Helper for SSD_TWO: adds the squared differences of one row into m0.
;   %1 = block width (48 or 64 samples), %2 / %3 = address expressions of the
;   row in the first / second block.  Clobbers m1-m4, m6, m7.
;-----------------------------------------------------------------------------
%macro SSD_TWO_ROW 3
    ; samples 0..31 (four XMM vectors)
    movu        m1, [%2]
    movu        m2, [%2 + 16]
    movu        m3, [%2 + 32]
    movu        m4, [%2 + 48]
    movu        m6, [%3]
    movu        m7, [%3 + 16]
    psubw       m1, m6
    psubw       m2, m7
    movu        m6, [%3 + 32]
    movu        m7, [%3 + 48]
    psubw       m3, m6
    psubw       m4, m7
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    pmaddwd     m4, m4
    paddd       m1, m2
    paddd       m3, m4
    paddd       m0, m1
    paddd       m0, m3
    ; samples 32..47
    movu        m1, [%2 + 64]
    movu        m2, [%2 + 80]
    movu        m6, [%3 + 64]
    movu        m7, [%3 + 80]
    psubw       m1, m6
    psubw       m2, m7
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    paddd       m1, m2
    paddd       m0, m1
%if %1 == 64
    ; samples 48..63
    movu        m3, [%2 + 96]
    movu        m4, [%2 + 112]
    movu        m6, [%3 + 96]
    movu        m7, [%3 + 112]
    psubw       m3, m6
    psubw       m4, m7
    pmaddwd     m3, m3
    pmaddwd     m4, m4
    paddd       m3, m4
    paddd       m0, m3
%endif
%endmacro

;-----------------------------------------------------------------------------
; SSD_TWO width, height
; Emits pixel_ssd_ss_<width>x<height> for 48- and 64-wide blocks of 16-bit
; samples; two rows are processed per loop iteration.
;   r0/r1, r2/r3 : pointer/stride of the two blocks
;   r4d          : iteration counter (height/2)
;   r5, r6       : 2*stride of each block
;-----------------------------------------------------------------------------
%macro SSD_TWO 2
cglobal pixel_ssd_ss_%1x%2, 4,7,8
    FIX_STRIDES r1, r3
    pxor        m0, m0
    mov         r4d, %2/2
    lea         r5, [r1 * 2]
    lea         r6, [r3 * 2]
.loop:
    SSD_TWO_ROW %1, r0, r2
    SSD_TWO_ROW %1, r0 + r1, r2 + r3
    lea         r0, [r0 + r5]
    lea         r2, [r2 + r6]
    dec         r4d
    jnz         .loop
    HADDD       m0, m5
    movd        eax, xm0
    RET
%endmacro
|
|
;-----------------------------------------------------------------------------
; SSD_24 width, height
; Emits pixel_ssd_ss_<width>x<height> for 24-wide blocks of 16-bit samples
; (three XMM vectors per row), two rows per loop iteration.
;   r0/r1, r2/r3 : pointer/stride of the two blocks
;   r4d          : iteration counter (height/2);  r5, r6 : 2*stride
;-----------------------------------------------------------------------------
%macro SSD_24 2
cglobal pixel_ssd_ss_%1x%2, 4,7,8
    FIX_STRIDES r1, r3
    pxor        m0, m0
    mov         r4d, %2/2
    lea         r5, [r1 * 2]
    lea         r6, [r3 * 2]
.loop:
    ; --- first row: differences in m1, m2, m3 ---
    movu        m1, [r0]
    movu        m5, [r2]
    psubw       m1, m5
    movu        m2, [r0 + 16]
    movu        m6, [r2 + 16]
    psubw       m2, m6
    movu        m3, [r0 + 32]
    movu        m7, [r2 + 32]
    psubw       m3, m7
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3                  ; kept; folded in together with m4 below
    paddd       m1, m2
    paddd       m0, m1
    ; --- second row: differences in m1, m2, m4 ---
    movu        m1, [r0 + r1]
    movu        m5, [r2 + r3]
    psubw       m1, m5
    movu        m2, [r0 + r1 + 16]
    movu        m6, [r2 + r3 + 16]
    psubw       m2, m6
    movu        m4, [r0 + r1 + 32]
    movu        m7, [r2 + r3 + 32]
    psubw       m4, m7
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m4, m4
    paddd       m1, m2
    paddd       m3, m4
    paddd       m0, m1
    paddd       m0, m3
    lea         r0, [r0 + r5]
    lea         r2, [r2 + r6]
    dec         r4d
    jnz         .loop
    HADDD       m0, m5
    movd        eax, xm0
    RET
%endmacro
|
|
;-----------------------------------------------------------------------------
; SSD_12 width, height
; Emits pixel_ssd_ss_<width>x<height> for 12-wide blocks of 16-bit samples.
; Four rows per loop iteration, handled as two row pairs: the first 8 samples
; of each row fill one vector, the remaining 4 samples of both rows of a pair
; are packed together into one vector.
;   r0/r1, r2/r3 : pointer/stride of the two blocks
;   r4d          : iteration counter (height/4);  r5, r6 : 2*stride
; NOTE(review): punpcklqdq takes a full 16-byte memory operand at row+16;
; confirm callers guarantee that this is readable and suitably aligned.
;-----------------------------------------------------------------------------
%macro SSD_12 2
cglobal pixel_ssd_ss_%1x%2, 4,7,8
    FIX_STRIDES r1, r3
    pxor        m0, m0
    mov         r4d, %2/4
    lea         r5, [r1 * 2]
    lea         r6, [r3 * 2]
.loop:
    ; --- rows 0 and 1 ---
    movu        m1, [r0]
    movu        m3, [r0 + r1]
    movh        m2, [r0 + 16]
    punpcklqdq  m2, [r0 + r1 + 16]
    movu        m7, [r2]
    psubw       m1, m7
    movu        m7, [r2 + r3]
    psubw       m3, m7
    movh        m4, [r2 + 16]
    punpcklqdq  m4, [r2 + r3 + 16]
    psubw       m2, m4
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3                  ; kept; folded in together with m6 below
    paddd       m1, m2
    paddd       m0, m1

    ; --- rows 2 and 3: step both pointers first, then reuse the addressing ---
    lea         r0, [r0 + r5]
    lea         r2, [r2 + r6]
    movu        m1, [r0]
    movu        m6, [r0 + r1]
    movh        m2, [r0 + 16]
    punpcklqdq  m2, [r0 + r1 + 16]
    movu        m7, [r2]
    psubw       m1, m7
    movu        m7, [r2 + r3]
    psubw       m6, m7
    movh        m4, [r2 + 16]
    punpcklqdq  m4, [r2 + r3 + 16]
    psubw       m2, m4
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m6, m6
    paddd       m1, m2
    paddd       m3, m6
    paddd       m0, m1
    paddd       m0, m3
    lea         r0, [r0 + r5]
    lea         r2, [r2 + r6]
    dec         r4d
    jnz         .loop
    HADDD       m0, m5
    movd        eax, xm0
    RET
%endmacro
|
|
|
|
; pixel_ssd_16x16 (AVX2, 16-bit samples): one YMM vector per row, four rows
; per iteration, four iterations.
;   r0/r1, r2/r3 : pointer/stride of the two blocks;  r5, r6 : 3*stride
INIT_YMM avx2
cglobal pixel_ssd_16x16, 4,7,8
    FIX_STRIDES r1, r3
    pxor        m0, m0
    mov         r4d, 4
    lea         r5, [3 * r1]
    lea         r6, [3 * r3]
.loop:
    movu        m1, [r0]
    movu        m6, [r2]
    psubw       m1, m6
    movu        m2, [r0 + r1]
    movu        m7, [r2 + r3]
    psubw       m2, m7
    movu        m3, [r0 + r1 * 2]
    movu        m6, [r2 + r3 * 2]
    psubw       m3, m6
    movu        m4, [r0 + r5]
    movu        m7, [r2 + r6]
    psubw       m4, m7

    lea         r0, [r0 + r1 * 4]
    lea         r2, [r2 + r3 * 4]

    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    pmaddwd     m4, m4
    paddd       m1, m2
    paddd       m3, m4
    paddd       m0, m1
    paddd       m0, m3

    dec         r4d
    jg          .loop

    HADDD       m0, m5
    movd        eax, xm0
    RET
|
|
|
|
; pixel_ssd_32x32 (AVX2, 16-bit samples): two YMM vectors per row, two rows
; per iteration, sixteen iterations.
;   r0/r1, r2/r3 : pointer/stride of the two blocks
INIT_YMM avx2
cglobal pixel_ssd_32x32, 4,7,8
    ; Use the shared stride helper like the sibling 16x16 / 64x64 functions
    ; instead of hand-written "add r1, r1 / add r3, r3".
    FIX_STRIDES r1, r3
    mov         r4d, 16
    pxor        m0, m0
.loop:
    movu        m1, [r0]
    movu        m2, [r0 + 32]
    movu        m3, [r0 + r1]
    movu        m4, [r0 + r1 + 32]
    movu        m6, [r2]
    movu        m7, [r2 + 32]
    psubw       m1, m6
    psubw       m2, m7
    movu        m6, [r2 + r3]
    movu        m7, [r2 + r3 + 32]
    psubw       m3, m6
    psubw       m4, m7

    lea         r0, [r0 + r1 * 2]
    lea         r2, [r2 + r3 * 2]

    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    pmaddwd     m4, m4
    paddd       m1, m2
    paddd       m3, m4
    paddd       m0, m1
    paddd       m0, m3

    dec         r4d
    jg          .loop

    HADDD       m0, m5
    movd        eax, xm0
    RET
|
|
|
|
; pixel_ssd_64x64 (AVX2, 16-bit samples): four YMM vectors per row, one row
; per iteration, 64 iterations.
;   r0/r1, r2/r3 : pointer/stride of the two blocks
INIT_YMM avx2
cglobal pixel_ssd_64x64, 4,7,8
    FIX_STRIDES r1, r3
    pxor        m0, m0
    mov         r4d, 64
.loop:
    movu        m1, [r0]
    movu        m6, [r2]
    psubw       m1, m6
    movu        m2, [r0+32]
    movu        m7, [r2+32]
    psubw       m2, m7
    movu        m3, [r0+32*2]
    movu        m6, [r2+32*2]
    psubw       m3, m6
    movu        m4, [r0+32*3]
    movu        m7, [r2+32*3]
    psubw       m4, m7

    lea         r0, [r0+r1]
    lea         r2, [r2+r3]

    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    pmaddwd     m4, m4
    paddd       m1, m2
    paddd       m3, m4
    paddd       m0, m1
    paddd       m0, m3

    dec         r4d
    jg          .loop

    HADDD       m0, m5
    movd        eax, xm0
    RET
|
|
|
|
; ---- pixel_ssd_ss instantiations for 16-bit samples ----

; MMX (8-byte vectors)
INIT_MMX mmx2
SSD_ONE     4,  4
SSD_ONE     4,  8
SSD_ONE     4, 16
SSD_ONE     8,  4
SSD_ONE     8,  8
SSD_ONE     8, 16
SSD_ONE    16,  8
SSD_ONE    16, 16

; SSE2 (16-byte vectors)
INIT_XMM sse2
SSD_ONE     8,  4
SSD_ONE     8,  8
SSD_ONE     8, 16
SSD_ONE     8, 32
SSD_12     12, 16
SSD_ONE    16,  4
SSD_ONE    16,  8
SSD_ONE    16, 12
SSD_ONE    16, 16
SSD_ONE    16, 32
SSD_ONE    16, 64
SSD_24     24, 32
SSD_ONE    32,  8
SSD_ONE    32, 16
SSD_ONE    32, 24
SSD_ONE    32, 32
SSD_ONE    32, 64
SSD_TWO    48, 64
SSD_TWO    64, 16
SSD_TWO    64, 32
SSD_TWO    64, 48
SSD_TWO    64, 64

; AVX2 (32-byte vectors)
INIT_YMM avx2
SSD_ONE    16,  8
SSD_ONE    16, 16
SSD_ONE    32, 32
SSD_ONE    64, 64
SSD_ONE    16, 32
SSD_ONE    32, 64
|
|
%endif ; HIGH_BIT_DEPTH
|
|
|
|
;-----------------------------------------------------------------------------
|
|
; int pixel_ssd_ss_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t )
|
|
;-----------------------------------------------------------------------------
|
|
%if HIGH_BIT_DEPTH == 0
|
|
;-----------------------------------------------------------------------------
; SSD_SS_ACC load, addr0, addr1
; Helper for SSD_SS: loads one vector from each block with the given load
; instruction and adds the squared word differences into m0.
; Clobbers m1, m2.
;-----------------------------------------------------------------------------
%macro SSD_SS_ACC 3
    %1          m1, [%2]
    %1          m2, [%3]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
%endmacro

;-----------------------------------------------------------------------------
; SSD_SS width, height
; Emits pixel_ssd_ss_<width>x<height> for blocks of 16-bit values whose
; strides are given in elements (row addresses are formed as stride*2).
;   r0/r1, r2/r3 : pointer/stride of the two blocks
;   r4d          : pass counter;  r5, r6 : byte offset of the fourth row
; Each pass performs four load/accumulate steps:
;   width 4 or 8 -> four rows (half or whole vector per row)
;   width 16     -> two rows (two vectors per row)
;-----------------------------------------------------------------------------
%macro SSD_SS 2
cglobal pixel_ssd_ss_%1x%2, 4,7,6
    FIX_STRIDES r1, r3
%if mmsize == %1*4 || mmsize == %1*2
    %define offset0_1 r1*2
    %define offset0_2 r1*4
    %define offset0_3 r5
    %define offset1_1 r3*2
    %define offset1_2 r3*4
    %define offset1_3 r6
    ; r5 = 6*r1, r6 = 6*r3 (third row offset, built in two steps)
    lea         r5, [4*r1]
    lea         r6, [4*r3]
    lea         r5, [r5 + 2*r1]
    lea         r6, [r6 + 2*r3]
%elif mmsize == %1
    %define offset0_1 16
    %define offset0_2 r1*2
    %define offset0_3 r1*2+16
    %define offset1_1 16
    %define offset1_2 r3*2
    %define offset1_3 r3*2+16
%endif
%if %1 == 4
    %assign %%n %2/(mmsize/%1)
%else
    %assign %%n %2/(2*mmsize/%1)
%endif
%if %%n > 1
    mov         r4d, %%n
%endif
    pxor        m0, m0
.loop:
%if %1 == 4
    ; 4-wide rows only fill half a vector
    SSD_SS_ACC  movh, r0,             r2
    SSD_SS_ACC  movh, r0 + offset0_1, r2 + offset1_1
    SSD_SS_ACC  movh, r0 + offset0_2, r2 + offset1_2
    SSD_SS_ACC  movh, r0 + offset0_3, r2 + offset1_3
%else
    SSD_SS_ACC  movu, r0,             r2
    SSD_SS_ACC  movu, r0 + offset0_1, r2 + offset1_1
    SSD_SS_ACC  movu, r0 + offset0_2, r2 + offset1_2
    SSD_SS_ACC  movu, r0 + offset0_3, r2 + offset1_3
%endif
    lea         r0, [r0+r1*(%2/%%n)*2]
    lea         r2, [r2+r3*(%2/%%n)*2]
%if %%n > 1
    dec         r4d
    jg .loop
%endif
%if %1 == 4
    ; only the two low dwords of m0 carry sums here
  %if notcpuflag(ssse3)
    pshufd      m1, m0, 1
    paddd       m0, m1
  %else
    phaddd      m0, m0
  %endif
%else
    HADDD       m0, m1
%endif
    movd        eax, m0
    RET
%endmacro
|
|
; SSD_SS_ONE: instantiates SSD_SS for every 4-, 8- and 16-wide block size,
; using whichever instruction set is currently selected.
%macro SSD_SS_ONE 0
    ; 4 wide
    SSD_SS      4,  4
    SSD_SS      4,  8
    SSD_SS      4, 16
    ; 8 wide
    SSD_SS      8,  4
    SSD_SS      8,  8
    SSD_SS      8, 16
    SSD_SS      8, 32
    ; 16 wide
    SSD_SS     16,  4
    SSD_SS     16,  8
    SSD_SS     16, 12
    SSD_SS     16, 16
    SSD_SS     16, 32
    SSD_SS     16, 64
%endmacro
|
|
|
|
;-----------------------------------------------------------------------------
; SSD_SS_12x16: emits pixel_ssd_ss_12x16 for 16-bit values.
; Two rows per iteration, 8 iterations.  Each row is read as two vectors; the
; upper half of the second vector's squared sums is cleared before it is
; accumulated, so only the first 12 values of a row contribute.
;   r0/r1, r2/r3 : pointer/stride of the two blocks;  r4d : iteration counter
;-----------------------------------------------------------------------------
%macro SSD_SS_12x16 0
cglobal pixel_ssd_ss_12x16, 4,7,6
    FIX_STRIDES r1, r3
    mov         r4d, 8
    pxor        m0, m0
.loop:
%rep 2
    ; values 0..7
    movu        m1, [r0]
    movu        m2, [r2]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
    ; values 8..11: shift out the two high dwords
    movu        m1, [r0 + 16]
    movu        m2, [r2 + 16]
    psubw       m1, m2
    pmaddwd     m1, m1
    pslldq      m1, 8
    psrldq      m1, 8
    paddd       m0, m1
    ; next row
    lea         r0, [r0 + 2*r1]
    lea         r2, [r2 + 2*r3]
%endrep
    dec         r4d
    jnz         .loop
    HADDD       m0, m1
    movd        eax, m0
    RET
%endmacro
|
|
|
|
;-----------------------------------------------------------------------------
; SSD_SS_32 height
; Emits pixel_ssd_ss_32x<height> for 16-bit values: four vectors per row,
; two rows per iteration.
;   r0/r1, r2/r3 : pointer/stride of the two blocks;  r4d : height/2
;-----------------------------------------------------------------------------
%macro SSD_SS_32 1
cglobal pixel_ssd_ss_32x%1, 4,7,6
    FIX_STRIDES r1, r3
    mov         r4d, %1/2
    pxor        m0, m0
.loop:
%rep 2                                  ; two rows
%assign %%off 0
%rep 4                                  ; four 16-byte vectors per row
    movu        m1, [r0 + %%off]
    movu        m2, [r2 + %%off]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
%assign %%off %%off + 16
%endrep
    lea         r0, [r0 + 2*r1]
    lea         r2, [r2 + 2*r3]
%endrep
    dec         r4d
    jnz         .loop
    HADDD       m0, m1
    movd        eax, m0
    RET
%endmacro
|
|
|
|
; SSD_SS_32xN: instantiates SSD_SS_32 for every supported 32-wide height.
%macro SSD_SS_32xN 0
    SSD_SS_32    8
    SSD_SS_32   16
    SSD_SS_32   24
    SSD_SS_32   32
    SSD_SS_32   64
%endmacro
|
|
|
|
;-----------------------------------------------------------------------------
; SSD_SS_24: emits pixel_ssd_ss_24x32 for 16-bit values: three vectors per
; row, two rows per iteration, 16 iterations.
;   r0/r1, r2/r3 : pointer/stride of the two blocks;  r4d : iteration counter
;-----------------------------------------------------------------------------
%macro SSD_SS_24 0
cglobal pixel_ssd_ss_24x32, 4,7,6
    FIX_STRIDES r1, r3
    mov         r4d, 16
    pxor        m0, m0
.loop:
%rep 2                                  ; two rows
%assign %%off 0
%rep 3                                  ; three 16-byte vectors per row
    movu        m1, [r0 + %%off]
    movu        m2, [r2 + %%off]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
%assign %%off %%off + 16
%endrep
    lea         r0, [r0 + 2*r1]
    lea         r2, [r2 + 2*r3]
%endrep
    dec         r4d
    jnz         .loop
    HADDD       m0, m1
    movd        eax, m0
    RET
%endmacro
|
|
|
|
;-----------------------------------------------------------------------------
; SSD_SS_48: emits pixel_ssd_ss_48x64 for 16-bit values: six vectors per row,
; two rows per iteration, 32 iterations.
;   r0/r1, r2/r3 : pointer/stride of the two blocks;  r4d : iteration counter
;-----------------------------------------------------------------------------
%macro SSD_SS_48 0
cglobal pixel_ssd_ss_48x64, 4,7,6
    FIX_STRIDES r1, r3
    mov         r4d, 32
    pxor        m0, m0
.loop:
%rep 2                                  ; two rows
%assign %%off 0
%rep 6                                  ; six 16-byte vectors per row
    movu        m1, [r0 + %%off]
    movu        m2, [r2 + %%off]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
%assign %%off %%off + 16
%endrep
    lea         r0, [r0 + 2*r1]
    lea         r2, [r2 + 2*r3]
%endrep
    dec         r4d
    jnz         .loop
    HADDD       m0, m1
    movd        eax, m0
    RET
%endmacro
|
|
|
|
;-----------------------------------------------------------------------------
; SSD_SS_64 height
; Emits pixel_ssd_ss_64x<height> for 16-bit values: eight vectors per row,
; two rows per iteration.
;   r0/r1, r2/r3 : pointer/stride of the two blocks;  r4d : height/2
;-----------------------------------------------------------------------------
%macro SSD_SS_64 1
cglobal pixel_ssd_ss_64x%1, 4,7,6
    FIX_STRIDES r1, r3
    mov         r4d, %1/2
    pxor        m0, m0
.loop:
%rep 2                                  ; two rows
%assign %%off 0
%rep 8                                  ; eight 16-byte vectors per row
    movu        m1, [r0 + %%off]
    movu        m2, [r2 + %%off]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
%assign %%off %%off + 16
%endrep
    lea         r0, [r0 + 2*r1]
    lea         r2, [r2 + 2*r3]
%endrep
    dec         r4d
    jnz         .loop
    HADDD       m0, m1
    movd        eax, m0
    RET
%endmacro
|
|
|
|
; SSD_SS_64xN: instantiates SSD_SS_64 for every supported 64-wide height.
%macro SSD_SS_64xN 0
    SSD_SS_64   16
    SSD_SS_64   32
    SSD_SS_64   48
    SSD_SS_64   64
%endmacro
|
|
|
|
; Full set of pixel_ssd_ss functions for the currently selected instruction
; set; the same list is emitted for each of SSE2, SSE4 and AVX.
%macro SSD_SS_ALL_SIZES 0
    SSD_SS_ONE
    SSD_SS_12x16
    SSD_SS_24
    SSD_SS_32xN
    SSD_SS_48
    SSD_SS_64xN
%endmacro

INIT_XMM sse2
SSD_SS_ALL_SIZES
INIT_XMM sse4
SSD_SS_ALL_SIZES
INIT_XMM avx
SSD_SS_ALL_SIZES
|
|
%endif ; !HIGH_BIT_DEPTH
|
|
|
|
%if HIGH_BIT_DEPTH == 0
|
|
; SSD_LOAD_FULL off0a, off1a, off0b, off1b, advance
; Loads two full vectors from each block:
;   m1 = [t0+%1], m3 = [t0+%3]   (first block)
;   m2 = [t2+%2], m4 = [t2+%4]   (second block)
; then moves t0/t2 forward by one stride (advance == 1), by two strides
; (advance == 2), or leaves them alone (any other value).
%macro SSD_LOAD_FULL 5
    movu        m1, [t0+%1]
    movu        m2, [t2+%2]
    movu        m3, [t0+%3]
    movu        m4, [t2+%4]
%if %5==1
    add         t0, t1
    add         t2, t3
%elif %5==2
    lea         t0, [t0+2*t1]
    lea         t2, [t2+2*t3]
%endif
%endmacro
|
|
|
|
; LOAD reg_a, reg_b, mem_a, mem_b, advance
; Half-vector loads of two first-block rows into m<reg_a> and m<reg_b>;
; a non-zero <advance> then steps t0 by two strides.
%macro LOAD 5
    movh        m%1, %3
    movh        m%2, %4
%if %5
    lea         t0, [t0+2*t1]
%endif
%endmacro
|
|
|
|
; JOIN a, b, c, d, mem_c, mem_d, advance
; Loads the matching second-block rows into m<c> / m<d>, optionally steps t2
; by two strides, widens all four half-vectors from bytes to words against m7
; (set to zero by the caller for this variant) and leaves the word
; differences in m<a> and m<b>.
%macro JOIN 7
    movh        m%3, %5
    movh        m%4, %6
%if %7
    lea         t2, [t2+2*t3]
%endif
    ; first row pair
    punpcklbw   m%1, m7
    punpcklbw   m%3, m7
    psubw       m%1, m%3
    ; second row pair
    punpcklbw   m%2, m7
    punpcklbw   m%4, m7
    psubw       m%2, m%4
%endmacro
|
|
|
|
; JOIN_SSE2 a, b, c, d, mem_c, mem_d, advance
; SSE2 replacement for JOIN: packs both rows of each block into one register
; (punpcklqdq), splits the bytes with DEINTB (x86util) using mask register 7,
; and leaves the word differences in m<a> and m<b>.
%macro JOIN_SSE2 7
    movh        m%3, %5
    movh        m%4, %6
%if %7
    lea         t2, [t2+2*t3]
%endif
    punpcklqdq  m%1, m%2                ; first block: row a | row b
    punpcklqdq  m%3, m%4                ; second block: row c | row d
    DEINTB      %2, %1, %4, %3, 7
    psubw       m%2, m%4
    psubw       m%1, m%3
%endmacro
|
|
|
|
; JOIN_SSSE3 a, b, c, d, mem_c, mem_d, advance
; SSSE3 replacement for JOIN: only interleaves the bytes of the two blocks
; (m<a> with m<c>, m<b> with m<d>); no subtraction is performed here.
%macro JOIN_SSSE3 7
    movh        m%3, %5
    movh        m%4, %6
%if %7
    lea         t2, [t2+2*t3]
%endif
    punpcklbw   m%1, m%3
    punpcklbw   m%2, m%4
%endmacro
|
|
|
|
; LOAD_AVX2 reg_a, reg_b, mem_a, mem_b, advance
; AVX2 replacement for LOAD: <mem_a> goes to the low 128 bits of m<reg_a> and
; <mem_b> to its high 128 bits (reg_b is not written).  A non-zero <advance>
; steps t0 by two strides.
%macro LOAD_AVX2 5
    mova        xm%1, %3
    vinserti128 m%1, m%1, %4, 1
%if %5
    lea         t0, [t0+2*t1]
%endif
%endmacro
|
|
|
|
; JOIN_AVX2 a, b, c, d, mem_c, mem_d, advance
; AVX2 replacement for JOIN: loads two second-block rows into the low / high
; halves of m<b>, optionally steps t2 by two strides, then byte-interleaves
; m<a> and m<b> via SBUTTERFLY (x86util) with m<c> as scratch.
%macro JOIN_AVX2 7
    mova        xm%2, %5
    vinserti128 m%2, m%2, %6, 1
%if %7
    lea         t2, [t2+2*t3]
%endif
    SBUTTERFLY  bw, %1, %2, %3
%endmacro
|
|
|
|
; SSD_LOAD_HALF off0a, off1a, off0b, off1b, advance
; Fetches four half-vector rows per block through LOAD/JOIN.  The first
; LOAD/JOIN pair always steps t0/t2 by two strides; the second pair steps
; them only when <advance> is non-zero.
%macro SSD_LOAD_HALF 5
    ; rows at [t0+%1] and [t0+%3] -> m1, m2 (m3, m4 as partner registers)
    LOAD        1, 2, [t0+%1], [t0+%3], 1
    JOIN        1, 2, 3, 4, [t2+%2], [t2+%4], 1
    ; same offsets two strides further -> m3, m4 (m5, m6 as partner registers)
    LOAD        3, 4, [t0+%1], [t0+%3], %5
    JOIN        3, 4, 5, 6, [t2+%2], [t2+%4], %5
%endmacro
|
|
|
|
; SSD_CORE a, b, c, d, zero, tmp1, tmp2 [, FULL]
; Squares the word differences held in m<a>..m<d> (pmaddwd).  With FULL as
; eighth argument the inputs are raw byte vectors instead (a,c from one
; block, b,d from the other): their absolute byte differences are formed
; first and widened to words against m<zero>.
; All register indices must be distinct.
%macro SSD_CORE 7-8
%ifidn %8, FULL
    ; |a - b| via two saturating subtractions, then widen into b (low) / a (high)
    mova        m%6, m%2
    psubusb     m%2, m%1
    psubusb     m%1, m%6
    por         m%1, m%2
    punpcklbw   m%2, m%1, m%5
    punpckhbw   m%1, m%5
    ; |c - d| likewise, into d (low) / c (high)
    mova        m%7, m%4
    psubusb     m%4, m%3
    psubusb     m%3, m%7
    por         m%3, m%4
    punpcklbw   m%4, m%3, m%5
    punpckhbw   m%3, m%5
%endif
    pmaddwd     m%1, m%1
    pmaddwd     m%2, m%2
    pmaddwd     m%3, m%3
    pmaddwd     m%4, m%4
%endmacro
|
|
|
|
; SSD_CORE_SSE2 a, b, c, d, mask, tmp1, tmp2 [, FULL]
; SSE2 replacement for SSD_CORE.  With FULL, each byte-vector pair is split
; by DEINTB (x86util) using mask register <mask> and subtracted as words;
; SWAP then renames the registers so the results sit in m<a>..m<d>.
; The statement order matters because SWAP changes what the names refer to.
%macro SSD_CORE_SSE2 7-8
%ifidn %8, FULL
    DEINTB      %6, %1, %7, %2, %5
    psubw       m%6, m%7
    psubw       m%1, m%2
    SWAP        %6, %2, %1
    DEINTB      %6, %3, %7, %4, %5
    psubw       m%6, m%7
    psubw       m%3, m%4
    SWAP        %6, %4, %3
%endif
    pmaddwd     m%1, m%1
    pmaddwd     m%2, m%2
    pmaddwd     m%3, m%3
    pmaddwd     m%4, m%4
%endmacro
|
|
|
|
; SSD_CORE_SSSE3 a, b, c, d, coef, tmp1, tmp2 [, FULL]
; SSSE3 replacement for SSD_CORE.  Expects byte-interleaved samples of the two
; blocks; with FULL the interleaving is done here (followed by SWAP register
; renaming).  pmaddubsw with the constant in m<coef> (loaded from hsub_mul by
; the caller) turns each byte pair into one word, which pmaddwd then squares.
; The statement order matters because SWAP changes what the names refer to.
%macro SSD_CORE_SSSE3 7-8
%ifidn %8, FULL
    punpckhbw   m%6, m%1, m%2
    punpckhbw   m%7, m%3, m%4
    punpcklbw   m%1, m%2
    punpcklbw   m%3, m%4
    SWAP        %6, %2, %3
    SWAP        %7, %4
%endif
    pmaddubsw   m%1, m%5
    pmaddubsw   m%2, m%5
    pmaddubsw   m%3, m%5
    pmaddubsw   m%4, m%5
    pmaddwd     m%1, m%1
    pmaddwd     m%2, m%2
    pmaddwd     m%3, m%3
    pmaddwd     m%4, m%4
%endmacro
|
|
|
|
; SSD_ITER kind, off0a, off1a, off0b, off1b, advance
; One loop-body step: loads through SSD_LOAD_<kind> (FULL or HALF), runs
; SSD_CORE on m1..m4 (m7 constant, m5/m6 scratch) and adds the four dword
; vectors into the accumulator m0.
%macro SSD_ITER 6
    SSD_LOAD_%1 %2,%3,%4,%5,%6
    SSD_CORE    1, 2, 3, 4, 7, 5, 6, %1
    paddd       m1, m2
    paddd       m3, m4
    paddd       m0, m1
    paddd       m0, m3
%endmacro
|
|
|
|
;-----------------------------------------------------------------------------
|
|
; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
|
|
;-----------------------------------------------------------------------------
|
|
; SSD width, height
; Emits pixel_ssd_<width>x<height> for 8-bit pixels.  The entry point only
; loads the iteration count into al; non-square sizes then jump into the
; .startloop label of the square <width>x<width> function, which holds the
; shared prologue, loop and epilogue.
;   t0/t1, t2/t3 : pointer/stride of the two blocks
;   m0           : dword accumulator;  m7 : per-ISA constant (see below)
%macro SSD 2
%if %1 != %2
    %assign function_align 8
%else
    %assign function_align 16
%endif
cglobal pixel_ssd_%1x%2, 0,0,0
    mov         al, %1*%2/mmsize/2      ; loop iterations for this block size

%if %1 != %2
    jmp mangle(private_prefix %+ _ %+ pixel_ssd_%1x%1 %+ SUFFIX %+ .startloop)
%else

.startloop:
%if ARCH_X86_64
    DECLARE_REG_TMP 0,1,2,3
    PROLOGUE 0,0,8
%else
    PROLOGUE 0,5
    DECLARE_REG_TMP 1,2,3,4
    ; arguments are fetched from their stack slots
    mov         t0, r0m
    mov         t1, r1m
    mov         t2, r2m
    mov         t3, r3m
%endif

    ; m7: constant consumed by the SSD_CORE / JOIN variant in effect
%if cpuflag(ssse3)
    mova        m7, [hsub_mul]
%elifidn cpuname, sse2
    mova        m7, [pw_00ff]
%elif %1 >= mmsize
    pxor        m7, m7
%endif
    pxor        m0, m0

ALIGN 16
.loop:
%if %1 > mmsize
    ; row wider than one vector: two vectors of a single row per step
    SSD_ITER    FULL, 0, 0, mmsize, mmsize, 1
%elif %1 == mmsize
    ; one vector per row: two rows per step
    SSD_ITER    FULL, 0, 0, t1, t3, 2
%else
    ; narrower than a vector: half-vector loads
    SSD_ITER    HALF, 0, 0, t1, t3, 2
%endif
    dec         al
    jg .loop
%if mmsize==32
    ; fold the upper 128-bit lane into the lower one first
    vextracti128 xm1, m0, 1
    paddd       xm0, xm1
    HADDD       xm0, xm1
    movd        eax, xm0
%else
    HADDD       m0, m1
    movd        eax, m0
%endif
%if (mmsize == 8)
    emms
%endif
    RET
%endif
%endmacro
|
|
|
|
; HEVC_SSD: instantiates SSD for the block sizes listed below, using the
; currently selected instruction set.  Non-square sizes jump into the square
; function of the same width, so that square size is instantiated as well.
%macro HEVC_SSD 0
    SSD         32, 64
    SSD         16, 64
    SSD         32, 32
    SSD         32, 16
    SSD         16, 32
    SSD         32, 8
    SSD         8,  32
    SSD         32, 24
    SSD         24, 24 ; not used, but resolves x265_pixel_ssd_24x24_sse2.startloop symbol
    SSD         8,  4
    SSD         8,  8
    SSD         16, 16
    SSD         16, 12
    SSD         16, 8
    SSD         8,  16
    SSD         16, 4
%endmacro
|
|
|
|
; ---- pixel_ssd instantiations for 8-bit pixels ----
; The SSD_CORE / JOIN / LOAD redefinitions below stay in effect for every
; instantiation that follows them.

; MMX: default SSD_CORE / JOIN / LOAD
INIT_MMX mmx
SSD         16, 16
SSD         16, 8
SSD         8,  8
SSD         8,  16
SSD         4,  4
SSD         8,  4
SSD         4,  8
SSD         4,  16

INIT_XMM sse2slow
SSD         16, 16
SSD         8,  8
SSD         16, 8
SSD         8,  16
SSD         8,  4

; SSE2: switch to the SSE2 core / join variants
INIT_XMM sse2
%define SSD_CORE SSD_CORE_SSE2
%define JOIN JOIN_SSE2
HEVC_SSD

; SSSE3 and later: switch to the SSSE3 core / join variants
INIT_XMM ssse3
%define SSD_CORE SSD_CORE_SSSE3
%define JOIN JOIN_SSSE3
HEVC_SSD

INIT_XMM avx
HEVC_SSD

INIT_MMX ssse3
SSD         4,  4
SSD         4,  8
SSD         4,  16

INIT_XMM xop
SSD         16, 16
SSD         8,  8
SSD         16, 8
SSD         8,  16
SSD         8,  4

; AVX2: switch to the two-rows-per-register load / join variants
%define LOAD LOAD_AVX2
%define JOIN JOIN_AVX2
INIT_YMM avx2
SSD         16, 16
SSD         16, 8
SSD         32, 32
SSD         64, 64

%assign function_align 16
|
|
%endif ; !HIGH_BIT_DEPTH
|
|
|
|
;-----------------------------------------------------------------------------
|
|
; int pixel_ssd_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
|
|
;-----------------------------------------------------------------------------
|
|
; pixel_ssd_12x16 (SSE4, 8-bit pixels).
; Four rows per loop iteration (two row pairs), four iterations.
;   r0/r1, r2/r3 : pointer/stride of the two blocks;  r4d : iteration counter
;   m6           : dword accumulator
; Each row is fetched with a 16-byte load although only 12 bytes are used.
INIT_XMM sse4
cglobal pixel_ssd_12x16, 4, 5, 7, src1, stride1, src2, stride2

    pxor        m6, m6
    mov         r4d, 4

.loop:
%rep 2
    movu        m0, [r0]
    movu        m1, [r2]
    movu        m2, [r0 + r1]
    movu        m3, [r2 + r3]

    ; gather pixels 8..11 of both rows into the low half of m4 / m5
    punpckhdq   m4, m0, m2
    punpckhdq   m5, m1, m3

    ; widen: m0..m3 = pixels 0..7 of each row, m4/m5 = the gathered tails
    pmovzxbw    m0, m0
    pmovzxbw    m1, m1
    pmovzxbw    m2, m2
    pmovzxbw    m3, m3
    pmovzxbw    m4, m4
    pmovzxbw    m5, m5

    psubw       m0, m1
    psubw       m2, m3
    psubw       m4, m5

    pmaddwd     m0, m0
    pmaddwd     m2, m2
    pmaddwd     m4, m4

    paddd       m0, m2
    paddd       m6, m4
    paddd       m6, m0

    ; next row pair
    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
%endrep

    dec         r4d
    jnz         .loop

    HADDD       m6, m1
    movd        eax, m6

    RET
|
|
|
|
;-----------------------------------------------------------------------------
|
|
; int pixel_ssd_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
|
|
;-----------------------------------------------------------------------------
|
|
; pixel_ssd_24x32 (SSE4, 8-bit pixels).
; Two rows per loop iteration, 16 iterations.
;   r0/r1, r2/r3 : pointer/stride of the two blocks;  r4d : iteration counter
;   m6           : zero (for punpckhbw widening);  m7 : dword accumulator
INIT_XMM sse4
cglobal pixel_ssd_24x32, 4, 5, 8, src1, stride1, src2, stride2

    pxor        m7, m7
    pxor        m6, m6
    mov         r4d, 16

.loop:
%rep 2
    ; first block: pixels 0..7 -> m0, 8..15 -> m1, 16..23 -> m2
    movu        m1, [r0]
    pmovzxbw    m0, m1
    punpckhbw   m1, m6
    pmovzxbw    m2, [r0 + 16]
    ; second block: same layout in m3, m4, m5
    movu        m4, [r2]
    pmovzxbw    m3, m4
    punpckhbw   m4, m6
    pmovzxbw    m5, [r2 + 16]

    psubw       m0, m3
    psubw       m1, m4
    psubw       m2, m5

    pmaddwd     m0, m0
    pmaddwd     m1, m1
    pmaddwd     m2, m2

    paddd       m0, m1
    paddd       m7, m2
    paddd       m7, m0

    ; next row
    lea         r0, [r0 + r1]
    lea         r2, [r2 + r3]
%endrep

    dec         r4d
    jnz         .loop

    HADDD       m7, m1
    movd        eax, m7

    RET
|
|
|
|
; PIXEL_SSD_16x4
; Adds the squared differences of a 16x4 block of 8-bit pixels into m7.
; Expects: r0/r1, r2/r3 = pointer/stride of the two blocks, r6 = 2*r1,
;          m6 = 0.
; On exit r0 and r2 have been moved down by two rows (not four).
; Clobbers m0-m5.
%macro PIXEL_SSD_16x4 0
%assign %%pair 0
%rep 2
%if %%pair
    ; step to the second row pair
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
%endif
    ; even row: differences in m0 (pixels 0..7) and m1 (pixels 8..15)
    movu        m1, [r0]
    pmovzxbw    m0, m1
    punpckhbw   m1, m6
    movu        m3, [r2]
    pmovzxbw    m2, m3
    punpckhbw   m3, m6

    psubw       m0, m2
    psubw       m1, m3

    ; odd row: differences in m4 and m5
    movu        m5, [r0 + r1]
    pmovzxbw    m4, m5
    punpckhbw   m5, m6
    movu        m3, [r2 + r3]
    pmovzxbw    m2, m3
    punpckhbw   m3, m6

    psubw       m4, m2
    psubw       m5, m3

    pmaddwd     m0, m0
    pmaddwd     m1, m1
    pmaddwd     m4, m4
    pmaddwd     m5, m5

    paddd       m0, m1
    paddd       m4, m5
    paddd       m4, m0
    paddd       m7, m4
%assign %%pair %%pair + 1
%endrep
%endmacro
|
|
|
|
; pixel_ssd_16x16_internal
; Shared helper reached via "call": adds the SSD of one 16x16 block of 8-bit
; pixels into m7.  Register expectations are those of PIXEL_SSD_16x4
; (r6 = 2*r1, m6 = 0).  On return r0/r2 point two rows above the end of the
; block, so callers step by two more rows to reach the next 16x16 block.
cglobal pixel_ssd_16x16_internal
    PIXEL_SSD_16x4
%rep 3
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    PIXEL_SSD_16x4
%endrep
    ret
|
|
|
|
;-----------------------------------------------------------------------------
|
|
; int pixel_ssd_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
|
|
;-----------------------------------------------------------------------------
|
|
; pixel_ssd_48x64 (SSE4, 8-bit pixels): three 16-pixel columns, each made of
; four vertically stacked 16x16 blocks handled by pixel_ssd_16x16_internal.
;   r4, r5 : saved base pointers of the two blocks;  r6 : 2*stride1
;   m6     : zero;  m7 : dword accumulator
INIT_XMM sse4
cglobal pixel_ssd_48x64, 4, 7, 8, src1, stride1, src2, stride2

    pxor        m7, m7
    pxor        m6, m6
    mov         r4, r0
    mov         r5, r2
    lea         r6, [r1 * 2]

%assign ssd48x64_col 0
%rep 3
%if ssd48x64_col > 0
    ; restart at the top of the next 16-pixel column
    lea         r0, [r4 + ssd48x64_col]
    lea         r2, [r5 + ssd48x64_col]
%endif
    call        pixel_ssd_16x16_internal
%rep 3
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_16x16_internal
%endrep
%assign ssd48x64_col ssd48x64_col + 16
%endrep

    HADDD       m7, m1
    movd        eax, m7

    RET
|
|
|
|
;-----------------------------------------------------------------------------
|
|
; int pixel_ssd_64x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
|
|
;-----------------------------------------------------------------------------
|
|
; pixel_ssd_64x16 (SSE4, 8-bit pixels): four side-by-side 16x16 blocks, each
; handled by pixel_ssd_16x16_internal.
;   r4, r5 : saved base pointers of the two blocks;  r6 : 2*stride1
;   m6     : zero;  m7 : dword accumulator
INIT_XMM sse4
cglobal pixel_ssd_64x16, 4, 7, 8, src1, stride1, src2, stride2

    pxor        m7, m7
    pxor        m6, m6
    mov         r4, r0
    mov         r5, r2
    lea         r6, [r1 * 2]

    call        pixel_ssd_16x16_internal
%assign ssd64x16_col 16
%rep 3
    lea         r0, [r4 + ssd64x16_col]
    lea         r2, [r5 + ssd64x16_col]
    call        pixel_ssd_16x16_internal
%assign ssd64x16_col ssd64x16_col + 16
%endrep

    HADDD       m7, m1
    movd        eax, m7

    RET
|
|
|
|
;-----------------------------------------------------------------------------
|
|
; int pixel_ssd_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
|
|
;-----------------------------------------------------------------------------
|
|
; pixel_ssd_64x32 (SSE4, 8-bit pixels): four 16-pixel columns, each made of
; two vertically stacked 16x16 blocks handled by pixel_ssd_16x16_internal.
;   r4, r5 : saved base pointers of the two blocks;  r6 : 2*stride1
;   m6     : zero;  m7 : dword accumulator
INIT_XMM sse4
cglobal pixel_ssd_64x32, 4, 7, 8, src1, stride1, src2, stride2

    pxor        m7, m7
    pxor        m6, m6
    mov         r4, r0
    mov         r5, r2
    lea         r6, [r1 * 2]

%assign ssd64x32_col 0
%rep 4
%if ssd64x32_col > 0
    ; restart at the top of the next 16-pixel column
    lea         r0, [r4 + ssd64x32_col]
    lea         r2, [r5 + ssd64x32_col]
%endif
    call        pixel_ssd_16x16_internal
    lea         r0, [r0 + r6]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_16x16_internal
%assign ssd64x32_col ssd64x32_col + 16
%endrep

    HADDD       m7, m1
    movd        eax, m7

    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t )
;
; Four 16-byte-wide columns, three helper calls per column.
;   r0 / r2   working src1 / src2 pointers passed to the helper
;   r4 / r5   copies of the incoming src1 / src2, used to rebase each column
;   r6        2 * stride1, the src1 step applied between two tiles
;   m7        zeroed here, folded with HADDD and returned in eax
;   m6        zeroed here
; NOTE(review): pixel_ssd_16x16_internal is defined outside this part of the
; file; the two-row step between calls suggests it leaves r0/r2 fourteen rows
; below where it started - confirm against its definition.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_64x48, 4, 7, 8, src1, stride1, src2, stride2
    pxor    m7, m7
    pxor    m6, m6
    mov     r4, r0
    mov     r5, r2
    lea     r6, [2 * r1]

%assign SSD_COL 0
%rep 4
%if SSD_COL > 0
    ; rebase both pointers to the top of the next 16-pixel column
    lea     r0, [r4 + 16 * SSD_COL]
    lea     r2, [r5 + 16 * SSD_COL]
%endif
    call    pixel_ssd_16x16_internal
%rep 2
    lea     r0, [r0 + r6]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_16x16_internal
%endrep
%assign SSD_COL SSD_COL + 1
%endrep
%undef SSD_COL

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
;
; Four 16-byte-wide columns, four helper calls per column.
;   r0 / r2   working src1 / src2 pointers passed to the helper
;   r4 / r5   copies of the incoming src1 / src2, used to rebase each column
;   r6        2 * stride1, the src1 step applied between two tiles
;   m7        zeroed here, folded with HADDD and returned in eax
;   m6        zeroed here
; NOTE(review): pixel_ssd_16x16_internal is defined outside this part of the
; file; the two-row step between calls suggests it leaves r0/r2 fourteen rows
; below where it started - confirm against its definition.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_64x64, 4, 7, 8, src1, stride1, src2, stride2
    pxor    m7, m7
    pxor    m6, m6
    mov     r4, r0
    mov     r5, r2
    lea     r6, [2 * r1]

%assign SSD_COL 0
%rep 4
%if SSD_COL > 0
    ; rebase both pointers to the top of the next 16-pixel column
    lea     r0, [r4 + 16 * SSD_COL]
    lea     r2, [r5 + 16 * SSD_COL]
%endif
    call    pixel_ssd_16x16_internal
%rep 3
    lea     r0, [r0 + r6]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_16x16_internal
%endrep
%assign SSD_COL SSD_COL + 1
%endrep
%undef SSD_COL

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
|
|
; int pixel_ssd_sp ( int16_t *, intptr_t, uint8_t *, intptr_t )
|
|
;-----------------------------------------------------------------------------
|
|
|
|
; Helper: adds the squared differences of one 4x4 tile into m7.
;   in : r0 = src1 (int16 rows), r1 = src1 row step in bytes, r4 = 3 * r1
;        r2 = src2 (uint8 rows), r3 = src2 row step in bytes
;   out: m7 += four dword partial sums
;        r2 is left two rows further down; r0 is not modified
;   clobbers m0-m6
cglobal pixel_ssd_sp_4x4_internal
    ; rows 0 and 1: two 4 x int16 rows packed into m0, two 4-byte rows into m2
    movh        m0, [r0]
    movh        m1, [r0 + r1]
    movd        m2, [r2]
    movd        m3, [r2 + r3]
    punpcklqdq  m0, m1
    punpckldq   m2, m3
    pmovzxbw    m2, m2                  ; widen the 8 pixels to words
    psubw       m0, m2

    ; rows 2 and 3: same pattern, with r2 stepped down two rows first
    movh        m4, [r0 + 2 * r1]
    movh        m5, [r0 + r4]
    lea         r2, [r2 + 2 * r3]
    punpcklqdq  m4, m5
    movd        m6, [r2]
    movd        m1, [r2 + r3]
    punpckldq   m6, m1
    pmovzxbw    m6, m6
    psubw       m4, m6

    ; square, pair-sum to dwords, accumulate
    pmaddwd     m0, m0
    pmaddwd     m4, m4
    paddd       m0, m4
    paddd       m7, m0
    ret
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_4x4( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; stride1 is doubled (int16 elements -> bytes) before use; r4 = 3 * stride1
; is what the helper uses to reach the fourth src1 row.
; m7 is the accumulator; its four dwords are folded and returned in eax.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_4x4, 4, 5, 8, src1, stride1, src2, stride2
    add     r1, r1
    lea     r4, [3 * r1]
    pxor    m7, m7
    call    pixel_ssd_sp_4x4_internal
    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_4x8( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Two stacked 4x4 tiles.  stride1 is doubled (int16 elements -> bytes);
; r4 = 3 * stride1 for the helper; m7 accumulates and is returned in eax.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_4x8, 4, 5, 8, src1, stride1, src2, stride2
    add     r1, r1
    lea     r4, [3 * r1]
    pxor    m7, m7

    ; rows 0-3
    call    pixel_ssd_sp_4x4_internal
    ; rows 4-7: the helper leaves r0 untouched but has already moved r2 down
    ; two rows, hence the asymmetric steps
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_4x4_internal

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_4x16( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Four stacked 4x4 tiles.  stride1 is doubled (int16 elements -> bytes);
; r4 = 3 * stride1 for the helper; m7 accumulates and is returned in eax.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_4x16, 4, 5, 8, src1, stride1, src2, stride2
    add     r1, r1
    lea     r4, [3 * r1]
    pxor    m7, m7

    ; rows 0-3
    call    pixel_ssd_sp_4x4_internal
    ; rows 4-15: the helper leaves r0 untouched but has already moved r2 down
    ; two rows, hence the asymmetric steps
%rep 3
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_4x4_internal
%endrep

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
; Helper: adds the squared differences of one 8x4 tile into m7.
;   in : r0 = src1 (int16 rows), r1 = src1 row step in bytes, r4 = 3 * r1
;        r2 = src2 (uint8 rows), r3 = src2 row step in bytes, r5 = 3 * r3
;   out: m7 += four dword partial sums; r0 and r2 are not modified
;   clobbers m0-m5
cglobal pixel_ssd_sp_8x4_internal
    ; row 0
    movu        m0, [r0]
    movh        m2, [r2]
    pmovzxbw    m2, m2
    psubw       m0, m2
    ; row 1
    movu        m1, [r0 + r1]
    movh        m3, [r2 + r3]
    pmovzxbw    m3, m3
    psubw       m1, m3
    ; row 2
    movu        m4, [r0 + 2 * r1]
    movh        m2, [r2 + 2 * r3]
    pmovzxbw    m2, m2
    psubw       m4, m2
    ; row 3
    movu        m5, [r0 + r4]
    movh        m3, [r2 + r5]
    pmovzxbw    m3, m3
    psubw       m5, m3

    ; square each difference, pair-sum to dwords, then reduce the four rows
    pmaddwd     m0, m0
    pmaddwd     m1, m1
    paddd       m0, m1
    pmaddwd     m4, m4
    pmaddwd     m5, m5
    paddd       m4, m5
    paddd       m4, m0
    paddd       m7, m4
    ret
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_8x4( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; stride1 is doubled (int16 elements -> bytes).  r4 = 3 * stride1 and
; r5 = 3 * stride2 let the helper address the fourth row of each source.
; m7 accumulates and is returned in eax.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x4, 4, 6, 8, src1, stride1, src2, stride2
    add     r1, r1
    lea     r4, [3 * r1]
    lea     r5, [3 * r3]
    pxor    m7, m7
    call    pixel_ssd_sp_8x4_internal
    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_8x8( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Two stacked 8x4 tiles.  stride1 is doubled (int16 elements -> bytes);
; r4 = 3 * stride1 and r5 = 3 * stride2 serve the helper's fourth-row loads.
; m7 accumulates and is returned in eax.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x8, 4, 6, 8, src1, stride1, src2, stride2
    add     r1, r1
    lea     r4, [3 * r1]
    lea     r5, [3 * r3]
    pxor    m7, m7

    ; rows 0-3
    call    pixel_ssd_sp_8x4_internal
    ; rows 4-7 (the helper does not move r0 / r2)
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    call    pixel_ssd_sp_8x4_internal

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_8x16( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Four stacked 8x4 tiles.  stride1 is doubled (int16 elements -> bytes);
; r4 = 3 * stride1 and r5 = 3 * stride2 serve the helper's fourth-row loads.
; m7 accumulates and is returned in eax.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x16, 4, 6, 8, src1, stride1, src2, stride2
    add     r1, r1
    lea     r4, [3 * r1]
    lea     r5, [3 * r3]
    pxor    m7, m7

    ; rows 0-3
    call    pixel_ssd_sp_8x4_internal
    ; rows 4-15 (the helper does not move r0 / r2)
%rep 3
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    call    pixel_ssd_sp_8x4_internal
%endrep

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_8x32( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Eight stacked 8x4 tiles.  stride1 is doubled (int16 elements -> bytes);
; r4 = 3 * stride1 and r5 = 3 * stride2 serve the helper's fourth-row loads.
; m7 accumulates and is returned in eax.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x32, 4, 6, 8, src1, stride1, src2, stride2
    add     r1, r1
    lea     r4, [3 * r1]
    lea     r5, [3 * r3]
    pxor    m7, m7

    ; rows 0-3
    call    pixel_ssd_sp_8x4_internal
    ; rows 4-31 (the helper does not move r0 / r2)
%rep 7
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    call    pixel_ssd_sp_8x4_internal
%endrep

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_12x16( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Split as a 4-wide strip (four 4x4 tiles) plus an 8-wide strip (four 8x4
; tiles) starting four pixels to the right.
;   r1   stride1 doubled (int16 elements -> bytes)
;   r4   3 * stride1, used by both helpers
;   r5   first the saved src1 pointer, later reused as 3 * stride2
;   r6   saved src2 pointer
;   m7   accumulator, folded and returned in eax
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_12x16, 4, 7, 8, src1, stride1, src2, stride2
    mov     r5, r0
    mov     r6, r2
    add     r1, r1
    lea     r4, [3 * r1]
    pxor    m7, m7

    ; ---- columns 0-3 ----
    call    pixel_ssd_sp_4x4_internal
%rep 3
    ; the 4x4 helper has already moved r2 down two rows
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_4x4_internal
%endrep

    ; ---- columns 4-11 ----
    ; 4 pixels right = 8 bytes of int16 in src1, 4 bytes in src2; r5 must be
    ; read before it is recycled as 3 * stride2
    lea     r0, [r5 + 8]
    lea     r2, [r6 + 4]
    lea     r5, [3 * r3]
    call    pixel_ssd_sp_8x4_internal
%rep 3
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    call    pixel_ssd_sp_8x4_internal
%endrep

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
; PIXEL_SSD_SP_16x4: adds the squared differences of a 16x4 tile into m7.
;   in : r0 = src1 (int16 rows), r1 = src1 row step in bytes
;        r2 = src2 (uint8 rows), r3 = src2 row step in bytes
;        m6 = 0 (zero source for the high-half byte->word unpack)
;   out: m7 += four dword partial sums
;        r0 and r2 are each left TWO rows further down, so callers step two
;        more rows to reach the next 4-row group
;   clobbers m0-m5
; The four rows are handled as two identical row pairs.
%macro PIXEL_SSD_SP_16x4 0
%assign SSD_SP_PAIR 0
%rep 2
%if SSD_SP_PAIR > 0
    ; move on to rows 2-3
    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
%endif
    ; first row of the pair: 16 pixels -> two registers of 8 word differences
    movu        m3, [r2]
    movu        m0, [r0]
    movu        m1, [r0 + 16]
    pmovzxbw    m2, m3                  ; low 8 pixels as words
    punpckhbw   m3, m6                  ; high 8 pixels as words
    psubw       m0, m2
    psubw       m1, m3
    pmaddwd     m0, m0
    pmaddwd     m1, m1
    paddd       m0, m1

    ; second row of the pair
    movu        m3, [r2 + r3]
    movu        m4, [r0 + r1]
    movu        m5, [r0 + r1 + 16]
    pmovzxbw    m2, m3
    punpckhbw   m3, m6
    psubw       m4, m2
    psubw       m5, m3
    pmaddwd     m4, m4
    pmaddwd     m5, m5
    paddd       m4, m5

    ; fold the pair into the running total
    paddd       m4, m0
    paddd       m7, m4
%assign SSD_SP_PAIR SSD_SP_PAIR + 1
%endrep
%undef SSD_SP_PAIR
%endmacro
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_16x4( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; One PIXEL_SSD_SP_16x4 expansion covers the whole block.
;   r1   stride1 doubled (int16 elements -> bytes)
;   m6   zero, required by the macro's high-half unpack
;   m7   accumulator, folded and returned in eax
; Only r0-r3 are touched, so just four GPRs are requested from cglobal (the
; same declaration pixel_ssd_sp_16x8 uses); asking for more would make the
; prologue save registers this function never uses on ABIs where the extra
; ones are callee-saved.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x4, 4, 4, 8, src1, stride1, src2, stride2

    pxor    m6, m6
    pxor    m7, m7
    add     r1, r1
    PIXEL_SSD_SP_16x4
    HADDD   m7, m1
    movd    eax, m7

    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_16x8( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Two stacked 16x4 groups.
;   r1   stride1 doubled (int16 elements -> bytes)
;   m6   zero, required by the macro's high-half unpack
;   m7   accumulator, folded and returned in eax
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x8, 4, 4, 8, src1, stride1, src2, stride2
    add     r1, r1
    pxor    m7, m7
    pxor    m6, m6

    ; rows 0-3
    PIXEL_SSD_SP_16x4
    ; the macro leaves r0 / r2 two rows in; two more reach row 4
    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]
    ; rows 4-7
    PIXEL_SSD_SP_16x4

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_16x12( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Three stacked 16x4 groups.
;   r1        stride1 doubled (int16 elements -> bytes)
;   r4 / r5   2 * stride1 / 2 * stride2: the step between groups, since the
;             macro itself leaves r0 / r2 two rows in
;   m6        zero, required by the macro's high-half unpack
;   m7        accumulator, folded and returned in eax
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x12, 4, 6, 8, src1, stride1, src2, stride2
    add     r1, r1
    lea     r4, [2 * r1]
    lea     r5, [2 * r3]
    pxor    m7, m7
    pxor    m6, m6

    PIXEL_SSD_SP_16x4
%rep 2
    lea     r0, [r0 + r4]
    lea     r2, [r2 + r5]
    PIXEL_SSD_SP_16x4
%endrep

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_16x16( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Four stacked 16x4 groups, expanded inline.
;   r1        stride1 doubled (int16 elements -> bytes)
;   r4 / r5   2 * stride1 / 2 * stride2: the step between groups, since the
;             macro itself leaves r0 / r2 two rows in
;   m6        zero, required by the macro's high-half unpack
;   m7        accumulator, folded and returned in eax
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x16, 4, 6, 8, src1, stride1, src2, stride2
    add     r1, r1
    lea     r4, [2 * r1]
    lea     r5, [2 * r3]
    pxor    m7, m7
    pxor    m6, m6

    PIXEL_SSD_SP_16x4
%rep 3
    lea     r0, [r0 + r4]
    lea     r2, [r2 + r5]
    PIXEL_SSD_SP_16x4
%endrep

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
; Helper: adds the squared differences of one 16x16 tile into m7.
;   in : r0 = src1 (int16 rows), r1 = src1 row step in bytes, r4 = 2 * r1
;        r2 = src2 (uint8 rows), r3 = src2 row step in bytes
;        m6 = 0 (needed by PIXEL_SSD_SP_16x4)
;   out: m7 += four dword partial sums
;        r0 and r2 end up 14 rows below their entry value (each of the four
;        macro expansions adds two rows, each of the three steps two more),
;        so a caller adds two rows to reach the tile underneath
;   clobbers m0-m5
cglobal pixel_ssd_sp_16x16_internal
    PIXEL_SSD_SP_16x4
%rep 3
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4
%endrep
    ret
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_16x32( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Two stacked 16x16 tiles.
;   r1   stride1 doubled (int16 elements -> bytes)
;   r4   2 * stride1: used inside the helper and as the step between tiles
;   m6   zero, required by the helper's byte->word unpack
;   m7   accumulator, folded and returned in eax
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x32, 4, 5, 8, src1, stride1, src2, stride2
    add     r1, r1
    lea     r4, [2 * r1]
    pxor    m7, m7
    pxor    m6, m6

    ; rows 0-15
    call    pixel_ssd_sp_16x16_internal
    ; the helper stops 14 rows in; two more rows reach row 16
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    ; rows 16-31
    call    pixel_ssd_sp_16x16_internal

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_16x64( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Four stacked 16x16 tiles.
;   r1        stride1 doubled (int16 elements -> bytes)
;   r4 / r5   2 * stride1 / 2 * stride2: the step between tiles, since the
;             helper stops 14 rows into each tile (r4 is also used inside it)
;   m6        zero, required by the helper's byte->word unpack
;   m7        accumulator, folded and returned in eax
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x64, 4, 6, 8, src1, stride1, src2, stride2
    add     r1, r1
    lea     r4, [2 * r1]
    lea     r5, [2 * r3]
    pxor    m7, m7
    pxor    m6, m6

    call    pixel_ssd_sp_16x16_internal
%rep 3
    lea     r0, [r0 + r4]
    lea     r2, [r2 + r5]
    call    pixel_ssd_sp_16x16_internal
%endrep

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_24x32( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Split as a 16-wide strip (two 16x16 tiles) plus an 8-wide strip (eight 8x4
; tiles) starting sixteen pixels to the right.
;   r1   stride1 doubled (int16 elements -> bytes)
;   r4   2 * stride1 for the 16-wide strip, then 3 * stride1 for the 8-wide
;   r5   saved src1 pointer, then 3 * stride2 for the 8-wide strip
;   r6   saved src2 pointer
;   m6   zero, required by the 16x16 helper
;   m7   accumulator, folded and returned in eax
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_24x32, 4, 7, 8, src1, stride1, src2, stride2
    mov     r5, r0
    mov     r6, r2
    add     r1, r1
    lea     r4, [2 * r1]
    pxor    m7, m7
    pxor    m6, m6

    ; ---- columns 0-15 ----
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal

    ; ---- columns 16-23 ----
    ; 16 pixels right = 32 bytes of int16 in src1, 16 bytes in src2; r5 must
    ; be read before it is recycled as 3 * stride2
    lea     r0, [r5 + 32]
    lea     r2, [r6 + 16]
    lea     r4, [3 * r1]
    lea     r5, [3 * r3]
    call    pixel_ssd_sp_8x4_internal
%rep 7
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    call    pixel_ssd_sp_8x4_internal
%endrep

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x8( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Two 16-pixel-wide columns, each two stacked 16x4 groups.
;   r1        stride1 doubled (int16 elements -> bytes)
;   r4        2 * stride1, the src1 step between groups
;   r5 / r6   saved src1 / src2, used to rebase the second column
;   m6        zero, required by the macro's high-half unpack
;   m7        accumulator, folded and returned in eax
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x8, 4, 7, 8, src1, stride1, src2, stride2
    mov     r5, r0
    mov     r6, r2
    add     r1, r1
    lea     r4, [2 * r1]
    pxor    m7, m7
    pxor    m6, m6

%assign SSD_COL 0
%rep 2
%if SSD_COL > 0
    ; 16 pixels right = 32 bytes of int16 in src1, 16 bytes in src2
    lea     r0, [r5 + 32 * SSD_COL]
    lea     r2, [r6 + 16 * SSD_COL]
%endif
    PIXEL_SSD_SP_16x4
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4
%assign SSD_COL SSD_COL + 1
%endrep
%undef SSD_COL

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x16( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Two side-by-side 16x16 tiles.
;   r1        stride1 doubled (int16 elements -> bytes)
;   r4        2 * stride1, used inside the helper
;   r5 / r6   saved src1 / src2, used to rebase the second column
;   m6        zero, required by the helper's byte->word unpack
;   m7        accumulator, folded and returned in eax
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x16, 4, 7, 8, src1, stride1, src2, stride2
    mov     r5, r0
    mov     r6, r2
    add     r1, r1
    lea     r4, [2 * r1]
    pxor    m7, m7
    pxor    m6, m6

    ; columns 0-15
    call    pixel_ssd_sp_16x16_internal
    ; columns 16-31: 32 bytes of int16 in src1, 16 bytes in src2
    lea     r0, [r5 + 32]
    lea     r2, [r6 + 16]
    call    pixel_ssd_sp_16x16_internal

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x24( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Two 16-pixel-wide columns; each is one 16x16 tile followed by two 16x4
; groups (rows 16-19 and 20-23).
;   r1        stride1 doubled (int16 elements -> bytes)
;   r4        2 * stride1: used inside the helper and as the src1 step
;   r5 / r6   saved src1 / src2, used to rebase the second column
;   m6        zero, required by the byte->word unpack
;   m7        accumulator, folded and returned in eax
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x24, 4, 7, 8, src1, stride1, src2, stride2
    mov     r5, r0
    mov     r6, r2
    add     r1, r1
    lea     r4, [2 * r1]
    pxor    m7, m7
    pxor    m6, m6

%assign SSD_COL 0
%rep 2
%if SSD_COL > 0
    ; 16 pixels right = 32 bytes of int16 in src1, 16 bytes in src2
    lea     r0, [r5 + 32 * SSD_COL]
    lea     r2, [r6 + 16 * SSD_COL]
%endif
    call    pixel_ssd_sp_16x16_internal
%rep 2
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4
%endrep
%assign SSD_COL SSD_COL + 1
%endrep
%undef SSD_COL

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x32( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Two 16-pixel-wide columns of two stacked 16x16 tiles each.
;   r1        stride1 doubled (int16 elements -> bytes)
;   r4        2 * stride1: used inside the helper and as the step between
;             tiles (the helper stops 14 rows into each tile)
;   r5 / r6   saved src1 / src2, used to rebase the second column
;   m6        zero, required by the helper's byte->word unpack
;   m7        accumulator, folded and returned in eax
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x32, 4, 7, 8, src1, stride1, src2, stride2
    mov     r5, r0
    mov     r6, r2
    add     r1, r1
    lea     r4, [2 * r1]
    pxor    m7, m7
    pxor    m6, m6

%assign SSD_COL 0
%rep 2
%if SSD_COL > 0
    ; 16 pixels right = 32 bytes of int16 in src1, 16 bytes in src2
    lea     r0, [r5 + 32 * SSD_COL]
    lea     r2, [r6 + 16 * SSD_COL]
%endif
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal
%assign SSD_COL SSD_COL + 1
%endrep
%undef SSD_COL

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x64( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Two 16-pixel-wide columns of four stacked 16x16 tiles each.
;   r1        stride1 doubled (int16 elements -> bytes)
;   r4        2 * stride1: used inside the helper and as the step between
;             tiles (the helper stops 14 rows into each tile)
;   r5 / r6   saved src1 / src2, used to rebase the second column
;   m6        zero, required by the helper's byte->word unpack
;   m7        accumulator, folded and returned in eax
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x64, 4, 7, 8, src1, stride1, src2, stride2
    mov     r5, r0
    mov     r6, r2
    add     r1, r1
    lea     r4, [2 * r1]
    pxor    m7, m7
    pxor    m6, m6

%assign SSD_COL 0
%rep 2
%if SSD_COL > 0
    ; 16 pixels right = 32 bytes of int16 in src1, 16 bytes in src2
    lea     r0, [r5 + 32 * SSD_COL]
    lea     r2, [r6 + 16 * SSD_COL]
%endif
    call    pixel_ssd_sp_16x16_internal
%rep 3
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal
%endrep
%assign SSD_COL SSD_COL + 1
%endrep
%undef SSD_COL

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_48x64( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Three 16-pixel-wide columns of four stacked 16x16 tiles each.
;   r1        stride1 doubled (int16 elements -> bytes)
;   r4        2 * stride1: used inside the helper and as the step between
;             tiles (the helper stops 14 rows into each tile)
;   r5 / r6   saved src1 / src2, used to rebase each further column
;   m6        zero, required by the helper's byte->word unpack
;   m7        accumulator, folded and returned in eax
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_48x64, 4, 7, 8, src1, stride1, src2, stride2
    mov     r5, r0
    mov     r6, r2
    add     r1, r1
    lea     r4, [2 * r1]
    pxor    m7, m7
    pxor    m6, m6

%assign SSD_COL 0
%rep 3
%if SSD_COL > 0
    ; each column is 32 bytes of int16 in src1 and 16 bytes in src2
    lea     r0, [r5 + 32 * SSD_COL]
    lea     r2, [r6 + 16 * SSD_COL]
%endif
    call    pixel_ssd_sp_16x16_internal
%rep 3
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal
%endrep
%assign SSD_COL SSD_COL + 1
%endrep
%undef SSD_COL

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_64x16( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Four side-by-side 16x16 tiles.
;   r1        stride1 doubled (int16 elements -> bytes)
;   r4        2 * stride1, used inside the helper
;   r5 / r6   saved src1 / src2, used to rebase each further column
;   m6        zero, required by the helper's byte->word unpack
;   m7        accumulator, folded and returned in eax
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_64x16, 4, 7, 8, src1, stride1, src2, stride2
    mov     r5, r0
    mov     r6, r2
    add     r1, r1
    lea     r4, [2 * r1]
    pxor    m7, m7
    pxor    m6, m6

%assign SSD_COL 0
%rep 4
%if SSD_COL > 0
    ; each column is 32 bytes of int16 in src1 and 16 bytes in src2
    lea     r0, [r5 + 32 * SSD_COL]
    lea     r2, [r6 + 16 * SSD_COL]
%endif
    call    pixel_ssd_sp_16x16_internal
%assign SSD_COL SSD_COL + 1
%endrep
%undef SSD_COL

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_64x32( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Four 16-pixel-wide columns of two stacked 16x16 tiles each.
;   r1        stride1 doubled (int16 elements -> bytes)
;   r4        2 * stride1: used inside the helper and as the step between
;             tiles (the helper stops 14 rows into each tile)
;   r5 / r6   saved src1 / src2, used to rebase each further column
;   m6        zero, required by the helper's byte->word unpack
;   m7        accumulator, folded and returned in eax
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_64x32, 4, 7, 8, src1, stride1, src2, stride2
    mov     r5, r0
    mov     r6, r2
    add     r1, r1
    lea     r4, [2 * r1]
    pxor    m7, m7
    pxor    m6, m6

%assign SSD_COL 0
%rep 4
%if SSD_COL > 0
    ; each column is 32 bytes of int16 in src1 and 16 bytes in src2
    lea     r0, [r5 + 32 * SSD_COL]
    lea     r2, [r6 + 16 * SSD_COL]
%endif
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal
%assign SSD_COL SSD_COL + 1
%endrep
%undef SSD_COL

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_64x48( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Four 16-pixel-wide columns of three stacked 16x16 tiles each.
;   r1        stride1 doubled (int16 elements -> bytes)
;   r4        2 * stride1: used inside the helper and as the step between
;             tiles (the helper stops 14 rows into each tile)
;   r5 / r6   saved src1 / src2, used to rebase each further column
;   m6        zero, required by the helper's byte->word unpack
;   m7        accumulator, folded and returned in eax
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_64x48, 4, 7, 8, src1, stride1, src2, stride2
    mov     r5, r0
    mov     r6, r2
    add     r1, r1
    lea     r4, [2 * r1]
    pxor    m7, m7
    pxor    m6, m6

%assign SSD_COL 0
%rep 4
%if SSD_COL > 0
    ; each column is 32 bytes of int16 in src1 and 16 bytes in src2
    lea     r0, [r5 + 32 * SSD_COL]
    lea     r2, [r6 + 16 * SSD_COL]
%endif
    call    pixel_ssd_sp_16x16_internal
%rep 2
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal
%endrep
%assign SSD_COL SSD_COL + 1
%endrep
%undef SSD_COL

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_64x64( int16_t *, intptr_t, uint8_t *, intptr_t )
;
; Four 16-pixel-wide columns of four stacked 16x16 tiles each.
;   r1        stride1 doubled (int16 elements -> bytes)
;   r4        2 * stride1: used inside the helper and as the step between
;             tiles (the helper stops 14 rows into each tile)
;   r5 / r6   saved src1 / src2, used to rebase each further column
;   m6        zero, required by the helper's byte->word unpack
;   m7        accumulator, folded and returned in eax
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_64x64, 4, 7, 8, src1, stride1, src2, stride2
    mov     r5, r0
    mov     r6, r2
    add     r1, r1
    lea     r4, [2 * r1]
    pxor    m7, m7
    pxor    m6, m6

%assign SSD_COL 0
%rep 4
%if SSD_COL > 0
    ; each column is 32 bytes of int16 in src1 and 16 bytes in src2
    lea     r0, [r5 + 32 * SSD_COL]
    lea     r2, [r6 + 16 * SSD_COL]
%endif
    call    pixel_ssd_sp_16x16_internal
%rep 3
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal
%endrep
%assign SSD_COL SSD_COL + 1
%endrep
%undef SSD_COL

    HADDD   m7, m1
    movd    eax, m7
    RET
|
|
|
|
|
|
;-----------------------------------------------------------------------------
|
|
; int pixel_ssd_s( int16_t *ref, intptr_t i_stride )
|
|
;-----------------------------------------------------------------------------
|
|
; pixel_ssd_s_4: sum of squares of a 4x4 block of int16 values.
;   r0 = block pointer, r1 = row stride (doubled below: elements -> bytes)
;   Each register holds two rows (8 bytes in each half).
;   Result: the four dword partial sums in m0 are folded and returned in eax.
INIT_XMM sse2
cglobal pixel_ssd_s_4, 2,2,2
    add     r1, r1

    ; rows 0-1
    movh    m0, [r0]
    movhps  m0, [r0 + r1]
    pmaddwd m0, m0

    ; rows 2-3
    lea     r0, [r0 + 2 * r1]
    movh    m1, [r0]
    movhps  m1, [r0 + r1]
    pmaddwd m1, m1

    paddd   m0, m1

    ; horizontal reduction and return
    HADDD   m0, m1
    movd    eax, m0
    RET
|
|
|
|
|
|
; pixel_ssd_s_8: sum of squares of an 8x8 block of int16 values.
;   r0 = block pointer, r1 = row stride (doubled below: elements -> bytes)
;   r2 = 3 * r1, to reach the fourth row of each 4-row group
;   Rows 0-3 are summed in m0, rows 4-7 in m4, then combined.
;   Result: the four dword partial sums in m0 are folded and returned in eax.
INIT_XMM sse2
cglobal pixel_ssd_s_8, 2,3,5
    add     r1, r1
    lea     r2, [3 * r1]

    ; ---- rows 0-3 ----
    movu    m0, [r0]
    movu    m1, [r0 + r1]
    pmaddwd m0, m0
    pmaddwd m1, m1
    paddd   m0, m1
    movu    m2, [r0 + 2 * r1]
    movu    m3, [r0 + r2]
    pmaddwd m2, m2
    pmaddwd m3, m3
    paddd   m2, m3
    paddd   m0, m2

    ; ---- rows 4-7 ----
    lea     r0, [r0 + 4 * r1]
    movu    m4, [r0]
    movu    m1, [r0 + r1]
    pmaddwd m4, m4
    pmaddwd m1, m1
    paddd   m4, m1
    movu    m2, [r0 + 2 * r1]
    movu    m3, [r0 + r2]
    pmaddwd m2, m2
    pmaddwd m3, m3
    paddd   m2, m3
    paddd   m4, m2

    paddd   m0, m4

    ; horizontal reduction and return
    HADDD   m0, m1
    movd    eax, m0
    RET
|
|
|
|
|
|
; pixel_ssd_s_16 (SSE2): sum of squares of a 16x16 block of int16 values.
;   r0  = block pointer, r1 = row stride (doubled below: elements -> bytes)
;   r2d = loop counter: 4 iterations of 4 rows each
;   m0  = running dword sums, folded and returned in eax
; A 16-element row is two registers wide, so each pass of the unrolled body
; below consumes two rows.
INIT_XMM sse2
cglobal pixel_ssd_s_16, 2,3,5
    add     r1, r1
    pxor    m0, m0
    mov     r2d, 4

.loop:
%rep 2
    movu    m1, [r0]
    movu    m2, [r0 + mmsize]
    movu    m3, [r0 + r1]
    movu    m4, [r0 + r1 + mmsize]
    lea     r0, [r0 + 2 * r1]

    pmaddwd m1, m1
    pmaddwd m2, m2
    pmaddwd m3, m3
    pmaddwd m4, m4
    paddd   m1, m2
    paddd   m3, m4
    paddd   m1, m3
    paddd   m0, m1
%endrep

    dec     r2d
    jnz     .loop

    ; horizontal reduction and return
    HADDD   m0, m1
    movd    eax, m0
    RET
|
|
|
|
|
|
; pixel_ssd_s_32 (SSE2): sum of squares of a 32x32 block of int16 values.
;   r0  = block pointer, r1 = row stride (doubled below: elements -> bytes)
;   r2d = loop counter: 16 iterations of 2 rows each
;   m0  = running dword sums, folded and returned in eax
; A 32-element row is four registers wide, so each pass of the unrolled body
; below consumes exactly one row.
INIT_XMM sse2
cglobal pixel_ssd_s_32, 2,3,5
    add     r1, r1
    pxor    m0, m0
    mov     r2d, 16

.loop:
%rep 2
    movu    m1, [r0 + 0 * mmsize]
    movu    m2, [r0 + 1 * mmsize]
    movu    m3, [r0 + 2 * mmsize]
    movu    m4, [r0 + 3 * mmsize]
    add     r0, r1

    pmaddwd m1, m1
    pmaddwd m2, m2
    pmaddwd m3, m3
    pmaddwd m4, m4
    paddd   m1, m2
    paddd   m3, m4
    paddd   m1, m3
    paddd   m0, m1
%endrep

    dec     r2d
    jnz     .loop

    ; horizontal reduction and return
    HADDD   m0, m1
    movd    eax, m0
    RET
|
|
|
|
; pixel_ssd_s_16 (AVX2): sum of squares of a 16x16 block of int16 values.
;   r0  = block pointer, r1 = row stride (doubled below: elements -> bytes)
;   r3  = 3 * r1, to reach the fourth row of each group
;   r2d = loop counter: 16/4 iterations of 4 rows each
;   m0  = running dword sums, folded and returned in eax
; One 32-byte register holds a complete 16-element row.
INIT_YMM avx2
cglobal pixel_ssd_s_16, 2,4,5
    add     r1, r1
    lea     r3, [3 * r1]
    pxor    m0, m0
    mov     r2d, 16/4

.loop:
    movu    m1, [r0]
    movu    m2, [r0 + r1]
    movu    m3, [r0 + 2 * r1]
    movu    m4, [r0 + r3]

    pmaddwd m1, m1
    pmaddwd m2, m2
    paddd   m1, m2
    pmaddwd m3, m3
    pmaddwd m4, m4
    paddd   m3, m4
    paddd   m1, m3
    paddd   m0, m1

    lea     r0, [r0 + 4 * r1]
    dec     r2d
    jnz     .loop

    ; horizontal reduction and return
    HADDD   m0, m1
    movd    eax, xm0
    RET
|
|
|
|
; pixel_ssd_s_32 (AVX2): sum of squares of a 32x32 block of int16 values.
;   r0  = block pointer, r1 = row stride (doubled below: elements -> bytes)
;   r3  = 3 * r1, to reach the fourth row of each group
;   r2d = loop counter: 8 iterations of 4 rows each
;   m0  = running dword sums, folded and returned in eax
; A 32-element row is two 32-byte registers wide.
INIT_YMM avx2
cglobal pixel_ssd_s_32, 2,4,5
    add     r1, r1
    lea     r3, [3 * r1]
    pxor    m0, m0
    mov     r2d, 8

.loop:
    ; rows 0-1 of this group
    movu    m1, [r0]
    movu    m2, [r0 + mmsize]
    movu    m3, [r0 + r1]
    movu    m4, [r0 + r1 + mmsize]
    pmaddwd m1, m1
    pmaddwd m2, m2
    paddd   m1, m2
    pmaddwd m3, m3
    pmaddwd m4, m4
    paddd   m3, m4
    paddd   m1, m3
    paddd   m0, m1

    ; rows 2-3 of this group
    movu    m1, [r0 + 2 * r1]
    movu    m2, [r0 + 2 * r1 + mmsize]
    movu    m3, [r0 + r3]
    movu    m4, [r0 + r3 + mmsize]
    pmaddwd m1, m1
    pmaddwd m2, m2
    paddd   m1, m2
    pmaddwd m3, m3
    pmaddwd m4, m4
    paddd   m3, m4
    paddd   m1, m3
    paddd   m0, m1

    lea     r0, [r0 + 4 * r1]
    dec     r2d
    jnz     .loop

    ; horizontal reduction and return
    HADDD   m0, m1
    movd    eax, xm0
    RET
|