; libbpg/x265/source/common/x86/pixel-util8.asm
;*****************************************************************************
;* Copyright (C) 2013 x265 project
;*
;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
;* Nabajit Deka <nabajit@multicorewareinc.com>
;* Rajesh Paulraj <rajesh@multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************/
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA 32
%if BIT_DEPTH == 12
ssim_c1: times 4 dd 107321.76 ; .01*.01*4095*4095*64
ssim_c2: times 4 dd 60851437.92 ; .03*.03*4095*4095*64*63
pf_64: times 4 dd 64.0
pf_128: times 4 dd 128.0
%elif BIT_DEPTH == 10
ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64
ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
pf_64: times 4 dd 64.0
pf_128: times 4 dd 128.0
%elif BIT_DEPTH == 9
ssim_c1: times 4 dd 1671 ; .01*.01*511*511*64
ssim_c2: times 4 dd 947556 ; .03*.03*511*511*64*63
%else ; 8-bit
ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
%endif
mask_ff: times 16 db 0xff
times 16 db 0
deinterleave_shuf: times 2 db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
deinterleave_word_shuf: times 2 db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
hmulw_16p: times 8 dw 1
times 4 dw 1, -1
SECTION .text
cextern pw_1
cextern pw_0_15
cextern pb_1
cextern pb_128
cextern pw_00ff
cextern pw_1023
cextern pw_3fff
cextern pw_2000
cextern pw_pixel_max
cextern pd_1
cextern pd_32767
cextern pd_n32768
cextern pb_2
cextern pb_4
cextern pb_8
cextern pb_15
cextern pb_16
cextern pb_32
cextern pb_64
cextern hmul_16p
cextern trans8_shuf
cextern_naked private_prefix %+ _entropyStateBits
cextern pb_movemask
;-----------------------------------------------------------------------------
; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
;-----------------------------------------------------------------------------
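; The getResidual kernels below compute residual[x] = fenc[x] - pred[x] over an
; NxN block (N = 4/8/16/32), with all three buffers sharing the same stride.
; A hedged scalar C sketch of the operation (illustrative names, not the
; upstream C primitive verbatim; `pixel` is uint8_t for 8-bit builds and
; uint16_t when HIGH_BIT_DEPTH is set):
;
;   #include <stdint.h>
;   typedef uint8_t pixel;                    /* uint16_t for HIGH_BIT_DEPTH */
;
;   static void getResidual_ref(const pixel* fenc, const pixel* pred,
;                               int16_t* residual, intptr_t stride, int blockSize)
;   {
;       for (int y = 0; y < blockSize; y++)
;       {
;           for (int x = 0; x < blockSize; x++)
;               residual[x] = (int16_t)(fenc[x] - pred[x]);
;           fenc     += stride;
;           pred     += stride;
;           residual += stride;
;       }
;   }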
INIT_XMM sse2
%if HIGH_BIT_DEPTH
cglobal getResidual4, 4,4,4
add r3, r3
; row 0-1
movh m0, [r0]
movh m1, [r0 + r3]
movh m2, [r1]
movh m3, [r1 + r3]
punpcklqdq m0, m1
punpcklqdq m2, m3
psubw m0, m2
movh [r2], m0
movhps [r2 + r3], m0
lea r0, [r0 + r3 * 2]
lea r1, [r1 + r3 * 2]
lea r2, [r2 + r3 * 2]
; row 2-3
movh m0, [r0]
movh m1, [r0 + r3]
movh m2, [r1]
movh m3, [r1 + r3]
punpcklqdq m0, m1
punpcklqdq m2, m3
psubw m0, m2
movh [r2], m0
movhps [r2 + r3], m0
RET
%else
cglobal getResidual4, 4,4,5
pxor m0, m0
; row 0-1
movd m1, [r0]
movd m2, [r0 + r3]
movd m3, [r1]
movd m4, [r1 + r3]
punpckldq m1, m2
punpcklbw m1, m0
punpckldq m3, m4
punpcklbw m3, m0
psubw m1, m3
movh [r2], m1
movhps [r2 + r3 * 2], m1
lea r0, [r0 + r3 * 2]
lea r1, [r1 + r3 * 2]
lea r2, [r2 + r3 * 4]
; row 2-3
movd m1, [r0]
movd m2, [r0 + r3]
movd m3, [r1]
movd m4, [r1 + r3]
punpckldq m1, m2
punpcklbw m1, m0
punpckldq m3, m4
punpcklbw m3, m0
psubw m1, m3
movh [r2], m1
movhps [r2 + r3 * 2], m1
RET
%endif
INIT_XMM sse2
%if HIGH_BIT_DEPTH
cglobal getResidual8, 4,4,4
add r3, r3
%assign x 0
%rep 8/2
; row 0-1
movu m1, [r0]
movu m2, [r0 + r3]
movu m3, [r1]
movu m4, [r1 + r3]
psubw m1, m3
psubw m2, m4
movu [r2], m1
movu [r2 + r3], m2
%assign x x+1
%if (x != 4)
lea r0, [r0 + r3 * 2]
lea r1, [r1 + r3 * 2]
lea r2, [r2 + r3 * 2]
%endif
%endrep
RET
%else
cglobal getResidual8, 4,4,5
pxor m0, m0
%assign x 0
%rep 8/2
; row 0-1
movh m1, [r0]
movh m2, [r0 + r3]
movh m3, [r1]
movh m4, [r1 + r3]
punpcklbw m1, m0
punpcklbw m2, m0
punpcklbw m3, m0
punpcklbw m4, m0
psubw m1, m3
psubw m2, m4
movu [r2], m1
movu [r2 + r3 * 2], m2
%assign x x+1
%if (x != 4)
lea r0, [r0 + r3 * 2]
lea r1, [r1 + r3 * 2]
lea r2, [r2 + r3 * 4]
%endif
%endrep
RET
%endif
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal getResidual16, 4,5,6
add r3, r3
mov r4d, 16/4
.loop:
; row 0-1
movu m0, [r0]
movu m1, [r0 + 16]
movu m2, [r0 + r3]
movu m3, [r0 + r3 + 16]
movu m4, [r1]
movu m5, [r1 + 16]
psubw m0, m4
psubw m1, m5
movu m4, [r1 + r3]
movu m5, [r1 + r3 + 16]
psubw m2, m4
psubw m3, m5
lea r0, [r0 + r3 * 2]
lea r1, [r1 + r3 * 2]
movu [r2], m0
movu [r2 + 16], m1
movu [r2 + r3], m2
movu [r2 + r3 + 16], m3
lea r2, [r2 + r3 * 2]
; row 2-3
movu m0, [r0]
movu m1, [r0 + 16]
movu m2, [r0 + r3]
movu m3, [r0 + r3 + 16]
movu m4, [r1]
movu m5, [r1 + 16]
psubw m0, m4
psubw m1, m5
movu m4, [r1 + r3]
movu m5, [r1 + r3 + 16]
psubw m2, m4
psubw m3, m5
movu [r2], m0
movu [r2 + 16], m1
movu [r2 + r3], m2
movu [r2 + r3 + 16], m3
dec r4d
lea r0, [r0 + r3 * 2]
lea r1, [r1 + r3 * 2]
lea r2, [r2 + r3 * 2]
jnz .loop
RET
%else
INIT_XMM sse4
cglobal getResidual16, 4,5,8
mov r4d, 16/4
pxor m0, m0
.loop:
; row 0-1
movu m1, [r0]
movu m2, [r0 + r3]
movu m3, [r1]
movu m4, [r1 + r3]
pmovzxbw m5, m1
punpckhbw m1, m0
pmovzxbw m6, m2
punpckhbw m2, m0
pmovzxbw m7, m3
punpckhbw m3, m0
psubw m5, m7
psubw m1, m3
pmovzxbw m7, m4
punpckhbw m4, m0
psubw m6, m7
psubw m2, m4
movu [r2], m5
movu [r2 + 16], m1
movu [r2 + r3 * 2], m6
movu [r2 + r3 * 2 + 16], m2
lea r0, [r0 + r3 * 2]
lea r1, [r1 + r3 * 2]
lea r2, [r2 + r3 * 4]
; row 2-3
movu m1, [r0]
movu m2, [r0 + r3]
movu m3, [r1]
movu m4, [r1 + r3]
pmovzxbw m5, m1
punpckhbw m1, m0
pmovzxbw m6, m2
punpckhbw m2, m0
pmovzxbw m7, m3
punpckhbw m3, m0
psubw m5, m7
psubw m1, m3
pmovzxbw m7, m4
punpckhbw m4, m0
psubw m6, m7
psubw m2, m4
movu [r2], m5
movu [r2 + 16], m1
movu [r2 + r3 * 2], m6
movu [r2 + r3 * 2 + 16], m2
dec r4d
lea r0, [r0 + r3 * 2]
lea r1, [r1 + r3 * 2]
lea r2, [r2 + r3 * 4]
jnz .loop
RET
%endif
%if HIGH_BIT_DEPTH
INIT_YMM avx2
cglobal getResidual16, 4,4,5
add r3, r3
pxor m0, m0
%assign x 0
%rep 16/2
movu m1, [r0]
movu m2, [r0 + r3]
movu m3, [r1]
movu m4, [r1 + r3]
psubw m1, m3
psubw m2, m4
movu [r2], m1
movu [r2 + r3], m2
%assign x x+1
%if (x != 8)
lea r0, [r0 + r3 * 2]
lea r1, [r1 + r3 * 2]
lea r2, [r2 + r3 * 2]
%endif
%endrep
RET
%else
INIT_YMM avx2
cglobal getResidual16, 4,5,8
lea r4, [r3 * 2]
add r4d, r3d
%assign x 0
%rep 4
pmovzxbw m0, [r0]
pmovzxbw m1, [r0 + r3]
pmovzxbw m2, [r0 + r3 * 2]
pmovzxbw m3, [r0 + r4]
pmovzxbw m4, [r1]
pmovzxbw m5, [r1 + r3]
pmovzxbw m6, [r1 + r3 * 2]
pmovzxbw m7, [r1 + r4]
psubw m0, m4
psubw m1, m5
psubw m2, m6
psubw m3, m7
movu [r2], m0
movu [r2 + r3 * 2], m1
movu [r2 + r3 * 2 * 2], m2
movu [r2 + r4 * 2], m3
%assign x x+1
%if (x != 4)
lea r0, [r0 + r3 * 2 * 2]
lea r1, [r1 + r3 * 2 * 2]
lea r2, [r2 + r3 * 4 * 2]
%endif
%endrep
RET
%endif
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal getResidual32, 4,5,6
add r3, r3
mov r4d, 32/2
.loop:
; row 0
movu m0, [r0]
movu m1, [r0 + 16]
movu m2, [r0 + 32]
movu m3, [r0 + 48]
movu m4, [r1]
movu m5, [r1 + 16]
psubw m0, m4
psubw m1, m5
movu m4, [r1 + 32]
movu m5, [r1 + 48]
psubw m2, m4
psubw m3, m5
movu [r2], m0
movu [r2 + 16], m1
movu [r2 + 32], m2
movu [r2 + 48], m3
; row 1
movu m0, [r0 + r3]
movu m1, [r0 + r3 + 16]
movu m2, [r0 + r3 + 32]
movu m3, [r0 + r3 + 48]
movu m4, [r1 + r3]
movu m5, [r1 + r3 + 16]
psubw m0, m4
psubw m1, m5
movu m4, [r1 + r3 + 32]
movu m5, [r1 + r3 + 48]
psubw m2, m4
psubw m3, m5
movu [r2 + r3], m0
movu [r2 + r3 + 16], m1
movu [r2 + r3 + 32], m2
movu [r2 + r3 + 48], m3
dec r4d
lea r0, [r0 + r3 * 2]
lea r1, [r1 + r3 * 2]
lea r2, [r2 + r3 * 2]
jnz .loop
RET
%else
INIT_XMM sse4
cglobal getResidual32, 4,5,7
mov r4d, 32/2
pxor m0, m0
.loop:
movu m1, [r0]
movu m2, [r0 + 16]
movu m3, [r1]
movu m4, [r1 + 16]
pmovzxbw m5, m1
punpckhbw m1, m0
pmovzxbw m6, m3
punpckhbw m3, m0
psubw m5, m6
psubw m1, m3
movu [r2 + 0 * 16], m5
movu [r2 + 1 * 16], m1
pmovzxbw m5, m2
punpckhbw m2, m0
pmovzxbw m6, m4
punpckhbw m4, m0
psubw m5, m6
psubw m2, m4
movu [r2 + 2 * 16], m5
movu [r2 + 3 * 16], m2
movu m1, [r0 + r3]
movu m2, [r0 + r3 + 16]
movu m3, [r1 + r3]
movu m4, [r1 + r3 + 16]
pmovzxbw m5, m1
punpckhbw m1, m0
pmovzxbw m6, m3
punpckhbw m3, m0
psubw m5, m6
psubw m1, m3
movu [r2 + r3 * 2 + 0 * 16], m5
movu [r2 + r3 * 2 + 1 * 16], m1
pmovzxbw m5, m2
punpckhbw m2, m0
pmovzxbw m6, m4
punpckhbw m4, m0
psubw m5, m6
psubw m2, m4
movu [r2 + r3 * 2 + 2 * 16], m5
movu [r2 + r3 * 2 + 3 * 16], m2
dec r4d
lea r0, [r0 + r3 * 2]
lea r1, [r1 + r3 * 2]
lea r2, [r2 + r3 * 4]
jnz .loop
RET
%endif
%if HIGH_BIT_DEPTH
INIT_YMM avx2
cglobal getResidual32, 4,4,5
add r3, r3
pxor m0, m0
%assign x 0
%rep 32
movu m1, [r0]
movu m2, [r0 + 32]
movu m3, [r1]
movu m4, [r1 + 32]
psubw m1, m3
psubw m2, m4
movu [r2], m1
movu [r2 + 32], m2
%assign x x+1
%if (x != 32)
lea r0, [r0 + r3]
lea r1, [r1 + r3]
lea r2, [r2 + r3]
%endif
%endrep
RET
%else
INIT_YMM avx2
cglobal getResidual32, 4,5,8
lea r4, [r3 * 2]
%assign x 0
%rep 16
pmovzxbw m0, [r0]
pmovzxbw m1, [r0 + 16]
pmovzxbw m2, [r0 + r3]
pmovzxbw m3, [r0 + r3 + 16]
pmovzxbw m4, [r1]
pmovzxbw m5, [r1 + 16]
pmovzxbw m6, [r1 + r3]
pmovzxbw m7, [r1 + r3 + 16]
psubw m0, m4
psubw m1, m5
psubw m2, m6
psubw m3, m7
movu [r2 + 0 ], m0
movu [r2 + 32], m1
movu [r2 + r4 + 0], m2
movu [r2 + r4 + 32], m3
%assign x x+1
%if (x != 16)
lea r0, [r0 + r3 * 2]
lea r1, [r1 + r3 * 2]
lea r2, [r2 + r3 * 4]
%endif
%endrep
RET
%endif
;-----------------------------------------------------------------------------
; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
;-----------------------------------------------------------------------------
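; Per coefficient the SIMD loops below compute level1 = (|coef| * quantCoeff + add) >> qBits,
; store a rounding residue in deltaU (used later for sign-bit hiding), write the
; re-signed, 16-bit-saturated level to qCoef, and return the number of nonzero
; levels. A hedged C sketch (illustrative names, not the upstream C primitive
; verbatim):
;
;   #include <stdint.h>
;   #include <stdlib.h>
;
;   static uint32_t quant_ref(const int16_t* coef, const int32_t* quantCoeff,
;                             int32_t* deltaU, int16_t* qCoef,
;                             int qBits, int add, int numCoeff)
;   {
;       int qBits8 = qBits - 8;
;       uint32_t numSig = 0;
;       for (int i = 0; i < numCoeff; i++)
;       {
;           int level    = coef[i];
;           int tmpLevel = abs(level) * quantCoeff[i];
;           int level1   = (tmpLevel + add) >> qBits;
;           /* same residue the asm stores: (tmpLevel >> qBits8) - (level1 << 8) */
;           deltaU[i] = (tmpLevel >> qBits8) - (level1 << 8);
;           numSig   += (level1 != 0);
;           int q = (level < 0 ? -level1 : level1);
;           if (q >  32767) q =  32767;       /* packssdw saturation */
;           if (q < -32768) q = -32768;
;           qCoef[i] = (int16_t)q;
;       }
;       return numSig;
;   }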
INIT_XMM sse4
cglobal quant, 5,6,8
; fill qbits
movd m4, r4d ; m4 = qbits
; fill qbits-8
sub r4d, 8
movd m6, r4d ; m6 = qbits8
; fill offset
movd m5, r5m
pshufd m5, m5, 0 ; m5 = add
lea r5, [pd_1]
mov r4d, r6m
shr r4d, 3
pxor m7, m7 ; m7 = numZero
.loop:
; 4 coeff
pmovsxwd m0, [r0] ; m0 = level
pabsd m1, m0
pmulld m1, [r1] ; m1 = tmpLevel1
paddd m2, m1, m5
psrad m2, m4 ; m2 = level1
pslld m3, m2, 8
psrad m1, m6
psubd m1, m3 ; m1 = deltaU1
movu [r2], m1
psignd m3, m2, m0
pminud m2, [r5]
paddd m7, m2
packssdw m3, m3
movh [r3], m3
; 4 coeff
pmovsxwd m0, [r0 + 8] ; m0 = level
pabsd m1, m0
pmulld m1, [r1 + 16] ; m1 = tmpLevel1
paddd m2, m1, m5
psrad m2, m4 ; m2 = level1
pslld m3, m2, 8
psrad m1, m6
psubd m1, m3 ; m1 = deltaU1
movu [r2 + 16], m1
psignd m3, m2, m0
pminud m2, [r5]
paddd m7, m2
packssdw m3, m3
movh [r3 + 8], m3
add r0, 16
add r1, 32
add r2, 32
add r3, 16
dec r4d
jnz .loop
pshufd m0, m7, 00001110b
paddd m0, m7
pshufd m1, m0, 00000001b
paddd m0, m1
movd eax, m0
RET
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal quant, 5,6,9
; fill qbits
movd xm4, r4d ; m4 = qbits
; fill qbits-8
sub r4d, 8
movd xm6, r4d ; m6 = qbits8
; fill offset
%if UNIX64 == 0
vpbroadcastd m5, r5m ; m5 = add
%else ; Mac
movd xm5, r5m
vpbroadcastd m5, xm5 ; m5 = add
%endif
lea r5, [pw_1]
mov r4d, r6m
shr r4d, 4
pxor m7, m7 ; m7 = numZero
.loop:
; 8 coeff
pmovsxwd m0, [r0] ; m0 = level
pabsd m1, m0
pmulld m1, [r1] ; m1 = tmpLevel1
paddd m2, m1, m5
psrad m2, xm4 ; m2 = level1
pslld m3, m2, 8
psrad m1, xm6
psubd m1, m3 ; m1 = deltaU1
movu [r2], m1
psignd m2, m0
; 8 coeff
pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
pabsd m1, m0
pmulld m1, [r1 + mmsize] ; m1 = tmpLevel1
paddd m3, m1, m5
psrad m3, xm4 ; m3 = level1
pslld m8, m3, 8
psrad m1, xm6
psubd m1, m8 ; m1 = deltaU1
movu [r2 + mmsize], m1
psignd m3, m0
packssdw m2, m3
vpermq m2, m2, q3120
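; packssdw on YMM registers packs within each 128-bit lane, so the two halves of
; the result come out lane-interleaved; the vpermq with q3120 above restores the
; linear coefficient order before the 16 words are stored.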
movu [r3], m2
; count non-zero coeff
; TODO: popcnt is faster, but not all target CPUs support it
pminuw m2, [r5]
paddw m7, m2
add r0, mmsize
add r1, mmsize*2
add r2, mmsize*2
add r3, mmsize
dec r4d
jnz .loop
; sum count
xorpd m0, m0
psadbw m7, m0
vextracti128 xm1, m7, 1
paddd xm7, xm1
movhlps xm0, xm7
paddd xm7, xm0
movd eax, xm7
RET
%else ; ARCH_X86_64 == 1
INIT_YMM avx2
cglobal quant, 5,6,8
; fill qbits
movd xm4, r4d ; m4 = qbits
; fill qbits-8
sub r4d, 8
movd xm6, r4d ; m6 = qbits8
; fill offset
%if UNIX64 == 0
vpbroadcastd m5, r5m ; m5 = add
%else ; Mac
movd xm5, r5m
vpbroadcastd m5, xm5 ; m5 = add
%endif
lea r5, [pd_1]
mov r4d, r6m
shr r4d, 4
pxor m7, m7 ; m7 = numZero
.loop:
; 8 coeff
pmovsxwd m0, [r0] ; m0 = level
pabsd m1, m0
pmulld m1, [r1] ; m1 = tmpLevel1
paddd m2, m1, m5
psrad m2, xm4 ; m2 = level1
pslld m3, m2, 8
psrad m1, xm6
psubd m1, m3 ; m1 = deltaU1
movu [r2], m1
psignd m3, m2, m0
pminud m2, [r5]
paddd m7, m2
packssdw m3, m3
vpermq m3, m3, q0020
movu [r3], xm3
; 8 coeff
pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
pabsd m1, m0
pmulld m1, [r1 + mmsize] ; m1 = tmpLevel1
paddd m2, m1, m5
psrad m2, xm4 ; m2 = level1
pslld m3, m2, 8
psrad m1, xm6
psubd m1, m3 ; m1 = deltaU1
movu [r2 + mmsize], m1
psignd m3, m2, m0
pminud m2, [r5]
paddd m7, m2
packssdw m3, m3
vpermq m3, m3, q0020
movu [r3 + mmsize/2], xm3
add r0, mmsize
add r1, mmsize*2
add r2, mmsize*2
add r3, mmsize
dec r4d
jnz .loop
xorpd m0, m0
psadbw m7, m0
vextracti128 xm1, m7, 1
paddd xm7, xm1
movhlps xm0, xm7
paddd xm7, xm0
movd eax, xm7
RET
%endif ; ARCH_X86_64 == 1
;-----------------------------------------------------------------------------
; uint32_t nquant(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
;-----------------------------------------------------------------------------
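; Same quantization as quant() above but without the deltaU side output; the
; return value is again the number of nonzero quantized coefficients. A hedged
; C sketch (illustrative names, not the upstream C primitive verbatim):
;
;   #include <stdint.h>
;   #include <stdlib.h>
;
;   static uint32_t nquant_ref(const int16_t* coef, const int32_t* quantCoeff,
;                              int16_t* qCoef, int qBits, int add, int numCoeff)
;   {
;       uint32_t numSig = 0;
;       for (int i = 0; i < numCoeff; i++)
;       {
;           int level  = coef[i];
;           int level1 = (abs(level) * quantCoeff[i] + add) >> qBits;
;           qCoef[i]   = (int16_t)(level < 0 ? -level1 : level1); /* packssdw saturates */
;           numSig    += (qCoef[i] != 0);
;       }
;       return numSig;
;   }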
INIT_XMM sse4
cglobal nquant, 3,5,8
movd m6, r4m
mov r4d, r5m
pxor m7, m7 ; m7 = numZero
movd m5, r3m ; m5 = qbits
pshufd m6, m6, 0 ; m6 = add
mov r3d, r4d ; r3 = numCoeff
shr r4d, 3
.loop:
pmovsxwd m0, [r0] ; m0 = level
pmovsxwd m1, [r0 + 8] ; m1 = level
pabsd m2, m0
pmulld m2, [r1] ; m2 = tmpLevel1 * qcoeff
paddd m2, m6
psrad m2, m5 ; m2 = level1
psignd m2, m0
pabsd m3, m1
pmulld m3, [r1 + 16] ; m3 = tmpLevel1 * qcoeff
paddd m3, m6
psrad m3, m5 ; m3 = level1
psignd m3, m1
packssdw m2, m3
movu [r2], m2
add r0, 16
add r1, 32
add r2, 16
pxor m4, m4
pcmpeqw m2, m4
psubw m7, m2
dec r4d
jnz .loop
packuswb m7, m7
psadbw m7, m4
mov eax, r3d
movd r4d, m7
sub eax, r4d ; numSig
RET
INIT_YMM avx2
cglobal nquant, 3,5,7
%if UNIX64 == 0
vpbroadcastd m4, r4m
%else ; Mac
movd xm4, r4m
vpbroadcastd m4, xm4
%endif
vpbroadcastd m6, [pw_1]
mov r4d, r5m
pxor m5, m5 ; m5 = numZero
movd xm3, r3m ; m3 = qbits
mov r3d, r4d ; r3 = numCoeff
shr r4d, 4
.loop:
pmovsxwd m0, [r0] ; m0 = level
pabsd m1, m0
pmulld m1, [r1] ; m1 = tmpLevel1 * qcoeff
paddd m1, m4
psrad m1, xm3 ; m1 = level1
psignd m1, m0
pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
pabsd m2, m0
pmulld m2, [r1 + mmsize] ; m2 = tmpLevel1 * qcoeff
paddd m2, m4
psrad m2, xm3 ; m2 = level1
psignd m2, m0
packssdw m1, m2
vpermq m2, m1, q3120
movu [r2], m2
add r0, mmsize
add r1, mmsize * 2
add r2, mmsize
pminuw m1, m6
paddw m5, m1
dec r4d
jnz .loop
pxor m0, m0
psadbw m5, m0
vextracti128 xm0, m5, 1
paddd xm5, xm0
pshufd xm0, xm5, 2
paddd xm5, xm0
movd eax, xm5
RET
;-----------------------------------------------------------------------------
; void dequant_normal(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
;-----------------------------------------------------------------------------
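; A hedged C sketch of the scalar operation (illustrative names, not the
; upstream C primitive verbatim). The asm packs `scale` into the low word and
; add = 1 << (shift - 1) into the high word of a single dword (the bts sets bit
; shift + 15), so one pmaddwd against interleaved [coef, 1] word pairs computes
; coef * scale + add in a single step; for HIGH_BIT_DEPTH, an oversized scale is
; pre-shifted down by (BIT_DEPTH - 8) with a matching reduction of shift so it
; still fits in 16 bits.
;
;   #include <stdint.h>
;
;   static void dequant_normal_ref(const int16_t* quantCoef, int16_t* coef,
;                                  int num, int scale, int shift)
;   {
;       int add = 1 << (shift - 1);
;       for (int n = 0; n < num; n++)
;       {
;           int coeffQ = (quantCoef[n] * scale + add) >> shift;
;           if (coeffQ >  32767) coeffQ =  32767;   /* packssdw saturation */
;           if (coeffQ < -32768) coeffQ = -32768;
;           coef[n] = (int16_t)coeffQ;
;       }
;   }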
INIT_XMM sse4
cglobal dequant_normal, 5,5,5
mova m2, [pw_1]
%if HIGH_BIT_DEPTH
cmp r3d, 32767
jle .skip
shr r3d, (BIT_DEPTH - 8)
sub r4d, (BIT_DEPTH - 8)
.skip:
%endif
movd m0, r4d ; m0 = shift
add r4d, 15
bts r3d, r4d
movd m1, r3d
pshufd m1, m1, 0 ; m1 = dword [add scale]
; m0 = shift
; m1 = scale
; m2 = word [1]
.loop:
movu m3, [r0]
punpckhwd m4, m3, m2
punpcklwd m3, m2
pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add)
pmaddwd m4, m1
psrad m3, m0
psrad m4, m0
packssdw m3, m4
mova [r1], m3
add r0, 16
add r1, 16
sub r2d, 8
jnz .loop
RET
;----------------------------------------------------------------------------------------------------------------------
;void dequant_scaling(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)
;----------------------------------------------------------------------------------------------------------------------
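; On entry shift is biased by 4; when shift > per the 32-bit products are
; rounded and right-shifted by (shift - per), otherwise they are saturated to
; 16 bits and then left-shifted by (per - shift). A hedged C sketch
; (illustrative names, not the upstream C primitive verbatim):
;
;   #include <stdint.h>
;
;   static int16_t clamp16(int v)
;   {
;       return (int16_t)(v > 32767 ? 32767 : (v < -32768 ? -32768 : v));
;   }
;
;   static void dequant_scaling_ref(const int16_t* src, const int32_t* dequantCoef,
;                                   int16_t* dst, int num, int per, int shift)
;   {
;       shift += 4;
;       if (shift > per)
;       {
;           int add = 1 << (shift - per - 1);
;           for (int n = 0; n < num; n++)
;               dst[n] = clamp16((src[n] * dequantCoef[n] + add) >> (shift - per));
;       }
;       else
;       {
;           for (int n = 0; n < num; n++)
;               dst[n] = clamp16(clamp16(src[n] * dequantCoef[n]) << (per - shift));
;       }
;   }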
INIT_XMM sse4
cglobal dequant_scaling, 6,6,6
add r5d, 4
shr r3d, 3 ; num/8
cmp r5d, r4d
jle .skip
sub r5d, r4d
mova m0, [pd_1]
movd m1, r5d ; shift - per
dec r5d
movd m2, r5d ; shift - per - 1
pslld m0, m2 ; 1 << shift - per - 1
.part0:
pmovsxwd m2, [r0]
pmovsxwd m4, [r0 + 8]
movu m3, [r1]
movu m5, [r1 + 16]
pmulld m2, m3
pmulld m4, m5
paddd m2, m0
paddd m4, m0
psrad m2, m1
psrad m4, m1
packssdw m2, m4
movu [r2], m2
add r0, 16
add r1, 32
add r2, 16
dec r3d
jnz .part0
jmp .end
.skip:
sub r4d, r5d ; per - shift
movd m0, r4d
.part1:
pmovsxwd m2, [r0]
pmovsxwd m4, [r0 + 8]
movu m3, [r1]
movu m5, [r1 + 16]
pmulld m2, m3
pmulld m4, m5
packssdw m2, m4
pmovsxwd m1, m2
psrldq m2, 8
pmovsxwd m2, m2
pslld m1, m0
pslld m2, m0
packssdw m1, m2
movu [r2], m1
add r0, 16
add r1, 32
add r2, 16
dec r3d
jnz .part1
.end:
RET
;----------------------------------------------------------------------------------------------------------------------
;void dequant_scaling(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)
;----------------------------------------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal dequant_scaling, 6,6,6
add r5d, 4
shr r3d, 4 ; num/16
cmp r5d, r4d
jle .skip
sub r5d, r4d
mova m0, [pd_1]
movd xm1, r5d ; shift - per
dec r5d
movd xm2, r5d ; shift - per - 1
pslld m0, xm2 ; 1 << shift - per - 1
.part0:
pmovsxwd m2, [r0]
pmovsxwd m4, [r0 + 16]
movu m3, [r1]
movu m5, [r1 + 32]
pmulld m2, m3
pmulld m4, m5
paddd m2, m0
paddd m4, m0
psrad m2, xm1
psrad m4, xm1
packssdw m2, m4
vpermq m2, m2, 11011000b
movu [r2], m2
add r0, 32
add r1, 64
add r2, 32
dec r3d
jnz .part0
jmp .end
.skip:
sub r4d, r5d ; per - shift
movd xm0, r4d
.part1:
pmovsxwd m2, [r0]
pmovsxwd m4, [r0 + 16]
movu m3, [r1]
movu m5, [r1 + 32]
pmulld m2, m3
pmulld m4, m5
packssdw m2, m4
vextracti128 xm4, m2, 1
pmovsxwd m1, xm2
pmovsxwd m2, xm4
pslld m1, xm0
pslld m2, xm0
packssdw m1, m2
movu [r2], m1
add r0, 32
add r1, 64
add r2, 32
dec r3d
jnz .part1
.end:
RET
INIT_YMM avx2
cglobal dequant_normal, 5,5,7
vpbroadcastd m2, [pw_1] ; m2 = word [1]
vpbroadcastd m5, [pd_32767] ; m5 = dword [32767]
vpbroadcastd m6, [pd_n32768] ; m6 = dword [-32768]
%if HIGH_BIT_DEPTH
cmp r3d, 32767
jle .skip
shr r3d, (BIT_DEPTH - 8)
sub r4d, (BIT_DEPTH - 8)
.skip:
%endif
movd xm0, r4d ; m0 = shift
add r4d, -1+16
bts r3d, r4d
movd xm1, r3d
vpbroadcastd m1, xm1 ; m1 = dword [add scale]
; m0 = shift
; m1 = scale
; m2 = word [1]
shr r2d, 4
.loop:
movu m3, [r0]
punpckhwd m4, m3, m2
punpcklwd m3, m2
pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add)
pmaddwd m4, m1
psrad m3, xm0
psrad m4, xm0
pminsd m3, m5
pmaxsd m3, m6
pminsd m4, m5
pmaxsd m4, m6
packssdw m3, m4
mova [r1 + 0 * mmsize/2], xm3
vextracti128 [r1 + 1 * mmsize/2], m3, 1
add r0, mmsize
add r1, mmsize
dec r2d
jnz .loop
RET
;-----------------------------------------------------------------------------
; int x265_count_nonzero_4x4_sse2(const int16_t *quantCoeff);
;-----------------------------------------------------------------------------
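; Each count_nonzero_NxN variant returns the number of nonzero int16_t
; coefficients in the block (16, 64, 256 or 1024 values). The SSE2 versions
; start a per-byte counter at the iteration count (pb_1/pb_4/pb_16/pb_64), pack
; word pairs to bytes, and add the 0xFF (-1) produced by pcmpeqb for every zero
; coefficient, so the final psadbw sum is directly the nonzero count. A hedged
; C sketch (illustrative name):
;
;   #include <stdint.h>
;
;   static int count_nonzero_ref(const int16_t* quantCoeff, int numCoeff)
;   {
;       int count = 0;
;       for (int i = 0; i < numCoeff; i++)
;           count += (quantCoeff[i] != 0);
;       return count;
;   }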
INIT_XMM sse2
cglobal count_nonzero_4x4, 1,1,2
pxor m0, m0
mova m1, [r0 + 0]
packsswb m1, [r0 + 16]
pcmpeqb m1, m0
paddb m1, [pb_1]
psadbw m1, m0
pshufd m0, m1, 2
paddd m0, m1
movd eax, m0
RET
;-----------------------------------------------------------------------------
; int x265_count_nonzero_4x4_avx2(const int16_t *quantCoeff);
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal count_nonzero_4x4, 1,1,2
pxor m0, m0
movu m1, [r0]
pcmpeqw m1, m0
pmovmskb eax, m1
not eax
popcnt eax, eax
shr eax, 1
RET
;-----------------------------------------------------------------------------
; int x265_count_nonzero_8x8_sse2(const int16_t *quantCoeff);
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal count_nonzero_8x8, 1,1,3
pxor m0, m0
movu m1, [pb_4]
%rep 4
mova m2, [r0 + 0]
packsswb m2, [r0 + 16]
add r0, 32
pcmpeqb m2, m0
paddb m1, m2
%endrep
psadbw m1, m0
pshufd m0, m1, 2
paddd m0, m1
movd eax, m0
RET
;-----------------------------------------------------------------------------
; int x265_count_nonzero_8x8_avx2(const int16_t *quantCoeff);
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal count_nonzero_8x8, 1,1,3
pxor m0, m0
movu m1, [pb_2]
mova m2, [r0]
packsswb m2, [r0 + 32]
pcmpeqb m2, m0
paddb m1, m2
mova m2, [r0 + 64]
packsswb m2, [r0 + 96]
pcmpeqb m2, m0
paddb m1, m2
psadbw m1, m0
vextracti128 xm0, m1, 1
paddd m0, m1
pshufd m1, m0, 2
paddd m0, m1
movd eax, xm0
RET
;-----------------------------------------------------------------------------
; int x265_count_nonzero_16x16_sse2(const int16_t *quantCoeff);
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal count_nonzero_16x16, 1,1,3
pxor m0, m0
movu m1, [pb_16]
%rep 16
mova m2, [r0 + 0]
packsswb m2, [r0 + 16]
add r0, 32
pcmpeqb m2, m0
paddb m1, m2
%endrep
psadbw m1, m0
pshufd m0, m1, 2
paddd m0, m1
movd eax, m0
RET
;-----------------------------------------------------------------------------
; int x265_count_nonzero_16x16_avx2(const int16_t *quantCoeff);
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal count_nonzero_16x16, 1,1,3
pxor m0, m0
movu m1, [pb_8]
%assign x 0
%rep 8
mova m2, [r0 + x]
packsswb m2, [r0 + x + 32]
%assign x x+64
pcmpeqb m2, m0
paddb m1, m2
%endrep
psadbw m1, m0
vextracti128 xm0, m1, 1
paddd m0, m1
pshufd m1, m0, 2
paddd m0, m1
movd eax, xm0
RET
;-----------------------------------------------------------------------------
; int x265_count_nonzero_32x32_sse2(const int16_t *quantCoeff);
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal count_nonzero_32x32, 1,1,3
pxor m0, m0
movu m1, [pb_64]
%rep 64
mova m2, [r0 + 0]
packsswb m2, [r0 + 16]
add r0, 32
pcmpeqb m2, m0
paddb m1, m2
%endrep
psadbw m1, m0
pshufd m0, m1, 2
paddd m0, m1
movd eax, m0
RET
;-----------------------------------------------------------------------------
; int x265_count_nonzero_32x32_avx2(const int16_t *quantCoeff);
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal count_nonzero_32x32, 1,1,3
pxor m0, m0
movu m1, [pb_32]
%assign x 0
%rep 32
mova m2, [r0 + x]
packsswb m2, [r0 + x + 32]
%assign x x+64
pcmpeqb m2, m0
paddb m1, m2
%endrep
psadbw m1, m0
vextracti128 xm0, m1, 1
paddd m0, m1
pshufd m1, m0, 2
paddd m0, m1
movd eax, xm0
RET
;-----------------------------------------------------------------------------------------------------------------------------------------------
;void weight_pp(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
;-----------------------------------------------------------------------------------------------------------------------------------------------
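; Explicit weighted prediction on pixels. Conceptually each sample is promoted
; to 14-bit internal precision (<< (14 - BIT_DEPTH)) before weighting; the 8-bit
; kernels fold that promotion into the weight (w0 << 6), while the
; HIGH_BIT_DEPTH kernels absorb it into the round/shift terms. A hedged C sketch
; of the scalar operation (illustrative names; `pixel` and X265_DEPTH stand for
; the build's pixel type and bit depth):
;
;   #include <stdint.h>
;   typedef uint8_t pixel;                     /* uint16_t for HIGH_BIT_DEPTH */
;   #define X265_DEPTH 8
;
;   static void weight_pp_ref(const pixel* src, pixel* dst, intptr_t stride,
;                             int width, int height,
;                             int w0, int round, int shift, int offset)
;   {
;       const int correction = 14 - X265_DEPTH;
;       const int maxVal     = (1 << X265_DEPTH) - 1;
;       for (int y = 0; y < height; y++)
;       {
;           for (int x = 0; x < width; x++)
;           {
;               int v = (((src[x] << correction) * w0 + round) >> shift) + offset;
;               dst[x] = (pixel)(v < 0 ? 0 : (v > maxVal ? maxVal : v));
;           }
;           src += stride;
;           dst += stride;
;       }
;   }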
%if HIGH_BIT_DEPTH
INIT_XMM sse4
cglobal weight_pp, 4,7,7
%define correction (14 - BIT_DEPTH)
mova m6, [pw_pixel_max]
mov r6d, r6m
mov r4d, r4m
mov r5d, r5m
shl r6d, 16 - correction
or r6d, r5d ; assuming both w0 and round use at most 16 bits each
movd m0, r6d
pshufd m0, m0, 0 ; m0 = [w0, round]
mov r5d, r7m
sub r5d, correction
movd m1, r5d
movd m2, r8m
pshufd m2, m2, 0
mova m5, [pw_1]
sub r2d, r3d
add r2d, r2d
shr r3d, 4
.loopH:
mov r5d, r3d
.loopW:
movu m4, [r0]
punpcklwd m3, m4, m5
pmaddwd m3, m0
psrad m3, m1
paddd m3, m2 ; TODO: the offset could be folded into the round constant, but the dynamic range must be analyzed first
punpckhwd m4, m5
pmaddwd m4, m0
psrad m4, m1
paddd m4, m2
packusdw m3, m4
pminuw m3, m6
movu [r1], m3
movu m4, [r0 + mmsize]
punpcklwd m3, m4, m5
pmaddwd m3, m0
psrad m3, m1
paddd m3, m2
punpckhwd m4, m5
pmaddwd m4, m0
psrad m4, m1
paddd m4, m2
packusdw m3, m4
pminuw m3, m6
movu [r1 + mmsize], m3
add r0, 2 * mmsize
add r1, 2 * mmsize
dec r5d
jnz .loopW
add r0, r2
add r1, r2
dec r4d
jnz .loopH
RET
%else ; end of (HIGH_BIT_DEPTH == 1)
INIT_XMM sse4
cglobal weight_pp, 6,7,6
shl r5d, 6 ; m0 = [w0<<6]
mov r6d, r6m
shl r6d, 16
or r6d, r5d ; assuming both (w0<<6) and round use at most 16 bits each
movd m0, r6d
pshufd m0, m0, 0 ; m0 = [w0<<6, round]
movd m1, r7m
movd m2, r8m
pshufd m2, m2, 0
mova m5, [pw_1]
sub r2d, r3d
shr r3d, 4
.loopH:
mov r5d, r3d
.loopW:
pmovzxbw m4, [r0]
punpcklwd m3, m4, m5
pmaddwd m3, m0
psrad m3, m1
paddd m3, m2
punpckhwd m4, m5
pmaddwd m4, m0
psrad m4, m1
paddd m4, m2
packssdw m3, m4
packuswb m3, m3
movh [r1], m3
pmovzxbw m4, [r0 + 8]
punpcklwd m3, m4, m5
pmaddwd m3, m0
psrad m3, m1
paddd m3, m2
punpckhwd m4, m5
pmaddwd m4, m0
psrad m4, m1
paddd m4, m2
packssdw m3, m4
packuswb m3, m3
movh [r1 + 8], m3
add r0, 16
add r1, 16
dec r5d
jnz .loopW
lea r0, [r0 + r2]
lea r1, [r1 + r2]
dec r4d
jnz .loopH
RET
%endif ; end of (HIGH_BIT_DEPTH == 0)
%if HIGH_BIT_DEPTH
INIT_YMM avx2
cglobal weight_pp, 6, 7, 7
%define correction (14 - BIT_DEPTH)
mov r6d, r6m
shl r6d, 16 - correction
or r6d, r5d ; assuming both w0 and round use at most 16 bits each
movd xm0, r6d
vpbroadcastd m0, xm0
mov r5d, r7m
sub r5d, correction
movd xm1, r5d
vpbroadcastd m2, r8m
mova m5, [pw_1]
mova m6, [pw_pixel_max]
add r2d, r2d
add r3d, r3d
sub r2d, r3d
shr r3d, 5
.loopH:
mov r5d, r3d
.loopW:
movu m4, [r0]
punpcklwd m3, m4, m5
pmaddwd m3, m0
psrad m3, xm1
paddd m3, m2
punpckhwd m4, m5
pmaddwd m4, m0
psrad m4, xm1
paddd m4, m2
packusdw m3, m4
pminuw m3, m6
movu [r1], m3
add r0, 32
add r1, 32
dec r5d
jnz .loopW
lea r0, [r0 + r2]
lea r1, [r1 + r2]
dec r4d
jnz .loopH
%undef correction
RET
%else
INIT_YMM avx2
cglobal weight_pp, 6, 7, 6
shl r5d, 6 ; m0 = [w0<<6]
mov r6d, r6m
shl r6d, 16
or r6d, r5d ; assuming both (w0<<6) and round use at most 16 bits each
movd xm0, r6d
vpbroadcastd m0, xm0
movd xm1, r7m
vpbroadcastd m2, r8m
mova m5, [pw_1]
sub r2d, r3d
shr r3d, 4
.loopH:
mov r5d, r3d
.loopW:
pmovzxbw m4, [r0]
punpcklwd m3, m4, m5
pmaddwd m3, m0
psrad m3, xm1
paddd m3, m2
punpckhwd m4, m5
pmaddwd m4, m0
psrad m4, xm1
paddd m4, m2
packssdw m3, m4
vextracti128 xm4, m3, 1
packuswb xm3, xm4
movu [r1], xm3
add r0, 16
add r1, 16
dec r5d
jnz .loopW
lea r0, [r0 + r2]
lea r1, [r1 + r2]
dec r4d
jnz .loopH
RET
%endif
;-------------------------------------------------------------------------------------------------------------------------------------------------
;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
;-------------------------------------------------------------------------------------------------------------------------------------------------
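; Weighted conversion of 16-bit intermediate (filter-precision) samples back to
; pixels; pw_2000 (0x2000 = 8192) is the internal offset added back before
; weighting, and src/dst use independent strides. A hedged C sketch
; (illustrative names; `pixel` and X265_DEPTH as in the weight_pp sketch above):
;
;   static void weight_sp_ref(const int16_t* src, pixel* dst,
;                             intptr_t srcStride, intptr_t dstStride,
;                             int width, int height,
;                             int w0, int round, int shift, int offset)
;   {
;       const int internalOffset = 1 << 13;    /* matches pw_2000 */
;       const int maxVal         = (1 << X265_DEPTH) - 1;
;       for (int y = 0; y < height; y++)
;       {
;           for (int x = 0; x < width; x++)
;           {
;               int v = (((src[x] + internalOffset) * w0 + round) >> shift) + offset;
;               dst[x] = (pixel)(v < 0 ? 0 : (v > maxVal ? maxVal : v));
;           }
;           src += srcStride;
;           dst += dstStride;
;       }
;   }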
%if HIGH_BIT_DEPTH
INIT_XMM sse4
cglobal weight_sp, 6,7,8
mova m1, [pw_pixel_max]
mova m2, [pw_1]
mov r6d, r7m
shl r6d, 16
or r6d, r6m ; assuming both w0 and round use at most 16 bits each
movd m3, r6d
pshufd m3, m3, 0 ; m3 = [round w0]
movd m4, r8m ; m4 = [shift]
movd m5, r9m
pshufd m5, m5, 0 ; m5 = [offset]
; correct row stride
add r3d, r3d
add r2d, r2d
mov r6d, r4d
and r6d, ~(mmsize / SIZEOF_PIXEL - 1)
sub r3d, r6d
sub r3d, r6d
sub r2d, r6d
sub r2d, r6d
; generate partial width mask (MUST BE IN XMM0)
mov r6d, r4d
and r6d, (mmsize / SIZEOF_PIXEL - 1)
movd m0, r6d
pshuflw m0, m0, 0
punpcklqdq m0, m0
pcmpgtw m0, [pw_0_15]
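; The mask built above holds 0xFFFF in word lanes 0 .. (width mod 8) - 1 and
; zero elsewhere (the remainder is broadcast, then compared against pw_0_15);
; the .widthLess8 tail uses it with pblendvb so only the leftover columns of
; the row are overwritten in dst.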
.loopH:
mov r6d, r4d
.loopW:
movu m6, [r0]
paddw m6, [pw_2000]
punpcklwd m7, m6, m2
pmaddwd m7, m3
psrad m7, m4
paddd m7, m5
punpckhwd m6, m2
pmaddwd m6, m3
psrad m6, m4
paddd m6, m5
packusdw m7, m6
pminuw m7, m1
sub r6d, (mmsize / SIZEOF_PIXEL)
jl .widthLess8
movu [r1], m7
lea r0, [r0 + mmsize]
lea r1, [r1 + mmsize]
je .nextH
jmp .loopW
.widthLess8:
movu m6, [r1]
pblendvb m6, m6, m7, m0
movu [r1], m6
.nextH:
add r0, r2
add r1, r3
dec r5d
jnz .loopH
RET
%else ; end of (HIGH_BIT_DEPTH == 1)
INIT_XMM sse4
%if ARCH_X86_64
cglobal weight_sp, 6, 7+2, 7
%define tmp_r0 r7
%define tmp_r1 r8
%else ; ARCH_X86_64 = 0
cglobal weight_sp, 6, 7, 7, 0-(2*4)
%define tmp_r0 [(rsp + 0 * 4)]
%define tmp_r1 [(rsp + 1 * 4)]
%endif ; ARCH_X86_64
movd m0, r6m ; m0 = [w0]
movd m1, r7m ; m1 = [round]
punpcklwd m0, m1
pshufd m0, m0, 0 ; m0 = [w0 round]
movd m1, r8m ; m1 = [shift]
movd m2, r9m
pshufd m2, m2, 0 ; m2 =[offset]
mova m3, [pw_1]
mova m4, [pw_2000]
add r2d, r2d
.loopH:
mov r6d, r4d
; save old src and dst
mov tmp_r0, r0
mov tmp_r1, r1
.loopW:
movu m5, [r0]
paddw m5, m4
punpcklwd m6,m5, m3
pmaddwd m6, m0
psrad m6, m1
paddd m6, m2
punpckhwd m5, m3
pmaddwd m5, m0
psrad m5, m1
paddd m5, m2
packssdw m6, m5
packuswb m6, m6
sub r6d, 8
jl .width4
movh [r1], m6
je .nextH
add r0, 16
add r1, 8
jmp .loopW
.width4:
cmp r6d, -4
jl .width2
movd [r1], m6
je .nextH
add r1, 4
pshufd m6, m6, 1
.width2:
pextrw [r1], m6, 0
.nextH:
mov r0, tmp_r0
mov r1, tmp_r1
lea r0, [r0 + r2]
lea r1, [r1 + r3]
dec r5d
jnz .loopH
RET
%endif
%if ARCH_X86_64 == 1
%if HIGH_BIT_DEPTH
INIT_YMM avx2
cglobal weight_sp, 6,7,9
mova m1, [pw_pixel_max]
mova m2, [pw_1]
mov r6d, r7m
shl r6d, 16
or r6d, r6m
movd xm3, r6d
vpbroadcastd m3, xm3 ; m3 = [round w0]
movd xm4, r8m ; m4 = [shift]
vpbroadcastd m5, r9m ; m5 = [offset]
; correct row stride
add r3d, r3d
add r2d, r2d
mov r6d, r4d
and r6d, ~(mmsize / SIZEOF_PIXEL - 1)
sub r3d, r6d
sub r3d, r6d
sub r2d, r6d
sub r2d, r6d
; generate partial width mask (MUST BE IN YMM0)
mov r6d, r4d
and r6d, (mmsize / SIZEOF_PIXEL - 1)
movd xm0, r6d
pshuflw m0, m0, 0
punpcklqdq m0, m0
vinserti128 m0, m0, xm0, 1
pcmpgtw m0, [pw_0_15]
.loopH:
mov r6d, r4d
.loopW:
movu m6, [r0]
paddw m6, [pw_2000]
punpcklwd m7, m6, m2
pmaddwd m7, m3 ;(round w0)
psrad m7, xm4 ;(shift)
paddd m7, m5 ;(offset)
punpckhwd m6, m2
pmaddwd m6, m3
psrad m6, xm4
paddd m6, m5
packusdw m7, m6
pminuw m7, m1
sub r6d, (mmsize / SIZEOF_PIXEL)
jl .width14
movu [r1], m7
lea r0, [r0 + mmsize]
lea r1, [r1 + mmsize]
je .nextH
jmp .loopW
.width14:
add r6d, 16
cmp r6d, 14
jl .width12
movu [r1], xm7
vextracti128 xm8, m7, 1
movq [r1 + 16], xm8
pextrd [r1 + 24], xm8, 2
je .nextH
.width12:
cmp r6d, 12
jl .width10
movu [r1], xm7
vextracti128 xm8, m7, 1
movq [r1 + 16], xm8
je .nextH
.width10:
cmp r6d, 10
jl .width8
movu [r1], xm7
vextracti128 xm8, m7, 1
movd [r1 + 16], xm8
je .nextH
.width8:
cmp r6d, 8
jl .width6
movu [r1], xm7
je .nextH
.width6:
cmp r6d, 6
jl .width4
movq [r1], xm7
pextrd [r1 + 8], xm7, 2
je .nextH
.width4:
cmp r6d, 4
jl .width2
movq [r1], xm7
je .nextH
add r1, 4
pshufd m6, m6, 1
je .nextH
.width2:
movd [r1], xm7
.nextH:
add r0, r2
add r1, r3
dec r5d
jnz .loopH
RET
%else
INIT_YMM avx2
cglobal weight_sp, 6, 9, 7
mov r7d, r7m
shl r7d, 16
or r7d, r6m
movd xm0, r7d
vpbroadcastd m0, xm0 ; m0 = times 8 dw w0, round
movd xm1, r8m ; m1 = [shift]
vpbroadcastd m2, r9m ; m2 = times 16 dw offset
vpbroadcastw m3, [pw_1]
vpbroadcastw m4, [pw_2000]
add r2d, r2d ; 2 * srcstride
mov r7, r0
mov r8, r1
.loopH:
mov r6d, r4d ; width
; save old src and dst
mov r0, r7 ; src
mov r1, r8 ; dst
.loopW:
movu m5, [r0]
paddw m5, m4
punpcklwd m6,m5, m3
pmaddwd m6, m0
psrad m6, xm1
paddd m6, m2
punpckhwd m5, m3
pmaddwd m5, m0
psrad m5, xm1
paddd m5, m2
packssdw m6, m5
packuswb m6, m6
vpermq m6, m6, 10001000b
sub r6d, 16
jl .width8
movu [r1], xm6
je .nextH
add r0, 32
add r1, 16
jmp .loopW
.width8:
add r6d, 16
cmp r6d, 8
jl .width4
movq [r1], xm6
je .nextH
psrldq m6, 8
sub r6d, 8
add r1, 8
.width4:
cmp r6d, 4
jl .width2
movd [r1], xm6
je .nextH
add r1, 4
pshufd m6, m6, 1
.width2:
pextrw [r1], xm6, 0
.nextH:
lea r7, [r7 + r2]
lea r8, [r8 + r3]
dec r5d
jnz .loopH
RET
%endif
%endif
;-----------------------------------------------------------------
; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
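; The transposeN kernels (N = 4, 8, 16, 32, 64 below) read an NxN block with the
; given source stride and write it transposed to a contiguous destination whose
; row pitch is N pixels. A hedged C sketch of the common scalar form
; (illustrative name; `pixel` as elsewhere in this file):
;
;   static void transpose_ref(pixel* dst, const pixel* src,
;                             intptr_t stride, int blockSize)
;   {
;       for (int y = 0; y < blockSize; y++)
;           for (int x = 0; x < blockSize; x++)
;               dst[x * blockSize + y] = src[y * stride + x];
;   }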
INIT_XMM sse2
cglobal transpose4, 3, 3, 4, dest, src, stride
%if HIGH_BIT_DEPTH == 1
add r2, r2
movh m0, [r1]
movh m1, [r1 + r2]
movh m2, [r1 + 2 * r2]
lea r1, [r1 + 2 * r2]
movh m3, [r1 + r2]
punpcklwd m0, m1
punpcklwd m2, m3
punpckhdq m1, m0, m2
punpckldq m0, m2
movu [r0], m0
movu [r0 + 16], m1
%else ;HIGH_BIT_DEPTH == 0
movd m0, [r1]
movd m1, [r1 + r2]
movd m2, [r1 + 2 * r2]
lea r1, [r1 + 2 * r2]
movd m3, [r1 + r2]
punpcklbw m0, m1
punpcklbw m2, m3
punpcklwd m0, m2
movu [r0], m0
%endif
RET
;-----------------------------------------------------------------
; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
%if HIGH_BIT_DEPTH == 1
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose8, 3, 5, 5
add r2, r2
lea r3, [3 * r2]
lea r4, [r1 + 4 * r2]
movu xm0, [r1]
vinserti128 m0, m0, [r4], 1
movu xm1, [r1 + r2]
vinserti128 m1, m1, [r4 + r2], 1
movu xm2, [r1 + 2 * r2]
vinserti128 m2, m2, [r4 + 2 * r2], 1
movu xm3, [r1 + r3]
vinserti128 m3, m3, [r4 + r3], 1
punpcklwd m4, m0, m1 ;[1 - 4][row1row2;row5row6]
punpckhwd m0, m1 ;[5 - 8][row1row2;row5row6]
punpcklwd m1, m2, m3 ;[1 - 4][row3row4;row7row8]
punpckhwd m2, m3 ;[5 - 8][row3row4;row7row8]
punpckldq m3, m4, m1 ;[1 - 2][row1row2row3row4;row5row6row7row8]
punpckhdq m4, m1 ;[3 - 4][row1row2row3row4;row5row6row7row8]
punpckldq m1, m0, m2 ;[5 - 6][row1row2row3row4;row5row6row7row8]
punpckhdq m0, m2 ;[7 - 8][row1row2row3row4;row5row6row7row8]
vpermq m3, m3, 0xD8 ;[1 ; 2][row1row2row3row4row5row6row7row8]
vpermq m4, m4, 0xD8 ;[3 ; 4][row1row2row3row4row5row6row7row8]
vpermq m1, m1, 0xD8 ;[5 ; 6][row1row2row3row4row5row6row7row8]
vpermq m0, m0, 0xD8 ;[7 ; 8][row1row2row3row4row5row6row7row8]
movu [r0 + 0 * 32], m3
movu [r0 + 1 * 32], m4
movu [r0 + 2 * 32], m1
movu [r0 + 3 * 32], m0
RET
%endif
INIT_XMM sse2
%macro TRANSPOSE_4x4 1
movh m0, [r1]
movh m1, [r1 + r2]
movh m2, [r1 + 2 * r2]
lea r1, [r1 + 2 * r2]
movh m3, [r1 + r2]
punpcklwd m0, m1
punpcklwd m2, m3
punpckhdq m1, m0, m2
punpckldq m0, m2
movh [r0], m0
movhps [r0 + %1], m0
movh [r0 + 2 * %1], m1
lea r0, [r0 + 2 * %1]
movhps [r0 + %1], m1
%endmacro
cglobal transpose8_internal
TRANSPOSE_4x4 r5
lea r1, [r1 + 2 * r2]
lea r0, [r3 + 8]
TRANSPOSE_4x4 r5
lea r1, [r1 + 2 * r2]
neg r2
lea r1, [r1 + r2 * 8 + 8]
neg r2
lea r0, [r3 + 4 * r5]
TRANSPOSE_4x4 r5
lea r1, [r1 + 2 * r2]
lea r0, [r3 + 8 + 4 * r5]
TRANSPOSE_4x4 r5
ret
cglobal transpose8, 3, 6, 4, dest, src, stride
add r2, r2
mov r3, r0
mov r5, 16
call transpose8_internal
RET
%else ;HIGH_BIT_DEPTH == 0
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose8, 3, 4, 4
lea r3, [r2 * 3]
movq xm0, [r1]
movhps xm0, [r1 + 2 * r2]
movq xm1, [r1 + r2]
movhps xm1, [r1 + r3]
lea r1, [r1 + 4 * r2]
movq xm2, [r1]
movhps xm2, [r1 + 2 * r2]
movq xm3, [r1 + r2]
movhps xm3, [r1 + r3]
vinserti128 m0, m0, xm2, 1 ;[row1 row3 row5 row7]
vinserti128 m1, m1, xm3, 1 ;[row2 row4 row6 row8]
punpcklbw m2, m0, m1 ;[1 - 8; 1 - 8][row1row2; row5row6]
punpckhbw m0, m1 ;[1 - 8; 1 - 8][row3row4; row7row8]
punpcklwd m1, m2, m0 ;[1 - 4; 1 - 4][row1row2row3row4; row5row6row7row8]
punpckhwd m2, m0 ;[5 - 8; 5 - 8][row1row2row3row4; row5row6row7row8]
mova m0, [trans8_shuf]
vpermd m1, m0, m1 ;[1 - 2; 3 - 4][row1row2row3row4row5row6row7row8]
vpermd m2, m0, m2 ;[4 - 5; 6 - 7][row1row2row3row4row5row6row7row8]
movu [r0], m1
movu [r0 + 32], m2
RET
%endif
INIT_XMM sse2
cglobal transpose8, 3, 5, 8, dest, src, stride
lea r3, [2 * r2]
lea r4, [3 * r2]
movh m0, [r1]
movh m1, [r1 + r2]
movh m2, [r1 + r3]
movh m3, [r1 + r4]
movh m4, [r1 + 4 * r2]
lea r1, [r1 + 4 * r2]
movh m5, [r1 + r2]
movh m6, [r1 + r3]
movh m7, [r1 + r4]
punpcklbw m0, m1
punpcklbw m2, m3
punpcklbw m4, m5
punpcklbw m6, m7
punpckhwd m1, m0, m2
punpcklwd m0, m2
punpckhwd m5, m4, m6
punpcklwd m4, m6
punpckhdq m2, m0, m4
punpckldq m0, m4
punpckhdq m3, m1, m5
punpckldq m1, m5
movu [r0], m0
movu [r0 + 16], m2
movu [r0 + 32], m1
movu [r0 + 48], m3
RET
%endif
%macro TRANSPOSE_8x8 1
movh m0, [r1]
movh m1, [r1 + r2]
movh m2, [r1 + 2 * r2]
lea r1, [r1 + 2 * r2]
movh m3, [r1 + r2]
movh m4, [r1 + 2 * r2]
lea r1, [r1 + 2 * r2]
movh m5, [r1 + r2]
movh m6, [r1 + 2 * r2]
lea r1, [r1 + 2 * r2]
movh m7, [r1 + r2]
punpcklbw m0, m1
punpcklbw m2, m3
punpcklbw m4, m5
punpcklbw m6, m7
punpckhwd m1, m0, m2
punpcklwd m0, m2
punpckhwd m5, m4, m6
punpcklwd m4, m6
punpckhdq m2, m0, m4
punpckldq m0, m4
punpckhdq m3, m1, m5
punpckldq m1, m5
movh [r0], m0
movhps [r0 + %1], m0
movh [r0 + 2 * %1], m2
lea r0, [r0 + 2 * %1]
movhps [r0 + %1], m2
movh [r0 + 2 * %1], m1
lea r0, [r0 + 2 * %1]
movhps [r0 + %1], m1
movh [r0 + 2 * %1], m3
lea r0, [r0 + 2 * %1]
movhps [r0 + %1], m3
%endmacro
;-----------------------------------------------------------------
; void transpose_16x16(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
%if HIGH_BIT_DEPTH == 1
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose16x8_internal
movu m0, [r1]
movu m1, [r1 + r2]
movu m2, [r1 + 2 * r2]
movu m3, [r1 + r3]
lea r1, [r1 + 4 * r2]
movu m4, [r1]
movu m5, [r1 + r2]
movu m6, [r1 + 2 * r2]
movu m7, [r1 + r3]
punpcklwd m8, m0, m1 ;[1 - 4; 9 - 12][1 2]
punpckhwd m0, m1 ;[5 - 8; 13 -16][1 2]
punpcklwd m1, m2, m3 ;[1 - 4; 9 - 12][3 4]
punpckhwd m2, m3 ;[5 - 8; 13 -16][3 4]
punpcklwd m3, m4, m5 ;[1 - 4; 9 - 12][5 6]
punpckhwd m4, m5 ;[5 - 8; 13 -16][5 6]
punpcklwd m5, m6, m7 ;[1 - 4; 9 - 12][7 8]
punpckhwd m6, m7 ;[5 - 8; 13 -16][7 8]
punpckldq m7, m8, m1 ;[1 - 2; 9 - 10][1 2 3 4]
punpckhdq m8, m1 ;[3 - 4; 11 - 12][1 2 3 4]
punpckldq m1, m3, m5 ;[1 - 2; 9 - 10][5 6 7 8]
punpckhdq m3, m5 ;[3 - 4; 11 - 12][5 6 7 8]
punpckldq m5, m0, m2 ;[5 - 6; 13 - 14][1 2 3 4]
punpckhdq m0, m2 ;[7 - 8; 15 - 16][1 2 3 4]
punpckldq m2, m4, m6 ;[5 - 6; 13 - 14][5 6 7 8]
punpckhdq m4, m6 ;[7 - 8; 15 - 16][5 6 7 8]
punpcklqdq m6, m7, m1 ;[1 ; 9 ][1 2 3 4 5 6 7 8]
punpckhqdq m7, m1 ;[2 ; 10][1 2 3 4 5 6 7 8]
punpcklqdq m1, m8, m3 ;[3 ; 11][1 2 3 4 5 6 7 8]
punpckhqdq m8, m3 ;[4 ; 12][1 2 3 4 5 6 7 8]
punpcklqdq m3, m5, m2 ;[5 ; 13][1 2 3 4 5 6 7 8]
punpckhqdq m5, m2 ;[6 ; 14][1 2 3 4 5 6 7 8]
punpcklqdq m2, m0, m4 ;[7 ; 15][1 2 3 4 5 6 7 8]
punpckhqdq m0, m4 ;[8 ; 16][1 2 3 4 5 6 7 8]
movu [r0 + 0 * 32], xm6
vextracti128 [r0 + 8 * 32], m6, 1
movu [r0 + 1 * 32], xm7
vextracti128 [r0 + 9 * 32], m7, 1
movu [r0 + 2 * 32], xm1
vextracti128 [r0 + 10 * 32], m1, 1
movu [r0 + 3 * 32], xm8
vextracti128 [r0 + 11 * 32], m8, 1
movu [r0 + 4 * 32], xm3
vextracti128 [r0 + 12 * 32], m3, 1
movu [r0 + 5 * 32], xm5
vextracti128 [r0 + 13 * 32], m5, 1
movu [r0 + 6 * 32], xm2
vextracti128 [r0 + 14 * 32], m2, 1
movu [r0 + 7 * 32], xm0
vextracti128 [r0 + 15 * 32], m0, 1
ret
cglobal transpose16, 3, 4, 9
add r2, r2
lea r3, [r2 * 3]
call transpose16x8_internal
lea r1, [r1 + 4 * r2]
add r0, 16
call transpose16x8_internal
RET
%endif
INIT_XMM sse2
cglobal transpose16, 3, 7, 4, dest, src, stride
add r2, r2
mov r3, r0
mov r4, r1
mov r5, 32
mov r6, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 16]
mov r3, r0
call transpose8_internal
lea r1, [r4 + 16]
lea r0, [r6 + 8 * r5]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 8 * r5 + 16]
mov r3, r0
call transpose8_internal
RET
%else ;HIGH_BIT_DEPTH == 0
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose16, 3, 5, 9
lea r3, [r2 * 3]
lea r4, [r1 + 8 * r2]
movu xm0, [r1]
movu xm1, [r1 + r2]
movu xm2, [r1 + 2 * r2]
movu xm3, [r1 + r3]
vinserti128 m0, m0, [r4], 1
vinserti128 m1, m1, [r4 + r2], 1
vinserti128 m2, m2, [r4 + 2 * r2], 1
vinserti128 m3, m3, [r4 + r3], 1
lea r1, [r1 + 4 * r2]
lea r4, [r4 + 4 * r2]
movu xm4, [r1]
movu xm5, [r1 + r2]
movu xm6, [r1 + 2 * r2]
movu xm7, [r1 + r3]
vinserti128 m4, m4, [r4], 1
vinserti128 m5, m5, [r4 + r2], 1
vinserti128 m6, m6, [r4 + 2 * r2], 1
vinserti128 m7, m7, [r4 + r3], 1
punpcklbw m8, m0, m1 ;[1 - 8 ; 1 - 8 ][1 2 9 10]
punpckhbw m0, m1 ;[9 - 16; 9 - 16][1 2 9 10]
punpcklbw m1, m2, m3 ;[1 - 8 ; 1 - 8 ][3 4 11 12]
punpckhbw m2, m3 ;[9 - 16; 9 - 16][3 4 11 12]
punpcklbw m3, m4, m5 ;[1 - 8 ; 1 - 8 ][5 6 13 14]
punpckhbw m4, m5 ;[9 - 16; 9 - 16][5 6 13 14]
punpcklbw m5, m6, m7 ;[1 - 8 ; 1 - 8 ][7 8 15 16]
punpckhbw m6, m7 ;[9 - 16; 9 - 16][7 8 15 16]
punpcklwd m7, m8, m1 ;[1 - 4 ; 1 - 4][1 2 3 4 9 10 11 12]
punpckhwd m8, m1 ;[5 - 8 ; 5 - 8][1 2 3 4 9 10 11 12]
punpcklwd m1, m3, m5 ;[1 - 4 ; 1 - 4][5 6 7 8 13 14 15 16]
punpckhwd m3, m5 ;[5 - 8 ; 5 - 8][5 6 7 8 13 14 15 16]
punpcklwd m5, m0, m2 ;[9 - 12; 9 - 12][1 2 3 4 9 10 11 12]
punpckhwd m0, m2 ;[13- 16; 13 - 16][1 2 3 4 9 10 11 12]
punpcklwd m2, m4, m6 ;[9 - 12; 9 - 12][5 6 7 8 13 14 15 16]
punpckhwd m4, m6 ;[13- 16; 13 - 16][5 6 7 8 13 14 15 16]
punpckldq m6, m7, m1 ;[1 - 2 ; 1 - 2][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckhdq m7, m1 ;[3 - 4 ; 3 - 4][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckldq m1, m8, m3 ;[5 - 6 ; 5 - 6][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckhdq m8, m3 ;[7 - 8 ; 7 - 8][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckldq m3, m5, m2 ;[9 - 10; 9 - 10][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckhdq m5, m2 ;[11- 12; 11 - 12][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckldq m2, m0, m4 ;[13- 14; 13 - 14][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckhdq m0, m4 ;[15- 16; 15 - 16][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
vpermq m6, m6, 0xD8
vpermq m7, m7, 0xD8
vpermq m1, m1, 0xD8
vpermq m8, m8, 0xD8
vpermq m3, m3, 0xD8
vpermq m5, m5, 0xD8
vpermq m2, m2, 0xD8
vpermq m0, m0, 0xD8
movu [r0 + 0 * 16], m6
movu [r0 + 2 * 16], m7
movu [r0 + 4 * 16], m1
movu [r0 + 6 * 16], m8
movu [r0 + 8 * 16], m3
movu [r0 + 10 * 16], m5
movu [r0 + 12 * 16], m2
movu [r0 + 14 * 16], m0
RET
%endif
INIT_XMM sse2
cglobal transpose16, 3, 5, 8, dest, src, stride
mov r3, r0
mov r4, r1
TRANSPOSE_8x8 16
lea r1, [r1 + 2 * r2]
lea r0, [r3 + 8]
TRANSPOSE_8x8 16
lea r1, [r4 + 8]
lea r0, [r3 + 8 * 16]
TRANSPOSE_8x8 16
lea r1, [r1 + 2 * r2]
lea r0, [r3 + 8 * 16 + 8]
TRANSPOSE_8x8 16
RET
%endif
cglobal transpose16_internal
TRANSPOSE_8x8 r6
lea r1, [r1 + 2 * r2]
lea r0, [r5 + 8]
TRANSPOSE_8x8 r6
lea r1, [r1 + 2 * r2]
neg r2
lea r1, [r1 + r2 * 8]
lea r1, [r1 + r2 * 8 + 8]
neg r2
lea r0, [r5 + 8 * r6]
TRANSPOSE_8x8 r6
lea r1, [r1 + 2 * r2]
lea r0, [r5 + 8 * r6 + 8]
TRANSPOSE_8x8 r6
ret
;-----------------------------------------------------------------
; void transpose_32x32(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
%if HIGH_BIT_DEPTH == 1
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose8x32_internal
movu m0, [r1]
movu m1, [r1 + 32]
movu m2, [r1 + r2]
movu m3, [r1 + r2 + 32]
movu m4, [r1 + 2 * r2]
movu m5, [r1 + 2 * r2 + 32]
movu m6, [r1 + r3]
movu m7, [r1 + r3 + 32]
lea r1, [r1 + 4 * r2]
punpcklwd m8, m0, m2 ;[1 - 4; 9 - 12][1 2]
punpckhwd m0, m2 ;[5 - 8; 13 - 16][1 2]
punpcklwd m2, m4, m6 ;[1 - 4; 9 - 12][3 4]
punpckhwd m4, m6 ;[5 - 8; 13 - 16][3 4]
punpcklwd m6, m1, m3 ;[17 - 20; 25 - 28][1 2]
punpckhwd m1, m3 ;[21 - 24; 29 - 32][1 2]
punpcklwd m3, m5, m7 ;[17 - 20; 25 - 28][3 4]
punpckhwd m5, m7 ;[21 - 24; 29 - 32][3 4]
punpckldq m7, m8, m2 ;[1 - 2; 9 - 10][1 2 3 4]
punpckhdq m8, m2 ;[3 - 4; 11 - 12][1 2 3 4]
punpckldq m2, m0, m4 ;[5 - 6; 13 - 14][1 2 3 4]
punpckhdq m0, m4 ;[7 - 8; 15 - 16][1 2 3 4]
punpckldq m4, m6, m3 ;[17 - 18; 25 - 26][1 2 3 4]
punpckhdq m6, m3 ;[19 - 20; 27 - 28][1 2 3 4]
punpckldq m3, m1, m5 ;[21 - 22; 29 - 30][1 2 3 4]
punpckhdq m1, m5 ;[23 - 24; 31 - 32][1 2 3 4]
movq [r0 + 0 * 64], xm7
movhps [r0 + 1 * 64], xm7
vextracti128 xm5, m7, 1
movq [r0 + 8 * 64], xm5
movhps [r0 + 9 * 64], xm5
movu m7, [r1]
movu m9, [r1 + 32]
movu m10, [r1 + r2]
movu m11, [r1 + r2 + 32]
movu m12, [r1 + 2 * r2]
movu m13, [r1 + 2 * r2 + 32]
movu m14, [r1 + r3]
movu m15, [r1 + r3 + 32]
punpcklwd m5, m7, m10 ;[1 - 4; 9 - 12][5 6]
punpckhwd m7, m10 ;[5 - 8; 13 - 16][5 6]
punpcklwd m10, m12, m14 ;[1 - 4; 9 - 12][7 8]
punpckhwd m12, m14 ;[5 - 8; 13 - 16][7 8]
punpcklwd m14, m9, m11 ;[17 - 20; 25 - 28][5 6]
punpckhwd m9, m11 ;[21 - 24; 29 - 32][5 6]
punpcklwd m11, m13, m15 ;[17 - 20; 25 - 28][7 8]
punpckhwd m13, m15 ;[21 - 24; 29 - 32][7 8]
punpckldq m15, m5, m10 ;[1 - 2; 9 - 10][5 6 7 8]
punpckhdq m5, m10 ;[3 - 4; 11 - 12][5 6 7 8]
punpckldq m10, m7, m12 ;[5 - 6; 13 - 14][5 6 7 8]
punpckhdq m7, m12 ;[7 - 8; 15 - 16][5 6 7 8]
punpckldq m12, m14, m11 ;[17 - 18; 25 - 26][5 6 7 8]
punpckhdq m14, m11 ;[19 - 20; 27 - 28][5 6 7 8]
punpckldq m11, m9, m13 ;[21 - 22; 29 - 30][5 6 7 8]
punpckhdq m9, m13 ;[23 - 24; 31 - 32][5 6 7 8]
movq [r0 + 0 * 64 + 8], xm15
movhps [r0 + 1 * 64 + 8], xm15
vextracti128 xm13, m15, 1
movq [r0 + 8 * 64 + 8], xm13
movhps [r0 + 9 * 64 + 8], xm13
punpcklqdq m13, m8, m5 ;[3 ; 11][1 2 3 4 5 6 7 8]
punpckhqdq m8, m5 ;[4 ; 12][1 2 3 4 5 6 7 8]
punpcklqdq m5, m2, m10 ;[5 ; 13][1 2 3 4 5 6 7 8]
punpckhqdq m2, m10 ;[6 ; 14][1 2 3 4 5 6 7 8]
punpcklqdq m10, m0, m7 ;[7 ; 15][1 2 3 4 5 6 7 8]
punpckhqdq m0, m7 ;[8 ; 16][1 2 3 4 5 6 7 8]
punpcklqdq m7, m4, m12 ;[17 ; 25][1 2 3 4 5 6 7 8]
punpckhqdq m4, m12 ;[18 ; 26][1 2 3 4 5 6 7 8]
punpcklqdq m12, m6, m14 ;[19 ; 27][1 2 3 4 5 6 7 8]
punpckhqdq m6, m14 ;[20 ; 28][1 2 3 4 5 6 7 8]
punpcklqdq m14, m3, m11 ;[21 ; 29][1 2 3 4 5 6 7 8]
punpckhqdq m3, m11 ;[22 ; 30][1 2 3 4 5 6 7 8]
punpcklqdq m11, m1, m9 ;[23 ; 31][1 2 3 4 5 6 7 8]
punpckhqdq m1, m9 ;[24 ; 32][1 2 3 4 5 6 7 8]
movu [r0 + 2 * 64], xm13
vextracti128 [r0 + 10 * 64], m13, 1
movu [r0 + 3 * 64], xm8
vextracti128 [r0 + 11 * 64], m8, 1
movu [r0 + 4 * 64], xm5
vextracti128 [r0 + 12 * 64], m5, 1
movu [r0 + 5 * 64], xm2
vextracti128 [r0 + 13 * 64], m2, 1
movu [r0 + 6 * 64], xm10
vextracti128 [r0 + 14 * 64], m10, 1
movu [r0 + 7 * 64], xm0
vextracti128 [r0 + 15 * 64], m0, 1
movu [r0 + 16 * 64], xm7
vextracti128 [r0 + 24 * 64], m7, 1
movu [r0 + 17 * 64], xm4
vextracti128 [r0 + 25 * 64], m4, 1
movu [r0 + 18 * 64], xm12
vextracti128 [r0 + 26 * 64], m12, 1
movu [r0 + 19 * 64], xm6
vextracti128 [r0 + 27 * 64], m6, 1
movu [r0 + 20 * 64], xm14
vextracti128 [r0 + 28 * 64], m14, 1
movu [r0 + 21 * 64], xm3
vextracti128 [r0 + 29 * 64], m3, 1
movu [r0 + 22 * 64], xm11
vextracti128 [r0 + 30 * 64], m11, 1
movu [r0 + 23 * 64], xm1
vextracti128 [r0 + 31 * 64], m1, 1
ret
cglobal transpose32, 3, 4, 16
add r2, r2
lea r3, [r2 * 3]
call transpose8x32_internal
add r0, 16
lea r1, [r1 + 4 * r2]
call transpose8x32_internal
add r0, 16
lea r1, [r1 + 4 * r2]
call transpose8x32_internal
add r0, 16
lea r1, [r1 + 4 * r2]
call transpose8x32_internal
RET
%endif
INIT_XMM sse2
cglobal transpose32, 3, 7, 4, dest, src, stride
add r2, r2
mov r3, r0
mov r4, r1
mov r5, 64
mov r6, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 16]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 32]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 48]
mov r3, r0
call transpose8_internal
lea r1, [r4 + 16]
lea r0, [r6 + 8 * 64]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 8 * 64 + 16]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 8 * 64 + 32]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 8 * 64 + 48]
mov r3, r0
call transpose8_internal
lea r1, [r4 + 32]
lea r0, [r6 + 16 * 64]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 16 * 64 + 16]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 16 * 64 + 32]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 16 * 64 + 48]
mov r3, r0
call transpose8_internal
lea r1, [r4 + 48]
lea r0, [r6 + 24 * 64]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 24 * 64 + 16]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 24 * 64 + 32]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 24 * 64 + 48]
mov r3, r0
call transpose8_internal
RET
%else ;HIGH_BIT_DEPTH == 0
INIT_XMM sse2
cglobal transpose32, 3, 7, 8, dest, src, stride
mov r3, r0
mov r4, r1
mov r5, r0
mov r6, 32
call transpose16_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r3 + 16]
mov r5, r0
call transpose16_internal
lea r1, [r4 + 16]
lea r0, [r3 + 16 * 32]
mov r5, r0
call transpose16_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r3 + 16 * 32 + 16]
mov r5, r0
call transpose16_internal
RET
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose32, 3, 5, 16
lea r3, [r2 * 3]
mov r4d, 2
.loop:
movu m0, [r1]
movu m1, [r1 + r2]
movu m2, [r1 + 2 * r2]
movu m3, [r1 + r3]
lea r1, [r1 + 4 * r2]
movu m4, [r1]
movu m5, [r1 + r2]
movu m6, [r1 + 2 * r2]
movu m7, [r1 + r3]
punpcklbw m8, m0, m1 ;[1 - 8 ; 17 - 24][1 2]
punpckhbw m0, m1 ;[9 - 16; 25 - 32][1 2]
punpcklbw m1, m2, m3 ;[1 - 8 ; 17 - 24][3 4]
punpckhbw m2, m3 ;[9 - 16; 25 - 32][3 4]
punpcklbw m3, m4, m5 ;[1 - 8 ; 17 - 24][5 6]
punpckhbw m4, m5 ;[9 - 16; 25 - 32][5 6]
punpcklbw m5, m6, m7 ;[1 - 8 ; 17 - 24][7 8]
punpckhbw m6, m7 ;[9 - 16; 25 - 32][7 8]
punpcklwd m7, m8, m1 ;[1 - 4 ; 17 - 20][1 2 3 4]
punpckhwd m8, m1 ;[5 - 8 ; 21 - 24][1 2 3 4]
punpcklwd m1, m3, m5 ;[1 - 4 ; 17 - 20][5 6 7 8]
punpckhwd m3, m5 ;[5 - 8 ; 20 - 24][5 6 7 8]
punpcklwd m5, m0, m2 ;[9 - 12; 25 - 28][1 2 3 4]
punpckhwd m0, m2 ;[13- 16; 29 - 32][1 2 3 4]
punpcklwd m2, m4, m6 ;[9 - 12; 25 - 28][5 6 7 8]
punpckhwd m4, m6 ;[13- 16; 29 - 32][5 6 7 8]
punpckldq m6, m7, m1 ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8]
punpckhdq m7, m1 ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8]
punpckldq m1, m8, m3 ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8]
punpckhdq m8, m3 ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8]
punpckldq m3, m5, m2 ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8]
punpckhdq m5, m2 ;[11- 12; 27 - 28][1 2 3 4 5 6 7 8]
punpckldq m2, m0, m4 ;[13- 14; 29 - 30][1 2 3 4 5 6 7 8]
punpckhdq m0, m4 ;[15- 16; 31 - 32][1 2 3 4 5 6 7 8]
movq [r0 + 0 * 32], xm6
movhps [r0 + 1 * 32], xm6
vextracti128 xm4, m6, 1
movq [r0 + 16 * 32], xm4
movhps [r0 + 17 * 32], xm4
lea r1, [r1 + 4 * r2]
movu m9, [r1]
movu m10, [r1 + r2]
movu m11, [r1 + 2 * r2]
movu m12, [r1 + r3]
lea r1, [r1 + 4 * r2]
movu m13, [r1]
movu m14, [r1 + r2]
movu m15, [r1 + 2 * r2]
movu m6, [r1 + r3]
punpcklbw m4, m9, m10 ;[1 - 8 ; 17 - 24][9 10]
punpckhbw m9, m10 ;[9 - 16; 25 - 32][9 10]
punpcklbw m10, m11, m12 ;[1 - 8 ; 17 - 24][11 12]
punpckhbw m11, m12 ;[9 - 16; 25 - 32][11 12]
punpcklbw m12, m13, m14 ;[1 - 8 ; 17 - 24][13 14]
punpckhbw m13, m14 ;[9 - 16; 25 - 32][13 14]
punpcklbw m14, m15, m6 ;[1 - 8 ; 17 - 24][15 16]
punpckhbw m15, m6 ;[9 - 16; 25 - 32][15 16]
punpcklwd m6, m4, m10 ;[1 - 4 ; 17 - 20][9 10 11 12]
punpckhwd m4, m10 ;[5 - 8 ; 21 - 24][9 10 11 12]
punpcklwd m10, m12, m14 ;[1 - 4 ; 17 - 20][13 14 15 16]
punpckhwd m12, m14 ;[5 - 8 ; 21 - 24][13 14 15 16]
punpcklwd m14, m9, m11 ;[9 - 12; 25 - 28][9 10 11 12]
punpckhwd m9, m11 ;[13- 16; 29 - 32][9 10 11 12]
punpcklwd m11, m13, m15 ;[9 - 12; 25 - 28][13 14 15 16]
punpckhwd m13, m15 ;[13- 16; 29 - 32][13 14 15 16]
punpckldq m15, m6, m10 ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16]
punpckhdq m6, m10 ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16]
punpckldq m10, m4, m12 ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16]
punpckhdq m4, m12 ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16]
punpckldq m12, m14, m11 ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16]
punpckhdq m14, m11 ;[11- 12; 27 - 28][9 10 11 12 13 14 15 16]
punpckldq m11, m9, m13 ;[13- 14; 29 - 30][9 10 11 12 13 14 15 16]
punpckhdq m9, m13 ;[15- 16; 31 - 32][9 10 11 12 13 14 15 16]
punpcklqdq m13, m7, m6 ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckhqdq m7, m6 ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpcklqdq m6, m1, m10 ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckhqdq m1, m10 ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpcklqdq m10, m8, m4 ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckhqdq m8, m4 ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpcklqdq m4, m3, m12 ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckhqdq m3, m12 ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpcklqdq m12, m5, m14 ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckhqdq m5, m14 ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpcklqdq m14, m2, m11 ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckhqdq m2, m11 ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpcklqdq m11, m0, m9 ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckhqdq m0, m9 ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
movq [r0 + 0 * 32 + 8], xm15
movhps [r0 + 1 * 32 + 8], xm15
vextracti128 xm9, m15, 1
movq [r0 + 16 * 32 + 8], xm9
movhps [r0 + 17 * 32 + 8], xm9
movu [r0 + 2 * 32], xm13
vextracti128 [r0 + 18 * 32], m13, 1
movu [r0 + 3 * 32], xm7
vextracti128 [r0 + 19 * 32], m7, 1
movu [r0 + 4 * 32], xm6
vextracti128 [r0 + 20 * 32], m6, 1
movu [r0 + 5 * 32], xm1
vextracti128 [r0 + 21 * 32], m1, 1
movu [r0 + 6 * 32], xm10
vextracti128 [r0 + 22 * 32], m10, 1
movu [r0 + 7 * 32], xm8
vextracti128 [r0 + 23 * 32], m8, 1
movu [r0 + 8 * 32], xm4
vextracti128 [r0 + 24 * 32], m4, 1
movu [r0 + 9 * 32], xm3
vextracti128 [r0 + 25 * 32], m3, 1
movu [r0 + 10 * 32], xm12
vextracti128 [r0 + 26 * 32], m12, 1
movu [r0 + 11 * 32], xm5
vextracti128 [r0 + 27 * 32], m5, 1
movu [r0 + 12 * 32], xm14
vextracti128 [r0 + 28 * 32], m14, 1
movu [r0 + 13 * 32], xm2
vextracti128 [r0 + 29 * 32], m2, 1
movu [r0 + 14 * 32], xm11
vextracti128 [r0 + 30 * 32], m11, 1
movu [r0 + 15 * 32], xm0
vextracti128 [r0 + 31 * 32], m0, 1
add r0, 16
lea r1, [r1 + 4 * r2]
dec r4d
jnz .loop
RET
%endif
%endif
;-----------------------------------------------------------------
; void transpose_64x64(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
%if HIGH_BIT_DEPTH == 1
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose8x32_64_internal
movu m0, [r1]
movu m1, [r1 + 32]
movu m2, [r1 + r2]
movu m3, [r1 + r2 + 32]
movu m4, [r1 + 2 * r2]
movu m5, [r1 + 2 * r2 + 32]
movu m6, [r1 + r3]
movu m7, [r1 + r3 + 32]
lea r1, [r1 + 4 * r2]
punpcklwd m8, m0, m2 ;[1 - 4; 9 - 12][1 2]
punpckhwd m0, m2 ;[5 - 8; 13 - 16][1 2]
punpcklwd m2, m4, m6 ;[1 - 4; 9 - 12][3 4]
punpckhwd m4, m6 ;[5 - 8; 13 - 16][3 4]
punpcklwd m6, m1, m3 ;[17 - 20; 25 - 28][1 2]
punpckhwd m1, m3 ;[21 - 24; 29 - 32][1 2]
punpcklwd m3, m5, m7 ;[17 - 20; 25 - 28][3 4]
punpckhwd m5, m7 ;[21 - 24; 29 - 32][3 4]
punpckldq m7, m8, m2 ;[1 - 2; 9 - 10][1 2 3 4]
punpckhdq m8, m2 ;[3 - 4; 11 - 12][1 2 3 4]
punpckldq m2, m0, m4 ;[5 - 6; 13 - 14][1 2 3 4]
punpckhdq m0, m4 ;[7 - 8; 15 - 16][1 2 3 4]
punpckldq m4, m6, m3 ;[17 - 18; 25 - 26][1 2 3 4]
punpckhdq m6, m3 ;[19 - 20; 27 - 28][1 2 3 4]
punpckldq m3, m1, m5 ;[21 - 22; 29 - 30][1 2 3 4]
punpckhdq m1, m5 ;[23 - 24; 31 - 32][1 2 3 4]
movq [r0 + 0 * 128], xm7
movhps [r0 + 1 * 128], xm7
vextracti128 xm5, m7, 1
movq [r0 + 8 * 128], xm5
movhps [r0 + 9 * 128], xm5
movu m7, [r1]
movu m9, [r1 + 32]
movu m10, [r1 + r2]
movu m11, [r1 + r2 + 32]
movu m12, [r1 + 2 * r2]
movu m13, [r1 + 2 * r2 + 32]
movu m14, [r1 + r3]
movu m15, [r1 + r3 + 32]
punpcklwd m5, m7, m10 ;[1 - 4; 9 - 12][5 6]
punpckhwd m7, m10 ;[5 - 8; 13 - 16][5 6]
punpcklwd m10, m12, m14 ;[1 - 4; 9 - 12][7 8]
punpckhwd m12, m14 ;[5 - 8; 13 - 16][7 8]
punpcklwd m14, m9, m11 ;[17 - 20; 25 - 28][5 6]
punpckhwd m9, m11 ;[21 - 24; 29 - 32][5 6]
punpcklwd m11, m13, m15 ;[17 - 20; 25 - 28][7 8]
punpckhwd m13, m15 ;[21 - 24; 29 - 32][7 8]
punpckldq m15, m5, m10 ;[1 - 2; 9 - 10][5 6 7 8]
punpckhdq m5, m10 ;[3 - 4; 11 - 12][5 6 7 8]
punpckldq m10, m7, m12 ;[5 - 6; 13 - 14][5 6 7 8]
punpckhdq m7, m12 ;[7 - 8; 15 - 16][5 6 7 8]
punpckldq m12, m14, m11 ;[17 - 18; 25 - 26][5 6 7 8]
punpckhdq m14, m11 ;[19 - 20; 27 - 28][5 6 7 8]
punpckldq m11, m9, m13 ;[21 - 22; 29 - 30][5 6 7 8]
punpckhdq m9, m13 ;[23 - 24; 31 - 32][5 6 7 8]
movq [r0 + 0 * 128 + 8], xm15
movhps [r0 + 1 * 128 + 8], xm15
vextracti128 xm13, m15, 1
movq [r0 + 8 * 128 + 8], xm13
movhps [r0 + 9 * 128 + 8], xm13
punpcklqdq m13, m8, m5 ;[3 ; 11][1 2 3 4 5 6 7 8]
punpckhqdq m8, m5 ;[4 ; 12][1 2 3 4 5 6 7 8]
punpcklqdq m5, m2, m10 ;[5 ; 13][1 2 3 4 5 6 7 8]
punpckhqdq m2, m10 ;[6 ; 14][1 2 3 4 5 6 7 8]
punpcklqdq m10, m0, m7 ;[7 ; 15][1 2 3 4 5 6 7 8]
punpckhqdq m0, m7 ;[8 ; 16][1 2 3 4 5 6 7 8]
punpcklqdq m7, m4, m12 ;[17 ; 25][1 2 3 4 5 6 7 8]
punpckhqdq m4, m12 ;[18 ; 26][1 2 3 4 5 6 7 8]
punpcklqdq m12, m6, m14 ;[19 ; 27][1 2 3 4 5 6 7 8]
punpckhqdq m6, m14 ;[20 ; 28][1 2 3 4 5 6 7 8]
punpcklqdq m14, m3, m11 ;[21 ; 29][1 2 3 4 5 6 7 8]
punpckhqdq m3, m11 ;[22 ; 30][1 2 3 4 5 6 7 8]
punpcklqdq m11, m1, m9 ;[23 ; 31][1 2 3 4 5 6 7 8]
punpckhqdq m1, m9 ;[24 ; 32][1 2 3 4 5 6 7 8]
movu [r0 + 2 * 128], xm13
vextracti128 [r0 + 10 * 128], m13, 1
movu [r0 + 3 * 128], xm8
vextracti128 [r0 + 11 * 128], m8, 1
movu [r0 + 4 * 128], xm5
vextracti128 [r0 + 12 * 128], m5, 1
movu [r0 + 5 * 128], xm2
vextracti128 [r0 + 13 * 128], m2, 1
movu [r0 + 6 * 128], xm10
vextracti128 [r0 + 14 * 128], m10, 1
movu [r0 + 7 * 128], xm0
vextracti128 [r0 + 15 * 128], m0, 1
movu [r0 + 16 * 128], xm7
vextracti128 [r0 + 24 * 128], m7, 1
movu [r0 + 17 * 128], xm4
vextracti128 [r0 + 25 * 128], m4, 1
movu [r0 + 18 * 128], xm12
vextracti128 [r0 + 26 * 128], m12, 1
movu [r0 + 19 * 128], xm6
vextracti128 [r0 + 27 * 128], m6, 1
movu [r0 + 20 * 128], xm14
vextracti128 [r0 + 28 * 128], m14, 1
movu [r0 + 21 * 128], xm3
vextracti128 [r0 + 29 * 128], m3, 1
movu [r0 + 22 * 128], xm11
vextracti128 [r0 + 30 * 128], m11, 1
movu [r0 + 23 * 128], xm1
vextracti128 [r0 + 31 * 128], m1, 1
ret
cglobal transpose64, 3, 6, 16
add r2, r2
lea r3, [3 * r2]
lea r4, [r1 + 64]
lea r5, [r0 + 16]
call transpose8x32_64_internal
mov r1, r4
lea r0, [r0 + 32 * 128]
call transpose8x32_64_internal
mov r0, r5
lea r5, [r0 + 16]
lea r4, [r1 + 4 * r2]
lea r1, [r4 - 64]
call transpose8x32_64_internal
mov r1, r4
lea r0, [r0 + 32 * 128]
call transpose8x32_64_internal
mov r0, r5
lea r5, [r0 + 16]
lea r4, [r1 + 4 * r2]
lea r1, [r4 - 64]
call transpose8x32_64_internal
mov r1, r4
lea r0, [r0 + 32 * 128]
call transpose8x32_64_internal
mov r0, r5
lea r5, [r0 + 16]
lea r4, [r1 + 4 * r2]
lea r1, [r4 - 64]
call transpose8x32_64_internal
mov r1, r4
lea r0, [r0 + 32 * 128]
call transpose8x32_64_internal
mov r0, r5
lea r5, [r0 + 16]
lea r4, [r1 + 4 * r2]
lea r1, [r4 - 64]
call transpose8x32_64_internal
mov r1, r4
lea r0, [r0 + 32 * 128]
call transpose8x32_64_internal
mov r0, r5
lea r5, [r0 + 16]
lea r4, [r1 + 4 * r2]
lea r1, [r4 - 64]
call transpose8x32_64_internal
mov r1, r4
lea r0, [r0 + 32 * 128]
call transpose8x32_64_internal
mov r0, r5
lea r5, [r0 + 16]
lea r4, [r1 + 4 * r2]
lea r1, [r4 - 64]
call transpose8x32_64_internal
mov r1, r4
lea r0, [r0 + 32 * 128]
call transpose8x32_64_internal
mov r0, r5
lea r4, [r1 + 4 * r2]
lea r1, [r4 - 64]
call transpose8x32_64_internal
mov r1, r4
lea r0, [r0 + 32 * 128]
call transpose8x32_64_internal
RET
%endif
INIT_XMM sse2
cglobal transpose64, 3, 7, 4, dest, src, stride
add r2, r2
mov r3, r0
mov r4, r1
mov r5, 128
mov r6, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 16]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 32]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 48]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 64]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 80]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 96]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 112]
mov r3, r0
call transpose8_internal
lea r1, [r4 + 16]
lea r0, [r6 + 8 * 128]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 8 * 128 + 16]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 8 * 128 + 32]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 8 * 128 + 48]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 8 * 128 + 64]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 8 * 128 + 80]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 8 * 128 + 96]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 8 * 128 + 112]
mov r3, r0
call transpose8_internal
lea r1, [r4 + 32]
lea r0, [r6 + 16 * 128]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 16 * 128 + 16]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 16 * 128 + 32]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 16 * 128 + 48]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 16 * 128 + 64]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 16 * 128 + 80]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 16 * 128 + 96]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 16 * 128 + 112]
mov r3, r0
call transpose8_internal
lea r1, [r4 + 48]
lea r0, [r6 + 24 * 128]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 24 * 128 + 16]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 24 * 128 + 32]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 24 * 128 + 48]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 24 * 128 + 64]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 24 * 128 + 80]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 24 * 128 + 96]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 24 * 128 + 112]
mov r3, r0
call transpose8_internal
lea r1, [r4 + 64]
lea r0, [r6 + 32 * 128]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 32 * 128 + 16]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 32 * 128 + 32]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 32 * 128 + 48]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 32 * 128 + 64]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 32 * 128 + 80]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 32 * 128 + 96]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 32 * 128 + 112]
mov r3, r0
call transpose8_internal
lea r1, [r4 + 80]
lea r0, [r6 + 40 * 128]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 40 * 128 + 16]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 40 * 128 + 32]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 40 * 128 + 48]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 40 * 128 + 64]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 40 * 128 + 80]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 40 * 128 + 96]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 40 * 128 + 112]
mov r3, r0
call transpose8_internal
lea r1, [r4 + 96]
lea r0, [r6 + 48 * 128]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 48 * 128 + 16]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 48 * 128 + 32]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 48 * 128 + 48]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 48 * 128 + 64]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 48 * 128 + 80]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 48 * 128 + 96]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 48 * 128 + 112]
mov r3, r0
call transpose8_internal
lea r1, [r4 + 112]
lea r0, [r6 + 56 * 128]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 56 * 128 + 16]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 56 * 128 + 32]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 56 * 128 + 48]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 56 * 128 + 64]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 56 * 128 + 80]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 56 * 128 + 96]
mov r3, r0
call transpose8_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r6 + 56 * 128 + 112]
mov r3, r0
call transpose8_internal
RET
%else ;HIGH_BIT_DEPTH == 0
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose16x32_avx2
movu m0, [r1]
movu m1, [r1 + r2]
movu m2, [r1 + 2 * r2]
movu m3, [r1 + r3]
lea r1, [r1 + 4 * r2]
movu m4, [r1]
movu m5, [r1 + r2]
movu m6, [r1 + 2 * r2]
movu m7, [r1 + r3]
punpcklbw m8, m0, m1 ;[1 - 8 ; 17 - 24][1 2]
punpckhbw m0, m1 ;[9 - 16; 25 - 32][1 2]
punpcklbw m1, m2, m3 ;[1 - 8 ; 17 - 24][3 4]
punpckhbw m2, m3 ;[9 - 16; 25 - 32][3 4]
punpcklbw m3, m4, m5 ;[1 - 8 ; 17 - 24][5 6]
punpckhbw m4, m5 ;[9 - 16; 25 - 32][5 6]
punpcklbw m5, m6, m7 ;[1 - 8 ; 17 - 24][7 8]
punpckhbw m6, m7 ;[9 - 16; 25 - 32][7 8]
punpcklwd m7, m8, m1 ;[1 - 4 ; 17 - 20][1 2 3 4]
punpckhwd m8, m1 ;[5 - 8 ; 21 - 24][1 2 3 4]
punpcklwd m1, m3, m5 ;[1 - 4 ; 17 - 20][5 6 7 8]
punpckhwd m3, m5 ;[5 - 8 ; 21 - 24][5 6 7 8]
punpcklwd m5, m0, m2 ;[9 - 12; 25 - 28][1 2 3 4]
punpckhwd m0, m2 ;[13- 16; 29 - 32][1 2 3 4]
punpcklwd m2, m4, m6 ;[9 - 12; 25 - 28][5 6 7 8]
punpckhwd m4, m6 ;[13- 16; 29 - 32][5 6 7 8]
punpckldq m6, m7, m1 ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8]
punpckhdq m7, m1 ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8]
punpckldq m1, m8, m3 ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8]
punpckhdq m8, m3 ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8]
punpckldq m3, m5, m2 ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8]
punpckhdq m5, m2 ;[11- 12; 27 - 28][1 2 3 4 5 6 7 8]
punpckldq m2, m0, m4 ;[13- 14; 29 - 30][1 2 3 4 5 6 7 8]
punpckhdq m0, m4 ;[15- 16; 31 - 32][1 2 3 4 5 6 7 8]
movq [r0 + 0 * 64], xm6
movhps [r0 + 1 * 64], xm6
vextracti128 xm4, m6, 1
movq [r0 + 16 * 64], xm4
movhps [r0 + 17 * 64], xm4
lea r1, [r1 + 4 * r2]
movu m9, [r1]
movu m10, [r1 + r2]
movu m11, [r1 + 2 * r2]
movu m12, [r1 + r3]
lea r1, [r1 + 4 * r2]
movu m13, [r1]
movu m14, [r1 + r2]
movu m15, [r1 + 2 * r2]
movu m6, [r1 + r3]
punpcklbw m4, m9, m10 ;[1 - 8 ; 17 - 24][9 10]
punpckhbw m9, m10 ;[9 - 16; 25 - 32][9 10]
punpcklbw m10, m11, m12 ;[1 - 8 ; 17 - 24][11 12]
punpckhbw m11, m12 ;[9 - 16; 25 - 32][11 12]
punpcklbw m12, m13, m14 ;[1 - 8 ; 17 - 24][13 14]
punpckhbw m13, m14 ;[9 - 16; 25 - 32][13 14]
punpcklbw m14, m15, m6 ;[1 - 8 ; 17 - 24][15 16]
punpckhbw m15, m6 ;[9 - 16; 25 - 32][15 16]
punpcklwd m6, m4, m10 ;[1 - 4 ; 17 - 20][9 10 11 12]
punpckhwd m4, m10 ;[5 - 8 ; 21 - 24][9 10 11 12]
punpcklwd m10, m12, m14 ;[1 - 4 ; 17 - 20][13 14 15 16]
punpckhwd m12, m14 ;[5 - 8 ; 21 - 24][13 14 15 16]
punpcklwd m14, m9, m11 ;[9 - 12; 25 - 28][9 10 11 12]
punpckhwd m9, m11 ;[13- 16; 29 - 32][9 10 11 12]
punpcklwd m11, m13, m15 ;[9 - 12; 25 - 28][13 14 15 16]
punpckhwd m13, m15 ;[13- 16; 29 - 32][13 14 15 16]
punpckldq m15, m6, m10 ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16]
punpckhdq m6, m10 ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16]
punpckldq m10, m4, m12 ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16]
punpckhdq m4, m12 ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16]
punpckldq m12, m14, m11 ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16]
punpckhdq m14, m11 ;[11- 12; 27 - 28][9 10 11 12 13 14 15 16]
punpckldq m11, m9, m13 ;[13- 14; 29 - 30][9 10 11 12 13 14 15 16]
punpckhdq m9, m13 ;[15- 16; 31 - 32][9 10 11 12 13 14 15 16]
punpcklqdq m13, m7, m6 ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckhqdq m7, m6 ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpcklqdq m6, m1, m10 ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckhqdq m1, m10 ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpcklqdq m10, m8, m4 ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckhqdq m8, m4 ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpcklqdq m4, m3, m12 ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckhqdq m3, m12 ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpcklqdq m12, m5, m14 ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckhqdq m5, m14 ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpcklqdq m14, m2, m11 ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckhqdq m2, m11 ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpcklqdq m11, m0, m9 ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
punpckhqdq m0, m9 ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
movq [r0 + 0 * 64 + 8], xm15
movhps [r0 + 1 * 64 + 8], xm15
vextracti128 xm9, m15, 1
movq [r0 + 16 * 64 + 8], xm9
movhps [r0 + 17 * 64 + 8], xm9
movu [r0 + 2 * 64], xm13
vextracti128 [r0 + 18 * 64], m13, 1
movu [r0 + 3 * 64], xm7
vextracti128 [r0 + 19 * 64], m7, 1
movu [r0 + 4 * 64], xm6
vextracti128 [r0 + 20 * 64], m6, 1
movu [r0 + 5 * 64], xm1
vextracti128 [r0 + 21 * 64], m1, 1
movu [r0 + 6 * 64], xm10
vextracti128 [r0 + 22 * 64], m10, 1
movu [r0 + 7 * 64], xm8
vextracti128 [r0 + 23 * 64], m8, 1
movu [r0 + 8 * 64], xm4
vextracti128 [r0 + 24 * 64], m4, 1
movu [r0 + 9 * 64], xm3
vextracti128 [r0 + 25 * 64], m3, 1
movu [r0 + 10 * 64], xm12
vextracti128 [r0 + 26 * 64], m12, 1
movu [r0 + 11 * 64], xm5
vextracti128 [r0 + 27 * 64], m5, 1
movu [r0 + 12 * 64], xm14
vextracti128 [r0 + 28 * 64], m14, 1
movu [r0 + 13 * 64], xm2
vextracti128 [r0 + 29 * 64], m2, 1
movu [r0 + 14 * 64], xm11
vextracti128 [r0 + 30 * 64], m11, 1
movu [r0 + 15 * 64], xm0
vextracti128 [r0 + 31 * 64], m0, 1
ret
cglobal transpose64, 3, 6, 16
lea r3, [r2 * 3]
lea r4, [r0 + 16]
lea r5, [r1 + 32]
call transpose16x32_avx2
lea r0, [r0 + 32 * 64]
mov r1, r5
call transpose16x32_avx2
mov r0, r4
lea r5, [r1 + 4 * r2]
lea r1, [r5 - 32]
call transpose16x32_avx2
lea r0, [r0 + 32 * 64]
mov r1, r5
call transpose16x32_avx2
lea r0, [r4 + 16]
lea r5, [r1 + 4 * r2]
lea r1, [r5 - 32]
call transpose16x32_avx2
lea r0, [r0 + 32 * 64]
mov r1, r5
call transpose16x32_avx2
lea r5, [r1 + 4 * r2]
lea r0, [r4 + 32]
lea r1, [r5 - 32]
call transpose16x32_avx2
lea r0, [r0 + 32 * 64]
mov r1, r5
call transpose16x32_avx2
RET
%endif
INIT_XMM sse2
cglobal transpose64, 3, 7, 8, dest, src, stride
mov r3, r0
mov r4, r1
mov r5, r0
mov r6, 64
call transpose16_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r3 + 16]
mov r5, r0
call transpose16_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r3 + 32]
mov r5, r0
call transpose16_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r3 + 48]
mov r5, r0
call transpose16_internal
lea r1, [r4 + 16]
lea r0, [r3 + 16 * 64]
mov r5, r0
call transpose16_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r3 + 16 * 64 + 16]
mov r5, r0
call transpose16_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r3 + 16 * 64 + 32]
mov r5, r0
call transpose16_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r3 + 16 * 64 + 48]
mov r5, r0
call transpose16_internal
lea r1, [r4 + 32]
lea r0, [r3 + 32 * 64]
mov r5, r0
call transpose16_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r3 + 32 * 64 + 16]
mov r5, r0
call transpose16_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r3 + 32 * 64 + 32]
mov r5, r0
call transpose16_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r3 + 32 * 64 + 48]
mov r5, r0
call transpose16_internal
lea r1, [r4 + 48]
lea r0, [r3 + 48 * 64]
mov r5, r0
call transpose16_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r3 + 48 * 64 + 16]
mov r5, r0
call transpose16_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r3 + 48 * 64 + 32]
mov r5, r0
call transpose16_internal
lea r1, [r1 - 8 + 2 * r2]
lea r0, [r3 + 48 * 64 + 48]
mov r5, r0
call transpose16_internal
RET
%endif
;=============================================================================
; SSIM
;=============================================================================
;-----------------------------------------------------------------------------
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
; const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
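; A C sketch of the sums accumulated below (x264-style SSIM: two horizontally
; adjacent 4x4 blocks per call). This is an assumed reference, not the
; project's C primitive:
;
;   static void ssim_4x4x2_core_ref(const pixel* pix1, intptr_t stride1,
;                                   const pixel* pix2, intptr_t stride2,
;                                   int sums[2][4])
;   {
;       for (int z = 0; z < 2; z++)
;       {
;           int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;           for (int y = 0; y < 4; y++)
;               for (int x = 0; x < 4; x++)
;               {
;                   int a = pix1[y * stride1 + 4 * z + x];
;                   int b = pix2[y * stride2 + 4 * z + x];
;                   s1  += a;
;                   s2  += b;
;                   ss  += a * a + b * b;
;                   s12 += a * b;
;               }
;           sums[z][0] = s1; sums[z][1] = s2;
;           sums[z][2] = ss; sums[z][3] = s12;
;       }
;   }
;
; In the registers: m1/m2 hold the running pixel sums, m3 the sum of squares
; and m4 the sum of products; the tail of pixel_ssim_4x4x2_core reduces them
; and stores sums[2][4] through the fifth argument.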
%macro SSIM_ITER 1
%if HIGH_BIT_DEPTH
movdqu m5, [r0+(%1&1)*r1]
movdqu m6, [r2+(%1&1)*r3]
%else
movq m5, [r0+(%1&1)*r1]
movq m6, [r2+(%1&1)*r3]
punpcklbw m5, m0
punpcklbw m6, m0
%endif
%if %1==1
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
%endif
%if %1==0
movdqa m1, m5
movdqa m2, m6
%else
paddw m1, m5
paddw m2, m6
%endif
pmaddwd m7, m5, m6
pmaddwd m5, m5
pmaddwd m6, m6
ACCUM paddd, 3, 5, %1
ACCUM paddd, 4, 7, %1
paddd m3, m6
%endmacro
%macro SSIM 0
cglobal pixel_ssim_4x4x2_core, 4,4,8
FIX_STRIDES r1, r3
pxor m0, m0
SSIM_ITER 0
SSIM_ITER 1
SSIM_ITER 2
SSIM_ITER 3
; PHADDW m1, m2
; PHADDD m3, m4
movdqa m7, [pw_1]
pshufd m5, m3, q2301
pmaddwd m1, m7
pmaddwd m2, m7
pshufd m6, m4, q2301
packssdw m1, m2
paddd m3, m5
pshufd m1, m1, q3120
paddd m4, m6
pmaddwd m1, m7
punpckhdq m5, m3, m4
punpckldq m3, m4
%if UNIX64
%define t0 r4
%else
%define t0 rax
mov t0, r4mp
%endif
movq [t0+ 0], m1
movq [t0+ 8], m3
movhps [t0+16], m1
movq [t0+24], m5
RET
;-----------------------------------------------------------------------------
; float pixel_ssim_end4( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
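; The per-block term summed below, written out as a sketch of the assumed math
; (ssim_c1/ssim_c2 are the precomputed constants loaded further down):
;
;   s1    = sum0[i][0] + sum1[i][0];
;   s2    = sum0[i][1] + sum1[i][1];
;   ss    = sum0[i][2] + sum1[i][2];
;   s12   = sum0[i][3] + sum1[i][3];
;   vars  = ss * 64 - s1 * s1 - s2 * s2;
;   covar = s12 * 64 - s1 * s2;
;   num   = (float)(2 * s1 * s2 + ssim_c1) * (float)(2 * covar + ssim_c2);
;   den   = (float)(s1 * s1 + s2 * s2 + ssim_c1) * (float)(vars + ssim_c2);
;   ssim += num / den;
;
; 'width' is at most 4; when it is smaller, the mask_ff table zeroes the
; unused lanes before the final horizontal add.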
cglobal pixel_ssim_end4, 2,3
mov r2d, r2m
mova m0, [r0+ 0]
mova m1, [r0+16]
mova m2, [r0+32]
mova m3, [r0+48]
mova m4, [r0+64]
paddd m0, [r1+ 0]
paddd m1, [r1+16]
paddd m2, [r1+32]
paddd m3, [r1+48]
paddd m4, [r1+64]
paddd m0, m1
paddd m1, m2
paddd m2, m3
paddd m3, m4
TRANSPOSE4x4D 0, 1, 2, 3, 4
; s1=m0, s2=m1, ss=m2, s12=m3
%if BIT_DEPTH >= 10
cvtdq2ps m0, m0
cvtdq2ps m1, m1
cvtdq2ps m2, m2
cvtdq2ps m3, m3
mulps m4, m0, m1 ; s1*s2
mulps m0, m0 ; s1*s1
mulps m1, m1 ; s2*s2
mulps m2, [pf_64] ; ss*64
mulps m3, [pf_128] ; s12*128
addps m4, m4 ; s1*s2*2
addps m0, m1 ; s1*s1 + s2*s2
subps m2, m0 ; vars
subps m3, m4 ; covar*2
movaps m1, [ssim_c1]
addps m4, m1 ; s1*s2*2 + ssim_c1
addps m0, m1 ; s1*s1 + s2*s2 + ssim_c1
movaps m1, [ssim_c2]
addps m2, m1 ; vars + ssim_c2
addps m3, m1 ; covar*2 + ssim_c2
%else
pmaddwd m4, m1, m0 ; s1*s2
pslld m1, 16
por m0, m1
pmaddwd m0, m0 ; s1*s1 + s2*s2
pslld m4, 1
pslld m3, 7
pslld m2, 6
psubd m3, m4 ; covar*2
psubd m2, m0 ; vars
mova m1, [ssim_c1]
paddd m0, m1
paddd m4, m1
mova m1, [ssim_c2]
paddd m3, m1
paddd m2, m1
cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
%endif
mulps m4, m3
mulps m0, m2
divps m4, m0 ; ssim
cmp r2d, 4
je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
neg r2
%ifdef PIC
lea r3, [mask_ff + 16]
%xdefine %%mask r3
%else
%xdefine %%mask mask_ff + 16
%endif
%if cpuflag(avx)
andps m4, [%%mask + r2*4]
%else
movups m0, [%%mask + r2*4]
andps m4, m0
%endif
.skip:
movhlps m0, m4
addps m0, m4
%if cpuflag(ssse3)
movshdup m4, m0
%else
pshuflw m4, m0, q0032
%endif
addss m0, m4
%if ARCH_X86_64 == 0
movss r0m, m0
fld dword r0m
%endif
RET
%endmacro ; SSIM
INIT_XMM sse2
SSIM
INIT_XMM avx
SSIM
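; SCALE1D_128to64_HBD: body of the high-bit-depth scale1D_128to64 below.
; Each pavgw pairs a 16-byte load with the same row advanced by one pixel
; (palignr by 2 bytes), so lane i holds (p[i] + p[i+1] + 1) >> 1; the
; deinterleave_word_shuf / punpcklqdq step then keeps only the even lanes,
; making every stored pixel the rounded average of one input pair.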
%macro SCALE1D_128to64_HBD 0
movu m0, [r1]
palignr m1, m0, 2
movu m2, [r1 + 16]
palignr m3, m2, 2
movu m4, [r1 + 32]
palignr m5, m4, 2
movu m6, [r1 + 48]
pavgw m0, m1
palignr m1, m6, 2
pavgw m2, m3
pavgw m4, m5
pavgw m6, m1
pshufb m0, m0, m7
pshufb m2, m2, m7
pshufb m4, m4, m7
pshufb m6, m6, m7
punpcklqdq m0, m2
movu [r0], m0
punpcklqdq m4, m6
movu [r0 + 16], m4
movu m0, [r1 + 64]
palignr m1, m0, 2
movu m2, [r1 + 80]
palignr m3, m2, 2
movu m4, [r1 + 96]
palignr m5, m4, 2
movu m6, [r1 + 112]
pavgw m0, m1
palignr m1, m6, 2
pavgw m2, m3
pavgw m4, m5
pavgw m6, m1
pshufb m0, m0, m7
pshufb m2, m2, m7
pshufb m4, m4, m7
pshufb m6, m6, m7
punpcklqdq m0, m2
movu [r0 + 32], m0
punpcklqdq m4, m6
movu [r0 + 48], m4
movu m0, [r1 + 128]
palignr m1, m0, 2
movu m2, [r1 + 144]
palignr m3, m2, 2
movu m4, [r1 + 160]
palignr m5, m4, 2
movu m6, [r1 + 176]
pavgw m0, m1
palignr m1, m6, 2
pavgw m2, m3
pavgw m4, m5
pavgw m6, m1
pshufb m0, m0, m7
pshufb m2, m2, m7
pshufb m4, m4, m7
pshufb m6, m6, m7
punpcklqdq m0, m2
movu [r0 + 64], m0
punpcklqdq m4, m6
movu [r0 + 80], m4
movu m0, [r1 + 192]
palignr m1, m0, 2
movu m2, [r1 + 208]
palignr m3, m2, 2
movu m4, [r1 + 224]
palignr m5, m4, 2
movu m6, [r1 + 240]
pavgw m0, m1
palignr m1, m6, 2
pavgw m2, m3
pavgw m4, m5
pavgw m6, m1
pshufb m0, m0, m7
pshufb m2, m2, m7
pshufb m4, m4, m7
pshufb m6, m6, m7
punpcklqdq m0, m2
movu [r0 + 96], m0
punpcklqdq m4, m6
movu [r0 + 112], m4
%endmacro
;-----------------------------------------------------------------
; void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
;-----------------------------------------------------------------
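; C sketch of the assumed behavior (the stride argument is unused; the source
; holds 128 "top" pixels followed by 128 "left" pixels, each run halved to 64
; by averaging adjacent pairs with rounding):
;
;   static void scale1D_128to64_ref(pixel* dst, const pixel* src, intptr_t stride)
;   {
;       (void)stride;
;       for (int i = 0; i < 128; i++)   // 64 top + 64 left outputs
;           dst[i] = (pixel)((src[2 * i] + src[2 * i + 1] + 1) >> 1);
;   }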
INIT_XMM ssse3
cglobal scale1D_128to64, 2, 2, 8, dest, src1, stride
%if HIGH_BIT_DEPTH
mova m7, [deinterleave_word_shuf]
;Top pixel
SCALE1D_128to64_HBD
;Left pixel
add r1, 256
add r0, 128
SCALE1D_128to64_HBD
%else
mova m7, [deinterleave_shuf]
;Top pixel
movu m0, [r1]
palignr m1, m0, 1
movu m2, [r1 + 16]
palignr m3, m2, 1
movu m4, [r1 + 32]
palignr m5, m4, 1
movu m6, [r1 + 48]
pavgb m0, m1
palignr m1, m6, 1
pavgb m2, m3
pavgb m4, m5
pavgb m6, m1
pshufb m0, m0, m7
pshufb m2, m2, m7
pshufb m4, m4, m7
pshufb m6, m6, m7
punpcklqdq m0, m2
movu [r0], m0
punpcklqdq m4, m6
movu [r0 + 16], m4
movu m0, [r1 + 64]
palignr m1, m0, 1
movu m2, [r1 + 80]
palignr m3, m2, 1
movu m4, [r1 + 96]
palignr m5, m4, 1
movu m6, [r1 + 112]
pavgb m0, m1
palignr m1, m6, 1
pavgb m2, m3
pavgb m4, m5
pavgb m6, m1
pshufb m0, m0, m7
pshufb m2, m2, m7
pshufb m4, m4, m7
pshufb m6, m6, m7
punpcklqdq m0, m2
movu [r0 + 32], m0
punpcklqdq m4, m6
movu [r0 + 48], m4
;Left pixel
movu m0, [r1 + 128]
palignr m1, m0, 1
movu m2, [r1 + 144]
palignr m3, m2, 1
movu m4, [r1 + 160]
palignr m5, m4, 1
movu m6, [r1 + 176]
pavgb m0, m1
palignr m1, m6, 1
pavgb m2, m3
pavgb m4, m5
pavgb m6, m1
pshufb m0, m0, m7
pshufb m2, m2, m7
pshufb m4, m4, m7
pshufb m6, m6, m7
punpcklqdq m0, m2
movu [r0 + 64], m0
punpcklqdq m4, m6
movu [r0 + 80], m4
movu m0, [r1 + 192]
palignr m1, m0, 1
movu m2, [r1 + 208]
palignr m3, m2, 1
movu m4, [r1 + 224]
palignr m5, m4, 1
movu m6, [r1 + 240]
pavgb m0, m1
palignr m1, m6, 1
pavgb m2, m3
pavgb m4, m5
pavgb m6, m1
pshufb m0, m0, m7
pshufb m2, m2, m7
pshufb m4, m4, m7
pshufb m6, m6, m7
punpcklqdq m0, m2
movu [r0 + 96], m0
punpcklqdq m4, m6
movu [r0 + 112], m4
%endif
RET
%if HIGH_BIT_DEPTH == 1
INIT_YMM avx2
cglobal scale1D_128to64, 2, 2, 3
pxor m2, m2
;Top pixel
movu m0, [r1]
movu m1, [r1 + 32]
phaddw m0, m1
pavgw m0, m2
vpermq m0, m0, 0xD8
movu [r0], m0
movu m0, [r1 + 64]
movu m1, [r1 + 96]
phaddw m0, m1
pavgw m0, m2
vpermq m0, m0, 0xD8
movu [r0 + 32], m0
movu m0, [r1 + 128]
movu m1, [r1 + 160]
phaddw m0, m1
pavgw m0, m2
vpermq m0, m0, 0xD8
movu [r0 + 64], m0
movu m0, [r1 + 192]
movu m1, [r1 + 224]
phaddw m0, m1
pavgw m0, m2
vpermq m0, m0, 0xD8
movu [r0 + 96], m0
;Left pixel
movu m0, [r1 + 256]
movu m1, [r1 + 288]
phaddw m0, m1
pavgw m0, m2
vpermq m0, m0, 0xD8
movu [r0 + 128], m0
movu m0, [r1 + 320]
movu m1, [r1 + 352]
phaddw m0, m1
pavgw m0, m2
vpermq m0, m0, 0xD8
movu [r0 + 160], m0
movu m0, [r1 + 384]
movu m1, [r1 + 416]
phaddw m0, m1
pavgw m0, m2
vpermq m0, m0, 0xD8
movu [r0 + 192], m0
movu m0, [r1 + 448]
movu m1, [r1 + 480]
phaddw m0, m1
pavgw m0, m2
vpermq m0, m0, 0xD8
movu [r0 + 224], m0
RET
%else ; HIGH_BIT_DEPTH == 0
INIT_YMM avx2
cglobal scale1D_128to64, 2, 2, 4
pxor m2, m2
mova m3, [pb_1]
;Top pixel
movu m0, [r1]
pmaddubsw m0, m0, m3
pavgw m0, m2
movu m1, [r1 + 32]
pmaddubsw m1, m1, m3
pavgw m1, m2
packuswb m0, m1
vpermq m0, m0, 0xD8
movu [r0], m0
movu m0, [r1 + 64]
pmaddubsw m0, m0, m3
pavgw m0, m2
movu m1, [r1 + 96]
pmaddubsw m1, m1, m3
pavgw m1, m2
packuswb m0, m1
vpermq m0, m0, 0xD8
movu [r0 + 32], m0
;Left pixel
movu m0, [r1 + 128]
pmaddubsw m0, m0, m3
pavgw m0, m2
movu m1, [r1 + 160]
pmaddubsw m1, m1, m3
pavgw m1, m2
packuswb m0, m1
vpermq m0, m0, 0xD8
movu [r0 + 64], m0
movu m0, [r1 + 192]
pmaddubsw m0, m0, m3
pavgw m0, m2
movu m1, [r1 + 224]
pmaddubsw m1, m1, m3
pavgw m1, m2
packuswb m0, m1
vpermq m0, m0, 0xD8
movu [r0 + 96], m0
RET
%endif
;-----------------------------------------------------------------
; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
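; C sketch of the assumed behavior (each output pixel is the rounded average
; of a 2x2 input block):
;
;   static void scale2D_64to32_ref(pixel* dst, const pixel* src, intptr_t stride)
;   {
;       for (int y = 0; y < 32; y++)
;           for (int x = 0; x < 32; x++)
;           {
;               const pixel* s = src + 2 * y * stride + 2 * x;
;               dst[y * 32 + x] = (pixel)((s[0] + s[1] + s[stride] + s[stride + 1] + 2) >> 2);
;           }
;   }
;
; The SSSE3 paths below stay in the pixel width by computing s = avg(i,j),
; t = avg(k,l), then avg(s,t), and subtracting ((i^j)|(k^l)) & (s^t) & 1 to
; undo the extra rounding, so the result equals (i+j+k+l+2)>>2 exactly.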
%if HIGH_BIT_DEPTH
INIT_XMM ssse3
cglobal scale2D_64to32, 3, 4, 8, dest, src, stride
mov r3d, 32
mova m7, [deinterleave_word_shuf]
add r2, r2
.loop:
movu m0, [r1] ;i
psrld m1, m0, 16 ;j
movu m2, [r1 + r2] ;k
psrld m3, m2, 16 ;l
movu m4, m0
movu m5, m2
pxor m4, m1 ;i^j
pxor m5, m3 ;k^l
por m4, m5 ;ij|kl
pavgw m0, m1 ;s
pavgw m2, m3 ;t
movu m5, m0
pavgw m0, m2 ;(s+t+1)/2
pxor m5, m2 ;s^t
pand m4, m5 ;(ij|kl)&st
pand m4, [hmulw_16p]
psubw m0, m4 ;Result
movu m1, [r1 + 16] ;i
psrld m2, m1, 16 ;j
movu m3, [r1 + r2 + 16] ;k
psrld m4, m3, 16 ;l
movu m5, m1
movu m6, m3
pxor m5, m2 ;i^j
pxor m6, m4 ;k^l
por m5, m6 ;ij|kl
pavgw m1, m2 ;s
pavgw m3, m4 ;t
movu m6, m1
pavgw m1, m3 ;(s+t+1)/2
pxor m6, m3 ;s^t
pand m5, m6 ;(ij|kl)&st
pand m5, [hmulw_16p]
psubw m1, m5 ;Result
pshufb m0, m7
pshufb m1, m7
punpcklqdq m0, m1
movu [r0], m0
movu m0, [r1 + 32] ;i
psrld m1, m0, 16 ;j
movu m2, [r1 + r2 + 32] ;k
psrld m3, m2, 16 ;l
movu m4, m0
movu m5, m2
pxor m4, m1 ;i^j
pxor m5, m3 ;k^l
por m4, m5 ;ij|kl
pavgw m0, m1 ;s
pavgw m2, m3 ;t
movu m5, m0
pavgw m0, m2 ;(s+t+1)/2
pxor m5, m2 ;s^t
pand m4, m5 ;(ij|kl)&st
pand m4, [hmulw_16p]
psubw m0, m4 ;Result
movu m1, [r1 + 48] ;i
psrld m2, m1, 16 ;j
movu m3, [r1 + r2 + 48] ;k
psrld m4, m3, 16 ;l
movu m5, m1
movu m6, m3
pxor m5, m2 ;i^j
pxor m6, m4 ;k^l
por m5, m6 ;ij|kl
pavgw m1, m2 ;s
pavgw m3, m4 ;t
movu m6, m1
pavgw m1, m3 ;(s+t+1)/2
pxor m6, m3 ;s^t
pand m5, m6 ;(ij|kl)&st
pand m5, [hmulw_16p]
psubw m1, m5 ;Result
pshufb m0, m7
pshufb m1, m7
punpcklqdq m0, m1
movu [r0 + 16], m0
movu m0, [r1 + 64] ;i
psrld m1, m0, 16 ;j
movu m2, [r1 + r2 + 64] ;k
psrld m3, m2, 16 ;l
movu m4, m0
movu m5, m2
pxor m4, m1 ;i^j
pxor m5, m3 ;k^l
por m4, m5 ;ij|kl
pavgw m0, m1 ;s
pavgw m2, m3 ;t
movu m5, m0
pavgw m0, m2 ;(s+t+1)/2
pxor m5, m2 ;s^t
pand m4, m5 ;(ij|kl)&st
pand m4, [hmulw_16p]
psubw m0, m4 ;Result
movu m1, [r1 + 80] ;i
psrld m2, m1, 16 ;j
movu m3, [r1 + r2 + 80] ;k
psrld m4, m3, 16 ;l
movu m5, m1
movu m6, m3
pxor m5, m2 ;i^j
pxor m6, m4 ;k^l
por m5, m6 ;ij|kl
pavgw m1, m2 ;s
pavgw m3, m4 ;t
movu m6, m1
pavgw m1, m3 ;(s+t+1)/2
pxor m6, m3 ;s^t
pand m5, m6 ;(ij|kl)&st
pand m5, [hmulw_16p]
psubw m1, m5 ;Result
pshufb m0, m7
pshufb m1, m7
punpcklqdq m0, m1
movu [r0 + 32], m0
movu m0, [r1 + 96] ;i
psrld m1, m0, 16 ;j
movu m2, [r1 + r2 + 96] ;k
psrld m3, m2, 16 ;l
movu m4, m0
movu m5, m2
pxor m4, m1 ;i^j
pxor m5, m3 ;k^l
por m4, m5 ;ij|kl
pavgw m0, m1 ;s
pavgw m2, m3 ;t
movu m5, m0
pavgw m0, m2 ;(s+t+1)/2
pxor m5, m2 ;s^t
pand m4, m5 ;(ij|kl)&st
pand m4, [hmulw_16p]
psubw m0, m4 ;Result
movu m1, [r1 + 112] ;i
psrld m2, m1, 16 ;j
movu m3, [r1 + r2 + 112] ;k
psrld m4, m3, 16 ;l
movu m5, m1
movu m6, m3
pxor m5, m2 ;i^j
pxor m6, m4 ;k^l
por m5, m6 ;ij|kl
pavgw m1, m2 ;s
pavgw m3, m4 ;t
movu m6, m1
pavgw m1, m3 ;(s+t+1)/2
pxor m6, m3 ;s^t
pand m5, m6 ;(ij|kl)&st
pand m5, [hmulw_16p]
psubw m1, m5 ;Result
pshufb m0, m7
pshufb m1, m7
punpcklqdq m0, m1
movu [r0 + 48], m0
lea r0, [r0 + 64]
lea r1, [r1 + 2 * r2]
dec r3d
jnz .loop
RET
%else
INIT_XMM ssse3
cglobal scale2D_64to32, 3, 4, 8, dest, src, stride
mov r3d, 32
mova m7, [deinterleave_shuf]
.loop:
movu m0, [r1] ;i
psrlw m1, m0, 8 ;j
movu m2, [r1 + r2] ;k
psrlw m3, m2, 8 ;l
movu m4, m0
movu m5, m2
pxor m4, m1 ;i^j
pxor m5, m3 ;k^l
por m4, m5 ;ij|kl
pavgb m0, m1 ;s
pavgb m2, m3 ;t
movu m5, m0
pavgb m0, m2 ;(s+t+1)/2
pxor m5, m2 ;s^t
pand m4, m5 ;(ij|kl)&st
pand m4, [hmul_16p]
psubb m0, m4 ;Result
movu m1, [r1 + 16] ;i
psrlw m2, m1, 8 ;j
movu m3, [r1 + r2 + 16] ;k
psrlw m4, m3, 8 ;l
movu m5, m1
movu m6, m3
pxor m5, m2 ;i^j
pxor m6, m4 ;k^l
por m5, m6 ;ij|kl
pavgb m1, m2 ;s
pavgb m3, m4 ;t
movu m6, m1
pavgb m1, m3 ;(s+t+1)/2
pxor m6, m3 ;s^t
pand m5, m6 ;(ij|kl)&st
pand m5, [hmul_16p]
psubb m1, m5 ;Result
pshufb m0, m0, m7
pshufb m1, m1, m7
punpcklqdq m0, m1
movu [r0], m0
movu m0, [r1 + 32] ;i
psrlw m1, m0, 8 ;j
movu m2, [r1 + r2 + 32] ;k
psrlw m3, m2, 8 ;l
movu m4, m0
movu m5, m2
pxor m4, m1 ;i^j
pxor m5, m3 ;k^l
por m4, m5 ;ij|kl
pavgb m0, m1 ;s
pavgb m2, m3 ;t
movu m5, m0
pavgb m0, m2 ;(s+t+1)/2
pxor m5, m2 ;s^t
pand m4, m5 ;(ij|kl)&st
pand m4, [hmul_16p]
psubb m0, m4 ;Result
movu m1, [r1 + 48] ;i
psrlw m2, m1, 8 ;j
movu m3, [r1 + r2 + 48] ;k
psrlw m4, m3, 8 ;l
movu m5, m1
movu m6, m3
pxor m5, m2 ;i^j
pxor m6, m4 ;k^l
por m5, m6 ;ij|kl
pavgb m1, m2 ;s
pavgb m3, m4 ;t
movu m6, m1
pavgb m1, m3 ;(s+t+1)/2
pxor m6, m3 ;s^t
pand m5, m6 ;(ij|kl)&st
pand m5, [hmul_16p]
psubb m1, m5 ;Result
pshufb m0, m0, m7
pshufb m1, m1, m7
punpcklqdq m0, m1
movu [r0 + 16], m0
lea r0, [r0 + 32]
lea r1, [r1 + 2 * r2]
dec r3d
jnz .loop
RET
%endif
;-----------------------------------------------------------------
; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
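; The AVX2 high-bit-depth path below takes a shorter route: paddw adds the two
; source rows, phaddw forms the horizontal pair sums, and pmulhrsw with
; pw_2000 (0x2000 = 2^13) evaluates (((sum * 2^13) >> 14) + 1) >> 1, which
; equals the rounded (sum + 2) >> 2 for any four-pixel sum that fits in 16
; bits. The 8-bit AVX2 path reuses the pavgb-plus-parity-correction trick from
; the SSSE3 version above, with pb_1 as the correction mask.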
%if HIGH_BIT_DEPTH
INIT_YMM avx2
cglobal scale2D_64to32, 3, 4, 5, dest, src, stride
mov r3d, 32
add r2d, r2d
mova m4, [pw_2000]
.loop:
movu m0, [r1]
movu m1, [r1 + 1 * mmsize]
movu m2, [r1 + r2]
movu m3, [r1 + r2 + 1 * mmsize]
paddw m0, m2
paddw m1, m3
phaddw m0, m1
pmulhrsw m0, m4
vpermq m0, m0, q3120
movu [r0], m0
movu m0, [r1 + 2 * mmsize]
movu m1, [r1 + 3 * mmsize]
movu m2, [r1 + r2 + 2 * mmsize]
movu m3, [r1 + r2 + 3 * mmsize]
paddw m0, m2
paddw m1, m3
phaddw m0, m1
pmulhrsw m0, m4
vpermq m0, m0, q3120
movu [r0 + mmsize], m0
add r0, 64
lea r1, [r1 + 2 * r2]
dec r3d
jnz .loop
RET
%else
INIT_YMM avx2
cglobal scale2D_64to32, 3, 5, 8, dest, src, stride
mov r3d, 16
mova m7, [deinterleave_shuf]
.loop:
movu m0, [r1] ; i
lea r4, [r1 + r2 * 2]
psrlw m1, m0, 8 ; j
movu m2, [r1 + r2] ; k
psrlw m3, m2, 8 ; l
pxor m4, m0, m1 ; i^j
pxor m5, m2, m3 ; k^l
por m4, m5 ; ij|kl
pavgb m0, m1 ; s
pavgb m2, m3 ; t
mova m5, m0
pavgb m0, m2 ; (s+t+1)/2
pxor m5, m2 ; s^t
pand m4, m5 ; (ij|kl)&st
pand m4, [pb_1]
psubb m0, m4 ; Result
movu m1, [r1 + 32] ; i
psrlw m2, m1, 8 ; j
movu m3, [r1 + r2 + 32] ; k
psrlw m4, m3, 8 ; l
pxor m5, m1, m2 ; i^j
pxor m6, m3, m4 ; k^l
por m5, m6 ; ij|kl
pavgb m1, m2 ; s
pavgb m3, m4 ; t
mova m6, m1
pavgb m1, m3 ; (s+t+1)/2
pxor m6, m3 ; s^t
pand m5, m6 ; (ij|kl)&st
pand m5, [pb_1]
psubb m1, m5 ; Result
pshufb m0, m0, m7
pshufb m1, m1, m7
punpcklqdq m0, m1
vpermq m0, m0, 11011000b
movu [r0], m0
add r0, 32
movu m0, [r4] ; i
psrlw m1, m0, 8 ; j
movu m2, [r4 + r2] ; k
psrlw m3, m2, 8 ; l
pxor m4, m0, m1 ; i^j
pxor m5, m2, m3 ; k^l
por m4, m5 ; ij|kl
pavgb m0, m1 ; s
pavgb m2, m3 ; t
mova m5, m0
pavgb m0, m2 ; (s+t+1)/2
pxor m5, m2 ; s^t
pand m4, m5 ; (ij|kl)&st
pand m4, [pb_1]
psubb m0, m4 ; Result
movu m1, [r4 + 32] ; i
psrlw m2, m1, 8 ; j
movu m3, [r4 + r2 + 32] ; k
psrlw m4, m3, 8 ; l
pxor m5, m1, m2 ; i^j
pxor m6, m3, m4 ; k^l
por m5, m6 ; ij|kl
pavgb m1, m2 ; s
pavgb m3, m4 ; t
mova m6, m1
pavgb m1, m3 ; (s+t+1)/2
pxor m6, m3 ; s^t
pand m5, m6 ; (ij|kl)&st
pand m5, [pb_1]
psubb m1, m5 ; Result
pshufb m0, m0, m7
pshufb m1, m1, m7
punpcklqdq m0, m1
vpermq m0, m0, 11011000b
movu [r0], m0
lea r1, [r1 + 4 * r2]
add r0, 32
dec r3d
jnz .loop
RET
%endif
;-----------------------------------------------------------------------------
; void pixel_sub_ps_4x4(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
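; C sketch of the behavior shared by all pixel_sub_ps_WxH kernels below (the
; difference is widened to int16_t; the asm doubles r1, and for 16-bit pixels
; also r4/r5, to turn element strides into byte offsets):
;
;   static void pixel_sub_ps_ref(int W, int H, int16_t* dst, intptr_t dstride,
;                                const pixel* src0, const pixel* src1,
;                                intptr_t sstride0, intptr_t sstride1)
;   {
;       for (int y = 0; y < H; y++)
;       {
;           for (int x = 0; x < W; x++)
;               dst[x] = (int16_t)(src0[x] - src1[x]);
;           dst  += dstride;
;           src0 += sstride0;
;           src1 += sstride1;
;       }
;   }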
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1
add r4, r4
add r5, r5
add r1, r1
movh m0, [r2]
movh m2, [r2 + r4]
movh m1, [r3]
movh m3, [r3 + r5]
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
movh m4, [r2]
movh m6, [r2 + r4]
movh m5, [r3]
movh m7, [r3 + r5]
psubw m0, m1
psubw m2, m3
psubw m4, m5
psubw m6, m7
movh [r0], m0
movh [r0 + r1], m2
lea r0, [r0 + r1 * 2]
movh [r0], m4
movh [r0 + r1], m6
RET
%else
INIT_XMM sse4
cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1
add r1, r1
movd m0, [r2]
movd m2, [r2 + r4]
movd m1, [r3]
movd m3, [r3 + r5]
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
movd m4, [r2]
movd m6, [r2 + r4]
movd m5, [r3]
movd m7, [r3 + r5]
punpckldq m0, m2
punpckldq m1, m3
punpckldq m4, m6
punpckldq m5, m7
pmovzxbw m0, m0
pmovzxbw m1, m1
pmovzxbw m4, m4
pmovzxbw m5, m5
psubw m0, m1
psubw m4, m5
movh [r0], m0
movhps [r0 + r1], m0
movh [r0 + r1 * 2], m4
lea r0, [r0 + r1 * 2]
movhps [r0 + r1], m4
RET
%endif
;-----------------------------------------------------------------------------
; void pixel_sub_ps_4x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%macro PIXELSUB_PS_W4_H4 2
%if HIGH_BIT_DEPTH
cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
mov r6d, %2/4
add r4, r4
add r5, r5
add r1, r1
.loop:
movh m0, [r2]
movh m2, [r2 + r4]
movh m1, [r3]
movh m3, [r3 + r5]
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
movh m4, [r2]
movh m6, [r2 + r4]
movh m5, [r3]
movh m7, [r3 + r5]
dec r6d
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
psubw m0, m1
psubw m2, m3
psubw m4, m5
psubw m6, m7
movh [r0], m0
movh [r0 + r1], m2
movh [r0 + r1 * 2], m4
lea r0, [r0 + r1 * 2]
movh [r0 + r1], m6
lea r0, [r0 + r1 * 2]
jnz .loop
RET
%else
cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
mov r6d, %2/4
add r1, r1
.loop:
movd m0, [r2]
movd m2, [r2 + r4]
movd m1, [r3]
movd m3, [r3 + r5]
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
movd m4, [r2]
movd m6, [r2 + r4]
movd m5, [r3]
movd m7, [r3 + r5]
dec r6d
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
punpckldq m0, m2
punpckldq m1, m3
punpckldq m4, m6
punpckldq m5, m7
pmovzxbw m0, m0
pmovzxbw m1, m1
pmovzxbw m4, m4
pmovzxbw m5, m5
psubw m0, m1
psubw m4, m5
movh [r0], m0
movhps [r0 + r1], m0
movh [r0 + r1 * 2], m4
lea r0, [r0 + r1 * 2]
movhps [r0 + r1], m4
lea r0, [r0 + r1 * 2]
jnz .loop
RET
%endif
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PIXELSUB_PS_W4_H4 4, 8
%else
INIT_XMM sse4
PIXELSUB_PS_W4_H4 4, 8
%endif
;-----------------------------------------------------------------------------
; void pixel_sub_ps_8x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%macro PIXELSUB_PS_W8_H4 2
%if HIGH_BIT_DEPTH
cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
mov r6d, %2/4
add r4, r4
add r5, r5
add r1, r1
.loop:
movu m0, [r2]
movu m2, [r2 + r4]
movu m1, [r3]
movu m3, [r3 + r5]
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
movu m4, [r2]
movu m6, [r2 + r4]
movu m5, [r3]
movu m7, [r3 + r5]
dec r6d
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
psubw m0, m1
psubw m2, m3
psubw m4, m5
psubw m6, m7
movu [r0], m0
movu [r0 + r1], m2
movu [r0 + r1 * 2], m4
lea r0, [r0 + r1 * 2]
movu [r0 + r1], m6
lea r0, [r0 + r1 * 2]
jnz .loop
RET
%else
cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
mov r6d, %2/4
add r1, r1
.loop:
movh m0, [r2]
movh m2, [r2 + r4]
movh m1, [r3]
movh m3, [r3 + r5]
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
movh m4, [r2]
movh m6, [r2 + r4]
movh m5, [r3]
movh m7, [r3 + r5]
dec r6d
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
pmovzxbw m0, m0
pmovzxbw m1, m1
pmovzxbw m2, m2
pmovzxbw m3, m3
pmovzxbw m4, m4
pmovzxbw m5, m5
pmovzxbw m6, m6
pmovzxbw m7, m7
psubw m0, m1
psubw m2, m3
psubw m4, m5
psubw m6, m7
movu [r0], m0
movu [r0 + r1], m2
movu [r0 + r1 * 2], m4
lea r0, [r0 + r1 * 2]
movu [r0 + r1], m6
lea r0, [r0 + r1 * 2]
jnz .loop
RET
%endif
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PIXELSUB_PS_W8_H4 8, 8
PIXELSUB_PS_W8_H4 8, 16
%else
INIT_XMM sse4
PIXELSUB_PS_W8_H4 8, 8
PIXELSUB_PS_W8_H4 8, 16
%endif
;-----------------------------------------------------------------------------
; void pixel_sub_ps_16x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%macro PIXELSUB_PS_W16_H4 2
%if HIGH_BIT_DEPTH
cglobal pixel_sub_ps_16x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
mov r6d, %2/4
add r4, r4
add r5, r5
add r1, r1
.loop:
movu m0, [r2]
movu m2, [r2 + 16]
movu m1, [r3]
movu m3, [r3 + 16]
movu m4, [r2 + r4]
movu m6, [r2 + r4 + 16]
movu m5, [r3 + r5]
movu m7, [r3 + r5 + 16]
dec r6d
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
psubw m0, m1
psubw m2, m3
psubw m4, m5
psubw m6, m7
movu [r0], m0
movu [r0 + 16], m2
movu [r0 + r1], m4
movu [r0 + r1 + 16], m6
movu m0, [r2]
movu m2, [r2 + 16]
movu m1, [r3]
movu m3, [r3 + 16]
movu m4, [r2 + r4]
movu m5, [r3 + r5]
movu m6, [r2 + r4 + 16]
movu m7, [r3 + r5 + 16]
lea r0, [r0 + r1 * 2]
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
psubw m0, m1
psubw m2, m3
psubw m4, m5
psubw m6, m7
movu [r0], m0
movu [r0 + 16], m2
movu [r0 + r1], m4
movu [r0 + r1 + 16], m6
lea r0, [r0 + r1 * 2]
jnz .loop
RET
%else
cglobal pixel_sub_ps_16x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
mov r6d, %2/4
pxor m6, m6
add r1, r1
.loop:
movu m1, [r2]
movu m3, [r3]
pmovzxbw m0, m1
pmovzxbw m2, m3
punpckhbw m1, m6
punpckhbw m3, m6
psubw m0, m2
psubw m1, m3
movu m5, [r2 + r4]
movu m3, [r3 + r5]
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
pmovzxbw m4, m5
pmovzxbw m2, m3
punpckhbw m5, m6
punpckhbw m3, m6
psubw m4, m2
psubw m5, m3
movu [r0], m0
movu [r0 + 16], m1
movu [r0 + r1], m4
movu [r0 + r1 + 16], m5
movu m1, [r2]
movu m3, [r3]
pmovzxbw m0, m1
pmovzxbw m2, m3
punpckhbw m1, m6
punpckhbw m3, m6
psubw m0, m2
psubw m1, m3
movu m5, [r2 + r4]
movu m3, [r3 + r5]
dec r6d
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
lea r0, [r0 + r1 * 2]
pmovzxbw m4, m5
pmovzxbw m2, m3
punpckhbw m5, m6
punpckhbw m3, m6
psubw m4, m2
psubw m5, m3
movu [r0], m0
movu [r0 + 16], m1
movu [r0 + r1], m4
movu [r0 + r1 + 16], m5
lea r0, [r0 + r1 * 2]
jnz .loop
RET
%endif
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PIXELSUB_PS_W16_H4 16, 16
PIXELSUB_PS_W16_H4 16, 32
%else
INIT_XMM sse4
PIXELSUB_PS_W16_H4 16, 16
PIXELSUB_PS_W16_H4 16, 32
%endif
;-----------------------------------------------------------------------------
; void pixel_sub_ps_16x16(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
%macro PIXELSUB_PS_W16_H4_avx2 1
%if ARCH_X86_64
INIT_YMM avx2
cglobal pixel_sub_ps_16x%1, 6, 9, 4, dest, deststride, src0, src1, srcstride0, srcstride1
add r1d, r1d
add r4d, r4d
add r5d, r5d
lea r6, [r1 * 3]
lea r7, [r4 * 3]
lea r8, [r5 * 3]
%rep %1/4
movu m0, [r2]
movu m1, [r3]
movu m2, [r2 + r4]
movu m3, [r3 + r5]
psubw m0, m1
psubw m2, m3
movu [r0], m0
movu [r0 + r1], m2
movu m0, [r2 + r4 * 2]
movu m1, [r3 + r5 * 2]
movu m2, [r2 + r7]
movu m3, [r3 + r8]
psubw m0, m1
psubw m2, m3
movu [r0 + r1 * 2], m0
movu [r0 + r6], m2
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r4 * 4]
lea r3, [r3 + r5 * 4]
%endrep
RET
%endif
%endmacro
PIXELSUB_PS_W16_H4_avx2 16
PIXELSUB_PS_W16_H4_avx2 32
%else
;-----------------------------------------------------------------------------
; void pixel_sub_ps_16x16(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%macro PIXELSUB_PS_W16_H8_avx2 2
%if ARCH_X86_64
INIT_YMM avx2
cglobal pixel_sub_ps_16x%2, 6, 10, 4, dest, deststride, src0, src1, srcstride0, srcstride1
add r1, r1
lea r6, [r1 * 3]
mov r7d, %2/8
lea r9, [r4 * 3]
lea r8, [r5 * 3]
.loop:
pmovzxbw m0, [r2]
pmovzxbw m1, [r3]
pmovzxbw m2, [r2 + r4]
pmovzxbw m3, [r3 + r5]
psubw m0, m1
psubw m2, m3
movu [r0], m0
movu [r0 + r1], m2
pmovzxbw m0, [r2 + 2 * r4]
pmovzxbw m1, [r3 + 2 * r5]
pmovzxbw m2, [r2 + r9]
pmovzxbw m3, [r3 + r8]
psubw m0, m1
psubw m2, m3
movu [r0 + r1 * 2], m0
movu [r0 + r6], m2
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r4 * 4]
lea r3, [r3 + r5 * 4]
pmovzxbw m0, [r2]
pmovzxbw m1, [r3]
pmovzxbw m2, [r2 + r4]
pmovzxbw m3, [r3 + r5]
psubw m0, m1
psubw m2, m3
movu [r0], m0
movu [r0 + r1], m2
pmovzxbw m0, [r2 + 2 * r4]
pmovzxbw m1, [r3 + 2 * r5]
pmovzxbw m2, [r2 + r9]
pmovzxbw m3, [r3 + r8]
psubw m0, m1
psubw m2, m3
movu [r0 + r1 * 2], m0
movu [r0 + r6], m2
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r4 * 4]
lea r3, [r3 + r5 * 4]
dec r7d
jnz .loop
RET
%endif
%endmacro
PIXELSUB_PS_W16_H8_avx2 16, 16
PIXELSUB_PS_W16_H8_avx2 16, 32
%endif
;-----------------------------------------------------------------------------
; void pixel_sub_ps_32x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%macro PIXELSUB_PS_W32_H2 2
%if HIGH_BIT_DEPTH
cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
mov r6d, %2/2
add r4, r4
add r5, r5
add r1, r1
.loop:
movu m0, [r2]
movu m2, [r2 + 16]
movu m4, [r2 + 32]
movu m6, [r2 + 48]
movu m1, [r3]
movu m3, [r3 + 16]
movu m5, [r3 + 32]
movu m7, [r3 + 48]
dec r6d
psubw m0, m1
psubw m2, m3
psubw m4, m5
psubw m6, m7
movu [r0], m0
movu [r0 + 16], m2
movu [r0 + 32], m4
movu [r0 + 48], m6
movu m0, [r2 + r4]
movu m2, [r2 + r4 + 16]
movu m4, [r2 + r4 + 32]
movu m6, [r2 + r4 + 48]
movu m1, [r3 + r5]
movu m3, [r3 + r5 + 16]
movu m5, [r3 + r5 + 32]
movu m7, [r3 + r5 + 48]
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
psubw m0, m1
psubw m2, m3
psubw m4, m5
psubw m6, m7
movu [r0 + r1], m0
movu [r0 + r1 + 16], m2
movu [r0 + r1 + 32], m4
movu [r0 + r1 + 48], m6
lea r0, [r0 + r1 * 2]
jnz .loop
RET
%else
cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
mov r6d, %2/2
add r1, r1
.loop:
movh m0, [r2]
movh m1, [r2 + 8]
movh m2, [r2 + 16]
movh m6, [r2 + 24]
movh m3, [r3]
movh m4, [r3 + 8]
movh m5, [r3 + 16]
movh m7, [r3 + 24]
dec r6d
pmovzxbw m0, m0
pmovzxbw m1, m1
pmovzxbw m2, m2
pmovzxbw m6, m6
pmovzxbw m3, m3
pmovzxbw m4, m4
pmovzxbw m5, m5
pmovzxbw m7, m7
psubw m0, m3
psubw m1, m4
psubw m2, m5
psubw m6, m7
movu [r0], m0
movu [r0 + 16], m1
movu [r0 + 32], m2
movu [r0 + 48], m6
movh m0, [r2 + r4]
movh m1, [r2 + r4 + 8]
movh m2, [r2 + r4 + 16]
movh m6, [r2 + r4 + 24]
movh m3, [r3 + r5]
movh m4, [r3 + r5 + 8]
movh m5, [r3 + r5 + 16]
movh m7, [r3 + r5 + 24]
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
pmovzxbw m0, m0
pmovzxbw m1, m1
pmovzxbw m2, m2
pmovzxbw m6, m6
pmovzxbw m3, m3
pmovzxbw m4, m4
pmovzxbw m5, m5
pmovzxbw m7, m7
psubw m0, m3
psubw m1, m4
psubw m2, m5
psubw m6, m7
movu [r0 + r1], m0
movu [r0 + r1 + 16], m1
movu [r0 + r1 + 32], m2
movu [r0 + r1 + 48], m6
lea r0, [r0 + r1 * 2]
jnz .loop
RET
%endif
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PIXELSUB_PS_W32_H2 32, 32
PIXELSUB_PS_W32_H2 32, 64
%else
INIT_XMM sse4
PIXELSUB_PS_W32_H2 32, 32
PIXELSUB_PS_W32_H2 32, 64
%endif
;-----------------------------------------------------------------------------
; void pixel_sub_ps_32x32(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
%macro PIXELSUB_PS_W32_H4_avx2 1
%if ARCH_X86_64
INIT_YMM avx2
cglobal pixel_sub_ps_32x%1, 6, 10, 4, dest, deststride, src0, src1, srcstride0, srcstride1
add r1d, r1d
add r4d, r4d
add r5d, r5d
mov r9d, %1/4
lea r6, [r1 * 3]
lea r7, [r4 * 3]
lea r8, [r5 * 3]
.loop:
movu m0, [r2]
movu m1, [r2 + 32]
movu m2, [r3]
movu m3, [r3 + 32]
psubw m0, m2
psubw m1, m3
movu [r0], m0
movu [r0 + 32], m1
movu m0, [r2 + r4]
movu m1, [r2 + r4 + 32]
movu m2, [r3 + r5]
movu m3, [r3 + r5 + 32]
psubw m0, m2
psubw m1, m3
movu [r0 + r1], m0
movu [r0 + r1 + 32], m1
movu m0, [r2 + r4 * 2]
movu m1, [r2 + r4 * 2 + 32]
movu m2, [r3 + r5 * 2]
movu m3, [r3 + r5 * 2 + 32]
psubw m0, m2
psubw m1, m3
movu [r0 + r1 * 2], m0
movu [r0 + r1 * 2 + 32], m1
movu m0, [r2 + r7]
movu m1, [r2 + r7 + 32]
movu m2, [r3 + r8]
movu m3, [r3 + r8 + 32]
psubw m0, m2
psubw m1, m3
movu [r0 + r6], m0
movu [r0 + r6 + 32], m1
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r4 * 4]
lea r3, [r3 + r5 * 4]
dec r9d
jnz .loop
RET
%endif
%endmacro
PIXELSUB_PS_W32_H4_avx2 32
PIXELSUB_PS_W32_H4_avx2 64
%else
;-----------------------------------------------------------------------------
; void pixel_sub_ps_32x32(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%macro PIXELSUB_PS_W32_H8_avx2 2
%if ARCH_X86_64
INIT_YMM avx2
cglobal pixel_sub_ps_32x%2, 6, 10, 4, dest, deststride, src0, src1, srcstride0, srcstride1
mov r6d, %2/8
add r1, r1
lea r7, [r4 * 3]
lea r8, [r5 * 3]
lea r9, [r1 * 3]
.loop:
pmovzxbw m0, [r2]
pmovzxbw m1, [r2 + 16]
pmovzxbw m2, [r3]
pmovzxbw m3, [r3 + 16]
psubw m0, m2
psubw m1, m3
movu [r0], m0
movu [r0 + 32], m1
pmovzxbw m0, [r2 + r4]
pmovzxbw m1, [r2 + r4 + 16]
pmovzxbw m2, [r3 + r5]
pmovzxbw m3, [r3 + r5 + 16]
psubw m0, m2
psubw m1, m3
movu [r0 + r1], m0
movu [r0 + r1 + 32], m1
pmovzxbw m0, [r2 + 2 * r4]
pmovzxbw m1, [r2 + 2 * r4 + 16]
pmovzxbw m2, [r3 + 2 * r5]
pmovzxbw m3, [r3 + 2 * r5 + 16]
psubw m0, m2
psubw m1, m3
movu [r0 + r1 * 2 ], m0
movu [r0 + r1 * 2 + 32], m1
pmovzxbw m0, [r2 + r7]
pmovzxbw m1, [r2 + r7 + 16]
pmovzxbw m2, [r3 + r8]
pmovzxbw m3, [r3 + r8 + 16]
psubw m0, m2
psubw m1, m3
movu [r0 + r9], m0
movu [r0 + r9 + 32], m1
lea r2, [r2 + r4 * 4]
lea r3, [r3 + r5 * 4]
lea r0, [r0 + r1 * 4]
pmovzxbw m0, [r2]
pmovzxbw m1, [r2 + 16]
pmovzxbw m2, [r3]
pmovzxbw m3, [r3 + 16]
psubw m0, m2
psubw m1, m3
movu [r0 ], m0
movu [r0 + 32], m1
pmovzxbw m0, [r2 + r4]
pmovzxbw m1, [r2 + r4 + 16]
pmovzxbw m2, [r3 + r5]
pmovzxbw m3, [r3 + r5 + 16]
psubw m0, m2
psubw m1, m3
movu [r0 + r1], m0
movu [r0 + r1 + 32], m1
pmovzxbw m0, [r2 + 2 * r4]
pmovzxbw m1, [r2 + 2 * r4 + 16]
pmovzxbw m2, [r3 + 2 * r5]
pmovzxbw m3, [r3 + 2 * r5 + 16]
psubw m0, m2
psubw m1, m3
movu [r0 + r1 * 2], m0
movu [r0 + r1 * 2 + 32], m1
pmovzxbw m0, [r2 + r7]
pmovzxbw m1, [r2 + r7 + 16]
pmovzxbw m2, [r3 + r8]
pmovzxbw m3, [r3 + r8 + 16]
psubw m0, m2
psubw m1, m3
movu [r0 + r9], m0
movu [r0 + r9 + 32], m1
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r4 * 4]
lea r3, [r3 + r5 * 4]
dec r6d
jnz .loop
RET
%endif
%endmacro
PIXELSUB_PS_W32_H8_avx2 32, 32
PIXELSUB_PS_W32_H8_avx2 32, 64
%endif
;-----------------------------------------------------------------------------
; void pixel_sub_ps_64x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%macro PIXELSUB_PS_W64_H2 2
%if HIGH_BIT_DEPTH
cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
mov r6d, %2/2
add r4, r4
add r5, r5
add r1, r1
.loop:
movu m0, [r2]
movu m2, [r2 + 16]
movu m4, [r2 + 32]
movu m6, [r2 + 48]
movu m1, [r3]
movu m3, [r3 + 16]
movu m5, [r3 + 32]
movu m7, [r3 + 48]
psubw m0, m1
psubw m2, m3
psubw m4, m5
psubw m6, m7
movu [r0], m0
movu [r0 + 16], m2
movu [r0 + 32], m4
movu [r0 + 48], m6
movu m0, [r2 + 64]
movu m2, [r2 + 80]
movu m4, [r2 + 96]
movu m6, [r2 + 112]
movu m1, [r3 + 64]
movu m3, [r3 + 80]
movu m5, [r3 + 96]
movu m7, [r3 + 112]
psubw m0, m1
psubw m2, m3
psubw m4, m5
psubw m6, m7
movu [r0 + 64], m0
movu [r0 + 80], m2
movu [r0 + 96], m4
movu [r0 + 112], m6
movu m0, [r2 + r4]
movu m2, [r2 + r4 + 16]
movu m4, [r2 + r4 + 32]
movu m6, [r2 + r4 + 48]
movu m1, [r3 + r5]
movu m3, [r3 + r5 + 16]
movu m5, [r3 + r5 + 32]
movu m7, [r3 + r5 + 48]
psubw m0, m1
psubw m2, m3
psubw m4, m5
psubw m6, m7
movu [r0 + r1], m0
movu [r0 + r1 + 16], m2
movu [r0 + r1 + 32], m4
movu [r0 + r1 + 48], m6
movu m0, [r2 + r4 + 64]
movu m2, [r2 + r4 + 80]
movu m4, [r2 + r4 + 96]
movu m6, [r2 + r4 + 112]
movu m1, [r3 + r5 + 64]
movu m3, [r3 + r5 + 80]
movu m5, [r3 + r5 + 96]
movu m7, [r3 + r5 + 112]
dec r6d
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
psubw m0, m1
psubw m2, m3
psubw m4, m5
psubw m6, m7
movu [r0 + r1 + 64], m0
movu [r0 + r1 + 80], m2
movu [r0 + r1 + 96], m4
movu [r0 + r1 + 112], m6
lea r0, [r0 + r1 * 2]
jnz .loop
RET
%else
cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
mov r6d, %2/2
pxor m6, m6
add r1, r1
.loop:
movu m1, [r2]
movu m5, [r2 + 16]
movu m3, [r3]
movu m7, [r3 + 16]
pmovzxbw m0, m1
pmovzxbw m4, m5
pmovzxbw m2, m3
punpckhbw m1, m6
punpckhbw m3, m6
punpckhbw m5, m6
psubw m0, m2
psubw m1, m3
pmovzxbw m2, m7
punpckhbw m7, m6
psubw m4, m2
psubw m5, m7
movu m3, [r2 + 32]
movu m7, [r3 + 32]
pmovzxbw m2, m3
punpckhbw m3, m6
movu [r0], m0
movu [r0 + 16], m1
movu [r0 + 32], m4
movu [r0 + 48], m5
movu m1, [r2 + 48]
movu m5, [r3 + 48]
pmovzxbw m0, m1
pmovzxbw m4, m7
punpckhbw m1, m6
punpckhbw m7, m6
psubw m2, m4
psubw m3, m7
movu [r0 + 64], m2
movu [r0 + 80], m3
movu m7, [r2 + r4]
movu m3, [r3 + r5]
pmovzxbw m2, m5
pmovzxbw m4, m7
punpckhbw m5, m6
punpckhbw m7, m6
psubw m0, m2
psubw m1, m5
movu [r0 + 96], m0
movu [r0 + 112], m1
movu m2, [r2 + r4 + 16]
movu m5, [r3 + r5 + 16]
pmovzxbw m0, m3
pmovzxbw m1, m2
punpckhbw m3, m6
punpckhbw m2, m6
psubw m4, m0
psubw m7, m3
movu [r0 + r1], m4
movu [r0 + r1 + 16], m7
movu m0, [r2 + r4 + 32]
movu m3, [r3 + r5 + 32]
dec r6d
pmovzxbw m4, m5
pmovzxbw m7, m0
punpckhbw m5, m6
punpckhbw m0, m6
psubw m1, m4
psubw m2, m5
movu [r0 + r1 + 32], m1
movu [r0 + r1 + 48], m2
movu m4, [r2 + r4 + 48]
movu m5, [r3 + r5 + 48]
lea r2, [r2 + r4 * 2]
lea r3, [r3 + r5 * 2]
pmovzxbw m1, m3
pmovzxbw m2, m4
punpckhbw m3, m6
punpckhbw m4, m6
psubw m7, m1
psubw m0, m3
movu [r0 + r1 + 64], m7
movu [r0 + r1 + 80], m0
pmovzxbw m7, m5
punpckhbw m5, m6
psubw m2, m7
psubw m4, m5
movu [r0 + r1 + 96], m2
movu [r0 + r1 + 112], m4
lea r0, [r0 + r1 * 2]
jnz .loop
RET
%endif
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PIXELSUB_PS_W64_H2 64, 64
%else
INIT_XMM sse4
PIXELSUB_PS_W64_H2 64, 64
%endif
;-----------------------------------------------------------------------------
; void pixel_sub_ps_64x64(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
%if ARCH_X86_64
INIT_YMM avx2
cglobal pixel_sub_ps_64x64, 6, 10, 8, dest, deststride, src0, src1, srcstride0, srcstride1
add r1d, r1d
add r4d, r4d
add r5d, r5d
mov r9d, 16
lea r6, [r1 * 3]
lea r7, [r4 * 3]
lea r8, [r5 * 3]
.loop:
movu m0, [r2]
movu m1, [r2 + 32]
movu m2, [r2 + 64]
movu m3, [r2 + 96]
movu m4, [r3]
movu m5, [r3 + 32]
movu m6, [r3 + 64]
movu m7, [r3 + 96]
psubw m0, m4
psubw m1, m5
psubw m2, m6
psubw m3, m7
movu [r0], m0
movu [r0 + 32], m1
movu [r0 + 64], m2
movu [r0 + 96], m3
movu m0, [r2 + r4]
movu m1, [r2 + r4 + 32]
movu m2, [r2 + r4 + 64]
movu m3, [r2 + r4 + 96]
movu m4, [r3 + r5]
movu m5, [r3 + r5 + 32]
movu m6, [r3 + r5 + 64]
movu m7, [r3 + r5 + 96]
psubw m0, m4
psubw m1, m5
psubw m2, m6
psubw m3, m7
movu [r0 + r1], m0
movu [r0 + r1 + 32], m1
movu [r0 + r1 + 64], m2
movu [r0 + r1 + 96], m3
movu m0, [r2 + r4 * 2]
movu m1, [r2 + r4 * 2 + 32]
movu m2, [r2 + r4 * 2 + 64]
movu m3, [r2 + r4 * 2 + 96]
movu m4, [r3 + r5 * 2]
movu m5, [r3 + r5 * 2 + 32]
movu m6, [r3 + r5 * 2 + 64]
movu m7, [r3 + r5 * 2 + 96]
psubw m0, m4
psubw m1, m5
psubw m2, m6
psubw m3, m7
movu [r0 + r1 * 2], m0
movu [r0 + r1 * 2 + 32], m1
movu [r0 + r1 * 2 + 64], m2
movu [r0 + r1 * 2 + 96], m3
movu m0, [r2 + r7]
movu m1, [r2 + r7 + 32]
movu m2, [r2 + r7 + 64]
movu m3, [r2 + r7 + 96]
movu m4, [r3 + r8]
movu m5, [r3 + r8 + 32]
movu m6, [r3 + r8 + 64]
movu m7, [r3 + r8 + 96]
psubw m0, m4
psubw m1, m5
psubw m2, m6
psubw m3, m7
movu [r0 + r6], m0
movu [r0 + r6 + 32], m1
movu [r0 + r6 + 64], m2
movu [r0 + r6 + 96], m3
lea r0, [r0 + r1 * 4]
lea r2, [r2 + r4 * 4]
lea r3, [r3 + r5 * 4]
dec r9d
jnz .loop
RET
%endif
%else
;-----------------------------------------------------------------------------
; void pixel_sub_ps_64x64(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal pixel_sub_ps_64x64, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
mov r6d, 16
add r1, r1
.loop:
pmovzxbw m0, [r2]
pmovzxbw m1, [r2 + 16]
pmovzxbw m2, [r2 + 32]
pmovzxbw m3, [r2 + 48]
pmovzxbw m4, [r3]
pmovzxbw m5, [r3 + 16]
pmovzxbw m6, [r3 + 32]
pmovzxbw m7, [r3 + 48]
psubw m0, m4
psubw m1, m5
psubw m2, m6
psubw m3, m7
movu [r0], m0
movu [r0 + 32], m1
movu [r0 + 64], m2
movu [r0 + 96], m3
add r0, r1
add r2, r4
add r3, r5
pmovzxbw m0, [r2]
pmovzxbw m1, [r2 + 16]
pmovzxbw m2, [r2 + 32]
pmovzxbw m3, [r2 + 48]
pmovzxbw m4, [r3]
pmovzxbw m5, [r3 + 16]
pmovzxbw m6, [r3 + 32]
pmovzxbw m7, [r3 + 48]
psubw m0, m4
psubw m1, m5
psubw m2, m6
psubw m3, m7
movu [r0], m0
movu [r0 + 32], m1
movu [r0 + 64], m2
movu [r0 + 96], m3
add r0, r1
add r2, r4
add r3, r5
pmovzxbw m0, [r2]
pmovzxbw m1, [r2 + 16]
pmovzxbw m2, [r2 + 32]
pmovzxbw m3, [r2 + 48]
pmovzxbw m4, [r3]
pmovzxbw m5, [r3 + 16]
pmovzxbw m6, [r3 + 32]
pmovzxbw m7, [r3 + 48]
psubw m0, m4
psubw m1, m5
psubw m2, m6
psubw m3, m7
movu [r0], m0
movu [r0 + 32], m1
movu [r0 + 64], m2
movu [r0 + 96], m3
add r0, r1
add r2, r4
add r3, r5
pmovzxbw m0, [r2]
pmovzxbw m1, [r2 + 16]
pmovzxbw m2, [r2 + 32]
pmovzxbw m3, [r2 + 48]
pmovzxbw m4, [r3]
pmovzxbw m5, [r3 + 16]
pmovzxbw m6, [r3 + 32]
pmovzxbw m7, [r3 + 48]
psubw m0, m4
psubw m1, m5
psubw m2, m6
psubw m3, m7
movu [r0], m0
movu [r0 + 32], m1
movu [r0 + 64], m2
movu [r0 + 96], m3
add r0, r1
add r2, r4
add r3, r5
dec r6d
jnz .loop
RET
%endif
;=============================================================================
; variance
;=============================================================================
%macro VAR_START 1
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
%if HIGH_BIT_DEPTH == 0
%if %1
mova m7, [pw_00ff]
%elif mmsize < 32
pxor m7, m7 ; zero
%endif
%endif ; !HIGH_BIT_DEPTH
%endmacro
%macro VAR_END 2
%if HIGH_BIT_DEPTH
%if mmsize == 8 && %1*%2 == 256
HADDUW m5, m2
%else
%if %1 >= 32
HADDW m5, m2
movd m7, r4d
paddd m5, m7
%else
HADDW m5, m2
%endif
%endif
%else ; !HIGH_BIT_DEPTH
%if %1 == 64
HADDW m5, m2
movd m7, r4d
paddd m5, m7
%else
HADDW m5, m2
%endif
%endif ; HIGH_BIT_DEPTH
HADDD m6, m1
%if ARCH_X86_64
punpckldq m5, m6
movq rax, m5
%else
movd eax, m5
movd edx, m6
%endif
RET
%endmacro
%macro VAR_END_12bit 2
HADDD m5, m1
HADDD m6, m1
%if ARCH_X86_64
punpckldq m5, m6
movq rax, m5
%else
movd eax, m5
movd edx, m6
%endif
RET
%endmacro
%macro VAR_CORE 0
paddw m5, m0
paddw m5, m3
paddw m5, m1
paddw m5, m4
pmaddwd m0, m0
pmaddwd m3, m3
pmaddwd m1, m1
pmaddwd m4, m4
paddd m6, m0
paddd m6, m3
paddd m6, m1
paddd m6, m4
%endmacro
%macro VAR_2ROW 2
mov r2d, %2
%%loop:
%if HIGH_BIT_DEPTH
movu m0, [r0]
movu m1, [r0+mmsize]
movu m3, [r0+%1]
movu m4, [r0+%1+mmsize]
%else ; !HIGH_BIT_DEPTH
mova m0, [r0]
punpckhbw m1, m0, m7
mova m3, [r0+%1]
mova m4, m3
punpcklbw m0, m7
%endif ; HIGH_BIT_DEPTH
%ifidn %1, r1
lea r0, [r0+%1*2]
%else
add r0, r1
%endif
%if HIGH_BIT_DEPTH == 0
punpcklbw m3, m7
punpckhbw m4, m7
%endif ; !HIGH_BIT_DEPTH
VAR_CORE
dec r2d
jg %%loop
%endmacro
;-----------------------------------------------------------------------------
; int pixel_var_wxh( uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
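; C sketch of the assumed return convention (pixel sum in the low 32 bits,
; sum of squares in the high 32 bits, matching the punpckldq/movq packing in
; VAR_END; the 32-bit path returns the two halves in eax/edx instead):
;
;   static uint64_t pixel_var_ref(int W, int H, const pixel* pix, intptr_t stride)
;   {
;       uint32_t sum = 0, sqr = 0;
;       for (int y = 0; y < H; y++, pix += stride)
;           for (int x = 0; x < W; x++)
;           {
;               sum += pix[x];
;               sqr += (uint32_t)pix[x] * pix[x];
;           }
;       return sum + ((uint64_t)sqr << 32);
;   }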
INIT_MMX mmx2
cglobal pixel_var_16x16, 2,3
FIX_STRIDES r1
VAR_START 0
VAR_2ROW 8*SIZEOF_PIXEL, 16
VAR_END 16, 16
cglobal pixel_var_8x8, 2,3
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 4
VAR_END 8, 8
%if HIGH_BIT_DEPTH
%macro VAR 0
%if BIT_DEPTH <= 10
cglobal pixel_var_16x16, 2,3,8
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 8
VAR_END 16, 16
cglobal pixel_var_32x32, 2,6,8
FIX_STRIDES r1
mov r3, r0
VAR_START 0
VAR_2ROW r1, 8
HADDW m5, m2
movd r4d, m5
pxor m5, m5
VAR_2ROW r1, 8
HADDW m5, m2
movd r5d, m5
add r4, r5
pxor m5, m5
lea r0, [r3 + 32]
VAR_2ROW r1, 8
HADDW m5, m2
movd r5d, m5
add r4, r5
pxor m5, m5
VAR_2ROW r1, 8
VAR_END 32, 32
cglobal pixel_var_64x64, 2,6,8
FIX_STRIDES r1
mov r3, r0
VAR_START 0
VAR_2ROW r1, 8
HADDW m5, m2
movd r4d, m5
pxor m5, m5
VAR_2ROW r1, 8
HADDW m5, m2
movd r5d, m5
add r4, r5
pxor m5, m5
VAR_2ROW r1, 8
HADDW m5, m2
movd r5d, m5
add r4, r5
pxor m5, m5
VAR_2ROW r1, 8
HADDW m5, m2
movd r5d, m5
add r4, r5
pxor m5, m5
lea r0, [r3 + 32]
VAR_2ROW r1, 8
HADDW m5, m2
movd r5d, m5
add r4, r5
pxor m5, m5
VAR_2ROW r1, 8
HADDW m5, m2
movd r5d, m5
add r4, r5
pxor m5, m5
VAR_2ROW r1, 8
HADDW m5, m2
movd r5d, m5
add r4, r5
pxor m5, m5
VAR_2ROW r1, 8
HADDW m5, m2
movd r5d, m5
add r4, r5
pxor m5, m5
lea r0, [r3 + 64]
VAR_2ROW r1, 8
HADDW m5, m2
movd r5d, m5
add r4, r5
pxor m5, m5
VAR_2ROW r1, 8
HADDW m5, m2
movd r5d, m5
add r4, r5
pxor m5, m5
VAR_2ROW r1, 8
HADDW m5, m2
movd r5d, m5
add r4, r5
pxor m5, m5
VAR_2ROW r1, 8
HADDW m5, m2
movd r5d, m5
add r4, r5
pxor m5, m5
lea r0, [r3 + 96]
VAR_2ROW r1, 8
HADDW m5, m2
movd r5d, m5
add r4, r5
pxor m5, m5
VAR_2ROW r1, 8
HADDW m5, m2
movd r5d, m5
add r4, r5
pxor m5, m5
VAR_2ROW r1, 8
HADDW m5, m2
movd r5d, m5
add r4, r5
pxor m5, m5
VAR_2ROW r1, 8
VAR_END 64, 64
%else ; BIT_DEPTH <= 10
cglobal pixel_var_16x16, 2,3,8
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 4
HADDUWD m5, m1
mova m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m5, m7
VAR_END_12bit 16, 16
cglobal pixel_var_32x32, 2,6,8
FIX_STRIDES r1
mov r3, r0
VAR_START 0
VAR_2ROW r1, 4
HADDUWD m5, m1
mova m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
lea r0, [r3 + 32]
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m5, m7
VAR_END_12bit 32, 32
cglobal pixel_var_64x64, 2,6,8
FIX_STRIDES r1
mov r3, r0
VAR_START 0
VAR_2ROW r1, 4
HADDUWD m5, m1
mova m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
lea r0, [r3 + 16 * SIZEOF_PIXEL]
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
lea r0, [r3 + 32 * SIZEOF_PIXEL]
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
lea r0, [r3 + 48 * SIZEOF_PIXEL]
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m7, m5
pxor m5, m5
VAR_2ROW r1, 4
HADDUWD m5, m1
paddd m5, m7
VAR_END_12bit 64, 64
%endif ; BIT_DEPTH <= 10
cglobal pixel_var_8x8, 2,3,8
lea r2, [r1*3]
VAR_START 0
movu m0, [r0]
movu m1, [r0+r1*2]
movu m3, [r0+r1*4]
movu m4, [r0+r2*2]
lea r0, [r0+r1*8]
VAR_CORE
movu m0, [r0]
movu m1, [r0+r1*2]
movu m3, [r0+r1*4]
movu m4, [r0+r2*2]
VAR_CORE
VAR_END 8, 8
%endmacro ; VAR
INIT_XMM sse2
VAR
INIT_XMM avx
VAR
INIT_XMM xop
VAR
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
%macro VAR 0
cglobal pixel_var_8x8, 2,3,8
VAR_START 1
lea r2, [r1 * 3]
movh m0, [r0]
movh m3, [r0 + r1]
movhps m0, [r0 + r1 * 2]
movhps m3, [r0 + r2]
DEINTB 1, 0, 4, 3, 7
lea r0, [r0 + r1 * 4]
VAR_CORE
movh m0, [r0]
movh m3, [r0 + r1]
movhps m0, [r0 + r1 * 2]
movhps m3, [r0 + r2]
DEINTB 1, 0, 4, 3, 7
VAR_CORE
VAR_END 8, 8
cglobal pixel_var_16x16_internal
movu m0, [r0]
movu m3, [r0 + r1]
DEINTB 1, 0, 4, 3, 7
VAR_CORE
movu m0, [r0 + 2 * r1]
movu m3, [r0 + r2]
DEINTB 1, 0, 4, 3, 7
lea r0, [r0 + r1 * 4]
VAR_CORE
movu m0, [r0]
movu m3, [r0 + r1]
DEINTB 1, 0, 4, 3, 7
VAR_CORE
movu m0, [r0 + 2 * r1]
movu m3, [r0 + r2]
DEINTB 1, 0, 4, 3, 7
lea r0, [r0 + r1 * 4]
VAR_CORE
movu m0, [r0]
movu m3, [r0 + r1]
DEINTB 1, 0, 4, 3, 7
VAR_CORE
movu m0, [r0 + 2 * r1]
movu m3, [r0 + r2]
DEINTB 1, 0, 4, 3, 7
lea r0, [r0 + r1 * 4]
VAR_CORE
movu m0, [r0]
movu m3, [r0 + r1]
DEINTB 1, 0, 4, 3, 7
VAR_CORE
movu m0, [r0 + 2 * r1]
movu m3, [r0 + r2]
DEINTB 1, 0, 4, 3, 7
VAR_CORE
ret
cglobal pixel_var_16x16, 2,3,8
VAR_START 1
lea r2, [r1 * 3]
call pixel_var_16x16_internal
VAR_END 16, 16
cglobal pixel_var_32x32, 2,4,8
VAR_START 1
lea r2, [r1 * 3]
mov r3, r0
call pixel_var_16x16_internal
lea r0, [r0 + r1 * 4]
call pixel_var_16x16_internal
lea r0, [r3 + 16]
call pixel_var_16x16_internal
lea r0, [r0 + r1 * 4]
call pixel_var_16x16_internal
VAR_END 32, 32
cglobal pixel_var_64x64, 2,6,8
VAR_START 1
lea r2, [r1 * 3]
mov r3, r0
call pixel_var_16x16_internal
lea r0, [r0 + r1 * 4]
call pixel_var_16x16_internal
lea r0, [r0 + r1 * 4]
call pixel_var_16x16_internal
lea r0, [r0 + r1 * 4]
call pixel_var_16x16_internal
HADDW m5, m2
movd r4d, m5
pxor m5, m5
lea r0, [r3 + 16]
call pixel_var_16x16_internal
lea r0, [r0 + r1 * 4]
call pixel_var_16x16_internal
lea r0, [r0 + r1 * 4]
call pixel_var_16x16_internal
lea r0, [r0 + r1 * 4]
call pixel_var_16x16_internal
HADDW m5, m2
movd r5d, m5
add r4, r5
pxor m5, m5
lea r0, [r3 + 32]
call pixel_var_16x16_internal
lea r0, [r0 + r1 * 4]
call pixel_var_16x16_internal
lea r0, [r0 + r1 * 4]
call pixel_var_16x16_internal
lea r0, [r0 + r1 * 4]
call pixel_var_16x16_internal
lea r0, [r3 + 48]
HADDW m5, m2
movd r5d, m5
add r4, r5
pxor m5, m5
call pixel_var_16x16_internal
lea r0, [r0 + r1 * 4]
call pixel_var_16x16_internal
lea r0, [r0 + r1 * 4]
call pixel_var_16x16_internal
lea r0, [r0 + r1 * 4]
call pixel_var_16x16_internal
VAR_END 64, 64
%endmacro ; VAR
INIT_XMM sse2
VAR
INIT_XMM avx
VAR
INIT_XMM xop
VAR
INIT_YMM avx2
cglobal pixel_var_16x16, 2,4,7
VAR_START 0
mov r2d, 4
lea r3, [r1*3]
.loop:
pmovzxbw m0, [r0]
pmovzxbw m3, [r0+r1]
pmovzxbw m1, [r0+r1*2]
pmovzxbw m4, [r0+r3]
lea r0, [r0+r1*4]
VAR_CORE
dec r2d
jg .loop
vextracti128 xm0, m5, 1
vextracti128 xm1, m6, 1
paddw xm5, xm0
paddd xm6, xm1
HADDW xm5, xm2
HADDD xm6, xm1
%if ARCH_X86_64
punpckldq xm5, xm6
movq rax, xm5
%else
movd eax, xm5
movd edx, xm6
%endif
RET
%endif ; !HIGH_BIT_DEPTH
%macro VAR2_END 3
HADDW %2, xm1
movd r1d, %2
imul r1d, r1d
HADDD %3, xm1
shr r1d, %1
movd eax, %3
movd [r4], %3
sub eax, r1d ; sqr - (sum * sum >> shift)
RET
%endmacro
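; VAR2_END shift, sumreg, sqrreg: in rough C terms (a sketch; 'pSqr' is only an
; illustrative name for the pointer held in r4):
;     *pSqr = sqr;                          // horizontally-added sqrreg
;     return sqr - ((sum * sum) >> shift);  // sum is the horizontally-added sumreg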
;int scanPosLast(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize)
;{
; int scanPosLast = 0;
; do
; {
; const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
;
; const uint32_t posLast = scan[scanPosLast++];
;
; const int curCoeff = coeff[posLast];
; const uint32_t isNZCoeff = (curCoeff != 0);
; numSig -= isNZCoeff;
;
; coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]);
; coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff;
; coeffNum[cgIdx] += (uint8_t)isNZCoeff;
; }
; while (numSig > 0);
; return scanPosLast - 1;
;}
%if ARCH_X86_64 == 1
INIT_XMM avx2,bmi2
cglobal scanPosLast, 7,11,6
; convert stride (trSize) to units of int16_t
mov r7d, r7m
add r7d, r7d
; load the scan table and convert it to bytes
mova m0, [r6]
packuswb m0, [r6 + mmsize]
pxor m1, m0, [pb_15]
; clear CG count
xor r9d, r9d
; m0 - Zigzag scan table
; m1 - reverse-order scan table
; m4 - zero
; m5 - ones
pxor m4, m4
pcmpeqb m5, m5
lea r8d, [r7d * 3]
.loop:
; position of current CG
movzx r6d, word [r0]
lea r6, [r6 * 2 + r1]
add r0, 16 * 2
; loading current CG
movh m2, [r6]
movhps m2, [r6 + r7]
movh m3, [r6 + r7 * 2]
movhps m3, [r6 + r8]
packsswb m2, m3
; Zigzag
pshufb m3, m2, m0
pshufb m2, m1
; get sign
pmovmskb r6d, m3
pcmpeqb m3, m4
pmovmskb r10d, m3
not r10d
pext r6d, r6d, r10d
mov [r2 + r9 * 2], r6w
; get non-zero flag
; TODO: reuse above result with reorder
pcmpeqb m2, m4
pxor m2, m5
pmovmskb r6d, m2
mov [r3 + r9 * 2], r6w
; get the number of non-zero coefficients (POPCNT is faster)
pabsb m2, m2
psadbw m2, m4
movhlps m3, m2
paddd m2, m3
movd r6d, m2
mov [r4 + r9], r6b
inc r9d
sub r5d, r6d
jg .loop
; fixup last CG non-zero flag
dec r9d
movzx r0d, word [r3 + r9 * 2]
;%if cpuflag(bmi1) ; 2uops?
; tzcnt r1d, r0d
;%else
bsf r1d, r0d
;%endif
shrx r0d, r0d, r1d
mov [r3 + r9 * 2], r0w
; get last pos
mov eax, r9d
shl eax, 4
xor r1d, 15
add eax, r1d
RET
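; The last-CG fixup and return value above are roughly (a sketch with
; illustrative names; the flag word holds scan positions in reverse order, so
; its lowest set bit is the last non-zero in scan order):
;     int lsb = BSF(coeffFlag[cgIdx]);     // bit index of the last non-zero
;     coeffFlag[cgIdx] >>= lsb;            // align so the last non-zero is bit 0
;     return cgIdx * 16 + (15 - lsb);      // scanPosLast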
; t3 must be ecx, since it's used for shift.
%if WIN64
DECLARE_REG_TMP 3,1,2,0
%elif ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3
%else ; X86_32
%error Unsupported platform X86_32
%endif
INIT_CPUFLAGS
cglobal scanPosLast_x64, 5,12
mov r10, r3mp
movifnidn t0, r0mp
mov r5d, r5m
xor r11d, r11d ; cgIdx
xor r7d, r7d ; tmp for non-zero flag
.loop:
xor r8d, r8d ; coeffSign[]
xor r9d, r9d ; coeffFlag[]
xor t3d, t3d ; coeffNum[]
%assign x 0
%rep 16
movzx r6d, word [t0 + x * 2]
movsx r6d, word [t1 + r6 * 2]
test r6d, r6d
setnz r7b
shr r6d, 31
shl r6d, t3b
or r8d, r6d
lea r9, [r9 * 2 + r7]
add t3d, r7d
%assign x x+1
%endrep
; store latest group data
mov [t2 + r11 * 2], r8w
mov [r10 + r11 * 2], r9w
mov [r4 + r11], t3b
inc r11d
add t0, 16 * 2
sub r5d, t3d
jnz .loop
; store group data
bsf t3d, r9d
shr r9d, t3b
mov [r10 + (r11 - 1) * 2], r9w
; get posLast
shl r11d, 4
sub r11d, t3d
lea eax, [r11d - 1]
RET
%endif
;-----------------------------------------------------------------------------
; uint32_t[last first] findPosFirstLast(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
;-----------------------------------------------------------------------------
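; Rough scalar reference (a sketch, not the exact x265 source; it assumes the
; usual blkPos = row*4+col packing of scanTbl[] and at least one non-zero
; coefficient in the group):
;
; static uint32_t findPosFirstLast(const int16_t* dstCoeff, intptr_t trSize,
;                                  const uint16_t scanTbl[16])
; {
;     int first = 16, last = -1;
;     for (int i = 0; i < 16; i++)
;     {
;         const uint16_t blkPos = scanTbl[i];
;         if (dstCoeff[(blkPos >> 2) * trSize + (blkPos & 3)])
;         {
;             if (i < first) first = i;
;             if (i > last)  last  = i;
;         }
;     }
;     return ((uint32_t)last << 16) | (uint32_t)first;
; }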
INIT_XMM ssse3
cglobal findPosFirstLast, 3,3,3
; convert stride to units of int16_t
add r1d, r1d
; load the scan table and convert it to bytes
mova m0, [r2]
packuswb m0, [r2 + mmsize]
; load 16 coefficients
movh m1, [r0]
movhps m1, [r0 + r1]
movh m2, [r0 + r1 * 2]
lea r1, [r1 * 3]
movhps m2, [r0 + r1]
packsswb m1, m2
; get non-zero mask
pxor m2, m2
pcmpeqb m1, m2
; reorder by Zigzag scan
pshufb m1, m0
; get First and Last pos
pmovmskb r0d, m1
not r0d
bsr r1w, r0w
bsf eax, r0d ; side effect: clear AH to Zero
shl r1d, 16
or eax, r1d
RET
;void saoCuStatsE2_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
;{
; X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
; X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
; int x, y;
; int32_t tmp_stats[SAO::NUM_EDGETYPE];
; int32_t tmp_count[SAO::NUM_EDGETYPE];
; memset(tmp_stats, 0, sizeof(tmp_stats));
; memset(tmp_count, 0, sizeof(tmp_count));
; for (y = 0; y < endY; y++)
; {
; upBufft[0] = signOf(rec[stride] - rec[-1]);
; for (x = 0; x < endX; x++)
; {
; int signDown = signOf2(rec[x], rec[x + stride + 1]);
; X265_CHECK(signDown == signOf(rec[x] - rec[x + stride + 1]), "signDown check failure\n");
; uint32_t edgeType = signDown + upBuff1[x] + 2;
; upBufft[x + 1] = (int8_t)(-signDown);
; tmp_stats[edgeType] += (fenc[x] - rec[x]);
; tmp_count[edgeType]++;
; }
; std::swap(upBuff1, upBufft);
; rec += stride;
; fenc += stride;
; }
; for (x = 0; x < SAO::NUM_EDGETYPE; x++)
; {
; stats[SAO::s_eoTable[x]] += tmp_stats[x];
; count[SAO::s_eoTable[x]] += tmp_count[x];
; }
;}
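; For reference, the sign helpers used above reduce to (a sketch):
;     signOf(x)     = (x > 0) - (x < 0)      // -1, 0 or +1
;     signOf2(a, b) = signOf(a - b)
; The SIMD below gets the same per-byte result from two pcmpgtb compares after
; biasing both inputs with pb_128, so unsigned pixels can use the signed compare.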
%if ARCH_X86_64
; TODO: x64 only because temporary registers r7 and r8 are needed; easy to port to x86
INIT_XMM sse4
cglobal saoCuStatsE2, 5,9,8,0-32 ; Stack: 5 of stats and 5 of count
mov r5d, r5m
; clear internal temporary buffer
pxor m0, m0
mova [rsp], m0
mova [rsp + mmsize], m0
mova m0, [pb_128]
mova m5, [pb_1]
mova m6, [pb_2]
.loopH:
; TODO: merge into below SIMD
; get upBuffX[0]
mov r6b, [r1 + r2]
sub r6b, [r1 - 1]
seta r6b
setb r7b
sub r6b, r7b
mov [r4], r6b
; backup unavailable pixels
movh m7, [r4 + r5 + 1]
mov r6d, r5d
.loopW:
movu m1, [r1]
movu m2, [r1 + r2 + 1]
; signDown
pxor m1, m0
pxor m2, m0
pcmpgtb m3, m1, m2
pand m3, m5
pcmpgtb m2, m1
por m2, m3
pxor m3, m3
psubb m3, m2
; edgeType
movu m4, [r3]
paddb m4, m6
paddb m2, m4
; update upBuff1
movu [r4 + 1], m3
; stats[edgeType]
pxor m1, m0
movu m3, [r0]
punpckhbw m4, m3, m1
punpcklbw m3, m1
pmaddubsw m3, [hmul_16p + 16]
pmaddubsw m4, [hmul_16p + 16]
; 16 pixels
%assign x 0
%rep 16
pextrb r7d, m2, x
inc word [rsp + r7 * 2]
%if (x < 8)
pextrw r8d, m3, (x % 8)
%else
pextrw r8d, m4, (x % 8)
%endif
movsx r8d, r8w
add [rsp + 5 * 2 + r7 * 4], r8d
dec r6d
jz .next
%assign x x+1
%endrep
add r0, 16
add r1, 16
add r3, 16
add r4, 16
jmp .loopW
.next:
xchg r3, r4
; restore pointer upBuff1
mov r6d, r5d
and r6d, 15
; move to next row
sub r6, r5
add r3, r6
add r4, r6
add r6, r2
add r0, r6
add r1, r6
; restore unavailable pixels
movh [r3 + r5 + 1], m7
dec byte r6m
jg .loopH
; sum to global buffer
mov r1, r7m
mov r0, r8m
; s_eoTable = {1,2,0,3,4}
movzx r6d, word [rsp + 0 * 2]
add [r0 + 1 * 4], r6d
movzx r6d, word [rsp + 1 * 2]
add [r0 + 2 * 4], r6d
movzx r6d, word [rsp + 2 * 2]
add [r0 + 0 * 4], r6d
movzx r6d, word [rsp + 3 * 2]
add [r0 + 3 * 4], r6d
movzx r6d, word [rsp + 4 * 2]
add [r0 + 4 * 4], r6d
mov r6d, [rsp + 5 * 2 + 0 * 4]
add [r1 + 1 * 4], r6d
mov r6d, [rsp + 5 * 2 + 1 * 4]
add [r1 + 2 * 4], r6d
mov r6d, [rsp + 5 * 2 + 2 * 4]
add [r1 + 0 * 4], r6d
mov r6d, [rsp + 5 * 2 + 3 * 4]
add [r1 + 3 * 4], r6d
mov r6d, [rsp + 5 * 2 + 4 * 4]
add [r1 + 4 * 4], r6d
RET
%endif ; ARCH_X86_64
;void saoCuStatsE3(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
;{
; memset(tmp_stats, 0, sizeof(tmp_stats));
; memset(tmp_count, 0, sizeof(tmp_count));
; for (y = startY; y < endY; y++)
; {
; for (x = startX; x < endX; x++)
; {
; int signDown = signOf2(rec[x], rec[x + stride - 1]);
; uint32_t edgeType = signDown + upBuff1[x] + 2;
; upBuff1[x - 1] = (int8_t)(-signDown);
; tmp_stats[edgeType] += (fenc[x] - rec[x]);
; tmp_count[edgeType]++;
; }
; upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
; rec += stride;
; fenc += stride;
; }
; for (x = 0; x < NUM_EDGETYPE; x++)
; {
; stats[s_eoTable[x]] += tmp_stats[x];
; count[s_eoTable[x]] += tmp_count[x];
; }
;}
%if ARCH_X86_64
INIT_XMM sse4
cglobal saoCuStatsE3, 4,9,8,0-32 ; Stack: 5 of stats and 5 of count
mov r4d, r4m
mov r5d, r5m
; clear internal temporary buffer
pxor m0, m0
mova [rsp], m0
mova [rsp + mmsize], m0
mova m0, [pb_128]
mova m5, [pb_1]
mova m6, [pb_2]
movh m7, [r3 + r4]
.loopH:
mov r6d, r4d
.loopW:
movu m1, [r1]
movu m2, [r1 + r2 - 1]
; signDown
pxor m1, m0
pxor m2, m0
pcmpgtb m3, m1, m2
pand m3, m5
pcmpgtb m2, m1
por m2, m3
pxor m3, m3
psubb m3, m2
; edgeType
movu m4, [r3]
paddb m4, m6
paddb m2, m4
; update upBuff1
movu [r3 - 1], m3
; stats[edgeType]
pxor m1, m0
movu m3, [r0]
punpckhbw m4, m3, m1
punpcklbw m3, m1
pmaddubsw m3, [hmul_16p + 16]
pmaddubsw m4, [hmul_16p + 16]
; 16 pixels
%assign x 0
%rep 16
pextrb r7d, m2, x
inc word [rsp + r7 * 2]
%if (x < 8)
pextrw r8d, m3, (x % 8)
%else
pextrw r8d, m4, (x % 8)
%endif
movsx r8d, r8w
add [rsp + 5 * 2 + r7 * 4], r8d
dec r6d
jz .next
%assign x x+1
%endrep
add r0, 16
add r1, 16
add r3, 16
jmp .loopW
.next:
; restore pointer upBuff1
mov r6d, r4d
and r6d, 15
; move to next row
sub r6, r4
add r3, r6
add r6, r2
add r0, r6
add r1, r6
dec r5d
jg .loopH
; restore unavailable pixels
movh [r3 + r4], m7
; sum to global buffer
mov r1, r6m
mov r0, r7m
; s_eoTable = {1,2,0,3,4}
movzx r6d, word [rsp + 0 * 2]
add [r0 + 1 * 4], r6d
movzx r6d, word [rsp + 1 * 2]
add [r0 + 2 * 4], r6d
movzx r6d, word [rsp + 2 * 2]
add [r0 + 0 * 4], r6d
movzx r6d, word [rsp + 3 * 2]
add [r0 + 3 * 4], r6d
movzx r6d, word [rsp + 4 * 2]
add [r0 + 4 * 4], r6d
mov r6d, [rsp + 5 * 2 + 0 * 4]
add [r1 + 1 * 4], r6d
mov r6d, [rsp + 5 * 2 + 1 * 4]
add [r1 + 2 * 4], r6d
mov r6d, [rsp + 5 * 2 + 2 * 4]
add [r1 + 0 * 4], r6d
mov r6d, [rsp + 5 * 2 + 3 * 4]
add [r1 + 3 * 4], r6d
mov r6d, [rsp + 5 * 2 + 4 * 4]
add [r1 + 4 * 4], r6d
RET
%endif ; ARCH_X86_64
; uint32_t costCoeffNxN(uint16_t *scan, coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, uint8_t *tabSigCtx, uint16_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)
;for (int i = 0; i < MLS_CG_SIZE; i++)
;{
; tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 0]);
; tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 1]);
; tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 2]);
; tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[blkPosBase + i * trSize + 3]);
;}
;do
;{
; uint32_t blkPos, sig, ctxSig;
; blkPos = g_scan4x4[codingParameters.scanType][scanPosSigOff];
; const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? ~0 : 0;
; sig = scanFlagMask & 1;
; scanFlagMask >>= 1;
; if (scanPosSigOff + (subSet == 0) + numNonZero)
; {
; const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset;
; ctxSig = cnt & posZeroMask;
;
; const uint32_t mstate = baseCtx[ctxSig];
; const uint32_t mps = mstate & 1;
; const uint32_t stateBits = x265_entropyStateBits[mstate ^ sig];
; uint32_t nextState = (stateBits >> 24) + mps;
; if ((mstate ^ sig) == 1)
; nextState = sig;
; baseCtx[ctxSig] = (uint8_t)nextState;
; sum += stateBits;
; }
; absCoeff[numNonZero] = tmpCoeff[blkPos];
; numNonZero += sig;
; scanPosSigOff--;
;}
;while(scanPosSigOff >= 0);
; sum &= 0xFFFFFF
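; As the reference above implies, each x265_entropyStateBits[] entry evidently
; packs the bit-cost estimate in its low 24 bits and a next-state increment in
; its top byte (hence 'stateBits >> 24' and the final 'sum &= 0xFFFFFF').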
%if ARCH_X86_64
; uint32_t costCoeffNxN(uint16_t *scan, coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, uint8_t *tabSigCtx, uint16_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)
INIT_XMM sse4
cglobal costCoeffNxN, 6,11,5
add r2d, r2d
; abs(coeff)
movh m1, [r1]
movhps m1, [r1 + r2]
movh m2, [r1 + r2 * 2]
lea r2, [r2 * 3]
movhps m2, [r1 + r2]
pabsw m1, m1
pabsw m2, m2
; r[1-2] free here
; WARNING: out-of-bounds read here!
; loading scan table
mov r2d, r8m
xor r2d, 15
movu m0, [r0 + r2 * 2]
movu m3, [r0 + r2 * 2 + mmsize]
packuswb m0, m3
pxor m0, [pb_15]
xchg r2d, r8m
; r[0-1] free here
; reorder coeff
mova m3, [deinterleave_shuf]
pshufb m1, m3
pshufb m2, m3
punpcklqdq m3, m1, m2
punpckhqdq m1, m2
pshufb m3, m0
pshufb m1, m0
punpcklbw m2, m3, m1
punpckhbw m3, m1
; r[0-1], m[1] free here
; loading tabSigCtx (+offset)
mova m1, [r4]
pshufb m1, m0
movd m4, r7m
pxor m5, m5
pshufb m4, m5
paddb m1, m4
; register mapping
; m0 - Zigzag
; m1 - sigCtx
; {m3,m2} - abs(coeff)
; r0 - x265_entropyStateBits
; r1 - baseCtx
; r2 - scanPosSigOff
; r3 - absCoeff
; r4 - nonZero
; r5 - scanFlagMask
; r6 - sum
lea r0, [private_prefix %+ _entropyStateBits]
mov r1, r6mp
xor r6d, r6d
xor r4d, r4d
xor r8d, r8d
test r2d, r2d
jz .idx_zero
.loop:
; {
; const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset;
; ctxSig = cnt & posZeroMask;
; const uint32_t mstate = baseCtx[ctxSig];
; const uint32_t mps = mstate & 1;
; const uint32_t stateBits = x265_entropyStateBits[mstate ^ sig];
; uint32_t nextState = (stateBits >> 24) + mps;
; if ((mstate ^ sig) == 1)
; nextState = sig;
; baseCtx[ctxSig] = (uint8_t)nextState;
; sum += stateBits;
; }
; absCoeff[numNonZero] = tmpCoeff[blkPos];
; numNonZero += sig;
; scanPosSigOff--;
pextrw [r3 + r4 * 2], m2, 0 ; absCoeff[numNonZero] = tmpCoeff[blkPos]
shr r5d, 1
setc r8b ; r8 = sig
add r4d, r8d ; numNonZero += sig
palignr m4, m3, m2, 2
psrldq m3, 2
mova m2, m4
movd r7d, m1 ; r7 = ctxSig
movzx r7d, r7b
psrldq m1, 1
movzx r9d, byte [r1 + r7] ; mstate = baseCtx[ctxSig]
mov r10d, r9d
and r10d, 1 ; mps = mstate & 1
xor r9d, r8d ; r9 = mstate ^ sig
add r6d, [r0 + r9 * 4] ; sum += x265_entropyStateBits[mstate ^ sig]
add r10b, byte [r0 + r9 * 4 + 3] ; nextState = (stateBits >> 24) + mps
cmp r9b, 1
cmove r10d, r8d
mov byte [r1 + r7], r10b
dec r2d
jg .loop
.idx_zero:
pextrw [r3 + r4 * 2], m2, 0 ; absCoeff[numNonZero] = tmpCoeff[blkPos]
add r4b, r8m
xor r2d, r2d
cmp word r9m, 0
sete r2b
add r4b, r2b
jz .exit
dec r2b
movd r3d, m1
and r2d, r3d
movzx r3d, byte [r1 + r2] ; mstate = baseCtx[ctxSig]
mov r4d, r5d
xor r5d, r3d ; r0 = mstate ^ sig
and r3d, 1 ; mps = mstate & 1
add r6d, [r0 + r5 * 4] ; sum += x265_entropyStateBits[mstate ^ sig]
add r3b, [r0 + r5 * 4 + 3] ; nextState = (stateBits >> 24) + mps
cmp r5b, 1
cmove r3d, r4d
mov byte [r1 + r2], r3b
.exit:
%ifnidn eax,r6d
mov eax, r6d
%endif
and eax, 0xFFFFFF
RET
%endif ; ARCH_X86_64
;uint32_t goRiceParam = 0;
;int firstCoeff2 = 1;
;uint32_t baseLevelN = 0x5555AAAA; // baseLevel values packed 2 bits each
;idx = 0;
;do
;{
; int baseLevel = (baseLevelN & 3) | firstCoeff2;
; baseLevelN >>= 2;
; int codeNumber = absCoeff[idx] - baseLevel;
; if (codeNumber >= 0)
; {
; uint32_t length = 0;
; codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION;
; if (codeNumber >= 0)
; {
; {
; unsigned long cidx;
; CLZ(cidx, codeNumber + 1);
; length = cidx;
; }
; codeNumber = (length + length);
; }
; sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber);
; if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))
; goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
; }
; if (absCoeff[idx] >= 2)
; firstCoeff2 = 0;
; idx++;
;}
;while(idx < numNonZero);
; uint32_t costCoeffRemain(uint16_t *absCoeff, int numNonZero, int idx)
INIT_XMM sse4
cglobal costCoeffRemain, 0,7,1
; assign RCX to R3
; RAX always in R6 and free
%if WIN64
DECLARE_REG_TMP 3,1,2,0
mov t0, r0
mov r4d, r2d
%elif ARCH_X86_64
; nothing to do on *nix x64
DECLARE_REG_TMP 0,1,2,3
mov r4d, r2d
%else ; X86_32
DECLARE_REG_TMP 6,3,2,1
mov t0, r0m
mov r4d, r2m
%endif
xor t3d, t3d
xor r5d, r5d
lea t0, [t0 + r4 * 2]
mov r2d, 3
; register mapping
; r2d - baseLevel & tmp
; r4d - idx
; t3 - goRiceParam
; eax - absCoeff[idx] & tmp
; r5 - sum
.loop:
mov eax, 1
cmp r4d, 8
cmovge r2d, eax
movzx eax, word [t0]
add t0, 2
sub eax, r2d ; codeNumber = absCoeff[idx] - baseLevel
jl .next
shr eax, t3b ; codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION
lea r2d, [rax - 3 + 1] ; CLZ(cidx, codeNumber + 1);
bsr r2d, r2d
add r2d, r2d ; codeNumber = (length + length)
sub eax, 3
cmovge eax, r2d
lea eax, [3 + 1 + t3 + rax] ; sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber)
add r5d, eax
; if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))
; goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
cmp t3d, 4
setl al
mov r2d, 3
shl r2d, t3b
cmp word [t0 - 2], r2w
setg r2b
and al, r2b
add t3b, al
.next:
inc r4d
mov r2d, 2
cmp r4d, r1m
jl .loop
mov eax, r5d
RET
; uint32_t costC1C2Flag(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset)
;idx = 0;
;do
;{
; uint32_t symbol1 = absCoeff[idx] > 1;
; uint32_t symbol2 = absCoeff[idx] > 2;
; {
; const uint32_t mstate = baseCtxMod[c1];
; baseCtxMod[c1] = sbacNext(mstate, symbol1);
; sum += sbacGetEntropyBits(mstate, symbol1);
; }
; if (symbol1)
; c1Next = 0;
; if (symbol1 + firstC2Flag == 3)
; firstC2Flag = symbol2;
; if (symbol1 + firstC2Idx == 9)
; firstC2Idx = idx;
; c1 = (c1Next & 3);
; c1Next >>= 2;
; idx++;
;}
;while(idx < numC1Flag);
;if (!c1)
;{
; baseCtxMod = &m_contextState[(bIsLuma ? 0 : NUM_ABS_FLAG_CTX_LUMA) + OFF_ABS_FLAG_CTX + ctxSet];
; {
; const uint32_t mstate = baseCtxMod[0];
; baseCtxMod[0] = sbacNext(mstate, firstC2Flag);
; sum += sbacGetEntropyBits(mstate, firstC2Flag);
; }
;}
;m_fracBits += (sum & 0xFFFFFF);
; TODO: more registers are needed, so this is written as x64 only, but it is easy to port to x86
%if ARCH_X86_64
INIT_XMM sse2
cglobal costC1C2Flag, 4,12,2
mova m0, [r0]
packsswb m0, m0
pcmpgtb m1, m0, [pb_1]
pcmpgtb m0, [pb_2]
; get mask for 'X>1'
pmovmskb r0d, m1
mov r11d, r0d
; clear unavailable coeff flags
xor r6d, r6d
bts r6d, r1d
dec r6d
and r11d, r6d
; calculate firstC2Idx
or r11d, 0x100 ; default value setting to 8
bsf r11d, r11d
lea r5, [private_prefix %+ _entropyStateBits]
xor r6d, r6d
mov r4d, 0xFFFFFFF9
; register mapping
; r4d - nextC1
; r5 - x265_entropyStateBits
; r6d - sum
; r[7-10] - tmp
; r11d - firstC2Idx (not use in loop)
; process c1 flag
.loop:
; const uint32_t mstate = baseCtx[ctxSig];
; const uint32_t mps = mstate & 1;
; const uint32_t stateBits = x265_entropyStateBits[mstate ^ sig];
; uint32_t nextState = (stateBits >> 24) + mps;
; if ((mstate ^ sig) == 1)
; nextState = sig;
mov r10d, r4d ; c1
and r10d, 3
shr r4d, 2
xor r7d, r7d
shr r0d, 1
cmovc r4d, r7d ; c1 <- 0 when C1Flag=1
setc r7b ; symbol1
movzx r8d, byte [r2 + r10] ; mstate = baseCtx[c1]
mov r9d, r7d ; sig = symbol1
xor r7d, r8d ; mstate ^ sig
and r8d, 1 ; mps = mstate & 1
add r6d, [r5 + r7 * 4] ; sum += x265_entropyStateBits[mstate ^ sig]
add r8b, [r5 + r7 * 4 + 3] ; nextState = (stateBits >> 24) + mps
cmp r7b, 1 ; if ((mstate ^ sig) == 1) nextState = sig;
cmove r8d, r9d
mov byte [r2 + r10], r8b
dec r1d
jg .loop
; check and generate c1 flag
shl r4d, 30
jnz .quit
; move to c2 ctx
add r2, r3
; process c2 flag
pmovmskb r8d, m0
bt r8d, r11d
setc r7b
movzx r8d, byte [r2] ; mstate = baseCtx[c1]
mov r1d, r7d ; sig = symbol1
xor r7d, r8d ; mstate ^ sig
and r8d, 1 ; mps = mstate & 1
add r6d, [r5 + r7 * 4] ; sum += x265_entropyStateBits[mstate ^ sig]
add r8b, [r5 + r7 * 4 + 3] ; nextState = (stateBits >> 24) + mps
cmp r7b, 1 ; if ((mstate ^ sig) == 1) nextState = sig;
cmove r8d, r1d
mov byte [r2], r8b
.quit:
shrd r4d, r11d, 4
%ifnidn r6d,eax
mov eax, r6d
%endif
and eax, 0x00FFFFFF
or eax, r4d
RET
%endif ; ARCH_X86_64