forked from mirror/libbpg
libbpg-0.9.6
This commit is contained in:
parent
3035b41edf
commit
35a8402710
248 changed files with 232891 additions and 100 deletions
14
x265/source/common/x86/README.txt
Normal file
14
x265/source/common/x86/README.txt
Normal file
|
@ -0,0 +1,14 @@
|
|||
The ASM source here is directly pulled from the x264 project with two
|
||||
changes:
|
||||
|
||||
1 - FENC_STRIDE must be increased to 64 in x86util.asm because of HEVC's
|
||||
larger CU sizes
|
||||
2 - Because of #1, we must rebrand the functions with x265_ prefixes in
|
||||
x86inc.asm (private_prefix) and pixel-a.asm (mangle(x265_pixel_ssd))
|
||||
3 - We have modified the MMX SSD primitives to use EMMS before returning
|
||||
4 - We have added some new SATD block sizes for SSE3
|
||||
|
||||
Current assembly is based on x264 revision:
|
||||
configure: Support cygwin64
|
||||
Diogo Franco (Kovensky) <diogomfranco@gmail.com>
|
||||
2013-07-23 22:17:44 -0300
|
3727
x265/source/common/x86/asm-primitives.cpp
Normal file
3727
x265/source/common/x86/asm-primitives.cpp
Normal file
File diff suppressed because it is too large
Load diff
5878
x265/source/common/x86/blockcopy8.asm
Normal file
5878
x265/source/common/x86/blockcopy8.asm
Normal file
File diff suppressed because it is too large
Load diff
63
x265/source/common/x86/blockcopy8.h
Normal file
63
x265/source/common/x86/blockcopy8.h
Normal file
|
@ -0,0 +1,63 @@
|
|||
/*****************************************************************************
|
||||
* Copyright (C) 2013 x265 project
|
||||
*
|
||||
* Authors: Steve Borho <steve@borho.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at license @ x265.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X265_BLOCKCOPY8_H
|
||||
#define X265_BLOCKCOPY8_H
|
||||
|
||||
FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
|
||||
FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
|
||||
FUNCDEF_TU_S(void, cpy2Dto1D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
|
||||
|
||||
FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
|
||||
FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
|
||||
FUNCDEF_TU_S(void, cpy2Dto1D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
|
||||
|
||||
FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
|
||||
FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
|
||||
FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
|
||||
|
||||
FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
|
||||
FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
|
||||
FUNCDEF_TU_S(void, cpy1Dto2D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
|
||||
|
||||
FUNCDEF_TU_S(uint32_t, copy_cnt, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride);
|
||||
FUNCDEF_TU_S(uint32_t, copy_cnt, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride);
|
||||
FUNCDEF_TU_S(uint32_t, copy_cnt, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride);
|
||||
|
||||
FUNCDEF_TU(void, blockfill_s, sse2, int16_t* dst, intptr_t dstride, int16_t val);
|
||||
FUNCDEF_TU(void, blockfill_s, avx2, int16_t* dst, intptr_t dstride, int16_t val);
|
||||
|
||||
FUNCDEF_CHROMA_PU(void, blockcopy_ss, sse2, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
|
||||
FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
|
||||
|
||||
FUNCDEF_CHROMA_PU(void, blockcopy_pp, sse2, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
|
||||
FUNCDEF_CHROMA_PU(void, blockcopy_pp, avx, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
|
||||
|
||||
FUNCDEF_PU(void, blockcopy_sp, sse2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
|
||||
FUNCDEF_PU(void, blockcopy_sp, sse4, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
|
||||
FUNCDEF_PU(void, blockcopy_sp, avx2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
|
||||
FUNCDEF_PU(void, blockcopy_ps, sse2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
|
||||
FUNCDEF_PU(void, blockcopy_ps, sse4, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
|
||||
FUNCDEF_PU(void, blockcopy_ps, avx2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
|
||||
|
||||
#endif // ifndef X265_I386_PIXEL_H
|
146
x265/source/common/x86/const-a.asm
Normal file
146
x265/source/common/x86/const-a.asm
Normal file
|
@ -0,0 +1,146 @@
|
|||
;*****************************************************************************
|
||||
;* const-a.asm: x86 global constants
|
||||
;*****************************************************************************
|
||||
;* Copyright (C) 2010-2013 x264 project
|
||||
;*
|
||||
;* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||
;* Fiona Glaser <fiona@x264.com>
|
||||
;* Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
|
||||
;* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
|
||||
;* This program is free software; you can redistribute it and/or modify
|
||||
;* it under the terms of the GNU General Public License as published by
|
||||
;* the Free Software Foundation; either version 2 of the License, or
|
||||
;* (at your option) any later version.
|
||||
;*
|
||||
;* This program is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;* GNU General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU General Public License
|
||||
;* along with this program; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
;*
|
||||
;* This program is also available under a commercial proprietary license.
|
||||
;* For more information, contact us at license @ x265.com.
|
||||
;*****************************************************************************
|
||||
|
||||
%include "x86inc.asm"
|
||||
|
||||
SECTION_RODATA 32
|
||||
|
||||
;; 8-bit constants
|
||||
|
||||
const pb_0, times 16 db 0
|
||||
const pb_1, times 32 db 1
|
||||
const pb_2, times 32 db 2
|
||||
const pb_3, times 16 db 3
|
||||
const pb_4, times 32 db 4
|
||||
const pb_8, times 32 db 8
|
||||
const pb_15, times 32 db 15
|
||||
const pb_16, times 32 db 16
|
||||
const pb_32, times 32 db 32
|
||||
const pb_64, times 32 db 64
|
||||
const pb_128, times 32 db 128
|
||||
const pb_a1, times 16 db 0xa1
|
||||
|
||||
const pb_01, times 8 db 0, 1
|
||||
const hsub_mul, times 16 db 1, -1
|
||||
const pw_swap, times 2 db 6, 7, 4, 5, 2, 3, 0, 1
|
||||
const pb_unpackbd1, times 2 db 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3
|
||||
const pb_unpackbd2, times 2 db 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7
|
||||
const pb_unpackwq1, times 1 db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
|
||||
const pb_unpackwq2, times 1 db 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7
|
||||
const pb_shuf8x8c, times 1 db 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6
|
||||
const pb_movemask, times 16 db 0x00
|
||||
times 16 db 0xFF
|
||||
|
||||
const pb_movemask_32, times 32 db 0x00
|
||||
times 32 db 0xFF
|
||||
times 32 db 0x00
|
||||
|
||||
const pb_0000000000000F0F, times 2 db 0xff, 0x00
|
||||
times 12 db 0x00
|
||||
const pb_000000000000000F, db 0xff
|
||||
times 15 db 0x00
|
||||
|
||||
;; 16-bit constants
|
||||
|
||||
const pw_1, times 16 dw 1
|
||||
const pw_2, times 16 dw 2
|
||||
const pw_3, times 16 dw 3
|
||||
const pw_7, times 16 dw 7
|
||||
const pw_m2, times 8 dw -2
|
||||
const pw_4, times 8 dw 4
|
||||
const pw_8, times 8 dw 8
|
||||
const pw_16, times 16 dw 16
|
||||
const pw_15, times 16 dw 15
|
||||
const pw_31, times 16 dw 31
|
||||
const pw_32, times 16 dw 32
|
||||
const pw_64, times 8 dw 64
|
||||
const pw_128, times 16 dw 128
|
||||
const pw_256, times 16 dw 256
|
||||
const pw_257, times 16 dw 257
|
||||
const pw_512, times 16 dw 512
|
||||
const pw_1023, times 16 dw 1023
|
||||
const pw_1024, times 16 dw 1024
|
||||
const pw_2048, times 16 dw 2048
|
||||
const pw_4096, times 16 dw 4096
|
||||
const pw_8192, times 8 dw 8192
|
||||
const pw_00ff, times 16 dw 0x00ff
|
||||
const pw_ff00, times 8 dw 0xff00
|
||||
const pw_2000, times 16 dw 0x2000
|
||||
const pw_8000, times 8 dw 0x8000
|
||||
const pw_3fff, times 8 dw 0x3fff
|
||||
const pw_32_0, times 4 dw 32,
|
||||
times 4 dw 0
|
||||
const pw_pixel_max, times 16 dw ((1 << BIT_DEPTH)-1)
|
||||
|
||||
const pw_0_15, times 2 dw 0, 1, 2, 3, 4, 5, 6, 7
|
||||
const pw_ppppmmmm, times 1 dw 1, 1, 1, 1, -1, -1, -1, -1
|
||||
const pw_ppmmppmm, times 1 dw 1, 1, -1, -1, 1, 1, -1, -1
|
||||
const pw_pmpmpmpm, times 16 dw 1, -1, 1, -1, 1, -1, 1, -1
|
||||
const pw_pmmpzzzz, times 1 dw 1, -1, -1, 1, 0, 0, 0, 0
|
||||
const multi_2Row, times 1 dw 1, 2, 3, 4, 1, 2, 3, 4
|
||||
const multiH, times 1 dw 9, 10, 11, 12, 13, 14, 15, 16
|
||||
const multiH3, times 1 dw 25, 26, 27, 28, 29, 30, 31, 32
|
||||
const multiL, times 1 dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
||||
const multiH2, times 1 dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
|
||||
const pw_planar16_mul, times 1 dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
|
||||
const pw_planar32_mul, times 1 dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
|
||||
const pw_FFFFFFFFFFFFFFF0, dw 0x00
|
||||
times 7 dw 0xff
|
||||
const hmul_16p, times 16 db 1
|
||||
times 8 db 1, -1
|
||||
|
||||
|
||||
;; 32-bit constants
|
||||
|
||||
const pd_1, times 8 dd 1
|
||||
const pd_2, times 8 dd 2
|
||||
const pd_4, times 4 dd 4
|
||||
const pd_8, times 4 dd 8
|
||||
const pd_16, times 8 dd 16
|
||||
const pd_31, times 4 dd 31
|
||||
const pd_32, times 8 dd 32
|
||||
const pd_64, times 4 dd 64
|
||||
const pd_128, times 4 dd 128
|
||||
const pd_256, times 4 dd 256
|
||||
const pd_512, times 4 dd 512
|
||||
const pd_1024, times 4 dd 1024
|
||||
const pd_2048, times 4 dd 2048
|
||||
const pd_ffff, times 4 dd 0xffff
|
||||
const pd_32767, times 4 dd 32767
|
||||
const pd_524416, times 4 dd 524416
|
||||
const pd_n32768, times 8 dd 0xffff8000
|
||||
const pd_n131072, times 4 dd 0xfffe0000
|
||||
|
||||
const trans8_shuf, times 1 dd 0, 4, 1, 5, 2, 6, 3, 7
|
||||
|
||||
const popcnt_table
|
||||
%assign x 0
|
||||
%rep 256
|
||||
; population count
|
||||
db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
|
||||
%assign x x+1
|
||||
%endrep
|
197
x265/source/common/x86/cpu-a.asm
Normal file
197
x265/source/common/x86/cpu-a.asm
Normal file
|
@ -0,0 +1,197 @@
|
|||
;*****************************************************************************
|
||||
;* cpu-a.asm: x86 cpu utilities
|
||||
;*****************************************************************************
|
||||
;* Copyright (C) 2003-2013 x264 project
|
||||
;*
|
||||
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
|
||||
;* Loren Merritt <lorenm@u.washington.edu>
|
||||
;* Fiona Glaser <fiona@x264.com>
|
||||
;*
|
||||
;* This program is free software; you can redistribute it and/or modify
|
||||
;* it under the terms of the GNU General Public License as published by
|
||||
;* the Free Software Foundation; either version 2 of the License, or
|
||||
;* (at your option) any later version.
|
||||
;*
|
||||
;* This program is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;* GNU General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU General Public License
|
||||
;* along with this program; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
;*
|
||||
;* This program is also available under a commercial proprietary license.
|
||||
;* For more information, contact us at license @ x265.com.
|
||||
;*****************************************************************************
|
||||
|
||||
%include "x86inc.asm"
|
||||
|
||||
SECTION .text
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal cpu_cpuid, 5,7
|
||||
push rbx
|
||||
push r4
|
||||
push r3
|
||||
push r2
|
||||
push r1
|
||||
mov eax, r0d
|
||||
xor ecx, ecx
|
||||
cpuid
|
||||
pop r4
|
||||
mov [r4], eax
|
||||
pop r4
|
||||
mov [r4], ebx
|
||||
pop r4
|
||||
mov [r4], ecx
|
||||
pop r4
|
||||
mov [r4], edx
|
||||
pop rbx
|
||||
RET
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void cpu_xgetbv( int op, int *eax, int *edx )
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal cpu_xgetbv, 3,7
|
||||
push r2
|
||||
push r1
|
||||
mov ecx, r0d
|
||||
xgetbv
|
||||
pop r4
|
||||
mov [r4], eax
|
||||
pop r4
|
||||
mov [r4], edx
|
||||
RET
|
||||
|
||||
%if ARCH_X86_64
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void stack_align( void (*func)(void*), void *arg );
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal stack_align
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
%if WIN64
|
||||
sub rsp, 32 ; shadow space
|
||||
%endif
|
||||
and rsp, ~31
|
||||
mov rax, r0
|
||||
mov r0, r1
|
||||
mov r1, r2
|
||||
mov r2, r3
|
||||
call rax
|
||||
leave
|
||||
ret
|
||||
|
||||
%else
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; int cpu_cpuid_test( void )
|
||||
; return 0 if unsupported
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal cpu_cpuid_test
|
||||
pushfd
|
||||
push ebx
|
||||
push ebp
|
||||
push esi
|
||||
push edi
|
||||
pushfd
|
||||
pop eax
|
||||
mov ebx, eax
|
||||
xor eax, 0x200000
|
||||
push eax
|
||||
popfd
|
||||
pushfd
|
||||
pop eax
|
||||
xor eax, ebx
|
||||
pop edi
|
||||
pop esi
|
||||
pop ebp
|
||||
pop ebx
|
||||
popfd
|
||||
ret
|
||||
|
||||
cglobal stack_align
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
sub esp, 12
|
||||
and esp, ~31
|
||||
mov ecx, [ebp+8]
|
||||
mov edx, [ebp+12]
|
||||
mov [esp], edx
|
||||
mov edx, [ebp+16]
|
||||
mov [esp+4], edx
|
||||
mov edx, [ebp+20]
|
||||
mov [esp+8], edx
|
||||
call ecx
|
||||
leave
|
||||
ret
|
||||
|
||||
%endif
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void cpu_emms( void )
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal cpu_emms
|
||||
emms
|
||||
ret
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void cpu_sfence( void )
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal cpu_sfence
|
||||
sfence
|
||||
ret
|
||||
|
||||
cextern intel_cpu_indicator_init
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void safe_intel_cpu_indicator_init( void );
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal safe_intel_cpu_indicator_init
|
||||
push r0
|
||||
push r1
|
||||
push r2
|
||||
push r3
|
||||
push r4
|
||||
push r5
|
||||
push r6
|
||||
%if ARCH_X86_64
|
||||
push r7
|
||||
push r8
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
%endif
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
%if WIN64
|
||||
sub rsp, 32 ; shadow space
|
||||
%endif
|
||||
and rsp, ~31
|
||||
call intel_cpu_indicator_init
|
||||
leave
|
||||
%if ARCH_X86_64
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop r8
|
||||
pop r7
|
||||
%endif
|
||||
pop r6
|
||||
pop r5
|
||||
pop r4
|
||||
pop r3
|
||||
pop r2
|
||||
pop r1
|
||||
pop r0
|
||||
ret
|
3719
x265/source/common/x86/dct8.asm
Normal file
3719
x265/source/common/x86/dct8.asm
Normal file
File diff suppressed because it is too large
Load diff
45
x265/source/common/x86/dct8.h
Normal file
45
x265/source/common/x86/dct8.h
Normal file
|
@ -0,0 +1,45 @@
|
|||
/*****************************************************************************
|
||||
* Copyright (C) 2013 x265 project
|
||||
*
|
||||
* Authors: Nabajit Deka <nabajit@multicorewareinc.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at license @ x265.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X265_DCT8_H
|
||||
#define X265_DCT8_H
|
||||
|
||||
FUNCDEF_TU_S2(void, dct, sse2, const int16_t* src, int16_t* dst, intptr_t srcStride);
|
||||
FUNCDEF_TU_S2(void, dct, ssse3, const int16_t* src, int16_t* dst, intptr_t srcStride);
|
||||
FUNCDEF_TU_S2(void, dct, sse4, const int16_t* src, int16_t* dst, intptr_t srcStride);
|
||||
FUNCDEF_TU_S2(void, dct, avx2, const int16_t* src, int16_t* dst, intptr_t srcStride);
|
||||
|
||||
FUNCDEF_TU_S2(void, idct, sse2, const int16_t* src, int16_t* dst, intptr_t dstStride);
|
||||
FUNCDEF_TU_S2(void, idct, ssse3, const int16_t* src, int16_t* dst, intptr_t dstStride);
|
||||
FUNCDEF_TU_S2(void, idct, sse4, const int16_t* src, int16_t* dst, intptr_t dstStride);
|
||||
FUNCDEF_TU_S2(void, idct, avx2, const int16_t* src, int16_t* dst, intptr_t dstStride);
|
||||
|
||||
void PFX(dst4_ssse3)(const int16_t* src, int16_t* dst, intptr_t srcStride);
|
||||
void PFX(dst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
|
||||
void PFX(idst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
|
||||
void PFX(dst4_avx2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
|
||||
void PFX(idst4_avx2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
|
||||
void PFX(denoise_dct_sse4)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
|
||||
void PFX(denoise_dct_avx2)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
|
||||
|
||||
#endif // ifndef X265_DCT8_H
|
93
x265/source/common/x86/intrapred.h
Normal file
93
x265/source/common/x86/intrapred.h
Normal file
|
@ -0,0 +1,93 @@
|
|||
/*****************************************************************************
|
||||
* intrapred.h: Intra Prediction metrics
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2003-2013 x264 project
|
||||
*
|
||||
* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
|
||||
* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at license @ x265.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X265_INTRAPRED_H
|
||||
#define X265_INTRAPRED_H
|
||||
|
||||
#define DECL_ANG(bsize, mode, cpu) \
|
||||
void PFX(intra_pred_ang ## bsize ## _ ## mode ## _ ## cpu)(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
|
||||
|
||||
#define DECL_ANGS(bsize, cpu) \
|
||||
DECL_ANG(bsize, 2, cpu); \
|
||||
DECL_ANG(bsize, 3, cpu); \
|
||||
DECL_ANG(bsize, 4, cpu); \
|
||||
DECL_ANG(bsize, 5, cpu); \
|
||||
DECL_ANG(bsize, 6, cpu); \
|
||||
DECL_ANG(bsize, 7, cpu); \
|
||||
DECL_ANG(bsize, 8, cpu); \
|
||||
DECL_ANG(bsize, 9, cpu); \
|
||||
DECL_ANG(bsize, 10, cpu); \
|
||||
DECL_ANG(bsize, 11, cpu); \
|
||||
DECL_ANG(bsize, 12, cpu); \
|
||||
DECL_ANG(bsize, 13, cpu); \
|
||||
DECL_ANG(bsize, 14, cpu); \
|
||||
DECL_ANG(bsize, 15, cpu); \
|
||||
DECL_ANG(bsize, 16, cpu); \
|
||||
DECL_ANG(bsize, 17, cpu); \
|
||||
DECL_ANG(bsize, 18, cpu); \
|
||||
DECL_ANG(bsize, 19, cpu); \
|
||||
DECL_ANG(bsize, 20, cpu); \
|
||||
DECL_ANG(bsize, 21, cpu); \
|
||||
DECL_ANG(bsize, 22, cpu); \
|
||||
DECL_ANG(bsize, 23, cpu); \
|
||||
DECL_ANG(bsize, 24, cpu); \
|
||||
DECL_ANG(bsize, 25, cpu); \
|
||||
DECL_ANG(bsize, 26, cpu); \
|
||||
DECL_ANG(bsize, 27, cpu); \
|
||||
DECL_ANG(bsize, 28, cpu); \
|
||||
DECL_ANG(bsize, 29, cpu); \
|
||||
DECL_ANG(bsize, 30, cpu); \
|
||||
DECL_ANG(bsize, 31, cpu); \
|
||||
DECL_ANG(bsize, 32, cpu); \
|
||||
DECL_ANG(bsize, 33, cpu); \
|
||||
DECL_ANG(bsize, 34, cpu)
|
||||
|
||||
#define DECL_ALL(cpu) \
|
||||
FUNCDEF_TU(void, all_angs_pred, cpu, pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); \
|
||||
FUNCDEF_TU(void, intra_filter, cpu, const pixel *samples, pixel *filtered); \
|
||||
DECL_ANGS(4, cpu); \
|
||||
DECL_ANGS(8, cpu); \
|
||||
DECL_ANGS(16, cpu); \
|
||||
DECL_ANGS(32, cpu)
|
||||
|
||||
FUNCDEF_TU_S2(void, intra_pred_dc, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
|
||||
FUNCDEF_TU_S2(void, intra_pred_dc, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
|
||||
FUNCDEF_TU_S2(void, intra_pred_dc, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
|
||||
|
||||
FUNCDEF_TU_S2(void, intra_pred_planar, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
|
||||
FUNCDEF_TU_S2(void, intra_pred_planar, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
|
||||
FUNCDEF_TU_S2(void, intra_pred_planar, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
|
||||
|
||||
DECL_ALL(sse2);
|
||||
DECL_ALL(ssse3);
|
||||
DECL_ALL(sse4);
|
||||
DECL_ALL(avx2);
|
||||
|
||||
#undef DECL_ALL
|
||||
#undef DECL_ANGS
|
||||
#undef DECL_ANG
|
||||
|
||||
|
||||
#endif // ifndef X265_INTRAPRED_H
|
22071
x265/source/common/x86/intrapred16.asm
Normal file
22071
x265/source/common/x86/intrapred16.asm
Normal file
File diff suppressed because it is too large
Load diff
22682
x265/source/common/x86/intrapred8.asm
Normal file
22682
x265/source/common/x86/intrapred8.asm
Normal file
File diff suppressed because it is too large
Load diff
24166
x265/source/common/x86/intrapred8_allangs.asm
Normal file
24166
x265/source/common/x86/intrapred8_allangs.asm
Normal file
File diff suppressed because it is too large
Load diff
13007
x265/source/common/x86/ipfilter16.asm
Normal file
13007
x265/source/common/x86/ipfilter16.asm
Normal file
File diff suppressed because it is too large
Load diff
27826
x265/source/common/x86/ipfilter8.asm
Normal file
27826
x265/source/common/x86/ipfilter8.asm
Normal file
File diff suppressed because it is too large
Load diff
49
x265/source/common/x86/ipfilter8.h
Normal file
49
x265/source/common/x86/ipfilter8.h
Normal file
|
@ -0,0 +1,49 @@
|
|||
/*****************************************************************************
|
||||
* Copyright (C) 2013 x265 project
|
||||
*
|
||||
* Authors: Steve Borho <steve@borho.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at license @ x265.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X265_IPFILTER8_H
|
||||
#define X265_IPFILTER8_H
|
||||
|
||||
#define SETUP_FUNC_DEF(cpu) \
|
||||
FUNCDEF_PU(void, interp_8tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
|
||||
FUNCDEF_PU(void, interp_8tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
|
||||
FUNCDEF_PU(void, interp_8tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
|
||||
FUNCDEF_PU(void, interp_8tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
|
||||
FUNCDEF_PU(void, interp_8tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
|
||||
FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
|
||||
FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); \
|
||||
FUNCDEF_CHROMA_PU(void, filterPixelToShort, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
|
||||
FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
|
||||
FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
|
||||
FUNCDEF_CHROMA_PU(void, interp_4tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
|
||||
FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
|
||||
FUNCDEF_CHROMA_PU(void, interp_4tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
|
||||
FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
|
||||
|
||||
SETUP_FUNC_DEF(sse2);
|
||||
SETUP_FUNC_DEF(ssse3);
|
||||
SETUP_FUNC_DEF(sse3);
|
||||
SETUP_FUNC_DEF(sse4);
|
||||
SETUP_FUNC_DEF(avx2);
|
||||
|
||||
#endif // ifndef X265_IPFILTER8_H
|
2281
x265/source/common/x86/loopfilter.asm
Normal file
2281
x265/source/common/x86/loopfilter.asm
Normal file
File diff suppressed because it is too large
Load diff
48
x265/source/common/x86/loopfilter.h
Normal file
48
x265/source/common/x86/loopfilter.h
Normal file
|
@ -0,0 +1,48 @@
|
|||
/*****************************************************************************
|
||||
* Copyright (C) 2013 x265 project
|
||||
*
|
||||
* Authors: Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
|
||||
* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at license @ x265.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X265_LOOPFILTER_H
|
||||
#define X265_LOOPFILTER_H
|
||||
|
||||
#define DECL_SAO(cpu) \
|
||||
void PFX(saoCuOrgE0_ ## cpu)(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride); \
|
||||
void PFX(saoCuOrgE1_ ## cpu)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width); \
|
||||
void PFX(saoCuOrgE1_2Rows_ ## cpu)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width); \
|
||||
void PFX(saoCuOrgE2_ ## cpu)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride); \
|
||||
void PFX(saoCuOrgE2_ ## cpu)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride); \
|
||||
void PFX(saoCuOrgE2_32_ ## cpu)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride); \
|
||||
void PFX(saoCuOrgE3_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
|
||||
void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
|
||||
void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
|
||||
void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
|
||||
void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
|
||||
void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
|
||||
void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
|
||||
void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
|
||||
void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
|
||||
|
||||
DECL_SAO(sse4);
|
||||
DECL_SAO(avx2);
|
||||
|
||||
#endif // ifndef X265_LOOPFILTER_H
|
5725
x265/source/common/x86/mc-a.asm
Normal file
5725
x265/source/common/x86/mc-a.asm
Normal file
File diff suppressed because it is too large
Load diff
1137
x265/source/common/x86/mc-a2.asm
Normal file
1137
x265/source/common/x86/mc-a2.asm
Normal file
File diff suppressed because it is too large
Load diff
39
x265/source/common/x86/mc.h
Normal file
39
x265/source/common/x86/mc.h
Normal file
|
@ -0,0 +1,39 @@
|
|||
/*****************************************************************************
|
||||
* Copyright (C) 2013 x265 project
|
||||
*
|
||||
* Authors: Steve Borho <steve@borho.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at license @ x265.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X265_MC_H
|
||||
#define X265_MC_H
|
||||
|
||||
#define LOWRES(cpu) \
|
||||
void PFX(frame_init_lowres_core_ ## cpu)(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc, \
|
||||
intptr_t src_stride, intptr_t dst_stride, int width, int height);
|
||||
LOWRES(mmx2)
|
||||
LOWRES(sse2)
|
||||
LOWRES(ssse3)
|
||||
LOWRES(avx)
|
||||
LOWRES(avx2)
|
||||
LOWRES(xop)
|
||||
|
||||
#undef LOWRES
|
||||
|
||||
#endif // ifndef X265_MC_H
|
420
x265/source/common/x86/pixel-32.asm
Normal file
420
x265/source/common/x86/pixel-32.asm
Normal file
|
@ -0,0 +1,420 @@
|
|||
;*****************************************************************************
|
||||
;* pixel-32.asm: x86_32 pixel metrics
|
||||
;*****************************************************************************
|
||||
;* Copyright (C) 2003-2013 x264 project
|
||||
;*
|
||||
;* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||
;* Laurent Aimar <fenrir@via.ecp.fr>
|
||||
;*
|
||||
;* This program is free software; you can redistribute it and/or modify
|
||||
;* it under the terms of the GNU General Public License as published by
|
||||
;* the Free Software Foundation; either version 2 of the License, or
|
||||
;* (at your option) any later version.
|
||||
;*
|
||||
;* This program is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;* GNU General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU General Public License
|
||||
;* along with this program; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
;*
|
||||
;* This program is also available under a commercial proprietary license.
|
||||
;* For more information, contact us at license @ x265.com.
|
||||
;*****************************************************************************
|
||||
|
||||
%include "x86inc.asm"
|
||||
%include "x86util.asm"
|
||||
|
||||
cextern pw_ppmmppmm
|
||||
cextern pw_pmpmpmpm
|
||||
|
||||
SECTION .text
|
||||
INIT_MMX mmx2
|
||||
|
||||
%macro LOAD_DIFF_4x8P 1 ; dx
|
||||
LOAD_DIFF m0, m7, none, [r0+%1], [r2+%1]
|
||||
LOAD_DIFF m1, m6, none, [r0+%1+r1], [r2+%1+r3]
|
||||
LOAD_DIFF m2, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
|
||||
LOAD_DIFF m3, m6, none, [r0+%1+r4], [r2+%1+r5]
|
||||
lea r0, [r0+4*r1]
|
||||
lea r2, [r2+4*r3]
|
||||
LOAD_DIFF m4, m7, none, [r0+%1], [r2+%1]
|
||||
LOAD_DIFF m5, m6, none, [r0+%1+r1], [r2+%1+r3]
|
||||
LOAD_DIFF m6, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
|
||||
movq [spill], m5
|
||||
LOAD_DIFF m7, m5, none, [r0+%1+r4], [r2+%1+r5]
|
||||
movq m5, [spill]
|
||||
%endmacro
|
||||
|
||||
%macro SUM4x8_MM 0
|
||||
movq [spill], m6
|
||||
movq [spill+8], m7
|
||||
ABSW2 m0, m1, m0, m1, m6, m7
|
||||
ABSW2 m2, m3, m2, m3, m6, m7
|
||||
paddw m0, m2
|
||||
paddw m1, m3
|
||||
movq m6, [spill]
|
||||
movq m7, [spill+8]
|
||||
ABSW2 m4, m5, m4, m5, m2, m3
|
||||
ABSW2 m6, m7, m6, m7, m2, m3
|
||||
paddw m4, m6
|
||||
paddw m5, m7
|
||||
paddw m0, m4
|
||||
paddw m1, m5
|
||||
paddw m0, m1
|
||||
%endmacro
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal pixel_sa8d_8x8_internal
|
||||
push r0
|
||||
push r2
|
||||
sub esp, 0x74
|
||||
%define args esp+0x74
|
||||
%define spill esp+0x60 ; +16
|
||||
%define trans esp+0 ; +96
|
||||
LOAD_DIFF_4x8P 0
|
||||
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
|
||||
|
||||
movq [spill], m1
|
||||
TRANSPOSE4x4W 4, 5, 6, 7, 1
|
||||
movq [trans+0x00], m4
|
||||
movq [trans+0x08], m5
|
||||
movq [trans+0x10], m6
|
||||
movq [trans+0x18], m7
|
||||
movq m1, [spill]
|
||||
TRANSPOSE4x4W 0, 1, 2, 3, 4
|
||||
movq [trans+0x20], m0
|
||||
movq [trans+0x28], m1
|
||||
movq [trans+0x30], m2
|
||||
movq [trans+0x38], m3
|
||||
|
||||
mov r0, [args+4]
|
||||
mov r2, [args]
|
||||
LOAD_DIFF_4x8P 4
|
||||
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
|
||||
|
||||
movq [spill], m7
|
||||
TRANSPOSE4x4W 0, 1, 2, 3, 7
|
||||
movq [trans+0x40], m0
|
||||
movq [trans+0x48], m1
|
||||
movq [trans+0x50], m2
|
||||
movq [trans+0x58], m3
|
||||
movq m7, [spill]
|
||||
TRANSPOSE4x4W 4, 5, 6, 7, 1
|
||||
movq m0, [trans+0x00]
|
||||
movq m1, [trans+0x08]
|
||||
movq m2, [trans+0x10]
|
||||
movq m3, [trans+0x18]
|
||||
|
||||
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
|
||||
SUM4x8_MM
|
||||
movq [trans], m0
|
||||
|
||||
movq m0, [trans+0x20]
|
||||
movq m1, [trans+0x28]
|
||||
movq m2, [trans+0x30]
|
||||
movq m3, [trans+0x38]
|
||||
movq m4, [trans+0x40]
|
||||
movq m5, [trans+0x48]
|
||||
movq m6, [trans+0x50]
|
||||
movq m7, [trans+0x58]
|
||||
|
||||
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
|
||||
SUM4x8_MM
|
||||
|
||||
pavgw m0, [trans]
|
||||
add esp, 0x7c
|
||||
ret
|
||||
%undef args
|
||||
%undef spill
|
||||
%undef trans
|
||||
|
||||
%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
|
||||
pxor %7, %7
|
||||
pshufw %4, %1, q1032
|
||||
pshufw %5, %2, q1032
|
||||
pshufw %6, %3, q1032
|
||||
paddusw %1, %4
|
||||
paddusw %2, %5
|
||||
paddusw %3, %6
|
||||
punpcklwd %1, %7
|
||||
punpcklwd %2, %7
|
||||
punpcklwd %3, %7
|
||||
pshufw %4, %1, q1032
|
||||
pshufw %5, %2, q1032
|
||||
pshufw %6, %3, q1032
|
||||
%8 %1, %4
|
||||
%8 %2, %5
|
||||
%8 %3, %6
|
||||
%endmacro
|
||||
|
||||
%macro LOAD_4x8P 1 ; dx
|
||||
pxor m7, m7
|
||||
movd m6, [r0+%1+7*FENC_STRIDE]
|
||||
movd m0, [r0+%1+0*FENC_STRIDE]
|
||||
movd m1, [r0+%1+1*FENC_STRIDE]
|
||||
movd m2, [r0+%1+2*FENC_STRIDE]
|
||||
movd m3, [r0+%1+3*FENC_STRIDE]
|
||||
movd m4, [r0+%1+4*FENC_STRIDE]
|
||||
movd m5, [r0+%1+5*FENC_STRIDE]
|
||||
punpcklbw m6, m7
|
||||
punpcklbw m0, m7
|
||||
punpcklbw m1, m7
|
||||
movq [spill], m6
|
||||
punpcklbw m2, m7
|
||||
punpcklbw m3, m7
|
||||
movd m6, [r0+%1+6*FENC_STRIDE]
|
||||
punpcklbw m4, m7
|
||||
punpcklbw m5, m7
|
||||
punpcklbw m6, m7
|
||||
movq m7, [spill]
|
||||
%endmacro
|
||||
|
||||
%macro HSUMSUB2 4
|
||||
pshufw m4, %1, %3
|
||||
pshufw m5, %2, %3
|
||||
pmullw %1, %4
|
||||
pmullw m5, %4
|
||||
paddw %1, m4
|
||||
paddw %2, m5
|
||||
%endmacro
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal intra_sa8d_x3_8x8, 2,3
|
||||
SUB esp, 0x94
|
||||
%define edge esp+0x70 ; +32
|
||||
%define spill esp+0x60 ; +16
|
||||
%define trans esp+0 ; +96
|
||||
%define sum esp+0 ; +32
|
||||
|
||||
pxor m7, m7
|
||||
movq m0, [r1+7]
|
||||
movq m2, [r1+16]
|
||||
movq m1, m0
|
||||
movq m3, m2
|
||||
punpcklbw m0, m7
|
||||
punpckhbw m1, m7
|
||||
punpcklbw m2, m7
|
||||
punpckhbw m3, m7
|
||||
movq m6, [pw_ppmmppmm]
|
||||
HSUMSUB2 m0, m2, q1032, m6
|
||||
HSUMSUB2 m1, m3, q1032, m6
|
||||
movq m6, [pw_pmpmpmpm]
|
||||
HSUMSUB2 m0, m2, q2301, m6
|
||||
HSUMSUB2 m1, m3, q2301, m6
|
||||
movq m4, m0
|
||||
movq m5, m2
|
||||
paddw m0, m1
|
||||
paddw m2, m3
|
||||
psubw m4, m1
|
||||
psubw m3, m5
|
||||
movq [edge+0], m0
|
||||
movq [edge+8], m4
|
||||
movq [edge+16], m2
|
||||
movq [edge+24], m3
|
||||
|
||||
LOAD_4x8P 0
|
||||
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
|
||||
|
||||
movq [spill], m0
|
||||
TRANSPOSE4x4W 4, 5, 6, 7, 0
|
||||
movq [trans+0x00], m4
|
||||
movq [trans+0x08], m5
|
||||
movq [trans+0x10], m6
|
||||
movq [trans+0x18], m7
|
||||
movq m0, [spill]
|
||||
TRANSPOSE4x4W 0, 1, 2, 3, 4
|
||||
movq [trans+0x20], m0
|
||||
movq [trans+0x28], m1
|
||||
movq [trans+0x30], m2
|
||||
movq [trans+0x38], m3
|
||||
|
||||
LOAD_4x8P 4
|
||||
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
|
||||
|
||||
movq [spill], m7
|
||||
TRANSPOSE4x4W 0, 1, 2, 3, 7
|
||||
movq [trans+0x40], m0
|
||||
movq [trans+0x48], m1
|
||||
movq [trans+0x50], m2
|
||||
movq [trans+0x58], m3
|
||||
movq m7, [spill]
|
||||
TRANSPOSE4x4W 4, 5, 6, 7, 0
|
||||
movq m0, [trans+0x00]
|
||||
movq m1, [trans+0x08]
|
||||
movq m2, [trans+0x10]
|
||||
movq m3, [trans+0x18]
|
||||
|
||||
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
|
||||
|
||||
movq [spill+0], m0
|
||||
movq [spill+8], m1
|
||||
ABSW2 m2, m3, m2, m3, m0, m1
|
||||
ABSW2 m4, m5, m4, m5, m0, m1
|
||||
paddw m2, m4
|
||||
paddw m3, m5
|
||||
ABSW2 m6, m7, m6, m7, m4, m5
|
||||
movq m0, [spill+0]
|
||||
movq m1, [spill+8]
|
||||
paddw m2, m6
|
||||
paddw m3, m7
|
||||
paddw m2, m3
|
||||
ABSW m1, m1, m4
|
||||
paddw m2, m1 ; 7x4 sum
|
||||
movq m7, m0
|
||||
movq m1, [edge+8] ; left bottom
|
||||
psllw m1, 3
|
||||
psubw m7, m1
|
||||
ABSW2 m0, m7, m0, m7, m5, m3
|
||||
paddw m0, m2
|
||||
paddw m7, m2
|
||||
movq [sum+0], m0 ; dc
|
||||
movq [sum+8], m7 ; left
|
||||
|
||||
movq m0, [trans+0x20]
|
||||
movq m1, [trans+0x28]
|
||||
movq m2, [trans+0x30]
|
||||
movq m3, [trans+0x38]
|
||||
movq m4, [trans+0x40]
|
||||
movq m5, [trans+0x48]
|
||||
movq m6, [trans+0x50]
|
||||
movq m7, [trans+0x58]
|
||||
|
||||
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
|
||||
|
||||
movd [sum+0x10], m0
|
||||
movd [sum+0x12], m1
|
||||
movd [sum+0x14], m2
|
||||
movd [sum+0x16], m3
|
||||
movd [sum+0x18], m4
|
||||
movd [sum+0x1a], m5
|
||||
movd [sum+0x1c], m6
|
||||
movd [sum+0x1e], m7
|
||||
|
||||
movq [spill], m0
|
||||
movq [spill+8], m1
|
||||
ABSW2 m2, m3, m2, m3, m0, m1
|
||||
ABSW2 m4, m5, m4, m5, m0, m1
|
||||
paddw m2, m4
|
||||
paddw m3, m5
|
||||
paddw m2, m3
|
||||
movq m0, [spill]
|
||||
movq m1, [spill+8]
|
||||
ABSW2 m6, m7, m6, m7, m4, m5
|
||||
ABSW m1, m1, m3
|
||||
paddw m2, m7
|
||||
paddw m1, m6
|
||||
paddw m2, m1 ; 7x4 sum
|
||||
movq m1, m0
|
||||
|
||||
movq m7, [edge+0]
|
||||
psllw m7, 3 ; left top
|
||||
|
||||
mov r2, [edge+0]
|
||||
add r2, [edge+16]
|
||||
lea r2, [4*r2+32]
|
||||
and r2, 0xffc0
|
||||
movd m6, r2 ; dc
|
||||
|
||||
psubw m1, m7
|
||||
psubw m0, m6
|
||||
ABSW2 m0, m1, m0, m1, m5, m6
|
||||
movq m3, [sum+0] ; dc
|
||||
paddw m0, m2
|
||||
paddw m1, m2
|
||||
movq m2, m0
|
||||
paddw m0, m3
|
||||
paddw m1, [sum+8] ; h
|
||||
psrlq m2, 16
|
||||
paddw m2, m3
|
||||
|
||||
movq m3, [edge+16] ; top left
|
||||
movq m4, [edge+24] ; top right
|
||||
psllw m3, 3
|
||||
psllw m4, 3
|
||||
psubw m3, [sum+16]
|
||||
psubw m4, [sum+24]
|
||||
ABSW2 m3, m4, m3, m4, m5, m6
|
||||
paddw m2, m3
|
||||
paddw m2, m4 ; v
|
||||
|
||||
SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, pavgw
|
||||
mov r2, r2m
|
||||
pxor m7, m7
|
||||
punpckldq m2, m1
|
||||
pavgw m0, m7
|
||||
pavgw m2, m7
|
||||
movd [r2+8], m0 ; dc
|
||||
movq [r2+0], m2 ; v, h
|
||||
ADD esp, 0x94
|
||||
RET
|
||||
%undef edge
|
||||
%undef spill
|
||||
%undef trans
|
||||
%undef sum
|
||||
|
||||
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
|
||||
; const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal pixel_ssim_4x4x2_core, 0,5
|
||||
mov r1, r1m
|
||||
mov r3, r3m
|
||||
mov r4, 4
|
||||
pxor m0, m0
|
||||
.loop:
|
||||
mov r0, r0m
|
||||
mov r2, r2m
|
||||
add r0, r4
|
||||
add r2, r4
|
||||
pxor m1, m1
|
||||
pxor m2, m2
|
||||
pxor m3, m3
|
||||
pxor m4, m4
|
||||
%rep 4
|
||||
movd m5, [r0]
|
||||
movd m6, [r2]
|
||||
punpcklbw m5, m0
|
||||
punpcklbw m6, m0
|
||||
paddw m1, m5
|
||||
paddw m2, m6
|
||||
movq m7, m5
|
||||
pmaddwd m5, m5
|
||||
pmaddwd m7, m6
|
||||
pmaddwd m6, m6
|
||||
paddd m3, m5
|
||||
paddd m4, m7
|
||||
paddd m3, m6
|
||||
add r0, r1
|
||||
add r2, r3
|
||||
%endrep
|
||||
mov r0, r4m
|
||||
lea r0, [r0+r4*4]
|
||||
pshufw m5, m1, q0032
|
||||
pshufw m6, m2, q0032
|
||||
paddusw m1, m5
|
||||
paddusw m2, m6
|
||||
punpcklwd m1, m2
|
||||
pshufw m2, m1, q0032
|
||||
pshufw m5, m3, q0032
|
||||
pshufw m6, m4, q0032
|
||||
paddusw m1, m2
|
||||
paddd m3, m5
|
||||
paddd m4, m6
|
||||
punpcklwd m1, m0
|
||||
punpckldq m3, m4
|
||||
movq [r0+0], m1
|
||||
movq [r0+8], m3
|
||||
sub r4, 4
|
||||
jge .loop
|
||||
emms
|
||||
RET
|
||||
|
12266
x265/source/common/x86/pixel-a.asm
Normal file
12266
x265/source/common/x86/pixel-a.asm
Normal file
File diff suppressed because it is too large
Load diff
59
x265/source/common/x86/pixel-util.h
Normal file
59
x265/source/common/x86/pixel-util.h
Normal file
|
@ -0,0 +1,59 @@
|
|||
/*****************************************************************************
|
||||
* Copyright (C) 2013 x265 project
|
||||
*
|
||||
* Authors: Steve Borho <steve@borho.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at license @ x265.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X265_PIXEL_UTIL_H
|
||||
#define X265_PIXEL_UTIL_H
|
||||
|
||||
#define DEFINE_UTILS(cpu) \
|
||||
FUNCDEF_TU_S2(void, getResidual, cpu, const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); \
|
||||
FUNCDEF_TU_S2(void, transpose, cpu, pixel* dest, const pixel* src, intptr_t stride); \
|
||||
FUNCDEF_TU(int, count_nonzero, cpu, const int16_t* quantCoeff); \
|
||||
uint32_t PFX(quant_ ## cpu(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)); \
|
||||
uint32_t PFX(nquant_ ## cpu(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)); \
|
||||
void PFX(dequant_normal_ ## cpu(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)); \
|
||||
void PFX(dequant_scaling_## cpu(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)); \
|
||||
void PFX(weight_pp_ ## cpu(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)); \
|
||||
void PFX(weight_sp_ ## cpu(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)); \
|
||||
void PFX(scale1D_128to64_ ## cpu(pixel*, const pixel*)); \
|
||||
void PFX(scale2D_64to32_ ## cpu(pixel*, const pixel*, intptr_t)); \
|
||||
uint32_t PFX(costCoeffRemain_ ## cpu(uint16_t *absCoeff, int numNonZero, int idx)); \
|
||||
uint32_t PFX(costC1C2Flag_sse2(uint16_t *absCoeff, intptr_t numNonZero, uint8_t *baseCtxMod, intptr_t ctxOffset)); \
|
||||
|
||||
DEFINE_UTILS(sse2);
|
||||
DEFINE_UTILS(ssse3);
|
||||
DEFINE_UTILS(sse4);
|
||||
DEFINE_UTILS(avx2);
|
||||
|
||||
#undef DEFINE_UTILS
|
||||
|
||||
void PFX(pixel_ssim_4x4x2_core_sse2(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]));
|
||||
void PFX(pixel_ssim_4x4x2_core_avx(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]));
|
||||
float PFX(pixel_ssim_end4_sse2(int sum0[5][4], int sum1[5][4], int width));
|
||||
float PFX(pixel_ssim_end4_avx(int sum0[5][4], int sum1[5][4], int width));
|
||||
|
||||
int PFX(scanPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize));
|
||||
int PFX(scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize));
|
||||
uint32_t PFX(findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]));
|
||||
uint32_t PFX(costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
|
||||
|
||||
#endif // ifndef X265_PIXEL_UTIL_H
|
7340
x265/source/common/x86/pixel-util8.asm
Normal file
7340
x265/source/common/x86/pixel-util8.asm
Normal file
File diff suppressed because it is too large
Load diff
69
x265/source/common/x86/pixel.h
Normal file
69
x265/source/common/x86/pixel.h
Normal file
|
@ -0,0 +1,69 @@
|
|||
/*****************************************************************************
|
||||
* pixel.h: x86 pixel metrics
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2003-2013 x264 project
|
||||
*
|
||||
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
|
||||
* Loren Merritt <lorenm@u.washington.edu>
|
||||
* Fiona Glaser <fiona@x264.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at license @ x265.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X265_I386_PIXEL_H
|
||||
#define X265_I386_PIXEL_H
|
||||
|
||||
void PFX(downShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
|
||||
void PFX(downShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
|
||||
void PFX(upShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
|
||||
void PFX(upShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
|
||||
void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
|
||||
void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
|
||||
pixel PFX(planeClipAndMax_avx2)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
|
||||
|
||||
#define DECL_PIXELS(cpu) \
|
||||
FUNCDEF_PU(uint32_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
|
||||
FUNCDEF_PU(int, pixel_sa8d, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
|
||||
FUNCDEF_PU(void, pixel_sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
|
||||
FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
|
||||
FUNCDEF_PU(void, pixel_avg, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
|
||||
FUNCDEF_PU(void, pixel_add_ps, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
|
||||
FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
|
||||
FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
|
||||
FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
|
||||
FUNCDEF_CHROMA_PU(uint32_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
|
||||
FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
|
||||
FUNCDEF_CHROMA_PU(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
|
||||
FUNCDEF_TU_S(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
|
||||
FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
|
||||
FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); \
|
||||
FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
|
||||
|
||||
DECL_PIXELS(mmx);
|
||||
DECL_PIXELS(mmx2);
|
||||
DECL_PIXELS(sse2);
|
||||
DECL_PIXELS(sse3);
|
||||
DECL_PIXELS(sse4);
|
||||
DECL_PIXELS(ssse3);
|
||||
DECL_PIXELS(avx);
|
||||
DECL_PIXELS(xop);
|
||||
DECL_PIXELS(avx2);
|
||||
|
||||
#undef DECL_PIXELS
|
||||
|
||||
#endif // ifndef X265_I386_PIXEL_H
|
1146
x265/source/common/x86/pixeladd8.asm
Normal file
1146
x265/source/common/x86/pixeladd8.asm
Normal file
File diff suppressed because it is too large
Load diff
4573
x265/source/common/x86/sad-a.asm
Normal file
4573
x265/source/common/x86/sad-a.asm
Normal file
File diff suppressed because it is too large
Load diff
1591
x265/source/common/x86/sad16-a.asm
Normal file
1591
x265/source/common/x86/sad16-a.asm
Normal file
File diff suppressed because it is too large
Load diff
2760
x265/source/common/x86/ssd-a.asm
Normal file
2760
x265/source/common/x86/ssd-a.asm
Normal file
File diff suppressed because it is too large
Load diff
1485
x265/source/common/x86/x86inc.asm
Normal file
1485
x265/source/common/x86/x86inc.asm
Normal file
File diff suppressed because it is too large
Load diff
893
x265/source/common/x86/x86util.asm
Normal file
893
x265/source/common/x86/x86util.asm
Normal file
|
@ -0,0 +1,893 @@
|
|||
;*****************************************************************************
|
||||
;* x86util.asm: x86 utility macros
|
||||
;*****************************************************************************
|
||||
;* Copyright (C) 2008-2013 x264 project
|
||||
;*
|
||||
;* Authors: Holger Lubitz <holger@lubitz.org>
|
||||
;* Loren Merritt <lorenm@u.washington.edu>
|
||||
;*
|
||||
;* This program is free software; you can redistribute it and/or modify
|
||||
;* it under the terms of the GNU General Public License as published by
|
||||
;* the Free Software Foundation; either version 2 of the License, or
|
||||
;* (at your option) any later version.
|
||||
;*
|
||||
;* This program is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;* GNU General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU General Public License
|
||||
;* along with this program; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
;*
|
||||
;* This program is also available under a commercial proprietary license.
|
||||
;* For more information, contact us at license @ x265.com.
|
||||
;*****************************************************************************
|
||||
|
||||
%assign FENC_STRIDE 64
|
||||
%assign FDEC_STRIDE 32
|
||||
|
||||
%assign SIZEOF_PIXEL 1
|
||||
%assign SIZEOF_DCTCOEF 2
|
||||
%define pixel byte
|
||||
%define vpbroadcastdct vpbroadcastw
|
||||
%define vpbroadcastpix vpbroadcastb
|
||||
%if HIGH_BIT_DEPTH
|
||||
%assign SIZEOF_PIXEL 2
|
||||
%assign SIZEOF_DCTCOEF 4
|
||||
%define pixel word
|
||||
%define vpbroadcastdct vpbroadcastd
|
||||
%define vpbroadcastpix vpbroadcastw
|
||||
%endif
|
||||
|
||||
%assign FENC_STRIDEB SIZEOF_PIXEL*FENC_STRIDE
|
||||
%assign FDEC_STRIDEB SIZEOF_PIXEL*FDEC_STRIDE
|
||||
|
||||
%assign PIXEL_MAX ((1 << BIT_DEPTH)-1)
|
||||
|
||||
%macro FIX_STRIDES 1-*
|
||||
%if HIGH_BIT_DEPTH
|
||||
%rep %0
|
||||
add %1, %1
|
||||
%rotate 1
|
||||
%endrep
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
|
||||
%macro SBUTTERFLY 4
|
||||
%ifidn %1, dqqq
|
||||
vperm2i128 m%4, m%2, m%3, q0301 ; punpckh
|
||||
vinserti128 m%2, m%2, xm%3, 1 ; punpckl
|
||||
%elif avx_enabled && mmsize >= 16
|
||||
punpckh%1 m%4, m%2, m%3
|
||||
punpckl%1 m%2, m%3
|
||||
%else
|
||||
mova m%4, m%2
|
||||
punpckl%1 m%2, m%3
|
||||
punpckh%1 m%4, m%3
|
||||
%endif
|
||||
SWAP %3, %4
|
||||
%endmacro
|
||||
|
||||
%macro SBUTTERFLY2 4
|
||||
punpckl%1 m%4, m%2, m%3
|
||||
punpckh%1 m%2, m%2, m%3
|
||||
SWAP %2, %4, %3
|
||||
%endmacro
|
||||
|
||||
%macro TRANSPOSE4x4W 5
|
||||
SBUTTERFLY wd, %1, %2, %5
|
||||
SBUTTERFLY wd, %3, %4, %5
|
||||
SBUTTERFLY dq, %1, %3, %5
|
||||
SBUTTERFLY dq, %2, %4, %5
|
||||
SWAP %2, %3
|
||||
%endmacro
|
||||
|
||||
%macro TRANSPOSE2x4x4W 5
|
||||
SBUTTERFLY wd, %1, %2, %5
|
||||
SBUTTERFLY wd, %3, %4, %5
|
||||
SBUTTERFLY dq, %1, %3, %5
|
||||
SBUTTERFLY dq, %2, %4, %5
|
||||
SBUTTERFLY qdq, %1, %2, %5
|
||||
SBUTTERFLY qdq, %3, %4, %5
|
||||
%endmacro
|
||||
|
||||
%macro TRANSPOSE4x4D 5
|
||||
SBUTTERFLY dq, %1, %2, %5
|
||||
SBUTTERFLY dq, %3, %4, %5
|
||||
SBUTTERFLY qdq, %1, %3, %5
|
||||
SBUTTERFLY qdq, %2, %4, %5
|
||||
SWAP %2, %3
|
||||
%endmacro
|
||||
|
||||
%macro TRANSPOSE8x8W 9-11
|
||||
%if ARCH_X86_64
|
||||
SBUTTERFLY wd, %1, %2, %9
|
||||
SBUTTERFLY wd, %3, %4, %9
|
||||
SBUTTERFLY wd, %5, %6, %9
|
||||
SBUTTERFLY wd, %7, %8, %9
|
||||
SBUTTERFLY dq, %1, %3, %9
|
||||
SBUTTERFLY dq, %2, %4, %9
|
||||
SBUTTERFLY dq, %5, %7, %9
|
||||
SBUTTERFLY dq, %6, %8, %9
|
||||
SBUTTERFLY qdq, %1, %5, %9
|
||||
SBUTTERFLY qdq, %2, %6, %9
|
||||
SBUTTERFLY qdq, %3, %7, %9
|
||||
SBUTTERFLY qdq, %4, %8, %9
|
||||
SWAP %2, %5
|
||||
SWAP %4, %7
|
||||
%else
|
||||
; in: m0..m7, unless %11 in which case m6 is in %9
|
||||
; out: m0..m7, unless %11 in which case m4 is in %10
|
||||
; spills into %9 and %10
|
||||
%if %0<11
|
||||
movdqa %9, m%7
|
||||
%endif
|
||||
SBUTTERFLY wd, %1, %2, %7
|
||||
movdqa %10, m%2
|
||||
movdqa m%7, %9
|
||||
SBUTTERFLY wd, %3, %4, %2
|
||||
SBUTTERFLY wd, %5, %6, %2
|
||||
SBUTTERFLY wd, %7, %8, %2
|
||||
SBUTTERFLY dq, %1, %3, %2
|
||||
movdqa %9, m%3
|
||||
movdqa m%2, %10
|
||||
SBUTTERFLY dq, %2, %4, %3
|
||||
SBUTTERFLY dq, %5, %7, %3
|
||||
SBUTTERFLY dq, %6, %8, %3
|
||||
SBUTTERFLY qdq, %1, %5, %3
|
||||
SBUTTERFLY qdq, %2, %6, %3
|
||||
movdqa %10, m%2
|
||||
movdqa m%3, %9
|
||||
SBUTTERFLY qdq, %3, %7, %2
|
||||
SBUTTERFLY qdq, %4, %8, %2
|
||||
SWAP %2, %5
|
||||
SWAP %4, %7
|
||||
%if %0<11
|
||||
movdqa m%5, %10
|
||||
%endif
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro WIDEN_SXWD 2
|
||||
punpckhwd m%2, m%1
|
||||
psrad m%2, 16
|
||||
%if cpuflag(sse4)
|
||||
pmovsxwd m%1, m%1
|
||||
%else
|
||||
punpcklwd m%1, m%1
|
||||
psrad m%1, 16
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro ABSW 2-3 ; dst, src, tmp (tmp used only if dst==src)
|
||||
%if cpuflag(ssse3)
|
||||
pabsw %1, %2
|
||||
%elifidn %3, sign ; version for pairing with PSIGNW: modifies src
|
||||
pxor %1, %1
|
||||
pcmpgtw %1, %2
|
||||
pxor %2, %1
|
||||
psubw %2, %1
|
||||
SWAP %1, %2
|
||||
%elifidn %1, %2
|
||||
pxor %3, %3
|
||||
psubw %3, %1
|
||||
pmaxsw %1, %3
|
||||
%elifid %2
|
||||
pxor %1, %1
|
||||
psubw %1, %2
|
||||
pmaxsw %1, %2
|
||||
%elif %0 == 2
|
||||
pxor %1, %1
|
||||
psubw %1, %2
|
||||
pmaxsw %1, %2
|
||||
%else
|
||||
mova %1, %2
|
||||
pxor %3, %3
|
||||
psubw %3, %1
|
||||
pmaxsw %1, %3
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro ABSW2 6 ; dst1, dst2, src1, src2, tmp, tmp
|
||||
%if cpuflag(ssse3)
|
||||
pabsw %1, %3
|
||||
pabsw %2, %4
|
||||
%elifidn %1, %3
|
||||
pxor %5, %5
|
||||
pxor %6, %6
|
||||
psubw %5, %1
|
||||
psubw %6, %2
|
||||
pmaxsw %1, %5
|
||||
pmaxsw %2, %6
|
||||
%else
|
||||
pxor %1, %1
|
||||
pxor %2, %2
|
||||
psubw %1, %3
|
||||
psubw %2, %4
|
||||
pmaxsw %1, %3
|
||||
pmaxsw %2, %4
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro ABSB 2
|
||||
%if cpuflag(ssse3)
|
||||
pabsb %1, %1
|
||||
%else
|
||||
pxor %2, %2
|
||||
psubb %2, %1
|
||||
pminub %1, %2
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; Packed-dword absolute value: %1 = |%2|.
; With the optional tmp %3, %2 is preserved (the work happens on a copy).
%macro ABSD 2-3
%if cpuflag(ssse3)
    pabsd    %1, %2
%else
    %define %%s %2
%if %0 == 3
    mova     %3, %2
    %define %%s %3              ; operate on the copy so src survives
%endif
    pxor     %1, %1
    pcmpgtd  %1, %%s            ; %1 = per-dword sign mask
    pxor     %%s, %1
    psubd    %%s, %1            ; conditional two's-complement negate
    SWAP     %1, %%s
%endif
%endmacro
|
||||
|
||||
; psignw/psignd emulation: conditionally negate %2 (or %3 copied into %2)
; according to the sign register.  The non-SSSE3 xor+sub sequence assumes
; the sign operand is an all-ones/all-zeros per-lane mask.
%macro PSIGN 3-4
%if cpuflag(ssse3) && %0 == 4
    psign%1 %2, %3, %4
%elif cpuflag(ssse3)
    psign%1 %2, %3
%elif %0 == 4
    pxor    %2, %3, %4          ; 3-operand form (AVX-emulated pxor)
    psub%1  %2, %4
%else
    pxor    %2, %3
    psub%1  %2, %3
%endif
%endmacro
|
||||
|
||||
; Word/dword convenience spellings of the PSIGN macro above.
%define PSIGNW PSIGN w,
%define PSIGND PSIGN d,
|
||||
|
||||
; Load one byte from memory at [%2] and broadcast it to every lane of %1.
; %3 is a pshufb control vector (used on SSSE3 only).
%macro SPLATB_LOAD 3
%if cpuflag(ssse3)
    movd      %1, [%2-3]        ; load 4 bytes ending at the wanted byte
    pshufb    %1, %3
%else
    movd      %1, [%2-3] ;to avoid crossing a cacheline
    punpcklbw %1, %1
    SPLATW    %1, %1, 3         ; wanted byte ended up in word 3
%endif
%endmacro
|
||||
|
||||
; Broadcast word index %3 (default 0) of %2 to all lanes of %1.
%imacro SPLATW 2-3 0
%if cpuflag(avx2) && %3 == 0
    vpbroadcastw %1, %2
%else
    PSHUFLW      %1, %2, (%3)*q1111
%if mmsize == 16
    punpcklqdq   %1, %1         ; replicate the low qword into the high half
%endif
%endif
%endmacro
|
||||
|
||||
; Broadcast dword index %3 (default 0) of %2 to all lanes of %1.
%imacro SPLATD 2-3 0
%if mmsize == 16
    pshufd %1, %2, (%3)*q1111
%else
    pshufw %1, %2, (%3)*q0101 + ((%3)+1)*q1010   ; MMX word-pair shuffle
%endif
%endmacro
|
||||
|
||||
; Clamp packed signed words in %1 to the range [%2, %3].
%macro CLIPW 3 ;(dst, min, max)
    pmaxsw %1, %2
    pminsw %1, %3
%endmacro
|
||||
|
||||
; Clamp two registers of packed signed words to the range [%3, %4].
%macro CLIPW2 4 ;(dst0, dst1, min, max)
    pmaxsw %1, %3
    pmaxsw %2, %3
    pminsw %1, %4
    pminsw %2, %4
%endmacro
|
||||
|
||||
; Horizontal add of all dwords in %1; the sum ends up in the low dword.
; %2 is clobbered.  Handles ymm (folds high lane first), xmm and mmx sizes.
%macro HADDD 2 ; sum junk
%if sizeof%1 == 32
%define %2 xmm%2
    vextracti128 %2, %1, 1      ; fold ymm high 128 bits into the low half
%define %1 xmm%1
    paddd   %1, %2
%endif
%if mmsize >= 16
%if cpuflag(xop) && sizeof%1 == 16
    vphadddq %1, %1             ; XOP: pairwise dword->qword horizontal add
%endif
    movhlps %2, %1
    paddd   %1, %2
%endif
%if notcpuflag(xop)
    PSHUFLW %2, %1, q0032
    paddd   %1, %2
%endif
%undef %1
%undef %2
%endmacro
|
||||
|
||||
; Horizontal add of all signed words in %1 (dword result in the low lane).
%macro HADDW 2 ; reg, tmp
%if cpuflag(xop) && sizeof%1 == 16
    vphaddwq %1, %1
    movhlps  %2, %1
    paddd    %1, %2
%else
    pmaddwd  %1, [pw_1]         ; pairwise word->dword sums via multiply-add
    HADDD    %1, %2
%endif
%endmacro
|
||||
|
||||
; Pairwise-add adjacent unsigned words into dwords.
%macro HADDUWD 2
%if cpuflag(xop) && sizeof%1 == 16
    vphadduwd %1, %1
%else
    psrld %2, %1, 16            ; odd words
    pslld %1, 16
    psrld %1, 16                ; even words, zero-extended
    paddd %1, %2
%endif
%endmacro
|
||||
|
||||
; Horizontal add of all unsigned words in %1.
%macro HADDUW 2
%if cpuflag(xop) && sizeof%1 == 16
    vphadduwq %1, %1
    movhlps   %2, %1
    paddd     %1, %2
%else
    HADDUWD   %1, %2            ; words -> dwords, then sum the dwords
    HADDD     %1, %2
%endif
%endmacro
|
||||
|
||||
; Byte-wise concatenate-and-shift of two registers (palignr, emulated with
; shifts+or on pre-SSSE3, and with an extra vperm2i128 on 256-bit regs).
%macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp
; AVX2 version uses a precalculated extra input that
; can be re-used across calls
%if sizeof%1==32
    ; %3 = abcdefgh ijklmnop (lower address)
    ; %2 = ABCDEFGH IJKLMNOP (higher address)
    vperm2i128 %4, %1, %2, q0003 ; %4 = ijklmnop ABCDEFGH
%if %3 < 16
    palignr %1, %4, %2, %3 ; %1 = bcdefghi jklmnopA
%else
    palignr %1, %2, %4, %3-16 ; %1 = pABCDEFG HIJKLMNO
%endif
%elif cpuflag(ssse3)
%if %0==5
    palignr %1, %2, %3, %4
%else
    palignr %1, %2, %3
%endif
%else
%define %%dst %1
%if %0==5
%ifnidn %1, %2
    mova %%dst, %2
%endif
    %rotate 1                   ; drop dst so %1..%4 line up with the 4-arg form
%endif
%ifnidn %4, %2
    mova %4, %2
%endif
%if mmsize==8
    psllq  %%dst, (8-%3)*8
    psrlq  %4, %3*8
%else
    pslldq %%dst, 16-%3
    psrldq %4, %3
%endif
    por %%dst, %4               ; combine the two shifted halves
%endif
%endmacro
|
||||
|
||||
; pshuflw that degrades to pshufw on MMX-sized registers.
%macro PSHUFLW 1+
%if mmsize == 8
    pshufw  %1
%else
    pshuflw %1
%endif
%endmacro
|
||||
|
||||
; shift a mmxreg by n bytes, or a xmmreg by 2*n bytes
; values shifted in are undefined
; faster if dst==src
%define PSLLPIX PSXLPIX l, -1, ;dst, src, shift
%define PSRLPIX PSXLPIX r, 1, ;dst, src, shift
; Common implementation: %1 = l/r, %2 = shuffle direction sign,
; %3 = dst, %4 = src, %5 = shift amount in pixels.
%macro PSXLPIX 5
%if mmsize == 8
%if %5&1
    ps%1lq %3, %4, %5*8         ; odd shifts must use a real bit shift
%else
    pshufw %3, %4, (q3210<<8>>(8+%2*%5))&0xff  ; even shifts as a word shuffle
%endif
%else
    ps%1ldq %3, %4, %5*2
%endif
%endmacro
|
||||
|
||||
; De-interleave bytes of %2/%4 into even-byte (masked) and odd-byte
; (shifted) word lanes, using a 00ff-style mask from %5.
%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
%ifnum %5
    pand  m%3, m%5, m%4 ; src .. y6 .. y4
    pand  m%1, m%5, m%2 ; dst .. y6 .. y4
%else
    mova  m%1, %5
    pand  m%3, m%1, m%4 ; src .. y6 .. y4
    pand  m%1, m%1, m%2 ; dst .. y6 .. y4
%endif
    psrlw m%2, 8 ; dst .. y7 .. y5
    psrlw m%4, 8 ; src .. y7 .. y5
%endmacro
|
||||
|
||||
; Butterfly of element size %1: m%2 = a+b, m%3 = b-a.
; The optional %4 is a temp register (enables a shorter sequence / AVX form).
%macro SUMSUB_BA 3-4
%if %0==3
    padd%1 m%2, m%3             ; a+b
    padd%1 m%3, m%3             ; 2b
    psub%1 m%3, m%2             ; 2b-(a+b) = b-a
%elif avx_enabled
    padd%1 m%4, m%2, m%3
    psub%1 m%3, m%2
    SWAP    %2, %4
%else
    mova   m%4, m%2
    padd%1 m%2, m%3
    psub%1 m%3, m%4
%endif
%endmacro
|
||||
|
||||
; Two butterflies: (%2,%3) and (%4,%5), element size %1, optional tmp %6.
%macro SUMSUB_BADC 5-6
%if %0==6
    SUMSUB_BA %1, %2, %3, %6
    SUMSUB_BA %1, %4, %5, %6
%else
    padd%1 m%2, m%3
    padd%1 m%4, m%5
    padd%1 m%3, m%3
    padd%1 m%5, m%5
    psub%1 m%3, m%2
    psub%1 m%5, m%4
%endif
%endmacro
|
||||
|
||||
; 4-point vertical Hadamard transform on word lanes (two butterfly stages).
%macro HADAMARD4_V 4+
    SUMSUB_BADC w, %1, %2, %3, %4
    SUMSUB_BADC w, %1, %3, %2, %4
%endmacro
|
||||
|
||||
; 8-point vertical Hadamard transform on word lanes (three butterfly stages).
%macro HADAMARD8_V 8+
    SUMSUB_BADC w, %1, %2, %3, %4
    SUMSUB_BADC w, %5, %6, %7, %8
    SUMSUB_BADC w, %1, %3, %2, %4
    SUMSUB_BADC w, %5, %7, %6, %8
    SUMSUB_BADC w, %1, %5, %2, %6
    SUMSUB_BADC w, %3, %7, %4, %8
%endmacro
|
||||
|
||||
; In-register 2x2 transpose of d- or q-sized elements (SSE2 mask+shift form).
%macro TRANS_SSE2 5-6
; TRANSPOSE2x2
; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq
; %2: ord/unord (for compat with sse4, unused)
; %3/%4: source regs
; %5/%6: tmp regs
%ifidn %1, d
%define mask [mask_10]
%define shift 16
%elifidn %1, q
%define mask [mask_1100]
%define shift 32
%endif
%if %0==6 ; less dependency if we have two tmp
    mova   m%5, mask   ; ff00
    mova   m%6, m%4    ; x5x4
    psll%1 m%4, shift  ; x4..
    pand   m%6, m%5    ; x5..
    pandn  m%5, m%3    ; ..x0
    psrl%1 m%3, shift  ; ..x1
    por    m%4, m%5    ; x4x0
    por    m%3, m%6    ; x5x1
%else ; more dependency, one insn less. sometimes faster, sometimes not
    mova   m%5, m%4    ; x5x4
    psll%1 m%4, shift  ; x4..
    pxor   m%4, m%3    ; (x4^x1)x0
    pand   m%4, mask   ; (x4^x1)..
    pxor   m%3, m%4    ; x4x0
    psrl%1 m%4, shift  ; ..(x1^x4)
    pxor   m%5, m%4    ; x5x1
    SWAP   %4, %3, %5
%endif
%endmacro
|
||||
|
||||
; Same 2x2 transpose as TRANS_SSE2, using SSE4.1 pblendw / shufps.
%macro TRANS_SSE4 5-6 ; see above
%ifidn %1, d
%ifidn %2, ord
    psrl%1  m%5, m%3, 16
    pblendw m%5, m%4, q2222
    psll%1  m%4, 16
    pblendw m%4, m%3, q1111
    SWAP     %3, %5
%else
%if avx_enabled
    pblendw m%5, m%3, m%4, q2222
    SWAP     %3, %5
%else
    mova    m%5, m%3
    pblendw m%3, m%4, q2222
%endif
    psll%1  m%4, 16
    psrl%1  m%5, 16
    por     m%4, m%5            ; unordered variant: blend one half, or the other
%endif
%elifidn %1, q
    shufps  m%5, m%3, m%4, q3131
    shufps  m%3, m%3, m%4, q2020
    SWAP     %4, %5
%endif
%endmacro
|
||||
|
||||
; Same 2x2 transpose as TRANS_SSE2, using AMD XOP vpperm with precomputed
; shuffle tables for the dword case.
%macro TRANS_XOP 5-6
%ifidn %1, d
    vpperm m%5, m%3, m%4, [transd_shuf1]
    vpperm m%3, m%3, m%4, [transd_shuf2]
%elifidn %1, q
    shufps m%5, m%3, m%4, q3131
    shufps m%3, m%4, q2020
%endif
    SWAP %4, %5
%endmacro
|
||||
|
||||
; One Hadamard butterfly stage, optionally preceded by a lane rearrangement
; for horizontal passes.
%macro HADAMARD 5-6
; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes)
; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes)
; %3/%4: regs
; %5(%6): tmpregs
%if %1!=0 ; have to reorder stuff for horizontal op
%ifidn %2, sumsub
%define ORDER ord
; sumsub needs order because a-b != b-a unless a=b
%else
%define ORDER unord
; if we just max, order doesn't matter (allows pblendw+or in sse4)
%endif
%if %1==1
    TRANS d, ORDER, %3, %4, %5, %6
%elif %1==2
%if mmsize==8
    SBUTTERFLY dq, %3, %4, %5
%else
    TRANS q, ORDER, %3, %4, %5, %6
%endif
%elif %1==4
    SBUTTERFLY qdq, %3, %4, %5
%elif %1==8
    SBUTTERFLY dqqq, %3, %4, %5
%endif
%endif
%ifidn %2, sumsub
    SUMSUB_BA w, %3, %4, %5
%else
%ifidn %2, amax                 ; amax takes |a|,|b| before the max
%if %0==6
    ABSW2 m%3, m%4, m%3, m%4, m%5, m%6
%else
    ABSW m%3, m%3, m%5
    ABSW m%4, m%4, m%5
%endif
%endif
    pmaxsw m%3, m%4
%endif
%endmacro
|
||||
|
||||
|
||||
; Two 2-point Hadamard passes with an interleave (%6 = SBUTTERFLY width)
; between them, on two register pairs.  %7 selects the second-pass op
; (default sumsub); a numeric %7 selects amax with %7 as extra tmp.
%macro HADAMARD2_2D 6-7 sumsub
    HADAMARD 0, sumsub, %1, %2, %5
    HADAMARD 0, sumsub, %3, %4, %5
    SBUTTERFLY %6, %1, %2, %5
%ifnum %7
    HADAMARD 0, amax, %1, %2, %5, %7
%else
    HADAMARD 0, %7, %1, %2, %5
%endif
    SBUTTERFLY %6, %3, %4, %5
%ifnum %7
    HADAMARD 0, amax, %3, %4, %5, %7
%else
    HADAMARD 0, %7, %3, %4, %5
%endif
%endmacro
|
||||
|
||||
; Full 4x4 two-dimensional Hadamard transform over 4 word registers.
%macro HADAMARD4_2D 5-6 sumsub
    HADAMARD2_2D %1, %2, %3, %4, %5, wd
    HADAMARD2_2D %1, %3, %2, %4, %5, dq, %6
    SWAP %2, %3                 ; restore row order after the dq interleave
%endmacro
|
||||
|
||||
; 4x4 2D Hadamard, SSE register-layout variant (explicit SBUTTERFLY steps).
%macro HADAMARD4_2D_SSE 5-6 sumsub
    HADAMARD   0, sumsub, %1, %2, %5 ; 1st V row 0 + 1
    HADAMARD   0, sumsub, %3, %4, %5 ; 1st V row 2 + 3
    SBUTTERFLY wd, %1, %2, %5        ; %1: m0 1+0 %2: m1 1+0
    SBUTTERFLY wd, %3, %4, %5        ; %3: m0 3+2 %4: m1 3+2
    HADAMARD2_2D %1, %3, %2, %4, %5, dq
    SBUTTERFLY qdq, %1, %2, %5
    HADAMARD   0, %6, %1, %2, %5     ; 2nd H m1/m0 row 0+1
    SBUTTERFLY qdq, %3, %4, %5
    HADAMARD   0, %6, %3, %4, %5     ; 2nd H m1/m0 row 2+3
%endmacro
|
||||
|
||||
; Full 8x8 two-dimensional Hadamard transform over 8 word registers.
; With %10 = amax the final row-order-restoring SWAPs are skipped (the
; caller only reduces the result, so order doesn't matter).
%macro HADAMARD8_2D 9-10 sumsub
    HADAMARD2_2D %1, %2, %3, %4, %9, wd
    HADAMARD2_2D %5, %6, %7, %8, %9, wd
    HADAMARD2_2D %1, %3, %2, %4, %9, dq
    HADAMARD2_2D %5, %7, %6, %8, %9, dq
    HADAMARD2_2D %1, %5, %3, %7, %9, qdq, %10
    HADAMARD2_2D %2, %6, %4, %8, %9, qdq, %10
%ifnidn %10, amax
    SWAP %2, %5
    SWAP %4, %7
%endif
%endmacro
|
||||
|
||||
; doesn't include the "pmaddubsw hmul_8p" pass
; 8x8 Hadamard for hmul-format input: vertical passes, then horizontal
; passes ending in amax reductions.
%macro HADAMARD8_2D_HMUL 10
    HADAMARD4_V %1, %2, %3, %4, %9
    HADAMARD4_V %5, %6, %7, %8, %9
    SUMSUB_BADC w, %1, %5, %2, %6, %9
    HADAMARD 2, sumsub, %1, %5, %9, %10
    HADAMARD 2, sumsub, %2, %6, %9, %10
    SUMSUB_BADC w, %3, %7, %4, %8, %9
    HADAMARD 2, sumsub, %3, %7, %9, %10
    HADAMARD 2, sumsub, %4, %8, %9, %10
    HADAMARD 1, amax, %1, %5, %9, %10
    HADAMARD 1, amax, %2, %6, %9, %5
    HADAMARD 1, amax, %3, %7, %9, %5
    HADAMARD 1, amax, %4, %8, %9, %5
%endmacro
|
||||
|
||||
; Weighted butterfly (element size %1): %2 = 2*a + b, %4 = a - 2*b,
; where a/b are the original %2/%3.
%macro SUMSUB2_AB 4
%if cpuflag(xop)
    pmacs%1%1 m%4, m%3, [p%1_m2], m%2   ; m%4 = b*(-2) + a
    pmacs%1%1 m%2, m%2, [p%1_2], m%3    ; m%2 = a*2 + b
%elifnum %3
    psub%1 m%4, m%2, m%3
    psub%1 m%4, m%3
    padd%1 m%2, m%2
    padd%1 m%2, m%3
%else
    mova   m%4, m%2             ; %3 is a memory operand here
    padd%1 m%2, m%2
    padd%1 m%2, %3
    psub%1 m%4, %3
    psub%1 m%4, %3
%endif
%endmacro
|
||||
|
||||
; Butterfly with halved operands (see per-line result comments):
; produces %3>>1 - %5 and %3 + (%5>>1) style combinations for the IDCT.
%macro SUMSUBD2_AB 5
%ifnum %4
    psra%1 m%5, m%2, 1 ; %3: %3>>1
    psra%1 m%4, m%3, 1 ; %2: %2>>1
    padd%1 m%4, m%2    ; %3: %3>>1+%2
    psub%1 m%5, m%3    ; %2: %2>>1-%3
    SWAP    %2, %5
    SWAP    %3, %4
%else
    mova   %5, m%2     ; %4/%5 are memory spill slots here
    mova   %4, m%3
    psra%1 m%3, 1  ; %3: %3>>1
    psra%1 m%2, 1  ; %2: %2>>1
    padd%1 m%3, %5 ; %3: %3>>1+%2
    psub%1 m%2, %4 ; %2: %2>>1-%3
%endif
%endmacro
|
||||
|
||||
; One 4-point DCT pass over four registers of packed words.
; %5 is either a spare register (numeric) or a memory spill address.
%macro DCT4_1D 5
%ifnum %5
    SUMSUB_BADC w, %4, %1, %3, %2, %5
    SUMSUB_BA   w, %3, %4, %5
    SUMSUB2_AB  w, %1, %2, %5
    SWAP %1, %3, %4, %5, %2
%else
    SUMSUB_BADC w, %4, %1, %3, %2
    SUMSUB_BA   w, %3, %4
    mova [%5], m%2              ; spill when no spare register is available
    SUMSUB2_AB  w, %1, [%5], %2
    SWAP %1, %3, %4, %2
%endif
%endmacro
|
||||
|
||||
; One 4-point IDCT pass; %1 = element size (w/d), %6 = tmp reg (numeric)
; or memory base for spills, %7 = extra tmp in the register variant.
%macro IDCT4_1D 6-7
%ifnum %6
    SUMSUBD2_AB %1, %3, %5, %7, %6
    ; %3: %3>>1-%5 %5: %3+%5>>1
    SUMSUB_BA   %1, %4, %2, %7
    ; %4: %2+%4 %2: %2-%4
    SUMSUB_BADC %1, %5, %4, %3, %2, %7
    ; %5: %2+%4 + (%3+%5>>1)
    ; %4: %2+%4 - (%3+%5>>1)
    ; %3: %2-%4 + (%3>>1-%5)
    ; %2: %2-%4 - (%3>>1-%5)
%else
%ifidn %1, w
    SUMSUBD2_AB %1, %3, %5, [%6], [%6+16]
%else
    SUMSUBD2_AB %1, %3, %5, [%6], [%6+32]
%endif
    SUMSUB_BA   %1, %4, %2
    SUMSUB_BADC %1, %5, %4, %3, %2
%endif
    SWAP %2, %5, %4
    ; %2: %2+%4 + (%3+%5>>1) row0
    ; %3: %2-%4 + (%3>>1-%5) row1
    ; %4: %2-%4 - (%3>>1-%5) row2
    ; %5: %2+%4 - (%3+%5>>1) row3
%endmacro
|
||||
|
||||
|
||||
; %1 = pixel row at %4 minus pixel row at %5, as packed words.
; %2 = scratch, %3 = zero register or 'none' (8-bit path only),
; %6 = "%5 is aligned" flag, default 1 (high-bit-depth path only).
%macro LOAD_DIFF 5-6 1
%if HIGH_BIT_DEPTH
%if %6 ; %5 aligned?
    mova  %1, %4
    psubw %1, %5
%else
    movu  %1, %4
    movu  %2, %5
    psubw %1, %2
%endif
%else ; !HIGH_BIT_DEPTH
%ifidn %3, none
    movh      %1, %4
    movh      %2, %5
    punpcklbw %1, %2            ; interleave to widen without a zero reg...
    punpcklbw %2, %2
    psubw     %1, %2            ; ...the duplicated-src bias cancels in the sub
%else
    movh      %1, %4
    punpcklbw %1, %3            ; zero-extend bytes to words
    movh      %2, %5
    punpcklbw %2, %3
    psubw     %1, %2
%endif
%endif ; HIGH_BIT_DEPTH
%endmacro
|
||||
|
||||
; Load four rows (indices %1..%4) of fenc/fdec data.  On 8-bit SSSE3 the
; rows are byte-interleaved and combined with the %6 multiplier through
; pmaddubsw; otherwise plain word differences via LOAD_DIFF.
%macro LOAD_DIFF8x4 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr
%if BIT_DEPTH == 8 && cpuflag(ssse3)
    movh      m%2, [%8+%1*FDEC_STRIDE]
    movh      m%1, [%7+%1*FENC_STRIDE]
    punpcklbw m%1, m%2
    movh      m%3, [%8+%2*FDEC_STRIDE]
    movh      m%2, [%7+%2*FENC_STRIDE]
    punpcklbw m%2, m%3
    movh      m%4, [%8+%3*FDEC_STRIDE]
    movh      m%3, [%7+%3*FENC_STRIDE]
    punpcklbw m%3, m%4
    movh      m%5, [%8+%4*FDEC_STRIDE]
    movh      m%4, [%7+%4*FENC_STRIDE]
    punpcklbw m%4, m%5
    pmaddubsw m%1, m%6
    pmaddubsw m%2, m%6
    pmaddubsw m%3, m%6
    pmaddubsw m%4, m%6
%else
    LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDEB], [%8+%1*FDEC_STRIDEB]
    LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDEB], [%8+%2*FDEC_STRIDEB]
    LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDEB], [%8+%3*FDEC_STRIDEB]
    LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDEB], [%8+%4*FDEC_STRIDEB]
%endif
%endmacro
|
||||
|
||||
; Store four coefficient registers to [%5+%6] as 64 contiguous bytes
; (low qwords first, then high qwords).
%macro STORE_DCT 6
    movq   [%5+%6+ 0], m%1
    movq   [%5+%6+ 8], m%2
    movq   [%5+%6+16], m%3
    movq   [%5+%6+24], m%4
    movhps [%5+%6+32], m%1
    movhps [%5+%6+40], m%2
    movhps [%5+%6+48], m%3
    movhps [%5+%6+56], m%4
%endmacro
|
||||
|
||||
; Store four xmm registers as eight 8-byte rows around r0, one
; FDEC_STRIDE apart (high half of each register goes to the earlier row).
%macro STORE_IDCT 4
    movhps [r0-4*FDEC_STRIDE], %1
    movh   [r0-3*FDEC_STRIDE], %1
    movhps [r0-2*FDEC_STRIDE], %2
    movh   [r0-1*FDEC_STRIDE], %2
    movhps [r0+0*FDEC_STRIDE], %3
    movh   [r0+1*FDEC_STRIDE], %3
    movhps [r0+2*FDEC_STRIDE], %4
    movh   [r0+3*FDEC_STRIDE], %4
%endmacro
|
||||
|
||||
; Load an 8x4 block of pixel differences using stride registers r1/r3
; (and r4/r5 for the fourth row — presumably 3*stride; confirm at call sites).
%macro LOAD_DIFF_8x4P 7-11 r0,r2,0,1 ; 4x dest, 2x temp, 2x pointer, increment, aligned?
    LOAD_DIFF m%1, m%5, m%7, [%8],      [%9],      %11
    LOAD_DIFF m%2, m%6, m%7, [%8+r1],   [%9+r3],   %11
    LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3], %11
    LOAD_DIFF m%4, m%6, m%7, [%8+r4],   [%9+r5],   %11
%if %10
    lea %8, [%8+4*r1]           ; advance pointers to the next 4-row block
    lea %9, [%9+4*r3]
%endif
%endmacro
|
||||
|
||||
; 2xdst, 2xtmp, 2xsrcrow
; AVX2: two 16-pixel rows of fenc-minus-fdec differences, zero-extended
; to words with pmovzxbw.
%macro LOAD_DIFF16x2_AVX2 6
    pmovzxbw m%1, [r1+%5*FENC_STRIDE]
    pmovzxbw m%2, [r1+%6*FENC_STRIDE]
    pmovzxbw m%3, [r2+(%5-4)*FDEC_STRIDE]
    pmovzxbw m%4, [r2+(%6-4)*FDEC_STRIDE]
    psubw    m%1, m%3
    psubw    m%2, m%4
%endmacro
|
||||
|
||||
; Add two residual rows (%1/%2, descaled by >>6) to predicted pixel rows
; (%5/%6, widened with zero reg %4) and pack the results into %2.
%macro DIFFx2 6-7
    movh      %3, %5
    punpcklbw %3, %4
    psraw     %1, 6             ; descale residual
    paddsw    %1, %3
    movh      %3, %6
    punpcklbw %3, %4
    psraw     %2, 6
    paddsw    %2, %3
    packuswb  %2, %1            ; saturate back to bytes
%endmacro
|
||||
|
||||
; (high depth) in: %1, %2, min to clip, max to clip, mem128
; in: %1, tmp, %3, mem64
; Add a descaled residual (>>6) to the reconstruction and store it back.
%macro STORE_DIFF 4-5
%if HIGH_BIT_DEPTH
    psrad     %1, 6
    psrad     %2, 6
    packssdw  %1, %2
    paddw     %1, %5            ; add the existing pixels
    CLIPW     %1, %3, %4        ; clamp to the valid pixel range
    mova      %5, %1
%else
    movh      %2, %4
    punpcklbw %2, %3
    psraw     %1, 6
    paddsw    %1, %2
    packuswb  %1, %1            ; saturating pack clamps to [0,255]
    movh      %4, %1
%endif
%endmacro
|
||||
|
||||
; Emit a 16-byte pshufb control vector selecting 8 words by index;
; an index with the high bit set (>= 0x80) emits zeroing selectors.
%macro SHUFFLE_MASK_W 8
%rep 8
%if %1>=0x80
    db %1, %1
%else
    db %1*2                     ; low byte of the selected word
    db %1*2+1                   ; high byte
%endif
%rotate 1
%endrep
%endmacro
|
||||
|
||||
; instruction, accum, input, iteration (zero to swap, nonzero to add)
; On the first iteration just take the value; afterwards accumulate with %1.
%macro ACCUM 4
%if %4
    %1   m%2, m%3
%else
    SWAP  %2, %3
%endif
%endmacro
|
||||
|
||||
; IACA support
; Begin-marker recognized by the Intel Architecture Code Analyzer:
; mov ebx, 111 followed by the fs/addr32 nop byte sequence.
%macro IACA_START 0
    mov ebx, 111
    db 0x64, 0x67, 0x90
%endmacro
|
||||
|
||||
; End-marker for an IACA analysis region (mov ebx, 222 + marker bytes).
%macro IACA_END 0
    mov ebx, 222
    db 0x64, 0x67, 0x90
%endmacro
|
Loading…
Add table
Add a link
Reference in a new issue