libbpg-0.9.6

King_DuckZ 2015-10-27 11:46:00 +01:00
parent 3035b41edf
commit 35a8402710
248 changed files with 232891 additions and 100 deletions

@@ -0,0 +1,14 @@
The ASM source here is pulled directly from the x264 project, with four changes:
1 - FENC_STRIDE must be increased to 64 in x86util.asm because of HEVC's
larger CU sizes
2 - Because of #1, we must rebrand the functions with x265_ prefixes in
x86inc.asm (private_prefix) and pixel-a.asm (mangle(x265_pixel_ssd))
3 - We have modified the MMX SSD primitives to use EMMS before returning
4 - We have added some new SATD block sizes for SSE3
Current assembly is based on x264 revision:
configure: Support cygwin64
Diogo Franco (Kovensky) <diogomfranco@gmail.com>
2013-07-23 22:17:44 -0300
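
To illustrate change #2: with private_prefix switched to x265 in x86inc.asm, every cglobal symbol is emitted with the new prefix, so the C side links against names like the one below (a sketch; the pixel typedef and the exact partition sizes come from the x265 headers further down).

/* Hedged sketch of one rebranded symbol; the signature follows the
 * pixel_ssd prototypes declared in pixel.h below. */
uint32_t x265_pixel_ssd_16x16_sse2(const pixel* pix1, intptr_t stride1,
                                   const pixel* pix2, intptr_t stride2);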

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,63 @@
/*****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve@borho.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#ifndef X265_BLOCKCOPY8_H
#define X265_BLOCKCOPY8_H
FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy2Dto1D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy2Dto1D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(uint32_t, copy_cnt, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride);
FUNCDEF_TU_S(uint32_t, copy_cnt, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride);
FUNCDEF_TU_S(uint32_t, copy_cnt, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride);
FUNCDEF_TU(void, blockfill_s, sse2, int16_t* dst, intptr_t dstride, int16_t val);
FUNCDEF_TU(void, blockfill_s, avx2, int16_t* dst, intptr_t dstride, int16_t val);
FUNCDEF_CHROMA_PU(void, blockcopy_ss, sse2, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
FUNCDEF_CHROMA_PU(void, blockcopy_pp, sse2, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
FUNCDEF_CHROMA_PU(void, blockcopy_pp, avx, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_sp, sse2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_sp, sse4, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_sp, avx2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_ps, sse2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_ps, sse4, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_ps, avx2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
#endif // ifndef X265_BLOCKCOPY8_H
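
For illustration, assuming FUNCDEF_CHROMA_PU stamps out one prototype per partition size (its definition is not part of this hunk), the blockcopy_pp line expands for a 16x16 block to roughly:

/* Hypothetical expansion; the 16x16 instance is an assumed example. */
void x265_blockcopy_pp_16x16_sse2(pixel* dst, intptr_t dstStride,
                                  const pixel* src, intptr_t srcStride);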

@@ -0,0 +1,146 @@
;*****************************************************************************
;* const-a.asm: x86 global constants
;*****************************************************************************
;* Copyright (C) 2010-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;* Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
;* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************
%include "x86inc.asm"
SECTION_RODATA 32
;; 8-bit constants
const pb_0, times 16 db 0
const pb_1, times 32 db 1
const pb_2, times 32 db 2
const pb_3, times 16 db 3
const pb_4, times 32 db 4
const pb_8, times 32 db 8
const pb_15, times 32 db 15
const pb_16, times 32 db 16
const pb_32, times 32 db 32
const pb_64, times 32 db 64
const pb_128, times 32 db 128
const pb_a1, times 16 db 0xa1
const pb_01, times 8 db 0, 1
const hsub_mul, times 16 db 1, -1
const pw_swap, times 2 db 6, 7, 4, 5, 2, 3, 0, 1
const pb_unpackbd1, times 2 db 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3
const pb_unpackbd2, times 2 db 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7
const pb_unpackwq1, times 1 db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
const pb_unpackwq2, times 1 db 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7
const pb_shuf8x8c, times 1 db 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6
const pb_movemask, times 16 db 0x00
times 16 db 0xFF
const pb_movemask_32, times 32 db 0x00
times 32 db 0xFF
times 32 db 0x00
const pb_0000000000000F0F, times 2 db 0xff, 0x00
times 12 db 0x00
const pb_000000000000000F, db 0xff
times 15 db 0x00
;; 16-bit constants
const pw_1, times 16 dw 1
const pw_2, times 16 dw 2
const pw_3, times 16 dw 3
const pw_7, times 16 dw 7
const pw_m2, times 8 dw -2
const pw_4, times 8 dw 4
const pw_8, times 8 dw 8
const pw_16, times 16 dw 16
const pw_15, times 16 dw 15
const pw_31, times 16 dw 31
const pw_32, times 16 dw 32
const pw_64, times 8 dw 64
const pw_128, times 16 dw 128
const pw_256, times 16 dw 256
const pw_257, times 16 dw 257
const pw_512, times 16 dw 512
const pw_1023, times 16 dw 1023
const pw_1024, times 16 dw 1024
const pw_2048, times 16 dw 2048
const pw_4096, times 16 dw 4096
const pw_8192, times 8 dw 8192
const pw_00ff, times 16 dw 0x00ff
const pw_ff00, times 8 dw 0xff00
const pw_2000, times 16 dw 0x2000
const pw_8000, times 8 dw 0x8000
const pw_3fff, times 8 dw 0x3fff
const pw_32_0, times 4 dw 32,
times 4 dw 0
const pw_pixel_max, times 16 dw ((1 << BIT_DEPTH)-1)
const pw_0_15, times 2 dw 0, 1, 2, 3, 4, 5, 6, 7
const pw_ppppmmmm, times 1 dw 1, 1, 1, 1, -1, -1, -1, -1
const pw_ppmmppmm, times 1 dw 1, 1, -1, -1, 1, 1, -1, -1
const pw_pmpmpmpm, times 16 dw 1, -1, 1, -1, 1, -1, 1, -1
const pw_pmmpzzzz, times 1 dw 1, -1, -1, 1, 0, 0, 0, 0
const multi_2Row, times 1 dw 1, 2, 3, 4, 1, 2, 3, 4
const multiH, times 1 dw 9, 10, 11, 12, 13, 14, 15, 16
const multiH3, times 1 dw 25, 26, 27, 28, 29, 30, 31, 32
const multiL, times 1 dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
const multiH2, times 1 dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
const pw_planar16_mul, times 1 dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
const pw_planar32_mul, times 1 dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
const pw_FFFFFFFFFFFFFFF0, dw 0x00
times 7 dw 0xff
const hmul_16p, times 16 db 1
times 8 db 1, -1
;; 32-bit constants
const pd_1, times 8 dd 1
const pd_2, times 8 dd 2
const pd_4, times 4 dd 4
const pd_8, times 4 dd 8
const pd_16, times 8 dd 16
const pd_31, times 4 dd 31
const pd_32, times 8 dd 32
const pd_64, times 4 dd 64
const pd_128, times 4 dd 128
const pd_256, times 4 dd 256
const pd_512, times 4 dd 512
const pd_1024, times 4 dd 1024
const pd_2048, times 4 dd 2048
const pd_ffff, times 4 dd 0xffff
const pd_32767, times 4 dd 32767
const pd_524416, times 4 dd 524416
const pd_n32768, times 8 dd 0xffff8000
const pd_n131072, times 4 dd 0xfffe0000
const trans8_shuf, times 1 dd 0, 4, 1, 5, 2, 6, 3, 7
const popcnt_table
%assign x 0
%rep 256
; population count
db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
%assign x x+1
%endrep
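
A minimal C sketch of how a byte-indexed popcount table like the one generated above is typically consumed, one lookup per byte (the exported symbol name is assumed from private_prefix):

#include <stdint.h>

extern const uint8_t x265_popcnt_table[256]; /* name assumed */

/* Count set bits in a 32-bit value via four table lookups. */
static inline int popcount32(uint32_t x)
{
    return x265_popcnt_table[x & 0xff]
         + x265_popcnt_table[(x >>  8) & 0xff]
         + x265_popcnt_table[(x >> 16) & 0xff]
         + x265_popcnt_table[(x >> 24) & 0xff];
}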

@@ -0,0 +1,197 @@
;*****************************************************************************
;* cpu-a.asm: x86 cpu utilities
;*****************************************************************************
;* Copyright (C) 2003-2013 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;* Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************
%include "x86inc.asm"
SECTION .text
;-----------------------------------------------------------------------------
; void cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
cglobal cpu_cpuid, 5,7
push rbx
push r4
push r3
push r2
push r1
mov eax, r0d
xor ecx, ecx
cpuid
pop r4
mov [r4], eax
pop r4
mov [r4], ebx
pop r4
mov [r4], ecx
pop r4
mov [r4], edx
pop rbx
RET
;-----------------------------------------------------------------------------
; void cpu_xgetbv( int op, int *eax, int *edx )
;-----------------------------------------------------------------------------
cglobal cpu_xgetbv, 3,7
push r2
push r1
mov ecx, r0d
xgetbv
pop r4
mov [r4], eax
pop r4
mov [r4], edx
RET
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void stack_align( void (*func)(void*), void *arg );
;-----------------------------------------------------------------------------
cglobal stack_align
push rbp
mov rbp, rsp
%if WIN64
sub rsp, 32 ; shadow space
%endif
and rsp, ~31
mov rax, r0
mov r0, r1
mov r1, r2
mov r2, r3
call rax
leave
ret
%else
;-----------------------------------------------------------------------------
; int cpu_cpuid_test( void )
; return 0 if unsupported
;-----------------------------------------------------------------------------
cglobal cpu_cpuid_test
pushfd
push ebx
push ebp
push esi
push edi
pushfd
pop eax
mov ebx, eax
xor eax, 0x200000
push eax
popfd
pushfd
pop eax
xor eax, ebx
pop edi
pop esi
pop ebp
pop ebx
popfd
ret
cglobal stack_align
push ebp
mov ebp, esp
sub esp, 12
and esp, ~31
mov ecx, [ebp+8]
mov edx, [ebp+12]
mov [esp], edx
mov edx, [ebp+16]
mov [esp+4], edx
mov edx, [ebp+20]
mov [esp+8], edx
call ecx
leave
ret
%endif
;-----------------------------------------------------------------------------
; void cpu_emms( void )
;-----------------------------------------------------------------------------
cglobal cpu_emms
emms
ret
;-----------------------------------------------------------------------------
; void cpu_sfence( void )
;-----------------------------------------------------------------------------
cglobal cpu_sfence
sfence
ret
cextern intel_cpu_indicator_init
;-----------------------------------------------------------------------------
; void safe_intel_cpu_indicator_init( void );
;-----------------------------------------------------------------------------
cglobal safe_intel_cpu_indicator_init
push r0
push r1
push r2
push r3
push r4
push r5
push r6
%if ARCH_X86_64
push r7
push r8
push r9
push r10
push r11
push r12
push r13
push r14
%endif
push rbp
mov rbp, rsp
%if WIN64
sub rsp, 32 ; shadow space
%endif
and rsp, ~31
call intel_cpu_indicator_init
leave
%if ARCH_X86_64
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop r8
pop r7
%endif
pop r6
pop r5
pop r4
pop r3
pop r2
pop r1
pop r0
ret
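
A hedged sketch of the C side of cpu_cpuid above, following the signature in its comment block (the x265_ prefix and the exact integer types are assumptions):

#include <stdint.h>

extern void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx,
                           uint32_t *ecx, uint32_t *edx);

/* Query leaf 1 and test the SSE4.2 feature bit. */
static int has_sse42(void)
{
    uint32_t eax, ebx, ecx, edx;
    x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
    return (ecx >> 20) & 1; /* CPUID.1:ECX bit 20 = SSE4.2 */
}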

File diff suppressed because it is too large

@@ -0,0 +1,45 @@
/*****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Nabajit Deka <nabajit@multicorewareinc.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#ifndef X265_DCT8_H
#define X265_DCT8_H
FUNCDEF_TU_S2(void, dct, sse2, const int16_t* src, int16_t* dst, intptr_t srcStride);
FUNCDEF_TU_S2(void, dct, ssse3, const int16_t* src, int16_t* dst, intptr_t srcStride);
FUNCDEF_TU_S2(void, dct, sse4, const int16_t* src, int16_t* dst, intptr_t srcStride);
FUNCDEF_TU_S2(void, dct, avx2, const int16_t* src, int16_t* dst, intptr_t srcStride);
FUNCDEF_TU_S2(void, idct, sse2, const int16_t* src, int16_t* dst, intptr_t dstStride);
FUNCDEF_TU_S2(void, idct, ssse3, const int16_t* src, int16_t* dst, intptr_t dstStride);
FUNCDEF_TU_S2(void, idct, sse4, const int16_t* src, int16_t* dst, intptr_t dstStride);
FUNCDEF_TU_S2(void, idct, avx2, const int16_t* src, int16_t* dst, intptr_t dstStride);
void PFX(dst4_ssse3)(const int16_t* src, int16_t* dst, intptr_t srcStride);
void PFX(dst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
void PFX(idst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
void PFX(dst4_avx2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
void PFX(idst4_avx2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
void PFX(denoise_dct_sse4)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
void PFX(denoise_dct_avx2)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
#endif // ifndef X265_DCT8_H
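
A usage sketch, assuming FUNCDEF_TU_S2 instantiates one prototype per transform size so that x265_dct4_sse2 and x265_idct4_sse2 exist:

#include <stdint.h>

extern void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
extern void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);

/* Forward then inverse 4x4 transform; strides are in int16_t units. */
static void round_trip_4x4(const int16_t residual[16], int16_t coeff[16],
                           int16_t recon[16])
{
    x265_dct4_sse2(residual, coeff, 4);
    x265_idct4_sse2(coeff, recon, 4);
}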

@@ -0,0 +1,93 @@
/*****************************************************************************
* intrapred.h: Intra Prediction metrics
*****************************************************************************
* Copyright (C) 2003-2013 x264 project
*
* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#ifndef X265_INTRAPRED_H
#define X265_INTRAPRED_H
#define DECL_ANG(bsize, mode, cpu) \
void PFX(intra_pred_ang ## bsize ## _ ## mode ## _ ## cpu)(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
#define DECL_ANGS(bsize, cpu) \
DECL_ANG(bsize, 2, cpu); \
DECL_ANG(bsize, 3, cpu); \
DECL_ANG(bsize, 4, cpu); \
DECL_ANG(bsize, 5, cpu); \
DECL_ANG(bsize, 6, cpu); \
DECL_ANG(bsize, 7, cpu); \
DECL_ANG(bsize, 8, cpu); \
DECL_ANG(bsize, 9, cpu); \
DECL_ANG(bsize, 10, cpu); \
DECL_ANG(bsize, 11, cpu); \
DECL_ANG(bsize, 12, cpu); \
DECL_ANG(bsize, 13, cpu); \
DECL_ANG(bsize, 14, cpu); \
DECL_ANG(bsize, 15, cpu); \
DECL_ANG(bsize, 16, cpu); \
DECL_ANG(bsize, 17, cpu); \
DECL_ANG(bsize, 18, cpu); \
DECL_ANG(bsize, 19, cpu); \
DECL_ANG(bsize, 20, cpu); \
DECL_ANG(bsize, 21, cpu); \
DECL_ANG(bsize, 22, cpu); \
DECL_ANG(bsize, 23, cpu); \
DECL_ANG(bsize, 24, cpu); \
DECL_ANG(bsize, 25, cpu); \
DECL_ANG(bsize, 26, cpu); \
DECL_ANG(bsize, 27, cpu); \
DECL_ANG(bsize, 28, cpu); \
DECL_ANG(bsize, 29, cpu); \
DECL_ANG(bsize, 30, cpu); \
DECL_ANG(bsize, 31, cpu); \
DECL_ANG(bsize, 32, cpu); \
DECL_ANG(bsize, 33, cpu); \
DECL_ANG(bsize, 34, cpu)
#define DECL_ALL(cpu) \
FUNCDEF_TU(void, all_angs_pred, cpu, pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); \
FUNCDEF_TU(void, intra_filter, cpu, const pixel *samples, pixel *filtered); \
DECL_ANGS(4, cpu); \
DECL_ANGS(8, cpu); \
DECL_ANGS(16, cpu); \
DECL_ANGS(32, cpu)
FUNCDEF_TU_S2(void, intra_pred_dc, sse2, pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
FUNCDEF_TU_S2(void, intra_pred_dc, sse4, pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
FUNCDEF_TU_S2(void, intra_pred_dc, avx2, pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
FUNCDEF_TU_S2(void, intra_pred_planar, sse2, pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
FUNCDEF_TU_S2(void, intra_pred_planar, sse4, pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
FUNCDEF_TU_S2(void, intra_pred_planar, avx2, pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
DECL_ALL(sse2);
DECL_ALL(ssse3);
DECL_ALL(sse4);
DECL_ALL(avx2);
#undef DECL_ALL
#undef DECL_ANGS
#undef DECL_ANG
#endif // ifndef X265_INTRAPRED_H
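
Each DECL_ANG(bsize, mode, cpu) line declares one angular predictor; since PFX pastes the x265_ prefix, DECL_ANG(8, 3, sse4), for example, expands to:

void x265_intra_pred_ang8_3_sse4(pixel* dst, intptr_t dstStride,
                                 const pixel* srcPix, int dirMode, int bFilter);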

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,49 @@
/*****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve@borho.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#ifndef X265_IPFILTER8_H
#define X265_IPFILTER8_H
#define SETUP_FUNC_DEF(cpu) \
FUNCDEF_PU(void, interp_8tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_PU(void, interp_8tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
FUNCDEF_PU(void, interp_8tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_PU(void, interp_8tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_PU(void, interp_8tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); \
FUNCDEF_CHROMA_PU(void, filterPixelToShort, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
FUNCDEF_CHROMA_PU(void, interp_4tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_CHROMA_PU(void, interp_4tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
SETUP_FUNC_DEF(sse2);
SETUP_FUNC_DEF(ssse3);
SETUP_FUNC_DEF(sse3);
SETUP_FUNC_DEF(sse4);
SETUP_FUNC_DEF(avx2);
#endif // ifndef X265_IPFILTER8_H
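
Assuming FUNCDEF_PU instantiates one prototype per luma partition size (its definition is not shown here), SETUP_FUNC_DEF(sse2) declares functions along the lines of:

/* Hypothetical 16x16 instance of the 8-tap horizontal luma filter. */
void x265_interp_8tap_horiz_pp_16x16_sse2(const pixel* src, intptr_t srcStride,
                                          pixel* dst, intptr_t dstStride,
                                          int coeffIdx);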

File diff suppressed because it is too large

@@ -0,0 +1,48 @@
/*****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#ifndef X265_LOOPFILTER_H
#define X265_LOOPFILTER_H
#define DECL_SAO(cpu) \
void PFX(saoCuOrgE0_ ## cpu)(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride); \
void PFX(saoCuOrgE1_ ## cpu)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width); \
void PFX(saoCuOrgE1_2Rows_ ## cpu)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width); \
void PFX(saoCuOrgE2_ ## cpu)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride); \
void PFX(saoCuOrgE2_32_ ## cpu)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride); \
void PFX(saoCuOrgE3_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
DECL_SAO(sse4);
DECL_SAO(avx2);
#endif // ifndef X265_LOOPFILTER_H
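
Since PFX pastes the x265_ prefix onto each name, DECL_SAO(sse4) expands the first line of the macro to:

void x265_saoCuOrgE0_sse4(pixel* rec, int8_t* offsetEo, int endX,
                          int8_t* signLeft, intptr_t stride);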

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,39 @@
/*****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve@borho.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#ifndef X265_MC_H
#define X265_MC_H
#define LOWRES(cpu) \
void PFX(frame_init_lowres_core_ ## cpu)(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc, \
intptr_t src_stride, intptr_t dst_stride, int width, int height);
LOWRES(mmx2)
LOWRES(sse2)
LOWRES(ssse3)
LOWRES(avx)
LOWRES(avx2)
LOWRES(xop)
#undef LOWRES
#endif // ifndef X265_MC_H
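
With PFX supplying the x265_ prefix, LOWRES(sse2) declares:

void x265_frame_init_lowres_core_sse2(const pixel* src0, pixel* dst0, pixel* dsth,
                                      pixel* dstv, pixel* dstc, intptr_t src_stride,
                                      intptr_t dst_stride, int width, int height);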

@@ -0,0 +1,420 @@
;*****************************************************************************
;* pixel-32.asm: x86_32 pixel metrics
;*****************************************************************************
;* Copyright (C) 2003-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Laurent Aimar <fenrir@via.ecp.fr>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"
cextern pw_ppmmppmm
cextern pw_pmpmpmpm
SECTION .text
INIT_MMX mmx2
%macro LOAD_DIFF_4x8P 1 ; dx
LOAD_DIFF m0, m7, none, [r0+%1], [r2+%1]
LOAD_DIFF m1, m6, none, [r0+%1+r1], [r2+%1+r3]
LOAD_DIFF m2, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
LOAD_DIFF m3, m6, none, [r0+%1+r4], [r2+%1+r5]
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
LOAD_DIFF m4, m7, none, [r0+%1], [r2+%1]
LOAD_DIFF m5, m6, none, [r0+%1+r1], [r2+%1+r3]
LOAD_DIFF m6, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
movq [spill], m5
LOAD_DIFF m7, m5, none, [r0+%1+r4], [r2+%1+r5]
movq m5, [spill]
%endmacro
%macro SUM4x8_MM 0
movq [spill], m6
movq [spill+8], m7
ABSW2 m0, m1, m0, m1, m6, m7
ABSW2 m2, m3, m2, m3, m6, m7
paddw m0, m2
paddw m1, m3
movq m6, [spill]
movq m7, [spill+8]
ABSW2 m4, m5, m4, m5, m2, m3
ABSW2 m6, m7, m6, m7, m2, m3
paddw m4, m6
paddw m5, m7
paddw m0, m4
paddw m1, m5
paddw m0, m1
%endmacro
;-----------------------------------------------------------------------------
; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sa8d_8x8_internal
push r0
push r2
sub esp, 0x74
%define args esp+0x74
%define spill esp+0x60 ; +16
%define trans esp+0 ; +96
LOAD_DIFF_4x8P 0
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movq [spill], m1
TRANSPOSE4x4W 4, 5, 6, 7, 1
movq [trans+0x00], m4
movq [trans+0x08], m5
movq [trans+0x10], m6
movq [trans+0x18], m7
movq m1, [spill]
TRANSPOSE4x4W 0, 1, 2, 3, 4
movq [trans+0x20], m0
movq [trans+0x28], m1
movq [trans+0x30], m2
movq [trans+0x38], m3
mov r0, [args+4]
mov r2, [args]
LOAD_DIFF_4x8P 4
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movq [spill], m7
TRANSPOSE4x4W 0, 1, 2, 3, 7
movq [trans+0x40], m0
movq [trans+0x48], m1
movq [trans+0x50], m2
movq [trans+0x58], m3
movq m7, [spill]
TRANSPOSE4x4W 4, 5, 6, 7, 1
movq m0, [trans+0x00]
movq m1, [trans+0x08]
movq m2, [trans+0x10]
movq m3, [trans+0x18]
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
SUM4x8_MM
movq [trans], m0
movq m0, [trans+0x20]
movq m1, [trans+0x28]
movq m2, [trans+0x30]
movq m3, [trans+0x38]
movq m4, [trans+0x40]
movq m5, [trans+0x48]
movq m6, [trans+0x50]
movq m7, [trans+0x58]
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
SUM4x8_MM
pavgw m0, [trans]
add esp, 0x7c
ret
%undef args
%undef spill
%undef trans
%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
pxor %7, %7
pshufw %4, %1, q1032
pshufw %5, %2, q1032
pshufw %6, %3, q1032
paddusw %1, %4
paddusw %2, %5
paddusw %3, %6
punpcklwd %1, %7
punpcklwd %2, %7
punpcklwd %3, %7
pshufw %4, %1, q1032
pshufw %5, %2, q1032
pshufw %6, %3, q1032
%8 %1, %4
%8 %2, %5
%8 %3, %6
%endmacro
%macro LOAD_4x8P 1 ; dx
pxor m7, m7
movd m6, [r0+%1+7*FENC_STRIDE]
movd m0, [r0+%1+0*FENC_STRIDE]
movd m1, [r0+%1+1*FENC_STRIDE]
movd m2, [r0+%1+2*FENC_STRIDE]
movd m3, [r0+%1+3*FENC_STRIDE]
movd m4, [r0+%1+4*FENC_STRIDE]
movd m5, [r0+%1+5*FENC_STRIDE]
punpcklbw m6, m7
punpcklbw m0, m7
punpcklbw m1, m7
movq [spill], m6
punpcklbw m2, m7
punpcklbw m3, m7
movd m6, [r0+%1+6*FENC_STRIDE]
punpcklbw m4, m7
punpcklbw m5, m7
punpcklbw m6, m7
movq m7, [spill]
%endmacro
%macro HSUMSUB2 4
pshufw m4, %1, %3
pshufw m5, %2, %3
pmullw %1, %4
pmullw m5, %4
paddw %1, m4
paddw %2, m5
%endmacro
;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;-----------------------------------------------------------------------------
cglobal intra_sa8d_x3_8x8, 2,3
SUB esp, 0x94
%define edge esp+0x70 ; +32
%define spill esp+0x60 ; +16
%define trans esp+0 ; +96
%define sum esp+0 ; +32
pxor m7, m7
movq m0, [r1+7]
movq m2, [r1+16]
movq m1, m0
movq m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
movq m6, [pw_ppmmppmm]
HSUMSUB2 m0, m2, q1032, m6
HSUMSUB2 m1, m3, q1032, m6
movq m6, [pw_pmpmpmpm]
HSUMSUB2 m0, m2, q2301, m6
HSUMSUB2 m1, m3, q2301, m6
movq m4, m0
movq m5, m2
paddw m0, m1
paddw m2, m3
psubw m4, m1
psubw m3, m5
movq [edge+0], m0
movq [edge+8], m4
movq [edge+16], m2
movq [edge+24], m3
LOAD_4x8P 0
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movq [spill], m0
TRANSPOSE4x4W 4, 5, 6, 7, 0
movq [trans+0x00], m4
movq [trans+0x08], m5
movq [trans+0x10], m6
movq [trans+0x18], m7
movq m0, [spill]
TRANSPOSE4x4W 0, 1, 2, 3, 4
movq [trans+0x20], m0
movq [trans+0x28], m1
movq [trans+0x30], m2
movq [trans+0x38], m3
LOAD_4x8P 4
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movq [spill], m7
TRANSPOSE4x4W 0, 1, 2, 3, 7
movq [trans+0x40], m0
movq [trans+0x48], m1
movq [trans+0x50], m2
movq [trans+0x58], m3
movq m7, [spill]
TRANSPOSE4x4W 4, 5, 6, 7, 0
movq m0, [trans+0x00]
movq m1, [trans+0x08]
movq m2, [trans+0x10]
movq m3, [trans+0x18]
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movq [spill+0], m0
movq [spill+8], m1
ABSW2 m2, m3, m2, m3, m0, m1
ABSW2 m4, m5, m4, m5, m0, m1
paddw m2, m4
paddw m3, m5
ABSW2 m6, m7, m6, m7, m4, m5
movq m0, [spill+0]
movq m1, [spill+8]
paddw m2, m6
paddw m3, m7
paddw m2, m3
ABSW m1, m1, m4
paddw m2, m1 ; 7x4 sum
movq m7, m0
movq m1, [edge+8] ; left bottom
psllw m1, 3
psubw m7, m1
ABSW2 m0, m7, m0, m7, m5, m3
paddw m0, m2
paddw m7, m2
movq [sum+0], m0 ; dc
movq [sum+8], m7 ; left
movq m0, [trans+0x20]
movq m1, [trans+0x28]
movq m2, [trans+0x30]
movq m3, [trans+0x38]
movq m4, [trans+0x40]
movq m5, [trans+0x48]
movq m6, [trans+0x50]
movq m7, [trans+0x58]
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movd [sum+0x10], m0
movd [sum+0x12], m1
movd [sum+0x14], m2
movd [sum+0x16], m3
movd [sum+0x18], m4
movd [sum+0x1a], m5
movd [sum+0x1c], m6
movd [sum+0x1e], m7
movq [spill], m0
movq [spill+8], m1
ABSW2 m2, m3, m2, m3, m0, m1
ABSW2 m4, m5, m4, m5, m0, m1
paddw m2, m4
paddw m3, m5
paddw m2, m3
movq m0, [spill]
movq m1, [spill+8]
ABSW2 m6, m7, m6, m7, m4, m5
ABSW m1, m1, m3
paddw m2, m7
paddw m1, m6
paddw m2, m1 ; 7x4 sum
movq m1, m0
movq m7, [edge+0]
psllw m7, 3 ; left top
mov r2, [edge+0]
add r2, [edge+16]
lea r2, [4*r2+32]
and r2, 0xffc0
movd m6, r2 ; dc
psubw m1, m7
psubw m0, m6
ABSW2 m0, m1, m0, m1, m5, m6
movq m3, [sum+0] ; dc
paddw m0, m2
paddw m1, m2
movq m2, m0
paddw m0, m3
paddw m1, [sum+8] ; h
psrlq m2, 16
paddw m2, m3
movq m3, [edge+16] ; top left
movq m4, [edge+24] ; top right
psllw m3, 3
psllw m4, 3
psubw m3, [sum+16]
psubw m4, [sum+24]
ABSW2 m3, m4, m3, m4, m5, m6
paddw m2, m3
paddw m2, m4 ; v
SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, pavgw
mov r2, r2m
pxor m7, m7
punpckldq m2, m1
pavgw m0, m7
pavgw m2, m7
movd [r2+8], m0 ; dc
movq [r2+0], m2 ; v, h
ADD esp, 0x94
RET
%undef edge
%undef spill
%undef trans
%undef sum
;-----------------------------------------------------------------------------
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
; const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
cglobal pixel_ssim_4x4x2_core, 0,5
mov r1, r1m
mov r3, r3m
mov r4, 4
pxor m0, m0
.loop:
mov r0, r0m
mov r2, r2m
add r0, r4
add r2, r4
pxor m1, m1
pxor m2, m2
pxor m3, m3
pxor m4, m4
%rep 4
movd m5, [r0]
movd m6, [r2]
punpcklbw m5, m0
punpcklbw m6, m0
paddw m1, m5
paddw m2, m6
movq m7, m5
pmaddwd m5, m5
pmaddwd m7, m6
pmaddwd m6, m6
paddd m3, m5
paddd m4, m7
paddd m3, m6
add r0, r1
add r2, r3
%endrep
mov r0, r4m
lea r0, [r0+r4*4]
pshufw m5, m1, q0032
pshufw m6, m2, q0032
paddusw m1, m5
paddusw m2, m6
punpcklwd m1, m2
pshufw m2, m1, q0032
pshufw m5, m3, q0032
pshufw m6, m4, q0032
paddusw m1, m2
paddd m3, m5
paddd m4, m6
punpcklwd m1, m0
punpckldq m3, m4
movq [r0+0], m1
movq [r0+8], m3
sub r4, 4
jge .loop
emms
RET
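
For reference, a scalar C model of what pixel_ssim_4x4x2_core computes, mirroring the C fallback in x264/x265 (a sketch; only the MMX version appears in this diff). For each of two horizontally adjacent 4x4 blocks it accumulates the sums the SSIM formula needs: sum(a), sum(b), sum(a^2)+sum(b^2), and sum(a*b).

#include <stdint.h>

static void ssim_4x4x2_core_c(const uint8_t *pix1, intptr_t stride1,
                              const uint8_t *pix2, intptr_t stride2,
                              int sums[2][4])
{
    for (int z = 0; z < 2; z++)
    {
        int s1 = 0, s2 = 0, ss = 0, s12 = 0;
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
            {
                int a = pix1[x + y * stride1];
                int b = pix2[x + y * stride2];
                s1  += a;              /* sum of first block's pixels  */
                s2  += b;              /* sum of second block's pixels */
                ss  += a * a + b * b;  /* sum of squares               */
                s12 += a * b;          /* cross term                   */
            }
        sums[z][0] = s1;
        sums[z][1] = s2;
        sums[z][2] = ss;
        sums[z][3] = s12;
        pix1 += 4;  /* step to the adjacent 4x4 block */
        pix2 += 4;
    }
}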

File diff suppressed because it is too large

@@ -0,0 +1,59 @@
/*****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve@borho.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#ifndef X265_PIXEL_UTIL_H
#define X265_PIXEL_UTIL_H
#define DEFINE_UTILS(cpu) \
FUNCDEF_TU_S2(void, getResidual, cpu, const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); \
FUNCDEF_TU_S2(void, transpose, cpu, pixel* dest, const pixel* src, intptr_t stride); \
FUNCDEF_TU(int, count_nonzero, cpu, const int16_t* quantCoeff); \
uint32_t PFX(quant_ ## cpu(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)); \
uint32_t PFX(nquant_ ## cpu(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)); \
void PFX(dequant_normal_ ## cpu(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)); \
void PFX(dequant_scaling_## cpu(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)); \
void PFX(weight_pp_ ## cpu(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)); \
void PFX(weight_sp_ ## cpu(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)); \
void PFX(scale1D_128to64_ ## cpu(pixel*, const pixel*)); \
void PFX(scale2D_64to32_ ## cpu(pixel*, const pixel*, intptr_t)); \
uint32_t PFX(costCoeffRemain_ ## cpu(uint16_t *absCoeff, int numNonZero, int idx)); \
uint32_t PFX(costC1C2Flag_sse2(uint16_t *absCoeff, intptr_t numNonZero, uint8_t *baseCtxMod, intptr_t ctxOffset));
DEFINE_UTILS(sse2);
DEFINE_UTILS(ssse3);
DEFINE_UTILS(sse4);
DEFINE_UTILS(avx2);
#undef DEFINE_UTILS
void PFX(pixel_ssim_4x4x2_core_sse2(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]));
void PFX(pixel_ssim_4x4x2_core_avx(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]));
float PFX(pixel_ssim_end4_sse2(int sum0[5][4], int sum1[5][4], int width));
float PFX(pixel_ssim_end4_avx(int sum0[5][4], int sum1[5][4], int width));
int PFX(scanPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize));
int PFX(scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize));
uint32_t PFX(findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]));
uint32_t PFX(costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
#endif // ifndef X265_PIXEL_UTIL_H
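
A note on the PFX(name_ ## cpu(args)) style used above: the ## paste happens first, then PFX prefixes the resulting token, so for cpu=sse2 the quant line expands to:

uint32_t x265_quant_sse2(const int16_t* coef, const int32_t* quantCoeff,
                         int32_t* deltaU, int16_t* qCoef, int qBits, int add,
                         int numCoeff);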

File diff suppressed because it is too large

@@ -0,0 +1,69 @@
/*****************************************************************************
* pixel.h: x86 pixel metrics
*****************************************************************************
* Copyright (C) 2003-2013 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
* Fiona Glaser <fiona@x264.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#ifndef X265_I386_PIXEL_H
#define X265_I386_PIXEL_H
void PFX(downShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
void PFX(downShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
void PFX(upShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
void PFX(upShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
pixel PFX(planeClipAndMax_avx2)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
#define DECL_PIXELS(cpu) \
FUNCDEF_PU(uint32_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
FUNCDEF_PU(int, pixel_sa8d, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
FUNCDEF_PU(void, pixel_sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
FUNCDEF_PU(void, pixel_avg, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
FUNCDEF_PU(void, pixel_add_ps, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
FUNCDEF_CHROMA_PU(uint32_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
FUNCDEF_CHROMA_PU(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
FUNCDEF_TU_S(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); \
FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
DECL_PIXELS(mmx);
DECL_PIXELS(mmx2);
DECL_PIXELS(sse2);
DECL_PIXELS(sse3);
DECL_PIXELS(sse4);
DECL_PIXELS(ssse3);
DECL_PIXELS(avx);
DECL_PIXELS(xop);
DECL_PIXELS(avx2);
#undef DECL_PIXELS
#endif // ifndef X265_I386_PIXEL_H
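
Assuming FUNCDEF_CHROMA_PU stamps out one prototype per partition size, DECL_PIXELS(sse2) yields SAD declarations such as:

/* Hypothetical 16x16 instance. */
int x265_pixel_sad_16x16_sse2(const pixel*, intptr_t, const pixel*, intptr_t);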

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,893 @@
;*****************************************************************************
;* x86util.asm: x86 utility macros
;*****************************************************************************
;* Copyright (C) 2008-2013 x264 project
;*
;* Authors: Holger Lubitz <holger@lubitz.org>
;* Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************
%assign FENC_STRIDE 64
%assign FDEC_STRIDE 32
%assign SIZEOF_PIXEL 1
%assign SIZEOF_DCTCOEF 2
%define pixel byte
%define vpbroadcastdct vpbroadcastw
%define vpbroadcastpix vpbroadcastb
%if HIGH_BIT_DEPTH
%assign SIZEOF_PIXEL 2
%assign SIZEOF_DCTCOEF 4
%define pixel word
%define vpbroadcastdct vpbroadcastd
%define vpbroadcastpix vpbroadcastw
%endif
%assign FENC_STRIDEB SIZEOF_PIXEL*FENC_STRIDE
%assign FDEC_STRIDEB SIZEOF_PIXEL*FDEC_STRIDE
%assign PIXEL_MAX ((1 << BIT_DEPTH)-1)
%macro FIX_STRIDES 1-*
%if HIGH_BIT_DEPTH
%rep %0
add %1, %1
%rotate 1
%endrep
%endif
%endmacro
%macro SBUTTERFLY 4
%ifidn %1, dqqq
vperm2i128 m%4, m%2, m%3, q0301 ; punpckh
vinserti128 m%2, m%2, xm%3, 1 ; punpckl
%elif avx_enabled && mmsize >= 16
punpckh%1 m%4, m%2, m%3
punpckl%1 m%2, m%3
%else
mova m%4, m%2
punpckl%1 m%2, m%3
punpckh%1 m%4, m%3
%endif
SWAP %3, %4
%endmacro
%macro SBUTTERFLY2 4
punpckl%1 m%4, m%2, m%3
punpckh%1 m%2, m%2, m%3
SWAP %2, %4, %3
%endmacro
%macro TRANSPOSE4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
SBUTTERFLY dq, %1, %3, %5
SBUTTERFLY dq, %2, %4, %5
SWAP %2, %3
%endmacro
%macro TRANSPOSE2x4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
SBUTTERFLY dq, %1, %3, %5
SBUTTERFLY dq, %2, %4, %5
SBUTTERFLY qdq, %1, %2, %5
SBUTTERFLY qdq, %3, %4, %5
%endmacro
%macro TRANSPOSE4x4D 5
SBUTTERFLY dq, %1, %2, %5
SBUTTERFLY dq, %3, %4, %5
SBUTTERFLY qdq, %1, %3, %5
SBUTTERFLY qdq, %2, %4, %5
SWAP %2, %3
%endmacro
%macro TRANSPOSE8x8W 9-11
%if ARCH_X86_64
SBUTTERFLY wd, %1, %2, %9
SBUTTERFLY wd, %3, %4, %9
SBUTTERFLY wd, %5, %6, %9
SBUTTERFLY wd, %7, %8, %9
SBUTTERFLY dq, %1, %3, %9
SBUTTERFLY dq, %2, %4, %9
SBUTTERFLY dq, %5, %7, %9
SBUTTERFLY dq, %6, %8, %9
SBUTTERFLY qdq, %1, %5, %9
SBUTTERFLY qdq, %2, %6, %9
SBUTTERFLY qdq, %3, %7, %9
SBUTTERFLY qdq, %4, %8, %9
SWAP %2, %5
SWAP %4, %7
%else
; in: m0..m7, unless %11 in which case m6 is in %9
; out: m0..m7, unless %11 in which case m4 is in %10
; spills into %9 and %10
%if %0<11
movdqa %9, m%7
%endif
SBUTTERFLY wd, %1, %2, %7
movdqa %10, m%2
movdqa m%7, %9
SBUTTERFLY wd, %3, %4, %2
SBUTTERFLY wd, %5, %6, %2
SBUTTERFLY wd, %7, %8, %2
SBUTTERFLY dq, %1, %3, %2
movdqa %9, m%3
movdqa m%2, %10
SBUTTERFLY dq, %2, %4, %3
SBUTTERFLY dq, %5, %7, %3
SBUTTERFLY dq, %6, %8, %3
SBUTTERFLY qdq, %1, %5, %3
SBUTTERFLY qdq, %2, %6, %3
movdqa %10, m%2
movdqa m%3, %9
SBUTTERFLY qdq, %3, %7, %2
SBUTTERFLY qdq, %4, %8, %2
SWAP %2, %5
SWAP %4, %7
%if %0<11
movdqa m%5, %10
%endif
%endif
%endmacro
%macro WIDEN_SXWD 2
punpckhwd m%2, m%1
psrad m%2, 16
%if cpuflag(sse4)
pmovsxwd m%1, m%1
%else
punpcklwd m%1, m%1
psrad m%1, 16
%endif
%endmacro
%macro ABSW 2-3 ; dst, src, tmp (tmp used only if dst==src)
%if cpuflag(ssse3)
pabsw %1, %2
%elifidn %3, sign ; version for pairing with PSIGNW: modifies src
pxor %1, %1
pcmpgtw %1, %2
pxor %2, %1
psubw %2, %1
SWAP %1, %2
%elifidn %1, %2
pxor %3, %3
psubw %3, %1
pmaxsw %1, %3
%elifid %2
pxor %1, %1
psubw %1, %2
pmaxsw %1, %2
%elif %0 == 2
pxor %1, %1
psubw %1, %2
pmaxsw %1, %2
%else
mova %1, %2
pxor %3, %3
psubw %3, %1
pmaxsw %1, %3
%endif
%endmacro
%macro ABSW2 6 ; dst1, dst2, src1, src2, tmp, tmp
%if cpuflag(ssse3)
pabsw %1, %3
pabsw %2, %4
%elifidn %1, %3
pxor %5, %5
pxor %6, %6
psubw %5, %1
psubw %6, %2
pmaxsw %1, %5
pmaxsw %2, %6
%else
pxor %1, %1
pxor %2, %2
psubw %1, %3
psubw %2, %4
pmaxsw %1, %3
pmaxsw %2, %4
%endif
%endmacro
%macro ABSB 2
%if cpuflag(ssse3)
pabsb %1, %1
%else
pxor %2, %2
psubb %2, %1
pminub %1, %2
%endif
%endmacro
%macro ABSD 2-3
%if cpuflag(ssse3)
pabsd %1, %2
%else
%define %%s %2
%if %0 == 3
mova %3, %2
%define %%s %3
%endif
pxor %1, %1
pcmpgtd %1, %%s
pxor %%s, %1
psubd %%s, %1
SWAP %1, %%s
%endif
%endmacro
%macro PSIGN 3-4
%if cpuflag(ssse3) && %0 == 4
psign%1 %2, %3, %4
%elif cpuflag(ssse3)
psign%1 %2, %3
%elif %0 == 4
pxor %2, %3, %4
psub%1 %2, %4
%else
pxor %2, %3
psub%1 %2, %3
%endif
%endmacro
%define PSIGNW PSIGN w,
%define PSIGND PSIGN d,
%macro SPLATB_LOAD 3
%if cpuflag(ssse3)
movd %1, [%2-3]
pshufb %1, %3
%else
movd %1, [%2-3] ;to avoid crossing a cacheline
punpcklbw %1, %1
SPLATW %1, %1, 3
%endif
%endmacro
%imacro SPLATW 2-3 0
%if cpuflag(avx2) && %3 == 0
vpbroadcastw %1, %2
%else
PSHUFLW %1, %2, (%3)*q1111
%if mmsize == 16
punpcklqdq %1, %1
%endif
%endif
%endmacro
%imacro SPLATD 2-3 0
%if mmsize == 16
pshufd %1, %2, (%3)*q1111
%else
pshufw %1, %2, (%3)*q0101 + ((%3)+1)*q1010
%endif
%endmacro
%macro CLIPW 3 ;(dst, min, max)
pmaxsw %1, %2
pminsw %1, %3
%endmacro
%macro CLIPW2 4 ;(dst0, dst1, min, max)
pmaxsw %1, %3
pmaxsw %2, %3
pminsw %1, %4
pminsw %2, %4
%endmacro
%macro HADDD 2 ; sum junk
%if sizeof%1 == 32
%define %2 xmm%2
vextracti128 %2, %1, 1
%define %1 xmm%1
paddd %1, %2
%endif
%if mmsize >= 16
%if cpuflag(xop) && sizeof%1 == 16
vphadddq %1, %1
%endif
movhlps %2, %1
paddd %1, %2
%endif
%if notcpuflag(xop)
PSHUFLW %2, %1, q0032
paddd %1, %2
%endif
%undef %1
%undef %2
%endmacro
%macro HADDW 2 ; reg, tmp
%if cpuflag(xop) && sizeof%1 == 16
vphaddwq %1, %1
movhlps %2, %1
paddd %1, %2
%else
pmaddwd %1, [pw_1]
HADDD %1, %2
%endif
%endmacro
%macro HADDUWD 2
%if cpuflag(xop) && sizeof%1 == 16
vphadduwd %1, %1
%else
psrld %2, %1, 16
pslld %1, 16
psrld %1, 16
paddd %1, %2
%endif
%endmacro
%macro HADDUW 2
%if cpuflag(xop) && sizeof%1 == 16
vphadduwq %1, %1
movhlps %2, %1
paddd %1, %2
%else
HADDUWD %1, %2
HADDD %1, %2
%endif
%endmacro
%macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp
; AVX2 version uses a precalculated extra input that
; can be re-used across calls
%if sizeof%1==32
; %3 = abcdefgh ijklmnop (lower address)
; %2 = ABCDEFGH IJKLMNOP (higher address)
vperm2i128 %4, %1, %2, q0003 ; %4 = ijklmnop ABCDEFGH
%if %3 < 16
palignr %1, %4, %2, %3 ; %1 = bcdefghi jklmnopA
%else
palignr %1, %2, %4, %3-16 ; %1 = pABCDEFG HIJKLMNO
%endif
%elif cpuflag(ssse3)
%if %0==5
palignr %1, %2, %3, %4
%else
palignr %1, %2, %3
%endif
%else
%define %%dst %1
%if %0==5
%ifnidn %1, %2
mova %%dst, %2
%endif
%rotate 1
%endif
%ifnidn %4, %2
mova %4, %2
%endif
%if mmsize==8
psllq %%dst, (8-%3)*8
psrlq %4, %3*8
%else
pslldq %%dst, 16-%3
psrldq %4, %3
%endif
por %%dst, %4
%endif
%endmacro
%macro PSHUFLW 1+
%if mmsize == 8
pshufw %1
%else
pshuflw %1
%endif
%endmacro
; shift a mmxreg by n bytes, or a xmmreg by 2*n bytes
; values shifted in are undefined
; faster if dst==src
%define PSLLPIX PSXLPIX l, -1, ;dst, src, shift
%define PSRLPIX PSXLPIX r, 1, ;dst, src, shift
%macro PSXLPIX 5
%if mmsize == 8
%if %5&1
ps%1lq %3, %4, %5*8
%else
pshufw %3, %4, (q3210<<8>>(8+%2*%5))&0xff
%endif
%else
ps%1ldq %3, %4, %5*2
%endif
%endmacro
%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
%ifnum %5
pand m%3, m%5, m%4 ; src .. y6 .. y4
pand m%1, m%5, m%2 ; dst .. y6 .. y4
%else
mova m%1, %5
pand m%3, m%1, m%4 ; src .. y6 .. y4
pand m%1, m%1, m%2 ; dst .. y6 .. y4
%endif
psrlw m%2, 8 ; dst .. y7 .. y5
psrlw m%4, 8 ; src .. y7 .. y5
%endmacro
%macro SUMSUB_BA 3-4
%if %0==3
padd%1 m%2, m%3
padd%1 m%3, m%3
psub%1 m%3, m%2
%elif avx_enabled
padd%1 m%4, m%2, m%3
psub%1 m%3, m%2
SWAP %2, %4
%else
mova m%4, m%2
padd%1 m%2, m%3
psub%1 m%3, m%4
%endif
%endmacro
%macro SUMSUB_BADC 5-6
%if %0==6
SUMSUB_BA %1, %2, %3, %6
SUMSUB_BA %1, %4, %5, %6
%else
padd%1 m%2, m%3
padd%1 m%4, m%5
padd%1 m%3, m%3
padd%1 m%5, m%5
psub%1 m%3, m%2
psub%1 m%5, m%4
%endif
%endmacro
%macro HADAMARD4_V 4+
SUMSUB_BADC w, %1, %2, %3, %4
SUMSUB_BADC w, %1, %3, %2, %4
%endmacro
%macro HADAMARD8_V 8+
SUMSUB_BADC w, %1, %2, %3, %4
SUMSUB_BADC w, %5, %6, %7, %8
SUMSUB_BADC w, %1, %3, %2, %4
SUMSUB_BADC w, %5, %7, %6, %8
SUMSUB_BADC w, %1, %5, %2, %6
SUMSUB_BADC w, %3, %7, %4, %8
%endmacro
%macro TRANS_SSE2 5-6
; TRANSPOSE2x2
; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq
; %2: ord/unord (for compat with sse4, unused)
; %3/%4: source regs
; %5/%6: tmp regs
%ifidn %1, d
%define mask [mask_10]
%define shift 16
%elifidn %1, q
%define mask [mask_1100]
%define shift 32
%endif
%if %0==6 ; less dependency if we have two tmp
mova m%5, mask ; ff00
mova m%6, m%4 ; x5x4
psll%1 m%4, shift ; x4..
pand m%6, m%5 ; x5..
pandn m%5, m%3 ; ..x0
psrl%1 m%3, shift ; ..x1
por m%4, m%5 ; x4x0
por m%3, m%6 ; x5x1
%else ; more dependency, one insn less. sometimes faster, sometimes not
mova m%5, m%4 ; x5x4
psll%1 m%4, shift ; x4..
pxor m%4, m%3 ; (x4^x1)x0
pand m%4, mask ; (x4^x1)..
pxor m%3, m%4 ; x4x0
psrl%1 m%4, shift ; ..(x1^x4)
pxor m%5, m%4 ; x5x1
SWAP %4, %3, %5
%endif
%endmacro
%macro TRANS_SSE4 5-6 ; see above
%ifidn %1, d
%ifidn %2, ord
psrl%1 m%5, m%3, 16
pblendw m%5, m%4, q2222
psll%1 m%4, 16
pblendw m%4, m%3, q1111
SWAP %3, %5
%else
%if avx_enabled
pblendw m%5, m%3, m%4, q2222
SWAP %3, %5
%else
mova m%5, m%3
pblendw m%3, m%4, q2222
%endif
psll%1 m%4, 16
psrl%1 m%5, 16
por m%4, m%5
%endif
%elifidn %1, q
shufps m%5, m%3, m%4, q3131
shufps m%3, m%3, m%4, q2020
SWAP %4, %5
%endif
%endmacro
%macro TRANS_XOP 5-6
%ifidn %1, d
vpperm m%5, m%3, m%4, [transd_shuf1]
vpperm m%3, m%3, m%4, [transd_shuf2]
%elifidn %1, q
shufps m%5, m%3, m%4, q3131
shufps m%3, m%4, q2020
%endif
SWAP %4, %5
%endmacro
%macro HADAMARD 5-6
; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes)
; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes)
; %3/%4: regs
; %5(%6): tmpregs
%if %1!=0 ; have to reorder stuff for horizontal op
%ifidn %2, sumsub
%define ORDER ord
; sumsub needs order because a-b != b-a unless a=b
%else
%define ORDER unord
; if we just max, order doesn't matter (allows pblendw+or in sse4)
%endif
%if %1==1
TRANS d, ORDER, %3, %4, %5, %6
%elif %1==2
%if mmsize==8
SBUTTERFLY dq, %3, %4, %5
%else
TRANS q, ORDER, %3, %4, %5, %6
%endif
%elif %1==4
SBUTTERFLY qdq, %3, %4, %5
%elif %1==8
SBUTTERFLY dqqq, %3, %4, %5
%endif
%endif
%ifidn %2, sumsub
SUMSUB_BA w, %3, %4, %5
%else
%ifidn %2, amax
%if %0==6
ABSW2 m%3, m%4, m%3, m%4, m%5, m%6
%else
ABSW m%3, m%3, m%5
ABSW m%4, m%4, m%5
%endif
%endif
pmaxsw m%3, m%4
%endif
%endmacro
%macro HADAMARD2_2D 6-7 sumsub
HADAMARD 0, sumsub, %1, %2, %5
HADAMARD 0, sumsub, %3, %4, %5
SBUTTERFLY %6, %1, %2, %5
%ifnum %7
HADAMARD 0, amax, %1, %2, %5, %7
%else
HADAMARD 0, %7, %1, %2, %5
%endif
SBUTTERFLY %6, %3, %4, %5
%ifnum %7
HADAMARD 0, amax, %3, %4, %5, %7
%else
HADAMARD 0, %7, %3, %4, %5
%endif
%endmacro
%macro HADAMARD4_2D 5-6 sumsub
HADAMARD2_2D %1, %2, %3, %4, %5, wd
HADAMARD2_2D %1, %3, %2, %4, %5, dq, %6
SWAP %2, %3
%endmacro
%macro HADAMARD4_2D_SSE 5-6 sumsub
HADAMARD 0, sumsub, %1, %2, %5 ; 1st V row 0 + 1
HADAMARD 0, sumsub, %3, %4, %5 ; 1st V row 2 + 3
SBUTTERFLY wd, %1, %2, %5 ; %1: m0 1+0 %2: m1 1+0
SBUTTERFLY wd, %3, %4, %5 ; %3: m0 3+2 %4: m1 3+2
HADAMARD2_2D %1, %3, %2, %4, %5, dq
SBUTTERFLY qdq, %1, %2, %5
HADAMARD 0, %6, %1, %2, %5 ; 2nd H m1/m0 row 0+1
SBUTTERFLY qdq, %3, %4, %5
HADAMARD 0, %6, %3, %4, %5 ; 2nd H m1/m0 row 2+3
%endmacro
%macro HADAMARD8_2D 9-10 sumsub
HADAMARD2_2D %1, %2, %3, %4, %9, wd
HADAMARD2_2D %5, %6, %7, %8, %9, wd
HADAMARD2_2D %1, %3, %2, %4, %9, dq
HADAMARD2_2D %5, %7, %6, %8, %9, dq
HADAMARD2_2D %1, %5, %3, %7, %9, qdq, %10
HADAMARD2_2D %2, %6, %4, %8, %9, qdq, %10
%ifnidn %10, amax
SWAP %2, %5
SWAP %4, %7
%endif
%endmacro
; doesn't include the "pmaddubsw hmul_8p" pass
%macro HADAMARD8_2D_HMUL 10
HADAMARD4_V %1, %2, %3, %4, %9
HADAMARD4_V %5, %6, %7, %8, %9
SUMSUB_BADC w, %1, %5, %2, %6, %9
HADAMARD 2, sumsub, %1, %5, %9, %10
HADAMARD 2, sumsub, %2, %6, %9, %10
SUMSUB_BADC w, %3, %7, %4, %8, %9
HADAMARD 2, sumsub, %3, %7, %9, %10
HADAMARD 2, sumsub, %4, %8, %9, %10
HADAMARD 1, amax, %1, %5, %9, %10
HADAMARD 1, amax, %2, %6, %9, %5
HADAMARD 1, amax, %3, %7, %9, %5
HADAMARD 1, amax, %4, %8, %9, %5
%endmacro
%macro SUMSUB2_AB 4
%if cpuflag(xop)
pmacs%1%1 m%4, m%3, [p%1_m2], m%2
pmacs%1%1 m%2, m%2, [p%1_2], m%3
%elifnum %3
psub%1 m%4, m%2, m%3
psub%1 m%4, m%3
padd%1 m%2, m%2
padd%1 m%2, m%3
%else
mova m%4, m%2
padd%1 m%2, m%2
padd%1 m%2, %3
psub%1 m%4, %3
psub%1 m%4, %3
%endif
%endmacro
%macro SUMSUBD2_AB 5
%ifnum %4
psra%1 m%5, m%2, 1 ; %3: %3>>1
psra%1 m%4, m%3, 1 ; %2: %2>>1
padd%1 m%4, m%2 ; %3: %3>>1+%2
psub%1 m%5, m%3 ; %2: %2>>1-%3
SWAP %2, %5
SWAP %3, %4
%else
mova %5, m%2
mova %4, m%3
psra%1 m%3, 1 ; %3: %3>>1
psra%1 m%2, 1 ; %2: %2>>1
padd%1 m%3, %5 ; %3: %3>>1+%2
psub%1 m%2, %4 ; %2: %2>>1-%3
%endif
%endmacro
%macro DCT4_1D 5
%ifnum %5
SUMSUB_BADC w, %4, %1, %3, %2, %5
SUMSUB_BA w, %3, %4, %5
SUMSUB2_AB w, %1, %2, %5
SWAP %1, %3, %4, %5, %2
%else
SUMSUB_BADC w, %4, %1, %3, %2
SUMSUB_BA w, %3, %4
mova [%5], m%2
SUMSUB2_AB w, %1, [%5], %2
SWAP %1, %3, %4, %2
%endif
%endmacro
%macro IDCT4_1D 6-7
%ifnum %6
SUMSUBD2_AB %1, %3, %5, %7, %6
; %3: %3>>1-%5 %5: %3+%5>>1
SUMSUB_BA %1, %4, %2, %7
; %4: %2+%4 %2: %2-%4
SUMSUB_BADC %1, %5, %4, %3, %2, %7
; %5: %2+%4 + (%3+%5>>1)
; %4: %2+%4 - (%3+%5>>1)
; %3: %2-%4 + (%3>>1-%5)
; %2: %2-%4 - (%3>>1-%5)
%else
%ifidn %1, w
SUMSUBD2_AB %1, %3, %5, [%6], [%6+16]
%else
SUMSUBD2_AB %1, %3, %5, [%6], [%6+32]
%endif
SUMSUB_BA %1, %4, %2
SUMSUB_BADC %1, %5, %4, %3, %2
%endif
SWAP %2, %5, %4
; %2: %2+%4 + (%3+%5>>1) row0
; %3: %2-%4 + (%3>>1-%5) row1
; %4: %2-%4 - (%3>>1-%5) row2
; %5: %2+%4 - (%3+%5>>1) row3
%endmacro
%macro LOAD_DIFF 5-6 1
%if HIGH_BIT_DEPTH
%if %6 ; %5 aligned?
mova %1, %4
psubw %1, %5
%else
movu %1, %4
movu %2, %5
psubw %1, %2
%endif
%else ; !HIGH_BIT_DEPTH
%ifidn %3, none
movh %1, %4
movh %2, %5
punpcklbw %1, %2
punpcklbw %2, %2
psubw %1, %2
%else
movh %1, %4
punpcklbw %1, %3
movh %2, %5
punpcklbw %2, %3
psubw %1, %2
%endif
%endif ; HIGH_BIT_DEPTH
%endmacro
%macro LOAD_DIFF8x4 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr
%if BIT_DEPTH == 8 && cpuflag(ssse3)
movh m%2, [%8+%1*FDEC_STRIDE]
movh m%1, [%7+%1*FENC_STRIDE]
punpcklbw m%1, m%2
movh m%3, [%8+%2*FDEC_STRIDE]
movh m%2, [%7+%2*FENC_STRIDE]
punpcklbw m%2, m%3
movh m%4, [%8+%3*FDEC_STRIDE]
movh m%3, [%7+%3*FENC_STRIDE]
punpcklbw m%3, m%4
movh m%5, [%8+%4*FDEC_STRIDE]
movh m%4, [%7+%4*FENC_STRIDE]
punpcklbw m%4, m%5
pmaddubsw m%1, m%6
pmaddubsw m%2, m%6
pmaddubsw m%3, m%6
pmaddubsw m%4, m%6
%else
LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDEB], [%8+%1*FDEC_STRIDEB]
LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDEB], [%8+%2*FDEC_STRIDEB]
LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDEB], [%8+%3*FDEC_STRIDEB]
LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDEB], [%8+%4*FDEC_STRIDEB]
%endif
%endmacro
%macro STORE_DCT 6
movq [%5+%6+ 0], m%1
movq [%5+%6+ 8], m%2
movq [%5+%6+16], m%3
movq [%5+%6+24], m%4
movhps [%5+%6+32], m%1
movhps [%5+%6+40], m%2
movhps [%5+%6+48], m%3
movhps [%5+%6+56], m%4
%endmacro
%macro STORE_IDCT 4
movhps [r0-4*FDEC_STRIDE], %1
movh [r0-3*FDEC_STRIDE], %1
movhps [r0-2*FDEC_STRIDE], %2
movh [r0-1*FDEC_STRIDE], %2
movhps [r0+0*FDEC_STRIDE], %3
movh [r0+1*FDEC_STRIDE], %3
movhps [r0+2*FDEC_STRIDE], %4
movh [r0+3*FDEC_STRIDE], %4
%endmacro
%macro LOAD_DIFF_8x4P 7-11 r0,r2,0,1 ; 4x dest, 2x temp, 2x pointer, increment, aligned?
LOAD_DIFF m%1, m%5, m%7, [%8], [%9], %11
LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3], %11
LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3], %11
LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5], %11
%if %10
lea %8, [%8+4*r1]
lea %9, [%9+4*r3]
%endif
%endmacro
; 2xdst, 2xtmp, 2xsrcrow
%macro LOAD_DIFF16x2_AVX2 6
pmovzxbw m%1, [r1+%5*FENC_STRIDE]
pmovzxbw m%2, [r1+%6*FENC_STRIDE]
pmovzxbw m%3, [r2+(%5-4)*FDEC_STRIDE]
pmovzxbw m%4, [r2+(%6-4)*FDEC_STRIDE]
psubw m%1, m%3
psubw m%2, m%4
%endmacro
%macro DIFFx2 6-7
movh %3, %5
punpcklbw %3, %4
psraw %1, 6
paddsw %1, %3
movh %3, %6
punpcklbw %3, %4
psraw %2, 6
paddsw %2, %3
packuswb %2, %1
%endmacro
; (high depth) in: %1, %2, min to clip, max to clip, mem128
; in: %1, tmp, %3, mem64
%macro STORE_DIFF 4-5
%if HIGH_BIT_DEPTH
psrad %1, 6
psrad %2, 6
packssdw %1, %2
paddw %1, %5
CLIPW %1, %3, %4
mova %5, %1
%else
movh %2, %4
punpcklbw %2, %3
psraw %1, 6
paddsw %1, %2
packuswb %1, %1
movh %4, %1
%endif
%endmacro
%macro SHUFFLE_MASK_W 8
%rep 8
%if %1>=0x80
db %1, %1
%else
db %1*2
db %1*2+1
%endif
%rotate 1
%endrep
%endmacro
; instruction, accum, input, iteration (zero to swap, nonzero to add)
%macro ACCUM 4
%if %4
%1 m%2, m%3
%else
SWAP %2, %3
%endif
%endmacro
; IACA support
%macro IACA_START 0
mov ebx, 111
db 0x64, 0x67, 0x90
%endmacro
%macro IACA_END 0
mov ebx, 222
db 0x64, 0x67, 0x90
%endmacro
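
A scalar C model of the HADAMARD4_V building block above: SUMSUB_BADC maps each register pair (a, b) to (a+b, b-a), and applying it twice in the pattern the macro uses yields a 4-point Hadamard transform (up to sign and ordering).

static void hadamard4_v(int d[4])
{
    int a = d[0] + d[1], b = d[1] - d[0];  /* first SUMSUB_BADC pass  */
    int c = d[2] + d[3], e = d[3] - d[2];
    d[0] = a + c;  d[2] = c - a;           /* second SUMSUB_BADC pass */
    d[1] = b + e;  d[3] = e - b;
}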