libbpg-0.9.6

King_DuckZ 2015-10-27 11:46:00 +01:00
parent 3035b41edf
commit 35a8402710
248 changed files with 232891 additions and 100 deletions

@@ -0,0 +1,14 @@
The ASM source here is pulled directly from the x264 project, with four changes:
1 - FENC_STRIDE must be increased to 64 in x86util.asm because of HEVC's
larger CU sizes
2 - Because of #1, we must rebrand the functions with x265_ prefixes in
x86inc.asm (private_prefix) and pixel-a.asm (mangle(x265_pixel_ssd))
3 - We have modified the MMX SSD primitives to use EMMS before returning
4 - We have added some new SATD block sizes for SSE3
Current assembly is based on x264 revision:
configure: Support cygwin64
Diogo Franco (Kovensky) <diogomfranco@gmail.com>
2013-07-23 22:17:44 -0300
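
To illustrate change #2: with private_prefix switched to x265 in x86inc.asm, every cglobal symbol is emitted with the new prefix, so the C side links against names like the one below (a sketch; the pixel typedef and the exact partition sizes come from the x265 headers further down).

/* Hedged sketch of one rebranded symbol; the signature follows the
 * pixel_ssd prototypes declared in pixel.h below. */
uint32_t x265_pixel_ssd_16x16_sse2(const pixel* pix1, intptr_t stride1,
                                   const pixel* pix2, intptr_t stride2);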

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,63 @@
/*****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve@borho.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#ifndef X265_BLOCKCOPY8_H
#define X265_BLOCKCOPY8_H
FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy2Dto1D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy2Dto1D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(void, cpy1Dto2D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
FUNCDEF_TU_S(uint32_t, copy_cnt, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride);
FUNCDEF_TU_S(uint32_t, copy_cnt, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride);
FUNCDEF_TU_S(uint32_t, copy_cnt, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride);
FUNCDEF_TU(void, blockfill_s, sse2, int16_t* dst, intptr_t dstride, int16_t val);
FUNCDEF_TU(void, blockfill_s, avx2, int16_t* dst, intptr_t dstride, int16_t val);
FUNCDEF_CHROMA_PU(void, blockcopy_ss, sse2, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
FUNCDEF_CHROMA_PU(void, blockcopy_pp, sse2, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
FUNCDEF_CHROMA_PU(void, blockcopy_pp, avx, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_sp, sse2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_sp, sse4, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_sp, avx2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_ps, sse2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_ps, sse4, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
FUNCDEF_PU(void, blockcopy_ps, avx2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
#endif // ifndef X265_BLOCKCOPY8_H
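
For illustration, assuming FUNCDEF_CHROMA_PU stamps out one prototype per partition size (its definition is not part of this hunk), the blockcopy_pp line expands for a 16x16 block to roughly:

/* Hypothetical expansion; the 16x16 instance is an assumed example. */
void x265_blockcopy_pp_16x16_sse2(pixel* dst, intptr_t dstStride,
                                  const pixel* src, intptr_t srcStride);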

@@ -0,0 +1,146 @@
;*****************************************************************************
;* const-a.asm: x86 global constants
;*****************************************************************************
;* Copyright (C) 2010-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;* Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
;* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************
%include "x86inc.asm"
SECTION_RODATA 32
;; 8-bit constants
const pb_0, times 16 db 0
const pb_1, times 32 db 1
const pb_2, times 32 db 2
const pb_3, times 16 db 3
const pb_4, times 32 db 4
const pb_8, times 32 db 8
const pb_15, times 32 db 15
const pb_16, times 32 db 16
const pb_32, times 32 db 32
const pb_64, times 32 db 64
const pb_128, times 32 db 128
const pb_a1, times 16 db 0xa1
const pb_01, times 8 db 0, 1
const hsub_mul, times 16 db 1, -1
const pw_swap, times 2 db 6, 7, 4, 5, 2, 3, 0, 1
const pb_unpackbd1, times 2 db 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3
const pb_unpackbd2, times 2 db 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7
const pb_unpackwq1, times 1 db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
const pb_unpackwq2, times 1 db 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7
const pb_shuf8x8c, times 1 db 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6
const pb_movemask, times 16 db 0x00
times 16 db 0xFF
const pb_movemask_32, times 32 db 0x00
times 32 db 0xFF
times 32 db 0x00
const pb_0000000000000F0F, times 2 db 0xff, 0x00
times 12 db 0x00
const pb_000000000000000F, db 0xff
times 15 db 0x00
;; 16-bit constants
const pw_1, times 16 dw 1
const pw_2, times 16 dw 2
const pw_3, times 16 dw 3
const pw_7, times 16 dw 7
const pw_m2, times 8 dw -2
const pw_4, times 8 dw 4
const pw_8, times 8 dw 8
const pw_16, times 16 dw 16
const pw_15, times 16 dw 15
const pw_31, times 16 dw 31
const pw_32, times 16 dw 32
const pw_64, times 8 dw 64
const pw_128, times 16 dw 128
const pw_256, times 16 dw 256
const pw_257, times 16 dw 257
const pw_512, times 16 dw 512
const pw_1023, times 16 dw 1023
const pw_1024, times 16 dw 1024
const pw_2048, times 16 dw 2048
const pw_4096, times 16 dw 4096
const pw_8192, times 8 dw 8192
const pw_00ff, times 16 dw 0x00ff
const pw_ff00, times 8 dw 0xff00
const pw_2000, times 16 dw 0x2000
const pw_8000, times 8 dw 0x8000
const pw_3fff, times 8 dw 0x3fff
const pw_32_0, times 4 dw 32,
times 4 dw 0
const pw_pixel_max, times 16 dw ((1 << BIT_DEPTH)-1)
const pw_0_15, times 2 dw 0, 1, 2, 3, 4, 5, 6, 7
const pw_ppppmmmm, times 1 dw 1, 1, 1, 1, -1, -1, -1, -1
const pw_ppmmppmm, times 1 dw 1, 1, -1, -1, 1, 1, -1, -1
const pw_pmpmpmpm, times 16 dw 1, -1, 1, -1, 1, -1, 1, -1
const pw_pmmpzzzz, times 1 dw 1, -1, -1, 1, 0, 0, 0, 0
const multi_2Row, times 1 dw 1, 2, 3, 4, 1, 2, 3, 4
const multiH, times 1 dw 9, 10, 11, 12, 13, 14, 15, 16
const multiH3, times 1 dw 25, 26, 27, 28, 29, 30, 31, 32
const multiL, times 1 dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
const multiH2, times 1 dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
const pw_planar16_mul, times 1 dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
const pw_planar32_mul, times 1 dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
const pw_FFFFFFFFFFFFFFF0, dw 0x00
times 7 dw 0xff
const hmul_16p, times 16 db 1
times 8 db 1, -1
;; 32-bit constants
const pd_1, times 8 dd 1
const pd_2, times 8 dd 2
const pd_4, times 4 dd 4
const pd_8, times 4 dd 8
const pd_16, times 8 dd 16
const pd_31, times 4 dd 31
const pd_32, times 8 dd 32
const pd_64, times 4 dd 64
const pd_128, times 4 dd 128
const pd_256, times 4 dd 256
const pd_512, times 4 dd 512
const pd_1024, times 4 dd 1024
const pd_2048, times 4 dd 2048
const pd_ffff, times 4 dd 0xffff
const pd_32767, times 4 dd 32767
const pd_524416, times 4 dd 524416
const pd_n32768, times 8 dd 0xffff8000
const pd_n131072, times 4 dd 0xfffe0000
const trans8_shuf, times 1 dd 0, 4, 1, 5, 2, 6, 3, 7
const popcnt_table
%assign x 0
%rep 256
; population count
db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
%assign x x+1
%endrep
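
A minimal C sketch of how a byte-indexed popcount table like the one generated above is typically consumed, one lookup per byte (the exported symbol name is assumed from private_prefix):

#include <stdint.h>

extern const uint8_t x265_popcnt_table[256]; /* name assumed */

/* Count set bits in a 32-bit value via four table lookups. */
static inline int popcount32(uint32_t x)
{
    return x265_popcnt_table[x & 0xff]
         + x265_popcnt_table[(x >>  8) & 0xff]
         + x265_popcnt_table[(x >> 16) & 0xff]
         + x265_popcnt_table[(x >> 24) & 0xff];
}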

@@ -0,0 +1,197 @@
;*****************************************************************************
;* cpu-a.asm: x86 cpu utilities
;*****************************************************************************
;* Copyright (C) 2003-2013 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;* Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************
%include "x86inc.asm"
SECTION .text
;-----------------------------------------------------------------------------
; void cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
cglobal cpu_cpuid, 5,7
push rbx
push r4
push r3
push r2
push r1
mov eax, r0d
xor ecx, ecx
cpuid
pop r4
mov [r4], eax
pop r4
mov [r4], ebx
pop r4
mov [r4], ecx
pop r4
mov [r4], edx
pop rbx
RET
;-----------------------------------------------------------------------------
; void cpu_xgetbv( int op, int *eax, int *edx )
;-----------------------------------------------------------------------------
cglobal cpu_xgetbv, 3,7
push r2
push r1
mov ecx, r0d
xgetbv
pop r4
mov [r4], eax
pop r4
mov [r4], edx
RET
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void stack_align( void (*func)(void*), void *arg );
;-----------------------------------------------------------------------------
cglobal stack_align
push rbp
mov rbp, rsp
%if WIN64
sub rsp, 32 ; shadow space
%endif
and rsp, ~31
mov rax, r0
mov r0, r1
mov r1, r2
mov r2, r3
call rax
leave
ret
%else
;-----------------------------------------------------------------------------
; int cpu_cpuid_test( void )
; return 0 if unsupported
;-----------------------------------------------------------------------------
cglobal cpu_cpuid_test
pushfd
push ebx
push ebp
push esi
push edi
pushfd
pop eax
mov ebx, eax
xor eax, 0x200000
push eax
popfd
pushfd
pop eax
xor eax, ebx
pop edi
pop esi
pop ebp
pop ebx
popfd
ret
cglobal stack_align
push ebp
mov ebp, esp
sub esp, 12
and esp, ~31
mov ecx, [ebp+8]
mov edx, [ebp+12]
mov [esp], edx
mov edx, [ebp+16]
mov [esp+4], edx
mov edx, [ebp+20]
mov [esp+8], edx
call ecx
leave
ret
%endif
;-----------------------------------------------------------------------------
; void cpu_emms( void )
;-----------------------------------------------------------------------------
cglobal cpu_emms
emms
ret
;-----------------------------------------------------------------------------
; void cpu_sfence( void )
;-----------------------------------------------------------------------------
cglobal cpu_sfence
sfence
ret
cextern intel_cpu_indicator_init
;-----------------------------------------------------------------------------
; void safe_intel_cpu_indicator_init( void );
;-----------------------------------------------------------------------------
cglobal safe_intel_cpu_indicator_init
push r0
push r1
push r2
push r3
push r4
push r5
push r6
%if ARCH_X86_64
push r7
push r8
push r9
push r10
push r11
push r12
push r13
push r14
%endif
push rbp
mov rbp, rsp
%if WIN64
sub rsp, 32 ; shadow space
%endif
and rsp, ~31
call intel_cpu_indicator_init
leave
%if ARCH_X86_64
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop r8
pop r7
%endif
pop r6
pop r5
pop r4
pop r3
pop r2
pop r1
pop r0
ret
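
A hedged sketch of the C side of cpu_cpuid above, following the signature in its comment block (the x265_ prefix and the exact integer types are assumptions):

#include <stdint.h>

extern void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx,
                           uint32_t *ecx, uint32_t *edx);

/* Query leaf 1 and test the SSE4.2 feature bit. */
static int has_sse42(void)
{
    uint32_t eax, ebx, ecx, edx;
    x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
    return (ecx >> 20) & 1; /* CPUID.1:ECX bit 20 = SSE4.2 */
}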

File diff suppressed because it is too large

@@ -0,0 +1,45 @@
/*****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Nabajit Deka <nabajit@multicorewareinc.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#ifndef X265_DCT8_H
#define X265_DCT8_H
FUNCDEF_TU_S2(void, dct, sse2, const int16_t* src, int16_t* dst, intptr_t srcStride);
FUNCDEF_TU_S2(void, dct, ssse3, const int16_t* src, int16_t* dst, intptr_t srcStride);
FUNCDEF_TU_S2(void, dct, sse4, const int16_t* src, int16_t* dst, intptr_t srcStride);
FUNCDEF_TU_S2(void, dct, avx2, const int16_t* src, int16_t* dst, intptr_t srcStride);
FUNCDEF_TU_S2(void, idct, sse2, const int16_t* src, int16_t* dst, intptr_t dstStride);
FUNCDEF_TU_S2(void, idct, ssse3, const int16_t* src, int16_t* dst, intptr_t dstStride);
FUNCDEF_TU_S2(void, idct, sse4, const int16_t* src, int16_t* dst, intptr_t dstStride);
FUNCDEF_TU_S2(void, idct, avx2, const int16_t* src, int16_t* dst, intptr_t dstStride);
void PFX(dst4_ssse3)(const int16_t* src, int16_t* dst, intptr_t srcStride);
void PFX(dst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
void PFX(idst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
void PFX(dst4_avx2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
void PFX(idst4_avx2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
void PFX(denoise_dct_sse4)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
void PFX(denoise_dct_avx2)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
#endif // ifndef X265_DCT8_H
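
A usage sketch, assuming FUNCDEF_TU_S2 instantiates one prototype per transform size so that x265_dct4_sse2 and x265_idct4_sse2 exist:

#include <stdint.h>

extern void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
extern void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);

/* Forward then inverse 4x4 transform; strides are in int16_t units. */
static void round_trip_4x4(const int16_t residual[16], int16_t coeff[16],
                           int16_t recon[16])
{
    x265_dct4_sse2(residual, coeff, 4);
    x265_idct4_sse2(coeff, recon, 4);
}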

@@ -0,0 +1,93 @@
/*****************************************************************************
* intrapred.h: Intra Prediction metrics
*****************************************************************************
* Copyright (C) 2003-2013 x264 project
*
* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#ifndef X265_INTRAPRED_H
#define X265_INTRAPRED_H
#define DECL_ANG(bsize, mode, cpu) \
void PFX(intra_pred_ang ## bsize ## _ ## mode ## _ ## cpu)(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
#define DECL_ANGS(bsize, cpu) \
DECL_ANG(bsize, 2, cpu); \
DECL_ANG(bsize, 3, cpu); \
DECL_ANG(bsize, 4, cpu); \
DECL_ANG(bsize, 5, cpu); \
DECL_ANG(bsize, 6, cpu); \
DECL_ANG(bsize, 7, cpu); \
DECL_ANG(bsize, 8, cpu); \
DECL_ANG(bsize, 9, cpu); \
DECL_ANG(bsize, 10, cpu); \
DECL_ANG(bsize, 11, cpu); \
DECL_ANG(bsize, 12, cpu); \
DECL_ANG(bsize, 13, cpu); \
DECL_ANG(bsize, 14, cpu); \
DECL_ANG(bsize, 15, cpu); \
DECL_ANG(bsize, 16, cpu); \
DECL_ANG(bsize, 17, cpu); \
DECL_ANG(bsize, 18, cpu); \
DECL_ANG(bsize, 19, cpu); \
DECL_ANG(bsize, 20, cpu); \
DECL_ANG(bsize, 21, cpu); \
DECL_ANG(bsize, 22, cpu); \
DECL_ANG(bsize, 23, cpu); \
DECL_ANG(bsize, 24, cpu); \
DECL_ANG(bsize, 25, cpu); \
DECL_ANG(bsize, 26, cpu); \
DECL_ANG(bsize, 27, cpu); \
DECL_ANG(bsize, 28, cpu); \
DECL_ANG(bsize, 29, cpu); \
DECL_ANG(bsize, 30, cpu); \
DECL_ANG(bsize, 31, cpu); \
DECL_ANG(bsize, 32, cpu); \
DECL_ANG(bsize, 33, cpu); \
DECL_ANG(bsize, 34, cpu)
#define DECL_ALL(cpu) \
FUNCDEF_TU(void, all_angs_pred, cpu, pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); \
FUNCDEF_TU(void, intra_filter, cpu, const pixel *samples, pixel *filtered); \
DECL_ANGS(4, cpu); \
DECL_ANGS(8, cpu); \
DECL_ANGS(16, cpu); \
DECL_ANGS(32, cpu)
FUNCDEF_TU_S2(void, intra_pred_dc, sse2, pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
FUNCDEF_TU_S2(void, intra_pred_dc, sse4, pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
FUNCDEF_TU_S2(void, intra_pred_dc, avx2, pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
FUNCDEF_TU_S2(void, intra_pred_planar, sse2, pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
FUNCDEF_TU_S2(void, intra_pred_planar, sse4, pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
FUNCDEF_TU_S2(void, intra_pred_planar, avx2, pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
DECL_ALL(sse2);
DECL_ALL(ssse3);
DECL_ALL(sse4);
DECL_ALL(avx2);
#undef DECL_ALL
#undef DECL_ANGS
#undef DECL_ANG
#endif // ifndef X265_INTRAPRED_H
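
Each DECL_ANG(bsize, mode, cpu) line declares one angular predictor; since PFX pastes the x265_ prefix, DECL_ANG(8, 3, sse4), for example, expands to:

void x265_intra_pred_ang8_3_sse4(pixel* dst, intptr_t dstStride,
                                 const pixel* srcPix, int dirMode, int bFilter);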

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,49 @@
/*****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve@borho.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#ifndef X265_IPFILTER8_H
#define X265_IPFILTER8_H
#define SETUP_FUNC_DEF(cpu) \
FUNCDEF_PU(void, interp_8tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_PU(void, interp_8tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
FUNCDEF_PU(void, interp_8tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_PU(void, interp_8tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_PU(void, interp_8tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); \
FUNCDEF_CHROMA_PU(void, filterPixelToShort, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
FUNCDEF_CHROMA_PU(void, interp_4tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_CHROMA_PU(void, interp_4tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
FUNCDEF_CHROMA_PU(void, interp_4tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
SETUP_FUNC_DEF(sse2);
SETUP_FUNC_DEF(ssse3);
SETUP_FUNC_DEF(sse3);
SETUP_FUNC_DEF(sse4);
SETUP_FUNC_DEF(avx2);
#endif // ifndef X265_IPFILTER8_H
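
Assuming FUNCDEF_PU instantiates one prototype per luma partition size (its definition is not shown here), SETUP_FUNC_DEF(sse2) declares functions along the lines of:

/* Hypothetical 16x16 instance of the 8-tap horizontal luma filter. */
void x265_interp_8tap_horiz_pp_16x16_sse2(const pixel* src, intptr_t srcStride,
                                          pixel* dst, intptr_t dstStride,
                                          int coeffIdx);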

File diff suppressed because it is too large

@@ -0,0 +1,48 @@
/*****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#ifndef X265_LOOPFILTER_H
#define X265_LOOPFILTER_H
#define DECL_SAO(cpu) \
void PFX(saoCuOrgE0_ ## cpu)(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride); \
void PFX(saoCuOrgE1_ ## cpu)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width); \
void PFX(saoCuOrgE1_2Rows_ ## cpu)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width); \
void PFX(saoCuOrgE2_ ## cpu)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride); \
void PFX(saoCuOrgE2_32_ ## cpu)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride); \
void PFX(saoCuOrgE3_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
DECL_SAO(sse4);
DECL_SAO(avx2);
#endif // ifndef X265_LOOPFILTER_H
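
Since PFX pastes the x265_ prefix onto each name, DECL_SAO(sse4) expands the first line of the macro to:

void x265_saoCuOrgE0_sse4(pixel* rec, int8_t* offsetEo, int endX,
                          int8_t* signLeft, intptr_t stride);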

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,39 @@
/*****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve@borho.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#ifndef X265_MC_H
#define X265_MC_H
#define LOWRES(cpu) \
void PFX(frame_init_lowres_core_ ## cpu)(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc, \
intptr_t src_stride, intptr_t dst_stride, int width, int height);
LOWRES(mmx2)
LOWRES(sse2)
LOWRES(ssse3)
LOWRES(avx)
LOWRES(avx2)
LOWRES(xop)
#undef LOWRES
#endif // ifndef X265_MC_H
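
With PFX supplying the x265_ prefix, LOWRES(sse2) declares:

void x265_frame_init_lowres_core_sse2(const pixel* src0, pixel* dst0, pixel* dsth,
                                      pixel* dstv, pixel* dstc, intptr_t src_stride,
                                      intptr_t dst_stride, int width, int height);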

@@ -0,0 +1,420 @@
;*****************************************************************************
;* pixel-32.asm: x86_32 pixel metrics
;*****************************************************************************
;* Copyright (C) 2003-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Laurent Aimar <fenrir@via.ecp.fr>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"
cextern pw_ppmmppmm
cextern pw_pmpmpmpm
SECTION .text
INIT_MMX mmx2
%macro LOAD_DIFF_4x8P 1 ; dx
LOAD_DIFF m0, m7, none, [r0+%1], [r2+%1]
LOAD_DIFF m1, m6, none, [r0+%1+r1], [r2+%1+r3]
LOAD_DIFF m2, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
LOAD_DIFF m3, m6, none, [r0+%1+r4], [r2+%1+r5]
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
LOAD_DIFF m4, m7, none, [r0+%1], [r2+%1]
LOAD_DIFF m5, m6, none, [r0+%1+r1], [r2+%1+r3]
LOAD_DIFF m6, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
movq [spill], m5
LOAD_DIFF m7, m5, none, [r0+%1+r4], [r2+%1+r5]
movq m5, [spill]
%endmacro
%macro SUM4x8_MM 0
movq [spill], m6
movq [spill+8], m7
ABSW2 m0, m1, m0, m1, m6, m7
ABSW2 m2, m3, m2, m3, m6, m7
paddw m0, m2
paddw m1, m3
movq m6, [spill]
movq m7, [spill+8]
ABSW2 m4, m5, m4, m5, m2, m3
ABSW2 m6, m7, m6, m7, m2, m3
paddw m4, m6
paddw m5, m7
paddw m0, m4
paddw m1, m5
paddw m0, m1
%endmacro
;-----------------------------------------------------------------------------
; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sa8d_8x8_internal
push r0
push r2
sub esp, 0x74
%define args esp+0x74
%define spill esp+0x60 ; +16
%define trans esp+0 ; +96
LOAD_DIFF_4x8P 0
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movq [spill], m1
TRANSPOSE4x4W 4, 5, 6, 7, 1
movq [trans+0x00], m4
movq [trans+0x08], m5
movq [trans+0x10], m6
movq [trans+0x18], m7
movq m1, [spill]
TRANSPOSE4x4W 0, 1, 2, 3, 4
movq [trans+0x20], m0
movq [trans+0x28], m1
movq [trans+0x30], m2
movq [trans+0x38], m3
mov r0, [args+4]
mov r2, [args]
LOAD_DIFF_4x8P 4
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movq [spill], m7
TRANSPOSE4x4W 0, 1, 2, 3, 7
movq [trans+0x40], m0
movq [trans+0x48], m1
movq [trans+0x50], m2
movq [trans+0x58], m3
movq m7, [spill]
TRANSPOSE4x4W 4, 5, 6, 7, 1
movq m0, [trans+0x00]
movq m1, [trans+0x08]
movq m2, [trans+0x10]
movq m3, [trans+0x18]
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
SUM4x8_MM
movq [trans], m0
movq m0, [trans+0x20]
movq m1, [trans+0x28]
movq m2, [trans+0x30]
movq m3, [trans+0x38]
movq m4, [trans+0x40]
movq m5, [trans+0x48]
movq m6, [trans+0x50]
movq m7, [trans+0x58]
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
SUM4x8_MM
pavgw m0, [trans]
add esp, 0x7c
ret
%undef args
%undef spill
%undef trans
%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
pxor %7, %7
pshufw %4, %1, q1032
pshufw %5, %2, q1032
pshufw %6, %3, q1032
paddusw %1, %4
paddusw %2, %5
paddusw %3, %6
punpcklwd %1, %7
punpcklwd %2, %7
punpcklwd %3, %7
pshufw %4, %1, q1032
pshufw %5, %2, q1032
pshufw %6, %3, q1032
%8 %1, %4
%8 %2, %5
%8 %3, %6
%endmacro
%macro LOAD_4x8P 1 ; dx
pxor m7, m7
movd m6, [r0+%1+7*FENC_STRIDE]
movd m0, [r0+%1+0*FENC_STRIDE]
movd m1, [r0+%1+1*FENC_STRIDE]
movd m2, [r0+%1+2*FENC_STRIDE]
movd m3, [r0+%1+3*FENC_STRIDE]
movd m4, [r0+%1+4*FENC_STRIDE]
movd m5, [r0+%1+5*FENC_STRIDE]
punpcklbw m6, m7
punpcklbw m0, m7
punpcklbw m1, m7
movq [spill], m6
punpcklbw m2, m7
punpcklbw m3, m7
movd m6, [r0+%1+6*FENC_STRIDE]
punpcklbw m4, m7
punpcklbw m5, m7
punpcklbw m6, m7
movq m7, [spill]
%endmacro
%macro HSUMSUB2 4
pshufw m4, %1, %3
pshufw m5, %2, %3
pmullw %1, %4
pmullw m5, %4
paddw %1, m4
paddw %2, m5
%endmacro
;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;-----------------------------------------------------------------------------
cglobal intra_sa8d_x3_8x8, 2,3
SUB esp, 0x94
%define edge esp+0x70 ; +32
%define spill esp+0x60 ; +16
%define trans esp+0 ; +96
%define sum esp+0 ; +32
pxor m7, m7
movq m0, [r1+7]
movq m2, [r1+16]
movq m1, m0
movq m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
movq m6, [pw_ppmmppmm]
HSUMSUB2 m0, m2, q1032, m6
HSUMSUB2 m1, m3, q1032, m6
movq m6, [pw_pmpmpmpm]
HSUMSUB2 m0, m2, q2301, m6
HSUMSUB2 m1, m3, q2301, m6
movq m4, m0
movq m5, m2
paddw m0, m1
paddw m2, m3
psubw m4, m1
psubw m3, m5
movq [edge+0], m0
movq [edge+8], m4
movq [edge+16], m2
movq [edge+24], m3
LOAD_4x8P 0
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movq [spill], m0
TRANSPOSE4x4W 4, 5, 6, 7, 0
movq [trans+0x00], m4
movq [trans+0x08], m5
movq [trans+0x10], m6
movq [trans+0x18], m7
movq m0, [spill]
TRANSPOSE4x4W 0, 1, 2, 3, 4
movq [trans+0x20], m0
movq [trans+0x28], m1
movq [trans+0x30], m2
movq [trans+0x38], m3
LOAD_4x8P 4
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movq [spill], m7
TRANSPOSE4x4W 0, 1, 2, 3, 7
movq [trans+0x40], m0
movq [trans+0x48], m1
movq [trans+0x50], m2
movq [trans+0x58], m3
movq m7, [spill]
TRANSPOSE4x4W 4, 5, 6, 7, 0
movq m0, [trans+0x00]
movq m1, [trans+0x08]
movq m2, [trans+0x10]
movq m3, [trans+0x18]
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movq [spill+0], m0
movq [spill+8], m1
ABSW2 m2, m3, m2, m3, m0, m1
ABSW2 m4, m5, m4, m5, m0, m1
paddw m2, m4
paddw m3, m5
ABSW2 m6, m7, m6, m7, m4, m5
movq m0, [spill+0]
movq m1, [spill+8]
paddw m2, m6
paddw m3, m7
paddw m2, m3
ABSW m1, m1, m4
paddw m2, m1 ; 7x4 sum
movq m7, m0
movq m1, [edge+8] ; left bottom
psllw m1, 3
psubw m7, m1
ABSW2 m0, m7, m0, m7, m5, m3
paddw m0, m2
paddw m7, m2
movq [sum+0], m0 ; dc
movq [sum+8], m7 ; left
movq m0, [trans+0x20]
movq m1, [trans+0x28]
movq m2, [trans+0x30]
movq m3, [trans+0x38]
movq m4, [trans+0x40]
movq m5, [trans+0x48]
movq m6, [trans+0x50]
movq m7, [trans+0x58]
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movd [sum+0x10], m0
movd [sum+0x12], m1
movd [sum+0x14], m2
movd [sum+0x16], m3
movd [sum+0x18], m4
movd [sum+0x1a], m5
movd [sum+0x1c], m6
movd [sum+0x1e], m7
movq [spill], m0
movq [spill+8], m1
ABSW2 m2, m3, m2, m3, m0, m1
ABSW2 m4, m5, m4, m5, m0, m1
paddw m2, m4
paddw m3, m5
paddw m2, m3
movq m0, [spill]
movq m1, [spill+8]
ABSW2 m6, m7, m6, m7, m4, m5
ABSW m1, m1, m3
paddw m2, m7
paddw m1, m6
paddw m2, m1 ; 7x4 sum
movq m1, m0
movq m7, [edge+0]
psllw m7, 3 ; left top
mov r2, [edge+0]
add r2, [edge+16]
lea r2, [4*r2+32]
and r2, 0xffc0
movd m6, r2 ; dc
psubw m1, m7
psubw m0, m6
ABSW2 m0, m1, m0, m1, m5, m6
movq m3, [sum+0] ; dc
paddw m0, m2
paddw m1, m2
movq m2, m0
paddw m0, m3
paddw m1, [sum+8] ; h
psrlq m2, 16
paddw m2, m3
movq m3, [edge+16] ; top left
movq m4, [edge+24] ; top right
psllw m3, 3
psllw m4, 3
psubw m3, [sum+16]
psubw m4, [sum+24]
ABSW2 m3, m4, m3, m4, m5, m6
paddw m2, m3
paddw m2, m4 ; v
SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, pavgw
mov r2, r2m
pxor m7, m7
punpckldq m2, m1
pavgw m0, m7
pavgw m2, m7
movd [r2+8], m0 ; dc
movq [r2+0], m2 ; v, h
ADD esp, 0x94
RET
%undef edge
%undef spill
%undef trans
%undef sum
;-----------------------------------------------------------------------------
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
; const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
cglobal pixel_ssim_4x4x2_core, 0,5
mov r1, r1m
mov r3, r3m
mov r4, 4
pxor m0, m0
.loop:
mov r0, r0m
mov r2, r2m
add r0, r4
add r2, r4
pxor m1, m1
pxor m2, m2
pxor m3, m3
pxor m4, m4
%rep 4
movd m5, [r0]
movd m6, [r2]
punpcklbw m5, m0
punpcklbw m6, m0
paddw m1, m5
paddw m2, m6
movq m7, m5
pmaddwd m5, m5
pmaddwd m7, m6
pmaddwd m6, m6
paddd m3, m5
paddd m4, m7
paddd m3, m6
add r0, r1
add r2, r3
%endrep
mov r0, r4m
lea r0, [r0+r4*4]
pshufw m5, m1, q0032
pshufw m6, m2, q0032
paddusw m1, m5
paddusw m2, m6
punpcklwd m1, m2
pshufw m2, m1, q0032
pshufw m5, m3, q0032
pshufw m6, m4, q0032
paddusw m1, m2
paddd m3, m5
paddd m4, m6
punpcklwd m1, m0
punpckldq m3, m4
movq [r0+0], m1
movq [r0+8], m3
sub r4, 4
jge .loop
emms
RET
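
For reference, a scalar C model of what pixel_ssim_4x4x2_core computes, mirroring the C fallback in x264/x265 (a sketch; only the MMX version appears in this diff). For each of two horizontally adjacent 4x4 blocks it accumulates the sums the SSIM formula needs: sum(a), sum(b), sum(a^2)+sum(b^2), and sum(a*b).

#include <stdint.h>

static void ssim_4x4x2_core_c(const uint8_t *pix1, intptr_t stride1,
                              const uint8_t *pix2, intptr_t stride2,
                              int sums[2][4])
{
    for (int z = 0; z < 2; z++)
    {
        int s1 = 0, s2 = 0, ss = 0, s12 = 0;
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
            {
                int a = pix1[x + y * stride1];
                int b = pix2[x + y * stride2];
                s1  += a;              /* sum of first block's pixels  */
                s2  += b;              /* sum of second block's pixels */
                ss  += a * a + b * b;  /* sum of squares               */
                s12 += a * b;          /* cross term                   */
            }
        sums[z][0] = s1;
        sums[z][1] = s2;
        sums[z][2] = ss;
        sums[z][3] = s12;
        pix1 += 4;  /* step to the adjacent 4x4 block */
        pix2 += 4;
    }
}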

File diff suppressed because it is too large

@@ -0,0 +1,59 @@
/*****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve@borho.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#ifndef X265_PIXEL_UTIL_H
#define X265_PIXEL_UTIL_H
#define DEFINE_UTILS(cpu) \
FUNCDEF_TU_S2(void, getResidual, cpu, const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); \
FUNCDEF_TU_S2(void, transpose, cpu, pixel* dest, const pixel* src, intptr_t stride); \
FUNCDEF_TU(int, count_nonzero, cpu, const int16_t* quantCoeff); \
uint32_t PFX(quant_ ## cpu(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)); \
uint32_t PFX(nquant_ ## cpu(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)); \
void PFX(dequant_normal_ ## cpu(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)); \
void PFX(dequant_scaling_## cpu(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)); \
void PFX(weight_pp_ ## cpu(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)); \
void PFX(weight_sp_ ## cpu(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)); \
void PFX(scale1D_128to64_ ## cpu(pixel*, const pixel*)); \
void PFX(scale2D_64to32_ ## cpu(pixel*, const pixel*, intptr_t)); \
uint32_t PFX(costCoeffRemain_ ## cpu(uint16_t *absCoeff, int numNonZero, int idx)); \
uint32_t PFX(costC1C2Flag_sse2(uint16_t *absCoeff, intptr_t numNonZero, uint8_t *baseCtxMod, intptr_t ctxOffset));
DEFINE_UTILS(sse2);
DEFINE_UTILS(ssse3);
DEFINE_UTILS(sse4);
DEFINE_UTILS(avx2);
#undef DEFINE_UTILS
void PFX(pixel_ssim_4x4x2_core_sse2(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]));
void PFX(pixel_ssim_4x4x2_core_avx(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]));
float PFX(pixel_ssim_end4_sse2(int sum0[5][4], int sum1[5][4], int width));
float PFX(pixel_ssim_end4_avx(int sum0[5][4], int sum1[5][4], int width));
int PFX(scanPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize));
int PFX(scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize));
uint32_t PFX(findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]));
uint32_t PFX(costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
#endif // ifndef X265_PIXEL_UTIL_H
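
A note on the PFX(name_ ## cpu(args)) style used above: the ## paste happens first, then PFX prefixes the resulting token, so for cpu=sse2 the quant line expands to:

uint32_t x265_quant_sse2(const int16_t* coef, const int32_t* quantCoeff,
                         int32_t* deltaU, int16_t* qCoef, int qBits, int add,
                         int numCoeff);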

File diff suppressed because it is too large

@@ -0,0 +1,69 @@
/*****************************************************************************
* pixel.h: x86 pixel metrics
*****************************************************************************
* Copyright (C) 2003-2013 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
* Fiona Glaser <fiona@x264.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#ifndef X265_I386_PIXEL_H
#define X265_I386_PIXEL_H
void PFX(downShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
void PFX(downShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
void PFX(upShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
void PFX(upShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
pixel PFX(planeClipAndMax_avx2)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
#define DECL_PIXELS(cpu) \
FUNCDEF_PU(uint32_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
FUNCDEF_PU(int, pixel_sa8d, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
FUNCDEF_PU(void, pixel_sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
FUNCDEF_PU(void, pixel_avg, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
FUNCDEF_PU(void, pixel_add_ps, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
FUNCDEF_CHROMA_PU(uint32_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
FUNCDEF_CHROMA_PU(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
FUNCDEF_TU_S(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); \
FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
DECL_PIXELS(mmx);
DECL_PIXELS(mmx2);
DECL_PIXELS(sse2);
DECL_PIXELS(sse3);
DECL_PIXELS(sse4);
DECL_PIXELS(ssse3);
DECL_PIXELS(avx);
DECL_PIXELS(xop);
DECL_PIXELS(avx2);
#undef DECL_PIXELS
#endif // ifndef X265_I386_PIXEL_H
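
Assuming FUNCDEF_CHROMA_PU stamps out one prototype per partition size, DECL_PIXELS(sse2) yields SAD declarations such as:

/* Hypothetical 16x16 instance. */
int x265_pixel_sad_16x16_sse2(const pixel*, intptr_t, const pixel*, intptr_t);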

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,893 @@
;*****************************************************************************
;* x86util.asm: x86 utility macros
;*****************************************************************************
;* Copyright (C) 2008-2013 x264 project
;*
;* Authors: Holger Lubitz <holger@lubitz.org>
;* Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************
%assign FENC_STRIDE 64
%assign FDEC_STRIDE 32
%assign SIZEOF_PIXEL 1
%assign SIZEOF_DCTCOEF 2
%define pixel byte
%define vpbroadcastdct vpbroadcastw
%define vpbroadcastpix vpbroadcastb
%if HIGH_BIT_DEPTH
%assign SIZEOF_PIXEL 2
%assign SIZEOF_DCTCOEF 4
%define pixel word
%define vpbroadcastdct vpbroadcastd
%define vpbroadcastpix vpbroadcastw
%endif
%assign FENC_STRIDEB SIZEOF_PIXEL*FENC_STRIDE
%assign FDEC_STRIDEB SIZEOF_PIXEL*FDEC_STRIDE
%assign PIXEL_MAX ((1 << BIT_DEPTH)-1)
%macro FIX_STRIDES 1-*
%if HIGH_BIT_DEPTH
%rep %0
add %1, %1
%rotate 1
%endrep
%endif
%endmacro
%macro SBUTTERFLY 4
%ifidn %1, dqqq
vperm2i128 m%4, m%2, m%3, q0301 ; punpckh
vinserti128 m%2, m%2, xm%3, 1 ; punpckl
%elif avx_enabled && mmsize >= 16
punpckh%1 m%4, m%2, m%3
punpckl%1 m%2, m%3
%else
mova m%4, m%2
punpckl%1 m%2, m%3
punpckh%1 m%4, m%3
%endif
SWAP %3, %4
%endmacro
%macro SBUTTERFLY2 4
punpckl%1 m%4, m%2, m%3
punpckh%1 m%2, m%2, m%3
SWAP %2, %4, %3
%endmacro
%macro TRANSPOSE4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
SBUTTERFLY dq, %1, %3, %5
SBUTTERFLY dq, %2, %4, %5
SWAP %2, %3
%endmacro
%macro TRANSPOSE2x4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
SBUTTERFLY dq, %1, %3, %5
SBUTTERFLY dq, %2, %4, %5
SBUTTERFLY qdq, %1, %2, %5
SBUTTERFLY qdq, %3, %4, %5
%endmacro
%macro TRANSPOSE4x4D 5
SBUTTERFLY dq, %1, %2, %5
SBUTTERFLY dq, %3, %4, %5
SBUTTERFLY qdq, %1, %3, %5
SBUTTERFLY qdq, %2, %4, %5
SWAP %2, %3
%endmacro
%macro TRANSPOSE8x8W 9-11
%if ARCH_X86_64
SBUTTERFLY wd, %1, %2, %9
SBUTTERFLY wd, %3, %4, %9
SBUTTERFLY wd, %5, %6, %9
SBUTTERFLY wd, %7, %8, %9
SBUTTERFLY dq, %1, %3, %9
SBUTTERFLY dq, %2, %4, %9
SBUTTERFLY dq, %5, %7, %9
SBUTTERFLY dq, %6, %8, %9
SBUTTERFLY qdq, %1, %5, %9
SBUTTERFLY qdq, %2, %6, %9
SBUTTERFLY qdq, %3, %7, %9
SBUTTERFLY qdq, %4, %8, %9
SWAP %2, %5
SWAP %4, %7
%else
; in: m0..m7, unless %11 in which case m6 is in %9
; out: m0..m7, unless %11 in which case m4 is in %10
; spills into %9 and %10
%if %0<11
movdqa %9, m%7
%endif
SBUTTERFLY wd, %1, %2, %7
movdqa %10, m%2
movdqa m%7, %9
SBUTTERFLY wd, %3, %4, %2
SBUTTERFLY wd, %5, %6, %2
SBUTTERFLY wd, %7, %8, %2
SBUTTERFLY dq, %1, %3, %2
movdqa %9, m%3
movdqa m%2, %10
SBUTTERFLY dq, %2, %4, %3
SBUTTERFLY dq, %5, %7, %3
SBUTTERFLY dq, %6, %8, %3
SBUTTERFLY qdq, %1, %5, %3
SBUTTERFLY qdq, %2, %6, %3
movdqa %10, m%2
movdqa m%3, %9
SBUTTERFLY qdq, %3, %7, %2
SBUTTERFLY qdq, %4, %8, %2
SWAP %2, %5
SWAP %4, %7
%if %0<11
movdqa m%5, %10
%endif
%endif
%endmacro
%macro WIDEN_SXWD 2
punpckhwd m%2, m%1
psrad m%2, 16
%if cpuflag(sse4)
pmovsxwd m%1, m%1
%else
punpcklwd m%1, m%1
psrad m%1, 16
%endif
%endmacro
%macro ABSW 2-3 ; dst, src, tmp (tmp used only if dst==src)
%if cpuflag(ssse3)
pabsw %1, %2
%elifidn %3, sign ; version for pairing with PSIGNW: modifies src
pxor %1, %1
pcmpgtw %1, %2
pxor %2, %1
psubw %2, %1
SWAP %1, %2
%elifidn %1, %2
pxor %3, %3
psubw %3, %1
pmaxsw %1, %3
%elifid %2
pxor %1, %1
psubw %1, %2
pmaxsw %1, %2
%elif %0 == 2
pxor %1, %1
psubw %1, %2
pmaxsw %1, %2
%else
mova %1, %2
pxor %3, %3
psubw %3, %1
pmaxsw %1, %3
%endif
%endmacro
%macro ABSW2 6 ; dst1, dst2, src1, src2, tmp, tmp
%if cpuflag(ssse3)
pabsw %1, %3
pabsw %2, %4
%elifidn %1, %3
pxor %5, %5
pxor %6, %6
psubw %5, %1
psubw %6, %2
pmaxsw %1, %5
pmaxsw %2, %6
%else
pxor %1, %1
pxor %2, %2
psubw %1, %3
psubw %2, %4
pmaxsw %1, %3
pmaxsw %2, %4
%endif
%endmacro
%macro ABSB 2
%if cpuflag(ssse3)
pabsb %1, %1
%else
pxor %2, %2
psubb %2, %1
pminub %1, %2
%endif
%endmacro
%macro ABSD 2-3
%if cpuflag(ssse3)
pabsd %1, %2
%else
%define %%s %2
%if %0 == 3
mova %3, %2
%define %%s %3
%endif
pxor %1, %1
pcmpgtd %1, %%s
pxor %%s, %1
psubd %%s, %1
SWAP %1, %%s
%endif
%endmacro
%macro PSIGN 3-4
%if cpuflag(ssse3) && %0 == 4
psign%1 %2, %3, %4
%elif cpuflag(ssse3)
psign%1 %2, %3
%elif %0 == 4
pxor %2, %3, %4
psub%1 %2, %4
%else
pxor %2, %3
psub%1 %2, %3
%endif
%endmacro
%define PSIGNW PSIGN w,
%define PSIGND PSIGN d,
%macro SPLATB_LOAD 3
%if cpuflag(ssse3)
movd %1, [%2-3]
pshufb %1, %3
%else
movd %1, [%2-3] ;to avoid crossing a cacheline
punpcklbw %1, %1
SPLATW %1, %1, 3
%endif
%endmacro
%imacro SPLATW 2-3 0
%if cpuflag(avx2) && %3 == 0
vpbroadcastw %1, %2
%else
PSHUFLW %1, %2, (%3)*q1111
%if mmsize == 16
punpcklqdq %1, %1
%endif
%endif
%endmacro
%imacro SPLATD 2-3 0
%if mmsize == 16
pshufd %1, %2, (%3)*q1111
%else
pshufw %1, %2, (%3)*q0101 + ((%3)+1)*q1010
%endif
%endmacro
%macro CLIPW 3 ;(dst, min, max)
pmaxsw %1, %2
pminsw %1, %3
%endmacro
%macro CLIPW2 4 ;(dst0, dst1, min, max)
pmaxsw %1, %3
pmaxsw %2, %3
pminsw %1, %4
pminsw %2, %4
%endmacro
%macro HADDD 2 ; sum junk
%if sizeof%1 == 32
%define %2 xmm%2
vextracti128 %2, %1, 1
%define %1 xmm%1
paddd %1, %2
%endif
%if mmsize >= 16
%if cpuflag(xop) && sizeof%1 == 16
vphadddq %1, %1
%endif
movhlps %2, %1
paddd %1, %2
%endif
%if notcpuflag(xop)
PSHUFLW %2, %1, q0032
paddd %1, %2
%endif
%undef %1
%undef %2
%endmacro
%macro HADDW 2 ; reg, tmp
%if cpuflag(xop) && sizeof%1 == 16
vphaddwq %1, %1
movhlps %2, %1
paddd %1, %2
%else
pmaddwd %1, [pw_1]
HADDD %1, %2
%endif
%endmacro
%macro HADDUWD 2
%if cpuflag(xop) && sizeof%1 == 16
vphadduwd %1, %1
%else
psrld %2, %1, 16
pslld %1, 16
psrld %1, 16
paddd %1, %2
%endif
%endmacro
%macro HADDUW 2
%if cpuflag(xop) && sizeof%1 == 16
vphadduwq %1, %1
movhlps %2, %1
paddd %1, %2
%else
HADDUWD %1, %2
HADDD %1, %2
%endif
%endmacro
%macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp
; AVX2 version uses a precalculated extra input that
; can be re-used across calls
%if sizeof%1==32
; %3 = abcdefgh ijklmnop (lower address)
; %2 = ABCDEFGH IJKLMNOP (higher address)
vperm2i128 %4, %1, %2, q0003 ; %4 = ijklmnop ABCDEFGH
%if %3 < 16
palignr %1, %4, %2, %3 ; %1 = bcdefghi jklmnopA
%else
palignr %1, %2, %4, %3-16 ; %1 = pABCDEFG HIJKLMNO
%endif
%elif cpuflag(ssse3)
%if %0==5
palignr %1, %2, %3, %4
%else
palignr %1, %2, %3
%endif
%else
%define %%dst %1
%if %0==5
%ifnidn %1, %2
mova %%dst, %2
%endif
%rotate 1
%endif
%ifnidn %4, %2
mova %4, %2
%endif
%if mmsize==8
psllq %%dst, (8-%3)*8
psrlq %4, %3*8
%else
pslldq %%dst, 16-%3
psrldq %4, %3
%endif
por %%dst, %4
%endif
%endmacro
%macro PSHUFLW 1+
%if mmsize == 8
pshufw %1
%else
pshuflw %1
%endif
%endmacro
; shift a mmxreg by n bytes, or a xmmreg by 2*n bytes
; values shifted in are undefined
; faster if dst==src
%define PSLLPIX PSXLPIX l, -1, ;dst, src, shift
%define PSRLPIX PSXLPIX r, 1, ;dst, src, shift
%macro PSXLPIX 5
%if mmsize == 8
%if %5&1
ps%1lq %3, %4, %5*8
%else
pshufw %3, %4, (q3210<<8>>(8+%2*%5))&0xff
%endif
%else
ps%1ldq %3, %4, %5*2
%endif
%endmacro
%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
%ifnum %5
pand m%3, m%5, m%4 ; src .. y6 .. y4
pand m%1, m%5, m%2 ; dst .. y6 .. y4
%else
mova m%1, %5
pand m%3, m%1, m%4 ; src .. y6 .. y4
pand m%1, m%1, m%2 ; dst .. y6 .. y4
%endif
psrlw m%2, 8 ; dst .. y7 .. y5
psrlw m%4, 8 ; src .. y7 .. y5
%endmacro
%macro SUMSUB_BA 3-4
%if %0==3
padd%1 m%2, m%3
padd%1 m%3, m%3
psub%1 m%3, m%2
%elif avx_enabled
padd%1 m%4, m%2, m%3
psub%1 m%3, m%2
SWAP %2, %4
%else
mova m%4, m%2
padd%1 m%2, m%3
psub%1 m%3, m%4
%endif
%endmacro
%macro SUMSUB_BADC 5-6
%if %0==6
SUMSUB_BA %1, %2, %3, %6
SUMSUB_BA %1, %4, %5, %6
%else
padd%1 m%2, m%3
padd%1 m%4, m%5
padd%1 m%3, m%3
padd%1 m%5, m%5
psub%1 m%3, m%2
psub%1 m%5, m%4
%endif
%endmacro
%macro HADAMARD4_V 4+
SUMSUB_BADC w, %1, %2, %3, %4
SUMSUB_BADC w, %1, %3, %2, %4
%endmacro
%macro HADAMARD8_V 8+
SUMSUB_BADC w, %1, %2, %3, %4
SUMSUB_BADC w, %5, %6, %7, %8
SUMSUB_BADC w, %1, %3, %2, %4
SUMSUB_BADC w, %5, %7, %6, %8
SUMSUB_BADC w, %1, %5, %2, %6
SUMSUB_BADC w, %3, %7, %4, %8
%endmacro
%macro TRANS_SSE2 5-6
; TRANSPOSE2x2
; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq
; %2: ord/unord (for compat with sse4, unused)
; %3/%4: source regs
; %5/%6: tmp regs
%ifidn %1, d
%define mask [mask_10]
%define shift 16
%elifidn %1, q
%define mask [mask_1100]
%define shift 32
%endif
%if %0==6 ; less dependency if we have two tmp
mova m%5, mask ; ff00
mova m%6, m%4 ; x5x4
psll%1 m%4, shift ; x4..
pand m%6, m%5 ; x5..
pandn m%5, m%3 ; ..x0
psrl%1 m%3, shift ; ..x1
por m%4, m%5 ; x4x0
por m%3, m%6 ; x5x1
%else ; more dependency, one insn less. sometimes faster, sometimes not
mova m%5, m%4 ; x5x4
psll%1 m%4, shift ; x4..
pxor m%4, m%3 ; (x4^x1)x0
pand m%4, mask ; (x4^x1)..
pxor m%3, m%4 ; x4x0
psrl%1 m%4, shift ; ..(x1^x4)
pxor m%5, m%4 ; x5x1
SWAP %4, %3, %5
%endif
%endmacro
%macro TRANS_SSE4 5-6 ; see above
%ifidn %1, d
%ifidn %2, ord
psrl%1 m%5, m%3, 16
pblendw m%5, m%4, q2222
psll%1 m%4, 16
pblendw m%4, m%3, q1111
SWAP %3, %5
%else
%if avx_enabled
pblendw m%5, m%3, m%4, q2222
SWAP %3, %5
%else
mova m%5, m%3
pblendw m%3, m%4, q2222
%endif
psll%1 m%4, 16
psrl%1 m%5, 16
por m%4, m%5
%endif
%elifidn %1, q
shufps m%5, m%3, m%4, q3131
shufps m%3, m%3, m%4, q2020
SWAP %4, %5
%endif
%endmacro
%macro TRANS_XOP 5-6
%ifidn %1, d
vpperm m%5, m%3, m%4, [transd_shuf1]
vpperm m%3, m%3, m%4, [transd_shuf2]
%elifidn %1, q
shufps m%5, m%3, m%4, q3131
shufps m%3, m%4, q2020
%endif
SWAP %4, %5
%endmacro
%macro HADAMARD 5-6
; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes)
; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes)
; %3/%4: regs
; %5(%6): tmpregs
%if %1!=0 ; have to reorder stuff for horizontal op
%ifidn %2, sumsub
%define ORDER ord
; sumsub needs order because a-b != b-a unless a=b
%else
%define ORDER unord
; if we just max, order doesn't matter (allows pblendw+or in sse4)
%endif
%if %1==1
TRANS d, ORDER, %3, %4, %5, %6
%elif %1==2
%if mmsize==8
SBUTTERFLY dq, %3, %4, %5
%else
TRANS q, ORDER, %3, %4, %5, %6
%endif
%elif %1==4
SBUTTERFLY qdq, %3, %4, %5
%elif %1==8
SBUTTERFLY dqqq, %3, %4, %5
%endif
%endif
%ifidn %2, sumsub
SUMSUB_BA w, %3, %4, %5
%else
%ifidn %2, amax
%if %0==6
ABSW2 m%3, m%4, m%3, m%4, m%5, m%6
%else
ABSW m%3, m%3, m%5
ABSW m%4, m%4, m%5
%endif
%endif
pmaxsw m%3, m%4
%endif
%endmacro
%macro HADAMARD2_2D 6-7 sumsub
HADAMARD 0, sumsub, %1, %2, %5
HADAMARD 0, sumsub, %3, %4, %5
SBUTTERFLY %6, %1, %2, %5
%ifnum %7
HADAMARD 0, amax, %1, %2, %5, %7
%else
HADAMARD 0, %7, %1, %2, %5
%endif
SBUTTERFLY %6, %3, %4, %5
%ifnum %7
HADAMARD 0, amax, %3, %4, %5, %7
%else
HADAMARD 0, %7, %3, %4, %5
%endif
%endmacro
%macro HADAMARD4_2D 5-6 sumsub
HADAMARD2_2D %1, %2, %3, %4, %5, wd
HADAMARD2_2D %1, %3, %2, %4, %5, dq, %6
SWAP %2, %3
%endmacro
%macro HADAMARD4_2D_SSE 5-6 sumsub
HADAMARD 0, sumsub, %1, %2, %5 ; 1st V row 0 + 1
HADAMARD 0, sumsub, %3, %4, %5 ; 1st V row 2 + 3
SBUTTERFLY wd, %1, %2, %5 ; %1: m0 1+0 %2: m1 1+0
SBUTTERFLY wd, %3, %4, %5 ; %3: m0 3+2 %4: m1 3+2
HADAMARD2_2D %1, %3, %2, %4, %5, dq
SBUTTERFLY qdq, %1, %2, %5
HADAMARD 0, %6, %1, %2, %5 ; 2nd H m1/m0 row 0+1
SBUTTERFLY qdq, %3, %4, %5
HADAMARD 0, %6, %3, %4, %5 ; 2nd H m1/m0 row 2+3
%endmacro
%macro HADAMARD8_2D 9-10 sumsub
HADAMARD2_2D %1, %2, %3, %4, %9, wd
HADAMARD2_2D %5, %6, %7, %8, %9, wd
HADAMARD2_2D %1, %3, %2, %4, %9, dq
HADAMARD2_2D %5, %7, %6, %8, %9, dq
HADAMARD2_2D %1, %5, %3, %7, %9, qdq, %10
HADAMARD2_2D %2, %6, %4, %8, %9, qdq, %10
%ifnidn %10, amax
SWAP %2, %5
SWAP %4, %7
%endif
%endmacro
; doesn't include the "pmaddubsw hmul_8p" pass
%macro HADAMARD8_2D_HMUL 10
HADAMARD4_V %1, %2, %3, %4, %9
HADAMARD4_V %5, %6, %7, %8, %9
SUMSUB_BADC w, %1, %5, %2, %6, %9
HADAMARD 2, sumsub, %1, %5, %9, %10
HADAMARD 2, sumsub, %2, %6, %9, %10
SUMSUB_BADC w, %3, %7, %4, %8, %9
HADAMARD 2, sumsub, %3, %7, %9, %10
HADAMARD 2, sumsub, %4, %8, %9, %10
HADAMARD 1, amax, %1, %5, %9, %10
HADAMARD 1, amax, %2, %6, %9, %5
HADAMARD 1, amax, %3, %7, %9, %5
HADAMARD 1, amax, %4, %8, %9, %5
%endmacro
%macro SUMSUB2_AB 4
%if cpuflag(xop)
pmacs%1%1 m%4, m%3, [p%1_m2], m%2
pmacs%1%1 m%2, m%2, [p%1_2], m%3
%elifnum %3
psub%1 m%4, m%2, m%3
psub%1 m%4, m%3
padd%1 m%2, m%2
padd%1 m%2, m%3
%else
mova m%4, m%2
padd%1 m%2, m%2
padd%1 m%2, %3
psub%1 m%4, %3
psub%1 m%4, %3
%endif
%endmacro
%macro SUMSUBD2_AB 5
%ifnum %4
psra%1 m%5, m%2, 1 ; %3: %3>>1
psra%1 m%4, m%3, 1 ; %2: %2>>1
padd%1 m%4, m%2 ; %3: %3>>1+%2
psub%1 m%5, m%3 ; %2: %2>>1-%3
SWAP %2, %5
SWAP %3, %4
%else
mova %5, m%2
mova %4, m%3
psra%1 m%3, 1 ; %3: %3>>1
psra%1 m%2, 1 ; %2: %2>>1
padd%1 m%3, %5 ; %3: %3>>1+%2
psub%1 m%2, %4 ; %2: %2>>1-%3
%endif
%endmacro
%macro DCT4_1D 5
%ifnum %5
SUMSUB_BADC w, %4, %1, %3, %2, %5
SUMSUB_BA w, %3, %4, %5
SUMSUB2_AB w, %1, %2, %5
SWAP %1, %3, %4, %5, %2
%else
SUMSUB_BADC w, %4, %1, %3, %2
SUMSUB_BA w, %3, %4
mova [%5], m%2
SUMSUB2_AB w, %1, [%5], %2
SWAP %1, %3, %4, %2
%endif
%endmacro
%macro IDCT4_1D 6-7
%ifnum %6
SUMSUBD2_AB %1, %3, %5, %7, %6
; %3: %3>>1-%5 %5: %3+%5>>1
SUMSUB_BA %1, %4, %2, %7
; %4: %2+%4 %2: %2-%4
SUMSUB_BADC %1, %5, %4, %3, %2, %7
; %5: %2+%4 + (%3+%5>>1)
; %4: %2+%4 - (%3+%5>>1)
; %3: %2-%4 + (%3>>1-%5)
; %2: %2-%4 - (%3>>1-%5)
%else
%ifidn %1, w
SUMSUBD2_AB %1, %3, %5, [%6], [%6+16]
%else
SUMSUBD2_AB %1, %3, %5, [%6], [%6+32]
%endif
SUMSUB_BA %1, %4, %2
SUMSUB_BADC %1, %5, %4, %3, %2
%endif
SWAP %2, %5, %4
; %2: %2+%4 + (%3+%5>>1) row0
; %3: %2-%4 + (%3>>1-%5) row1
; %4: %2-%4 - (%3>>1-%5) row2
; %5: %2+%4 - (%3+%5>>1) row3
%endmacro
%macro LOAD_DIFF 5-6 1
%if HIGH_BIT_DEPTH
%if %6 ; %5 aligned?
mova %1, %4
psubw %1, %5
%else
movu %1, %4
movu %2, %5
psubw %1, %2
%endif
%else ; !HIGH_BIT_DEPTH
%ifidn %3, none
movh %1, %4
movh %2, %5
punpcklbw %1, %2
punpcklbw %2, %2
psubw %1, %2
%else
movh %1, %4
punpcklbw %1, %3
movh %2, %5
punpcklbw %2, %3
psubw %1, %2
%endif
%endif ; HIGH_BIT_DEPTH
%endmacro
%macro LOAD_DIFF8x4 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr
%if BIT_DEPTH == 8 && cpuflag(ssse3)
movh m%2, [%8+%1*FDEC_STRIDE]
movh m%1, [%7+%1*FENC_STRIDE]
punpcklbw m%1, m%2
movh m%3, [%8+%2*FDEC_STRIDE]
movh m%2, [%7+%2*FENC_STRIDE]
punpcklbw m%2, m%3
movh m%4, [%8+%3*FDEC_STRIDE]
movh m%3, [%7+%3*FENC_STRIDE]
punpcklbw m%3, m%4
movh m%5, [%8+%4*FDEC_STRIDE]
movh m%4, [%7+%4*FENC_STRIDE]
punpcklbw m%4, m%5
pmaddubsw m%1, m%6
pmaddubsw m%2, m%6
pmaddubsw m%3, m%6
pmaddubsw m%4, m%6
%else
LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDEB], [%8+%1*FDEC_STRIDEB]
LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDEB], [%8+%2*FDEC_STRIDEB]
LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDEB], [%8+%3*FDEC_STRIDEB]
LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDEB], [%8+%4*FDEC_STRIDEB]
%endif
%endmacro
%macro STORE_DCT 6
movq [%5+%6+ 0], m%1
movq [%5+%6+ 8], m%2
movq [%5+%6+16], m%3
movq [%5+%6+24], m%4
movhps [%5+%6+32], m%1
movhps [%5+%6+40], m%2
movhps [%5+%6+48], m%3
movhps [%5+%6+56], m%4
%endmacro
%macro STORE_IDCT 4
movhps [r0-4*FDEC_STRIDE], %1
movh [r0-3*FDEC_STRIDE], %1
movhps [r0-2*FDEC_STRIDE], %2
movh [r0-1*FDEC_STRIDE], %2
movhps [r0+0*FDEC_STRIDE], %3
movh [r0+1*FDEC_STRIDE], %3
movhps [r0+2*FDEC_STRIDE], %4
movh [r0+3*FDEC_STRIDE], %4
%endmacro
%macro LOAD_DIFF_8x4P 7-11 r0,r2,0,1 ; 4x dest, 2x temp, 2x pointer, increment, aligned?
LOAD_DIFF m%1, m%5, m%7, [%8], [%9], %11
LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3], %11
LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3], %11
LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5], %11
%if %10
lea %8, [%8+4*r1]
lea %9, [%9+4*r3]
%endif
%endmacro
; 2xdst, 2xtmp, 2xsrcrow
%macro LOAD_DIFF16x2_AVX2 6
pmovzxbw m%1, [r1+%5*FENC_STRIDE]
pmovzxbw m%2, [r1+%6*FENC_STRIDE]
pmovzxbw m%3, [r2+(%5-4)*FDEC_STRIDE]
pmovzxbw m%4, [r2+(%6-4)*FDEC_STRIDE]
psubw m%1, m%3
psubw m%2, m%4
%endmacro
%macro DIFFx2 6-7
movh %3, %5
punpcklbw %3, %4
psraw %1, 6
paddsw %1, %3
movh %3, %6
punpcklbw %3, %4
psraw %2, 6
paddsw %2, %3
packuswb %2, %1
%endmacro
; (high depth) in: %1, %2, min to clip, max to clip, mem128
; in: %1, tmp, %3, mem64
%macro STORE_DIFF 4-5
%if HIGH_BIT_DEPTH
psrad %1, 6
psrad %2, 6
packssdw %1, %2
paddw %1, %5
CLIPW %1, %3, %4
mova %5, %1
%else
movh %2, %4
punpcklbw %2, %3
psraw %1, 6
paddsw %1, %2
packuswb %1, %1
movh %4, %1
%endif
%endmacro
%macro SHUFFLE_MASK_W 8
%rep 8
%if %1>=0x80
db %1, %1
%else
db %1*2
db %1*2+1
%endif
%rotate 1
%endrep
%endmacro
; instruction, accum, input, iteration (zero to swap, nonzero to add)
%macro ACCUM 4
%if %4
%1 m%2, m%3
%else
SWAP %2, %3
%endif
%endmacro
; IACA support
%macro IACA_START 0
mov ebx, 111
db 0x64, 0x67, 0x90
%endmacro
%macro IACA_END 0
mov ebx, 222
db 0x64, 0x67, 0x90
%endmacro
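
A scalar C model of the HADAMARD4_V building block above: SUMSUB_BADC maps each register pair (a, b) to (a+b, b-a), and applying it twice in the pattern the macro uses yields a 4-point Hadamard transform (up to sign and ordering).

static void hadamard4_v(int d[4])
{
    int a = d[0] + d[1], b = d[1] - d[0];  /* first SUMSUB_BADC pass  */
    int c = d[2] + d[3], e = d[3] - d[2];
    d[0] = a + c;  d[2] = c - a;           /* second SUMSUB_BADC pass */
    d[1] = b + e;  d[3] = e - b;
}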