;***************************************************************************** ;* Copyright (C) 2013 x265 project ;* ;* Authors: Nabajit Deka ;* Min Chen ;* Li Cao ;* Praveen Kumar Tiwari ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. ;*****************************************************************************/ ;TO-DO : Further optimize the routines. %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 32 tab_dct8: dw 64, 64, 64, 64, 64, 64, 64, 64 dw 89, 75, 50, 18, -18, -50, -75, -89 dw 83, 36, -36, -83, -83, -36, 36, 83 dw 75, -18, -89, -50, 50, 89, 18, -75 dw 64, -64, -64, 64, 64, -64, -64, 64 dw 50, -89, 18, 75, -75, -18, 89, -50 dw 36, -83, 83, -36, -36, 83, -83, 36 dw 18, -50, 75, -89, 89, -75, 50, -18 dct8_shuf: times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9 tab_dct16_1: dw 64, 64, 64, 64, 64, 64, 64, 64 dw 90, 87, 80, 70, 57, 43, 25, 9 dw 89, 75, 50, 18, -18, -50, -75, -89 dw 87, 57, 9, -43, -80, -90, -70, -25 dw 83, 36, -36, -83, -83, -36, 36, 83 dw 80, 9, -70, -87, -25, 57, 90, 43 dw 75, -18, -89, -50, 50, 89, 18, -75 dw 70, -43, -87, 9, 90, 25, -80, -57 dw 64, -64, -64, 64, 64, -64, -64, 64 dw 57, -80, -25, 90, -9, -87, 43, 70 dw 50, -89, 18, 75, -75, -18, 89, -50 dw 43, -90, 57, 25, -87, 70, 9, -80 dw 36, -83, 83, -36, -36, 83, -83, 36 dw 25, -70, 90, -80, 43, 9, -57, 87 dw 18, -50, 75, -89, 89, -75, 50, -18 dw 9, -25, 43, -57, 70, -80, 87, -90 tab_dct16_2: dw 64, 64, 64, 64, 64, 64, 64, 64 dw -9, -25, -43, -57, -70, -80, -87, -90 dw -89, -75, -50, -18, 18, 50, 75, 89 dw 25, 70, 90, 80, 43, -9, -57, -87 dw 83, 36, -36, -83, -83, -36, 36, 83 dw -43, -90, -57, 25, 87, 70, -9, -80 dw -75, 18, 89, 50, -50, -89, -18, 75 dw 57, 80, -25, -90, -9, 87, 43, -70 dw 64, -64, -64, 64, 64, -64, -64, 64 dw -70, -43, 87, 9, -90, 25, 80, -57 dw -50, 89, -18, -75, 75, 18, -89, 50 dw 80, -9, -70, 87, -25, -57, 90, -43 dw 36, -83, 83, -36, -36, 83, -83, 36 dw -87, 57, -9, -43, 80, -90, 70, -25 dw -18, 50, -75, 89, -89, 75, -50, 18 dw 90, -87, 80, -70, 57, -43, 25, -9 dct16_shuf1: times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 dct16_shuf2: times 2 db 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9 tab_dct32_1: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4 dw 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90 dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13 dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22 dw 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87 dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, 
-78, -31 dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38 dw 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80 dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46 dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54 dw 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70 dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61 dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67 dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57 dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73 dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78 dw 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43 dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82 dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85 dw 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25 dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88 dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90 dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9 dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90 tab_dct32_2: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 dw -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 dw -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 dw 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 dw -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 dw -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 dw 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 dw -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 dw -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 dw 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 dw -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 dw -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 dw 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 dw -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 dw -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 dw 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 dw -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 dw -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 dw 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 dw 36, -83, 83, -36, -36, 83, 
-83, 36, 36, -83, 83, -36, -36, 83, -83, 36 dw -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 dw -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 dw 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 dw -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 dw -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 dw 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 avx2_idct8_1: times 4 dw 64, 83, 64, 36 times 4 dw 64, 36, -64, -83 times 4 dw 64, -36, -64, 83 times 4 dw 64, -83, 64, -36 avx2_idct8_2: times 4 dw 89, 75, 50, 18 times 4 dw 75, -18, -89, -50 times 4 dw 50, -89, 18, 75 times 4 dw 18, -50, 75, -89 idct8_shuf1: dd 0, 2, 4, 6, 1, 3, 5, 7 const idct8_shuf2, times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 idct8_shuf3: times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 tab_idct16_1: dw 90, 87, 80, 70, 57, 43, 25, 9 dw 87, 57, 9, -43, -80, -90, -70, -25 dw 80, 9, -70, -87, -25, 57, 90, 43 dw 70, -43, -87, 9, 90, 25, -80, -57 dw 57, -80, -25, 90, -9, -87, 43, 70 dw 43, -90, 57, 25, -87, 70, 9, -80 dw 25, -70, 90, -80, 43, 9, -57, 87 dw 9, -25, 43, -57, 70, -80, 87, -90 tab_idct16_2: dw 64, 89, 83, 75, 64, 50, 36, 18 dw 64, 75, 36, -18, -64, -89, -83, -50 dw 64, 50, -36, -89, -64, 18, 83, 75 dw 64, 18, -83, -50, 64, 75, -36, -89 dw 64, -18, -83, 50, 64, -75, -36, 89 dw 64, -50, -36, 89, -64, -18, 83, -75 dw 64, -75, 36, 18, -64, 89, -83, 50 dw 64, -89, 83, -75, 64, -50, 36, -18 idct16_shuff: dd 0, 4, 2, 6, 1, 5, 3, 7 idct16_shuff1: dd 2, 6, 0, 4, 3, 7, 1, 5 tab_idct32_1: dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4 dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13 dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22 dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31 dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38 dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46 dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54 dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61 dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67 dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73 dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78 dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82 dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85 dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88 dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90 dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90 tab_idct32_2: dw 64, 89, 83, 75, 64, 50, 36, 18 dw 64, 75, 36, -18, -64, -89, -83, -50 dw 64, 50, -36, -89, -64, 18, 83, 75 dw 64, 18, -83, -50, 64, 75, -36, -89 dw 64, -18, -83, 50, 64, -75, -36, 89 dw 64, -50, -36, 89, -64, -18, 83, -75 dw 64, -75, 36, 18, -64, 89, -83, 50 dw 64, -89, 83, -75, 64, -50, 36, -18 tab_idct32_3: dw 90, 87, 80, 70, 57, 43, 25, 9 dw 87, 57, 9, -43, -80, -90, -70, -25 dw 80, 9, -70, -87, -25, 57, 90, 43 dw 70, -43, -87, 9, 90, 25, -80, -57 dw 57, -80, -25, 90, -9, -87, 43, 70 dw 43, -90, 57, 25, -87, 70, 9, -80 dw 25, -70, 90, -80, 43, 9, -57, 87 dw 9, -25, 43, -57, 70, -80, 87, -90 tab_idct32_4: dw 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 
43, 36, 25, 18, 9 dw 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25 dw 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43 dw 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57 dw 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70 dw 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80 dw 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87 dw 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90 dw 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90 dw 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87 dw 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80 dw 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70 dw 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57 dw 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43 dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25 dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9 avx2_dct4: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64 dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83 avx2_idct4_1: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64 dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36 ,-83, 36, -83 avx2_idct4_2: dw 64, 64, 64, -64, 83, 36, 36, -83 const idct4_shuf1, times 2 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 idct4_shuf2: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8 ,9 ,10, 11 tab_dct4: times 4 dw 64, 64 times 4 dw 83, 36 times 4 dw 64, -64 times 4 dw 36, -83 dct4_shuf: db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13 tab_dst4: times 2 dw 29, 55, 74, 84 times 2 dw 74, 74, 0, -74 times 2 dw 84, -29, -74, 55 times 2 dw 55, -84, 74, -29 pw_dst4_tab: times 4 dw 29, 55, 74, 84 times 4 dw 74, 74, 0, -74 times 4 dw 84, -29, -74, 55 times 4 dw 55, -84, 74, -29 tab_idst4: times 4 dw 29, +84 times 4 dw +74, +55 times 4 dw 55, -29 times 4 dw +74, -84 times 4 dw 74, -74 times 4 dw 0, +74 times 4 dw 84, +55 times 4 dw -74, -29 pw_idst4_tab: times 4 dw 29, 84 times 4 dw 55, -29 times 4 dw 74, 55 times 4 dw 74, -84 times 4 dw 74, -74 times 4 dw 84, 55 times 4 dw 0, 74 times 4 dw -74, -29 pb_idst4_shuf: times 2 db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 tab_dct8_1: times 2 dw 89, 50, 75, 18 times 2 dw 75, -89, -18, -50 times 2 dw 50, 18, -89, 75 times 2 dw 18, 75, -50, -89 tab_dct8_2: times 2 dd 83, 36 times 2 dd 36, 83 times 1 dd 89, 75, 50, 18 times 1 dd 75, -18, -89, -50 times 1 dd 50, -89, 18, 75 times 1 dd 18, -50, 75, -89 tab_idct8_3: times 4 dw 89, 75 times 4 dw 50, 18 times 4 dw 75, -18 times 4 dw -89, -50 times 4 dw 50, -89 times 4 dw 18, 75 times 4 dw 18, -50 times 4 dw 75, -89 pb_unpackhlw1: db 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15 pb_idct8even: db 0, 1, 8, 9, 4, 5, 12, 13, 0, 1, 8, 9, 4, 5, 12, 13 tab_idct8_1: times 1 dw 64, -64, 36, -83, 64, 64, 83, 36 tab_idct8_2: times 1 dw 89, 75, 50, 18, 75, -18, -89, -50 times 1 dw 50, -89, 18, 75, 18, -50, 75, -89 pb_idct8odd: db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 SECTION .text cextern pd_1 cextern pd_2 cextern pd_4 cextern pd_8 cextern pd_16 cextern pd_32 cextern pd_64 cextern pd_128 cextern pd_256 cextern pd_512 cextern pd_1024 cextern pd_2048 cextern pw_ppppmmmm cextern trans8_shuf %if BIT_DEPTH == 12 %define DCT4_SHIFT 5 %define DCT4_ROUND 16 %define IDCT_SHIFT 8 %define IDCT_ROUND 128 
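; The transform scaling here follows the usual HEVC rule: forward pass-1
; shift = log2(N) + BIT_DEPTH - 9, forward pass-2 shift = log2(N) + 6,
; inverse pass-1 shift = 7 and inverse pass-2 shift = 20 - BIT_DEPTH, with
; each ROUND constant equal to 1 << (SHIFT - 1). For example, at 12-bit and
; N = 4: DCT4_SHIFT = 2 + 12 - 9 = 5 and IDCT_SHIFT = 20 - 12 = 8. The
; DST4_* and DCT8_* constants below are derived the same way.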
%define DST4_SHIFT 5 %define DST4_ROUND 16 %define DCT8_SHIFT1 6 %define DCT8_ROUND1 32 %elif BIT_DEPTH == 10 %define DCT4_SHIFT 3 %define DCT4_ROUND 4 %define IDCT_SHIFT 10 %define IDCT_ROUND 512 %define DST4_SHIFT 3 %define DST4_ROUND 4 %define DCT8_SHIFT1 4 %define DCT8_ROUND1 8 %elif BIT_DEPTH == 8 %define DCT4_SHIFT 1 %define DCT4_ROUND 1 %define IDCT_SHIFT 12 %define IDCT_ROUND 2048 %define DST4_SHIFT 1 %define DST4_ROUND 1 %define DCT8_SHIFT1 2 %define DCT8_ROUND1 2 %else %error Unsupported BIT_DEPTH! %endif %define DCT8_ROUND2 256 %define DCT8_SHIFT2 9 ;------------------------------------------------------ ;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride) ;------------------------------------------------------ INIT_XMM sse2 cglobal dct4, 3, 4, 8 mova m7, [pd_ %+ DCT4_ROUND] add r2d, r2d lea r3, [tab_dct4] mova m4, [r3 + 0 * 16] mova m5, [r3 + 1 * 16] mova m6, [r3 + 2 * 16] movh m0, [r0 + 0 * r2] movh m1, [r0 + 1 * r2] punpcklqdq m0, m1 pshufd m0, m0, 0xD8 pshufhw m0, m0, 0xB1 lea r0, [r0 + 2 * r2] movh m1, [r0] movh m2, [r0 + r2] punpcklqdq m1, m2 pshufd m1, m1, 0xD8 pshufhw m1, m1, 0xB1 punpcklqdq m2, m0, m1 punpckhqdq m0, m1 paddw m1, m2, m0 psubw m2, m0 pmaddwd m0, m1, m4 paddd m0, m7 psrad m0, DCT4_SHIFT pmaddwd m3, m2, m5 paddd m3, m7 psrad m3, DCT4_SHIFT packssdw m0, m3 pshufd m0, m0, 0xD8 pshufhw m0, m0, 0xB1 pmaddwd m1, m6 paddd m1, m7 psrad m1, DCT4_SHIFT pmaddwd m2, [r3 + 3 * 16] paddd m2, m7 psrad m2, DCT4_SHIFT packssdw m1, m2 pshufd m1, m1, 0xD8 pshufhw m1, m1, 0xB1 punpcklqdq m2, m0, m1 punpckhqdq m0, m1 mova m7, [pd_128] pmaddwd m1, m2, m4 pmaddwd m3, m0, m4 paddd m1, m3 paddd m1, m7 psrad m1, 8 pmaddwd m4, m2, m5 pmaddwd m3, m0, m5 psubd m4, m3 paddd m4, m7 psrad m4, 8 packssdw m1, m4 movu [r1 + 0 * 16], m1 pmaddwd m1, m2, m6 pmaddwd m3, m0, m6 paddd m1, m3 paddd m1, m7 psrad m1, 8 pmaddwd m2, [r3 + 3 * 16] pmaddwd m0, [r3 + 3 * 16] psubd m2, m0 paddd m2, m7 psrad m2, 8 packssdw m1, m2 movu [r1 + 1 * 16], m1 RET ; DCT 4x4 ; ; Input parameters: ; - r0: source ; - r1: destination ; - r2: source stride INIT_YMM avx2 cglobal dct4, 3, 4, 8, src, dst, srcStride vbroadcasti128 m7, [pd_ %+ DCT4_ROUND] add r2d, r2d lea r3, [avx2_dct4] vbroadcasti128 m4, [dct4_shuf] mova m5, [r3] mova m6, [r3 + 32] movq xm0, [r0] movhps xm0, [r0 + r2] lea r0, [r0 + 2 * r2] movq xm1, [r0] movhps xm1, [r0 + r2] vinserti128 m0, m0, xm1, 1 pshufb m0, m4 vpermq m1, m0, 11011101b vpermq m0, m0, 10001000b paddw m2, m0, m1 psubw m0, m1 pmaddwd m2, m5 paddd m2, m7 psrad m2, DCT4_SHIFT pmaddwd m0, m6 paddd m0, m7 psrad m0, DCT4_SHIFT packssdw m2, m0 pshufb m2, m4 vpermq m1, m2, 11011101b vpermq m2, m2, 10001000b vbroadcasti128 m7, [pd_128] pmaddwd m0, m2, m5 pmaddwd m3, m1, m5 paddd m3, m0 paddd m3, m7 psrad m3, 8 pmaddwd m2, m6 pmaddwd m1, m6 psubd m2, m1 paddd m2, m7 psrad m2, 8 packssdw m3, m2 movu [r1], m3 RET ;------------------------------------------------------- ;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride) ;------------------------------------------------------- INIT_XMM sse2 cglobal idct4, 3, 4, 6 add r2d, r2d lea r3, [tab_dct4] movu m0, [r0 + 0 * 16] movu m1, [r0 + 1 * 16] punpcklwd m2, m0, m1 pmaddwd m3, m2, [r3 + 0 * 16] ; m3 = E1 paddd m3, [pd_64] pmaddwd m2, [r3 + 2 * 16] ; m2 = E2 paddd m2, [pd_64] punpckhwd m0, m1 pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1 pmaddwd m0, [r3 + 3 * 16] ; m0 = O2 paddd m4, m3, m1 psrad m4, 7 ; m4 = m128iA paddd m5, m2, m0 psrad m5, 7 packssdw m4, m5 ; m4 = m128iA psubd m2, m0 psrad m2, 7 psubd m3, m1 psrad m3, 7 packssdw m2, 
m3 ; m2 = m128iD punpcklwd m1, m4, m2 ; m1 = S0 punpckhwd m4, m2 ; m4 = S8 punpcklwd m0, m1, m4 ; m0 = m128iA punpckhwd m1, m4 ; m1 = m128iD punpcklwd m2, m0, m1 pmaddwd m3, m2, [r3 + 0 * 16] paddd m3, [pd_ %+ IDCT_ROUND] ; m3 = E1 pmaddwd m2, [r3 + 2 * 16] paddd m2, [pd_ %+ IDCT_ROUND] ; m2 = E2 punpckhwd m0, m1 pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1 pmaddwd m0, [r3 + 3 * 16] ; m0 = O2 paddd m4, m3, m1 psrad m4, IDCT_SHIFT ; m4 = m128iA paddd m5, m2, m0 psrad m5, IDCT_SHIFT packssdw m4, m5 ; m4 = m128iA psubd m2, m0 psrad m2, IDCT_SHIFT psubd m3, m1 psrad m3, IDCT_SHIFT packssdw m2, m3 ; m2 = m128iD punpcklwd m1, m4, m2 punpckhwd m4, m2 punpcklwd m0, m1, m4 movlps [r1 + 0 * r2], m0 movhps [r1 + 1 * r2], m0 punpckhwd m1, m4 movlps [r1 + 2 * r2], m1 lea r1, [r1 + 2 * r2] movhps [r1 + r2], m1 RET ;------------------------------------------------------ ;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride) ;------------------------------------------------------ INIT_XMM sse2 %if ARCH_X86_64 cglobal dst4, 3, 4, 8+4 %define coef0 m8 %define coef1 m9 %define coef2 m10 %define coef3 m11 %else ; ARCH_X86_64 = 0 cglobal dst4, 3, 4, 8 %define coef0 [r3 + 0 * 16] %define coef1 [r3 + 1 * 16] %define coef2 [r3 + 2 * 16] %define coef3 [r3 + 3 * 16] %endif ; ARCH_X86_64 mova m5, [pd_ %+ DST4_ROUND] add r2d, r2d lea r3, [tab_dst4] %if ARCH_X86_64 mova coef0, [r3 + 0 * 16] mova coef1, [r3 + 1 * 16] mova coef2, [r3 + 2 * 16] mova coef3, [r3 + 3 * 16] %endif movh m0, [r0 + 0 * r2] ; load movhps m0, [r0 + 1 * r2] lea r0, [r0 + 2 * r2] movh m1, [r0] movhps m1, [r0 + r2] pmaddwd m2, m0, coef0 ; DST1 pmaddwd m3, m1, coef0 pshufd m6, m2, q2301 pshufd m7, m3, q2301 paddd m2, m6 paddd m3, m7 pshufd m2, m2, q3120 pshufd m3, m3, q3120 punpcklqdq m2, m3 paddd m2, m5 psrad m2, DST4_SHIFT pmaddwd m3, m0, coef1 pmaddwd m4, m1, coef1 pshufd m6, m4, q2301 pshufd m7, m3, q2301 paddd m4, m6 paddd m3, m7 pshufd m4, m4, q3120 pshufd m3, m3, q3120 punpcklqdq m3, m4 paddd m3, m5 psrad m3, DST4_SHIFT packssdw m2, m3 ; m2 = T70 pmaddwd m3, m0, coef2 pmaddwd m4, m1, coef2 pshufd m6, m4, q2301 pshufd m7, m3, q2301 paddd m4, m6 paddd m3, m7 pshufd m4, m4, q3120 pshufd m3, m3, q3120 punpcklqdq m3, m4 paddd m3, m5 psrad m3, DST4_SHIFT pmaddwd m0, coef3 pmaddwd m1, coef3 pshufd m6, m0, q2301 pshufd m7, m1, q2301 paddd m0, m6 paddd m1, m7 pshufd m0, m0, q3120 pshufd m1, m1, q3120 punpcklqdq m0, m1 paddd m0, m5 psrad m0, DST4_SHIFT packssdw m3, m0 ; m3 = T71 mova m5, [pd_128] pmaddwd m0, m2, coef0 ; DST2 pmaddwd m1, m3, coef0 pshufd m6, m0, q2301 pshufd m7, m1, q2301 paddd m0, m6 paddd m1, m7 pshufd m0, m0, q3120 pshufd m1, m1, q3120 punpcklqdq m0, m1 paddd m0, m5 psrad m0, 8 pmaddwd m4, m2, coef1 pmaddwd m1, m3, coef1 pshufd m6, m4, q2301 pshufd m7, m1, q2301 paddd m4, m6 paddd m1, m7 pshufd m4, m4, q3120 pshufd m1, m1, q3120 punpcklqdq m4, m1 paddd m4, m5 psrad m4, 8 packssdw m0, m4 movu [r1 + 0 * 16], m0 pmaddwd m0, m2, coef2 pmaddwd m1, m3, coef2 pshufd m6, m0, q2301 pshufd m7, m1, q2301 paddd m0, m6 paddd m1, m7 pshufd m0, m0, q3120 pshufd m1, m1, q3120 punpcklqdq m0, m1 paddd m0, m5 psrad m0, 8 pmaddwd m2, coef3 pmaddwd m3, coef3 pshufd m6, m2, q2301 pshufd m7, m3, q2301 paddd m2, m6 paddd m3, m7 pshufd m2, m2, q3120 pshufd m3, m3, q3120 punpcklqdq m2, m3 paddd m2, m5 psrad m2, 8 packssdw m0, m2 movu [r1 + 1 * 16], m0 RET ;------------------------------------------------------ ;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride) ;------------------------------------------------------ INIT_XMM ssse3 %if 
ARCH_X86_64 cglobal dst4, 3, 4, 8+2 %define coef2 m8 %define coef3 m9 %else ; ARCH_X86_64 = 0 cglobal dst4, 3, 4, 8 %define coef2 [r3 + 2 * 16] %define coef3 [r3 + 3 * 16] %endif ; ARCH_X86_64 %define coef0 m6 %define coef1 m7 mova m5, [pd_ %+ DST4_ROUND] add r2d, r2d lea r3, [tab_dst4] mova coef0, [r3 + 0 * 16] mova coef1, [r3 + 1 * 16] %if ARCH_X86_64 mova coef2, [r3 + 2 * 16] mova coef3, [r3 + 3 * 16] %endif movh m0, [r0 + 0 * r2] ; load movh m1, [r0 + 1 * r2] punpcklqdq m0, m1 lea r0, [r0 + 2 * r2] movh m1, [r0] movh m2, [r0 + r2] punpcklqdq m1, m2 pmaddwd m2, m0, coef0 ; DST1 pmaddwd m3, m1, coef0 phaddd m2, m3 paddd m2, m5 psrad m2, DST4_SHIFT pmaddwd m3, m0, coef1 pmaddwd m4, m1, coef1 phaddd m3, m4 paddd m3, m5 psrad m3, DST4_SHIFT packssdw m2, m3 ; m2 = T70 pmaddwd m3, m0, coef2 pmaddwd m4, m1, coef2 phaddd m3, m4 paddd m3, m5 psrad m3, DST4_SHIFT pmaddwd m0, coef3 pmaddwd m1, coef3 phaddd m0, m1 paddd m0, m5 psrad m0, DST4_SHIFT packssdw m3, m0 ; m3 = T71 mova m5, [pd_128] pmaddwd m0, m2, coef0 ; DST2 pmaddwd m1, m3, coef0 phaddd m0, m1 paddd m0, m5 psrad m0, 8 pmaddwd m4, m2, coef1 pmaddwd m1, m3, coef1 phaddd m4, m1 paddd m4, m5 psrad m4, 8 packssdw m0, m4 movu [r1 + 0 * 16], m0 pmaddwd m0, m2, coef2 pmaddwd m1, m3, coef2 phaddd m0, m1 paddd m0, m5 psrad m0, 8 pmaddwd m2, coef3 pmaddwd m3, coef3 phaddd m2, m3 paddd m2, m5 psrad m2, 8 packssdw m0, m2 movu [r1 + 1 * 16], m0 RET ;------------------------------------------------------------------ ;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride) ;------------------------------------------------------------------ INIT_YMM avx2 cglobal dst4, 3, 4, 6 vbroadcasti128 m5, [pd_ %+ DST4_ROUND] mova m4, [trans8_shuf] add r2d, r2d lea r3, [pw_dst4_tab] movq xm0, [r0 + 0 * r2] movhps xm0, [r0 + 1 * r2] lea r0, [r0 + 2 * r2] movq xm1, [r0] movhps xm1, [r0 + r2] vinserti128 m0, m0, xm1, 1 ; m0 = src[0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] pmaddwd m2, m0, [r3 + 0 * 32] pmaddwd m1, m0, [r3 + 1 * 32] phaddd m2, m1 paddd m2, m5 psrad m2, DST4_SHIFT pmaddwd m3, m0, [r3 + 2 * 32] pmaddwd m1, m0, [r3 + 3 * 32] phaddd m3, m1 paddd m3, m5 psrad m3, DST4_SHIFT packssdw m2, m3 vpermd m2, m4, m2 vpbroadcastd m5, [pd_128] pmaddwd m0, m2, [r3 + 0 * 32] pmaddwd m1, m2, [r3 + 1 * 32] phaddd m0, m1 paddd m0, m5 psrad m0, 8 pmaddwd m3, m2, [r3 + 2 * 32] pmaddwd m2, m2, [r3 + 3 * 32] phaddd m3, m2 paddd m3, m5 psrad m3, 8 packssdw m0, m3 vpermd m0, m4, m0 movu [r1], m0 RET ;------------------------------------------------------- ;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride) ;------------------------------------------------------- INIT_XMM sse2 cglobal idst4, 3, 4, 7 mova m6, [pd_ %+ IDCT_ROUND] add r2d, r2d lea r3, [tab_idst4] mova m5, [pd_64] movu m0, [r0 + 0 * 16] movu m1, [r0 + 1 * 16] punpcklwd m2, m0, m1 ; m2 = m128iAC punpckhwd m0, m1 ; m0 = m128iBD pmaddwd m1, m2, [r3 + 0 * 16] pmaddwd m3, m0, [r3 + 1 * 16] paddd m1, m3 paddd m1, m5 psrad m1, 7 ; m1 = S0 pmaddwd m3, m2, [r3 + 2 * 16] pmaddwd m4, m0, [r3 + 3 * 16] paddd m3, m4 paddd m3, m5 psrad m3, 7 ; m3 = S8 packssdw m1, m3 ; m1 = m128iA pmaddwd m3, m2, [r3 + 4 * 16] pmaddwd m4, m0, [r3 + 5 * 16] paddd m3, m4 paddd m3, m5 psrad m3, 7 ; m3 = S0 pmaddwd m2, [r3 + 6 * 16] pmaddwd m0, [r3 + 7 * 16] paddd m2, m0 paddd m2, m5 psrad m2, 7 ; m2 = S8 packssdw m3, m2 ; m3 = m128iD punpcklwd m0, m1, m3 punpckhwd m1, m3 punpcklwd m2, m0, m1 punpckhwd m0, m1 punpcklwd m1, m2, m0 punpckhwd m2, m0 pmaddwd m0, m1, [r3 + 0 * 16] pmaddwd m3, m2, [r3 + 1 * 16] paddd m0, m3 paddd m0, m6 
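; pass 2 of idst4: the same butterflies applied to the transposed
; intermediate, except the rounding bias is now IDCT_ROUND (kept in m6)
; and the final shift is the bit-depth dependent IDCT_SHIFT, whereas
; pass 1 above used pd_64 and a shift of 7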
psrad m0, IDCT_SHIFT ; m0 = S0 pmaddwd m3, m1, [r3 + 2 * 16] pmaddwd m4, m2, [r3 + 3 * 16] paddd m3, m4 paddd m3, m6 psrad m3, IDCT_SHIFT ; m3 = S8 packssdw m0, m3 ; m0 = m128iA pmaddwd m3, m1, [r3 + 4 * 16] pmaddwd m4, m2, [r3 + 5 * 16] paddd m3, m4 paddd m3, m6 psrad m3, IDCT_SHIFT ; m3 = S0 pmaddwd m1, [r3 + 6 * 16] pmaddwd m2, [r3 + 7 * 16] paddd m1, m2 paddd m1, m6 psrad m1, IDCT_SHIFT ; m1 = S8 packssdw m3, m1 ; m3 = m128iD punpcklwd m1, m0, m3 punpckhwd m0, m3 punpcklwd m2, m1, m0 movlps [r1 + 0 * r2], m2 movhps [r1 + 1 * r2], m2 punpckhwd m1, m0 movlps [r1 + 2 * r2], m1 lea r1, [r1 + 2 * r2] movhps [r1 + r2], m1 RET ;----------------------------------------------------------------- ;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride) ;----------------------------------------------------------------- INIT_YMM avx2 cglobal idst4, 3, 4, 6 vbroadcasti128 m4, [pd_ %+ IDCT_ROUND] add r2d, r2d lea r3, [pw_idst4_tab] movu xm0, [r0 + 0 * 16] movu xm1, [r0 + 1 * 16] punpcklwd m2, m0, m1 punpckhwd m0, m1 vinserti128 m2, m2, xm2, 1 vinserti128 m0, m0, xm0, 1 vpbroadcastd m5, [pd_64] pmaddwd m1, m2, [r3 + 0 * 32] pmaddwd m3, m0, [r3 + 1 * 32] paddd m1, m3 paddd m1, m5 psrad m1, 7 pmaddwd m3, m2, [r3 + 2 * 32] pmaddwd m0, [r3 + 3 * 32] paddd m3, m0 paddd m3, m5 psrad m3, 7 packssdw m0, m1, m3 pshufb m0, [pb_idst4_shuf] vpermq m1, m0, 11101110b punpcklwd m2, m0, m1 punpckhwd m0, m1 punpcklwd m1, m2, m0 punpckhwd m2, m0 vpermq m1, m1, 01000100b vpermq m2, m2, 01000100b pmaddwd m0, m1, [r3 + 0 * 32] pmaddwd m3, m2, [r3 + 1 * 32] paddd m0, m3 paddd m0, m4 psrad m0, IDCT_SHIFT pmaddwd m3, m1, [r3 + 2 * 32] pmaddwd m2, m2, [r3 + 3 * 32] paddd m3, m2 paddd m3, m4 psrad m3, IDCT_SHIFT packssdw m0, m3 pshufb m1, m0, [pb_idst4_shuf] vpermq m0, m1, 11101110b punpcklwd m2, m1, m0 movq [r1 + 0 * r2], xm2 movhps [r1 + 1 * r2], xm2 punpckhwd m1, m0 movq [r1 + 2 * r2], xm1 lea r1, [r1 + 2 * r2] movhps [r1 + r2], xm1 RET ;------------------------------------------------------- ; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride) ;------------------------------------------------------- INIT_XMM sse2 cglobal dct8, 3,6,8,0-16*mmsize ;------------------------ ; Stack Mapping(dword) ;------------------------ ; Row0[0-3] Row1[0-3] ; ... ; Row6[0-3] Row7[0-3] ; Row0[0-3] Row7[0-3] ; ... 
; Row6[4-7] Row7[4-7] ;------------------------ add r2, r2 lea r3, [r2 * 3] mov r5, rsp %assign x 0 %rep 2 movu m0, [r0] movu m1, [r0 + r2] movu m2, [r0 + r2 * 2] movu m3, [r0 + r3] punpcklwd m4, m0, m1 punpckhwd m0, m1 punpcklwd m5, m2, m3 punpckhwd m2, m3 punpckldq m1, m4, m5 ; m1 = [1 0] punpckhdq m4, m5 ; m4 = [3 2] punpckldq m3, m0, m2 punpckhdq m0, m2 pshufd m2, m3, 0x4E ; m2 = [4 5] pshufd m0, m0, 0x4E ; m0 = [6 7] paddw m3, m1, m0 psubw m1, m0 ; m1 = [d1 d0] paddw m0, m4, m2 psubw m4, m2 ; m4 = [d3 d2] punpcklqdq m2, m3, m0 ; m2 = [s2 s0] punpckhqdq m3, m0 pshufd m3, m3, 0x4E ; m3 = [s1 s3] punpcklwd m0, m1, m4 ; m0 = [d2/d0] punpckhwd m1, m4 ; m1 = [d3/d1] punpckldq m4, m0, m1 ; m4 = [d3 d1 d2 d0] punpckhdq m0, m1 ; m0 = [d3 d1 d2 d0] ; odd lea r4, [tab_dct8_1] pmaddwd m1, m4, [r4 + 0*16] pmaddwd m5, m0, [r4 + 0*16] pshufd m1, m1, 0xD8 pshufd m5, m5, 0xD8 mova m7, m1 punpckhqdq m7, m5 punpcklqdq m1, m5 paddd m1, m7 paddd m1, [pd_ %+ DCT8_ROUND1] psrad m1, DCT8_SHIFT1 %if x == 1 pshufd m1, m1, 0x1B %endif mova [r5 + 1*2*mmsize], m1 ; Row 1 pmaddwd m1, m4, [r4 + 1*16] pmaddwd m5, m0, [r4 + 1*16] pshufd m1, m1, 0xD8 pshufd m5, m5, 0xD8 mova m7, m1 punpckhqdq m7, m5 punpcklqdq m1, m5 paddd m1, m7 paddd m1, [pd_ %+ DCT8_ROUND1] psrad m1, DCT8_SHIFT1 %if x == 1 pshufd m1, m1, 0x1B %endif mova [r5 + 3*2*mmsize], m1 ; Row 3 pmaddwd m1, m4, [r4 + 2*16] pmaddwd m5, m0, [r4 + 2*16] pshufd m1, m1, 0xD8 pshufd m5, m5, 0xD8 mova m7, m1 punpckhqdq m7, m5 punpcklqdq m1, m5 paddd m1, m7 paddd m1, [pd_ %+ DCT8_ROUND1] psrad m1, DCT8_SHIFT1 %if x == 1 pshufd m1, m1, 0x1B %endif mova [r5 + 5*2*mmsize], m1 ; Row 5 pmaddwd m4, [r4 + 3*16] pmaddwd m0, [r4 + 3*16] pshufd m4, m4, 0xD8 pshufd m0, m0, 0xD8 mova m7, m4 punpckhqdq m7, m0 punpcklqdq m4, m0 paddd m4, m7 paddd m4, [pd_ %+ DCT8_ROUND1] psrad m4, DCT8_SHIFT1 %if x == 1 pshufd m4, m4, 0x1B %endif mova [r5 + 7*2*mmsize], m4; Row 7 ; even lea r4, [tab_dct4] paddw m0, m2, m3 ; m0 = [EE1 EE0] pshufd m0, m0, 0xD8 pshuflw m0, m0, 0xD8 pshufhw m0, m0, 0xD8 psubw m2, m3 ; m2 = [EO1 EO0] pmullw m2, [pw_ppppmmmm] pshufd m2, m2, 0xD8 pshuflw m2, m2, 0xD8 pshufhw m2, m2, 0xD8 pmaddwd m3, m0, [r4 + 0*16] paddd m3, [pd_ %+ DCT8_ROUND1] psrad m3, DCT8_SHIFT1 %if x == 1 pshufd m3, m3, 0x1B %endif mova [r5 + 0*2*mmsize], m3 ; Row 0 pmaddwd m0, [r4 + 2*16] paddd m0, [pd_ %+ DCT8_ROUND1] psrad m0, DCT8_SHIFT1 %if x == 1 pshufd m0, m0, 0x1B %endif mova [r5 + 4*2*mmsize], m0 ; Row 4 pmaddwd m3, m2, [r4 + 1*16] paddd m3, [pd_ %+ DCT8_ROUND1] psrad m3, DCT8_SHIFT1 %if x == 1 pshufd m3, m3, 0x1B %endif mova [r5 + 2*2*mmsize], m3 ; Row 2 pmaddwd m2, [r4 + 3*16] paddd m2, [pd_ %+ DCT8_ROUND1] psrad m2, DCT8_SHIFT1 %if x == 1 pshufd m2, m2, 0x1B %endif mova [r5 + 6*2*mmsize], m2 ; Row 6 %if x != 1 lea r0, [r0 + r2 * 4] add r5, mmsize %endif %assign x x+1 %endrep mov r0, rsp ; r0 = pointer to Low Part lea r4, [tab_dct8_2] %assign x 0 %rep 4 mova m0, [r0 + 0*2*mmsize] ; [3 2 1 0] mova m1, [r0 + 1*2*mmsize] paddd m2, m0, [r0 + (0*2+1)*mmsize] pshufd m2, m2, 0x9C ; m2 = [s2 s1 s3 s0] paddd m3, m1, [r0 + (1*2+1)*mmsize] pshufd m3, m3, 0x9C ; m3 = ^^ psubd m0, [r0 + (0*2+1)*mmsize] ; m0 = [d3 d2 d1 d0] psubd m1, [r0 + (1*2+1)*mmsize] ; m1 = ^^ ; even pshufd m4, m2, 0xD8 pshufd m3, m3, 0xD8 mova m7, m4 punpckhqdq m7, m3 punpcklqdq m4, m3 mova m2, m4 paddd m4, m7 ; m4 = [EE1 EE0 EE1 EE0] psubd m2, m7 ; m2 = [EO1 EO0 EO1 EO0] pslld m4, 6 ; m4 = [64*EE1 64*EE0] mova m5, m2 pmuludq m5, [r4 + 0*16] pshufd m7, m2, 0xF5 movu m6, [r4 + 0*16 + 4] pmuludq m7, m6 pshufd m5, m5, 0x88 pshufd 
m7, m7, 0x88 punpckldq m5, m7 ; m5 = [36*EO1 83*EO0] pshufd m7, m2, 0xF5 pmuludq m2, [r4 + 1*16] movu m6, [r4 + 1*16 + 4] pmuludq m7, m6 pshufd m2, m2, 0x88 pshufd m7, m7, 0x88 punpckldq m2, m7 ; m2 = [83*EO1 36*EO0] pshufd m3, m4, 0xD8 pshufd m5, m5, 0xD8 mova m7, m3 punpckhqdq m7, m5 punpcklqdq m3, m5 paddd m3, m7 ; m3 = [Row2 Row0] paddd m3, [pd_ %+ DCT8_ROUND2] psrad m3, DCT8_SHIFT2 pshufd m4, m4, 0xD8 pshufd m2, m2, 0xD8 mova m7, m4 punpckhqdq m7, m2 punpcklqdq m4, m2 psubd m4, m7 ; m4 = [Row6 Row4] paddd m4, [pd_ %+ DCT8_ROUND2] psrad m4, DCT8_SHIFT2 packssdw m3, m3 movd [r1 + 0*mmsize], m3 pshufd m3, m3, 1 movd [r1 + 2*mmsize], m3 packssdw m4, m4 movd [r1 + 4*mmsize], m4 pshufd m4, m4, 1 movd [r1 + 6*mmsize], m4 ; odd mova m2, m0 pmuludq m2, [r4 + 2*16] pshufd m7, m0, 0xF5 movu m6, [r4 + 2*16 + 4] pmuludq m7, m6 pshufd m2, m2, 0x88 pshufd m7, m7, 0x88 punpckldq m2, m7 mova m3, m1 pmuludq m3, [r4 + 2*16] pshufd m7, m1, 0xF5 pmuludq m7, m6 pshufd m3, m3, 0x88 pshufd m7, m7, 0x88 punpckldq m3, m7 mova m4, m0 pmuludq m4, [r4 + 3*16] pshufd m7, m0, 0xF5 movu m6, [r4 + 3*16 + 4] pmuludq m7, m6 pshufd m4, m4, 0x88 pshufd m7, m7, 0x88 punpckldq m4, m7 mova m5, m1 pmuludq m5, [r4 + 3*16] pshufd m7, m1, 0xF5 pmuludq m7, m6 pshufd m5, m5, 0x88 pshufd m7, m7, 0x88 punpckldq m5, m7 pshufd m2, m2, 0xD8 pshufd m3, m3, 0xD8 mova m7, m2 punpckhqdq m7, m3 punpcklqdq m2, m3 paddd m2, m7 pshufd m4, m4, 0xD8 pshufd m5, m5, 0xD8 mova m7, m4 punpckhqdq m7, m5 punpcklqdq m4, m5 paddd m4, m7 pshufd m2, m2, 0xD8 pshufd m4, m4, 0xD8 mova m7, m2 punpckhqdq m7, m4 punpcklqdq m2, m4 paddd m2, m7 ; m2 = [Row3 Row1] paddd m2, [pd_ %+ DCT8_ROUND2] psrad m2, DCT8_SHIFT2 packssdw m2, m2 movd [r1 + 1*mmsize], m2 pshufd m2, m2, 1 movd [r1 + 3*mmsize], m2 mova m2, m0 pmuludq m2, [r4 + 4*16] pshufd m7, m0, 0xF5 movu m6, [r4 + 4*16 + 4] pmuludq m7, m6 pshufd m2, m2, 0x88 pshufd m7, m7, 0x88 punpckldq m2, m7 mova m3, m1 pmuludq m3, [r4 + 4*16] pshufd m7, m1, 0xF5 pmuludq m7, m6 pshufd m3, m3, 0x88 pshufd m7, m7, 0x88 punpckldq m3, m7 mova m4, m0 pmuludq m4, [r4 + 5*16] pshufd m7, m0, 0xF5 movu m6, [r4 + 5*16 + 4] pmuludq m7, m6 pshufd m4, m4, 0x88 pshufd m7, m7, 0x88 punpckldq m4, m7 mova m5, m1 pmuludq m5, [r4 + 5*16] pshufd m7, m1, 0xF5 pmuludq m7, m6 pshufd m5, m5, 0x88 pshufd m7, m7, 0x88 punpckldq m5, m7 pshufd m2, m2, 0xD8 pshufd m3, m3, 0xD8 mova m7, m2 punpckhqdq m7, m3 punpcklqdq m2, m3 paddd m2, m7 pshufd m4, m4, 0xD8 pshufd m5, m5, 0xD8 mova m7, m4 punpckhqdq m7, m5 punpcklqdq m4, m5 paddd m4, m7 pshufd m2, m2, 0xD8 pshufd m4, m4, 0xD8 mova m7, m2 punpckhqdq m7, m4 punpcklqdq m2, m4 paddd m2, m7 ; m2 = [Row7 Row5] paddd m2, [pd_ %+ DCT8_ROUND2] psrad m2, DCT8_SHIFT2 packssdw m2, m2 movd [r1 + 5*mmsize], m2 pshufd m2, m2, 1 movd [r1 + 7*mmsize], m2 %if x < 3 add r1, mmsize/4 add r0, 2*2*mmsize %endif %assign x x+1 %endrep RET ;------------------------------------------------------- ; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride) ;------------------------------------------------------- INIT_XMM sse4 cglobal dct8, 3,6,7,0-16*mmsize ;------------------------ ; Stack Mapping(dword) ;------------------------ ; Row0[0-3] Row1[0-3] ; ... ; Row6[0-3] Row7[0-3] ; Row0[0-3] Row7[0-3] ; ... 
; Row6[4-7] Row7[4-7] ;------------------------ mova m6, [pd_ %+ DCT8_ROUND1] add r2, r2 lea r3, [r2 * 3] mov r5, rsp %assign x 0 %rep 2 movu m0, [r0] movu m1, [r0 + r2] movu m2, [r0 + r2 * 2] movu m3, [r0 + r3] punpcklwd m4, m0, m1 punpckhwd m0, m1 punpcklwd m5, m2, m3 punpckhwd m2, m3 punpckldq m1, m4, m5 ; m1 = [1 0] punpckhdq m4, m5 ; m4 = [3 2] punpckldq m3, m0, m2 punpckhdq m0, m2 pshufd m2, m3, 0x4E ; m2 = [4 5] pshufd m0, m0, 0x4E ; m0 = [6 7] paddw m3, m1, m0 psubw m1, m0 ; m1 = [d1 d0] paddw m0, m4, m2 psubw m4, m2 ; m4 = [d3 d2] punpcklqdq m2, m3, m0 ; m2 = [s2 s0] punpckhqdq m3, m0 pshufd m3, m3, 0x4E ; m3 = [s1 s3] punpcklwd m0, m1, m4 ; m0 = [d2/d0] punpckhwd m1, m4 ; m1 = [d3/d1] punpckldq m4, m0, m1 ; m4 = [d3 d1 d2 d0] punpckhdq m0, m1 ; m0 = [d3 d1 d2 d0] ; odd lea r4, [tab_dct8_1] pmaddwd m1, m4, [r4 + 0*16] pmaddwd m5, m0, [r4 + 0*16] phaddd m1, m5 paddd m1, m6 psrad m1, DCT8_SHIFT1 %if x == 1 pshufd m1, m1, 0x1B %endif mova [r5 + 1*2*mmsize], m1 ; Row 1 pmaddwd m1, m4, [r4 + 1*16] pmaddwd m5, m0, [r4 + 1*16] phaddd m1, m5 paddd m1, m6 psrad m1, DCT8_SHIFT1 %if x == 1 pshufd m1, m1, 0x1B %endif mova [r5 + 3*2*mmsize], m1 ; Row 3 pmaddwd m1, m4, [r4 + 2*16] pmaddwd m5, m0, [r4 + 2*16] phaddd m1, m5 paddd m1, m6 psrad m1, DCT8_SHIFT1 %if x == 1 pshufd m1, m1, 0x1B %endif mova [r5 + 5*2*mmsize], m1 ; Row 5 pmaddwd m4, [r4 + 3*16] pmaddwd m0, [r4 + 3*16] phaddd m4, m0 paddd m4, m6 psrad m4, DCT8_SHIFT1 %if x == 1 pshufd m4, m4, 0x1B %endif mova [r5 + 7*2*mmsize], m4; Row 7 ; even lea r4, [tab_dct4] paddw m0, m2, m3 ; m0 = [EE1 EE0] pshufb m0, [pb_unpackhlw1] psubw m2, m3 ; m2 = [EO1 EO0] psignw m2, [pw_ppppmmmm] pshufb m2, [pb_unpackhlw1] pmaddwd m3, m0, [r4 + 0*16] paddd m3, m6 psrad m3, DCT8_SHIFT1 %if x == 1 pshufd m3, m3, 0x1B %endif mova [r5 + 0*2*mmsize], m3 ; Row 0 pmaddwd m0, [r4 + 2*16] paddd m0, m6 psrad m0, DCT8_SHIFT1 %if x == 1 pshufd m0, m0, 0x1B %endif mova [r5 + 4*2*mmsize], m0 ; Row 4 pmaddwd m3, m2, [r4 + 1*16] paddd m3, m6 psrad m3, DCT8_SHIFT1 %if x == 1 pshufd m3, m3, 0x1B %endif mova [r5 + 2*2*mmsize], m3 ; Row 2 pmaddwd m2, [r4 + 3*16] paddd m2, m6 psrad m2, DCT8_SHIFT1 %if x == 1 pshufd m2, m2, 0x1B %endif mova [r5 + 6*2*mmsize], m2 ; Row 6 %if x != 1 lea r0, [r0 + r2 * 4] add r5, mmsize %endif %assign x x+1 %endrep mov r2, 2 mov r0, rsp ; r0 = pointer to Low Part lea r4, [tab_dct8_2] mova m6, [pd_256] .pass2: %rep 2 mova m0, [r0 + 0*2*mmsize] ; [3 2 1 0] mova m1, [r0 + 1*2*mmsize] paddd m2, m0, [r0 + (0*2+1)*mmsize] pshufd m2, m2, 0x9C ; m2 = [s2 s1 s3 s0] paddd m3, m1, [r0 + (1*2+1)*mmsize] pshufd m3, m3, 0x9C ; m3 = ^^ psubd m0, [r0 + (0*2+1)*mmsize] ; m0 = [d3 d2 d1 d0] psubd m1, [r0 + (1*2+1)*mmsize] ; m1 = ^^ ; even phaddd m4, m2, m3 ; m4 = [EE1 EE0 EE1 EE0] phsubd m2, m3 ; m2 = [EO1 EO0 EO1 EO0] pslld m4, 6 ; m4 = [64*EE1 64*EE0] pmulld m5, m2, [r4 + 0*16] ; m5 = [36*EO1 83*EO0] pmulld m2, [r4 + 1*16] ; m2 = [83*EO1 36*EO0] phaddd m3, m4, m5 ; m3 = [Row2 Row0] paddd m3, m6 psrad m3, 9 phsubd m4, m2 ; m4 = [Row6 Row4] paddd m4, m6 psrad m4, 9 packssdw m3, m3 movd [r1 + 0*mmsize], m3 pshufd m3, m3, 1 movd [r1 + 2*mmsize], m3 packssdw m4, m4 movd [r1 + 4*mmsize], m4 pshufd m4, m4, 1 movd [r1 + 6*mmsize], m4 ; odd pmulld m2, m0, [r4 + 2*16] pmulld m3, m1, [r4 + 2*16] pmulld m4, m0, [r4 + 3*16] pmulld m5, m1, [r4 + 3*16] phaddd m2, m3 phaddd m4, m5 phaddd m2, m4 ; m2 = [Row3 Row1] paddd m2, m6 psrad m2, 9 packssdw m2, m2 movd [r1 + 1*mmsize], m2 pshufd m2, m2, 1 movd [r1 + 3*mmsize], m2 pmulld m2, m0, [r4 + 4*16] pmulld m3, m1, [r4 + 4*16] pmulld 
m4, m0, [r4 + 5*16] pmulld m5, m1, [r4 + 5*16] phaddd m2, m3 phaddd m4, m5 phaddd m2, m4 ; m2 = [Row7 Row5] paddd m2, m6 psrad m2, 9 packssdw m2, m2 movd [r1 + 5*mmsize], m2 pshufd m2, m2, 1 movd [r1 + 7*mmsize], m2 add r1, mmsize/4 add r0, 2*2*mmsize %endrep dec r2 jnz .pass2 RET ;------------------------------------------------------- ; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride) ;------------------------------------------------------- %if ARCH_X86_64 INIT_XMM sse2 cglobal idct8, 3, 6, 16, 0-5*mmsize mova m9, [r0 + 1 * mmsize] mova m1, [r0 + 3 * mmsize] mova m7, m9 punpcklwd m7, m1 punpckhwd m9, m1 mova m14, [tab_idct8_3] mova m3, m14 pmaddwd m14, m7 pmaddwd m3, m9 mova m0, [r0 + 5 * mmsize] mova m10, [r0 + 7 * mmsize] mova m2, m0 punpcklwd m2, m10 punpckhwd m0, m10 mova m15, [tab_idct8_3 + 1 * mmsize] mova m11, [tab_idct8_3 + 1 * mmsize] pmaddwd m15, m2 mova m4, [tab_idct8_3 + 2 * mmsize] pmaddwd m11, m0 mova m1, [tab_idct8_3 + 2 * mmsize] paddd m15, m14 mova m5, [tab_idct8_3 + 4 * mmsize] mova m12, [tab_idct8_3 + 4 * mmsize] paddd m11, m3 mova [rsp + 0 * mmsize], m11 mova [rsp + 1 * mmsize], m15 pmaddwd m4, m7 pmaddwd m1, m9 mova m14, [tab_idct8_3 + 3 * mmsize] mova m3, [tab_idct8_3 + 3 * mmsize] pmaddwd m14, m2 pmaddwd m3, m0 paddd m14, m4 paddd m3, m1 mova [rsp + 2 * mmsize], m3 pmaddwd m5, m9 pmaddwd m9, [tab_idct8_3 + 6 * mmsize] mova m6, [tab_idct8_3 + 5 * mmsize] pmaddwd m12, m7 pmaddwd m7, [tab_idct8_3 + 6 * mmsize] mova m4, [tab_idct8_3 + 5 * mmsize] pmaddwd m6, m2 paddd m6, m12 pmaddwd m2, [tab_idct8_3 + 7 * mmsize] paddd m7, m2 mova [rsp + 3 * mmsize], m6 pmaddwd m4, m0 pmaddwd m0, [tab_idct8_3 + 7 * mmsize] paddd m9, m0 paddd m5, m4 mova m6, [r0 + 0 * mmsize] mova m0, [r0 + 4 * mmsize] mova m4, m6 punpcklwd m4, m0 punpckhwd m6, m0 mova m12, [r0 + 2 * mmsize] mova m0, [r0 + 6 * mmsize] mova m13, m12 mova m8, [tab_dct4] punpcklwd m13, m0 mova m10, [tab_dct4] punpckhwd m12, m0 pmaddwd m8, m4 mova m3, m8 pmaddwd m4, [tab_dct4 + 2 * mmsize] pmaddwd m10, m6 mova m2, [tab_dct4 + 1 * mmsize] mova m1, m10 pmaddwd m6, [tab_dct4 + 2 * mmsize] mova m0, [tab_dct4 + 1 * mmsize] pmaddwd m2, m13 paddd m3, m2 psubd m8, m2 mova m2, m6 pmaddwd m13, [tab_dct4 + 3 * mmsize] pmaddwd m0, m12 paddd m1, m0 psubd m10, m0 mova m0, m4 pmaddwd m12, [tab_dct4 + 3 * mmsize] paddd m3, [pd_64] paddd m1, [pd_64] paddd m8, [pd_64] paddd m10, [pd_64] paddd m0, m13 paddd m2, m12 paddd m0, [pd_64] paddd m2, [pd_64] psubd m4, m13 psubd m6, m12 paddd m4, [pd_64] paddd m6, [pd_64] mova m12, m8 psubd m8, m7 psrad m8, 7 paddd m15, m3 psubd m3, [rsp + 1 * mmsize] psrad m15, 7 paddd m12, m7 psrad m12, 7 paddd m11, m1 mova m13, m14 psrad m11, 7 packssdw m15, m11 psubd m1, [rsp + 0 * mmsize] psrad m1, 7 mova m11, [rsp + 2 * mmsize] paddd m14, m0 psrad m14, 7 psubd m0, m13 psrad m0, 7 paddd m11, m2 mova m13, [rsp + 3 * mmsize] psrad m11, 7 packssdw m14, m11 mova m11, m6 psubd m6, m5 paddd m13, m4 psrad m13, 7 psrad m6, 7 paddd m11, m5 psrad m11, 7 packssdw m13, m11 mova m11, m10 psubd m4, [rsp + 3 * mmsize] psubd m10, m9 psrad m4, 7 psrad m10, 7 packssdw m4, m6 packssdw m8, m10 paddd m11, m9 psrad m11, 7 packssdw m12, m11 psubd m2, [rsp + 2 * mmsize] mova m5, m15 psrad m2, 7 packssdw m0, m2 mova m2, m14 psrad m3, 7 packssdw m3, m1 mova m6, m13 punpcklwd m5, m8 punpcklwd m2, m4 mova m1, m12 punpcklwd m6, m0 punpcklwd m1, m3 mova m9, m5 punpckhwd m13, m0 mova m0, m2 punpcklwd m9, m6 punpckhwd m5, m6 punpcklwd m0, m1 punpckhwd m2, m1 punpckhwd m15, m8 mova m1, m5 punpckhwd m14, m4 punpckhwd m12, m3 mova 
m6, m9 punpckhwd m9, m0 punpcklwd m1, m2 mova m4, [tab_idct8_3 + 0 * mmsize] punpckhwd m5, m2 punpcklwd m6, m0 mova m2, m15 mova m0, m14 mova m7, m9 punpcklwd m2, m13 punpcklwd m0, m12 punpcklwd m7, m5 punpckhwd m14, m12 mova m10, m2 punpckhwd m15, m13 punpckhwd m9, m5 pmaddwd m4, m7 mova m13, m1 punpckhwd m2, m0 punpcklwd m10, m0 mova m0, m15 punpckhwd m15, m14 mova m12, m1 mova m3, [tab_idct8_3 + 0 * mmsize] punpcklwd m0, m14 pmaddwd m3, m9 mova m11, m2 punpckhwd m2, m15 punpcklwd m11, m15 mova m8, [tab_idct8_3 + 1 * mmsize] punpcklwd m13, m0 punpckhwd m12, m0 pmaddwd m8, m11 paddd m8, m4 mova [rsp + 4 * mmsize], m8 mova m4, [tab_idct8_3 + 2 * mmsize] pmaddwd m4, m7 mova m15, [tab_idct8_3 + 2 * mmsize] mova m5, [tab_idct8_3 + 1 * mmsize] pmaddwd m15, m9 pmaddwd m5, m2 paddd m5, m3 mova [rsp + 3 * mmsize], m5 mova m14, [tab_idct8_3 + 3 * mmsize] mova m5, [tab_idct8_3 + 3 * mmsize] pmaddwd m14, m11 paddd m14, m4 mova [rsp + 2 * mmsize], m14 pmaddwd m5, m2 paddd m5, m15 mova [rsp + 1 * mmsize], m5 mova m15, [tab_idct8_3 + 4 * mmsize] mova m5, [tab_idct8_3 + 4 * mmsize] pmaddwd m15, m7 pmaddwd m7, [tab_idct8_3 + 6 * mmsize] pmaddwd m5, m9 pmaddwd m9, [tab_idct8_3 + 6 * mmsize] mova m4, [tab_idct8_3 + 5 * mmsize] pmaddwd m4, m2 paddd m5, m4 mova m4, m6 mova m8, [tab_idct8_3 + 5 * mmsize] punpckhwd m6, m10 pmaddwd m2, [tab_idct8_3 + 7 * mmsize] punpcklwd m4, m10 paddd m9, m2 pmaddwd m8, m11 mova m10, [tab_dct4] paddd m8, m15 pmaddwd m11, [tab_idct8_3 + 7 * mmsize] paddd m7, m11 mova [rsp + 0 * mmsize], m8 pmaddwd m10, m6 pmaddwd m6, [tab_dct4 + 2 * mmsize] mova m1, m10 mova m8, [tab_dct4] mova m3, [tab_dct4 + 1 * mmsize] pmaddwd m8, m4 pmaddwd m4, [tab_dct4 + 2 * mmsize] mova m0, m8 mova m2, [tab_dct4 + 1 * mmsize] pmaddwd m3, m13 psubd m8, m3 paddd m0, m3 mova m3, m6 pmaddwd m13, [tab_dct4 + 3 * mmsize] pmaddwd m2, m12 paddd m1, m2 psubd m10, m2 mova m2, m4 pmaddwd m12, [tab_dct4 + 3 * mmsize] mova m15, [pd_ %+ IDCT_ROUND] paddd m0, m15 paddd m1, m15 paddd m8, m15 paddd m10, m15 paddd m2, m13 paddd m3, m12 paddd m2, m15 paddd m3, m15 psubd m4, m13 psubd m6, m12 paddd m4, m15 paddd m6, m15 mova m15, [rsp + 4 * mmsize] mova m12, m8 psubd m8, m7 psrad m8, IDCT_SHIFT mova m11, [rsp + 3 * mmsize] paddd m15, m0 psrad m15, IDCT_SHIFT psubd m0, [rsp + 4 * mmsize] psrad m0, IDCT_SHIFT paddd m12, m7 paddd m11, m1 mova m14, [rsp + 2 * mmsize] psrad m11, IDCT_SHIFT packssdw m15, m11 psubd m1, [rsp + 3 * mmsize] psrad m1, IDCT_SHIFT mova m11, [rsp + 1 * mmsize] paddd m14, m2 psrad m14, IDCT_SHIFT packssdw m0, m1 psrad m12, IDCT_SHIFT psubd m2, [rsp + 2 * mmsize] paddd m11, m3 mova m13, [rsp + 0 * mmsize] psrad m11, IDCT_SHIFT packssdw m14, m11 mova m11, m6 psubd m6, m5 paddd m13, m4 psrad m13, IDCT_SHIFT mova m1, m15 paddd m11, m5 psrad m11, IDCT_SHIFT packssdw m13, m11 mova m11, m10 psubd m10, m9 psrad m10, IDCT_SHIFT packssdw m8, m10 psrad m6, IDCT_SHIFT psubd m4, [rsp + 0 * mmsize] paddd m11, m9 psrad m11, IDCT_SHIFT packssdw m12, m11 punpcklwd m1, m14 mova m5, m13 psrad m4, IDCT_SHIFT packssdw m4, m6 psubd m3, [rsp + 1 * mmsize] psrad m2, IDCT_SHIFT mova m6, m8 psrad m3, IDCT_SHIFT punpcklwd m5, m12 packssdw m2, m3 punpcklwd m6, m4 punpckhwd m8, m4 mova m4, m1 mova m3, m2 punpckhdq m1, m5 punpckldq m4, m5 punpcklwd m3, m0 punpckhwd m2, m0 mova m0, m6 lea r2, [r2 + r2] lea r4, [r2 + r2] lea r3, [r4 + r2] lea r4, [r4 + r3] lea r0, [r4 + r2 * 2] movq [r1], m4 punpckhwd m15, m14 movhps [r1 + r2], m4 punpckhdq m0, m3 movq [r1 + r2 * 2], m1 punpckhwd m13, m12 movhps [r1 + r3], m1 mova m1, m6 punpckldq m1, 
m3 movq [r1 + 8], m1 movhps [r1 + r2 + 8], m1 movq [r1 + r2 * 2 + 8], m0 movhps [r1 + r3 + 8], m0 mova m0, m15 punpckhdq m15, m13 punpckldq m0, m13 movq [r1 + r2 * 4], m0 movhps [r1 + r4], m0 mova m0, m8 punpckhdq m8, m2 movq [r1 + r3 * 2], m15 punpckldq m0, m2 movhps [r1 + r0], m15 movq [r1 + r2 * 4 + 8], m0 movhps [r1 + r4 + 8], m0 movq [r1 + r3 * 2 + 8], m8 movhps [r1 + r0 + 8], m8 RET %endif ;------------------------------------------------------- ; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride) ;------------------------------------------------------- INIT_XMM ssse3 cglobal partial_butterfly_inverse_internal_pass1 movh m0, [r0] movhps m0, [r0 + 2 * 16] movh m1, [r0 + 4 * 16] movhps m1, [r0 + 6 * 16] punpckhwd m2, m0, m1 ; [2 6] punpcklwd m0, m1 ; [0 4] pmaddwd m1, m0, [r6] ; EE[0] pmaddwd m0, [r6 + 32] ; EE[1] pmaddwd m3, m2, [r6 + 16] ; EO[0] pmaddwd m2, [r6 + 48] ; EO[1] paddd m4, m1, m3 ; E[0] psubd m1, m3 ; E[3] paddd m3, m0, m2 ; E[1] psubd m0, m2 ; E[2] ; E[k] += add (rounding offset) mova m5, [pd_64] paddd m0, m5 paddd m1, m5 paddd m3, m5 paddd m4, m5 movh m2, [r0 + 16] movhps m2, [r0 + 5 * 16] movh m5, [r0 + 3 * 16] movhps m5, [r0 + 7 * 16] punpcklwd m6, m2, m5 ;[1 3] punpckhwd m2, m5 ;[5 7] pmaddwd m5, m6, [r4] pmaddwd m7, m2, [r4 + 16] paddd m5, m7 ; O[0] paddd m7, m4, m5 psrad m7, 7 psubd m4, m5 psrad m4, 7 packssdw m7, m4 movh [r5 + 0 * 16], m7 movhps [r5 + 7 * 16], m7 pmaddwd m5, m6, [r4 + 32] pmaddwd m4, m2, [r4 + 48] paddd m5, m4 ; O[1] paddd m4, m3, m5 psrad m4, 7 psubd m3, m5 psrad m3, 7 packssdw m4, m3 movh [r5 + 1 * 16], m4 movhps [r5 + 6 * 16], m4 pmaddwd m5, m6, [r4 + 64] pmaddwd m4, m2, [r4 + 80] paddd m5, m4 ; O[2] paddd m4, m0, m5 psrad m4, 7 psubd m0, m5 psrad m0, 7 packssdw m4, m0 movh [r5 + 2 * 16], m4 movhps [r5 + 5 * 16], m4 pmaddwd m5, m6, [r4 + 96] pmaddwd m4, m2, [r4 + 112] paddd m5, m4 ; O[3] paddd m4, m1, m5 psrad m4, 7 psubd m1, m5 psrad m1, 7 packssdw m4, m1 movh [r5 + 3 * 16], m4 movhps [r5 + 4 * 16], m4 ret %macro PARTIAL_BUTTERFLY_PROCESS_ROW 1 pshufb m4, %1, [pb_idct8even] pmaddwd m4, [tab_idct8_1] phsubd m5, m4 pshufd m4, m4, 0x4E phaddd m4, m4 punpckhqdq m4, m5 ;m4 = dd e[ 0 1 2 3] paddd m4, m6 pshufb %1, %1, [r6] pmaddwd m5, %1, [r4] pmaddwd %1, [r4 + 16] phaddd m5, %1 ; m5 = dd O[0, 1, 2, 3] paddd %1, m4, m5 psrad %1, IDCT_SHIFT psubd m4, m5 psrad m4, IDCT_SHIFT pshufd m4, m4, 0x1B packssdw %1, m4 %endmacro INIT_XMM ssse3 cglobal partial_butterfly_inverse_internal_pass2 mova m0, [r5] PARTIAL_BUTTERFLY_PROCESS_ROW m0 movu [r1], m0 mova m2, [r5 + 16] PARTIAL_BUTTERFLY_PROCESS_ROW m2 movu [r1 + r2], m2 mova m1, [r5 + 32] PARTIAL_BUTTERFLY_PROCESS_ROW m1 movu [r1 + 2 * r2], m1 mova m3, [r5 + 48] PARTIAL_BUTTERFLY_PROCESS_ROW m3 movu [r1 + r3], m3 ret INIT_XMM ssse3 cglobal idct8, 3,7,8 ;,0-16*mmsize ; align the stack to 64 bytes mov r5, rsp sub rsp, 16*mmsize + gprsize and rsp, ~(64-1) mov [rsp + 16*mmsize], r5 mov r5, rsp lea r4, [tab_idct8_3] lea r6, [tab_dct4] call partial_butterfly_inverse_internal_pass1 add r0, 8 add r5, 8 call partial_butterfly_inverse_internal_pass1 mova m6, [pd_ %+ IDCT_ROUND] add r2, r2 lea r3, [r2 * 3] lea r4, [tab_idct8_2] lea r6, [pb_idct8odd] sub r5, 8 call partial_butterfly_inverse_internal_pass2 lea r1, [r1 + 4 * r2] add r5, 64 call partial_butterfly_inverse_internal_pass2 ; restore the original stack pointer mov rsp, [rsp + 16*mmsize] RET ;----------------------------------------------------------------------------- ; void denoise_dct(int16_t* dct, uint32_t* sum, uint16_t* offset, int size)
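; For each coefficient the routines below accumulate abs(dct[i]) into
; sum[i], then subtract offset[i] from abs(dct[i]) (saturating at zero) and
; store the result back with its original sign; the SSE4 loop handles 8
; coefficients per iteration and the AVX2 loop 16. A scalar C sketch of the
; same behaviour (denoise_dct_c is a hypothetical name, for illustration
; only):
;
;   void denoise_dct_c(int16_t* dct, uint32_t* sum, uint16_t* offset, int size)
;   {
;       for (int i = 0; i < size; i++)
;       {
;           int sign = (dct[i] < 0) ? -1 : 1;
;           int level = (dct[i] < 0) ? -dct[i] : dct[i];
;           sum[i] += level;
;           level = (level > offset[i]) ? (level - offset[i]) : 0;
;           dct[i] = (int16_t)(sign * level);
;       }
;   }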
;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal denoise_dct, 4, 4, 6 pxor m5, m5 shr r3d, 3 .loop: mova m0, [r0] pabsw m1, m0 mova m2, [r1] pmovsxwd m3, m1 paddd m2, m3 mova [r1], m2 mova m2, [r1 + 16] psrldq m3, m1, 8 pmovsxwd m4, m3 paddd m2, m4 mova [r1 + 16], m2 movu m3, [r2] psubusw m1, m3 pcmpgtw m4, m1, m5 pand m1, m4 psignw m1, m0 mova [r0], m1 add r0, 16 add r1, 32 add r2, 16 dec r3d jnz .loop RET INIT_YMM avx2 cglobal denoise_dct, 4, 4, 6 pxor m5, m5 shr r3d, 4 .loop: movu m0, [r0] pabsw m1, m0 movu m2, [r1] pmovsxwd m4, xm1 paddd m2, m4 movu [r1], m2 vextracti128 xm4, m1, 1 movu m2, [r1 + 32] pmovsxwd m3, xm4 paddd m2, m3 movu [r1 + 32], m2 movu m3, [r2] psubusw m1, m3 pcmpgtw m4, m1, m5 pand m1, m4 psignw m1, m0 movu [r0], m1 add r0, 32 add r1, 64 add r2, 32 dec r3d jnz .loop RET %if ARCH_X86_64 == 1 %macro DCT8_PASS_1 4 vpbroadcastq m0, [r6 + %1] pmaddwd m2, m%3, m0 pmaddwd m0, m%4 phaddd m2, m0 paddd m2, m5 psrad m2, DCT_SHIFT packssdw m2, m2 vpermq m2, m2, 0x08 mova [r5 + %2], xm2 %endmacro %macro DCT8_PASS_2 2 vbroadcasti128 m4, [r6 + %1] pmaddwd m6, m0, m4 pmaddwd m7, m1, m4 pmaddwd m8, m2, m4 pmaddwd m9, m3, m4 phaddd m6, m7 phaddd m8, m9 phaddd m6, m8 paddd m6, m5 psrad m6, DCT_SHIFT2 vbroadcasti128 m4, [r6 + %2] pmaddwd m10, m0, m4 pmaddwd m7, m1, m4 pmaddwd m8, m2, m4 pmaddwd m9, m3, m4 phaddd m10, m7 phaddd m8, m9 phaddd m10, m8 paddd m10, m5 psrad m10, DCT_SHIFT2 packssdw m6, m10 vpermq m10, m6, 0xD8 %endmacro INIT_YMM avx2 cglobal dct8, 3, 7, 11, 0-8*16 %if BIT_DEPTH == 12 %define DCT_SHIFT 6 vbroadcasti128 m5, [pd_16] %elif BIT_DEPTH == 10 %define DCT_SHIFT 4 vbroadcasti128 m5, [pd_8] %elif BIT_DEPTH == 8 %define DCT_SHIFT 2 vbroadcasti128 m5, [pd_2] %else %error Unsupported BIT_DEPTH! 
%endif %define DCT_SHIFT2 9 add r2d, r2d lea r3, [r2 * 3] lea r4, [r0 + r2 * 4] mov r5, rsp lea r6, [tab_dct8] mova m6, [dct8_shuf] ;pass1 mova xm0, [r0] vinserti128 m0, m0, [r4], 1 mova xm1, [r0 + r2] vinserti128 m1, m1, [r4 + r2], 1 mova xm2, [r0 + r2 * 2] vinserti128 m2, m2, [r4 + r2 * 2], 1 mova xm3, [r0 + r3] vinserti128 m3, m3, [r4 + r3], 1 punpcklqdq m4, m0, m1 punpckhqdq m0, m1 punpcklqdq m1, m2, m3 punpckhqdq m2, m3 pshufb m0, m6 pshufb m2, m6 paddw m3, m4, m0 paddw m7, m1, m2 psubw m4, m0 psubw m1, m2 DCT8_PASS_1 0 * 16, 0 * 16, 3, 7 DCT8_PASS_1 1 * 16, 2 * 16, 4, 1 DCT8_PASS_1 2 * 16, 4 * 16, 3, 7 DCT8_PASS_1 3 * 16, 6 * 16, 4, 1 DCT8_PASS_1 4 * 16, 1 * 16, 3, 7 DCT8_PASS_1 5 * 16, 3 * 16, 4, 1 DCT8_PASS_1 6 * 16, 5 * 16, 3, 7 DCT8_PASS_1 7 * 16, 7 * 16, 4, 1 ;pass2 vbroadcasti128 m5, [pd_256] mova m0, [r5] mova m1, [r5 + 32] mova m2, [r5 + 64] mova m3, [r5 + 96] DCT8_PASS_2 0 * 16, 1 * 16 movu [r1], m10 DCT8_PASS_2 2 * 16, 3 * 16 movu [r1 + 32], m10 DCT8_PASS_2 4 * 16, 5 * 16 movu [r1 + 64], m10 DCT8_PASS_2 6 * 16, 7 * 16 movu [r1 + 96], m10 RET %macro DCT16_PASS_1_E 2 vpbroadcastq m7, [r7 + %1] pmaddwd m4, m0, m7 pmaddwd m6, m2, m7 phaddd m4, m6 paddd m4, m9 psrad m4, DCT_SHIFT packssdw m4, m4 vpermq m4, m4, 0x08 mova [r5 + %2], xm4 %endmacro %macro DCT16_PASS_1_O 2 vbroadcasti128 m7, [r7 + %1] pmaddwd m10, m0, m7 pmaddwd m11, m2, m7 phaddd m10, m11 ; [d0 d0 d1 d1 d4 d4 d5 d5] pmaddwd m11, m4, m7 pmaddwd m12, m6, m7 phaddd m11, m12 ; [d2 d2 d3 d3 d6 d6 d7 d7] phaddd m10, m11 ; [d0 d1 d2 d3 d4 d5 d6 d7] paddd m10, m9 psrad m10, DCT_SHIFT packssdw m10, m10 ; [w0 w1 w2 w3 - - - - w4 w5 w6 w7 - - - -] vpermq m10, m10, 0x08 mova [r5 + %2], xm10 %endmacro %macro DCT16_PASS_2 2 vbroadcasti128 m8, [r7 + %1] vbroadcasti128 m13, [r8 + %1] pmaddwd m10, m0, m8 pmaddwd m11, m1, m13 paddd m10, m11 pmaddwd m11, m2, m8 pmaddwd m12, m3, m13 paddd m11, m12 phaddd m10, m11 pmaddwd m11, m4, m8 pmaddwd m12, m5, m13 paddd m11, m12 pmaddwd m12, m6, m8 pmaddwd m13, m7, m13 paddd m12, m13 phaddd m11, m12 phaddd m10, m11 paddd m10, m9 psrad m10, DCT_SHIFT2 vbroadcasti128 m8, [r7 + %2] vbroadcasti128 m13, [r8 + %2] pmaddwd m14, m0, m8 pmaddwd m11, m1, m13 paddd m14, m11 pmaddwd m11, m2, m8 pmaddwd m12, m3, m13 paddd m11, m12 phaddd m14, m11 pmaddwd m11, m4, m8 pmaddwd m12, m5, m13 paddd m11, m12 pmaddwd m12, m6, m8 pmaddwd m13, m7, m13 paddd m12, m13 phaddd m11, m12 phaddd m14, m11 paddd m14, m9 psrad m14, DCT_SHIFT2 packssdw m10, m14 vextracti128 xm14, m10, 1 movlhps xm15, xm10, xm14 movhlps xm14, xm10 %endmacro INIT_YMM avx2 cglobal dct16, 3, 9, 16, 0-16*mmsize %if BIT_DEPTH == 12 %define DCT_SHIFT 7 vbroadcasti128 m9, [pd_64] %elif BIT_DEPTH == 10 %define DCT_SHIFT 5 vbroadcasti128 m9, [pd_16] %elif BIT_DEPTH == 8 %define DCT_SHIFT 3 vbroadcasti128 m9, [pd_4] %else %error Unsupported BIT_DEPTH! 
%endif %define DCT_SHIFT2 10 add r2d, r2d mova m13, [dct16_shuf1] mova m14, [dct16_shuf2] lea r7, [tab_dct16_1 + 8 * 16] lea r8, [tab_dct16_2 + 8 * 16] lea r3, [r2 * 3] mov r5, rsp mov r4d, 2 ; Each iteration process 8 rows, so 16/8 iterations .pass1: lea r6, [r0 + r2 * 4] movu m2, [r0] movu m1, [r6] vperm2i128 m0, m2, m1, 0x20 ; [row0lo row4lo] vperm2i128 m1, m2, m1, 0x31 ; [row0hi row4hi] movu m4, [r0 + r2] movu m3, [r6 + r2] vperm2i128 m2, m4, m3, 0x20 ; [row1lo row5lo] vperm2i128 m3, m4, m3, 0x31 ; [row1hi row5hi] movu m6, [r0 + r2 * 2] movu m5, [r6 + r2 * 2] vperm2i128 m4, m6, m5, 0x20 ; [row2lo row6lo] vperm2i128 m5, m6, m5, 0x31 ; [row2hi row6hi] movu m8, [r0 + r3] movu m7, [r6 + r3] vperm2i128 m6, m8, m7, 0x20 ; [row3lo row7lo] vperm2i128 m7, m8, m7, 0x31 ; [row3hi row7hi] pshufb m1, m13 pshufb m3, m13 pshufb m5, m13 pshufb m7, m13 paddw m8, m0, m1 ;E psubw m0, m1 ;O paddw m1, m2, m3 ;E psubw m2, m3 ;O paddw m3, m4, m5 ;E psubw m4, m5 ;O paddw m5, m6, m7 ;E psubw m6, m7 ;O DCT16_PASS_1_O -7 * 16, 1 * 32 DCT16_PASS_1_O -5 * 16, 3 * 32 DCT16_PASS_1_O -3 * 16, 1 * 32 + 16 DCT16_PASS_1_O -1 * 16, 3 * 32 + 16 DCT16_PASS_1_O 1 * 16, 5 * 32 DCT16_PASS_1_O 3 * 16, 7 * 32 DCT16_PASS_1_O 5 * 16, 5 * 32 + 16 DCT16_PASS_1_O 7 * 16, 7 * 32 + 16 pshufb m8, m14 pshufb m1, m14 phaddw m0, m8, m1 pshufb m3, m14 pshufb m5, m14 phaddw m2, m3, m5 DCT16_PASS_1_E -8 * 16, 0 * 32 DCT16_PASS_1_E -4 * 16, 0 * 32 + 16 DCT16_PASS_1_E 0 * 16, 4 * 32 DCT16_PASS_1_E 4 * 16, 4 * 32 + 16 phsubw m0, m8, m1 phsubw m2, m3, m5 DCT16_PASS_1_E -6 * 16, 2 * 32 DCT16_PASS_1_E -2 * 16, 2 * 32 + 16 DCT16_PASS_1_E 2 * 16, 6 * 32 DCT16_PASS_1_E 6 * 16, 6 * 32 + 16 lea r0, [r0 + 8 * r2] add r5, 256 dec r4d jnz .pass1 mov r5, rsp mov r4d, 2 mov r2d, 32 lea r3, [r2 * 3] vbroadcasti128 m9, [pd_512] .pass2: mova m0, [r5 + 0 * 32] ; [row0lo row4lo] mova m1, [r5 + 8 * 32] ; [row0hi row4hi] mova m2, [r5 + 1 * 32] ; [row1lo row5lo] mova m3, [r5 + 9 * 32] ; [row1hi row5hi] mova m4, [r5 + 2 * 32] ; [row2lo row6lo] mova m5, [r5 + 10 * 32] ; [row2hi row6hi] mova m6, [r5 + 3 * 32] ; [row3lo row7lo] mova m7, [r5 + 11 * 32] ; [row3hi row7hi] DCT16_PASS_2 -8 * 16, -7 * 16 movu [r1], xm15 movu [r1 + r2], xm14 DCT16_PASS_2 -6 * 16, -5 * 16 movu [r1 + r2 * 2], xm15 movu [r1 + r3], xm14 lea r6, [r1 + r2 * 4] DCT16_PASS_2 -4 * 16, -3 * 16 movu [r6], xm15 movu [r6 + r2], xm14 DCT16_PASS_2 -2 * 16, -1 * 16 movu [r6 + r2 * 2], xm15 movu [r6 + r3], xm14 lea r6, [r6 + r2 * 4] DCT16_PASS_2 0 * 16, 1 * 16 movu [r6], xm15 movu [r6 + r2], xm14 DCT16_PASS_2 2 * 16, 3 * 16 movu [r6 + r2 * 2], xm15 movu [r6 + r3], xm14 lea r6, [r6 + r2 * 4] DCT16_PASS_2 4 * 16, 5 * 16 movu [r6], xm15 movu [r6 + r2], xm14 DCT16_PASS_2 6 * 16, 7 * 16 movu [r6 + r2 * 2], xm15 movu [r6 + r3], xm14 add r1, 16 add r5, 128 dec r4d jnz .pass2 RET %macro DCT32_PASS_1 4 vbroadcasti128 m8, [r7 + %1] pmaddwd m11, m%3, m8 pmaddwd m12, m%4, m8 phaddd m11, m12 vbroadcasti128 m8, [r7 + %1 + 32] vbroadcasti128 m10, [r7 + %1 + 48] pmaddwd m12, m5, m8 pmaddwd m13, m6, m10 phaddd m12, m13 pmaddwd m13, m4, m8 pmaddwd m14, m7, m10 phaddd m13, m14 phaddd m12, m13 phaddd m11, m12 paddd m11, m9 psrad m11, DCT_SHIFT vpermq m11, m11, 0xD8 packssdw m11, m11 movq [r5 + %2], xm11 vextracti128 xm10, m11, 1 movq [r5 + %2 + 64], xm10 %endmacro %macro DCT32_PASS_2 1 mova m8, [r7 + %1] mova m10, [r8 + %1] pmaddwd m11, m0, m8 pmaddwd m12, m1, m10 paddd m11, m12 pmaddwd m12, m2, m8 pmaddwd m13, m3, m10 paddd m12, m13 phaddd m11, m12 pmaddwd m12, m4, m8 pmaddwd m13, m5, m10 paddd m12, m13 pmaddwd m13, m6, m8 
pmaddwd m14, m7, m10 paddd m13, m14 phaddd m12, m13 phaddd m11, m12 vextracti128 xm10, m11, 1 paddd xm11, xm10 paddd xm11, xm9 psrad xm11, DCT_SHIFT2 packssdw xm11, xm11 %endmacro INIT_YMM avx2 cglobal dct32, 3, 9, 16, 0-64*mmsize %if BIT_DEPTH == 12 %define DCT_SHIFT 8 vpbroadcastq m9, [pd_128] %elif BIT_DEPTH == 10 %define DCT_SHIFT 6 vpbroadcastq m9, [pd_32] %elif BIT_DEPTH == 8 %define DCT_SHIFT 4 vpbroadcastq m9, [pd_8] %else %error Unsupported BIT_DEPTH! %endif %define DCT_SHIFT2 11 add r2d, r2d lea r7, [tab_dct32_1] lea r8, [tab_dct32_2] lea r3, [r2 * 3] mov r5, rsp mov r4d, 8 mova m15, [dct16_shuf1] .pass1: movu m2, [r0] movu m1, [r0 + 32] pshufb m1, m15 vpermq m1, m1, 0x4E psubw m7, m2, m1 paddw m2, m1 movu m1, [r0 + r2 * 2] movu m0, [r0 + r2 * 2 + 32] pshufb m0, m15 vpermq m0, m0, 0x4E psubw m8, m1, m0 paddw m1, m0 vperm2i128 m0, m2, m1, 0x20 ; [row0lo row2lo] for E vperm2i128 m3, m2, m1, 0x31 ; [row0hi row2hi] for E pshufb m3, m15 psubw m1, m0, m3 paddw m0, m3 vperm2i128 m5, m7, m8, 0x20 ; [row0lo row2lo] for O vperm2i128 m6, m7, m8, 0x31 ; [row0hi row2hi] for O movu m4, [r0 + r2] movu m2, [r0 + r2 + 32] pshufb m2, m15 vpermq m2, m2, 0x4E psubw m10, m4, m2 paddw m4, m2 movu m3, [r0 + r3] movu m2, [r0 + r3 + 32] pshufb m2, m15 vpermq m2, m2, 0x4E psubw m11, m3, m2 paddw m3, m2 vperm2i128 m2, m4, m3, 0x20 ; [row1lo row3lo] for E vperm2i128 m8, m4, m3, 0x31 ; [row1hi row3hi] for E pshufb m8, m15 psubw m3, m2, m8 paddw m2, m8 vperm2i128 m4, m10, m11, 0x20 ; [row1lo row3lo] for O vperm2i128 m7, m10, m11, 0x31 ; [row1hi row3hi] for O DCT32_PASS_1 0 * 32, 0 * 64, 0, 2 DCT32_PASS_1 2 * 32, 2 * 64, 1, 3 DCT32_PASS_1 4 * 32, 4 * 64, 0, 2 DCT32_PASS_1 6 * 32, 6 * 64, 1, 3 DCT32_PASS_1 8 * 32, 8 * 64, 0, 2 DCT32_PASS_1 10 * 32, 10 * 64, 1, 3 DCT32_PASS_1 12 * 32, 12 * 64, 0, 2 DCT32_PASS_1 14 * 32, 14 * 64, 1, 3 DCT32_PASS_1 16 * 32, 16 * 64, 0, 2 DCT32_PASS_1 18 * 32, 18 * 64, 1, 3 DCT32_PASS_1 20 * 32, 20 * 64, 0, 2 DCT32_PASS_1 22 * 32, 22 * 64, 1, 3 DCT32_PASS_1 24 * 32, 24 * 64, 0, 2 DCT32_PASS_1 26 * 32, 26 * 64, 1, 3 DCT32_PASS_1 28 * 32, 28 * 64, 0, 2 DCT32_PASS_1 30 * 32, 30 * 64, 1, 3 add r5, 8 lea r0, [r0 + r2 * 4] dec r4d jnz .pass1 mov r2d, 64 lea r3, [r2 * 3] mov r5, rsp mov r4d, 8 vpbroadcastq m9, [pd_1024] .pass2: mova m0, [r5 + 0 * 64] mova m1, [r5 + 0 * 64 + 32] mova m2, [r5 + 1 * 64] mova m3, [r5 + 1 * 64 + 32] mova m4, [r5 + 2 * 64] mova m5, [r5 + 2 * 64 + 32] mova m6, [r5 + 3 * 64] mova m7, [r5 + 3 * 64 + 32] DCT32_PASS_2 0 * 32 movq [r1], xm11 DCT32_PASS_2 1 * 32 movq [r1 + r2], xm11 DCT32_PASS_2 2 * 32 movq [r1 + r2 * 2], xm11 DCT32_PASS_2 3 * 32 movq [r1 + r3], xm11 lea r6, [r1 + r2 * 4] DCT32_PASS_2 4 * 32 movq [r6], xm11 DCT32_PASS_2 5 * 32 movq [r6 + r2], xm11 DCT32_PASS_2 6 * 32 movq [r6 + r2 * 2], xm11 DCT32_PASS_2 7 * 32 movq [r6 + r3], xm11 lea r6, [r6 + r2 * 4] DCT32_PASS_2 8 * 32 movq [r6], xm11 DCT32_PASS_2 9 * 32 movq [r6 + r2], xm11 DCT32_PASS_2 10 * 32 movq [r6 + r2 * 2], xm11 DCT32_PASS_2 11 * 32 movq [r6 + r3], xm11 lea r6, [r6 + r2 * 4] DCT32_PASS_2 12 * 32 movq [r6], xm11 DCT32_PASS_2 13 * 32 movq [r6 + r2], xm11 DCT32_PASS_2 14 * 32 movq [r6 + r2 * 2], xm11 DCT32_PASS_2 15 * 32 movq [r6 + r3], xm11 lea r6, [r6 + r2 * 4] DCT32_PASS_2 16 * 32 movq [r6], xm11 DCT32_PASS_2 17 * 32 movq [r6 + r2], xm11 DCT32_PASS_2 18 * 32 movq [r6 + r2 * 2], xm11 DCT32_PASS_2 19 * 32 movq [r6 + r3], xm11 lea r6, [r6 + r2 * 4] DCT32_PASS_2 20 * 32 movq [r6], xm11 DCT32_PASS_2 21 * 32 movq [r6 + r2], xm11 DCT32_PASS_2 22 * 32 movq [r6 + r2 * 2], xm11 DCT32_PASS_2 23 * 32 
INIT_YMM avx2
cglobal dct32, 3, 9, 16, 0-64*mmsize
%if BIT_DEPTH == 12
    %define         DCT_SHIFT          8
    vpbroadcastq    m9, [pd_128]
%elif BIT_DEPTH == 10
    %define         DCT_SHIFT          6
    vpbroadcastq    m9, [pd_32]
%elif BIT_DEPTH == 8
    %define         DCT_SHIFT          4
    vpbroadcastq    m9, [pd_8]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             DCT_SHIFT2         11

    add             r2d, r2d
    lea             r7, [tab_dct32_1]
    lea             r8, [tab_dct32_2]
    lea             r3, [r2 * 3]
    mov             r5, rsp
    mov             r4d, 8
    mova            m15, [dct16_shuf1]

.pass1:
    movu            m2, [r0]
    movu            m1, [r0 + 32]
    pshufb          m1, m15
    vpermq          m1, m1, 0x4E
    psubw           m7, m2, m1
    paddw           m2, m1

    movu            m1, [r0 + r2 * 2]
    movu            m0, [r0 + r2 * 2 + 32]
    pshufb          m0, m15
    vpermq          m0, m0, 0x4E
    psubw           m8, m1, m0
    paddw           m1, m0

    vperm2i128      m0, m2, m1, 0x20        ; [row0lo row2lo] for E
    vperm2i128      m3, m2, m1, 0x31        ; [row0hi row2hi] for E
    pshufb          m3, m15
    psubw           m1, m0, m3
    paddw           m0, m3

    vperm2i128      m5, m7, m8, 0x20        ; [row0lo row2lo] for O
    vperm2i128      m6, m7, m8, 0x31        ; [row0hi row2hi] for O

    movu            m4, [r0 + r2]
    movu            m2, [r0 + r2 + 32]
    pshufb          m2, m15
    vpermq          m2, m2, 0x4E
    psubw           m10, m4, m2
    paddw           m4, m2

    movu            m3, [r0 + r3]
    movu            m2, [r0 + r3 + 32]
    pshufb          m2, m15
    vpermq          m2, m2, 0x4E
    psubw           m11, m3, m2
    paddw           m3, m2

    vperm2i128      m2, m4, m3, 0x20        ; [row1lo row3lo] for E
    vperm2i128      m8, m4, m3, 0x31        ; [row1hi row3hi] for E
    pshufb          m8, m15
    psubw           m3, m2, m8
    paddw           m2, m8

    vperm2i128      m4, m10, m11, 0x20      ; [row1lo row3lo] for O
    vperm2i128      m7, m10, m11, 0x31      ; [row1hi row3hi] for O

    DCT32_PASS_1    0 * 32,  0 * 64,  0, 2
    DCT32_PASS_1    2 * 32,  2 * 64,  1, 3
    DCT32_PASS_1    4 * 32,  4 * 64,  0, 2
    DCT32_PASS_1    6 * 32,  6 * 64,  1, 3
    DCT32_PASS_1    8 * 32,  8 * 64,  0, 2
    DCT32_PASS_1    10 * 32, 10 * 64, 1, 3
    DCT32_PASS_1    12 * 32, 12 * 64, 0, 2
    DCT32_PASS_1    14 * 32, 14 * 64, 1, 3
    DCT32_PASS_1    16 * 32, 16 * 64, 0, 2
    DCT32_PASS_1    18 * 32, 18 * 64, 1, 3
    DCT32_PASS_1    20 * 32, 20 * 64, 0, 2
    DCT32_PASS_1    22 * 32, 22 * 64, 1, 3
    DCT32_PASS_1    24 * 32, 24 * 64, 0, 2
    DCT32_PASS_1    26 * 32, 26 * 64, 1, 3
    DCT32_PASS_1    28 * 32, 28 * 64, 0, 2
    DCT32_PASS_1    30 * 32, 30 * 64, 1, 3

    add             r5, 8
    lea             r0, [r0 + r2 * 4]
    dec             r4d
    jnz             .pass1

    mov             r2d, 64
    lea             r3, [r2 * 3]
    mov             r5, rsp
    mov             r4d, 8
    vpbroadcastq    m9, [pd_1024]

.pass2:
    mova            m0, [r5 + 0 * 64]
    mova            m1, [r5 + 0 * 64 + 32]
    mova            m2, [r5 + 1 * 64]
    mova            m3, [r5 + 1 * 64 + 32]
    mova            m4, [r5 + 2 * 64]
    mova            m5, [r5 + 2 * 64 + 32]
    mova            m6, [r5 + 3 * 64]
    mova            m7, [r5 + 3 * 64 + 32]

    DCT32_PASS_2    0 * 32
    movq            [r1], xm11
    DCT32_PASS_2    1 * 32
    movq            [r1 + r2], xm11
    DCT32_PASS_2    2 * 32
    movq            [r1 + r2 * 2], xm11
    DCT32_PASS_2    3 * 32
    movq            [r1 + r3], xm11

    lea             r6, [r1 + r2 * 4]
    DCT32_PASS_2    4 * 32
    movq            [r6], xm11
    DCT32_PASS_2    5 * 32
    movq            [r6 + r2], xm11
    DCT32_PASS_2    6 * 32
    movq            [r6 + r2 * 2], xm11
    DCT32_PASS_2    7 * 32
    movq            [r6 + r3], xm11

    lea             r6, [r6 + r2 * 4]
    DCT32_PASS_2    8 * 32
    movq            [r6], xm11
    DCT32_PASS_2    9 * 32
    movq            [r6 + r2], xm11
    DCT32_PASS_2    10 * 32
    movq            [r6 + r2 * 2], xm11
    DCT32_PASS_2    11 * 32
    movq            [r6 + r3], xm11

    lea             r6, [r6 + r2 * 4]
    DCT32_PASS_2    12 * 32
    movq            [r6], xm11
    DCT32_PASS_2    13 * 32
    movq            [r6 + r2], xm11
    DCT32_PASS_2    14 * 32
    movq            [r6 + r2 * 2], xm11
    DCT32_PASS_2    15 * 32
    movq            [r6 + r3], xm11

    lea             r6, [r6 + r2 * 4]
    DCT32_PASS_2    16 * 32
    movq            [r6], xm11
    DCT32_PASS_2    17 * 32
    movq            [r6 + r2], xm11
    DCT32_PASS_2    18 * 32
    movq            [r6 + r2 * 2], xm11
    DCT32_PASS_2    19 * 32
    movq            [r6 + r3], xm11

    lea             r6, [r6 + r2 * 4]
    DCT32_PASS_2    20 * 32
    movq            [r6], xm11
    DCT32_PASS_2    21 * 32
    movq            [r6 + r2], xm11
    DCT32_PASS_2    22 * 32
    movq            [r6 + r2 * 2], xm11
    DCT32_PASS_2    23 * 32
    movq            [r6 + r3], xm11

    lea             r6, [r6 + r2 * 4]
    DCT32_PASS_2    24 * 32
    movq            [r6], xm11
    DCT32_PASS_2    25 * 32
    movq            [r6 + r2], xm11
    DCT32_PASS_2    26 * 32
    movq            [r6 + r2 * 2], xm11
    DCT32_PASS_2    27 * 32
    movq            [r6 + r3], xm11

    lea             r6, [r6 + r2 * 4]
    DCT32_PASS_2    28 * 32
    movq            [r6], xm11
    DCT32_PASS_2    29 * 32
    movq            [r6 + r2], xm11
    DCT32_PASS_2    30 * 32
    movq            [r6 + r2 * 2], xm11
    DCT32_PASS_2    31 * 32
    movq            [r6 + r3], xm11

    add             r5, 256
    add             r1, 8
    dec             r4d
    jnz             .pass2
    RET
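; IDCT8_PASS_1 and IDCT8_PASS_2 implement the two passes of the 8-point
; inverse transform below: pass 1 multiplies the even/odd interleaved
; inputs against the avx2_idct8_1 (r5) and avx2_idct8_2 (r6) tables and
; packs 16-bit intermediates after rounding with m11 (1 << (IDCT_SHIFT1 - 1));
; pass 2 repeats the dot products on the transposed data, rounding with
; m12 (1 << (IDCT_SHIFT2 - 1)).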
%macro IDCT8_PASS_1 1
    vpbroadcastd    m7, [r5 + %1]
    vpbroadcastd    m10, [r5 + %1 + 4]
    pmaddwd         m5, m4, m7
    pmaddwd         m6, m0, m10
    paddd           m5, m6

    vpbroadcastd    m7, [r6 + %1]
    vpbroadcastd    m10, [r6 + %1 + 4]
    pmaddwd         m6, m1, m7
    pmaddwd         m3, m2, m10
    paddd           m6, m3

    paddd           m3, m5, m6
    paddd           m3, m11
    psrad           m3, IDCT_SHIFT1

    psubd           m5, m6
    paddd           m5, m11
    psrad           m5, IDCT_SHIFT1

    vpbroadcastd    m7, [r5 + %1 + 32]
    vpbroadcastd    m10, [r5 + %1 + 36]
    pmaddwd         m6, m4, m7
    pmaddwd         m8, m0, m10
    paddd           m6, m8

    vpbroadcastd    m7, [r6 + %1 + 32]
    vpbroadcastd    m10, [r6 + %1 + 36]
    pmaddwd         m8, m1, m7
    pmaddwd         m9, m2, m10
    paddd           m8, m9

    paddd           m9, m6, m8
    paddd           m9, m11
    psrad           m9, IDCT_SHIFT1

    psubd           m6, m8
    paddd           m6, m11
    psrad           m6, IDCT_SHIFT1

    packssdw        m3, m9
    vpermq          m3, m3, 0xD8

    packssdw        m6, m5
    vpermq          m6, m6, 0xD8
%endmacro

%macro IDCT8_PASS_2 0
    punpcklqdq      m2, m0, m1
    punpckhqdq      m0, m1

    pmaddwd         m3, m2, [r5]
    pmaddwd         m5, m2, [r5 + 32]
    pmaddwd         m6, m2, [r5 + 64]
    pmaddwd         m7, m2, [r5 + 96]
    phaddd          m3, m5
    phaddd          m6, m7
    pshufb          m3, [idct8_shuf2]
    pshufb          m6, [idct8_shuf2]
    punpcklqdq      m7, m3, m6
    punpckhqdq      m3, m6

    pmaddwd         m5, m0, [r6]
    pmaddwd         m6, m0, [r6 + 32]
    pmaddwd         m8, m0, [r6 + 64]
    pmaddwd         m9, m0, [r6 + 96]
    phaddd          m5, m6
    phaddd          m8, m9
    pshufb          m5, [idct8_shuf2]
    pshufb          m8, [idct8_shuf2]
    punpcklqdq      m6, m5, m8
    punpckhqdq      m5, m8

    paddd           m8, m7, m6
    paddd           m8, m12
    psrad           m8, IDCT_SHIFT2

    psubd           m7, m6
    paddd           m7, m12
    psrad           m7, IDCT_SHIFT2

    pshufb          m7, [idct8_shuf3]
    packssdw        m8, m7

    paddd           m9, m3, m5
    paddd           m9, m12
    psrad           m9, IDCT_SHIFT2

    psubd           m3, m5
    paddd           m3, m12
    psrad           m3, IDCT_SHIFT2

    pshufb          m3, [idct8_shuf3]
    packssdw        m9, m3
%endmacro
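;-------------------------------------------------------
; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------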
INIT_YMM avx2
cglobal idct8, 3, 7, 13, 0-8*16
%if BIT_DEPTH == 12
    %define         IDCT_SHIFT2        8
    vpbroadcastd    m12, [pd_256]
%elif BIT_DEPTH == 10
    %define         IDCT_SHIFT2        10
    vpbroadcastd    m12, [pd_512]
%elif BIT_DEPTH == 8
    %define         IDCT_SHIFT2        12
    vpbroadcastd    m12, [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             IDCT_SHIFT1        7

    vbroadcasti128  m11, [pd_64]

    mov             r4, rsp
    lea             r5, [avx2_idct8_1]
    lea             r6, [avx2_idct8_2]

    ;pass1
    mova            m1, [r0 + 0 * 32]       ; [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1]
    mova            m0, [r0 + 1 * 32]       ; [2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3]
    vpunpcklwd      m5, m1, m0              ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
    vpunpckhwd      m1, m0                  ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
    vinserti128     m4, m5, xm1, 1          ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
    vextracti128    xm2, m5, 1              ; [1 3 1 3 1 3 1 3]
    vinserti128     m1, m1, xm2, 0          ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]

    mova            m2, [r0 + 2 * 32]       ; [4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5]
    mova            m0, [r0 + 3 * 32]       ; [6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7]
    vpunpcklwd      m5, m2, m0              ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
    vpunpckhwd      m2, m0                  ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
    vinserti128     m0, m5, xm2, 1          ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
    vextracti128    xm5, m5, 1              ; [5 7 5 7 5 7 5 7]
    vinserti128     m2, m2, xm5, 0          ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]

    mova            m5, [idct8_shuf1]
    vpermd          m4, m5, m4
    vpermd          m0, m5, m0
    vpermd          m1, m5, m1
    vpermd          m2, m5, m2

    IDCT8_PASS_1    0
    mova            [r4], m3
    mova            [r4 + 96], m6

    IDCT8_PASS_1    64
    mova            [r4 + 32], m3
    mova            [r4 + 64], m6

    ;pass2
    add             r2d, r2d
    lea             r3, [r2 * 3]

    mova            m0, [r4]
    mova            m1, [r4 + 32]
    IDCT8_PASS_2

    vextracti128    xm3, m8, 1
    mova            [r1], xm8
    mova            [r1 + r2], xm3
    vextracti128    xm3, m9, 1
    mova            [r1 + r2 * 2], xm9
    mova            [r1 + r3], xm3

    lea             r1, [r1 + r2 * 4]
    mova            m0, [r4 + 64]
    mova            m1, [r4 + 96]
    IDCT8_PASS_2

    vextracti128    xm3, m8, 1
    mova            [r1], xm8
    mova            [r1 + r2], xm3
    vextracti128    xm3, m9, 1
    mova            [r1 + r2 * 2], xm9
    mova            [r1 + r3], xm3
    RET
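; IDCT_PASS1 computes the first pass of the 16-point inverse transform for
; one pair of output rows: even-position inputs are multiplied against
; tab_idct16_2 and odd-position inputs against tab_idct16_1, rounded by
; m14 (1 << (IDCT_SHIFT1 - 1)), then rows %1 and %2 are packed to 16 bits
; and stored to the stack scratch buffer at r3.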
%macro IDCT_PASS1 2
    vbroadcasti128  m5, [tab_idct16_2 + %1 * 16]

    pmaddwd         m9, m0, m5
    pmaddwd         m10, m7, m5
    phaddd          m9, m10

    pmaddwd         m10, m6, m5
    pmaddwd         m11, m8, m5
    phaddd          m10, m11
    phaddd          m9, m10

    vbroadcasti128  m5, [tab_idct16_1 + %1 * 16]

    pmaddwd         m10, m1, m5
    pmaddwd         m11, m3, m5
    phaddd          m10, m11

    pmaddwd         m11, m4, m5
    pmaddwd         m12, m2, m5
    phaddd          m11, m12
    phaddd          m10, m11

    paddd           m11, m9, m10
    paddd           m11, m14
    psrad           m11, IDCT_SHIFT1

    psubd           m9, m10
    paddd           m9, m14
    psrad           m9, IDCT_SHIFT1

    vbroadcasti128  m5, [tab_idct16_2 + %1 * 16 + 16]

    pmaddwd         m10, m0, m5
    pmaddwd         m12, m7, m5
    phaddd          m10, m12

    pmaddwd         m12, m6, m5
    pmaddwd         m13, m8, m5
    phaddd          m12, m13
    phaddd          m10, m12

    vbroadcasti128  m5, [tab_idct16_1 + %1 * 16 + 16]

    pmaddwd         m12, m1, m5
    pmaddwd         m13, m3, m5
    phaddd          m12, m13

    pmaddwd         m13, m4, m5
    pmaddwd         m5, m2
    phaddd          m13, m5
    phaddd          m12, m13

    paddd           m5, m10, m12
    paddd           m5, m14
    psrad           m5, IDCT_SHIFT1

    psubd           m10, m12
    paddd           m10, m14
    psrad           m10, IDCT_SHIFT1

    packssdw        m11, m5
    packssdw        m9, m10

    mova            m10, [idct16_shuff]
    mova            m5, [idct16_shuff1]

    vpermd          m12, m10, m11
    vpermd          m13, m5, m9
    mova            [r3 + %1 * 16 * 2], xm12
    mova            [r3 + %2 * 16 * 2], xm13
    vextracti128    [r3 + %2 * 16 * 2 + 32], m13, 1
    vextracti128    [r3 + %1 * 16 * 2 + 32], m12, 1
%endmacro

;-------------------------------------------------------
; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_YMM avx2
cglobal idct16, 3, 7, 16, 0-16*mmsize
%if BIT_DEPTH == 12
    %define         IDCT_SHIFT2        8
    vpbroadcastd    m15, [pd_256]
%elif BIT_DEPTH == 10
    %define         IDCT_SHIFT2        10
    vpbroadcastd    m15, [pd_512]
%elif BIT_DEPTH == 8
    %define         IDCT_SHIFT2        12
    vpbroadcastd    m15, [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             IDCT_SHIFT1        7

    vbroadcasti128  m14, [pd_64]

    add             r2d, r2d
    mov             r3, rsp
    mov             r4d, 2

.pass1:
    movu            xm0, [r0 + 0 * 32]
    movu            xm1, [r0 + 8 * 32]
    punpckhqdq      xm2, xm0, xm1
    punpcklqdq      xm0, xm1
    vinserti128     m0, m0, xm2, 1

    movu            xm1, [r0 + 1 * 32]
    movu            xm2, [r0 + 9 * 32]
    punpckhqdq      xm3, xm1, xm2
    punpcklqdq      xm1, xm2
    vinserti128     m1, m1, xm3, 1

    movu            xm2, [r0 + 2 * 32]
    movu            xm3, [r0 + 10 * 32]
    punpckhqdq      xm4, xm2, xm3
    punpcklqdq      xm2, xm3
    vinserti128     m2, m2, xm4, 1

    movu            xm3, [r0 + 3 * 32]
    movu            xm4, [r0 + 11 * 32]
    punpckhqdq      xm5, xm3, xm4
    punpcklqdq      xm3, xm4
    vinserti128     m3, m3, xm5, 1

    movu            xm4, [r0 + 4 * 32]
    movu            xm5, [r0 + 12 * 32]
    punpckhqdq      xm6, xm4, xm5
    punpcklqdq      xm4, xm5
    vinserti128     m4, m4, xm6, 1

    movu            xm5, [r0 + 5 * 32]
    movu            xm6, [r0 + 13 * 32]
    punpckhqdq      xm7, xm5, xm6
    punpcklqdq      xm5, xm6
    vinserti128     m5, m5, xm7, 1

    movu            xm6, [r0 + 6 * 32]
    movu            xm7, [r0 + 14 * 32]
    punpckhqdq      xm8, xm6, xm7
    punpcklqdq      xm6, xm7
    vinserti128     m6, m6, xm8, 1

    movu            xm7, [r0 + 7 * 32]
    movu            xm8, [r0 + 15 * 32]
    punpckhqdq      xm9, xm7, xm8
    punpcklqdq      xm7, xm8
    vinserti128     m7, m7, xm9, 1

    punpckhwd       m8, m0, m2              ;[8 10]
    punpcklwd       m0, m2                  ;[0 2]

    punpckhwd       m2, m1, m3              ;[9 11]
    punpcklwd       m1, m3                  ;[1 3]

    punpckhwd       m3, m4, m6              ;[12 14]
    punpcklwd       m4, m6                  ;[4 6]

    punpckhwd       m6, m5, m7              ;[13 15]
    punpcklwd       m5, m7                  ;[5 7]

    punpckhdq       m7, m0, m4              ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67]
    punpckldq       m0, m4                  ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65]

    punpckhdq       m4, m8, m3              ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147]
    punpckldq       m8, m3                  ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145]

    punpckhdq       m3, m1, m5              ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77]
    punpckldq       m1, m5                  ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75]

    punpckhdq       m5, m2, m6              ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157]
    punpckldq       m2, m6                  ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155]

    punpckhqdq      m6, m0, m8              ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145]
    punpcklqdq      m0, m8                  ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144]

    punpckhqdq      m8, m7, m4              ;[03 23 43 63 83 103 123 143 07 27 47 67 87 107 127 147]
    punpcklqdq      m7, m4                  ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146]

    punpckhqdq      m4, m1, m2              ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155]
    punpcklqdq      m1, m2                  ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154]

    punpckhqdq      m2, m3, m5              ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157]
    punpcklqdq      m3, m5                  ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156]

    IDCT_PASS1      0, 14
    IDCT_PASS1      2, 12
    IDCT_PASS1      4, 10
    IDCT_PASS1      6, 8

    add             r0, 16
    add             r3, 16
    dec             r4d
    jnz             .pass1

    mov             r3, rsp
    mov             r4d, 8
    lea             r5, [tab_idct16_2]
    lea             r6, [tab_idct16_1]

    vbroadcasti128  m7, [r5]
    vbroadcasti128  m8, [r5 + 16]
    vbroadcasti128  m9, [r5 + 32]
    vbroadcasti128  m10, [r5 + 48]
    vbroadcasti128  m11, [r5 + 64]
    vbroadcasti128  m12, [r5 + 80]
    vbroadcasti128  m13, [r5 + 96]

.pass2:
    movu            m1, [r3]
    vpermq          m0, m1, 0xD8

    pmaddwd         m1, m0, m7
    pmaddwd         m2, m0, m8
    phaddd          m1, m2

    pmaddwd         m2, m0, m9
    pmaddwd         m3, m0, m10
    phaddd          m2, m3
    phaddd          m1, m2

    pmaddwd         m2, m0, m11
    pmaddwd         m3, m0, m12
    phaddd          m2, m3

    vbroadcasti128  m14, [r5 + 112]
    pmaddwd         m3, m0, m13
    pmaddwd         m4, m0, m14
    phaddd          m3, m4
    phaddd          m2, m3

    movu            m3, [r3 + 32]
    vpermq          m0, m3, 0xD8

    vbroadcasti128  m14, [r6]
    pmaddwd         m3, m0, m14
    vbroadcasti128  m14, [r6 + 16]
    pmaddwd         m4, m0, m14
    phaddd          m3, m4

    vbroadcasti128  m14, [r6 + 32]
    pmaddwd         m4, m0, m14
    vbroadcasti128  m14, [r6 + 48]
    pmaddwd         m5, m0, m14
    phaddd          m4, m5
    phaddd          m3, m4

    vbroadcasti128  m14, [r6 + 64]
    pmaddwd         m4, m0, m14
    vbroadcasti128  m14, [r6 + 80]
    pmaddwd         m5, m0, m14
    phaddd          m4, m5

    vbroadcasti128  m14, [r6 + 96]
    pmaddwd         m6, m0, m14
    vbroadcasti128  m14, [r6 + 112]
    pmaddwd         m0, m14
    phaddd          m6, m0
    phaddd          m4, m6

    paddd           m5, m1, m3
    paddd           m5, m15
    psrad           m5, IDCT_SHIFT2

    psubd           m1, m3
    paddd           m1, m15
    psrad           m1, IDCT_SHIFT2

    paddd           m6, m2, m4
    paddd           m6, m15
    psrad           m6, IDCT_SHIFT2

    psubd           m2, m4
    paddd           m2, m15
    psrad           m2, IDCT_SHIFT2

    packssdw        m5, m6
    packssdw        m1, m2

    pshufb          m2, m1, [dct16_shuf1]

    mova            [r1], xm5
    mova            [r1 + 16], xm2
    vextracti128    [r1 + r2], m5, 1
    vextracti128    [r1 + r2 + 16], m2, 1

    lea             r1, [r1 + 2 * r2]
    add             r3, 64
    dec             r4d
    jnz             .pass2
    RET
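; IDCT32_PASS1 computes first-pass results for one column group of the
; 32-point inverse transform: invocation %1 produces rows %1, 15 - %1,
; 16 + %1 and 31 - %1, scattered into the 32x64-byte stack scratch buffer
; through r3 (ascending) and r4 (descending).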
%macro IDCT32_PASS1 1
    vbroadcasti128  m3, [tab_idct32_1 + %1 * 32]
    vbroadcasti128  m13, [tab_idct32_1 + %1 * 32 + 16]

    pmaddwd         m9, m4, m3
    pmaddwd         m10, m8, m13
    phaddd          m9, m10

    pmaddwd         m10, m2, m3
    pmaddwd         m11, m1, m13
    phaddd          m10, m11
    phaddd          m9, m10

    vbroadcasti128  m3, [tab_idct32_1 + (15 - %1) * 32]
    vbroadcasti128  m13, [tab_idct32_1 + (15 - %1) * 32 + 16]

    pmaddwd         m10, m4, m3
    pmaddwd         m11, m8, m13
    phaddd          m10, m11

    pmaddwd         m11, m2, m3
    pmaddwd         m12, m1, m13
    phaddd          m11, m12
    phaddd          m10, m11
    phaddd          m9, m10                 ;[row0s0 row2s0 row0s15 row2s15 row1s0 row3s0 row1s15 row3s15]

    vbroadcasti128  m3, [tab_idct32_2 + %1 * 16]
    pmaddwd         m10, m0, m3
    pmaddwd         m11, m7, m3
    phaddd          m10, m11
    phaddd          m10, m10

    vbroadcasti128  m3, [tab_idct32_3 + %1 * 16]
    pmaddwd         m11, m5, m3
    pmaddwd         m12, m6, m3
    phaddd          m11, m12
    phaddd          m11, m11

    paddd           m12, m10, m11           ;[row0a0 row2a0 NIL NIL row1a0 row3a0 NIL NIL]
    psubd           m10, m11                ;[row0a15 row2a15 NIL NIL row1a15 row3a15 NIL NIL]

    punpcklqdq      m12, m10                ;[row0a0 row2a0 row0a15 row2a15 row1a0 row3a0 row1a15 row3a15]

    paddd           m10, m9, m12
    paddd           m10, m15
    psrad           m10, IDCT_SHIFT1

    psubd           m12, m9
    paddd           m12, m15
    psrad           m12, IDCT_SHIFT1

    packssdw        m10, m12
    vextracti128    xm12, m10, 1

    movd            [r3 + %1 * 64], xm10
    movd            [r3 + 32 + %1 * 64], xm12
    pextrd          [r4 - %1 * 64], xm10, 1
    pextrd          [r4 + 32 - %1 * 64], xm12, 1
    pextrd          [r3 + 16 * 64 + %1 * 64], xm10, 3
    pextrd          [r3 + 16 * 64 + 32 + %1 * 64], xm12, 3
    pextrd          [r4 + 16 * 64 - %1 * 64], xm10, 2
    pextrd          [r4 + 16 * 64 + 32 - %1 * 64], xm12, 2
%endmacro

;-------------------------------------------------------
; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
; TODO: Reduce PHADDD instruction by PADDD
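; Pass 1 below handles four source columns per iteration (eight iterations
; cover all 32) and scatters 16-bit intermediates into the 32x64-byte stack
; buffer; pass 2 reads the buffer back and writes one full 32-coefficient
; output row per iteration.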
INIT_YMM avx2
cglobal idct32, 3, 6, 16, 0-32*64
%define             IDCT_SHIFT1        7

    vbroadcasti128  m15, [pd_64]

    mov             r3, rsp
    lea             r4, [r3 + 15 * 64]
    mov             r5d, 8

.pass1:
    movq            xm0, [r0 + 2 * 64]
    movq            xm1, [r0 + 18 * 64]
    punpcklqdq      xm0, xm0, xm1
    movq            xm1, [r0 + 0 * 64]
    movq            xm2, [r0 + 16 * 64]
    punpcklqdq      xm1, xm1, xm2
    vinserti128     m0, m0, xm1, 1          ;[2 18 0 16]

    movq            xm1, [r0 + 1 * 64]
    movq            xm2, [r0 + 9 * 64]
    punpcklqdq      xm1, xm1, xm2
    movq            xm2, [r0 + 17 * 64]
    movq            xm3, [r0 + 25 * 64]
    punpcklqdq      xm2, xm2, xm3
    vinserti128     m1, m1, xm2, 1          ;[1 9 17 25]

    movq            xm2, [r0 + 6 * 64]
    movq            xm3, [r0 + 22 * 64]
    punpcklqdq      xm2, xm2, xm3
    movq            xm3, [r0 + 4 * 64]
    movq            xm4, [r0 + 20 * 64]
    punpcklqdq      xm3, xm3, xm4
    vinserti128     m2, m2, xm3, 1          ;[6 22 4 20]

    movq            xm3, [r0 + 3 * 64]
    movq            xm4, [r0 + 11 * 64]
    punpcklqdq      xm3, xm3, xm4
    movq            xm4, [r0 + 19 * 64]
    movq            xm5, [r0 + 27 * 64]
    punpcklqdq      xm4, xm4, xm5
    vinserti128     m3, m3, xm4, 1          ;[3 11 19 27]

    movq            xm4, [r0 + 10 * 64]
    movq            xm5, [r0 + 26 * 64]
    punpcklqdq      xm4, xm4, xm5
    movq            xm5, [r0 + 8 * 64]
    movq            xm6, [r0 + 24 * 64]
    punpcklqdq      xm5, xm5, xm6
    vinserti128     m4, m4, xm5, 1          ;[10 26 8 24]

    movq            xm5, [r0 + 5 * 64]
    movq            xm6, [r0 + 13 * 64]
    punpcklqdq      xm5, xm5, xm6
    movq            xm6, [r0 + 21 * 64]
    movq            xm7, [r0 + 29 * 64]
    punpcklqdq      xm6, xm6, xm7
    vinserti128     m5, m5, xm6, 1          ;[5 13 21 29]

    movq            xm6, [r0 + 14 * 64]
    movq            xm7, [r0 + 30 * 64]
    punpcklqdq      xm6, xm6, xm7
    movq            xm7, [r0 + 12 * 64]
    movq            xm8, [r0 + 28 * 64]
    punpcklqdq      xm7, xm7, xm8
    vinserti128     m6, m6, xm7, 1          ;[14 30 12 28]

    movq            xm7, [r0 + 7 * 64]
    movq            xm8, [r0 + 15 * 64]
    punpcklqdq      xm7, xm7, xm8
    movq            xm8, [r0 + 23 * 64]
    movq            xm9, [r0 + 31 * 64]
    punpcklqdq      xm8, xm8, xm9
    vinserti128     m7, m7, xm8, 1          ;[7 15 23 31]

    punpckhwd       m8, m0, m2              ;[18 22 16 20]
    punpcklwd       m0, m2                  ;[2 6 0 4]

    punpckhwd       m2, m1, m3              ;[9 11 25 27]
    punpcklwd       m1, m3                  ;[1 3 17 19]

    punpckhwd       m3, m4, m6              ;[26 30 24 28]
    punpcklwd       m4, m6                  ;[10 14 8 12]

    punpckhwd       m6, m5, m7              ;[13 15 29 31]
    punpcklwd       m5, m7                  ;[5 7 21 23]

    punpckhdq       m7, m0, m4              ;[22 62 102 142 23 63 103 143 02 42 82 122 03 43 83 123]
    punpckldq       m0, m4                  ;[20 60 100 140 21 61 101 141 00 40 80 120 01 41 81 121]

    punpckhdq       m4, m8, m3              ;[182 222 262 302 183 223 263 303 162 202 242 282 163 203 243 283]
    punpckldq       m8, m3                  ;[180 220 260 300 181 221 261 301 160 200 240 280 161 201 241 281]

    punpckhdq       m3, m1, m5              ;[12 32 52 72 13 33 53 73 172 192 212 232 173 193 213 233]
    punpckldq       m1, m5                  ;[10 30 50 70 11 31 51 71 170 190 210 230 171 191 211 231]

    punpckhdq       m5, m2, m6              ;[92 112 132 152 93 113 133 153 252 272 292 312 253 273 293 313]
    punpckldq       m2, m6                  ;[90 110 130 150 91 111 131 151 250 270 290 310 251 271 291 311]

    punpckhqdq      m6, m0, m8              ;[21 61 101 141 181 221 261 301 01 41 81 121 161 201 241 281]
    punpcklqdq      m0, m8                  ;[20 60 100 140 180 220 260 300 00 40 80 120 160 200 240 280]

    punpckhqdq      m8, m7, m4              ;[23 63 103 143 183 223 263 303 03 43 83 123 163 203 243 283]
    punpcklqdq      m7, m4                  ;[22 62 102 142 182 222 262 302 02 42 82 122 162 202 242 282]

    punpckhqdq      m4, m1, m2              ;[11 31 51 71 91 111 131 151 171 191 211 231 251 271 291 311]
    punpcklqdq      m1, m2                  ;[10 30 50 70 90 110 130 150 170 190 210 230 250 270 290 310]

    punpckhqdq      m2, m3, m5              ;[13 33 53 73 93 113 133 153 173 193 213 233 253 273 293 313]
    punpcklqdq      m3, m5                  ;[12 32 52 72 92 112 132 152 172 192 212 232 252 272 292 312]

    vperm2i128      m5, m0, m6, 0x20        ;[20 60 100 140 180 220 260 300 21 61 101 141 181 221 261 301]
    vperm2i128      m0, m0, m6, 0x31        ;[00 40 80 120 160 200 240 280 01 41 81 121 161 201 241 281]

    vperm2i128      m6, m7, m8, 0x20        ;[22 62 102 142 182 222 262 302 23 63 103 143 183 223 263 303]
    vperm2i128      m7, m7, m8, 0x31        ;[02 42 82 122 162 202 242 282 03 43 83 123 163 203 243 283]

    vperm2i128      m8, m1, m4, 0x31        ;[170 190 210 230 250 270 290 310 171 191 211 231 251 271 291 311]
    vperm2i128      m4, m1, m4, 0x20        ;[10 30 50 70 90 110 130 150 11 31 51 71 91 111 131 151]

    vperm2i128      m1, m3, m2, 0x31        ;[172 192 212 232 252 272 292 312 173 193 213 233 253 273 293 313]
    vperm2i128      m2, m3, m2, 0x20        ;[12 32 52 72 92 112 132 152 13 33 53 73 93 113 133 153]

    IDCT32_PASS1    0
    IDCT32_PASS1    1
    IDCT32_PASS1    2
    IDCT32_PASS1    3
    IDCT32_PASS1    4
    IDCT32_PASS1    5
    IDCT32_PASS1    6
    IDCT32_PASS1    7

    add             r0, 8
    add             r3, 4
    add             r4, 4
    dec             r5d
    jnz             .pass1

%if BIT_DEPTH == 12
    %define         IDCT_SHIFT2        8
    vpbroadcastd    m15, [pd_256]
%elif BIT_DEPTH == 10
    %define         IDCT_SHIFT2        10
    vpbroadcastd    m15, [pd_512]
%elif BIT_DEPTH == 8
    %define         IDCT_SHIFT2        12
    vpbroadcastd    m15, [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif

    mov             r3, rsp
    add             r2d, r2d
    mov             r4d, 32

    mova            m7, [tab_idct32_4]
    mova            m8, [tab_idct32_4 + 32]
    mova            m9, [tab_idct32_4 + 64]
    mova            m10, [tab_idct32_4 + 96]
    mova            m11, [tab_idct32_4 + 128]
    mova            m12, [tab_idct32_4 + 160]
    mova            m13, [tab_idct32_4 + 192]
    mova            m14, [tab_idct32_4 + 224]

.pass2:
    movu            m0, [r3]
    movu            m1, [r3 + 32]

    pmaddwd         m2, m0, m7
    pmaddwd         m3, m0, m8
    phaddd          m2, m3

    pmaddwd         m3, m0, m9
    pmaddwd         m4, m0, m10
    phaddd          m3, m4
    phaddd          m2, m3

    pmaddwd         m3, m0, m11
    pmaddwd         m4, m0, m12
    phaddd          m3, m4

    pmaddwd         m4, m0, m13
    pmaddwd         m5, m0, m14
    phaddd          m4, m5
    phaddd          m3, m4

    vperm2i128      m4, m2, m3, 0x31
    vperm2i128      m2, m2, m3, 0x20
    paddd           m2, m4

    pmaddwd         m3, m0, [tab_idct32_4 + 256]
    pmaddwd         m4, m0, [tab_idct32_4 + 288]
    phaddd          m3, m4

    pmaddwd         m4, m0, [tab_idct32_4 + 320]
    pmaddwd         m5, m0, [tab_idct32_4 + 352]
    phaddd          m4, m5
    phaddd          m3, m4

    pmaddwd         m4, m0, [tab_idct32_4 + 384]
    pmaddwd         m5, m0, [tab_idct32_4 + 416]
    phaddd          m4, m5

    pmaddwd         m5, m0, [tab_idct32_4 + 448]
    pmaddwd         m0, [tab_idct32_4 + 480]
    phaddd          m5, m0
    phaddd          m4, m5

    vperm2i128      m0, m3, m4, 0x31
    vperm2i128      m3, m3, m4, 0x20
    paddd           m3, m0

    pmaddwd         m4, m1, [tab_idct32_1]
    pmaddwd         m0, m1, [tab_idct32_1 + 32]
    phaddd          m4, m0

    pmaddwd         m5, m1, [tab_idct32_1 + 64]
    pmaddwd         m0, m1, [tab_idct32_1 + 96]
    phaddd          m5, m0
    phaddd          m4, m5

    pmaddwd         m5, m1, [tab_idct32_1 + 128]
    pmaddwd         m0, m1, [tab_idct32_1 + 160]
    phaddd          m5, m0

    pmaddwd         m6, m1, [tab_idct32_1 + 192]
    pmaddwd         m0, m1, [tab_idct32_1 + 224]
    phaddd          m6, m0
    phaddd          m5, m6

    vperm2i128      m0, m4, m5, 0x31
    vperm2i128      m4, m4, m5, 0x20
    paddd           m4, m0

    pmaddwd         m5, m1, [tab_idct32_1 + 256]
    pmaddwd         m0, m1, [tab_idct32_1 + 288]
    phaddd          m5, m0

    pmaddwd         m6, m1, [tab_idct32_1 + 320]
    pmaddwd         m0, m1, [tab_idct32_1 + 352]
    phaddd          m6, m0
    phaddd          m5, m6

    pmaddwd         m6, m1, [tab_idct32_1 + 384]
    pmaddwd         m0, m1, [tab_idct32_1 + 416]
    phaddd          m6, m0

    pmaddwd         m0, m1, [tab_idct32_1 + 448]
    pmaddwd         m1, [tab_idct32_1 + 480]
    phaddd          m0, m1
    phaddd          m6, m0

    vperm2i128      m0, m5, m6, 0x31
    vperm2i128      m5, m5, m6, 0x20
    paddd           m5, m0

    paddd           m6, m2, m4
    paddd           m6, m15
    psrad           m6, IDCT_SHIFT2

    psubd           m2, m4
    paddd           m2, m15
    psrad           m2, IDCT_SHIFT2

    paddd           m4, m3, m5
    paddd           m4, m15
    psrad           m4, IDCT_SHIFT2

    psubd           m3, m5
    paddd           m3, m15
    psrad           m3, IDCT_SHIFT2

    packssdw        m6, m4
    packssdw        m2, m3

    vpermq          m6, m6, 0xD8
    vpermq          m2, m2, 0x8D
    pshufb          m2, [dct16_shuf1]

    mova            [r1], m6
    mova            [r1 + 32], m2

    add             r1, r2
    add             r3, 64
    dec             r4d
    jnz             .pass2
    RET
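; For reference, a scalar sketch of the 4-point inverse butterfly used by
; both passes below (illustrative pseudo-code, not x265 source):
;     E[0] = 64 * s[0] + 64 * s[2];    O[0] = 83 * s[1] + 36 * s[3];
;     E[1] = 64 * s[0] - 64 * s[2];    O[1] = 36 * s[1] - 83 * s[3];
;     d[0] = (E[0] + O[0] + add) >> shift;    d[3] = (E[0] - O[0] + add) >> shift;
;     d[1] = (E[1] + O[1] + add) >> shift;    d[2] = (E[1] - O[1] + add) >> shift;
; where add = 1 << (shift - 1) and shift is IDCT_SHIFT1 (pass 1) or
; IDCT_SHIFT2 (pass 2).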
;-------------------------------------------------------
; void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_YMM avx2
cglobal idct4, 3, 4, 6
%define             IDCT_SHIFT1        7
%if BIT_DEPTH == 12
    %define         IDCT_SHIFT2        8
    vpbroadcastd    m5, [pd_256]
%elif BIT_DEPTH == 10
    %define         IDCT_SHIFT2        10
    vpbroadcastd    m5, [pd_512]
%elif BIT_DEPTH == 8
    %define         IDCT_SHIFT2        12
    vpbroadcastd    m5, [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif

    vbroadcasti128  m4, [pd_64]

    add             r2d, r2d
    lea             r3, [r2 * 3]

    movu            m0, [r0]                ;[00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33]

    pshufb          m0, [idct4_shuf1]       ;[00 02 01 03 10 12 11 13 20 22 21 23 30 32 31 33]
    vextracti128    xm1, m0, 1              ;[20 22 21 23 30 32 31 33]
    punpcklwd       xm2, xm0, xm1           ;[00 20 02 22 01 21 03 23]
    punpckhwd       xm0, xm1                ;[10 30 12 32 11 31 13 33]
    vinserti128     m2, m2, xm2, 1          ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23]
    vinserti128     m0, m0, xm0, 1          ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33]

    mova            m1, [avx2_idct4_1]
    mova            m3, [avx2_idct4_1 + 32]
    pmaddwd         m1, m2
    pmaddwd         m3, m0

    paddd           m0, m1, m3
    paddd           m0, m4
    psrad           m0, IDCT_SHIFT1         ;[00 20 10 30 01 21 11 31]

    psubd           m1, m3
    paddd           m1, m4
    psrad           m1, IDCT_SHIFT1         ;[03 23 13 33 02 22 12 32]

    packssdw        m0, m1                  ;[00 20 10 30 03 23 13 33 01 21 11 31 02 22 12 32]
    vmovshdup       m1, m0                  ;[10 30 10 30 13 33 13 33 11 31 11 31 12 32 12 32]
    vmovsldup       m0, m0                  ;[00 20 00 20 03 23 03 23 01 21 01 21 02 22 02 22]

    vpbroadcastq    m2, [avx2_idct4_2]
    vpbroadcastq    m3, [avx2_idct4_2 + 8]
    pmaddwd         m0, m2
    pmaddwd         m1, m3

    paddd           m2, m0, m1
    paddd           m2, m5
    psrad           m2, IDCT_SHIFT2         ;[00 01 10 11 30 31 20 21]

    psubd           m0, m1
    paddd           m0, m5
    psrad           m0, IDCT_SHIFT2         ;[03 02 13 12 33 32 23 22]

    pshufb          m0, [idct4_shuf2]       ;[02 03 12 13 32 33 22 23]
    punpcklqdq      m1, m2, m0              ;[00 01 02 03 10 11 12 13]
    punpckhqdq      m2, m0                  ;[30 31 32 33 20 21 22 23]
    packssdw        m1, m2                  ;[00 01 02 03 30 31 32 33 10 11 12 13 20 21 22 23]

    vextracti128    xm0, m1, 1
    movq            [r1], xm1
    movq            [r1 + r2], xm0
    movhps          [r1 + 2 * r2], xm0
    movhps          [r1 + r3], xm1
    RET
%endif