; libbpg/x265/source/common/x86/intrapred8.asm
; 2015-10-27 11:46:00 +01:00
;
; 22682 lines
; 748 KiB
; NASM
;
;*****************************************************************************
;* Copyright (C) 2013 x265 project
;*
;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
;* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************/
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA 32
; ---------------------------------------------------------------------------
; Shuffle masks and per-mode reference-index tables for 8-bit intra
; prediction.  Byte values are indices into a reference-sample register;
; NOTE(review): exact consumer semantics live in the code sections of this
; file (not visible in this chunk) — confirm there before relying on them.
; ---------------------------------------------------------------------------
const intra_pred_shuff_0_8, times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
; full byte reversal of each 16-byte lane
intra_pred_shuff_15_0: times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
intra_filter4_shuf0: times 2 db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
intra_filter4_shuf1: times 2 db 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
intra_filter4_shuf2: times 2 db 4, 5, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
pb_0_8 times 8 db 0, 8
pb_unpackbw1 times 2 db 1, 8, 2, 8, 3, 8, 4, 8
; reverse the 8 bytes of each qword
pb_swap8: times 2 db 7, 6, 5, 4, 3, 2, 1, 0
; column-major gather of a 4x4 byte matrix (transpose shuffle)
c_trans_4x4 db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
const tab_S1, db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0
const tab_S2, db 0, 1, 3, 5, 7, 9, 11, 13, 0, 0, 0, 0, 0, 0, 0, 0
const tab_Si, db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
pb_fact0: db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
; c_mode32_N_* / c_mode16_N: reference-pixel selection tables for angular
; intra mode N at block size 32 / 16 (mode number is encoded in the name)
c_mode32_12_0: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 7, 0
c_mode32_13_0: db 3, 6, 10, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
c_mode32_13_shuf: db 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0
c_mode32_14_shuf: db 15, 14, 13, 0, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15
c_mode32_14_0: db 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
c_mode32_15_0: db 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0
c_mode32_16_0: db 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0
c_mode32_17_0: db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0
c_mode32_18_0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
; adjacent-pair shuffle (i, i+1) used for two-tap interpolation
c_shuf8_0: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
c_deinterval8: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
; broadcast byte 0 into the low qword, byte 1 into the high qword
pb_unpackbq: db 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
c_mode16_12: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
c_mode16_13: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
c_mode16_14: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
c_mode16_15: db 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2
c_mode16_16: db 8, 6, 5, 3, 2, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2
c_mode16_17: db 4, 2, 1, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1
c_mode16_18: db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
ALIGN 32
; ---------------------------------------------------------------------------
; 8-wide angular prediction tables.
;   c_ang8_srcA_B_C_D : adjacent source-index pairs (i, i+1) for two rows
;                       starting at reference offsets A/B and C/D.
;   c_ang8_X_Y        : interleaved weight pairs (32-frac, frac); every byte
;                       pair sums to 32 — presumably pmaddubsw multipliers
;                       for the two-tap angular filter (confirm in consumers).
; ---------------------------------------------------------------------------
c_ang8_src1_9_2_10: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
c_ang8_26_20: db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
c_ang8_src3_11_4_12: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
c_ang8_14_8: db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
c_ang8_src5_13_5_13: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
c_ang8_2_28: db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
c_ang8_src6_14_7_15: db 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
c_ang8_22_16: db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
c_ang8_21_10 : db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
c_ang8_src2_10_3_11: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
c_ang8_31_20: db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
c_ang8_src4_12_4_12: times 2 db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
c_ang8_9_30: db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
c_ang8_src5_13_6_14: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13
c_ang8_19_8: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
c_ang8_17_2: db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
c_ang8_19_4: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
c_ang8_21_6: db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
; Weight pairs (32-frac, frac) for fractions 23 and 8; each pair sums to 32.
; FIX: removed a stray trailing comma after the final operand — NASM rejects
; a dangling comma in a db operand list as a syntax error.
c_ang8_23_8: db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
; src shuffle pairs and (32-frac, frac) weight pairs, as above
c_ang8_src4_12_5_13: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
c_ang8_13_26: db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
c_ang8_7_20: db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
c_ang8_1_14: db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
c_ang8_27_8: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
c_ang8_src2_10_2_10: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
c_ang8_src3_11_3_11: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
c_ang8_31_8: db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
c_ang8_13_22: db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
c_ang8_27_4: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
c_ang8_9_18: db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
c_ang8_5_10: db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
c_ang8_15_20: db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
c_ang8_25_30: db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
c_ang8_3_8: db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
; c_ang8_mode_N: four 32-byte rows of weight pairs covering all 8 output
; rows of 8-wide angular mode N (two rows' weights per 32-byte line)
c_ang8_mode_27: db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
c_ang8_mode_25: db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
c_ang8_mode_24: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
ALIGN 32
; ---------------------------------------------------------------------------
; c_ang16_mode_N: 8 (or 9) rows of 32 interleaved (32-frac, frac) weight
; bytes for 16-wide angular intra mode N; each byte pair sums to 32 —
; presumably pmaddubsw multipliers (confirm against the consuming code).
; ---------------------------------------------------------------------------
c_ang16_mode_25: db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
ALIGN 32
c_ang16_mode_11: db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
ALIGN 32
c_ang16_mode_12: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
ALIGN 32
c_ang16_mode_13: db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
ALIGN 32
c_ang16_mode_28: db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
ALIGN 32
c_ang16_mode_9: db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
ALIGN 32
; mode 27 uses 9 rows (one extra row of pure 32/0 weights)
c_ang16_mode_27: db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
ALIGN 32
; adjacent-pair shuffle over bytes 0..15 (last pair clamps at 15, 15)
intra_pred_shuff_0_15: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 15
ALIGN 32
; c_ang16_mode_N (continued): (32-frac, frac) weight pairs per output row;
; every byte pair sums to 32
c_ang16_mode_29: db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
ALIGN 32
c_ang16_mode_30: db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
ALIGN 32
c_ang16_mode_31: db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
ALIGN 32
c_ang16_mode_24: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
ALIGN 32
c_ang16_mode_23: db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
ALIGN 32
c_ang16_mode_22: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
ALIGN 32
; ---------------------------------------------------------------------------
; 4x4 block tables.
;   intra_pred4_shuffN : per-row source shuffles for angular mode N.
;     Indices 0..7 address the above-row references, 8..15 the left-column
;     references — NOTE(review): confirm this split against the loader code.
;   c_ang4_mode_N      : four 8-byte rows of (32-frac, frac) weight pairs;
;     each pair sums to 32.
; ---------------------------------------------------------------------------
intra_pred_shuff_0_4: times 4 db 0, 1, 1, 2, 2, 3, 3, 4
intra_pred4_shuff1: db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5
intra_pred4_shuff2: db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
intra_pred4_shuff31: db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
intra_pred4_shuff33: db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
intra_pred4_shuff3: db 8, 9, 9, 10, 10, 11, 11, 12, 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14, 11, 12, 12, 13, 13, 14, 14, 15
intra_pred4_shuff4: db 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14, 10, 11, 11, 12, 12, 13, 13, 14, 11, 12, 12, 13, 13, 14, 14, 15
; NOTE(review): shuff5 is byte-identical to shuff4 above — presumably
; intentional (modes 4 and 5 share row offsets at 4x4); verify upstream
intra_pred4_shuff5: db 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14, 10, 11, 11, 12, 12, 13, 13, 14, 11, 12, 12, 13, 13, 14, 14, 15
intra_pred4_shuff6: db 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14, 10, 11, 11, 12, 12, 13, 13, 14
intra_pred4_shuff7: db 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14
intra_pred4_shuff9: db 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13
intra_pred4_shuff12: db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12,0, 9, 9, 10, 10, 11, 11, 12
intra_pred4_shuff13: db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 4, 0, 0, 9, 9, 10, 10, 11
intra_pred4_shuff14: db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11
intra_pred4_shuff15: db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 4, 2, 2, 0, 0, 9, 9, 10
intra_pred4_shuff16: db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 3, 2, 2, 0, 0, 9, 9, 10
intra_pred4_shuff17: db 0, 9, 9, 10, 10, 11, 11, 12, 1, 0, 0, 9, 9, 10, 10, 11, 2, 1, 1, 0, 0, 9, 9, 10, 4, 2, 2, 1, 1, 0, 0, 9
intra_pred4_shuff19: db 0, 1, 1, 2, 2, 3, 3, 4, 9, 0, 0, 1, 1, 2, 2, 3, 10, 9, 9, 0, 0, 1, 1, 2, 12, 10, 10, 9, 9, 0, 0, 1
intra_pred4_shuff20: db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 11, 10, 10, 0, 0, 1, 1, 2
intra_pred4_shuff21: db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 12, 10, 10, 0, 0, 1, 1, 2
intra_pred4_shuff22: db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3
intra_pred4_shuff23: db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 12, 0, 0, 1, 1, 2, 2, 3
c_ang4_mode_27: db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
c_ang4_mode_28: db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
c_ang4_mode_29: db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
c_ang4_mode_30: db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
c_ang4_mode_31: db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
c_ang4_mode_32: db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
c_ang4_mode_33: db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
; modes 5..9 mirror modes 31..27 (same weight sequences)
c_ang4_mode_5: db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
c_ang4_mode_6: db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
c_ang4_mode_7: db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
c_ang4_mode_8: db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
c_ang4_mode_9: db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
c_ang4_mode_11: db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
c_ang4_mode_12: db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
c_ang4_mode_13: db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
c_ang4_mode_14: db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
; Weights for 4x4 angular mode 15; each (32-frac, frac) pair sums to 32.
; FIX: dropped a spurious 33rd byte (trailing ", 4") — every sibling
; c_ang4_mode_* table here is exactly 32 bytes, and the mirror table
; c_ang4_mode_21 carries these identical 32 values with no extra byte.
; The stray byte would also shift the placement of all following tables.
c_ang4_mode_15: db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
; remaining 4x4 weight tables; modes 19..25 mirror modes 17..11
c_ang4_mode_16: db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
c_ang4_mode_17: db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
c_ang4_mode_19: db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
c_ang4_mode_20: db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
c_ang4_mode_21: db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
c_ang4_mode_22: db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
c_ang4_mode_23: db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
c_ang4_mode_24: db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
c_ang4_mode_25: db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
ALIGN 32
;; (blkSize - 1 - x)
; planar-mode horizontal weights for a 4-wide block, duplicated per lane
pw_planar4_0: dw 3, 2, 1, 0, 3, 2, 1, 0
ALIGN 32
; c_ang8_mode_N: four 32-byte rows of (32-frac, frac) weight pairs for
; 8-wide angular mode N (two output rows per 32-byte line)
c_ang8_mode_13: db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
ALIGN 32
c_ang8_mode_14: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
ALIGN 32
c_ang8_mode_15: db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
const c_ang8_mode_16, db 8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 10, 12, 13, 15, 0, 0
const intra_pred8_shuff16, db 0, 1, 1, 2, 3, 3, 4, 5
db 1, 2, 2, 3, 4, 4, 5, 6
db 2, 3, 3, 4, 5, 5, 6, 7
db 3, 4, 4, 5, 6, 6, 7, 8
db 4, 5, 5, 6, 7, 7, 8, 9
const angHor8_tab_16, db (32-11), 11, (32-22), 22, (32-1 ), 1, (32-12), 12, (32-23), 23, (32- 2), 2, (32-13), 13, (32-24), 24
const c_ang8_mode_20, db 15, 13, 12, 10, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0
; NOTE: this big table improve speed ~10%, if we have broadcast instruction work on high-128bits infuture, we can remove the table
const angHor8_tab_20, times 8 db (32-24), 24
times 8 db (32-13), 13
times 8 db (32- 2), 2
times 8 db (32-23), 23
times 8 db (32-12), 12
times 8 db (32- 1), 1
times 8 db (32-22), 22
times 8 db (32-11), 11
; Intra angular 16x16 constants, modes 9-17: for each mode an ang16_shuf_mode*
; table gives pshufb indices selecting the two adjacent reference samples per
; pixel, and an angHor_tab_* table gives the matching (32-frac, frac) byte
; weight pairs consumed by pmaddubsw.
const ang16_shuf_mode9, times 8 db 0, 1
times 8 db 1, 2
const angHor_tab_9, db (32-2), 2, (32-4), 4, (32-6), 6, (32-8), 8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16
db (32-18), 18, (32-20), 20, (32-22), 22, (32-24), 24, (32-26), 26, (32-28), 28, (32-30), 30, (32-32), 32
const angHor_tab_11, db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8), 8, (32- 6), 6, (32- 4), 4, (32- 2), 2, (32- 0), 0
const ang16_shuf_mode12, db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3
db 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2
const angHor_tab_12, db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32-7), 7, (32-2), 2, (32-29), 29, (32-24), 24
db (32-19), 19, (32-14), 14, (32-9), 9, (32-4), 4, (32-31), 31, (32-26), 26, (32-21), 21, (32-16), 16
; third row of the shuf tables below holds the projected-left gather offsets
const ang16_shuf_mode13, db 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 5, 6, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4
db 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2
db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0
const angHor_tab_13, db (32-23), 23, (32-14), 14, (32-5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32-1), 1, (32-24), 24
db (32-15), 15, (32-6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32-2), 2, (32-25), 25, (32-16), 16
const ang16_shuf_mode14, db 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 3, 4, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 5, 6, 4, 5
db 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1, 4, 5, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2
db 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0
const angHor_tab_14, db (32-19), 19, (32-6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32-5), 5, (32-24), 24
db (32-11), 11, (32-30), 30, (32-17), 17, (32-4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16
const ang16_shuf_mode15, db 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 9, 10, 8, 9, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6
db 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2
db 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0
const angHor_tab_15, db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32-9), 9, (32-24), 24
db (32-7), 7, (32-22), 22, (32-5), 5, (32-20), 20, (32-3), 3, (32-18), 18, (32-1), 1, (32- 16), 16
const ang16_shuf_mode16, db 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7
db 5, 6, 4, 5, 3, 4, 3, 4, 2, 3, 1, 2, 1, 2, 0, 1, 6, 7, 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 2, 3, 1, 2
db 0 ,0, 0, 0, 0, 15, 14, 12 , 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0
const angHor_tab_16, db (32-11), 11, (32-22), 22, (32-1), 1, (32-12), 12, (32-23), 23, (32-2), 2, (32-13), 13, (32-24), 24
db (32-3), 3, (32-14), 14, (32-25), 25, (32-4), 4, (32-15), 15, (32-26), 26, (32-5), 5, (32-16), 16
const ang16_shuf_mode17, db 12, 13, 11, 12, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 13, 14, 12, 13, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8
db 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 1, 2, 0, 1, 0, 1, 6, 7, 5, 6, 5, 6, 4, 5, 3, 4, 2, 3, 1, 2, 1, 2
db 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0
const angHor_tab_17, db (32- 6), 6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4), 4, (32-10), 10, (32-16), 16
db (32-22), 22, (32-28), 28, (32- 2), 2, (32- 8), 8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0), 0
; Intrapred_angle32x32, modes 1 to 33 constants
; ang32_fact_mode*: (32-frac, frac) byte weight pairs; ang32_shuf_mode*: pshufb
; gather indices (with projected-left offsets in their trailing rows).
const ang32_shuf_mode9, times 8 db 0, 1
times 8 db 1, 2
const ang32_shuf_mode11, times 8 db 1, 2
times 8 db 0, 1
const ang32_fact_mode12, db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32- 7), 7, (32- 2), 2, (32-29), 29, (32-24), 24
db (32-11), 11, (32- 6), 6, (32- 1), 1, (32-28), 28, (32-23), 23, (32-18), 18, (32-13), 13, (32- 8), 8
db (32-19), 19, (32-14), 14, (32- 9), 9, (32- 4), 4, (32-31), 31, (32-26), 26, (32-21), 21, (32-16), 16
db (32- 3), 3, (32-30), 30, (32-25), 25, (32-20), 20, (32-15), 15, (32-10), 10, (32- 5), 5, (32- 0), 0
const ang32_shuf_mode12, db 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
db 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
const ang32_shuf_mode24, db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 13, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 3, 3
dd 0, 0, 7, 3, 0, 0, 7, 3
const ang32_fact_mode13, db (32-23), 23, (32-14), 14, (32- 5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32- 1), 1, (32-24), 24
db (32- 7), 7, (32-30), 30, (32-21), 21, (32-12), 12, (32- 3), 3, (32-26), 26, (32-17), 17, (32- 8), 8
db (32-15), 15, (32- 6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32- 2), 2, (32-25), 25, (32-16), 16
db (32-31), 31, (32-22), 22, (32-13), 13, (32- 4), 4, (32-27), 27, (32-18), 18, (32- 9), 9, (32- 0), 0
const ang32_shuf_mode13, db 14, 15, 14, 15, 14, 15, 13, 14, 13, 14, 13, 14, 13, 14, 12, 13, 10, 11, 9, 10, 9, 10, 9, 10, 9, 10, 8, 9, 8, 9, 8, 9
db 12, 13, 12, 13, 11, 12, 11, 12, 11, 12, 11, 12, 10, 11, 10, 11, 7, 8, 7, 8, 7, 8, 7, 8, 6, 7, 6, 7, 6, 7, 6, 7
db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 9, 5, 2
const ang32_shuf_mode23, db 0, 0, 0, 0, 0, 0, 0, 0, 14, 14, 11, 11, 7, 7, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 9, 9, 5, 5, 2, 2
const ang32_fact_mode14, db (32-19), 19, (32- 6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32- 5), 5, (32-24), 24
db (32- 3), 3, (32-22), 22, (32- 9), 9, (32-28), 28, (32-15), 15, (32- 2), 2, (32-21), 21, (32- 8), 8
db (32-11), 11, (32-30), 30, (32-17), 17, (32- 4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16
db (32-27), 27, (32-14), 14, (32- 1), 1, (32-20), 20, (32- 7), 7, (32-26), 26, (32-13), 13, (32- 0), 0
const ang32_shuf_mode14, db 14, 15, 14, 15, 13, 14, 13, 14, 12, 13, 12, 13, 12, 13, 11, 12, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 6, 7, 5, 6, 5, 6
db 11, 12, 10, 11, 10, 11, 10, 11, 9, 10, 9, 10, 8, 9, 8, 9, 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3
db 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 9, 6, 4, 1
const ang32_shuf_mode22, db 0, 0, 15, 15, 13, 13, 10, 10, 8, 8, 5, 5, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 9, 9, 7, 7, 4, 4, 2
const ang32_fact_mode15, db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32- 9), 9, (32-24), 24
db (32-31), 31, (32-14), 14, (32-29), 29, (32-12), 12, (32-27), 27, (32-10), 10, (32-25), 25, (32- 8), 8
db (32- 7), 7, (32-22), 22, (32- 5), 5, (32-20), 20, (32- 3), 3, (32-18), 18, (32- 1), 1, (32-16), 16
db (32-23), 23, (32- 6), 6, (32-21), 21, (32- 4), 4, (32-19), 19, (32- 2), 2, (32-17), 17, (32- 0), 0
const ang32_shuf_mode15, db 14, 15, 13, 14, 13, 14, 12, 13, 12, 13, 11, 12, 11, 12, 10, 11, 5, 6, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3
db 12, 13, 11, 12, 11, 12, 10, 11, 10, 11, 9, 10, 9, 10, 8, 9, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1
db 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 14, 12, 10, 8, 7, 5, 3, 1
const ang32_shuf_mode21, db 15, 15, 13, 13, 11, 11, 9, 9, 8, 8, 6, 6, 4, 4, 2, 2, 14, 14, 12, 12, 10, 10, 8, 8, 7, 7, 5, 5, 3, 3, 1, 1
const ang32_fact_mode16, db (32-11), 11, (32-22), 22, (32- 1), 1, (32-12), 12, (32-23), 23, (32- 2), 2, (32-13), 13, (32-24), 24
db (32- 3), 3, (32-14), 14, (32-25), 25, (32- 4), 4, (32-15), 15, (32-26), 26, (32- 5), 5, (32-16), 16
db (32-27), 27, (32- 6), 6, (32-17), 17, (32-28), 28, (32- 7), 7, (32-18), 18, (32-29), 29, (32- 8), 8
db (32-19), 19, (32-30), 30, (32- 9), 9, (32-20), 20, (32-31), 31, (32-10), 10, (32-21), 21, (32- 0), 0
const ang32_shuf_mode16, db 14, 15, 13, 14, 13, 14, 12, 13, 11, 12, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 5, 6, 4, 5
db 14, 15, 14, 15, 13, 14, 12, 13, 12, 13, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6
db 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 14, 13, 11, 10, 8, 7, 5, 4, 2, 1
dd 7, 1, 2, 3, 7, 1, 2, 3
const ang32_shuf_mode20, db 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 14, 15, 8, 7, 5, 4, 2, 1, 0, 0, 14, 13, 13, 11, 11, 10, 10, 8
db 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 1, 1, 0, 0
const ang32_fact_mode17, db (32- 6), 6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4), 4, (32-10), 10, (32-16), 16
db (32-22), 22, (32-28), 28, (32- 2), 2, (32- 8), 8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0), 0
const ang32_shuf_mode17, db 14, 15, 13, 14, 12, 13, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 7, 8, 6, 7, 6, 7, 5, 6, 4, 5, 3, 4, 2, 3, 2, 3
db 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0
const ang32_shuf_mode19, db 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15
dd 0, 0, 2, 3, 0, 0, 7, 1
dd 0, 0, 5, 6, 0, 0, 0, 0
; ang_table[f]: 8 (32-f, f) byte pairs per fraction f = 0..31 (SSE width)
const ang_table
%assign x 0
%rep 32
times 8 db (32-x), x
%assign x x+1
%endrep
; ang_table_avx2[f]: same weights replicated to a full 256-bit lane
const ang_table_avx2
%assign x 0
%rep 32
times 16 db (32-x), x
%assign x x+1
%endrep
; pw_ang_table[f]: word-sized (32-f, f) pairs for pmaddwd-based kernels
const pw_ang_table
%assign x 0
%rep 32
times 4 dw (32-x), x
%assign x x+1
%endrep
SECTION .text
cextern pb_1
cextern pw_2
cextern pw_3
cextern pw_4
cextern pw_7
cextern pw_8
cextern pw_16
cextern pw_15
cextern pw_31
cextern pw_32
cextern pw_257
cextern pw_512
cextern pw_1024
cextern pw_4096
cextern pw_00ff
cextern pb_unpackbd1
cextern multiL
cextern multiH
cextern multiH2
cextern multiH3
cextern multi_2Row
cextern trans8_shuf
cextern pw_planar16_mul
cextern pw_planar32_mul
;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
; 4x4 DC prediction.
; In:  r0 = dst, r1 = dstStride, r2 = srcPix (srcPix[0] = top-left,
;      srcPix[1..8] = above/above-right, srcPix[9..16] = left), r4 = bFilter
; dc = (sum(above[0..3]) + sum(left[0..3]) + 4) >> 3; when bFilter != 0 the
; first row and column are blended toward the reference samples (HEVC DC filter).
INIT_XMM sse2
cglobal intra_pred_dc4, 5,5,3
inc r2 ; r2 -> above[0]; left[] now starts at r2 + 8
pxor m0, m0
movu m1, [r2]
pshufd m1, m1, 0xF8 ; pack above[0..3] and left[0..3] into the low qword
psadbw m1, m0 ; m1 = sum
test r4d, r4d ; set ZF from bFilter; SIMD ops below leave flags intact for jz
paddw m1, [pw_4]
psraw m1, 3
movd r4d, m1 ; r4d = dc_val
pmullw m1, [pw_257] ; dc * 257 duplicates dc into both bytes of the word
pshuflw m1, m1, 0x00
; store DC 4x4
lea r3, [r1 * 3]
movd [r0], m1
movd [r0 + r1], m1
movd [r0 + r1 * 2], m1
movd [r0 + r3], m1
; do DC filter
jz .end
lea r3d, [r4d * 2 + 2] ; r3d = DC * 2 + 2
add r4d, r3d ; r4d = DC * 3 + 2
movd m1, r4d
pshuflw m1, m1, 0 ; m1 = pixDCx3
; filter top: dst[0][x] = (above[x] + 3*dc + 2) >> 2
movd m2, [r2]
punpcklbw m2, m0
paddw m2, m1
psraw m2, 2
packuswb m2, m2
movd [r0], m2 ; overwrite top-left pixel, we will update it later
; filter top-left: (left[0] + 2*dc + above[0] + 2) >> 2
movzx r4d, byte [r2 + 8] ; left[0]
add r3d, r4d
movzx r4d, byte [r2] ; above[0]
add r3d, r4d
shr r3d, 2
mov [r0], r3b
; filter left: dst[y][0] = (left[y] + 3*dc + 2) >> 2, rows 1..3
add r0, r1
movq m2, [r2 + 9]
punpcklbw m2, m0
paddw m2, m1
psraw m2, 2
packuswb m2, m2
%if ARCH_X86_64
movq r4, m2
mov [r0], r4b
shr r4, 8
mov [r0 + r1], r4b
shr r4, 8
mov [r0 + r1 * 2], r4b
%else
movd r2d, m2
mov [r0], r2b
shr r2, 8
mov [r0 + r1], r2b
shr r2, 8
mov [r0 + r1 * 2], r2b
%endif
.end:
RET
;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
; 8x8 DC prediction.
; In:  r0 = dst, r1 = dstStride, r2 = srcPix (srcPix[1..16] = above,
;      srcPix[17..32] = left), r4 = bFilter
; dc = (sum(above[0..7]) + sum(left[0..7]) + 8) >> 4, then optional DC filter
; on the first row/column as in the 4x4 variant.
INIT_XMM sse2
cglobal intra_pred_dc8, 5, 7, 3
pxor m0, m0
movh m1, [r2 + 1] ; above[0..7]
movh m2, [r2 + 17] ; left[0..7]
punpcklqdq m1, m2
psadbw m1, m0
pshufd m2, m1, 2 ; fold the high-qword SAD into the low qword
paddw m1, m2
paddw m1, [pw_8]
psraw m1, 4
pmullw m1, [pw_257] ; duplicate dc into both bytes of each word
pshuflw m1, m1, 0x00 ; m1 = byte [dc_val ...]
test r4d, r4d ; ZF survives the SIMD stores below until jz
; store DC 8x8
lea r6, [r1 + r1 * 2]
lea r5, [r6 + r1 * 2]
movh [r0], m1
movh [r0 + r1], m1
movh [r0 + r1 * 2], m1
movh [r0 + r6], m1
movh [r0 + r1 * 4], m1
movh [r0 + r5], m1
movh [r0 + r6 * 2], m1
lea r5, [r5 + r1 * 2]
movh [r0 + r5], m1
; Do DC Filter
jz .end
psrlw m1, 8 ; m1 words = dc
movq m2, [pw_2]
pmullw m2, m1
paddw m2, [pw_2]
movd r4d, m2 ; r4d = DC * 2 + 2
paddw m1, m2 ; m1 = DC * 3 + 2
pshufd m1, m1, 0
; filter top: (above[x] + 3*dc + 2) >> 2
movq m2, [r2 + 1]
punpcklbw m2, m0
paddw m2, m1
psraw m2, 2 ; sum = sum >> 2
packuswb m2, m2
movh [r0], m2
; filter top-left: (left[0] + 2*dc + above[0] + 2) >> 2
movzx r3d, byte [r2 + 17]
add r4d, r3d
movzx r3d, byte [r2 + 1]
add r3d, r4d
shr r3d, 2
mov [r0], r3b
; filter left: (left[y] + 3*dc + 2) >> 2 for rows 1..7
movq m2, [r2 + 18]
punpcklbw m2, m0
paddw m2, m1
psraw m2, 2
packuswb m2, m2
movd r2d, m2
lea r0, [r0 + r1]
lea r5, [r6 + r1 * 2]
mov [r0], r2b
shr r2, 8
mov [r0 + r1], r2b
shr r2, 8
mov [r0 + r1 * 2], r2b
shr r2, 8
mov [r0 + r6], r2b
pshufd m2, m2, 0x01 ; move bytes 4..6 into the low dword
movd r2d, m2
mov [r0 + r1 * 4], r2b
shr r2, 8
mov [r0 + r5], r2b
shr r2, 8
mov [r0 + r6 * 2], r2b
.end:
RET
;--------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;--------------------------------------------------------------------------------------------
; 16x16 DC prediction.
; In:  r0 = dst, r1 = dstStride, r2 = srcPix (srcPix[1..32] = above,
;      srcPix[33..64] = left), r4 = bFilter
; dc = (sum(above[0..15]) + sum(left[0..15]) + 16) >> 5, then optional DC
; filter on the first row/column. The 64-bit path keeps row-offset multiples
; of the stride in r6..r9; the 32-bit path walks r0/r6 instead.
INIT_XMM sse2
%if ARCH_X86_64
cglobal intra_pred_dc16, 5, 10, 4
%else
cglobal intra_pred_dc16, 5, 7, 4
%endif
pxor m0, m0
movu m1, [r2 + 1] ; above[0..15]
movu m2, [r2 + 33] ; left[0..15]
psadbw m1, m0
psadbw m2, m0
paddw m1, m2
pshufd m2, m1, 2 ; fold high-qword SAD into the low qword
paddw m1, m2
paddw m1, [pw_16]
psraw m1, 5
pmullw m1, [pw_257] ; duplicate dc into both bytes of each word
pshuflw m1, m1, 0x00 ; m1 = byte [dc_val ...]
pshufd m1, m1, 0x00
test r4d, r4d ; ZF survives the SIMD stores below until jz
; store DC 16x16
%if ARCH_X86_64
lea r6, [r1 + r1 * 2] ;index 3
lea r7, [r1 + r1 * 4] ;index 5
lea r8, [r6 + r1 * 4] ;index 7
lea r9, [r0 + r8] ;base + 7
movu [r0], m1
movu [r0 + r1], m1
movu [r0 + r1 * 2], m1
movu [r0 + r6], m1
movu [r0 + r1 * 4], m1
movu [r0 + r7], m1
movu [r0 + r6 * 2], m1
movu [r0 + r8], m1
movu [r0 + r1 * 8], m1
movu [r9 + r1 * 2], m1
movu [r0 + r7 * 2], m1
movu [r9 + r1 * 4], m1
movu [r0 + r6 * 4], m1
movu [r9 + r6 * 2], m1
movu [r0 + r8 * 2], m1
movu [r9 + r1 * 8], m1
%else ;32 bit
mov r6, r0 ; keep original dst for the filter phase
movu [r0], m1
movu [r0 + r1], m1
lea r0, [r0 + r1 * 2]
movu [r0], m1
movu [r0 + r1], m1
lea r0, [r0 + r1 * 2]
movu [r0], m1
movu [r0 + r1], m1
lea r0, [r0 + r1 * 2]
movu [r0], m1
movu [r0 + r1], m1
lea r0, [r0 + r1 * 2]
movu [r0], m1
movu [r0 + r1], m1
lea r0, [r0 + r1 * 2]
movu [r0], m1
movu [r0 + r1], m1
lea r0, [r0 + r1 * 2]
movu [r0], m1
movu [r0 + r1], m1
lea r0, [r0 + r1 * 2]
movu [r0], m1
movu [r0 + r1], m1
%endif
; Do DC Filter
jz .end
psrlw m1, 8 ; m1 words = dc
mova m2, [pw_2]
pmullw m2, m1
paddw m2, [pw_2]
movd r4d, m2 ; r4d = DC * 2 + 2
paddw m1, m2 ; m1 = DC * 3 + 2
; filter top: (above[x] + 3*dc + 2) >> 2, in two 8-pixel halves
movh m2, [r2 + 1]
punpcklbw m2, m0
paddw m2, m1
psraw m2, 2
packuswb m2, m2
movh m3, [r2 + 9]
punpcklbw m3, m0
paddw m3, m1
psraw m3, 2
packuswb m3, m3
; filter top-left: (left[0] + 2*dc + above[0] + 2) >> 2
movzx r5d, byte [r2 + 33]
add r4d, r5d
movzx r3d, byte [r2 + 1]
add r3d, r4d
shr r3d, 2
%if ARCH_X86_64
movh [r0], m2
movh [r0 + 8], m3
mov [r0], r3b
%else ;32 bit
movh [r6], m2
movh [r6 + 8], m3
mov [r6], r3b
add r6, r1
%endif
; filter left: (left[y] + 3*dc + 2) >> 2 for rows 1..15
movh m2, [r2 + 34]
punpcklbw m2, m0
paddw m2, m1
psraw m2, 2
packuswb m2, m2
movh m3, [r2 + 42]
punpcklbw m3, m0
paddw m3, m1
psraw m3, 2
packuswb m3, m3
%if ARCH_X86_64
movh r3, m2 ; 8 filtered bytes scattered a byte at a time
mov [r0 + r1], r3b
shr r3, 8
mov [r0 + r1 * 2], r3b
shr r3, 8
mov [r0 + r6], r3b
shr r3, 8
mov [r0 + r1 * 4], r3b
shr r3, 8
mov [r0 + r7], r3b
shr r3, 8
mov [r0 + r6 * 2], r3b
shr r3, 8
mov [r0 + r8], r3b
shr r3, 8
mov [r0 + r1 * 8], r3b
movh r3, m3
mov [r9 + r1 * 2], r3b
shr r3, 8
mov [r0 + r7 * 2], r3b
shr r3, 8
mov [r9 + r1 * 4], r3b
shr r3, 8
mov [r0 + r6 * 4], r3b
shr r3, 8
mov [r9 + r6 * 2], r3b
shr r3, 8
mov [r0 + r8 * 2], r3b
shr r3, 8
mov [r9 + r1 * 8], r3b
%else ;32 bit
movd r2d, m2
pshufd m2, m2, 0x01
mov [r6], r2b
shr r2, 8
mov [r6 + r1], r2b
shr r2, 8
mov [r6 + r1 * 2], r2b
lea r6, [r6 + r1 * 2]
shr r2, 8
mov [r6 + r1], r2b
movd r2d, m2
mov [r6 + r1 * 2], r2b
lea r6, [r6 + r1 * 2]
shr r2, 8
mov [r6 + r1], r2b
shr r2, 8
mov [r6 + r1 * 2], r2b
lea r6, [r6 + r1 * 2]
shr r2, 8
mov [r6 + r1], r2b
movd r2d, m3
pshufd m3, m3, 0x01
mov [r6 + r1 * 2], r2b
lea r6, [r6 + r1 * 2]
shr r2, 8
mov [r6 + r1], r2b
shr r2, 8
mov [r6 + r1 * 2], r2b
lea r6, [r6 + r1 * 2]
shr r2, 8
mov [r6 + r1], r2b
movd r2d, m3
mov [r6 + r1 * 2], r2b
lea r6, [r6 + r1 * 2]
shr r2, 8
mov [r6 + r1], r2b
shr r2, 8
mov [r6 + r1 * 2], r2b
%endif
.end:
RET
;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
; 32x32 DC prediction. HEVC applies no DC boundary filter at this block size,
; so dirMode/bFilter are never read and only three GPRs are needed.
; In:  r0 = dst, r1 = dstStride, r2 = srcPix (srcPix[1..32]+[33..64] = above,
;      srcPix[65..96]+[97..128] = left)
; dc = (sum(above[0..31]) + sum(left[0..31]) + 32) >> 6
INIT_XMM sse2
cglobal intra_pred_dc32, 3, 3, 5
pxor m0, m0
movu m1, [r2 + 1] ; above[0..15]
movu m2, [r2 + 17] ; above[16..31]
movu m3, [r2 + 65] ; left[0..15]
movu m4, [r2 + 81] ; left[16..31]
psadbw m1, m0
psadbw m2, m0
psadbw m3, m0
psadbw m4, m0
paddw m1, m2
paddw m3, m4
paddw m1, m3
pshufd m2, m1, 2 ; fold high-qword SAD into the low qword
paddw m1, m2
paddw m1, [pw_32]
psraw m1, 6
pmullw m1, [pw_257] ; duplicate dc into both bytes of each word
pshuflw m1, m1, 0x00 ; m1 = byte [dc_val ...]
pshufd m1, m1, 0x00
%assign x 0
%rep 16
; store DC 32x32: two full 32-pixel rows per iteration
movu [r0], m1
movu [r0 + r1], m1
movu [r0 + 16], m1
movu [r0 + r1 + 16], m1
%if x < 15 ; skip the dead pointer advance after the final row pair
lea r0, [r0 + 2 * r1]
%endif
%assign x x+1
%endrep
RET
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
; 4x4 planar prediction:
;   P(x,y) = ((3-x)*left[y] + (x+1)*topRight + (3-y)*above[x] + (y+1)*bottomLeft + 4) >> 3
; m3 accumulates the above/constant terms per row; m4 = bottomLeft - above[x]
; is the per-row increment, added once after each stored row.
INIT_XMM sse2
cglobal intra_pred_planar4, 3,3,5
pxor m0, m0
movh m1, [r2 + 1] ; above[0..7] as words
punpcklbw m1, m0
movh m2, [r2 + 9] ; left[0..7] as words
punpcklbw m2, m0
pshufhw m3, m1, 0 ; topRight
pshufd m3, m3, 0xAA
pshufhw m4, m2, 0 ; bottomLeft
pshufd m4, m4, 0xAA
pmullw m3, [multi_2Row] ; (x + 1) * topRight
pmullw m0, m1, [pw_3] ; (blkSize - 1 - y) * above[x]
paddw m3, [pw_4]
paddw m3, m4
paddw m3, m0
psubw m4, m1 ; per-row increment: bottomLeft - above[x]
pshuflw m1, m2, 0 ; broadcast left[0]
pmullw m1, [pw_planar4_0] ; * (blkSize - 1 - x) weights
paddw m1, m3
paddw m3, m4
psraw m1, 3
packuswb m1, m1
movd [r0], m1
pshuflw m1, m2, 01010101b ; broadcast left[1]
pmullw m1, [pw_planar4_0]
paddw m1, m3
paddw m3, m4
psraw m1, 3
packuswb m1, m1
movd [r0 + r1], m1
lea r0, [r0 + 2 * r1]
pshuflw m1, m2, 10101010b ; broadcast left[2]
pmullw m1, [pw_planar4_0]
paddw m1, m3
paddw m3, m4
psraw m1, 3
packuswb m1, m1
movd [r0], m1
pshuflw m1, m2, 11111111b ; broadcast left[3]
pmullw m1, [pw_planar4_0]
paddw m1, m3
psraw m1, 3
packuswb m1, m1
movd [r0 + r1], m1
RET
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
; 8x8 planar prediction; same recurrence as the 4x4 variant with N = 8:
; m3 holds the row accumulator, m4 = bottomLeft - above[x] the per-row step,
; and each row broadcasts left[y] and multiplies by the (N-1-x) weight table.
INIT_XMM sse2
cglobal intra_pred_planar8, 3,3,6
pxor m0, m0
movh m1, [r2 + 1] ; above[0..7] as words
punpcklbw m1, m0
movh m2, [r2 + 17] ; left[0..7] as words
punpcklbw m2, m0
movd m3, [r2 + 9] ; topRight = above[8];
movd m4, [r2 + 25] ; bottomLeft = left[8];
pand m3, [pw_00ff]
pand m4, [pw_00ff]
pshuflw m3, m3, 0x00
pshuflw m4, m4, 0x00
pshufd m3, m3, 0x44
pshufd m4, m4, 0x44
pmullw m3, [multiL] ; (x + 1) * topRight
pmullw m0, m1, [pw_7] ; (blkSize - 1 - y) * above[x]
paddw m3, [pw_8]
paddw m3, m4
paddw m3, m0
psubw m4, m1 ; per-row increment: bottomLeft - above[x]
; one row of the 8x8 planar block: broadcast left[%1], weight, add, shift, store
%macro INTRA_PRED_PLANAR_8 1
%if (%1 < 4)
pshuflw m5, m2, 0x55 * %1
pshufd m5, m5, 0
%else
pshufhw m5, m2, 0x55 * (%1 - 4)
pshufd m5, m5, 0xAA
%endif
pmullw m5, [pw_planar16_mul + mmsize] ; (blkSize - 1 - x) weights (7..0)
paddw m5, m3
psraw m5, 4
packuswb m5, m5
movh [r0], m5
%if (%1 < 7)
paddw m3, m4
lea r0, [r0 + r1]
%endif
%endmacro
INTRA_PRED_PLANAR_8 0
INTRA_PRED_PLANAR_8 1
INTRA_PRED_PLANAR_8 2
INTRA_PRED_PLANAR_8 3
INTRA_PRED_PLANAR_8 4
INTRA_PRED_PLANAR_8 5
INTRA_PRED_PLANAR_8 6
INTRA_PRED_PLANAR_8 7
RET
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
; 16x16 planar prediction; the 16-wide row is processed as two 8-word halves:
; m3/m4 are the low/high row accumulators and m6/m1 their per-row increments
; (bottomLeft - above[x]). left[y] comes from m2 (rows 0-7) or m7 (rows 8-15).
INIT_XMM sse2
cglobal intra_pred_planar16, 3,5,8
pxor m0, m0
movh m2, [r2 + 1] ; above[0..7]
punpcklbw m2, m0
movh m7, [r2 + 9] ; above[8..15]
punpcklbw m7, m0
movd m3, [r2 + 17] ; topRight = above[16]
movd m6, [r2 + 49] ; bottomLeft = left[16]
pand m3, [pw_00ff]
pand m6, [pw_00ff]
pshuflw m3, m3, 0x00
pshuflw m6, m6, 0x00
pshufd m3, m3, 0x44 ; v_topRight
pshufd m6, m6, 0x44 ; v_bottomLeft
pmullw m4, m3, [multiH] ; (x + 1) * topRight
pmullw m3, [multiL] ; (x + 1) * topRight
pmullw m1, m2, [pw_15] ; (blkSize - 1 - y) * above[x]
pmullw m5, m7, [pw_15] ; (blkSize - 1 - y) * above[x]
paddw m4, [pw_16]
paddw m3, [pw_16]
paddw m4, m6
paddw m3, m6
paddw m4, m5
paddw m3, m1
psubw m1, m6, m7 ; high-half per-row increment
psubw m6, m2 ; low-half per-row increment
movh m2, [r2 + 33] ; left[0..7]
punpcklbw m2, m0
movh m7, [r2 + 41] ; left[8..15]
punpcklbw m7, m0
; one row: broadcast left[%1], weight both halves, add accumulators, store 16 px
%macro INTRA_PRED_PLANAR_16 1
%if (%1 < 4)
pshuflw m5, m2, 0x55 * %1
pshufd m5, m5, 0
%else
%if (%1 < 8)
pshufhw m5, m2, 0x55 * (%1 - 4)
pshufd m5, m5, 0xAA
%else
%if (%1 < 12)
pshuflw m5, m7, 0x55 * (%1 - 8)
pshufd m5, m5, 0
%else
pshufhw m5, m7, 0x55 * (%1 - 12)
pshufd m5, m5, 0xAA
%endif
%endif
%endif
%if (%1 > 0)
paddw m3, m6
paddw m4, m1
lea r0, [r0 + r1]
%endif
pmullw m0, m5, [pw_planar16_mul + mmsize] ; (blkSize - 1 - x), high half
pmullw m5, [pw_planar16_mul] ; (blkSize - 1 - x), low half
paddw m0, m4
paddw m5, m3
psraw m5, 5
psraw m0, 5
packuswb m5, m0
movu [r0], m5
%endmacro
INTRA_PRED_PLANAR_16 0
INTRA_PRED_PLANAR_16 1
INTRA_PRED_PLANAR_16 2
INTRA_PRED_PLANAR_16 3
INTRA_PRED_PLANAR_16 4
INTRA_PRED_PLANAR_16 5
INTRA_PRED_PLANAR_16 6
INTRA_PRED_PLANAR_16 7
INTRA_PRED_PLANAR_16 8
INTRA_PRED_PLANAR_16 9
INTRA_PRED_PLANAR_16 10
INTRA_PRED_PLANAR_16 11
INTRA_PRED_PLANAR_16 12
INTRA_PRED_PLANAR_16 13
INTRA_PRED_PLANAR_16 14
INTRA_PRED_PLANAR_16 15
RET
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
; 32x32 planar prediction. The 32-wide row is split into four 8-word quarters
; with accumulators m0..m3; their per-row increments (bottomLeft - above[x])
; live in m8..m11 on x86-64 or spill to the stack on x86-32. left[y] is
; broadcast from an 8-sample batch (m4) reloaded every 8 rows.
INIT_XMM sse2
%if ARCH_X86_64 == 1
cglobal intra_pred_planar32, 3,3,16
movd m3, [r2 + 33] ; topRight = above[32]
pxor m7, m7
pand m3, [pw_00ff]
pshuflw m3, m3, 0x00
pshufd m3, m3, 0x44
pmullw m0, m3, [multiL] ; (x + 1) * topRight
pmullw m1, m3, [multiH] ; (x + 1) * topRight
pmullw m2, m3, [multiH2] ; (x + 1) * topRight
pmullw m3, [multiH3] ; (x + 1) * topRight
movd m11, [r2 + 97] ; bottomLeft = left[32]
pand m11, [pw_00ff]
pshuflw m11, m11, 0x00
pshufd m11, m11, 0x44
mova m5, m11
paddw m5, [pw_32]
paddw m0, m5
paddw m1, m5
paddw m2, m5
paddw m3, m5
mova m8, m11
mova m9, m11
mova m10, m11
mova m12, [pw_31]
movh m4, [r2 + 1] ; above[0..7]
punpcklbw m4, m7
psubw m8, m4 ; increment for quarter 0
pmullw m4, m12
paddw m0, m4
movh m4, [r2 + 9] ; above[8..15]
punpcklbw m4, m7
psubw m9, m4 ; increment for quarter 1
pmullw m4, m12
paddw m1, m4
movh m4, [r2 + 17] ; above[16..23]
punpcklbw m4, m7
psubw m10, m4 ; increment for quarter 2
pmullw m4, m12
paddw m2, m4
movh m4, [r2 + 25] ; above[24..31]
punpcklbw m4, m7
psubw m11, m4 ; increment for quarter 3
pmullw m4, m12
paddw m3, m4
mova m12, [pw_planar32_mul]
mova m13, [pw_planar32_mul + mmsize]
mova m14, [pw_planar16_mul]
mova m15, [pw_planar16_mul + mmsize]
; emit one 32-pixel row from broadcast left[y] in %1 (clobbers %1)
%macro PROCESS 1
pmullw m5, %1, m12
pmullw m6, %1, m13
paddw m5, m0
paddw m6, m1
psraw m5, 6
psraw m6, 6
packuswb m5, m6
movu [r0], m5
pmullw m5, %1, m14
pmullw %1, m15
paddw m5, m2
paddw %1, m3
psraw m5, 6
psraw %1, 6
packuswb m5, %1
movu [r0 + 16], m5
%endmacro
; advance the four accumulators one row and step the dst pointer
%macro INCREMENT 0
paddw m2, m10
paddw m3, m11
paddw m0, m8
paddw m1, m9
add r0, r1
%endmacro
%assign x 0
%rep 4
pxor m7, m7
movq m4, [r2 + 65 + x * 8] ; next 8 left samples
punpcklbw m4, m7
%assign y 0
%rep 8
%if y < 4
pshuflw m7, m4, 0x55 * y
pshufd m7, m7, 0x44
%else
pshufhw m7, m4, 0x55 * (y - 4)
pshufd m7, m7, 0xEE
%endif
PROCESS m7
%if x + y < 10 ; skip the increment after the final row (x=3, y=7)
INCREMENT
%endif
%assign y y+1
%endrep
%assign x x+1
%endrep
RET
%else ;end ARCH_X86_64, start ARCH_X86_32
cglobal intra_pred_planar32, 3,3,8,0-(4*mmsize)
movd m3, [r2 + 33] ; topRight = above[32]
pxor m7, m7
pand m3, [pw_00ff]
pshuflw m3, m3, 0x00
pshufd m3, m3, 0x44
pmullw m0, m3, [multiL] ; (x + 1) * topRight
pmullw m1, m3, [multiH] ; (x + 1) * topRight
pmullw m2, m3, [multiH2] ; (x + 1) * topRight
pmullw m3, [multiH3] ; (x + 1) * topRight
movd m6, [r2 + 97] ; bottomLeft = left[32]
pand m6, [pw_00ff]
pshuflw m6, m6, 0x00
pshufd m6, m6, 0x44
mova m5, m6
paddw m5, [pw_32]
paddw m0, m5
paddw m1, m5
paddw m2, m5
paddw m3, m5
movh m4, [r2 + 1]
punpcklbw m4, m7
psubw m5, m6, m4
mova [rsp + 0 * mmsize], m5 ; spilled increment for quarter 0
pmullw m4, [pw_31]
paddw m0, m4
movh m4, [r2 + 9]
punpcklbw m4, m7
psubw m5, m6, m4
mova [rsp + 1 * mmsize], m5 ; spilled increment for quarter 1
pmullw m4, [pw_31]
paddw m1, m4
movh m4, [r2 + 17]
punpcklbw m4, m7
psubw m5, m6, m4
mova [rsp + 2 * mmsize], m5 ; spilled increment for quarter 2
pmullw m4, [pw_31]
paddw m2, m4
movh m4, [r2 + 25]
punpcklbw m4, m7
psubw m5, m6, m4
mova [rsp + 3 * mmsize], m5 ; spilled increment for quarter 3
pmullw m4, [pw_31]
paddw m3, m4
; emit one 32-pixel row from broadcast left[y] in %1 (clobbers %1)
%macro PROCESS 1
pmullw m5, %1, [pw_planar32_mul]
pmullw m6, %1, [pw_planar32_mul + mmsize]
paddw m5, m0
paddw m6, m1
psraw m5, 6
psraw m6, 6
packuswb m5, m6
movu [r0], m5
pmullw m5, %1, [pw_planar16_mul]
pmullw %1, [pw_planar16_mul + mmsize]
paddw m5, m2
paddw %1, m3
psraw m5, 6
psraw %1, 6
packuswb m5, %1
movu [r0 + 16], m5
%endmacro
; advance the four accumulators one row and step the dst pointer
%macro INCREMENT 0
paddw m0, [rsp + 0 * mmsize]
paddw m1, [rsp + 1 * mmsize]
paddw m2, [rsp + 2 * mmsize]
paddw m3, [rsp + 3 * mmsize]
add r0, r1
%endmacro
%assign y 0
%rep 4
pxor m7, m7
movq m4, [r2 + 65 + y * 8] ; next 8 left samples
punpcklbw m4, m7
%assign x 0
%rep 8
%if x < 4
pshuflw m7, m4, 0x55 * x
pshufd m7, m7, 0x44
%else
pshufhw m7, m4, 0x55 * (x - 4)
pshufd m7, m7, 0xEE
%endif
PROCESS m7
%if x + y < 10 ; skip the increment after the final row
INCREMENT
%endif
%assign x x+1
%endrep
%assign y y+1
%endrep
RET
%endif ; end ARCH_X86_32
; Store the four dwords of m0 as four 4-pixel rows at [r0].
; NOTE: clobbers r1 (r1 becomes dstStride * 3) and shifts m0, so callers must
; RET immediately after the expansion.
%macro STORE_4x4 0
movd [r0], m0
psrldq m0, 4
movd [r0 + r1], m0
psrldq m0, 4
movd [r0 + r1 * 2], m0
lea r1, [r1 * 3]
psrldq m0, 4
movd [r0 + r1], m0
%endmacro
; Transpose a 4x4 block held as words: m0 = rows 0-1, m2 = rows 2-3.
; Result is packed to bytes in m0 (one transposed row per dword); clobbers m1/m2.
; Used by the horizontal angular modes, which compute the block column-wise.
%macro TRANSPOSE_4x4 0
pshufd m0, m0, 0xD8
pshufd m1, m2, 0xD8
pshuflw m0, m0, 0xD8
pshuflw m1, m1, 0xD8
pshufhw m0, m0, 0xD8
pshufhw m1, m1, 0xD8
mova m2, m0
punpckldq m0, m1
punpckhdq m2, m1
packuswb m0, m2
%endmacro
;-----------------------------------------------------------------------------------------
; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
; Modes 2 and 34 (pure diagonals, no interpolation): row y is a straight copy
; of reference samples starting at offset y+1. Mode 34 reads the above array
; ([r2 + 2]), mode 2 the left array ([r2 + 10]), selected branch-free via cmove.
INIT_XMM sse2
cglobal intra_pred_ang4_2, 3,5,1
lea r4, [r2 + 2] ; above + 1 (mode 34 source)
add r2, 10 ; left + 1 (mode 2 source)
cmp r3m, byte 34
cmove r2, r4
movh m0, [r2]
movd [r0], m0
psrldq m0, 1
movd [r0 + r1], m0
psrldq m0, 1
movd [r0 + r1 * 2], m0
lea r1, [r1 * 3] ; r1 is dead after this store
psrldq m0, 1
movd [r0 + r1], m0
RET
INIT_XMM sse2
; 4x4 angular mode 3 (horizontal, angle +26). Each row y blends two adjacent
; left samples with word weights (32-frac, frac) from pw_ang_table, fracs
; 26/20/14/8 at offsets 0/1/2/3; the block is computed column-wise and
; transposed before storing.
cglobal intra_pred_ang4_3, 3,3,5
movh m3, [r2 + 9] ; [8 7 6 5 4 3 2 1]
punpcklbw m3, m3
psrldq m3, 1
movh m0, m3 ;[x x x x x x x x 5 4 4 3 3 2 2 1]
psrldq m3, 2
movh m1, m3 ;[x x x x x x x x 6 5 5 4 4 3 3 2]
psrldq m3, 2
movh m2, m3 ;[x x x x x x x x 7 6 6 5 5 4 4 3]
psrldq m3, 2 ;[x x x x x x x x 8 7 7 6 6 5 5 4]
pxor m4, m4
punpcklbw m1, m4
pmaddwd m1, [pw_ang_table + 20 * 16]
punpcklbw m0, m4
pmaddwd m0, [pw_ang_table + 26 * 16]
packssdw m0, m1
paddw m0, [pw_16] ; rounding before >> 5
psraw m0, 5
punpcklbw m3, m4
pmaddwd m3, [pw_ang_table + 8 * 16]
punpcklbw m2, m4
pmaddwd m2, [pw_ang_table + 14 * 16]
packssdw m2, m3
paddw m2, [pw_16]
psraw m2, 5
TRANSPOSE_4x4
STORE_4x4
RET
; 4x4 angular mode 4 (horizontal, angle +21): fracs 21/10/31/20, sample
; offsets 0/1/1/2 into the left array; transposed before storing.
cglobal intra_pred_ang4_4, 3,3,5
movh m1, [r2 + 9] ;[8 7 6 5 4 3 2 1]
punpcklbw m1, m1
psrldq m1, 1
movh m0, m1 ;[x x x x x x x x 5 4 4 3 3 2 2 1]
psrldq m1, 2
movh m2, m1 ;[x x x x x x x x 6 5 5 4 4 3 3 2]
psrldq m1, 2 ;[x x x x x x x x 7 6 6 5 5 4 4 3]
pxor m4, m4
punpcklbw m2, m4
mova m3, m2
pmaddwd m3, [pw_ang_table + 10 * 16]
punpcklbw m0, m4
pmaddwd m0, [pw_ang_table + 21 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
punpcklbw m1, m4
pmaddwd m1, [pw_ang_table + 20 * 16]
pmaddwd m2, [pw_ang_table + 31 * 16]
packssdw m2, m1
paddw m2, [pw_16]
psraw m2, 5
TRANSPOSE_4x4
STORE_4x4
RET
; 4x4 angular mode 5 (horizontal, angle +17): fracs 17/2/19/4, sample
; offsets 0/1/1/2 into the left array; transposed before storing.
cglobal intra_pred_ang4_5, 3,3,5
movh m3, [r2 + 9] ;[8 7 6 5 4 3 2 1]
punpcklbw m3, m3
psrldq m3, 1
mova m0, m3 ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
psrldq m3, 2
mova m2, m3 ;[x x x x x x x x 6 5 5 4 4 3 3 2]
psrldq m3, 2 ;[x x x x x x x x 7 6 6 5 5 4 4 3]
pxor m1, m1
punpcklbw m2, m1
mova m4, m2
pmaddwd m4, [pw_ang_table + 2 * 16]
punpcklbw m0, m1
pmaddwd m0, [pw_ang_table + 17 * 16]
packssdw m0, m4
paddw m0, [pw_16]
psraw m0, 5
punpcklbw m3, m1
pmaddwd m3, [pw_ang_table + 4 * 16]
pmaddwd m2, [pw_ang_table + 19 * 16]
packssdw m2, m3
paddw m2, [pw_16]
psraw m2, 5
TRANSPOSE_4x4
STORE_4x4
RET
; 4x4 angular mode 6 (horizontal, angle +13): fracs 13/26/7/20, sample
; offsets 0/0/1/1 into the left array; transposed before storing.
cglobal intra_pred_ang4_6, 3,3,4
movh m2, [r2 + 9] ;[8 7 6 5 4 3 2 1]
punpcklbw m2, m2
psrldq m2, 1
movh m0, m2 ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
psrldq m2, 2 ;[x x x 8 8 7 7 6 6 5 5 4 4 3 3 2]
pxor m1, m1
punpcklbw m0, m1
mova m3, m0
pmaddwd m3, [pw_ang_table + 26 * 16]
pmaddwd m0, [pw_ang_table + 13 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
punpcklbw m2, m1
mova m3, m2
pmaddwd m3, [pw_ang_table + 20 * 16]
pmaddwd m2, [pw_ang_table + 7 * 16]
packssdw m2, m3
paddw m2, [pw_16]
psraw m2, 5
TRANSPOSE_4x4
STORE_4x4
RET
; 4x4 angular mode 7 (horizontal, angle +9): fracs 9/18/27/4, sample
; offsets 0/0/0/1 into the left array; transposed before storing.
cglobal intra_pred_ang4_7, 3,3,5
movh m3, [r2 + 9] ;[8 7 6 5 4 3 2 1]
punpcklbw m3, m3
psrldq m3, 1
movh m0, m3 ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
psrldq m3, 2 ;[x x x x x x x x 6 5 5 4 4 3 3 2]
pxor m1, m1
punpcklbw m0, m1
mova m4, m0
mova m2, m0
pmaddwd m4, [pw_ang_table + 18 * 16]
pmaddwd m0, [pw_ang_table + 9 * 16]
packssdw m0, m4
paddw m0, [pw_16]
psraw m0, 5
punpcklbw m3, m1
pmaddwd m3, [pw_ang_table + 4 * 16]
pmaddwd m2, [pw_ang_table + 27 * 16]
packssdw m2, m3
paddw m2, [pw_16]
psraw m2, 5
TRANSPOSE_4x4
STORE_4x4
RET
; 4x4 angular mode 8 (horizontal, angle +5): fracs 5/10/15/20, all rows use
; sample offset 0 into the left array; transposed before storing.
cglobal intra_pred_ang4_8, 3,3,5
movh m0, [r2 + 9] ;[8 7 6 5 4 3 2 1]
punpcklbw m0, m0
psrldq m0, 1 ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
pxor m1, m1
punpcklbw m0, m1
mova m2, m0
mova m3, m0
mova m4, m2
pmaddwd m3, [pw_ang_table + 10 * 16]
pmaddwd m0, [pw_ang_table + 5 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
pmaddwd m4, [pw_ang_table + 20 * 16]
pmaddwd m2, [pw_ang_table + 15 * 16]
packssdw m2, m4
paddw m2, [pw_16]
psraw m2, 5
TRANSPOSE_4x4
STORE_4x4
RET
; 4x4 angular mode 9 (horizontal, angle +2): fracs 2/4/6/8, all rows use
; sample offset 0 into the left array; transposed before storing.
cglobal intra_pred_ang4_9, 3,3,5
movh m0, [r2 + 9] ;[8 7 6 5 4 3 2 1]
punpcklbw m0, m0
psrldq m0, 1 ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
pxor m1, m1
punpcklbw m0, m1
mova m2, m0
mova m3, m0
mova m4, m2
pmaddwd m3, [pw_ang_table + 4 * 16]
pmaddwd m0, [pw_ang_table + 2 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
pmaddwd m4, [pw_ang_table + 8 * 16]
pmaddwd m2, [pw_ang_table + 6 * 16]
packssdw m2, m4
paddw m2, [pw_16]
psraw m2, 5
TRANSPOSE_4x4
STORE_4x4
RET
; 4x4 angular mode 10 (pure horizontal): each row replicates left[y].
; When bFilter != 0, the first column is additionally filtered:
; dst[0][x] = clip(left[0] + ((above[x] - topLeft) >> 1)).
cglobal intra_pred_ang4_10, 3,5,4
movd m0, [r2 + 9] ;[8 7 6 5 4 3 2 1]
punpcklbw m0, m0
punpcklwd m0, m0 ; each left sample replicated 4x, one dword per row
pshufd m1, m0, 1
movhlps m2, m0
pshufd m3, m0, 3
movd [r0 + r1], m1
movd [r0 + r1 * 2], m2
lea r1, [r1 * 3]
movd [r0 + r1], m3
cmp r4m, byte 0
jz .quit
; filter
pxor m3, m3
punpcklbw m0, m3 ; left[0] as 4 words
movh m1, [r2] ;[4 3 2 1 0]
punpcklbw m1, m3
pshuflw m2, m1, 0x00 ; broadcast topLeft
psrldq m1, 2 ; above[x]
psubw m1, m2
psraw m1, 1
paddw m0, m1
packuswb m0, m0 ; clip to [0, 255]
.quit:
movd [r0], m0
RET
; 4x4 angular mode 11 (horizontal, angle -2): needs the top-left sample (A)
; prepended to the left array; fracs 30/28/26/24, all rows at offset 0.
; [r2 - 7] over-reads before srcPix so a single movh lands A in the top byte.
cglobal intra_pred_ang4_11, 3,3,5
movd m1, [r2 + 9] ;[4 3 2 1]
movh m0, [r2 - 7] ;[A x x x x x x x]
punpcklbw m1, m1 ;[4 4 3 3 2 2 1 1]
punpcklqdq m0, m1 ;[4 4 3 3 2 2 1 1 A x x x x x x x]]
psrldq m0, 7 ;[x x x x x x x x 4 3 3 2 2 1 1 A]
pxor m1, m1
punpcklbw m0, m1
mova m2, m0
mova m3, m0
mova m4, m2
pmaddwd m3, [pw_ang_table + 28 * 16]
pmaddwd m0, [pw_ang_table + 30 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
pmaddwd m4, [pw_ang_table + 24 * 16]
pmaddwd m2, [pw_ang_table + 26 * 16]
packssdw m2, m4
paddw m2, [pw_16]
psraw m2, 5
TRANSPOSE_4x4
STORE_4x4
RET
; 4x4 angular mode 12 (horizontal, angle -5): top-left sample (A) prepended
; to the left array; fracs 27/22/17/12, all rows at offset 0.
cglobal intra_pred_ang4_12, 3,3,5
movd m1, [r2 + 9] ;[4 3 2 1]
movh m0, [r2 - 7] ;[A x x x x x x x]
punpcklbw m1, m1 ;[4 4 3 3 2 2 1 1]
punpcklqdq m0, m1 ;[4 4 3 3 2 2 1 1 A x x x x x x x]
psrldq m0, 7 ;[x x x x x x x x 4 3 3 2 2 1 1 A]
pxor m1, m1
punpcklbw m0, m1
mova m2, m0
mova m3, m0
mova m4, m2
pmaddwd m3, [pw_ang_table + 22 * 16]
pmaddwd m0, [pw_ang_table + 27 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
pmaddwd m4, [pw_ang_table + 12 * 16]
pmaddwd m2, [pw_ang_table + 17 * 16]
packssdw m2, m4
paddw m2, [pw_16]
psraw m2, 5
TRANSPOSE_4x4
STORE_4x4
RET
; 4x4 angular mode 24 (vertical counterpart of mode 12, angle -5): same
; weights as mode 12 but reads the above array ([r2 + 1]) and stores the
; rows directly (packuswb instead of TRANSPOSE_4x4).
cglobal intra_pred_ang4_24, 3,3,5
movd m1, [r2 + 1] ;[4 3 2 1]
movh m0, [r2 - 7] ;[A x x x x x x x]
punpcklbw m1, m1 ;[4 4 3 3 2 2 1 1]
punpcklqdq m0, m1 ;[4 4 3 3 2 2 1 1 A x x x x x x x]
psrldq m0, 7 ;[x x x x x x x x 4 3 3 2 2 1 1 A]
pxor m1, m1
punpcklbw m0, m1
mova m2, m0
mova m3, m0
mova m4, m2
pmaddwd m3, [pw_ang_table + 22 * 16]
pmaddwd m0, [pw_ang_table + 27 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
pmaddwd m4, [pw_ang_table + 12 * 16]
pmaddwd m2, [pw_ang_table + 17 * 16]
packssdw m2, m4
paddw m2, [pw_16]
psraw m2, 5
packuswb m0, m2 ; vertical mode: rows are already in order, no transpose
STORE_4x4
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_13(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 13 (horizontal family, steeper angle).
; Needs two projected side samples (A = top-left area, B = projected from the
; above row via the inverse angle) prepended to the left column before the
; usual pmaddwd blend / (+16)>>5 / transpose sequence.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_13, 3,3,5
movd m1, [r2 - 1] ;[x x A x]
movd m2, [r2 + 9] ;[4 3 2 1] left column
movd m0, [r2 + 3] ;[x x B x] projected reference
punpcklbw m0, m1 ;[x x x x A B x x]
punpckldq m0, m2 ;[4 3 2 1 A B x x]
psrldq m0, 2 ;[x x 4 3 2 1 A B]
punpcklbw m0, m0 ; duplicate bytes into adjacent pairs
psrldq m0, 1
movh m3, m0 ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
psrldq m0, 2 ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
pxor m1, m1
punpcklbw m0, m1 ; widen to words
mova m4, m0
mova m2, m0
pmaddwd m4, [pw_ang_table + 14 * 16]
pmaddwd m0, [pw_ang_table + 23 * 16]
packssdw m0, m4
paddw m0, [pw_16]
psraw m0, 5
punpcklbw m3, m1
pmaddwd m3, [pw_ang_table + 28 * 16]
pmaddwd m2, [pw_ang_table + 5 * 16]
packssdw m2, m3
paddw m2, [pw_16]
psraw m2, 5
TRANSPOSE_4x4
STORE_4x4
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_14(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 14 (horizontal family).
; Prepends two side samples (A, B) to the left column, builds two shifted
; pair vectors, blends each with its pw_ang_table weight set, rounds with
; (+16)>>5 and transposes (horizontal mode).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_14, 3,3,4
movd m1, [r2 - 1] ;[x x A x]
movd m0, [r2 + 1] ;[x x B x]
punpcklbw m0, m1 ;[A B x x]
movd m1, [r2 + 9] ;[4 3 2 1] left column
punpckldq m0, m1 ;[4 3 2 1 A B x x]
psrldq m0, 2 ;[x x 4 3 2 1 A B]
punpcklbw m0, m0 ;[x x x x 4 4 3 3 2 2 1 1 A A B B]
psrldq m0, 1
movh m2, m0 ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
psrldq m0, 2 ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
pxor m1, m1
punpcklbw m0, m1 ; widen to words
mova m3, m0
pmaddwd m3, [pw_ang_table + 6 * 16]
pmaddwd m0, [pw_ang_table + 19 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
punpcklbw m2, m1
mova m3, m2
pmaddwd m3, [pw_ang_table + 12 * 16]
pmaddwd m2, [pw_ang_table + 25 * 16]
packssdw m2, m3
paddw m2, [pw_16]
psraw m2, 5
TRANSPOSE_4x4
STORE_4x4
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_15(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 15 (horizontal family).
; Three side samples (A, B, C — projected per the mode-15 inverse angle)
; are prepended to the left column; three shifted pair vectors are blended
; with pw_ang_table weights, rounded (+16)>>5, then transposed.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_15, 3,3,5
movd m0, [r2] ;[x x x A] top-left
movd m1, [r2 + 2] ;[x x x B]
punpcklbw m1, m0 ;[x x A B]
movd m0, [r2 + 3] ;[x x C x]
punpcklwd m0, m1 ;[A B C x]
movd m1, [r2 + 9] ;[4 3 2 1] left column
punpckldq m0, m1 ;[4 3 2 1 A B C x]
psrldq m0, 1 ;[x 4 3 2 1 A B C]
punpcklbw m0, m0 ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
psrldq m0, 1
movh m1, m0 ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
psrldq m0, 2
movh m2, m0 ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
psrldq m0, 2 ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
pxor m4, m4
punpcklbw m2, m4 ; widen each pair vector to words
mova m3, m2
pmaddwd m3, [pw_ang_table + 30 * 16]
punpcklbw m0, m4
pmaddwd m0, [pw_ang_table + 15 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
punpcklbw m1, m4
pmaddwd m1, [pw_ang_table + 28 * 16]
pmaddwd m2, [pw_ang_table + 13 * 16]
packssdw m2, m1
paddw m2, [pw_16]
psraw m2, 5
TRANSPOSE_4x4
STORE_4x4
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_16(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 16 (horizontal family).
; Same shape as mode 15, with mode-16 weights. Note B and C are both read
; from [r2 + 2]: for this angle two projected positions round to the same
; source sample (mirrors mode 20, which reads [r2 + 10] twice).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_16, 3,3,5
movd m2, [r2] ;[x x x A] top-left
movd m1, [r2 + 2] ;[x x x B]
punpcklbw m1, m2 ;[x x A B]
movd m0, [r2 + 2] ;[x x C x] same source byte as B (see header)
punpcklwd m0, m1 ;[A B C x]
movd m1, [r2 + 9] ;[4 3 2 1] left column
punpckldq m0, m1 ;[4 3 2 1 A B C x]
psrldq m0, 1 ;[x 4 3 2 1 A B C]
punpcklbw m0, m0 ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
psrldq m0, 1
movh m1, m0 ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
psrldq m0, 2
movh m2, m0 ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
psrldq m0, 2 ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
pxor m4, m4
punpcklbw m2, m4
mova m3, m2
pmaddwd m3, [pw_ang_table + 22 * 16]
punpcklbw m0, m4
pmaddwd m0, [pw_ang_table + 11 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
punpcklbw m1, m4
pmaddwd m1, [pw_ang_table + 12 * 16]
pmaddwd m2, [pw_ang_table + 1 * 16]
packssdw m2, m1
paddw m2, [pw_16]
psraw m2, 5
TRANSPOSE_4x4
STORE_4x4
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_17(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 17 (horizontal family, near-diagonal).
; Four projected side samples (A..D) are prepended to the left column; four
; shifted pair vectors are blended with mode-17 weights, rounded (+16)>>5,
; then transposed. Mirror of mode 19, which reads the projections from the
; left buffer and skips the transpose.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_17, 3,3,5
movd m2, [r2] ;[x x x A] top-left
movd m3, [r2 + 1] ;[x x x B]
movd m4, [r2 + 2] ;[x x x C]
movd m0, [r2 + 4] ;[x x x D]
punpcklbw m3, m2 ;[x x A B]
punpcklbw m0, m4 ;[x x C D]
punpcklwd m0, m3 ;[A B C D]
movd m1, [r2 + 9] ;[4 3 2 1] left column
punpckldq m0, m1 ;[4 3 2 1 A B C D]
punpcklbw m0, m0 ;[4 4 3 3 2 2 1 1 A A B B C C D D]
psrldq m0, 1
movh m1, m0 ;[x 4 4 3 3 2 2 1 1 A A B B C C D]
psrldq m0, 2
movh m2, m0 ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
psrldq m0, 2
movh m3, m0 ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
psrldq m0, 2 ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
pxor m4, m4
punpcklbw m3, m4
pmaddwd m3, [pw_ang_table + 12 * 16]
punpcklbw m0, m4
pmaddwd m0, [pw_ang_table + 6 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
punpcklbw m1, m4
pmaddwd m1, [pw_ang_table + 24 * 16]
punpcklbw m2, m4
pmaddwd m2, [pw_ang_table + 18 * 16]
packssdw m2, m1
paddw m2, [pw_16]
psraw m2, 5
TRANSPOSE_4x4
STORE_4x4
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_18(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 18 (pure 45-degree diagonal).
; Builds the 7-byte diagonal reference [left3 left2 left1 topleft a1 a2 a3 a4]
; in m0 and writes one 4-byte window per row, shifting by one byte each time.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_18, 3,4,2
mov r3d, [r2 + 8] ; bytes [r2+8..11]: low byte is a dummy, then left1..3
mov r3b, byte [r2] ; replace dummy low byte with the top-left sample
bswap r3d ; reverse -> bytes (lo..hi) = left3 left2 left1 topleft
movd m0, r3d
movd m1, [r2 + 1] ; above a1..a4
punpckldq m0, m1 ; m0 = [a4 a3 a2 a1 TL L1 L2 L3] (hi..lo)
lea r3, [r1 * 3]
movd [r0 + r3], m0 ; row 3: [L3 L2 L1 TL]
psrldq m0, 1
movd [r0 + r1 * 2], m0 ; row 2: [L2 L1 TL a1]
psrldq m0, 1
movd [r0 + r1], m0 ; row 1: [L1 TL a1 a2]
psrldq m0, 1
movd [r0], m0 ; row 0: [TL a1 a2 a3]
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_19(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 19 (vertical family, near-diagonal).
; Mirror of mode 17: identical weights, but the projected side samples
; (B..D) come from the left buffer ([r2 + 9..12]) and the main reference is
; the above row ([r2 + 1]); rows are already in raster order so packuswb
; replaces the transpose.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_19, 3,3,5
movd m2, [r2] ;[x x x A] top-left
movd m3, [r2 + 9] ;[x x x B]
movd m4, [r2 + 10] ;[x x x C]
movd m0, [r2 + 12] ;[x x x D]
punpcklbw m3, m2 ;[x x A B]
punpcklbw m0, m4 ;[x x C D]
punpcklwd m0, m3 ;[A B C D]
movd m1, [r2 + 1] ;[4 3 2 1] above row
punpckldq m0, m1 ;[4 3 2 1 A B C D]
punpcklbw m0, m0 ;[4 4 3 3 2 2 1 1 A A B B C C D D]
psrldq m0, 1
movh m1, m0 ;[x 4 4 3 3 2 2 1 1 A A B B C C D]
psrldq m0, 2
movh m2, m0 ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
psrldq m0, 2
movh m3, m0 ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
psrldq m0, 2 ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
pxor m4, m4
punpcklbw m3, m4
pmaddwd m3, [pw_ang_table + 12 * 16]
punpcklbw m0, m4
pmaddwd m0, [pw_ang_table + 6 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
punpcklbw m1, m4
pmaddwd m1, [pw_ang_table + 24 * 16]
punpcklbw m2, m4
pmaddwd m2, [pw_ang_table + 18 * 16]
packssdw m2, m1
paddw m2, [pw_16]
psraw m2, 5
packuswb m0, m2 ; vertical mode: no transpose
STORE_4x4
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_20(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 20 (vertical family).
; Mirror of mode 16: identical weights, projections read from the left
; buffer (B and C both resolve to [r2 + 10] for this angle), main reference
; is the above row, and packuswb replaces the transpose.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_20, 3,3,5
movd m2, [r2] ;[x x x A] top-left
movd m1, [r2 + 10] ;[x x x B]
punpcklbw m1, m2 ;[x x A B]
movd m0, [r2 + 10] ;[x x C x] same source byte as B (see header)
punpcklwd m0, m1 ;[A B C x]
movd m1, [r2 + 1] ;[4 3 2 1] above row
punpckldq m0, m1 ;[4 3 2 1 A B C x]
psrldq m0, 1 ;[x 4 3 2 1 A B C]
punpcklbw m0, m0 ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
psrldq m0, 1
movh m1, m0 ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
psrldq m0, 2
movh m2, m0 ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
psrldq m0, 2 ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
pxor m4, m4
punpcklbw m2, m4
mova m3, m2
pmaddwd m3, [pw_ang_table + 22 * 16]
punpcklbw m0, m4
pmaddwd m0, [pw_ang_table + 11 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
punpcklbw m1, m4
pmaddwd m1, [pw_ang_table + 12 * 16]
pmaddwd m2, [pw_ang_table + 1 * 16]
packssdw m2, m1
paddw m2, [pw_16]
psraw m2, 5
packuswb m0, m2 ; vertical mode: no transpose
STORE_4x4
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_21(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 21 (vertical family).
; Mirror of mode 15: identical weights, projections (B at [r2+10], C at
; [r2+11]) taken from the left buffer, main reference is the above row;
; packuswb replaces the transpose.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_21, 3,3,5
movd m0, [r2] ;[x x x A] top-left
movd m1, [r2 + 10] ;[x x x B]
punpcklbw m1, m0 ;[x x A B]
movd m0, [r2 + 11] ;[x x C x]
punpcklwd m0, m1 ;[A B C x]
movd m1, [r2 + 1] ;[4 3 2 1] above row
punpckldq m0, m1 ;[4 3 2 1 A B C x]
psrldq m0, 1 ;[x 4 3 2 1 A B C]
punpcklbw m0, m0 ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
psrldq m0, 1
movh m1, m0 ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
psrldq m0, 2
movh m2, m0 ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
psrldq m0, 2 ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
pxor m4, m4
punpcklbw m2, m4
mova m3, m2
pmaddwd m3, [pw_ang_table + 30 * 16]
punpcklbw m0, m4
pmaddwd m0, [pw_ang_table + 15 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
punpcklbw m1, m4
pmaddwd m1, [pw_ang_table + 28 * 16]
pmaddwd m2, [pw_ang_table + 13 * 16]
packssdw m2, m1
paddw m2, [pw_16]
psraw m2, 5
packuswb m0, m2 ; vertical mode: no transpose
STORE_4x4
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_22(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 22 (vertical family).
; Mirror of mode 14: identical weights, projection B read from the left
; buffer ([r2 + 9]), main reference is the above row; packuswb replaces
; the transpose.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_22, 3,3,4
movd m1, [r2 - 1] ;[x x A x]
movd m0, [r2 + 9] ;[x x B x]
punpcklbw m0, m1 ;[A B x x]
movd m1, [r2 + 1] ;[4 3 2 1] above row
punpckldq m0, m1 ;[4 3 2 1 A B x x]
psrldq m0, 2 ;[x x 4 3 2 1 A B]
punpcklbw m0, m0 ;[x x x x 4 4 3 3 2 2 1 1 A A B B]
psrldq m0, 1
movh m2, m0 ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
psrldq m0, 2 ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
pxor m1, m1
punpcklbw m0, m1
mova m3, m0
pmaddwd m3, [pw_ang_table + 6 * 16]
pmaddwd m0, [pw_ang_table + 19 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
punpcklbw m2, m1
mova m3, m2
pmaddwd m3, [pw_ang_table + 12 * 16]
pmaddwd m2, [pw_ang_table + 25 * 16]
packssdw m2, m3
paddw m2, [pw_16]
psraw m2, 5
packuswb m0, m2 ; vertical mode: no transpose
STORE_4x4
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_23(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 23 (vertical family).
; Mirror of mode 13: identical weights, projection B read from the left
; buffer ([r2 + 11]), main reference is the above row; packuswb replaces
; the transpose.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_23, 3,3,5
movd m1, [r2 - 1] ;[x x A x]
movd m2, [r2 + 1] ;[4 3 2 1] above row
movd m0, [r2 + 11] ;[x x B x]
punpcklbw m0, m1 ;[x x x x A B x x]
punpckldq m0, m2 ;[4 3 2 1 A B x x]
psrldq m0, 2 ;[x x 4 3 2 1 A B]
punpcklbw m0, m0 ; duplicate bytes into adjacent pairs
psrldq m0, 1
mova m3, m0 ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
psrldq m0, 2 ;[x x x x x x x 4 4 3 3 2 2 1 1 A]
pxor m1, m1
punpcklbw m0, m1
mova m4, m0
mova m2, m0
pmaddwd m4, [pw_ang_table + 14 * 16]
pmaddwd m0, [pw_ang_table + 23 * 16]
packssdw m0, m4
paddw m0, [pw_16]
psraw m0, 5
punpcklbw m3, m1
pmaddwd m3, [pw_ang_table + 28 * 16]
pmaddwd m2, [pw_ang_table + 5 * 16]
packssdw m2, m3
paddw m2, [pw_16]
psraw m2, 5
packuswb m0, m2 ; vertical mode: no transpose
STORE_4x4
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_25(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 25 (vertical family).
; Mirror of mode 11: identical weights, main reference is the above row
; ([r2 + 1]); packuswb replaces the transpose.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_25, 3,3,5
movd m1, [r2 + 1] ;[4 3 2 1] above row
movh m0, [r2 - 7] ;[A x x x x x x x] A = top-left sample
punpcklbw m1, m1 ;[4 4 3 3 2 2 1 1]
punpcklqdq m0, m1 ;[4 4 3 3 2 2 1 1 A x x x x x x x]
psrldq m0, 7 ;[x x x x x x x x 4 3 3 2 2 1 1 A]
pxor m1, m1
punpcklbw m0, m1 ; widen to words
mova m2, m0
mova m3, m0
mova m4, m2
pmaddwd m3, [pw_ang_table + 28 * 16]
pmaddwd m0, [pw_ang_table + 30 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
pmaddwd m4, [pw_ang_table + 24 * 16]
pmaddwd m2, [pw_ang_table + 26 * 16]
packssdw m2, m4
paddw m2, [pw_16]
psraw m2, 5
packuswb m0, m2 ; vertical mode: no transpose
STORE_4x4
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_26(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 26 (pure vertical).
; Copies above[1..4] to all four rows. If bFilter (r4m) is set, the first
; column is smoothed: dst[y][0] = clip(above[1] + ((left[y] - topleft) >> 1)).
; Note r2 (srcPix pointer) is reused as a GPR scratch for the byte stores
; after all loads are done.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_26, 3,4,4
movd m0, [r2 + 1] ;[8 7 6 5 4 3 2 1] above row
; store
movd [r0], m0
movd [r0 + r1], m0
movd [r0 + r1 * 2], m0
lea r3, [r1 * 3]
movd [r0 + r3], m0
; filter
cmp r4m, byte 0
jz .quit
pxor m3, m3
punpcklbw m0, m3
pshuflw m0, m0, 0x00 ; broadcast above[1]
movd m2, [r2] ; top-left sample
punpcklbw m2, m3
pshuflw m2, m2, 0x00 ; broadcast top-left
movd m1, [r2 + 9] ; left column
punpcklbw m1, m3
psubw m1, m2 ; left[y] - topleft
psraw m1, 1 ; >> 1
paddw m0, m1
packuswb m0, m0 ; clip to [0,255]
movd r2, m0 ; 4 filtered bytes -> GPR (r2 no longer needed)
mov [r0], r2b
shr r2, 8
mov [r0 + r1], r2b
shr r2, 8
mov [r0 + r1 * 2], r2b
shr r2, 8
mov [r0 + r3], r2b
.quit:
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_27(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 27 (vertical family, shallow angle).
; All four rows use the same adjacent-pair vector from the above row; only
; the pw_ang_table weight set differs per row. (+16)>>5 rounding, no
; transpose needed.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_27, 3,3,5
movh m0, [r2 + 1] ;[8 7 6 5 4 3 2 1] above row
punpcklbw m0, m0
psrldq m0, 1 ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
pxor m1, m1
punpcklbw m0, m1 ; widen to words
mova m2, m0
mova m3, m0
mova m4, m2
pmaddwd m3, [pw_ang_table + 4 * 16]
pmaddwd m0, [pw_ang_table + 2 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
pmaddwd m4, [pw_ang_table + 8 * 16]
pmaddwd m2, [pw_ang_table + 6 * 16]
packssdw m2, m4
paddw m2, [pw_16]
psraw m2, 5
packuswb m0, m2
STORE_4x4
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_28(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 28 (vertical family).
; Same structure as mode 27, with the mode-28 weight set.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_28, 3,3,5
movh m0, [r2 + 1] ;[8 7 6 5 4 3 2 1] above row
punpcklbw m0, m0
psrldq m0, 1 ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
pxor m1, m1
punpcklbw m0, m1 ; widen to words
mova m2, m0
mova m3, m0
mova m4, m2
pmaddwd m3, [pw_ang_table + 10 * 16]
pmaddwd m0, [pw_ang_table + 5 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
pmaddwd m4, [pw_ang_table + 20 * 16]
pmaddwd m2, [pw_ang_table + 15 * 16]
packssdw m2, m4
paddw m2, [pw_16]
psraw m2, 5
packuswb m0, m2
STORE_4x4
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_29(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 29 (vertical family).
; Rows 0-2 use the base pair vector; row 3 advances one reference sample
; (m3, shifted by 2 bytes) before blending with the mode-29 weights.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_29, 3,3,5
movh m3, [r2 + 1] ;[8 7 6 5 4 3 2 1] above row
punpcklbw m3, m3
psrldq m3, 1
movh m0, m3 ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
psrldq m3, 2 ;[x x x x x x x x 6 5 5 4 4 3 3 2] shifted by one sample
pxor m1, m1
punpcklbw m0, m1 ; widen to words
mova m4, m0
mova m2, m0
pmaddwd m4, [pw_ang_table + 18 * 16]
pmaddwd m0, [pw_ang_table + 9 * 16]
packssdw m0, m4
paddw m0, [pw_16]
psraw m0, 5
punpcklbw m3, m1
pmaddwd m3, [pw_ang_table + 4 * 16]
pmaddwd m2, [pw_ang_table + 27 * 16]
packssdw m2, m3
paddw m2, [pw_16]
psraw m2, 5
packuswb m0, m2
STORE_4x4
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_30(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 30 (vertical family).
; Rows 0-1 use the base pair vector, rows 2-3 the vector shifted by one
; reference sample; weights from the mode-30 entries of pw_ang_table.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_30, 3,3,4
movh m2, [r2 + 1] ;[8 7 6 5 4 3 2 1] above row
punpcklbw m2, m2
psrldq m2, 1
movh m0, m2 ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
psrldq m2, 2 ;[x x x 8 8 7 7 6 6 5 5 4 4 3 3 2] shifted by one sample
pxor m1, m1
punpcklbw m0, m1 ; widen to words
mova m3, m0
pmaddwd m3, [pw_ang_table + 26 * 16]
pmaddwd m0, [pw_ang_table + 13 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
punpcklbw m2, m1
mova m3, m2
pmaddwd m3, [pw_ang_table + 20 * 16]
pmaddwd m2, [pw_ang_table + 7 * 16]
packssdw m2, m3
paddw m2, [pw_16]
psraw m2, 5
packuswb m0, m2
STORE_4x4
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_31(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 31 (vertical family).
; Uses three progressively shifted pair vectors (m0 base, m2 +1 sample,
; m3 +2 samples) blended with the mode-31 weight set.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_31, 3,3,5
movh m3, [r2 + 1] ;[8 7 6 5 4 3 2 1] above row
punpcklbw m3, m3
psrldq m3, 1
mova m0, m3 ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
psrldq m3, 2
mova m2, m3 ;[x x x x x x x x 6 5 5 4 4 3 3 2]
psrldq m3, 2 ;[x x x x x x x x 7 6 6 5 5 4 4 3]
pxor m1, m1
punpcklbw m2, m1
mova m4, m2
pmaddwd m4, [pw_ang_table + 2 * 16]
punpcklbw m0, m1
pmaddwd m0, [pw_ang_table + 17 * 16]
packssdw m0, m4
paddw m0, [pw_16]
psraw m0, 5
punpcklbw m3, m1
pmaddwd m3, [pw_ang_table + 4 * 16]
pmaddwd m2, [pw_ang_table + 19 * 16]
packssdw m2, m3
paddw m2, [pw_16]
psraw m2, 5
packuswb m0, m2
STORE_4x4
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_32(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 32 (vertical family).
; Three progressively shifted pair vectors (m0 base, m2 +1, m1 +2 samples)
; blended with the mode-32 weight set, (+16)>>5 rounding.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_32, 3,3,5
movh m1, [r2 + 1] ;[8 7 6 5 4 3 2 1] above row
punpcklbw m1, m1
psrldq m1, 1
movh m0, m1 ;[x x x x x x x x 5 4 4 3 3 2 2 1]
psrldq m1, 2
movh m2, m1 ;[x x x x x x x x 6 5 5 4 4 3 3 2]
psrldq m1, 2 ;[x x x x x x x x 7 6 6 5 5 4 4 3]
pxor m4, m4
punpcklbw m2, m4
mova m3, m2
pmaddwd m3, [pw_ang_table + 10 * 16]
punpcklbw m0, m4
pmaddwd m0, [pw_ang_table + 21 * 16]
packssdw m0, m3
paddw m0, [pw_16]
psraw m0, 5
punpcklbw m1, m4
pmaddwd m1, [pw_ang_table + 20 * 16]
pmaddwd m2, [pw_ang_table + 31 * 16]
packssdw m2, m1
paddw m2, [pw_16]
psraw m2, 5
packuswb m0, m2
STORE_4x4
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_33(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 angular intra prediction, mode 33 (vertical family, near-diagonal).
; Every row advances one reference sample: four shifted pair vectors
; (m0, m1, m2, m3) blended with the mode-33 weight set.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_33, 3,3,5
movh m3, [r2 + 1] ; [8 7 6 5 4 3 2 1] above row
punpcklbw m3, m3
psrldq m3, 1
movh m0, m3 ;[x x x x x x x x 5 4 4 3 3 2 2 1]
psrldq m3, 2
movh m1, m3 ;[x x x x x x x x 6 5 5 4 4 3 3 2]
psrldq m3, 2
movh m2, m3 ;[x x x x x x x x 7 6 6 5 5 4 4 3]
psrldq m3, 2 ;[x x x x x x x x 8 7 7 6 6 5 5 4]
pxor m4, m4
punpcklbw m1, m4
pmaddwd m1, [pw_ang_table + 20 * 16]
punpcklbw m0, m4
pmaddwd m0, [pw_ang_table + 26 * 16]
packssdw m0, m1
paddw m0, [pw_16]
psraw m0, 5
punpcklbw m3, m4
pmaddwd m3, [pw_ang_table + 8 * 16]
punpcklbw m2, m4
pmaddwd m2, [pw_ang_table + 14 * 16]
packssdw m2, m3
paddw m2, [pw_16]
psraw m2, 5
packuswb m0, m2
STORE_4x4
RET
;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
INIT_XMM sse4
;-----------------------------------------------------------------------------
; void intra_pred_dc4(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 4x4 DC prediction: dc = (sum(above[0..3]) + sum(left[0..3]) + 4) >> 3,
; broadcast to the block. If bFilter (r4d), the first row/column and the
; top-left pixel are smoothed toward their neighbors.
; CAUTION: the flags from `test r4d, r4d` must survive down to `jz .end` —
; every instruction in between (SIMD ops, mov/movd, lea) leaves EFLAGS
; untouched. Do not insert flag-writing instructions in that window.
;-----------------------------------------------------------------------------
cglobal intra_pred_dc4, 5,5,3
inc r2 ; r2 -> above[0]
pxor m0, m0
movd m1, [r2] ; above[0..3]
movd m2, [r2 + 8] ; left[0..3]
punpckldq m1, m2
psadbw m1, m0 ; m1 = sum
test r4d, r4d ; set ZF = !bFilter (consumed by jz below)
pmulhrsw m1, [pw_4096] ; m1 = (sum + 4) / 8
movd r4d, m1 ; r4d = dc_val
pshufb m1, m0 ; m1 = byte [dc_val ...]
; store DC 4x4
lea r3, [r1 * 3]
movd [r0], m1
movd [r0 + r1], m1
movd [r0 + r1 * 2], m1
movd [r0 + r3], m1
; do DC filter
jz .end ; ZF still valid from the `test` above
lea r3d, [r4d * 2 + 2] ; r3d = DC * 2 + 2
add r4d, r3d ; r4d = DC * 3 + 2
movd m1, r4d
pshuflw m1, m1, 0 ; m1 = pixDCx3
pshufd m1, m1, 0
; filter top: (above[x] + 3*DC + 2) >> 2
movd m2, [r2]
movd m0, [r2 + 9]
punpckldq m2, m0
pmovzxbw m2, m2
paddw m2, m1
psraw m2, 2
packuswb m2, m2
movd [r0], m2 ; overwrite top-left pixel, we will update it later
; filter top-left: (above[0] + left[0] + 2*DC + 2) >> 2
movzx r4d, byte [r2 + 8]
add r3d, r4d
movzx r4d, byte [r2]
add r3d, r4d
shr r3d, 2
mov [r0], r3b
; filter left (rows 1..3; byte lanes 4..6 hold the left-filter results)
add r0, r1
pextrb [r0], m2, 4
pextrb [r0 + r1], m2, 5
pextrb [r0 + r1 * 2], m2, 6
.end:
RET
;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
INIT_XMM sse4
;-----------------------------------------------------------------------------
; void intra_pred_dc8(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 8x8 DC prediction: dc = (sum(above[0..7]) + sum(left[0..7]) + 8) >> 4,
; broadcast to the block; optional boundary smoothing when bFilter is set.
; As in dc4, the flags from `test r4d, r4d` must survive the SIMD stores
; down to `jz .end` (none of the intervening instructions write EFLAGS).
;-----------------------------------------------------------------------------
cglobal intra_pred_dc8, 5, 7, 3
lea r3, [r2 + 17] ; r3 -> left[0]
inc r2 ; r2 -> above[0]
pxor m0, m0
movh m1, [r2]
movh m2, [r3]
punpcklqdq m1, m2
psadbw m1, m0 ; two partial sums (lo/hi qword)
pshufd m2, m1, 2
paddw m1, m2 ; fold hi partial sum into lo
movd r5d, m1
add r5d, 8
shr r5d, 4 ; sum = sum / 16
movd m1, r5d
pshufb m1, m0 ; m1 = byte [dc_val ...]
test r4d, r4d ; ZF = !bFilter, consumed by jz below
; store DC 8x8
mov r6, r0 ; keep original dst for the filter pass
movh [r0], m1
movh [r0 + r1], m1
lea r0, [r0 + r1 * 2]
movh [r0], m1
movh [r0 + r1], m1
lea r0, [r0 + r1 * 2]
movh [r0], m1
movh [r0 + r1], m1
lea r0, [r0 + r1 * 2]
movh [r0], m1
movh [r0 + r1], m1
; Do DC Filter
jz .end ; ZF still valid from the `test` above
lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
add r5d, r4d ; r5d = DC * 3 + 2
movd m1, r5d
pshuflw m1, m1, 0 ; m1 = pixDCx3
pshufd m1, m1, 0
; filter top row: (above[x] + 3*DC + 2) >> 2
pmovzxbw m2, [r2]
paddw m2, m1
psraw m2, 2
packuswb m2, m2
movh [r6], m2
; filter top-left: (above[0] + left[0] + 2*DC + 2) >> 2
movzx r5d, byte [r3]
add r4d, r5d
movzx r3d, byte [r2]
add r3d, r4d
shr r3d, 2
mov [r6], r3b
; filter left column, rows 1..7 (row 0 was set by the top-left filter)
add r6, r1
pmovzxbw m2, [r2 + 17]
paddw m2, m1
psraw m2, 2
packuswb m2, m2
pextrb [r6], m2, 0
pextrb [r6 + r1], m2, 1
pextrb [r6 + 2 * r1], m2, 2
lea r6, [r6 + r1 * 2]
pextrb [r6 + r1], m2, 3
pextrb [r6 + r1 * 2], m2, 4
pextrb [r6 + r1 * 4], m2, 6 ; row 7 (stored out of order; row 6 below)
lea r1, [r1 * 3]
pextrb [r6 + r1], m2, 5 ; row 6
.end:
RET
;--------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;--------------------------------------------------------------------------------------------
INIT_XMM sse4
;-----------------------------------------------------------------------------
; void intra_pred_dc16(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 16x16 DC prediction: dc = (sum(above[0..15]) + sum(left[0..15]) + 16) >> 5,
; broadcast to the block; optional boundary smoothing when bFilter is set.
; Flags from `test r4d, r4d` survive the SIMD store sequence to `jz .end`.
;-----------------------------------------------------------------------------
cglobal intra_pred_dc16, 5, 7, 4
lea r3, [r2 + 33] ; r3 -> left[0]
inc r2 ; r2 -> above[0]
pxor m0, m0
movu m1, [r2]
movu m2, [r3]
psadbw m1, m0
psadbw m2, m0
paddw m1, m2
pshufd m2, m1, 2
paddw m1, m2 ; fold hi qword partial sum
movd r5d, m1
add r5d, 16
shr r5d, 5 ; sum = sum / 32
movd m1, r5d
pshufb m1, m0 ; m1 = byte [dc_val ...]
test r4d, r4d ; ZF = !bFilter, consumed by jz below
; store DC 16x16
mov r6, r0 ; keep original dst for the filter pass
movu [r0], m1
movu [r0 + r1], m1
lea r0, [r0 + r1 * 2]
movu [r0], m1
movu [r0 + r1], m1
lea r0, [r0 + r1 * 2]
movu [r0], m1
movu [r0 + r1], m1
lea r0, [r0 + r1 * 2]
movu [r0], m1
movu [r0 + r1], m1
lea r0, [r0 + r1 * 2]
movu [r0], m1
movu [r0 + r1], m1
lea r0, [r0 + r1 * 2]
movu [r0], m1
movu [r0 + r1], m1
lea r0, [r0 + r1 * 2]
movu [r0], m1
movu [r0 + r1], m1
lea r0, [r0 + r1 * 2]
movu [r0], m1
movu [r0 + r1], m1
; Do DC Filter
jz .end ; ZF still valid from the `test` above
lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
add r5d, r4d ; r5d = DC * 3 + 2
movd m1, r5d
pshuflw m1, m1, 0 ; m1 = pixDCx3
pshufd m1, m1, 0
; filter top row, two 8-pixel halves: (above[x] + 3*DC + 2) >> 2
pmovzxbw m2, [r2]
paddw m2, m1
psraw m2, 2
packuswb m2, m2
movh [r6], m2
pmovzxbw m3, [r2 + 8]
paddw m3, m1
psraw m3, 2
packuswb m3, m3
movh [r6 + 8], m3
; filter top-left: (above[0] + left[0] + 2*DC + 2) >> 2
movzx r5d, byte [r3]
add r4d, r5d
movzx r3d, byte [r2]
add r3d, r4d
shr r3d, 2
mov [r6], r3b
; filter left column, rows 1..15 (two 8-pixel halves)
add r6, r1
pmovzxbw m2, [r2 + 33]
paddw m2, m1
psraw m2, 2
packuswb m2, m2
pextrb [r6], m2, 0
pextrb [r6 + r1], m2, 1
pextrb [r6 + r1 * 2], m2, 2
lea r6, [r6 + r1 * 2]
pextrb [r6 + r1], m2, 3
pextrb [r6 + r1 * 2], m2, 4
lea r6, [r6 + r1 * 2]
pextrb [r6 + r1], m2, 5
pextrb [r6 + r1 * 2], m2, 6
lea r6, [r6 + r1 * 2]
pextrb [r6 + r1], m2, 7
pmovzxbw m3, [r2 + 41] ; left[8..15]
paddw m3, m1
psraw m3, 2
packuswb m3, m3
pextrb [r6 + r1 * 2], m3, 0
lea r6, [r6 + r1 * 2]
pextrb [r6 + r1], m3, 1
pextrb [r6 + r1 * 2], m3, 2
lea r6, [r6 + r1 * 2]
pextrb [r6 + r1], m3, 3
pextrb [r6 + r1 * 2], m3, 4
lea r6, [r6 + r1 * 2]
pextrb [r6 + r1], m3, 5
pextrb [r6 + r1 * 2], m3, 6
.end:
RET
;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
INIT_XMM sse4
;-----------------------------------------------------------------------------
; void intra_pred_dc32(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 32x32 DC prediction (SSE4): dc = (sum(above) + sum(left) + 32) >> 6,
; broadcast to the block. No boundary filter at this size (HEVC applies the
; DC filter only for blocks < 32), hence only 3 declared args.
;-----------------------------------------------------------------------------
cglobal intra_pred_dc32, 3, 5, 5
lea r3, [r2 + 65] ; r3 -> left[0]
inc r2 ; r2 -> above[0]
pxor m0, m0
movu m1, [r2]
movu m2, [r2 + 16]
movu m3, [r3]
movu m4, [r3 + 16]
psadbw m1, m0
psadbw m2, m0
psadbw m3, m0
psadbw m4, m0
paddw m1, m2
paddw m3, m4
paddw m1, m3
pshufd m2, m1, 2
paddw m1, m2 ; fold hi qword partial sum
movd r4d, m1
add r4d, 32
shr r4d, 6 ; sum = sum / 64
movd m1, r4d
pshufb m1, m0 ; m1 = byte [dc_val ...]
%rep 2
; store DC 32x16 (two 16-byte halves per row, 16 rows per %rep iteration)
movu [r0], m1
movu [r0 + r1], m1
movu [r0 + 16], m1
movu [r0 + r1 + 16],m1
lea r0, [r0 + 2 * r1]
movu [r0], m1
movu [r0 + r1], m1
movu [r0 + 16], m1
movu [r0 + r1 + 16],m1
lea r0, [r0 + 2 * r1]
movu [r0], m1
movu [r0 + r1], m1
movu [r0 + 16], m1
movu [r0 + r1 + 16],m1
lea r0, [r0 + 2 * r1]
movu [r0], m1
movu [r0 + r1], m1
movu [r0 + 16], m1
movu [r0 + r1 + 16],m1
lea r0, [r0 + 2 * r1]
movu [r0], m1
movu [r0 + r1], m1
movu [r0 + 16], m1
movu [r0 + r1 + 16],m1
lea r0, [r0 + 2 * r1]
movu [r0], m1
movu [r0 + r1], m1
movu [r0 + 16], m1
movu [r0 + r1 + 16],m1
lea r0, [r0 + 2 * r1]
movu [r0], m1
movu [r0 + r1], m1
movu [r0 + 16], m1
movu [r0 + r1 + 16],m1
lea r0, [r0 + 2 * r1]
movu [r0], m1
movu [r0 + r1], m1
movu [r0 + 16], m1
movu [r0 + r1 + 16],m1
lea r0, [r0 + 2 * r1]
%endrep
RET
;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
%if ARCH_X86_64 == 1
INIT_YMM avx2
;-----------------------------------------------------------------------------
; void intra_pred_dc32(pixel* dst, intptr_t dstStride, pixel* srcPix, int dirMode, int bFilter)
; 32x32 DC prediction (AVX2, x86-64 only): one 32-byte psadbw per reference
; edge, horizontal fold across lanes, (sum + 32) >> 6 via pmulhrsw with
; pw_512, then 32 full-width row stores. No DC filter at this block size.
;-----------------------------------------------------------------------------
cglobal intra_pred_dc32, 3, 4, 3
lea r3, [r1 * 3]
pxor m0, m0
movu m1, [r2 + 1] ; above[0..31]
movu m2, [r2 + 65] ; left[0..31]
psadbw m1, m0
psadbw m2, m0
paddw m1, m2
vextracti128 xm2, m1, 1 ; fold upper lane
paddw m1, m2
pshufd m2, m1, 2 ; fold upper qword
paddw m1, m2
pmulhrsw m1, [pw_512] ; sum = (sum + 32) / 64
vpbroadcastb m1, xm1 ; m1 = byte [dc_val ...]
movu [r0 + r1 * 0], m1
movu [r0 + r1 * 1], m1
movu [r0 + r1 * 2], m1
movu [r0 + r3 * 1], m1
lea r0, [r0 + 4 * r1]
movu [r0 + r1 * 0], m1
movu [r0 + r1 * 1], m1
movu [r0 + r1 * 2], m1
movu [r0 + r3 * 1], m1
lea r0, [r0 + 4 * r1]
movu [r0 + r1 * 0], m1
movu [r0 + r1 * 1], m1
movu [r0 + r1 * 2], m1
movu [r0 + r3 * 1], m1
lea r0, [r0 + 4 * r1]
movu [r0 + r1 * 0], m1
movu [r0 + r1 * 1], m1
movu [r0 + r1 * 2], m1
movu [r0 + r3 * 1], m1
lea r0, [r0 + 4 * r1]
movu [r0 + r1 * 0], m1
movu [r0 + r1 * 1], m1
movu [r0 + r1 * 2], m1
movu [r0 + r3 * 1], m1
lea r0, [r0 + 4 * r1]
movu [r0 + r1 * 0], m1
movu [r0 + r1 * 1], m1
movu [r0 + r1 * 2], m1
movu [r0 + r3 * 1], m1
lea r0, [r0 + 4 * r1]
movu [r0 + r1 * 0], m1
movu [r0 + r1 * 1], m1
movu [r0 + r1 * 2], m1
movu [r0 + r3 * 1], m1
lea r0, [r0 + 4 * r1]
movu [r0 + r1 * 0], m1
movu [r0 + r1 * 1], m1
movu [r0 + r1 * 2], m1
movu [r0 + r3 * 1], m1
RET
%endif ;; ARCH_X86_64 == 1
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_XMM sse4
;-----------------------------------------------------------------------------
; void intra_pred_planar4(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int)
; 4x4 planar prediction (HEVC): each pixel is a bilinear blend of the above
; row, left column, top-right and bottom-left samples, rounded by (+4)>>3.
; m3 accumulates the per-row horizontal term and is advanced by m4
; (bottomLeft - above[x]) after each row; the vertical term is
; left[y] * pw_planar4_0 added per row.
;-----------------------------------------------------------------------------
cglobal intra_pred_planar4, 3,3,7
pmovzxbw m1, [r2 + 1] ; above[0..7] as words
pmovzxbw m2, [r2 + 9] ; left[0..7] as words
pshufhw m3, m1, 0 ; topRight
pshufd m3, m3, 0xAA
pshufhw m4, m2, 0 ; bottomLeft
pshufd m4, m4, 0xAA
pmullw m3, [multi_2Row] ; (x + 1) * topRight
pmullw m0, m1, [pw_3] ; (blkSize - 1 - y) * above[x]
mova m6, [pw_planar4_0]
paddw m3, [pw_4] ; rounding constant
paddw m3, m4
paddw m3, m0 ; m3 = row-0 accumulator
psubw m4, m1 ; m4 = per-row increment (bottomLeft - above[x])
pshuflw m5, m2, 0 ; broadcast left[0]
pmullw m5, m6
paddw m5, m3
paddw m3, m4 ; advance accumulator to next row
psraw m5, 3
packuswb m5, m5
movd [r0], m5
pshuflw m5, m2, 01010101b ; broadcast left[1]
pmullw m5, m6
paddw m5, m3
paddw m3, m4
psraw m5, 3
packuswb m5, m5
movd [r0 + r1], m5
lea r0, [r0 + 2 * r1]
pshuflw m5, m2, 10101010b ; broadcast left[2]
pmullw m5, m6
paddw m5, m3
paddw m3, m4
psraw m5, 3
packuswb m5, m5
movd [r0], m5
pshuflw m5, m2, 11111111b ; broadcast left[3]
pmullw m5, m6
paddw m5, m3
paddw m3, m4
psraw m5, 3
packuswb m5, m5
movd [r0 + r1], m5
RET
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_XMM sse4
;-----------------------------------------------------------------------------
; void intra_pred_planar8(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int)
; 8x8 planar prediction. m3 holds the running row accumulator
; ((x+1)*topRight + bottomLeft + (7-y)*above[x] + 8), advanced each row by
; m4 = bottomLeft - above[x]; left[y] * multiplier (second half of
; pw_planar16_mul) is added per row, then (>> 4) and store.
; The INTRA_PRED_PLANAR8 macro emits one row; %1 in 0..7 selects which
; left[y] word of m2 to broadcast.
;-----------------------------------------------------------------------------
cglobal intra_pred_planar8, 3,3,7
pmovzxbw m1, [r2 + 1] ; above[0..7] as words
pmovzxbw m2, [r2 + 17] ; left[0..7] as words
movd m3, [r2 + 9] ; topRight = above[8];
movd m4, [r2 + 25] ; bottomLeft = left[8];
pxor m0, m0
pshufb m3, m0
pshufb m4, m0
punpcklbw m3, m0 ; v_topRight
punpcklbw m4, m0 ; v_bottomLeft
pmullw m3, [multiL] ; (x + 1) * topRight
pmullw m0, m1, [pw_7] ; (blkSize - 1 - y) * above[x]
mova m6, [pw_planar16_mul + mmsize] ; (blkSize - 1 - x) weights
paddw m3, [pw_8] ; rounding constant
paddw m3, m4
paddw m3, m0 ; m3 = row-0 accumulator
psubw m4, m1 ; m4 = per-row increment
%macro INTRA_PRED_PLANAR8 1
%if (%1 < 4)
pshuflw m5, m2, 0x55 * %1
pshufd m5, m5, 0
%else
pshufhw m5, m2, 0x55 * (%1 - 4)
pshufd m5, m5, 0xAA
%endif
pmullw m5, m6
paddw m5, m3
paddw m3, m4
psraw m5, 4
packuswb m5, m5
movh [r0], m5
lea r0, [r0 + r1]
%endmacro
INTRA_PRED_PLANAR8 0
INTRA_PRED_PLANAR8 1
INTRA_PRED_PLANAR8 2
INTRA_PRED_PLANAR8 3
INTRA_PRED_PLANAR8 4
INTRA_PRED_PLANAR8 5
INTRA_PRED_PLANAR8 6
INTRA_PRED_PLANAR8 7
RET
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_XMM sse4
;-----------------------------------------------------------------------------
; void intra_pred_planar16(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int)
; 16x16 planar prediction (SSE4). Two 8-wide word accumulators: m3 (low
; half, x = 0..7) and m4 (high half, x = 8..15); per-row increments m6 and
; m1 = bottomLeft - above[x]. The INTRA_PRED_PLANAR16 macro emits one row;
; %1 in 0..15 selects which left[y] word of m2/m7 to broadcast.
;-----------------------------------------------------------------------------
cglobal intra_pred_planar16, 3,3,8
pmovzxbw m2, [r2 + 1] ; above[0..7]
pmovzxbw m7, [r2 + 9] ; above[8..15]
movd m3, [r2 + 17] ; topRight = above[16]
movd m6, [r2 + 49] ; bottomLeft = left[16]
pxor m0, m0
pshufb m3, m0
pshufb m6, m0
punpcklbw m3, m0 ; v_topRight
punpcklbw m6, m0 ; v_bottomLeft
pmullw m4, m3, [multiH] ; (x + 1) * topRight
pmullw m3, [multiL] ; (x + 1) * topRight
pmullw m1, m2, [pw_15] ; (blkSize - 1 - y) * above[x]
pmullw m5, m7, [pw_15] ; (blkSize - 1 - y) * above[x]
paddw m4, [pw_16] ; rounding constants
paddw m3, [pw_16]
paddw m4, m6
paddw m3, m6
paddw m4, m5 ; m4 = row-0 accumulator, high half
paddw m3, m1 ; m3 = row-0 accumulator, low half
psubw m1, m6, m7 ; m1 = per-row increment, high half
psubw m6, m2 ; m6 = per-row increment, low half
pmovzxbw m2, [r2 + 33] ; left[0..7] (above regs no longer needed)
pmovzxbw m7, [r2 + 41] ; left[8..15]
%macro INTRA_PRED_PLANAR16 1
%if (%1 < 4)
pshuflw m5, m2, 0x55 * %1
pshufd m5, m5, 0
%else
%if (%1 < 8)
pshufhw m5, m2, 0x55 * (%1 - 4)
pshufd m5, m5, 0xAA
%else
%if (%1 < 12)
pshuflw m5, m7, 0x55 * (%1 - 8)
pshufd m5, m5, 0
%else
pshufhw m5, m7, 0x55 * (%1 - 12)
pshufd m5, m5, 0xAA
%endif
%endif
%endif
pmullw m0, m5, [pw_planar16_mul + mmsize] ; left[y] * (15 - x), x = 8..15
pmullw m5, [pw_planar16_mul] ; left[y] * (15 - x), x = 0..7
paddw m0, m4
paddw m5, m3
paddw m3, m6 ; advance accumulators to next row
paddw m4, m1
psraw m5, 5
psraw m0, 5
packuswb m5, m0
movu [r0], m5
lea r0, [r0 + r1]
%endmacro
INTRA_PRED_PLANAR16 0
INTRA_PRED_PLANAR16 1
INTRA_PRED_PLANAR16 2
INTRA_PRED_PLANAR16 3
INTRA_PRED_PLANAR16 4
INTRA_PRED_PLANAR16 5
INTRA_PRED_PLANAR16 6
INTRA_PRED_PLANAR16 7
INTRA_PRED_PLANAR16 8
INTRA_PRED_PLANAR16 9
INTRA_PRED_PLANAR16 10
INTRA_PRED_PLANAR16 11
INTRA_PRED_PLANAR16 12
INTRA_PRED_PLANAR16 13
INTRA_PRED_PLANAR16 14
INTRA_PRED_PLANAR16 15
RET
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_YMM avx2
;-----------------------------------------------------------------------------
; void intra_pred_planar16(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int)
; 16x16 planar prediction (AVX2): one 16-word accumulator m3, advanced by
; m4 = bottomLeft - above[x] per row. Each macro invocation produces TWO
; rows: vpbroadcastw fetches left[y] and left[y+1] as one word; the low
; byte (even row) is masked with pw_00ff, the high byte (odd row) obtained
; by vpsrlw 8.
;-----------------------------------------------------------------------------
cglobal intra_pred_planar16, 3,3,6
vpbroadcastw m3, [r2 + 17] ; topRight in every word (byte-pair)
mova m5, [pw_00ff]
vpbroadcastw m4, [r2 + 49] ; bottomLeft in every word (byte-pair)
mova m0, [pw_planar16_mul]
pmovzxbw m2, [r2 + 1] ; above[0..15]
pand m3, m5 ; v_topRight
pand m4, m5 ; v_bottomLeft
pmullw m3, [multiL] ; (x + 1) * topRight
pmullw m1, m2, [pw_15] ; (blkSize - 1 - y) * above[x]
paddw m3, [pw_16] ; rounding constant
paddw m3, m4
paddw m3, m1 ; m3 = row-0 accumulator
psubw m4, m2 ; m4 = per-row increment
add r2, 33 ; r2 -> left[0]
%macro INTRA_PRED_PLANAR16_AVX2 1
vpbroadcastw m1, [r2 + %1] ; left[y] (lo byte) and left[y+1] (hi byte)
vpsrlw m2, m1, 8 ; left[y+1]
pand m1, m5 ; left[y]
pmullw m1, m0
pmullw m2, m0
paddw m1, m3
paddw m3, m4 ; advance to row y+1
psraw m1, 5
paddw m2, m3
psraw m2, 5
paddw m3, m4 ; advance to row y+2
packuswb m1, m2
vpermq m1, m1, 11011000b ; undo in-lane pack interleave
movu [r0], xm1
vextracti128 [r0 + r1], m1, 1
lea r0, [r0 + r1 * 2]
%endmacro
INTRA_PRED_PLANAR16_AVX2 0
INTRA_PRED_PLANAR16_AVX2 2
INTRA_PRED_PLANAR16_AVX2 4
INTRA_PRED_PLANAR16_AVX2 6
INTRA_PRED_PLANAR16_AVX2 8
INTRA_PRED_PLANAR16_AVX2 10
INTRA_PRED_PLANAR16_AVX2 12
INTRA_PRED_PLANAR16_AVX2 14
%undef INTRA_PRED_PLANAR16_AVX2
RET
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_XMM sse4
;-----------------------------------------------------------------------------
; void intra_pred_planar32(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int)
; 32x32 planar prediction (SSE4). Four 8-wide accumulators m0..m3 cover
; x = 0..31; their per-row increments (bottomLeft - above[x]) live in
; m8..m11 — real registers on x86-64, stack spill slots on 32-bit (hence
; the %if/%else around the cglobal header).
;-----------------------------------------------------------------------------
%if ARCH_X86_64 == 1
cglobal intra_pred_planar32, 3,4,12
%else
cglobal intra_pred_planar32, 3,4,8,0-(4*mmsize)
%define m8 [rsp + 0 * mmsize]
%define m9 [rsp + 1 * mmsize]
%define m10 [rsp + 2 * mmsize]
%define m11 [rsp + 3 * mmsize]
%endif
movd m3, [r2 + 33] ; topRight = above[32]
pxor m7, m7
pshufb m3, m7
punpcklbw m3, m7 ; v_topRight
pmullw m0, m3, [multiL] ; (x + 1) * topRight
pmullw m1, m3, [multiH] ; (x + 1) * topRight
pmullw m2, m3, [multiH2] ; (x + 1) * topRight
pmullw m3, [multiH3] ; (x + 1) * topRight
movd m6, [r2 + 97] ; bottomLeft = left[32]
pshufb m6, m7
punpcklbw m6, m7 ; v_bottomLeft
paddw m0, m6
paddw m1, m6
paddw m2, m6
paddw m3, m6
paddw m0, [pw_32] ; rounding constants
paddw m1, [pw_32]
paddw m2, [pw_32]
paddw m3, [pw_32]
pmovzxbw m4, [r2 + 1] ; above[0..7]
pmullw m5, m4, [pw_31] ; (blkSize - 1 - y) * above[x]
paddw m0, m5
psubw m5, m6, m4
mova m8, m5 ; increment for x = 0..7
pmovzxbw m4, [r2 + 9] ; above[8..15]
pmullw m5, m4, [pw_31]
paddw m1, m5
psubw m5, m6, m4
mova m9, m5 ; increment for x = 8..15
pmovzxbw m4, [r2 + 17] ; above[16..23]
pmullw m5, m4, [pw_31]
paddw m2, m5
psubw m5, m6, m4
mova m10, m5 ; increment for x = 16..23
pmovzxbw m4, [r2 + 25] ; above[24..31]
pmullw m5, m4, [pw_31]
paddw m3, m5
psubw m5, m6, m4
mova m11, m5 ; increment for x = 24..31
add r2, 65 ; (2 * blkSize + 1): r2 -> left[0]
; Emit one output row (32 pixels); advances r0 by stride and r2 to the
; next left sample. pw_planar32_mul covers x = 0..15, pw_planar16_mul
; covers x = 16..31.
%macro INTRA_PRED_PLANAR32 0
movd m4, [r2]
pshufb m4, m7
punpcklbw m4, m7 ; broadcast left[y] as words
pmullw m5, m4, [pw_planar32_mul]
pmullw m6, m4, [pw_planar32_mul + mmsize]
paddw m5, m0
paddw m6, m1
paddw m0, m8 ; advance accumulators
paddw m1, m9
psraw m5, 6
psraw m6, 6
packuswb m5, m6
movu [r0], m5
pmullw m5, m4, [pw_planar16_mul]
pmullw m4, [pw_planar16_mul + mmsize]
paddw m5, m2
paddw m4, m3
paddw m2, m10
paddw m3, m11
psraw m5, 6
psraw m4, 6
packuswb m5, m4
movu [r0 + 16], m5
lea r0, [r0 + r1]
inc r2
%endmacro
mov r3, 4 ; 4 iterations x 8 unrolled rows = 32 rows
.loop:
INTRA_PRED_PLANAR32
INTRA_PRED_PLANAR32
INTRA_PRED_PLANAR32
INTRA_PRED_PLANAR32
INTRA_PRED_PLANAR32
INTRA_PRED_PLANAR32
INTRA_PRED_PLANAR32
INTRA_PRED_PLANAR32
dec r3
jnz .loop
RET
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
%if ARCH_X86_64 == 1
INIT_YMM avx2
;-----------------------------------------------------------------------------
; void intra_pred_planar32(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int)
; 32x32 planar prediction (AVX2, x86-64 only). Two 16-word accumulators
; m0 (x = 0..15) / m3 (x = 16..31) with per-row increments m1 / m2. Each
; macro invocation emits TWO rows: vpbroadcastw fetches left[y] (low byte,
; masked with pw_00ff) and left[y+1] (high byte, via vpsrlw 8) at once.
;-----------------------------------------------------------------------------
cglobal intra_pred_planar32, 3,4,11
mova m6, [pw_00ff]
vpbroadcastw m3, [r2 + 33] ; topRight = above[32]
vpbroadcastw m2, [r2 + 97] ; bottomLeft = left[32]
pand m3, m6
pand m2, m6
pmullw m0, m3, [multiL] ; (x + 1) * topRight
pmullw m3, [multiH2] ; (x + 1) * topRight
paddw m0, m2
paddw m3, m2
paddw m0, [pw_32] ; rounding constants
paddw m3, [pw_32]
pmovzxbw m4, [r2 + 1] ; above[0..15]
pmovzxbw m1, [r2 + 17] ; above[16..31]
pmullw m5, m4, [pw_31]
paddw m0, m5
psubw m5, m2, m4 ; increment for x = 0..15
psubw m2, m1 ; increment for x = 16..31 (stays in m2)
pmullw m1, [pw_31]
paddw m3, m1
mova m1, m5 ; increment for x = 0..15 -> m1
add r2, 65 ; (2 * blkSize + 1): r2 -> left[0]
mova m9, [pw_planar32_mul] ; (31 - x) weights, x = 0..15
mova m10, [pw_planar16_mul] ; (31 - x) weights, x = 16..31
%macro INTRA_PRED_PLANAR32_AVX2 0
vpbroadcastw m4, [r2] ; left[y] (lo byte) / left[y+1] (hi byte)
vpsrlw m7, m4, 8 ; left[y+1]
pand m4, m6 ; left[y]
pmullw m5, m4, m9
pmullw m4, m4, m10
paddw m5, m0
paddw m4, m3
paddw m0, m1 ; advance to row y+1
paddw m3, m2
psraw m5, 6
psraw m4, 6
packuswb m5, m4
pmullw m8, m7, m9
pmullw m7, m7, m10
vpermq m5, m5, 11011000b ; undo in-lane pack interleave
paddw m8, m0
paddw m7, m3
paddw m0, m1 ; advance to row y+2
paddw m3, m2
psraw m8, 6
psraw m7, 6
packuswb m8, m7
add r2, 2
vpermq m8, m8, 11011000b
movu [r0], m5
movu [r0 + r1], m8
lea r0, [r0 + r1 * 2]
%endmacro
INTRA_PRED_PLANAR32_AVX2
INTRA_PRED_PLANAR32_AVX2
INTRA_PRED_PLANAR32_AVX2
INTRA_PRED_PLANAR32_AVX2
INTRA_PRED_PLANAR32_AVX2
INTRA_PRED_PLANAR32_AVX2
INTRA_PRED_PLANAR32_AVX2
INTRA_PRED_PLANAR32_AVX2
INTRA_PRED_PLANAR32_AVX2
INTRA_PRED_PLANAR32_AVX2
INTRA_PRED_PLANAR32_AVX2
INTRA_PRED_PLANAR32_AVX2
INTRA_PRED_PLANAR32_AVX2
INTRA_PRED_PLANAR32_AVX2
INTRA_PRED_PLANAR32_AVX2
INTRA_PRED_PLANAR32_AVX2
%undef INTRA_PRED_PLANAR32_AVX2
RET
%endif ;; ARCH_X86_64 == 1
;-----------------------------------------------------------------------------------------
; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
INIT_XMM ssse3
;-----------------------------------------------------------------------------
; void intra_pred_ang4_2(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
; 4x4 angular prediction, modes 2 and 34 (full 45-degree diagonals, no
; interpolation): each row is a 4-byte window of the reference, advancing
; one byte per row. Mode 34 (cmove) reads the above row at [r2 + 2];
; mode 2 reads the left column at [r2 + 10].
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_2, 3,5,3
lea r4, [r2 + 2] ; candidate source for mode 34 (above)
add r2, 10 ; default source for mode 2 (left)
cmp r3m, byte 34
cmove r2, r4 ; pick above row when dirMode == 34
movh m0, [r2]
movd [r0], m0
palignr m1, m0, 1
movd [r0 + r1], m1
palignr m2, m0, 2
movd [r0 + r1 * 2], m2
lea r1, [r1 * 3]
psrldq m0, 3
movd [r0 + r1], m0
RET
INIT_XMM sse4
;-----------------------------------------------------------------------------
; void intra_pred_ang4_3(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
; 4x4 angular prediction, modes 3 and 33 (they are mirrors and share
; coefficients [26 20 14 8]). The `cmp r3m, byte 33` selects the source
; (above row at offset 1 for mode 33, left column at offset 9 otherwise)
; AND leaves ZF for the shared tail: ZF set => mode 33 => no transpose.
; The .do_filter4x4 tail is also the jump target for modes 4..8 below,
; which set up their own m0/m2 pair vectors and m3/m4 coefficients.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_3, 3,5,5
mov r4, 1
cmp r3m, byte 33
mov r3, 9
cmove r3, r4 ; r3 = 1 (above) if mode 33, else 9 (left)
movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
palignr m2, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
palignr m3, m0, 6 ; [x x x x x x x x 8 7 7 6 6 5 5 4]
punpcklqdq m0, m1
punpcklqdq m2, m3
lea r3, [ang_table + 20 * 16]
movh m3, [r3 + 6 * 16] ; [26]
movhps m3, [r3] ; [20]
movh m4, [r3 - 6 * 16] ; [14]
movhps m4, [r3 - 12 * 16] ; [ 8]
jmp .do_filter4x4
; NOTE: share path, input is m0=[1 0], m2=[3 2], m3,m4=coef, flag_z=no_transpose
ALIGN 16
.do_filter4x4:
mova m1, [pw_1024] ; pmulhrsw by 1024 => (x + 16) >> 5 after pmaddubsw
pmaddubsw m0, m3
pmulhrsw m0, m1
pmaddubsw m2, m4
pmulhrsw m2, m1
packuswb m0, m2
; NOTE: mode 33 doesn't reorder (transpose). UNSAFE: this relies on ZF
; surviving from the `cmp` in the entry stub — no flag-writing
; instruction may be inserted between that cmp and this jz.
jz .store
; transpose 4x4
pshufb m0, [c_trans_4x4]
.store:
; TODO: use pextrd here after intrinsic ssse3 removed
movd [r0], m0
pextrd [r0 + r1], m0, 1
pextrd [r0 + r1 * 2], m0, 2
lea r1, [r1 * 3]
pextrd [r0 + r1], m0, 3
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang4_4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
; 4x4 angular prediction, modes 4 and 32 (mirrors; coefficients
; [21 10 31 20]). Sets up the pair vectors and coefficients, then tail-jumps
; into intra_pred_ang4_3's shared .do_filter4x4 — ZF from `cmp r3m, byte 32`
; must survive to the jz there (mode 32 => no transpose).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_4, 3,5,5
xor r4, r4
inc r4
cmp r3m, byte 32
mov r3, 9
cmove r3, r4 ; r3 = 1 (above) if mode 32, else 9 (left)
movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
punpcklqdq m0, m1
punpcklqdq m2, m1, m3
lea r3, [ang_table + 18 * 16]
movh m3, [r3 + 3 * 16] ; [21]
movhps m3, [r3 - 8 * 16] ; [10]
movh m4, [r3 + 13 * 16] ; [31]
movhps m4, [r3 + 2 * 16] ; [20]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
;; intra_pred_ang4_5: modes 5 and 31; shares ang4_3's .do_filter4x4 tail.
cglobal intra_pred_ang4_5, 3,5,5
xor r4, r4
inc r4
cmp r3m, byte 31
mov r3, 9
cmove r3, r4 ; source offset: 1 for mode 31, 9 for mode 5
movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
punpcklqdq m0, m1
punpcklqdq m2, m1, m3
lea r3, [ang_table + 10 * 16]
movh m3, [r3 + 7 * 16] ; [17]
movhps m3, [r3 - 8 * 16] ; [ 2]
movh m4, [r3 + 9 * 16] ; [19]
movhps m4, [r3 - 6 * 16] ; [ 4]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
;; intra_pred_ang4_6: modes 6 and 30; shares ang4_3's .do_filter4x4 tail.
cglobal intra_pred_ang4_6, 3,5,5
xor r4, r4
inc r4
cmp r3m, byte 30
mov r3, 9
cmove r3, r4 ; source offset: 1 for mode 30, 9 for mode 6
movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m2, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
punpcklqdq m0, m0
punpcklqdq m2, m2
lea r3, [ang_table + 19 * 16]
movh m3, [r3 - 6 * 16] ; [13]
movhps m3, [r3 + 7 * 16] ; [26]
movh m4, [r3 - 12 * 16] ; [ 7]
movhps m4, [r3 + 1 * 16] ; [20]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
;; intra_pred_ang4_7: modes 7 and 29; shares ang4_3's .do_filter4x4 tail.
cglobal intra_pred_ang4_7, 3,5,5
xor r4, r4
inc r4
cmp r3m, byte 29
mov r3, 9
cmove r3, r4 ; source offset: 1 for mode 29, 9 for mode 7
movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m3, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
punpcklqdq m2, m0, m3
punpcklqdq m0, m0
lea r3, [ang_table + 20 * 16]
movh m3, [r3 - 11 * 16] ; [ 9]
movhps m3, [r3 - 2 * 16] ; [18]
movh m4, [r3 + 7 * 16] ; [27]
movhps m4, [r3 - 16 * 16] ; [ 4]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
;; intra_pred_ang4_8: modes 8 and 28; all four rows use the same pixel pair,
;; only the coefficients differ.  Shares ang4_3's .do_filter4x4 tail.
cglobal intra_pred_ang4_8, 3,5,5
xor r4, r4
inc r4
cmp r3m, byte 28
mov r3, 9
cmove r3, r4 ; source offset: 1 for mode 28, 9 for mode 8
movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
punpcklqdq m0, m0
mova m2, m0
lea r3, [ang_table + 13 * 16]
movh m3, [r3 - 8 * 16] ; [ 5]
movhps m3, [r3 - 3 * 16] ; [10]
movh m4, [r3 + 2 * 16] ; [15]
movhps m4, [r3 + 7 * 16] ; [20]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
;; intra_pred_ang4_9: modes 9 and 27; same pixel pair for all rows, shallow
;; angle coefficients [2,4,6,8].  Shares ang4_3's .do_filter4x4 tail.
cglobal intra_pred_ang4_9, 3,5,5
xor r4, r4
inc r4
cmp r3m, byte 27
mov r3, 9
cmove r3, r4 ; source offset: 1 for mode 27, 9 for mode 9
movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
punpcklqdq m0, m0
mova m2, m0
lea r3, [ang_table + 4 * 16]
movh m3, [r3 - 2 * 16] ; [ 2]
movhps m3, [r3 - 0 * 16] ; [ 4]
movh m4, [r3 + 2 * 16] ; [ 6]
movhps m4, [r3 + 4 * 16] ; [ 8]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
;; intra_pred_ang4_10: pure horizontal mode (10).  Each output row is a
;; broadcast of one left-neighbour pixel; if bFilter (r4m) is set, the first
;; column is additionally filtered with the above-row gradient.
cglobal intra_pred_ang4_10, 3,3,4
movd m0, [r2 + 9] ; left neighbours [4 3 2 1]
pshufb m0, [pb_unpackbd1] ; broadcast each left pixel into its own dword
pshufd m1, m0, 1
movhlps m2, m0
pshufd m3, m0, 3
movd [r0 + r1], m1
movd [r0 + r1 * 2], m2
lea r1, [r1 * 3]
movd [r0 + r1], m3
cmp r4m, byte 0
jz .quit
; filter: row0 = left[0] + ((above[x] - topleft) >> 1)
pmovzxbw m0, m0 ; [-1 -1 -1 -1]
movh m1, [r2] ; [4 3 2 1 0]
pshufb m2, m1, [pb_0_8] ; [0 0 0 0] (top-left broadcast)
pshufb m1, [pb_unpackbw1] ; [4 3 2 1]
psubw m1, m2
psraw m1, 1
paddw m0, m1
packuswb m0, m0
.quit:
movd [r0], m0
RET
INIT_XMM sse4
;; intra_pred_ang4_26: pure vertical mode (26).  Every output row is a copy of
;; the above reference row; if bFilter (r4m) is set, the first column is
;; additionally filtered with the left-column gradient.
cglobal intra_pred_ang4_26, 3,4,3
movd m0, [r2 + 1] ; above row [4 3 2 1]
; store
movd [r0], m0
movd [r0 + r1], m0
movd [r0 + r1 * 2], m0
lea r3, [r1 * 3]
movd [r0 + r3], m0
; filter: col0[y] = above[0] + ((left[y] - topleft) >> 1)
cmp r4m, byte 0
jz .quit
pshufb m0, [pb_0_8] ; [ 1 1 1 1]
movh m1, [r2 + 8] ; [-4 -3 -2 -1 0]
pinsrb m1, [r2], 0 ; splice in the top-left pixel
pshufb m2, m1, [pb_0_8] ; [0 0 0 0]
pshufb m1, [pb_unpackbw1] ; [-4 -3 -2 -1]
psubw m1, m2
psraw m1, 1
paddw m0, m1
packuswb m0, m0
pextrb [r0], m0, 0
pextrb [r0 + r1], m0, 1
pextrb [r0 + r1 * 2], m0, 2
pextrb [r0 + r3], m0, 3
.quit:
RET
;; intra_pred_ang4_11: modes 11 and 25 (first negative-angle pair); the
;; top-left pixel joins the reference.  Shares ang4_3's .do_filter4x4 tail.
cglobal intra_pred_ang4_11, 3,5,5
xor r4, r4
cmp r3m, byte 25
mov r3, 8
cmove r3, r4 ; source offset: 0 for mode 25, 8 for mode 11
movh m0, [r2 + r3] ; [x x x 4 3 2 1 0]
pinsrb m0, [r2], 0 ; force top-left into lane 0
palignr m1, m0, 1 ; [x x x x 4 3 2 1]
punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0]
punpcklqdq m0, m0
mova m2, m0
lea r3, [ang_table + 24 * 16]
movh m3, [r3 + 6 * 16] ; [30]
movhps m3, [r3 + 4 * 16] ; [28]
movh m4, [r3 + 2 * 16] ; [26]
movhps m4, [r3 + 0 * 16] ; [24]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
;; intra_pred_ang4_12: modes 12 and 24; shares ang4_3's .do_filter4x4 tail.
cglobal intra_pred_ang4_12, 3,5,5
xor r4, r4
cmp r3m, byte 24
mov r3, 8
cmove r3, r4 ; source offset: 0 for mode 24, 8 for mode 12
movh m0, [r2 + r3] ; [x x x 4 3 2 1 0]
pinsrb m0, [r2], 0 ; force top-left into lane 0
palignr m1, m0, 1 ; [x x x x 4 3 2 1]
punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0]
punpcklqdq m0, m0
mova m2, m0
lea r3, [ang_table + 20 * 16]
movh m3, [r3 + 7 * 16] ; [27]
movhps m3, [r3 + 2 * 16] ; [22]
movh m4, [r3 - 3 * 16] ; [17]
movhps m4, [r3 - 8 * 16] ; [12]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
;; intra_pred_ang4_13: modes 13 and 23; needs one projected pixel from the
;; opposite reference array (fetched via [r2 + r3 + 4]).  Note: ZF from the
;; cmp survives through .next to the shared tail's jz (jz/xchg do not write
;; flags).  Shares ang4_3's .do_filter4x4 tail.
cglobal intra_pred_ang4_13, 4,5,5
xor r4, r4
cmp r3m, byte 23
mov r3, 8
jz .next
xchg r3, r4 ; mode 13: r4 = 8 (read left refs), r3 = 0
.next:
movh m1, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x]
pinsrb m1, [r2], 1 ; top-left into lane 1
palignr m0, m1, 1 ; [x x x 4 3 2 1 0]
palignr m2, m1, 2 ; [x x x x 4 3 2 1]
pinsrb m1, [r2 + r3 + 4], 0 ; projected pixel from the other reference array
punpcklbw m1, m0 ; [3 2 2 1 1 0 0 x]
punpcklbw m0, m2 ; [4 3 3 2 2 1 1 0]
punpcklqdq m2, m0, m1
punpcklqdq m0, m0
lea r3, [ang_table + 21 * 16]
movh m3, [r3 + 2 * 16] ; [23]
movhps m3, [r3 - 7 * 16] ; [14]
movh m4, [r3 - 16 * 16] ; [ 5]
movhps m4, [r3 + 7 * 16] ; [28]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
;; intra_pred_ang4_14: modes 14 and 22; one projected pixel from the opposite
;; reference ([r2 + r3 + 2]).  ZF is carried to the shared tail's jz.
cglobal intra_pred_ang4_14, 4,5,5
xor r4, r4
cmp r3m, byte 22
mov r3, 8
jz .next
xchg r3, r4 ; mode 14: r4 = 8, r3 = 0
.next:
movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x]
pinsrb m2, [r2], 1 ; top-left into lane 1
palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
palignr m1, m2, 2 ; [x x x x 4 3 2 1]
pinsrb m2, [r2 + r3 + 2], 0 ; projected pixel from the other reference array
punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
punpcklqdq m0, m0
punpcklqdq m2, m2
lea r3, [ang_table + 19 * 16]
movh m3, [r3 + 0 * 16] ; [19]
movhps m3, [r3 - 13 * 16] ; [ 6]
movh m4, [r3 + 6 * 16] ; [25]
movhps m4, [r3 - 7 * 16] ; [12]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
;; intra_pred_ang4_15: modes 15 and 21; two projected pixels from the
;; opposite reference ([r2+r3+2] and [r2+r3+4]).  ZF is carried to the
;; shared tail's jz.
cglobal intra_pred_ang4_15, 4,5,5
xor r4, r4
cmp r3m, byte 21
mov r3, 8
jz .next
xchg r3, r4 ; mode 15: r4 = 8, r3 = 0
.next:
movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x]
pinsrb m2, [r2], 1 ; top-left into lane 1
palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
palignr m1, m2, 2 ; [x x x x 4 3 2 1]
pinsrb m2, [r2 + r3 + 2], 0 ; first projected pixel
pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y]
pinsrb m3, [r2 + r3 + 4], 0 ; second projected pixel
punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y]
punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
punpcklqdq m0, m2
punpcklqdq m2, m4
lea r3, [ang_table + 23 * 16]
movh m3, [r3 - 8 * 16] ; [15]
movhps m3, [r3 + 7 * 16] ; [30]
movh m4, [r3 - 10 * 16] ; [13]
movhps m4, [r3 + 5 * 16] ; [28]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
;; intra_pred_ang4_16: modes 16 and 20; two projected pixels from the
;; opposite reference ([r2+r3+2] and [r2+r3+3]).  ZF is carried to the
;; shared tail's jz.
cglobal intra_pred_ang4_16, 3,5,5
xor r4, r4
cmp r3m, byte 20
mov r3, 8
jz .next
xchg r3, r4 ; mode 16: r4 = 8, r3 = 0
.next:
movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x]
pinsrb m2, [r2], 1 ; top-left into lane 1
palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
palignr m1, m2, 2 ; [x x x x 4 3 2 1]
pinsrb m2, [r2 + r3 + 2], 0 ; first projected pixel
pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y]
pinsrb m3, [r2 + r3 + 3], 0 ; second projected pixel
punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y]
punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
punpcklqdq m0, m2
punpcklqdq m2, m4
lea r3, [ang_table + 19 * 16]
movh m3, [r3 - 8 * 16] ; [11]
movhps m3, [r3 + 3 * 16] ; [22]
movh m4, [r3 - 18 * 16] ; [ 1]
movhps m4, [r3 - 7 * 16] ; [12]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
;; intra_pred_ang4_17: modes 17 and 19 (steepest negative pair); three
;; projected pixels from the opposite reference ([r2+r3+1/2/4]).  ZF is
;; carried to the shared tail's jz.
cglobal intra_pred_ang4_17, 3,5,5
xor r4, r4
cmp r3m, byte 19
mov r3, 8
jz .next
xchg r3, r4 ; mode 17: r4 = 8, r3 = 0
.next:
movh m3, [r2 + r4 - 1] ; [- - 4 3 2 1 0 x]
pinsrb m3, [r2], 1 ; top-left into lane 1
palignr m0, m3, 1 ; [- - - 4 3 2 1 0]
palignr m1, m3, 2 ; [- - - - 4 3 2 1]
mova m4, m0
punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
pinsrb m3, [r2 + r3 + 1], 0 ; first projected pixel
punpcklbw m1, m3, m4 ; [3 2 2 1 1 0 0 x]
punpcklqdq m0, m1
pslldq m2, m3, 1 ; [- 4 3 2 1 0 x y]
pinsrb m2, [r2 + r3 + 2], 0 ; second projected pixel
pslldq m1, m2, 1 ; [4 3 2 1 0 x y z]
pinsrb m1, [r2 + r3 + 4], 0 ; third projected pixel
punpcklbw m1, m2 ; [1 0 0 x x y y z]
punpcklbw m2, m3 ; [2 1 1 0 0 x x y]
punpcklqdq m2, m1
lea r3, [ang_table + 14 * 16]
movh m3, [r3 - 8 * 16] ; [ 6]
movhps m3, [r3 - 2 * 16] ; [12]
movh m4, [r3 + 4 * 16] ; [18]
movhps m4, [r3 + 10 * 16] ; [24]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
;; intra_pred_ang4_18: mode 18 (pure 45-degree diagonal through top-left).
;; Builds the reversed-left + above reference in one register.  It briefly
;; patches src[8] with src[0] so a contiguous reversed dword can be loaded,
;; then restores it.  Only the byte at [r2+8] is modified, so restoring the
;; low word of the saved dword (r4w) is sufficient.
cglobal intra_pred_ang4_18, 3,5,1
mov r4d, [r2 + 8] ; save original bytes before patching
mov r3b, byte [r2]
mov [r2 + 8], r3b ; temporarily write top-left over src[8]
mov r3d, [r2 + 8]
bswap r3d ; reverse the left neighbours
movd m0, r3d
pinsrd m0, [r2 + 1], 1 ; [- 3 2 1 0 -1 -2 -3]
lea r3, [r1 * 3]
movd [r0 + r3], m0
psrldq m0, 1
movd [r0 + r1 * 2], m0
psrldq m0, 1
movd [r0 + r1], m0
psrldq m0, 1
movd [r0], m0
mov [r2 + 8], r4w ; restore the patched source byte
RET
;-----------------------------------------------------------------------------------------
; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
INIT_XMM ssse3
;; intra_pred_ang8_2: 8x8 angular prediction, modes 2 and 34 (pure diagonal
;; copy).  Mode 34 reads the above row at src+2, mode 2 the left column at
;; src+18; each row shifts the 16-byte reference window by one.
cglobal intra_pred_ang8_2, 3,5,2
lea r4, [r2 + 2]
add r2, 18
cmp r3m, byte 34
cmove r2, r4 ; r2 -> src+2 if mode == 34, else src+18
movu m0, [r2]
lea r4, [r1 * 3]
movh [r0], m0
palignr m1, m0, 1
movh [r0 + r1], m1
palignr m1, m0, 2
movh [r0 + r1 * 2], m1
palignr m1, m0, 3
movh [r0 + r4], m1
palignr m1, m0, 4
lea r0, [r0 + r1 * 4]
movh [r0], m1
palignr m1, m0, 5
movh [r0 + r1], m1
palignr m1, m0, 6
movh [r0 + r1 * 2], m1
palignr m1, m0, 7
movh [r0 + r4], m1
RET
INIT_XMM sse4
;; intra_pred_ang8_3: 8x8 angular prediction, modes 3 and 33.  Computes the
;; eight interpolated rows into m4/m5/m6/m1, then falls into .transpose8x8,
;; the shared transpose+store tail that modes 4-9 and 11-17 also jmp to.
;; NOTE: .transpose8x8's jz relies on ZF surviving from the mode compare at
;; the top of each caller -- no flag-writing instruction sits in between.
cglobal intra_pred_ang8_3, 3,5,8
lea r4, [r2 + 1]
add r2, 17
cmp r3m, byte 33
cmove r2, r4 ; r2 -> src+1 (above) for mode 33, src+17 (left) for mode 3
lea r3, [ang_table + 22 * 16]
lea r4, [ang_table + 8 * 16]
mova m3, [pw_1024] ; pmulhrsw rounding constant
movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
pmaddubsw m4, m0, [r3 + 4 * 16] ; [26]
pmulhrsw m4, m3
pmaddubsw m1, [r3 - 2 * 16] ; [20]
pmulhrsw m1, m3
packuswb m4, m1
palignr m5, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
pmaddubsw m5, [r3 - 8 * 16] ; [14]
pmulhrsw m5, m3
palignr m6, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
pmaddubsw m6, [r4] ; [ 8]
pmulhrsw m6, m3
packuswb m5, m6
palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
pmaddubsw m6, m1, [r4 - 6 * 16] ; [ 2]
pmulhrsw m6, m3
pmaddubsw m1, [r3 + 6 * 16] ; [28]
pmulhrsw m1, m3
packuswb m6, m1
palignr m1, m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
pmaddubsw m1, [r3] ; [22]
pmulhrsw m1, m3
palignr m2, m0, 12 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]
pmaddubsw m2, [r3 - 6 * 16] ; [16]
pmulhrsw m2, m3
packuswb m1, m2
jmp .transpose8x8
ALIGN 16
.transpose8x8:
jz .store ; ZF set by caller's cmp: the "above"-sourced mode needs no transpose
; transpose 8x8
punpckhbw m0, m4, m5
punpcklbw m4, m5
punpckhbw m2, m4, m0
punpcklbw m4, m0
punpckhbw m0, m6, m1
punpcklbw m6, m1
punpckhbw m1, m6, m0
punpcklbw m6, m0
punpckhdq m5, m4, m6
punpckldq m4, m6
punpckldq m6, m2, m1
punpckhdq m2, m1
mova m1, m2
.store:
lea r4, [r1 * 3]
movh [r0], m4
movhps [r0 + r1], m4
movh [r0 + r1 * 2], m5
movhps [r0 + r4], m5
add r0, r4
movh [r0 + r1], m6
movhps [r0 + r1 * 2], m6
movh [r0 + r4], m1
movhps [r0 + r1 * 4], m1
RET
;; intra_pred_ang8_4: modes 4 and 32; tail-calls ang8_3's shared .transpose8x8
;; (ZF from the cmp below selects store-without-transpose for mode 32).
cglobal intra_pred_ang8_4, 3,5,8
lea r4, [r2 + 1]
add r2, 17
cmp r3m, byte 32
cmove r2, r4 ; r2 -> src+1 for mode 32, src+17 for mode 4
lea r3, [ang_table + 24 * 16]
lea r4, [ang_table + 10 * 16]
mova m3, [pw_1024]
movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
mova m5, m1
pmaddubsw m4, m0, [r3 - 3 * 16] ; [21]
pmulhrsw m4, m3
pmaddubsw m1, [r4] ; [10]
pmulhrsw m1, m3
packuswb m4, m1
pmaddubsw m5, [r3 + 7 * 16] ; [31]
pmulhrsw m5, m3
palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
pmaddubsw m6, [r3 - 4 * 16] ; [ 20]
pmulhrsw m6, m3
packuswb m5, m6
palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
pmaddubsw m6, m1, [r4 - 1 * 16] ; [ 9]
pmulhrsw m6, m3
pmaddubsw m1, [r3 + 6 * 16] ; [30]
pmulhrsw m1, m3
packuswb m6, m1
palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
pmaddubsw m1, [r3 - 5 * 16] ; [19]
pmulhrsw m1, m3
palignr m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 8]
pmaddubsw m2, [r4 - 2 * 16] ; [8]
pmulhrsw m2, m3
packuswb m1, m2
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
;; intra_pred_ang8_5: modes 5 and 31; shares ang8_3's .transpose8x8 tail.
cglobal intra_pred_ang8_5, 3,5,8
lea r4, [r2 + 1]
add r2, 17
cmp r3m, byte 31
cmove r2, r4 ; r2 -> src+1 for mode 31, src+17 for mode 5
lea r3, [ang_table + 17 * 16]
lea r4, [ang_table + 2 * 16]
mova m3, [pw_1024]
movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
mova m5, m1
pmaddubsw m4, m0, [r3] ; [17]
pmulhrsw m4, m3
pmaddubsw m1, [r4] ; [2]
pmulhrsw m1, m3
packuswb m4, m1
pmaddubsw m5, [r3 + 2 * 16] ; [19]
pmulhrsw m5, m3
palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
mova m1, m6
pmaddubsw m1, [r4 + 2 * 16] ; [4]
pmulhrsw m1, m3
packuswb m5, m1
pmaddubsw m6, [r3 + 4 * 16] ; [21]
pmulhrsw m6, m3
palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
mova m7, m1
pmaddubsw m7, [r4 + 4 * 16] ; [6]
pmulhrsw m7, m3
packuswb m6, m7
pmaddubsw m1, [r3 + 6 * 16] ; [23]
pmulhrsw m1, m3
palignr m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 8 8 9]
pmaddubsw m2, [r4 + 6 * 16] ; [8]
pmulhrsw m2, m3
packuswb m1, m2
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
;; intra_pred_ang8_6: modes 6 and 30; shares ang8_3's .transpose8x8 tail.
cglobal intra_pred_ang8_6, 3,5,8
lea r4, [r2 + 1]
add r2, 17
cmp r3m, byte 30
cmove r2, r4 ; r2 -> src+1 for mode 30, src+17 for mode 6
lea r3, [ang_table + 20 * 16]
lea r4, [ang_table + 8 * 16]
mova m7, [pw_1024]
movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
mova m1, m0
pmaddubsw m4, m0, [r3 - 7 * 16] ; [13]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 6 * 16] ; [26]
pmulhrsw m1, m7
packuswb m4, m1
palignr m6, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
pmaddubsw m5, m6, [r4 - 1 * 16] ; [7]
pmulhrsw m5, m7
pmaddubsw m6, [r3] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
palignr m1, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
pmaddubsw m6, m1, [r4 - 7 * 16] ; [1]
pmulhrsw m6, m7
mova m3, m1
pmaddubsw m3, [r3 - 6 * 16] ; [14]
pmulhrsw m3, m7
packuswb m6, m3
pmaddubsw m1, [r3 + 7 * 16] ; [27]
pmulhrsw m1, m7
palignr m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
pmaddubsw m2, [r4] ; [8]
pmulhrsw m2, m7
packuswb m1, m2
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
;; intra_pred_ang8_7: modes 7 and 29; shares ang8_3's .transpose8x8 tail.
cglobal intra_pred_ang8_7, 3,5,8
lea r4, [r2 + 1]
add r2, 17
cmp r3m, byte 29
cmove r2, r4 ; r2 -> src+1 for mode 29, src+17 for mode 7
lea r3, [ang_table + 24 * 16]
lea r4, [ang_table + 6 * 16]
mova m7, [pw_1024]
movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
pmaddubsw m4, m0, [r4 + 3 * 16] ; [9]
pmulhrsw m4, m7
pmaddubsw m3, m0, [r3 - 6 * 16] ; [18]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m5, m0, [r3 + 3 * 16] ; [27]
pmulhrsw m5, m7
palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
pmaddubsw m6, m1, [r4 - 2 * 16] ; [4]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m1, [r4 + 7 * 16] ; [13]
pmulhrsw m6, m7
mova m3, m1
pmaddubsw m3, [r3 - 2 * 16] ; [22]
pmulhrsw m3, m7
packuswb m6, m3
pmaddubsw m1, [r3 + 7 * 16] ; [31]
pmulhrsw m1, m7
palignr m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
pmaddubsw m2, [r4 + 2 * 16] ; [8]
pmulhrsw m2, m7
packuswb m1, m2
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
;; intra_pred_ang8_8: modes 8 and 28; only two distinct pixel-pair rows are
;; needed (m0 and m2), with eight coefficient sets.  Shares ang8_3's tail.
cglobal intra_pred_ang8_8, 3,5,8
lea r4, [r2 + 1]
add r2, 17
cmp r3m, byte 28
cmove r2, r4 ; r2 -> src+1 for mode 28, src+17 for mode 8
lea r3, [ang_table + 23 * 16]
lea r4, [ang_table + 8 * 16]
mova m7, [pw_1024]
movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
pmaddubsw m4, m0, [r4 - 3 * 16] ; [5]
pmulhrsw m4, m7
pmaddubsw m3, m0, [r4 + 2 * 16] ; [10]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m5, m0, [r3 - 8 * 16] ; [15]
pmulhrsw m5, m7
pmaddubsw m6, m0, [r3 - 3 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m0, [r3 + 2 * 16] ; [25]
pmulhrsw m6, m7
pmaddubsw m0, [r3 + 7 * 16] ; [30]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m2, [r4 - 5 * 16] ; [3]
pmulhrsw m1, m7
pmaddubsw m2, [r4] ; [8]
pmulhrsw m2, m7
packuswb m1, m2
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
;; intra_pred_ang8_9: modes 9 and 27 (shallowest positive pair); a single
;; pixel-pair row with coefficients [2..16].  Shares ang8_3's tail.
cglobal intra_pred_ang8_9, 3,5,8
lea r4, [r2 + 1]
add r2, 17
cmp r3m, byte 27
cmove r2, r4 ; r2 -> src+1 for mode 27, src+17 for mode 9
lea r3, [ang_table + 10 * 16]
mova m7, [pw_1024]
movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
pmaddubsw m4, m0, [r3 - 8 * 16] ; [2]
pmulhrsw m4, m7
pmaddubsw m3, m0, [r3 - 6 * 16] ; [4]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m5, m0, [r3 - 4 * 16] ; [6]
pmulhrsw m5, m7
pmaddubsw m6, m0, [r3 - 2 * 16] ; [8]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m0, [r3] ; [10]
pmulhrsw m6, m7
pmaddubsw m2, m0, [r3 + 2 * 16] ; [12]
pmulhrsw m2, m7
packuswb m6, m2
pmaddubsw m1, m0, [r3 + 4 * 16] ; [14]
pmulhrsw m1, m7
pmaddubsw m0, [r3 + 6 * 16] ; [16]
pmulhrsw m0, m7
packuswb m1, m0
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
;; intra_pred_ang8_10: pure horizontal mode (10) for 8x8.  Each output row
;; broadcasts one left-neighbour pixel; if bFilter (r4m) is set, the first
;; row is re-derived with the above-row gradient before the final store.
cglobal intra_pred_ang8_10, 3,6,5
movh m0, [r2 + 17] ; left neighbours [8..1]
mova m4, [pb_unpackbq] ; broadcast byte pairs into qwords
palignr m1, m0, 2
pshufb m1, m4
palignr m2, m0, 4
pshufb m2, m4
palignr m3, m0, 6
pshufb m3, m4
pshufb m0, m4
lea r5, [r1 * 3]
movhps [r0 + r1], m0
movh [r0 + r1 * 2], m1
movhps [r0 + r5], m1
lea r3, [r0 + r1 * 4]
movh [r3], m2
movhps [r3 + r1], m2
movh [r3 + r1 * 2], m3
movhps [r3 + r5], m3
; filter: row0 = left[0] + ((above[x] - topleft) >> 1)
cmp r4m, byte 0
jz .quit
pmovzxbw m0, m0
movu m1, [r2]
palignr m2, m1, 1 ; above row
pshufb m1, m4 ; top-left broadcast
pmovzxbw m1, m1
pmovzxbw m2, m2
psubw m2, m1
psraw m2, 1
paddw m0, m2
packuswb m0, m0
.quit:
movh [r0], m0 ; row 0 stored last (filtered or plain broadcast)
RET
;; intra_pred_ang8_26: pure vertical mode (26) for 8x8.  Every row copies the
;; above reference; if bFilter (r4m) is set, the first column is rewritten
;; with the left-column gradient via byte extracts.
cglobal intra_pred_ang8_26, 3,6,3
movu m2, [r2]
palignr m0, m2, 1 ; above row [8..1]
lea r5, [r1 * 3]
movh [r0], m0
movh [r0 + r1], m0
movh [r0 + r1 * 2], m0
movh [r0 + r5], m0
lea r3, [r0 + r1 * 4]
movh [r3], m0
movh [r3 + r1], m0
movh [r3 + r1 * 2], m0
movh [r3 + r5], m0
; filter: col0[y] = above[0] + ((left[y] - topleft) >> 1)
cmp r4m, byte 0
jz .quit
pshufb m2, [pb_unpackbq] ; low half = topleft broadcast, high half = above[0]
movhlps m1, m2
pmovzxbw m2, m2 ; topleft as words
movu m0, [r2 + 17] ; left neighbours
pmovzxbw m1, m1 ; above[0] as words
pmovzxbw m0, m0
psubw m0, m2
psraw m0, 1
paddw m1, m0
packuswb m1, m1
pextrb [r0], m1, 0
pextrb [r0 + r1], m1, 1
pextrb [r0 + r1 * 2], m1, 2
pextrb [r0 + r5], m1, 3
pextrb [r3], m1, 4
pextrb [r3 + r1], m1, 5
pextrb [r3 + r1 * 2], m1, 6
pextrb [r3 + r5], m1, 7
.quit:
RET
;; intra_pred_ang8_11: modes 11 and 25; top-left joins the reference, single
;; pixel-pair row with coefficients [30..16].  Shares ang8_3's tail.
cglobal intra_pred_ang8_11, 3,5,8
xor r4, r4
cmp r3m, byte 25
mov r3, 16
cmove r3, r4 ; source offset: 0 for mode 25, 16 for mode 11
movu m0, [r2 + r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
pinsrb m0, [r2], 0 ; force top-left into lane 0
palignr m1, m0, 1 ; [x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
punpcklbw m0, m1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
lea r3, [ang_table + 23 * 16]
mova m7, [pw_1024]
pmaddubsw m4, m0, [r3 + 7 * 16] ; [30]
pmulhrsw m4, m7
pmaddubsw m3, m0, [r3 + 5 * 16] ; [28]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m5, m0, [r3 + 3 * 16] ; [26]
pmulhrsw m5, m7
pmaddubsw m6, m0, [r3 + 1 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m0, [r3 - 1 * 16] ; [22]
pmulhrsw m6, m7
pmaddubsw m2, m0, [r3 - 3 * 16] ; [20]
pmulhrsw m2, m7
packuswb m6, m2
pmaddubsw m1, m0, [r3 - 5 * 16] ; [18]
pmulhrsw m1, m7
pmaddubsw m0, [r3 - 7 * 16] ; [16]
pmulhrsw m0, m7
packuswb m1, m0
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
;; intra_pred_ang8_12: modes 12 and 24; one projected pixel from the opposite
;; reference ([r2 + r3 + 6]).  ZF from the cmp survives to the shared tail's
;; jz (jz/xchg write no flags).  Shares ang8_3's .transpose8x8.
cglobal intra_pred_ang8_12, 3,5,8
xor r4, r4
cmp r3m, byte 24
mov r3, 16
jz .next
xchg r3, r4 ; mode 12: r4 = 16 (read left refs), r3 = 0
.next:
movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
pinsrb m1, [r2], 0 ; top-left into lane 0
pslldq m0, m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
pinsrb m0, [r2 + r3 + 6], 0 ; projected pixel from the other reference
lea r4, [ang_table + 22 * 16]
mova m7, [pw_1024]
punpckhbw m2, m0, m1 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]
punpcklbw m0, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
palignr m2, m0, 2 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
pmaddubsw m4, m2, [r4 + 5 * 16] ; [27]
pmulhrsw m4, m7
pmaddubsw m3, m2, [r4] ; [22]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m1, m0, [r4 + 7 * 16] ; [29]
pmulhrsw m1, m7
pmaddubsw m0, [r4 + 2 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
pmaddubsw m5, m2, [r4 - 5 * 16] ; [17]
pmulhrsw m5, m7
lea r4, [ang_table + 7 * 16]
pmaddubsw m6, m2, [r4 + 5 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m2, [r4] ; [7]
pmulhrsw m6, m7
pmaddubsw m2, [r4 - 5 * 16] ; [2]
pmulhrsw m2, m7
packuswb m6, m2
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
;; intra_pred_ang8_13: modes 13 and 23; two projected pixels from the
;; opposite reference ([r2+r3+4] and [r2+r3+7]).  ZF carried to shared tail.
cglobal intra_pred_ang8_13, 4,5,8
xor r4, r4
cmp r3m, byte 23
mov r3, 16
jz .next
xchg r3, r4 ; mode 13: r4 = 16, r3 = 0
.next:
movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
pinsrb m1, [r2], 0 ; top-left into lane 0
pslldq m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
pinsrb m1, [r2 + r3 + 4], 0 ; first projected pixel
pslldq m0, m1, 1 ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
pinsrb m0, [r2 + r3 + 7], 0 ; second projected pixel
punpckhbw m5, m0, m1 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
punpcklbw m0, m1 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
palignr m1, m5, m0, 2 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
palignr m5, m0, 4 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
lea r4, [ang_table + 24 * 16]
mova m7, [pw_1024]
pmaddubsw m4, m5, [r4 - 1 * 16] ; [23]
pmulhrsw m4, m7
pmaddubsw m6, m1, [r4 + 4 * 16] ; [28]
pmulhrsw m6, m7
pmaddubsw m0, [r4] ; [24]
pmulhrsw m0, m7
lea r4, [ang_table + 13 * 16]
pmaddubsw m3, m5, [r4 + 1 * 16] ; [14]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m5, [r4 - 8 * 16] ; [5]
pmulhrsw m5, m7
packuswb m5, m6
pmaddubsw m6, m1, [r4 + 6 * 16] ; [19]
pmulhrsw m6, m7
pmaddubsw m2, m1, [r4 - 3 * 16] ; [10]
pmulhrsw m2, m7
packuswb m6, m2
pmaddubsw m1, [r4 - 12 * 16] ; [1]
pmulhrsw m1, m7
packuswb m1, m0
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
;; intra_pred_ang8_14: modes 14 and 22; three projected pixels from the
;; opposite reference ([r2+r3+2/5/7]).  ZF carried to shared tail.
cglobal intra_pred_ang8_14, 4,5,8
xor r4, r4
cmp r3m, byte 22
mov r3, 16
jz .next
xchg r3, r4 ; mode 14: r4 = 16, r3 = 0
.next:
movu m1, [r2 + r4 - 2] ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
pinsrb m1, [r2], 2 ; top-left into lane 2
pinsrb m1, [r2 + r3 + 2], 1 ; projected pixels from the other reference
pinsrb m1, [r2 + r3 + 5], 0
pslldq m0, m1, 1 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
pinsrb m0, [r2 + r3 + 7], 0
punpckhbw m2, m0, m1 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
punpcklbw m0, m1 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
palignr m1, m2, m0, 2 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
palignr m6, m2, m0, 4 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
palignr m2, m0, 6 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
lea r4, [ang_table + 24 * 16]
mova m3, [pw_1024]
pmaddubsw m4, m2, [r4 - 5 * 16] ; [19]
pmulhrsw m4, m3
pmaddubsw m0, [r4] ; [24]
pmulhrsw m0, m3
pmaddubsw m5, m6, [r4 + 1 * 16] ; [25]
pmulhrsw m5, m3
lea r4, [ang_table + 12 * 16]
pmaddubsw m6, [r4] ; [12]
pmulhrsw m6, m3
packuswb m5, m6
pmaddubsw m6, m1, [r4 + 19 * 16] ; [31]
pmulhrsw m6, m3
pmaddubsw m2, [r4 - 6 * 16] ; [6]
pmulhrsw m2, m3
packuswb m4, m2
pmaddubsw m2, m1, [r4 + 6 * 16] ; [18]
pmulhrsw m2, m3
packuswb m6, m2
pmaddubsw m1, [r4 - 7 * 16] ; [5]
pmulhrsw m1, m3
packuswb m1, m0
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
;; intra_pred_ang8_15: modes 15 and 21; the projected pixels are gathered in
;; one shot via the c_mode16_15 shuffle of the opposite reference, plus one
;; extra pinsrb.  ZF carried to shared tail.
cglobal intra_pred_ang8_15, 4,5,8
xor r4, r4
cmp r3m, byte 21
mov r3, 16
jz .next
xchg r3, r4 ; mode 15: r4 = 16, r3 = 0
.next:
movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
pinsrb m1, [r2], 0 ; top-left into lane 0
movu m2, [r2 + r3]
pshufb m2, [c_mode16_15] ; gather projected pixels from the other reference
palignr m1, m2, 13 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
pslldq m0, m1, 1 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
pinsrb m0, [r2 + r3 + 8], 0
punpckhbw m4, m0, m1 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
palignr m1, m4, m0, 2 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
palignr m6, m4, m0, 4 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
palignr m5, m4, m0, 6 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
palignr m4, m0, 8 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
lea r4, [ang_table + 23 * 16]
mova m3, [pw_1024]
pmaddubsw m4, [r4 - 8 * 16] ; [15]
pmulhrsw m4, m3
pmaddubsw m2, m5, [r4 + 7 * 16] ; [30]
pmulhrsw m2, m3
packuswb m4, m2
pmaddubsw m5, [r4 - 10 * 16] ; [13]
pmulhrsw m5, m3
pmaddubsw m2, m6, [r4 + 5 * 16] ; [28]
pmulhrsw m2, m3
packuswb m5, m2
pmaddubsw m2, m1, [r4 + 3 * 16] ; [26]
pmulhrsw m2, m3
pmaddubsw m0, [r4 + 1 * 16] ; [24]
pmulhrsw m0, m3
lea r4, [ang_table + 11 * 16]
pmaddubsw m6, [r4] ; [11]
pmulhrsw m6, m3
packuswb m6, m2
pmaddubsw m1, [r4 - 2 * 16] ; [9]
pmulhrsw m1, m3
packuswb m1, m0
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
;; intra_pred_ang8_16: modes 16 and 20; projected pixels gathered via the
;; c_mode16_16 shuffle of the opposite reference.  ZF carried to shared tail.
cglobal intra_pred_ang8_16, 4,5,8
xor r4, r4
cmp r3m, byte 20
mov r3, 16
jz .next
xchg r3, r4 ; mode 16: r4 = 16, r3 = 0
.next:
movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
pinsrb m1, [r2], 0 ; top-left into lane 0
movu m2, [r2 + r3]
pshufb m2, [c_mode16_16] ; gather projected pixels from the other reference
palignr m1, m2, 12 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
pslldq m0, m1, 1 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
pinsrb m0, [r2 + r3 + 8], 0
punpckhbw m4, m0, m1 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
punpcklbw m0, m1 ; [3 2 2 1 1 0 0 a a b b c c d d e]
palignr m1, m4, m0, 2 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
palignr m6, m4, m0, 4 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
palignr m2, m4, m0, 6 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
palignr m5, m4, m0, 8 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
palignr m4, m0, 10 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
lea r4, [ang_table + 22 * 16]
mova m7, [pw_1024]
pmaddubsw m3, m5, [r4] ; [22]
pmulhrsw m3, m7
pmaddubsw m0, [r4 + 2 * 16] ; [24]
pmulhrsw m0, m7
lea r4, [ang_table + 9 * 16]
pmaddubsw m4, [r4 + 2 * 16] ; [11]
pmulhrsw m4, m7
packuswb m4, m3
pmaddubsw m2, [r4 + 3 * 16] ; [12]
pmulhrsw m2, m7
pmaddubsw m5, [r4 - 8 * 16] ; [1]
pmulhrsw m5, m7
packuswb m5, m2
mova m2, m6
pmaddubsw m6, [r4 + 14 * 16] ; [23]
pmulhrsw m6, m7
pmaddubsw m2, [r4 - 7 * 16] ; [2]
pmulhrsw m2, m7
packuswb m6, m2
pmaddubsw m1, [r4 + 4 * 16] ; [13]
pmulhrsw m1, m7
packuswb m1, m0
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
;; intra_pred_ang8_17: modes 17 and 19; projected pixels gathered via the
;; c_mode16_17 shuffle of the opposite reference.  ZF carried to shared tail.
cglobal intra_pred_ang8_17, 4,5,8
xor r4, r4
cmp r3m, byte 19
mov r3, 16
jz .next
xchg r3, r4 ; mode 17: r4 = 16, r3 = 0
.next:
movu m2, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
pinsrb m2, [r2], 0 ; top-left into lane 0
movu m1, [r2 + r3]
pshufb m1, [c_mode16_17] ; gather projected pixels from the other reference
palignr m2, m1, 11 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
pslldq m0, m2, 1 ; [9 8 7 6 5 4 3 2 1 0 a b c d e f]
pinsrb m0, [r2 + r3 + 7], 0
punpckhbw m1, m0, m2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
punpcklbw m0, m2 ; [2 1 1 0 0 a a b b c c d d e e f]
palignr m5, m1, m0, 8 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
palignr m2, m1, m0, 10 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
palignr m4, m1, m0, 12 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
lea r4, [ang_table + 17 * 16]
mova m3, [pw_1024]
pmaddubsw m2, [r4 - 5 * 16] ; [12]
pmulhrsw m2, m3
pmaddubsw m4, [r4 - 11 * 16] ; [6]
pmulhrsw m4, m3
packuswb m4, m2
pmaddubsw m5, [r4 + 1 * 16] ; [18]
pmulhrsw m5, m3
palignr m2, m1, m0, 6 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
pmaddubsw m2, [r4 + 7 * 16] ; [24]
pmulhrsw m2, m3
packuswb m5, m2
palignr m6, m1, m0, 4 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
mova m2, m6
pmaddubsw m6, [r4 + 13 * 16] ; [30]
pmulhrsw m6, m3
pmaddubsw m2, [r4 - 13 * 16] ; [4]
pmulhrsw m2, m3
packuswb m6, m2
palignr m1, m0, 2 ; [3 2 2 1 1 0 0 a a b b c c d d e]
pmaddubsw m1, [r4 - 7 * 16] ; [10]
pmulhrsw m1, m3
pmaddubsw m0, [r4 - 1 * 16] ; [16]
pmulhrsw m0, m3
packuswb m1, m0
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
;; intra_pred_ang8_18: mode 18 (45-degree diagonal through the top-left) for
;; 8x8.  Builds [above | reversed-left] in one register and emits each row by
;; shifting one byte; rows are written bottom-up.
cglobal intra_pred_ang8_18, 4,4,1
movu m0, [r2 + 16] ; left neighbours
pinsrb m0, [r2], 0 ; top-left
pshufb m0, [pb_swap8] ; reverse the low 8 bytes
movhps m0, [r2 + 1] ; above row into the high half
lea r2, [r0 + r1 * 4]
lea r3, [r1 * 3]
movh [r2 + r3], m0 ; row 7 first, then shift up one byte per row
psrldq m0, 1
movh [r2 + r1 * 2], m0
psrldq m0, 1
movh [r2 + r1], m0
psrldq m0, 1
movh [r2], m0
psrldq m0, 1
movh [r0 + r3], m0
psrldq m0, 1
movh [r0 + r1 * 2], m0
psrldq m0, 1
movh [r0 + r1], m0
psrldq m0, 1
movh [r0], m0
RET
;; TRANSPOSE_STORE_8x8 %1:column-block index, %2:transpose flag (1 = transpose
;; before store), %3-%6: four xmm regs each holding two packed 8-pixel rows.
;; With %2==1 the 8x8 tile is transposed and stored at column offset %1*8
;; using r0/r6 as upper/lower half base and r1/r5 (=3*r1) as strides; with
;; %2==0 the rows are stored straight and r0 is advanced by 8 rows.
;; Clobbers m0; with %2==0 also advances r0.
%macro TRANSPOSE_STORE_8x8 6
%if %2 == 1
; transpose 8x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
punpckhbw m0, %3, %4
punpcklbw %3, %4
punpckhbw %4, %3, m0
punpcklbw %3, m0
punpckhbw m0, %5, m1 ; NOTE(review): hard-codes m1 where %6 is expected;
punpcklbw %5, %6 ; all visible call sites pass m1 as %6, so it matches
punpckhbw %6, %5, m0
punpcklbw %5, m0
punpckhdq m0, %3, %5
punpckldq %3, %5
punpckldq %5, %4, %6
punpckhdq %4, %6
movh [r0 + + %1 * 8], %3
movhps [r0 + r1 + %1 * 8], %3
movh [r0 + r1*2 + %1 * 8], m0
movhps [r0 + r5 + %1 * 8], m0
movh [r6 + %1 * 8], %5
movhps [r6 + r1 + %1 * 8], %5
movh [r6 + r1*2 + %1 * 8], %4
movhps [r6 + r5 + %1 * 8], %4
%else
; store 8x8, used by angle BLOCK_16x16 and BLOCK_32x32
movh [r0 ], %3
movhps [r0 + r1 ], %3
movh [r0 + r1 * 2], %4
movhps [r0 + r5 ], %4
lea r0, [r0 + r1 * 4]
movh [r0 ], %5
movhps [r0 + r1 ], %5
movh [r0 + r1 * 2], %6
movhps [r0 + r5 ], %6
lea r0, [r0 + r1 * 4]
%endif
%endmacro
;------------------------------------------------------------------------------------------
; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;------------------------------------------------------------------------------------------
INIT_XMM ssse3
;; intra_pred_ang16_2: 16x16 angular prediction, modes 2 and 34 (pure
;; diagonal copy).  Mode 34 reads the above row at src+2, mode 2 the left
;; column at src+34; each row shifts the 32-byte reference window by one.
cglobal intra_pred_ang16_2, 3,5,3
lea r4, [r2 + 2]
add r2, 34
cmp r3m, byte 34
cmove r2, r4 ; r2 -> src+2 if mode == 34, else src+34
movu m0, [r2]
movu m1, [r2 + 16]
movu [r0], m0
palignr m2, m1, m0, 1
movu [r0 + r1], m2
lea r0, [r0 + r1 * 2]
palignr m2, m1, m0, 2
movu [r0], m2
palignr m2, m1, m0, 3
movu [r0 + r1], m2
lea r0, [r0 + r1 * 2]
palignr m2, m1, m0, 4
movu [r0], m2
palignr m2, m1, m0, 5
movu [r0 + r1], m2
lea r0, [r0 + r1 * 2]
palignr m2, m1, m0, 6
movu [r0], m2
palignr m2, m1, m0, 7
movu [r0 + r1], m2
lea r0, [r0 + r1 * 2]
palignr m2, m1, m0, 8
movu [r0], m2
palignr m2, m1, m0, 9
movu [r0 + r1], m2
lea r0, [r0 + r1 * 2]
palignr m2, m1, m0, 10
movu [r0], m2
palignr m2, m1, m0, 11
movu [r0 + r1], m2
lea r0, [r0 + r1 * 2]
palignr m2, m1, m0, 12
movu [r0], m2
palignr m2, m1, m0, 13
movu [r0 + r1], m2
lea r0, [r0 + r1 * 2]
palignr m2, m1, m0, 14
movu [r0], m2
palignr m2, m1, m0, 15
movu [r0 + r1], m2
RET
INIT_XMM sse4
;; intra_pred_ang16_3: 16x16 angular prediction, mode 3 (left reference at
;; src+32).  Processes two 8-column halves per loop iteration; each half
;; computes 8 interpolated rows and stores them transposed via
;; TRANSPOSE_STORE_8x8 (flag %2 = 1).
cglobal intra_pred_ang16_3, 3,7,8
add r2, 32
lea r3, [ang_table + 16 * 16]
mov r4d, 2 ; two 8-row column blocks
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024] ; pmulhrsw rounding constant
.loop:
movu m0, [r2 + 1]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m1, m2, m0, 2
pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 4 * 16] ; [20]
pmulhrsw m1, m7
packuswb m4, m1
palignr m5, m2, m0, 4
pmaddubsw m5, [r3 - 2 * 16] ; [14]
pmulhrsw m5, m7
palignr m6, m2, m0, 6
pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
pmulhrsw m6, m7
packuswb m5, m6
palignr m1, m2, m0, 8
pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
pmulhrsw m6, m7
pmaddubsw m1, [r3 + 12 * 16] ; [28]
pmulhrsw m1, m7
packuswb m6, m1
palignr m1, m2, m0, 10
pmaddubsw m1, [r3 + 6 * 16] ; [22]
pmulhrsw m1, m7
palignr m2, m0, 12
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
movu m0, [r2 + 8]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m5, m2, m0, 2
pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
pmulhrsw m4, m7
pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, [r3 + 14 * 16] ; [30]
pmulhrsw m5, m7
palignr m6, m2, m0, 4
pmaddubsw m6, [r3 + 8 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6
palignr m1, m2, m0, 6
pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
pmulhrsw m6, m7
palignr m1, m2, m0, 8
pmaddubsw m1, [r3 - 4 * 16] ; [12]
pmulhrsw m1, m7
packuswb m6, m1
palignr m1, m2, m0, 10
pmaddubsw m1, [r3 - 10 * 16] ; [06]
pmulhrsw m1, m7
packuswb m1, m1
movhps m1, [r2 + 14] ; [00] -- fraction 0: copy reference pixels directly
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8
dec r4
jnz .loop
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_33(pixel* dst, intptr_t dstStride, const pixel* srcPix,
;                          int dirMode, int bFilter)
; 16x16 angular intra prediction, mode 33, SSE4.
; Mirror of mode 3 with the same ang_table fractions, but rows are written
; directly (TRANSPOSE_STORE_8x8 transpose flag = 0 / explicit movh stores)
; instead of transposed, since mode >= 18 predicts from the above reference.
; r0 = dst, r1 = dstStride, r2 = srcPix.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_33, 3,7,8
lea r3, [ang_table + 16 * 16]          ; r3 -> center of ang_table (fraction 16)
mov r4d, 2                             ; two passes of 8 columns each
lea r5, [r1 * 3]                       ; r5 -> 3 * stride
mov r6, r0                             ; remember dst origin for the column step
mova m7, [pw_1024]                     ; pmulhrsw rounding constant
.loop:
movu m0, [r2 + 1]
palignr m1, m0, 1
punpckhbw m2, m0, m1                   ; interleave adjacent pairs (high half)
punpcklbw m0, m1                       ; interleave adjacent pairs (low half)
palignr m1, m2, m0, 2
pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 4 * 16] ; [20]
pmulhrsw m1, m7
packuswb m4, m1
palignr m5, m2, m0, 4
pmaddubsw m5, [r3 - 2 * 16] ; [14]
pmulhrsw m5, m7
palignr m6, m2, m0, 6
pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
pmulhrsw m6, m7
packuswb m5, m6
palignr m1, m2, m0, 8
pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
pmulhrsw m6, m7
pmaddubsw m1, [r3 + 12 * 16] ; [28]
pmulhrsw m1, m7
packuswb m6, m1
palignr m1, m2, m0, 10
pmaddubsw m1, [r3 + 6 * 16] ; [22]
pmulhrsw m1, m7
palignr m2, m0, 12
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1   ; direct (non-transposed) store
movu m0, [r2 + 8]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m5, m2, m0, 2
pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
pmulhrsw m4, m7
pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, [r3 + 14 * 16] ; [30]
pmulhrsw m5, m7
palignr m6, m2, m0, 4
pmaddubsw m6, [r3 + 8 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6
palignr m1, m2, m0, 6
pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
pmulhrsw m6, m7
palignr m1, m2, m0, 8
pmaddubsw m1, [r3 - 4 * 16] ; [12]
pmulhrsw m1, m7
packuswb m6, m1
palignr m1, m2, m0, 10
pmaddubsw m1, [r3 - 10 * 16] ; [06]
pmulhrsw m1, m7
packuswb m1, m1
movh m2, [r2 + 14] ; [00]              ; fraction 0 row: copy reference directly
movh [r0 ], m4                         ; write the last 8 rows row-by-row
movhps [r0 + r1 ], m4
movh [r0 + r1 * 2], m5
movhps [r0 + r5 ], m5
lea r0, [r0 + r1 * 4]
movh [r0 ], m6
movhps [r0 + r1 ], m6
movh [r0 + r1 * 2], m1
movh [r0 + r5 ], m2
lea r0, [r6 + 8]                       ; second pass fills the right 8 columns
add r2, 8
dec r4
jnz .loop
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_4(pixel* dst, intptr_t dstStride, const pixel* srcPix,
;                         int dirMode, int bFilter)
; 16x16 angular intra prediction, mode 4, SSE4.
; Same interpolation scheme as mode 3 (pmaddubsw with ang_table fractions,
; pmulhrsw rounding) but with the fraction sequence for mode 4; output is
; stored transposed.  r0 = dst, r1 = dstStride, r2 = srcPix.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_4, 3,7,8
add r2, 32                             ; NOTE(review): presumably selects the second
                                       ; reference array at srcPix + 32 — confirm layout
lea r3, [ang_table + 16 * 16]          ; r3 -> center of ang_table (fraction 16)
mov r4d, 2                             ; two passes of 8 columns each
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]                     ; pmulhrsw rounding constant
.loop:
movu m0, [r2 + 1]
palignr m1, m0, 1
punpckhbw m2, m0, m1                   ; interleave adjacent pairs (high half)
punpcklbw m0, m1                       ; interleave adjacent pairs (low half)
palignr m1, m2, m0, 2
mova m5, m1
pmaddubsw m4, m0, [r3 + 5 * 16] ; [21]
pmulhrsw m4, m7
pmaddubsw m1, [r3 - 6 * 16] ; [10]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, [r3 + 15 * 16] ; [31]
pmulhrsw m5, m7
palignr m6, m2, m0, 4
pmaddubsw m6, [r3 + 4 * 16] ; [ 20]
pmulhrsw m6, m7
packuswb m5, m6
palignr m1, m2, m0, 6
pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9]
pmulhrsw m6, m7
pmaddubsw m1, [r3 + 14 * 16] ; [30]
pmulhrsw m1, m7
packuswb m6, m1
palignr m1, m2, m0, 8
pmaddubsw m1, [r3 + 3 * 16] ; [19]
pmulhrsw m1, m7
palignr m2, m0, 10
pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1   ; first 8 rows, stored transposed
pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]    ; m2 (offset 10) reused before reload
pmulhrsw m4, m7
movu m0, [r2 + 6]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m1, m2, m0, 2
pmaddubsw m1, [r3 + 2 * 16] ; [18]
pmulhrsw m1, m7
packuswb m4, m1
palignr m5, m2, m0, 4
mova m6, m5
pmaddubsw m5, [r3 - 9 * 16] ; [07]
pmulhrsw m5, m7
pmaddubsw m6, [r3 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6
palignr m6, m2, m0, 6
pmaddubsw m6, [r3 + 16] ; [17]
pmulhrsw m6, m7
palignr m1, m2, m0, 8
palignr m2, m0, 10
pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
pmulhrsw m3, m7
packuswb m6, m3
pmaddubsw m1, [r3 + 11 * 16] ; [27]
pmulhrsw m1, m7
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1   ; second 8 rows, stored transposed
lea r0, [r6 + r1 * 4]                  ; advance dst to next 8-row band
lea r6, [r6 + r1 * 8]
add r2, 8                              ; advance reference by 8 pixels
dec r4
jnz .loop
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_32(pixel* dst, intptr_t dstStride, const pixel* srcPix,
;                          int dirMode, int bFilter)
; 16x16 angular intra prediction, mode 32, SSE4.
; Mirror of mode 4: identical fraction sequence, but stores are direct
; (transpose flag = 0) since mode >= 18 predicts from the above reference.
; r0 = dst, r1 = dstStride, r2 = srcPix.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_32, 3,7,8
lea r3, [ang_table + 16 * 16]          ; r3 -> center of ang_table (fraction 16)
mov r4d, 2                             ; two passes of 8 columns each
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0                             ; remember dst origin for the column step
mova m7, [pw_1024]                     ; pmulhrsw rounding constant
.loop:
movu m0, [r2 + 1]
palignr m1, m0, 1
punpckhbw m2, m0, m1                   ; interleave adjacent pairs (high half)
punpcklbw m0, m1                       ; interleave adjacent pairs (low half)
palignr m1, m2, m0, 2
mova m5, m1
pmaddubsw m4, m0, [r3 + 5 * 16] ; [21]
pmulhrsw m4, m7
pmaddubsw m1, [r3 - 6 * 16] ; [10]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, [r3 + 15 * 16] ; [31]
pmulhrsw m5, m7
palignr m6, m2, m0, 4
pmaddubsw m6, [r3 + 4 * 16] ; [ 20]
pmulhrsw m6, m7
packuswb m5, m6
palignr m1, m2, m0, 6
pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9]
pmulhrsw m6, m7
pmaddubsw m1, [r3 + 14 * 16] ; [30]
pmulhrsw m1, m7
packuswb m6, m1
palignr m1, m2, m0, 8
pmaddubsw m1, [r3 + 3 * 16] ; [19]
pmulhrsw m1, m7
palignr m2, m0, 10
pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1   ; direct (non-transposed) store
pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]    ; m2 (offset 10) reused before reload
pmulhrsw m4, m7
movu m0, [r2 + 6]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m1, m2, m0, 2
pmaddubsw m1, [r3 + 2 * 16] ; [18]
pmulhrsw m1, m7
packuswb m4, m1
palignr m5, m2, m0, 4
mova m6, m5
pmaddubsw m5, [r3 - 9 * 16] ; [07]
pmulhrsw m5, m7
pmaddubsw m6, [r3 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6
palignr m6, m2, m0, 6
pmaddubsw m6, [r3 + 16] ; [17]
pmulhrsw m6, m7
palignr m1, m2, m0, 8
palignr m2, m0, 10
pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
pmulhrsw m3, m7
packuswb m6, m3
pmaddubsw m1, [r3 + 11 * 16] ; [27]
pmulhrsw m1, m7
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1   ; direct (non-transposed) store
lea r0, [r6 + 8]                       ; second pass fills the right 8 columns
add r2, 8
dec r4
jnz .loop
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_5(pixel* dst, intptr_t dstStride, const pixel* srcPix,
;                         int dirMode, int bFilter)
; 16x16 angular intra prediction, mode 5, SSE4.
; One reference load per pass suffices: mode 5 advances less than one integer
; pixel per two rows, so all 16 rows come from palignr offsets 0..14 of the
; same interleaved pair vectors.  Output is stored transposed.
; r0 = dst, r1 = dstStride, r2 = srcPix.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_5, 3,7,8
add r2, 32                             ; NOTE(review): presumably selects the second
                                       ; reference array at srcPix + 32 — confirm layout
lea r3, [ang_table + 16 * 16]          ; r3 -> center of ang_table (fraction 16)
mov r4d, 2                             ; two passes of 8 columns each
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]                     ; pmulhrsw rounding constant
.loop:
movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m3, m1 ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m5, m2, m3, 2
pmaddubsw m4, m3, [r3 + 16] ; [17]
pmulhrsw m4, m7
pmaddubsw m1, m5, [r3 - 14 * 16] ; [2]
pmulhrsw m1, m7
packuswb m4, m1
palignr m6, m2, m3, 4
pmaddubsw m5, [r3 + 3 * 16] ; [19]
pmulhrsw m5, m7
pmaddubsw m1, m6, [r3 - 12 * 16] ; [4]
pmulhrsw m1, m7
packuswb m5, m1
palignr m1, m2, m3, 6
pmaddubsw m6, [r3 + 5 * 16] ; [21]
pmulhrsw m6, m7
pmaddubsw m0, m1, [r3 - 10 * 16] ; [6]
pmulhrsw m0, m7
packuswb m6, m0
palignr m0, m2, m3, 8
pmaddubsw m1, [r3 + 7 * 16] ; [23]
pmulhrsw m1, m7
pmaddubsw m0, [r3 - 8 * 16] ; [8]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1   ; first 8 rows, stored transposed
palignr m4, m2, m3, 8
palignr m5, m2, m3, 10
pmaddubsw m4, [r3 + 9 * 16] ; [25]
pmulhrsw m4, m7
pmaddubsw m1, m5, [r3 - 6 * 16] ; [10]
pmulhrsw m1, m7
packuswb m4, m1
palignr m6, m2, m3, 12
pmaddubsw m5, [r3 + 11 * 16] ; [27]
pmulhrsw m5, m7
pmaddubsw m1, m6, [r3 - 4 * 16] ; [12]
pmulhrsw m1, m7
packuswb m5, m1
palignr m1, m2, m3, 14
pmaddubsw m6, [r3 + 13 * 16] ; [29]
pmulhrsw m6, m7
pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, [r3 + 15 * 16] ; [31]
pmulhrsw m1, m7
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1   ; second 8 rows, stored transposed
lea r0, [r6 + r1 * 4]                  ; advance dst to next 8-row band
lea r6, [r6 + r1 * 8]
add r2, 8                              ; advance reference by 8 pixels
dec r4
jnz .loop
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_31(pixel* dst, intptr_t dstStride, const pixel* srcPix,
;                          int dirMode, int bFilter)
; 16x16 angular intra prediction, mode 31, SSE4.
; Mirror of mode 5: identical fraction sequence, but stores are direct
; (transpose flag = 0) since mode >= 18 predicts from the above reference.
; r0 = dst, r1 = dstStride, r2 = srcPix.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_31, 3,7,8
lea r3, [ang_table + 16 * 16]          ; r3 -> center of ang_table (fraction 16)
mov r4d, 2                             ; two passes of 8 columns each
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0                             ; remember dst origin for the column step
mova m7, [pw_1024]                     ; pmulhrsw rounding constant
.loop:
movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m3, m1 ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m5, m2, m3, 2
pmaddubsw m4, m3, [r3 + 16] ; [17]
pmulhrsw m4, m7
pmaddubsw m1, m5, [r3 - 14 * 16] ; [2]
pmulhrsw m1, m7
packuswb m4, m1
palignr m6, m2, m3, 4
pmaddubsw m5, [r3 + 3 * 16] ; [19]
pmulhrsw m5, m7
pmaddubsw m1, m6, [r3 - 12 * 16] ; [4]
pmulhrsw m1, m7
packuswb m5, m1
palignr m1, m2, m3, 6
pmaddubsw m6, [r3 + 5 * 16] ; [21]
pmulhrsw m6, m7
pmaddubsw m0, m1, [r3 - 10 * 16] ; [6]
pmulhrsw m0, m7
packuswb m6, m0
palignr m0, m2, m3, 8
pmaddubsw m1, [r3 + 7 * 16] ; [23]
pmulhrsw m1, m7
pmaddubsw m0, [r3 - 8 * 16] ; [8]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1   ; direct (non-transposed) store
palignr m4, m2, m3, 8
palignr m5, m2, m3, 10
pmaddubsw m4, [r3 + 9 * 16] ; [25]
pmulhrsw m4, m7
pmaddubsw m1, m5, [r3 - 6 * 16] ; [10]
pmulhrsw m1, m7
packuswb m4, m1
palignr m6, m2, m3, 12
pmaddubsw m5, [r3 + 11 * 16] ; [27]
pmulhrsw m5, m7
pmaddubsw m1, m6, [r3 - 4 * 16] ; [12]
pmulhrsw m1, m7
packuswb m5, m1
palignr m1, m2, m3, 14
pmaddubsw m6, [r3 + 13 * 16] ; [29]
pmulhrsw m6, m7
pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, [r3 + 15 * 16] ; [31]
pmulhrsw m1, m7
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1   ; direct (non-transposed) store
lea r0, [r6 + 8]                       ; second pass fills the right 8 columns
add r2, 8
dec r4
jnz .loop
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_6(pixel* dst, intptr_t dstStride, const pixel* srcPix,
;                         int dirMode, int bFilter)
; 16x16 angular intra prediction, mode 6, SSE4.
; Shallow angle: several consecutive rows share the same integer reference
; offset (same palignr shift, different ang_table fraction).  Output is
; stored transposed.  r0 = dst, r1 = dstStride, r2 = srcPix.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_6, 3,7,8
add r2, 32                             ; NOTE(review): presumably selects the second
                                       ; reference array at srcPix + 32 — confirm layout
lea r3, [ang_table + 16 * 16]          ; r3 -> center of ang_table (fraction 16)
mov r4d, 2                             ; two passes of 8 columns each
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]                     ; pmulhrsw rounding constant
.loop:
movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
pmaddubsw m4, m3, [r3 - 3 * 16] ; [13]
pmulhrsw m4, m7
pmaddubsw m1, m3, [r3 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m4, m1
palignr m6, m2, m3, 2
pmaddubsw m5, m6, [r3 - 9 * 16] ; [7]
pmulhrsw m5, m7
pmaddubsw m6, [r3 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
palignr m1, m2, m3, 4
pmaddubsw m6, m1, [r3 - 15 * 16] ; [1]
pmulhrsw m6, m7
pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
pmulhrsw m0, m7
packuswb m6, m0
palignr m0, m2, m3, 6
pmaddubsw m1, [r3 + 11 * 16] ; [27]
pmulhrsw m1, m7
pmaddubsw m0, [r3 - 8 * 16] ; [8]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1   ; first 8 rows, stored transposed
palignr m4, m2, m3, 6
palignr m6, m2, m3, 8
pmaddubsw m4, [r3 + 5 * 16] ; [21]
pmulhrsw m4, m7
pmaddubsw m1, m6, [r3 - 14 * 16] ; [2]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, m6, [r3 - 16] ; [15]
pmulhrsw m5, m7
pmaddubsw m6, [r3 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6
palignr m0, m2, m3, 10
pmaddubsw m6, m0, [r3 - 7 * 16] ; [9]
pmulhrsw m6, m7
pmaddubsw m0, [r3 + 6 * 16] ; [22]
pmulhrsw m0, m7
packuswb m6, m0
palignr m2, m3, 12
pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
pmulhrsw m1, m7
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1   ; second 8 rows, stored transposed
lea r0, [r6 + r1 * 4]                  ; advance dst to next 8-row band
lea r6, [r6 + r1 * 8]
add r2, 8                              ; advance reference by 8 pixels
dec r4
jnz .loop
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_30(pixel* dst, intptr_t dstStride, const pixel* srcPix,
;                          int dirMode, int bFilter)
; 16x16 angular intra prediction, mode 30, SSE4.
; Mirror of mode 6: identical fraction sequence, but stores are direct
; (transpose flag = 0) since mode >= 18 predicts from the above reference.
; r0 = dst, r1 = dstStride, r2 = srcPix.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_30, 3,7,8
lea r3, [ang_table + 16 * 16]          ; r3 -> center of ang_table (fraction 16)
mov r4d, 2                             ; two passes of 8 columns each
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0                             ; remember dst origin for the column step
mova m7, [pw_1024]                     ; pmulhrsw rounding constant
.loop:
movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
pmaddubsw m4, m3, [r3 - 3 * 16] ; [13]
pmulhrsw m4, m7
pmaddubsw m1, m3, [r3 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m4, m1
palignr m6, m2, m3, 2
pmaddubsw m5, m6, [r3 - 9 * 16] ; [7]
pmulhrsw m5, m7
pmaddubsw m6, [r3 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
palignr m1, m2, m3, 4
pmaddubsw m6, m1, [r3 - 15 * 16] ; [1]
pmulhrsw m6, m7
pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
pmulhrsw m0, m7
packuswb m6, m0
palignr m0, m2, m3, 6
pmaddubsw m1, [r3 + 11 * 16] ; [27]
pmulhrsw m1, m7
pmaddubsw m0, [r3 - 8 * 16] ; [8]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1   ; direct (non-transposed) store
palignr m4, m2, m3, 6
palignr m6, m2, m3, 8
pmaddubsw m4, [r3 + 5 * 16] ; [21]
pmulhrsw m4, m7
pmaddubsw m1, m6, [r3 - 14 * 16] ; [2]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, m6, [r3 - 16] ; [15]
pmulhrsw m5, m7
pmaddubsw m6, [r3 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6
palignr m0, m2, m3, 10
pmaddubsw m6, m0, [r3 - 7 * 16] ; [9]
pmulhrsw m6, m7
pmaddubsw m0, [r3 + 6 * 16] ; [22]
pmulhrsw m0, m7
packuswb m6, m0
palignr m2, m3, 12
pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
pmulhrsw m1, m7
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1   ; direct (non-transposed) store
lea r0, [r6 + 8]                       ; second pass fills the right 8 columns
add r2, 8
dec r4
jnz .loop
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_7(pixel* dst, intptr_t dstStride, const pixel* srcPix,
;                         int dirMode, int bFilter)
; 16x16 angular intra prediction, mode 7, SSE4.
; Very shallow angle: up to four rows share one integer reference offset
; (same palignr shift, different ang_table fraction).  Output is stored
; transposed.  r0 = dst, r1 = dstStride, r2 = srcPix.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_7, 3,7,8
add r2, 32                             ; NOTE(review): presumably selects the second
                                       ; reference array at srcPix + 32 — confirm layout
lea r3, [ang_table + 16 * 16]          ; r3 -> center of ang_table (fraction 16)
mov r4d, 2                             ; two passes of 8 columns each
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]                     ; pmulhrsw rounding constant
.loop:
movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
pmaddubsw m4, m3, [r3 - 7 * 16] ; [9]
pmulhrsw m4, m7
pmaddubsw m0, m3, [r3 + 2 * 16] ; [18]
pmulhrsw m0, m7
packuswb m4, m0
palignr m1, m2, m3, 2
pmaddubsw m5, m3, [r3 + 11 * 16] ; [27]
pmulhrsw m5, m7
pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
pmulhrsw m6, m7
pmaddubsw m0, m1, [r3 + 6 * 16] ; [22]
pmulhrsw m0, m7
packuswb m6, m0
palignr m0, m2, m3, 4
pmaddubsw m1, [r3 + 15 * 16] ; [31]
pmulhrsw m1, m7
pmaddubsw m0, [r3 - 8 * 16] ; [8]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1   ; first 8 rows, stored transposed
palignr m1, m2, m3, 4
pmaddubsw m4, m1, [r3 + 16] ; [17]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m4, m1
palignr m0, m2, m3, 6
pmaddubsw m5, m0, [r3 - 13 * 16] ; [03]
pmulhrsw m5, m7
pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m0, [r3 + 5 * 16] ; [21]
pmulhrsw m6, m7
pmaddubsw m0, [r3 + 14 * 16] ; [30]
pmulhrsw m0, m7
packuswb m6, m0
palignr m2, m3, 8
pmaddubsw m1, m2, [r3 - 9 * 16] ; [07]
pmulhrsw m1, m7
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1   ; second 8 rows, stored transposed
lea r0, [r6 + r1 * 4]                  ; advance dst to next 8-row band
lea r6, [r6 + r1 * 8]
add r2, 8                              ; advance reference by 8 pixels
dec r4
jnz .loop
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_29(pixel* dst, intptr_t dstStride, const pixel* srcPix,
;                          int dirMode, int bFilter)
; 16x16 angular intra prediction, mode 29, SSE4.
; Mirror of mode 7: identical fraction sequence, but stores are direct
; (transpose flag = 0) since mode >= 18 predicts from the above reference.
; r0 = dst, r1 = dstStride, r2 = srcPix.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_29, 3,7,8
lea r3, [ang_table + 16 * 16]          ; r3 -> center of ang_table (fraction 16)
mov r4d, 2                             ; two passes of 8 columns each
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0                             ; remember dst origin for the column step
mova m7, [pw_1024]                     ; pmulhrsw rounding constant
.loop:
movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
pmaddubsw m4, m3, [r3 - 7 * 16] ; [9]
pmulhrsw m4, m7
pmaddubsw m0, m3, [r3 + 2 * 16] ; [18]
pmulhrsw m0, m7
packuswb m4, m0
palignr m1, m2, m3, 2
pmaddubsw m5, m3, [r3 + 11 * 16] ; [27]
pmulhrsw m5, m7
pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
pmulhrsw m6, m7
pmaddubsw m0, m1, [r3 + 6 * 16] ; [22]
pmulhrsw m0, m7
packuswb m6, m0
palignr m0, m2, m3, 4
pmaddubsw m1, [r3 + 15 * 16] ; [31]
pmulhrsw m1, m7
pmaddubsw m0, [r3 - 8 * 16] ; [8]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1   ; direct (non-transposed) store
palignr m1, m2, m3, 4
pmaddubsw m4, m1, [r3 + 16] ; [17]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m4, m1
palignr m0, m2, m3, 6
pmaddubsw m5, m0, [r3 - 13 * 16] ; [03]
pmulhrsw m5, m7
pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m0, [r3 + 5 * 16] ; [21]
pmulhrsw m6, m7
pmaddubsw m0, [r3 + 14 * 16] ; [30]
pmulhrsw m0, m7
packuswb m6, m0
palignr m2, m3, 8
pmaddubsw m1, m2, [r3 - 9 * 16] ; [07]
pmulhrsw m1, m7
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1   ; direct (non-transposed) store
lea r0, [r6 + 8]                       ; second pass fills the right 8 columns
add r2, 8
dec r4
jnz .loop
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_8(pixel* dst, intptr_t dstStride, const pixel* srcPix,
;                         int dirMode, int bFilter)
; 16x16 angular intra prediction, mode 8, SSE4.
; Nearly-diagonal shallow angle: six rows reuse the same interleaved pair
; vector m1 (offset 0), the rest use palignr offsets 2 and 4.  Output is
; stored transposed.  r0 = dst, r1 = dstStride, r2 = srcPix.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_8, 3,7,8
add r2, 32                             ; NOTE(review): presumably selects the second
                                       ; reference array at srcPix + 32 — confirm layout
lea r3, [ang_table + 16 * 16]          ; r3 -> center of ang_table (fraction 16)
mov r4d, 2                             ; two passes of 8 columns each
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]                     ; pmulhrsw rounding constant
.loop:
movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
pmaddubsw m4, m1, [r3 - 11 * 16] ; [5]
pmulhrsw m4, m7
pmaddubsw m2, m1, [r3 - 6 * 16] ; [10]
pmulhrsw m2, m7
packuswb m4, m2
pmaddubsw m5, m1, [r3 - 1 * 16] ; [15]
pmulhrsw m5, m7
pmaddubsw m6, m1, [r3 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m1, [r3 + 9 * 16] ; [25]
pmulhrsw m6, m7
pmaddubsw m2, m1, [r3 + 14 * 16] ; [30]
pmulhrsw m2, m7
packuswb m6, m2
palignr m2, m0, m1, 2
palignr m3, m0, m1, 4
pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
pmulhrsw m1, m7
pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1   ; first 8 rows, stored transposed
pmaddubsw m4, m2, [r3 - 3 * 16] ; [13]
pmulhrsw m4, m7
pmaddubsw m5, m2, [r3 + 2 * 16] ; [18]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m2, [r3 + 7 * 16] ; [23]
pmulhrsw m5, m7
pmaddubsw m2, [r3 + 12 * 16] ; [28]
pmulhrsw m2, m7
packuswb m5, m2
pmaddubsw m6, m3, [r3 - 15 * 16] ; [01]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r3 - 10 * 16] ; [06]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m3, [r3 - 5 * 16] ; [11]
pmulhrsw m1, m7
pmaddubsw m3, [r3] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1   ; second 8 rows, stored transposed
lea r0, [r6 + r1 * 4]                  ; advance dst to next 8-row band
lea r6, [r6 + r1 * 8]
add r2, 8                              ; advance reference by 8 pixels
dec r4
jnz .loop
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_28(pixel* dst, intptr_t dstStride, const pixel* srcPix,
;                          int dirMode, int bFilter)
; 16x16 angular intra prediction, mode 28, SSE4.
; Mirror of mode 8: identical fraction sequence, but stores are direct
; (transpose flag = 0) since mode >= 18 predicts from the above reference.
; r0 = dst, r1 = dstStride, r2 = srcPix.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_28, 3,7,8
lea r3, [ang_table + 16 * 16]          ; r3 -> center of ang_table (fraction 16)
mov r4d, 2                             ; two passes of 8 columns each
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0                             ; remember dst origin for the column step
mova m7, [pw_1024]                     ; pmulhrsw rounding constant
.loop:
movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
pmaddubsw m4, m1, [r3 - 11 * 16] ; [5]
pmulhrsw m4, m7
pmaddubsw m2, m1, [r3 - 6 * 16] ; [10]
pmulhrsw m2, m7
packuswb m4, m2
pmaddubsw m5, m1, [r3 - 1 * 16] ; [15]
pmulhrsw m5, m7
pmaddubsw m6, m1, [r3 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m1, [r3 + 9 * 16] ; [25]
pmulhrsw m6, m7
pmaddubsw m2, m1, [r3 + 14 * 16] ; [30]
pmulhrsw m2, m7
packuswb m6, m2
palignr m2, m0, m1, 2
palignr m3, m0, m1, 4
pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
pmulhrsw m1, m7
pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1   ; direct (non-transposed) store
pmaddubsw m4, m2, [r3 - 3 * 16] ; [13]
pmulhrsw m4, m7
pmaddubsw m5, m2, [r3 + 2 * 16] ; [18]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m2, [r3 + 7 * 16] ; [23]
pmulhrsw m5, m7
pmaddubsw m2, [r3 + 12 * 16] ; [28]
pmulhrsw m2, m7
packuswb m5, m2
pmaddubsw m6, m3, [r3 - 15 * 16] ; [01]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r3 - 10 * 16] ; [06]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m3, [r3 - 5 * 16] ; [11]
pmulhrsw m1, m7
pmaddubsw m3, [r3] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1   ; direct (non-transposed) store
lea r0, [r6 + 8]                       ; second pass fills the right 8 columns
add r2, 8
dec r4
jnz .loop
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_9(pixel* dst, intptr_t dstStride, const pixel* srcPix,
;                         int dirMode, int bFilter)
; 16x16 angular intra prediction, mode 9, SSE4.
; The shallowest non-pure-horizontal angle: every row uses the same reference
; pair vector m2, only the ang_table fraction changes (2,4,...,30, then 0).
; Output is stored transposed.  r0 = dst, r1 = dstStride, r2 = srcPix.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_9, 3,7,8
add r2, 32                             ; NOTE(review): presumably selects the second
                                       ; reference array at srcPix + 32 — confirm layout
lea r3, [ang_table + 16 * 16]          ; r3 -> center of ang_table (fraction 16)
mov r4d, 2                             ; two passes of 8 columns each
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]                     ; pmulhrsw rounding constant
.loop:
movu m2, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m3, m2, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpcklbw m2, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
pmaddubsw m4, m2, [r3 - 14 * 16] ; [2]
pmulhrsw m4, m7
pmaddubsw m0, m2, [r3 - 12 * 16] ; [4]
pmulhrsw m0, m7
packuswb m4, m0
pmaddubsw m5, m2, [r3 - 10 * 16] ; [6]
pmulhrsw m5, m7
pmaddubsw m6, m2, [r3 - 8 * 16] ; [8]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m2, [r3 - 6 * 16] ; [10]
pmulhrsw m6, m7
pmaddubsw m0, m2, [r3 - 4 * 16] ; [12]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m2, [r3 - 2 * 16] ; [14]
pmulhrsw m1, m7
pmaddubsw m0, m2, [r3] ; [16]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1   ; first 8 rows, stored transposed
pmaddubsw m4, m2, [r3 + 2 * 16] ; [18]
pmulhrsw m4, m7
pmaddubsw m5, m2, [r3 + 4 * 16] ; [20]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m2, [r3 + 6 * 16] ; [22]
pmulhrsw m5, m7
pmaddubsw m6, m2, [r3 + 8 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m2, [r3 + 10 * 16] ; [26]
pmulhrsw m6, m7
pmaddubsw m1, m2, [r3 + 12 * 16] ; [28]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m2, [r3 + 14 * 16] ; [30]
pmulhrsw m1, m7
packuswb m1, m1
punpcklqdq m1, m3 ; [00]               ; fraction 0 row: copy shifted reference
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1   ; second 8 rows, stored transposed
lea r0, [r6 + r1 * 4]                  ; advance dst to next 8-row band
lea r6, [r6 + r1 * 8]
add r2, 8                              ; advance reference by 8 pixels
dec r4
jnz .loop
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_27(pixel* dst, intptr_t dstStride, const pixel* srcPix,
;                          int dirMode, int bFilter)
; 16x16 angular intra prediction, mode 27, SSE4.
; Mirror of mode 9: same reference pair vector for all rows with fractions
; 2..30, then the fraction-0 row copies the shifted reference (m2).  The first
; 8 rows go through TRANSPOSE_STORE_8x8 with transpose flag = 0, the last 8
; are written row-by-row with movh/movhps.  r0 = dst, r1 = dstStride,
; r2 = srcPix.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_27, 3,7,8
lea r3, [ang_table + 16 * 16]          ; r3 -> center of ang_table (fraction 16)
mov r4d, 2                             ; two passes of 8 columns each
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0                             ; remember dst origin for the column step
mova m7, [pw_1024]                     ; pmulhrsw rounding constant
.loop:
movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m2, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpcklbw m3, m2 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
pmaddubsw m4, m3, [r3 - 14 * 16] ; [2]
pmulhrsw m4, m7
pmaddubsw m0, m3, [r3 - 12 * 16] ; [4]
pmulhrsw m0, m7
packuswb m4, m0
pmaddubsw m5, m3, [r3 - 10 * 16] ; [6]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r3 - 8 * 16] ; [8]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r3 - 6 * 16] ; [10]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r3 - 4 * 16] ; [12]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m3, [r3 - 2 * 16] ; [14]
pmulhrsw m1, m7
pmaddubsw m0, m3, [r3] ; [16]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1   ; direct (non-transposed) store
pmaddubsw m4, m3, [r3 + 2 * 16] ; [18]
pmulhrsw m4, m7
pmaddubsw m5, m3, [r3 + 4 * 16] ; [20]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r3 + 6 * 16] ; [22]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r3 + 10 * 16] ; [26]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r3 + 12 * 16] ; [28]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m3, [r3 + 14 * 16] ; [30]
pmulhrsw m1, m7
packuswb m1, m1
movh [r0 ], m4                         ; write the last 8 rows row-by-row
movhps [r0 + r1 ], m4
movh [r0 + r1 * 2], m5
movhps [r0 + r5 ], m5
lea r0, [r0 + r1 * 4]
movh [r0 ], m6
movhps [r0 + r1 ], m6
movh [r0 + r1 * 2], m1
movh [r0 + r5 ], m2                    ; fraction 0 row: reference shifted by 1
lea r0, [r6 + 8]                       ; second pass fills the right 8 columns
add r2, 8
dec r4
jnz .loop
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_10(pixel* dst, intptr_t dstStride, const pixel* srcPix,
;                          int dirMode, int bFilter)
; 16x16 pure-horizontal intra prediction (mode 10), SSE4.
; Row i of dst is filled with the broadcast of reference pixel i (pshufb with
; an all-zero mask replicates byte 0).  When r4 (bFilter) is non-zero, the
; first row is additionally filtered: row0 + ((other_ref - corner) >> 1),
; computed in 16-bit and repacked.  r0 = dst, r1 = dstStride, r2 = srcPix,
; r4 = bFilter.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_10, 5,6,8
lea r5, [r1 * 3]                       ; r5 -> 3 * stride
pxor m7, m7                            ; zero shuffle mask -> broadcast byte 0
movu m0, [r2 + 1 + 32]                 ; NOTE(review): presumably the 16 left
                                       ; reference pixels at srcPix + 33 — confirm
palignr m1, m0, 1
pshufb m1, m7                          ; broadcast pixel 1 -> row 1
palignr m2, m0, 2
pshufb m2, m7
palignr m3, m0, 3
pshufb m3, m7
palignr m4, m0, 4
pshufb m4, m7
palignr m5, m0, 5
pshufb m5, m7
palignr m6, m0, 6
pshufb m6, m7
movu [r0 + r1], m1
movu [r0 + r1 * 2], m2
movu [r0 + r5], m3
lea r3, [r0 + r1 * 4]
movu [r3], m4
movu [r3 + r1], m5
movu [r3 + r1 * 2], m6
palignr m1, m0, 7
pshufb m1, m7
movhlps m2, m0                         ; pixel 8 (high half) without palignr
pshufb m2, m7
palignr m3, m0, 9
pshufb m3, m7
palignr m4, m0, 10
pshufb m4, m7
palignr m5, m0, 11
pshufb m5, m7
palignr m6, m0, 12
pshufb m6, m7
movu [r3 + r5], m1
lea r3, [r3 + r1 * 4]
movu [r3], m2
movu [r3 + r1], m3
movu [r3 + r1 * 2], m4
movu [r3 + r5], m5
lea r3, [r3 + r1 * 4]
movu [r3], m6
palignr m1, m0, 13
pshufb m1, m7
palignr m2, m0, 14
pshufb m2, m7
palignr m3, m0, 15
pshufb m3, m7
pshufb m0, m7                          ; m0 = row 0 broadcast (pixel 0)
movu [r3 + r1], m1
movu [r3 + r1 * 2], m2
movu [r3 + r5], m3
; filter
cmp r4w, byte 0                        ; bFilter == 0 -> skip row-0 filtering
jz .quit
pmovzxbw m0, m0                        ; widen row 0 to 16-bit for the add
mova m1, m0
movu m2, [r2]                          ; corner pixel (broadcast below)
movu m3, [r2 + 1]                      ; other-direction reference pixels
pshufb m2, m7
pmovzxbw m2, m2
movhlps m4, m3
pmovzxbw m3, m3
pmovzxbw m4, m4
psubw m3, m2                           ; (ref - corner)
psubw m4, m2
psraw m3, 1                            ; >> 1 (arithmetic)
psraw m4, 1
paddw m0, m3                           ; row0 + delta, low/high 8 pixels
paddw m1, m4
packuswb m0, m1                        ; clamp back to 8-bit
.quit:
movu [r0], m0                          ; store (possibly filtered) row 0
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_26(pixel* dst, intptr_t dstStride, const pixel* srcPix,
;                          int dirMode, int bFilter)
; 16x16 pure-vertical intra prediction (mode 26), SSE4.
; All 16 rows are a copy of the 16 reference pixels at srcPix + 1.  When
; bFilter is non-zero, the first COLUMN is filtered: col0 + ((side_ref -
; corner) >> 1), computed in 16-bit and written back byte-by-byte via pextrb.
; On x86-64 the 5th arg is fetched from its home slot into r7; on x86-32 it
; is spilled to a 4-byte stack local.
;-----------------------------------------------------------------------------
INIT_XMM sse4
%if ARCH_X86_64 == 1
cglobal intra_pred_ang16_26, 3,8,5
mov r7, r4mp                           ; r7 = bFilter (5th argument)
%define bfilter r7w
%else
cglobal intra_pred_ang16_26, 5,7,5,0-4
%define bfilter dword[rsp]
mov bfilter, r4
%endif
movu m0, [r2 + 1]                      ; the 16 above-reference pixels
lea r4, [r1 * 3]                       ; r4 -> 3 * stride
lea r3, [r0 + r1 * 4]                  ; r3/r5/r6 -> rows 4/8/12
lea r5, [r3 + r1 * 4]
lea r6, [r5 + r1 * 4]
movu [r0], m0                          ; replicate the reference into all rows
movu [r0 + r1], m0
movu [r0 + r1 * 2], m0
movu [r0 + r4], m0
movu [r3], m0
movu [r3 + r1], m0
movu [r3 + r1 * 2], m0
movu [r3 + r4], m0
movu [r5], m0
movu [r5 + r1], m0
movu [r5 + r1 * 2], m0
movu [r5 + r4], m0
movu [r6], m0
movu [r6 + r1], m0
movu [r6 + r1 * 2], m0
movu [r6 + r4], m0
; filter
cmp bfilter, byte 0                    ; bFilter == 0 -> done
jz .quit
pxor m4, m4
pshufb m0, m4                          ; broadcast first reference pixel
pmovzxbw m0, m0
mova m1, m0
movu m2, [r2 + 32]                     ; NOTE(review): presumably the left
pinsrb m2, [r2], 0                     ; reference with the corner re-inserted
movu m3, [r2 + 1 + 32]                 ; at byte 0 — confirm layout
pshufb m2, m4
pmovzxbw m2, m2
movhlps m4, m3
pmovzxbw m3, m3
pmovzxbw m4, m4
psubw m3, m2                           ; (ref - corner) >> 1, low/high halves
psubw m4, m2
psraw m3, 1
psraw m4, 1
paddw m0, m3
paddw m1, m4
packuswb m0, m1                        ; clamped filtered first column
pextrb [r0], m0, 0                     ; scatter the 16 bytes down column 0
pextrb [r0 + r1], m0, 1
pextrb [r0 + r1 * 2], m0, 2
pextrb [r0 + r4], m0, 3
pextrb [r3], m0, 4
pextrb [r3 + r1], m0, 5
pextrb [r3 + r1 * 2], m0, 6
pextrb [r3 + r4], m0, 7
pextrb [r5], m0, 8
pextrb [r5 + r1], m0, 9
pextrb [r5 + r1 * 2], m0, 10
pextrb [r5 + r4], m0, 11
pextrb [r6], m0, 12
pextrb [r6 + r1], m0, 13
pextrb [r6 + r1 * 2], m0, 14
pextrb [r6 + r4], m0, 15
.quit:
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_11(pixel* dst, intptr_t dstStride, const pixel* srcPix,
;                          int dirMode, int bFilter)
; 16x16 angular intra prediction, mode 11, SSE4.
; Near-horizontal negative angle: all rows use the single pair vector m3 with
; descending fractions 30,28,...,2, then fraction 0 copies the reference.
; The first half reads the left reference with the corner pixel pinned into
; byte 0; the second half continues at srcPix + 40.  Fully unrolled (no loop);
; output is stored transposed.  r0 = dst, r1 = dstStride, r2 = srcPix.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_11, 3,7,8
lea r3, [ang_table + 16 * 16]          ; r3 -> center of ang_table (fraction 16)
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]                     ; pmulhrsw rounding constant
movu m3, [r2 + 32] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
pinsrb m3, [r2], 0                     ; pin corner pixel into byte 0
mova m2, m3                            ; keep raw reference for the [00] row
palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
pmaddubsw m4, m3, [r3 + 14 * 16] ; [30]
pmulhrsw m4, m7
pmaddubsw m0, m3, [r3 + 12 * 16] ; [28]
pmulhrsw m0, m7
packuswb m4, m0
pmaddubsw m5, m3, [r3 + 10 * 16] ; [26]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r3 + 6 * 16] ; [22]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r3 + 4 * 16] ; [20]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m3, [r3 + 2 * 16] ; [18]
pmulhrsw m1, m7
pmaddubsw m0, m3, [r3] ; [16]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1   ; rows 0-7, stored transposed
pmaddubsw m4, m3, [r3 - 2 * 16] ; [14]
pmulhrsw m4, m7
pmaddubsw m5, m3, [r3 - 4 * 16] ; [12]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r3 - 6 * 16] ; [10]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r3 - 8 * 16] ; [08]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r3 - 10 * 16] ; [06]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r3 - 12 * 16] ; [04]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m3, [r3 - 14 * 16] ; [02]
pmulhrsw m1, m7
packuswb m1, m1
punpcklqdq m1, m2 ;[00]                ; fraction 0 row: raw reference
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1   ; rows 8-15, stored transposed
lea r0, [r6 + r1 * 4]                  ; advance dst to the lower 8-row band
lea r6, [r6 + r1 * 8]
movu m3, [r2 + 40] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
mova m2, m3                            ; keep raw reference for the [00] row
palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
pmaddubsw m4, m3, [r3 + 14 * 16] ; [30]
pmulhrsw m4, m7
pmaddubsw m0, m3, [r3 + 12 * 16] ; [28]
pmulhrsw m0, m7
packuswb m4, m0
pmaddubsw m5, m3, [r3 + 10 * 16] ; [26]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r3 + 6 * 16] ; [22]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r3 + 4 * 16] ; [20]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m3, [r3 + 2 * 16] ; [18]
pmulhrsw m1, m7
pmaddubsw m0, m3, [r3] ; [16]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1   ; second band, rows 0-7
pmaddubsw m4, m3, [r3 - 2 * 16] ; [14]
pmulhrsw m4, m7
pmaddubsw m5, m3, [r3 - 4 * 16] ; [12]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r3 - 6 * 16] ; [10]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r3 - 8 * 16] ; [08]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r3 - 10 * 16] ; [06]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r3 - 12 * 16] ; [04]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m3, [r3 - 14 * 16] ; [02]
pmulhrsw m1, m7
packuswb m1, m1
punpcklqdq m1, m2 ;[00]                ; fraction 0 row: raw reference
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1   ; second band, rows 8-15
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_25(pixel* dst, intptr_t dstStride, const pixel* srcPix,
;                          int dirMode, int bFilter)
; 16x16 angular intra prediction, mode 25, SSE4.
; Mirror of mode 11: single pair vector m3 per pass with fractions
; 30,28,...,2, then fraction 0 copies the raw reference (m2).  First 8 rows
; via TRANSPOSE_STORE_8x8 (transpose flag = 0), last 8 written row-by-row.
; r0 = dst, r1 = dstStride, r2 = srcPix.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_25, 3,7,8
lea r3, [ang_table + 16 * 16]          ; r3 -> center of ang_table (fraction 16)
mov r4d, 2                             ; two passes of 8 columns each
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0                             ; remember dst origin for the column step
mova m7, [pw_1024]                     ; pmulhrsw rounding constant
.loop:
movu m3, [r2] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
mova m2, m3                            ; keep raw reference for the [00] row
palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
pmaddubsw m4, m3, [r3 + 14 * 16] ; [30]
pmulhrsw m4, m7
pmaddubsw m0, m3, [r3 + 12 * 16] ; [28]
pmulhrsw m0, m7
packuswb m4, m0
pmaddubsw m5, m3, [r3 + 10 * 16] ; [26]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r3 + 6 * 16] ; [22]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r3 + 4 * 16] ; [20]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m3, [r3 + 2 * 16] ; [18]
pmulhrsw m1, m7
pmaddubsw m0, m3, [r3] ; [16]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1   ; direct (non-transposed) store
pmaddubsw m4, m3, [r3 - 2 * 16] ; [14]
pmulhrsw m4, m7
pmaddubsw m5, m3, [r3 - 4 * 16] ; [12]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r3 - 6 * 16] ; [10]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r3 - 8 * 16] ; [08]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r3 - 10 * 16] ; [06]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r3 - 12 * 16] ; [04]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m3, [r3 - 14 * 16] ; [02]
pmulhrsw m1, m7
packuswb m1, m1
movh [r0 ], m4                         ; write the last 8 rows row-by-row
movhps [r0 + r1 ], m4
movh [r0 + r1 * 2], m5
movhps [r0 + r5 ], m5
lea r0, [r0 + r1 * 4]
movh [r0 ], m6
movhps [r0 + r1 ], m6
movh [r0 + r1 * 2], m1
movh [r0 + r5 ], m2                    ; fraction 0 row: raw reference
lea r0, [r6 + 8]                       ; second pass fills the right 8 columns
add r2, 8
dec r4
jnz .loop
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_12(pixel* dst, intptr_t dstStride, const pixel* srcPix,
;                          int dirMode, int bFilter)
; 16x16 angular intra prediction, mode 12, SSE4.
; Negative angle: later rows need pixels projected from the other reference
; direction.  m2 holds those projected pixels (gathered with the c_mode16_12
; shuffle) and is merged in front of the main reference via palignr/pslldq
; as the prediction steps backwards.  Fully unrolled; output is stored
; transposed.  r0 = dst, r1 = dstStride, r2 = srcPix, r4 used as table base.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_12, 4,7,8
lea r4, [ang_table + 16 * 16]          ; r4 -> center of ang_table (fraction 16)
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]                     ; pmulhrsw rounding constant
movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
pinsrb m3, [r2], 0                     ; pin corner pixel into byte 0
punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
movu m2, [r2]
pshufb m2, [c_mode16_12]               ; gather the projected other-direction pixels
palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
pmulhrsw m4, m7
pmaddubsw m1, m0, [r4 + 6 * 16] ; [22]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, m0, [r4 + 1 * 16] ; [17]
pmulhrsw m5, m7
pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
pmulhrsw m6, m7
pmaddubsw m0, [r4 - 14 * 16] ; [2]
pmulhrsw m0, m7
packuswb m6, m0
palignr m3, m2, 15                     ; shift one projected pixel in front
pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
pmulhrsw m1, m7
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1   ; rows 0-7, stored transposed
pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
pmulhrsw m4, m7
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
pmulhrsw m6, m7
packuswb m5, m6
palignr m3, m2, 14                     ; shift a second projected pixel in
pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
pmulhrsw m1, m7
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1   ; rows 8-15, stored transposed
lea r0, [r6 + r1 * 4]                  ; advance dst to the lower 8-row band
lea r6, [r6 + r1 * 8]
movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x]
pmaddubsw m4, m3, [r4 + 11 * 16] ; [27]
pmulhrsw m4, m7
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 + 1 * 16] ; [17]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r4 - 9 * 16] ; [7]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r4 - 14 * 16] ; [2]
pmulhrsw m0, m7
packuswb m6, m0
palignr m3, m2, 14                     ; merge projected pixels for rows 4-7
pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
pmulhrsw m1, m7
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1   ; second band, rows 0-7
pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
pmulhrsw m4, m7
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1                           ; expose the next projected pixel
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
pmulhrsw m1, m7
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1   ; second band, rows 8-15
RET
;-----------------------------------------------------------------------------
; intra_pred_ang16_24 -- 16x16 angular intra prediction, mode 24 (SSE4).
; r0 = dst, r1 = dst stride, r2 = neighbour-sample buffer
; (main reference loaded from [r2], side reference from [r2 + 32]).
; Each predicted sample is a 2-tap interpolation: pmaddubsw multiplies pairs
; of adjacent reference bytes by a weight pair (32-frac, frac) fetched from
; ang_table (r4 is biased to entry 16 so offsets span [-16*16 .. +15*16]),
; and pmulhrsw with pw_1024 performs (x*1024 + 2^14) >> 15 == (x + 16) >> 5,
; the HEVC rounding by 32. packuswb packs two 8-sample rows back to bytes.
; TRANSPOSE_STORE_8x8 is a macro defined earlier in this file; its second
; argument appears to select transposed (1) vs direct (0) 8x8 stores --
; confirm against the macro definition.
; NOTE(review): this mode reuses the c_mode16_12 shuffle table, presumably
; because modes 12 and 24 are mirrored angles -- verify against the tables.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_24, 4,7,8
lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16 (offset base)
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0 ; remember dst origin for the second half
mova m7, [pw_1024] ; rounding constant for pmulhrsw
; --- first half: left 8 columns ---
movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
movu m2, [r2 + 32] ; side reference (other edge)
pshufb m2, [c_mode16_12] ; pick/reorder projected side samples for this angle
palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
pmulhrsw m4, m7
pmaddubsw m1, m0, [r4 + 6 * 16] ; [22]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, m0, [r4 + 1 * 16] ; [17]
pmulhrsw m5, m7
pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
pmulhrsw m6, m7
pmaddubsw m0, [r4 - 14 * 16] ; [2]
pmulhrsw m0, m7
packuswb m6, m0
palignr m3, m2, 15 ; slide window back one sample into side reference
pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
pmulhrsw m1, m7
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
pmulhrsw m4, m7
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
pmulhrsw m6, m7
packuswb m5, m6
palignr m3, m2, 14 ; window back one more sample
pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
pmulhrsw m1, m7
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
; --- second half: right 8 columns (dst + 8) ---
lea r0, [r6 + 8]
movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x] low half keeps projected side samples
pmaddubsw m4, m3, [r4 + 11 * 16] ; [27]
pmulhrsw m4, m7
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 + 1 * 16] ; [17]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r4 - 9 * 16] ; [7]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r4 - 14 * 16] ; [2]
pmulhrsw m0, m7
packuswb m6, m0
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
pmulhrsw m1, m7
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
pmulhrsw m4, m7
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1 ; feed the next side sample into the window
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
pmulhrsw m1, m7
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
RET
;-----------------------------------------------------------------------------
; intra_pred_ang16_13 -- 16x16 angular intra prediction, mode 13 (SSE4).
; r0 = dst, r1 = dst stride, r2 = neighbour-sample buffer
; (main reference from [r2 + 32] with the corner pixel [r2] inserted at
; index 0; side reference from [r2], reordered via c_mode16_13).
; Per sample: pmaddubsw with an ang_table weight pair (32-frac, frac), then
; pmulhrsw with pw_1024 == (x + 16) >> 5 rounding, then packuswb to bytes.
; TRANSPOSE_STORE_8x8 (macro defined earlier in this file) emits each 8x8;
; second macro argument is 1 here -- presumably transposed store, as used by
; the horizontal-family modes; confirm against the macro definition.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_13, 4,7,8
lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16 (offset base)
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024] ; rounding constant for pmulhrsw
; --- first half: rows 0-7 ---
movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
pinsrb m3, [r2], 0 ; drop in the top-left corner sample
punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
movu m2, [r2]
pshufb m2, [c_mode16_13] ; projected side samples for this angle
palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
pmaddubsw m4, m5, [r4 + 7 * 16] ; [23]
pmulhrsw m4, m7
pmaddubsw m0, m5, [r4 - 2 * 16] ; [14]
pmulhrsw m0, m7
packuswb m4, m0
pmaddubsw m5, [r4 - 11 * 16] ; [05]
pmulhrsw m5, m7
palignr m3, m2, 15 ; slide window back one sample into side reference
pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
pmulhrsw m1, m7
palignr m3, m2, 14 ; window back one more sample
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 - 16] ; [15]
pmulhrsw m4, m7
pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
pmulhrsw m5, m7
packuswb m4, m5
pslldq m2, 1 ; feed next side sample
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
pmulhrsw m1, m7
packuswb m6, m1
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
pmulhrsw m1, m7
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
; --- second half: rows 8-15 ---
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x] low half keeps projected side samples
pmaddubsw m4, m3, [r4 + 7 * 16] ; [23]
pmulhrsw m4, m7
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
pmulhrsw m5, m7
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
pmulhrsw m1, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 - 16] ; [15]
pmulhrsw m4, m7
pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
pmulhrsw m5, m7
packuswb m4, m5
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
pmulhrsw m1, m7
packuswb m6, m1
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
pmulhrsw m1, m7
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
RET
;-----------------------------------------------------------------------------
; intra_pred_ang16_23 -- 16x16 angular intra prediction, mode 23 (SSE4).
; r0 = dst, r1 = dst stride, r2 = neighbour-sample buffer
; (main reference from [r2], side reference from [r2 + 32]).
; Same weight offsets as intra_pred_ang16_13 -- modes 13 and 23 are mirrored
; angles, hence the shared c_mode16_13 shuffle table -- but with the two
; reference edges swapped and a direct (non-transposed, second macro arg 0)
; TRANSPOSE_STORE_8x8 store; confirm flag meaning at the macro definition.
; Per sample: pmaddubsw with an ang_table weight pair (32-frac, frac), then
; pmulhrsw with pw_1024 == (x + 16) >> 5 rounding, then packuswb to bytes.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_23, 4,7,8
lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16 (offset base)
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0 ; remember dst origin for the second half
mova m7, [pw_1024] ; rounding constant for pmulhrsw
; --- first half: left 8 columns ---
movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
movu m2, [r2 + 32]
pshufb m2, [c_mode16_13] ; projected side samples for this angle
palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
pmaddubsw m4, m5, [r4 + 7 * 16] ; [23]
pmulhrsw m4, m7
pmaddubsw m0, m5, [r4 - 2 * 16] ; [14]
pmulhrsw m0, m7
packuswb m4, m0
pmaddubsw m5, [r4 - 11 * 16] ; [05]
pmulhrsw m5, m7
palignr m3, m2, 15 ; slide window back one sample into side reference
pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
pmulhrsw m1, m7
palignr m3, m2, 14 ; window back one more sample
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 - 16] ; [15]
pmulhrsw m4, m7
pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
pmulhrsw m5, m7
packuswb m4, m5
pslldq m2, 1 ; feed next side sample
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
pmulhrsw m1, m7
packuswb m6, m1
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
pmulhrsw m1, m7
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
; --- second half: right 8 columns (dst + 8) ---
lea r0, [r6 + 8]
movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x] low half keeps projected side samples
pmaddubsw m4, m3, [r4 + 7 * 16] ; [23]
pmulhrsw m4, m7
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
pmulhrsw m5, m7
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
pmulhrsw m1, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 - 16] ; [15]
pmulhrsw m4, m7
pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
pmulhrsw m5, m7
packuswb m4, m5
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
pmulhrsw m1, m7
packuswb m6, m1
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
pmulhrsw m1, m7
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
RET
;-----------------------------------------------------------------------------
; intra_pred_ang16_14 -- 16x16 angular intra prediction, mode 14 (SSE4).
; r0 = dst, r1 = dst stride, r2 = neighbour-sample buffer
; (main reference from [r2 + 32] with corner pixel [r2] inserted at index 0;
; side reference from [r2], reordered via c_mode16_14).
; Per sample: pmaddubsw with an ang_table weight pair (32-frac, frac), then
; pmulhrsw with pw_1024 == (x + 16) >> 5 rounding, then packuswb to bytes.
; TRANSPOSE_STORE_8x8 (macro defined earlier) stores each 8x8; second macro
; argument is 1 here -- presumably transposed store; confirm at the macro.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_14, 4,7,8
lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16 (offset base)
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024] ; rounding constant for pmulhrsw
; --- first half: rows 0-7 ---
movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
pinsrb m3, [r2], 0 ; drop in the top-left corner sample
punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
movu m2, [r2]
pshufb m2, [c_mode16_14] ; projected side samples for this angle
palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
pmaddubsw m4, m5, [r4 + 3 * 16] ; [19]
pmulhrsw m4, m7
pmaddubsw m5, [r4 - 10 * 16] ; [06]
pmulhrsw m5, m7
packuswb m4, m5
palignr m3, m2, 15 ; slide window back one sample into side reference
pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
palignr m3, m2, 14 ; window back one more sample
pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
pmulhrsw m1, m7
pslldq m2, 1 ; feed next side sample
palignr m3, m2, 14
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
pmulhrsw m4, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 + 16] ; [17]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
pmulhrsw m1, m7
packuswb m6, m1
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
pmulhrsw m1, m7
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
; --- second half: rows 8-15 ---
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x] low half keeps projected side samples
pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
pmulhrsw m4, m7
pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
pmulhrsw m5, m7
packuswb m4, m5
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
pmulhrsw m1, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
pmulhrsw m4, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 + 16] ; [17]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
pmulhrsw m1, m7
packuswb m6, m1
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
pmulhrsw m1, m7
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
RET
;-----------------------------------------------------------------------------
; intra_pred_ang16_22 -- 16x16 angular intra prediction, mode 22 (SSE4).
; r0 = dst, r1 = dst stride, r2 = neighbour-sample buffer
; (main reference from [r2], side reference from [r2 + 32]).
; Mirror of mode 14 (same weight offsets and shared c_mode16_14 shuffle
; table) with the two reference edges swapped and a direct (second macro
; arg 0) TRANSPOSE_STORE_8x8 store; confirm flag meaning at the macro.
; Per sample: pmaddubsw with an ang_table weight pair (32-frac, frac), then
; pmulhrsw with pw_1024 == (x + 16) >> 5 rounding, then packuswb to bytes.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_22, 4,7,8
lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16 (offset base)
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0 ; remember dst origin for the second half
mova m7, [pw_1024] ; rounding constant for pmulhrsw
; --- first half: left 8 columns ---
movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
movu m2, [r2 + 32]
pshufb m2, [c_mode16_14] ; projected side samples for this angle
palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
pmaddubsw m4, m5, [r4 + 3 * 16] ; [19]
pmulhrsw m4, m7
pmaddubsw m5, [r4 - 10 * 16] ; [06]
pmulhrsw m5, m7
packuswb m4, m5
palignr m3, m2, 15 ; slide window back one sample into side reference
pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
palignr m3, m2, 14 ; window back one more sample
pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
pmulhrsw m1, m7
pslldq m2, 1 ; feed next side sample
palignr m3, m2, 14
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
pmulhrsw m4, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 + 16] ; [17]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
pmulhrsw m1, m7
packuswb m6, m1
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
pmulhrsw m1, m7
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
; --- second half: right 8 columns (dst + 8) ---
lea r0, [r6 + 8]
movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x] low half keeps projected side samples
pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
pmulhrsw m4, m7
pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
pmulhrsw m5, m7
packuswb m4, m5
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
pmulhrsw m1, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
pmulhrsw m4, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 + 16] ; [17]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
pmulhrsw m1, m7
packuswb m6, m1
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
pmulhrsw m1, m7
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
RET
;-----------------------------------------------------------------------------
; intra_pred_ang16_15 -- 16x16 angular intra prediction, mode 15 (SSE4).
; r0 = dst, r1 = dst stride, r2 = neighbour-sample buffer
; (main reference from [r2 + 32] with corner pixel [r2] inserted at index 0;
; side reference from [r2], reordered via c_mode16_15).
; Per sample: pmaddubsw with an ang_table weight pair (32-frac, frac), then
; pmulhrsw with pw_1024 == (x + 16) >> 5 rounding, then packuswb to bytes.
; TRANSPOSE_STORE_8x8 (macro defined earlier) stores each 8x8; second macro
; argument is 1 here -- presumably transposed store; confirm at the macro.
; This steeper angle consumes a side sample almost every row, hence the
; pslldq m2, 1 / palignr m3, m2, 14 pair repeated per weight group.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_15, 4,7,8
lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16 (offset base)
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024] ; rounding constant for pmulhrsw
; --- first half: rows 0-7 ---
movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
pinsrb m3, [r2], 0 ; drop in the top-left corner sample
punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
movu m2, [r2]
pshufb m2, [c_mode16_15] ; projected side samples for this angle
palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
pmaddubsw m4, [r4 - 16] ; [15]
pmulhrsw m4, m7
palignr m3, m2, 15 ; slide window back one sample into side reference
pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
pmulhrsw m5, m7
palignr m3, m2, 14 ; window back one more sample
pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
pmulhrsw m6, m7
pslldq m2, 1 ; feed next side sample
palignr m3, m2, 14
pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
pmulhrsw m1, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
pmulhrsw m4, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
pmulhrsw m5, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
pmulhrsw m6, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
pmulhrsw m1, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
; --- second half: rows 8-15 ---
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L]
pmaddubsw m4, m3, [r4 - 16] ; [15]
pmulhrsw m4, m7
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
pmulhrsw m5, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
pmulhrsw m6, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
pmulhrsw m1, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
pmulhrsw m4, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
pmulhrsw m5, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
pmulhrsw m6, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
pmulhrsw m1, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
RET
;-----------------------------------------------------------------------------
; intra_pred_ang16_21 -- 16x16 angular intra prediction, mode 21 (SSE4).
; r0 = dst, r1 = dst stride, r2 = neighbour-sample buffer
; (main reference from [r2], side reference from [r2 + 32] with corner
; pixel [r2] inserted at index 0, reordered via c_mode16_15).
; Mirror of mode 15 (same weight offsets, shared c_mode16_15 table) with
; the two reference edges swapped and a direct (second macro arg 0)
; TRANSPOSE_STORE_8x8 store; confirm flag meaning at the macro definition.
; Per sample: pmaddubsw with an ang_table weight pair (32-frac, frac), then
; pmulhrsw with pw_1024 == (x + 16) >> 5 rounding, then packuswb to bytes.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_21, 4,7,8
lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16 (offset base)
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0 ; remember dst origin for the second half
mova m7, [pw_1024] ; rounding constant for pmulhrsw
; --- first half: left 8 columns ---
movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
movu m2, [r2 + 32]
pinsrb m2, [r2], 0 ; corner sample at index 0
pshufb m2, [c_mode16_15] ; projected side samples for this angle
palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
pmaddubsw m4, [r4 - 16] ; [15]
pmulhrsw m4, m7
palignr m3, m2, 15 ; slide window back one sample into side reference
pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
pmulhrsw m5, m7
palignr m3, m2, 14 ; window back one more sample
pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
pmulhrsw m6, m7
pslldq m2, 1 ; feed next side sample
palignr m3, m2, 14
pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
pmulhrsw m1, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
pmulhrsw m4, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
pmulhrsw m5, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
pmulhrsw m6, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
pmulhrsw m1, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
; --- second half: right 8 columns (dst + 8) ---
lea r0, [r6 + 8]
movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L]
pmaddubsw m4, m3, [r4 - 16] ; [15]
pmulhrsw m4, m7
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
pmulhrsw m5, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
pmulhrsw m6, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
pmulhrsw m1, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
pmulhrsw m4, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
pmulhrsw m5, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
pmulhrsw m6, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
pmulhrsw m1, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
RET
;-----------------------------------------------------------------------------
; intra_pred_ang16_16 -- 16x16 angular intra prediction, mode 16 (SSE4).
; r0 = dst, r1 = dst stride, r2 = neighbour-sample buffer
; (main reference from [r2 + 32] with corner pixel [r2] inserted at index 0;
; side reference from [r2], reordered via c_mode16_16 -- the table picks a
; non-contiguous subset of side samples, see inline lane comments).
; Per sample: pmaddubsw with an ang_table weight pair (32-frac, frac), then
; pmulhrsw with pw_1024 == (x + 16) >> 5 rounding, then packuswb to bytes.
; TRANSPOSE_STORE_8x8 (macro defined earlier) stores each 8x8; second macro
; argument is 1 here -- presumably transposed store; confirm at the macro.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_16, 4,7,8
lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16 (offset base)
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024] ; rounding constant for pmulhrsw
; --- first half: rows 0-7 ---
movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
pinsrb m3, [r2], 0 ; drop in the top-left corner sample
punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
movu m2, [r2]
pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8]
palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
pmaddubsw m4, [r4 - 5 * 16] ; [11]
pmulhrsw m4, m7
palignr m3, m2, 15 ; slide window back one sample into side reference
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
pmulhrsw m5, m7
palignr m3, m2, 14 ; window back one more sample
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x]
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
pmulhrsw m0, m7
packuswb m6, m0
pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
pmulhrsw m1, m7
pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x]
palignr m3, m2, 14
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
pmulhrsw m4, m7
pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
pmulhrsw m5, m7
packuswb m4, m5
pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 - 16] ; [15]
pmulhrsw m6, m7
pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
pmulhrsw m1, m7
pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
; --- second half: rows 8-15 ---
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
palignr m2, m2, 6 ; rewind the side-sample window for the second half: [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x]
pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
pmulhrsw m4, m7
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
pmulhrsw m5, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
pmulhrsw m0, m7
packuswb m6, m0
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
pmulhrsw m1, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
pmulhrsw m4, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
pmulhrsw m5, m7
packuswb m4, m5
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 - 16] ; [15]
pmulhrsw m6, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
pmulhrsw m1, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
RET
;-----------------------------------------------------------------------------
; intra_pred_ang16_20 -- 16x16 angular intra prediction, mode 20 (SSE4).
; r0 = dst, r1 = dst stride, r2 = neighbour-sample buffer
; (main reference from [r2], side reference from [r2 + 32] with corner
; pixel [r2] inserted at index 0, reordered via c_mode16_16).
; Mirror of mode 16 (same weight offsets, shared c_mode16_16 table) with
; the two reference edges swapped and a direct (second macro arg 0)
; TRANSPOSE_STORE_8x8 store; confirm flag meaning at the macro definition.
; Per sample: pmaddubsw with an ang_table weight pair (32-frac, frac), then
; pmulhrsw with pw_1024 == (x + 16) >> 5 rounding, then packuswb to bytes.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_20, 4,7,8
lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16 (offset base)
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0 ; remember dst origin for the second half
mova m7, [pw_1024] ; rounding constant for pmulhrsw
; --- first half: left 8 columns ---
movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
movu m2, [r2 + 32]
pinsrb m2, [r2], 0 ; corner sample at index 0
pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8]
palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
pmaddubsw m4, [r4 - 5 * 16] ; [11]
pmulhrsw m4, m7
palignr m3, m2, 15 ; slide window back one sample into side reference
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
pmulhrsw m5, m7
palignr m3, m2, 14 ; window back one more sample
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x]
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
pmulhrsw m0, m7
packuswb m6, m0
pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
pmulhrsw m1, m7
pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x]
palignr m3, m2, 14
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
pmulhrsw m4, m7
pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
pmulhrsw m5, m7
packuswb m4, m5
pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 - 16] ; [15]
pmulhrsw m6, m7
pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
pmulhrsw m1, m7
pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
; --- second half: right 8 columns (dst + 8) ---
lea r0, [r6 + 8]
movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
palignr m2, m2, 6 ; rewind the side-sample window for the second half: [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x]
pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
pmulhrsw m4, m7
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
pmulhrsw m5, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
pmulhrsw m0, m7
packuswb m6, m0
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
pmulhrsw m1, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
pmulhrsw m4, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
pmulhrsw m5, m7
packuswb m4, m5
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
pmulhrsw m5, m7
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 - 16] ; [15]
pmulhrsw m6, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
pmulhrsw m1, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m3, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
RET
INIT_XMM sse4
cglobal intra_pred_ang16_17, 4,7,8
lea r4, [ang_table + 16 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
pinsrb m3, [r2], 0
punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
movu m2, [r2]
pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4]
palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
pmaddubsw m4, [r4 - 10 * 16] ; [06]
pmulhrsw m4, m7
palignr m3, m2, 15
pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
pmulhrsw m5, m7
packuswb m4, m5
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
pmulhrsw m5, m7
pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x]
pinsrb m2, [r2 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x]
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
pmulhrsw m0, m7
packuswb m6, m0
pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x]
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
pmulhrsw m1, m7
pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x]
palignr m3, m2, 14
pmaddubsw m0, m3, [r4] ; [16]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
pmulhrsw m4, m7
pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
pmulhrsw m5, m7
pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
pmulhrsw m6, m7
pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
pmulhrsw m1, m7
packuswb m6, m1
pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
pmulhrsw m1, m7
pmaddubsw m3, [r4 - 16 * 16] ; [00]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x]
pmaddubsw m4, m3, [r4 - 10 * 16] ; [06]
pmulhrsw m4, m7
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
pmulhrsw m5, m7
packuswb m4, m5
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
pmulhrsw m5, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
pmulhrsw m0, m7
packuswb m6, m0
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
pmulhrsw m1, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m0, m3, [r4] ; [16]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
pmulhrsw m4, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
pmulhrsw m5, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
pmulhrsw m6, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
pmulhrsw m1, m7
packuswb m6, m1
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
pmulhrsw m1, m7
pmaddubsw m3, [r4 - 16 * 16] ; [00]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_19(pixel* dst, intptr_t dstStride, pixel* src, ...)
; 16x16 angular intra prediction, mode 19 (SSE4).
; In: r0 = dst, r1 = dstStride, r2 = src (main reference row at [r2],
;     second reference at [r2 + 32] -- presumably the other border;
;     confirm layout against the intra-prediction caller).
; The bracketed numbers ([06], [12], ...) are the per-row fractional
; weights f in 0..31; pmaddubsw against an ang_table row followed by
; pmulhrsw with pw_1024 computes the HEVC blend ((32-f)*a + f*b + 16) >> 5.
; m2 holds projected side-reference samples (built via c_mode16_17) that
; are shifted in one byte per row with pslldq/palignr as the integer
; offset advances. TRANSPOSE_STORE_8x8's second argument is 0 here,
; i.e. the non-transposed store path (assumed from its usage elsewhere
; in this file).
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_19, 4,7,8
lea r4, [ang_table + 16 * 16] ; r4 -> weight table midpoint; row f at [r4 + (f-16)*16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0 ; keep original dst for the second 8-column pass
mova m7, [pw_1024] ; rounding constant for pmulhrsw (>> 5 with round)
movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
movu m2, [r2 + 32]
pinsrb m2, [r2], 0 ; corner pixel replaces byte 0 of the side reference
pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4]
palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
pmaddubsw m4, [r4 - 10 * 16] ; [06]
pmulhrsw m4, m7
palignr m3, m2, 15
pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
pmulhrsw m5, m7
packuswb m4, m5
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
pmulhrsw m5, m7
pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x]
pinsrb m2, [r2 + 5 + 32], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x]
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
pmulhrsw m0, m7
packuswb m6, m0
pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x]
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
pmulhrsw m1, m7
pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x]
palignr m3, m2, 14
pmaddubsw m0, m3, [r4] ; [16]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 ; rows 0-7, left 8 columns
pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
pmulhrsw m4, m7
pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
pmulhrsw m5, m7
pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
pmulhrsw m6, m7
pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
pmulhrsw m1, m7
packuswb m6, m1
pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x]
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
pmulhrsw m1, m7
pmaddubsw m3, [r4 - 16 * 16] ; [00]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 ; rows 8-15, left 8 columns
lea r0, [r6 + 8] ; restart at the right 8-column half
movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
palignr m2, m2, 6 ; [x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x]
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x]
pmaddubsw m4, m3, [r4 - 10 * 16] ; [06]
pmulhrsw m4, m7
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
pmulhrsw m5, m7
packuswb m4, m5
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
pmulhrsw m5, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
pmulhrsw m6, m7
pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
pmulhrsw m0, m7
packuswb m6, m0
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
pmulhrsw m1, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m0, m3, [r4] ; [16]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 ; rows 0-7, right 8 columns
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
pmulhrsw m4, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
pmulhrsw m5, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
pmulhrsw m6, m7
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
pmulhrsw m1, m7
packuswb m6, m1
pslldq m2, 1
palignr m3, m2, 14
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
pmulhrsw m1, m7
pmaddubsw m3, [r4 - 16 * 16] ; [00]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 ; rows 8-15, right 8 columns
RET
;-----------------------------------------------------------------------------
; void intra_pred_ang16_18(pixel* dst, intptr_t dstStride, pixel* src, ...)
; 16x16 angular intra prediction, mode 18 (the exact 45-degree diagonal).
; No interpolation is needed for this mode: every output row is the row
; above shifted right by one reference sample, so the whole block is
; produced with palignr at decreasing offsets.
; In: r0 = dst, r1 = dstStride, r2 = src.
; m0 = 16 samples at [r2] (row 0 verbatim), m1 = 16 samples at [r2 + 32]
; reversed/reordered via c_mode16_18 so the bytes shifted in come out in
; the right order. NOTE: r2/r3/r4 are repurposed as stride*2/3/4 right
; after the references are loaded -- the src pointer is consumed.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_18, 4,5,3
movu m0, [r2] ; main reference: output row 0
movu m1, [r2 + 32] ; second reference, to be shifted in
mova m2, [c_mode16_18]
pshufb m1, m2 ; reorder side samples for the palignr feed
lea r2, [r1 * 2] ; r2 = 2 * stride (src pointer no longer needed)
lea r3, [r1 * 3]
lea r4, [r1 * 4]
movu [r0], m0
palignr m2, m0, m1, 15 ; each row: shift one more side sample in
movu [r0 + r1], m2
palignr m2, m0, m1, 14
movu [r0 + r2], m2
palignr m2, m0, m1, 13
movu [r0 + r3], m2
lea r0, [r0 + r4]
palignr m2, m0, m1, 12
movu [r0], m2
palignr m2, m0, m1, 11
movu [r0 + r1], m2
palignr m2, m0, m1, 10
movu [r0 + r2], m2
palignr m2, m0, m1, 9
movu [r0 + r3], m2
lea r0, [r0 + r4]
palignr m2, m0, m1, 8
movu [r0], m2
palignr m2, m0, m1, 7
movu [r0 + r1], m2
palignr m2, m0, m1, 6
movu [r0 + r2], m2
palignr m2, m0, m1, 5
movu [r0 + r3], m2
lea r0, [r0 + r4]
palignr m2, m0, m1, 4
movu [r0], m2
palignr m2, m0, m1, 3
movu [r0 + r1], m2
palignr m2, m0, m1, 2
movu [r0 + r2], m2
palignr m0, m1, 1 ; last row: m0 can be clobbered now
movu [r0 + r3], m0
RET
; Process Intra32x32, input 8x8 in [m0, m1, m2, m3, m4, m5, m6, m7], output 8x8
;
; Filters eight rows (one row per input register) with per-row weights and
; stores the resulting 8x8 tile, optionally transposed.
; Parameters: %1 = col4 (8-byte column-group index used as store offset in
; the transposed path), %2 = transpose flag [0/1], %3..%10 = eight signed
; coefficient offsets into the weight table at r4 (row at [r4 + c*16]).
; A coefficient of 0 marks an integer-angle row that needs no filtering:
; the register pair is then merged as raw pixels instead of being filtered
; (the corresponding registers are assumed to be preloaded by the caller
; with the unfiltered reference -- confirm at each use site).
; Preconditions: r3 -> pshufb pattern, r4 -> weight table, r0/r6 = dst row
; pointers, r1 = stride, r5 = 3*stride (transposed path).
; Clobbers m0-m7.
%macro PROC32_8x8 10 ; col4, transpose[0/1] c0, c1, c2, c3, c4, c5, c6, c7
%if %3 == 0
%else
pshufb m0, [r3]
pmaddubsw m0, [r4 + %3 * 16]
pmulhrsw m0, [pw_1024] ; ((32-f)*a + f*b + 16) >> 5
%endif
%if %4 == 0
pmovzxbw m1, m1 ; raw row: widen so the shared packuswb below works
%else
pshufb m1, [r3]
pmaddubsw m1, [r4 + %4 * 16]
pmulhrsw m1, [pw_1024]
%endif
%if %3 == 0
packuswb m1, m1
movlhps m0, m1 ; raw m1 row into the high half of m0
%else
packuswb m0, m1
%endif
mova m1, [pw_1024] ; keep the rounding constant in a reg from here on
%if %5 == 0
%else
pshufb m2, [r3]
pmaddubsw m2, [r4 + %5 * 16]
pmulhrsw m2, m1
%endif
%if %6 == 0
pmovzxbw m3, m3
%else
pshufb m3, [r3]
pmaddubsw m3, [r4 + %6 * 16]
pmulhrsw m3, m1
%endif
%if %5 == 0
packuswb m3, m3
movlhps m2, m3
%else
packuswb m2, m3
%endif
%if %7 == 0
%else
pshufb m4, [r3]
pmaddubsw m4, [r4 + %7 * 16]
pmulhrsw m4, m1
%endif
%if %8 == 0
pmovzxbw m5, m5
%else
pshufb m5, [r3]
pmaddubsw m5, [r4 + %8 * 16]
pmulhrsw m5, m1
%endif
%if %7 == 0
packuswb m5, m5
movlhps m4, m5
%else
packuswb m4, m5
%endif
%if %9 == 0
%else
pshufb m6, [r3]
pmaddubsw m6, [r4 + %9 * 16]
pmulhrsw m6, m1
%endif
%if %10 == 0
pmovzxbw m7, m7
%else
pshufb m7, [r3]
pmaddubsw m7, [r4 + %10 * 16]
pmulhrsw m7, m1
%endif
%if %9 == 0
packuswb m7, m7
movlhps m6, m7
%else
packuswb m6, m7
%endif
%if %2 == 1
; transpose the 8x8 byte tile (bytes -> words -> dwords) then store
; rows 0-3 relative to r0 and rows 4-7 relative to r6
punpckhbw m1, m0, m2
punpcklbw m0, m2
punpckhbw m3, m0, m1
punpcklbw m0, m1
punpckhbw m1, m4, m6
punpcklbw m4, m6
punpckhbw m6, m4, m1
punpcklbw m4, m1
punpckhdq m2, m0, m4
punpckldq m0, m4
punpckldq m4, m3, m6
punpckhdq m3, m6
movh [r0 + + %1 * 8], m0 ; NOTE: stray second '+' is legal NASM (unary plus)
movhps [r0 + r1 + %1 * 8], m0
movh [r0 + r1*2 + %1 * 8], m2
movhps [r0 + r5 + %1 * 8], m2
movh [r6 + %1 * 8], m4
movhps [r6 + r1 + %1 * 8], m4
movh [r6 + r1*2 + %1 * 8], m3
movhps [r6 + r5 + %1 * 8], m3
%else
; direct (non-transposed) store: two rows per register
movh [r0 ], m0
movhps [r0 + r1 ], m0
movh [r0 + r1 * 2], m2
movhps [r0 + r5 ], m2
lea r0, [r0 + r1 * 4]
movh [r0 ], m4
movhps [r0 + r1 ], m4
movh [r0 + r1 * 2], m6
movhps [r0 + r5 ], m6
%endif
%endmacro
;-----------------------------------------------------------------------------
; MODE_3_33 %1 -- 32-row slab of angular intra prediction for the mode
; pair 3/33. %1 is forwarded to TRANSPOSE_STORE_8x8 (presumably its
; transpose flag: one mode stores direct, the mirrored mode transposed).
; The fractional weight advances by 26 (mod 32) per row (visible in the
; [26],[20],[14],[8],[2],[28],... sequence); the integer part advances
; via the palignr offset and the reference reload offsets (+8, +14, +21).
; Preconditions: r2 -> reference pixels, r3 -> weight table midpoint
; (row f at [r3 + (f-16)*16]), m7 = [pw_1024]. [00] rows are copied
; unfiltered via movhps.
;-----------------------------------------------------------------------------
%macro MODE_3_33 1
movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [ x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 4 * 16] ; [20]
pmulhrsw m1, m7
packuswb m4, m1
palignr m5, m2, m0, 4
pmaddubsw m5, [r3 - 2 * 16] ; [14]
pmulhrsw m5, m7
palignr m6, m2, m0, 6
pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
pmulhrsw m6, m7
packuswb m5, m6
palignr m1, m2, m0, 8
pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
pmulhrsw m6, m7
pmaddubsw m1, [r3 + 12 * 16] ; [28]
pmulhrsw m1, m7
packuswb m6, m1
palignr m1, m2, m0, 10
pmaddubsw m1, [r3 + 6 * 16] ; [22]
pmulhrsw m1, m7
palignr m2, m0, 12
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 ; 8x8 tile 0
movu m0, [r2 + 8] ; reload reference 8 samples further along
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m5, m2, m0, 2
pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
pmulhrsw m4, m7
pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, [r3 + 14 * 16] ; [30]
pmulhrsw m5, m7
palignr m6, m2, m0, 4
pmaddubsw m6, [r3 + 8 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6
palignr m1, m2, m0, 6
pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
pmulhrsw m6, m7
palignr m1, m2, m0, 8
pmaddubsw m1, [r3 - 4 * 16] ; [12]
pmulhrsw m1, m7
packuswb m6, m1
palignr m1, m2, m0, 10
pmaddubsw m1, [r3 - 10 * 16] ; [06]
pmulhrsw m1, m7
packuswb m1, m1
movhps m1, [r2 + 14] ; [00]
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 ; 8x8 tile 1
movu m0, [r2 + 14]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m1, m2, m0, 2
pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 4 * 16] ; [20]
pmulhrsw m1, m7
packuswb m4, m1
palignr m5, m2, m0, 4
pmaddubsw m5, [r3 - 2 * 16] ; [14]
pmulhrsw m5, m7
palignr m6, m2, m0, 6
pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
pmulhrsw m6, m7
packuswb m5, m6
palignr m1, m2, m0, 8
pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
pmulhrsw m6, m7
pmaddubsw m1, [r3 + 12 * 16] ; [28]
pmulhrsw m1, m7
packuswb m6, m1
palignr m1, m2, m0, 10
pmaddubsw m1, [r3 + 6 * 16] ; [22]
pmulhrsw m1, m7
palignr m2, m0, 12
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 ; 8x8 tile 2
movu m0, [r2 + 21]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m5, m2, m0, 2
pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
pmulhrsw m4, m7
pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, [r3 + 14 * 16] ; [30]
pmulhrsw m5, m7
palignr m6, m2, m0, 4
pmaddubsw m6, [r3 + 8 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6
palignr m1, m2, m0, 6
pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
pmulhrsw m6, m7
palignr m1, m2, m0, 8
pmaddubsw m1, [r3 - 4 * 16] ; [12]
pmulhrsw m1, m7
packuswb m6, m1
palignr m1, m2, m0, 10
pmaddubsw m1, [r3 - 10 * 16] ; [06]
pmulhrsw m1, m7
packuswb m1, m1
movhps m1, [r2 + 27] ; [00]
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 ; 8x8 tile 3
%endmacro
;-----------------------------------------------------------------------------
; MODE_4_32 %1 -- 32-row slab of angular intra prediction for the mode
; pair 4/32. %1 is forwarded to TRANSPOSE_STORE_8x8 (presumably the
; transpose flag). The fractional weight advances by 21 (mod 32) per row
; ([21],[10],[31],[20],[9],[30],...); the integer part advances via the
; palignr offset and reference reloads at +6, +12, +17.
; Preconditions: r2 -> reference pixels, r3 -> weight table midpoint,
; m7 = [pw_1024]. [00] rows are copied unfiltered via movhps.
;-----------------------------------------------------------------------------
%macro MODE_4_32 1
movu m0, [r2 + 1]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m1, m2, m0, 2
mova m5, m1
pmaddubsw m4, m0, [r3 + 5 * 16] ; [21]
pmulhrsw m4, m7
pmaddubsw m1, [r3 - 6 * 16] ; [10]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, [r3 + 15 * 16] ; [31]
pmulhrsw m5, m7
palignr m6, m2, m0, 4
pmaddubsw m6, [r3 + 4 * 16] ; [ 20]
pmulhrsw m6, m7
packuswb m5, m6
palignr m1, m2, m0, 6
pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9]
pmulhrsw m6, m7
pmaddubsw m1, [r3 + 14 * 16] ; [30]
pmulhrsw m1, m7
packuswb m6, m1
palignr m1, m2, m0, 8
pmaddubsw m1, [r3 + 3 * 16] ; [19]
pmulhrsw m1, m7
palignr m2, m0, 10
pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 ; 8x8 tile 0
pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] (uses old alignment before reload)
pmulhrsw m4, m7
movu m0, [r2 + 6]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m1, m2, m0, 2
pmaddubsw m1, [r3 + 2 * 16] ; [18]
pmulhrsw m1, m7
packuswb m4, m1
palignr m5, m2, m0, 4
mova m6, m5
pmaddubsw m5, [r3 - 9 * 16] ; [07]
pmulhrsw m5, m7
pmaddubsw m6, [r3 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6
palignr m6, m2, m0, 6
pmaddubsw m6, [r3 + 16] ; [17]
pmulhrsw m6, m7
palignr m1, m2, m0, 8
pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
pmulhrsw m3, m7
packuswb m6, m3
pmaddubsw m1, [r3 + 11 * 16] ; [27]
pmulhrsw m1, m7
palignr m2, m0, 10
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 ; 8x8 tile 1
movu m0, [r2 + 12]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
mova m1, m0
pmaddubsw m4, m0, [r3 - 11 * 16] ; [5]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m4, m1
palignr m5, m2, m0, 2
pmaddubsw m5, [r3 - 16] ; [15]
pmulhrsw m5, m7
palignr m6, m2, m0, 4
mova m1, m6
pmaddubsw m1, [r3 - 12 * 16] ; [4]
pmulhrsw m1, m7
packuswb m5, m1
pmaddubsw m6, [r3 + 9 * 16] ; [25]
pmulhrsw m6, m7
palignr m1, m2, m0, 6
pmaddubsw m1, [r3 - 2 * 16] ; [14]
pmulhrsw m1, m7
packuswb m6, m1
palignr m1, m2, m0, 8
mova m2, m1
pmaddubsw m1, [r3 - 13 * 16] ; [3]
pmulhrsw m1, m7
pmaddubsw m2, [r3 + 8 * 16] ; [24]
pmulhrsw m2, m7
packuswb m1, m2
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 ; 8x8 tile 2
movu m0, [r2 + 17]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
pmaddubsw m4, m0, [r3 - 3 * 16] ; [13]
pmulhrsw m4, m7
palignr m5, m2, m0, 2
pmaddubsw m1, m5, [r3 - 14 * 16] ; [2]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, [r3 + 7 * 16] ; [23]
pmulhrsw m5, m7
palignr m6, m2, m0, 4
pmaddubsw m6, [r3 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
palignr m6, m2, m0, 6
mova m1, m6
pmaddubsw m6, [r3 - 15 * 16] ; [1]
pmulhrsw m6, m7
pmaddubsw m1, [r3 + 6 * 16] ; [22]
pmulhrsw m1, m7
packuswb m6, m1
palignr m1, m2, m0, 8
pmaddubsw m1, [r3 - 5 * 16] ; [11]
pmulhrsw m1, m7
packuswb m1, m1
movhps m1, [r2 + 22] ; [00]
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 ; 8x8 tile 3
%endmacro
;-----------------------------------------------------------------------------
; MODE_5_31 %1 -- 32-row slab of angular intra prediction for the mode
; pair 5/31. %1 is forwarded to TRANSPOSE_STORE_8x8 (presumably the
; transpose flag). The fractional weight advances by 17 (mod 32) per row
; ([17],[2],[19],[4],[21],[6],...); the integer part advances via the
; palignr offset and reference reloads at +5, +10, +14.
; Preconditions: r2 -> reference pixels, r3 -> weight table midpoint,
; m7 = [pw_1024]. The final [00] row is copied unfiltered via movhps.
;-----------------------------------------------------------------------------
%macro MODE_5_31 1
movu m0, [r2 + 1]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m1, m2, m0, 2
mova m5, m1
pmaddubsw m4, m0, [r3 + 16] ; [17]
pmulhrsw m4, m7
pmaddubsw m1, [r3 - 14 * 16] ; [2]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, [r3 + 3 * 16] ; [19]
pmulhrsw m5, m7
palignr m6, m2, m0, 4
mova m1, m6
pmaddubsw m6, [r3 - 12 * 16] ; [4]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m1, [r3 + 5 * 16] ; [21]
pmulhrsw m6, m7
palignr m1, m2, m0, 6
mova m3, m1
pmaddubsw m3, [r3 - 10 * 16] ; [6]
pmulhrsw m3, m7
packuswb m6, m3
pmaddubsw m1, [r3 + 7 * 16] ; [23]
pmulhrsw m1, m7
palignr m2, m0, 8
pmaddubsw m2, [r3 - 8 * 16] ; [8]
pmulhrsw m2, m7
packuswb m1, m2
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 ; 8x8 tile 0
movu m0, [r2 + 5]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m1, m2, m0, 2
mova m5, m1
pmaddubsw m4, m0, [r3 + 9 * 16] ; [25]
pmulhrsw m4, m7
pmaddubsw m1, [r3 - 6 * 16] ; [10]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, [r3 + 11 * 16] ; [27]
pmulhrsw m5, m7
palignr m6, m2, m0, 4
mova m1, m6
pmaddubsw m6, [r3 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m1, [r3 + 13 * 16] ; [29]
pmulhrsw m6, m7
palignr m1, m2, m0, 6
mova m3, m1
pmaddubsw m3, [r3 - 2 * 16] ; [14]
pmulhrsw m3, m7
packuswb m6, m3
pmaddubsw m1, [r3 + 15 * 16] ; [31]
pmulhrsw m1, m7
palignr m2, m0, 8
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 ; 8x8 tile 1
movu m0, [r2 + 10]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
mova m1, m0
pmaddubsw m4, m0, [r3 - 15 * 16] ; [1]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 2 * 16] ; [18]
pmulhrsw m1, m7
packuswb m4, m1
palignr m5, m2, m0, 2
mova m1, m5
pmaddubsw m5, [r3 - 13 * 16] ; [3]
pmulhrsw m5, m7
pmaddubsw m1, [r3 + 4 * 16] ; [20]
pmulhrsw m1, m7
packuswb m5, m1
palignr m1, m2, m0, 4
pmaddubsw m6, m1, [r3 - 11 * 16] ; [5]
pmulhrsw m6, m7
pmaddubsw m1, [r3 + 6 * 16] ; [22]
pmulhrsw m1, m7
packuswb m6, m1
palignr m2, m0, 6
pmaddubsw m1, m2, [r3 - 9 * 16] ; [7]
pmulhrsw m1, m7
pmaddubsw m2, [r3 + 8 * 16] ; [24]
pmulhrsw m2, m7
packuswb m1, m2
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 ; 8x8 tile 2
movu m0, [r2 + 14]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
mova m1, m0
pmaddubsw m4, m0, [r3 - 7 * 16] ; [9]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m4, m1
palignr m5, m2, m0, 2
mova m1, m5
pmaddubsw m5, [r3 - 5 * 16] ; [11]
pmulhrsw m5, m7
pmaddubsw m1, [r3 + 12 * 16] ; [28]
pmulhrsw m1, m7
packuswb m5, m1
palignr m1, m2, m0, 4
pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
pmulhrsw m6, m7
pmaddubsw m1, [r3 + 14 * 16] ; [30]
pmulhrsw m1, m7
packuswb m6, m1
palignr m2, m0, 6
pmaddubsw m1, m2, [r3 - 16] ; [15]
pmulhrsw m1, m7
packuswb m1, m1
movhps m1, [r2 + 18] ; [00]
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 ; 8x8 tile 3
%endmacro
;-----------------------------------------------------------------------------
; MODE_6_30 %1 -- 32-row slab of angular intra prediction for the mode
; pair 6/30. %1 is forwarded to TRANSPOSE_STORE_8x8 (presumably the
; transpose flag). The fractional weight advances by 13 (mod 32) per row
; ([13],[26],[7],[20],[1],[14],[27],...); the integer part advances via
; the palignr offset and reference reloads at +5, +7, +11.
; Preconditions: r2 -> reference pixels, r3 -> weight table midpoint,
; m7 = [pw_1024]. The final [00] row is copied unfiltered via movhps.
;-----------------------------------------------------------------------------
%macro MODE_6_30 1
movu m0, [r2 + 1]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
mova m1, m0
pmaddubsw m4, m0, [r3 - 3 * 16] ; [13]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m4, m1
palignr m6, m2, m0, 2
pmaddubsw m5, m6, [r3 - 9 * 16] ; [7]
pmulhrsw m5, m7
pmaddubsw m6, [r3 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
palignr m1, m2, m0, 4
pmaddubsw m6, m1, [r3 - 15 * 16] ; [1]
pmulhrsw m6, m7
pmaddubsw m3, m1, [r3 - 2 * 16] ; [14]
pmulhrsw m3, m7
packuswb m6, m3
pmaddubsw m1, [r3 + 11 * 16] ; [27]
pmulhrsw m1, m7
palignr m2, m0, 6
pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 ; 8x8 tile 0
pmaddubsw m4, m2, [r3 + 5 * 16] ; [21] (uses old alignment before reload)
pmulhrsw m4, m7
movu m0, [r2 + 5]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
mova m6, m0
pmaddubsw m1, m6, [r3 - 14 * 16] ; [2]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, m6, [r3 - 16] ; [15]
pmulhrsw m5, m7
pmaddubsw m6, [r3 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6
palignr m3, m2, m0, 2
pmaddubsw m6, m3, [r3 - 7 * 16] ; [9]
pmulhrsw m6, m7
pmaddubsw m3, [r3 + 6 * 16] ; [22]
pmulhrsw m3, m7
packuswb m6, m3
palignr m2, m0, 4
pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
pmulhrsw m1, m7
pmaddubsw m3, m2, [r3] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 ; 8x8 tile 1
pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] (uses old alignment before reload)
pmulhrsw m4, m7
movu m0, [r2 + 7]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m5, m2, m0, 2
pmaddubsw m1, m5, [r3 - 6 * 16] ; [10]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, [r3 + 7 * 16] ; [23]
pmulhrsw m5, m7
palignr m1, m2, m0, 4
pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m1, [r3 + 16] ; [17]
pmulhrsw m6, m7
pmaddubsw m1, [r3 + 14 * 16] ; [30]
pmulhrsw m1, m7
packuswb m6, m1
palignr m2, m2, m0, 6
pmaddubsw m1, m2, [r3 - 5 * 16] ; [11]
pmulhrsw m1, m7
pmaddubsw m2, m2, [r3 + 8 * 16] ; [24]
pmulhrsw m2, m7
packuswb m1, m2
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 ; 8x8 tile 2
movu m0, [r2 + 11]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
mova m5, m0
pmaddubsw m4, m0, [r3 - 11 * 16] ; [5]
pmulhrsw m4, m7
pmaddubsw m3, m5, [r3 + 2 * 16] ; [18]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m5, [r3 + 15 * 16] ; [31]
pmulhrsw m5, m7
palignr m6, m2, m0, 2
pmaddubsw m1, m6, [r3 - 4 * 16] ; [12]
pmulhrsw m1, m7
packuswb m5, m1
pmaddubsw m6, [r3 + 9 * 16] ; [25]
pmulhrsw m6, m7
palignr m1, m2, m0, 4
pmaddubsw m2, m1, [r3 - 10 * 16] ; [6]
pmulhrsw m2, m7
packuswb m6, m2
pmaddubsw m1, [r3 + 3 * 16] ; [19]
pmulhrsw m1, m7
packuswb m1, m1
movhps m1, [r2 + 14] ; [00]
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 ; 8x8 tile 3
%endmacro
;-----------------------------------------------------------------------------
; MODE_7_29 %1 -- 32-row slab of angular intra prediction for the mode
; pair 7/29. %1 is forwarded to TRANSPOSE_STORE_8x8 (presumably the
; transpose flag). The fractional weight advances by 9 (mod 32) per row
; ([9],[18],[27],[4],[13],[22],[31],[8],...); the integer part advances
; via the palignr offset and reference reloads at +4, +6, +8.
; Preconditions: r2 -> reference pixels, r3 -> weight table midpoint,
; m7 = [pw_1024]. The final [0] row is copied unfiltered via movhps.
;-----------------------------------------------------------------------------
%macro MODE_7_29 1
movu m0, [r2 + 1]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
mova m5, m0
pmaddubsw m4, m0, [r3 - 7 * 16] ; [9]
pmulhrsw m4, m7
pmaddubsw m3, m5, [r3 + 2 * 16] ; [18]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m5, [r3 + 11 * 16] ; [27]
pmulhrsw m5, m7
palignr m1, m2, m0, 2
palignr m2, m0, 4
pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
pmulhrsw m6, m7
pmaddubsw m0, m1, [r3 + 6 * 16] ; [22]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, [r3 + 15 * 16] ; [31]
pmulhrsw m1, m7
pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 ; 8x8 tile 0
pmaddubsw m4, m2, [r3 + 16] ; [17]
pmulhrsw m4, m7
pmaddubsw m2, [r3 + 10 * 16] ; [26]
pmulhrsw m2, m7
packuswb m4, m2
movu m0, [r2 + 4]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m2, m0, 2
pmaddubsw m5, m0, [r3 - 13 * 16] ; [03]
pmulhrsw m5, m7
pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m0, [r3 + 5 * 16] ; [21]
pmulhrsw m6, m7
pmaddubsw m0, [r3 + 14 * 16] ; [30]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m2, [r3 - 9 * 16] ; [07]
pmulhrsw m1, m7
pmaddubsw m3, m2, [r3] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 ; 8x8 tile 1
pmaddubsw m4, m2, [r3 + 9 * 16] ; [25]
pmulhrsw m4, m7
movu m0, [r2 + 6]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m2, m0, 2
pmaddubsw m1, m0, [r3 - 14 * 16] ; [2]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, m0, [r3 - 5 * 16] ; [11]
pmulhrsw m5, m7
pmaddubsw m6, m0, [r3 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m0, [r3 + 13 * 16] ; [29]
pmulhrsw m6, m7
pmaddubsw m1, m2, [r3 - 10 * 16] ; [6]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m2, [r3 - 16] ; [15]
pmulhrsw m1, m7
pmaddubsw m2, m2, [r3 + 8 * 16] ; [24]
pmulhrsw m2, m7
packuswb m1, m2
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 ; 8x8 tile 2
movu m0, [r2 + 8]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
pmaddubsw m4, m0, [r3 - 15 * 16] ; [1]
pmulhrsw m4, m7
pmaddubsw m3, m0, [r3 - 6 * 16] ; [10]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m5, m0, [r3 + 3 * 16] ; [19]
pmulhrsw m5, m7
pmaddubsw m6, m0, [r3 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6
palignr m2, m0, 2
pmaddubsw m6, m2, [r3 - 11 * 16] ; [5]
pmulhrsw m6, m7
pmaddubsw m0, m2, [r3 - 2 * 16] ; [14]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m2, [r3 + 7 * 16] ; [23]
pmulhrsw m1, m7
packuswb m1, m1
movhps m1, [r2 + 10] ; [0]
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 ; 8x8 tile 3
%endmacro
;-----------------------------------------------------------------------------
; MODE_8_28 %1 -- 32-row slab of angular intra prediction for the mode
; pair 8/28. %1 is forwarded to TRANSPOSE_STORE_8x8 (presumably the
; transpose flag). The fractional weight advances by 5 (mod 32) per row
; ([5],[10],[15],[20],[25],[30],[3],[8],...); the integer part advances
; slowly, so the reference is reloaded only at +3, +4, +5.
; Preconditions: r2 -> reference pixels, r3 -> weight table midpoint,
; m7 = [pw_1024]. The final [00] row is copied unfiltered via movhps.
;-----------------------------------------------------------------------------
%macro MODE_8_28 1
movu m0, [r2 + 1]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m2, m0, 2
pmaddubsw m4, m0, [r3 - 11 * 16] ; [5]
pmulhrsw m4, m7
pmaddubsw m3, m0, [r3 - 6 * 16] ; [10]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m5, m0, [r3 - 1 * 16] ; [15]
pmulhrsw m5, m7
pmaddubsw m6, m0, [r3 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m0, [r3 + 9 * 16] ; [25]
pmulhrsw m6, m7
pmaddubsw m0, [r3 + 14 * 16] ; [30]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
pmulhrsw m1, m7
pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 ; 8x8 tile 0
pmaddubsw m4, m2, [r3 - 3 * 16] ; [13]
pmulhrsw m4, m7
pmaddubsw m5, m2, [r3 + 2 * 16] ; [18]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m2, [r3 + 7 * 16] ; [23]
pmulhrsw m5, m7
pmaddubsw m2, [r3 + 12 * 16] ; [28]
pmulhrsw m2, m7
packuswb m5, m2
movu m0, [r2 + 3]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
pmaddubsw m6, m0, [r3 - 15 * 16] ; [01]
pmulhrsw m6, m7
pmaddubsw m1, m0, [r3 - 10 * 16] ; [06]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m0, [r3 - 5 * 16] ; [11]
pmulhrsw m1, m7
mova m2, m0 ; keep the unfiltered pairs for the next tile
pmaddubsw m0, [r3] ; [16]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 ; 8x8 tile 1
pmaddubsw m4, m2, [r3 + 5 * 16] ; [21]
pmulhrsw m4, m7
pmaddubsw m5, m2, [r3 + 10 * 16] ; [26]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m2, [r3 + 15 * 16] ; [31]
pmulhrsw m5, m7
movu m0, [r2 + 4]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
pmaddubsw m2, m0, [r3 - 12 * 16] ; [4]
pmulhrsw m2, m7
packuswb m5, m2
pmaddubsw m6, m0, [r3 - 7 * 16] ; [9]
pmulhrsw m6, m7
pmaddubsw m1, m0, [r3 - 2 * 16] ; [14]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m0, [r3 + 3 * 16] ; [19]
pmulhrsw m1, m7
mova m2, m0 ; keep the unfiltered pairs for the next tile
pmaddubsw m0, [r3 + 8 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 ; 8x8 tile 2
pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
pmulhrsw m4, m7
movu m0, [r2 + 5]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
pmaddubsw m1, m0, [r3 - 14 * 16] ; [2]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, m0, [r3 - 9 * 16] ; [7]
pmulhrsw m5, m7
pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m0, [r3 + 16] ; [17]
pmulhrsw m6, m7
pmaddubsw m1, m0, [r3 + 6 * 16] ; [22]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m0, [r3 + 11 * 16] ; [27]
pmulhrsw m1, m7
packuswb m1, m1
movhps m1, [r2 + 6] ; [00]
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 ; 8x8 tile 3
%endmacro
;-----------------------------------------------------------------------------
; MODE_9_27 %1 -- 32-row slab of angular intra prediction for the mode
; pair 9/27. %1 is forwarded to TRANSPOSE_STORE_8x8 (presumably the
; transpose flag). The fractional weight advances by only 2 per row
; ([2],[4],[6],...,[30],[00]), so the same interleaved source pair is
; reused for 16 rows before the reference advances by one sample
; (reloads at +2, then the [00] row at +3).
; Preconditions: r2 -> reference pixels, r3 -> weight table midpoint,
; m7 = [pw_1024]. [00] rows are copied unfiltered via movhps.
;-----------------------------------------------------------------------------
%macro MODE_9_27 1
movu m2, [r2 + 1]
palignr m1, m2, 1
punpckhbw m0, m2, m1
punpcklbw m2, m1
pmaddubsw m4, m2, [r3 - 14 * 16] ; [2]
pmulhrsw m4, m7
pmaddubsw m3, m2, [r3 - 12 * 16] ; [4]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m5, m2, [r3 - 10 * 16] ; [6]
pmulhrsw m5, m7
pmaddubsw m6, m2, [r3 - 8 * 16] ; [8]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m2, [r3 - 6 * 16] ; [10]
pmulhrsw m6, m7
pmaddubsw m3, m2, [r3 - 4 * 16] ; [12]
pmulhrsw m3, m7
packuswb m6, m3
pmaddubsw m1, m2, [r3 - 2 * 16] ; [14]
pmulhrsw m1, m7
pmaddubsw m0, m2, [r3] ; [16]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 ; 8x8 tile 0
pmaddubsw m4, m2, [r3 + 2 * 16] ; [18]
pmulhrsw m4, m7
pmaddubsw m5, m2, [r3 + 4 * 16] ; [20]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m2, [r3 + 6 * 16] ; [22]
pmulhrsw m5, m7
pmaddubsw m6, m2, [r3 + 8 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m2, [r3 + 10 * 16] ; [26]
pmulhrsw m6, m7
pmaddubsw m1, m2, [r3 + 12 * 16] ; [28]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m2, [r3 + 14 * 16] ; [30]
pmulhrsw m1, m7
packuswb m1, m1
movhps m1, [r2 + 2] ; [00]
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 ; 8x8 tile 1
movu m2, [r2 + 2] ; reference advanced one sample
palignr m1, m2, 1
punpcklbw m2, m1
pmaddubsw m4, m2, [r3 - 14 * 16] ; [2]
pmulhrsw m4, m7
pmaddubsw m3, m2, [r3 - 12 * 16] ; [4]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m5, m2, [r3 - 10 * 16] ; [6]
pmulhrsw m5, m7
pmaddubsw m6, m2, [r3 - 8 * 16] ; [8]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m2, [r3 - 6 * 16] ; [10]
pmulhrsw m6, m7
pmaddubsw m0, m2, [r3 - 4 * 16] ; [12]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m2, [r3 - 2 * 16] ; [14]
pmulhrsw m1, m7
pmaddubsw m0, m2, [r3] ; [16]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 ; 8x8 tile 2
movu m2, [r2 + 2]
palignr m1, m2, 1
punpcklbw m2, m1
pmaddubsw m4, m2, [r3 + 2 * 16] ; [18]
pmulhrsw m4, m7
pmaddubsw m5, m2, [r3 + 4 * 16] ; [20]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m2, [r3 + 6 * 16] ; [22]
pmulhrsw m5, m7
pmaddubsw m6, m2, [r3 + 8 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m2, [r3 + 10 * 16] ; [26]
pmulhrsw m6, m7
pmaddubsw m1, m2, [r3 + 12 * 16] ; [28]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m2, [r3 + 14 * 16] ; [30]
pmulhrsw m1, m7
packuswb m1, m1
movhps m1, [r2 + 3] ; [00]
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 ; 8x8 tile 3
%endmacro
;-----------------------------------------------------------------------------
; MODE_12_24 %1 -- 32-row slab of angular intra prediction for the mode
; pair 12/24 (a negative angle: the weight DEcreases by 5 mod 32 per row:
; [27],[22],[17],[12],[7],[2],[29],[24],...). %1 is forwarded to
; TRANSPOSE_STORE_8x8 (presumably the transpose flag).
; The integer offset moves backwards, so the reference is reloaded at
; r2, r2-2, r2-3, r2-4 -- the samples below/left of r2 are assumed to be
; the projected second-reference samples prepared by the caller.
; Preconditions: r2 -> reference pixels, r4 -> weight table midpoint
; (NOTE: this macro uses r4, not r3, for the table), m7 = [pw_1024].
; The final weight-0 row is built with the pb_fact0 shuffle + pmovzxbw
; instead of a movhps copy.
;-----------------------------------------------------------------------------
%macro MODE_12_24 1
movu m2, [r2]
palignr m1, m2, 1
punpckhbw m0, m2, m1
punpcklbw m2, m1
palignr m0, m2, 2
pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
pmulhrsw m4, m7
pmaddubsw m3, m0, [r4 + 6 * 16] ; [22]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m5, m0, [r4 + 16] ; [17]
pmulhrsw m5, m7
pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
pmulhrsw m6, m7
pmaddubsw m3, m0, [r4 - 14 * 16] ; [2]
pmulhrsw m3, m7
packuswb m6, m3
pmaddubsw m1, m2, [r4 + 13 * 16] ; [29]
pmulhrsw m1, m7
pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 ; 8x8 tile 0
pmaddubsw m4, m2, [r4 + 3 * 16] ; [19]
pmulhrsw m4, m7
pmaddubsw m5, m2, [r4 - 2 * 16] ; [14]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m2, [r4 - 7 * 16] ; [09]
pmulhrsw m5, m7
pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
pmulhrsw m6, m7
packuswb m5, m6
movu m0, [r2 - 2] ; step the reference back (negative angle)
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m2, m0, 2
pmaddubsw m6, m2, [r4 + 15 * 16] ; [31]
pmulhrsw m6, m7
pmaddubsw m1, m2, [r4 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m2, [r4 + 5 * 16] ; [21]
pmulhrsw m1, m7
pmaddubsw m3, m2, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 ; 8x8 tile 1
pmaddubsw m4, m2, [r4 - 5 * 16] ; [11]
pmulhrsw m4, m7
pmaddubsw m3, m2, [r4 - 10 * 16] ; [06]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m5, m2, [r4 - 15 * 16] ; [1]
pmulhrsw m5, m7
movu m0, [r2 - 3]
palignr m1, m0, 1
punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m2, m0, 2
pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m2, [r4 + 7 * 16] ; [23]
pmulhrsw m6, m7
pmaddubsw m3, m2, [r4 + 2 * 16] ; [18]
pmulhrsw m3, m7
packuswb m6, m3
pmaddubsw m1, m2, [r4 - 3 * 16] ; [13]
pmulhrsw m1, m7
pmaddubsw m3, m2, [r4 - 8 * 16] ; [8]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 ; 8x8 tile 2
pmaddubsw m4, m2, [r4 - 13 * 16] ; [3]
pmulhrsw m4, m7
movu m2, [r2 - 4]
palignr m1, m2, 1
punpckhbw m0, m2, m1
punpcklbw m2, m1
palignr m0, m2, 2
pmaddubsw m5, m0, [r4 + 14 * 16] ; [30]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m0, [r4 + 9 * 16] ; [25]
pmulhrsw m5, m7
pmaddubsw m6, m0, [r4 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m0, [r4 - 16] ; [15]
pmulhrsw m6, m7
pmaddubsw m1, m0, [r4 - 6 * 16] ; [10]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m0, [r4 - 11 * 16] ; [05]
pmulhrsw m1, m7
movu m2, [pb_fact0]
pshufb m0, m2 ; extract the even (unfiltered) samples for the [00] row
pmovzxbw m0, m0
packuswb m1, m0
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 ; 8x8 tile 3
%endmacro
;------------------------------------------------------------------------------------------
; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;------------------------------------------------------------------------------------------
; Modes 2 and 34 need no interpolation: destination row k is simply the reference
; array shifted by (k + 1) pixels, so each 32-byte row is assembled with two PALIGNRs
; over consecutive 16-byte reference loads.  The left reference lives at r2 + 64
; (mode 2, the default); the unmodified pointer (above reference) is selected via
; CMOVE when dirMode == 34.
INIT_XMM ssse3
cglobal intra_pred_ang32_2, 3,5,4
lea r4, [r2] ; keep original pointer = above reference (mode 34)
add r2, 64 ; default reference = left array
cmp r3m, byte 34
cmove r2, r4 ; mode 34: read the above reference instead
movu m0, [r2 + 2] ; ref[2..17]
movu m1, [r2 + 18] ; ref[18..33]
movu m3, [r2 + 34] ; ref[34..49]
lea r3, [r1 * 3] ; r3 = 3 * stride
; rows 0..15: shift amounts 0..15 across (m0, m1) and (m1, m3)
movu [r0], m0
movu [r0 + 16], m1
palignr m2, m1, m0, 1
movu [r0 + r1], m2
palignr m2, m3, m1, 1
movu [r0 + r1 + 16], m2
palignr m2, m1, m0, 2
movu [r0 + r1 * 2], m2
palignr m2, m3, m1, 2
movu [r0 + r1 * 2 + 16], m2
palignr m2, m1, m0, 3
movu [r0 + r3], m2
palignr m2, m3, m1, 3
movu [r0 + r3 + 16], m2
lea r0, [r0 + r1 * 4]
palignr m2, m1, m0, 4
movu [r0], m2
palignr m2, m3, m1, 4
movu [r0 + 16], m2
palignr m2, m1, m0, 5
movu [r0 + r1], m2
palignr m2, m3, m1, 5
movu [r0 + r1 + 16], m2
palignr m2, m1, m0, 6
movu [r0 + r1 * 2], m2
palignr m2, m3, m1, 6
movu [r0 + r1 * 2 + 16], m2
palignr m2, m1, m0, 7
movu [r0 + r3], m2
palignr m2, m3, m1, 7
movu [r0 + r3 + 16], m2
lea r0, [r0 + r1 * 4]
palignr m2, m1, m0, 8
movu [r0], m2
palignr m2, m3, m1, 8
movu [r0 + 16], m2
palignr m2, m1, m0, 9
movu [r0 + r1], m2
palignr m2, m3, m1, 9
movu [r0 + r1 + 16], m2
palignr m2, m1, m0, 10
movu [r0 + r1 * 2], m2
palignr m2, m3, m1, 10
movu [r0 + r1 * 2 + 16], m2
palignr m2, m1, m0, 11
movu [r0 + r3], m2
palignr m2, m3, m1, 11
movu [r0 + r3 + 16], m2
lea r0, [r0 + r1 * 4]
palignr m2, m1, m0, 12
movu [r0], m2
palignr m2, m3, m1, 12
movu [r0 + 16], m2
palignr m2, m1, m0, 13
movu [r0 + r1], m2
palignr m2, m3, m1, 13
movu [r0 + r1 + 16], m2
palignr m2, m1, m0, 14
movu [r0 + r1 * 2], m2
palignr m2, m3, m1, 14
movu [r0 + r1 * 2 + 16], m2
palignr m2, m1, m0, 15
movu [r0 + r3], m2
palignr m2, m3, m1, 15
movu [r0 + r3 + 16], m2
lea r0, [r0 + r1 * 4]
; rows 16..31: same pattern over (m1, m3) and (m3, m0) with ref[50..65]
movu [r0], m1
movu m0, [r2 + 50] ; ref[50..65]
movu [r0 + 16], m3
palignr m2, m3, m1, 1
movu [r0 + r1], m2
palignr m2, m0, m3, 1
movu [r0 + r1 + 16], m2
palignr m2, m3, m1, 2
movu [r0 + r1 * 2], m2
palignr m2, m0, m3, 2
movu [r0 + r1 * 2 + 16], m2
palignr m2, m3, m1, 3
movu [r0 + r3], m2
palignr m2, m0, m3, 3
movu [r0 + r3 + 16], m2
lea r0, [r0 + r1 * 4]
palignr m2, m3, m1, 4
movu [r0], m2
palignr m2, m0, m3, 4
movu [r0 + 16], m2
palignr m2, m3, m1, 5
movu [r0 + r1], m2
palignr m2, m0, m3, 5
movu [r0 + r1 + 16], m2
palignr m2, m3, m1, 6
movu [r0 + r1 * 2], m2
palignr m2, m0, m3, 6
movu [r0 + r1 * 2 + 16], m2
palignr m2, m3, m1, 7
movu [r0 + r3], m2
palignr m2, m0, m3, 7
movu [r0 + r3 + 16], m2
lea r0, [r0 + r1 * 4]
palignr m2, m3, m1, 8
movu [r0], m2
palignr m2, m0, m3, 8
movu [r0 + 16], m2
palignr m2, m3, m1, 9
movu [r0 + r1], m2
palignr m2, m0, m3, 9
movu [r0 + r1 + 16], m2
palignr m2, m3, m1, 10
movu [r0 + r1 * 2], m2
palignr m2, m0, m3, 10
movu [r0 + r1 * 2 + 16], m2
palignr m2, m3, m1, 11
movu [r0 + r3], m2
palignr m2, m0, m3, 11
movu [r0 + r3 + 16], m2
lea r0, [r0 + r1 * 4]
palignr m2, m3, m1, 12
movu [r0], m2
palignr m2, m0, m3, 12
movu [r0 + 16], m2
palignr m2, m3, m1, 13
movu [r0 + r1], m2
palignr m2, m0, m3, 13
movu [r0 + r1 + 16], m2
palignr m2, m3, m1, 14
movu [r0 + r1 * 2], m2
palignr m2, m0, m3, 14
movu [r0 + r1 * 2 + 16], m2
palignr m2, m3, m1, 15
movu [r0 + r3], m2
palignr m2, m0, m3, 15
movu [r0 + r3 + 16], m2
RET
; Mode 3: the 32x32 block is produced as four 8-pixel strips; each iteration of
; MODE_3_33 (macro defined earlier in this file) interpolates one strip from the
; left reference array (r2 + 64), advancing the reference by 8 pixels per strip.
INIT_XMM sse4
cglobal intra_pred_ang32_3, 3,7,8
add r2, 64 ; r2 -> left reference array
lea r3, [ang_table + 16 * 16] ; r3 -> centre of angular weight table
mov r4d, 4 ; 4 strips
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024] ; rounding constant for pmulhrsw
.loop:
MODE_3_33 1
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8 ; next 8 reference pixels
dec r4
jnz .loop
RET
; Mode 4: same four-strip structure as mode 3, using the MODE_4_32 macro
; (defined earlier in this file) for the per-strip interpolation.
INIT_XMM sse4
cglobal intra_pred_ang32_4, 3,7,8
add r2, 64 ; r2 -> left reference array
lea r3, [ang_table + 16 * 16] ; r3 -> centre of angular weight table
mov r4d, 4 ; 4 strips
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024] ; rounding constant for pmulhrsw
.loop:
MODE_4_32 1
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8 ; next 8 reference pixels
dec r4
jnz .loop
RET
; Mode 5: four-strip loop around the MODE_5_31 macro (defined earlier in this file).
INIT_XMM sse4
cglobal intra_pred_ang32_5, 3,7,8
add r2, 64 ; r2 -> left reference array
lea r3, [ang_table + 16 * 16] ; r3 -> centre of angular weight table
mov r4d, 4 ; 4 strips
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024] ; rounding constant for pmulhrsw
.loop:
MODE_5_31 1
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8 ; next 8 reference pixels
dec r4
jnz .loop
RET
; Mode 6: four-strip loop around the MODE_6_30 macro (defined earlier in this file).
INIT_XMM sse4
cglobal intra_pred_ang32_6, 3,7,8
add r2, 64 ; r2 -> left reference array
lea r3, [ang_table + 16 * 16] ; r3 -> centre of angular weight table
mov r4d, 4 ; 4 strips
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024] ; rounding constant for pmulhrsw
.loop:
MODE_6_30 1
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8 ; next 8 reference pixels
dec r4
jnz .loop
RET
; Mode 7: four-strip loop around the MODE_7_29 macro (defined earlier in this file).
INIT_XMM sse4
cglobal intra_pred_ang32_7, 3,7,8
add r2, 64 ; r2 -> left reference array
lea r3, [ang_table + 16 * 16] ; r3 -> centre of angular weight table
mov r4d, 4 ; 4 strips
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024] ; rounding constant for pmulhrsw
.loop:
MODE_7_29 1
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8 ; next 8 reference pixels
dec r4
jnz .loop
RET
; Mode 8: four-strip loop around the MODE_8_28 macro (defined earlier in this file).
INIT_XMM sse4
cglobal intra_pred_ang32_8, 3,7,8
add r2, 64 ; r2 -> left reference array
lea r3, [ang_table + 16 * 16] ; r3 -> centre of angular weight table
mov r4d, 4 ; 4 strips
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024] ; rounding constant for pmulhrsw
.loop:
MODE_8_28 1
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8 ; next 8 reference pixels
dec r4
jnz .loop
RET
; Mode 9: four-strip loop around the MODE_9_27 macro (defined earlier in this file).
INIT_XMM sse4
cglobal intra_pred_ang32_9, 3,7,8
add r2, 64 ; r2 -> left reference array
lea r3, [ang_table + 16 * 16] ; r3 -> centre of angular weight table
mov r4d, 4 ; 4 strips
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024] ; rounding constant for pmulhrsw
.loop:
MODE_9_27 1
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8 ; next 8 reference pixels
dec r4
jnz .loop
RET
; Mode 10 (pure horizontal): every output row is a 32-wide broadcast of one
; left-neighbour pixel.  PSHUFB against an all-zero mask (m7) broadcasts byte 0
; of each PALIGNR-shifted reference vector.  The loop runs twice, 16 rows per
; iteration.  When bFilter (4th argument, saved in r3d) is non-zero, the first
; row of each iteration additionally gets the HEVC intra edge filter built from
; the above row cached in the stack spills m8/m9.
INIT_XMM sse4
cglobal intra_pred_ang32_10, 5,7,8,0-(2*mmsize)
%define m8 [rsp + 0 * mmsize] ; spill: above[0..15] (corner at byte 0)
%define m9 [rsp + 1 * mmsize] ; spill: above[1..16]
pxor m7, m7 ; all-zero shuffle mask -> broadcast byte 0
mov r6, 2 ; two 16-row iterations
movu m0, [r2]
movu m1, [r2 + 1]
mova m8, m0
mova m9, m1
mov r3d, r4d ; r3d = bFilter flag
lea r4, [r1 * 3] ; r4 = 3 * stride
.loop:
movu m0, [r2 + 1 + 64] ; left[1..16] for this iteration
palignr m1, m0, 1
pshufb m1, m7
palignr m2, m0, 2
pshufb m2, m7
palignr m3, m0, 3
pshufb m3, m7
palignr m4, m0, 4
pshufb m4, m7
palignr m5, m0, 5
pshufb m5, m7
palignr m6, m0, 6
pshufb m6, m7
movu [r0 + r1], m1
movu [r0 + r1 + 16], m1
movu [r0 + r1 * 2], m2
movu [r0 + r1 * 2 + 16], m2
movu [r0 + r4], m3
movu [r0 + r4 + 16], m3
lea r5, [r0 + r1 * 4]
movu [r5], m4
movu [r5 + 16], m4
movu [r5 + r1], m5
movu [r5 + r1 + 16], m5
movu [r5 + r1 * 2], m6
movu [r5 + r1 * 2 + 16], m6
palignr m1, m0, 7
pshufb m1, m7
movhlps m2, m0 ; high half = shift by 8 (same effect as palignr m2, m0, 8 here)
pshufb m2, m7
palignr m3, m0, 9
pshufb m3, m7
palignr m4, m0, 10
pshufb m4, m7
palignr m5, m0, 11
pshufb m5, m7
palignr m6, m0, 12
pshufb m6, m7
movu [r5 + r4], m1
movu [r5 + r4 + 16], m1
lea r5, [r5 + r1 * 4]
movu [r5], m2
movu [r5 + 16], m2
movu [r5 + r1], m3
movu [r5 + r1 + 16], m3
movu [r5 + r1 * 2], m4
movu [r5 + r1 * 2 + 16], m4
movu [r5 + r4], m5
movu [r5 + r4 + 16], m5
lea r5, [r5 + r1 * 4]
movu [r5], m6
movu [r5 + 16], m6
palignr m1, m0, 13
pshufb m1, m7
palignr m2, m0, 14
pshufb m2, m7
palignr m3, m0, 15
pshufb m3, m7
pshufb m0, m7 ; m0 = broadcast of this iteration's first left pixel (row 0)
movu [r5 + r1], m1
movu [r5 + r1 + 16], m1
movu [r5 + r1 * 2], m2
movu [r5 + r1 * 2 + 16], m2
movu [r5 + r4], m3
movu [r5 + r4 + 16], m3
; filter: apply edge filter to the iteration's first row when bFilter != 0
cmp r3d, byte 0
jz .quit
movhlps m1, m0 ; NOTE(review): result is overwritten by 'mova m1, m0' below;
               ; harmless since m0 is a broadcast (all bytes equal), but this
               ; instruction appears redundant
pmovzxbw m0, m0
mova m1, m0
movu m2, m8 ; above row (corner pixel in byte 0)
movu m3, m9 ; above[1..16]
pshufb m2, m7 ; broadcast corner pixel
pmovzxbw m2, m2
movhlps m4, m3
pmovzxbw m3, m3
pmovzxbw m4, m4
psubw m3, m2 ; (above[x] - corner)
psubw m4, m2
psraw m3, 1 ; >> 1 per the HEVC edge filter
psraw m4, 1
paddw m0, m3
paddw m1, m4
packuswb m0, m1 ; clip to 8 bit
.quit:
movu [r0], m0
movu [r0 + 16], m0
dec r6
lea r0, [r5 + r1 * 4]
lea r2, [r2 + 16] ; next 16 left pixels
jnz .loop
RET
; Mode 11: builds a 49-byte reference line in 64-byte-aligned stack scratch
; (broadcast padding at [rsp], corner-pinned above pixels from [rsp + 1]),
; then emits the block as four 8x32 strips via the PROC32_8x8 macro (defined
; earlier in this file).  The loop counter lives in the scratch line at
; [rsp + 63] so all local data shares one cache line.
INIT_XMM sse4
cglobal intra_pred_ang32_11, 4,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
mov r6, rsp
sub rsp, 64+gprsize
and rsp, ~63
mov [rsp+64], r6 ; saved original rsp
; collect reference pixel
movu m0, [r2 + 16]
pxor m1, m1
pshufb m0, m1 ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
mova [rsp], m0 ; padding: broadcast of ref[16]
movu m0, [r2 + 64]
pinsrb m0, [r2], 0 ; pin top-left corner pixel
movu m1, [r2 + 16 + 64]
movu m2, [r2 + 32 + 64]
movu [rsp + 1], m0
movu [rsp + 1 + 16], m1
movu [rsp + 1 + 32], m2
mov [rsp + 63], byte 4 ; strip counter
; filter
lea r2, [rsp + 1] ; r2 -> [0]
lea r3, [c_shuf8_0] ; r3 -> shuffle8
lea r4, [ang_table] ; r4 -> ang_table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m5, [pw_1024] ; m5 -> 1024
mova m6, [c_deinterval8] ; m6 -> c_deinterval8
.loop:
; Row[0 - 7]: all eight source vectors identical; weights select the fractions
movu m7, [r2]
mova m0, m7
mova m1, m7
mova m2, m7
mova m3, m7
mova m4, m7
mova m5, m7
mova m6, m7
PROC32_8x8 0, 1, 30,28,26,24,22,20,18,16
; Row[8 - 15]
movu m7, [r2]
mova m0, m7
mova m1, m7
mova m2, m7
mova m3, m7
mova m4, m7
mova m5, m7
mova m6, m7
PROC32_8x8 1, 1, 14,12,10,8,6,4,2,0
; Row[16 - 23]: reference shifted back by one pixel
movu m7, [r2 - 1]
mova m0, m7
mova m1, m7
mova m2, m7
mova m3, m7
mova m4, m7
mova m5, m7
mova m6, m7
PROC32_8x8 2, 1, 30,28,26,24,22,20,18,16
; Row[24 - 31]
movu m7, [r2 - 1]
mova m0, m7
mova m1, m7
mova m2, m7
mova m3, m7
mova m4, m7
mova m5, m7
mova m6, m7
PROC32_8x8 3, 1, 14,12,10,8,6,4,2,0
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8 ; next 8 reference pixels
dec byte [rsp + 63]
jnz .loop
mov rsp, [rsp+64] ; restore original stack pointer
RET
;-----------------------------------------------------------------------------------------
; MODE_12_24_ROW0 %1
; Emits the first 8-column strip (four transposed 8x8 sub-blocks) for angular
; mode 12 / its mirror mode 24.  'above' (stack %define in the caller) caches the
; inverse-angle projected reference pixels gathered via c_mode32_12_0; the strip
; then interpolates from byte pairs built with punpcklbw and PALIGNR shifts into
; that cache, weighting with ang_table entries around r4 (table centre) and
; rounding with m7 = pw_1024.  %1 == 1 pins the corner pixel [r3] into the row
; reference and is forwarded to TRANSPOSE_STORE_8x8 (store variant — see that
; macro earlier in the file).
;-----------------------------------------------------------------------------------------
%macro MODE_12_24_ROW0 1
movu m0, [r3 + 6]
pshufb m0, [c_mode32_12_0] ; gather projected reference pixels
pinsrb m0, [r3 + 26], 12
mova above, m0 ; cache for later PALIGNR shifts
movu m2, [r2]
%if %1 == 1
pinsrb m2, [r3], 0 ; pin top-left corner pixel
%endif
palignr m1, m2, 1
punpcklbw m2, m1 ; interleave adjacent pixels for pmaddubsw
pmaddubsw m4, m2, [r4 + 11 * 16] ; [27]
pmulhrsw m4, m7
pmaddubsw m3, m2, [r4 + 6 * 16] ; [22]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m5, m2, [r4 + 16] ; [17]
pmulhrsw m5, m7
pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m2, [r4 - 9 * 16] ; [7]
pmulhrsw m6, m7
pmaddubsw m3, m2, [r4 - 14 * 16] ; [2]
pmulhrsw m3, m7
packuswb m6, m3
movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
%if %1 == 1
pinsrb m1, [r3], 0
%endif
palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
pmaddubsw m1, m2, [r4 + 13 * 16] ; [29]
pmulhrsw m1, m7
pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
pmaddubsw m4, m2, [r4 + 3 * 16] ; [19]
pmulhrsw m4, m7
pmaddubsw m5, m2, [r4 - 2 * 16] ; [14]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m2, [r4 - 7 * 16] ; [09]
pmulhrsw m5, m7
pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
pmulhrsw m6, m7
packuswb m5, m6
palignr m2, above, 14 ;[6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
pmaddubsw m6, m2, [r4 + 15 * 16] ; [31]
pmulhrsw m6, m7
pmaddubsw m1, m2, [r4 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m2, [r4 + 5 * 16] ; [21]
pmulhrsw m1, m7
pmaddubsw m3, m2, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
pmaddubsw m4, m2, [r4 - 5 * 16] ; [11]
pmulhrsw m4, m7
pmaddubsw m3, m2, [r4 - 10 * 16] ; [06]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m5, m2, [r4 - 15 * 16] ; [1]
pmulhrsw m5, m7
pslldq m1, above, 1 ; step one further into the projected reference
palignr m2, m1, 14
pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m2, [r4 + 7 * 16] ; [23]
pmulhrsw m6, m7
pmaddubsw m3, m2, [r4 + 2 * 16] ; [18]
pmulhrsw m3, m7
packuswb m6, m3
pmaddubsw m1, m2, [r4 - 3 * 16] ; [13]
pmulhrsw m1, m7
pmaddubsw m3, m2, [r4 - 8 * 16] ; [8]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
pmaddubsw m4, m2, [r4 - 13 * 16] ; [3]
pmulhrsw m4, m7
pslldq m1, above, 2
palignr m2, m1, 14
pmaddubsw m5, m2, [r4 + 14 * 16] ; [30]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m2, [r4 + 9 * 16] ; [25]
pmulhrsw m5, m7
pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m2, [r4 - 16] ; [15]
pmulhrsw m6, m7
pmaddubsw m1, m2, [r4 - 6 * 16] ; [10]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m2, [r4 - 11 * 16] ; [05]
pmulhrsw m1, m7
movu m0, [pb_fact0]
pshufb m2, m0 ; fraction 0: copy reference pixels directly
pmovzxbw m2, m2
packuswb m1, m2
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
; Mode 12: the first 8-column strip needs the inverse-angle projected pixels
; (MODE_12_24_ROW0, which fills the 'above' stack cache); the remaining three
; strips run the generic MODE_12_24 macro (defined elsewhere in this file).
INIT_XMM sse4
cglobal intra_pred_ang32_12, 3,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize] ; cache for projected reference pixels
mov r3, r2 ; r3 -> original src (for corner / projected pixels)
add r2, 64 ; r2 -> left reference array
lea r4, [ang_table + 16 * 16] ; r4 -> centre of angular weight table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024] ; rounding constant for pmulhrsw
MODE_12_24_ROW0 1
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 7 ; first strip consumed 7 reference pixels
mov r3, 3 ; three remaining strips
.loop:
MODE_12_24 1
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8
dec r3
jnz .loop
RET
;-----------------------------------------------------------------------------------------
; MODE_13_23_ROW0 %1
; First 8-column strip for angular mode 13 / mirror mode 23.  The inverse-angle
; projected reference pixels are gathered from [r3 + 1] / [r3 + 15] through the
; c_mode32_13_0 / c_mode32_13_shuf tables and cached in 'above' (stack %define
; in the caller).  PSLLDQ/PALIGNR step through that cache one pixel at a time as
; the interpolation crosses the block origin.  %1 == 1 pins the corner pixel
; [r3] and is forwarded to TRANSPOSE_STORE_8x8.  m7 must hold pw_1024.
;-----------------------------------------------------------------------------------------
%macro MODE_13_23_ROW0 1
movu m0, [r3 + 1]
movu m1, [r3 + 15]
pshufb m0, [c_mode32_13_0]
pshufb m1, [c_mode32_13_0]
punpckldq m0, m1
pshufb m0, [c_mode32_13_shuf]
mova above, m0 ; cache projected reference pixels
movu m2, [r2]
%if (%1 == 1)
pinsrb m2, [r3], 0 ; pin top-left corner pixel
%endif
palignr m1, m2, 1
punpcklbw m2, m1 ; adjacent-pixel pairs for pmaddubsw
pmaddubsw m4, m2, [r4 + 7 * 16] ; [23]
pmulhrsw m4, m7
pmaddubsw m3, m2, [r4 - 2 * 16] ; [14]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m5, m2, [r4 - 11 * 16] ; [5]
pmulhrsw m5, m7
movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
%if (%1 == 1)
pinsrb m1, [r3], 0
%endif
palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m2, [r4 + 3 * 16] ; [19]
pmulhrsw m6, m7
pmaddubsw m0, m2, [r4 - 6 * 16] ; [10]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m2, [r4 - 15 * 16] ; [1]
pmulhrsw m1, m7
palignr m2, above, 14 ; shift one projected pixel further
pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
pmaddubsw m4, m2, [r4 - 16] ; [15]
pmulhrsw m4, m7
pmaddubsw m5, m2, [r4 - 10 * 16] ; [6]
pmulhrsw m5, m7
packuswb m4, m5
pslldq m0, above, 1
palignr m2, m0, 14
pmaddubsw m5, m2, [r4 + 13 * 16] ; [29]
pmulhrsw m5, m7
pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m2, [r4 - 5 * 16] ; [11]
pmulhrsw m6, m7
pmaddubsw m1, m2, [r4 - 14 * 16] ; [2]
pmulhrsw m1, m7
packuswb m6, m1
pslldq m0, 1
palignr m2, m0, 14
pmaddubsw m1, m2, [r4 + 9 * 16] ; [25]
pmulhrsw m1, m7
pmaddubsw m0, m2, [r4] ; [16]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
pmaddubsw m4, m2, [r4 - 9 * 16] ; [7]
pmulhrsw m4, m7
pslldq m0, above, 3
palignr m2, m0, 14
pmaddubsw m3, m2, [r4 + 14 * 16] ; [30]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m5, m2, [r4 + 5 * 16] ; [21]
pmulhrsw m5, m7
pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m2, [r4 - 13 * 16] ; [3]
pmulhrsw m6, m7
pslldq m0, 1
palignr m2, m0, 14
pmaddubsw m0, m2, [r4 + 10 * 16] ; [26]
pmulhrsw m0, m7
packuswb m6, m0
pmaddubsw m1, m2, [r4 + 16] ; [17]
pmulhrsw m1, m7
pmaddubsw m0, m2, [r4 - 8 * 16] ; [8]
pmulhrsw m0, m7
packuswb m1, m0
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
pslldq m0, above, 5
palignr m2, m0, 14
pmaddubsw m4, m2, [r4 + 15 * 16] ; [31]
pmulhrsw m4, m7
pmaddubsw m5, m2, [r4 + 6 * 16] ; [22]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m2, [r4 - 3 * 16] ; [13]
pmulhrsw m5, m7
pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
pmulhrsw m6, m7
packuswb m5, m6
pslldq m0, 1
palignr m2, m0, 14
pmaddubsw m6, m2, [r4 + 11 * 16] ; [27]
pmulhrsw m6, m7
pmaddubsw m1, m2, [r4 + 2 * 16] ; [18]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m2, [r4 - 7 * 16] ; [09]
pmulhrsw m1, m7
pmaddubsw m3, m2, [r4 - 16 * 16] ; [00]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
;-----------------------------------------------------------------------------------------
; MODE_13_23 %1, %2
; Generic (non-row-0) 8-column strip for angular mode 13 / mirror mode 23.  The
; reference window walks backwards ([r2], [r2-2], [r2-4], [r2-6], [r2-7]) as the
; interpolation fraction wraps; punpckhbw/punpcklbw + PALIGNR rebuild the
; adjacent-pixel pair vectors at each step.  %1 is forwarded to
; TRANSPOSE_STORE_8x8; the corner pixel [r3] is pinned only when both %1 and %2
; are 1 (the strip adjacent to row 0).  m7 must hold pw_1024.
;-----------------------------------------------------------------------------------------
%macro MODE_13_23 2
movu m2, [r2] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
pmaddubsw m4, m0, [r4 + 7 * 16] ; [23]
pmulhrsw m4, m7
pmaddubsw m3, m0, [r4 - 2 * 16] ; [14]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m5, m0, [r4 - 11 * 16] ; [05]
pmulhrsw m5, m7
pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m2, [r4 + 3 * 16] ; [19]
pmulhrsw m6, m7
pmaddubsw m3, m2, [r4 - 6 * 16] ; [10]
pmulhrsw m3, m7
packuswb m6, m3
pmaddubsw m1, m2, [r4 - 15 * 16] ; [1]
pmulhrsw m1, m7
movu m2, [r2 - 2] ; [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1]
palignr m3, m2, 1 ; [x, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
punpckhbw m0, m2, m3
punpcklbw m2, m3
palignr m0, m2, 2
pmaddubsw m3, m0, [r4 + 8 * 16] ; [24]
pmulhrsw m3, m7
packuswb m1, m3
mova m3, m0 ; preserve pair vector across the transpose-store
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 - 16] ; [15]
pmulhrsw m4, m7
pmaddubsw m5, m3, [r4 - 10 * 16] ; [6]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m2, [r4 + 13 * 16] ; [29]
pmulhrsw m5, m7
pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m2, [r4 - 5 * 16] ; [11]
pmulhrsw m6, m7
pmaddubsw m1, m2, [r4 - 14 * 16] ; [2]
pmulhrsw m1, m7
packuswb m6, m1
movu m2, [r2 - 4] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
pmaddubsw m1, m0, [r4 + 9 * 16] ; [25]
pmulhrsw m1, m7
pmaddubsw m3, m0, [r4] ; [16]
pmulhrsw m3, m7
packuswb m1, m3
mova m3, m0
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
pmaddubsw m4, m3, [r4 - 9 * 16] ; [7]
pmulhrsw m4, m7
pmaddubsw m3, m2, [r4 + 14 * 16] ; [30]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m5, m2, [r4 + 5 * 16] ; [21]
pmulhrsw m5, m7
pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6
pmaddubsw m6, m2, [r4 - 13 * 16] ; [3]
pmulhrsw m6, m7
movu m2, [r2 - 6] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
pmaddubsw m3, m0, [r4 + 10 * 16] ; [26]
pmulhrsw m3, m7
packuswb m6, m3
pmaddubsw m1, m0, [r4 + 16] ; [17]
pmulhrsw m1, m7
pmaddubsw m3, m0, [r4 - 8 * 16] ; [8]
pmulhrsw m3, m7
packuswb m1, m3
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
pmaddubsw m4, m2, [r4 + 15 * 16] ; [31]
pmulhrsw m4, m7
pmaddubsw m5, m2, [r4 + 6 * 16] ; [22]
pmulhrsw m5, m7
packuswb m4, m5
pmaddubsw m5, m2, [r4 - 3 * 16] ; [13]
pmulhrsw m5, m7
pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
pmulhrsw m6, m7
packuswb m5, m6
movu m2, [r2 - 7] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
%if ((%1 & %2) == 1)
pinsrb m2, [r3], 0 ; pin top-left corner pixel
%endif
palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
pmaddubsw m6, m2, [r4 + 11 * 16] ; [27]
pmulhrsw m6, m7
pmaddubsw m1, m2, [r4 + 2 * 16] ; [18]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m1, m2, [r4 - 7 * 16] ; [09]
pmulhrsw m1, m7
movu m0, [pb_fact0]
pshufb m2, m0 ; fraction 0: copy reference pixels directly
pmovzxbw m2, m2
packuswb m1, m2
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
; Mode 13: row-0 strip via MODE_13_23_ROW0 (needs projected pixels), one
; corner-adjacent strip via MODE_13_23 1,1 (pins the corner), then two generic
; strips via MODE_13_23 1,0.
INIT_XMM sse4
cglobal intra_pred_ang32_13, 3,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize] ; cache for projected reference pixels
mov r3, r2 ; r3 -> original src (corner / projected pixels)
add r2, 64 ; r2 -> left reference array
lea r4, [ang_table + 16 * 16] ; r4 -> centre of angular weight table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024] ; rounding constant for pmulhrsw
MODE_13_23_ROW0 1
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 7 ; first strip consumed 7 reference pixels
MODE_13_23 1, 1
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8
mov r3, 2 ; two remaining strips
.loop:
MODE_13_23 1, 0
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8
dec r3
jnz .loop
RET
; Mode 14: gathers the inverse-angle projected pixels with c_mode32_14_0 into
; 64-byte-aligned stack scratch ahead of the left reference, then emits four
; 8x32 strips with PROC32_8x8 using PALIGNR-shifted reference windows; the
; per-row shift pattern and fractions follow the mode-14 angle (13/32 slope).
INIT_XMM sse4
cglobal intra_pred_ang32_14, 3,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
mov r6, rsp
sub rsp, 64+gprsize
and rsp, ~63
mov [rsp+64], r6 ; saved original rsp
; collect reference pixel
movu m0, [r2]
movu m1, [r2 + 15]
pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15]
pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30]
pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x x]
palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
mova [rsp], m0
movu m0, [r2 + 1 + 64]
movu m1, [r2 + 1 + 16 + 64]
movu [rsp + 13], m0
movu [rsp + 13 + 16], m1
mov [rsp + 63], byte 4 ; strip counter
; filter
lea r2, [rsp + 13] ; r2 -> [0]
lea r3, [c_shuf8_0] ; r3 -> shuffle8
lea r4, [ang_table] ; r4 -> ang_table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m5, [pw_1024] ; m5 -> 1024
mova m6, [c_deinterval8] ; m6 -> c_deinterval8
.loop:
; Row[0 - 7]
movu m7, [r2 - 4]
palignr m0, m7, 3
mova m1, m0
palignr m2, m7, 2
mova m3, m2
palignr m4, m7, 1
mova m5, m4
mova m6, m4
PROC32_8x8 0, 1, 19,6,25,12,31,18,5,24
; Row[8 - 15]
movu m7, [r2 - 7]
palignr m0, m7, 3
palignr m1, m7, 2
mova m2, m1
mova m3, m1
palignr m4, m7, 1
mova m5, m4
mova m6, m7
PROC32_8x8 1, 1, 11,30,17,4,23,10,29,16
; Row[16 - 23]
movu m7, [r2 - 10]
palignr m0, m7, 3
palignr m1, m7, 2
mova m2, m1
palignr m3, m7, 1
mova m4, m3
mova m5, m3
mova m6, m7
PROC32_8x8 2, 1, 3,22,9,28,15,2,21,8
; Row[24 - 31]
movu m7, [r2 - 13]
palignr m0, m7, 2
mova m1, m0
mova m2, m0
palignr m3, m7, 1
mova m4, m3
mova m5, m7
mova m6, m7
PROC32_8x8 3, 1, 27,14,1,20,7,26,13,0
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8 ; next 8 reference pixels
dec byte [rsp + 63]
jnz .loop
mov rsp, [rsp+64] ; restore original stack pointer
RET
; Mode 15: same structure as mode 14 but with the c_mode32_15_0 projection table
; and the mode-15 shift/fraction pattern (17/32 slope).
INIT_XMM sse4
cglobal intra_pred_ang32_15, 4,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
mov r6, rsp
sub rsp, 64+gprsize
and rsp, ~63
mov [rsp+64], r6 ; saved original rsp
; collect reference pixel
movu m0, [r2]
movu m1, [r2 + 15]
pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
mova [rsp], m1
movu [rsp + 8], m0
movu m0, [r2 + 1 + 64]
movu m1, [r2 + 1 + 16 + 64]
movu [rsp + 17], m0
movu [rsp + 17 + 16], m1
mov [rsp + 63], byte 4 ; strip counter
; filter
lea r2, [rsp + 17] ; r2 -> [0]
lea r3, [c_shuf8_0] ; r3 -> shuffle8
lea r4, [ang_table] ; r4 -> ang_table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m5, [pw_1024] ; m5 -> 1024
mova m6, [c_deinterval8] ; m6 -> c_deinterval8
.loop:
; Row[0 - 7]
movu m7, [r2 - 5]
palignr m0, m7, 4
palignr m1, m7, 3
mova m2, m1
palignr m3, m7, 2
mova m4, m3
palignr m5, m7, 1
mova m6, m5
PROC32_8x8 0, 1, 15,30,13,28,11,26,9,24
; Row[8 - 15]
movu m7, [r2 - 9]
palignr m0, m7, 4
palignr m1, m7, 3
mova m2, m1
palignr m3, m7, 2
mova m4, m3
palignr m5, m7, 1
mova m6, m5
PROC32_8x8 1, 1, 7,22,5,20,3,18,1,16
; Row[16 - 23]
movu m7, [r2 - 13]
palignr m0, m7, 3
mova m1, m0
palignr m2, m7, 2
mova m3, m2
palignr m4, m7, 1
mova m5, m4
mova m6, m7
PROC32_8x8 2, 1, 31,14,29,12,27,10,25,8
; Row[24 - 31]
movu m7, [r2 - 17]
palignr m0, m7, 3
mova m1, m0
palignr m2, m7, 2
mova m3, m2
palignr m4, m7, 1
mova m5, m4
mova m6, m7
PROC32_8x8 3, 1, 23,6,21,4,19,2,17,0
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8 ; next 8 reference pixels
dec byte [rsp + 63]
jnz .loop
mov rsp, [rsp+64] ; restore original stack pointer
RET
; Mode 16: same structure as modes 14/15 with the c_mode32_16_0 projection table
; and the mode-16 shift/fraction pattern (21/32 slope).
INIT_XMM sse4
cglobal intra_pred_ang32_16, 4,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
mov r6, rsp
sub rsp, 64+gprsize
and rsp, ~63
mov [rsp+64], r6 ; saved original rsp
; collect reference pixel
movu m0, [r2]
movu m1, [r2 + 15]
pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
mova [rsp], m1
movu [rsp + 10], m0
movu m0, [r2 + 1 + 64]
movu m1, [r2 + 1 + 16 + 64]
movu [rsp + 21], m0
movu [rsp + 21 + 16], m1
mov [rsp + 63], byte 4 ; strip counter
; filter
lea r2, [rsp + 21] ; r2 -> [0]
lea r3, [c_shuf8_0] ; r3 -> shuffle8
lea r4, [ang_table] ; r4 -> ang_table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m5, [pw_1024] ; m5 -> 1024
mova m6, [c_deinterval8] ; m6 -> c_deinterval8
.loop:
; Row[0 - 7]
movu m7, [r2 - 6]
palignr m0, m7, 5
palignr m1, m7, 4
mova m2, m1
palignr m3, m7, 3
palignr m4, m7, 2
mova m5, m4
palignr m6, m7, 1
PROC32_8x8 0, 1, 11,22,1,12,23,2,13,24
; Row[8 - 15]
movu m7, [r2 - 11]
palignr m0, m7, 5
palignr m1, m7, 4
palignr m2, m7, 3
mova m3, m2
palignr m4, m7, 2
palignr m5, m7, 1
mova m6, m5
PROC32_8x8 1, 1, 3,14,25,4,15,26,5,16
; Row[16 - 23]
movu m7, [r2 - 16]
palignr m0, m7, 4
mova m1, m0
palignr m2, m7, 3
palignr m3, m7, 2
mova m4, m3
palignr m5, m7, 1
mova m6, m7
PROC32_8x8 2, 1, 27,6,17,28,7,18,29,8
; Row[24 - 31]
movu m7, [r2 - 21]
palignr m0, m7, 4
palignr m1, m7, 3
mova m2, m1
palignr m3, m7, 2
palignr m4, m7, 1
mova m5, m4
mova m6, m7
PROC32_8x8 3, 1, 19,30,9,20,31,10,21,0
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8 ; next 8 reference pixels
dec byte [rsp + 63]
jnz .loop
mov rsp, [rsp+64] ; restore original stack pointer
RET
; Mode 17: same structure as modes 14..16 with the c_mode32_17_0 projection
; table and the mode-17 shift/fraction pattern (26/32 slope); the two strip
; halves reuse the same fraction sequence.
INIT_XMM sse4
cglobal intra_pred_ang32_17, 4,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
mov r6, rsp
sub rsp, 64+gprsize
and rsp, ~63
mov [rsp+64], r6 ; saved original rsp
; collect reference pixel
movu m0, [r2]
movu m1, [r2 + 16]
pshufb m0, [c_mode32_17_0]
pshufb m1, [c_mode32_17_0]
mova [rsp ], m1
movu [rsp + 13], m0
movu m0, [r2 + 1 + 64]
movu m1, [r2 + 1 + 16 + 64]
movu [rsp + 26], m0
movu [rsp + 26 + 16], m1
mov [rsp + 63], byte 4 ; strip counter
; filter
lea r2, [rsp + 25] ; r2 -> [0]
lea r3, [c_shuf8_0] ; r3 -> shuffle8
lea r4, [ang_table] ; r4 -> ang_table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m5, [pw_1024] ; m5 -> 1024
mova m6, [c_deinterval8] ; m6 -> c_deinterval8
.loop:
; Row[0 - 7]
movu m7, [r2 - 6]
palignr m0, m7, 6
palignr m1, m7, 5
palignr m2, m7, 4
palignr m3, m7, 3
palignr m4, m7, 2
mova m5, m4
palignr m6, m7, 1
PROC32_8x8 0, 1, 6,12,18,24,30,4,10,16
; Row[7 - 15]
movu m7, [r2 - 12]
palignr m0, m7, 5
palignr m1, m7, 4
mova m2, m1
palignr m3, m7, 3
palignr m4, m7, 2
palignr m5, m7, 1
mova m6, m7
PROC32_8x8 1, 1, 22,28,2,8,14,20,26,0
; Row[16 - 23]
movu m7, [r2 - 19]
palignr m0, m7, 6
palignr m1, m7, 5
palignr m2, m7, 4
palignr m3, m7, 3
palignr m4, m7, 2
mova m5, m4
palignr m6, m7, 1
PROC32_8x8 2, 1, 6,12,18,24,30,4,10,16
; Row[24 - 31]
movu m7, [r2 - 25]
palignr m0, m7, 5
palignr m1, m7, 4
mova m2, m1
palignr m3, m7, 3
palignr m4, m7, 2
palignr m5, m7, 1
mova m6, m7
PROC32_8x8 3, 1, 22,28,2,8,14,20,26,0
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8 ; next 8 reference pixels
dec byte [rsp + 63]
jnz .loop
mov rsp, [rsp+64] ; restore original stack pointer
RET
; Mode 18 (pure diagonal, AVX2): row k is the above row shifted right by k, with
; the reversed left pixels (intra_pred_shuff_15_0) feeding in from the left.
; Each 32-byte row is one cross-lane PALIGNR of (above, reversed-left) vectors.
INIT_YMM avx2
cglobal intra_pred_ang32_18, 4, 4, 3
movu m0, [r2] ; above[0..31]
movu xm1, [r2 + 1 + 64] ; left[1..16]
pshufb xm1, [intra_pred_shuff_15_0] ; reverse -> left[16..1]
mova xm2, xm0
vinserti128 m1, m1, xm2, 1 ; m1 = { reversed left[16..1] | above[0..15] }
lea r3, [r1 * 3] ; r3 = 3 * stride
movu [r0], m0
palignr m2, m0, m1, 15
movu [r0 + r1], m2
palignr m2, m0, m1, 14
movu [r0 + r1 * 2], m2
palignr m2, m0, m1, 13
movu [r0 + r3], m2
lea r0, [r0 + r1 * 4]
palignr m2, m0, m1, 12
movu [r0], m2
palignr m2, m0, m1, 11
movu [r0 + r1], m2
palignr m2, m0, m1, 10
movu [r0 + r1 * 2], m2
palignr m2, m0, m1, 9
movu [r0 + r3], m2
lea r0, [r0 + r1 * 4]
palignr m2, m0, m1, 8
movu [r0], m2
palignr m2, m0, m1, 7
movu [r0 + r1], m2
palignr m2, m0, m1, 6
movu [r0 + r1 * 2], m2
palignr m2, m0, m1, 5
movu [r0 + r3], m2
lea r0, [r0 + r1 * 4]
palignr m2, m0, m1, 4
movu [r0], m2
palignr m2, m0, m1, 3
movu [r0 + r1], m2
palignr m2, m0, m1, 2
movu [r0 + r1 * 2], m2
palignr m2, m0, m1, 1
movu [r0 + r3], m2
lea r0, [r0 + r1 * 4]
movu [r0], m1
; rows 17..31: bring in reversed left[32..17]
movu xm0, [r2 + 64 + 17] ; left[17..32]
pshufb xm0, [intra_pred_shuff_15_0] ; reverse -> left[32..17]
vinserti128 m0, m0, xm1, 1
palignr m2, m1, m0, 15
movu [r0 + r1], m2
palignr m2, m1, m0, 14
movu [r0 + r1 * 2], m2
palignr m2, m1, m0, 13
movu [r0 + r3], m2
lea r0, [r0 + r1 * 4]
palignr m2, m1, m0, 12
movu [r0], m2
palignr m2, m1, m0, 11
movu [r0 + r1], m2
palignr m2, m1, m0, 10
movu [r0 + r1 * 2], m2
palignr m2, m1, m0, 9
movu [r0 + r3], m2
lea r0, [r0 + r1 * 4]
palignr m2, m1, m0, 8
movu [r0], m2
palignr m2, m1, m0, 7
movu [r0 + r1], m2
palignr m2, m1, m0,6
movu [r0 + r1 * 2], m2
palignr m2, m1, m0, 5
movu [r0 + r3], m2
lea r0, [r0 + r1 * 4]
palignr m2, m1, m0, 4
movu [r0], m2
palignr m2, m1, m0, 3
movu [r0 + r1], m2
palignr m2, m1, m0,2
movu [r0 + r1 * 2], m2
palignr m2, m1, m0, 1
movu [r0 + r3], m2
RET
; Mode 18 (pure diagonal, SSE4): row k is the above row shifted right by k, with
; the reversed left pixels (c_mode32_18_0) feeding in from the left.  Each
; 32-byte row is two PALIGNRs over the four cached 16-byte reference vectors.
INIT_XMM sse4
cglobal intra_pred_ang32_18, 4,5,5
movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
movu m1, [r2 + 16] ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16]
movu m2, [r2 + 1 + 64] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m3, [r2 + 17 + 64] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
lea r2, [r1 * 2] ; r2 = 2 * stride
lea r3, [r1 * 3] ; r3 = 3 * stride
lea r4, [r1 * 4] ; r4 = 4 * stride
movu [r0], m0
movu [r0 + 16], m1
pshufb m2, [c_mode32_18_0] ; [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
pshufb m3, [c_mode32_18_0] ; [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]
palignr m4, m0, m2, 15
movu [r0 + r1], m4
palignr m4, m1, m0, 15
movu [r0 + r1 + 16], m4
palignr m4, m0, m2, 14
movu [r0 + r2], m4
palignr m4, m1, m0, 14
movu [r0 + r2 + 16], m4
palignr m4, m0, m2, 13
movu [r0 + r3], m4
palignr m4, m1, m0, 13
movu [r0 + r3 + 16], m4
lea r0, [r0 + r4]
palignr m4, m0, m2, 12
movu [r0], m4
palignr m4, m1, m0, 12
movu [r0 + 16], m4
palignr m4, m0, m2, 11
movu [r0 + r1], m4
palignr m4, m1, m0, 11
movu [r0 + r1 + 16], m4
palignr m4, m0, m2, 10
movu [r0 + r2], m4
palignr m4, m1, m0, 10
movu [r0 + r2 + 16], m4
palignr m4, m0, m2, 9
movu [r0 + r3], m4
palignr m4, m1, m0, 9
movu [r0 + r3 + 16], m4
lea r0, [r0 + r4]
palignr m4, m0, m2, 8
movu [r0], m4
palignr m4, m1, m0, 8
movu [r0 + 16], m4
palignr m4, m0, m2, 7
movu [r0 + r1], m4
palignr m4, m1, m0, 7
movu [r0 + r1 + 16], m4
palignr m4, m0, m2, 6
movu [r0 + r2], m4
palignr m4, m1, m0, 6
movu [r0 + r2 + 16], m4
palignr m4, m0, m2, 5
movu [r0 + r3], m4
palignr m4, m1, m0, 5
movu [r0 + r3 + 16], m4
lea r0, [r0 + r4]
palignr m4, m0, m2, 4
movu [r0], m4
palignr m4, m1, m0, 4
movu [r0 + 16], m4
palignr m4, m0, m2, 3
movu [r0 + r1], m4
palignr m4, m1, m0, 3
movu [r0 + r1 + 16], m4
palignr m4, m0, m2, 2
movu [r0 + r2], m4
palignr m4, m1, m0, 2
movu [r0 + r2 + 16], m4
palignr m4, m0, m2, 1
movu [r0 + r3], m4
palignr m4, m1, m0, 1
movu [r0 + r3 + 16], m4
lea r0, [r0 + r4]
; rows 16..31: shift window into (m3, m2) and (m2, m0)
movu [r0], m2
movu [r0 + 16], m0
palignr m4, m2, m3, 15
movu [r0 + r1], m4
palignr m4, m0, m2, 15
movu [r0 + r1 + 16], m4
palignr m4, m2, m3, 14
movu [r0 + r2], m4
palignr m4, m0, m2, 14
movu [r0 + r2 + 16], m4
palignr m4, m2, m3, 13
movu [r0 + r3], m4
palignr m4, m0, m2, 13
movu [r0 + r3 + 16], m4
lea r0, [r0 + r4]
palignr m4, m2, m3, 12
movu [r0], m4
palignr m4, m0, m2, 12
movu [r0 + 16], m4
palignr m4, m2, m3, 11
movu [r0 + r1], m4
palignr m4, m0, m2, 11
movu [r0 + r1 + 16], m4
palignr m4, m2, m3, 10
movu [r0 + r2], m4
palignr m4, m0, m2, 10
movu [r0 + r2 + 16], m4
palignr m4, m2, m3, 9
movu [r0 + r3], m4
palignr m4, m0, m2, 9
movu [r0 + r3 + 16], m4
lea r0, [r0 + r4]
palignr m4, m2, m3, 8
movu [r0], m4
palignr m4, m0, m2, 8
movu [r0 + 16], m4
palignr m4, m2, m3, 7
movu [r0 + r1], m4
palignr m4, m0, m2, 7
movu [r0 + r1 + 16], m4
palignr m4, m2, m3, 6
movu [r0 + r2], m4
palignr m4, m0, m2, 6
movu [r0 + r2 + 16], m4
palignr m4, m2, m3, 5
movu [r0 + r3], m4
palignr m4, m0, m2, 5
movu [r0 + r3 + 16], m4
lea r0, [r0 + r4]
palignr m4, m2, m3, 4
movu [r0], m4
palignr m4, m0, m2, 4
movu [r0 + 16], m4
palignr m4, m2, m3, 3
movu [r0 + r1], m4
palignr m4, m0, m2, 3
movu [r0 + r1 + 16], m4
palignr m4, m2, m3, 2
movu [r0 + r2], m4
palignr m4, m0, m2, 2
movu [r0 + r2 + 16], m4
palignr m4, m2, m3, 1
movu [r0 + r3], m4
palignr m4, m0, m2, 1
movu [r0 + r3 + 16], m4
RET
; Mode 19: mirror of mode 17 — projected pixels come from the LEFT array
; (r2 + 64, corner pinned) while interpolation reads the above row, and
; PROC32_8x8's second parameter is 0 (no transpose on store); output advances
; by columns (r6 += 8) instead of rows.
INIT_XMM sse4
cglobal intra_pred_ang32_19, 4,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
mov r6, rsp
sub rsp, 64+gprsize
and rsp, ~63
mov [rsp+64], r6 ; saved original rsp
; collect reference pixel
movu m0, [r2 + 64]
pinsrb m0, [r2], 0 ; pin top-left corner pixel
movu m1, [r2 + 16 + 64]
pshufb m0, [c_mode32_17_0]
pshufb m1, [c_mode32_17_0]
mova [rsp ], m1
movu [rsp + 13], m0
movu m0, [r2 + 1]
movu m1, [r2 + 1 + 16]
movu [rsp + 26], m0
movu [rsp + 26 + 16], m1
mov [rsp + 63], byte 4 ; strip counter
; filter
lea r2, [rsp + 25] ; r2 -> [0]
lea r3, [c_shuf8_0] ; r3 -> shuffle8
lea r4, [ang_table] ; r4 -> ang_table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0] ; r6 -> r0
mova m5, [pw_1024] ; m5 -> 1024
mova m6, [c_deinterval8] ; m6 -> c_deinterval8
.loop:
; Row[0 - 7]
movu m7, [r2 - 6]
palignr m0, m7, 6
palignr m1, m7, 5
palignr m2, m7, 4
palignr m3, m7, 3
palignr m4, m7, 2
mova m5, m4
palignr m6, m7, 1
PROC32_8x8 0, 0, 6,12,18,24,30,4,10,16
; Row[7 - 15]
movu m7, [r2 - 12]
palignr m0, m7, 5
palignr m1, m7, 4
mova m2, m1
palignr m3, m7, 3
palignr m4, m7, 2
palignr m5, m7, 1
mova m6, m7
lea r0, [r0 + r1 * 4]
PROC32_8x8 1, 0, 22,28,2,8,14,20,26,0
; Row[16 - 23]
movu m7, [r2 - 19]
palignr m0, m7, 6
palignr m1, m7, 5
palignr m2, m7, 4
palignr m3, m7, 3
palignr m4, m7, 2
mova m5, m4
palignr m6, m7, 1
lea r0, [r0 + r1 * 4]
PROC32_8x8 2, 0, 6,12,18,24,30,4,10,16
; Row[24 - 31]
movu m7, [r2 - 25]
palignr m0, m7, 5
palignr m1, m7, 4
mova m2, m1
palignr m3, m7, 3
palignr m4, m7, 2
palignr m5, m7, 1
mova m6, m7
lea r0, [r0 + r1 * 4]
PROC32_8x8 3, 0, 22,28,2,8,14,20,26,0
add r6, 8 ; next 8 output columns
mov r0, r6
add r2, 8 ; next 8 reference pixels
dec byte [rsp + 63]
jnz .loop
mov rsp, [rsp+64] ; restore original stack pointer
RET
; Mode 20: mirror of mode 16 — projected pixels from the left array (corner
; pinned), interpolation over the above row, PROC32_8x8 stored untransposed,
; output advancing by columns.
INIT_XMM sse4
cglobal intra_pred_ang32_20, 4,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
mov r6, rsp
sub rsp, 64+gprsize
and rsp, ~63
mov [rsp+64], r6 ; saved original rsp
; collect reference pixel
movu m0, [r2 + 64]
pinsrb m0, [r2], 0 ; pin top-left corner pixel
movu m1, [r2 + 15 + 64]
pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
mova [rsp], m1
movu [rsp + 10], m0
movu m0, [r2 + 1]
movu m1, [r2 + 1 + 16]
movu [rsp + 21], m0
movu [rsp + 21 + 16], m1
mov [rsp + 63], byte 4 ; strip counter
; filter
lea r2, [rsp + 21] ; r2 -> [0]
lea r3, [c_shuf8_0] ; r3 -> shuffle8
lea r4, [ang_table] ; r4 -> ang_table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0] ; r6 -> r0
mova m5, [pw_1024] ; m5 -> 1024
mova m6, [c_deinterval8] ; m6 -> c_deinterval8
.loop:
; Row[0 - 7]
movu m7, [r2 - 6]
palignr m0, m7, 5
palignr m1, m7, 4
mova m2, m1
palignr m3, m7, 3
palignr m4, m7, 2
mova m5, m4
palignr m6, m7, 1
PROC32_8x8 0, 0, 11,22,1,12,23,2,13,24
; Row[8 - 15]
movu m7, [r2 - 11]
palignr m0, m7, 5
palignr m1, m7, 4
palignr m2, m7, 3
mova m3, m2
palignr m4, m7, 2
palignr m5, m7, 1
mova m6, m5
lea r0, [r0 + r1 * 4]
PROC32_8x8 1, 0, 3,14,25,4,15,26,5,16
; Row[16 - 23]
movu m7, [r2 - 16]
palignr m0, m7, 4
mova m1, m0
palignr m2, m7, 3
palignr m3, m7, 2
mova m4, m3
palignr m5, m7, 1
mova m6, m7
lea r0, [r0 + r1 * 4]
PROC32_8x8 2, 0, 27,6,17,28,7,18,29,8
; Row[24 - 31]
movu m7, [r2 - 21]
palignr m0, m7, 4
palignr m1, m7, 3
mova m2, m1
palignr m3, m7, 2
palignr m4, m7, 1
mova m5, m4
mova m6, m7
lea r0, [r0 + r1 * 4]
PROC32_8x8 3, 0, 19,30,9,20,31,10,21,0
add r6, 8 ; next 8 output columns
mov r0, r6
add r2, 8 ; next 8 reference pixels
dec byte [rsp + 63]
jnz .loop
mov rsp, [rsp+64] ; restore original stack pointer
RET
INIT_XMM sse4
;-----------------------------------------------------------------------------
; intra_pred_ang32_21 — 32x32 intra angular prediction, mode 21, 8-bit (SSE4).
; Same structure as mode 20 above: project the reference with c_mode32_15_0,
; assemble a contiguous reference row in aligned stack scratch, then run
; four PROC32_8x8 passes per 8-wide strip (strip count at [rsp + 63]).
; r0 = dst, r1 = dstStride, r2 = srcPix (above at [r2], left at [r2 + 64]).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_21, 4,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
mov r6, rsp
sub rsp, 64+gprsize
and rsp, ~63
mov [rsp+64], r6 ; save caller rsp
; collect reference pixel
movu m0, [r2 + 64]
pinsrb m0, [r2], 0 ; corner pixel into lane 0
movu m1, [r2 + 15 + 64]
pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
mova [rsp], m1
movu [rsp + 8], m0
movu m0, [r2 + 1]
movu m1, [r2 + 1 + 16]
movu [rsp + 17], m0
movu [rsp + 17 + 16], m1
mov [rsp + 63], byte 4 ; strip counter
; filter
lea r2, [rsp + 17] ; r2 -> [0]
lea r3, [c_shuf8_0] ; r3 -> shuffle8
lea r4, [ang_table] ; r4 -> ang_table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0] ; r6 -> r0
mova m5, [pw_1024] ; m5 -> 1024
mova m6, [c_deinterval8] ; m6 -> c_deinterval8
.loop:
; Row[0 - 7]
movu m7, [r2 - 5]
palignr m0, m7, 4
palignr m1, m7, 3
mova m2, m1
palignr m3, m7, 2
mova m4, m3
palignr m5, m7, 1
mova m6, m5
PROC32_8x8 0, 0, 15,30,13,28,11,26,9,24
; Row[8 - 15]
movu m7, [r2 - 9]
palignr m0, m7, 4
palignr m1, m7, 3
mova m2, m1
palignr m3, m7, 2
mova m4, m3
palignr m5, m7, 1
mova m6, m5
lea r0, [r0 + r1 * 4]
PROC32_8x8 1, 0, 7,22,5,20,3,18,1,16
; Row[16 - 23]
movu m7, [r2 - 13]
palignr m0, m7, 3
mova m1, m0
palignr m2, m7, 2
mova m3, m2
palignr m4, m7, 1
mova m5, m4
mova m6, m7
lea r0, [r0 + r1 * 4]
PROC32_8x8 2, 0, 31,14,29,12,27,10,25,8
; Row[24 - 31]
movu m7, [r2 - 17]
palignr m0, m7, 3
mova m1, m0
palignr m2, m7, 2
mova m3, m2
palignr m4, m7, 1
mova m5, m4
mova m6, m7
lea r0, [r0 + r1 * 4]
PROC32_8x8 3, 0, 23,6,21,4,19,2,17,0
add r6, 8 ; next 8-column strip
mov r0, r6
add r2, 8
dec byte [rsp + 63]
jnz .loop
mov rsp, [rsp+64] ; restore caller rsp
RET
INIT_XMM sse4
;-----------------------------------------------------------------------------
; intra_pred_ang32_22 — 32x32 intra angular prediction, mode 22, 8-bit (SSE4).
; Projects the reference with c_mode32_14_0, fuses the two projected halves
; with pslldq/palignr into one 16-byte vector, stores the assembled reference
; in aligned stack scratch, then runs four PROC32_8x8 passes per strip.
; r0 = dst, r1 = dstStride, r2 = srcPix (above at [r2], left at [r2 + 64]).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_22, 4,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
mov r6, rsp
sub rsp, 64+gprsize
and rsp, ~63
mov [rsp+64], r6 ; save caller rsp
; collect reference pixel
movu m0, [r2 + 64]
pinsrb m0, [r2], 0 ; corner pixel into lane 0
movu m1, [r2 + 15 + 64]
pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15]
pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30]
pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x x]
palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
mova [rsp], m0
movu m0, [r2 + 1]
movu m1, [r2 + 1 + 16]
movu [rsp + 13], m0
movu [rsp + 13 + 16], m1
mov [rsp + 63], byte 4 ; strip counter
; filter
lea r2, [rsp + 13] ; r2 -> [0]
lea r3, [c_shuf8_0] ; r3 -> shuffle8
lea r4, [ang_table] ; r4 -> ang_table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0] ; r6 -> r0
mova m5, [pw_1024] ; m5 -> 1024
mova m6, [c_deinterval8] ; m6 -> c_deinterval8
.loop:
; Row[0 - 7]
movu m7, [r2 - 4]
palignr m0, m7, 3
mova m1, m0
palignr m2, m7, 2
mova m3, m2
palignr m4, m7, 1
mova m5, m4
mova m6, m4
PROC32_8x8 0, 0, 19,6,25,12,31,18,5,24
; Row[8 - 15]
movu m7, [r2 - 7]
palignr m0, m7, 3
palignr m1, m7, 2
mova m2, m1
mova m3, m1
palignr m4, m7, 1
mova m5, m4
mova m6, m7
lea r0, [r0 + r1 * 4]
PROC32_8x8 1, 0, 11,30,17,4,23,10,29,16
; Row[16 - 23]
movu m7, [r2 - 10]
palignr m0, m7, 3
palignr m1, m7, 2
mova m2, m1
palignr m3, m7, 1
mova m4, m3
mova m5, m3
mova m6, m7
lea r0, [r0 + r1 * 4]
PROC32_8x8 2, 0, 3,22,9,28,15,2,21,8
; Row[24 - 31]
movu m7, [r2 - 13]
palignr m0, m7, 2
mova m1, m0
mova m2, m0
palignr m3, m7, 1
mova m4, m3
mova m5, m7
mova m6, m7
lea r0, [r0 + r1 * 4]
PROC32_8x8 3, 0, 27,14,1,20,7,26,13,0
add r6, 8 ; next 8-column strip
mov r0, r6
add r2, 8
dec byte [rsp + 63]
jnz .loop
mov rsp, [rsp+64] ; restore caller rsp
RET
INIT_XMM sse4
;-----------------------------------------------------------------------------
; intra_pred_ang32_23 — 32x32 intra angular prediction, mode 23, 8-bit (SSE4).
; Strip 0 needs the projected left reference, handled by MODE_13_23_ROW0;
; the remaining three 8-wide strips reuse MODE_13_23 (macros defined earlier
; in this file). One xmm-sized stack slot caches the "above" reference.
; r0 = dst, r1 = dstStride, r2 = srcPix; r3 starts as srcPix + 64 (left).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_23, 4,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]
lea r3, [r2 + 64] ; r3 -> left reference column
lea r4, [ang_table + 16 * 16] ; r4 -> centre of angle-weight table
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0 ; r6 tracks the strip base in dst
mova m7, [pw_1024] ; pmulhrsw rounding constant
MODE_13_23_ROW0 0
add r6, 8
mov r0, r6
add r2, 7 ; first strip consumed 7 reference bytes
mov r3, 3 ; 3 remaining strips
.loop:
MODE_13_23 0, 0
add r6, 8
mov r0, r6
add r2, 8
dec r3
jnz .loop
RET
INIT_XMM sse4
;-----------------------------------------------------------------------------
; intra_pred_ang32_24 — 32x32 intra angular prediction, mode 24, 8-bit (SSE4).
; Same shape as mode 23: MODE_12_24_ROW0 for the first 8-wide strip (which
; needs the projected left reference), then MODE_12_24 for the other three.
; r0 = dst, r1 = dstStride, r2 = srcPix; r3 starts as srcPix + 64 (left).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_24, 4,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]
lea r3, [r2 + 64] ; r3 -> left reference column
lea r4, [ang_table + 16 * 16] ; r4 -> centre of angle-weight table
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0 ; r6 tracks the strip base in dst
mova m7, [pw_1024] ; pmulhrsw rounding constant
MODE_12_24_ROW0 0
add r6, 8
mov r0, r6
add r2, 7 ; first strip consumed 7 reference bytes
mov r3, 3 ; 3 remaining strips
.loop:
MODE_12_24 0
add r6, 8
mov r0, r6
add r2, 8
dec r3
jnz .loop
RET
INIT_XMM sse4
;-----------------------------------------------------------------------------
; intra_pred_ang32_25 — 32x32 intra angular prediction, mode 25, 8-bit (SSE4).
; Mode 25 is near-vertical: every source row within an 8-row group uses the
; same reference bytes (hence the runs of mova m0..m6, m7 below), only the
; fractional weights passed to PROC32_8x8 differ. The reference is copied to
; aligned stack scratch, prefixed with a broadcast of srcPix[16 + 64].
; r0 = dst, r1 = dstStride, r2 = srcPix (left column at [r2 + 64]).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_25, 4,7,8
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
mov r6, rsp
sub rsp, 64+gprsize
and rsp, ~63
mov [rsp+64], r6 ; save caller rsp
; collect reference pixel
movu m0, [r2 + 16 + 64]
pxor m1, m1
pshufb m0, m1 ; broadcast byte 0 across the vector
mova [rsp], m0
movu m0, [r2]
movu m1, [r2 + 16]
movu m2, [r2 + 32]
movu [rsp + 1], m0
movu [rsp + 1 + 16], m1
movu [rsp + 1 + 32], m2
mov [rsp + 63], byte 4 ; strip counter
; filter
lea r2, [rsp + 1] ; r2 -> [0]
lea r3, [c_shuf8_0] ; r3 -> shuffle8
lea r4, [ang_table] ; r4 -> ang_table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0] ; r6 -> r0
mova m5, [pw_1024] ; m5 -> 1024
mova m6, [c_deinterval8] ; m6 -> c_deinterval8
.loop:
; Row[0 - 7] — all eight rows read the same reference bytes
movu m7, [r2]
mova m0, m7
mova m1, m7
mova m2, m7
mova m3, m7
mova m4, m7
mova m5, m7
mova m6, m7
PROC32_8x8 0, 0, 30,28,26,24,22,20,18,16
; Row[8 - 15]
movu m7, [r2]
mova m0, m7
mova m1, m7
mova m2, m7
mova m3, m7
mova m4, m7
mova m5, m7
mova m6, m7
lea r0, [r0 + r1 * 4]
PROC32_8x8 1, 0, 14,12,10,8,6,4,2,0
; Row[16 - 23] — reference shifts back by one pixel for the lower half
movu m7, [r2 - 1]
mova m0, m7
mova m1, m7
mova m2, m7
mova m3, m7
mova m4, m7
mova m5, m7
mova m6, m7
lea r0, [r0 + r1 * 4]
PROC32_8x8 2, 0, 30,28,26,24,22,20,18,16
; Row[24 - 31]
movu m7, [r2 - 1]
mova m0, m7
mova m1, m7
mova m2, m7
mova m3, m7
mova m4, m7
mova m5, m7
mova m6, m7
lea r0, [r0 + r1 * 4]
PROC32_8x8 3, 0, 14,12,10,8,6,4,2,0
add r6, 8 ; next 8-column strip
mov r0, r6
add r2, 8
dec byte [rsp + 63]
jnz .loop
mov rsp, [rsp+64] ; restore caller rsp
RET
INIT_XMM sse4
;-----------------------------------------------------------------------------
; intra_pred_ang32_26 — 32x32 intra angular prediction, mode 26 (pure
; vertical), 8-bit (SSE4). Each 16-wide half of the block is a straight copy
; of the 16 above-reference pixels to all 32 rows; when the filter flag
; (5th arg, r4d on entry) is non-zero, the first column is additionally
; smoothed using the left reference pixels.
; r0 = dst, r1 = dstStride, r2 = srcPix (above at [r2 + 1..], left at
; [r2 + 64..], corner at [r2]). Two stack slots (m8/m9) cache the left
; reference across the column loop.
;
; FIX: the original code wrote rows 4-15 twice — after storing rows 0-15 it
; re-issued "lea r5, [r0 + r1 * 4]" and stored rows 4-31, duplicating 12
; identical 16-byte stores per iteration. The first pass over rows 4-15 is
; removed; every row is now written exactly once and the output is unchanged.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_26, 5,7,7,0-(2*mmsize)
%define m8 [rsp + 0 * mmsize]
%define m9 [rsp + 1 * mmsize]
mov r6, 2 ; two 16-wide column halves
movu m0, [r2 + 64]
pinsrb m0, [r2], 0 ; left reference, corner in lane 0
movu m1, [r2 + 1 + 64]
mova m8, m0 ; spill left reference for the filter step
mova m9, m1
mov r3d, r4d ; r3d = bFilter flag (r4 is reused below)
lea r4, [r1 * 3] ; r4 = 3 * stride
.loop:
movu m0, [r2 + 1] ; 16 above-reference pixels for this half
; rows 0 - 3
movu [r0], m0
movu [r0 + r1], m0
movu [r0 + r1 * 2], m0
movu [r0 + r4], m0
; rows 4 - 31, four at a time
lea r5, [r0 + r1 * 4]
movu [r5], m0
movu [r5 + r1], m0
movu [r5 + r1 * 2], m0
movu [r5 + r4], m0
lea r5, [r5 + r1 * 4]
movu [r5], m0
movu [r5 + r1], m0
movu [r5 + r1 * 2], m0
movu [r5 + r4], m0
lea r5, [r5 + r1 * 4]
movu [r5], m0
movu [r5 + r1], m0
movu [r5 + r1 * 2], m0
movu [r5 + r4], m0
lea r5, [r5 + r1 * 4]
movu [r5], m0
movu [r5 + r1], m0
movu [r5 + r1 * 2], m0
movu [r5 + r4], m0
lea r5, [r5 + r1 * 4]
movu [r5], m0
movu [r5 + r1], m0
movu [r5 + r1 * 2], m0
movu [r5 + r4], m0
lea r5, [r5 + r1 * 4]
movu [r5], m0
movu [r5 + r1], m0
movu [r5 + r1 * 2], m0
movu [r5 + r4], m0
lea r5, [r5 + r1 * 4]
movu [r5], m0
movu [r5 + r1], m0
movu [r5 + r1 * 2], m0
movu [r5 + r4], m0
; filter: first column only, dst[y] = clip(above[0] + ((left[y] - corner) >> 1))
cmp r3d, byte 0
jz .quit
pxor m4, m4
pshufb m0, m4 ; broadcast above[0]
pmovzxbw m0, m0
mova m1, m0
movu m2, m8 ; reload spilled left reference
movu m3, m9
pshufb m2, m4 ; broadcast corner pixel
pmovzxbw m2, m2
movhlps m4, m3
pmovzxbw m3, m3
pmovzxbw m4, m4
psubw m3, m2 ; left[y] - corner
psubw m4, m2
psraw m3, 1
psraw m4, 1
paddw m0, m3
paddw m1, m4
packuswb m0, m1 ; saturating pack back to 8-bit
pextrb [r0], m0, 0
pextrb [r0 + r1], m0, 1
pextrb [r0 + r1 * 2], m0, 2
pextrb [r0 + r4], m0, 3
lea r5, [r0 + r1 * 4]
pextrb [r5], m0, 4
pextrb [r5 + r1], m0, 5
pextrb [r5 + r1 * 2], m0, 6
pextrb [r5 + r4], m0, 7
lea r5, [r5 + r1 * 4]
pextrb [r5], m0, 8
pextrb [r5 + r1], m0, 9
pextrb [r5 + r1 * 2], m0, 10
pextrb [r5 + r4], m0, 11
lea r5, [r5 + r1 * 4]
pextrb [r5], m0, 12
pextrb [r5 + r1], m0, 13
pextrb [r5 + r1 * 2], m0, 14
pextrb [r5 + r4], m0, 15
.quit:
lea r2, [r2 + 16] ; advance to right half of the reference
add r0, 16
dec r6d
jnz .loop
RET
INIT_XMM sse4
;-----------------------------------------------------------------------------
; intra_pred_ang32_27 — 32x32 intra angular prediction, mode 27 (SSE4).
; Thin driver: four 8-wide strips, each produced by the MODE_9_27 macro
; defined earlier in this file. r0 = dst, r1 = dstStride, r2 = srcPix.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_27, 3,7,8
lea r3, [ang_table + 16 * 16] ; r3 -> centre of angle-weight table
mov r4d, 4 ; strip counter
lea r5, [r1 * 3] ; r5 = 3 * stride
mov r6, r0 ; r6 tracks the strip base in dst
mova m7, [pw_1024] ; pmulhrsw rounding constant
.loop:
MODE_9_27 0
add r6, 8
mov r0, r6
add r2, 8
dec r4
jnz .loop
RET
INIT_XMM sse4
;-----------------------------------------------------------------------------
; intra_pred_ang32_28 — 32x32 intra angular prediction, mode 28 (SSE4).
; Same driver shape as mode 27; strip body is the MODE_8_28 macro.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_28, 3,7,8
lea r3, [ang_table + 16 * 16] ; r3 -> centre of angle-weight table
mov r4d, 4 ; strip counter
lea r5, [r1 * 3] ; r5 = 3 * stride
mov r6, r0 ; r6 tracks the strip base in dst
mova m7, [pw_1024] ; pmulhrsw rounding constant
.loop:
MODE_8_28 0
add r6, 8
mov r0, r6
add r2, 8
dec r4
jnz .loop
RET
INIT_XMM sse4
;-----------------------------------------------------------------------------
; intra_pred_ang32_29 — 32x32 intra angular prediction, mode 29 (SSE4).
; Same driver shape as mode 27; strip body is the MODE_7_29 macro.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_29, 3,7,8
lea r3, [ang_table + 16 * 16] ; r3 -> centre of angle-weight table
mov r4d, 4 ; strip counter
lea r5, [r1 * 3] ; r5 = 3 * stride
mov r6, r0 ; r6 tracks the strip base in dst
mova m7, [pw_1024] ; pmulhrsw rounding constant
.loop:
MODE_7_29 0
add r6, 8
mov r0, r6
add r2, 8
dec r4
jnz .loop
RET
INIT_XMM sse4
;-----------------------------------------------------------------------------
; intra_pred_ang32_30 — 32x32 intra angular prediction, mode 30 (SSE4).
; Same driver shape as mode 27; strip body is the MODE_6_30 macro.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_30, 3,7,8
lea r3, [ang_table + 16 * 16] ; r3 -> centre of angle-weight table
mov r4d, 4 ; strip counter
lea r5, [r1 * 3] ; r5 = 3 * stride
mov r6, r0 ; r6 tracks the strip base in dst
mova m7, [pw_1024] ; pmulhrsw rounding constant
.loop:
MODE_6_30 0
add r6, 8
mov r0, r6
add r2, 8
dec r4
jnz .loop
RET
INIT_XMM sse4
;-----------------------------------------------------------------------------
; intra_pred_ang32_31 — 32x32 intra angular prediction, mode 31 (SSE4).
; Same driver shape as mode 27; strip body is the MODE_5_31 macro.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_31, 3,7,8
lea r3, [ang_table + 16 * 16] ; r3 -> centre of angle-weight table
mov r4d, 4 ; strip counter
lea r5, [r1 * 3] ; r5 = 3 * stride
mov r6, r0 ; r6 tracks the strip base in dst
mova m7, [pw_1024] ; pmulhrsw rounding constant
.loop:
MODE_5_31 0
add r6, 8
mov r0, r6
add r2, 8
dec r4
jnz .loop
RET
INIT_XMM sse4
;-----------------------------------------------------------------------------
; intra_pred_ang32_32 — 32x32 intra angular prediction, mode 32 (SSE4).
; Same driver shape as mode 27; strip body is the MODE_4_32 macro.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_32, 3,7,8
lea r3, [ang_table + 16 * 16] ; r3 -> centre of angle-weight table
mov r4d, 4 ; strip counter
lea r5, [r1 * 3] ; r5 = 3 * stride
mov r6, r0 ; r6 tracks the strip base in dst
mova m7, [pw_1024] ; pmulhrsw rounding constant
.loop:
MODE_4_32 0
add r6, 8
mov r0, r6
add r2, 8
dec r4
jnz .loop
RET
INIT_XMM sse4
;-----------------------------------------------------------------------------
; intra_pred_ang32_33 — 32x32 intra angular prediction, mode 33 (SSE4).
; Same driver shape as mode 27; strip body is the MODE_3_33 macro.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_33, 3,7,8
lea r3, [ang_table + 16 * 16] ; r3 -> centre of angle-weight table
mov r4d, 4 ; strip counter
lea r5, [r1 * 3] ; r5 = 3 * stride
mov r6, r0 ; r6 tracks the strip base in dst
mova m7, [pw_1024] ; pmulhrsw rounding constant
.loop:
MODE_3_33 0
add r6, 8
mov r0, r6
add r2, 8
dec r4
jnz .loop
RET
;-----------------------------------------------------------------------------------------
; start of intra_pred_ang32 angular modes avx2 asm
;-----------------------------------------------------------------------------------------
%if ARCH_X86_64 == 1
INIT_YMM avx2
; register mapping :
; %1-%8 - output registers
; %9 - temp register
; %10 - for label naming
; Store 8 ymm rows of results, either transposed (horizontal modes, e.g.
; mode 3) or directly (vertical modes, e.g. mode 33).
; %1-%8 = result registers, %9 = temp, %10 = unique label suffix.
; IMPORTANT: the jnz below consumes the ZF set by "test r7d, r7d" issued in
; the caller BEFORE the intervening SIMD arithmetic — SSE/AVX integer ops
; (pmaddubsw, pmulhrsw, packuswb, palignr, movu) do not touch EFLAGS, so the
; flag survives. Do not insert any flag-modifying scalar op before this macro.
; Uses r0 (dst cursor), r1/r5/r6 (stride, 3*stride, 4*stride), r4 (dst base).
%macro TRANSPOSE_32x8_AVX2 10
jnz .skip%10 ; r7d != 0 -> direct-store path (mode 33 family)
; transpose 8x32 to 32x8 and then store
; byte -> word -> dword interleave stages of a standard SIMD transpose
punpcklbw m%9, m%1, m%2
punpckhbw m%1, m%2
punpcklbw m%2, m%3, m%4
punpckhbw m%3, m%4
punpcklbw m%4, m%5, m%6
punpckhbw m%5, m%6
punpcklbw m%6, m%7, m%8
punpckhbw m%7, m%8
punpcklwd m%8, m%9, m%2
punpckhwd m%9, m%2
punpcklwd m%2, m%4, m%6
punpckhwd m%4, m%6
punpcklwd m%6, m%1, m%3
punpckhwd m%1, m%3
punpcklwd m%3, m%5, m%7
punpckhwd m%5, m%7
punpckldq m%7, m%8, m%2
punpckhdq m%8, m%2
punpckldq m%2, m%6, m%3
punpckhdq m%6, m%3
punpckldq m%3, m%9, m%4
punpckhdq m%9, m%4
punpckldq m%4, m%1, m%5
punpckhdq m%1, m%5
; low lanes: rows 0-15, two 8-byte rows per xmm
movq [r0 + r1 * 0], xm%7
movhps [r0 + r1 * 1], xm%7
movq [r0 + r1 * 2], xm%8
movhps [r0 + r5 * 1], xm%8
lea r0, [r0 + r6]
movq [r0 + r1 * 0], xm%3
movhps [r0 + r1 * 1], xm%3
movq [r0 + r1 * 2], xm%9
movhps [r0 + r5 * 1], xm%9
lea r0, [r0 + r6]
movq [r0 + r1 * 0], xm%2
movhps [r0 + r1 * 1], xm%2
movq [r0 + r1 * 2], xm%6
movhps [r0 + r5 * 1], xm%6
lea r0, [r0 + r6]
movq [r0 + r1 * 0], xm%4
movhps [r0 + r1 * 1], xm%4
movq [r0 + r1 * 2], xm%1
movhps [r0 + r5 * 1], xm%1
lea r0, [r0 + r6]
; swap ymm lanes to reach rows 16-31
vpermq m%8, m%8, 00001110b
vpermq m%7, m%7, 00001110b
vpermq m%6, m%6, 00001110b
vpermq m%3, m%3, 00001110b
vpermq m%9, m%9, 00001110b
vpermq m%2, m%2, 00001110b
vpermq m%4, m%4, 00001110b
vpermq m%1, m%1, 00001110b
movq [r0 + r1 * 0], xm%7
movhps [r0 + r1 * 1], xm%7
movq [r0 + r1 * 2], xm%8
movhps [r0 + r5 * 1], xm%8
lea r0, [r0 + r6]
movq [r0 + r1 * 0], xm%3
movhps [r0 + r1 * 1], xm%3
movq [r0 + r1 * 2], xm%9
movhps [r0 + r5 * 1], xm%9
lea r0, [r0 + r6]
movq [r0 + r1 * 0], xm%2
movhps [r0 + r1 * 1], xm%2
movq [r0 + r1 * 2], xm%6
movhps [r0 + r5 * 1], xm%6
lea r0, [r0 + r6]
movq [r0 + r1 * 0], xm%4
movhps [r0 + r1 * 1], xm%4
movq [r0 + r1 * 2], xm%1
movhps [r0 + r5 * 1], xm%1
lea r0, [r4 + 8] ; rewind dst to base + 8 columns for the next pass
jmp .end%10
.skip%10:
; direct store: each register already holds one full 32-wide output row
movu [r0 + r1 * 0], m%1
movu [r0 + r1 * 1], m%2
movu [r0 + r1 * 2], m%3
movu [r0 + r5 * 1], m%4
lea r0, [r0 + r6]
movu [r0 + r1 * 0], m%5
movu [r0 + r1 * 1], m%6
movu [r0 + r1 * 2], m%7
movu [r0 + r5 * 1], m%8
lea r0, [r0 + r6]
.end%10:
%endmacro
;-----------------------------------------------------------------------------
; ang32_mode_3_33_row_0_15 — AVX2 helper shared by modes 3 and 33: produces
; 16 output lines (two TRANSPOSE_32x8_AVX2 batches of 8) from the reference
; in r2. In: r2 = reference pixels, r3 = centre of ang_table_avx2,
; m7 = pw_1024, r7d = 0 for mode 3 (transposed store) / non-zero for mode 33
; (direct store). The "test r7d, r7d" below sets ZF for the jnz inside
; TRANSPOSE_32x8_AVX2; only EFLAGS-neutral SIMD ops run in between.
; Each step: palignr selects the sliding byte-pair window, pmaddubsw applies
; the (32-frac, frac) weights from the table, pmulhrsw/m7 rounds >> 5.
;-----------------------------------------------------------------------------
cglobal ang32_mode_3_33_row_0_15
test r7d, r7d
; rows 0 to 7
movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
pmaddubsw m4, m0, [r3 + 10 * 32] ; [26]
pmulhrsw m4, m7
pmaddubsw m1, m2, [r3 + 10 * 32]
pmulhrsw m1, m7
packuswb m4, m1
palignr m5, m2, m0, 2
palignr m1, m3, m2, 2
pmaddubsw m5, [r3 + 4 * 32] ; [20]
pmulhrsw m5, m7
pmaddubsw m1, [r3 + 4 * 32]
pmulhrsw m1, m7
packuswb m5, m1
palignr m6, m2, m0, 4
palignr m1, m3, m2, 4
pmaddubsw m6, [r3 - 2 * 32] ; [14]
pmulhrsw m6, m7
pmaddubsw m1, [r3 - 2 * 32]
pmulhrsw m1, m7
packuswb m6, m1
palignr m8, m2, m0, 6
palignr m1, m3, m2, 6
pmaddubsw m8, [r3 - 8 * 32] ; [8]
pmulhrsw m8, m7
pmaddubsw m1, [r3 - 8 * 32]
pmulhrsw m1, m7
packuswb m8, m1
palignr m10, m2, m0, 8
palignr m11, m3, m2, 8
pmaddubsw m9, m10, [r3 - 14 * 32] ; [2]
pmulhrsw m9, m7
pmaddubsw m1, m11, [r3 - 14 * 32]
pmulhrsw m1, m7
packuswb m9, m1
pmaddubsw m10, [r3 + 12 * 32] ; [28]
pmulhrsw m10, m7
pmaddubsw m11, [r3 + 12 * 32]
pmulhrsw m11, m7
packuswb m10, m11
palignr m11, m2, m0, 10
palignr m1, m3, m2, 10
pmaddubsw m11, [r3 + 6 * 32] ; [22]
pmulhrsw m11, m7
pmaddubsw m1, [r3 + 6 * 32]
pmulhrsw m1, m7
packuswb m11, m1
palignr m12, m2, m0, 12
palignr m1, m3, m2, 12
pmaddubsw m12, [r3] ; [16]
pmulhrsw m12, m7
pmaddubsw m1, [r3]
pmulhrsw m1, m7
packuswb m12, m1
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
; rows 8 to 15
palignr m4, m2, m0, 14
palignr m1, m3, m2, 14
pmaddubsw m4, [r3 - 6 * 32] ; [10]
pmulhrsw m4, m7
pmaddubsw m1, [r3 - 6 * 32]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, m2, [r3 - 12 * 32] ; [4]
pmulhrsw m5, m7
pmaddubsw m1, m3, [r3 - 12 * 32]
pmulhrsw m1, m7
packuswb m5, m1
pmaddubsw m6, m2, [r3 + 14 * 32] ; [30]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r3 + 14 * 32]
pmulhrsw m1, m7
packuswb m6, m1
movu m0, [r2 + 25] ; extend the reference window for the last steps
movu m1, [r2 + 26]
punpcklbw m0, m1
palignr m8, m3, m2, 2
palignr m1, m0, m3, 2
pmaddubsw m8, [r3 + 8 * 32] ; [24]
pmulhrsw m8, m7
pmaddubsw m1, [r3 + 8 * 32]
pmulhrsw m1, m7
packuswb m8, m1
palignr m9, m3, m2, 4
palignr m1, m0, m3, 4
pmaddubsw m9, [r3 + 2 * 32] ; [18]
pmulhrsw m9, m7
pmaddubsw m1, [r3 + 2 * 32]
pmulhrsw m1, m7
packuswb m9, m1
palignr m10, m3, m2, 6
palignr m1, m0, m3, 6
pmaddubsw m10, [r3 - 4 * 32] ; [12]
pmulhrsw m10, m7
pmaddubsw m1, [r3 - 4 * 32]
pmulhrsw m1, m7
packuswb m10, m1
palignr m11, m3, m2, 8
palignr m1, m0, m3, 8
pmaddubsw m11, [r3 - 10 * 32] ; [6]
pmulhrsw m11, m7
pmaddubsw m1, [r3 - 10 * 32]
pmulhrsw m1, m7
packuswb m11, m1
movu m12, [r2 + 14] ; last row: integer offset, no interpolation
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8
ret
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang32_3 — 32x32 intra angular prediction, mode 3 (AVX2).
; Horizontal mode: reads the left reference ([r2 + 64]) and calls the shared
; mode-3/33 helper twice with r7d = 0 to select its transposed-store path
; (rows 0-15, then rows 16-31 at reference offset +13).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_3, 3,8,13
add r2, 64 ; mode 3 uses the left reference
lea r3, [ang_table_avx2 + 32 * 16] ; centre of angle-weight table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
mov r4, r0 ; r4 = dst base (helper rewinds to r4 + 8)
xor r7d, r7d ; r7d = 0 -> transpose path in helper
call ang32_mode_3_33_row_0_15
add r4, 16
mov r0, r4
add r2, 13 ; reference advance between the two 16-line halves
call ang32_mode_3_33_row_0_15
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang32_33 — 32x32 intra angular prediction, mode 33 (AVX2).
; Vertical mirror of mode 3: same helper, r7d != 0 selects the direct-store
; path (no transpose), reference is the above row (no +64 offset).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_33, 3,8,13
lea r3, [ang_table_avx2 + 32 * 16] ; centre of angle-weight table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
xor r7d, r7d
inc r7d ; r7d = 1 -> direct-store path in helper
call ang32_mode_3_33_row_0_15
add r2, 13 ; reference advance between the two halves
call ang32_mode_3_33_row_0_15
RET
;-----------------------------------------------------------------------------
; ang32_mode_4_32_row_0_15 — AVX2 helper shared by modes 4 and 32: first 16
; output lines. Same calling contract as ang32_mode_3_33_row_0_15:
; r2 = reference, r3 = centre of ang_table_avx2, m7 = pw_1024, r7d selects
; transpose (0) vs direct store; ZF from "test r7d, r7d" survives the SIMD
; body into TRANSPOSE_32x8_AVX2.
;-----------------------------------------------------------------------------
cglobal ang32_mode_4_32_row_0_15
test r7d, r7d
; rows 0 to 7
movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
pmaddubsw m4, m0, [r3 + 5 * 32] ; [21]
pmulhrsw m4, m7
pmaddubsw m1, m2, [r3 + 5 * 32]
pmulhrsw m1, m7
packuswb m4, m1
palignr m6, m2, m0, 2
palignr m1, m3, m2, 2
pmaddubsw m5, m6, [r3 - 6 * 32] ; [10]
pmulhrsw m5, m7
pmaddubsw m8, m1, [r3 - 6 * 32]
pmulhrsw m8, m7
packuswb m5, m8
pmaddubsw m6, [r3 + 15 * 32] ; [31] — same window, next weight
pmulhrsw m6, m7
pmaddubsw m1, [r3 + 15 * 32]
pmulhrsw m1, m7
packuswb m6, m1
palignr m8, m2, m0, 4
palignr m1, m3, m2, 4
pmaddubsw m8, [r3 + 4 * 32] ; [20]
pmulhrsw m8, m7
pmaddubsw m1, [r3 + 4 * 32]
pmulhrsw m1, m7
packuswb m8, m1
palignr m10, m2, m0, 6
palignr m11, m3, m2, 6
pmaddubsw m9, m10, [r3 - 7 * 32] ; [9]
pmulhrsw m9, m7
pmaddubsw m1, m11, [r3 - 7 * 32]
pmulhrsw m1, m7
packuswb m9, m1
pmaddubsw m10, [r3 + 14 * 32] ; [30]
pmulhrsw m10, m7
pmaddubsw m11, [r3 + 14 * 32]
pmulhrsw m11, m7
packuswb m10, m11
palignr m11, m2, m0, 8
palignr m1, m3, m2, 8
pmaddubsw m11, [r3 + 3 * 32] ; [19]
pmulhrsw m11, m7
pmaddubsw m1, [r3 + 3 * 32]
pmulhrsw m1, m7
packuswb m11, m1
palignr m12, m2, m0, 10
palignr m1, m3, m2, 10
pmaddubsw m12, [r3 - 8 * 32] ; [8]
pmulhrsw m12, m7
pmaddubsw m1, [r3 - 8 * 32]
pmulhrsw m1, m7
packuswb m12, m1
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
; rows 8 to 15
palignr m4, m2, m0, 10 ; same window as the last step, next weight
palignr m1, m3, m2, 10
pmaddubsw m4, [r3 + 13 * 32] ; [29]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 13 * 32]
pmulhrsw m1, m7
packuswb m4, m1
palignr m5, m2, m0, 12
palignr m1, m3, m2, 12
pmaddubsw m5, [r3 + 2 * 32] ; [18]
pmulhrsw m5, m7
pmaddubsw m1, [r3 + 2 * 32]
pmulhrsw m1, m7
packuswb m5, m1
palignr m8, m2, m0, 14
palignr m1, m3, m2, 14
pmaddubsw m6, m8, [r3 - 9 * 32] ; [7]
pmulhrsw m6, m7
pmaddubsw m9, m1, [r3 - 9 * 32]
pmulhrsw m9, m7
packuswb m6, m9
pmaddubsw m8, [r3 + 12 * 32] ; [28]
pmulhrsw m8, m7
pmaddubsw m1, [r3 + 12 * 32]
pmulhrsw m1, m7
packuswb m8, m1
pmaddubsw m9, m2, [r3 + 1 * 32] ; [17]
pmulhrsw m9, m7
pmaddubsw m1, m3, [r3 + 1 * 32]
pmulhrsw m1, m7
packuswb m9, m1
movu m0, [r2 + 25] ; extend the reference window
movu m1, [r2 + 26]
punpcklbw m0, m1
palignr m11, m3, m2, 2
palignr m1, m0, m3, 2
pmaddubsw m10, m11, [r3 - 10 * 32] ; [6]
pmulhrsw m10, m7
pmaddubsw m12, m1, [r3 - 10 * 32]
pmulhrsw m12, m7
packuswb m10, m12
pmaddubsw m11, [r3 + 11 * 32] ; [27]
pmulhrsw m11, m7
pmaddubsw m1, [r3 + 11 * 32]
pmulhrsw m1, m7
packuswb m11, m1
palignr m0, m3, 4
palignr m3, m2, 4
pmaddubsw m3, [r3] ; [16]
pmulhrsw m3, m7
pmaddubsw m0, [r3]
pmulhrsw m0, m7
packuswb m3, m0
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 3, 0, 8
ret
;-----------------------------------------------------------------------------
; ang32_mode_4_32_row_16_31 — AVX2 helper shared by modes 4 and 32: lines
; 16-31. Caller advances r2 so the same [r2 + 1..] window applies; contract
; otherwise identical to ang32_mode_4_32_row_0_15 (r3 = table centre,
; m7 = pw_1024, r7d selects transpose vs direct store via surviving ZF).
;-----------------------------------------------------------------------------
cglobal ang32_mode_4_32_row_16_31
test r7d, r7d
; rows 0 to 7
movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
pmaddubsw m4, m0, [r3 - 11 * 32] ; [5]
pmulhrsw m4, m7
pmaddubsw m1, m2, [r3 - 11 * 32]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, m0, [r3 + 10 * 32] ; [26]
pmulhrsw m5, m7
pmaddubsw m1, m2, [r3 + 10 * 32]
pmulhrsw m1, m7
packuswb m5, m1
palignr m6, m2, m0, 2
palignr m1, m3, m2, 2
pmaddubsw m6, [r3 - 1 * 32] ; [15]
pmulhrsw m6, m7
pmaddubsw m1, [r3 - 1 * 32]
pmulhrsw m1, m7
packuswb m6, m1
palignr m9, m2, m0, 4
palignr m10, m3, m2, 4
pmaddubsw m8, m9, [r3 - 12 * 32] ; [4]
pmulhrsw m8, m7
pmaddubsw m1, m10, [r3 - 12 * 32]
pmulhrsw m1, m7
packuswb m8, m1
pmaddubsw m9, [r3 + 9 * 32] ; [25]
pmulhrsw m9, m7
pmaddubsw m10, [r3 + 9 * 32]
pmulhrsw m10, m7
packuswb m9, m10
palignr m10, m2, m0, 6
palignr m11, m3, m2, 6
pmaddubsw m10, [r3 - 2 * 32] ; [14]
pmulhrsw m10, m7
pmaddubsw m11, [r3 - 2 * 32]
pmulhrsw m11, m7
packuswb m10, m11
palignr m12, m2, m0, 8
palignr m1, m3, m2, 8
pmaddubsw m11, m12, [r3 - 13 * 32] ; [3]
pmulhrsw m11, m7
pmaddubsw m1, [r3 - 13 * 32]
pmulhrsw m1, m7
packuswb m11, m1
palignr m1, m3, m2, 8 ; recompute: m1 was consumed in place above
pmaddubsw m12, [r3 + 8 * 32] ; [24]
pmulhrsw m12, m7
pmaddubsw m1, [r3 + 8 * 32]
pmulhrsw m1, m7
packuswb m12, m1
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
; rows 8 to 15
palignr m4, m2, m0, 10
palignr m1, m3, m2, 10
pmaddubsw m4, [r3 - 3 * 32] ; [13]
pmulhrsw m4, m7
pmaddubsw m1, [r3 - 3 * 32]
pmulhrsw m1, m7
packuswb m4, m1
palignr m6, m2, m0, 12
palignr m8, m3, m2, 12
pmaddubsw m5, m6, [r3 - 14 * 32] ; [2]
pmulhrsw m5, m7
pmaddubsw m1, m8, [r3 - 14 * 32]
pmulhrsw m1, m7
packuswb m5, m1
pmaddubsw m6, [r3 + 7 * 32] ; [23]
pmulhrsw m6, m7
pmaddubsw m8, [r3 + 7 * 32]
pmulhrsw m8, m7
packuswb m6, m8
palignr m8, m2, m0, 14
palignr m1, m3, m2, 14
pmaddubsw m8, [r3 - 4 * 32] ; [12]
pmulhrsw m8, m7
pmaddubsw m1, [r3 - 4 * 32]
pmulhrsw m1, m7
packuswb m8, m1
pmaddubsw m9, m2, [r3 - 15 * 32] ; [1]
pmulhrsw m9, m7
pmaddubsw m1, m3, [r3 - 15 * 32]
pmulhrsw m1, m7
packuswb m9, m1
pmaddubsw m10, m2, [r3 + 6 * 32] ; [22]
pmulhrsw m10, m7
pmaddubsw m1, m3, [r3 + 6 * 32]
pmulhrsw m1, m7
packuswb m10, m1
movu m0, [r2 + 25] ; extend the reference window
movu m1, [r2 + 26]
punpcklbw m0, m1
palignr m11, m3, m2, 2
palignr m1, m0, m3, 2
pmaddubsw m11, [r3 - 5 * 32] ; [11]
pmulhrsw m11, m7
pmaddubsw m1, [r3 - 5 * 32]
pmulhrsw m1, m7
packuswb m11, m1
movu m12, [r2 + 11] ; last row: integer offset, no interpolation
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8
ret
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang32_4 — 32x32 intra angular prediction, mode 4 (AVX2).
; Horizontal mode: left reference ([r2 + 64]), r7d = 0 selects the
; transposed-store path in the two shared mode-4/32 helpers.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_4, 3,8,13
add r2, 64 ; mode 4 uses the left reference
lea r3, [ang_table_avx2 + 32 * 16] ; centre of angle-weight table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
mov r4, r0 ; r4 = dst base (helper rewinds to r4 + 8)
xor r7d, r7d ; transpose path
call ang32_mode_4_32_row_0_15
add r4, 16
mov r0, r4
add r2, 11 ; reference advance between halves
call ang32_mode_4_32_row_16_31
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang32_32 — 32x32 intra angular prediction, mode 32 (AVX2).
; Vertical mirror of mode 4: same helpers, r7d != 0 selects direct store,
; reference is the above row (no +64 offset).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_32, 3,8,13
lea r3, [ang_table_avx2 + 32 * 16] ; centre of angle-weight table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
xor r7d, r7d
inc r7d ; direct-store path
call ang32_mode_4_32_row_0_15
add r2, 11 ; reference advance between halves
call ang32_mode_4_32_row_16_31
RET
;-----------------------------------------------------------------------------
; ang32_mode_5_31_row_0_15 — AVX2 helper shared by modes 5 and 31: first 16
; output lines. Contract as for the other ang32_mode_* helpers: r2 =
; reference, r3 = centre of ang_table_avx2, m7 = pw_1024, r7d (via surviving
; ZF from "test r7d, r7d") selects transpose vs direct store.
;-----------------------------------------------------------------------------
cglobal ang32_mode_5_31_row_0_15
test r7d, r7d
; rows 0 to 7
movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
pmaddubsw m4, m0, [r3 + 1 * 32] ; [17]
pmulhrsw m4, m7
pmaddubsw m1, m2, [r3 + 1 * 32]
pmulhrsw m1, m7
packuswb m4, m1
palignr m6, m2, m0, 2
palignr m1, m3, m2, 2
pmaddubsw m5, m6, [r3 - 14 * 32] ; [2]
pmulhrsw m5, m7
pmaddubsw m8, m1, [r3 - 14 * 32]
pmulhrsw m8, m7
packuswb m5, m8
pmaddubsw m6, [r3 + 3 * 32] ; [19] — same window, next weight
pmulhrsw m6, m7
pmaddubsw m1, [r3 + 3 * 32]
pmulhrsw m1, m7
packuswb m6, m1
palignr m9, m2, m0, 4
palignr m10, m3, m2, 4
pmaddubsw m8, m9, [r3 - 12 * 32] ; [4]
pmulhrsw m8, m7
pmaddubsw m1, m10, [r3 - 12 * 32]
pmulhrsw m1, m7
packuswb m8, m1
pmaddubsw m9, [r3 + 5 * 32] ; [21]
pmulhrsw m9, m7
pmaddubsw m10, [r3 + 5 * 32]
pmulhrsw m10, m7
packuswb m9, m10
palignr m11, m2, m0, 6
palignr m12, m3, m2, 6
pmaddubsw m10, m11, [r3 - 10 * 32] ; [6]
pmulhrsw m10, m7
pmaddubsw m1, m12, [r3 - 10 * 32]
pmulhrsw m1, m7
packuswb m10, m1
pmaddubsw m11, [r3 + 7 * 32] ; [23]
pmulhrsw m11, m7
pmaddubsw m12, [r3 + 7 * 32]
pmulhrsw m12, m7
packuswb m11, m12
palignr m12, m2, m0, 8
palignr m1, m3, m2, 8
pmaddubsw m12, [r3 - 8 * 32] ; [8]
pmulhrsw m12, m7
pmaddubsw m1, [r3 - 8 * 32]
pmulhrsw m1, m7
packuswb m12, m1
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
; rows 8 to 15
palignr m4, m2, m0, 8 ; same window as the last step, next weight
palignr m1, m3, m2, 8
pmaddubsw m4, [r3 + 9 * 32] ; [25]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 9 * 32]
pmulhrsw m1, m7
packuswb m4, m1
palignr m6, m2, m0, 10
palignr m1, m3, m2, 10
pmaddubsw m5, m6, [r3 - 6 * 32] ; [10]
pmulhrsw m5, m7
pmaddubsw m8, m1, [r3 - 6 * 32]
pmulhrsw m8, m7
packuswb m5, m8
pmaddubsw m6, [r3 + 11 * 32] ; [27]
pmulhrsw m6, m7
pmaddubsw m1, [r3 + 11 * 32]
pmulhrsw m1, m7
packuswb m6, m1
palignr m9, m2, m0, 12
palignr m1, m3, m2, 12
pmaddubsw m8, m9, [r3 - 4 * 32] ; [12]
pmulhrsw m8, m7
pmaddubsw m10, m1, [r3 - 4 * 32]
pmulhrsw m10, m7
packuswb m8, m10
pmaddubsw m9, [r3 + 13 * 32] ; [29]
pmulhrsw m9, m7
pmaddubsw m1, [r3 + 13 * 32]
pmulhrsw m1, m7
packuswb m9, m1
palignr m11, m2, m0, 14
palignr m1, m3, m2, 14
pmaddubsw m10, m11, [r3 - 2 * 32] ; [14]
pmulhrsw m10, m7
pmaddubsw m12, m1, [r3 - 2 * 32]
pmulhrsw m12, m7
packuswb m10, m12
pmaddubsw m11, [r3 + 15 * 32] ; [31]
pmulhrsw m11, m7
pmaddubsw m1, [r3 + 15 * 32]
pmulhrsw m1, m7
packuswb m11, m1
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
pmaddubsw m3, [r3]
pmulhrsw m3, m7
packuswb m2, m3
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
ret
;-----------------------------------------------------------------------------
; ang32_mode_5_31_row_16_31 — AVX2 helper shared by modes 5 and 31: lines
; 16-31. Caller advances r2 between halves; contract otherwise identical to
; ang32_mode_5_31_row_0_15.
;-----------------------------------------------------------------------------
cglobal ang32_mode_5_31_row_16_31
test r7d, r7d
; rows 0 to 7
movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
pmaddubsw m4, m0, [r3 - 15 * 32] ; [1]
pmulhrsw m4, m7
pmaddubsw m1, m2, [r3 - 15 * 32]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, m0, [r3 + 2 * 32] ; [18]
pmulhrsw m5, m7
pmaddubsw m8, m2, [r3 + 2 * 32]
pmulhrsw m8, m7
packuswb m5, m8
palignr m8, m2, m0, 2
palignr m9, m3, m2, 2
pmaddubsw m6, m8, [r3 - 13 * 32] ; [3]
pmulhrsw m6, m7
pmaddubsw m1, m9, [r3 - 13 * 32]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m8, [r3 + 4 * 32] ; [20]
pmulhrsw m8, m7
pmaddubsw m9, [r3 + 4 * 32]
pmulhrsw m9, m7
packuswb m8, m9
palignr m10, m2, m0, 4
palignr m1, m3, m2, 4
pmaddubsw m9, m10, [r3 - 11 * 32] ; [5]
pmulhrsw m9, m7
pmaddubsw m11, m1, [r3 - 11 * 32]
pmulhrsw m11, m7
packuswb m9, m11
pmaddubsw m10, [r3 + 6 * 32] ; [22]
pmulhrsw m10, m7
pmaddubsw m1, [r3 + 6 * 32]
pmulhrsw m1, m7
packuswb m10, m1
palignr m12, m2, m0, 6
palignr m1, m3, m2, 6
pmaddubsw m11, m12, [r3 - 9 * 32] ; [7]
pmulhrsw m11, m7
pmaddubsw m1, [r3 - 9 * 32]
pmulhrsw m1, m7
packuswb m11, m1
palignr m1, m3, m2, 6 ; recompute: m1 was consumed in place above
pmaddubsw m12, [r3 + 8 * 32] ; [24]
pmulhrsw m12, m7
pmaddubsw m1, [r3 + 8 * 32]
pmulhrsw m1, m7
packuswb m12, m1
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
; rows 8 to 15
palignr m5, m2, m0, 8
palignr m8, m3, m2, 8
pmaddubsw m4, m5, [r3 - 7 * 32] ; [9]
pmulhrsw m4, m7
pmaddubsw m1, m8, [r3 - 7 * 32]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, [r3 + 10 * 32] ; [26]
pmulhrsw m5, m7
pmaddubsw m8, [r3 + 10 * 32]
pmulhrsw m8, m7
packuswb m5, m8
palignr m8, m2, m0, 10
palignr m9, m3, m2, 10
pmaddubsw m6, m8, [r3 - 5 * 32] ; [11]
pmulhrsw m6, m7
pmaddubsw m1, m9, [r3 - 5 * 32]
pmulhrsw m1, m7
packuswb m6, m1
pmaddubsw m8, [r3 + 12 * 32] ; [28]
pmulhrsw m8, m7
pmaddubsw m9, [r3 + 12 * 32]
pmulhrsw m9, m7
packuswb m8, m9
palignr m10, m2, m0, 12
palignr m11, m3, m2, 12
pmaddubsw m9, m10, [r3 - 3 * 32] ; [13]
pmulhrsw m9, m7
pmaddubsw m1, m11, [r3 - 3 * 32]
pmulhrsw m1, m7
packuswb m9, m1
pmaddubsw m10, [r3 + 14 * 32] ; [30]
pmulhrsw m10, m7
pmaddubsw m11, [r3 + 14 * 32]
pmulhrsw m11, m7
packuswb m10, m11
palignr m11, m2, m0, 14
palignr m1, m3, m2, 14
pmaddubsw m11, [r3 - 1 * 32] ; [15]
pmulhrsw m11, m7
pmaddubsw m1, [r3 - 1 * 32]
pmulhrsw m1, m7
packuswb m11, m1
movu m2, [r2 + 9] ; last row: integer offset, no interpolation
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
ret
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang32_5 — 32x32 intra angular prediction, mode 5 (AVX2).
; Horizontal mode: left reference ([r2 + 64]), r7d = 0 selects the
; transposed-store path in the two shared mode-5/31 helpers.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_5, 3,8,13
add r2, 64 ; mode 5 uses the left reference
lea r3, [ang_table_avx2 + 32 * 16] ; centre of angle-weight table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
mov r4, r0 ; r4 = dst base (helper rewinds to r4 + 8)
xor r7d, r7d ; transpose path
call ang32_mode_5_31_row_0_15
add r4, 16
mov r0, r4
add r2, 9 ; reference advance between halves
call ang32_mode_5_31_row_16_31
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang32_31 — 32x32 intra angular prediction, mode 31 (AVX2).
; Vertical mirror of mode 5: same helpers, r7d != 0 selects direct store,
; reference is the above row (no +64 offset).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_31, 3,8,13
lea r3, [ang_table_avx2 + 32 * 16] ; centre of angle-weight table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
xor r7d, r7d
inc r7d ; direct-store path
call ang32_mode_5_31_row_0_15
add r2, 9 ; reference advance between halves
call ang32_mode_5_31_row_16_31
RET
;-----------------------------------------------------------------------------
; ang32_mode_6_30_row_0_15 — AVX2 helper shared by modes 6 and 30: first 16
; output lines. Contract as for the other ang32_mode_* helpers: r2 =
; reference, r3 = centre of ang_table_avx2, m7 = pw_1024, r7d (via surviving
; ZF from "test r7d, r7d") selects transpose vs direct store.
;-----------------------------------------------------------------------------
cglobal ang32_mode_6_30_row_0_15
test r7d, r7d
; rows 0 to 7
movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
pmaddubsw m4, m0, [r3 - 3 * 32] ; [13]
pmulhrsw m4, m7
pmaddubsw m1, m2, [r3 - 3 * 32]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, m0, [r3 + 10 * 32] ; [26]
pmulhrsw m5, m7
pmaddubsw m8, m2, [r3 + 10 * 32]
pmulhrsw m8, m7
packuswb m5, m8
palignr m8, m2, m0, 2
palignr m1, m3, m2, 2
pmaddubsw m6, m8, [r3 - 9 * 32] ; [7]
pmulhrsw m6, m7
pmaddubsw m9, m1, [r3 - 9 * 32]
pmulhrsw m9, m7
packuswb m6, m9
pmaddubsw m8, [r3 + 4 * 32] ; [20]
pmulhrsw m8, m7
pmaddubsw m1, [r3 + 4 * 32]
pmulhrsw m1, m7
packuswb m8, m1
palignr m11, m2, m0, 4
palignr m1, m3, m2, 4
pmaddubsw m9, m11, [r3 - 15 * 32] ; [1]
pmulhrsw m9, m7
pmaddubsw m12, m1, [r3 - 15 * 32]
pmulhrsw m12, m7
packuswb m9, m12
pmaddubsw m10, m11, [r3 - 2 * 32] ; [14] — same window, three weights total
pmulhrsw m10, m7
pmaddubsw m12, m1, [r3 - 2 * 32]
pmulhrsw m12, m7
packuswb m10, m12
pmaddubsw m11, [r3 + 11 * 32] ; [27]
pmulhrsw m11, m7
pmaddubsw m1, [r3 + 11 * 32]
pmulhrsw m1, m7
packuswb m11, m1
palignr m12, m2, m0, 6
palignr m1, m3, m2, 6
pmaddubsw m12, [r3 - 8 * 32] ; [8]
pmulhrsw m12, m7
pmaddubsw m1, [r3 - 8 * 32]
pmulhrsw m1, m7
packuswb m12, m1
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
; rows 8 to 15
palignr m4, m2, m0, 6 ; same window as the last step, next weight
palignr m1, m3, m2, 6
pmaddubsw m4, [r3 + 5 * 32] ; [21]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 5 * 32]
pmulhrsw m1, m7
packuswb m4, m1
palignr m8, m2, m0, 8
palignr m1, m3, m2, 8
pmaddubsw m5, m8, [r3 - 14 * 32] ; [2]
pmulhrsw m5, m7
pmaddubsw m9, m1, [r3 - 14 * 32]
pmulhrsw m9, m7
packuswb m5, m9
pmaddubsw m6, m8, [r3 - 1 * 32] ; [15]
pmulhrsw m6, m7
pmaddubsw m9, m1, [r3 - 1 * 32]
pmulhrsw m9, m7
packuswb m6, m9
pmaddubsw m8, [r3 + 12 * 32] ; [28]
pmulhrsw m8, m7
pmaddubsw m1, [r3 + 12 * 32]
pmulhrsw m1, m7
packuswb m8, m1
palignr m10, m2, m0, 10
palignr m1, m3, m2, 10
pmaddubsw m9, m10, [r3 - 7 * 32] ; [9]
pmulhrsw m9, m7
pmaddubsw m11, m1, [r3 - 7 * 32]
pmulhrsw m11, m7
packuswb m9, m11
pmaddubsw m10, [r3 + 6 * 32] ; [22]
pmulhrsw m10, m7
pmaddubsw m1, m1, [r3 + 6 * 32] ; 3-operand AVX form; same effect as the 2-op style used elsewhere
pmulhrsw m1, m7
packuswb m10, m1
palignr m3, m2, 12 ; shift windows in place — m2/m3 are dead after this step
palignr m2, m0, 12
pmaddubsw m11, m2, [r3 - 13 * 32] ; [3]
pmulhrsw m11, m7
pmaddubsw m1, m3, [r3 - 13 * 32]
pmulhrsw m1, m7
packuswb m11, m1
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
pmaddubsw m3, [r3]
pmulhrsw m3, m7
packuswb m2, m3
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
ret
;-----------------------------------------------------------------------------
; ang32_mode_6_30_row_16_31 -- internal helper, custom calling convention
; Computes prediction lines 16..31 of a 32x32 angular intra block for
; mode 6 / mode 30. The caller advances r2 (and, for mode 6, r0) before the
; call so the same relative addressing as the 0..15 helper can be reused.
; Inputs/outputs and the ZF convention are identical to
; ang32_mode_6_30_row_0_15 (see the header there); r7d = 0 for mode 6.
;-----------------------------------------------------------------------------
cglobal ang32_mode_6_30_row_16_31
test r7d, r7d
; rows 16 to 23
movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
pmaddubsw m4, m0, [r3 + 13 * 32] ; [29]
pmulhrsw m4, m7
pmaddubsw m1, m2, [r3 + 13 * 32]
pmulhrsw m1, m7
packuswb m4, m1
palignr m6, m2, m0, 2
palignr m1, m3, m2, 2
pmaddubsw m5, m6, [r3 - 6 * 32] ; [10]
pmulhrsw m5, m7
pmaddubsw m8, m1, [r3 - 6 * 32]
pmulhrsw m8, m7
packuswb m5, m8
pmaddubsw m6, [r3 + 7 * 32] ; [23]
pmulhrsw m6, m7
pmaddubsw m1, [r3 + 7 * 32]
pmulhrsw m1, m7
packuswb m6, m1
palignr m10, m2, m0, 4
palignr m1, m3, m2, 4
pmaddubsw m8, m10, [r3 - 12 * 32] ; [4]
pmulhrsw m8, m7
pmaddubsw m11, m1, [r3 - 12 * 32]
pmulhrsw m11, m7
packuswb m8, m11
pmaddubsw m9, m10, [r3 + 1 * 32] ; [17]
pmulhrsw m9, m7
pmaddubsw m11, m1, [r3 + 1 * 32]
pmulhrsw m11, m7
packuswb m9, m11
pmaddubsw m10, [r3 + 14 * 32] ; [30]
pmulhrsw m10, m7
pmaddubsw m1, [r3 + 14 * 32]
pmulhrsw m1, m7
packuswb m10, m1
palignr m12, m2, m0, 6
palignr m1, m3, m2, 6
pmaddubsw m11, m12, [r3 - 5 * 32] ; [11]
pmulhrsw m11, m7
pmaddubsw m1, [r3 - 5 * 32]
pmulhrsw m1, m7
packuswb m11, m1
palignr m1, m3, m2, 6 ; reload: m1 was consumed destructively above
pmaddubsw m12, [r3 + 8 * 32] ; [24]
pmulhrsw m12, m7
pmaddubsw m1, [r3 + 8 * 32]
pmulhrsw m1, m7
packuswb m12, m1
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
; rows 24 to 31
palignr m6, m2, m0, 8
palignr m1, m3, m2, 8
pmaddubsw m4, m6, [r3 - 11 * 32] ; [5]
pmulhrsw m4, m7
pmaddubsw m8, m1, [r3 - 11 * 32]
pmulhrsw m8, m7
packuswb m4, m8
pmaddubsw m5, m6, [r3 + 2 * 32] ; [18]
pmulhrsw m5, m7
pmaddubsw m9, m1, [r3 + 2 * 32]
pmulhrsw m9, m7
packuswb m5, m9
pmaddubsw m6, [r3 + 15 * 32] ; [31]
pmulhrsw m6, m7
pmaddubsw m1, [r3 + 15 * 32]
pmulhrsw m1, m7
packuswb m6, m1
palignr m9, m2, m0, 10
palignr m1, m3, m2, 10
pmaddubsw m8, m9, [r3 - 4 * 32] ; [12]
pmulhrsw m8, m7
pmaddubsw m10, m1, [r3 - 4 * 32]
pmulhrsw m10, m7
packuswb m8, m10
pmaddubsw m9, [r3 + 9 * 32] ; [25]
pmulhrsw m9, m7
pmaddubsw m1, [r3 + 9 * 32]
pmulhrsw m1, m7
packuswb m9, m1
palignr m3, m2, 12
palignr m2, m0, 12
pmaddubsw m10, m2, [r3 - 10 * 32] ; [6]
pmulhrsw m10, m7
pmaddubsw m1, m3, [r3 - 10 * 32]
pmulhrsw m1, m7
packuswb m10, m1
pmaddubsw m2, [r3 + 3 * 32] ; [19]
pmulhrsw m2, m7
pmaddubsw m3, [r3 + 3 * 32]
pmulhrsw m3, m7
packuswb m2, m3
; fraction 0: the last line is a straight copy of the reference pixels
movu m3, [r2 + 8] ; [0]
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 3, 0, 8
ret
INIT_YMM avx2
;-----------------------------------------------------------------------------
; void intra_pred_ang32_6(pixel* dst, intptr_t dstStride, pixel* refPix, ...)
; Horizontal angular mode 6 (intra angle 13). r7d = 0 selects the transposed
; store path in the shared helpers.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_6, 3,8,13
add r2, 64 ; use the second half of the reference buffer -- presumably the left-neighbour samples; confirm against caller layout
lea r3, [ang_table_avx2 + 32 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
mov r4, r0 ; keep dst base; helpers advance r0
xor r7d, r7d ; r7d = 0: mode 6 (transpose on store)
call ang32_mode_6_30_row_0_15
add r4, 16 ; transposed output of rows 16..31 lands 16 columns right
mov r0, r4
add r2, 6 ; integer pel advance over 16 rows: (16 * 13) >> 5 = 6
call ang32_mode_6_30_row_16_31
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; void intra_pred_ang32_30(pixel* dst, intptr_t dstStride, pixel* refPix, ...)
; Vertical mirror of mode 6 (intra angle 13): reuses the shared mode 6/30 row
; helpers; a non-zero r7d selects their direct (non-transposed) store path.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_30, 3,8,13
mova m7, [pw_1024] ; rounding constant for pmulhrsw
lea r3, [ang_table_avx2 + 32 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mov r7d, 1 ; non-zero: mode 30, store rows directly
call ang32_mode_6_30_row_0_15
add r2, 6 ; integer pel advance over 16 rows: (16 * 13) >> 5 = 6
call ang32_mode_6_30_row_16_31
RET
;-----------------------------------------------------------------------------
; ang32_mode_7_29_row_0_15 -- internal helper, custom calling convention
; Computes prediction lines 0..15 of a 32x32 angular intra block, shared by
; mode 7 and its vertical mirror mode 29.
; In:  r0  = dst, r1 = dst stride, r2 = reference pixels (first pixel at r2+1)
;      r3  = ang_table_avx2 + 32*16, r5 = 3*stride, r6 = 4*stride
;      m7  = pw_1024 (pmulhrsw rounding), r7d = 0 for mode 7, != 0 for mode 29
; The ";[n]" tags are the per-line interpolation fractions n/32.
; NOTE(review): `test r7d, r7d` sets ZF which is presumably consumed inside
; TRANSPOSE_32x8_AVX2 (the vector ops in between do not write EFLAGS) --
; confirm against the macro definition.
;-----------------------------------------------------------------------------
cglobal ang32_mode_7_29_row_0_15
test r7d, r7d
; rows 0 to 7
movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
pmaddubsw m4, m0, [r3 - 7 * 32] ; [9]
pmulhrsw m4, m7
pmaddubsw m1, m2, [r3 - 7 * 32]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, m0, [r3 + 2 * 32] ; [18]
pmulhrsw m5, m7
pmaddubsw m8, m2, [r3 + 2 * 32]
pmulhrsw m8, m7
packuswb m5, m8
pmaddubsw m6, m0, [r3 + 11 * 32] ; [27]
pmulhrsw m6, m7
pmaddubsw m9, m2, [r3 + 11 * 32]
pmulhrsw m9, m7
packuswb m6, m9
palignr m11, m2, m0, 2
palignr m1, m3, m2, 2
pmaddubsw m8, m11, [r3 - 12 * 32] ; [4]
pmulhrsw m8, m7
pmaddubsw m12, m1, [r3 - 12 * 32]
pmulhrsw m12, m7
packuswb m8, m12
pmaddubsw m9, m11, [r3 - 3 * 32] ; [13]
pmulhrsw m9, m7
pmaddubsw m12, m1, [r3 - 3 * 32]
pmulhrsw m12, m7
packuswb m9, m12
pmaddubsw m10, m11, [r3 + 6 * 32] ; [22]
pmulhrsw m10, m7
pmaddubsw m12, m1, [r3 + 6 * 32]
pmulhrsw m12, m7
packuswb m10, m12
pmaddubsw m11, [r3 + 15 * 32] ; [31]
pmulhrsw m11, m7
pmaddubsw m1, [r3 + 15 * 32]
pmulhrsw m1, m7
packuswb m11, m1
palignr m12, m2, m0, 4
palignr m1, m3, m2, 4
pmaddubsw m12, [r3 - 8 * 32] ; [8]
pmulhrsw m12, m7
pmaddubsw m1, [r3 - 8 * 32]
pmulhrsw m1, m7
packuswb m12, m1
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
; rows 8 to 15
palignr m5, m2, m0, 4
palignr m1, m3, m2, 4
pmaddubsw m4, m5, [r3 + 1 * 32] ; [17]
pmulhrsw m4, m7
pmaddubsw m8, m1, [r3 + 1 * 32]
pmulhrsw m8, m7
packuswb m4, m8
pmaddubsw m5, [r3 + 10 * 32] ; [26]
pmulhrsw m5, m7
pmaddubsw m1, [r3 + 10 * 32]
pmulhrsw m1, m7
packuswb m5, m1
palignr m10, m2, m0, 6
palignr m1, m3, m2, 6
pmaddubsw m6, m10, [r3 - 13 * 32] ; [3]
pmulhrsw m6, m7
pmaddubsw m9, m1, [r3 - 13 * 32]
pmulhrsw m9, m7
packuswb m6, m9
pmaddubsw m8, m10, [r3 - 4 * 32] ; [12]
pmulhrsw m8, m7
pmaddubsw m11, m1, [r3 - 4 * 32]
pmulhrsw m11, m7
packuswb m8, m11
pmaddubsw m9, m10, [r3 + 5 * 32] ; [21]
pmulhrsw m9, m7
pmaddubsw m11, m1, [r3 + 5 * 32]
pmulhrsw m11, m7
packuswb m9, m11
pmaddubsw m10, [r3 + 14 * 32] ; [30]
pmulhrsw m10, m7
pmaddubsw m1, [r3 + 14 * 32]
pmulhrsw m1, m7
packuswb m10, m1
; destructive final shift: m0/m2/m3 are dead after this group
palignr m3, m2, 8
palignr m2, m0, 8
pmaddubsw m11, m2, [r3 - 9 * 32] ; [7]
pmulhrsw m11, m7
pmaddubsw m1, m3, [r3 - 9 * 32]
pmulhrsw m1, m7
packuswb m11, m1
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
pmaddubsw m3, [r3]
pmulhrsw m3, m7
packuswb m2, m3
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
ret
;-----------------------------------------------------------------------------
; ang32_mode_7_29_row_16_31 -- internal helper, custom calling convention
; Computes prediction lines 16..31 for mode 7 / mode 29; the caller advances
; r2 (and r0 for mode 7) before this call. Inputs and the ZF convention match
; ang32_mode_7_29_row_0_15 (see the header there); r7d = 0 for mode 7.
;-----------------------------------------------------------------------------
cglobal ang32_mode_7_29_row_16_31
test r7d, r7d
; rows 16 to 23
movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
pmaddubsw m4, m0, [r3 + 9 * 32] ; [25]
pmulhrsw m4, m7
pmaddubsw m1, m2, [r3 + 9 * 32]
pmulhrsw m1, m7
packuswb m4, m1
palignr m9, m2, m0, 2
palignr m1, m3, m2, 2
pmaddubsw m5, m9, [r3 - 14 * 32] ; [2]
pmulhrsw m5, m7
pmaddubsw m8, m1, [r3 - 14 * 32]
pmulhrsw m8, m7
packuswb m5, m8
pmaddubsw m6, m9, [r3 - 5 * 32] ; [11]
pmulhrsw m6, m7
pmaddubsw m10, m1, [r3 - 5 * 32]
pmulhrsw m10, m7
packuswb m6, m10
pmaddubsw m8, m9, [r3 + 4 * 32] ; [20]
pmulhrsw m8, m7
pmaddubsw m10, m1, [r3 + 4 * 32]
pmulhrsw m10, m7
packuswb m8, m10
pmaddubsw m9, [r3 + 13 * 32] ; [29]
pmulhrsw m9, m7
pmaddubsw m1, [r3 + 13 * 32]
pmulhrsw m1, m7
packuswb m9, m1
palignr m12, m2, m0, 4
palignr m1, m3, m2, 4
pmaddubsw m10, m12, [r3 - 10 * 32] ; [6]
pmulhrsw m10, m7
pmaddubsw m11, m1, [r3 - 10 * 32]
pmulhrsw m11, m7
packuswb m10, m11
pmaddubsw m11, m12, [r3 - 1 * 32] ; [15]
pmulhrsw m11, m7
pmaddubsw m1, [r3 - 1 * 32]
pmulhrsw m1, m7
packuswb m11, m1
palignr m1, m3, m2, 4 ; reload: m1 was consumed destructively above
pmaddubsw m12, [r3 + 8 * 32] ; [24]
pmulhrsw m12, m7
pmaddubsw m1, [r3 + 8 * 32]
pmulhrsw m1, m7
packuswb m12, m1
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
; rows 24 to 31
palignr m8, m2, m0, 6
palignr m1, m3, m2, 6
pmaddubsw m4, m8, [r3 - 15 * 32] ; [1]
pmulhrsw m4, m7
pmaddubsw m9, m1, [r3 - 15 * 32]
pmulhrsw m9, m7
packuswb m4, m9
pmaddubsw m5, m8, [r3 - 6 * 32] ; [10]
pmulhrsw m5, m7
pmaddubsw m9, m1, [r3 - 6 * 32]
pmulhrsw m9, m7
packuswb m5, m9
pmaddubsw m6, m8, [r3 + 3 * 32] ; [19]
pmulhrsw m6, m7
pmaddubsw m9, m1, [r3 + 3 * 32]
pmulhrsw m9, m7
packuswb m6, m9
pmaddubsw m8, [r3 + 12 * 32] ; [28]
pmulhrsw m8, m7
pmaddubsw m1, [r3 + 12 * 32]
pmulhrsw m1, m7
packuswb m8, m1
palignr m3, m2, 8
palignr m2, m0, 8
pmaddubsw m9, m2, [r3 - 11 * 32] ; [5]
pmulhrsw m9, m7
pmaddubsw m1, m3, [r3 - 11 * 32]
pmulhrsw m1, m7
packuswb m9, m1
pmaddubsw m10, m2, [r3 - 2 * 32] ; [14]
pmulhrsw m10, m7
pmaddubsw m1, m3, [r3 - 2 * 32]
pmulhrsw m1, m7
packuswb m10, m1
pmaddubsw m2, [r3 + 7 * 32] ; [23]
pmulhrsw m2, m7
pmaddubsw m3, [r3 + 7 * 32]
pmulhrsw m3, m7
packuswb m2, m3
; fraction 0: last line is a straight copy of the reference pixels
movu m1, [r2 + 6] ; [0]
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 1, 0, 8
ret
INIT_YMM avx2
;-----------------------------------------------------------------------------
; void intra_pred_ang32_7(pixel* dst, intptr_t dstStride, pixel* refPix, ...)
; Horizontal angular mode 7 (intra angle 9). r7d = 0 selects the transposed
; store path in the shared helpers.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_7, 3,8,13
add r2, 64 ; use the second half of the reference buffer -- presumably the left-neighbour samples; confirm against caller layout
lea r3, [ang_table_avx2 + 32 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
mov r4, r0 ; keep dst base; helpers advance r0
xor r7d, r7d ; r7d = 0: mode 7 (transpose on store)
call ang32_mode_7_29_row_0_15
add r4, 16 ; transposed output of rows 16..31 lands 16 columns right
mov r0, r4
add r2, 4 ; integer pel advance over 16 rows: (16 * 9) >> 5 = 4
call ang32_mode_7_29_row_16_31
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; void intra_pred_ang32_29(pixel* dst, intptr_t dstStride, pixel* refPix, ...)
; Vertical mirror of mode 7 (intra angle 9): reuses the shared mode 7/29 row
; helpers; a non-zero r7d selects their direct (non-transposed) store path.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_29, 3,8,13
mova m7, [pw_1024] ; rounding constant for pmulhrsw
lea r3, [ang_table_avx2 + 32 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mov r7d, 1 ; non-zero: mode 29, store rows directly
call ang32_mode_7_29_row_0_15
add r2, 4 ; integer pel advance over 16 rows: (16 * 9) >> 5 = 4
call ang32_mode_7_29_row_16_31
RET
;-----------------------------------------------------------------------------
; ang32_mode_8_28_avx2 -- internal helper, custom calling convention
; Computes all 32 prediction lines of a 32x32 angular intra block, shared by
; mode 8 and its vertical mirror mode 28 (intra angle 5: the integer pel
; offset only advances every few rows, so palignr shifts cover all 32 rows
; from one reference load).
; In:  r0  = dst, r1 = dst stride, r2 = reference pixels (first pixel at r2+1)
;      r3  = ang_table_avx2 + 32*16, r5 = 3*stride, r6 = 4*stride
;      r4  = saved dst base (mode 8 only), m7 = pw_1024
;      r7d = 0 for mode 8, non-zero for mode 28
; NOTE(review): the `test r7d, r7d` below sets ZF that is consumed BOTH by the
; `jnz .doNotAdjustBufferPtr` ~100 instructions later and presumably inside
; TRANSPOSE_32x8_AVX2; this works only because none of the intervening vector
; ops / stores write EFLAGS -- keep it that way when editing.
;-----------------------------------------------------------------------------
cglobal ang32_mode_8_28_avx2
test r7d, r7d
; rows 0 to 7
movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
pmaddubsw m4, m0, [r3 - 11 * 32] ; [5]
pmulhrsw m4, m7
pmaddubsw m1, m2, [r3 - 11 * 32]
pmulhrsw m1, m7
packuswb m4, m1
pmaddubsw m5, m0, [r3 - 6 * 32] ; [10]
pmulhrsw m5, m7
pmaddubsw m8, m2, [r3 - 6 * 32]
pmulhrsw m8, m7
packuswb m5, m8
pmaddubsw m6, m0, [r3 - 1 * 32] ; [15]
pmulhrsw m6, m7
pmaddubsw m9, m2, [r3 - 1 * 32]
pmulhrsw m9, m7
packuswb m6, m9
pmaddubsw m8, m0, [r3 + 4 * 32] ; [20]
pmulhrsw m8, m7
pmaddubsw m12, m2, [r3 + 4 * 32]
pmulhrsw m12, m7
packuswb m8, m12
pmaddubsw m9, m0, [r3 + 9 * 32] ; [25]
pmulhrsw m9, m7
pmaddubsw m12, m2, [r3 + 9 * 32]
pmulhrsw m12, m7
packuswb m9, m12
pmaddubsw m10, m0, [r3 + 14 * 32] ; [30]
pmulhrsw m10, m7
pmaddubsw m12, m2, [r3 + 14 * 32]
pmulhrsw m12, m7
packuswb m10, m12
palignr m12, m2, m0, 2
palignr m1, m3, m2, 2
pmaddubsw m11, m12, [r3 - 13 * 32] ; [3]
pmulhrsw m11, m7
pmaddubsw m1, [r3 - 13 * 32]
pmulhrsw m1, m7
packuswb m11, m1
palignr m1, m3, m2, 2 ; reload: m1 was consumed destructively above
pmaddubsw m12, [r3 - 8 * 32] ; [8]
pmulhrsw m12, m7
pmaddubsw m1, [r3 - 8 * 32]
pmulhrsw m1, m7
packuswb m12, m1
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
; rows 8 to 15
palignr m8, m2, m0, 2
palignr m1, m3, m2, 2
pmaddubsw m4, m8, [r3 - 3 * 32] ; [13]
pmulhrsw m4, m7
pmaddubsw m9, m1, [r3 - 3 * 32]
pmulhrsw m9, m7
packuswb m4, m9
pmaddubsw m5, m8, [r3 + 2 * 32] ; [18]
pmulhrsw m5, m7
pmaddubsw m9, m1, [r3 + 2 * 32]
pmulhrsw m9, m7
packuswb m5, m9
pmaddubsw m6, m8, [r3 + 7 * 32] ; [23]
pmulhrsw m6, m7
pmaddubsw m9, m1, [r3 + 7 * 32]
pmulhrsw m9, m7
packuswb m6, m9
pmaddubsw m8, [r3 + 12 * 32] ; [28]
pmulhrsw m8, m7
pmaddubsw m1, [r3 + 12 * 32]
pmulhrsw m1, m7
packuswb m8, m1
palignr m12, m2, m0, 4
palignr m1, m3, m2, 4
pmaddubsw m9, m12, [r3 - 15 * 32] ; [1]
pmulhrsw m9, m7
pmaddubsw m11, m1, [r3 - 15 * 32]
pmulhrsw m11, m7
packuswb m9, m11
pmaddubsw m10, m12, [r3 - 10 * 32] ; [6]
pmulhrsw m10, m7
pmaddubsw m11, m1, [r3 - 10 * 32]
pmulhrsw m11, m7
packuswb m10, m11
pmaddubsw m11, m12, [r3 - 5 * 32] ; [11]
pmulhrsw m11, m7
pmaddubsw m1, [r3 - 5 * 32]
pmulhrsw m1, m7
packuswb m11, m1
palignr m1, m3, m2, 4 ; reload: m1 was consumed destructively above
pmaddubsw m12, [r3] ; [16]
pmulhrsw m12, m7
pmaddubsw m1, [r3]
pmulhrsw m1, m7
packuswb m12, m1
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8
; rows 16 to 23
; For mode 8 (r7d == 0, ZF still set from the `test` above) the transposed
; output of rows 16..31 starts 16 columns to the right: bump the saved dst.
jnz .doNotAdjustBufferPtr
lea r4, [r4 + mmsize/2]
mov r0, r4
.doNotAdjustBufferPtr:
palignr m6, m2, m0, 4
palignr m1, m3, m2, 4
pmaddubsw m4, m6, [r3 + 5 * 32] ; [21]
pmulhrsw m4, m7
pmaddubsw m8, m1, [r3 + 5 * 32]
pmulhrsw m8, m7
packuswb m4, m8
pmaddubsw m5, m6, [r3 + 10 * 32] ; [26]
pmulhrsw m5, m7
pmaddubsw m8, m1, [r3 + 10 * 32]
pmulhrsw m8, m7
packuswb m5, m8
pmaddubsw m6, [r3 + 15 * 32] ; [31]
pmulhrsw m6, m7
pmaddubsw m1, [r3 + 15 * 32]
pmulhrsw m1, m7
packuswb m6, m1
palignr m12, m2, m0, 6
palignr m1, m3, m2, 6
pmaddubsw m8, m12, [r3 - 12 * 32] ; [4]
pmulhrsw m8, m7
pmaddubsw m11, m1, [r3 - 12 * 32]
pmulhrsw m11, m7
packuswb m8, m11
pmaddubsw m9, m12, [r3 - 7 * 32] ; [9]
pmulhrsw m9, m7
pmaddubsw m11, m1, [r3 - 7 * 32]
pmulhrsw m11, m7
packuswb m9, m11
pmaddubsw m10, m12, [r3 - 2 * 32] ; [14]
pmulhrsw m10, m7
pmaddubsw m11, m1, [r3 - 2 * 32]
pmulhrsw m11, m7
packuswb m10, m11
pmaddubsw m11, m12, [r3 + 3 * 32] ; [19]
pmulhrsw m11, m7
pmaddubsw m1, [r3 + 3 * 32]
pmulhrsw m1, m7
packuswb m11, m1
palignr m1, m3, m2, 6 ; reload: m1 was consumed destructively above
pmaddubsw m12, [r3 + 8 * 32] ; [24]
pmulhrsw m12, m7
pmaddubsw m1, [r3 + 8 * 32]
pmulhrsw m1, m7
packuswb m12, m1
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 16
; rows 24 to 31
palignr m4, m2, m0, 6
palignr m1, m3, m2, 6
pmaddubsw m4, [r3 + 13 * 32] ; [29]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 13 * 32]
pmulhrsw m1, m7
packuswb m4, m1
palignr m3, m2, 8
palignr m2, m0, 8
pmaddubsw m5, m2, [r3 - 14 * 32] ; [2]
pmulhrsw m5, m7
pmaddubsw m9, m3, [r3 - 14 * 32]
pmulhrsw m9, m7
packuswb m5, m9
pmaddubsw m6, m2, [r3 - 9 * 32] ; [7]
pmulhrsw m6, m7
pmaddubsw m9, m3, [r3 - 9 * 32]
pmulhrsw m9, m7
packuswb m6, m9
pmaddubsw m8, m2, [r3 - 4 * 32] ; [12]
pmulhrsw m8, m7
pmaddubsw m1, m3, [r3 - 4 * 32]
pmulhrsw m1, m7
packuswb m8, m1
pmaddubsw m9, m2, [r3 + 1 * 32] ; [17]
pmulhrsw m9, m7
pmaddubsw m11, m3, [r3 + 1 * 32]
pmulhrsw m11, m7
packuswb m9, m11
pmaddubsw m10, m2, [r3 + 6 * 32] ; [22]
pmulhrsw m10, m7
pmaddubsw m1, m3, [r3 + 6 * 32]
pmulhrsw m1, m7
packuswb m10, m1
pmaddubsw m2, [r3 + 11 * 32] ; [27]
pmulhrsw m2, m7
pmaddubsw m3, [r3 + 11 * 32]
pmulhrsw m3, m7
packuswb m2, m3
; fraction 0: last line is a straight copy of the reference pixels
movu m3, [r2 + 6] ; [0]
TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 3, 0, 24
ret
INIT_YMM avx2
;-----------------------------------------------------------------------------
; void intra_pred_ang32_8(pixel* dst, intptr_t dstStride, pixel* refPix, ...)
; Horizontal angular mode 8 (intra angle 5). A single helper covers all 32
; rows; r7d = 0 selects its transposed store path.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_8, 3,8,13
add r2, 64 ; use the second half of the reference buffer -- presumably the left-neighbour samples; confirm against caller layout
lea r3, [ang_table_avx2 + 32 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
mov r4, r0 ; saved dst base; the helper bumps it for rows 16..31
xor r7d, r7d ; r7d = 0: mode 8 (transpose on store)
call ang32_mode_8_28_avx2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; void intra_pred_ang32_28(pixel* dst, intptr_t dstStride, pixel* refPix, ...)
; Vertical mirror of mode 8 (intra angle 5): reuses the shared mode 8/28
; helper; a non-zero r7d selects its direct (non-transposed) store path.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_28, 3,8,13
mova m7, [pw_1024] ; rounding constant for pmulhrsw
lea r3, [ang_table_avx2 + 32 * 16]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mov r7d, 1 ; non-zero: mode 28, store rows directly
call ang32_mode_8_28_avx2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; void intra_pred_ang32_9(pixel* dst, intptr_t dstStride, pixel* refPix, ...)
; Horizontal angular mode 9. Every output row interpolates adjacent reference
; pixels with fixed weights held in m0/m1; the source window slides one byte
; per row, implemented as palignr over two 16-byte reference chunks.
; The row bodies are identical, so they are generated with %rep; the emitted
; instruction stream matches the fully unrolled form.
;-----------------------------------------------------------------------------

; Filter one 32-pixel row: m5 holds the shuffled source bytes on entry.
; Clobbers m4/m5; m0/m1 (weights), m2 (pw_1024) and m7 (shuffle) are preserved.
%macro ANG32_M9_STORE 1 ; %1 = destination address
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu %1, m4
%endmacro

cglobal intra_pred_ang32_9, 3,5,8
vbroadcasti128 m0, [angHor_tab_9]
vbroadcasti128 m1, [angHor_tab_9 + mmsize/2]
mova m2, [pw_1024]
mova m7, [ang32_shuf_mode9]
lea r3, [r1 * 3]
vbroadcasti128 m3, [r2 + mmsize*2 + 1]
vbroadcasti128 m6, [r2 + mmsize*2 + 17]
; rows 0..15: source window slides from m3 into m6
pshufb m5, m3, m7
ANG32_M9_STORE [r0]
%assign x 1
%rep 15
palignr m5, m6, m3, x
pshufb m5, m7
%if (x & 3) == 1
ANG32_M9_STORE [r0 + r1]
%elif (x & 3) == 2
ANG32_M9_STORE [r0 + r1*2]
%else
ANG32_M9_STORE [r0 + r3]
lea r0, [r0 + r1 * 4]
%endif
%assign x x+1
%endrep
; rows 16..31: source window slides from m6 into the next 16 reference bytes
vbroadcasti128 m3, [r2 + mmsize*2 + 33]
pshufb m5, m6, m7
ANG32_M9_STORE [r0]
%assign x 1
%rep 15
palignr m5, m3, m6, x
pshufb m5, m7
%if (x & 3) == 1
ANG32_M9_STORE [r0 + r1]
%elif (x & 3) == 2
ANG32_M9_STORE [r0 + r1*2]
%elif x == 15
ANG32_M9_STORE [r0 + r3] ; last row: no pointer bump needed
%else
ANG32_M9_STORE [r0 + r3]
lea r0, [r0 + r1 * 4]
%endif
%assign x x+1
%endrep
RET
%unmacro ANG32_M9_STORE 1
;-----------------------------------------------------------------------------
; void intra_pred_ang32_27(pixel* dst, intptr_t dstStride, pixel* refPix, ...)
; Vertical angular mode 27 (fraction advances by 2/32 per row). Rows 0..14 and
; 16..30 use the same coefficient ladder (fractions 2,4,...,30) over pixel
; pairs; rows 15 and 31 have fraction 0 and are plain copies of the reference.
; The ladder is generated with %rep; the emitted instruction stream matches
; the fully unrolled form.
;-----------------------------------------------------------------------------

; Interpolate one 32-pixel row from interleaved pixel pairs (%1 = low half,
; %2 = high half) using the coefficient row at table offset %3, store to %4.
; Clobbers m4/m1; m5 holds pw_1024.
%macro ANG32_M27_ROW 4
pmaddubsw m4, %1, [r3 + (%3) * 32]
pmulhrsw m4, m5
pmaddubsw m1, %2, [r3 + (%3) * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu %4, m4
%endmacro

cglobal intra_pred_ang32_27, 3,5,6
lea r3, [ang_table_avx2 + 32 * 16]
lea r4, [r1 * 3] ; r4 -> 3 * stride
mova m5, [pw_1024]
movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
; interleave each pixel with its right neighbour for pmaddubsw
punpckhbw m2, m0, m1
punpcklbw m0, m1
punpcklbw m3, m4
; rows 0..14: fractions 2,4,...,30 over pixel pairs starting at offset 0
%assign x 0
%rep 15
%if (x & 3) == 0
ANG32_M27_ROW m0, m2, x*2 - 14, [r0]
%elif (x & 3) == 1
ANG32_M27_ROW m0, m2, x*2 - 14, [r0 + r1]
%elif (x & 3) == 2
ANG32_M27_ROW m0, m2, x*2 - 14, [r0 + r1*2]
%else
ANG32_M27_ROW m0, m2, x*2 - 14, [r0 + r4]
lea r0, [r0 + r1 * 4]
%endif
%assign x x+1
%endrep
; row 15: fraction 0 -> straight copy of refs [2..33]; also shift the pair
; windows by one pixel for the second half
palignr m3, m2, 2
palignr m2, m0, 2
movu m1, [r2 + 2] ; [0]
movu [r0 + r4], m1
lea r0, [r0 + r1 * 4]
; rows 16..30: fractions 2,4,...,30 over pixel pairs starting at offset 1
%assign x 0
%rep 15
%if (x & 3) == 0
ANG32_M27_ROW m2, m3, x*2 - 14, [r0]
%elif (x & 3) == 1
ANG32_M27_ROW m2, m3, x*2 - 14, [r0 + r1]
%elif (x & 3) == 2
ANG32_M27_ROW m2, m3, x*2 - 14, [r0 + r1*2]
%else
ANG32_M27_ROW m2, m3, x*2 - 14, [r0 + r4]
lea r0, [r0 + r1 * 4]
%endif
%assign x x+1
%endrep
; row 31: fraction 0 -> straight copy of refs [3..34]
movu m1, [r2 + 3] ; [0]
movu [r0 + r4], m1
RET
%unmacro ANG32_M27_ROW 4
;-----------------------------------------------------------------------------
; void intra_pred_ang32_10(pixel* dst, intptr_t dstStride, pixel* refPix, ...)
; Pure horizontal mode: output row r is reference pixel r broadcast across
; the row. m0 holds the broadcast index (incremented by pb_1 per row), m2 the
; current 16 reference pixels replicated into both lanes. Generated with
; %rep; the emitted instruction stream matches the fully unrolled form.
;-----------------------------------------------------------------------------

; Emit 4 rows: broadcast byte m0 / m0+1 / m0+2 / m0+3 of m2. Leaves m0
; pointing at the byte used for the 4th row (caller bumps it once more).
%macro ANG32_M10_ROW4 0
pshufb m3, m2, m0
movu [r0], m3
paddb m0, m1
pshufb m3, m2, m0
movu [r0 + r1], m3
paddb m0, m1
pshufb m3, m2, m0
movu [r0 + r1 * 2], m3
paddb m0, m1
pshufb m3, m2, m0
movu [r0 + r4], m3
%endmacro

cglobal intra_pred_ang32_10, 5,5,4
pxor m0, m0 ; byte index 0
mova m1, [pb_1]
lea r4, [r1 * 3]
vbroadcasti128 m2, [r2 + mmsize*2 + 1] ; left reference pixels 0..15
ANG32_M10_ROW4
%rep 3
lea r0, [r0 + r1 * 4]
paddb m0, m1
ANG32_M10_ROW4
%endrep
lea r0, [r0 + r1 * 4]
; rows 16..31: restart the index on the next 16 reference pixels
pxor m0, m0
vbroadcasti128 m2, [r2 + mmsize*2 + mmsize/2 + 1]
ANG32_M10_ROW4
%rep 3
lea r0, [r0 + r1 * 4]
paddb m0, m1
ANG32_M10_ROW4
%endrep
RET
%unmacro ANG32_M10_ROW4 0
cglobal intra_pred_ang32_11, 3,4,8
vbroadcasti128 m0, [angHor_tab_11]
vbroadcasti128 m1, [angHor_tab_11 + mmsize/2]
mova m2, [pw_1024]
mova m7, [ang32_shuf_mode11]
lea r3, [r1 * 3]
; prepare for [16 0 -1 -2 ...]
movu xm3, [r2 + mmsize*2 - 1]
vbroadcasti128 m6, [r2 + mmsize*2 + 15]
pinsrb xm3, [r2 + 0], 1
pinsrb xm3, [r2 + 16], 0
vinserti128 m3, m3, xm3, 1 ; [16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14]
pshufb m5, m3, m7 ; [ 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 16 0 16 0 16 0 16 0 16 0 16 0 16 0 16 0]
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m5, m6, m3, 1
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m5, m6, m3, 2
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m5, m6, m3, 3
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m5, m6, m3, 4
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m5, m6, m3, 5
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m5, m6, m3, 6
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m5, m6, m3, 7
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m5, m6, m3, 8
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m5, m6, m3, 9
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m5, m6, m3, 10
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m5, m6, m3, 11
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m5, m6, m3, 12
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m5, m6, m3, 13
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m5, m6, m3, 14
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m5, m6, m3, 15
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
mova m3, m6
vbroadcasti128 m6, [r2 + mmsize*2 + 15 + 16]
pshufb m5, m3, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m5, m6, m3, 1
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m5, m6, m3, 2
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m5, m6, m3, 3
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m5, m6, m3, 4
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m5, m6, m3, 5
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m5, m6, m3, 6
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m5, m6, m3, 7
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m5, m6, m3, 8
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m5, m6, m3, 9
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m5, m6, m3, 10
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m5, m6, m3, 11
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m5, m6, m3, 12
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m5, m6, m3, 13
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m5, m6, m3, 14
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m5, m6, m3, 15
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
RET
;-----------------------------------------------------------------------------
; intra_pred_ang32_25 -- 32x32 angular intra prediction, mode 25 (AVX2, 8bpp).
; In:   r0 = dst, r1 = dstStride, r2 = reference sample array
;       (above row is read at [r2]; one sample is fetched from
;        [r2 + mmsize*2 + 16] -- reference-buffer layout assumed from these
;        accesses, TODO confirm against the caller).
; Regs: r3 = &ang_table_avx2[16*32], so [r3 + k*32] is the 32-byte
;       coefficient row for prediction fraction (16 + k); the "[NN]"
;       end-of-line notes name the fraction actually applied per row.
;       r4 = 3*dstStride; m5 = pw_1024 (pmulhrsw rounding constant).
; Each output row: pmaddubsw (pixel-pair * coeff) -> pmulhrsw (>>; round)
; -> packuswb low/high halves -> one 32-byte store.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_25, 3,5,7
    lea r3, [ang_table_avx2 + 32 * 16]
    lea r4, [r1 * 3]
    mova m5, [pw_1024]
    ; rows 0 to 7
    movu m0, [r2 + 0] ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    movu m1, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    ; m3 collects the two extra reference samples needed once the
    ; prediction source shifts left by one (used from row 16 onwards)
    pinsrb xm3, [r2], 15
    pinsrb xm3, [r2 + mmsize*2 + 16], 14
    ; interleave ref[i]/ref[i+1] byte pairs for pmaddubsw
    punpckhbw m2, m0, m1 ; [32 31 31 30 30 29 29 28 28 27 27 26 26 25 25 24 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    punpcklbw m0, m1 ; [24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
    vinserti128 m3, m3, xm2, 1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 0 16 x x x x x x x x x x x x x x]
    pmaddubsw m4, m0, [r3 + 14 * 32] ; [30]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 14 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m0, [r3 + 12 * 32] ; [28]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 12 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m0, [r3 + 10 * 32] ; [26]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 10 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4
    pmaddubsw m4, m0, [r3 + 8 * 32] ; [24]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 8 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    pmaddubsw m4, m0, [r3 + 6 * 32] ; [22]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 6 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m0, [r3 + 4 * 32] ; [20]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 4 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m0, [r3 + 2 * 32] ; [18]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 2 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4
    pmaddubsw m4, m0, [r3] ; [16]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    ; rows 8 to 15
    pmaddubsw m4, m0, [r3 - 2 * 32] ; [14]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 2 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m0, [r3 - 4 * 32] ; [12]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 4 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m0, [r3 - 6 * 32] ; [10]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 6 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4
    pmaddubsw m4, m0, [r3 - 8 * 32] ; [8]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 8 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    pmaddubsw m4, m0, [r3 - 10 * 32] ; [6]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 10 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m0, [r3 - 12 * 32] ; [4]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 12 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m0, [r3 - 14 * 32] ; [2]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 14 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1 * 2], m4
    ; fraction 0: row 15 is a direct copy of the reference row
    movu m1, [r2] ; [0]
    movu [r0 + r4], m1
    lea r0, [r0 + r1 * 4]
    ; shift the interleaved reference left by one sample pair for rows 16-31
    palignr m2, m0, 14
    palignr m0, m3, 14
    ; rows 16 to 23
    pmaddubsw m4, m0, [r3 + 14 * 32] ; [30]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 14 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m0, [r3 + 12 * 32] ; [28]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 12 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m0, [r3 + 10 * 32] ; [26]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 10 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4
    pmaddubsw m4, m0, [r3 + 8 * 32] ; [24]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 8 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    pmaddubsw m4, m0, [r3 + 6 * 32] ; [22]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 6 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m0, [r3 + 4 * 32] ; [20]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 4 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m0, [r3 + 2 * 32] ; [18]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 2 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4
    pmaddubsw m4, m0, [r3] ; [16]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    ; rows 24 to 31
    pmaddubsw m4, m0, [r3 - 2 * 32] ; [14]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 2 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m0, [r3 - 4 * 32] ; [12]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 4 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m0, [r3 - 6 * 32] ; [10]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 6 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1 * 2], m4
    pmaddubsw m4, m0, [r3 - 8 * 32] ; [8]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 8 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    pmaddubsw m4, m0, [r3 - 10 * 32] ; [6]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 10 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m0, [r3 - 12 * 32] ; [4]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 12 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    ; last use of m0/m2 -- destructive 2-operand forms are safe here
    pmaddubsw m0, [r3 - 14 * 32] ; [2]
    pmulhrsw m0, m5
    pmaddubsw m2, [r3 - 14 * 32]
    pmulhrsw m2, m5
    packuswb m0, m2
    movu [r0 + r1*2], m0
    ; final row, fraction 0: shifted copy of the reference row
    movu m1, [r2 + 1] ; [0]
    palignr m1, m3, 14
    movu [r0 + r4], m1
    RET
;-----------------------------------------------------------------------------
; intra_pred_ang32_12 -- 32x32 angular intra prediction, mode 12 (AVX2, 8bpp).
; In:   r0 = dst, r1 = dstStride, r2 = reference sample array
;       (reads above samples at [r2 + mmsize*2 ...] and individual left
;        samples at [r2+6/13/19/26] -- layout assumed from these accesses,
;        TODO confirm against the caller).
; Regs: m0/m1 = per-lane weight pairs (ang32_fact_mode12),
;       m7/m8 = byte shuffles (ang32_shuf_mode12) selecting the two source
;       pixels for each destination column, m2 = pw_1024 rounding constant,
;       r3 = 3*dstStride.
; Per row: take a 1-byte-shifted window of the reference (palignr of m6:m3),
; shuffle it into pixel pairs, then pmaddubsw/pmulhrsw/packuswb and store.
; After 16 rows, m3 <- m6 and m6 reloads the next 16 reference bytes.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_12, 3,4,9
    movu m0, [ang32_fact_mode12]
    movu m1, [ang32_fact_mode12 + mmsize]
    mova m2, [pw_1024]
    mova m7, [ang32_shuf_mode12]
    mova m8, [ang32_shuf_mode12 + mmsize]
    lea r3, [r1 * 3]
    ; prepare for [26, 19, 13, 6, 0, -1, -2....]
    movu xm4, [r2 + mmsize*2 - 4]
    vbroadcasti128 m6, [r2 + mmsize*2 + 12]
    ; splice the projected left-column samples (indices 26/19/13/6/0)
    ; in front of the above row
    pinsrb xm4, [r2 + 0], 4
    pinsrb xm4, [r2 + 6], 3
    pinsrb xm4, [r2 + 13], 2
    pinsrb xm4, [r2 + 19], 1
    pinsrb xm4, [r2 + 26], 0
    vinserti128 m3, m4, xm4, 1 ; [26, 19, 13, 6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 26, 19, 13, 6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    pshufb m4, m3, m7 ; [ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 6, 0, 6, 0, 13, 6, 13, 6, 13, 6, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13]
    pshufb m5, m3, m8 ; [ 6, 0, 6, 0, 6, 0, 6, 0, 13, 6, 13, 6, 13, 6, 13, 6, 19, 13, 16, 19, 16, 19, 16, 19, 16, 19, 16, 19, 16, 19]
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4
    palignr m4, m6, m3, 1
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4
    palignr m4, m6, m3, 2
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4
    palignr m4, m6, m3, 3
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4
    lea r0, [r0 + r1 * 4]
    palignr m4, m6, m3, 4
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4
    palignr m4, m6, m3, 5
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4
    palignr m4, m6, m3, 6
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4
    palignr m4, m6, m3, 7
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4
    lea r0, [r0 + r1 * 4]
    palignr m4, m6, m3, 8
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4
    palignr m4, m6, m3, 9
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4
    palignr m4, m6, m3, 10
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4
    palignr m4, m6, m3, 11
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4
    lea r0, [r0 + r1 * 4]
    palignr m4, m6, m3, 12
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4
    palignr m4, m6, m3, 13
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4
    palignr m4, m6, m3, 14
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4
    palignr m4, m6, m3, 15
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4
    lea r0, [r0 + r1 * 4]
    ; advance the reference window by 16 bytes for rows 16-31
    mova m3, m6
    vbroadcasti128 m6, [r2 + mmsize*2 + 12 + 16]
    pshufb m4, m3, m7
    pshufb m5, m3, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4
    palignr m4, m6, m3, 1
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4
    palignr m4, m6, m3, 2
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4
    palignr m4, m6, m3, 3
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4
    lea r0, [r0 + r1 * 4]
    palignr m4, m6, m3, 4
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4
    palignr m4, m6, m3, 5
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4
    palignr m4, m6, m3, 6
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4
    palignr m4, m6, m3, 7
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4
    lea r0, [r0 + r1 * 4]
    palignr m4, m6, m3, 8
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4
    palignr m4, m6, m3, 9
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4
    palignr m4, m6, m3, 10
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4
    palignr m4, m6, m3, 11
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4
    lea r0, [r0 + r1 * 4]
    palignr m4, m6, m3, 12
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4
    palignr m4, m6, m3, 13
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4
    palignr m4, m6, m3, 14
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4
    palignr m4, m6, m3, 15
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4
    RET
;-----------------------------------------------------------------------------
; intra_pred_ang32_24 -- 32x32 angular intra prediction, mode 24 (AVX2, 8bpp).
; In:   r0 = dst, r1 = dstStride, r2 = reference sample array
;       (above row at [r2], extra samples at [r2 + mmsize*2] -- layout
;        assumed from these accesses, TODO confirm against the caller).
; Regs: r3 = &ang_table_avx2[16*32]; [r3 + k*32] is the coefficient row for
;       fraction (16 + k) -- the "[NN]" notes give the fraction per row.
;       r4 = 3*dstStride; m5 = pw_1024 (pmulhrsw rounding).
; m0/m2 hold the interleaved ref[i]/ref[i+1] pairs; m3 holds the projected
; left-column samples (built via ang32_shuf_mode24 + vpermd) that are
; shifted in by palignr every time the integer offset decreases.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_24, 3,5,8
    lea r3, [ang_table_avx2 + 32 * 16]
    lea r4, [r1 * 3]
    mova m5, [pw_1024]
    ; rows 0 to 7
    movu m0, [r2 + 0]
    movu m1, [r2 + 1]
    punpckhbw m2, m0, m1
    punpcklbw m0, m1
    movu m4, [r2 + mmsize*2]
    pshufb m4, [ang32_shuf_mode24]
    mova m3, [ang32_shuf_mode24 + mmsize]
    vpermd m4, m3, m4 ; [6 6 13 13 19 19 26 26 x x x...]
    palignr m3, m0, m4, 1
    vinserti128 m3, m3, xm2, 1
    pmaddubsw m4, m0, [r3 + 11 * 32] ; [27]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 11 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m0, [r3 + 6 * 32] ; [22]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 6 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m0, [r3 + 1 * 32] ; [17]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 1 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4
    pmaddubsw m4, m0, [r3 - 4 * 32] ; [12]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 4 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    pmaddubsw m4, m0, [r3 - 9 * 32] ; [7]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 9 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m0, [r3 - 14 * 32] ; [2]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 14 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    ; integer offset steps back one sample: shift in a left-column pixel
    palignr m6, m0, m3, 14
    palignr m7, m2, m0, 14
    pmaddubsw m4, m6, [r3 + 13 * 32] ; [29]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 13 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4
    pmaddubsw m4, m6, [r3 + 8 * 32] ; [24]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 8 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    ; rows 8 to 15
    pmaddubsw m4, m6, [r3 + 3 * 32] ; [19]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 3 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m6, [r3 - 2 * 32] ; [14]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 2 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m6, [r3 - 7 * 32] ; [9]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 7 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4
    pmaddubsw m4, m6, [r3 - 12 * 32] ; [4]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 12 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    palignr m6, m0, m3, 12
    palignr m7, m2, m0, 12
    pmaddubsw m4, m6, [r3 + 15 * 32] ; [31]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 15 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m6, [r3 + 10 * 32] ; [26]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 10 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m6, [r3 + 5 * 32] ; [21]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 5 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1 * 2], m4
    pmaddubsw m4, m6, [r3] ; [16]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    ; rows 16 to 23
    pmaddubsw m4, m6, [r3 - 5 * 32] ; [11]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 5 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m6, [r3 - 10 * 32] ; [6]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 10 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m6, [r3 - 15 * 32] ; [1]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 15 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4
    palignr m6, m0, m3, 10
    palignr m7, m2, m0, 10
    pmaddubsw m4, m6, [r3 + 12 * 32] ; [28]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 12 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    pmaddubsw m4, m6, [r3 + 7 * 32] ; [23]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 7 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m6, [r3 + 2 * 32] ; [18]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 2 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m6, [r3 - 3 * 32] ; [13]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 3 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4
    pmaddubsw m4, m6, [r3 - 8 * 32] ; [8]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 8 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    ; rows 24 to 31
    pmaddubsw m4, m6, [r3 - 13 * 32] ; [3]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 13 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    palignr m6, m0, m3, 8
    palignr m7, m2, m0, 8
    pmaddubsw m4, m6, [r3 + 14 * 32] ; [30]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 14 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m6, [r3 + 9 * 32] ; [25]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 9 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1 * 2], m4
    pmaddubsw m4, m6, [r3 + 4 * 32] ; [20]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 4 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    pmaddubsw m4, m6, [r3 - 1 * 32] ; [15]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 1 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m6, [r3 - 6 * 32] ; [10]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 6 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m6, [r3 - 11 * 32] ; [5]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 11 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4
    ; last row has fraction 0: keep only the even (ref[i]) bytes of each
    ; interleaved pair, i.e. a straight copy of the shifted reference
    pand m6, [pw_00ff]
    pand m7, [pw_00ff]
    packuswb m6, m7
    movu [r0 + r4], m6
    RET
;-----------------------------------------------------------------------------
; intra_pred_ang32_13 -- 32x32 angular intra prediction, mode 13 (AVX2, 8bpp).
; In:   r0 = dst, r1 = dstStride, r2 = reference sample array
;       (projected left-column samples gathered from [r2] via
;        ang32_shuf_mode13 + the mode24 permute table; above samples read
;        at [r2 + mmsize*2 + 1] -- layout assumed from these accesses,
;        TODO confirm against the caller).
; Regs: m0/m1 = per-lane weight pairs (ang32_fact_mode13),
;       m7/m8 = pixel-pair selection shuffles (ang32_shuf_mode13),
;       m2 = pw_1024 rounding constant, r3 = 3*dstStride.
; Per row: palignr slides a 1-byte window across m6:m3 (spliced
; left-projection + above samples), m7/m8 pick the two source pixels per
; column, then pmaddubsw/pmulhrsw/packuswb produce the 32-byte row.
; After 16 rows m6 <- m3 and m3 reloads the next 16 reference bytes.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_13, 3,4,9
    movu m0, [ang32_fact_mode13]
    movu m1, [ang32_fact_mode13 + mmsize]
    mova m2, [pw_1024]
    mova m7, [ang32_shuf_mode13]
    mova m8, [ang32_shuf_mode13 + mmsize]
    lea r3, [r1 * 3]
    ; prepare for [28, 25, 21, 18, 14, 11, 7, 4, 0, -1, -2....]
    movu m6, [r2]
    pshufb m6, [ang32_shuf_mode13 + mmsize*2]
    ; reuses mode 24's cross-lane permute table for the gathered samples
    mova m3, [ang32_shuf_mode24 + mmsize*1]
    vpermd m6, m3, m6
    palignr m6, m6, 1
    vbroadcasti128 m3, [r2 + mmsize*2 + 1]
    palignr m5, m3, m6, 1
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4
    palignr m5, m3, m6, 2
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4
    palignr m5, m3, m6, 3
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4
    palignr m5, m3, m6, 4
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4
    lea r0, [r0 + r1 * 4]
    palignr m5, m3, m6, 5
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4
    palignr m5, m3, m6, 6
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4
    palignr m5, m3, m6, 7
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4
    palignr m5, m3, m6, 8
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4
    lea r0, [r0 + r1 * 4]
    palignr m5, m3, m6, 9
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4
    palignr m5, m3, m6, 10
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4
    palignr m5, m3, m6, 11
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4
    palignr m5, m3, m6, 12
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4
    lea r0, [r0 + r1 * 4]
    palignr m5, m3, m6, 13
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4
    palignr m5, m3, m6, 14
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4
    palignr m5, m3, m6, 15
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4
    ; shift 16 == m3 unshifted
    pshufb m4, m3, m7
    pshufb m5, m3, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4
    lea r0, [r0 + r1 * 4]
    ; advance the reference window by 16 bytes for rows 16-31
    mova m6, m3
    vbroadcasti128 m3, [r2 + mmsize*2 + 17]
    palignr m5, m3, m6, 1
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4
    palignr m5, m3, m6, 2
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4
    palignr m5, m3, m6, 3
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4
    palignr m5, m3, m6, 4
    pshufb m4, m5, m7
    pshufb m5, m5, m8 ; 3-operand spelling; same effect as "pshufb m5, m8"
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4
    lea r0, [r0 + r1 * 4]
    palignr m5, m3, m6, 5
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4
    palignr m5, m3, m6, 6
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4
    palignr m5, m3, m6, 7
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4
    palignr m5, m3, m6, 8
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4
    lea r0, [r0 + r1 * 4]
    palignr m5, m3, m6, 9
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4
    palignr m5, m3, m6, 10
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4
    palignr m5, m3, m6, 11
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4
    palignr m5, m3, m6, 12
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4
    lea r0, [r0 + r1 * 4]
    palignr m5, m3, m6, 13
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4
    palignr m5, m3, m6, 14
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4
    palignr m5, m3, m6, 15
    pshufb m4, m5, m7
    pshufb m5, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4
    ; shift 16 == m3 unshifted (final row)
    pshufb m4, m3, m7
    pshufb m5, m3, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4
    RET
;-----------------------------------------------------------------------------
; intra_pred_ang32_23 -- 32x32 angular intra prediction, mode 23 (AVX2, 8bpp).
; In:   r0 = dst, r1 = dstStride, r2 = reference sample array
;       (above row at [r2], projected left samples gathered from
;        [r2 + mmsize*2] via ang32_shuf_mode23 -- layout assumed from these
;        accesses, TODO confirm against the caller).
; Regs: r3 = &ang_table_avx2[16*32]; [r3 + k*32] selects the coefficient
;       row for fraction (16 + k) -- "[NN]" notes give the fraction per row.
;       r4 = 3*dstStride; m5 = pw_1024 (pmulhrsw rounding).
; m0/m2 hold interleaved ref[i]/ref[i+1] pairs; m3 holds the projected
; left-column samples shifted in by palignr each time the integer offset
; decreases (every ~3-4 rows for this angle).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_23, 3,5,8
    lea r3, [ang_table_avx2 + 32 * 16]
    lea r4, [r1 * 3]
    mova m5, [pw_1024]
    ; rows 0 to 7
    movu m0, [r2 + 0]
    movu m1, [r2 + 1]
    punpckhbw m2, m0, m1
    punpcklbw m0, m1
    movu m4, [r2 + mmsize*2]
    pshufb m4, [ang32_shuf_mode23]
    vpermq m4, m4, q1313
    palignr m3, m0, m4, 1
    vinserti128 m3, m3, xm2, 1
    pmaddubsw m4, m0, [r3 + 7 * 32] ; [23]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 7 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m0, [r3 - 2 * 32] ; [14]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 2 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m0, [r3 - 11 * 32] ; [5]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 11 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4
    ; integer offset steps back one sample: shift in a left-column pixel
    palignr m6, m0, m3, 14
    palignr m7, m2, m0, 14
    pmaddubsw m4, m6, [r3 + 12 * 32] ; [28]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 12 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    pmaddubsw m4, m6, [r3 + 3 * 32] ; [19]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 3 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m6, [r3 - 6 * 32] ; [10]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 6 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m6, [r3 - 15 * 32] ; [1]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 15 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4
    palignr m6, m0, m3, 12
    palignr m7, m2, m0, 12
    pmaddubsw m4, m6, [r3 + 8 * 32] ; [24]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 8 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    ; rows 8 to 15
    pmaddubsw m4, m6, [r3 - 1 * 32] ; [15]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 1 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m6, [r3 - 10 * 32] ; [6]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 10 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    palignr m6, m0, m3, 10
    palignr m7, m2, m0, 10
    pmaddubsw m4, m6, [r3 + 13 * 32] ; [29]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 13 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4
    pmaddubsw m4, m6, [r3 + 4 * 32] ; [20]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 4 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    pmaddubsw m4, m6, [r3 - 5 * 32] ; [11]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 5 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m6, [r3 - 14 * 32] ; [2]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 14 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    palignr m6, m0, m3, 8
    palignr m7, m2, m0, 8
    pmaddubsw m4, m6, [r3 + 9 * 32] ; [25]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 9 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1 * 2], m4
    pmaddubsw m4, m6, [r3] ; [16]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    ; rows 16 to 23
    pmaddubsw m4, m6, [r3 - 9 * 32] ; [7]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 9 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    palignr m6, m0, m3, 6
    palignr m7, m2, m0, 6
    pmaddubsw m4, m6, [r3 + 14 * 32] ; [30]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 14 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m6, [r3 + 5 * 32] ; [21]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 5 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4
    pmaddubsw m4, m6, [r3 - 4 * 32] ; [12]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 4 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    pmaddubsw m4, m6, [r3 - 13 * 32] ; [3]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 13 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    palignr m6, m0, m3, 4
    palignr m7, m2, m0, 4
    pmaddubsw m4, m6, [r3 + 10 * 32] ; [26]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 10 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m6, [r3 + 1 * 32] ; [17]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 1 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4
    pmaddubsw m4, m6, [r3 - 8 * 32] ; [8]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 8 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    ; rows 24 to 31
    palignr m6, m0, m3, 2
    palignr m7, m2, m0, 2
    pmaddubsw m4, m6, [r3 + 15 * 32] ; [31]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 15 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m6, [r3 + 6 * 32] ; [22]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 + 6 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m6, [r3 - 3 * 32] ; [13]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 3 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1 * 2], m4
    pmaddubsw m4, m6, [r3 - 12 * 32] ; [4]
    pmulhrsw m4, m5
    pmaddubsw m1, m7, [r3 - 12 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4
    lea r0, [r0 + r1 * 4]
    ; final shift: use m3/m0 directly as the pixel-pair sources
    pmaddubsw m4, m3, [r3 + 11 * 32] ; [27]
    pmulhrsw m4, m5
    pmaddubsw m1, m0, [r3 + 11 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4
    pmaddubsw m4, m3, [r3 + 2 * 32] ; [18]
    pmulhrsw m4, m5
    pmaddubsw m1, m0, [r3 + 2 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4
    pmaddubsw m4, m3, [r3 - 7 * 32] ; [9]
    pmulhrsw m4, m5
    pmaddubsw m1, m0, [r3 - 7 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4
    ; last row has fraction 0: keep only the even (ref[i]) bytes of each
    ; interleaved pair, i.e. a straight copy of the shifted reference
    pand m3, [pw_00ff]
    pand m0, [pw_00ff]
    packuswb m3, m0
    movu [r0 + r4], m3
    RET
cglobal intra_pred_ang32_14, 3,4,9
movu m0, [ang32_fact_mode14]
movu m1, [ang32_fact_mode14 + mmsize]
mova m2, [pw_1024]
mova m7, [ang32_shuf_mode14]
mova m8, [ang32_shuf_mode14 + mmsize]
lea r3, [r1 * 3]
; prepare for [30, 27, 25, 22, 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2...]
movu m6, [r2]
pshufb m6, [ang32_shuf_mode14 + mmsize*2]
vpermq m6, m6, 01110111b
pslldq m6, m6, 1
vbroadcasti128 m3, [r2 + mmsize*2 + 1]
palignr m5, m3, m6, 1
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m5, m3, m6, 2
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m5, m3, m6, 3
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m5, m3, m6, 4
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m5, m3, m6, 5
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m5, m3, m6, 6
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m5, m3, m6, 7
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m5, m3, m6, 8
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m5, m3, m6, 9
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m5, m3, m6, 10
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m5, m3, m6, 11
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m5, m3, m6, 12
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m5, m3, m6, 13
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m5, m3, m6, 14
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m5, m3, m6, 15
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
pshufb m4, m3, m7
pshufb m5, m3, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
mova m6, m3
vbroadcasti128 m3, [r2 + mmsize*2 + 17]
palignr m5, m3, m6, 1
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m5, m3, m6, 2
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m5, m3, m6, 3
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m5, m3, m6, 4
pshufb m4, m5, m7
pshufb m5, m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m5, m3, m6, 5
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m5, m3, m6, 6
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m5, m3, m6, 7
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m5, m3, m6, 8
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m5, m3, m6, 9
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m5, m3, m6, 10
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m5, m3, m6, 11
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m5, m3, m6, 12
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m5, m3, m6, 13
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m5, m3, m6, 14
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m5, m3, m6, 15
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
pshufb m4, m3, m7
pshufb m5, m3, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
RET
;-----------------------------------------------------------------------------
; intra_pred_ang32_22 — HEVC angular intra prediction, mode 22, 32x32, 8bpp
; (mode number taken from the symbol name; confirm against the dispatch table)
; AVX2. Register roles as used below:
;   r0 = dst (all stores go through [r0 + k*r1]); r1 = dst stride;
;   r2 = reference-pixel buffer (all loads);      r3 = &ang_table_avx2[16*32];
;   r4 = 3*r1 for the fourth row of each 4-row group.
; Each output row i is the 2-tap interpolation
;   (ref[j]*(32-f) + ref[j+1]*f + 16) >> 5
; computed as pmaddubsw of interleaved (ref[j], ref[j+1]) byte pairs with a
; per-row (32-f, f) weight row from ang_table_avx2, then pmulhrsw by pw_1024
; (x*1024 + 2^14) >> 15 == (x + 16) >> 5.  The "; [f]" comments give f.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_22, 3,5,9
lea r3, [ang_table_avx2 + 32 * 16]
lea r4, [r1 * 3]
mova m5, [pw_1024] ; rounding-shift multiplier, see header
; rows 0 to 7
; m0/m2 = interleaved (ref[j], ref[j+1]) pairs for the low/high 16 lanes
movu m0, [r2 + 0]
movu m1, [r2 + 1]
punpckhbw m2, m0, m1
punpcklbw m0, m1
; splice projected samples from the second reference array ([r2 + 2*mmsize])
; in front of the main row for the negative-index part of this mode;
; m3/m8 hold the pre-shifted windows consumed by the palignr ladder below
movu m4, [r2 + mmsize*2 + 2]
pshufb m4, [ang32_shuf_mode22]
vextracti128 xm8, m4, 1
palignr m3, m0, m4, 2
palignr m3, m8, 15
vinserti128 m3, m3, xm2, 1
vinserti128 m8, m8, xm0, 1
pmaddubsw m4, m0, [r3 + 3 * 32] ; [19]
pmulhrsw m4, m5
pmaddubsw m1, m2, [r3 + 3 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
pmaddubsw m4, m0, [r3 - 10 * 32] ; [6]
pmulhrsw m4, m5
pmaddubsw m1, m2, [r3 - 10 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
palignr m6, m0, m3, 14
palignr m7, m2, m0, 14
pmaddubsw m4, m6, [r3 + 9 * 32] ; [25]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 9 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
pmaddubsw m4, m6, [r3 - 4 * 32] ; [12]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 4 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
palignr m6, m0, m3, 12
palignr m7, m2, m0, 12
pmaddubsw m4, m6, [r3 + 15 * 32] ; [31]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 15 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
pmaddubsw m4, m6, [r3 + 2 * 32] ; [18]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 2 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
pmaddubsw m4, m6, [r3 - 11 * 32] ; [5]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 11 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
palignr m6, m0, m3, 10
palignr m7, m2, m0, 10
pmaddubsw m4, m6, [r3 + 8 * 32] ; [24]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 8 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
; rows 8 to 15
pmaddubsw m4, m6, [r3 - 5 * 32] ; [11]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 5 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
palignr m6, m0, m3, 8
palignr m7, m2, m0, 8
pmaddubsw m4, m6, [r3 + 14 * 32] ; [30]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 14 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
pmaddubsw m4, m6, [r3 + 1 * 32] ; [17]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 1 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
pmaddubsw m4, m6, [r3 - 12 * 32] ; [4]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 12 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
palignr m6, m0, m3, 6
palignr m7, m2, m0, 6
pmaddubsw m4, m6, [r3 + 7 * 32] ; [23]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 7 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
pmaddubsw m4, m6, [r3 - 6 * 32] ; [10]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 6 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
palignr m6, m0, m3, 4
palignr m7, m2, m0, 4
pmaddubsw m4, m6, [r3 + 13 * 32] ; [29]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 13 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1 * 2], m4
pmaddubsw m4, m6, [r3] ; [16]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
; rows 16 to 23
pmaddubsw m4, m6, [r3 - 13 * 32] ; [3]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 13 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
palignr m6, m0, m3, 2
palignr m7, m2, m0, 2
pmaddubsw m4, m6, [r3 + 6 * 32] ; [22]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 6 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
pmaddubsw m4, m6, [r3 - 7 * 32] ; [9]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 7 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
pmaddubsw m4, m3, [r3 + 12 * 32] ; [28]
pmulhrsw m4, m5
pmaddubsw m1, m0, [r3 + 12 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
pmaddubsw m4, m3, [r3 - 1 * 32] ; [15]
pmulhrsw m4, m5
pmaddubsw m1, m0, [r3 - 1 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
pmaddubsw m4, m3, [r3 - 14 * 32] ; [2]
pmulhrsw m4, m5
pmaddubsw m1, m0, [r3 - 14 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
palignr m6, m3, m8, 14
palignr m7, m0, m3, 14
pmaddubsw m4, m6, [r3 + 5 * 32] ; [21]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 5 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
pmaddubsw m4, m6, [r3 - 8 * 32] ; [8]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 8 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
; rows 24 to 31
palignr m6, m3, m8, 12
palignr m7, m0, m3, 12
pmaddubsw m4, m6, [r3 + 11 * 32] ; [27]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 11 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
pmaddubsw m4, m6, [r3 - 2 * 32] ; [14]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 2 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
pmaddubsw m4, m6, [r3 - 15 * 32] ; [1]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 15 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1 * 2], m4
palignr m6, m3, m8, 10
palignr m7, m0, m3, 10
pmaddubsw m4, m6, [r3 + 4 * 32] ; [20]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 4 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
pmaddubsw m4, m6, [r3 - 9 * 32] ; [7]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 9 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
; m0/m3/m8 are dead after this point, so shift them in place
palignr m0, m3, 8
palignr m3, m8, 8
pmaddubsw m4, m3, [r3 + 10 * 32] ; [26]
pmulhrsw m4, m5
pmaddubsw m1, m0, [r3 + 10 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
pmaddubsw m4, m3, [r3 - 3 * 32] ; [13]
pmulhrsw m4, m5
pmaddubsw m1, m0, [r3 - 3 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
; last row has fraction 0: keep only the even (ref[j]) bytes of each pair,
; i.e. a straight copy of the integer-offset reference samples
pand m3, [pw_00ff]
pand m0, [pw_00ff]
packuswb m3, m0
movu [r0 + r4], m3
RET
;-----------------------------------------------------------------------------
; intra_pred_ang32_15 — HEVC angular intra prediction, mode 15, 32x32, 8bpp
; (mode number taken from the symbol name; confirm against the dispatch table)
; AVX2. Register roles as used below:
;   r0 = dst (stores); r1 = dst stride; r2 = reference pixels (loads);
;   r3 = 3*r1.
; Unlike the table-indexed modes, the per-lane weights here are constant
; vectors (ang32_fact_mode15 in m0/m1) and the per-row reference window is
; produced by a palignr slide over m6 (low half of the extended reference)
; and m3 (next 16 bytes), followed by pshufb with ang32_shuf_mode15 (m7/m8)
; to form (ref[j], ref[j+1]) pairs.  pmaddubsw + pmulhrsw(pw_1024) then
; compute (a*(32-f) + b*f + 16) >> 5 per pixel.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_15, 3,4,9
movu m0, [ang32_fact_mode15]
movu m1, [ang32_fact_mode15 + mmsize]
mova m2, [pw_1024] ; rounding-shift multiplier: pmulhrsw by 1024 == (x+16)>>5
mova m7, [ang32_shuf_mode15]
mova m8, [ang32_shuf_mode15 + mmsize]
lea r3, [r1 * 3]
; prepare for [30, 28, 26, 24, 23, 21, 19, 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2...]
; m6 = projected second-reference samples (negative indices) + start of the
;      main row; m3 = the following 16 reference bytes, replicated to both
;      lanes so palignr can slide a continuous 32-byte window
movu m6, [r2]
pshufb m6, [ang32_shuf_mode15 + mmsize*2]
vpermq m6, m6, 01110111b
movu xm3, [r2 + mmsize*2]
pinsrb xm3, [r2], 0 ; corner pixel replaces ref[0] of the second array
vpermq m3, m3, 01000100b
palignr m4, m3, m6, 2
pshufb m4, m7
pshufb m5, m6, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m4, m3, m6, 3
pshufb m4, m7
palignr m5, m3, m6, 1
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m4, m3, m6, 4
pshufb m4, m7
palignr m5, m3, m6, 2
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m4, m3, m6, 5
pshufb m4, m7
palignr m5, m3, m6, 3
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m4, m3, m6, 6
pshufb m4, m7
palignr m5, m3, m6, 4
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m4, m3, m6, 7
pshufb m4, m7
palignr m5, m3, m6, 5
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m4, m3, m6, 8
pshufb m4, m7
palignr m5, m3, m6, 6
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m4, m3, m6, 9
pshufb m4, m7
palignr m5, m3, m6, 7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m4, m3, m6, 10
pshufb m4, m7
palignr m5, m3, m6, 8
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m4, m3, m6, 11
pshufb m4, m7
palignr m5, m3, m6, 9
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m4, m3, m6, 12
pshufb m4, m7
palignr m5, m3, m6, 10
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m4, m3, m6, 13
pshufb m4, m7
palignr m5, m3, m6, 11
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m4, m3, m6, 14
pshufb m4, m7
palignr m5, m3, m6, 12
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m4, m3, m6, 15
pshufb m4, m7
palignr m5, m3, m6, 13
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
pshufb m4, m3, m7
palignr m5, m3, m6, 14
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
; window exhausted: advance the reference by 16 bytes (m6 <- m3, m3 <- next)
palignr m5, m3, m6, 15
mova m6, m3
vbroadcasti128 m3, [r2 + mmsize*2 + 16]
palignr m4, m3, m6, 1
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m4, m3, m6, 2
pshufb m4, m7
pshufb m5, m6, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m4, m3, m6, 3
pshufb m4, m7
palignr m5, m3, m6, 1
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m4, m3, m6, 4
pshufb m4, m7
palignr m5, m3, m6, 2
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m4, m3, m6, 5
pshufb m4, m7
palignr m5, m3, m6, 3
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m4, m3, m6, 6
pshufb m4, m7
palignr m5, m3, m6, 4
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m4, m3, m6, 7
pshufb m4, m7
palignr m5, m3, m6, 5
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m4, m3, m6, 8
pshufb m4, m7
palignr m5, m3, m6, 6
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m4, m3, m6, 9
pshufb m4, m7
palignr m5, m3, m6, 7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m4, m3, m6, 10
pshufb m4, m7
palignr m5, m3, m6, 8
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m4, m3, m6, 11
pshufb m4, m7
palignr m5, m3, m6, 9
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
palignr m4, m3, m6, 12
pshufb m4, m7
palignr m5, m3, m6, 10
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
palignr m4, m3, m6, 13
pshufb m4, m7
palignr m5, m3, m6, 11
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m4, m3, m6, 14
pshufb m4, m7
palignr m5, m3, m6, 12
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], m4
palignr m4, m3, m6, 15
pshufb m4, m7
palignr m5, m3, m6, 13
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1], m4
pshufb m4, m3, m7
palignr m5, m3, m6, 14
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], m4
; final row needs one byte past the m3 window; m6 is reused as the next chunk
palignr m5, m3, m6, 15
vbroadcasti128 m6, [r2 + mmsize*2 + 32]
palignr m4, m6, m3, 1
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r3], m4
RET
;-----------------------------------------------------------------------------
; intra_pred_ang32_21 — HEVC angular intra prediction, mode 21, 32x32, 8bpp
; (mode number taken from the symbol name; confirm against the dispatch table)
; AVX2. Register roles as used below:
;   r0 = dst (stores); r1 = dst stride; r2 = reference pixels (loads);
;   r3 = &ang_table_avx2[16*32]; r4 = 3*r1.
; Same scheme as intra_pred_ang32_22 above: m0/m2 hold interleaved
; (ref[j], ref[j+1]) byte pairs, each row does pmaddubsw with a per-row
; (32-f, f) weight row from ang_table_avx2 and pmulhrsw by pw_1024
; ((x + 16) >> 5); the "; [f]" comments give the fraction f.  m3/m8 are
; pre-shifted copies of the extended (negative-index) reference consumed by
; the palignr ladder.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_21, 3,5,9
lea r3, [ang_table_avx2 + 32 * 16]
lea r4, [r1 * 3]
mova m5, [pw_1024] ; rounding-shift multiplier, see header
; rows 0 to 7
movu m0, [r2 + 0]
movu m1, [r2 + 1]
punpckhbw m2, m0, m1
punpcklbw m0, m1
; splice projected second-reference samples ([r2 + 2*mmsize]) in front of
; the main row via ang32_shuf_mode21; table contents not visible here
movu m4, [r2 + mmsize*2]
pshufb m4, [ang32_shuf_mode21]
vextracti128 xm6, m4, 1
palignr m3, m0, m4, 1
palignr m8, m3, m6, 1
vinserti128 m3, m3, xm2, 1
vinserti128 m8, m8, xm0, 1
pmaddubsw m4, m0, [r3 - 1 * 32] ; [15]
pmulhrsw m4, m5
pmaddubsw m1, m2, [r3 - 1 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
palignr m6, m0, m3, 14
palignr m7, m2, m0, 14
pmaddubsw m4, m6, [r3 + 14 * 32] ; [30]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 14 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
pmaddubsw m4, m6, [r3 - 3 * 32] ; [13]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 3 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
palignr m6, m0, m3, 12
palignr m7, m2, m0, 12
pmaddubsw m4, m6, [r3 + 12 * 32] ; [28]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 12 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
pmaddubsw m4, m6, [r3 - 5 * 32] ; [11]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 5 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
palignr m6, m0, m3, 10
palignr m7, m2, m0, 10
pmaddubsw m4, m6, [r3 + 10 * 32] ; [26]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 10 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
pmaddubsw m4, m6, [r3 - 7 * 32] ; [9]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 7 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
palignr m6, m0, m3, 8
palignr m7, m2, m0, 8
pmaddubsw m4, m6, [r3 + 8 * 32] ; [24]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 8 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
; rows 8 to 15
pmaddubsw m4, m6, [r3 - 9 * 32] ; [7]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 9 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
palignr m6, m0, m3, 6
palignr m7, m2, m0, 6
pmaddubsw m4, m6, [r3 + 6 * 32] ; [22]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 6 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
pmaddubsw m4, m6, [r3 - 11 * 32] ; [5]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 11 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
palignr m6, m0, m3, 4
palignr m7, m2, m0, 4
pmaddubsw m4, m6, [r3 + 4 * 32] ; [20]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 4 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
pmaddubsw m4, m6, [r3 - 13 * 32] ; [3]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 13 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
palignr m6, m0, m3, 2
palignr m7, m2, m0, 2
pmaddubsw m4, m6, [r3 + 2 * 32] ; [18]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 2 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
pmaddubsw m4, m6, [r3 - 15 * 32] ; [1]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 15 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1 * 2], m4
pmaddubsw m4, m3, [r3] ; [16]
pmulhrsw m4, m5
pmaddubsw m1, m0, [r3]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
; rows 16 to 23
palignr m6, m3, m8, 14
palignr m7, m0, m3, 14
pmaddubsw m4, m6, [r3 + 15 * 32] ; [31]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 15 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
pmaddubsw m4, m6, [r3 - 2 * 32] ; [14]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 2 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
palignr m6, m3, m8, 12
palignr m7, m0, m3, 12
pmaddubsw m4, m6, [r3 + 13 * 32] ; [29]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 13 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
pmaddubsw m4, m6, [r3 - 4 * 32] ; [12]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 4 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
palignr m6, m3, m8, 10
palignr m7, m0, m3, 10
pmaddubsw m4, m6, [r3 + 11 * 32] ; [27]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 11 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
pmaddubsw m4, m6, [r3 - 6 * 32] ; [10]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 6 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
palignr m6, m3, m8, 8
palignr m7, m0, m3, 8
pmaddubsw m4, m6, [r3 + 9 * 32] ; [25]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 9 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
pmaddubsw m4, m6, [r3 - 8 * 32] ; [8]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 8 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
; rows 24 to 31
palignr m6, m3, m8, 6
palignr m7, m0, m3, 6
pmaddubsw m4, m6, [r3 + 7 * 32] ; [23]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 7 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
pmaddubsw m4, m6, [r3 - 10 * 32] ; [6]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 10 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
palignr m6, m3, m8, 4
palignr m7, m0, m3, 4
pmaddubsw m4, m6, [r3 + 5 * 32] ; [21]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 5 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1 * 2], m4
pmaddubsw m4, m6, [r3 - 12 * 32] ; [4]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 12 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
palignr m6, m3, m8, 2
palignr m7, m0, m3, 2
pmaddubsw m4, m6, [r3 + 3 * 32] ; [19]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 3 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
pmaddubsw m4, m6, [r3 - 14 * 32] ; [2]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 14 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
pmaddubsw m4, m8, [r3 + 1 * 32] ; [17]
pmulhrsw m4, m5
pmaddubsw m1, m3, [r3 + 1 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
; last row has fraction 0: keep only the even (ref[j]) bytes of each pair,
; i.e. a straight copy of the integer-offset reference samples
pand m8, [pw_00ff]
pand m3, [pw_00ff]
packuswb m8, m3
movu [r0 + r4], m8
RET
;-----------------------------------------------------------------------------
; intra_pred_ang32_16 — HEVC angular intra prediction, mode 16, 32x32, 8bpp
; (mode number taken from the symbol name; confirm against the dispatch table)
; AVX2. Register roles as used below:
;   r0 = dst (stores); r1 = dst stride; r2 = reference pixels (loads);
;   r3 = 3*r1.
; Same scheme as intra_pred_ang32_15: constant per-lane weight vectors
; (ang32_fact_mode16 in m0/m1), per-row reference windows from a palignr
; slide over m9/m6/m3 (consecutive 16-byte chunks of the extended reference),
; pair formation via pshufb with ang32_shuf_mode16 (m7/m8), then
; pmaddubsw + pmulhrsw(pw_1024) == (a*(32-f) + b*f + 16) >> 5.
; The two packuswb halves come out lane-swapped, hence the
; vpermq q3120 before every store.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_16, 3,4,10
movu m0, [ang32_fact_mode16]
movu m1, [ang32_fact_mode16 + mmsize]
mova m2, [pw_1024] ; rounding-shift multiplier: pmulhrsw by 1024 == (x+16)>>5
mova m7, [ang32_shuf_mode16]
mova m8, [ang32_shuf_mode16 + mmsize]
lea r3, [r1 * 3]
; prepare for [30, 29, 27, 26, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2...]
; m6 = projected samples + start of the main row; m9 = the chunk preceding
; m6 (for the most-negative offsets); m3 = the 16 bytes following m6
movu m6, [r2]
pshufb m6, [ang32_shuf_mode16 + mmsize*2]
mova m9, m6
mova m3, [ang32_shuf_mode16 + mmsize*3]
vpermd m6, m3, m6
vpermq m9, m9, q3232
pslldq m9, 4
palignr m6, m9, 15
pslldq m9, 1
vbroadcasti128 m3, [r2 + mmsize*2 + 1]
palignr m4, m3, m6, 1
palignr m5, m6, m9, 6
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120 ; undo packuswb's lane interleave before storing
movu [r0], m4
palignr m4, m3, m6, 2
palignr m5, m6, m9, 7
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4
palignr m4, m3, m6, 3
palignr m5, m6, m9, 8
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4
palignr m4, m3, m6, 4
palignr m5, m6, m9, 9
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m4, m3, m6, 5
palignr m5, m6, m9, 10
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4
palignr m4, m3, m6, 6
palignr m5, m6, m9, 11
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4
palignr m4, m3, m6, 7
palignr m5, m6, m9, 12
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4
palignr m4, m3, m6, 8
palignr m5, m6, m9, 13
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m4, m3, m6, 9
palignr m5, m6, m9, 14
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4
palignr m4, m3, m6, 10
palignr m5, m6, m9, 15
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4
palignr m4, m3, m6, 11
pshufb m4, m7
pshufb m5, m6, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4
palignr m4, m3, m6, 12
palignr m5, m3, m6, 1
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m4, m3, m6, 13
palignr m5, m3, m6, 2
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4
palignr m4, m3, m6, 14
palignr m5, m3, m6, 3
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4
palignr m4, m3, m6, 15
palignr m5, m3, m6, 4
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4
palignr m5, m3, m6, 5
pshufb m4, m3, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
; m9's window is exhausted: reuse it as the next 16 reference bytes
vbroadcasti128 m9, [r2 + mmsize*2 + 17]
palignr m4, m9, m3, 1
palignr m5, m3, m6, 6
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4
palignr m4, m9, m3, 2
palignr m5, m3, m6, 7
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4
palignr m4, m9, m3, 3
palignr m5, m3, m6, 8
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4
palignr m4, m9, m3, 4
palignr m5, m3, m6, 9
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m4, m9, m3, 5
palignr m5, m3, m6, 10
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4
palignr m4, m9, m3, 6
palignr m5, m3, m6, 11
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4
palignr m4, m9, m3, 7
palignr m5, m3, m6, 12
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4
palignr m4, m9, m3, 8
palignr m5, m3, m6, 13
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m4, m9, m3, 9
palignr m5, m3, m6, 14
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4
palignr m4, m9, m3, 10
palignr m5, m3, m6, 15
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4
palignr m4, m9, m3, 11
pshufb m4, m7
pshufb m5, m3, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4
palignr m4, m9, m3, 12
palignr m5, m9, m3, 1
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
palignr m4, m9, m3, 13
palignr m5, m9, m3, 2
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4
palignr m4, m9, m3, 14
palignr m5, m9, m3, 3
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4
palignr m4, m9, m3, 15
palignr m5, m9, m3, 4
pshufb m4, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4
palignr m5, m9, m3, 5
pshufb m4, m9, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4
RET
;-----------------------------------------------------------------------------
; intra_pred_ang32_20 — HEVC angular intra prediction, mode 20, 32x32, 8bpp
; (mode number taken from the symbol name; confirm against the dispatch table)
; AVX2. Register roles as used below:
;   r0 = dst (stores); r1 = dst stride; r2 = reference pixels (loads);
;   r3 = &ang_table_avx2[16*32]; r4 = 3*r1.
; Same scheme as intra_pred_ang32_22/_21 above: m0/m2 hold interleaved
; (ref[j], ref[j+1]) byte pairs; each row does pmaddubsw with a per-row
; (32-f, f) weight row from ang_table_avx2 and pmulhrsw by pw_1024
; ((x + 16) >> 5); the "; [f]" comments give the fraction f.  m3/m8/m9 are
; progressively earlier pre-shifted copies of the extended (negative-index)
; reference consumed by the palignr ladder — this mode's shallower angle
; needs a third chunk (m9).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_20, 3,5,10
lea r3, [ang_table_avx2 + 32 * 16]
lea r4, [r1 * 3]
mova m5, [pw_1024] ; rounding-shift multiplier, see header
; rows 0 to 7
movu m0, [r2 + 0]
movu m1, [r2 + 1]
punpckhbw m2, m0, m1
punpcklbw m0, m1
; splice projected second-reference samples ([r2 + 2*mmsize]) in front of
; the main row via the ang32_shuf_mode20 tables (contents not visible here)
movu m4, [r2 + mmsize*2]
pshufb m4, [ang32_shuf_mode20]
mova m9, m4
vpermq m9, m9, q3333
mova m7, m4
vpermq m7, m7, q1111
palignr m4, m7, 14
pshufb m4, [ang32_shuf_mode20 + mmsize*1]
vextracti128 xm6, m4, 1
palignr m3, m0, m4, 1
palignr m8, m3, m6, 1
vinserti128 m3, m3, xm2, 1
vinserti128 m8, m8, xm0, 1
vinserti128 m9, m9, xm3, 1
pmaddubsw m4, m0, [r3 - 5 * 32] ; [11]
pmulhrsw m4, m5
pmaddubsw m1, m2, [r3 - 5 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
palignr m6, m0, m3, 14
palignr m7, m2, m0, 14
pmaddubsw m4, m6, [r3 + 6 * 32] ; [22]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 6 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
pmaddubsw m4, m6, [r3 - 15 * 32] ; [1]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 15 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
palignr m6, m0, m3, 12
palignr m7, m2, m0, 12
pmaddubsw m4, m6, [r3 - 4 * 32] ; [12]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 4 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
palignr m6, m0, m3, 10
palignr m7, m2, m0, 10
pmaddubsw m4, m6, [r3 + 7 * 32] ; [23]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 7 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
pmaddubsw m4, m6, [r3 - 14 * 32] ; [2]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 14 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
palignr m6, m0, m3, 8
palignr m7, m2, m0, 8
pmaddubsw m4, m6, [r3 - 3 * 32] ; [13]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 3 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
palignr m6, m0, m3, 6
palignr m7, m2, m0, 6
pmaddubsw m4, m6, [r3 + 8 * 32] ; [24]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 8 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
; rows 8 to 15
pmaddubsw m4, m6, [r3 - 13 * 32] ; [3]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 13 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
palignr m6, m0, m3, 4
palignr m7, m2, m0, 4
pmaddubsw m4, m6, [r3 - 2 * 32] ; [14]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 2 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
palignr m6, m0, m3, 2
palignr m7, m2, m0, 2
pmaddubsw m4, m6, [r3 + 9 * 32] ; [25]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 9 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
pmaddubsw m4, m6, [r3 - 12 * 32] ; [4]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 12 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
pmaddubsw m4, m3, [r3 - 1 * 32] ; [15]
pmulhrsw m4, m5
pmaddubsw m1, m0, [r3 - 1 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
palignr m6, m3, m8, 14
palignr m7, m0, m3, 14
pmaddubsw m4, m6, [r3 + 10 * 32] ; [26]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 10 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
pmaddubsw m4, m6, [r3 - 11 * 32] ; [5]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 11 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1 * 2], m4
palignr m6, m3, m8, 12
palignr m7, m0, m3, 12
pmaddubsw m4, m6, [r3] ; [16]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
; rows 16 to 23
palignr m6, m3, m8, 10
palignr m7, m0, m3, 10
pmaddubsw m4, m6, [r3 + 11 * 32] ; [27]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 11 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
pmaddubsw m4, m6, [r3 - 10 * 32] ; [6]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 10 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
palignr m6, m3, m8, 8
palignr m7, m0, m3, 8
pmaddubsw m4, m6, [r3 + 1 * 32] ; [17]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 1 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
palignr m6, m3, m8, 6
palignr m7, m0, m3, 6
pmaddubsw m4, m6, [r3 + 12 * 32] ; [28]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 12 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
pmaddubsw m4, m6, [r3 - 9 * 32] ; [7]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 9 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
palignr m6, m3, m8, 4
palignr m7, m0, m3, 4
pmaddubsw m4, m6, [r3 + 2 * 32] ; [18]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 2 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
palignr m6, m3, m8, 2
palignr m7, m0, m3, 2
pmaddubsw m4, m6, [r3 + 13 * 32] ; [29]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 13 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
pmaddubsw m4, m6, [r3 - 8 * 32] ; [8]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 8 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
; rows 24 to 31
pmaddubsw m4, m8, [r3 + 3 * 32] ; [19]
pmulhrsw m4, m5
pmaddubsw m1, m3, [r3 + 3 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
palignr m6, m8, m9, 14
palignr m7, m3, m8, 14
pmaddubsw m4, m6, [r3 + 14 * 32] ; [30]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 14 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
pmaddubsw m4, m6, [r3 - 7 * 32] ; [9]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 7 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1 * 2], m4
palignr m6, m8, m9, 12
palignr m7, m3, m8, 12
pmaddubsw m4, m6, [r3 + 4 * 32] ; [20]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 4 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
palignr m6, m8, m9, 10
palignr m7, m3, m8, 10
pmaddubsw m4, m6, [r3 + 15 * 32] ; [31]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 15 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
pmaddubsw m4, m6, [r3 - 6 * 32] ; [10]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 6 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
palignr m6, m8, m9, 8
palignr m7, m3, m8, 8
pmaddubsw m4, m6, [r3 + 5 * 32] ; [21]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 5 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
; last row has fraction 0: keep only the even (ref[j]) bytes of each pair,
; i.e. a straight copy of the integer-offset reference samples
pand m6, [pw_00ff]
pand m7, [pw_00ff]
packuswb m6, m7
movu [r0 + r4], m6
RET
;-----------------------------------------------------------------------------------------
; void intra_pred_ang32_17(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
; 32x32 angular intra prediction, mode 17, AVX2 (8-bit pixels).
; In:   r0 = dst, r1 = dstStride, r2 = reference pixel array
;       (bytes at r2 and at r2 + 2*mmsize are combined below - presumably the
;       top-row and left-column halves of the reference buffer; confirm against caller)
; Regs: m0 = per-pixel weight pairs (ang32_fact_mode17)
;       m2 = pw_1024 rounding constant - pmulhrsw by 1024 computes (x + 16) >> 5
;       m7 = gather shuffle that expands a reference window into adjacent byte pairs
;       m6/m1/m3 = sliding reference windows advanced with palignr row by row
; Each 11-instruction group below emits one 32-pixel row:
;   palignr  - select the per-row shifted reference window
;   pshufb   - expand to (ref[i], ref[i+1]) byte pairs
;   pmaddubsw/pmulhrsw - weighted 2-tap interpolation with rounding
;   packuswb + vpermq q3120 - repack lanes into row order before the store
;-----------------------------------------------------------------------------------------
cglobal intra_pred_ang32_17, 3,4,8
movu m0, [ang32_fact_mode17]
mova m2, [pw_1024]
mova m7, [ang32_shuf_mode17]
lea r3, [r1 * 3]
; prepare for [31, 30, 28, 27, 26, 25, 23, 22, 21, 20, 18, 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2...]
movu m6, [r2]
pshufb m6, [ang32_shuf_mode17 + mmsize]
mova m1, m6
mova m3, [ang32_shuf_mode16 + mmsize*3]
vpermd m6, m3, m6
vpermq m1, m1, q3232
pslldq m1, 4
movu xm4, [r2 + mmsize*2]
pinsrb xm4, [r2], 0
vinserti128 m3, m4, xm4, 1
; rows 0..3
palignr m4, m3, m6, 2
palignr m5, m6, m1, 5
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4
palignr m4, m3, m6, 3
palignr m5, m6, m1, 6
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4
palignr m4, m3, m6, 4
palignr m5, m6, m1, 7
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4
palignr m4, m3, m6, 5
palignr m5, m6, m1, 8
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
; rows 4..7
palignr m4, m3, m6, 6
palignr m5, m6, m1, 9
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4
palignr m4, m3, m6, 7
palignr m5, m6, m1, 10
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4
palignr m4, m3, m6, 8
palignr m5, m6, m1, 11
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4
palignr m4, m3, m6, 9
palignr m5, m6, m1, 12
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
; rows 8..11
palignr m4, m3, m6, 10
palignr m5, m6, m1, 13
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4
palignr m4, m3, m6, 11
palignr m5, m6, m1, 14
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4
palignr m4, m3, m6, 12
palignr m5, m6, m1, 15
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4
; lower half window is m6 itself here (offset 16 == whole register)
palignr m4, m3, m6, 13
pshufb m4, m7
pshufb m5, m6, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
; rows 12..15 - the low-half window now slides over m3:m6
palignr m4, m3, m6, 14
palignr m5, m3, m6, 1
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4
palignr m4, m3, m6, 15
palignr m5, m3, m6, 2
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4
palignr m5, m3, m6, 3
pshufb m4, m3, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4
; bring in the next 16 reference pixels for the upper-half window
vbroadcasti128 m1, [r2 + mmsize*2 + 16]
palignr m4, m1, m3, 1
palignr m5, m3, m6, 4
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
; rows 16..19
palignr m4, m1, m3, 2
palignr m5, m3, m6, 5
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4
palignr m4, m1, m3, 3
palignr m5, m3, m6, 6
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4
palignr m4, m1, m3, 4
palignr m5, m3, m6, 7
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4
palignr m4, m1, m3, 5
palignr m5, m3, m6, 8
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
; rows 20..23
palignr m4, m1, m3, 6
palignr m5, m3, m6, 9
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4
palignr m4, m1, m3, 7
palignr m5, m3, m6, 10
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4
palignr m4, m1, m3, 8
palignr m5, m3, m6, 11
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4
palignr m4, m1, m3, 9
palignr m5, m3, m6, 12
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
; rows 24..27
palignr m4, m1, m3, 10
palignr m5, m3, m6, 13
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4
palignr m4, m1, m3, 11
palignr m5, m3, m6, 14
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4
palignr m4, m1, m3, 12
palignr m5, m3, m6, 15
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4
palignr m4, m1, m3, 13
pshufb m4, m7
pshufb m5, m3, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4
lea r0, [r0 + r1 * 4]
; rows 28..31 - the low-half window slides over m1:m3
palignr m4, m1, m3, 14
palignr m5, m1, m3, 1
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4
palignr m4, m1, m3, 15
palignr m5, m1, m3, 2
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4
; final 16 reference pixels for the last two rows
vbroadcasti128 m6, [r2 + mmsize*2 + mmsize]
palignr m5, m1, m3, 3
pshufb m4, m1, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4
palignr m4, m6, m1, 1
palignr m5, m1, m3, 4
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4
RET
;-----------------------------------------------------------------------------------------
; void intra_pred_ang32_19(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
; 32x32 angular intra prediction, mode 19, AVX2 (8-bit pixels).
; In:   r0 = dst, r1 = dstStride, r2 = reference pixel array
; Regs: r3 = centre of the AVX2 angle weight table (ang_table_avx2 + 32*16), so
;       [r3 + k*32] is the weight row for fraction 16+k; the bracketed numbers
;       in the comments below name the fraction used for that output row.
;       r4 = 3*dstStride; m5 = pw_1024 (pmulhrsw rounding, (x + 16) >> 5).
;       m0/m2 = interleaved (ref[i], ref[i+1]) byte pairs of the main reference;
;       m3/m8/m9 = progressively shifted extension windows (built from the
;       second reference half at r2 + 2*mmsize) that palignr slides through
;       as the prediction walks away from the reference edge.
; Each pmaddubsw/pmulhrsw/packuswb group emits one 32-pixel output row.
;-----------------------------------------------------------------------------------------
cglobal intra_pred_ang32_19, 3,5,10
lea r3, [ang_table_avx2 + 32 * 16]
lea r4, [r1 * 3]
mova m5, [pw_1024]
; rows 0 to 7
movu m0, [r2 + 0]
movu m1, [r2 + 1]
punpckhbw m2, m0, m1
punpcklbw m0, m1
; build the projected-left extension windows m3/m8/m9
movu m4, [r2 + mmsize*2]
pshufb m4, [ang32_shuf_mode17 + mmsize*1]
mova m3, [ang32_shuf_mode19 + mmsize*1]
mova m6, [ang32_shuf_mode19 + mmsize*2]
mova m9, m4
vpermd m4, m3, m4
vpermd m9, m6, m9
pshufb m4, [ang32_shuf_mode19]
pshufb m9, [ang32_shuf_mode19]
vextracti128 xm6, m4, 1
palignr m3, m0, m4, 1
palignr m8, m3, m6, 1
palignr m7, m8, m9, 1
vinserti128 m3, m3, xm2, 1
vinserti128 m8, m8, xm0, 1
vinserti128 m9, m7, xm3, 1
pmaddubsw m4, m0, [r3 - 10 * 32] ; [6]
pmulhrsw m4, m5
pmaddubsw m1, m2, [r3 - 10 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
palignr m6, m0, m3, 14
palignr m7, m2, m0, 14
pmaddubsw m4, m6, [r3 - 4 * 32] ; [12]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 4 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
palignr m6, m0, m3, 12
palignr m7, m2, m0, 12
pmaddubsw m4, m6, [r3 + 2 * 32] ; [18]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 2 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
palignr m6, m0, m3, 10
palignr m7, m2, m0, 10
pmaddubsw m4, m6, [r3 + 8 * 32] ; [24]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 8 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
palignr m6, m0, m3, 8
palignr m7, m2, m0, 8
pmaddubsw m4, m6, [r3 + 14 * 32] ; [30]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 14 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
pmaddubsw m4, m6, [r3 - 12 * 32] ; [4]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 12 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
palignr m6, m0, m3, 6
palignr m7, m2, m0, 6
pmaddubsw m4, m6, [r3 - 6 * 32] ; [10]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 6 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
palignr m6, m0, m3, 4
palignr m7, m2, m0, 4
pmaddubsw m4, m6, [r3] ; [16]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
; rows 8 to 15
palignr m6, m0, m3, 2
palignr m7, m2, m0, 2
pmaddubsw m4, m6, [r3 + 6 * 32] ; [22]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 6 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
pmaddubsw m4, m3, [r3 + 12 * 32] ; [28]
pmulhrsw m4, m5
pmaddubsw m1, m0, [r3 + 12 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
pmaddubsw m4, m3, [r3 - 14 * 32] ; [2]
pmulhrsw m4, m5
pmaddubsw m1, m0, [r3 - 14 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
palignr m6, m3, m8, 14
palignr m7, m0, m3, 14
pmaddubsw m4, m6, [r3 - 8 * 32] ; [8]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 8 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
palignr m6, m3, m8, 12
palignr m7, m0, m3, 12
pmaddubsw m4, m6, [r3 - 2 * 32] ; [14]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 2 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
palignr m6, m3, m8, 10
palignr m7, m0, m3, 10
pmaddubsw m4, m6, [r3 + 4 * 32] ; [20]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 4 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
palignr m6, m3, m8, 8
palignr m7, m0, m3, 8
pmaddubsw m4, m6, [r3 + 10 * 32] ; [26]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 10 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1 * 2], m4
; whole-sample row (fraction 0): keep the even bytes of each pair, no interpolation
pand m6, [pw_00ff]
pand m7, [pw_00ff]
packuswb m6, m7
movu [r0 + r4], m6
lea r0, [r0 + r1 * 4]
; rows 16 to 23
palignr m6, m3, m8, 6
palignr m7, m0, m3, 6
pmaddubsw m4, m6, [r3 - 10 * 32] ; [6]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 10 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
palignr m6, m3, m8, 4
palignr m7, m0, m3, 4
pmaddubsw m4, m6, [r3 - 4 * 32] ; [12]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 4 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
palignr m6, m3, m8, 2
palignr m7, m0, m3, 2
pmaddubsw m4, m6, [r3 + 2 * 32] ; [18]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 2 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
pmaddubsw m4, m8, [r3 + 8 * 32] ; [24]
pmulhrsw m4, m5
pmaddubsw m1, m3, [r3 + 8 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
palignr m6, m8, m9, 14
palignr m7, m3, m8, 14
pmaddubsw m4, m6, [r3 + 14 * 32] ; [30]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 14 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
pmaddubsw m4, m6, [r3 - 12 * 32] ; [4]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 12 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
palignr m6, m8, m9, 12
palignr m7, m3, m8, 12
pmaddubsw m4, m6, [r3 - 6 * 32] ; [10]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 6 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
palignr m6, m8, m9, 10
palignr m7, m3, m8, 10
pmaddubsw m4, m6, [r3] ; [16]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
; rows 24 to 31
palignr m6, m8, m9, 8
palignr m7, m3, m8, 8
pmaddubsw m4, m6, [r3 + 6 * 32] ; [22]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 6 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
palignr m6, m8, m9, 6
palignr m7, m3, m8, 6
pmaddubsw m4, m6, [r3 + 12 * 32] ; [28]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 12 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
pmaddubsw m4, m6, [r3 - 14 * 32] ; [2]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 14 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4
palignr m6, m8, m9, 4
palignr m7, m3, m8, 4
pmaddubsw m4, m6, [r3 - 8 * 32] ; [8]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 8 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4
lea r0, [r0 + r1 * 4]
; extend the window one more byte with the last reference pixel
vpbroadcastb m0, [r2 + mmsize*2 + 31]
palignr m1, m9, m0, 1
vinserti128 m0, m1, xm8, 1
palignr m6, m8, m9, 2
palignr m7, m3, m8, 2
pmaddubsw m4, m6, [r3 - 2 * 32] ; [14]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 2 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4
pmaddubsw m4, m9, [r3 + 4 * 32] ; [20]
pmulhrsw m4, m5
pmaddubsw m1, m8, [r3 + 4 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4
palignr m6, m9, m0, 14
palignr m7, m8, m9, 14
pmaddubsw m4, m6, [r3 + 10 * 32] ; [26]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 10 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1 * 2], m4
; whole-sample final row (fraction 0): even bytes only
pand m6, [pw_00ff]
pand m7, [pw_00ff]
packuswb m6, m7
movu [r0 + r4], m6
RET
%endif ; ARCH_X86_64
;-----------------------------------------------------------------------------------------
; end of intra_pred_ang32 angular modes avx2 asm
;-----------------------------------------------------------------------------------------
;-----------------------------------------------------------------------------------------
; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 3, AVX2.
; r0 = dst, r1 = dstStride, r2 = reference array (reads 16 bytes at r2 + 17,
; presumably the left-neighbour half of the reference buffer - confirm caller).
; The c_ang8_src* shuffles gather the (ref[i], ref[i+1]) pairs each output
; pixel interpolates between; pmaddubsw applies the per-row weight pairs and
; pmulhrsw with pw_1024 rounds: (a*w0 + b*w1 + 16) >> 5.
; Rows are computed column-wise, so the 8x8 result is transposed via the
; punpck network + trans8_shuf before being stored.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_3, 3,4,5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 17]
pshufb m1, m0, [c_ang8_src1_9_2_10]
pshufb m2, m0, [c_ang8_src3_11_4_12]
pshufb m4, m0, [c_ang8_src5_13_5_13]
pshufb m0, [c_ang8_src6_14_7_15]
pmaddubsw m1, [c_ang8_26_20]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_14_8]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_2_28]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_22_16]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; transpose the 8x8 block before storing
vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4
; store eight 8-byte rows
lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 33, AVX2.
; Mirror of mode 3: identical shuffles and weight tables, but the reference
; is read from r2 + 1 instead of r2 + 17 and the result is stored directly
; (no 8x8 transpose is needed for this direction).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_33, 3,4,5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 1]
pshufb m1, m0, [c_ang8_src1_9_2_10]
pshufb m2, m0, [c_ang8_src3_11_4_12]
pshufb m4, m0, [c_ang8_src5_13_5_13]
pshufb m0, [c_ang8_src6_14_7_15]
pmaddubsw m1, [c_ang8_26_20]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_14_8]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_2_28]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_22_16]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; store eight 8-byte rows (row pairs are split across the two 128-bit lanes)
lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 4, AVX2.
; Same scheme as mode 3 (reference at r2 + 17, 2-tap pmaddubsw/pmulhrsw
; interpolation, transpose via trans8_shuf) with mode-4 source shuffles and
; weight tables.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_4, 3,4,5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 17]
pshufb m1, m0, [c_ang8_src1_9_2_10]
pshufb m2, m0, [c_ang8_src2_10_3_11]
pshufb m4, m0, [c_ang8_src4_12_4_12]
pshufb m0, [c_ang8_src5_13_6_14]
pmaddubsw m1, [c_ang8_21_10]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_31_20]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_9_30]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_19_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; transpose the 8x8 block before storing
vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4
; store eight 8-byte rows
lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 32, AVX2.
; Mirror of mode 4: same shuffles/weights, reference read from r2 + 1 and
; result stored without the 8x8 transpose.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_32, 3,4,5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 1]
pshufb m1, m0, [c_ang8_src1_9_2_10]
pshufb m2, m0, [c_ang8_src2_10_3_11]
pshufb m4, m0, [c_ang8_src4_12_4_12]
pshufb m0, [c_ang8_src5_13_6_14]
pmaddubsw m1, [c_ang8_21_10]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_31_20]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_9_30]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_19_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; store eight 8-byte rows (row pairs are split across the two 128-bit lanes)
lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 5, AVX2.
; Same scheme as mode 3 (reference at r2 + 17, 2-tap interpolation with
; pw_1024 rounding, transpose via trans8_shuf) with mode-5 shuffles/weights.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_5, 3, 4, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 17]
pshufb m1, m0, [c_ang8_src1_9_2_10]
pshufb m2, m0, [c_ang8_src2_10_3_11]
pshufb m4, m0, [c_ang8_src3_11_4_12]
pshufb m0, [c_ang8_src4_12_5_13]
pmaddubsw m1, [c_ang8_17_2]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_19_4]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_21_6]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_23_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; transpose the 8x8 block before storing
vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4
; store eight 8-byte rows
lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 31, AVX2.
; Mirror of mode 5: same shuffles/weights, reference read from r2 + 1 and
; result stored without the 8x8 transpose.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_31, 3, 4, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 1]
pshufb m1, m0, [c_ang8_src1_9_2_10]
pshufb m2, m0, [c_ang8_src2_10_3_11]
pshufb m4, m0, [c_ang8_src3_11_4_12]
pshufb m0, [c_ang8_src4_12_5_13]
pmaddubsw m1, [c_ang8_17_2]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_19_4]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_21_6]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_23_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; store eight 8-byte rows (row pairs are split across the two 128-bit lanes)
lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 6, AVX2.
; Same scheme as mode 3 (reference at r2 + 17, 2-tap interpolation with
; pw_1024 rounding, transpose via trans8_shuf) with mode-6 shuffles/weights.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_6, 3, 4, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 17]
pshufb m1, m0, [intra_pred_shuff_0_8]
pshufb m2, m0, [c_ang8_src2_10_2_10]
pshufb m4, m0, [c_ang8_src3_11_3_11]
pshufb m0, [c_ang8_src3_11_4_12]
pmaddubsw m1, [c_ang8_13_26]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_7_20]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_1_14]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_27_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; transpose the 8x8 block before storing
vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4
; store eight 8-byte rows
lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 30, AVX2.
; Mirror of mode 6: same shuffles/weights, reference read from r2 + 1 and
; result stored without the 8x8 transpose.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_30, 3, 4, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 1]
pshufb m1, m0, [intra_pred_shuff_0_8]
pshufb m2, m0, [c_ang8_src2_10_2_10]
pshufb m4, m0, [c_ang8_src3_11_3_11]
pshufb m0, [c_ang8_src3_11_4_12]
pmaddubsw m1, [c_ang8_13_26]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_7_20]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_1_14]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_27_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; store eight 8-byte rows (row pairs are split across the two 128-bit lanes)
lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 9, AVX2.
; All eight rows interpolate between the same neighbouring reference pairs
; (intra_pred_shuff_0_8), so only the weight tables change per row pair -
; the four weight vectors live consecutively at c_ang8_mode_27 (shared with
; mode 27 below).  Reference at r2 + 17; transposed store via trans8_shuf.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_9, 3, 5, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 17]
pshufb m0, [intra_pred_shuff_0_8]
lea r4, [c_ang8_mode_27]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; transpose the 8x8 block before storing
vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4
; store eight 8-byte rows
lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 27, AVX2.
; Mirror of mode 9: same c_ang8_mode_27 weight tables, reference read from
; r2 + 1 and result stored without the 8x8 transpose.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_27, 3, 5, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 1]
pshufb m0, [intra_pred_shuff_0_8]
lea r4, [c_ang8_mode_27]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; store eight 8-byte rows (row pairs are split across the two 128-bit lanes)
lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 25, AVX2.
; Same single-shuffle scheme as mode 27 but with the c_ang8_mode_25 weight
; tables; the reference is read from r2 + 0 (includes the corner pixel).
; Result stored directly, no transpose.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_25, 3, 5, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred_shuff_0_8]
lea r4, [c_ang8_mode_25]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; store eight 8-byte rows (row pairs are split across the two 128-bit lanes)
lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 7, AVX2.
; Same scheme as mode 3 (reference at r2 + 17, 2-tap interpolation with
; pw_1024 rounding, transpose via trans8_shuf) with mode-7 shuffles/weights.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_7, 3, 4, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 17]
pshufb m1, m0, [intra_pred_shuff_0_8]
pshufb m2, m0, [c_ang8_src1_9_2_10]
pshufb m4, m0, [c_ang8_src2_10_2_10]
pshufb m0, [c_ang8_src2_10_3_11]
pmaddubsw m1, [c_ang8_9_18]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_27_4]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_13_22]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_31_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; transpose the 8x8 block before storing
vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4
; store eight 8-byte rows
lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 29, AVX2.
; Mirror of mode 7: same shuffles/weights, reference read from r2 + 1 and
; result stored without the 8x8 transpose.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_29, 3, 4, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 1]
pshufb m1, m0, [intra_pred_shuff_0_8]
pshufb m2, m0, [c_ang8_src1_9_2_10]
pshufb m4, m0, [c_ang8_src2_10_2_10]
pshufb m0, [c_ang8_src2_10_3_11]
pmaddubsw m1, [c_ang8_9_18]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_27_4]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_13_22]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_31_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; store eight 8-byte rows (row pairs are split across the two 128-bit lanes)
lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 8, AVX2.
; Rows 0-5 all interpolate between the same neighbouring pairs
; (intra_pred_shuff_0_8, kept in m5 and applied three times with different
; weight tables); only the last row pair needs the shifted pair set
; c_ang8_src2_10_2_10.  Reference at r2 + 17; transposed store.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_8, 3, 4, 6
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 17]
mova m5, [intra_pred_shuff_0_8]
pshufb m1, m0, m5
pshufb m2, m0, m5
pshufb m4, m0, m5
pshufb m0, [c_ang8_src2_10_2_10]
pmaddubsw m1, [c_ang8_5_10]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_15_20]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_25_30]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_3_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; transpose the 8x8 block before storing
vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4
; store eight 8-byte rows
lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 28, AVX2.
; Mirror of mode 8: same shuffles/weights, reference read from r2 + 1 and
; result stored without the 8x8 transpose.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_28, 3, 4, 6
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 1]
mova m5, [intra_pred_shuff_0_8]
pshufb m1, m0, m5
pshufb m2, m0, m5
pshufb m4, m0, m5
pshufb m0, [c_ang8_src2_10_2_10]
pmaddubsw m1, [c_ang8_5_10]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_15_20]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_25_30]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_3_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; store eight 8-byte rows (row pairs are split across the two 128-bit lanes)
lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 11, AVX2.
; Loads the reference at r2 + 16 but patches byte 0 with the corner pixel
; [r2] (pinsrb) before the pair shuffle; uses the c_ang8_mode_25 weight
; tables and a transposed store (mode 25's mirror).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_11, 3, 5, 5
mova m3, [pw_1024]
movu xm1, [r2 + 16]
pinsrb xm1, [r2], 0 ; replace first byte with the corner reference pixel
pshufb xm1, [intra_pred_shuff_0_8]
vinserti128 m0, m1, xm1, 1
lea r4, [c_ang8_mode_25]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; transpose the 8x8 block before storing
vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4
; store eight 8-byte rows
lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 15, AVX2.
; The reference window must shift by one pixel every two rows: xm5 holds the
; current window, and each pslldq/pinsrb step prepends the next projected
; reference pixel ([r2+2], [r2+4], ...).  Each ymm then pairs two successive
; windows (vinserti128), is expanded to byte pairs (intra_pred_shuff_0_8)
; and weighted with the c_ang8_mode_15 tables; transposed store.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_15, 3, 6, 6
mova m3, [pw_1024]
movu xm5, [r2 + 16]
pinsrb xm5, [r2], 0 ; patch in the corner reference pixel
lea r5, [intra_pred_shuff_0_8]
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 2], 0 ; prepend next projected reference pixel
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]
lea r4, [c_ang8_mode_15]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 4], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 6], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 8], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; transpose the 8x8 block before storing
vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4
; store eight 8-byte rows
lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 16, AVX2.
; Rows are written bottom-up: r0 is advanced to the last row and r1 negated,
; so each store group walks back toward the first row.
; The mixed top/left reference is assembled in xm3 (left column reversed,
; selected top pixels appended via c_ang8_mode_16), then each 4-row batch
; gathers its two operand sets with intra_pred8_shuff16, interleaves them
; (punpck), and does the 2-tap pmaddubsw/pmulhrsw interpolation against the
; angHor8_tab_16 factors; psrldq advances the window between batches.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_16, 3,4,7
lea r0, [r0 + r1 * 8]
sub r0, r1
neg r1 ; write rows bottom-up
lea r3, [r1 * 3]
vbroadcasti128 m0, [angHor8_tab_16] ; m0 = factor
mova m1, [intra_pred8_shuff16] ; m1 = 4 of Row shuffle
movu m2, [intra_pred8_shuff16 + 8] ; m2 = 4 of Row shuffle
; prepare reference pixel
movq xm3, [r2 + 16 + 1] ; m3 = [-1 -2 -3 -4 -5 -6 -7 -8 x x x x x x x x]
movhps xm3, [r2 + 2] ; m3 = [-1 -2 -3 -4 -5 -6 -7 -8 2 3 x 5 6 x 8 x]
pslldq xm3, 1
pinsrb xm3, [r2], 0 ; m3 = [ 0 -1 -2 -3 -4 -5 -6 -7 -8 2 3 x 5 6 x 8]
pshufb xm3, [c_ang8_mode_16]
vinserti128 m3, m3, xm3, 1 ; m3 = [-8 -7 -6 -5 -4 -3 -2 -1 0 2 3 5 6 8]
; process 4 rows
pshufb m4, m3, m1
pshufb m5, m3, m2
psrldq m3, 4 ; advance reference window for the next batch
punpcklbw m6, m5, m4
punpckhbw m5, m4
pmaddubsw m6, m0
pmulhrsw m6, [pw_1024]
pmaddubsw m5, m0
pmulhrsw m5, [pw_1024]
packuswb m6, m5
vextracti128 xm5, m6, 1
movq [r0], xm6
movhps [r0 + r1], xm6
movq [r0 + r1 * 2], xm5
movhps [r0 + r3], xm5
; process 4 rows
lea r0, [r0 + r1 * 4]
pshufb m4, m3, m1
pshufb m5, m3, m2
punpcklbw m6, m5, m4
punpckhbw m5, m4
pmaddubsw m6, m0
pmulhrsw m6, [pw_1024]
pmaddubsw m5, m0
pmulhrsw m5, [pw_1024]
packuswb m6, m5
vextracti128 xm5, m6, 1
movq [r0], xm6
movhps [r0 + r1], xm6
movq [r0 + r1 * 2], xm5
movhps [r0 + r3], xm5
RET
; Two implementations of mode 20 are kept; %if 1 selects the first (the
; %else branch is retained for reference but never assembled).
%if 1
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 20, AVX2 (active implementation).
; Rows are written bottom-up (r0 -> last row, r1 negated).  The mixed
; top/left reference is assembled in xm1 ([r2+1..8] plus projected pixels
; from [r2+16+...] and the corner via palignr), reordered by c_ang8_mode_20,
; then each 4-row batch expands byte pairs and interpolates against the
; angHor8_tab_20 factor rows; psrldq advances the window between row pairs.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_20, 3,5,6
lea r0, [r0 + r1 * 8]
sub r0, r1
neg r1 ; write rows bottom-up
lea r3, [angHor8_tab_20]
lea r4, [r1 * 3]
movu m5, [intra_pred_shuff_0_8 + 16]
; prepare reference pixel
movq xm1, [r2 + 1] ; m3 = [ 1 2 3 4 5 6 7 8 x x x x x x x x]
movhps xm1, [r2 + 16 + 2] ; m3 = [ 1 2 3 4 5 6 7 8 -2 -3 x -5 -6 x -8 x]
palignr xm1, xm1, [r2 - 15], 15 ; m3 = [ 0 1 2 3 4 5 6 7 8 -2 -3 x -5 -6 x -8]
pshufb xm1, [c_ang8_mode_20]
vinserti128 m1, m1, xm1, 1
; process 4 rows
pshufb m3, m1, m5
psrldq m1, 2
pmaddubsw m3, [r3 + 0 * 16]
pmulhrsw m3, [pw_1024]
pshufb m4, m1, [intra_pred_shuff_0_8]
psrldq m1, 1
pmaddubsw m4, [r3 + 2 * 16]
pmulhrsw m4, [pw_1024]
packuswb m3, m4
vextracti128 xm4, m3, 1
movq [r0], xm3
movq [r0 + r1], xm4
movhps [r0 + r1 * 2], xm3
movhps [r0 + r4], xm4
; process 4 rows
lea r0, [r0 + r1 * 4]
pshufb m3, m1, m5
psrldq m1, 1
pmaddubsw m3, [r3 + 4 * 16]
pmulhrsw m3, [pw_1024]
pshufb m4, m1, m5
pmaddubsw m4, [r3 + 6 * 16]
pmulhrsw m4, [pw_1024]
packuswb m3, m4
vextracti128 xm4, m3, 1
movq [r0], xm3
movq [r0 + r1], xm4
movhps [r0 + r1 * 2], xm3
movhps [r0 + r4], xm4
RET
%else
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 20, AVX2 (disabled alternative).
; Serially shifts the reference window (pslldq/pinsrb of projected left
; pixels at [r2 + k + 16]) and weights with the c_ang8_mode_20 tables.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_20, 3, 6, 6
mova m3, [pw_1024]
movu xm5, [r2]
lea r5, [intra_pred_shuff_0_8]
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 2 + 16], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]
lea r4, [c_ang8_mode_20]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 3 + 16], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
pslldq xm5, 1
pinsrb xm5, [r2 + 5 + 16], 0
vinserti128 m0, m5, xm5, 1
pshufb m0, [r5]
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
pslldq xm5, 1
pinsrb xm5, [r2 + 6 + 16], 0
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 8 + 16], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET
%endif
INIT_YMM avx2
;-----------------------------------------------------------------------------
; 8x8 angular intra prediction, mode 21, AVX2.
; Mirror of mode 15 (shares the c_ang8_mode_15 weight tables): the window
; starts at r2 and is extended with projected pixels from the second
; reference half ([r2 + k + 16]) via pslldq/pinsrb; result is stored
; directly (lane-split row order), no transpose.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_21, 3, 6, 6
mova m3, [pw_1024]
movu xm5, [r2]
lea r5, [intra_pred_shuff_0_8]
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 2 + 16], 0 ; prepend next projected reference pixel
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]
lea r4, [c_ang8_mode_15]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 4 + 16], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 6 + 16], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 8 + 16], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; store eight 8-byte rows (row pairs are split across the two 128-bit lanes)
lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET
;-----------------------------------------------------------------------------
; intra_pred_ang8_22 - AVX2 8x8 angular intra prediction, mode 22.
; Observed register roles: r0 = dst, r1 = dstStride, r2 = reference samples.
; Coefficient table shared with the mirrored mode 14 (c_ang8_mode_14).
; The first YMM uses the unshifted reference in both lanes; later pairs
; shift one projected sample ("+ 16" loads) into xm5 per row pair.
; pmulhrsw with pw_1024 rounds the 5-bit fraction: (x + 16) >> 5.
INIT_YMM avx2
cglobal intra_pred_ang8_22, 3, 6, 6
mova m3, [pw_1024]
movu xm5, [r2]
lea r5, [intra_pred_shuff_0_8]
; rows 0-1: same reference window in both lanes
vinserti128 m0, m5, xm5, 1
pshufb m0, [r5]
lea r4, [c_ang8_mode_14]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
; rows 2-3
pslldq xm5, 1
pinsrb xm5, [r2 + 2 + 16], 0
vinserti128 m0, m5, xm5, 1
pshufb m0, [r5]
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
; rows 4-5
pslldq xm5, 1
pinsrb xm5, [r2 + 5 + 16], 0
vinserti128 m0, m5, xm5, 1
pshufb m0, [r5]
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
; rows 6-7: low lane keeps the previous pair's shuffled row, high lane is new
pslldq xm5, 1
pinsrb xm5, [r2 + 7 + 16], 0
pshufb xm5, [r5]
vinserti128 m0, m0, xm5, 1
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
; pack to bytes and store eight 8-byte rows
packuswb m1, m2
packuswb m4, m0
lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET
;-----------------------------------------------------------------------------
; intra_pred_ang8_14 - AVX2 8x8 angular intra prediction, mode 14.
; Observed register roles: r0 = dst, r1 = dstStride, r2 = reference samples;
; this horizontal-side mode reads its main reference at r2 + 16 and pulls
; individual projected samples from the first half ([r2 + n]).
; The prediction is computed row-pair-wise exactly like the mirrored mode 22
; (same c_ang8_mode_14 table), then the 8x8 block is reordered (transposed)
; by the punpck*/vpermd(trans8_shuf) sequence before storing, since the
; filtering produced the block with rows and columns swapped.
INIT_YMM avx2
cglobal intra_pred_ang8_14, 3, 6, 6
mova m3, [pw_1024]
movu xm5, [r2 + 16]
pinsrb xm5, [r2], 0
lea r5, [intra_pred_shuff_0_8]
; pair 0: same reference window in both lanes
vinserti128 m0, m5, xm5, 1
pshufb m0, [r5]
lea r4, [c_ang8_mode_14]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
; pair 1
pslldq xm5, 1
pinsrb xm5, [r2 + 2], 0
vinserti128 m0, m5, xm5, 1
pshufb m0, [r5]
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
; pair 2
pslldq xm5, 1
pinsrb xm5, [r2 + 5], 0
vinserti128 m0, m5, xm5, 1
pshufb m0, [r5]
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
; pair 3: low lane keeps the previous pair's shuffled row, high lane is new
pslldq xm5, 1
pinsrb xm5, [r2 + 7], 0
pshufb xm5, [r5]
vinserti128 m0, m0, xm5, 1
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; 8x8 byte transpose: interleave bytes/words, then permute dwords
vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4
; store eight 8-byte rows
lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET
;-----------------------------------------------------------------------------
; intra_pred_ang8_13 - AVX2 8x8 angular intra prediction, mode 13.
; Observed register roles: r0 = dst, r1 = dstStride, r2 = reference samples;
; horizontal-side mode: main reference at r2 + 16, projected samples from
; [r2 + n].  Uses the c_ang8_mode_13 weight table; pmulhrsw with pw_1024
; rounds the 5-bit fraction ((x + 16) >> 5).  The middle two row pairs share
; one shuffled reference row (xm4), inserted into the high then the low lane.
; Ends with the same punpck*/vpermd(trans8_shuf) 8x8 transpose as mode 14.
INIT_YMM avx2
cglobal intra_pred_ang8_13, 3, 6, 6
mova m3, [pw_1024]
movu xm5, [r2 + 16]
pinsrb xm5, [r2], 0
lea r5, [intra_pred_shuff_0_8]
; pair 0: unshifted reference in both lanes
vinserti128 m0, m5, xm5, 1
pshufb m0, [r5]
lea r4, [c_ang8_mode_13]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
; pair 1: shifted row goes to the high lane only
pslldq xm5, 1
pinsrb xm5, [r2 + 4], 0
pshufb xm4, xm5, [r5]
vinserti128 m0, m0, xm4, 1
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
; pair 2: same shifted row now also in the low lane
vinserti128 m0, m0, xm4, 0
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
; pair 3
pslldq xm5, 1
pinsrb xm5, [r2 + 7], 0
pshufb xm5, [r5]
vinserti128 m0, m0, xm5, 1
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; 8x8 byte transpose before storing
vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4
lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET
;-----------------------------------------------------------------------------
; intra_pred_ang8_23 - AVX2 8x8 angular intra prediction, mode 23.
; Observed register roles: r0 = dst, r1 = dstStride, r2 = reference samples
; ("+ 16" loads select the projected side samples).
; Mirror of mode 13: identical c_ang8_mode_13 weight table and the same
; lane bookkeeping, but reading the other reference half and storing rows
; directly (no transpose).  pmulhrsw with pw_1024 rounds: (x + 16) >> 5.
INIT_YMM avx2
cglobal intra_pred_ang8_23, 3, 6, 6
mova m3, [pw_1024]
movu xm5, [r2]
lea r5, [intra_pred_shuff_0_8]
; pair 0: unshifted reference in both lanes
vinserti128 m0, m5, xm5, 1
pshufb m0, [r5]
lea r4, [c_ang8_mode_13]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
; pair 1: shifted row to the high lane only
pslldq xm5, 1
pinsrb xm5, [r2 + 4 + 16], 0
pshufb xm4, xm5, [r5]
vinserti128 m0, m0, xm4, 1
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
; pair 2: same shifted row in both lanes
vinserti128 m0, m0, xm4, 0
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
; pair 3
pslldq xm5, 1
pinsrb xm5, [r2 + 7 + 16], 0
pshufb xm5, [r5]
vinserti128 m0, m0, xm5, 1
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
; pack and store eight 8-byte rows
packuswb m1, m2
packuswb m4, m0
lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET
;-----------------------------------------------------------------------------
; intra_pred_ang8_12 - AVX2 8x8 angular intra prediction, mode 12.
; Observed register roles: r0 = dst, r1 = dstStride, r2 = reference samples;
; horizontal-side mode reading the main reference at r2 + 16.
; Shares the c_ang8_mode_24 table with the mirrored mode 24.  The first six
; rows all use the same shuffled reference (m0); only the last pair needs
; the window shifted by two with projected samples inserted.  The result is
; transposed with punpck*/vpermd(trans8_shuf) before storing.
INIT_YMM avx2
cglobal intra_pred_ang8_12, 3, 5, 5
mova m3, [pw_1024]
movu xm1, [r2 + 16]
pinsrb xm1, [r2], 0
pshufb xm1, [intra_pred_shuff_0_8]
vinserti128 m0, m1, xm1, 1
lea r4, [c_ang8_mode_24]
; pairs 0-2 reuse the same reference window, different weight rows
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
; pair 3: slide the (already interleaved) window by one sample pair
pslldq xm0, 2
pinsrb xm0, [r2 + 6], 0
pinsrb xm0, [r2 + 0], 1
vinserti128 m0, m0, xm0, 1
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0
; 8x8 byte transpose before storing
vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4
lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET
;-----------------------------------------------------------------------------
; intra_pred_ang8_24 - AVX2 8x8 angular intra prediction, mode 24.
; Observed register roles: r0 = dst, r1 = dstStride, r2 = reference samples.
; Mirror of mode 12 (same c_ang8_mode_24 table): the first six rows use the
; unshifted reference window, the last pair shifts by one sample and inserts
; the projected sample from [r2 + 16 + 6].  Rows are stored directly
; (vertical-side mode, no transpose).  pmulhrsw/pw_1024 = (x + 16) >> 5.
INIT_YMM avx2
cglobal intra_pred_ang8_24, 3, 5, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred_shuff_0_8]
lea r4, [c_ang8_mode_24]
; pairs 0-2 reuse the same reference window, different weight rows
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
; pair 3: slide the interleaved window by one sample pair
pslldq xm0, 2
pinsrb xm0, [r2 + 16 + 6], 0
pinsrb xm0, [r2 + 0], 1
vinserti128 m0, m0, xm0, 1
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
; pack and store eight 8-byte rows
packuswb m1, m2
packuswb m4, m0
lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET
; INTRA_PRED_ANG16_MC0 dst0, dst1, tabIdx
; Filter two 16-pixel rows from the prepared references m1/m2 with weight
; row [r4 + tabIdx*mmsize], round via m0 (expected to hold pw_1024), pack
; to bytes and store one xmm row to each destination address.
; Clobbers m3, m4.
%macro INTRA_PRED_ANG16_MC0 3
pmaddubsw m3, m1, [r4 + %3 * mmsize]
pmulhrsw m3, m0
pmaddubsw m4, m2, [r4 + %3 * mmsize]
pmulhrsw m4, m0
packuswb m3, m4
movu [%1], xm3
vextracti128 xm4, m3, 1
movu [%2], xm4
%endmacro
; INTRA_PRED_ANG16_MC1 tabIdx
; Emit four consecutive 16-pixel rows (at r0, r0+r1, r0+2*r1, r0+r3) using
; weight rows tabIdx and tabIdx+1.  Assumes r3 = 3 * r1.
%macro INTRA_PRED_ANG16_MC1 1
INTRA_PRED_ANG16_MC0 r0, r0 + r1, %1
INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, (%1 + 1)
%endmacro
; INTRA_PRED_ANG16_MC2 offset
; Load the two 8-pixel reference windows at r2+offset and r2+offset+8 into
; both lanes of m1/m2 and interleave them with the shuffle mask in m5,
; ready for INTRA_PRED_ANG16_MC0/MC1.
%macro INTRA_PRED_ANG16_MC2 1
vbroadcasti128 m1, [r2 + %1]
pshufb m1, m5
vbroadcasti128 m2, [r2 + (%1 + 8)]
pshufb m2, m5
%endmacro
; INTRA_PRED_ANG16_MC3 dst, tabIdx
; Merge the low lanes of m1/m2 into one YMM, filter a single 16-pixel row
; with weight row tabIdx, and store it to dst.  Leaves the packed row in m3
; (its low half) for a following INTRA_PRED_ANG16_MC4 to pair with.
%macro INTRA_PRED_ANG16_MC3 2
vperm2i128 m1, m1, m2, 00100000b
pmaddubsw m3, m1, [r4 + (%2 * mmsize)]
pmulhrsw m3, m0
packuswb m3, m3
vpermq m3, m3, 11011000b
movu [%1], xm3
%endmacro
; INTRA_PRED_ANG16_MC4 dst0, dst1, tabIdx
; Companion to INTRA_PRED_ANG16_MC3: filter a second 16-pixel row (weight
; row tabIdx), pack it together with the row already held in m3, and store
; the two resulting rows to dst0/dst1.
%macro INTRA_PRED_ANG16_MC4 3
vperm2i128 m1, m1, m2, 00100000b
pmaddubsw m4, m1, [r4 + (%3 * mmsize)]
pmulhrsw m4, m0
packuswb m3, m4
vpermq m3, m3, 11011000b
movu [%1], xm3
vextracti128 xm3, m3, 1
movu [%2], xm3
%endmacro
%if ARCH_X86_64 == 1
; INTRA_PRED_TRANS_STORE_16x16
; Transpose the 16x16 byte block held in m0-m7 (two 16-byte rows per YMM)
; and store the sixteen resulting rows to r0 with stride r1 (r3 = 3 * r1).
; Classic interleave ladder: bytes -> words -> dwords, then vpermq 0xD8 to
; fix the cross-lane ordering.  Uses m8 as an extra temporary and destroys
; all of m0-m8; advances r0 by 4 rows three times.
%macro INTRA_PRED_TRANS_STORE_16x16 0
punpcklbw m8, m0, m1
punpckhbw m0, m1
punpcklbw m1, m2, m3
punpckhbw m2, m3
punpcklbw m3, m4, m5
punpckhbw m4, m5
punpcklbw m5, m6, m7
punpckhbw m6, m7
; 16-bit interleave stage
punpcklwd m7, m8, m1
punpckhwd m8, m1
punpcklwd m1, m3, m5
punpckhwd m3, m5
punpcklwd m5, m0, m2
punpckhwd m0, m2
punpcklwd m2, m4, m6
punpckhwd m4, m6
; 32-bit interleave stage
punpckldq m6, m7, m1
punpckhdq m7, m1
punpckldq m1, m8, m3
punpckhdq m8, m3
punpckldq m3, m5, m2
punpckhdq m5, m2
punpckldq m2, m0, m4
punpckhdq m0, m4
; repair the 128-bit lane interleaving
vpermq m6, m6, 0xD8
vpermq m7, m7, 0xD8
vpermq m1, m1, 0xD8
vpermq m8, m8, 0xD8
vpermq m3, m3, 0xD8
vpermq m5, m5, 0xD8
vpermq m2, m2, 0xD8
vpermq m0, m0, 0xD8
; store 16 rows, 4 at a time
movu [r0], xm6
vextracti128 xm4, m6, 1
movu [r0 + r1], xm4
movu [r0 + 2 * r1], xm7
vextracti128 xm4, m7, 1
movu [r0 + r3], xm4
lea r0, [r0 + 4 * r1]
movu [r0], xm1
vextracti128 xm4, m1, 1
movu [r0 + r1], xm4
movu [r0 + 2 * r1], xm8
vextracti128 xm4, m8, 1
movu [r0 + r3], xm4
lea r0, [r0 + 4 * r1]
movu [r0], xm3
vextracti128 xm4, m3, 1
movu [r0 + r1], xm4
movu [r0 + 2 * r1], xm5
vextracti128 xm4, m5, 1
movu [r0 + r3], xm4
lea r0, [r0 + 4 * r1]
movu [r0], xm2
vextracti128 xm4, m2, 1
movu [r0 + r1], xm4
movu [r0 + 2 * r1], xm0
vextracti128 xm4, m0, 1
movu [r0 + r3], xm4
%endmacro
; INTRA_PRED_ANG16_CAL_ROW dst, tmp, tabIdx
; Filter the prepared references m9/m10 with weight row
; [r4 + tabIdx*mmsize], round via m11 (expected to hold pw_1024) and pack
; both halves into dst (tmp is clobbered).
%macro INTRA_PRED_ANG16_CAL_ROW 3
pmaddubsw %1, m9, [r4 + (%3 * mmsize)]
pmulhrsw %1, m11
pmaddubsw %2, m10, [r4 + (%3 * mmsize)]
pmulhrsw %2, m11
packuswb %1, %2
%endmacro
;-----------------------------------------------------------------------------
; intra_pred_ang16_12 - AVX2 16x16 angular intra prediction, mode 12.
; Observed register roles: r0 = dst, r1 = dstStride, r2 = reference samples
; (main reference at r2 + mmsize, projected samples pulled from [r2 + n]).
; m0/m1 = per-column weight pairs (angHor_tab_12), m2 = pw_1024 rounding
; constant (pmulhrsw => (x + 16) >> 5), m7/m8 = per-row source shuffles.
; Each stanza filters one YMM (two 16-pixel rows) and stores its halves;
; palignr against m6 slides the reference window 2 bytes per row pair.
INIT_YMM avx2
cglobal intra_pred_ang16_12, 3,4,9
vbroadcasti128 m0, [angHor_tab_12]
vbroadcasti128 m1, [angHor_tab_12 + mmsize/2]
mova m2, [pw_1024]
mova m7, [ang16_shuf_mode12]
mova m8, [ang16_shuf_mode12 + mmsize]
lea r3, [r1 * 3]
; build the reference window, patching in the projected left samples
movu xm4, [r2 + mmsize - 2]
pinsrb xm4, [r2 + 0], 2
pinsrb xm4, [r2 + 6], 1
pinsrb xm4, [r2 + 13], 0
vbroadcasti128 m6, [r2 + mmsize + 14]
vinserti128 m3, m4, xm4, 1
; rows 0-1
pshufb m4, m3, m7
pshufb m5, m3, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 2-3
palignr m5, m6, m3, 2
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 4-5
palignr m5, m6, m3, 4
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 6-7
palignr m5, m6, m3, 6
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 8-9
palignr m5, m6, m3, 8
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 10-11
palignr m5, m6, m3, 10
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 12-13
palignr m5, m6, m3, 12
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 14-15
palignr m5, m6, m3, 14
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
RET
;-----------------------------------------------------------------------------
; intra_pred_ang16_13 - AVX2 16x16 angular intra prediction, mode 13.
; Same structure as intra_pred_ang16_12: r0 = dst, r1 = dstStride,
; r2 = reference samples; m0/m1 = angHor_tab_13 weight pairs, m2 = pw_1024,
; m7/m8 = per-row shuffles.  The projected left samples are gathered with
; the third shuffle mask ([ang16_shuf_mode13 + 2*mmsize]) and spliced in
; front of the main reference with palignr 11.  Two rows per stanza.
INIT_YMM avx2
cglobal intra_pred_ang16_13, 3,4,9
vbroadcasti128 m0, [angHor_tab_13]
vbroadcasti128 m1, [angHor_tab_13 + mmsize/2]
mova m2, [pw_1024]
mova m7, [ang16_shuf_mode13]
mova m8, [ang16_shuf_mode13 + mmsize]
lea r3, [r1 * 3]
; build the reference window with the projected samples prepended
vbroadcasti128 m3, [r2 + mmsize + 1]
vbroadcasti128 m4, [r2]
pshufb m4, [ang16_shuf_mode13 + mmsize * 2]
palignr m3, m4, 11
vbroadcasti128 m6, [r2 + mmsize + 12]
; rows 0-1
pshufb m4, m3, m7
pshufb m5, m3, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 2-3
palignr m5, m6, m3, 2
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 4-5
palignr m5, m6, m3, 4
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 6-7
palignr m5, m6, m3, 6
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 8-9
palignr m5, m6, m3, 8
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 10-11
palignr m5, m6, m3, 10
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 12-13
palignr m5, m6, m3, 12
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 14-15
palignr m5, m6, m3, 14
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
RET
;-----------------------------------------------------------------------------
; intra_pred_ang16_14 - AVX2 16x16 angular intra prediction, mode 14.
; Same structure as intra_pred_ang16_12/13: r0 = dst, r1 = dstStride,
; r2 = reference samples; m0/m1 = angHor_tab_14 weight pairs, m2 = pw_1024,
; m7/m8 = per-row shuffles.  Projected samples are gathered with
; [ang16_shuf_mode14 + 2*mmsize] and spliced in with palignr 9
; (this shallower angle needs more projected samples than mode 13).
INIT_YMM avx2
cglobal intra_pred_ang16_14, 3,4,9
vbroadcasti128 m0, [angHor_tab_14]
vbroadcasti128 m1, [angHor_tab_14 + mmsize/2]
mova m2, [pw_1024]
mova m7, [ang16_shuf_mode14]
mova m8, [ang16_shuf_mode14 + mmsize]
lea r3, [r1 * 3]
; build the reference window with the projected samples prepended
vbroadcasti128 m3, [r2 + mmsize + 1]
vbroadcasti128 m4, [r2]
pshufb m4, [ang16_shuf_mode14 + mmsize * 2]
palignr m3, m4, 9
vbroadcasti128 m6, [r2 + mmsize + 10]
; rows 0-1
pshufb m4, m3, m7
pshufb m5, m3, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 2-3
palignr m5, m6, m3, 2
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 4-5
palignr m5, m6, m3, 4
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 6-7
palignr m5, m6, m3, 6
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 8-9
palignr m5, m6, m3, 8
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 10-11
palignr m5, m6, m3, 10
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 12-13
palignr m5, m6, m3, 12
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 14-15
palignr m5, m6, m3, 14
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
RET
;-----------------------------------------------------------------------------
; intra_pred_ang16_15 - AVX2 16x16 angular intra prediction, mode 15.
; Same structure as intra_pred_ang16_12..14: r0 = dst, r1 = dstStride,
; r2 = reference samples; m0/m1 = angHor_tab_15 weight pairs, m2 = pw_1024,
; m7/m8 = per-row shuffles.  Projected samples are gathered with
; [ang16_shuf_mode15 + 2*mmsize] and spliced in with palignr 7.
INIT_YMM avx2
cglobal intra_pred_ang16_15, 3,4,9
vbroadcasti128 m0, [angHor_tab_15]
vbroadcasti128 m1, [angHor_tab_15 + mmsize/2]
mova m2, [pw_1024]
mova m7, [ang16_shuf_mode15]
mova m8, [ang16_shuf_mode15 + mmsize]
lea r3, [r1 * 3]
; build the reference window with the projected samples prepended
vbroadcasti128 m3, [r2 + mmsize + 1]
vbroadcasti128 m4, [r2]
pshufb m4, [ang16_shuf_mode15 + mmsize * 2]
palignr m3, m3, m4, 7
vbroadcasti128 m6, [r2 + mmsize + 8]
; rows 0-1
pshufb m4, m3, m7
pshufb m5, m3, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 2-3
palignr m5, m6, m3, 2
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 4-5
palignr m5, m6, m3, 4
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 6-7
palignr m5, m6, m3, 6
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 8-9
palignr m5, m6, m3, 8
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 10-11
palignr m5, m6, m3, 10
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 12-13
palignr m5, m6, m3, 12
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 14-15
palignr m5, m6, m3, 14
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
RET
;-----------------------------------------------------------------------------
; intra_pred_ang16_16 - AVX2 16x16 angular intra prediction, mode 16.
; Same structure as intra_pred_ang16_12..15: r0 = dst, r1 = dstStride,
; r2 = reference samples; m0/m1 = angHor_tab_16 weight pairs, m2 = pw_1024,
; m7/m8 = per-row shuffles.  Projected samples are gathered with
; [ang16_shuf_mode16 + 2*mmsize] and spliced in with palignr 5.
INIT_YMM avx2
cglobal intra_pred_ang16_16, 3,4,9
vbroadcasti128 m0, [angHor_tab_16]
vbroadcasti128 m1, [angHor_tab_16 + mmsize/2]
mova m2, [pw_1024]
mova m7, [ang16_shuf_mode16]
mova m8, [ang16_shuf_mode16 + mmsize]
lea r3, [r1 * 3]
; build the reference window with the projected samples prepended
vbroadcasti128 m3, [r2 + mmsize + 1]
vbroadcasti128 m4, [r2]
pshufb m4, [ang16_shuf_mode16 + mmsize * 2]
palignr m3, m4, 5
vbroadcasti128 m6, [r2 + mmsize + 6]
; rows 0-1
pshufb m4, m3, m7
pshufb m5, m3, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 2-3
palignr m5, m6, m3, 2
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 4-5
palignr m5, m6, m3, 4
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 6-7
palignr m5, m6, m3, 6
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 8-9
palignr m5, m6, m3, 8
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 10-11
palignr m5, m6, m3, 10
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 12-13
palignr m5, m6, m3, 12
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 14-15
palignr m5, m6, m3, 14
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
RET
;-----------------------------------------------------------------------------
; intra_pred_ang16_17 - AVX2 16x16 angular intra prediction, mode 17.
; Same structure as intra_pred_ang16_12..16: r0 = dst, r1 = dstStride,
; r2 = reference samples; m0/m1 = angHor_tab_17 weight pairs, m2 = pw_1024,
; m7/m8 = per-row shuffles.  Projected samples are gathered with
; [ang16_shuf_mode17 + 2*mmsize] and spliced in with palignr 3.
INIT_YMM avx2
cglobal intra_pred_ang16_17, 3,4,9
vbroadcasti128 m0, [angHor_tab_17]
vbroadcasti128 m1, [angHor_tab_17 + mmsize/2]
mova m2, [pw_1024]
mova m7, [ang16_shuf_mode17]
mova m8, [ang16_shuf_mode17 + mmsize]
lea r3, [r1 * 3]
; build the reference window with the projected samples prepended
vbroadcasti128 m3, [r2 + mmsize + 1]
vbroadcasti128 m4, [r2]
pshufb m4, [ang16_shuf_mode17 + mmsize * 2]
palignr m3, m4, 3
vbroadcasti128 m6, [r2 + mmsize + 4]
; rows 0-1
pshufb m4, m3, m7
pshufb m5, m3, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 2-3
palignr m5, m6, m3, 2
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 4-5
palignr m5, m6, m3, 4
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 6-7
palignr m5, m6, m3, 6
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 8-9
palignr m5, m6, m3, 8
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 10-11
palignr m5, m6, m3, 10
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 12-13
palignr m5, m6, m3, 12
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 14-15
palignr m5, m6, m3, 14
pshufb m4, m5, m7
pshufb m5, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
RET
;-----------------------------------------------------------------------------
; intra_pred_ang16_11 - AVX2 16x16 angular intra prediction, mode 11.
; Observed register roles: r0 = dst, r1 = dstStride, r2 = reference samples
; (main reference at r2 + mmsize; only the corner sample [r2] is patched in).
; This near-horizontal angle needs a single source shuffle (m7, the mask
; shared with the 32x32 mode 9 kernel); m0/m1 hold the angHor_tab_11 weight
; pairs and m2 the pw_1024 rounding constant ((x + 16) >> 5 via pmulhrsw).
; Each stanza produces two 16-pixel rows; palignr slides the window.
INIT_YMM avx2
cglobal intra_pred_ang16_11, 3,4,8
vbroadcasti128 m0, [angHor_tab_11]
vbroadcasti128 m1, [angHor_tab_11 + mmsize/2]
mova m2, [pw_1024]
mova m7, [ang32_shuf_mode9]
lea r3, [r1 * 3]
; prepare for [0 -1 -2...]
movu xm3, [r2 + mmsize]
pinsrb xm3, [r2], 0
vbroadcasti128 m6, [r2 + mmsize + 16]
vinserti128 m3, m3, xm3, 1
; rows 0-1
pshufb m5, m3, m7 ; [ 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2]
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 2-3
palignr m5, m6, m3, 2
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 4-5
palignr m5, m6, m3, 4
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 6-7
palignr m5, m6, m3, 6
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 8-9
palignr m5, m6, m3, 8
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 10-11
palignr m5, m6, m3, 10
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 12-13
palignr m5, m6, m3, 12
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 14-15
palignr m5, m6, m3, 14
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
RET
; transpose 8x32 to 16x16, used for intra_ang16x16 avx2 asm
%if ARCH_X86_64 == 1
INIT_YMM avx2
; TRANSPOSE_STORE_8x32 r0..r7, t0..t3
; Store the 16x16 prediction held as eight YMM registers (%1-%8, each with
; two packed 16-byte rows); %9-%12 are scratch.  The carry flag, set by the
; caller (clc = horizontal mode, stc = vertical mode), selects the path:
;   CF = 0: full byte transpose (the shared mode kernels compute horizontal
;           modes column-wise, so rows/columns must be swapped on store);
;   CF = 1: direct store, only vpermq q3120 to undo the lane interleave.
; Assumes r5 = 3 * stride and r6 = 4 * stride; advances r0.
%macro TRANSPOSE_STORE_8x32 12
jc .skip
; transpose path: bytes -> words -> dwords interleave ladder
punpcklbw m%9, m%1, m%2
punpckhbw m%1, m%2
punpcklbw m%10, m%3, m%4
punpckhbw m%3, m%4
punpcklwd m%11, m%9, m%10
punpckhwd m%9, m%10
punpcklwd m%10, m%1, m%3
punpckhwd m%1, m%3
punpckldq m%12, m%11, m%10
punpckhdq m%11, m%10
punpckldq m%10, m%9, m%1
punpckhdq m%9, m%1
punpcklbw m%1, m%5, m%6
punpckhbw m%5, m%6
punpcklbw m%2, m%7, m%8
punpckhbw m%7, m%8
punpcklwd m%3, m%1, m%2
punpckhwd m%1, m%2
punpcklwd m%4, m%5, m%7
punpckhwd m%5, m%7
punpckldq m%2, m%3, m%4
punpckhdq m%3, m%4
punpckldq m%4, m%1, m%5
punpckhdq m%1, m%5
; combine the two dword-interleaved halves
punpckldq m%5, m%12, m%2
punpckhdq m%6, m%12, m%2
punpckldq m%7, m%10, m%4
punpckhdq m%8, m%10, m%4
punpckldq m%2, m%11, m%3
punpckhdq m%11, m%11, m%3
punpckldq m%4, m%9, m%1
punpckhdq m%9, m%9, m%1
; store 16 transposed rows, 4 per group
movu [r0 + r1 * 0], xm%5
movu [r0 + r1 * 1], xm%6
movu [r0 + r1 * 2], xm%2
movu [r0 + r5 * 1], xm%11
add r0, r6
movu [r0 + r1 * 0], xm%7
movu [r0 + r1 * 1], xm%8
movu [r0 + r1 * 2], xm%4
movu [r0 + r5 * 1], xm%9
add r0, r6
vextracti128 [r0 + r1 * 0], m%5, 1
vextracti128 [r0 + r1 * 1], m%6, 1
vextracti128 [r0 + r1 * 2], m%2, 1
vextracti128 [r0 + r5 * 1], m%11, 1
add r0, r6
vextracti128 [r0 + r1 * 0], m%7, 1
vextracti128 [r0 + r1 * 1], m%8, 1
vextracti128 [r0 + r1 * 2], m%4, 1
vextracti128 [r0 + r5 * 1], m%9, 1
jmp .end
.skip:
; direct path: just undo the 128-bit lane interleave and store row pairs
vpermq m%1, m%1, q3120
vpermq m%2, m%2, q3120
vpermq m%3, m%3, q3120
vpermq m%4, m%4, q3120
vpermq m%5, m%5, q3120
vpermq m%6, m%6, q3120
vpermq m%7, m%7, q3120
vpermq m%8, m%8, q3120
movu [r0 + r1 * 0], xm%1
movu [r0 + r1 * 1], xm%2
movu [r0 + r1 * 2], xm%3
movu [r0 + r5 * 1], xm%4
add r0, r6
movu [r0 + r1 * 0], xm%5
movu [r0 + r1 * 1], xm%6
movu [r0 + r1 * 2], xm%7
movu [r0 + r5 * 1], xm%8
add r0, r6
vextracti128 [r0 + r1 * 0], m%1, 1
vextracti128 [r0 + r1 * 1], m%2, 1
vextracti128 [r0 + r1 * 2], m%3, 1
vextracti128 [r0 + r5 * 1], m%4, 1
add r0, r6
vextracti128 [r0 + r1 * 0], m%5, 1
vextracti128 [r0 + r1 * 1], m%6, 1
vextracti128 [r0 + r1 * 2], m%7, 1
vextracti128 [r0 + r5 * 1], m%8, 1
.end:
%endmacro
;-----------------------------------------------------------------------------
; ang16_mode_3_33 - shared 16x16 kernel for modes 3 (horizontal) and 33
; (vertical).  Callers set up: r2 = reference samples, r3 = coefficient
; table base (ang_table_avx2 + 16*32; the ±n*32 offsets select the fraction
; noted in the trailing "[f]" comments), r5 = 3*stride, r6 = 4*stride,
; m7 = pw_1024, and the carry flag (clc/stc) to pick the store path in
; TRANSPOSE_STORE_8x32.  Computes 16 filtered rows in m4-m6, m8-m12.
cglobal ang16_mode_3_33
; rows 0 to 7
movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
; build (p[i], p[i+1]) byte pairs for pmaddubsw
punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
vextracti128 xm1, m0, 1
vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
pmaddubsw m4, m0, [r3 + 10 * 32] ; [26]
pmulhrsw m4, m7
palignr m5, m2, m0, 2
pmaddubsw m5, [r3 + 4 * 32] ; [20]
pmulhrsw m5, m7
palignr m6, m2, m0, 4
palignr m8, m2, m0, 6
pmaddubsw m6, [r3 - 2 * 32] ; [14]
pmulhrsw m6, m7
pmaddubsw m8, [r3 - 8 * 32] ; [8]
pmulhrsw m8, m7
palignr m10, m2, m0, 8
pmaddubsw m9, m10, [r3 - 14 * 32] ; [2]
pmulhrsw m9, m7
pmaddubsw m10, [r3 + 12 * 32] ; [28]
pmulhrsw m10, m7
palignr m11, m2, m0, 10
palignr m12, m2, m0, 12
pmaddubsw m11, [r3 + 6 * 32] ; [22]
pmulhrsw m11, m7
pmaddubsw m12, [r3] ; [16]
pmulhrsw m12, m7
; rows 8 to 15
palignr m3, m2, m0, 14
palignr m1, m1, m2, 14
pmaddubsw m3, [r3 - 6 * 32] ; [10]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m3, m2, [r3 - 12 * 32] ; [4]
pmulhrsw m3, m7
packuswb m5, m3
pmaddubsw m3, m2, [r3 + 14 * 32] ; [30]
pmulhrsw m3, m7
packuswb m6, m3
; extend the pair window with reference samples 25.. for the last rows
movu xm0, [r2 + 25]
movu xm1, [r2 + 26]
punpcklbw m0, m1
mova m1, m2
vinserti128 m1, m1, xm0, 0
vpermq m1, m1, 01001110b
palignr m3, m1, m2, 2
pmaddubsw m3, [r3 + 8 * 32] ; [24]
pmulhrsw m3, m7
packuswb m8, m3
palignr m3, m1, m2, 4
pmaddubsw m3, [r3 + 2 * 32] ; [18]
pmulhrsw m3, m7
packuswb m9, m3
palignr m3, m1, m2, 6
pmaddubsw m3, [r3 - 4 * 32] ; [12]
pmulhrsw m3, m7
packuswb m10, m3
palignr m3, m1, m2, 8
pmaddubsw m3, [r3 - 10 * 32] ; [6]
pmulhrsw m3, m7
packuswb m11, m3
; last row has zero fraction: copy samples 14.. unfiltered
pmovzxbw m1, [r2 + 14]
packuswb m12, m1
TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
ret
;-----------------------------------------------------------------------------
; intra_pred_ang16_3 - AVX2 16x16 angular intra prediction, mode 3.
; Horizontal mode: advance r2 by 32 to read the left-neighbour half of the
; reference array, then run the shared kernel with CF clear (clc) so
; TRANSPOSE_STORE_8x32 takes the transposing store path.
INIT_YMM avx2
cglobal intra_pred_ang16_3, 3, 7, 13
add r2, 32
lea r3, [ang_table_avx2 + 16 * 32]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
clc
call ang16_mode_3_33
RET
;-----------------------------------------------------------------------------
; intra_pred_ang16_33 - AVX2 16x16 angular intra prediction, mode 33.
; Vertical mirror of mode 3: reads the above-neighbour half (no r2 offset)
; and sets CF (stc) so TRANSPOSE_STORE_8x32 stores rows directly.
INIT_YMM avx2
cglobal intra_pred_ang16_33, 3, 7, 13
lea r3, [ang_table_avx2 + 16 * 32]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
stc
call ang16_mode_3_33
RET
;-----------------------------------------------------------------------------
; ang16_mode_4_32 - shared 16x16 kernel for modes 4 (horizontal) and 32
; (vertical).  Same contract as ang16_mode_3_33: r2 = reference, r3 =
; ang_table_avx2 + 16*32, r5 = 3*stride, r6 = 4*stride, m7 = pw_1024,
; carry flag selects transpose vs direct store.  The "[f]" comments give
; the interpolation fraction selected by each table offset.
cglobal ang16_mode_4_32
; rows 0 to 7
movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
; build (p[i], p[i+1]) byte pairs for pmaddubsw
punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
vextracti128 xm1, m0, 1
vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
pmaddubsw m4, m0, [r3 + 5 * 32] ; [21]
pmulhrsw m4, m7
palignr m1, m2, m0, 2
pmaddubsw m5, m1, [r3 - 6 * 32] ; [10]
pmulhrsw m5, m7
palignr m8, m2, m0, 4
pmaddubsw m6, m1, [r3 + 15 * 32] ; [31]
pmulhrsw m6, m7
pmaddubsw m8, [r3 + 4 * 32] ; [20]
pmulhrsw m8, m7
palignr m10, m2, m0, 6
pmaddubsw m9, m10, [r3 - 7 * 32] ; [9]
pmulhrsw m9, m7
pmaddubsw m10, [r3 + 14 * 32] ; [30]
pmulhrsw m10, m7
palignr m11, m2, m0, 8
palignr m1, m2, m0, 10
pmaddubsw m11, [r3 + 3 * 32] ; [19]
pmulhrsw m11, m7
pmaddubsw m12, m1, [r3 - 8 * 32] ; [8]
pmulhrsw m12, m7
; rows 8 to 15
pmaddubsw m3, m1, [r3 + 13 * 32] ; [29]
pmulhrsw m3, m7
packuswb m4, m3
palignr m3, m2, m0, 12
pmaddubsw m3, m3, [r3 + 2 * 32] ; [18]
pmulhrsw m3, m7
packuswb m5, m3
palignr m1, m2, m0, 14
pmaddubsw m3, m1, [r3 - 9 * 32] ; [7]
pmulhrsw m3, m7
packuswb m6, m3
pmaddubsw m3, m1, [r3 + 12 * 32] ; [28]
pmulhrsw m3, m7
packuswb m8, m3
palignr m3, m2, m0, 16
pmaddubsw m3, [r3 + 1 * 32] ; [17]
pmulhrsw m3, m7
packuswb m9, m3
; extend the pair window with reference samples 25.. for the last rows
movu xm0, [r2 + 25]
movu xm1, [r2 + 26]
punpcklbw m0, m1
mova m1, m2
vinserti128 m1, m1, xm0, 0
vpermq m1, m1, 01001110b
palignr m0, m1, m2, 2
pmaddubsw m3, m0, [r3 - 10 * 32] ; [6]
pmulhrsw m3, m7
packuswb m10, m3
pmaddubsw m3, m0, [r3 + 11 * 32] ; [27]
pmulhrsw m3, m7
packuswb m11, m3
palignr m1, m1, m2, 4
pmaddubsw m1, [r3] ; [16]
pmulhrsw m1, m7
packuswb m12, m1
TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
ret
;-----------------------------------------------------------------------------
; intra_pred_ang16_4 - AVX2 16x16 angular intra prediction, mode 4.
; Horizontal mode: r2 += 32 selects the left-neighbour reference half;
; clc makes the shared kernel transpose on store.
INIT_YMM avx2
cglobal intra_pred_ang16_4, 3, 7, 13
add r2, 32
lea r3, [ang_table_avx2 + 16 * 32]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
clc
call ang16_mode_4_32
RET
;-----------------------------------------------------------------------------
; intra_pred_ang16_32 - AVX2 16x16 angular intra prediction, mode 32.
; Vertical mirror of mode 4: above-neighbour reference, stc = direct store.
INIT_YMM avx2
cglobal intra_pred_ang16_32, 3, 7, 13
lea r3, [ang_table_avx2 + 16 * 32]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
stc
call ang16_mode_4_32
RET
;-----------------------------------------------------------------------------
; ang16_mode_5 - 16x16 kernel for mode 5 (horizontal; only the clc/transpose
; caller exists for this angle in this file).  Same contract as the other
; ang16_mode_* kernels: r2 = reference, r3 = ang_table_avx2 + 16*32,
; r5 = 3*stride, r6 = 4*stride, m7 = pw_1024; CF picks the store path.
; This shallower angle never reads past sample 25, so no window extension
; is needed; the final row uses fraction 16 at offset 16.
cglobal ang16_mode_5
; rows 0 to 7
movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
; build (p[i], p[i+1]) byte pairs for pmaddubsw
punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
vextracti128 xm1, m0, 1
vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
pmaddubsw m4, m0, [r3 + 1 * 32] ; [17]
pmulhrsw m4, m7
palignr m1, m2, m0, 2
pmaddubsw m5, m1, [r3 - 14 * 32] ; [2]
pmulhrsw m5, m7
palignr m3, m2, m0, 4
pmaddubsw m6, m1, [r3 + 3 * 32] ; [19]
pmulhrsw m6, m7
pmaddubsw m8, m3, [r3 - 12 * 32] ; [4]
pmulhrsw m8, m7
pmaddubsw m9, m3, [r3 + 5 * 32] ; [21]
pmulhrsw m9, m7
palignr m3, m2, m0, 6
pmaddubsw m10, m3, [r3 - 10 * 32] ; [6]
pmulhrsw m10, m7
palignr m1, m2, m0, 8
pmaddubsw m11, m3, [r3 + 7 * 32] ; [23]
pmulhrsw m11, m7
pmaddubsw m12, m1, [r3 - 8 * 32] ; [8]
pmulhrsw m12, m7
; rows 8 to 15
pmaddubsw m3, m1, [r3 + 9 * 32] ; [25]
pmulhrsw m3, m7
packuswb m4, m3
palignr m1, m2, m0, 10
pmaddubsw m3, m1, [r3 - 6 * 32] ; [10]
pmulhrsw m3, m7
packuswb m5, m3
pmaddubsw m3, m1, [r3 + 11 * 32] ; [27]
pmulhrsw m3, m7
packuswb m6, m3
palignr m1, m2, m0, 12
pmaddubsw m3, m1, [r3 - 4 * 32] ; [12]
pmulhrsw m3, m7
packuswb m8, m3
pmaddubsw m3, m1, [r3 + 13 * 32] ; [29]
pmulhrsw m3, m7
packuswb m9, m3
palignr m1, m2, m0, 14
pmaddubsw m3, m1, [r3 - 2 * 32] ; [14]
pmulhrsw m3, m7
packuswb m10, m3
pmaddubsw m3, m1, [r3 + 15 * 32] ; [31]
pmulhrsw m3, m7
packuswb m11, m3
palignr m1, m2, m0, 16
pmaddubsw m1, [r3] ; [16]
pmulhrsw m1, m7
packuswb m12, m1
TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
ret
;-----------------------------------------------------------------------------
; intra_pred_ang16_5 - AVX2 16x16 angular intra prediction, mode 5.
; Horizontal mode: r2 += 32 selects the left-neighbour reference half;
; clc makes the kernel's TRANSPOSE_STORE_8x32 transpose on store.
INIT_YMM avx2
cglobal intra_pred_ang16_5, 3, 7, 13
add r2, 32
lea r3, [ang_table_avx2 + 16 * 32]
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
clc
call ang16_mode_5
RET
;-----------------------------------------------------------------------------
; ang16_mode_6 - 16x16 kernel for mode 6 (horizontal; clc caller below).
; Same contract as the other ang16_mode_* kernels: r2 = reference,
; r3 = ang_table_avx2 + 16*32, r5 = 3*stride, r6 = 4*stride, m7 = pw_1024;
; CF picks transpose vs direct store.  Shallow angle: several output rows
; share each 2-byte-shifted reference window, differing only in fraction.
cglobal ang16_mode_6
; rows 0 to 7
movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
; build (p[i], p[i+1]) byte pairs for pmaddubsw
punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
vextracti128 xm1, m0, 1
vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
pmaddubsw m4, m0, [r3 - 3 * 32] ; [13]
pmulhrsw m4, m7
pmaddubsw m5, m0, [r3 + 10 * 32] ; [26]
pmulhrsw m5, m7
palignr m3, m2, m0, 2
pmaddubsw m6, m3, [r3 - 9 * 32] ; [7]
pmulhrsw m6, m7
pmaddubsw m8, m3, [r3 + 4 * 32] ; [20]
pmulhrsw m8, m7
palignr m3, m2, m0, 4
pmaddubsw m9, m3, [r3 - 15 * 32] ; [1]
pmulhrsw m9, m7
pmaddubsw m10, m3, [r3 - 2 * 32] ; [14]
pmulhrsw m10, m7
pmaddubsw m11, m3, [r3 + 11 * 32] ; [27]
pmulhrsw m11, m7
palignr m1, m2, m0, 6
pmaddubsw m12, m1, [r3 - 8 * 32] ; [8]
pmulhrsw m12, m7
; rows 8 to 15
pmaddubsw m3, m1, [r3 + 5 * 32] ; [21]
pmulhrsw m3, m7
packuswb m4, m3
palignr m1, m2, m0, 8
pmaddubsw m3, m1, [r3 - 14 * 32] ; [2]
pmulhrsw m3, m7
packuswb m5, m3
pmaddubsw m3, m1, [r3 - 1 * 32] ; [15]
pmulhrsw m3, m7
packuswb m6, m3
pmaddubsw m3, m1, [r3 + 12 * 32] ; [28]
pmulhrsw m3, m7
packuswb m8, m3
palignr m1, m2, m0, 10
pmaddubsw m3, m1, [r3 - 7 * 32] ; [9]
pmulhrsw m3, m7
packuswb m9, m3
pmaddubsw m3, m1, [r3 + 6 * 32] ; [22]
pmulhrsw m3, m7
packuswb m10, m3
palignr m1, m2, m0, 12
pmaddubsw m3, m1, [r3 - 13 * 32] ; [3]
pmulhrsw m3, m7
packuswb m11, m3
pmaddubsw m1, [r3] ; [16]
pmulhrsw m1, m7
packuswb m12, m1
TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
ret
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang16_6 -- 16x16 angular intra prediction, mode 6 (AVX2).
; In: r0 = dst, r1 = dst stride, r2 = reference-pixel array.
; Same wrapper pattern as intra_pred_ang16_5: prime r2/r3/r5/r6/m7 for the
; shared core and clear CF (the core presumably branches on CF -- confirm).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_6, 3, 7, 13
add r2, 32 ; step past the first 32 reference bytes
lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> middle of the coefficient table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024] ; rounding constant for pmulhrsw
clc ; CF = 0 for the shared core
call ang16_mode_6
RET
;-----------------------------------------------------------------------------
; ang16_mode_7 -- shared 16x16 angular-prediction core for mode 7.
; Same register contract and structure as ang16_mode_6; only the table
; offsets (fraction f per row, shown in [brackets]) and the points where
; the sample window advances (palignr) differ, mode 7 being a shallower
; angle so the window moves less often.
;-----------------------------------------------------------------------------
cglobal ang16_mode_7
; rows 0 to 7
movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
vextracti128 xm1, m0, 1
vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
pmaddubsw m4, m0, [r3 - 7 * 32] ; [9]
pmulhrsw m4, m7
pmaddubsw m5, m0, [r3 + 2 * 32] ; [18]
pmulhrsw m5, m7
pmaddubsw m6, m0, [r3 + 11 * 32] ; [27]
pmulhrsw m6, m7
palignr m3, m2, m0, 2 ; window advanced by one sample
pmaddubsw m8, m3, [r3 - 12 * 32] ; [4]
pmulhrsw m8, m7
pmaddubsw m9, m3, [r3 - 3 * 32] ; [13]
pmulhrsw m9, m7
pmaddubsw m10, m3, [r3 + 6 * 32] ; [22]
pmulhrsw m10, m7
pmaddubsw m11, m3, [r3 + 15 * 32] ; [31]
pmulhrsw m11, m7
palignr m1, m2, m0, 4 ; window advanced by two samples
pmaddubsw m12, m1, [r3 - 8 * 32] ; [8]
pmulhrsw m12, m7
; rows 8 to 15
pmaddubsw m3, m1, [r3 + 1 * 32] ; [17]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m3, m1, [r3 + 10 * 32] ; [26]
pmulhrsw m3, m7
packuswb m5, m3
palignr m1, m2, m0, 6 ; window advanced by three samples
pmaddubsw m3, m1, [r3 - 13 * 32] ; [3]
pmulhrsw m3, m7
packuswb m6, m3
pmaddubsw m3, m1, [r3 - 4 * 32] ; [12]
pmulhrsw m3, m7
packuswb m8, m3
pmaddubsw m3, m1, [r3 + 5 * 32] ; [21]
pmulhrsw m3, m7
packuswb m9, m3
pmaddubsw m3, m1, [r3 + 14 * 32] ; [30]
pmulhrsw m3, m7
packuswb m10, m3
palignr m1, m2, m0, 8 ; window advanced by four samples
pmaddubsw m3, m1, [r3 - 9 * 32] ; [7]
pmulhrsw m3, m7
packuswb m11, m3
pmaddubsw m1, [r3] ; [16]
pmulhrsw m1, m7
packuswb m12, m1
TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
ret
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang16_7 -- 16x16 angular intra prediction, mode 7 (AVX2).
; In: r0 = dst, r1 = dst stride, r2 = reference-pixel array.
; Same wrapper pattern as intra_pred_ang16_5: prime r2/r3/r5/r6/m7 for the
; shared core and clear CF (the core presumably branches on CF -- confirm).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_7, 3, 7, 13
add r2, 32 ; step past the first 32 reference bytes
lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> middle of the coefficient table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024] ; rounding constant for pmulhrsw
clc ; CF = 0 for the shared core
call ang16_mode_7
RET
;-----------------------------------------------------------------------------
; ang16_mode_8 -- shared 16x16 angular-prediction core for mode 8.
; Same register contract and structure as ang16_mode_6/7; mode 8 is the
; shallowest of this group, so the sample window only advances twice
; (palignr by 2 and by 4) across the 16 rows.
;-----------------------------------------------------------------------------
cglobal ang16_mode_8
; rows 0 to 7
movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
vextracti128 xm1, m0, 1
vperm2i128 m0, m0, m2, 0x20 ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
vperm2i128 m2, m2, m1, 0x20 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
pmaddubsw m4, m0, [r3 - 11 * 32] ; [5]
pmulhrsw m4, m7
pmaddubsw m5, m0, [r3 - 6 * 32] ; [10]
pmulhrsw m5, m7
pmaddubsw m6, m0, [r3 - 1 * 32] ; [15]
pmulhrsw m6, m7
pmaddubsw m8, m0, [r3 + 4 * 32] ; [20]
pmulhrsw m8, m7
pmaddubsw m9, m0, [r3 + 9 * 32] ; [25]
pmulhrsw m9, m7
pmaddubsw m10, m0, [r3 + 14 * 32] ; [30]
pmulhrsw m10, m7
palignr m1, m2, m0, 2 ; window advanced by one sample
pmaddubsw m11, m1, [r3 - 13 * 32] ; [3]
pmulhrsw m11, m7
pmaddubsw m12, m1, [r3 - 8 * 32] ; [8]
pmulhrsw m12, m7
; rows 8 to 15
pmaddubsw m3, m1, [r3 - 3 * 32] ; [13]
pmulhrsw m3, m7
packuswb m4, m3
pmaddubsw m3, m1, [r3 + 2 * 32] ; [18]
pmulhrsw m3, m7
packuswb m5, m3
pmaddubsw m3, m1, [r3 + 7 * 32] ; [23]
pmulhrsw m3, m7
packuswb m6, m3
pmaddubsw m3, m1, [r3 + 12 * 32] ; [28]
pmulhrsw m3, m7
packuswb m8, m3
palignr m1, m2, m0, 4 ; window advanced by two samples
pmaddubsw m3, m1, [r3 - 15 * 32] ; [1]
pmulhrsw m3, m7
packuswb m9, m3
pmaddubsw m3, m1, [r3 - 10 * 32] ; [6]
pmulhrsw m3, m7
packuswb m10, m3
pmaddubsw m3, m1, [r3 - 5 * 32] ; [11]
pmulhrsw m3, m7
packuswb m11, m3
pmaddubsw m1, [r3] ; [16]
pmulhrsw m1, m7
packuswb m12, m1
TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
ret
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang16_8 -- 16x16 angular intra prediction, mode 8 (AVX2).
; In: r0 = dst, r1 = dst stride, r2 = reference-pixel array.
; Same wrapper pattern as intra_pred_ang16_5: prime r2/r3/r5/r6/m7 for the
; shared core and clear CF (the core presumably branches on CF -- confirm).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_8, 3, 7, 13
add r2, 32 ; step past the first 32 reference bytes
lea r3, [ang_table_avx2 + 16 * 32] ; r3 -> middle of the coefficient table
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024] ; rounding constant for pmulhrsw
clc ; CF = 0 for the shared core
call ang16_mode_8
RET
%endif ; ARCH_X86_64
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang16_9 -- 16x16 angular intra prediction, mode 9 (AVX2).
; In: r0 = dst, r1 = dst stride, r2 = reference-pixel array; r3 = 3*stride.
; m0/m1 = the two halves of the per-pixel weight pairs for mode 9,
; m2 = pw_1024 rounder, m7 = shuffle building the sample pairs per row,
; m3 = 16 reference bytes at r2+33, m6 = the following 16 at r2+49.
; Each palignr slides the reference window one byte further into m6 and
; yields two output rows (low xmm -> row 2k, high xmm -> row 2k+1).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_9, 3,4,8
vbroadcasti128 m0, [angHor_tab_9]
vbroadcasti128 m1, [angHor_tab_9 + mmsize/2]
mova m2, [pw_1024]
lea r3, [r1 * 3]
mova m7, [ang16_shuf_mode9]
vbroadcasti128 m6, [r2 + mmsize + 17]
vbroadcasti128 m3, [r2 + mmsize + 1]
; rows 0-1
pshufb m5, m3, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 2-3 (window +1)
palignr m5, m6, m3, 2
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 4-5 (window +2)
palignr m5, m6, m3, 4
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 6-7 (window +3)
palignr m5, m6, m3, 6
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 8-9 (window +4)
palignr m5, m6, m3, 8
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 10-11 (window +5)
palignr m5, m6, m3, 10
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]
; rows 12-13 (window +6)
palignr m5, m6, m3, 12
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1
; rows 14-15 (window +7)
palignr m5, m6, m3, 14
pshufb m5, m7
pmaddubsw m4, m5, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
RET
%endif
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang16_25 -- 16x16 angular intra prediction, mode 25 (AVX2).
; In: r0 = dst, r1 = stride, r2 = reference pixels; r3 = 3*stride,
; r4 -> per-4-row coefficient blocks in c_ang16_mode_25, m0 = pw_1024.
; m1/m2 hold the shuffled low/high reference halves; the four
; INTRA_PRED_ANG16_MC1 invocations (macro defined elsewhere in this file)
; each emit 4 rows, advancing r4 to the second coefficient group halfway.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_25, 3, 5, 5
mova m0, [pw_1024]
vbroadcasti128 m1, [r2]
pshufb m1, [intra_pred_shuff_0_8]
vbroadcasti128 m2, [r2 + 8]
pshufb m2, [intra_pred_shuff_0_8]
lea r3, [3 * r1]
lea r4, [c_ang16_mode_25]
INTRA_PRED_ANG16_MC1 0
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC1 2
add r4, 4 * mmsize ; advance to the second group of coefficients
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC1 0
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC1 2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang16_28 -- 16x16 angular intra prediction, mode 28 (AVX2).
; In: r0 = dst, r1 = stride, r2 = reference pixels; r3 = 3*stride,
; r4 -> coefficients in c_ang16_mode_28, m0 = pw_1024,
; m5 = intra_pred_shuff_0_8 (shared shuffle for the MC helper macros).
; The INTRA_PRED_ANG16_MC0/1/2 macros (defined elsewhere in this file)
; reload the reference window (MC2), then filter and store rows.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_28, 3, 5, 6
mova m0, [pw_1024]
mova m5, [intra_pred_shuff_0_8]
lea r3, [3 * r1]
lea r4, [c_ang16_mode_28]
INTRA_PRED_ANG16_MC2 1
INTRA_PRED_ANG16_MC1 0
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2
INTRA_PRED_ANG16_MC2 2
INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 3
lea r0, [r0 + 4 * r1]
add r4, 4 * mmsize ; advance to the second coefficient group
INTRA_PRED_ANG16_MC1 0
INTRA_PRED_ANG16_MC2 3
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC1 2
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang16_27 -- 16x16 angular intra prediction, mode 27 (AVX2).
; In: r0 = dst, r1 = stride, r2 = reference pixels; r3 = 3*stride,
; r4 -> coefficients in c_ang16_mode_27, m0 = pw_1024.
; First 14 rows go through the INTRA_PRED_ANG16_MC0/MC1 helper macros;
; the last two rows are computed inline because they need the window
; shifted by one sample ([r2 + 2] with the 0..15 shuffle).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_27, 3, 5, 5
mova m0, [pw_1024]
lea r3, [3 * r1]
lea r4, [c_ang16_mode_27]
vbroadcasti128 m1, [r2 + 1]
pshufb m1, [intra_pred_shuff_0_8]
vbroadcasti128 m2, [r2 + 9]
pshufb m2, [intra_pred_shuff_0_8]
INTRA_PRED_ANG16_MC1 0
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC1 2
lea r0, [r0 + 4 * r1]
add r4, 4 * mmsize ; advance to the second coefficient group
INTRA_PRED_ANG16_MC1 0
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2
; final two rows: combine halves and use the shifted reference window
vperm2i128 m1, m1, m2, 00100000b
pmaddubsw m3, m1, [r4 + 3 * mmsize]
pmulhrsw m3, m0
vbroadcasti128 m2, [r2 + 2]
pshufb m2, [intra_pred_shuff_0_15]
pmaddubsw m2, [r4 + 4 * mmsize]
pmulhrsw m2, m0
packuswb m3, m2
vpermq m3, m3, 11011000b
movu [r0 + 2 * r1], xm3
vextracti128 xm4, m3, 1
movu [r0 + r3], xm4
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang16_29 -- 16x16 angular intra prediction, mode 29 (AVX2).
; In: r0 = dst, r1 = stride, r2 = reference pixels; r3 = 3*stride,
; r4 -> coefficients in c_ang16_mode_29, m0 = pw_1024, m5 = shuffle.
; MC2 reloads the reference window at a new offset; MC0/MC3 filter and
; store rows (macros defined elsewhere in this file). r4 is bumped by
; 4*mmsize twice to walk the three coefficient groups.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_29, 3, 5, 5
mova m0, [pw_1024]
mova m5, [intra_pred_shuff_0_8]
lea r3, [3 * r1]
lea r4, [c_ang16_mode_29]
INTRA_PRED_ANG16_MC2 1
INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0
INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 1
INTRA_PRED_ANG16_MC2 2
INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2
lea r0, [r0 + r1 * 4]
INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3
INTRA_PRED_ANG16_MC2 3
add r4, 4 * mmsize ; second coefficient group
INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0
lea r0, [r0 + r1 * 4]
INTRA_PRED_ANG16_MC3 r0 + r1, 1
INTRA_PRED_ANG16_MC2 4
INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 2
lea r0, [r0 + r1 * 4]
INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3
add r4, 4 * mmsize ; third coefficient group
INTRA_PRED_ANG16_MC2 5
INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 0
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang16_30 -- 16x16 angular intra prediction, mode 30 (AVX2).
; In: r0 = dst, r1 = stride, r2 = reference pixels; r3 = 3*stride,
; r4 -> coefficients in c_ang16_mode_30, m0 = pw_1024, m5 = shuffle.
; Same MC0/MC2/MC3 macro pattern as mode 29; mode 30's steeper angle
; reloads the reference window (MC2) more often.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_30, 3, 5, 6
mova m0, [pw_1024]
mova m5, [intra_pred_shuff_0_8]
lea r3, [3 * r1]
lea r4, [c_ang16_mode_30]
INTRA_PRED_ANG16_MC2 1
INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0
INTRA_PRED_ANG16_MC2 2
INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 1
INTRA_PRED_ANG16_MC2 3
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2
INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 3
INTRA_PRED_ANG16_MC2 4
add r4, 4 * mmsize ; second coefficient group
INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0
INTRA_PRED_ANG16_MC2 5
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1
INTRA_PRED_ANG16_MC3 r0 + r3 , 2
INTRA_PRED_ANG16_MC2 6
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3
INTRA_PRED_ANG16_MC2 7
INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 4
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang16_31 -- 16x16 angular intra prediction, mode 31 (AVX2).
; In: r0 = dst, r1 = stride, r2 = reference pixels; r3 = 3*stride,
; r4 -> coefficients in c_ang16_mode_31, m0 = pw_1024, m5 = shuffle.
; Steep angle: the reference window is reloaded (MC2) before almost
; every pair of rows; MC0/MC3 filter and store.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_31, 3, 5, 6
mova m0, [pw_1024]
mova m5, [intra_pred_shuff_0_8]
lea r3, [3 * r1]
lea r4, [c_ang16_mode_31]
INTRA_PRED_ANG16_MC2 1
INTRA_PRED_ANG16_MC3 r0, 0
INTRA_PRED_ANG16_MC2 2
INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1
INTRA_PRED_ANG16_MC2 3
INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2
INTRA_PRED_ANG16_MC2 4
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3
INTRA_PRED_ANG16_MC2 5
add r4, 4 * mmsize ; second coefficient group
INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0
INTRA_PRED_ANG16_MC2 6
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1
INTRA_PRED_ANG16_MC2 7
INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2
INTRA_PRED_ANG16_MC2 8
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3
INTRA_PRED_ANG16_MC2 9
INTRA_PRED_ANG16_MC3 r0 + r3, 4
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang16_24 -- 16x16 angular intra prediction, mode 24 (AVX2).
; In: r0 = dst, r1 = stride, r2 = reference pixels; r3 = 3*stride,
; r4 -> coefficients in c_ang16_mode_24, m0 = pw_1024, m5 = shuffle.
; Negative-angle mode: the inline pinsrb sequences splice projected
; left-reference samples ([r2 + 38], [r2 + 45]) in front of the top
; reference window before the usual MC filtering.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_24, 3, 5, 6
mova m0, [pw_1024]
mova m5, [intra_pred_shuff_0_8]
lea r3, [3 * r1]
lea r4, [c_ang16_mode_24]
INTRA_PRED_ANG16_MC2 0
INTRA_PRED_ANG16_MC1 0
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2
; extend window left by one projected sample
movu xm1, [r2 - 1]
pinsrb xm1, [r2 + 38], 0
vinserti128 m1, m1, xm1, 1
pshufb m1, m5
vbroadcasti128 m2, [r2 + 7]
pshufb m2, m5
INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 3
lea r0, [r0 + 4 * r1]
add r4, 4 * mmsize ; second coefficient group
INTRA_PRED_ANG16_MC1 0
; extend window left by two projected samples
movu xm1, [r2 - 2]
pinsrb xm1, [r2 + 45], 0
pinsrb xm1, [r2 + 38], 1
vinserti128 m1, m1, xm1, 1
pshufb m1, m5
vbroadcasti128 m2, [r2 + 6]
pshufb m2, m5
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC1 2
RET
; INTRA_PRED_ANG16_MC5 <refIdx>, <srcOfs>
; Slides the running projected-edge register xm6 up one byte and inserts
; the reference byte at [r2 + %1] as the new first sample, then rebuilds
; the shuffled sample pairs: m1 from the updated edge, m2 from the window
; at [r2 + %2]. Clobbers m1, m2, xm6; requires m5 = intra_pred_shuff_0_8.
%macro INTRA_PRED_ANG16_MC5 2
pslldq xm6, xm6, 1
pinsrb xm6, [r2 + %1], 0
vinserti128 m1, m6, xm6, 1
pshufb m1, m5
vbroadcasti128 m2, [r2 + %2]
pshufb m2, m5
%endmacro
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang16_23 -- 16x16 angular intra prediction, mode 23 (AVX2).
; In: r0 = dst, r1 = stride, r2 = reference pixels; r3 = 3*stride,
; r4 -> coefficients in c_ang16_mode_23, m0 = pw_1024, m5 = shuffle.
; Negative-angle mode: projected samples ([r2+36], then via MC5 [r2+39],
; [r2+43], [r2+46]) are spliced in front of the shrinking top window.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_23, 3, 5, 7
mova m0, [pw_1024]
mova m5, [intra_pred_shuff_0_8]
lea r3, [3 * r1]
lea r4, [c_ang16_mode_23]
INTRA_PRED_ANG16_MC2 0
INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0
INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 1
; seed xm6 with the window extended by the first projected sample
movu xm6, [r2 - 1]
pinsrb xm6, [r2 + 36], 0
vinserti128 m1, m6, xm6, 1
pshufb m1, m5
vbroadcasti128 m2, [r2 + 7]
pshufb m2, m5
INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3
add r4, 4 * mmsize ; second coefficient group
INTRA_PRED_ANG16_MC5 39, 6
INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC3 r0 + r1, 1
INTRA_PRED_ANG16_MC5 43, 5
INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 2
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3
add r4, 4 * mmsize ; third coefficient group
INTRA_PRED_ANG16_MC5 46, 4
INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 0
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang16_22 -- 16x16 angular intra prediction, mode 22 (AVX2).
; In: r0 = dst, r1 = stride, r2 = reference pixels; r3 = 3*stride,
; r4 -> coefficients in c_ang16_mode_22, m0 = pw_1024, m5 = shuffle.
; Negative-angle mode: projected samples ([r2+34], then via MC5 [r2+37],
; [r2+39], [r2+42], [r2+44], [r2+47]) extend the window leftward.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang16_22, 3, 5, 7
mova m0, [pw_1024]
mova m5, [intra_pred_shuff_0_8]
lea r3, [3 * r1]
lea r4, [c_ang16_mode_22]
INTRA_PRED_ANG16_MC2 0
INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0
; seed xm6 with the window extended by the first projected sample
movu xm6, [r2 - 1]
pinsrb xm6, [r2 + 34], 0
vinserti128 m1, m6, xm6, 1
pshufb m1, m5
vbroadcasti128 m2, [r2 + 7]
pshufb m2, m5
INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 1
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC5 37, 6
INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2
INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 3
add r4, 4 * mmsize ; second coefficient group
INTRA_PRED_ANG16_MC5 39, 5
INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC5 42, 4
INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1
INTRA_PRED_ANG16_MC3 r0 + r3, 2
lea r0, [r0 + 4 * r1]
INTRA_PRED_ANG16_MC5 44, 3
INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3
INTRA_PRED_ANG16_MC5 47, 2
INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 4
RET
; INTRA_PRED_ANG32_ALIGNR_STORE <n>
; Advances dst (r0) by four rows and stores four 32-byte rows taken from
; the concatenated m1:m0 reference bytes shifted by n, n+1, n+2, n+3 --
; the pure pixel-copy pattern of the 45-degree modes (2 / 34).
; Clobbers m2; requires r3 = 3 * stride.
%macro INTRA_PRED_ANG32_ALIGNR_STORE 1
lea r0, [r0 + 4 * r1]
palignr m2, m1, m0, %1
movu [r0], m2
palignr m2, m1, m0, (%1 + 1)
movu [r0 + r1], m2
palignr m2, m1, m0, (%1 + 2)
movu [r0 + 2 * r1], m2
palignr m2, m1, m0, (%1 + 3)
movu [r0 + r3], m2
%endmacro
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang32_34 -- 32x32 angular intra prediction, mode 34 (AVX2).
; In: r0 = dst, r1 = stride, r2 = reference pixels; r3 = 3*stride.
; Mode 34 is a pure 45-degree copy: row k is reference bytes [k+2 .. k+33],
; produced by sliding a 64-byte window (m1:m0) with palignr. The window is
; reloaded at [r2 + 19] once the 16-byte palignr range is exhausted.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_34, 3, 4,3
lea r3, [3 * r1]
movu m0, [r2 + 2]
movu m1, [r2 + 18]
movu [r0], m0
palignr m2, m1, m0, 1
movu [r0 + r1], m2
palignr m2, m1, m0, 2
movu [r0 + 2 * r1], m2
palignr m2, m1, m0, 3
movu [r0 + r3], m2
INTRA_PRED_ANG32_ALIGNR_STORE 4
INTRA_PRED_ANG32_ALIGNR_STORE 8
INTRA_PRED_ANG32_ALIGNR_STORE 12
lea r0, [r0 + 4 * r1]
palignr m2, m1, m0, 16
movu [r0], m2
; refill the sliding window for rows 17..31
movu m0, [r2 + 19]
movu [r0 + r1], m0
movu m1, [r2 + 35]
palignr m2, m1, m0, 1
movu [r0 + 2 * r1], m2
palignr m2, m1, m0, 2
movu [r0 + r3], m2
INTRA_PRED_ANG32_ALIGNR_STORE 3
INTRA_PRED_ANG32_ALIGNR_STORE 7
INTRA_PRED_ANG32_ALIGNR_STORE 11
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang32_2 -- 32x32 angular intra prediction, mode 2 (AVX2).
; Identical copy pattern to intra_pred_ang32_34 but reads the reference
; array at offset +64 (the left-neighbour half of the buffer).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_2, 3, 4,3
lea r3, [3 * r1]
movu m0, [r2 + 64 + 2]
movu m1, [r2 + 64 + 18]
movu [r0], m0
palignr m2, m1, m0, 1
movu [r0 + r1], m2
palignr m2, m1, m0, 2
movu [r0 + 2 * r1], m2
palignr m2, m1, m0, 3
movu [r0 + r3], m2
INTRA_PRED_ANG32_ALIGNR_STORE 4
INTRA_PRED_ANG32_ALIGNR_STORE 8
INTRA_PRED_ANG32_ALIGNR_STORE 12
lea r0, [r0 + 4 * r1]
palignr m2, m1, m0, 16
movu [r0], m2
; refill the sliding window for rows 17..31
movu m0, [r2 + 64 + 19]
movu [r0 + r1], m0
movu m1, [r2 + 64 + 35]
palignr m2, m1, m0, 1
movu [r0 + 2 * r1], m2
palignr m2, m1, m0, 2
movu [r0 + r3], m2
INTRA_PRED_ANG32_ALIGNR_STORE 3
INTRA_PRED_ANG32_ALIGNR_STORE 7
INTRA_PRED_ANG32_ALIGNR_STORE 11
RET
; INTRA_PRED_ANG32_STORE
; Advances dst (r0) by four rows and replicates the 32-byte row in m0 to
; all four -- used by intra_pred_ang32_26, where every output row is a
; copy of the above-reference row. Requires r3 = 3 * stride.
%macro INTRA_PRED_ANG32_STORE 0
lea r0, [r0 + 4 * r1]
movu [r0], m0
movu [r0 + r1], m0
movu [r0 + r1 * 2], m0
movu [r0 + r3], m0
%endmacro
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_pred_ang32_26 -- 32x32 angular intra prediction, mode 26 (vertical).
; In: r0 = dst, r1 = stride, r2 = reference pixels; r3 = 3*stride.
; Every one of the 32 rows is a copy of the 32 above-reference samples at
; [r2 + 1]: four rows stored inline, then 7 x 4 rows via the macro.
; (No edge filtering here -- presumably handled by the caller if needed.)
;-----------------------------------------------------------------------------
cglobal intra_pred_ang32_26, 3, 4, 1
lea r3, [3 * r1]
movu m0, [r2 + 1]
movu [r0], m0
movu [r0 + r1], m0
movu [r0 + r1 * 2], m0
movu [r0 + r3], m0
INTRA_PRED_ANG32_STORE
INTRA_PRED_ANG32_STORE
INTRA_PRED_ANG32_STORE
INTRA_PRED_ANG32_STORE
INTRA_PRED_ANG32_STORE
INTRA_PRED_ANG32_STORE
INTRA_PRED_ANG32_STORE
RET
; INTRA_PRED_STORE_4x4
; Stores the four 4-byte rows packed in m0 (dwords 0 and 1 of each
; 128-bit lane) straight to dst. Advances r0 by two rows as a side
; effect; clobbers xm0 (low lane overwritten by vextracti128).
%macro INTRA_PRED_STORE_4x4 0
movd [r0], xm0
pextrd [r0 + r1], xm0, 1
vextracti128 xm0, m0, 1
lea r0, [r0 + 2 * r1]
movd [r0], xm0
pextrd [r0 + r1], xm0, 1
%endmacro
; INTRA_PRED_TRANS_STORE_4x4
; Transposes the 4x4 block in m0 (lane merge + c_trans_4x4 byte shuffle)
; before storing -- used by the ang4 modes below whose rows are computed
; as columns (modes 3..17). Advances r0 by two rows; clobbers m0.
%macro INTRA_PRED_TRANS_STORE_4x4 0
vpermq m0, m0, 00001000b
pshufb m0, [c_trans_4x4]
;store
movd [r0], xm0
pextrd [r0 + r1], xm0, 1
lea r0, [r0 + 2 * r1]
pextrd [r0], xm0, 2
pextrd [r0 + r1], xm0, 3
%endmacro
;-----------------------------------------------------------------------------
; 4x4 angular intra prediction, modes 27..33 (AVX2).
; In (each): r0 = dst, r1 = stride, r2 = reference pixels.
; Shared pattern: broadcast 16 reference bytes from r2 + 1, shuffle them
; into the per-row (sample, sample+1) pairs for the mode's angle, apply the
; (32-f, f) weights with pmaddubsw against the mode's constant table, round
; via pmulhrsw/pw_1024, pack to bytes and store the 4 rows directly
; (vertical-class modes need no transpose).
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_ang4_27, 3, 3, 1
vbroadcasti128 m0, [r2 + 1]
pshufb m0, [intra_pred_shuff_0_4]
pmaddubsw m0, [c_ang4_mode_27]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_28, 3, 3, 1
vbroadcasti128 m0, [r2 + 1]
pshufb m0, [intra_pred_shuff_0_4]
pmaddubsw m0, [c_ang4_mode_28]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_29, 3, 3, 1
vbroadcasti128 m0, [r2 + 1]
pshufb m0, [intra_pred4_shuff1]
pmaddubsw m0, [c_ang4_mode_29]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_30, 3, 3, 1
vbroadcasti128 m0, [r2 + 1]
pshufb m0, [intra_pred4_shuff2]
pmaddubsw m0, [c_ang4_mode_30]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_31, 3, 3, 1
vbroadcasti128 m0, [r2 + 1]
pshufb m0, [intra_pred4_shuff31]
pmaddubsw m0, [c_ang4_mode_31]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_32, 3, 3, 1
vbroadcasti128 m0, [r2 + 1]
pshufb m0, [intra_pred4_shuff31]
pmaddubsw m0, [c_ang4_mode_32]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_33, 3, 3, 1
vbroadcasti128 m0, [r2 + 1]
pshufb m0, [intra_pred4_shuff33]
pmaddubsw m0, [c_ang4_mode_33]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_STORE_4x4
RET
;-----------------------------------------------------------------------------
; 4x4 angular intra prediction, modes 3..9 (AVX2).
; Same filter pattern as modes 27..33 above, but these horizontal-class
; modes compute the block column-wise and store through
; INTRA_PRED_TRANS_STORE_4x4, which transposes before writing. Modes that
; mirror a vertical mode reuse its coefficient table (e.g. mode 3 uses
; c_ang4_mode_33, mode 4 uses c_ang4_mode_32) with a mode-specific shuffle.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_ang4_3, 3, 3, 1
vbroadcasti128 m0, [r2 + 1]
pshufb m0, [intra_pred4_shuff3]
pmaddubsw m0, [c_ang4_mode_33]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_TRANS_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_4, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred4_shuff5]
pmaddubsw m0, [c_ang4_mode_32]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_TRANS_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_5, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred4_shuff5]
pmaddubsw m0, [c_ang4_mode_5]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_TRANS_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_6, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred4_shuff6]
pmaddubsw m0, [c_ang4_mode_6]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_TRANS_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_7, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred4_shuff7]
pmaddubsw m0, [c_ang4_mode_7]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_TRANS_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_8, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred4_shuff9]
pmaddubsw m0, [c_ang4_mode_8]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_TRANS_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_9, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred4_shuff9]
pmaddubsw m0, [c_ang4_mode_9]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_TRANS_STORE_4x4
RET
;-----------------------------------------------------------------------------
; 4x4 angular intra prediction, modes 11..17 (AVX2).
; Horizontal-class negative-angle modes: same broadcast/shuffle/
; pmaddubsw/pmulhrsw pattern, with a per-mode shuffle that also pulls in
; the projected reference samples, stored via the transposing store.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_ang4_11, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred4_shuff12]
pmaddubsw m0, [c_ang4_mode_11]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_TRANS_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_12, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred4_shuff12]
pmaddubsw m0, [c_ang4_mode_12]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_TRANS_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_13, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred4_shuff13]
pmaddubsw m0, [c_ang4_mode_13]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_TRANS_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_14, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred4_shuff14]
pmaddubsw m0, [c_ang4_mode_14]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_TRANS_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_15, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred4_shuff15]
pmaddubsw m0, [c_ang4_mode_15]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_TRANS_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_16, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred4_shuff16]
pmaddubsw m0, [c_ang4_mode_16]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_TRANS_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_17, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred4_shuff17]
pmaddubsw m0, [c_ang4_mode_17]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_TRANS_STORE_4x4
RET
;-----------------------------------------------------------------------------
; 4x4 angular intra prediction, modes 19..25 (AVX2).
; Vertical-class negative-angle modes: same filter pattern, stored
; directly (no transpose) via INTRA_PRED_STORE_4x4.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_ang4_19, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred4_shuff19]
pmaddubsw m0, [c_ang4_mode_19]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_20, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred4_shuff20]
pmaddubsw m0, [c_ang4_mode_20]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_21, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred4_shuff21]
pmaddubsw m0, [c_ang4_mode_21]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_22, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred4_shuff22]
pmaddubsw m0, [c_ang4_mode_22]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_23, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred4_shuff23]
pmaddubsw m0, [c_ang4_mode_23]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_24, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred_shuff_0_4]
pmaddubsw m0, [c_ang4_mode_24]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_STORE_4x4
RET
INIT_YMM avx2
cglobal intra_pred_ang4_25, 3, 3, 1
vbroadcasti128 m0, [r2]
pshufb m0, [intra_pred_shuff_0_4]
pmaddubsw m0, [c_ang4_mode_25]
pmulhrsw m0, [pw_1024]
packuswb m0, m0
INTRA_PRED_STORE_4x4
RET
;-----------------------------------------------------------------------------------
; void intra_filter_NxN(const pixel* references, pixel* filtered)
;-----------------------------------------------------------------------------------
INIT_XMM sse4
;-----------------------------------------------------------------------------
; intra_filter_4x4 -- (1,2,1)/4 smoothing of the 4x4 reference samples.
; In: r0 = unfiltered reference array, r1 = filtered output.
; out[i] = (ref[i-1] + 2*ref[i] + ref[i+1] + 2) >> 2, computed in 16-bit.
; The last top sample ([r0+8]) and last left sample ([r0+16]) are saved
; first and copied through unfiltered at the end.
;-----------------------------------------------------------------------------
cglobal intra_filter_4x4, 2,4,5
mov r2b, byte [r0 + 8] ; topLast
mov r3b, byte [r0 + 16] ; LeftLast
; filtering top
pmovzxbw m0, [r0 + 0]
pmovzxbw m1, [r0 + 8]
pmovzxbw m2, [r0 + 16]
pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1]
palignr m3, m1, m0, 4
pshufb m3, [intra_filter4_shuf1] ; [8 7 6 5 4 3 2 9] samples[i + 1]
psllw m0, 1 ; centre weight 2
paddw m4, m3
paddw m0, m4
paddw m0, [pw_2] ; rounding
psrlw m0, 2
; filtering left
palignr m4, m1, m1, 14 ; [14 13 12 11 10 9 8 15] samples[i - 1]
pinsrb m4, [r0], 2 ; [14 13 12 11 10 9 0 15] topFirst is leftward neighbour of left[0]
palignr m3, m2, m1, 4
pshufb m3, [intra_filter4_shuf1]
psllw m1, 1
paddw m4, m3
paddw m1, m4
paddw m1, [pw_2]
psrlw m1, 2
packuswb m0, m1
movu [r1], m0
mov [r1 + 8], r2b ; topLast restored unfiltered
mov [r1 + 16], r3b ; LeftLast restored unfiltered
RET
INIT_XMM sse4
;-----------------------------------------------------------------------------
; intra_filter_8x8 -- (1,2,1)/4 smoothing of the 8x8 reference samples.
; In: r0 = unfiltered reference array, r1 = filtered output.
; Same scheme as intra_filter_4x4, processed in 8-word slices; the last
; top sample ([r0+16]) and last left sample ([r0+32]) pass through
; unfiltered (restored after the packed stores, overwriting those bytes).
;-----------------------------------------------------------------------------
cglobal intra_filter_8x8, 2,4,6
mov r2b, byte [r0 + 16] ; topLast
mov r3b, byte [r0 + 32] ; LeftLast
; filtering top
pmovzxbw m0, [r0 + 0]
pmovzxbw m1, [r0 + 8]
pmovzxbw m2, [r0 + 16]
pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1]
palignr m5, m1, m0, 2
pinsrb m5, [r0 + 17], 0 ; [8 7 6 5 4 3 2 9] samples[i + 1]
palignr m3, m1, m0, 14 ; samples[i - 1] for the second slice
psllw m0, 1
paddw m4, m5
paddw m0, m4
paddw m0, [pw_2]
psrlw m0, 2
palignr m4, m2, m1, 2 ; samples[i + 1] for the second slice
psllw m1, 1
paddw m4, m3
paddw m1, m4
paddw m1, [pw_2]
psrlw m1, 2
packuswb m0, m1
movu [r1], m0
; filtering left
pmovzxbw m1, [r0 + 24]
pmovzxbw m0, [r0 + 32]
palignr m4, m2, m2, 14 ; samples[i - 1]
pinsrb m4, [r0], 2 ; topFirst is leftward neighbour of left[0]
palignr m5, m1, m2, 2 ; samples[i + 1]
palignr m3, m1, m2, 14
palignr m0, m1, 2
psllw m2, 1
paddw m4, m5
paddw m2, m4
paddw m2, [pw_2]
psrlw m2, 2
psllw m1, 1
paddw m0, m3
paddw m1, m0
paddw m1, [pw_2]
psrlw m1, 2
packuswb m2, m1
movu [r1 + 16], m2
mov [r1 + 16], r2b ; topLast restored unfiltered
mov [r1 + 32], r3b ; LeftLast restored unfiltered
RET
INIT_XMM sse4
;-----------------------------------------------------------------------------
; intra_filter_16x16 -- (1,2,1)/4 smoothing of the 16x16 reference samples.
; In: r0 = unfiltered reference array, r1 = filtered output.
; Same scheme as intra_filter_8x8, over four 16-byte output chunks:
; top = bytes 0..31, left = bytes 32..64; last top ([r0+32]) and last
; left ([r0+64]) samples pass through unfiltered.
;-----------------------------------------------------------------------------
cglobal intra_filter_16x16, 2,4,6
mov r2b, byte [r0 + 32] ; topLast
mov r3b, byte [r0 + 64] ; LeftLast
; filtering top
pmovzxbw m0, [r0 + 0]
pmovzxbw m1, [r0 + 8]
pmovzxbw m2, [r0 + 16]
pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1]
palignr m5, m1, m0, 2
pinsrb m5, [r0 + 33], 0 ; [8 7 6 5 4 3 2 9] samples[i + 1]
palignr m3, m1, m0, 14
psllw m0, 1
paddw m4, m5
paddw m0, m4
paddw m0, [pw_2]
psrlw m0, 2
palignr m4, m2, m1, 2
psllw m5, m1, 1
paddw m4, m3
paddw m5, m4
paddw m5, [pw_2]
psrlw m5, 2
packuswb m0, m5
movu [r1], m0
pmovzxbw m0, [r0 + 24]
pmovzxbw m5, [r0 + 32]
palignr m3, m2, m1, 14
palignr m4, m0, m2, 2
psllw m1, m2, 1
paddw m3, m4
paddw m1, m3
paddw m1, [pw_2]
psrlw m1, 2
palignr m3, m0, m2, 14
palignr m4, m5, m0, 2
psllw m0, 1
paddw m4, m3
paddw m0, m4
paddw m0, [pw_2]
psrlw m0, 2
packuswb m1, m0
movu [r1 + 16], m1
; filtering left
pmovzxbw m1, [r0 + 40]
pmovzxbw m2, [r0 + 48]
palignr m4, m5, m5, 14
pinsrb m4, [r0], 2 ; topFirst is leftward neighbour of left[0]
palignr m0, m1, m5, 2
psllw m3, m5, 1
paddw m4, m0
paddw m3, m4
paddw m3, [pw_2]
psrlw m3, 2
palignr m0, m1, m5, 14
palignr m4, m2, m1, 2
psllw m5, m1, 1
paddw m4, m0
paddw m5, m4
paddw m5, [pw_2]
psrlw m5, 2
packuswb m3, m5
movu [r1 + 32], m3
pmovzxbw m5, [r0 + 56]
pmovzxbw m0, [r0 + 64]
palignr m3, m2, m1, 14
palignr m4, m5, m2, 2
psllw m1, m2, 1
paddw m3, m4
paddw m1, m3
paddw m1, [pw_2]
psrlw m1, 2
palignr m3, m5, m2, 14
palignr m4, m0, m5, 2
psllw m5, 1
paddw m4, m3
paddw m5, m4
paddw m5, [pw_2]
psrlw m5, 2
packuswb m1, m5
movu [r1 + 48], m1
mov [r1 + 32], r2b ; topLast restored unfiltered
mov [r1 + 64], r3b ; LeftLast restored unfiltered
RET
INIT_XMM sse4
;-----------------------------------------------------------------------------
; intra_filter_32x32 -- (1,2,1)/4 smoothing of the 32x32 reference samples.
; In: r0 = unfiltered reference array, r1 = filtered output.
; Same scheme as intra_filter_16x16, over eight 16-byte output chunks:
; top = bytes 0..63, left = bytes 64..128; last top ([r0+64]) and last
; left ([r0+128]) samples pass through unfiltered. Registers are rotated
; chunk to chunk so each 8-word slice keeps its two neighbours live.
;-----------------------------------------------------------------------------
cglobal intra_filter_32x32, 2,4,6
mov r2b, byte [r0 + 64] ; topLast
mov r3b, byte [r0 + 128] ; LeftLast
; filtering top
; 0 to 15
pmovzxbw m0, [r0 + 0]
pmovzxbw m1, [r0 + 8]
pmovzxbw m2, [r0 + 16]
pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1]
palignr m5, m1, m0, 2
pinsrb m5, [r0 + 65], 0 ; [8 7 6 5 4 3 2 9] samples[i + 1]
palignr m3, m1, m0, 14
psllw m0, 1
paddw m4, m5
paddw m0, m4
paddw m0, [pw_2]
psrlw m0, 2
palignr m4, m2, m1, 2
psllw m5, m1, 1
paddw m4, m3
paddw m5, m4
paddw m5, [pw_2]
psrlw m5, 2
packuswb m0, m5
movu [r1], m0
; 16 to 31
pmovzxbw m0, [r0 + 24]
pmovzxbw m5, [r0 + 32]
palignr m3, m2, m1, 14
palignr m4, m0, m2, 2
psllw m1, m2, 1
paddw m3, m4
paddw m1, m3
paddw m1, [pw_2]
psrlw m1, 2
palignr m3, m0, m2, 14
palignr m4, m5, m0, 2
psllw m2, m0, 1
paddw m4, m3
paddw m2, m4
paddw m2, [pw_2]
psrlw m2, 2
packuswb m1, m2
movu [r1 + 16], m1
; 32 to 47
pmovzxbw m1, [r0 + 40]
pmovzxbw m2, [r0 + 48]
palignr m3, m5, m0, 14
palignr m4, m1, m5, 2
psllw m0, m5, 1
paddw m3, m4
paddw m0, m3
paddw m0, [pw_2]
psrlw m0, 2
palignr m3, m1, m5, 14
palignr m4, m2, m1, 2
psllw m5, m1, 1
paddw m4, m3
paddw m5, m4
paddw m5, [pw_2]
psrlw m5, 2
packuswb m0, m5
movu [r1 + 32], m0
; 48 to 63
pmovzxbw m0, [r0 + 56]
pmovzxbw m5, [r0 + 64]
palignr m3, m2, m1, 14
palignr m4, m0, m2, 2
psllw m1, m2, 1
paddw m3, m4
paddw m1, m3
paddw m1, [pw_2]
psrlw m1, 2
palignr m3, m0, m2, 14
palignr m4, m5, m0, 2
psllw m0, 1
paddw m4, m3
paddw m0, m4
paddw m0, [pw_2]
psrlw m0, 2
packuswb m1, m0
movu [r1 + 48], m1
; filtering left
; 64 to 79
pmovzxbw m1, [r0 + 72]
pmovzxbw m2, [r0 + 80]
palignr m4, m5, m5, 14
pinsrb m4, [r0], 2 ; topFirst is leftward neighbour of left[0]
palignr m0, m1, m5, 2
psllw m3, m5, 1
paddw m4, m0
paddw m3, m4
paddw m3, [pw_2]
psrlw m3, 2
palignr m0, m1, m5, 14
palignr m4, m2, m1, 2
psllw m5, m1, 1
paddw m4, m0
paddw m5, m4
paddw m5, [pw_2]
psrlw m5, 2
packuswb m3, m5
movu [r1 + 64], m3
; 80 to 95
pmovzxbw m5, [r0 + 88]
pmovzxbw m0, [r0 + 96]
palignr m3, m2, m1, 14
palignr m4, m5, m2, 2
psllw m1, m2, 1
paddw m3, m4
paddw m1, m3
paddw m1, [pw_2]
psrlw m1, 2
palignr m3, m5, m2, 14
palignr m4, m0, m5, 2
psllw m2, m5, 1
paddw m4, m3
paddw m2, m4
paddw m2, [pw_2]
psrlw m2, 2
packuswb m1, m2
movu [r1 + 80], m1
; 96 to 111
pmovzxbw m1, [r0 + 104]
pmovzxbw m2, [r0 + 112]
palignr m3, m0, m5, 14
palignr m4, m1, m0, 2
psllw m5, m0, 1
paddw m3, m4
paddw m5, m3
paddw m5, [pw_2]
psrlw m5, 2
palignr m3, m1, m0, 14
palignr m4, m2, m1, 2
psllw m0, m1, 1
paddw m4, m3
paddw m0, m4
paddw m0, [pw_2]
psrlw m0, 2
packuswb m5, m0
movu [r1 + 96], m5
; 112 to 127
pmovzxbw m5, [r0 + 120]
pmovzxbw m0, [r0 + 128]
palignr m3, m2, m1, 14
palignr m4, m5, m2, 2
psllw m1, m2, 1
paddw m3, m4
paddw m1, m3
paddw m1, [pw_2]
psrlw m1, 2
palignr m3, m5, m2, 14
palignr m4, m0, m5, 2
psllw m5, 1
paddw m4, m3
paddw m5, m4
paddw m5, [pw_2]
psrlw m5, 2
packuswb m1, m5
movu [r1 + 112], m1
mov [r1 + 64], r2b ; topLast restored unfiltered
mov [r1 + 128], r3b ; LeftLast restored unfiltered
RET
INIT_YMM avx2
;-----------------------------------------------------------------------------
; intra_filter_4x4 (AVX2) -- (1,2,1)/4 smoothing of the 4x4 references.
; In: r0 = unfiltered reference array, r1 = filtered output.
; Filters top and left in one pass: pmovzxbw widens 16 bytes so the low
; 128-bit lane holds the top samples and the high lane the left samples
; (see the dual [..][..] window comments). Last top/left samples are
; copied through unfiltered via r2b/r3b.
;-----------------------------------------------------------------------------
cglobal intra_filter_4x4, 2,4,4
mov r2b, byte [r0 + 8] ; topLast
mov r3b, byte [r0 + 16] ; LeftLast
; filtering top
pmovzxbw m0, [r0]
vpbroadcastw m2, xm0 ; corner sample replicated for the i-1 edge
pmovzxbw m1, [r0 + 8]
palignr m3, m0, m2, 14 ; [6 5 4 3 2 1 0 0] [14 13 12 11 10 9 8 0]
pshufb m3, [intra_filter4_shuf2] ; [6 5 4 3 2 1 0 1] [14 13 12 11 10 9 0 9] samples[i - 1]
palignr m1, m0, 4 ; [9 8 7 6 5 4 3 2]
palignr m1, m1, 14 ; [9 8 7 6 5 4 3 2] samples[i + 1] (NOTE(review): self-palignr rotate -- intentional upstream idiom)
psllw m0, 1
paddw m3, m1
paddw m0, m3
paddw m0, [pw_2]
psrlw m0, 2
packuswb m0, m0
vpermq m0, m0, 10001000b ; gather both lanes' packed bytes into xm0
movu [r1], xm0
mov [r1 + 8], r2b ; topLast restored unfiltered
mov [r1 + 16], r3b ; LeftLast restored unfiltered
RET