;*****************************************************************************
;* Copyright (C) 2013 x265 project
;*
;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
;*          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************/

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

const intra_pred_shuff_0_8, times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
                            db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9

intra_pred_shuff_15_0: times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

intra_filter4_shuf0: times 2 db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
intra_filter4_shuf1: times 2 db 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
intra_filter4_shuf2: times 2 db 4, 5, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

pb_0_8          times 8 db 0, 8
pb_unpackbw1    times 2 db 1, 8, 2, 8, 3, 8, 4, 8
pb_swap8:       times 2 db 7, 6, 5, 4, 3, 2, 1, 0
c_trans_4x4     db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
const tab_S1,   db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0
const tab_S2,   db 0, 1, 3, 5, 7, 9, 11, 13, 0, 0, 0, 0, 0, 0, 0, 0
const tab_Si,   db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
pb_fact0:       db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
c_mode32_12_0:  db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 7, 0
c_mode32_13_0:  db 3, 6, 10, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
c_mode32_13_shuf: db 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0
c_mode32_14_shuf: db 15, 14, 13, 0, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15
c_mode32_14_0:  db 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
c_mode32_15_0:  db 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0
c_mode32_16_0:  db 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0
c_mode32_17_0:  db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0
c_mode32_18_0:  db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
c_shuf8_0:      db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
c_deinterval8:  db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_unpackbq:    db 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
c_mode16_12:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
c_mode16_13:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
c_mode16_14:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
c_mode16_15:    db 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2
c_mode16_16:    db 8, 6, 5, 3, 2, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2
c_mode16_17:    db 4, 2, 1, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1
c_mode16_18:    db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1

ALIGN 32
c_ang8_src1_9_2_10:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
c_ang8_26_20:        db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
c_ang8_src3_11_4_12: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
c_ang8_14_8:         db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
c_ang8_src5_13_5_13: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
c_ang8_2_28:         db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
c_ang8_src6_14_7_15: db 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
c_ang8_22_16:        db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

c_ang8_21_10:        db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
c_ang8_src2_10_3_11: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
c_ang8_31_20:        db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
c_ang8_src4_12_4_12: times 2 db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
c_ang8_9_30:         db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
c_ang8_src5_13_6_14: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13
c_ang8_19_8:         db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8

c_ang8_17_2:         db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
c_ang8_19_4:         db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
c_ang8_21_6:         db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
c_ang8_23_8:         db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
c_ang8_src4_12_5_13: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12

c_ang8_13_26:        db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
c_ang8_7_20:         db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
c_ang8_1_14:         db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
c_ang8_27_8:         db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
c_ang8_src2_10_2_10: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
c_ang8_src3_11_3_11: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10

c_ang8_31_8:         db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
c_ang8_13_22:        db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
c_ang8_27_4:         db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
c_ang8_9_18:         db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18

c_ang8_5_10:         db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
c_ang8_15_20:        db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
c_ang8_25_30:        db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
c_ang8_3_8:          db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8

c_ang8_mode_27:      db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
                     db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
                     db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
                     db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

c_ang8_mode_25:      db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
                     db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
                     db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
                     db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

c_ang8_mode_24:      db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
                     db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
                     db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
                     db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24

ALIGN 32
c_ang16_mode_25:     db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
                     db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
                     db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
                     db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
                     db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
                     db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
                     db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
                     db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0

ALIGN 32
c_ang16_mode_11:     db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
                     db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
                     db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
                     db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
                     db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
                     db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
                     db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
                     db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0


ALIGN 32
c_ang16_mode_12:     db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
                     db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
                     db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
                     db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
                     db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
                     db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
                     db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
                     db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16


ALIGN 32
c_ang16_mode_13:     db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
                     db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
                     db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
                     db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
                     db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
                     db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
                     db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
                     db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

ALIGN 32
c_ang16_mode_28:     db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
                     db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
                     db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
                     db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
                     db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
                     db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
                     db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
                     db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

ALIGN 32
c_ang16_mode_9:      db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
                     db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
                     db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
                     db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
                     db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
                     db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
                     db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
                     db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0

ALIGN 32
c_ang16_mode_27:     db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
                     db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
                     db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
                     db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
                     db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
                     db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
                     db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
                     db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
                     db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0

ALIGN 32
intra_pred_shuff_0_15: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 15

ALIGN 32
c_ang16_mode_29:     db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
                     db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
                     db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
                     db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
                     db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
                     db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
                     db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
                     db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
                     db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

ALIGN 32
c_ang16_mode_30:     db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
                     db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
                     db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
                     db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
                     db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
                     db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
                     db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
                     db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
                     db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

ALIGN 32
c_ang16_mode_31:     db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
                     db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
                     db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
                     db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
                     db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
                     db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
                     db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
                     db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
                     db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

ALIGN 32
c_ang16_mode_24:     db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
                     db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
                     db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
                     db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
                     db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
                     db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
                     db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
                     db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

ALIGN 32
c_ang16_mode_23:     db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
                     db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
                     db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
                     db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
                     db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
                     db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
                     db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
                     db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
                     db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

ALIGN 32
c_ang16_mode_22:     db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
                     db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
                     db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
                     db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
                     db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
                     db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
                     db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
                     db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
                     db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16

ALIGN 32
intra_pred_shuff_0_4: times 4 db 0, 1, 1, 2, 2, 3, 3, 4
intra_pred4_shuff1:  db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5
intra_pred4_shuff2:  db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
intra_pred4_shuff31: db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
intra_pred4_shuff33: db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
intra_pred4_shuff3:  db 8, 9, 9, 10, 10, 11, 11, 12, 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14, 11, 12, 12, 13, 13, 14, 14, 15
intra_pred4_shuff4:  db 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14, 10, 11, 11, 12, 12, 13, 13, 14, 11, 12, 12, 13, 13, 14, 14, 15
intra_pred4_shuff5:  db 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14, 10, 11, 11, 12, 12, 13, 13, 14, 11, 12, 12, 13, 13, 14, 14, 15
intra_pred4_shuff6:  db 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14, 10, 11, 11, 12, 12, 13, 13, 14
intra_pred4_shuff7:  db 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 10, 11, 11, 12, 12, 13, 13, 14
intra_pred4_shuff9:  db 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13, 9, 10, 10, 11, 11, 12, 12, 13
intra_pred4_shuff12: db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12
intra_pred4_shuff13: db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 4, 0, 0, 9, 9, 10, 10, 11
intra_pred4_shuff14: db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11
intra_pred4_shuff15: db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 4, 2, 2, 0, 0, 9, 9, 10
intra_pred4_shuff16: db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 3, 2, 2, 0, 0, 9, 9, 10
intra_pred4_shuff17: db 0, 9, 9, 10, 10, 11, 11, 12, 1, 0, 0, 9, 9, 10, 10, 11, 2, 1, 1, 0, 0, 9, 9, 10, 4, 2, 2, 1, 1, 0, 0, 9
intra_pred4_shuff19: db 0, 1, 1, 2, 2, 3, 3, 4, 9, 0, 0, 1, 1, 2, 2, 3, 10, 9, 9, 0, 0, 1, 1, 2, 12, 10, 10, 9, 9, 0, 0, 1
intra_pred4_shuff20: db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 11, 10, 10, 0, 0, 1, 1, 2
intra_pred4_shuff21: db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 12, 10, 10, 0, 0, 1, 1, 2
intra_pred4_shuff22: db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3
intra_pred4_shuff23: db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 12, 0, 0, 1, 1, 2, 2, 3

c_ang4_mode_27:      db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
c_ang4_mode_28:      db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
c_ang4_mode_29:      db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
c_ang4_mode_30:      db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
c_ang4_mode_31:      db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
c_ang4_mode_32:      db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
c_ang4_mode_33:      db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
c_ang4_mode_5:       db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
c_ang4_mode_6:       db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
c_ang4_mode_7:       db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
c_ang4_mode_8:       db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
c_ang4_mode_9:       db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
c_ang4_mode_11:      db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
c_ang4_mode_12:      db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
c_ang4_mode_13:      db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
c_ang4_mode_14:      db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
c_ang4_mode_15:      db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
c_ang4_mode_16:      db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
c_ang4_mode_17:      db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
c_ang4_mode_19:      db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
c_ang4_mode_20:      db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
c_ang4_mode_21:      db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
c_ang4_mode_22:      db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
c_ang4_mode_23:      db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
c_ang4_mode_24:      db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
c_ang4_mode_25:      db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24

ALIGN 32
;; (blkSize - 1 - x)
pw_planar4_0:        dw 3, 2, 1, 0, 3, 2, 1, 0
ALIGN 32
c_ang8_mode_13:      db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
                     db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
                     db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
                     db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24

ALIGN 32
c_ang8_mode_14:      db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
                     db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
                     db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
                     db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24

ALIGN 32
c_ang8_mode_15:      db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
                     db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
                     db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
                     db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24

const c_ang8_mode_16, db 8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 10, 12, 13, 15, 0, 0

const intra_pred8_shuff16, db 0, 1, 1, 2, 3, 3, 4, 5
                           db 1, 2, 2, 3, 4, 4, 5, 6
                           db 2, 3, 3, 4, 5, 5, 6, 7
                           db 3, 4, 4, 5, 6, 6, 7, 8
                           db 4, 5, 5, 6, 7, 7, 8, 9

const angHor8_tab_16, db (32-11), 11, (32-22), 22, (32-1 ), 1, (32-12), 12, (32-23), 23, (32- 2), 2, (32-13), 13, (32-24), 24

const c_ang8_mode_20, db 15, 13, 12, 10, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0

; NOTE: this big table improves speed by ~10%; if a broadcast instruction that
; works on the high 128 bits becomes available in future, we can remove it
const angHor8_tab_20, times 8 db (32-24), 24
                      times 8 db (32-13), 13
                      times 8 db (32- 2), 2
                      times 8 db (32-23), 23
                      times 8 db (32-12), 12
                      times 8 db (32- 1), 1
                      times 8 db (32-22), 22
                      times 8 db (32-11), 11

const ang16_shuf_mode9, times 8 db 0, 1
                        times 8 db 1, 2

const angHor_tab_9, db (32-2), 2, (32-4), 4, (32-6), 6, (32-8), 8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16
                    db (32-18), 18, (32-20), 20, (32-22), 22, (32-24), 24, (32-26), 26, (32-28), 28, (32-30), 30, (32-32), 32

const angHor_tab_11, db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
                     db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8), 8, (32- 6), 6, (32- 4), 4, (32- 2), 2, (32- 0), 0

const ang16_shuf_mode12, db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3
                         db 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2

const angHor_tab_12, db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32-7), 7, (32-2), 2, (32-29), 29, (32-24), 24
                     db (32-19), 19, (32-14), 14, (32-9), 9, (32-4), 4, (32-31), 31, (32-26), 26, (32-21), 21, (32-16), 16

const ang16_shuf_mode13, db 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 5, 6, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4
                         db 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2
                         db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0

const angHor_tab_13, db (32-23), 23, (32-14), 14, (32-5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32-1), 1, (32-24), 24
                     db (32-15), 15, (32-6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32-2), 2, (32-25), 25, (32-16), 16

const ang16_shuf_mode14, db 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 3, 4, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 5, 6, 4, 5
                         db 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1, 4, 5, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2
                         db 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0

const angHor_tab_14, db (32-19), 19, (32-6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32-5), 5, (32-24), 24
                     db (32-11), 11, (32-30), 30, (32-17), 17, (32-4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16

const ang16_shuf_mode15, db 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 9, 10, 8, 9, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6
                         db 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2
                         db 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0

const angHor_tab_15, db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32-9), 9, (32-24), 24
                     db (32-7), 7, (32-22), 22, (32-5), 5, (32-20), 20, (32-3), 3, (32-18), 18, (32-1), 1, (32-16), 16

const ang16_shuf_mode16, db 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7
                         db 5, 6, 4, 5, 3, 4, 3, 4, 2, 3, 1, 2, 1, 2, 0, 1, 6, 7, 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 2, 3, 1, 2
                         db 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0

const angHor_tab_16, db (32-11), 11, (32-22), 22, (32-1), 1, (32-12), 12, (32-23), 23, (32-2), 2, (32-13), 13, (32-24), 24
                     db (32-3), 3, (32-14), 14, (32-25), 25, (32-4), 4, (32-15), 15, (32-26), 26, (32-5), 5, (32-16), 16

const ang16_shuf_mode17, db 12, 13, 11, 12, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 13, 14, 12, 13, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8
                         db 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 1, 2, 0, 1, 0, 1, 6, 7, 5, 6, 5, 6, 4, 5, 3, 4, 2, 3, 1, 2, 1, 2
                         db 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0

const angHor_tab_17, db (32- 6), 6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4), 4, (32-10), 10, (32-16), 16
                     db (32-22), 22, (32-28), 28, (32- 2), 2, (32- 8), 8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0), 0

; Intrapred_angle32x32, modes 1 to 33 constants
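; How these pairs of tables fit together, sketched as an assumption drawn
; from their layout rather than a description of any one kernel below: the
; ang32_shuf_* tables are pshufb indices that gather (ref[i], ref[i+1])
; byte pairs, and the ang32_fact_* tables hold the matching (32-frac, frac)
; weights, so one pmaddubsw yields (32-frac)*ref[i] + frac*ref[i+1] per
; pixel, e.g. (register/constant names here are illustrative):
;   pshufb    m1, m_ref, [ang32_shuf_mode16]  ; gather reference pairs
;   pmaddubsw m1, [ang32_fact_mode16]         ; weighted 16-bit sums
;   pmulhrsw  m1, [pw_1024]                   ; (sum + 16) >> 5 with rounding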
const ang32_shuf_mode9, times 8 db 0, 1
                        times 8 db 1, 2

const ang32_shuf_mode11, times 8 db 1, 2
                         times 8 db 0, 1

const ang32_fact_mode12, db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32- 7), 7, (32- 2), 2, (32-29), 29, (32-24), 24
                         db (32-11), 11, (32- 6), 6, (32- 1), 1, (32-28), 28, (32-23), 23, (32-18), 18, (32-13), 13, (32- 8), 8
                         db (32-19), 19, (32-14), 14, (32- 9), 9, (32- 4), 4, (32-31), 31, (32-26), 26, (32-21), 21, (32-16), 16
                         db (32- 3), 3, (32-30), 30, (32-25), 25, (32-20), 20, (32-15), 15, (32-10), 10, (32- 5), 5, (32- 0), 0
const ang32_shuf_mode12, db 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
                         db 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
const ang32_shuf_mode24, db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 13, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 3, 3
                         dd 0, 0, 7, 3, 0, 0, 7, 3

const ang32_fact_mode13, db (32-23), 23, (32-14), 14, (32- 5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32- 1), 1, (32-24), 24
                         db (32- 7), 7, (32-30), 30, (32-21), 21, (32-12), 12, (32- 3), 3, (32-26), 26, (32-17), 17, (32- 8), 8
                         db (32-15), 15, (32- 6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32- 2), 2, (32-25), 25, (32-16), 16
                         db (32-31), 31, (32-22), 22, (32-13), 13, (32- 4), 4, (32-27), 27, (32-18), 18, (32- 9), 9, (32- 0), 0
const ang32_shuf_mode13, db 14, 15, 14, 15, 14, 15, 13, 14, 13, 14, 13, 14, 13, 14, 12, 13, 10, 11, 9, 10, 9, 10, 9, 10, 9, 10, 8, 9, 8, 9, 8, 9
                         db 12, 13, 12, 13, 11, 12, 11, 12, 11, 12, 11, 12, 10, 11, 10, 11, 7, 8, 7, 8, 7, 8, 7, 8, 6, 7, 6, 7, 6, 7, 6, 7
                         db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 9, 5, 2
const ang32_shuf_mode23, db 0, 0, 0, 0, 0, 0, 0, 0, 14, 14, 11, 11, 7, 7, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 9, 9, 5, 5, 2, 2

const ang32_fact_mode14, db (32-19), 19, (32- 6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32- 5), 5, (32-24), 24
                         db (32- 3), 3, (32-22), 22, (32- 9), 9, (32-28), 28, (32-15), 15, (32- 2), 2, (32-21), 21, (32- 8), 8
                         db (32-11), 11, (32-30), 30, (32-17), 17, (32- 4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16
                         db (32-27), 27, (32-14), 14, (32- 1), 1, (32-20), 20, (32- 7), 7, (32-26), 26, (32-13), 13, (32- 0), 0
const ang32_shuf_mode14, db 14, 15, 14, 15, 13, 14, 13, 14, 12, 13, 12, 13, 12, 13, 11, 12, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 6, 7, 5, 6, 5, 6
                         db 11, 12, 10, 11, 10, 11, 10, 11, 9, 10, 9, 10, 8, 9, 8, 9, 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3
                         db 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 9, 6, 4, 1
const ang32_shuf_mode22, db 0, 0, 15, 15, 13, 13, 10, 10, 8, 8, 5, 5, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 9, 9, 7, 7, 4, 4, 2

const ang32_fact_mode15, db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32- 9), 9, (32-24), 24
                         db (32-31), 31, (32-14), 14, (32-29), 29, (32-12), 12, (32-27), 27, (32-10), 10, (32-25), 25, (32- 8), 8
                         db (32- 7), 7, (32-22), 22, (32- 5), 5, (32-20), 20, (32- 3), 3, (32-18), 18, (32- 1), 1, (32-16), 16
                         db (32-23), 23, (32- 6), 6, (32-21), 21, (32- 4), 4, (32-19), 19, (32- 2), 2, (32-17), 17, (32- 0), 0
const ang32_shuf_mode15, db 14, 15, 13, 14, 13, 14, 12, 13, 12, 13, 11, 12, 11, 12, 10, 11, 5, 6, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3
                         db 12, 13, 11, 12, 11, 12, 10, 11, 10, 11, 9, 10, 9, 10, 8, 9, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1
                         db 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 14, 12, 10, 8, 7, 5, 3, 1
const ang32_shuf_mode21, db 15, 15, 13, 13, 11, 11, 9, 9, 8, 8, 6, 6, 4, 4, 2, 2, 14, 14, 12, 12, 10, 10, 8, 8, 7, 7, 5, 5, 3, 3, 1, 1

const ang32_fact_mode16, db (32-11), 11, (32-22), 22, (32- 1), 1, (32-12), 12, (32-23), 23, (32- 2), 2, (32-13), 13, (32-24), 24
                         db (32- 3), 3, (32-14), 14, (32-25), 25, (32- 4), 4, (32-15), 15, (32-26), 26, (32- 5), 5, (32-16), 16
                         db (32-27), 27, (32- 6), 6, (32-17), 17, (32-28), 28, (32- 7), 7, (32-18), 18, (32-29), 29, (32- 8), 8
                         db (32-19), 19, (32-30), 30, (32- 9), 9, (32-20), 20, (32-31), 31, (32-10), 10, (32-21), 21, (32- 0), 0
const ang32_shuf_mode16, db 14, 15, 13, 14, 13, 14, 12, 13, 11, 12, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 5, 6, 4, 5
                         db 14, 15, 14, 15, 13, 14, 12, 13, 12, 13, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6
                         db 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 14, 13, 11, 10, 8, 7, 5, 4, 2, 1
                         dd 7, 1, 2, 3, 7, 1, 2, 3
const ang32_shuf_mode20, db 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 14, 15, 8, 7, 5, 4, 2, 1, 0, 0, 14, 13, 13, 11, 11, 10, 10, 8
                         db 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 1, 1, 0, 0

const ang32_fact_mode17, db (32- 6), 6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4), 4, (32-10), 10, (32-16), 16
                         db (32-22), 22, (32-28), 28, (32- 2), 2, (32- 8), 8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0), 0
const ang32_shuf_mode17, db 14, 15, 13, 14, 12, 13, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 7, 8, 6, 7, 6, 7, 5, 6, 4, 5, 3, 4, 2, 3, 2, 3
                         db 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0
const ang32_shuf_mode19, db 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15
                         dd 0, 0, 2, 3, 0, 0, 7, 1
                         dd 0, 0, 5, 6, 0, 0, 0, 0

const ang_table
%assign x 0
%rep 32
    times 8 db (32-x), x
%assign x x+1
%endrep

const ang_table_avx2
%assign x 0
%rep 32
    times 16 db (32-x), x
%assign x x+1
%endrep

const pw_ang_table
%assign x 0
%rep 32
    times 4 dw (32-x), x
%assign x x+1
%endrep
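
; Row r of each table above holds the weight pair (32-r, r) splatted across a
; register. The HEVC 8-bit angular interpolation these weights serve is (a
; fact of the spec, not a claim about any one routine here):
;   pred[x] = ((32 - frac) * ref[idx] + frac * ref[idx + 1] + 16) >> 5
; With source bytes interleaved as (ref[idx], ref[idx+1]) pairs, a single
; pmaddubsw against one table row computes the weighted sum for every pixel
; of a row at once; pw_ang_table is the word-sized variant for pmaddwd/pmullw
; style paths.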

SECTION .text
cextern pb_1
cextern pw_2
cextern pw_3
cextern pw_4
cextern pw_7
cextern pw_8
cextern pw_16
cextern pw_15
cextern pw_31
cextern pw_32
cextern pw_257
cextern pw_512
cextern pw_1024
cextern pw_4096
cextern pw_00ff
cextern pb_unpackbd1
cextern multiL
cextern multiH
cextern multiH2
cextern multiH3
cextern multi_2Row
cextern trans8_shuf
cextern pw_planar16_mul
cextern pw_planar32_mul

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
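; C reference sketch of what the intra_pred_dc* kernels compute, following the
; HEVC spec (names below are illustrative, not taken from x265 sources):
;   dcVal = (sum(above[0..N-1]) + sum(left[0..N-1]) + N) >> (log2(N) + 1)
;   every dst pixel = dcVal
; and, when bFilter is set (luma, N <= 16):
;   dst[0]          = (above[0] + left[0] + 2 * dcVal + 2) >> 2
;   dst[x]          = (above[x] + 3 * dcVal + 2) >> 2   ; top row, x > 0
;   dst[y * stride] = (left[y]  + 3 * dcVal + 2) >> 2   ; left column, y > 0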
INIT_XMM sse2
cglobal intra_pred_dc4, 5,5,3
    inc         r2
    pxor        m0, m0
    movu        m1, [r2]
    pshufd      m1, m1, 0xF8
    psadbw      m1, m0              ; m1 = sum

    test        r4d, r4d

    paddw       m1, [pw_4]
    psraw       m1, 3
    movd        r4d, m1             ; r4d = dc_val
    pmullw      m1, [pw_257]
    pshuflw     m1, m1, 0x00

    ; store DC 4x4
    lea         r3, [r1 * 3]
    movd        [r0], m1
    movd        [r0 + r1], m1
    movd        [r0 + r1 * 2], m1
    movd        [r0 + r3], m1

    ; do DC filter
    jz          .end
    lea         r3d, [r4d * 2 + 2]  ; r3d = DC * 2 + 2
    add         r4d, r3d            ; r4d = DC * 3 + 2
    movd        m1, r4d
    pshuflw     m1, m1, 0           ; m1 = pixDCx3

    ; filter top
    movd        m2, [r2]
    punpcklbw   m2, m0
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    movd        [r0], m2            ; overwrites the top-left pixel; it is fixed up below

    ; filter top-left
    movzx       r4d, byte [r2 + 8]
    add         r3d, r4d
    movzx       r4d, byte [r2]
    add         r3d, r4d
    shr         r3d, 2
    mov         [r0], r3b

    ; filter left
    add         r0, r1
    movq        m2, [r2 + 9]
    punpcklbw   m2, m0
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
%if ARCH_X86_64
    movq        r4, m2
    mov         [r0], r4b
    shr         r4, 8
    mov         [r0 + r1], r4b
    shr         r4, 8
    mov         [r0 + r1 * 2], r4b
%else
    movd        r2d, m2
    mov         [r0], r2b
    shr         r2, 8
    mov         [r0 + r1], r2b
    shr         r2, 8
    mov         [r0 + r1 * 2], r2b
%endif
.end:
    RET

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_dc8, 5, 7, 3
    pxor        m0, m0
    movh        m1, [r2 + 1]
    movh        m2, [r2 + 17]
    punpcklqdq  m1, m2
    psadbw      m1, m0
    pshufd      m2, m1, 2
    paddw       m1, m2

    paddw       m1, [pw_8]
    psraw       m1, 4
    pmullw      m1, [pw_257]
    pshuflw     m1, m1, 0x00        ; m1 = byte [dc_val ...]

    test        r4d, r4d

    ; store DC 8x8
    lea         r6, [r1 + r1 * 2]
    lea         r5, [r6 + r1 * 2]
    movh        [r0], m1
    movh        [r0 + r1], m1
    movh        [r0 + r1 * 2], m1
    movh        [r0 + r6], m1
    movh        [r0 + r1 * 4], m1
    movh        [r0 + r5], m1
    movh        [r0 + r6 * 2], m1
    lea         r5, [r5 + r1 * 2]
    movh        [r0 + r5], m1

    ; do DC filter
    jz          .end
    psrlw       m1, 8
    movq        m2, [pw_2]
    pmullw      m2, m1
    paddw       m2, [pw_2]
    movd        r4d, m2             ; r4d = DC * 2 + 2
    paddw       m1, m2              ; m1 = DC * 3 + 2
    pshufd      m1, m1, 0

    ; filter top
    movq        m2, [r2 + 1]
    punpcklbw   m2, m0
    paddw       m2, m1
    psraw       m2, 2               ; sum = sum / 4
    packuswb    m2, m2
    movh        [r0], m2

    ; filter top-left
    movzx       r3d, byte [r2 + 17]
    add         r4d, r3d
    movzx       r3d, byte [r2 + 1]
    add         r3d, r4d
    shr         r3d, 2
    mov         [r0], r3b

    ; filter left
    movq        m2, [r2 + 18]
    punpcklbw   m2, m0
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    movd        r2d, m2
    lea         r0, [r0 + r1]
    lea         r5, [r6 + r1 * 2]
    mov         [r0], r2b
    shr         r2, 8
    mov         [r0 + r1], r2b
    shr         r2, 8
    mov         [r0 + r1 * 2], r2b
    shr         r2, 8
    mov         [r0 + r6], r2b
    pshufd      m2, m2, 0x01
    movd        r2d, m2
    mov         [r0 + r1 * 4], r2b
    shr         r2, 8
    mov         [r0 + r5], r2b
    shr         r2, 8
    mov         [r0 + r6 * 2], r2b

.end:
    RET

;--------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;--------------------------------------------------------------------------------------------
INIT_XMM sse2
%if ARCH_X86_64
cglobal intra_pred_dc16, 5, 10, 4
%else
cglobal intra_pred_dc16, 5, 7, 4
%endif
    pxor        m0, m0
    movu        m1, [r2 + 1]
    movu        m2, [r2 + 33]
    psadbw      m1, m0
    psadbw      m2, m0
    paddw       m1, m2
    pshufd      m2, m1, 2
    paddw       m1, m2

    paddw       m1, [pw_16]
    psraw       m1, 5
    pmullw      m1, [pw_257]
    pshuflw     m1, m1, 0x00        ; m1 = byte [dc_val ...]
    pshufd      m1, m1, 0x00


    test        r4d, r4d

    ; store DC 16x16
%if ARCH_X86_64
    lea         r6, [r1 + r1 * 2]   ; index 3
    lea         r7, [r1 + r1 * 4]   ; index 5
    lea         r8, [r6 + r1 * 4]   ; index 7
    lea         r9, [r0 + r8]       ; base + 7
    movu        [r0], m1
    movu        [r0 + r1], m1
    movu        [r0 + r1 * 2], m1
    movu        [r0 + r6], m1
    movu        [r0 + r1 * 4], m1
    movu        [r0 + r7], m1
    movu        [r0 + r6 * 2], m1
    movu        [r0 + r8], m1
    movu        [r0 + r1 * 8], m1
    movu        [r9 + r1 * 2], m1
    movu        [r0 + r7 * 2], m1
    movu        [r9 + r1 * 4], m1
    movu        [r0 + r6 * 4], m1
    movu        [r9 + r6 * 2], m1
    movu        [r0 + r8 * 2], m1
    movu        [r9 + r1 * 8], m1
%else ; 32 bit
    mov         r6, r0
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
    movu        [r0], m1
    movu        [r0 + r1], m1
%endif
    ; do DC filter
    jz          .end
    psrlw       m1, 8
    mova        m2, [pw_2]
    pmullw      m2, m1
    paddw       m2, [pw_2]
    movd        r4d, m2
    paddw       m1, m2

    ; filter top
    movh        m2, [r2 + 1]
    punpcklbw   m2, m0
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    movh        m3, [r2 + 9]
    punpcklbw   m3, m0
    paddw       m3, m1
    psraw       m3, 2
    packuswb    m3, m3

    ; filter top-left
    movzx       r5d, byte [r2 + 33]
    add         r4d, r5d
    movzx       r3d, byte [r2 + 1]
    add         r3d, r4d
    shr         r3d, 2

%if ARCH_X86_64
    movh        [r0], m2
    movh        [r0 + 8], m3
    mov         [r0], r3b
%else ; 32 bit
    movh        [r6], m2
    movh        [r6 + 8], m3
    mov         [r6], r3b
    add         r6, r1
%endif

    ; filter left
    movh        m2, [r2 + 34]
    punpcklbw   m2, m0
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2

    movh        m3, [r2 + 42]
    punpcklbw   m3, m0
    paddw       m3, m1
    psraw       m3, 2
    packuswb    m3, m3
%if ARCH_X86_64
    movh        r3, m2
    mov         [r0 + r1], r3b
    shr         r3, 8
    mov         [r0 + r1 * 2], r3b
    shr         r3, 8
    mov         [r0 + r6], r3b
    shr         r3, 8
    mov         [r0 + r1 * 4], r3b
    shr         r3, 8
    mov         [r0 + r7], r3b
    shr         r3, 8
    mov         [r0 + r6 * 2], r3b
    shr         r3, 8
    mov         [r0 + r8], r3b
    shr         r3, 8
    mov         [r0 + r1 * 8], r3b
    movh        r3, m3
    mov         [r9 + r1 * 2], r3b
    shr         r3, 8
    mov         [r0 + r7 * 2], r3b
    shr         r3, 8
    mov         [r9 + r1 * 4], r3b
    shr         r3, 8
    mov         [r0 + r6 * 4], r3b
    shr         r3, 8
    mov         [r9 + r6 * 2], r3b
    shr         r3, 8
    mov         [r0 + r8 * 2], r3b
    shr         r3, 8
    mov         [r9 + r1 * 8], r3b
%else ; 32 bit
    movd        r2d, m2
    pshufd      m2, m2, 0x01
    mov         [r6], r2b
    shr         r2, 8
    mov         [r6 + r1], r2b
    shr         r2, 8
    mov         [r6 + r1 * 2], r2b
    lea         r6, [r6 + r1 * 2]
    shr         r2, 8
    mov         [r6 + r1], r2b
    movd        r2d, m2
    mov         [r6 + r1 * 2], r2b
    lea         r6, [r6 + r1 * 2]
    shr         r2, 8
    mov         [r6 + r1], r2b
    shr         r2, 8
    mov         [r6 + r1 * 2], r2b
    lea         r6, [r6 + r1 * 2]
    shr         r2, 8
    mov         [r6 + r1], r2b
    movd        r2d, m3
    pshufd      m3, m3, 0x01
    mov         [r6 + r1 * 2], r2b
    lea         r6, [r6 + r1 * 2]
    shr         r2, 8
    mov         [r6 + r1], r2b
    shr         r2, 8
    mov         [r6 + r1 * 2], r2b
    lea         r6, [r6 + r1 * 2]
    shr         r2, 8
    mov         [r6 + r1], r2b
    movd        r2d, m3
    mov         [r6 + r1 * 2], r2b
    lea         r6, [r6 + r1 * 2]
    shr         r2, 8
    mov         [r6 + r1], r2b
    shr         r2, 8
    mov         [r6 + r1 * 2], r2b
%endif
.end:
    RET

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_dc32, 3, 3, 5
    pxor        m0, m0
    movu        m1, [r2 + 1]
    movu        m2, [r2 + 17]
    movu        m3, [r2 + 65]
    movu        m4, [r2 + 81]
    psadbw      m1, m0
    psadbw      m2, m0
    psadbw      m3, m0
    psadbw      m4, m0
    paddw       m1, m2
    paddw       m3, m4
    paddw       m1, m3
    pshufd      m2, m1, 2
    paddw       m1, m2

    paddw       m1, [pw_32]
    psraw       m1, 6
    pmullw      m1, [pw_257]
    pshuflw     m1, m1, 0x00        ; m1 = byte [dc_val ...]
    pshufd      m1, m1, 0x00

%assign x 0
%rep 16
    ; store DC 32x32, two rows per iteration
    movu        [r0], m1
    movu        [r0 + r1], m1
    movu        [r0 + 16], m1
    movu        [r0 + r1 + 16], m1
%if x < 16
    lea         r0, [r0 + 2 * r1]
%endif
%assign x x+1
%endrep
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;---------------------------------------------------------------------------------------
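; C reference sketch of planar prediction for an NxN block, in the HEVC spec
; form (names are illustrative):
;   pred[y][x] = ((N - 1 - x) * left[y] + (x + 1) * topRight
;              +  (N - 1 - y) * above[x] + (y + 1) * bottomLeft + N) >> (log2(N) + 1)
; The kernels below fold every x-dependent-but-y-independent term into one
; accumulator and step it by (bottomLeft - above[x]) per row, so each output
; row costs one multiply of left[y] by the (N - 1 - x) weights plus adds and
; a shift.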
INIT_XMM sse2
cglobal intra_pred_planar4, 3,3,5
    pxor        m0, m0
    movh        m1, [r2 + 1]
    punpcklbw   m1, m0
    movh        m2, [r2 + 9]
    punpcklbw   m2, m0
    pshufhw     m3, m1, 0           ; topRight
    pshufd      m3, m3, 0xAA
    pshufhw     m4, m2, 0           ; bottomLeft
    pshufd      m4, m4, 0xAA
    pmullw      m3, [multi_2Row]    ; (x + 1) * topRight
    pmullw      m0, m1, [pw_3]      ; (blkSize - 1 - y) * above[x]
    paddw       m3, [pw_4]
    paddw       m3, m4
    paddw       m3, m0
    psubw       m4, m1

    pshuflw     m1, m2, 0
    pmullw      m1, [pw_planar4_0]
    paddw       m1, m3
    paddw       m3, m4
    psraw       m1, 3
    packuswb    m1, m1
    movd        [r0], m1

    pshuflw     m1, m2, 01010101b
    pmullw      m1, [pw_planar4_0]
    paddw       m1, m3
    paddw       m3, m4
    psraw       m1, 3
    packuswb    m1, m1
    movd        [r0 + r1], m1
    lea         r0, [r0 + 2 * r1]

    pshuflw     m1, m2, 10101010b
    pmullw      m1, [pw_planar4_0]
    paddw       m1, m3
    paddw       m3, m4
    psraw       m1, 3
    packuswb    m1, m1
    movd        [r0], m1

    pshuflw     m1, m2, 11111111b
    pmullw      m1, [pw_planar4_0]
    paddw       m1, m3
    psraw       m1, 3
    packuswb    m1, m1
    movd        [r0 + r1], m1
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_planar8, 3,3,6
    pxor        m0, m0
    movh        m1, [r2 + 1]
    punpcklbw   m1, m0
    movh        m2, [r2 + 17]
    punpcklbw   m2, m0

    movd        m3, [r2 + 9]        ; topRight = above[8];
    movd        m4, [r2 + 25]       ; bottomLeft = left[8];

    pand        m3, [pw_00ff]
    pand        m4, [pw_00ff]
    pshuflw     m3, m3, 0x00
    pshuflw     m4, m4, 0x00
    pshufd      m3, m3, 0x44
    pshufd      m4, m4, 0x44
    pmullw      m3, [multiL]        ; (x + 1) * topRight
    pmullw      m0, m1, [pw_7]      ; (blkSize - 1 - y) * above[x]
    paddw       m3, [pw_8]
    paddw       m3, m4
    paddw       m3, m0
    psubw       m4, m1

%macro INTRA_PRED_PLANAR_8 1
%if (%1 < 4)
    pshuflw     m5, m2, 0x55 * %1
    pshufd      m5, m5, 0
%else
    pshufhw     m5, m2, 0x55 * (%1 - 4)
    pshufd      m5, m5, 0xAA
%endif
    pmullw      m5, [pw_planar16_mul + mmsize]
    paddw       m5, m3
    psraw       m5, 4
    packuswb    m5, m5
    movh        [r0], m5
%if (%1 < 7)
    paddw       m3, m4
    lea         r0, [r0 + r1]
%endif
%endmacro

    INTRA_PRED_PLANAR_8 0
    INTRA_PRED_PLANAR_8 1
    INTRA_PRED_PLANAR_8 2
    INTRA_PRED_PLANAR_8 3
    INTRA_PRED_PLANAR_8 4
    INTRA_PRED_PLANAR_8 5
    INTRA_PRED_PLANAR_8 6
    INTRA_PRED_PLANAR_8 7
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_planar16, 3,5,8
    pxor      m0, m0
    movh      m2, [r2 + 1]
    punpcklbw m2, m0
    movh      m7, [r2 + 9]
    punpcklbw m7, m0

    movd      m3, [r2 + 17]             ; topRight = above[16]
    movd      m6, [r2 + 49]             ; bottomLeft = left[16]
    pand      m3, [pw_00ff]
    pand      m6, [pw_00ff]
    pshuflw   m3, m3, 0x00
    pshuflw   m6, m6, 0x00
    pshufd    m3, m3, 0x44              ; v_topRight
    pshufd    m6, m6, 0x44              ; v_bottomLeft
    pmullw    m4, m3, [multiH]          ; (x + 1) * topRight
    pmullw    m3, [multiL]              ; (x + 1) * topRight
    pmullw    m1, m2, [pw_15]           ; (blkSize - 1 - y) * above[x]
    pmullw    m5, m7, [pw_15]           ; (blkSize - 1 - y) * above[x]
    paddw     m4, [pw_16]
    paddw     m3, [pw_16]
    paddw     m4, m6
    paddw     m3, m6
    paddw     m4, m5
    paddw     m3, m1
    psubw     m1, m6, m7
    psubw     m6, m2

    movh      m2, [r2 + 33]
    punpcklbw m2, m0
    movh      m7, [r2 + 41]
    punpcklbw m7, m0

%macro INTRA_PRED_PLANAR_16 1
%if (%1 < 4)
    pshuflw   m5, m2, 0x55 * %1
    pshufd    m5, m5, 0
%else
%if (%1 < 8)
    pshufhw   m5, m2, 0x55 * (%1 - 4)
    pshufd    m5, m5, 0xAA
%else
%if (%1 < 12)
    pshuflw   m5, m7, 0x55 * (%1 - 8)
    pshufd    m5, m5, 0
%else
    pshufhw   m5, m7, 0x55 * (%1 - 12)
    pshufd    m5, m5, 0xAA
%endif
%endif
%endif
%if (%1 > 0)
    paddw     m3, m6
    paddw     m4, m1
    lea       r0, [r0 + r1]
%endif
    pmullw    m0, m5, [pw_planar16_mul + mmsize]
    pmullw    m5, [pw_planar16_mul]
    paddw     m0, m4
    paddw     m5, m3
    psraw     m5, 5
    psraw     m0, 5
    packuswb  m5, m0
    movu      [r0], m5
%endmacro

    INTRA_PRED_PLANAR_16 0
    INTRA_PRED_PLANAR_16 1
    INTRA_PRED_PLANAR_16 2
    INTRA_PRED_PLANAR_16 3
    INTRA_PRED_PLANAR_16 4
    INTRA_PRED_PLANAR_16 5
    INTRA_PRED_PLANAR_16 6
    INTRA_PRED_PLANAR_16 7
    INTRA_PRED_PLANAR_16 8
    INTRA_PRED_PLANAR_16 9
    INTRA_PRED_PLANAR_16 10
    INTRA_PRED_PLANAR_16 11
    INTRA_PRED_PLANAR_16 12
    INTRA_PRED_PLANAR_16 13
    INTRA_PRED_PLANAR_16 14
    INTRA_PRED_PLANAR_16 15
    RET
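
; Note on the per-row update the planar kernels share: the accumulators set up
; before the macro already hold (x + 1) * topRight + (blkSize - 1) * above[x]
; + bottomLeft + blkSize for row 0, so stepping to the next row needs only one
; add of (bottomLeft - above[x]). A scalar C sketch of that recurrence
; (hypothetical helper, shown only to document what "paddw m3, m6" and
; "paddw m4, m1" accomplish above):
;
;   static void planar_next_row(int32_t *acc, const uint8_t *above,
;                               int bottomLeft, int N)
;   {
;       for (int x = 0; x < N; x++)
;           acc[x] += bottomLeft - above[x];   /* advance accumulator one row */
;   }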

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_XMM sse2
%if ARCH_X86_64 == 1
cglobal intra_pred_planar32, 3,3,16
    movd      m3, [r2 + 33]             ; topRight = above[32]

    pxor      m7, m7
    pand      m3, [pw_00ff]
    pshuflw   m3, m3, 0x00
    pshufd    m3, m3, 0x44

    pmullw    m0, m3, [multiL]          ; (x + 1) * topRight
    pmullw    m1, m3, [multiH]          ; (x + 1) * topRight
    pmullw    m2, m3, [multiH2]         ; (x + 1) * topRight
    pmullw    m3, [multiH3]             ; (x + 1) * topRight

    movd      m11, [r2 + 97]            ; bottomLeft = left[32]
    pand      m11, [pw_00ff]
    pshuflw   m11, m11, 0x00
    pshufd    m11, m11, 0x44
    mova      m5, m11
    paddw     m5, [pw_32]

    paddw     m0, m5
    paddw     m1, m5
    paddw     m2, m5
    paddw     m3, m5
    mova      m8, m11
    mova      m9, m11
    mova      m10, m11
    mova      m12, [pw_31]
    movh      m4, [r2 + 1]
    punpcklbw m4, m7
    psubw     m8, m4
    pmullw    m4, m12
    paddw     m0, m4

    movh      m4, [r2 + 9]
    punpcklbw m4, m7
    psubw     m9, m4
    pmullw    m4, m12
    paddw     m1, m4

    movh      m4, [r2 + 17]
    punpcklbw m4, m7
    psubw     m10, m4
    pmullw    m4, m12
    paddw     m2, m4

    movh      m4, [r2 + 25]
    punpcklbw m4, m7
    psubw     m11, m4
    pmullw    m4, m12
    paddw     m3, m4
    mova      m12, [pw_planar32_mul]
    mova      m13, [pw_planar32_mul + mmsize]
    mova      m14, [pw_planar16_mul]
    mova      m15, [pw_planar16_mul + mmsize]
%macro PROCESS 1
    pmullw    m5, %1, m12
    pmullw    m6, %1, m13
    paddw     m5, m0
    paddw     m6, m1
    psraw     m5, 6
    psraw     m6, 6
    packuswb  m5, m6
    movu      [r0], m5

    pmullw    m5, %1, m14
    pmullw    %1, m15
    paddw     m5, m2
    paddw     %1, m3
    psraw     m5, 6
    psraw     %1, 6
    packuswb  m5, %1
    movu      [r0 + 16], m5
%endmacro

%macro INCREMENT 0
    paddw     m2, m10
    paddw     m3, m11
    paddw     m0, m8
    paddw     m1, m9
    add       r0, r1
%endmacro

%assign x 0
%rep 4
    pxor      m7, m7
    movq      m4, [r2 + 65 + x * 8]
    punpcklbw m4, m7
%assign y 0
%rep 8
%if y < 4
    pshuflw   m7, m4, 0x55 * y
    pshufd    m7, m7, 0x44
%else
    pshufhw   m7, m4, 0x55 * (y - 4)
    pshufd    m7, m7, 0xEE
%endif
    PROCESS m7
%if x + y < 10
    INCREMENT
%endif
%assign y y+1
%endrep
%assign x x+1
%endrep
    RET

%else ;end ARCH_X86_64, start ARCH_X86_32
cglobal intra_pred_planar32, 3,3,8,0-(4*mmsize)
    movd      m3, [r2 + 33]             ; topRight = above[32]

    pxor      m7, m7
    pand      m3, [pw_00ff]
    pshuflw   m3, m3, 0x00
    pshufd    m3, m3, 0x44

    pmullw    m0, m3, [multiL]          ; (x + 1) * topRight
    pmullw    m1, m3, [multiH]          ; (x + 1) * topRight
    pmullw    m2, m3, [multiH2]         ; (x + 1) * topRight
    pmullw    m3, [multiH3]             ; (x + 1) * topRight

    movd      m6, [r2 + 97]             ; bottomLeft = left[32]
    pand      m6, [pw_00ff]
    pshuflw   m6, m6, 0x00
    pshufd    m6, m6, 0x44
    mova      m5, m6
    paddw     m5, [pw_32]

    paddw     m0, m5
    paddw     m1, m5
    paddw     m2, m5
    paddw     m3, m5

    movh      m4, [r2 + 1]
    punpcklbw m4, m7
    psubw     m5, m6, m4
    mova      [rsp + 0 * mmsize], m5
    pmullw    m4, [pw_31]
    paddw     m0, m4
    movh      m4, [r2 + 9]
    punpcklbw m4, m7
    psubw     m5, m6, m4
    mova      [rsp + 1 * mmsize], m5
    pmullw    m4, [pw_31]
    paddw     m1, m4
    movh      m4, [r2 + 17]
    punpcklbw m4, m7
    psubw     m5, m6, m4
    mova      [rsp + 2 * mmsize], m5
    pmullw    m4, [pw_31]
    paddw     m2, m4
    movh      m4, [r2 + 25]
    punpcklbw m4, m7
    psubw     m5, m6, m4
    mova      [rsp + 3 * mmsize], m5
    pmullw    m4, [pw_31]
    paddw     m3, m4
%macro PROCESS 1
    pmullw    m5, %1, [pw_planar32_mul]
    pmullw    m6, %1, [pw_planar32_mul + mmsize]
    paddw     m5, m0
    paddw     m6, m1
    psraw     m5, 6
    psraw     m6, 6
    packuswb  m5, m6
    movu      [r0], m5
    pmullw    m5, %1, [pw_planar16_mul]
    pmullw    %1, [pw_planar16_mul + mmsize]
    paddw     m5, m2
    paddw     %1, m3
    psraw     m5, 6
    psraw     %1, 6
    packuswb  m5, %1
    movu      [r0 + 16], m5
%endmacro

%macro INCREMENT 0
    paddw     m0, [rsp + 0 * mmsize]
    paddw     m1, [rsp + 1 * mmsize]
    paddw     m2, [rsp + 2 * mmsize]
    paddw     m3, [rsp + 3 * mmsize]
    add       r0, r1
%endmacro

%assign y 0
%rep 4
    pxor      m7, m7
    movq      m4, [r2 + 65 + y * 8]
    punpcklbw m4, m7
%assign x 0
%rep 8
%if x < 4
    pshuflw   m7, m4, 0x55 * x
    pshufd    m7, m7, 0x44
%else
    pshufhw   m7, m4, 0x55 * (x - 4)
    pshufd    m7, m7, 0xEE
%endif

    PROCESS m7
%if x + y < 10
    INCREMENT
%endif
%assign x x+1
%endrep
%assign y y+1
%endrep
    RET

%endif ; end ARCH_X86_32

%macro STORE_4x4 0
    movd      [r0], m0
    psrldq    m0, 4
    movd      [r0 + r1], m0
    psrldq    m0, 4
    movd      [r0 + r1 * 2], m0
    lea       r1, [r1 * 3]
    psrldq    m0, 4
    movd      [r0 + r1], m0
%endmacro

%macro TRANSPOSE_4x4 0
    pshufd    m0, m0, 0xD8
    pshufd    m1, m2, 0xD8
    pshuflw   m0, m0, 0xD8
    pshuflw   m1, m1, 0xD8
    pshufhw   m0, m0, 0xD8
    pshufhw   m1, m1, 0xD8
    mova      m2, m0
    punpckldq m0, m1
    punpckhdq m2, m1
    packuswb  m0, m2
%endmacro
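
; TRANSPOSE_4x4 exists because the sse2 angular kernels below compute the
; block in column order; transposing lets STORE_4x4 write plain rows. A
; scalar C model of the net effect of the shuffle sequence (illustrative
; only, not x265 code):
;
;   static void transpose4x4(uint8_t dst[4][4], const uint8_t src[4][4])
;   {
;       for (int y = 0; y < 4; y++)
;           for (int x = 0; x < 4; x++)
;               dst[y][x] = src[x][y];         /* swap rows and columns */
;   }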

;-----------------------------------------------------------------------------------------
; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal intra_pred_ang4_2, 3,5,1
    lea       r4, [r2 + 2]
    add       r2, 10
    cmp       r3m, byte 34
    cmove     r2, r4

    movh      m0, [r2]
    movd      [r0], m0
    psrldq    m0, 1
    movd      [r0 + r1], m0
    psrldq    m0, 1
    movd      [r0 + r1 * 2], m0
    lea       r1, [r1 * 3]
    psrldq    m0, 1
    movd      [r0 + r1], m0
    RET
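
; Modes 2 and 34 are the two exact 45-degree diagonals, so no fractional
; blend is needed: output row y is just the reference array shifted by y.
; In C (hypothetical helper; the +2 / +10 offsets mirror the cmove above):
;
;   static void ang4_2_34_ref(uint8_t *dst, intptr_t stride,
;                             const uint8_t *srcPix, int mode)
;   {
;       const uint8_t *ref = (mode == 34) ? srcPix + 2 : srcPix + 10;
;       for (int y = 0; y < 4; y++)
;           for (int x = 0; x < 4; x++)
;               dst[y * stride + x] = ref[y + x];
;   }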

INIT_XMM sse2
cglobal intra_pred_ang4_3, 3,3,5
    movh      m3, [r2 + 9]              ; [8 7 6 5 4 3 2 1]
    punpcklbw m3, m3
    psrldq    m3, 1
    movh      m0, m3                    ;[x x x x x x x x 5 4 4 3 3 2 2 1]
    psrldq    m3, 2
    movh      m1, m3                    ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq    m3, 2
    movh      m2, m3                    ;[x x x x x x x x 7 6 6 5 5 4 4 3]
    psrldq    m3, 2                     ;[x x x x x x x x 8 7 7 6 6 5 5 4]

    pxor      m4, m4
    punpcklbw m1, m4
    pmaddwd   m1, [pw_ang_table + 20 * 16]
    punpcklbw m0, m4
    pmaddwd   m0, [pw_ang_table + 26 * 16]
    packssdw  m0, m1
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m3, m4
    pmaddwd   m3, [pw_ang_table + 8 * 16]
    punpcklbw m2, m4
    pmaddwd   m2, [pw_ang_table + 14 * 16]
    packssdw  m2, m3
    paddw     m2, [pw_16]
    psraw     m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET
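
; Every remaining angular kernel instantiates the same two-tap blend: the
; pmaddwd against a pw_ang_table row multiplies each (ref[i], ref[i+1]) word
; pair by (32 - f, f). Per pixel, in C (illustrative helper, not x265 code):
;
;   static uint8_t ang_blend(const uint8_t *ref, int i, int f)
;   {
;       /* f is the per-row fraction, e.g. 26/20/14/8 for mode 3 */
;       return (uint8_t)(((32 - f) * ref[i] + f * ref[i + 1] + 16) >> 5);
;   }
;
; which matches the paddw [pw_16] / psraw 5 rounding used above.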

cglobal intra_pred_ang4_4, 3,3,5
    movh      m1, [r2 + 9]              ;[8 7 6 5 4 3 2 1]
    punpcklbw m1, m1
    psrldq    m1, 1
    movh      m0, m1                    ;[x x x x x x x x 5 4 4 3 3 2 2 1]
    psrldq    m1, 2
    movh      m2, m1                    ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq    m1, 2                     ;[x x x x x x x x 7 6 6 5 5 4 4 3]

    pxor      m4, m4
    punpcklbw m2, m4
    mova      m3, m2
    pmaddwd   m3, [pw_ang_table + 10 * 16]
    punpcklbw m0, m4
    pmaddwd   m0, [pw_ang_table + 21 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m1, m4
    pmaddwd   m1, [pw_ang_table + 20 * 16]
    pmaddwd   m2, [pw_ang_table + 31 * 16]
    packssdw  m2, m1
    paddw     m2, [pw_16]
    psraw     m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_5, 3,3,5
    movh      m3, [r2 + 9]              ;[8 7 6 5 4 3 2 1]
    punpcklbw m3, m3
    psrldq    m3, 1
    mova      m0, m3                    ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq    m3, 2
    mova      m2, m3                    ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq    m3, 2                     ;[x x x x x x x x 7 6 6 5 5 4 4 3]

    pxor      m1, m1
    punpcklbw m2, m1
    mova      m4, m2
    pmaddwd   m4, [pw_ang_table + 2 * 16]
    punpcklbw m0, m1
    pmaddwd   m0, [pw_ang_table + 17 * 16]
    packssdw  m0, m4
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m3, m1
    pmaddwd   m3, [pw_ang_table + 4 * 16]
    pmaddwd   m2, [pw_ang_table + 19 * 16]
    packssdw  m2, m3
    paddw     m2, [pw_16]
    psraw     m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_6, 3,3,4
    movh      m2, [r2 + 9]              ;[8 7 6 5 4 3 2 1]
    punpcklbw m2, m2
    psrldq    m2, 1
    movh      m0, m2                    ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq    m2, 2                     ;[x x x 8 8 7 7 6 6 5 5 4 4 3 3 2]

    pxor      m1, m1
    punpcklbw m0, m1
    mova      m3, m0
    pmaddwd   m3, [pw_ang_table + 26 * 16]
    pmaddwd   m0, [pw_ang_table + 13 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m2, m1
    mova      m3, m2
    pmaddwd   m3, [pw_ang_table + 20 * 16]
    pmaddwd   m2, [pw_ang_table + 7 * 16]
    packssdw  m2, m3
    paddw     m2, [pw_16]
    psraw     m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_7, 3,3,5
    movh      m3, [r2 + 9]              ;[8 7 6 5 4 3 2 1]
    punpcklbw m3, m3
    psrldq    m3, 1
    movh      m0, m3                    ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq    m3, 2                     ;[x x x x x x x x 6 5 5 4 4 3 3 2]

    pxor      m1, m1
    punpcklbw m0, m1
    mova      m4, m0
    mova      m2, m0
    pmaddwd   m4, [pw_ang_table + 18 * 16]
    pmaddwd   m0, [pw_ang_table + 9 * 16]
    packssdw  m0, m4
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m3, m1
    pmaddwd   m3, [pw_ang_table + 4 * 16]
    pmaddwd   m2, [pw_ang_table + 27 * 16]
    packssdw  m2, m3
    paddw     m2, [pw_16]
    psraw     m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_8, 3,3,5
    movh      m0, [r2 + 9]              ;[8 7 6 5 4 3 2 1]
    punpcklbw m0, m0
    psrldq    m0, 1                     ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pxor      m1, m1
    punpcklbw m0, m1
    mova      m2, m0
    mova      m3, m0
    mova      m4, m2
    pmaddwd   m3, [pw_ang_table + 10 * 16]
    pmaddwd   m0, [pw_ang_table + 5 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    pmaddwd   m4, [pw_ang_table + 20 * 16]
    pmaddwd   m2, [pw_ang_table + 15 * 16]
    packssdw  m2, m4
    paddw     m2, [pw_16]
    psraw     m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_9, 3,3,5
    movh      m0, [r2 + 9]              ;[8 7 6 5 4 3 2 1]
    punpcklbw m0, m0
    psrldq    m0, 1                     ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pxor      m1, m1
    punpcklbw m0, m1
    mova      m2, m0
    mova      m3, m0
    mova      m4, m2
    pmaddwd   m3, [pw_ang_table + 4 * 16]
    pmaddwd   m0, [pw_ang_table + 2 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    pmaddwd   m4, [pw_ang_table + 8 * 16]
    pmaddwd   m2, [pw_ang_table + 6 * 16]
    packssdw  m2, m4
    paddw     m2, [pw_16]
    psraw     m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_10, 3,5,4
    movd      m0, [r2 + 9]              ;[4 3 2 1]
    punpcklbw m0, m0
    punpcklwd m0, m0
    pshufd    m1, m0, 1
    movhlps   m2, m0
    pshufd    m3, m0, 3
    movd      [r0 + r1], m1
    movd      [r0 + r1 * 2], m2
    lea       r1, [r1 * 3]
    movd      [r0 + r1], m3
    cmp       r4m, byte 0
    jz        .quit

    ; filter
    pxor      m3, m3
    punpcklbw m0, m3
    movh      m1, [r2]                  ;[4 3 2 1 0]
    punpcklbw m1, m3
    pshuflw   m2, m1, 0x00
    psrldq    m1, 2
    psubw     m1, m2
    psraw     m1, 1
    paddw     m0, m1
    packuswb  m0, m0

.quit:
    movd      [r0], m0
    RET

cglobal intra_pred_ang4_11, 3,3,5
    movd      m1, [r2 + 9]              ;[4 3 2 1]
    movh      m0, [r2 - 7]              ;[A x x x x x x x]
    punpcklbw m1, m1                    ;[4 4 3 3 2 2 1 1]
    punpcklqdq m0, m1                   ;[4 4 3 3 2 2 1 1 A x x x x x x x]
    psrldq    m0, 7                     ;[x x x x x x x x 4 3 3 2 2 1 1 A]

    pxor      m1, m1
    punpcklbw m0, m1
    mova      m2, m0
    mova      m3, m0
    mova      m4, m2
    pmaddwd   m3, [pw_ang_table + 28 * 16]
    pmaddwd   m0, [pw_ang_table + 30 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    pmaddwd   m4, [pw_ang_table + 24 * 16]
    pmaddwd   m2, [pw_ang_table + 26 * 16]
    packssdw  m2, m4
    paddw     m2, [pw_16]
    psraw     m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_12, 3,3,5
    movd      m1, [r2 + 9]              ;[4 3 2 1]
    movh      m0, [r2 - 7]              ;[A x x x x x x x]
    punpcklbw m1, m1                    ;[4 4 3 3 2 2 1 1]
    punpcklqdq m0, m1                   ;[4 4 3 3 2 2 1 1 A x x x x x x x]
    psrldq    m0, 7                     ;[x x x x x x x x 4 3 3 2 2 1 1 A]

    pxor      m1, m1
    punpcklbw m0, m1
    mova      m2, m0
    mova      m3, m0
    mova      m4, m2
    pmaddwd   m3, [pw_ang_table + 22 * 16]
    pmaddwd   m0, [pw_ang_table + 27 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    pmaddwd   m4, [pw_ang_table + 12 * 16]
    pmaddwd   m2, [pw_ang_table + 17 * 16]
    packssdw  m2, m4
    paddw     m2, [pw_16]
    psraw     m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_24, 3,3,5
    movd      m1, [r2 + 1]              ;[4 3 2 1]
    movh      m0, [r2 - 7]              ;[A x x x x x x x]
    punpcklbw m1, m1                    ;[4 4 3 3 2 2 1 1]
    punpcklqdq m0, m1                   ;[4 4 3 3 2 2 1 1 A x x x x x x x]
    psrldq    m0, 7                     ;[x x x x x x x x 4 3 3 2 2 1 1 A]

    pxor      m1, m1
    punpcklbw m0, m1
    mova      m2, m0
    mova      m3, m0
    mova      m4, m2
    pmaddwd   m3, [pw_ang_table + 22 * 16]
    pmaddwd   m0, [pw_ang_table + 27 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    pmaddwd   m4, [pw_ang_table + 12 * 16]
    pmaddwd   m2, [pw_ang_table + 17 * 16]
    packssdw  m2, m4
    paddw     m2, [pw_16]
    psraw     m2, 5
    packuswb  m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_13, 3,3,5
    movd      m1, [r2 - 1]              ;[x x A x]
    movd      m2, [r2 + 9]              ;[4 3 2 1]
    movd      m0, [r2 + 3]              ;[x x B x]
    punpcklbw m0, m1                    ;[x x x x A B x x]
    punpckldq m0, m2                    ;[4 3 2 1 A B x x]
    psrldq    m0, 2                     ;[x x 4 3 2 1 A B]
    punpcklbw m0, m0
    psrldq    m0, 1
    movh      m3, m0                    ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq    m0, 2                     ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor      m1, m1
    punpcklbw m0, m1
    mova      m4, m0
    mova      m2, m0
    pmaddwd   m4, [pw_ang_table + 14 * 16]
    pmaddwd   m0, [pw_ang_table + 23 * 16]
    packssdw  m0, m4
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m3, m1
    pmaddwd   m3, [pw_ang_table + 28 * 16]
    pmaddwd   m2, [pw_ang_table + 5 * 16]
    packssdw  m2, m3
    paddw     m2, [pw_16]
    psraw     m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_14, 3,3,4
    movd      m1, [r2 - 1]              ;[x x A x]
    movd      m0, [r2 + 1]              ;[x x B x]
    punpcklbw m0, m1                    ;[A B x x]
    movd      m1, [r2 + 9]              ;[4 3 2 1]
    punpckldq m0, m1                    ;[4 3 2 1 A B x x]
    psrldq    m0, 2                     ;[x x 4 3 2 1 A B]
    punpcklbw m0, m0                    ;[x x x x 4 4 3 3 2 2 1 1 A A B B]
    psrldq    m0, 1
    movh      m2, m0                    ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq    m0, 2                     ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor      m1, m1
    punpcklbw m0, m1
    mova      m3, m0
    pmaddwd   m3, [pw_ang_table + 6 * 16]
    pmaddwd   m0, [pw_ang_table + 19 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m2, m1
    mova      m3, m2
    pmaddwd   m3, [pw_ang_table + 12 * 16]
    pmaddwd   m2, [pw_ang_table + 25 * 16]
    packssdw  m2, m3
    paddw     m2, [pw_16]
    psraw     m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_15, 3,3,5
    movd      m0, [r2]                  ;[x x x A]
    movd      m1, [r2 + 2]              ;[x x x B]
    punpcklbw m1, m0                    ;[x x A B]
    movd      m0, [r2 + 3]              ;[x x C x]
    punpcklwd m0, m1                    ;[A B C x]
    movd      m1, [r2 + 9]              ;[4 3 2 1]
    punpckldq m0, m1                    ;[4 3 2 1 A B C x]
    psrldq    m0, 1                     ;[x 4 3 2 1 A B C]
    punpcklbw m0, m0                    ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq    m0, 1
    movh      m1, m0                    ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
    psrldq    m0, 2
    movh      m2, m0                    ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq    m0, 2                     ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor      m4, m4
    punpcklbw m2, m4
    mova      m3, m2
    pmaddwd   m3, [pw_ang_table + 30 * 16]
    punpcklbw m0, m4
    pmaddwd   m0, [pw_ang_table + 15 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m1, m4
    pmaddwd   m1, [pw_ang_table + 28 * 16]
    pmaddwd   m2, [pw_ang_table + 13 * 16]
    packssdw  m2, m1
    paddw     m2, [pw_16]
    psraw     m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_16, 3,3,5
    movd      m2, [r2]                  ;[x x x A]
    movd      m1, [r2 + 2]              ;[x x x B]
    punpcklbw m1, m2                    ;[x x A B]
    movd      m0, [r2 + 2]              ;[x x C x]
    punpcklwd m0, m1                    ;[A B C x]
    movd      m1, [r2 + 9]              ;[4 3 2 1]
    punpckldq m0, m1                    ;[4 3 2 1 A B C x]
    psrldq    m0, 1                     ;[x 4 3 2 1 A B C]
    punpcklbw m0, m0                    ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq    m0, 1
    movh      m1, m0                    ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
    psrldq    m0, 2
    movh      m2, m0                    ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq    m0, 2                     ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor      m4, m4
    punpcklbw m2, m4
    mova      m3, m2
    pmaddwd   m3, [pw_ang_table + 22 * 16]
    punpcklbw m0, m4
    pmaddwd   m0, [pw_ang_table + 11 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m1, m4
    pmaddwd   m1, [pw_ang_table + 12 * 16]
    pmaddwd   m2, [pw_ang_table + 1 * 16]
    packssdw  m2, m1
    paddw     m2, [pw_16]
    psraw     m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_17, 3,3,5
    movd      m2, [r2]                  ;[x x x A]
    movd      m3, [r2 + 1]              ;[x x x B]
    movd      m4, [r2 + 2]              ;[x x x C]
    movd      m0, [r2 + 4]              ;[x x x D]
    punpcklbw m3, m2                    ;[x x A B]
    punpcklbw m0, m4                    ;[x x C D]
    punpcklwd m0, m3                    ;[A B C D]
    movd      m1, [r2 + 9]              ;[4 3 2 1]
    punpckldq m0, m1                    ;[4 3 2 1 A B C D]
    punpcklbw m0, m0                    ;[4 4 3 3 2 2 1 1 A A B B C C D D]
    psrldq    m0, 1
    movh      m1, m0                    ;[x 4 4 3 3 2 2 1 1 A A B B C C D]
    psrldq    m0, 2
    movh      m2, m0                    ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
    psrldq    m0, 2
    movh      m3, m0                    ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq    m0, 2                     ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor      m4, m4
    punpcklbw m3, m4
    pmaddwd   m3, [pw_ang_table + 12 * 16]
    punpcklbw m0, m4
    pmaddwd   m0, [pw_ang_table + 6 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m1, m4
    pmaddwd   m1, [pw_ang_table + 24 * 16]
    punpcklbw m2, m4
    pmaddwd   m2, [pw_ang_table + 18 * 16]
    packssdw  m2, m1
    paddw     m2, [pw_16]
    psraw     m2, 5

    TRANSPOSE_4x4

    STORE_4x4
    RET

cglobal intra_pred_ang4_18, 3,4,2
    mov       r3d, [r2 + 8]
    mov       r3b, byte [r2]
    bswap     r3d
    movd      m0, r3d

    movd      m1, [r2 + 1]
    punpckldq m0, m1
    lea       r3, [r1 * 3]
    movd      [r0 + r3], m0
    psrldq    m0, 1
    movd      [r0 + r1 * 2], m0
    psrldq    m0, 1
    movd      [r0 + r1], m0
    psrldq    m0, 1
    movd      [r0], m0
    RET

cglobal intra_pred_ang4_19, 3,3,5
    movd      m2, [r2]                  ;[x x x A]
    movd      m3, [r2 + 9]              ;[x x x B]
    movd      m4, [r2 + 10]             ;[x x x C]
    movd      m0, [r2 + 12]             ;[x x x D]
    punpcklbw m3, m2                    ;[x x A B]
    punpcklbw m0, m4                    ;[x x C D]
    punpcklwd m0, m3                    ;[A B C D]
    movd      m1, [r2 + 1]              ;[4 3 2 1]
    punpckldq m0, m1                    ;[4 3 2 1 A B C D]
    punpcklbw m0, m0                    ;[4 4 3 3 2 2 1 1 A A B B C C D D]
    psrldq    m0, 1
    movh      m1, m0                    ;[x 4 4 3 3 2 2 1 1 A A B B C C D]
    psrldq    m0, 2
    movh      m2, m0                    ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
    psrldq    m0, 2
    movh      m3, m0                    ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq    m0, 2                     ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor      m4, m4
    punpcklbw m3, m4
    pmaddwd   m3, [pw_ang_table + 12 * 16]
    punpcklbw m0, m4
    pmaddwd   m0, [pw_ang_table + 6 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m1, m4
    pmaddwd   m1, [pw_ang_table + 24 * 16]
    punpcklbw m2, m4
    pmaddwd   m2, [pw_ang_table + 18 * 16]
    packssdw  m2, m1
    paddw     m2, [pw_16]
    psraw     m2, 5
    packuswb  m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_20, 3,3,5
    movd      m2, [r2]                  ;[x x x A]
    movd      m1, [r2 + 10]             ;[x x x B]
    punpcklbw m1, m2                    ;[x x A B]
    movd      m0, [r2 + 10]             ;[x x C x]
    punpcklwd m0, m1                    ;[A B C x]
    movd      m1, [r2 + 1]              ;[4 3 2 1]
    punpckldq m0, m1                    ;[4 3 2 1 A B C x]
    psrldq    m0, 1                     ;[x 4 3 2 1 A B C]
    punpcklbw m0, m0                    ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq    m0, 1
    movh      m1, m0                    ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
    psrldq    m0, 2
    movh      m2, m0                    ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq    m0, 2                     ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor      m4, m4
    punpcklbw m2, m4
    mova      m3, m2
    pmaddwd   m3, [pw_ang_table + 22 * 16]
    punpcklbw m0, m4
    pmaddwd   m0, [pw_ang_table + 11 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m1, m4
    pmaddwd   m1, [pw_ang_table + 12 * 16]
    pmaddwd   m2, [pw_ang_table + 1 * 16]
    packssdw  m2, m1
    paddw     m2, [pw_16]
    psraw     m2, 5
    packuswb  m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_21, 3,3,5
    movd      m0, [r2]                  ;[x x x A]
    movd      m1, [r2 + 10]             ;[x x x B]
    punpcklbw m1, m0                    ;[x x A B]
    movd      m0, [r2 + 11]             ;[x x C x]
    punpcklwd m0, m1                    ;[A B C x]
    movd      m1, [r2 + 1]              ;[4 3 2 1]
    punpckldq m0, m1                    ;[4 3 2 1 A B C x]
    psrldq    m0, 1                     ;[x 4 3 2 1 A B C]
    punpcklbw m0, m0                    ;[x x 4 4 3 3 2 2 1 1 A A B B C C]
    psrldq    m0, 1
    movh      m1, m0                    ;[x x x 4 4 3 3 2 2 1 1 A A B B C]
    psrldq    m0, 2
    movh      m2, m0                    ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq    m0, 2                     ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor      m4, m4
    punpcklbw m2, m4
    mova      m3, m2
    pmaddwd   m3, [pw_ang_table + 30 * 16]
    punpcklbw m0, m4
    pmaddwd   m0, [pw_ang_table + 15 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m1, m4
    pmaddwd   m1, [pw_ang_table + 28 * 16]
    pmaddwd   m2, [pw_ang_table + 13 * 16]
    packssdw  m2, m1
    paddw     m2, [pw_16]
    psraw     m2, 5
    packuswb  m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_22, 3,3,4
    movd      m1, [r2 - 1]              ;[x x A x]
    movd      m0, [r2 + 9]              ;[x x B x]
    punpcklbw m0, m1                    ;[A B x x]
    movd      m1, [r2 + 1]              ;[4 3 2 1]
    punpckldq m0, m1                    ;[4 3 2 1 A B x x]
    psrldq    m0, 2                     ;[x x 4 3 2 1 A B]
    punpcklbw m0, m0                    ;[x x x x 4 4 3 3 2 2 1 1 A A B B]
    psrldq    m0, 1
    movh      m2, m0                    ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq    m0, 2                     ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor      m1, m1
    punpcklbw m0, m1
    mova      m3, m0
    pmaddwd   m3, [pw_ang_table + 6 * 16]
    pmaddwd   m0, [pw_ang_table + 19 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m2, m1
    mova      m3, m2
    pmaddwd   m3, [pw_ang_table + 12 * 16]
    pmaddwd   m2, [pw_ang_table + 25 * 16]
    packssdw  m2, m3
    paddw     m2, [pw_16]
    psraw     m2, 5
    packuswb  m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_23, 3,3,5
    movd      m1, [r2 - 1]              ;[x x A x]
    movd      m2, [r2 + 1]              ;[4 3 2 1]
    movd      m0, [r2 + 11]             ;[x x B x]
    punpcklbw m0, m1                    ;[x x x x A B x x]
    punpckldq m0, m2                    ;[4 3 2 1 A B x x]
    psrldq    m0, 2                     ;[x x 4 3 2 1 A B]
    punpcklbw m0, m0
    psrldq    m0, 1
    mova      m3, m0                    ;[x x x x x 4 4 3 3 2 2 1 1 A A B]
    psrldq    m0, 2                     ;[x x x x x x x 4 4 3 3 2 2 1 1 A]

    pxor      m1, m1
    punpcklbw m0, m1
    mova      m4, m0
    mova      m2, m0
    pmaddwd   m4, [pw_ang_table + 14 * 16]
    pmaddwd   m0, [pw_ang_table + 23 * 16]
    packssdw  m0, m4
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m3, m1
    pmaddwd   m3, [pw_ang_table + 28 * 16]
    pmaddwd   m2, [pw_ang_table + 5 * 16]
    packssdw  m2, m3
    paddw     m2, [pw_16]
    psraw     m2, 5
    packuswb  m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_25, 3,3,5
    movd      m1, [r2 + 1]              ;[4 3 2 1]
    movh      m0, [r2 - 7]              ;[A x x x x x x x]
    punpcklbw m1, m1                    ;[4 4 3 3 2 2 1 1]
    punpcklqdq m0, m1                   ;[4 4 3 3 2 2 1 1 A x x x x x x x]
    psrldq    m0, 7                     ;[x x x x x x x x 4 3 3 2 2 1 1 A]

    pxor      m1, m1
    punpcklbw m0, m1
    mova      m2, m0
    mova      m3, m0
    mova      m4, m2
    pmaddwd   m3, [pw_ang_table + 28 * 16]
    pmaddwd   m0, [pw_ang_table + 30 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    pmaddwd   m4, [pw_ang_table + 24 * 16]
    pmaddwd   m2, [pw_ang_table + 26 * 16]
    packssdw  m2, m4
    paddw     m2, [pw_16]
    psraw     m2, 5
    packuswb  m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_26, 3,4,4
    movd      m0, [r2 + 1]              ;[4 3 2 1]

    ; store
    movd      [r0], m0
    movd      [r0 + r1], m0
    movd      [r0 + r1 * 2], m0
    lea       r3, [r1 * 3]
    movd      [r0 + r3], m0

    ; filter
    cmp       r4m, byte 0
    jz        .quit

    pxor      m3, m3
    punpcklbw m0, m3
    pshuflw   m0, m0, 0x00
    movd      m2, [r2]
    punpcklbw m2, m3
    pshuflw   m2, m2, 0x00
    movd      m1, [r2 + 9]
    punpcklbw m1, m3
    psubw     m1, m2
    psraw     m1, 1
    paddw     m0, m1
    packuswb  m0, m0

    movd      r2, m0
    mov       [r0], r2b
    shr       r2, 8
    mov       [r0 + r1], r2b
    shr       r2, 8
    mov       [r0 + r1 * 2], r2b
    shr       r2, 8
    mov       [r0 + r3], r2b

.quit:
    RET
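
; Mode 26 is pure vertical prediction; with bFilter the left edge is bent
; toward the top-left corner, which is what the filter tail above computes.
; A C sketch under this file's layout (above row at srcPix + 1, left column
; at srcPix + 9, corner at srcPix[0]); ang4_26_ref is a hypothetical name:
;
;   static void ang4_26_ref(uint8_t *dst, intptr_t stride,
;                           const uint8_t *srcPix, int bFilter)
;   {
;       for (int y = 0; y < 4; y++)
;           for (int x = 0; x < 4; x++)
;               dst[y * stride + x] = srcPix[1 + x];
;       if (bFilter)
;           for (int y = 0; y < 4; y++) {
;               int v = srcPix[1] + ((srcPix[9 + y] - srcPix[0]) >> 1);
;               dst[y * stride] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
;           }
;   }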

cglobal intra_pred_ang4_27, 3,3,5
    movh      m0, [r2 + 1]              ;[8 7 6 5 4 3 2 1]
    punpcklbw m0, m0
    psrldq    m0, 1                     ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pxor      m1, m1
    punpcklbw m0, m1
    mova      m2, m0
    mova      m3, m0
    mova      m4, m2
    pmaddwd   m3, [pw_ang_table + 4 * 16]
    pmaddwd   m0, [pw_ang_table + 2 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    pmaddwd   m4, [pw_ang_table + 8 * 16]
    pmaddwd   m2, [pw_ang_table + 6 * 16]
    packssdw  m2, m4
    paddw     m2, [pw_16]
    psraw     m2, 5
    packuswb  m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_28, 3,3,5
    movh      m0, [r2 + 1]              ;[8 7 6 5 4 3 2 1]
    punpcklbw m0, m0
    psrldq    m0, 1                     ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pxor      m1, m1
    punpcklbw m0, m1
    mova      m2, m0
    mova      m3, m0
    mova      m4, m2
    pmaddwd   m3, [pw_ang_table + 10 * 16]
    pmaddwd   m0, [pw_ang_table + 5 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    pmaddwd   m4, [pw_ang_table + 20 * 16]
    pmaddwd   m2, [pw_ang_table + 15 * 16]
    packssdw  m2, m4
    paddw     m2, [pw_16]
    psraw     m2, 5
    packuswb  m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_29, 3,3,5
    movh      m3, [r2 + 1]              ;[8 7 6 5 4 3 2 1]
    punpcklbw m3, m3
    psrldq    m3, 1
    movh      m0, m3                    ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq    m3, 2                     ;[x x x x x x x x 6 5 5 4 4 3 3 2]

    pxor      m1, m1
    punpcklbw m0, m1
    mova      m4, m0
    mova      m2, m0
    pmaddwd   m4, [pw_ang_table + 18 * 16]
    pmaddwd   m0, [pw_ang_table + 9 * 16]
    packssdw  m0, m4
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m3, m1
    pmaddwd   m3, [pw_ang_table + 4 * 16]
    pmaddwd   m2, [pw_ang_table + 27 * 16]
    packssdw  m2, m3
    paddw     m2, [pw_16]
    psraw     m2, 5
    packuswb  m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_30, 3,3,4
    movh      m2, [r2 + 1]              ;[8 7 6 5 4 3 2 1]
    punpcklbw m2, m2
    psrldq    m2, 1
    movh      m0, m2                    ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq    m2, 2                     ;[x x x 8 8 7 7 6 6 5 5 4 4 3 3 2]

    pxor      m1, m1
    punpcklbw m0, m1
    mova      m3, m0
    pmaddwd   m3, [pw_ang_table + 26 * 16]
    pmaddwd   m0, [pw_ang_table + 13 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m2, m1
    mova      m3, m2
    pmaddwd   m3, [pw_ang_table + 20 * 16]
    pmaddwd   m2, [pw_ang_table + 7 * 16]
    packssdw  m2, m3
    paddw     m2, [pw_16]
    psraw     m2, 5
    packuswb  m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_31, 3,3,5
    movh      m3, [r2 + 1]              ;[8 7 6 5 4 3 2 1]
    punpcklbw m3, m3
    psrldq    m3, 1
    mova      m0, m3                    ;[x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    psrldq    m3, 2
    mova      m2, m3                    ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq    m3, 2                     ;[x x x x x x x x 7 6 6 5 5 4 4 3]

    pxor      m1, m1
    punpcklbw m2, m1
    mova      m4, m2
    pmaddwd   m4, [pw_ang_table + 2 * 16]
    punpcklbw m0, m1
    pmaddwd   m0, [pw_ang_table + 17 * 16]
    packssdw  m0, m4
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m3, m1
    pmaddwd   m3, [pw_ang_table + 4 * 16]
    pmaddwd   m2, [pw_ang_table + 19 * 16]
    packssdw  m2, m3
    paddw     m2, [pw_16]
    psraw     m2, 5
    packuswb  m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_32, 3,3,5
    movh      m1, [r2 + 1]              ;[8 7 6 5 4 3 2 1]
    punpcklbw m1, m1
    psrldq    m1, 1
    movh      m0, m1                    ;[x x x x x x x x 5 4 4 3 3 2 2 1]
    psrldq    m1, 2
    movh      m2, m1                    ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq    m1, 2                     ;[x x x x x x x x 7 6 6 5 5 4 4 3]

    pxor      m4, m4
    punpcklbw m2, m4
    mova      m3, m2
    pmaddwd   m3, [pw_ang_table + 10 * 16]
    punpcklbw m0, m4
    pmaddwd   m0, [pw_ang_table + 21 * 16]
    packssdw  m0, m3
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m1, m4
    pmaddwd   m1, [pw_ang_table + 20 * 16]
    pmaddwd   m2, [pw_ang_table + 31 * 16]
    packssdw  m2, m1
    paddw     m2, [pw_16]
    psraw     m2, 5
    packuswb  m0, m2

    STORE_4x4
    RET

cglobal intra_pred_ang4_33, 3,3,5
    movh      m3, [r2 + 1]              ; [8 7 6 5 4 3 2 1]
    punpcklbw m3, m3
    psrldq    m3, 1
    movh      m0, m3                    ;[x x x x x x x x 5 4 4 3 3 2 2 1]
    psrldq    m3, 2
    movh      m1, m3                    ;[x x x x x x x x 6 5 5 4 4 3 3 2]
    psrldq    m3, 2
    movh      m2, m3                    ;[x x x x x x x x 7 6 6 5 5 4 4 3]
    psrldq    m3, 2                     ;[x x x x x x x x 8 7 7 6 6 5 5 4]

    pxor      m4, m4
    punpcklbw m1, m4
    pmaddwd   m1, [pw_ang_table + 20 * 16]
    punpcklbw m0, m4
    pmaddwd   m0, [pw_ang_table + 26 * 16]
    packssdw  m0, m1
    paddw     m0, [pw_16]
    psraw     m0, 5
    punpcklbw m3, m4
    pmaddwd   m3, [pw_ang_table + 8 * 16]
    punpcklbw m2, m4
    pmaddwd   m2, [pw_ang_table + 14 * 16]
    packssdw  m2, m3
    paddw     m2, [pw_16]
    psraw     m2, 5
    packuswb  m0, m2

    STORE_4x4
    RET

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc4, 5,5,3
    inc       r2
    pxor      m0, m0
    movd      m1, [r2]
    movd      m2, [r2 + 8]
    punpckldq m1, m2
    psadbw    m1, m0                    ; m1 = sum

    test      r4d, r4d

    pmulhrsw  m1, [pw_4096]             ; m1 = (sum + 4) / 8
    movd      r4d, m1                   ; r4d = dc_val
    pshufb    m1, m0                    ; m1 = byte [dc_val ...]

    ; store DC 4x4
    lea       r3, [r1 * 3]
    movd      [r0], m1
    movd      [r0 + r1], m1
    movd      [r0 + r1 * 2], m1
    movd      [r0 + r3], m1

    ; do DC filter
    jz        .end
    lea       r3d, [r4d * 2 + 2]        ; r3d = DC * 2 + 2
    add       r4d, r3d                  ; r4d = DC * 3 + 2
    movd      m1, r4d
    pshuflw   m1, m1, 0                 ; m1 = pixDCx3
    pshufd    m1, m1, 0

    ; filter top
    movd      m2, [r2]
    movd      m0, [r2 + 9]
    punpckldq m2, m0
    pmovzxbw  m2, m2
    paddw     m2, m1
    psraw     m2, 2
    packuswb  m2, m2
    movd      [r0], m2                  ; overwrites the top-left pixel; it is corrected below

    ; filter top-left
    movzx     r4d, byte [r2 + 8]
    add       r3d, r4d
    movzx     r4d, byte [r2]
    add       r3d, r4d
    shr       r3d, 2
    mov       [r0], r3b

    ; filter left
    add       r0, r1
    pextrb    [r0], m2, 4
    pextrb    [r0 + r1], m2, 5
    pextrb    [r0 + r1 * 2], m2, 6

.end:
    RET
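
; The DC recipe shared by the dc4/dc8/dc16 kernels: mean of the above row and
; left column, then (when bFilter) the first row and column are smoothed
; toward it. The 4x4 case in C (hypothetical helper; above = srcPix + 1 and
; left = srcPix + 9, matching the inc r2 / +8 offsets used above):
;
;   static void dc4_ref(uint8_t *dst, intptr_t stride,
;                       const uint8_t *srcPix, int bFilter)
;   {
;       const uint8_t *above = srcPix + 1, *left = srcPix + 9;
;       int sum = 0;
;       for (int i = 0; i < 4; i++)
;           sum += above[i] + left[i];
;       int dc = (sum + 4) >> 3;             /* the pmulhrsw by pw_4096 */
;       for (int y = 0; y < 4; y++)
;           for (int x = 0; x < 4; x++)
;               dst[y * stride + x] = (uint8_t)dc;
;       if (bFilter) {
;           dst[0] = (uint8_t)((above[0] + left[0] + 2 * dc + 2) >> 2);
;           for (int i = 1; i < 4; i++) {
;               dst[i]          = (uint8_t)((above[i] + 3 * dc + 2) >> 2);
;               dst[i * stride] = (uint8_t)((left[i] + 3 * dc + 2) >> 2);
;           }
;       }
;   }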

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc8, 5, 7, 3
    lea       r3, [r2 + 17]
    inc       r2
    pxor      m0, m0
    movh      m1, [r2]
    movh      m2, [r3]
    punpcklqdq m1, m2
    psadbw    m1, m0
    pshufd    m2, m1, 2
    paddw     m1, m2

    movd      r5d, m1
    add       r5d, 8
    shr       r5d, 4                    ; sum = sum / 16
    movd      m1, r5d
    pshufb    m1, m0                    ; m1 = byte [dc_val ...]

    test      r4d, r4d

    ; store DC 8x8
    mov       r6, r0
    movh      [r0], m1
    movh      [r0 + r1], m1
    lea       r0, [r0 + r1 * 2]
    movh      [r0], m1
    movh      [r0 + r1], m1
    lea       r0, [r0 + r1 * 2]
    movh      [r0], m1
    movh      [r0 + r1], m1
    lea       r0, [r0 + r1 * 2]
    movh      [r0], m1
    movh      [r0 + r1], m1

    ; Do DC Filter
    jz        .end
    lea       r4d, [r5d * 2 + 2]        ; r4d = DC * 2 + 2
    add       r5d, r4d                  ; r5d = DC * 3 + 2
    movd      m1, r5d
    pshuflw   m1, m1, 0                 ; m1 = pixDCx3
    pshufd    m1, m1, 0

    ; filter top
    pmovzxbw  m2, [r2]
    paddw     m2, m1
    psraw     m2, 2
    packuswb  m2, m2
    movh      [r6], m2

    ; filter top-left
    movzx     r5d, byte [r3]
    add       r4d, r5d
    movzx     r3d, byte [r2]
    add       r3d, r4d
    shr       r3d, 2
    mov       [r6], r3b

    ; filter left
    add       r6, r1
    pmovzxbw  m2, [r2 + 17]
    paddw     m2, m1
    psraw     m2, 2
    packuswb  m2, m2
    pextrb    [r6], m2, 0
    pextrb    [r6 + r1], m2, 1
    pextrb    [r6 + 2 * r1], m2, 2
    lea       r6, [r6 + r1 * 2]
    pextrb    [r6 + r1], m2, 3
    pextrb    [r6 + r1 * 2], m2, 4
    pextrb    [r6 + r1 * 4], m2, 6
    lea       r1, [r1 * 3]
    pextrb    [r6 + r1], m2, 5

.end:
    RET

;--------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;--------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc16, 5, 7, 4
    lea       r3, [r2 + 33]
    inc       r2
    pxor      m0, m0
    movu      m1, [r2]
    movu      m2, [r3]
    psadbw    m1, m0
    psadbw    m2, m0
    paddw     m1, m2
    pshufd    m2, m1, 2
    paddw     m1, m2

    movd      r5d, m1
    add       r5d, 16
    shr       r5d, 5                    ; sum = sum / 32
    movd      m1, r5d
    pshufb    m1, m0                    ; m1 = byte [dc_val ...]

    test      r4d, r4d

    ; store DC 16x16
    mov       r6, r0
    movu      [r0], m1
    movu      [r0 + r1], m1
    lea       r0, [r0 + r1 * 2]
    movu      [r0], m1
    movu      [r0 + r1], m1
    lea       r0, [r0 + r1 * 2]
    movu      [r0], m1
    movu      [r0 + r1], m1
    lea       r0, [r0 + r1 * 2]
    movu      [r0], m1
    movu      [r0 + r1], m1
    lea       r0, [r0 + r1 * 2]
    movu      [r0], m1
    movu      [r0 + r1], m1
    lea       r0, [r0 + r1 * 2]
    movu      [r0], m1
    movu      [r0 + r1], m1
    lea       r0, [r0 + r1 * 2]
    movu      [r0], m1
    movu      [r0 + r1], m1
    lea       r0, [r0 + r1 * 2]
    movu      [r0], m1
    movu      [r0 + r1], m1

    ; Do DC Filter
    jz        .end
    lea       r4d, [r5d * 2 + 2]        ; r4d = DC * 2 + 2
    add       r5d, r4d                  ; r5d = DC * 3 + 2
    movd      m1, r5d
    pshuflw   m1, m1, 0                 ; m1 = pixDCx3
    pshufd    m1, m1, 0

    ; filter top
    pmovzxbw  m2, [r2]
    paddw     m2, m1
    psraw     m2, 2
    packuswb  m2, m2
    movh      [r6], m2
    pmovzxbw  m3, [r2 + 8]
    paddw     m3, m1
    psraw     m3, 2
    packuswb  m3, m3
    movh      [r6 + 8], m3

    ; filter top-left
    movzx     r5d, byte [r3]
    add       r4d, r5d
    movzx     r3d, byte [r2]
    add       r3d, r4d
    shr       r3d, 2
    mov       [r6], r3b

    ; filter left
    add       r6, r1
    pmovzxbw  m2, [r2 + 33]
    paddw     m2, m1
    psraw     m2, 2
    packuswb  m2, m2
    pextrb    [r6], m2, 0
    pextrb    [r6 + r1], m2, 1
    pextrb    [r6 + r1 * 2], m2, 2
    lea       r6, [r6 + r1 * 2]
    pextrb    [r6 + r1], m2, 3
    pextrb    [r6 + r1 * 2], m2, 4
    lea       r6, [r6 + r1 * 2]
    pextrb    [r6 + r1], m2, 5
    pextrb    [r6 + r1 * 2], m2, 6
    lea       r6, [r6 + r1 * 2]
    pextrb    [r6 + r1], m2, 7

    pmovzxbw  m3, [r2 + 41]
    paddw     m3, m1
    psraw     m3, 2
    packuswb  m3, m3
    pextrb    [r6 + r1 * 2], m3, 0
    lea       r6, [r6 + r1 * 2]
    pextrb    [r6 + r1], m3, 1
    pextrb    [r6 + r1 * 2], m3, 2
    lea       r6, [r6 + r1 * 2]
    pextrb    [r6 + r1], m3, 3
    pextrb    [r6 + r1 * 2], m3, 4
    lea       r6, [r6 + r1 * 2]
    pextrb    [r6 + r1], m3, 5
    pextrb    [r6 + r1 * 2], m3, 6

.end:
    RET

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc32, 3, 5, 5
    lea       r3, [r2 + 65]
    inc       r2
    pxor      m0, m0
    movu      m1, [r2]
    movu      m2, [r2 + 16]
    movu      m3, [r3]
    movu      m4, [r3 + 16]
    psadbw    m1, m0
    psadbw    m2, m0
    psadbw    m3, m0
    psadbw    m4, m0
    paddw     m1, m2
    paddw     m3, m4
    paddw     m1, m3
    pshufd    m2, m1, 2
    paddw     m1, m2

    movd      r4d, m1
    add       r4d, 32
    shr       r4d, 6                    ; sum = sum / 64
    movd      m1, r4d
    pshufb    m1, m0                    ; m1 = byte [dc_val ...]

%rep 2
    ; store DC 32x16
    movu      [r0], m1
    movu      [r0 + r1], m1
    movu      [r0 + 16], m1
    movu      [r0 + r1 + 16], m1
    lea       r0, [r0 + 2 * r1]
    movu      [r0], m1
    movu      [r0 + r1], m1
    movu      [r0 + 16], m1
    movu      [r0 + r1 + 16], m1
    lea       r0, [r0 + 2 * r1]
    movu      [r0], m1
    movu      [r0 + r1], m1
    movu      [r0 + 16], m1
    movu      [r0 + r1 + 16], m1
    lea       r0, [r0 + 2 * r1]
    movu      [r0], m1
    movu      [r0 + r1], m1
    movu      [r0 + 16], m1
    movu      [r0 + r1 + 16], m1
    lea       r0, [r0 + 2 * r1]
    movu      [r0], m1
    movu      [r0 + r1], m1
    movu      [r0 + 16], m1
    movu      [r0 + r1 + 16], m1
    lea       r0, [r0 + 2 * r1]
    movu      [r0], m1
    movu      [r0 + r1], m1
    movu      [r0 + 16], m1
    movu      [r0 + r1 + 16], m1
    lea       r0, [r0 + 2 * r1]
    movu      [r0], m1
    movu      [r0 + r1], m1
    movu      [r0 + 16], m1
    movu      [r0 + r1 + 16], m1
    lea       r0, [r0 + 2 * r1]
    movu      [r0], m1
    movu      [r0 + r1], m1
    movu      [r0 + 16], m1
    movu      [r0 + r1 + 16], m1
    lea       r0, [r0 + 2 * r1]
%endrep

    RET

;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal intra_pred_dc32, 3, 4, 3
    lea       r3, [r1 * 3]
    pxor      m0, m0
    movu      m1, [r2 + 1]
    movu      m2, [r2 + 65]
    psadbw    m1, m0
    psadbw    m2, m0
    paddw     m1, m2
    vextracti128 xm2, m1, 1
    paddw     m1, m2
    pshufd    m2, m1, 2
    paddw     m1, m2

    pmulhrsw  m1, [pw_512]              ; sum = (sum + 32) / 64
    vpbroadcastb m1, xm1                ; m1 = byte [dc_val ...]

    movu      [r0 + r1 * 0], m1
    movu      [r0 + r1 * 1], m1
    movu      [r0 + r1 * 2], m1
    movu      [r0 + r3 * 1], m1
    lea       r0, [r0 + 4 * r1]
    movu      [r0 + r1 * 0], m1
    movu      [r0 + r1 * 1], m1
    movu      [r0 + r1 * 2], m1
    movu      [r0 + r3 * 1], m1
    lea       r0, [r0 + 4 * r1]
    movu      [r0 + r1 * 0], m1
    movu      [r0 + r1 * 1], m1
    movu      [r0 + r1 * 2], m1
    movu      [r0 + r3 * 1], m1
    lea       r0, [r0 + 4 * r1]
    movu      [r0 + r1 * 0], m1
    movu      [r0 + r1 * 1], m1
    movu      [r0 + r1 * 2], m1
    movu      [r0 + r3 * 1], m1
    lea       r0, [r0 + 4 * r1]
    movu      [r0 + r1 * 0], m1
    movu      [r0 + r1 * 1], m1
    movu      [r0 + r1 * 2], m1
    movu      [r0 + r3 * 1], m1
    lea       r0, [r0 + 4 * r1]
    movu      [r0 + r1 * 0], m1
    movu      [r0 + r1 * 1], m1
    movu      [r0 + r1 * 2], m1
    movu      [r0 + r3 * 1], m1
    lea       r0, [r0 + 4 * r1]
    movu      [r0 + r1 * 0], m1
    movu      [r0 + r1 * 1], m1
    movu      [r0 + r1 * 2], m1
    movu      [r0 + r3 * 1], m1
    lea       r0, [r0 + 4 * r1]
    movu      [r0 + r1 * 0], m1
    movu      [r0 + r1 * 1], m1
    movu      [r0 + r1 * 2], m1
    movu      [r0 + r3 * 1], m1
    RET
%endif ;; ARCH_X86_64 == 1

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_planar4, 3,3,7
    pmovzxbw  m1, [r2 + 1]
    pmovzxbw  m2, [r2 + 9]
    pshufhw   m3, m1, 0                 ; topRight
    pshufd    m3, m3, 0xAA
    pshufhw   m4, m2, 0                 ; bottomLeft
    pshufd    m4, m4, 0xAA
    pmullw    m3, [multi_2Row]          ; (x + 1) * topRight
    pmullw    m0, m1, [pw_3]            ; (blkSize - 1 - y) * above[x]
    mova      m6, [pw_planar4_0]
    paddw     m3, [pw_4]
    paddw     m3, m4
    paddw     m3, m0
    psubw     m4, m1

    pshuflw   m5, m2, 0
    pmullw    m5, m6
    paddw     m5, m3
    paddw     m3, m4
    psraw     m5, 3
    packuswb  m5, m5
    movd      [r0], m5

    pshuflw   m5, m2, 01010101b
    pmullw    m5, m6
    paddw     m5, m3
    paddw     m3, m4
    psraw     m5, 3
    packuswb  m5, m5
    movd      [r0 + r1], m5
    lea       r0, [r0 + 2 * r1]

    pshuflw   m5, m2, 10101010b
    pmullw    m5, m6
    paddw     m5, m3
    paddw     m3, m4
    psraw     m5, 3
    packuswb  m5, m5
    movd      [r0], m5

    pshuflw   m5, m2, 11111111b
    pmullw    m5, m6
    paddw     m5, m3
    paddw     m3, m4
    psraw     m5, 3
    packuswb  m5, m5
    movd      [r0 + r1], m5
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_planar8, 3,3,7
    pmovzxbw  m1, [r2 + 1]
    pmovzxbw  m2, [r2 + 17]

    movd      m3, [r2 + 9]              ; topRight = above[8];
    movd      m4, [r2 + 25]             ; bottomLeft = left[8];

    pxor      m0, m0
    pshufb    m3, m0
    pshufb    m4, m0
    punpcklbw m3, m0                    ; v_topRight
    punpcklbw m4, m0                    ; v_bottomLeft
    pmullw    m3, [multiL]              ; (x + 1) * topRight
    pmullw    m0, m1, [pw_7]            ; (blkSize - 1 - y) * above[x]
    mova      m6, [pw_planar16_mul + mmsize]
    paddw     m3, [pw_8]
    paddw     m3, m4
    paddw     m3, m0
    psubw     m4, m1

%macro INTRA_PRED_PLANAR8 1
%if (%1 < 4)
    pshuflw   m5, m2, 0x55 * %1
    pshufd    m5, m5, 0
%else
    pshufhw   m5, m2, 0x55 * (%1 - 4)
    pshufd    m5, m5, 0xAA
%endif
    pmullw    m5, m6
    paddw     m5, m3
    paddw     m3, m4
    psraw     m5, 4
    packuswb  m5, m5
    movh      [r0], m5
    lea       r0, [r0 + r1]
%endmacro

    INTRA_PRED_PLANAR8 0
    INTRA_PRED_PLANAR8 1
    INTRA_PRED_PLANAR8 2
    INTRA_PRED_PLANAR8 3
    INTRA_PRED_PLANAR8 4
    INTRA_PRED_PLANAR8 5
    INTRA_PRED_PLANAR8 6
    INTRA_PRED_PLANAR8 7
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_planar16, 3,3,8
    pmovzxbw  m2, [r2 + 1]
    pmovzxbw  m7, [r2 + 9]

    movd      m3, [r2 + 17]             ; topRight = above[16]
    movd      m6, [r2 + 49]             ; bottomLeft = left[16]

    pxor      m0, m0
    pshufb    m3, m0
    pshufb    m6, m0
    punpcklbw m3, m0                    ; v_topRight
    punpcklbw m6, m0                    ; v_bottomLeft
    pmullw    m4, m3, [multiH]          ; (x + 1) * topRight
    pmullw    m3, [multiL]              ; (x + 1) * topRight
    pmullw    m1, m2, [pw_15]           ; (blkSize - 1 - y) * above[x]
    pmullw    m5, m7, [pw_15]           ; (blkSize - 1 - y) * above[x]
    paddw     m4, [pw_16]
    paddw     m3, [pw_16]
    paddw     m4, m6
    paddw     m3, m6
    paddw     m4, m5
    paddw     m3, m1
    psubw     m1, m6, m7
    psubw     m6, m2

    pmovzxbw  m2, [r2 + 33]
    pmovzxbw  m7, [r2 + 41]

%macro INTRA_PRED_PLANAR16 1
%if (%1 < 4)
    pshuflw   m5, m2, 0x55 * %1
    pshufd    m5, m5, 0
%else
%if (%1 < 8)
    pshufhw   m5, m2, 0x55 * (%1 - 4)
    pshufd    m5, m5, 0xAA
%else
%if (%1 < 12)
    pshuflw   m5, m7, 0x55 * (%1 - 8)
    pshufd    m5, m5, 0
%else
    pshufhw   m5, m7, 0x55 * (%1 - 12)
    pshufd    m5, m5, 0xAA
%endif
%endif
%endif
    pmullw    m0, m5, [pw_planar16_mul + mmsize]
    pmullw    m5, [pw_planar16_mul]
    paddw     m0, m4
    paddw     m5, m3
    paddw     m3, m6
    paddw     m4, m1
    psraw     m5, 5
    psraw     m0, 5
    packuswb  m5, m0
    movu      [r0], m5
    lea       r0, [r0 + r1]
%endmacro

    INTRA_PRED_PLANAR16 0
    INTRA_PRED_PLANAR16 1
    INTRA_PRED_PLANAR16 2
    INTRA_PRED_PLANAR16 3
    INTRA_PRED_PLANAR16 4
    INTRA_PRED_PLANAR16 5
    INTRA_PRED_PLANAR16 6
    INTRA_PRED_PLANAR16 7
    INTRA_PRED_PLANAR16 8
    INTRA_PRED_PLANAR16 9
    INTRA_PRED_PLANAR16 10
    INTRA_PRED_PLANAR16 11
    INTRA_PRED_PLANAR16 12
    INTRA_PRED_PLANAR16 13
    INTRA_PRED_PLANAR16 14
    INTRA_PRED_PLANAR16 15
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal intra_pred_planar16, 3,3,6
    vpbroadcastw m3, [r2 + 17]
    mova      m5, [pw_00ff]
    vpbroadcastw m4, [r2 + 49]
    mova      m0, [pw_planar16_mul]
    pmovzxbw  m2, [r2 + 1]
    pand      m3, m5                    ; v_topRight
    pand      m4, m5                    ; v_bottomLeft

    pmullw    m3, [multiL]              ; (x + 1) * topRight
    pmullw    m1, m2, [pw_15]           ; (blkSize - 1 - y) * above[x]
    paddw     m3, [pw_16]
    paddw     m3, m4
    paddw     m3, m1
    psubw     m4, m2
    add       r2, 33

%macro INTRA_PRED_PLANAR16_AVX2 1
    vpbroadcastw m1, [r2 + %1]
    vpsrlw    m2, m1, 8
    pand      m1, m5

    pmullw    m1, m0
    pmullw    m2, m0
    paddw     m1, m3
    paddw     m3, m4
    psraw     m1, 5
    paddw     m2, m3
    psraw     m2, 5
    paddw     m3, m4
    packuswb  m1, m2
    vpermq    m1, m1, 11011000b
    movu      [r0], xm1
    vextracti128 [r0 + r1], m1, 1
    lea       r0, [r0 + r1 * 2]
%endmacro
    INTRA_PRED_PLANAR16_AVX2 0
    INTRA_PRED_PLANAR16_AVX2 2
    INTRA_PRED_PLANAR16_AVX2 4
    INTRA_PRED_PLANAR16_AVX2 6
    INTRA_PRED_PLANAR16_AVX2 8
    INTRA_PRED_PLANAR16_AVX2 10
    INTRA_PRED_PLANAR16_AVX2 12
    INTRA_PRED_PLANAR16_AVX2 14
%undef INTRA_PRED_PLANAR16_AVX2
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_XMM sse4
%if ARCH_X86_64 == 1
cglobal intra_pred_planar32, 3,4,12
%else
cglobal intra_pred_planar32, 3,4,8,0-(4*mmsize)
%define m8 [rsp + 0 * mmsize]
%define m9 [rsp + 1 * mmsize]
%define m10 [rsp + 2 * mmsize]
%define m11 [rsp + 3 * mmsize]
%endif
    movd      m3, [r2 + 33]             ; topRight = above[32]

    pxor      m7, m7
    pshufb    m3, m7
    punpcklbw m3, m7                    ; v_topRight

    pmullw    m0, m3, [multiL]          ; (x + 1) * topRight
    pmullw    m1, m3, [multiH]          ; (x + 1) * topRight
    pmullw    m2, m3, [multiH2]         ; (x + 1) * topRight
    pmullw    m3, [multiH3]             ; (x + 1) * topRight

    movd      m6, [r2 + 97]             ; bottomLeft = left[32]
    pshufb    m6, m7
    punpcklbw m6, m7                    ; v_bottomLeft

    paddw     m0, m6
    paddw     m1, m6
    paddw     m2, m6
    paddw     m3, m6
    paddw     m0, [pw_32]
    paddw     m1, [pw_32]
    paddw     m2, [pw_32]
    paddw     m3, [pw_32]
    pmovzxbw  m4, [r2 + 1]
    pmullw    m5, m4, [pw_31]
    paddw     m0, m5
    psubw     m5, m6, m4
    mova      m8, m5
    pmovzxbw  m4, [r2 + 9]
    pmullw    m5, m4, [pw_31]
    paddw     m1, m5
    psubw     m5, m6, m4
    mova      m9, m5
    pmovzxbw  m4, [r2 + 17]
    pmullw    m5, m4, [pw_31]
    paddw     m2, m5
    psubw     m5, m6, m4
    mova      m10, m5
    pmovzxbw  m4, [r2 + 25]
    pmullw    m5, m4, [pw_31]
    paddw     m3, m5
    psubw     m5, m6, m4
    mova      m11, m5
    add       r2, 65                    ; (2 * blkSize + 1)

%macro INTRA_PRED_PLANAR32 0
    movd      m4, [r2]
    pshufb    m4, m7
    punpcklbw m4, m7
    pmullw    m5, m4, [pw_planar32_mul]
    pmullw    m6, m4, [pw_planar32_mul + mmsize]
    paddw     m5, m0
    paddw     m6, m1
    paddw     m0, m8
    paddw     m1, m9
    psraw     m5, 6
    psraw     m6, 6
    packuswb  m5, m6
    movu      [r0], m5
    pmullw    m5, m4, [pw_planar16_mul]
    pmullw    m4, [pw_planar16_mul + mmsize]
    paddw     m5, m2
    paddw     m4, m3
    paddw     m2, m10
    paddw     m3, m11
    psraw     m5, 6
    psraw     m4, 6
    packuswb  m5, m4
    movu      [r0 + 16], m5

    lea       r0, [r0 + r1]
    inc       r2
%endmacro

    mov       r3, 4
.loop:
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    INTRA_PRED_PLANAR32
    dec       r3
    jnz       .loop
    RET

;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter)
;---------------------------------------------------------------------------------------
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal intra_pred_planar32, 3,4,11
    mova      m6, [pw_00ff]
    vpbroadcastw m3, [r2 + 33]          ; topRight = above[32]
    vpbroadcastw m2, [r2 + 97]          ; bottomLeft = left[32]
    pand      m3, m6
    pand      m2, m6

    pmullw    m0, m3, [multiL]          ; (x + 1) * topRight
    pmullw    m3, [multiH2]             ; (x + 1) * topRight

    paddw     m0, m2
    paddw     m3, m2
    paddw     m0, [pw_32]
    paddw     m3, [pw_32]

    pmovzxbw  m4, [r2 + 1]
    pmovzxbw  m1, [r2 + 17]
    pmullw    m5, m4, [pw_31]
    paddw     m0, m5
    psubw     m5, m2, m4
    psubw     m2, m1
    pmullw    m1, [pw_31]
    paddw     m3, m1
    mova      m1, m5

    add       r2, 65                    ; (2 * blkSize + 1)
    mova      m9, [pw_planar32_mul]
    mova      m10, [pw_planar16_mul]

%macro INTRA_PRED_PLANAR32_AVX2 0
    vpbroadcastw m4, [r2]
    vpsrlw    m7, m4, 8
    pand      m4, m6

    pmullw    m5, m4, m9
    pmullw    m4, m4, m10
    paddw     m5, m0
    paddw     m4, m3
    paddw     m0, m1
    paddw     m3, m2
    psraw     m5, 6
    psraw     m4, 6
    packuswb  m5, m4
    pmullw    m8, m7, m9
    pmullw    m7, m7, m10
    vpermq    m5, m5, 11011000b
    paddw     m8, m0
    paddw     m7, m3
    paddw     m0, m1
    paddw     m3, m2
    psraw     m8, 6
    psraw     m7, 6
    packuswb  m8, m7
    add       r2, 2
    vpermq    m8, m8, 11011000b

    movu      [r0], m5
    movu      [r0 + r1], m8
    lea       r0, [r0 + r1 * 2]
%endmacro
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
    INTRA_PRED_PLANAR32_AVX2
%undef INTRA_PRED_PLANAR32_AVX2
    RET
%endif ;; ARCH_X86_64 == 1

;-----------------------------------------------------------------------------------------
; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
INIT_XMM ssse3
cglobal intra_pred_ang4_2, 3,5,3
    lea       r4, [r2 + 2]
    add       r2, 10
    cmp       r3m, byte 34
    cmove     r2, r4

    movh      m0, [r2]
    movd      [r0], m0
    palignr   m1, m0, 1
    movd      [r0 + r1], m1
    palignr   m2, m0, 2
    movd      [r0 + r1 * 2], m2
    lea       r1, [r1 * 3]
    psrldq    m0, 3
    movd      [r0 + r1], m0
    RET

INIT_XMM sse4
cglobal intra_pred_ang4_3, 3,5,5
    mov       r4, 1
    cmp       r3m, byte 33
    mov       r3, 9
    cmove     r3, r4

    movh      m0, [r2 + r3]             ; [8 7 6 5 4 3 2 1]
    palignr   m1, m0, 1                 ; [x 8 7 6 5 4 3 2]
    punpcklbw m0, m1                    ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr   m1, m0, 2                 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    palignr   m2, m0, 4                 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
    palignr   m3, m0, 6                 ; [x x x x x x x x 8 7 7 6 6 5 5 4]
    punpcklqdq m0, m1
    punpcklqdq m2, m3

    lea       r3, [ang_table + 20 * 16]
    movh      m3, [r3 + 6 * 16]         ; [26]
    movhps    m3, [r3]                  ; [20]
    movh      m4, [r3 - 6 * 16]         ; [14]
    movhps    m4, [r3 - 12 * 16]        ; [ 8]
    jmp       .do_filter4x4

    ; NOTE: share path, input is m0=[1 0], m2=[3 2], m3,m4=coef, flag_z=no_transpose
ALIGN 16
.do_filter4x4:
    mova      m1, [pw_1024]

    pmaddubsw m0, m3
    pmulhrsw  m0, m1
    pmaddubsw m2, m4
    pmulhrsw  m2, m1
    packuswb  m0, m2

    ; NOTE: mode 33 needs no transpose. UNSAFE: this relies on no instruction
    ; between the cmp above and this jz modifying the EFLAGS register.
    jz        .store

    ; transpose 4x4
    pshufb    m0, [c_trans_4x4]

.store:
    ; TODO: use pextrd here after intrinsic ssse3 removed
    movd      [r0], m0
    pextrd    [r0 + r1], m0, 1
    pextrd    [r0 + r1 * 2], m0, 2
    lea       r1, [r1 * 3]
    pextrd    [r0 + r1], m0, 3
    RET
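
; The pmulhrsw by pw_1024 above folds the "+16 then >> 5" rounding into one
; instruction: (v * 1024 + 0x4000) >> 15 == (v + 16) >> 5 exactly, and the
; pmaddubsw result (at most 32 * 255 = 8160) stays within int16_t range.
; A C check of the identity (illustrative only):
;
;   #include <assert.h>
;   #include <stdint.h>
;
;   static int16_t mulhrsw(int16_t v, int16_t c)
;   {
;       return (int16_t)(((int32_t)v * c + 0x4000) >> 15);
;   }
;
;   int main(void)
;   {
;       for (int v = 0; v <= 32 * 255; v++)
;           assert(mulhrsw((int16_t)v, 1024) == ((v + 16) >> 5));
;       return 0;
;   }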

cglobal intra_pred_ang4_4, 3,5,5
    xor       r4, r4
    inc       r4
    cmp       r3m, byte 32
    mov       r3, 9
    cmove     r3, r4

    movh      m0, [r2 + r3]             ; [8 7 6 5 4 3 2 1]
    palignr   m1, m0, 1                 ; [x 8 7 6 5 4 3 2]
    punpcklbw m0, m1                    ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr   m1, m0, 2                 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    palignr   m3, m0, 4                 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
    punpcklqdq m0, m1
    punpcklqdq m2, m1, m3

    lea       r3, [ang_table + 18 * 16]
    movh      m3, [r3 + 3 * 16]         ; [21]
    movhps    m3, [r3 - 8 * 16]         ; [10]
    movh      m4, [r3 + 13 * 16]        ; [31]
    movhps    m4, [r3 + 2 * 16]         ; [20]
    jmp       mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_5, 3,5,5
    xor       r4, r4
    inc       r4
    cmp       r3m, byte 31
    mov       r3, 9
    cmove     r3, r4

    movh      m0, [r2 + r3]             ; [8 7 6 5 4 3 2 1]
    palignr   m1, m0, 1                 ; [x 8 7 6 5 4 3 2]
    punpcklbw m0, m1                    ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr   m1, m0, 2                 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    palignr   m3, m0, 4                 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
    punpcklqdq m0, m1
    punpcklqdq m2, m1, m3

    lea       r3, [ang_table + 10 * 16]
    movh      m3, [r3 + 7 * 16]         ; [17]
    movhps    m3, [r3 - 8 * 16]         ; [ 2]
    movh      m4, [r3 + 9 * 16]         ; [19]
    movhps    m4, [r3 - 6 * 16]         ; [ 4]
    jmp       mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_6, 3,5,5
    xor       r4, r4
    inc       r4
    cmp       r3m, byte 30
    mov       r3, 9
    cmove     r3, r4

    movh      m0, [r2 + r3]             ; [8 7 6 5 4 3 2 1]
    palignr   m1, m0, 1                 ; [x 8 7 6 5 4 3 2]
    punpcklbw m0, m1                    ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr   m2, m0, 2                 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    punpcklqdq m0, m0
    punpcklqdq m2, m2

    lea       r3, [ang_table + 19 * 16]
    movh      m3, [r3 - 6 * 16]         ; [13]
    movhps    m3, [r3 + 7 * 16]         ; [26]
    movh      m4, [r3 - 12 * 16]        ; [ 7]
    movhps    m4, [r3 + 1 * 16]         ; [20]
    jmp       mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_7, 3,5,5
    xor       r4, r4
    inc       r4
    cmp       r3m, byte 29
    mov       r3, 9
    cmove     r3, r4

    movh      m0, [r2 + r3]             ; [8 7 6 5 4 3 2 1]
    palignr   m1, m0, 1                 ; [x 8 7 6 5 4 3 2]
    punpcklbw m0, m1                    ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr   m3, m0, 2                 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
    punpcklqdq m2, m0, m3
    punpcklqdq m0, m0

    lea       r3, [ang_table + 20 * 16]
    movh      m3, [r3 - 11 * 16]        ; [ 9]
    movhps    m3, [r3 - 2 * 16]         ; [18]
    movh      m4, [r3 + 7 * 16]         ; [27]
    movhps    m4, [r3 - 16 * 16]        ; [ 4]
    jmp       mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_8, 3,5,5
    xor       r4, r4
    inc       r4
    cmp       r3m, byte 28
    mov       r3, 9
    cmove     r3, r4

    movh      m0, [r2 + r3]             ; [8 7 6 5 4 3 2 1]
    palignr   m1, m0, 1                 ; [x 8 7 6 5 4 3 2]
    punpcklbw m0, m1                    ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklqdq m0, m0
    mova      m2, m0

    lea       r3, [ang_table + 13 * 16]
    movh      m3, [r3 - 8 * 16]         ; [ 5]
    movhps    m3, [r3 - 3 * 16]         ; [10]
    movh      m4, [r3 + 2 * 16]         ; [15]
    movhps    m4, [r3 + 7 * 16]         ; [20]
    jmp       mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_9, 3,5,5
    xor       r4, r4
    inc       r4
    cmp       r3m, byte 27
    mov       r3, 9
    cmove     r3, r4

    movh      m0, [r2 + r3]             ; [8 7 6 5 4 3 2 1]
    palignr   m1, m0, 1                 ; [x 8 7 6 5 4 3 2]
    punpcklbw m0, m1                    ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklqdq m0, m0
    mova      m2, m0

    lea       r3, [ang_table + 4 * 16]
    movh      m3, [r3 - 2 * 16]         ; [ 2]
    movhps    m3, [r3 - 0 * 16]         ; [ 4]
    movh      m4, [r3 + 2 * 16]         ; [ 6]
    movhps    m4, [r3 + 4 * 16]         ; [ 8]
    jmp       mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_10, 3,3,4
    movd      m0, [r2 + 9]              ; [4 3 2 1]
    pshufb    m0, [pb_unpackbd1]
    pshufd    m1, m0, 1
    movhlps   m2, m0
    pshufd    m3, m0, 3
    movd      [r0 + r1], m1
    movd      [r0 + r1 * 2], m2
    lea       r1, [r1 * 3]
    movd      [r0 + r1], m3
    cmp       r4m, byte 0
    jz        .quit

    ; filter
    pmovzxbw  m0, m0                    ; [-1 -1 -1 -1]
    movh      m1, [r2]                  ; [4 3 2 1 0]
    pshufb    m2, m1, [pb_0_8]          ; [0 0 0 0]
    pshufb    m1, [pb_unpackbw1]        ; [4 3 2 1]
    psubw     m1, m2
    psraw     m1, 1
    paddw     m0, m1
    packuswb  m0, m0
.quit:
    movd      [r0], m0
    RET
|
|
|
|
INIT_XMM sse4
|
|
cglobal intra_pred_ang4_26, 3,4,3
|
|
movd m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
|
|
|
|
; store
|
|
movd [r0], m0
|
|
movd [r0 + r1], m0
|
|
movd [r0 + r1 * 2], m0
|
|
lea r3, [r1 * 3]
|
|
movd [r0 + r3], m0
|
|
|
|
; filter
|
|
cmp r4m, byte 0
|
|
jz .quit
|
|
|
|
pshufb m0, [pb_0_8] ; [ 1 1 1 1]
|
|
movh m1, [r2 + 8] ; [-4 -3 -2 -1 0]
|
|
pinsrb m1, [r2], 0
|
|
pshufb m2, m1, [pb_0_8] ; [0 0 0 0]
|
|
pshufb m1, [pb_unpackbw1] ; [-4 -3 -2 -1]
|
|
psubw m1, m2
|
|
psraw m1, 1
|
|
paddw m0, m1
|
|
packuswb m0, m0
|
|
|
|
pextrb [r0], m0, 0
|
|
pextrb [r0 + r1], m0, 1
|
|
pextrb [r0 + r1 * 2], m0, 2
|
|
pextrb [r0 + r3], m0, 3
|
|
.quit:
|
|
RET

cglobal intra_pred_ang4_11, 3,5,5
xor r4, r4
cmp r3m, byte 25
mov r3, 8
cmove r3, r4

movh m0, [r2 + r3] ; [x x x 4 3 2 1 0]
pinsrb m0, [r2], 0
palignr m1, m0, 1 ; [x x x x 4 3 2 1]
punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0]
punpcklqdq m0, m0
mova m2, m0

lea r3, [ang_table + 24 * 16]

movh m3, [r3 + 6 * 16] ; [30]
movhps m3, [r3 + 4 * 16] ; [28]
movh m4, [r3 + 2 * 16] ; [26]
movhps m4, [r3 + 0 * 16] ; [24]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_12, 3,5,5
xor r4, r4
cmp r3m, byte 24
mov r3, 8
cmove r3, r4

movh m0, [r2 + r3] ; [x x x 4 3 2 1 0]
pinsrb m0, [r2], 0
palignr m1, m0, 1 ; [x x x x 4 3 2 1]
punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0]
punpcklqdq m0, m0
mova m2, m0

lea r3, [ang_table + 20 * 16]
movh m3, [r3 + 7 * 16] ; [27]
movhps m3, [r3 + 2 * 16] ; [22]
movh m4, [r3 - 3 * 16] ; [17]
movhps m4, [r3 - 8 * 16] ; [12]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_13, 4,5,5
xor r4, r4
cmp r3m, byte 23
mov r3, 8
jz .next
xchg r3, r4
.next:
movh m1, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x]
pinsrb m1, [r2], 1
palignr m0, m1, 1 ; [x x x 4 3 2 1 0]
palignr m2, m1, 2 ; [x x x x 4 3 2 1]
pinsrb m1, [r2 + r3 + 4], 0
punpcklbw m1, m0 ; [3 2 2 1 1 0 0 x]
punpcklbw m0, m2 ; [4 3 3 2 2 1 1 0]
punpcklqdq m2, m0, m1
punpcklqdq m0, m0

lea r3, [ang_table + 21 * 16]
movh m3, [r3 + 2 * 16] ; [23]
movhps m3, [r3 - 7 * 16] ; [14]
movh m4, [r3 - 16 * 16] ; [ 5]
movhps m4, [r3 + 7 * 16] ; [28]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_14, 4,5,5
xor r4, r4
cmp r3m, byte 22
mov r3, 8
jz .next
xchg r3, r4
.next:
movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x]
pinsrb m2, [r2], 1
palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
palignr m1, m2, 2 ; [x x x x 4 3 2 1]
pinsrb m2, [r2 + r3 + 2], 0
punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
punpcklqdq m0, m0
punpcklqdq m2, m2

lea r3, [ang_table + 19 * 16]
movh m3, [r3 + 0 * 16] ; [19]
movhps m3, [r3 - 13 * 16] ; [ 6]
movh m4, [r3 + 6 * 16] ; [25]
movhps m4, [r3 - 7 * 16] ; [12]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_15, 4,5,5
xor r4, r4
cmp r3m, byte 21
mov r3, 8
jz .next
xchg r3, r4
.next:
movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x]
pinsrb m2, [r2], 1
palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
palignr m1, m2, 2 ; [x x x x 4 3 2 1]
pinsrb m2, [r2 + r3 + 2], 0
pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y]
pinsrb m3, [r2 + r3 + 4], 0
punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y]
punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
punpcklqdq m0, m2
punpcklqdq m2, m4

lea r3, [ang_table + 23 * 16]
movh m3, [r3 - 8 * 16] ; [15]
movhps m3, [r3 + 7 * 16] ; [30]
movh m4, [r3 - 10 * 16] ; [13]
movhps m4, [r3 + 5 * 16] ; [28]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_16, 3,5,5
xor r4, r4
cmp r3m, byte 20
mov r3, 8
jz .next
xchg r3, r4
.next:
movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x]
pinsrb m2, [r2], 1
palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
palignr m1, m2, 2 ; [x x x x 4 3 2 1]
pinsrb m2, [r2 + r3 + 2], 0
pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y]
pinsrb m3, [r2 + r3 + 3], 0
punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y]
punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
punpcklqdq m0, m2
punpcklqdq m2, m4

lea r3, [ang_table + 19 * 16]
movh m3, [r3 - 8 * 16] ; [11]
movhps m3, [r3 + 3 * 16] ; [22]
movh m4, [r3 - 18 * 16] ; [ 1]
movhps m4, [r3 - 7 * 16] ; [12]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_17, 3,5,5
xor r4, r4
cmp r3m, byte 19
mov r3, 8
jz .next
xchg r3, r4
.next:
movh m3, [r2 + r4 - 1] ; [- - 4 3 2 1 0 x]
pinsrb m3, [r2], 1
palignr m0, m3, 1 ; [- - - 4 3 2 1 0]
palignr m1, m3, 2 ; [- - - - 4 3 2 1]
mova m4, m0
punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
pinsrb m3, [r2 + r3 + 1], 0
punpcklbw m1, m3, m4 ; [3 2 2 1 1 0 0 x]
punpcklqdq m0, m1

pslldq m2, m3, 1 ; [- 4 3 2 1 0 x y]
pinsrb m2, [r2 + r3 + 2], 0
pslldq m1, m2, 1 ; [4 3 2 1 0 x y z]
pinsrb m1, [r2 + r3 + 4], 0
punpcklbw m1, m2 ; [1 0 0 x x y y z]
punpcklbw m2, m3 ; [2 1 1 0 0 x x y]
punpcklqdq m2, m1

lea r3, [ang_table + 14 * 16]
movh m3, [r3 - 8 * 16] ; [ 6]
movhps m3, [r3 - 2 * 16] ; [12]
movh m4, [r3 + 4 * 16] ; [18]
movhps m4, [r3 + 10 * 16] ; [24]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)

cglobal intra_pred_ang4_18, 3,5,1
mov r4d, [r2 + 8]
mov r3b, byte [r2]
mov [r2 + 8], r3b
mov r3d, [r2 + 8]
bswap r3d
movd m0, r3d

pinsrd m0, [r2 + 1], 1 ; [- 3 2 1 0 -1 -2 -3]
lea r3, [r1 * 3]
movd [r0 + r3], m0
psrldq m0, 1
movd [r0 + r1 * 2], m0
psrldq m0, 1
movd [r0 + r1], m0
psrldq m0, 1
movd [r0], m0
mov [r2 + 8], r4w
RET
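
; Mode 18 is the exact 45-degree diagonal, so no interpolation is needed: the
; bswap builds the reversed left neighbours next to the top-left, pinsrd
; appends the top row, and each output row is the same 8-byte window shifted
; by one sample.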

;-----------------------------------------------------------------------------------------
; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
INIT_XMM ssse3
cglobal intra_pred_ang8_2, 3,5,2
lea r4, [r2 + 2]
add r2, 18
cmp r3m, byte 34
cmove r2, r4
movu m0, [r2]
lea r4, [r1 * 3]

movh [r0], m0
palignr m1, m0, 1
movh [r0 + r1], m1
palignr m1, m0, 2
movh [r0 + r1 * 2], m1
palignr m1, m0, 3
movh [r0 + r4], m1
palignr m1, m0, 4
lea r0, [r0 + r1 * 4]
movh [r0], m1
palignr m1, m0, 5
movh [r0 + r1], m1
palignr m1, m0, 6
movh [r0 + r1 * 2], m1
palignr m1, m0, 7
movh [r0 + r4], m1
RET

INIT_XMM sse4
cglobal intra_pred_ang8_3, 3,5,8
lea r4, [r2 + 1]
add r2, 17
cmp r3m, byte 33
cmove r2, r4
lea r3, [ang_table + 22 * 16]
lea r4, [ang_table + 8 * 16]
mova m3, [pw_1024]

movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]

pmaddubsw m4, m0, [r3 + 4 * 16] ; [26]
pmulhrsw m4, m3
pmaddubsw m1, [r3 - 2 * 16] ; [20]
pmulhrsw m1, m3
packuswb m4, m1

palignr m5, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]

pmaddubsw m5, [r3 - 8 * 16] ; [14]
pmulhrsw m5, m3

palignr m6, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]

pmaddubsw m6, [r4] ; [ 8]
pmulhrsw m6, m3
packuswb m5, m6

palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]

pmaddubsw m6, m1, [r4 - 6 * 16] ; [ 2]
pmulhrsw m6, m3

pmaddubsw m1, [r3 + 6 * 16] ; [28]
pmulhrsw m1, m3
packuswb m6, m1

palignr m1, m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]

pmaddubsw m1, [r3] ; [22]
pmulhrsw m1, m3

palignr m2, m0, 12 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]

pmaddubsw m2, [r3 - 6 * 16] ; [16]
pmulhrsw m2, m3
packuswb m1, m2
jmp .transpose8x8

ALIGN 16
.transpose8x8:
jz .store

; transpose 8x8
punpckhbw m0, m4, m5
punpcklbw m4, m5
punpckhbw m2, m4, m0
punpcklbw m4, m0

punpckhbw m0, m6, m1
punpcklbw m6, m1
punpckhbw m1, m6, m0
punpcklbw m6, m0

punpckhdq m5, m4, m6
punpckldq m4, m6
punpckldq m6, m2, m1
punpckhdq m2, m1
mova m1, m2

.store:
lea r4, [r1 * 3]
movh [r0], m4
movhps [r0 + r1], m4
movh [r0 + r1 * 2], m5
movhps [r0 + r4], m5
add r0, r4
movh [r0 + r1], m6
movhps [r0 + r1 * 2], m6
movh [r0 + r4], m1
movhps [r0 + r1 * 4], m1
RET
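
; The twin modes share this tail through a small flags trick: every caller
; executes cmp r3m against its vertical-family mode number just before the
; register setup, and none of the instructions between that cmp and the jmp
; here (lea, loads, SSE arithmetic) writes EFLAGS. The jz above therefore
; still sees that comparison: the vertical mode stores its eight rows
; directly, while the horizontal twin transposes the 8x8 block first.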

cglobal intra_pred_ang8_4, 3,5,8
lea r4, [r2 + 1]
add r2, 17
cmp r3m, byte 32
cmove r2, r4
lea r3, [ang_table + 24 * 16]
lea r4, [ang_table + 10 * 16]
mova m3, [pw_1024]

movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
mova m5, m1

pmaddubsw m4, m0, [r3 - 3 * 16] ; [21]
pmulhrsw m4, m3
pmaddubsw m1, [r4] ; [10]
pmulhrsw m1, m3
packuswb m4, m1

pmaddubsw m5, [r3 + 7 * 16] ; [31]
pmulhrsw m5, m3

palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]

pmaddubsw m6, [r3 - 4 * 16] ; [ 20]
pmulhrsw m6, m3
packuswb m5, m6

palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]

pmaddubsw m6, m1, [r4 - 1 * 16] ; [ 9]
pmulhrsw m6, m3

pmaddubsw m1, [r3 + 6 * 16] ; [30]
pmulhrsw m1, m3
packuswb m6, m1

palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]

pmaddubsw m1, [r3 - 5 * 16] ; [19]
pmulhrsw m1, m3

palignr m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]

pmaddubsw m2, [r4 - 2 * 16] ; [8]
pmulhrsw m2, m3
packuswb m1, m2
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_5, 3,5,8
lea r4, [r2 + 1]
add r2, 17
cmp r3m, byte 31
cmove r2, r4
lea r3, [ang_table + 17 * 16]
lea r4, [ang_table + 2 * 16]
mova m3, [pw_1024]

movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
mova m5, m1

pmaddubsw m4, m0, [r3] ; [17]
pmulhrsw m4, m3
pmaddubsw m1, [r4] ; [2]
pmulhrsw m1, m3
packuswb m4, m1

pmaddubsw m5, [r3 + 2 * 16] ; [19]
pmulhrsw m5, m3

palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
mova m1, m6

pmaddubsw m1, [r4 + 2 * 16] ; [4]
pmulhrsw m1, m3
packuswb m5, m1

pmaddubsw m6, [r3 + 4 * 16] ; [21]
pmulhrsw m6, m3

palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]

mova m7, m1
pmaddubsw m7, [r4 + 4 * 16] ; [6]
pmulhrsw m7, m3
packuswb m6, m7

pmaddubsw m1, [r3 + 6 * 16] ; [23]
pmulhrsw m1, m3

palignr m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]

pmaddubsw m2, [r4 + 6 * 16] ; [8]
pmulhrsw m2, m3
packuswb m1, m2
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_6, 3,5,8
lea r4, [r2 + 1]
add r2, 17
cmp r3m, byte 30
cmove r2, r4
lea r3, [ang_table + 20 * 16]
lea r4, [ang_table + 8 * 16]
mova m7, [pw_1024]

movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
mova m1, m0

pmaddubsw m4, m0, [r3 - 7 * 16] ; [13]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 6 * 16] ; [26]
pmulhrsw m1, m7
packuswb m4, m1

palignr m6, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]

pmaddubsw m5, m6, [r4 - 1 * 16] ; [7]
pmulhrsw m5, m7

pmaddubsw m6, [r3] ; [20]
pmulhrsw m6, m7
packuswb m5, m6

palignr m1, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]

pmaddubsw m6, m1, [r4 - 7 * 16] ; [1]
pmulhrsw m6, m7

mova m3, m1
pmaddubsw m3, [r3 - 6 * 16] ; [14]
pmulhrsw m3, m7
packuswb m6, m3

pmaddubsw m1, [r3 + 7 * 16] ; [27]
pmulhrsw m1, m7

palignr m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]

pmaddubsw m2, [r4] ; [8]
pmulhrsw m2, m7
packuswb m1, m2
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_7, 3,5,8
lea r4, [r2 + 1]
add r2, 17
cmp r3m, byte 29
cmove r2, r4
lea r3, [ang_table + 24 * 16]
lea r4, [ang_table + 6 * 16]
mova m7, [pw_1024]

movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

pmaddubsw m4, m0, [r4 + 3 * 16] ; [9]
pmulhrsw m4, m7
pmaddubsw m3, m0, [r3 - 6 * 16] ; [18]
pmulhrsw m3, m7
packuswb m4, m3

pmaddubsw m5, m0, [r3 + 3 * 16] ; [27]
pmulhrsw m5, m7

palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]

pmaddubsw m6, m1, [r4 - 2 * 16] ; [4]
pmulhrsw m6, m7
packuswb m5, m6

pmaddubsw m6, m1, [r4 + 7 * 16] ; [13]
pmulhrsw m6, m7

mova m3, m1
pmaddubsw m3, [r3 - 2 * 16] ; [22]
pmulhrsw m3, m7
packuswb m6, m3

pmaddubsw m1, [r3 + 7 * 16] ; [31]
pmulhrsw m1, m7

palignr m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]

pmaddubsw m2, [r4 + 2 * 16] ; [8]
pmulhrsw m2, m7
packuswb m1, m2
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_8, 3,5,8
lea r4, [r2 + 1]
add r2, 17
cmp r3m, byte 28
cmove r2, r4
lea r3, [ang_table + 23 * 16]
lea r4, [ang_table + 8 * 16]
mova m7, [pw_1024]

movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
palignr m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]

pmaddubsw m4, m0, [r4 - 3 * 16] ; [5]
pmulhrsw m4, m7
pmaddubsw m3, m0, [r4 + 2 * 16] ; [10]
pmulhrsw m3, m7
packuswb m4, m3

pmaddubsw m5, m0, [r3 - 8 * 16] ; [15]
pmulhrsw m5, m7

pmaddubsw m6, m0, [r3 - 3 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6

pmaddubsw m6, m0, [r3 + 2 * 16] ; [25]
pmulhrsw m6, m7

pmaddubsw m0, [r3 + 7 * 16] ; [30]
pmulhrsw m0, m7
packuswb m6, m0

pmaddubsw m1, m2, [r4 - 5 * 16] ; [3]
pmulhrsw m1, m7

pmaddubsw m2, [r4] ; [8]
pmulhrsw m2, m7
packuswb m1, m2
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_9, 3,5,8
lea r4, [r2 + 1]
add r2, 17
cmp r3m, byte 27
cmove r2, r4
lea r3, [ang_table + 10 * 16]
mova m7, [pw_1024]

movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

pmaddubsw m4, m0, [r3 - 8 * 16] ; [2]
pmulhrsw m4, m7
pmaddubsw m3, m0, [r3 - 6 * 16] ; [4]
pmulhrsw m3, m7
packuswb m4, m3

pmaddubsw m5, m0, [r3 - 4 * 16] ; [6]
pmulhrsw m5, m7

pmaddubsw m6, m0, [r3 - 2 * 16] ; [8]
pmulhrsw m6, m7
packuswb m5, m6

pmaddubsw m6, m0, [r3] ; [10]
pmulhrsw m6, m7

pmaddubsw m2, m0, [r3 + 2 * 16] ; [12]
pmulhrsw m2, m7
packuswb m6, m2

pmaddubsw m1, m0, [r3 + 4 * 16] ; [14]
pmulhrsw m1, m7

pmaddubsw m0, [r3 + 6 * 16] ; [16]
pmulhrsw m0, m7
packuswb m1, m0
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_10, 3,6,5
movh m0, [r2 + 17]
mova m4, [pb_unpackbq]
palignr m1, m0, 2
pshufb m1, m4
palignr m2, m0, 4
pshufb m2, m4
palignr m3, m0, 6
pshufb m3, m4
pshufb m0, m4

lea r5, [r1 * 3]
movhps [r0 + r1], m0
movh [r0 + r1 * 2], m1
movhps [r0 + r5], m1
lea r3, [r0 + r1 * 4]
movh [r3], m2
movhps [r3 + r1], m2
movh [r3 + r1 * 2], m3
movhps [r3 + r5], m3

; filter
cmp r4m, byte 0
jz .quit

pmovzxbw m0, m0
movu m1, [r2]
palignr m2, m1, 1
pshufb m1, m4
pmovzxbw m1, m1
pmovzxbw m2, m2
psubw m2, m1
psraw m2, 1
paddw m0, m2
packuswb m0, m0

.quit:
movh [r0], m0
RET

cglobal intra_pred_ang8_26, 3,6,3
movu m2, [r2]
palignr m0, m2, 1
lea r5, [r1 * 3]
movh [r0], m0
movh [r0 + r1], m0
movh [r0 + r1 * 2], m0
movh [r0 + r5], m0
lea r3, [r0 + r1 * 4]
movh [r3], m0
movh [r3 + r1], m0
movh [r3 + r1 * 2], m0
movh [r3 + r5], m0

; filter
cmp r4m, byte 0
jz .quit

pshufb m2, [pb_unpackbq]
movhlps m1, m2
pmovzxbw m2, m2
movu m0, [r2 + 17]
pmovzxbw m1, m1
pmovzxbw m0, m0
psubw m0, m2
psraw m0, 1
paddw m1, m0
packuswb m1, m1
pextrb [r0], m1, 0
pextrb [r0 + r1], m1, 1
pextrb [r0 + r1 * 2], m1, 2
pextrb [r0 + r5], m1, 3
pextrb [r3], m1, 4
pextrb [r3 + r1], m1, 5
pextrb [r3 + r1 * 2], m1, 6
pextrb [r3 + r5], m1, 7
.quit:
RET

cglobal intra_pred_ang8_11, 3,5,8
xor r4, r4
cmp r3m, byte 25
mov r3, 16
cmove r3, r4

movu m0, [r2 + r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
pinsrb m0, [r2], 0
palignr m1, m0, 1 ; [x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]

punpcklbw m0, m1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

lea r3, [ang_table + 23 * 16]
mova m7, [pw_1024]

pmaddubsw m4, m0, [r3 + 7 * 16] ; [30]
pmulhrsw m4, m7
pmaddubsw m3, m0, [r3 + 5 * 16] ; [28]
pmulhrsw m3, m7
packuswb m4, m3

pmaddubsw m5, m0, [r3 + 3 * 16] ; [26]
pmulhrsw m5, m7

pmaddubsw m6, m0, [r3 + 1 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6

pmaddubsw m6, m0, [r3 - 1 * 16] ; [22]
pmulhrsw m6, m7

pmaddubsw m2, m0, [r3 - 3 * 16] ; [20]
pmulhrsw m2, m7
packuswb m6, m2

pmaddubsw m1, m0, [r3 - 5 * 16] ; [18]
pmulhrsw m1, m7

pmaddubsw m0, [r3 - 7 * 16] ; [16]
pmulhrsw m0, m7
packuswb m1, m0
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_12, 3,5,8
xor r4, r4
cmp r3m, byte 24
mov r3, 16
jz .next
xchg r3, r4
.next:

movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
pinsrb m1, [r2], 0
pslldq m0, m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
pinsrb m0, [r2 + r3 + 6], 0

lea r4, [ang_table + 22 * 16]
mova m7, [pw_1024]

punpckhbw m2, m0, m1 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]
punpcklbw m0, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
palignr m2, m0, 2 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

pmaddubsw m4, m2, [r4 + 5 * 16] ; [27]
pmulhrsw m4, m7
pmaddubsw m3, m2, [r4] ; [22]
pmulhrsw m3, m7
packuswb m4, m3

pmaddubsw m1, m0, [r4 + 7 * 16] ; [29]
pmulhrsw m1, m7

pmaddubsw m0, [r4 + 2 * 16] ; [24]
pmulhrsw m0, m7
packuswb m1, m0

pmaddubsw m5, m2, [r4 - 5 * 16] ; [17]
pmulhrsw m5, m7

lea r4, [ang_table + 7 * 16]
pmaddubsw m6, m2, [r4 + 5 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6

pmaddubsw m6, m2, [r4] ; [7]
pmulhrsw m6, m7

pmaddubsw m2, [r4 - 5 * 16] ; [2]
pmulhrsw m2, m7
packuswb m6, m2
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_13, 4,5,8
xor r4, r4
cmp r3m, byte 23
mov r3, 16
jz .next
xchg r3, r4
.next:

movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
pinsrb m1, [r2], 0
pslldq m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
pinsrb m1, [r2 + r3 + 4], 0
pslldq m0, m1, 1 ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
pinsrb m0, [r2 + r3 + 7], 0
punpckhbw m5, m0, m1 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
punpcklbw m0, m1 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
palignr m1, m5, m0, 2 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
palignr m5, m0, 4 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

lea r4, [ang_table + 24 * 16]
mova m7, [pw_1024]

pmaddubsw m4, m5, [r4 - 1 * 16] ; [23]
pmulhrsw m4, m7

pmaddubsw m6, m1, [r4 + 4 * 16] ; [28]
pmulhrsw m6, m7

pmaddubsw m0, [r4] ; [24]
pmulhrsw m0, m7

lea r4, [ang_table + 13 * 16]
pmaddubsw m3, m5, [r4 + 1 * 16] ; [14]
pmulhrsw m3, m7
packuswb m4, m3

pmaddubsw m5, [r4 - 8 * 16] ; [5]
pmulhrsw m5, m7
packuswb m5, m6

pmaddubsw m6, m1, [r4 + 6 * 16] ; [19]
pmulhrsw m6, m7

pmaddubsw m2, m1, [r4 - 3 * 16] ; [10]
pmulhrsw m2, m7
packuswb m6, m2

pmaddubsw m1, [r4 - 12 * 16] ; [1]
pmulhrsw m1, m7
packuswb m1, m0
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_14, 4,5,8
xor r4, r4
cmp r3m, byte 22
mov r3, 16
jz .next
xchg r3, r4
.next:

movu m1, [r2 + r4 - 2] ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
pinsrb m1, [r2], 2
pinsrb m1, [r2 + r3 + 2], 1
pinsrb m1, [r2 + r3 + 5], 0
pslldq m0, m1, 1 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
pinsrb m0, [r2 + r3 + 7], 0
punpckhbw m2, m0, m1 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
punpcklbw m0, m1 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
palignr m1, m2, m0, 2 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
palignr m6, m2, m0, 4 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
palignr m2, m0, 6 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

lea r4, [ang_table + 24 * 16]
mova m3, [pw_1024]

pmaddubsw m4, m2, [r4 - 5 * 16] ; [19]
pmulhrsw m4, m3

pmaddubsw m0, [r4] ; [24]
pmulhrsw m0, m3

pmaddubsw m5, m6, [r4 + 1 * 16] ; [25]
pmulhrsw m5, m3

lea r4, [ang_table + 12 * 16]
pmaddubsw m6, [r4] ; [12]
pmulhrsw m6, m3
packuswb m5, m6

pmaddubsw m6, m1, [r4 + 19 * 16] ; [31]
pmulhrsw m6, m3

pmaddubsw m2, [r4 - 6 * 16] ; [6]
pmulhrsw m2, m3
packuswb m4, m2

pmaddubsw m2, m1, [r4 + 6 * 16] ; [18]
pmulhrsw m2, m3
packuswb m6, m2

pmaddubsw m1, [r4 - 7 * 16] ; [5]
pmulhrsw m1, m3
packuswb m1, m0
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_15, 4,5,8
xor r4, r4
cmp r3m, byte 21
mov r3, 16
jz .next
xchg r3, r4
.next:

movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
pinsrb m1, [r2], 0
movu m2, [r2 + r3]
pshufb m2, [c_mode16_15]
palignr m1, m2, 13 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
pslldq m0, m1, 1 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
pinsrb m0, [r2 + r3 + 8], 0
punpckhbw m4, m0, m1 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
palignr m1, m4, m0, 2 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
palignr m6, m4, m0, 4 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
palignr m5, m4, m0, 6 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
palignr m4, m0, 8 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

lea r4, [ang_table + 23 * 16]
mova m3, [pw_1024]

pmaddubsw m4, [r4 - 8 * 16] ; [15]
pmulhrsw m4, m3

pmaddubsw m2, m5, [r4 + 7 * 16] ; [30]
pmulhrsw m2, m3
packuswb m4, m2

pmaddubsw m5, [r4 - 10 * 16] ; [13]
pmulhrsw m5, m3

pmaddubsw m2, m6, [r4 + 5 * 16] ; [28]
pmulhrsw m2, m3
packuswb m5, m2

pmaddubsw m2, m1, [r4 + 3 * 16] ; [26]
pmulhrsw m2, m3

pmaddubsw m0, [r4 + 1 * 16] ; [24]
pmulhrsw m0, m3

lea r4, [ang_table + 11 * 16]
pmaddubsw m6, [r4] ; [11]
pmulhrsw m6, m3
packuswb m6, m2

pmaddubsw m1, [r4 - 2 * 16] ; [9]
pmulhrsw m1, m3
packuswb m1, m0
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
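
; The strongly negative-angle modes (15-17) need reference samples from the
; opposite edge: the movu + pshufb with the c_mode16_15/16/17 tables gathers
; the samples that HEVC's inverse-angle rule projects past the corner onto
; the reference array being walked, before the usual unpack-and-weight
; pattern runs.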

cglobal intra_pred_ang8_16, 4,5,8
xor r4, r4
cmp r3m, byte 20
mov r3, 16
jz .next
xchg r3, r4
.next:

movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
pinsrb m1, [r2], 0
movu m2, [r2 + r3]
pshufb m2, [c_mode16_16]
palignr m1, m2, 12 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
pslldq m0, m1, 1 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
pinsrb m0, [r2 + r3 + 8], 0
punpckhbw m4, m0, m1 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
punpcklbw m0, m1 ; [3 2 2 1 1 0 0 a a b b c c d d e]
palignr m1, m4, m0, 2 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
palignr m6, m4, m0, 4 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
palignr m2, m4, m0, 6 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
palignr m5, m4, m0, 8 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
palignr m4, m0, 10 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

lea r4, [ang_table + 22 * 16]
mova m7, [pw_1024]

pmaddubsw m3, m5, [r4] ; [22]
pmulhrsw m3, m7

pmaddubsw m0, [r4 + 2 * 16] ; [24]
pmulhrsw m0, m7

lea r4, [ang_table + 9 * 16]

pmaddubsw m4, [r4 + 2 * 16] ; [11]
pmulhrsw m4, m7
packuswb m4, m3

pmaddubsw m2, [r4 + 3 * 16] ; [12]
pmulhrsw m2, m7

pmaddubsw m5, [r4 - 8 * 16] ; [1]
pmulhrsw m5, m7
packuswb m5, m2

mova m2, m6
pmaddubsw m6, [r4 + 14 * 16] ; [23]
pmulhrsw m6, m7

pmaddubsw m2, [r4 - 7 * 16] ; [2]
pmulhrsw m2, m7
packuswb m6, m2

pmaddubsw m1, [r4 + 4 * 16] ; [13]
pmulhrsw m1, m7
packuswb m1, m0
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_17, 4,5,8
xor r4, r4
cmp r3m, byte 19
mov r3, 16
jz .next
xchg r3, r4
.next:

movu m2, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
pinsrb m2, [r2], 0
movu m1, [r2 + r3]
pshufb m1, [c_mode16_17]
palignr m2, m1, 11 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
pslldq m0, m2, 1 ; [9 8 7 6 5 4 3 2 1 0 a b c d e f]
pinsrb m0, [r2 + r3 + 7], 0
punpckhbw m1, m0, m2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
punpcklbw m0, m2 ; [2 1 1 0 0 a a b b c c d d e e f]

palignr m5, m1, m0, 8 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
palignr m2, m1, m0, 10 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
palignr m4, m1, m0, 12 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

lea r4, [ang_table + 17 * 16]
mova m3, [pw_1024]

pmaddubsw m2, [r4 - 5 * 16] ; [12]
pmulhrsw m2, m3

pmaddubsw m4, [r4 - 11 * 16] ; [6]
pmulhrsw m4, m3
packuswb m4, m2

pmaddubsw m5, [r4 + 1 * 16] ; [18]
pmulhrsw m5, m3

palignr m2, m1, m0, 6 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
pmaddubsw m2, [r4 + 7 * 16] ; [24]
pmulhrsw m2, m3
packuswb m5, m2

palignr m6, m1, m0, 4 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
mova m2, m6
pmaddubsw m6, [r4 + 13 * 16] ; [30]
pmulhrsw m6, m3

pmaddubsw m2, [r4 - 13 * 16] ; [4]
pmulhrsw m2, m3
packuswb m6, m2

palignr m1, m0, 2 ; [3 2 2 1 1 0 0 a a b b c c d d e]
pmaddubsw m1, [r4 - 7 * 16] ; [10]
pmulhrsw m1, m3

pmaddubsw m0, [r4 - 1 * 16] ; [16]
pmulhrsw m0, m3
packuswb m1, m0
jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)

cglobal intra_pred_ang8_18, 4,4,1
movu m0, [r2 + 16]
pinsrb m0, [r2], 0
pshufb m0, [pb_swap8]
movhps m0, [r2 + 1]
lea r2, [r0 + r1 * 4]
lea r3, [r1 * 3]
movh [r2 + r3], m0
psrldq m0, 1
movh [r2 + r1 * 2], m0
psrldq m0, 1
movh [r2 + r1], m0
psrldq m0, 1
movh [r2], m0
psrldq m0, 1
movh [r0 + r3], m0
psrldq m0, 1
movh [r0 + r1 * 2], m0
psrldq m0, 1
movh [r0 + r1], m0
psrldq m0, 1
movh [r0], m0
RET

%macro TRANSPOSE_STORE_8x8 6
%if %2 == 1
; transpose 8x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
punpckhbw m0, %3, %4
punpcklbw %3, %4
punpckhbw %4, %3, m0
punpcklbw %3, m0

punpckhbw m0, %5, m1
punpcklbw %5, %6
punpckhbw %6, %5, m0
punpcklbw %5, m0

punpckhdq m0, %3, %5
punpckldq %3, %5
punpckldq %5, %4, %6
punpckhdq %4, %6

movh [r0 + %1 * 8], %3
movhps [r0 + r1 + %1 * 8], %3
movh [r0 + r1*2 + %1 * 8], m0
movhps [r0 + r5 + %1 * 8], m0
movh [r6 + %1 * 8], %5
movhps [r6 + r1 + %1 * 8], %5
movh [r6 + r1*2 + %1 * 8], %4
movhps [r6 + r5 + %1 * 8], %4
%else
; store 8x8, used by angle BLOCK_16x16 and BLOCK_32x32
movh [r0 ], %3
movhps [r0 + r1 ], %3
movh [r0 + r1 * 2], %4
movhps [r0 + r5 ], %4
lea r0, [r0 + r1 * 4]
movh [r0 ], %5
movhps [r0 + r1 ], %5
movh [r0 + r1 * 2], %6
movhps [r0 + r5 ], %6
lea r0, [r0 + r1 * 4]
%endif
%endmacro
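
; TRANSPOSE_STORE_8x8 %1 selects the 8-column slot (dst + %1 * 8), %2 picks
; transpose-then-store (1, used by the horizontal-family modes) versus direct
; row stores (0), and %3-%6 are the four packed row-pair registers. It relies
; on r5 = 3 * stride and, on the transpose path, on r6 pointing 4 rows below
; r0. Note the first punpckhbw of the second group names m1 directly, so %6
; must be m1 - every call site in this file passes it that way.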

;------------------------------------------------------------------------------------------
; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;------------------------------------------------------------------------------------------
INIT_XMM ssse3
cglobal intra_pred_ang16_2, 3,5,3
lea r4, [r2 + 2]
add r2, 34
cmp r3m, byte 34
cmove r2, r4
movu m0, [r2]
movu m1, [r2 + 16]
movu [r0], m0
palignr m2, m1, m0, 1
movu [r0 + r1], m2
lea r0, [r0 + r1 * 2]
palignr m2, m1, m0, 2
movu [r0], m2
palignr m2, m1, m0, 3
movu [r0 + r1], m2
lea r0, [r0 + r1 * 2]
palignr m2, m1, m0, 4
movu [r0], m2
palignr m2, m1, m0, 5
movu [r0 + r1], m2
lea r0, [r0 + r1 * 2]
palignr m2, m1, m0, 6
movu [r0], m2
palignr m2, m1, m0, 7
movu [r0 + r1], m2
lea r0, [r0 + r1 * 2]
palignr m2, m1, m0, 8
movu [r0], m2
palignr m2, m1, m0, 9
movu [r0 + r1], m2
lea r0, [r0 + r1 * 2]
palignr m2, m1, m0, 10
movu [r0], m2
palignr m2, m1, m0, 11
movu [r0 + r1], m2
lea r0, [r0 + r1 * 2]
palignr m2, m1, m0, 12
movu [r0], m2
palignr m2, m1, m0, 13
movu [r0 + r1], m2
lea r0, [r0 + r1 * 2]
palignr m2, m1, m0, 14
movu [r0], m2
palignr m2, m1, m0, 15
movu [r0 + r1], m2
RET
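
; Modes 2 and 34 are the +/-32 diagonals: every row is the previous one
; advanced by exactly one reference sample, so a pair of 16-byte loads plus
; palignr by 1..15 covers the whole 16x16 block without any multiplies.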

INIT_XMM sse4
cglobal intra_pred_ang16_3, 3,7,8
add r2, 32
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]

.loop:
movu m0, [r2 + 1]
palignr m1, m0, 1

punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m1, m2, m0, 2

pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 4 * 16] ; [20]
pmulhrsw m1, m7
packuswb m4, m1

palignr m5, m2, m0, 4

pmaddubsw m5, [r3 - 2 * 16] ; [14]
pmulhrsw m5, m7

palignr m6, m2, m0, 6

pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
pmulhrsw m6, m7
packuswb m5, m6

palignr m1, m2, m0, 8

pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
pmulhrsw m6, m7

pmaddubsw m1, [r3 + 12 * 16] ; [28]
pmulhrsw m1, m7
packuswb m6, m1

palignr m1, m2, m0, 10

pmaddubsw m1, [r3 + 6 * 16] ; [22]
pmulhrsw m1, m7

palignr m2, m0, 12

pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2

TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

movu m0, [r2 + 8]
palignr m1, m0, 1

punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m5, m2, m0, 2

pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
pmulhrsw m4, m7
pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
pmulhrsw m1, m7
packuswb m4, m1

pmaddubsw m5, [r3 + 14 * 16] ; [30]
pmulhrsw m5, m7

palignr m6, m2, m0, 4

pmaddubsw m6, [r3 + 8 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6

palignr m1, m2, m0, 6

pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
pmulhrsw m6, m7

palignr m1, m2, m0, 8

pmaddubsw m1, [r3 - 4 * 16] ; [12]
pmulhrsw m1, m7
packuswb m6, m1

palignr m1, m2, m0, 10

pmaddubsw m1, [r3 - 10 * 16] ; [06]
pmulhrsw m1, m7
packuswb m1, m1

movhps m1, [r2 + 14] ; [00]

TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8
dec r4
jnz .loop
RET
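
; The 16x16 kernels run this body twice (r4d), advancing the source by 8
; between passes and emitting four packed row pairs per half through
; TRANSPOSE_STORE_8x8. The trailing movhps m1, [r2 + 14] is the [00] row:
; fraction zero lands on an integer sample position, so that row is a plain
; reference copy and skips the pmaddubsw path.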

INIT_XMM sse4
cglobal intra_pred_ang16_33, 3,7,8
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3]
mov r6, r0
mova m7, [pw_1024]

.loop:
movu m0, [r2 + 1]
palignr m1, m0, 1

punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m1, m2, m0, 2

pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 4 * 16] ; [20]
pmulhrsw m1, m7
packuswb m4, m1

palignr m5, m2, m0, 4

pmaddubsw m5, [r3 - 2 * 16] ; [14]
pmulhrsw m5, m7

palignr m6, m2, m0, 6

pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
pmulhrsw m6, m7
packuswb m5, m6

palignr m1, m2, m0, 8

pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
pmulhrsw m6, m7

pmaddubsw m1, [r3 + 12 * 16] ; [28]
pmulhrsw m1, m7
packuswb m6, m1

palignr m1, m2, m0, 10

pmaddubsw m1, [r3 + 6 * 16] ; [22]
pmulhrsw m1, m7

palignr m2, m0, 12

pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2

TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

movu m0, [r2 + 8]
palignr m1, m0, 1

punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m5, m2, m0, 2

pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
pmulhrsw m4, m7
pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
pmulhrsw m1, m7
packuswb m4, m1

pmaddubsw m5, [r3 + 14 * 16] ; [30]
pmulhrsw m5, m7

palignr m6, m2, m0, 4

pmaddubsw m6, [r3 + 8 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6

palignr m1, m2, m0, 6

pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
pmulhrsw m6, m7

palignr m1, m2, m0, 8

pmaddubsw m1, [r3 - 4 * 16] ; [12]
pmulhrsw m1, m7
packuswb m6, m1

palignr m1, m2, m0, 10

pmaddubsw m1, [r3 - 10 * 16] ; [06]
pmulhrsw m1, m7
packuswb m1, m1

movh m2, [r2 + 14] ; [00]

movh [r0 ], m4
movhps [r0 + r1 ], m4
movh [r0 + r1 * 2], m5
movhps [r0 + r5 ], m5
lea r0, [r0 + r1 * 4]
movh [r0 ], m6
movhps [r0 + r1 ], m6
movh [r0 + r1 * 2], m1
movh [r0 + r5 ], m2

lea r0, [r6 + 8]
add r2, 8
dec r4
jnz .loop
RET

INIT_XMM sse4
cglobal intra_pred_ang16_4, 3,7,8
add r2, 32
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]

.loop:
movu m0, [r2 + 1]
palignr m1, m0, 1

punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m1, m2, m0, 2
mova m5, m1

pmaddubsw m4, m0, [r3 + 5 * 16] ; [21]
pmulhrsw m4, m7
pmaddubsw m1, [r3 - 6 * 16] ; [10]
pmulhrsw m1, m7
packuswb m4, m1

pmaddubsw m5, [r3 + 15 * 16] ; [31]
pmulhrsw m5, m7

palignr m6, m2, m0, 4

pmaddubsw m6, [r3 + 4 * 16] ; [ 20]
pmulhrsw m6, m7
packuswb m5, m6

palignr m1, m2, m0, 6

pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9]
pmulhrsw m6, m7

pmaddubsw m1, [r3 + 14 * 16] ; [30]
pmulhrsw m1, m7
packuswb m6, m1

palignr m1, m2, m0, 8

pmaddubsw m1, [r3 + 3 * 16] ; [19]
pmulhrsw m1, m7

palignr m2, m0, 10

pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
pmulhrsw m3, m7
packuswb m1, m3

TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
pmulhrsw m4, m7

movu m0, [r2 + 6]
palignr m1, m0, 1

punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m1, m2, m0, 2

pmaddubsw m1, [r3 + 2 * 16] ; [18]
pmulhrsw m1, m7
packuswb m4, m1

palignr m5, m2, m0, 4
mova m6, m5

pmaddubsw m5, [r3 - 9 * 16] ; [07]
pmulhrsw m5, m7

pmaddubsw m6, [r3 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6

palignr m6, m2, m0, 6

pmaddubsw m6, [r3 + 16] ; [17]
pmulhrsw m6, m7

palignr m1, m2, m0, 8
palignr m2, m0, 10

pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
pmulhrsw m3, m7
packuswb m6, m3

pmaddubsw m1, [r3 + 11 * 16] ; [27]
pmulhrsw m1, m7

pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2

TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8
dec r4
jnz .loop
RET

INIT_XMM sse4
cglobal intra_pred_ang16_32, 3,7,8
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0
mova m7, [pw_1024]

.loop:
movu m0, [r2 + 1]
palignr m1, m0, 1

punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m1, m2, m0, 2
mova m5, m1

pmaddubsw m4, m0, [r3 + 5 * 16] ; [21]
pmulhrsw m4, m7
pmaddubsw m1, [r3 - 6 * 16] ; [10]
pmulhrsw m1, m7
packuswb m4, m1

pmaddubsw m5, [r3 + 15 * 16] ; [31]
pmulhrsw m5, m7

palignr m6, m2, m0, 4

pmaddubsw m6, [r3 + 4 * 16] ; [ 20]
pmulhrsw m6, m7
packuswb m5, m6

palignr m1, m2, m0, 6

pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9]
pmulhrsw m6, m7

pmaddubsw m1, [r3 + 14 * 16] ; [30]
pmulhrsw m1, m7
packuswb m6, m1

palignr m1, m2, m0, 8

pmaddubsw m1, [r3 + 3 * 16] ; [19]
pmulhrsw m1, m7

palignr m2, m0, 10

pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
pmulhrsw m3, m7
packuswb m1, m3

TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
pmulhrsw m4, m7

movu m0, [r2 + 6]
palignr m1, m0, 1

punpckhbw m2, m0, m1
punpcklbw m0, m1
palignr m1, m2, m0, 2

pmaddubsw m1, [r3 + 2 * 16] ; [18]
pmulhrsw m1, m7
packuswb m4, m1

palignr m5, m2, m0, 4
mova m6, m5

pmaddubsw m5, [r3 - 9 * 16] ; [07]
pmulhrsw m5, m7

pmaddubsw m6, [r3 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6

palignr m6, m2, m0, 6

pmaddubsw m6, [r3 + 16] ; [17]
pmulhrsw m6, m7

palignr m1, m2, m0, 8
palignr m2, m0, 10

pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
pmulhrsw m3, m7
packuswb m6, m3

pmaddubsw m1, [r3 + 11 * 16] ; [27]
pmulhrsw m1, m7

pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2

TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

lea r0, [r6 + 8]
add r2, 8
dec r4
jnz .loop
RET

INIT_XMM sse4
cglobal intra_pred_ang16_5, 3,7,8
add r2, 32
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]

.loop:
movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m3, m1 ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

palignr m5, m2, m3, 2

pmaddubsw m4, m3, [r3 + 16] ; [17]
pmulhrsw m4, m7
pmaddubsw m1, m5, [r3 - 14 * 16] ; [2]
pmulhrsw m1, m7
packuswb m4, m1

palignr m6, m2, m3, 4

pmaddubsw m5, [r3 + 3 * 16] ; [19]
pmulhrsw m5, m7
pmaddubsw m1, m6, [r3 - 12 * 16] ; [4]
pmulhrsw m1, m7
packuswb m5, m1

palignr m1, m2, m3, 6

pmaddubsw m6, [r3 + 5 * 16] ; [21]
pmulhrsw m6, m7
pmaddubsw m0, m1, [r3 - 10 * 16] ; [6]
pmulhrsw m0, m7
packuswb m6, m0

palignr m0, m2, m3, 8

pmaddubsw m1, [r3 + 7 * 16] ; [23]
pmulhrsw m1, m7
pmaddubsw m0, [r3 - 8 * 16] ; [8]
pmulhrsw m0, m7
packuswb m1, m0

TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

palignr m4, m2, m3, 8
palignr m5, m2, m3, 10

pmaddubsw m4, [r3 + 9 * 16] ; [25]
pmulhrsw m4, m7
pmaddubsw m1, m5, [r3 - 6 * 16] ; [10]
pmulhrsw m1, m7
packuswb m4, m1

palignr m6, m2, m3, 12

pmaddubsw m5, [r3 + 11 * 16] ; [27]
pmulhrsw m5, m7
pmaddubsw m1, m6, [r3 - 4 * 16] ; [12]
pmulhrsw m1, m7
packuswb m5, m1

palignr m1, m2, m3, 14

pmaddubsw m6, [r3 + 13 * 16] ; [29]
pmulhrsw m6, m7
pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
pmulhrsw m0, m7
packuswb m6, m0

pmaddubsw m1, [r3 + 15 * 16] ; [31]
pmulhrsw m1, m7
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2

TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8
dec r4
jnz .loop
RET

INIT_XMM sse4
cglobal intra_pred_ang16_31, 3,7,8
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0
mova m7, [pw_1024]

.loop:
movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m3, m1 ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

palignr m5, m2, m3, 2

pmaddubsw m4, m3, [r3 + 16] ; [17]
pmulhrsw m4, m7
pmaddubsw m1, m5, [r3 - 14 * 16] ; [2]
pmulhrsw m1, m7
packuswb m4, m1

palignr m6, m2, m3, 4

pmaddubsw m5, [r3 + 3 * 16] ; [19]
pmulhrsw m5, m7
pmaddubsw m1, m6, [r3 - 12 * 16] ; [4]
pmulhrsw m1, m7
packuswb m5, m1

palignr m1, m2, m3, 6

pmaddubsw m6, [r3 + 5 * 16] ; [21]
pmulhrsw m6, m7
pmaddubsw m0, m1, [r3 - 10 * 16] ; [6]
pmulhrsw m0, m7
packuswb m6, m0

palignr m0, m2, m3, 8

pmaddubsw m1, [r3 + 7 * 16] ; [23]
pmulhrsw m1, m7
pmaddubsw m0, [r3 - 8 * 16] ; [8]
pmulhrsw m0, m7
packuswb m1, m0

TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

palignr m4, m2, m3, 8
palignr m5, m2, m3, 10

pmaddubsw m4, [r3 + 9 * 16] ; [25]
pmulhrsw m4, m7
pmaddubsw m1, m5, [r3 - 6 * 16] ; [10]
pmulhrsw m1, m7
packuswb m4, m1

palignr m6, m2, m3, 12

pmaddubsw m5, [r3 + 11 * 16] ; [27]
pmulhrsw m5, m7
pmaddubsw m1, m6, [r3 - 4 * 16] ; [12]
pmulhrsw m1, m7
packuswb m5, m1

palignr m1, m2, m3, 14

pmaddubsw m6, [r3 + 13 * 16] ; [29]
pmulhrsw m6, m7
pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
pmulhrsw m0, m7
packuswb m6, m0

pmaddubsw m1, [r3 + 15 * 16] ; [31]
pmulhrsw m1, m7
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2

TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

lea r0, [r6 + 8]
add r2, 8
dec r4
jnz .loop
RET

INIT_XMM sse4
cglobal intra_pred_ang16_6, 3,7,8
add r2, 32
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]

.loop:
movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

pmaddubsw m4, m3, [r3 - 3 * 16] ; [13]
pmulhrsw m4, m7
pmaddubsw m1, m3, [r3 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m4, m1

palignr m6, m2, m3, 2

pmaddubsw m5, m6, [r3 - 9 * 16] ; [7]
pmulhrsw m5, m7
pmaddubsw m6, [r3 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6

palignr m1, m2, m3, 4

pmaddubsw m6, m1, [r3 - 15 * 16] ; [1]
pmulhrsw m6, m7
pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
pmulhrsw m0, m7
packuswb m6, m0

palignr m0, m2, m3, 6

pmaddubsw m1, [r3 + 11 * 16] ; [27]
pmulhrsw m1, m7
pmaddubsw m0, [r3 - 8 * 16] ; [8]
pmulhrsw m0, m7
packuswb m1, m0

TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

palignr m4, m2, m3, 6
palignr m6, m2, m3, 8

pmaddubsw m4, [r3 + 5 * 16] ; [21]
pmulhrsw m4, m7
pmaddubsw m1, m6, [r3 - 14 * 16] ; [2]
pmulhrsw m1, m7
packuswb m4, m1

pmaddubsw m5, m6, [r3 - 16] ; [15]
pmulhrsw m5, m7
pmaddubsw m6, [r3 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6

palignr m0, m2, m3, 10

pmaddubsw m6, m0, [r3 - 7 * 16] ; [9]
pmulhrsw m6, m7
pmaddubsw m0, [r3 + 6 * 16] ; [22]
pmulhrsw m0, m7
packuswb m6, m0

palignr m2, m3, 12

pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
pmulhrsw m1, m7
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2

TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8
dec r4
jnz .loop
RET

INIT_XMM sse4
cglobal intra_pred_ang16_30, 3,7,8
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0
mova m7, [pw_1024]

.loop:
movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

pmaddubsw m4, m3, [r3 - 3 * 16] ; [13]
pmulhrsw m4, m7
pmaddubsw m1, m3, [r3 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m4, m1

palignr m6, m2, m3, 2

pmaddubsw m5, m6, [r3 - 9 * 16] ; [7]
pmulhrsw m5, m7
pmaddubsw m6, [r3 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6

palignr m1, m2, m3, 4

pmaddubsw m6, m1, [r3 - 15 * 16] ; [1]
pmulhrsw m6, m7
pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
pmulhrsw m0, m7
packuswb m6, m0

palignr m0, m2, m3, 6

pmaddubsw m1, [r3 + 11 * 16] ; [27]
pmulhrsw m1, m7
pmaddubsw m0, [r3 - 8 * 16] ; [8]
pmulhrsw m0, m7
packuswb m1, m0

TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

palignr m4, m2, m3, 6
palignr m6, m2, m3, 8

pmaddubsw m4, [r3 + 5 * 16] ; [21]
pmulhrsw m4, m7
pmaddubsw m1, m6, [r3 - 14 * 16] ; [2]
pmulhrsw m1, m7
packuswb m4, m1

pmaddubsw m5, m6, [r3 - 16] ; [15]
pmulhrsw m5, m7
pmaddubsw m6, [r3 + 12 * 16] ; [28]
pmulhrsw m6, m7
packuswb m5, m6

palignr m0, m2, m3, 10

pmaddubsw m6, m0, [r3 - 7 * 16] ; [9]
pmulhrsw m6, m7
pmaddubsw m0, [r3 + 6 * 16] ; [22]
pmulhrsw m0, m7
packuswb m6, m0

palignr m2, m3, 12

pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
pmulhrsw m1, m7
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2

TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

lea r0, [r6 + 8]
add r2, 8
dec r4
jnz .loop
RET

INIT_XMM sse4
cglobal intra_pred_ang16_7, 3,7,8
add r2, 32
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]

.loop:
movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

pmaddubsw m4, m3, [r3 - 7 * 16] ; [9]
pmulhrsw m4, m7
pmaddubsw m0, m3, [r3 + 2 * 16] ; [18]
pmulhrsw m0, m7
packuswb m4, m0

palignr m1, m2, m3, 2

pmaddubsw m5, m3, [r3 + 11 * 16] ; [27]
pmulhrsw m5, m7
pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
pmulhrsw m6, m7
packuswb m5, m6

pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
pmulhrsw m6, m7
pmaddubsw m0, m1, [r3 + 6 * 16] ; [22]
pmulhrsw m0, m7
packuswb m6, m0

palignr m0, m2, m3, 4

pmaddubsw m1, [r3 + 15 * 16] ; [31]
pmulhrsw m1, m7
pmaddubsw m0, [r3 - 8 * 16] ; [8]
pmulhrsw m0, m7
packuswb m1, m0

TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

palignr m1, m2, m3, 4

pmaddubsw m4, m1, [r3 + 16] ; [17]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m4, m1

palignr m0, m2, m3, 6

pmaddubsw m5, m0, [r3 - 13 * 16] ; [03]
pmulhrsw m5, m7
pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6

pmaddubsw m6, m0, [r3 + 5 * 16] ; [21]
pmulhrsw m6, m7
pmaddubsw m0, [r3 + 14 * 16] ; [30]
pmulhrsw m0, m7
packuswb m6, m0

palignr m2, m3, 8

pmaddubsw m1, m2, [r3 - 9 * 16] ; [07]
pmulhrsw m1, m7
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2

TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8
dec r4
jnz .loop
RET

INIT_XMM sse4
cglobal intra_pred_ang16_29, 3,7,8
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0
mova m7, [pw_1024]

.loop:
movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

pmaddubsw m4, m3, [r3 - 7 * 16] ; [9]
pmulhrsw m4, m7
pmaddubsw m0, m3, [r3 + 2 * 16] ; [18]
pmulhrsw m0, m7
packuswb m4, m0

palignr m1, m2, m3, 2

pmaddubsw m5, m3, [r3 + 11 * 16] ; [27]
pmulhrsw m5, m7
pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
pmulhrsw m6, m7
packuswb m5, m6

pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
pmulhrsw m6, m7
pmaddubsw m0, m1, [r3 + 6 * 16] ; [22]
pmulhrsw m0, m7
packuswb m6, m0

palignr m0, m2, m3, 4

pmaddubsw m1, [r3 + 15 * 16] ; [31]
pmulhrsw m1, m7
pmaddubsw m0, [r3 - 8 * 16] ; [8]
pmulhrsw m0, m7
packuswb m1, m0

TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

palignr m1, m2, m3, 4

pmaddubsw m4, m1, [r3 + 16] ; [17]
pmulhrsw m4, m7
pmaddubsw m1, [r3 + 10 * 16] ; [26]
pmulhrsw m1, m7
packuswb m4, m1

palignr m0, m2, m3, 6

pmaddubsw m5, m0, [r3 - 13 * 16] ; [03]
pmulhrsw m5, m7
pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
pmulhrsw m6, m7
packuswb m5, m6

pmaddubsw m6, m0, [r3 + 5 * 16] ; [21]
pmulhrsw m6, m7
pmaddubsw m0, [r3 + 14 * 16] ; [30]
pmulhrsw m0, m7
packuswb m6, m0

palignr m2, m3, 8

pmaddubsw m1, m2, [r3 - 9 * 16] ; [07]
pmulhrsw m1, m7
pmaddubsw m2, [r3] ; [16]
pmulhrsw m2, m7
packuswb m1, m2

TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

lea r0, [r6 + 8]
add r2, 8
dec r4
jnz .loop
RET

INIT_XMM sse4
cglobal intra_pred_ang16_8, 3,7,8
add r2, 32
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]

.loop:
movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

pmaddubsw m4, m1, [r3 - 11 * 16] ; [5]
pmulhrsw m4, m7
pmaddubsw m2, m1, [r3 - 6 * 16] ; [10]
pmulhrsw m2, m7
packuswb m4, m2

pmaddubsw m5, m1, [r3 - 1 * 16] ; [15]
pmulhrsw m5, m7
pmaddubsw m6, m1, [r3 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6

pmaddubsw m6, m1, [r3 + 9 * 16] ; [25]
pmulhrsw m6, m7
pmaddubsw m2, m1, [r3 + 14 * 16] ; [30]
pmulhrsw m2, m7
packuswb m6, m2

palignr m2, m0, m1, 2
palignr m3, m0, m1, 4

pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
pmulhrsw m1, m7
pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
pmulhrsw m0, m7
packuswb m1, m0

TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

pmaddubsw m4, m2, [r3 - 3 * 16] ; [13]
pmulhrsw m4, m7
pmaddubsw m5, m2, [r3 + 2 * 16] ; [18]
pmulhrsw m5, m7
packuswb m4, m5

pmaddubsw m5, m2, [r3 + 7 * 16] ; [23]
pmulhrsw m5, m7
pmaddubsw m2, [r3 + 12 * 16] ; [28]
pmulhrsw m2, m7
packuswb m5, m2

pmaddubsw m6, m3, [r3 - 15 * 16] ; [01]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r3 - 10 * 16] ; [06]
pmulhrsw m1, m7
packuswb m6, m1

pmaddubsw m1, m3, [r3 - 5 * 16] ; [11]
pmulhrsw m1, m7
pmaddubsw m3, [r3] ; [16]
pmulhrsw m3, m7
packuswb m1, m3

TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8
dec r4
jnz .loop
RET

INIT_XMM sse4
cglobal intra_pred_ang16_28, 3,7,8
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0
mova m7, [pw_1024]

.loop:
movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

pmaddubsw m4, m1, [r3 - 11 * 16] ; [5]
pmulhrsw m4, m7
pmaddubsw m2, m1, [r3 - 6 * 16] ; [10]
pmulhrsw m2, m7
packuswb m4, m2

pmaddubsw m5, m1, [r3 - 1 * 16] ; [15]
pmulhrsw m5, m7
pmaddubsw m6, m1, [r3 + 4 * 16] ; [20]
pmulhrsw m6, m7
packuswb m5, m6

pmaddubsw m6, m1, [r3 + 9 * 16] ; [25]
pmulhrsw m6, m7
pmaddubsw m2, m1, [r3 + 14 * 16] ; [30]
pmulhrsw m2, m7
packuswb m6, m2

palignr m2, m0, m1, 2
palignr m3, m0, m1, 4

pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
pmulhrsw m1, m7
pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
pmulhrsw m0, m7
packuswb m1, m0

TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

pmaddubsw m4, m2, [r3 - 3 * 16] ; [13]
pmulhrsw m4, m7
pmaddubsw m5, m2, [r3 + 2 * 16] ; [18]
pmulhrsw m5, m7
packuswb m4, m5

pmaddubsw m5, m2, [r3 + 7 * 16] ; [23]
pmulhrsw m5, m7
pmaddubsw m2, [r3 + 12 * 16] ; [28]
pmulhrsw m2, m7
packuswb m5, m2

pmaddubsw m6, m3, [r3 - 15 * 16] ; [01]
pmulhrsw m6, m7
pmaddubsw m1, m3, [r3 - 10 * 16] ; [06]
pmulhrsw m1, m7
packuswb m6, m1

pmaddubsw m1, m3, [r3 - 5 * 16] ; [11]
pmulhrsw m1, m7
pmaddubsw m3, [r3] ; [16]
pmulhrsw m3, m7
packuswb m1, m3

TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

lea r0, [r6 + 8]
add r2, 8
dec r4
jnz .loop
RET

INIT_XMM sse4
cglobal intra_pred_ang16_9, 3,7,8
add r2, 32
lea r3, [ang_table + 16 * 16]
mov r4d, 2
lea r5, [r1 * 3] ; r5 -> 3 * stride
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]

.loop:
movu m2, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
palignr m3, m2, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
punpcklbw m2, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

pmaddubsw m4, m2, [r3 - 14 * 16] ; [2]
pmulhrsw m4, m7
pmaddubsw m0, m2, [r3 - 12 * 16] ; [4]
pmulhrsw m0, m7
packuswb m4, m0

pmaddubsw m5, m2, [r3 - 10 * 16] ; [6]
pmulhrsw m5, m7
pmaddubsw m6, m2, [r3 - 8 * 16] ; [8]
pmulhrsw m6, m7
packuswb m5, m6

pmaddubsw m6, m2, [r3 - 6 * 16] ; [10]
pmulhrsw m6, m7
pmaddubsw m0, m2, [r3 - 4 * 16] ; [12]
pmulhrsw m0, m7
packuswb m6, m0

pmaddubsw m1, m2, [r3 - 2 * 16] ; [14]
pmulhrsw m1, m7
pmaddubsw m0, m2, [r3] ; [16]
pmulhrsw m0, m7
packuswb m1, m0

TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

pmaddubsw m4, m2, [r3 + 2 * 16] ; [18]
pmulhrsw m4, m7
pmaddubsw m5, m2, [r3 + 4 * 16] ; [20]
pmulhrsw m5, m7
packuswb m4, m5

pmaddubsw m5, m2, [r3 + 6 * 16] ; [22]
pmulhrsw m5, m7
pmaddubsw m6, m2, [r3 + 8 * 16] ; [24]
pmulhrsw m6, m7
packuswb m5, m6

pmaddubsw m6, m2, [r3 + 10 * 16] ; [26]
pmulhrsw m6, m7
pmaddubsw m1, m2, [r3 + 12 * 16] ; [28]
pmulhrsw m1, m7
packuswb m6, m1

pmaddubsw m1, m2, [r3 + 14 * 16] ; [30]
pmulhrsw m1, m7
packuswb m1, m1

punpcklqdq m1, m3 ; [00]

TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
add r2, 8
dec r4
jnz .loop
RET
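
; Mode 27 is the vertical twin of mode 9: the same fraction ladder, but rows
; 1-8 go through TRANSPOSE_STORE_8x8 with its no-transpose (0) second
; argument and rows 9-16 are stored directly with movh/movhps; the last row
; degenerates to a copy of the reference shifted by one sample.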
INIT_XMM sse4
cglobal intra_pred_ang16_27, 3,7,8
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]              ; r5 -> 3 * stride
    mov         r6, r0
    mova        m7, [pw_1024]

.loop:
    movu        m3, [r2 + 1]              ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m2, m3, 1                 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpcklbw   m3, m2                    ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    pmaddubsw   m4, m3, [r3 - 14 * 16]    ; [2]
    pmulhrsw    m4, m7
    pmaddubsw   m0, m3, [r3 - 12 * 16]    ; [4]
    pmulhrsw    m0, m7
    packuswb    m4, m0

    pmaddubsw   m5, m3, [r3 - 10 * 16]    ; [6]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m3, [r3 - 8 * 16]     ; [8]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m3, [r3 - 6 * 16]     ; [10]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m3, [r3 - 4 * 16]     ; [12]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    pmaddubsw   m1, m3, [r3 - 2 * 16]     ; [14]
    pmulhrsw    m1, m7
    pmaddubsw   m0, m3, [r3]              ; [16]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4, m3, [r3 + 2 * 16]     ; [18]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m3, [r3 + 4 * 16]     ; [20]
    pmulhrsw    m5, m7
    packuswb    m4, m5

    pmaddubsw   m5, m3, [r3 + 6 * 16]     ; [22]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m3, [r3 + 8 * 16]     ; [24]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m3, [r3 + 10 * 16]    ; [26]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m3, [r3 + 12 * 16]    ; [28]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    pmaddubsw   m1, m3, [r3 + 14 * 16]    ; [30]
    pmulhrsw    m1, m7
    packuswb    m1, m1

    movh        [r0         ], m4
    movhps      [r0 + r1    ], m4
    movh        [r0 + r1 * 2], m5
    movhps      [r0 + r5    ], m5
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], m6
    movhps      [r0 + r1    ], m6
    movh        [r0 + r1 * 2], m1
    movh        [r0 + r5    ], m2

    lea         r0, [r6 + 8]
    add         r2, 8
    dec         r4
    jnz         .loop
    RET
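
; Mode 10 is pure horizontal: each of the 16 rows is one left pixel broadcast
; with pshufb against a zeroed control.  If the filter flag in r4 is set, row
; 0 is additionally smoothed with the gradient of the row above before being
; stored; packuswb supplies the clip.  A scalar sketch (buffer names
; illustrative, following the HEVC reference formula):
;
;   for (int x = 0; x < 16; x++)
;       dst[x] = clip_uint8(left[1] + ((above[x + 1] - topLeft) >> 1));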
INIT_XMM sse4
cglobal intra_pred_ang16_10, 5,6,8
    lea         r5, [r1 * 3]
    pxor        m7, m7

    movu        m0, [r2 + 1 + 32]
    palignr     m1, m0, 1
    pshufb      m1, m7
    palignr     m2, m0, 2
    pshufb      m2, m7
    palignr     m3, m0, 3
    pshufb      m3, m7
    palignr     m4, m0, 4
    pshufb      m4, m7
    palignr     m5, m0, 5
    pshufb      m5, m7
    palignr     m6, m0, 6
    pshufb      m6, m7

    movu        [r0 + r1], m1
    movu        [r0 + r1 * 2], m2
    movu        [r0 + r5], m3
    lea         r3, [r0 + r1 * 4]
    movu        [r3], m4
    movu        [r3 + r1], m5
    movu        [r3 + r1 * 2], m6

    palignr     m1, m0, 7
    pshufb      m1, m7
    movhlps     m2, m0
    pshufb      m2, m7
    palignr     m3, m0, 9
    pshufb      m3, m7
    palignr     m4, m0, 10
    pshufb      m4, m7
    palignr     m5, m0, 11
    pshufb      m5, m7
    palignr     m6, m0, 12
    pshufb      m6, m7

    movu        [r3 + r5], m1
    lea         r3, [r3 + r1 * 4]
    movu        [r3], m2
    movu        [r3 + r1], m3
    movu        [r3 + r1 * 2], m4
    movu        [r3 + r5], m5
    lea         r3, [r3 + r1 * 4]
    movu        [r3], m6

    palignr     m1, m0, 13
    pshufb      m1, m7
    palignr     m2, m0, 14
    pshufb      m2, m7
    palignr     m3, m0, 15
    pshufb      m3, m7
    pshufb      m0, m7

    movu        [r3 + r1], m1
    movu        [r3 + r1 * 2], m2
    movu        [r3 + r5], m3

    ; filter
    cmp         r4w, byte 0
    jz          .quit
    pmovzxbw    m0, m0
    mova        m1, m0
    movu        m2, [r2]
    movu        m3, [r2 + 1]

    pshufb      m2, m7
    pmovzxbw    m2, m2
    movhlps     m4, m3
    pmovzxbw    m3, m3
    pmovzxbw    m4, m4
    psubw       m3, m2
    psubw       m4, m2
    psraw       m3, 1
    psraw       m4, 1
    paddw       m0, m3
    paddw       m1, m4
    packuswb    m0, m1
.quit:
    movu        [r0], m0
    RET
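
; Mode 26 is pure vertical: the row above is copied to all 16 rows.  With
; bfilter set, column 0 gets the mirrored fix-up (left-column gradient
; against the top-left sample) and is written back byte by byte with pextrb.
; The x86_64 path fetches the flag from the fifth argument via r4mp.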
INIT_XMM sse4
%if ARCH_X86_64 == 1
cglobal intra_pred_ang16_26, 3,8,5
    mov         r7, r4mp
    %define bfilter r7w
%else
cglobal intra_pred_ang16_26, 5,7,5,0-4
    %define bfilter dword[rsp]
    mov         bfilter, r4
%endif
    movu        m0, [r2 + 1]

    lea         r4, [r1 * 3]
    lea         r3, [r0 + r1 * 4]
    lea         r5, [r3 + r1 * 4]
    lea         r6, [r5 + r1 * 4]

    movu        [r0], m0
    movu        [r0 + r1], m0
    movu        [r0 + r1 * 2], m0
    movu        [r0 + r4], m0
    movu        [r3], m0
    movu        [r3 + r1], m0
    movu        [r3 + r1 * 2], m0
    movu        [r3 + r4], m0
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0

    movu        [r6], m0
    movu        [r6 + r1], m0
    movu        [r6 + r1 * 2], m0
    movu        [r6 + r4], m0

    ; filter
    cmp         bfilter, byte 0
    jz          .quit

    pxor        m4, m4
    pshufb      m0, m4
    pmovzxbw    m0, m0
    mova        m1, m0
    movu        m2, [r2 + 32]
    pinsrb      m2, [r2], 0
    movu        m3, [r2 + 1 + 32]

    pshufb      m2, m4
    pmovzxbw    m2, m2
    movhlps     m4, m3
    pmovzxbw    m3, m3
    pmovzxbw    m4, m4
    psubw       m3, m2
    psubw       m4, m2
    psraw       m3, 1
    psraw       m4, 1
    paddw       m0, m3
    paddw       m1, m4
    packuswb    m0, m1

    pextrb      [r0], m0, 0
    pextrb      [r0 + r1], m0, 1
    pextrb      [r0 + r1 * 2], m0, 2
    pextrb      [r0 + r4], m0, 3
    pextrb      [r3], m0, 4
    pextrb      [r3 + r1], m0, 5
    pextrb      [r3 + r1 * 2], m0, 6
    pextrb      [r3 + r4], m0, 7
    pextrb      [r5], m0, 8
    pextrb      [r5 + r1], m0, 9
    pextrb      [r5 + r1 * 2], m0, 10
    pextrb      [r5 + r4], m0, 11
    pextrb      [r6], m0, 12
    pextrb      [r6 + r1], m0, 13
    pextrb      [r6 + r1 * 2], m0, 14
    pextrb      [r6 + r4], m0, 15
.quit:
    RET
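
; Mode 11 (angle -2) needs just one projected sample, the top-left pixel
; pinned in front of the left reference with pinsrb; fractions descend
; 30,28,...,2 and the "[00]" row reuses the saved reference in m2.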
INIT_XMM sse4
|
|
cglobal intra_pred_ang16_11, 3,7,8
|
|
lea r3, [ang_table + 16 * 16]
|
|
lea r5, [r1 * 3] ; r5 -> 3 * stride
|
|
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
|
|
mova m7, [pw_1024]
|
|
|
|
movu m3, [r2 + 32] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
|
|
pinsrb m3, [r2], 0
|
|
mova m2, m3
|
|
palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
|
|
punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
|
|
|
|
pmaddubsw m4, m3, [r3 + 14 * 16] ; [30]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m0, m3, [r3 + 12 * 16] ; [28]
|
|
pmulhrsw m0, m7
|
|
packuswb m4, m0
|
|
|
|
pmaddubsw m5, m3, [r3 + 10 * 16] ; [26]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r3 + 6 * 16] ; [22]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m3, [r3 + 4 * 16] ; [20]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pmaddubsw m1, m3, [r3 + 2 * 16] ; [18]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m0, m3, [r3] ; [16]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r3 - 2 * 16] ; [14]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m3, [r3 - 4 * 16] ; [12]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r3 - 6 * 16] ; [10]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r3 - 8 * 16] ; [08]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r3 - 10 * 16] ; [06]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m3, [r3 - 12 * 16] ; [04]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pmaddubsw m1, m3, [r3 - 14 * 16] ; [02]
|
|
pmulhrsw m1, m7
|
|
packuswb m1, m1
|
|
punpcklqdq m1, m2 ;[00]
|
|
|
|
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
|
|
|
|
lea r0, [r6 + r1 * 4]
|
|
lea r6, [r6 + r1 * 8]
|
|
|
|
movu m3, [r2 + 40] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
|
|
mova m2, m3
|
|
palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
|
|
punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
|
|
|
|
pmaddubsw m4, m3, [r3 + 14 * 16] ; [30]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m0, m3, [r3 + 12 * 16] ; [28]
|
|
pmulhrsw m0, m7
|
|
packuswb m4, m0
|
|
|
|
pmaddubsw m5, m3, [r3 + 10 * 16] ; [26]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r3 + 6 * 16] ; [22]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m3, [r3 + 4 * 16] ; [20]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pmaddubsw m1, m3, [r3 + 2 * 16] ; [18]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m0, m3, [r3] ; [16]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r3 - 2 * 16] ; [14]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m3, [r3 - 4 * 16] ; [12]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r3 - 6 * 16] ; [10]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r3 - 8 * 16] ; [08]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r3 - 10 * 16] ; [06]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m3, [r3 - 12 * 16] ; [04]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pmaddubsw m1, m3, [r3 - 14 * 16] ; [02]
|
|
pmulhrsw m1, m7
|
|
packuswb m1, m1
|
|
punpcklqdq m1, m2 ;[00]
|
|
|
|
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
|
|
RET
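
; Mode 25 mirrors mode 11 for the vertical class: the same descending
; fraction ladder over the row above, with rows stored directly.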
INIT_XMM sse4
|
|
cglobal intra_pred_ang16_25, 3,7,8
|
|
lea r3, [ang_table + 16 * 16]
|
|
mov r4d, 2
|
|
lea r5, [r1 * 3] ; r5 -> 3 * stride
|
|
mov r6, r0
|
|
mova m7, [pw_1024]
|
|
|
|
.loop:
|
|
movu m3, [r2] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
|
|
mova m2, m3
|
|
palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
|
|
punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
|
|
|
|
pmaddubsw m4, m3, [r3 + 14 * 16] ; [30]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m0, m3, [r3 + 12 * 16] ; [28]
|
|
pmulhrsw m0, m7
|
|
packuswb m4, m0
|
|
|
|
pmaddubsw m5, m3, [r3 + 10 * 16] ; [26]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r3 + 6 * 16] ; [22]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m3, [r3 + 4 * 16] ; [20]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pmaddubsw m1, m3, [r3 + 2 * 16] ; [18]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m0, m3, [r3] ; [16]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r3 - 2 * 16] ; [14]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m3, [r3 - 4 * 16] ; [12]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r3 - 6 * 16] ; [10]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r3 - 8 * 16] ; [08]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r3 - 10 * 16] ; [06]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m3, [r3 - 12 * 16] ; [04]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pmaddubsw m1, m3, [r3 - 14 * 16] ; [02]
|
|
pmulhrsw m1, m7
|
|
packuswb m1, m1
|
|
|
|
movh [r0 ], m4
|
|
movhps [r0 + r1 ], m4
|
|
movh [r0 + r1 * 2], m5
|
|
movhps [r0 + r5 ], m5
|
|
lea r0, [r0 + r1 * 4]
|
|
movh [r0 ], m6
|
|
movhps [r0 + r1 ], m6
|
|
movh [r0 + r1 * 2], m1
|
|
movh [r0 + r5 ], m2
|
|
|
|
lea r0, [r6 + 8]
|
|
add r2, 8
|
|
dec r4
|
|
jnz .loop
|
|
RET
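
; Modes 12-17 (and mirrors 19-24) have steeper negative angles, so samples
; from the opposite reference array must be projected onto the main one.
; The c_mode16_xx masks gather exactly those samples; pslldq plus palignr
; then slide the 16-byte window one projected pixel at a time.  A scalar
; sketch of the projection (invAngle per the HEVC spec, names illustrative):
;
;   for (int x = -1; x >= (16 * angle) >> 5; x--)
;       refMain[x] = refSide[(x * invAngle + 128) >> 8];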
INIT_XMM sse4
|
|
cglobal intra_pred_ang16_12, 4,7,8
|
|
lea r4, [ang_table + 16 * 16]
|
|
lea r5, [r1 * 3] ; r5 -> 3 * stride
|
|
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
|
|
mova m7, [pw_1024]
|
|
|
|
movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
|
|
pinsrb m3, [r2], 0
|
|
punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
|
|
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
|
|
movu m2, [r2]
|
|
pshufb m2, [c_mode16_12]
|
|
|
|
palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
|
|
|
|
pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m1, m0, [r4 + 6 * 16] ; [22]
|
|
pmulhrsw m1, m7
|
|
packuswb m4, m1
|
|
|
|
pmaddubsw m5, m0, [r4 + 1 * 16] ; [17]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, [r4 - 14 * 16] ; [2]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
palignr m3, m2, 15
|
|
|
|
pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
|
|
|
|
lea r0, [r6 + r1 * 4]
|
|
lea r6, [r6 + r1 * 8]
|
|
|
|
movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
|
|
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
|
|
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
|
|
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x]
|
|
|
|
pmaddubsw m4, m3, [r4 + 11 * 16] ; [27]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 + 1 * 16] ; [17]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r4 - 9 * 16] ; [7]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m3, [r4 - 14 * 16] ; [2]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
|
|
RET
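
; Mode 24 is the vertical mirror of mode 12: the main reference is the row
; above and c_mode16_12 gathers the projected pixels from the left column
; (the +32 operand offsets swap accordingly).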
INIT_XMM sse4
|
|
cglobal intra_pred_ang16_24, 4,7,8
|
|
lea r4, [ang_table + 16 * 16]
|
|
lea r5, [r1 * 3] ; r5 -> 3 * stride
|
|
mov r6, r0
|
|
mova m7, [pw_1024]
|
|
|
|
movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
|
|
punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
|
|
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
|
|
movu m2, [r2 + 32]
|
|
pshufb m2, [c_mode16_12]
|
|
|
|
palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
|
|
|
|
pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m1, m0, [r4 + 6 * 16] ; [22]
|
|
pmulhrsw m1, m7
|
|
packuswb m4, m1
|
|
|
|
pmaddubsw m5, m0, [r4 + 1 * 16] ; [17]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, [r4 - 14 * 16] ; [2]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
palignr m3, m2, 15
|
|
|
|
pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
|
|
|
|
lea r0, [r6 + 8]
|
|
|
|
movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
|
|
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
|
|
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
|
|
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x]
|
|
|
|
pmaddubsw m4, m3, [r4 + 11 * 16] ; [27]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 + 1 * 16] ; [17]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r4 - 9 * 16] ; [7]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m3, [r4 - 14 * 16] ; [2]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
|
|
RET
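
; Mode 13 follows the same pattern with a steeper slope: c_mode16_13 holds
; the four projected side samples the angle needs, and each pslldq/palignr
; pair below shifts one more of them into the working window.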
INIT_XMM sse4
|
|
cglobal intra_pred_ang16_13, 4,7,8
|
|
lea r4, [ang_table + 16 * 16]
|
|
lea r5, [r1 * 3] ; r5 -> 3 * stride
|
|
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
|
|
mova m7, [pw_1024]
|
|
|
|
movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
|
|
pinsrb m3, [r2], 0
|
|
punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
|
|
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
|
|
movu m2, [r2]
|
|
pshufb m2, [c_mode16_13]
|
|
|
|
palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
|
|
|
|
pmaddubsw m4, m5, [r4 + 7 * 16] ; [23]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m0, m5, [r4 - 2 * 16] ; [14]
|
|
pmulhrsw m0, m7
|
|
packuswb m4, m0
|
|
|
|
pmaddubsw m5, [r4 - 11 * 16] ; [05]
|
|
pmulhrsw m5, m7
|
|
|
|
palignr m3, m2, 15
|
|
|
|
pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
|
|
pmulhrsw m1, m7
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 - 16] ; [15]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
|
|
|
|
lea r0, [r6 + r1 * 4]
|
|
lea r6, [r6 + r1 * 8]
|
|
|
|
movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
|
|
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
|
|
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
|
|
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x]
|
|
|
|
pmaddubsw m4, m3, [r4 + 7 * 16] ; [23]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
|
|
pmulhrsw m5, m7
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 - 16] ; [15]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
|
|
RET
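
; Mode 23: vertical mirror of mode 13, with the projected samples gathered
; from the left column through the same c_mode16_13 mask.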
INIT_XMM sse4
|
|
cglobal intra_pred_ang16_23, 4,7,8
|
|
lea r4, [ang_table + 16 * 16]
|
|
lea r5, [r1 * 3] ; r5 -> 3 * stride
|
|
mov r6, r0
|
|
mova m7, [pw_1024]
|
|
|
|
movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
|
|
punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
|
|
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
|
|
movu m2, [r2 + 32]
|
|
pshufb m2, [c_mode16_13]
|
|
|
|
palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
|
|
|
|
pmaddubsw m4, m5, [r4 + 7 * 16] ; [23]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m0, m5, [r4 - 2 * 16] ; [14]
|
|
pmulhrsw m0, m7
|
|
packuswb m4, m0
|
|
|
|
pmaddubsw m5, [r4 - 11 * 16] ; [05]
|
|
pmulhrsw m5, m7
|
|
|
|
palignr m3, m2, 15
|
|
|
|
pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
|
|
pmulhrsw m1, m7
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 - 16] ; [15]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
|
|
|
|
lea r0, [r6 + 8]
|
|
|
|
movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
|
|
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
|
|
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
|
|
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x]
|
|
|
|
pmaddubsw m4, m3, [r4 + 7 * 16] ; [23]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
|
|
pmulhrsw m5, m7
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 - 16] ; [15]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
|
|
RET
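
; Mode 14 projects a side sample roughly every other row (the six entries of
; c_mode16_14); otherwise identical in structure to mode 13.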
INIT_XMM sse4
|
|
cglobal intra_pred_ang16_14, 4,7,8
|
|
lea r4, [ang_table + 16 * 16]
|
|
lea r5, [r1 * 3] ; r5 -> 3 * stride
|
|
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
|
|
mova m7, [pw_1024]
|
|
|
|
movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
|
|
pinsrb m3, [r2], 0
|
|
punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
|
|
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
|
|
movu m2, [r2]
|
|
pshufb m2, [c_mode16_14]
|
|
|
|
palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
|
|
|
|
pmaddubsw m4, m5, [r4 + 3 * 16] ; [19]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, [r4 - 10 * 16] ; [06]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
palignr m3, m2, 15
|
|
|
|
pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m4, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 + 16] ; [17]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
|
|
|
|
lea r0, [r6 + r1 * 4]
|
|
lea r6, [r6 + r1 * 8]
|
|
|
|
movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
|
|
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
|
|
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
|
|
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x]
|
|
|
|
pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m4, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 + 16] ; [17]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
|
|
RET
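
; Mode 22: vertical mirror of mode 14, again reading the projected samples
; from the left column at +32.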
INIT_XMM sse4
|
|
cglobal intra_pred_ang16_22, 4,7,8
|
|
lea r4, [ang_table + 16 * 16]
|
|
lea r5, [r1 * 3] ; r5 -> 3 * stride
|
|
mov r6, r0
|
|
mova m7, [pw_1024]
|
|
|
|
movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
|
|
punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
|
|
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
|
|
movu m2, [r2 + 32]
|
|
pshufb m2, [c_mode16_14]
|
|
|
|
palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
|
|
|
|
pmaddubsw m4, m5, [r4 + 3 * 16] ; [19]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, [r4 - 10 * 16] ; [06]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
palignr m3, m2, 15
|
|
|
|
pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m4, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 + 16] ; [17]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
|
|
|
|
lea r0, [r6 + 8]
|
|
|
|
movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
|
|
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
|
|
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
|
|
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x]
|
|
|
|
pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m4, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 + 16] ; [17]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
|
|
RET
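
; Mode 15 needs a projected sample nearly every second row (eight entries in
; c_mode16_15), so the window shifts before almost every 8-pixel batch.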
INIT_XMM sse4
|
|
cglobal intra_pred_ang16_15, 4,7,8
|
|
lea r4, [ang_table + 16 * 16]
|
|
lea r5, [r1 * 3] ; r5 -> 3 * stride
|
|
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
|
|
mova m7, [pw_1024]
|
|
|
|
movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
|
|
pinsrb m3, [r2], 0
|
|
punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
|
|
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
|
|
movu m2, [r2]
|
|
pshufb m2, [c_mode16_15]
|
|
|
|
palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
|
|
|
|
pmaddubsw m4, [r4 - 16] ; [15]
|
|
pmulhrsw m4, m7
|
|
|
|
palignr m3, m2, 15
|
|
|
|
pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
|
|
pmulhrsw m5, m7
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m6, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
|
|
pmulhrsw m4, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
|
|
pmulhrsw m5, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
|
|
pmulhrsw m6, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
|
|
|
|
lea r0, [r6 + r1 * 4]
|
|
lea r6, [r6 + r1 * 8]
|
|
|
|
movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
|
|
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
|
|
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
|
|
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L]
|
|
|
|
pmaddubsw m4, m3, [r4 - 16] ; [15]
|
|
pmulhrsw m4, m7
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
|
|
pmulhrsw m5, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m6, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
|
|
pmulhrsw m4, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
|
|
pmulhrsw m5, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
|
|
pmulhrsw m6, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
|
|
RET
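
; Mode 21: vertical mirror of mode 15; note the extra pinsrb pinning the
; top-left sample in front of the gathered left pixels.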
INIT_XMM sse4
|
|
cglobal intra_pred_ang16_21, 4,7,8
|
|
lea r4, [ang_table + 16 * 16]
|
|
lea r5, [r1 * 3] ; r5 -> 3 * stride
|
|
mov r6, r0
|
|
mova m7, [pw_1024]
|
|
|
|
movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
|
|
punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
|
|
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
|
|
movu m2, [r2 + 32]
|
|
pinsrb m2, [r2], 0
|
|
pshufb m2, [c_mode16_15]
|
|
|
|
palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
|
|
|
|
pmaddubsw m4, [r4 - 16] ; [15]
|
|
pmulhrsw m4, m7
|
|
|
|
palignr m3, m2, 15
|
|
|
|
pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
|
|
pmulhrsw m5, m7
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m6, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
|
|
pmulhrsw m4, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
|
|
pmulhrsw m5, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
|
|
pmulhrsw m6, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
|
|
|
|
lea r0, [r6 + 8]
|
|
|
|
movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
|
|
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
|
|
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
|
|
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L]
|
|
|
|
pmaddubsw m4, m3, [r4 - 16] ; [15]
|
|
pmulhrsw m4, m7
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
|
|
pmulhrsw m5, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m6, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
|
|
pmulhrsw m4, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
|
|
pmulhrsw m5, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
|
|
pmulhrsw m6, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
|
|
RET
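
; Modes 16/20 carry the densest projection of the group, so for the second
; half of the block m2 is rotated with palignr m2, m2, 6 and merged with the
; fresh reference through movlhps instead of being regathered.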
INIT_XMM sse4
|
|
cglobal intra_pred_ang16_16, 4,7,8
|
|
lea r4, [ang_table + 16 * 16]
|
|
lea r5, [r1 * 3] ; r5 -> 3 * stride
|
|
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
|
|
mova m7, [pw_1024]
|
|
|
|
movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
|
|
pinsrb m3, [r2], 0
|
|
punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
|
|
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
|
|
movu m2, [r2]
|
|
pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8]
|
|
palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
|
|
|
|
pmaddubsw m4, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m4, m7
|
|
|
|
palignr m3, m2, 15
|
|
|
|
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
|
|
pmulhrsw m5, m7
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
|
|
pmulhrsw m4, m7
|
|
|
|
pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 - 16] ; [15]
|
|
pmulhrsw m6, m7
|
|
|
|
pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
|
|
|
|
lea r0, [r6 + r1 * 4]
|
|
lea r6, [r6 + r1 * 8]
|
|
|
|
movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
|
|
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
|
|
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
|
|
palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
|
|
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x]
|
|
|
|
pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m4, m7
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
|
|
pmulhrsw m5, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
|
|
pmulhrsw m6, m7
|
|
|
|
pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
|
|
pmulhrsw m4, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 - 16] ; [15]
|
|
pmulhrsw m6, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
|
|
RET
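
; Mode 20: vertical mirror of mode 16, with the projected pixels pulled from
; the left column (pinsrb of the top-left plus the c_mode16_16 gather).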
INIT_XMM sse4
|
|
cglobal intra_pred_ang16_20, 4,7,8
|
|
lea r4, [ang_table + 16 * 16]
|
|
lea r5, [r1 * 3] ; r5 -> 3 * stride
|
|
mov r6, r0
|
|
mova m7, [pw_1024]
|
|
|
|
movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
|
|
punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
|
|
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
|
|
movu m2, [r2 + 32]
|
|
pinsrb m2, [r2], 0
|
|
pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8]
|
|
palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
|
|
|
|
pmaddubsw m4, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m4, m7
|
|
|
|
palignr m3, m2, 15
|
|
|
|
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
|
|
pmulhrsw m5, m7
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
|
|
pmulhrsw m4, m7
|
|
|
|
pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 - 16] ; [15]
|
|
pmulhrsw m6, m7
|
|
|
|
pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
|
|
|
|
lea r0, [r6 + 8]
|
|
|
|
movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
|
|
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
|
|
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
|
|
palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
|
|
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x]
|
|
|
|
pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m4, m7
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
|
|
pmulhrsw m5, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
|
|
pmulhrsw m6, m7
|
|
|
|
pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
|
|
pmulhrsw m4, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 - 16] ; [15]
|
|
pmulhrsw m6, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m3, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
|
|
RET
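
; Mode 17 projects a side sample on almost every row; the c_mode16_17 gather
; is topped up mid-stream with pinsrb before the window runs dry, and the
; final row is the pure "[00]" copy.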
INIT_XMM sse4
|
|
cglobal intra_pred_ang16_17, 4,7,8
|
|
lea r4, [ang_table + 16 * 16]
|
|
lea r5, [r1 * 3] ; r5 -> 3 * stride
|
|
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
|
|
mova m7, [pw_1024]
|
|
|
|
movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
|
|
pinsrb m3, [r2], 0
|
|
punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
|
|
punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
|
|
movu m2, [r2]
|
|
pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4]
|
|
palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
|
|
|
|
pmaddubsw m4, [r4 - 10 * 16] ; [06]
|
|
pmulhrsw m4, m7
|
|
|
|
palignr m3, m2, 15
|
|
|
|
pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
|
|
pmulhrsw m5, m7
|
|
|
|
pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x]
|
|
pinsrb m2, [r2 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4] ; [16]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
|
|
|
|
pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
|
|
pmulhrsw m4, m7
|
|
|
|
pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
|
|
pmulhrsw m5, m7
|
|
|
|
pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
|
|
pmulhrsw m6, m7
|
|
|
|
pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x]
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, [r4 - 16 * 16] ; [00]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
|
|
|
|
lea r0, [r6 + r1 * 4]
|
|
lea r6, [r6 + r1 * 8]
|
|
|
|
movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
|
|
pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
|
|
punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
|
|
palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x]
|
|
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x]
|
|
|
|
pmaddubsw m4, m3, [r4 - 10 * 16] ; [06]
|
|
pmulhrsw m4, m7
|
|
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
|
|
pmulhrsw m5, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
|
|
pmulhrsw m1, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m0, m3, [r4] ; [16]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
|
|
pmulhrsw m4, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
|
|
pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
|
|
pmulhrsw m5, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
|
|
pmulhrsw m6, m7
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
|
|
pslldq m2, 1
|
|
palignr m3, m2, 14
|
|
|
|
pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, [r4 - 16 * 16] ; [00]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
|
|
RET
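
; Mode 19 mirrors mode 17: the row above is the main reference and the
; projected samples come from the left column (note the +32 offsets on the
; gather and on the mid-stream pinsrb).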
INIT_XMM sse4
cglobal intra_pred_ang16_19, 4,7,8
    lea r4, [ang_table + 16 * 16]
    lea r5, [r1 * 3] ; r5 -> 3 * stride
    mov r6, r0
    mova m7, [pw_1024]

    movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
    punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    movu m2, [r2 + 32]
    pinsrb m2, [r2], 0
    pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4]
    palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw m4, [r4 - 10 * 16] ; [06]
    pmulhrsw m4, m7

    palignr m3, m2, 15

    pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
    pmulhrsw m5, m7
    packuswb m4, m5

    palignr m3, m2, 14

    pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
    pmulhrsw m5, m7

    pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x]
    pinsrb m2, [r2 + 5 + 32], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
    palignr m3, m2, 14

    pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
    pmulhrsw m6, m7
    packuswb m5, m6

    pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x]
    palignr m3, m2, 14

    pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
    pmulhrsw m6, m7
    pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
    pmulhrsw m0, m7
    packuswb m6, m0

    pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x]
    palignr m3, m2, 14

    pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
    pmulhrsw m1, m7

    pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x]
    palignr m3, m2, 14

    pmaddubsw m0, m3, [r4] ; [16]
    pmulhrsw m0, m7
    packuswb m1, m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x]
    palignr m3, m2, 14

    pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
    pmulhrsw m4, m7

    pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x]
    palignr m3, m2, 14

    pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
    pmulhrsw m5, m7
    packuswb m4, m5

    pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
    pmulhrsw m5, m7

    pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x]
    palignr m3, m2, 14

    pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
    pmulhrsw m6, m7
    packuswb m5, m6

    pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x]
    palignr m3, m2, 14

    pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
    pmulhrsw m6, m7

    pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x]
    palignr m3, m2, 14

    pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
    pmulhrsw m1, m7
    packuswb m6, m1

    pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x]
    palignr m3, m2, 14

    pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
    pmulhrsw m1, m7
    pmaddubsw m3, [r4 - 16 * 16] ; [00]
    pmulhrsw m3, m7
    packuswb m1, m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    lea r0, [r6 + 8]

    movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
    punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x]
    movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x]

    pmaddubsw m4, m3, [r4 - 10 * 16] ; [06]
    pmulhrsw m4, m7

    palignr m3, m2, 14

    pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
    pmulhrsw m5, m7
    packuswb m4, m5

    pslldq m2, 1
    palignr m3, m2, 14

    pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
    pmulhrsw m5, m7

    pslldq m2, 1
    palignr m3, m2, 14

    pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
    pmulhrsw m6, m7
    packuswb m5, m6

    pslldq m2, 1
    palignr m3, m2, 14

    pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
    pmulhrsw m6, m7
    pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
    pmulhrsw m0, m7
    packuswb m6, m0

    pslldq m2, 1
    palignr m3, m2, 14

    pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
    pmulhrsw m1, m7

    pslldq m2, 1
    palignr m3, m2, 14

    pmaddubsw m0, m3, [r4] ; [16]
    pmulhrsw m0, m7
    packuswb m1, m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pslldq m2, 1
    palignr m3, m2, 14

    pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
    pmulhrsw m4, m7

    pslldq m2, 1
    palignr m3, m2, 14

    pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
    pmulhrsw m5, m7
    packuswb m4, m5

    pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
    pmulhrsw m5, m7

    pslldq m2, 1
    palignr m3, m2, 14

    pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
    pmulhrsw m6, m7
    packuswb m5, m6

    pslldq m2, 1
    palignr m3, m2, 14

    pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
    pmulhrsw m6, m7

    pslldq m2, 1
    palignr m3, m2, 14

    pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
    pmulhrsw m1, m7
    packuswb m6, m1

    pslldq m2, 1
    palignr m3, m2, 14

    pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
    pmulhrsw m1, m7
    pmaddubsw m3, [r4 - 16 * 16] ; [00]
    pmulhrsw m3, m7
    packuswb m1, m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
    RET

INIT_XMM sse4
cglobal intra_pred_ang16_18, 4,5,3
    movu m0, [r2]
    movu m1, [r2 + 32]
    mova m2, [c_mode16_18]
    pshufb m1, m2

    lea r2, [r1 * 2]
    lea r3, [r1 * 3]
    lea r4, [r1 * 4]
    movu [r0], m0
    palignr m2, m0, m1, 15
    movu [r0 + r1], m2
    palignr m2, m0, m1, 14
    movu [r0 + r2], m2
    palignr m2, m0, m1, 13
    movu [r0 + r3], m2
    lea r0, [r0 + r4]
    palignr m2, m0, m1, 12
    movu [r0], m2
    palignr m2, m0, m1, 11
    movu [r0 + r1], m2
    palignr m2, m0, m1, 10
    movu [r0 + r2], m2
    palignr m2, m0, m1, 9
    movu [r0 + r3], m2
    lea r0, [r0 + r4]
    palignr m2, m0, m1, 8
    movu [r0], m2
    palignr m2, m0, m1, 7
    movu [r0 + r1], m2
    palignr m2, m0, m1, 6
    movu [r0 + r2], m2
    palignr m2, m0, m1, 5
    movu [r0 + r3], m2
    lea r0, [r0 + r4]
    palignr m2, m0, m1, 4
    movu [r0], m2
    palignr m2, m0, m1, 3
    movu [r0 + r1], m2
    palignr m2, m0, m1, 2
    movu [r0 + r2], m2
    palignr m0, m1, 1
    movu [r0 + r3], m0
    RET

; Process Intra32x32, input 8x8 in [m0, m1, m2, m3, m4, m5, m6, m7], output 8x8
%macro PROC32_8x8 10 ; col4, transpose[0/1], c0, c1, c2, c3, c4, c5, c6, c7
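; Parameter summary (descriptive only, inferred from the body below):
; %1         column group 0-3; with transpose the 8x8 result lands at dst + %1 * 8
; %2         transpose flag: 1 = interleave with punpck and store transposed, 0 = store rows directly
; %3 .. %10  per-row interpolation fractions: 0 keeps the reference row unfiltered
;            (pmovzxbw path), any other value c filters through the ang_table
;            coefficients at [r4 + c * 16] with pw_1024 rounding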
    %if %3 == 0
    %else
    pshufb m0, [r3]
    pmaddubsw m0, [r4 + %3 * 16]
    pmulhrsw m0, [pw_1024]
    %endif
    %if %4 == 0
    pmovzxbw m1, m1
    %else
    pshufb m1, [r3]
    pmaddubsw m1, [r4 + %4 * 16]
    pmulhrsw m1, [pw_1024]
    %endif
    %if %3 == 0
    packuswb m1, m1
    movlhps m0, m1
    %else
    packuswb m0, m1
    %endif
    mova m1, [pw_1024]
    %if %5 == 0
    %else
    pshufb m2, [r3]
    pmaddubsw m2, [r4 + %5 * 16]
    pmulhrsw m2, m1
    %endif
    %if %6 == 0
    pmovzxbw m3, m3
    %else
    pshufb m3, [r3]
    pmaddubsw m3, [r4 + %6 * 16]
    pmulhrsw m3, m1
    %endif
    %if %5 == 0
    packuswb m3, m3
    movlhps m2, m3
    %else
    packuswb m2, m3
    %endif
    %if %7 == 0
    %else
    pshufb m4, [r3]
    pmaddubsw m4, [r4 + %7 * 16]
    pmulhrsw m4, m1
    %endif
    %if %8 == 0
    pmovzxbw m5, m5
    %else
    pshufb m5, [r3]
    pmaddubsw m5, [r4 + %8 * 16]
    pmulhrsw m5, m1
    %endif
    %if %7 == 0
    packuswb m5, m5
    movlhps m4, m5
    %else
    packuswb m4, m5
    %endif
    %if %9 == 0
    %else
    pshufb m6, [r3]
    pmaddubsw m6, [r4 + %9 * 16]
    pmulhrsw m6, m1
    %endif
    %if %10 == 0
    pmovzxbw m7, m7
    %else
    pshufb m7, [r3]
    pmaddubsw m7, [r4 + %10 * 16]
    pmulhrsw m7, m1
    %endif
    %if %9 == 0
    packuswb m7, m7
    movlhps m6, m7
    %else
    packuswb m6, m7
    %endif

    %if %2 == 1
    ; transpose
    punpckhbw m1, m0, m2
    punpcklbw m0, m2
    punpckhbw m3, m0, m1
    punpcklbw m0, m1

    punpckhbw m1, m4, m6
    punpcklbw m4, m6
    punpckhbw m6, m4, m1
    punpcklbw m4, m1

    punpckhdq m2, m0, m4
    punpckldq m0, m4
    punpckldq m4, m3, m6
    punpckhdq m3, m6

    movh [r0 + %1 * 8], m0
    movhps [r0 + r1 + %1 * 8], m0
    movh [r0 + r1*2 + %1 * 8], m2
    movhps [r0 + r5 + %1 * 8], m2
    movh [r6 + %1 * 8], m4
    movhps [r6 + r1 + %1 * 8], m4
    movh [r6 + r1*2 + %1 * 8], m3
    movhps [r6 + r5 + %1 * 8], m3
    %else
    movh [r0], m0
    movhps [r0 + r1], m0
    movh [r0 + r1 * 2], m2
    movhps [r0 + r5], m2
    lea r0, [r0 + r1 * 4]
    movh [r0], m4
    movhps [r0 + r1], m4
    movh [r0 + r1 * 2], m6
    movhps [r0 + r5], m6
    %endif
%endmacro

%macro MODE_3_33 1
|
|
movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
|
|
palignr m1, m0, 1 ; [ x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
|
|
punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
|
|
punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
|
|
palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
|
|
pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m1, [r3 + 4 * 16] ; [20]
|
|
pmulhrsw m1, m7
|
|
packuswb m4, m1
|
|
palignr m5, m2, m0, 4
|
|
pmaddubsw m5, [r3 - 2 * 16] ; [14]
|
|
pmulhrsw m5, m7
|
|
palignr m6, m2, m0, 6
|
|
pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
palignr m1, m2, m0, 8
|
|
pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, [r3 + 12 * 16] ; [28]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
palignr m1, m2, m0, 10
|
|
pmaddubsw m1, [r3 + 6 * 16] ; [22]
|
|
pmulhrsw m1, m7
|
|
palignr m2, m0, 12
|
|
pmaddubsw m2, [r3] ; [16]
|
|
pmulhrsw m2, m7
|
|
packuswb m1, m2
|
|
|
|
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
|
|
|
|
movu m0, [r2 + 8]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
palignr m5, m2, m0, 2
|
|
pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
|
|
pmulhrsw m1, m7
|
|
packuswb m4, m1
|
|
pmaddubsw m5, [r3 + 14 * 16] ; [30]
|
|
pmulhrsw m5, m7
|
|
palignr m6, m2, m0, 4
|
|
pmaddubsw m6, [r3 + 8 * 16] ; [24]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
palignr m1, m2, m0, 6
|
|
pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
|
|
pmulhrsw m6, m7
|
|
palignr m1, m2, m0, 8
|
|
pmaddubsw m1, [r3 - 4 * 16] ; [12]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
palignr m1, m2, m0, 10
|
|
pmaddubsw m1, [r3 - 10 * 16] ; [06]
|
|
pmulhrsw m1, m7
|
|
packuswb m1, m1
|
|
movhps m1, [r2 + 14] ; [00]
|
|
|
|
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
|
|
|
|
movu m0, [r2 + 14]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
palignr m1, m2, m0, 2
|
|
pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m1, [r3 + 4 * 16] ; [20]
|
|
pmulhrsw m1, m7
|
|
packuswb m4, m1
|
|
palignr m5, m2, m0, 4
|
|
pmaddubsw m5, [r3 - 2 * 16] ; [14]
|
|
pmulhrsw m5, m7
|
|
palignr m6, m2, m0, 6
|
|
pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
palignr m1, m2, m0, 8
|
|
pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, [r3 + 12 * 16] ; [28]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
palignr m1, m2, m0, 10
|
|
pmaddubsw m1, [r3 + 6 * 16] ; [22]
|
|
pmulhrsw m1, m7
|
|
palignr m2, m0, 12
|
|
pmaddubsw m2, [r3] ; [16]
|
|
pmulhrsw m2, m7
|
|
packuswb m1, m2
|
|
|
|
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
|
|
|
|
movu m0, [r2 + 21]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
palignr m5, m2, m0, 2
|
|
pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
|
|
pmulhrsw m1, m7
|
|
packuswb m4, m1
|
|
pmaddubsw m5, [r3 + 14 * 16] ; [30]
|
|
pmulhrsw m5, m7
|
|
palignr m6, m2, m0, 4
|
|
pmaddubsw m6, [r3 + 8 * 16] ; [24]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
palignr m1, m2, m0, 6
|
|
pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
|
|
pmulhrsw m6, m7
|
|
palignr m1, m2, m0, 8
|
|
pmaddubsw m1, [r3 - 4 * 16] ; [12]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
palignr m1, m2, m0, 10
|
|
pmaddubsw m1, [r3 - 10 * 16] ; [06]
|
|
pmulhrsw m1, m7
|
|
packuswb m1, m1
|
|
movhps m1, [r2 + 27] ; [00]
|
|
|
|
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
|
|
%endmacro
|
|
|
|
%macro MODE_4_32 1
|
|
movu m0, [r2 + 1]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
palignr m1, m2, m0, 2
|
|
mova m5, m1
|
|
pmaddubsw m4, m0, [r3 + 5 * 16] ; [21]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m1, [r3 - 6 * 16] ; [10]
|
|
pmulhrsw m1, m7
|
|
packuswb m4, m1
|
|
pmaddubsw m5, [r3 + 15 * 16] ; [31]
|
|
pmulhrsw m5, m7
|
|
palignr m6, m2, m0, 4
|
|
pmaddubsw m6, [r3 + 4 * 16] ; [ 20]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
palignr m1, m2, m0, 6
|
|
pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, [r3 + 14 * 16] ; [30]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
palignr m1, m2, m0, 8
|
|
pmaddubsw m1, [r3 + 3 * 16] ; [19]
|
|
pmulhrsw m1, m7
|
|
palignr m2, m0, 10
|
|
pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
|
|
pmulhrsw m4, m7
|
|
movu m0, [r2 + 6]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
palignr m1, m2, m0, 2
|
|
pmaddubsw m1, [r3 + 2 * 16] ; [18]
|
|
pmulhrsw m1, m7
|
|
packuswb m4, m1
|
|
palignr m5, m2, m0, 4
|
|
mova m6, m5
|
|
pmaddubsw m5, [r3 - 9 * 16] ; [07]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, [r3 + 12 * 16] ; [28]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
palignr m6, m2, m0, 6
|
|
pmaddubsw m6, [r3 + 16] ; [17]
|
|
pmulhrsw m6, m7
|
|
palignr m1, m2, m0, 8
|
|
pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
|
|
pmulhrsw m3, m7
|
|
packuswb m6, m3
|
|
pmaddubsw m1, [r3 + 11 * 16] ; [27]
|
|
pmulhrsw m1, m7
|
|
palignr m2, m0, 10
|
|
pmaddubsw m2, [r3] ; [16]
|
|
pmulhrsw m2, m7
|
|
packuswb m1, m2
|
|
|
|
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
|
|
|
|
movu m0, [r2 + 12]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
mova m1, m0
|
|
pmaddubsw m4, m0, [r3 - 11 * 16] ; [5]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m1, [r3 + 10 * 16] ; [26]
|
|
pmulhrsw m1, m7
|
|
packuswb m4, m1
|
|
palignr m5, m2, m0, 2
|
|
pmaddubsw m5, [r3 - 16] ; [15]
|
|
pmulhrsw m5, m7
|
|
palignr m6, m2, m0, 4
|
|
mova m1, m6
|
|
pmaddubsw m1, [r3 - 12 * 16] ; [4]
|
|
pmulhrsw m1, m7
|
|
packuswb m5, m1
|
|
pmaddubsw m6, [r3 + 9 * 16] ; [25]
|
|
pmulhrsw m6, m7
|
|
palignr m1, m2, m0, 6
|
|
pmaddubsw m1, [r3 - 2 * 16] ; [14]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
palignr m1, m2, m0, 8
|
|
mova m2, m1
|
|
pmaddubsw m1, [r3 - 13 * 16] ; [3]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m2, [r3 + 8 * 16] ; [24]
|
|
pmulhrsw m2, m7
|
|
packuswb m1, m2
|
|
|
|
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
|
|
|
|
movu m0, [r2 + 17]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
pmaddubsw m4, m0, [r3 - 3 * 16] ; [13]
|
|
pmulhrsw m4, m7
|
|
palignr m5, m2, m0, 2
|
|
pmaddubsw m1, m5, [r3 - 14 * 16] ; [2]
|
|
pmulhrsw m1, m7
|
|
packuswb m4, m1
|
|
pmaddubsw m5, [r3 + 7 * 16] ; [23]
|
|
pmulhrsw m5, m7
|
|
palignr m6, m2, m0, 4
|
|
pmaddubsw m6, [r3 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
palignr m6, m2, m0, 6
|
|
mova m1, m6
|
|
pmaddubsw m6, [r3 - 15 * 16] ; [1]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, [r3 + 6 * 16] ; [22]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
palignr m1, m2, m0, 8
|
|
pmaddubsw m1, [r3 - 5 * 16] ; [11]
|
|
pmulhrsw m1, m7
|
|
packuswb m1, m1
|
|
movhps m1, [r2 + 22] ; [00]
|
|
|
|
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
|
|
%endmacro
|
|
|
|
%macro MODE_5_31 1
|
|
movu m0, [r2 + 1]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
palignr m1, m2, m0, 2
|
|
mova m5, m1
|
|
pmaddubsw m4, m0, [r3 + 16] ; [17]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m1, [r3 - 14 * 16] ; [2]
|
|
pmulhrsw m1, m7
|
|
packuswb m4, m1
|
|
pmaddubsw m5, [r3 + 3 * 16] ; [19]
|
|
pmulhrsw m5, m7
|
|
palignr m6, m2, m0, 4
|
|
mova m1, m6
|
|
pmaddubsw m6, [r3 - 12 * 16] ; [4]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m1, [r3 + 5 * 16] ; [21]
|
|
pmulhrsw m6, m7
|
|
palignr m1, m2, m0, 6
|
|
mova m3, m1
|
|
pmaddubsw m3, [r3 - 10 * 16] ; [6]
|
|
pmulhrsw m3, m7
|
|
packuswb m6, m3
|
|
pmaddubsw m1, [r3 + 7 * 16] ; [23]
|
|
pmulhrsw m1, m7
|
|
palignr m2, m0, 8
|
|
pmaddubsw m2, [r3 - 8 * 16] ; [8]
|
|
pmulhrsw m2, m7
|
|
packuswb m1, m2
|
|
|
|
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
|
|
|
|
movu m0, [r2 + 5]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
palignr m1, m2, m0, 2
|
|
mova m5, m1
|
|
pmaddubsw m4, m0, [r3 + 9 * 16] ; [25]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m1, [r3 - 6 * 16] ; [10]
|
|
pmulhrsw m1, m7
|
|
packuswb m4, m1
|
|
pmaddubsw m5, [r3 + 11 * 16] ; [27]
|
|
pmulhrsw m5, m7
|
|
palignr m6, m2, m0, 4
|
|
mova m1, m6
|
|
pmaddubsw m6, [r3 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m1, [r3 + 13 * 16] ; [29]
|
|
pmulhrsw m6, m7
|
|
palignr m1, m2, m0, 6
|
|
mova m3, m1
|
|
pmaddubsw m3, [r3 - 2 * 16] ; [14]
|
|
pmulhrsw m3, m7
|
|
packuswb m6, m3
|
|
pmaddubsw m1, [r3 + 15 * 16] ; [31]
|
|
pmulhrsw m1, m7
|
|
palignr m2, m0, 8
|
|
pmaddubsw m2, [r3] ; [16]
|
|
pmulhrsw m2, m7
|
|
packuswb m1, m2
|
|
|
|
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
|
|
|
|
movu m0, [r2 + 10]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
mova m1, m0
|
|
pmaddubsw m4, m0, [r3 - 15 * 16] ; [1]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m1, [r3 + 2 * 16] ; [18]
|
|
pmulhrsw m1, m7
|
|
packuswb m4, m1
|
|
palignr m5, m2, m0, 2
|
|
mova m1, m5
|
|
pmaddubsw m5, [r3 - 13 * 16] ; [3]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m1, [r3 + 4 * 16] ; [20]
|
|
pmulhrsw m1, m7
|
|
packuswb m5, m1
|
|
palignr m1, m2, m0, 4
|
|
pmaddubsw m6, m1, [r3 - 11 * 16] ; [5]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, [r3 + 6 * 16] ; [22]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
palignr m2, m0, 6
|
|
pmaddubsw m1, m2, [r3 - 9 * 16] ; [7]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m2, [r3 + 8 * 16] ; [24]
|
|
pmulhrsw m2, m7
|
|
packuswb m1, m2
|
|
|
|
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
|
|
|
|
movu m0, [r2 + 14]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
mova m1, m0
|
|
pmaddubsw m4, m0, [r3 - 7 * 16] ; [9]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m1, [r3 + 10 * 16] ; [26]
|
|
pmulhrsw m1, m7
|
|
packuswb m4, m1
|
|
palignr m5, m2, m0, 2
|
|
mova m1, m5
|
|
pmaddubsw m5, [r3 - 5 * 16] ; [11]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m1, [r3 + 12 * 16] ; [28]
|
|
pmulhrsw m1, m7
|
|
packuswb m5, m1
|
|
palignr m1, m2, m0, 4
|
|
pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, [r3 + 14 * 16] ; [30]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
palignr m2, m0, 6
|
|
pmaddubsw m1, m2, [r3 - 16] ; [15]
|
|
pmulhrsw m1, m7
|
|
packuswb m1, m1
|
|
movhps m1, [r2 + 18] ; [00]
|
|
|
|
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
|
|
%endmacro
|
|
|
|
%macro MODE_6_30 1
|
|
movu m0, [r2 + 1]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
mova m1, m0
|
|
pmaddubsw m4, m0, [r3 - 3 * 16] ; [13]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m1, [r3 + 10 * 16] ; [26]
|
|
pmulhrsw m1, m7
|
|
packuswb m4, m1
|
|
palignr m6, m2, m0, 2
|
|
pmaddubsw m5, m6, [r3 - 9 * 16] ; [7]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, [r3 + 4 * 16] ; [20]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
palignr m1, m2, m0, 4
|
|
pmaddubsw m6, m1, [r3 - 15 * 16] ; [1]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m3, m1, [r3 - 2 * 16] ; [14]
|
|
pmulhrsw m3, m7
|
|
packuswb m6, m3
|
|
pmaddubsw m1, [r3 + 11 * 16] ; [27]
|
|
pmulhrsw m1, m7
|
|
palignr m2, m0, 6
|
|
pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m2, [r3 + 5 * 16] ; [21]
|
|
pmulhrsw m4, m7
|
|
movu m0, [r2 + 5]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
mova m6, m0
|
|
pmaddubsw m1, m6, [r3 - 14 * 16] ; [2]
|
|
pmulhrsw m1, m7
|
|
packuswb m4, m1
|
|
pmaddubsw m5, m6, [r3 - 16] ; [15]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, [r3 + 12 * 16] ; [28]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
palignr m3, m2, m0, 2
|
|
pmaddubsw m6, m3, [r3 - 7 * 16] ; [9]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m3, [r3 + 6 * 16] ; [22]
|
|
pmulhrsw m3, m7
|
|
packuswb m6, m3
|
|
palignr m2, m0, 4
|
|
pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, m2, [r3] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
|
|
pmulhrsw m4, m7
|
|
movu m0, [r2 + 7]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
palignr m5, m2, m0, 2
|
|
pmaddubsw m1, m5, [r3 - 6 * 16] ; [10]
|
|
pmulhrsw m1, m7
|
|
packuswb m4, m1
|
|
pmaddubsw m5, [r3 + 7 * 16] ; [23]
|
|
pmulhrsw m5, m7
|
|
palignr m1, m2, m0, 4
|
|
pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m1, [r3 + 16] ; [17]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, [r3 + 14 * 16] ; [30]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
palignr m2, m2, m0, 6
|
|
pmaddubsw m1, m2, [r3 - 5 * 16] ; [11]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m2, m2, [r3 + 8 * 16] ; [24]
|
|
pmulhrsw m2, m7
|
|
packuswb m1, m2
|
|
|
|
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
|
|
|
|
movu m0, [r2 + 11]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
mova m5, m0
|
|
pmaddubsw m4, m0, [r3 - 11 * 16] ; [5]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m3, m5, [r3 + 2 * 16] ; [18]
|
|
pmulhrsw m3, m7
|
|
packuswb m4, m3
|
|
pmaddubsw m5, [r3 + 15 * 16] ; [31]
|
|
pmulhrsw m5, m7
|
|
palignr m6, m2, m0, 2
|
|
pmaddubsw m1, m6, [r3 - 4 * 16] ; [12]
|
|
pmulhrsw m1, m7
|
|
packuswb m5, m1
|
|
pmaddubsw m6, [r3 + 9 * 16] ; [25]
|
|
pmulhrsw m6, m7
|
|
palignr m1, m2, m0, 4
|
|
pmaddubsw m2, m1, [r3 - 10 * 16] ; [6]
|
|
pmulhrsw m2, m7
|
|
packuswb m6, m2
|
|
pmaddubsw m1, [r3 + 3 * 16] ; [19]
|
|
pmulhrsw m1, m7
|
|
packuswb m1, m1
|
|
movhps m1, [r2 + 14] ; [00]
|
|
|
|
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
|
|
%endmacro
|
|
|
|
%macro MODE_7_29 1
|
|
movu m0, [r2 + 1]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
mova m5, m0
|
|
pmaddubsw m4, m0, [r3 - 7 * 16] ; [9]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m3, m5, [r3 + 2 * 16] ; [18]
|
|
pmulhrsw m3, m7
|
|
packuswb m4, m3
|
|
pmaddubsw m5, [r3 + 11 * 16] ; [27]
|
|
pmulhrsw m5, m7
|
|
palignr m1, m2, m0, 2
|
|
palignr m2, m0, 4
|
|
pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m1, [r3 + 6 * 16] ; [22]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
pmaddubsw m1, [r3 + 15 * 16] ; [31]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m2, [r3 + 16] ; [17]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m2, [r3 + 10 * 16] ; [26]
|
|
pmulhrsw m2, m7
|
|
packuswb m4, m2
|
|
movu m0, [r2 + 4]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
palignr m2, m0, 2
|
|
pmaddubsw m5, m0, [r3 - 13 * 16] ; [03]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m0, [r3 + 5 * 16] ; [21]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, [r3 + 14 * 16] ; [30]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
pmaddubsw m1, m2, [r3 - 9 * 16] ; [07]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, m2, [r3] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
|
|
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m2, [r3 + 9 * 16] ; [25]
|
|
pmulhrsw m4, m7
|
|
movu m0, [r2 + 6]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
palignr m2, m0, 2
|
|
pmaddubsw m1, m0, [r3 - 14 * 16] ; [2]
|
|
pmulhrsw m1, m7
|
|
packuswb m4, m1
|
|
pmaddubsw m5, m0, [r3 - 5 * 16] ; [11]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m0, [r3 + 4 * 16] ; [20]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m0, [r3 + 13 * 16] ; [29]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m2, [r3 - 10 * 16] ; [6]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
pmaddubsw m1, m2, [r3 - 16] ; [15]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m2, m2, [r3 + 8 * 16] ; [24]
|
|
pmulhrsw m2, m7
|
|
packuswb m1, m2
|
|
|
|
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
|
|
|
|
movu m0, [r2 + 8]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
pmaddubsw m4, m0, [r3 - 15 * 16] ; [1]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m3, m0, [r3 - 6 * 16] ; [10]
|
|
pmulhrsw m3, m7
|
|
packuswb m4, m3
|
|
pmaddubsw m5, m0, [r3 + 3 * 16] ; [19]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m0, [r3 + 12 * 16] ; [28]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
palignr m2, m0, 2
|
|
pmaddubsw m6, m2, [r3 - 11 * 16] ; [5]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m2, [r3 - 2 * 16] ; [14]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
pmaddubsw m1, m2, [r3 + 7 * 16] ; [23]
|
|
pmulhrsw m1, m7
|
|
packuswb m1, m1
|
|
movhps m1, [r2 + 10] ; [0]
|
|
|
|
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
|
|
%endmacro
|
|
|
|
%macro MODE_8_28 1
|
|
movu m0, [r2 + 1]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
palignr m2, m0, 2
|
|
pmaddubsw m4, m0, [r3 - 11 * 16] ; [5]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m3, m0, [r3 - 6 * 16] ; [10]
|
|
pmulhrsw m3, m7
|
|
packuswb m4, m3
|
|
pmaddubsw m5, m0, [r3 - 1 * 16] ; [15]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m0, [r3 + 4 * 16] ; [20]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m0, [r3 + 9 * 16] ; [25]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, [r3 + 14 * 16] ; [30]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m2, [r3 - 3 * 16] ; [13]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m2, [r3 + 2 * 16] ; [18]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
pmaddubsw m5, m2, [r3 + 7 * 16] ; [23]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m2, [r3 + 12 * 16] ; [28]
|
|
pmulhrsw m2, m7
|
|
packuswb m5, m2
|
|
movu m0, [r2 + 3]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
pmaddubsw m6, m0, [r3 - 15 * 16] ; [01]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m0, [r3 - 10 * 16] ; [06]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
pmaddubsw m1, m0, [r3 - 5 * 16] ; [11]
|
|
pmulhrsw m1, m7
|
|
mova m2, m0
|
|
pmaddubsw m0, [r3] ; [16]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m2, [r3 + 5 * 16] ; [21]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m2, [r3 + 10 * 16] ; [26]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
pmaddubsw m5, m2, [r3 + 15 * 16] ; [31]
|
|
pmulhrsw m5, m7
|
|
movu m0, [r2 + 4]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
pmaddubsw m2, m0, [r3 - 12 * 16] ; [4]
|
|
pmulhrsw m2, m7
|
|
packuswb m5, m2
|
|
pmaddubsw m6, m0, [r3 - 7 * 16] ; [9]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m0, [r3 - 2 * 16] ; [14]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
pmaddubsw m1, m0, [r3 + 3 * 16] ; [19]
|
|
pmulhrsw m1, m7
|
|
mova m2, m0
|
|
pmaddubsw m0, [r3 + 8 * 16] ; [24]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
|
|
pmulhrsw m4, m7
|
|
movu m0, [r2 + 5]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
pmaddubsw m1, m0, [r3 - 14 * 16] ; [2]
|
|
pmulhrsw m1, m7
|
|
packuswb m4, m1
|
|
pmaddubsw m5, m0, [r3 - 9 * 16] ; [7]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m0, [r3 + 16] ; [17]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m0, [r3 + 6 * 16] ; [22]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
pmaddubsw m1, m0, [r3 + 11 * 16] ; [27]
|
|
pmulhrsw m1, m7
|
|
packuswb m1, m1
|
|
movhps m1, [r2 + 6] ; [00]
|
|
|
|
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
|
|
%endmacro
|
|
|
|
%macro MODE_9_27 1
|
|
movu m2, [r2 + 1]
|
|
palignr m1, m2, 1
|
|
punpckhbw m0, m2, m1
|
|
punpcklbw m2, m1
|
|
pmaddubsw m4, m2, [r3 - 14 * 16] ; [2]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m3, m2, [r3 - 12 * 16] ; [4]
|
|
pmulhrsw m3, m7
|
|
packuswb m4, m3
|
|
pmaddubsw m5, m2, [r3 - 10 * 16] ; [6]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m2, [r3 - 8 * 16] ; [8]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m2, [r3 - 6 * 16] ; [10]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m3, m2, [r3 - 4 * 16] ; [12]
|
|
pmulhrsw m3, m7
|
|
packuswb m6, m3
|
|
pmaddubsw m1, m2, [r3 - 2 * 16] ; [14]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m0, m2, [r3] ; [16]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
|
|
|
|
pmaddubsw m4, m2, [r3 + 2 * 16] ; [18]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m2, [r3 + 4 * 16] ; [20]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
pmaddubsw m5, m2, [r3 + 6 * 16] ; [22]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m2, [r3 + 8 * 16] ; [24]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m2, [r3 + 10 * 16] ; [26]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m2, [r3 + 12 * 16] ; [28]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
pmaddubsw m1, m2, [r3 + 14 * 16] ; [30]
|
|
pmulhrsw m1, m7
|
|
packuswb m1, m1
|
|
movhps m1, [r2 + 2] ; [00]
|
|
|
|
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
|
|
|
|
movu m2, [r2 + 2]
|
|
palignr m1, m2, 1
|
|
punpcklbw m2, m1
|
|
pmaddubsw m4, m2, [r3 - 14 * 16] ; [2]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m3, m2, [r3 - 12 * 16] ; [4]
|
|
pmulhrsw m3, m7
|
|
packuswb m4, m3
|
|
pmaddubsw m5, m2, [r3 - 10 * 16] ; [6]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m2, [r3 - 8 * 16] ; [8]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m2, [r3 - 6 * 16] ; [10]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m2, [r3 - 4 * 16] ; [12]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
pmaddubsw m1, m2, [r3 - 2 * 16] ; [14]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m0, m2, [r3] ; [16]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
|
|
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
|
|
|
|
movu m2, [r2 + 2]
|
|
palignr m1, m2, 1
|
|
punpcklbw m2, m1
|
|
pmaddubsw m4, m2, [r3 + 2 * 16] ; [18]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m2, [r3 + 4 * 16] ; [20]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
pmaddubsw m5, m2, [r3 + 6 * 16] ; [22]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m2, [r3 + 8 * 16] ; [24]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m2, [r3 + 10 * 16] ; [26]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m2, [r3 + 12 * 16] ; [28]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
pmaddubsw m1, m2, [r3 + 14 * 16] ; [30]
|
|
pmulhrsw m1, m7
|
|
packuswb m1, m1
|
|
movhps m1, [r2 + 3] ; [00]
|
|
|
|
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
|
|
%endmacro
|
|
|
|
%macro MODE_12_24 1
|
|
movu m2, [r2]
|
|
palignr m1, m2, 1
|
|
punpckhbw m0, m2, m1
|
|
punpcklbw m2, m1
|
|
palignr m0, m2, 2
|
|
pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m3, m0, [r4 + 6 * 16] ; [22]
|
|
pmulhrsw m3, m7
|
|
packuswb m4, m3
|
|
pmaddubsw m5, m0, [r4 + 16] ; [17]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m3, m0, [r4 - 14 * 16] ; [2]
|
|
pmulhrsw m3, m7
|
|
packuswb m6, m3
|
|
pmaddubsw m1, m2, [r4 + 13 * 16] ; [29]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
|
|
pmaddubsw m4, m2, [r4 + 3 * 16] ; [19]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m2, [r4 - 2 * 16] ; [14]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
pmaddubsw m5, m2, [r4 - 7 * 16] ; [09]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
movu m0, [r2 - 2]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
palignr m2, m0, 2
|
|
pmaddubsw m6, m2, [r4 + 15 * 16] ; [31]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m2, [r4 + 10 * 16] ; [26]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
pmaddubsw m1, m2, [r4 + 5 * 16] ; [21]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, m2, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
|
|
pmaddubsw m4, m2, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m3, m2, [r4 - 10 * 16] ; [06]
|
|
pmulhrsw m3, m7
|
|
packuswb m4, m3
|
|
pmaddubsw m5, m2, [r4 - 15 * 16] ; [1]
|
|
pmulhrsw m5, m7
|
|
movu m0, [r2 - 3]
|
|
palignr m1, m0, 1
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
palignr m2, m0, 2
|
|
pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m2, [r4 + 7 * 16] ; [23]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m3, m2, [r4 + 2 * 16] ; [18]
|
|
pmulhrsw m3, m7
|
|
packuswb m6, m3
|
|
pmaddubsw m1, m2, [r4 - 3 * 16] ; [13]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, m2, [r4 - 8 * 16] ; [8]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
|
|
pmaddubsw m4, m2, [r4 - 13 * 16] ; [3]
|
|
pmulhrsw m4, m7
|
|
movu m2, [r2 - 4]
|
|
palignr m1, m2, 1
|
|
punpckhbw m0, m2, m1
|
|
punpcklbw m2, m1
|
|
palignr m0, m2, 2
|
|
pmaddubsw m5, m0, [r4 + 14 * 16] ; [30]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
pmaddubsw m5, m0, [r4 + 9 * 16] ; [25]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m0, [r4 + 4 * 16] ; [20]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m0, [r4 - 16] ; [15]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m0, [r4 - 6 * 16] ; [10]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
pmaddubsw m1, m0, [r4 - 11 * 16] ; [05]
|
|
pmulhrsw m1, m7
|
|
movu m2, [pb_fact0]
|
|
pshufb m0, m2
|
|
pmovzxbw m0, m0
|
|
packuswb m1, m0
|
|
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
|
|
%endmacro
|
|
|
|
;------------------------------------------------------------------------------------------
; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;------------------------------------------------------------------------------------------
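; Reference model (comment-only sketch, not assembled): each predicted sample in the
; angular modes below is the HEVC interpolation pred = ((32 - f) * ref[i] + f * ref[i+1]
; + 16) >> 5. The pmaddubsw instructions evaluate (32 - f) * a + f * b against the
; [32 - f, f] byte pairs stored in ang_table, and pmulhrsw with pw_1024 realizes the
; rounded shift, since (x * 1024 + 16384) >> 15 == (x + 16) >> 5. Illustrative C for one
; row of a positive-angle mode (variable names here are assumptions, not the codebase's):
;
;     for (int x = 0; x < 32; x++)
;     {
;         int pos = (y + 1) * angle;
;         int f = pos & 31;
;         dst[y * dstStride + x] = (pixel)(((32 - f) * src[x + (pos >> 5) + 1] +
;                                           f * src[x + (pos >> 5) + 2] + 16) >> 5);
;     }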
INIT_XMM ssse3
cglobal intra_pred_ang32_2, 3,5,4
    lea r4, [r2]
    add r2, 64
    cmp r3m, byte 34
    cmove r2, r4
    movu m0, [r2 + 2]
    movu m1, [r2 + 18]
    movu m3, [r2 + 34]

    lea r3, [r1 * 3]

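    ; Modes 2 and 34 are the exact 45-degree angles, so no interpolation is needed: each
    ; output row is the chosen reference array advanced by one more sample than the row
    ; above it. The cmove above selects the above reference ([r2]) for mode 34 and the
    ; left reference ([r2 + 64]) for mode 2; the palignr/movu pairs below simply
    ; materialize the shifted copies.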
movu [r0], m0
|
|
movu [r0 + 16], m1
|
|
palignr m2, m1, m0, 1
|
|
movu [r0 + r1], m2
|
|
palignr m2, m3, m1, 1
|
|
movu [r0 + r1 + 16], m2
|
|
palignr m2, m1, m0, 2
|
|
movu [r0 + r1 * 2], m2
|
|
palignr m2, m3, m1, 2
|
|
movu [r0 + r1 * 2 + 16], m2
|
|
palignr m2, m1, m0, 3
|
|
movu [r0 + r3], m2
|
|
palignr m2, m3, m1, 3
|
|
movu [r0 + r3 + 16], m2
|
|
|
|
lea r0, [r0 + r1 * 4]
|
|
|
|
palignr m2, m1, m0, 4
|
|
movu [r0], m2
|
|
palignr m2, m3, m1, 4
|
|
movu [r0 + 16], m2
|
|
palignr m2, m1, m0, 5
|
|
movu [r0 + r1], m2
|
|
palignr m2, m3, m1, 5
|
|
movu [r0 + r1 + 16], m2
|
|
palignr m2, m1, m0, 6
|
|
movu [r0 + r1 * 2], m2
|
|
palignr m2, m3, m1, 6
|
|
movu [r0 + r1 * 2 + 16], m2
|
|
palignr m2, m1, m0, 7
|
|
movu [r0 + r3], m2
|
|
palignr m2, m3, m1, 7
|
|
movu [r0 + r3 + 16], m2
|
|
|
|
lea r0, [r0 + r1 * 4]
|
|
|
|
palignr m2, m1, m0, 8
|
|
movu [r0], m2
|
|
palignr m2, m3, m1, 8
|
|
movu [r0 + 16], m2
|
|
palignr m2, m1, m0, 9
|
|
movu [r0 + r1], m2
|
|
palignr m2, m3, m1, 9
|
|
movu [r0 + r1 + 16], m2
|
|
palignr m2, m1, m0, 10
|
|
movu [r0 + r1 * 2], m2
|
|
palignr m2, m3, m1, 10
|
|
movu [r0 + r1 * 2 + 16], m2
|
|
palignr m2, m1, m0, 11
|
|
movu [r0 + r3], m2
|
|
palignr m2, m3, m1, 11
|
|
movu [r0 + r3 + 16], m2
|
|
|
|
lea r0, [r0 + r1 * 4]
|
|
|
|
palignr m2, m1, m0, 12
|
|
movu [r0], m2
|
|
palignr m2, m3, m1, 12
|
|
movu [r0 + 16], m2
|
|
palignr m2, m1, m0, 13
|
|
movu [r0 + r1], m2
|
|
palignr m2, m3, m1, 13
|
|
movu [r0 + r1 + 16], m2
|
|
palignr m2, m1, m0, 14
|
|
movu [r0 + r1 * 2], m2
|
|
palignr m2, m3, m1, 14
|
|
movu [r0 + r1 * 2 + 16], m2
|
|
palignr m2, m1, m0, 15
|
|
movu [r0 + r3], m2
|
|
palignr m2, m3, m1, 15
|
|
movu [r0 + r3 + 16], m2
|
|
|
|
lea r0, [r0 + r1 * 4]
|
|
|
|
movu [r0], m1
|
|
movu m0, [r2 + 50]
|
|
movu [r0 + 16], m3
|
|
palignr m2, m3, m1, 1
|
|
movu [r0 + r1], m2
|
|
palignr m2, m0, m3, 1
|
|
movu [r0 + r1 + 16], m2
|
|
palignr m2, m3, m1, 2
|
|
movu [r0 + r1 * 2], m2
|
|
palignr m2, m0, m3, 2
|
|
movu [r0 + r1 * 2 + 16], m2
|
|
palignr m2, m3, m1, 3
|
|
movu [r0 + r3], m2
|
|
palignr m2, m0, m3, 3
|
|
movu [r0 + r3 + 16], m2
|
|
|
|
lea r0, [r0 + r1 * 4]
|
|
|
|
palignr m2, m3, m1, 4
|
|
movu [r0], m2
|
|
palignr m2, m0, m3, 4
|
|
movu [r0 + 16], m2
|
|
palignr m2, m3, m1, 5
|
|
movu [r0 + r1], m2
|
|
palignr m2, m0, m3, 5
|
|
movu [r0 + r1 + 16], m2
|
|
palignr m2, m3, m1, 6
|
|
movu [r0 + r1 * 2], m2
|
|
palignr m2, m0, m3, 6
|
|
movu [r0 + r1 * 2 + 16], m2
|
|
palignr m2, m3, m1, 7
|
|
movu [r0 + r3], m2
|
|
palignr m2, m0, m3, 7
|
|
movu [r0 + r3 + 16], m2
|
|
|
|
lea r0, [r0 + r1 * 4]
|
|
|
|
palignr m2, m3, m1, 8
|
|
movu [r0], m2
|
|
palignr m2, m0, m3, 8
|
|
movu [r0 + 16], m2
|
|
palignr m2, m3, m1, 9
|
|
movu [r0 + r1], m2
|
|
palignr m2, m0, m3, 9
|
|
movu [r0 + r1 + 16], m2
|
|
palignr m2, m3, m1, 10
|
|
movu [r0 + r1 * 2], m2
|
|
palignr m2, m0, m3, 10
|
|
movu [r0 + r1 * 2 + 16], m2
|
|
palignr m2, m3, m1, 11
|
|
movu [r0 + r3], m2
|
|
palignr m2, m0, m3, 11
|
|
movu [r0 + r3 + 16], m2
|
|
|
|
lea r0, [r0 + r1 * 4]
|
|
|
|
palignr m2, m3, m1, 12
|
|
movu [r0], m2
|
|
palignr m2, m0, m3, 12
|
|
movu [r0 + 16], m2
|
|
palignr m2, m3, m1, 13
|
|
movu [r0 + r1], m2
|
|
palignr m2, m0, m3, 13
|
|
movu [r0 + r1 + 16], m2
|
|
palignr m2, m3, m1, 14
|
|
movu [r0 + r1 * 2], m2
|
|
palignr m2, m0, m3, 14
|
|
movu [r0 + r1 * 2 + 16], m2
|
|
palignr m2, m3, m1, 15
|
|
movu [r0 + r3], m2
|
|
palignr m2, m0, m3, 15
|
|
movu [r0 + r3 + 16], m2
|
|
RET
|
|
|
|
INIT_XMM sse4
cglobal intra_pred_ang32_3, 3,7,8
    add r2, 64
    lea r3, [ang_table + 16 * 16]
    mov r4d, 4
    lea r5, [r1 * 3] ; r5 -> 3 * stride
    lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
    mova m7, [pw_1024]
.loop:
    MODE_3_33 1
    lea r0, [r6 + r1 * 4]
    lea r6, [r6 + r1 * 8]
    add r2, 8
    dec r4
    jnz .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_4, 3,7,8
    add r2, 64
    lea r3, [ang_table + 16 * 16]
    mov r4d, 4
    lea r5, [r1 * 3] ; r5 -> 3 * stride
    lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
    mova m7, [pw_1024]
.loop:
    MODE_4_32 1
    lea r0, [r6 + r1 * 4]
    lea r6, [r6 + r1 * 8]
    add r2, 8
    dec r4
    jnz .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_5, 3,7,8
    add r2, 64
    lea r3, [ang_table + 16 * 16]
    mov r4d, 4
    lea r5, [r1 * 3] ; r5 -> 3 * stride
    lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
    mova m7, [pw_1024]
.loop:
    MODE_5_31 1
    lea r0, [r6 + r1 * 4]
    lea r6, [r6 + r1 * 8]
    add r2, 8
    dec r4
    jnz .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_6, 3,7,8
    add r2, 64
    lea r3, [ang_table + 16 * 16]
    mov r4d, 4
    lea r5, [r1 * 3] ; r5 -> 3 * stride
    lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
    mova m7, [pw_1024]
.loop:
    MODE_6_30 1
    lea r0, [r6 + r1 * 4]
    lea r6, [r6 + r1 * 8]
    add r2, 8
    dec r4
    jnz .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_7, 3,7,8
    add r2, 64
    lea r3, [ang_table + 16 * 16]
    mov r4d, 4
    lea r5, [r1 * 3] ; r5 -> 3 * stride
    lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
    mova m7, [pw_1024]
.loop:
    MODE_7_29 1
    lea r0, [r6 + r1 * 4]
    lea r6, [r6 + r1 * 8]
    add r2, 8
    dec r4
    jnz .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_8, 3,7,8
    add r2, 64
    lea r3, [ang_table + 16 * 16]
    mov r4d, 4
    lea r5, [r1 * 3] ; r5 -> 3 * stride
    lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
    mova m7, [pw_1024]
.loop:
    MODE_8_28 1
    lea r0, [r6 + r1 * 4]
    lea r6, [r6 + r1 * 8]
    add r2, 8
    dec r4
    jnz .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_9, 3,7,8
    add r2, 64
    lea r3, [ang_table + 16 * 16]
    mov r4d, 4
    lea r5, [r1 * 3] ; r5 -> 3 * stride
    lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
    mova m7, [pw_1024]
.loop:
    MODE_9_27 1
    lea r0, [r6 + r1 * 4]
    lea r6, [r6 + r1 * 8]
    add r2, 8
    dec r4
    jnz .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_10, 5,7,8,0-(2*mmsize)
%define m8 [rsp + 0 * mmsize]
%define m9 [rsp + 1 * mmsize]
    pxor m7, m7
    mov r6, 2
    movu m0, [r2]
    movu m1, [r2 + 1]
    mova m8, m0
    mova m9, m1
    mov r3d, r4d
    lea r4, [r1 * 3]

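    ; Mode 10 is pure horizontal: every output row replicates a single left-reference
    ; sample. m7 is zero, so each pshufb below is a broadcast of byte 0 of the shifted
    ; reference vector. When bFilter is nonzero (r3d), the first row is additionally
    ; smoothed before the .quit store, using the standard edge filter
    ; pred[x] = left[0] + ((top[x] - topLeft) >> 1).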
.loop:
    movu m0, [r2 + 1 + 64]
    palignr m1, m0, 1
    pshufb m1, m7
    palignr m2, m0, 2
    pshufb m2, m7
    palignr m3, m0, 3
    pshufb m3, m7
    palignr m4, m0, 4
    pshufb m4, m7
    palignr m5, m0, 5
    pshufb m5, m7
    palignr m6, m0, 6
    pshufb m6, m7

    movu [r0 + r1], m1
    movu [r0 + r1 + 16], m1
    movu [r0 + r1 * 2], m2
    movu [r0 + r1 * 2 + 16], m2
    movu [r0 + r4], m3
    movu [r0 + r4 + 16], m3
    lea r5, [r0 + r1 * 4]
    movu [r5], m4
    movu [r5 + 16], m4
    movu [r5 + r1], m5
    movu [r5 + r1 + 16], m5
    movu [r5 + r1 * 2], m6
    movu [r5 + r1 * 2 + 16], m6

    palignr m1, m0, 7
    pshufb m1, m7
    movhlps m2, m0
    pshufb m2, m7
    palignr m3, m0, 9
    pshufb m3, m7
    palignr m4, m0, 10
    pshufb m4, m7
    palignr m5, m0, 11
    pshufb m5, m7
    palignr m6, m0, 12
    pshufb m6, m7

    movu [r5 + r4], m1
    movu [r5 + r4 + 16], m1
    lea r5, [r5 + r1 * 4]
    movu [r5], m2
    movu [r5 + 16], m2
    movu [r5 + r1], m3
    movu [r5 + r1 + 16], m3
    movu [r5 + r1 * 2], m4
    movu [r5 + r1 * 2 + 16], m4
    movu [r5 + r4], m5
    movu [r5 + r4 + 16], m5
    lea r5, [r5 + r1 * 4]
    movu [r5], m6
    movu [r5 + 16], m6

    palignr m1, m0, 13
    pshufb m1, m7
    palignr m2, m0, 14
    pshufb m2, m7
    palignr m3, m0, 15
    pshufb m3, m7
    pshufb m0, m7

    movu [r5 + r1], m1
    movu [r5 + r1 + 16], m1
    movu [r5 + r1 * 2], m2
    movu [r5 + r1 * 2 + 16], m2
    movu [r5 + r4], m3
    movu [r5 + r4 + 16], m3

    ; filter
    cmp r3d, byte 0
    jz .quit
    movhlps m1, m0
    pmovzxbw m0, m0
    mova m1, m0
    movu m2, m8
    movu m3, m9

    pshufb m2, m7
    pmovzxbw m2, m2
    movhlps m4, m3
    pmovzxbw m3, m3
    pmovzxbw m4, m4
    psubw m3, m2
    psubw m4, m2
    psraw m3, 1
    psraw m4, 1
    paddw m0, m3
    paddw m1, m4
    packuswb m0, m1

.quit:
    movu [r0], m0
    movu [r0 + 16], m0
    dec r6
    lea r0, [r5 + r1 * 4]
    lea r2, [r2 + 16]
    jnz .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_11, 4,7,8
    ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
    mov r6, rsp
    sub rsp, 64+gprsize
    and rsp, ~63
    mov [rsp+64], r6

    ; collect reference pixel
    movu m0, [r2 + 16]
    pxor m1, m1
    pshufb m0, m1 ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    mova [rsp], m0
    movu m0, [r2 + 64]
    pinsrb m0, [r2], 0
    movu m1, [r2 + 16 + 64]
    movu m2, [r2 + 32 + 64]
    movu [rsp + 1], m0
    movu [rsp + 1 + 16], m1
    movu [rsp + 1 + 32], m2
    mov [rsp + 63], byte 4

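    ; Local reference layout (one 64-byte cache line, inferred from the stores above):
    ; [rsp] holds 16 copies of the pixel loaded from [r2 + 16], bytes [rsp + 1] through
    ; [rsp + 48] hold the 48 reference samples with the corner pixel pinsrb'd in front,
    ; and the byte at [rsp + 63] is the 4-iteration column counter decremented at the
    ; bottom of .loop.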
    ; filter
    lea r2, [rsp + 1] ; r2 -> [0]
    lea r3, [c_shuf8_0] ; r3 -> shuffle8
    lea r4, [ang_table] ; r4 -> ang_table
    lea r5, [r1 * 3] ; r5 -> 3 * stride
    lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
    mova m5, [pw_1024] ; m5 -> 1024
    mova m6, [c_deinterval8] ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu m7, [r2]
    mova m0, m7
    mova m1, m7
    mova m2, m7
    mova m3, m7
    mova m4, m7
    mova m5, m7
    mova m6, m7
    PROC32_8x8 0, 1, 30,28,26,24,22,20,18,16

    ; Row[8 - 15]
    movu m7, [r2]
    mova m0, m7
    mova m1, m7
    mova m2, m7
    mova m3, m7
    mova m4, m7
    mova m5, m7
    mova m6, m7
    PROC32_8x8 1, 1, 14,12,10,8,6,4,2,0

    ; Row[16 - 23]
    movu m7, [r2 - 1]
    mova m0, m7
    mova m1, m7
    mova m2, m7
    mova m3, m7
    mova m4, m7
    mova m5, m7
    mova m6, m7
    PROC32_8x8 2, 1, 30,28,26,24,22,20,18,16

    ; Row[24 - 31]
    movu m7, [r2 - 1]
    mova m0, m7
    mova m1, m7
    mova m2, m7
    mova m3, m7
    mova m4, m7
    mova m5, m7
    mova m6, m7
    PROC32_8x8 3, 1, 14,12,10,8,6,4,2,0

    lea r0, [r6 + r1 * 4]
    lea r6, [r6 + r1 * 8]
    add r2, 8
    dec byte [rsp + 63]
    jnz .loop
    mov rsp, [rsp+64]
    RET

%macro MODE_12_24_ROW0 1
|
|
movu m0, [r3 + 6]
|
|
pshufb m0, [c_mode32_12_0]
|
|
pinsrb m0, [r3 + 26], 12
|
|
mova above, m0
|
|
movu m2, [r2]
|
|
%if %1 == 1
|
|
pinsrb m2, [r3], 0
|
|
%endif
|
|
palignr m1, m2, 1
|
|
punpcklbw m2, m1
|
|
pmaddubsw m4, m2, [r4 + 11 * 16] ; [27]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m3, m2, [r4 + 6 * 16] ; [22]
|
|
pmulhrsw m3, m7
|
|
packuswb m4, m3
|
|
pmaddubsw m5, m2, [r4 + 16] ; [17]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m2, [r4 - 9 * 16] ; [7]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m3, m2, [r4 - 14 * 16] ; [2]
|
|
pmulhrsw m3, m7
|
|
packuswb m6, m3
|
|
movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
|
|
%if %1 == 1
|
|
pinsrb m1, [r3], 0
|
|
%endif
|
|
palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
|
|
punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
|
|
pmaddubsw m1, m2, [r4 + 13 * 16] ; [29]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
|
|
pmaddubsw m4, m2, [r4 + 3 * 16] ; [19]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m2, [r4 - 2 * 16] ; [14]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
pmaddubsw m5, m2, [r4 - 7 * 16] ; [09]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
palignr m2, above, 14 ;[6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
|
|
pmaddubsw m6, m2, [r4 + 15 * 16] ; [31]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m2, [r4 + 10 * 16] ; [26]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
pmaddubsw m1, m2, [r4 + 5 * 16] ; [21]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, m2, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
|
|
pmaddubsw m4, m2, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m3, m2, [r4 - 10 * 16] ; [06]
|
|
pmulhrsw m3, m7
|
|
packuswb m4, m3
|
|
pmaddubsw m5, m2, [r4 - 15 * 16] ; [1]
|
|
pmulhrsw m5, m7
|
|
pslldq m1, above, 1
|
|
palignr m2, m1, 14
|
|
pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m2, [r4 + 7 * 16] ; [23]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m3, m2, [r4 + 2 * 16] ; [18]
|
|
pmulhrsw m3, m7
|
|
packuswb m6, m3
|
|
pmaddubsw m1, m2, [r4 - 3 * 16] ; [13]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, m2, [r4 - 8 * 16] ; [8]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
|
|
pmaddubsw m4, m2, [r4 - 13 * 16] ; [3]
|
|
pmulhrsw m4, m7
|
|
pslldq m1, above, 2
|
|
palignr m2, m1, 14
|
|
pmaddubsw m5, m2, [r4 + 14 * 16] ; [30]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
pmaddubsw m5, m2, [r4 + 9 * 16] ; [25]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m2, [r4 - 16] ; [15]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m2, [r4 - 6 * 16] ; [10]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
pmaddubsw m1, m2, [r4 - 11 * 16] ; [05]
|
|
pmulhrsw m1, m7
|
|
movu m0, [pb_fact0]
|
|
pshufb m2, m0
|
|
pmovzxbw m2, m2
|
|
packuswb m1, m2
|
|
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
|
|
%endmacro
|
|
|
|
INIT_XMM sse4
|
|
cglobal intra_pred_ang32_12, 3,7,8,0-(1*mmsize)
|
|
%define above [rsp + 0 * mmsize]
|
|
mov r3, r2
|
|
add r2, 64
|
|
lea r4, [ang_table + 16 * 16]
|
|
lea r5, [r1 * 3] ; r5 -> 3 * stride
|
|
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
|
|
mova m7, [pw_1024]
|
|
|
|
MODE_12_24_ROW0 1
|
|
lea r0, [r6 + r1 * 4]
|
|
lea r6, [r6 + r1 * 8]
|
|
add r2, 7
|
|
mov r3, 3
|
|
.loop:
|
|
MODE_12_24 1
|
|
lea r0, [r6 + r1 * 4]
|
|
lea r6, [r6 + r1 * 8]
|
|
add r2, 8
|
|
dec r3
|
|
jnz .loop
|
|
RET
|
|
|
|
%macro MODE_13_23_ROW0 1
|
|
movu m0, [r3 + 1]
|
|
movu m1, [r3 + 15]
|
|
pshufb m0, [c_mode32_13_0]
|
|
pshufb m1, [c_mode32_13_0]
|
|
punpckldq m0, m1
|
|
pshufb m0, [c_mode32_13_shuf]
|
|
mova above, m0
|
|
movu m2, [r2]
|
|
%if (%1 == 1)
|
|
pinsrb m2, [r3], 0
|
|
%endif
|
|
palignr m1, m2, 1
|
|
punpcklbw m2, m1
|
|
pmaddubsw m4, m2, [r4 + 7 * 16] ; [23]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m3, m2, [r4 - 2 * 16] ; [14]
|
|
pmulhrsw m3, m7
|
|
packuswb m4, m3
|
|
pmaddubsw m5, m2, [r4 - 11 * 16] ; [5]
|
|
pmulhrsw m5, m7
|
|
movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
|
|
%if (%1 == 1)
|
|
pinsrb m1, [r3], 0
|
|
%endif
|
|
palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
|
|
punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
|
|
pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m2, [r4 + 3 * 16] ; [19]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m0, m2, [r4 - 6 * 16] ; [10]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
pmaddubsw m1, m2, [r4 - 15 * 16] ; [1]
|
|
pmulhrsw m1, m7
|
|
palignr m2, above, 14
|
|
pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
|
|
pmaddubsw m4, m2, [r4 - 16] ; [15]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m2, [r4 - 10 * 16] ; [6]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
pslldq m0, above, 1
|
|
palignr m2, m0, 14
|
|
pmaddubsw m5, m2, [r4 + 13 * 16] ; [29]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m2, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m2, [r4 - 14 * 16] ; [2]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
pslldq m0, 1
|
|
palignr m2, m0, 14
|
|
pmaddubsw m1, m2, [r4 + 9 * 16] ; [25]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m0, m2, [r4] ; [16]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
|
|
pmaddubsw m4, m2, [r4 - 9 * 16] ; [7]
|
|
pmulhrsw m4, m7
|
|
pslldq m0, above, 3
|
|
palignr m2, m0, 14
|
|
pmaddubsw m3, m2, [r4 + 14 * 16] ; [30]
|
|
pmulhrsw m3, m7
|
|
packuswb m4, m3
|
|
pmaddubsw m5, m2, [r4 + 5 * 16] ; [21]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m2, [r4 - 13 * 16] ; [3]
|
|
pmulhrsw m6, m7
|
|
pslldq m0, 1
|
|
palignr m2, m0, 14
|
|
pmaddubsw m0, m2, [r4 + 10 * 16] ; [26]
|
|
pmulhrsw m0, m7
|
|
packuswb m6, m0
|
|
pmaddubsw m1, m2, [r4 + 16] ; [17]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m0, m2, [r4 - 8 * 16] ; [8]
|
|
pmulhrsw m0, m7
|
|
packuswb m1, m0
|
|
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
|
|
pslldq m0, above, 5
|
|
palignr m2, m0, 14
|
|
pmaddubsw m4, m2, [r4 + 15 * 16] ; [31]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m2, [r4 + 6 * 16] ; [22]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
pmaddubsw m5, m2, [r4 - 3 * 16] ; [13]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pslldq m0, 1
|
|
palignr m2, m0, 14
|
|
pmaddubsw m6, m2, [r4 + 11 * 16] ; [27]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m2, [r4 + 2 * 16] ; [18]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
pmaddubsw m1, m2, [r4 - 7 * 16] ; [09]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, m2, [r4 - 16 * 16] ; [00]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
|
|
%endmacro
|
|
|
|
%macro MODE_13_23 2
|
|
movu m2, [r2] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
|
|
palignr m1, m2, 1 ; [x, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
|
|
punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
|
|
punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
|
|
palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
|
|
pmaddubsw m4, m0, [r4 + 7 * 16] ; [23]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m3, m0, [r4 - 2 * 16] ; [14]
|
|
pmulhrsw m3, m7
|
|
packuswb m4, m3
|
|
pmaddubsw m5, m0, [r4 - 11 * 16] ; [05]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m2, [r4 + 3 * 16] ; [19]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m3, m2, [r4 - 6 * 16] ; [10]
|
|
pmulhrsw m3, m7
|
|
packuswb m6, m3
|
|
pmaddubsw m1, m2, [r4 - 15 * 16] ; [1]
|
|
pmulhrsw m1, m7
|
|
movu m2, [r2 - 2] ; [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1]
|
|
palignr m3, m2, 1 ; [x, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
|
|
punpckhbw m0, m2, m3
|
|
punpcklbw m2, m3
|
|
palignr m0, m2, 2
|
|
pmaddubsw m3, m0, [r4 + 8 * 16] ; [24]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
mova m3, m0
|
|
TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
|
|
pmaddubsw m4, m3, [r4 - 16] ; [15]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m3, [r4 - 10 * 16] ; [6]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
pmaddubsw m5, m2, [r4 + 13 * 16] ; [29]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m2, [r4 - 5 * 16] ; [11]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m2, [r4 - 14 * 16] ; [2]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
movu m2, [r2 - 4] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
|
|
palignr m1, m2, 1 ; [x, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
|
|
punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
|
|
punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
|
|
palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
|
|
pmaddubsw m1, m0, [r4 + 9 * 16] ; [25]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, m0, [r4] ; [16]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
mova m3, m0
|
|
TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
|
|
pmaddubsw m4, m3, [r4 - 9 * 16] ; [7]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m3, m2, [r4 + 14 * 16] ; [30]
|
|
pmulhrsw m3, m7
|
|
packuswb m4, m3
|
|
pmaddubsw m5, m2, [r4 + 5 * 16] ; [21]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
pmaddubsw m6, m2, [r4 - 13 * 16] ; [3]
|
|
pmulhrsw m6, m7
|
|
movu m2, [r2 - 6] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
|
|
palignr m1, m2, 1 ; [x, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
|
|
punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
|
|
punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
|
|
palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
|
|
pmaddubsw m3, m0, [r4 + 10 * 16] ; [26]
|
|
pmulhrsw m3, m7
|
|
packuswb m6, m3
|
|
pmaddubsw m1, m0, [r4 + 16] ; [17]
|
|
pmulhrsw m1, m7
|
|
pmaddubsw m3, m0, [r4 - 8 * 16] ; [8]
|
|
pmulhrsw m3, m7
|
|
packuswb m1, m3
|
|
TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
|
|
pmaddubsw m4, m2, [r4 + 15 * 16] ; [31]
|
|
pmulhrsw m4, m7
|
|
pmaddubsw m5, m2, [r4 + 6 * 16] ; [22]
|
|
pmulhrsw m5, m7
|
|
packuswb m4, m5
|
|
pmaddubsw m5, m2, [r4 - 3 * 16] ; [13]
|
|
pmulhrsw m5, m7
|
|
pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
|
|
pmulhrsw m6, m7
|
|
packuswb m5, m6
|
|
movu m2, [r2 - 7] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
|
|
%if ((%1 & %2) == 1)
|
|
pinsrb m2, [r3], 0
|
|
%endif
|
|
palignr m1, m2, 1 ; [x, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
|
|
punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
|
|
pmaddubsw m6, m2, [r4 + 11 * 16] ; [27]
|
|
pmulhrsw m6, m7
|
|
pmaddubsw m1, m2, [r4 + 2 * 16] ; [18]
|
|
pmulhrsw m1, m7
|
|
packuswb m6, m1
|
|
pmaddubsw m1, m2, [r4 - 7 * 16] ; [09]
|
|
pmulhrsw m1, m7
|
|
movu m0, [pb_fact0]
|
|
pshufb m2, m0
|
|
pmovzxbw m2, m2
|
|
packuswb m1, m2
|
|
TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
|
|
%endmacro
|
|
|
|
INIT_XMM sse4
|
|
cglobal intra_pred_ang32_13, 3,7,8,0-(1*mmsize)
|
|
%define above [rsp + 0 * mmsize]
|
|
mov r3, r2
|
|
add r2, 64
|
|
lea r4, [ang_table + 16 * 16]
|
|
lea r5, [r1 * 3] ; r5 -> 3 * stride
|
|
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
|
|
mova m7, [pw_1024]
|
|
|
|
MODE_13_23_ROW0 1
|
|
lea r0, [r6 + r1 * 4]
|
|
lea r6, [r6 + r1 * 8]
|
|
add r2, 7
|
|
|
|
MODE_13_23 1, 1
|
|
lea r0, [r6 + r1 * 4]
|
|
lea r6, [r6 + r1 * 8]
|
|
add r2, 8
|
|
mov r3, 2
|
|
.loop:
|
|
MODE_13_23 1, 0
|
|
lea r0, [r6 + r1 * 4]
|
|
lea r6, [r6 + r1 * 8]
|
|
add r2, 8
|
|
dec r3
|
|
jnz .loop
|
|
RET
|
|
|
|
INIT_XMM sse4
|
|
cglobal intra_pred_ang32_14, 3,7,8
|
|
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
|
|
mov r6, rsp
|
|
sub rsp, 64+gprsize
|
|
and rsp, ~63
|
|
mov [rsp+64], r6
|
|
|
|
    ; collect reference pixel
    movu m0, [r2]
    movu m1, [r2 + 15]
    pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15]
    pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30]
    pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x]
    palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
    mova [rsp], m0
    movu m0, [r2 + 1 + 64]
    movu m1, [r2 + 1 + 16 + 64]
    movu [rsp + 13], m0
    movu [rsp + 13 + 16], m1
    mov [rsp + 63], byte 4

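    ; Negative-angle modes project part of the second reference array in front of the
    ; first: the c_mode32_14_0 shuffles above gather the samples at the mode-14
    ; inverse-angle positions (0, 2, 5, 7, 10, 12, 15, ...), so [rsp] ends up holding
    ; the 13 projected pixels followed, from [rsp + 13] onward, by the 32 forward
    ; reference samples.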
; filter
|
|
lea r2, [rsp + 13] ; r2 -> [0]
|
|
lea r3, [c_shuf8_0] ; r3 -> shuffle8
|
|
lea r4, [ang_table] ; r4 -> ang_table
|
|
lea r5, [r1 * 3] ; r5 -> 3 * stride
|
|
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
|
|
mova m5, [pw_1024] ; m5 -> 1024
|
|
mova m6, [c_deinterval8] ; m6 -> c_deinterval8
|
|
|
|
.loop:
|
|
; Row[0 - 7]
|
|
movu m7, [r2 - 4]
|
|
palignr m0, m7, 3
|
|
mova m1, m0
|
|
palignr m2, m7, 2
|
|
mova m3, m2
|
|
palignr m4, m7, 1
|
|
mova m5, m4
|
|
mova m6, m4
|
|
PROC32_8x8 0, 1, 19,6,25,12,31,18,5,24
|
|
|
|
; Row[8 - 15]
|
|
movu m7, [r2 - 7]
|
|
palignr m0, m7, 3
|
|
palignr m1, m7, 2
|
|
mova m2, m1
|
|
mova m3, m1
|
|
palignr m4, m7, 1
|
|
mova m5, m4
|
|
mova m6, m7
|
|
PROC32_8x8 1, 1, 11,30,17,4,23,10,29,16
|
|
|
|
; Row[16 - 23]
|
|
movu m7, [r2 - 10]
|
|
palignr m0, m7, 3
|
|
palignr m1, m7, 2
|
|
mova m2, m1
|
|
palignr m3, m7, 1
|
|
mova m4, m3
|
|
mova m5, m3
|
|
mova m6, m7
|
|
PROC32_8x8 2, 1, 3,22,9,28,15,2,21,8
|
|
|
|
; Row[24 - 31]
|
|
movu m7, [r2 - 13]
|
|
palignr m0, m7, 2
|
|
mova m1, m0
|
|
mova m2, m0
|
|
palignr m3, m7, 1
|
|
mova m4, m3
|
|
mova m5, m7
|
|
mova m6, m7
|
|
PROC32_8x8 3, 1, 27,14,1,20,7,26,13,0
|
|
|
|
lea r0, [r6 + r1 * 4]
|
|
lea r6, [r6 + r1 * 8]
|
|
add r2, 8
|
|
dec byte [rsp + 63]
|
|
jnz .loop
|
|
mov rsp, [rsp+64]
|
|
RET
|
|
|
|
INIT_XMM sse4
|
|
cglobal intra_pred_ang32_15, 4,7,8
|
|
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
|
|
mov r6, rsp
|
|
sub rsp, 64+gprsize
|
|
and rsp, ~63
|
|
mov [rsp+64], r6
|
|
|
|
; collect reference pixel
|
|
movu m0, [r2]
|
|
movu m1, [r2 + 15]
|
|
pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
|
|
pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
|
|
mova [rsp], m1
|
|
movu [rsp + 8], m0
|
|
movu m0, [r2 + 1 + 64]
|
|
movu m1, [r2 + 1 + 16 + 64]
|
|
movu [rsp + 17], m0
|
|
movu [rsp + 17 + 16], m1
|
|
mov [rsp + 63], byte 4
|
|
|
|
; filter
|
|
lea r2, [rsp + 17] ; r2 -> [0]
|
|
lea r3, [c_shuf8_0] ; r3 -> shuffle8
|
|
lea r4, [ang_table] ; r4 -> ang_table
|
|
lea r5, [r1 * 3] ; r5 -> 3 * stride
|
|
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
|
|
mova m5, [pw_1024] ; m5 -> 1024
|
|
mova m6, [c_deinterval8] ; m6 -> c_deinterval8
|
|
|
|
.loop:
|
|
; Row[0 - 7]
|
|
movu m7, [r2 - 5]
|
|
palignr m0, m7, 4
|
|
palignr m1, m7, 3
|
|
mova m2, m1
|
|
palignr m3, m7, 2
|
|
mova m4, m3
|
|
palignr m5, m7, 1
|
|
mova m6, m5
|
|
PROC32_8x8 0, 1, 15,30,13,28,11,26,9,24
|
|
|
|
; Row[8 - 15]
|
|
movu m7, [r2 - 9]
|
|
palignr m0, m7, 4
|
|
palignr m1, m7, 3
|
|
mova m2, m1
|
|
palignr m3, m7, 2
|
|
mova m4, m3
|
|
palignr m5, m7, 1
|
|
mova m6, m5
|
|
PROC32_8x8 1, 1, 7,22,5,20,3,18,1,16
|
|
|
|
; Row[16 - 23]
|
|
movu m7, [r2 - 13]
|
|
palignr m0, m7, 3
|
|
mova m1, m0
|
|
palignr m2, m7, 2
|
|
mova m3, m2
|
|
palignr m4, m7, 1
|
|
mova m5, m4
|
|
mova m6, m7
|
|
PROC32_8x8 2, 1, 31,14,29,12,27,10,25,8
|
|
|
|
; Row[24 - 31]
|
|
movu m7, [r2 - 17]
|
|
palignr m0, m7, 3
|
|
mova m1, m0
|
|
palignr m2, m7, 2
|
|
mova m3, m2
|
|
palignr m4, m7, 1
|
|
mova m5, m4
|
|
mova m6, m7
|
|
PROC32_8x8 3, 1, 23,6,21,4,19,2,17,0
|
|
|
|
lea r0, [r6 + r1 * 4]
|
|
lea r6, [r6 + r1 * 8]
|
|
add r2, 8
|
|
dec byte [rsp + 63]
|
|
jnz .loop
|
|
mov rsp, [rsp+64]
|
|
RET
|
|
|
|
INIT_XMM sse4
|
|
cglobal intra_pred_ang32_16, 4,7,8
|
|
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
|
|
mov r6, rsp
|
|
sub rsp, 64+gprsize
|
|
and rsp, ~63
|
|
mov [rsp+64], r6
|
|
|
|
; collect reference pixel
|
|
movu m0, [r2]
|
|
movu m1, [r2 + 15]
|
|
pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
|
|
pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
|
|
mova [rsp], m1
|
|
movu [rsp + 10], m0
|
|
movu m0, [r2 + 1 + 64]
|
|
movu m1, [r2 + 1 + 16 + 64]
|
|
movu [rsp + 21], m0
|
|
movu [rsp + 21 + 16], m1
|
|
mov [rsp + 63], byte 4
|
|
|
|
; filter
|
|
lea r2, [rsp + 21] ; r2 -> [0]
|
|
lea r3, [c_shuf8_0] ; r3 -> shuffle8
|
|
lea r4, [ang_table] ; r4 -> ang_table
|
|
lea r5, [r1 * 3] ; r5 -> 3 * stride
|
|
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
|
|
mova m5, [pw_1024] ; m5 -> 1024
|
|
mova m6, [c_deinterval8] ; m6 -> c_deinterval8
|
|
|
|
.loop:
|
|
; Row[0 - 7]
|
|
movu m7, [r2 - 6]
|
|
palignr m0, m7, 5
|
|
palignr m1, m7, 4
|
|
mova m2, m1
|
|
palignr m3, m7, 3
|
|
palignr m4, m7, 2
|
|
mova m5, m4
|
|
palignr m6, m7, 1
|
|
PROC32_8x8 0, 1, 11,22,1,12,23,2,13,24
|
|
|
|
; Row[8 - 15]
|
|
movu m7, [r2 - 11]
|
|
palignr m0, m7, 5
|
|
palignr m1, m7, 4
|
|
palignr m2, m7, 3
|
|
mova m3, m2
|
|
palignr m4, m7, 2
|
|
palignr m5, m7, 1
|
|
mova m6, m5
|
|
PROC32_8x8 1, 1, 3,14,25,4,15,26,5,16
|
|
|
|
; Row[16 - 23]
|
|
movu m7, [r2 - 16]
|
|
palignr m0, m7, 4
|
|
mova m1, m0
|
|
palignr m2, m7, 3
|
|
palignr m3, m7, 2
|
|
mova m4, m3
|
|
palignr m5, m7, 1
|
|
mova m6, m7
|
|
PROC32_8x8 2, 1, 27,6,17,28,7,18,29,8
|
|
|
|
; Row[24 - 31]
|
|
movu m7, [r2 - 21]
|
|
palignr m0, m7, 4
|
|
palignr m1, m7, 3
|
|
mova m2, m1
|
|
palignr m3, m7, 2
|
|
palignr m4, m7, 1
|
|
mova m5, m4
|
|
mova m6, m7
|
|
PROC32_8x8 3, 1, 19,30,9,20,31,10,21,0
|
|
|
|
lea r0, [r6 + r1 * 4]
|
|
lea r6, [r6 + r1 * 8]
|
|
add r2, 8
|
|
dec byte [rsp + 63]
|
|
jnz .loop
|
|
mov rsp, [rsp+64]
|
|
RET
|
|
|
|
INIT_XMM sse4
|
|
cglobal intra_pred_ang32_17, 4,7,8
|
|
; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
|
|
mov r6, rsp
|
|
sub rsp, 64+gprsize
|
|
and rsp, ~63
|
|
mov [rsp+64], r6
|
|
|
|
; collect reference pixel
|
|
movu m0, [r2]
|
|
movu m1, [r2 + 16]
|
|
pshufb m0, [c_mode32_17_0]
|
|
pshufb m1, [c_mode32_17_0]
|
|
mova [rsp ], m1
|
|
movu [rsp + 13], m0
|
|
movu m0, [r2 + 1 + 64]
|
|
movu m1, [r2 + 1 + 16 + 64]
|
|
movu [rsp + 26], m0
|
|
movu [rsp + 26 + 16], m1
|
|
mov [rsp + 63], byte 4
|
|
|
|
; filter
|
|
lea r2, [rsp + 25] ; r2 -> [0]
|
|
lea r3, [c_shuf8_0] ; r3 -> shuffle8
|
|
lea r4, [ang_table] ; r4 -> ang_table
|
|
lea r5, [r1 * 3] ; r5 -> 3 * stride
|
|
lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
|
|
mova m5, [pw_1024] ; m5 -> 1024
|
|
mova m6, [c_deinterval8] ; m6 -> c_deinterval8
|
|
|
|
.loop:
|
|
; Row[0 - 7]
|
|
movu m7, [r2 - 6]
|
|
palignr m0, m7, 6
|
|
palignr m1, m7, 5
|
|
palignr m2, m7, 4
|
|
palignr m3, m7, 3
|
|
palignr m4, m7, 2
|
|
mova m5, m4
|
|
palignr m6, m7, 1
|
|
PROC32_8x8 0, 1, 6,12,18,24,30,4,10,16
|
|
|
|
; Row[7 - 15]
|
|
movu m7, [r2 - 12]
|
|
palignr m0, m7, 5
|
|
palignr m1, m7, 4
|
|
mova m2, m1
|
|
palignr m3, m7, 3
|
|
palignr m4, m7, 2
|
|
palignr m5, m7, 1
|
|
mova m6, m7
|
|
PROC32_8x8 1, 1, 22,28,2,8,14,20,26,0
|
|
|
|
; Row[16 - 23]
|
|
movu m7, [r2 - 19]
|
|
palignr m0, m7, 6
|
|
palignr m1, m7, 5
|
|
palignr m2, m7, 4
|
|
palignr m3, m7, 3
|
|
palignr m4, m7, 2
|
|
mova m5, m4
|
|
palignr m6, m7, 1
|
|
PROC32_8x8 2, 1, 6,12,18,24,30,4,10,16
|
|
|
|
; Row[24 - 31]
|
|
movu m7, [r2 - 25]
|
|
palignr m0, m7, 5
|
|
palignr m1, m7, 4
|
|
mova m2, m1
|
|
palignr m3, m7, 3
|
|
palignr m4, m7, 2
|
|
palignr m5, m7, 1
|
|
mova m6, m7
|
|
PROC32_8x8 3, 1, 22,28,2,8,14,20,26,0
|
|
|
|
lea r0, [r6 + r1 * 4]
|
|
lea r6, [r6 + r1 * 8]
|
|
add r2, 8
|
|
dec byte [rsp + 63]
|
|
jnz .loop
|
|
mov rsp, [rsp+64]
|
|
|
|
RET
|
|
|
|
INIT_YMM avx2
|
|
cglobal intra_pred_ang32_18, 4, 4, 3
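    ; Mode 18 is the pure 45-degree diagonal: the angle advances exactly one
    ; sample per row, so no fractional interpolation is needed and each output
    ; row is just the previous one shifted by one byte, hence the palignr/movu
    ; pairs below.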
    movu        m0, [r2]
    movu        xm1, [r2 + 1 + 64]
    pshufb      xm1, [intra_pred_shuff_15_0]
    mova        xm2, xm0
    vinserti128 m1, m1, xm2, 1

    lea         r3, [r1 * 3]

    movu        [r0], m0
    palignr     m2, m0, m1, 15
    movu        [r0 + r1], m2
    palignr     m2, m0, m1, 14
    movu        [r0 + r1 * 2], m2
    palignr     m2, m0, m1, 13
    movu        [r0 + r3], m2

    lea         r0, [r0 + r1 * 4]
    palignr     m2, m0, m1, 12
    movu        [r0], m2
    palignr     m2, m0, m1, 11
    movu        [r0 + r1], m2
    palignr     m2, m0, m1, 10
    movu        [r0 + r1 * 2], m2
    palignr     m2, m0, m1, 9
    movu        [r0 + r3], m2

    lea         r0, [r0 + r1 * 4]
    palignr     m2, m0, m1, 8
    movu        [r0], m2
    palignr     m2, m0, m1, 7
    movu        [r0 + r1], m2
    palignr     m2, m0, m1, 6
    movu        [r0 + r1 * 2], m2
    palignr     m2, m0, m1, 5
    movu        [r0 + r3], m2

    lea         r0, [r0 + r1 * 4]
    palignr     m2, m0, m1, 4
    movu        [r0], m2
    palignr     m2, m0, m1, 3
    movu        [r0 + r1], m2
    palignr     m2, m0, m1, 2
    movu        [r0 + r1 * 2], m2
    palignr     m2, m0, m1, 1
    movu        [r0 + r3], m2

    lea         r0, [r0 + r1 * 4]
    movu        [r0], m1

    movu        xm0, [r2 + 64 + 17]
    pshufb      xm0, [intra_pred_shuff_15_0]
    vinserti128 m0, m0, xm1, 1

    palignr     m2, m1, m0, 15
    movu        [r0 + r1], m2
    palignr     m2, m1, m0, 14
    movu        [r0 + r1 * 2], m2
    palignr     m2, m1, m0, 13
    movu        [r0 + r3], m2

    lea         r0, [r0 + r1 * 4]
    palignr     m2, m1, m0, 12
    movu        [r0], m2
    palignr     m2, m1, m0, 11
    movu        [r0 + r1], m2
    palignr     m2, m1, m0, 10
    movu        [r0 + r1 * 2], m2
    palignr     m2, m1, m0, 9
    movu        [r0 + r3], m2

    lea         r0, [r0 + r1 * 4]
    palignr     m2, m1, m0, 8
    movu        [r0], m2
    palignr     m2, m1, m0, 7
    movu        [r0 + r1], m2
    palignr     m2, m1, m0, 6
    movu        [r0 + r1 * 2], m2
    palignr     m2, m1, m0, 5
    movu        [r0 + r3], m2

    lea         r0, [r0 + r1 * 4]
    palignr     m2, m1, m0, 4
    movu        [r0], m2
    palignr     m2, m1, m0, 3
    movu        [r0 + r1], m2
    palignr     m2, m1, m0, 2
    movu        [r0 + r1 * 2], m2
    palignr     m2, m1, m0, 1
    movu        [r0 + r3], m2
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_18, 4,5,5
    movu        m0, [r2]                    ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    movu        m1, [r2 + 16]               ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16]
    movu        m2, [r2 + 1 + 64]           ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m3, [r2 + 17 + 64]          ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]

    lea         r2, [r1 * 2]
    lea         r3, [r1 * 3]
    lea         r4, [r1 * 4]

    movu        [r0], m0
    movu        [r0 + 16], m1

    pshufb      m2, [c_mode32_18_0]         ; [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    pshufb      m3, [c_mode32_18_0]         ; [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]

    palignr     m4, m0, m2, 15
    movu        [r0 + r1], m4
    palignr     m4, m1, m0, 15
    movu        [r0 + r1 + 16], m4
    palignr     m4, m0, m2, 14
    movu        [r0 + r2], m4
    palignr     m4, m1, m0, 14
    movu        [r0 + r2 + 16], m4
    palignr     m4, m0, m2, 13
    movu        [r0 + r3], m4
    palignr     m4, m1, m0, 13
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    palignr     m4, m0, m2, 12
    movu        [r0], m4
    palignr     m4, m1, m0, 12
    movu        [r0 + 16], m4
    palignr     m4, m0, m2, 11
    movu        [r0 + r1], m4
    palignr     m4, m1, m0, 11
    movu        [r0 + r1 + 16], m4
    palignr     m4, m0, m2, 10
    movu        [r0 + r2], m4
    palignr     m4, m1, m0, 10
    movu        [r0 + r2 + 16], m4
    palignr     m4, m0, m2, 9
    movu        [r0 + r3], m4
    palignr     m4, m1, m0, 9
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    palignr     m4, m0, m2, 8
    movu        [r0], m4
    palignr     m4, m1, m0, 8
    movu        [r0 + 16], m4
    palignr     m4, m0, m2, 7
    movu        [r0 + r1], m4
    palignr     m4, m1, m0, 7
    movu        [r0 + r1 + 16], m4
    palignr     m4, m0, m2, 6
    movu        [r0 + r2], m4
    palignr     m4, m1, m0, 6
    movu        [r0 + r2 + 16], m4
    palignr     m4, m0, m2, 5
    movu        [r0 + r3], m4
    palignr     m4, m1, m0, 5
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    palignr     m4, m0, m2, 4
    movu        [r0], m4
    palignr     m4, m1, m0, 4
    movu        [r0 + 16], m4
    palignr     m4, m0, m2, 3
    movu        [r0 + r1], m4
    palignr     m4, m1, m0, 3
    movu        [r0 + r1 + 16], m4
    palignr     m4, m0, m2, 2
    movu        [r0 + r2], m4
    palignr     m4, m1, m0, 2
    movu        [r0 + r2 + 16], m4
    palignr     m4, m0, m2, 1
    movu        [r0 + r3], m4
    palignr     m4, m1, m0, 1
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    movu        [r0], m2
    movu        [r0 + 16], m0
    palignr     m4, m2, m3, 15
    movu        [r0 + r1], m4
    palignr     m4, m0, m2, 15
    movu        [r0 + r1 + 16], m4
    palignr     m4, m2, m3, 14
    movu        [r0 + r2], m4
    palignr     m4, m0, m2, 14
    movu        [r0 + r2 + 16], m4
    palignr     m4, m2, m3, 13
    movu        [r0 + r3], m4
    palignr     m4, m0, m2, 13
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    palignr     m4, m2, m3, 12
    movu        [r0], m4
    palignr     m4, m0, m2, 12
    movu        [r0 + 16], m4
    palignr     m4, m2, m3, 11
    movu        [r0 + r1], m4
    palignr     m4, m0, m2, 11
    movu        [r0 + r1 + 16], m4
    palignr     m4, m2, m3, 10
    movu        [r0 + r2], m4
    palignr     m4, m0, m2, 10
    movu        [r0 + r2 + 16], m4
    palignr     m4, m2, m3, 9
    movu        [r0 + r3], m4
    palignr     m4, m0, m2, 9
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    palignr     m4, m2, m3, 8
    movu        [r0], m4
    palignr     m4, m0, m2, 8
    movu        [r0 + 16], m4
    palignr     m4, m2, m3, 7
    movu        [r0 + r1], m4
    palignr     m4, m0, m2, 7
    movu        [r0 + r1 + 16], m4
    palignr     m4, m2, m3, 6
    movu        [r0 + r2], m4
    palignr     m4, m0, m2, 6
    movu        [r0 + r2 + 16], m4
    palignr     m4, m2, m3, 5
    movu        [r0 + r3], m4
    palignr     m4, m0, m2, 5
    movu        [r0 + r3 + 16], m4

    lea         r0, [r0 + r4]

    palignr     m4, m2, m3, 4
    movu        [r0], m4
    palignr     m4, m0, m2, 4
    movu        [r0 + 16], m4
    palignr     m4, m2, m3, 3
    movu        [r0 + r1], m4
    palignr     m4, m0, m2, 3
    movu        [r0 + r1 + 16], m4
    palignr     m4, m2, m3, 2
    movu        [r0 + r2], m4
    palignr     m4, m0, m2, 2
    movu        [r0 + r2 + 16], m4
    palignr     m4, m2, m3, 1
    movu        [r0 + r3], m4
    palignr     m4, m0, m2, 1
    movu        [r0 + r3 + 16], m4
    RET
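
; Unlike the AVX2 version above, this SSE4 variant emits each 32-sample row as
; two 16-byte halves ([r0] and [r0 + 16]), keeping the palignr sources for the
; low and high halves of each row in step.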

INIT_XMM sse4
cglobal intra_pred_ang32_19, 4,7,8
    ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2 + 64]
    pinsrb      m0, [r2], 0
    movu        m1, [r2 + 16 + 64]
    pshufb      m0, [c_mode32_17_0]
    pshufb      m1, [c_mode32_17_0]
    mova        [rsp], m1
    movu        [rsp + 13], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 1 + 16]
    movu        [rsp + 26], m0
    movu        [rsp + 26 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 25]              ; r2 -> [0]
    lea         r3, [c_shuf8_0]             ; r3 -> shuffle8
    lea         r4, [ang_table]             ; r4 -> ang_table
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r0]                    ; r6 -> r0
    mova        m5, [pw_1024]               ; m5 -> 1024
    mova        m6, [c_deinterval8]         ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 6]
    palignr     m0, m7, 6
    palignr     m1, m7, 5
    palignr     m2, m7, 4
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  0, 0, 6,12,18,24,30,4,10,16

    ; Row[8 - 15]
    movu        m7, [r2 - 12]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 22,28,2,8,14,20,26,0

    ; Row[16 - 23]
    movu        m7, [r2 - 19]
    palignr     m0, m7, 6
    palignr     m1, m7, 5
    palignr     m2, m7, 4
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 6,12,18,24,30,4,10,16

    ; Row[24 - 31]
    movu        m7, [r2 - 25]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 22,28,2,8,14,20,26,0

    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_20, 4,7,8
    ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2 + 64]
    pinsrb      m0, [r2], 0
    movu        m1, [r2 + 15 + 64]
    pshufb      m0, [c_mode32_16_0]         ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
    pshufb      m1, [c_mode32_16_0]         ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
    mova        [rsp], m1
    movu        [rsp + 10], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 1 + 16]
    movu        [rsp + 21], m0
    movu        [rsp + 21 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 21]              ; r2 -> [0]
    lea         r3, [c_shuf8_0]             ; r3 -> shuffle8
    lea         r4, [ang_table]             ; r4 -> ang_table
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r0]                    ; r6 -> r0
    mova        m5, [pw_1024]               ; m5 -> 1024
    mova        m6, [c_deinterval8]         ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 6]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  0, 0, 11,22,1,12,23,2,13,24

    ; Row[8 - 15]
    movu        m7, [r2 - 11]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    palignr     m2, m7, 3
    mova        m3, m2
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m5
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 3,14,25,4,15,26,5,16

    ; Row[16 - 23]
    movu        m7, [r2 - 16]
    palignr     m0, m7, 4
    mova        m1, m0
    palignr     m2, m7, 3
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 27,6,17,28,7,18,29,8

    ; Row[24 - 31]
    movu        m7, [r2 - 21]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 19,30,9,20,31,10,21,0

    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_21, 4,7,8
    ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2 + 64]
    pinsrb      m0, [r2], 0
    movu        m1, [r2 + 15 + 64]
    pshufb      m0, [c_mode32_15_0]         ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
    pshufb      m1, [c_mode32_15_0]         ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
    mova        [rsp], m1
    movu        [rsp + 8], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 1 + 16]
    movu        [rsp + 17], m0
    movu        [rsp + 17 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 17]              ; r2 -> [0]
    lea         r3, [c_shuf8_0]             ; r3 -> shuffle8
    lea         r4, [ang_table]             ; r4 -> ang_table
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r0]                    ; r6 -> r0
    mova        m5, [pw_1024]               ; m5 -> 1024
    mova        m6, [c_deinterval8]         ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 5]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m5
    PROC32_8x8  0, 0, 15,30,13,28,11,26,9,24

    ; Row[8 - 15]
    movu        m7, [r2 - 9]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m5
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 7,22,5,20,3,18,1,16

    ; Row[16 - 23]
    movu        m7, [r2 - 13]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 31,14,29,12,27,10,25,8

    ; Row[24 - 31]
    movu        m7, [r2 - 17]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 23,6,21,4,19,2,17,0

    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_22, 4,7,8
    ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2 + 64]
    pinsrb      m0, [r2], 0
    movu        m1, [r2 + 15 + 64]
    pshufb      m0, [c_mode32_14_0]         ; [x x x x x x x x x 0 2 5 7 10 12 15]
    pshufb      m1, [c_mode32_14_0]         ; [x x x x x x x x x 15 17 20 22 25 27 30]
    pslldq      m1, 10                      ; [17 20 22 25 27 30 x x x x x x x x x x]
    palignr     m0, m1, 10                  ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
    mova        [rsp], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 1 + 16]
    movu        [rsp + 13], m0
    movu        [rsp + 13 + 16], m1
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 13]              ; r2 -> [0]
    lea         r3, [c_shuf8_0]             ; r3 -> shuffle8
    lea         r4, [ang_table]             ; r4 -> ang_table
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r0]                    ; r6 -> r0
    mova        m5, [pw_1024]               ; m5 -> 1024
    mova        m6, [c_deinterval8]         ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 4]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m4
    PROC32_8x8  0, 0, 19,6,25,12,31,18,5,24

    ; Row[8 - 15]
    movu        m7, [r2 - 7]
    palignr     m0, m7, 3
    palignr     m1, m7, 2
    mova        m2, m1
    mova        m3, m1
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 11,30,17,4,23,10,29,16

    ; Row[16 - 23]
    movu        m7, [r2 - 10]
    palignr     m0, m7, 3
    palignr     m1, m7, 2
    mova        m2, m1
    palignr     m3, m7, 1
    mova        m4, m3
    mova        m5, m3
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 3,22,9,28,15,2,21,8

    ; Row[24 - 31]
    movu        m7, [r2 - 13]
    palignr     m0, m7, 2
    mova        m1, m0
    mova        m2, m0
    palignr     m3, m7, 1
    mova        m4, m3
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 27,14,1,20,7,26,13,0

    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_23, 4,7,8,0-(1*mmsize)
    %define above [rsp + 0 * mmsize]
    lea         r3, [r2 + 64]
    lea         r4, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    mov         r6, r0
    mova        m7, [pw_1024]

    MODE_13_23_ROW0 0
    add         r6, 8
    mov         r0, r6
    add         r2, 7
    mov         r3, 3
.loop:
    MODE_13_23 0, 0
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r3
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_24, 4,7,8,0-(1*mmsize)
    %define above [rsp + 0 * mmsize]
    lea         r3, [r2 + 64]
    lea         r4, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    mov         r6, r0
    mova        m7, [pw_1024]

    MODE_12_24_ROW0 0
    add         r6, 8
    mov         r0, r6
    add         r2, 7
    mov         r3, 3
.loop:
    MODE_12_24 0
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r3
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_25, 4,7,8
    ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6

    ; collect reference pixel
    movu        m0, [r2 + 16 + 64]
    pxor        m1, m1
    pshufb      m0, m1                      ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
    mova        [rsp], m0
    movu        m0, [r2]
    movu        m1, [r2 + 16]
    movu        m2, [r2 + 32]
    movu        [rsp + 1], m0
    movu        [rsp + 1 + 16], m1
    movu        [rsp + 1 + 32], m2
    mov         [rsp + 63], byte 4

    ; filter
    lea         r2, [rsp + 1]               ; r2 -> [0]
    lea         r3, [c_shuf8_0]             ; r3 -> shuffle8
    lea         r4, [ang_table]             ; r4 -> ang_table
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r0]                    ; r6 -> r0
    mova        m5, [pw_1024]               ; m5 -> 1024
    mova        m6, [c_deinterval8]         ; m6 -> c_deinterval8

.loop:
    ; Row[0 - 7]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  0, 0, 30,28,26,24,22,20,18,16

    ; Row[8 - 15]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 14,12,10,8,6,4,2,0

    ; Row[16 - 23]
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 30,28,26,24,22,20,18,16

    ; Row[24 - 31]
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 14,12,10,8,6,4,2,0

    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_26, 5,7,7,0-(2*mmsize)
    %define m8 [rsp + 0 * mmsize]
    %define m9 [rsp + 1 * mmsize]
    mov         r6, 2
    movu        m0, [r2 + 64]
    pinsrb      m0, [r2], 0
    movu        m1, [r2 + 1 + 64]
    mova        m8, m0
    mova        m9, m1
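    ; m8/m9 are stack spill slots (see the %defines above): they keep the
    ; top-left sample patched into the left-neighbour row, for use by the
    ; optional edge filter further down; r3d below receives the caller's
    ; filter flag (5th argument).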
    mov         r3d, r4d
    lea         r4, [r1 * 3]

.loop:
    movu        m0, [r2 + 1]

    movu        [r0], m0
    movu        [r0 + r1], m0
    movu        [r0 + r1 * 2], m0
    movu        [r0 + r4], m0
    lea         r5, [r0 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    lea         r5, [r0 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0

    ; filter
    cmp         r3d, byte 0
    jz         .quit
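
    ; Edge smoothing for the pure-vertical mode (sketch of the intent):
    ;     dst(0, y) = clip(top[0] + ((left[y] - topleft) >> 1))
    ; computed 16 rows at a time in 16-bit lanes (psubw/psraw/paddw) and
    ; repacked with packuswb before the per-row pextrb stores below.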

    pxor        m4, m4
    pshufb      m0, m4
    pmovzxbw    m0, m0
    mova        m1, m0
    movu        m2, m8
    movu        m3, m9

    pshufb      m2, m4
    pmovzxbw    m2, m2
    movhlps     m4, m3
    pmovzxbw    m3, m3
    pmovzxbw    m4, m4
    psubw       m3, m2
    psubw       m4, m2
    psraw       m3, 1
    psraw       m4, 1
    paddw       m0, m3
    paddw       m1, m4
    packuswb    m0, m1

    pextrb      [r0], m0, 0
    pextrb      [r0 + r1], m0, 1
    pextrb      [r0 + r1 * 2], m0, 2
    pextrb      [r0 + r4], m0, 3
    lea         r5, [r0 + r1 * 4]
    pextrb      [r5], m0, 4
    pextrb      [r5 + r1], m0, 5
    pextrb      [r5 + r1 * 2], m0, 6
    pextrb      [r5 + r4], m0, 7
    lea         r5, [r5 + r1 * 4]
    pextrb      [r5], m0, 8
    pextrb      [r5 + r1], m0, 9
    pextrb      [r5 + r1 * 2], m0, 10
    pextrb      [r5 + r4], m0, 11
    lea         r5, [r5 + r1 * 4]
    pextrb      [r5], m0, 12
    pextrb      [r5 + r1], m0, 13
    pextrb      [r5 + r1 * 2], m0, 14
    pextrb      [r5 + r4], m0, 15

.quit:
    lea         r2, [r2 + 16]
    add         r0, 16
    dec         r6d
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_27, 3,7,8
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]
.loop:
    MODE_9_27 0
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_28, 3,7,8
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]
.loop:
    MODE_8_28 0
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_29, 3,7,8
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]
.loop:
    MODE_7_29 0
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_30, 3,7,8
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]
.loop:
    MODE_6_30 0
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_31, 3,7,8
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]
.loop:
    MODE_5_31 0
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_32, 3,7,8
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]
.loop:
    MODE_4_32 0
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r4
    jnz        .loop
    RET

INIT_XMM sse4
cglobal intra_pred_ang32_33, 3,7,8
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]
.loop:
    MODE_3_33 0
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r4
    jnz        .loop
    RET

;-----------------------------------------------------------------------------------------
; start of intra_pred_ang32 angular modes avx2 asm
;-----------------------------------------------------------------------------------------

%if ARCH_X86_64 == 1
INIT_YMM avx2

; register mapping :
; %1-%8 - output registers
; %9    - temp register
; %10   - for label naming
%macro TRANSPOSE_32x8_AVX2 10
    jnz        .skip%10
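    ; The flags here are still those of the caller's `test r7d, r7d` (the
    ; vector ops and lea in between do not touch EFLAGS): nonzero r7d means
    ; the caller wants rows stored in raster order, so the transpose is
    ; skipped; zero r7d falls through to the 8x32 -> 32x8 transpose below.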

    ; transpose 8x32 to 32x8 and then store
    punpcklbw   m%9, m%1, m%2
    punpckhbw   m%1, m%2
    punpcklbw   m%2, m%3, m%4
    punpckhbw   m%3, m%4
    punpcklbw   m%4, m%5, m%6
    punpckhbw   m%5, m%6
    punpcklbw   m%6, m%7, m%8
    punpckhbw   m%7, m%8

    punpcklwd   m%8, m%9, m%2
    punpckhwd   m%9, m%2
    punpcklwd   m%2, m%4, m%6
    punpckhwd   m%4, m%6
    punpcklwd   m%6, m%1, m%3
    punpckhwd   m%1, m%3
    punpcklwd   m%3, m%5, m%7
    punpckhwd   m%5, m%7

    punpckldq   m%7, m%8, m%2
    punpckhdq   m%8, m%2
    punpckldq   m%2, m%6, m%3
    punpckhdq   m%6, m%3
    punpckldq   m%3, m%9, m%4
    punpckhdq   m%9, m%4
    punpckldq   m%4, m%1, m%5
    punpckhdq   m%1, m%5

    movq        [r0 + r1 * 0], xm%7
    movhps      [r0 + r1 * 1], xm%7
    movq        [r0 + r1 * 2], xm%8
    movhps      [r0 + r5 * 1], xm%8

    lea         r0, [r0 + r6]

    movq        [r0 + r1 * 0], xm%3
    movhps      [r0 + r1 * 1], xm%3
    movq        [r0 + r1 * 2], xm%9
    movhps      [r0 + r5 * 1], xm%9

    lea         r0, [r0 + r6]

    movq        [r0 + r1 * 0], xm%2
    movhps      [r0 + r1 * 1], xm%2
    movq        [r0 + r1 * 2], xm%6
    movhps      [r0 + r5 * 1], xm%6

    lea         r0, [r0 + r6]

    movq        [r0 + r1 * 0], xm%4
    movhps      [r0 + r1 * 1], xm%4
    movq        [r0 + r1 * 2], xm%1
    movhps      [r0 + r5 * 1], xm%1

    lea         r0, [r0 + r6]

    vpermq      m%8, m%8, 00001110b
    vpermq      m%7, m%7, 00001110b
    vpermq      m%6, m%6, 00001110b
    vpermq      m%3, m%3, 00001110b
    vpermq      m%9, m%9, 00001110b
    vpermq      m%2, m%2, 00001110b
    vpermq      m%4, m%4, 00001110b
    vpermq      m%1, m%1, 00001110b

    movq        [r0 + r1 * 0], xm%7
    movhps      [r0 + r1 * 1], xm%7
    movq        [r0 + r1 * 2], xm%8
    movhps      [r0 + r5 * 1], xm%8

    lea         r0, [r0 + r6]

    movq        [r0 + r1 * 0], xm%3
    movhps      [r0 + r1 * 1], xm%3
    movq        [r0 + r1 * 2], xm%9
    movhps      [r0 + r5 * 1], xm%9

    lea         r0, [r0 + r6]

    movq        [r0 + r1 * 0], xm%2
    movhps      [r0 + r1 * 1], xm%2
    movq        [r0 + r1 * 2], xm%6
    movhps      [r0 + r5 * 1], xm%6

    lea         r0, [r0 + r6]

    movq        [r0 + r1 * 0], xm%4
    movhps      [r0 + r1 * 1], xm%4
    movq        [r0 + r1 * 2], xm%1
    movhps      [r0 + r5 * 1], xm%1

    lea         r0, [r4 + 8]
    jmp        .end%10
.skip%10:
    movu        [r0 + r1 * 0], m%1
    movu        [r0 + r1 * 1], m%2
    movu        [r0 + r1 * 2], m%3
    movu        [r0 + r5 * 1], m%4

    lea         r0, [r0 + r6]

    movu        [r0 + r1 * 0], m%5
    movu        [r0 + r1 * 1], m%6
    movu        [r0 + r1 * 2], m%7
    movu        [r0 + r5 * 1], m%8

    lea         r0, [r0 + r6]
.end%10:
%endmacro
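
; Usage note (as invoked below): %1-%8 carry the eight result rows, %9 is
; clobbered as a temporary, and %10 only disambiguates the local labels.
; The caller must provide r1 = stride, r5 = 3 * stride, r6 = 4 * stride, and
; must have just executed `test r7d, r7d` so the jnz at the top of the macro
; sees the intended flags.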

cglobal ang32_mode_3_33_row_0_15
    test        r7d, r7d
    ; rows 0 to 7
    movu        m0, [r2 + 1]                ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]                ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu        m3, [r2 + 17]               ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4, [r2 + 18]               ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2, m0, m1                  ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m3, m4                      ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4, m0, [r3 + 10 * 32]      ; [26]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m2, [r3 + 10 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m5, m2, m0, 2
    palignr     m1, m3, m2, 2
    pmaddubsw   m5, [r3 + 4 * 32]           ; [20]
    pmulhrsw    m5, m7
    pmaddubsw   m1, [r3 + 4 * 32]
    pmulhrsw    m1, m7
    packuswb    m5, m1

    palignr     m6, m2, m0, 4
    palignr     m1, m3, m2, 4
    pmaddubsw   m6, [r3 - 2 * 32]           ; [14]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 - 2 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m8, m2, m0, 6
    palignr     m1, m3, m2, 6
    pmaddubsw   m8, [r3 - 8 * 32]           ; [8]
    pmulhrsw    m8, m7
    pmaddubsw   m1, [r3 - 8 * 32]
    pmulhrsw    m1, m7
    packuswb    m8, m1

    palignr     m10, m2, m0, 8
    palignr     m11, m3, m2, 8
    pmaddubsw   m9, m10, [r3 - 14 * 32]     ; [2]
    pmulhrsw    m9, m7
    pmaddubsw   m1, m11, [r3 - 14 * 32]
    pmulhrsw    m1, m7
    packuswb    m9, m1

    pmaddubsw   m10, [r3 + 12 * 32]         ; [28]
    pmulhrsw    m10, m7
    pmaddubsw   m11, [r3 + 12 * 32]
    pmulhrsw    m11, m7
    packuswb    m10, m11

    palignr     m11, m2, m0, 10
    palignr     m1, m3, m2, 10
    pmaddubsw   m11, [r3 + 6 * 32]          ; [22]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 + 6 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    palignr     m12, m2, m0, 12
    palignr     m1, m3, m2, 12
    pmaddubsw   m12, [r3]                   ; [16]
    pmulhrsw    m12, m7
    pmaddubsw   m1, [r3]
    pmulhrsw    m1, m7
    packuswb    m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m4, m2, m0, 14
    palignr     m1, m3, m2, 14
    pmaddubsw   m4, [r3 - 6 * 32]           ; [10]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 - 6 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, m2, [r3 - 12 * 32]      ; [4]
    pmulhrsw    m5, m7
    pmaddubsw   m1, m3, [r3 - 12 * 32]
    pmulhrsw    m1, m7
    packuswb    m5, m1

    pmaddubsw   m6, m2, [r3 + 14 * 32]      ; [30]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m3, [r3 + 14 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    movu        m0, [r2 + 25]
    movu        m1, [r2 + 26]
    punpcklbw   m0, m1

    palignr     m8, m3, m2, 2
    palignr     m1, m0, m3, 2
    pmaddubsw   m8, [r3 + 8 * 32]           ; [24]
    pmulhrsw    m8, m7
    pmaddubsw   m1, [r3 + 8 * 32]
    pmulhrsw    m1, m7
    packuswb    m8, m1

    palignr     m9, m3, m2, 4
    palignr     m1, m0, m3, 4
    pmaddubsw   m9, [r3 + 2 * 32]           ; [18]
    pmulhrsw    m9, m7
    pmaddubsw   m1, [r3 + 2 * 32]
    pmulhrsw    m1, m7
    packuswb    m9, m1

    palignr     m10, m3, m2, 6
    palignr     m1, m0, m3, 6
    pmaddubsw   m10, [r3 - 4 * 32]          ; [12]
    pmulhrsw    m10, m7
    pmaddubsw   m1, [r3 - 4 * 32]
    pmulhrsw    m1, m7
    packuswb    m10, m1

    palignr     m11, m3, m2, 8
    palignr     m1, m0, m3, 8
    pmaddubsw   m11, [r3 - 10 * 32]         ; [6]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 - 10 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    movu        m12, [r2 + 14]

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8
    ret
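
; The 32x32 block is generated in two 16-column halves: the mode 3/33 wrappers
; below call this helper twice, advancing the source pointer by 13 in between,
; which is the whole-sample offset that angle 26 accumulates over 16 rows
; (16 * 26 / 32 = 13).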

INIT_YMM avx2
cglobal intra_pred_ang32_3, 3,8,13
    add         r2, 64
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r1 * 4]                ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    mov         r4, r0
    xor         r7d, r7d

    call        ang32_mode_3_33_row_0_15

    add         r4, 16
    mov         r0, r4
    add         r2, 13

    call        ang32_mode_3_33_row_0_15
    RET

INIT_YMM avx2
cglobal intra_pred_ang32_33, 3,8,13
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r1 * 4]                ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    xor         r7d, r7d
    inc         r7d

    call        ang32_mode_3_33_row_0_15

    add         r2, 13

    call        ang32_mode_3_33_row_0_15
    RET

cglobal ang32_mode_4_32_row_0_15
    test        r7d, r7d
    ; rows 0 to 7
    movu        m0, [r2 + 1]                ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]                ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu        m3, [r2 + 17]               ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4, [r2 + 18]               ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2, m0, m1                  ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m3, m4                      ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4, m0, [r3 + 5 * 32]       ; [21]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m2, [r3 + 5 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m0, 2
    palignr     m1, m3, m2, 2
    pmaddubsw   m5, m6, [r3 - 6 * 32]       ; [10]
    pmulhrsw    m5, m7
    pmaddubsw   m8, m1, [r3 - 6 * 32]
    pmulhrsw    m8, m7
    packuswb    m5, m8

    pmaddubsw   m6, [r3 + 15 * 32]          ; [31]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 15 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m8, m2, m0, 4
    palignr     m1, m3, m2, 4
    pmaddubsw   m8, [r3 + 4 * 32]           ; [20]
    pmulhrsw    m8, m7
    pmaddubsw   m1, [r3 + 4 * 32]
    pmulhrsw    m1, m7
    packuswb    m8, m1

    palignr     m10, m2, m0, 6
    palignr     m11, m3, m2, 6
    pmaddubsw   m9, m10, [r3 - 7 * 32]      ; [9]
    pmulhrsw    m9, m7
    pmaddubsw   m1, m11, [r3 - 7 * 32]
    pmulhrsw    m1, m7
    packuswb    m9, m1

    pmaddubsw   m10, [r3 + 14 * 32]         ; [30]
    pmulhrsw    m10, m7
    pmaddubsw   m11, [r3 + 14 * 32]
    pmulhrsw    m11, m7
    packuswb    m10, m11

    palignr     m11, m2, m0, 8
    palignr     m1, m3, m2, 8
    pmaddubsw   m11, [r3 + 3 * 32]          ; [19]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 + 3 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    palignr     m12, m2, m0, 10
    palignr     m1, m3, m2, 10
    pmaddubsw   m12, [r3 - 8 * 32]          ; [8]
    pmulhrsw    m12, m7
    pmaddubsw   m1, [r3 - 8 * 32]
    pmulhrsw    m1, m7
    packuswb    m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m4, m2, m0, 10
    palignr     m1, m3, m2, 10
    pmaddubsw   m4, [r3 + 13 * 32]          ; [29]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 + 13 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m5, m2, m0, 12
    palignr     m1, m3, m2, 12
    pmaddubsw   m5, [r3 + 2 * 32]           ; [18]
    pmulhrsw    m5, m7
    pmaddubsw   m1, [r3 + 2 * 32]
    pmulhrsw    m1, m7
    packuswb    m5, m1

    palignr     m8, m2, m0, 14
    palignr     m1, m3, m2, 14
    pmaddubsw   m6, m8, [r3 - 9 * 32]       ; [7]
    pmulhrsw    m6, m7
    pmaddubsw   m9, m1, [r3 - 9 * 32]
    pmulhrsw    m9, m7
    packuswb    m6, m9

    pmaddubsw   m8, [r3 + 12 * 32]          ; [28]
    pmulhrsw    m8, m7
    pmaddubsw   m1, [r3 + 12 * 32]
    pmulhrsw    m1, m7
    packuswb    m8, m1

    pmaddubsw   m9, m2, [r3 + 1 * 32]       ; [17]
    pmulhrsw    m9, m7
    pmaddubsw   m1, m3, [r3 + 1 * 32]
    pmulhrsw    m1, m7
    packuswb    m9, m1

    movu        m0, [r2 + 25]
    movu        m1, [r2 + 26]
    punpcklbw   m0, m1

    palignr     m11, m3, m2, 2
    palignr     m1, m0, m3, 2
    pmaddubsw   m10, m11, [r3 - 10 * 32]    ; [6]
    pmulhrsw    m10, m7
    pmaddubsw   m12, m1, [r3 - 10 * 32]
    pmulhrsw    m12, m7
    packuswb    m10, m12

    pmaddubsw   m11, [r3 + 11 * 32]         ; [27]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 + 11 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    palignr     m0, m3, 4
    palignr     m3, m2, 4
    pmaddubsw   m3, [r3]                    ; [16]
    pmulhrsw    m3, m7
    pmaddubsw   m0, [r3]
    pmulhrsw    m0, m7
    packuswb    m3, m0

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 3, 0, 8
    ret

cglobal ang32_mode_4_32_row_16_31
    test        r7d, r7d
    ; rows 0 to 7
    movu        m0, [r2 + 1]                ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]                ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu        m3, [r2 + 17]               ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4, [r2 + 18]               ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2, m0, m1                  ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m3, m4                      ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4, m0, [r3 - 11 * 32]      ; [5]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m2, [r3 - 11 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, m0, [r3 + 10 * 32]      ; [26]
    pmulhrsw    m5, m7
    pmaddubsw   m1, m2, [r3 + 10 * 32]
    pmulhrsw    m1, m7
    packuswb    m5, m1

    palignr     m6, m2, m0, 2
    palignr     m1, m3, m2, 2
    pmaddubsw   m6, [r3 - 1 * 32]           ; [15]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 - 1 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m9, m2, m0, 4
    palignr     m10, m3, m2, 4
    pmaddubsw   m8, m9, [r3 - 12 * 32]      ; [4]
    pmulhrsw    m8, m7
    pmaddubsw   m1, m10, [r3 - 12 * 32]
    pmulhrsw    m1, m7
    packuswb    m8, m1

    pmaddubsw   m9, [r3 + 9 * 32]           ; [25]
    pmulhrsw    m9, m7
    pmaddubsw   m10, [r3 + 9 * 32]
    pmulhrsw    m10, m7
    packuswb    m9, m10

    palignr     m10, m2, m0, 6
    palignr     m11, m3, m2, 6
    pmaddubsw   m10, [r3 - 2 * 32]          ; [14]
    pmulhrsw    m10, m7
    pmaddubsw   m11, [r3 - 2 * 32]
    pmulhrsw    m11, m7
    packuswb    m10, m11

    palignr     m12, m2, m0, 8
    palignr     m1, m3, m2, 8
    pmaddubsw   m11, m12, [r3 - 13 * 32]    ; [3]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 - 13 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    palignr     m1, m3, m2, 8
    pmaddubsw   m12, [r3 + 8 * 32]          ; [24]
    pmulhrsw    m12, m7
    pmaddubsw   m1, [r3 + 8 * 32]
    pmulhrsw    m1, m7
    packuswb    m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m4, m2, m0, 10
    palignr     m1, m3, m2, 10
    pmaddubsw   m4, [r3 - 3 * 32]           ; [13]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 - 3 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m0, 12
    palignr     m8, m3, m2, 12
    pmaddubsw   m5, m6, [r3 - 14 * 32]      ; [2]
    pmulhrsw    m5, m7
    pmaddubsw   m1, m8, [r3 - 14 * 32]
    pmulhrsw    m1, m7
    packuswb    m5, m1

    pmaddubsw   m6, [r3 + 7 * 32]           ; [23]
    pmulhrsw    m6, m7
    pmaddubsw   m8, [r3 + 7 * 32]
    pmulhrsw    m8, m7
    packuswb    m6, m8

    palignr     m8, m2, m0, 14
    palignr     m1, m3, m2, 14
    pmaddubsw   m8, [r3 - 4 * 32]           ; [12]
    pmulhrsw    m8, m7
    pmaddubsw   m1, [r3 - 4 * 32]
    pmulhrsw    m1, m7
    packuswb    m8, m1

    pmaddubsw   m9, m2, [r3 - 15 * 32]      ; [1]
    pmulhrsw    m9, m7
    pmaddubsw   m1, m3, [r3 - 15 * 32]
    pmulhrsw    m1, m7
    packuswb    m9, m1

    pmaddubsw   m10, m2, [r3 + 6 * 32]      ; [22]
    pmulhrsw    m10, m7
    pmaddubsw   m1, m3, [r3 + 6 * 32]
    pmulhrsw    m1, m7
    packuswb    m10, m1

    movu        m0, [r2 + 25]
    movu        m1, [r2 + 26]
    punpcklbw   m0, m1

    palignr     m11, m3, m2, 2
    palignr     m1, m0, m3, 2
    pmaddubsw   m11, [r3 - 5 * 32]          ; [11]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 - 5 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    movu        m12, [r2 + 11]

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8
    ret

INIT_YMM avx2
cglobal intra_pred_ang32_4, 3,8,13
    add         r2, 64
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r1 * 4]                ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    mov         r4, r0
    xor         r7d, r7d

    call        ang32_mode_4_32_row_0_15

    add         r4, 16
    mov         r0, r4
    add         r2, 11

    call        ang32_mode_4_32_row_16_31
    RET

INIT_YMM avx2
cglobal intra_pred_ang32_32, 3,8,13
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r1 * 4]                ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    xor         r7d, r7d
    inc         r7d

    call        ang32_mode_4_32_row_0_15

    add         r2, 11

    call        ang32_mode_4_32_row_16_31
    RET

cglobal ang32_mode_5_31_row_0_15
    test        r7d, r7d
    ; rows 0 to 7
    movu        m0, [r2 + 1]                ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]                ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu        m3, [r2 + 17]               ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4, [r2 + 18]               ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2, m0, m1                  ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m3, m4                      ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4, m0, [r3 + 1 * 32]       ; [17]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m2, [r3 + 1 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m0, 2
    palignr     m1, m3, m2, 2
    pmaddubsw   m5, m6, [r3 - 14 * 32]      ; [2]
    pmulhrsw    m5, m7
    pmaddubsw   m8, m1, [r3 - 14 * 32]
    pmulhrsw    m8, m7
    packuswb    m5, m8

    pmaddubsw   m6, [r3 + 3 * 32]           ; [19]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 3 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m9, m2, m0, 4
    palignr     m10, m3, m2, 4
    pmaddubsw   m8, m9, [r3 - 12 * 32]      ; [4]
    pmulhrsw    m8, m7
    pmaddubsw   m1, m10, [r3 - 12 * 32]
    pmulhrsw    m1, m7
    packuswb    m8, m1

    pmaddubsw   m9, [r3 + 5 * 32]           ; [21]
    pmulhrsw    m9, m7
    pmaddubsw   m10, [r3 + 5 * 32]
    pmulhrsw    m10, m7
    packuswb    m9, m10

    palignr     m11, m2, m0, 6
    palignr     m12, m3, m2, 6
    pmaddubsw   m10, m11, [r3 - 10 * 32]    ; [6]
    pmulhrsw    m10, m7
    pmaddubsw   m1, m12, [r3 - 10 * 32]
    pmulhrsw    m1, m7
    packuswb    m10, m1

    pmaddubsw   m11, [r3 + 7 * 32]          ; [23]
    pmulhrsw    m11, m7
    pmaddubsw   m12, [r3 + 7 * 32]
    pmulhrsw    m12, m7
    packuswb    m11, m12

    palignr     m12, m2, m0, 8
    palignr     m1, m3, m2, 8
    pmaddubsw   m12, [r3 - 8 * 32]          ; [8]
    pmulhrsw    m12, m7
    pmaddubsw   m1, [r3 - 8 * 32]
    pmulhrsw    m1, m7
    packuswb    m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m4, m2, m0, 8
    palignr     m1, m3, m2, 8
    pmaddubsw   m4, [r3 + 9 * 32]           ; [25]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 + 9 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m0, 10
    palignr     m1, m3, m2, 10
    pmaddubsw   m5, m6, [r3 - 6 * 32]       ; [10]
    pmulhrsw    m5, m7
    pmaddubsw   m8, m1, [r3 - 6 * 32]
    pmulhrsw    m8, m7
    packuswb    m5, m8

    pmaddubsw   m6, [r3 + 11 * 32]          ; [27]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 11 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m9, m2, m0, 12
    palignr     m1, m3, m2, 12
    pmaddubsw   m8, m9, [r3 - 4 * 32]       ; [12]
    pmulhrsw    m8, m7
    pmaddubsw   m10, m1, [r3 - 4 * 32]
    pmulhrsw    m10, m7
    packuswb    m8, m10

    pmaddubsw   m9, [r3 + 13 * 32]          ; [29]
    pmulhrsw    m9, m7
    pmaddubsw   m1, [r3 + 13 * 32]
    pmulhrsw    m1, m7
    packuswb    m9, m1

    palignr     m11, m2, m0, 14
    palignr     m1, m3, m2, 14
    pmaddubsw   m10, m11, [r3 - 2 * 32]     ; [14]
    pmulhrsw    m10, m7
    pmaddubsw   m12, m1, [r3 - 2 * 32]
    pmulhrsw    m12, m7
    packuswb    m10, m12

    pmaddubsw   m11, [r3 + 15 * 32]         ; [31]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 + 15 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    pmaddubsw   m2, [r3]                    ; [16]
    pmulhrsw    m2, m7
    pmaddubsw   m3, [r3]
    pmulhrsw    m3, m7
    packuswb    m2, m3

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
    ret

cglobal ang32_mode_5_31_row_16_31
    test        r7d, r7d
    ; rows 0 to 7
    movu        m0, [r2 + 1]                ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]                ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu        m3, [r2 + 17]               ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4, [r2 + 18]               ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2, m0, m1                  ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m3, m4                      ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4, m0, [r3 - 15 * 32]      ; [1]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m2, [r3 - 15 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, m0, [r3 + 2 * 32]       ; [18]
    pmulhrsw    m5, m7
    pmaddubsw   m8, m2, [r3 + 2 * 32]
    pmulhrsw    m8, m7
    packuswb    m5, m8

    palignr     m8, m2, m0, 2
    palignr     m9, m3, m2, 2
    pmaddubsw   m6, m8, [r3 - 13 * 32]      ; [3]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m9, [r3 - 13 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    pmaddubsw   m8, [r3 + 4 * 32]           ; [20]
    pmulhrsw    m8, m7
    pmaddubsw   m9, [r3 + 4 * 32]
    pmulhrsw    m9, m7
    packuswb    m8, m9

    palignr     m10, m2, m0, 4
    palignr     m1, m3, m2, 4
    pmaddubsw   m9, m10, [r3 - 11 * 32]     ; [5]
    pmulhrsw    m9, m7
    pmaddubsw   m11, m1, [r3 - 11 * 32]
    pmulhrsw    m11, m7
    packuswb    m9, m11

    pmaddubsw   m10, [r3 + 6 * 32]          ; [22]
    pmulhrsw    m10, m7
    pmaddubsw   m1, [r3 + 6 * 32]
    pmulhrsw    m1, m7
    packuswb    m10, m1

    palignr     m12, m2, m0, 6
    palignr     m1, m3, m2, 6
    pmaddubsw   m11, m12, [r3 - 9 * 32]     ; [7]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 - 9 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    palignr     m1, m3, m2, 6
    pmaddubsw   m12, [r3 + 8 * 32]          ; [24]
    pmulhrsw    m12, m7
    pmaddubsw   m1, [r3 + 8 * 32]
    pmulhrsw    m1, m7
    packuswb    m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m5, m2, m0, 8
    palignr     m8, m3, m2, 8
    pmaddubsw   m4, m5, [r3 - 7 * 32]       ; [9]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m8, [r3 - 7 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, [r3 + 10 * 32]          ; [26]
    pmulhrsw    m5, m7
    pmaddubsw   m8, [r3 + 10 * 32]
    pmulhrsw    m8, m7
    packuswb    m5, m8

    palignr     m8, m2, m0, 10
    palignr     m9, m3, m2, 10
    pmaddubsw   m6, m8, [r3 - 5 * 32]       ; [11]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m9, [r3 - 5 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    pmaddubsw   m8, [r3 + 12 * 32]          ; [28]
    pmulhrsw    m8, m7
    pmaddubsw   m9, [r3 + 12 * 32]
    pmulhrsw    m9, m7
    packuswb    m8, m9

    palignr     m10, m2, m0, 12
    palignr     m11, m3, m2, 12
    pmaddubsw   m9, m10, [r3 - 3 * 32]      ; [13]
    pmulhrsw    m9, m7
    pmaddubsw   m1, m11, [r3 - 3 * 32]
    pmulhrsw    m1, m7
    packuswb    m9, m1

    pmaddubsw   m10, [r3 + 14 * 32]         ; [30]
    pmulhrsw    m10, m7
    pmaddubsw   m11, [r3 + 14 * 32]
    pmulhrsw    m11, m7
    packuswb    m10, m11

    palignr     m11, m2, m0, 14
    palignr     m1, m3, m2, 14
    pmaddubsw   m11, [r3 - 1 * 32]          ; [15]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 - 1 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    movu        m2, [r2 + 9]
    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
    ret

INIT_YMM avx2
cglobal intra_pred_ang32_5, 3,8,13
    add         r2, 64
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r1 * 4]                ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    mov         r4, r0
    xor         r7d, r7d

    call        ang32_mode_5_31_row_0_15

    add         r4, 16
    mov         r0, r4
    add         r2, 9

    call        ang32_mode_5_31_row_16_31
    RET

INIT_YMM avx2
cglobal intra_pred_ang32_31, 3,8,13
    lea         r3, [ang_table_avx2 + 32 * 16]
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r1 * 4]                ; r6 -> 4 * stride
    mova        m7, [pw_1024]
    xor         r7d, r7d
    inc         r7d

    call        ang32_mode_5_31_row_0_15

    add         r2, 9

    call        ang32_mode_5_31_row_16_31
    RET

cglobal ang32_mode_6_30_row_0_15
    test        r7d, r7d
    ; rows 0 to 7
    movu        m0, [r2 + 1]                ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]                ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu        m3, [r2 + 17]               ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4, [r2 + 18]               ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2, m0, m1                  ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m3, m4                      ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4, m0, [r3 - 3 * 32]       ; [13]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m2, [r3 - 3 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, m0, [r3 + 10 * 32]      ; [26]
    pmulhrsw    m5, m7
    pmaddubsw   m8, m2, [r3 + 10 * 32]
    pmulhrsw    m8, m7
    packuswb    m5, m8

    palignr     m8, m2, m0, 2
    palignr     m1, m3, m2, 2
    pmaddubsw   m6, m8, [r3 - 9 * 32]       ; [7]
    pmulhrsw    m6, m7
    pmaddubsw   m9, m1, [r3 - 9 * 32]
    pmulhrsw    m9, m7
    packuswb    m6, m9

    pmaddubsw   m8, [r3 + 4 * 32]           ; [20]
    pmulhrsw    m8, m7
    pmaddubsw   m1, [r3 + 4 * 32]
    pmulhrsw    m1, m7
    packuswb    m8, m1

    palignr     m11, m2, m0, 4
    palignr     m1, m3, m2, 4
    pmaddubsw   m9, m11, [r3 - 15 * 32]     ; [1]
    pmulhrsw    m9, m7
    pmaddubsw   m12, m1, [r3 - 15 * 32]
    pmulhrsw    m12, m7
    packuswb    m9, m12

    pmaddubsw   m10, m11, [r3 - 2 * 32]     ; [14]
    pmulhrsw    m10, m7
    pmaddubsw   m12, m1, [r3 - 2 * 32]
    pmulhrsw    m12, m7
    packuswb    m10, m12

    pmaddubsw   m11, [r3 + 11 * 32]         ; [27]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 + 11 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    palignr     m12, m2, m0, 6
    palignr     m1, m3, m2, 6
    pmaddubsw   m12, [r3 - 8 * 32]          ; [8]
    pmulhrsw    m12, m7
    pmaddubsw   m1, [r3 - 8 * 32]
    pmulhrsw    m1, m7
    packuswb    m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m4, m2, m0, 6
    palignr     m1, m3, m2, 6
    pmaddubsw   m4, [r3 + 5 * 32]           ; [21]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 + 5 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m8, m2, m0, 8
    palignr     m1, m3, m2, 8
    pmaddubsw   m5, m8, [r3 - 14 * 32]      ; [2]
    pmulhrsw    m5, m7
    pmaddubsw   m9, m1, [r3 - 14 * 32]
    pmulhrsw    m9, m7
    packuswb    m5, m9

    pmaddubsw   m6, m8, [r3 - 1 * 32]       ; [15]
    pmulhrsw    m6, m7
    pmaddubsw   m9, m1, [r3 - 1 * 32]
    pmulhrsw    m9, m7
    packuswb    m6, m9

    pmaddubsw   m8, [r3 + 12 * 32]          ; [28]
    pmulhrsw    m8, m7
    pmaddubsw   m1, [r3 + 12 * 32]
    pmulhrsw    m1, m7
    packuswb    m8, m1

    palignr     m10, m2, m0, 10
    palignr     m1, m3, m2, 10
    pmaddubsw   m9, m10, [r3 - 7 * 32]      ; [9]
    pmulhrsw    m9, m7
    pmaddubsw   m11, m1, [r3 - 7 * 32]
    pmulhrsw    m11, m7
    packuswb    m9, m11

    pmaddubsw   m10, [r3 + 6 * 32]          ; [22]
    pmulhrsw    m10, m7
    pmaddubsw   m1, m1, [r3 + 6 * 32]
    pmulhrsw    m1, m7
    packuswb    m10, m1

    palignr     m3, m2, 12
    palignr     m2, m0, 12
    pmaddubsw   m11, m2, [r3 - 13 * 32]     ; [3]
    pmulhrsw    m11, m7
    pmaddubsw   m1, m3, [r3 - 13 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    pmaddubsw   m2, [r3]                    ; [16]
    pmulhrsw    m2, m7
    pmaddubsw   m3, [r3]
    pmulhrsw    m3, m7
    packuswb    m2, m3

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
    ret

cglobal ang32_mode_6_30_row_16_31
    test        r7d, r7d
    ; rows 0 to 7
    movu        m0, [r2 + 1]                ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]                ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu        m3, [r2 + 17]               ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu        m4, [r2 + 18]               ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw   m2, m0, m1                  ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw   m3, m4                      ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw   m4, m0, [r3 + 13 * 32]      ; [29]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m2, [r3 + 13 * 32]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m0, 2
    palignr     m1, m3, m2, 2
    pmaddubsw   m5, m6, [r3 - 6 * 32]       ; [10]
    pmulhrsw    m5, m7
    pmaddubsw   m8, m1, [r3 - 6 * 32]
    pmulhrsw    m8, m7
    packuswb    m5, m8

    pmaddubsw   m6, [r3 + 7 * 32]           ; [23]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 7 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m10, m2, m0, 4
    palignr     m1, m3, m2, 4
    pmaddubsw   m8, m10, [r3 - 12 * 32]     ; [4]
    pmulhrsw    m8, m7
    pmaddubsw   m11, m1, [r3 - 12 * 32]
    pmulhrsw    m11, m7
    packuswb    m8, m11

    pmaddubsw   m9, m10, [r3 + 1 * 32]      ; [17]
    pmulhrsw    m9, m7
    pmaddubsw   m11, m1, [r3 + 1 * 32]
    pmulhrsw    m11, m7
    packuswb    m9, m11

    pmaddubsw   m10, [r3 + 14 * 32]         ; [30]
    pmulhrsw    m10, m7
    pmaddubsw   m1, [r3 + 14 * 32]
    pmulhrsw    m1, m7
    packuswb    m10, m1

    palignr     m12, m2, m0, 6
    palignr     m1, m3, m2, 6
    pmaddubsw   m11, m12, [r3 - 5 * 32]     ; [11]
    pmulhrsw    m11, m7
    pmaddubsw   m1, [r3 - 5 * 32]
    pmulhrsw    m1, m7
    packuswb    m11, m1

    palignr     m1, m3, m2, 6
    pmaddubsw   m12, [r3 + 8 * 32]          ; [24]
    pmulhrsw    m12, m7
    pmaddubsw   m1, [r3 + 8 * 32]
    pmulhrsw    m1, m7
    packuswb    m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr     m6, m2, m0, 8
    palignr     m1, m3, m2, 8
    pmaddubsw   m4, m6, [r3 - 11 * 32]      ; [5]
    pmulhrsw    m4, m7
    pmaddubsw   m8, m1, [r3 - 11 * 32]
    pmulhrsw    m8, m7
    packuswb    m4, m8

    pmaddubsw   m5, m6, [r3 + 2 * 32]       ; [18]
    pmulhrsw    m5, m7
    pmaddubsw   m9, m1, [r3 + 2 * 32]
    pmulhrsw    m9, m7
    packuswb    m5, m9

    pmaddubsw   m6, [r3 + 15 * 32]          ; [31]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 15 * 32]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m9, m2, m0, 10
    palignr     m1, m3, m2, 10
    pmaddubsw   m8, m9, [r3 - 4 * 32]       ; [12]
    pmulhrsw    m8, m7
    pmaddubsw   m10, m1, [r3 - 4 * 32]
    pmulhrsw    m10, m7
    packuswb    m8, m10

    pmaddubsw   m9, [r3 + 9 * 32]           ; [25]
    pmulhrsw    m9, m7
    pmaddubsw   m1, [r3 + 9 * 32]
    pmulhrsw    m1, m7
    packuswb    m9, m1

    palignr     m3, m2, 12
    palignr     m2, m0, 12
    pmaddubsw   m10, m2, [r3 - 10 * 32]     ; [6]
    pmulhrsw    m10, m7
    pmaddubsw   m1, m3, [r3 - 10 * 32]
    pmulhrsw    m1, m7
    packuswb    m10, m1

    pmaddubsw   m2, [r3 + 3 * 32]           ; [19]
    pmulhrsw    m2, m7
    pmaddubsw   m3, [r3 + 3 * 32]
    pmulhrsw    m3, m7
    packuswb    m2, m3

    movu        m3, [r2 + 8]                ; [0]

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 3, 0, 8
    ret
|
|
|
|
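;-----------------------------------------------------------------------------
; Drivers for modes 6 (horizontal) and 30 (vertical).  Both share the two
; row helpers above; r7d = 0 selects the transposed-store path for mode 6,
; r7d = 1 the direct-store path for mode 30.  Between the two 16-row halves
; r2 advances by 6 reference bytes, consistent with this mode pair's angle
; of +13/32 per row (16 * 13 / 32 = 6.5, truncated to 6).
;-----------------------------------------------------------------------------
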
INIT_YMM avx2
cglobal intra_pred_ang32_6, 3,8,13
    add r2, 64
    lea r3, [ang_table_avx2 + 32 * 16]
    lea r5, [r1 * 3] ; r5 -> 3 * stride
    lea r6, [r1 * 4] ; r6 -> 4 * stride
    mova m7, [pw_1024]
    mov r4, r0
    xor r7d, r7d

    call ang32_mode_6_30_row_0_15

    add r4, 16
    mov r0, r4
    add r2, 6

    call ang32_mode_6_30_row_16_31
    RET

INIT_YMM avx2
cglobal intra_pred_ang32_30, 3,8,13
    lea r3, [ang_table_avx2 + 32 * 16]
    lea r5, [r1 * 3] ; r5 -> 3 * stride
    lea r6, [r1 * 4] ; r6 -> 4 * stride
    mova m7, [pw_1024]
    xor r7d, r7d
    inc r7d

    call ang32_mode_6_30_row_0_15

    add r2, 6

    call ang32_mode_6_30_row_16_31
    RET

cglobal ang32_mode_7_29_row_0_15
    test r7d, r7d
    ; rows 0 to 7
    movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw m4, m0, [r3 - 7 * 32] ; [9]
    pmulhrsw m4, m7
    pmaddubsw m1, m2, [r3 - 7 * 32]
    pmulhrsw m1, m7
    packuswb m4, m1

    pmaddubsw m5, m0, [r3 + 2 * 32] ; [18]
    pmulhrsw m5, m7
    pmaddubsw m8, m2, [r3 + 2 * 32]
    pmulhrsw m8, m7
    packuswb m5, m8

    pmaddubsw m6, m0, [r3 + 11 * 32] ; [27]
    pmulhrsw m6, m7
    pmaddubsw m9, m2, [r3 + 11 * 32]
    pmulhrsw m9, m7
    packuswb m6, m9

    palignr m11, m2, m0, 2
    palignr m1, m3, m2, 2
    pmaddubsw m8, m11, [r3 - 12 * 32] ; [4]
    pmulhrsw m8, m7
    pmaddubsw m12, m1, [r3 - 12 * 32]
    pmulhrsw m12, m7
    packuswb m8, m12

    pmaddubsw m9, m11, [r3 - 3 * 32] ; [13]
    pmulhrsw m9, m7
    pmaddubsw m12, m1, [r3 - 3 * 32]
    pmulhrsw m12, m7
    packuswb m9, m12

    pmaddubsw m10, m11, [r3 + 6 * 32] ; [22]
    pmulhrsw m10, m7
    pmaddubsw m12, m1, [r3 + 6 * 32]
    pmulhrsw m12, m7
    packuswb m10, m12

    pmaddubsw m11, [r3 + 15 * 32] ; [31]
    pmulhrsw m11, m7
    pmaddubsw m1, [r3 + 15 * 32]
    pmulhrsw m1, m7
    packuswb m11, m1

    palignr m12, m2, m0, 4
    palignr m1, m3, m2, 4
    pmaddubsw m12, [r3 - 8 * 32] ; [8]
    pmulhrsw m12, m7
    pmaddubsw m1, [r3 - 8 * 32]
    pmulhrsw m1, m7
    packuswb m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr m5, m2, m0, 4
    palignr m1, m3, m2, 4
    pmaddubsw m4, m5, [r3 + 1 * 32] ; [17]
    pmulhrsw m4, m7
    pmaddubsw m8, m1, [r3 + 1 * 32]
    pmulhrsw m8, m7
    packuswb m4, m8

    pmaddubsw m5, [r3 + 10 * 32] ; [26]
    pmulhrsw m5, m7
    pmaddubsw m1, [r3 + 10 * 32]
    pmulhrsw m1, m7
    packuswb m5, m1

    palignr m10, m2, m0, 6
    palignr m1, m3, m2, 6
    pmaddubsw m6, m10, [r3 - 13 * 32] ; [3]
    pmulhrsw m6, m7
    pmaddubsw m9, m1, [r3 - 13 * 32]
    pmulhrsw m9, m7
    packuswb m6, m9

    pmaddubsw m8, m10, [r3 - 4 * 32] ; [12]
    pmulhrsw m8, m7
    pmaddubsw m11, m1, [r3 - 4 * 32]
    pmulhrsw m11, m7
    packuswb m8, m11

    pmaddubsw m9, m10, [r3 + 5 * 32] ; [21]
    pmulhrsw m9, m7
    pmaddubsw m11, m1, [r3 + 5 * 32]
    pmulhrsw m11, m7
    packuswb m9, m11

    pmaddubsw m10, [r3 + 14 * 32] ; [30]
    pmulhrsw m10, m7
    pmaddubsw m1, [r3 + 14 * 32]
    pmulhrsw m1, m7
    packuswb m10, m1

    palignr m3, m2, 8
    palignr m2, m0, 8
    pmaddubsw m11, m2, [r3 - 9 * 32] ; [7]
    pmulhrsw m11, m7
    pmaddubsw m1, m3, [r3 - 9 * 32]
    pmulhrsw m1, m7
    packuswb m11, m1

    pmaddubsw m2, [r3] ; [16]
    pmulhrsw m2, m7
    pmaddubsw m3, [r3]
    pmulhrsw m3, m7
    packuswb m2, m3

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
    ret

cglobal ang32_mode_7_29_row_16_31
    test r7d, r7d
    ; rows 0 to 7
    movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw m4, m0, [r3 + 9 * 32] ; [25]
    pmulhrsw m4, m7
    pmaddubsw m1, m2, [r3 + 9 * 32]
    pmulhrsw m1, m7
    packuswb m4, m1

    palignr m9, m2, m0, 2
    palignr m1, m3, m2, 2
    pmaddubsw m5, m9, [r3 - 14 * 32] ; [2]
    pmulhrsw m5, m7
    pmaddubsw m8, m1, [r3 - 14 * 32]
    pmulhrsw m8, m7
    packuswb m5, m8

    pmaddubsw m6, m9, [r3 - 5 * 32] ; [11]
    pmulhrsw m6, m7
    pmaddubsw m10, m1, [r3 - 5 * 32]
    pmulhrsw m10, m7
    packuswb m6, m10

    pmaddubsw m8, m9, [r3 + 4 * 32] ; [20]
    pmulhrsw m8, m7
    pmaddubsw m10, m1, [r3 + 4 * 32]
    pmulhrsw m10, m7
    packuswb m8, m10

    pmaddubsw m9, [r3 + 13 * 32] ; [29]
    pmulhrsw m9, m7
    pmaddubsw m1, [r3 + 13 * 32]
    pmulhrsw m1, m7
    packuswb m9, m1

    palignr m12, m2, m0, 4
    palignr m1, m3, m2, 4
    pmaddubsw m10, m12, [r3 - 10 * 32] ; [6]
    pmulhrsw m10, m7
    pmaddubsw m11, m1, [r3 - 10 * 32]
    pmulhrsw m11, m7
    packuswb m10, m11

    pmaddubsw m11, m12, [r3 - 1 * 32] ; [15]
    pmulhrsw m11, m7
    pmaddubsw m1, [r3 - 1 * 32]
    pmulhrsw m1, m7
    packuswb m11, m1

    palignr m1, m3, m2, 4
    pmaddubsw m12, [r3 + 8 * 32] ; [24]
    pmulhrsw m12, m7
    pmaddubsw m1, [r3 + 8 * 32]
    pmulhrsw m1, m7
    packuswb m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15
    palignr m8, m2, m0, 6
    palignr m1, m3, m2, 6
    pmaddubsw m4, m8, [r3 - 15 * 32] ; [1]
    pmulhrsw m4, m7
    pmaddubsw m9, m1, [r3 - 15 * 32]
    pmulhrsw m9, m7
    packuswb m4, m9

    pmaddubsw m5, m8, [r3 - 6 * 32] ; [10]
    pmulhrsw m5, m7
    pmaddubsw m9, m1, [r3 - 6 * 32]
    pmulhrsw m9, m7
    packuswb m5, m9

    pmaddubsw m6, m8, [r3 + 3 * 32] ; [19]
    pmulhrsw m6, m7
    pmaddubsw m9, m1, [r3 + 3 * 32]
    pmulhrsw m9, m7
    packuswb m6, m9

    pmaddubsw m8, [r3 + 12 * 32] ; [28]
    pmulhrsw m8, m7
    pmaddubsw m1, [r3 + 12 * 32]
    pmulhrsw m1, m7
    packuswb m8, m1

    palignr m3, m2, 8
    palignr m2, m0, 8
    pmaddubsw m9, m2, [r3 - 11 * 32] ; [5]
    pmulhrsw m9, m7
    pmaddubsw m1, m3, [r3 - 11 * 32]
    pmulhrsw m1, m7
    packuswb m9, m1

    pmaddubsw m10, m2, [r3 - 2 * 32] ; [14]
    pmulhrsw m10, m7
    pmaddubsw m1, m3, [r3 - 2 * 32]
    pmulhrsw m1, m7
    packuswb m10, m1

    pmaddubsw m2, [r3 + 7 * 32] ; [23]
    pmulhrsw m2, m7
    pmaddubsw m3, [r3 + 7 * 32]
    pmulhrsw m3, m7
    packuswb m2, m3

    movu m1, [r2 + 6] ; [0]

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 1, 0, 8
    ret

INIT_YMM avx2
cglobal intra_pred_ang32_7, 3,8,13
    add r2, 64
    lea r3, [ang_table_avx2 + 32 * 16]
    lea r5, [r1 * 3] ; r5 -> 3 * stride
    lea r6, [r1 * 4] ; r6 -> 4 * stride
    mova m7, [pw_1024]
    mov r4, r0
    xor r7d, r7d

    call ang32_mode_7_29_row_0_15

    add r4, 16
    mov r0, r4
    add r2, 4

    call ang32_mode_7_29_row_16_31
    RET

INIT_YMM avx2
cglobal intra_pred_ang32_29, 3,8,13
    lea r3, [ang_table_avx2 + 32 * 16]
    lea r5, [r1 * 3] ; r5 -> 3 * stride
    lea r6, [r1 * 4] ; r6 -> 4 * stride
    mova m7, [pw_1024]
    xor r7d, r7d
    inc r7d

    call ang32_mode_7_29_row_0_15

    add r2, 4

    call ang32_mode_7_29_row_16_31
    RET

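;-----------------------------------------------------------------------------
; Modes 8/28 use the shallow +5/32 angle, so the reference window advances
; only 5 bytes across all 32 rows and a single helper covers the whole
; block.  The "jnz .doNotAdjustBufferPtr" below branches on the ZF set by
; "test r7d, r7d" at entry (the intervening SIMD instructions leave EFLAGS
; untouched): the mode 8 transposed path bumps the destination pointer by
; half a register before rows 16 to 31, while mode 28 keeps storing rows
; linearly.
;-----------------------------------------------------------------------------
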
cglobal ang32_mode_8_28_avx2
    test r7d, r7d
    ; rows 0 to 7
    movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw m4, m0, [r3 - 11 * 32] ; [5]
    pmulhrsw m4, m7
    pmaddubsw m1, m2, [r3 - 11 * 32]
    pmulhrsw m1, m7
    packuswb m4, m1

    pmaddubsw m5, m0, [r3 - 6 * 32] ; [10]
    pmulhrsw m5, m7
    pmaddubsw m8, m2, [r3 - 6 * 32]
    pmulhrsw m8, m7
    packuswb m5, m8

    pmaddubsw m6, m0, [r3 - 1 * 32] ; [15]
    pmulhrsw m6, m7
    pmaddubsw m9, m2, [r3 - 1 * 32]
    pmulhrsw m9, m7
    packuswb m6, m9

    pmaddubsw m8, m0, [r3 + 4 * 32] ; [20]
    pmulhrsw m8, m7
    pmaddubsw m12, m2, [r3 + 4 * 32]
    pmulhrsw m12, m7
    packuswb m8, m12

    pmaddubsw m9, m0, [r3 + 9 * 32] ; [25]
    pmulhrsw m9, m7
    pmaddubsw m12, m2, [r3 + 9 * 32]
    pmulhrsw m12, m7
    packuswb m9, m12

    pmaddubsw m10, m0, [r3 + 14 * 32] ; [30]
    pmulhrsw m10, m7
    pmaddubsw m12, m2, [r3 + 14 * 32]
    pmulhrsw m12, m7
    packuswb m10, m12

    palignr m12, m2, m0, 2
    palignr m1, m3, m2, 2
    pmaddubsw m11, m12, [r3 - 13 * 32] ; [3]
    pmulhrsw m11, m7
    pmaddubsw m1, [r3 - 13 * 32]
    pmulhrsw m1, m7
    packuswb m11, m1

    palignr m1, m3, m2, 2
    pmaddubsw m12, [r3 - 8 * 32] ; [8]
    pmulhrsw m12, m7
    pmaddubsw m1, [r3 - 8 * 32]
    pmulhrsw m1, m7
    packuswb m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0

    ; rows 8 to 15

    palignr m8, m2, m0, 2
    palignr m1, m3, m2, 2
    pmaddubsw m4, m8, [r3 - 3 * 32] ; [13]
    pmulhrsw m4, m7
    pmaddubsw m9, m1, [r3 - 3 * 32]
    pmulhrsw m9, m7
    packuswb m4, m9

    pmaddubsw m5, m8, [r3 + 2 * 32] ; [18]
    pmulhrsw m5, m7
    pmaddubsw m9, m1, [r3 + 2 * 32]
    pmulhrsw m9, m7
    packuswb m5, m9

    pmaddubsw m6, m8, [r3 + 7 * 32] ; [23]
    pmulhrsw m6, m7
    pmaddubsw m9, m1, [r3 + 7 * 32]
    pmulhrsw m9, m7
    packuswb m6, m9

    pmaddubsw m8, [r3 + 12 * 32] ; [28]
    pmulhrsw m8, m7
    pmaddubsw m1, [r3 + 12 * 32]
    pmulhrsw m1, m7
    packuswb m8, m1

    palignr m12, m2, m0, 4
    palignr m1, m3, m2, 4
    pmaddubsw m9, m12, [r3 - 15 * 32] ; [1]
    pmulhrsw m9, m7
    pmaddubsw m11, m1, [r3 - 15 * 32]
    pmulhrsw m11, m7
    packuswb m9, m11

    pmaddubsw m10, m12, [r3 - 10 * 32] ; [6]
    pmulhrsw m10, m7
    pmaddubsw m11, m1, [r3 - 10 * 32]
    pmulhrsw m11, m7
    packuswb m10, m11

    pmaddubsw m11, m12, [r3 - 5 * 32] ; [11]
    pmulhrsw m11, m7
    pmaddubsw m1, [r3 - 5 * 32]
    pmulhrsw m1, m7
    packuswb m11, m1

    palignr m1, m3, m2, 4
    pmaddubsw m12, [r3] ; [16]
    pmulhrsw m12, m7
    pmaddubsw m1, [r3]
    pmulhrsw m1, m7
    packuswb m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8

    ; rows 16 to 23

    jnz .doNotAdjustBufferPtr
    lea r4, [r4 + mmsize/2]
    mov r0, r4
.doNotAdjustBufferPtr:

    palignr m6, m2, m0, 4
    palignr m1, m3, m2, 4
    pmaddubsw m4, m6, [r3 + 5 * 32] ; [21]
    pmulhrsw m4, m7
    pmaddubsw m8, m1, [r3 + 5 * 32]
    pmulhrsw m8, m7
    packuswb m4, m8

    pmaddubsw m5, m6, [r3 + 10 * 32] ; [26]
    pmulhrsw m5, m7
    pmaddubsw m8, m1, [r3 + 10 * 32]
    pmulhrsw m8, m7
    packuswb m5, m8

    pmaddubsw m6, [r3 + 15 * 32] ; [31]
    pmulhrsw m6, m7
    pmaddubsw m1, [r3 + 15 * 32]
    pmulhrsw m1, m7
    packuswb m6, m1

    palignr m12, m2, m0, 6
    palignr m1, m3, m2, 6
    pmaddubsw m8, m12, [r3 - 12 * 32] ; [4]
    pmulhrsw m8, m7
    pmaddubsw m11, m1, [r3 - 12 * 32]
    pmulhrsw m11, m7
    packuswb m8, m11

    pmaddubsw m9, m12, [r3 - 7 * 32] ; [9]
    pmulhrsw m9, m7
    pmaddubsw m11, m1, [r3 - 7 * 32]
    pmulhrsw m11, m7
    packuswb m9, m11

    pmaddubsw m10, m12, [r3 - 2 * 32] ; [14]
    pmulhrsw m10, m7
    pmaddubsw m11, m1, [r3 - 2 * 32]
    pmulhrsw m11, m7
    packuswb m10, m11

    pmaddubsw m11, m12, [r3 + 3 * 32] ; [19]
    pmulhrsw m11, m7
    pmaddubsw m1, [r3 + 3 * 32]
    pmulhrsw m1, m7
    packuswb m11, m1

    palignr m1, m3, m2, 6
    pmaddubsw m12, [r3 + 8 * 32] ; [24]
    pmulhrsw m12, m7
    pmaddubsw m1, [r3 + 8 * 32]
    pmulhrsw m1, m7
    packuswb m12, m1

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 16

    ; rows 24 to 31
    palignr m4, m2, m0, 6
    palignr m1, m3, m2, 6
    pmaddubsw m4, [r3 + 13 * 32] ; [29]
    pmulhrsw m4, m7
    pmaddubsw m1, [r3 + 13 * 32]
    pmulhrsw m1, m7
    packuswb m4, m1

    palignr m3, m2, 8
    palignr m2, m0, 8
    pmaddubsw m5, m2, [r3 - 14 * 32] ; [2]
    pmulhrsw m5, m7
    pmaddubsw m9, m3, [r3 - 14 * 32]
    pmulhrsw m9, m7
    packuswb m5, m9

    pmaddubsw m6, m2, [r3 - 9 * 32] ; [7]
    pmulhrsw m6, m7
    pmaddubsw m9, m3, [r3 - 9 * 32]
    pmulhrsw m9, m7
    packuswb m6, m9

    pmaddubsw m8, m2, [r3 - 4 * 32] ; [12]
    pmulhrsw m8, m7
    pmaddubsw m1, m3, [r3 - 4 * 32]
    pmulhrsw m1, m7
    packuswb m8, m1

    pmaddubsw m9, m2, [r3 + 1 * 32] ; [17]
    pmulhrsw m9, m7
    pmaddubsw m11, m3, [r3 + 1 * 32]
    pmulhrsw m11, m7
    packuswb m9, m11

    pmaddubsw m10, m2, [r3 + 6 * 32] ; [22]
    pmulhrsw m10, m7
    pmaddubsw m1, m3, [r3 + 6 * 32]
    pmulhrsw m1, m7
    packuswb m10, m1

    pmaddubsw m2, [r3 + 11 * 32] ; [27]
    pmulhrsw m2, m7
    pmaddubsw m3, [r3 + 11 * 32]
    pmulhrsw m3, m7
    packuswb m2, m3

    movu m3, [r2 + 6] ; [0]

    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 3, 0, 24
    ret

INIT_YMM avx2
cglobal intra_pred_ang32_8, 3,8,13
    add r2, 64
    lea r3, [ang_table_avx2 + 32 * 16]
    lea r5, [r1 * 3] ; r5 -> 3 * stride
    lea r6, [r1 * 4] ; r6 -> 4 * stride
    mova m7, [pw_1024]
    mov r4, r0
    xor r7d, r7d

    call ang32_mode_8_28_avx2
    RET

INIT_YMM avx2
cglobal intra_pred_ang32_28, 3,8,13
    lea r3, [ang_table_avx2 + 32 * 16]
    lea r5, [r1 * 3] ; r5 -> 3 * stride
    lea r6, [r1 * 4] ; r6 -> 4 * stride
    mova m7, [pw_1024]
    xor r7d, r7d
    inc r7d

    call ang32_mode_8_28_avx2
    RET

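;-----------------------------------------------------------------------------
; Mode 9 (+2/32 per row) is produced without a transpose: ang32_shuf_mode9
; pairs adjacent reference bytes, the two halves of angHor_tab_9 hold what
; are presumably the per-column (32-frac, frac) weights, and the reference
; window simply slides one byte per output row via palignr.
;-----------------------------------------------------------------------------
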
INIT_YMM avx2
cglobal intra_pred_ang32_9, 3,5,8
    vbroadcasti128 m0, [angHor_tab_9]
    vbroadcasti128 m1, [angHor_tab_9 + mmsize/2]
    mova m2, [pw_1024]
    mova m7, [ang32_shuf_mode9]
    lea r3, [r1 * 3]

    vbroadcasti128 m3, [r2 + mmsize*2 + 1]
    vbroadcasti128 m6, [r2 + mmsize*2 + 17]

    pshufb m5, m3, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m5, m6, m3, 1
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m5, m6, m3, 2
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1*2], m4

    palignr m5, m6, m3, 3
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    palignr m5, m6, m3, 4
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m5, m6, m3, 5
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m5, m6, m3, 6
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1*2], m4

    palignr m5, m6, m3, 7
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    palignr m5, m6, m3, 8
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m5, m6, m3, 9
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m5, m6, m3, 10
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1*2], m4

    palignr m5, m6, m3, 11
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    palignr m5, m6, m3, 12
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m5, m6, m3, 13
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m5, m6, m3, 14
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1*2], m4

    palignr m5, m6, m3, 15
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    vbroadcasti128 m3, [r2 + mmsize*2 + 33]

    pshufb m5, m6, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m5, m3, m6, 1
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m5, m3, m6, 2
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1*2], m4

    palignr m5, m3, m6, 3
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    palignr m5, m3, m6, 4
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m5, m3, m6, 5
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m5, m3, m6, 6
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1*2], m4

    palignr m5, m3, m6, 7
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    palignr m5, m3, m6, 8
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m5, m3, m6, 9
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m5, m3, m6, 10
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1*2], m4

    palignr m5, m3, m6, 11
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    palignr m5, m3, m6, 12
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m5, m3, m6, 13
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m5, m3, m6, 14
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1*2], m4

    palignr m5, m3, m6, 15
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4
    RET

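;-----------------------------------------------------------------------------
; Mode 27 is the near-vertical +2/32 angle: every row reuses the same pixel
; window, only the weight row changes ([2], [4], ... [30]), and the rows
; whose fraction wraps to 0 (rows 15 and 31) degenerate to plain copies of
; the reference shifted by one ("movu m1, [r2 + 2]" / "[r2 + 3]").
;-----------------------------------------------------------------------------
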
cglobal intra_pred_ang32_27, 3,5,6
    lea r3, [ang_table_avx2 + 32 * 16]
    lea r4, [r1 * 3] ; r4 -> 3 * stride
    mova m5, [pw_1024]

    ; rows 0 to 7
    movu m0, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu m1, [r2 + 2] ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    movu m3, [r2 + 17] ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
    movu m4, [r2 + 18] ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]

    punpckhbw m2, m0, m1 ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw m0, m1 ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    punpcklbw m3, m4 ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]

    pmaddubsw m4, m0, [r3 - 14 * 32] ; [2]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 14 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4

    pmaddubsw m4, m0, [r3 - 12 * 32] ; [4]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 12 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4

    pmaddubsw m4, m0, [r3 - 10 * 32] ; [6]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 10 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4

    pmaddubsw m4, m0, [r3 - 8 * 32] ; [8]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 8 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4

    lea r0, [r0 + r1 * 4]

    pmaddubsw m4, m0, [r3 - 6 * 32] ; [10]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 6 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4

    pmaddubsw m4, m0, [r3 - 4 * 32] ; [12]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 4 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4

    pmaddubsw m4, m0, [r3 - 2 * 32] ; [14]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 2 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4

    pmaddubsw m4, m0, [r3] ; [16]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4

    lea r0, [r0 + r1 * 4]

    ; rows 8 to 15
    pmaddubsw m4, m0, [r3 + 2 * 32] ; [18]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 2 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4

    pmaddubsw m4, m0, [r3 + 4 * 32] ; [20]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 4 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4

    pmaddubsw m4, m0, [r3 + 6 * 32] ; [22]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 6 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4

    pmaddubsw m4, m0, [r3 + 8 * 32] ; [24]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 8 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4

    lea r0, [r0 + r1 * 4]

    pmaddubsw m4, m0, [r3 + 10 * 32] ; [26]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 10 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4

    pmaddubsw m4, m0, [r3 + 12 * 32] ; [28]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 12 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4

    pmaddubsw m4, m0, [r3 + 14 * 32] ; [30]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 14 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4

    palignr m3, m2, 2
    palignr m2, m0, 2
    movu m1, [r2 + 2] ; [0]
    movu [r0 + r4], m1

    lea r0, [r0 + r1 * 4]

    ; rows 16 to 23
    pmaddubsw m4, m2, [r3 - 14 * 32] ; [2]
    pmulhrsw m4, m5
    pmaddubsw m1, m3, [r3 - 14 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4

    pmaddubsw m4, m2, [r3 - 12 * 32] ; [4]
    pmulhrsw m4, m5
    pmaddubsw m1, m3, [r3 - 12 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4

    pmaddubsw m4, m2, [r3 - 10 * 32] ; [6]
    pmulhrsw m4, m5
    pmaddubsw m1, m3, [r3 - 10 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4

    pmaddubsw m4, m2, [r3 - 8 * 32] ; [8]
    pmulhrsw m4, m5
    pmaddubsw m1, m3, [r3 - 8 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4

    lea r0, [r0 + r1 * 4]

    pmaddubsw m4, m2, [r3 - 6 * 32] ; [10]
    pmulhrsw m4, m5
    pmaddubsw m1, m3, [r3 - 6 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4

    pmaddubsw m4, m2, [r3 - 4 * 32] ; [12]
    pmulhrsw m4, m5
    pmaddubsw m1, m3, [r3 - 4 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4

    pmaddubsw m4, m2, [r3 - 2 * 32] ; [14]
    pmulhrsw m4, m5
    pmaddubsw m1, m3, [r3 - 2 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4

    pmaddubsw m4, m2, [r3] ; [16]
    pmulhrsw m4, m5
    pmaddubsw m1, m3, [r3]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4

    lea r0, [r0 + r1 * 4]

    ; rows 24 to 31
    pmaddubsw m4, m2, [r3 + 2 * 32] ; [18]
    pmulhrsw m4, m5
    pmaddubsw m1, m3, [r3 + 2 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4

    pmaddubsw m4, m2, [r3 + 4 * 32] ; [20]
    pmulhrsw m4, m5
    pmaddubsw m1, m3, [r3 + 4 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4

    pmaddubsw m4, m2, [r3 + 6 * 32] ; [22]
    pmulhrsw m4, m5
    pmaddubsw m1, m3, [r3 + 6 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4

    pmaddubsw m4, m2, [r3 + 8 * 32] ; [24]
    pmulhrsw m4, m5
    pmaddubsw m1, m3, [r3 + 8 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4

    lea r0, [r0 + r1 * 4]

    pmaddubsw m4, m2, [r3 + 10 * 32] ; [26]
    pmulhrsw m4, m5
    pmaddubsw m1, m3, [r3 + 10 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4

    pmaddubsw m4, m2, [r3 + 12 * 32] ; [28]
    pmulhrsw m4, m5
    pmaddubsw m1, m3, [r3 + 12 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4

    pmaddubsw m2, [r3 + 14 * 32] ; [30]
    pmulhrsw m2, m5
    pmaddubsw m3, [r3 + 14 * 32]
    pmulhrsw m3, m5
    packuswb m2, m3
    movu [r0 + r1*2], m2

    movu m1, [r2 + 3] ; [0]
    movu [r0 + r4], m1
    RET

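;-----------------------------------------------------------------------------
; Mode 10 is exactly horizontal: row r of the block is reference byte r
; broadcast across all 32 columns.  m0 holds the byte index, incremented by
; pb_1 each row, and pshufb replicates that byte of the (vbroadcasti128'd)
; reference into every lane.
;-----------------------------------------------------------------------------
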
cglobal intra_pred_ang32_10, 5,5,4
    pxor m0, m0
    mova m1, [pb_1]
    lea r4, [r1 * 3]

    vbroadcasti128 m2, [r2 + mmsize*2 + 1]

    pshufb m3, m2, m0
    movu [r0], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r1], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r1 * 2], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r4], m3

    lea r0, [r0 + r1 * 4]

    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r1], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r1 * 2], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r4], m3

    lea r0, [r0 + r1 * 4]

    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r1], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r1 * 2], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r4], m3

    lea r0, [r0 + r1 * 4]

    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r1], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r1 * 2], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r4], m3

    lea r0, [r0 + r1 * 4]
    pxor m0, m0
    vbroadcasti128 m2, [r2 + mmsize*2 + mmsize/2 + 1]

    pshufb m3, m2, m0
    movu [r0], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r1], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r1 * 2], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r4], m3

    lea r0, [r0 + r1 * 4]

    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r1], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r1 * 2], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r4], m3

    lea r0, [r0 + r1 * 4]

    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r1], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r1 * 2], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r4], m3

    lea r0, [r0 + r1 * 4]

    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r1], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r1 * 2], m3
    paddb m0, m1
    pshufb m3, m2, m0
    movu [r0 + r4], m3
    RET

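;-----------------------------------------------------------------------------
; Mode 11 mirrors mode 9 with a -2/32 angle: the index runs backwards, so
; the window is pre-built with pinsrb to splice the projected samples in
; front of the row buffer (the "[16 0 -1 -2 ...]" layout noted below), after
; which the same one-byte-per-row palignr scheme applies.
;-----------------------------------------------------------------------------
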
cglobal intra_pred_ang32_11, 3,4,8
    vbroadcasti128 m0, [angHor_tab_11]
    vbroadcasti128 m1, [angHor_tab_11 + mmsize/2]
    mova m2, [pw_1024]
    mova m7, [ang32_shuf_mode11]
    lea r3, [r1 * 3]

    ; prepare for [16 0 -1 -2 ...]
    movu xm3, [r2 + mmsize*2 - 1]
    vbroadcasti128 m6, [r2 + mmsize*2 + 15]

    pinsrb xm3, [r2 + 0], 1
    pinsrb xm3, [r2 + 16], 0
    vinserti128 m3, m3, xm3, 1 ; [16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14]

    pshufb m5, m3, m7 ; [ 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 16 0 16 0 16 0 16 0 16 0 16 0 16 0 16 0]
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m5, m6, m3, 1
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m5, m6, m3, 2
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4

    palignr m5, m6, m3, 3
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    palignr m5, m6, m3, 4
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m5, m6, m3, 5
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m5, m6, m3, 6
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4

    palignr m5, m6, m3, 7
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    palignr m5, m6, m3, 8
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m5, m6, m3, 9
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m5, m6, m3, 10
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4

    palignr m5, m6, m3, 11
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    palignr m5, m6, m3, 12
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m5, m6, m3, 13
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m5, m6, m3, 14
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4

    palignr m5, m6, m3, 15
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    mova m3, m6
    vbroadcasti128 m6, [r2 + mmsize*2 + 15 + 16]
    pshufb m5, m3, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m5, m6, m3, 1
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m5, m6, m3, 2
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4

    palignr m5, m6, m3, 3
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    palignr m5, m6, m3, 4
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m5, m6, m3, 5
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m5, m6, m3, 6
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4

    palignr m5, m6, m3, 7
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    palignr m5, m6, m3, 8
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m5, m6, m3, 9
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m5, m6, m3, 10
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4

    palignr m5, m6, m3, 11
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    palignr m5, m6, m3, 12
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m5, m6, m3, 13
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m5, m6, m3, 14
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4

    palignr m5, m6, m3, 15
    pshufb m5, m7
    pmaddubsw m4, m5, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4
    RET

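;-----------------------------------------------------------------------------
; Mode 25 is the vertical -2/32 angle: the first 15 rows reuse the unshifted
; reference with descending weights ([30], [28], ...), row 15 is a pure
; copy, and the second half needs one sample spliced in from the other
; reference array ([r2 + mmsize*2 + 16]) plus the corner pixel ([r2]) via
; xm3 before the palignr by 14 re-bases the window.
;-----------------------------------------------------------------------------
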
cglobal intra_pred_ang32_25, 3,5,7
    lea r3, [ang_table_avx2 + 32 * 16]
    lea r4, [r1 * 3]
    mova m5, [pw_1024]

    ; rows 0 to 7
    movu m0, [r2 + 0] ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    movu m1, [r2 + 1] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]

    pinsrb xm3, [r2], 15
    pinsrb xm3, [r2 + mmsize*2 + 16], 14

    punpckhbw m2, m0, m1 ; [32 31 31 30 30 29 29 28 28 27 27 26 26 25 25 24 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
    punpcklbw m0, m1 ; [24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
    vinserti128 m3, m3, xm2, 1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 0 16 x x x x x x x x x x x x x x]

    pmaddubsw m4, m0, [r3 + 14 * 32] ; [30]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 14 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4

    pmaddubsw m4, m0, [r3 + 12 * 32] ; [28]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 12 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4

    pmaddubsw m4, m0, [r3 + 10 * 32] ; [26]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 10 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4

    pmaddubsw m4, m0, [r3 + 8 * 32] ; [24]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 8 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4

    lea r0, [r0 + r1 * 4]

    pmaddubsw m4, m0, [r3 + 6 * 32] ; [22]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 6 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4

    pmaddubsw m4, m0, [r3 + 4 * 32] ; [20]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 4 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4

    pmaddubsw m4, m0, [r3 + 2 * 32] ; [18]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 2 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4

    pmaddubsw m4, m0, [r3] ; [16]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4

    lea r0, [r0 + r1 * 4]

    ; rows 8 to 15
    pmaddubsw m4, m0, [r3 - 2 * 32] ; [14]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 2 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4

    pmaddubsw m4, m0, [r3 - 4 * 32] ; [12]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 4 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4

    pmaddubsw m4, m0, [r3 - 6 * 32] ; [10]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 6 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4

    pmaddubsw m4, m0, [r3 - 8 * 32] ; [8]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 8 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4

    lea r0, [r0 + r1 * 4]

    pmaddubsw m4, m0, [r3 - 10 * 32] ; [6]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 10 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4

    pmaddubsw m4, m0, [r3 - 12 * 32] ; [4]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 12 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4

    pmaddubsw m4, m0, [r3 - 14 * 32] ; [2]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 14 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1 * 2], m4

    movu m1, [r2] ; [0]
    movu [r0 + r4], m1

    lea r0, [r0 + r1 * 4]
    palignr m2, m0, 14
    palignr m0, m3, 14

    ; rows 16 to 23
    pmaddubsw m4, m0, [r3 + 14 * 32] ; [30]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 14 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4

    pmaddubsw m4, m0, [r3 + 12 * 32] ; [28]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 12 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4

    pmaddubsw m4, m0, [r3 + 10 * 32] ; [26]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 10 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4

    pmaddubsw m4, m0, [r3 + 8 * 32] ; [24]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 8 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4

    lea r0, [r0 + r1 * 4]

    pmaddubsw m4, m0, [r3 + 6 * 32] ; [22]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 6 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4

    pmaddubsw m4, m0, [r3 + 4 * 32] ; [20]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 4 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4

    pmaddubsw m4, m0, [r3 + 2 * 32] ; [18]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 + 2 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1*2], m4

    pmaddubsw m4, m0, [r3] ; [16]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4

    lea r0, [r0 + r1 * 4]

    ; rows 24 to 31
    pmaddubsw m4, m0, [r3 - 2 * 32] ; [14]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 2 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4

    pmaddubsw m4, m0, [r3 - 4 * 32] ; [12]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 4 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4

    pmaddubsw m4, m0, [r3 - 6 * 32] ; [10]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 6 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1 * 2], m4

    pmaddubsw m4, m0, [r3 - 8 * 32] ; [8]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 8 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r4], m4

    lea r0, [r0 + r1 * 4]

    pmaddubsw m4, m0, [r3 - 10 * 32] ; [6]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 10 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0], m4

    pmaddubsw m4, m0, [r3 - 12 * 32] ; [4]
    pmulhrsw m4, m5
    pmaddubsw m1, m2, [r3 - 12 * 32]
    pmulhrsw m1, m5
    packuswb m4, m1
    movu [r0 + r1], m4

    pmaddubsw m0, [r3 - 14 * 32] ; [2]
    pmulhrsw m0, m5
    pmaddubsw m2, [r3 - 14 * 32]
    pmulhrsw m2, m5
    packuswb m0, m2
    movu [r0 + r1*2], m0

    movu m1, [r2 + 1] ; [0]
    palignr m1, m3, 14
    movu [r0 + r4], m1
    RET

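;-----------------------------------------------------------------------------
; Mode 12 uses the -5/32 angle, so a few samples must be projected from the
; second reference array onto the main one.  The pinsrb sequence below
; builds the "[26, 19, 13, 6, 0, -1, -2 ...]" prefix; 6/13/19/26 match the
; HEVC inverse-angle projections for this angle ((k * 1638 + 128) >> 8 for
; k = 1..4).
;-----------------------------------------------------------------------------
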
cglobal intra_pred_ang32_12, 3,4,9
    movu m0, [ang32_fact_mode12]
    movu m1, [ang32_fact_mode12 + mmsize]
    mova m2, [pw_1024]
    mova m7, [ang32_shuf_mode12]
    mova m8, [ang32_shuf_mode12 + mmsize]
    lea r3, [r1 * 3]

    ; prepare for [26, 19, 13, 6, 0, -1, -2....]

    movu xm4, [r2 + mmsize*2 - 4]
    vbroadcasti128 m6, [r2 + mmsize*2 + 12]

    pinsrb xm4, [r2 + 0], 4
    pinsrb xm4, [r2 + 6], 3
    pinsrb xm4, [r2 + 13], 2
    pinsrb xm4, [r2 + 19], 1
    pinsrb xm4, [r2 + 26], 0
    vinserti128 m3, m4, xm4, 1 ; [26, 19, 13, 6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 26, 19, 13, 6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

    pshufb m4, m3, m7 ; [ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 6, 0, 6, 0, 13, 6, 13, 6, 13, 6, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13]
    pshufb m5, m3, m8 ; [ 6, 0, 6, 0, 6, 0, 6, 0, 13, 6, 13, 6, 13, 6, 13, 6, 19, 13, 26, 19, 26, 19, 26, 19, 26, 19, 26, 19, 26, 19, 26, 19]
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m4, m6, m3, 1
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m4, m6, m3, 2
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4

    palignr m4, m6, m3, 3
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    palignr m4, m6, m3, 4
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m4, m6, m3, 5
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m4, m6, m3, 6
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4

    palignr m4, m6, m3, 7
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    palignr m4, m6, m3, 8
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m4, m6, m3, 9
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m4, m6, m3, 10
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4

    palignr m4, m6, m3, 11
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    palignr m4, m6, m3, 12
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m4, m6, m3, 13
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m4, m6, m3, 14
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4

    palignr m4, m6, m3, 15
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]
    mova m3, m6
    vbroadcasti128 m6, [r2 + mmsize*2 + 12 + 16]

    pshufb m4, m3, m7
    pshufb m5, m3, m8
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m4, m6, m3, 1
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m4, m6, m3, 2
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4

    palignr m4, m6, m3, 3
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    palignr m4, m6, m3, 4
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m4, m6, m3, 5
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m4, m6, m3, 6
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4

    palignr m4, m6, m3, 7
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    palignr m4, m6, m3, 8
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m4, m6, m3, 9
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m4, m6, m3, 10
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4

    palignr m4, m6, m3, 11
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4

    lea r0, [r0 + r1 * 4]

    palignr m4, m6, m3, 12
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0], m4

    palignr m4, m6, m3, 13
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1], m4

    palignr m4, m6, m3, 14
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r1 * 2], m4

    palignr m4, m6, m3, 15
    pshufb m5, m4, m8
    pshufb m4, m7
    pmaddubsw m4, m0
    pmaddubsw m5, m1
    pmulhrsw m4, m2
    pmulhrsw m5, m2
    packuswb m4, m5
    movu [r0 + r3], m4
    RET

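;-----------------------------------------------------------------------------
; Mode 24 is the vertical twin of mode 12: the same four projected samples
; (offsets 6/13/19/26) are gathered with pshufb + vpermd into m3 ahead of
; the main reference, and the final row, whose fraction is 0, is extracted
; with pand/packuswb against pw_00ff, keeping the low (integer-sample) byte
; of every interleaved pair.
;-----------------------------------------------------------------------------
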
cglobal intra_pred_ang32_24, 3,5,8
|
|
lea r3, [ang_table_avx2 + 32 * 16]
|
|
lea r4, [r1 * 3]
|
|
mova m5, [pw_1024]
|
|
|
|
; rows 0 to 7
|
|
movu m0, [r2 + 0]
|
|
movu m1, [r2 + 1]
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
|
|
movu m4, [r2 + mmsize*2]
|
|
pshufb m4, [ang32_shuf_mode24]
|
|
mova m3, [ang32_shuf_mode24 + mmsize]
|
|
vpermd m4, m3, m4 ; [6 6 13 13 19 19 26 26 x x x...]
|
|
palignr m3, m0, m4, 1
|
|
vinserti128 m3, m3, xm2, 1
|
|
|
|
pmaddubsw m4, m0, [r3 + 11 * 32] ; [27]
|
|
pmulhrsw m4, m5
|
|
pmaddubsw m1, m2, [r3 + 11 * 32]
|
|
pmulhrsw m1, m5
|
|
packuswb m4, m1
|
|
movu [r0], m4
|
|
|
|
pmaddubsw m4, m0, [r3 + 6 * 32] ; [22]
|
|
pmulhrsw m4, m5
|
|
pmaddubsw m1, m2, [r3 + 6 * 32]
|
|
pmulhrsw m1, m5
|
|
packuswb m4, m1
|
|
movu [r0 + r1], m4
|
|
|
|
pmaddubsw m4, m0, [r3 + 1 * 32] ; [17]
|
|
pmulhrsw m4, m5
|
|
pmaddubsw m1, m2, [r3 + 1 * 32]
|
|
pmulhrsw m1, m5
|
|
packuswb m4, m1
|
|
movu [r0 + r1*2], m4
|
|
|
|
pmaddubsw m4, m0, [r3 - 4 * 32] ; [12]
|
|
pmulhrsw m4, m5
|
|
pmaddubsw m1, m2, [r3 - 4 * 32]
|
|
pmulhrsw m1, m5
|
|
packuswb m4, m1
|
|
movu [r0 + r4], m4
|
|
|
|
lea r0, [r0 + r1 * 4]
|
|
|
|
pmaddubsw m4, m0, [r3 - 9 * 32] ; [7]
|
|
pmulhrsw m4, m5
|
|
pmaddubsw m1, m2, [r3 - 9 * 32]
|
|
pmulhrsw m1, m5
|
|
packuswb m4, m1
|
|
movu [r0], m4
|
|
|
|
pmaddubsw m4, m0, [r3 - 14 * 32] ; [2]
|
|
pmulhrsw m4, m5
|
|
pmaddubsw m1, m2, [r3 - 14 * 32]
|
|
pmulhrsw m1, m5
|
|
packuswb m4, m1
|
|
movu [r0 + r1], m4
|
|
|
|
palignr m6, m0, m3, 14
|
|
palignr m7, m2, m0, 14
|
|
|
|
    pmaddubsw       m4, m6, [r3 + 13 * 32]  ; [29]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 13 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pmaddubsw       m4, m6, [r3 + 8 * 32]  ; [24]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 8 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    ; rows 8 to 15
    pmaddubsw       m4, m6, [r3 + 3 * 32]  ; [19]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 3 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m6, [r3 - 2 * 32]  ; [14]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 2 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m6, [r3 - 7 * 32]  ; [9]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 7 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pmaddubsw       m4, m6, [r3 - 12 * 32]  ; [4]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 12 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m6, m0, m3, 12
    palignr         m7, m2, m0, 12

    pmaddubsw       m4, m6, [r3 + 15 * 32]  ; [31]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 15 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m6, [r3 + 10 * 32]  ; [26]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 10 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m6, [r3 + 5 * 32]  ; [21]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 5 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pmaddubsw       m4, m6, [r3]  ; [16]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    ; rows 16 to 23
    pmaddubsw       m4, m6, [r3 - 5 * 32]  ; [11]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 5 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m6, [r3 - 10 * 32]  ; [6]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 10 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m6, [r3 - 15 * 32]  ; [1]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 15 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    palignr         m6, m0, m3, 10
    palignr         m7, m2, m0, 10

    pmaddubsw       m4, m6, [r3 + 12 * 32]  ; [28]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 12 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    pmaddubsw       m4, m6, [r3 + 7 * 32]  ; [23]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 7 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m6, [r3 + 2 * 32]  ; [18]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 2 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m6, [r3 - 3 * 32]  ; [13]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 3 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pmaddubsw       m4, m6, [r3 - 8 * 32]  ; [8]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 8 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    ; rows 24 to 31
    pmaddubsw       m4, m6, [r3 - 13 * 32]  ; [3]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 13 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    palignr         m6, m0, m3, 8
    palignr         m7, m2, m0, 8

    pmaddubsw       m4, m6, [r3 + 14 * 32]  ; [30]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 14 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m6, [r3 + 9 * 32]  ; [25]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 9 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pmaddubsw       m4, m6, [r3 + 4 * 32]  ; [20]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 4 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    pmaddubsw       m4, m6, [r3 - 1 * 32]  ; [15]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 1 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m6, [r3 - 6 * 32]  ; [10]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 6 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m6, [r3 - 11 * 32]  ; [5]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 11 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pand            m6, [pw_00ff]
    pand            m7, [pw_00ff]
    packuswb        m6, m7
    movu            [r0 + r4], m6
    RET

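;-----------------------------------------------------------------------------
; Annotation: mode 13 predicts at a negative angle, so the reference array is
; first extended with samples projected from the other edge; the pshufb/vpermd
; prologue below appears to build that extension, and the "prepare for [...]"
; list gives the projected sample positions. Every output row then slides a
; one-byte-step window over the references (palignr), expands it into
; (p[i], p[i+1]) pairs (pshufb with ang32_shuf_mode13), and applies the same
; pmaddubsw / pmulhrsw(pw_1024) / packuswb interpolation as above.
;-----------------------------------------------------------------------------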
cglobal intra_pred_ang32_13, 3,4,9
    movu            m0, [ang32_fact_mode13]
    movu            m1, [ang32_fact_mode13 + mmsize]
    mova            m2, [pw_1024]
    mova            m7, [ang32_shuf_mode13]
    mova            m8, [ang32_shuf_mode13 + mmsize]
    lea             r3, [r1 * 3]

    ; prepare for [28, 25, 21, 18, 14, 11, 7, 4, 0, -1, -2....]

    movu            m6, [r2]
    pshufb          m6, [ang32_shuf_mode13 + mmsize*2]
    mova            m3, [ang32_shuf_mode24 + mmsize*1]
    vpermd          m6, m3, m6
    palignr         m6, m6, 1
    vbroadcasti128  m3, [r2 + mmsize*2 + 1]

    palignr         m5, m3, m6, 1
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m5, m3, m6, 2
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m5, m3, m6, 3
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m5, m3, m6, 4
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m3, m6, 5
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m5, m3, m6, 6
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m5, m3, m6, 7
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m5, m3, m6, 8
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m3, m6, 9
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m5, m3, m6, 10
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m5, m3, m6, 11
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m5, m3, m6, 12
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m3, m6, 13
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m5, m3, m6, 14
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m5, m3, m6, 15
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    pshufb          m4, m3, m7
    pshufb          m5, m3, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    mova            m6, m3
    vbroadcasti128  m3, [r2 + mmsize*2 + 17]
    palignr         m5, m3, m6, 1
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m5, m3, m6, 2
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m5, m3, m6, 3
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m5, m3, m6, 4
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m3, m6, 5
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m5, m3, m6, 6
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m5, m3, m6, 7
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m5, m3, m6, 8
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m3, m6, 9
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m5, m3, m6, 10
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m5, m3, m6, 11
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m5, m3, m6, 12
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m3, m6, 13
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m5, m3, m6, 14
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m5, m3, m6, 15
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    pshufb          m4, m3, m7
    pshufb          m5, m3, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4
    RET

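;-----------------------------------------------------------------------------
; Annotation: HEVC modes m and 36 - m share the same displacement, so mode 23
; is the vertical-direction twin of mode 13 above. The (p[i], p[i+1]) pairs
; are built once with punpcklbw/punpckhbw of [r2] and [r2 + 1] and kept in
; m0/m2; palignr steps the pair window as the rows advance, and the last row
; is the fraction-0 case, which the closing pand/packuswb reduces to a plain
; copy of the p[i] samples.
;-----------------------------------------------------------------------------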
cglobal intra_pred_ang32_23, 3,5,8
    lea             r3, [ang_table_avx2 + 32 * 16]
    lea             r4, [r1 * 3]
    mova            m5, [pw_1024]

    ; rows 0 to 7
    movu            m0, [r2 + 0]
    movu            m1, [r2 + 1]
    punpckhbw       m2, m0, m1
    punpcklbw       m0, m1

    movu            m4, [r2 + mmsize*2]
    pshufb          m4, [ang32_shuf_mode23]
    vpermq          m4, m4, q1313
    palignr         m3, m0, m4, 1
    vinserti128     m3, m3, xm2, 1

    pmaddubsw       m4, m0, [r3 + 7 * 32]  ; [23]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m2, [r3 + 7 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m0, [r3 - 2 * 32]  ; [14]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m2, [r3 - 2 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m0, [r3 - 11 * 32]  ; [5]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m2, [r3 - 11 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    palignr         m6, m0, m3, 14
    palignr         m7, m2, m0, 14

    pmaddubsw       m4, m6, [r3 + 12 * 32]  ; [28]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 12 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    pmaddubsw       m4, m6, [r3 + 3 * 32]  ; [19]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 3 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m6, [r3 - 6 * 32]  ; [10]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 6 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m6, [r3 - 15 * 32]  ; [1]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 15 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    palignr         m6, m0, m3, 12
    palignr         m7, m2, m0, 12

    pmaddubsw       m4, m6, [r3 + 8 * 32]  ; [24]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 8 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    ; rows 8 to 15
    pmaddubsw       m4, m6, [r3 - 1 * 32]  ; [15]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 1 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m6, [r3 - 10 * 32]  ; [6]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 10 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    palignr         m6, m0, m3, 10
    palignr         m7, m2, m0, 10

    pmaddubsw       m4, m6, [r3 + 13 * 32]  ; [29]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 13 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pmaddubsw       m4, m6, [r3 + 4 * 32]  ; [20]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 4 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    pmaddubsw       m4, m6, [r3 - 5 * 32]  ; [11]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 5 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m6, [r3 - 14 * 32]  ; [2]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 14 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    palignr         m6, m0, m3, 8
    palignr         m7, m2, m0, 8

    pmaddubsw       m4, m6, [r3 + 9 * 32]  ; [25]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 9 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pmaddubsw       m4, m6, [r3]  ; [16]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    ; rows 16 to 23
    pmaddubsw       m4, m6, [r3 - 9 * 32]  ; [7]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 9 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    palignr         m6, m0, m3, 6
    palignr         m7, m2, m0, 6

    pmaddubsw       m4, m6, [r3 + 14 * 32]  ; [30]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 14 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m6, [r3 + 5 * 32]  ; [21]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 5 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pmaddubsw       m4, m6, [r3 - 4 * 32]  ; [12]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 4 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    pmaddubsw       m4, m6, [r3 - 13 * 32]  ; [3]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 13 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    palignr         m6, m0, m3, 4
    palignr         m7, m2, m0, 4
    pmaddubsw       m4, m6, [r3 + 10 * 32]  ; [26]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 10 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m6, [r3 + 1 * 32]  ; [17]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 1 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pmaddubsw       m4, m6, [r3 - 8 * 32]  ; [8]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 8 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    ; rows 24 to 31
    palignr         m6, m0, m3, 2
    palignr         m7, m2, m0, 2
    pmaddubsw       m4, m6, [r3 + 15 * 32]  ; [31]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 15 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m6, [r3 + 6 * 32]  ; [22]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 6 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m6, [r3 - 3 * 32]  ; [13]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 3 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pmaddubsw       m4, m6, [r3 - 12 * 32]  ; [4]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 12 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    pmaddubsw       m4, m3, [r3 + 11 * 32]  ; [27]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m0, [r3 + 11 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m3, [r3 + 2 * 32]  ; [18]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m0, [r3 + 2 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m3, [r3 - 7 * 32]  ; [9]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m0, [r3 - 7 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pand            m3, [pw_00ff]
    pand            m0, [pw_00ff]
    packuswb        m3, m0
    movu            [r0 + r4], m3
    RET

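;-----------------------------------------------------------------------------
; Annotation: same scheme as mode 13 with mode 14's steeper angle: the
; projected-index list in the comment below changes, and with it the
; ang32_fact_mode14 / ang32_shuf_mode14 tables, but the per-row body is the
; identical pshufb + pmaddubsw + pmulhrsw(pw_1024) + packuswb sequence.
;-----------------------------------------------------------------------------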
cglobal intra_pred_ang32_14, 3,4,9
    movu            m0, [ang32_fact_mode14]
    movu            m1, [ang32_fact_mode14 + mmsize]
    mova            m2, [pw_1024]
    mova            m7, [ang32_shuf_mode14]
    mova            m8, [ang32_shuf_mode14 + mmsize]
    lea             r3, [r1 * 3]

    ; prepare for [30, 27, 25, 22, 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2...]

    movu            m6, [r2]
    pshufb          m6, [ang32_shuf_mode14 + mmsize*2]
    vpermq          m6, m6, 01110111b
    pslldq          m6, m6, 1
    vbroadcasti128  m3, [r2 + mmsize*2 + 1]

    palignr         m5, m3, m6, 1
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m5, m3, m6, 2
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m5, m3, m6, 3
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m5, m3, m6, 4
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m3, m6, 5
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m5, m3, m6, 6
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m5, m3, m6, 7
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m5, m3, m6, 8
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m3, m6, 9
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m5, m3, m6, 10
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m5, m3, m6, 11
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m5, m3, m6, 12
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m3, m6, 13
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m5, m3, m6, 14
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m5, m3, m6, 15
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    pshufb          m4, m3, m7
    pshufb          m5, m3, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    mova            m6, m3
    vbroadcasti128  m3, [r2 + mmsize*2 + 17]
    palignr         m5, m3, m6, 1
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m5, m3, m6, 2
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m5, m3, m6, 3
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m5, m3, m6, 4
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m3, m6, 5
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m5, m3, m6, 6
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m5, m3, m6, 7
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m5, m3, m6, 8
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m3, m6, 9
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m5, m3, m6, 10
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m5, m3, m6, 11
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m5, m3, m6, 12
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m3, m6, 13
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m5, m3, m6, 14
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m5, m3, m6, 15
    pshufb          m4, m5, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    pshufb          m4, m3, m7
    pshufb          m5, m3, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4
    RET

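;-----------------------------------------------------------------------------
; Annotation: mode 22 (36 - 14) mirrors mode 14 in the vertical direction.
; The reference pairs stay in m0/m2 for the whole block, while m3 and m8
; appear to serve as the projected left-extension that palignr shifts in once
; the row offset walks past the start of the main reference.
;-----------------------------------------------------------------------------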
cglobal intra_pred_ang32_22, 3,5,9
    lea             r3, [ang_table_avx2 + 32 * 16]
    lea             r4, [r1 * 3]
    mova            m5, [pw_1024]

    ; rows 0 to 7
    movu            m0, [r2 + 0]
    movu            m1, [r2 + 1]
    punpckhbw       m2, m0, m1
    punpcklbw       m0, m1

    movu            m4, [r2 + mmsize*2 + 2]
    pshufb          m4, [ang32_shuf_mode22]
    vextracti128    xm8, m4, 1

    palignr         m3, m0, m4, 2
    palignr         m3, m8, 15
    vinserti128     m3, m3, xm2, 1
    vinserti128     m8, m8, xm0, 1

    pmaddubsw       m4, m0, [r3 + 3 * 32]  ; [19]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m2, [r3 + 3 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m0, [r3 - 10 * 32]  ; [6]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m2, [r3 - 10 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    palignr         m6, m0, m3, 14
    palignr         m7, m2, m0, 14

    pmaddubsw       m4, m6, [r3 + 9 * 32]  ; [25]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 9 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pmaddubsw       m4, m6, [r3 - 4 * 32]  ; [12]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 4 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m6, m0, m3, 12
    palignr         m7, m2, m0, 12

    pmaddubsw       m4, m6, [r3 + 15 * 32]  ; [31]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 15 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m6, [r3 + 2 * 32]  ; [18]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 2 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m6, [r3 - 11 * 32]  ; [5]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 11 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    palignr         m6, m0, m3, 10
    palignr         m7, m2, m0, 10

    pmaddubsw       m4, m6, [r3 + 8 * 32]  ; [24]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 8 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    ; rows 8 to 15
    pmaddubsw       m4, m6, [r3 - 5 * 32]  ; [11]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 5 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    palignr         m6, m0, m3, 8
    palignr         m7, m2, m0, 8

    pmaddubsw       m4, m6, [r3 + 14 * 32]  ; [30]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 14 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m6, [r3 + 1 * 32]  ; [17]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 1 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pmaddubsw       m4, m6, [r3 - 12 * 32]  ; [4]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 12 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m6, m0, m3, 6
    palignr         m7, m2, m0, 6

    pmaddubsw       m4, m6, [r3 + 7 * 32]  ; [23]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 7 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m6, [r3 - 6 * 32]  ; [10]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 6 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    palignr         m6, m0, m3, 4
    palignr         m7, m2, m0, 4

    pmaddubsw       m4, m6, [r3 + 13 * 32]  ; [29]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 13 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pmaddubsw       m4, m6, [r3]  ; [16]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    ; rows 16 to 23
    pmaddubsw       m4, m6, [r3 - 13 * 32]  ; [3]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 13 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    palignr         m6, m0, m3, 2
    palignr         m7, m2, m0, 2

    pmaddubsw       m4, m6, [r3 + 6 * 32]  ; [22]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 6 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m6, [r3 - 7 * 32]  ; [9]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 7 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pmaddubsw       m4, m3, [r3 + 12 * 32]  ; [28]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m0, [r3 + 12 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    pmaddubsw       m4, m3, [r3 - 1 * 32]  ; [15]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m0, [r3 - 1 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m3, [r3 - 14 * 32]  ; [2]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m0, [r3 - 14 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    palignr         m6, m3, m8, 14
    palignr         m7, m0, m3, 14

    pmaddubsw       m4, m6, [r3 + 5 * 32]  ; [21]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 5 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pmaddubsw       m4, m6, [r3 - 8 * 32]  ; [8]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 8 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    ; rows 24 to 31
    palignr         m6, m3, m8, 12
    palignr         m7, m0, m3, 12
    pmaddubsw       m4, m6, [r3 + 11 * 32]  ; [27]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 11 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m6, [r3 - 2 * 32]  ; [14]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 2 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m6, [r3 - 15 * 32]  ; [1]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 15 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    palignr         m6, m3, m8, 10
    palignr         m7, m0, m3, 10
    pmaddubsw       m4, m6, [r3 + 4 * 32]  ; [20]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 4 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    pmaddubsw       m4, m6, [r3 - 9 * 32]  ; [7]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 9 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    palignr         m0, m3, 8
    palignr         m3, m8, 8
    pmaddubsw       m4, m3, [r3 + 10 * 32]  ; [26]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m0, [r3 + 10 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m3, [r3 - 3 * 32]  ; [13]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m0, [r3 - 3 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pand            m3, [pw_00ff]
    pand            m0, [pw_00ff]
    packuswb        m3, m0
    movu            [r0 + r4], m3
    RET

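;-----------------------------------------------------------------------------
; Annotation: mode 15 projects a longer run of opposite-edge samples, and its
; two pshufb masks (ang32_shuf_mode15 and +mmsize) appear to give the two row
; halves different pair offsets: m4 is filtered through m7/m0 and m5 through
; m8/m1 before packuswb merges them into one 32-byte row.
;-----------------------------------------------------------------------------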
cglobal intra_pred_ang32_15, 3,4,9
    movu            m0, [ang32_fact_mode15]
    movu            m1, [ang32_fact_mode15 + mmsize]
    mova            m2, [pw_1024]
    mova            m7, [ang32_shuf_mode15]
    mova            m8, [ang32_shuf_mode15 + mmsize]
    lea             r3, [r1 * 3]

    ; prepare for [30, 28, 26, 24, 23, 21, 19, 17, 15, 13, 11, 9, 8, 6, 4, 2, 0, -1, -2...]

    movu            m6, [r2]
    pshufb          m6, [ang32_shuf_mode15 + mmsize*2]
    vpermq          m6, m6, 01110111b

    movu            xm3, [r2 + mmsize*2]
    pinsrb          xm3, [r2], 0
    vpermq          m3, m3, 01000100b

    palignr         m4, m3, m6, 2
    pshufb          m4, m7
    pshufb          m5, m6, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m4, m3, m6, 3
    pshufb          m4, m7
    palignr         m5, m3, m6, 1
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m4, m3, m6, 4
    pshufb          m4, m7
    palignr         m5, m3, m6, 2
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m4, m3, m6, 5
    pshufb          m4, m7
    palignr         m5, m3, m6, 3
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m4, m3, m6, 6
    pshufb          m4, m7
    palignr         m5, m3, m6, 4
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m4, m3, m6, 7
    pshufb          m4, m7
    palignr         m5, m3, m6, 5
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m4, m3, m6, 8
    pshufb          m4, m7
    palignr         m5, m3, m6, 6
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m4, m3, m6, 9
    pshufb          m4, m7
    palignr         m5, m3, m6, 7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m4, m3, m6, 10
    pshufb          m4, m7
    palignr         m5, m3, m6, 8
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m4, m3, m6, 11
    pshufb          m4, m7
    palignr         m5, m3, m6, 9
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m4, m3, m6, 12
    pshufb          m4, m7
    palignr         m5, m3, m6, 10
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m4, m3, m6, 13
    pshufb          m4, m7
    palignr         m5, m3, m6, 11
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m4, m3, m6, 14
    pshufb          m4, m7
    palignr         m5, m3, m6, 12
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m4, m3, m6, 15
    pshufb          m4, m7
    palignr         m5, m3, m6, 13
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    pshufb          m4, m3, m7
    palignr         m5, m3, m6, 14
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m5, m3, m6, 15
    mova            m6, m3
    vbroadcasti128  m3, [r2 + mmsize*2 + 16]

    palignr         m4, m3, m6, 1
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m4, m3, m6, 2
    pshufb          m4, m7
    pshufb          m5, m6, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m4, m3, m6, 3
    pshufb          m4, m7
    palignr         m5, m3, m6, 1
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m4, m3, m6, 4
    pshufb          m4, m7
    palignr         m5, m3, m6, 2
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m4, m3, m6, 5
    pshufb          m4, m7
    palignr         m5, m3, m6, 3
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m4, m3, m6, 6
    pshufb          m4, m7
    palignr         m5, m3, m6, 4
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m4, m3, m6, 7
    pshufb          m4, m7
    palignr         m5, m3, m6, 5
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m4, m3, m6, 8
    pshufb          m4, m7
    palignr         m5, m3, m6, 6
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m4, m3, m6, 9
    pshufb          m4, m7
    palignr         m5, m3, m6, 7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m4, m3, m6, 10
    pshufb          m4, m7
    palignr         m5, m3, m6, 8
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m4, m3, m6, 11
    pshufb          m4, m7
    palignr         m5, m3, m6, 9
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    palignr         m4, m3, m6, 12
    pshufb          m4, m7
    palignr         m5, m3, m6, 10
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m4, m3, m6, 13
    pshufb          m4, m7
    palignr         m5, m3, m6, 11
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m4, m3, m6, 14
    pshufb          m4, m7
    palignr         m5, m3, m6, 12
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], m4

    palignr         m4, m3, m6, 15
    pshufb          m4, m7
    palignr         m5, m3, m6, 13
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1], m4

    pshufb          m4, m3, m7
    palignr         m5, m3, m6, 14
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], m4

    palignr         m5, m3, m6, 15
    vbroadcasti128  m6, [r2 + mmsize*2 + 32]

    palignr         m4, m6, m3, 1
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r3], m4
    RET

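;-----------------------------------------------------------------------------
; Annotation: mode 21 (36 - 15) is the vertical twin of mode 15; m3 and m8
; again hold the projected extension. The palignr offset advances one byte
; only on the rows where the accumulated sub-pixel displacement crosses a
; whole sample, which is why some weight groups reuse the same m6/m7 window.
;-----------------------------------------------------------------------------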
cglobal intra_pred_ang32_21, 3,5,9
    lea             r3, [ang_table_avx2 + 32 * 16]
    lea             r4, [r1 * 3]
    mova            m5, [pw_1024]

    ; rows 0 to 7
    movu            m0, [r2 + 0]
    movu            m1, [r2 + 1]
    punpckhbw       m2, m0, m1
    punpcklbw       m0, m1

    movu            m4, [r2 + mmsize*2]
    pshufb          m4, [ang32_shuf_mode21]
    vextracti128    xm6, m4, 1

    palignr         m3, m0, m4, 1
    palignr         m8, m3, m6, 1
    vinserti128     m3, m3, xm2, 1
    vinserti128     m8, m8, xm0, 1

    pmaddubsw       m4, m0, [r3 - 1 * 32]  ; [15]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m2, [r3 - 1 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    palignr         m6, m0, m3, 14
    palignr         m7, m2, m0, 14
    pmaddubsw       m4, m6, [r3 + 14 * 32]  ; [30]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 14 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m6, [r3 - 3 * 32]  ; [13]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 3 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    palignr         m6, m0, m3, 12
    palignr         m7, m2, m0, 12
    pmaddubsw       m4, m6, [r3 + 12 * 32]  ; [28]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 12 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    pmaddubsw       m4, m6, [r3 - 5 * 32]  ; [11]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 5 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    palignr         m6, m0, m3, 10
    palignr         m7, m2, m0, 10
    pmaddubsw       m4, m6, [r3 + 10 * 32]  ; [26]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 10 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m6, [r3 - 7 * 32]  ; [9]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 7 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    palignr         m6, m0, m3, 8
    palignr         m7, m2, m0, 8

    pmaddubsw       m4, m6, [r3 + 8 * 32]  ; [24]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 8 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    ; rows 8 to 15
    pmaddubsw       m4, m6, [r3 - 9 * 32]  ; [7]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 9 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    palignr         m6, m0, m3, 6
    palignr         m7, m2, m0, 6
    pmaddubsw       m4, m6, [r3 + 6 * 32]  ; [22]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 6 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m6, [r3 - 11 * 32]  ; [5]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 11 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    palignr         m6, m0, m3, 4
    palignr         m7, m2, m0, 4
    pmaddubsw       m4, m6, [r3 + 4 * 32]  ; [20]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 4 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    pmaddubsw       m4, m6, [r3 - 13 * 32]  ; [3]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 13 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    palignr         m6, m0, m3, 2
    palignr         m7, m2, m0, 2
    pmaddubsw       m4, m6, [r3 + 2 * 32]  ; [18]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 2 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m6, [r3 - 15 * 32]  ; [1]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 15 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pmaddubsw       m4, m3, [r3]  ; [16]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m0, [r3]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    ; rows 16 to 23
    palignr         m6, m3, m8, 14
    palignr         m7, m0, m3, 14
    pmaddubsw       m4, m6, [r3 + 15 * 32]  ; [31]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 15 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m6, [r3 - 2 * 32]  ; [14]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 2 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    palignr         m6, m3, m8, 12
    palignr         m7, m0, m3, 12
    pmaddubsw       m4, m6, [r3 + 13 * 32]  ; [29]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 13 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pmaddubsw       m4, m6, [r3 - 4 * 32]  ; [12]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 4 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m6, m3, m8, 10
    palignr         m7, m0, m3, 10
    pmaddubsw       m4, m6, [r3 + 11 * 32]  ; [27]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 11 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m6, [r3 - 6 * 32]  ; [10]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 6 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    palignr         m6, m3, m8, 8
    palignr         m7, m0, m3, 8
    pmaddubsw       m4, m6, [r3 + 9 * 32]  ; [25]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 9 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pmaddubsw       m4, m6, [r3 - 8 * 32]  ; [8]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 8 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    ; rows 24 to 31
    palignr         m6, m3, m8, 6
    palignr         m7, m0, m3, 6
    pmaddubsw       m4, m6, [r3 + 7 * 32]  ; [23]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 7 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m6, [r3 - 10 * 32]  ; [6]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 10 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    palignr         m6, m3, m8, 4
    palignr         m7, m0, m3, 4
    pmaddubsw       m4, m6, [r3 + 5 * 32]  ; [21]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 5 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pmaddubsw       m4, m6, [r3 - 12 * 32]  ; [4]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 12 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r4], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m6, m3, m8, 2
    palignr         m7, m0, m3, 2
    pmaddubsw       m4, m6, [r3 + 3 * 32]  ; [19]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 + 3 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0], m4

    pmaddubsw       m4, m6, [r3 - 14 * 32]  ; [2]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m7, [r3 - 14 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1], m4

    pmaddubsw       m4, m8, [r3 + 1 * 32]  ; [17]
    pmulhrsw        m4, m5
    pmaddubsw       m1, m3, [r3 + 1 * 32]
    pmulhrsw        m1, m5
    packuswb        m4, m1
    movu            [r0 + r1 * 2], m4

    pand            m8, [pw_00ff]
    pand            m3, [pw_00ff]
    packuswb        m8, m3
    movu            [r0 + r4], m8
    RET

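;-----------------------------------------------------------------------------
; Annotation: in mode 16 the two halves of each row come from differently
; aligned windows and are packed from separate registers; since packuswb on
; ymm registers packs within each 128-bit lane, every row ends with
; vpermq q3120 to restore linear byte order before the store.
;-----------------------------------------------------------------------------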
cglobal intra_pred_ang32_16, 3,4,10
    movu            m0, [ang32_fact_mode16]
    movu            m1, [ang32_fact_mode16 + mmsize]
    mova            m2, [pw_1024]
    mova            m7, [ang32_shuf_mode16]
    mova            m8, [ang32_shuf_mode16 + mmsize]
    lea             r3, [r1 * 3]

    ; prepare for [30, 29, 27, 26, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, -1, -2...]

    movu            m6, [r2]
    pshufb          m6, [ang32_shuf_mode16 + mmsize*2]
    mova            m9, m6
    mova            m3, [ang32_shuf_mode16 + mmsize*3]
    vpermd          m6, m3, m6
    vpermq          m9, m9, q3232
    pslldq          m9, 4
    palignr         m6, m9, 15
    pslldq          m9, 1

    vbroadcasti128  m3, [r2 + mmsize*2 + 1]

    palignr         m4, m3, m6, 1
    palignr         m5, m6, m9, 6
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0], m4

    palignr         m4, m3, m6, 2
    palignr         m5, m6, m9, 7
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r1], m4

    palignr         m4, m3, m6, 3
    palignr         m5, m6, m9, 8
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r1 * 2], m4

    palignr         m4, m3, m6, 4
    palignr         m5, m6, m9, 9
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m4, m3, m6, 5
    palignr         m5, m6, m9, 10
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0], m4

    palignr         m4, m3, m6, 6
    palignr         m5, m6, m9, 11
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r1], m4

    palignr         m4, m3, m6, 7
    palignr         m5, m6, m9, 12
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r1 * 2], m4

    palignr         m4, m3, m6, 8
    palignr         m5, m6, m9, 13
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m4, m3, m6, 9
    palignr         m5, m6, m9, 14
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0], m4

    palignr         m4, m3, m6, 10
    palignr         m5, m6, m9, 15
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r1], m4

    palignr         m4, m3, m6, 11
    pshufb          m4, m7
    pshufb          m5, m6, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r1 * 2], m4

    palignr         m4, m3, m6, 12
    palignr         m5, m3, m6, 1
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m4, m3, m6, 13
    palignr         m5, m3, m6, 2
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0], m4

    palignr         m4, m3, m6, 14
    palignr         m5, m3, m6, 3
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r1], m4

    palignr         m4, m3, m6, 15
    palignr         m5, m3, m6, 4
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r1 * 2], m4

    palignr         m5, m3, m6, 5
    pshufb          m4, m3, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    vbroadcasti128  m9, [r2 + mmsize*2 + 17]

    palignr         m4, m9, m3, 1
    palignr         m5, m3, m6, 6
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0], m4

    palignr         m4, m9, m3, 2
    palignr         m5, m3, m6, 7
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r1], m4

    palignr         m4, m9, m3, 3
    palignr         m5, m3, m6, 8
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r1 * 2], m4

    palignr         m4, m9, m3, 4
    palignr         m5, m3, m6, 9
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m4, m9, m3, 5
    palignr         m5, m3, m6, 10
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0], m4

    palignr         m4, m9, m3, 6
    palignr         m5, m3, m6, 11
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r1], m4

    palignr         m4, m9, m3, 7
    palignr         m5, m3, m6, 12
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r1 * 2], m4

    palignr         m4, m9, m3, 8
    palignr         m5, m3, m6, 13
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m4, m9, m3, 9
    palignr         m5, m3, m6, 14
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0], m4

    palignr         m4, m9, m3, 10
    palignr         m5, m3, m6, 15
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r1], m4

    palignr         m4, m9, m3, 11
    pshufb          m4, m7
    pshufb          m5, m3, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r1 * 2], m4

    palignr         m4, m9, m3, 12
    palignr         m5, m9, m3, 1
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r3], m4

    lea             r0, [r0 + r1 * 4]

    palignr         m4, m9, m3, 13
    palignr         m5, m9, m3, 2
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0], m4

    palignr         m4, m9, m3, 14
    palignr         m5, m9, m3, 3
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r1], m4

    palignr         m4, m9, m3, 15
    palignr         m5, m9, m3, 4
    pshufb          m4, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r1 * 2], m4

    palignr         m5, m9, m3, 5
    pshufb          m4, m9, m7
    pshufb          m5, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    vpermq          m4, m4, q3120
    movu            [r0 + r3], m4
    RET

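;-----------------------------------------------------------------------------
; Annotation: mode 20 (36 - 16) needs the longest extension of this vertical
; group, so the prologue stages it across m3, m8 and m9; the row loop is
; otherwise the usual pmaddubsw / pmulhrsw(pw_1024) pattern, again closing
; with the fraction-0 pand/packuswb row.
;-----------------------------------------------------------------------------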
cglobal intra_pred_ang32_20, 3,5,10
|
|
lea r3, [ang_table_avx2 + 32 * 16]
|
|
lea r4, [r1 * 3]
|
|
mova m5, [pw_1024]
|
|
|
|
; rows 0 to 7
|
|
movu m0, [r2 + 0]
|
|
movu m1, [r2 + 1]
|
|
punpckhbw m2, m0, m1
|
|
punpcklbw m0, m1
|
|
|
|
movu m4, [r2 + mmsize*2]
|
|
pshufb m4, [ang32_shuf_mode20]
|
|
mova m9, m4
|
|
vpermq m9, m9, q3333
|
|
mova m7, m4
|
|
vpermq m7, m7, q1111
|
|
palignr m4, m7, 14
|
|
pshufb m4, [ang32_shuf_mode20 + mmsize*1]
|
|
|
|
vextracti128 xm6, m4, 1
|
|
palignr m3, m0, m4, 1
|
|
palignr m8, m3, m6, 1
|
|
vinserti128 m3, m3, xm2, 1
|
|
vinserti128 m8, m8, xm0, 1
|
|
vinserti128 m9, m9, xm3, 1
|
|
|
|
pmaddubsw m4, m0, [r3 - 5 * 32] ; [11]
|
|
pmulhrsw m4, m5
|
|
pmaddubsw m1, m2, [r3 - 5 * 32]
|
|
pmulhrsw m1, m5
|
|
packuswb m4, m1
|
|
movu [r0], m4
|
|
|
|
palignr m6, m0, m3, 14
|
|
palignr m7, m2, m0, 14
|
|
pmaddubsw m4, m6, [r3 + 6 * 32] ; [22]
|
|
pmulhrsw m4, m5
|
|
pmaddubsw m1, m7, [r3 + 6 * 32]
|
|
pmulhrsw m1, m5
|
|
packuswb m4, m1
|
|
movu [r0 + r1], m4
|
|
|
|
pmaddubsw m4, m6, [r3 - 15 * 32] ; [1]
|
|
pmulhrsw m4, m5
|
|
pmaddubsw m1, m7, [r3 - 15 * 32]
|
|
pmulhrsw m1, m5
|
|
packuswb m4, m1
|
|
movu [r0 + r1*2], m4
|
|
|
|
palignr m6, m0, m3, 12
|
|
palignr m7, m2, m0, 12
|
|
pmaddubsw m4, m6, [r3 - 4 * 32] ; [12]
|
|
pmulhrsw m4, m5
|
|
pmaddubsw m1, m7, [r3 - 4 * 32]
|
|
pmulhrsw m1, m5
|
|
packuswb m4, m1
|
|
movu [r0 + r4], m4
|
|
|
|
lea r0, [r0 + r1 * 4]
|
|
|
|
palignr m6, m0, m3, 10
|
|
palignr m7, m2, m0, 10
|
|
pmaddubsw m4, m6, [r3 + 7 * 32] ; [23]
|
|
pmulhrsw m4, m5
|
|
pmaddubsw m1, m7, [r3 + 7 * 32]
|
|
pmulhrsw m1, m5
|
|
packuswb m4, m1
|
|
movu [r0], m4
|
|
|
|
pmaddubsw m4, m6, [r3 - 14 * 32] ; [2]
|
|
pmulhrsw m4, m5
|
|
pmaddubsw m1, m7, [r3 - 14 * 32]
|
|
pmulhrsw m1, m5
|
|
packuswb m4, m1
|
|
movu [r0 + r1], m4
|
|
|
|
palignr m6, m0, m3, 8
|
|
palignr m7, m2, m0, 8
|
|
pmaddubsw m4, m6, [r3 - 3 * 32] ; [13]
|
|
pmulhrsw m4, m5
|
|
pmaddubsw m1, m7, [r3 - 3 * 32]
|
|
pmulhrsw m1, m5
|
|
packuswb m4, m1
|
|
movu [r0 + r1*2], m4
|
|
|
|
palignr m6, m0, m3, 6
|
|
palignr m7, m2, m0, 6
|
|
pmaddubsw m4, m6, [r3 + 8 * 32] ; [24]
|
|
pmulhrsw m4, m5
|
|
pmaddubsw m1, m7, [r3 + 8 * 32]
|
|
pmulhrsw m1, m5
|
|
packuswb m4, m1
|
|
movu [r0 + r4], m4
|
|
|
|
lea r0, [r0 + r1 * 4]
|
|
|
|
; rows 8 to 15
|
|
pmaddubsw m4, m6, [r3 - 13 * 32] ; [3]
|
|
pmulhrsw m4, m5
|
|
pmaddubsw m1, m7, [r3 - 13 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4

palignr m6, m0, m3, 4
palignr m7, m2, m0, 4
pmaddubsw m4, m6, [r3 - 2 * 32] ; [14]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 2 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4

palignr m6, m0, m3, 2
palignr m7, m2, m0, 2
pmaddubsw m4, m6, [r3 + 9 * 32] ; [25]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 9 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4

pmaddubsw m4, m6, [r3 - 12 * 32] ; [4]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 12 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4

lea r0, [r0 + r1 * 4]

pmaddubsw m4, m3, [r3 - 1 * 32] ; [15]
pmulhrsw m4, m5
pmaddubsw m1, m0, [r3 - 1 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4

palignr m6, m3, m8, 14
palignr m7, m0, m3, 14
pmaddubsw m4, m6, [r3 + 10 * 32] ; [26]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 10 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4

pmaddubsw m4, m6, [r3 - 11 * 32] ; [5]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 11 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1 * 2], m4

palignr m6, m3, m8, 12
palignr m7, m0, m3, 12
pmaddubsw m4, m6, [r3] ; [16]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4

lea r0, [r0 + r1 * 4]

; rows 16 to 23
palignr m6, m3, m8, 10
palignr m7, m0, m3, 10
pmaddubsw m4, m6, [r3 + 11 * 32] ; [27]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 11 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4

pmaddubsw m4, m6, [r3 - 10 * 32] ; [6]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 10 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4

palignr m6, m3, m8, 8
palignr m7, m0, m3, 8
pmaddubsw m4, m6, [r3 + 1 * 32] ; [17]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 1 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4

palignr m6, m3, m8, 6
palignr m7, m0, m3, 6
pmaddubsw m4, m6, [r3 + 12 * 32] ; [28]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 12 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4

lea r0, [r0 + r1 * 4]

pmaddubsw m4, m6, [r3 - 9 * 32] ; [7]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 9 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4

palignr m6, m3, m8, 4
palignr m7, m0, m3, 4
pmaddubsw m4, m6, [r3 + 2 * 32] ; [18]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 2 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4

palignr m6, m3, m8, 2
palignr m7, m0, m3, 2
pmaddubsw m4, m6, [r3 + 13 * 32] ; [29]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 13 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4

pmaddubsw m4, m6, [r3 - 8 * 32] ; [8]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 8 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4

lea r0, [r0 + r1 * 4]

; rows 24 to 31
pmaddubsw m4, m8, [r3 + 3 * 32] ; [19]
pmulhrsw m4, m5
pmaddubsw m1, m3, [r3 + 3 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4

palignr m6, m8, m9, 14
palignr m7, m3, m8, 14
pmaddubsw m4, m6, [r3 + 14 * 32] ; [30]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 14 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4

pmaddubsw m4, m6, [r3 - 7 * 32] ; [9]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 7 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1 * 2], m4

palignr m6, m8, m9, 12
palignr m7, m3, m8, 12
pmaddubsw m4, m6, [r3 + 4 * 32] ; [20]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 4 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4

lea r0, [r0 + r1 * 4]

palignr m6, m8, m9, 10
palignr m7, m3, m8, 10
pmaddubsw m4, m6, [r3 + 15 * 32] ; [31]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 15 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4

pmaddubsw m4, m6, [r3 - 6 * 32] ; [10]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 6 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4

palignr m6, m8, m9, 8
palignr m7, m3, m8, 8
pmaddubsw m4, m6, [r3 + 5 * 32] ; [21]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 5 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4

pand m6, [pw_00ff]
pand m7, [pw_00ff]
packuswb m6, m7
movu [r0 + r4], m6
RET

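; NOTE (added): each of these angular kernels computes the HEVC two-tap
; interpolation pred = (a*(32-f) + b*f + 16) >> 5. pmaddubsw against a
; (32-f, f) byte pair yields a*(32-f) + b*f, and pmulhrsw with pw_1024
; evaluates round(x * 1024 / 32768) = (x + 16) >> 5, so no separate
; add/shift is needed. A sketch of the arithmetic, not a description of
; every coefficient table.
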
cglobal intra_pred_ang32_17, 3,4,8
movu m0, [ang32_fact_mode17]
mova m2, [pw_1024]
mova m7, [ang32_shuf_mode17]
lea r3, [r1 * 3]

; prepare for [31, 30, 28, 27, 26, 25, 23, 22, 21, 20, 18, 17, 16, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, -1, -2...]

movu m6, [r2]
pshufb m6, [ang32_shuf_mode17 + mmsize]
mova m1, m6
mova m3, [ang32_shuf_mode16 + mmsize*3]
vpermd m6, m3, m6
vpermq m1, m1, q3232
pslldq m1, 4

movu xm4, [r2 + mmsize*2]
pinsrb xm4, [r2], 0
vinserti128 m3, m4, xm4, 1

palignr m4, m3, m6, 2
palignr m5, m6, m1, 5
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4

palignr m4, m3, m6, 3
palignr m5, m6, m1, 6
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4

palignr m4, m3, m6, 4
palignr m5, m6, m1, 7
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4

palignr m4, m3, m6, 5
palignr m5, m6, m1, 8
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4

lea r0, [r0 + r1 * 4]

palignr m4, m3, m6, 6
palignr m5, m6, m1, 9
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4

palignr m4, m3, m6, 7
palignr m5, m6, m1, 10
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4

palignr m4, m3, m6, 8
palignr m5, m6, m1, 11
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4

palignr m4, m3, m6, 9
palignr m5, m6, m1, 12
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4

lea r0, [r0 + r1 * 4]

palignr m4, m3, m6, 10
palignr m5, m6, m1, 13
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4

palignr m4, m3, m6, 11
palignr m5, m6, m1, 14
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4

palignr m4, m3, m6, 12
palignr m5, m6, m1, 15
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4

palignr m4, m3, m6, 13
pshufb m4, m7
pshufb m5, m6, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4

lea r0, [r0 + r1 * 4]

palignr m4, m3, m6, 14
palignr m5, m3, m6, 1
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4

palignr m4, m3, m6, 15
palignr m5, m3, m6, 2
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4

palignr m5, m3, m6, 3
pshufb m4, m3, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4

vbroadcasti128 m1, [r2 + mmsize*2 + 16]
palignr m4, m1, m3, 1
palignr m5, m3, m6, 4
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4

lea r0, [r0 + r1 * 4]

palignr m4, m1, m3, 2
palignr m5, m3, m6, 5
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4

palignr m4, m1, m3, 3
palignr m5, m3, m6, 6
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4

palignr m4, m1, m3, 4
palignr m5, m3, m6, 7
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4

palignr m4, m1, m3, 5
palignr m5, m3, m6, 8
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4

lea r0, [r0 + r1 * 4]

palignr m4, m1, m3, 6
palignr m5, m3, m6, 9
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4

palignr m4, m1, m3, 7
palignr m5, m3, m6, 10
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4

palignr m4, m1, m3, 8
palignr m5, m3, m6, 11
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4

palignr m4, m1, m3, 9
palignr m5, m3, m6, 12
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4

lea r0, [r0 + r1 * 4]

palignr m4, m1, m3, 10
palignr m5, m3, m6, 13
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4

palignr m4, m1, m3, 11
palignr m5, m3, m6, 14
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4

palignr m4, m1, m3, 12
palignr m5, m3, m6, 15
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4

palignr m4, m1, m3, 13
pshufb m4, m7
pshufb m5, m3, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4

lea r0, [r0 + r1 * 4]

palignr m4, m1, m3, 14
palignr m5, m1, m3, 1
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0], m4

palignr m4, m1, m3, 15
palignr m5, m1, m3, 2
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1], m4

vbroadcasti128 m6, [r2 + mmsize*2 + mmsize]
palignr m5, m1, m3, 3
pshufb m4, m1, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r1 * 2], m4

palignr m4, m6, m1, 1
palignr m5, m1, m3, 4
pshufb m4, m7
pshufb m5, m7
pmaddubsw m4, m0
pmaddubsw m5, m0
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
vpermq m4, m4, q3120
movu [r0 + r3], m4
RET

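; NOTE (added): mode 19 below is the vertical counterpart of mode 17; in
; the HEVC angle table both map to intraPredAngle -26, which is why the
; per-row weight offsets mirror the mode-17 sequence. The angle value is
; an assumption taken from the spec table, not from this file.
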
cglobal intra_pred_ang32_19, 3,5,10
lea r3, [ang_table_avx2 + 32 * 16]
lea r4, [r1 * 3]
mova m5, [pw_1024]

; rows 0 to 7
movu m0, [r2 + 0]
movu m1, [r2 + 1]
punpckhbw m2, m0, m1
punpcklbw m0, m1

movu m4, [r2 + mmsize*2]
pshufb m4, [ang32_shuf_mode17 + mmsize*1]
mova m3, [ang32_shuf_mode19 + mmsize*1]
mova m6, [ang32_shuf_mode19 + mmsize*2]
mova m9, m4
vpermd m4, m3, m4
vpermd m9, m6, m9
pshufb m4, [ang32_shuf_mode19]
pshufb m9, [ang32_shuf_mode19]

vextracti128 xm6, m4, 1
palignr m3, m0, m4, 1
palignr m8, m3, m6, 1
palignr m7, m8, m9, 1
vinserti128 m3, m3, xm2, 1
vinserti128 m8, m8, xm0, 1
vinserti128 m9, m7, xm3, 1

pmaddubsw m4, m0, [r3 - 10 * 32] ; [6]
pmulhrsw m4, m5
pmaddubsw m1, m2, [r3 - 10 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4

palignr m6, m0, m3, 14
palignr m7, m2, m0, 14
pmaddubsw m4, m6, [r3 - 4 * 32] ; [12]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 4 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4

palignr m6, m0, m3, 12
palignr m7, m2, m0, 12
pmaddubsw m4, m6, [r3 + 2 * 32] ; [18]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 2 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4

palignr m6, m0, m3, 10
palignr m7, m2, m0, 10
pmaddubsw m4, m6, [r3 + 8 * 32] ; [24]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 8 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4

lea r0, [r0 + r1 * 4]

palignr m6, m0, m3, 8
palignr m7, m2, m0, 8
pmaddubsw m4, m6, [r3 + 14 * 32] ; [30]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 14 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4

pmaddubsw m4, m6, [r3 - 12 * 32] ; [4]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 12 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4

palignr m6, m0, m3, 6
palignr m7, m2, m0, 6
pmaddubsw m4, m6, [r3 - 6 * 32] ; [10]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 6 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4

palignr m6, m0, m3, 4
palignr m7, m2, m0, 4
pmaddubsw m4, m6, [r3] ; [16]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4

lea r0, [r0 + r1 * 4]

; rows 8 to 15
palignr m6, m0, m3, 2
palignr m7, m2, m0, 2
pmaddubsw m4, m6, [r3 + 6 * 32] ; [22]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 6 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4

pmaddubsw m4, m3, [r3 + 12 * 32] ; [28]
pmulhrsw m4, m5
pmaddubsw m1, m0, [r3 + 12 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4

pmaddubsw m4, m3, [r3 - 14 * 32] ; [2]
pmulhrsw m4, m5
pmaddubsw m1, m0, [r3 - 14 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4

palignr m6, m3, m8, 14
palignr m7, m0, m3, 14
pmaddubsw m4, m6, [r3 - 8 * 32] ; [8]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 8 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4

lea r0, [r0 + r1 * 4]

palignr m6, m3, m8, 12
palignr m7, m0, m3, 12
pmaddubsw m4, m6, [r3 - 2 * 32] ; [14]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 2 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4

palignr m6, m3, m8, 10
palignr m7, m0, m3, 10
pmaddubsw m4, m6, [r3 + 4 * 32] ; [20]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 4 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4

palignr m6, m3, m8, 8
palignr m7, m0, m3, 8
pmaddubsw m4, m6, [r3 + 10 * 32] ; [26]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 10 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1 * 2], m4

pand m6, [pw_00ff]
pand m7, [pw_00ff]
packuswb m6, m7
movu [r0 + r4], m6

lea r0, [r0 + r1 * 4]

; rows 16 to 23
palignr m6, m3, m8, 6
palignr m7, m0, m3, 6
pmaddubsw m4, m6, [r3 - 10 * 32] ; [6]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 10 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4

palignr m6, m3, m8, 4
palignr m7, m0, m3, 4
pmaddubsw m4, m6, [r3 - 4 * 32] ; [12]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 4 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4

palignr m6, m3, m8, 2
palignr m7, m0, m3, 2
pmaddubsw m4, m6, [r3 + 2 * 32] ; [18]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 2 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4

pmaddubsw m4, m8, [r3 + 8 * 32] ; [24]
pmulhrsw m4, m5
pmaddubsw m1, m3, [r3 + 8 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4

lea r0, [r0 + r1 * 4]

palignr m6, m8, m9, 14
palignr m7, m3, m8, 14
pmaddubsw m4, m6, [r3 + 14 * 32] ; [30]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 14 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4

pmaddubsw m4, m6, [r3 - 12 * 32] ; [4]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 12 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4

palignr m6, m8, m9, 12
palignr m7, m3, m8, 12
pmaddubsw m4, m6, [r3 - 6 * 32] ; [10]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 6 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4

palignr m6, m8, m9, 10
palignr m7, m3, m8, 10
pmaddubsw m4, m6, [r3] ; [16]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4

lea r0, [r0 + r1 * 4]

; rows 24 to 31
palignr m6, m8, m9, 8
palignr m7, m3, m8, 8
pmaddubsw m4, m6, [r3 + 6 * 32] ; [22]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 6 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4

palignr m6, m8, m9, 6
palignr m7, m3, m8, 6
pmaddubsw m4, m6, [r3 + 12 * 32] ; [28]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 12 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4

pmaddubsw m4, m6, [r3 - 14 * 32] ; [2]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 14 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1*2], m4

palignr m6, m8, m9, 4
palignr m7, m3, m8, 4
pmaddubsw m4, m6, [r3 - 8 * 32] ; [8]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 8 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r4], m4

lea r0, [r0 + r1 * 4]

vpbroadcastb m0, [r2 + mmsize*2 + 31]
palignr m1, m9, m0, 1
vinserti128 m0, m1, xm8, 1

palignr m6, m8, m9, 2
palignr m7, m3, m8, 2
pmaddubsw m4, m6, [r3 - 2 * 32] ; [14]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 - 2 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0], m4

pmaddubsw m4, m9, [r3 + 4 * 32] ; [20]
pmulhrsw m4, m5
pmaddubsw m1, m8, [r3 + 4 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1], m4

palignr m6, m9, m0, 14
palignr m7, m8, m9, 14
pmaddubsw m4, m6, [r3 + 10 * 32] ; [26]
pmulhrsw m4, m5
pmaddubsw m1, m7, [r3 + 10 * 32]
pmulhrsw m1, m5
packuswb m4, m1
movu [r0 + r1 * 2], m4

pand m6, [pw_00ff]
pand m7, [pw_00ff]
packuswb m6, m7
movu [r0 + r4], m6
RET

%endif ; ARCH_X86_64
;-----------------------------------------------------------------------------------------
; end of intra_pred_ang32 angular modes avx2 asm
;-----------------------------------------------------------------------------------------

;-----------------------------------------------------------------------------------------
; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------

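; NOTE (added): per the signature above and the cglobal convention used in
; this file, r0 = dst, r1 = dstStride and r2 = src. The kernels below read
; the above-row neighbours from around r2 + 1 and the left-column
; neighbours from around r2 + 16/17, with the corner sample at r2[0];
; this layout is inferred from the loads, not from a header.
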
INIT_YMM avx2
cglobal intra_pred_ang8_3, 3,4,5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 17]

pshufb m1, m0, [c_ang8_src1_9_2_10]
pshufb m2, m0, [c_ang8_src3_11_4_12]
pshufb m4, m0, [c_ang8_src5_13_5_13]
pshufb m0, [c_ang8_src6_14_7_15]

pmaddubsw m1, [c_ang8_26_20]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_14_8]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_2_28]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_22_16]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4

lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET

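; NOTE (added): modes N and 36-N share the same interpolation weights, so
; ang8_3/ang8_33 above and below (and the later pairs) differ only in that
; the horizontal variant reads the left neighbours and transposes the 8x8
; result through vperm2i128/punpck*/trans8_shuf, while the vertical
; variant stores its rows directly. An observation from the paired bodies,
; not a spec quote.
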
INIT_YMM avx2
cglobal intra_pred_ang8_33, 3,4,5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 1]

pshufb m1, m0, [c_ang8_src1_9_2_10]
pshufb m2, m0, [c_ang8_src3_11_4_12]
pshufb m4, m0, [c_ang8_src5_13_5_13]
pshufb m0, [c_ang8_src6_14_7_15]

pmaddubsw m1, [c_ang8_26_20]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_14_8]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_2_28]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_22_16]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_4, 3,4,5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 17]

pshufb m1, m0, [c_ang8_src1_9_2_10]
pshufb m2, m0, [c_ang8_src2_10_3_11]
pshufb m4, m0, [c_ang8_src4_12_4_12]
pshufb m0, [c_ang8_src5_13_6_14]

pmaddubsw m1, [c_ang8_21_10]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_31_20]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_9_30]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_19_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4

lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_32, 3,4,5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 1]

pshufb m1, m0, [c_ang8_src1_9_2_10]
pshufb m2, m0, [c_ang8_src2_10_3_11]
pshufb m4, m0, [c_ang8_src4_12_4_12]
pshufb m0, [c_ang8_src5_13_6_14]

pmaddubsw m1, [c_ang8_21_10]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_31_20]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_9_30]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_19_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_5, 3, 4, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 17]

pshufb m1, m0, [c_ang8_src1_9_2_10]
pshufb m2, m0, [c_ang8_src2_10_3_11]
pshufb m4, m0, [c_ang8_src3_11_4_12]
pshufb m0, [c_ang8_src4_12_5_13]

pmaddubsw m1, [c_ang8_17_2]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_19_4]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_21_6]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_23_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4

lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_31, 3, 4, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 1]

pshufb m1, m0, [c_ang8_src1_9_2_10]
pshufb m2, m0, [c_ang8_src2_10_3_11]
pshufb m4, m0, [c_ang8_src3_11_4_12]
pshufb m0, [c_ang8_src4_12_5_13]

pmaddubsw m1, [c_ang8_17_2]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_19_4]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_21_6]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_23_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_6, 3, 4, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 17]

pshufb m1, m0, [intra_pred_shuff_0_8]
pshufb m2, m0, [c_ang8_src2_10_2_10]
pshufb m4, m0, [c_ang8_src3_11_3_11]
pshufb m0, [c_ang8_src3_11_4_12]

pmaddubsw m1, [c_ang8_13_26]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_7_20]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_1_14]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_27_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4

lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_30, 3, 4, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 1]

pshufb m1, m0, [intra_pred_shuff_0_8]
pshufb m2, m0, [c_ang8_src2_10_2_10]
pshufb m4, m0, [c_ang8_src3_11_3_11]
pshufb m0, [c_ang8_src3_11_4_12]

pmaddubsw m1, [c_ang8_13_26]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_7_20]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_1_14]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_27_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET

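; NOTE (added): c_ang8_mode_27 (and the similar c_ang8_mode_* tables used
; below) appears to hold four 32-byte rows of (32-f, f) weight pairs, two
; output rows per mmsize entry, which is why each kernel issues four
; pmaddubsw against [r4], [r4 + mmsize], [r4 + 2 * mmsize] and
; [r4 + 3 * mmsize]. Inferred from the usage sites; the tables themselves
; are defined elsewhere in this file.
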
INIT_YMM avx2
cglobal intra_pred_ang8_9, 3, 5, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 17]

pshufb m0, [intra_pred_shuff_0_8]

lea r4, [c_ang8_mode_27]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4

lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_27, 3, 5, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 1]

pshufb m0, [intra_pred_shuff_0_8]

lea r4, [c_ang8_mode_27]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_25, 3, 5, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2]

pshufb m0, [intra_pred_shuff_0_8]

lea r4, [c_ang8_mode_25]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_7, 3, 4, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 17]

pshufb m1, m0, [intra_pred_shuff_0_8]
pshufb m2, m0, [c_ang8_src1_9_2_10]
pshufb m4, m0, [c_ang8_src2_10_2_10]
pshufb m0, [c_ang8_src2_10_3_11]

pmaddubsw m1, [c_ang8_9_18]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_27_4]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_13_22]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_31_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4

lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_29, 3, 4, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 1]

pshufb m1, m0, [intra_pred_shuff_0_8]
pshufb m2, m0, [c_ang8_src1_9_2_10]
pshufb m4, m0, [c_ang8_src2_10_2_10]
pshufb m0, [c_ang8_src2_10_3_11]

pmaddubsw m1, [c_ang8_9_18]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_27_4]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_13_22]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_31_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET

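; NOTE (added): pmaddubsw treats its first source as unsigned bytes and
; its second as signed bytes, which is why the pixel data always sits in
; the register operand and the fractional weights come from the constant
; tables; with weights no larger than 32 the sums stay well inside signed
; 16-bit range.
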
INIT_YMM avx2
cglobal intra_pred_ang8_8, 3, 4, 6
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 17]
mova m5, [intra_pred_shuff_0_8]

pshufb m1, m0, m5
pshufb m2, m0, m5
pshufb m4, m0, m5
pshufb m0, [c_ang8_src2_10_2_10]

pmaddubsw m1, [c_ang8_5_10]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_15_20]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_25_30]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_3_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4

lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_28, 3, 4, 6
mova m3, [pw_1024]
vbroadcasti128 m0, [r2 + 1]
mova m5, [intra_pred_shuff_0_8]

pshufb m1, m0, m5
pshufb m2, m0, m5
pshufb m4, m0, m5
pshufb m0, [c_ang8_src2_10_2_10]

pmaddubsw m1, [c_ang8_5_10]
pmulhrsw m1, m3
pmaddubsw m2, [c_ang8_15_20]
pmulhrsw m2, m3
pmaddubsw m4, [c_ang8_25_30]
pmulhrsw m4, m3
pmaddubsw m0, [c_ang8_3_8]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_11, 3, 5, 5
mova m3, [pw_1024]
movu xm1, [r2 + 16]
pinsrb xm1, [r2], 0
pshufb xm1, [intra_pred_shuff_0_8]
vinserti128 m0, m1, xm1, 1

lea r4, [c_ang8_mode_25]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4

lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_15, 3, 6, 6
mova m3, [pw_1024]
movu xm5, [r2 + 16]
pinsrb xm5, [r2], 0
lea r5, [intra_pred_shuff_0_8]
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 2], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]

lea r4, [c_ang8_mode_15]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 4], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 6], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 8], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4

lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_16, 3,4,7
lea r0, [r0 + r1 * 8]
sub r0, r1
neg r1
lea r3, [r1 * 3]
vbroadcasti128 m0, [angHor8_tab_16] ; m0 = factor
mova m1, [intra_pred8_shuff16] ; m1 = 4 of Row shuffle
movu m2, [intra_pred8_shuff16 + 8] ; m2 = 4 of Row shuffle

; prepare reference pixel
movq xm3, [r2 + 16 + 1] ; m3 = [-1 -2 -3 -4 -5 -6 -7 -8 x x x x x x x x]
movhps xm3, [r2 + 2] ; m3 = [-1 -2 -3 -4 -5 -6 -7 -8 2 3 x 5 6 x 8 x]
pslldq xm3, 1
pinsrb xm3, [r2], 0 ; m3 = [ 0 -1 -2 -3 -4 -5 -6 -7 -8 2 3 x 5 6 x 8]
pshufb xm3, [c_ang8_mode_16]
vinserti128 m3, m3, xm3, 1 ; m3 = [-8 -7 -6 -5 -4 -3 -2 -1 0 2 3 5 6 8]

; process 4 rows
pshufb m4, m3, m1
pshufb m5, m3, m2
psrldq m3, 4
punpcklbw m6, m5, m4
punpckhbw m5, m4
pmaddubsw m6, m0
pmulhrsw m6, [pw_1024]
pmaddubsw m5, m0
pmulhrsw m5, [pw_1024]
packuswb m6, m5
vextracti128 xm5, m6, 1
movq [r0], xm6
movhps [r0 + r1], xm6
movq [r0 + r1 * 2], xm5
movhps [r0 + r3], xm5

; process 4 rows
lea r0, [r0 + r1 * 4]
pshufb m4, m3, m1
pshufb m5, m3, m2
punpcklbw m6, m5, m4
punpckhbw m5, m4
pmaddubsw m6, m0
pmulhrsw m6, [pw_1024]
pmaddubsw m5, m0
pmulhrsw m5, [pw_1024]
packuswb m6, m5
vextracti128 xm5, m6, 1
movq [r0], xm6
movhps [r0 + r1], xm6
movq [r0 + r1 * 2], xm5
movhps [r0 + r3], xm5
RET

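; NOTE (added): ang8_16 above and ang8_20 below pre-advance dst by seven
; rows and negate the stride (lea/sub/neg prologue), so the same top-down
; store pattern emits the block bottom-up; an observation on the
; addressing, not a behavioural change.
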
%if 1
INIT_YMM avx2
cglobal intra_pred_ang8_20, 3,5,6
lea r0, [r0 + r1 * 8]
sub r0, r1
neg r1
lea r3, [angHor8_tab_20]
lea r4, [r1 * 3]
movu m5, [intra_pred_shuff_0_8 + 16]

; prepare reference pixel
movq xm1, [r2 + 1] ; m1 = [ 1 2 3 4 5 6 7 8 x x x x x x x x]
movhps xm1, [r2 + 16 + 2] ; m1 = [ 1 2 3 4 5 6 7 8 -2 -3 x -5 -6 x -8 x]
palignr xm1, xm1, [r2 - 15], 15 ; m1 = [ 0 1 2 3 4 5 6 7 8 -2 -3 x -5 -6 x -8]
pshufb xm1, [c_ang8_mode_20]
vinserti128 m1, m1, xm1, 1

; process 4 rows
pshufb m3, m1, m5
psrldq m1, 2
pmaddubsw m3, [r3 + 0 * 16]
pmulhrsw m3, [pw_1024]

pshufb m4, m1, [intra_pred_shuff_0_8]
psrldq m1, 1
pmaddubsw m4, [r3 + 2 * 16]
pmulhrsw m4, [pw_1024]

packuswb m3, m4
vextracti128 xm4, m3, 1
movq [r0], xm3
movq [r0 + r1], xm4
movhps [r0 + r1 * 2], xm3
movhps [r0 + r4], xm4

; process 4 rows
lea r0, [r0 + r1 * 4]
pshufb m3, m1, m5
psrldq m1, 1
pmaddubsw m3, [r3 + 4 * 16]
pmulhrsw m3, [pw_1024]

pshufb m4, m1, m5
pmaddubsw m4, [r3 + 6 * 16]
pmulhrsw m4, [pw_1024]

packuswb m3, m4
vextracti128 xm4, m3, 1
movq [r0], xm3
movq [r0 + r1], xm4
movhps [r0 + r1 * 2], xm3
movhps [r0 + r4], xm4
RET

%else
INIT_YMM avx2
cglobal intra_pred_ang8_20, 3, 6, 6
mova m3, [pw_1024]
movu xm5, [r2]
lea r5, [intra_pred_shuff_0_8]
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 2 + 16], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]

lea r4, [c_ang8_mode_20]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 3 + 16], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
pslldq xm5, 1
pinsrb xm5, [r2 + 5 + 16], 0
vinserti128 m0, m5, xm5, 1
pshufb m0, [r5]
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
pslldq xm5, 1
pinsrb xm5, [r2 + 6 + 16], 0
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 8 + 16], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3

packuswb m1, m2
packuswb m4, m0

lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET
%endif

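; NOTE (added): the %if 1 above selects the table-driven ang8_20; the
; %else branch looks like an older per-row variant retained but disabled.
; "Retained for reference" is a guess; the guard simply compiles it out.
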
INIT_YMM avx2
cglobal intra_pred_ang8_21, 3, 6, 6
mova m3, [pw_1024]
movu xm5, [r2]
lea r5, [intra_pred_shuff_0_8]
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 2 + 16], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]

lea r4, [c_ang8_mode_15]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 4 + 16], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 6 + 16], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
mova xm0, xm5
pslldq xm5, 1
pinsrb xm5, [r2 + 8 + 16], 0
vinserti128 m0, m0, xm5, 1
pshufb m0, [r5]
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_22, 3, 6, 6
mova m3, [pw_1024]
movu xm5, [r2]
lea r5, [intra_pred_shuff_0_8]
vinserti128 m0, m5, xm5, 1
pshufb m0, [r5]

lea r4, [c_ang8_mode_14]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
pslldq xm5, 1
pinsrb xm5, [r2 + 2 + 16], 0
vinserti128 m0, m5, xm5, 1
pshufb m0, [r5]
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
pslldq xm5, 1
pinsrb xm5, [r2 + 5 + 16], 0
vinserti128 m0, m5, xm5, 1
pshufb m0, [r5]
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
pslldq xm5, 1
pinsrb xm5, [r2 + 7 + 16], 0
pshufb xm5, [r5]
vinserti128 m0, m0, xm5, 1
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_14, 3, 6, 6
mova m3, [pw_1024]
movu xm5, [r2 + 16]
pinsrb xm5, [r2], 0
lea r5, [intra_pred_shuff_0_8]
vinserti128 m0, m5, xm5, 1
pshufb m0, [r5]

lea r4, [c_ang8_mode_14]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
pslldq xm5, 1
pinsrb xm5, [r2 + 2], 0
vinserti128 m0, m5, xm5, 1
pshufb m0, [r5]
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
pslldq xm5, 1
pinsrb xm5, [r2 + 5], 0
vinserti128 m0, m5, xm5, 1
pshufb m0, [r5]
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
pslldq xm5, 1
pinsrb xm5, [r2 + 7], 0
pshufb xm5, [r5]
vinserti128 m0, m0, xm5, 1
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4

lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_13, 3, 6, 6
mova m3, [pw_1024]
movu xm5, [r2 + 16]
pinsrb xm5, [r2], 0
lea r5, [intra_pred_shuff_0_8]
vinserti128 m0, m5, xm5, 1
pshufb m0, [r5]

lea r4, [c_ang8_mode_13]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
pslldq xm5, 1
pinsrb xm5, [r2 + 4], 0
pshufb xm4, xm5, [r5]
vinserti128 m0, m0, xm4, 1
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
vinserti128 m0, m0, xm4, 0
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
pslldq xm5, 1
pinsrb xm5, [r2 + 7], 0
pshufb xm5, [r5]
vinserti128 m0, m0, xm5, 1
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4

lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_23, 3, 6, 6
mova m3, [pw_1024]
movu xm5, [r2]
lea r5, [intra_pred_shuff_0_8]
vinserti128 m0, m5, xm5, 1
pshufb m0, [r5]

lea r4, [c_ang8_mode_13]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
pslldq xm5, 1
pinsrb xm5, [r2 + 4 + 16], 0
pshufb xm4, xm5, [r5]
vinserti128 m0, m0, xm4, 1
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
vinserti128 m0, m0, xm4, 0
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
pslldq xm5, 1
pinsrb xm5, [r2 + 7 + 16], 0
pshufb xm5, [r5]
vinserti128 m0, m0, xm5, 1
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3

packuswb m1, m2
packuswb m4, m0

lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_12, 3, 5, 5
mova m3, [pw_1024]
movu xm1, [r2 + 16]
pinsrb xm1, [r2], 0
pshufb xm1, [intra_pred_shuff_0_8]
vinserti128 m0, m1, xm1, 1

lea r4, [c_ang8_mode_24]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
pslldq xm0, 2
pinsrb xm0, [r2 + 6], 0
pinsrb xm0, [r2 + 0], 1
vinserti128 m0, m0, xm0, 1
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

vperm2i128 m2, m1, m4, 00100000b
vperm2i128 m1, m1, m4, 00110001b
punpcklbw m4, m2, m1
punpckhbw m2, m1
punpcklwd m1, m4, m2
punpckhwd m4, m2
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4

lea r3, [3 * r1]
movq [r0], xm1
movhps [r0 + r1], xm1
vextracti128 xm2, m1, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
movhps [r0 + r1], xm4
vextracti128 xm2, m4, 1
movq [r0 + 2 * r1], xm2
movhps [r0 + r3], xm2
RET

INIT_YMM avx2
cglobal intra_pred_ang8_24, 3, 5, 5
mova m3, [pw_1024]
vbroadcasti128 m0, [r2]

pshufb m0, [intra_pred_shuff_0_8]

lea r4, [c_ang8_mode_24]
pmaddubsw m1, m0, [r4]
pmulhrsw m1, m3
pmaddubsw m2, m0, [r4 + mmsize]
pmulhrsw m2, m3
pmaddubsw m4, m0, [r4 + 2 * mmsize]
pmulhrsw m4, m3
pslldq xm0, 2
pinsrb xm0, [r2 + 16 + 6], 0
pinsrb xm0, [r2 + 0], 1
vinserti128 m0, m0, xm0, 1
pmaddubsw m0, [r4 + 3 * mmsize]
pmulhrsw m0, m3
packuswb m1, m2
packuswb m4, m0

lea r3, [3 * r1]
movq [r0], xm1
vextracti128 xm2, m1, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm1
movhps [r0 + r3], xm2
lea r0, [r0 + 4 * r1]
movq [r0], xm4
vextracti128 xm2, m4, 1
movq [r0 + r1], xm2
movhps [r0 + 2 * r1], xm4
movhps [r0 + r3], xm2
RET

%macro INTRA_PRED_ANG16_MC0 3
pmaddubsw m3, m1, [r4 + %3 * mmsize]
pmulhrsw m3, m0
pmaddubsw m4, m2, [r4 + %3 * mmsize]
pmulhrsw m4, m0
packuswb m3, m4
movu [%1], xm3
vextracti128 xm4, m3, 1
movu [%2], xm4
%endmacro

%macro INTRA_PRED_ANG16_MC1 1
INTRA_PRED_ANG16_MC0 r0, r0 + r1, %1
INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, (%1 + 1)
%endmacro

%macro INTRA_PRED_ANG16_MC2 1
vbroadcasti128 m1, [r2 + %1]
pshufb m1, m5
vbroadcasti128 m2, [r2 + (%1 + 8)]
pshufb m2, m5
%endmacro

%macro INTRA_PRED_ANG16_MC3 2
vperm2i128 m1, m1, m2, 00100000b
pmaddubsw m3, m1, [r4 + (%2 * mmsize)]
pmulhrsw m3, m0
packuswb m3, m3
vpermq m3, m3, 11011000b
movu [%1], xm3
%endmacro

%macro INTRA_PRED_ANG16_MC4 3
vperm2i128 m1, m1, m2, 00100000b
pmaddubsw m4, m1, [r4 + (%3 * mmsize)]
pmulhrsw m4, m0
packuswb m3, m4
vpermq m3, m3, 11011000b
movu [%1], xm3
vextracti128 xm3, m3, 1
movu [%2], xm3
%endmacro

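; NOTE (added): in the INTRA_PRED_ANG16_MC* helpers above, %1/%2 are the
; destination row addresses and %3 indexes the 32-byte coefficient row at
; r4; MC2's %1 is a source offset into r2. Read from the expansion sites,
; not from any separate documentation.
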
%if ARCH_X86_64 == 1
%macro INTRA_PRED_TRANS_STORE_16x16 0
punpcklbw m8, m0, m1
punpckhbw m0, m1

punpcklbw m1, m2, m3
punpckhbw m2, m3

punpcklbw m3, m4, m5
punpckhbw m4, m5

punpcklbw m5, m6, m7
punpckhbw m6, m7

punpcklwd m7, m8, m1
punpckhwd m8, m1

punpcklwd m1, m3, m5
punpckhwd m3, m5

punpcklwd m5, m0, m2
punpckhwd m0, m2

punpcklwd m2, m4, m6
punpckhwd m4, m6

punpckldq m6, m7, m1
punpckhdq m7, m1

punpckldq m1, m8, m3
punpckhdq m8, m3

punpckldq m3, m5, m2
punpckhdq m5, m2

punpckldq m2, m0, m4
punpckhdq m0, m4

vpermq m6, m6, 0xD8
vpermq m7, m7, 0xD8
vpermq m1, m1, 0xD8
vpermq m8, m8, 0xD8
vpermq m3, m3, 0xD8
vpermq m5, m5, 0xD8
vpermq m2, m2, 0xD8
vpermq m0, m0, 0xD8

movu [r0], xm6
vextracti128 xm4, m6, 1
movu [r0 + r1], xm4

movu [r0 + 2 * r1], xm7
vextracti128 xm4, m7, 1
movu [r0 + r3], xm4

lea r0, [r0 + 4 * r1]

movu [r0], xm1
vextracti128 xm4, m1, 1
movu [r0 + r1], xm4

movu [r0 + 2 * r1], xm8
vextracti128 xm4, m8, 1
movu [r0 + r3], xm4

lea r0, [r0 + 4 * r1]

movu [r0], xm3
vextracti128 xm4, m3, 1
movu [r0 + r1], xm4

movu [r0 + 2 * r1], xm5
vextracti128 xm4, m5, 1
movu [r0 + r3], xm4

lea r0, [r0 + 4 * r1]

movu [r0], xm2
vextracti128 xm4, m2, 1
movu [r0 + r1], xm4

movu [r0 + 2 * r1], xm0
vextracti128 xm4, m0, 1
movu [r0 + r3], xm4
%endmacro

%macro INTRA_PRED_ANG16_CAL_ROW 3
pmaddubsw %1, m9, [r4 + (%3 * mmsize)]
pmulhrsw %1, m11
pmaddubsw %2, m10, [r4 + (%3 * mmsize)]
pmulhrsw %2, m11
packuswb %1, %2
%endmacro

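; NOTE (added): INTRA_PRED_ANG16_CAL_ROW filters the two reference halves
; held in m9/m10 with coefficient row %3 into %1/%2, and
; INTRA_PRED_TRANS_STORE_16x16 transposes the sixteen packed rows in
; m0-m8 and streams them out four rows at a time; both descriptions are
; inferred from the macro bodies above.
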
INIT_YMM avx2
cglobal intra_pred_ang16_12, 3,4,9
vbroadcasti128 m0, [angHor_tab_12]
vbroadcasti128 m1, [angHor_tab_12 + mmsize/2]
mova m2, [pw_1024]
mova m7, [ang16_shuf_mode12]
mova m8, [ang16_shuf_mode12 + mmsize]
lea r3, [r1 * 3]

movu xm4, [r2 + mmsize - 2]
pinsrb xm4, [r2 + 0], 2
pinsrb xm4, [r2 + 6], 1
pinsrb xm4, [r2 + 13], 0
vbroadcasti128 m6, [r2 + mmsize + 14]
vinserti128 m3, m4, xm4, 1

pshufb m4, m3, m7
pshufb m5, m3, m8
pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1

palignr m5, m6, m3, 2
pshufb m4, m5, m7
pshufb m5, m8

pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]

palignr m5, m6, m3, 4
pshufb m4, m5, m7
pshufb m5, m8

pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1

palignr m5, m6, m3, 6
pshufb m4, m5, m7
pshufb m5, m8

pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]

palignr m5, m6, m3, 8
pshufb m4, m5, m7
pshufb m5, m8

pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1

palignr m5, m6, m3, 10
pshufb m4, m5, m7
pshufb m5, m8

pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
lea r0, [r0 + r1 * 4]

palignr m5, m6, m3, 12
pshufb m4, m5, m7
pshufb m5, m8

pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0], xm4
vextracti128 [r0 + r1], m4, 1

palignr m5, m6, m3, 14
pshufb m4, m5, m7
pshufb m5, m8

pmaddubsw m4, m0
pmaddubsw m5, m1
pmulhrsw m4, m2
pmulhrsw m5, m2
packuswb m4, m5
movu [r0 + r1 * 2], xm4
vextracti128 [r0 + r3], m4, 1
RET

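; NOTE (added): in these ang16_* kernels each palignr m5, m6, m3, 2*k
; step slides the reference window by one pixel and yields two output
; rows (the low and high lanes of m4), so eight steps cover all sixteen
; rows; an observation on the unrolled structure, not a spec statement.
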
INIT_YMM avx2
|
|
cglobal intra_pred_ang16_13, 3,4,9
|
|
vbroadcasti128 m0, [angHor_tab_13]
|
|
vbroadcasti128 m1, [angHor_tab_13 + mmsize/2]
|
|
mova m2, [pw_1024]
|
|
mova m7, [ang16_shuf_mode13]
|
|
mova m8, [ang16_shuf_mode13 + mmsize]
|
|
lea r3, [r1 * 3]
|
|
|
|
vbroadcasti128 m3, [r2 + mmsize + 1]
|
|
vbroadcasti128 m4, [r2]
|
|
pshufb m4, [ang16_shuf_mode13 + mmsize * 2]
|
|
|
|
palignr m3, m4, 11
|
|
vbroadcasti128 m6, [r2 + mmsize + 12]
|
|
|
|
pshufb m4, m3, m7
|
|
pshufb m5, m3, m8
|
|
pmaddubsw m4, m0
|
|
pmaddubsw m5, m1
|
|
pmulhrsw m4, m2
|
|
pmulhrsw m5, m2
|
|
packuswb m4, m5
|
|
movu [r0], xm4
|
|
vextracti128 [r0 + r1], m4, 1
|
|
|
|
palignr m5, m6, m3, 2
|
|
pshufb m4, m5, m7
|
|
pshufb m5, m8
|
|
|
|
pmaddubsw m4, m0
|
|
pmaddubsw m5, m1
|
|
pmulhrsw m4, m2
|
|
pmulhrsw m5, m2
|
|
packuswb m4, m5
|
|
movu [r0 + r1 * 2], xm4
|
|
vextracti128 [r0 + r3], m4, 1
|
|
lea r0, [r0 + r1 * 4]
|
|
|
|
palignr m5, m6, m3, 4
|
|
pshufb m4, m5, m7
|
|
pshufb m5, m8
|
|
|
|
pmaddubsw m4, m0
|
|
pmaddubsw m5, m1
|
|
pmulhrsw m4, m2
|
|
pmulhrsw m5, m2
|
|
packuswb m4, m5
|
|
movu [r0], xm4
|
|
vextracti128 [r0 + r1], m4, 1
|
|
|
|
palignr m5, m6, m3, 6
|
|
pshufb m4, m5, m7
|
|
pshufb m5, m8
|
|
|
|
pmaddubsw m4, m0
|
|
pmaddubsw m5, m1
|
|
pmulhrsw m4, m2
|
|
pmulhrsw m5, m2
|
|
packuswb m4, m5
|
|
movu [r0 + r1 * 2], xm4
|
|
vextracti128 [r0 + r3], m4, 1
|
|
lea r0, [r0 + r1 * 4]
|
|
|
|
palignr m5, m6, m3, 8
|
|
pshufb m4, m5, m7
|
|
pshufb m5, m8
|
|
|
|
pmaddubsw m4, m0
|
|
pmaddubsw m5, m1
|
|
pmulhrsw m4, m2
|
|
pmulhrsw m5, m2
|
|
packuswb m4, m5
|
|
movu [r0], xm4
|
|
vextracti128 [r0 + r1], m4, 1
|
|
|
|
palignr m5, m6, m3, 10
|
|
pshufb m4, m5, m7
|
|
pshufb m5, m8
|
|
|
|
pmaddubsw m4, m0
|
|
pmaddubsw m5, m1
|
|
pmulhrsw m4, m2
|
|
pmulhrsw m5, m2
|
|
packuswb m4, m5
|
|
movu [r0 + r1 * 2], xm4
|
|
vextracti128 [r0 + r3], m4, 1
|
|
lea r0, [r0 + r1 * 4]
|
|
|
|
palignr m5, m6, m3, 12
|
|
pshufb m4, m5, m7
|
|
pshufb m5, m8
|
|
|
|
pmaddubsw m4, m0
|
|
pmaddubsw m5, m1
|
|
pmulhrsw m4, m2
|
|
pmulhrsw m5, m2
|
|
packuswb m4, m5
|
|
movu [r0], xm4
|
|
vextracti128 [r0 + r1], m4, 1
|
|
|
|
palignr m5, m6, m3, 14
|
|
pshufb m4, m5, m7
|
|
pshufb m5, m8
|
|
|
|
pmaddubsw m4, m0
|
|
pmaddubsw m5, m1
|
|
pmulhrsw m4, m2
|
|
pmulhrsw m5, m2
|
|
packuswb m4, m5
|
|
movu [r0 + r1 * 2], xm4
|
|
vextracti128 [r0 + r3], m4, 1
|
|
RET
|
|
|
|
INIT_YMM avx2
|
|
cglobal intra_pred_ang16_14, 3,4,9
|
|
vbroadcasti128 m0, [angHor_tab_14]
|
|
vbroadcasti128 m1, [angHor_tab_14 + mmsize/2]
|
|
mova m2, [pw_1024]
|
|
mova m7, [ang16_shuf_mode14]
|
|
mova m8, [ang16_shuf_mode14 + mmsize]
|
|
lea r3, [r1 * 3]
|
|
|
|
vbroadcasti128 m3, [r2 + mmsize + 1]
|
|
vbroadcasti128 m4, [r2]
|
|
pshufb m4, [ang16_shuf_mode14 + mmsize * 2]
|
|
palignr m3, m4, 9
|
|
vbroadcasti128 m6, [r2 + mmsize + 10]
|
|
|
|
pshufb m4, m3, m7
|
|
pshufb m5, m3, m8
|
|
pmaddubsw m4, m0
|
|
pmaddubsw m5, m1
|
|
pmulhrsw m4, m2
|
|
pmulhrsw m5, m2
|
|
packuswb m4, m5
|
|
movu [r0], xm4
|
|
vextracti128 [r0 + r1], m4, 1
|
|
|
|
palignr m5, m6, m3, 2
|
|
pshufb m4, m5, m7
|
|
pshufb m5, m8
|
|
|
|
pmaddubsw m4, m0
|
|
pmaddubsw m5, m1
|
|
pmulhrsw m4, m2
|
|
pmulhrsw m5, m2
|
|
packuswb m4, m5
|
|
movu [r0 + r1 * 2], xm4
|
|
vextracti128 [r0 + r3], m4, 1
|
|
lea r0, [r0 + r1 * 4]
|
|
|
|
palignr m5, m6, m3, 4
|
|
pshufb m4, m5, m7
|
|
pshufb m5, m8
|
|
|
|
pmaddubsw m4, m0
|
|
pmaddubsw m5, m1
|
|
pmulhrsw m4, m2
|
|
pmulhrsw m5, m2
|
|
packuswb m4, m5
|
|
movu [r0], xm4
|
|
vextracti128 [r0 + r1], m4, 1
|
|
|
|
palignr m5, m6, m3, 6
|
|
pshufb m4, m5, m7
|
|
pshufb m5, m8
|
|
|
|
pmaddubsw m4, m0
|
|
pmaddubsw m5, m1
|
|
pmulhrsw m4, m2
|
|
pmulhrsw m5, m2
|
|
packuswb m4, m5
|
|
movu [r0 + r1 * 2], xm4
|
|
vextracti128 [r0 + r3], m4, 1
|
|
lea r0, [r0 + r1 * 4]
|
|
|
|
palignr m5, m6, m3, 8
|
|
pshufb m4, m5, m7
|
|
pshufb m5, m8
|
|
|
|
pmaddubsw m4, m0
|
|
pmaddubsw m5, m1
|
|
pmulhrsw m4, m2
|
|
pmulhrsw m5, m2
|
|
packuswb m4, m5
|
|
movu [r0], xm4
|
|
vextracti128 [r0 + r1], m4, 1
|
|
|
|
palignr m5, m6, m3, 10
|
|
pshufb m4, m5, m7
|
|
pshufb m5, m8
|
|
|
|
pmaddubsw m4, m0
|
|
pmaddubsw m5, m1
|
|
pmulhrsw m4, m2
|
|
pmulhrsw m5, m2
|
|
packuswb m4, m5
|
|
movu [r0 + r1 * 2], xm4
|
|
vextracti128 [r0 + r3], m4, 1
|
|
lea r0, [r0 + r1 * 4]
|
|
|
|
palignr m5, m6, m3, 12
|
|
pshufb m4, m5, m7
|
|
pshufb m5, m8
|
|
|
|
pmaddubsw m4, m0
|
|
pmaddubsw m5, m1
|
|
pmulhrsw m4, m2
|
|
pmulhrsw m5, m2
|
|
packuswb m4, m5
|
|
movu [r0], xm4
|
|
vextracti128 [r0 + r1], m4, 1
|
|
|
|
palignr m5, m6, m3, 14
|
|
pshufb m4, m5, m7
|
|
pshufb m5, m8
|
|
|
|
pmaddubsw m4, m0
|
|
pmaddubsw m5, m1
|
|
pmulhrsw m4, m2
|
|
pmulhrsw m5, m2
|
|
packuswb m4, m5
|
|
movu [r0 + r1 * 2], xm4
|
|
vextracti128 [r0 + r3], m4, 1
|
|
RET

INIT_YMM avx2
cglobal intra_pred_ang16_15, 3,4,9
    vbroadcasti128  m0, [angHor_tab_15]
    vbroadcasti128  m1, [angHor_tab_15 + mmsize/2]
    mova            m2, [pw_1024]
    mova            m7, [ang16_shuf_mode15]
    mova            m8, [ang16_shuf_mode15 + mmsize]
    lea             r3, [r1 * 3]

    vbroadcasti128  m3, [r2 + mmsize + 1]
    vbroadcasti128  m4, [r2]
    pshufb          m4, [ang16_shuf_mode15 + mmsize * 2]
    palignr         m3, m3, m4, 7
    vbroadcasti128  m6, [r2 + mmsize + 8]

    pshufb          m4, m3, m7
    pshufb          m5, m3, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 2
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 4
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 6
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 8
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 10
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 12
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 14
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_16, 3,4,9
    vbroadcasti128  m0, [angHor_tab_16]
    vbroadcasti128  m1, [angHor_tab_16 + mmsize/2]
    mova            m2, [pw_1024]
    mova            m7, [ang16_shuf_mode16]
    mova            m8, [ang16_shuf_mode16 + mmsize]
    lea             r3, [r1 * 3]

    vbroadcasti128  m3, [r2 + mmsize + 1]
    vbroadcasti128  m4, [r2]
    pshufb          m4, [ang16_shuf_mode16 + mmsize * 2]
    palignr         m3, m4, 5
    vbroadcasti128  m6, [r2 + mmsize + 6]

    pshufb          m4, m3, m7
    pshufb          m5, m3, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 2
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 4
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 6
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 8
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 10
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 12
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 14
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_17, 3,4,9
    vbroadcasti128  m0, [angHor_tab_17]
    vbroadcasti128  m1, [angHor_tab_17 + mmsize/2]
    mova            m2, [pw_1024]
    mova            m7, [ang16_shuf_mode17]
    mova            m8, [ang16_shuf_mode17 + mmsize]
    lea             r3, [r1 * 3]

    vbroadcasti128  m3, [r2 + mmsize + 1]
    vbroadcasti128  m4, [r2]
    pshufb          m4, [ang16_shuf_mode17 + mmsize * 2]
    palignr         m3, m4, 3
    vbroadcasti128  m6, [r2 + mmsize + 4]

    pshufb          m4, m3, m7
    pshufb          m5, m3, m8
    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 2
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 4
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 6
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 8
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 10
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 12
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 14
    pshufb          m4, m5, m7
    pshufb          m5, m8

    pmaddubsw       m4, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_11, 3,4,8
    vbroadcasti128  m0, [angHor_tab_11]
    vbroadcasti128  m1, [angHor_tab_11 + mmsize/2]
    mova            m2, [pw_1024]
    mova            m7, [ang32_shuf_mode9]
    lea             r3, [r1 * 3]

    ; prepare for [0 -1 -2...]

    movu            xm3, [r2 + mmsize]
    pinsrb          xm3, [r2], 0
    vbroadcasti128  m6, [r2 + mmsize + 16]
    vinserti128     m3, m3, xm3, 1

    pshufb          m5, m3, m7          ; [ 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2]
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 2
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 4
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 6
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 8
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 10
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 12
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 14
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    RET

; transpose 8x32 to 16x16, used for intra_ang16x16 avx2 asm
%if ARCH_X86_64 == 1
INIT_YMM avx2
%macro TRANSPOSE_STORE_8x32 12
    jc              .skip

    punpcklbw       m%9, m%1, m%2
    punpckhbw       m%1, m%2
    punpcklbw       m%10, m%3, m%4
    punpckhbw       m%3, m%4

    punpcklwd       m%11, m%9, m%10
    punpckhwd       m%9, m%10
    punpcklwd       m%10, m%1, m%3
    punpckhwd       m%1, m%3

    punpckldq       m%12, m%11, m%10
    punpckhdq       m%11, m%10
    punpckldq       m%10, m%9, m%1
    punpckhdq       m%9, m%1

    punpcklbw       m%1, m%5, m%6
    punpckhbw       m%5, m%6
    punpcklbw       m%2, m%7, m%8
    punpckhbw       m%7, m%8

    punpcklwd       m%3, m%1, m%2
    punpckhwd       m%1, m%2
    punpcklwd       m%4, m%5, m%7
    punpckhwd       m%5, m%7

    punpckldq       m%2, m%3, m%4
    punpckhdq       m%3, m%4
    punpckldq       m%4, m%1, m%5
    punpckhdq       m%1, m%5

    punpckldq       m%5, m%12, m%2
    punpckhdq       m%6, m%12, m%2
    punpckldq       m%7, m%10, m%4
    punpckhdq       m%8, m%10, m%4

    punpckldq       m%2, m%11, m%3
    punpckhdq       m%11, m%11, m%3
    punpckldq       m%4, m%9, m%1
    punpckhdq       m%9, m%9, m%1

    movu            [r0 + r1 * 0], xm%5
    movu            [r0 + r1 * 1], xm%6
    movu            [r0 + r1 * 2], xm%2
    movu            [r0 + r5 * 1], xm%11

    add             r0, r6

    movu            [r0 + r1 * 0], xm%7
    movu            [r0 + r1 * 1], xm%8
    movu            [r0 + r1 * 2], xm%4
    movu            [r0 + r5 * 1], xm%9

    add             r0, r6

    vextracti128    [r0 + r1 * 0], m%5, 1
    vextracti128    [r0 + r1 * 1], m%6, 1
    vextracti128    [r0 + r1 * 2], m%2, 1
    vextracti128    [r0 + r5 * 1], m%11, 1

    add             r0, r6

    vextracti128    [r0 + r1 * 0], m%7, 1
    vextracti128    [r0 + r1 * 1], m%8, 1
    vextracti128    [r0 + r1 * 2], m%4, 1
    vextracti128    [r0 + r5 * 1], m%9, 1
    jmp             .end

.skip:
    vpermq          m%1, m%1, q3120
    vpermq          m%2, m%2, q3120
    vpermq          m%3, m%3, q3120
    vpermq          m%4, m%4, q3120
    vpermq          m%5, m%5, q3120
    vpermq          m%6, m%6, q3120
    vpermq          m%7, m%7, q3120
    vpermq          m%8, m%8, q3120

    movu            [r0 + r1 * 0], xm%1
    movu            [r0 + r1 * 1], xm%2
    movu            [r0 + r1 * 2], xm%3
    movu            [r0 + r5 * 1], xm%4

    add             r0, r6

    movu            [r0 + r1 * 0], xm%5
    movu            [r0 + r1 * 1], xm%6
    movu            [r0 + r1 * 2], xm%7
    movu            [r0 + r5 * 1], xm%8

    add             r0, r6

    vextracti128    [r0 + r1 * 0], m%1, 1
    vextracti128    [r0 + r1 * 1], m%2, 1
    vextracti128    [r0 + r1 * 2], m%3, 1
    vextracti128    [r0 + r5 * 1], m%4, 1

    add             r0, r6

    vextracti128    [r0 + r1 * 0], m%5, 1
    vextracti128    [r0 + r1 * 1], m%6, 1
    vextracti128    [r0 + r1 * 2], m%7, 1
    vextracti128    [r0 + r5 * 1], m%8, 1
.end:
%endmacro
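
; Callers communicate the store path through the carry flag: 'clc' before the
; call (the horizontal modes 3 to 8) takes the transposing path above, while
; 'stc' (the vertical modes 32 and 33) makes 'jc .skip' branch to the direct
; store path, so one shared prediction kernel serves both orientations.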

cglobal ang16_mode_3_33
    ; rows 0 to 7
    movu            m0, [r2 + 1]            ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu            m1, [r2 + 2]            ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpckhbw       m2, m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw       m0, m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    vextracti128    xm1, m0, 1
    vperm2i128      m0, m0, m2, 0x20        ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    vperm2i128      m2, m2, m1, 0x20        ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]

    pmaddubsw       m4, m0, [r3 + 10 * 32]  ; [26]
    pmulhrsw        m4, m7

    palignr         m5, m2, m0, 2
    pmaddubsw       m5, [r3 + 4 * 32]       ; [20]
    pmulhrsw        m5, m7

    palignr         m6, m2, m0, 4
    palignr         m8, m2, m0, 6
    pmaddubsw       m6, [r3 - 2 * 32]       ; [14]
    pmulhrsw        m6, m7
    pmaddubsw       m8, [r3 - 8 * 32]       ; [8]
    pmulhrsw        m8, m7

    palignr         m10, m2, m0, 8
    pmaddubsw       m9, m10, [r3 - 14 * 32] ; [2]
    pmulhrsw        m9, m7
    pmaddubsw       m10, [r3 + 12 * 32]     ; [28]
    pmulhrsw        m10, m7

    palignr         m11, m2, m0, 10
    palignr         m12, m2, m0, 12
    pmaddubsw       m11, [r3 + 6 * 32]      ; [22]
    pmulhrsw        m11, m7
    pmaddubsw       m12, [r3]               ; [16]
    pmulhrsw        m12, m7

    ; rows 8 to 15
    palignr         m3, m2, m0, 14
    palignr         m1, m1, m2, 14
    pmaddubsw       m3, [r3 - 6 * 32]       ; [10]
    pmulhrsw        m3, m7
    packuswb        m4, m3

    pmaddubsw       m3, m2, [r3 - 12 * 32]  ; [4]
    pmulhrsw        m3, m7
    packuswb        m5, m3

    pmaddubsw       m3, m2, [r3 + 14 * 32]  ; [30]
    pmulhrsw        m3, m7
    packuswb        m6, m3

    movu            xm0, [r2 + 25]
    movu            xm1, [r2 + 26]
    punpcklbw       m0, m1
    mova            m1, m2
    vinserti128     m1, m1, xm0, 0
    vpermq          m1, m1, 01001110b

    palignr         m3, m1, m2, 2
    pmaddubsw       m3, [r3 + 8 * 32]       ; [24]
    pmulhrsw        m3, m7
    packuswb        m8, m3

    palignr         m3, m1, m2, 4
    pmaddubsw       m3, [r3 + 2 * 32]       ; [18]
    pmulhrsw        m3, m7
    packuswb        m9, m3

    palignr         m3, m1, m2, 6
    pmaddubsw       m3, [r3 - 4 * 32]       ; [12]
    pmulhrsw        m3, m7
    packuswb        m10, m3

    palignr         m3, m1, m2, 8
    pmaddubsw       m3, [r3 - 10 * 32]      ; [6]
    pmulhrsw        m3, m7
    packuswb        m11, m3

    pmovzxbw        m1, [r2 + 14]
    packuswb        m12, m1

    TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
    ret

INIT_YMM avx2
cglobal intra_pred_ang16_3, 3, 7, 13
    add             r2, 32
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r5, [r1 * 3]            ; r5 -> 3 * stride
    lea             r6, [r1 * 4]            ; r6 -> 4 * stride
    mova            m7, [pw_1024]
    clc

    call            ang16_mode_3_33
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_33, 3, 7, 13
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r5, [r1 * 3]            ; r5 -> 3 * stride
    lea             r6, [r1 * 4]            ; r6 -> 4 * stride
    mova            m7, [pw_1024]
    stc

    call            ang16_mode_3_33
    RET

cglobal ang16_mode_4_32
    ; rows 0 to 7
    movu            m0, [r2 + 1]            ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu            m1, [r2 + 2]            ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpckhbw       m2, m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw       m0, m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    vextracti128    xm1, m0, 1
    vperm2i128      m0, m0, m2, 0x20        ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    vperm2i128      m2, m2, m1, 0x20        ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]

    pmaddubsw       m4, m0, [r3 + 5 * 32]   ; [21]
    pmulhrsw        m4, m7

    palignr         m1, m2, m0, 2
    pmaddubsw       m5, m1, [r3 - 6 * 32]   ; [10]
    pmulhrsw        m5, m7

    palignr         m8, m2, m0, 4
    pmaddubsw       m6, m1, [r3 + 15 * 32]  ; [31]
    pmulhrsw        m6, m7
    pmaddubsw       m8, [r3 + 4 * 32]       ; [20]
    pmulhrsw        m8, m7

    palignr         m10, m2, m0, 6
    pmaddubsw       m9, m10, [r3 - 7 * 32]  ; [9]
    pmulhrsw        m9, m7
    pmaddubsw       m10, [r3 + 14 * 32]     ; [30]
    pmulhrsw        m10, m7

    palignr         m11, m2, m0, 8
    palignr         m1, m2, m0, 10
    pmaddubsw       m11, [r3 + 3 * 32]      ; [19]
    pmulhrsw        m11, m7
    pmaddubsw       m12, m1, [r3 - 8 * 32]  ; [8]
    pmulhrsw        m12, m7

    ; rows 8 to 15
    pmaddubsw       m3, m1, [r3 + 13 * 32]  ; [29]
    pmulhrsw        m3, m7
    packuswb        m4, m3

    palignr         m3, m2, m0, 12
    pmaddubsw       m3, m3, [r3 + 2 * 32]   ; [18]
    pmulhrsw        m3, m7
    packuswb        m5, m3

    palignr         m1, m2, m0, 14
    pmaddubsw       m3, m1, [r3 - 9 * 32]   ; [7]
    pmulhrsw        m3, m7
    packuswb        m6, m3

    pmaddubsw       m3, m1, [r3 + 12 * 32]  ; [28]
    pmulhrsw        m3, m7
    packuswb        m8, m3

    palignr         m3, m2, m0, 16
    pmaddubsw       m3, [r3 + 1 * 32]       ; [17]
    pmulhrsw        m3, m7
    packuswb        m9, m3

    movu            xm0, [r2 + 25]
    movu            xm1, [r2 + 26]
    punpcklbw       m0, m1
    mova            m1, m2
    vinserti128     m1, m1, xm0, 0
    vpermq          m1, m1, 01001110b

    palignr         m0, m1, m2, 2
    pmaddubsw       m3, m0, [r3 - 10 * 32]  ; [6]
    pmulhrsw        m3, m7
    packuswb        m10, m3

    pmaddubsw       m3, m0, [r3 + 11 * 32]  ; [27]
    pmulhrsw        m3, m7
    packuswb        m11, m3

    palignr         m1, m1, m2, 4
    pmaddubsw       m1, [r3]                ; [16]
    pmulhrsw        m1, m7
    packuswb        m12, m1

    TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
    ret

INIT_YMM avx2
cglobal intra_pred_ang16_4, 3, 7, 13
    add             r2, 32
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r5, [r1 * 3]            ; r5 -> 3 * stride
    lea             r6, [r1 * 4]            ; r6 -> 4 * stride
    mova            m7, [pw_1024]
    clc

    call            ang16_mode_4_32
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_32, 3, 7, 13
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r5, [r1 * 3]            ; r5 -> 3 * stride
    lea             r6, [r1 * 4]            ; r6 -> 4 * stride
    mova            m7, [pw_1024]
    stc

    call            ang16_mode_4_32
    RET

cglobal ang16_mode_5
    ; rows 0 to 7
    movu            m0, [r2 + 1]            ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu            m1, [r2 + 2]            ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpckhbw       m2, m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw       m0, m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    vextracti128    xm1, m0, 1
    vperm2i128      m0, m0, m2, 0x20        ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    vperm2i128      m2, m2, m1, 0x20        ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]

    pmaddubsw       m4, m0, [r3 + 1 * 32]   ; [17]
    pmulhrsw        m4, m7

    palignr         m1, m2, m0, 2
    pmaddubsw       m5, m1, [r3 - 14 * 32]  ; [2]
    pmulhrsw        m5, m7

    palignr         m3, m2, m0, 4
    pmaddubsw       m6, m1, [r3 + 3 * 32]   ; [19]
    pmulhrsw        m6, m7
    pmaddubsw       m8, m3, [r3 - 12 * 32]  ; [4]
    pmulhrsw        m8, m7
    pmaddubsw       m9, m3, [r3 + 5 * 32]   ; [21]
    pmulhrsw        m9, m7

    palignr         m3, m2, m0, 6
    pmaddubsw       m10, m3, [r3 - 10 * 32] ; [6]
    pmulhrsw        m10, m7

    palignr         m1, m2, m0, 8
    pmaddubsw       m11, m3, [r3 + 7 * 32]  ; [23]
    pmulhrsw        m11, m7
    pmaddubsw       m12, m1, [r3 - 8 * 32]  ; [8]
    pmulhrsw        m12, m7

    ; rows 8 to 15
    pmaddubsw       m3, m1, [r3 + 9 * 32]   ; [25]
    pmulhrsw        m3, m7
    packuswb        m4, m3

    palignr         m1, m2, m0, 10
    pmaddubsw       m3, m1, [r3 - 6 * 32]   ; [10]
    pmulhrsw        m3, m7
    packuswb        m5, m3

    pmaddubsw       m3, m1, [r3 + 11 * 32]  ; [27]
    pmulhrsw        m3, m7
    packuswb        m6, m3

    palignr         m1, m2, m0, 12
    pmaddubsw       m3, m1, [r3 - 4 * 32]   ; [12]
    pmulhrsw        m3, m7
    packuswb        m8, m3

    pmaddubsw       m3, m1, [r3 + 13 * 32]  ; [29]
    pmulhrsw        m3, m7
    packuswb        m9, m3

    palignr         m1, m2, m0, 14
    pmaddubsw       m3, m1, [r3 - 2 * 32]   ; [14]
    pmulhrsw        m3, m7
    packuswb        m10, m3

    pmaddubsw       m3, m1, [r3 + 15 * 32]  ; [31]
    pmulhrsw        m3, m7
    packuswb        m11, m3

    palignr         m1, m2, m0, 16
    pmaddubsw       m1, [r3]                ; [16]
    pmulhrsw        m1, m7
    packuswb        m12, m1

    TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
    ret

INIT_YMM avx2
cglobal intra_pred_ang16_5, 3, 7, 13
    add             r2, 32
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r5, [r1 * 3]            ; r5 -> 3 * stride
    lea             r6, [r1 * 4]            ; r6 -> 4 * stride
    mova            m7, [pw_1024]
    clc

    call            ang16_mode_5
    RET

cglobal ang16_mode_6
    ; rows 0 to 7
    movu            m0, [r2 + 1]            ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu            m1, [r2 + 2]            ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpckhbw       m2, m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw       m0, m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    vextracti128    xm1, m0, 1
    vperm2i128      m0, m0, m2, 0x20        ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    vperm2i128      m2, m2, m1, 0x20        ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]

    pmaddubsw       m4, m0, [r3 - 3 * 32]   ; [13]
    pmulhrsw        m4, m7

    pmaddubsw       m5, m0, [r3 + 10 * 32]  ; [26]
    pmulhrsw        m5, m7

    palignr         m3, m2, m0, 2
    pmaddubsw       m6, m3, [r3 - 9 * 32]   ; [7]
    pmulhrsw        m6, m7
    pmaddubsw       m8, m3, [r3 + 4 * 32]   ; [20]
    pmulhrsw        m8, m7

    palignr         m3, m2, m0, 4
    pmaddubsw       m9, m3, [r3 - 15 * 32]  ; [1]
    pmulhrsw        m9, m7

    pmaddubsw       m10, m3, [r3 - 2 * 32]  ; [14]
    pmulhrsw        m10, m7

    pmaddubsw       m11, m3, [r3 + 11 * 32] ; [27]
    pmulhrsw        m11, m7

    palignr         m1, m2, m0, 6
    pmaddubsw       m12, m1, [r3 - 8 * 32]  ; [8]
    pmulhrsw        m12, m7

    ; rows 8 to 15
    pmaddubsw       m3, m1, [r3 + 5 * 32]   ; [21]
    pmulhrsw        m3, m7
    packuswb        m4, m3

    palignr         m1, m2, m0, 8
    pmaddubsw       m3, m1, [r3 - 14 * 32]  ; [2]
    pmulhrsw        m3, m7
    packuswb        m5, m3

    pmaddubsw       m3, m1, [r3 - 1 * 32]   ; [15]
    pmulhrsw        m3, m7
    packuswb        m6, m3

    pmaddubsw       m3, m1, [r3 + 12 * 32]  ; [28]
    pmulhrsw        m3, m7
    packuswb        m8, m3

    palignr         m1, m2, m0, 10
    pmaddubsw       m3, m1, [r3 - 7 * 32]   ; [9]
    pmulhrsw        m3, m7
    packuswb        m9, m3

    pmaddubsw       m3, m1, [r3 + 6 * 32]   ; [22]
    pmulhrsw        m3, m7
    packuswb        m10, m3

    palignr         m1, m2, m0, 12
    pmaddubsw       m3, m1, [r3 - 13 * 32]  ; [3]
    pmulhrsw        m3, m7
    packuswb        m11, m3

    pmaddubsw       m1, [r3]                ; [16]
    pmulhrsw        m1, m7
    packuswb        m12, m1

    TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
    ret

INIT_YMM avx2
cglobal intra_pred_ang16_6, 3, 7, 13
    add             r2, 32
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r5, [r1 * 3]            ; r5 -> 3 * stride
    lea             r6, [r1 * 4]            ; r6 -> 4 * stride
    mova            m7, [pw_1024]
    clc

    call            ang16_mode_6
    RET

cglobal ang16_mode_7
    ; rows 0 to 7
    movu            m0, [r2 + 1]            ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu            m1, [r2 + 2]            ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpckhbw       m2, m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw       m0, m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    vextracti128    xm1, m0, 1
    vperm2i128      m0, m0, m2, 0x20        ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    vperm2i128      m2, m2, m1, 0x20        ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]

    pmaddubsw       m4, m0, [r3 - 7 * 32]   ; [9]
    pmulhrsw        m4, m7

    pmaddubsw       m5, m0, [r3 + 2 * 32]   ; [18]
    pmulhrsw        m5, m7
    pmaddubsw       m6, m0, [r3 + 11 * 32]  ; [27]
    pmulhrsw        m6, m7

    palignr         m3, m2, m0, 2
    pmaddubsw       m8, m3, [r3 - 12 * 32]  ; [4]
    pmulhrsw        m8, m7

    pmaddubsw       m9, m3, [r3 - 3 * 32]   ; [13]
    pmulhrsw        m9, m7

    pmaddubsw       m10, m3, [r3 + 6 * 32]  ; [22]
    pmulhrsw        m10, m7

    pmaddubsw       m11, m3, [r3 + 15 * 32] ; [31]
    pmulhrsw        m11, m7

    palignr         m1, m2, m0, 4
    pmaddubsw       m12, m1, [r3 - 8 * 32]  ; [8]
    pmulhrsw        m12, m7

    ; rows 8 to 15
    pmaddubsw       m3, m1, [r3 + 1 * 32]   ; [17]
    pmulhrsw        m3, m7
    packuswb        m4, m3

    pmaddubsw       m3, m1, [r3 + 10 * 32]  ; [26]
    pmulhrsw        m3, m7
    packuswb        m5, m3

    palignr         m1, m2, m0, 6
    pmaddubsw       m3, m1, [r3 - 13 * 32]  ; [3]
    pmulhrsw        m3, m7
    packuswb        m6, m3

    pmaddubsw       m3, m1, [r3 - 4 * 32]   ; [12]
    pmulhrsw        m3, m7
    packuswb        m8, m3

    pmaddubsw       m3, m1, [r3 + 5 * 32]   ; [21]
    pmulhrsw        m3, m7
    packuswb        m9, m3

    pmaddubsw       m3, m1, [r3 + 14 * 32]  ; [30]
    pmulhrsw        m3, m7
    packuswb        m10, m3

    palignr         m1, m2, m0, 8
    pmaddubsw       m3, m1, [r3 - 9 * 32]   ; [7]
    pmulhrsw        m3, m7
    packuswb        m11, m3

    pmaddubsw       m1, [r3]                ; [16]
    pmulhrsw        m1, m7
    packuswb        m12, m1

    TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
    ret

INIT_YMM avx2
cglobal intra_pred_ang16_7, 3, 7, 13
    add             r2, 32
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r5, [r1 * 3]            ; r5 -> 3 * stride
    lea             r6, [r1 * 4]            ; r6 -> 4 * stride
    mova            m7, [pw_1024]
    clc

    call            ang16_mode_7
    RET

cglobal ang16_mode_8
    ; rows 0 to 7
    movu            m0, [r2 + 1]            ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu            m1, [r2 + 2]            ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]

    punpckhbw       m2, m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw       m0, m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    vextracti128    xm1, m0, 1
    vperm2i128      m0, m0, m2, 0x20        ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    vperm2i128      m2, m2, m1, 0x20        ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]

    pmaddubsw       m4, m0, [r3 - 11 * 32]  ; [5]
    pmulhrsw        m4, m7
    pmaddubsw       m5, m0, [r3 - 6 * 32]   ; [10]
    pmulhrsw        m5, m7

    pmaddubsw       m6, m0, [r3 - 1 * 32]   ; [15]
    pmulhrsw        m6, m7
    pmaddubsw       m8, m0, [r3 + 4 * 32]   ; [20]
    pmulhrsw        m8, m7
    pmaddubsw       m9, m0, [r3 + 9 * 32]   ; [25]
    pmulhrsw        m9, m7

    pmaddubsw       m10, m0, [r3 + 14 * 32] ; [30]
    pmulhrsw        m10, m7
    palignr         m1, m2, m0, 2
    pmaddubsw       m11, m1, [r3 - 13 * 32] ; [3]
    pmulhrsw        m11, m7
    pmaddubsw       m12, m1, [r3 - 8 * 32]  ; [8]
    pmulhrsw        m12, m7

    ; rows 8 to 15
    pmaddubsw       m3, m1, [r3 - 3 * 32]   ; [13]
    pmulhrsw        m3, m7
    packuswb        m4, m3
    pmaddubsw       m3, m1, [r3 + 2 * 32]   ; [18]
    pmulhrsw        m3, m7
    packuswb        m5, m3

    pmaddubsw       m3, m1, [r3 + 7 * 32]   ; [23]
    pmulhrsw        m3, m7
    packuswb        m6, m3
    pmaddubsw       m3, m1, [r3 + 12 * 32]  ; [28]
    pmulhrsw        m3, m7
    packuswb        m8, m3

    palignr         m1, m2, m0, 4
    pmaddubsw       m3, m1, [r3 - 15 * 32]  ; [1]
    pmulhrsw        m3, m7
    packuswb        m9, m3
    pmaddubsw       m3, m1, [r3 - 10 * 32]  ; [6]
    pmulhrsw        m3, m7
    packuswb        m10, m3

    pmaddubsw       m3, m1, [r3 - 5 * 32]   ; [11]
    pmulhrsw        m3, m7
    packuswb        m11, m3
    pmaddubsw       m1, [r3]                ; [16]
    pmulhrsw        m1, m7
    packuswb        m12, m1

    TRANSPOSE_STORE_8x32 4, 5, 6, 8, 9, 10, 11, 12, 0, 1, 2, 3
    ret

INIT_YMM avx2
cglobal intra_pred_ang16_8, 3, 7, 13
    add             r2, 32
    lea             r3, [ang_table_avx2 + 16 * 32]
    lea             r5, [r1 * 3]            ; r5 -> 3 * stride
    lea             r6, [r1 * 4]            ; r6 -> 4 * stride
    mova            m7, [pw_1024]
    clc

    call            ang16_mode_8
    RET
%endif ; ARCH_X86_64
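
; The ang16 mode 3-8/32-33 kernels above keep eight row accumulators plus
; scratch live at once (m8-m12 included), which is presumably why they are
; guarded by ARCH_X86_64: only x86-64 exposes the full set of 16 YMM registers.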

INIT_YMM avx2
cglobal intra_pred_ang16_9, 3,4,8
    vbroadcasti128  m0, [angHor_tab_9]
    vbroadcasti128  m1, [angHor_tab_9 + mmsize/2]
    mova            m2, [pw_1024]
    lea             r3, [r1 * 3]
    mova            m7, [ang16_shuf_mode9]

    vbroadcasti128  m6, [r2 + mmsize + 17]
    vbroadcasti128  m3, [r2 + mmsize + 1]

    pshufb          m5, m3, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 2
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 4
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 6
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 8
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 10
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1

    lea             r0, [r0 + r1 * 4]

    palignr         m5, m6, m3, 12
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0], xm4
    vextracti128    [r0 + r1], m4, 1

    palignr         m5, m6, m3, 14
    pshufb          m5, m7
    pmaddubsw       m4, m5, m0
    pmaddubsw       m5, m1
    pmulhrsw        m4, m2
    pmulhrsw        m5, m2
    packuswb        m4, m5
    movu            [r0 + r1 * 2], xm4
    vextracti128    [r0 + r3], m4, 1
    RET
%endif

INIT_YMM avx2
cglobal intra_pred_ang16_25, 3, 5, 5
    mova            m0, [pw_1024]

    vbroadcasti128  m1, [r2]
    pshufb          m1, [intra_pred_shuff_0_8]
    vbroadcasti128  m2, [r2 + 8]
    pshufb          m2, [intra_pred_shuff_0_8]

    lea             r3, [3 * r1]
    lea             r4, [c_ang16_mode_25]

    INTRA_PRED_ANG16_MC1 0

    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC1 2

    add             r4, 4 * mmsize

    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC1 0

    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC1 2
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_28, 3, 5, 6
    mova            m0, [pw_1024]
    mova            m5, [intra_pred_shuff_0_8]
    lea             r3, [3 * r1]
    lea             r4, [c_ang16_mode_28]

    INTRA_PRED_ANG16_MC2 1
    INTRA_PRED_ANG16_MC1 0

    lea             r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2

    INTRA_PRED_ANG16_MC2 2
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 3

    lea             r0, [r0 + 4 * r1]
    add             r4, 4 * mmsize

    INTRA_PRED_ANG16_MC1 0
    INTRA_PRED_ANG16_MC2 3

    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC1 2
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_27, 3, 5, 5
    mova            m0, [pw_1024]
    lea             r3, [3 * r1]
    lea             r4, [c_ang16_mode_27]

    vbroadcasti128  m1, [r2 + 1]
    pshufb          m1, [intra_pred_shuff_0_8]
    vbroadcasti128  m2, [r2 + 9]
    pshufb          m2, [intra_pred_shuff_0_8]

    INTRA_PRED_ANG16_MC1 0

    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC1 2

    lea             r0, [r0 + 4 * r1]
    add             r4, 4 * mmsize
    INTRA_PRED_ANG16_MC1 0

    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2

    vperm2i128      m1, m1, m2, 00100000b
    pmaddubsw       m3, m1, [r4 + 3 * mmsize]
    pmulhrsw        m3, m0
    vbroadcasti128  m2, [r2 + 2]
    pshufb          m2, [intra_pred_shuff_0_15]
    pmaddubsw       m2, [r4 + 4 * mmsize]
    pmulhrsw        m2, m0
    packuswb        m3, m2
    vpermq          m3, m3, 11011000b
    movu            [r0 + 2 * r1], xm3
    vextracti128    xm4, m3, 1
    movu            [r0 + r3], xm4
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_29, 3, 5, 5
    mova            m0, [pw_1024]
    mova            m5, [intra_pred_shuff_0_8]
    lea             r3, [3 * r1]
    lea             r4, [c_ang16_mode_29]

    INTRA_PRED_ANG16_MC2 1
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0
    INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 1

    INTRA_PRED_ANG16_MC2 2
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2

    lea             r0, [r0 + r1 * 4]
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3

    INTRA_PRED_ANG16_MC2 3
    add             r4, 4 * mmsize
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0
    lea             r0, [r0 + r1 * 4]
    INTRA_PRED_ANG16_MC3 r0 + r1, 1

    INTRA_PRED_ANG16_MC2 4
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 2
    lea             r0, [r0 + r1 * 4]
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3

    add             r4, 4 * mmsize

    INTRA_PRED_ANG16_MC2 5
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 0
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_30, 3, 5, 6
    mova            m0, [pw_1024]
    mova            m5, [intra_pred_shuff_0_8]
    lea             r3, [3 * r1]
    lea             r4, [c_ang16_mode_30]

    INTRA_PRED_ANG16_MC2 1
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0

    INTRA_PRED_ANG16_MC2 2
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 1

    INTRA_PRED_ANG16_MC2 3
    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2
    INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 3

    INTRA_PRED_ANG16_MC2 4
    add             r4, 4 * mmsize
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0

    INTRA_PRED_ANG16_MC2 5
    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1
    INTRA_PRED_ANG16_MC3 r0 + r3, 2

    INTRA_PRED_ANG16_MC2 6
    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3

    INTRA_PRED_ANG16_MC2 7
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 4
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_31, 3, 5, 6
    mova            m0, [pw_1024]
    mova            m5, [intra_pred_shuff_0_8]
    lea             r3, [3 * r1]
    lea             r4, [c_ang16_mode_31]

    INTRA_PRED_ANG16_MC2 1
    INTRA_PRED_ANG16_MC3 r0, 0

    INTRA_PRED_ANG16_MC2 2
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1

    INTRA_PRED_ANG16_MC2 3
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2

    INTRA_PRED_ANG16_MC2 4
    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3

    INTRA_PRED_ANG16_MC2 5
    add             r4, 4 * mmsize
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0

    INTRA_PRED_ANG16_MC2 6
    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1

    INTRA_PRED_ANG16_MC2 7
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2

    INTRA_PRED_ANG16_MC2 8
    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3

    INTRA_PRED_ANG16_MC2 9
    INTRA_PRED_ANG16_MC3 r0 + r3, 4
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_24, 3, 5, 6
    mova            m0, [pw_1024]
    mova            m5, [intra_pred_shuff_0_8]
    lea             r3, [3 * r1]
    lea             r4, [c_ang16_mode_24]

    INTRA_PRED_ANG16_MC2 0
    INTRA_PRED_ANG16_MC1 0

    lea             r0, [r0 + 4 * r1]
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2

    movu            xm1, [r2 - 1]
    pinsrb          xm1, [r2 + 38], 0
    vinserti128     m1, m1, xm1, 1
    pshufb          m1, m5
    vbroadcasti128  m2, [r2 + 7]
    pshufb          m2, m5
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 3

    lea             r0, [r0 + 4 * r1]
    add             r4, 4 * mmsize

    INTRA_PRED_ANG16_MC1 0

    movu            xm1, [r2 - 2]
    pinsrb          xm1, [r2 + 45], 0
    pinsrb          xm1, [r2 + 38], 1
    vinserti128     m1, m1, xm1, 1
    pshufb          m1, m5
    vbroadcasti128  m2, [r2 + 6]
    pshufb          m2, m5

    lea             r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC1 2
    RET

%macro INTRA_PRED_ANG16_MC5 2
    pslldq          xm6, xm6, 1
    pinsrb          xm6, [r2 + %1], 0
    vinserti128     m1, m6, xm6, 1
    pshufb          m1, m5
    vbroadcasti128  m2, [r2 + %2]
    pshufb          m2, m5
%endmacro
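
; INTRA_PRED_ANG16_MC5 shifts one more left-neighbour byte (taken from
; [r2 + %1]) into the low lane of xm6, rebroadcasts it into m1 and reloads
; the top row from [r2 + %2]; modes 22 and 23 below use it to step backwards
; through the projected left references one sample at a time.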

INIT_YMM avx2
cglobal intra_pred_ang16_23, 3, 5, 7
    mova            m0, [pw_1024]
    mova            m5, [intra_pred_shuff_0_8]
    lea             r3, [3 * r1]
    lea             r4, [c_ang16_mode_23]

    INTRA_PRED_ANG16_MC2 0
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0
    INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 1

    movu            xm6, [r2 - 1]
    pinsrb          xm6, [r2 + 36], 0
    vinserti128     m1, m6, xm6, 1
    pshufb          m1, m5
    vbroadcasti128  m2, [r2 + 7]
    pshufb          m2, m5
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2

    lea             r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3

    add             r4, 4 * mmsize

    INTRA_PRED_ANG16_MC5 39, 6
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0

    lea             r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC3 r0 + r1, 1
    INTRA_PRED_ANG16_MC5 43, 5
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 2

    lea             r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3

    add             r4, 4 * mmsize

    INTRA_PRED_ANG16_MC5 46, 4
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 0
    RET

INIT_YMM avx2
cglobal intra_pred_ang16_22, 3, 5, 7
    mova            m0, [pw_1024]
    mova            m5, [intra_pred_shuff_0_8]
    lea             r3, [3 * r1]
    lea             r4, [c_ang16_mode_22]

    INTRA_PRED_ANG16_MC2 0
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0

    movu            xm6, [r2 - 1]
    pinsrb          xm6, [r2 + 34], 0
    vinserti128     m1, m6, xm6, 1
    pshufb          m1, m5
    vbroadcasti128  m2, [r2 + 7]
    pshufb          m2, m5
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 1

    lea             r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC5 37, 6
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2
    INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 3

    add             r4, 4 * mmsize

    INTRA_PRED_ANG16_MC5 39, 5
    INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0

    lea             r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC5 42, 4
    INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1
    INTRA_PRED_ANG16_MC3 r0 + r3, 2

    lea             r0, [r0 + 4 * r1]

    INTRA_PRED_ANG16_MC5 44, 3
    INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3
    INTRA_PRED_ANG16_MC5 47, 2
    INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 4
    RET

%macro INTRA_PRED_ANG32_ALIGNR_STORE 1
    lea             r0, [r0 + 4 * r1]
    palignr         m2, m1, m0, %1
    movu            [r0], m2
    palignr         m2, m1, m0, (%1 + 1)
    movu            [r0 + r1], m2
    palignr         m2, m1, m0, (%1 + 2)
    movu            [r0 + 2 * r1], m2
    palignr         m2, m1, m0, (%1 + 3)
    movu            [r0 + r3], m2
%endmacro
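
; Modes 2 and 34 are the pure 45-degree directions, so no filtering is needed:
; each output row is just the reference array shifted by one more byte. The
; macro above emits four such rows per invocation using palignr alone.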

INIT_YMM avx2
cglobal intra_pred_ang32_34, 3, 4, 3
    lea             r3, [3 * r1]

    movu            m0, [r2 + 2]
    movu            m1, [r2 + 18]
    movu            [r0], m0
    palignr         m2, m1, m0, 1
    movu            [r0 + r1], m2
    palignr         m2, m1, m0, 2
    movu            [r0 + 2 * r1], m2
    palignr         m2, m1, m0, 3
    movu            [r0 + r3], m2

    INTRA_PRED_ANG32_ALIGNR_STORE 4
    INTRA_PRED_ANG32_ALIGNR_STORE 8
    INTRA_PRED_ANG32_ALIGNR_STORE 12

    lea             r0, [r0 + 4 * r1]
    palignr         m2, m1, m0, 16
    movu            [r0], m2
    movu            m0, [r2 + 19]
    movu            [r0 + r1], m0
    movu            m1, [r2 + 35]
    palignr         m2, m1, m0, 1
    movu            [r0 + 2 * r1], m2
    palignr         m2, m1, m0, 2
    movu            [r0 + r3], m2

    INTRA_PRED_ANG32_ALIGNR_STORE 3
    INTRA_PRED_ANG32_ALIGNR_STORE 7
    INTRA_PRED_ANG32_ALIGNR_STORE 11
    RET

INIT_YMM avx2
cglobal intra_pred_ang32_2, 3, 4, 3
    lea             r3, [3 * r1]

    movu            m0, [r2 + 64 + 2]
    movu            m1, [r2 + 64 + 18]
    movu            [r0], m0
    palignr         m2, m1, m0, 1
    movu            [r0 + r1], m2
    palignr         m2, m1, m0, 2
    movu            [r0 + 2 * r1], m2
    palignr         m2, m1, m0, 3
    movu            [r0 + r3], m2

    INTRA_PRED_ANG32_ALIGNR_STORE 4
    INTRA_PRED_ANG32_ALIGNR_STORE 8
    INTRA_PRED_ANG32_ALIGNR_STORE 12

    lea             r0, [r0 + 4 * r1]
    palignr         m2, m1, m0, 16
    movu            [r0], m2
    movu            m0, [r2 + 64 + 19]
    movu            [r0 + r1], m0
    movu            m1, [r2 + 64 + 35]
    palignr         m2, m1, m0, 1
    movu            [r0 + 2 * r1], m2
    palignr         m2, m1, m0, 2
    movu            [r0 + r3], m2

    INTRA_PRED_ANG32_ALIGNR_STORE 3
    INTRA_PRED_ANG32_ALIGNR_STORE 7
    INTRA_PRED_ANG32_ALIGNR_STORE 11
    RET

%macro INTRA_PRED_ANG32_STORE 0
    lea             r0, [r0 + 4 * r1]
    movu            [r0], m0
    movu            [r0 + r1], m0
    movu            [r0 + r1 * 2], m0
    movu            [r0 + r3], m0
%endmacro
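
; Mode 26 is pure vertical prediction in this kernel: every one of the 32
; output rows repeats the top reference row held in m0, so the macro above
; only re-stores the same register four rows at a time.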

INIT_YMM avx2
cglobal intra_pred_ang32_26, 3, 4, 1
    lea             r3, [3 * r1]
    movu            m0, [r2 + 1]
    movu            [r0], m0
    movu            [r0 + r1], m0
    movu            [r0 + r1 * 2], m0
    movu            [r0 + r3], m0

    INTRA_PRED_ANG32_STORE
    INTRA_PRED_ANG32_STORE
    INTRA_PRED_ANG32_STORE
    INTRA_PRED_ANG32_STORE
    INTRA_PRED_ANG32_STORE
    INTRA_PRED_ANG32_STORE
    INTRA_PRED_ANG32_STORE
    RET

%macro INTRA_PRED_STORE_4x4 0
    movd            [r0], xm0
    pextrd          [r0 + r1], xm0, 1
    vextracti128    xm0, m0, 1
    lea             r0, [r0 + 2 * r1]
    movd            [r0], xm0
    pextrd          [r0 + r1], xm0, 1
%endmacro

%macro INTRA_PRED_TRANS_STORE_4x4 0
    vpermq          m0, m0, 00001000b
    pshufb          m0, [c_trans_4x4]

    ;store
    movd            [r0], xm0
    pextrd          [r0 + r1], xm0, 1
    lea             r0, [r0 + 2 * r1]
    pextrd          [r0], xm0, 2
    pextrd          [r0 + r1], xm0, 3
%endmacro
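
; INTRA_PRED_TRANS_STORE_4x4 serves the horizontal 4x4 modes: their prediction
; is computed column-wise, so the bytes are rearranged with the c_trans_4x4
; shuffle (an in-register 4x4 byte transpose) before the four dword rows are
; written out; INTRA_PRED_STORE_4x4 is the plain row-order variant.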

INIT_YMM avx2
cglobal intra_pred_ang4_27, 3, 3, 1
    vbroadcasti128  m0, [r2 + 1]
    pshufb          m0, [intra_pred_shuff_0_4]
    pmaddubsw       m0, [c_ang4_mode_27]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_28, 3, 3, 1
    vbroadcasti128  m0, [r2 + 1]
    pshufb          m0, [intra_pred_shuff_0_4]
    pmaddubsw       m0, [c_ang4_mode_28]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_29, 3, 3, 1
    vbroadcasti128  m0, [r2 + 1]
    pshufb          m0, [intra_pred4_shuff1]
    pmaddubsw       m0, [c_ang4_mode_29]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_30, 3, 3, 1
    vbroadcasti128  m0, [r2 + 1]
    pshufb          m0, [intra_pred4_shuff2]
    pmaddubsw       m0, [c_ang4_mode_30]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_31, 3, 3, 1
    vbroadcasti128  m0, [r2 + 1]
    pshufb          m0, [intra_pred4_shuff31]
    pmaddubsw       m0, [c_ang4_mode_31]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_32, 3, 3, 1
    vbroadcasti128  m0, [r2 + 1]
    pshufb          m0, [intra_pred4_shuff31]
    pmaddubsw       m0, [c_ang4_mode_32]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_33, 3, 3, 1
    vbroadcasti128  m0, [r2 + 1]
    pshufb          m0, [intra_pred4_shuff33]
    pmaddubsw       m0, [c_ang4_mode_33]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET


INIT_YMM avx2
cglobal intra_pred_ang4_3, 3, 3, 1
    vbroadcasti128  m0, [r2 + 1]
    pshufb          m0, [intra_pred4_shuff3]
    pmaddubsw       m0, [c_ang4_mode_33]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_4, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff5]
    pmaddubsw       m0, [c_ang4_mode_32]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_5, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff5]
    pmaddubsw       m0, [c_ang4_mode_5]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_6, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff6]
    pmaddubsw       m0, [c_ang4_mode_6]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_7, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff7]
    pmaddubsw       m0, [c_ang4_mode_7]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_8, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff9]
    pmaddubsw       m0, [c_ang4_mode_8]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_9, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff9]
    pmaddubsw       m0, [c_ang4_mode_9]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_11, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff12]
    pmaddubsw       m0, [c_ang4_mode_11]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_12, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff12]
    pmaddubsw       m0, [c_ang4_mode_12]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_13, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff13]
    pmaddubsw       m0, [c_ang4_mode_13]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_14, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff14]
    pmaddubsw       m0, [c_ang4_mode_14]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_15, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff15]
    pmaddubsw       m0, [c_ang4_mode_15]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_16, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff16]
    pmaddubsw       m0, [c_ang4_mode_16]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_17, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff17]
    pmaddubsw       m0, [c_ang4_mode_17]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_TRANS_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_19, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff19]
    pmaddubsw       m0, [c_ang4_mode_19]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_20, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff20]
    pmaddubsw       m0, [c_ang4_mode_20]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_21, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff21]
    pmaddubsw       m0, [c_ang4_mode_21]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_22, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff22]
    pmaddubsw       m0, [c_ang4_mode_22]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_23, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred4_shuff23]
    pmaddubsw       m0, [c_ang4_mode_23]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_24, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred_shuff_0_4]
    pmaddubsw       m0, [c_ang4_mode_24]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET

INIT_YMM avx2
cglobal intra_pred_ang4_25, 3, 3, 1
    vbroadcasti128  m0, [r2]
    pshufb          m0, [intra_pred_shuff_0_4]
    pmaddubsw       m0, [c_ang4_mode_25]
    pmulhrsw        m0, [pw_1024]
    packuswb        m0, m0

    INTRA_PRED_STORE_4x4
    RET
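
; Each 4x4 angular kernel above folds the whole block into a single pmaddubsw
; against a per-mode constant table: one YMM register holds all four rows, so
; one multiply/round/pack sequence produces the entire block before the store
; helper writes it out (transposed for the horizontal modes).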

;-----------------------------------------------------------------------------------
; void intra_filter_NxN(const pixel* references, pixel* filtered)
;-----------------------------------------------------------------------------------
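; Each NxN variant applies a [1 2 1]/4 smoothing across the packed reference
; array (filtered[i] = (ref[i-1] + 2*ref[i] + ref[i+1] + 2) >> 2), with the
; topLast/LeftLast samples explicitly preserved and the corner stitched from
; both borders via pinsrb. A scalar sketch of the inner tap only (illustrative,
; not the project's C reference code):
;
;   static unsigned char smooth(unsigned char l, unsigned char c, unsigned char r)
;   {
;       /* rounded (l + 2*c + r) / 4, matching psllw/paddw/pw_2/psrlw below */
;       return (unsigned char)((l + 2 * c + r + 2) >> 2);
;   }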
INIT_XMM sse4
cglobal intra_filter_4x4, 2,4,5
    mov             r2b, byte [r0 + 8]          ; topLast
    mov             r3b, byte [r0 + 16]         ; LeftLast

    ; filtering top
    pmovzxbw        m0, [r0 + 0]
    pmovzxbw        m1, [r0 + 8]
    pmovzxbw        m2, [r0 + 16]

    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
    palignr         m3, m1, m0, 4
    pshufb          m3, [intra_filter4_shuf1]       ; [8 7 6 5 4 3 2 9] samples[i + 1]

    psllw           m0, 1
    paddw           m4, m3
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2

    ; filtering left
    palignr         m4, m1, m1, 14                  ; [14 13 12 11 10 9 8 15] samples[i - 1]
    pinsrb          m4, [r0], 2                     ; [14 13 12 11 10 9 0 15] samples[i + 1]
    palignr         m3, m2, m1, 4
    pshufb          m3, [intra_filter4_shuf1]

    psllw           m1, 1
    paddw           m4, m3
    paddw           m1, m4
    paddw           m1, [pw_2]
    psrlw           m1, 2
    packuswb        m0, m1

    movu            [r1], m0
    mov             [r1 + 8], r2b               ; topLast
    mov             [r1 + 16], r3b              ; LeftLast
    RET

INIT_XMM sse4
cglobal intra_filter_8x8, 2,4,6
    mov             r2b, byte [r0 + 16]         ; topLast
    mov             r3b, byte [r0 + 32]         ; LeftLast

    ; filtering top
    pmovzxbw        m0, [r0 + 0]
    pmovzxbw        m1, [r0 + 8]
    pmovzxbw        m2, [r0 + 16]

    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
    palignr         m5, m1, m0, 2
    pinsrb          m5, [r0 + 17], 0                ; [8 7 6 5 4 3 2 9] samples[i + 1]

    palignr         m3, m1, m0, 14
    psllw           m0, 1
    paddw           m4, m5
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2

    palignr         m4, m2, m1, 2
    psllw           m1, 1
    paddw           m4, m3
    paddw           m1, m4
    paddw           m1, [pw_2]
    psrlw           m1, 2

    packuswb        m0, m1
    movu            [r1], m0

    ; filtering left
    pmovzxbw        m1, [r0 + 24]
    pmovzxbw        m0, [r0 + 32]

    palignr         m4, m2, m2, 14
    pinsrb          m4, [r0], 2
    palignr         m5, m1, m2, 2

    palignr         m3, m1, m2, 14
    palignr         m0, m1, 2

    psllw           m2, 1
    paddw           m4, m5
    paddw           m2, m4
    paddw           m2, [pw_2]
    psrlw           m2, 2

    psllw           m1, 1
    paddw           m0, m3
    paddw           m1, m0
    paddw           m1, [pw_2]
    psrlw           m1, 2

    packuswb        m2, m1
    movu            [r1 + 16], m2
    mov             [r1 + 16], r2b              ; topLast
    mov             [r1 + 32], r3b              ; LeftLast
    RET

INIT_XMM sse4
cglobal intra_filter_16x16, 2,4,6
    mov             r2b, byte [r0 + 32]         ; topLast
    mov             r3b, byte [r0 + 64]         ; LeftLast

    ; filtering top
    pmovzxbw        m0, [r0 + 0]
    pmovzxbw        m1, [r0 + 8]
    pmovzxbw        m2, [r0 + 16]

    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
    palignr         m5, m1, m0, 2
    pinsrb          m5, [r0 + 33], 0                ; [8 7 6 5 4 3 2 9] samples[i + 1]

    palignr         m3, m1, m0, 14
    psllw           m0, 1
    paddw           m4, m5
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2

    palignr         m4, m2, m1, 2
    psllw           m5, m1, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m0, m5
    movu            [r1], m0

    pmovzxbw        m0, [r0 + 24]
    pmovzxbw        m5, [r0 + 32]

    palignr         m3, m2, m1, 14
    palignr         m4, m0, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m0, m2, 14
    palignr         m4, m5, m0, 2

    psllw           m0, 1
    paddw           m4, m3
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2
    packuswb        m1, m0
    movu            [r1 + 16], m1

    ; filtering left
    pmovzxbw        m1, [r0 + 40]
    pmovzxbw        m2, [r0 + 48]

    palignr         m4, m5, m5, 14
    pinsrb          m4, [r0], 2
    palignr         m0, m1, m5, 2

    psllw           m3, m5, 1
    paddw           m4, m0
    paddw           m3, m4
    paddw           m3, [pw_2]
    psrlw           m3, 2

    palignr         m0, m1, m5, 14
    palignr         m4, m2, m1, 2

    psllw           m5, m1, 1
    paddw           m4, m0
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m3, m5
    movu            [r1 + 32], m3

    pmovzxbw        m5, [r0 + 56]
    pmovzxbw        m0, [r0 + 64]

    palignr         m3, m2, m1, 14
    palignr         m4, m5, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m5, m2, 14
    palignr         m4, m0, m5, 2

    psllw           m5, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m1, m5
    movu            [r1 + 48], m1

    mov             [r1 + 32], r2b              ; topLast
    mov             [r1 + 64], r3b              ; LeftLast
    RET

INIT_XMM sse4
cglobal intra_filter_32x32, 2,4,6
    mov             r2b, byte [r0 + 64]         ; topLast
    mov             r3b, byte [r0 + 128]        ; LeftLast

    ; filtering top
    ; 0 to 15
    pmovzxbw        m0, [r0 + 0]
    pmovzxbw        m1, [r0 + 8]
    pmovzxbw        m2, [r0 + 16]

    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
    palignr         m5, m1, m0, 2
    pinsrb          m5, [r0 + 65], 0                ; [8 7 6 5 4 3 2 9] samples[i + 1]

    palignr         m3, m1, m0, 14
    psllw           m0, 1
    paddw           m4, m5
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2

    palignr         m4, m2, m1, 2
    psllw           m5, m1, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m0, m5
    movu            [r1], m0

    ; 16 to 31
    pmovzxbw        m0, [r0 + 24]
    pmovzxbw        m5, [r0 + 32]

    palignr         m3, m2, m1, 14
    palignr         m4, m0, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m0, m2, 14
    palignr         m4, m5, m0, 2

    psllw           m2, m0, 1
    paddw           m4, m3
    paddw           m2, m4
    paddw           m2, [pw_2]
    psrlw           m2, 2
    packuswb        m1, m2
    movu            [r1 + 16], m1

    ; 32 to 47
    pmovzxbw        m1, [r0 + 40]
    pmovzxbw        m2, [r0 + 48]

    palignr         m3, m5, m0, 14
    palignr         m4, m1, m5, 2

    psllw           m0, m5, 1
    paddw           m3, m4
    paddw           m0, m3
    paddw           m0, [pw_2]
    psrlw           m0, 2

    palignr         m3, m1, m5, 14
    palignr         m4, m2, m1, 2

    psllw           m5, m1, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m0, m5
    movu            [r1 + 32], m0

    ; 48 to 63
    pmovzxbw        m0, [r0 + 56]
    pmovzxbw        m5, [r0 + 64]

    palignr         m3, m2, m1, 14
    palignr         m4, m0, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m0, m2, 14
    palignr         m4, m5, m0, 2

    psllw           m0, 1
    paddw           m4, m3
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2
    packuswb        m1, m0
    movu            [r1 + 48], m1

    ; filtering left
    ; 64 to 79
    pmovzxbw        m1, [r0 + 72]
    pmovzxbw        m2, [r0 + 80]

    palignr         m4, m5, m5, 14
    pinsrb          m4, [r0], 2
    palignr         m0, m1, m5, 2

    psllw           m3, m5, 1
    paddw           m4, m0
    paddw           m3, m4
    paddw           m3, [pw_2]
    psrlw           m3, 2

    palignr         m0, m1, m5, 14
    palignr         m4, m2, m1, 2

    psllw           m5, m1, 1
    paddw           m4, m0
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m3, m5
    movu            [r1 + 64], m3

    ; 80 to 95
    pmovzxbw        m5, [r0 + 88]
    pmovzxbw        m0, [r0 + 96]

    palignr         m3, m2, m1, 14
    palignr         m4, m5, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m5, m2, 14
    palignr         m4, m0, m5, 2

    psllw           m2, m5, 1
    paddw           m4, m3
    paddw           m2, m4
    paddw           m2, [pw_2]
    psrlw           m2, 2
    packuswb        m1, m2
    movu            [r1 + 80], m1

    ; 96 to 111
    pmovzxbw        m1, [r0 + 104]
    pmovzxbw        m2, [r0 + 112]

    palignr         m3, m0, m5, 14
    palignr         m4, m1, m0, 2

    psllw           m5, m0, 1
    paddw           m3, m4
    paddw           m5, m3
    paddw           m5, [pw_2]
    psrlw           m5, 2

    palignr         m3, m1, m0, 14
    palignr         m4, m2, m1, 2

    psllw           m0, m1, 1
    paddw           m4, m3
    paddw           m0, m4
    paddw           m0, [pw_2]
    psrlw           m0, 2
    packuswb        m5, m0
    movu            [r1 + 96], m5

    ; 112 to 127
    pmovzxbw        m5, [r0 + 120]
    pmovzxbw        m0, [r0 + 128]

    palignr         m3, m2, m1, 14
    palignr         m4, m5, m2, 2

    psllw           m1, m2, 1
    paddw           m3, m4
    paddw           m1, m3
    paddw           m1, [pw_2]
    psrlw           m1, 2

    palignr         m3, m5, m2, 14
    palignr         m4, m0, m5, 2

    psllw           m5, 1
    paddw           m4, m3
    paddw           m5, m4
    paddw           m5, [pw_2]
    psrlw           m5, 2
    packuswb        m1, m5
    movu            [r1 + 112], m1

    mov             [r1 + 64], r2b              ; topLast
    mov             [r1 + 128], r3b             ; LeftLast
    RET

INIT_YMM avx2
cglobal intra_filter_4x4, 2,4,4
    mov             r2b, byte [r0 + 8]          ; topLast
    mov             r3b, byte [r0 + 16]         ; LeftLast

    ; filtering top
    pmovzxbw        m0, [r0]
    vpbroadcastw    m2, xm0
    pmovzxbw        m1, [r0 + 8]

    palignr         m3, m0, m2, 14                  ; [6 5 4 3 2 1 0 0] [14 13 12 11 10 9 8 0]
    pshufb          m3, [intra_filter4_shuf2]       ; [6 5 4 3 2 1 0 1] [14 13 12 11 10 9 0 9] samples[i - 1]
    palignr         m1, m0, 4                       ; [9 8 7 6 5 4 3 2]
    palignr         m1, m1, 14                      ; [9 8 7 6 5 4 3 2]

    psllw           m0, 1
    paddw           m3, m1
    paddw           m0, m3
    paddw           m0, [pw_2]
    psrlw           m0, 2

    packuswb        m0, m0
    vpermq          m0, m0, 10001000b

    movu            [r1], xm0
    mov             [r1 + 8], r2b               ; topLast
    mov             [r1 + 16], r3b              ; LeftLast
    RET