1;***************************************************************************** 2;* Copyright (C) 2013-2020 MulticoreWare, Inc 3;* 4;* Authors: Nabajit Deka <nabajit@multicorewareinc.com> 5;* Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com> 6;* Li Cao <li@multicorewareinc.com> 7;* Praveen Kumar Tiwari <Praveen@multicorewareinc.com> 8;* 9;* This program is free software; you can redistribute it and/or modify 10;* it under the terms of the GNU General Public License as published by 11;* the Free Software Foundation; either version 2 of the License, or 12;* (at your option) any later version. 13;* 14;* This program is distributed in the hope that it will be useful, 15;* but WITHOUT ANY WARRANTY; without even the implied warranty of 16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17;* GNU General Public License for more details. 18;* 19;* You should have received a copy of the GNU General Public License 20;* along with this program; if not, write to the Free Software 21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 22;* 23;* This program is also available under a commercial proprietary license. 24;* For more information, contact us at license @ x265.com. 25;*****************************************************************************/ 26 27;TO-DO : Further optimize the routines. 28 29%include "x86inc.asm" 30%include "x86util.asm" 31SECTION_RODATA 64 32 33tab_dct32: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 34 dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 35 dw 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90, -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 36 dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 37 dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89, 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 38 dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 39 dw 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87, -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 40 dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 41 dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 42 dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 43 dw 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80, -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 44 dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 45 dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75, 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 46 dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 47 dw 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70, -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 48 dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 49 dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 50 dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 51 dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57, -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 52 dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 53 dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50, 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 54 dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 55 dw 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43, -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 56 dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 57 dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 58 dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 59 dw 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25, -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 60 dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 61 dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18, 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 62 dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 63 dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9, -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 64 dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 65tab_dct16: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 66 dw 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90 67 dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 68 dw 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87 69 dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 70 dw 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80 71 dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 72 dw 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70 73 dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 74 dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57 75 dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 76 dw 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43 77 dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 78 dw 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25 79 dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 80 dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9 81 82dct16_shuf_AVX512: dq 0, 1, 8, 9, 4, 5, 12, 13 83dct16_shuf1_AVX512: dq 2, 3, 10, 11, 6, 7, 14, 15 84dct16_shuf3_AVX512: dq 0, 1, 4, 5, 8, 9, 12, 13 85dct16_shuf4_AVX512: dq 2, 3, 6, 7, 10, 11, 14, 15 86dct16_shuf2_AVX512: dd 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30 87 88dct8_shuf5_AVX512: dq 0, 2, 4, 6, 1, 3, 5, 7 89dct8_shuf6_AVX512: dq 0, 2, 4, 6, 1, 3, 5, 7 90dct8_shuf8_AVX512: dd 0, 2, 8, 10, 4, 6, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 91dct8_shuf4_AVX512: times 2 dd 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 92dct16_shuf7_AVX512: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 93dct16_shuf9_AVX512: dd 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 94 95dct32_shuf_AVX512: dd 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20 , 21, 24, 25, 28, 29 96dct32_shuf4_AVX512: times 2 dd 0, 4, 8, 12, 0, 4, 8, 12 97dct32_shuf5_AVX512: dd 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 98dct32_shuf6_AVX512: dd 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0 99dct32_shuf7_AVX512: dd 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1 100dct32_shuf8_AVX512: dd -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 101dct16_shuf5_AVX512: dw 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 102dct16_shuf6_AVX512: dw 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 103dct16_shuf8_AVX512: dw 20, 0, 4, 2, 28, 8, 6, 10, 22, 16, 12, 18, 30, 24, 14, 26 104 105dct8_shuf7_AVX512: dw 0, 2, 16, 18, 8, 10, 24, 26, 4, 6, 20, 22, 12, 14, 28, 30 106dct8_shuf9_AVX512: times 2 dw 0, 8, 16, 24, 4, 12, 20, 28 107dct32_shuf1_AVX512: dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 108dct32_shuf2_AVX512: dw 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, 15, 14, 13, 12, 11, 10, 9, 8, 31, 30, 29, 28, 27, 26, 25, 24 109dct32_shuf3_AVX512: times 2 dw 0, 8, 16, 24, 2, 10, 18, 26 110 111dct8_shuf: times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9 112dct8_shuf_AVX512: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 113 114tab_dct8: dw 64, 64, 64, 64, 64, 64, 64, 64 115 dw 89, 75, 50, 18, -18, -50, -75, -89 116 dw 83, 36, -36, -83, -83, -36, 36, 83 117 dw 75, -18, -89, -50, 50, 89, 18, -75 118 dw 64, -64, -64, 64, 64, -64, -64, 64 119 dw 50, -89, 18, 75, -75, -18, 89, -50 120 dw 36, -83, 83, -36, -36, 83, -83, 36 121 dw 18, -50, 75, -89, 89, -75, 50, -18 122 123tab_dct8_avx512: dw 64, 64, 64, 64, 89, 75, 50, 18 124 dw 83, 36, -36, -83, 75, -18, -89, -50 125 dw 64, -64, -64, 64, 50, -89, 18, 75 126 dw 36, -83, 83, -36, 18, -50, 75, -89 127 128tab_dct16_1: dw 64, 64, 64, 64, 64, 64, 64, 64 129 dw 90, 87, 80, 70, 57, 43, 25, 9 130 dw 89, 75, 50, 18, -18, -50, -75, -89 131 dw 87, 57, 9, -43, -80, -90, -70, -25 132 dw 83, 36, -36, -83, -83, -36, 36, 83 133 dw 80, 9, -70, -87, -25, 57, 90, 43 134 dw 75, -18, -89, -50, 50, 89, 18, -75 135 dw 70, -43, -87, 9, 90, 25, -80, -57 136 dw 64, -64, -64, 64, 64, -64, -64, 64 137 dw 57, -80, -25, 90, -9, -87, 43, 70 138 dw 50, -89, 18, 75, -75, -18, 89, -50 139 dw 43, -90, 57, 25, -87, 70, 9, -80 140 dw 36, -83, 83, -36, -36, 83, -83, 36 141 dw 25, -70, 90, -80, 43, 9, -57, 87 142 dw 18, -50, 75, -89, 89, -75, 50, -18 143 dw 9, -25, 43, -57, 70, -80, 87, -90 144 145tab_dct16_2: dw 64, 64, 64, 64, 64, 64, 64, 64 146 dw -9, -25, -43, -57, -70, -80, -87, -90 147 dw -89, -75, -50, -18, 18, 50, 75, 89 148 dw 25, 70, 90, 80, 43, -9, -57, -87 149 dw 83, 36, -36, -83, -83, -36, 36, 83 150 dw -43, -90, -57, 25, 87, 70, -9, -80 151 dw -75, 18, 89, 50, -50, -89, -18, 75 152 dw 57, 80, -25, -90, -9, 87, 43, -70 153 dw 64, -64, -64, 64, 64, -64, -64, 64 154 dw -70, -43, 87, 9, -90, 25, 80, -57 155 dw -50, 89, -18, -75, 75, 18, -89, 50 156 dw 80, -9, -70, 87, -25, -57, 90, -43 157 dw 36, -83, 83, -36, -36, 83, -83, 36 158 dw -87, 57, -9, -43, 80, -90, 70, -25 159 dw -18, 50, -75, 89, -89, 75, -50, 18 160 dw 90, -87, 80, -70, 57, -43, 25, -9 161 162dct16_shuf1: times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 163 164dct16_shuf2: times 2 db 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9 165 166tab_dct32_1: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 167 dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4 168 dw 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90 169 dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13 170 dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 171 dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22 172 dw 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87 173 dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31 174 dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 175 dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38 176 dw 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80 177 dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46 178 dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 179 dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54 180 dw 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70 181 dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61 182 dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 183 dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67 184 dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57 185 dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73 186 dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 187 dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78 188 dw 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43 189 dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82 190 dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 191 dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85 192 dw 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25 193 dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88 194 dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 195 dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90 196 dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9 197 dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90 198 199tab_dct32_2: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 200 dw -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 201 dw -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 202 dw 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 203 dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 204 dw -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 205 dw -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 206 dw 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 207 dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 208 dw -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 209 dw -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 210 dw 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 211 dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 212 dw -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 213 dw -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 214 dw 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 215 dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 216 dw -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 217 dw -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 218 dw 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 219 dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 220 dw -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 221 dw -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 222 dw 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 223 dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 224 dw -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 225 dw -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 226 dw 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 227 dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 228 dw -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 229 dw -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 230 dw 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 231 232avx2_idct8_1: times 4 dw 64, 83, 64, 36 233 times 4 dw 64, 36, -64, -83 234 times 4 dw 64, -36, -64, 83 235 times 4 dw 64, -83, 64, -36 236 237avx2_idct8_2: times 4 dw 89, 75, 50, 18 238 times 4 dw 75, -18, -89, -50 239 times 4 dw 50, -89, 18, 75 240 times 4 dw 18, -50, 75, -89 241 242avx512_idct8_1: times 8 dw 64, 83, 64, 36 243 times 8 dw 64, 36, -64, -83 244 times 8 dw 64, -36, -64, 83 245 times 8 dw 64, -83, 64, -36 246 247avx512_idct8_2: times 8 dw 89, 75, 50, 18 248 times 8 dw 75, -18, -89, -50 249 times 8 dw 50, -89, 18, 75 250 times 8 dw 18, -50, 75, -89 251 252avx512_idct8_3: dw 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36 253 dw 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83 254 dw 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83 255 dw -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36 256 dw 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89 257 dw 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75 258 dw 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50 259 dw -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89 260 261idct8_shuf1: dd 0, 2, 4, 6, 1, 3, 5, 7 262 263const idct8_shuf2, times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 264 265idct8_shuf3: times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 266 267 268idct8_avx512_shuf3: times 4 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 269 270tab_idct16_1: dw 90, 87, 80, 70, 57, 43, 25, 9 271 dw 87, 57, 9, -43, -80, -90, -70, -25 272 dw 80, 9, -70, -87, -25, 57, 90, 43 273 dw 70, -43, -87, 9, 90, 25, -80, -57 274 dw 57, -80, -25, 90, -9, -87, 43, 70 275 dw 43, -90, 57, 25, -87, 70, 9, -80 276 dw 25, -70, 90, -80, 43, 9, -57, 87 277 dw 9, -25, 43, -57, 70, -80, 87, -90 278 279tab_idct16_2: dw 64, 89, 83, 75, 64, 50, 36, 18 280 dw 64, 75, 36, -18, -64, -89, -83, -50 281 dw 64, 50, -36, -89, -64, 18, 83, 75 282 dw 64, 18, -83, -50, 64, 75, -36, -89 283 dw 64, -18, -83, 50, 64, -75, -36, 89 284 dw 64, -50, -36, 89, -64, -18, 83, -75 285 dw 64, -75, 36, 18, -64, 89, -83, 50 286 dw 64, -89, 83, -75, 64, -50, 36, -18 287 288idct16_shuff: dd 0, 4, 2, 6, 1, 5, 3, 7 289 290idct16_shuff1: dd 2, 6, 0, 4, 3, 7, 1, 5 291idct16_shuff2: dw 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 292idct16_shuff3: dw 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 293idct16_shuff4: dd 0, 8, 2, 10, 4, 12, 6, 14 294idct16_shuff5: dd 1, 9, 3, 11, 5, 13, 7, 15 295 296 297tab_AVX512_idct16_1: dw 90, 87, 80, 70, 57, 43, 25, 9, 90, 87, 80, 70, 57, 43, 25, 9, 80, 9, -70, -87, -25, 57, 90, 43, 80, 9, -70, -87, -25, 57, 90, 43 298 dw 87, 57, 9, -43, -80, -90, -70, -25, 87, 57, 9, -43, -80, -90, -70, -25, 70, -43, -87, 9, 90, 25, -80, -57, 70, -43, -87, 9, 90, 25, -80, -57 299 dw 57, -80, -25, 90, -9, -87, 43, 70, 57, -80, -25, 90, -9, -87, 43, 70, 25, -70, 90, -80, 43, 9, -57, 87, 25, -70, 90, -80, 43, 9, -57, 87 300 dw 43, -90, 57, 25, -87, 70, 9, -80, 43, -90, 57, 25, -87, 70, 9, -80, 9, -25, 43, -57, 70, -80, 87, -90, 9, -25, 43, -57, 70, -80, 87, -90 301 302tab_AVX512_idct16_2: dw 64, 89, 83, 75, 64, 50, 36, 18, 64, 89, 83, 75, 64, 50, 36, 18, 64, 50, -36, -89, -64, 18, 83, 75, 64, 50, -36, -89, -64, 18, 83, 75 303 dw 64, 75, 36, -18, -64, -89, -83, -50, 64, 75, 36, -18, -64, -89, -83, -50, 64, 18, -83, -50, 64, 75, -36, -89, 64, 18, -83, -50, 64, 75, -36, -89 304 dw 64, -18, -83, 50, 64, -75, -36, 89, 64, -18, -83, 50, 64, -75, -36, 89, 64, -75, 36, 18, -64, 89, -83, 50, 64, -75, 36, 18, -64, 89, -83, 50 305 dw 64, -50, -36, 89, -64, -18, 83, -75, 64, -50, -36, 89, -64, -18, 83, -75, 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18 306 307idct16_AVX512_shuff: dd 0, 4, 2, 6, 1, 5, 3, 7, 8, 12, 10, 14, 9, 13, 11, 15 308 309idct16_AVX512_shuff1: dd 2, 6, 0, 4, 3, 7, 1, 5, 10, 14, 8, 12, 11, 15, 9, 13 310 311idct16_AVX512_shuff2: dq 0, 1, 8, 9, 4, 5, 12, 13 312idct16_AVX512_shuff3: dq 2, 3, 10, 11, 6, 7, 14, 15 313idct16_AVX512_shuff4: dq 4, 5, 12, 13, 0, 1, 8, 9 314idct16_AVX512_shuff5: dq 6, 7, 14, 15, 2, 3, 10, 11 315idct16_AVX512_shuff6: times 4 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 316 317tab_idct32_1: dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4 318 dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13 319 dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22 320 dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31 321 dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38 322 dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46 323 dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54 324 dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61 325 dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67 326 dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73 327 dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78 328 dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82 329 dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85 330 dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88 331 dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90 332 dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90 333 334 335tab_idct32_2: dw 64, 89, 83, 75, 64, 50, 36, 18 336 dw 64, 75, 36, -18, -64, -89, -83, -50 337 dw 64, 50, -36, -89, -64, 18, 83, 75 338 dw 64, 18, -83, -50, 64, 75, -36, -89 339 dw 64, -18, -83, 50, 64, -75, -36, 89 340 dw 64, -50, -36, 89, -64, -18, 83, -75 341 dw 64, -75, 36, 18, -64, 89, -83, 50 342 dw 64, -89, 83, -75, 64, -50, 36, -18 343 344 345tab_idct32_3: dw 90, 87, 80, 70, 57, 43, 25, 9 346 dw 87, 57, 9, -43, -80, -90, -70, -25 347 dw 80, 9, -70, -87, -25, 57, 90, 43 348 dw 70, -43, -87, 9, 90, 25, -80, -57 349 dw 57, -80, -25, 90, -9, -87, 43, 70 350 dw 43, -90, 57, 25, -87, 70, 9, -80 351 dw 25, -70, 90, -80, 43, 9, -57, 87 352 dw 9, -25, 43, -57, 70, -80, 87, -90 353 354tab_idct32_4: dw 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9 355 dw 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25 356 dw 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43 357 dw 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57 358 dw 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70 359 dw 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80 360 dw 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87 361 dw 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90 362 dw 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90 363 dw 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87 364 dw 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80 365 dw 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70 366 dw 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57 367 dw 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43 368 dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25 369 dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9 370 371 372tab_idct32_AVX512_1: dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 90 ,90 ,88 ,85, 82, 78, 73, 67, 90, 82, 67, 46, 22, -4, -31, -54, 90, 82, 67, 46, 22, -4, -31, -54 373 dw 61, 54, 46, 38, 31, 22, 13, 4, 61, 54, 46, 38, 31, 22, 13, 4, -73, -85, -90, -88, -78, -61, -38, -13, -73, -85, -90, -88, -78, -61, -38, -13 374 dw 88, 67, 31, -13, -54, -82, -90, -78, 88, 67, 31, -13, -54, -82, -90, -78, 85, 46, -13, -67, -90, -73, -22, 38, 85, 46, -13, -67, -90, -73, -22, 38 375 dw -46, -4, 38, 73, 90, 85, 61, 22, -46, -4, 38, 73, 90, 85, 61, 22, 82, 88, 54, -4, -61, -90, -78, -31, 82, 88, 54, -4, -61, -90, -78, -31 376 dw 82, 22, -54, -90, -61, 13, 78, 85, 82, 22, -54, -90, -61, 13, 78, 85, 78, -4, -82, -73, 13, 85, 67, -22, 78, -4, -82, -73, 13, 85, 67, -22 377 dw 31, -46, -90, -67, 4, 73, 88, 38, 31, -46, -90, -67, 4, 73, 88, 38, -88, -61, 31, 90, 54, -38, -90, -46, -88, -61, 31, 90, 54, -38, -90, -46 378 dw 73, -31, -90, -22, 78, 67, -38, -90, 73, -31, -90, -22, 78, 67, -38, -90, 67, -54, -78, 38, 85, -22, -90, 4, 67, -54, -78, 38, 85, -22, -90, 4 379 dw -13, 82, 61, -46, -88, -4, 85, 54, -13, 82, 61, -46, -88, -4, 85, 54, 90, 13, -88, -31, 82, 46, -73, -61, 90, 13, -88, -31, 82, 46, -73, -61 380 381tab_idct32_AVX512_5: dw 4, -13, 22, -31, 38, -46, 54, -61, 4, -13, 22, -31, 38, -46, 54, -61, 13, -38, 61, -78, 88, -90, 85, -73, 13, -38, 61, -78, 88, -90, 85, -73 382 dw 67, -73, 78, -82, 85, -88, 90, -90, 67, -73, 78, -82, 85, -88, 90, -90, 54, -31, 4, 22, -46, 67, -82, 90, 54, -31, 4, 22, -46, 67, -82, 90 383 dw 22, -61, 85, -90, 73, -38, -4, 46, 22, -61, 85, -90, 73, -38, -4, 46, 31, -78, 90, -61, 4, 54, -88, 82, 31, -78, 90, -61, 4, 54, -88, 82 384 dw -78, 90, -82, 54, -13, -31, 67, -88, -78, 90, -82, 54, -13, -31, 67, -88, -38, -22, 73, -90, 67, -13, -46, 85, -38, -22, 73, -90, 67, -13, -46, 85 385 dw 38, -88, 73, -4, -67, 90, -46, -31, 38, -88, 73, -4, -67, 90, -46, -31, 46, -90, 38, 54, -90, 31, 61, -88, 46, -90, 38, 54, -90, 31, 61, -88 386 dw 85, -78, 13, 61, -90, 54, 22, -82, 85, -78, 13, 61, -90, 54, 22, -82, 22, 67, -85, 13, 73, -82, 4, 78, 22, 67, -85, 13, 73, -82, 4, 78 387 dw 54, -85, -4, 88, -46, -61, 82, 13, 54, -85, -4, 88, -46, -61, 82, 13, 61, -73, -46, 82, 31, -88, -13, 90, 61, -73, -46, 82, 31, -88, -13, 90 388 dw -90, 38, 67, -78, -22, 90, -31, -73, -90, 38, 67, -78, -22, 90, -31, -73, -4, -90, 22, 85, -38, -78, 54, 67, -4, -90, 22, 85, -38, -78, 54, 67 389 390 391tab_idct32_AVX512_2: dw 64, 89, 83, 75, 64, 50, 36, 18, 64, 89, 83, 75, 64, 50, 36, 18, 64, 75, 36, -18, -64, -89, -83, -50, 64, 75, 36, -18, -64, -89, -83, -50 392 dw 64, 50, -36, -89, -64, 18, 83, 75, 64, 50, -36, -89, -64, 18, 83, 75, 64, 18, -83, -50, 64, 75, -36, -89, 64, 18, -83, -50, 64, 75, -36, -89 393 dw 64, -18, -83, 50, 64, -75, -36, 89, 64, -18, -83, 50, 64, -75, -36, 89, 64, -50, -36, 89, -64, -18, 83, -75, 64, -50, -36, 89, -64, -18, 83, -75 394 dw 64, -75, 36, 18, -64, 89, -83, 50, 64, -75, 36, 18, -64, 89, -83, 50, 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18 395 396tab_idct32_AVX512_3: dw 90, 87, 80, 70, 57, 43, 25, 9, 90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25, 87, 57, 9, -43, -80, -90, -70, -25 397 dw 80, 9, -70, -87, -25, 57, 90, 43, 80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57, 70, -43, -87, 9, 90, 25, -80, -57 398 dw 57, -80, -25, 90, -9, -87, 43, 70, 57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80, 43, -90, 57, 25, -87, 70, 9, -80 399 dw 25, -70, 90, -80, 43, 9, -57, 87, 25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90, 9, -25, 43, -57, 70, -80, 87, -90 400 401tab_idct32_AVX512_4: dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4 402 dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13 403 dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22 404 dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31 405 dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38 406 dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46 407 dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54 408 dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61 409 dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67 410 dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73 411 dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78 412 dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82 413 dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85 414 dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88 415 dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90 416 dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90 417 418tab_idct32_AVX512_6: dw 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9, 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9 419 dw 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25, 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25 420 dw 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43, 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43 421 dw 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57, 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57 422 dw 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70, 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70 423 dw 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80, 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80 424 dw 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87, 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87 425 dw 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90, 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90 426 dw 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90, 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90 427 dw 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87, 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87 428 dw 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80, 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80 429 dw 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70, 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70 430 dw 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57, 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57 431 dw 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43, 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43 432 dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25, 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25 433 dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9, 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9 434 435 436avx2_dct4: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64 437 dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83 438 439avx2_idct4_1: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64 440 dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36 ,-83, 36, -83 441 442avx2_idct4_2: dw 64, 64, 64, -64, 83, 36, 36, -83 443 444const idct4_shuf1, times 2 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 445 446idct4_shuf2: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8 ,9 ,10, 11 447 448tab_dct4: times 4 dw 64, 64 449 times 4 dw 83, 36 450 times 4 dw 64, -64 451 times 4 dw 36, -83 452 453dct4_shuf: db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13 454 455tab_dst4: times 2 dw 29, 55, 74, 84 456 times 2 dw 74, 74, 0, -74 457 times 2 dw 84, -29, -74, 55 458 times 2 dw 55, -84, 74, -29 459 460pw_dst4_tab: times 4 dw 29, 55, 74, 84 461 times 4 dw 74, 74, 0, -74 462 times 4 dw 84, -29, -74, 55 463 times 4 dw 55, -84, 74, -29 464 465tab_idst4: times 4 dw 29, +84 466 times 4 dw +74, +55 467 times 4 dw 55, -29 468 times 4 dw +74, -84 469 times 4 dw 74, -74 470 times 4 dw 0, +74 471 times 4 dw 84, +55 472 times 4 dw -74, -29 473 474pw_idst4_tab: times 4 dw 29, 84 475 times 4 dw 55, -29 476 times 4 dw 74, 55 477 times 4 dw 74, -84 478 times 4 dw 74, -74 479 times 4 dw 84, 55 480 times 4 dw 0, 74 481 times 4 dw -74, -29 482pb_idst4_shuf: times 2 db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 483 484tab_dct8_1: times 2 dw 89, 50, 75, 18 485 times 2 dw 75, -89, -18, -50 486 times 2 dw 50, 18, -89, 75 487 times 2 dw 18, 75, -50, -89 488 489tab_dct8_2: times 2 dd 83, 36 490 times 2 dd 36, 83 491 times 1 dd 89, 75, 50, 18 492 times 1 dd 75, -18, -89, -50 493 times 1 dd 50, -89, 18, 75 494 times 1 dd 18, -50, 75, -89 495 496tab_idct8_3: times 4 dw 89, 75 497 times 4 dw 50, 18 498 times 4 dw 75, -18 499 times 4 dw -89, -50 500 times 4 dw 50, -89 501 times 4 dw 18, 75 502 times 4 dw 18, -50 503 times 4 dw 75, -89 504 505pb_unpackhlw1: db 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15 506 507pb_idct8even: db 0, 1, 8, 9, 4, 5, 12, 13, 0, 1, 8, 9, 4, 5, 12, 13 508 509tab_idct8_1: times 1 dw 64, -64, 36, -83, 64, 64, 83, 36 510 511tab_idct8_2: times 1 dw 89, 75, 50, 18, 75, -18, -89, -50 512 times 1 dw 50, -89, 18, 75, 18, -50, 75, -89 513pb_idct8odd: db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 514 515;Scale bits table for rdoQuant 516tab_nonpsyRdo8 : dq 5, 7, 9, 11 517tab_nonpsyRdo10: dq 9, 11, 13, 15 518tab_nonpsyRdo12: dq 13, 15, 17, 19 519 520SECTION .text 521cextern pd_1 522cextern pd_2 523cextern pd_4 524cextern pd_8 525cextern pd_16 526cextern pd_32 527cextern pd_64 528cextern pd_128 529cextern pd_256 530cextern pd_512 531cextern pd_1024 532cextern pd_2048 533cextern pw_ppppmmmm 534cextern trans8_shuf 535 536 537%if BIT_DEPTH == 12 538 %define DCT4_SHIFT 5 539 %define DCT4_ROUND 16 540 %define IDCT_SHIFT 8 541 %define IDCT_ROUND 128 542 %define DST4_SHIFT 5 543 %define DST4_ROUND 16 544 %define DCT8_SHIFT1 6 545 %define DCT8_ROUND1 32 546 %define RDO_MAX_4 3 547 %define RDO_MAX_8 1 548 %define RDO_MAX_16 0 549 %define RDO_MAX_32 0 550%elif BIT_DEPTH == 10 551 %define DCT4_SHIFT 3 552 %define DCT4_ROUND 4 553 %define IDCT_SHIFT 10 554 %define IDCT_ROUND 512 555 %define DST4_SHIFT 3 556 %define DST4_ROUND 4 557 %define DCT8_SHIFT1 4 558 %define DCT8_ROUND1 8 559 %define RDO_MAX_4 7 560 %define RDO_MAX_8 5 561 %define RDO_MAX_16 3 562 %define RDO_MAX_32 1 563%elif BIT_DEPTH == 8 564 %define DCT4_SHIFT 1 565 %define DCT4_ROUND 1 566 %define IDCT_SHIFT 12 567 %define IDCT_ROUND 2048 568 %define DST4_SHIFT 1 569 %define DST4_ROUND 1 570 %define DCT8_SHIFT1 2 571 %define DCT8_ROUND1 2 572 %define RDO_MAX_4 11 573 %define RDO_MAX_8 9 574 %define RDO_MAX_16 7 575 %define RDO_MAX_32 5 576%else 577 %error Unsupported BIT_DEPTH! 578%endif 579 580%define DCT8_ROUND2 256 581%define DCT8_SHIFT2 9 582 583;------------------------------------------------------ 584;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride) 585;------------------------------------------------------ 586INIT_XMM sse2 587cglobal dct4, 3, 4, 8 588 mova m7, [pd_ %+ DCT4_ROUND] 589 add r2d, r2d 590 lea r3, [tab_dct4] 591 592 mova m4, [r3 + 0 * 16] 593 mova m5, [r3 + 1 * 16] 594 mova m6, [r3 + 2 * 16] 595 movh m0, [r0 + 0 * r2] 596 movh m1, [r0 + 1 * r2] 597 punpcklqdq m0, m1 598 pshufd m0, m0, 0xD8 599 pshufhw m0, m0, 0xB1 600 601 lea r0, [r0 + 2 * r2] 602 movh m1, [r0] 603 movh m2, [r0 + r2] 604 punpcklqdq m1, m2 605 pshufd m1, m1, 0xD8 606 pshufhw m1, m1, 0xB1 607 608 punpcklqdq m2, m0, m1 609 punpckhqdq m0, m1 610 611 paddw m1, m2, m0 612 psubw m2, m0 613 pmaddwd m0, m1, m4 614 paddd m0, m7 615 psrad m0, DCT4_SHIFT 616 pmaddwd m3, m2, m5 617 paddd m3, m7 618 psrad m3, DCT4_SHIFT 619 packssdw m0, m3 620 pshufd m0, m0, 0xD8 621 pshufhw m0, m0, 0xB1 622 pmaddwd m1, m6 623 paddd m1, m7 624 psrad m1, DCT4_SHIFT 625 pmaddwd m2, [r3 + 3 * 16] 626 paddd m2, m7 627 psrad m2, DCT4_SHIFT 628 packssdw m1, m2 629 pshufd m1, m1, 0xD8 630 pshufhw m1, m1, 0xB1 631 632 punpcklqdq m2, m0, m1 633 punpckhqdq m0, m1 634 635 mova m7, [pd_128] 636 637 pmaddwd m1, m2, m4 638 pmaddwd m3, m0, m4 639 paddd m1, m3 640 paddd m1, m7 641 psrad m1, 8 642 643 pmaddwd m4, m2, m5 644 pmaddwd m3, m0, m5 645 psubd m4, m3 646 paddd m4, m7 647 psrad m4, 8 648 packssdw m1, m4 649 movu [r1 + 0 * 16], m1 650 651 pmaddwd m1, m2, m6 652 pmaddwd m3, m0, m6 653 paddd m1, m3 654 paddd m1, m7 655 psrad m1, 8 656 657 pmaddwd m2, [r3 + 3 * 16] 658 pmaddwd m0, [r3 + 3 * 16] 659 psubd m2, m0 660 paddd m2, m7 661 psrad m2, 8 662 packssdw m1, m2 663 movu [r1 + 1 * 16], m1 664 RET 665 666; DCT 4x4 667; 668; Input parameters: 669; - r0: source 670; - r1: destination 671; - r2: source stride 672INIT_YMM avx2 673cglobal dct4, 3, 4, 8, src, dst, srcStride 674 vbroadcasti128 m7, [pd_ %+ DCT4_ROUND] 675 add r2d, r2d 676 lea r3, [avx2_dct4] 677 678 vbroadcasti128 m4, [dct4_shuf] 679 mova m5, [r3] 680 mova m6, [r3 + 32] 681 movq xm0, [r0] 682 movhps xm0, [r0 + r2] 683 lea r0, [r0 + 2 * r2] 684 movq xm1, [r0] 685 movhps xm1, [r0 + r2] 686 687 vinserti128 m0, m0, xm1, 1 688 pshufb m0, m4 689 vpermq m1, m0, 11011101b 690 vpermq m0, m0, 10001000b 691 paddw m2, m0, m1 692 psubw m0, m1 693 694 pmaddwd m2, m5 695 paddd m2, m7 696 psrad m2, DCT4_SHIFT 697 698 pmaddwd m0, m6 699 paddd m0, m7 700 psrad m0, DCT4_SHIFT 701 702 packssdw m2, m0 703 pshufb m2, m4 704 vpermq m1, m2, 11011101b 705 vpermq m2, m2, 10001000b 706 vbroadcasti128 m7, [pd_128] 707 708 pmaddwd m0, m2, m5 709 pmaddwd m3, m1, m5 710 paddd m3, m0 711 paddd m3, m7 712 psrad m3, 8 713 714 pmaddwd m2, m6 715 pmaddwd m1, m6 716 psubd m2, m1 717 paddd m2, m7 718 psrad m2, 8 719 720 packssdw m3, m2 721 movu [r1], m3 722 RET 723 724;------------------------------------------------------- 725;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride) 726;------------------------------------------------------- 727INIT_XMM sse2 728cglobal idct4, 3, 4, 6 729 add r2d, r2d 730 lea r3, [tab_dct4] 731 732 movu m0, [r0 + 0 * 16] 733 movu m1, [r0 + 1 * 16] 734 735 punpcklwd m2, m0, m1 736 pmaddwd m3, m2, [r3 + 0 * 16] ; m3 = E1 737 paddd m3, [pd_64] 738 739 pmaddwd m2, [r3 + 2 * 16] ; m2 = E2 740 paddd m2, [pd_64] 741 742 punpckhwd m0, m1 743 pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1 744 pmaddwd m0, [r3 + 3 * 16] ; m0 = O2 745 746 paddd m4, m3, m1 747 psrad m4, 7 ; m4 = m128iA 748 paddd m5, m2, m0 749 psrad m5, 7 750 packssdw m4, m5 ; m4 = m128iA 751 752 psubd m2, m0 753 psrad m2, 7 754 psubd m3, m1 755 psrad m3, 7 756 packssdw m2, m3 ; m2 = m128iD 757 758 punpcklwd m1, m4, m2 ; m1 = S0 759 punpckhwd m4, m2 ; m4 = S8 760 761 punpcklwd m0, m1, m4 ; m0 = m128iA 762 punpckhwd m1, m4 ; m1 = m128iD 763 764 punpcklwd m2, m0, m1 765 pmaddwd m3, m2, [r3 + 0 * 16] 766 paddd m3, [pd_ %+ IDCT_ROUND] ; m3 = E1 767 768 pmaddwd m2, [r3 + 2 * 16] 769 paddd m2, [pd_ %+ IDCT_ROUND] ; m2 = E2 770 771 punpckhwd m0, m1 772 pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1 773 pmaddwd m0, [r3 + 3 * 16] ; m0 = O2 774 775 paddd m4, m3, m1 776 psrad m4, IDCT_SHIFT ; m4 = m128iA 777 paddd m5, m2, m0 778 psrad m5, IDCT_SHIFT 779 packssdw m4, m5 ; m4 = m128iA 780 781 psubd m2, m0 782 psrad m2, IDCT_SHIFT 783 psubd m3, m1 784 psrad m3, IDCT_SHIFT 785 packssdw m2, m3 ; m2 = m128iD 786 787 punpcklwd m1, m4, m2 788 punpckhwd m4, m2 789 790 punpcklwd m0, m1, m4 791 movlps [r1 + 0 * r2], m0 792 movhps [r1 + 1 * r2], m0 793 794 punpckhwd m1, m4 795 movlps [r1 + 2 * r2], m1 796 lea r1, [r1 + 2 * r2] 797 movhps [r1 + r2], m1 798 RET 799 800;------------------------------------------------------ 801;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride) 802;------------------------------------------------------ 803INIT_XMM sse2 804%if ARCH_X86_64 805cglobal dst4, 3, 4, 8+4 806 %define coef0 m8 807 %define coef1 m9 808 %define coef2 m10 809 %define coef3 m11 810%else ; ARCH_X86_64 = 0 811cglobal dst4, 3, 4, 8 812 %define coef0 [r3 + 0 * 16] 813 %define coef1 [r3 + 1 * 16] 814 %define coef2 [r3 + 2 * 16] 815 %define coef3 [r3 + 3 * 16] 816%endif ; ARCH_X86_64 817 818 mova m5, [pd_ %+ DST4_ROUND] 819 add r2d, r2d 820 lea r3, [tab_dst4] 821%if ARCH_X86_64 822 mova coef0, [r3 + 0 * 16] 823 mova coef1, [r3 + 1 * 16] 824 mova coef2, [r3 + 2 * 16] 825 mova coef3, [r3 + 3 * 16] 826%endif 827 movh m0, [r0 + 0 * r2] ; load 828 movhps m0, [r0 + 1 * r2] 829 lea r0, [r0 + 2 * r2] 830 movh m1, [r0] 831 movhps m1, [r0 + r2] 832 pmaddwd m2, m0, coef0 ; DST1 833 pmaddwd m3, m1, coef0 834 pshufd m6, m2, q2301 835 pshufd m7, m3, q2301 836 paddd m2, m6 837 paddd m3, m7 838 pshufd m2, m2, q3120 839 pshufd m3, m3, q3120 840 punpcklqdq m2, m3 841 paddd m2, m5 842 psrad m2, DST4_SHIFT 843 pmaddwd m3, m0, coef1 844 pmaddwd m4, m1, coef1 845 pshufd m6, m4, q2301 846 pshufd m7, m3, q2301 847 paddd m4, m6 848 paddd m3, m7 849 pshufd m4, m4, q3120 850 pshufd m3, m3, q3120 851 punpcklqdq m3, m4 852 paddd m3, m5 853 psrad m3, DST4_SHIFT 854 packssdw m2, m3 ; m2 = T70 855 pmaddwd m3, m0, coef2 856 pmaddwd m4, m1, coef2 857 pshufd m6, m4, q2301 858 pshufd m7, m3, q2301 859 paddd m4, m6 860 paddd m3, m7 861 pshufd m4, m4, q3120 862 pshufd m3, m3, q3120 863 punpcklqdq m3, m4 864 paddd m3, m5 865 psrad m3, DST4_SHIFT 866 pmaddwd m0, coef3 867 pmaddwd m1, coef3 868 pshufd m6, m0, q2301 869 pshufd m7, m1, q2301 870 paddd m0, m6 871 paddd m1, m7 872 pshufd m0, m0, q3120 873 pshufd m1, m1, q3120 874 punpcklqdq m0, m1 875 paddd m0, m5 876 psrad m0, DST4_SHIFT 877 packssdw m3, m0 ; m3 = T71 878 mova m5, [pd_128] 879 880 pmaddwd m0, m2, coef0 ; DST2 881 pmaddwd m1, m3, coef0 882 pshufd m6, m0, q2301 883 pshufd m7, m1, q2301 884 paddd m0, m6 885 paddd m1, m7 886 pshufd m0, m0, q3120 887 pshufd m1, m1, q3120 888 punpcklqdq m0, m1 889 paddd m0, m5 890 psrad m0, 8 891 892 pmaddwd m4, m2, coef1 893 pmaddwd m1, m3, coef1 894 pshufd m6, m4, q2301 895 pshufd m7, m1, q2301 896 paddd m4, m6 897 paddd m1, m7 898 pshufd m4, m4, q3120 899 pshufd m1, m1, q3120 900 punpcklqdq m4, m1 901 paddd m4, m5 902 psrad m4, 8 903 packssdw m0, m4 904 movu [r1 + 0 * 16], m0 905 906 pmaddwd m0, m2, coef2 907 pmaddwd m1, m3, coef2 908 pshufd m6, m0, q2301 909 pshufd m7, m1, q2301 910 paddd m0, m6 911 paddd m1, m7 912 pshufd m0, m0, q3120 913 pshufd m1, m1, q3120 914 punpcklqdq m0, m1 915 paddd m0, m5 916 psrad m0, 8 917 918 pmaddwd m2, coef3 919 pmaddwd m3, coef3 920 pshufd m6, m2, q2301 921 pshufd m7, m3, q2301 922 paddd m2, m6 923 paddd m3, m7 924 pshufd m2, m2, q3120 925 pshufd m3, m3, q3120 926 punpcklqdq m2, m3 927 paddd m2, m5 928 psrad m2, 8 929 packssdw m0, m2 930 movu [r1 + 1 * 16], m0 931 RET 932 933;------------------------------------------------------ 934;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride) 935;------------------------------------------------------ 936INIT_XMM ssse3 937%if ARCH_X86_64 938cglobal dst4, 3, 4, 8+2 939 %define coef2 m8 940 %define coef3 m9 941%else ; ARCH_X86_64 = 0 942cglobal dst4, 3, 4, 8 943 %define coef2 [r3 + 2 * 16] 944 %define coef3 [r3 + 3 * 16] 945%endif ; ARCH_X86_64 946%define coef0 m6 947%define coef1 m7 948 949 mova m5, [pd_ %+ DST4_ROUND] 950 add r2d, r2d 951 lea r3, [tab_dst4] 952 mova coef0, [r3 + 0 * 16] 953 mova coef1, [r3 + 1 * 16] 954%if ARCH_X86_64 955 mova coef2, [r3 + 2 * 16] 956 mova coef3, [r3 + 3 * 16] 957%endif 958 movh m0, [r0 + 0 * r2] ; load 959 movh m1, [r0 + 1 * r2] 960 punpcklqdq m0, m1 961 lea r0, [r0 + 2 * r2] 962 movh m1, [r0] 963 movh m2, [r0 + r2] 964 punpcklqdq m1, m2 965 pmaddwd m2, m0, coef0 ; DST1 966 pmaddwd m3, m1, coef0 967 phaddd m2, m3 968 paddd m2, m5 969 psrad m2, DST4_SHIFT 970 pmaddwd m3, m0, coef1 971 pmaddwd m4, m1, coef1 972 phaddd m3, m4 973 paddd m3, m5 974 psrad m3, DST4_SHIFT 975 packssdw m2, m3 ; m2 = T70 976 pmaddwd m3, m0, coef2 977 pmaddwd m4, m1, coef2 978 phaddd m3, m4 979 paddd m3, m5 980 psrad m3, DST4_SHIFT 981 pmaddwd m0, coef3 982 pmaddwd m1, coef3 983 phaddd m0, m1 984 paddd m0, m5 985 psrad m0, DST4_SHIFT 986 packssdw m3, m0 ; m3 = T71 987 mova m5, [pd_128] 988 989 pmaddwd m0, m2, coef0 ; DST2 990 pmaddwd m1, m3, coef0 991 phaddd m0, m1 992 paddd m0, m5 993 psrad m0, 8 994 995 pmaddwd m4, m2, coef1 996 pmaddwd m1, m3, coef1 997 phaddd m4, m1 998 paddd m4, m5 999 psrad m4, 8 1000 packssdw m0, m4 1001 movu [r1 + 0 * 16], m0 1002 1003 pmaddwd m0, m2, coef2 1004 pmaddwd m1, m3, coef2 1005 phaddd m0, m1 1006 paddd m0, m5 1007 psrad m0, 8 1008 1009 pmaddwd m2, coef3 1010 pmaddwd m3, coef3 1011 phaddd m2, m3 1012 paddd m2, m5 1013 psrad m2, 8 1014 packssdw m0, m2 1015 movu [r1 + 1 * 16], m0 1016 RET 1017 1018;------------------------------------------------------------------ 1019;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride) 1020;------------------------------------------------------------------ 1021INIT_YMM avx2 1022cglobal dst4, 3, 4, 6 1023 vbroadcasti128 m5, [pd_ %+ DST4_ROUND] 1024 mova m4, [trans8_shuf] 1025 add r2d, r2d 1026 lea r3, [pw_dst4_tab] 1027 1028 movq xm0, [r0 + 0 * r2] 1029 movhps xm0, [r0 + 1 * r2] 1030 lea r0, [r0 + 2 * r2] 1031 movq xm1, [r0] 1032 movhps xm1, [r0 + r2] 1033 1034 vinserti128 m0, m0, xm1, 1 ; m0 = src[0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] 1035 1036 pmaddwd m2, m0, [r3 + 0 * 32] 1037 pmaddwd m1, m0, [r3 + 1 * 32] 1038 phaddd m2, m1 1039 paddd m2, m5 1040 psrad m2, DST4_SHIFT 1041 pmaddwd m3, m0, [r3 + 2 * 32] 1042 pmaddwd m1, m0, [r3 + 3 * 32] 1043 phaddd m3, m1 1044 paddd m3, m5 1045 psrad m3, DST4_SHIFT 1046 packssdw m2, m3 1047 vpermd m2, m4, m2 1048 1049 vpbroadcastd m5, [pd_128] 1050 pmaddwd m0, m2, [r3 + 0 * 32] 1051 pmaddwd m1, m2, [r3 + 1 * 32] 1052 phaddd m0, m1 1053 paddd m0, m5 1054 psrad m0, 8 1055 pmaddwd m3, m2, [r3 + 2 * 32] 1056 pmaddwd m2, m2, [r3 + 3 * 32] 1057 phaddd m3, m2 1058 paddd m3, m5 1059 psrad m3, 8 1060 packssdw m0, m3 1061 vpermd m0, m4, m0 1062 movu [r1], m0 1063 RET 1064 1065;------------------------------------------------------- 1066;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride) 1067;------------------------------------------------------- 1068INIT_XMM sse2 1069cglobal idst4, 3, 4, 7 1070 mova m6, [pd_ %+ IDCT_ROUND] 1071 add r2d, r2d 1072 lea r3, [tab_idst4] 1073 mova m5, [pd_64] 1074 1075 movu m0, [r0 + 0 * 16] 1076 movu m1, [r0 + 1 * 16] 1077 1078 punpcklwd m2, m0, m1 ; m2 = m128iAC 1079 punpckhwd m0, m1 ; m0 = m128iBD 1080 1081 pmaddwd m1, m2, [r3 + 0 * 16] 1082 pmaddwd m3, m0, [r3 + 1 * 16] 1083 paddd m1, m3 1084 paddd m1, m5 1085 psrad m1, 7 ; m1 = S0 1086 1087 pmaddwd m3, m2, [r3 + 2 * 16] 1088 pmaddwd m4, m0, [r3 + 3 * 16] 1089 paddd m3, m4 1090 paddd m3, m5 1091 psrad m3, 7 ; m3 = S8 1092 packssdw m1, m3 ; m1 = m128iA 1093 1094 pmaddwd m3, m2, [r3 + 4 * 16] 1095 pmaddwd m4, m0, [r3 + 5 * 16] 1096 paddd m3, m4 1097 paddd m3, m5 1098 psrad m3, 7 ; m3 = S0 1099 1100 pmaddwd m2, [r3 + 6 * 16] 1101 pmaddwd m0, [r3 + 7 * 16] 1102 paddd m2, m0 1103 paddd m2, m5 1104 psrad m2, 7 ; m2 = S8 1105 packssdw m3, m2 ; m3 = m128iD 1106 1107 punpcklwd m0, m1, m3 1108 punpckhwd m1, m3 1109 1110 punpcklwd m2, m0, m1 1111 punpckhwd m0, m1 1112 punpcklwd m1, m2, m0 1113 punpckhwd m2, m0 1114 pmaddwd m0, m1, [r3 + 0 * 16] 1115 pmaddwd m3, m2, [r3 + 1 * 16] 1116 paddd m0, m3 1117 paddd m0, m6 1118 psrad m0, IDCT_SHIFT ; m0 = S0 1119 pmaddwd m3, m1, [r3 + 2 * 16] 1120 pmaddwd m4, m2, [r3 + 3 * 16] 1121 paddd m3, m4 1122 paddd m3, m6 1123 psrad m3, IDCT_SHIFT ; m3 = S8 1124 packssdw m0, m3 ; m0 = m128iA 1125 pmaddwd m3, m1, [r3 + 4 * 16] 1126 pmaddwd m4, m2, [r3 + 5 * 16] 1127 paddd m3, m4 1128 paddd m3, m6 1129 psrad m3, IDCT_SHIFT ; m3 = S0 1130 pmaddwd m1, [r3 + 6 * 16] 1131 pmaddwd m2, [r3 + 7 * 16] 1132 paddd m1, m2 1133 paddd m1, m6 1134 psrad m1, IDCT_SHIFT ; m1 = S8 1135 packssdw m3, m1 ; m3 = m128iD 1136 punpcklwd m1, m0, m3 1137 punpckhwd m0, m3 1138 1139 punpcklwd m2, m1, m0 1140 movlps [r1 + 0 * r2], m2 1141 movhps [r1 + 1 * r2], m2 1142 1143 punpckhwd m1, m0 1144 movlps [r1 + 2 * r2], m1 1145 lea r1, [r1 + 2 * r2] 1146 movhps [r1 + r2], m1 1147 RET 1148 1149;----------------------------------------------------------------- 1150;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride) 1151;----------------------------------------------------------------- 1152INIT_YMM avx2 1153cglobal idst4, 3, 4, 6 1154 vbroadcasti128 m4, [pd_ %+ IDCT_ROUND] 1155 add r2d, r2d 1156 lea r3, [pw_idst4_tab] 1157 1158 movu xm0, [r0 + 0 * 16] 1159 movu xm1, [r0 + 1 * 16] 1160 1161 punpcklwd m2, m0, m1 1162 punpckhwd m0, m1 1163 1164 vinserti128 m2, m2, xm2, 1 1165 vinserti128 m0, m0, xm0, 1 1166 1167 vpbroadcastd m5, [pd_64] 1168 pmaddwd m1, m2, [r3 + 0 * 32] 1169 pmaddwd m3, m0, [r3 + 1 * 32] 1170 paddd m1, m3 1171 paddd m1, m5 1172 psrad m1, 7 1173 pmaddwd m3, m2, [r3 + 2 * 32] 1174 pmaddwd m0, [r3 + 3 * 32] 1175 paddd m3, m0 1176 paddd m3, m5 1177 psrad m3, 7 1178 1179 packssdw m0, m1, m3 1180 pshufb m0, [pb_idst4_shuf] 1181 vpermq m1, m0, 11101110b 1182 1183 punpcklwd m2, m0, m1 1184 punpckhwd m0, m1 1185 punpcklwd m1, m2, m0 1186 punpckhwd m2, m0 1187 1188 vpermq m1, m1, 01000100b 1189 vpermq m2, m2, 01000100b 1190 1191 pmaddwd m0, m1, [r3 + 0 * 32] 1192 pmaddwd m3, m2, [r3 + 1 * 32] 1193 paddd m0, m3 1194 paddd m0, m4 1195 psrad m0, IDCT_SHIFT 1196 pmaddwd m3, m1, [r3 + 2 * 32] 1197 pmaddwd m2, m2, [r3 + 3 * 32] 1198 paddd m3, m2 1199 paddd m3, m4 1200 psrad m3, IDCT_SHIFT 1201 1202 packssdw m0, m3 1203 pshufb m1, m0, [pb_idst4_shuf] 1204 vpermq m0, m1, 11101110b 1205 1206 punpcklwd m2, m1, m0 1207 movq [r1 + 0 * r2], xm2 1208 movhps [r1 + 1 * r2], xm2 1209 1210 punpckhwd m1, m0 1211 movq [r1 + 2 * r2], xm1 1212 lea r1, [r1 + 2 * r2] 1213 movhps [r1 + r2], xm1 1214 RET 1215 1216;------------------------------------------------------- 1217; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride) 1218;------------------------------------------------------- 1219INIT_XMM sse2 1220cglobal dct8, 3,6,8,0-16*mmsize 1221 ;------------------------ 1222 ; Stack Mapping(dword) 1223 ;------------------------ 1224 ; Row0[0-3] Row1[0-3] 1225 ; ... 1226 ; Row6[0-3] Row7[0-3] 1227 ; Row0[0-3] Row7[0-3] 1228 ; ... 1229 ; Row6[4-7] Row7[4-7] 1230 ;------------------------ 1231 1232 add r2, r2 1233 lea r3, [r2 * 3] 1234 mov r5, rsp 1235%assign x 0 1236%rep 2 1237 movu m0, [r0] 1238 movu m1, [r0 + r2] 1239 movu m2, [r0 + r2 * 2] 1240 movu m3, [r0 + r3] 1241 1242 punpcklwd m4, m0, m1 1243 punpckhwd m0, m1 1244 punpcklwd m5, m2, m3 1245 punpckhwd m2, m3 1246 punpckldq m1, m4, m5 ; m1 = [1 0] 1247 punpckhdq m4, m5 ; m4 = [3 2] 1248 punpckldq m3, m0, m2 1249 punpckhdq m0, m2 1250 pshufd m2, m3, 0x4E ; m2 = [4 5] 1251 pshufd m0, m0, 0x4E ; m0 = [6 7] 1252 1253 paddw m3, m1, m0 1254 psubw m1, m0 ; m1 = [d1 d0] 1255 paddw m0, m4, m2 1256 psubw m4, m2 ; m4 = [d3 d2] 1257 punpcklqdq m2, m3, m0 ; m2 = [s2 s0] 1258 punpckhqdq m3, m0 1259 pshufd m3, m3, 0x4E ; m3 = [s1 s3] 1260 1261 punpcklwd m0, m1, m4 ; m0 = [d2/d0] 1262 punpckhwd m1, m4 ; m1 = [d3/d1] 1263 punpckldq m4, m0, m1 ; m4 = [d3 d1 d2 d0] 1264 punpckhdq m0, m1 ; m0 = [d3 d1 d2 d0] 1265 1266 ; odd 1267 lea r4, [tab_dct8_1] 1268 pmaddwd m1, m4, [r4 + 0*16] 1269 pmaddwd m5, m0, [r4 + 0*16] 1270 pshufd m1, m1, 0xD8 1271 pshufd m5, m5, 0xD8 1272 mova m7, m1 1273 punpckhqdq m7, m5 1274 punpcklqdq m1, m5 1275 paddd m1, m7 1276 paddd m1, [pd_ %+ DCT8_ROUND1] 1277 psrad m1, DCT8_SHIFT1 1278 %if x == 1 1279 pshufd m1, m1, 0x1B 1280 %endif 1281 mova [r5 + 1*2*mmsize], m1 ; Row 1 1282 1283 pmaddwd m1, m4, [r4 + 1*16] 1284 pmaddwd m5, m0, [r4 + 1*16] 1285 pshufd m1, m1, 0xD8 1286 pshufd m5, m5, 0xD8 1287 mova m7, m1 1288 punpckhqdq m7, m5 1289 punpcklqdq m1, m5 1290 paddd m1, m7 1291 paddd m1, [pd_ %+ DCT8_ROUND1] 1292 psrad m1, DCT8_SHIFT1 1293 %if x == 1 1294 pshufd m1, m1, 0x1B 1295 %endif 1296 mova [r5 + 3*2*mmsize], m1 ; Row 3 1297 1298 pmaddwd m1, m4, [r4 + 2*16] 1299 pmaddwd m5, m0, [r4 + 2*16] 1300 pshufd m1, m1, 0xD8 1301 pshufd m5, m5, 0xD8 1302 mova m7, m1 1303 punpckhqdq m7, m5 1304 punpcklqdq m1, m5 1305 paddd m1, m7 1306 paddd m1, [pd_ %+ DCT8_ROUND1] 1307 psrad m1, DCT8_SHIFT1 1308 %if x == 1 1309 pshufd m1, m1, 0x1B 1310 %endif 1311 mova [r5 + 5*2*mmsize], m1 ; Row 5 1312 1313 pmaddwd m4, [r4 + 3*16] 1314 pmaddwd m0, [r4 + 3*16] 1315 pshufd m4, m4, 0xD8 1316 pshufd m0, m0, 0xD8 1317 mova m7, m4 1318 punpckhqdq m7, m0 1319 punpcklqdq m4, m0 1320 paddd m4, m7 1321 paddd m4, [pd_ %+ DCT8_ROUND1] 1322 psrad m4, DCT8_SHIFT1 1323 %if x == 1 1324 pshufd m4, m4, 0x1B 1325 %endif 1326 mova [r5 + 7*2*mmsize], m4; Row 7 1327 1328 ; even 1329 lea r4, [tab_dct4] 1330 paddw m0, m2, m3 ; m0 = [EE1 EE0] 1331 pshufd m0, m0, 0xD8 1332 pshuflw m0, m0, 0xD8 1333 pshufhw m0, m0, 0xD8 1334 psubw m2, m3 ; m2 = [EO1 EO0] 1335 pmullw m2, [pw_ppppmmmm] 1336 pshufd m2, m2, 0xD8 1337 pshuflw m2, m2, 0xD8 1338 pshufhw m2, m2, 0xD8 1339 pmaddwd m3, m0, [r4 + 0*16] 1340 paddd m3, [pd_ %+ DCT8_ROUND1] 1341 psrad m3, DCT8_SHIFT1 1342 %if x == 1 1343 pshufd m3, m3, 0x1B 1344 %endif 1345 mova [r5 + 0*2*mmsize], m3 ; Row 0 1346 pmaddwd m0, [r4 + 2*16] 1347 paddd m0, [pd_ %+ DCT8_ROUND1] 1348 psrad m0, DCT8_SHIFT1 1349 %if x == 1 1350 pshufd m0, m0, 0x1B 1351 %endif 1352 mova [r5 + 4*2*mmsize], m0 ; Row 4 1353 pmaddwd m3, m2, [r4 + 1*16] 1354 paddd m3, [pd_ %+ DCT8_ROUND1] 1355 psrad m3, DCT8_SHIFT1 1356 %if x == 1 1357 pshufd m3, m3, 0x1B 1358 %endif 1359 mova [r5 + 2*2*mmsize], m3 ; Row 2 1360 pmaddwd m2, [r4 + 3*16] 1361 paddd m2, [pd_ %+ DCT8_ROUND1] 1362 psrad m2, DCT8_SHIFT1 1363 %if x == 1 1364 pshufd m2, m2, 0x1B 1365 %endif 1366 mova [r5 + 6*2*mmsize], m2 ; Row 6 1367 1368 %if x != 1 1369 lea r0, [r0 + r2 * 4] 1370 add r5, mmsize 1371 %endif 1372%assign x x+1 1373%endrep 1374 1375 mov r0, rsp ; r0 = pointer to Low Part 1376 lea r4, [tab_dct8_2] 1377 1378%assign x 0 1379%rep 4 1380 mova m0, [r0 + 0*2*mmsize] ; [3 2 1 0] 1381 mova m1, [r0 + 1*2*mmsize] 1382 paddd m2, m0, [r0 + (0*2+1)*mmsize] 1383 pshufd m2, m2, 0x9C ; m2 = [s2 s1 s3 s0] 1384 paddd m3, m1, [r0 + (1*2+1)*mmsize] 1385 pshufd m3, m3, 0x9C ; m3 = ^^ 1386 psubd m0, [r0 + (0*2+1)*mmsize] ; m0 = [d3 d2 d1 d0] 1387 psubd m1, [r0 + (1*2+1)*mmsize] ; m1 = ^^ 1388 1389 ; even 1390 pshufd m4, m2, 0xD8 1391 pshufd m3, m3, 0xD8 1392 mova m7, m4 1393 punpckhqdq m7, m3 1394 punpcklqdq m4, m3 1395 mova m2, m4 1396 paddd m4, m7 ; m4 = [EE1 EE0 EE1 EE0] 1397 psubd m2, m7 ; m2 = [EO1 EO0 EO1 EO0] 1398 1399 pslld m4, 6 ; m4 = [64*EE1 64*EE0] 1400 mova m5, m2 1401 pmuludq m5, [r4 + 0*16] 1402 pshufd m7, m2, 0xF5 1403 movu m6, [r4 + 0*16 + 4] 1404 pmuludq m7, m6 1405 pshufd m5, m5, 0x88 1406 pshufd m7, m7, 0x88 1407 punpckldq m5, m7 ; m5 = [36*EO1 83*EO0] 1408 pshufd m7, m2, 0xF5 1409 pmuludq m2, [r4 + 1*16] 1410 movu m6, [r4 + 1*16 + 4] 1411 pmuludq m7, m6 1412 pshufd m2, m2, 0x88 1413 pshufd m7, m7, 0x88 1414 punpckldq m2, m7 ; m2 = [83*EO1 36*EO0] 1415 1416 pshufd m3, m4, 0xD8 1417 pshufd m5, m5, 0xD8 1418 mova m7, m3 1419 punpckhqdq m7, m5 1420 punpcklqdq m3, m5 1421 paddd m3, m7 ; m3 = [Row2 Row0] 1422 paddd m3, [pd_ %+ DCT8_ROUND2] 1423 psrad m3, DCT8_SHIFT2 1424 pshufd m4, m4, 0xD8 1425 pshufd m2, m2, 0xD8 1426 mova m7, m4 1427 punpckhqdq m7, m2 1428 punpcklqdq m4, m2 1429 psubd m4, m7 ; m4 = [Row6 Row4] 1430 paddd m4, [pd_ %+ DCT8_ROUND2] 1431 psrad m4, DCT8_SHIFT2 1432 1433 packssdw m3, m3 1434 movd [r1 + 0*mmsize], m3 1435 pshufd m3, m3, 1 1436 movd [r1 + 2*mmsize], m3 1437 1438 packssdw m4, m4 1439 movd [r1 + 4*mmsize], m4 1440 pshufd m4, m4, 1 1441 movd [r1 + 6*mmsize], m4 1442 1443 ; odd 1444 mova m2, m0 1445 pmuludq m2, [r4 + 2*16] 1446 pshufd m7, m0, 0xF5 1447 movu m6, [r4 + 2*16 + 4] 1448 pmuludq m7, m6 1449 pshufd m2, m2, 0x88 1450 pshufd m7, m7, 0x88 1451 punpckldq m2, m7 1452 mova m3, m1 1453 pmuludq m3, [r4 + 2*16] 1454 pshufd m7, m1, 0xF5 1455 pmuludq m7, m6 1456 pshufd m3, m3, 0x88 1457 pshufd m7, m7, 0x88 1458 punpckldq m3, m7 1459 mova m4, m0 1460 pmuludq m4, [r4 + 3*16] 1461 pshufd m7, m0, 0xF5 1462 movu m6, [r4 + 3*16 + 4] 1463 pmuludq m7, m6 1464 pshufd m4, m4, 0x88 1465 pshufd m7, m7, 0x88 1466 punpckldq m4, m7 1467 mova m5, m1 1468 pmuludq m5, [r4 + 3*16] 1469 pshufd m7, m1, 0xF5 1470 pmuludq m7, m6 1471 pshufd m5, m5, 0x88 1472 pshufd m7, m7, 0x88 1473 punpckldq m5, m7 1474 pshufd m2, m2, 0xD8 1475 pshufd m3, m3, 0xD8 1476 mova m7, m2 1477 punpckhqdq m7, m3 1478 punpcklqdq m2, m3 1479 paddd m2, m7 1480 pshufd m4, m4, 0xD8 1481 pshufd m5, m5, 0xD8 1482 mova m7, m4 1483 punpckhqdq m7, m5 1484 punpcklqdq m4, m5 1485 paddd m4, m7 1486 pshufd m2, m2, 0xD8 1487 pshufd m4, m4, 0xD8 1488 mova m7, m2 1489 punpckhqdq m7, m4 1490 punpcklqdq m2, m4 1491 paddd m2, m7 ; m2 = [Row3 Row1] 1492 paddd m2, [pd_ %+ DCT8_ROUND2] 1493 psrad m2, DCT8_SHIFT2 1494 1495 packssdw m2, m2 1496 movd [r1 + 1*mmsize], m2 1497 pshufd m2, m2, 1 1498 movd [r1 + 3*mmsize], m2 1499 1500 mova m2, m0 1501 pmuludq m2, [r4 + 4*16] 1502 pshufd m7, m0, 0xF5 1503 movu m6, [r4 + 4*16 + 4] 1504 pmuludq m7, m6 1505 pshufd m2, m2, 0x88 1506 pshufd m7, m7, 0x88 1507 punpckldq m2, m7 1508 mova m3, m1 1509 pmuludq m3, [r4 + 4*16] 1510 pshufd m7, m1, 0xF5 1511 pmuludq m7, m6 1512 pshufd m3, m3, 0x88 1513 pshufd m7, m7, 0x88 1514 punpckldq m3, m7 1515 mova m4, m0 1516 pmuludq m4, [r4 + 5*16] 1517 pshufd m7, m0, 0xF5 1518 movu m6, [r4 + 5*16 + 4] 1519 pmuludq m7, m6 1520 pshufd m4, m4, 0x88 1521 pshufd m7, m7, 0x88 1522 punpckldq m4, m7 1523 mova m5, m1 1524 pmuludq m5, [r4 + 5*16] 1525 pshufd m7, m1, 0xF5 1526 pmuludq m7, m6 1527 pshufd m5, m5, 0x88 1528 pshufd m7, m7, 0x88 1529 punpckldq m5, m7 1530 pshufd m2, m2, 0xD8 1531 pshufd m3, m3, 0xD8 1532 mova m7, m2 1533 punpckhqdq m7, m3 1534 punpcklqdq m2, m3 1535 paddd m2, m7 1536 pshufd m4, m4, 0xD8 1537 pshufd m5, m5, 0xD8 1538 mova m7, m4 1539 punpckhqdq m7, m5 1540 punpcklqdq m4, m5 1541 paddd m4, m7 1542 pshufd m2, m2, 0xD8 1543 pshufd m4, m4, 0xD8 1544 mova m7, m2 1545 punpckhqdq m7, m4 1546 punpcklqdq m2, m4 1547 paddd m2, m7 ; m2 = [Row7 Row5] 1548 paddd m2, [pd_ %+ DCT8_ROUND2] 1549 psrad m2, DCT8_SHIFT2 1550 1551 packssdw m2, m2 1552 movd [r1 + 5*mmsize], m2 1553 pshufd m2, m2, 1 1554 movd [r1 + 7*mmsize], m2 1555%if x < 3 1556 add r1, mmsize/4 1557 add r0, 2*2*mmsize 1558%endif 1559%assign x x+1 1560%endrep 1561 1562 RET 1563 1564;------------------------------------------------------- 1565; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride) 1566;------------------------------------------------------- 1567INIT_XMM sse4 1568cglobal dct8, 3,6,7,0-16*mmsize 1569 ;------------------------ 1570 ; Stack Mapping(dword) 1571 ;------------------------ 1572 ; Row0[0-3] Row1[0-3] 1573 ; ... 1574 ; Row6[0-3] Row7[0-3] 1575 ; Row0[0-3] Row7[0-3] 1576 ; ... 1577 ; Row6[4-7] Row7[4-7] 1578 ;------------------------ 1579 mova m6, [pd_ %+ DCT8_ROUND1] 1580 1581 add r2, r2 1582 lea r3, [r2 * 3] 1583 mov r5, rsp 1584%assign x 0 1585%rep 2 1586 movu m0, [r0] 1587 movu m1, [r0 + r2] 1588 movu m2, [r0 + r2 * 2] 1589 movu m3, [r0 + r3] 1590 1591 punpcklwd m4, m0, m1 1592 punpckhwd m0, m1 1593 punpcklwd m5, m2, m3 1594 punpckhwd m2, m3 1595 punpckldq m1, m4, m5 ; m1 = [1 0] 1596 punpckhdq m4, m5 ; m4 = [3 2] 1597 punpckldq m3, m0, m2 1598 punpckhdq m0, m2 1599 pshufd m2, m3, 0x4E ; m2 = [4 5] 1600 pshufd m0, m0, 0x4E ; m0 = [6 7] 1601 1602 paddw m3, m1, m0 1603 psubw m1, m0 ; m1 = [d1 d0] 1604 paddw m0, m4, m2 1605 psubw m4, m2 ; m4 = [d3 d2] 1606 punpcklqdq m2, m3, m0 ; m2 = [s2 s0] 1607 punpckhqdq m3, m0 1608 pshufd m3, m3, 0x4E ; m3 = [s1 s3] 1609 1610 punpcklwd m0, m1, m4 ; m0 = [d2/d0] 1611 punpckhwd m1, m4 ; m1 = [d3/d1] 1612 punpckldq m4, m0, m1 ; m4 = [d3 d1 d2 d0] 1613 punpckhdq m0, m1 ; m0 = [d3 d1 d2 d0] 1614 1615 ; odd 1616 lea r4, [tab_dct8_1] 1617 pmaddwd m1, m4, [r4 + 0*16] 1618 pmaddwd m5, m0, [r4 + 0*16] 1619 phaddd m1, m5 1620 paddd m1, m6 1621 psrad m1, DCT8_SHIFT1 1622 %if x == 1 1623 pshufd m1, m1, 0x1B 1624 %endif 1625 mova [r5 + 1*2*mmsize], m1 ; Row 1 1626 1627 pmaddwd m1, m4, [r4 + 1*16] 1628 pmaddwd m5, m0, [r4 + 1*16] 1629 phaddd m1, m5 1630 paddd m1, m6 1631 psrad m1, DCT8_SHIFT1 1632 %if x == 1 1633 pshufd m1, m1, 0x1B 1634 %endif 1635 mova [r5 + 3*2*mmsize], m1 ; Row 3 1636 1637 pmaddwd m1, m4, [r4 + 2*16] 1638 pmaddwd m5, m0, [r4 + 2*16] 1639 phaddd m1, m5 1640 paddd m1, m6 1641 psrad m1, DCT8_SHIFT1 1642 %if x == 1 1643 pshufd m1, m1, 0x1B 1644 %endif 1645 mova [r5 + 5*2*mmsize], m1 ; Row 5 1646 1647 pmaddwd m4, [r4 + 3*16] 1648 pmaddwd m0, [r4 + 3*16] 1649 phaddd m4, m0 1650 paddd m4, m6 1651 psrad m4, DCT8_SHIFT1 1652 %if x == 1 1653 pshufd m4, m4, 0x1B 1654 %endif 1655 mova [r5 + 7*2*mmsize], m4; Row 7 1656 1657 ; even 1658 lea r4, [tab_dct4] 1659 paddw m0, m2, m3 ; m0 = [EE1 EE0] 1660 pshufb m0, [pb_unpackhlw1] 1661 psubw m2, m3 ; m2 = [EO1 EO0] 1662 psignw m2, [pw_ppppmmmm] 1663 pshufb m2, [pb_unpackhlw1] 1664 pmaddwd m3, m0, [r4 + 0*16] 1665 paddd m3, m6 1666 psrad m3, DCT8_SHIFT1 1667 %if x == 1 1668 pshufd m3, m3, 0x1B 1669 %endif 1670 mova [r5 + 0*2*mmsize], m3 ; Row 0 1671 pmaddwd m0, [r4 + 2*16] 1672 paddd m0, m6 1673 psrad m0, DCT8_SHIFT1 1674 %if x == 1 1675 pshufd m0, m0, 0x1B 1676 %endif 1677 mova [r5 + 4*2*mmsize], m0 ; Row 4 1678 pmaddwd m3, m2, [r4 + 1*16] 1679 paddd m3, m6 1680 psrad m3, DCT8_SHIFT1 1681 %if x == 1 1682 pshufd m3, m3, 0x1B 1683 %endif 1684 mova [r5 + 2*2*mmsize], m3 ; Row 2 1685 pmaddwd m2, [r4 + 3*16] 1686 paddd m2, m6 1687 psrad m2, DCT8_SHIFT1 1688 %if x == 1 1689 pshufd m2, m2, 0x1B 1690 %endif 1691 mova [r5 + 6*2*mmsize], m2 ; Row 6 1692 1693 %if x != 1 1694 lea r0, [r0 + r2 * 4] 1695 add r5, mmsize 1696 %endif 1697%assign x x+1 1698%endrep 1699 1700 mov r2, 2 1701 mov r0, rsp ; r0 = pointer to Low Part 1702 lea r4, [tab_dct8_2] 1703 mova m6, [pd_256] 1704 1705.pass2: 1706%rep 2 1707 mova m0, [r0 + 0*2*mmsize] ; [3 2 1 0] 1708 mova m1, [r0 + 1*2*mmsize] 1709 paddd m2, m0, [r0 + (0*2+1)*mmsize] 1710 pshufd m2, m2, 0x9C ; m2 = [s2 s1 s3 s0] 1711 paddd m3, m1, [r0 + (1*2+1)*mmsize] 1712 pshufd m3, m3, 0x9C ; m3 = ^^ 1713 psubd m0, [r0 + (0*2+1)*mmsize] ; m0 = [d3 d2 d1 d0] 1714 psubd m1, [r0 + (1*2+1)*mmsize] ; m1 = ^^ 1715 1716 ; even 1717 phaddd m4, m2, m3 ; m4 = [EE1 EE0 EE1 EE0] 1718 phsubd m2, m3 ; m2 = [EO1 EO0 EO1 EO0] 1719 1720 pslld m4, 6 ; m4 = [64*EE1 64*EE0] 1721 pmulld m5, m2, [r4 + 0*16] ; m5 = [36*EO1 83*EO0] 1722 pmulld m2, [r4 + 1*16] ; m2 = [83*EO1 36*EO0] 1723 1724 phaddd m3, m4, m5 ; m3 = [Row2 Row0] 1725 paddd m3, m6 1726 psrad m3, 9 1727 phsubd m4, m2 ; m4 = [Row6 Row4] 1728 paddd m4, m6 1729 psrad m4, 9 1730 1731 packssdw m3, m3 1732 movd [r1 + 0*mmsize], m3 1733 pshufd m3, m3, 1 1734 movd [r1 + 2*mmsize], m3 1735 1736 packssdw m4, m4 1737 movd [r1 + 4*mmsize], m4 1738 pshufd m4, m4, 1 1739 movd [r1 + 6*mmsize], m4 1740 1741 ; odd 1742 pmulld m2, m0, [r4 + 2*16] 1743 pmulld m3, m1, [r4 + 2*16] 1744 pmulld m4, m0, [r4 + 3*16] 1745 pmulld m5, m1, [r4 + 3*16] 1746 phaddd m2, m3 1747 phaddd m4, m5 1748 phaddd m2, m4 ; m2 = [Row3 Row1] 1749 paddd m2, m6 1750 psrad m2, 9 1751 1752 packssdw m2, m2 1753 movd [r1 + 1*mmsize], m2 1754 pshufd m2, m2, 1 1755 movd [r1 + 3*mmsize], m2 1756 1757 pmulld m2, m0, [r4 + 4*16] 1758 pmulld m3, m1, [r4 + 4*16] 1759 pmulld m4, m0, [r4 + 5*16] 1760 pmulld m5, m1, [r4 + 5*16] 1761 phaddd m2, m3 1762 phaddd m4, m5 1763 phaddd m2, m4 ; m2 = [Row7 Row5] 1764 paddd m2, m6 1765 psrad m2, 9 1766 1767 packssdw m2, m2 1768 movd [r1 + 5*mmsize], m2 1769 pshufd m2, m2, 1 1770 movd [r1 + 7*mmsize], m2 1771 1772 add r1, mmsize/4 1773 add r0, 2*2*mmsize 1774%endrep 1775 1776 dec r2 1777 jnz .pass2 1778 RET 1779 1780;------------------------------------------------------- 1781; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride) 1782;------------------------------------------------------- 1783%if ARCH_X86_64 1784INIT_XMM sse2 1785cglobal idct8, 3, 6, 16, 0-5*mmsize 1786 mova m9, [r0 + 1 * mmsize] 1787 mova m1, [r0 + 3 * mmsize] 1788 mova m7, m9 1789 punpcklwd m7, m1 1790 punpckhwd m9, m1 1791 mova m14, [tab_idct8_3] 1792 mova m3, m14 1793 pmaddwd m14, m7 1794 pmaddwd m3, m9 1795 mova m0, [r0 + 5 * mmsize] 1796 mova m10, [r0 + 7 * mmsize] 1797 mova m2, m0 1798 punpcklwd m2, m10 1799 punpckhwd m0, m10 1800 mova m15, [tab_idct8_3 + 1 * mmsize] 1801 mova m11, [tab_idct8_3 + 1 * mmsize] 1802 pmaddwd m15, m2 1803 mova m4, [tab_idct8_3 + 2 * mmsize] 1804 pmaddwd m11, m0 1805 mova m1, [tab_idct8_3 + 2 * mmsize] 1806 paddd m15, m14 1807 mova m5, [tab_idct8_3 + 4 * mmsize] 1808 mova m12, [tab_idct8_3 + 4 * mmsize] 1809 paddd m11, m3 1810 mova [rsp + 0 * mmsize], m11 1811 mova [rsp + 1 * mmsize], m15 1812 pmaddwd m4, m7 1813 pmaddwd m1, m9 1814 mova m14, [tab_idct8_3 + 3 * mmsize] 1815 mova m3, [tab_idct8_3 + 3 * mmsize] 1816 pmaddwd m14, m2 1817 pmaddwd m3, m0 1818 paddd m14, m4 1819 paddd m3, m1 1820 mova [rsp + 2 * mmsize], m3 1821 pmaddwd m5, m9 1822 pmaddwd m9, [tab_idct8_3 + 6 * mmsize] 1823 mova m6, [tab_idct8_3 + 5 * mmsize] 1824 pmaddwd m12, m7 1825 pmaddwd m7, [tab_idct8_3 + 6 * mmsize] 1826 mova m4, [tab_idct8_3 + 5 * mmsize] 1827 pmaddwd m6, m2 1828 paddd m6, m12 1829 pmaddwd m2, [tab_idct8_3 + 7 * mmsize] 1830 paddd m7, m2 1831 mova [rsp + 3 * mmsize], m6 1832 pmaddwd m4, m0 1833 pmaddwd m0, [tab_idct8_3 + 7 * mmsize] 1834 paddd m9, m0 1835 paddd m5, m4 1836 mova m6, [r0 + 0 * mmsize] 1837 mova m0, [r0 + 4 * mmsize] 1838 mova m4, m6 1839 punpcklwd m4, m0 1840 punpckhwd m6, m0 1841 mova m12, [r0 + 2 * mmsize] 1842 mova m0, [r0 + 6 * mmsize] 1843 mova m13, m12 1844 mova m8, [tab_dct4] 1845 punpcklwd m13, m0 1846 mova m10, [tab_dct4] 1847 punpckhwd m12, m0 1848 pmaddwd m8, m4 1849 mova m3, m8 1850 pmaddwd m4, [tab_dct4 + 2 * mmsize] 1851 pmaddwd m10, m6 1852 mova m2, [tab_dct4 + 1 * mmsize] 1853 mova m1, m10 1854 pmaddwd m6, [tab_dct4 + 2 * mmsize] 1855 mova m0, [tab_dct4 + 1 * mmsize] 1856 pmaddwd m2, m13 1857 paddd m3, m2 1858 psubd m8, m2 1859 mova m2, m6 1860 pmaddwd m13, [tab_dct4 + 3 * mmsize] 1861 pmaddwd m0, m12 1862 paddd m1, m0 1863 psubd m10, m0 1864 mova m0, m4 1865 pmaddwd m12, [tab_dct4 + 3 * mmsize] 1866 paddd m3, [pd_64] 1867 paddd m1, [pd_64] 1868 paddd m8, [pd_64] 1869 paddd m10, [pd_64] 1870 paddd m0, m13 1871 paddd m2, m12 1872 paddd m0, [pd_64] 1873 paddd m2, [pd_64] 1874 psubd m4, m13 1875 psubd m6, m12 1876 paddd m4, [pd_64] 1877 paddd m6, [pd_64] 1878 mova m12, m8 1879 psubd m8, m7 1880 psrad m8, 7 1881 paddd m15, m3 1882 psubd m3, [rsp + 1 * mmsize] 1883 psrad m15, 7 1884 paddd m12, m7 1885 psrad m12, 7 1886 paddd m11, m1 1887 mova m13, m14 1888 psrad m11, 7 1889 packssdw m15, m11 1890 psubd m1, [rsp + 0 * mmsize] 1891 psrad m1, 7 1892 mova m11, [rsp + 2 * mmsize] 1893 paddd m14, m0 1894 psrad m14, 7 1895 psubd m0, m13 1896 psrad m0, 7 1897 paddd m11, m2 1898 mova m13, [rsp + 3 * mmsize] 1899 psrad m11, 7 1900 packssdw m14, m11 1901 mova m11, m6 1902 psubd m6, m5 1903 paddd m13, m4 1904 psrad m13, 7 1905 psrad m6, 7 1906 paddd m11, m5 1907 psrad m11, 7 1908 packssdw m13, m11 1909 mova m11, m10 1910 psubd m4, [rsp + 3 * mmsize] 1911 psubd m10, m9 1912 psrad m4, 7 1913 psrad m10, 7 1914 packssdw m4, m6 1915 packssdw m8, m10 1916 paddd m11, m9 1917 psrad m11, 7 1918 packssdw m12, m11 1919 psubd m2, [rsp + 2 * mmsize] 1920 mova m5, m15 1921 psrad m2, 7 1922 packssdw m0, m2 1923 mova m2, m14 1924 psrad m3, 7 1925 packssdw m3, m1 1926 mova m6, m13 1927 punpcklwd m5, m8 1928 punpcklwd m2, m4 1929 mova m1, m12 1930 punpcklwd m6, m0 1931 punpcklwd m1, m3 1932 mova m9, m5 1933 punpckhwd m13, m0 1934 mova m0, m2 1935 punpcklwd m9, m6 1936 punpckhwd m5, m6 1937 punpcklwd m0, m1 1938 punpckhwd m2, m1 1939 punpckhwd m15, m8 1940 mova m1, m5 1941 punpckhwd m14, m4 1942 punpckhwd m12, m3 1943 mova m6, m9 1944 punpckhwd m9, m0 1945 punpcklwd m1, m2 1946 mova m4, [tab_idct8_3 + 0 * mmsize] 1947 punpckhwd m5, m2 1948 punpcklwd m6, m0 1949 mova m2, m15 1950 mova m0, m14 1951 mova m7, m9 1952 punpcklwd m2, m13 1953 punpcklwd m0, m12 1954 punpcklwd m7, m5 1955 punpckhwd m14, m12 1956 mova m10, m2 1957 punpckhwd m15, m13 1958 punpckhwd m9, m5 1959 pmaddwd m4, m7 1960 mova m13, m1 1961 punpckhwd m2, m0 1962 punpcklwd m10, m0 1963 mova m0, m15 1964 punpckhwd m15, m14 1965 mova m12, m1 1966 mova m3, [tab_idct8_3 + 0 * mmsize] 1967 punpcklwd m0, m14 1968 pmaddwd m3, m9 1969 mova m11, m2 1970 punpckhwd m2, m15 1971 punpcklwd m11, m15 1972 mova m8, [tab_idct8_3 + 1 * mmsize] 1973 punpcklwd m13, m0 1974 punpckhwd m12, m0 1975 pmaddwd m8, m11 1976 paddd m8, m4 1977 mova [rsp + 4 * mmsize], m8 1978 mova m4, [tab_idct8_3 + 2 * mmsize] 1979 pmaddwd m4, m7 1980 mova m15, [tab_idct8_3 + 2 * mmsize] 1981 mova m5, [tab_idct8_3 + 1 * mmsize] 1982 pmaddwd m15, m9 1983 pmaddwd m5, m2 1984 paddd m5, m3 1985 mova [rsp + 3 * mmsize], m5 1986 mova m14, [tab_idct8_3 + 3 * mmsize] 1987 mova m5, [tab_idct8_3 + 3 * mmsize] 1988 pmaddwd m14, m11 1989 paddd m14, m4 1990 mova [rsp + 2 * mmsize], m14 1991 pmaddwd m5, m2 1992 paddd m5, m15 1993 mova [rsp + 1 * mmsize], m5 1994 mova m15, [tab_idct8_3 + 4 * mmsize] 1995 mova m5, [tab_idct8_3 + 4 * mmsize] 1996 pmaddwd m15, m7 1997 pmaddwd m7, [tab_idct8_3 + 6 * mmsize] 1998 pmaddwd m5, m9 1999 pmaddwd m9, [tab_idct8_3 + 6 * mmsize] 2000 mova m4, [tab_idct8_3 + 5 * mmsize] 2001 pmaddwd m4, m2 2002 paddd m5, m4 2003 mova m4, m6 2004 mova m8, [tab_idct8_3 + 5 * mmsize] 2005 punpckhwd m6, m10 2006 pmaddwd m2, [tab_idct8_3 + 7 * mmsize] 2007 punpcklwd m4, m10 2008 paddd m9, m2 2009 pmaddwd m8, m11 2010 mova m10, [tab_dct4] 2011 paddd m8, m15 2012 pmaddwd m11, [tab_idct8_3 + 7 * mmsize] 2013 paddd m7, m11 2014 mova [rsp + 0 * mmsize], m8 2015 pmaddwd m10, m6 2016 pmaddwd m6, [tab_dct4 + 2 * mmsize] 2017 mova m1, m10 2018 mova m8, [tab_dct4] 2019 mova m3, [tab_dct4 + 1 * mmsize] 2020 pmaddwd m8, m4 2021 pmaddwd m4, [tab_dct4 + 2 * mmsize] 2022 mova m0, m8 2023 mova m2, [tab_dct4 + 1 * mmsize] 2024 pmaddwd m3, m13 2025 psubd m8, m3 2026 paddd m0, m3 2027 mova m3, m6 2028 pmaddwd m13, [tab_dct4 + 3 * mmsize] 2029 pmaddwd m2, m12 2030 paddd m1, m2 2031 psubd m10, m2 2032 mova m2, m4 2033 pmaddwd m12, [tab_dct4 + 3 * mmsize] 2034 mova m15, [pd_ %+ IDCT_ROUND] 2035 paddd m0, m15 2036 paddd m1, m15 2037 paddd m8, m15 2038 paddd m10, m15 2039 paddd m2, m13 2040 paddd m3, m12 2041 paddd m2, m15 2042 paddd m3, m15 2043 psubd m4, m13 2044 psubd m6, m12 2045 paddd m4, m15 2046 paddd m6, m15 2047 mova m15, [rsp + 4 * mmsize] 2048 mova m12, m8 2049 psubd m8, m7 2050 psrad m8, IDCT_SHIFT 2051 mova m11, [rsp + 3 * mmsize] 2052 paddd m15, m0 2053 psrad m15, IDCT_SHIFT 2054 psubd m0, [rsp + 4 * mmsize] 2055 psrad m0, IDCT_SHIFT 2056 paddd m12, m7 2057 paddd m11, m1 2058 mova m14, [rsp + 2 * mmsize] 2059 psrad m11, IDCT_SHIFT 2060 packssdw m15, m11 2061 psubd m1, [rsp + 3 * mmsize] 2062 psrad m1, IDCT_SHIFT 2063 mova m11, [rsp + 1 * mmsize] 2064 paddd m14, m2 2065 psrad m14, IDCT_SHIFT 2066 packssdw m0, m1 2067 psrad m12, IDCT_SHIFT 2068 psubd m2, [rsp + 2 * mmsize] 2069 paddd m11, m3 2070 mova m13, [rsp + 0 * mmsize] 2071 psrad m11, IDCT_SHIFT 2072 packssdw m14, m11 2073 mova m11, m6 2074 psubd m6, m5 2075 paddd m13, m4 2076 psrad m13, IDCT_SHIFT 2077 mova m1, m15 2078 paddd m11, m5 2079 psrad m11, IDCT_SHIFT 2080 packssdw m13, m11 2081 mova m11, m10 2082 psubd m10, m9 2083 psrad m10, IDCT_SHIFT 2084 packssdw m8, m10 2085 psrad m6, IDCT_SHIFT 2086 psubd m4, [rsp + 0 * mmsize] 2087 paddd m11, m9 2088 psrad m11, IDCT_SHIFT 2089 packssdw m12, m11 2090 punpcklwd m1, m14 2091 mova m5, m13 2092 psrad m4, IDCT_SHIFT 2093 packssdw m4, m6 2094 psubd m3, [rsp + 1 * mmsize] 2095 psrad m2, IDCT_SHIFT 2096 mova m6, m8 2097 psrad m3, IDCT_SHIFT 2098 punpcklwd m5, m12 2099 packssdw m2, m3 2100 punpcklwd m6, m4 2101 punpckhwd m8, m4 2102 mova m4, m1 2103 mova m3, m2 2104 punpckhdq m1, m5 2105 punpckldq m4, m5 2106 punpcklwd m3, m0 2107 punpckhwd m2, m0 2108 mova m0, m6 2109 lea r2, [r2 + r2] 2110 lea r4, [r2 + r2] 2111 lea r3, [r4 + r2] 2112 lea r4, [r4 + r3] 2113 lea r0, [r4 + r2 * 2] 2114 movq [r1], m4 2115 punpckhwd m15, m14 2116 movhps [r1 + r2], m4 2117 punpckhdq m0, m3 2118 movq [r1 + r2 * 2], m1 2119 punpckhwd m13, m12 2120 movhps [r1 + r3], m1 2121 mova m1, m6 2122 punpckldq m1, m3 2123 movq [r1 + 8], m1 2124 movhps [r1 + r2 + 8], m1 2125 movq [r1 + r2 * 2 + 8], m0 2126 movhps [r1 + r3 + 8], m0 2127 mova m0, m15 2128 punpckhdq m15, m13 2129 punpckldq m0, m13 2130 movq [r1 + r2 * 4], m0 2131 movhps [r1 + r4], m0 2132 mova m0, m8 2133 punpckhdq m8, m2 2134 movq [r1 + r3 * 2], m15 2135 punpckldq m0, m2 2136 movhps [r1 + r0], m15 2137 movq [r1 + r2 * 4 + 8], m0 2138 movhps [r1 + r4 + 8], m0 2139 movq [r1 + r3 * 2 + 8], m8 2140 movhps [r1 + r0 + 8], m8 2141 RET 2142%endif 2143 2144;------------------------------------------------------- 2145; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride) 2146;------------------------------------------------------- 2147INIT_XMM ssse3 2148cglobal patial_butterfly_inverse_internal_pass1 2149 movh m0, [r0] 2150 movhps m0, [r0 + 2 * 16] 2151 movh m1, [r0 + 4 * 16] 2152 movhps m1, [r0 + 6 * 16] 2153 2154 punpckhwd m2, m0, m1 ; [2 6] 2155 punpcklwd m0, m1 ; [0 4] 2156 pmaddwd m1, m0, [r6] ; EE[0] 2157 pmaddwd m0, [r6 + 32] ; EE[1] 2158 pmaddwd m3, m2, [r6 + 16] ; EO[0] 2159 pmaddwd m2, [r6 + 48] ; EO[1] 2160 2161 paddd m4, m1, m3 ; E[0] 2162 psubd m1, m3 ; E[3] 2163 paddd m3, m0, m2 ; E[1] 2164 psubd m0, m2 ; E[2] 2165 2166 ;E[K] = E[k] + add 2167 mova m5, [pd_64] 2168 paddd m0, m5 2169 paddd m1, m5 2170 paddd m3, m5 2171 paddd m4, m5 2172 2173 movh m2, [r0 + 16] 2174 movhps m2, [r0 + 5 * 16] 2175 movh m5, [r0 + 3 * 16] 2176 movhps m5, [r0 + 7 * 16] 2177 punpcklwd m6, m2, m5 ;[1 3] 2178 punpckhwd m2, m5 ;[5 7] 2179 2180 pmaddwd m5, m6, [r4] 2181 pmaddwd m7, m2, [r4 + 16] 2182 paddd m5, m7 ; O[0] 2183 2184 paddd m7, m4, m5 2185 psrad m7, 7 2186 2187 psubd m4, m5 2188 psrad m4, 7 2189 2190 packssdw m7, m4 2191 movh [r5 + 0 * 16], m7 2192 movhps [r5 + 7 * 16], m7 2193 2194 pmaddwd m5, m6, [r4 + 32] 2195 pmaddwd m4, m2, [r4 + 48] 2196 paddd m5, m4 ; O[1] 2197 2198 paddd m4, m3, m5 2199 psrad m4, 7 2200 2201 psubd m3, m5 2202 psrad m3, 7 2203 2204 packssdw m4, m3 2205 movh [r5 + 1 * 16], m4 2206 movhps [r5 + 6 * 16], m4 2207 2208 pmaddwd m5, m6, [r4 + 64] 2209 pmaddwd m4, m2, [r4 + 80] 2210 paddd m5, m4 ; O[2] 2211 2212 paddd m4, m0, m5 2213 psrad m4, 7 2214 2215 psubd m0, m5 2216 psrad m0, 7 2217 2218 packssdw m4, m0 2219 movh [r5 + 2 * 16], m4 2220 movhps [r5 + 5 * 16], m4 2221 2222 pmaddwd m5, m6, [r4 + 96] 2223 pmaddwd m4, m2, [r4 + 112] 2224 paddd m5, m4 ; O[3] 2225 2226 paddd m4, m1, m5 2227 psrad m4, 7 2228 2229 psubd m1, m5 2230 psrad m1, 7 2231 2232 packssdw m4, m1 2233 movh [r5 + 3 * 16], m4 2234 movhps [r5 + 4 * 16], m4 2235 2236 ret 2237 2238%macro PARTIAL_BUTTERFLY_PROCESS_ROW 1 2239 pshufb m4, %1, [pb_idct8even] 2240 pmaddwd m4, [tab_idct8_1] 2241 phsubd m5, m4 2242 pshufd m4, m4, 0x4E 2243 phaddd m4, m4 2244 punpckhqdq m4, m5 ;m4 = dd e[ 0 1 2 3] 2245 paddd m4, m6 2246 2247 pshufb %1, %1, [r6] 2248 pmaddwd m5, %1, [r4] 2249 pmaddwd %1, [r4 + 16] 2250 phaddd m5, %1 ; m5 = dd O[0, 1, 2, 3] 2251 2252 paddd %1, m4, m5 2253 psrad %1, IDCT_SHIFT 2254 2255 psubd m4, m5 2256 psrad m4, IDCT_SHIFT 2257 pshufd m4, m4, 0x1B 2258 2259 packssdw %1, m4 2260%endmacro 2261 2262INIT_XMM ssse3 2263cglobal patial_butterfly_inverse_internal_pass2 2264 mova m0, [r5] 2265 PARTIAL_BUTTERFLY_PROCESS_ROW m0 2266 movu [r1], m0 2267 2268 mova m2, [r5 + 16] 2269 PARTIAL_BUTTERFLY_PROCESS_ROW m2 2270 movu [r1 + r2], m2 2271 2272 mova m1, [r5 + 32] 2273 PARTIAL_BUTTERFLY_PROCESS_ROW m1 2274 movu [r1 + 2 * r2], m1 2275 2276 mova m3, [r5 + 48] 2277 PARTIAL_BUTTERFLY_PROCESS_ROW m3 2278 movu [r1 + r3], m3 2279 ret 2280 2281INIT_XMM ssse3 2282cglobal idct8, 3,7,8 ;,0-16*mmsize 2283 ; alignment stack to 64-bytes 2284 mov r5, rsp 2285 sub rsp, 16*mmsize + gprsize 2286 and rsp, ~(64-1) 2287 mov [rsp + 16*mmsize], r5 2288 mov r5, rsp 2289 2290 lea r4, [tab_idct8_3] 2291 lea r6, [tab_dct4] 2292 2293 call patial_butterfly_inverse_internal_pass1 2294 2295 add r0, 8 2296 add r5, 8 2297 2298 call patial_butterfly_inverse_internal_pass1 2299 2300 mova m6, [pd_ %+ IDCT_ROUND] 2301 add r2, r2 2302 lea r3, [r2 * 3] 2303 lea r4, [tab_idct8_2] 2304 lea r6, [pb_idct8odd] 2305 sub r5, 8 2306 2307 call patial_butterfly_inverse_internal_pass2 2308 2309 lea r1, [r1 + 4 * r2] 2310 add r5, 64 2311 2312 call patial_butterfly_inverse_internal_pass2 2313 2314 ; restore origin stack pointer 2315 mov rsp, [rsp + 16*mmsize] 2316 RET 2317 2318 2319;----------------------------------------------------------------------------- 2320; void denoise_dct(int16_t* dct, uint32_t* sum, uint16_t* offset, int size) 2321;----------------------------------------------------------------------------- 2322INIT_XMM sse4 2323cglobal denoise_dct, 4, 4, 6 2324 pxor m5, m5 2325 shr r3d, 3 2326.loop: 2327 movu m0, [r0] 2328 pabsw m1, m0 2329 movu m2, [r1] 2330 pmovsxwd m3, m1 2331 paddd m2, m3 2332 movu [r1], m2 2333 movu m2, [r1 + 16] 2334 psrldq m3, m1, 8 2335 pmovsxwd m4, m3 2336 paddd m2, m4 2337 movu [r1 + 16], m2 2338 2339 movu m3, [r2] 2340 psubusw m1, m3 2341 pcmpgtw m4, m1, m5 2342 pand m1, m4 2343 psignw m1, m0 2344 movu [r0], m1 2345 add r0, 16 2346 add r1, 32 2347 add r2, 16 2348 dec r3d 2349 jnz .loop 2350 RET 2351 2352INIT_YMM avx2 2353cglobal denoise_dct, 4, 4, 6 2354 pxor m5, m5 2355 shr r3d, 4 2356.loop: 2357 movu m0, [r0] 2358 pabsw m1, m0 2359 movu m2, [r1] 2360 pmovsxwd m4, xm1 2361 paddd m2, m4 2362 movu [r1], m2 2363 vextracti128 xm4, m1, 1 2364 movu m2, [r1 + 32] 2365 pmovsxwd m3, xm4 2366 paddd m2, m3 2367 movu [r1 + 32], m2 2368 movu m3, [r2] 2369 psubusw m1, m3 2370 pcmpgtw m4, m1, m5 2371 pand m1, m4 2372 psignw m1, m0 2373 movu [r0], m1 2374 add r0, 32 2375 add r1, 64 2376 add r2, 32 2377 dec r3d 2378 jnz .loop 2379 RET 2380%if ARCH_X86_64 == 1 2381INIT_ZMM avx512 2382cglobal denoise_dct, 4, 4, 22 2383 pxor m16, m16 2384 sub r3d, 16 2385 je .coeff16 2386 add r3d, 16 2387 shr r3d, 5 2388 jmp .loop 2389 2390.coeff16: 2391 movu ym19, [r0] 2392 pabsw ym17, ym19 2393 movu m2, [r1] 2394 pmovsxwd m18, ym17 2395 paddd m2, m18 2396 movu [r1], m2 2397 movu ym3, [r2] 2398 psubusw ym17, ym3 2399 pcmpgtw ym18, ym17, ym16 2400 pand ym17, ym18 2401 psignw ym17, ym19 2402 movu [r0], ym17 2403 RET 2404 2405.loop: 2406 movu m21, [r0] 2407 pabsw m17, m21 2408 movu m2, [r1] 2409 pmovsxwd m4, ym17 2410 paddd m2, m4 2411 movu [r1], m2 2412 vextracti64x4 ym4, m17, 1 2413 2414 movu m2, [r1 + mmsize] 2415 pmovsxwd m3, ym4 2416 paddd m2, m3 2417 movu [r1 + mmsize], m2 2418 movu m3, [r2] 2419 psubusw m17, m3 2420 2421 vextracti64x4 ym20, m17, 1 2422 pcmpgtw ym18, ym17, ym16 2423 pcmpgtw ym19, ym20, ym16 2424 vinserti64x4 m18, m18, ym19, 1 2425 2426 pand m17, m18 2427 vextracti64x4 ym19, m17, 1 2428 vextracti64x4 ym20, m21, 1 2429 psignw ym17, ym21 2430 psignw ym19, ym20 2431 vinserti64x4 m17, m17, ym19, 1 2432 2433 movu [r0], m17 2434 add r0, mmsize 2435 add r1, mmsize * 2 2436 add r2, mmsize 2437 dec r3d 2438 jnz .loop 2439 RET 2440%endif ; ARCH_X86_64 == 1 2441 2442%if ARCH_X86_64 == 1 2443%macro DCT8_PASS_1 4 2444 vpbroadcastq m0, [r6 + %1] 2445 pmaddwd m2, m%3, m0 2446 pmaddwd m0, m%4 2447 phaddd m2, m0 2448 paddd m2, m5 2449 psrad m2, DCT8_SHIFT1 2450 packssdw m2, m2 2451 vpermq m2, m2, 0x08 2452 mova [r5 + %2], xm2 2453%endmacro 2454 2455%macro DCT8_PASS_2 2 2456 vbroadcasti128 m4, [r6 + %1] 2457 pmaddwd m6, m0, m4 2458 pmaddwd m7, m1, m4 2459 pmaddwd m8, m2, m4 2460 pmaddwd m9, m3, m4 2461 phaddd m6, m7 2462 phaddd m8, m9 2463 phaddd m6, m8 2464 paddd m6, m5 2465 psrad m6, DCT8_SHIFT2 2466 2467 vbroadcasti128 m4, [r6 + %2] 2468 pmaddwd m10, m0, m4 2469 pmaddwd m7, m1, m4 2470 pmaddwd m8, m2, m4 2471 pmaddwd m9, m3, m4 2472 phaddd m10, m7 2473 phaddd m8, m9 2474 phaddd m10, m8 2475 paddd m10, m5 2476 psrad m10, DCT8_SHIFT2 2477 2478 packssdw m6, m10 2479 vpermq m10, m6, 0xD8 2480 2481%endmacro 2482 2483INIT_YMM avx2 2484cglobal dct8, 3, 7, 11, 0-8*16 2485vbroadcasti128 m5, [pd_ %+ DCT8_ROUND1] 2486%define DCT_SHIFT2 9 2487 2488 add r2d, r2d 2489 lea r3, [r2 * 3] 2490 lea r4, [r0 + r2 * 4] 2491 mov r5, rsp 2492 lea r6, [tab_dct8] 2493 mova m6, [dct8_shuf] 2494 2495 ;pass1 2496 mova xm0, [r0] 2497 vinserti128 m0, m0, [r4], 1 2498 mova xm1, [r0 + r2] 2499 vinserti128 m1, m1, [r4 + r2], 1 2500 mova xm2, [r0 + r2 * 2] 2501 vinserti128 m2, m2, [r4 + r2 * 2], 1 2502 mova xm3, [r0 + r3] 2503 vinserti128 m3, m3, [r4 + r3], 1 2504 2505 punpcklqdq m4, m0, m1 2506 punpckhqdq m0, m1 2507 punpcklqdq m1, m2, m3 2508 punpckhqdq m2, m3 2509 2510 pshufb m0, m6 2511 pshufb m2, m6 2512 2513 paddw m3, m4, m0 2514 paddw m7, m1, m2 2515 2516 psubw m4, m0 2517 psubw m1, m2 2518 2519 DCT8_PASS_1 0 * 16, 0 * 16, 3, 7 2520 DCT8_PASS_1 1 * 16, 2 * 16, 4, 1 2521 DCT8_PASS_1 2 * 16, 4 * 16, 3, 7 2522 DCT8_PASS_1 3 * 16, 6 * 16, 4, 1 2523 DCT8_PASS_1 4 * 16, 1 * 16, 3, 7 2524 DCT8_PASS_1 5 * 16, 3 * 16, 4, 1 2525 DCT8_PASS_1 6 * 16, 5 * 16, 3, 7 2526 DCT8_PASS_1 7 * 16, 7 * 16, 4, 1 2527 2528 ;pass2 2529 vbroadcasti128 m5, [pd_ %+ DCT8_ROUND2] 2530 2531 mova m0, [r5] 2532 mova m1, [r5 + 32] 2533 mova m2, [r5 + 64] 2534 mova m3, [r5 + 96] 2535 2536 DCT8_PASS_2 0 * 16, 1 * 16 2537 movu [r1], m10 2538 DCT8_PASS_2 2 * 16, 3 * 16 2539 movu [r1 + 32], m10 2540 DCT8_PASS_2 4 * 16, 5 * 16 2541 movu [r1 + 64], m10 2542 DCT8_PASS_2 6 * 16, 7 * 16 2543 movu [r1 + 96], m10 2544 RET 2545 2546 2547%macro DCT8_AVX512_PASS_1 4 2548 vpmaddwd m%2, m3, m%1 2549 vpsrlq m8, m%2, 32 2550 vpaddd m%2, m8 2551 vpaddd m%2, m5 2552 vpsrad m%2, DCT8_SHIFT1 2553 2554 vpmaddwd m%4, m2, m%3 2555 vpsrlq m8, m%4, 32 2556 vpaddd m%4, m8 2557 vpaddd m%4, m5 2558 vpsrad m%4, DCT8_SHIFT1 2559 2560 vpackssdw m%2, m%4 2561 vpermw m%2, m1, m%2 2562%endmacro 2563 2564%macro DCT8_AVX512_PASS_2 4 2565 vpmaddwd m0, m9, m%1 2566 vpmaddwd m1, m10, m%1 2567 vpsrldq m2, m0, 8 2568 vpsrldq m3, m1, 8 2569 vpaddd m0, m2 2570 vpaddd m1, m3 2571 vpsrlq m2, m0, 32 2572 vpsrlq m3, m1, 32 2573 vpaddd m0, m2 2574 vpaddd m1, m3 2575 vpaddd m0, m5 2576 vpsrad m0, DCT8_SHIFT2 2577 vpaddd m1, m5 2578 vpsrad m1, DCT8_SHIFT2 2579 vpackssdw m0, m1 2580 vpermw m0, m19, m0 2581 2582 vpmaddwd m1, m9, m%2 2583 vpmaddwd m2, m10, m%2 2584 vpsrldq m3, m1, 8 2585 vpsrldq m4, m2, 8 2586 vpaddd m1, m3 2587 vpaddd m2, m4 2588 vpsrlq m3, m1, 32 2589 vpsrlq m4, m2, 32 2590 vpaddd m1, m3 2591 vpaddd m2, m4 2592 vpaddd m1, m5 2593 vpsrad m1, DCT8_SHIFT2 2594 vpaddd m2, m5 2595 vpsrad m2, DCT8_SHIFT2 2596 vpackssdw m1, m2 2597 vpermw m1, m19, m1 2598 vinserti128 ym0, ym0, xm1, 1 2599 2600 vpmaddwd m1, m9, m%3 2601 vpmaddwd m2, m10, m%3 2602 vpsrldq m3, m1, 8 2603 vpsrldq m4, m2, 8 2604 vpaddd m1, m3 2605 vpaddd m2, m4 2606 vpsrlq m3, m1, 32 2607 vpsrlq m4, m2, 32 2608 vpaddd m1, m3 2609 vpaddd m2, m4 2610 vpaddd m1, m5 2611 vpsrad m1, DCT8_SHIFT2 2612 vpaddd m2, m5 2613 vpsrad m2, DCT8_SHIFT2 2614 vpackssdw m1, m2 2615 vpermw m1, m19, m1 2616 2617 vpmaddwd m2, m9, m%4 2618 vpmaddwd m3, m10, m%4 2619 vpsrldq m4, m2, 8 2620 vpsrldq m6, m3, 8 2621 vpaddd m2, m4 2622 vpaddd m3, m6 2623 vpsrlq m4, m2, 32 2624 vpsrlq m6, m3, 32 2625 vpaddd m2, m4 2626 vpaddd m3, m6 2627 vpaddd m2, m5 2628 vpsrad m2, DCT8_SHIFT2 2629 vpaddd m3, m5 2630 vpsrad m3, DCT8_SHIFT2 2631 vpackssdw m2, m3 2632 vpermw m2, m19, m2 2633 2634 vinserti128 ym1, ym1, xm2, 1 2635 vinserti64x4 m0, m0, ym1, 1 2636%endmacro 2637 2638INIT_ZMM avx512 2639cglobal dct8, 3, 7, 24 2640 2641 vbroadcasti32x4 m5, [pd_ %+ DCT8_ROUND1] 2642 vbroadcasti32x8 m4, [dct8_shuf] 2643 vbroadcasti32x4 m19, [dct8_shuf9_AVX512] 2644 2645 add r2d, r2d 2646 lea r3, [r2 * 3] 2647 lea r4, [r0 + r2 * 4] 2648 lea r5, [tab_dct8] 2649 lea r6, [tab_dct8_avx512] 2650 2651 ;pass1 2652 mova xm0, [r0] 2653 vinserti128 ym0, ym0, [r4], 1 2654 mova xm1, [r0 + r2] 2655 vinserti128 ym1, ym1, [r4 + r2], 1 2656 mova xm2, [r0 + r2 * 2] 2657 vinserti128 ym2, ym2, [r4 + r2 * 2], 1 2658 mova xm3, [r0 + r3] 2659 vinserti128 ym3, ym3, [r4 + r3], 1 2660 2661 vinserti64x4 m0, m0, ym2, 1 2662 vinserti64x4 m1, m1, ym3, 1 2663 2664 vpunpcklqdq m2, m0, m1 2665 vpunpckhqdq m0, m1 2666 2667 vpshufb m0, m4 2668 vpaddw m3, m2, m0 2669 vpsubw m2, m0 2670 2671 vbroadcasti32x8 m1, [dct8_shuf7_AVX512] 2672 2673 ; Load all the coefficients togather for better caching 2674 vpbroadcastq m20, [r6 + 0 * 8] 2675 vpbroadcastq m21, [r6 + 1 * 8] 2676 vpbroadcastq m22, [r6 + 2 * 8] 2677 vpbroadcastq m23, [r6 + 3 * 8] 2678 vpbroadcastq m7, [r6 + 4 * 8] 2679 vpbroadcastq m12, [r6 + 5 * 8] 2680 vpbroadcastq m14, [r6 + 6 * 8] 2681 vpbroadcastq m16, [r6 + 7 * 8] 2682 2683 DCT8_AVX512_PASS_1 20, 9, 21, 10 2684 DCT8_AVX512_PASS_1 22, 11, 23, 10 2685 DCT8_AVX512_PASS_1 7, 13, 12, 10 2686 DCT8_AVX512_PASS_1 14, 15, 16, 10 2687 2688 ;pass2 2689 vbroadcasti32x4 m5, [pd_ %+ DCT8_ROUND2] 2690 2691 vinserti64x4 m9, m9, ym11, 1 2692 vinserti64x4 m10, m13, ym15, 1 2693 2694 ;Load all the coefficients togather for better caching and reuse common coefficients from PASS 1 2695 vbroadcasti32x4 m21, [r5 + 1 * 16] 2696 vbroadcasti32x4 m22, [r5 + 2 * 16] 2697 vbroadcasti32x4 m23, [r5 + 3 * 16] 2698 vbroadcasti32x4 m12, [r5 + 5 * 16] 2699 vbroadcasti32x4 m14, [r5 + 6 * 16] 2700 vbroadcasti32x4 m16, [r5 + 7 * 16] 2701 2702 DCT8_AVX512_PASS_2 20, 21, 22, 23 2703 movu [r1], m0 2704 DCT8_AVX512_PASS_2 7, 12, 14, 16 2705 movu [r1 + 64], m0 2706 RET 2707 2708%macro DCT16_PASS_1_E 2 2709 vpbroadcastq m7, [r7 + %1] 2710 2711 pmaddwd m4, m0, m7 2712 pmaddwd m6, m2, m7 2713 phaddd m4, m6 2714 2715 paddd m4, m9 2716 psrad m4, DCT_SHIFT 2717 2718 packssdw m4, m4 2719 vpermq m4, m4, 0x08 2720 2721 mova [r5 + %2], xm4 2722%endmacro 2723 2724%macro DCT16_PASS_1_O 2 2725 vbroadcasti128 m7, [r7 + %1] 2726 2727 pmaddwd m10, m0, m7 2728 pmaddwd m11, m2, m7 2729 phaddd m10, m11 ; [d0 d0 d1 d1 d4 d4 d5 d5] 2730 2731 pmaddwd m11, m4, m7 2732 pmaddwd m12, m6, m7 2733 phaddd m11, m12 ; [d2 d2 d3 d3 d6 d6 d7 d7] 2734 2735 phaddd m10, m11 ; [d0 d1 d2 d3 d4 d5 d6 d7] 2736 2737 paddd m10, m9 2738 psrad m10, DCT_SHIFT 2739 2740 packssdw m10, m10 ; [w0 w1 w2 w3 - - - - w4 w5 w6 w7 - - - -] 2741 vpermq m10, m10, 0x08 2742 2743 mova [r5 + %2], xm10 2744%endmacro 2745 2746%macro DCT16_PASS_2 2 2747 vbroadcasti128 m8, [r7 + %1] 2748 vbroadcasti128 m13, [r8 + %1] 2749 2750 pmaddwd m10, m0, m8 2751 pmaddwd m11, m1, m13 2752 paddd m10, m11 2753 2754 pmaddwd m11, m2, m8 2755 pmaddwd m12, m3, m13 2756 paddd m11, m12 2757 phaddd m10, m11 2758 2759 pmaddwd m11, m4, m8 2760 pmaddwd m12, m5, m13 2761 paddd m11, m12 2762 2763 pmaddwd m12, m6, m8 2764 pmaddwd m13, m7, m13 2765 paddd m12, m13 2766 phaddd m11, m12 2767 2768 phaddd m10, m11 2769 paddd m10, m9 2770 psrad m10, DCT_SHIFT2 2771 2772 2773 vbroadcasti128 m8, [r7 + %2] 2774 vbroadcasti128 m13, [r8 + %2] 2775 2776 pmaddwd m14, m0, m8 2777 pmaddwd m11, m1, m13 2778 paddd m14, m11 2779 2780 pmaddwd m11, m2, m8 2781 pmaddwd m12, m3, m13 2782 paddd m11, m12 2783 phaddd m14, m11 2784 2785 pmaddwd m11, m4, m8 2786 pmaddwd m12, m5, m13 2787 paddd m11, m12 2788 2789 pmaddwd m12, m6, m8 2790 pmaddwd m13, m7, m13 2791 paddd m12, m13 2792 phaddd m11, m12 2793 2794 phaddd m14, m11 2795 paddd m14, m9 2796 psrad m14, DCT_SHIFT2 2797 2798 packssdw m10, m14 2799 vextracti128 xm14, m10, 1 2800 movlhps xm15, xm10, xm14 2801 movhlps xm14, xm10 2802%endmacro 2803INIT_YMM avx2 2804cglobal dct16, 3, 9, 16, 0-16*mmsize 2805%if BIT_DEPTH == 12 2806 %define DCT_SHIFT 7 2807 vbroadcasti128 m9, [pd_64] 2808%elif BIT_DEPTH == 10 2809 %define DCT_SHIFT 5 2810 vbroadcasti128 m9, [pd_16] 2811%elif BIT_DEPTH == 8 2812 %define DCT_SHIFT 3 2813 vbroadcasti128 m9, [pd_4] 2814%else 2815 %error Unsupported BIT_DEPTH! 2816%endif 2817%define DCT_SHIFT2 10 2818 2819 add r2d, r2d 2820 2821 mova m13, [dct16_shuf1] 2822 mova m14, [dct16_shuf2] 2823 lea r7, [tab_dct16_1 + 8 * 16] 2824 lea r8, [tab_dct16_2 + 8 * 16] 2825 lea r3, [r2 * 3] 2826 mov r5, rsp 2827 mov r4d, 2 ; Each iteration process 8 rows, so 16/8 iterations 2828 2829.pass1: 2830 lea r6, [r0 + r2 * 4] 2831 2832 movu m2, [r0] 2833 movu m1, [r6] 2834 vperm2i128 m0, m2, m1, 0x20 ; [row0lo row4lo] 2835 vperm2i128 m1, m2, m1, 0x31 ; [row0hi row4hi] 2836 2837 movu m4, [r0 + r2] 2838 movu m3, [r6 + r2] 2839 vperm2i128 m2, m4, m3, 0x20 ; [row1lo row5lo] 2840 vperm2i128 m3, m4, m3, 0x31 ; [row1hi row5hi] 2841 2842 movu m6, [r0 + r2 * 2] 2843 movu m5, [r6 + r2 * 2] 2844 vperm2i128 m4, m6, m5, 0x20 ; [row2lo row6lo] 2845 vperm2i128 m5, m6, m5, 0x31 ; [row2hi row6hi] 2846 2847 movu m8, [r0 + r3] 2848 movu m7, [r6 + r3] 2849 vperm2i128 m6, m8, m7, 0x20 ; [row3lo row7lo] 2850 vperm2i128 m7, m8, m7, 0x31 ; [row3hi row7hi] 2851 2852 pshufb m1, m13 2853 pshufb m3, m13 2854 pshufb m5, m13 2855 pshufb m7, m13 2856 2857 paddw m8, m0, m1 ;E 2858 psubw m0, m1 ;O 2859 2860 paddw m1, m2, m3 ;E 2861 psubw m2, m3 ;O 2862 2863 paddw m3, m4, m5 ;E 2864 psubw m4, m5 ;O 2865 2866 paddw m5, m6, m7 ;E 2867 psubw m6, m7 ;O 2868 2869 DCT16_PASS_1_O -7 * 16, 1 * 32 2870 DCT16_PASS_1_O -5 * 16, 3 * 32 2871 DCT16_PASS_1_O -3 * 16, 1 * 32 + 16 2872 DCT16_PASS_1_O -1 * 16, 3 * 32 + 16 2873 DCT16_PASS_1_O 1 * 16, 5 * 32 2874 DCT16_PASS_1_O 3 * 16, 7 * 32 2875 DCT16_PASS_1_O 5 * 16, 5 * 32 + 16 2876 DCT16_PASS_1_O 7 * 16, 7 * 32 + 16 2877 2878 pshufb m8, m14 2879 pshufb m1, m14 2880 phaddw m0, m8, m1 2881 2882 pshufb m3, m14 2883 pshufb m5, m14 2884 phaddw m2, m3, m5 2885 2886 DCT16_PASS_1_E -8 * 16, 0 * 32 2887 DCT16_PASS_1_E -4 * 16, 0 * 32 + 16 2888 DCT16_PASS_1_E 0 * 16, 4 * 32 2889 DCT16_PASS_1_E 4 * 16, 4 * 32 + 16 2890 2891 phsubw m0, m8, m1 2892 phsubw m2, m3, m5 2893 2894 DCT16_PASS_1_E -6 * 16, 2 * 32 2895 DCT16_PASS_1_E -2 * 16, 2 * 32 + 16 2896 DCT16_PASS_1_E 2 * 16, 6 * 32 2897 DCT16_PASS_1_E 6 * 16, 6 * 32 + 16 2898 2899 lea r0, [r0 + 8 * r2] 2900 add r5, 256 2901 2902 dec r4d 2903 jnz .pass1 2904 2905 mov r5, rsp 2906 mov r4d, 2 2907 mov r2d, 32 2908 lea r3, [r2 * 3] 2909 vbroadcasti128 m9, [pd_512] 2910 2911.pass2: 2912 mova m0, [r5 + 0 * 32] ; [row0lo row4lo] 2913 mova m1, [r5 + 8 * 32] ; [row0hi row4hi] 2914 2915 mova m2, [r5 + 1 * 32] ; [row1lo row5lo] 2916 mova m3, [r5 + 9 * 32] ; [row1hi row5hi] 2917 2918 mova m4, [r5 + 2 * 32] ; [row2lo row6lo] 2919 mova m5, [r5 + 10 * 32] ; [row2hi row6hi] 2920 2921 mova m6, [r5 + 3 * 32] ; [row3lo row7lo] 2922 mova m7, [r5 + 11 * 32] ; [row3hi row7hi] 2923 2924 DCT16_PASS_2 -8 * 16, -7 * 16 2925 movu [r1], xm15 2926 movu [r1 + r2], xm14 2927 2928 DCT16_PASS_2 -6 * 16, -5 * 16 2929 movu [r1 + r2 * 2], xm15 2930 movu [r1 + r3], xm14 2931 2932 lea r6, [r1 + r2 * 4] 2933 DCT16_PASS_2 -4 * 16, -3 * 16 2934 movu [r6], xm15 2935 movu [r6 + r2], xm14 2936 2937 DCT16_PASS_2 -2 * 16, -1 * 16 2938 movu [r6 + r2 * 2], xm15 2939 movu [r6 + r3], xm14 2940 2941 lea r6, [r6 + r2 * 4] 2942 DCT16_PASS_2 0 * 16, 1 * 16 2943 movu [r6], xm15 2944 movu [r6 + r2], xm14 2945 2946 DCT16_PASS_2 2 * 16, 3 * 16 2947 movu [r6 + r2 * 2], xm15 2948 movu [r6 + r3], xm14 2949 2950 lea r6, [r6 + r2 * 4] 2951 DCT16_PASS_2 4 * 16, 5 * 16 2952 movu [r6], xm15 2953 movu [r6 + r2], xm14 2954 2955 DCT16_PASS_2 6 * 16, 7 * 16 2956 movu [r6 + r2 * 2], xm15 2957 movu [r6 + r3], xm14 2958 2959 add r1, 16 2960 add r5, 128 2961 2962 dec r4d 2963 jnz .pass2 2964 RET 2965%macro DCT16_avx512_PASS_1_O 4 2966 vbroadcasti32x4 m1, [r5 + %1] 2967 2968 pmaddwd m3, m6, m1 2969 vpsrldq m11, m3, 8 2970 vpaddd m3, m11 2971 2972 pmaddwd m11, m8, m1 2973 vpsrldq m12, m11, 8 2974 vpaddd m11, m12 2975 2976 vpunpcklqdq m12, m3, m11 2977 vpsrldq m11, m12, 4 2978 vpaddd m11, m12 2979 2980 pmaddwd m3, m10, m1 2981 vpsrldq m12, m3, 8 2982 vpaddd m3, m12 2983 2984 pmaddwd m12, m2, m1 2985 vpsrldq m13, m12, 8 2986 vpaddd m12, m13 2987 2988 vpunpcklqdq m13, m3, m12 2989 vpsrldq m12, m13, 4 2990 vpaddd m12, m13 2991 2992 mova m%3, m26 2993 vpermi2d m%3, m11, m12 2994 paddd m%3, m0 2995 psrad m%3, DCT_SHIFT 2996 2997 ; next row start 2998 vbroadcasti32x4 m1, [r5 + %2] 2999 3000 pmaddwd m3, m6, m1 3001 vpsrldq m11, m3, 8 3002 vpaddd m3, m11 3003 3004 pmaddwd m11, m8, m1 3005 vpsrldq m12, m11, 8 3006 vpaddd m11, m12 3007 3008 vpunpcklqdq m12, m3, m11 3009 vpsrldq m11, m12, 4 3010 vpaddd m11, m12 3011 3012 pmaddwd m3, m10, m1 3013 vpsrldq m12, m3, 8 3014 vpaddd m3, m12 3015 3016 pmaddwd m12, m2, m1 3017 vpsrldq m13, m12, 8 3018 vpaddd m12, m13 3019 3020 vpunpcklqdq m13, m3, m12 3021 vpsrldq m12, m13, 4 3022 vpaddd m12, m13 3023 3024 mova m%4, m26 3025 vpermi2d m%4, m11, m12 3026 paddd m%4, m0 3027 psrad m%4, DCT_SHIFT 3028 ;next row end 3029 3030 packssdw m%3, m%4 3031 vpermw m%4, m25, m%3 3032%endmacro 3033 3034%macro DCT16_AVX512_PASS_1_LOOP 0 3035 vbroadcasti32x8 m1, [dct16_shuf1] 3036 mova m2, [dct16_shuf3_AVX512] 3037 mova m3, [dct16_shuf4_AVX512] 3038 3039 movu ym4, [r0] 3040 movu ym5, [r0 + r2] 3041 vinserti64x4 m4, m4, ym5, 1 3042 3043 movu ym5, [r0 + 2 * r2] 3044 movu ym6, [r0 + r3] 3045 vinserti64x4 m5, m5, ym6, 1 3046 3047 mova m6, m2 3048 mova m7, m3 3049 vpermi2q m6, m4, m5 3050 vpermi2q m7, m4, m5 3051 3052 movu ym4, [r4] 3053 movu ym5, [r4 + r2] 3054 vinserti64x4 m4, m4, ym5, 1 3055 3056 movu ym5, [r4 + 2 * r2] 3057 movu ym8, [r4 + r3] 3058 vinserti64x4 m5, m5, ym8, 1 3059 3060 mova m8, m2 3061 mova m9, m3 3062 vpermi2q m8, m4, m5 3063 vpermi2q m9, m4, m5 3064 3065 vpshufb m7, m1 3066 vpshufb m9, m1 3067 3068 paddw m4, m6, m7 3069 psubw m6, m7 3070 3071 paddw m5, m8, m9 3072 psubw m8, m9 3073 3074 lea r0, [r0 + 8 * r2] 3075 lea r4, [r0 + r2 * 4] 3076 3077 movu ym7, [r0] 3078 movu ym9, [r0 + r2] 3079 vinserti64x4 m7, m7, ym9, 1 3080 3081 movu ym9, [r0 + 2 * r2] 3082 movu ym10, [r0 + r3] 3083 vinserti64x4 m9, m9, ym10, 1 3084 3085 mova m10, m2 3086 mova m11, m3 3087 vpermi2q m10, m7, m9 3088 vpermi2q m11, m7, m9 3089 3090 vpshufb m11, m1 3091 paddw m7, m10, m11 3092 psubw m10, m11 3093 3094 movu ym9, [r4] 3095 movu ym11, [r4 + r2] 3096 vinserti64x4 m9, m9, ym11, 1 3097 3098 movu ym11, [r4 + 2 * r2] 3099 movu ym12, [r4 + r3] 3100 vinserti64x4 m11, m11, ym12, 1 3101 3102 vpermi2q m2, m9, m11 3103 vpermi2q m3, m9, m11 3104 3105 vpshufb m3, m1 3106 paddw m9, m2, m3 3107 psubw m2, m3 3108%endmacro 3109 3110%macro DCT16_avx512_PASS_1_E 4 3111 vpbroadcastq m1, [r5 + %1] 3112 3113 pmaddwd m19, m11, m1 3114 vpsrldq m12, m19, 4 3115 vpaddd m12, m19 3116 3117 pmaddwd m19, m13, m1 3118 vpsrldq m18, m19, 4 3119 vpaddd m18, m19 3120 3121 mova m%2, m27 3122 vpermi2d m%2, m12, m18 3123 paddd m%2, m0 3124 psrad m%2, DCT_SHIFT 3125 3126 ; 2nd row 3127 vpbroadcastq m1, [r5 + %3] 3128 3129 pmaddwd m19, m11, m1 3130 vpsrldq m12, m19, 4 3131 vpaddd m12, m19 3132 3133 pmaddwd m19, m13, m1 3134 vpsrldq m18, m19, 4 3135 vpaddd m18, m19 3136 3137 mova m%4, m27 3138 vpermi2d m%4, m12, m18 3139 paddd m%4, m0 3140 psrad m%4, DCT_SHIFT 3141 3142 packssdw m%2, m%4 3143 vpermw m%4, m25, m%2 3144%endmacro 3145 3146%macro DCT16_PASS2_AVX512 10 3147 vpmaddwd m5, m%2, m%1 3148 vpsrldq m6, m5, 8 3149 vpaddd m5, m6 3150 vpsrldq m6, m5, 4 3151 vpaddd m5, m6 3152 3153 vpmaddwd m6, m%3, m%1 3154 vpsrldq m7, m6, 8 3155 vpaddd m6, m7 3156 vpsrldq m7, m6, 4 3157 vpaddd m6, m7 3158 vpunpckldq m7, m5, m6 3159 3160 vpmaddwd m5, m%4, m%1 3161 vpsrldq m6, m5, 8 3162 vpaddd m5, m6 3163 vpsrldq m6, m5, 4 3164 vpaddd m5, m6 3165 3166 vpmaddwd m6, m%5, m%1 3167 vpsrldq m8, m6, 8 3168 vpaddd m6, m8 3169 vpsrldq m8, m6, 4 3170 vpaddd m6, m8 3171 vpunpckldq m8, m5, m6 3172 3173 vpunpcklqdq m5, m7, m8 3174 vpermd m5, m2, m5 3175 vpsrldq m6, m5, 4 3176 vpaddd m5, m6 3177 3178 vpmaddwd m6, m%6, m%1 3179 vpsrldq m7, m6, 8 3180 vpaddd m6, m7 3181 vpsrldq m7, m6, 4 3182 vpaddd m6, m7 3183 3184 vpmaddwd m7, m%7, m%1 3185 vpsrldq m8, m7, 8 3186 vpaddd m7, m8 3187 vpsrldq m8, m7, 4 3188 vpaddd m7, m8 3189 vpunpckldq m8, m6, m7 3190 3191 vpmaddwd m6, m%8, m%1 3192 vpsrldq m7, m6, 8 3193 vpaddd m6, m7 3194 vpsrldq m7, m6, 4 3195 vpaddd m6, m7 3196 3197 vpmaddwd m7, m%9, m%1 3198 vpsrldq m4, m7, 8 3199 vpaddd m7, m4 3200 vpsrldq m4, m7, 4 3201 vpaddd m7, m4 3202 vpunpckldq m4, m6, m7 3203 3204 vpunpcklqdq m6, m8, m4 3205 vpermd m6, m2, m6 3206 vpsrldq m7, m6, 4 3207 vpaddd m6, m7 3208 3209 paddd m5, m0 3210 psrad m5, DCT_SHIFT2 3211 paddd m6, m0 3212 psrad m6, DCT_SHIFT2 3213 3214 packssdw m5, m6 3215 vpermw m%10, m3, m5 3216%endmacro 3217 3218INIT_ZMM avx512 3219cglobal dct16, 3, 6, 29 3220 3221%if BIT_DEPTH == 12 3222 %define DCT_SHIFT 7 3223 vbroadcasti32x4 m0, [pd_64] 3224%elif BIT_DEPTH == 10 3225 %define DCT_SHIFT 5 3226 vbroadcasti32x4 m0, [pd_16] 3227%elif BIT_DEPTH == 8 3228 %define DCT_SHIFT 3 3229 vbroadcasti32x4 m0, [pd_4] 3230%else 3231 %error Unsupported BIT_DEPTH! 3232%endif 3233%define DCT_SHIFT2 10 3234 3235 add r2d, r2d 3236 lea r3, [r2 * 3] 3237 lea r4, [r0 + r2 * 4] 3238 lea r5, [tab_dct16_1 + 8 * 16] 3239 3240 ;Load reuseable table once to save memory movments 3241 mova m25, [dct16_shuf5_AVX512] 3242 mova m26, [dct16_shuf2_AVX512] 3243 mova m27, [dct16_shuf7_AVX512] 3244 vbroadcasti32x8 m28, [dct16_shuf6_AVX512] 3245 3246 DCT16_AVX512_PASS_1_LOOP 3247 DCT16_avx512_PASS_1_O -7 * 16, -5 * 16, 15, 14 ;row 1, 3 3248 DCT16_avx512_PASS_1_O -3 * 16, -1 * 16, 16, 15 ;row 5, 7 3249 DCT16_avx512_PASS_1_O 1 * 16, 3 * 16, 17, 16 ;row 9, 11 3250 DCT16_avx512_PASS_1_O 5 * 16, 7 * 16, 18, 17 ;row 13, 15 3251 3252 vbroadcasti32x8 m1, [dct16_shuf2] 3253 pshufb m4, m1 3254 pshufb m5, m1 3255 pshufb m7, m1 3256 pshufb m9, m1 3257 3258 vpsrldq m3, m4, 2 3259 vpsubw m11, m4, m3 3260 vpsrldq m6, m5, 2 3261 vpsubw m12, m5, m6 3262 vpsrldq m8, m7, 2 3263 vpsubw m13, m7, m8 3264 vpsrldq m10, m9, 2 3265 vpsubw m18, m9, m10 3266 3267 vpermw m11, m28, m11 3268 vpermw m12, m28, m12 3269 vinserti64x4 m11, m11, ym12, 1 3270 3271 vpermw m13, m28, m13 3272 vpermw m18, m28, m18 3273 vinserti64x4 m13, m13, ym18, 1 3274 3275 DCT16_avx512_PASS_1_E -6 * 16, 21, -2 * 16, 20 ; row 2, 6 3276 DCT16_avx512_PASS_1_E 2 * 16, 22, 6 * 16, 21 ; row 10, 14 3277 3278 vpaddw m11, m4, m3 3279 vpaddw m12, m5, m6 3280 vpaddw m13, m7, m8 3281 vpaddw m18, m9, m10 3282 3283 vpermw m11, m28, m11 3284 vpermw m12, m28, m12 3285 vinserti64x4 m11, m11, ym12, 1 3286 3287 vpermw m13, m28, m13 3288 vpermw m18, m28, m18 3289 vinserti64x4 m13, m13, ym18, 1 3290 3291 DCT16_avx512_PASS_1_E -8 * 16, 23, 0 * 16, 22 ; row 0, 8 3292 DCT16_avx512_PASS_1_E -4 * 16, 24, 4 * 16, 23 ; row 4, 12 3293 3294 ;PASS2 3295 vbroadcasti128 m0, [pd_512] 3296 3297 lea r5, [tab_dct16] 3298 mova m2, [dct16_shuf9_AVX512] 3299 vbroadcasti32x8 m3, [dct16_shuf8_AVX512] 3300 3301 vbroadcasti32x8 m1, [r5 + 0 * 32] 3302 DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9 3303 vbroadcasti32x8 m1, [r5 + 1 * 32] 3304 DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10 3305 vinserti64x4 m9, m9, ym10, 1 3306 movu [r1 + 0 * 64], m9 3307 3308 vbroadcasti32x8 m1, [r5 + 2 * 32] 3309 DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9 3310 vbroadcasti32x8 m1, [r5 + 3 * 32] 3311 DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10 3312 vinserti64x4 m9, m9, ym10, 1 3313 movu [r1 + 1 * 64], m9 3314 3315 vbroadcasti32x8 m1, [r5 + 4 * 32] 3316 DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9 3317 vbroadcasti32x8 m1, [r5 + 5 * 32] 3318 DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10 3319 vinserti64x4 m9, m9, ym10, 1 3320 movu [r1 + 2 * 64], m9 3321 3322 vbroadcasti32x8 m1, [r5 + 6 * 32] 3323 DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9 3324 vbroadcasti32x8 m1, [r5 + 7 * 32] 3325 DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10 3326 vinserti64x4 m9, m9, ym10, 1 3327 movu [r1 + 3 * 64], m9 3328 3329 vbroadcasti32x8 m1, [r5 + 8 * 32] 3330 DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9 3331 vbroadcasti32x8 m1, [r5 + 9 * 32] 3332 DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10 3333 vinserti64x4 m9, m9, ym10, 1 3334 movu [r1 + 4 * 64], m9 3335 3336 vbroadcasti32x8 m1, [r5 + 10 * 32] 3337 DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9 3338 vbroadcasti32x8 m1, [r5 + 11 * 32] 3339 DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10 3340 vinserti64x4 m9, m9, ym10, 1 3341 movu [r1 + 5 * 64], m9 3342 3343 vbroadcasti32x8 m1, [r5 + 12 * 32] 3344 DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9 3345 vbroadcasti32x8 m1, [r5 + 13 * 32] 3346 DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10 3347 vinserti64x4 m9, m9, ym10, 1 3348 movu [r1 + 6 * 64], m9 3349 3350 vbroadcasti32x8 m1, [r5 + 14 * 32] 3351 DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9 3352 vbroadcasti32x8 m1, [r5 + 15 * 32] 3353 DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10 3354 vinserti64x4 m9, m9, ym10, 1 3355 movu [r1 + 7 * 64], m9 3356 RET 3357 3358%macro DCT32_PASS_1 4 3359 vbroadcasti128 m8, [r7 + %1] 3360 pmaddwd m11, m%3, m8 3361 pmaddwd m12, m%4, m8 3362 phaddd m11, m12 3363 3364 vbroadcasti128 m8, [r7 + %1 + 32] 3365 vbroadcasti128 m10, [r7 + %1 + 48] 3366 pmaddwd m12, m5, m8 3367 pmaddwd m13, m6, m10 3368 phaddd m12, m13 3369 3370 pmaddwd m13, m4, m8 3371 pmaddwd m14, m7, m10 3372 phaddd m13, m14 3373 3374 phaddd m12, m13 3375 3376 phaddd m11, m12 3377 paddd m11, m9 3378 psrad m11, DCT_SHIFT 3379 3380 vpermq m11, m11, 0xD8 3381 packssdw m11, m11 3382 movq [r5 + %2], xm11 3383 vextracti128 xm10, m11, 1 3384 movq [r5 + %2 + 64], xm10 3385%endmacro 3386 3387%macro DCT32_PASS_2 1 3388 mova m8, [r7 + %1] 3389 mova m10, [r8 + %1] 3390 pmaddwd m11, m0, m8 3391 pmaddwd m12, m1, m10 3392 paddd m11, m12 3393 3394 pmaddwd m12, m2, m8 3395 pmaddwd m13, m3, m10 3396 paddd m12, m13 3397 3398 phaddd m11, m12 3399 3400 pmaddwd m12, m4, m8 3401 pmaddwd m13, m5, m10 3402 paddd m12, m13 3403 3404 pmaddwd m13, m6, m8 3405 pmaddwd m14, m7, m10 3406 paddd m13, m14 3407 3408 phaddd m12, m13 3409 3410 phaddd m11, m12 3411 vextracti128 xm10, m11, 1 3412 paddd xm11, xm10 3413 3414 paddd xm11, xm9 3415 psrad xm11, DCT_SHIFT2 3416 packssdw xm11, xm11 3417 3418%endmacro 3419 3420INIT_YMM avx2 3421cglobal dct32, 3, 9, 16, 0-64*mmsize 3422%if BIT_DEPTH == 12 3423 %define DCT_SHIFT 8 3424 vpbroadcastq m9, [pd_128] 3425%elif BIT_DEPTH == 10 3426 %define DCT_SHIFT 6 3427 vpbroadcastq m9, [pd_32] 3428%elif BIT_DEPTH == 8 3429 %define DCT_SHIFT 4 3430 vpbroadcastq m9, [pd_8] 3431%else 3432 %error Unsupported BIT_DEPTH! 3433%endif 3434%define DCT_SHIFT2 11 3435 3436 add r2d, r2d 3437 3438 lea r7, [tab_dct32_1] 3439 lea r8, [tab_dct32_2] 3440 lea r3, [r2 * 3] 3441 mov r5, rsp 3442 mov r4d, 8 3443 mova m15, [dct16_shuf1] 3444 3445.pass1: 3446 movu m2, [r0] 3447 movu m1, [r0 + 32] 3448 pshufb m1, m15 3449 vpermq m1, m1, 0x4E 3450 psubw m7, m2, m1 3451 paddw m2, m1 3452 3453 movu m1, [r0 + r2 * 2] 3454 movu m0, [r0 + r2 * 2 + 32] 3455 pshufb m0, m15 3456 vpermq m0, m0, 0x4E 3457 psubw m8, m1, m0 3458 paddw m1, m0 3459 vperm2i128 m0, m2, m1, 0x20 ; [row0lo row2lo] for E 3460 vperm2i128 m3, m2, m1, 0x31 ; [row0hi row2hi] for E 3461 pshufb m3, m15 3462 psubw m1, m0, m3 3463 paddw m0, m3 3464 3465 vperm2i128 m5, m7, m8, 0x20 ; [row0lo row2lo] for O 3466 vperm2i128 m6, m7, m8, 0x31 ; [row0hi row2hi] for O 3467 3468 3469 movu m4, [r0 + r2] 3470 movu m2, [r0 + r2 + 32] 3471 pshufb m2, m15 3472 vpermq m2, m2, 0x4E 3473 psubw m10, m4, m2 3474 paddw m4, m2 3475 3476 movu m3, [r0 + r3] 3477 movu m2, [r0 + r3 + 32] 3478 pshufb m2, m15 3479 vpermq m2, m2, 0x4E 3480 psubw m11, m3, m2 3481 paddw m3, m2 3482 vperm2i128 m2, m4, m3, 0x20 ; [row1lo row3lo] for E 3483 vperm2i128 m8, m4, m3, 0x31 ; [row1hi row3hi] for E 3484 pshufb m8, m15 3485 psubw m3, m2, m8 3486 paddw m2, m8 3487 3488 vperm2i128 m4, m10, m11, 0x20 ; [row1lo row3lo] for O 3489 vperm2i128 m7, m10, m11, 0x31 ; [row1hi row3hi] for O 3490 3491 3492 DCT32_PASS_1 0 * 32, 0 * 64, 0, 2 3493 DCT32_PASS_1 2 * 32, 2 * 64, 1, 3 3494 DCT32_PASS_1 4 * 32, 4 * 64, 0, 2 3495 DCT32_PASS_1 6 * 32, 6 * 64, 1, 3 3496 DCT32_PASS_1 8 * 32, 8 * 64, 0, 2 3497 DCT32_PASS_1 10 * 32, 10 * 64, 1, 3 3498 DCT32_PASS_1 12 * 32, 12 * 64, 0, 2 3499 DCT32_PASS_1 14 * 32, 14 * 64, 1, 3 3500 DCT32_PASS_1 16 * 32, 16 * 64, 0, 2 3501 DCT32_PASS_1 18 * 32, 18 * 64, 1, 3 3502 DCT32_PASS_1 20 * 32, 20 * 64, 0, 2 3503 DCT32_PASS_1 22 * 32, 22 * 64, 1, 3 3504 DCT32_PASS_1 24 * 32, 24 * 64, 0, 2 3505 DCT32_PASS_1 26 * 32, 26 * 64, 1, 3 3506 DCT32_PASS_1 28 * 32, 28 * 64, 0, 2 3507 DCT32_PASS_1 30 * 32, 30 * 64, 1, 3 3508 3509 add r5, 8 3510 lea r0, [r0 + r2 * 4] 3511 3512 dec r4d 3513 jnz .pass1 3514 3515 mov r2d, 64 3516 lea r3, [r2 * 3] 3517 mov r5, rsp 3518 mov r4d, 8 3519 vpbroadcastq m9, [pd_1024] 3520 3521.pass2: 3522 mova m0, [r5 + 0 * 64] 3523 mova m1, [r5 + 0 * 64 + 32] 3524 3525 mova m2, [r5 + 1 * 64] 3526 mova m3, [r5 + 1 * 64 + 32] 3527 3528 mova m4, [r5 + 2 * 64] 3529 mova m5, [r5 + 2 * 64 + 32] 3530 3531 mova m6, [r5 + 3 * 64] 3532 mova m7, [r5 + 3 * 64 + 32] 3533 3534 DCT32_PASS_2 0 * 32 3535 movq [r1], xm11 3536 DCT32_PASS_2 1 * 32 3537 movq [r1 + r2], xm11 3538 DCT32_PASS_2 2 * 32 3539 movq [r1 + r2 * 2], xm11 3540 DCT32_PASS_2 3 * 32 3541 movq [r1 + r3], xm11 3542 3543 lea r6, [r1 + r2 * 4] 3544 DCT32_PASS_2 4 * 32 3545 movq [r6], xm11 3546 DCT32_PASS_2 5 * 32 3547 movq [r6 + r2], xm11 3548 DCT32_PASS_2 6 * 32 3549 movq [r6 + r2 * 2], xm11 3550 DCT32_PASS_2 7 * 32 3551 movq [r6 + r3], xm11 3552 3553 lea r6, [r6 + r2 * 4] 3554 DCT32_PASS_2 8 * 32 3555 movq [r6], xm11 3556 DCT32_PASS_2 9 * 32 3557 movq [r6 + r2], xm11 3558 DCT32_PASS_2 10 * 32 3559 movq [r6 + r2 * 2], xm11 3560 DCT32_PASS_2 11 * 32 3561 movq [r6 + r3], xm11 3562 3563 lea r6, [r6 + r2 * 4] 3564 DCT32_PASS_2 12 * 32 3565 movq [r6], xm11 3566 DCT32_PASS_2 13 * 32 3567 movq [r6 + r2], xm11 3568 DCT32_PASS_2 14 * 32 3569 movq [r6 + r2 * 2], xm11 3570 DCT32_PASS_2 15 * 32 3571 movq [r6 + r3], xm11 3572 3573 lea r6, [r6 + r2 * 4] 3574 DCT32_PASS_2 16 * 32 3575 movq [r6], xm11 3576 DCT32_PASS_2 17 * 32 3577 movq [r6 + r2], xm11 3578 DCT32_PASS_2 18 * 32 3579 movq [r6 + r2 * 2], xm11 3580 DCT32_PASS_2 19 * 32 3581 movq [r6 + r3], xm11 3582 3583 lea r6, [r6 + r2 * 4] 3584 DCT32_PASS_2 20 * 32 3585 movq [r6], xm11 3586 DCT32_PASS_2 21 * 32 3587 movq [r6 + r2], xm11 3588 DCT32_PASS_2 22 * 32 3589 movq [r6 + r2 * 2], xm11 3590 DCT32_PASS_2 23 * 32 3591 movq [r6 + r3], xm11 3592 3593 lea r6, [r6 + r2 * 4] 3594 DCT32_PASS_2 24 * 32 3595 movq [r6], xm11 3596 DCT32_PASS_2 25 * 32 3597 movq [r6 + r2], xm11 3598 DCT32_PASS_2 26 * 32 3599 movq [r6 + r2 * 2], xm11 3600 DCT32_PASS_2 27 * 32 3601 movq [r6 + r3], xm11 3602 3603 lea r6, [r6 + r2 * 4] 3604 DCT32_PASS_2 28 * 32 3605 movq [r6], xm11 3606 DCT32_PASS_2 29 * 32 3607 movq [r6 + r2], xm11 3608 DCT32_PASS_2 30 * 32 3609 movq [r6 + r2 * 2], xm11 3610 DCT32_PASS_2 31 * 32 3611 movq [r6 + r3], xm11 3612 3613 add r5, 256 3614 add r1, 8 3615 3616 dec r4d 3617 jnz .pass2 3618 RET 3619 3620 3621%macro DCT32_avx512_LOOP 4 3622 movu m1, [r0] 3623 movu m2, [r0 + r2] 3624 3625 vinserti64x4 m3, m1, ym2, 1 ; row 0l, 1l 3626 vextracti64x4 ym4, m1, 1 3627 vinserti64x4 m2, m2, ym4, 0 ; row 0h, 1h 3628 vpermw m2, m31, m2 3629 3630 psubw m%1, m3, m2 ; O 3631 paddw m3, m2 ; E 3632 mova [r9 + %3 * 64], m3 3633 3634 movu m1, [r0 + 2 * r2] 3635 movu m5, [r0 + r3] 3636 3637 vinserti64x4 m6, m1, ym5, 1 ; row 2l, 3l 3638 vextracti64x4 ym7, m1, 1 3639 vinserti64x4 m5, m5, ym7, 0 ; row 2h, 3h 3640 vpermw m5, m31, m5 3641 3642 psubw m%2, m6, m5 ; O 3643 paddw m6, m5 ; E 3644 mova [r9 + %4 * 64], m6 3645%endmacro 3646 3647%macro DCT32_avx512_PASS_1_O 3 3648 pmaddwd m10, m%2, m9 3649 vpsrldq m11, m10, 8 3650 vpaddd m10, m11 3651 3652 pmaddwd m11, m%3, m9 3653 vpsrldq m12, m11, 8 3654 vpaddd m11, m12 3655 3656 mova m12, m8 3657 vpermi2d m12, m10, m11 3658 vpsrldq m10, m12, 8 3659 vpaddd m12, m10 3660 vpsrldq m10, m12, 4 3661 vpaddd m12, m10 3662 3663 vpaddd m12, m0 3664 vpsrad m12, DCT_SHIFT 3665 vpackssdw m12, m12 3666 vpermw m12, m30, m12 3667 movq [r5 + %1], xm12 3668%endmacro 3669 3670%macro DCT32_avx512_PASS_1_ROW_O 0 3671 vbroadcasti32x8 m9, [r7 + 1 * 32] 3672 3673 DCT32_avx512_LOOP 13, 14, 0, 1 3674 DCT32_avx512_PASS_1_O 1 * 64 + 0 * 8, 13, 14 3675 3676 lea r0, [r0 + 4 * r2] 3677 DCT32_avx512_LOOP 15, 16, 2, 3 3678 DCT32_avx512_PASS_1_O 1 * 64 + 1 * 8, 15, 16 3679 3680 lea r0, [r0 + 4 * r2] 3681 DCT32_avx512_LOOP 17, 18, 4, 5 3682 DCT32_avx512_PASS_1_O 1 * 64 + 2 * 8, 17, 18 3683 3684 lea r0, [r0 + 4 * r2] 3685 DCT32_avx512_LOOP 19, 20, 6, 7 3686 DCT32_avx512_PASS_1_O 1 * 64 + 3 * 8, 19, 20 3687 3688 lea r0, [r0 + 4 * r2] 3689 DCT32_avx512_LOOP 21, 22, 8, 9 3690 DCT32_avx512_PASS_1_O 1 * 64 + 4 * 8, 21, 22 3691 3692 lea r0, [r0 + 4 * r2] 3693 DCT32_avx512_LOOP 23, 24, 10, 11 3694 DCT32_avx512_PASS_1_O 1 * 64 + 5 * 8, 23, 24 3695 3696 lea r0, [r0 + 4 * r2] 3697 DCT32_avx512_LOOP 25, 26, 12, 13 3698 DCT32_avx512_PASS_1_O 1 * 64 + 6 * 8, 25, 26 3699 3700 lea r0, [r0 + 4 * r2] 3701 DCT32_avx512_LOOP 27, 28, 14, 15 3702 DCT32_avx512_PASS_1_O 1 * 64 + 7 * 8, 27, 28 3703%endmacro 3704 3705%macro DCT32_avx512_PASS_1_ROW_O_1_7 1 3706 vbroadcasti32x8 m9, [r7 + %1 * 32] 3707 3708 DCT32_avx512_PASS_1_O %1 * 64 + 0 * 8, 13, 14 3709 DCT32_avx512_PASS_1_O %1 * 64 + 1 * 8, 15, 16 3710 DCT32_avx512_PASS_1_O %1 * 64 + 2 * 8, 17, 18 3711 DCT32_avx512_PASS_1_O %1 * 64 + 3 * 8, 19, 20 3712 DCT32_avx512_PASS_1_O %1 * 64 + 4 * 8, 21, 22 3713 DCT32_avx512_PASS_1_O %1 * 64 + 5 * 8, 23, 24 3714 DCT32_avx512_PASS_1_O %1 * 64 + 6 * 8, 25, 26 3715 DCT32_avx512_PASS_1_O %1 * 64 + 7 * 8, 27, 28 3716%endmacro 3717 3718%macro DCT32_avx512_LOOP_EO 4 3719 mova m4, [rsp + 32 * mmsize + %3 * 64] 3720 vpermw m4, m8, m4 3721 vextracti64x4 ym5, m4, 1 3722 3723 mova m6, [rsp + 32 * mmsize + %4 * 64] 3724 vpermw m6, m8, m6 3725 vextracti64x4 ym7, m6, 1 3726 3727 vinserti64x4 m4, m4, ym6, 1 3728 vinserti64x4 m5, m5, ym7, 1 3729 3730 psubw m%1, m4, m5 ; EO 3731 paddw m%2, m4, m5 ; EE 3732%endmacro 3733 3734%macro DCT32_avx512_PASS_1_ROW_EO 2 3735 pmaddwd m29, m%2, m12 3736 vpsrldq m30, m29, 8 3737 vpaddd m30, m29 3738 vpsrldq m29, m30, 4 3739 vpaddd m29, m30 3740 3741 vpaddd m29, m0 3742 vpsrad m29, DCT_SHIFT 3743 vpackssdw m29, m29 3744 3745 vpermw m29, m11, m29 3746 movq [r5 + %1], xm29 3747%endmacro 3748 3749%macro DCT32_avx512_PASS_1_ROW_EO_0 0 3750 3751 mova m8, [dct32_shuf2_AVX512] 3752 vbroadcasti32x4 m12, [r7 + 2 * 32] 3753 3754 DCT32_avx512_LOOP_EO 13, 14, 0, 1 3755 DCT32_avx512_PASS_1_ROW_EO 2 * 64 + 0 * 8, 13 3756 3757 lea r9, [r9 + 4 * r2] 3758 DCT32_avx512_LOOP_EO 15, 16, 2, 3 3759 DCT32_avx512_PASS_1_ROW_EO 2 * 64 + 1 * 8, 15 3760 3761 lea r9, [r9 + 4 * r2] 3762 DCT32_avx512_LOOP_EO 17, 18, 4, 5 3763 DCT32_avx512_PASS_1_ROW_EO 2 * 64 + 2 * 8, 17 3764 3765 lea r9, [r9 + 4 * r2] 3766 DCT32_avx512_LOOP_EO 19, 20, 6, 7 3767 DCT32_avx512_PASS_1_ROW_EO 2 * 64 + 3 * 8, 19 3768 3769 lea r9, [r9 + 4 * r2] 3770 DCT32_avx512_LOOP_EO 21, 22, 8, 9 3771 DCT32_avx512_PASS_1_ROW_EO 2 * 64 + 4 * 8, 21 3772 3773 lea r9, [r9 + 4 * r2] 3774 DCT32_avx512_LOOP_EO 23, 24, 10, 11 3775 DCT32_avx512_PASS_1_ROW_EO 2 * 64 + 5 * 8, 23 3776 3777 lea r9, [r9 + 4 * r2] 3778 DCT32_avx512_LOOP_EO 25, 26, 12, 13 3779 DCT32_avx512_PASS_1_ROW_EO 2 * 64 + 6 * 8, 25 3780 3781 lea r9, [r9 + 4 * r2] 3782 DCT32_avx512_LOOP_EO 27, 28, 14, 15 3783 DCT32_avx512_PASS_1_ROW_EO 2 * 64 + 7 * 8, 27 3784 3785%endmacro 3786 3787%macro DCT32_avx512_PASS_1_ROW_EO_1_7 1 3788 3789 vbroadcasti32x4 m12, [r7 + %1 * 32] 3790 3791 DCT32_avx512_PASS_1_ROW_EO %1 * 64 + 0 * 8, 13 3792 DCT32_avx512_PASS_1_ROW_EO %1 * 64 + 1 * 8, 15 3793 DCT32_avx512_PASS_1_ROW_EO %1 * 64 + 2 * 8, 17 3794 DCT32_avx512_PASS_1_ROW_EO %1 * 64 + 3 * 8, 19 3795 DCT32_avx512_PASS_1_ROW_EO %1 * 64 + 4 * 8, 21 3796 DCT32_avx512_PASS_1_ROW_EO %1 * 64 + 5 * 8, 23 3797 DCT32_avx512_PASS_1_ROW_EO %1 * 64 + 6 * 8, 25 3798 DCT32_avx512_PASS_1_ROW_EO %1 * 64 + 7 * 8, 27 3799 3800%endmacro 3801 3802%macro DCT32_avx512_LOOP_EEO 0 3803 vpunpcklqdq m2, m14, m16 3804 vpunpckhqdq m14, m16 3805 vpshufb m14, m31 3806 3807 vpaddw m16, m2, m14 ; EEE 3808 vpsubw m2, m14 ; EE0 3809 3810 vpunpcklqdq m3, m18, m20 3811 vpunpckhqdq m18, m20 3812 vpshufb m18, m31 3813 3814 vpaddw m20, m3, m18 ; EEE 3815 vpsubw m3, m18 ; EE0 3816 3817 vpunpcklqdq m4, m22, m24 3818 vpunpckhqdq m22, m24 3819 vpshufb m22, m31 3820 3821 vpaddw m24, m4, m22 ; EEE 3822 vpsubw m4, m22 ; EE0 3823 3824 vpunpcklqdq m5, m26, m28 3825 vpunpckhqdq m26, m28 3826 vpshufb m26, m31 3827 3828 vpaddw m28, m5, m26 ; EEE 3829 vpsubw m5, m26 ; EE0 3830%endmacro 3831 3832%macro DCT32_avx512_PASS_1_ROW_EEO 2 3833 pmaddwd m30, m%2, m1 3834 vpsrldq m29, m30, 4 3835 vpaddd m29, m30 3836 3837 vpaddd m29, m0 3838 vpsrad m29, DCT_SHIFT 3839 vpackssdw m29, m29 3840 3841 vpermw m29, m27, m29 3842 movu [r5 + %1], xm29 3843%endmacro 3844 3845%macro DCT32_avx512_PASS_1_ROW_EEO_1_4 1 3846 3847vpbroadcastq m1, [r7 + %1 * 32] 3848DCT32_avx512_PASS_1_ROW_EEO %1 * 64 + 0 * 16, 2 3849DCT32_avx512_PASS_1_ROW_EEO %1 * 64 + 1 * 16, 3 3850DCT32_avx512_PASS_1_ROW_EEO %1 * 64 + 2 * 16, 4 3851DCT32_avx512_PASS_1_ROW_EEO %1 * 64 + 3 * 16, 5 3852 3853%endmacro 3854 3855%macro DCT32_avx512_PASS_1_ROW_EEEO_1_4 1 3856 3857vpbroadcastq m1, [r7 + %1 * 32] 3858DCT32_avx512_PASS_1_ROW_EEO %1 * 64 + 0 * 16, 16 3859DCT32_avx512_PASS_1_ROW_EEO %1 * 64 + 1 * 16, 20 3860DCT32_avx512_PASS_1_ROW_EEO %1 * 64 + 2 * 16, 24 3861DCT32_avx512_PASS_1_ROW_EEO %1 * 64 + 3 * 16, 28 3862 3863%endmacro 3864 3865%macro DCT32_avx512_PASS2_OPT 5 3866 pmaddwd m9, m1, m%1 3867 vpsrldq m10, m9, 8 3868 vpaddd m9, m10 3869 3870 pmaddwd m10, m1, m%2 3871 vpsrldq m11, m10, 8 3872 vpaddd m10, m11 3873 3874 pmaddwd m11, m1, m%3 3875 vpsrldq m12, m11, 8 3876 vpaddd m11, m12 3877 3878 pmaddwd m12, m1, m%4 3879 vpsrldq m13, m12, 8 3880 vpaddd m12, m13 3881 3882 vpsrldq m13, m9, 4 3883 vpaddd m9, m13 3884 vpsrldq m13, m10, 4 3885 vpaddd m10, m13 3886 vpsrldq m13, m11, 4 3887 vpaddd m11, m13 3888 vpsrldq m13, m12, 4 3889 vpaddd m12, m13 3890 3891 vpermd m9, m31, m9 3892 vpermd m10, m31, m10 3893 vpermd m11, m31, m11 3894 vpermd m12, m31, m12 3895 3896 vpandd m9, m27 3897 vpandd m10, m30 3898 vpandd m11, m29 3899 vpandd m12, m28 3900 3901 vpaddd m9, m10 3902 vpaddd m11, m12 3903 vpaddd m9, m11 3904 3905 vpsrldq m10, m9, 8 3906 vpaddd m9, m10 3907 vpsrldq m10, m9, 4 3908 vpaddd m9, m10 3909 3910 vpermd m9, m31, m9 3911 vpaddd m9, m0 3912 vpsrad m9, DCT_SHIFT2 3913 vpackssdw m9, m9 3914 movq [r1 + %5], xm9 3915 3916%endmacro 3917 3918%macro DCT32_avx512_PASS2 5 3919 3920 mova m9, [r5 + %1] 3921 mova m10, [r5 + %2] 3922 mova m11, [r5 + %3] 3923 mova m12, [r5 + %4] 3924 3925 pmaddwd m9, m1, m9 3926 vpsrldq m13, m9, 8 3927 vpaddd m9, m13 3928 3929 pmaddwd m10, m1, m10 3930 vpsrldq m13, m10, 8 3931 vpaddd m10, m13 3932 3933 pmaddwd m11, m1, m11 3934 vpsrldq m13, m11, 8 3935 vpaddd m11, m13 3936 3937 pmaddwd m12, m1, m12 3938 vpsrldq m13, m12, 8 3939 vpaddd m12, m13 3940 3941 vpsrldq m13, m9, 4 3942 vpaddd m9, m13 3943 vpsrldq m13, m10, 4 3944 vpaddd m10, m13 3945 vpsrldq m13, m11, 4 3946 vpaddd m11, m13 3947 vpsrldq m13, m12, 4 3948 vpaddd m12, m13 3949 3950 vpermd m9, m31, m9 3951 vpermd m10, m31, m10 3952 vpermd m11, m31, m11 3953 vpermd m12, m31, m12 3954 3955 vpandd m9, m27 3956 vpandd m10, m30 3957 vpandd m11, m29 3958 vpandd m12, m28 3959 3960 vpaddd m9, m10 3961 vpaddd m11, m12 3962 vpaddd m9, m11 3963 3964 vpsrldq m10, m9, 8 3965 vpaddd m9, m10 3966 vpsrldq m10, m9, 4 3967 vpaddd m9, m10 3968 3969 vpermd m9, m31, m9 3970 vpaddd m9, m0 3971 vpsrad m9, DCT_SHIFT2 3972 vpackssdw m9, m9 3973 movq [r1 + %5], xm9 3974 3975%endmacro 3976 3977%macro DCT32_avx512_PASS2_1_ROW 1 3978 3979mova m1, [r8 + %1 * 64] 3980 3981DCT32_avx512_PASS2_OPT 2, 3, 4, 14, %1 * 64 + 0 * 8 3982DCT32_avx512_PASS2_OPT 15, 16, 17, 18, %1 * 64 + 1 * 8 3983DCT32_avx512_PASS2_OPT 19, 20, 21, 22, %1 * 64 + 2 * 8 3984DCT32_avx512_PASS2_OPT 23, 24, 25, 26, %1 * 64 + 3 * 8 3985DCT32_avx512_PASS2_OPT 5, 6, 7, 8, %1 * 64 + 4 * 8 3986 3987DCT32_avx512_PASS2 20 * 64, 21 * 64, 22 * 64, 23 * 64, %1 * 64 + 5 * 8 3988DCT32_avx512_PASS2 24 * 64, 25 * 64, 26 * 64, 27 * 64, %1 * 64 + 6 * 8 3989DCT32_avx512_PASS2 28 * 64, 29 * 64, 30 * 64, 31 * 64, %1 * 64 + 7 * 8 3990 3991%endmacro 3992 3993INIT_ZMM avx512 3994cglobal dct32, 3, 10, 32, 0-(32*mmsize + 16*mmsize) 3995 3996%if BIT_DEPTH == 12 3997 %define DCT_SHIFT 8 3998 vpbroadcastq m0, [pd_128] 3999%elif BIT_DEPTH == 10 4000 %define DCT_SHIFT 6 4001 vpbroadcastq m0, [pd_32] 4002%elif BIT_DEPTH == 8 4003 %define DCT_SHIFT 4 4004 vpbroadcastq m0, [pd_8] 4005%else 4006 %error Unsupported BIT_DEPTH! 4007%endif 4008%define DCT_SHIFT2 11 4009 4010 add r2d, r2d 4011 lea r7, [tab_dct32_1] 4012 lea r8, [tab_dct32] 4013 lea r3, [r2 * 3] 4014 mov r5, rsp 4015 mov r9, 2048 ; 32 * mmsize 4016 add r9, rsp 4017 4018 mova m31, [dct32_shuf1_AVX512] 4019 4020 ; PASSS 1 4021 4022 vbroadcasti32x8 m30, [dct8_shuf9_AVX512] 4023 mova m8, [dct32_shuf_AVX512] 4024 4025 DCT32_avx512_PASS_1_ROW_O 4026 DCT32_avx512_PASS_1_ROW_O_1_7 3 4027 DCT32_avx512_PASS_1_ROW_O_1_7 5 4028 DCT32_avx512_PASS_1_ROW_O_1_7 7 4029 DCT32_avx512_PASS_1_ROW_O_1_7 9 4030 DCT32_avx512_PASS_1_ROW_O_1_7 11 4031 DCT32_avx512_PASS_1_ROW_O_1_7 13 4032 DCT32_avx512_PASS_1_ROW_O_1_7 15 4033 DCT32_avx512_PASS_1_ROW_O_1_7 17 4034 DCT32_avx512_PASS_1_ROW_O_1_7 19 4035 DCT32_avx512_PASS_1_ROW_O_1_7 20 4036 DCT32_avx512_PASS_1_ROW_O_1_7 21 4037 DCT32_avx512_PASS_1_ROW_O_1_7 23 4038 DCT32_avx512_PASS_1_ROW_O_1_7 25 4039 DCT32_avx512_PASS_1_ROW_O_1_7 27 4040 DCT32_avx512_PASS_1_ROW_O_1_7 29 4041 DCT32_avx512_PASS_1_ROW_O_1_7 31 4042 4043 vbroadcasti32x8 m11, [dct8_shuf9_AVX512] 4044 4045 DCT32_avx512_PASS_1_ROW_EO_0 4046 DCT32_avx512_PASS_1_ROW_EO_1_7 6 4047 DCT32_avx512_PASS_1_ROW_EO_1_7 10 4048 DCT32_avx512_PASS_1_ROW_EO_1_7 14 4049 DCT32_avx512_PASS_1_ROW_EO_1_7 18 4050 DCT32_avx512_PASS_1_ROW_EO_1_7 22 4051 DCT32_avx512_PASS_1_ROW_EO_1_7 26 4052 DCT32_avx512_PASS_1_ROW_EO_1_7 30 4053 4054 vbroadcasti32x4 m31, [dct8_shuf] 4055 vbroadcasti32x8 m27, [dct32_shuf3_AVX512] 4056 4057 DCT32_avx512_LOOP_EEO 4058 DCT32_avx512_PASS_1_ROW_EEO_1_4 4 4059 DCT32_avx512_PASS_1_ROW_EEO_1_4 12 4060 DCT32_avx512_PASS_1_ROW_EEO_1_4 20 4061 DCT32_avx512_PASS_1_ROW_EEO_1_4 28 4062 4063 DCT32_avx512_PASS_1_ROW_EEEO_1_4 0 4064 DCT32_avx512_PASS_1_ROW_EEEO_1_4 16 4065 DCT32_avx512_PASS_1_ROW_EEEO_1_4 8 4066 DCT32_avx512_PASS_1_ROW_EEEO_1_4 24 4067 4068 ; PASS 2 4069 4070 vpbroadcastq m0, [pd_1024] 4071 vbroadcasti32x8 m31, [dct32_shuf4_AVX512] 4072 movu m30, [dct32_shuf5_AVX512] 4073 movu m29, [dct32_shuf6_AVX512] 4074 movu m28, [dct32_shuf7_AVX512] 4075 movu m27, [dct32_shuf8_AVX512] 4076 4077 ;Load the source coefficents into free registers and reuse them for all rows 4078 4079 mova m2, [r5 + 0 * 64] 4080 mova m3, [r5 + 1 * 64] 4081 mova m4, [r5 + 2 * 64] 4082 mova m14, [r5 + 3 * 64] 4083 mova m15, [r5 + 4 * 64] 4084 mova m16, [r5 + 5 * 64] 4085 mova m17, [r5 + 6 * 64] 4086 mova m18, [r5 + 7 * 64] 4087 mova m19, [r5 + 8 * 64] 4088 mova m20, [r5 + 9 * 64] 4089 mova m21, [r5 + 10 * 64] 4090 mova m22, [r5 + 11 * 64] 4091 mova m23, [r5 + 12 * 64] 4092 mova m24, [r5 + 13 * 64] 4093 mova m25, [r5 + 14 * 64] 4094 mova m26, [r5 + 15 * 64] 4095 mova m5, [r5 + 16 * 64] 4096 mova m6, [r5 + 17 * 64] 4097 mova m7, [r5 + 18 * 64] 4098 mova m8, [r5 + 19 * 64] 4099 4100 DCT32_avx512_PASS2_1_ROW 0 4101 DCT32_avx512_PASS2_1_ROW 1 4102 DCT32_avx512_PASS2_1_ROW 2 4103 DCT32_avx512_PASS2_1_ROW 3 4104 DCT32_avx512_PASS2_1_ROW 4 4105 DCT32_avx512_PASS2_1_ROW 5 4106 DCT32_avx512_PASS2_1_ROW 6 4107 DCT32_avx512_PASS2_1_ROW 7 4108 DCT32_avx512_PASS2_1_ROW 8 4109 DCT32_avx512_PASS2_1_ROW 9 4110 DCT32_avx512_PASS2_1_ROW 10 4111 DCT32_avx512_PASS2_1_ROW 11 4112 DCT32_avx512_PASS2_1_ROW 12 4113 DCT32_avx512_PASS2_1_ROW 13 4114 DCT32_avx512_PASS2_1_ROW 14 4115 DCT32_avx512_PASS2_1_ROW 15 4116 DCT32_avx512_PASS2_1_ROW 16 4117 DCT32_avx512_PASS2_1_ROW 17 4118 DCT32_avx512_PASS2_1_ROW 18 4119 DCT32_avx512_PASS2_1_ROW 19 4120 DCT32_avx512_PASS2_1_ROW 20 4121 DCT32_avx512_PASS2_1_ROW 21 4122 DCT32_avx512_PASS2_1_ROW 22 4123 DCT32_avx512_PASS2_1_ROW 23 4124 DCT32_avx512_PASS2_1_ROW 24 4125 DCT32_avx512_PASS2_1_ROW 25 4126 DCT32_avx512_PASS2_1_ROW 26 4127 DCT32_avx512_PASS2_1_ROW 27 4128 DCT32_avx512_PASS2_1_ROW 28 4129 DCT32_avx512_PASS2_1_ROW 29 4130 DCT32_avx512_PASS2_1_ROW 30 4131 DCT32_avx512_PASS2_1_ROW 31 4132 4133 RET 4134 4135%macro IDCT8_PASS_1 1 4136 vpbroadcastd m7, [r5 + %1] 4137 vpbroadcastd m10, [r5 + %1 + 4] 4138 pmaddwd m5, m4, m7 4139 pmaddwd m6, m0, m10 4140 paddd m5, m6 4141 4142 vpbroadcastd m7, [r6 + %1] 4143 vpbroadcastd m10, [r6 + %1 + 4] 4144 pmaddwd m6, m1, m7 4145 pmaddwd m3, m2, m10 4146 paddd m6, m3 4147 4148 paddd m3, m5, m6 4149 paddd m3, m11 4150 psrad m3, IDCT_SHIFT1 4151 4152 psubd m5, m6 4153 paddd m5, m11 4154 psrad m5, IDCT_SHIFT1 4155 4156 vpbroadcastd m7, [r5 + %1 + 32] 4157 vpbroadcastd m10, [r5 + %1 + 36] 4158 pmaddwd m6, m4, m7 4159 pmaddwd m8, m0, m10 4160 paddd m6, m8 4161 4162 vpbroadcastd m7, [r6 + %1 + 32] 4163 vpbroadcastd m10, [r6 + %1 + 36] 4164 pmaddwd m8, m1, m7 4165 pmaddwd m9, m2, m10 4166 paddd m8, m9 4167 4168 paddd m9, m6, m8 4169 paddd m9, m11 4170 psrad m9, IDCT_SHIFT1 4171 4172 psubd m6, m8 4173 paddd m6, m11 4174 psrad m6, IDCT_SHIFT1 4175 4176 packssdw m3, m9 4177 vpermq m3, m3, 0xD8 4178 4179 packssdw m6, m5 4180 vpermq m6, m6, 0xD8 4181%endmacro 4182 4183%macro IDCT8_PASS_2 0 4184 punpcklqdq m2, m0, m1 4185 punpckhqdq m0, m1 4186 4187 pmaddwd m3, m2, [r5] 4188 pmaddwd m5, m2, [r5 + 32] 4189 pmaddwd m6, m2, [r5 + 64] 4190 pmaddwd m7, m2, [r5 + 96] 4191 phaddd m3, m5 4192 phaddd m6, m7 4193 pshufb m3, [idct8_shuf2] 4194 pshufb m6, [idct8_shuf2] 4195 punpcklqdq m7, m3, m6 4196 punpckhqdq m3, m6 4197 4198 pmaddwd m5, m0, [r6] 4199 pmaddwd m6, m0, [r6 + 32] 4200 pmaddwd m8, m0, [r6 + 64] 4201 pmaddwd m9, m0, [r6 + 96] 4202 phaddd m5, m6 4203 phaddd m8, m9 4204 pshufb m5, [idct8_shuf2] 4205 pshufb m8, [idct8_shuf2] 4206 punpcklqdq m6, m5, m8 4207 punpckhqdq m5, m8 4208 4209 paddd m8, m7, m6 4210 paddd m8, m12 4211 psrad m8, IDCT_SHIFT2 4212 4213 psubd m7, m6 4214 paddd m7, m12 4215 psrad m7, IDCT_SHIFT2 4216 4217 pshufb m7, [idct8_shuf3] 4218 packssdw m8, m7 4219 4220 paddd m9, m3, m5 4221 paddd m9, m12 4222 psrad m9, IDCT_SHIFT2 4223 4224 psubd m3, m5 4225 paddd m3, m12 4226 psrad m3, IDCT_SHIFT2 4227 4228 pshufb m3, [idct8_shuf3] 4229 packssdw m9, m3 4230%endmacro 4231 4232INIT_YMM avx2 4233cglobal idct8, 3, 7, 13, 0-8*16 4234%if BIT_DEPTH == 12 4235 %define IDCT_SHIFT2 8 4236 vpbroadcastd m12, [pd_128] 4237%elif BIT_DEPTH == 10 4238 %define IDCT_SHIFT2 10 4239 vpbroadcastd m12, [pd_512] 4240%elif BIT_DEPTH == 8 4241 %define IDCT_SHIFT2 12 4242 vpbroadcastd m12, [pd_2048] 4243%else 4244 %error Unsupported BIT_DEPTH! 4245%endif 4246%define IDCT_SHIFT1 7 4247 4248 vbroadcasti128 m11, [pd_64] 4249 4250 mov r4, rsp 4251 lea r5, [avx2_idct8_1] 4252 lea r6, [avx2_idct8_2] 4253 4254 ;pass1 4255 mova m1, [r0 + 0 * 32] ; [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1] 4256 mova m0, [r0 + 1 * 32] ; [2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3] 4257 vpunpcklwd m5, m1, m0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3] 4258 vpunpckhwd m1, m0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3] 4259 vinserti128 m4, m5, xm1, 1 ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2] 4260 vextracti128 xm2, m5, 1 ; [1 3 1 3 1 3 1 3] 4261 vinserti128 m1, m1, xm2, 0 ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3] 4262 4263 mova m2, [r0 + 2 * 32] ; [4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5] 4264 mova m0, [r0 + 3 * 32] ; [6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7] 4265 vpunpcklwd m5, m2, m0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7] 4266 vpunpckhwd m2, m0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7] 4267 vinserti128 m0, m5, xm2, 1 ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6] 4268 vextracti128 xm5, m5, 1 ; [5 7 5 7 5 7 5 7] 4269 vinserti128 m2, m2, xm5, 0 ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7] 4270 4271 mova m5, [idct8_shuf1] 4272 vpermd m4, m5, m4 4273 vpermd m0, m5, m0 4274 vpermd m1, m5, m1 4275 vpermd m2, m5, m2 4276 4277 IDCT8_PASS_1 0 4278 mova [r4], m3 4279 mova [r4 + 96], m6 4280 4281 IDCT8_PASS_1 64 4282 mova [r4 + 32], m3 4283 mova [r4 + 64], m6 4284 4285 ;pass2 4286 add r2d, r2d 4287 lea r3, [r2 * 3] 4288 4289 mova m0, [r4] 4290 mova m1, [r4 + 32] 4291 IDCT8_PASS_2 4292 4293 vextracti128 xm3, m8, 1 4294 mova [r1], xm8 4295 mova [r1 + r2], xm3 4296 vextracti128 xm3, m9, 1 4297 mova [r1 + r2 * 2], xm9 4298 mova [r1 + r3], xm3 4299 4300 lea r1, [r1 + r2 * 4] 4301 mova m0, [r4 + 64] 4302 mova m1, [r4 + 96] 4303 IDCT8_PASS_2 4304 4305 vextracti128 xm3, m8, 1 4306 mova [r1], xm8 4307 mova [r1 + r2], xm3 4308 vextracti128 xm3, m9, 1 4309 mova [r1 + r2 * 2], xm9 4310 mova [r1 + r3], xm3 4311 RET 4312 4313 4314%macro IDCT8_AVX512_PASS_1 0 4315 pmaddwd m5, m29, m17 4316 pmaddwd m6, m25, m18 4317 paddd m5, m6 4318 4319 pmaddwd m6, m30, m21 4320 pmaddwd m3, m26, m22 4321 paddd m6, m3 4322 4323 paddd m3, m5, m6 4324 paddd m3, m11 4325 psrad m3, IDCT_SHIFT1 4326 4327 psubd m5, m6 4328 paddd m5, m11 4329 psrad m5, IDCT_SHIFT1 4330 4331 pmaddwd m6, m29, m19 4332 pmaddwd m8, m25, m20 4333 paddd m6, m8 4334 4335 pmaddwd m8, m30, m23 4336 pmaddwd m9, m26, m24 4337 paddd m8, m9 4338 4339 paddd m9, m6, m8 4340 paddd m9, m11 4341 psrad m9, IDCT_SHIFT1 4342 4343 psubd m6, m8 4344 paddd m6, m11 4345 psrad m6, IDCT_SHIFT1 4346 4347 packssdw m3, m9 4348 vpermq m3, m3, 0xD8 4349 4350 packssdw m6, m5 4351 vpermq m6, m6, 0xD8 4352%endmacro 4353 4354 4355%macro IDCT8_AVX512_PASS_2 0 4356 mov r7d, 0xAAAA 4357 kmovd k1, r7d 4358 punpcklqdq m2, m3, m13 4359 punpckhqdq m0, m3, m13 4360 4361 pmaddwd m3, m2, [r5] 4362 pmaddwd m5, m2, [r5 + 1 * mmsize] 4363 pmaddwd m6, m2, [r5 + 2 * mmsize] 4364 pmaddwd m7, m2, [r5 + 3 * mmsize] 4365 4366 vpsrldq m14, m3, 4 4367 paddd m3, m14 4368 vpslldq m16, m5, 4 4369 paddd m5, m16 4370 vmovdqu32 m3 {k1}, m5 4371 4372 vpsrldq m14, m6, 4 4373 paddd m6, m14 4374 vpslldq m16, m7, 4 4375 paddd m7, m16 4376 vmovdqu32 m6 {k1}, m7 4377 4378 punpcklqdq m7, m3, m6 4379 punpckhqdq m3, m6 4380 4381 pmaddwd m5, m0, [r6] 4382 pmaddwd m6, m0, [r6 + 1 * mmsize] 4383 pmaddwd m8, m0, [r6 + 2 * mmsize] 4384 pmaddwd m9, m0, [r6 + 3 * mmsize] 4385 4386 vpsrldq m14, m5, 4 4387 paddd m5, m14 4388 vpslldq m16, m6, 4 4389 paddd m6, m16 4390 vmovdqu32 m5 {k1}, m6 4391 4392 vpsrldq m14, m8, 4 4393 paddd m8, m14 4394 vpslldq m16, m9, 4 4395 paddd m9, m16 4396 vmovdqu32 m8 {k1}, m9 4397 4398 punpcklqdq m6, m5, m8 4399 punpckhqdq m5, m8 4400 4401 paddd m8, m7, m6 4402 paddd m8, m12 4403 psrad m8, IDCT_SHIFT2 4404 4405 psubd m7, m6 4406 paddd m7, m12 4407 psrad m7, IDCT_SHIFT2 4408 4409 pshufb m7, [idct8_avx512_shuf3] 4410 packssdw m8, m7 4411 4412 paddd m9, m3, m5 4413 paddd m9, m12 4414 psrad m9, IDCT_SHIFT2 4415 4416 psubd m3, m5 4417 paddd m3, m12 4418 psrad m3, IDCT_SHIFT2 4419 4420 pshufb m3, [idct8_avx512_shuf3] 4421 packssdw m9, m3 4422%endmacro 4423 4424 4425%if ARCH_X86_64 4426INIT_ZMM avx512 4427cglobal idct8, 3, 8, 31 4428%if BIT_DEPTH == 12 4429 %define IDCT_SHIFT2 8 4430 vpbroadcastd m12, [pd_128] 4431%elif BIT_DEPTH == 10 4432 %define IDCT_SHIFT2 10 4433 vpbroadcastd m12, [pd_512] 4434%elif BIT_DEPTH == 8 4435 %define IDCT_SHIFT2 12 4436 vpbroadcastd m12, [pd_2048] 4437%else 4438 %error Unsupported BIT_DEPTH! 4439%endif 4440%define IDCT_SHIFT1 7 4441 4442 vpbroadcastd m11, [pd_64] 4443 4444 lea r4, [avx512_idct8_3] 4445 lea r5, [avx2_idct8_1] 4446 lea r6, [avx2_idct8_2] 4447 movu m16, [idct16_shuff2] 4448 movu m17, [idct16_shuff3] 4449 4450 ;pass1 4451 mova ym1, [r0 + 0 * 32] 4452 mova ym0, [r0 + 1 * 32] 4453 mova ym25, ym16 4454 mova ym26, ym17 4455 vpermi2w ym25, ym1, ym0 4456 vpermi2w ym26, ym1, ym0 4457 4458 mova ym1, [r0 + 2 * 32] 4459 mova ym0, [r0 + 3 * 32] 4460 mova ym27, ym16 4461 mova ym28, ym17 4462 vpermi2w ym27, ym1, ym0 4463 vpermi2w ym28, ym1, ym0 4464 4465 vperm2i128 ym29, ym25, ym26, 0x20 4466 vperm2i128 ym30, ym25, ym26, 0x31 4467 vperm2i128 ym25, ym27, ym28, 0x20 4468 vperm2i128 ym26, ym27, ym28, 0x31 4469 4470 vinserti64x4 m29, m29, ym29, 1 4471 vinserti64x4 m25, m25, ym25, 1 4472 vinserti64x4 m30, m30, ym30, 1 4473 vinserti64x4 m26, m26, ym26, 1 4474 4475 movu m17, [r4] 4476 movu m18, [r4 + 1 * mmsize] 4477 movu m19, [r4 + 2 * mmsize] 4478 movu m20, [r4 + 3 * mmsize] 4479 movu m21, [r4 + 4 * mmsize] 4480 movu m22, [r4 + 5 * mmsize] 4481 movu m23, [r4 + 6 * mmsize] 4482 movu m24, [r4 + 7 * mmsize] 4483 4484 IDCT8_AVX512_PASS_1 4485 4486 vextracti64x4 ym13, m3, 1 4487 vextracti64x4 ym14, m6, 1 4488 vinserti64x4 m3, m3, ym14, 1 4489 vinserti64x4 m13, m13, ym6, 1 4490 4491 ;pass2 4492 add r2d, r2d 4493 lea r3, [r2 * 3] 4494 lea r5, [avx512_idct8_1] 4495 lea r6, [avx512_idct8_2] 4496 4497 IDCT8_AVX512_PASS_2 4498 4499 vextracti128 xm3, ym8, 1 4500 mova [r1], xm8 4501 mova [r1 + r2], xm3 4502 vextracti128 xm3, ym9, 1 4503 mova [r1 + r2 * 2], xm9 4504 mova [r1 + r3], xm3 4505 4506 lea r1, [r1 + r2 * 4] 4507 4508 vextracti64x4 ym10, m8, 1 4509 vextracti64x4 ym11, m9, 1 4510 4511 vextracti128 xm3, ym10, 1 4512 mova [r1], xm10 4513 mova [r1 + r2], xm3 4514 vextracti128 xm3, ym11, 1 4515 mova [r1 + r2 * 2], xm11 4516 mova [r1 + r3], xm3 4517 RET 4518%endif 4519 4520%macro IDCT_PASS1 2 4521 vbroadcasti128 m5, [tab_idct16_2 + %1 * 16] 4522 4523 pmaddwd m9, m0, m5 4524 pmaddwd m10, m7, m5 4525 phaddd m9, m10 4526 4527 pmaddwd m10, m6, m5 4528 pmaddwd m11, m8, m5 4529 phaddd m10, m11 4530 4531 phaddd m9, m10 4532 vbroadcasti128 m5, [tab_idct16_1 + %1 * 16] 4533 4534 pmaddwd m10, m1, m5 4535 pmaddwd m11, m3, m5 4536 phaddd m10, m11 4537 4538 pmaddwd m11, m4, m5 4539 pmaddwd m12, m2, m5 4540 phaddd m11, m12 4541 4542 phaddd m10, m11 4543 4544 paddd m11, m9, m10 4545 paddd m11, m14 4546 psrad m11, IDCT_SHIFT1 4547 4548 psubd m9, m10 4549 paddd m9, m14 4550 psrad m9, IDCT_SHIFT1 4551 4552 vbroadcasti128 m5, [tab_idct16_2 + %1 * 16 + 16] 4553 4554 pmaddwd m10, m0, m5 4555 pmaddwd m12, m7, m5 4556 phaddd m10, m12 4557 4558 pmaddwd m12, m6, m5 4559 pmaddwd m13, m8, m5 4560 phaddd m12, m13 4561 4562 phaddd m10, m12 4563 vbroadcasti128 m5, [tab_idct16_1 + %1 * 16 + 16] 4564 4565 pmaddwd m12, m1, m5 4566 pmaddwd m13, m3, m5 4567 phaddd m12, m13 4568 4569 pmaddwd m13, m4, m5 4570 pmaddwd m5, m2 4571 phaddd m13, m5 4572 4573 phaddd m12, m13 4574 4575 paddd m5, m10, m12 4576 paddd m5, m14 4577 psrad m5, IDCT_SHIFT1 4578 4579 psubd m10, m12 4580 paddd m10, m14 4581 psrad m10, IDCT_SHIFT1 4582 4583 packssdw m11, m5 4584 packssdw m9, m10 4585 4586 mova m10, [idct16_shuff] 4587 mova m5, [idct16_shuff1] 4588 4589 vpermd m12, m10, m11 4590 vpermd m13, m5, m9 4591 mova [r3 + %1 * 16 * 2], xm12 4592 mova [r3 + %2 * 16 * 2], xm13 4593 vextracti128 [r3 + %2 * 16 * 2 + 32], m13, 1 4594 vextracti128 [r3 + %1 * 16 * 2 + 32], m12, 1 4595%endmacro 4596 4597;------------------------------------------------------- 4598; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride) 4599;------------------------------------------------------- 4600INIT_YMM avx2 4601cglobal idct16, 3, 7, 16, 0-16*mmsize 4602%if BIT_DEPTH == 12 4603 %define IDCT_SHIFT2 8 4604 vpbroadcastd m15, [pd_128] 4605%elif BIT_DEPTH == 10 4606 %define IDCT_SHIFT2 10 4607 vpbroadcastd m15, [pd_512] 4608%elif BIT_DEPTH == 8 4609 %define IDCT_SHIFT2 12 4610 vpbroadcastd m15, [pd_2048] 4611%else 4612 %error Unsupported BIT_DEPTH! 4613%endif 4614%define IDCT_SHIFT1 7 4615 4616 vbroadcasti128 m14, [pd_64] 4617 4618 add r2d, r2d 4619 mov r3, rsp 4620 mov r4d, 2 4621 4622.pass1: 4623 movu xm0, [r0 + 0 * 32] 4624 movu xm1, [r0 + 8 * 32] 4625 punpckhqdq xm2, xm0, xm1 4626 punpcklqdq xm0, xm1 4627 vinserti128 m0, m0, xm2, 1 4628 4629 movu xm1, [r0 + 1 * 32] 4630 movu xm2, [r0 + 9 * 32] 4631 punpckhqdq xm3, xm1, xm2 4632 punpcklqdq xm1, xm2 4633 vinserti128 m1, m1, xm3, 1 4634 4635 movu xm2, [r0 + 2 * 32] 4636 movu xm3, [r0 + 10 * 32] 4637 punpckhqdq xm4, xm2, xm3 4638 punpcklqdq xm2, xm3 4639 vinserti128 m2, m2, xm4, 1 4640 4641 movu xm3, [r0 + 3 * 32] 4642 movu xm4, [r0 + 11 * 32] 4643 punpckhqdq xm5, xm3, xm4 4644 punpcklqdq xm3, xm4 4645 vinserti128 m3, m3, xm5, 1 4646 4647 movu xm4, [r0 + 4 * 32] 4648 movu xm5, [r0 + 12 * 32] 4649 punpckhqdq xm6, xm4, xm5 4650 punpcklqdq xm4, xm5 4651 vinserti128 m4, m4, xm6, 1 4652 4653 movu xm5, [r0 + 5 * 32] 4654 movu xm6, [r0 + 13 * 32] 4655 punpckhqdq xm7, xm5, xm6 4656 punpcklqdq xm5, xm6 4657 vinserti128 m5, m5, xm7, 1 4658 4659 movu xm6, [r0 + 6 * 32] 4660 movu xm7, [r0 + 14 * 32] 4661 punpckhqdq xm8, xm6, xm7 4662 punpcklqdq xm6, xm7 4663 vinserti128 m6, m6, xm8, 1 4664 4665 movu xm7, [r0 + 7 * 32] 4666 movu xm8, [r0 + 15 * 32] 4667 punpckhqdq xm9, xm7, xm8 4668 punpcklqdq xm7, xm8 4669 vinserti128 m7, m7, xm9, 1 4670 4671 punpckhwd m8, m0, m2 ;[8 10] 4672 punpcklwd m0, m2 ;[0 2] 4673 4674 punpckhwd m2, m1, m3 ;[9 11] 4675 punpcklwd m1, m3 ;[1 3] 4676 4677 punpckhwd m3, m4, m6 ;[12 14] 4678 punpcklwd m4, m6 ;[4 6] 4679 4680 punpckhwd m6, m5, m7 ;[13 15] 4681 punpcklwd m5, m7 ;[5 7] 4682 4683 punpckhdq m7, m0, m4 ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67] 4684 punpckldq m0, m4 ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65] 4685 4686 punpckhdq m4, m8, m3 ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147] 4687 punpckldq m8, m3 ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145] 4688 4689 punpckhdq m3, m1, m5 ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77] 4690 punpckldq m1, m5 ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75] 4691 4692 punpckhdq m5, m2, m6 ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157] 4693 punpckldq m2, m6 ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155] 4694 4695 punpckhqdq m6, m0, m8 ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145] 4696 punpcklqdq m0, m8 ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144] 4697 4698 punpckhqdq m8, m7, m4 ;[03 23 43 63 43 103 123 143 07 27 47 67 87 107 127 147] 4699 punpcklqdq m7, m4 ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146] 4700 4701 punpckhqdq m4, m1, m2 ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155] 4702 punpcklqdq m1, m2 ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154] 4703 4704 punpckhqdq m2, m3, m5 ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157] 4705 punpcklqdq m3, m5 ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156] 4706 4707 IDCT_PASS1 0, 14 4708 IDCT_PASS1 2, 12 4709 IDCT_PASS1 4, 10 4710 IDCT_PASS1 6, 8 4711 4712 add r0, 16 4713 add r3, 16 4714 dec r4d 4715 jnz .pass1 4716 4717 mov r3, rsp 4718 mov r4d, 8 4719 lea r5, [tab_idct16_2] 4720 lea r6, [tab_idct16_1] 4721 4722 vbroadcasti128 m7, [r5] 4723 vbroadcasti128 m8, [r5 + 16] 4724 vbroadcasti128 m9, [r5 + 32] 4725 vbroadcasti128 m10, [r5 + 48] 4726 vbroadcasti128 m11, [r5 + 64] 4727 vbroadcasti128 m12, [r5 + 80] 4728 vbroadcasti128 m13, [r5 + 96] 4729 4730.pass2: 4731 movu m1, [r3] 4732 vpermq m0, m1, 0xD8 4733 4734 pmaddwd m1, m0, m7 4735 pmaddwd m2, m0, m8 4736 phaddd m1, m2 4737 4738 pmaddwd m2, m0, m9 4739 pmaddwd m3, m0, m10 4740 phaddd m2, m3 4741 4742 phaddd m1, m2 4743 4744 pmaddwd m2, m0, m11 4745 pmaddwd m3, m0, m12 4746 phaddd m2, m3 4747 4748 vbroadcasti128 m14, [r5 + 112] 4749 pmaddwd m3, m0, m13 4750 pmaddwd m4, m0, m14 4751 phaddd m3, m4 4752 4753 phaddd m2, m3 4754 4755 movu m3, [r3 + 32] 4756 vpermq m0, m3, 0xD8 4757 4758 vbroadcasti128 m14, [r6] 4759 pmaddwd m3, m0, m14 4760 vbroadcasti128 m14, [r6 + 16] 4761 pmaddwd m4, m0, m14 4762 phaddd m3, m4 4763 4764 vbroadcasti128 m14, [r6 + 32] 4765 pmaddwd m4, m0, m14 4766 vbroadcasti128 m14, [r6 + 48] 4767 pmaddwd m5, m0, m14 4768 phaddd m4, m5 4769 4770 phaddd m3, m4 4771 4772 vbroadcasti128 m14, [r6 + 64] 4773 pmaddwd m4, m0, m14 4774 vbroadcasti128 m14, [r6 + 80] 4775 pmaddwd m5, m0, m14 4776 phaddd m4, m5 4777 4778 vbroadcasti128 m14, [r6 + 96] 4779 pmaddwd m6, m0, m14 4780 vbroadcasti128 m14, [r6 + 112] 4781 pmaddwd m0, m14 4782 phaddd m6, m0 4783 4784 phaddd m4, m6 4785 4786 paddd m5, m1, m3 4787 paddd m5, m15 4788 psrad m5, IDCT_SHIFT2 4789 4790 psubd m1, m3 4791 paddd m1, m15 4792 psrad m1, IDCT_SHIFT2 4793 4794 paddd m6, m2, m4 4795 paddd m6, m15 4796 psrad m6, IDCT_SHIFT2 4797 4798 psubd m2, m4 4799 paddd m2, m15 4800 psrad m2, IDCT_SHIFT2 4801 4802 packssdw m5, m6 4803 packssdw m1, m2 4804 pshufb m2, m1, [dct16_shuf1] 4805 4806 mova [r1], xm5 4807 mova [r1 + 16], xm2 4808 vextracti128 [r1 + r2], m5, 1 4809 vextracti128 [r1 + r2 + 16], m2, 1 4810 4811 lea r1, [r1 + 2 * r2] 4812 add r3, 64 4813 dec r4d 4814 jnz .pass2 4815 RET 4816 4817 4818%macro IDCT16_AVX512_PASS1 3 4819 movu m5, [tab_AVX512_idct16_2 + %1 * 64] 4820 pmaddwd m9, m4, m5 4821 pmaddwd m10, m6, m5 4822 4823 vpsrldq m16, m9, 4 4824 paddd m9, m16 4825 vpslldq m17, m10, 4 4826 paddd m10, m17 4827 vmovdqu32 m9 {k1}, m10 4828 4829 pmaddwd m10, m7, m5 4830 pmaddwd m11, m8, m5 4831 4832 vpsrldq m16, m10, 4 4833 paddd m10, m16 4834 vpslldq m17, m11, 4 4835 paddd m11, m17 4836 vmovdqu32 m10 {k1}, m11 4837 4838 vpsrldq m16, m9, 8 4839 paddd m9, m16 4840 vpslldq m17, m10, 8 4841 paddd m10, m17 4842 vmovdqu32 m9 {k2}, m10 4843 4844 mova m5, [tab_AVX512_idct16_1 + %1 * 64] 4845 pmaddwd m10, m28, m5 4846 pmaddwd m11, m29, m5 4847 4848 vpsrldq m16, m10, 4 4849 paddd m10, m16 4850 vpslldq m17, m11, 4 4851 paddd m11, m17 4852 vmovdqu32 m10 {k1}, m11 4853 4854 pmaddwd m11, m30, m5 4855 pmaddwd m12, m31, m5 4856 4857 vpsrldq m16, m11, 4 4858 paddd m11, m16 4859 vpslldq m17, m12, 4 4860 paddd m12, m17 4861 vmovdqu32 m11 {k1}, m12 4862 4863 vpsrldq m16, m10, 8 4864 paddd m10, m16 4865 vpslldq m17, m11, 8 4866 paddd m11, m17 4867 vmovdqu32 m10 {k2}, m11 4868 4869 paddd m11, m9, m10 4870 paddd m11, m14 4871 psrad m11, IDCT_SHIFT1 4872 4873 psubd m9, m10 4874 paddd m9, m14 4875 psrad m9, IDCT_SHIFT1 4876 4877 mova m5, [tab_AVX512_idct16_2 + %1 * 64 + 64] 4878 pmaddwd m10, m4, m5 4879 pmaddwd m12, m6, m5 4880 4881 4882 vpsrldq m16, m10, 4 4883 paddd m10, m16 4884 vpslldq m17, m12, 4 4885 paddd m12, m17 4886 vmovdqu32 m10 {k1}, m12 4887 4888 pmaddwd m12, m7, m5 4889 pmaddwd m13, m8, m5 4890 4891 4892 vpsrldq m16, m12, 4 4893 paddd m12, m16 4894 vpslldq m17, m13, 4 4895 paddd m13, m17 4896 vmovdqu32 m12 {k1}, m13 4897 4898 4899 vpsrldq m16, m10, 8 4900 paddd m10, m16 4901 vpslldq m17, m12, 8 4902 paddd m12, m17 4903 vmovdqu32 m10 {k2}, m12 4904 4905 4906 4907 mova m5, [tab_AVX512_idct16_1 + %1 * 64 + 64] 4908 pmaddwd m12, m28, m5 4909 pmaddwd m13, m29, m5 4910 4911 4912 vpsrldq m16, m12, 4 4913 paddd m12, m16 4914 vpslldq m17, m13, 4 4915 paddd m13, m17 4916 vmovdqu32 m12 {k1}, m13 4917 4918 pmaddwd m13, m30, m5 4919 pmaddwd m5, m31 4920 4921 4922 vpsrldq m16, m13, 4 4923 paddd m13, m16 4924 vpslldq m17, m5, 4 4925 paddd m5, m17 4926 vmovdqu32 m13 {k1}, m5 4927 4928 4929 vpsrldq m16, m12, 8 4930 paddd m12, m16 4931 vpslldq m17, m13, 8 4932 paddd m13, m17 4933 vmovdqu32 m12 {k2}, m13 4934 4935 4936 paddd m5, m10, m12 4937 paddd m5, m14 4938 psrad m5, IDCT_SHIFT1 4939 4940 psubd m10, m12 4941 paddd m10, m14 4942 psrad m10, IDCT_SHIFT1 4943 4944 packssdw m11, m5 4945 packssdw m9, m10 4946 4947 mova m10, [idct16_AVX512_shuff] 4948 mova m5, [idct16_AVX512_shuff1] 4949 4950 vpermd m%2, m10, m11 4951 vpermd m%3, m5, m9 4952%endmacro 4953 4954%macro IDCT16_AVX512_PASS2 2 4955 vpermq m0, m%1, 0xD8 4956 4957 pmaddwd m1, m0, m7 4958 pmaddwd m2, m0, m8 4959 4960 4961 vpsrldq m14, m1, 4 4962 paddd m1, m14 4963 vpslldq m31, m2, 4 4964 paddd m2, m31 4965 vmovdqu32 m1 {k1}, m2 4966 4967 pmaddwd m2, m0, m9 4968 pmaddwd m3, m0, m10 4969 4970 4971 vpsrldq m14, m2, 4 4972 paddd m2, m14 4973 vpslldq m31, m3, 4 4974 paddd m3, m31 4975 vmovdqu32 m2 {k1}, m3 4976 4977 4978 vpsrldq m14, m1, 8 4979 paddd m1, m14 4980 vpslldq m31, m2, 8 4981 paddd m2, m31 4982 vmovdqu32 m1 {k2}, m2 4983 4984 pmaddwd m2, m0, m11 4985 pmaddwd m3, m0, m12 4986 4987 4988 vpsrldq m14, m2, 4 4989 paddd m2, m14 4990 vpslldq m31, m3, 4 4991 paddd m3, m31 4992 vmovdqu32 m2 {k1}, m3 4993 4994 vbroadcasti64x2 m14, [r5 + 112] 4995 pmaddwd m3, m0, m13 4996 pmaddwd m4, m0, m14 4997 4998 4999 vpsrldq m14, m3, 4 5000 paddd m3, m14 5001 vpslldq m31, m4, 4 5002 paddd m4, m31 5003 vmovdqu32 m3 {k1}, m4 5004 5005 5006 vpsrldq m14, m2, 8 5007 paddd m2, m14 5008 vpslldq m31, m3, 8 5009 paddd m3, m31 5010 vmovdqu32 m2 {k2}, m3 5011 5012 vpermq m0, m%2, 0xD8 5013 pmaddwd m3, m0, m16 5014 pmaddwd m4, m0, m17 5015 5016 5017 vpsrldq m14, m3, 4 5018 paddd m3, m14 5019 vpslldq m31, m4, 4 5020 paddd m4, m31 5021 vmovdqu32 m3 {k1}, m4 5022 5023 pmaddwd m4, m0, m19 5024 pmaddwd m5, m0, m23 5025 5026 5027 vpsrldq m14, m4, 4 5028 paddd m4, m14 5029 vpslldq m31, m5, 4 5030 paddd m5, m31 5031 vmovdqu32 m4 {k1}, m5 5032 5033 5034 vpsrldq m14, m3, 8 5035 paddd m3, m14 5036 vpslldq m31, m4, 8 5037 paddd m4, m31 5038 vmovdqu32 m3 {k2}, m4 5039 5040 5041 pmaddwd m4, m0, m28 5042 pmaddwd m5, m0, m29 5043 5044 vpsrldq m14, m4, 4 5045 paddd m4, m14 5046 vpslldq m31, m5, 4 5047 paddd m5, m31 5048 vmovdqu32 m4 {k1}, m5 5049 5050 pmaddwd m6, m0, m30 5051 vbroadcasti64x2 m31, [r6 + 112] 5052 pmaddwd m0, m31 5053 5054 5055 vpsrldq m14, m6, 4 5056 paddd m6, m14 5057 vpslldq m31, m0, 4 5058 paddd m0, m31 5059 vmovdqu32 m6 {k1}, m0 5060 5061 5062 vpsrldq m14, m4, 8 5063 paddd m4, m14 5064 vpslldq m31, m6, 8 5065 paddd m6, m31 5066 vmovdqu32 m4 {k2}, m6 5067 5068 paddd m5, m1, m3 5069 paddd m5, m15 5070 psrad m5, IDCT_SHIFT2 5071 5072 psubd m1, m3 5073 paddd m1, m15 5074 psrad m1, IDCT_SHIFT2 5075 5076 paddd m6, m2, m4 5077 paddd m6, m15 5078 psrad m6, IDCT_SHIFT2 5079 5080 psubd m2, m4 5081 paddd m2, m15 5082 psrad m2, IDCT_SHIFT2 5083 5084 packssdw m5, m6 5085 packssdw m1, m2 5086 pshufb m2, m1, [idct16_AVX512_shuff6] 5087%endmacro 5088 5089 5090;------------------------------------------------------- 5091; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride) 5092;------------------------------------------------------- 5093INIT_ZMM avx512 5094cglobal idct16, 3, 8, 32 5095%if BIT_DEPTH == 12 5096 %define IDCT_SHIFT2 8 5097 vpbroadcastd m15, [pd_128] 5098%elif BIT_DEPTH == 10 5099 %define IDCT_SHIFT2 10 5100 vpbroadcastd m15, [pd_512] 5101%elif BIT_DEPTH == 8 5102 %define IDCT_SHIFT2 12 5103 vpbroadcastd m15, [pd_2048] 5104%else 5105 %error Unsupported BIT_DEPTH! 5106%endif 5107%define IDCT_SHIFT1 7 5108 5109 vpbroadcastd m14, [pd_64] 5110 5111 add r2d, r2d 5112 5113 mov r7d, 0xAAAA 5114 kmovd k1, r7d 5115 mov r7d, 0xCCCC 5116 kmovd k2, r7d 5117 mova ym2, [idct16_shuff2] 5118 mova ym3, [idct16_shuff3] 5119 mova ym26, [idct16_shuff4] 5120 mova ym27, [idct16_shuff5] 5121 5122.pass1: 5123 movu xm0, [r0 + 0 * 32] 5124 vinserti128 ym0, ym0, [r0 + 8 * 32], 1 5125 movu xm1, [r0 + 2 * 32] 5126 vinserti128 ym1, ym1, [r0 + 10 * 32], 1 5127 5128 mova ym9, ym2 5129 mova ym10, ym3 5130 vpermi2w ym9, ym0, ym1 5131 vpermi2w ym10, ym0, ym1 5132 5133 movu xm0, [r0 + 4 * 32] 5134 vinserti128 ym0, ym0, [r0 + 12 * 32], 1 5135 movu xm1, [r0 + 6 * 32] 5136 vinserti128 ym1, ym1, [r0 + 14 * 32], 1 5137 5138 mova ym11, ym2 5139 mova ym12, ym3 5140 vpermi2w ym11, ym0, ym1 5141 vpermi2w ym12, ym0, ym1 5142 5143 mova ym4, ym26 5144 mova ym6, ym27 5145 vpermi2d ym4, ym9, ym11 5146 vpermi2d ym6, ym9, ym11 5147 5148 mova ym7, ym26 5149 mova ym8, ym27 5150 vpermi2d ym7, ym10, ym12 5151 vpermi2d ym8, ym10, ym12 5152 5153 vpermq ym4, ym4, q3120 5154 vpermq ym6, ym6, q3120 5155 vpermq ym7, ym7, q3120 5156 vpermq ym8, ym8, q3120 5157 5158 movu xm0, [r0 + 1 * 32] 5159 vinserti128 ym0, ym0, [r0 + 9 * 32], 1 5160 movu xm1, [r0 + 3 * 32] 5161 vinserti128 ym1, ym1, [r0 + 11 * 32], 1 5162 5163 mova ym9, ym2 5164 mova ym10, ym3 5165 vpermi2w ym9, ym0, ym1 5166 vpermi2w ym10, ym0, ym1 5167 5168 movu xm0, [r0 + 5 * 32] 5169 vinserti128 ym0, ym0, [r0 + 13 * 32], 1 5170 movu xm1, [r0 + 7 * 32] 5171 vinserti128 ym1, ym1, [r0 + 15 * 32], 1 5172 5173 mova ym11, ym2 5174 mova ym12, ym3 5175 vpermi2w ym11, ym0, ym1 5176 vpermi2w ym12, ym0, ym1 5177 5178 mova ym28, ym26 5179 mova ym29, ym27 5180 vpermi2d ym28, ym9, ym11 5181 vpermi2d ym29, ym9, ym11 5182 5183 mova ym30, ym26 5184 mova ym31, ym27 5185 vpermi2d ym30, ym10, ym12 5186 vpermi2d ym31, ym10, ym12 5187 5188 vpermq ym28, ym28, q3120 5189 vpermq ym29, ym29, q3120 5190 vpermq ym30, ym30, q3120 5191 vpermq ym31, ym31, q3120 5192 5193 vinserti64x4 m4, m4, ym4, 1 5194 vinserti64x4 m6, m6, ym6, 1 5195 vinserti64x4 m7, m7, ym7, 1 5196 vinserti64x4 m8, m8, ym8, 1 5197 vinserti64x4 m28, m28, ym28, 1 5198 vinserti64x4 m29, m29, ym29, 1 5199 vinserti64x4 m30, m30, ym30, 1 5200 vinserti64x4 m31, m31, ym31, 1 5201 5202 IDCT16_AVX512_PASS1 0, 18, 19 5203 IDCT16_AVX512_PASS1 2, 20, 21 5204 5205 add r0, 16 5206 5207 movu xm0, [r0 + 0 * 32] 5208 vinserti128 ym0, ym0, [r0 + 8 * 32], 1 5209 movu xm1, [r0 + 2 * 32] 5210 vinserti128 ym1, ym1, [r0 + 10 * 32], 1 5211 5212 mova ym9, ym2 5213 mova ym10, ym3 5214 vpermi2w ym9, ym0, ym1 5215 vpermi2w ym10, ym0, ym1 5216 5217 movu xm0, [r0 + 4 * 32] 5218 vinserti128 ym0, ym0, [r0 + 12 * 32], 1 5219 movu xm1, [r0 + 6 * 32] 5220 vinserti128 ym1, ym1, [r0 + 14 * 32], 1 5221 5222 mova ym11, ym2 5223 mova ym12, ym3 5224 vpermi2w ym11, ym0, ym1 5225 vpermi2w ym12, ym0, ym1 5226 5227 mova ym4, ym26 5228 mova ym6, ym27 5229 vpermi2d ym4, ym9, ym11 5230 vpermi2d ym6, ym9, ym11 5231 5232 mova ym7, ym26 5233 mova ym8, ym27 5234 vpermi2d ym7, ym10, ym12 5235 vpermi2d ym8, ym10, ym12 5236 5237 vpermq ym4, ym4, q3120 5238 vpermq ym6, ym6, q3120 5239 vpermq ym7, ym7, q3120 5240 vpermq ym8, ym8, q3120 5241 5242 movu xm0, [r0 + 1 * 32] 5243 vinserti128 ym0, ym0, [r0 + 9 * 32], 1 5244 movu xm1, [r0 + 3 * 32] 5245 vinserti128 ym1, ym1, [r0 + 11 * 32], 1 5246 5247 mova ym9, ym2 5248 mova ym10, ym3 5249 vpermi2w ym9, ym0, ym1 5250 vpermi2w ym10, ym0, ym1 5251 5252 movu xm0, [r0 + 5 * 32] 5253 vinserti128 ym0, ym0, [r0 + 13 * 32], 1 5254 movu xm1, [r0 + 7 * 32] 5255 vinserti128 ym1, ym1, [r0 + 15 * 32], 1 5256 5257 mova ym11, ym2 5258 mova ym12, ym3 5259 vpermi2w ym11, ym0, ym1 5260 vpermi2w ym12, ym0, ym1 5261 5262 mova ym28, ym26 5263 mova ym29, ym27 5264 vpermi2d ym28, ym9, ym11 5265 vpermi2d ym29, ym9, ym11 5266 5267 mova ym30, ym26 5268 mova ym31, ym27 5269 vpermi2d ym30, ym10, ym12 5270 vpermi2d ym31, ym10, ym12 5271 5272 vpermq ym28, ym28, q3120 5273 vpermq ym29, ym29, q3120 5274 vpermq ym30, ym30, q3120 5275 vpermq ym31, ym31, q3120 5276 5277 vinserti64x4 m4, m4, ym4, 1 5278 vinserti64x4 m6, m6, ym6, 1 5279 vinserti64x4 m7, m7, ym7, 1 5280 vinserti64x4 m8, m8, ym8, 1 5281 vinserti64x4 m28, m28, ym28, 1 5282 vinserti64x4 m29, m29, ym29, 1 5283 vinserti64x4 m30, m30, ym30, 1 5284 vinserti64x4 m31, m31, ym31, 1 5285 5286 5287 IDCT16_AVX512_PASS1 0, 22, 23 5288 IDCT16_AVX512_PASS1 2, 24, 25 5289 5290 mova m26, [idct16_AVX512_shuff2] 5291 mova m27, [idct16_AVX512_shuff3] 5292 vpermi2q m26, m18, m22 5293 vpermi2q m27, m18, m22 5294 mova m18, [idct16_AVX512_shuff2] 5295 mova m22, [idct16_AVX512_shuff3] 5296 vpermi2q m18, m20, m24 5297 vpermi2q m22, m20, m24 5298 mova m20, [idct16_AVX512_shuff4] 5299 mova m24, [idct16_AVX512_shuff5] 5300 vpermi2q m20, m21, m25 5301 vpermi2q m24, m21, m25 5302 mova m21, [idct16_AVX512_shuff4] 5303 mova m25, [idct16_AVX512_shuff5] 5304 vpermi2q m21, m19, m23 5305 vpermi2q m25, m19, m23 5306 5307 lea r5, [tab_idct16_2] 5308 lea r6, [tab_idct16_1] 5309 5310 vbroadcasti64x2 m7, [r5] 5311 vbroadcasti64x2 m8, [r5 + 16] 5312 vbroadcasti64x2 m9, [r5 + 32] 5313 vbroadcasti64x2 m10, [r5 + 48] 5314 vbroadcasti64x2 m11, [r5 + 64] 5315 vbroadcasti64x2 m12, [r5 + 80] 5316 vbroadcasti64x2 m13, [r5 + 96] 5317 5318 vbroadcasti64x2 m16, [r6] 5319 vbroadcasti64x2 m17, [r6 + 16] 5320 vbroadcasti64x2 m19, [r6 + 32] 5321 vbroadcasti64x2 m23, [r6 + 48] 5322 vbroadcasti64x2 m28, [r6 + 64] 5323 vbroadcasti64x2 m29, [r6 + 80] 5324 vbroadcasti64x2 m30, [r6 + 96] 5325 5326 5327 IDCT16_AVX512_PASS2 26, 27 5328 mova [r1], xm5 5329 mova [r1 + 16], xm2 5330 vextracti128 [r1 + r2], ym5, 1 5331 vextracti128 [r1 + r2 + 16], ym2, 1 5332 vextracti64x4 ym14, m5, 1 5333 vextracti64x4 ym31, m2, 1 5334 lea r1, [r1 + 2 * r2] 5335 mova [r1], xm14 5336 mova [r1 + 16], xm31 5337 vextracti128 [r1 + r2], ym14, 1 5338 vextracti128 [r1 + r2 + 16], ym31, 1 5339 5340 IDCT16_AVX512_PASS2 18, 22 5341 lea r1, [r1 + 2 * r2] 5342 mova [r1], xm5 5343 mova [r1 + 16], xm2 5344 vextracti128 [r1 + r2], ym5, 1 5345 vextracti128 [r1 + r2 + 16], ym2, 1 5346 vextracti64x4 ym14, m5, 1 5347 vextracti64x4 ym31, m2, 1 5348 lea r1, [r1 + 2 * r2] 5349 mova [r1], xm14 5350 mova [r1 + 16], xm31 5351 vextracti128 [r1 + r2], ym14, 1 5352 vextracti128 [r1 + r2 + 16], ym31, 1 5353 5354 IDCT16_AVX512_PASS2 20, 24 5355 lea r1, [r1 + 2 * r2] 5356 mova [r1], xm5 5357 mova [r1 + 16], xm2 5358 vextracti128 [r1 + r2], ym5, 1 5359 vextracti128 [r1 + r2 + 16], ym2, 1 5360 vextracti64x4 ym14, m5, 1 5361 vextracti64x4 ym31, m2, 1 5362 lea r1, [r1 + 2 * r2] 5363 mova [r1], xm14 5364 mova [r1 + 16], xm31 5365 vextracti128 [r1 + r2], ym14, 1 5366 vextracti128 [r1 + r2 + 16], ym31, 1 5367 5368 IDCT16_AVX512_PASS2 21, 25 5369 lea r1, [r1 + 2 * r2] 5370 mova [r1], xm5 5371 mova [r1 + 16], xm2 5372 vextracti128 [r1 + r2], ym5, 1 5373 vextracti128 [r1 + r2 + 16], ym2, 1 5374 vextracti64x4 ym14, m5, 1 5375 vextracti64x4 ym31, m2, 1 5376 lea r1, [r1 + 2 * r2] 5377 mova [r1], xm14 5378 mova [r1 + 16], xm31 5379 vextracti128 [r1 + r2], ym14, 1 5380 vextracti128 [r1 + r2 + 16], ym31, 1 5381 RET 5382 5383 5384 5385%macro IDCT32_PASS1 1 5386 vbroadcasti128 m3, [tab_idct32_1 + %1 * 32] 5387 vbroadcasti128 m13, [tab_idct32_1 + %1 * 32 + 16] 5388 pmaddwd m9, m4, m3 5389 pmaddwd m10, m8, m13 5390 phaddd m9, m10 5391 5392 pmaddwd m10, m2, m3 5393 pmaddwd m11, m1, m13 5394 phaddd m10, m11 5395 5396 phaddd m9, m10 5397 5398 vbroadcasti128 m3, [tab_idct32_1 + (15 - %1) * 32] 5399 vbroadcasti128 m13, [tab_idct32_1 + (15- %1) * 32 + 16] 5400 pmaddwd m10, m4, m3 5401 pmaddwd m11, m8, m13 5402 phaddd m10, m11 5403 5404 pmaddwd m11, m2, m3 5405 pmaddwd m12, m1, m13 5406 phaddd m11, m12 5407 5408 phaddd m10, m11 5409 phaddd m9, m10 ;[row0s0 row2s0 row0s15 row2s15 row1s0 row3s0 row1s15 row3s15] 5410 5411 vbroadcasti128 m3, [tab_idct32_2 + %1 * 16] 5412 pmaddwd m10, m0, m3 5413 pmaddwd m11, m7, m3 5414 phaddd m10, m11 5415 phaddd m10, m10 5416 5417 vbroadcasti128 m3, [tab_idct32_3 + %1 * 16] 5418 pmaddwd m11, m5, m3 5419 pmaddwd m12, m6, m3 5420 phaddd m11, m12 5421 phaddd m11, m11 5422 5423 paddd m12, m10, m11 ;[row0a0 row2a0 NIL NIL row1sa0 row3a0 NIL NIL] 5424 psubd m10, m11 ;[row0a15 row2a15 NIL NIL row1a15 row3a15 NIL NIL] 5425 5426 punpcklqdq m12, m10 ;[row0a0 row2a0 row0a15 row2a15 row1a0 row3a0 row1a15 row3a15] 5427 paddd m10, m9, m12 5428 paddd m10, m15 5429 psrad m10, IDCT_SHIFT1 5430 5431 psubd m12, m9 5432 paddd m12, m15 5433 psrad m12, IDCT_SHIFT1 5434 5435 packssdw m10, m12 5436 vextracti128 xm12, m10, 1 5437 movd [r3 + %1 * 64], xm10 5438 movd [r3 + 32 + %1 * 64], xm12 5439 pextrd [r4 - %1 * 64], xm10, 1 5440 pextrd [r4+ 32 - %1 * 64], xm12, 1 5441 pextrd [r3 + 16 * 64 + %1 *64], xm10, 3 5442 pextrd [r3 + 16 * 64 + 32 + %1 * 64], xm12, 3 5443 pextrd [r4 + 16 * 64 - %1 * 64], xm10, 2 5444 pextrd [r4 + 16 * 64 + 32 - %1 * 64], xm12, 2 5445%endmacro 5446 5447;------------------------------------------------------- 5448; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride) 5449;------------------------------------------------------- 5450 5451; TODO: Reduce PHADDD instruction by PADDD 5452 5453INIT_YMM avx2 5454cglobal idct32, 3, 6, 16, 0-32*64 5455 5456%define IDCT_SHIFT1 7 5457 5458 vbroadcasti128 m15, [pd_64] 5459 5460 mov r3, rsp 5461 lea r4, [r3 + 15 * 64] 5462 mov r5d, 8 5463 5464.pass1: 5465 movq xm0, [r0 + 2 * 64] 5466 movq xm1, [r0 + 18 * 64] 5467 punpcklqdq xm0, xm0, xm1 5468 movq xm1, [r0 + 0 * 64] 5469 movq xm2, [r0 + 16 * 64] 5470 punpcklqdq xm1, xm1, xm2 5471 vinserti128 m0, m0, xm1, 1 ;[2 18 0 16] 5472 5473 movq xm1, [r0 + 1 * 64] 5474 movq xm2, [r0 + 9 * 64] 5475 punpcklqdq xm1, xm1, xm2 5476 movq xm2, [r0 + 17 * 64] 5477 movq xm3, [r0 + 25 * 64] 5478 punpcklqdq xm2, xm2, xm3 5479 vinserti128 m1, m1, xm2, 1 ;[1 9 17 25] 5480 5481 movq xm2, [r0 + 6 * 64] 5482 movq xm3, [r0 + 22 * 64] 5483 punpcklqdq xm2, xm2, xm3 5484 movq xm3, [r0 + 4 * 64] 5485 movq xm4, [r0 + 20 * 64] 5486 punpcklqdq xm3, xm3, xm4 5487 vinserti128 m2, m2, xm3, 1 ;[6 22 4 20] 5488 5489 movq xm3, [r0 + 3 * 64] 5490 movq xm4, [r0 + 11 * 64] 5491 punpcklqdq xm3, xm3, xm4 5492 movq xm4, [r0 + 19 * 64] 5493 movq xm5, [r0 + 27 * 64] 5494 punpcklqdq xm4, xm4, xm5 5495 vinserti128 m3, m3, xm4, 1 ;[3 11 17 25] 5496 5497 movq xm4, [r0 + 10 * 64] 5498 movq xm5, [r0 + 26 * 64] 5499 punpcklqdq xm4, xm4, xm5 5500 movq xm5, [r0 + 8 * 64] 5501 movq xm6, [r0 + 24 * 64] 5502 punpcklqdq xm5, xm5, xm6 5503 vinserti128 m4, m4, xm5, 1 ;[10 26 8 24] 5504 5505 movq xm5, [r0 + 5 * 64] 5506 movq xm6, [r0 + 13 * 64] 5507 punpcklqdq xm5, xm5, xm6 5508 movq xm6, [r0 + 21 * 64] 5509 movq xm7, [r0 + 29 * 64] 5510 punpcklqdq xm6, xm6, xm7 5511 vinserti128 m5, m5, xm6, 1 ;[5 13 21 9] 5512 5513 movq xm6, [r0 + 14 * 64] 5514 movq xm7, [r0 + 30 * 64] 5515 punpcklqdq xm6, xm6, xm7 5516 movq xm7, [r0 + 12 * 64] 5517 movq xm8, [r0 + 28 * 64] 5518 punpcklqdq xm7, xm7, xm8 5519 vinserti128 m6, m6, xm7, 1 ;[14 30 12 28] 5520 5521 movq xm7, [r0 + 7 * 64] 5522 movq xm8, [r0 + 15 * 64] 5523 punpcklqdq xm7, xm7, xm8 5524 movq xm8, [r0 + 23 * 64] 5525 movq xm9, [r0 + 31 * 64] 5526 punpcklqdq xm8, xm8, xm9 5527 vinserti128 m7, m7, xm8, 1 ;[7 15 23 31] 5528 5529 punpckhwd m8, m0, m2 ;[18 22 16 20] 5530 punpcklwd m0, m2 ;[2 6 0 4] 5531 5532 punpckhwd m2, m1, m3 ;[9 11 25 27] 5533 punpcklwd m1, m3 ;[1 3 17 19] 5534 5535 punpckhwd m3, m4, m6 ;[26 30 24 28] 5536 punpcklwd m4, m6 ;[10 14 8 12] 5537 5538 punpckhwd m6, m5, m7 ;[13 15 29 31] 5539 punpcklwd m5, m7 ;[5 7 21 23] 5540 5541 punpckhdq m7, m0, m4 ;[22 62 102 142 23 63 103 143 02 42 82 122 03 43 83 123] 5542 punpckldq m0, m4 ;[20 60 100 140 21 61 101 141 00 40 80 120 01 41 81 121] 5543 5544 punpckhdq m4, m8, m3 ;[182 222 262 302 183 223 263 303 162 202 242 282 163 203 243 283] 5545 punpckldq m8, m3 ;[180 220 260 300 181 221 261 301 160 200 240 280 161 201 241 281] 5546 5547 punpckhdq m3, m1, m5 ;[12 32 52 72 13 33 53 73 172 192 212 232 173 193 213 233] 5548 punpckldq m1, m5 ;[10 30 50 70 11 31 51 71 170 190 210 230 171 191 211 231] 5549 5550 punpckhdq m5, m2, m6 ;[92 112 132 152 93 113 133 153 252 272 292 312 253 273 293 313] 5551 punpckldq m2, m6 ;[90 110 130 150 91 111 131 151 250 270 290 310 251 271 291 311] 5552 5553 punpckhqdq m6, m0, m8 ;[21 61 101 141 181 221 261 301 01 41 81 121 161 201 241 281] 5554 punpcklqdq m0, m8 ;[20 60 100 140 180 220 260 300 00 40 80 120 160 200 240 280] 5555 5556 punpckhqdq m8, m7, m4 ;[23 63 103 143 183 223 263 303 03 43 83 123 163 203 243 283] 5557 punpcklqdq m7, m4 ;[22 62 102 142 182 222 262 302 02 42 82 122 162 202 242 282] 5558 5559 punpckhqdq m4, m1, m2 ;[11 31 51 71 91 111 131 151 171 191 211 231 251 271 291 311] 5560 punpcklqdq m1, m2 ;[10 30 50 70 90 110 130 150 170 190 210 230 250 270 290 310] 5561 5562 punpckhqdq m2, m3, m5 ;[13 33 53 73 93 113 133 153 173 193 213 233 253 273 293 313] 5563 punpcklqdq m3, m5 ;[12 32 52 72 92 112 132 152 172 192 212 232 252 272 292 312] 5564 5565 vperm2i128 m5, m0, m6, 0x20 ;[20 60 100 140 180 220 260 300 21 61 101 141 181 221 261 301] 5566 vperm2i128 m0, m0, m6, 0x31 ;[00 40 80 120 160 200 240 280 01 41 81 121 161 201 241 281] 5567 5568 vperm2i128 m6, m7, m8, 0x20 ;[22 62 102 142 182 222 262 302 23 63 103 143 183 223 263 303] 5569 vperm2i128 m7, m7, m8, 0x31 ;[02 42 82 122 162 202 242 282 03 43 83 123 163 203 243 283] 5570 5571 vperm2i128 m8, m1, m4, 0x31 ;[170 190 210 230 250 270 290 310 171 191 211 231 251 271 291 311] 5572 vperm2i128 m4, m1, m4, 0x20 ;[10 30 50 70 90 110 130 150 11 31 51 71 91 111 131 151] 5573 5574 vperm2i128 m1, m3, m2, 0x31 ;[172 192 212 232 252 272 292 312 173 193 213 233 253 273 293 313] 5575 vperm2i128 m2, m3, m2, 0x20 ;[12 32 52 72 92 112 132 152 13 33 53 73 93 113 133 153] 5576 5577 IDCT32_PASS1 0 5578 IDCT32_PASS1 1 5579 IDCT32_PASS1 2 5580 IDCT32_PASS1 3 5581 IDCT32_PASS1 4 5582 IDCT32_PASS1 5 5583 IDCT32_PASS1 6 5584 IDCT32_PASS1 7 5585 5586 add r0, 8 5587 add r3, 4 5588 add r4, 4 5589 dec r5d 5590 jnz .pass1 5591 5592%if BIT_DEPTH == 12 5593 %define IDCT_SHIFT2 8 5594 vpbroadcastd m15, [pd_128] 5595%elif BIT_DEPTH == 10 5596 %define IDCT_SHIFT2 10 5597 vpbroadcastd m15, [pd_512] 5598%elif BIT_DEPTH == 8 5599 %define IDCT_SHIFT2 12 5600 vpbroadcastd m15, [pd_2048] 5601%else 5602 %error Unsupported BIT_DEPTH! 5603%endif 5604 5605 mov r3, rsp 5606 add r2d, r2d 5607 mov r4d, 32 5608 5609 mova m7, [tab_idct32_4] 5610 mova m8, [tab_idct32_4 + 32] 5611 mova m9, [tab_idct32_4 + 64] 5612 mova m10, [tab_idct32_4 + 96] 5613 mova m11, [tab_idct32_4 + 128] 5614 mova m12, [tab_idct32_4 + 160] 5615 mova m13, [tab_idct32_4 + 192] 5616 mova m14, [tab_idct32_4 + 224] 5617.pass2: 5618 movu m0, [r3] 5619 movu m1, [r3 + 32] 5620 5621 pmaddwd m2, m0, m7 5622 pmaddwd m3, m0, m8 5623 phaddd m2, m3 5624 5625 pmaddwd m3, m0, m9 5626 pmaddwd m4, m0, m10 5627 phaddd m3, m4 5628 5629 phaddd m2, m3 5630 5631 pmaddwd m3, m0, m11 5632 pmaddwd m4, m0, m12 5633 phaddd m3, m4 5634 5635 pmaddwd m4, m0, m13 5636 pmaddwd m5, m0, m14 5637 phaddd m4, m5 5638 5639 phaddd m3, m4 5640 5641 vperm2i128 m4, m2, m3, 0x31 5642 vperm2i128 m2, m2, m3, 0x20 5643 paddd m2, m4 5644 5645 pmaddwd m3, m0, [tab_idct32_4 + 256] 5646 pmaddwd m4, m0, [tab_idct32_4 + 288] 5647 phaddd m3, m4 5648 5649 pmaddwd m4, m0, [tab_idct32_4 + 320] 5650 pmaddwd m5, m0, [tab_idct32_4 + 352] 5651 phaddd m4, m5 5652 5653 phaddd m3, m4 5654 5655 pmaddwd m4, m0, [tab_idct32_4 + 384] 5656 pmaddwd m5, m0, [tab_idct32_4 + 416] 5657 phaddd m4, m5 5658 5659 pmaddwd m5, m0, [tab_idct32_4 + 448] 5660 pmaddwd m0, [tab_idct32_4 + 480] 5661 phaddd m5, m0 5662 5663 phaddd m4, m5 5664 5665 vperm2i128 m0, m3, m4, 0x31 5666 vperm2i128 m3, m3, m4, 0x20 5667 paddd m3, m0 5668 5669 pmaddwd m4, m1, [tab_idct32_1] 5670 pmaddwd m0, m1, [tab_idct32_1 + 32] 5671 phaddd m4, m0 5672 5673 pmaddwd m5, m1, [tab_idct32_1 + 64] 5674 pmaddwd m0, m1, [tab_idct32_1 + 96] 5675 phaddd m5, m0 5676 5677 phaddd m4, m5 5678 5679 pmaddwd m5, m1, [tab_idct32_1 + 128] 5680 pmaddwd m0, m1, [tab_idct32_1 + 160] 5681 phaddd m5, m0 5682 5683 pmaddwd m6, m1, [tab_idct32_1 + 192] 5684 pmaddwd m0, m1, [tab_idct32_1 + 224] 5685 phaddd m6, m0 5686 5687 phaddd m5, m6 5688 5689 vperm2i128 m0, m4, m5, 0x31 5690 vperm2i128 m4, m4, m5, 0x20 5691 paddd m4, m0 5692 5693 pmaddwd m5, m1, [tab_idct32_1 + 256] 5694 pmaddwd m0, m1, [tab_idct32_1 + 288] 5695 phaddd m5, m0 5696 5697 pmaddwd m6, m1, [tab_idct32_1 + 320] 5698 pmaddwd m0, m1, [tab_idct32_1 + 352] 5699 phaddd m6, m0 5700 5701 phaddd m5, m6 5702 5703 pmaddwd m6, m1, [tab_idct32_1 + 384] 5704 pmaddwd m0, m1, [tab_idct32_1 + 416] 5705 phaddd m6, m0 5706 5707 pmaddwd m0, m1, [tab_idct32_1 + 448] 5708 pmaddwd m1, [tab_idct32_1 + 480] 5709 phaddd m0, m1 5710 5711 phaddd m6, m0 5712 5713 vperm2i128 m0, m5, m6, 0x31 5714 vperm2i128 m5, m5, m6, 0x20 5715 paddd m5, m0 5716 5717 paddd m6, m2, m4 5718 paddd m6, m15 5719 psrad m6, IDCT_SHIFT2 5720 5721 psubd m2, m4 5722 paddd m2, m15 5723 psrad m2, IDCT_SHIFT2 5724 5725 paddd m4, m3, m5 5726 paddd m4, m15 5727 psrad m4, IDCT_SHIFT2 5728 5729 psubd m3, m5 5730 paddd m3, m15 5731 psrad m3, IDCT_SHIFT2 5732 5733 packssdw m6, m4 5734 packssdw m2, m3 5735 5736 vpermq m6, m6, 0xD8 5737 vpermq m2, m2, 0x8D 5738 pshufb m2, [dct16_shuf1] 5739 5740 mova [r1], m6 5741 mova [r1 + 32], m2 5742 5743 add r1, r2 5744 add r3, 64 5745 dec r4d 5746 jnz .pass2 5747 RET 5748 5749 5750%macro IDCT32_AVX512_PASS1 5 5751 pmaddwd m9, m8, m%4 5752 pmaddwd m10, m7, m%5 5753 5754 paddd m9, m10 5755 vpsrldq m0, m9, 8 5756 paddd m9, m0 5757 vpsrldq m0, m9, 4 5758 paddd m9, m0 5759 5760 pmaddwd m10, m4, m%4 5761 pmaddwd m11, m1, m%5 5762 5763 paddd m10, m11 5764 vpsrldq m0, m10, 8 5765 paddd m10, m0 5766 vpslldq m0, m10, 4 5767 paddd m10, m0 5768 5769 vmovdqu32 m9 {k3}, m10 5770 5771 mova m6, [tab_idct32_AVX512_5 + %1 * 64] 5772 mova m5, [tab_idct32_AVX512_5 + %1 * 64 + 64] 5773 5774 pmaddwd m10, m8, m6 5775 pmaddwd m11, m7, m5 5776 5777 paddd m10, m11 5778 vpslldq m0, m10, 8 5779 paddd m10, m0 5780 vpsrldq m0, m10, 4 5781 paddd m10, m0 5782 5783 pmaddwd m11, m4, m6 5784 pmaddwd m12, m1, m5 5785 5786 paddd m11, m12 5787 vpslldq m0, m11, 8 5788 paddd m11, m0 5789 vpslldq m0, m11, 4 5790 paddd m11, m0 5791 5792 vmovdqu32 m10 {k4}, m11 5793 vmovdqu32 m9 {k2}, m10 5794 5795 pmaddwd m10, m3, m%2 5796 pmaddwd m11, m14, m%2 5797 5798 vpsrldq m0, m10, 4 5799 paddd m10, m0 5800 vpslldq m5, m11, 4 5801 paddd m11, m5 5802 vmovdqu32 m10 {k1}, m11 5803 5804 vpsrldq m0, m10, 8 5805 paddd m10, m0 5806 5807 pmaddwd m11, m2, m%3 5808 pmaddwd m12, m13, m%3 5809 5810 vpsrldq m0, m11, 4 5811 paddd m11, m0 5812 vpslldq m5, m12, 4 5813 paddd m12, m5 5814 vmovdqu32 m11 {k1}, m12 5815 5816 vpsrldq m0, m11, 8 5817 paddd m11, m0 5818 5819 paddd m12, m10, m11 5820 psubd m10, m11 5821 5822 punpcklqdq m12, m10 5823 paddd m10, m9, m12 5824 paddd m10, m15 5825 psrad m10, IDCT_SHIFT1 5826 5827 psubd m12, m9 5828 paddd m12, m15 5829 psrad m12, IDCT_SHIFT1 5830 5831 packssdw m10, m12 5832 vextracti128 xm12, m10, 1 5833 vextracti64x4 ym5, m10, 1 5834 vextracti128 xm0, ym5, 1 5835 5836 movd [r3 + %1 * 64], xm10 5837 movd [r3 + 32 + %1 * 64], xm12 5838 pextrd [r4 - %1 * 64], xm10, 1 5839 pextrd [r4+ 32 - %1 * 64], xm12, 1 5840 pextrd [r3 + 16 * 64 + %1 *64], xm10, 3 5841 pextrd [r3 + 16 * 64 + 32 + %1 * 64], xm12, 3 5842 pextrd [r4 + 16 * 64 - %1 * 64], xm10, 2 5843 pextrd [r4 + 16 * 64 + 32 - %1 * 64], xm12, 2 5844 5845 movd [r3 + (%1 + 1) * 64], xm5 5846 movd [r3 + 32 + (%1 + 1) * 64], xm0 5847 pextrd [r4 - (%1 + 1) * 64], xm5, 1 5848 pextrd [r4+ 32 - (%1 + 1) * 64], xm0, 1 5849 pextrd [r3 + 16 * 64 + (%1 + 1) * 64], xm5, 3 5850 pextrd [r3 + 16 * 64 + 32 + (%1 + 1) * 64], xm0, 3 5851 pextrd [r4 + 16 * 64 - (%1 + 1) * 64], xm5, 2 5852 pextrd [r4 + 16 * 64 + 32 - (%1 + 1) * 64], xm0, 2 5853%endmacro 5854 5855%macro IDCT32_AVX512_PASS2 0 5856 pmaddwd m2, m0, m7 5857 pmaddwd m3, m0, m8 5858 5859 vpsrldq m24, m2, 4 5860 paddd m2, m24 5861 vpslldq m25, m3, 4 5862 paddd m3, m25 5863 vmovdqu32 m2 {k1}, m3 5864 5865 pmaddwd m3, m0, m9 5866 pmaddwd m4, m0, m10 5867 5868 vpsrldq m24, m3, 4 5869 paddd m3, m24 5870 vpslldq m25, m4, 4 5871 paddd m4, m25 5872 vmovdqu32 m3 {k1}, m4 5873 5874 vpsrldq m24, m2, 8 5875 paddd m2, m24 5876 vpslldq m25, m3, 8 5877 paddd m3, m25 5878 vmovdqu32 m2 {k2}, m3 5879 5880 pmaddwd m3, m0, m11 5881 pmaddwd m4, m0, m12 5882 5883 vpsrldq m24, m3, 4 5884 paddd m3, m24 5885 vpslldq m25, m4, 4 5886 paddd m4, m25 5887 vmovdqu32 m3 {k1}, m4 5888 5889 pmaddwd m4, m0, m13 5890 pmaddwd m5, m0, m14 5891 5892 vpsrldq m24, m4, 4 5893 paddd m4, m24 5894 vpslldq m25, m5, 4 5895 paddd m5, m25 5896 vmovdqu32 m4 {k1}, m5 5897 5898 vpsrldq m24, m3, 8 5899 paddd m3, m24 5900 vpslldq m25, m4, 8 5901 paddd m4, m25 5902 vmovdqu32 m3 {k2}, m4 5903 5904 mova m24, [idct16_AVX512_shuff3] 5905 mova m25, [idct16_AVX512_shuff2] 5906 vpermi2q m24, m2, m3 5907 vpermi2q m25, m2, m3 5908 paddd m2, m25, m24 5909 5910 pmaddwd m3, m0, m16 5911 pmaddwd m4, m0, m17 5912 5913 vpsrldq m24, m3, 4 5914 paddd m3, m24 5915 vpslldq m25, m4, 4 5916 paddd m4, m25 5917 vmovdqu32 m3 {k1}, m4 5918 5919 pmaddwd m4, m0, m18 5920 pmaddwd m5, m0, m19 5921 5922 vpsrldq m24, m4, 4 5923 paddd m4, m24 5924 vpslldq m25, m5, 4 5925 paddd m5, m25 5926 vmovdqu32 m4 {k1}, m5 5927 5928 vpsrldq m24, m3, 8 5929 paddd m3, m24 5930 vpslldq m25, m4, 8 5931 paddd m4, m25 5932 vmovdqu32 m3 {k2}, m4 5933 5934 pmaddwd m4, m0, m20 5935 pmaddwd m5, m0, m21 5936 5937 vpsrldq m24, m4, 4 5938 paddd m4, m24 5939 vpslldq m25, m5, 4 5940 paddd m5, m25 5941 vmovdqu32 m4 {k1}, m5 5942 5943 pmaddwd m5, m0, m22 5944 pmaddwd m0, m23 5945 5946 vpsrldq m24, m5, 4 5947 paddd m5, m24 5948 vpslldq m25, m0, 4 5949 paddd m0, m25 5950 vmovdqu32 m5 {k1}, m0 5951 5952 vpsrldq m24, m4, 8 5953 paddd m4, m24 5954 vpslldq m25, m5, 8 5955 paddd m5, m25 5956 vmovdqu32 m4 {k2}, m5 5957 5958 mova m24, [idct16_AVX512_shuff3] 5959 mova m25, [idct16_AVX512_shuff2] 5960 vpermi2q m24, m3, m4 5961 vpermi2q m25, m3, m4 5962 paddd m3, m25, m24 5963 5964 pmaddwd m4, m1, m26 5965 pmaddwd m0, m1, m27 5966 5967 vpsrldq m24, m4, 4 5968 paddd m4, m24 5969 vpslldq m25, m0, 4 5970 paddd m0, m25 5971 vmovdqu32 m4 {k1}, m0 5972 5973 pmaddwd m5, m1, m28 5974 pmaddwd m0, m1, m29 5975 5976 vpsrldq m24, m5, 4 5977 paddd m5, m24 5978 vpslldq m25, m0, 4 5979 paddd m0, m25 5980 vmovdqu32 m5 {k1}, m0 5981 5982 5983 vpsrldq m24, m4, 8 5984 paddd m4, m24 5985 vpslldq m25, m5, 8 5986 paddd m5, m25 5987 vmovdqu32 m4 {k2}, m5 5988 5989 pmaddwd m5, m1, m30 5990 pmaddwd m0, m1, m31 5991 5992 vpsrldq m24, m5, 4 5993 paddd m5, m24 5994 vpslldq m25, m0, 4 5995 paddd m0, m25 5996 vmovdqu32 m5 {k1}, m0 5997 5998 pmaddwd m6, m1, [tab_idct32_AVX512_4 + 6 * mmsize] 5999 pmaddwd m0, m1, [tab_idct32_AVX512_4 + 7 * mmsize] 6000 6001 vpsrldq m24, m6, 4 6002 paddd m6, m24 6003 vpslldq m25, m0, 4 6004 paddd m0, m25 6005 vmovdqu32 m6 {k1}, m0 6006 6007 vpsrldq m24, m5, 8 6008 paddd m5, m24 6009 vpslldq m25, m6, 8 6010 paddd m6, m25 6011 vmovdqu32 m5 {k2}, m6 6012 6013 mova m24, [idct16_AVX512_shuff3] 6014 mova m25, [idct16_AVX512_shuff2] 6015 vpermi2q m24, m4, m5 6016 vpermi2q m25, m4, m5 6017 paddd m4, m25, m24 6018 6019 pmaddwd m5, m1, [tab_idct32_AVX512_4 + 8 * mmsize] 6020 pmaddwd m0, m1, [tab_idct32_AVX512_4 + 9 * mmsize] 6021 6022 vpsrldq m24, m5, 4 6023 paddd m5, m24 6024 vpslldq m25, m0, 4 6025 paddd m0, m25 6026 vmovdqu32 m5 {k1}, m0 6027 6028 pmaddwd m6, m1, [tab_idct32_AVX512_4 + 10 * mmsize] 6029 pmaddwd m0, m1, [tab_idct32_AVX512_4 + 11 * mmsize] 6030 6031 vpsrldq m24, m6, 4 6032 paddd m6, m24 6033 vpslldq m25, m0, 4 6034 paddd m0, m25 6035 vmovdqu32 m6 {k1}, m0 6036 6037 vpsrldq m24, m5, 8 6038 paddd m5, m24 6039 vpslldq m25, m6, 8 6040 paddd m6, m25 6041 vmovdqu32 m5 {k2}, m6 6042 6043 pmaddwd m6, m1, [tab_idct32_AVX512_4 + 12 * mmsize] 6044 pmaddwd m0, m1, [tab_idct32_AVX512_4 + 13 * mmsize] 6045 6046 vpsrldq m24, m6, 4 6047 paddd m6, m24 6048 vpslldq m25, m0, 4 6049 paddd m0, m25 6050 vmovdqu32 m6 {k1}, m0 6051 6052 pmaddwd m0, m1, [tab_idct32_AVX512_4 + 14 * mmsize] 6053 pmaddwd m1, [tab_idct32_AVX512_4 + 15 * mmsize] 6054 6055 vpsrldq m24, m0, 4 6056 paddd m0, m24 6057 vpslldq m25, m1, 4 6058 paddd m1, m25 6059 vmovdqu32 m0 {k1}, m1 6060 6061 vpsrldq m24, m6, 8 6062 paddd m6, m24 6063 vpslldq m25, m0, 8 6064 paddd m0, m25 6065 vmovdqu32 m6 {k2}, m0 6066 6067 mova m24, [idct16_AVX512_shuff3] 6068 mova m25, [idct16_AVX512_shuff2] 6069 vpermi2q m24, m5, m6 6070 vpermi2q m25, m5, m6 6071 paddd m5, m25, m24 6072 6073 paddd m6, m2, m4 6074 paddd m6, m15 6075 psrad m6, IDCT_SHIFT2 6076 6077 psubd m2, m4 6078 paddd m2, m15 6079 psrad m2, IDCT_SHIFT2 6080 6081 paddd m4, m3, m5 6082 paddd m4, m15 6083 psrad m4, IDCT_SHIFT2 6084 6085 psubd m3, m5 6086 paddd m3, m15 6087 psrad m3, IDCT_SHIFT2 6088 6089 packssdw m6, m4 6090 packssdw m2, m3 6091 6092 vpermq m6, m6, 0xD8 6093 vpermq m2, m2, 0x8D 6094 pshufb m2, [idct16_AVX512_shuff6] 6095%endmacro 6096 6097;------------------------------------------------------------------- 6098; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride) 6099;------------------------------------------------------------------- 6100 6101INIT_ZMM avx512 6102cglobal idct32, 3, 8, 32, 0-32*64 6103 6104%define IDCT_SHIFT1 7 6105 6106 vbroadcasti128 m15, [pd_64] 6107 6108 mov r3, rsp 6109 lea r4, [r3 + 15 * 64] 6110 mov r5d, 8 6111 mov r7d, 0xAAAA 6112 kmovd k1, r7d 6113 mov r7d, 0xCCCC 6114 kmovd k2, r7d 6115 mov r7d, 0x2222 6116 kmovd k3, r7d 6117 mov r7d, 0x8888 6118 kmovd k4, r7d 6119 6120 6121 mova m16, [tab_idct32_AVX512_2 + 0 * 64] 6122 mova m17, [tab_idct32_AVX512_2 + 1 * 64] 6123 mova m18, [tab_idct32_AVX512_2 + 2 * 64] 6124 mova m19, [tab_idct32_AVX512_2 + 3 * 64] 6125 6126 mova m20, [tab_idct32_AVX512_3 + 0 * 64] 6127 mova m21, [tab_idct32_AVX512_3 + 1 * 64] 6128 mova m22, [tab_idct32_AVX512_3 + 2 * 64] 6129 mova m23, [tab_idct32_AVX512_3 + 3 * 64] 6130 6131 mova m24, [tab_idct32_AVX512_1 + 0 * 64] 6132 mova m25, [tab_idct32_AVX512_1 + 1 * 64] 6133 mova m26, [tab_idct32_AVX512_1 + 2 * 64] 6134 mova m27, [tab_idct32_AVX512_1 + 3 * 64] 6135 mova m28, [tab_idct32_AVX512_1 + 4 * 64] 6136 mova m29, [tab_idct32_AVX512_1 + 5 * 64] 6137 mova m30, [tab_idct32_AVX512_1 + 6 * 64] 6138 mova m31, [tab_idct32_AVX512_1 + 7 * 64] 6139 6140.pass1: 6141 movq xm0, [r0 + 2 * 64] 6142 movq xm1, [r0 + 18 * 64] 6143 punpcklqdq xm0, xm0, xm1 6144 movq xm1, [r0 + 0 * 64] 6145 movq xm2, [r0 + 16 * 64] 6146 punpcklqdq xm1, xm1, xm2 6147 vinserti128 ym0, ym0, xm1, 1 ;[2 18 0 16] 6148 6149 movq xm1, [r0 + 1 * 64] 6150 movq xm2, [r0 + 9 * 64] 6151 punpcklqdq xm1, xm1, xm2 6152 movq xm2, [r0 + 17 * 64] 6153 movq xm3, [r0 + 25 * 64] 6154 punpcklqdq xm2, xm2, xm3 6155 vinserti128 ym1, ym1, xm2, 1 ;[1 9 17 25] 6156 6157 movq xm2, [r0 + 6 * 64] 6158 movq xm3, [r0 + 22 * 64] 6159 punpcklqdq xm2, xm2, xm3 6160 movq xm3, [r0 + 4 * 64] 6161 movq xm4, [r0 + 20 * 64] 6162 punpcklqdq xm3, xm3, xm4 6163 vinserti128 ym2, ym2, xm3, 1 ;[6 22 4 20] 6164 6165 movq xm3, [r0 + 3 * 64] 6166 movq xm4, [r0 + 11 * 64] 6167 punpcklqdq xm3, xm3, xm4 6168 movq xm4, [r0 + 19 * 64] 6169 movq xm5, [r0 + 27 * 64] 6170 punpcklqdq xm4, xm4, xm5 6171 vinserti128 ym3, ym3, xm4, 1 ;[3 11 17 25] 6172 6173 movq xm4, [r0 + 10 * 64] 6174 movq xm5, [r0 + 26 * 64] 6175 punpcklqdq xm4, xm4, xm5 6176 movq xm5, [r0 + 8 * 64] 6177 movq xm6, [r0 + 24 * 64] 6178 punpcklqdq xm5, xm5, xm6 6179 vinserti128 ym4, ym4, xm5, 1 ;[10 26 8 24] 6180 6181 movq xm5, [r0 + 5 * 64] 6182 movq xm6, [r0 + 13 * 64] 6183 punpcklqdq xm5, xm5, xm6 6184 movq xm6, [r0 + 21 * 64] 6185 movq xm7, [r0 + 29 * 64] 6186 punpcklqdq xm6, xm6, xm7 6187 vinserti128 ym5, ym5, xm6, 1 ;[5 13 21 9] 6188 6189 movq xm6, [r0 + 14 * 64] 6190 movq xm7, [r0 + 30 * 64] 6191 punpcklqdq xm6, xm6, xm7 6192 movq xm7, [r0 + 12 * 64] 6193 movq xm8, [r0 + 28 * 64] 6194 punpcklqdq xm7, xm7, xm8 6195 vinserti128 ym6, ym6, xm7, 1 ;[14 30 12 28] 6196 6197 movq xm7, [r0 + 7 * 64] 6198 movq xm8, [r0 + 15 * 64] 6199 punpcklqdq xm7, xm7, xm8 6200 movq xm8, [r0 + 23 * 64] 6201 movq xm9, [r0 + 31 * 64] 6202 punpcklqdq xm8, xm8, xm9 6203 vinserti128 ym7, ym7, xm8, 1 ;[7 15 23 31] 6204 6205 punpckhwd ym8, ym0, ym2 ;[18 22 16 20] 6206 punpcklwd ym0, ym2 ;[2 6 0 4] 6207 6208 punpckhwd ym2, ym1, ym3 ;[9 11 25 27] 6209 punpcklwd ym1, ym3 ;[1 3 17 19] 6210 6211 punpckhwd ym3, ym4, ym6 ;[26 30 24 28] 6212 punpcklwd ym4, ym6 ;[10 14 8 12] 6213 6214 punpckhwd ym6, ym5, ym7 ;[13 15 29 31] 6215 punpcklwd ym5, ym7 ;[5 7 21 23] 6216 6217 punpckhdq ym7, ym0, ym4 ;[22 62 102 142 23 63 103 143 02 42 82 122 03 43 83 123] 6218 punpckldq ym0, ym4 ;[20 60 100 140 21 61 101 141 00 40 80 120 01 41 81 121] 6219 6220 punpckhdq ym4, ym8, ym3 ;[182 222 262 302 183 223 263 303 162 202 242 282 163 203 243 283] 6221 punpckldq ym8, ym3 ;[180 220 260 300 181 221 261 301 160 200 240 280 161 201 241 281] 6222 6223 punpckhdq ym3, ym1, ym5 ;[12 32 52 72 13 33 53 73 172 192 212 232 173 193 213 233] 6224 punpckldq ym1, ym5 ;[10 30 50 70 11 31 51 71 170 190 210 230 171 191 211 231] 6225 6226 punpckhdq ym5, ym2, ym6 ;[92 112 132 152 93 113 133 153 252 272 292 312 253 273 293 313] 6227 punpckldq ym2, ym6 ;[90 110 130 150 91 111 131 151 250 270 290 310 251 271 291 311] 6228 6229 punpckhqdq ym6, ym0, ym8 ;[21 61 101 141 181 221 261 301 01 41 81 121 161 201 241 281] 6230 punpcklqdq ym0, ym8 ;[20 60 100 140 180 220 260 300 00 40 80 120 160 200 240 280] 6231 6232 punpckhqdq ym8, ym7, ym4 ;[23 63 103 143 183 223 263 303 03 43 83 123 163 203 243 283] 6233 punpcklqdq ym7, ym4 ;[22 62 102 142 182 222 262 302 02 42 82 122 162 202 242 282] 6234 6235 punpckhqdq ym4, ym1, ym2 ;[11 31 51 71 91 111 131 151 171 191 211 231 251 271 291 311] 6236 punpcklqdq ym1, ym2 ;[10 30 50 70 90 110 130 150 170 190 210 230 250 270 290 310] 6237 6238 punpckhqdq ym2, ym3, ym5 ;[13 33 53 73 93 113 133 153 173 193 213 233 253 273 293 313] 6239 punpcklqdq ym3, ym5 ;[12 32 52 72 92 112 132 152 172 192 212 232 252 272 292 312] 6240 6241 vinserti64x4 m7, m7, ym7, 1 6242 vinserti64x4 m8, m8, ym8, 1 6243 movu m13, [idct16_AVX512_shuff2] 6244 movu m14, [idct16_AVX512_shuff3] 6245 vpermi2q m13, m7, m8 6246 vpermi2q m14, m7, m8 6247 6248 vinserti64x4 m1, m1, ym1, 1 6249 vinserti64x4 m4, m4, ym4, 1 6250 movu m7, [idct16_AVX512_shuff3] 6251 movu m8, [idct16_AVX512_shuff2] 6252 vpermi2q m7, m1, m4 6253 vpermi2q m8, m1, m4 6254 6255 vinserti64x4 m3, m3, ym3, 1 6256 vinserti64x4 m2, m2, ym2, 1 6257 movu m1, [idct16_AVX512_shuff3] 6258 movu m4, [idct16_AVX512_shuff2] 6259 vpermi2q m1, m3, m2 6260 vpermi2q m4, m3, m2 6261 6262 vinserti64x4 m0, m0, ym0, 1 6263 vinserti64x4 m6, m6, ym6, 1 6264 movu m2, [idct16_AVX512_shuff2] 6265 movu m3, [idct16_AVX512_shuff3] 6266 vpermi2q m2, m0, m6 6267 vpermi2q m3, m0, m6 6268 6269 6270 IDCT32_AVX512_PASS1 0, 16, 20, 24, 25 6271 IDCT32_AVX512_PASS1 2, 17, 21, 26, 27 6272 IDCT32_AVX512_PASS1 4, 18, 22, 28, 29 6273 IDCT32_AVX512_PASS1 6, 19, 23, 30, 31 6274 6275 add r0, 8 6276 add r3, 4 6277 add r4, 4 6278 dec r5d 6279 jnz .pass1 6280 6281%if BIT_DEPTH == 12 6282 %define IDCT_SHIFT2 8 6283 vpbroadcastd m15, [pd_128] 6284%elif BIT_DEPTH == 10 6285 %define IDCT_SHIFT2 10 6286 vpbroadcastd m15, [pd_512] 6287%elif BIT_DEPTH == 8 6288 %define IDCT_SHIFT2 12 6289 vpbroadcastd m15, [pd_2048] 6290%else 6291 %error Unsupported BIT_DEPTH! 6292%endif 6293 6294 mov r3, rsp 6295 add r2d, r2d 6296 mov r4d, 16 6297 mov r6d, 0xFFFF0000 6298 kmovd k3, r6d 6299 6300 mova m7, [tab_idct32_AVX512_6] 6301 mova m8, [tab_idct32_AVX512_6 + 1 * mmsize] 6302 mova m9, [tab_idct32_AVX512_6 + 2 * mmsize] 6303 mova m10, [tab_idct32_AVX512_6 + 3 * mmsize] 6304 mova m11, [tab_idct32_AVX512_6 + 4 * mmsize] 6305 mova m12, [tab_idct32_AVX512_6 + 5 * mmsize] 6306 mova m13, [tab_idct32_AVX512_6 + 6 * mmsize] 6307 mova m14, [tab_idct32_AVX512_6 + 7 * mmsize] 6308 mova m16, [tab_idct32_AVX512_6 + 8 * mmsize] 6309 mova m17, [tab_idct32_AVX512_6 + 9 * mmsize] 6310 mova m18, [tab_idct32_AVX512_6 + 10 * mmsize] 6311 mova m19, [tab_idct32_AVX512_6 + 11 * mmsize] 6312 mova m20, [tab_idct32_AVX512_6 + 12 * mmsize] 6313 mova m21, [tab_idct32_AVX512_6 + 13 * mmsize] 6314 mova m22, [tab_idct32_AVX512_6 + 14 * mmsize] 6315 mova m23, [tab_idct32_AVX512_6 + 15 * mmsize] 6316 mova m26, [tab_idct32_AVX512_4] 6317 mova m27, [tab_idct32_AVX512_4 + 1 * mmsize] 6318 mova m28, [tab_idct32_AVX512_4 + 2 * mmsize] 6319 mova m29, [tab_idct32_AVX512_4 + 3 * mmsize] 6320 mova m30, [tab_idct32_AVX512_4 + 4 * mmsize] 6321 mova m31, [tab_idct32_AVX512_4 + 5 * mmsize] 6322 6323.pass2: 6324 movu ym0, [r3] 6325 movu ym1, [r3 + 32] 6326 vmovdqu16 m0 {k3}, [r3 + 32] 6327 vmovdqu16 m1 {k3}, [r3 + 64] 6328 6329 IDCT32_AVX512_PASS2 6330 movu [r1], ym6 6331 movu [r1 + 32], ym2 6332 vextracti64x4 ym24, m6, 1 6333 vextracti64x4 ym25, m2, 1 6334 add r1, r2 6335 movu [r1 ], ym24 6336 movu [r1 + 32], ym25 6337 6338 add r1, r2 6339 add r3, 128 6340 dec r4d 6341 jnz .pass2 6342 RET 6343 6344;------------------------------------------------------- 6345; void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride) 6346;------------------------------------------------------- 6347INIT_YMM avx2 6348cglobal idct4, 3, 4, 6 6349 6350%define IDCT_SHIFT1 7 6351%if BIT_DEPTH == 12 6352 %define IDCT_SHIFT2 8 6353 vpbroadcastd m5, [pd_128] 6354%elif BIT_DEPTH == 10 6355 %define IDCT_SHIFT2 10 6356 vpbroadcastd m5, [pd_512] 6357%elif BIT_DEPTH == 8 6358 %define IDCT_SHIFT2 12 6359 vpbroadcastd m5, [pd_2048] 6360%else 6361 %error Unsupported BIT_DEPTH! 6362%endif 6363 vbroadcasti128 m4, [pd_64] 6364 6365 add r2d, r2d 6366 lea r3, [r2 * 3] 6367 6368 movu m0, [r0] ;[00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33] 6369 6370 pshufb m0, [idct4_shuf1] ;[00 02 01 03 10 12 11 13 20 22 21 23 30 32 31 33] 6371 vextracti128 xm1, m0, 1 ;[20 22 21 23 30 32 31 33] 6372 punpcklwd xm2, xm0, xm1 ;[00 20 02 22 01 21 03 23] 6373 punpckhwd xm0, xm1 ;[10 30 12 32 11 31 13 33] 6374 vinserti128 m2, m2, xm2, 1 ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23] 6375 vinserti128 m0, m0, xm0, 1 ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33] 6376 6377 mova m1, [avx2_idct4_1] 6378 mova m3, [avx2_idct4_1 + 32] 6379 pmaddwd m1, m2 6380 pmaddwd m3, m0 6381 6382 paddd m0, m1, m3 6383 paddd m0, m4 6384 psrad m0, IDCT_SHIFT1 ;[00 20 10 30 01 21 11 31] 6385 6386 psubd m1, m3 6387 paddd m1, m4 6388 psrad m1, IDCT_SHIFT1 ;[03 23 13 33 02 22 12 32] 6389 6390 packssdw m0, m1 ;[00 20 10 30 03 23 13 33 01 21 11 31 02 22 12 32] 6391 vmovshdup m1, m0 ;[10 30 10 30 13 33 13 33 11 31 11 31 12 32 12 32] 6392 vmovsldup m0, m0 ;[00 20 00 20 03 23 03 23 01 21 01 21 02 22 02 22] 6393 6394 vpbroadcastq m2, [avx2_idct4_2] 6395 vpbroadcastq m3, [avx2_idct4_2 + 8] 6396 pmaddwd m0, m2 6397 pmaddwd m1, m3 6398 6399 paddd m2, m0, m1 6400 paddd m2, m5 6401 psrad m2, IDCT_SHIFT2 ;[00 01 10 11 30 31 20 21] 6402 6403 psubd m0, m1 6404 paddd m0, m5 6405 psrad m0, IDCT_SHIFT2 ;[03 02 13 12 33 32 23 22] 6406 6407 pshufb m0, [idct4_shuf2] ;[02 03 12 13 32 33 22 23] 6408 punpcklqdq m1, m2, m0 ;[00 01 02 03 10 11 12 13] 6409 punpckhqdq m2, m0 ;[30 31 32 33 20 21 22 23] 6410 packssdw m1, m2 ;[00 01 02 03 30 31 32 33 10 11 12 13 20 21 22 23] 6411 vextracti128 xm0, m1, 1 6412 6413 movq [r1], xm1 6414 movq [r1 + r2], xm0 6415 movhps [r1 + 2 * r2], xm0 6416 movhps [r1 + r3], xm1 6417 RET 6418 6419;static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos) 6420;{ 6421; const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ 6422; const int scaleBits = SCALE_BITS - 2 * transformShift; 6423; const uint32_t trSize = 1 << log2TrSize; 6424 6425; for (int y = 0; y < MLS_CG_SIZE; y++) 6426; { 6427; for (int x = 0; x < MLS_CG_SIZE; x++) 6428; { 6429; int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ 6430; costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits)); 6431; *totalUncodedCost += costUncoded[blkPos + x]; 6432; *totalRdCost += costUncoded[blkPos + x]; 6433; } 6434; blkPos += trSize; 6435; } 6436;} 6437 6438;--------------------------------------------------------------------------------------------------------------------------------------------------------- 6439; void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos) 6440;--------------------------------------------------------------------------------------------------------------------------------------------------------- 6441INIT_ZMM avx512 6442cglobal nonPsyRdoQuant4, 5, 5, 8 6443 mov r4d, r4m 6444 lea r0, [r0 + 2 * r4] 6445 lea r4, [4 * r4] 6446 lea r1, [r1 + 2 * r4] 6447%if BIT_DEPTH == 12 6448 mov r4, [tab_nonpsyRdo12] 6449%elif BIT_DEPTH == 10 6450 mov r4, [tab_nonpsyRdo10] 6451%elif BIT_DEPTH == 8 6452 mov r4, [tab_nonpsyRdo8] 6453%else 6454 %error Unsupported BIT_DEPTH! 6455 %endif 6456 movq xm3, r4 6457 movq xm6, [r2] 6458 movq xm7, [r3] 6459 vpxor m4, m4 6460 vpxor m5, m5 6461;Row 1, 2 6462 movu xm0, [r0] 6463 vpmovsxwq m1, xm0 6464 vcvtqq2pd m2, m1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements 6465 vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements 6466 vcvtpd2qq m1, m2 6467 vpsllq m1, xm3 ; costUncoded 6468 paddq m4, m1 6469 movu [r1], m1 6470 ;Row 3, 4 6471 movu xm0, [r0 + 16] 6472 vpmovsxwq m1, xm0 6473 vcvtqq2pd m2, m1 6474 vfmadd213pd m2, m2, m5 6475 vcvtpd2qq m1, m2 6476 vpsllq m1, xm3 ; costUncoded 6477 paddq m4, m1 6478 movu [r1 + 64], m1 6479 vextracti32x8 ym2, m4, 1 6480 paddq ym4, ym2 6481 vextracti32x4 xm2, m4, 1 6482 paddq xm4, xm2 6483 punpckhqdq xm2, xm4, xm5 6484 paddq xm4, xm2 6485 6486 paddq xm6, xm4 6487 paddq xm7, xm4 6488 6489 movq [r2], xm6 6490 movq [r3], xm7 6491 RET 6492INIT_ZMM avx512 6493cglobal nonPsyRdoQuant8, 5, 5, 8 6494 mov r4d, r4m 6495 lea r0, [r0 + 2 * r4] 6496 lea r4, [4 * r4] 6497 lea r1, [r1 + 2 * r4] 6498%if BIT_DEPTH == 12 6499 mov r4, [tab_nonpsyRdo12 + 8] 6500%elif BIT_DEPTH == 10 6501 mov r4, [tab_nonpsyRdo10 + 8] 6502%elif BIT_DEPTH == 8 6503 mov r4, [tab_nonpsyRdo8 + 8] 6504%else 6505 %error Unsupported BIT_DEPTH! 6506 %endif 6507 movq xm3, r4 6508 movq xm6, [r2] 6509 movq xm7, [r3] 6510 vpxor m4, m4 6511 vpxor m5, m5 6512 6513;Row 1, 2 6514 movq xm0, [r0] 6515 pinsrq xm0, [r0 + mmsize/4], 1 6516 vpmovsxwq m1, xm0 6517 vcvtqq2pd m2, m1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements 6518 vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements 6519 vcvtpd2qq m1, m2 6520 vpsllq m1, xm3 ; costUncoded 6521 paddq m4, m1 6522 movu [r1], ym1 6523 vextracti32x8 [r1 + mmsize], m1 , 1 6524 6525 ;Row 3, 4 6526 movq xm0, [r0 + mmsize/2] 6527 pinsrq xm0, [r0 + 3 * mmsize/4], 1 6528 vpmovsxwq m1, xm0 6529 vcvtqq2pd m2, m1 6530 vfmadd213pd m2, m2, m5 6531 vcvtpd2qq m1, m2 6532 vpsllq m1, xm3 ; costUncoded 6533 paddq m4, m1 6534 movu [r1 + 2 * mmsize], ym1 6535 vextracti32x8 [r1 + 3 * mmsize], m1 , 1 6536 6537 vextracti32x8 ym2, m4, 1 6538 paddq ym4, ym2 6539 vextracti32x4 xm2, m4, 1 6540 paddq xm4, xm2 6541 punpckhqdq xm2, xm4, xm5 6542 paddq xm4, xm2 6543 6544 paddq xm6, xm4 6545 paddq xm7, xm4 6546 6547 movq [r2], xm6 6548 movq [r3], xm7 6549 RET 6550INIT_ZMM avx512 6551cglobal nonPsyRdoQuant16, 5, 5, 8 6552 mov r4d, r4m 6553 lea r0, [r0 + 2 * r4] 6554 lea r4, [4 * r4] 6555 lea r1, [r1 + 2 * r4] 6556%if BIT_DEPTH == 12 6557 mov r4, [tab_nonpsyRdo12 + 16] 6558%elif BIT_DEPTH == 10 6559 mov r4, [tab_nonpsyRdo10 + 16] 6560%elif BIT_DEPTH == 8 6561 mov r4, [tab_nonpsyRdo8 + 16] 6562%else 6563 %error Unsupported BIT_DEPTH! 6564 %endif 6565 movq xm3, r4 6566 movq xm6, [r2] 6567 movq xm7, [r3] 6568 vpxor m4, m4 6569 vpxor m5, m5 6570 6571;Row 1, 2 6572 movq xm0, [r0] 6573 pinsrq xm0, [r0 + mmsize/2], 1 6574 vpmovsxwq m1, xm0 6575 vcvtqq2pd m2, m1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements 6576 vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements 6577 vcvtpd2qq m1, m2 6578 vpsllq m1, xm3 ; costUncoded 6579 paddq m4, m1 6580 movu [r1], ym1 6581 vextracti32x8 [r1 + 2 * mmsize], m1, 1 6582 6583 ;Row 3, 4 6584 movq xm0, [r0 + mmsize] 6585 pinsrq xm0, [r0 + 3 * mmsize/2], 1 6586 vpmovsxwq m1, xm0 6587 vcvtqq2pd m2, m1 6588 vfmadd213pd m2, m2, m5 6589 vcvtpd2qq m1, m2 6590 vpsllq m1, xm3 ; costUncoded 6591 paddq m4, m1 6592 movu [r1 + 4 * mmsize], ym1 6593 vextracti32x8 [r1 + 6 * mmsize], m1 , 1 6594 6595 vextracti32x8 ym2, m4, 1 6596 paddq ym4, ym2 6597 vextracti32x4 xm2, m4, 1 6598 paddq xm4, xm2 6599 punpckhqdq xm2, xm4, xm5 6600 paddq xm4, xm2 6601 6602 paddq xm6, xm4 6603 paddq xm7, xm4 6604 6605 movq [r2], xm6 6606 movq [r3], xm7 6607 RET 6608INIT_ZMM avx512 6609cglobal nonPsyRdoQuant32, 5, 5, 8 6610 mov r4d, r4m 6611 lea r0, [r0 + 2 * r4] 6612 lea r4, [4 * r4] 6613 lea r1, [r1 + 2 * r4] 6614%if BIT_DEPTH == 12 6615 mov r4, [tab_nonpsyRdo12 + 24] 6616%elif BIT_DEPTH == 10 6617 mov r4, [tab_nonpsyRdo10 + 24] 6618%elif BIT_DEPTH == 8 6619 mov r4, [tab_nonpsyRdo8 + 24] 6620%else 6621 %error Unsupported BIT_DEPTH! 6622 %endif 6623 movq xm3, r4 6624 movq xm6, [r2] 6625 movq xm7, [r3] 6626 vpxor m4, m4 6627 vpxor m5, m5 6628 6629;Row 1, 2 6630 movq xm0, [r0] 6631 pinsrq xm0, [r0 + mmsize], 1 6632 vpmovsxwq m1, xm0 6633 vcvtqq2pd m2, m1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements 6634 vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements 6635 vcvtpd2qq m1, m2 6636 vpsllq m1, xm3 ; costUncoded 6637 paddq m4, m1 6638 movu [r1], ym1 6639 vextracti32x8 [r1 + 4 * mmsize], m1, 1 6640 6641 ;Row 3, 4 6642 movq xm0, [r0 + 2 * mmsize] 6643 pinsrq xm0, [r0 + 3 * mmsize], 1 6644 vpmovsxwq m1, xm0 6645 vcvtqq2pd m2, m1 6646 vfmadd213pd m2, m2, m5 6647 vcvtpd2qq m1, m2 6648 vpsllq m1, xm3 ; costUncoded 6649 paddq m4, m1 6650 movu [r1 + 8 * mmsize], ym1 6651 vextracti32x8 [r1 + 12 * mmsize], m1 , 1 6652 6653 vextracti32x8 ym2, m4, 1 6654 paddq ym4, ym2 6655 vextracti32x4 xm2, m4, 1 6656 paddq xm4, xm2 6657 punpckhqdq xm2, xm4, xm5 6658 paddq xm4, xm2 6659 6660 paddq xm6, xm4 6661 paddq xm7, xm4 6662 6663 movq [r2], xm6 6664 movq [r3], xm7 6665 RET 6666;static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t psyScale, uint32_t blkPos) 6667;{ 6668; const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ 6669; const int scaleBits = SCALE_BITS - 2 * transformShift; 6670; const uint32_t trSize = 1 << log2TrSize; 6671; int max = X265_MAX(0, (2 * transformShift + 1)); 6672; 6673; for (int y = 0; y < MLS_CG_SIZE; y++) 6674; { 6675; for (int x = 0; x < MLS_CG_SIZE; x++) 6676; { 6677; int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */ 6678; int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/ 6679; 6680; costUncoded[blkPos + x] = static_cast<int64_t>((double)(signCoef * signCoef) << scaleBits); 6681; 6682; /* when no residual coefficient is coded, predicted coef == recon coef */ 6683; costUncoded[blkPos + x] -= static_cast<int64_t>((psyScale * (predictedCoef)) >> max); 6684; 6685; *totalUncodedCost += costUncoded[blkPos + x]; 6686; *totalRdCost += costUncoded[blkPos + x]; 6687; } 6688; blkPos += trSize; 6689; } 6690;} 6691 6692;--------------------------------------------------------------------------------------------------------------------------------------------------------- 6693; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos) 6694;--------------------------------------------------------------------------------------------------------------------------------------------------------- 6695INIT_ZMM avx512 6696cglobal psyRdoQuant4, 5, 9, 13 6697%if WIN64 6698 mov r5, r5m 6699%endif 6700 mov r6d, r6m 6701 vpbroadcastq m12, [r5] ; psyScale 6702 lea r0, [r0 + 2 * r6] 6703 lea r1, [r1 + 2 * r6] 6704 lea r6, [4 * r6] 6705 lea r2, [r2 + 2 * r6] 6706 movq xm0, [r3] 6707 movq xm1, [r4] 6708 6709%if BIT_DEPTH == 12 6710 mov r5, [tab_nonpsyRdo12] ; scaleBits 6711%elif BIT_DEPTH == 10 6712 mov r5, [tab_nonpsyRdo10] 6713%elif BIT_DEPTH == 8 6714 mov r5, [tab_nonpsyRdo8] 6715%else 6716 %error Unsupported BIT_DEPTH! 6717%endif 6718 6719 movq xm2, r5 6720 vpxor m4, m4 6721 vpxor m3, m3 6722 6723;Row 1, 2 6724 vpmovsxwq m6, [r0] 6725 vpmovsxwq m7, [r1] 6726 psubq m7, m6 ; predictedCoef 6727 6728 vcvtqq2pd m9, m6 6729 vfmadd213pd m9, m9, m3 6730 vcvtpd2qq m8, m9 6731 vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits 6732 6733 vcvtqq2pd m10, m7 6734 vcvtqq2pd m11, m12 6735 vfmadd213pd m10, m11, m3 6736 vcvtpd2qq m9, m10 6737 vpsraq m9, RDO_MAX_4 ;(psyScale * predictedCoef) >> max 6738 6739 psubq m8, m9 6740 paddq m4, m8 6741 movu [r2], m8 6742 6743 ;Row 3, 4 6744 vpmovsxwq m6, [r0 + 16] 6745 vpmovsxwq m7, [r1 + 16] 6746 psubq m7, m6 ; predictedCoef 6747 6748 vcvtqq2pd m9, m6 6749 vfmadd213pd m9, m9, m3 6750 vcvtpd2qq m8, m9 6751 vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits 6752 6753 vcvtqq2pd m10, m7 6754 vcvtqq2pd m11, m12 6755 vfmadd213pd m10, m11, m3 6756 vcvtpd2qq m9, m10 6757 vpsraq m9, RDO_MAX_4 ;(psyScale * predictedCoef) >> max 6758 6759 psubq m8, m9 6760 paddq m4, m8 6761 movu [r2 + 64], m8 6762 6763 vextracti32x8 ym2, m4, 1 6764 paddq ym4, ym2 6765 vextracti32x4 xm2, m4, 1 6766 paddq xm4, xm2 6767 punpckhqdq xm2, xm4, xm3 6768 paddq xm4, xm2 6769 6770 paddq xm0, xm4 6771 paddq xm1, xm4 6772 6773 movq [r3], xm0 6774 movq [r4], xm1 6775 RET 6776 6777;--------------------------------------------------------------------------------------------------------------------------------------------------------- 6778; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos) 6779;--------------------------------------------------------------------------------------------------------------------------------------------------------- 6780INIT_ZMM avx512 6781cglobal psyRdoQuant8, 5, 9, 15 6782%if WIN64 6783 mov r5, r5m 6784%endif 6785 mov r6d, r6m 6786 vpbroadcastq m12, [r5] ; psyScale 6787 lea r0, [r0 + 2 * r6] 6788 lea r1, [r1 + 2 * r6] 6789 lea r6, [4 * r6] 6790 lea r2, [r2 + 2 * r6] 6791 movq xm0, [r3] 6792 movq xm1, [r4] 6793 6794%if BIT_DEPTH == 12 6795 mov r5, [tab_nonpsyRdo12 + 8] ; scaleBits 6796%elif BIT_DEPTH == 10 6797 mov r5, [tab_nonpsyRdo10 + 8] 6798%elif BIT_DEPTH == 8 6799 mov r5, [tab_nonpsyRdo8 + 8] 6800%else 6801 %error Unsupported BIT_DEPTH! 6802%endif 6803 6804 movq xm2, r5 6805 vpxor m4, m4 6806 vpxor m3, m3 6807 6808;Row 1, 2 6809 movq xm13, [r0] 6810 movq xm14, [r1] 6811 pinsrq xm13, [r0 + mmsize/4], 1 6812 pinsrq xm14, [r1 + mmsize/4], 1 6813 vpmovsxwq m6, xm13 6814 vpmovsxwq m7, xm14 6815 psubq m7, m6 ; predictedCoef 6816 6817 vcvtqq2pd m9, m6 6818 vfmadd213pd m9, m9, m3 6819 vcvtpd2qq m8, m9 6820 vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits 6821 6822 vcvtqq2pd m10, m7 6823 vcvtqq2pd m11, m12 6824 vfmadd213pd m10, m11, m3 6825 vcvtpd2qq m9, m10 6826 vpsraq m9, RDO_MAX_8 ;(psyScale * predictedCoef) >> max 6827 6828 psubq m8, m9 6829 paddq m4, m8 6830 movu [r2], ym8 6831 vextracti32x8 [r2 + mmsize], m8 , 1 6832 6833 ;Row 3, 4 6834 movq xm13, [r0 + mmsize/2] 6835 movq xm14, [r1 + mmsize/2] 6836 pinsrq xm13, [r0 + 3 * mmsize/4], 1 6837 pinsrq xm14, [r1 + 3 * mmsize/4], 1 6838 vpmovsxwq m6, xm13 6839 vpmovsxwq m7, xm14 6840 psubq m7, m6 ; predictedCoef 6841 6842 vcvtqq2pd m9, m6 6843 vfmadd213pd m9, m9, m3 6844 vcvtpd2qq m8, m9 6845 vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits 6846 6847 vcvtqq2pd m10, m7 6848 vcvtqq2pd m11, m12 6849 vfmadd213pd m10, m11, m3 6850 vcvtpd2qq m9, m10 6851 vpsraq m9, RDO_MAX_8 ;(psyScale * predictedCoef) >> max 6852 6853 psubq m8, m9 6854 paddq m4, m8 6855 movu [r2 + 2 * mmsize], ym8 6856 vextracti32x8 [r2 + 3 * mmsize], m8 , 1 6857 6858 vextracti32x8 ym2, m4, 1 6859 paddq ym4, ym2 6860 vextracti32x4 xm2, m4, 1 6861 paddq xm4, xm2 6862 punpckhqdq xm2, xm4, xm3 6863 paddq xm4, xm2 6864 6865 paddq xm0, xm4 6866 paddq xm1, xm4 6867 6868 movq [r3], xm0 6869 movq [r4], xm1 6870 RET 6871 6872;--------------------------------------------------------------------------------------------------------------------------------------------------------- 6873; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos) 6874;--------------------------------------------------------------------------------------------------------------------------------------------------------- 6875INIT_ZMM avx512 6876cglobal psyRdoQuant16, 5, 9, 15 6877%if WIN64 6878 mov r5, r5m 6879%endif 6880 mov r6d, r6m 6881 vpbroadcastq m12, [r5] ; psyScale 6882 lea r0, [r0 + 2 * r6] 6883 lea r1, [r1 + 2 * r6] 6884 lea r6, [4 * r6] 6885 lea r2, [r2 + 2 * r6] 6886 movq xm0, [r3] 6887 movq xm1, [r4] 6888 6889%if BIT_DEPTH == 12 6890 mov r5, [tab_nonpsyRdo12 + 16] ; scaleBits 6891%elif BIT_DEPTH == 10 6892 mov r5, [tab_nonpsyRdo10 + 16] 6893%elif BIT_DEPTH == 8 6894 mov r5, [tab_nonpsyRdo8 + 16] 6895%else 6896 %error Unsupported BIT_DEPTH! 6897%endif 6898 6899 movq xm2, r5 6900 vpxor m4, m4 6901 vpxor m3, m3 6902 6903;Row 1, 2 6904 movq xm13, [r0] 6905 movq xm14, [r1] 6906 pinsrq xm13, [r0 + mmsize/2], 1 6907 pinsrq xm14, [r1 + mmsize/2], 1 6908 vpmovsxwq m6, xm13 6909 vpmovsxwq m7, xm14 6910 psubq m7, m6 ; predictedCoef 6911 6912 vcvtqq2pd m9, m6 6913 vfmadd213pd m9, m9, m3 6914 vcvtpd2qq m8, m9 6915 vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits 6916 6917 vcvtqq2pd m10, m7 6918 vcvtqq2pd m11, m12 6919 vfmadd213pd m10, m11, m3 6920 vcvtpd2qq m9, m10 6921 vpsraq m9, RDO_MAX_16 ;(psyScale * predictedCoef) >> max 6922 6923 psubq m8, m9 6924 paddq m4, m8 6925 movu [r2], ym8 6926 vextracti32x8 [r2 + 2 * mmsize], m8 , 1 6927 6928 ;Row 3, 4 6929 movq xm13, [r0 + mmsize] 6930 movq xm14, [r1 + mmsize] 6931 pinsrq xm13, [r0 + 3 * mmsize/2], 1 6932 pinsrq xm14, [r1 + 3 * mmsize/2], 1 6933 vpmovsxwq m6, xm13 6934 vpmovsxwq m7, xm14 6935 psubq m7, m6 ; predictedCoef 6936 6937 vcvtqq2pd m9, m6 6938 vfmadd213pd m9, m9, m3 6939 vcvtpd2qq m8, m9 6940 vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits 6941 6942 vcvtqq2pd m10, m7 6943 vcvtqq2pd m11, m12 6944 vfmadd213pd m10, m11, m3 6945 vcvtpd2qq m9, m10 6946 vpsraq m9, RDO_MAX_16 ;(psyScale * predictedCoef) >> max 6947 6948 psubq m8, m9 6949 paddq m4, m8 6950 movu [r2 + 4 * mmsize], ym8 6951 vextracti32x8 [r2 + 6 * mmsize], m8 , 1 6952 6953 vextracti32x8 ym2, m4, 1 6954 paddq ym4, ym2 6955 vextracti32x4 xm2, m4, 1 6956 paddq xm4, xm2 6957 punpckhqdq xm2, xm4, xm3 6958 paddq xm4, xm2 6959 6960 paddq xm0, xm4 6961 paddq xm1, xm4 6962 6963 movq [r3], xm0 6964 movq [r4], xm1 6965 RET 6966 6967;--------------------------------------------------------------------------------------------------------------------------------------------------------- 6968; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos) 6969;--------------------------------------------------------------------------------------------------------------------------------------------------------- 6970INIT_ZMM avx512 6971cglobal psyRdoQuant32, 5, 9, 15 6972%if WIN64 6973 mov r5, r5m 6974%endif 6975 mov r6d, r6m 6976 vpbroadcastq m12, [r5] ; psyScale 6977 lea r0, [r0 + 2 * r6] 6978 lea r1, [r1 + 2 * r6] 6979 lea r6, [4 * r6] 6980 lea r2, [r2 + 2 * r6] 6981 movq xm0, [r3] 6982 movq xm1, [r4] 6983 6984%if BIT_DEPTH == 12 6985 mov r5, [tab_nonpsyRdo12 + 24] ; scaleBits 6986%elif BIT_DEPTH == 10 6987 mov r5, [tab_nonpsyRdo10 + 24] 6988%elif BIT_DEPTH == 8 6989 mov r5, [tab_nonpsyRdo8 + 24] 6990%else 6991 %error Unsupported BIT_DEPTH! 6992%endif 6993 6994 movq xm2, r5 6995 vpxor m4, m4 6996 vpxor m3, m3 6997 6998;Row 1, 2 6999 movq xm13, [r0] 7000 movq xm14, [r1] 7001 pinsrq xm13, [r0 + mmsize], 1 7002 pinsrq xm14, [r1 + mmsize], 1 7003 vpmovsxwq m6, xm13 7004 vpmovsxwq m7, xm14 7005 psubq m7, m6 ; predictedCoef 7006 7007 vcvtqq2pd m9, m6 7008 vfmadd213pd m9, m9, m3 7009 vcvtpd2qq m8, m9 7010 vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits 7011 7012 vcvtqq2pd m10, m7 7013 vcvtqq2pd m11, m12 7014 vfmadd213pd m10, m11, m3 7015 vcvtpd2qq m9, m10 7016 vpsraq m9, RDO_MAX_32 ;(psyScale * predictedCoef) >> max 7017 7018 psubq m8, m9 7019 paddq m4, m8 7020 movu [r2], ym8 7021 vextracti32x8 [r2 + 4 * mmsize], m8 , 1 7022 7023 ;Row 3, 4 7024 movq xm13, [r0 + 2 * mmsize] 7025 movq xm14, [r1 + 2 * mmsize] 7026 pinsrq xm13, [r0 + 3 * mmsize], 1 7027 pinsrq xm14, [r1 + 3 * mmsize], 1 7028 vpmovsxwq m6, xm13 7029 vpmovsxwq m7, xm14 7030 psubq m7, m6 ; predictedCoef 7031 7032 vcvtqq2pd m9, m6 7033 vfmadd213pd m9, m9, m3 7034 vcvtpd2qq m8, m9 7035 vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits 7036 7037 vcvtqq2pd m10, m7 7038 vcvtqq2pd m11, m12 7039 vfmadd213pd m10, m11, m3 7040 vcvtpd2qq m9, m10 7041 vpsraq m9, RDO_MAX_32 ;(psyScale * predictedCoef) >> max 7042 7043 psubq m8, m9 7044 paddq m4, m8 7045 movu [r2 + 8 * mmsize], ym8 7046 vextracti32x8 [r2 + 12 * mmsize], m8 , 1 7047 7048 vextracti32x8 ym2, m4, 1 7049 paddq ym4, ym2 7050 vextracti32x4 xm2, m4, 1 7051 paddq xm4, xm2 7052 punpckhqdq xm2, xm4, xm3 7053 paddq xm4, xm2 7054 7055 paddq xm0, xm4 7056 paddq xm1, xm4 7057 7058 movq [r3], xm0 7059 movq [r4], xm1 7060 RET 7061 7062INIT_YMM avx2 7063cglobal nonPsyRdoQuant4, 5, 9, 16 7064 mov r4d, r4m 7065 lea r0, [r0 + 2 * r4] 7066 lea r4, [4 * r4] 7067 lea r1, [r1 + 2 * r4] 7068 movq xm0, [r2] 7069 movq xm1, [r3] 7070 7071%if BIT_DEPTH == 12 7072 mov r5, [tab_nonpsyRdo12] ; scaleBits 7073%elif BIT_DEPTH == 10 7074 mov r5, [tab_nonpsyRdo10] 7075%elif BIT_DEPTH == 8 7076 mov r5, [tab_nonpsyRdo8] 7077%else 7078 %error Unsupported BIT_DEPTH! 7079%endif 7080 movq xm2, r5 7081 vpxor m4, m4 7082 vpxor m3, m3 7083 vpxor m13, m13 7084 7085 vpmovsxwd m6, [r0] 7086 vcvtdq2pd m9, xm6 7087 vfmadd213pd m9, m9, m3 7088 vcvtpd2dq xm8, m9 7089 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7090 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7091 paddq m4, m13 7092 movu [r1], m13 7093 7094 vpmovsxwd m6, [r0 + 8] 7095 vcvtdq2pd m9, xm6 7096 vfmadd213pd m9, m9, m3 7097 vcvtpd2dq xm8, m9 7098 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7099 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7100 paddq m4, m13 7101 movu [r1 + 32], m13 7102 7103 vpmovsxwd m6, [r0 + 16] 7104 vcvtdq2pd m9, xm6 7105 vfmadd213pd m9, m9, m3 7106 vcvtpd2dq xm8, m9 7107 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7108 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7109 paddq m4, m13 7110 movu [r1 + 64], m13 7111 7112 vpmovsxwd m6, [r0 +24] 7113 vcvtdq2pd m9, xm6 7114 vfmadd213pd m9, m9, m3 7115 vcvtpd2dq xm8, m9 7116 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7117 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7118 paddq m4, m13 7119 movu [r1 + 96], m13 7120 7121 7122 vextracti128 xm2, m4, 1 7123 paddq xm4, xm2 7124 punpckhqdq xm2, xm4, xm3 7125 paddq xm4, xm2 7126 7127 paddq xm0, xm4 7128 paddq xm1, xm4 7129 7130 movq [r2], xm0 7131 movq [r3], xm1 7132 RET 7133 7134 7135 7136INIT_YMM avx2 7137cglobal nonPsyRdoQuant8, 5, 5, 8 7138 mov r4d, r4m 7139 lea r0, [r0 + 2 * r4] 7140 lea r4, [4 * r4] 7141 lea r1, [r1 + 2 * r4] 7142%if BIT_DEPTH == 12 7143 mov r4, [tab_nonpsyRdo12 + 8] 7144%elif BIT_DEPTH == 10 7145 mov r4, [tab_nonpsyRdo10 + 8] 7146%elif BIT_DEPTH == 8 7147 mov r4, [tab_nonpsyRdo8 + 8] 7148%else 7149 %error Unsupported BIT_DEPTH! 7150 %endif 7151 movq xm3, r4 7152 movq xm6, [r2] 7153 movq xm7, [r3] 7154 vpxor m4, m4 7155 vpxor m5, m5 7156 movq xm0, [r0] 7157 vpmovsxwd m1, xm0 7158 vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements 7159 vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements 7160 vcvtpd2dq xm1, m2 7161 vpmovsxdq m0 , xm1 7162 vpsllq m0, xm3 ; costUncoded 7163 paddq m4, m0 7164 movu [r1], ym0 7165 vpxor m0, m0 7166 movq xm0, [r0 +mmsize/2] 7167 vpmovsxwd m1, xm0 7168 vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements 7169 vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements 7170 vcvtpd2dq xm1, m2 7171 vpmovsxdq m0 , xm1 7172 vpsllq m0, xm3 ; costUncoded 7173 paddq m4, m0 7174 movu [r1 +2*mmsize], m0 7175 vpxor m0, m0 7176 movq xm0, [r0 +mmsize] 7177 vpmovsxwd m1, xm0 7178 vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements 7179 vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements 7180 vcvtpd2dq xm1, m2 7181 vpmovsxdq m0 , xm1 7182 vpsllq m0, xm3 ; costUncoded 7183 paddq m4, m0 7184 movu [r1 +4*mmsize], m0 7185 vpxor m0, m0 7186 movq xm0, [r0 +3*mmsize/2] 7187 vpmovsxwd m1, xm0 7188 vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements 7189 vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements 7190 vcvtpd2dq xm1, m2 7191 vpmovsxdq m0 , xm1 7192 vpsllq m0, xm3 ; costUncoded 7193 paddq m4, m0 7194 movu [r1 +6*mmsize], m0 7195 7196 vextracti128 xm2, m4, 1 7197 paddq xm4, xm2 7198 punpckhqdq xm2, xm4, xm5 7199 paddq xm4, xm2 7200 7201 paddq xm6, xm4 7202 paddq xm7, xm4 7203 7204 movq [r2], xm6 7205 movq [r3], xm7 7206 RET 7207INIT_YMM avx2 7208cglobal nonPsyRdoQuant16, 5, 5, 8 7209 mov r4d, r4m 7210 lea r0, [r0 + 2 * r4] 7211 lea r4, [4 * r4] 7212 lea r1, [r1 + 2 * r4] 7213%if BIT_DEPTH == 12 7214 mov r4, [tab_nonpsyRdo12 + 16] 7215%elif BIT_DEPTH == 10 7216 mov r4, [tab_nonpsyRdo10 + 16] 7217%elif BIT_DEPTH == 8 7218 mov r4, [tab_nonpsyRdo8 + 16] 7219%else 7220 %error Unsupported BIT_DEPTH! 7221 %endif 7222 movq xm3, r4 7223 movq xm6, [r2] 7224 movq xm7, [r3] 7225 vpxor m4, m4 7226 vpxor m5, m5 7227 7228;Row 1, 2 7229 movq xm0, [r0] 7230 vpmovsxwd m1, xm0 7231 vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements 7232 vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements 7233 vcvtpd2dq xm1, m2 7234 vpmovsxdq m0 , xm1 7235 vpsllq m0, xm3 ; costUncoded 7236 paddq m4, m0 7237 movu [r1], ym0 7238 7239 movq xm0, [r0 +mmsize] 7240 vpmovsxwd m1, xm0 7241 vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements 7242 vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements 7243 vcvtpd2dq xm1, m2 7244 vpmovsxdq m0 , xm1 7245 vpsllq m0, xm3 ; costUncoded 7246 paddq m4, m0 7247 movu [r1+4*mmsize], ym0 7248 7249 movq xm0, [r0 + 2*mmsize] 7250 vpmovsxwd m1, xm0 7251 vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements 7252 vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements 7253 vcvtpd2dq xm1, m2 7254 vpmovsxdq m0 , xm1 7255 vpsllq m0, xm3 ; costUncoded 7256 paddq m4, m0 7257 movu [r1+8*mmsize], ym0 7258 7259 movq xm0, [r0 + 3*mmsize] 7260 vpmovsxwd m1, xm0 7261 vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements 7262 vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements 7263 vcvtpd2dq xm1, m2 7264 vpmovsxdq m0 , xm1 7265 vpsllq m0, xm3 ; costUncoded 7266 paddq m4, m0 7267 movu [r1+12*mmsize], ym0 7268 7269 7270 vextracti128 xm2, m4, 1 7271 paddq xm4, xm2 7272 punpckhqdq xm2, xm4, xm5 7273 paddq xm4, xm2 7274 7275 paddq xm6, xm4 7276 paddq xm7, xm4 7277 7278 movq [r2], xm6 7279 movq [r3], xm7 7280 RET 7281INIT_YMM avx2 7282cglobal nonPsyRdoQuant32, 5, 5, 8 7283 mov r4d, r4m 7284 lea r0, [r0 + 2 * r4] 7285 lea r4, [4 * r4] 7286 lea r1, [r1 + 2 * r4] 7287%if BIT_DEPTH == 12 7288 mov r4, [tab_nonpsyRdo12 + 24] 7289%elif BIT_DEPTH == 10 7290 mov r4, [tab_nonpsyRdo10 + 24] 7291%elif BIT_DEPTH == 8 7292 mov r4, [tab_nonpsyRdo8 + 24] 7293%else 7294 %error Unsupported BIT_DEPTH! 7295 %endif 7296 movq xm3, r4 7297 movq xm6, [r2] 7298 movq xm7, [r3] 7299 vpxor m4, m4 7300 vpxor m5, m5 7301 7302 movq xm0, [r0] 7303 vpmovsxwd m1, xm0 7304 vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements 7305 vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements 7306 vcvtpd2dq xm1, m2 7307 vpmovsxdq m0 , xm1 7308 vpsllq m0, xm3 ; costUncoded 7309 paddq m4, m0 7310 movu [r1], m0 7311 vpxor m0, m0 7312 7313 movq xm0, [r0 +2*mmsize] 7314 vpmovsxwd m1, xm0 7315 vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements 7316 vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements 7317 vcvtpd2dq xm1, m2 7318 vpmovsxdq m0 , xm1 7319 vpsllq m0, xm3 ; costUncoded 7320 paddq m4, m0 7321 movu [r1 + 8*mmsize], m0 7322 vpxor m0, m0 7323 7324 movq xm0, [r0 +4*mmsize] 7325 vpmovsxwd m1, xm0 7326 vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements 7327 vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements 7328 vcvtpd2dq xm1, m2 7329 vpmovsxdq m0 , xm1 7330 vpsllq m0, xm3 ; costUncoded 7331 paddq m4, m0 7332 movu [r1 +16*mmsize], m0 7333 vpxor m0, m0 7334 7335 movq xm0, [r0 +6*mmsize] 7336 vpmovsxwd m1, xm0 7337 vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements 7338 vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements 7339 vcvtpd2dq xm1, m2 7340 vpmovsxdq m0 , xm1 7341 vpsllq m0, xm3 ; costUncoded 7342 paddq m4, m0 7343 movu [r1 +24*mmsize], m0 7344 7345 vextracti128 xm2, m4, 1 7346 paddq xm4, xm2 7347 punpckhqdq xm2, xm4, xm5 7348 paddq xm4, xm2 7349 7350 paddq xm6, xm4 7351 paddq xm7, xm4 7352 7353 movq [r2], xm6 7354 movq [r3], xm7 7355 RET 7356 7357INIT_YMM avx2 7358cglobal psyRdoQuant_1p4, 5, 9, 16 7359 mov r4d, r4m 7360 lea r0, [r0 + 2 * r4] 7361 lea r4, [4 * r4] 7362 lea r1, [r1 + 2 * r4] 7363 movq xm0, [r2] 7364 movq xm1, [r3] 7365 7366%if BIT_DEPTH == 12 7367 mov r5, [tab_nonpsyRdo12] ; scaleBits 7368%elif BIT_DEPTH == 10 7369 mov r5, [tab_nonpsyRdo10] 7370%elif BIT_DEPTH == 8 7371 mov r5, [tab_nonpsyRdo8] 7372%else 7373 %error Unsupported BIT_DEPTH! 7374%endif 7375 movq xm2, r5 7376 vpxor m4, m4 7377 vpxor m3, m3 7378 vpxor m13, m13 7379 7380 vpmovsxwd m6, [r0] 7381 vcvtdq2pd m9, xm6 7382 vfmadd213pd m9, m9, m3 7383 vcvtpd2dq xm8, m9 7384 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7385 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7386 paddq m4, m13 7387 movu [r1], m13 7388 7389 vpmovsxwd m6, [r0 + 8] 7390 vcvtdq2pd m9, xm6 7391 vfmadd213pd m9, m9, m3 7392 vcvtpd2dq xm8, m9 7393 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7394 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7395 paddq m4, m13 7396 movu [r1 + 32], m13 7397 7398 vpmovsxwd m6, [r0 + 16] 7399 vcvtdq2pd m9, xm6 7400 vfmadd213pd m9, m9, m3 7401 vcvtpd2dq xm8, m9 7402 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7403 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7404 paddq m4, m13 7405 movu [r1 + 64], m13 7406 7407 vpmovsxwd m6, [r0 +24] 7408 vcvtdq2pd m9, xm6 7409 vfmadd213pd m9, m9, m3 7410 vcvtpd2dq xm8, m9 7411 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7412 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7413 paddq m4, m13 7414 movu [r1 + 96], m13 7415 7416 7417 vextracti128 xm2, m4, 1 7418 paddq xm4, xm2 7419 punpckhqdq xm2, xm4, xm3 7420 paddq xm4, xm2 7421 7422 paddq xm0, xm4 7423 paddq xm1, xm4 7424 7425 movq [r2], xm0 7426 movq [r3], xm1 7427 RET 7428INIT_YMM avx2 7429cglobal psyRdoQuant_1p8, 7, 9, 16 7430 mov r4d, r4m 7431 lea r0, [r0 + 2 * r4] 7432 lea r4, [4 * r4] 7433 lea r1, [r1 + 2 * r4] 7434 movq xm0, [r2] 7435 movq xm1, [r3] 7436%if BIT_DEPTH == 12 7437 mov r5, [tab_nonpsyRdo12 +8] ; scaleBits 7438%elif BIT_DEPTH == 10 7439 mov r5, [tab_nonpsyRdo10 +8] 7440%elif BIT_DEPTH == 8 7441 mov r5, [tab_nonpsyRdo8 + 8 ] 7442%else 7443 %error Unsupported BIT_DEPTH! 7444%endif 7445 movq xm2, r5 7446 vpxor m4, m4 7447 vpxor m3, m3 7448 vpxor m13, m13 7449 7450 7451 vpmovsxwd m6, [r0] 7452 vcvtdq2pd m9, xm6 7453 vfmadd213pd m9, m9, m3 7454 vcvtpd2dq xm8, m9 7455 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7456 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7457 paddq m4, m13 7458 movu [r1], m13 7459 7460 vpmovsxwd m6, [r0 + 16] 7461 vcvtdq2pd m9, xm6 7462 vfmadd213pd m9, m9, m3 7463 vcvtpd2dq xm8, m9 7464 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7465 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7466 paddq m4, m13 7467 movu [r1 + 64], m13 7468 7469 vpmovsxwd m6, [r0 +32] 7470 vcvtdq2pd m9, xm6 7471 vfmadd213pd m9, m9, m3 7472 vcvtpd2dq xm8, m9 7473 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7474 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7475 paddq m4, m13 7476 movu [r1 +128], m13 7477 7478 vpmovsxwd m6, [r0 + 48] 7479 vcvtdq2pd m9, xm6 7480 vfmadd213pd m9, m9, m3 7481 vcvtpd2dq xm8, m9 7482 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7483 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7484 paddq m4, m13 7485 movu [r1 + 192], m13 7486 7487 vextracti128 xm2, m4, 1 7488 paddq xm4, xm2 7489 punpckhqdq xm2, xm4, xm3 7490 paddq xm4, xm2 7491 7492 paddq xm0, xm4 7493 paddq xm1, xm4 7494 7495 movq [r2], xm0 7496 movq [r3], xm1 7497 RET 7498 7499INIT_YMM avx2 7500cglobal psyRdoQuant_1p16, 7, 9, 16 7501 mov r4d, r4m 7502 lea r0, [r0 + 2 * r4] 7503 lea r4, [4 * r4] 7504 lea r1, [r1 + 2 * r4] 7505 movq xm0, [r2] 7506 movq xm1, [r3] 7507%if BIT_DEPTH == 12 7508 mov r5, [tab_nonpsyRdo12 + 16] ; scaleBits 7509%elif BIT_DEPTH == 10 7510 mov r5, [tab_nonpsyRdo10 + 16] 7511%elif BIT_DEPTH == 8 7512 mov r5, [tab_nonpsyRdo8 + 16 ] 7513%else 7514 %error Unsupported BIT_DEPTH! 7515%endif 7516 movq xm2, r5 7517 vpxor m4, m4 7518 vpxor m3, m3 7519 vpxor m13, m13 7520 7521 vpmovsxwd m6, [r0] 7522 vcvtdq2pd m9, xm6 7523 vfmadd213pd m9, m9, m3 7524 vcvtpd2dq xm8, m9 7525 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7526 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7527 paddq m4, m13 7528 movu [r1], m13 7529 7530 vpmovsxwd m6, [r0 + mmsize] 7531 7532 vcvtdq2pd m9, xm6 7533 vfmadd213pd m9, m9, m3 7534 vcvtpd2dq xm8, m9 7535 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7536 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7537 paddq m4, m13 7538 movu [r1 + 4*mmsize], m13 7539 7540 vpmovsxwd m6, [r0 + 2 * mmsize] 7541 vcvtdq2pd m9, xm6 7542 vfmadd213pd m9, m9, m3 7543 vcvtpd2dq xm8, m9 7544 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7545 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7546 paddq m4, m13 7547 movu [r1 + 8*mmsize], m13 7548 7549 vpmovsxwd m6, [r0 + 3 * mmsize] 7550 vcvtdq2pd m9, xm6 7551 vfmadd213pd m9, m9, m3 7552 vcvtpd2dq xm8, m9 7553 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7554 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7555 paddq m4, m13 7556 movu [r1 + 12*mmsize], m13 7557 7558 vextracti128 xm2, m4, 1 7559 paddq xm4, xm2 7560 punpckhqdq xm2, xm4, xm3 7561 paddq xm4, xm2 7562 7563 paddq xm0, xm4 7564 paddq xm1, xm4 7565 7566 movq [r2], xm0 7567 movq [r3], xm1 7568 RET 7569 7570INIT_YMM avx2 7571cglobal psyRdoQuant_1p32, 7, 9, 16 7572 mov r4d, r4m 7573 lea r0, [r0 + 2 * r4] 7574 lea r4, [4 * r4] 7575 lea r1, [r1 + 2 * r4] 7576 movq xm0, [r2] 7577 movq xm1, [r3] 7578%if BIT_DEPTH == 12 7579 mov r5, [tab_nonpsyRdo12 + 24] ; scaleBits 7580%elif BIT_DEPTH == 10 7581 mov r5, [tab_nonpsyRdo10 + 24] 7582%elif BIT_DEPTH == 8 7583 mov r5, [tab_nonpsyRdo8 + 24] 7584%else 7585 %error Unsupported BIT_DEPTH! 7586%endif 7587 movq xm2, r5 7588 vpxor m4, m4 7589 vpxor m3, m3 7590 vpxor m13, m13 7591 7592 7593 vpmovsxwd m6, [r0] 7594 vcvtdq2pd m9, xm6 7595 vfmadd213pd m9, m9, m3 7596 vcvtpd2dq xm8, m9 7597 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7598 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7599 paddq m4, m13 7600 movu [r1], m13 7601 7602 vpmovsxwd m6, [r0 + 2 * mmsize] 7603 vcvtdq2pd m9, xm6 7604 vfmadd213pd m9, m9, m3 7605 vcvtpd2dq xm8, m9 7606 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7607 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7608 paddq m4, m13 7609 movu [r1 + 8 * mmsize], m13 7610 7611 vpmovsxwd m6, [r0 + 4 * mmsize] 7612 vcvtdq2pd m9, xm6 7613 vfmadd213pd m9, m9, m3 7614 vcvtpd2dq xm8, m9 7615 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7616 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7617 paddq m4, m13 7618 movu [r1 + 16 * mmsize], m13 7619 7620 vpmovsxwd m6, [r0 + 6 * mmsize] 7621 vcvtdq2pd m9, xm6 7622 vfmadd213pd m9, m9, m3 7623 vcvtpd2dq xm8, m9 7624 vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int 7625 vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits 7626 paddq m4, m13 7627 movu [r1 + 24 *mmsize], m13 7628 7629 vextracti128 xm2, m4, 1 7630 paddq xm4, xm2 7631 punpckhqdq xm2, xm4, xm3 7632 paddq xm4, xm2 7633 7634 paddq xm0, xm4 7635 paddq xm1, xm4 7636 7637 movq [r2], xm0 7638 movq [r3], xm1 7639 RET 7640 7641%endif 7642