1 #include "crypto_core_multsntrup653_ntt.h"
2 #include <immintrin.h>
3
4 // auto-generated; do not edit
5
6
7 #define _mm256_permute2x128_si256_lo(f0,f1) _mm256_permute2x128_si256(f0,f1,0x20)
8 #define _mm256_permute2x128_si256_hi(f0,f1) _mm256_permute2x128_si256(f0,f1,0x31)
9 #define int16x16 __m256i
10
11 typedef int16_t int16;
12 typedef int32_t int32;
13
14 typedef union {
15 int16 data[106 * 16];
16 __m256i _dummy;
17 } vec1696;
18
19 static const vec1696 qdata_7681 = { .data = {
20 #define precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+0)
21 -3593, -3593, -3593, -3593, -3625, -3625, -3625, -3625, -3593, -3593, -3593, -3593, -3625, -3625, -3625, -3625,
22 #define precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+16)
23 -3777, -3777, -3777, -3777, 3182, 3182, 3182, 3182, -3777, -3777, -3777, -3777, 3182, 3182, 3182, 3182,
24 #define precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+32)
25 -3593, -3593, -3593, -3593, -3182, -3182, -3182, -3182, -3593, -3593, -3593, -3593, -3182, -3182, -3182, -3182,
26 #define precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+48)
27 3777, 3777, 3777, 3777, 3625, 3625, 3625, 3625, 3777, 3777, 3777, 3777, 3625, 3625, 3625, 3625,
28 #define precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+64)
29 -3593, -3593, -3593, -3593, 2194, 2194, 2194, 2194, -3593, -3593, -3593, -3593, 2194, 2194, 2194, 2194,
30 #define precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+80)
31 -3625, -3625, -3625, -3625, -1100, -1100, -1100, -1100, -3625, -3625, -3625, -3625, -1100, -1100, -1100, -1100,
32 #define precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+96)
33 -3593, -3593, -3593, -3593, 3696, 3696, 3696, 3696, -3593, -3593, -3593, -3593, 3696, 3696, 3696, 3696,
34 #define precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+112)
35 -3182, -3182, -3182, -3182, -2456, -2456, -2456, -2456, -3182, -3182, -3182, -3182, -2456, -2456, -2456, -2456,
36 #define precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+128)
37 -3593, 1701, 2194, 834, -3625, 2319, -1100, 121, -3593, 1701, 2194, 834, -3625, 2319, -1100, 121,
38 #define precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+144)
39 -3777, 1414, 2456, 2495, 3182, 2876, -3696, 2250, -3777, 1414, 2456, 2495, 3182, 2876, -3696, 2250,
40 #define precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+160)
41 -3593, -2250, 3696, -2876, -3182, -2495, -2456, -1414, -3593, -2250, 3696, -2876, -3182, -2495, -2456, -1414,
42 #define precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+176)
43 3777, -121, 1100, -2319, 3625, -834, -2194, -1701, 3777, -121, 1100, -2319, 3625, -834, -2194, -1701,
44 #define precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+192)
45 -3593, 3364, 1701, -1599, 2194, 2557, 834, -2816, -3593, 3364, 1701, -1599, 2194, 2557, 834, -2816,
46 #define precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+208)
47 -3625, 617, 2319, 2006, -1100, -1296, 121, 1986, -3625, 617, 2319, 2006, -1100, -1296, 121, 1986,
48 #define precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+224)
49 -3593, 2237, -2250, -1483, 3696, 3706, -2876, 1921, -3593, 2237, -2250, -1483, 3696, 3706, -2876, 1921,
50 #define precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+240)
51 -3182, 2088, -2495, -1525, -2456, 1993, -1414, 2830, -3182, 2088, -2495, -1525, -2456, 1993, -1414, 2830,
52 #define precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+256)
53 -3593, 514, 3364, 438, 1701, 2555, -1599, -1738, 2194, 103, 2557, 1881, 834, -549, -2816, 638,
54 #define precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+272)
55 -3625, -1399, 617, -1760, 2319, 2535, 2006, 3266, -1100, -1431, -1296, 3174, 121, 3153, 1986, -810,
56 #define precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+288)
57 -3777, 2956, -2830, -679, 1414, 2440, -1993, -3689, 2456, 2804, 1525, 3555, 2495, 1535, -2088, -7,
58 #define precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+304)
59 3182, -1321, -1921, -1305, 2876, -3772, -3706, 3600, -3696, -2043, 1483, -396, 2250, -2310, -2237, 1887,
60 #define precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+320)
61 -3593, -1887, 2237, 2310, -2250, 396, -1483, 2043, 3696, -3600, 3706, 3772, -2876, 1305, 1921, 1321,
62 #define precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+336)
63 -3182, 7, 2088, -1535, -2495, -3555, -1525, -2804, -2456, 3689, 1993, -2440, -1414, 679, 2830, -2956,
64 #define precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+352)
65 3777, 810, -1986, -3153, -121, -3174, 1296, 1431, 1100, -3266, -2006, -2535, -2319, 1760, -617, 1399,
66 #define precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+368)
67 3625, -638, 2816, 549, -834, -1881, -2557, -103, -2194, 1738, 1599, -2555, -1701, -438, -3364, -514,
68 #define precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+384)
69 -3593, -1532, 514, -373, 3364, -3816, 438, -3456, 1701, 783, 2555, 2883, -1599, 727, -1738, -2385,
70 #define precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+400)
71 2194, -2160, 103, -2391, 2557, 2762, 1881, -2426, 834, 3310, -549, -1350, -2816, 1386, 638, -194,
72 #define precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+416)
73 -3625, 404, -1399, -3692, 617, -2764, -1760, -1054, 2319, 1799, 2535, -3588, 2006, 1533, 3266, 2113,
74 #define precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+432)
75 -1100, -2579, -1431, -1756, -1296, 1598, 3174, -2, 121, -3480, 3153, -2572, 1986, 2743, -810, 2919,
76 #define precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+448)
77 -3593, 2789, -1887, -921, 2237, -1497, 2310, -2133, -2250, -915, 396, 1390, -1483, 3135, 2043, -859,
78 #define precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+464)
79 3696, 2732, -3600, -1464, 3706, 2224, 3772, -2665, -2876, 1698, 1305, 2835, 1921, 730, 1321, 486,
80 #define precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+480)
81 -3182, 3417, 7, -3428, 2088, -3145, -1535, 1168, -2495, -3831, -3555, -3750, -1525, 660, -2804, 2649,
82 #define precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+496)
83 -2456, 3405, 3689, -1521, 1993, 1681, -2440, 1056, -1414, 1166, 679, -2233, 2830, 2175, -2956, -1919,
84 #define precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+512)
85 -3593, -1404, -1532, 451, 514, -402, -373, 1278, 3364, -509, -3816, -3770, 438, -2345, -3456, -226,
86 #define precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+528)
87 1701, -1689, 783, -1509, 2555, 2963, 2883, 1242, -1599, 1669, 727, 2719, -1738, 642, -2385, -436,
88 #define precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+544)
89 2194, 3335, -2160, 1779, 103, 3745, -2391, 17, 2557, 2812, 2762, -1144, 1881, 83, -2426, -1181,
90 #define precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+560)
91 834, -1519, 3310, 3568, -549, -796, -1350, 2072, -2816, -2460, 1386, 2891, 638, -2083, -194, -715,
92 #define precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+576)
93 -3593, -402, -3816, -226, 2555, 1669, -2385, 1779, 2557, 83, 3310, 2072, 638, 1012, -3692, 1295,
94 #define precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+592)
95 2319, -3208, 1533, -2071, -1431, -2005, -2, 1586, 1986, -293, 1919, -929, -679, 777, -1681, -3461,
96 #define precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+608)
97 2456, 3366, 3750, -1203, 1535, -3657, -3417, -1712, -1921, 2515, 2665, -1070, 3600, 2532, -3135, -2589,
98 #define precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+624)
99 2250, -2258, 921, -658, -514, 509, 3456, 1509, 1599, -642, 2160, -17, -1881, 1519, 1350, -2891,
100 #define precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+640)
101 -3593, -3434, -1497, 893, 396, -2422, -859, 2965, 3706, -2339, 1698, -2937, 1321, -670, -3428, -3163,
102 #define precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+656)
103 -2495, -1072, 660, 1084, 3689, -179, 1056, -1338, 2830, 2786, -2919, -3677, -3153, -151, -1598, 3334,
104 #define precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+672)
105 1100, -3314, 3588, 2262, 1760, -2230, -404, 2083, 2816, -3568, 2426, -2812, -103, 436, -727, -2963,
106 #define precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+688)
107 -1701, 3770, 373, 1404, 1887, -1649, 2133, -826, 1483, 434, -2732, 3287, -3772, -2378, -2835, 3723,
108 #define precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+704)
109 -3593, 658, 2789, 370, -1887, -3434, -921, -3752, 2237, 1649, -1497, 2258, 2310, 3581, -2133, 893,
110 #define precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+720)
111 -2250, 3794, -915, 826, 396, 2589, 1390, 592, -1483, -2422, 3135, 3214, 2043, -434, -859, -2532,
112 #define precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+736)
113 3696, 1121, 2732, 2965, -3600, 2998, -1464, -3287, 3706, 1070, 2224, -589, 3772, -2339, -2665, 2070,
114 #define precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+752)
115 -2876, 2378, 1698, -2515, 1305, -2815, 2835, -2937, 1921, -1348, 730, -3723, 1321, 1712, 486, 2130,
116 #define q_x16 *(const int16x16 *)(qdata+768)
117 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681,
118 #define qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+784)
119 -9, -9, -9, -9, -16425, -16425, -16425, -16425, -9, -9, -9, -9, -16425, -16425, -16425, -16425,
120 #define qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+800)
121 -28865, -28865, -28865, -28865, 10350, 10350, 10350, 10350, -28865, -28865, -28865, -28865, 10350, 10350, 10350, 10350,
122 #define qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+816)
123 -9, -9, -9, -9, -10350, -10350, -10350, -10350, -9, -9, -9, -9, -10350, -10350, -10350, -10350,
124 #define qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+832)
125 28865, 28865, 28865, 28865, 16425, 16425, 16425, 16425, 28865, 28865, 28865, 28865, 16425, 16425, 16425, 16425,
126 #define qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+848)
127 -9, -9, -9, -9, -4974, -4974, -4974, -4974, -9, -9, -9, -9, -4974, -4974, -4974, -4974,
128 #define qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+864)
129 -16425, -16425, -16425, -16425, -7244, -7244, -7244, -7244, -16425, -16425, -16425, -16425, -7244, -7244, -7244, -7244,
130 #define qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+880)
131 -9, -9, -9, -9, -4496, -4496, -4496, -4496, -9, -9, -9, -9, -4496, -4496, -4496, -4496,
132 #define qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+896)
133 -10350, -10350, -10350, -10350, -14744, -14744, -14744, -14744, -10350, -10350, -10350, -10350, -14744, -14744, -14744, -14744,
134 #define qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+912)
135 -9, -20315, -4974, 18242, -16425, 18191, -7244, -11655, -9, -20315, -4974, 18242, -16425, 18191, -7244, -11655,
136 #define qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+928)
137 -28865, 20870, 14744, -22593, 10350, 828, 4496, 23754, -28865, 20870, 14744, -22593, 10350, 828, 4496, 23754,
138 #define qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+944)
139 -9, -23754, -4496, -828, -10350, 22593, -14744, -20870, -9, -23754, -4496, -828, -10350, 22593, -14744, -20870,
140 #define qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+960)
141 28865, 11655, 7244, -18191, 16425, -18242, 4974, 20315, 28865, 11655, 7244, -18191, 16425, -18242, 4974, 20315,
142 #define qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+976)
143 -9, -10972, -20315, 23489, -4974, 25597, 18242, -2816, -9, -10972, -20315, 23489, -4974, 25597, 18242, -2816,
144 #define qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+992)
145 -16425, -19351, 18191, -3114, -7244, -9488, -11655, 19394, -16425, -19351, 18191, -3114, -7244, -9488, -11655, 19394,
146 #define qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+1008)
147 -9, -7491, -23754, -15307, -4496, -15750, -828, -5759, -9, -7491, -23754, -15307, -4496, -15750, -828, -5759,
148 #define qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1024)
149 -10350, 22568, 22593, -20469, -14744, 31177, -20870, 26382, -10350, 22568, 22593, -20469, -14744, 31177, -20870, 26382,
150 #define qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1040)
151 -9, -14846, -10972, -21066, -20315, -24581, 23489, -23242, -4974, -4505, 25597, -26279, 18242, 21467, -2816, 15998,
152 #define qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1056)
153 -16425, -4983, -19351, 14624, 18191, -2073, -3114, 20674, -7244, -21399, -9488, 6246, -11655, -29103, 19394, -5930,
154 #define qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1072)
155 -28865, -23668, -26382, -28839, 20870, 6536, -31177, 16279, 14744, 29428, 20469, 29667, -22593, 9215, -22568, -11783,
156 #define qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1088)
157 10350, -14121, 5759, -5913, 828, -1724, 15750, 11792, 4496, 25093, 15307, 26228, 23754, -21766, 7491, -6817,
158 #define qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1104)
159 -9, 6817, -7491, 21766, -23754, -26228, -15307, -25093, -4496, -11792, -15750, 1724, -828, 5913, -5759, 14121,
160 #define qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1120)
161 -10350, 11783, 22568, -9215, 22593, -29667, -20469, -29428, -14744, -16279, 31177, -6536, -20870, 28839, 26382, 23668,
162 #define qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1136)
163 28865, 5930, -19394, 29103, 11655, -6246, 9488, 21399, 7244, -20674, 3114, 2073, -18191, -14624, 19351, 4983,
164 #define qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1152)
165 16425, -15998, 2816, -21467, -18242, 26279, -25597, 4505, 4974, 23242, -23489, 24581, 20315, 21066, 10972, 14846,
166 #define qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1168)
167 -9, -32252, -14846, -19317, -10972, 8472, -21066, -3456, -20315, 16655, -24581, 12611, 23489, -12073, -23242, 29871,
168 #define qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1184)
169 -4974, 6032, -4505, 10409, 25597, 24266, -26279, 17030, 18242, 10478, 21467, 11962, -2816, -26262, 15998, -17602,
170 #define qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1200)
171 -16425, -22124, -4983, -26220, -19351, -8908, 14624, 32738, 18191, 13575, -2073, 27132, -3114, 24573, 20674, 27201,
172 #define qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1216)
173 -7244, 12269, -21399, -16092, -9488, -15810, 6246, 15358, -11655, -15768, -29103, 24052, 19394, -26441, -5930, -1689,
174 #define qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1232)
175 -9, 13541, 6817, -5529, -7491, 26663, 21766, -4693, -23754, 13933, -26228, 8558, -15307, -21953, -25093, -22875,
176 #define qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1248)
177 -4496, -7508, -11792, -30136, -15750, 26800, 1724, 17303, -828, 2722, 5913, -12013, -5759, 30426, 14121, 3558,
178 #define qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1264)
179 -10350, -24743, 11783, -21860, 22568, -32329, -9215, 9360, 22593, -7415, -29667, 25946, -20469, -21868, -29428, -25511,
180 #define qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1280)
181 -14744, 1869, -16279, 14351, 31177, 2193, -6536, 17440, -20870, 24718, 28839, -23225, 26382, 9855, 23668, -9599,
182 #define qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1296)
183 -9, -32124, -32252, 10179, -14846, 6766, -19317, 16638, -10972, -23549, 8472, -17082, -21066, -15145, -3456, 31518,
184 #define qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1312)
185 -20315, -6297, 16655, -12261, -24581, -11885, 12611, 30938, 23489, 28805, -12073, 26783, -23242, -14718, 29871, 5708,
186 #define qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1328)
187 -4974, 15111, 6032, -29453, -4505, 12449, 10409, 529, 25597, -32004, 24266, 2952, -26279, 18003, 17030, 24931,
188 #define qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1344)
189 18242, -1007, 10478, -4624, 21467, 17636, 11962, 14360, -2816, 15972, -26262, 16715, 15998, 4573, -17602, -14539,
190 #define qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1360)
191 -9, 6766, 8472, 31518, -24581, 28805, 29871, -29453, 25597, 18003, 10478, 14360, 15998, 27636, -26220, 17167,
192 #define qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1376)
193 18191, -7304, 24573, -22039, -21399, -4565, 15358, 10802, 19394, 21723, 9599, -9633, -28839, -2807, -2193, -30597,
194 #define qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1392)
195 14744, -26330, -25946, -2739, 9215, 32695, 24743, -26288, 5759, 20435, -17303, 24530, 11792, 20964, 21953, 23523,
196 #define qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1408)
197 23754, -27858, 5529, 6510, 14846, 23549, 3456, 12261, -23489, 14718, -6032, -529, 26279, 1007, -11962, -16715,
198 #define qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1424)
199 -9, 24214, 26663, 23933, -26228, -13686, -22875, -27243, -15750, 4317, 2722, 8839, 14121, -32414, -21860, -25179,
200 #define qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1440)
201 22593, -25648, -21868, -964, -16279, -1715, 17440, -14650, 26382, -28958, 1689, -10333, 29103, -20119, 15810, 22790,
202 #define qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1456)
203 7244, 20238, -27132, -2858, -14624, 19274, 22124, -4573, 2816, 4624, -17030, 32004, 4505, -5708, 12073, 11885,
204 #define qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1472)
205 20315, 17082, 19317, 32124, -6817, 14223, 4693, -14138, 15307, 9650, 7508, -9513, -1724, -23882, 12013, -15221,
206 #define qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1488)
207 -9, -6510, 13541, -23182, 6817, 24214, -5529, -24232, -7491, -14223, 26663, 27858, 21766, 26621, -4693, 23933,
208 #define qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1504)
209 -23754, 29394, 13933, 14138, -26228, -23523, 8558, -23984, -15307, -13686, -21953, 26766, -25093, -9650, -22875, -20964,
210 #define qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1520)
211 -4496, -22943, -7508, -27243, -11792, -18506, -30136, 9513, -15750, -24530, 26800, 947, 1724, 4317, 17303, 29718,
212 #define qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1536)
213 -828, 23882, 2722, -20435, 5913, -10495, -12013, 8839, -5759, -3396, 30426, 15221, 14121, 26288, 3558, 27730,
214 #define qinvscaledzeta_x16_4_1 *(const int16x16 *)(qdata+1552)
215 -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865,
216 #define qinvscaledzeta_x16_4_3 *(const int16x16 *)(qdata+1568)
217 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865,
218 #define qinvscaledzeta_x16_8_1 *(const int16x16 *)(qdata+1584)
219 -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425,
220 #define qinvscaledzeta_x16_8_7 *(const int16x16 *)(qdata+1600)
221 -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350,
222 #define qround32_x16 *(const int16x16 *)(qdata+1616)
223 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
224 #define scaledzeta_x16_4_1 *(const int16x16 *)(qdata+1632)
225 -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777,
226 #define scaledzeta_x16_4_3 *(const int16x16 *)(qdata+1648)
227 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777,
228 #define scaledzeta_x16_8_1 *(const int16x16 *)(qdata+1664)
229 -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625,
230 #define scaledzeta_x16_8_7 *(const int16x16 *)(qdata+1680)
231 -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182,
232 }
233 } ;
234
235 static const vec1696 qdata_10753 = { .data = {
236 // precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
237 1018, 1018, 1018, 1018, 3688, 3688, 3688, 3688, 1018, 1018, 1018, 1018, 3688, 3688, 3688, 3688,
238 // precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
239 -223, -223, -223, -223, -4188, -4188, -4188, -4188, -223, -223, -223, -223, -4188, -4188, -4188, -4188,
240 // precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
241 1018, 1018, 1018, 1018, 4188, 4188, 4188, 4188, 1018, 1018, 1018, 1018, 4188, 4188, 4188, 4188,
242 // precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
243 223, 223, 223, 223, -3688, -3688, -3688, -3688, 223, 223, 223, 223, -3688, -3688, -3688, -3688,
244 // precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
245 1018, 1018, 1018, 1018, -376, -376, -376, -376, 1018, 1018, 1018, 1018, -376, -376, -376, -376,
246 // precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
247 3688, 3688, 3688, 3688, -3686, -3686, -3686, -3686, 3688, 3688, 3688, 3688, -3686, -3686, -3686, -3686,
248 // precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
249 1018, 1018, 1018, 1018, -2413, -2413, -2413, -2413, 1018, 1018, 1018, 1018, -2413, -2413, -2413, -2413,
250 // precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
251 4188, 4188, 4188, 4188, -357, -357, -357, -357, 4188, 4188, 4188, 4188, -357, -357, -357, -357,
252 // precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
253 1018, -3364, -376, 4855, 3688, 425, -3686, 2695, 1018, -3364, -376, 4855, 3688, 425, -3686, 2695,
254 // precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
255 -223, -3784, 357, -2236, -4188, 4544, 2413, 730, -223, -3784, 357, -2236, -4188, 4544, 2413, 730,
256 // precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
257 1018, -730, -2413, -4544, 4188, 2236, -357, 3784, 1018, -730, -2413, -4544, 4188, 2236, -357, 3784,
258 // precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
259 223, -2695, 3686, -425, -3688, -4855, 376, 3364, 223, -2695, 3686, -425, -3688, -4855, 376, 3364,
260 // precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
261 1018, -5175, -3364, 2503, -376, 1341, 4855, -4875, 1018, -5175, -3364, 2503, -376, 1341, 4855, -4875,
262 // precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
263 3688, -2629, 425, -4347, -3686, 3823, 2695, -4035, 3688, -2629, 425, -4347, -3686, 3823, 2695, -4035,
264 // precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
265 1018, 5063, -730, 341, -2413, -3012, -4544, -5213, 1018, 5063, -730, 341, -2413, -3012, -4544, -5213,
266 // precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
267 4188, 1520, 2236, 1931, -357, 918, 3784, 4095, 4188, 1520, 2236, 1931, -357, 918, 3784, 4095,
268 // precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
269 1018, 3085, -5175, 2982, -3364, -4744, 2503, -4129, -376, -2576, 1341, -193, 4855, 3062, -4875, 4,
270 // precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
271 3688, 2388, -2629, -4513, 425, 4742, -4347, 2935, -3686, -544, 3823, -2178, 2695, 847, -4035, 268,
272 // precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
273 -223, -1299, -4095, -1287, -3784, -4876, -918, 3091, 357, -4189, -1931, 4616, -2236, 2984, -1520, -3550,
274 // precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
275 -4188, -1009, 5213, -205, 4544, -4102, 3012, 2790, 2413, -1085, -341, -2565, 730, -4379, -5063, -1284,
276 // precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
277 1018, 1284, 5063, 4379, -730, 2565, 341, 1085, -2413, -2790, -3012, 4102, -4544, 205, -5213, 1009,
278 // precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
279 4188, 3550, 1520, -2984, 2236, -4616, 1931, 4189, -357, -3091, 918, 4876, 3784, 1287, 4095, 1299,
280 // precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
281 223, -268, 4035, -847, -2695, 2178, -3823, 544, 3686, -2935, 4347, -4742, -425, 4513, 2629, -2388,
282 // precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
283 -3688, -4, 4875, -3062, -4855, 193, -1341, 2576, 376, 4129, -2503, 4744, 3364, -2982, 5175, -3085,
284 // precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
285 1018, 5116, 3085, -3615, -5175, 400, 2982, 3198, -3364, 2234, -4744, -4828, 2503, 326, -4129, -512,
286 // precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
287 -376, 1068, -2576, -4580, 1341, 3169, -193, -2998, 4855, -635, 3062, -4808, -4875, -2740, 4, 675,
288 // precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
289 3688, -1324, 2388, 5114, -2629, 5294, -4513, -794, 425, -864, 4742, -886, -4347, 336, 2935, -2045,
290 // precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
291 -3686, -3715, -544, 4977, 3823, -2737, -2178, 3441, 2695, 467, 847, 454, -4035, -779, 268, 2213,
292 // precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
293 1018, 1615, 1284, 2206, 5063, 5064, 4379, 472, -730, -5341, 2565, -4286, 341, 2981, 1085, -1268,
294 // precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
295 -2413, -3057, -2790, -2884, -3012, -1356, 4102, -3337, -4544, 5023, 205, -636, -5213, 909, 1009, -2973,
296 // precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
297 4188, 2271, 3550, -1572, 1520, 1841, -2984, 970, 2236, -4734, -4616, 578, 1931, -116, 4189, 1586,
298 // precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
299 -357, -2774, -3091, -1006, 918, -5156, 4876, 4123, 3784, -567, 1287, 151, 4095, 1458, 1299, 2684,
300 // precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
301 1018, -3260, 5116, -1722, 3085, 5120, -3615, 3760, -5175, 73, 400, 4254, 2982, 2788, 3198, -2657,
302 // precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
303 -3364, 569, 2234, 1930, -4744, -2279, -4828, 5215, 2503, -4403, 326, 1639, -4129, 5068, -512, -5015,
304 // precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
305 -376, -4859, 1068, -40, -2576, 4003, -4580, -4621, 1341, 2487, 3169, -2374, -193, 2625, -2998, 4784,
306 // precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
307 4855, 825, -635, 2118, 3062, -2813, -4808, -4250, -4875, -2113, -2740, -4408, 4, -1893, 675, 458,
308 // precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
309 1018, 5120, 400, -2657, -4744, -4403, -512, -40, 1341, 2625, -635, -4250, 4, -3360, 5114, -5313,
310 // precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
311 425, -2151, 336, -2662, -544, 5334, 3441, 2117, -4035, 2205, -2684, -3570, -1287, -4973, 5156, 2419,
312 // precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
313 357, 1204, -578, 1635, 2984, -1111, -2271, 4359, 5213, -2449, 3337, 3453, 2790, 554, -2981, -1409,
314 // precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
315 730, -279, -2206, 3524, -3085, -73, -3198, -1930, -2503, -5068, -1068, 4621, 193, -825, 4808, 4408,
316 // precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
317 1018, 4428, 5064, -4000, 2565, 573, -1268, 3125, -3012, -4144, 5023, 1927, 1009, -2139, -1572, 3535,
318 // precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
319 2236, 663, -116, 4967, -3091, -854, 4123, 1160, 4095, -1349, -2213, 1782, -847, 2062, 2737, 624,
320 // precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
321 3686, -2283, 886, 4889, 4513, -4601, 1324, 1893, 4875, -2118, 2998, -2487, 2576, 5015, -326, 2279,
322 // precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
323 3364, -4254, 3615, 3260, -1284, -1381, -472, -3891, -341, 2087, 3057, 4720, -4102, 3410, 636, 1689,
324 // precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
325 1018, -3524, 1615, 5268, 1284, 4428, 2206, -834, 5063, 1381, 5064, 279, 4379, 2439, 472, -4000,
326 // precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
327 -730, -2015, -5341, 3891, 2565, 1409, -4286, 2605, 341, 573, 2981, 5356, 1085, -2087, -1268, -554,
328 // precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
329 -2413, 3135, -3057, 3125, -2790, -778, -2884, -4720, -3012, -3453, -1356, -355, 4102, -4144, -3337, -152,
330 // precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
331 -4544, -3410, 5023, 2449, 205, -97, -636, 1927, -5213, 2624, 909, -1689, 1009, -4359, -2973, -3419,
332 // q_x16
333 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753,
334 // qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
335 -6, -6, -6, -6, -408, -408, -408, -408, -6, -6, -6, -6, -408, -408, -408, -408,
336 // qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
337 -27359, -27359, -27359, -27359, 1956, 1956, 1956, 1956, -27359, -27359, -27359, -27359, 1956, 1956, 1956, 1956,
338 // qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
339 -6, -6, -6, -6, -1956, -1956, -1956, -1956, -6, -6, -6, -6, -1956, -1956, -1956, -1956,
340 // qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
341 27359, 27359, 27359, 27359, 408, 408, 408, 408, 27359, 27359, 27359, 27359, 408, 408, 408, 408,
342 // qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
343 -6, -6, -6, -6, -20856, -20856, -20856, -20856, -6, -6, -6, -6, -20856, -20856, -20856, -20856,
344 // qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
345 -408, -408, -408, -408, -21094, -21094, -21094, -21094, -408, -408, -408, -408, -21094, -21094, -21094, -21094,
346 // qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
347 -6, -6, -6, -6, -10093, -10093, -10093, -10093, -6, -6, -6, -6, -10093, -10093, -10093, -10093,
348 // qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
349 -1956, -1956, -1956, -1956, -28517, -28517, -28517, -28517, -1956, -1956, -1956, -1956, -28517, -28517, -28517, -28517,
350 // qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
351 -6, -9508, -20856, -29449, -408, 18345, -21094, -7033, -6, -9508, -20856, -29449, -408, 18345, -21094, -7033,
352 // qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
353 -27359, -16072, 28517, -12476, 1956, -28224, 10093, 16090, -27359, -16072, 28517, -12476, 1956, -28224, 10093, 16090,
354 // qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
355 -6, -16090, -10093, 28224, -1956, 12476, -28517, 16072, -6, -16090, -10093, 28224, -1956, 12476, -28517, 16072,
356 // qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
357 27359, 7033, 21094, -18345, 408, 29449, 20856, 9508, 27359, 7033, 21094, -18345, 408, 29449, 20856, 9508,
358 // qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
359 -6, -3639, -9508, 25543, -20856, 829, -29449, -17675, -6, -3639, -9508, 25543, -20856, 829, -29449, -17675,
360 // qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
361 -408, 18363, 18345, 7429, -21094, -10001, -7033, -4547, -408, 18363, 18345, 7429, -21094, -10001, -7033, -4547,
362 // qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
363 -6, 28103, -16090, 3925, -10093, 7228, 28224, 11683, -6, 28103, -16090, 3925, -10093, 7228, 28224, 11683,
364 // qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
365 -1956, -23056, 12476, 14731, -28517, 26518, 16072, 14847, -1956, -23056, 12476, 14731, -28517, 26518, 16072, 14847,
366 // qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
367 -6, -5619, -3639, -12378, -9508, 15736, 25543, 23007, -20856, -27152, 829, -22209, -29449, -20490, -17675, 22532,
368 // qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
369 -408, 16724, 18363, 22623, 18345, 5766, 7429, -31369, -21094, 15840, -10001, 19326, -7033, 3407, -4547, 2316,
370 // qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
371 -27359, 6381, -14847, 8441, -16072, -6924, -26518, -4589, 28517, 12707, -14731, -15864, -12476, 31656, 23056, 24098,
372 // qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
373 1956, -31217, -11683, -24269, -28224, -5126, -7228, 20198, 10093, -573, -3925, -14341, 16090, 23781, -28103, -23812,
374 // qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
375 -6, 23812, 28103, -23781, -16090, 14341, 3925, 573, -10093, -20198, 7228, 5126, 28224, 24269, 11683, 31217,
376 // qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
377 -1956, -24098, -23056, -31656, 12476, 15864, 14731, -12707, -28517, 4589, 26518, 6924, 16072, -8441, 14847, -6381,
378 // qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
379 27359, -2316, 4547, -3407, 7033, -19326, 10001, -15840, 21094, 31369, -7429, -5766, -18345, -22623, -18363, -16724,
380 // qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
381 408, -22532, 17675, 20490, 29449, 22209, -829, 27152, 20856, -23007, -25543, -15736, 9508, 12378, 3639, 5619,
382 // qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
383 -6, -17412, -5619, 2017, -3639, 24976, -12378, 24702, -9508, -31558, 15736, 1316, 25543, -31418, 23007, -512,
384 // qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
385 -20856, -13268, -27152, 22044, 829, 8801, -22209, -12214, -29449, 11141, -20490, -17096, -17675, 32076, 22532, 17571,
386 // qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
387 -408, 13012, 16724, 4090, 18363, -30546, 22623, 16614, 18345, -17248, 5766, 22666, 7429, -7856, -31369, 31235,
388 // qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
389 -21094, 28541, 15840, -30351, -10001, -177, 19326, -31887, -7033, 25555, 3407, -31290, -4547, -13579, 2316, -2395,
390 // qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
391 -6, 4175, 23812, 7326, 28103, 17352, -23781, -28200, -16090, 11555, 14341, 6978, 3925, -1627, 573, 780,
392 // qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
393 -10093, 32271, -20198, 7356, 7228, 29364, 5126, 27895, 28224, -609, 24269, 21892, 11683, -7795, 31217, -18845,
394 // qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
395 -1956, 29407, -24098, -7716, -23056, -719, -31656, -8246, 12476, -26238, 15864, 11842, 14731, 1932, -12707, -11726,
396 // qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
397 -28517, 4394, 4589, 2066, 26518, -11300, 6924, -24037, 16072, 969, -8441, 14999, 14847, -11854, -6381, -19844,
398 // qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
399 -6, -13500, -17412, 32070, -5619, 5120, 2017, 11952, -3639, 1609, 24976, 9374, -12378, -23836, 24702, -8289,
400 // qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
401 -9508, -22471, -31558, 25482, 15736, -8935, 1316, 32351, 25543, 19661, -31418, 8295, 23007, -25652, -512, -19863,
402 // qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
403 -20856, 6917, -13268, -28712, -27152, 20899, 22044, 4083, 829, 951, 8801, 29370, -22209, 24641, -12214, 12976,
404 // qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
405 -29449, -22215, 11141, -29626, -20490, 30467, -17096, 13158, -17675, -24129, 32076, 7880, 22532, -30053, 17571, -8758,
406 // qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
407 -6, 5120, 24976, -8289, 15736, 19661, -512, -28712, 829, 24641, 11141, 13158, 22532, 13024, 4090, -27329,
408 // qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
409 18345, -8807, -7856, -20070, 15840, -1834, -31887, -18875, -4547, 18077, 19844, -23026, 8441, -12653, 11300, 11123,
410 // qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
411 28517, 31924, -11842, -14237, 31656, 16809, -29407, -5369, -11683, -16273, -27895, -29827, 20198, 7722, 1627, 9343,
412 // qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
413 16090, -15127, -7326, -6716, 5619, -1609, -24702, -25482, -25543, 25652, 13268, -4083, 22209, 22215, 17096, -7880,
414 // qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
415 -6, -26292, 17352, 12384, 14341, 61, 780, 23093, 7228, -12336, -609, -7801, 31217, -6747, -7716, 6095,
416 // qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
417 12476, 15511, 1932, 11623, 4589, 6314, -24037, -19320, 14847, 19643, 2395, -21770, -3407, -17394, 177, -23952,
418 // qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
419 21094, -31467, -22666, -1767, -22623, -14329, -13012, 30053, 17675, 29626, 12214, -951, 27152, 19863, 31418, 8935,
420 // qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
421 9508, -9374, -2017, 13500, -23812, -29541, 28200, 20173, -3925, -24025, -32271, -19856, -5126, -26286, -21892, -4967,
422 // qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
423 -6, 6716, 4175, -13164, 23812, -26292, 7326, -12098, 28103, 29541, 17352, 15127, -23781, -7289, -28200, 12384,
424 // qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
425 -16090, -29151, 11555, -20173, 14341, -9343, 6978, -22483, 3925, 61, -1627, 23788, 573, 24025, 780, -7722,
426 // qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
427 -10093, -18881, 32271, 23093, -20198, -24330, 7356, 19856, 7228, 29827, 29364, 15517, 5126, -12336, 27895, -4248,
428 // qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
429 28224, 26286, -609, 16273, 24269, -5729, 21892, -7801, 11683, -30144, -7795, 4967, 31217, 5369, -18845, -8027,
430 // qinvscaledzeta_x16_4_1
431 -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359,
432 // qinvscaledzeta_x16_4_3
433 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359,
434 // qinvscaledzeta_x16_8_1
435 -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408,
436 // qinvscaledzeta_x16_8_7
437 -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956,
438 // qround32_x16
439 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
440 // scaledzeta_x16_4_1
441 -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223,
442 // scaledzeta_x16_4_3
443 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223,
444 // scaledzeta_x16_8_1
445 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688,
446 // scaledzeta_x16_8_7
447 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188,
448 }
449 } ;
450
add_x16(int16x16 a,int16x16 b)451 static inline int16x16 add_x16(int16x16 a, int16x16 b) {
452 return _mm256_add_epi16(a, b);
453 }
454
sub_x16(int16x16 a,int16x16 b)455 static inline int16x16 sub_x16(int16x16 a, int16x16 b) {
456 return _mm256_sub_epi16(a, b);
457 }
458
mulmod_scaled_x16(int16x16 x,int16x16 y,int16x16 yqinv,const int16 * qdata)459 static inline int16x16 mulmod_scaled_x16(int16x16 x, int16x16 y, int16x16 yqinv, const int16 *qdata) {
460 int16x16 b = _mm256_mulhi_epi16(x, y);
461 int16x16 d = _mm256_mullo_epi16(x, yqinv);
462 int16x16 e = _mm256_mulhi_epi16(d, q_x16);
463 return sub_x16(b, e);
464 }
465
reduce_x16(int16x16 x,const int16 * qdata)466 static inline int16x16 reduce_x16(int16x16 x, const int16 *qdata) {
467 int16x16 y = _mm256_mulhrs_epi16(x, qround32_x16);
468 y = _mm256_mullo_epi16(y, q_x16);
469 return sub_x16(x, y);
470 }
471
472 // ----- codegen pass 1
473 //
474 // startntt 512
475 // startbatch 512
476 // // ----- PRECONDITIONS
477 // physical_map (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
478 // // transform size 512
479 // // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8]
480 // // transforms per batch 1
481 // // batch indexing []
482 // // total batch size 512
483 //
484 // // modulus x^512-1 pos 0:512 q 7681,10753 bound 512*(5629,5800)
485 // assertranges ...
486 //
487 // // ----- LAYER 1
488 //
489 // // butterfly(0,256,1,256,1,0)
490 // butterfly 0 256 1 256 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
491 //
492 // // ----- POSTCONDITIONS AFTER LAYER 1
493 // // transform size 512
494 // // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8]
495 // // transforms per batch 1
496 // // batch indexing []
497 // // total batch size 512
498 //
499 // // modulus x^256-1 pos 0:256 q 7681,10753 bound 256*(11258,11600)
500 // assertranges ...
501 //
502 // // modulus x^256+1 pos 256:512 q 7681,10753 bound 256*(11258,11600)
503 // assertranges ...
504 //
505 // // ----- LAYER 2
506 //
507 // // reduce_ifreverse(0,64,1)
508 // reduce_ifreverse 0 64 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
509 //
510 // // reduce_ifreverse(256,320,1)
511 // reduce_ifreverse 256 320 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
512 //
513 // // butterfly(0,128,1,128,1,0)
514 // butterfly 0 128 1 128 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
515 //
516 // // butterfly(256,384,1,128,4,1)
517 // butterfly 256 384 1 128 4 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
518 //
519 // // ----- POSTCONDITIONS AFTER LAYER 2
520 // // transform size 512
521 // // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8]
522 // // transforms per batch 1
523 // // batch indexing []
524 // // total batch size 512
525 //
526 // // modulus x^128-1 pos 0:128 q 7681,10753 bound 128*(22516,23200)
527 // assertranges ...
528 //
529 // // modulus x^128+1 pos 128:256 q 7681,10753 bound 128*(22516,23200)
530 // assertranges ...
531 //
532 // // modulus x^128-zeta4 pos 256:384 q 7681,10753 bound 128*(15747,17016)
533 // assertranges ...
534 //
535 // // modulus x^128+zeta4 pos 384:512 q 7681,10753 bound 128*(15747,17016)
536 // assertranges ...
537 //
538 // // ----- LAYER 3
539 //
540 // // reduce_ifforward(64,128,1)
541 // reduce_ifforward 64 128 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
542 //
543 // // butterfly(0,64,1,64,1,0)
544 // butterfly 0 64 1 64 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
545 //
546 // // butterfly(128,192,1,64,4,1)
547 // butterfly 128 192 1 64 4 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
548 //
549 // // butterfly(256,320,1,64,8,1)
550 // butterfly 256 320 1 64 8 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
551 //
552 // // butterfly(384,448,1,64,8,-1)
553 // butterfly 384 448 1 64 8 7 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
554 //
555 // // reduce(0,64,1)
556 // reduce 0 64 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
557 //
558 // // twist(64,128,1,128,1)
559 // twist 64 128 1 128 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
560 //
561 // // twist(128,192,1,256,1)
562 // twist 128 192 1 256 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
563 //
564 // // twist(192,256,1,256,-1)
565 // twist 192 256 1 256 255 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
566 //
567 // // twist(256,320,1,512,1)
568 // twist 256 320 1 512 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
569 //
570 // // twist(320,384,1,512,5)
571 // twist 320 384 1 512 5 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
572 //
573 // // twist(384,448,1,512,-1)
574 // twist 384 448 1 512 511 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
575 //
576 // // twist(448,512,1,512,-5)
577 // twist 448 512 1 512 507 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
578 //
579 // // physical_permute(3,6)
580 // physical_permute (3, 6) (0, 1, 2, 3, 4, 5, 6, 7, 8) () (0, 1, 2, 6, 4, 5, 3, 7, 8) ()
581 //
582 // // fold(256)
583 // physical_unmap (0, 1, 2, 6, 4, 5, 3, 7, 8) ()
584 // physical_map (0, 1, 2, 6, 4, 5, 3, 7) (8,)
585 //
586 // // fold(128)
587 // physical_unmap (0, 1, 2, 6, 4, 5, 3, 7) (8,)
588 // physical_map (0, 1, 2, 6, 4, 5, 3) (7, 8)
589 //
590 // // fold(64)
591 // physical_unmap (0, 1, 2, 6, 4, 5, 3) (7, 8)
592 // physical_map (0, 1, 2, 6, 4, 5) (3, 7, 8)
593 //
594 // // nextbatch()
595 // stopbatch 512
596 // startbatch 512
597 //
598 // // halfbatch()
599 // physical_unmap (0, 1, 2, 6, 4, 5) (3, 7, 8)
600 // stopbatch 512
601 // doublereps
602 // startbatch 256
603 // physical_map (0, 1, 2, 6, 4, 5) (3, 7)
604 //
605 // // halfbatch()
606 // physical_unmap (0, 1, 2, 6, 4, 5) (3, 7)
607 // stopbatch 256
608 // doublereps
609 // startbatch 128
610 // physical_map (0, 1, 2, 6, 4, 5) (3,)
611 //
612 // // ----- POSTCONDITIONS AFTER LAYER 3
613 // // transform size 64
614 // // transform indexing [0, 1, 2, 6, 4, 5]
615 // // transforms per batch 2
616 // // batch indexing [3]
617 // // total batch size 128
618 //
619 // // modulus x^64-1 pos 0:64 q 7681,10753 bound 1*(5629,5827) 1*(5629,7613) 1*(5629,7666) 1*(5629,7264) 1*(5629,7639) 1*(5629,7591) 1*(5629,7291) 1*(5629,7204) ...
620 // assertranges ...
621 //
622 // // ----- LAYER 4
623 //
624 // // butterfly(0,32,1,32,1,0)
625 // butterfly 0 32 1 32 1 0 (0, 1, 2, 6, 4, 5) (3,)
626 //
627 // // ----- POSTCONDITIONS AFTER LAYER 4
628 // // transform size 64
629 // // transform indexing [0, 1, 2, 6, 4, 5]
630 // // transforms per batch 2
631 // // batch indexing [3]
632 // // total batch size 128
633 //
634 // // modulus x^32-1 pos 0:32 q 7681,10753 bound 1*(11258,13035) 1*(11258,14721) 1*(11258,14855) 1*(11258,14877) 1*(11258,14690) 1*(11258,15282) 1*(11258,14641) 1*(11258,14272) ...
635 // assertranges ...
636 //
637 // // modulus x^32+1 pos 32:64 q 7681,10753 bound 1*(11258,13035) 1*(11258,14721) 1*(11258,14855) 1*(11258,14877) 1*(11258,14690) 1*(11258,15282) 1*(11258,14641) 1*(11258,14272) ...
638 // assertranges ...
639 //
640 // // ----- LAYER 5
641 //
642 // // butterfly(0,16,1,16,1,0)
643 // butterfly 0 16 1 16 1 0 (0, 1, 2, 6, 4, 5) (3,)
644 //
645 // // butterfly(32,48,1,16,4,1)
646 // butterfly 32 48 1 16 4 1 (0, 1, 2, 6, 4, 5) (3,)
647 //
648 // // reduce(0,16,1)
649 // reduce 0 16 1 (0, 1, 2, 6, 4, 5) (3,)
650 //
651 // // twist(16,32,1,32,1)
652 // twist 16 32 1 32 1 (0, 1, 2, 6, 4, 5) (3,)
653 //
654 // // twist(32,48,1,64,1)
655 // twist 32 48 1 64 1 (0, 1, 2, 6, 4, 5) (3,)
656 //
657 // // twist(48,64,1,64,-1)
658 // twist 48 64 1 64 63 (0, 1, 2, 6, 4, 5) (3,)
659 //
660 // // physical_permute(0,1,2,5)
661 // physical_permute (0, 1, 2, 5) (0, 1, 2, 6, 4, 5) (3,) (1, 2, 5, 6, 4, 0) (3,)
662 //
663 // // fold(32)
664 // physical_unmap (1, 2, 5, 6, 4, 0) (3,)
665 // physical_map (1, 2, 5, 6, 4) (0, 3)
666 //
667 // // fold(16)
668 // physical_unmap (1, 2, 5, 6, 4) (0, 3)
669 // physical_map (1, 2, 5, 6) (0, 3, 4)
670 //
671 // // ----- POSTCONDITIONS AFTER LAYER 5
672 // // transform size 16
673 // // transform indexing [1, 2, 5, 6]
674 // // transforms per batch 8
675 // // batch indexing [0, 3, 4]
676 // // total batch size 128
677 //
678 // // modulus x^16-1 pos 0:16 q 7681,10753 bound 1*(5629,5800) 1*(5629,6967) 1*(5629,6418) 1*(5629,7585) 1*(5629,7017) 1*(5629,6328) 1*(5629,7033) 1*(5629,6943) ...
679 // assertranges ...
680 //
681 // // ----- LAYER 6
682 //
683 // // butterfly(0,8,1,8,1,0)
684 // butterfly 0 8 1 8 1 0 (1, 2, 5, 6) (0, 3, 4)
685 //
686 // // physical_permute(1,2,4)
687 // physical_permute (1, 2, 4) (1, 2, 5, 6) (0, 3, 4) (2, 4, 5, 6) (0, 3, 1)
688 //
689 // // nextbatch()
690 // stopbatch 128
691 // startbatch 128
692 //
693 // // ----- POSTCONDITIONS AFTER LAYER 6
694 // // transform size 16
695 // // transform indexing [2, 4, 5, 6]
696 // // transforms per batch 8
697 // // batch indexing [0, 3, 1]
698 // // total batch size 128
699 //
700 // // modulus x^8-1 pos 0:8 q 7681,10753 bound 1*(11258,12447) 1*(11258,14071) 1*(11258,12488) 1*(11258,14310) 1*(11258,14287) 1*(11258,13674) 1*(11258,13574) 1*(11258,13555)
701 // assertranges ...
702 //
703 // // modulus x^8+1 pos 8:16 q 7681,10753 bound 1*(11258,12447) 1*(11258,14071) 1*(11258,12488) 1*(11258,14310) 1*(11258,14287) 1*(11258,13674) 1*(11258,13574) 1*(11258,13555)
704 // assertranges ...
705 //
706 // // ----- LAYER 7
707 //
708 // // butterfly(0,4,1,4,1,0)
709 // butterfly 0 4 1 4 1 0 (2, 4, 5, 6) (0, 3, 1)
710 //
711 // // butterfly(8,12,1,4,4,1)
712 // butterfly 8 12 1 4 4 1 (2, 4, 5, 6) (0, 3, 1)
713 //
714 // // reduce(0,4,1)
715 // reduce 0 4 1 (2, 4, 5, 6) (0, 3, 1)
716 //
717 // // twist(4,8,1,8,1)
718 // twist 4 8 1 8 1 (2, 4, 5, 6) (0, 3, 1)
719 //
720 // // twist(8,12,1,16,1)
721 // twist 8 12 1 16 1 (2, 4, 5, 6) (0, 3, 1)
722 //
723 // // twist(12,16,1,16,-1)
724 // twist 12 16 1 16 15 (2, 4, 5, 6) (0, 3, 1)
725 //
726 // // physical_permute(2,6)
727 // physical_permute (2, 6) (2, 4, 5, 6) (0, 3, 1) (6, 4, 5, 2) (0, 3, 1)
728 //
729 // // fold(8)
730 // physical_unmap (6, 4, 5, 2) (0, 3, 1)
731 // physical_map (6, 4, 5) (0, 1, 2, 3)
732 //
733 // // fold(4)
734 // physical_unmap (6, 4, 5) (0, 1, 2, 3)
735 // physical_map (6, 4) (0, 1, 2, 3, 5)
736 //
737 // // ----- POSTCONDITIONS AFTER LAYER 7
738 // // transform size 4
739 // // transform indexing [6, 4]
740 // // transforms per batch 32
741 // // batch indexing [0, 1, 2, 3, 5]
742 // // total batch size 128
743 //
744 // // modulus x^4-1 pos 0:4 q 7681,10753 bound 1*(5629,5800) 1*(5629,6938) 1*(5629,6521) 1*(5629,7157)
745 // assertranges ...
746 //
747 // // ----- LAYER 8
748 //
749 // // butterfly(0,2,1,2,1,0)
750 // butterfly 0 2 1 2 1 0 (6, 4) (0, 1, 2, 3, 5)
751 //
752 // // ----- POSTCONDITIONS AFTER LAYER 8
753 // // transform size 4
754 // // transform indexing [6, 4]
755 // // transforms per batch 32
756 // // batch indexing [0, 1, 2, 3, 5]
757 // // total batch size 128
758 //
759 // // modulus x^2-1 pos 0:2 q 7681,10753 bound 1*(11258,12321) 1*(11258,14095)
760 // assertranges ...
761 //
762 // // modulus x^2+1 pos 2:4 q 7681,10753 bound 1*(11258,12321) 1*(11258,14095)
763 // assertranges ...
764 //
765 // // ----- LAYER 9
766 //
767 // // butterfly(0,1,1,1,1,0)
768 // butterfly 0 1 1 1 1 0 (6, 4) (0, 1, 2, 3, 5)
769 //
770 // // butterfly(2,3,1,1,4,1)
771 // butterfly 2 3 1 1 4 1 (6, 4) (0, 1, 2, 3, 5)
772 //
773 // // ----- POSTCONDITIONS AFTER LAYER 9
774 // // transform size 4
775 // // transform indexing [6, 4]
776 // // transforms per batch 32
777 // // batch indexing [0, 1, 2, 3, 5]
778 // // total batch size 128
779 //
780 // // modulus x^1-1 pos 0:1 q 7681,10753 bound 1*(22516,26416)
781 // assertranges ...
782 //
783 // // modulus x^1+1 pos 1:2 q 7681,10753 bound 1*(22516,26416)
784 // assertranges ...
785 //
786 // // modulus x^1-zeta4 pos 2:3 q 7681,10753 bound 1*(15747,17745)
787 // assertranges ...
788 //
789 // // modulus x^1+zeta4 pos 3:4 q 7681,10753 bound 1*(15747,17745)
790 // assertranges ...
791 // stopbatch 128
792 // physical_unmap (6, 4) (0, 1, 2, 3, 5)
793 // stopntt 512
794
795 // ----- codegen pass 2
796 //
797 // startntt 512
798 // startbatch 512
799 // vector_butterfly 0 256 1 0
800 // vector_butterfly 128 384 1 0
801 // vector_butterfly 64 320 1 0
802 // vector_butterfly 192 448 1 0
803 // vector_reduce_ifreverse 0
804 // vector_reduce_ifreverse 256
805 // vector_butterfly 0 128 1 0
806 // vector_butterfly 64 192 1 0
807 // vector_butterfly 256 384 4 1
808 // vector_butterfly 320 448 4 1
809 // vector_reduce_ifforward 64
810 // vector_butterfly 0 64 1 0
811 // vector_butterfly 128 192 4 1
812 // vector_butterfly 256 320 8 1
813 // vector_butterfly 384 448 8 7
814 // vector_reduce 0
815 // vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
816 // vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
817 // vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
818 // vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
819 // vector_twist 320 512 5 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
820 // vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
821 // vector_twist 448 512 507 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
822 // vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
823 // vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
824 // vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
825 // vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
826 // stopbatch 512
827 // startbatch 512
828 // vector_butterfly 16 272 1 0
829 // vector_butterfly 144 400 1 0
830 // vector_butterfly 80 336 1 0
831 // vector_butterfly 208 464 1 0
832 // vector_reduce_ifreverse 16
833 // vector_reduce_ifreverse 272
834 // vector_butterfly 16 144 1 0
835 // vector_butterfly 80 208 1 0
836 // vector_butterfly 272 400 4 1
837 // vector_butterfly 336 464 4 1
838 // vector_reduce_ifforward 80
839 // vector_butterfly 16 80 1 0
840 // vector_butterfly 144 208 4 1
841 // vector_butterfly 272 336 8 1
842 // vector_butterfly 400 464 8 7
843 // vector_reduce 16
844 // vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
845 // vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
846 // vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
847 // vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
848 // vector_twist 336 512 5 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
849 // vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
850 // vector_twist 464 512 507 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
851 // vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
852 // vector_permute 144 208 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
853 // vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
854 // vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
855 // stopbatch 512
856 // startbatch 512
857 // vector_butterfly 32 288 1 0
858 // vector_butterfly 160 416 1 0
859 // vector_butterfly 96 352 1 0
860 // vector_butterfly 224 480 1 0
861 // vector_reduce_ifreverse 32
862 // vector_reduce_ifreverse 288
863 // vector_butterfly 32 160 1 0
864 // vector_butterfly 96 224 1 0
865 // vector_butterfly 288 416 4 1
866 // vector_butterfly 352 480 4 1
867 // vector_reduce_ifforward 96
868 // vector_butterfly 32 96 1 0
869 // vector_butterfly 160 224 4 1
870 // vector_butterfly 288 352 8 1
871 // vector_butterfly 416 480 8 7
872 // vector_reduce 32
873 // vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
874 // vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
875 // vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
876 // vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
877 // vector_twist 352 512 5 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
878 // vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
879 // vector_twist 480 512 507 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
880 // vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
881 // vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
882 // vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
883 // vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
884 // stopbatch 512
885 // startbatch 512
886 // vector_butterfly 48 304 1 0
887 // vector_butterfly 176 432 1 0
888 // vector_butterfly 112 368 1 0
889 // vector_butterfly 240 496 1 0
890 // vector_reduce_ifreverse 48
891 // vector_reduce_ifreverse 304
892 // vector_butterfly 48 176 1 0
893 // vector_butterfly 112 240 1 0
894 // vector_butterfly 304 432 4 1
895 // vector_butterfly 368 496 4 1
896 // vector_reduce_ifforward 112
897 // vector_butterfly 48 112 1 0
898 // vector_butterfly 176 240 4 1
899 // vector_butterfly 304 368 8 1
900 // vector_butterfly 432 496 8 7
901 // vector_reduce 48
902 // vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
903 // vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
904 // vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
905 // vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
906 // vector_twist 368 512 5 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
907 // vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
908 // vector_twist 496 512 507 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
909 // vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
910 // vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
911 // vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
912 // vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
913 // stopbatch 512
914 // doublereps
915 // doublereps
916 // startbatch 128
917 // vector_butterfly 0 32 1 0
918 // vector_butterfly 64 96 1 0
919 // vector_butterfly 16 48 1 0
920 // vector_butterfly 80 112 1 0
921 // vector_butterfly 0 16 1 0
922 // vector_butterfly 64 80 1 0
923 // vector_butterfly 32 48 4 1
924 // vector_butterfly 96 112 4 1
925 // vector_reduce 0
926 // vector_reduce 64
927 // vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
928 // vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
929 // vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
930 // vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
931 // vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
932 // vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
933 // vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
934 // vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
935 // vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
936 // vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
937 // vector_butterfly 0 64 1 0
938 // vector_butterfly 32 96 1 0
939 // vector_butterfly 16 80 1 0
940 // vector_butterfly 48 112 1 0
941 // vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
942 // vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
943 // vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
944 // vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
945 // stopbatch 128
946 // startbatch 128
947 // vector_butterfly 0 32 1 0
948 // vector_butterfly 16 48 1 0
949 // vector_butterfly 64 96 4 1
950 // vector_butterfly 80 112 4 1
951 // vector_reduce 0
952 // vector_reduce 16
953 // vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
954 // vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
955 // vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
956 // vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
957 // vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
958 // vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
959 // vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
960 // vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
961 // vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
962 // vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
963 // vector_butterfly 0 16 1 0
964 // vector_butterfly 64 80 1 0
965 // vector_butterfly 32 48 1 0
966 // vector_butterfly 96 112 1 0
967 // vector_butterfly 0 64 1 0
968 // vector_butterfly 32 96 1 0
969 // vector_butterfly 16 80 4 1
970 // vector_butterfly 48 112 4 1
971 // stopbatch 128
972 // stopntt 512
973 // startntt 512
974
ntt512(int16 * f,int reps,const int16 * qdata)975 static void ntt512(int16 *f, int reps, const int16 *qdata) {
976 // startbatch 512
977 for (long long r = 0; r < reps; ++r) {
978 // vector_butterfly 0 256 1 0
979 int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0));
980 int16x16 a16 = _mm256_loadu_si256((int16x16 *) (f + 256));
981 int16x16 b0 = add_x16(a0, a16);
982 int16x16 b16 = sub_x16(a0, a16);
983 // vector_butterfly 128 384 1 0
984 int16x16 a8 = _mm256_loadu_si256((int16x16 *) (f + 128));
985 int16x16 a24 = _mm256_loadu_si256((int16x16 *) (f + 384));
986 int16x16 b8 = add_x16(a8, a24);
987 int16x16 b24 = sub_x16(a8, a24);
988 // vector_butterfly 64 320 1 0
989 int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64));
990 int16x16 a20 = _mm256_loadu_si256((int16x16 *) (f + 320));
991 int16x16 b4 = add_x16(a4, a20);
992 int16x16 b20 = sub_x16(a4, a20);
993 // vector_butterfly 192 448 1 0
994 int16x16 a12 = _mm256_loadu_si256((int16x16 *) (f + 192));
995 int16x16 a28 = _mm256_loadu_si256((int16x16 *) (f + 448));
996 int16x16 b12 = add_x16(a12, a28);
997 int16x16 b28 = sub_x16(a12, a28);
998 // vector_reduce_ifreverse 0
999 // vector_reduce_ifreverse 256
1000 // vector_butterfly 0 128 1 0
1001 int16x16 c0 = add_x16(b0, b8);
1002 int16x16 c8 = sub_x16(b0, b8);
1003 // vector_butterfly 64 192 1 0
1004 int16x16 c4 = add_x16(b4, b12);
1005 int16x16 c12 = sub_x16(b4, b12);
1006 // vector_butterfly 256 384 4 1
1007 b24 = mulmod_scaled_x16(b24, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1008 int16x16 c16 = add_x16(b16, b24);
1009 int16x16 c24 = sub_x16(b16, b24);
1010 // vector_butterfly 320 448 4 1
1011 b28 = mulmod_scaled_x16(b28, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1012 int16x16 c20 = add_x16(b20, b28);
1013 int16x16 c28 = sub_x16(b20, b28);
1014 // vector_reduce_ifforward 64
1015 c4 = reduce_x16(c4, qdata);
1016 // vector_butterfly 0 64 1 0
1017 int16x16 d0 = add_x16(c0, c4);
1018 int16x16 d4 = sub_x16(c0, c4);
1019 // vector_butterfly 128 192 4 1
1020 c12 = mulmod_scaled_x16(c12, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1021 int16x16 d8 = add_x16(c8, c12);
1022 int16x16 d12 = sub_x16(c8, c12);
1023 // vector_butterfly 256 320 8 1
1024 c20 = mulmod_scaled_x16(c20, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
1025 int16x16 d16 = add_x16(c16, c20);
1026 int16x16 d20 = sub_x16(c16, c20);
1027 // vector_butterfly 384 448 8 7
1028 c28 = mulmod_scaled_x16(c28, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
1029 int16x16 d24 = add_x16(c24, c28);
1030 int16x16 d28 = sub_x16(c24, c28);
1031 // vector_reduce 0
1032 d0 = reduce_x16(d0, qdata);
1033 // vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1034 d4 = mulmod_scaled_x16(d4, precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
1035 // vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1036 d8 = mulmod_scaled_x16(d8, precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
1037 // vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1038 d12 = mulmod_scaled_x16(d12, precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
1039 // vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1040 d16 = mulmod_scaled_x16(d16, precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
1041 // vector_twist 320 512 5 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1042 d20 = mulmod_scaled_x16(d20, precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
1043 // vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1044 d24 = mulmod_scaled_x16(d24, precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
1045 // vector_twist 448 512 507 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1046 d28 = mulmod_scaled_x16(d28, precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
1047 // vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1048 int16x16 e0 = _mm256_permute2x128_si256_lo(d0, d4);
1049 int16x16 e4 = _mm256_permute2x128_si256_hi(d0, d4);
1050 // vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1051 int16x16 e8 = _mm256_permute2x128_si256_lo(d8, d12);
1052 int16x16 e12 = _mm256_permute2x128_si256_hi(d8, d12);
1053 // vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1054 int16x16 e16 = _mm256_permute2x128_si256_lo(d16, d20);
1055 int16x16 e20 = _mm256_permute2x128_si256_hi(d16, d20);
1056 // vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1057 int16x16 e24 = _mm256_permute2x128_si256_lo(d24, d28);
1058 int16x16 e28 = _mm256_permute2x128_si256_hi(d24, d28);
1059 // stopbatch 512
1060 _mm256_storeu_si256((int16x16 *) (f + 0), e0);
1061 _mm256_storeu_si256((int16x16 *) (f + 64), e4);
1062 _mm256_storeu_si256((int16x16 *) (f + 128), e8);
1063 _mm256_storeu_si256((int16x16 *) (f + 192), e12);
1064 _mm256_storeu_si256((int16x16 *) (f + 256), e16);
1065 _mm256_storeu_si256((int16x16 *) (f + 320), e20);
1066 _mm256_storeu_si256((int16x16 *) (f + 384), e24);
1067 _mm256_storeu_si256((int16x16 *) (f + 448), e28);
1068 f += 512;
1069 }
1070 f -= 512 * reps;
1071 // startbatch 512
1072 for (long long r = 0; r < reps; ++r) {
1073 // vector_butterfly 16 272 1 0
1074 int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16));
1075 int16x16 a17 = _mm256_loadu_si256((int16x16 *) (f + 272));
1076 int16x16 b1 = add_x16(a1, a17);
1077 int16x16 b17 = sub_x16(a1, a17);
1078 // vector_butterfly 144 400 1 0
1079 int16x16 a9 = _mm256_loadu_si256((int16x16 *) (f + 144));
1080 int16x16 a25 = _mm256_loadu_si256((int16x16 *) (f + 400));
1081 int16x16 b9 = add_x16(a9, a25);
1082 int16x16 b25 = sub_x16(a9, a25);
1083 // vector_butterfly 80 336 1 0
1084 int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80));
1085 int16x16 a21 = _mm256_loadu_si256((int16x16 *) (f + 336));
1086 int16x16 b5 = add_x16(a5, a21);
1087 int16x16 b21 = sub_x16(a5, a21);
1088 // vector_butterfly 208 464 1 0
1089 int16x16 a13 = _mm256_loadu_si256((int16x16 *) (f + 208));
1090 int16x16 a29 = _mm256_loadu_si256((int16x16 *) (f + 464));
1091 int16x16 b13 = add_x16(a13, a29);
1092 int16x16 b29 = sub_x16(a13, a29);
1093 // vector_reduce_ifreverse 16
1094 // vector_reduce_ifreverse 272
1095 // vector_butterfly 16 144 1 0
1096 int16x16 c1 = add_x16(b1, b9);
1097 int16x16 c9 = sub_x16(b1, b9);
1098 // vector_butterfly 80 208 1 0
1099 int16x16 c5 = add_x16(b5, b13);
1100 int16x16 c13 = sub_x16(b5, b13);
1101 // vector_butterfly 272 400 4 1
1102 b25 = mulmod_scaled_x16(b25, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1103 int16x16 c17 = add_x16(b17, b25);
1104 int16x16 c25 = sub_x16(b17, b25);
1105 // vector_butterfly 336 464 4 1
1106 b29 = mulmod_scaled_x16(b29, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1107 int16x16 c21 = add_x16(b21, b29);
1108 int16x16 c29 = sub_x16(b21, b29);
1109 // vector_reduce_ifforward 80
1110 c5 = reduce_x16(c5, qdata);
1111 // vector_butterfly 16 80 1 0
1112 int16x16 d1 = add_x16(c1, c5);
1113 int16x16 d5 = sub_x16(c1, c5);
1114 // vector_butterfly 144 208 4 1
1115 c13 = mulmod_scaled_x16(c13, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1116 int16x16 d9 = add_x16(c9, c13);
1117 int16x16 d13 = sub_x16(c9, c13);
1118 // vector_butterfly 272 336 8 1
1119 c21 = mulmod_scaled_x16(c21, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
1120 int16x16 d17 = add_x16(c17, c21);
1121 int16x16 d21 = sub_x16(c17, c21);
1122 // vector_butterfly 400 464 8 7
1123 c29 = mulmod_scaled_x16(c29, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
1124 int16x16 d25 = add_x16(c25, c29);
1125 int16x16 d29 = sub_x16(c25, c29);
1126 // vector_reduce 16
1127 d1 = reduce_x16(d1, qdata);
1128 // vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1129 d5 = mulmod_scaled_x16(d5, precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
1130 // vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1131 d9 = mulmod_scaled_x16(d9, precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
1132 // vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1133 d13 = mulmod_scaled_x16(d13, precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
1134 // vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1135 d17 = mulmod_scaled_x16(d17, precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
1136 // vector_twist 336 512 5 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1137 d21 = mulmod_scaled_x16(d21, precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
1138 // vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1139 d25 = mulmod_scaled_x16(d25, precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
1140 // vector_twist 464 512 507 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1141 d29 = mulmod_scaled_x16(d29, precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
1142 // vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1143 int16x16 e1 = _mm256_permute2x128_si256_lo(d1, d5);
1144 int16x16 e5 = _mm256_permute2x128_si256_hi(d1, d5);
1145 // vector_permute 144 208 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1146 int16x16 e9 = _mm256_permute2x128_si256_lo(d9, d13);
1147 int16x16 e13 = _mm256_permute2x128_si256_hi(d9, d13);
1148 // vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1149 int16x16 e17 = _mm256_permute2x128_si256_lo(d17, d21);
1150 int16x16 e21 = _mm256_permute2x128_si256_hi(d17, d21);
1151 // vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1152 int16x16 e25 = _mm256_permute2x128_si256_lo(d25, d29);
1153 int16x16 e29 = _mm256_permute2x128_si256_hi(d25, d29);
1154 // stopbatch 512
1155 _mm256_storeu_si256((int16x16 *) (f + 16), e1);
1156 _mm256_storeu_si256((int16x16 *) (f + 80), e5);
1157 _mm256_storeu_si256((int16x16 *) (f + 144), e9);
1158 _mm256_storeu_si256((int16x16 *) (f + 208), e13);
1159 _mm256_storeu_si256((int16x16 *) (f + 272), e17);
1160 _mm256_storeu_si256((int16x16 *) (f + 336), e21);
1161 _mm256_storeu_si256((int16x16 *) (f + 400), e25);
1162 _mm256_storeu_si256((int16x16 *) (f + 464), e29);
1163 f += 512;
1164 }
1165 f -= 512 * reps;
1166 // startbatch 512
1167 for (long long r = 0; r < reps; ++r) {
1168 // vector_butterfly 32 288 1 0
1169 int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32));
1170 int16x16 a18 = _mm256_loadu_si256((int16x16 *) (f + 288));
1171 int16x16 b2 = add_x16(a2, a18);
1172 int16x16 b18 = sub_x16(a2, a18);
1173 // vector_butterfly 160 416 1 0
1174 int16x16 a10 = _mm256_loadu_si256((int16x16 *) (f + 160));
1175 int16x16 a26 = _mm256_loadu_si256((int16x16 *) (f + 416));
1176 int16x16 b10 = add_x16(a10, a26);
1177 int16x16 b26 = sub_x16(a10, a26);
1178 // vector_butterfly 96 352 1 0
1179 int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96));
1180 int16x16 a22 = _mm256_loadu_si256((int16x16 *) (f + 352));
1181 int16x16 b6 = add_x16(a6, a22);
1182 int16x16 b22 = sub_x16(a6, a22);
1183 // vector_butterfly 224 480 1 0
1184 int16x16 a14 = _mm256_loadu_si256((int16x16 *) (f + 224));
1185 int16x16 a30 = _mm256_loadu_si256((int16x16 *) (f + 480));
1186 int16x16 b14 = add_x16(a14, a30);
1187 int16x16 b30 = sub_x16(a14, a30);
1188 // vector_reduce_ifreverse 32
1189 // vector_reduce_ifreverse 288
1190 // vector_butterfly 32 160 1 0
1191 int16x16 c2 = add_x16(b2, b10);
1192 int16x16 c10 = sub_x16(b2, b10);
1193 // vector_butterfly 96 224 1 0
1194 int16x16 c6 = add_x16(b6, b14);
1195 int16x16 c14 = sub_x16(b6, b14);
1196 // vector_butterfly 288 416 4 1
1197 b26 = mulmod_scaled_x16(b26, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1198 int16x16 c18 = add_x16(b18, b26);
1199 int16x16 c26 = sub_x16(b18, b26);
1200 // vector_butterfly 352 480 4 1
1201 b30 = mulmod_scaled_x16(b30, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1202 int16x16 c22 = add_x16(b22, b30);
1203 int16x16 c30 = sub_x16(b22, b30);
1204 // vector_reduce_ifforward 96
1205 c6 = reduce_x16(c6, qdata);
1206 // vector_butterfly 32 96 1 0
1207 int16x16 d2 = add_x16(c2, c6);
1208 int16x16 d6 = sub_x16(c2, c6);
1209 // vector_butterfly 160 224 4 1
1210 c14 = mulmod_scaled_x16(c14, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1211 int16x16 d10 = add_x16(c10, c14);
1212 int16x16 d14 = sub_x16(c10, c14);
1213 // vector_butterfly 288 352 8 1
1214 c22 = mulmod_scaled_x16(c22, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
1215 int16x16 d18 = add_x16(c18, c22);
1216 int16x16 d22 = sub_x16(c18, c22);
1217 // vector_butterfly 416 480 8 7
1218 c30 = mulmod_scaled_x16(c30, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
1219 int16x16 d26 = add_x16(c26, c30);
1220 int16x16 d30 = sub_x16(c26, c30);
1221 // vector_reduce 32
1222 d2 = reduce_x16(d2, qdata);
1223 // vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1224 d6 = mulmod_scaled_x16(d6, precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1225 // vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1226 d10 = mulmod_scaled_x16(d10, precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1227 // vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1228 d14 = mulmod_scaled_x16(d14, precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1229 // vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1230 d18 = mulmod_scaled_x16(d18, precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1231 // vector_twist 352 512 5 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1232 d22 = mulmod_scaled_x16(d22, precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1233 // vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1234 d26 = mulmod_scaled_x16(d26, precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1235 // vector_twist 480 512 507 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1236 d30 = mulmod_scaled_x16(d30, precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1237 // vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1238 int16x16 e2 = _mm256_permute2x128_si256_lo(d2, d6);
1239 int16x16 e6 = _mm256_permute2x128_si256_hi(d2, d6);
1240 // vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1241 int16x16 e10 = _mm256_permute2x128_si256_lo(d10, d14);
1242 int16x16 e14 = _mm256_permute2x128_si256_hi(d10, d14);
1243 // vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1244 int16x16 e18 = _mm256_permute2x128_si256_lo(d18, d22);
1245 int16x16 e22 = _mm256_permute2x128_si256_hi(d18, d22);
1246 // vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1247 int16x16 e26 = _mm256_permute2x128_si256_lo(d26, d30);
1248 int16x16 e30 = _mm256_permute2x128_si256_hi(d26, d30);
1249 // stopbatch 512
1250 _mm256_storeu_si256((int16x16 *) (f + 32), e2);
1251 _mm256_storeu_si256((int16x16 *) (f + 96), e6);
1252 _mm256_storeu_si256((int16x16 *) (f + 160), e10);
1253 _mm256_storeu_si256((int16x16 *) (f + 224), e14);
1254 _mm256_storeu_si256((int16x16 *) (f + 288), e18);
1255 _mm256_storeu_si256((int16x16 *) (f + 352), e22);
1256 _mm256_storeu_si256((int16x16 *) (f + 416), e26);
1257 _mm256_storeu_si256((int16x16 *) (f + 480), e30);
1258 f += 512;
1259 }
1260 f -= 512 * reps;
1261 // startbatch 512
1262 for (long long r = 0; r < reps; ++r) {
1263 // vector_butterfly 48 304 1 0
1264 int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48));
1265 int16x16 a19 = _mm256_loadu_si256((int16x16 *) (f + 304));
1266 int16x16 b3 = add_x16(a3, a19);
1267 int16x16 b19 = sub_x16(a3, a19);
1268 // vector_butterfly 176 432 1 0
1269 int16x16 a11 = _mm256_loadu_si256((int16x16 *) (f + 176));
1270 int16x16 a27 = _mm256_loadu_si256((int16x16 *) (f + 432));
1271 int16x16 b11 = add_x16(a11, a27);
1272 int16x16 b27 = sub_x16(a11, a27);
1273 // vector_butterfly 112 368 1 0
1274 int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112));
1275 int16x16 a23 = _mm256_loadu_si256((int16x16 *) (f + 368));
1276 int16x16 b7 = add_x16(a7, a23);
1277 int16x16 b23 = sub_x16(a7, a23);
1278 // vector_butterfly 240 496 1 0
1279 int16x16 a15 = _mm256_loadu_si256((int16x16 *) (f + 240));
1280 int16x16 a31 = _mm256_loadu_si256((int16x16 *) (f + 496));
1281 int16x16 b15 = add_x16(a15, a31);
1282 int16x16 b31 = sub_x16(a15, a31);
1283 // vector_reduce_ifreverse 48
1284 // vector_reduce_ifreverse 304
1285 // vector_butterfly 48 176 1 0
1286 int16x16 c3 = add_x16(b3, b11);
1287 int16x16 c11 = sub_x16(b3, b11);
1288 // vector_butterfly 112 240 1 0
1289 int16x16 c7 = add_x16(b7, b15);
1290 int16x16 c15 = sub_x16(b7, b15);
1291 // vector_butterfly 304 432 4 1
1292 b27 = mulmod_scaled_x16(b27, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1293 int16x16 c19 = add_x16(b19, b27);
1294 int16x16 c27 = sub_x16(b19, b27);
1295 // vector_butterfly 368 496 4 1
1296 b31 = mulmod_scaled_x16(b31, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1297 int16x16 c23 = add_x16(b23, b31);
1298 int16x16 c31 = sub_x16(b23, b31);
1299 // vector_reduce_ifforward 112
1300 c7 = reduce_x16(c7, qdata);
1301 // vector_butterfly 48 112 1 0
1302 int16x16 d3 = add_x16(c3, c7);
1303 int16x16 d7 = sub_x16(c3, c7);
1304 // vector_butterfly 176 240 4 1
1305 c15 = mulmod_scaled_x16(c15, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1306 int16x16 d11 = add_x16(c11, c15);
1307 int16x16 d15 = sub_x16(c11, c15);
1308 // vector_butterfly 304 368 8 1
1309 c23 = mulmod_scaled_x16(c23, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
1310 int16x16 d19 = add_x16(c19, c23);
1311 int16x16 d23 = sub_x16(c19, c23);
1312 // vector_butterfly 432 496 8 7
1313 c31 = mulmod_scaled_x16(c31, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
1314 int16x16 d27 = add_x16(c27, c31);
1315 int16x16 d31 = sub_x16(c27, c31);
1316 // vector_reduce 48
1317 d3 = reduce_x16(d3, qdata);
1318 // vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1319 d7 = mulmod_scaled_x16(d7, precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1320 // vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1321 d11 = mulmod_scaled_x16(d11, precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1322 // vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1323 d15 = mulmod_scaled_x16(d15, precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1324 // vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1325 d19 = mulmod_scaled_x16(d19, precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1326 // vector_twist 368 512 5 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1327 d23 = mulmod_scaled_x16(d23, precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1328 // vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1329 d27 = mulmod_scaled_x16(d27, precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1330 // vector_twist 496 512 507 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1331 d31 = mulmod_scaled_x16(d31, precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1332 // vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1333 int16x16 e3 = _mm256_permute2x128_si256_lo(d3, d7);
1334 int16x16 e7 = _mm256_permute2x128_si256_hi(d3, d7);
1335 // vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1336 int16x16 e11 = _mm256_permute2x128_si256_lo(d11, d15);
1337 int16x16 e15 = _mm256_permute2x128_si256_hi(d11, d15);
1338 // vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1339 int16x16 e19 = _mm256_permute2x128_si256_lo(d19, d23);
1340 int16x16 e23 = _mm256_permute2x128_si256_hi(d19, d23);
1341 // vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1342 int16x16 e27 = _mm256_permute2x128_si256_lo(d27, d31);
1343 int16x16 e31 = _mm256_permute2x128_si256_hi(d27, d31);
1344 // stopbatch 512
1345 _mm256_storeu_si256((int16x16 *) (f + 48), e3);
1346 _mm256_storeu_si256((int16x16 *) (f + 112), e7);
1347 _mm256_storeu_si256((int16x16 *) (f + 176), e11);
1348 _mm256_storeu_si256((int16x16 *) (f + 240), e15);
1349 _mm256_storeu_si256((int16x16 *) (f + 304), e19);
1350 _mm256_storeu_si256((int16x16 *) (f + 368), e23);
1351 _mm256_storeu_si256((int16x16 *) (f + 432), e27);
1352 _mm256_storeu_si256((int16x16 *) (f + 496), e31);
1353 f += 512;
1354 }
1355 f -= 512 * reps;
1356 // doublereps
1357 reps *= 2;
1358 // doublereps
1359 reps *= 2;
1360 // startbatch 128
1361 for (long long r = 0; r < reps; ++r) {
1362 // vector_butterfly 0 32 1 0
1363 int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0));
1364 int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32));
1365 int16x16 b0 = add_x16(a0, a2);
1366 int16x16 b2 = sub_x16(a0, a2);
1367 // vector_butterfly 64 96 1 0
1368 int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64));
1369 int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96));
1370 int16x16 b4 = add_x16(a4, a6);
1371 int16x16 b6 = sub_x16(a4, a6);
1372 // vector_butterfly 16 48 1 0
1373 int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16));
1374 int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48));
1375 int16x16 b1 = add_x16(a1, a3);
1376 int16x16 b3 = sub_x16(a1, a3);
1377 // vector_butterfly 80 112 1 0
1378 int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80));
1379 int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112));
1380 int16x16 b5 = add_x16(a5, a7);
1381 int16x16 b7 = sub_x16(a5, a7);
1382 // vector_butterfly 0 16 1 0
1383 int16x16 c0 = add_x16(b0, b1);
1384 int16x16 c1 = sub_x16(b0, b1);
1385 // vector_butterfly 64 80 1 0
1386 int16x16 c4 = add_x16(b4, b5);
1387 int16x16 c5 = sub_x16(b4, b5);
1388 // vector_butterfly 32 48 4 1
1389 b3 = mulmod_scaled_x16(b3, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1390 int16x16 c2 = add_x16(b2, b3);
1391 int16x16 c3 = sub_x16(b2, b3);
1392 // vector_butterfly 96 112 4 1
1393 b7 = mulmod_scaled_x16(b7, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1394 int16x16 c6 = add_x16(b6, b7);
1395 int16x16 c7 = sub_x16(b6, b7);
1396 // vector_reduce 0
1397 c0 = reduce_x16(c0, qdata);
1398 // vector_reduce 64
1399 c4 = reduce_x16(c4, qdata);
1400 // vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
1401 c1 = mulmod_scaled_x16(c1, precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata);
1402 // vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
1403 c5 = mulmod_scaled_x16(c5, precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata);
1404 // vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
1405 c2 = mulmod_scaled_x16(c2, precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata);
1406 // vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
1407 c6 = mulmod_scaled_x16(c6, precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata);
1408 // vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
1409 c3 = mulmod_scaled_x16(c3, precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata);
1410 // vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
1411 c7 = mulmod_scaled_x16(c7, precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata);
1412 // vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
1413 int16x16 d0 = _mm256_unpacklo_epi16(c0, c2);
1414 int16x16 d2 = _mm256_unpackhi_epi16(c0, c2);
1415 // vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
1416 int16x16 d1 = _mm256_unpacklo_epi16(c1, c3);
1417 int16x16 d3 = _mm256_unpackhi_epi16(c1, c3);
1418 // vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
1419 int16x16 d4 = _mm256_unpacklo_epi16(c4, c6);
1420 int16x16 d6 = _mm256_unpackhi_epi16(c4, c6);
1421 // vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
1422 int16x16 d5 = _mm256_unpacklo_epi16(c5, c7);
1423 int16x16 d7 = _mm256_unpackhi_epi16(c5, c7);
1424 // vector_butterfly 0 64 1 0
1425 int16x16 e0 = add_x16(d0, d4);
1426 int16x16 e4 = sub_x16(d0, d4);
1427 // vector_butterfly 32 96 1 0
1428 int16x16 e2 = add_x16(d2, d6);
1429 int16x16 e6 = sub_x16(d2, d6);
1430 // vector_butterfly 16 80 1 0
1431 int16x16 e1 = add_x16(d1, d5);
1432 int16x16 e5 = sub_x16(d1, d5);
1433 // vector_butterfly 48 112 1 0
1434 int16x16 e3 = add_x16(d3, d7);
1435 int16x16 e7 = sub_x16(d3, d7);
1436 // vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
1437 int16x16 f0 = _mm256_unpacklo_epi32(e0, e1);
1438 int16x16 f1 = _mm256_unpackhi_epi32(e0, e1);
1439 // vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
1440 int16x16 f2 = _mm256_unpacklo_epi32(e2, e3);
1441 int16x16 f3 = _mm256_unpackhi_epi32(e2, e3);
1442 // vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
1443 int16x16 f4 = _mm256_unpacklo_epi32(e4, e5);
1444 int16x16 f5 = _mm256_unpackhi_epi32(e4, e5);
1445 // vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
1446 int16x16 f6 = _mm256_unpacklo_epi32(e6, e7);
1447 int16x16 f7 = _mm256_unpackhi_epi32(e6, e7);
1448 // stopbatch 128
1449 _mm256_storeu_si256((int16x16 *) (f + 0), f0);
1450 _mm256_storeu_si256((int16x16 *) (f + 16), f1);
1451 _mm256_storeu_si256((int16x16 *) (f + 32), f2);
1452 _mm256_storeu_si256((int16x16 *) (f + 48), f3);
1453 _mm256_storeu_si256((int16x16 *) (f + 64), f4);
1454 _mm256_storeu_si256((int16x16 *) (f + 80), f5);
1455 _mm256_storeu_si256((int16x16 *) (f + 96), f6);
1456 _mm256_storeu_si256((int16x16 *) (f + 112), f7);
1457 f += 128;
1458 }
1459 f -= 128 * reps;
1460 // startbatch 128
1461 for (long long r = 0; r < reps; ++r) {
1462 // vector_butterfly 0 32 1 0
1463 int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0));
1464 int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32));
1465 int16x16 b0 = add_x16(a0, a2);
1466 int16x16 b2 = sub_x16(a0, a2);
1467 // vector_butterfly 16 48 1 0
1468 int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16));
1469 int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48));
1470 int16x16 b1 = add_x16(a1, a3);
1471 int16x16 b3 = sub_x16(a1, a3);
1472 // vector_butterfly 64 96 4 1
1473 int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64));
1474 int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96));
1475 a6 = mulmod_scaled_x16(a6, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1476 int16x16 b4 = add_x16(a4, a6);
1477 int16x16 b6 = sub_x16(a4, a6);
1478 // vector_butterfly 80 112 4 1
1479 int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80));
1480 int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112));
1481 a7 = mulmod_scaled_x16(a7, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1482 int16x16 b5 = add_x16(a5, a7);
1483 int16x16 b7 = sub_x16(a5, a7);
1484 // vector_reduce 0
1485 b0 = reduce_x16(b0, qdata);
1486 // vector_reduce 16
1487 b1 = reduce_x16(b1, qdata);
1488 // vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
1489 b2 = mulmod_scaled_x16(b2, precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata);
1490 // vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
1491 b3 = mulmod_scaled_x16(b3, precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata);
1492 // vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
1493 b4 = mulmod_scaled_x16(b4, precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata);
1494 // vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
1495 b5 = mulmod_scaled_x16(b5, precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata);
1496 // vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
1497 b6 = mulmod_scaled_x16(b6, precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata);
1498 // vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
1499 b7 = mulmod_scaled_x16(b7, precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata);
1500 // vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
1501 int16x16 c0 = _mm256_unpacklo_epi64(b0, b4);
1502 int16x16 c4 = _mm256_unpackhi_epi64(b0, b4);
1503 // vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
1504 int16x16 c1 = _mm256_unpacklo_epi64(b1, b5);
1505 int16x16 c5 = _mm256_unpackhi_epi64(b1, b5);
1506 // vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
1507 int16x16 c2 = _mm256_unpacklo_epi64(b2, b6);
1508 int16x16 c6 = _mm256_unpackhi_epi64(b2, b6);
1509 // vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
1510 int16x16 c3 = _mm256_unpacklo_epi64(b3, b7);
1511 int16x16 c7 = _mm256_unpackhi_epi64(b3, b7);
1512 // vector_butterfly 0 16 1 0
1513 int16x16 d0 = add_x16(c0, c1);
1514 int16x16 d1 = sub_x16(c0, c1);
1515 // vector_butterfly 64 80 1 0
1516 int16x16 d4 = add_x16(c4, c5);
1517 int16x16 d5 = sub_x16(c4, c5);
1518 // vector_butterfly 32 48 1 0
1519 int16x16 d2 = add_x16(c2, c3);
1520 int16x16 d3 = sub_x16(c2, c3);
1521 // vector_butterfly 96 112 1 0
1522 int16x16 d6 = add_x16(c6, c7);
1523 int16x16 d7 = sub_x16(c6, c7);
1524 // vector_butterfly 0 64 1 0
1525 int16x16 e0 = add_x16(d0, d4);
1526 int16x16 e4 = sub_x16(d0, d4);
1527 // vector_butterfly 32 96 1 0
1528 int16x16 e2 = add_x16(d2, d6);
1529 int16x16 e6 = sub_x16(d2, d6);
1530 // vector_butterfly 16 80 4 1
1531 d5 = mulmod_scaled_x16(d5, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1532 int16x16 e1 = add_x16(d1, d5);
1533 int16x16 e5 = sub_x16(d1, d5);
1534 // vector_butterfly 48 112 4 1
1535 d7 = mulmod_scaled_x16(d7, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1536 int16x16 e3 = add_x16(d3, d7);
1537 int16x16 e7 = sub_x16(d3, d7);
1538 // stopbatch 128
1539 _mm256_storeu_si256((int16x16 *) (f + 0), e0);
1540 _mm256_storeu_si256((int16x16 *) (f + 16), e1);
1541 _mm256_storeu_si256((int16x16 *) (f + 32), e2);
1542 _mm256_storeu_si256((int16x16 *) (f + 48), e3);
1543 _mm256_storeu_si256((int16x16 *) (f + 64), e4);
1544 _mm256_storeu_si256((int16x16 *) (f + 80), e5);
1545 _mm256_storeu_si256((int16x16 *) (f + 96), e6);
1546 _mm256_storeu_si256((int16x16 *) (f + 112), e7);
1547 f += 128;
1548 }
1549 // f -= 128*reps;
1550 // stopntt 512
1551 }
1552
PQCLEAN_NTRULPR653_AVX2_ntt512_7681(int16 * f,int reps)1553 void PQCLEAN_NTRULPR653_AVX2_ntt512_7681(int16 *f, int reps) {
1554 ntt512(f, reps, qdata_7681.data);
1555 }
1556
PQCLEAN_NTRULPR653_AVX2_ntt512_10753(int16 * f,int reps)1557 void PQCLEAN_NTRULPR653_AVX2_ntt512_10753(int16 *f, int reps) {
1558 ntt512(f, reps, qdata_10753.data);
1559 }
1560 // inv stopntt 512
1561
invntt512(int16 * f,int reps,const int16 * qdata)1562 static void invntt512(int16 *f, int reps, const int16 *qdata) {
1563 reps *= 4;
1564 // inv stopbatch 128
1565 for (long long r = 0; r < reps; ++r) {
1566 // inv vector_butterfly 48 112 4 1
1567 int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48));
1568 int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112));
1569 int16x16 b3 = add_x16(a3, a7);
1570 int16x16 b7 = sub_x16(a3, a7);
1571 b7 = mulmod_scaled_x16(b7, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
1572 // inv vector_butterfly 16 80 4 1
1573 int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16));
1574 int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80));
1575 int16x16 b1 = add_x16(a1, a5);
1576 int16x16 b5 = sub_x16(a1, a5);
1577 b5 = mulmod_scaled_x16(b5, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
1578 // inv vector_butterfly 32 96 1 0
1579 int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32));
1580 int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96));
1581 int16x16 b2 = add_x16(a2, a6);
1582 int16x16 b6 = sub_x16(a2, a6);
1583 // inv vector_butterfly 0 64 1 0
1584 int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0));
1585 int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64));
1586 int16x16 b0 = add_x16(a0, a4);
1587 int16x16 b4 = sub_x16(a0, a4);
1588 // inv vector_butterfly 96 112 1 0
1589 int16x16 c6 = add_x16(b6, b7);
1590 int16x16 c7 = sub_x16(b6, b7);
1591 // inv vector_butterfly 32 48 1 0
1592 int16x16 c2 = add_x16(b2, b3);
1593 int16x16 c3 = sub_x16(b2, b3);
1594 // inv vector_butterfly 64 80 1 0
1595 int16x16 c4 = add_x16(b4, b5);
1596 int16x16 c5 = sub_x16(b4, b5);
1597 // inv vector_butterfly 0 16 1 0
1598 int16x16 c0 = add_x16(b0, b1);
1599 int16x16 c1 = sub_x16(b0, b1);
1600 // inv vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
1601 int16x16 d3 = _mm256_unpacklo_epi64(c3, c7);
1602 int16x16 d7 = _mm256_unpackhi_epi64(c3, c7);
1603 // inv vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
1604 int16x16 d2 = _mm256_unpacklo_epi64(c2, c6);
1605 int16x16 d6 = _mm256_unpackhi_epi64(c2, c6);
1606 // inv vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
1607 int16x16 d1 = _mm256_unpacklo_epi64(c1, c5);
1608 int16x16 d5 = _mm256_unpackhi_epi64(c1, c5);
1609 // inv vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
1610 int16x16 d0 = _mm256_unpacklo_epi64(c0, c4);
1611 int16x16 d4 = _mm256_unpackhi_epi64(c0, c4);
1612 // inv vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
1613 d7 = mulmod_scaled_x16(d7, precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata);
1614 // inv vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
1615 d6 = mulmod_scaled_x16(d6, precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata);
1616 // inv vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
1617 d5 = mulmod_scaled_x16(d5, precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata);
1618 // inv vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
1619 d4 = mulmod_scaled_x16(d4, precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata);
1620 // inv vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
1621 d3 = mulmod_scaled_x16(d3, precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata);
1622 // inv vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
1623 d2 = mulmod_scaled_x16(d2, precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata);
1624 // inv vector_reduce 16
1625 d1 = reduce_x16(d1, qdata);
1626 // inv vector_reduce 0
1627 d0 = reduce_x16(d0, qdata);
1628 // inv vector_butterfly 80 112 4 1
1629 int16x16 e5 = add_x16(d5, d7);
1630 int16x16 e7 = sub_x16(d5, d7);
1631 e7 = mulmod_scaled_x16(e7, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
1632 // inv vector_butterfly 64 96 4 1
1633 int16x16 e4 = add_x16(d4, d6);
1634 int16x16 e6 = sub_x16(d4, d6);
1635 e6 = mulmod_scaled_x16(e6, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
1636 // inv vector_butterfly 16 48 1 0
1637 int16x16 e1 = add_x16(d1, d3);
1638 int16x16 e3 = sub_x16(d1, d3);
1639 // inv vector_butterfly 0 32 1 0
1640 int16x16 e0 = add_x16(d0, d2);
1641 int16x16 e2 = sub_x16(d0, d2);
1642 // inv startbatch 128
1643 _mm256_storeu_si256((int16x16 *) (f + 0), e0);
1644 _mm256_storeu_si256((int16x16 *) (f + 16), e1);
1645 _mm256_storeu_si256((int16x16 *) (f + 32), e2);
1646 _mm256_storeu_si256((int16x16 *) (f + 48), e3);
1647 _mm256_storeu_si256((int16x16 *) (f + 64), e4);
1648 _mm256_storeu_si256((int16x16 *) (f + 80), e5);
1649 _mm256_storeu_si256((int16x16 *) (f + 96), e6);
1650 _mm256_storeu_si256((int16x16 *) (f + 112), e7);
1651 f += 128;
1652 }
1653 f -= 128 * reps;
1654 // inv stopbatch 128
1655 for (long long r = 0; r < reps; ++r) {
1656 // inv vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
1657 int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96));
1658 int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112));
1659 int16x16 b6 = _mm256_unpacklo_epi32(a6, a7);
1660 int16x16 b7 = _mm256_unpackhi_epi32(a6, a7);
1661 int16x16 c6 = _mm256_unpacklo_epi32(b6, b7);
1662 int16x16 c7 = _mm256_unpackhi_epi32(b6, b7);
1663 // inv vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
1664 int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64));
1665 int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80));
1666 int16x16 b4 = _mm256_unpacklo_epi32(a4, a5);
1667 int16x16 b5 = _mm256_unpackhi_epi32(a4, a5);
1668 int16x16 c4 = _mm256_unpacklo_epi32(b4, b5);
1669 int16x16 c5 = _mm256_unpackhi_epi32(b4, b5);
1670 // inv vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
1671 int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32));
1672 int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48));
1673 int16x16 b2 = _mm256_unpacklo_epi32(a2, a3);
1674 int16x16 b3 = _mm256_unpackhi_epi32(a2, a3);
1675 int16x16 c2 = _mm256_unpacklo_epi32(b2, b3);
1676 int16x16 c3 = _mm256_unpackhi_epi32(b2, b3);
1677 // inv vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
1678 int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0));
1679 int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16));
1680 int16x16 b0 = _mm256_unpacklo_epi32(a0, a1);
1681 int16x16 b1 = _mm256_unpackhi_epi32(a0, a1);
1682 int16x16 c0 = _mm256_unpacklo_epi32(b0, b1);
1683 int16x16 c1 = _mm256_unpackhi_epi32(b0, b1);
1684 // inv vector_butterfly 48 112 1 0
1685 int16x16 d3 = add_x16(c3, c7);
1686 int16x16 d7 = sub_x16(c3, c7);
1687 // inv vector_butterfly 16 80 1 0
1688 int16x16 d1 = add_x16(c1, c5);
1689 int16x16 d5 = sub_x16(c1, c5);
1690 // inv vector_butterfly 32 96 1 0
1691 int16x16 d2 = add_x16(c2, c6);
1692 int16x16 d6 = sub_x16(c2, c6);
1693 // inv vector_butterfly 0 64 1 0
1694 int16x16 d0 = add_x16(c0, c4);
1695 int16x16 d4 = sub_x16(c0, c4);
1696 // inv vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
1697 int16x16 e5 = _mm256_unpacklo_epi16(d5, d7);
1698 int16x16 e7 = _mm256_unpackhi_epi16(d5, d7);
1699 int16x16 f5 = _mm256_unpacklo_epi16(e5, e7);
1700 int16x16 f7 = _mm256_unpackhi_epi16(e5, e7);
1701 int16x16 g5 = _mm256_unpacklo_epi16(f5, f7);
1702 int16x16 g7 = _mm256_unpackhi_epi16(f5, f7);
1703 // inv vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
1704 int16x16 e4 = _mm256_unpacklo_epi16(d4, d6);
1705 int16x16 e6 = _mm256_unpackhi_epi16(d4, d6);
1706 int16x16 f4 = _mm256_unpacklo_epi16(e4, e6);
1707 int16x16 f6 = _mm256_unpackhi_epi16(e4, e6);
1708 int16x16 g4 = _mm256_unpacklo_epi16(f4, f6);
1709 int16x16 g6 = _mm256_unpackhi_epi16(f4, f6);
1710 // inv vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
1711 int16x16 e1 = _mm256_unpacklo_epi16(d1, d3);
1712 int16x16 e3 = _mm256_unpackhi_epi16(d1, d3);
1713 int16x16 f1 = _mm256_unpacklo_epi16(e1, e3);
1714 int16x16 f3 = _mm256_unpackhi_epi16(e1, e3);
1715 int16x16 g1 = _mm256_unpacklo_epi16(f1, f3);
1716 int16x16 g3 = _mm256_unpackhi_epi16(f1, f3);
1717 // inv vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
1718 int16x16 e0 = _mm256_unpacklo_epi16(d0, d2);
1719 int16x16 e2 = _mm256_unpackhi_epi16(d0, d2);
1720 int16x16 f0 = _mm256_unpacklo_epi16(e0, e2);
1721 int16x16 f2 = _mm256_unpackhi_epi16(e0, e2);
1722 int16x16 g0 = _mm256_unpacklo_epi16(f0, f2);
1723 int16x16 g2 = _mm256_unpackhi_epi16(f0, f2);
1724 // inv vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
1725 g7 = mulmod_scaled_x16(g7, precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata);
1726 // inv vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
1727 g3 = mulmod_scaled_x16(g3, precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata);
1728 // inv vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
1729 g6 = mulmod_scaled_x16(g6, precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata);
1730 // inv vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
1731 g2 = mulmod_scaled_x16(g2, precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata);
1732 // inv vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
1733 g5 = mulmod_scaled_x16(g5, precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata);
1734 // inv vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
1735 g1 = mulmod_scaled_x16(g1, precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata);
1736 // inv vector_reduce 64
1737 g4 = reduce_x16(g4, qdata);
1738 // inv vector_reduce 0
1739 g0 = reduce_x16(g0, qdata);
1740 // inv vector_butterfly 96 112 4 1
1741 int16x16 h6 = add_x16(g6, g7);
1742 int16x16 h7 = sub_x16(g6, g7);
1743 h7 = mulmod_scaled_x16(h7, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
1744 // inv vector_butterfly 32 48 4 1
1745 int16x16 h2 = add_x16(g2, g3);
1746 int16x16 h3 = sub_x16(g2, g3);
1747 h3 = mulmod_scaled_x16(h3, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
1748 // inv vector_butterfly 64 80 1 0
1749 int16x16 h4 = add_x16(g4, g5);
1750 int16x16 h5 = sub_x16(g4, g5);
1751 // inv vector_butterfly 0 16 1 0
1752 int16x16 h0 = add_x16(g0, g1);
1753 int16x16 h1 = sub_x16(g0, g1);
1754 // inv vector_butterfly 80 112 1 0
1755 int16x16 i5 = add_x16(h5, h7);
1756 int16x16 i7 = sub_x16(h5, h7);
1757 // inv vector_butterfly 16 48 1 0
1758 int16x16 i1 = add_x16(h1, h3);
1759 int16x16 i3 = sub_x16(h1, h3);
1760 // inv vector_butterfly 64 96 1 0
1761 int16x16 i4 = add_x16(h4, h6);
1762 int16x16 i6 = sub_x16(h4, h6);
1763 // inv vector_butterfly 0 32 1 0
1764 int16x16 i0 = add_x16(h0, h2);
1765 int16x16 i2 = sub_x16(h0, h2);
1766 // inv startbatch 128
1767 _mm256_storeu_si256((int16x16 *) (f + 0), i0);
1768 _mm256_storeu_si256((int16x16 *) (f + 16), i1);
1769 _mm256_storeu_si256((int16x16 *) (f + 32), i2);
1770 _mm256_storeu_si256((int16x16 *) (f + 48), i3);
1771 _mm256_storeu_si256((int16x16 *) (f + 64), i4);
1772 _mm256_storeu_si256((int16x16 *) (f + 80), i5);
1773 _mm256_storeu_si256((int16x16 *) (f + 96), i6);
1774 _mm256_storeu_si256((int16x16 *) (f + 112), i7);
1775 f += 128;
1776 }
1777 f -= 128 * reps;
1778 // inv doublereps
1779 reps /= 2;
1780 // inv doublereps
1781 reps /= 2;
1782 // inv stopbatch 512
1783 for (long long r = 0; r < reps; ++r) {
1784 // inv vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1785 int16x16 a27 = _mm256_loadu_si256((int16x16 *) (f + 432));
1786 int16x16 a31 = _mm256_loadu_si256((int16x16 *) (f + 496));
1787 int16x16 b27 = _mm256_permute2x128_si256_lo(a27, a31);
1788 int16x16 b31 = _mm256_permute2x128_si256_hi(a27, a31);
1789 // inv vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1790 int16x16 a19 = _mm256_loadu_si256((int16x16 *) (f + 304));
1791 int16x16 a23 = _mm256_loadu_si256((int16x16 *) (f + 368));
1792 int16x16 b19 = _mm256_permute2x128_si256_lo(a19, a23);
1793 int16x16 b23 = _mm256_permute2x128_si256_hi(a19, a23);
1794 // inv vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1795 int16x16 a11 = _mm256_loadu_si256((int16x16 *) (f + 176));
1796 int16x16 a15 = _mm256_loadu_si256((int16x16 *) (f + 240));
1797 int16x16 b11 = _mm256_permute2x128_si256_lo(a11, a15);
1798 int16x16 b15 = _mm256_permute2x128_si256_hi(a11, a15);
1799 // inv vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1800 int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48));
1801 int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112));
1802 int16x16 b3 = _mm256_permute2x128_si256_lo(a3, a7);
1803 int16x16 b7 = _mm256_permute2x128_si256_hi(a3, a7);
1804 // inv vector_twist 496 512 507 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1805 b31 = mulmod_scaled_x16(b31, precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1806 // inv vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1807 b27 = mulmod_scaled_x16(b27, precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1808 // inv vector_twist 368 512 5 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1809 b23 = mulmod_scaled_x16(b23, precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1810 // inv vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1811 b19 = mulmod_scaled_x16(b19, precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1812 // inv vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1813 b15 = mulmod_scaled_x16(b15, precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1814 // inv vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1815 b11 = mulmod_scaled_x16(b11, precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1816 // inv vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1817 b7 = mulmod_scaled_x16(b7, precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1818 // inv vector_reduce 48
1819 b3 = reduce_x16(b3, qdata);
1820 // inv vector_butterfly 432 496 8 7
1821 int16x16 c27 = add_x16(b27, b31);
1822 int16x16 c31 = sub_x16(b27, b31);
1823 c31 = mulmod_scaled_x16(c31, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
1824 // inv vector_butterfly 304 368 8 1
1825 int16x16 c19 = add_x16(b19, b23);
1826 int16x16 c23 = sub_x16(b19, b23);
1827 c23 = mulmod_scaled_x16(c23, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
1828 // inv vector_butterfly 176 240 4 1
1829 int16x16 c11 = add_x16(b11, b15);
1830 int16x16 c15 = sub_x16(b11, b15);
1831 c15 = mulmod_scaled_x16(c15, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
1832 // inv vector_butterfly 48 112 1 0
1833 int16x16 c3 = add_x16(b3, b7);
1834 int16x16 c7 = sub_x16(b3, b7);
1835 // inv vector_reduce_ifforward 112
1836 // inv vector_butterfly 368 496 4 1
1837 int16x16 d23 = add_x16(c23, c31);
1838 int16x16 d31 = sub_x16(c23, c31);
1839 d31 = mulmod_scaled_x16(d31, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
1840 // inv vector_butterfly 304 432 4 1
1841 int16x16 d19 = add_x16(c19, c27);
1842 int16x16 d27 = sub_x16(c19, c27);
1843 d27 = mulmod_scaled_x16(d27, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
1844 // inv vector_butterfly 112 240 1 0
1845 int16x16 d7 = add_x16(c7, c15);
1846 int16x16 d15 = sub_x16(c7, c15);
1847 // inv vector_butterfly 48 176 1 0
1848 int16x16 d3 = add_x16(c3, c11);
1849 int16x16 d11 = sub_x16(c3, c11);
1850 // inv vector_reduce_ifreverse 304
1851 d19 = reduce_x16(d19, qdata);
1852 // inv vector_reduce_ifreverse 48
1853 d3 = reduce_x16(d3, qdata);
1854 // inv vector_butterfly 240 496 1 0
1855 int16x16 e15 = add_x16(d15, d31);
1856 int16x16 e31 = sub_x16(d15, d31);
1857 // inv vector_butterfly 112 368 1 0
1858 int16x16 e7 = add_x16(d7, d23);
1859 int16x16 e23 = sub_x16(d7, d23);
1860 // inv vector_butterfly 176 432 1 0
1861 int16x16 e11 = add_x16(d11, d27);
1862 int16x16 e27 = sub_x16(d11, d27);
1863 // inv vector_butterfly 48 304 1 0
1864 int16x16 e3 = add_x16(d3, d19);
1865 int16x16 e19 = sub_x16(d3, d19);
1866 // inv startbatch 512
1867 _mm256_storeu_si256((int16x16 *) (f + 48), e3);
1868 _mm256_storeu_si256((int16x16 *) (f + 112), e7);
1869 _mm256_storeu_si256((int16x16 *) (f + 176), e11);
1870 _mm256_storeu_si256((int16x16 *) (f + 240), e15);
1871 _mm256_storeu_si256((int16x16 *) (f + 304), e19);
1872 _mm256_storeu_si256((int16x16 *) (f + 368), e23);
1873 _mm256_storeu_si256((int16x16 *) (f + 432), e27);
1874 _mm256_storeu_si256((int16x16 *) (f + 496), e31);
1875 f += 512;
1876 }
1877 f -= 512 * reps;
1878 // inv stopbatch 512
1879 for (long long r = 0; r < reps; ++r) {
1880 // inv vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1881 int16x16 a26 = _mm256_loadu_si256((int16x16 *) (f + 416));
1882 int16x16 a30 = _mm256_loadu_si256((int16x16 *) (f + 480));
1883 int16x16 b26 = _mm256_permute2x128_si256_lo(a26, a30);
1884 int16x16 b30 = _mm256_permute2x128_si256_hi(a26, a30);
1885 // inv vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1886 int16x16 a18 = _mm256_loadu_si256((int16x16 *) (f + 288));
1887 int16x16 a22 = _mm256_loadu_si256((int16x16 *) (f + 352));
1888 int16x16 b18 = _mm256_permute2x128_si256_lo(a18, a22);
1889 int16x16 b22 = _mm256_permute2x128_si256_hi(a18, a22);
1890 // inv vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1891 int16x16 a10 = _mm256_loadu_si256((int16x16 *) (f + 160));
1892 int16x16 a14 = _mm256_loadu_si256((int16x16 *) (f + 224));
1893 int16x16 b10 = _mm256_permute2x128_si256_lo(a10, a14);
1894 int16x16 b14 = _mm256_permute2x128_si256_hi(a10, a14);
1895 // inv vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1896 int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32));
1897 int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96));
1898 int16x16 b2 = _mm256_permute2x128_si256_lo(a2, a6);
1899 int16x16 b6 = _mm256_permute2x128_si256_hi(a2, a6);
1900 // inv vector_twist 480 512 507 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1901 b30 = mulmod_scaled_x16(b30, precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1902 // inv vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1903 b26 = mulmod_scaled_x16(b26, precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1904 // inv vector_twist 352 512 5 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1905 b22 = mulmod_scaled_x16(b22, precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1906 // inv vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1907 b18 = mulmod_scaled_x16(b18, precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1908 // inv vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1909 b14 = mulmod_scaled_x16(b14, precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1910 // inv vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1911 b10 = mulmod_scaled_x16(b10, precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1912 // inv vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1913 b6 = mulmod_scaled_x16(b6, precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1914 // inv vector_reduce 32
1915 b2 = reduce_x16(b2, qdata);
1916 // inv vector_butterfly 416 480 8 7
1917 int16x16 c26 = add_x16(b26, b30);
1918 int16x16 c30 = sub_x16(b26, b30);
1919 c30 = mulmod_scaled_x16(c30, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
1920 // inv vector_butterfly 288 352 8 1
1921 int16x16 c18 = add_x16(b18, b22);
1922 int16x16 c22 = sub_x16(b18, b22);
1923 c22 = mulmod_scaled_x16(c22, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
1924 // inv vector_butterfly 160 224 4 1
1925 int16x16 c10 = add_x16(b10, b14);
1926 int16x16 c14 = sub_x16(b10, b14);
1927 c14 = mulmod_scaled_x16(c14, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
1928 // inv vector_butterfly 32 96 1 0
1929 int16x16 c2 = add_x16(b2, b6);
1930 int16x16 c6 = sub_x16(b2, b6);
1931 // inv vector_reduce_ifforward 96
1932 // inv vector_butterfly 352 480 4 1
1933 int16x16 d22 = add_x16(c22, c30);
1934 int16x16 d30 = sub_x16(c22, c30);
1935 d30 = mulmod_scaled_x16(d30, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
1936 // inv vector_butterfly 288 416 4 1
1937 int16x16 d18 = add_x16(c18, c26);
1938 int16x16 d26 = sub_x16(c18, c26);
1939 d26 = mulmod_scaled_x16(d26, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
1940 // inv vector_butterfly 96 224 1 0
1941 int16x16 d6 = add_x16(c6, c14);
1942 int16x16 d14 = sub_x16(c6, c14);
1943 // inv vector_butterfly 32 160 1 0
1944 int16x16 d2 = add_x16(c2, c10);
1945 int16x16 d10 = sub_x16(c2, c10);
1946 // inv vector_reduce_ifreverse 288
1947 d18 = reduce_x16(d18, qdata);
1948 // inv vector_reduce_ifreverse 32
1949 d2 = reduce_x16(d2, qdata);
1950 // inv vector_butterfly 224 480 1 0
1951 int16x16 e14 = add_x16(d14, d30);
1952 int16x16 e30 = sub_x16(d14, d30);
1953 // inv vector_butterfly 96 352 1 0
1954 int16x16 e6 = add_x16(d6, d22);
1955 int16x16 e22 = sub_x16(d6, d22);
1956 // inv vector_butterfly 160 416 1 0
1957 int16x16 e10 = add_x16(d10, d26);
1958 int16x16 e26 = sub_x16(d10, d26);
1959 // inv vector_butterfly 32 288 1 0
1960 int16x16 e2 = add_x16(d2, d18);
1961 int16x16 e18 = sub_x16(d2, d18);
1962 // inv startbatch 512
1963 _mm256_storeu_si256((int16x16 *) (f + 32), e2);
1964 _mm256_storeu_si256((int16x16 *) (f + 96), e6);
1965 _mm256_storeu_si256((int16x16 *) (f + 160), e10);
1966 _mm256_storeu_si256((int16x16 *) (f + 224), e14);
1967 _mm256_storeu_si256((int16x16 *) (f + 288), e18);
1968 _mm256_storeu_si256((int16x16 *) (f + 352), e22);
1969 _mm256_storeu_si256((int16x16 *) (f + 416), e26);
1970 _mm256_storeu_si256((int16x16 *) (f + 480), e30);
1971 f += 512;
1972 }
1973 f -= 512 * reps;
1974 // inv stopbatch 512
1975 for (long long r = 0; r < reps; ++r) {
1976 // inv vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1977 int16x16 a25 = _mm256_loadu_si256((int16x16 *) (f + 400));
1978 int16x16 a29 = _mm256_loadu_si256((int16x16 *) (f + 464));
1979 int16x16 b25 = _mm256_permute2x128_si256_lo(a25, a29);
1980 int16x16 b29 = _mm256_permute2x128_si256_hi(a25, a29);
1981 // inv vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1982 int16x16 a17 = _mm256_loadu_si256((int16x16 *) (f + 272));
1983 int16x16 a21 = _mm256_loadu_si256((int16x16 *) (f + 336));
1984 int16x16 b17 = _mm256_permute2x128_si256_lo(a17, a21);
1985 int16x16 b21 = _mm256_permute2x128_si256_hi(a17, a21);
1986 // inv vector_permute 144 208 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1987 int16x16 a9 = _mm256_loadu_si256((int16x16 *) (f + 144));
1988 int16x16 a13 = _mm256_loadu_si256((int16x16 *) (f + 208));
1989 int16x16 b9 = _mm256_permute2x128_si256_lo(a9, a13);
1990 int16x16 b13 = _mm256_permute2x128_si256_hi(a9, a13);
1991 // inv vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1992 int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16));
1993 int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80));
1994 int16x16 b1 = _mm256_permute2x128_si256_lo(a1, a5);
1995 int16x16 b5 = _mm256_permute2x128_si256_hi(a1, a5);
1996 // inv vector_twist 464 512 507 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1997 b29 = mulmod_scaled_x16(b29, precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
1998 // inv vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1999 b25 = mulmod_scaled_x16(b25, precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
2000 // inv vector_twist 336 512 5 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
2001 b21 = mulmod_scaled_x16(b21, precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
2002 // inv vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
2003 b17 = mulmod_scaled_x16(b17, precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
2004 // inv vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
2005 b13 = mulmod_scaled_x16(b13, precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
2006 // inv vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
2007 b9 = mulmod_scaled_x16(b9, precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
2008 // inv vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
2009 b5 = mulmod_scaled_x16(b5, precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
2010 // inv vector_reduce 16
2011 b1 = reduce_x16(b1, qdata);
2012 // inv vector_butterfly 400 464 8 7
2013 int16x16 c25 = add_x16(b25, b29);
2014 int16x16 c29 = sub_x16(b25, b29);
2015 c29 = mulmod_scaled_x16(c29, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
2016 // inv vector_butterfly 272 336 8 1
2017 int16x16 c17 = add_x16(b17, b21);
2018 int16x16 c21 = sub_x16(b17, b21);
2019 c21 = mulmod_scaled_x16(c21, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
2020 // inv vector_butterfly 144 208 4 1
2021 int16x16 c9 = add_x16(b9, b13);
2022 int16x16 c13 = sub_x16(b9, b13);
2023 c13 = mulmod_scaled_x16(c13, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
2024 // inv vector_butterfly 16 80 1 0
2025 int16x16 c1 = add_x16(b1, b5);
2026 int16x16 c5 = sub_x16(b1, b5);
2027 // inv vector_reduce_ifforward 80
2028 // inv vector_butterfly 336 464 4 1
2029 int16x16 d21 = add_x16(c21, c29);
2030 int16x16 d29 = sub_x16(c21, c29);
2031 d29 = mulmod_scaled_x16(d29, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
2032 // inv vector_butterfly 272 400 4 1
2033 int16x16 d17 = add_x16(c17, c25);
2034 int16x16 d25 = sub_x16(c17, c25);
2035 d25 = mulmod_scaled_x16(d25, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
2036 // inv vector_butterfly 80 208 1 0
2037 int16x16 d5 = add_x16(c5, c13);
2038 int16x16 d13 = sub_x16(c5, c13);
2039 // inv vector_butterfly 16 144 1 0
2040 int16x16 d1 = add_x16(c1, c9);
2041 int16x16 d9 = sub_x16(c1, c9);
2042 // inv vector_reduce_ifreverse 272
2043 d17 = reduce_x16(d17, qdata);
2044 // inv vector_reduce_ifreverse 16
2045 d1 = reduce_x16(d1, qdata);
2046 // inv vector_butterfly 208 464 1 0
2047 int16x16 e13 = add_x16(d13, d29);
2048 int16x16 e29 = sub_x16(d13, d29);
2049 // inv vector_butterfly 80 336 1 0
2050 int16x16 e5 = add_x16(d5, d21);
2051 int16x16 e21 = sub_x16(d5, d21);
2052 // inv vector_butterfly 144 400 1 0
2053 int16x16 e9 = add_x16(d9, d25);
2054 int16x16 e25 = sub_x16(d9, d25);
2055 // inv vector_butterfly 16 272 1 0
2056 int16x16 e1 = add_x16(d1, d17);
2057 int16x16 e17 = sub_x16(d1, d17);
2058 // inv startbatch 512
2059 _mm256_storeu_si256((int16x16 *) (f + 16), e1);
2060 _mm256_storeu_si256((int16x16 *) (f + 80), e5);
2061 _mm256_storeu_si256((int16x16 *) (f + 144), e9);
2062 _mm256_storeu_si256((int16x16 *) (f + 208), e13);
2063 _mm256_storeu_si256((int16x16 *) (f + 272), e17);
2064 _mm256_storeu_si256((int16x16 *) (f + 336), e21);
2065 _mm256_storeu_si256((int16x16 *) (f + 400), e25);
2066 _mm256_storeu_si256((int16x16 *) (f + 464), e29);
2067 f += 512;
2068 }
2069 f -= 512 * reps;
2070 // inv stopbatch 512
2071 for (long long r = 0; r < reps; ++r) {
2072 // inv vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
2073 int16x16 a24 = _mm256_loadu_si256((int16x16 *) (f + 384));
2074 int16x16 a28 = _mm256_loadu_si256((int16x16 *) (f + 448));
2075 int16x16 b24 = _mm256_permute2x128_si256_lo(a24, a28);
2076 int16x16 b28 = _mm256_permute2x128_si256_hi(a24, a28);
2077 // inv vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
2078 int16x16 a16 = _mm256_loadu_si256((int16x16 *) (f + 256));
2079 int16x16 a20 = _mm256_loadu_si256((int16x16 *) (f + 320));
2080 int16x16 b16 = _mm256_permute2x128_si256_lo(a16, a20);
2081 int16x16 b20 = _mm256_permute2x128_si256_hi(a16, a20);
2082 // inv vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
2083 int16x16 a8 = _mm256_loadu_si256((int16x16 *) (f + 128));
2084 int16x16 a12 = _mm256_loadu_si256((int16x16 *) (f + 192));
2085 int16x16 b8 = _mm256_permute2x128_si256_lo(a8, a12);
2086 int16x16 b12 = _mm256_permute2x128_si256_hi(a8, a12);
2087 // inv vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
2088 int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0));
2089 int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64));
2090 int16x16 b0 = _mm256_permute2x128_si256_lo(a0, a4);
2091 int16x16 b4 = _mm256_permute2x128_si256_hi(a0, a4);
2092 // inv vector_twist 448 512 507 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
2093 b28 = mulmod_scaled_x16(b28, precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
2094 // inv vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
2095 b24 = mulmod_scaled_x16(b24, precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
2096 // inv vector_twist 320 512 5 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
2097 b20 = mulmod_scaled_x16(b20, precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
2098 // inv vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
2099 b16 = mulmod_scaled_x16(b16, precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
2100 // inv vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
2101 b12 = mulmod_scaled_x16(b12, precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
2102 // inv vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
2103 b8 = mulmod_scaled_x16(b8, precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
2104 // inv vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
2105 b4 = mulmod_scaled_x16(b4, precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
2106 // inv vector_reduce 0
2107 b0 = reduce_x16(b0, qdata);
2108 // inv vector_butterfly 384 448 8 7
2109 int16x16 c24 = add_x16(b24, b28);
2110 int16x16 c28 = sub_x16(b24, b28);
2111 c28 = mulmod_scaled_x16(c28, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
2112 // inv vector_butterfly 256 320 8 1
2113 int16x16 c16 = add_x16(b16, b20);
2114 int16x16 c20 = sub_x16(b16, b20);
2115 c20 = mulmod_scaled_x16(c20, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
2116 // inv vector_butterfly 128 192 4 1
2117 int16x16 c8 = add_x16(b8, b12);
2118 int16x16 c12 = sub_x16(b8, b12);
2119 c12 = mulmod_scaled_x16(c12, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
2120 // inv vector_butterfly 0 64 1 0
2121 int16x16 c0 = add_x16(b0, b4);
2122 int16x16 c4 = sub_x16(b0, b4);
2123 // inv vector_reduce_ifforward 64
2124 // inv vector_butterfly 320 448 4 1
2125 int16x16 d20 = add_x16(c20, c28);
2126 int16x16 d28 = sub_x16(c20, c28);
2127 d28 = mulmod_scaled_x16(d28, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
2128 // inv vector_butterfly 256 384 4 1
2129 int16x16 d16 = add_x16(c16, c24);
2130 int16x16 d24 = sub_x16(c16, c24);
2131 d24 = mulmod_scaled_x16(d24, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
2132 // inv vector_butterfly 64 192 1 0
2133 int16x16 d4 = add_x16(c4, c12);
2134 int16x16 d12 = sub_x16(c4, c12);
2135 // inv vector_butterfly 0 128 1 0
2136 int16x16 d0 = add_x16(c0, c8);
2137 int16x16 d8 = sub_x16(c0, c8);
2138 // inv vector_reduce_ifreverse 256
2139 d16 = reduce_x16(d16, qdata);
2140 // inv vector_reduce_ifreverse 0
2141 d0 = reduce_x16(d0, qdata);
2142 // inv vector_butterfly 192 448 1 0
2143 int16x16 e12 = add_x16(d12, d28);
2144 int16x16 e28 = sub_x16(d12, d28);
2145 // inv vector_butterfly 64 320 1 0
2146 int16x16 e4 = add_x16(d4, d20);
2147 int16x16 e20 = sub_x16(d4, d20);
2148 // inv vector_butterfly 128 384 1 0
2149 int16x16 e8 = add_x16(d8, d24);
2150 int16x16 e24 = sub_x16(d8, d24);
2151 // inv vector_butterfly 0 256 1 0
2152 int16x16 e0 = add_x16(d0, d16);
2153 int16x16 e16 = sub_x16(d0, d16);
2154 // inv startbatch 512
2155 _mm256_storeu_si256((int16x16 *) (f + 0), e0);
2156 _mm256_storeu_si256((int16x16 *) (f + 64), e4);
2157 _mm256_storeu_si256((int16x16 *) (f + 128), e8);
2158 _mm256_storeu_si256((int16x16 *) (f + 192), e12);
2159 _mm256_storeu_si256((int16x16 *) (f + 256), e16);
2160 _mm256_storeu_si256((int16x16 *) (f + 320), e20);
2161 _mm256_storeu_si256((int16x16 *) (f + 384), e24);
2162 _mm256_storeu_si256((int16x16 *) (f + 448), e28);
2163 f += 512;
2164 }
2165 // f -= 512*reps;
2166 // inv startntt 512
2167 }
2168
PQCLEAN_NTRULPR653_AVX2_invntt512_7681(int16 * f,int reps)2169 void PQCLEAN_NTRULPR653_AVX2_invntt512_7681(int16 *f, int reps) {
2170 invntt512(f, reps, qdata_7681.data);
2171 }
2172
PQCLEAN_NTRULPR653_AVX2_invntt512_10753(int16 * f,int reps)2173 void PQCLEAN_NTRULPR653_AVX2_invntt512_10753(int16 *f, int reps) {
2174 invntt512(f, reps, qdata_10753.data);
2175 }
2176