1 #include "crypto_core_multsntrup761_ntt.h"
2 #include <immintrin.h>
3 
4 // auto-generated; do not edit
5 
6 
7 #define _mm256_permute2x128_si256_lo(f0,f1) _mm256_permute2x128_si256(f0,f1,0x20)
8 #define _mm256_permute2x128_si256_hi(f0,f1) _mm256_permute2x128_si256(f0,f1,0x31)
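// Editorial note: immediate 0x20 selects the low 128-bit lanes of both inputs
// and 0x31 the high 128-bit lanes, so _lo/_hi gather the matching halves of
// two vectors into one.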
9 #define int16x16 __m256i
10 
11 typedef int16_t int16;
12 typedef int32_t int32;
13 
14 typedef union {
15     int16 data[106 * 16];
16     __m256i _dummy;
17 } vec1696;
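// Editorial note: the otherwise unused __m256i member forces 32-byte alignment
// of the 106*16 = 1696 int16 entries, so the aligned 256-bit loads behind the
// precomp macros below are valid.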
18 
19 static const vec1696 qdata_7681 = { .data = {
20 #define precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+0)
21         -3593, -3593, -3593, -3593, -3625, -3625, -3625, -3625, -3593, -3593, -3593, -3593, -3625, -3625, -3625, -3625,
22 #define precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+16)
23             -3777, -3777, -3777, -3777, 3182, 3182, 3182, 3182, -3777, -3777, -3777, -3777, 3182, 3182, 3182, 3182,
24 #define precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+32)
25             -3593, -3593, -3593, -3593, -3182, -3182, -3182, -3182, -3593, -3593, -3593, -3593, -3182, -3182, -3182, -3182,
26 #define precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+48)
27             3777, 3777, 3777, 3777, 3625, 3625, 3625, 3625, 3777, 3777, 3777, 3777, 3625, 3625, 3625, 3625,
28 #define precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+64)
29             -3593, -3593, -3593, -3593, 2194, 2194, 2194, 2194, -3593, -3593, -3593, -3593, 2194, 2194, 2194, 2194,
30 #define precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+80)
31             -3625, -3625, -3625, -3625, -1100, -1100, -1100, -1100, -3625, -3625, -3625, -3625, -1100, -1100, -1100, -1100,
32 #define precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+96)
33             -3593, -3593, -3593, -3593, 3696, 3696, 3696, 3696, -3593, -3593, -3593, -3593, 3696, 3696, 3696, 3696,
34 #define precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+112)
35             -3182, -3182, -3182, -3182, -2456, -2456, -2456, -2456, -3182, -3182, -3182, -3182, -2456, -2456, -2456, -2456,
36 #define precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+128)
37             -3593, 1701, 2194, 834, -3625, 2319, -1100, 121, -3593, 1701, 2194, 834, -3625, 2319, -1100, 121,
38 #define precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+144)
39             -3777, 1414, 2456, 2495, 3182, 2876, -3696, 2250, -3777, 1414, 2456, 2495, 3182, 2876, -3696, 2250,
40 #define precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+160)
41             -3593, -2250, 3696, -2876, -3182, -2495, -2456, -1414, -3593, -2250, 3696, -2876, -3182, -2495, -2456, -1414,
42 #define precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+176)
43             3777, -121, 1100, -2319, 3625, -834, -2194, -1701, 3777, -121, 1100, -2319, 3625, -834, -2194, -1701,
44 #define precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+192)
45             -3593, 3364, 1701, -1599, 2194, 2557, 834, -2816, -3593, 3364, 1701, -1599, 2194, 2557, 834, -2816,
46 #define precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+208)
47             -3625, 617, 2319, 2006, -1100, -1296, 121, 1986, -3625, 617, 2319, 2006, -1100, -1296, 121, 1986,
48 #define precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+224)
49             -3593, 2237, -2250, -1483, 3696, 3706, -2876, 1921, -3593, 2237, -2250, -1483, 3696, 3706, -2876, 1921,
50 #define precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+240)
51             -3182, 2088, -2495, -1525, -2456, 1993, -1414, 2830, -3182, 2088, -2495, -1525, -2456, 1993, -1414, 2830,
52 #define precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+256)
53             -3593, 514, 3364, 438, 1701, 2555, -1599, -1738, 2194, 103, 2557, 1881, 834, -549, -2816, 638,
54 #define precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+272)
55             -3625, -1399, 617, -1760, 2319, 2535, 2006, 3266, -1100, -1431, -1296, 3174, 121, 3153, 1986, -810,
56 #define precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+288)
57             -3777, 2956, -2830, -679, 1414, 2440, -1993, -3689, 2456, 2804, 1525, 3555, 2495, 1535, -2088, -7,
58 #define precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+304)
59             3182, -1321, -1921, -1305, 2876, -3772, -3706, 3600, -3696, -2043, 1483, -396, 2250, -2310, -2237, 1887,
60 #define precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+320)
61             -3593, -1887, 2237, 2310, -2250, 396, -1483, 2043, 3696, -3600, 3706, 3772, -2876, 1305, 1921, 1321,
62 #define precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+336)
63             -3182, 7, 2088, -1535, -2495, -3555, -1525, -2804, -2456, 3689, 1993, -2440, -1414, 679, 2830, -2956,
64 #define precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+352)
65             3777, 810, -1986, -3153, -121, -3174, 1296, 1431, 1100, -3266, -2006, -2535, -2319, 1760, -617, 1399,
66 #define precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+368)
67             3625, -638, 2816, 549, -834, -1881, -2557, -103, -2194, 1738, 1599, -2555, -1701, -438, -3364, -514,
68 #define precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+384)
69             -3593, -1532, 514, -373, 3364, -3816, 438, -3456, 1701, 783, 2555, 2883, -1599, 727, -1738, -2385,
70 #define precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+400)
71             2194, -2160, 103, -2391, 2557, 2762, 1881, -2426, 834, 3310, -549, -1350, -2816, 1386, 638, -194,
72 #define precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+416)
73             -3625, 404, -1399, -3692, 617, -2764, -1760, -1054, 2319, 1799, 2535, -3588, 2006, 1533, 3266, 2113,
74 #define precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+432)
75             -1100, -2579, -1431, -1756, -1296, 1598, 3174, -2, 121, -3480, 3153, -2572, 1986, 2743, -810, 2919,
76 #define precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+448)
77             -3593, 2789, -1887, -921, 2237, -1497, 2310, -2133, -2250, -915, 396, 1390, -1483, 3135, 2043, -859,
78 #define precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+464)
79             3696, 2732, -3600, -1464, 3706, 2224, 3772, -2665, -2876, 1698, 1305, 2835, 1921, 730, 1321, 486,
80 #define precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+480)
81             -3182, 3417, 7, -3428, 2088, -3145, -1535, 1168, -2495, -3831, -3555, -3750, -1525, 660, -2804, 2649,
82 #define precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+496)
83             -2456, 3405, 3689, -1521, 1993, 1681, -2440, 1056, -1414, 1166, 679, -2233, 2830, 2175, -2956, -1919,
84 #define precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+512)
85             -3593, -1404, -1532, 451, 514, -402, -373, 1278, 3364, -509, -3816, -3770, 438, -2345, -3456, -226,
86 #define precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+528)
87             1701, -1689, 783, -1509, 2555, 2963, 2883, 1242, -1599, 1669, 727, 2719, -1738, 642, -2385, -436,
88 #define precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+544)
89             2194, 3335, -2160, 1779, 103, 3745, -2391, 17, 2557, 2812, 2762, -1144, 1881, 83, -2426, -1181,
90 #define precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+560)
91             834, -1519, 3310, 3568, -549, -796, -1350, 2072, -2816, -2460, 1386, 2891, 638, -2083, -194, -715,
92 #define precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+576)
93             -3593, -402, -3816, -226, 2555, 1669, -2385, 1779, 2557, 83, 3310, 2072, 638, 1012, -3692, 1295,
94 #define precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+592)
95             2319, -3208, 1533, -2071, -1431, -2005, -2, 1586, 1986, -293, 1919, -929, -679, 777, -1681, -3461,
96 #define precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+608)
97             2456, 3366, 3750, -1203, 1535, -3657, -3417, -1712, -1921, 2515, 2665, -1070, 3600, 2532, -3135, -2589,
98 #define precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+624)
99             2250, -2258, 921, -658, -514, 509, 3456, 1509, 1599, -642, 2160, -17, -1881, 1519, 1350, -2891,
100 #define precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+640)
101             -3593, -3434, -1497, 893, 396, -2422, -859, 2965, 3706, -2339, 1698, -2937, 1321, -670, -3428, -3163,
102 #define precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+656)
103             -2495, -1072, 660, 1084, 3689, -179, 1056, -1338, 2830, 2786, -2919, -3677, -3153, -151, -1598, 3334,
104 #define precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+672)
105             1100, -3314, 3588, 2262, 1760, -2230, -404, 2083, 2816, -3568, 2426, -2812, -103, 436, -727, -2963,
106 #define precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+688)
107             -1701, 3770, 373, 1404, 1887, -1649, 2133, -826, 1483, 434, -2732, 3287, -3772, -2378, -2835, 3723,
108 #define precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+704)
109             -3593, 658, 2789, 370, -1887, -3434, -921, -3752, 2237, 1649, -1497, 2258, 2310, 3581, -2133, 893,
110 #define precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+720)
111             -2250, 3794, -915, 826, 396, 2589, 1390, 592, -1483, -2422, 3135, 3214, 2043, -434, -859, -2532,
112 #define precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+736)
113             3696, 1121, 2732, 2965, -3600, 2998, -1464, -3287, 3706, 1070, 2224, -589, 3772, -2339, -2665, 2070,
114 #define precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+752)
115             -2876, 2378, 1698, -2515, 1305, -2815, 2835, -2937, 1921, -1348, 730, -3723, 1321, 1712, 486, 2130,
116 #define q_x16 *(const int16x16 *)(qdata+768)
117             7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681,
118 #define qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+784)
119             -9, -9, -9, -9, -16425, -16425, -16425, -16425, -9, -9, -9, -9, -16425, -16425, -16425, -16425,
120 #define qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+800)
121             -28865, -28865, -28865, -28865, 10350, 10350, 10350, 10350, -28865, -28865, -28865, -28865, 10350, 10350, 10350, 10350,
122 #define qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+816)
123             -9, -9, -9, -9, -10350, -10350, -10350, -10350, -9, -9, -9, -9, -10350, -10350, -10350, -10350,
124 #define qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+832)
125             28865, 28865, 28865, 28865, 16425, 16425, 16425, 16425, 28865, 28865, 28865, 28865, 16425, 16425, 16425, 16425,
126 #define qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+848)
127             -9, -9, -9, -9, -4974, -4974, -4974, -4974, -9, -9, -9, -9, -4974, -4974, -4974, -4974,
128 #define qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+864)
129             -16425, -16425, -16425, -16425, -7244, -7244, -7244, -7244, -16425, -16425, -16425, -16425, -7244, -7244, -7244, -7244,
130 #define qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+880)
131             -9, -9, -9, -9, -4496, -4496, -4496, -4496, -9, -9, -9, -9, -4496, -4496, -4496, -4496,
132 #define qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+896)
133             -10350, -10350, -10350, -10350, -14744, -14744, -14744, -14744, -10350, -10350, -10350, -10350, -14744, -14744, -14744, -14744,
134 #define qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+912)
135             -9, -20315, -4974, 18242, -16425, 18191, -7244, -11655, -9, -20315, -4974, 18242, -16425, 18191, -7244, -11655,
136 #define qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+928)
137             -28865, 20870, 14744, -22593, 10350, 828, 4496, 23754, -28865, 20870, 14744, -22593, 10350, 828, 4496, 23754,
138 #define qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+944)
139             -9, -23754, -4496, -828, -10350, 22593, -14744, -20870, -9, -23754, -4496, -828, -10350, 22593, -14744, -20870,
140 #define qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+960)
141             28865, 11655, 7244, -18191, 16425, -18242, 4974, 20315, 28865, 11655, 7244, -18191, 16425, -18242, 4974, 20315,
142 #define qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+976)
143             -9, -10972, -20315, 23489, -4974, 25597, 18242, -2816, -9, -10972, -20315, 23489, -4974, 25597, 18242, -2816,
144 #define qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+992)
145             -16425, -19351, 18191, -3114, -7244, -9488, -11655, 19394, -16425, -19351, 18191, -3114, -7244, -9488, -11655, 19394,
146 #define qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+1008)
147             -9, -7491, -23754, -15307, -4496, -15750, -828, -5759, -9, -7491, -23754, -15307, -4496, -15750, -828, -5759,
148 #define qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1024)
149             -10350, 22568, 22593, -20469, -14744, 31177, -20870, 26382, -10350, 22568, 22593, -20469, -14744, 31177, -20870, 26382,
150 #define qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1040)
151             -9, -14846, -10972, -21066, -20315, -24581, 23489, -23242, -4974, -4505, 25597, -26279, 18242, 21467, -2816, 15998,
152 #define qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1056)
153             -16425, -4983, -19351, 14624, 18191, -2073, -3114, 20674, -7244, -21399, -9488, 6246, -11655, -29103, 19394, -5930,
154 #define qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1072)
155             -28865, -23668, -26382, -28839, 20870, 6536, -31177, 16279, 14744, 29428, 20469, 29667, -22593, 9215, -22568, -11783,
156 #define qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1088)
157             10350, -14121, 5759, -5913, 828, -1724, 15750, 11792, 4496, 25093, 15307, 26228, 23754, -21766, 7491, -6817,
158 #define qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1104)
159             -9, 6817, -7491, 21766, -23754, -26228, -15307, -25093, -4496, -11792, -15750, 1724, -828, 5913, -5759, 14121,
160 #define qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1120)
161             -10350, 11783, 22568, -9215, 22593, -29667, -20469, -29428, -14744, -16279, 31177, -6536, -20870, 28839, 26382, 23668,
162 #define qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1136)
163             28865, 5930, -19394, 29103, 11655, -6246, 9488, 21399, 7244, -20674, 3114, 2073, -18191, -14624, 19351, 4983,
164 #define qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1152)
165             16425, -15998, 2816, -21467, -18242, 26279, -25597, 4505, 4974, 23242, -23489, 24581, 20315, 21066, 10972, 14846,
166 #define qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1168)
167             -9, -32252, -14846, -19317, -10972, 8472, -21066, -3456, -20315, 16655, -24581, 12611, 23489, -12073, -23242, 29871,
168 #define qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1184)
169             -4974, 6032, -4505, 10409, 25597, 24266, -26279, 17030, 18242, 10478, 21467, 11962, -2816, -26262, 15998, -17602,
170 #define qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1200)
171             -16425, -22124, -4983, -26220, -19351, -8908, 14624, 32738, 18191, 13575, -2073, 27132, -3114, 24573, 20674, 27201,
172 #define qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1216)
173             -7244, 12269, -21399, -16092, -9488, -15810, 6246, 15358, -11655, -15768, -29103, 24052, 19394, -26441, -5930, -1689,
174 #define qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1232)
175             -9, 13541, 6817, -5529, -7491, 26663, 21766, -4693, -23754, 13933, -26228, 8558, -15307, -21953, -25093, -22875,
176 #define qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1248)
177             -4496, -7508, -11792, -30136, -15750, 26800, 1724, 17303, -828, 2722, 5913, -12013, -5759, 30426, 14121, 3558,
178 #define qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1264)
179             -10350, -24743, 11783, -21860, 22568, -32329, -9215, 9360, 22593, -7415, -29667, 25946, -20469, -21868, -29428, -25511,
180 #define qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1280)
181             -14744, 1869, -16279, 14351, 31177, 2193, -6536, 17440, -20870, 24718, 28839, -23225, 26382, 9855, 23668, -9599,
182 #define qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1296)
183             -9, -32124, -32252, 10179, -14846, 6766, -19317, 16638, -10972, -23549, 8472, -17082, -21066, -15145, -3456, 31518,
184 #define qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1312)
185             -20315, -6297, 16655, -12261, -24581, -11885, 12611, 30938, 23489, 28805, -12073, 26783, -23242, -14718, 29871, 5708,
186 #define qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1328)
187             -4974, 15111, 6032, -29453, -4505, 12449, 10409, 529, 25597, -32004, 24266, 2952, -26279, 18003, 17030, 24931,
188 #define qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1344)
189             18242, -1007, 10478, -4624, 21467, 17636, 11962, 14360, -2816, 15972, -26262, 16715, 15998, 4573, -17602, -14539,
190 #define qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1360)
191             -9, 6766, 8472, 31518, -24581, 28805, 29871, -29453, 25597, 18003, 10478, 14360, 15998, 27636, -26220, 17167,
192 #define qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1376)
193             18191, -7304, 24573, -22039, -21399, -4565, 15358, 10802, 19394, 21723, 9599, -9633, -28839, -2807, -2193, -30597,
194 #define qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1392)
195             14744, -26330, -25946, -2739, 9215, 32695, 24743, -26288, 5759, 20435, -17303, 24530, 11792, 20964, 21953, 23523,
196 #define qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1408)
197             23754, -27858, 5529, 6510, 14846, 23549, 3456, 12261, -23489, 14718, -6032, -529, 26279, 1007, -11962, -16715,
198 #define qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1424)
199             -9, 24214, 26663, 23933, -26228, -13686, -22875, -27243, -15750, 4317, 2722, 8839, 14121, -32414, -21860, -25179,
200 #define qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1440)
201             22593, -25648, -21868, -964, -16279, -1715, 17440, -14650, 26382, -28958, 1689, -10333, 29103, -20119, 15810, 22790,
202 #define qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1456)
203             7244, 20238, -27132, -2858, -14624, 19274, 22124, -4573, 2816, 4624, -17030, 32004, 4505, -5708, 12073, 11885,
204 #define qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1472)
205             20315, 17082, 19317, 32124, -6817, 14223, 4693, -14138, 15307, 9650, 7508, -9513, -1724, -23882, 12013, -15221,
206 #define qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1488)
207             -9, -6510, 13541, -23182, 6817, 24214, -5529, -24232, -7491, -14223, 26663, 27858, 21766, 26621, -4693, 23933,
208 #define qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1504)
209             -23754, 29394, 13933, 14138, -26228, -23523, 8558, -23984, -15307, -13686, -21953, 26766, -25093, -9650, -22875, -20964,
210 #define qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1520)
211             -4496, -22943, -7508, -27243, -11792, -18506, -30136, 9513, -15750, -24530, 26800, 947, 1724, 4317, 17303, 29718,
212 #define qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1536)
213             -828, 23882, 2722, -20435, 5913, -10495, -12013, 8839, -5759, -3396, 30426, 15221, 14121, 26288, 3558, 27730,
214 #define qinvscaledzeta_x16_4_1 *(const int16x16 *)(qdata+1552)
215             -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865,
216 #define qinvscaledzeta_x16_4_3 *(const int16x16 *)(qdata+1568)
217             28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865,
218 #define qinvscaledzeta_x16_8_1 *(const int16x16 *)(qdata+1584)
219             -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425,
220 #define qinvscaledzeta_x16_8_7 *(const int16x16 *)(qdata+1600)
221             -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350,
222 #define qround32_x16 *(const int16x16 *)(qdata+1616)
223             4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
224 #define scaledzeta_x16_4_1 *(const int16x16 *)(qdata+1632)
225             -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777,
226 #define scaledzeta_x16_4_3 *(const int16x16 *)(qdata+1648)
227             3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777,
228 #define scaledzeta_x16_8_1 *(const int16x16 *)(qdata+1664)
229             -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625,
230 #define scaledzeta_x16_8_7 *(const int16x16 *)(qdata+1680)
231             -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182,
232         }
233 } ;
234 
235 static const vec1696 qdata_10753 = { .data = {
236         // precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
237         1018, 1018, 1018, 1018, 3688, 3688, 3688, 3688, 1018, 1018, 1018, 1018, 3688, 3688, 3688, 3688,
238         // precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
239         -223, -223, -223, -223, -4188, -4188, -4188, -4188, -223, -223, -223, -223, -4188, -4188, -4188, -4188,
240         // precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
241         1018, 1018, 1018, 1018, 4188, 4188, 4188, 4188, 1018, 1018, 1018, 1018, 4188, 4188, 4188, 4188,
242         // precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
243         223, 223, 223, 223, -3688, -3688, -3688, -3688, 223, 223, 223, 223, -3688, -3688, -3688, -3688,
244         // precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
245         1018, 1018, 1018, 1018, -376, -376, -376, -376, 1018, 1018, 1018, 1018, -376, -376, -376, -376,
246         // precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
247         3688, 3688, 3688, 3688, -3686, -3686, -3686, -3686, 3688, 3688, 3688, 3688, -3686, -3686, -3686, -3686,
248         // precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
249         1018, 1018, 1018, 1018, -2413, -2413, -2413, -2413, 1018, 1018, 1018, 1018, -2413, -2413, -2413, -2413,
250         // precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
251         4188, 4188, 4188, 4188, -357, -357, -357, -357, 4188, 4188, 4188, 4188, -357, -357, -357, -357,
252         // precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
253         1018, -3364, -376, 4855, 3688, 425, -3686, 2695, 1018, -3364, -376, 4855, 3688, 425, -3686, 2695,
254         // precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
255         -223, -3784, 357, -2236, -4188, 4544, 2413, 730, -223, -3784, 357, -2236, -4188, 4544, 2413, 730,
256         // precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
257         1018, -730, -2413, -4544, 4188, 2236, -357, 3784, 1018, -730, -2413, -4544, 4188, 2236, -357, 3784,
258         // precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
259         223, -2695, 3686, -425, -3688, -4855, 376, 3364, 223, -2695, 3686, -425, -3688, -4855, 376, 3364,
260         // precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
261         1018, -5175, -3364, 2503, -376, 1341, 4855, -4875, 1018, -5175, -3364, 2503, -376, 1341, 4855, -4875,
262         // precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
263         3688, -2629, 425, -4347, -3686, 3823, 2695, -4035, 3688, -2629, 425, -4347, -3686, 3823, 2695, -4035,
264         // precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
265         1018, 5063, -730, 341, -2413, -3012, -4544, -5213, 1018, 5063, -730, 341, -2413, -3012, -4544, -5213,
266         // precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
267         4188, 1520, 2236, 1931, -357, 918, 3784, 4095, 4188, 1520, 2236, 1931, -357, 918, 3784, 4095,
268         // precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
269         1018, 3085, -5175, 2982, -3364, -4744, 2503, -4129, -376, -2576, 1341, -193, 4855, 3062, -4875, 4,
270         // precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
271         3688, 2388, -2629, -4513, 425, 4742, -4347, 2935, -3686, -544, 3823, -2178, 2695, 847, -4035, 268,
272         // precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
273         -223, -1299, -4095, -1287, -3784, -4876, -918, 3091, 357, -4189, -1931, 4616, -2236, 2984, -1520, -3550,
274         // precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
275         -4188, -1009, 5213, -205, 4544, -4102, 3012, 2790, 2413, -1085, -341, -2565, 730, -4379, -5063, -1284,
276         // precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
277         1018, 1284, 5063, 4379, -730, 2565, 341, 1085, -2413, -2790, -3012, 4102, -4544, 205, -5213, 1009,
278         // precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
279         4188, 3550, 1520, -2984, 2236, -4616, 1931, 4189, -357, -3091, 918, 4876, 3784, 1287, 4095, 1299,
280         // precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
281         223, -268, 4035, -847, -2695, 2178, -3823, 544, 3686, -2935, 4347, -4742, -425, 4513, 2629, -2388,
282         // precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
283         -3688, -4, 4875, -3062, -4855, 193, -1341, 2576, 376, 4129, -2503, 4744, 3364, -2982, 5175, -3085,
284         // precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
285         1018, 5116, 3085, -3615, -5175, 400, 2982, 3198, -3364, 2234, -4744, -4828, 2503, 326, -4129, -512,
286         // precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
287         -376, 1068, -2576, -4580, 1341, 3169, -193, -2998, 4855, -635, 3062, -4808, -4875, -2740, 4, 675,
288         // precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
289         3688, -1324, 2388, 5114, -2629, 5294, -4513, -794, 425, -864, 4742, -886, -4347, 336, 2935, -2045,
290         // precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
291         -3686, -3715, -544, 4977, 3823, -2737, -2178, 3441, 2695, 467, 847, 454, -4035, -779, 268, 2213,
292         // precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
293         1018, 1615, 1284, 2206, 5063, 5064, 4379, 472, -730, -5341, 2565, -4286, 341, 2981, 1085, -1268,
294         // precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
295         -2413, -3057, -2790, -2884, -3012, -1356, 4102, -3337, -4544, 5023, 205, -636, -5213, 909, 1009, -2973,
296         // precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
297         4188, 2271, 3550, -1572, 1520, 1841, -2984, 970, 2236, -4734, -4616, 578, 1931, -116, 4189, 1586,
298         // precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
299         -357, -2774, -3091, -1006, 918, -5156, 4876, 4123, 3784, -567, 1287, 151, 4095, 1458, 1299, 2684,
300         // precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
301         1018, -3260, 5116, -1722, 3085, 5120, -3615, 3760, -5175, 73, 400, 4254, 2982, 2788, 3198, -2657,
302         // precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
303         -3364, 569, 2234, 1930, -4744, -2279, -4828, 5215, 2503, -4403, 326, 1639, -4129, 5068, -512, -5015,
304         // precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
305         -376, -4859, 1068, -40, -2576, 4003, -4580, -4621, 1341, 2487, 3169, -2374, -193, 2625, -2998, 4784,
306         // precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
307         4855, 825, -635, 2118, 3062, -2813, -4808, -4250, -4875, -2113, -2740, -4408, 4, -1893, 675, 458,
308         // precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
309         1018, 5120, 400, -2657, -4744, -4403, -512, -40, 1341, 2625, -635, -4250, 4, -3360, 5114, -5313,
310         // precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
311         425, -2151, 336, -2662, -544, 5334, 3441, 2117, -4035, 2205, -2684, -3570, -1287, -4973, 5156, 2419,
312         // precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
313         357, 1204, -578, 1635, 2984, -1111, -2271, 4359, 5213, -2449, 3337, 3453, 2790, 554, -2981, -1409,
314         // precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
315         730, -279, -2206, 3524, -3085, -73, -3198, -1930, -2503, -5068, -1068, 4621, 193, -825, 4808, 4408,
316         // precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
317         1018, 4428, 5064, -4000, 2565, 573, -1268, 3125, -3012, -4144, 5023, 1927, 1009, -2139, -1572, 3535,
318         // precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
319         2236, 663, -116, 4967, -3091, -854, 4123, 1160, 4095, -1349, -2213, 1782, -847, 2062, 2737, 624,
320         // precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
321         3686, -2283, 886, 4889, 4513, -4601, 1324, 1893, 4875, -2118, 2998, -2487, 2576, 5015, -326, 2279,
322         // precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
323         3364, -4254, 3615, 3260, -1284, -1381, -472, -3891, -341, 2087, 3057, 4720, -4102, 3410, 636, 1689,
324         // precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
325         1018, -3524, 1615, 5268, 1284, 4428, 2206, -834, 5063, 1381, 5064, 279, 4379, 2439, 472, -4000,
326         // precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
327         -730, -2015, -5341, 3891, 2565, 1409, -4286, 2605, 341, 573, 2981, 5356, 1085, -2087, -1268, -554,
328         // precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
329         -2413, 3135, -3057, 3125, -2790, -778, -2884, -4720, -3012, -3453, -1356, -355, 4102, -4144, -3337, -152,
330         // precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
331         -4544, -3410, 5023, 2449, 205, -97, -636, 1927, -5213, 2624, 909, -1689, 1009, -4359, -2973, -3419,
332         // q_x16
333         10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753,
334         // qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
335         -6, -6, -6, -6, -408, -408, -408, -408, -6, -6, -6, -6, -408, -408, -408, -408,
336         // qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
337         -27359, -27359, -27359, -27359, 1956, 1956, 1956, 1956, -27359, -27359, -27359, -27359, 1956, 1956, 1956, 1956,
338         // qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
339         -6, -6, -6, -6, -1956, -1956, -1956, -1956, -6, -6, -6, -6, -1956, -1956, -1956, -1956,
340         // qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
341         27359, 27359, 27359, 27359, 408, 408, 408, 408, 27359, 27359, 27359, 27359, 408, 408, 408, 408,
342         // qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
343         -6, -6, -6, -6, -20856, -20856, -20856, -20856, -6, -6, -6, -6, -20856, -20856, -20856, -20856,
344         // qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
345         -408, -408, -408, -408, -21094, -21094, -21094, -21094, -408, -408, -408, -408, -21094, -21094, -21094, -21094,
346         // qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
347         -6, -6, -6, -6, -10093, -10093, -10093, -10093, -6, -6, -6, -6, -10093, -10093, -10093, -10093,
348         // qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
349         -1956, -1956, -1956, -1956, -28517, -28517, -28517, -28517, -1956, -1956, -1956, -1956, -28517, -28517, -28517, -28517,
350         // qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
351         -6, -9508, -20856, -29449, -408, 18345, -21094, -7033, -6, -9508, -20856, -29449, -408, 18345, -21094, -7033,
352         // qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
353         -27359, -16072, 28517, -12476, 1956, -28224, 10093, 16090, -27359, -16072, 28517, -12476, 1956, -28224, 10093, 16090,
354         // qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
355         -6, -16090, -10093, 28224, -1956, 12476, -28517, 16072, -6, -16090, -10093, 28224, -1956, 12476, -28517, 16072,
356         // qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
357         27359, 7033, 21094, -18345, 408, 29449, 20856, 9508, 27359, 7033, 21094, -18345, 408, 29449, 20856, 9508,
358         // qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
359         -6, -3639, -9508, 25543, -20856, 829, -29449, -17675, -6, -3639, -9508, 25543, -20856, 829, -29449, -17675,
360         // qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
361         -408, 18363, 18345, 7429, -21094, -10001, -7033, -4547, -408, 18363, 18345, 7429, -21094, -10001, -7033, -4547,
362         // qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
363         -6, 28103, -16090, 3925, -10093, 7228, 28224, 11683, -6, 28103, -16090, 3925, -10093, 7228, 28224, 11683,
364         // qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
365         -1956, -23056, 12476, 14731, -28517, 26518, 16072, 14847, -1956, -23056, 12476, 14731, -28517, 26518, 16072, 14847,
366         // qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
367         -6, -5619, -3639, -12378, -9508, 15736, 25543, 23007, -20856, -27152, 829, -22209, -29449, -20490, -17675, 22532,
368         // qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
369         -408, 16724, 18363, 22623, 18345, 5766, 7429, -31369, -21094, 15840, -10001, 19326, -7033, 3407, -4547, 2316,
370         // qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
371         -27359, 6381, -14847, 8441, -16072, -6924, -26518, -4589, 28517, 12707, -14731, -15864, -12476, 31656, 23056, 24098,
372         // qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
373         1956, -31217, -11683, -24269, -28224, -5126, -7228, 20198, 10093, -573, -3925, -14341, 16090, 23781, -28103, -23812,
374         // qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
375         -6, 23812, 28103, -23781, -16090, 14341, 3925, 573, -10093, -20198, 7228, 5126, 28224, 24269, 11683, 31217,
376         // qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
377         -1956, -24098, -23056, -31656, 12476, 15864, 14731, -12707, -28517, 4589, 26518, 6924, 16072, -8441, 14847, -6381,
378         // qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
379         27359, -2316, 4547, -3407, 7033, -19326, 10001, -15840, 21094, 31369, -7429, -5766, -18345, -22623, -18363, -16724,
380         // qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
381         408, -22532, 17675, 20490, 29449, 22209, -829, 27152, 20856, -23007, -25543, -15736, 9508, 12378, 3639, 5619,
382         // qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
383         -6, -17412, -5619, 2017, -3639, 24976, -12378, 24702, -9508, -31558, 15736, 1316, 25543, -31418, 23007, -512,
384         // qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
385         -20856, -13268, -27152, 22044, 829, 8801, -22209, -12214, -29449, 11141, -20490, -17096, -17675, 32076, 22532, 17571,
386         // qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
387         -408, 13012, 16724, 4090, 18363, -30546, 22623, 16614, 18345, -17248, 5766, 22666, 7429, -7856, -31369, 31235,
388         // qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
389         -21094, 28541, 15840, -30351, -10001, -177, 19326, -31887, -7033, 25555, 3407, -31290, -4547, -13579, 2316, -2395,
390         // qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
391         -6, 4175, 23812, 7326, 28103, 17352, -23781, -28200, -16090, 11555, 14341, 6978, 3925, -1627, 573, 780,
392         // qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
393         -10093, 32271, -20198, 7356, 7228, 29364, 5126, 27895, 28224, -609, 24269, 21892, 11683, -7795, 31217, -18845,
394         // qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
395         -1956, 29407, -24098, -7716, -23056, -719, -31656, -8246, 12476, -26238, 15864, 11842, 14731, 1932, -12707, -11726,
396         // qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
397         -28517, 4394, 4589, 2066, 26518, -11300, 6924, -24037, 16072, 969, -8441, 14999, 14847, -11854, -6381, -19844,
398         // qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
399         -6, -13500, -17412, 32070, -5619, 5120, 2017, 11952, -3639, 1609, 24976, 9374, -12378, -23836, 24702, -8289,
400         // qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
401         -9508, -22471, -31558, 25482, 15736, -8935, 1316, 32351, 25543, 19661, -31418, 8295, 23007, -25652, -512, -19863,
402         // qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
403         -20856, 6917, -13268, -28712, -27152, 20899, 22044, 4083, 829, 951, 8801, 29370, -22209, 24641, -12214, 12976,
404         // qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
405         -29449, -22215, 11141, -29626, -20490, 30467, -17096, 13158, -17675, -24129, 32076, 7880, 22532, -30053, 17571, -8758,
406         // qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
407         -6, 5120, 24976, -8289, 15736, 19661, -512, -28712, 829, 24641, 11141, 13158, 22532, 13024, 4090, -27329,
408         // qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
409         18345, -8807, -7856, -20070, 15840, -1834, -31887, -18875, -4547, 18077, 19844, -23026, 8441, -12653, 11300, 11123,
410         // qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
411         28517, 31924, -11842, -14237, 31656, 16809, -29407, -5369, -11683, -16273, -27895, -29827, 20198, 7722, 1627, 9343,
412         // qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
413         16090, -15127, -7326, -6716, 5619, -1609, -24702, -25482, -25543, 25652, 13268, -4083, 22209, 22215, 17096, -7880,
414         // qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
415         -6, -26292, 17352, 12384, 14341, 61, 780, 23093, 7228, -12336, -609, -7801, 31217, -6747, -7716, 6095,
416         // qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
417         12476, 15511, 1932, 11623, 4589, 6314, -24037, -19320, 14847, 19643, 2395, -21770, -3407, -17394, 177, -23952,
418         // qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
419         21094, -31467, -22666, -1767, -22623, -14329, -13012, 30053, 17675, 29626, 12214, -951, 27152, 19863, 31418, 8935,
420         // qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
421         9508, -9374, -2017, 13500, -23812, -29541, 28200, 20173, -3925, -24025, -32271, -19856, -5126, -26286, -21892, -4967,
422         // qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
423         -6, 6716, 4175, -13164, 23812, -26292, 7326, -12098, 28103, 29541, 17352, 15127, -23781, -7289, -28200, 12384,
424         // qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
425         -16090, -29151, 11555, -20173, 14341, -9343, 6978, -22483, 3925, 61, -1627, 23788, 573, 24025, 780, -7722,
426         // qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
427         -10093, -18881, 32271, 23093, -20198, -24330, 7356, 19856, 7228, 29827, 29364, 15517, 5126, -12336, 27895, -4248,
428         // qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
429         28224, 26286, -609, 16273, 24269, -5729, 21892, -7801, 11683, -30144, -7795, 4967, 31217, 5369, -18845, -8027,
430         // qinvscaledzeta_x16_4_1
431         -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359,
432         // qinvscaledzeta_x16_4_3
433         27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359,
434         // qinvscaledzeta_x16_8_1
435         -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408,
436         // qinvscaledzeta_x16_8_7
437         -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956,
438         // qround32_x16
439         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
440         // scaledzeta_x16_4_1
441         -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223,
442         // scaledzeta_x16_4_3
443         223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223,
444         // scaledzeta_x16_8_1
445         3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688,
446         // scaledzeta_x16_8_7
447         4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188,
448     }
449 } ;
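// Editorial note: qdata_7681 and qdata_10753 share the vec1696 layout, so the
// macros defined alongside qdata_7681 (q_x16, qround32_x16, precomp_*, ...)
// are fixed offsets into whichever table is passed as the qdata argument of
// the helpers below; that argument is consumed through those macros even
// though it can look unused.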
450 
451 static inline int16x16 add_x16(int16x16 a, int16x16 b) {
452     return _mm256_add_epi16(a, b);
453 }
454 
455 static inline int16x16 sub_x16(int16x16 a, int16x16 b) {
456     return _mm256_sub_epi16(a, b);
457 }
458 
459 static inline int16x16 mulmod_scaled_x16(int16x16 x, int16x16 y, int16x16 yqinv, const int16 *qdata) {
460     int16x16 b = _mm256_mulhi_epi16(x, y);
461     int16x16 d = _mm256_mullo_epi16(x, yqinv);
462     int16x16 e = _mm256_mulhi_epi16(d, q_x16);
463     return sub_x16(b, e);
464 }
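// Editorial sketch (not part of the generated code and not called below): a
// scalar model of one lane of mulmod_scaled_x16, assuming the convention
// visible in the tables above that yqinv == y * q^-1 (mod 2^16). It computes
// a Montgomery-style product (x*y)/2^16 mod q with result roughly in (-q,q);
// since the precomputed constants are stored pre-scaled by 2^16 mod q
// (e.g. -3593 == 2^16 mod 7681, 1018 == 2^16 mod 10753), the net effect is
// x*zeta mod q.
static inline int16 mulmod_scaled_scalar_sketch(int16 x, int16 y, int16 yqinv, int16 q) {
    int16 b = (int16)(((int32)x * y) >> 16);    /* mulhi: high 16 bits of x*y */
    int16 d = (int16)((int32)x * yqinv);        /* mullo: low 16 bits of x*y*q^-1 */
    int16 e = (int16)(((int32)d * q) >> 16);    /* mulhi: high 16 bits of d*q */
    return (int16)(b - e);                      /* == (x*y - d*q) / 2^16 */
}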
465 
466 static inline int16x16 reduce_x16(int16x16 x, const int16 *qdata) {
467     int16x16 y = _mm256_mulhrs_epi16(x, qround32_x16);
468     y = _mm256_mullo_epi16(y, q_x16);
469     return sub_x16(x, y);
470 }
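// Editorial sketch (not part of the generated code and not called below): a
// scalar model of one lane of reduce_x16. qround32_x16 holds round(2^15/q)
// (4 for q = 7681, 3 for q = 10753), so the mulhrs step approximates
// round(x/q) and the subtraction leaves a representative of x mod q with much
// smaller magnitude (centered, but not fully canonical).
static inline int16 reduce_scalar_sketch(int16 x, int16 q, int16 qround) {
    int16 y = (int16)(((int32)x * qround + 16384) >> 15);  /* mulhrs: ~round(x/q) */
    return (int16)(x - (int32)y * q);
}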
471 
472 // ----- codegen pass 1
473 //
474 // startntt 512
475 // startbatch 512
476 // // ----- PRECONDITIONS
477 // physical_map (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
478 // // transform size 512
479 // // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8]
480 // // transforms per batch 1
481 // // batch indexing []
482 // // total batch size 512
483 //
484 // // modulus x^512-1 pos 0:512 q 7681,10753 bound 512*(5629,5800)
485 // assertranges ...
486 //
487 // // ----- LAYER 1
488 //
489 // // butterfly(0,256,1,256,1,0)
490 // butterfly 0 256 1 256 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
491 //
492 // // ----- POSTCONDITIONS AFTER LAYER 1
493 // // transform size 512
494 // // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8]
495 // // transforms per batch 1
496 // // batch indexing []
497 // // total batch size 512
498 //
499 // // modulus x^256-1 pos 0:256 q 7681,10753 bound 256*(11258,11600)
500 // assertranges ...
501 //
502 // // modulus x^256+1 pos 256:512 q 7681,10753 bound 256*(11258,11600)
503 // assertranges ...
504 //
505 // // ----- LAYER 2
506 //
507 // // reduce_ifreverse(0,64,1)
508 // reduce_ifreverse 0 64 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
509 //
510 // // reduce_ifreverse(256,320,1)
511 // reduce_ifreverse 256 320 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
512 //
513 // // butterfly(0,128,1,128,1,0)
514 // butterfly 0 128 1 128 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
515 //
516 // // butterfly(256,384,1,128,4,1)
517 // butterfly 256 384 1 128 4 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
518 //
519 // // ----- POSTCONDITIONS AFTER LAYER 2
520 // // transform size 512
521 // // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8]
522 // // transforms per batch 1
523 // // batch indexing []
524 // // total batch size 512
525 //
526 // // modulus x^128-1 pos 0:128 q 7681,10753 bound 128*(22516,23200)
527 // assertranges ...
528 //
529 // // modulus x^128+1 pos 128:256 q 7681,10753 bound 128*(22516,23200)
530 // assertranges ...
531 //
532 // // modulus x^128-zeta4 pos 256:384 q 7681,10753 bound 128*(15747,17016)
533 // assertranges ...
534 //
535 // // modulus x^128+zeta4 pos 384:512 q 7681,10753 bound 128*(15747,17016)
536 // assertranges ...
537 //
538 // // ----- LAYER 3
539 //
540 // // reduce_ifforward(64,128,1)
541 // reduce_ifforward 64 128 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
542 //
543 // // butterfly(0,64,1,64,1,0)
544 // butterfly 0 64 1 64 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
545 //
546 // // butterfly(128,192,1,64,4,1)
547 // butterfly 128 192 1 64 4 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
548 //
549 // // butterfly(256,320,1,64,8,1)
550 // butterfly 256 320 1 64 8 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
551 //
552 // // butterfly(384,448,1,64,8,-1)
553 // butterfly 384 448 1 64 8 7 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
554 //
555 // // reduce(0,64,1)
556 // reduce 0 64 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
557 //
558 // // twist(64,128,1,128,1)
559 // twist 64 128 1 128 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
560 //
561 // // twist(128,192,1,256,1)
562 // twist 128 192 1 256 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
563 //
564 // // twist(192,256,1,256,-1)
565 // twist 192 256 1 256 255 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
566 //
567 // // twist(256,320,1,512,1)
568 // twist 256 320 1 512 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
569 //
570 // // twist(320,384,1,512,5)
571 // twist 320 384 1 512 5 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
572 //
573 // // twist(384,448,1,512,-1)
574 // twist 384 448 1 512 511 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
575 //
576 // // twist(448,512,1,512,-5)
577 // twist 448 512 1 512 507 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
578 //
579 // // physical_permute(3,6)
580 // physical_permute (3, 6) (0, 1, 2, 3, 4, 5, 6, 7, 8) () (0, 1, 2, 6, 4, 5, 3, 7, 8) ()
581 //
582 // // fold(256)
583 // physical_unmap (0, 1, 2, 6, 4, 5, 3, 7, 8) ()
584 // physical_map (0, 1, 2, 6, 4, 5, 3, 7) (8,)
585 //
586 // // fold(128)
587 // physical_unmap (0, 1, 2, 6, 4, 5, 3, 7) (8,)
588 // physical_map (0, 1, 2, 6, 4, 5, 3) (7, 8)
589 //
590 // // fold(64)
591 // physical_unmap (0, 1, 2, 6, 4, 5, 3) (7, 8)
592 // physical_map (0, 1, 2, 6, 4, 5) (3, 7, 8)
593 //
594 // // nextbatch()
595 // stopbatch 512
596 // startbatch 512
597 //
598 // // halfbatch()
599 // physical_unmap (0, 1, 2, 6, 4, 5) (3, 7, 8)
600 // stopbatch 512
601 // doublereps
602 // startbatch 256
603 // physical_map (0, 1, 2, 6, 4, 5) (3, 7)
604 //
605 // // halfbatch()
606 // physical_unmap (0, 1, 2, 6, 4, 5) (3, 7)
607 // stopbatch 256
608 // doublereps
609 // startbatch 128
610 // physical_map (0, 1, 2, 6, 4, 5) (3,)
611 //
612 // // ----- POSTCONDITIONS AFTER LAYER 3
613 // // transform size 64
614 // // transform indexing [0, 1, 2, 6, 4, 5]
615 // // transforms per batch 2
616 // // batch indexing [3]
617 // // total batch size 128
618 //
619 // // modulus x^64-1 pos 0:64 q 7681,10753 bound 1*(5629,5827) 1*(5629,7613) 1*(5629,7666) 1*(5629,7264) 1*(5629,7639) 1*(5629,7591) 1*(5629,7291) 1*(5629,7204) ...
620 // assertranges ...
621 //
622 // // ----- LAYER 4
623 //
624 // // butterfly(0,32,1,32,1,0)
625 // butterfly 0 32 1 32 1 0 (0, 1, 2, 6, 4, 5) (3,)
626 //
627 // // ----- POSTCONDITIONS AFTER LAYER 4
628 // // transform size 64
629 // // transform indexing [0, 1, 2, 6, 4, 5]
630 // // transforms per batch 2
631 // // batch indexing [3]
632 // // total batch size 128
633 //
634 // // modulus x^32-1 pos 0:32 q 7681,10753 bound 1*(11258,13035) 1*(11258,14721) 1*(11258,14855) 1*(11258,14877) 1*(11258,14690) 1*(11258,15282) 1*(11258,14641) 1*(11258,14272) ...
635 // assertranges ...
636 //
637 // // modulus x^32+1 pos 32:64 q 7681,10753 bound 1*(11258,13035) 1*(11258,14721) 1*(11258,14855) 1*(11258,14877) 1*(11258,14690) 1*(11258,15282) 1*(11258,14641) 1*(11258,14272) ...
638 // assertranges ...
639 //
640 // // ----- LAYER 5
641 //
642 // // butterfly(0,16,1,16,1,0)
643 // butterfly 0 16 1 16 1 0 (0, 1, 2, 6, 4, 5) (3,)
644 //
645 // // butterfly(32,48,1,16,4,1)
646 // butterfly 32 48 1 16 4 1 (0, 1, 2, 6, 4, 5) (3,)
647 //
648 // // reduce(0,16,1)
649 // reduce 0 16 1 (0, 1, 2, 6, 4, 5) (3,)
650 //
651 // // twist(16,32,1,32,1)
652 // twist 16 32 1 32 1 (0, 1, 2, 6, 4, 5) (3,)
653 //
654 // // twist(32,48,1,64,1)
655 // twist 32 48 1 64 1 (0, 1, 2, 6, 4, 5) (3,)
656 //
657 // // twist(48,64,1,64,-1)
658 // twist 48 64 1 64 63 (0, 1, 2, 6, 4, 5) (3,)
659 //
660 // // physical_permute(0,1,2,5)
661 // physical_permute (0, 1, 2, 5) (0, 1, 2, 6, 4, 5) (3,) (1, 2, 5, 6, 4, 0) (3,)
662 //
663 // // fold(32)
664 // physical_unmap (1, 2, 5, 6, 4, 0) (3,)
665 // physical_map (1, 2, 5, 6, 4) (0, 3)
666 //
667 // // fold(16)
668 // physical_unmap (1, 2, 5, 6, 4) (0, 3)
669 // physical_map (1, 2, 5, 6) (0, 3, 4)
670 //
671 // // ----- POSTCONDITIONS AFTER LAYER 5
672 // // transform size 16
673 // // transform indexing [1, 2, 5, 6]
674 // // transforms per batch 8
675 // // batch indexing [0, 3, 4]
676 // // total batch size 128
677 //
678 // // modulus x^16-1 pos 0:16 q 7681,10753 bound 1*(5629,5800) 1*(5629,6967) 1*(5629,6418) 1*(5629,7585) 1*(5629,7017) 1*(5629,6328) 1*(5629,7033) 1*(5629,6943) ...
679 // assertranges ...
680 //
681 // // ----- LAYER 6
682 //
683 // // butterfly(0,8,1,8,1,0)
684 // butterfly 0 8 1 8 1 0 (1, 2, 5, 6) (0, 3, 4)
685 //
686 // // physical_permute(1,2,4)
687 // physical_permute (1, 2, 4) (1, 2, 5, 6) (0, 3, 4) (2, 4, 5, 6) (0, 3, 1)
688 //
689 // // nextbatch()
690 // stopbatch 128
691 // startbatch 128
692 //
693 // // ----- POSTCONDITIONS AFTER LAYER 6
694 // // transform size 16
695 // // transform indexing [2, 4, 5, 6]
696 // // transforms per batch 8
697 // // batch indexing [0, 3, 1]
698 // // total batch size 128
699 //
700 // // modulus x^8-1 pos 0:8 q 7681,10753 bound 1*(11258,12447) 1*(11258,14071) 1*(11258,12488) 1*(11258,14310) 1*(11258,14287) 1*(11258,13674) 1*(11258,13574) 1*(11258,13555)
701 // assertranges ...
702 //
703 // // modulus x^8+1 pos 8:16 q 7681,10753 bound 1*(11258,12447) 1*(11258,14071) 1*(11258,12488) 1*(11258,14310) 1*(11258,14287) 1*(11258,13674) 1*(11258,13574) 1*(11258,13555)
704 // assertranges ...
705 //
706 // // ----- LAYER 7
707 //
708 // // butterfly(0,4,1,4,1,0)
709 // butterfly 0 4 1 4 1 0 (2, 4, 5, 6) (0, 3, 1)
710 //
711 // // butterfly(8,12,1,4,4,1)
712 // butterfly 8 12 1 4 4 1 (2, 4, 5, 6) (0, 3, 1)
713 //
714 // // reduce(0,4,1)
715 // reduce 0 4 1 (2, 4, 5, 6) (0, 3, 1)
716 //
717 // // twist(4,8,1,8,1)
718 // twist 4 8 1 8 1 (2, 4, 5, 6) (0, 3, 1)
719 //
720 // // twist(8,12,1,16,1)
721 // twist 8 12 1 16 1 (2, 4, 5, 6) (0, 3, 1)
722 //
723 // // twist(12,16,1,16,-1)
724 // twist 12 16 1 16 15 (2, 4, 5, 6) (0, 3, 1)
725 //
726 // // physical_permute(2,6)
727 // physical_permute (2, 6) (2, 4, 5, 6) (0, 3, 1) (6, 4, 5, 2) (0, 3, 1)
728 //
729 // // fold(8)
730 // physical_unmap (6, 4, 5, 2) (0, 3, 1)
731 // physical_map (6, 4, 5) (0, 1, 2, 3)
732 //
733 // // fold(4)
734 // physical_unmap (6, 4, 5) (0, 1, 2, 3)
735 // physical_map (6, 4) (0, 1, 2, 3, 5)
736 //
737 // // ----- POSTCONDITIONS AFTER LAYER 7
738 // // transform size 4
739 // // transform indexing [6, 4]
740 // // transforms per batch 32
741 // // batch indexing [0, 1, 2, 3, 5]
742 // // total batch size 128
743 //
744 // // modulus x^4-1 pos 0:4 q 7681,10753 bound 1*(5629,5800) 1*(5629,6938) 1*(5629,6521) 1*(5629,7157)
745 // assertranges ...
746 //
747 // // ----- LAYER 8
748 //
749 // // butterfly(0,2,1,2,1,0)
750 // butterfly 0 2 1 2 1 0 (6, 4) (0, 1, 2, 3, 5)
751 //
752 // // ----- POSTCONDITIONS AFTER LAYER 8
753 // // transform size 4
754 // // transform indexing [6, 4]
755 // // transforms per batch 32
756 // // batch indexing [0, 1, 2, 3, 5]
757 // // total batch size 128
758 //
759 // // modulus x^2-1 pos 0:2 q 7681,10753 bound 1*(11258,12321) 1*(11258,14095)
760 // assertranges ...
761 //
762 // // modulus x^2+1 pos 2:4 q 7681,10753 bound 1*(11258,12321) 1*(11258,14095)
763 // assertranges ...
764 //
765 // // ----- LAYER 9
766 //
767 // // butterfly(0,1,1,1,1,0)
768 // butterfly 0 1 1 1 1 0 (6, 4) (0, 1, 2, 3, 5)
769 //
770 // // butterfly(2,3,1,1,4,1)
771 // butterfly 2 3 1 1 4 1 (6, 4) (0, 1, 2, 3, 5)
772 //
773 // // ----- POSTCONDITIONS AFTER LAYER 9
774 // // transform size 4
775 // // transform indexing [6, 4]
776 // // transforms per batch 32
777 // // batch indexing [0, 1, 2, 3, 5]
778 // // total batch size 128
779 //
780 // // modulus x^1-1 pos 0:1 q 7681,10753 bound 1*(22516,26416)
781 // assertranges ...
782 //
783 // // modulus x^1+1 pos 1:2 q 7681,10753 bound 1*(22516,26416)
784 // assertranges ...
785 //
786 // // modulus x^1-zeta4 pos 2:3 q 7681,10753 bound 1*(15747,17745)
787 // assertranges ...
788 //
789 // // modulus x^1+zeta4 pos 3:4 q 7681,10753 bound 1*(15747,17745)
790 // assertranges ...
791 // stopbatch 128
792 // physical_unmap (6, 4) (0, 1, 2, 3, 5)
793 // stopntt 512
794 
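// Editorial note (not generated): a plausible scalar reading of the pass-1
// primitives above, with zeta_n a primitive n-th root of unity mod q:
//
//   butterfly p0 p1 stride len n k   appears to do, for i = 0..len-1:
//       t                = zeta_n^k * a[p1 + i*stride]
//       a[p1 + i*stride] = a[p0 + i*stride] - t
//       a[p0 + i*stride] = a[p0 + i*stride] + t
//
//   twist start end stride n k       appears to multiply a[start + j*stride]
//                                    by zeta_n^(k*j) for j = 0, 1, 2, ...
//
//   reduce / reduce_if*              apply the reduce_x16 range reduction
//                                    (conditionally for the forward or inverse
//                                    direction) so that the bounds in the
//                                    assertranges comments stay well below 2^15.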
795 // ----- codegen pass 2
796 //
797 // startntt 512
798 // startbatch 512
799 // vector_butterfly 0 256 1 0
800 // vector_butterfly 128 384 1 0
801 // vector_butterfly 64 320 1 0
802 // vector_butterfly 192 448 1 0
803 // vector_reduce_ifreverse 0
804 // vector_reduce_ifreverse 256
805 // vector_butterfly 0 128 1 0
806 // vector_butterfly 64 192 1 0
807 // vector_butterfly 256 384 4 1
808 // vector_butterfly 320 448 4 1
809 // vector_reduce_ifforward 64
810 // vector_butterfly 0 64 1 0
811 // vector_butterfly 128 192 4 1
812 // vector_butterfly 256 320 8 1
813 // vector_butterfly 384 448 8 7
814 // vector_reduce 0
815 // vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
816 // vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
817 // vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
818 // vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
819 // vector_twist 320 512 5 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
820 // vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
821 // vector_twist 448 512 507 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
822 // vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
823 // vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
824 // vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
825 // vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
826 // stopbatch 512
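// Editorial note: the batch above touches the eight vectors at positions
// 0, 64, ..., 448, i.e. coefficients 0..15 of each 64-coefficient stripe; the
// next three batches repeat the identical schedule at offsets 16, 32 and 48,
// together covering all 512 coefficients.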
827 // startbatch 512
828 // vector_butterfly 16 272 1 0
829 // vector_butterfly 144 400 1 0
830 // vector_butterfly 80 336 1 0
831 // vector_butterfly 208 464 1 0
832 // vector_reduce_ifreverse 16
833 // vector_reduce_ifreverse 272
834 // vector_butterfly 16 144 1 0
835 // vector_butterfly 80 208 1 0
836 // vector_butterfly 272 400 4 1
837 // vector_butterfly 336 464 4 1
838 // vector_reduce_ifforward 80
839 // vector_butterfly 16 80 1 0
840 // vector_butterfly 144 208 4 1
841 // vector_butterfly 272 336 8 1
842 // vector_butterfly 400 464 8 7
843 // vector_reduce 16
844 // vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
845 // vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
846 // vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
847 // vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
848 // vector_twist 336 512 5 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
849 // vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
850 // vector_twist 464 512 507 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
851 // vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
852 // vector_permute 144 208 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
853 // vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
854 // vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
855 // stopbatch 512
856 // startbatch 512
857 // vector_butterfly 32 288 1 0
858 // vector_butterfly 160 416 1 0
859 // vector_butterfly 96 352 1 0
860 // vector_butterfly 224 480 1 0
861 // vector_reduce_ifreverse 32
862 // vector_reduce_ifreverse 288
863 // vector_butterfly 32 160 1 0
864 // vector_butterfly 96 224 1 0
865 // vector_butterfly 288 416 4 1
866 // vector_butterfly 352 480 4 1
867 // vector_reduce_ifforward 96
868 // vector_butterfly 32 96 1 0
869 // vector_butterfly 160 224 4 1
870 // vector_butterfly 288 352 8 1
871 // vector_butterfly 416 480 8 7
872 // vector_reduce 32
873 // vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
874 // vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
875 // vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
876 // vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
877 // vector_twist 352 512 5 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
878 // vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
879 // vector_twist 480 512 507 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
880 // vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
881 // vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
882 // vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
883 // vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
884 // stopbatch 512
885 // startbatch 512
886 // vector_butterfly 48 304 1 0
887 // vector_butterfly 176 432 1 0
888 // vector_butterfly 112 368 1 0
889 // vector_butterfly 240 496 1 0
890 // vector_reduce_ifreverse 48
891 // vector_reduce_ifreverse 304
892 // vector_butterfly 48 176 1 0
893 // vector_butterfly 112 240 1 0
894 // vector_butterfly 304 432 4 1
895 // vector_butterfly 368 496 4 1
896 // vector_reduce_ifforward 112
897 // vector_butterfly 48 112 1 0
898 // vector_butterfly 176 240 4 1
899 // vector_butterfly 304 368 8 1
900 // vector_butterfly 432 496 8 7
901 // vector_reduce 48
902 // vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
903 // vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
904 // vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
905 // vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
906 // vector_twist 368 512 5 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
907 // vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
908 // vector_twist 496 512 507 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
909 // vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
910 // vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
911 // vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
912 // vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
913 // stopbatch 512
914 // doublereps
915 // doublereps
916 // startbatch 128
917 // vector_butterfly 0 32 1 0
918 // vector_butterfly 64 96 1 0
919 // vector_butterfly 16 48 1 0
920 // vector_butterfly 80 112 1 0
921 // vector_butterfly 0 16 1 0
922 // vector_butterfly 64 80 1 0
923 // vector_butterfly 32 48 4 1
924 // vector_butterfly 96 112 4 1
925 // vector_reduce 0
926 // vector_reduce 64
927 // vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
928 // vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
929 // vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
930 // vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
931 // vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
932 // vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
933 // vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
934 // vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
935 // vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
936 // vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
937 // vector_butterfly 0 64 1 0
938 // vector_butterfly 32 96 1 0
939 // vector_butterfly 16 80 1 0
940 // vector_butterfly 48 112 1 0
941 // vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
942 // vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
943 // vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
944 // vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
945 // stopbatch 128
946 // startbatch 128
947 // vector_butterfly 0 32 1 0
948 // vector_butterfly 16 48 1 0
949 // vector_butterfly 64 96 4 1
950 // vector_butterfly 80 112 4 1
951 // vector_reduce 0
952 // vector_reduce 16
953 // vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
954 // vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
955 // vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
956 // vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
957 // vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
958 // vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
959 // vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
960 // vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
961 // vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
962 // vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
963 // vector_butterfly 0 16 1 0
964 // vector_butterfly 64 80 1 0
965 // vector_butterfly 32 48 1 0
966 // vector_butterfly 96 112 1 0
967 // vector_butterfly 0 64 1 0
968 // vector_butterfly 32 96 1 0
969 // vector_butterfly 16 80 4 1
970 // vector_butterfly 48 112 4 1
971 // stopbatch 128
972 // stopntt 512
973 // startntt 512
974 
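// Notes on reading the generated kernels below (editorial, not generator output):
//
//   - "vector_butterfly a b n k" is a Cooley-Tukey butterfly on the 16-lane
//     vectors at int16 offsets a and b: the vector at b is first multiplied by
//     zeta_n^k (n = 1, k = 0 means no multiplication), and the pair is then
//     replaced by its sum and difference.
//   - "vector_twist a n k j0 ... j15" multiplies the vector at offset a
//     lane-wise by precomputed powers zeta_n^(k*j) for the listed lane
//     indices j, in the scaled representation stored in qdata.
//   - "vector_reduce a" applies reduce_x16; reductions appear only where the
//     tracked coefficient ranges require them, not after every butterfly.
//   - mulmod_scaled_x16 multiplies lane-wise by a precomputed constant modulo
//     q, using the paired qinv* constant to avoid divisions (a Montgomery-style
//     trick on 16-bit lanes).
//
// A minimal scalar sketch of one butterfly, assuming the reading above; this
// hypothetical helper is illustrative only, is not used by the vectorized
// kernels, and unlike them it reduces after every step instead of tracking
// ranges.
static inline void butterfly_sketch(int32 *x, int32 *y, int32 zeta_nk, int32 q) {
    int32 t = ((*y) * zeta_nk) % q; /* twiddle zeta_n^k applied to the second input */
    int32 u = *x;
    *x = (u + t) % q;               /* representatives mod q; may be negative */
    *y = (u - t) % q;
}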
975 static void ntt512(int16 *f, int reps, const int16 *qdata) {
976     // startbatch 512
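    // The loop below handles the 16-lane vectors at int16 offsets 0, 64, ..., 448;
    // the next three loops repeat the same butterfly/twist/permute schedule for
    // the offsets shifted by 16, 32 and 48.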
977     for (long long r = 0; r < reps; ++r) {
978         // vector_butterfly 0 256 1 0
979         int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0));
980         int16x16 a16 = _mm256_loadu_si256((int16x16 *) (f + 256));
981         int16x16 b0 = add_x16(a0, a16);
982         int16x16 b16 = sub_x16(a0, a16);
983         // vector_butterfly 128 384 1 0
984         int16x16 a8 = _mm256_loadu_si256((int16x16 *) (f + 128));
985         int16x16 a24 = _mm256_loadu_si256((int16x16 *) (f + 384));
986         int16x16 b8 = add_x16(a8, a24);
987         int16x16 b24 = sub_x16(a8, a24);
988         // vector_butterfly 64 320 1 0
989         int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64));
990         int16x16 a20 = _mm256_loadu_si256((int16x16 *) (f + 320));
991         int16x16 b4 = add_x16(a4, a20);
992         int16x16 b20 = sub_x16(a4, a20);
993         // vector_butterfly 192 448 1 0
994         int16x16 a12 = _mm256_loadu_si256((int16x16 *) (f + 192));
995         int16x16 a28 = _mm256_loadu_si256((int16x16 *) (f + 448));
996         int16x16 b12 = add_x16(a12, a28);
997         int16x16 b28 = sub_x16(a12, a28);
998         // vector_reduce_ifreverse 0
999         // vector_reduce_ifreverse 256
1000         // vector_butterfly 0 128 1 0
1001         int16x16 c0 = add_x16(b0, b8);
1002         int16x16 c8 = sub_x16(b0, b8);
1003         // vector_butterfly 64 192 1 0
1004         int16x16 c4 = add_x16(b4, b12);
1005         int16x16 c12 = sub_x16(b4, b12);
1006         // vector_butterfly 256 384 4 1
1007         b24 = mulmod_scaled_x16(b24, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1008         int16x16 c16 = add_x16(b16, b24);
1009         int16x16 c24 = sub_x16(b16, b24);
1010         // vector_butterfly 320 448 4 1
1011         b28 = mulmod_scaled_x16(b28, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1012         int16x16 c20 = add_x16(b20, b28);
1013         int16x16 c28 = sub_x16(b20, b28);
1014         // vector_reduce_ifforward 64
1015         c4 = reduce_x16(c4, qdata);
1016         // vector_butterfly 0 64 1 0
1017         int16x16 d0 = add_x16(c0, c4);
1018         int16x16 d4 = sub_x16(c0, c4);
1019         // vector_butterfly 128 192 4 1
1020         c12 = mulmod_scaled_x16(c12, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1021         int16x16 d8 = add_x16(c8, c12);
1022         int16x16 d12 = sub_x16(c8, c12);
1023         // vector_butterfly 256 320 8 1
1024         c20 = mulmod_scaled_x16(c20, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
1025         int16x16 d16 = add_x16(c16, c20);
1026         int16x16 d20 = sub_x16(c16, c20);
1027         // vector_butterfly 384 448 8 7
1028         c28 = mulmod_scaled_x16(c28, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
1029         int16x16 d24 = add_x16(c24, c28);
1030         int16x16 d28 = sub_x16(c24, c28);
1031         // vector_reduce 0
1032         d0 = reduce_x16(d0, qdata);
1033         // vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1034         d4 = mulmod_scaled_x16(d4, precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
1035         // vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1036         d8 = mulmod_scaled_x16(d8, precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
1037         // vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1038         d12 = mulmod_scaled_x16(d12, precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
1039         // vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1040         d16 = mulmod_scaled_x16(d16, precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
1041         // vector_twist 320 512 5 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1042         d20 = mulmod_scaled_x16(d20, precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
1043         // vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1044         d24 = mulmod_scaled_x16(d24, precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
1045         // vector_twist 448 512 507 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1046         d28 = mulmod_scaled_x16(d28, precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
1047         // vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1048         int16x16 e0 = _mm256_permute2x128_si256_lo(d0, d4);
1049         int16x16 e4 = _mm256_permute2x128_si256_hi(d0, d4);
1050         // vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1051         int16x16 e8 = _mm256_permute2x128_si256_lo(d8, d12);
1052         int16x16 e12 = _mm256_permute2x128_si256_hi(d8, d12);
1053         // vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1054         int16x16 e16 = _mm256_permute2x128_si256_lo(d16, d20);
1055         int16x16 e20 = _mm256_permute2x128_si256_hi(d16, d20);
1056         // vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1057         int16x16 e24 = _mm256_permute2x128_si256_lo(d24, d28);
1058         int16x16 e28 = _mm256_permute2x128_si256_hi(d24, d28);
1059         // stopbatch 512
1060         _mm256_storeu_si256((int16x16 *) (f + 0), e0);
1061         _mm256_storeu_si256((int16x16 *) (f + 64), e4);
1062         _mm256_storeu_si256((int16x16 *) (f + 128), e8);
1063         _mm256_storeu_si256((int16x16 *) (f + 192), e12);
1064         _mm256_storeu_si256((int16x16 *) (f + 256), e16);
1065         _mm256_storeu_si256((int16x16 *) (f + 320), e20);
1066         _mm256_storeu_si256((int16x16 *) (f + 384), e24);
1067         _mm256_storeu_si256((int16x16 *) (f + 448), e28);
1068         f += 512;
1069     }
1070     f -= 512 * reps;
1071     // startbatch 512
1072     for (long long r = 0; r < reps; ++r) {
1073         // vector_butterfly 16 272 1 0
1074         int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16));
1075         int16x16 a17 = _mm256_loadu_si256((int16x16 *) (f + 272));
1076         int16x16 b1 = add_x16(a1, a17);
1077         int16x16 b17 = sub_x16(a1, a17);
1078         // vector_butterfly 144 400 1 0
1079         int16x16 a9 = _mm256_loadu_si256((int16x16 *) (f + 144));
1080         int16x16 a25 = _mm256_loadu_si256((int16x16 *) (f + 400));
1081         int16x16 b9 = add_x16(a9, a25);
1082         int16x16 b25 = sub_x16(a9, a25);
1083         // vector_butterfly 80 336 1 0
1084         int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80));
1085         int16x16 a21 = _mm256_loadu_si256((int16x16 *) (f + 336));
1086         int16x16 b5 = add_x16(a5, a21);
1087         int16x16 b21 = sub_x16(a5, a21);
1088         // vector_butterfly 208 464 1 0
1089         int16x16 a13 = _mm256_loadu_si256((int16x16 *) (f + 208));
1090         int16x16 a29 = _mm256_loadu_si256((int16x16 *) (f + 464));
1091         int16x16 b13 = add_x16(a13, a29);
1092         int16x16 b29 = sub_x16(a13, a29);
1093         // vector_reduce_ifreverse 16
1094         // vector_reduce_ifreverse 272
1095         // vector_butterfly 16 144 1 0
1096         int16x16 c1 = add_x16(b1, b9);
1097         int16x16 c9 = sub_x16(b1, b9);
1098         // vector_butterfly 80 208 1 0
1099         int16x16 c5 = add_x16(b5, b13);
1100         int16x16 c13 = sub_x16(b5, b13);
1101         // vector_butterfly 272 400 4 1
1102         b25 = mulmod_scaled_x16(b25, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1103         int16x16 c17 = add_x16(b17, b25);
1104         int16x16 c25 = sub_x16(b17, b25);
1105         // vector_butterfly 336 464 4 1
1106         b29 = mulmod_scaled_x16(b29, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1107         int16x16 c21 = add_x16(b21, b29);
1108         int16x16 c29 = sub_x16(b21, b29);
1109         // vector_reduce_ifforward 80
1110         c5 = reduce_x16(c5, qdata);
1111         // vector_butterfly 16 80 1 0
1112         int16x16 d1 = add_x16(c1, c5);
1113         int16x16 d5 = sub_x16(c1, c5);
1114         // vector_butterfly 144 208 4 1
1115         c13 = mulmod_scaled_x16(c13, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1116         int16x16 d9 = add_x16(c9, c13);
1117         int16x16 d13 = sub_x16(c9, c13);
1118         // vector_butterfly 272 336 8 1
1119         c21 = mulmod_scaled_x16(c21, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
1120         int16x16 d17 = add_x16(c17, c21);
1121         int16x16 d21 = sub_x16(c17, c21);
1122         // vector_butterfly 400 464 8 7
1123         c29 = mulmod_scaled_x16(c29, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
1124         int16x16 d25 = add_x16(c25, c29);
1125         int16x16 d29 = sub_x16(c25, c29);
1126         // vector_reduce 16
1127         d1 = reduce_x16(d1, qdata);
1128         // vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1129         d5 = mulmod_scaled_x16(d5, precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
1130         // vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1131         d9 = mulmod_scaled_x16(d9, precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
1132         // vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1133         d13 = mulmod_scaled_x16(d13, precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
1134         // vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1135         d17 = mulmod_scaled_x16(d17, precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
1136         // vector_twist 336 512 5 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1137         d21 = mulmod_scaled_x16(d21, precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
1138         // vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1139         d25 = mulmod_scaled_x16(d25, precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
1140         // vector_twist 464 512 507 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1141         d29 = mulmod_scaled_x16(d29, precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
1142         // vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1143         int16x16 e1 = _mm256_permute2x128_si256_lo(d1, d5);
1144         int16x16 e5 = _mm256_permute2x128_si256_hi(d1, d5);
1145         // vector_permute 144 208 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1146         int16x16 e9 = _mm256_permute2x128_si256_lo(d9, d13);
1147         int16x16 e13 = _mm256_permute2x128_si256_hi(d9, d13);
1148         // vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1149         int16x16 e17 = _mm256_permute2x128_si256_lo(d17, d21);
1150         int16x16 e21 = _mm256_permute2x128_si256_hi(d17, d21);
1151         // vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1152         int16x16 e25 = _mm256_permute2x128_si256_lo(d25, d29);
1153         int16x16 e29 = _mm256_permute2x128_si256_hi(d25, d29);
1154         // stopbatch 512
1155         _mm256_storeu_si256((int16x16 *) (f + 16), e1);
1156         _mm256_storeu_si256((int16x16 *) (f + 80), e5);
1157         _mm256_storeu_si256((int16x16 *) (f + 144), e9);
1158         _mm256_storeu_si256((int16x16 *) (f + 208), e13);
1159         _mm256_storeu_si256((int16x16 *) (f + 272), e17);
1160         _mm256_storeu_si256((int16x16 *) (f + 336), e21);
1161         _mm256_storeu_si256((int16x16 *) (f + 400), e25);
1162         _mm256_storeu_si256((int16x16 *) (f + 464), e29);
1163         f += 512;
1164     }
1165     f -= 512 * reps;
1166     // startbatch 512
1167     for (long long r = 0; r < reps; ++r) {
1168         // vector_butterfly 32 288 1 0
1169         int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32));
1170         int16x16 a18 = _mm256_loadu_si256((int16x16 *) (f + 288));
1171         int16x16 b2 = add_x16(a2, a18);
1172         int16x16 b18 = sub_x16(a2, a18);
1173         // vector_butterfly 160 416 1 0
1174         int16x16 a10 = _mm256_loadu_si256((int16x16 *) (f + 160));
1175         int16x16 a26 = _mm256_loadu_si256((int16x16 *) (f + 416));
1176         int16x16 b10 = add_x16(a10, a26);
1177         int16x16 b26 = sub_x16(a10, a26);
1178         // vector_butterfly 96 352 1 0
1179         int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96));
1180         int16x16 a22 = _mm256_loadu_si256((int16x16 *) (f + 352));
1181         int16x16 b6 = add_x16(a6, a22);
1182         int16x16 b22 = sub_x16(a6, a22);
1183         // vector_butterfly 224 480 1 0
1184         int16x16 a14 = _mm256_loadu_si256((int16x16 *) (f + 224));
1185         int16x16 a30 = _mm256_loadu_si256((int16x16 *) (f + 480));
1186         int16x16 b14 = add_x16(a14, a30);
1187         int16x16 b30 = sub_x16(a14, a30);
1188         // vector_reduce_ifreverse 32
1189         // vector_reduce_ifreverse 288
1190         // vector_butterfly 32 160 1 0
1191         int16x16 c2 = add_x16(b2, b10);
1192         int16x16 c10 = sub_x16(b2, b10);
1193         // vector_butterfly 96 224 1 0
1194         int16x16 c6 = add_x16(b6, b14);
1195         int16x16 c14 = sub_x16(b6, b14);
1196         // vector_butterfly 288 416 4 1
1197         b26 = mulmod_scaled_x16(b26, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1198         int16x16 c18 = add_x16(b18, b26);
1199         int16x16 c26 = sub_x16(b18, b26);
1200         // vector_butterfly 352 480 4 1
1201         b30 = mulmod_scaled_x16(b30, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1202         int16x16 c22 = add_x16(b22, b30);
1203         int16x16 c30 = sub_x16(b22, b30);
1204         // vector_reduce_ifforward 96
1205         c6 = reduce_x16(c6, qdata);
1206         // vector_butterfly 32 96 1 0
1207         int16x16 d2 = add_x16(c2, c6);
1208         int16x16 d6 = sub_x16(c2, c6);
1209         // vector_butterfly 160 224 4 1
1210         c14 = mulmod_scaled_x16(c14, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1211         int16x16 d10 = add_x16(c10, c14);
1212         int16x16 d14 = sub_x16(c10, c14);
1213         // vector_butterfly 288 352 8 1
1214         c22 = mulmod_scaled_x16(c22, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
1215         int16x16 d18 = add_x16(c18, c22);
1216         int16x16 d22 = sub_x16(c18, c22);
1217         // vector_butterfly 416 480 8 7
1218         c30 = mulmod_scaled_x16(c30, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
1219         int16x16 d26 = add_x16(c26, c30);
1220         int16x16 d30 = sub_x16(c26, c30);
1221         // vector_reduce 32
1222         d2 = reduce_x16(d2, qdata);
1223         // vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1224         d6 = mulmod_scaled_x16(d6, precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1225         // vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1226         d10 = mulmod_scaled_x16(d10, precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1227         // vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1228         d14 = mulmod_scaled_x16(d14, precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1229         // vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1230         d18 = mulmod_scaled_x16(d18, precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1231         // vector_twist 352 512 5 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1232         d22 = mulmod_scaled_x16(d22, precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1233         // vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1234         d26 = mulmod_scaled_x16(d26, precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1235         // vector_twist 480 512 507 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
1236         d30 = mulmod_scaled_x16(d30, precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
1237         // vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1238         int16x16 e2 = _mm256_permute2x128_si256_lo(d2, d6);
1239         int16x16 e6 = _mm256_permute2x128_si256_hi(d2, d6);
1240         // vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1241         int16x16 e10 = _mm256_permute2x128_si256_lo(d10, d14);
1242         int16x16 e14 = _mm256_permute2x128_si256_hi(d10, d14);
1243         // vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1244         int16x16 e18 = _mm256_permute2x128_si256_lo(d18, d22);
1245         int16x16 e22 = _mm256_permute2x128_si256_hi(d18, d22);
1246         // vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1247         int16x16 e26 = _mm256_permute2x128_si256_lo(d26, d30);
1248         int16x16 e30 = _mm256_permute2x128_si256_hi(d26, d30);
1249         // stopbatch 512
1250         _mm256_storeu_si256((int16x16 *) (f + 32), e2);
1251         _mm256_storeu_si256((int16x16 *) (f + 96), e6);
1252         _mm256_storeu_si256((int16x16 *) (f + 160), e10);
1253         _mm256_storeu_si256((int16x16 *) (f + 224), e14);
1254         _mm256_storeu_si256((int16x16 *) (f + 288), e18);
1255         _mm256_storeu_si256((int16x16 *) (f + 352), e22);
1256         _mm256_storeu_si256((int16x16 *) (f + 416), e26);
1257         _mm256_storeu_si256((int16x16 *) (f + 480), e30);
1258         f += 512;
1259     }
1260     f -= 512 * reps;
1261     // startbatch 512
1262     for (long long r = 0; r < reps; ++r) {
1263         // vector_butterfly 48 304 1 0
1264         int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48));
1265         int16x16 a19 = _mm256_loadu_si256((int16x16 *) (f + 304));
1266         int16x16 b3 = add_x16(a3, a19);
1267         int16x16 b19 = sub_x16(a3, a19);
1268         // vector_butterfly 176 432 1 0
1269         int16x16 a11 = _mm256_loadu_si256((int16x16 *) (f + 176));
1270         int16x16 a27 = _mm256_loadu_si256((int16x16 *) (f + 432));
1271         int16x16 b11 = add_x16(a11, a27);
1272         int16x16 b27 = sub_x16(a11, a27);
1273         // vector_butterfly 112 368 1 0
1274         int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112));
1275         int16x16 a23 = _mm256_loadu_si256((int16x16 *) (f + 368));
1276         int16x16 b7 = add_x16(a7, a23);
1277         int16x16 b23 = sub_x16(a7, a23);
1278         // vector_butterfly 240 496 1 0
1279         int16x16 a15 = _mm256_loadu_si256((int16x16 *) (f + 240));
1280         int16x16 a31 = _mm256_loadu_si256((int16x16 *) (f + 496));
1281         int16x16 b15 = add_x16(a15, a31);
1282         int16x16 b31 = sub_x16(a15, a31);
1283         // vector_reduce_ifreverse 48
1284         // vector_reduce_ifreverse 304
1285         // vector_butterfly 48 176 1 0
1286         int16x16 c3 = add_x16(b3, b11);
1287         int16x16 c11 = sub_x16(b3, b11);
1288         // vector_butterfly 112 240 1 0
1289         int16x16 c7 = add_x16(b7, b15);
1290         int16x16 c15 = sub_x16(b7, b15);
1291         // vector_butterfly 304 432 4 1
1292         b27 = mulmod_scaled_x16(b27, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1293         int16x16 c19 = add_x16(b19, b27);
1294         int16x16 c27 = sub_x16(b19, b27);
1295         // vector_butterfly 368 496 4 1
1296         b31 = mulmod_scaled_x16(b31, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1297         int16x16 c23 = add_x16(b23, b31);
1298         int16x16 c31 = sub_x16(b23, b31);
1299         // vector_reduce_ifforward 112
1300         c7 = reduce_x16(c7, qdata);
1301         // vector_butterfly 48 112 1 0
1302         int16x16 d3 = add_x16(c3, c7);
1303         int16x16 d7 = sub_x16(c3, c7);
1304         // vector_butterfly 176 240 4 1
1305         c15 = mulmod_scaled_x16(c15, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1306         int16x16 d11 = add_x16(c11, c15);
1307         int16x16 d15 = sub_x16(c11, c15);
1308         // vector_butterfly 304 368 8 1
1309         c23 = mulmod_scaled_x16(c23, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
1310         int16x16 d19 = add_x16(c19, c23);
1311         int16x16 d23 = sub_x16(c19, c23);
1312         // vector_butterfly 432 496 8 7
1313         c31 = mulmod_scaled_x16(c31, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
1314         int16x16 d27 = add_x16(c27, c31);
1315         int16x16 d31 = sub_x16(c27, c31);
1316         // vector_reduce 48
1317         d3 = reduce_x16(d3, qdata);
1318         // vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1319         d7 = mulmod_scaled_x16(d7, precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1320         // vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1321         d11 = mulmod_scaled_x16(d11, precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1322         // vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1323         d15 = mulmod_scaled_x16(d15, precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1324         // vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1325         d19 = mulmod_scaled_x16(d19, precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1326         // vector_twist 368 512 5 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1327         d23 = mulmod_scaled_x16(d23, precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1328         // vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1329         d27 = mulmod_scaled_x16(d27, precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1330         // vector_twist 496 512 507 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
1331         d31 = mulmod_scaled_x16(d31, precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
1332         // vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1333         int16x16 e3 = _mm256_permute2x128_si256_lo(d3, d7);
1334         int16x16 e7 = _mm256_permute2x128_si256_hi(d3, d7);
1335         // vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1336         int16x16 e11 = _mm256_permute2x128_si256_lo(d11, d15);
1337         int16x16 e15 = _mm256_permute2x128_si256_hi(d11, d15);
1338         // vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1339         int16x16 e19 = _mm256_permute2x128_si256_lo(d19, d23);
1340         int16x16 e23 = _mm256_permute2x128_si256_hi(d19, d23);
1341         // vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
1342         int16x16 e27 = _mm256_permute2x128_si256_lo(d27, d31);
1343         int16x16 e31 = _mm256_permute2x128_si256_hi(d27, d31);
1344         // stopbatch 512
1345         _mm256_storeu_si256((int16x16 *) (f + 48), e3);
1346         _mm256_storeu_si256((int16x16 *) (f + 112), e7);
1347         _mm256_storeu_si256((int16x16 *) (f + 176), e11);
1348         _mm256_storeu_si256((int16x16 *) (f + 240), e15);
1349         _mm256_storeu_si256((int16x16 *) (f + 304), e19);
1350         _mm256_storeu_si256((int16x16 *) (f + 368), e23);
1351         _mm256_storeu_si256((int16x16 *) (f + 432), e27);
1352         _mm256_storeu_si256((int16x16 *) (f + 496), e31);
1353         f += 512;
1354     }
1355     f -= 512 * reps;
1356     // doublereps
1357     reps *= 2;
1358     // doublereps
1359     reps *= 2;
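    // After the two doublereps steps above, each batch covers 128 rather than
    // 512 coefficients, so the remaining loops run four times as often over the
    // same total range, and the epi16/epi32/epi64 unpacks below interleave lanes
    // so that coefficients of the same small transform share a vector.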
1360     // startbatch 128
1361     for (long long r = 0; r < reps; ++r) {
1362         // vector_butterfly 0 32 1 0
1363         int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0));
1364         int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32));
1365         int16x16 b0 = add_x16(a0, a2);
1366         int16x16 b2 = sub_x16(a0, a2);
1367         // vector_butterfly 64 96 1 0
1368         int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64));
1369         int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96));
1370         int16x16 b4 = add_x16(a4, a6);
1371         int16x16 b6 = sub_x16(a4, a6);
1372         // vector_butterfly 16 48 1 0
1373         int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16));
1374         int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48));
1375         int16x16 b1 = add_x16(a1, a3);
1376         int16x16 b3 = sub_x16(a1, a3);
1377         // vector_butterfly 80 112 1 0
1378         int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80));
1379         int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112));
1380         int16x16 b5 = add_x16(a5, a7);
1381         int16x16 b7 = sub_x16(a5, a7);
1382         // vector_butterfly 0 16 1 0
1383         int16x16 c0 = add_x16(b0, b1);
1384         int16x16 c1 = sub_x16(b0, b1);
1385         // vector_butterfly 64 80 1 0
1386         int16x16 c4 = add_x16(b4, b5);
1387         int16x16 c5 = sub_x16(b4, b5);
1388         // vector_butterfly 32 48 4 1
1389         b3 = mulmod_scaled_x16(b3, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1390         int16x16 c2 = add_x16(b2, b3);
1391         int16x16 c3 = sub_x16(b2, b3);
1392         // vector_butterfly 96 112 4 1
1393         b7 = mulmod_scaled_x16(b7, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1394         int16x16 c6 = add_x16(b6, b7);
1395         int16x16 c7 = sub_x16(b6, b7);
1396         // vector_reduce 0
1397         c0 = reduce_x16(c0, qdata);
1398         // vector_reduce 64
1399         c4 = reduce_x16(c4, qdata);
1400         // vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
1401         c1 = mulmod_scaled_x16(c1, precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata);
1402         // vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
1403         c5 = mulmod_scaled_x16(c5, precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata);
1404         // vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
1405         c2 = mulmod_scaled_x16(c2, precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata);
1406         // vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
1407         c6 = mulmod_scaled_x16(c6, precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata);
1408         // vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
1409         c3 = mulmod_scaled_x16(c3, precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata);
1410         // vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
1411         c7 = mulmod_scaled_x16(c7, precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata);
1412         // vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
1413         int16x16 d0 = _mm256_unpacklo_epi16(c0, c2);
1414         int16x16 d2 = _mm256_unpackhi_epi16(c0, c2);
1415         // vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
1416         int16x16 d1 = _mm256_unpacklo_epi16(c1, c3);
1417         int16x16 d3 = _mm256_unpackhi_epi16(c1, c3);
1418         // vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
1419         int16x16 d4 = _mm256_unpacklo_epi16(c4, c6);
1420         int16x16 d6 = _mm256_unpackhi_epi16(c4, c6);
1421         // vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
1422         int16x16 d5 = _mm256_unpacklo_epi16(c5, c7);
1423         int16x16 d7 = _mm256_unpackhi_epi16(c5, c7);
1424         // vector_butterfly 0 64 1 0
1425         int16x16 e0 = add_x16(d0, d4);
1426         int16x16 e4 = sub_x16(d0, d4);
1427         // vector_butterfly 32 96 1 0
1428         int16x16 e2 = add_x16(d2, d6);
1429         int16x16 e6 = sub_x16(d2, d6);
1430         // vector_butterfly 16 80 1 0
1431         int16x16 e1 = add_x16(d1, d5);
1432         int16x16 e5 = sub_x16(d1, d5);
1433         // vector_butterfly 48 112 1 0
1434         int16x16 e3 = add_x16(d3, d7);
1435         int16x16 e7 = sub_x16(d3, d7);
1436         // vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
1437         int16x16 f0 = _mm256_unpacklo_epi32(e0, e1);
1438         int16x16 f1 = _mm256_unpackhi_epi32(e0, e1);
1439         // vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
1440         int16x16 f2 = _mm256_unpacklo_epi32(e2, e3);
1441         int16x16 f3 = _mm256_unpackhi_epi32(e2, e3);
1442         // vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
1443         int16x16 f4 = _mm256_unpacklo_epi32(e4, e5);
1444         int16x16 f5 = _mm256_unpackhi_epi32(e4, e5);
1445         // vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
1446         int16x16 f6 = _mm256_unpacklo_epi32(e6, e7);
1447         int16x16 f7 = _mm256_unpackhi_epi32(e6, e7);
1448         // stopbatch 128
1449         _mm256_storeu_si256((int16x16 *) (f + 0), f0);
1450         _mm256_storeu_si256((int16x16 *) (f + 16), f1);
1451         _mm256_storeu_si256((int16x16 *) (f + 32), f2);
1452         _mm256_storeu_si256((int16x16 *) (f + 48), f3);
1453         _mm256_storeu_si256((int16x16 *) (f + 64), f4);
1454         _mm256_storeu_si256((int16x16 *) (f + 80), f5);
1455         _mm256_storeu_si256((int16x16 *) (f + 96), f6);
1456         _mm256_storeu_si256((int16x16 *) (f + 112), f7);
1457         f += 128;
1458     }
1459     f -= 128 * reps;
1460     // startbatch 128
1461     for (long long r = 0; r < reps; ++r) {
1462         // vector_butterfly 0 32 1 0
1463         int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0));
1464         int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32));
1465         int16x16 b0 = add_x16(a0, a2);
1466         int16x16 b2 = sub_x16(a0, a2);
1467         // vector_butterfly 16 48 1 0
1468         int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16));
1469         int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48));
1470         int16x16 b1 = add_x16(a1, a3);
1471         int16x16 b3 = sub_x16(a1, a3);
1472         // vector_butterfly 64 96 4 1
1473         int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64));
1474         int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96));
1475         a6 = mulmod_scaled_x16(a6, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1476         int16x16 b4 = add_x16(a4, a6);
1477         int16x16 b6 = sub_x16(a4, a6);
1478         // vector_butterfly 80 112 4 1
1479         int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80));
1480         int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112));
1481         a7 = mulmod_scaled_x16(a7, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1482         int16x16 b5 = add_x16(a5, a7);
1483         int16x16 b7 = sub_x16(a5, a7);
1484         // vector_reduce 0
1485         b0 = reduce_x16(b0, qdata);
1486         // vector_reduce 16
1487         b1 = reduce_x16(b1, qdata);
1488         // vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
1489         b2 = mulmod_scaled_x16(b2, precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata);
1490         // vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
1491         b3 = mulmod_scaled_x16(b3, precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata);
1492         // vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
1493         b4 = mulmod_scaled_x16(b4, precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata);
1494         // vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
1495         b5 = mulmod_scaled_x16(b5, precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata);
1496         // vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
1497         b6 = mulmod_scaled_x16(b6, precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata);
1498         // vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
1499         b7 = mulmod_scaled_x16(b7, precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata);
1500         // vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
1501         int16x16 c0 = _mm256_unpacklo_epi64(b0, b4);
1502         int16x16 c4 = _mm256_unpackhi_epi64(b0, b4);
1503         // vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
1504         int16x16 c1 = _mm256_unpacklo_epi64(b1, b5);
1505         int16x16 c5 = _mm256_unpackhi_epi64(b1, b5);
1506         // vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
1507         int16x16 c2 = _mm256_unpacklo_epi64(b2, b6);
1508         int16x16 c6 = _mm256_unpackhi_epi64(b2, b6);
1509         // vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
1510         int16x16 c3 = _mm256_unpacklo_epi64(b3, b7);
1511         int16x16 c7 = _mm256_unpackhi_epi64(b3, b7);
1512         // vector_butterfly 0 16 1 0
1513         int16x16 d0 = add_x16(c0, c1);
1514         int16x16 d1 = sub_x16(c0, c1);
1515         // vector_butterfly 64 80 1 0
1516         int16x16 d4 = add_x16(c4, c5);
1517         int16x16 d5 = sub_x16(c4, c5);
1518         // vector_butterfly 32 48 1 0
1519         int16x16 d2 = add_x16(c2, c3);
1520         int16x16 d3 = sub_x16(c2, c3);
1521         // vector_butterfly 96 112 1 0
1522         int16x16 d6 = add_x16(c6, c7);
1523         int16x16 d7 = sub_x16(c6, c7);
1524         // vector_butterfly 0 64 1 0
1525         int16x16 e0 = add_x16(d0, d4);
1526         int16x16 e4 = sub_x16(d0, d4);
1527         // vector_butterfly 32 96 1 0
1528         int16x16 e2 = add_x16(d2, d6);
1529         int16x16 e6 = sub_x16(d2, d6);
1530         // vector_butterfly 16 80 4 1
1531         d5 = mulmod_scaled_x16(d5, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1532         int16x16 e1 = add_x16(d1, d5);
1533         int16x16 e5 = sub_x16(d1, d5);
1534         // vector_butterfly 48 112 4 1
1535         d7 = mulmod_scaled_x16(d7, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
1536         int16x16 e3 = add_x16(d3, d7);
1537         int16x16 e7 = sub_x16(d3, d7);
1538         // stopbatch 128
1539         _mm256_storeu_si256((int16x16 *) (f + 0), e0);
1540         _mm256_storeu_si256((int16x16 *) (f + 16), e1);
1541         _mm256_storeu_si256((int16x16 *) (f + 32), e2);
1542         _mm256_storeu_si256((int16x16 *) (f + 48), e3);
1543         _mm256_storeu_si256((int16x16 *) (f + 64), e4);
1544         _mm256_storeu_si256((int16x16 *) (f + 80), e5);
1545         _mm256_storeu_si256((int16x16 *) (f + 96), e6);
1546         _mm256_storeu_si256((int16x16 *) (f + 112), e7);
1547         f += 128;
1548     }
1549     // f -= 128*reps;
1550     // stopntt 512
1551 }
1552 
1553 void PQCLEAN_SNTRUP761_AVX2_ntt512_7681(int16 *f, int reps) {
1554     ntt512(f, reps, qdata_7681.data);
1555 }
1556 
1557 void PQCLEAN_SNTRUP761_AVX2_ntt512_10753(int16 *f, int reps) {
1558     ntt512(f, reps, qdata_10753.data);
1559 }
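
// Hedged usage sketch: each rep transforms 512 consecutive int16 coefficients of
// f in place, so a caller with four size-512 inputs stored back to back could do
// (the buffer name is illustrative; no particular alignment is required, since
// the kernels use unaligned loads and stores):
//
//     int16 buf[4 * 512];
//     /* ... fill buf with coefficients ... */
//     PQCLEAN_SNTRUP761_AVX2_ntt512_7681(buf, 4);   // forward NTT of all four, mod 7681
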
1560 // inv stopntt 512
1561 
1562 static void invntt512(int16 *f, int reps, const int16 *qdata) {
1563     reps *= 4;
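    // The inverse starts from the 128-coefficient layout that ntt512 ends with
    // (hence reps *= 4, undone again by the two "inv doublereps" steps below) and
    // replays the forward passes in reverse order with inverted twiddles, e.g.
    // zeta_4^3 = zeta_4^-1 in place of zeta_4^1, and with the forward/inverse
    // twist tables swapped.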
1564     // inv stopbatch 128
1565     for (long long r = 0; r < reps; ++r) {
1566         // inv vector_butterfly 48 112 4 1
1567         int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48));
1568         int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112));
1569         int16x16 b3 = add_x16(a3, a7);
1570         int16x16 b7 = sub_x16(a3, a7);
1571         b7 = mulmod_scaled_x16(b7, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
1572         // inv vector_butterfly 16 80 4 1
1573         int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16));
1574         int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80));
1575         int16x16 b1 = add_x16(a1, a5);
1576         int16x16 b5 = sub_x16(a1, a5);
1577         b5 = mulmod_scaled_x16(b5, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
1578         // inv vector_butterfly 32 96 1 0
1579         int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32));
1580         int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96));
1581         int16x16 b2 = add_x16(a2, a6);
1582         int16x16 b6 = sub_x16(a2, a6);
1583         // inv vector_butterfly 0 64 1 0
1584         int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0));
1585         int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64));
1586         int16x16 b0 = add_x16(a0, a4);
1587         int16x16 b4 = sub_x16(a0, a4);
1588         // inv vector_butterfly 96 112 1 0
1589         int16x16 c6 = add_x16(b6, b7);
1590         int16x16 c7 = sub_x16(b6, b7);
1591         // inv vector_butterfly 32 48 1 0
1592         int16x16 c2 = add_x16(b2, b3);
1593         int16x16 c3 = sub_x16(b2, b3);
1594         // inv vector_butterfly 64 80 1 0
1595         int16x16 c4 = add_x16(b4, b5);
1596         int16x16 c5 = sub_x16(b4, b5);
1597         // inv vector_butterfly 0 16 1 0
1598         int16x16 c0 = add_x16(b0, b1);
1599         int16x16 c1 = sub_x16(b0, b1);
1600         // inv vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
1601         int16x16 d3 = _mm256_unpacklo_epi64(c3, c7);
1602         int16x16 d7 = _mm256_unpackhi_epi64(c3, c7);
1603         // inv vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
1604         int16x16 d2 = _mm256_unpacklo_epi64(c2, c6);
1605         int16x16 d6 = _mm256_unpackhi_epi64(c2, c6);
1606         // inv vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
1607         int16x16 d1 = _mm256_unpacklo_epi64(c1, c5);
1608         int16x16 d5 = _mm256_unpackhi_epi64(c1, c5);
1609         // inv vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
1610         int16x16 d0 = _mm256_unpacklo_epi64(c0, c4);
1611         int16x16 d4 = _mm256_unpackhi_epi64(c0, c4);
1612         // inv vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
1613         d7 = mulmod_scaled_x16(d7, precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata);
1614         // inv vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
1615         d6 = mulmod_scaled_x16(d6, precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata);
1616         // inv vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
1617         d5 = mulmod_scaled_x16(d5, precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata);
1618         // inv vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
1619         d4 = mulmod_scaled_x16(d4, precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata);
1620         // inv vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
1621         d3 = mulmod_scaled_x16(d3, precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata);
1622         // inv vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
1623         d2 = mulmod_scaled_x16(d2, precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata);
1624         // inv vector_reduce 16
1625         d1 = reduce_x16(d1, qdata);
1626         // inv vector_reduce 0
1627         d0 = reduce_x16(d0, qdata);
1628         // inv vector_butterfly 80 112 4 1
1629         int16x16 e5 = add_x16(d5, d7);
1630         int16x16 e7 = sub_x16(d5, d7);
1631         e7 = mulmod_scaled_x16(e7, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
1632         // inv vector_butterfly 64 96 4 1
1633         int16x16 e4 = add_x16(d4, d6);
1634         int16x16 e6 = sub_x16(d4, d6);
1635         e6 = mulmod_scaled_x16(e6, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
1636         // inv vector_butterfly 16 48 1 0
1637         int16x16 e1 = add_x16(d1, d3);
1638         int16x16 e3 = sub_x16(d1, d3);
1639         // inv vector_butterfly 0 32 1 0
1640         int16x16 e0 = add_x16(d0, d2);
1641         int16x16 e2 = sub_x16(d0, d2);
1642         // inv startbatch 128
1643         _mm256_storeu_si256((int16x16 *) (f + 0), e0);
1644         _mm256_storeu_si256((int16x16 *) (f + 16), e1);
1645         _mm256_storeu_si256((int16x16 *) (f + 32), e2);
1646         _mm256_storeu_si256((int16x16 *) (f + 48), e3);
1647         _mm256_storeu_si256((int16x16 *) (f + 64), e4);
1648         _mm256_storeu_si256((int16x16 *) (f + 80), e5);
1649         _mm256_storeu_si256((int16x16 *) (f + 96), e6);
1650         _mm256_storeu_si256((int16x16 *) (f + 112), e7);
1651         f += 128;
1652     }
1653     f -= 128 * reps;
1654     // inv stopbatch 128
1655     for (long long r = 0; r < reps; ++r) {
1656         // inv vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
1657         int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96));
1658         int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112));
1659         int16x16 b6 = _mm256_unpacklo_epi32(a6, a7);
1660         int16x16 b7 = _mm256_unpackhi_epi32(a6, a7);
1661         int16x16 c6 = _mm256_unpacklo_epi32(b6, b7);
1662         int16x16 c7 = _mm256_unpackhi_epi32(b6, b7);
1663         // inv vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
1664         int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64));
1665         int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80));
1666         int16x16 b4 = _mm256_unpacklo_epi32(a4, a5);
1667         int16x16 b5 = _mm256_unpackhi_epi32(a4, a5);
1668         int16x16 c4 = _mm256_unpacklo_epi32(b4, b5);
1669         int16x16 c5 = _mm256_unpackhi_epi32(b4, b5);
1670         // inv vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
1671         int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32));
1672         int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48));
1673         int16x16 b2 = _mm256_unpacklo_epi32(a2, a3);
1674         int16x16 b3 = _mm256_unpackhi_epi32(a2, a3);
1675         int16x16 c2 = _mm256_unpacklo_epi32(b2, b3);
1676         int16x16 c3 = _mm256_unpackhi_epi32(b2, b3);
1677         // inv vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
1678         int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0));
1679         int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16));
1680         int16x16 b0 = _mm256_unpacklo_epi32(a0, a1);
1681         int16x16 b1 = _mm256_unpackhi_epi32(a0, a1);
1682         int16x16 c0 = _mm256_unpacklo_epi32(b0, b1);
1683         int16x16 c1 = _mm256_unpackhi_epi32(b0, b1);
1684         // inv vector_butterfly 48 112 1 0
1685         int16x16 d3 = add_x16(c3, c7);
1686         int16x16 d7 = sub_x16(c3, c7);
1687         // inv vector_butterfly 16 80 1 0
1688         int16x16 d1 = add_x16(c1, c5);
1689         int16x16 d5 = sub_x16(c1, c5);
1690         // inv vector_butterfly 32 96 1 0
1691         int16x16 d2 = add_x16(c2, c6);
1692         int16x16 d6 = sub_x16(c2, c6);
1693         // inv vector_butterfly 0 64 1 0
1694         int16x16 d0 = add_x16(c0, c4);
1695         int16x16 d4 = sub_x16(c0, c4);
1696         // inv vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
1697         int16x16 e5 = _mm256_unpacklo_epi16(d5, d7);
1698         int16x16 e7 = _mm256_unpackhi_epi16(d5, d7);
1699         int16x16 f5 = _mm256_unpacklo_epi16(e5, e7);
1700         int16x16 f7 = _mm256_unpackhi_epi16(e5, e7);
1701         int16x16 g5 = _mm256_unpacklo_epi16(f5, f7);
1702         int16x16 g7 = _mm256_unpackhi_epi16(f5, f7);
1703         // inv vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
1704         int16x16 e4 = _mm256_unpacklo_epi16(d4, d6);
1705         int16x16 e6 = _mm256_unpackhi_epi16(d4, d6);
1706         int16x16 f4 = _mm256_unpacklo_epi16(e4, e6);
1707         int16x16 f6 = _mm256_unpackhi_epi16(e4, e6);
1708         int16x16 g4 = _mm256_unpacklo_epi16(f4, f6);
1709         int16x16 g6 = _mm256_unpackhi_epi16(f4, f6);
1710         // inv vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
1711         int16x16 e1 = _mm256_unpacklo_epi16(d1, d3);
1712         int16x16 e3 = _mm256_unpackhi_epi16(d1, d3);
1713         int16x16 f1 = _mm256_unpacklo_epi16(e1, e3);
1714         int16x16 f3 = _mm256_unpackhi_epi16(e1, e3);
1715         int16x16 g1 = _mm256_unpacklo_epi16(f1, f3);
1716         int16x16 g3 = _mm256_unpackhi_epi16(f1, f3);
1717         // inv vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
1718         int16x16 e0 = _mm256_unpacklo_epi16(d0, d2);
1719         int16x16 e2 = _mm256_unpackhi_epi16(d0, d2);
1720         int16x16 f0 = _mm256_unpacklo_epi16(e0, e2);
1721         int16x16 f2 = _mm256_unpackhi_epi16(e0, e2);
1722         int16x16 g0 = _mm256_unpacklo_epi16(f0, f2);
1723         int16x16 g2 = _mm256_unpackhi_epi16(f0, f2);
1724         // inv vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
1725         g7 = mulmod_scaled_x16(g7, precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata);
1726         // inv vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
1727         g3 = mulmod_scaled_x16(g3, precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata);
1728         // inv vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
1729         g6 = mulmod_scaled_x16(g6, precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata);
1730         // inv vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
1731         g2 = mulmod_scaled_x16(g2, precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata);
1732         // inv vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
1733         g5 = mulmod_scaled_x16(g5, precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata);
1734         // inv vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
1735         g1 = mulmod_scaled_x16(g1, precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata);
1736         // inv vector_reduce 64
1737         g4 = reduce_x16(g4, qdata);
1738         // inv vector_reduce 0
1739         g0 = reduce_x16(g0, qdata);
1740         // inv vector_butterfly 96 112 4 1
1741         int16x16 h6 = add_x16(g6, g7);
1742         int16x16 h7 = sub_x16(g6, g7);
1743         h7 = mulmod_scaled_x16(h7, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 32 48 4 1
        int16x16 h2 = add_x16(g2, g3);
        int16x16 h3 = sub_x16(g2, g3);
        h3 = mulmod_scaled_x16(h3, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 64 80 1 0
        int16x16 h4 = add_x16(g4, g5);
        int16x16 h5 = sub_x16(g4, g5);
        // inv vector_butterfly 0 16 1 0
        int16x16 h0 = add_x16(g0, g1);
        int16x16 h1 = sub_x16(g0, g1);
        // inv vector_butterfly 80 112 1 0
        int16x16 i5 = add_x16(h5, h7);
        int16x16 i7 = sub_x16(h5, h7);
        // inv vector_butterfly 16 48 1 0
        int16x16 i1 = add_x16(h1, h3);
        int16x16 i3 = sub_x16(h1, h3);
        // inv vector_butterfly 64 96 1 0
        int16x16 i4 = add_x16(h4, h6);
        int16x16 i6 = sub_x16(h4, h6);
        // inv vector_butterfly 0 32 1 0
        int16x16 i0 = add_x16(h0, h2);
        int16x16 i2 = sub_x16(h0, h2);
        // inv startbatch 128
        _mm256_storeu_si256((int16x16 *) (f + 0), i0);
        _mm256_storeu_si256((int16x16 *) (f + 16), i1);
        _mm256_storeu_si256((int16x16 *) (f + 32), i2);
        _mm256_storeu_si256((int16x16 *) (f + 48), i3);
        _mm256_storeu_si256((int16x16 *) (f + 64), i4);
        _mm256_storeu_si256((int16x16 *) (f + 80), i5);
        _mm256_storeu_si256((int16x16 *) (f + 96), i6);
        _mm256_storeu_si256((int16x16 *) (f + 112), i7);
        f += 128;
    }
    f -= 128 * reps;
    // inv doublereps
    reps /= 2;
    // inv doublereps
    reps /= 2;
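    // The two halvings undo the forward transform's doublereps steps: the
    // 128-coefficient batches above ran with reps scaled up by 4, and the
    // 512-coefficient passes below again iterate once per block.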
    // inv stopbatch 512
    for (long long r = 0; r < reps; ++r) {
        // inv vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a27 = _mm256_loadu_si256((int16x16 *) (f + 432));
        int16x16 a31 = _mm256_loadu_si256((int16x16 *) (f + 496));
        int16x16 b27 = _mm256_permute2x128_si256_lo(a27, a31);
        int16x16 b31 = _mm256_permute2x128_si256_hi(a27, a31);
        // inv vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a19 = _mm256_loadu_si256((int16x16 *) (f + 304));
        int16x16 a23 = _mm256_loadu_si256((int16x16 *) (f + 368));
        int16x16 b19 = _mm256_permute2x128_si256_lo(a19, a23);
        int16x16 b23 = _mm256_permute2x128_si256_hi(a19, a23);
        // inv vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a11 = _mm256_loadu_si256((int16x16 *) (f + 176));
        int16x16 a15 = _mm256_loadu_si256((int16x16 *) (f + 240));
        int16x16 b11 = _mm256_permute2x128_si256_lo(a11, a15);
        int16x16 b15 = _mm256_permute2x128_si256_hi(a11, a15);
        // inv vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48));
        int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112));
        int16x16 b3 = _mm256_permute2x128_si256_lo(a3, a7);
        int16x16 b7 = _mm256_permute2x128_si256_hi(a3, a7);
        // inv vector_twist 496 512 507 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        b31 = mulmod_scaled_x16(b31, precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // inv vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        b27 = mulmod_scaled_x16(b27, precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // inv vector_twist 368 512 5 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        b23 = mulmod_scaled_x16(b23, precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // inv vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        b19 = mulmod_scaled_x16(b19, precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // inv vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        b15 = mulmod_scaled_x16(b15, precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // inv vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        b11 = mulmod_scaled_x16(b11, precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // inv vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        b7 = mulmod_scaled_x16(b7, precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // inv vector_reduce 48
        b3 = reduce_x16(b3, qdata);
        // inv vector_butterfly 432 496 8 7
        int16x16 c27 = add_x16(b27, b31);
        int16x16 c31 = sub_x16(b27, b31);
        c31 = mulmod_scaled_x16(c31, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
        // inv vector_butterfly 304 368 8 1
        int16x16 c19 = add_x16(b19, b23);
        int16x16 c23 = sub_x16(b19, b23);
        c23 = mulmod_scaled_x16(c23, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
        // inv vector_butterfly 176 240 4 1
        int16x16 c11 = add_x16(b11, b15);
        int16x16 c15 = sub_x16(b11, b15);
        c15 = mulmod_scaled_x16(c15, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 48 112 1 0
        int16x16 c3 = add_x16(b3, b7);
        int16x16 c7 = sub_x16(b3, b7);
        // inv vector_reduce_ifforward 112
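        // vector_reduce_ifforward emits no code in the inverse direction;
        // only the vector_reduce_ifreverse steps below perform a reduce_x16.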
        // inv vector_butterfly 368 496 4 1
        int16x16 d23 = add_x16(c23, c31);
        int16x16 d31 = sub_x16(c23, c31);
        d31 = mulmod_scaled_x16(d31, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 304 432 4 1
        int16x16 d19 = add_x16(c19, c27);
        int16x16 d27 = sub_x16(c19, c27);
        d27 = mulmod_scaled_x16(d27, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 112 240 1 0
        int16x16 d7 = add_x16(c7, c15);
        int16x16 d15 = sub_x16(c7, c15);
        // inv vector_butterfly 48 176 1 0
        int16x16 d3 = add_x16(c3, c11);
        int16x16 d11 = sub_x16(c3, c11);
        // inv vector_reduce_ifreverse 304
        d19 = reduce_x16(d19, qdata);
        // inv vector_reduce_ifreverse 48
        d3 = reduce_x16(d3, qdata);
        // inv vector_butterfly 240 496 1 0
        int16x16 e15 = add_x16(d15, d31);
        int16x16 e31 = sub_x16(d15, d31);
        // inv vector_butterfly 112 368 1 0
        int16x16 e7 = add_x16(d7, d23);
        int16x16 e23 = sub_x16(d7, d23);
        // inv vector_butterfly 176 432 1 0
        int16x16 e11 = add_x16(d11, d27);
        int16x16 e27 = sub_x16(d11, d27);
        // inv vector_butterfly 48 304 1 0
        int16x16 e3 = add_x16(d3, d19);
        int16x16 e19 = sub_x16(d3, d19);
        // inv startbatch 512
        _mm256_storeu_si256((int16x16 *) (f + 48), e3);
        _mm256_storeu_si256((int16x16 *) (f + 112), e7);
        _mm256_storeu_si256((int16x16 *) (f + 176), e11);
        _mm256_storeu_si256((int16x16 *) (f + 240), e15);
        _mm256_storeu_si256((int16x16 *) (f + 304), e19);
        _mm256_storeu_si256((int16x16 *) (f + 368), e23);
        _mm256_storeu_si256((int16x16 *) (f + 432), e27);
        _mm256_storeu_si256((int16x16 *) (f + 496), e31);
        f += 512;
    }
    f -= 512 * reps;
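    // The next three passes repeat the same permute/twist/butterfly schedule
    // for the remaining 16-word slices of each 64-coefficient group (offsets
    // 32+64k, 16+64k and 0+64k); only the load/store offsets and the
    // per-slice precomp tables differ.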
    // inv stopbatch 512
    for (long long r = 0; r < reps; ++r) {
        // inv vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a26 = _mm256_loadu_si256((int16x16 *) (f + 416));
        int16x16 a30 = _mm256_loadu_si256((int16x16 *) (f + 480));
        int16x16 b26 = _mm256_permute2x128_si256_lo(a26, a30);
        int16x16 b30 = _mm256_permute2x128_si256_hi(a26, a30);
        // inv vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a18 = _mm256_loadu_si256((int16x16 *) (f + 288));
        int16x16 a22 = _mm256_loadu_si256((int16x16 *) (f + 352));
        int16x16 b18 = _mm256_permute2x128_si256_lo(a18, a22);
        int16x16 b22 = _mm256_permute2x128_si256_hi(a18, a22);
        // inv vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a10 = _mm256_loadu_si256((int16x16 *) (f + 160));
        int16x16 a14 = _mm256_loadu_si256((int16x16 *) (f + 224));
        int16x16 b10 = _mm256_permute2x128_si256_lo(a10, a14);
        int16x16 b14 = _mm256_permute2x128_si256_hi(a10, a14);
        // inv vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32));
        int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96));
        int16x16 b2 = _mm256_permute2x128_si256_lo(a2, a6);
        int16x16 b6 = _mm256_permute2x128_si256_hi(a2, a6);
        // inv vector_twist 480 512 507 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        b30 = mulmod_scaled_x16(b30, precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // inv vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        b26 = mulmod_scaled_x16(b26, precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // inv vector_twist 352 512 5 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        b22 = mulmod_scaled_x16(b22, precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // inv vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        b18 = mulmod_scaled_x16(b18, precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // inv vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        b14 = mulmod_scaled_x16(b14, precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // inv vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        b10 = mulmod_scaled_x16(b10, precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // inv vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        b6 = mulmod_scaled_x16(b6, precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // inv vector_reduce 32
        b2 = reduce_x16(b2, qdata);
        // inv vector_butterfly 416 480 8 7
        int16x16 c26 = add_x16(b26, b30);
        int16x16 c30 = sub_x16(b26, b30);
        c30 = mulmod_scaled_x16(c30, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
        // inv vector_butterfly 288 352 8 1
        int16x16 c18 = add_x16(b18, b22);
        int16x16 c22 = sub_x16(b18, b22);
        c22 = mulmod_scaled_x16(c22, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
        // inv vector_butterfly 160 224 4 1
        int16x16 c10 = add_x16(b10, b14);
        int16x16 c14 = sub_x16(b10, b14);
        c14 = mulmod_scaled_x16(c14, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 32 96 1 0
        int16x16 c2 = add_x16(b2, b6);
        int16x16 c6 = sub_x16(b2, b6);
        // inv vector_reduce_ifforward 96
        // inv vector_butterfly 352 480 4 1
        int16x16 d22 = add_x16(c22, c30);
        int16x16 d30 = sub_x16(c22, c30);
        d30 = mulmod_scaled_x16(d30, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 288 416 4 1
        int16x16 d18 = add_x16(c18, c26);
        int16x16 d26 = sub_x16(c18, c26);
        d26 = mulmod_scaled_x16(d26, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 96 224 1 0
        int16x16 d6 = add_x16(c6, c14);
        int16x16 d14 = sub_x16(c6, c14);
        // inv vector_butterfly 32 160 1 0
        int16x16 d2 = add_x16(c2, c10);
        int16x16 d10 = sub_x16(c2, c10);
        // inv vector_reduce_ifreverse 288
        d18 = reduce_x16(d18, qdata);
        // inv vector_reduce_ifreverse 32
        d2 = reduce_x16(d2, qdata);
        // inv vector_butterfly 224 480 1 0
        int16x16 e14 = add_x16(d14, d30);
        int16x16 e30 = sub_x16(d14, d30);
        // inv vector_butterfly 96 352 1 0
        int16x16 e6 = add_x16(d6, d22);
        int16x16 e22 = sub_x16(d6, d22);
        // inv vector_butterfly 160 416 1 0
        int16x16 e10 = add_x16(d10, d26);
        int16x16 e26 = sub_x16(d10, d26);
        // inv vector_butterfly 32 288 1 0
        int16x16 e2 = add_x16(d2, d18);
        int16x16 e18 = sub_x16(d2, d18);
        // inv startbatch 512
        _mm256_storeu_si256((int16x16 *) (f + 32), e2);
        _mm256_storeu_si256((int16x16 *) (f + 96), e6);
        _mm256_storeu_si256((int16x16 *) (f + 160), e10);
        _mm256_storeu_si256((int16x16 *) (f + 224), e14);
        _mm256_storeu_si256((int16x16 *) (f + 288), e18);
        _mm256_storeu_si256((int16x16 *) (f + 352), e22);
        _mm256_storeu_si256((int16x16 *) (f + 416), e26);
        _mm256_storeu_si256((int16x16 *) (f + 480), e30);
        f += 512;
    }
    f -= 512 * reps;
    // inv stopbatch 512
    for (long long r = 0; r < reps; ++r) {
        // inv vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a25 = _mm256_loadu_si256((int16x16 *) (f + 400));
        int16x16 a29 = _mm256_loadu_si256((int16x16 *) (f + 464));
        int16x16 b25 = _mm256_permute2x128_si256_lo(a25, a29);
        int16x16 b29 = _mm256_permute2x128_si256_hi(a25, a29);
        // inv vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a17 = _mm256_loadu_si256((int16x16 *) (f + 272));
        int16x16 a21 = _mm256_loadu_si256((int16x16 *) (f + 336));
        int16x16 b17 = _mm256_permute2x128_si256_lo(a17, a21);
        int16x16 b21 = _mm256_permute2x128_si256_hi(a17, a21);
        // inv vector_permute 144 208 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a9 = _mm256_loadu_si256((int16x16 *) (f + 144));
        int16x16 a13 = _mm256_loadu_si256((int16x16 *) (f + 208));
        int16x16 b9 = _mm256_permute2x128_si256_lo(a9, a13);
        int16x16 b13 = _mm256_permute2x128_si256_hi(a9, a13);
        // inv vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16));
        int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80));
        int16x16 b1 = _mm256_permute2x128_si256_lo(a1, a5);
        int16x16 b5 = _mm256_permute2x128_si256_hi(a1, a5);
        // inv vector_twist 464 512 507 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        b29 = mulmod_scaled_x16(b29, precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // inv vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        b25 = mulmod_scaled_x16(b25, precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // inv vector_twist 336 512 5 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        b21 = mulmod_scaled_x16(b21, precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // inv vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        b17 = mulmod_scaled_x16(b17, precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // inv vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        b13 = mulmod_scaled_x16(b13, precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // inv vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        b9 = mulmod_scaled_x16(b9, precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // inv vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        b5 = mulmod_scaled_x16(b5, precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // inv vector_reduce 16
        b1 = reduce_x16(b1, qdata);
        // inv vector_butterfly 400 464 8 7
        int16x16 c25 = add_x16(b25, b29);
        int16x16 c29 = sub_x16(b25, b29);
        c29 = mulmod_scaled_x16(c29, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
        // inv vector_butterfly 272 336 8 1
        int16x16 c17 = add_x16(b17, b21);
        int16x16 c21 = sub_x16(b17, b21);
        c21 = mulmod_scaled_x16(c21, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
        // inv vector_butterfly 144 208 4 1
        int16x16 c9 = add_x16(b9, b13);
        int16x16 c13 = sub_x16(b9, b13);
        c13 = mulmod_scaled_x16(c13, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 16 80 1 0
        int16x16 c1 = add_x16(b1, b5);
        int16x16 c5 = sub_x16(b1, b5);
        // inv vector_reduce_ifforward 80
        // inv vector_butterfly 336 464 4 1
        int16x16 d21 = add_x16(c21, c29);
        int16x16 d29 = sub_x16(c21, c29);
        d29 = mulmod_scaled_x16(d29, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 272 400 4 1
        int16x16 d17 = add_x16(c17, c25);
        int16x16 d25 = sub_x16(c17, c25);
        d25 = mulmod_scaled_x16(d25, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 80 208 1 0
        int16x16 d5 = add_x16(c5, c13);
        int16x16 d13 = sub_x16(c5, c13);
        // inv vector_butterfly 16 144 1 0
        int16x16 d1 = add_x16(c1, c9);
        int16x16 d9 = sub_x16(c1, c9);
        // inv vector_reduce_ifreverse 272
        d17 = reduce_x16(d17, qdata);
        // inv vector_reduce_ifreverse 16
        d1 = reduce_x16(d1, qdata);
        // inv vector_butterfly 208 464 1 0
        int16x16 e13 = add_x16(d13, d29);
        int16x16 e29 = sub_x16(d13, d29);
        // inv vector_butterfly 80 336 1 0
        int16x16 e5 = add_x16(d5, d21);
        int16x16 e21 = sub_x16(d5, d21);
        // inv vector_butterfly 144 400 1 0
        int16x16 e9 = add_x16(d9, d25);
        int16x16 e25 = sub_x16(d9, d25);
        // inv vector_butterfly 16 272 1 0
        int16x16 e1 = add_x16(d1, d17);
        int16x16 e17 = sub_x16(d1, d17);
        // inv startbatch 512
        _mm256_storeu_si256((int16x16 *) (f + 16), e1);
        _mm256_storeu_si256((int16x16 *) (f + 80), e5);
        _mm256_storeu_si256((int16x16 *) (f + 144), e9);
        _mm256_storeu_si256((int16x16 *) (f + 208), e13);
        _mm256_storeu_si256((int16x16 *) (f + 272), e17);
        _mm256_storeu_si256((int16x16 *) (f + 336), e21);
        _mm256_storeu_si256((int16x16 *) (f + 400), e25);
        _mm256_storeu_si256((int16x16 *) (f + 464), e29);
        f += 512;
    }
    f -= 512 * reps;
    // inv stopbatch 512
    for (long long r = 0; r < reps; ++r) {
        // inv vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a24 = _mm256_loadu_si256((int16x16 *) (f + 384));
        int16x16 a28 = _mm256_loadu_si256((int16x16 *) (f + 448));
        int16x16 b24 = _mm256_permute2x128_si256_lo(a24, a28);
        int16x16 b28 = _mm256_permute2x128_si256_hi(a24, a28);
        // inv vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a16 = _mm256_loadu_si256((int16x16 *) (f + 256));
        int16x16 a20 = _mm256_loadu_si256((int16x16 *) (f + 320));
        int16x16 b16 = _mm256_permute2x128_si256_lo(a16, a20);
        int16x16 b20 = _mm256_permute2x128_si256_hi(a16, a20);
        // inv vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a8 = _mm256_loadu_si256((int16x16 *) (f + 128));
        int16x16 a12 = _mm256_loadu_si256((int16x16 *) (f + 192));
        int16x16 b8 = _mm256_permute2x128_si256_lo(a8, a12);
        int16x16 b12 = _mm256_permute2x128_si256_hi(a8, a12);
        // inv vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0));
        int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64));
        int16x16 b0 = _mm256_permute2x128_si256_lo(a0, a4);
        int16x16 b4 = _mm256_permute2x128_si256_hi(a0, a4);
        // inv vector_twist 448 512 507 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        b28 = mulmod_scaled_x16(b28, precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // inv vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        b24 = mulmod_scaled_x16(b24, precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // inv vector_twist 320 512 5 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        b20 = mulmod_scaled_x16(b20, precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // inv vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        b16 = mulmod_scaled_x16(b16, precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // inv vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        b12 = mulmod_scaled_x16(b12, precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // inv vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        b8 = mulmod_scaled_x16(b8, precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // inv vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        b4 = mulmod_scaled_x16(b4, precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // inv vector_reduce 0
        b0 = reduce_x16(b0, qdata);
        // inv vector_butterfly 384 448 8 7
        int16x16 c24 = add_x16(b24, b28);
        int16x16 c28 = sub_x16(b24, b28);
        c28 = mulmod_scaled_x16(c28, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
        // inv vector_butterfly 256 320 8 1
        int16x16 c16 = add_x16(b16, b20);
        int16x16 c20 = sub_x16(b16, b20);
        c20 = mulmod_scaled_x16(c20, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
        // inv vector_butterfly 128 192 4 1
        int16x16 c8 = add_x16(b8, b12);
        int16x16 c12 = sub_x16(b8, b12);
        c12 = mulmod_scaled_x16(c12, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 0 64 1 0
        int16x16 c0 = add_x16(b0, b4);
        int16x16 c4 = sub_x16(b0, b4);
        // inv vector_reduce_ifforward 64
        // inv vector_butterfly 320 448 4 1
        int16x16 d20 = add_x16(c20, c28);
        int16x16 d28 = sub_x16(c20, c28);
        d28 = mulmod_scaled_x16(d28, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 256 384 4 1
        int16x16 d16 = add_x16(c16, c24);
        int16x16 d24 = sub_x16(c16, c24);
        d24 = mulmod_scaled_x16(d24, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 64 192 1 0
        int16x16 d4 = add_x16(c4, c12);
        int16x16 d12 = sub_x16(c4, c12);
        // inv vector_butterfly 0 128 1 0
        int16x16 d0 = add_x16(c0, c8);
        int16x16 d8 = sub_x16(c0, c8);
        // inv vector_reduce_ifreverse 256
        d16 = reduce_x16(d16, qdata);
        // inv vector_reduce_ifreverse 0
        d0 = reduce_x16(d0, qdata);
        // inv vector_butterfly 192 448 1 0
        int16x16 e12 = add_x16(d12, d28);
        int16x16 e28 = sub_x16(d12, d28);
        // inv vector_butterfly 64 320 1 0
        int16x16 e4 = add_x16(d4, d20);
        int16x16 e20 = sub_x16(d4, d20);
        // inv vector_butterfly 128 384 1 0
        int16x16 e8 = add_x16(d8, d24);
        int16x16 e24 = sub_x16(d8, d24);
        // inv vector_butterfly 0 256 1 0
        int16x16 e0 = add_x16(d0, d16);
        int16x16 e16 = sub_x16(d0, d16);
        // inv startbatch 512
        _mm256_storeu_si256((int16x16 *) (f + 0), e0);
        _mm256_storeu_si256((int16x16 *) (f + 64), e4);
        _mm256_storeu_si256((int16x16 *) (f + 128), e8);
        _mm256_storeu_si256((int16x16 *) (f + 192), e12);
        _mm256_storeu_si256((int16x16 *) (f + 256), e16);
        _mm256_storeu_si256((int16x16 *) (f + 320), e20);
        _mm256_storeu_si256((int16x16 *) (f + 384), e24);
        _mm256_storeu_si256((int16x16 *) (f + 448), e28);
        f += 512;
    }
    // f -= 512*reps;
    // inv startntt 512
}
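
// For reference only (not generated): a scalar sketch of the inverse
// (Gentleman-Sande) butterfly that each "inv vector_butterfly" step above
// applies to 16 lanes at once. The name inv_butterfly_ref and the use of the
// C % operator are illustrative; the vectorized code instead keeps sums
// unreduced where possible and multiplies by pre-scaled constants via
// mulmod_scaled_x16.
#if 0
static void inv_butterfly_ref(int16 *a, int16 *b, int32 zeta, int32 q) {
    int32 x = *a;
    int32 y = *b;
    *a = (int16) ((x + y) % q);          // sum stays in the lower slot
    *b = (int16) (((x - y) * zeta) % q); // difference is twisted by zeta
}
#endif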

void PQCLEAN_SNTRUP761_AVX2_invntt512_7681(int16 *f, int reps) {
    invntt512(f, reps, qdata_7681.data);
}

void PQCLEAN_SNTRUP761_AVX2_invntt512_10753(int16 *f, int reps) {
    invntt512(f, reps, qdata_10753.data);
}
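
// Hypothetical usage sketch (not part of the generated code): the loop bounds
// in invntt512 suggest that reps counts 512-coefficient blocks, each
// transformed in place. The buffer and function below are illustrative only.
#if 0
static void invntt512_example(void) {
    int16 blocks[2 * 512] = {0};                      // two 512-coefficient blocks
    PQCLEAN_SNTRUP761_AVX2_invntt512_7681(blocks, 2); // inverse NTT mod 7681, in place
}
#endif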