xref: /qemu/target/hexagon/imported/mmvec/ext.idef (revision 940bb5fa)
1/*
2 *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
3 *
4 *  This program is free software; you can redistribute it and/or modify
5 *  it under the terms of the GNU General Public License as published by
6 *  the Free Software Foundation; either version 2 of the License, or
7 *  (at your option) any later version.
8 *
9 *  This program is distributed in the hope that it will be useful,
10 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 *  GNU General Public License for more details.
13 *
14 *  You should have received a copy of the GNU General Public License
15 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
16 */
17
18/******************************************************************************
19 *
20 *     HOYA: MULTI MEDIA INSTRUCTIONS
21 *
22 ******************************************************************************/
23
/*
 * Fallback for includers that did not provide their own EXTINSN handler:
 * map EXTINSN onto Q6INSN and set __SELF_DEF_EXTINSN to mark that the
 * definition was made here.
 */
#if !defined(EXTINSN)
#  define EXTINSN Q6INSN
#  define __SELF_DEF_EXTINSN 1
#endif
28
29#ifndef NO_MMVEC
30
/*
 * Wrap CODE in a braced block that declares the element index `i` (via
 * fHIDE) and repeats CODE inside an fVFOREACH(WIDTH, i) loop.  CODE may
 * therefore refer to `i`.
 */
#define DO_FOR_EACH_CODE(WIDTH, CODE)   \
    {                                   \
        fHIDE(int i;)                   \
        fVFOREACH(WIDTH, i) {           \
            CODE ;                      \
        }                               \
    }
38
39
40
41
/*
 * Iterator instruction tagged A_CVI_VA.  CODE is the per-element body and
 * is repeated for every WIDTH-bit element by DO_FOR_EACH_CODE.
 */
#define ITERATOR_INSN_ANY_SLOT(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))

/* Two-syntax form: SYNTAX is dropped, SYNTAX2 is what gets forwarded. */
#define ITERATOR_INSN2_ANY_SLOT(WIDTH, TAG, SYNTAX, SYNTAX2, DESCR, CODE) \
    ITERATOR_INSN_ANY_SLOT(WIDTH, TAG, SYNTAX2, DESCR, CODE)

/* Same shape as ITERATOR_INSN_ANY_SLOT, but tagged A_CVI_VA_DV. */
#define ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA_DV), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))

/* Two-syntax form of the A_CVI_VA_DV iterator; only SYNTAX2 is used. */
#define ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, TAG, SYNTAX, SYNTAX2, DESCR, CODE) \
    ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(WIDTH, TAG, SYNTAX2, DESCR, CODE)
58
59
/*
 * Iterator instruction tagged A_CVI_VS.  CODE is repeated for every
 * WIDTH-bit element by DO_FOR_EACH_CODE.
 */
#define ITERATOR_INSN_SHIFT_SLOT(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))

/* A_CVI_VS iterator that also carries A_CVI_VS_3SRC and three A_NOTE_* attributes. */
#define ITERATOR_INSN_SHIFT3_SLOT(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS,A_CVI_VS_3SRC,A_NOTE_SHIFT_RESOURCE,A_NOTE_NOVP,A_NOTE_VA_UNARY), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))

/* Expands to the same attribute list and body as ITERATOR_INSN_SHIFT_SLOT. */
#define ITERATOR_INSN_SHIFT_SLOT_VV_LATE(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))

/* Two-syntax form: SYNTAX is dropped, SYNTAX2 is what gets forwarded. */
#define ITERATOR_INSN2_SHIFT_SLOT(WIDTH, TAG, SYNTAX, SYNTAX2, DESCR, CODE) \
    ITERATOR_INSN_SHIFT_SLOT(WIDTH, TAG, SYNTAX2, DESCR, CODE)
75
/*
 * Iterator instruction tagged A_CVI_VP.  CODE is repeated for every
 * WIDTH-bit element by DO_FOR_EACH_CODE.
 */
#define ITERATOR_INSN_PERMUTE_SLOT(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))

/* Two-syntax form: SYNTAX is dropped, SYNTAX2 is what gets forwarded. */
#define ITERATOR_INSN2_PERMUTE_SLOT(WIDTH, TAG, SYNTAX, SYNTAX2, DESCR, CODE) \
    ITERATOR_INSN_PERMUTE_SLOT(WIDTH, TAG, SYNTAX2, DESCR, CODE)
82
/*
 * Iterator instruction tagged A_CVI_VP ("_DEP" flavour).
 *
 * The definition previously ended right after the ATTRIBS(...) argument,
 * leaving EXTINSN without its DESCR/semantics arguments and without a
 * closing parenthesis, so any use failed to preprocess.  It is completed
 * here with the same DESCR + DO_FOR_EACH_CODE tail used by
 * ITERATOR_INSN_PERMUTE_SLOT.
 */
#define ITERATOR_INSN_PERMUTE_SLOT_DEP(WIDTH,TAG,SYNTAX,DESCR,CODE) \
EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),  \
DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))

/* Two-syntax form: SYNTAX is dropped, SYNTAX2 is what gets forwarded. */
#define ITERATOR_INSN2_PERMUTE_SLOT_DEP(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
ITERATOR_INSN_PERMUTE_SLOT_DEP(WIDTH,TAG,SYNTAX2,DESCR,CODE)
89
/*
 * Iterator instruction tagged A_CVI_VP_VS.  CODE is repeated for every
 * WIDTH-bit element by DO_FOR_EACH_CODE.
 */
#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))

/* Expands exactly like ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC. */
#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_DEP(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))

/* Two-syntax form: SYNTAX is dropped, SYNTAX2 is what gets forwarded. */
#define ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(WIDTH, TAG, SYNTAX, SYNTAX2, DESCR, CODE) \
    ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(WIDTH, TAG, SYNTAX2, DESCR, CODE)
100
/*
 * Iterator instruction tagged A_CVI_VX.  CODE is repeated for every
 * WIDTH-bit element by DO_FOR_EACH_CODE.
 */
#define ITERATOR_INSN_MPY_SLOT(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))

/* Expands exactly like ITERATOR_INSN_MPY_SLOT. */
#define ITERATOR_INSN_MPY_SLOT_LATE(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))

/* Two-syntax form: SYNTAX is dropped, SYNTAX2 is what gets forwarded. */
#define ITERATOR_INSN2_MPY_SLOT(WIDTH, TAG, SYNTAX, SYNTAX2, DESCR, CODE) \
    ITERATOR_INSN_MPY_SLOT(WIDTH, TAG, SYNTAX2, DESCR, CODE)

/* Two-syntax form of ITERATOR_INSN_MPY_SLOT_LATE; only SYNTAX2 is used. */
#define ITERATOR_INSN2_MPY_SLOT_LATE(WIDTH, TAG, SYNTAX, SYNTAX2, DESCR, CODE) \
    ITERATOR_INSN_MPY_SLOT_LATE(WIDTH, TAG, SYNTAX2, DESCR, CODE)
116
117
/*
 * Iterator instruction tagged A_CVI_VX_DV.  CODE is repeated for every
 * WIDTH-bit element by DO_FOR_EACH_CODE.
 */
#define ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))

/* Expands exactly like ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC. */
#define ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_VX_FWD(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))

/* Two-syntax form: SYNTAX is dropped, SYNTAX2 is what gets forwarded. */
#define ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(WIDTH, TAG, SYNTAX, SYNTAX2, DESCR, CODE) \
    ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(WIDTH, TAG, SYNTAX2, DESCR, CODE)
128
129
130
131
/*
 * Two-syntax iterator tagged A_CVI_VX_DV and A_CVI_VX_VSRC0_IS_DST.
 *
 * Like every other ITERATOR_INSN2_* macro in this file, the second syntax
 * string (SYNTAX2) is the one handed to EXTINSN; SYNTAX is accepted only
 * for signature compatibility.  The previous definition forwarded SYNTAX,
 * which was inconsistent with its siblings.
 */
#define ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC2(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
EXTINSN(V6_##TAG, SYNTAX2, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV,A_CVI_VX_VSRC0_IS_DST), DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
134
/* A_CVI_VX_DV iterator that additionally carries A_RESTRICT_SLOT2ONLY. */
#define ITERATOR_INSN_SLOT2_DOUBLE_VEC(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV,A_RESTRICT_SLOT2ONLY), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))
137
/*
 * Iterator tagged A_CVI_4SLOT.  Before the per-element loop runs, a local
 * `input` vector is declared (via fHIDE) and loaded from fTMPVDATA(), so
 * CODE may refer to `input`.
 */
#define ITERATOR_INSN_VHISTLIKE(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_4SLOT), \
            DESCR, \
            fHIDE(mmvector_t input;) \
            input = fTMPVDATA(); \
            DO_FOR_EACH_CODE(WIDTH, CODE))
141
142
143
144
145
146/******************************************************************************************
147*
148* MMVECTOR MEMORY OPERATIONS - NO NAPALI V1
149*
150*******************************************************************************************/
151
152
153
/* "_NOV1" iterator tagged A_CVI_VX_DV; CODE runs once per WIDTH-bit element. */
#define ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_NOV1(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))

/* Two-syntax form: SYNTAX is dropped, SYNTAX2 is what gets forwarded. */
#define ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(WIDTH, TAG, SYNTAX, SYNTAX2, DESCR, CODE) \
    ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_NOV1(WIDTH, TAG, SYNTAX2, DESCR, CODE)
160
161
162
/* "_NOV1" iterator tagged A_CVI_VS; CODE runs once per WIDTH-bit element. */
#define ITERATOR_INSN_SHIFT_SLOT_NOV1(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))

/* Two-syntax form: SYNTAX is dropped, SYNTAX2 is what gets forwarded. */
#define ITERATOR_INSN2_SHIFT_SLOT_NOV1(WIDTH, TAG, SYNTAX, SYNTAX2, DESCR, CODE) \
    ITERATOR_INSN_SHIFT_SLOT_NOV1(WIDTH, TAG, SYNTAX2, DESCR, CODE)
169
170
/* "_NOV1" iterator tagged A_CVI_VA; CODE runs once per WIDTH-bit element. */
#define ITERATOR_INSN_ANY_SLOT_NOV1(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))

/* Two-syntax form: SYNTAX is dropped, SYNTAX2 is what gets forwarded. */
#define ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH, TAG, SYNTAX, SYNTAX2, DESCR, CODE) \
    ITERATOR_INSN_ANY_SLOT_NOV1(WIDTH, TAG, SYNTAX2, DESCR, CODE)
177
178
/* "_NOV1" iterator tagged A_CVI_VX; CODE runs once per WIDTH-bit element. */
#define ITERATOR_INSN_MPY_SLOT_NOV1(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))

/* "_NOV1" iterator tagged A_CVI_VP; CODE runs once per WIDTH-bit element. */
#define ITERATOR_INSN_PERMUTE_SLOT_NOV1(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))
187
/*
 * Two-syntax "_NOV1" permute iterator: SYNTAX is dropped and SYNTAX2 is
 * forwarded.  Delegates to ITERATOR_INSN_PERMUTE_SLOT_NOV1 so that it
 * follows the same pattern as the other ITERATOR_INSN2_*_NOV1 wrappers
 * (it used to go through the non-NOV1 ITERATOR_INSN_PERMUTE_SLOT, which
 * has the same attribute list and body).
 */
#define ITERATOR_INSN2_PERMUTE_SLOTT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
ITERATOR_INSN_PERMUTE_SLOT_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE)
190
/*
 * "_NOV1" permute iterator tagged A_CVI_VP.
 *
 * The definition previously stopped after the ATTRIBS(...) argument, so
 * EXTINSN never received DESCR or the semantics and was left unclosed.
 * It now ends with the same DESCR + DO_FOR_EACH_CODE tail as
 * ITERATOR_INSN_PERMUTE_SLOT_NOV1.
 */
#define ITERATOR_INSN_PERMUTE_SLOT_DEPT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \
EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),  \
DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))

/*
 * Two-syntax form: SYNTAX is dropped, SYNTAX2 is forwarded.  The wrapper
 * used to call ITERATOR_INSN_PERMUTE_SLOT_DEP_NOV1, a name that is not
 * defined in this file; it now calls the _DEPT_NOV1 macro defined
 * directly above.
 */
#define ITERATOR_INSN2_PERMUTE_SLOT_DEPT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
ITERATOR_INSN_PERMUTE_SLOT_DEPT_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE)
197
/* "_NOV1" iterator tagged A_CVI_VP_VS; CODE runs once per WIDTH-bit element. */
#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_NOV1(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))

/* Expands exactly like ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_NOV1. */
#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_DEPT_NOV1(WIDTH, TAG, SYNTAX, DESCR, CODE) \
    EXTINSN(V6_##TAG, SYNTAX, \
            ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), \
            DESCR, \
            DO_FOR_EACH_CODE(WIDTH, CODE))

/* Two-syntax form: SYNTAX is dropped, SYNTAX2 is what gets forwarded. */
#define ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC_NOV1(WIDTH, TAG, SYNTAX, SYNTAX2, DESCR, CODE) \
    ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_NOV1(WIDTH, TAG, SYNTAX2, DESCR, CODE)
208
/*
 * Template for "Vd.<DSTTYPE>=vasr(Vu.<SRCTYPE>,Vv.<SRCTYPE>,Rt8)<SYNOPTS>".
 * Per element: the shift amount is RtV masked with SHAMTMASK; the Vv and
 * Vu source elements are passed through RNDFUNC, shifted right, passed
 * through SATFUNC, and handed to DSTM with selector 0 (Vv) and 1 (Vu).
 */
#define NARROWING_SHIFT_NOV1(ITERSIZE, TAG, DSTM, DSTTYPE, SRCTYPE, SYNOPTS, \
                             SATFUNC, RNDFUNC, SHAMTMASK) \
    ITERATOR_INSN_SHIFT_SLOT_NOV1( \
        ITERSIZE, TAG, \
        "Vd32." #DSTTYPE "=vasr(Vu32." #SRCTYPE ",Vv32." #SRCTYPE ",Rt8)" #SYNOPTS, \
        "Vector shift right and shuffle", \
        fHIDE(int )shamt = RtV & SHAMTMASK; \
        DSTM(0,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VvV.SRCTYPE[i],shamt) >> shamt)); \
        DSTM(1,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuV.SRCTYPE[i],shamt) >> shamt)))
216
/*
 * Instantiates three instructions for one element type:
 * vavg<TYPE> (fVAVGS), vavg<TYPE>rnd (fVAVGSRND) and vnavg<TYPE> (fVNAVGS).
 * TYPE2 and DESCR are string literals spliced into the syntax/description.
 */
#define MMVEC_AVGS_NOV1(TYPE, TYPE2, DESCR, WIDTH, DEST, SRC) \
    ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH, vavg##TYPE, \
        "Vd32=vavg" TYPE2 "(Vu32,Vv32)", \
        "Vd32." #DEST "=vavg(Vu32." #SRC ",Vv32." #SRC ")", \
        "Vector Average " DESCR, \
        VdV.DEST[i] = fVAVGS(WIDTH, VuV.SRC[i], VvV.SRC[i])) \
    ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH, vavg##TYPE##rnd, \
        "Vd32=vavg" TYPE2 "(Vu32,Vv32):rnd", \
        "Vd32." #DEST "=vavg(Vu32." #SRC ",Vv32." #SRC "):rnd", \
        "Vector Average % Round" DESCR, \
        VdV.DEST[i] = fVAVGSRND(WIDTH, VuV.SRC[i], VvV.SRC[i])) \
    ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH, vnavg##TYPE, \
        "Vd32=vnavg" TYPE2 "(Vu32,Vv32)", \
        "Vd32." #DEST "=vnavg(Vu32." #SRC ",Vv32." #SRC ")", \
        "Vector Negative Average " DESCR, \
        VdV.DEST[i] = fVNAVGS(WIDTH, VuV.SRC[i], VvV.SRC[i]))
221
/*
 * Instantiates two instructions for one element type:
 * vavg<TYPE> (fVAVGU) and vavg<TYPE>rnd (fVAVGURND).
 * TYPE2 and DESCR are string literals spliced into the syntax/description.
 */
#define MMVEC_AVGU_NOV1(TYPE, TYPE2, DESCR, WIDTH, DEST, SRC) \
    ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH, vavg##TYPE, \
        "Vd32=vavg" TYPE2 "(Vu32,Vv32)", \
        "Vd32." #DEST "=vavg(Vu32." #SRC ",Vv32." #SRC ")", \
        "Vector Average " DESCR, \
        VdV.DEST[i] = fVAVGU(WIDTH, VuV.SRC[i], VvV.SRC[i])) \
    ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH, vavg##TYPE##rnd, \
        "Vd32=vavg" TYPE2 "(Vu32,Vv32):rnd", \
        "Vd32." #DEST "=vavg(Vu32." #SRC ",Vv32." #SRC "):rnd", \
        "Vector Average % Round" DESCR, \
        VdV.DEST[i] = fVAVGURND(WIDTH, VuV.SRC[i], VvV.SRC[i]))
225
226
227
228/******************************************************************************************
229*
230* MMVECTOR MEMORY OPERATIONS
231*
232*******************************************************************************************/
233
/*
 * Instantiate one memory instruction in each of three addressing forms:
 *   _pi  : (Rx32++#s3)  - fEA_REG(RxV), BEH, then fPM_I by VEC_SCALE(siV)
 *   _ai  : (Rt32+#s4)   - fEA_RI(RtV, VEC_SCALE(siV)), then BEH
 *   _ppu : (Rx32++Mu2)  - fEA_REG(RxV), BEH, then fPM_M(RxV, MuV)
 * The syntax string is SYNTAXA + address + NT + SYNTAXB.
 *
 * The last line deliberately has no trailing backslash: a dangling
 * continuation would splice whatever line follows into this macro.
 */
#define MMVEC_EACH_EA(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,BEH) \
EXTINSN(V6_##TAG##_pi,      SYNTAXA "(Rx32++#s3)" NT SYNTAXB,ATTRIB,DESCR,{ fEA_REG(RxV); BEH; fPM_I(RxV,VEC_SCALE(siV)); }) \
EXTINSN(V6_##TAG##_ai,      SYNTAXA "(Rt32+#s4)" NT SYNTAXB,ATTRIB,DESCR,{ fEA_RI(RtV,VEC_SCALE(siV)); BEH;}) \
EXTINSN(V6_##TAG##_ppu,      SYNTAXA "(Rx32++Mu2)" NT SYNTAXB,ATTRIB,DESCR,{ fEA_REG(RxV); BEH; fPM_M(RxV,MuV); })
238
239
/*
 * Predicated ("if (<SYNTAXP>4) ...") counterpart of MMVEC_EACH_EA.  For
 * each of the _pred_pi, _pred_ai and _pred_ppu forms the address/BEH/
 * post-modify sequence runs only when fLSBOLD(<SYNTAXP>V) holds;
 * otherwise CANCEL is executed.
 *
 * The last line deliberately has no trailing backslash: a dangling
 * continuation would splice whatever line follows into this macro.
 */
#define MMVEC_COND_EACH_EA_TRUE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \
EXTINSN(V6_##TAG##_pred_pi,      "if (" #SYNTAXP "4) " SYNTAXA "(Rx32++#s3)" NT SYNTAXB, ATTRIB,DESCR, { if (fLSBOLD(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_I(RxV,siV*fVECSIZE()); } else {CANCEL;}}) \
EXTINSN(V6_##TAG##_pred_ai,      "if (" #SYNTAXP "4) " SYNTAXA "(Rt32+#s4)" NT SYNTAXB, ATTRIB,DESCR,  { if (fLSBOLD(SYNTAXP##V)) { fEA_RI(RtV,siV*fVECSIZE()); BEH;} else {CANCEL;}}) \
EXTINSN(V6_##TAG##_pred_ppu,     "if (" #SYNTAXP "4) " SYNTAXA "(Rx32++Mu2)" NT SYNTAXB,ATTRIB,DESCR,  { if (fLSBOLD(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_M(RxV,MuV); } else {CANCEL;}})
244
/*
 * Negated-predicate ("if (!<SYNTAXP>4) ...") forms: _npred_pi, _npred_ai
 * and _npred_ppu.  The access runs when fLSBOLDNOT(<SYNTAXP>V) holds;
 * otherwise CANCEL is executed.
 */
#define MMVEC_COND_EACH_EA_FALSE(TAG, DESCR, ATTRIB, NT, SYNTAXA, SYNTAXB, SYNTAXP, BEH) \
    EXTINSN(V6_##TAG##_npred_pi, \
            "if (!" #SYNTAXP "4) " SYNTAXA "(Rx32++#s3)" NT SYNTAXB, \
            ATTRIB, DESCR, \
            { if (fLSBOLDNOT(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_I(RxV,siV*fVECSIZE()); } else {CANCEL;}}) \
    EXTINSN(V6_##TAG##_npred_ai, \
            "if (!" #SYNTAXP "4) " SYNTAXA "(Rt32+#s4)" NT SYNTAXB, \
            ATTRIB, DESCR, \
            { if (fLSBOLDNOT(SYNTAXP##V)) { fEA_RI(RtV,siV*fVECSIZE()); BEH;} else {CANCEL;}}) \
    EXTINSN(V6_##TAG##_npred_ppu, \
            "if (!" #SYNTAXP "4) " SYNTAXA "(Rx32++Mu2)" NT SYNTAXB, \
            ATTRIB, DESCR, \
            { if (fLSBOLDNOT(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_M(RxV,MuV); } else {CANCEL;}})

/* Emit both the predicate-true and the predicate-false instruction sets. */
#define MMVEC_COND_EACH_EA(TAG, DESCR, ATTRIB, NT, SYNTAXA, SYNTAXB, SYNTAXP, BEH) \
    MMVEC_COND_EACH_EA_TRUE(TAG, DESCR, ATTRIB, NT, SYNTAXA, SYNTAXB, SYNTAXP, BEH) \
    MMVEC_COND_EACH_EA_FALSE(TAG, DESCR, ATTRIB, NT, SYNTAXA, SYNTAXB, SYNTAXP, BEH)
253
254
/*
 * Multiply X by the vector size reported by fVECSIZE().  Both the argument
 * and the whole expansion are parenthesised so that a compound argument
 * (e.g. a + b) or a neighbouring operator cannot regroup the product.
 */
#define VEC_SCALE(X) ((X) * fVECSIZE())
256
257
/*
 * Vector load/store shorthands built on MMVEC_EACH_EA.
 *   MMVEC_LD  : "Vd32=vmem"      with fLOADMMV
 *   MMVEC_LDC : "Vd32.cur=vmem"  with fLOADMMV, tag suffix _cur
 *   MMVEC_LDT : "Vd32.tmp=vmem"  with fLOADMMV, tag suffix _tmp
 *   MMVEC_LDU : "Vd32=vmemu"     with fLOADMMVU
 */
#define MMVEC_LD(TAG, DESCR, ATTRIB, NT) \
    MMVEC_EACH_EA(TAG, DESCR, ATTRIB, NT, "Vd32=vmem", "", fLOADMMV(EA,VdV))
#define MMVEC_LDC(TAG, DESCR, ATTRIB, NT) \
    MMVEC_EACH_EA(TAG##_cur, DESCR, ATTRIB, NT, "Vd32.cur=vmem", "", fLOADMMV(EA,VdV))
#define MMVEC_LDT(TAG, DESCR, ATTRIB, NT) \
    MMVEC_EACH_EA(TAG##_tmp, DESCR, ATTRIB, NT, "Vd32.tmp=vmem", "", fLOADMMV(EA,VdV))
#define MMVEC_LDU(TAG, DESCR, ATTRIB, NT) \
    MMVEC_EACH_EA(TAG, DESCR, ATTRIB, NT, "Vd32=vmemu", "", fLOADMMVU(EA,VdV))

/*
 * Qv-predicated stores: the _qpred forms use fSTOREMMVQ, the _nqpred
 * forms use fSTOREMMVNQ; both receive EA, VsV and QvV.
 */
#define MMVEC_STQ(TAG, DESCR, ATTRIB, NT) \
    MMVEC_EACH_EA(TAG##_qpred, DESCR, ATTRIB, NT, \
                  "if (Qv4) vmem", "=Vs32", fSTOREMMVQ(EA,VsV,QvV)) \
    MMVEC_EACH_EA(TAG##_nqpred, DESCR, ATTRIB, NT, \
                  "if (!Qv4) vmem", "=Vs32", fSTOREMMVNQ(EA,VsV,QvV))
267
268/****************************************************************
269* MAPPING FOR VMEMs
270****************************************************************/
271
/* Common attribute lists for the vmem (aligned) and vmemu (unaligned) entries. */
#define ATTR_VMEM   A_EXTENSION,A_CVI,A_CVI_VM
#define ATTR_VMEMU  A_EXTENSION,A_CVI,A_CVI_VM,A_CVI_VP
274
275
276MMVEC_LD(vL32b,  "Aligned Vector Load",        ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_VA),)
277MMVEC_LDC(vL32b,  "Aligned Vector Load Cur",	ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_NEW,A_CVI_VA),)
278MMVEC_LDT(vL32b,  "Aligned Vector Load Tmp",	ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_TMP),)
279
280MMVEC_COND_EACH_EA(vL32b,"Conditional Aligned Vector Load",ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_VA),,"Vd32=vmem",,Pv,fLOADMMV(EA,VdV);)
281MMVEC_COND_EACH_EA(vL32b_cur,"Conditional Aligned Vector Load Cur",ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_VA,A_CVI_NEW),,"Vd32.cur=vmem",,Pv,fLOADMMV(EA,VdV);)
282MMVEC_COND_EACH_EA(vL32b_tmp,"Conditional Aligned Vector Load Tmp",ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_TMP),,"Vd32.tmp=vmem",,Pv,fLOADMMV(EA,VdV);)
283
284MMVEC_EACH_EA(vS32b,"Aligned Vector Store",ATTRIBS(ATTR_VMEM,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),,"vmem","=Vs32",fSTOREMMV(EA,VsV))
285MMVEC_COND_EACH_EA(vS32b,"Aligned Vector Store",ATTRIBS(ATTR_VMEM,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),,"vmem","=Vs32",Pv,fSTOREMMV(EA,VsV))
286
287
288MMVEC_STQ(vS32b,  "Aligned Vector Store",      ATTRIBS(ATTR_VMEM,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),)
289
290MMVEC_LDU(vL32Ub, "Unaligned Vector Load",     ATTRIBS(ATTR_VMEMU,A_LOAD,A_RESTRICT_NOSLOT1),)
291
292MMVEC_EACH_EA(vS32Ub,"Unaligned Vector Store",ATTRIBS(ATTR_VMEMU,A_STORE,A_RESTRICT_NOSLOT1),,"vmemu","=Vs32",fSTOREMMVU(EA,VsV))
293
294MMVEC_COND_EACH_EA(vS32Ub,"Unaligned Vector Store",ATTRIBS(ATTR_VMEMU,A_STORE,A_RESTRICT_NOSLOT1),,"vmemu","=Vs32",Pv,fSTOREMMVU(EA,VsV))
295
296MMVEC_EACH_EA(vS32b_new,"Aligned Vector Store New",ATTRIBS(ATTR_VMEM,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),,"vmem","=Os8.new",fSTOREMMV(EA,fNEWVREG(OsN)))
297
298// V65 store release, zero byte store
299MMVEC_EACH_EA(vS32b_srls,"Aligned Vector Scatter Release",ATTRIBS(ATTR_VMEM,A_STORE,A_CVI_SCATTER_RELEASE,A_CVI_NEW,A_RESTRICT_SLOT0ONLY),,"vmem",":scatter_release",fSTORERELEASE(EA,0))
300
301
302
303MMVEC_COND_EACH_EA(vS32b_new,"Aligned Vector Store New",ATTRIBS(ATTR_VMEM,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),,"vmem","=Os8.new",Pv,fSTOREMMV(EA,fNEWVREG(OsN)))
304
305
306/******************************************************************************************
307*
308* MMVECTOR MEMORY OPERATIONS - NON TEMPORAL
309*
310*******************************************************************************************/
311
312#define ATTR_VMEM_NT A_EXTENSION,A_CVI,A_CVI_VM
313
314MMVEC_EACH_EA(vS32b_nt,"Aligned Vector Store - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),":nt","vmem","=Vs32",fSTOREMMV(EA,VsV))
315MMVEC_COND_EACH_EA(vS32b_nt,"Aligned Vector Store - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),":nt","vmem","=Vs32",Pv,fSTOREMMV(EA,VsV))
316
317MMVEC_EACH_EA(vS32b_nt_new,"Aligned Vector Store New - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),":nt","vmem","=Os8.new",fSTOREMMV(EA,fNEWVREG(OsN)))
318MMVEC_COND_EACH_EA(vS32b_nt_new,"Aligned Vector Store New - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),":nt","vmem","=Os8.new",Pv,fSTOREMMV(EA,fNEWVREG(OsN)))
319
320
321MMVEC_STQ(vS32b_nt,  "Aligned Vector Store - Non temporal",      ATTRIBS(ATTR_VMEM_NT,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),":nt")
322
323MMVEC_LD(vL32b_nt,  "Aligned Vector Load - Non temporal",       ATTRIBS(ATTR_VMEM_NT,A_LOAD,A_CVI_VA),":nt")
324MMVEC_LDC(vL32b_nt,  "Aligned Vector Load Cur - Non temporal",	ATTRIBS(ATTR_VMEM_NT,A_LOAD,A_CVI_NEW,A_CVI_VA),":nt")
325MMVEC_LDT(vL32b_nt,  "Aligned Vector Load Tmp - Non temporal",	ATTRIBS(ATTR_VMEM_NT,A_LOAD,A_CVI_TMP),":nt")
326
327MMVEC_COND_EACH_EA(vL32b_nt,"Conditional Aligned Vector Load",ATTRIBS(ATTR_VMEM_NT,A_CVI_VA),,"Vd32=vmem",":nt",Pv,fLOADMMV(EA,VdV);)
328MMVEC_COND_EACH_EA(vL32b_nt_cur,"Conditional Aligned Vector Load Cur",ATTRIBS(ATTR_VMEM_NT,A_CVI_VA,A_CVI_NEW),,"Vd32.cur=vmem",":nt",Pv,fLOADMMV(EA,VdV);)
329MMVEC_COND_EACH_EA(vL32b_nt_tmp,"Conditional Aligned Vector Load Tmp",ATTRIBS(ATTR_VMEM_NT,A_CVI_TMP),,"Vd32.tmp=vmem",":nt",Pv,fLOADMMV(EA,VdV);)
330
331
332#undef VEC_SCALE
333
334
335/***************************************************
336 * Vector Alignment
337 ************************************************/
338
/*
 * Byte-wise alignment of the Vu:Vv pair into Vd.  For every byte index i
 * below fVBYTES(): when i + SHIFT reaches fVBYTES() the byte comes from
 * VuV at (i + SHIFT - fVBYTES()), otherwise from VvV at (i + SHIFT).
 *
 * SHIFT is parenthesised at every use so that an expression argument
 * (for example `x & mask`) keeps its own grouping instead of being
 * re-associated with the surrounding `+` and `>=` operators.
 * Declares `i` through fHIDE, so it can be expanded once per scope.
 */
#define VALIGNB(SHIFT)  \
    fHIDE(int i;) \
    for(i = 0; i < fVBYTES(); i++) {\
        VdV.ub[i] = (i+(SHIFT)>=fVBYTES()) ? VuV.ub[i+(SHIFT)-fVBYTES()] : VvV.ub[i+(SHIFT)];\
    }
344
345EXTINSN(V6_valignb,  "Vd32=valign(Vu32,Vv32,Rt8)",  ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by Rt8 as control",
346{
347	unsigned shift = RtV & (fVBYTES()-1);
348	VALIGNB(shift)
349})
350EXTINSN(V6_vlalignb, "Vd32=vlalign(Vu32,Vv32,Rt8)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by Rt8 as control",
351{
352	unsigned shift = fVBYTES() - (RtV & (fVBYTES()-1));
353	VALIGNB(shift)
354})
355EXTINSN(V6_valignbi, "Vd32=valign(Vu32,Vv32,#u3)", 	ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by #u3 as control",
356{
357	VALIGNB(uiV)
358})
359EXTINSN(V6_vlalignbi,"Vd32=vlalign(Vu32,Vv32,#u3)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by #u3 as control",
360{
361	unsigned shift = fVBYTES() - uiV;
362	VALIGNB(shift)
363})
364
365EXTINSN(V6_vror, "Vd32=vror(Vu32,Rt32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),
366"Align Two vectors by Rt32 as control",
367{
368	fHIDE(int k;)
369	for (k=0;k<fVBYTES();k++) {
370		VdV.ub[k] = VuV.ub[(k+RtV)&(fVBYTES()-1)];
371	}
372	})
373
374
375
376
377
378
379
380/**************************************************************
381* Unpack elements with zero/sign extend and cross lane permute
382***************************************************************/
383
384ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(8,vunpackub,  "Vdd32=vunpackub(Vu32)", "Vdd32.uh=vunpack(Vu32.ub)", "Unpack byte with zero-extend",     fVARRAY_ELEMENT_ACCESS(VddV, uh, i)  = fZE8_16( VuV.ub[i]))
385ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(8,vunpackb,   "Vdd32=vunpackb(Vu32)",  "Vdd32.h=vunpack(Vu32.b)",   "Unpack bytes with sign-extend",    fVARRAY_ELEMENT_ACCESS(VddV, h,  i)  = fSE8_16( VuV.b[i] ))
386ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(16,vunpackuh, "Vdd32=vunpackuh(Vu32)", "Vdd32.uw=vunpack(Vu32.uh)", "Unpack halves with zero-extend",   fVARRAY_ELEMENT_ACCESS(VddV, uw, i)  = fZE16_32(VuV.uh[i]))
387ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(16,vunpackh,  "Vdd32=vunpackh(Vu32)",  "Vdd32.w=vunpack(Vu32.h)",   "Unpack halves with sign-extend",   fVARRAY_ELEMENT_ACCESS(VddV, w,  i)  = fSE16_32(VuV.h[i] ))
388
389ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(8, vunpackob, "Vxx32|=vunpackob(Vu32)", "Vxx32.h|=vunpacko(Vu32.b)", "Unpack byte to odd bytes ",       fVARRAY_ELEMENT_ACCESS(VxxV, uh, i) |= fZE8_16( VuV.ub[i])<<8)
390ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(16,vunpackoh, "Vxx32|=vunpackoh(Vu32)", "Vxx32.w|=vunpacko(Vu32.h)", "Unpack halves to odd halves",     fVARRAY_ELEMENT_ACCESS(VxxV, uw, i) |= fZE16_32(VuV.uh[i])<<16)
391
392
393/**************************************************************
394* Pack elements and cross lane permute
395***************************************************************/
396
397 ITERATOR_INSN2_PERMUTE_SLOT(16, vpackeb,  "Vd32=vpackeb(Vu32,Vv32)", "Vd32.b=vpacke(Vu32.h,Vv32.h)",
398 "Pack  bytes",
399    VdV.ub[i]               = fGETUBYTE(0, VvV.uh[i]);
400    VdV.ub[i+fVELEM(16)]    = fGETUBYTE(0, VuV.uh[i]))
401
402 ITERATOR_INSN2_PERMUTE_SLOT(32, vpackeh,  "Vd32=vpackeh(Vu32,Vv32)", "Vd32.h=vpacke(Vu32.w,Vv32.w)",
403 "Pack  halfwords",
404    VdV.uh[i]               = fGETUHALF(0, VvV.uw[i]);
405    VdV.uh[i+fVELEM(32)]    = fGETUHALF(0, VuV.uw[i]))
406
407  ITERATOR_INSN2_PERMUTE_SLOT(16, vpackob,  "Vd32=vpackob(Vu32,Vv32)", "Vd32.b=vpacko(Vu32.h,Vv32.h)",
408 "Pack  bytes",
409    VdV.ub[i]               = fGETUBYTE(1, VvV.uh[i]);
410    VdV.ub[i+fVELEM(16)]    = fGETUBYTE(1, VuV.uh[i]))
411
412 ITERATOR_INSN2_PERMUTE_SLOT(32, vpackoh,  "Vd32=vpackoh(Vu32,Vv32)", "Vd32.h=vpacko(Vu32.w,Vv32.w)",
413 "Pack  halfwords",
414    VdV.uh[i]               = fGETUHALF(1, VvV.uw[i]);
415    VdV.uh[i+fVELEM(32)]    = fGETUHALF(1, VuV.uw[i]))
416
417
418
419ITERATOR_INSN2_PERMUTE_SLOT(16, vpackhub_sat,  "Vd32=vpackhub(Vu32,Vv32):sat", "Vd32.ub=vpack(Vu32.h,Vv32.h):sat",
420 "Pack ubytes with saturation",
421    VdV.ub[i]               = fVSATUB(VvV.h[i]);
422    VdV.ub[i+fVELEM(16)]    = fVSATUB(VuV.h[i]))
423
424
425ITERATOR_INSN2_PERMUTE_SLOT(16, vpackhb_sat,  "Vd32=vpackhb(Vu32,Vv32):sat", "Vd32.b=vpack(Vu32.h,Vv32.h):sat",
426 "Pack bytes with saturation",
427    VdV.b[i]               = fVSATB(VvV.h[i]);
428    VdV.b[i+fVELEM(16)]    = fVSATB(VuV.h[i]))
429
430
431ITERATOR_INSN2_PERMUTE_SLOT(32, vpackwuh_sat,  "Vd32=vpackwuh(Vu32,Vv32):sat", "Vd32.uh=vpack(Vu32.w,Vv32.w):sat",
432 "Pack ubytes with saturation",
433    VdV.uh[i]               = fVSATUH(VvV.w[i]);
434    VdV.uh[i+fVELEM(32)]    = fVSATUH(VuV.w[i]))
435
436ITERATOR_INSN2_PERMUTE_SLOT(32, vpackwh_sat,  "Vd32=vpackwh(Vu32,Vv32):sat", "Vd32.h=vpack(Vu32.w,Vv32.w):sat",
437 "Pack bytes with saturation",
438    VdV.h[i]               = fVSATH(VvV.w[i]);
439    VdV.h[i+fVELEM(32)]    = fVSATH(VuV.w[i]))
440
441
442
443
444
445/**************************************************************
446* Zero/Sign Extend with in-lane permute
447***************************************************************/
448
449ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(16,vzb,"Vdd32=vzxtb(Vu32)","Vdd32.uh=vzxt(Vu32.ub)",
450"Vector Zero Extend Bytes",
451    VddV.v[0].uh[i] = fZE8_16(fGETUBYTE(0, VuV.uh[i]));
452    VddV.v[1].uh[i] = fZE8_16(fGETUBYTE(1, VuV.uh[i])))
453
454ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(16,vsb,"Vdd32=vsxtb(Vu32)","Vdd32.h=vsxt(Vu32.b)",
455"Vector Sign Extend Bytes",
456    VddV.v[0].h[i] = fSE8_16(fGETBYTE(0, VuV.h[i]));
457    VddV.v[1].h[i] = fSE8_16(fGETBYTE(1, VuV.h[i])))
458
459ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(32,vzh,"Vdd32=vzxth(Vu32)","Vdd32.uw=vzxt(Vu32.uh)",
460"Vector Zero Extend halfwords",
461    VddV.v[0].uw[i] = fZE16_32(fGETUHALF(0, VuV.uw[i]));
462    VddV.v[1].uw[i] = fZE16_32(fGETUHALF(1, VuV.uw[i])))
463
464ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(32,vsh,"Vdd32=vsxth(Vu32)","Vdd32.w=vsxt(Vu32.h)",
465"Vector Sign Extend halfwords",
466    VddV.v[0].w[i] = fSE16_32(fGETHALF(0, VuV.w[i]));
467    VddV.v[1].w[i] = fSE16_32(fGETHALF(1, VuV.w[i])))
468
469
470/**********************************************************************
471*
472*
473*
474*               MMVECTOR REDUCTION
475*
476*
477*
478**********************************************************************/
479
480/********************************************
481*  2-WAY REDUCTION - UNSIGNED BYTE BY BYTE
482********************************************/
483
484
485ITERATOR_INSN2_MPY_SLOT(16,vdmpybus,"Vd32=vdmpybus(Vu32,Rt32)","Vd32.h=vdmpy(Vu32.ub,Rt32.b)",
486"Vector Dual Multiply-Accumulates unsigned bytes by bytes",
487    VdV.h[i]   = fMPY8US( fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i) % 4, RtV));
488    VdV.h[i]  += fMPY8US( fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV)))
489
490ITERATOR_INSN2_MPY_SLOT(16,vdmpybus_acc,"Vx32+=vdmpybus(Vu32,Rt32)","Vx32.h+=vdmpy(Vu32.ub,Rt32.b)",
491"Vector Dual Multiply-Accumulates unsigned bytes by  bytes, and accumulate",
492    VxV.h[i] += fMPY8US( fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i) % 4, RtV));
493    VxV.h[i] += fMPY8US( fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV)))
494
495
496
497ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vdmpybus_dv,"Vdd32=vdmpybus(Vuu32,Rt32)","Vdd32.h=vdmpy(Vuu32.ub,Rt32.b)",
498"Vector Dual Multiply-Accumulates unsigned bytes by  bytes, and accumulate Sliding Window Reduction",
499    VddV.v[0].h[i]  = fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV));
500    VddV.v[0].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i+1)%4, RtV));
501
502    VddV.v[1].h[i]  = fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV));
503    VddV.v[1].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[1].uh[i]),fGETBYTE((2*i+1)%4, RtV)))
504
505ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vdmpybus_dv_acc,"Vxx32+=vdmpybus(Vuu32,Rt32)","Vxx32.h+=vdmpy(Vuu32.ub,Rt32.b)",
506"Vector Dual Multiply-Accumulates unsigned bytes by  bytes, and accumulate Sliding Window Reduction",
507    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV));
508    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i+1)%4, RtV));
509
510    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV));
511    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[1].uh[i]),fGETBYTE((2*i+1)%4, RtV)))
512
513
514
515/********************************************
516*  2-WAY REDUCTION - HALF BY BYTE
517********************************************/
518ITERATOR_INSN2_MPY_SLOT(32,vdmpyhb,"Vd32=vdmpyhb(Vu32,Rt32)","Vd32.w=vdmpy(Vu32.h,Rt32.b)",
519"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap",
520    VdV.w[i]  = fMPY16SS(fGETHALF(0, VuV.w[i]),fGETBYTE((2*i+0)%4, RtV));
521    VdV.w[i] += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETBYTE((2*i+1)%4, RtV)))
522
523ITERATOR_INSN2_MPY_SLOT(32,vdmpyhb_acc,"Vx32+=vdmpyhb(Vu32,Rt32)","Vx32.w+=vdmpy(Vu32.h,Rt32.b)",
524"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap",
525    VxV.w[i] += fMPY16SS(fGETHALF(0, VuV.w[i]),fGETBYTE((2*i+0)%4, RtV));
526    VxV.w[i] += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETBYTE((2*i+1)%4, RtV)))
527
528
529
530ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhb_dv,"Vdd32=vdmpyhb(Vuu32,Rt32)","Vdd32.w=vdmpy(Vuu32.h,Rt32.b)",
531"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap",
532    VddV.v[0].w[i]  = fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV));
533    VddV.v[0].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+1)%4, RtV));
534
535    VddV.v[1].w[i]  = fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV));
536    VddV.v[1].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]),fGETBYTE((2*i+1)%4, RtV)))
537
538
539ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhb_dv_acc,"Vxx32+=vdmpyhb(Vuu32,Rt32)","Vxx32.w+=vdmpy(Vuu32.h,Rt32.b)",
540"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap",
541    VxxV.v[0].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV));
542    VxxV.v[0].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+1)%4, RtV));
543
544    VxxV.v[1].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV));
545    VxxV.v[1].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]),fGETBYTE((2*i+1)%4, RtV)))
546
547
548
549
550
551/********************************************
552*  2-WAY REDUCTION - HALF BY HALF
553********************************************/
554
555ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhvsat,"Vd32=vdmpyh(Vu32,Vv32):sat","Vd32.w=vdmpy(Vu32.h,Vv32.h):sat",
556"Vector halfword multiply, accumulate pairs, sat to word",
557    fHIDE(size8s_t accum;)
558    accum    = fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0, VvV.w[i]));
559    accum   += fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1, VvV.w[i]));
560    VdV.w[i] = fVSATW(accum))
561
562ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhvsat_acc,"Vx32+=vdmpyh(Vu32,Vv32):sat","Vx32.w+=vdmpy(Vu32.h,Vv32.h):sat",
563"Vector halfword multiply, accumulate pairs, sat to word",
564    fHIDE(size8s_t accum;)
565    accum    = fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0, VvV.w[i]));
566    accum   += fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1, VvV.w[i]));
567    VxV.w[i] = fVSATW(VxV.w[i]+accum))
568
569
570/* VDMPYH */
571
572ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsat,"Vd32=vdmpyh(Vu32,Rt32):sat","Vd32.w=vdmpy(Vu32.h,Rt32.h):sat",
573"Vector halfword multiply, accumulate pairs, saturate to word",
574    fHIDE(size8s_t accum;)
575    accum    = fMPY16SS(fGETHALF(0, VuV.w[i]),fGETHALF(0, RtV));
576    accum   += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETHALF(1, RtV));
577    VdV.w[i] = fVSATW(accum))
578
579ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsat_acc,"Vx32+=vdmpyh(Vu32,Rt32):sat","Vx32.w+=vdmpy(Vu32.h,Rt32.h):sat",
580"Vector halfword multiply, accumulate pairs, saturate to word",
581    fHIDE(size8s_t) accum = VxV.w[i];
582    accum   += fMPY16SS(fGETHALF(0, VuV.w[i]),fGETHALF(0, RtV));
583    accum   += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETHALF(1, RtV));
584    VxV.w[i] = fVSATW(accum))
585
586
587
588
589ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhisat,"Vd32=vdmpyh(Vuu32,Rt32):sat","Vd32.w=vdmpy(Vuu32.h,Rt32.h):sat",
590"Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with saturation",
591    fHIDE(size8s_t accum;)
592    accum    = fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]),fGETHALF(0,RtV));
593    accum   += fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]),fGETHALF(1,RtV));
594    VdV.w[i] = fVSATW(accum))
595
596ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhisat_acc,"Vx32+=vdmpyh(Vuu32,Rt32):sat","Vx32.w+=vdmpy(Vuu32.h,Rt32.h):sat",
597"Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with accumulation and saturation",
598    fHIDE(size8s_t) accum = VxV.w[i];
599    accum   += fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]),fGETHALF(0,RtV));
600    accum   += fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]),fGETHALF(1,RtV));
601    VxV.w[i] = fVSATW(accum))
602
603
604
605
606
607
608
609/* VDMPYHSU */
610ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsusat,"Vd32=vdmpyhsu(Vu32,Rt32):sat","Vd32.w=vdmpy(Vu32.h,Rt32.uh):sat",
611"Vector halfword multiply, accumulate pairs, saturate to word",
612    fHIDE(size8s_t accum;)
613    accum    = fMPY16SU(fGETHALF(0, VuV.w[i]),fGETUHALF(0, RtV));
614    accum   += fMPY16SU(fGETHALF(1, VuV.w[i]),fGETUHALF(1, RtV));
615    VdV.w[i] = fVSATW(accum))
616
617ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsusat_acc,"Vx32+=vdmpyhsu(Vu32,Rt32):sat","Vx32.w+=vdmpy(Vu32.h,Rt32.uh):sat",
618"Vector halfword multiply, accumulate pairs, saturate to word",
619    fHIDE(size8s_t) accum=VxV.w[i];
620    accum   += fMPY16SU(fGETHALF(0, VuV.w[i]),fGETUHALF(0, RtV));
621    accum   += fMPY16SU(fGETHALF(1, VuV.w[i]),fGETUHALF(1, RtV));
622    VxV.w[i] = fVSATW(accum))
623
624
625
/*
 * vdmpyhsuisat / vdmpyhsuisat_acc
 * For each 32-bit element i, sums two fMPY16SU products in a size8s_t
 * accumulator:
 *   halfword 1 of Vuu.v[0].w[i] times unsigned halfword 0 of Rt, and
 *   halfword 0 of Vuu.v[1].w[i] times unsigned halfword 1 of Rt.
 * fVSATW(accum) is stored to the destination word; the _acc form starts the
 * accumulator from the current Vx.w[i].
 * NOTE(review): the description strings say "Signed Halfword by Signed
 * Halfword ... to Halfword", but the code multiplies by unsigned halfwords of
 * Rt (fMPY16SU / fGETUHALF) and writes a .w element.
 */
626ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsuisat,"Vd32=vdmpyhsu(Vuu32,Rt32,#1):sat","Vd32.w=vdmpy(Vuu32.h,Rt32.uh,#1):sat",
627"Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with saturation",
628    fHIDE(size8s_t accum;)
629    accum    = fMPY16SU(fGETHALF(1,VuuV.v[0].w[i]),fGETUHALF(0,RtV));
630    accum   += fMPY16SU(fGETHALF(0,VuuV.v[1].w[i]),fGETUHALF(1,RtV));
631    VdV.w[i] = fVSATW(accum))
632
633ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsuisat_acc,"Vx32+=vdmpyhsu(Vuu32,Rt32,#1):sat","Vx32.w+=vdmpy(Vuu32.h,Rt32.uh,#1):sat",
634"Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with accumulation and saturation",
635    fHIDE(size8s_t) accum=VxV.w[i];
636    accum   += fMPY16SU(fGETHALF(1, VuuV.v[0].w[i]),fGETUHALF(0,RtV));
637    accum   += fMPY16SU(fGETHALF(0, VuuV.v[1].w[i]),fGETUHALF(1,RtV));
638    VxV.w[i] = fVSATW(accum))
639
640
641
642/********************************************
643*  3-WAY REDUCTION - BYTE/HALFWORD BY SIGNED BYTE
644********************************************/
645
/*
 * vtmpyb / vtmpyb_acc
 * For each 16-bit element i, with c0 = byte (2*i)%4 of Rt and
 * c1 = byte (2*i+1)%4 of Rt (fGETBYTE):
 *   v[0].h[i] = b0(Vuu.v[0].h[i])*c0 + b1(Vuu.v[0].h[i])*c1 + b0(Vuu.v[1].h[i])
 *   v[1].h[i] = b1(Vuu.v[0].h[i])*c0 + b0(Vuu.v[1].h[i])*c1 + b1(Vuu.v[1].h[i])
 * where bN is fGETBYTE(N, ...) and products use fMPY8SS.  The third term is
 * added without a multiply.  The _acc form adds all three terms onto the
 * existing Vxx element instead of overwriting it.
 */
646 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpyb, "Vdd32=vtmpyb(Vuu32,Rt32)", "Vdd32.h=vtmpy(Vuu32.b,Rt32.b)",
647"Dual Vector 3x1 Reduction",
648    VddV.v[0].h[i]  = fMPY8SS(fGETBYTE(0,VuuV.v[0].h[i]), fGETBYTE((2*i  )%4, RtV));
649    VddV.v[0].h[i] += fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i+1)%4, RtV));
650    VddV.v[0].h[i] += fGETBYTE(0,VuuV.v[1].h[i]);
651
652    VddV.v[1].h[i]  = fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i  )%4, RtV));
653    VddV.v[1].h[i] += fMPY8SS(fGETBYTE(0,VuuV.v[1].h[i]), fGETBYTE((2*i+1)%4, RtV));
654    VddV.v[1].h[i] += fGETBYTE(1,VuuV.v[1].h[i]))
655
656
657ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpyb_acc, "Vxx32+=vtmpyb(Vuu32,Rt32)", "Vxx32.h+=vtmpy(Vuu32.b,Rt32.b)",
658"Dual Vector 3x1 Reduction",
659    VxxV.v[0].h[i] += fMPY8SS(fGETBYTE(0,VuuV.v[0].h[i]), fGETBYTE((2*i  )%4, RtV));
660    VxxV.v[0].h[i] += fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i+1)%4, RtV));
661    VxxV.v[0].h[i] += fGETBYTE(0,VuuV.v[1].h[i]);
662
663    VxxV.v[1].h[i] += fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i  )%4, RtV));
664    VxxV.v[1].h[i] += fMPY8SS(fGETBYTE(0,VuuV.v[1].h[i]), fGETBYTE((2*i+1)%4, RtV));
665    VxxV.v[1].h[i] += fGETBYTE(1,VuuV.v[1].h[i]))
666
667
668
/*
 * vtmpybus / vtmpybus_acc
 * Same three-term pattern as vtmpyb, but the vector bytes are read with
 * fGETUBYTE from .uh elements and multiplied with fMPY8US against
 * fGETBYTE((2*i)%4 / (2*i+1)%4, Rt).  For each 16-bit element i:
 *   v[0].h[i] = ub0(Vuu.v[0])*c0 + ub1(Vuu.v[0])*c1 + ub0(Vuu.v[1])
 *   v[1].h[i] = ub1(Vuu.v[0])*c0 + ub0(Vuu.v[1])*c1 + ub1(Vuu.v[1])
 * The _acc form accumulates onto Vxx instead of overwriting.
 */
669ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpybus, "Vdd32=vtmpybus(Vuu32,Rt32)", "Vdd32.h=vtmpy(Vuu32.ub,Rt32.b)",
670"Dual Vector 3x1 Reduction",
671    VddV.v[0].h[i]  = fMPY8US(fGETUBYTE(0,VuuV.v[0].uh[i]), fGETBYTE((2*i  )%4, RtV));
672    VddV.v[0].h[i] += fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i+1)%4, RtV));
673    VddV.v[0].h[i] += fGETUBYTE(0,VuuV.v[1].uh[i]);
674
675    VddV.v[1].h[i]  = fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i  )%4, RtV));
676    VddV.v[1].h[i] += fMPY8US(fGETUBYTE(0,VuuV.v[1].uh[i]), fGETBYTE((2*i+1)%4, RtV));
677    VddV.v[1].h[i] += fGETUBYTE(1,VuuV.v[1].uh[i]))
678
679ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpybus_acc, "Vxx32+=vtmpybus(Vuu32,Rt32)", "Vxx32.h+=vtmpy(Vuu32.ub,Rt32.b)",
680"Dual Vector 3x1 Reduction",
681    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0,VuuV.v[0].uh[i]), fGETBYTE((2*i  )%4, RtV));
682    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i+1)%4, RtV));
683    VxxV.v[0].h[i] += fGETUBYTE(0,VuuV.v[1].uh[i]);
684
685    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i  )%4, RtV));
686    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(0,VuuV.v[1].uh[i]), fGETBYTE((2*i+1)%4, RtV));
687    VxxV.v[1].h[i] += fGETUBYTE(1,VuuV.v[1].uh[i]))
688
689
/*
 * vtmpyhb / vtmpyhb_acc
 * Halfword-input version of the three-term reduction, iterating over 32-bit
 * elements.  With c0 = fSE8_16(byte (2*i+0)%4 of Rt) and
 * c1 = fSE8_16(byte (2*i+1)%4 of Rt), products via fMPY16SS:
 *   v[0].w[i] = h0(Vuu.v[0].w[i])*c0 + h1(Vuu.v[0].w[i])*c1 + h0(Vuu.v[1].w[i])
 *   v[1].w[i] = h1(Vuu.v[0].w[i])*c0 + h0(Vuu.v[1].w[i])*c1 + h1(Vuu.v[1].w[i])
 * where hN is fGETHALF(N, ...).  The _acc form accumulates onto Vxx.
 */
690ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vtmpyhb, "Vdd32=vtmpyhb(Vuu32,Rt32)", "Vdd32.w=vtmpy(Vuu32.h,Rt32.b)",
691"Dual Vector 3x1 Reduction",
692    VddV.v[0].w[i] = fMPY16SS(fGETHALF(0,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV)));
693    VddV.v[0].w[i]+= fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV)));
694    VddV.v[0].w[i]+= fGETHALF(0,VuuV.v[1].w[i]);
695
696    VddV.v[1].w[i] = fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV)));
697    VddV.v[1].w[i]+= fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV)));
698    VddV.v[1].w[i]+= fGETHALF(1,VuuV.v[1].w[i]))
699
700ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vtmpyhb_acc, "Vxx32+=vtmpyhb(Vuu32,Rt32)", "Vxx32.w+=vtmpy(Vuu32.h,Rt32.b)",
701"Dual Vector 3x1 Reduction",
702    VxxV.v[0].w[i]+= fMPY16SS(fGETHALF(0,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV)));
703    VxxV.v[0].w[i]+= fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV)));
704    VxxV.v[0].w[i]+= fGETHALF(0,VuuV.v[1].w[i]);
705
706    VxxV.v[1].w[i]+= fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV)));
707    VxxV.v[1].w[i]+= fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV)));
708    VxxV.v[1].w[i]+= fGETHALF(1,VuuV.v[1].w[i]))
709
710
711/********************************************
712*  4-WAY REDUCTION - UNSIGNED BYTE BY UNSIGNED BYTE
713********************************************/
714
715
716
/*
 * vrmpyub / vrmpyub_acc
 * For each 32-bit element i, sums fMPY8UU(unsigned byte k of Vu.uw[i],
 * unsigned byte k of Rt) for k = 0..3.  vrmpyub overwrites Vd.uw[i] with the
 * sum; vrmpyub_acc adds the four products onto Vx.uw[i].  No saturation
 * macro is applied.
 */
717ITERATOR_INSN2_MPY_SLOT(32,vrmpyub,"Vd32=vrmpyub(Vu32,Rt32)","Vd32.uw=vrmpy(Vu32.ub,Rt32.ub)",
718"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
719    VdV.uw[i]  = fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,RtV));
720    VdV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,RtV));
721    VdV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,RtV));
722    VdV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,RtV)))
723
724ITERATOR_INSN2_MPY_SLOT(32,vrmpyub_acc,"Vx32+=vrmpyub(Vu32,Rt32)","Vx32.uw+=vrmpy(Vu32.ub,Rt32.ub)",
725"Vector Multiply-Accumulate Reduce with 4 byte coefficients Accumulate",
726    VxV.uw[i] += fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,RtV));
727    VxV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,RtV));
728    VxV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,RtV));
729    VxV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,RtV)))
730
731
/*
 * vrmpyubv / vrmpyubv_acc
 * Vector-by-vector form of vrmpyub: for each 32-bit element i, sums
 * fMPY8UU(unsigned byte k of Vu.uw[i], unsigned byte k of Vv.uw[i]) for
 * k = 0..3.  The plain form overwrites Vd.uw[i]; the _acc form adds onto
 * Vx.uw[i].
 * NOTE(review): only the _acc form uses the *_DOUBLE_VEC iterator macro
 * (defined outside this section) -- confirm that is intentional.
 */
732ITERATOR_INSN2_MPY_SLOT(32,vrmpyubv,"Vd32=vrmpyub(Vu32,Vv32)","Vd32.uw=vrmpy(Vu32.ub,Vv32.ub)",
733"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
734    VdV.uw[i]  = fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,VvV.uw[i]));
735    VdV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,VvV.uw[i]));
736    VdV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,VvV.uw[i]));
737    VdV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,VvV.uw[i])))
738
739ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpyubv_acc,"Vx32+=vrmpyub(Vu32,Vv32)","Vx32.uw+=vrmpy(Vu32.ub,Vv32.ub)",
740"Vector Multiply-Accumulate Reduce with 4 byte coefficients Accumulate",
741    VxV.uw[i] += fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,VvV.uw[i]));
742    VxV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,VvV.uw[i]));
743    VxV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,VvV.uw[i]));
744    VxV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,VvV.uw[i])))
745
/*
 * vrmpybv / vrmpybv_acc
 * For each 32-bit element i, sums fMPY8SS(byte k of Vu.w[i], byte k of
 * Vv.w[i]) for k = 0..3, bytes read with fGETBYTE.  The plain form
 * overwrites Vd.w[i]; the _acc form adds the products onto Vx.w[i].
 */
746ITERATOR_INSN2_MPY_SLOT(32,vrmpybv,"Vd32=vrmpyb(Vu32,Vv32)","Vd32.w=vrmpy(Vu32.b,Vv32.b)",
747"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
748    VdV.w[i]  = fMPY8SS(fGETBYTE(0, VuV.w[i]), fGETBYTE(0, VvV.w[i]));
749    VdV.w[i] += fMPY8SS(fGETBYTE(1, VuV.w[i]), fGETBYTE(1, VvV.w[i]));
750    VdV.w[i] += fMPY8SS(fGETBYTE(2, VuV.w[i]), fGETBYTE(2, VvV.w[i]));
751    VdV.w[i] += fMPY8SS(fGETBYTE(3, VuV.w[i]), fGETBYTE(3, VvV.w[i])))
752
753
754ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybv_acc,"Vx32+=vrmpyb(Vu32,Vv32)","Vx32.w+=vrmpy(Vu32.b,Vv32.b)",
755"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
756    VxV.w[i] += fMPY8SS(fGETBYTE(0, VuV.w[i]), fGETBYTE(0, VvV.w[i]));
757    VxV.w[i] += fMPY8SS(fGETBYTE(1, VuV.w[i]), fGETBYTE(1, VvV.w[i]));
758    VxV.w[i] += fMPY8SS(fGETBYTE(2, VuV.w[i]), fGETBYTE(2, VvV.w[i]));
759    VxV.w[i] += fMPY8SS(fGETBYTE(3, VuV.w[i]), fGETBYTE(3, VvV.w[i])))
760
761
/*
 * vrmpyubi / vrmpyubi_acc
 * Double-vector 4-term reduction with a 1-bit immediate uiV.  For each
 * 32-bit element i, products use fMPY8UU on unsigned bytes:
 *   v[0]: vector byte k pairs with Rt byte (k-uiV)&3, k = 0..3.  Byte 0 is
 *         read from Vuu.v[1] when uiV is non-zero and from Vuu.v[0]
 *         otherwise; bytes 1..3 are always read from Vuu.v[0].
 *   v[1]: vector bytes 0,1 (from Vuu.v[1]) pair with Rt bytes (2-uiV)&3 and
 *         (3-uiV)&3; byte 2 (from Vuu.v[1] if uiV else Vuu.v[0]) pairs with
 *         (0-uiV)&3; byte 3 (from Vuu.v[0]) pairs with (1-uiV)&3.
 * The plain form overwrites Vdd; the _acc form adds onto Vxx.
 * NOTE(review): the description strings mention "Signed Byte" and, for _acc,
 * "saturation", but the code uses unsigned bytes throughout and applies no
 * saturation macro.
 */
762ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpyubi,"Vdd32=vrmpyub(Vuu32,Rt32,#u1)","Vdd32.uw=vrmpy(Vuu32.ub,Rt32.ub,#u1)",
763"Dual Vector Unsigned Byte By Signed Byte 4-way Reduction to Word",
764    VddV.v[0].uw[i]  = fMPY8UU(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV));
765    VddV.v[0].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[0        ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV));
766    VddV.v[0].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[0        ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV));
767    VddV.v[0].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV));
768
769    VddV.v[1].uw[i]  = fMPY8UU(fGETUBYTE(0, VuuV.v[1        ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV));
770    VddV.v[1].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[1        ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV));
771    VddV.v[1].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV));
772    VddV.v[1].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV)))
773
774
775ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpyubi_acc,"Vxx32+=vrmpyub(Vuu32,Rt32,#u1)","Vxx32.uw+=vrmpy(Vuu32.ub,Rt32.ub,#u1)",
776"Dual Vector Unsigned Byte By Signed Byte 4-way Reduction with accumulate and saturation to Word",
777    VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV));
778    VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[0        ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV));
779    VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[0        ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV));
780    VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV));
781
782    VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(0, VuuV.v[1        ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV));
783    VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[1        ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV));
784    VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV));
785    VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV)))
786
787
788
789
790/********************************************
791*  4-WAY REDUCTION - UNSIGNED BYTE BY SIGNED BYTE
792********************************************/
793
/*
 * vrmpybus / vrmpybus_acc
 * For each 32-bit element i, sums fMPY8US(unsigned byte k of Vu.uw[i],
 * byte k of Rt read with fGETBYTE) for k = 0..3.  The plain form overwrites
 * Vd.w[i]; the _acc form adds the four products onto Vx.w[i].
 */
794ITERATOR_INSN2_MPY_SLOT(32,vrmpybus,"Vd32=vrmpybus(Vu32,Rt32)","Vd32.w=vrmpy(Vu32.ub,Rt32.b)",
795"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
796    VdV.w[i]  = fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,RtV));
797    VdV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,RtV));
798    VdV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,RtV));
799    VdV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,RtV)))
800
801
802ITERATOR_INSN2_MPY_SLOT(32,vrmpybus_acc,"Vx32+=vrmpybus(Vu32,Rt32)","Vx32.w+=vrmpy(Vu32.ub,Rt32.b)",
803"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
804    VxV.w[i] += fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,RtV));
805    VxV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,RtV));
806    VxV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,RtV));
807    VxV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,RtV)))
808
809
/*
 * vrmpybusi / vrmpybusi_acc
 * Double-vector 4-term reduction with a 1-bit immediate uiV, using
 * fMPY8US(unsigned vector byte, Rt byte read with fGETBYTE).  For each
 * 32-bit element i:
 *   v[0]: vector byte k pairs with Rt byte (k-uiV)&3, k = 0..3.  Byte 0 is
 *         read from Vuu.v[1] when uiV is non-zero and from Vuu.v[0]
 *         otherwise; bytes 1..3 are always read from Vuu.v[0].
 *   v[1]: vector bytes 0,1 (from Vuu.v[1]) pair with Rt bytes (2-uiV)&3 and
 *         (3-uiV)&3; byte 2 (from Vuu.v[1] if uiV else Vuu.v[0]) pairs with
 *         (0-uiV)&3; byte 3 (from Vuu.v[0]) pairs with (1-uiV)&3.
 * The plain form overwrites Vdd; the _acc form adds onto Vxx.
 * NOTE(review): the _acc description mentions "saturation", but no
 * saturation macro is applied in the code.
 */
810ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybusi,"Vdd32=vrmpybus(Vuu32,Rt32,#u1)","Vdd32.w=vrmpy(Vuu32.ub,Rt32.b,#u1)",
811"Dual Vector Unsigned Byte By Signed Byte 4-way Reduction to Word",
812    VddV.v[0].w[i]  = fMPY8US(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV));
813    VddV.v[0].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0        ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV));
814    VddV.v[0].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[0        ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV));
815    VddV.v[0].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV));
816
817    VddV.v[1].w[i]  = fMPY8US(fGETUBYTE(0, VuuV.v[1        ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV));
818    VddV.v[1].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[1        ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV));
819    VddV.v[1].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV));
820    VddV.v[1].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV)))
821
822
823ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybusi_acc,"Vxx32+=vrmpybus(Vuu32,Rt32,#u1)","Vxx32.w+=vrmpy(Vuu32.ub,Rt32.b,#u1)",
824"Dual Vector Unsigned Byte By Signed Byte 4-way Reduction with accumulate and saturation to Word",
825    VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV));
826    VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0        ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV));
827    VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[0        ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV));
828    VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV));
829
830    VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(0, VuuV.v[1        ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV));
831    VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[1        ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV));
832    VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV));
833    VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV)))
834
835
836
837
/*
 * vrmpybusv / vrmpybusv_acc
 * Vector-by-vector form of vrmpybus: for each 32-bit element i, sums
 * fMPY8US(unsigned byte k of Vu.uw[i], byte k of Vv.w[i] read with fGETBYTE)
 * for k = 0..3.  The plain form overwrites Vd.w[i]; the _acc form adds onto
 * Vx.w[i].
 */
838ITERATOR_INSN2_MPY_SLOT(32,vrmpybusv,"Vd32=vrmpybus(Vu32,Vv32)","Vd32.w=vrmpy(Vu32.ub,Vv32.b)",
839"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
840    VdV.w[i]  = fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,VvV.w[i]));
841    VdV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,VvV.w[i]));
842    VdV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,VvV.w[i]));
843    VdV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,VvV.w[i])))
844
845
846ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybusv_acc,"Vx32+=vrmpybus(Vu32,Vv32)","Vx32.w+=vrmpy(Vu32.ub,Vv32.b)",
847"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
848    VxV.w[i] += fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,VvV.w[i]));
849    VxV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,VvV.w[i]));
850    VxV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,VvV.w[i]));
851    VxV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,VvV.w[i])))
852
853
854
855
856
857
858
859
860
861
862
863/********************************************
864*  2-WAY REDUCTION - SAD
865********************************************/
866
/*
 * vdsaduh / vdsaduh_acc
 * Two-term sum of absolute differences on unsigned halfwords.  For each
 * 32-bit element i, with uhN = fGETUHALF(N, ...):
 *   v[0].uw[i] = fABS(uh0(Vuu.v[0]) - uh0(Rt)) + fABS(uh1(Vuu.v[0]) - uh1(Rt))
 *   v[1].uw[i] = fABS(uh1(Vuu.v[0]) - uh0(Rt)) + fABS(uh0(Vuu.v[1]) - uh1(Rt))
 * The plain form overwrites Vdd; the _acc form adds onto Vxx.
 * NOTE(review): the description strings read "Halfword by Byte 4-Way
 * Reduction", which does not match the two halfword terms computed here.
 */
867ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdsaduh,"Vdd32=vdsaduh(Vuu32,Rt32)","Vdd32.uw=vdsad(Vuu32.uh,Rt32.uh)",
868"Dual Vector Halfword by Byte 4-Way Reduction to Word",
869    VddV.v[0].uw[i]  = fABS(fGETUHALF(0, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV));
870    VddV.v[0].uw[i] += fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(1,RtV));
871    VddV.v[1].uw[i]  = fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV));
872    VddV.v[1].uw[i] += fABS(fGETUHALF(0, VuuV.v[1].uw[i]) - fGETUHALF(1,RtV)))
873
874ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdsaduh_acc,"Vxx32+=vdsaduh(Vuu32,Rt32)","Vxx32.uw+=vdsad(Vuu32.uh,Rt32.uh)",
875"Dual Vector Halfword by Byte 4-Way Reduction to Word",
876    VxxV.v[0].uw[i] += fABS(fGETUHALF(0, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV));
877    VxxV.v[0].uw[i] += fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(1,RtV));
878    VxxV.v[1].uw[i] += fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV));
879    VxxV.v[1].uw[i] += fABS(fGETUHALF(0, VuuV.v[1].uw[i]) - fGETUHALF(1,RtV)))
880
881
882
883
884/********************************************
885*  4-WAY REDUCTION - SAD
886********************************************/
887
888
889
/*
 * vrsadubi / vrsadubi_acc
 * Four-term sum of absolute differences on unsigned bytes with a 1-bit
 * immediate uiV.  Each term is fABS(fZE8_16(vector byte) - fZE8_16(Rt byte)).
 * For each 32-bit element i:
 *   v[0]: vector byte k pairs with Rt byte (k-uiV)&3, k = 0..3.  Byte 0 is
 *         read from Vuu.v[1] when uiV is non-zero and from Vuu.v[0]
 *         otherwise; bytes 1..3 are always read from Vuu.v[0].
 *   v[1]: vector bytes 0,1 (from Vuu.v[1]) pair with Rt bytes (2-uiV)&3 and
 *         (3-uiV)&3; byte 2 (from Vuu.v[1] if uiV else Vuu.v[0]) pairs with
 *         (0-uiV)&3; byte 3 (from Vuu.v[0]) pairs with (1-uiV)&3.
 * The plain form overwrites Vdd; the _acc form adds onto Vxx.
 * NOTE(review): the description strings say "Halfword by Byte", while the
 * syntax and code operate on unsigned bytes.
 */
890ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrsadubi,"Vdd32=vrsadub(Vuu32,Rt32,#u1)","Vdd32.uw=vrsad(Vuu32.ub,Rt32.ub,#u1)",
891"Dual Vector Halfword by Byte 4-Way Reduction to Word",
892    VddV.v[0].uw[i]  = fABS(fZE8_16(fGETUBYTE(0, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV)));
893    VddV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV)));
894    VddV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV)));
895    VddV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV)));
896
897    VddV.v[1].uw[i]  = fABS(fZE8_16(fGETUBYTE(0, VuuV.v[1      ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV)));
898    VddV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[1      ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV)));
899    VddV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV)));
900    VddV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV))))
901
902ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrsadubi_acc,"Vxx32+=vrsadub(Vuu32,Rt32,#u1)","Vxx32.uw+=vrsad(Vuu32.ub,Rt32.ub,#u1)",
903"Dual Vector Halfword by Byte 4-Way Reduction to Word",
904    VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(0, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV)));
905    VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV)));
906    VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV)));
907    VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV)));
908
909    VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(0, VuuV.v[1      ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV)));
910    VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[1      ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV)));
911    VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV)));
912    VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV))))
913
914
915
916
917
918
919
920
921
922
923/*********************************************************************
924 * MMVECTOR SHIFTING
925 * ******************************************************************/
926// Macro generating arithmetic shift right/left and logical shift right instructions, with the shift amount taken from either Rt or Vv
927
/*
 * V_SHIFT(TYPE, DESC, SIZE, LOGSIZE, CASTTYPE)
 * Emits six shift instructions for element type TYPE of SIZE bits:
 *   vasr<TYPE>, vasl<TYPE>, vlsr<TYPE>   - shift amount is RtV & (SIZE-1),
 *                                          applied with plain >> / <<; the
 *                                          logical form works on the
 *                                          u<TYPE> view of the elements.
 *   vasr<TYPE>v, vasl<TYPE>v, vlsr<TYPE>v - per-element shift amount
 *                                          fSXTN((LOGSIZE+1),SIZE,Vv elem),
 *                                          applied through the fBIDIR_*
 *                                          macros with CASTTYPE.
 * DESC is appended to each description string.
 * The last instruction line ends with a continuation backslash, so the line
 * that follows it is still part of the macro body.
 * Instantiated below for words (32 bits) and halfwords (16 bits).
 */
928#define V_SHIFT(TYPE, DESC, SIZE, LOGSIZE, CASTTYPE)   \
929ITERATOR_INSN2_SHIFT_SLOT(SIZE,vasr##TYPE,   "Vd32=vasr" #TYPE "(Vu32,Rt32)","Vd32."#TYPE"=vasr(Vu32."#TYPE",Rt32)",         "Vector arithmetic shift right " DESC,    VdV.TYPE[i]     = (VuV.TYPE[i]    >> (RtV & (SIZE-1)))) \
930ITERATOR_INSN2_SHIFT_SLOT(SIZE,vasl##TYPE,   "Vd32=vasl" #TYPE "(Vu32,Rt32)","Vd32."#TYPE"=vasl(Vu32."#TYPE",Rt32)",         "Vector arithmetic shift left  " DESC,    VdV.TYPE[i]     = (VuV.TYPE[i]    << (RtV & (SIZE-1)))) \
931ITERATOR_INSN2_SHIFT_SLOT(SIZE,vlsr##TYPE,   "Vd32=vlsr" #TYPE "(Vu32,Rt32)","Vd32.u"#TYPE"=vlsr(Vu32.u"#TYPE",Rt32)",       "Vector logical shift right "    DESC,    VdV.u##TYPE[i]  = (VuV.u##TYPE[i] >> (RtV & (SIZE-1)))) \
932ITERATOR_INSN2_SHIFT_SLOT(SIZE,vasr##TYPE##v,"Vd32=vasr" #TYPE "(Vu32,Vv32)","Vd32."#TYPE"=vasr(Vu32."#TYPE",Vv32."#TYPE")", "Vector arithmetic shift right " DESC,    VdV.TYPE[i]     = fBIDIR_ASHIFTR(VuV.TYPE[i], fSXTN((LOGSIZE+1),SIZE,VvV.TYPE[i]),CASTTYPE)) \
933ITERATOR_INSN2_SHIFT_SLOT(SIZE,vasl##TYPE##v,"Vd32=vasl" #TYPE "(Vu32,Vv32)","Vd32."#TYPE"=vasl(Vu32."#TYPE",Vv32."#TYPE")", "Vector arithmetic shift left  " DESC,    VdV.TYPE[i]     = fBIDIR_ASHIFTL(VuV.TYPE[i],  fSXTN((LOGSIZE+1),SIZE,VvV.TYPE[i]),CASTTYPE)) \
934ITERATOR_INSN2_SHIFT_SLOT(SIZE,vlsr##TYPE##v,"Vd32=vlsr" #TYPE "(Vu32,Vv32)","Vd32."#TYPE"=vlsr(Vu32."#TYPE",Vv32."#TYPE")", "Vector logical shift right "    DESC,    VdV.u##TYPE[i]  = fBIDIR_LSHIFTR(VuV.u##TYPE[i], fSXTN((LOGSIZE+1),SIZE,VvV.TYPE[i]),CASTTYPE)) \
935
936V_SHIFT(w, "word",   32,5,4_4)
937V_SHIFT(h, "halfword", 16,4,2_2)
938
/*
 * vlsrb: for each 8-bit element i, shifts the unsigned byte Vu.ub[i] right
 * by RtV & 0x7 and stores the result.  The store goes through the .b view of
 * Vd while the syntax string names .ub.
 */
939ITERATOR_INSN_SHIFT_SLOT(8,vlsrb,"Vd32.ub=vlsr(Vu32.ub,Rt32)","vec log shift right bytes", VdV.b[i] = VuV.ub[i] >> (RtV & 0x7))
940
941ITERATOR_INSN2_SHIFT_SLOT(32,vrotr,"Vd32=vrotr(Vu32,Vv32)","Vd32.uw=vrotr(Vu32.uw,Vv32.uw)","Vector word rotate right", VdV.uw[i] = ((VuV.uw[i] >> (VvV.uw[i] & 0x1f)) | (VuV.uw[i] << (32 - (VvV.uw[i] & 0x1f)))))
942
943/*********************************************************************
944 * MMVECTOR SHIFT AND PERMUTE
945 * ******************************************************************/
946
/*
 * vasr_into: for each 32-bit element i, shifts a 64-bit value built from
 * Vu.w[i] and merges in bits of Vxx.v[0].w[i]:
 *   shift  = sign-extended Vu.w[i] moved into the upper 32 bits
 *   mask   = Vxx.v[0].w[i] replicated into both 32-bit halves (upper half
 *            via sign-extend and shift, lower half zero-extended)
 *   lomask = (1 << 32) - 1, i.e. the low 32 bits set
 *   count  = low 7 bits of Vv.w[i] read as a signed value: bit 6 contributes
 *            -0x40, bits 5..0 contribute their unsigned value
 * If count is -0x40 the result is 0.  If count is negative, shift and lomask
 * are shifted left by -count; otherwise both are shifted right by count.  In
 * either case the result is shifted-shift OR (mask AND shifted-lomask).
 * The upper 32 bits of the result go to Vxx.v[1].w[i], the lower 32 bits to
 * Vxx.v[0].w[i].
 */
947ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(32,vasr_into,"Vxx32=vasrinto(Vu32,Vv32)","Vxx32.w=vasrinto(Vu32.w,Vv32.w)","ASR vector 1 elements and overlay dropping bits to MSB of vector 2 elements",
948    fHIDE(int64_t ) shift = (fSE32_64(VuV.w[i]) << 32);
949    fHIDE(int64_t ) mask  = (((fSE32_64(VxxV.v[0].w[i])) << 32) | fZE32_64(VxxV.v[0].w[i]));
950    fHIDE(int64_t) lomask = (((fSE32_64(1)) << 32) - 1);
951    fHIDE(int ) count = -(0x40 & VvV.w[i]) + (VvV.w[i] & 0x3f);
952    fHIDE(int64_t ) result = (count == -0x40) ? 0 : (((count < 0) ? ((shift << -(count)) | (mask & (lomask << -(count)))) : ((shift >> count) | (mask & (lomask >> count)))));
953    VxxV.v[1].w[i] = ((result >> 32) & 0xffffffff);
954    VxxV.v[0].w[i] = (result & 0xffffffff))
955
956#define NEW_NARROWING_SHIFT 1
957
958#if NEW_NARROWING_SHIFT
/*
 * NARROWING_SHIFT(ITERSIZE,TAG,DSTM,DSTTYPE,SRCTYPE,SYNOPTS,SATFUNC,RNDFUNC,SHAMTMASK)
 * Emits one ITERATOR_INSN_SHIFT_SLOT instruction TAG with syntax
 *   Vd32.<DSTTYPE>=vasr(Vu32.<SRCTYPE>,Vv32.<SRCTYPE>,Rt8)<SYNOPTS>
 * For each ITERSIZE-bit element i:
 *   shamt = RtV & SHAMTMASK
 *   DSTM(0, Vd elem, SATFUNC(RNDFUNC(Vv elem, shamt) >> shamt))
 *   DSTM(1, Vd elem, SATFUNC(RNDFUNC(Vu elem, shamt) >> shamt))
 * DSTM is the sub-element setter (fSETHALF or fSETBYTE in the instantiations
 * that follow); RNDFUNC and SATFUNC select the rounding and saturation
 * behaviour.
 */
959#define NARROWING_SHIFT(ITERSIZE,TAG,DSTM,DSTTYPE,SRCTYPE,SYNOPTS,SATFUNC,RNDFUNC,SHAMTMASK) \
960ITERATOR_INSN_SHIFT_SLOT(ITERSIZE,TAG, \
961"Vd32." #DSTTYPE "=vasr(Vu32." #SRCTYPE ",Vv32." #SRCTYPE ",Rt8)" #SYNOPTS, \
962"Vector shift right and shuffle", \
963    fHIDE(int )shamt = RtV & SHAMTMASK; \
964    DSTM(0,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VvV.SRCTYPE[i],shamt) >> shamt)); \
965    DSTM(1,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuV.SRCTYPE[i],shamt) >> shamt)))
966
967
968
969
970
971/* WORD TO HALF*/
972
/*
 * Scalar-shift narrowing instructions.
 * Word-to-halfword forms iterate over 32-bit elements, store with fSETHALF
 * and mask the shift amount with 0xF.  Halfword-to-byte forms iterate over
 * 16-bit elements, store with fSETBYTE and mask the shift amount with 0x7.
 * Variants differ in source signedness (w/uw, h), saturation function
 * (fECHO, fVSATH, fVSATUH, fVSATUB, fVSATB) and rounding (fVNOROUND, fVROUND).
 * NOTE(review): NARROWING_SHIFT_NOV1 is not defined in this part of the
 * file; presumably it is defined elsewhere -- verify.
 */
973NARROWING_SHIFT(32,vasrwh,fSETHALF,h,w,,fECHO,fVNOROUND,0xF)
974NARROWING_SHIFT(32,vasrwhsat,fSETHALF,h,w,:sat,fVSATH,fVNOROUND,0xF)
975NARROWING_SHIFT(32,vasrwhrndsat,fSETHALF,h,w,:rnd:sat,fVSATH,fVROUND,0xF)
976NARROWING_SHIFT(32,vasrwuhrndsat,fSETHALF,uh,w,:rnd:sat,fVSATUH,fVROUND,0xF)
977NARROWING_SHIFT(32,vasrwuhsat,fSETHALF,uh,w,:sat,fVSATUH,fVNOROUND,0xF)
978NARROWING_SHIFT(32,vasruwuhrndsat,fSETHALF,uh,uw,:rnd:sat,fVSATUH,fVROUND,0xF)
979
980NARROWING_SHIFT_NOV1(32,vasruwuhsat,fSETHALF,uh,uw,:sat,fVSATUH,fVNOROUND,0xF)
981NARROWING_SHIFT(16,vasrhubsat,fSETBYTE,ub,h,:sat,fVSATUB,fVNOROUND,0x7)
982NARROWING_SHIFT(16,vasrhubrndsat,fSETBYTE,ub,h,:rnd:sat,fVSATUB,fVROUND,0x7)
983NARROWING_SHIFT(16,vasrhbsat,fSETBYTE,b,h,:sat,fVSATB,fVNOROUND,0x7)
984NARROWING_SHIFT(16,vasrhbrndsat,fSETBYTE,b,h,:rnd:sat,fVSATB,fVROUND,0x7)
985
/*
 * NARROWING_VECTOR_SHIFT(ITERSIZE,TAG,DSTM,DSTTYPE,SRCTYPE,SRCTYPE2,SYNOPTS,SATFUNC,RNDFUNC,SHAMTMASK)
 * Emits one ITERATOR_INSN_SHIFT3_SLOT instruction TAG with syntax
 *   Vd32.<DSTTYPE>=vasr(Vuu32.<SRCTYPE>,Vv32.<SRCTYPE2>)<SYNOPTS>
 * Unlike NARROWING_SHIFT, the shift amount is taken per output sub-element
 * from Vv.  For each ITERSIZE-bit element i:
 *   shamt = Vv.<SRCTYPE2>[2*i+0] & SHAMTMASK
 *   DSTM(0, Vd elem, SATFUNC(RNDFUNC(Vuu.v[0] elem, shamt) >> shamt))
 *   shamt = Vv.<SRCTYPE2>[2*i+1] & SHAMTMASK
 *   DSTM(1, Vd elem, SATFUNC(RNDFUNC(Vuu.v[1] elem, shamt) >> shamt))
 */
986#define NARROWING_VECTOR_SHIFT(ITERSIZE,TAG,DSTM,DSTTYPE,SRCTYPE,SRCTYPE2,SYNOPTS,SATFUNC,RNDFUNC,SHAMTMASK) \
987ITERATOR_INSN_SHIFT3_SLOT(ITERSIZE,TAG, \
988"Vd32." #DSTTYPE "=vasr(Vuu32." #SRCTYPE ",Vv32." #SRCTYPE2 ")" #SYNOPTS, \
989"Vector shift by vector right and shuffle", \
990    fHIDE(int )shamt = VvV.SRCTYPE2[2*i+0] & SHAMTMASK; \
991    DSTM(0,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuuV.v[0].SRCTYPE[i],shamt) >> shamt)); \
992    shamt = VvV.SRCTYPE2[2*i+1] & SHAMTMASK; \
993    DSTM(1,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuuV.v[1].SRCTYPE[i],shamt) >> shamt)))
994
995/* WORD TO HALF*/
/*
 * Vector-shift narrowing instructions, with and without rounding (fVROUND /
 * fVNOROUND), all saturating to an unsigned result: word to unsigned
 * halfword (fVSATUH, shift mask 0xF) and unsigned halfword to unsigned byte
 * (fVSATUB, shift mask 0x7).  The last two lines add scalar-shift
 * halfword-to-byte variants through NARROWING_SHIFT_NOV1.
 * NOTE(review): NARROWING_SHIFT_NOV1 is not defined in this part of the
 * file; presumably it is defined elsewhere -- verify.
 */
996NARROWING_VECTOR_SHIFT(32,vasrvwuhsat,fSETHALF,uh,w,uh,:sat,fVSATUH,fVNOROUND,0xF)
997NARROWING_VECTOR_SHIFT(32,vasrvwuhrndsat,fSETHALF,uh,w,uh,:rnd:sat,fVSATUH,fVROUND,0xF)
998/* HALF TO BYTE*/
999NARROWING_VECTOR_SHIFT(16,vasrvuhubsat,fSETBYTE,ub,uh,ub,:sat,fVSATUB,fVNOROUND,0x7)
1000NARROWING_VECTOR_SHIFT(16,vasrvuhubrndsat,fSETBYTE,ub,uh,ub,:rnd:sat,fVSATUB,fVROUND,0x7)
1001
1002NARROWING_SHIFT_NOV1(16,vasruhubsat,fSETBYTE,ub,uh,:sat,fVSATUB,fVNOROUND,0x7)
1003NARROWING_SHIFT_NOV1(16,vasruhubrndsat,fSETBYTE,ub,uh,:rnd:sat,fVSATUB,fVROUND,0x7)
1004
1005#else
/*
 * Hand-written narrowing shifts for the #else branch of
 * NEW_NARROWING_SHIFT.  NEW_NARROWING_SHIFT is defined to 1 just above the
 * #if, so this branch is not compiled as the file stands.
 * Each instruction shifts Vv.w[i] and Vu.w[i] right by RtV & 0xF and stores
 * the results with fSETHALF(0, ...) and fSETHALF(1, ...) respectively.  The
 * :sat forms wrap the value in fVSATH / fVSATUH; the :rnd forms first add
 * fBIDIR_ASHIFTL(1,(shamt-1),4_8) before shifting.
 */
1006ITERATOR_INSN2_SHIFT_SLOT(32,vasrwh,"Vd32=vasrwh(Vu32,Vv32,Rt8)","Vd32.h=vasr(Vu32.w,Vv32.w,Rt8)",
1007"Vector arithmetic shift right words, shuffle even halfwords",
1008    fSETHALF(0,VdV.w[i], (VvV.w[i] >> (RtV & 0xF)));
1009    fSETHALF(1,VdV.w[i], (VuV.w[i] >> (RtV & 0xF))))
1010
1011
1012ITERATOR_INSN2_SHIFT_SLOT(32,vasrwhsat,"Vd32=vasrwh(Vu32,Vv32,Rt8):sat","Vd32.h=vasr(Vu32.w,Vv32.w,Rt8):sat",
1013"Vector arithmetic shift right words, shuffle even halfwords",
1014    fSETHALF(0,VdV.w[i], fVSATH(VvV.w[i] >> (RtV & 0xF)));
1015    fSETHALF(1,VdV.w[i], fVSATH(VuV.w[i] >> (RtV & 0xF))))
1016
1017ITERATOR_INSN2_SHIFT_SLOT(32,vasrwhrndsat,"Vd32=vasrwh(Vu32,Vv32,Rt8):rnd:sat","Vd32.h=vasr(Vu32.w,Vv32.w,Rt8):rnd:sat",
1018"Vector arithmetic shift right words, shuffle even halfwords",
1019    fHIDE(int ) shamt = RtV & 0xF;
1020    fSETHALF(0,VdV.w[i], fVSATH(  (VvV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt));
1021    fSETHALF(1,VdV.w[i], fVSATH(  (VuV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt)))
1022
1023ITERATOR_INSN2_SHIFT_SLOT(32,vasrwuhrndsat,"Vd32=vasrwuh(Vu32,Vv32,Rt8):rnd:sat","Vd32.uh=vasr(Vu32.w,Vv32.w,Rt8):rnd:sat",
1024"Vector arithmetic shift right words, shuffle even halfwords",
1025    fHIDE(int ) shamt = RtV & 0xF;
1026    fSETHALF(0,VdV.w[i], fVSATUH(  (VvV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt));
1027    fSETHALF(1,VdV.w[i], fVSATUH(  (VuV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt)))
1028
1029ITERATOR_INSN2_SHIFT_SLOT(32,vasrwuhsat,"Vd32=vasrwuh(Vu32,Vv32,Rt8):sat","Vd32.uh=vasr(Vu32.w,Vv32.w,Rt8):sat",
1030"Vector arithmetic shift right words, shuffle even halfwords",
1031    fSETHALF(0, VdV.uw[i], fVSATUH(VvV.w[i] >> (RtV & 0xF)));
1032    fSETHALF(1, VdV.uw[i], fVSATUH(VuV.w[i] >> (RtV & 0xF))))
1033
1034ITERATOR_INSN2_SHIFT_SLOT(32,vasruwuhrndsat,"Vd32=vasruwuh(Vu32,Vv32,Rt8):rnd:sat","Vd32.uh=vasr(Vu32.uw,Vv32.uw,Rt8):rnd:sat",
1035"Vector arithmetic shift right words, shuffle even halfwords",
1036    fHIDE(int ) shamt = RtV & 0xF;
1037    fSETHALF(0,VdV.w[i], fVSATUH(  (VvV.uw[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt));
1038    fSETHALF(1,VdV.w[i], fVSATUH(  (VuV.uw[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt)))
1039#endif
1040
1041
1042
/*
 * vroundwh / vroundwuh / vrounduwuh
 * Word-to-halfword rounding with saturation.  For each 32-bit element i the
 * source word plus fCONSTLL(0x8000) is shifted right by 16 and saturated;
 * the value derived from Vv goes to halfword 0 of Vd.uw[i] and the value
 * derived from Vu to halfword 1 (fSETHALF).
 *   vroundwh   - signed sources (.w),    fVSATH
 *   vroundwuh  - signed sources (.w),    fVSATUH
 *   vrounduwuh - unsigned sources (.uw), fVSATUH
 */
1043ITERATOR_INSN2_SHIFT_SLOT(32,vroundwh,"Vd32=vroundwh(Vu32,Vv32):sat","Vd32.h=vround(Vu32.w,Vv32.w):sat",
1044"Vector round words to halves, shuffle resultant halfwords",
1045    fSETHALF(0, VdV.uw[i], fVSATH((VvV.w[i] + fCONSTLL(0x8000)) >> 16));
1046    fSETHALF(1, VdV.uw[i], fVSATH((VuV.w[i] + fCONSTLL(0x8000)) >> 16)))
1047
1048ITERATOR_INSN2_SHIFT_SLOT(32,vroundwuh,"Vd32=vroundwuh(Vu32,Vv32):sat","Vd32.uh=vround(Vu32.w,Vv32.w):sat",
1049"Vector round words to halves, shuffle resultant halfwords",
1050    fSETHALF(0, VdV.uw[i], fVSATUH((VvV.w[i] + fCONSTLL(0x8000)) >> 16));
1051    fSETHALF(1, VdV.uw[i], fVSATUH((VuV.w[i] + fCONSTLL(0x8000)) >> 16)))
1052
1053ITERATOR_INSN2_SHIFT_SLOT(32,vrounduwuh,"Vd32=vrounduwuh(Vu32,Vv32):sat","Vd32.uh=vround(Vu32.uw,Vv32.uw):sat",
1054"Vector round words to halves, shuffle resultant halfwords",
1055    fSETHALF(0, VdV.uw[i], fVSATUH((VvV.uw[i] + fCONSTLL(0x8000)) >> 16));
1056    fSETHALF(1, VdV.uw[i], fVSATUH((VuV.uw[i] + fCONSTLL(0x8000)) >> 16)))
1057
1058
1059
1060
1061
1062/* HALF TO BYTE*/
1063
1064ITERATOR_INSN2_SHIFT_SLOT(16,vroundhb,"Vd32=vroundhb(Vu32,Vv32):sat","Vd32.b=vround(Vu32.h,Vv32.h):sat",
1065"Vector round words to halves, shuffle resultant halfwords",
1066    fSETBYTE(0, VdV.uh[i], fVSATB((VvV.h[i] + 0x80) >> 8));
1067    fSETBYTE(1, VdV.uh[i], fVSATB((VuV.h[i] + 0x80) >> 8)))
1068
1069ITERATOR_INSN2_SHIFT_SLOT(16,vroundhub,"Vd32=vroundhub(Vu32,Vv32):sat","Vd32.ub=vround(Vu32.h,Vv32.h):sat",
1070"Vector round words to halves, shuffle resultant halfwords",
1071    fSETBYTE(0, VdV.uh[i], fVSATUB((VvV.h[i] + 0x80) >> 8));
1072    fSETBYTE(1, VdV.uh[i], fVSATUB((VuV.h[i] + 0x80) >> 8)))
1073
1074ITERATOR_INSN2_SHIFT_SLOT(16,vrounduhub,"Vd32=vrounduhub(Vu32,Vv32):sat","Vd32.ub=vround(Vu32.uh,Vv32.uh):sat",
1075"Vector round words to halves, shuffle resultant halfwords",
1076    fSETBYTE(0, VdV.uh[i], fVSATUB((VvV.uh[i] + 0x80) >> 8));
1077    fSETBYTE(1, VdV.uh[i], fVSATUB((VuV.uh[i] + 0x80) >> 8)))
1078
1079
/*
 * vaslw_acc / vasrw_acc / vaslh_acc / vasrh_acc
 * Shift-and-accumulate: each element of Vu is shifted left (vasl) or right
 * (vasr) by the scalar Rt masked to the element width (32-1 for words,
 * 16-1 for halfwords) and the result is added to the matching element of Vx.
 * The halfword forms are emitted through the *_NOV1 iterator macro, which is
 * defined outside this section.
 */
1080ITERATOR_INSN2_SHIFT_SLOT(32,vaslw_acc,"Vx32+=vaslw(Vu32,Rt32)","Vx32.w+=vasl(Vu32.w,Rt32)",
1081"Vector shift add word",
1082    VxV.w[i]  +=  (VuV.w[i] << (RtV & (32-1))))
1083
1084ITERATOR_INSN2_SHIFT_SLOT(32,vasrw_acc,"Vx32+=vasrw(Vu32,Rt32)","Vx32.w+=vasr(Vu32.w,Rt32)",
1085"Vector shift add word",
1086    VxV.w[i]  +=  (VuV.w[i] >> (RtV & (32-1))))
1087
1088ITERATOR_INSN2_SHIFT_SLOT_NOV1(16,vaslh_acc,"Vx32+=vaslh(Vu32,Rt32)","Vx32.h+=vasl(Vu32.h,Rt32)",
1089"Vector shift add halfword",
1090    VxV.h[i]  +=  (VuV.h[i] << (RtV & (16-1))))
1091
1092ITERATOR_INSN2_SHIFT_SLOT_NOV1(16,vasrh_acc,"Vx32+=vasrh(Vu32,Rt32)","Vx32.h+=vasr(Vu32.h,Rt32)",
1093"Vector shift add halfword",
1094    VxV.h[i]  +=  (VuV.h[i] >> (RtV & (16-1))))
1095
1096/**************************************************************************
1097*
1098* MMVECTOR ELEMENT-WISE ARITHMETIC
1099*
1100**************************************************************************/
1101
1102/**************************************************************************
1103* MACROS GO IN MACROS.DEF NOT HERE!!!
1104**************************************************************************/
1105
1106
/*
 * MMVEC_ABSDIFF(TYPE,TYPE2,DESCR,WIDTH,DEST,SRC)
 * Emits vabsdiff<TYPE>: for each WIDTH-bit element i, Vd.DEST[i] is the
 * larger of Vu.SRC[i] and Vv.SRC[i] minus the smaller, chosen with an
 * explicit comparison rather than an abs() macro.  TYPE2 is spliced into the
 * first syntax string as a string literal; DESCR is appended to the
 * description.
 */
1107#define MMVEC_ABSDIFF(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\
1108ITERATOR_INSN2_MPY_SLOT(WIDTH, vabsdiff##TYPE,                   "Vd32=vabsdiff"TYPE2"(Vu32,Vv32)" ,"Vd32."#DEST"=vabsdiff(Vu32."#SRC",Vv32."#SRC")" ,     "Vector Absolute of Difference "DESCR,   VdV.DEST[i] = (VuV.SRC[i] > VvV.SRC[i]) ? (VuV.SRC[i] - VvV.SRC[i]) : (VvV.SRC[i] - VuV.SRC[i]))
1109
/*
 * MMVEC_ADDU_SAT(TYPE,TYPE2,DESCR,WIDTH,DEST,SRC)
 * Emits four saturating instructions built on fVUADDSAT / fVUSUBSAT:
 *   vadd<TYPE>sat, vadd<TYPE>sat_dv - element-wise add, single and double
 *                                     vector
 *   vsub<TYPE>sat, vsub<TYPE>sat_dv - element-wise subtract, single and
 *                                     double vector
 * The double-vector forms apply the same operation to v[0] and v[1].
 * The vsub descriptions previously read "Add & Saturate" (copy-paste); they
 * now say "Sub".  Semantics are unchanged.  The trailing continuation
 * backslash and the xref line-number prefixes have been dropped.
 */
#define MMVEC_ADDU_SAT(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\
ITERATOR_INSN2_ANY_SLOT(WIDTH, vadd##TYPE##sat,                  "Vd32=vadd"TYPE2"(Vu32,Vv32):sat" ,    "Vd32."#DEST"=vadd(Vu32."#SRC",Vv32."#SRC"):sat",    "Vector Add & Saturate "DESCR,            VdV.DEST[i] = fVUADDSAT(WIDTH,  VuV.SRC[i], VvV.SRC[i]))\
ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vadd##TYPE##sat_dv,    "Vdd32=vadd"TYPE2"(Vuu32,Vvv32):sat",  "Vdd32."#DEST"=vadd(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR,    VddV.v[0].DEST[i] = fVUADDSAT(WIDTH, VuuV.v[0].SRC[i],VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVUADDSAT(WIDTH, VuuV.v[1].SRC[i],VvvV.v[1].SRC[i]))\
ITERATOR_INSN2_ANY_SLOT(WIDTH, vsub##TYPE##sat,                  "Vd32=vsub"TYPE2"(Vu32,Vv32):sat",     "Vd32."#DEST"=vsub(Vu32."#SRC",Vv32."#SRC"):sat",    "Vector Sub & Saturate "DESCR,            VdV.DEST[i] = fVUSUBSAT(WIDTH,  VuV.SRC[i], VvV.SRC[i]))\
ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vsub##TYPE##sat_dv,    "Vdd32=vsub"TYPE2"(Vuu32,Vvv32):sat",  "Vdd32."#DEST"=vsub(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Sub & Saturate "DESCR,    VddV.v[0].DEST[i] = fVUSUBSAT(WIDTH, VuuV.v[0].SRC[i],VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVUSUBSAT(WIDTH, VuuV.v[1].SRC[i],VvvV.v[1].SRC[i]))

/*
 * MMVEC_ADDS_SAT(TYPE,TYPE2,DESCR,WIDTH,DEST,SRC)
 * Counterpart of the unsigned saturating add/sub generator, built on
 * fVSADDSAT / fVSSUBSAT.  Emits:
 *   vadd<TYPE>sat, vadd<TYPE>sat_dv - element-wise add, single and double
 *                                     vector
 *   vsub<TYPE>sat, vsub<TYPE>sat_dv - element-wise subtract, single and
 *                                     double vector
 * The double-vector forms apply the same operation to v[0] and v[1].
 * The vsub descriptions previously read "Add & Saturate" (copy-paste); they
 * now say "Sub".  Semantics are unchanged.  The trailing continuation
 * backslash and the xref line-number prefixes have been dropped.
 */
#define MMVEC_ADDS_SAT(TYPE,TYPE2,DESCR, WIDTH,DEST,SRC)\
ITERATOR_INSN2_ANY_SLOT(WIDTH, vadd##TYPE##sat,                  "Vd32=vadd"TYPE2"(Vu32,Vv32):sat" ,    "Vd32."#DEST"=vadd(Vu32."#SRC",Vv32."#SRC"):sat",    "Vector Add & Saturate "DESCR,            VdV.DEST[i] = fVSADDSAT(WIDTH,  VuV.SRC[i],  VvV.SRC[i]))\
ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vadd##TYPE##sat_dv,    "Vdd32=vadd"TYPE2"(Vuu32,Vvv32):sat",  "Vdd32."#DEST"=vadd(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR,    VddV.v[0].DEST[i] = fVSADDSAT(WIDTH, VuuV.v[0].SRC[i], VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVSADDSAT(WIDTH, VuuV.v[1].SRC[i], VvvV.v[1].SRC[i]))\
ITERATOR_INSN2_ANY_SLOT(WIDTH, vsub##TYPE##sat,                  "Vd32=vsub"TYPE2"(Vu32,Vv32):sat",     "Vd32."#DEST"=vsub(Vu32."#SRC",Vv32."#SRC"):sat",    "Vector Sub & Saturate "DESCR,            VdV.DEST[i] = fVSSUBSAT(WIDTH,  VuV.SRC[i],  VvV.SRC[i]))\
ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vsub##TYPE##sat_dv,    "Vdd32=vsub"TYPE2"(Vuu32,Vvv32):sat",  "Vdd32."#DEST"=vsub(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Sub & Saturate "DESCR,    VddV.v[0].DEST[i] = fVSSUBSAT(WIDTH, VuuV.v[0].SRC[i], VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVSSUBSAT(WIDTH, VuuV.v[1].SRC[i], VvvV.v[1].SRC[i]))

/*
 * MMVEC_AVGU(TYPE,TYPE2,DESCR,WIDTH,DEST,SRC)
 * Emits two averaging instructions for unsigned elements:
 *   vavg<TYPE>    - Vd.DEST[i] = fVAVGU(WIDTH, Vu.SRC[i], Vv.SRC[i])
 *   vavg<TYPE>rnd - Vd.DEST[i] = fVAVGURND(WIDTH, Vu.SRC[i], Vv.SRC[i])
 * The :rnd description previously read "Vector Average % Round" with no
 * separator before DESCR; it now reads "Vector Average & Round ".
 * Semantics are unchanged.  The xref line-number prefixes have been dropped.
 */
#define MMVEC_AVGU(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\
ITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE,                        "Vd32=vavg"TYPE2"(Vu32,Vv32)",         "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")",        "Vector Average "DESCR,                                      VdV.DEST[i] = fVAVGU(   WIDTH,  VuV.SRC[i], VvV.SRC[i])) \
ITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE##rnd,                   "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd",     "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd",    "Vector Average & Round "DESCR,                              VdV.DEST[i] = fVAVGURND(WIDTH,  VuV.SRC[i], VvV.SRC[i]))
1125
1126
1127
/*
 * MMVEC_AVGS - signed element-wise average family.
 *
 * Emits V6_vavg<TYPE> (fVAVGS), V6_vavg<TYPE>rnd (fVAVGSRND) and
 * V6_vnavg<TYPE> (fVNAVGS, the "negative average" form).
 * TYPE tag suffix, TYPE2 legacy-syntax type string, DESCR description
 * suffix, WIDTH element width in bits, DEST/SRC vector field names.
 */
#define MMVEC_AVGS(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\
ITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE, \
    "Vd32=vavg"TYPE2"(Vu32,Vv32)", \
    "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")", \
    "Vector Average "DESCR, \
    VdV.DEST[i]  = fVAVGS(       WIDTH,  VuV.SRC[i], VvV.SRC[i])) \
ITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE##rnd, \
    "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd", \
    "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd", \
    "Vector Average & Round "DESCR, \
    VdV.DEST[i]  = fVAVGSRND(    WIDTH,  VuV.SRC[i], VvV.SRC[i])) \
ITERATOR_INSN2_ANY_SLOT(WIDTH,vnavg##TYPE, \
    "Vd32=vnavg"TYPE2"(Vu32,Vv32)", \
    "Vd32."#DEST"=vnavg(Vu32."#SRC",Vv32."#SRC")", \
    "Vector Negative Average "DESCR, \
    VdV.DEST[i]  = fVNAVGS(      WIDTH,  VuV.SRC[i], VvV.SRC[i]))
1132
1133
1134
1135
1136
1137
1138
/*
 * MMVEC_ADDWRAP - non-saturating element-wise add/subtract family.
 *
 * Emits V6_vadd<TYPE>, V6_vsub<TYPE>, V6_vadd<TYPE>_dv and V6_vsub<TYPE>_dv.
 * The result is a plain "+" / "-" assigned to the destination element;
 * no saturation helper is applied.
 * TYPE tag suffix, TYPE2 legacy-syntax type string, DESCR description
 * suffix, WIDTH element width in bits, DEST/SRC vector field names.
 * The _dv forms operate on both halves (v[0], v[1]) of a vector pair.
 */
#define MMVEC_ADDWRAP(TYPE,TYPE2, DESCR, WIDTH , DEST,SRC)\
ITERATOR_INSN2_ANY_SLOT(WIDTH, vadd##TYPE, \
    "Vd32=vadd"TYPE2"(Vu32,Vv32)" , \
    "Vd32."#DEST"=vadd(Vu32."#SRC",Vv32."#SRC")", \
    "Vector Add "DESCR, \
    VdV.DEST[i] =  VuV.SRC[i] +  VvV.SRC[i]) \
ITERATOR_INSN2_ANY_SLOT(WIDTH, vsub##TYPE, \
    "Vd32=vsub"TYPE2"(Vu32,Vv32)" , \
    "Vd32."#DEST"=vsub(Vu32."#SRC",Vv32."#SRC")", \
    "Vector Sub "DESCR, \
    VdV.DEST[i] =  VuV.SRC[i] -  VvV.SRC[i]) \
ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vadd##TYPE##_dv, \
    "Vdd32=vadd"TYPE2"(Vuu32,Vvv32)" , \
    "Vdd32."#DEST"=vadd(Vuu32."#SRC",Vvv32."#SRC")", \
    "Double Vector Add "DESCR, \
    VddV.v[0].DEST[i] = VuuV.v[0].SRC[i] + VvvV.v[0].SRC[i]; VddV.v[1].DEST[i] = VuuV.v[1].SRC[i] + VvvV.v[1].SRC[i]) \
ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vsub##TYPE##_dv, \
    "Vdd32=vsub"TYPE2"(Vuu32,Vvv32)" , \
    "Vdd32."#DEST"=vsub(Vuu32."#SRC",Vvv32."#SRC")", \
    "Double Vector Sub "DESCR, \
    VddV.v[0].DEST[i] = VuuV.v[0].SRC[i] - VvvV.v[0].SRC[i]; VddV.v[1].DEST[i] = VuuV.v[1].SRC[i] - VvvV.v[1].SRC[i])
1144
1145
1146
1147
1148
1149/* Wrapping Adds */
1150MMVEC_ADDWRAP(b,    "b",    "Byte",         8,   b, b)
1151MMVEC_ADDWRAP(h,    "h",    "Halfword",     16,  h, h)
1152MMVEC_ADDWRAP(w,    "w",    "Word",         32,   w,    w)
1153
1154/* Saturating Adds */
1155MMVEC_ADDU_SAT(ub, "ub",    "Unsigned Byte",        8,   ub,    ub)
1156MMVEC_ADDU_SAT(uh, "uh",    "Unsigned Halfword",    16,  uh,    uh)
1157MMVEC_ADDU_SAT(uw, "uw",    "Unsigned word",    32,  uw,    uw)
1158MMVEC_ADDS_SAT(b,  "b",     "byte",             8,  b,     b)
1159MMVEC_ADDS_SAT(h,  "h",     "Halfword",             16,  h,     h)
1160MMVEC_ADDS_SAT(w,  "w",     "Word",                 32,  w,     w)
1161
1162
1163/* Averaging Instructions */
1164MMVEC_AVGU(ub,"ub",     "Unsigned Byte",     8,   ub,   ub)
1165MMVEC_AVGU(uh,"uh",     "Unsigned Halfword", 16,  uh,   uh)
1166MMVEC_AVGU_NOV1(uw,"uw",     "Unsigned Word",     32,  uw,   uw)
1167MMVEC_AVGS_NOV1(b,   "b",    "Byte",               8,   b,   b)
1168MMVEC_AVGS(h,   "h",    "Halfword",          16,   h,   h)
1169MMVEC_AVGS(w,   "w",    "Word",              32,   w,   w)
1170
1171
1172/* Absolute Difference */
1173MMVEC_ABSDIFF(ub,"ub",  "Unsigned Byte",        8,   ub,    ub)
1174MMVEC_ABSDIFF(uh,"uh",  "Unsigned Halfword",    16,  uh,    uh)
1175MMVEC_ABSDIFF(h,"h",        "Halfword",             16,  uh,    h)
1176MMVEC_ABSDIFF(w,"w",        "Word",                 32,  uw,    w)
1177
1178ITERATOR_INSN2_ANY_SLOT(8,vnavgub, "Vd32=vnavgub(Vu32,Vv32)", "Vd32.b=vnavg(Vu32.ub,Vv32.ub)",
1179"Vector Negative Average Unsigned Byte", VdV.b[i]   = fVNAVGU(8, VuV.ub[i], VvV.ub[i]))
1180
1181ITERATOR_INSN_ANY_SLOT(32,vaddcarrysat,"Vd32.w=vadd(Vu32.w,Vv32.w,Qs4):carry:sat","add w/carry and saturate",
1182VdV.w[i] = fVSATW(VuV.w[i]+VvV.w[i]+fGETQBIT(QsV,i*4)))
1183
1184ITERATOR_INSN_ANY_SLOT(32,vaddcarry,"Vd32.w=vadd(Vu32.w,Vv32.w,Qx4):carry","add w/carry",
1185VdV.w[i] = VuV.w[i]+VvV.w[i]+fGETQBIT(QxV,i*4);
1186fSETQBITS(QxV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],VvV.w[i],fGETQBIT(QxV,i*4))))
1187
1188ITERATOR_INSN_ANY_SLOT(32,vsubcarry,"Vd32.w=vsub(Vu32.w,Vv32.w,Qx4):carry","add w/carry",
1189VdV.w[i] = VuV.w[i]+~VvV.w[i]+fGETQBIT(QxV,i*4);
1190fSETQBITS(QxV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],~VvV.w[i],fGETQBIT(QxV,i*4))))
1191
1192ITERATOR_INSN_ANY_SLOT(32,vaddcarryo,"Vd32.w,Qe4=vadd(Vu32.w,Vv32.w):carry","add w/carry out-only",
1193VdV.w[i] = VuV.w[i]+VvV.w[i];
1194fSETQBITS(QeV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],VvV.w[i],0)))
1195
1196ITERATOR_INSN_ANY_SLOT(32,vsubcarryo,"Vd32.w,Qe4=vsub(Vu32.w,Vv32.w):carry","subtract w/carry out-only",
1197VdV.w[i] = VuV.w[i]+~VvV.w[i]+1;
1198fSETQBITS(QeV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],~VvV.w[i],1)))
1199
1200
1201ITERATOR_INSN_ANY_SLOT(32,vsatdw,"Vd32.w=vsatdw(Vu32.w,Vv32.w)","Saturate from 64-bits (higher 32-bits come from first vector) to 32-bits",VdV.w[i] = fVSATDW(VuV.w[i],VvV.w[i]))
1202
1203
1204#define MMVEC_ADDSAT_MIX(TAGEND,SATF,WIDTH,DEST,SRC1,SRC2)\
1205ITERATOR_INSN_ANY_SLOT(WIDTH, vadd##TAGEND,"Vd32."#DEST"=vadd(Vu32."#SRC1",Vv32."#SRC2"):sat",    "Vector Add mixed", VdV.DEST[i] =  SATF(VuV.SRC1[i] +  VvV.SRC2[i]))\
1206ITERATOR_INSN_ANY_SLOT(WIDTH, vsub##TAGEND,"Vd32."#DEST"=vsub(Vu32."#SRC1",Vv32."#SRC2"):sat",    "Vector Sub mixed", VdV.DEST[i] =  SATF(VuV.SRC1[i] -  VvV.SRC2[i]))\
1207
1208MMVEC_ADDSAT_MIX(ububb_sat,fVSATUB,8,ub,ub,b)
1209
1210/****************************
1211*   WIDENING
1212****************************/
1213
1214
1215
1216
1217ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vaddubh,"Vdd32=vaddub(Vu32,Vv32)","Vdd32.h=vadd(Vu32.ub,Vv32.ub)",
1218"Vector addition with widen into two vectors",
1219    VddV.v[0].h[i] = fZE8_16(fGETUBYTE(0, VuV.uh[i])) + fZE8_16(fGETUBYTE(0, VvV.uh[i]));
1220    VddV.v[1].h[i] = fZE8_16(fGETUBYTE(1, VuV.uh[i])) + fZE8_16(fGETUBYTE(1, VvV.uh[i])))
1221
1222ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vsububh,"Vdd32=vsubub(Vu32,Vv32)","Vdd32.h=vsub(Vu32.ub,Vv32.ub)",
1223"Vector subtraction with widen into two vectors",
1224    VddV.v[0].h[i] = fZE8_16(fGETUBYTE(0, VuV.uh[i])) - fZE8_16(fGETUBYTE(0, VvV.uh[i]));
1225    VddV.v[1].h[i] = fZE8_16(fGETUBYTE(1, VuV.uh[i])) - fZE8_16(fGETUBYTE(1, VvV.uh[i])))
1226
1227
1228
1229ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vaddhw,"Vdd32=vaddh(Vu32,Vv32)","Vdd32.w=vadd(Vu32.h,Vv32.h)",
1230"Vector addition with widen into two vectors",
1231    VddV.v[0].w[i] = fGETHALF(0, VuV.w[i]) + fGETHALF(0, VvV.w[i]);
1232    VddV.v[1].w[i] = fGETHALF(1, VuV.w[i]) + fGETHALF(1, VvV.w[i]))
1233
1234ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vsubhw,"Vdd32=vsubh(Vu32,Vv32)","Vdd32.w=vsub(Vu32.h,Vv32.h)",
1235"Vector subtraction with widen into two vectors",
1236    VddV.v[0].w[i] = fGETHALF(0, VuV.w[i]) - fGETHALF(0, VvV.w[i]);
1237    VddV.v[1].w[i] = fGETHALF(1, VuV.w[i]) - fGETHALF(1, VvV.w[i]))
1238
1239
1240ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vadduhw,"Vdd32=vadduh(Vu32,Vv32)","Vdd32.w=vadd(Vu32.uh,Vv32.uh)",
1241"Vector addition with widen into two vectors",
1242    VddV.v[0].w[i] = fZE16_32(fGETUHALF(0, VuV.uw[i])) + fZE16_32(fGETUHALF(0, VvV.uw[i]));
1243    VddV.v[1].w[i] = fZE16_32(fGETUHALF(1, VuV.uw[i])) + fZE16_32(fGETUHALF(1, VvV.uw[i])))
1244
1245ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vsubuhw,"Vdd32=vsubuh(Vu32,Vv32)","Vdd32.w=vsub(Vu32.uh,Vv32.uh)",
1246"Vector subtraction with widen into two vectors",
1247    VddV.v[0].w[i] = fZE16_32(fGETUHALF(0, VuV.uw[i])) - fZE16_32(fGETUHALF(0, VvV.uw[i]));
1248    VddV.v[1].w[i] = fZE16_32(fGETUHALF(1, VuV.uw[i])) - fZE16_32(fGETUHALF(1, VvV.uw[i])))
1249
1250
1251
1252ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vaddhw_acc,"Vxx32+=vaddh(Vu32,Vv32)","Vxx32.w+=vadd(Vu32.h,Vv32.h)",
1253"Vector addition with widen into two vectors",
1254    VxxV.v[0].w[i] += fGETHALF(0, VuV.w[i]) + fGETHALF(0, VvV.w[i]);
1255    VxxV.v[1].w[i] += fGETHALF(1, VuV.w[i]) + fGETHALF(1, VvV.w[i]))
1256
1257ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vadduhw_acc,"Vxx32+=vadduh(Vu32,Vv32)","Vxx32.w+=vadd(Vu32.uh,Vv32.uh)",
1258"Vector addition with widen into two vectors",
1259    VxxV.v[0].w[i] += fGETUHALF(0, VuV.w[i]) + fGETUHALF(0, VvV.w[i]);
1260    VxxV.v[1].w[i] += fGETUHALF(1, VuV.w[i]) + fGETUHALF(1, VvV.w[i]))
1261
1262ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vaddubh_acc,"Vxx32+=vaddub(Vu32,Vv32)","Vxx32.h+=vadd(Vu32.ub,Vv32.ub)",
1263"Vector addition with widen into two vectors",
1264    VxxV.v[0].h[i] += fGETUBYTE(0, VuV.h[i]) + fGETUBYTE(0, VvV.h[i]);
1265    VxxV.v[1].h[i] += fGETUBYTE(1, VuV.h[i]) + fGETUBYTE(1, VvV.h[i]))
1266
1267
1268/****************************
1269*   Conditional
1270****************************/
1271
1272#define CONDADDSUB(WIDTH,TAGEND,LHSYN,RHSYN,DESCR,LHBEH,RHBEH) \
1273ITERATOR_INSN2_ANY_SLOT(WIDTH,vadd##TAGEND##q,"if (Qv4."#TAGEND") "LHSYN"+="RHSYN,"if (Qv4) "LHSYN"+="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH+RHBEH,LHBEH)) \
1274ITERATOR_INSN2_ANY_SLOT(WIDTH,vsub##TAGEND##q,"if (Qv4."#TAGEND") "LHSYN"-="RHSYN,"if (Qv4) "LHSYN"-="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH-RHBEH,LHBEH)) \
1275ITERATOR_INSN2_ANY_SLOT(WIDTH,vadd##TAGEND##nq,"if (!Qv4."#TAGEND") "LHSYN"+="RHSYN,"if (!Qv4) "LHSYN"+="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH,LHBEH+RHBEH)) \
1276ITERATOR_INSN2_ANY_SLOT(WIDTH,vsub##TAGEND##nq,"if (!Qv4."#TAGEND") "LHSYN"-="RHSYN,"if (!Qv4) "LHSYN"-="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH,LHBEH-RHBEH)) \
1277
1278CONDADDSUB(8,b,"Vx32.b","Vu32.b","Conditional add/sub Byte",VxV.ub[i],VuV.ub[i])
1279CONDADDSUB(16,h,"Vx32.h","Vu32.h","Conditional add/sub Half",VxV.h[i],VuV.h[i])
1280CONDADDSUB(32,w,"Vx32.w","Vu32.w","Conditional add/sub Word",VxV.w[i],VuV.w[i])
1281
1282/*****************************************************
1283 ABSOLUTE VALUES
1284*****************************************************/
1285// V65
1286ITERATOR_INSN2_ANY_SLOT_NOV1(8,vabsb,        "Vd32=vabsb(Vu32)",     "Vd32.b=vabs(Vu32.b)",     "Vector absolute value of bytes",    VdV.b[i]  =  fABS(VuV.b[i]))
1287ITERATOR_INSN2_ANY_SLOT_NOV1(8,vabsb_sat,    "Vd32=vabsb(Vu32):sat", "Vd32.b=vabs(Vu32.b):sat", "Vector absolute value of bytes",    VdV.b[i]  =  fVSATB(fABS(fSE8_16(VuV.b[i]))))
1288
1289
1290ITERATOR_INSN2_ANY_SLOT(16,vabsh,        "Vd32=vabsh(Vu32)",     "Vd32.h=vabs(Vu32.h)",     "Vector absolute value of halfwords",    VdV.h[i]  =  fABS(VuV.h[i]))
1291ITERATOR_INSN2_ANY_SLOT(16,vabsh_sat,    "Vd32=vabsh(Vu32):sat", "Vd32.h=vabs(Vu32.h):sat", "Vector absolute value of halfwords",    VdV.h[i]  =  fVSATH(fABS(fSE16_32(VuV.h[i]))))
1292ITERATOR_INSN2_ANY_SLOT(32,vabsw,        "Vd32=vabsw(Vu32)",     "Vd32.w=vabs(Vu32.w)",     "Vector absolute value of words",        VdV.w[i]  =  fABS(VuV.w[i]))
1293ITERATOR_INSN2_ANY_SLOT(32,vabsw_sat,    "Vd32=vabsw(Vu32):sat", "Vd32.w=vabs(Vu32.w):sat", "Vector absolute value of words",        VdV.w[i]  =  fVSATW(fABS(fSE32_64(VuV.w[i]))))
1294
1295
1296/**************************************************************************
1297 * MMVECTOR MULTIPLICATIONS
1298 * ************************************************************************/
1299
1300
1301/* Byte by Byte */
1302ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybv,"Vdd32=vmpyb(Vu32,Vv32)","Vdd32.h=vmpy(Vu32.b,Vv32.b)",
1303"Vector absolute value of words",
1304    VddV.v[0].h[i] =  fMPY8SS(fGETBYTE(0, VuV.h[i]), fGETBYTE(0, VvV.h[i]));
1305    VddV.v[1].h[i] =  fMPY8SS(fGETBYTE(1, VuV.h[i]), fGETBYTE(1, VvV.h[i])))
1306
1307ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybv_acc,"Vxx32+=vmpyb(Vu32,Vv32)","Vxx32.h+=vmpy(Vu32.b,Vv32.b)",
1308"Vector absolute value of words",
1309    VxxV.v[0].h[i] +=  fMPY8SS(fGETBYTE(0, VuV.h[i]), fGETBYTE(0, VvV.h[i]));
1310    VxxV.v[1].h[i] +=  fMPY8SS(fGETBYTE(1, VuV.h[i]), fGETBYTE(1, VvV.h[i])))
1311
1312
1313ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyubv,"Vdd32=vmpyub(Vu32,Vv32)","Vdd32.uh=vmpy(Vu32.ub,Vv32.ub)",
1314"Vector absolute value of words",
1315    VddV.v[0].uh[i] =  fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE(0, VvV.uh[i]) );
1316    VddV.v[1].uh[i] =  fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE(1, VvV.uh[i]) ))
1317
1318ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyubv_acc,"Vxx32+=vmpyub(Vu32,Vv32)","Vxx32.uh+=vmpy(Vu32.ub,Vv32.ub)",
1319"Vector absolute value of words",
1320    VxxV.v[0].uh[i] +=  fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE(0, VvV.uh[i]) );
1321    VxxV.v[1].uh[i] +=  fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE(1, VvV.uh[i]) ))
1322
1323
1324
1325
1326
1327
1328ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybusv,"Vdd32=vmpybus(Vu32,Vv32)","Vdd32.h=vmpy(Vu32.ub,Vv32.b)",
1329"Vector absolute value of words",
1330    VddV.v[0].h[i]  = fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE(0, VvV.h[i]));
1331    VddV.v[1].h[i]  = fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE(1, VvV.h[i])))
1332
1333ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybusv_acc,"Vxx32+=vmpybus(Vu32,Vv32)","Vxx32.h+=vmpy(Vu32.ub,Vv32.b)",
1334"Vector absolute value of words",
1335    VxxV.v[0].h[i]  += fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE(0, VvV.h[i]));
1336    VxxV.v[1].h[i]  += fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE(1, VvV.h[i])))
1337
1338
1339
1340
1341ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabusv,"Vdd32=vmpabus(Vuu32,Vvv32)","Vdd32.h=vmpa(Vuu32.ub,Vvv32.b)",
1342"Vertical Byte Multiply",
1343    VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETBYTE(0, VvvV.v[0].uh[i])) + fMPY8US(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETBYTE(0, VvvV.v[1].uh[i]));
1344    VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETBYTE(1, VvvV.v[0].uh[i])) + fMPY8US(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETBYTE(1, VvvV.v[1].uh[i])))
1345
1346ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabuuv,"Vdd32=vmpabuu(Vuu32,Vvv32)","Vdd32.h=vmpa(Vuu32.ub,Vvv32.ub)",
1347"Vertical Byte Multiply",
1348    VddV.v[0].h[i] = fMPY8UU(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETUBYTE(0, VvvV.v[0].uh[i])) + fMPY8UU(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETUBYTE(0, VvvV.v[1].uh[i]));
1349    VddV.v[1].h[i] = fMPY8UU(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETUBYTE(1, VvvV.v[0].uh[i])) + fMPY8UU(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETUBYTE(1, VvvV.v[1].uh[i])))
1350
1351
1352
1353
1354
1355
1356
1357ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhv,"Vdd32=vmpyh(Vu32,Vv32)","Vdd32.w=vmpy(Vu32.h,Vv32.h)",
1358"Vector by Vector Halfword Multiply",
1359    VddV.v[0].w[i] = fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, VvV.w[i]));
1360    VddV.v[1].w[i] = fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, VvV.w[i])))
1361
1362ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhv_acc,"Vxx32+=vmpyh(Vu32,Vv32)","Vxx32.w+=vmpy(Vu32.h,Vv32.h)",
1363"Vector by Vector Halfword Multiply",
1364    VxxV.v[0].w[i] += fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, VvV.w[i]));
1365    VxxV.v[1].w[i] += fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, VvV.w[i])))
1366
1367ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuhv,"Vdd32=vmpyuh(Vu32,Vv32)","Vdd32.uw=vmpy(Vu32.uh,Vv32.uh)",
1368"Vector by Vector Unsigned Halfword Multiply",
1369    VddV.v[0].uw[i] = fMPY16UU(fGETUHALF(0, VuV.uw[i]), fGETUHALF(0, VvV.uw[i]));
1370    VddV.v[1].uw[i] = fMPY16UU(fGETUHALF(1, VuV.uw[i]), fGETUHALF(1, VvV.uw[i])))
1371
1372ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuhv_acc,"Vxx32+=vmpyuh(Vu32,Vv32)","Vxx32.uw+=vmpy(Vu32.uh,Vv32.uh)",
1373"Vector by Vector Unsigned Halfword Multiply",
1374    VxxV.v[0].uw[i] += fMPY16UU(fGETUHALF(0, VuV.uw[i]), fGETUHALF(0, VvV.uw[i]));
1375    VxxV.v[1].uw[i] += fMPY16UU(fGETUHALF(1, VuV.uw[i]), fGETUHALF(1, VvV.uw[i])))
1376
1377
1378
1379/* Vector by Vector */
1380ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyhvsrs,"Vd32=vmpyh(Vu32,Vv32):<<1:rnd:sat","Vd32.h=vmpy(Vu32.h,Vv32.h):<<1:rnd:sat",
1381"Vector halfword multiply with round, shift, and sat16",
1382    VdV.h[i] = fVSATH(fGETHALF(1,fVSAT(fROUND((fMPY16SS(VuV.h[i],VvV.h[i]    )<<1))))))
1383
1384
1385
1386ITERATOR_INSN_MPY_SLOT(16,vmpyuhvs, "Vd32.uh=vmpy(Vu32.uh,Vv32.uh):>>16",
1387"Vector by Vector Unsigned Halfword Multiply with 16 bit rightshift",
1388    VdV.uh[i] = fGETUHALF(1,fMPY16UU(VuV.uh[i],VvV.uh[i])))
1389
1390
1391ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhus, "Vdd32=vmpyhus(Vu32,Vv32)","Vdd32.w=vmpy(Vu32.h,Vv32.uh)",
1392"Vector by Vector Halfword Multiply",
1393    VddV.v[0].w[i] = fMPY16SU(fGETHALF(0, VuV.w[i]), fGETUHALF(0, VvV.uw[i]));
1394    VddV.v[1].w[i] = fMPY16SU(fGETHALF(1, VuV.w[i]), fGETUHALF(1, VvV.uw[i])))
1395
1396
1397ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhus_acc, "Vxx32+=vmpyhus(Vu32,Vv32)","Vxx32.w+=vmpy(Vu32.h,Vv32.uh)",
1398"Vector by Vector Halfword Multiply",
1399    VxxV.v[0].w[i] += fMPY16SU(fGETHALF(0, VuV.w[i]), fGETUHALF(0, VvV.uw[i]));
1400    VxxV.v[1].w[i] += fMPY16SU(fGETHALF(1, VuV.w[i]), fGETUHALF(1, VvV.uw[i])))
1401
1402
1403
1404
1405ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyih,"Vd32=vmpyih(Vu32,Vv32)","Vd32.h=vmpyi(Vu32.h,Vv32.h)",
1406"Vector by Vector Halfword Multiply",
1407    VdV.h[i] = fMPY16SS(VuV.h[i], VvV.h[i]))
1408
1409ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyih_acc,"Vx32+=vmpyih(Vu32,Vv32)","Vx32.h+=vmpyi(Vu32.h,Vv32.h)",
1410"Vector by Vector Halfword Multiply",
1411    VxV.h[i] += fMPY16SS(VuV.h[i], VvV.h[i]))
1412
1413
1414
1415/* 32x32 high half / frac */
1416
1417
1418ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyewuh,"Vd32=vmpyewuh(Vu32,Vv32)","Vd32.w=vmpye(Vu32.w,Vv32.uh)",
1419"Vector by Vector Halfword Multiply",
1420VdV.w[i] = fMPY3216SU(VuV.w[i], fGETUHALF(0, VvV.w[i])) >> 16)
1421
1422ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh,"Vd32=vmpyowh(Vu32,Vv32):<<1:sat","Vd32.w=vmpyo(Vu32.w,Vv32.h):<<1:sat",
1423"Vector by Vector Halfword Multiply",
1424VdV.w[i] = fVSATW((((fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i])) >> 14) + 0) >> 1)))
1425
1426ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_rnd,"Vd32=vmpyowh(Vu32,Vv32):<<1:rnd:sat","Vd32.w=vmpyo(Vu32.w,Vv32.h):<<1:rnd:sat",
1427"Vector by Vector Halfword Multiply",
1428VdV.w[i] = fVSATW((((fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i])) >> 14) + 1) >> 1)))
1429
1430ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32,vmpyewuh_64,"Vdd32=vmpye(Vu32.w,Vv32.uh)",
1431"Word times Halfword Multiply, 64-bit result",
1432	fHIDE(size8s_t prod;)
1433	prod = fMPY32SU(VuV.w[i],fGETUHALF(0,VvV.w[i]));
1434	VddV.v[1].w[i] = prod >> 16;
1435	VddV.v[0].w[i] = prod << 16)
1436
1437ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_64_acc,"Vxx32+=vmpyo(Vu32.w,Vv32.h)",
1438"Word times Halfword Multiply, 64-bit result",
1439	fHIDE(size8s_t prod;)
1440	prod = fMPY32SS(VuV.w[i],fGETHALF(1,VvV.w[i]))  + fSE32_64(VxxV.v[1].w[i]);
1441	VxxV.v[1].w[i] = prod >> 16;
1442	fSETHALF(0, VxxV.v[0].w[i], VxxV.v[0].w[i] >> 16);
1443	fSETHALF(1, VxxV.v[0].w[i], prod & 0x0000ffff))
1444
1445ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_sacc,"Vx32+=vmpyowh(Vu32,Vv32):<<1:sat:shift","Vx32.w+=vmpyo(Vu32.w,Vv32.h):<<1:sat:shift",
1446"Vector by Vector Halfword Multiply",
1447IV1DEAD() VxV.w[i] = fVSATW(((((VxV.w[i] + fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i]))) >> 14) + 0) >> 1)))
1448
1449ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_rnd_sacc,"Vx32+=vmpyowh(Vu32,Vv32):<<1:rnd:sat:shift","Vx32.w+=vmpyo(Vu32.w,Vv32.h):<<1:rnd:sat:shift",
1450"Vector by Vector Halfword Multiply",
1451IV1DEAD() VxV.w[i] = fVSATW(((((VxV.w[i] + fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i]))) >> 14) + 1) >> 1)))
1452
1453/* For 32x32 integer / low half */
1454
1455ITERATOR_INSN_MPY_SLOT(32,vmpyieoh,"Vd32.w=vmpyieo(Vu32.h,Vv32.h)","Odd/Even multiply for 32x32 low half",
1456	VdV.w[i] = (fGETHALF(0,VuV.w[i])*fGETHALF(1,VvV.w[i])) << 16)
1457
1458ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiewuh,"Vd32=vmpyiewuh(Vu32,Vv32)","Vd32.w=vmpyie(Vu32.w,Vv32.uh)",
1459"Vector by Vector Word by Halfword Multiply",
1460IV1DEAD()    VdV.w[i] = fMPY3216SU(VuV.w[i], fGETUHALF(0, VvV.w[i])) )
1461
1462ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiowh,"Vd32=vmpyiowh(Vu32,Vv32)","Vd32.w=vmpyio(Vu32.w,Vv32.h)",
1463"Vector by Vector Word by Halfword Multiply",
1464IV1DEAD()    VdV.w[i] = fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i])) )
1465
1466/* Add back these... */
1467
1468ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiewh_acc,"Vx32+=vmpyiewh(Vu32,Vv32)","Vx32.w+=vmpyie(Vu32.w,Vv32.h)",
1469"Vector by Vector Word by Halfword Multiply",
1470VxV.w[i] = VxV.w[i] + fMPY3216SS(VuV.w[i], fGETHALF(0, VvV.w[i])) )
1471
1472ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiewuh_acc,"Vx32+=vmpyiewuh(Vu32,Vv32)","Vx32.w+=vmpyie(Vu32.w,Vv32.uh)",
1473"Vector by Vector Word by Halfword Multiply",
1474VxV.w[i] = VxV.w[i] + fMPY3216SU(VuV.w[i], fGETUHALF(0, VvV.w[i])) )
1475
1476
1477
1478
1479
1480
1481
1482/* Vector by Scalar */
1483ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyub,"Vdd32=vmpyub(Vu32,Rt32)","Vdd32.uh=vmpy(Vu32.ub,Rt32.ub)",
1484"Vector absolute value of words",
1485    VddV.v[0].uh[i]  = fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE((2*i+0)%4, RtV));
1486    VddV.v[1].uh[i]  = fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE((2*i+1)%4, RtV)))
1487
1488ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyub_acc,"Vxx32+=vmpyub(Vu32,Rt32)","Vxx32.uh+=vmpy(Vu32.ub,Rt32.ub)",
1489"Vector absolute value of words",
1490    VxxV.v[0].uh[i] += fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE((2*i+0)%4, RtV));
1491    VxxV.v[1].uh[i] += fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE((2*i+1)%4, RtV)))
1492
1493
1494ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybus,"Vdd32=vmpybus(Vu32,Rt32)","Vdd32.h=vmpy(Vu32.ub,Rt32.b)",
1495"Vector absolute value of words",
1496    VddV.v[0].h[i]  = fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i+0)%4, RtV));
1497    VddV.v[1].h[i]  = fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV)))
1498
1499ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybus_acc,"Vxx32+=vmpybus(Vu32,Rt32)","Vxx32.h+=vmpy(Vu32.ub,Rt32.b)",
1500"Vector absolute value of words",
1501    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i+0)%4, RtV));
1502    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV)))
1503
1504
1505ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabus,"Vdd32=vmpabus(Vuu32,Rt32)","Vdd32.h=vmpa(Vuu32.ub,Rt32.b)",
1506"Vertical Byte Multiply",
1507    VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETBYTE(0, RtV)) + fMPY16SS(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETBYTE(1, RtV));
1508    VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETBYTE(2, RtV)) + fMPY16SS(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETBYTE(3, RtV)))
1509
1510ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabus_acc,"Vxx32+=vmpabus(Vuu32,Rt32)","Vxx32.h+=vmpa(Vuu32.ub,Rt32.b)",
1511"Vertical Byte Multiply",
1512    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETBYTE(0, RtV)) + fMPY16SS(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETBYTE(1, RtV));
1513    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETBYTE(2, RtV)) + fMPY16SS(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETBYTE(3, RtV)))
1514
1515// V65
1516
1517ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(16,vmpabuu,"Vdd32=vmpabuu(Vuu32,Rt32)","Vdd32.h=vmpa(Vuu32.ub,Rt32.ub)",
1518"Vertical Byte Multiply",
1519    VddV.v[0].uh[i] = fMPY8UU(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETUBYTE(0, RtV)) + fMPY8UU(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETUBYTE(1, RtV));
1520    VddV.v[1].uh[i] = fMPY8UU(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETUBYTE(2, RtV)) + fMPY8UU(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETUBYTE(3, RtV)))
1521
1522ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(16,vmpabuu_acc,"Vxx32+=vmpabuu(Vuu32,Rt32)","Vxx32.h+=vmpa(Vuu32.ub,Rt32.ub)",
1523"Vertical Byte Multiply",
1524    VxxV.v[0].uh[i] += fMPY8UU(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETUBYTE(0, RtV)) + fMPY8UU(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETUBYTE(1, RtV));
1525    VxxV.v[1].uh[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETUBYTE(2, RtV)) + fMPY8UU(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETUBYTE(3, RtV)))
1526
1527
1528
1529
1530/* Half by Byte */
1531ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpahb,"Vdd32=vmpahb(Vuu32,Rt32)","Vdd32.w=vmpa(Vuu32.h,Rt32.b)",
1532"Vertical Byte Multiply",
1533    VddV.v[0].w[i] = fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV)));
1534    VddV.v[1].w[i] = fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16SS(fGETHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV))))
1535
1536ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpahb_acc,"Vxx32+=vmpahb(Vuu32,Rt32)","Vxx32.w+=vmpa(Vuu32.h,Rt32.b)",
1537"Vertical Byte Multiply",
1538    VxxV.v[0].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV)));
1539    VxxV.v[1].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16SS(fGETHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV))))
1540
1541/* Half by Byte */
1542ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpauhb,"Vdd32=vmpauhb(Vuu32,Rt32)","Vdd32.w=vmpa(Vuu32.uh,Rt32.b)",
1543"Vertical Byte Multiply",
1544    VddV.v[0].w[i] = fMPY16US(fGETUHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16US(fGETUHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV)));
1545    VddV.v[1].w[i] = fMPY16US(fGETUHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16US(fGETUHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV))))
1546
1547ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpauhb_acc,"Vxx32+=vmpauhb(Vuu32,Rt32)","Vxx32.w+=vmpa(Vuu32.uh,Rt32.b)",
1548"Vertical Byte Multiply",
1549    VxxV.v[0].w[i] += fMPY16US(fGETUHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16US(fGETUHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV)));
1550    VxxV.v[1].w[i] += fMPY16US(fGETUHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16US(fGETUHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV))))
1551
1552
1553
1554
1555
1556
1557
1558/* Half by Half */
1559ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyh,"Vdd32=vmpyh(Vu32,Rt32)","Vdd32.w=vmpy(Vu32.h,Rt32.h)",
1560"Vector absolute value of words",
1561    VddV.v[0].w[i] =  fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, RtV));
1562    VddV.v[1].w[i] =  fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, RtV)))
1563
1564ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(32,vmpyh_acc,"Vxx32+=vmpyh(Vu32,Rt32)","Vxx32.w+=vmpy(Vu32.h,Rt32.h)",
1565"Vector even halfwords with scalar lower halfword multiply with shift and sat32",
1566    VxxV.v[0].w[i] =  fCAST8s(VxxV.v[0].w[i]) + fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, RtV));
1567    VxxV.v[1].w[i] =  fCAST8s(VxxV.v[1].w[i]) + fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, RtV)))
1568
1569
1570ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhsat_acc,"Vxx32+=vmpyh(Vu32,Rt32):sat","Vxx32.w+=vmpy(Vu32.h,Rt32.h):sat",
1571"Vector even halfwords with scalar lower halfword multiply with shift and sat32",
1572    VxxV.v[0].w[i] =  fVSATW(fCAST8s(VxxV.v[0].w[i]) + fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, RtV)));
1573    VxxV.v[1].w[i] =  fVSATW(fCAST8s(VxxV.v[1].w[i]) + fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, RtV))))
1574
1575
1576
1577ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhss,"Vd32=vmpyh(Vu32,Rt32):<<1:sat","Vd32.h=vmpy(Vu32.h,Rt32.h):<<1:sat",
1578"Vector halfword by halfword multiply, shift by 1, and take upper 16 msb",
1579          fSETHALF(0,VdV.w[i],fVSATH(fGETHALF(1,fVSAT((fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0,RtV))<<1)))));
1580          fSETHALF(1,VdV.w[i],fVSATH(fGETHALF(1,fVSAT((fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1,RtV))<<1)))));
1581)
1582
1583ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhsrs,"Vd32=vmpyh(Vu32,Rt32):<<1:rnd:sat","Vd32.h=vmpy(Vu32.h,Rt32.h):<<1:rnd:sat",
1584"Vector halfword with scalar halfword multiply with round, shift, and sat16",
1585       fSETHALF(0,VdV.w[i],fVSATH(fGETHALF(1,fVSAT(fROUND((fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0,RtV))<<1))))));
1586       fSETHALF(1,VdV.w[i],fVSATH(fGETHALF(1,fVSAT(fROUND((fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1,RtV))<<1))))));
1587)
1588
1589
1590ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuh,"Vdd32=vmpyuh(Vu32,Rt32)","Vdd32.uw=vmpy(Vu32.uh,Rt32.uh)",
1591"Vector even halfword unsigned multiply by scalar",
1592    VddV.v[0].uw[i] = fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV));
1593    VddV.v[1].uw[i] = fMPY16UU(fGETUHALF(1, VuV.uw[i]),fGETUHALF(1,RtV)))
1594
1595
1596ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuh_acc,"Vxx32+=vmpyuh(Vu32,Rt32)","Vxx32.uw+=vmpy(Vu32.uh,Rt32.uh)",
1597"Vector even halfword unsigned multiply by scalar",
1598    VxxV.v[0].uw[i] += fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV));
1599    VxxV.v[1].uw[i] += fMPY16UU(fGETUHALF(1, VuV.uw[i]),fGETUHALF(1,RtV)))
1600
1601
1602
1603
1604/********************************************
1605*  HALF BY BYTE
1606********************************************/
1607ITERATOR_INSN2_MPY_SLOT(16,vmpyihb,"Vd32=vmpyihb(Vu32,Rt32)","Vd32.h=vmpyi(Vu32.h,Rt32.b)",
1608"Vector word by byte multiply, keep lower result",
1609VdV.h[i]  = fMPY16SS(VuV.h[i], fGETBYTE(i % 4, RtV) ))
1610
1611ITERATOR_INSN2_MPY_SLOT(16,vmpyihb_acc,"Vx32+=vmpyihb(Vu32,Rt32)","Vx32.h+=vmpyi(Vu32.h,Rt32.b)",
1612"Vector word by byte multiply, keep lower result",
1613VxV.h[i] += fMPY16SS(VuV.h[i], fGETBYTE(i % 4, RtV) ))
1614
1615
1616/********************************************
1617*  WORD BY BYTE
1618********************************************/
1619ITERATOR_INSN2_MPY_SLOT(32,vmpyiwb,"Vd32=vmpyiwb(Vu32,Rt32)","Vd32.w=vmpyi(Vu32.w,Rt32.b)",
1620"Vector word by byte multiply, keep lower result",
1621VdV.w[i]  = fMPY32SS(VuV.w[i], fGETBYTE(i % 4, RtV) ))
1622
1623ITERATOR_INSN2_MPY_SLOT(32,vmpyiwb_acc,"Vx32+=vmpyiwb(Vu32,Rt32)","Vx32.w+=vmpyi(Vu32.w,Rt32.b)",
1624"Vector word by byte multiply, keep lower result",
1625VxV.w[i] += fMPY32SS(VuV.w[i], fGETBYTE(i % 4, RtV) ))
1626
1627ITERATOR_INSN2_MPY_SLOT(32,vmpyiwub,"Vd32=vmpyiwub(Vu32,Rt32)","Vd32.w=vmpyi(Vu32.w,Rt32.ub)",
1628"Vector word by byte multiply, keep lower result",
1629VdV.w[i]  = fMPY32SS(VuV.w[i], fGETUBYTE(i % 4, RtV) ))
1630
1631ITERATOR_INSN2_MPY_SLOT(32,vmpyiwub_acc,"Vx32+=vmpyiwub(Vu32,Rt32)","Vx32.w+=vmpyi(Vu32.w,Rt32.ub)",
1632"Vector word by byte multiply, keep lower result",
1633VxV.w[i] += fMPY32SS(VuV.w[i], fGETUBYTE(i % 4, RtV) ))
1634
1635
1636/********************************************
1637*  WORD BY HALF
1638********************************************/
1639ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiwh,"Vd32=vmpyiwh(Vu32,Rt32)","Vd32.w=vmpyi(Vu32.w,Rt32.h)",
1640"Vector word by byte multiply, keep lower result",
1641VdV.w[i]  = fMPY32SS(VuV.w[i], fGETHALF(i % 2, RtV)))
1642
1643ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiwh_acc,"Vx32+=vmpyiwh(Vu32,Rt32)","Vx32.w+=vmpyi(Vu32.w,Rt32.h)",
1644"Vector word by byte multiply, keep lower result",
1645VxV.w[i] += fMPY32SS(VuV.w[i], fGETHALF(i % 2, RtV)))
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665/**************************************************************************
1666 * MMVECTOR LOGICAL OPERATIONS
1667 * ************************************************************************/
1668ITERATOR_INSN_ANY_SLOT(16,vand,"Vd32=vand(Vu32,Vv32)", "Vector Logical And", VdV.uh[i] = VuV.uh[i] & VvV.h[i])
1669ITERATOR_INSN_ANY_SLOT(16,vor, "Vd32=vor(Vu32,Vv32)",  "Vector Logical Or", VdV.uh[i] = VuV.uh[i] | VvV.h[i])
1670ITERATOR_INSN_ANY_SLOT(16,vxor,"Vd32=vxor(Vu32,Vv32)", "Vector Logical XOR",    VdV.uh[i] = VuV.uh[i] ^ VvV.h[i])
1671ITERATOR_INSN_ANY_SLOT(16,vnot,"Vd32=vnot(Vu32)",     "Vector Logical NOT", VdV.uh[i] = ~VuV.uh[i])
1672
1673
1674
1675
1676
1677ITERATOR_INSN2_MPY_SLOT_LATE(8, vandqrt,
1678"Vd32.ub=vand(Qu4.ub,Rt32.ub)", "Vd32=vand(Qu4,Rt32)", "Insert Predicate into Vector",
1679    VdV.ub[i] = fGETQBIT(QuV,i) ? fGETUBYTE(i % 4, RtV) : 0)
1680
1681ITERATOR_INSN2_MPY_SLOT_LATE(8, vandqrt_acc,
1682"Vx32.ub|=vand(Qu4.ub,Rt32.ub)", "Vx32|=vand(Qu4,Rt32)",  "Insert Predicate into Vector",
1683    VxV.ub[i] |= (fGETQBIT(QuV,i)) ? fGETUBYTE(i % 4, RtV) : 0)
1684
1685ITERATOR_INSN2_MPY_SLOT_LATE(8, vandnqrt,
1686"Vd32.ub=vand(!Qu4.ub,Rt32.ub)", "Vd32=vand(!Qu4,Rt32)", "Insert Predicate into Vector",
1687    VdV.ub[i] = !fGETQBIT(QuV,i) ? fGETUBYTE(i % 4, RtV) : 0)
1688
1689ITERATOR_INSN2_MPY_SLOT_LATE(8, vandnqrt_acc,
1690"Vx32.ub|=vand(!Qu4.ub,Rt32.ub)", "Vx32|=vand(!Qu4,Rt32)",  "Insert Predicate into Vector",
1691    VxV.ub[i] |= !(fGETQBIT(QuV,i)) ? fGETUBYTE(i % 4, RtV) : 0)
1692
1693
/*
 * Vector-to-predicate "and" with a scalar.  For byte lane i, VuV.ub[i] is
 * ANDed with byte (i % 4) of RtV; predicate bit i becomes 1 when that result
 * is non-zero and 0 otherwise.  The _acc form ORs the new bit with the bit
 * already held in QxV.
 */
1694ITERATOR_INSN2_MPY_SLOT_LATE(8, vandvrt,
1695"Qd4.ub=vand(Vu32.ub,Rt32.ub)", "Qd4=vand(Vu32,Rt32)", "Insert into Predicate",
1696    fSETQBIT(QdV,i,((VuV.ub[i] & fGETUBYTE(i % 4, RtV)) != 0) ? 1 : 0))
1697
1698ITERATOR_INSN2_MPY_SLOT_LATE(8, vandvrt_acc,
1699"Qx4.ub|=vand(Vu32.ub,Rt32.ub)", "Qx4|=vand(Vu32,Rt32)", "Insert into Predicate ",
1700    fSETQBIT(QxV,i,fGETQBIT(QxV,i)|(((VuV.ub[i] & fGETUBYTE(i % 4, RtV)) != 0) ? 1 : 0)))
1701
/*
 * Byte masking by predicate: vandvqv copies VuV.b[i] where bit i of QvV is
 * set and writes 0 elsewhere; vandvnqv does the same on the negated bit.
 */
1702ITERATOR_INSN_ANY_SLOT(8,vandvqv,"Vd32=vand(Qv4,Vu32)","Mask off bytes",
1703VdV.b[i] = fGETQBIT(QvV,i) ? VuV.b[i] : 0)
1704ITERATOR_INSN_ANY_SLOT(8,vandvnqv,"Vd32=vand(!Qv4,Vu32)","Mask off bytes",
1705VdV.b[i] = !fGETQBIT(QvV,i) ? VuV.b[i] : 0)
1706
1707
1708 /***************************************************
1709 * Compare Vector with Vector
1710 ***************************************************/
/*
 * VCMP: shared body of the vector compare instructions.
 *   DEST         predicate register written through fSETQBITS
 *   ASRC ASRCOP  optional left operand and operator combined with the fresh
 *                compare mask (both empty for a plain assignment)
 *   CMP          comparison operator applied to VuV.SRC[] and VvV.SRC[]
 *   N            not referenced by the expansion
 *   SRC          element view used for both vector operands
 *   MASK         value produced for a true compare (0 otherwise)
 *   WIDTH        step of the byte index i; the element index is i/WIDTH
 * The loop walks i from 0 up to fVBYTES() in steps of WIDTH.
 */
1711#define VCMP(DEST, ASRC, ASRCOP, CMP, N, SRC, MASK, WIDTH)        \
1712{ \
1713       for(fHIDE(int) i = 0; i < fVBYTES(); i += WIDTH) { \
1714		fSETQBITS(DEST,WIDTH,MASK,i,ASRC ASRCOP ((VuV.SRC[i/WIDTH] CMP VvV.SRC[i/WIDTH]) ? MASK : 0)); \
1715    } \
1716       }
1717
1718
/*
 * MMVEC_CMPGT: emits the four "greater than" compare instructions for one
 * element type: plain (writes QdV) and the &=, |=, ^= forms, which combine
 * the compare mask with the current bits of QxV read via fGETQBITS.
 *   TYPE   suffix pasted into the instruction tag (V6_vgt<TYPE>...)
 *   TYPE2  string spliced into the assembly syntax
 *   TYPE3  not referenced by the expansion
 *   DESCR  description prefix; " greater than..." is appended to it
 *   N, MASK, WIDTH, SRC  forwarded unchanged to VCMP
 */
1719#define MMVEC_CMPGT(TYPE,TYPE2,TYPE3,DESCR,N,MASK,WIDTH,SRC) \
1720EXTINSN(V6_vgt##TYPE,       "Qd4=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")",  ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than", \
1721	VCMP(QdV, , , >, N, SRC, MASK, WIDTH)) \
1722EXTINSN(V6_vgt##TYPE##_and, "Qx4&=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than with predicate-and", \
1723	VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), &, >, N, SRC, MASK, WIDTH)) \
1724EXTINSN(V6_vgt##TYPE##_or,  "Qx4|=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than with predicate-or", \
1725	VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), |, >, N, SRC, MASK, WIDTH)) \
1726EXTINSN(V6_vgt##TYPE##_xor, "Qx4^=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than with predicate-xor", \
1727	VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), ^, >, N, SRC, MASK, WIDTH))
1728
/*
 * MMVEC_CMP: emits the full compare family for one element type.  It first
 * expands MMVEC_CMPGT (the four "greater than" forms) and then adds the four
 * "equal to" forms: plain (writes QdV) and the &=, |=, ^= forms, which
 * combine the compare mask with the current bits of QxV.
 * Parameters are the same as MMVEC_CMPGT and are forwarded unchanged.
 */
1729#define MMVEC_CMP(TYPE,TYPE2,TYPE3,DESCR,N,MASK, WIDTH, SRC)\
1730MMVEC_CMPGT(TYPE,TYPE2,TYPE3,DESCR,N,MASK,WIDTH,SRC) \
1731EXTINSN(V6_veq##TYPE,       "Qd4=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")",  ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equal to", \
1732	VCMP(QdV, , , ==, N, SRC, MASK, WIDTH)) \
1733EXTINSN(V6_veq##TYPE##_and, "Qx4&=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equal to with predicate-and", \
1734	VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), &, ==, N, SRC, MASK, WIDTH)) \
1735EXTINSN(V6_veq##TYPE##_or,  "Qx4|=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equal to with predicate-or", \
1736	VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), |, ==, N, SRC, MASK, WIDTH)) \
1737EXTINSN(V6_veq##TYPE##_xor, "Qx4^=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equal to with predicate-xor", \
1738	VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), ^, ==, N, SRC, MASK, WIDTH))
1739
1740
/*
 * Compare instantiations.  Signed word/half/byte get both the gt and eq
 * families; the unsigned types only get the gt family.  MASK and WIDTH scale
 * with the element size in bytes: 0xF/4 for words, 0x3/2 for halves, 0x1/1
 * for bytes.
 */
1741MMVEC_CMP(w,"w","","Vector Word Compare ", fVELEM(32), 0xF, 4, w)
1742MMVEC_CMP(h,"h","","Vector Half Compare ", fVELEM(16), 0x3, 2, h)
1743MMVEC_CMP(b,"b","","Vector Byte Compare ", fVELEM(8),  0x1, 1, b)
1744MMVEC_CMPGT(uw,"uw","","Vector Unsigned Word Compare ", fVELEM(32), 0xF, 4,uw)
1745MMVEC_CMPGT(uh,"uh","","Vector Unsigned Half Compare ", fVELEM(16), 0x3, 2,uh)
1746MMVEC_CMPGT(ub,"ub","","Vector Unsigned Byte Compare ", fVELEM(8),  0x1, 1,ub)
1747
1748/***************************************************
1749* Predicate Operations
1750***************************************************/
1751
/*
 * vsetq: predicate bit i is set when i < (RtV & (fVBYTES()-1)), for every i
 * in [0, fVBYTES()).  A masked count of 0 therefore clears every bit.
 */
1752EXTINSN(V6_pred_scalar2, "Qd4=vsetq(Rt32)",         ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),   "Set Vector Predicate ",
1753{
1754    fHIDE(int i;)
1755    for(i = 0; i < fVBYTES(); i++) fSETQBIT(QdV,i,(i < (RtV & (fVBYTES()-1))) ? 1 : 0);
1756})
1757
/*
 * vsetq2: predicate bit i is set when i <= ((RtV-1) & (fVBYTES()-1)), for
 * every i in [0, fVBYTES()).  Unlike vsetq, the bound is derived from RtV-1,
 * so bit 0 is always set.
 */
1758EXTINSN(V6_pred_scalar2v2, "Qd4=vsetq2(Rt32)",         ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),   "Set Vector Predicate ",
1759{
1760    fHIDE(int i;)
1761    for(i = 0; i < fVBYTES(); i++) fSETQBIT(QdV,i,(i <= ((RtV-1) & (fVBYTES()-1))) ? 1 : 0);
1762})
1763
1764
/*
 * Predicate-register operations, one predicate bit per iteration.
 *   shuffeqw  bit i comes from QsV bit (i-2) when (i & 2) is set, else from
 *             QtV bit i
 *   shuffeqh  bit i comes from QsV bit (i-1) when (i & 1) is set, else from
 *             QtV bit i
 *   pred_or / pred_and / pred_xor   bitwise combination of QsV and QtV
 *   pred_or_n / pred_and_n          same, with the QtV bit negated first
 *   pred_not                        negation of the QsV bit
 */
1765ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, shuffeqw, "Qd4.h=vshuffe(Qs4.w,Qt4.w)","Shrink Predicate", fSETQBIT(QdV,i, (i & 2) ? fGETQBIT(QsV,i-2) : fGETQBIT(QtV,i) ) )
1766ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, shuffeqh, "Qd4.b=vshuffe(Qs4.h,Qt4.h)","Shrink Predicate", fSETQBIT(QdV,i, (i & 1) ? fGETQBIT(QsV,i-1) : fGETQBIT(QtV,i) ) )
1767ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_or, "Qd4=or(Qs4,Qt4)","Vector Predicate Or", fSETQBIT(QdV,i,fGETQBIT(QsV,i) || fGETQBIT(QtV,i) ) )
1768ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_and, "Qd4=and(Qs4,Qt4)","Vector Predicate And", fSETQBIT(QdV,i,fGETQBIT(QsV,i) && fGETQBIT(QtV,i) ) )
1769ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_xor, "Qd4=xor(Qs4,Qt4)","Vector Predicate Xor", fSETQBIT(QdV,i,fGETQBIT(QsV,i) ^ fGETQBIT(QtV,i) ) )
1770ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_or_n, "Qd4=or(Qs4,!Qt4)","Vector Predicate Or with not", fSETQBIT(QdV,i,fGETQBIT(QsV,i) || !fGETQBIT(QtV,i) ) )
1771ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_and_n, "Qd4=and(Qs4,!Qt4)","Vector Predicate And  with not", fSETQBIT(QdV,i,fGETQBIT(QsV,i) && !fGETQBIT(QtV,i) ) )
1772ITERATOR_INSN_ANY_SLOT(8, pred_not, "Qd4=not(Qs4)","Vector Predicate Not", fSETQBIT(QdV,i,!fGETQBIT(QsV,i) ) )
1773
1774
1775
/*
 * Conditional vector move.  vcmov copies Vu to Vd byte by byte when
 * fLSBOLD(PsV) holds; vncmov does so when fLSBOLDNOT(PsV) holds.  On the
 * other path the instruction executes CANCEL and Vd is not assigned.
 */
1776EXTINSN(V6_vcmov,  "if (Ps4) Vd32=Vu32",  ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA),   "Conditional Mov",
1777{
1778if (fLSBOLD(PsV))	{
1779	fHIDE(int i;)
1780	fVFOREACH(8, i) {
1781		VdV.ub[i] = VuV.ub[i];
1782	}
1783	} else {CANCEL;}
1784})
1785
1786EXTINSN(V6_vncmov,  "if (!Ps4) Vd32=Vu32",  ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA),   "Conditional Mov",
1787{
1788if (fLSBOLDNOT(PsV))	{
1789	fHIDE(int i;)
1790	fVFOREACH(8, i) {
1791		VdV.ub[i] = VuV.ub[i];
1792	}
1793	} else {CANCEL;}
1794})
1795
/*
 * Conditional vector combine.  When the scalar predicate test passes
 * (fLSBOLD for vccombine, fLSBOLDNOT for vnccombine) Vv is copied into the
 * low half of the pair (VddV.v[0]) and Vu into the high half (VddV.v[1]),
 * byte by byte.  Otherwise the instruction executes CANCEL.
 */
1796EXTINSN(V6_vccombine,  "if (Ps4) Vdd32=vcombine(Vu32,Vv32)",	ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA_DV),   "Conditional Combine",
1797{
1798if (fLSBOLD(PsV))	{
1799	fHIDE(int i;)
1800	fVFOREACH(8, i) {
1801		VddV.v[0].ub[i] = VvV.ub[i];
1802		VddV.v[1].ub[i] = VuV.ub[i];
1803	}
1804	} else {CANCEL;}
1805})
1806
1807EXTINSN(V6_vnccombine,  "if (!Ps4) Vdd32=vcombine(Vu32,Vv32)",	ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA_DV),   "Conditional Combine",
1808{
1809if (fLSBOLDNOT(PsV))	{
1810	fHIDE(int i;)
1811	fVFOREACH(8, i) {
1812		VddV.v[0].ub[i] = VvV.ub[i];
1813		VddV.v[1].ub[i] = VuV.ub[i];
1814	}
1815	} else {CANCEL;}
1816})
1817
1818
1819
/*
 * Per-byte select on a vector predicate.
 *   vmux   VdV.ub[i] is VuV.ub[i] where bit i of QtV is set, else VvV.ub[i]
 *   vswap  VddV.v[0] gets the vmux result and VddV.v[1] gets the opposite
 *          choice, so each byte pair is either kept or exchanged
 */
1820ITERATOR_INSN_ANY_SLOT(8,vmux,"Vd32=vmux(Qt4,Vu32,Vv32)",
1821"Vector Select Element 8-bit",
1822    VdV.ub[i] = fGETQBIT(QtV,i) ? VuV.ub[i] : VvV.ub[i])
1823
1824ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8,vswap,"Vdd32=vswap(Qt4,Vu32,Vv32)",
1825"Vector Swap Element 8-bit",
1826    VddV.v[0].ub[i] =  fGETQBIT(QtV,i) ? VuV.ub[i] : VvV.ub[i];
1827	VddV.v[1].ub[i] = !fGETQBIT(QtV,i) ? VuV.ub[i] : VvV.ub[i])
1828
1829
1830/***************************************************************************
1831*
1832*   MMVECTOR SORTING
1833*
1834****************************************************************************/
1835
/*
 * MMVEC_SORT: emits the element-wise vmax<TYPE> and vmin<TYPE> instructions
 * for one element type.
 *   TYPE         suffix pasted into the instruction tag
 *   TYPE2        string used only in the first syntax argument, which
 *                ITERATOR_INSN2_ANY_SLOT discards in favour of the second
 *   DESCR        text inserted into the description ("Vector <DESCR> max")
 *   ELEMENTSIZE  element width in bits handed to the iterator
 *   SRC          element view; it is compared with > (max) or < (min) and
 *                the winning operand is copied to VdV.SRC[i].  On a tie the
 *                Vv element is the one copied.
 */
1836#define MMVEC_SORT(TYPE,TYPE2,DESCR,ELEMENTSIZE,SRC)\
1837ITERATOR_INSN2_ANY_SLOT(ELEMENTSIZE,vmax##TYPE, "Vd32=vmax" TYPE2 "(Vu32,Vv32)", "Vd32."#SRC"=vmax(Vu32."#SRC",Vv32."#SRC")", "Vector " DESCR " max", VdV.SRC[i] = (VuV.SRC[i] > VvV.SRC[i]) ? VuV.SRC[i] :  VvV.SRC[i])  \
1838ITERATOR_INSN2_ANY_SLOT(ELEMENTSIZE,vmin##TYPE, "Vd32=vmin" TYPE2 "(Vu32,Vv32)", "Vd32."#SRC"=vmin(Vu32."#SRC",Vv32."#SRC")", "Vector " DESCR " min", VdV.SRC[i] = (VuV.SRC[i] < VvV.SRC[i]) ? VuV.SRC[i] :  VvV.SRC[i])
1839
/* min/max instantiations for signed/unsigned bytes and halfwords and for signed words. */
1840MMVEC_SORT(b,"b", "signed byte",    8,  b)
1841MMVEC_SORT(ub,"ub", "unsigned byte",    8,  ub)
1842MMVEC_SORT(uh,"uh", "unsigned halfword",16, uh)
1843MMVEC_SORT(h,   "h",    "halfword",         16, h)
1844MMVEC_SORT(w,   "w",    "word",             32, w)
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854/*************************************************************
1855* SHUFFLES
1856****************************************************************/
1857
/*
 * Saturate-and-pack.  Each destination element is built from two narrower
 * pieces: piece 0 is the saturated Vv element and piece 1 is the saturated
 * Vu element at the same index.
 *   vsathub   halfwords -> unsigned bytes via fVSATUB
 *   vsatwh    words -> halfwords via fVSATH
 *   vsatuwuh  unsigned words -> halfwords via fVSATUH
 */
1858ITERATOR_INSN2_ANY_SLOT(16,vsathub,"Vd32=vsathub(Vu32,Vv32)","Vd32.ub=vsat(Vu32.h,Vv32.h)",
1859"Saturate and pack 32 halfwords to 32 unsigned bytes, and interleave them",
1860    fSETBYTE(0, VdV.uh[i], fVSATUB(VvV.h[i]));
1861    fSETBYTE(1, VdV.uh[i], fVSATUB(VuV.h[i])))
1862
1863ITERATOR_INSN2_ANY_SLOT(32,vsatwh,"Vd32=vsatwh(Vu32,Vv32)","Vd32.h=vsat(Vu32.w,Vv32.w)",
1864"Saturate and pack 16 words to 16 halfwords, and interleave them",
1865    fSETHALF(0, VdV.w[i], fVSATH(VvV.w[i]));
1866    fSETHALF(1, VdV.w[i], fVSATH(VuV.w[i])))
1867
1868ITERATOR_INSN2_ANY_SLOT(32,vsatuwuh,"Vd32=vsatuwuh(Vu32,Vv32)","Vd32.uh=vsat(Vu32.uw,Vv32.uw)",
1869"Saturate and pack 16 words to 16 halfwords, and interleave them",
1870    fSETHALF(0, VdV.w[i], fVSATUH(VvV.uw[i]));
1871    fSETHALF(1, VdV.w[i], fVSATUH(VuV.uw[i])))
1872
/*
 * Even/odd shuffles within one container element.  The destination element
 * is assembled from the same-numbered piece of Vv (placed in piece 0) and of
 * Vu (placed in piece 1):
 *   vshuffeb  byte 0 of each halfword      vshuffob  byte 1 of each halfword
 *   vshufeh   halfword 0 of each word      vshufoh   halfword 1 of each word
 * The byte variants shuffle bytes, so their descriptions say so.
 */
1873ITERATOR_INSN2_ANY_SLOT(16,vshuffeb,"Vd32=vshuffeb(Vu32,Vv32)","Vd32.b=vshuffe(Vu32.b,Vv32.b)",
1874"Shuffle bytes within a lane",
1875    fSETBYTE(0, VdV.uh[i], fGETUBYTE(0, VvV.uh[i]));
1876    fSETBYTE(1, VdV.uh[i], fGETUBYTE(0, VuV.uh[i])))
1877
1878ITERATOR_INSN2_ANY_SLOT(16,vshuffob,"Vd32=vshuffob(Vu32,Vv32)","Vd32.b=vshuffo(Vu32.b,Vv32.b)",
1879"Shuffle bytes within a lane",
1880    fSETBYTE(0, VdV.uh[i], fGETUBYTE(1, VvV.uh[i]));
1881    fSETBYTE(1, VdV.uh[i], fGETUBYTE(1, VuV.uh[i])))
1882
1883ITERATOR_INSN2_ANY_SLOT(32,vshufeh,"Vd32=vshuffeh(Vu32,Vv32)","Vd32.h=vshuffe(Vu32.h,Vv32.h)",
1884"Shuffle half words within a lane",
1885    fSETHALF(0, VdV.uw[i], fGETUHALF(0, VvV.uw[i]));
1886    fSETHALF(1, VdV.uw[i], fGETUHALF(0, VuV.uw[i])))
1887
1888ITERATOR_INSN2_ANY_SLOT(32,vshufoh,"Vd32=vshuffoh(Vu32,Vv32)","Vd32.h=vshuffo(Vu32.h,Vv32.h)",
1889"Shuffle half words within a lane",
1890    fSETHALF(0, VdV.uw[i], fGETUHALF(1, VvV.uw[i]));
1891    fSETHALF(1, VdV.uw[i], fGETUHALF(1, VuV.uw[i])))
1892
1893
1894
1895
1896/**************************************************************************
1897* Double Vector Shuffles
1898**************************************************************************/
1899
/*
 * vshuff: in-place shuffle of the vectors Vy and Vx controlled by Rt.
 * offset takes the values 1, 2, 4, ... below fVBYTES() in increasing order.
 * For each offset whose bit is set in RtV, every byte index k that has that
 * bit clear gets VyV.ub[k] exchanged with VxV.ub[k+offset] through fSWAPB.
 * The stray line continuations of the original were dropped; they sat
 * outside any #define and changed nothing after line splicing.
 */
1900EXTINSN(V6_vshuff, "vshuff(Vy32,Vx32,Rt32)",
1901ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),
1902"2x2->2x2 transpose, for multiple data sizes, inplace",
1903{
1904	fHIDE(int offset;)
1905	for (offset=1; offset<fVBYTES(); offset<<=1) {
1906		if ( RtV & offset) {
1907			fHIDE(int k;)
1908			fVFOREACH(8, k) {
1909				if (!( k & offset)) {
1910					fSWAPB(VyV.ub[k], VxV.ub[k+offset]);
1911				}
1912			}
1913		}
1914	}
1915	})
1916
/*
 * vshuffvdd: non-destructive form of vshuff.  The pair is first loaded with
 * Vv (low half, v[0]) and Vu (high half, v[1]).  offset then takes the values
 * 1, 2, 4, ... below fVBYTES(); for each offset whose bit is set in RtV,
 * every byte index k with that bit clear has VddV.v[1].ub[k] exchanged with
 * VddV.v[0].ub[k+offset] through fSWAPB.
 * The stray line continuations of the original were dropped; they sat
 * outside any #define and changed nothing after line splicing.
 */
1917EXTINSN(V6_vshuffvdd, "Vdd32=vshuff(Vu32,Vv32,Rt8)",
1918ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),
1919"2x2->2x2 transpose for multiple data sizes",
1920{
1921	fHIDE(int offset;)
1922	VddV.v[0] = VvV;
1923	VddV.v[1] = VuV;
1924	for (offset=1; offset<fVBYTES(); offset<<=1) {
1925		if ( RtV & offset) {
1926			fHIDE(int k;)
1927			fVFOREACH(8, k) {
1928				if (!( k & offset)) {
1929					fSWAPB(VddV.v[1].ub[k], VddV.v[0].ub[k+offset]);
1930				}
1931			}
1932		}
1933	}
1934	})
1935
/*
 * vdeal: in-place deal of the vectors Vy and Vx controlled by Rt.  It applies
 * the same conditional byte exchanges as vshuff but visits the offsets in
 * the opposite order: offset starts at fVBYTES()>>1 and is halved down to 1.
 * For each offset whose bit is set in RtV, every byte index k with that bit
 * clear has VyV.ub[k] exchanged with VxV.ub[k+offset] through fSWAPB.
 * The stray line continuations of the original were dropped; they sat
 * outside any #define and changed nothing after line splicing.
 */
1936EXTINSN(V6_vdeal, "vdeal(Vy32,Vx32,Rt32)",
1937ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),
1938" vector - vector deal - or deinterleave, for multiple data sizes, inplace",
1939{
1940	fHIDE(int offset;)
1941	for (offset=fVBYTES()>>1; offset>0; offset>>=1) {
1942		if ( RtV & offset) {
1943			fHIDE(int k;)
1944			fVFOREACH(8, k) {
1945				if (!( k & offset)) {
1946					fSWAPB(VyV.ub[k], VxV.ub[k+offset]);
1947				}
1948			}
1949		}
1950	}
1951	})
1952
/*
 * vdealvdd: non-destructive form of vdeal.  The pair is first loaded with Vv
 * (low half, v[0]) and Vu (high half, v[1]).  offset then starts at
 * fVBYTES()>>1 and is halved down to 1; for each offset whose bit is set in
 * RtV, every byte index k with that bit clear has VddV.v[1].ub[k] exchanged
 * with VddV.v[0].ub[k+offset] through fSWAPB.
 * The stray line continuations of the original were dropped; they sat
 * outside any #define and changed nothing after line splicing.
 */
1953EXTINSN(V6_vdealvdd, "Vdd32=vdeal(Vu32,Vv32,Rt8)",
1954ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),
1955" vector - vector deal - or deinterleave, for multiple data sizes",
1956{
1957	fHIDE(int offset;)
1958	VddV.v[0] = VvV;
1959	VddV.v[1] = VuV;
1960	for (offset=fVBYTES()>>1; offset>0; offset>>=1) {
1961		if ( RtV & offset) {
1962			fHIDE(int k;)
1963			fVFOREACH(8, k) {
1964				if (!( k & offset)) {
1965					fSWAPB(VddV.v[1].ub[k], VddV.v[0].ub[k+offset]);
1966				}
1967			}
1968		}
1969	}
1970	})
1971
1972/**************************************************************************/
1973
1974
1975
/*
 * Combined even/odd shuffles writing a vector pair.  The low vector
 * (VddV.v[0]) collects piece 0 of each Vv and Vu container element, the high
 * vector (VddV.v[1]) collects piece 1; within each destination element the
 * Vv piece goes to position 0 and the Vu piece to position 1.
 *   vshufoeh  pieces are halfwords of 32-bit words
 *   vshufoeb  pieces are bytes of 16-bit halfwords
 */
1976ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(32,vshufoeh,"Vdd32=vshuffoeh(Vu32,Vv32)","Vdd32.h=vshuffoe(Vu32.h,Vv32.h)",
1977"Vector Shuffle half words",
1978    fSETHALF(0, VddV.v[0].uw[i], fGETUHALF(0, VvV.uw[i]));
1979    fSETHALF(1, VddV.v[0].uw[i], fGETUHALF(0, VuV.uw[i]));
1980    fSETHALF(0, VddV.v[1].uw[i], fGETUHALF(1, VvV.uw[i]));
1981    fSETHALF(1, VddV.v[1].uw[i], fGETUHALF(1, VuV.uw[i])))
1982
1983ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(16,vshufoeb,"Vdd32=vshuffoeb(Vu32,Vv32)","Vdd32.b=vshuffoe(Vu32.b,Vv32.b)",
1984"Vector Shuffle bytes",
1985    fSETBYTE(0, VddV.v[0].uh[i], fGETUBYTE(0, VvV.uh[i]));
1986    fSETBYTE(1, VddV.v[0].uh[i], fGETUBYTE(0, VuV.uh[i]));
1987    fSETBYTE(0, VddV.v[1].uh[i], fGETUBYTE(1, VvV.uh[i]));
1988    fSETBYTE(1, VddV.v[1].uh[i], fGETUBYTE(1, VuV.uh[i])))
1989
1990
1991/***************************************************************
1992* Deal
1993***************************************************************/
1994
/*
 * vdealh: for each 32-bit word i of Vu, halfword 0 goes to VdV.uh[i] and
 * halfword 1 goes to VdV.uh[i + fVELEM(32)], i.e. halfword-0 pieces fill the
 * first block of the result and halfword-1 pieces the block after it.
 */
1995ITERATOR_INSN2_PERMUTE_SLOT(32, vdealh, "Vd32=vdealh(Vu32)", "Vd32.h=vdeal(Vu32.h)",
1996"Deal Halfwords",
1997    VdV.uh[i  ] = fGETUHALF(0, VuV.uw[i]);
1998    VdV.uh[i+fVELEM(32)] = fGETUHALF(1, VuV.uw[i]))
1999
/*
 * vdealb: for each 16-bit halfword i of Vu, byte 0 goes to VdV.ub[i] and
 * byte 1 goes to VdV.ub[i + fVELEM(16)].  The elements moved are bytes, so
 * the description names bytes rather than halfwords.
 */
2000ITERATOR_INSN2_PERMUTE_SLOT(16, vdealb, "Vd32=vdealb(Vu32)", "Vd32.b=vdeal(Vu32.b)",
2001"Deal Bytes",
2002    VdV.ub[i   ] = fGETUBYTE(0, VuV.uh[i]);
2003    VdV.ub[i+fVELEM(16)] = fGETUBYTE(1, VuV.uh[i]))
2004
/*
 * vdealb4w: gathers bytes 0 and 2 of every 32-bit word of Vv and Vu into
 * four consecutive blocks of fVELEM(32) bytes each, in the order
 * Vv byte 0, Vv byte 2, Vu byte 0, Vu byte 2.  Bytes 1 and 3 of the source
 * words are not read.
 */
2005ITERATOR_INSN2_PERMUTE_SLOT(32, vdealb4w,  "Vd32=vdealb4w(Vu32,Vv32)", "Vd32.b=vdeale(Vu32.b,Vv32.b)",
2006"Deal Two Vectors Bytes",
2007    VdV.ub[0+i ] = fGETUBYTE(0, VvV.uw[i]);
2008    VdV.ub[fVELEM(32)+i ] = fGETUBYTE(2, VvV.uw[i]);
2009    VdV.ub[2*fVELEM(32)+i] = fGETUBYTE(0, VuV.uw[i]);
2010    VdV.ub[3*fVELEM(32)+i] = fGETUBYTE(2, VuV.uw[i]))
2011
2012/***************************************************************
2013* shuffle
2014***************************************************************/
2015
/*
 * vshuffh: inverse data movement of vdealh.  Word i of the result takes
 * VuV.uh[i] as halfword 0 and VuV.uh[i + fVELEM(32)] as halfword 1, which
 * interleaves the two blocks of the source.  This is a shuffle, so the
 * description no longer says "Deal".
 */
2016ITERATOR_INSN2_PERMUTE_SLOT(32, vshuffh, "Vd32=vshuffh(Vu32)", "Vd32.h=vshuff(Vu32.h)",
2017"Shuffle Halfwords",
2018    fSETHALF(0, VdV.uw[i], VuV.uh[i]);
2019    fSETHALF(1, VdV.uw[i], VuV.uh[i+fVELEM(32)]))
2020
/*
 * vshuffb: inverse data movement of vdealb.  Halfword i of the result takes
 * VuV.ub[i] as byte 0 and VuV.ub[i + fVELEM(16)] as byte 1, which interleaves
 * the two blocks of the source.  This shuffles bytes, so the description no
 * longer says "Deal Halfwords".
 */
2021ITERATOR_INSN2_PERMUTE_SLOT(16, vshuffb, "Vd32=vshuffb(Vu32)", "Vd32.b=vshuff(Vu32.b)",
2022"Shuffle Bytes",
2023    fSETBYTE(0, VdV.uh[i], VuV.ub[i]);
2024    fSETBYTE(1, VdV.uh[i], VuV.ub[i+fVELEM(16)]))
2025
2026
2027
2028
2029
2030/***********************************************************
2031* INSERT AND EXTRACT
2032*********************************************************/
/*
 * vextract: reads one 32-bit word of Vu into the scalar Rd.  The word index
 * is (RsV & (fVBYTES()-1)) >> 2, i.e. Rs is treated as a byte offset that is
 * wrapped to the vector length and rounded down to a word boundary.
 * The two warn() calls are wrapped in fHIDE and only trace the operands and
 * the result.
 */
2033EXTINSN(V6_extractw, "Rd32=vextract(Vu32,Rs32)",
2034ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA,A_MEMLIKE,A_RESTRICT_SLOT0ONLY),
2035"Extract an element from a vector to scalar",
2036fHIDE(warn("RdN=%d VuN=%d RsN=%d RsV=0x%08x widx=%d",RdN,VuN,RsN,RsV,((RsV & (fVBYTES()-1)) >> 2));)
2037RdV = VuV.uw[ (RsV & (fVBYTES()-1)) >> 2];
2038fHIDE(warn("RdV=0x%08x",RdV);))
2039
/*
 * vinsert: writes the scalar Rt into word 0 of Vx.  No other element of Vx
 * is assigned by this semantics.
 */
2040EXTINSN(V6_vinsertwr, "Vx32.w=vinsert(Rt32)",
2041ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX),
2042"Insert Word Scalar into Vector",
2043VxV.uw[0] = RtV;)
2044
2045
2046
2047
/*
 * Scalar splat: every word / halfword / byte element of Vd is assigned RtV.
 * For the halfword and byte forms the assignment goes through the .uh / .ub
 * element view, so presumably only the low 16 / 8 bits of Rt are kept
 * (depends on the element types declared elsewhere).
 */
2048ITERATOR_INSN_MPY_SLOT_LATE(32,lvsplatw, "Vd32=vsplat(Rt32)", "Replicates scalar across words in vector", VdV.uw[i] = RtV)
2049
2050ITERATOR_INSN_MPY_SLOT_LATE(16,lvsplath, "Vd32.h=vsplat(Rt32)", "Replicates scalar across halves in vector", VdV.uh[i] = RtV)
2051
2052ITERATOR_INSN_MPY_SLOT_LATE(8,lvsplatb, "Vd32.b=vsplat(Rt32)", "Replicates scalar across bytes in vector", VdV.ub[i] = RtV)
2053
2054
/* vassign: copies Vu to Vd one 32-bit word at a time. */
2055ITERATOR_INSN_ANY_SLOT(32,vassign,"Vd32=Vu32","Copy a vector",VdV.w[i]=VuV.w[i])
2056
2057
/*
 * vcombine: builds a vector pair byte by byte, with Vv in the low half
 * (VddV.v[0]) and Vu in the high half (VddV.v[1]).
 */
2058ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8,vcombine,"Vdd32=vcombine(Vu32,Vv32)",
2059"Vector assign, Any two to Vector Pair",
2060    VddV.v[0].ub[i] = VvV.ub[i];
2061    VddV.v[1].ub[i] = VuV.ub[i])
2062
2063
2064
2065///////////////////////////////////////////////////////////////////////////
2066
/*
 * vcombine (.tmp destination): same data movement as vcombine, Vv into the
 * low half of the pair and Vu into the high half, byte by byte.  It differs
 * in its attribute list (A_CVI_REMAP, A_CVI_TMP, A_NO_INTRINSIC).
 */
2067EXTINSN(V6_vcombine_tmp, "Vdd32.tmp=vcombine(Vu32,Vv32)",    ATTRIBS(A_EXTENSION,A_CVI,A_CVI_REMAP,A_CVI_TMP,A_NO_INTRINSIC),
2068"Vector assign tmp, Any two to Vector Pair ",
2069{
2070   fHIDE(int i;)
2071    fVFOREACH(8, i) {
2072           VddV.v[0].ub[i] = VvV.ub[i];
2073           VddV.v[1].ub[i] = VuV.ub[i];
2074    }
2075})
2076
/*
 * vassign (.tmp destination): copies the single vector Vu to Vd one 32-bit
 * word at a time.  It differs from vassign in its attribute list
 * (A_CVI_REMAP, A_CVI_TMP, A_NO_INTRINSIC).  The description was copied from
 * vcombine_tmp and wrongly mentioned a vector pair.
 */
2077EXTINSN(V6_vassign_tmp, "Vd32.tmp=Vu32",    ATTRIBS(A_EXTENSION,A_CVI,A_CVI_REMAP,A_CVI_TMP,A_NO_INTRINSIC),
2078"Vector assign tmp",
2079{
2080   fHIDE(int i;)
2081    fVFOREACH(32, i) {
2082           VdV.w[i]=VuV.w[i];
2083    }
2084})
2085
2086/*********************************************************
2087* GENERAL PERMUTE NETWORKS
2088*********************************************************/
2089
2090
/*
 * vdelta: multi-stage byte permute of Vu steered by the control bytes in Vv.
 * Stages run from the largest offset to the smallest: offset starts at
 * fVBYTES() and is halved before each stage, so the first stage uses
 * fVBYTES()/2 and the last uses 1.  In a stage, output byte k takes
 * tmp.ub[k ^ offset] when VvV.ub[k] has the offset bit set and tmp.ub[k]
 * otherwise.  After each stage the result is copied back to tmp, so every
 * stage reads the complete output of the previous one.
 */
2091EXTINSN(V6_vdelta, "Vd32=vdelta(Vu32,Vv32)",    ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),
2092"Reverse Benes Butterfly network ",
2093{
2094    fHIDE(int offset;)
2095    fHIDE(int k;)
2096    fHIDE(mmvector_t tmp;)
2097    tmp = VuV;
2098    for (offset=fVBYTES(); (offset>>=1)>0; ) {
2099        for (k = 0; k<fVBYTES(); k++) {
2100            VdV.ub[k] = (VvV.ub[k]&offset) ? tmp.ub[k^offset] : tmp.ub[k];
2101        }
2102        for (k = 0; k<fVBYTES(); k++) {
2103            tmp.ub[k] = VdV.ub[k];
2104        }
2105    }
2106})
2107
2108
/*
 * vrdelta: same stage body as vdelta with the stages in the opposite order.
 * offset runs 1, 2, 4, ... below fVBYTES().  In a stage, output byte k takes
 * tmp.ub[k ^ offset] when VvV.ub[k] has the offset bit set and tmp.ub[k]
 * otherwise.  After each stage the result is copied back to tmp, so every
 * stage reads the complete output of the previous one.
 */
2109EXTINSN(V6_vrdelta, "Vd32=vrdelta(Vu32,Vv32)",  ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),
2110"Forward Benes Butterfly network ",
2111{
2112	fHIDE(int offset;)
2113    fHIDE(int k;)
2114    fHIDE(mmvector_t tmp;)
2115    tmp = VuV;
2116    for (offset=1; offset<fVBYTES(); offset<<=1){
2117        for (k = 0; k<fVBYTES(); k++) {
2118            VdV.ub[k] = (VvV.ub[k]&offset) ? tmp.ub[k^offset] : tmp.ub[k];
2119        }
2120        for (k = 0; k<fVBYTES(); k++) {
2121            tmp.ub[k] = VdV.ub[k];
2122        }
2123    }
2124})
2125
2126
2127
2128
2129
/*
 * Count leading zeros per element, written as fCL1_4 / fCL1_2 applied to the
 * bitwise complement of the element (presumably a count-leading-ones helper
 * on 4- and 2-byte values).  vcl0h works on halfwords, so its description
 * names halfwords.
 */
2130ITERATOR_INSN2_SHIFT_SLOT(32,vcl0w,"Vd32=vcl0w(Vu32)","Vd32.uw=vcl0(Vu32.uw)",         "Count Leading Zeros in Word",     VdV.uw[i]=fCL1_4(~VuV.uw[i]))
2131ITERATOR_INSN2_SHIFT_SLOT(16,vcl0h,"Vd32=vcl0h(Vu32)","Vd32.uh=vcl0(Vu32.uh)",         "Count Leading Zeros in Halfword",    VdV.uh[i]=fCL1_2(~VuV.uh[i]))
2132
/*
 * Normalisation amount per element: the larger of fCL1(~x) and fCL1(x),
 * minus one.  Taking the maximum over the value and its complement makes the
 * result independent of which bit value leads.  IV1DEAD() is wrapped in
 * fHIDE; its effect is defined elsewhere.
 */
2133ITERATOR_INSN2_SHIFT_SLOT(32,vnormamtw,"Vd32=vnormamtw(Vu32)","Vd32.w=vnormamt(Vu32.w)","Norm Amount Word",
2134VdV.w[i]=fMAX(fCL1_4(~VuV.w[i]),fCL1_4(VuV.w[i]))-1; fHIDE(IV1DEAD();))
2135ITERATOR_INSN2_SHIFT_SLOT(16,vnormamth,"Vd32=vnormamth(Vu32)","Vd32.h=vnormamt(Vu32.h)","Norm Amount Halfword",
2136VdV.h[i]=fMAX(fCL1_2(~VuV.h[i]),fCL1_2(VuV.h[i]))-1; fHIDE(IV1DEAD();))
2137
/*
 * Count leading bits and add: per element, the larger of fCL1(~Vu) and
 * fCL1(Vu) is added to the matching Vv element.  Unlike vnormamt, no 1 is
 * subtracted from the count.
 */
2138ITERATOR_INSN_SHIFT_SLOT_VV_LATE(32,vaddclbw,"Vd32.w=vadd(vclb(Vu32.w),Vv32.w)",
2139"Count leading bits and add",
2140VdV.w[i] = fMAX(fCL1_4(~VuV.w[i]),fCL1_4(VuV.w[i])) + VvV.w[i])
2141
2142ITERATOR_INSN_SHIFT_SLOT_VV_LATE(16,vaddclbh,"Vd32.h=vadd(vclb(Vu32.h),Vv32.h)",
2143"Count leading bits and add",
2144VdV.h[i] = fMAX(fCL1_2(~VuV.h[i]),fCL1_2(VuV.h[i])) + VvV.h[i])
2145
2146
/*
 * vpopcounth: each halfword of Vd receives fCOUNTONES_2 of the matching Vu
 * halfword.  The description was copied from the vcl0 entries and did not
 * describe a population count.
 */
2147ITERATOR_INSN2_SHIFT_SLOT(16,vpopcounth,"Vd32=vpopcounth(Vu32)","Vd32.h=vpopcount(Vu32.h)",   "Population Count in Halfword",  VdV.uh[i]=fCOUNTONES_2(VuV.uh[i]))
2148
2149
/*
 * fHIST(INPUTVEC): histogram body.  The input vector is processed in 128-bit
 * lanes (fVFOREACH(128, lane)), 16 bytes per lane.  For each input byte:
 *   regno   = value >> 3   selects the vector register holding the counter
 *   element = value & 7    selects one of 8 halfword counters in that lane
 * The selected register is read with READ_EXT_VREG, the halfword counter at
 * index (128/16)*lane + element is incremented, and the register is written
 * back with WRITE_EXT_VREG(..., EXT_NEW).
 */
2150#define fHIST(INPUTVEC) \
2151	fUARCH_NOTE_PUMP_4X(); \
2152	fHIDE(int lane;) \
2153	fHIDE(mmvector_t tmp;) \
2154	fVFOREACH(128, lane) { \
2155		for (fHIDE(int )i=0; i<128/8; ++i) { \
2156			unsigned char value = INPUTVEC.ub[(128/8)*lane+i]; \
2157			unsigned char regno = value>>3; \
2158			unsigned char element = value & 7; \
2159			READ_EXT_VREG(regno,tmp,0); \
2160			tmp.uh[(128/16)*lane+(element)]++; \
2161			WRITE_EXT_VREG(regno,tmp,EXT_NEW); \
2162		} \
2163	}
2164
/*
 * fHISTQ(INPUTVEC,QVAL): predicated histogram body.  Identical to fHIST
 * except that the counter is incremented only when the predicate bit for
 * that input byte (index 128/8*lane + i of QVAL) is set.  The register read
 * and write-back still happen for every input byte regardless of the
 * predicate bit.
 */
2165#define fHISTQ(INPUTVEC,QVAL) \
2166	fUARCH_NOTE_PUMP_4X(); \
2167	fHIDE(int lane;) \
2168	fHIDE(mmvector_t tmp;) \
2169	fVFOREACH(128, lane) { \
2170		for (fHIDE(int )i=0; i<128/8; ++i) { \
2171			unsigned char value = INPUTVEC.ub[(128/8)*lane+i]; \
2172			unsigned char regno = value>>3; \
2173			unsigned char element = value & 7; \
2174			READ_EXT_VREG(regno,tmp,0); \
2175			if (fGETQBIT(QVAL,128/8*lane+i)) tmp.uh[(128/16)*lane+(element)]++; \
2176			WRITE_EXT_VREG(regno,tmp,EXT_NEW); \
2177		} \
2178	}
2179
2180
2181
/*
 * vhist / vhist(Qv4): both take their input vector from fTMPVDATA() rather
 * than from a named operand and then expand fHIST or fHISTQ on it.  The
 * helper macros are undefined right after their only two uses.
 */
2182EXTINSN(V6_vhist, "vhist",ATTRIBS(A_EXTENSION,A_CVI,A_CVI_4SLOT), "vhist instruction",{ fHIDE(mmvector_t inputVec;) inputVec=fTMPVDATA(); fHIST(inputVec); })
2183EXTINSN(V6_vhistq, "vhist(Qv4)",ATTRIBS(A_EXTENSION,A_CVI,A_CVI_4SLOT), "vhist instruction",{ fHIDE(mmvector_t inputVec;) inputVec=fTMPVDATA(); fHISTQ(inputVec,QvV); })
2184
2185#undef fHIST
2186#undef fHISTQ
2187
2188
2189/* **** WEIGHTED HISTOGRAM **** */
2190
2191
2192#if 1
/*
 * WHIST: per-halfword body of the weighted histogram instructions.  It
 * expects `input` and the loop index `i` to be provided by the enclosing
 * iterator (ITERATOR_INSN_VHISTLIKE, defined elsewhere).
 *   EL      counter element view of tmp (uh or uw)
 *   MASK    mask applied to the bucket bits that pick the counter
 *   BSHIFT  right shift applied to both i and bucket before masking
 *   COND    optional statement prefix guarding the update (e.g. an `if`)
 *   SATF    optional function applied to the new counter value
 * For input halfword i: byte 0 is the bucket and byte 1 the weight.
 * vindex = (bucket >> 3) & 0x1F picks the vector register; elindex combines
 * the lane bits of (i >> BSHIFT) outside MASK with the bucket bits inside
 * MASK.  The register is read and written back unconditionally; only the
 * counter update is subject to COND.
 */
2193#define WHIST(EL,MASK,BSHIFT,COND,SATF) \
2194	fHIDE(unsigned int) bucket = fGETUBYTE(0,input.h[i]); \
2195	fHIDE(unsigned int) weight = fGETUBYTE(1,input.h[i]); \
2196	fHIDE(unsigned int) vindex = (bucket >> 3) & 0x1F; \
2197	fHIDE(unsigned int) elindex = ((i>>BSHIFT) & (~MASK)) | ((bucket>>BSHIFT) & MASK); \
2198	fHIDE(mmvector_t tmp;) \
2199	READ_EXT_VREG(vindex,tmp,0); \
2200	COND tmp.EL[elindex] = SATF(tmp.EL[elindex] + weight); \
2201	WRITE_EXT_VREG(vindex,tmp,EXT_NEW); \
2202	fUARCH_NOTE_PUMP_2X();
2203
/*
 * Weighted histogram instantiations.
 *   vwhist256*  halfword counters (EL=uh, MASK=7, BSHIFT=0); the :sat forms
 *               pass fVSATUH as the saturation function
 *   vwhist128*  word counters (EL=uw, MASK=3, BSHIFT=1)
 * The q forms update only when predicate bit 2*i of QvV is set; the m forms
 * update only when the low bucket bit equals the immediate uiV; qm requires
 * both.
 */
2204ITERATOR_INSN_VHISTLIKE(16,vwhist256,"vwhist256","vector weighted histogram halfword counters", WHIST(uh,7,0,,))
2205ITERATOR_INSN_VHISTLIKE(16,vwhist256q,"vwhist256(Qv4)","vector weighted histogram halfword counters", WHIST(uh,7,0,if (fGETQBIT(QvV,2*i)),))
2206ITERATOR_INSN_VHISTLIKE(16,vwhist256_sat,"vwhist256:sat","vector weighted histogram halfword counters", WHIST(uh,7,0,,fVSATUH))
2207ITERATOR_INSN_VHISTLIKE(16,vwhist256q_sat,"vwhist256(Qv4):sat","vector weighted histogram halfword counters", WHIST(uh,7,0,if (fGETQBIT(QvV,2*i)),fVSATUH))
2208ITERATOR_INSN_VHISTLIKE(16,vwhist128,"vwhist128","vector weighted histogram word counters", WHIST(uw,3,1,,))
2209ITERATOR_INSN_VHISTLIKE(16,vwhist128q,"vwhist128(Qv4)","vector weighted histogram word counters", WHIST(uw,3,1,if (fGETQBIT(QvV,2*i)),))
2210ITERATOR_INSN_VHISTLIKE(16,vwhist128m,"vwhist128(#u1)","vector weighted histogram word counters", WHIST(uw,3,1,if ((bucket & 1) == uiV),))
2211ITERATOR_INSN_VHISTLIKE(16,vwhist128qm,"vwhist128(Qv4,#u1)","vector weighted histogram word counters", WHIST(uw,3,1,if (((bucket & 1) == uiV) && fGETQBIT(QvV,2*i)),))
2212
2213
2214#endif
2215
2216
2217
2218/* ******   lookup table instructions                          ***********  */
2219
2220/* Use low bits from idx to choose next-bigger elements from vector, then use LSB from idx to choose odd or even element */
2221
/*
 * Table lookups steered by the scalar Rt.
 * Common scheme: matchval is taken from the low bits of Rt, oddhalf from bit
 * (fVECLOGSIZE()-6) of Rt.  An index byte from Vu "matches" when its upper
 * bits equal matchval; a matching lane reads the table entry
 * idx % fVELEM(..) from Vv and keeps the half selected by oddhalf, a
 * non-matching lane contributes 0.
 *   vlut32 (byte tables): upper 3 index bits are compared (idx & 0xE0), the
 *                         entry is a byte of a Vv halfword
 *   vlut16 (half tables): upper 4 index bits are compared (idx & 0xF0), the
 *                         entry is a halfword of a Vv word; byte 0 of each
 *                         Vu halfword feeds v[0], byte 1 feeds v[1]
 * The _oracc forms OR the looked-up value into the destination instead of
 * overwriting it.
 */
2222ITERATOR_INSN_PERMUTE_SLOT(8,vlutvvb,"Vd32.b=vlut32(Vu32.b,Vv32.b,Rt8)","vector-vector table lookup",
2223fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2224matchval = RtV & 0x7;
2225oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1;
2226idx = VuV.ub[i];
2227VdV.b[i] = ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0)
2228
2229
2230ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(8,vlutvvb_oracc,"Vx32.b|=vlut32(Vu32.b,Vv32.b,Rt8)","vector-vector table lookup",
2231fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2232matchval = RtV & 0x7;
2233oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1;
2234idx = VuV.ub[i];
2235VxV.b[i] |= ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0)
2236
2237ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh,"Vdd32.h=vlut16(Vu32.b,Vv32.h,Rt8)","vector-vector table lookup",
2238fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2239matchval = RtV & 0xF;
2240oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1;
2241idx = fGETUBYTE(0,VuV.uh[i]);
2242VddV.v[0].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0;
2243idx = fGETUBYTE(1,VuV.uh[i]);
2244VddV.v[1].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0)
2245
2246ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh_oracc,"Vxx32.h|=vlut16(Vu32.b,Vv32.h,Rt8)","vector-vector table lookup",
2247fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2248matchval = fGETUBYTE(0,RtV) & 0xF;
2249oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1;
2250idx = fGETUBYTE(0,VuV.uh[i]);
2251VxxV.v[0].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0;
2252idx = fGETUBYTE(1,VuV.uh[i]);
2253VxxV.v[1].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0)
2254
/*
 * Immediate forms of the table lookups.  The semantics mirror the Rt forms
 * with the immediate uiV replacing RtV: matchval comes from the low bits of
 * uiV and oddhalf from bit (fVECLOGSIZE()-6) of uiV.  A Vu index byte whose
 * upper bits equal matchval selects entry idx % fVELEM(..) of Vv; other
 * lanes contribute 0.  The _oracci forms OR the result into the destination.
 */
2255ITERATOR_INSN_PERMUTE_SLOT(8,vlutvvbi,"Vd32.b=vlut32(Vu32.b,Vv32.b,#u3)","vector-vector table lookup",
2256fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2257matchval = uiV & 0x7;
2258oddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1;
2259idx = VuV.ub[i];
2260VdV.b[i] = ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0)
2261
2262
2263ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(8,vlutvvb_oracci,"Vx32.b|=vlut32(Vu32.b,Vv32.b,#u3)","vector-vector table lookup",
2264fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2265matchval = uiV & 0x7;
2266oddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1;
2267idx = VuV.ub[i];
2268VxV.b[i] |= ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0)
2269
2270ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwhi,"Vdd32.h=vlut16(Vu32.b,Vv32.h,#u3)","vector-vector table lookup",
2271fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2272matchval = uiV & 0xF;
2273oddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1;
2274idx = fGETUBYTE(0,VuV.uh[i]);
2275VddV.v[0].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0;
2276idx = fGETUBYTE(1,VuV.uh[i]);
2277VddV.v[1].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0)
2278
2279ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh_oracci,"Vxx32.h|=vlut16(Vu32.b,Vv32.h,#u3)","vector-vector table lookup",
2280fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2281matchval = uiV & 0xF;
2282oddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1;
2283idx = fGETUBYTE(0,VuV.uh[i]);
2284VxxV.v[0].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0;
2285idx = fGETUBYTE(1,VuV.uh[i]);
2286VxxV.v[1].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0)
2287
/*
 * ":nomatch" table lookups.  Instead of comparing the upper index bits with
 * matchval, these forms overwrite them: idx keeps its low 5 (vlut32) or low
 * 4 (vlut16) bits and takes matchval as its upper bits, so every lane
 * performs a lookup and none is zeroed.  Entry selection (idx % fVELEM(..))
 * and the oddhalf pick are otherwise the same as in the matching forms.
 */
2288ITERATOR_INSN_PERMUTE_SLOT(8,vlutvvb_nm,"Vd32.b=vlut32(Vu32.b,Vv32.b,Rt8):nomatch","vector-vector table lookup",
2289fHIDE(unsigned int idx;) fHIDE(int oddhalf;) fHIDE(int matchval;)
2290    matchval = RtV & 0x7;
2291    oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1;
2292    idx = VuV.ub[i];
2293    idx = (idx&0x1F) | (matchval<<5);
2294    VdV.b[i] = fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]))
2295
2296ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh_nm,"Vdd32.h=vlut16(Vu32.b,Vv32.h,Rt8):nomatch","vector-vector table lookup",
2297fHIDE(unsigned int idx;) fHIDE(int oddhalf;) fHIDE(int matchval;)
2298    matchval = RtV & 0xF;
2299    oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1;
2300    idx = fGETUBYTE(0,VuV.uh[i]);
2301    idx = (idx&0x0F) | (matchval<<4);
2302    VddV.v[0].h[i] = fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]);
2303    idx = fGETUBYTE(1,VuV.uh[i]);
2304    idx = (idx&0x0F) | (matchval<<4);
2305    VddV.v[1].h[i] = fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]))
2306
2307
2308
2309
2310/******************************************************************************
2311NON LINEAR - V65
2312 ******************************************************************************/
2313
/*
 * Piecewise-linear helpers.  In all four instructions the top two bits of the
 * Vu halfword ((Vu >> 14) & 0x3) select one of the four halfwords of the
 * register pair Rtt.
 *   vmpahhsat    Vx = sat16(((Vx * Vu, signed x signed) << 1) + (sel << 15)) >> 16)
 *   vmpauhuhsat  Vx = sat16(((Vx * Vu, signed x unsigned) + (sel << 15)) >> 16)
 *   vmpsuhuhsat  as vmpauhuhsat but the selected term is subtracted
 *   vlut4        Vd = the selected halfword itself
 * vmpahhsat reads the selection as signed (fGETHALF); the uh forms read it as
 * unsigned (fGETUHALF).
 */
2314ITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vmpahhsat,"Vx32.h=vmpa(Vx32.h,Vu32.h,Rtt32.h):sat","piecewise linear approximation",
2315    VxV.h[i]= fVSATH( ( ( fMPY16SS(VxV.h[i],VuV.h[i])<<1) + (fGETHALF(( (VuV.h[i]>>14)&0x3), RttV )<<15))>>16))
2316
2317
2318ITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vmpauhuhsat,"Vx32.h=vmpa(Vx32.h,Vu32.uh,Rtt32.uh):sat","piecewise linear approximation",
2319    VxV.h[i]= fVSATH( (  fMPY16SU(VxV.h[i],VuV.uh[i]) + (fGETUHALF(((VuV.uh[i]>>14)&0x3), RttV )<<15))>>16))
2320
2321ITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vmpsuhuhsat,"Vx32.h=vmps(Vx32.h,Vu32.uh,Rtt32.uh):sat","piecewise linear approximation",
2322    VxV.h[i]= fVSATH( (  fMPY16SU(VxV.h[i],VuV.uh[i]) - (fGETUHALF(((VuV.uh[i]>>14)&0x3), RttV )<<15))>>16))
2323
2324
2325ITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vlut4,"Vd32.h=vlut4(Vu32.uh,Rtt32.h)","4 entry lookup table",
2326    VdV.h[i]= fGETHALF(  ((VuV.h[i]>>14)&0x3), RttV ))
2327
2328
2329
2330/******************************************************************************
2331V65
2332 ******************************************************************************/
2333
/*
 * vmpye (unsigned halfwords): for each 32-bit word, halfword 0 of the Vu word
 * is multiplied by halfword 0 of Rt with fMPY16UU.  vmpyuhe stores the
 * product in VdV.uw[i]; vmpyuhe_acc adds it to VxV.uw[i].  Halfword 1 of the
 * Vu words and of Rt is not read.
 */
2334ITERATOR_INSN_MPY_SLOT_NOV1(32,vmpyuhe,"Vd32.uw=vmpye(Vu32.uh,Rt32.uh)",
2335"Vector even halfword unsigned multiply by scalar",
2336    VdV.uw[i] = fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV)))
2337
2338
2339ITERATOR_INSN_MPY_SLOT_NOV1(32,vmpyuhe_acc,"Vx32.uw+=vmpye(Vu32.uh,Rt32.uh)",
2340"Vector even halfword unsigned multiply by scalar",
2341    VxV.uw[i] += fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV)))
2342
2343
2344
2345
/*
 * Unpredicated gathers into the vtmp destination.
 * Shared shape: fGATHER_INIT is called with the base Rt, the modifier Mu and
 * the element size in bytes; fVLASTBYTE(MuV, ...) and fVALIGN(RtV, ...) are
 * then applied (presumably adjusting the region length and aligning the
 * base; their definitions are elsewhere).  For every offset element the
 * effective address EA = RtV + offset is formed and handed to the
 * fVLOG_VTCM_GATHER_* helper together with the offset, the element index and
 * MuV.  fGATHER_FINISH() closes the operation.
 *   vgathermw   word offsets (Vv.uw), word elements
 *   vgathermh   halfword offsets (Vv.uh), halfword elements
 *   vgathermhw  word offsets from a vector pair, halfword elements; for word
 *               index i and pair half j the output element index is 2*i+j
 */
2346EXTINSN(V6_vgathermw,  "vtmp.w=vgather(Rt32,Mu2,Vv32.w).w", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather Words",
2347{
2348    fHIDE(int i;)
2349	fHIDE(int element_size = 4;)
2350    fHIDE(fGATHER_INIT( RtV, MuV, element_size);)
2351    fVLASTBYTE(MuV, element_size);
2352    fVALIGN(RtV, element_size);
2353    fVFOREACH(32, i) {
2354        EA = RtV+VvV.uw[i];
2355        fVLOG_VTCM_GATHER_WORD(EA, VvV.uw[i], i,MuV);
2356    }
2357    fGATHER_FINISH()
2358})
2359EXTINSN(V6_vgathermh,  "vtmp.h=vgather(Rt32,Mu2,Vv32.h).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords",
2360{
2361    fHIDE(int i;)
2362	fHIDE(int element_size = 2;)
2363    fHIDE(fGATHER_INIT( RtV, MuV, element_size);)
2364    fVLASTBYTE(MuV, element_size);
2365    fVALIGN(RtV, element_size);
2366    fVFOREACH(16, i) {
2367        EA = RtV+VvV.uh[i];
2368        fVLOG_VTCM_GATHER_HALFWORD(EA, VvV.uh[i], i,MuV);
2369    }
2370    fGATHER_FINISH()
2371})
2372
2373
2374
2375EXTINSN(V6_vgathermhw,  "vtmp.h=vgather(Rt32,Mu2,Vvv32.w).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA_DV,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords",
2376{
2377    fHIDE(int i;)
2378    fHIDE(int j;)
2379	fHIDE(int element_size = 2;)
2380    fHIDE(fGATHER_INIT( RtV, MuV, element_size);)
2381    fVLASTBYTE(MuV, element_size);
2382    fVALIGN(RtV, element_size);
2383    fVFOREACH(32, i) {
2384       for(j = 0; j < 2; j++) {
2385            EA = RtV+VvvV.v[j].uw[i];
2386            fVLOG_VTCM_GATHER_HALFWORD_DV(EA, VvvV.v[j].uw[i], (2*i+j),i,j,MuV);
2387        }
2388    }
2389     fGATHER_FINISH()
2390})
2391
2392
/*
 * Predicated gathers ("if (Qs4) ...").  These follow the same sequence as
 * the unpredicated gathers (fGATHER_INIT, fVLASTBYTE, fVALIGN, one
 * EA = RtV + offset per element, fGATHER_FINISH) but call the Q variants of
 * the logging helpers and pass the vector predicate QsV to them.  How the
 * predicate gates each element is decided inside those helpers, which are
 * defined elsewhere.
 *   vgathermwq   word offsets, word elements
 *   vgathermhq   halfword offsets, halfword elements
 *   vgathermhwq  word offsets from a vector pair, halfword elements; for
 *                word index i and pair half j the output index is 2*i+j
 */
2393EXTINSN(V6_vgathermwq,  "if (Qs4) vtmp.w=vgather(Rt32,Mu2,Vv32.w).w", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather Words",
2394{
2395    fHIDE(int i;)
2396	fHIDE(int element_size = 4;)
2397    fHIDE(fGATHER_INIT( RtV, MuV, element_size);)
2398    fVLASTBYTE(MuV, element_size);
2399    fVALIGN(RtV, element_size);
2400    fVFOREACH(32, i) {
2401        EA = RtV+VvV.uw[i];
2402        fVLOG_VTCM_GATHER_WORDQ(EA, VvV.uw[i], i,QsV,MuV);
2403    }
2404    fGATHER_FINISH()
2405})
2406EXTINSN(V6_vgathermhq,  "if (Qs4) vtmp.h=vgather(Rt32,Mu2,Vv32.h).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords",
2407{
2408    fHIDE(int i;)
2409	fHIDE(int element_size = 2;)
2410    fHIDE(fGATHER_INIT( RtV, MuV, element_size);)
2411    fVLASTBYTE(MuV, element_size);
2412    fVALIGN(RtV, element_size);
2413    fVFOREACH(16, i) {
2414        EA = RtV+VvV.uh[i];
2415        fVLOG_VTCM_GATHER_HALFWORDQ(EA, VvV.uh[i], i,QsV,MuV);
2416    }
2417    fGATHER_FINISH()
2418})
2419
2420
2421
2422EXTINSN(V6_vgathermhwq,  "if (Qs4) vtmp.h=vgather(Rt32,Mu2,Vvv32.w).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA_DV,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords",
2423{
2424    fHIDE(int i;)
2425    fHIDE(int j;)
2426	fHIDE(int element_size = 2;)
2427    fHIDE(fGATHER_INIT( RtV, MuV, element_size);)
2428    fVLASTBYTE(MuV, element_size);
2429    fVALIGN(RtV, element_size);
2430    fVFOREACH(32, i) {
2431       for(j = 0; j < 2; j++) {
2432            EA = RtV+VvvV.v[j].uw[i];
2433            fVLOG_VTCM_GATHER_HALFWORDQ_DV(EA, VvvV.v[j].uw[i], (2*i+j),i,j,QsV,MuV);
2434       }
2435    }
2436    fGATHER_FINISH()
2437})
2438
2439
2440
/*
 * V6_vscattermw: scatter the words of Vw (element_size = 4) to the addresses
 * formed from the word offsets in Vv.
 *
 * For every lane i visited by fVFOREACH(32, i) the effective address is
 * RtV + VvV.uw[i]; the store of lane i of VwV is recorded through
 * fVLOG_VTCM_WORD.  The sequence ends with fSCATTER_FINISH(0).
 *
 * NOTE(review): fSCATTER_INIT, fVLASTBYTE, fVALIGN and fSCATTER_FINISH are
 * not defined in the visible part of this file; the meaning of the 0
 * argument (the "+=" scatter forms pass 1) should be confirmed there.
 */
2441EXTINSN(V6_vscattermw , "vscatter(Rt32,Mu2,Vv32.w).w=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter Words",
2442{
2443    fHIDE(int i;)
2444	fHIDE(int element_size = 4;)
2445    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2446    fVLASTBYTE(MuV, element_size);
2447    fVALIGN(RtV, element_size);
2448    fVFOREACH(32, i) {
2449        EA = RtV+VvV.uw[i];
2450        fVLOG_VTCM_WORD(EA, VvV.uw[i], VwV,i,MuV);
2451    }
2452    fSCATTER_FINISH(0)
2453})
2454
2455
2456
/*
 * V6_vscattermh: scatter the halfwords of Vw (element_size = 2) to the
 * addresses formed from the halfword offsets in Vv.
 *
 * For every lane i visited by fVFOREACH(16, i) the effective address is
 * RtV + VvV.uh[i]; the store of lane i of VwV is recorded through
 * fVLOG_VTCM_HALFWORD.  The sequence ends with fSCATTER_FINISH(0).
 *
 * NOTE(review): the helper macros used here are not defined in the visible
 * part of this file; confirm their exact effect on MuV/RtV there.
 */
2457EXTINSN(V6_vscattermh , "vscatter(Rt32,Mu2,Vv32.h).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter halfWords",
2458{
2459    fHIDE(int i;)
2460	fHIDE(int element_size = 2;)
2461    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2462    fVLASTBYTE(MuV, element_size);
2463    fVALIGN(RtV, element_size);
2464    fVFOREACH(16, i) {
2465        EA = RtV+VvV.uh[i];
2466        fVLOG_VTCM_HALFWORD(EA,VvV.uh[i],VwV,i,MuV);
2467    }
2468    fSCATTER_FINISH(0)
2469})
2470
2471
/*
 * V6_vscattermw_add: scatter-accumulate ("+=") of the words of Vw
 * (element_size = 4) at the addresses formed from the word offsets in Vv.
 *
 * Unlike the plain scatter, each offset is first passed through
 * fVALIGN(offset, ALIGNMENT) with ALIGNMENT = 4 before being added to RtV.
 * After the lane loop the operation is tagged with fLOG_SCATTER_OP(4) and
 * finished with fSCATTER_FINISH(1).
 *
 * NOTE(review): fVALIGN is used both as a statement on RtV and as an
 * expression on the offset; whether the expression form yields the aligned
 * value or modifies VvV.uw[i] in place depends on its definition, which is
 * not visible here -- confirm.
 */
2472EXTINSN(V6_vscattermw_add,  "vscatter(Rt32,Mu2,Vv32.w).w+=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter Words-Add",
2473{
2474    fHIDE(int i;)
2475    fHIDE(int ALIGNMENT=4;)
2476	fHIDE(int element_size = 4;)
2477    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2478    fVLASTBYTE(MuV, element_size);
2479    fVALIGN(RtV, element_size);
2480    fVFOREACH(32, i) {
2481        EA = (RtV+fVALIGN(VvV.uw[i],ALIGNMENT));
2482        fVLOG_VTCM_WORD_INCREMENT(EA,VvV.uw[i],VwV,i,ALIGNMENT,MuV);
2483    }
2484    fHIDE(fLOG_SCATTER_OP(4);)
2485    fSCATTER_FINISH(1)
2486})
2487
/*
 * V6_vscattermh_add: scatter-accumulate ("+=") of the halfwords of Vw
 * (element_size = 2) at the addresses formed from the halfword offsets in Vv.
 *
 * Each offset is passed through fVALIGN(offset, ALIGNMENT) with
 * ALIGNMENT = 2 before being added to RtV.  After the lane loop the
 * operation is tagged with fLOG_SCATTER_OP(2) and finished with
 * fSCATTER_FINISH(1).
 *
 * NOTE(review): the value produced by the expression form of fVALIGN
 * depends on its definition, which is not visible here -- confirm.
 */
2488EXTINSN(V6_vscattermh_add,  "vscatter(Rt32,Mu2,Vv32.h).h+=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter halfword-Add",
2489{
2490    fHIDE(int i;)
2491    fHIDE(int ALIGNMENT=2;)
2492	fHIDE(int element_size = 2;)
2493    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2494    fVLASTBYTE(MuV, element_size);
2495    fVALIGN(RtV, element_size);
2496    fVFOREACH(16, i) {
2497        EA = (RtV+fVALIGN(VvV.uh[i],ALIGNMENT));
2498        fVLOG_VTCM_HALFWORD_INCREMENT(EA,VvV.uh[i],VwV,i,ALIGNMENT,MuV);
2499    }
2500    fHIDE(fLOG_SCATTER_OP(2);)
2501    fSCATTER_FINISH(1)
2502})
2503
2504
/*
 * V6_vscattermwq: predicated scatter of the words of Vw (element_size = 4)
 * to the addresses formed from the word offsets in Vv.
 *
 * For every lane i visited by fVFOREACH(32, i) the effective address is
 * RtV + VvV.uw[i]; the predicate QsV is handed to fVLOG_VTCM_WORDQ together
 * with the lane index.  The sequence ends with fSCATTER_FINISH(0).
 *
 * NOTE(review): per the "if (Qs4)" syntax the predicate presumably decides
 * which lanes are stored; that logic lives in the logging macro, which is
 * not visible here -- confirm.
 */
2505EXTINSN(V6_vscattermwq,  "if (Qs4) vscatter(Rt32,Mu2,Vv32.w).w=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter Words conditional",
2506{
2507    fHIDE(int i;)
2508	fHIDE(int element_size = 4;)
2509    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2510    fVLASTBYTE(MuV, element_size);
2511    fVALIGN(RtV, element_size);
2512    fVFOREACH(32, i) {
2513        EA = RtV+VvV.uw[i];
2514        fVLOG_VTCM_WORDQ(EA,VvV.uw[i], VwV,i,QsV,MuV);
2515    }
2516    fSCATTER_FINISH(0)
2517})
2518
/*
 * V6_vscattermhq: predicated scatter of the halfwords of Vw
 * (element_size = 2) to the addresses formed from the halfword offsets in Vv.
 *
 * For every lane i visited by fVFOREACH(16, i) the effective address is
 * RtV + VvV.uh[i]; the predicate QsV is handed to fVLOG_VTCM_HALFWORDQ
 * together with the lane index.  The sequence ends with fSCATTER_FINISH(0).
 *
 * NOTE(review): per the "if (Qs4)" syntax the predicate presumably decides
 * which lanes are stored; that logic lives in the logging macro, which is
 * not visible here -- confirm.
 */
2519EXTINSN(V6_vscattermhq,  "if (Qs4) vscatter(Rt32,Mu2,Vv32.h).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter HalfWords conditional",
2520{
2521    fHIDE(int i;)
2522	fHIDE(int element_size = 2;)
2523    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2524    fVLASTBYTE(MuV, element_size);
2525    fVALIGN(RtV, element_size);
2526    fVFOREACH(16, i) {
2527        EA = RtV+VvV.uh[i];
2528        fVLOG_VTCM_HALFWORDQ(EA,VvV.uh[i],VwV,i,QsV,MuV);
2529    }
2530    fSCATTER_FINISH(0)
2531})
2532
2533
2534
2535
/*
 * V6_vscattermhw: scatter the halfwords of Vw (element_size = 2) to the
 * addresses formed from the word offsets held in the vector pair Vvv.
 *
 * For every lane i visited by fVFOREACH(32, i) and for both halves j of the
 * pair, the effective address is RtV + VvvV.v[j].uw[i]; the store is
 * recorded under element index 2*i+j.  The sequence ends with
 * fSCATTER_FINISH(0).
 *
 * The description string reads "Scatter halfwords": the stored elements are
 * halfwords (".h=" in the syntax, element_size = 2), matching the wording of
 * the conditional and "+=" forms of this instruction.
 *
 * NOTE(review): the helper macros used here are not defined in the visible
 * part of this file; confirm their exact effect on MuV/RtV there.
 */
2536EXTINSN(V6_vscattermhw , "vscatter(Rt32,Mu2,Vvv32.w).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA_DV,A_CVI_VM,A_MEMLIKE), "Scatter halfwords",
2537{
2538    fHIDE(int i;)
2539    fHIDE(int j;)
2540	fHIDE(int element_size = 2;)
2541    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2542    fVLASTBYTE(MuV, element_size);
2543    fVALIGN(RtV, element_size);
2544    fVFOREACH(32, i) {
        /* j selects the half of the offset pair; the two halves interleave in the element index */
2545        for(j = 0; j < 2; j++) {
2546            EA = RtV+VvvV.v[j].uw[i];
2547            fVLOG_VTCM_HALFWORD_DV(EA,VvvV.v[j].uw[i],VwV,(2*i+j),i,j,MuV);
2548        }
2549    }
2550    fSCATTER_FINISH(0)
2551})
2552
2553
/*
 * v6mpyvubs10_vxx: accumulating ("+=") ":v" form of v6mpy.
 *
 * Per lane i, six coefficients are pulled with fGET10BIT from the two words
 * VvvV.v[0].uw[i] (c00, c01, c02) and VvvV.v[1].uw[i] (c10, c11, c12).  The
 * immediate uiV (#u2 in the syntax; the branches cover 0..3) selects which
 * unsigned bytes of VuuV.v[0]/VuuV.v[1] are multiplied via fMPY16US with
 * which coefficients:
 *   uiV 0, 2: six products are added to VxxV.v[1], three to VxxV.v[0]
 *   uiV 1, 3: three products are added to VxxV.v[1], six to VxxV.v[0]
 *
 * NOTE(review): the lane variable i is not declared in this body;
 * presumably ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_VX_FWD wraps the code in a
 * per-lane loop the way DO_FOR_EACH_CODE does at the top of this file --
 * confirm.  fGET10BIT is assumed to extract the 10-bit field selected by
 * its last argument -- verify against its definition.
 */
2554ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_VX_FWD(32, v6mpyvubs10_vxx, "Vxx32.w+=v6mpy(Vuu32.ub,Vvv32.b,#u2):v", "",
2555    fHIDE(size2s_t c00;)
2556    fGET10BIT(c00, VvvV.v[0].uw[i], 0)
2557    fHIDE(size2s_t c01;)
2558    fGET10BIT(c01, VvvV.v[0].uw[i], 1)
2559    fHIDE(size2s_t c02;)
2560    fGET10BIT(c02, VvvV.v[0].uw[i], 2)
2561
2562	fHIDE(size2s_t c10;)
2563    fGET10BIT(c10, VvvV.v[1].uw[i], 0)
2564    fHIDE(size2s_t c11;)
2565    fGET10BIT(c11, VvvV.v[1].uw[i], 1)
2566    fHIDE(size2s_t c12;)
2567    fGET10BIT(c12, VvvV.v[1].uw[i], 2)
2568
2569    if (uiV == 0) {
2570        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10);
2571        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11);
2572        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12);
2573
2574        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00);
2575        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
2576        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02);
2577
2578        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c10);
2579        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11);
2580        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c12);
2581
2582    } else if (uiV == 1) {
2583        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c00);
2584        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c01);
2585        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c02);
2586
2587        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10);
2588        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11);
2589        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12);
2590
2591        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00);
2592        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
2593        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02);
2594
2595    } else if (uiV == 2) {
2596        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10);
2597        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
2598        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12);
2599
2600        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00);
2601        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01);
2602        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02);
2603
2604        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c10);
2605        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c11);
2606        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c12);
2607
2608    } else if (uiV == 3) {
2609        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c00);
2610        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01);
2611        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c02);
2612
2613        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10);
2614        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
2615        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12);
2616
2617        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00);
2618        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01);
2619        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02);
2620    }
2621)
/*
 * v6mpyhubs10_vxx: accumulating ("+=") ":h" form of v6mpy.
 *
 * Per lane i, six coefficients are pulled with fGET10BIT from the two words
 * VvvV.v[0].uw[i] (c00, c01, c02) and VvvV.v[1].uw[i] (c10, c11, c12).  The
 * immediate uiV (#u2 in the syntax; the branches cover 0..3) selects which
 * unsigned bytes of VuuV.v[0]/VuuV.v[1] are multiplied via fMPY16US with
 * which coefficients:
 *   uiV 0, 2: six products are added to VxxV.v[1], three to VxxV.v[0]
 *   uiV 1, 3: three products are added to VxxV.v[1], six to VxxV.v[0]
 * The byte selection differs from the ":v" form (here the first two taps of
 * each group of three come from the same Vuu half, two bytes apart).
 *
 * NOTE(review): the lane variable i is not declared in this body;
 * presumably ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_VX_FWD supplies the per-lane
 * loop -- confirm.  fGET10BIT is assumed to extract the 10-bit field
 * selected by its last argument -- verify against its definition.
 */
2622ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_VX_FWD(32, v6mpyhubs10_vxx, "Vxx32.w+=v6mpy(Vuu32.ub,Vvv32.b,#u2):h", "",
2623    fHIDE(size2s_t c00;)
2624    fGET10BIT(c00, VvvV.v[0].uw[i], 0)
2625    fHIDE(size2s_t c01;)
2626    fGET10BIT(c01, VvvV.v[0].uw[i], 1)
2627    fHIDE(size2s_t c02;)
2628    fGET10BIT(c02, VvvV.v[0].uw[i], 2)
2629    fHIDE(size2s_t c10;)
2630    fGET10BIT(c10, VvvV.v[1].uw[i], 0)
2631    fHIDE(size2s_t c11;)
2632    fGET10BIT(c11, VvvV.v[1].uw[i], 1)
2633    fHIDE(size2s_t c12;)
2634    fGET10BIT(c12, VvvV.v[1].uw[i], 2)
2635
2636    if (uiV == 0) {
2637        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10);
2638        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11);
2639        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12);
2640
2641        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00);
2642        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
2643        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02);
2644
2645        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c10);
2646        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11);
2647        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c12);
2648
2649    } else if (uiV == 1) {
2650        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c00);
2651        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c01);
2652        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c02);
2653
2654        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10);
2655        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11);
2656        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12);
2657
2658        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00);
2659        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
2660        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02);
2661
2662    }  else if (uiV == 2) {
2663        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10);
2664        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
2665        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12);
2666
2667        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00);
2668        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01);
2669        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02);
2670
2671        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c10);
2672        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c11);
2673        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c12);
2674
2675    } else if (uiV == 3) {
2676        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c00);
2677        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01);
2678        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c02);
2679
2680        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10);
2681        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
2682        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12);
2683
2684        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00);
2685        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01);
2686        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02);
2687    }
2688)
2689
2690
/*
 * v6mpyvubs10: non-accumulating ("=") ":v" form of v6mpy.
 *
 * Per lane i, six coefficients are pulled with fGET10BIT from the two words
 * VvvV.v[0].uw[i] (c00, c01, c02) and VvvV.v[1].uw[i] (c10, c11, c12).  The
 * immediate uiV (#u2 in the syntax; the branches cover 0..3) selects which
 * unsigned bytes of VuuV.v[0]/VuuV.v[1] are multiplied via fMPY16US with
 * which coefficients.  In every branch the first product written to each of
 * VddV.v[1].w[i] and VddV.v[0].w[i] uses plain assignment, so the previous
 * destination contents are discarded; the remaining products are added:
 *   uiV 0, 2: VddV.v[1] is the sum of six products, VddV.v[0] of three
 *   uiV 1, 3: VddV.v[1] is the sum of three products, VddV.v[0] of six
 *
 * NOTE(review): the lane variable i is not declared in this body;
 * presumably ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC supplies the per-lane loop --
 * confirm.  fGET10BIT is assumed to extract the 10-bit field selected by
 * its last argument -- verify against its definition.
 */
2691ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32, v6mpyvubs10, "Vdd32.w=v6mpy(Vuu32.ub,Vvv32.b,#u2):v", "",
2692    fHIDE(short c00;)
2693    fGET10BIT(c00, VvvV.v[0].uw[i], 0)
2694    fHIDE(short c01;)
2695    fGET10BIT(c01, VvvV.v[0].uw[i], 1)
2696    fHIDE(short c02;)
2697    fGET10BIT(c02, VvvV.v[0].uw[i], 2)
2698    fHIDE(short c10;)
2699    fGET10BIT(c10, VvvV.v[1].uw[i], 0)
2700    fHIDE(short c11;)
2701    fGET10BIT(c11, VvvV.v[1].uw[i], 1)
2702    fHIDE(short c12;)
2703    fGET10BIT(c12, VvvV.v[1].uw[i], 2)
2704
2705
2706
2707    if (uiV == 0) {
2708        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10);
2709        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11);
2710        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12);
2711
2712        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00);
2713        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
2714        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02);
2715
2716        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c10);
2717        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11);
2718        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c12);
2719
2720    }  else if (uiV == 1) {
2721        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c00);
2722        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c01);
2723        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c02);
2724
2725        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10);
2726        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11);
2727        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12);
2728
2729        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00);
2730        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
2731        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02);
2732
2733    }  else if (uiV == 2) {
2734        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10);
2735        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
2736        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12);
2737
2738        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00);
2739        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01);
2740        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02);
2741
2742        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c10);
2743        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c11);
2744        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c12);
2745
2746    } else if (uiV == 3) {
2747        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c00);
2748        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01);
2749        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c02);
2750
2751        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10);
2752        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
2753        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12);
2754
2755        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00);
2756        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01);
2757        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02);
2758    }
2759)
2760
/*
 * v6mpyhubs10: non-accumulating ("=") ":h" form of v6mpy.
 *
 * Per lane i, six coefficients are pulled with fGET10BIT from the two words
 * VvvV.v[0].uw[i] (c00, c01, c02) and VvvV.v[1].uw[i] (c10, c11, c12).  The
 * immediate uiV (#u2 in the syntax; the branches cover 0..3) selects which
 * unsigned bytes of VuuV.v[0]/VuuV.v[1] are multiplied via fMPY16US with
 * which coefficients.  In every branch the first product written to each of
 * VddV.v[1].w[i] and VddV.v[0].w[i] uses plain assignment, so the previous
 * destination contents are discarded; the remaining products are added:
 *   uiV 0, 2: VddV.v[1] is the sum of six products, VddV.v[0] of three
 *   uiV 1, 3: VddV.v[1] is the sum of three products, VddV.v[0] of six
 *
 * NOTE(review): the lane variable i is not declared in this body;
 * presumably ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC supplies the per-lane loop --
 * confirm.  fGET10BIT is assumed to extract the 10-bit field selected by
 * its last argument -- verify against its definition.
 */
2761ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32, v6mpyhubs10, "Vdd32.w=v6mpy(Vuu32.ub,Vvv32.b,#u2):h", "",
2762    fHIDE(short c00;)
2763    fGET10BIT(c00, VvvV.v[0].uw[i], 0)
2764    fHIDE(short c01;)
2765    fGET10BIT(c01, VvvV.v[0].uw[i], 1)
2766    fHIDE(short c02;)
2767    fGET10BIT(c02, VvvV.v[0].uw[i], 2)
2768    fHIDE(short c10;)
2769    fGET10BIT(c10, VvvV.v[1].uw[i], 0)
2770    fHIDE(short c11;)
2771    fGET10BIT(c11, VvvV.v[1].uw[i], 1)
2772    fHIDE(short c12;)
2773    fGET10BIT(c12, VvvV.v[1].uw[i], 2)
2774
2775    if (uiV == 0) {
2776        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10);
2777        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11);
2778        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12);
2779
2780        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00);
2781        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
2782        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02);
2783
2784        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c10);
2785        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11);
2786        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c12);
2787
2788    }  else if (uiV == 1) {
2789        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c00);
2790        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c01);
2791        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c02);
2792
2793        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10);
2794        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11);
2795        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12);
2796
2797        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00);
2798        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
2799        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02);
2800
2801    }  else if (uiV == 2) {
2802        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10);
2803        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
2804        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12);
2805
2806        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00);
2807        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01);
2808        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02);
2809
2810        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c10);
2811        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c11);
2812        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c12);
2813
2814    } else if (uiV == 3) {
2815        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c00);
2816        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01);
2817        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c02);
2818
2819        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10);
2820        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
2821        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12);
2822
2823        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00);
2824        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01);
2825        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02);
2826    }
2827)
2828
2829
/*
 * V6_vscattermhwq: predicated scatter of the halfwords of Vw
 * (element_size = 2) to the addresses formed from the word offsets held in
 * the vector pair Vvv.
 *
 * For every lane i visited by fVFOREACH(32, i) and for both halves j of the
 * pair, the effective address is RtV + VvvV.v[j].uw[i]; the store is
 * recorded under element index 2*i+j.  Note the argument order of the
 * logging macro: QsV comes before i and j here.  The sequence ends with
 * fSCATTER_FINISH(0).
 *
 * NOTE(review): how QsV masks individual elements is decided inside
 * fVLOG_VTCM_HALFWORDQ_DV, which is not visible here -- confirm.
 */
2830EXTINSN(V6_vscattermhwq,  "if (Qs4) vscatter(Rt32,Mu2,Vvv32.w).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA_DV,A_CVI_VM,A_MEMLIKE), "Scatter halfwords conditional",
2831{
2832    fHIDE(int i;)
2833    fHIDE(int j;)
2834	fHIDE(int element_size = 2;)
2835    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2836    fVLASTBYTE(MuV, element_size);
2837    fVALIGN(RtV, element_size);
2838    fVFOREACH(32, i) {
2839        for(j = 0; j < 2; j++) {
2840            EA = RtV+VvvV.v[j].uw[i];
2841            fVLOG_VTCM_HALFWORDQ_DV(EA,VvvV.v[j].uw[i],VwV,(2*i+j),QsV,i,j,MuV);
2842        }
2843    }
2844    fSCATTER_FINISH(0)
2845})
2846
/*
 * V6_vscattermhw_add: scatter-accumulate ("+=") of the halfwords of Vw
 * (element_size = 2) at the addresses formed from the word offsets held in
 * the vector pair Vvv.
 *
 * For every lane i visited by fVFOREACH(32, i) and for both halves j of the
 * pair, the offset VvvV.v[j].uw[i] is passed through
 * fVALIGN(offset, ALIGNMENT) with ALIGNMENT = 2 and added to RtV; the update
 * is recorded under element index 2*i+j.  After the loops the operation is
 * tagged with fLOG_SCATTER_OP(2) and finished with fSCATTER_FINISH(1).
 *
 * NOTE(review): the value produced by the expression form of fVALIGN
 * depends on its definition, which is not visible here -- confirm.
 */
2847EXTINSN(V6_vscattermhw_add,  "vscatter(Rt32,Mu2,Vvv32.w).h+=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA_DV,A_CVI_VM,A_MEMLIKE), "Scatter halfwords-add",
2848{
2849    fHIDE(int i;)
2850    fHIDE(int j;)
2851    fHIDE(int ALIGNMENT=2;)
2852	fHIDE(int element_size = 2;)
2853    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2854    fVLASTBYTE(MuV, element_size);
2855    fVALIGN(RtV, element_size);
2856    fVFOREACH(32, i) {
2857        for(j = 0; j < 2; j++) {
             /* the second ';' below is an empty statement and has no effect */
2858             EA =  RtV + fVALIGN(VvvV.v[j].uw[i],ALIGNMENT);;
2859             fVLOG_VTCM_HALFWORD_INCREMENT_DV(EA,VvvV.v[j].uw[i],VwV,(2*i+j),i,j,ALIGNMENT,MuV);
2860        }
2861    }
2862    fHIDE(fLOG_SCATTER_OP(2);)
2863    fSCATTER_FINISH(1)
2864})
2865
/*
 * V6_vprefixqb: running (prefix) sum of the predicate bits of Qv, written
 * to the byte lanes of Vd.
 *
 * For each lane i visited by fVFOREACH(8, i), bit i of QvV is added to acc
 * and the running total is stored in VdV.ub[i].  acc is declared size1u_t,
 * so the total is held in that type.
 *
 * NOTE(review): size1u_t is presumably an unsigned 8-bit type, in which case
 * the sum wraps modulo 256 -- confirm against its typedef.
 */
2866EXTINSN(V6_vprefixqb,"Vd32.b=prefixsum(Qv4)",   ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS),  "parallel prefix sum of Q into byte",
2867{
2868    fHIDE(int i;)
2869    fHIDE(size1u_t acc = 0;)
2870    fVFOREACH(8, i) {
2871        acc += fGETQBIT(QvV,i);
2872        VdV.ub[i] = acc;
2873    }
2874    } )
/*
 * V6_vprefixqh: running (prefix) sum of the predicate bits of Qv, written
 * to the halfword lanes of Vd.
 *
 * For each lane i visited by fVFOREACH(16, i), the two predicate bits
 * 2*i and 2*i+1 of QvV are added to acc and the running total is stored in
 * VdV.uh[i].  acc is declared size2u_t.
 */
2875EXTINSN(V6_vprefixqh,"Vd32.h=prefixsum(Qv4)",   ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS),  "parallel prefix sum of Q into halfwords",
2876{
2877    fHIDE(int i;)
2878    fHIDE(size2u_t acc = 0;)
2879    fVFOREACH(16, i) {
2880        acc += fGETQBIT(QvV,i*2+0);
2881        acc += fGETQBIT(QvV,i*2+1);
2882        VdV.uh[i] = acc;
2883    }
2884    } )
/*
 * V6_vprefixqw: running (prefix) sum of the predicate bits of Qv, written
 * to the word lanes of Vd.
 *
 * For each lane i visited by fVFOREACH(32, i), the four predicate bits
 * 4*i .. 4*i+3 of QvV are added to acc and the running total is stored in
 * VdV.uw[i].  acc is declared size4u_t.
 */
2885EXTINSN(V6_vprefixqw,"Vd32.w=prefixsum(Qv4)",   ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS),  "parallel prefix sum of Q into words",
2886{
2887    fHIDE(int i;)
2888    fHIDE(size4u_t acc = 0;)
2889    fVFOREACH(32, i) {
2890        acc += fGETQBIT(QvV,i*4+0);
2891        acc += fGETQBIT(QvV,i*4+1);
2892        acc += fGETQBIT(QvV,i*4+2);
2893        acc += fGETQBIT(QvV,i*4+3);
2894        VdV.uw[i] = acc;
2895    }
2896    } )
2897
2898
2899
2900
2901
2902/******************************************************************************
2903 DEBUG Vector/Register Printing
2904 ******************************************************************************/
2905
/*
 * PRINT_VU(TYPE, TYPE2, COUNT): debug helper that expands to statements
 * printing vector register Vu to stdout.
 *
 *   TYPE   member of VuV that is indexed per element (VuV.TYPE[i])
 *   TYPE2  printf conversion, as a string literal, used for one element
 *   COUNT  right shift applied to the byte length fVBYTES() to obtain the
 *          number of elements printed
 *
 * Output is "V<VuN>: " followed by the elements separated by spaces, then
 * the string "\\n", then a flush of stdout.
 *
 * Things to be aware of when using it:
 *  - It expands to bare declarations of `i` and `vec_len`, so it must be
 *    placed where those names are free and declarations are allowed.
 *  - The loop bound `i<vec_len>>COUNT` parses as `i < (vec_len >> COUNT)`.
 *  - "\\n" is a backslash followed by the letter n in C, not a newline.
 *    NOTE(review): presumably written this way to survive a later
 *    stringification/regeneration step -- confirm before changing.
 *  - The last line ends with a backslash, so the definition continues onto
 *    the blank line that follows; keep that line blank.
 */
2906#define PRINT_VU(TYPE, TYPE2, COUNT)\
2907    int i;  \
2908    size4u_t vec_len = fVBYTES();\
2909    fprintf(stdout,"V%2d: ",VuN);  \
2910    for (i=0;i<vec_len>>COUNT;i++) {         \
2911        fprintf(stdout,TYPE2 " ", VuV.TYPE[i]); \
2912    };  \
2913    fprintf(stdout,"\\n");  \
2914	fflush(stdout);\
2915
/* Remove the ATTR_VMEM* helper macros at the end of the MMVEC section. */
2916#undef ATTR_VMEM
2917#undef ATTR_VMEMU
2918#undef ATTR_VMEM_NT
2919
2920#endif /* NO_MMVEC */
2921
/*
 * EXTINSN is removed again only when this file supplied the fallback
 * definition itself (EXTINSN mapped to Q6INSN at the top of the file, which
 * also sets __SELF_DEF_EXTINSN).  A definition provided by the includer is
 * left untouched.
 */
2922#ifdef __SELF_DEF_EXTINSN
2923#undef EXTINSN
2924#undef __SELF_DEF_EXTINSN
2925#endif /* __SELF_DEF_EXTINSN */
2926