1// Inferno utils/6l/span.c
2// https://bitbucket.org/inferno-os/inferno-os/src/master/utils/6l/span.c
3//
4//	Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
5//	Portions Copyright © 1995-1997 C H Forsyth (forsyth@terzarima.net)
6//	Portions Copyright © 1997-1999 Vita Nuova Limited
7//	Portions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com)
8//	Portions Copyright © 2004,2006 Bruce Ellis
9//	Portions Copyright © 2005-2007 C H Forsyth (forsyth@terzarima.net)
10//	Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
11//	Portions Copyright © 2009 The Go Authors. All rights reserved.
12//
13// Permission is hereby granted, free of charge, to any person obtaining a copy
14// of this software and associated documentation files (the "Software"), to deal
15// in the Software without restriction, including without limitation the rights
16// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17// copies of the Software, and to permit persons to whom the Software is
18// furnished to do so, subject to the following conditions:
19//
20// The above copyright notice and this permission notice shall be included in
21// all copies or substantial portions of the Software.
22//
23// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
26// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29// THE SOFTWARE.
30
31package x86
32
33import (
34	"cmd/internal/obj"
35	"cmd/internal/objabi"
36	"cmd/internal/sys"
37	"encoding/binary"
38	"fmt"
39	"internal/buildcfg"
40	"log"
41	"strings"
42)
43
44var (
45	plan9privates *obj.LSym
46)
47
48// Instruction layout.
49
// Loop alignment tuning.
//
// A loop entry is defined as the target of a backward jump. The goal is
// to place loop entries on a loopAlign-byte boundary, spending at most
// maxLoopPad bytes of NOP padding to get there.
//
// For comparison, gcc's 'generic x86-64' config uses a maximum pad of 10
// and aligns every jump target, not only backward ones.
//
// As of 6/1/2012, setting maxLoopPad = 10 here had a very slight but
// negative effect, so alignment is switched off by leaving
// maxLoopPad = 0. The code stays for reference and future experiments.

// loopAlign is the boundary, in bytes, that loop entries are aligned to.
const loopAlign = 16

// maxLoopPad is the largest number of NOP bytes inserted to align a loop
// entry; 0 disables loop alignment.
const maxLoopPad = 0
67
// Jump-target property flags. Each flag owns one bit, so several
// properties can be combined in a single value.
const (
	// branchBackwards is set for targets located behind the jump.
	// It is how jumps to loop headers are expressed.
	branchBackwards = 1 << 0
	// branchShort is set for branches whose target is close enough
	// that the offset lies in the -128..127 range.
	branchShort = 1 << 1
	// branchLoopHead is set on a loop entry.
	// It is consulted when inserting padding for misaligned loops.
	branchLoopHead = 1 << 2
)
80
// opBytes is the fixed-size byte array that stores the encoding bytes of
// one optab entry. Every ytab row of the entry claims a fixed number of
// bytes out of this array.
//
// Keep the length at the smallest value that still fits the longest
// optab op line.
type opBytes [31]byte
87
// Optab is one row of the instruction table (optab). It ties an
// instruction to the operand forms it accepts and to the bytes used to
// encode each form.
type Optab struct {
	as     obj.As  // instruction this row describes
	ytab   []ytab  // accepted operand forms, scanned in order
	prefix uint8   // P* prefix selector (Px, Pe, Pw, ...)
	op     opBytes // encoding bytes; the ytab scan moves a cursor through them
}
94
// movtab is a table row keyed by an instruction (as) and three uint8
// operand fields (ft, f3t, tt); it carries a code byte and up to four
// op bytes.
// NOTE(review): presumably ft/f3t/tt are the operand classes of the
// from, third and to operands of special-case MOV encodings, and code
// selects the encoding routine — confirm where the table is defined.
type movtab struct {
	as   obj.As
	ft   uint8
	f3t  uint8
	tt   uint8
	code uint8
	op   [4]uint8
}
103
// Operand classes (Ytypes) used in the argList of ytab rows.
//
// oclass computes the most specific class of an operand; the ycover table
// then records which more general classes that operand also satisfies
// (for example Yi0, Yi1 and Yi8 all count as Ys32, signed 32).
//
// The values are assigned by iota, so the order of this list is
// significant. Ymax is the number of classes and sizes the ycover table.
const (
	Yxxx = iota
	Ynone
	Yi0 // $0
	Yi1 // $1
	Yu2 // $x, x fits in uint2
	Yi8 // $x, x fits in int8
	Yu8 // $x, x fits in uint8
	Yu7 // $x, x in 0..127 (fits in both int8 and uint8)
	Ys32
	Yi32
	Yi64
	Yiauto
	Yal
	Ycl
	Yax
	Ycx
	Yrb
	Yrl
	Yrl32 // Yrl on 32-bit system
	Yrf
	Yf0
	Yrx
	Ymb
	Yml
	Ym
	Ybr
	Ycs
	Yss
	Yds
	Yes
	Yfs
	Ygs
	Ygdtr
	Yidtr
	Yldtr
	Ymsw
	Ytask
	Ycr0
	Ycr1
	Ycr2
	Ycr3
	Ycr4
	Ycr5
	Ycr6
	Ycr7
	Ycr8
	Ydr0
	Ydr1
	Ydr2
	Ydr3
	Ydr4
	Ydr5
	Ydr6
	Ydr7
	Ytr0
	Ytr1
	Ytr2
	Ytr3
	Ytr4
	Ytr5
	Ytr6
	Ytr7
	Ymr
	Ymm
	Yxr0          // X0 only. "<XMM0>" notation in Intel manual.
	YxrEvexMulti4 // [ X<n> - X<n+3> ]; multisource YxrEvex
	Yxr           // X0..X15
	YxrEvex       // X0..X31
	Yxm
	YxmEvex       // YxrEvex+Ym
	Yxvm          // VSIB vector array; vm32x/vm64x
	YxvmEvex      // Yxvm which permits High-16 X register as index.
	YyrEvexMulti4 // [ Y<n> - Y<n+3> ]; multisource YyrEvex
	Yyr           // Y0..Y15
	YyrEvex       // Y0..Y31
	Yym
	YymEvex   // YyrEvex+Ym
	Yyvm      // VSIB vector array; vm32y/vm64y
	YyvmEvex  // Yyvm which permits High-16 Y register as index.
	YzrMulti4 // [ Z<n> - Z<n+3> ]; multisource YzrEvex
	Yzr       // Z0..Z31
	Yzm       // Yzr+Ym
	Yzvm      // VSIB vector array; vm32z/vm64z
	Yk0       // K0
	Yknot0    // K1..K7; write mask
	Yk        // K0..K7; used for KOP
	Ykm       // Yk+Ym; used for KOP
	Ytls
	Ytextsize
	Yindir
	Ymax
)
197
// Encoding forms (Ztypes). A Ztype is the first field of a ytab row; the
// switch on yt.zcase in doasm uses it, together with the opcode bytes at
// the z cursor, to decide which bytes to lay down. For example, Zibo_m is
// an opcode byte, then the encoded addressing mode of the memory operand,
// then a single immediate byte; Zilo_m is the same with a 32-bit
// immediate.
//
// The values are assigned by iota, so the order of this list is
// significant. The list has three groups separated by blank lines:
// the legacy forms, the Zvex* forms, and the Zevex_* forms bracketed by
// Zevex_first and Zevex_last. Zmax is the number of Ztypes.
const (
	Zxxx = iota
	Zlit
	Zlitm_r
	Zlitr_m
	Zlit_m_r
	Z_rp
	Zbr
	Zcall
	Zcallcon
	Zcallduff
	Zcallind
	Zcallindreg
	Zib_
	Zib_rp
	Zibo_m
	Zibo_m_xm
	Zil_
	Zil_rp
	Ziq_rp
	Zilo_m
	Zjmp
	Zjmpcon
	Zloop
	Zo_iw
	Zm_o
	Zm_r
	Z_m_r
	Zm2_r
	Zm_r_xm
	Zm_r_i_xm
	Zm_r_xm_nr
	Zr_m_xm_nr
	Zibm_r // mmx1,mmx2/mem64,imm8
	Zibr_m
	Zmb_r
	Zaut_r
	Zo_m
	Zo_m64
	Zpseudo
	Zr_m
	Zr_m_xm
	Zrp_
	Z_ib
	Z_il
	Zm_ibo
	Zm_ilo
	Zib_rr
	Zil_rr
	Zbyte

	Zvex_rm_v_r
	Zvex_rm_v_ro
	Zvex_r_v_rm
	Zvex_i_rm_vo
	Zvex_v_rm_r
	Zvex_i_rm_r
	Zvex_i_r_v
	Zvex_i_rm_v_r
	Zvex
	Zvex_rm_r_vo
	Zvex_i_r_rm
	Zvex_hr_rm_v_r

	Zevex_first
	Zevex_i_r_k_rm
	Zevex_i_r_rm
	Zevex_i_rm_k_r
	Zevex_i_rm_k_vo
	Zevex_i_rm_r
	Zevex_i_rm_v_k_r
	Zevex_i_rm_v_r
	Zevex_i_rm_vo
	Zevex_k_rmo
	Zevex_r_k_rm
	Zevex_r_v_k_rm
	Zevex_r_v_rm
	Zevex_rm_k_r
	Zevex_rm_v_k_r
	Zevex_rm_v_r
	Zevex_last

	Zmax
)
282
// Prefix selectors (P*) and REX bit masks (Rx*).
//
// A P constant is stored in the prefix field of an optab line and
// controls which prefix bytes are emitted for the instruction. Some
// values double as the literal prefix byte (Pe = 0x66, Pm = 0x0f,
// Pw = 0x48); others are purely symbolic tags, as noted per constant.
// P constants also appear inside opBytes for entries that select the
// prefix per operand form (for example ACMPPD uses opBytes{Pe, 0xc2}).
const (
	Px   = 0
	Px1  = 1    // symbolic; exact value doesn't matter
	P32  = 0x32 // 32-bit only
	Pe   = 0x66 // operand escape
	Pm   = 0x0f // 2byte opcode escape
	Pq   = 0xff // both escapes: 66 0f
	Pb   = 0xfe // byte operands
	Pf2  = 0xf2 // xmm escape 1: f2 0f
	Pf3  = 0xf3 // xmm escape 2: f3 0f
	Pef3 = 0xf5 // xmm escape 2 with 16-bit prefix: 66 f3 0f
	Pq3  = 0x67 // xmm escape 3: 66 48 0f
	Pq4  = 0x68 // xmm escape 4: 66 0F 38
	Pq4w = 0x69 // Pq4 with Rex.w 66 0F 38
	Pq5  = 0x6a // xmm escape 5: F3 0F 38
	Pq5w = 0x6b // Pq5 with Rex.w F3 0F 38
	Pfw  = 0xf4 // Pf3 with Rex.w: f3 48 0f
	Pw   = 0x48 // Rex.w
	Pw8  = 0x90 // symbolic; exact value doesn't matter
	Py   = 0x80 // defaults to 64-bit mode
	Py1  = 0x81 // symbolic; exact value doesn't matter
	Py3  = 0x83 // symbolic; exact value doesn't matter
	Pavx = 0x84 // symbolic: exact value doesn't matter

	RxrEvex = 1 << 4 // AVX512 extension to REX.R/VEX.R
	Rxw     = 1 << 3 // =1, 64-bit operand size
	Rxr     = 1 << 2 // extend modrm reg
	Rxx     = 1 << 1 // extend sib index
	Rxb     = 1 << 0 // extend modrm r/m, sib base, or opcode reg
)
313
const (
	// Encoding for VEX prefix in tables.
	// The P, L, and W fields are chosen to match
	// their eventual locations in the VEX prefix bytes.

	// Using spare bit to make leading [E]VEX encoding byte different from
	// 0x0f even if all other VEX fields are 0.
	avxEscape = 1 << 6

	// P field - 2 bits
	vex66 = 1 << 0
	vexF3 = 2 << 0
	vexF2 = 3 << 0
	// L field - 1 bit
	// vexLZ, vexLIG and vex128 all have the value 0; the separate names
	// only record which notation the table entry was written with.
	vexLZ  = 0 << 2
	vexLIG = 0 << 2
	vex128 = 0 << 2
	vex256 = 1 << 2
	// W field - 1 bit
	// vexWIG and vexW0 both have the value 0.
	vexWIG = 0 << 7
	vexW0  = 0 << 7
	vexW1  = 1 << 7
	// M field - 5 bits, but mostly reserved; we can store up to 3
	vex0F   = 1 << 3
	vex0F38 = 2 << 3
	vex0F3A = 3 << 3
)
345
// ycover is a Ymax-by-Ymax table indexed as ycover[have*Ymax+want]. An
// entry set to 1 means an operand whose specific class is "have" also
// counts as class "want" (for example ycover[Yi0*Ymax+Ys32] = 1).
// It is set up in instinit.
var ycover [Ymax * Ymax]uint8
347
// reg is a per-register table with one int slot for each of the MAXREG
// register numbers.
// NOTE(review): presumably holds each register's encoding number and is
// filled in during initialization — confirm where it is populated.
var reg [MAXREG]int
349
// regrex is a per-register table with MAXREG+1 int slots.
// NOTE(review): presumably holds the REX/Rx* bits each register needs and
// is filled in during initialization — confirm where it is populated.
var regrex [MAXREG + 1]int
351
// ynone is the operand table for optab entries written without operands
// (AAAA, ACLC, ACPUID and many more): its only row has an empty argList
// and uses the Zlit encoding form.
var ynone = []ytab{
	{Zlit, 1, argList{}},
}
355
// ytext lists two Zpseudo forms, both ending in a Ytextsize operand:
// (Ymb, Ytextsize) and (Ymb, Yi32, Ytextsize).
// NOTE(review): presumably the table for the TEXT pseudo-instruction —
// confirm in optab.
var ytext = []ytab{
	{Zpseudo, 0, argList{Ymb, Ytextsize}},
	{Zpseudo, 1, argList{Ymb, Yi32, Ytextsize}},
}
360
// ynop lists Zpseudo forms that take either no operand or a single
// Yiauto, Yml, Yrf or Yxr operand.
// NOTE(review): the four single-operand rows appear twice. If the ytab
// scan stops at the first matching row, the second copies are never
// selected — confirm before removing them.
var ynop = []ytab{
	{Zpseudo, 0, argList{}},
	{Zpseudo, 0, argList{Yiauto}},
	{Zpseudo, 0, argList{Yml}},
	{Zpseudo, 0, argList{Yrf}},
	{Zpseudo, 0, argList{Yxr}},
	{Zpseudo, 0, argList{Yiauto}},
	{Zpseudo, 0, argList{Yml}},
	{Zpseudo, 0, argList{Yrf}},
	{Zpseudo, 1, argList{Yxr}},
}
372
// yfuncdata has a single Zpseudo form taking (Yi32, Ym).
// NOTE(review): presumably the table for the FUNCDATA pseudo-instruction —
// confirm in optab.
var yfuncdata = []ytab{
	{Zpseudo, 0, argList{Yi32, Ym}},
}
376
// ypcdata has a single Zpseudo form taking (Yi32, Yi32).
// NOTE(review): presumably the table for the PCDATA pseudo-instruction —
// confirm in optab.
var ypcdata = []ytab{
	{Zpseudo, 0, argList{Yi32, Yi32}},
}
380
// yxorb lists the operand forms of the byte-sized (Pb) two-operand ALU
// entries in optab, such as AADCB, AADDB and AANDB.
//
// With AADDB's opBytes{0x04, 0x80, 00, 0x00, 0x02} the rows line up as:
// (Yi32, Yal) at 0x04; (Yi32, Ymb) at 0x80, 00; (Yrb, Ymb) at 0x00;
// (Ymb, Yrb) at 0x02.
var yxorb = []ytab{
	{Zib_, 1, argList{Yi32, Yal}},
	{Zibo_m, 2, argList{Yi32, Ymb}},
	{Zr_m, 1, argList{Yrb, Ymb}},
	{Zm_r, 1, argList{Ymb, Yrb}},
}
387
// yaddl lists the operand forms of the L, Q and W two-operand ALU entries
// in optab, such as AADCL, AADDL, AADDQ, AANDW.
//
// The Yi8 row is first, so a small immediate matches the one-byte
// immediate form (Zibo_m) before the 32-bit immediate forms are tried.
var yaddl = []ytab{
	{Zibo_m, 2, argList{Yi8, Yml}},
	{Zil_, 1, argList{Yi32, Yax}},
	{Zilo_m, 2, argList{Yi32, Yml}},
	{Zr_m, 1, argList{Yrl, Yml}},
	{Zm_r, 1, argList{Yml, Yrl}},
}
395
// yincl lists the operand forms of ADECL and AINCL in optab: a Yrl
// operand (Z_rp) or a Yml operand (Zo_m).
//
// With AINCL's opBytes{0x40, 0xff, 00} the Z_rp row sits at 0x40 and the
// Zo_m row at 0xff, 00.
var yincl = []ytab{
	{Z_rp, 1, argList{Yrl}},
	{Zo_m, 2, argList{Yml}},
}
400
// yincq is the operand table of ADECQ, ADECW, AINCQ and AINCW in optab.
// Unlike yincl it has only the Zo_m form with a Yml operand.
var yincq = []ytab{
	{Zo_m, 2, argList{Yml}},
}
404
// ycmpb lists the operand forms of ACMPB in optab. The immediate, when
// present, is the second operand here (Yal or Ymb comes first).
var ycmpb = []ytab{
	{Z_ib, 1, argList{Yal, Yi32}},
	{Zm_ibo, 2, argList{Ymb, Yi32}},
	{Zm_r, 1, argList{Ymb, Yrb}},
	{Zr_m, 1, argList{Yrb, Ymb}},
}
411
// ycmpl lists the operand forms of ACMPL, ACMPQ and ACMPW in optab. The
// immediate, when present, is the second operand, and the Yi8 row is
// tried before the Yi32 rows.
var ycmpl = []ytab{
	{Zm_ibo, 2, argList{Yml, Yi8}},
	{Z_il, 1, argList{Yax, Yi32}},
	{Zm_ilo, 2, argList{Yml, Yi32}},
	{Zm_r, 1, argList{Yml, Yrl}},
	{Zr_m, 1, argList{Yrl, Yml}},
}
419
// yshb lists three forms whose second operand is Ymb and whose first
// operand is the constant $1 (Yi1), a uint8 immediate (Yu8) or Ycx.
// NOTE(review): presumably the table for byte-sized shifts and rotates —
// confirm in optab.
var yshb = []ytab{
	{Zo_m, 2, argList{Yi1, Ymb}},
	{Zibo_m, 2, argList{Yu8, Ymb}},
	{Zo_m, 2, argList{Ycx, Ymb}},
}
425
// yshl lists four forms whose second operand is Yml and whose first
// operand is the constant $1 (Yi1), a uint8 immediate (Yu8), Ycl or Ycx.
// NOTE(review): presumably the table for word/long/quad shifts and
// rotates — confirm in optab.
var yshl = []ytab{
	{Zo_m, 2, argList{Yi1, Yml}},
	{Zibo_m, 2, argList{Yu8, Yml}},
	{Zo_m, 2, argList{Ycl, Yml}},
	{Zo_m, 2, argList{Ycx, Yml}},
}
432
// ytestl has the same four forms as the tail of yaddl: it lacks yaddl's
// leading (Yi8, Yml) row, so immediates always use a 32-bit form here.
// NOTE(review): presumably the table for the TEST instructions — confirm
// in optab.
var ytestl = []ytab{
	{Zil_, 1, argList{Yi32, Yax}},
	{Zilo_m, 2, argList{Yi32, Yml}},
	{Zr_m, 1, argList{Yrl, Yml}},
	{Zm_r, 1, argList{Yml, Yrl}},
}
439
// ymovb lists four forms over byte-class operands: (Yrb, Ymb),
// (Ymb, Yrb), and an immediate paired with Yrb or Ymb.
// NOTE(review): presumably the table for MOVB — confirm in optab.
var ymovb = []ytab{
	{Zr_m, 1, argList{Yrb, Ymb}},
	{Zm_r, 1, argList{Ymb, Yrb}},
	{Zib_rp, 1, argList{Yi32, Yrb}},
	{Zibo_m, 2, argList{Yi32, Ymb}},
}
446
// ybtl lists the operand forms of the bit-test entries in optab (ABTL,
// ABTCL, ABTRL, ABTSL and their Q and W variants): an int8 immediate or
// a Yrl register paired with a Yml operand.
//
// With ABTL's opBytes{0xba, 04, 0xa3} the immediate row sits at 0xba, 04
// and the register row at 0xa3.
var ybtl = []ytab{
	{Zibo_m, 2, argList{Yi8, Yml}},
	{Zr_m, 1, argList{Yrl, Yml}},
}
451
// ymovw lists five forms: (Yrl, Yml), (Yml, Yrl), an immediate paired
// with Yrl or Yml, and (Yiauto, Yrl) using Zaut_r.
// NOTE(review): presumably the table for MOVW — confirm in optab.
var ymovw = []ytab{
	{Zr_m, 1, argList{Yrl, Yml}},
	{Zm_r, 1, argList{Yml, Yrl}},
	{Zil_rp, 1, argList{Yi32, Yrl}},
	{Zilo_m, 2, argList{Yi32, Yml}},
	{Zaut_r, 2, argList{Yiauto, Yrl}},
}
459
// ymovl extends the general-register forms of ymovw with MMX and XMM
// MOVD forms (marked per row) ahead of the final (Yiauto, Yrl) row.
// NOTE(review): presumably the table for MOVL — confirm in optab.
var ymovl = []ytab{
	{Zr_m, 1, argList{Yrl, Yml}},
	{Zm_r, 1, argList{Yml, Yrl}},
	{Zil_rp, 1, argList{Yi32, Yrl}},
	{Zilo_m, 2, argList{Yi32, Yml}},
	{Zm_r_xm, 1, argList{Yml, Ymr}}, // MMX MOVD
	{Zr_m_xm, 1, argList{Ymr, Yml}}, // MMX MOVD
	{Zm_r_xm, 2, argList{Yml, Yxr}}, // XMM MOVD (32 bit)
	{Zr_m_xm, 2, argList{Yxr, Yml}}, // XMM MOVD (32 bit)
	{Zaut_r, 2, argList{Yiauto, Yrl}},
}
471
// yret lists two Zo_iw forms: no operand, or a single Yi32 operand.
// NOTE(review): presumably the table for the return instructions —
// confirm in optab.
var yret = []ytab{
	{Zo_iw, 1, argList{}},
	{Zo_iw, 1, argList{Yi32}},
}
476
// ymovq lists the operand forms of a 64-bit move in two groups: MMX/XMM
// forms that are also valid in 32-bit mode, then forms valid only in
// 64-bit mode. The comment on each row names the opcode byte (and, where
// given, the prefix) that the row selects.
// NOTE(review): presumably the table for MOVQ — confirm in optab.
var ymovq = []ytab{
	// valid in 32-bit mode
	{Zm_r_xm_nr, 1, argList{Ym, Ymr}},  // 0x6f MMX MOVQ (shorter encoding)
	{Zr_m_xm_nr, 1, argList{Ymr, Ym}},  // 0x7f MMX MOVQ
	{Zm_r_xm_nr, 2, argList{Yxr, Ymr}}, // Pf2, 0xd6 MOVDQ2Q
	{Zm_r_xm_nr, 2, argList{Yxm, Yxr}}, // Pf3, 0x7e MOVQ xmm1/m64 -> xmm2
	{Zr_m_xm_nr, 2, argList{Yxr, Yxm}}, // Pe, 0xd6 MOVQ xmm1 -> xmm2/m64

	// valid only in 64-bit mode, usually with 64-bit prefix
	{Zr_m, 1, argList{Yrl, Yml}},      // 0x89
	{Zm_r, 1, argList{Yml, Yrl}},      // 0x8b
	{Zilo_m, 2, argList{Ys32, Yrl}},   // 32 bit signed 0xc7,(0)
	{Ziq_rp, 1, argList{Yi64, Yrl}},   // 0xb8 -- 32/64 bit immediate
	{Zilo_m, 2, argList{Yi32, Yml}},   // 0xc7,(0)
	{Zm_r_xm, 1, argList{Ymm, Ymr}},   // 0x6e MMX MOVD
	{Zr_m_xm, 1, argList{Ymr, Ymm}},   // 0x7e MMX MOVD
	{Zm_r_xm, 2, argList{Yml, Yxr}},   // Pe, 0x6e MOVD xmm load
	{Zr_m_xm, 2, argList{Yxr, Yml}},   // Pe, 0x7e MOVD xmm store
	{Zaut_r, 1, argList{Yiauto, Yrl}}, // 0 built-in LEAQ
}
497
// ymovbe lists two forms, (Ym, Yrl) with Zlitm_r and (Yrl, Ym) with
// Zlitr_m, each with a z offset of 3.
// NOTE(review): presumably the table for the MOVBE instructions — confirm
// in optab.
var ymovbe = []ytab{
	{Zlitm_r, 3, argList{Ym, Yrl}},
	{Zlitr_m, 3, argList{Yrl, Ym}},
}
502
// ym_rl has a single Zm_r form taking (Ym, Yrl).
var ym_rl = []ytab{
	{Zm_r, 1, argList{Ym, Yrl}},
}
506
// yrl_m has a single Zr_m form taking (Yrl, Ym). In optab it is used by
// ABOUNDL and ABOUNDW.
var yrl_m = []ytab{
	{Zr_m, 1, argList{Yrl, Ym}},
}
510
// ymb_rl has a single Zmb_r form taking (Ymb, Yrl).
var ymb_rl = []ytab{
	{Zmb_r, 1, argList{Ymb, Yrl}},
}
514
// yml_rl has a single Zm_r form taking (Yml, Yrl). Many optab entries
// share it, including AADCXL, AADOXL, ABSFL, ABSRL, ALARL and the
// ACMOVxx families.
var yml_rl = []ytab{
	{Zm_r, 1, argList{Yml, Yrl}},
}
518
// yrl_ml has a single Zr_m form taking (Yrl, Yml). In optab it is used by
// AARPL.
var yrl_ml = []ytab{
	{Zr_m, 1, argList{Yrl, Yml}},
}
522
// yml_mb lists two byte-class forms: (Yrb, Ymb) with Zr_m and (Ymb, Yrb)
// with Zm_r.
var yml_mb = []ytab{
	{Zr_m, 1, argList{Yrb, Ymb}},
	{Zm_r, 1, argList{Ymb, Yrb}},
}
527
// yrb_mb has a single Zr_m form taking (Yrb, Ymb).
var yrb_mb = []ytab{
	{Zr_m, 1, argList{Yrb, Ymb}},
}
531
// yxchg lists four forms: the two rows involving Yax come first (Z_rp and
// Zrp_), followed by the general (Yrl, Yml) and (Yml, Yrl) rows.
// NOTE(review): presumably the table for the XCHG instructions — confirm
// in optab.
var yxchg = []ytab{
	{Z_rp, 1, argList{Yax, Yrl}},
	{Zrp_, 1, argList{Yrl, Yax}},
	{Zr_m, 1, argList{Yrl, Yml}},
	{Zm_r, 1, argList{Yml, Yrl}},
}
538
// ydivl has a single Zm_o form taking one Yml operand. In optab it is
// used by ADIVL, ADIVQ, ADIVW, AIDIVL, AIDIVQ and AIDIVW.
var ydivl = []ytab{
	{Zm_o, 2, argList{Yml}},
}
542
// ydivb has a single Zm_o form taking one Ymb operand. In optab it is
// used by ADIVB, AIDIVB and AIMULB.
var ydivb = []ytab{
	{Zm_o, 2, argList{Ymb}},
}
546
// yimul lists the operand forms of AIMULL, AIMULQ and AIMULW in optab.
//
// With their opBytes{0xf7, 05, 0x6b, 0x69, Pm, 0xaf} the rows line up as:
// (Yml) at 0xf7, 05; (Yi8, Yrl) at 0x6b; (Yi32, Yrl) at 0x69;
// (Yml, Yrl) at Pm, 0xaf.
var yimul = []ytab{
	{Zm_o, 2, argList{Yml}},
	{Zib_rr, 1, argList{Yi8, Yrl}},
	{Zil_rr, 1, argList{Yi32, Yrl}},
	{Zm_r, 2, argList{Yml, Yrl}},
}
553
// yimul3 lists the three-operand forms of AIMUL3W, AIMUL3L and AIMUL3Q in
// optab: an immediate (Yi8 tried before Yi32), a Yml operand and a Yrl
// operand.
var yimul3 = []ytab{
	{Zibm_r, 2, argList{Yi8, Yml, Yrl}},
	{Zibm_r, 2, argList{Yi32, Yml, Yrl}},
}
558
// ybyte has a single Zbyte form taking one Yi64 operand. In optab it is
// used by ABYTE.
var ybyte = []ytab{
	{Zbyte, 1, argList{Yi64}},
}
562
// yin lists the operand forms of AINB, AINW and AINL in optab: one Yi32
// operand (Zib_) or no operand (Zlit).
//
// With AINB's opBytes{0xe4, 0xec} the immediate row sits at 0xe4 and the
// operand-less row at 0xec.
var yin = []ytab{
	{Zib_, 1, argList{Yi32}},
	{Zlit, 1, argList{}},
}
567
// yint has a single Zib_ form taking one Yi32 operand. In optab it is
// used by AINT.
var yint = []ytab{
	{Zib_, 1, argList{Yi32}},
}
571
// ypushl lists four single-operand forms: Yrl (Zrp_), Ym (Zm_o), Yi8
// (Zib_) and Yi32 (Zil_). Yi8 is tried before Yi32.
// NOTE(review): presumably the table for the PUSH instructions — confirm
// in optab.
var ypushl = []ytab{
	{Zrp_, 1, argList{Yrl}},
	{Zm_o, 2, argList{Ym}},
	{Zib_, 1, argList{Yi8}},
	{Zil_, 1, argList{Yi32}},
}
578
// ypopl lists two single-operand forms: Yrl (Z_rp) and Ym (Zo_m).
// NOTE(review): presumably the table for the POP instructions — confirm
// in optab.
var ypopl = []ytab{
	{Z_rp, 1, argList{Yrl}},
	{Zo_m, 2, argList{Ym}},
}
583
// ywrfsbase has a single Zm_o form taking one Yrl operand.
var ywrfsbase = []ytab{
	{Zm_o, 2, argList{Yrl}},
}
587
// yrdrand has a single Zo_m form taking one Yrl operand.
var yrdrand = []ytab{
	{Zo_m, 2, argList{Yrl}},
}
591
// yclflush has a single Zo_m form taking one Ym operand. In optab it is
// used by ACLDEMOTE, ACLFLUSH, ACLFLUSHOPT and ACLWB.
var yclflush = []ytab{
	{Zo_m, 2, argList{Ym}},
}
595
// ybswap has a single Z_rp form taking one Yrl operand, with a z offset
// of 2. In optab it is used by ABSWAPL and ABSWAPQ, whose opBytes are
// {0x0f, 0xc8}.
var ybswap = []ytab{
	{Z_rp, 2, argList{Yrl}},
}
599
// yscond has a single Zo_m form taking one Ymb operand. In optab it is
// used by ADECB and AINCB.
var yscond = []ytab{
	{Zo_m, 2, argList{Ymb}},
}
603
// yjcond lists the operand forms of the conditional jumps in optab
// (AJCC, AJEQ, AJNE, ...): a Ybr target, optionally preceded by the
// constant $0 (Yi0) or $1 (Yi1). The first two rows have a z offset of
// 0, so all three rows see the same opcode bytes.
// NOTE(review): the leading $0/$1 operand is presumably a branch
// prediction hint — confirm in doasm's Zbr case.
var yjcond = []ytab{
	{Zbr, 0, argList{Ybr}},
	{Zbr, 0, argList{Yi0, Ybr}},
	{Zbr, 1, argList{Yi1, Ybr}},
}
609
// yloop has a single Zloop form taking one Ybr operand. In optab it is
// used by AJCXZL, AJCXZW and AJCXZQ.
var yloop = []ytab{
	{Zloop, 1, argList{Ybr}},
}
613
// ycall lists the operand forms of obj.ACALL in optab.
//
// With ACALL's opBytes{0xff, 02, 0xff, 0x15, 0xe8} the rows line up as:
// both Zcallindreg rows at 0xff, 02 (the first row has z offset 0);
// Zcallind at 0xff, 0x15; Zcall and Zcallcon both at 0xe8 (Zcall has
// z offset 0).
var ycall = []ytab{
	{Zcallindreg, 0, argList{Yml}},
	{Zcallindreg, 2, argList{Yrx, Yrx}},
	{Zcallind, 2, argList{Yindir}},
	{Zcall, 0, argList{Ybr}},
	{Zcallcon, 1, argList{Yi32}},
}
621
// yduff has a single Zcallduff form taking one Yi32 operand.
// NOTE(review): presumably used by the DUFFZERO/DUFFCOPY pseudo-calls —
// confirm in optab.
var yduff = []ytab{
	{Zcallduff, 1, argList{Yi32}},
}
625
// yjmp lists the operand forms of obj.AJMP in optab.
//
// With AJMP's opBytes{0xff, 04, 0xeb, 0xe9} the Zo_m64 row sits at
// 0xff, 04; the Zjmp row has z offset 0, so Zjmp and Zjmpcon both see
// the cursor at 0xeb, 0xe9.
var yjmp = []ytab{
	{Zo_m64, 2, argList{Yml}},
	{Zjmp, 0, argList{Ybr}},
	{Zjmpcon, 1, argList{Yi32}},
}
631
// yfmvd lists four forms pairing Yf0 with either Ym or Yrf, in both
// operand orders.
// NOTE(review): presumably x87 load/store forms — confirm in optab.
var yfmvd = []ytab{
	{Zm_o, 2, argList{Ym, Yf0}},
	{Zo_m, 2, argList{Yf0, Ym}},
	{Zm_o, 2, argList{Yrf, Yf0}},
	{Zo_m, 2, argList{Yf0, Yrf}},
}
638
// yfmvdp lists two Zo_m forms with Yf0 as the first operand: (Yf0, Ym)
// and (Yf0, Yrf).
var yfmvdp = []ytab{
	{Zo_m, 2, argList{Yf0, Ym}},
	{Zo_m, 2, argList{Yf0, Yrf}},
}
643
// yfmvf lists two forms pairing Ym with Yf0: (Ym, Yf0) with Zm_o and
// (Yf0, Ym) with Zo_m.
var yfmvf = []ytab{
	{Zm_o, 2, argList{Ym, Yf0}},
	{Zo_m, 2, argList{Yf0, Ym}},
}
648
// yfmvx has a single Zm_o form taking (Ym, Yf0).
var yfmvx = []ytab{
	{Zm_o, 2, argList{Ym, Yf0}},
}
652
// yfmvp has a single Zo_m form taking (Yf0, Ym).
var yfmvp = []ytab{
	{Zo_m, 2, argList{Yf0, Ym}},
}
656
// yfcmv has a single Zm_o form taking (Yrf, Yf0).
var yfcmv = []ytab{
	{Zm_o, 2, argList{Yrf, Yf0}},
}
660
// yfadd lists three forms: (Ym, Yf0) and (Yrf, Yf0) with Zm_o, and
// (Yf0, Yrf) with Zo_m.
// NOTE(review): presumably shared by the x87 arithmetic instructions —
// confirm in optab.
var yfadd = []ytab{
	{Zm_o, 2, argList{Ym, Yf0}},
	{Zm_o, 2, argList{Yrf, Yf0}},
	{Zo_m, 2, argList{Yf0, Yrf}},
}
666
// yfxch lists two forms pairing Yf0 with Yrf, one per operand order.
var yfxch = []ytab{
	{Zo_m, 2, argList{Yf0, Yrf}},
	{Zm_o, 2, argList{Yrf, Yf0}},
}
671
// ycompp has a single Zo_m form taking (Yf0, Yrf). Per the row comment,
// the operands accepted here are looser than what the instruction really
// takes (f0, f1).
var ycompp = []ytab{
	{Zo_m, 2, argList{Yf0, Yrf}}, // botch is really f0,f1
}
675
// ystsw lists two single-operand forms: Ym with Zo_m, or Yax with Zlit.
var ystsw = []ytab{
	{Zo_m, 2, argList{Ym}},
	{Zlit, 1, argList{Yax}},
}
680
// ysvrs_mo has a single Zm_o form taking one Ym operand. In optab it is
// used by AFXRSTOR, AFXRSTOR64 and ALDMXCSR.
var ysvrs_mo = []ytab{
	{Zm_o, 2, argList{Ym}},
}
684
// ysvrs_om is the unaryDst version of "ysvrs_mo": the same single Ym
// operand, but with the Zo_m form instead of Zm_o. In optab it is used by
// AFXSAVE and AFXSAVE64.
var ysvrs_om = []ytab{
	{Zo_m, 2, argList{Ym}},
}
689
// ymm lists two Zm_r_xm forms: the MMX pair (Ymm, Ymr) with z offset 1
// and the XMM pair (Yxm, Yxr) with z offset 2.
var ymm = []ytab{
	{Zm_r_xm, 1, argList{Ymm, Ymr}},
	{Zm_r_xm, 2, argList{Yxm, Yxr}},
}
694
// yxm has a single Zm_r_xm form taking (Yxm, Yxr). Many SSE entries in
// optab share it, including AADDPD, AADDPS, AADDSD, AADDSS, AANDPD,
// ACOMISD and ADIVPS.
var yxm = []ytab{
	{Zm_r_xm, 1, argList{Yxm, Yxr}},
}
698
// yxm_q4 takes the same (Yxm, Yxr) operands as yxm but uses the plain
// Zm_r form instead of Zm_r_xm.
// NOTE(review): the name suggests it is meant for Pq4-prefixed entries —
// confirm in optab.
var yxm_q4 = []ytab{
	{Zm_r, 1, argList{Yxm, Yxr}},
}
702
// yxcvm1 lists two Zm_r_xm forms with a Yxm first operand: the second
// operand is Yxr or Ymr. Each row has z offset 2, matching the
// (prefix, opcode) pairs in the opBytes of its optab users ACVTPD2PL,
// ACVTPS2PL, ACVTTPD2PL and ACVTTPS2PL.
var yxcvm1 = []ytab{
	{Zm_r_xm, 2, argList{Yxm, Yxr}},
	{Zm_r_xm, 2, argList{Yxm, Ymr}},
}
707
// yxcvm2 lists two Zm_r_xm forms with a Yxr second operand: the first
// operand is Yxm or Ymm. Each row has z offset 2. In optab it is used by
// ACVTPL2PD and ACVTPL2PS.
var yxcvm2 = []ytab{
	{Zm_r_xm, 2, argList{Yxm, Yxr}},
	{Zm_r_xm, 2, argList{Ymm, Yxr}},
}
712
// yxr has a single Zm_r_xm form taking (Yxr, Yxr), so both operands must
// be X registers.
var yxr = []ytab{
	{Zm_r_xm, 1, argList{Yxr, Yxr}},
}
716
// yxr_ml has a single Zr_m_xm form taking (Yxr, Yml).
var yxr_ml = []ytab{
	{Zr_m_xm, 1, argList{Yxr, Yml}},
}
720
// ymr has a single Zm_r form taking (Ymr, Ymr).
var ymr = []ytab{
	{Zm_r, 1, argList{Ymr, Ymr}},
}
724
// ymr_ml has a single Zr_m_xm form taking (Ymr, Yml).
var ymr_ml = []ytab{
	{Zr_m_xm, 1, argList{Ymr, Yml}},
}
728
// yxcmpi has a single three-operand Zm_r_i_xm form taking
// (Yxm, Yxr, Yi8). In optab it is used by ACMPPD, ACMPPS, ACMPSD and
// ACMPSS.
var yxcmpi = []ytab{
	{Zm_r_i_xm, 2, argList{Yxm, Yxr, Yi8}},
}
732
// yxmov lists two forms pairing Yxm with Yxr, one per operand order:
// (Yxm, Yxr) with Zm_r_xm and (Yxr, Yxm) with Zr_m_xm.
var yxmov = []ytab{
	{Zm_r_xm, 1, argList{Yxm, Yxr}},
	{Zr_m_xm, 1, argList{Yxr, Yxm}},
}
737
// yxcvfl has a single Zm_r_xm form taking (Yxm, Yrl). In optab it is used
// by ACVTSD2SL, ACVTSS2SL, ACVTTSD2SL and ACVTTSS2SL.
var yxcvfl = []ytab{
	{Zm_r_xm, 1, argList{Yxm, Yrl}},
}
741
// yxcvlf has a single Zm_r_xm form taking (Yml, Yxr). In optab it is used
// by ACVTSL2SD and ACVTSL2SS.
var yxcvlf = []ytab{
	{Zm_r_xm, 1, argList{Yml, Yxr}},
}
745
// yxcvfq takes the same (Yxm, Yrl) operands as yxcvfl but with z offset
// 2: its optab users (ACVTSD2SQ, ACVTSS2SQ, ACVTTSD2SQ, ACVTTSS2SQ) have
// prefix Pw and keep a (prefix, opcode) pair in opBytes.
var yxcvfq = []ytab{
	{Zm_r_xm, 2, argList{Yxm, Yrl}},
}
749
// yxcvqf takes the same (Yml, Yxr) operands as yxcvlf but with z offset
// 2: its optab users (ACVTSQ2SD, ACVTSQ2SS) have prefix Pw and keep a
// (prefix, opcode) pair in opBytes.
var yxcvqf = []ytab{
	{Zm_r_xm, 2, argList{Yml, Yxr}},
}
753
// yps lists four forms: an MMX pair (Ymm, Ymr), an MMX register with an
// int8 immediate (Yi8, Ymr), and the XMM counterparts (Yxm, Yxr) and
// (Yi8, Yxr).
// NOTE(review): presumably the table for the packed shift instructions —
// confirm in optab.
var yps = []ytab{
	{Zm_r_xm, 1, argList{Ymm, Ymr}},
	{Zibo_m_xm, 2, argList{Yi8, Ymr}},
	{Zm_r_xm, 2, argList{Yxm, Yxr}},
	{Zibo_m_xm, 3, argList{Yi8, Yxr}},
}
760
// yxrrl has a single Zm_r form taking (Yxr, Yrl).
var yxrrl = []ytab{
	{Zm_r, 1, argList{Yxr, Yrl}},
}
764
// ymrxr lists two forms with a Yxr second operand: (Ymr, Yxr) with Zm_r
// and (Yxm, Yxr) with Zm_r_xm.
var ymrxr = []ytab{
	{Zm_r, 1, argList{Ymr, Yxr}},
	{Zm_r_xm, 1, argList{Yxm, Yxr}},
}
769
// ymshuf has a single three-operand Zibm_r form taking (Yi8, Ymm, Ymr).
var ymshuf = []ytab{
	{Zibm_r, 2, argList{Yi8, Ymm, Ymr}},
}
773
// ymshufb has a single Zm2_r form taking (Yxm, Yxr).
var ymshufb = []ytab{
	{Zm2_r, 2, argList{Yxm, Yxr}},
}
777
// yxshuf has a single three-operand Zibm_r form taking (Yu8, Yxm, Yxr).
// In optab it is used by ADPPD, ADPPS and AINSERTPS, among others.
//
// It should never have more than 1 entry,
// because some optab entries have opcode sequences that
// are longer than 2 bytes (zoffset=2 here),
// ROUNDPD and ROUNDPS and recently added BLENDPD,
// to name a few.
var yxshuf = []ytab{
	{Zibm_r, 2, argList{Yu8, Yxm, Yxr}},
}
786
// yextrw lists two three-operand forms starting with (Yu8, Yxr): the
// third operand is Yrl (Zibm_r) or Yml (Zibr_m).
var yextrw = []ytab{
	{Zibm_r, 2, argList{Yu8, Yxr, Yrl}},
	{Zibr_m, 2, argList{Yu8, Yxr, Yml}},
}
791
// yextr has a single three-operand Zibr_m form taking (Yu8, Yxr, Ymm),
// with z offset 3.
var yextr = []ytab{
	{Zibr_m, 3, argList{Yu8, Yxr, Ymm}},
}
795
// yinsrw has a single three-operand Zibm_r form taking (Yu8, Yml, Yxr).
var yinsrw = []ytab{
	{Zibm_r, 2, argList{Yu8, Yml, Yxr}},
}
799
// yinsr has a single three-operand Zibm_r form taking (Yu8, Ymm, Yxr),
// with z offset 3.
var yinsr = []ytab{
	{Zibm_r, 3, argList{Yu8, Ymm, Yxr}},
}
803
// ypsdq has a single Zibo_m form taking (Yi8, Yxr).
var ypsdq = []ytab{
	{Zibo_m, 2, argList{Yi8, Yxr}},
}
807
// ymskb lists two Zm_r_xm forms with a Yrl second operand: the first
// operand is Yxr (z offset 2) or Ymr (z offset 1).
var ymskb = []ytab{
	{Zm_r_xm, 2, argList{Yxr, Yrl}},
	{Zm_r_xm, 1, argList{Ymr, Yrl}},
}
812
// ycrc32l has a single Zlitm_r form taking (Yml, Yrl), with z offset 0.
var ycrc32l = []ytab{
	{Zlitm_r, 0, argList{Yml, Yrl}},
}
816
// ycrc32b has a single Zlitm_r form taking (Ymb, Yrl), with z offset 0.
// It is the byte-source counterpart of ycrc32l.
var ycrc32b = []ytab{
	{Zlitm_r, 0, argList{Ymb, Yrl}},
}
820
// yprefetch has a single Zm_o form taking one Ym operand.
var yprefetch = []ytab{
	{Zm_o, 2, argList{Ym}},
}
824
// yaes has a single Zlitm_r form taking (Yxm, Yxr), with z offset 2.
var yaes = []ytab{
	{Zlitm_r, 2, argList{Yxm, Yxr}},
}
828
// yxbegin has a single Zjmp form taking one Ybr operand. Unlike the Zjmp
// row of yjmp, its z offset is 1.
var yxbegin = []ytab{
	{Zjmp, 1, argList{Ybr}},
}
832
// yxabort has a single Zib_ form taking one uint8 immediate (Yu8).
var yxabort = []ytab{
	{Zib_, 1, argList{Yu8}},
}
836
// ylddqu has a single Zm_r form taking (Ym, Yxr). In optab it is used by
// ALDDQU.
var ylddqu = []ytab{
	{Zm_r, 1, argList{Ym, Yxr}},
}
840
// ypalignr has a single three-operand Zibm_r form taking
// (Yu8, Yxm, Yxr) — the same row as yxshuf.
var ypalignr = []ytab{
	{Zibm_r, 2, argList{Yu8, Yxm, Yxr}},
}
844
// ysha256rnds2 has a single three-operand Zlit_m_r form whose first
// operand must be X0 (Yxr0), followed by (Yxm, Yxr). Its z offset is 0.
var ysha256rnds2 = []ytab{
	{Zlit_m_r, 0, argList{Yxr0, Yxm, Yxr}},
}
848
// yblendvpd has a single three-operand Z_m_r form whose first operand
// must be X0 (Yxr0), followed by (Yxm, Yxr).
var yblendvpd = []ytab{
	{Z_m_r, 1, argList{Yxr0, Yxm, Yxr}},
}
852
// ymmxmm0f38 lists two Zlitm_r forms: the MMX pair (Ymm, Ymr) with z
// offset 3 and the XMM pair (Yxm, Yxr) with z offset 5.
// NOTE(review): the name suggests entries encoded through the 0F 38
// opcode map — confirm against the opBytes of its optab users.
var ymmxmm0f38 = []ytab{
	{Zlitm_r, 3, argList{Ymm, Ymr}},
	{Zlitm_r, 5, argList{Yxm, Yxr}},
}
857
// yextractps has a single three-operand Zibr_m form taking
// (Yu2, Yxr, Yml); the immediate must fit in two bits. In optab it is
// used by AEXTRACTPS.
var yextractps = []ytab{
	{Zibr_m, 2, argList{Yu2, Yxr, Yml}},
}
861
// ysha1rnds4 has a single three-operand Zibm_r form taking
// (Yu2, Yxm, Yxr); the immediate must fit in two bits.
var ysha1rnds4 = []ytab{
	{Zibm_r, 2, argList{Yu2, Yxm, Yxr}},
}
865
866// You are doasm, holding in your hand a *obj.Prog with p.As set to, say,
867// ACRC32, and p.From and p.To as operands (obj.Addr).  The linker scans optab
// to find the entry with the given p.As and then looks through the ytable for
// that instruction (the second field in the optab struct) for a line whose
// argList matches the Ytypes of the instruction's operands.  The
871// function oclass computes the specific Ytype of an operand and then the set
872// of more general Ytypes that it satisfies is implied by the ycover table, set
873// up in instinit.  For example, oclass distinguishes the constants 0 and 1
874// from the more general 8-bit constants, but instinit says
875//
876//        ycover[Yi0*Ymax+Ys32] = 1
877//        ycover[Yi1*Ymax+Ys32] = 1
878//        ycover[Yi8*Ymax+Ys32] = 1
879//
880// which means that Yi0, Yi1, and Yi8 all count as Ys32 (signed 32)
881// if that's what an instruction can handle.
882//
// In parallel with the scan through the ytable for the appropriate line, there
// is a z pointer that starts out pointing at the strange magic byte list in
// the Optab struct.  With each step past a non-matching ytable line, z
// advances by the second entry in the line (the z offset).  When a matching
// line is found, that z pointer has the extra data to use in laying down the
// instruction bytes.  The actual bytes laid down are a function of the first
// entry in the line (that is, the Ztype) and the z bytes.
890//
891// For example, let's look at AADDL.  The optab line says:
892//        {AADDL, yaddl, Px, opBytes{0x83, 00, 0x05, 0x81, 00, 0x01, 0x03}},
893//
// and yaddl says
//        var yaddl = []ytab{
//                {Zibo_m, 2, argList{Yi8, Yml}},
//                {Zil_, 1, argList{Yi32, Yax}},
//                {Zilo_m, 2, argList{Yi32, Yml}},
//                {Zr_m, 1, argList{Yrl, Yml}},
//                {Zm_r, 1, argList{Yml, Yrl}},
//        }
902//
903// so there are 5 possible types of ADDL instruction that can be laid down, and
904// possible states used to lay them down (Ztype and z pointer, assuming z
905// points at opBytes{0x83, 00, 0x05,0x81, 00, 0x01, 0x03}) are:
906//
907//        Yi8, Yml -> Zibo_m, z (0x83, 00)
908//        Yi32, Yax -> Zil_, z+2 (0x05)
909//        Yi32, Yml -> Zilo_m, z+2+1 (0x81, 0x00)
910//        Yrl, Yml -> Zr_m, z+2+1+2 (0x01)
911//        Yml, Yrl -> Zm_r, z+2+1+2+1 (0x03)
912//
913// The Pconstant in the optab line controls the prefix bytes to emit.  That's
914// relatively straightforward as this program goes.
915//
916// The switch on yt.zcase in doasm implements the various Z cases.  Zibo_m, for
917// example, is an opcode byte (z[0]) then an asmando (which is some kind of
918// encoded addressing mode for the Yml arg), and then a single immediate byte.
919// Zilo_m is the same but a long (32-bit) immediate.
920var optab =
921//	as, ytab, andproto, opcode
922[...]Optab{
923	{obj.AXXX, nil, 0, opBytes{}},
924	{AAAA, ynone, P32, opBytes{0x37}},
925	{AAAD, ynone, P32, opBytes{0xd5, 0x0a}},
926	{AAAM, ynone, P32, opBytes{0xd4, 0x0a}},
927	{AAAS, ynone, P32, opBytes{0x3f}},
928	{AADCB, yxorb, Pb, opBytes{0x14, 0x80, 02, 0x10, 0x12}},
929	{AADCL, yaddl, Px, opBytes{0x83, 02, 0x15, 0x81, 02, 0x11, 0x13}},
930	{AADCQ, yaddl, Pw, opBytes{0x83, 02, 0x15, 0x81, 02, 0x11, 0x13}},
931	{AADCW, yaddl, Pe, opBytes{0x83, 02, 0x15, 0x81, 02, 0x11, 0x13}},
932	{AADCXL, yml_rl, Pq4, opBytes{0xf6}},
933	{AADCXQ, yml_rl, Pq4w, opBytes{0xf6}},
934	{AADDB, yxorb, Pb, opBytes{0x04, 0x80, 00, 0x00, 0x02}},
935	{AADDL, yaddl, Px, opBytes{0x83, 00, 0x05, 0x81, 00, 0x01, 0x03}},
936	{AADDPD, yxm, Pq, opBytes{0x58}},
937	{AADDPS, yxm, Pm, opBytes{0x58}},
938	{AADDQ, yaddl, Pw, opBytes{0x83, 00, 0x05, 0x81, 00, 0x01, 0x03}},
939	{AADDSD, yxm, Pf2, opBytes{0x58}},
940	{AADDSS, yxm, Pf3, opBytes{0x58}},
941	{AADDSUBPD, yxm, Pq, opBytes{0xd0}},
942	{AADDSUBPS, yxm, Pf2, opBytes{0xd0}},
943	{AADDW, yaddl, Pe, opBytes{0x83, 00, 0x05, 0x81, 00, 0x01, 0x03}},
944	{AADOXL, yml_rl, Pq5, opBytes{0xf6}},
945	{AADOXQ, yml_rl, Pq5w, opBytes{0xf6}},
946	{AADJSP, nil, 0, opBytes{}},
947	{AANDB, yxorb, Pb, opBytes{0x24, 0x80, 04, 0x20, 0x22}},
948	{AANDL, yaddl, Px, opBytes{0x83, 04, 0x25, 0x81, 04, 0x21, 0x23}},
949	{AANDNPD, yxm, Pq, opBytes{0x55}},
950	{AANDNPS, yxm, Pm, opBytes{0x55}},
951	{AANDPD, yxm, Pq, opBytes{0x54}},
952	{AANDPS, yxm, Pm, opBytes{0x54}},
953	{AANDQ, yaddl, Pw, opBytes{0x83, 04, 0x25, 0x81, 04, 0x21, 0x23}},
954	{AANDW, yaddl, Pe, opBytes{0x83, 04, 0x25, 0x81, 04, 0x21, 0x23}},
955	{AARPL, yrl_ml, P32, opBytes{0x63}},
956	{ABOUNDL, yrl_m, P32, opBytes{0x62}},
957	{ABOUNDW, yrl_m, Pe, opBytes{0x62}},
958	{ABSFL, yml_rl, Pm, opBytes{0xbc}},
959	{ABSFQ, yml_rl, Pw, opBytes{0x0f, 0xbc}},
960	{ABSFW, yml_rl, Pq, opBytes{0xbc}},
961	{ABSRL, yml_rl, Pm, opBytes{0xbd}},
962	{ABSRQ, yml_rl, Pw, opBytes{0x0f, 0xbd}},
963	{ABSRW, yml_rl, Pq, opBytes{0xbd}},
964	{ABSWAPL, ybswap, Px, opBytes{0x0f, 0xc8}},
965	{ABSWAPQ, ybswap, Pw, opBytes{0x0f, 0xc8}},
966	{ABTCL, ybtl, Pm, opBytes{0xba, 07, 0xbb}},
967	{ABTCQ, ybtl, Pw, opBytes{0x0f, 0xba, 07, 0x0f, 0xbb}},
968	{ABTCW, ybtl, Pq, opBytes{0xba, 07, 0xbb}},
969	{ABTL, ybtl, Pm, opBytes{0xba, 04, 0xa3}},
970	{ABTQ, ybtl, Pw, opBytes{0x0f, 0xba, 04, 0x0f, 0xa3}},
971	{ABTRL, ybtl, Pm, opBytes{0xba, 06, 0xb3}},
972	{ABTRQ, ybtl, Pw, opBytes{0x0f, 0xba, 06, 0x0f, 0xb3}},
973	{ABTRW, ybtl, Pq, opBytes{0xba, 06, 0xb3}},
974	{ABTSL, ybtl, Pm, opBytes{0xba, 05, 0xab}},
975	{ABTSQ, ybtl, Pw, opBytes{0x0f, 0xba, 05, 0x0f, 0xab}},
976	{ABTSW, ybtl, Pq, opBytes{0xba, 05, 0xab}},
977	{ABTW, ybtl, Pq, opBytes{0xba, 04, 0xa3}},
978	{ABYTE, ybyte, Px, opBytes{1}},
979	{obj.ACALL, ycall, Px, opBytes{0xff, 02, 0xff, 0x15, 0xe8}},
980	{ACBW, ynone, Pe, opBytes{0x98}},
981	{ACDQ, ynone, Px, opBytes{0x99}},
982	{ACDQE, ynone, Pw, opBytes{0x98}},
983	{ACLAC, ynone, Pm, opBytes{01, 0xca}},
984	{ACLC, ynone, Px, opBytes{0xf8}},
985	{ACLD, ynone, Px, opBytes{0xfc}},
986	{ACLDEMOTE, yclflush, Pm, opBytes{0x1c, 00}},
987	{ACLFLUSH, yclflush, Pm, opBytes{0xae, 07}},
988	{ACLFLUSHOPT, yclflush, Pq, opBytes{0xae, 07}},
989	{ACLI, ynone, Px, opBytes{0xfa}},
990	{ACLTS, ynone, Pm, opBytes{0x06}},
991	{ACLWB, yclflush, Pq, opBytes{0xae, 06}},
992	{ACMC, ynone, Px, opBytes{0xf5}},
993	{ACMOVLCC, yml_rl, Pm, opBytes{0x43}},
994	{ACMOVLCS, yml_rl, Pm, opBytes{0x42}},
995	{ACMOVLEQ, yml_rl, Pm, opBytes{0x44}},
996	{ACMOVLGE, yml_rl, Pm, opBytes{0x4d}},
997	{ACMOVLGT, yml_rl, Pm, opBytes{0x4f}},
998	{ACMOVLHI, yml_rl, Pm, opBytes{0x47}},
999	{ACMOVLLE, yml_rl, Pm, opBytes{0x4e}},
1000	{ACMOVLLS, yml_rl, Pm, opBytes{0x46}},
1001	{ACMOVLLT, yml_rl, Pm, opBytes{0x4c}},
1002	{ACMOVLMI, yml_rl, Pm, opBytes{0x48}},
1003	{ACMOVLNE, yml_rl, Pm, opBytes{0x45}},
1004	{ACMOVLOC, yml_rl, Pm, opBytes{0x41}},
1005	{ACMOVLOS, yml_rl, Pm, opBytes{0x40}},
1006	{ACMOVLPC, yml_rl, Pm, opBytes{0x4b}},
1007	{ACMOVLPL, yml_rl, Pm, opBytes{0x49}},
1008	{ACMOVLPS, yml_rl, Pm, opBytes{0x4a}},
1009	{ACMOVQCC, yml_rl, Pw, opBytes{0x0f, 0x43}},
1010	{ACMOVQCS, yml_rl, Pw, opBytes{0x0f, 0x42}},
1011	{ACMOVQEQ, yml_rl, Pw, opBytes{0x0f, 0x44}},
1012	{ACMOVQGE, yml_rl, Pw, opBytes{0x0f, 0x4d}},
1013	{ACMOVQGT, yml_rl, Pw, opBytes{0x0f, 0x4f}},
1014	{ACMOVQHI, yml_rl, Pw, opBytes{0x0f, 0x47}},
1015	{ACMOVQLE, yml_rl, Pw, opBytes{0x0f, 0x4e}},
1016	{ACMOVQLS, yml_rl, Pw, opBytes{0x0f, 0x46}},
1017	{ACMOVQLT, yml_rl, Pw, opBytes{0x0f, 0x4c}},
1018	{ACMOVQMI, yml_rl, Pw, opBytes{0x0f, 0x48}},
1019	{ACMOVQNE, yml_rl, Pw, opBytes{0x0f, 0x45}},
1020	{ACMOVQOC, yml_rl, Pw, opBytes{0x0f, 0x41}},
1021	{ACMOVQOS, yml_rl, Pw, opBytes{0x0f, 0x40}},
1022	{ACMOVQPC, yml_rl, Pw, opBytes{0x0f, 0x4b}},
1023	{ACMOVQPL, yml_rl, Pw, opBytes{0x0f, 0x49}},
1024	{ACMOVQPS, yml_rl, Pw, opBytes{0x0f, 0x4a}},
1025	{ACMOVWCC, yml_rl, Pq, opBytes{0x43}},
1026	{ACMOVWCS, yml_rl, Pq, opBytes{0x42}},
1027	{ACMOVWEQ, yml_rl, Pq, opBytes{0x44}},
1028	{ACMOVWGE, yml_rl, Pq, opBytes{0x4d}},
1029	{ACMOVWGT, yml_rl, Pq, opBytes{0x4f}},
1030	{ACMOVWHI, yml_rl, Pq, opBytes{0x47}},
1031	{ACMOVWLE, yml_rl, Pq, opBytes{0x4e}},
1032	{ACMOVWLS, yml_rl, Pq, opBytes{0x46}},
1033	{ACMOVWLT, yml_rl, Pq, opBytes{0x4c}},
1034	{ACMOVWMI, yml_rl, Pq, opBytes{0x48}},
1035	{ACMOVWNE, yml_rl, Pq, opBytes{0x45}},
1036	{ACMOVWOC, yml_rl, Pq, opBytes{0x41}},
1037	{ACMOVWOS, yml_rl, Pq, opBytes{0x40}},
1038	{ACMOVWPC, yml_rl, Pq, opBytes{0x4b}},
1039	{ACMOVWPL, yml_rl, Pq, opBytes{0x49}},
1040	{ACMOVWPS, yml_rl, Pq, opBytes{0x4a}},
1041	{ACMPB, ycmpb, Pb, opBytes{0x3c, 0x80, 07, 0x38, 0x3a}},
1042	{ACMPL, ycmpl, Px, opBytes{0x83, 07, 0x3d, 0x81, 07, 0x39, 0x3b}},
1043	{ACMPPD, yxcmpi, Px, opBytes{Pe, 0xc2}},
1044	{ACMPPS, yxcmpi, Pm, opBytes{0xc2, 0}},
1045	{ACMPQ, ycmpl, Pw, opBytes{0x83, 07, 0x3d, 0x81, 07, 0x39, 0x3b}},
1046	{ACMPSB, ynone, Pb, opBytes{0xa6}},
1047	{ACMPSD, yxcmpi, Px, opBytes{Pf2, 0xc2}},
1048	{ACMPSL, ynone, Px, opBytes{0xa7}},
1049	{ACMPSQ, ynone, Pw, opBytes{0xa7}},
1050	{ACMPSS, yxcmpi, Px, opBytes{Pf3, 0xc2}},
1051	{ACMPSW, ynone, Pe, opBytes{0xa7}},
1052	{ACMPW, ycmpl, Pe, opBytes{0x83, 07, 0x3d, 0x81, 07, 0x39, 0x3b}},
1053	{ACOMISD, yxm, Pe, opBytes{0x2f}},
1054	{ACOMISS, yxm, Pm, opBytes{0x2f}},
1055	{ACPUID, ynone, Pm, opBytes{0xa2}},
1056	{ACVTPL2PD, yxcvm2, Px, opBytes{Pf3, 0xe6, Pe, 0x2a}},
1057	{ACVTPL2PS, yxcvm2, Pm, opBytes{0x5b, 0, 0x2a, 0}},
1058	{ACVTPD2PL, yxcvm1, Px, opBytes{Pf2, 0xe6, Pe, 0x2d}},
1059	{ACVTPD2PS, yxm, Pe, opBytes{0x5a}},
1060	{ACVTPS2PL, yxcvm1, Px, opBytes{Pe, 0x5b, Pm, 0x2d}},
1061	{ACVTPS2PD, yxm, Pm, opBytes{0x5a}},
1062	{ACVTSD2SL, yxcvfl, Pf2, opBytes{0x2d}},
1063	{ACVTSD2SQ, yxcvfq, Pw, opBytes{Pf2, 0x2d}},
1064	{ACVTSD2SS, yxm, Pf2, opBytes{0x5a}},
1065	{ACVTSL2SD, yxcvlf, Pf2, opBytes{0x2a}},
1066	{ACVTSQ2SD, yxcvqf, Pw, opBytes{Pf2, 0x2a}},
1067	{ACVTSL2SS, yxcvlf, Pf3, opBytes{0x2a}},
1068	{ACVTSQ2SS, yxcvqf, Pw, opBytes{Pf3, 0x2a}},
1069	{ACVTSS2SD, yxm, Pf3, opBytes{0x5a}},
1070	{ACVTSS2SL, yxcvfl, Pf3, opBytes{0x2d}},
1071	{ACVTSS2SQ, yxcvfq, Pw, opBytes{Pf3, 0x2d}},
1072	{ACVTTPD2PL, yxcvm1, Px, opBytes{Pe, 0xe6, Pe, 0x2c}},
1073	{ACVTTPS2PL, yxcvm1, Px, opBytes{Pf3, 0x5b, Pm, 0x2c}},
1074	{ACVTTSD2SL, yxcvfl, Pf2, opBytes{0x2c}},
1075	{ACVTTSD2SQ, yxcvfq, Pw, opBytes{Pf2, 0x2c}},
1076	{ACVTTSS2SL, yxcvfl, Pf3, opBytes{0x2c}},
1077	{ACVTTSS2SQ, yxcvfq, Pw, opBytes{Pf3, 0x2c}},
1078	{ACWD, ynone, Pe, opBytes{0x99}},
1079	{ACWDE, ynone, Px, opBytes{0x98}},
1080	{ACQO, ynone, Pw, opBytes{0x99}},
1081	{ADAA, ynone, P32, opBytes{0x27}},
1082	{ADAS, ynone, P32, opBytes{0x2f}},
1083	{ADECB, yscond, Pb, opBytes{0xfe, 01}},
1084	{ADECL, yincl, Px1, opBytes{0x48, 0xff, 01}},
1085	{ADECQ, yincq, Pw, opBytes{0xff, 01}},
1086	{ADECW, yincq, Pe, opBytes{0xff, 01}},
1087	{ADIVB, ydivb, Pb, opBytes{0xf6, 06}},
1088	{ADIVL, ydivl, Px, opBytes{0xf7, 06}},
1089	{ADIVPD, yxm, Pe, opBytes{0x5e}},
1090	{ADIVPS, yxm, Pm, opBytes{0x5e}},
1091	{ADIVQ, ydivl, Pw, opBytes{0xf7, 06}},
1092	{ADIVSD, yxm, Pf2, opBytes{0x5e}},
1093	{ADIVSS, yxm, Pf3, opBytes{0x5e}},
1094	{ADIVW, ydivl, Pe, opBytes{0xf7, 06}},
1095	{ADPPD, yxshuf, Pq, opBytes{0x3a, 0x41, 0}},
1096	{ADPPS, yxshuf, Pq, opBytes{0x3a, 0x40, 0}},
1097	{AEMMS, ynone, Pm, opBytes{0x77}},
1098	{AEXTRACTPS, yextractps, Pq, opBytes{0x3a, 0x17, 0}},
1099	{AENTER, nil, 0, opBytes{}}, // botch
1100	{AFXRSTOR, ysvrs_mo, Pm, opBytes{0xae, 01, 0xae, 01}},
1101	{AFXSAVE, ysvrs_om, Pm, opBytes{0xae, 00, 0xae, 00}},
1102	{AFXRSTOR64, ysvrs_mo, Pw, opBytes{0x0f, 0xae, 01, 0x0f, 0xae, 01}},
1103	{AFXSAVE64, ysvrs_om, Pw, opBytes{0x0f, 0xae, 00, 0x0f, 0xae, 00}},
1104	{AHLT, ynone, Px, opBytes{0xf4}},
1105	{AIDIVB, ydivb, Pb, opBytes{0xf6, 07}},
1106	{AIDIVL, ydivl, Px, opBytes{0xf7, 07}},
1107	{AIDIVQ, ydivl, Pw, opBytes{0xf7, 07}},
1108	{AIDIVW, ydivl, Pe, opBytes{0xf7, 07}},
1109	{AIMULB, ydivb, Pb, opBytes{0xf6, 05}},
1110	{AIMULL, yimul, Px, opBytes{0xf7, 05, 0x6b, 0x69, Pm, 0xaf}},
1111	{AIMULQ, yimul, Pw, opBytes{0xf7, 05, 0x6b, 0x69, Pm, 0xaf}},
1112	{AIMULW, yimul, Pe, opBytes{0xf7, 05, 0x6b, 0x69, Pm, 0xaf}},
1113	{AIMUL3W, yimul3, Pe, opBytes{0x6b, 00, 0x69, 00}},
1114	{AIMUL3L, yimul3, Px, opBytes{0x6b, 00, 0x69, 00}},
1115	{AIMUL3Q, yimul3, Pw, opBytes{0x6b, 00, 0x69, 00}},
1116	{AINB, yin, Pb, opBytes{0xe4, 0xec}},
1117	{AINW, yin, Pe, opBytes{0xe5, 0xed}},
1118	{AINL, yin, Px, opBytes{0xe5, 0xed}},
1119	{AINCB, yscond, Pb, opBytes{0xfe, 00}},
1120	{AINCL, yincl, Px1, opBytes{0x40, 0xff, 00}},
1121	{AINCQ, yincq, Pw, opBytes{0xff, 00}},
1122	{AINCW, yincq, Pe, opBytes{0xff, 00}},
1123	{AINSB, ynone, Pb, opBytes{0x6c}},
1124	{AINSL, ynone, Px, opBytes{0x6d}},
1125	{AINSERTPS, yxshuf, Pq, opBytes{0x3a, 0x21, 0}},
1126	{AINSW, ynone, Pe, opBytes{0x6d}},
1127	{AICEBP, ynone, Px, opBytes{0xf1}},
1128	{AINT, yint, Px, opBytes{0xcd}},
1129	{AINTO, ynone, P32, opBytes{0xce}},
1130	{AIRETL, ynone, Px, opBytes{0xcf}},
1131	{AIRETQ, ynone, Pw, opBytes{0xcf}},
1132	{AIRETW, ynone, Pe, opBytes{0xcf}},
1133	{AJCC, yjcond, Px, opBytes{0x73, 0x83, 00}},
1134	{AJCS, yjcond, Px, opBytes{0x72, 0x82}},
1135	{AJCXZL, yloop, Px, opBytes{0xe3}},
1136	{AJCXZW, yloop, Px, opBytes{0xe3}},
1137	{AJCXZQ, yloop, Px, opBytes{0xe3}},
1138	{AJEQ, yjcond, Px, opBytes{0x74, 0x84}},
1139	{AJGE, yjcond, Px, opBytes{0x7d, 0x8d}},
1140	{AJGT, yjcond, Px, opBytes{0x7f, 0x8f}},
1141	{AJHI, yjcond, Px, opBytes{0x77, 0x87}},
1142	{AJLE, yjcond, Px, opBytes{0x7e, 0x8e}},
1143	{AJLS, yjcond, Px, opBytes{0x76, 0x86}},
1144	{AJLT, yjcond, Px, opBytes{0x7c, 0x8c}},
1145	{AJMI, yjcond, Px, opBytes{0x78, 0x88}},
1146	{obj.AJMP, yjmp, Px, opBytes{0xff, 04, 0xeb, 0xe9}},
1147	{AJNE, yjcond, Px, opBytes{0x75, 0x85}},
1148	{AJOC, yjcond, Px, opBytes{0x71, 0x81, 00}},
1149	{AJOS, yjcond, Px, opBytes{0x70, 0x80, 00}},
1150	{AJPC, yjcond, Px, opBytes{0x7b, 0x8b}},
1151	{AJPL, yjcond, Px, opBytes{0x79, 0x89}},
1152	{AJPS, yjcond, Px, opBytes{0x7a, 0x8a}},
1153	{AHADDPD, yxm, Pq, opBytes{0x7c}},
1154	{AHADDPS, yxm, Pf2, opBytes{0x7c}},
1155	{AHSUBPD, yxm, Pq, opBytes{0x7d}},
1156	{AHSUBPS, yxm, Pf2, opBytes{0x7d}},
1157	{ALAHF, ynone, Px, opBytes{0x9f}},
1158	{ALARL, yml_rl, Pm, opBytes{0x02}},
1159	{ALARQ, yml_rl, Pw, opBytes{0x0f, 0x02}},
1160	{ALARW, yml_rl, Pq, opBytes{0x02}},
1161	{ALDDQU, ylddqu, Pf2, opBytes{0xf0}},
1162	{ALDMXCSR, ysvrs_mo, Pm, opBytes{0xae, 02, 0xae, 02}},
1163	{ALEAL, ym_rl, Px, opBytes{0x8d}},
1164	{ALEAQ, ym_rl, Pw, opBytes{0x8d}},
1165	{ALEAVEL, ynone, P32, opBytes{0xc9}},
1166	{ALEAVEQ, ynone, Py, opBytes{0xc9}},
1167	{ALEAVEW, ynone, Pe, opBytes{0xc9}},
1168	{ALEAW, ym_rl, Pe, opBytes{0x8d}},
1169	{ALOCK, ynone, Px, opBytes{0xf0}},
1170	{ALODSB, ynone, Pb, opBytes{0xac}},
1171	{ALODSL, ynone, Px, opBytes{0xad}},
1172	{ALODSQ, ynone, Pw, opBytes{0xad}},
1173	{ALODSW, ynone, Pe, opBytes{0xad}},
1174	{ALONG, ybyte, Px, opBytes{4}},
1175	{ALOOP, yloop, Px, opBytes{0xe2}},
1176	{ALOOPEQ, yloop, Px, opBytes{0xe1}},
1177	{ALOOPNE, yloop, Px, opBytes{0xe0}},
1178	{ALTR, ydivl, Pm, opBytes{0x00, 03}},
1179	{ALZCNTL, yml_rl, Pf3, opBytes{0xbd}},
1180	{ALZCNTQ, yml_rl, Pfw, opBytes{0xbd}},
1181	{ALZCNTW, yml_rl, Pef3, opBytes{0xbd}},
1182	{ALSLL, yml_rl, Pm, opBytes{0x03}},
1183	{ALSLW, yml_rl, Pq, opBytes{0x03}},
1184	{ALSLQ, yml_rl, Pw, opBytes{0x0f, 0x03}},
1185	{AMASKMOVOU, yxr, Pe, opBytes{0xf7}},
1186	{AMASKMOVQ, ymr, Pm, opBytes{0xf7}},
1187	{AMAXPD, yxm, Pe, opBytes{0x5f}},
1188	{AMAXPS, yxm, Pm, opBytes{0x5f}},
1189	{AMAXSD, yxm, Pf2, opBytes{0x5f}},
1190	{AMAXSS, yxm, Pf3, opBytes{0x5f}},
1191	{AMINPD, yxm, Pe, opBytes{0x5d}},
1192	{AMINPS, yxm, Pm, opBytes{0x5d}},
1193	{AMINSD, yxm, Pf2, opBytes{0x5d}},
1194	{AMINSS, yxm, Pf3, opBytes{0x5d}},
1195	{AMONITOR, ynone, Px, opBytes{0x0f, 0x01, 0xc8, 0}},
1196	{AMWAIT, ynone, Px, opBytes{0x0f, 0x01, 0xc9, 0}},
1197	{AMOVAPD, yxmov, Pe, opBytes{0x28, 0x29}},
1198	{AMOVAPS, yxmov, Pm, opBytes{0x28, 0x29}},
1199	{AMOVB, ymovb, Pb, opBytes{0x88, 0x8a, 0xb0, 0xc6, 00}},
1200	{AMOVBLSX, ymb_rl, Pm, opBytes{0xbe}},
1201	{AMOVBLZX, ymb_rl, Pm, opBytes{0xb6}},
1202	{AMOVBQSX, ymb_rl, Pw, opBytes{0x0f, 0xbe}},
1203	{AMOVBQZX, ymb_rl, Pw, opBytes{0x0f, 0xb6}},
1204	{AMOVBWSX, ymb_rl, Pq, opBytes{0xbe}},
1205	{AMOVSWW, ymb_rl, Pe, opBytes{0x0f, 0xbf}},
1206	{AMOVBWZX, ymb_rl, Pq, opBytes{0xb6}},
1207	{AMOVZWW, ymb_rl, Pe, opBytes{0x0f, 0xb7}},
1208	{AMOVO, yxmov, Pe, opBytes{0x6f, 0x7f}},
1209	{AMOVOU, yxmov, Pf3, opBytes{0x6f, 0x7f}},
1210	{AMOVHLPS, yxr, Pm, opBytes{0x12}},
1211	{AMOVHPD, yxmov, Pe, opBytes{0x16, 0x17}},
1212	{AMOVHPS, yxmov, Pm, opBytes{0x16, 0x17}},
1213	{AMOVL, ymovl, Px, opBytes{0x89, 0x8b, 0xb8, 0xc7, 00, 0x6e, 0x7e, Pe, 0x6e, Pe, 0x7e, 0}},
1214	{AMOVLHPS, yxr, Pm, opBytes{0x16}},
1215	{AMOVLPD, yxmov, Pe, opBytes{0x12, 0x13}},
1216	{AMOVLPS, yxmov, Pm, opBytes{0x12, 0x13}},
1217	{AMOVLQSX, yml_rl, Pw, opBytes{0x63}},
1218	{AMOVLQZX, yml_rl, Px, opBytes{0x8b}},
1219	{AMOVMSKPD, yxrrl, Pq, opBytes{0x50}},
1220	{AMOVMSKPS, yxrrl, Pm, opBytes{0x50}},
1221	{AMOVNTO, yxr_ml, Pe, opBytes{0xe7}},
1222	{AMOVNTDQA, ylddqu, Pq4, opBytes{0x2a}},
1223	{AMOVNTPD, yxr_ml, Pe, opBytes{0x2b}},
1224	{AMOVNTPS, yxr_ml, Pm, opBytes{0x2b}},
1225	{AMOVNTQ, ymr_ml, Pm, opBytes{0xe7}},
1226	{AMOVQ, ymovq, Pw8, opBytes{0x6f, 0x7f, Pf2, 0xd6, Pf3, 0x7e, Pe, 0xd6, 0x89, 0x8b, 0xc7, 00, 0xb8, 0xc7, 00, 0x6e, 0x7e, Pe, 0x6e, Pe, 0x7e, 0}},
1227	{AMOVQOZX, ymrxr, Pf3, opBytes{0xd6, 0x7e}},
1228	{AMOVSB, ynone, Pb, opBytes{0xa4}},
1229	{AMOVSD, yxmov, Pf2, opBytes{0x10, 0x11}},
1230	{AMOVSL, ynone, Px, opBytes{0xa5}},
1231	{AMOVSQ, ynone, Pw, opBytes{0xa5}},
1232	{AMOVSS, yxmov, Pf3, opBytes{0x10, 0x11}},
1233	{AMOVSW, ynone, Pe, opBytes{0xa5}},
1234	{AMOVUPD, yxmov, Pe, opBytes{0x10, 0x11}},
1235	{AMOVUPS, yxmov, Pm, opBytes{0x10, 0x11}},
1236	{AMOVW, ymovw, Pe, opBytes{0x89, 0x8b, 0xb8, 0xc7, 00, 0}},
1237	{AMOVWLSX, yml_rl, Pm, opBytes{0xbf}},
1238	{AMOVWLZX, yml_rl, Pm, opBytes{0xb7}},
1239	{AMOVWQSX, yml_rl, Pw, opBytes{0x0f, 0xbf}},
1240	{AMOVWQZX, yml_rl, Pw, opBytes{0x0f, 0xb7}},
1241	{AMPSADBW, yxshuf, Pq, opBytes{0x3a, 0x42, 0}},
1242	{AMULB, ydivb, Pb, opBytes{0xf6, 04}},
1243	{AMULL, ydivl, Px, opBytes{0xf7, 04}},
1244	{AMULPD, yxm, Pe, opBytes{0x59}},
1245	{AMULPS, yxm, Ym, opBytes{0x59}},
1246	{AMULQ, ydivl, Pw, opBytes{0xf7, 04}},
1247	{AMULSD, yxm, Pf2, opBytes{0x59}},
1248	{AMULSS, yxm, Pf3, opBytes{0x59}},
1249	{AMULW, ydivl, Pe, opBytes{0xf7, 04}},
1250	{ANEGB, yscond, Pb, opBytes{0xf6, 03}},
1251	{ANEGL, yscond, Px, opBytes{0xf7, 03}},
1252	{ANEGQ, yscond, Pw, opBytes{0xf7, 03}},
1253	{ANEGW, yscond, Pe, opBytes{0xf7, 03}},
1254	{obj.ANOP, ynop, Px, opBytes{0, 0}},
1255	{ANOTB, yscond, Pb, opBytes{0xf6, 02}},
1256	{ANOTL, yscond, Px, opBytes{0xf7, 02}}, // TODO(rsc): yscond is wrong here.
1257	{ANOTQ, yscond, Pw, opBytes{0xf7, 02}},
1258	{ANOTW, yscond, Pe, opBytes{0xf7, 02}},
1259	{AORB, yxorb, Pb, opBytes{0x0c, 0x80, 01, 0x08, 0x0a}},
1260	{AORL, yaddl, Px, opBytes{0x83, 01, 0x0d, 0x81, 01, 0x09, 0x0b}},
1261	{AORPD, yxm, Pq, opBytes{0x56}},
1262	{AORPS, yxm, Pm, opBytes{0x56}},
1263	{AORQ, yaddl, Pw, opBytes{0x83, 01, 0x0d, 0x81, 01, 0x09, 0x0b}},
1264	{AORW, yaddl, Pe, opBytes{0x83, 01, 0x0d, 0x81, 01, 0x09, 0x0b}},
1265	{AOUTB, yin, Pb, opBytes{0xe6, 0xee}},
1266	{AOUTL, yin, Px, opBytes{0xe7, 0xef}},
1267	{AOUTW, yin, Pe, opBytes{0xe7, 0xef}},
1268	{AOUTSB, ynone, Pb, opBytes{0x6e}},
1269	{AOUTSL, ynone, Px, opBytes{0x6f}},
1270	{AOUTSW, ynone, Pe, opBytes{0x6f}},
1271	{APABSB, yxm_q4, Pq4, opBytes{0x1c}},
1272	{APABSD, yxm_q4, Pq4, opBytes{0x1e}},
1273	{APABSW, yxm_q4, Pq4, opBytes{0x1d}},
1274	{APACKSSLW, ymm, Py1, opBytes{0x6b, Pe, 0x6b}},
1275	{APACKSSWB, ymm, Py1, opBytes{0x63, Pe, 0x63}},
1276	{APACKUSDW, yxm_q4, Pq4, opBytes{0x2b}},
1277	{APACKUSWB, ymm, Py1, opBytes{0x67, Pe, 0x67}},
1278	{APADDB, ymm, Py1, opBytes{0xfc, Pe, 0xfc}},
1279	{APADDL, ymm, Py1, opBytes{0xfe, Pe, 0xfe}},
1280	{APADDQ, yxm, Pe, opBytes{0xd4}},
1281	{APADDSB, ymm, Py1, opBytes{0xec, Pe, 0xec}},
1282	{APADDSW, ymm, Py1, opBytes{0xed, Pe, 0xed}},
1283	{APADDUSB, ymm, Py1, opBytes{0xdc, Pe, 0xdc}},
1284	{APADDUSW, ymm, Py1, opBytes{0xdd, Pe, 0xdd}},
1285	{APADDW, ymm, Py1, opBytes{0xfd, Pe, 0xfd}},
1286	{APALIGNR, ypalignr, Pq, opBytes{0x3a, 0x0f}},
1287	{APAND, ymm, Py1, opBytes{0xdb, Pe, 0xdb}},
1288	{APANDN, ymm, Py1, opBytes{0xdf, Pe, 0xdf}},
1289	{APAUSE, ynone, Px, opBytes{0xf3, 0x90}},
1290	{APAVGB, ymm, Py1, opBytes{0xe0, Pe, 0xe0}},
1291	{APAVGW, ymm, Py1, opBytes{0xe3, Pe, 0xe3}},
1292	{APBLENDW, yxshuf, Pq, opBytes{0x3a, 0x0e, 0}},
1293	{APCMPEQB, ymm, Py1, opBytes{0x74, Pe, 0x74}},
1294	{APCMPEQL, ymm, Py1, opBytes{0x76, Pe, 0x76}},
1295	{APCMPEQQ, yxm_q4, Pq4, opBytes{0x29}},
1296	{APCMPEQW, ymm, Py1, opBytes{0x75, Pe, 0x75}},
1297	{APCMPGTB, ymm, Py1, opBytes{0x64, Pe, 0x64}},
1298	{APCMPGTL, ymm, Py1, opBytes{0x66, Pe, 0x66}},
1299	{APCMPGTQ, yxm_q4, Pq4, opBytes{0x37}},
1300	{APCMPGTW, ymm, Py1, opBytes{0x65, Pe, 0x65}},
1301	{APCMPISTRI, yxshuf, Pq, opBytes{0x3a, 0x63, 0}},
1302	{APCMPISTRM, yxshuf, Pq, opBytes{0x3a, 0x62, 0}},
1303	{APEXTRW, yextrw, Pq, opBytes{0xc5, 0, 0x3a, 0x15, 0}},
1304	{APEXTRB, yextr, Pq, opBytes{0x3a, 0x14, 00}},
1305	{APEXTRD, yextr, Pq, opBytes{0x3a, 0x16, 00}},
1306	{APEXTRQ, yextr, Pq3, opBytes{0x3a, 0x16, 00}},
1307	{APHADDD, ymmxmm0f38, Px, opBytes{0x0F, 0x38, 0x02, 0, 0x66, 0x0F, 0x38, 0x02, 0}},
1308	{APHADDSW, yxm_q4, Pq4, opBytes{0x03}},
1309	{APHADDW, yxm_q4, Pq4, opBytes{0x01}},
1310	{APHMINPOSUW, yxm_q4, Pq4, opBytes{0x41}},
1311	{APHSUBD, yxm_q4, Pq4, opBytes{0x06}},
1312	{APHSUBSW, yxm_q4, Pq4, opBytes{0x07}},
1313	{APHSUBW, yxm_q4, Pq4, opBytes{0x05}},
1314	{APINSRW, yinsrw, Pq, opBytes{0xc4, 00}},
1315	{APINSRB, yinsr, Pq, opBytes{0x3a, 0x20, 00}},
1316	{APINSRD, yinsr, Pq, opBytes{0x3a, 0x22, 00}},
1317	{APINSRQ, yinsr, Pq3, opBytes{0x3a, 0x22, 00}},
1318	{APMADDUBSW, yxm_q4, Pq4, opBytes{0x04}},
1319	{APMADDWL, ymm, Py1, opBytes{0xf5, Pe, 0xf5}},
1320	{APMAXSB, yxm_q4, Pq4, opBytes{0x3c}},
1321	{APMAXSD, yxm_q4, Pq4, opBytes{0x3d}},
1322	{APMAXSW, yxm, Pe, opBytes{0xee}},
1323	{APMAXUB, yxm, Pe, opBytes{0xde}},
1324	{APMAXUD, yxm_q4, Pq4, opBytes{0x3f}},
1325	{APMAXUW, yxm_q4, Pq4, opBytes{0x3e}},
1326	{APMINSB, yxm_q4, Pq4, opBytes{0x38}},
1327	{APMINSD, yxm_q4, Pq4, opBytes{0x39}},
1328	{APMINSW, yxm, Pe, opBytes{0xea}},
1329	{APMINUB, yxm, Pe, opBytes{0xda}},
1330	{APMINUD, yxm_q4, Pq4, opBytes{0x3b}},
1331	{APMINUW, yxm_q4, Pq4, opBytes{0x3a}},
1332	{APMOVMSKB, ymskb, Px, opBytes{Pe, 0xd7, 0xd7}},
1333	{APMOVSXBD, yxm_q4, Pq4, opBytes{0x21}},
1334	{APMOVSXBQ, yxm_q4, Pq4, opBytes{0x22}},
1335	{APMOVSXBW, yxm_q4, Pq4, opBytes{0x20}},
1336	{APMOVSXDQ, yxm_q4, Pq4, opBytes{0x25}},
1337	{APMOVSXWD, yxm_q4, Pq4, opBytes{0x23}},
1338	{APMOVSXWQ, yxm_q4, Pq4, opBytes{0x24}},
1339	{APMOVZXBD, yxm_q4, Pq4, opBytes{0x31}},
1340	{APMOVZXBQ, yxm_q4, Pq4, opBytes{0x32}},
1341	{APMOVZXBW, yxm_q4, Pq4, opBytes{0x30}},
1342	{APMOVZXDQ, yxm_q4, Pq4, opBytes{0x35}},
1343	{APMOVZXWD, yxm_q4, Pq4, opBytes{0x33}},
1344	{APMOVZXWQ, yxm_q4, Pq4, opBytes{0x34}},
1345	{APMULDQ, yxm_q4, Pq4, opBytes{0x28}},
1346	{APMULHRSW, yxm_q4, Pq4, opBytes{0x0b}},
1347	{APMULHUW, ymm, Py1, opBytes{0xe4, Pe, 0xe4}},
1348	{APMULHW, ymm, Py1, opBytes{0xe5, Pe, 0xe5}},
1349	{APMULLD, yxm_q4, Pq4, opBytes{0x40}},
1350	{APMULLW, ymm, Py1, opBytes{0xd5, Pe, 0xd5}},
1351	{APMULULQ, ymm, Py1, opBytes{0xf4, Pe, 0xf4}},
1352	{APOPAL, ynone, P32, opBytes{0x61}},
1353	{APOPAW, ynone, Pe, opBytes{0x61}},
1354	{APOPCNTW, yml_rl, Pef3, opBytes{0xb8}},
1355	{APOPCNTL, yml_rl, Pf3, opBytes{0xb8}},
1356	{APOPCNTQ, yml_rl, Pfw, opBytes{0xb8}},
1357	{APOPFL, ynone, P32, opBytes{0x9d}},
1358	{APOPFQ, ynone, Py, opBytes{0x9d}},
1359	{APOPFW, ynone, Pe, opBytes{0x9d}},
1360	{APOPL, ypopl, P32, opBytes{0x58, 0x8f, 00}},
1361	{APOPQ, ypopl, Py, opBytes{0x58, 0x8f, 00}},
1362	{APOPW, ypopl, Pe, opBytes{0x58, 0x8f, 00}},
1363	{APOR, ymm, Py1, opBytes{0xeb, Pe, 0xeb}},
1364	{APSADBW, yxm, Pq, opBytes{0xf6}},
1365	{APSHUFHW, yxshuf, Pf3, opBytes{0x70, 00}},
1366	{APSHUFL, yxshuf, Pq, opBytes{0x70, 00}},
1367	{APSHUFLW, yxshuf, Pf2, opBytes{0x70, 00}},
1368	{APSHUFW, ymshuf, Pm, opBytes{0x70, 00}},
1369	{APSHUFB, ymshufb, Pq, opBytes{0x38, 0x00}},
1370	{APSIGNB, yxm_q4, Pq4, opBytes{0x08}},
1371	{APSIGND, yxm_q4, Pq4, opBytes{0x0a}},
1372	{APSIGNW, yxm_q4, Pq4, opBytes{0x09}},
1373	{APSLLO, ypsdq, Pq, opBytes{0x73, 07}},
1374	{APSLLL, yps, Py3, opBytes{0xf2, 0x72, 06, Pe, 0xf2, Pe, 0x72, 06}},
1375	{APSLLQ, yps, Py3, opBytes{0xf3, 0x73, 06, Pe, 0xf3, Pe, 0x73, 06}},
1376	{APSLLW, yps, Py3, opBytes{0xf1, 0x71, 06, Pe, 0xf1, Pe, 0x71, 06}},
1377	{APSRAL, yps, Py3, opBytes{0xe2, 0x72, 04, Pe, 0xe2, Pe, 0x72, 04}},
1378	{APSRAW, yps, Py3, opBytes{0xe1, 0x71, 04, Pe, 0xe1, Pe, 0x71, 04}},
1379	{APSRLO, ypsdq, Pq, opBytes{0x73, 03}},
1380	{APSRLL, yps, Py3, opBytes{0xd2, 0x72, 02, Pe, 0xd2, Pe, 0x72, 02}},
1381	{APSRLQ, yps, Py3, opBytes{0xd3, 0x73, 02, Pe, 0xd3, Pe, 0x73, 02}},
1382	{APSRLW, yps, Py3, opBytes{0xd1, 0x71, 02, Pe, 0xd1, Pe, 0x71, 02}},
1383	{APSUBB, yxm, Pe, opBytes{0xf8}},
1384	{APSUBL, yxm, Pe, opBytes{0xfa}},
1385	{APSUBQ, yxm, Pe, opBytes{0xfb}},
1386	{APSUBSB, yxm, Pe, opBytes{0xe8}},
1387	{APSUBSW, yxm, Pe, opBytes{0xe9}},
1388	{APSUBUSB, yxm, Pe, opBytes{0xd8}},
1389	{APSUBUSW, yxm, Pe, opBytes{0xd9}},
1390	{APSUBW, yxm, Pe, opBytes{0xf9}},
1391	{APTEST, yxm_q4, Pq4, opBytes{0x17}},
1392	{APUNPCKHBW, ymm, Py1, opBytes{0x68, Pe, 0x68}},
1393	{APUNPCKHLQ, ymm, Py1, opBytes{0x6a, Pe, 0x6a}},
1394	{APUNPCKHQDQ, yxm, Pe, opBytes{0x6d}},
1395	{APUNPCKHWL, ymm, Py1, opBytes{0x69, Pe, 0x69}},
1396	{APUNPCKLBW, ymm, Py1, opBytes{0x60, Pe, 0x60}},
1397	{APUNPCKLLQ, ymm, Py1, opBytes{0x62, Pe, 0x62}},
1398	{APUNPCKLQDQ, yxm, Pe, opBytes{0x6c}},
1399	{APUNPCKLWL, ymm, Py1, opBytes{0x61, Pe, 0x61}},
1400	{APUSHAL, ynone, P32, opBytes{0x60}},
1401	{APUSHAW, ynone, Pe, opBytes{0x60}},
1402	{APUSHFL, ynone, P32, opBytes{0x9c}},
1403	{APUSHFQ, ynone, Py, opBytes{0x9c}},
1404	{APUSHFW, ynone, Pe, opBytes{0x9c}},
1405	{APUSHL, ypushl, P32, opBytes{0x50, 0xff, 06, 0x6a, 0x68}},
1406	{APUSHQ, ypushl, Py, opBytes{0x50, 0xff, 06, 0x6a, 0x68}},
1407	{APUSHW, ypushl, Pe, opBytes{0x50, 0xff, 06, 0x6a, 0x68}},
1408	{APXOR, ymm, Py1, opBytes{0xef, Pe, 0xef}},
1409	{AQUAD, ybyte, Px, opBytes{8}},
1410	{ARCLB, yshb, Pb, opBytes{0xd0, 02, 0xc0, 02, 0xd2, 02}},
1411	{ARCLL, yshl, Px, opBytes{0xd1, 02, 0xc1, 02, 0xd3, 02, 0xd3, 02}},
1412	{ARCLQ, yshl, Pw, opBytes{0xd1, 02, 0xc1, 02, 0xd3, 02, 0xd3, 02}},
1413	{ARCLW, yshl, Pe, opBytes{0xd1, 02, 0xc1, 02, 0xd3, 02, 0xd3, 02}},
1414	{ARCPPS, yxm, Pm, opBytes{0x53}},
1415	{ARCPSS, yxm, Pf3, opBytes{0x53}},
1416	{ARCRB, yshb, Pb, opBytes{0xd0, 03, 0xc0, 03, 0xd2, 03}},
1417	{ARCRL, yshl, Px, opBytes{0xd1, 03, 0xc1, 03, 0xd3, 03, 0xd3, 03}},
1418	{ARCRQ, yshl, Pw, opBytes{0xd1, 03, 0xc1, 03, 0xd3, 03, 0xd3, 03}},
1419	{ARCRW, yshl, Pe, opBytes{0xd1, 03, 0xc1, 03, 0xd3, 03, 0xd3, 03}},
1420	{AREP, ynone, Px, opBytes{0xf3}},
1421	{AREPN, ynone, Px, opBytes{0xf2}},
1422	{obj.ARET, ynone, Px, opBytes{0xc3}},
1423	{ARETFW, yret, Pe, opBytes{0xcb, 0xca}},
1424	{ARETFL, yret, Px, opBytes{0xcb, 0xca}},
1425	{ARETFQ, yret, Pw, opBytes{0xcb, 0xca}},
1426	{AROLB, yshb, Pb, opBytes{0xd0, 00, 0xc0, 00, 0xd2, 00}},
1427	{AROLL, yshl, Px, opBytes{0xd1, 00, 0xc1, 00, 0xd3, 00, 0xd3, 00}},
1428	{AROLQ, yshl, Pw, opBytes{0xd1, 00, 0xc1, 00, 0xd3, 00, 0xd3, 00}},
1429	{AROLW, yshl, Pe, opBytes{0xd1, 00, 0xc1, 00, 0xd3, 00, 0xd3, 00}},
1430	{ARORB, yshb, Pb, opBytes{0xd0, 01, 0xc0, 01, 0xd2, 01}},
1431	{ARORL, yshl, Px, opBytes{0xd1, 01, 0xc1, 01, 0xd3, 01, 0xd3, 01}},
1432	{ARORQ, yshl, Pw, opBytes{0xd1, 01, 0xc1, 01, 0xd3, 01, 0xd3, 01}},
1433	{ARORW, yshl, Pe, opBytes{0xd1, 01, 0xc1, 01, 0xd3, 01, 0xd3, 01}},
1434	{ARSQRTPS, yxm, Pm, opBytes{0x52}},
1435	{ARSQRTSS, yxm, Pf3, opBytes{0x52}},
1436	{ASAHF, ynone, Px, opBytes{0x9e, 00, 0x86, 0xe0, 0x50, 0x9d}}, // XCHGB AH,AL; PUSH AX; POPFL
1437	{ASALB, yshb, Pb, opBytes{0xd0, 04, 0xc0, 04, 0xd2, 04}},
1438	{ASALL, yshl, Px, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
1439	{ASALQ, yshl, Pw, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
1440	{ASALW, yshl, Pe, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
1441	{ASARB, yshb, Pb, opBytes{0xd0, 07, 0xc0, 07, 0xd2, 07}},
1442	{ASARL, yshl, Px, opBytes{0xd1, 07, 0xc1, 07, 0xd3, 07, 0xd3, 07}},
1443	{ASARQ, yshl, Pw, opBytes{0xd1, 07, 0xc1, 07, 0xd3, 07, 0xd3, 07}},
1444	{ASARW, yshl, Pe, opBytes{0xd1, 07, 0xc1, 07, 0xd3, 07, 0xd3, 07}},
1445	{ASBBB, yxorb, Pb, opBytes{0x1c, 0x80, 03, 0x18, 0x1a}},
1446	{ASBBL, yaddl, Px, opBytes{0x83, 03, 0x1d, 0x81, 03, 0x19, 0x1b}},
1447	{ASBBQ, yaddl, Pw, opBytes{0x83, 03, 0x1d, 0x81, 03, 0x19, 0x1b}},
1448	{ASBBW, yaddl, Pe, opBytes{0x83, 03, 0x1d, 0x81, 03, 0x19, 0x1b}},
1449	{ASCASB, ynone, Pb, opBytes{0xae}},
1450	{ASCASL, ynone, Px, opBytes{0xaf}},
1451	{ASCASQ, ynone, Pw, opBytes{0xaf}},
1452	{ASCASW, ynone, Pe, opBytes{0xaf}},
1453	{ASETCC, yscond, Pb, opBytes{0x0f, 0x93, 00}},
1454	{ASETCS, yscond, Pb, opBytes{0x0f, 0x92, 00}},
1455	{ASETEQ, yscond, Pb, opBytes{0x0f, 0x94, 00}},
1456	{ASETGE, yscond, Pb, opBytes{0x0f, 0x9d, 00}},
1457	{ASETGT, yscond, Pb, opBytes{0x0f, 0x9f, 00}},
1458	{ASETHI, yscond, Pb, opBytes{0x0f, 0x97, 00}},
1459	{ASETLE, yscond, Pb, opBytes{0x0f, 0x9e, 00}},
1460	{ASETLS, yscond, Pb, opBytes{0x0f, 0x96, 00}},
1461	{ASETLT, yscond, Pb, opBytes{0x0f, 0x9c, 00}},
1462	{ASETMI, yscond, Pb, opBytes{0x0f, 0x98, 00}},
1463	{ASETNE, yscond, Pb, opBytes{0x0f, 0x95, 00}},
1464	{ASETOC, yscond, Pb, opBytes{0x0f, 0x91, 00}},
1465	{ASETOS, yscond, Pb, opBytes{0x0f, 0x90, 00}},
1466	{ASETPC, yscond, Pb, opBytes{0x0f, 0x9b, 00}},
1467	{ASETPL, yscond, Pb, opBytes{0x0f, 0x99, 00}},
1468	{ASETPS, yscond, Pb, opBytes{0x0f, 0x9a, 00}},
1469	{ASHLB, yshb, Pb, opBytes{0xd0, 04, 0xc0, 04, 0xd2, 04}},
1470	{ASHLL, yshl, Px, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
1471	{ASHLQ, yshl, Pw, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
1472	{ASHLW, yshl, Pe, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
1473	{ASHRB, yshb, Pb, opBytes{0xd0, 05, 0xc0, 05, 0xd2, 05}},
1474	{ASHRL, yshl, Px, opBytes{0xd1, 05, 0xc1, 05, 0xd3, 05, 0xd3, 05}},
1475	{ASHRQ, yshl, Pw, opBytes{0xd1, 05, 0xc1, 05, 0xd3, 05, 0xd3, 05}},
1476	{ASHRW, yshl, Pe, opBytes{0xd1, 05, 0xc1, 05, 0xd3, 05, 0xd3, 05}},
1477	{ASHUFPD, yxshuf, Pq, opBytes{0xc6, 00}},
1478	{ASHUFPS, yxshuf, Pm, opBytes{0xc6, 00}},
1479	{ASQRTPD, yxm, Pe, opBytes{0x51}},
1480	{ASQRTPS, yxm, Pm, opBytes{0x51}},
1481	{ASQRTSD, yxm, Pf2, opBytes{0x51}},
1482	{ASQRTSS, yxm, Pf3, opBytes{0x51}},
1483	{ASTC, ynone, Px, opBytes{0xf9}},
1484	{ASTD, ynone, Px, opBytes{0xfd}},
1485	{ASTI, ynone, Px, opBytes{0xfb}},
1486	{ASTMXCSR, ysvrs_om, Pm, opBytes{0xae, 03, 0xae, 03}},
1487	{ASTOSB, ynone, Pb, opBytes{0xaa}},
1488	{ASTOSL, ynone, Px, opBytes{0xab}},
1489	{ASTOSQ, ynone, Pw, opBytes{0xab}},
1490	{ASTOSW, ynone, Pe, opBytes{0xab}},
1491	{ASUBB, yxorb, Pb, opBytes{0x2c, 0x80, 05, 0x28, 0x2a}},
1492	{ASUBL, yaddl, Px, opBytes{0x83, 05, 0x2d, 0x81, 05, 0x29, 0x2b}},
1493	{ASUBPD, yxm, Pe, opBytes{0x5c}},
1494	{ASUBPS, yxm, Pm, opBytes{0x5c}},
1495	{ASUBQ, yaddl, Pw, opBytes{0x83, 05, 0x2d, 0x81, 05, 0x29, 0x2b}},
1496	{ASUBSD, yxm, Pf2, opBytes{0x5c}},
1497	{ASUBSS, yxm, Pf3, opBytes{0x5c}},
1498	{ASUBW, yaddl, Pe, opBytes{0x83, 05, 0x2d, 0x81, 05, 0x29, 0x2b}},
1499	{ASWAPGS, ynone, Pm, opBytes{0x01, 0xf8}},
1500	{ASYSCALL, ynone, Px, opBytes{0x0f, 0x05}}, // fast syscall
1501	{ATESTB, yxorb, Pb, opBytes{0xa8, 0xf6, 00, 0x84, 0x84}},
1502	{ATESTL, ytestl, Px, opBytes{0xa9, 0xf7, 00, 0x85, 0x85}},
1503	{ATESTQ, ytestl, Pw, opBytes{0xa9, 0xf7, 00, 0x85, 0x85}},
1504	{ATESTW, ytestl, Pe, opBytes{0xa9, 0xf7, 00, 0x85, 0x85}},
1505	{ATPAUSE, ywrfsbase, Pq, opBytes{0xae, 06}},
1506	{obj.ATEXT, ytext, Px, opBytes{}},
1507	{AUCOMISD, yxm, Pe, opBytes{0x2e}},
1508	{AUCOMISS, yxm, Pm, opBytes{0x2e}},
1509	{AUNPCKHPD, yxm, Pe, opBytes{0x15}},
1510	{AUNPCKHPS, yxm, Pm, opBytes{0x15}},
1511	{AUNPCKLPD, yxm, Pe, opBytes{0x14}},
1512	{AUNPCKLPS, yxm, Pm, opBytes{0x14}},
1513	{AUMONITOR, ywrfsbase, Pf3, opBytes{0xae, 06}},
1514	{AVERR, ydivl, Pm, opBytes{0x00, 04}},
1515	{AVERW, ydivl, Pm, opBytes{0x00, 05}},
1516	{AWAIT, ynone, Px, opBytes{0x9b}},
1517	{AWORD, ybyte, Px, opBytes{2}},
1518	{AXCHGB, yml_mb, Pb, opBytes{0x86, 0x86}},
1519	{AXCHGL, yxchg, Px, opBytes{0x90, 0x90, 0x87, 0x87}},
1520	{AXCHGQ, yxchg, Pw, opBytes{0x90, 0x90, 0x87, 0x87}},
1521	{AXCHGW, yxchg, Pe, opBytes{0x90, 0x90, 0x87, 0x87}},
1522	{AXLAT, ynone, Px, opBytes{0xd7}},
1523	{AXORB, yxorb, Pb, opBytes{0x34, 0x80, 06, 0x30, 0x32}},
1524	{AXORL, yaddl, Px, opBytes{0x83, 06, 0x35, 0x81, 06, 0x31, 0x33}},
1525	{AXORPD, yxm, Pe, opBytes{0x57}},
1526	{AXORPS, yxm, Pm, opBytes{0x57}},
1527	{AXORQ, yaddl, Pw, opBytes{0x83, 06, 0x35, 0x81, 06, 0x31, 0x33}},
1528	{AXORW, yaddl, Pe, opBytes{0x83, 06, 0x35, 0x81, 06, 0x31, 0x33}},
1529	{AFMOVB, yfmvx, Px, opBytes{0xdf, 04}},
1530	{AFMOVBP, yfmvp, Px, opBytes{0xdf, 06}},
1531	{AFMOVD, yfmvd, Px, opBytes{0xdd, 00, 0xdd, 02, 0xd9, 00, 0xdd, 02}},
1532	{AFMOVDP, yfmvdp, Px, opBytes{0xdd, 03, 0xdd, 03}},
1533	{AFMOVF, yfmvf, Px, opBytes{0xd9, 00, 0xd9, 02}},
1534	{AFMOVFP, yfmvp, Px, opBytes{0xd9, 03}},
1535	{AFMOVL, yfmvf, Px, opBytes{0xdb, 00, 0xdb, 02}},
1536	{AFMOVLP, yfmvp, Px, opBytes{0xdb, 03}},
1537	{AFMOVV, yfmvx, Px, opBytes{0xdf, 05}},
1538	{AFMOVVP, yfmvp, Px, opBytes{0xdf, 07}},
1539	{AFMOVW, yfmvf, Px, opBytes{0xdf, 00, 0xdf, 02}},
1540	{AFMOVWP, yfmvp, Px, opBytes{0xdf, 03}},
1541	{AFMOVX, yfmvx, Px, opBytes{0xdb, 05}},
1542	{AFMOVXP, yfmvp, Px, opBytes{0xdb, 07}},
1543	{AFCMOVCC, yfcmv, Px, opBytes{0xdb, 00}},
1544	{AFCMOVCS, yfcmv, Px, opBytes{0xda, 00}},
1545	{AFCMOVEQ, yfcmv, Px, opBytes{0xda, 01}},
1546	{AFCMOVHI, yfcmv, Px, opBytes{0xdb, 02}},
1547	{AFCMOVLS, yfcmv, Px, opBytes{0xda, 02}},
1548	{AFCMOVB, yfcmv, Px, opBytes{0xda, 00}},
1549	{AFCMOVBE, yfcmv, Px, opBytes{0xda, 02}},
1550	{AFCMOVNB, yfcmv, Px, opBytes{0xdb, 00}},
1551	{AFCMOVNBE, yfcmv, Px, opBytes{0xdb, 02}},
1552	{AFCMOVE, yfcmv, Px, opBytes{0xda, 01}},
1553	{AFCMOVNE, yfcmv, Px, opBytes{0xdb, 01}},
1554	{AFCMOVNU, yfcmv, Px, opBytes{0xdb, 03}},
1555	{AFCMOVU, yfcmv, Px, opBytes{0xda, 03}},
1556	{AFCMOVUN, yfcmv, Px, opBytes{0xda, 03}},
1557	{AFCOMD, yfadd, Px, opBytes{0xdc, 02, 0xd8, 02, 0xdc, 02}},  // botch
1558	{AFCOMDP, yfadd, Px, opBytes{0xdc, 03, 0xd8, 03, 0xdc, 03}}, // botch
1559	{AFCOMDPP, ycompp, Px, opBytes{0xde, 03}},
1560	{AFCOMF, yfmvx, Px, opBytes{0xd8, 02}},
1561	{AFCOMFP, yfmvx, Px, opBytes{0xd8, 03}},
1562	{AFCOMI, yfcmv, Px, opBytes{0xdb, 06}},
1563	{AFCOMIP, yfcmv, Px, opBytes{0xdf, 06}},
1564	{AFCOML, yfmvx, Px, opBytes{0xda, 02}},
1565	{AFCOMLP, yfmvx, Px, opBytes{0xda, 03}},
1566	{AFCOMW, yfmvx, Px, opBytes{0xde, 02}},
1567	{AFCOMWP, yfmvx, Px, opBytes{0xde, 03}},
1568	{AFUCOM, ycompp, Px, opBytes{0xdd, 04}},
1569	{AFUCOMI, ycompp, Px, opBytes{0xdb, 05}},
1570	{AFUCOMIP, ycompp, Px, opBytes{0xdf, 05}},
1571	{AFUCOMP, ycompp, Px, opBytes{0xdd, 05}},
1572	{AFUCOMPP, ycompp, Px, opBytes{0xda, 13}},
1573	{AFADDDP, ycompp, Px, opBytes{0xde, 00}},
1574	{AFADDW, yfmvx, Px, opBytes{0xde, 00}},
1575	{AFADDL, yfmvx, Px, opBytes{0xda, 00}},
1576	{AFADDF, yfmvx, Px, opBytes{0xd8, 00}},
1577	{AFADDD, yfadd, Px, opBytes{0xdc, 00, 0xd8, 00, 0xdc, 00}},
1578	{AFMULDP, ycompp, Px, opBytes{0xde, 01}},
1579	{AFMULW, yfmvx, Px, opBytes{0xde, 01}},
1580	{AFMULL, yfmvx, Px, opBytes{0xda, 01}},
1581	{AFMULF, yfmvx, Px, opBytes{0xd8, 01}},
1582	{AFMULD, yfadd, Px, opBytes{0xdc, 01, 0xd8, 01, 0xdc, 01}},
1583	{AFSUBDP, ycompp, Px, opBytes{0xde, 05}},
1584	{AFSUBW, yfmvx, Px, opBytes{0xde, 04}},
1585	{AFSUBL, yfmvx, Px, opBytes{0xda, 04}},
1586	{AFSUBF, yfmvx, Px, opBytes{0xd8, 04}},
1587	{AFSUBD, yfadd, Px, opBytes{0xdc, 04, 0xd8, 04, 0xdc, 05}},
1588	{AFSUBRDP, ycompp, Px, opBytes{0xde, 04}},
1589	{AFSUBRW, yfmvx, Px, opBytes{0xde, 05}},
1590	{AFSUBRL, yfmvx, Px, opBytes{0xda, 05}},
1591	{AFSUBRF, yfmvx, Px, opBytes{0xd8, 05}},
1592	{AFSUBRD, yfadd, Px, opBytes{0xdc, 05, 0xd8, 05, 0xdc, 04}},
1593	{AFDIVDP, ycompp, Px, opBytes{0xde, 07}},
1594	{AFDIVW, yfmvx, Px, opBytes{0xde, 06}},
1595	{AFDIVL, yfmvx, Px, opBytes{0xda, 06}},
1596	{AFDIVF, yfmvx, Px, opBytes{0xd8, 06}},
1597	{AFDIVD, yfadd, Px, opBytes{0xdc, 06, 0xd8, 06, 0xdc, 07}},
1598	{AFDIVRDP, ycompp, Px, opBytes{0xde, 06}},
1599	{AFDIVRW, yfmvx, Px, opBytes{0xde, 07}},
1600	{AFDIVRL, yfmvx, Px, opBytes{0xda, 07}},
1601	{AFDIVRF, yfmvx, Px, opBytes{0xd8, 07}},
1602	{AFDIVRD, yfadd, Px, opBytes{0xdc, 07, 0xd8, 07, 0xdc, 06}},
1603	{AFXCHD, yfxch, Px, opBytes{0xd9, 01, 0xd9, 01}},
1604	{AFFREE, nil, 0, opBytes{}},
1605	{AFLDCW, ysvrs_mo, Px, opBytes{0xd9, 05, 0xd9, 05}},
1606	{AFLDENV, ysvrs_mo, Px, opBytes{0xd9, 04, 0xd9, 04}},
1607	{AFRSTOR, ysvrs_mo, Px, opBytes{0xdd, 04, 0xdd, 04}},
1608	{AFSAVE, ysvrs_om, Px, opBytes{0xdd, 06, 0xdd, 06}},
1609	{AFSTCW, ysvrs_om, Px, opBytes{0xd9, 07, 0xd9, 07}},
1610	{AFSTENV, ysvrs_om, Px, opBytes{0xd9, 06, 0xd9, 06}},
1611	{AFSTSW, ystsw, Px, opBytes{0xdd, 07, 0xdf, 0xe0}},
1612	{AF2XM1, ynone, Px, opBytes{0xd9, 0xf0}},
1613	{AFABS, ynone, Px, opBytes{0xd9, 0xe1}},
1614	{AFBLD, ysvrs_mo, Px, opBytes{0xdf, 04}},
1615	{AFBSTP, yclflush, Px, opBytes{0xdf, 06}},
1616	{AFCHS, ynone, Px, opBytes{0xd9, 0xe0}},
1617	{AFCLEX, ynone, Px, opBytes{0xdb, 0xe2}},
1618	{AFCOS, ynone, Px, opBytes{0xd9, 0xff}},
1619	{AFDECSTP, ynone, Px, opBytes{0xd9, 0xf6}},
1620	{AFINCSTP, ynone, Px, opBytes{0xd9, 0xf7}},
1621	{AFINIT, ynone, Px, opBytes{0xdb, 0xe3}},
1622	{AFLD1, ynone, Px, opBytes{0xd9, 0xe8}},
1623	{AFLDL2E, ynone, Px, opBytes{0xd9, 0xea}},
1624	{AFLDL2T, ynone, Px, opBytes{0xd9, 0xe9}},
1625	{AFLDLG2, ynone, Px, opBytes{0xd9, 0xec}},
1626	{AFLDLN2, ynone, Px, opBytes{0xd9, 0xed}},
1627	{AFLDPI, ynone, Px, opBytes{0xd9, 0xeb}},
1628	{AFLDZ, ynone, Px, opBytes{0xd9, 0xee}},
1629	{AFNOP, ynone, Px, opBytes{0xd9, 0xd0}},
1630	{AFPATAN, ynone, Px, opBytes{0xd9, 0xf3}},
1631	{AFPREM, ynone, Px, opBytes{0xd9, 0xf8}},
1632	{AFPREM1, ynone, Px, opBytes{0xd9, 0xf5}},
1633	{AFPTAN, ynone, Px, opBytes{0xd9, 0xf2}},
1634	{AFRNDINT, ynone, Px, opBytes{0xd9, 0xfc}},
1635	{AFSCALE, ynone, Px, opBytes{0xd9, 0xfd}},
1636	{AFSIN, ynone, Px, opBytes{0xd9, 0xfe}},
1637	{AFSINCOS, ynone, Px, opBytes{0xd9, 0xfb}},
1638	{AFSQRT, ynone, Px, opBytes{0xd9, 0xfa}},
1639	{AFTST, ynone, Px, opBytes{0xd9, 0xe4}},
1640	{AFXAM, ynone, Px, opBytes{0xd9, 0xe5}},
1641	{AFXTRACT, ynone, Px, opBytes{0xd9, 0xf4}},
1642	{AFYL2X, ynone, Px, opBytes{0xd9, 0xf1}},
1643	{AFYL2XP1, ynone, Px, opBytes{0xd9, 0xf9}},
1644	{ACMPXCHGB, yrb_mb, Pb, opBytes{0x0f, 0xb0}},
1645	{ACMPXCHGL, yrl_ml, Px, opBytes{0x0f, 0xb1}},
1646	{ACMPXCHGW, yrl_ml, Pe, opBytes{0x0f, 0xb1}},
1647	{ACMPXCHGQ, yrl_ml, Pw, opBytes{0x0f, 0xb1}},
1648	{ACMPXCHG8B, yscond, Pm, opBytes{0xc7, 01}},
1649	{ACMPXCHG16B, yscond, Pw, opBytes{0x0f, 0xc7, 01}},
1650	{AINVD, ynone, Pm, opBytes{0x08}},
1651	{AINVLPG, ydivb, Pm, opBytes{0x01, 07}},
1652	{AINVPCID, ycrc32l, Pe, opBytes{0x0f, 0x38, 0x82, 0}},
1653	{ALFENCE, ynone, Pm, opBytes{0xae, 0xe8}},
1654	{AMFENCE, ynone, Pm, opBytes{0xae, 0xf0}},
1655	{AMOVNTIL, yrl_ml, Pm, opBytes{0xc3}},
1656	{AMOVNTIQ, yrl_ml, Pw, opBytes{0x0f, 0xc3}},
1657	{ARDPKRU, ynone, Pm, opBytes{0x01, 0xee, 0}},
1658	{ARDMSR, ynone, Pm, opBytes{0x32}},
1659	{ARDPMC, ynone, Pm, opBytes{0x33}},
1660	{ARDTSC, ynone, Pm, opBytes{0x31}},
1661	{ARSM, ynone, Pm, opBytes{0xaa}},
1662	{ASFENCE, ynone, Pm, opBytes{0xae, 0xf8}},
1663	{ASYSRET, ynone, Pm, opBytes{0x07}},
1664	{AWBINVD, ynone, Pm, opBytes{0x09}},
1665	{AWRMSR, ynone, Pm, opBytes{0x30}},
1666	{AWRPKRU, ynone, Pm, opBytes{0x01, 0xef, 0}},
1667	{AXADDB, yrb_mb, Pb, opBytes{0x0f, 0xc0}},
1668	{AXADDL, yrl_ml, Px, opBytes{0x0f, 0xc1}},
1669	{AXADDQ, yrl_ml, Pw, opBytes{0x0f, 0xc1}},
1670	{AXADDW, yrl_ml, Pe, opBytes{0x0f, 0xc1}},
1671	{ACRC32B, ycrc32b, Px, opBytes{0xf2, 0x0f, 0x38, 0xf0, 0}},
1672	{ACRC32L, ycrc32l, Px, opBytes{0xf2, 0x0f, 0x38, 0xf1, 0}},
1673	{ACRC32Q, ycrc32l, Pw, opBytes{0xf2, 0x0f, 0x38, 0xf1, 0}},
1674	{ACRC32W, ycrc32l, Pe, opBytes{0xf2, 0x0f, 0x38, 0xf1, 0}},
1675	{APREFETCHT0, yprefetch, Pm, opBytes{0x18, 01}},
1676	{APREFETCHT1, yprefetch, Pm, opBytes{0x18, 02}},
1677	{APREFETCHT2, yprefetch, Pm, opBytes{0x18, 03}},
1678	{APREFETCHNTA, yprefetch, Pm, opBytes{0x18, 00}},
1679	{AMOVQL, yrl_ml, Px, opBytes{0x89}},
1680	{obj.AUNDEF, ynone, Px, opBytes{0x0f, 0x0b}},
1681	{AAESENC, yaes, Pq, opBytes{0x38, 0xdc, 0}},
1682	{AAESENCLAST, yaes, Pq, opBytes{0x38, 0xdd, 0}},
1683	{AAESDEC, yaes, Pq, opBytes{0x38, 0xde, 0}},
1684	{AAESDECLAST, yaes, Pq, opBytes{0x38, 0xdf, 0}},
1685	{AAESIMC, yaes, Pq, opBytes{0x38, 0xdb, 0}},
1686	{AAESKEYGENASSIST, yxshuf, Pq, opBytes{0x3a, 0xdf, 0}},
1687	{AROUNDPD, yxshuf, Pq, opBytes{0x3a, 0x09, 0}},
1688	{AROUNDPS, yxshuf, Pq, opBytes{0x3a, 0x08, 0}},
1689	{AROUNDSD, yxshuf, Pq, opBytes{0x3a, 0x0b, 0}},
1690	{AROUNDSS, yxshuf, Pq, opBytes{0x3a, 0x0a, 0}},
1691	{APSHUFD, yxshuf, Pq, opBytes{0x70, 0}},
1692	{APCLMULQDQ, yxshuf, Pq, opBytes{0x3a, 0x44, 0}},
1693	{APCMPESTRI, yxshuf, Pq, opBytes{0x3a, 0x61, 0}},
1694	{APCMPESTRM, yxshuf, Pq, opBytes{0x3a, 0x60, 0}},
1695	{AMOVDDUP, yxm, Pf2, opBytes{0x12}},
1696	{AMOVSHDUP, yxm, Pf3, opBytes{0x16}},
1697	{AMOVSLDUP, yxm, Pf3, opBytes{0x12}},
1698	{ARDTSCP, ynone, Pm, opBytes{0x01, 0xf9, 0}},
1699	{ASTAC, ynone, Pm, opBytes{0x01, 0xcb, 0}},
1700	{AUD1, ynone, Pm, opBytes{0xb9, 0}},
1701	{AUD2, ynone, Pm, opBytes{0x0b, 0}},
1702	{AUMWAIT, ywrfsbase, Pf2, opBytes{0xae, 06}},
1703	{ASYSENTER, ynone, Px, opBytes{0x0f, 0x34, 0}},
1704	{ASYSENTER64, ynone, Pw, opBytes{0x0f, 0x34, 0}},
1705	{ASYSEXIT, ynone, Px, opBytes{0x0f, 0x35, 0}},
1706	{ASYSEXIT64, ynone, Pw, opBytes{0x0f, 0x35, 0}},
1707	{ALMSW, ydivl, Pm, opBytes{0x01, 06}},
1708	{ALLDT, ydivl, Pm, opBytes{0x00, 02}},
1709	{ALIDT, ysvrs_mo, Pm, opBytes{0x01, 03}},
1710	{ALGDT, ysvrs_mo, Pm, opBytes{0x01, 02}},
1711	{ATZCNTW, ycrc32l, Pe, opBytes{0xf3, 0x0f, 0xbc, 0}},
1712	{ATZCNTL, ycrc32l, Px, opBytes{0xf3, 0x0f, 0xbc, 0}},
1713	{ATZCNTQ, ycrc32l, Pw, opBytes{0xf3, 0x0f, 0xbc, 0}},
1714	{AXRSTOR, ydivl, Px, opBytes{0x0f, 0xae, 05}},
1715	{AXRSTOR64, ydivl, Pw, opBytes{0x0f, 0xae, 05}},
1716	{AXRSTORS, ydivl, Px, opBytes{0x0f, 0xc7, 03}},
1717	{AXRSTORS64, ydivl, Pw, opBytes{0x0f, 0xc7, 03}},
1718	{AXSAVE, yclflush, Px, opBytes{0x0f, 0xae, 04}},
1719	{AXSAVE64, yclflush, Pw, opBytes{0x0f, 0xae, 04}},
1720	{AXSAVEOPT, yclflush, Px, opBytes{0x0f, 0xae, 06}},
1721	{AXSAVEOPT64, yclflush, Pw, opBytes{0x0f, 0xae, 06}},
1722	{AXSAVEC, yclflush, Px, opBytes{0x0f, 0xc7, 04}},
1723	{AXSAVEC64, yclflush, Pw, opBytes{0x0f, 0xc7, 04}},
1724	{AXSAVES, yclflush, Px, opBytes{0x0f, 0xc7, 05}},
1725	{AXSAVES64, yclflush, Pw, opBytes{0x0f, 0xc7, 05}},
1726	{ASGDT, yclflush, Pm, opBytes{0x01, 00}},
1727	{ASIDT, yclflush, Pm, opBytes{0x01, 01}},
1728	{ARDRANDW, yrdrand, Pe, opBytes{0x0f, 0xc7, 06}},
1729	{ARDRANDL, yrdrand, Px, opBytes{0x0f, 0xc7, 06}},
1730	{ARDRANDQ, yrdrand, Pw, opBytes{0x0f, 0xc7, 06}},
1731	{ARDSEEDW, yrdrand, Pe, opBytes{0x0f, 0xc7, 07}},
1732	{ARDSEEDL, yrdrand, Px, opBytes{0x0f, 0xc7, 07}},
1733	{ARDSEEDQ, yrdrand, Pw, opBytes{0x0f, 0xc7, 07}},
1734	{ASTRW, yincq, Pe, opBytes{0x0f, 0x00, 01}},
1735	{ASTRL, yincq, Px, opBytes{0x0f, 0x00, 01}},
1736	{ASTRQ, yincq, Pw, opBytes{0x0f, 0x00, 01}},
1737	{AXSETBV, ynone, Pm, opBytes{0x01, 0xd1, 0}},
1738	{AMOVBEW, ymovbe, Pq, opBytes{0x38, 0xf0, 0, 0x38, 0xf1, 0}},
1739	{AMOVBEL, ymovbe, Pm, opBytes{0x38, 0xf0, 0, 0x38, 0xf1, 0}},
1740	{AMOVBEQ, ymovbe, Pw, opBytes{0x0f, 0x38, 0xf0, 0, 0x0f, 0x38, 0xf1, 0}},
1741	{ANOPW, ydivl, Pe, opBytes{0x0f, 0x1f, 00}},
1742	{ANOPL, ydivl, Px, opBytes{0x0f, 0x1f, 00}},
1743	{ASLDTW, yincq, Pe, opBytes{0x0f, 0x00, 00}},
1744	{ASLDTL, yincq, Px, opBytes{0x0f, 0x00, 00}},
1745	{ASLDTQ, yincq, Pw, opBytes{0x0f, 0x00, 00}},
1746	{ASMSWW, yincq, Pe, opBytes{0x0f, 0x01, 04}},
1747	{ASMSWL, yincq, Px, opBytes{0x0f, 0x01, 04}},
1748	{ASMSWQ, yincq, Pw, opBytes{0x0f, 0x01, 04}},
1749	{ABLENDVPS, yblendvpd, Pq4, opBytes{0x14}},
1750	{ABLENDVPD, yblendvpd, Pq4, opBytes{0x15}},
1751	{APBLENDVB, yblendvpd, Pq4, opBytes{0x10}},
1752	{ASHA1MSG1, yaes, Px, opBytes{0x0f, 0x38, 0xc9, 0}},
1753	{ASHA1MSG2, yaes, Px, opBytes{0x0f, 0x38, 0xca, 0}},
1754	{ASHA1NEXTE, yaes, Px, opBytes{0x0f, 0x38, 0xc8, 0}},
1755	{ASHA256MSG1, yaes, Px, opBytes{0x0f, 0x38, 0xcc, 0}},
1756	{ASHA256MSG2, yaes, Px, opBytes{0x0f, 0x38, 0xcd, 0}},
1757	{ASHA1RNDS4, ysha1rnds4, Pm, opBytes{0x3a, 0xcc, 0}},
1758	{ASHA256RNDS2, ysha256rnds2, Px, opBytes{0x0f, 0x38, 0xcb, 0}},
1759	{ARDFSBASEL, yrdrand, Pf3, opBytes{0xae, 00}},
1760	{ARDFSBASEQ, yrdrand, Pfw, opBytes{0xae, 00}},
1761	{ARDGSBASEL, yrdrand, Pf3, opBytes{0xae, 01}},
1762	{ARDGSBASEQ, yrdrand, Pfw, opBytes{0xae, 01}},
1763	{AWRFSBASEL, ywrfsbase, Pf3, opBytes{0xae, 02}},
1764	{AWRFSBASEQ, ywrfsbase, Pfw, opBytes{0xae, 02}},
1765	{AWRGSBASEL, ywrfsbase, Pf3, opBytes{0xae, 03}},
1766	{AWRGSBASEQ, ywrfsbase, Pfw, opBytes{0xae, 03}},
1767	{ALFSW, ym_rl, Pe, opBytes{0x0f, 0xb4}},
1768	{ALFSL, ym_rl, Px, opBytes{0x0f, 0xb4}},
1769	{ALFSQ, ym_rl, Pw, opBytes{0x0f, 0xb4}},
1770	{ALGSW, ym_rl, Pe, opBytes{0x0f, 0xb5}},
1771	{ALGSL, ym_rl, Px, opBytes{0x0f, 0xb5}},
1772	{ALGSQ, ym_rl, Pw, opBytes{0x0f, 0xb5}},
1773	{ALSSW, ym_rl, Pe, opBytes{0x0f, 0xb2}},
1774	{ALSSL, ym_rl, Px, opBytes{0x0f, 0xb2}},
1775	{ALSSQ, ym_rl, Pw, opBytes{0x0f, 0xb2}},
1776
1777	{ABLENDPD, yxshuf, Pq, opBytes{0x3a, 0x0d, 0}},
1778	{ABLENDPS, yxshuf, Pq, opBytes{0x3a, 0x0c, 0}},
1779	{AXACQUIRE, ynone, Px, opBytes{0xf2}},
1780	{AXRELEASE, ynone, Px, opBytes{0xf3}},
1781	{AXBEGIN, yxbegin, Px, opBytes{0xc7, 0xf8}},
1782	{AXABORT, yxabort, Px, opBytes{0xc6, 0xf8}},
1783	{AXEND, ynone, Px, opBytes{0x0f, 01, 0xd5}},
1784	{AXTEST, ynone, Px, opBytes{0x0f, 01, 0xd6}},
1785	{AXGETBV, ynone, Pm, opBytes{01, 0xd0}},
1786	{obj.AFUNCDATA, yfuncdata, Px, opBytes{0, 0}},
1787	{obj.APCDATA, ypcdata, Px, opBytes{0, 0}},
1788	{obj.ADUFFCOPY, yduff, Px, opBytes{0xe8}},
1789	{obj.ADUFFZERO, yduff, Px, opBytes{0xe8}},
1790
1791	{obj.AEND, nil, 0, opBytes{}},
1792	{0, nil, 0, opBytes{}},
1793}
1794
// opindex maps an instruction opcode, masked with obj.AMask, to its Optab
// entry. It is filled in by instinit from avxOptab and optab.
var opindex [(ALAST + 1) & obj.AMask]*Optab
1796
1797// useAbs reports whether s describes a symbol that must avoid pc-relative addressing.
1798// This happens on systems like Solaris that call .so functions instead of system calls.
1799// It does not seem to be necessary for any other systems. This is probably working
1800// around a Solaris-specific bug that should be fixed differently, but we don't know
1801// what that bug is. And this does fix it.
1802func useAbs(ctxt *obj.Link, s *obj.LSym) bool {
1803	if ctxt.Headtype == objabi.Hsolaris {
1804		// All the Solaris dynamic imports from libc.so begin with "libc_".
1805		return strings.HasPrefix(s.Name, "libc_")
1806	}
1807	return ctxt.Arch.Family == sys.I386 && !ctxt.Flag_shared
1808}
1809
// nop holds single-instruction no-ops of increasing length: entry i is an
// encoding that is exactly i+1 bytes long (the rest of each array is zero).
// They were constructed by hand and disassembled with gdb to verify.
// See http://www.agner.org/optimize/optimizing_assembly.pdf for discussion.
var nop = [][16]uint8{
	{0x90},
	{0x66, 0x90},
	{0x0F, 0x1F, 0x00},
	{0x0F, 0x1F, 0x40, 0x00},
	{0x0F, 0x1F, 0x44, 0x00, 0x00},
	{0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00},
	{0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
	{0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
	{0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
	// The 10-byte form below is left out on purpose:
	// Native Client rejects the repeated 0x66 prefix.
	// {0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
}

// fillnop writes n bytes of no-op instructions to the start of p.
// It emits the longest encoding in nop as many times as it fits and then one
// shorter encoding for whatever remains. p must have room for n bytes.
func fillnop(p []byte, n int) {
	longest := len(nop)
	for n > 0 {
		size := n
		if size > longest {
			size = longest
		}
		encoding := nop[size-1]
		copy(p[:size], encoding[:size])
		p = p[size:]
		n -= size
	}
}
1840
1841func noppad(ctxt *obj.Link, s *obj.LSym, c int32, pad int32) int32 {
1842	s.Grow(int64(c) + int64(pad))
1843	fillnop(s.P[c:], int(pad))
1844	return c + pad
1845}
1846
1847func spadjop(ctxt *obj.Link, l, q obj.As) obj.As {
1848	if ctxt.Arch.Family != sys.AMD64 || ctxt.Arch.PtrSize == 4 {
1849		return l
1850	}
1851	return q
1852}
1853
1854// isJump returns whether p is a jump instruction.
1855// It is used to ensure that no standalone or macro-fused jump will straddle
1856// or end on a 32 byte boundary by inserting NOPs before the jumps.
1857func isJump(p *obj.Prog) bool {
1858	return p.To.Target() != nil || p.As == obj.AJMP || p.As == obj.ACALL ||
1859		p.As == obj.ARET || p.As == obj.ADUFFCOPY || p.As == obj.ADUFFZERO
1860}
1861
1862// lookForJCC returns the first real instruction starting from p, if that instruction is a conditional
1863// jump. Otherwise, nil is returned.
1864func lookForJCC(p *obj.Prog) *obj.Prog {
1865	// Skip any PCDATA, FUNCDATA or NOP instructions
1866	var q *obj.Prog
1867	for q = p.Link; q != nil && (q.As == obj.APCDATA || q.As == obj.AFUNCDATA || q.As == obj.ANOP); q = q.Link {
1868	}
1869
1870	if q == nil || q.To.Target() == nil || p.As == obj.AJMP || p.As == obj.ACALL {
1871		return nil
1872	}
1873
1874	switch q.As {
1875	case AJOS, AJOC, AJCS, AJCC, AJEQ, AJNE, AJLS, AJHI,
1876		AJMI, AJPL, AJPS, AJPC, AJLT, AJGE, AJLE, AJGT:
1877	default:
1878		return nil
1879	}
1880
1881	return q
1882}
1883
1884// fusedJump determines whether p can be fused with a subsequent conditional jump instruction.
1885// If it can, we return true followed by the total size of the fused jump. If it can't, we return false.
1886// Macro fusion rules are derived from the Intel Optimization Manual (April 2019) section 3.4.2.2.
1887func fusedJump(p *obj.Prog) (bool, uint8) {
1888	var fusedSize uint8
1889
1890	// The first instruction in a macro fused pair may be preceded by the LOCK prefix,
1891	// or possibly an XACQUIRE/XRELEASE prefix followed by a LOCK prefix. If it is, we
1892	// need to be careful to insert any padding before the locks rather than directly after them.
1893
1894	if p.As == AXRELEASE || p.As == AXACQUIRE {
1895		fusedSize += p.Isize
1896		for p = p.Link; p != nil && (p.As == obj.APCDATA || p.As == obj.AFUNCDATA); p = p.Link {
1897		}
1898		if p == nil {
1899			return false, 0
1900		}
1901	}
1902	if p.As == ALOCK {
1903		fusedSize += p.Isize
1904		for p = p.Link; p != nil && (p.As == obj.APCDATA || p.As == obj.AFUNCDATA); p = p.Link {
1905		}
1906		if p == nil {
1907			return false, 0
1908		}
1909	}
1910	cmp := p.As == ACMPB || p.As == ACMPL || p.As == ACMPQ || p.As == ACMPW
1911
1912	cmpAddSub := p.As == AADDB || p.As == AADDL || p.As == AADDW || p.As == AADDQ ||
1913		p.As == ASUBB || p.As == ASUBL || p.As == ASUBW || p.As == ASUBQ || cmp
1914
1915	testAnd := p.As == ATESTB || p.As == ATESTL || p.As == ATESTQ || p.As == ATESTW ||
1916		p.As == AANDB || p.As == AANDL || p.As == AANDQ || p.As == AANDW
1917
1918	incDec := p.As == AINCB || p.As == AINCL || p.As == AINCQ || p.As == AINCW ||
1919		p.As == ADECB || p.As == ADECL || p.As == ADECQ || p.As == ADECW
1920
1921	if !cmpAddSub && !testAnd && !incDec {
1922		return false, 0
1923	}
1924
1925	if !incDec {
1926		var argOne obj.AddrType
1927		var argTwo obj.AddrType
1928		if cmp {
1929			argOne = p.From.Type
1930			argTwo = p.To.Type
1931		} else {
1932			argOne = p.To.Type
1933			argTwo = p.From.Type
1934		}
1935		if argOne == obj.TYPE_REG {
1936			if argTwo != obj.TYPE_REG && argTwo != obj.TYPE_CONST && argTwo != obj.TYPE_MEM {
1937				return false, 0
1938			}
1939		} else if argOne == obj.TYPE_MEM {
1940			if argTwo != obj.TYPE_REG {
1941				return false, 0
1942			}
1943		} else {
1944			return false, 0
1945		}
1946	}
1947
1948	fusedSize += p.Isize
1949	jmp := lookForJCC(p)
1950	if jmp == nil {
1951		return false, 0
1952	}
1953
1954	fusedSize += jmp.Isize
1955
1956	if testAnd {
1957		return true, fusedSize
1958	}
1959
1960	if jmp.As == AJOC || jmp.As == AJOS || jmp.As == AJMI ||
1961		jmp.As == AJPL || jmp.As == AJPS || jmp.As == AJPC {
1962		return false, 0
1963	}
1964
1965	if cmpAddSub {
1966		return true, fusedSize
1967	}
1968
1969	if jmp.As == AJCS || jmp.As == AJCC || jmp.As == AJHI || jmp.As == AJLS {
1970		return false, 0
1971	}
1972
1973	return true, fusedSize
1974}
1975
// padJumpsCtx is the byte boundary that padJump keeps jumps from crossing
// or ending on. A zero value disables jump padding.
type padJumpsCtx int32
1977
1978func makePjcCtx(ctxt *obj.Link) padJumpsCtx {
1979	// Disable jump padding on 32 bit builds by settting
1980	// padJumps to 0.
1981	if ctxt.Arch.Family == sys.I386 {
1982		return padJumpsCtx(0)
1983	}
1984
1985	// Disable jump padding for hand written assembly code.
1986	if ctxt.IsAsm {
1987		return padJumpsCtx(0)
1988	}
1989
1990	return padJumpsCtx(32)
1991}
1992
1993// padJump detects whether the instruction being assembled is a standalone or a macro-fused
1994// jump that needs to be padded. If it is, NOPs are inserted to ensure that the jump does
1995// not cross or end on a 32 byte boundary.
1996func (pjc padJumpsCtx) padJump(ctxt *obj.Link, s *obj.LSym, p *obj.Prog, c int32) int32 {
1997	if pjc == 0 {
1998		return c
1999	}
2000
2001	var toPad int32
2002	fj, fjSize := fusedJump(p)
2003	mask := int32(pjc - 1)
2004	if fj {
2005		if (c&mask)+int32(fjSize) >= int32(pjc) {
2006			toPad = int32(pjc) - (c & mask)
2007		}
2008	} else if isJump(p) {
2009		if (c&mask)+int32(p.Isize) >= int32(pjc) {
2010			toPad = int32(pjc) - (c & mask)
2011		}
2012	}
2013	if toPad <= 0 {
2014		return c
2015	}
2016
2017	return noppad(ctxt, s, c, toPad)
2018}
2019
2020// reAssemble is called if an instruction's size changes during assembly. If
2021// it does and the instruction is a standalone or a macro-fused jump we need to
2022// reassemble.
2023func (pjc padJumpsCtx) reAssemble(p *obj.Prog) bool {
2024	if pjc == 0 {
2025		return false
2026	}
2027
2028	fj, _ := fusedJump(p)
2029	return fj || isJump(p)
2030}
2031
// nopPad records padding that was inserted while assembling, so that span6
// can afterwards splice a matching NOP Prog into the instruction list.
type nopPad struct {
	p *obj.Prog // Instruction before the pad
	n int32     // Size of the pad
}
2036
// span6 assembles the instructions of the function symbol s into s.P.
//
// It first rewrites some instructions in place: a branch without a target is
// made to target itself, ADJSP becomes an ADD/SUB on SP (or a NOP), and,
// when ctxt.Retpoline is set, register-indirect CALL/JMP are redirected to
// runtime.retpoline* symbols. It then encodes every Prog, starting with
// short branch encodings, and repeats the whole pass for as long as a
// forward branch does not fit its short encoding or a padded (possibly
// fused) jump changes size. Finally it splices NOP Progs for the inserted
// padding into the instruction list, sets s.Size, and, when one-instruction
// TLS access is not available, marks the second instruction of the TLS
// access sequence as an unsafe point.
//
// Nothing is assembled if s.P is already non-nil.
func span6(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
	if ctxt.Retpoline && ctxt.Arch.Family == sys.I386 {
		ctxt.Diag("-spectre=ret not supported on 386")
		ctxt.Retpoline = false // don't keep printing
	}

	pjc := makePjcCtx(ctxt)

	// s already has code bytes; do not assemble it again.
	if s.P != nil {
		return
	}

	if ycover[0] == 0 {
		ctxt.Diag("x86 tables not initialized, call x86.instinit first")
	}

	// Pre-pass: rewrite instructions that need a different form before
	// they are encoded.
	for p := s.Func().Text; p != nil; p = p.Link {
		// A branch with no target is made to target itself.
		if p.To.Type == obj.TYPE_BRANCH && p.To.Target() == nil {
			p.To.SetTarget(p)
		}
		if p.As == AADJSP {
			p.To.Type = obj.TYPE_REG
			p.To.Reg = REG_SP
			// Generate 'ADDQ $x, SP' or 'SUBQ $x, SP', with x positive.
			// One exception: It is smaller to encode $-0x80 than $0x80.
			// For that case, flip the sign and the op:
			// Instead of 'ADDQ $0x80, SP', generate 'SUBQ $-0x80, SP'.
			switch v := p.From.Offset; {
			case v == 0:
				p.As = obj.ANOP
			case v == 0x80 || (v < 0 && v != -0x80):
				p.As = spadjop(ctxt, AADDL, AADDQ)
				p.From.Offset *= -1
			default:
				p.As = spadjop(ctxt, ASUBL, ASUBQ)
			}
		}
		if ctxt.Retpoline && (p.As == obj.ACALL || p.As == obj.AJMP) && (p.To.Type == obj.TYPE_REG || p.To.Type == obj.TYPE_MEM) {
			// Only the register-indirect form can be redirected;
			// the memory-indirect form is reported as an error.
			if p.To.Type != obj.TYPE_REG {
				ctxt.Diag("non-retpoline-compatible: %v", p)
				continue
			}
			// Turn the indirect transfer into a direct branch to the
			// runtime.retpoline symbol named after the register.
			p.To.Type = obj.TYPE_BRANCH
			p.To.Name = obj.NAME_EXTERN
			p.To.Sym = ctxt.Lookup("runtime.retpoline" + obj.Rconv(int(p.To.Reg)))
			p.To.Reg = 0
			p.To.Offset = 0
		}
	}

	var count int64 // rough count of number of instructions
	for p := s.Func().Text; p != nil; p = p.Link {
		count++
		p.Back = branchShort // use short branches first time through
		// A target that already has branchShort set has been visited by
		// this loop (or is p itself), so the branch goes backwards and
		// its target is flagged as a loop head.
		if q := p.To.Target(); q != nil && (q.Back&branchShort != 0) {
			p.Back |= branchBackwards
			q.Back |= branchLoopHead
		}
	}
	s.GrowCap(count * 5) // preallocate roughly 5 bytes per instruction

	var ab AsmBuf
	var n int   // number of assembly passes made so far
	var c int32 // current offset into s.P
	errors := ctxt.Errors
	var nops []nopPad // Padding for a particular assembly (reuse slice storage if multiple assemblies)
	nrelocs0 := len(s.R)
	for {
		// This loop continues while there are reasons to re-assemble
		// whole block, like the presence of long forward jumps.
		reAssemble := false
		// Drop the relocations added by the previous pass.
		for i := range s.R[nrelocs0:] {
			s.R[nrelocs0+i] = obj.Reloc{}
		}
		s.R = s.R[:nrelocs0] // preserve marker relocations generated by the compiler
		s.P = s.P[:0]
		c = 0
		var pPrev *obj.Prog
		nops = nops[:0]
		for p := s.Func().Text; p != nil; p = p.Link {
			c0 := c // offset before any padding for p
			c = pjc.padJump(ctxt, s, p, c)

			if maxLoopPad > 0 && p.Back&branchLoopHead != 0 && c&(loopAlign-1) != 0 {
				// pad with NOPs
				v := -c & (loopAlign - 1) // bytes needed to reach the next loopAlign boundary

				if v <= maxLoopPad {
					s.Grow(int64(c) + int64(v))
					fillnop(s.P[c:], int(v))
					c += v
				}
			}

			p.Pc = int64(c)

			// process forward jumps to p
			for q := p.Rel; q != nil; q = q.Forwd {
				// v is the distance from the end of q to p.
				v := int32(p.Pc - (q.Pc + int64(q.Isize)))
				if q.Back&branchShort != 0 {
					if v > 127 {
						// Too far for a one-byte displacement: clear
						// the short flag and assemble again.
						reAssemble = true
						q.Back ^= branchShort
					}

					// The displacement byte is at offset 2 for JCXZL
					// and XBEGIN and at offset 1 otherwise.
					if q.As == AJCXZL || q.As == AXBEGIN {
						s.P[q.Pc+2] = byte(v)
					} else {
						s.P[q.Pc+1] = byte(v)
					}
				} else {
					// Long form: the displacement is the last 4 bytes
					// of the instruction.
					binary.LittleEndian.PutUint32(s.P[q.Pc+int64(q.Isize)-4:], uint32(v))
				}
			}

			p.Rel = nil

			p.Pc = int64(c)
			ab.asmins(ctxt, s, p)
			m := ab.Len()
			if int(p.Isize) != m {
				p.Isize = uint8(m)
				if pjc.reAssemble(p) {
					// We need to re-assemble here to check for jumps and fused jumps
					// that span or end on 32 byte boundaries.
					reAssemble = true
				}
			}

			s.Grow(p.Pc + int64(m))
			copy(s.P[p.Pc:], ab.Bytes())
			// If there was padding, remember it.
			if pPrev != nil && !ctxt.IsAsm && c > c0 {
				nops = append(nops, nopPad{p: pPrev, n: c - c0})
			}
			c += int32(m)
			pPrev = p
		}

		n++
		if n > 1000 {
			ctxt.Diag("span must be looping")
			log.Fatalf("loop")
		}
		if !reAssemble {
			break
		}
		// Stop retrying once assembling has reported new errors.
		if ctxt.Errors > errors {
			return
		}
	}
	// splice padding nops into Progs
	for _, n := range nops {
		pp := n.p
		// The NOP goes right after pp and starts where pp ends.
		np := &obj.Prog{Link: pp.Link, Ctxt: pp.Ctxt, As: obj.ANOP, Pos: pp.Pos.WithNotStmt(), Pc: pp.Pc + int64(pp.Isize), Isize: uint8(n.n)}
		pp.Link = np
	}

	s.Size = int64(c)

	// Disabled debugging dump of the assembled bytes and relocations.
	if false { /* debug['a'] > 1 */
		fmt.Printf("span1 %s %d (%d tries)\n %.6x", s.Name, s.Size, n, 0)
		var i int
		for i = 0; i < len(s.P); i++ {
			fmt.Printf(" %.2x", s.P[i])
			if i%16 == 15 {
				fmt.Printf("\n  %.6x", uint(i+1))
			}
		}

		if i%16 != 0 {
			fmt.Printf("\n")
		}

		for i := 0; i < len(s.R); i++ {
			r := &s.R[i]
			fmt.Printf(" rel %#.4x/%d %s%+d\n", uint32(r.Off), r.Siz, r.Sym.Name, r.Add)
		}
	}

	// Mark nonpreemptible instruction sequences.
	// The 2-instruction TLS access sequence
	//	MOVQ TLS, BX
	//	MOVQ 0(BX)(TLS*1), BX
	// is not async preemptible, as if it is preempted and resumed on
	// a different thread, the TLS address may become invalid.
	if !CanUse1InsnTLS(ctxt) {
		useTLS := func(p *obj.Prog) bool {
			// Only need to mark the second instruction, which has
			// REG_TLS as Index. (It is okay to interrupt and restart
			// the first instruction.)
			return p.From.Index == REG_TLS
		}
		obj.MarkUnsafePoints(ctxt, s.Func().Text, newprog, useTLS, nil)
	}
}
2233
// instinit fills in the package-level tables used by the assembler:
// opindex (opcode to Optab entry), ycover (a Ymax-by-Ymax matrix of operand
// class pairs) and reg/regrex (per-register values). On Plan 9 it also
// looks up the _privates symbol. It returns immediately if ycover[0] is
// already non-zero, i.e. the tables have been initialized before.
func instinit(ctxt *obj.Link) {
	if ycover[0] != 0 {
		// Already initialized; stop now.
		// This happens in the cmd/asm tests,
		// each of which re-initializes the arch.
		return
	}

	switch ctxt.Headtype {
	case objabi.Hplan9:
		plan9privates = ctxt.Lookup("_privates")
	}

	// Index both opcode tables by opcode. An opcode that already has an
	// entry is reported as a phase error (and then overwritten).
	for i := range avxOptab {
		c := avxOptab[i].as
		if opindex[c&obj.AMask] != nil {
			ctxt.Diag("phase error in avxOptab: %d (%v)", i, c)
		}
		opindex[c&obj.AMask] = &avxOptab[i]
	}
	// optab is walked from index 1 up to its terminating entry with as == 0.
	for i := 1; optab[i].as != 0; i++ {
		c := optab[i].as
		if opindex[c&obj.AMask] != nil {
			ctxt.Diag("phase error in optab: %d (%v)", i, c)
		}
		opindex[c&obj.AMask] = &optab[i]
	}

	// ycover[a*Ymax+b] is set for selected class pairs (a, b).
	// NOTE(review): presumably this means an operand of class a is accepted
	// where class b is required — confirm against the code that reads ycover.
	// Every class is paired with itself.
	for i := 0; i < Ymax; i++ {
		ycover[i*Ymax+i] = 1
	}

	// Immediate classes.
	ycover[Yi0*Ymax+Yu2] = 1
	ycover[Yi1*Ymax+Yu2] = 1

	ycover[Yi0*Ymax+Yi8] = 1
	ycover[Yi1*Ymax+Yi8] = 1
	ycover[Yu2*Ymax+Yi8] = 1
	ycover[Yu7*Ymax+Yi8] = 1

	ycover[Yi0*Ymax+Yu7] = 1
	ycover[Yi1*Ymax+Yu7] = 1
	ycover[Yu2*Ymax+Yu7] = 1

	ycover[Yi0*Ymax+Yu8] = 1
	ycover[Yi1*Ymax+Yu8] = 1
	ycover[Yu2*Ymax+Yu8] = 1
	ycover[Yu7*Ymax+Yu8] = 1

	ycover[Yi0*Ymax+Ys32] = 1
	ycover[Yi1*Ymax+Ys32] = 1
	ycover[Yu2*Ymax+Ys32] = 1
	ycover[Yu7*Ymax+Ys32] = 1
	ycover[Yu8*Ymax+Ys32] = 1
	ycover[Yi8*Ymax+Ys32] = 1

	ycover[Yi0*Ymax+Yi32] = 1
	ycover[Yi1*Ymax+Yi32] = 1
	ycover[Yu2*Ymax+Yi32] = 1
	ycover[Yu7*Ymax+Yi32] = 1
	ycover[Yu8*Ymax+Yi32] = 1
	ycover[Yi8*Ymax+Yi32] = 1
	ycover[Ys32*Ymax+Yi32] = 1

	ycover[Yi0*Ymax+Yi64] = 1
	ycover[Yi1*Ymax+Yi64] = 1
	ycover[Yu7*Ymax+Yi64] = 1
	ycover[Yu2*Ymax+Yi64] = 1
	ycover[Yu8*Ymax+Yi64] = 1
	ycover[Yi8*Ymax+Yi64] = 1
	ycover[Ys32*Ymax+Yi64] = 1
	ycover[Yi32*Ymax+Yi64] = 1

	// General-purpose register classes.
	ycover[Yal*Ymax+Yrb] = 1
	ycover[Ycl*Ymax+Yrb] = 1
	ycover[Yax*Ymax+Yrb] = 1
	ycover[Ycx*Ymax+Yrb] = 1
	ycover[Yrx*Ymax+Yrb] = 1
	ycover[Yrl*Ymax+Yrb] = 1 // but not Yrl32

	ycover[Ycl*Ymax+Ycx] = 1

	ycover[Yax*Ymax+Yrx] = 1
	ycover[Ycx*Ymax+Yrx] = 1

	ycover[Yax*Ymax+Yrl] = 1
	ycover[Ycx*Ymax+Yrl] = 1
	ycover[Yrx*Ymax+Yrl] = 1
	ycover[Yrl32*Ymax+Yrl] = 1

	ycover[Yf0*Ymax+Yrf] = 1

	// Register-or-memory classes.
	ycover[Yal*Ymax+Ymb] = 1
	ycover[Ycl*Ymax+Ymb] = 1
	ycover[Yax*Ymax+Ymb] = 1
	ycover[Ycx*Ymax+Ymb] = 1
	ycover[Yrx*Ymax+Ymb] = 1
	ycover[Yrb*Ymax+Ymb] = 1
	ycover[Yrl*Ymax+Ymb] = 1 // but not Yrl32
	ycover[Ym*Ymax+Ymb] = 1

	ycover[Yax*Ymax+Yml] = 1
	ycover[Ycx*Ymax+Yml] = 1
	ycover[Yrx*Ymax+Yml] = 1
	ycover[Yrl*Ymax+Yml] = 1
	ycover[Yrl32*Ymax+Yml] = 1
	ycover[Ym*Ymax+Yml] = 1

	ycover[Yax*Ymax+Ymm] = 1
	ycover[Ycx*Ymax+Ymm] = 1
	ycover[Yrx*Ymax+Ymm] = 1
	ycover[Yrl*Ymax+Ymm] = 1
	ycover[Yrl32*Ymax+Ymm] = 1
	ycover[Ym*Ymax+Ymm] = 1
	ycover[Ymr*Ymax+Ymm] = 1

	// Vector register and vector register-or-memory classes.
	ycover[Yxr0*Ymax+Yxr] = 1

	ycover[Ym*Ymax+Yxm] = 1
	ycover[Yxr0*Ymax+Yxm] = 1
	ycover[Yxr*Ymax+Yxm] = 1

	ycover[Ym*Ymax+Yym] = 1
	ycover[Yyr*Ymax+Yym] = 1

	ycover[Yxr0*Ymax+YxrEvex] = 1
	ycover[Yxr*Ymax+YxrEvex] = 1

	ycover[Ym*Ymax+YxmEvex] = 1
	ycover[Yxr0*Ymax+YxmEvex] = 1
	ycover[Yxr*Ymax+YxmEvex] = 1
	ycover[YxrEvex*Ymax+YxmEvex] = 1

	ycover[Yyr*Ymax+YyrEvex] = 1

	ycover[Ym*Ymax+YymEvex] = 1
	ycover[Yyr*Ymax+YymEvex] = 1
	ycover[YyrEvex*Ymax+YymEvex] = 1

	ycover[Ym*Ymax+Yzm] = 1
	ycover[Yzr*Ymax+Yzm] = 1

	// K register classes.
	ycover[Yk0*Ymax+Yk] = 1
	ycover[Yknot0*Ymax+Yk] = 1

	ycover[Yk0*Ymax+Ykm] = 1
	ycover[Yknot0*Ymax+Ykm] = 1
	ycover[Yk*Ymax+Ykm] = 1
	ycover[Ym*Ymax+Ykm] = 1

	// Vector-indexed memory classes.
	ycover[Yxvm*Ymax+YxvmEvex] = 1

	ycover[Yyvm*Ymax+YyvmEvex] = 1

	// Fill reg and regrex for every register number. reg[i] is the
	// register's position within its group masked to 3 bits, and stays -1
	// for registers not matched by any range below. regrex[i] receives the
	// Rxr/Rxx/Rxb bits for positions 8 and up, and RxrEvex for the X16,
	// Y16 and Z16 groups.
	for i := 0; i < MAXREG; i++ {
		reg[i] = -1
		if i >= REG_AL && i <= REG_R15B {
			reg[i] = (i - REG_AL) & 7
			// NOTE(review): 0x40 is presumably a bare REX prefix, which
			// these byte registers would need to be addressable — confirm.
			if i >= REG_SPB && i <= REG_DIB {
				regrex[i] = 0x40
			}
			if i >= REG_R8B && i <= REG_R15B {
				regrex[i] = Rxr | Rxx | Rxb
			}
		}

		// AH..BH get numbers starting at 4.
		if i >= REG_AH && i <= REG_BH {
			reg[i] = 4 + ((i - REG_AH) & 7)
		}
		if i >= REG_AX && i <= REG_R15 {
			reg[i] = (i - REG_AX) & 7
			if i >= REG_R8 {
				regrex[i] = Rxr | Rxx | Rxb
			}
		}

		if i >= REG_F0 && i <= REG_F0+7 {
			reg[i] = (i - REG_F0) & 7
		}
		if i >= REG_M0 && i <= REG_M0+7 {
			reg[i] = (i - REG_M0) & 7
		}
		if i >= REG_K0 && i <= REG_K0+7 {
			reg[i] = (i - REG_K0) & 7
		}
		if i >= REG_X0 && i <= REG_X0+15 {
			reg[i] = (i - REG_X0) & 7
			if i >= REG_X0+8 {
				regrex[i] = Rxr | Rxx | Rxb
			}
		}
		if i >= REG_X16 && i <= REG_X16+15 {
			reg[i] = (i - REG_X16) & 7
			if i >= REG_X16+8 {
				regrex[i] = Rxr | Rxx | Rxb | RxrEvex
			} else {
				regrex[i] = RxrEvex
			}
		}
		if i >= REG_Y0 && i <= REG_Y0+15 {
			reg[i] = (i - REG_Y0) & 7
			if i >= REG_Y0+8 {
				regrex[i] = Rxr | Rxx | Rxb
			}
		}
		if i >= REG_Y16 && i <= REG_Y16+15 {
			reg[i] = (i - REG_Y16) & 7
			if i >= REG_Y16+8 {
				regrex[i] = Rxr | Rxx | Rxb | RxrEvex
			} else {
				regrex[i] = RxrEvex
			}
		}
		if i >= REG_Z0 && i <= REG_Z0+15 {
			reg[i] = (i - REG_Z0) & 7
			if i > REG_Z0+7 {
				regrex[i] = Rxr | Rxx | Rxb
			}
		}
		if i >= REG_Z16 && i <= REG_Z16+15 {
			reg[i] = (i - REG_Z16) & 7
			if i >= REG_Z16+8 {
				regrex[i] = Rxr | Rxx | Rxb | RxrEvex
			} else {
				regrex[i] = RxrEvex
			}
		}

		// CR8 through CR15 only get the Rxr bit; reg is left untouched here.
		if i >= REG_CR+8 && i <= REG_CR+15 {
			regrex[i] = Rxr
		}
	}
}
2467
// isAndroid reports whether buildcfg.GOOS is "android". prefixof consults
// it when choosing the segment register used for TLS references.
var isAndroid = buildcfg.GOOS == "android"
2469
2470func prefixof(ctxt *obj.Link, a *obj.Addr) int {
2471	if a.Reg < REG_CS && a.Index < REG_CS { // fast path
2472		return 0
2473	}
2474	if a.Type == obj.TYPE_MEM && a.Name == obj.NAME_NONE {
2475		switch a.Reg {
2476		case REG_CS:
2477			return 0x2e
2478
2479		case REG_DS:
2480			return 0x3e
2481
2482		case REG_ES:
2483			return 0x26
2484
2485		case REG_FS:
2486			return 0x64
2487
2488		case REG_GS:
2489			return 0x65
2490
2491		case REG_TLS:
2492			// NOTE: Systems listed here should be only systems that
2493			// support direct TLS references like 8(TLS) implemented as
2494			// direct references from FS or GS. Systems that require
2495			// the initial-exec model, where you load the TLS base into
2496			// a register and then index from that register, do not reach
2497			// this code and should not be listed.
2498			if ctxt.Arch.Family == sys.I386 {
2499				switch ctxt.Headtype {
2500				default:
2501					if isAndroid {
2502						return 0x65 // GS
2503					}
2504					log.Fatalf("unknown TLS base register for %v", ctxt.Headtype)
2505
2506				case objabi.Hdarwin,
2507					objabi.Hdragonfly,
2508					objabi.Hfreebsd,
2509					objabi.Hnetbsd,
2510					objabi.Hopenbsd:
2511					return 0x65 // GS
2512				}
2513			}
2514
2515			switch ctxt.Headtype {
2516			default:
2517				log.Fatalf("unknown TLS base register for %v", ctxt.Headtype)
2518
2519			case objabi.Hlinux:
2520				if isAndroid {
2521					return 0x64 // FS
2522				}
2523
2524				if ctxt.Flag_shared {
2525					log.Fatalf("unknown TLS base register for linux with -shared")
2526				} else {
2527					return 0x64 // FS
2528				}
2529
2530			case objabi.Hdragonfly,
2531				objabi.Hfreebsd,
2532				objabi.Hnetbsd,
2533				objabi.Hopenbsd,
2534				objabi.Hsolaris:
2535				return 0x64 // FS
2536
2537			case objabi.Hdarwin:
2538				return 0x65 // GS
2539			}
2540		}
2541	}
2542
2543	if ctxt.Arch.Family == sys.I386 {
2544		if a.Index == REG_TLS && ctxt.Flag_shared {
2545			// When building for inclusion into a shared library, an instruction of the form
2546			//     MOVL off(CX)(TLS*1), AX
2547			// becomes
2548			//     mov %gs:off(%ecx), %eax
2549			// which assumes that the correct TLS offset has been loaded into %ecx (today
2550			// there is only one TLS variable -- g -- so this is OK). When not building for
2551			// a shared library the instruction it becomes
2552			//     mov 0x0(%ecx), %eax
2553			// and a R_TLS_LE relocation, and so does not require a prefix.
2554			return 0x65 // GS
2555		}
2556		return 0
2557	}
2558
2559	switch a.Index {
2560	case REG_CS:
2561		return 0x2e
2562
2563	case REG_DS:
2564		return 0x3e
2565
2566	case REG_ES:
2567		return 0x26
2568
2569	case REG_TLS:
2570		if ctxt.Flag_shared && ctxt.Headtype != objabi.Hwindows {
2571			// When building for inclusion into a shared library, an instruction of the form
2572			//     MOV off(CX)(TLS*1), AX
2573			// becomes
2574			//     mov %fs:off(%rcx), %rax
2575			// which assumes that the correct TLS offset has been loaded into %rcx (today
2576			// there is only one TLS variable -- g -- so this is OK). When not building for
2577			// a shared library the instruction does not require a prefix.
2578			return 0x64
2579		}
2580
2581	case REG_FS:
2582		return 0x64
2583
2584	case REG_GS:
2585		return 0x65
2586	}
2587
2588	return 0
2589}
2590
2591// oclassRegList returns multisource operand class for addr.
2592func oclassRegList(ctxt *obj.Link, addr *obj.Addr) int {
2593	// TODO(quasilyte): when oclass register case is refactored into
2594	// lookup table, use it here to get register kind more easily.
2595	// Helper functions like regIsXmm should go away too (they will become redundant).
2596
2597	regIsXmm := func(r int) bool { return r >= REG_X0 && r <= REG_X31 }
2598	regIsYmm := func(r int) bool { return r >= REG_Y0 && r <= REG_Y31 }
2599	regIsZmm := func(r int) bool { return r >= REG_Z0 && r <= REG_Z31 }
2600
2601	reg0, reg1 := decodeRegisterRange(addr.Offset)
2602	low := regIndex(int16(reg0))
2603	high := regIndex(int16(reg1))
2604
2605	if ctxt.Arch.Family == sys.I386 {
2606		if low >= 8 || high >= 8 {
2607			return Yxxx
2608		}
2609	}
2610
2611	switch high - low {
2612	case 3:
2613		switch {
2614		case regIsXmm(reg0) && regIsXmm(reg1):
2615			return YxrEvexMulti4
2616		case regIsYmm(reg0) && regIsYmm(reg1):
2617			return YyrEvexMulti4
2618		case regIsZmm(reg0) && regIsZmm(reg1):
2619			return YzrMulti4
2620		default:
2621			return Yxxx
2622		}
2623	default:
2624		return Yxxx
2625	}
2626}
2627
2628// oclassVMem returns V-mem (vector memory with VSIB) operand class.
2629// For addr that is not V-mem returns (Yxxx, false).
2630func oclassVMem(ctxt *obj.Link, addr *obj.Addr) (int, bool) {
2631	switch addr.Index {
2632	case REG_X0 + 0,
2633		REG_X0 + 1,
2634		REG_X0 + 2,
2635		REG_X0 + 3,
2636		REG_X0 + 4,
2637		REG_X0 + 5,
2638		REG_X0 + 6,
2639		REG_X0 + 7:
2640		return Yxvm, true
2641	case REG_X8 + 0,
2642		REG_X8 + 1,
2643		REG_X8 + 2,
2644		REG_X8 + 3,
2645		REG_X8 + 4,
2646		REG_X8 + 5,
2647		REG_X8 + 6,
2648		REG_X8 + 7:
2649		if ctxt.Arch.Family == sys.I386 {
2650			return Yxxx, true
2651		}
2652		return Yxvm, true
2653	case REG_X16 + 0,
2654		REG_X16 + 1,
2655		REG_X16 + 2,
2656		REG_X16 + 3,
2657		REG_X16 + 4,
2658		REG_X16 + 5,
2659		REG_X16 + 6,
2660		REG_X16 + 7,
2661		REG_X16 + 8,
2662		REG_X16 + 9,
2663		REG_X16 + 10,
2664		REG_X16 + 11,
2665		REG_X16 + 12,
2666		REG_X16 + 13,
2667		REG_X16 + 14,
2668		REG_X16 + 15:
2669		if ctxt.Arch.Family == sys.I386 {
2670			return Yxxx, true
2671		}
2672		return YxvmEvex, true
2673
2674	case REG_Y0 + 0,
2675		REG_Y0 + 1,
2676		REG_Y0 + 2,
2677		REG_Y0 + 3,
2678		REG_Y0 + 4,
2679		REG_Y0 + 5,
2680		REG_Y0 + 6,
2681		REG_Y0 + 7:
2682		return Yyvm, true
2683	case REG_Y8 + 0,
2684		REG_Y8 + 1,
2685		REG_Y8 + 2,
2686		REG_Y8 + 3,
2687		REG_Y8 + 4,
2688		REG_Y8 + 5,
2689		REG_Y8 + 6,
2690		REG_Y8 + 7:
2691		if ctxt.Arch.Family == sys.I386 {
2692			return Yxxx, true
2693		}
2694		return Yyvm, true
2695	case REG_Y16 + 0,
2696		REG_Y16 + 1,
2697		REG_Y16 + 2,
2698		REG_Y16 + 3,
2699		REG_Y16 + 4,
2700		REG_Y16 + 5,
2701		REG_Y16 + 6,
2702		REG_Y16 + 7,
2703		REG_Y16 + 8,
2704		REG_Y16 + 9,
2705		REG_Y16 + 10,
2706		REG_Y16 + 11,
2707		REG_Y16 + 12,
2708		REG_Y16 + 13,
2709		REG_Y16 + 14,
2710		REG_Y16 + 15:
2711		if ctxt.Arch.Family == sys.I386 {
2712			return Yxxx, true
2713		}
2714		return YyvmEvex, true
2715
2716	case REG_Z0 + 0,
2717		REG_Z0 + 1,
2718		REG_Z0 + 2,
2719		REG_Z0 + 3,
2720		REG_Z0 + 4,
2721		REG_Z0 + 5,
2722		REG_Z0 + 6,
2723		REG_Z0 + 7:
2724		return Yzvm, true
2725	case REG_Z8 + 0,
2726		REG_Z8 + 1,
2727		REG_Z8 + 2,
2728		REG_Z8 + 3,
2729		REG_Z8 + 4,
2730		REG_Z8 + 5,
2731		REG_Z8 + 6,
2732		REG_Z8 + 7,
2733		REG_Z8 + 8,
2734		REG_Z8 + 9,
2735		REG_Z8 + 10,
2736		REG_Z8 + 11,
2737		REG_Z8 + 12,
2738		REG_Z8 + 13,
2739		REG_Z8 + 14,
2740		REG_Z8 + 15,
2741		REG_Z8 + 16,
2742		REG_Z8 + 17,
2743		REG_Z8 + 18,
2744		REG_Z8 + 19,
2745		REG_Z8 + 20,
2746		REG_Z8 + 21,
2747		REG_Z8 + 22,
2748		REG_Z8 + 23:
2749		if ctxt.Arch.Family == sys.I386 {
2750			return Yxxx, true
2751		}
2752		return Yzvm, true
2753	}
2754
2755	return Yxxx, false
2756}
2757
2758func oclass(ctxt *obj.Link, p *obj.Prog, a *obj.Addr) int {
2759	switch a.Type {
2760	case obj.TYPE_REGLIST:
2761		return oclassRegList(ctxt, a)
2762
2763	case obj.TYPE_NONE:
2764		return Ynone
2765
2766	case obj.TYPE_BRANCH:
2767		return Ybr
2768
2769	case obj.TYPE_INDIR:
2770		if a.Name != obj.NAME_NONE && a.Reg == REG_NONE && a.Index == REG_NONE && a.Scale == 0 {
2771			return Yindir
2772		}
2773		return Yxxx
2774
2775	case obj.TYPE_MEM:
2776		// Pseudo registers have negative index, but SP is
2777		// not pseudo on x86, hence REG_SP check is not redundant.
2778		if a.Index == REG_SP || a.Index < 0 {
2779			// Can't use FP/SB/PC/SP as the index register.
2780			return Yxxx
2781		}
2782
2783		if vmem, ok := oclassVMem(ctxt, a); ok {
2784			return vmem
2785		}
2786
2787		if ctxt.Arch.Family == sys.AMD64 {
2788			switch a.Name {
2789			case obj.NAME_EXTERN, obj.NAME_STATIC, obj.NAME_GOTREF:
2790				// Global variables can't use index registers and their
2791				// base register is %rip (%rip is encoded as REG_NONE).
2792				if a.Reg != REG_NONE || a.Index != REG_NONE || a.Scale != 0 {
2793					return Yxxx
2794				}
2795			case obj.NAME_AUTO, obj.NAME_PARAM:
2796				// These names must have a base of SP.  The old compiler
2797				// uses 0 for the base register. SSA uses REG_SP.
2798				if a.Reg != REG_SP && a.Reg != 0 {
2799					return Yxxx
2800				}
2801			case obj.NAME_NONE:
2802				// everything is ok
2803			default:
2804				// unknown name
2805				return Yxxx
2806			}
2807		}
2808		return Ym
2809
2810	case obj.TYPE_ADDR:
2811		switch a.Name {
2812		case obj.NAME_GOTREF:
2813			ctxt.Diag("unexpected TYPE_ADDR with NAME_GOTREF")
2814			return Yxxx
2815
2816		case obj.NAME_EXTERN,
2817			obj.NAME_STATIC:
2818			if a.Sym != nil && useAbs(ctxt, a.Sym) {
2819				return Yi32
2820			}
2821			return Yiauto // use pc-relative addressing
2822
2823		case obj.NAME_AUTO,
2824			obj.NAME_PARAM:
2825			return Yiauto
2826		}
2827
2828		// TODO(rsc): DUFFZERO/DUFFCOPY encoding forgot to set a->index
2829		// and got Yi32 in an earlier version of this code.
2830		// Keep doing that until we fix yduff etc.
2831		if a.Sym != nil && strings.HasPrefix(a.Sym.Name, "runtime.duff") {
2832			return Yi32
2833		}
2834
2835		if a.Sym != nil || a.Name != obj.NAME_NONE {
2836			ctxt.Diag("unexpected addr: %v", obj.Dconv(p, a))
2837		}
2838		fallthrough
2839
2840	case obj.TYPE_CONST:
2841		if a.Sym != nil {
2842			ctxt.Diag("TYPE_CONST with symbol: %v", obj.Dconv(p, a))
2843		}
2844
2845		v := a.Offset
2846		if ctxt.Arch.Family == sys.I386 {
2847			v = int64(int32(v))
2848		}
2849		switch {
2850		case v == 0:
2851			return Yi0
2852		case v == 1:
2853			return Yi1
2854		case v >= 0 && v <= 3:
2855			return Yu2
2856		case v >= 0 && v <= 127:
2857			return Yu7
2858		case v >= 0 && v <= 255:
2859			return Yu8
2860		case v >= -128 && v <= 127:
2861			return Yi8
2862		}
2863		if ctxt.Arch.Family == sys.I386 {
2864			return Yi32
2865		}
2866		l := int32(v)
2867		if int64(l) == v {
2868			return Ys32 // can sign extend
2869		}
2870		if v>>32 == 0 {
2871			return Yi32 // unsigned
2872		}
2873		return Yi64
2874
2875	case obj.TYPE_TEXTSIZE:
2876		return Ytextsize
2877	}
2878
2879	if a.Type != obj.TYPE_REG {
2880		ctxt.Diag("unexpected addr1: type=%d %v", a.Type, obj.Dconv(p, a))
2881		return Yxxx
2882	}
2883
2884	switch a.Reg {
2885	case REG_AL:
2886		return Yal
2887
2888	case REG_AX:
2889		return Yax
2890
2891		/*
2892			case REG_SPB:
2893		*/
2894	case REG_BPB,
2895		REG_SIB,
2896		REG_DIB,
2897		REG_R8B,
2898		REG_R9B,
2899		REG_R10B,
2900		REG_R11B,
2901		REG_R12B,
2902		REG_R13B,
2903		REG_R14B,
2904		REG_R15B:
2905		if ctxt.Arch.Family == sys.I386 {
2906			return Yxxx
2907		}
2908		fallthrough
2909
2910	case REG_DL,
2911		REG_BL,
2912		REG_AH,
2913		REG_CH,
2914		REG_DH,
2915		REG_BH:
2916		return Yrb
2917
2918	case REG_CL:
2919		return Ycl
2920
2921	case REG_CX:
2922		return Ycx
2923
2924	case REG_DX, REG_BX:
2925		return Yrx
2926
2927	case REG_R8, // not really Yrl
2928		REG_R9,
2929		REG_R10,
2930		REG_R11,
2931		REG_R12,
2932		REG_R13,
2933		REG_R14,
2934		REG_R15:
2935		if ctxt.Arch.Family == sys.I386 {
2936			return Yxxx
2937		}
2938		fallthrough
2939
2940	case REG_SP, REG_BP, REG_SI, REG_DI:
2941		if ctxt.Arch.Family == sys.I386 {
2942			return Yrl32
2943		}
2944		return Yrl
2945
2946	case REG_F0 + 0:
2947		return Yf0
2948
2949	case REG_F0 + 1,
2950		REG_F0 + 2,
2951		REG_F0 + 3,
2952		REG_F0 + 4,
2953		REG_F0 + 5,
2954		REG_F0 + 6,
2955		REG_F0 + 7:
2956		return Yrf
2957
2958	case REG_M0 + 0,
2959		REG_M0 + 1,
2960		REG_M0 + 2,
2961		REG_M0 + 3,
2962		REG_M0 + 4,
2963		REG_M0 + 5,
2964		REG_M0 + 6,
2965		REG_M0 + 7:
2966		return Ymr
2967
2968	case REG_X0:
2969		return Yxr0
2970
2971	case REG_X0 + 1,
2972		REG_X0 + 2,
2973		REG_X0 + 3,
2974		REG_X0 + 4,
2975		REG_X0 + 5,
2976		REG_X0 + 6,
2977		REG_X0 + 7,
2978		REG_X0 + 8,
2979		REG_X0 + 9,
2980		REG_X0 + 10,
2981		REG_X0 + 11,
2982		REG_X0 + 12,
2983		REG_X0 + 13,
2984		REG_X0 + 14,
2985		REG_X0 + 15:
2986		return Yxr
2987
2988	case REG_X0 + 16,
2989		REG_X0 + 17,
2990		REG_X0 + 18,
2991		REG_X0 + 19,
2992		REG_X0 + 20,
2993		REG_X0 + 21,
2994		REG_X0 + 22,
2995		REG_X0 + 23,
2996		REG_X0 + 24,
2997		REG_X0 + 25,
2998		REG_X0 + 26,
2999		REG_X0 + 27,
3000		REG_X0 + 28,
3001		REG_X0 + 29,
3002		REG_X0 + 30,
3003		REG_X0 + 31:
3004		return YxrEvex
3005
3006	case REG_Y0 + 0,
3007		REG_Y0 + 1,
3008		REG_Y0 + 2,
3009		REG_Y0 + 3,
3010		REG_Y0 + 4,
3011		REG_Y0 + 5,
3012		REG_Y0 + 6,
3013		REG_Y0 + 7,
3014		REG_Y0 + 8,
3015		REG_Y0 + 9,
3016		REG_Y0 + 10,
3017		REG_Y0 + 11,
3018		REG_Y0 + 12,
3019		REG_Y0 + 13,
3020		REG_Y0 + 14,
3021		REG_Y0 + 15:
3022		return Yyr
3023
3024	case REG_Y0 + 16,
3025		REG_Y0 + 17,
3026		REG_Y0 + 18,
3027		REG_Y0 + 19,
3028		REG_Y0 + 20,
3029		REG_Y0 + 21,
3030		REG_Y0 + 22,
3031		REG_Y0 + 23,
3032		REG_Y0 + 24,
3033		REG_Y0 + 25,
3034		REG_Y0 + 26,
3035		REG_Y0 + 27,
3036		REG_Y0 + 28,
3037		REG_Y0 + 29,
3038		REG_Y0 + 30,
3039		REG_Y0 + 31:
3040		return YyrEvex
3041
3042	case REG_Z0 + 0,
3043		REG_Z0 + 1,
3044		REG_Z0 + 2,
3045		REG_Z0 + 3,
3046		REG_Z0 + 4,
3047		REG_Z0 + 5,
3048		REG_Z0 + 6,
3049		REG_Z0 + 7:
3050		return Yzr
3051
3052	case REG_Z0 + 8,
3053		REG_Z0 + 9,
3054		REG_Z0 + 10,
3055		REG_Z0 + 11,
3056		REG_Z0 + 12,
3057		REG_Z0 + 13,
3058		REG_Z0 + 14,
3059		REG_Z0 + 15,
3060		REG_Z0 + 16,
3061		REG_Z0 + 17,
3062		REG_Z0 + 18,
3063		REG_Z0 + 19,
3064		REG_Z0 + 20,
3065		REG_Z0 + 21,
3066		REG_Z0 + 22,
3067		REG_Z0 + 23,
3068		REG_Z0 + 24,
3069		REG_Z0 + 25,
3070		REG_Z0 + 26,
3071		REG_Z0 + 27,
3072		REG_Z0 + 28,
3073		REG_Z0 + 29,
3074		REG_Z0 + 30,
3075		REG_Z0 + 31:
3076		if ctxt.Arch.Family == sys.I386 {
3077			return Yxxx
3078		}
3079		return Yzr
3080
3081	case REG_K0:
3082		return Yk0
3083
3084	case REG_K0 + 1,
3085		REG_K0 + 2,
3086		REG_K0 + 3,
3087		REG_K0 + 4,
3088		REG_K0 + 5,
3089		REG_K0 + 6,
3090		REG_K0 + 7:
3091		return Yknot0
3092
3093	case REG_CS:
3094		return Ycs
3095	case REG_SS:
3096		return Yss
3097	case REG_DS:
3098		return Yds
3099	case REG_ES:
3100		return Yes
3101	case REG_FS:
3102		return Yfs
3103	case REG_GS:
3104		return Ygs
3105	case REG_TLS:
3106		return Ytls
3107
3108	case REG_GDTR:
3109		return Ygdtr
3110	case REG_IDTR:
3111		return Yidtr
3112	case REG_LDTR:
3113		return Yldtr
3114	case REG_MSW:
3115		return Ymsw
3116	case REG_TASK:
3117		return Ytask
3118
3119	case REG_CR + 0:
3120		return Ycr0
3121	case REG_CR + 1:
3122		return Ycr1
3123	case REG_CR + 2:
3124		return Ycr2
3125	case REG_CR + 3:
3126		return Ycr3
3127	case REG_CR + 4:
3128		return Ycr4
3129	case REG_CR + 5:
3130		return Ycr5
3131	case REG_CR + 6:
3132		return Ycr6
3133	case REG_CR + 7:
3134		return Ycr7
3135	case REG_CR + 8:
3136		return Ycr8
3137
3138	case REG_DR + 0:
3139		return Ydr0
3140	case REG_DR + 1:
3141		return Ydr1
3142	case REG_DR + 2:
3143		return Ydr2
3144	case REG_DR + 3:
3145		return Ydr3
3146	case REG_DR + 4:
3147		return Ydr4
3148	case REG_DR + 5:
3149		return Ydr5
3150	case REG_DR + 6:
3151		return Ydr6
3152	case REG_DR + 7:
3153		return Ydr7
3154
3155	case REG_TR + 0:
3156		return Ytr0
3157	case REG_TR + 1:
3158		return Ytr1
3159	case REG_TR + 2:
3160		return Ytr2
3161	case REG_TR + 3:
3162		return Ytr3
3163	case REG_TR + 4:
3164		return Ytr4
3165	case REG_TR + 5:
3166		return Ytr5
3167	case REG_TR + 6:
3168		return Ytr6
3169	case REG_TR + 7:
3170		return Ytr7
3171	}
3172
3173	return Yxxx
3174}
3175
// AsmBuf is a simple buffer to assemble variable-length x86 instructions into
// and hold assembly state.
//
// Bytes are written into the fixed-size buf array at index off. The
// fixed-width Put methods index buf directly without a capacity check, so
// running past the end of buf panics with an index-out-of-range error.
type AsmBuf struct {
	buf      [100]byte // backing storage for the bytes emitted so far
	off      int       // number of bytes written to buf; also the next write index
	rexflag  int       // REX-related bits OR-ed in while operands are encoded (see asmandsz)
	vexflag  bool      // Per inst: true for VEX-encoded
	evexflag bool      // Per inst: true for EVEX-encoded
	rep      bool      // NOTE(review): presumably a pending REP prefix; set outside this section, confirm
	repn     bool      // NOTE(review): presumably a pending REPN prefix; set outside this section, confirm
	lock     bool      // NOTE(review): presumably a pending LOCK prefix; set outside this section, confirm

	evex evexBits // Initialized when evexflag is true
}
3190
3191// Put1 appends one byte to the end of the buffer.
3192func (ab *AsmBuf) Put1(x byte) {
3193	ab.buf[ab.off] = x
3194	ab.off++
3195}
3196
3197// Put2 appends two bytes to the end of the buffer.
3198func (ab *AsmBuf) Put2(x, y byte) {
3199	ab.buf[ab.off+0] = x
3200	ab.buf[ab.off+1] = y
3201	ab.off += 2
3202}
3203
3204// Put3 appends three bytes to the end of the buffer.
3205func (ab *AsmBuf) Put3(x, y, z byte) {
3206	ab.buf[ab.off+0] = x
3207	ab.buf[ab.off+1] = y
3208	ab.buf[ab.off+2] = z
3209	ab.off += 3
3210}
3211
3212// Put4 appends four bytes to the end of the buffer.
3213func (ab *AsmBuf) Put4(x, y, z, w byte) {
3214	ab.buf[ab.off+0] = x
3215	ab.buf[ab.off+1] = y
3216	ab.buf[ab.off+2] = z
3217	ab.buf[ab.off+3] = w
3218	ab.off += 4
3219}
3220
3221// PutInt16 writes v into the buffer using little-endian encoding.
3222func (ab *AsmBuf) PutInt16(v int16) {
3223	ab.buf[ab.off+0] = byte(v)
3224	ab.buf[ab.off+1] = byte(v >> 8)
3225	ab.off += 2
3226}
3227
3228// PutInt32 writes v into the buffer using little-endian encoding.
3229func (ab *AsmBuf) PutInt32(v int32) {
3230	ab.buf[ab.off+0] = byte(v)
3231	ab.buf[ab.off+1] = byte(v >> 8)
3232	ab.buf[ab.off+2] = byte(v >> 16)
3233	ab.buf[ab.off+3] = byte(v >> 24)
3234	ab.off += 4
3235}
3236
3237// PutInt64 writes v into the buffer using little-endian encoding.
3238func (ab *AsmBuf) PutInt64(v int64) {
3239	ab.buf[ab.off+0] = byte(v)
3240	ab.buf[ab.off+1] = byte(v >> 8)
3241	ab.buf[ab.off+2] = byte(v >> 16)
3242	ab.buf[ab.off+3] = byte(v >> 24)
3243	ab.buf[ab.off+4] = byte(v >> 32)
3244	ab.buf[ab.off+5] = byte(v >> 40)
3245	ab.buf[ab.off+6] = byte(v >> 48)
3246	ab.buf[ab.off+7] = byte(v >> 56)
3247	ab.off += 8
3248}
3249
3250// Put copies b into the buffer.
3251func (ab *AsmBuf) Put(b []byte) {
3252	copy(ab.buf[ab.off:], b)
3253	ab.off += len(b)
3254}
3255
3256// PutOpBytesLit writes zero terminated sequence of bytes from op,
3257// starting at specified offset (e.g. z counter value).
3258// Trailing 0 is not written.
3259//
3260// Intended to be used for literal Z cases.
3261// Literal Z cases usually have "Zlit" in their name (Zlit, Zlitr_m, Zlitm_r).
3262func (ab *AsmBuf) PutOpBytesLit(offset int, op *opBytes) {
3263	for int(op[offset]) != 0 {
3264		ab.Put1(byte(op[offset]))
3265		offset++
3266	}
3267}
3268
3269// Insert inserts b at offset i.
3270func (ab *AsmBuf) Insert(i int, b byte) {
3271	ab.off++
3272	copy(ab.buf[i+1:ab.off], ab.buf[i:ab.off-1])
3273	ab.buf[i] = b
3274}
3275
3276// Last returns the byte at the end of the buffer.
3277func (ab *AsmBuf) Last() byte { return ab.buf[ab.off-1] }
3278
3279// Len returns the length of the buffer.
3280func (ab *AsmBuf) Len() int { return ab.off }
3281
3282// Bytes returns the contents of the buffer.
3283func (ab *AsmBuf) Bytes() []byte { return ab.buf[:ab.off] }
3284
3285// Reset empties the buffer.
3286func (ab *AsmBuf) Reset() { ab.off = 0 }
3287
3288// At returns the byte at offset i.
3289func (ab *AsmBuf) At(i int) byte { return ab.buf[i] }
3290
// asmidx emits SIB byte.
//
// The byte is assembled in i as scale<<6 | index<<3 | base, using the reg
// table to translate register constants into their encoding bits:
//   - index == REG_NONE stores 4 in the index field and skips the scale
//     switch entirely;
//   - base == REG_NONE stores 5 in the base field.
//
// An unsupported scale, index or base (including the R8-R15 and numbered
// 8-31 vector registers when assembling for 386) is reported through
// ctxt.Diag, and a zero byte is emitted in place of the SIB byte.
func (ab *AsmBuf) asmidx(ctxt *obj.Link, scale int, index int, base int) {
	var i int

	// X/Y index register is used in VSIB.
	switch index {
	default:
		goto bad

	case REG_NONE:
		// No index: the scale argument is ignored on this path.
		i = 4 << 3
		goto bas

	case REG_R8,
		REG_R9,
		REG_R10,
		REG_R11,
		REG_R12,
		REG_R13,
		REG_R14,
		REG_R15,
		REG_X8,
		REG_X9,
		REG_X10,
		REG_X11,
		REG_X12,
		REG_X13,
		REG_X14,
		REG_X15,
		REG_X16,
		REG_X17,
		REG_X18,
		REG_X19,
		REG_X20,
		REG_X21,
		REG_X22,
		REG_X23,
		REG_X24,
		REG_X25,
		REG_X26,
		REG_X27,
		REG_X28,
		REG_X29,
		REG_X30,
		REG_X31,
		REG_Y8,
		REG_Y9,
		REG_Y10,
		REG_Y11,
		REG_Y12,
		REG_Y13,
		REG_Y14,
		REG_Y15,
		REG_Y16,
		REG_Y17,
		REG_Y18,
		REG_Y19,
		REG_Y20,
		REG_Y21,
		REG_Y22,
		REG_Y23,
		REG_Y24,
		REG_Y25,
		REG_Y26,
		REG_Y27,
		REG_Y28,
		REG_Y29,
		REG_Y30,
		REG_Y31,
		REG_Z8,
		REG_Z9,
		REG_Z10,
		REG_Z11,
		REG_Z12,
		REG_Z13,
		REG_Z14,
		REG_Z15,
		REG_Z16,
		REG_Z17,
		REG_Z18,
		REG_Z19,
		REG_Z20,
		REG_Z21,
		REG_Z22,
		REG_Z23,
		REG_Z24,
		REG_Z25,
		REG_Z26,
		REG_Z27,
		REG_Z28,
		REG_Z29,
		REG_Z30,
		REG_Z31:
		// These index registers are rejected when targeting 386.
		if ctxt.Arch.Family == sys.I386 {
			goto bad
		}
		fallthrough

	case REG_AX,
		REG_CX,
		REG_DX,
		REG_BX,
		REG_BP,
		REG_SI,
		REG_DI,
		REG_X0,
		REG_X1,
		REG_X2,
		REG_X3,
		REG_X4,
		REG_X5,
		REG_X6,
		REG_X7,
		REG_Y0,
		REG_Y1,
		REG_Y2,
		REG_Y3,
		REG_Y4,
		REG_Y5,
		REG_Y6,
		REG_Y7,
		REG_Z0,
		REG_Z1,
		REG_Z2,
		REG_Z3,
		REG_Z4,
		REG_Z5,
		REG_Z6,
		REG_Z7:
		// Note that REG_SP is not in this list, so it is rejected as an index.
		i = reg[index] << 3
	}

	// Encode the scale factor in the top two bits; only 1, 2, 4 and 8 are accepted.
	switch scale {
	default:
		goto bad

	case 1:
		break

	case 2:
		i |= 1 << 6

	case 4:
		i |= 2 << 6

	case 8:
		i |= 3 << 6
	}

bas:
	// Encode the base register in the low three bits.
	switch base {
	default:
		goto bad

	case REG_NONE: // must be mod=00
		i |= 5

	case REG_R8,
		REG_R9,
		REG_R10,
		REG_R11,
		REG_R12,
		REG_R13,
		REG_R14,
		REG_R15:
		// These base registers are rejected when targeting 386.
		if ctxt.Arch.Family == sys.I386 {
			goto bad
		}
		fallthrough

	case REG_AX,
		REG_CX,
		REG_DX,
		REG_BX,
		REG_SP,
		REG_BP,
		REG_SI,
		REG_DI:
		i |= reg[base]
	}

	ab.Put1(byte(i))
	return

bad:
	// Report the problem but still emit one byte in the SIB position.
	ctxt.Diag("asmidx: bad address %d/%d/%d", scale, index, base)
	ab.Put1(0)
}
3479
3480func (ab *AsmBuf) relput4(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog, a *obj.Addr) {
3481	var rel obj.Reloc
3482
3483	v := vaddr(ctxt, p, a, &rel)
3484	if rel.Siz != 0 {
3485		if rel.Siz != 4 {
3486			ctxt.Diag("bad reloc")
3487		}
3488		r := obj.Addrel(cursym)
3489		*r = rel
3490		r.Off = int32(p.Pc + int64(ab.Len()))
3491	}
3492
3493	ab.PutInt32(int32(v))
3494}
3495
3496func vaddr(ctxt *obj.Link, p *obj.Prog, a *obj.Addr, r *obj.Reloc) int64 {
3497	if r != nil {
3498		*r = obj.Reloc{}
3499	}
3500
3501	switch a.Name {
3502	case obj.NAME_STATIC,
3503		obj.NAME_GOTREF,
3504		obj.NAME_EXTERN:
3505		s := a.Sym
3506		if r == nil {
3507			ctxt.Diag("need reloc for %v", obj.Dconv(p, a))
3508			log.Fatalf("reloc")
3509		}
3510
3511		if a.Name == obj.NAME_GOTREF {
3512			r.Siz = 4
3513			r.Type = objabi.R_GOTPCREL
3514		} else if useAbs(ctxt, s) {
3515			r.Siz = 4
3516			r.Type = objabi.R_ADDR
3517		} else {
3518			r.Siz = 4
3519			r.Type = objabi.R_PCREL
3520		}
3521
3522		r.Off = -1 // caller must fill in
3523		r.Sym = s
3524		r.Add = a.Offset
3525
3526		return 0
3527	}
3528
3529	if (a.Type == obj.TYPE_MEM || a.Type == obj.TYPE_ADDR) && a.Reg == REG_TLS {
3530		if r == nil {
3531			ctxt.Diag("need reloc for %v", obj.Dconv(p, a))
3532			log.Fatalf("reloc")
3533		}
3534
3535		if !ctxt.Flag_shared || isAndroid || ctxt.Headtype == objabi.Hdarwin {
3536			r.Type = objabi.R_TLS_LE
3537			r.Siz = 4
3538			r.Off = -1 // caller must fill in
3539			r.Add = a.Offset
3540		}
3541		return 0
3542	}
3543
3544	return a.Offset
3545}
3546
// asmandsz emits the ModR/M byte for operand a, followed by whatever the
// selected addressing form needs: a SIB byte (via asmidx), an 8-bit
// displacement, or a 32-bit displacement with an optional relocation
// recorded on cursym.
//
// r is placed in bits 5-3 of the ModR/M byte. rex is masked with 0x40|Rxr
// and OR-ed into ab.rexflag together with the bits contributed by the
// registers used in a. m64 is not referenced in the body.
//
// Operands that cannot be encoded are reported through ctxt.Diag; in most
// of those cases no bytes are emitted for the operand.
func (ab *AsmBuf) asmandsz(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog, a *obj.Addr, r int, rex int, m64 int) {
	var base int
	var rel obj.Reloc

	rex &= 0x40 | Rxr
	if a.Offset != int64(int32(a.Offset)) {
		// The rules are slightly different for 386 and AMD64,
		// mostly for historical reasons. We may unify them later,
		// but it must be discussed beforehand.
		//
		// For 64bit mode only LEAL is allowed to overflow.
		// It's how https://golang.org/cl/59630 made it.
		// crypto/sha1/sha1block_amd64.s depends on this feature.
		//
		// For 32bit mode rules are more permissive.
		// If offset fits uint32, it's permitted.
		// This is allowed for assembly that wants to use 32-bit hex
		// constants, e.g. LEAL 0x99999999(AX), AX.
		overflowOK := (ctxt.Arch.Family == sys.AMD64 && p.As == ALEAL) ||
			(ctxt.Arch.Family != sys.AMD64 &&
				int64(uint32(a.Offset)) == a.Offset &&
				ab.rexflag&Rxw == 0)
		if !overflowOK {
			ctxt.Diag("offset too large in %s", p)
		}
	}
	// The offset is truncated to 32 bits even after an overflow diagnostic.
	v := int32(a.Offset)
	rel.Siz = 0

	switch a.Type {
	case obj.TYPE_ADDR:
		// TYPE_ADDR operands are never encodable here; the extra
		// diagnostics only add detail before the generic one at bad.
		if a.Name == obj.NAME_NONE {
			ctxt.Diag("unexpected TYPE_ADDR with NAME_NONE")
		}
		if a.Index == REG_TLS {
			ctxt.Diag("unexpected TYPE_ADDR with index==REG_TLS")
		}
		goto bad

	case obj.TYPE_REG:
		// Register-direct operand: mod=3, no displacement allowed.
		const regFirst = REG_AL
		const regLast = REG_Z31
		if a.Reg < regFirst || regLast < a.Reg {
			goto bad
		}
		if v != 0 {
			goto bad
		}
		ab.Put1(byte(3<<6 | reg[a.Reg]<<0 | r<<3))
		ab.rexflag |= regrex[a.Reg]&(0x40|Rxb) | rex
		return
	}

	if a.Type != obj.TYPE_MEM {
		goto bad
	}

	// Memory operand with a real index register: every form below uses
	// rm=4 in the ModR/M byte and lets asmidx emit the SIB byte.
	if a.Index != REG_NONE && a.Index != REG_TLS {
		// Note: this base shadows the function-level variable of the same name.
		base := int(a.Reg)
		switch a.Name {
		case obj.NAME_EXTERN,
			obj.NAME_GOTREF,
			obj.NAME_STATIC:
			if !useAbs(ctxt, a.Sym) && ctxt.Arch.Family == sys.AMD64 {
				goto bad
			}
			if ctxt.Arch.Family == sys.I386 && ctxt.Flag_shared {
				// The base register has already been set. It holds the PC
				// of this instruction returned by a PC-reading thunk.
				// See obj6.go:rewriteToPcrel.
			} else {
				base = REG_NONE
			}
			v = int32(vaddr(ctxt, p, a, &rel))

		case obj.NAME_AUTO,
			obj.NAME_PARAM:
			base = REG_SP
		}

		ab.rexflag |= regrex[int(a.Index)]&Rxx | regrex[base]&Rxb | rex
		if base == REG_NONE {
			// No base register: mod=0 followed by a 32-bit displacement.
			ab.Put1(byte(0<<6 | 4<<0 | r<<3))
			ab.asmidx(ctxt, int(a.Scale), int(a.Index), base)
			goto putrelv
		}

		// No displacement and no relocation: mod=0. BP and R13 are
		// excluded here and always take one of the displacement forms below.
		if v == 0 && rel.Siz == 0 && base != REG_BP && base != REG_R13 {
			ab.Put1(byte(0<<6 | 4<<0 | r<<3))
			ab.asmidx(ctxt, int(a.Scale), int(a.Index), base)
			return
		}

		// Displacement accepted by toDisp8 and no relocation: mod=1.
		if disp8, ok := toDisp8(v, p, ab); ok && rel.Siz == 0 {
			ab.Put1(byte(1<<6 | 4<<0 | r<<3))
			ab.asmidx(ctxt, int(a.Scale), int(a.Index), base)
			ab.Put1(disp8)
			return
		}

		// Everything else: mod=2 with a 32-bit displacement.
		ab.Put1(byte(2<<6 | 4<<0 | r<<3))
		ab.asmidx(ctxt, int(a.Scale), int(a.Index), base)
		goto putrelv
	}

	// Memory operand without an index register (or with Index == REG_TLS).
	base = int(a.Reg)
	switch a.Name {
	case obj.NAME_STATIC,
		obj.NAME_GOTREF,
		obj.NAME_EXTERN:
		if a.Sym == nil {
			ctxt.Diag("bad addr: %v", p)
		}
		if ctxt.Arch.Family == sys.I386 && ctxt.Flag_shared {
			// The base register has already been set. It holds the PC
			// of this instruction returned by a PC-reading thunk.
			// See obj6.go:rewriteToPcrel.
		} else {
			base = REG_NONE
		}
		v = int32(vaddr(ctxt, p, a, &rel))

	case obj.NAME_AUTO,
		obj.NAME_PARAM:
		base = REG_SP
	}

	if base == REG_TLS {
		v = int32(vaddr(ctxt, p, a, &rel))
	}

	ab.rexflag |= regrex[base]&Rxb | rex
	// No usable base register (none, a segment register, or TLS):
	// the operand is encoded as a bare 32-bit displacement.
	if base == REG_NONE || (REG_CS <= base && base <= REG_GS) || base == REG_TLS {
		if (a.Sym == nil || !useAbs(ctxt, a.Sym)) && base == REG_NONE && (a.Name == obj.NAME_STATIC || a.Name == obj.NAME_EXTERN || a.Name == obj.NAME_GOTREF) || ctxt.Arch.Family != sys.AMD64 {
			if a.Name == obj.NAME_GOTREF && (a.Offset != 0 || a.Index != 0 || a.Scale != 0) {
				ctxt.Diag("%v has offset against gotref", p)
			}
			// mod=0, rm=5 followed by the 32-bit value.
			ab.Put1(byte(0<<6 | 5<<0 | r<<3))
			goto putrelv
		}

		// temporary
		ab.Put2(
			byte(0<<6|4<<0|r<<3), // sib present
			0<<6|4<<3|5<<0,       // DS:d32
		)
		goto putrelv
	}

	// SP and R12 as base always go through a SIB byte with no index.
	if base == REG_SP || base == REG_R12 {
		if v == 0 {
			ab.Put1(byte(0<<6 | reg[base]<<0 | r<<3))
			ab.asmidx(ctxt, int(a.Scale), REG_NONE, base)
			return
		}

		if disp8, ok := toDisp8(v, p, ab); ok {
			ab.Put1(byte(1<<6 | reg[base]<<0 | r<<3))
			ab.asmidx(ctxt, int(a.Scale), REG_NONE, base)
			ab.Put1(disp8)
			return
		}

		ab.Put1(byte(2<<6 | reg[base]<<0 | r<<3))
		ab.asmidx(ctxt, int(a.Scale), REG_NONE, base)
		goto putrelv
	}

	// Remaining general-purpose bases are encoded directly in the rm field.
	if REG_AX <= base && base <= REG_R15 {
		if a.Index == REG_TLS && !ctxt.Flag_shared && !isAndroid {
			// Move the offset into an R_TLS_LE relocation addend and
			// emit a zero displacement for it to patch.
			rel = obj.Reloc{}
			rel.Type = objabi.R_TLS_LE
			rel.Siz = 4
			rel.Sym = nil
			rel.Add = int64(v)
			v = 0
		}

		// Same mod selection as in the indexed case: BP and R13 never use mod=0.
		if v == 0 && rel.Siz == 0 && base != REG_BP && base != REG_R13 {
			ab.Put1(byte(0<<6 | reg[base]<<0 | r<<3))
			return
		}

		if disp8, ok := toDisp8(v, p, ab); ok && rel.Siz == 0 {
			ab.Put2(byte(1<<6|reg[base]<<0|r<<3), disp8)
			return
		}

		ab.Put1(byte(2<<6 | reg[base]<<0 | r<<3))
		goto putrelv
	}

	goto bad

putrelv:
	// Emit the 32-bit value, first recording the pending relocation (if
	// any) at the offset where those four bytes are about to be written.
	if rel.Siz != 0 {
		if rel.Siz != 4 {
			ctxt.Diag("bad rel")
			goto bad
		}

		r := obj.Addrel(cursym)
		*r = rel
		r.Off = int32(p.Pc + int64(ab.Len()))
	}

	ab.PutInt32(v)
	return

bad:
	ctxt.Diag("asmand: bad address %v", obj.Dconv(p, a))
}
3759
3760func (ab *AsmBuf) asmand(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog, a *obj.Addr, ra *obj.Addr) {
3761	ab.asmandsz(ctxt, cursym, p, a, reg[ra.Reg], regrex[ra.Reg], 0)
3762}
3763
3764func (ab *AsmBuf) asmando(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog, a *obj.Addr, o int) {
3765	ab.asmandsz(ctxt, cursym, p, a, o, 0, 0)
3766}
3767
3768func bytereg(a *obj.Addr, t *uint8) {
3769	if a.Type == obj.TYPE_REG && a.Index == REG_NONE && (REG_AX <= a.Reg && a.Reg <= REG_R15) {
3770		a.Reg += REG_AL - REG_AX
3771		*t = 0
3772	}
3773}
3774
3775func unbytereg(a *obj.Addr, t *uint8) {
3776	if a.Type == obj.TYPE_REG && a.Index == REG_NONE && (REG_AL <= a.Reg && a.Reg <= REG_R15B) {
3777		a.Reg += REG_AX - REG_AL
3778		*t = 0
3779	}
3780}
3781
// Encoding kinds used by ymovtab entries, numbered consecutively from zero.
const (
	movLit = uint8(iota) // Like Zlit
	movRegMem
	movMemReg
	movRegMem2op
	movMemReg2op
	movFullPtr // Load full pointer, trash heap (unsupported)
	movDoubleShift
	movTLSReg
)
3792
// ymovtab is a table of movtab entries covering: push/pop of segment
// registers; moves to and from segment, control, debug and test registers;
// descriptor-table, machine-status-word and task-register loads and stores;
// double shifts; and loading the TLS base.
//
// Entries are written positionally. As they appear here, each one holds an
// instruction, three operand classes (Ynone marks an unused slot), one of
// the mov* kind constants, and up to four opcode bytes.
//
// NOTE(review): the movtab field names are declared elsewhere. The all-zero
// final entry looks like an end-of-table sentinel; confirm against the code
// that scans this table.
var ymovtab = []movtab{
	// push
	{APUSHL, Ycs, Ynone, Ynone, movLit, [4]uint8{0x0e, 0}},
	{APUSHL, Yss, Ynone, Ynone, movLit, [4]uint8{0x16, 0}},
	{APUSHL, Yds, Ynone, Ynone, movLit, [4]uint8{0x1e, 0}},
	{APUSHL, Yes, Ynone, Ynone, movLit, [4]uint8{0x06, 0}},
	{APUSHL, Yfs, Ynone, Ynone, movLit, [4]uint8{0x0f, 0xa0, 0}},
	{APUSHL, Ygs, Ynone, Ynone, movLit, [4]uint8{0x0f, 0xa8, 0}},
	{APUSHQ, Yfs, Ynone, Ynone, movLit, [4]uint8{0x0f, 0xa0, 0}},
	{APUSHQ, Ygs, Ynone, Ynone, movLit, [4]uint8{0x0f, 0xa8, 0}},
	{APUSHW, Ycs, Ynone, Ynone, movLit, [4]uint8{Pe, 0x0e, 0}},
	{APUSHW, Yss, Ynone, Ynone, movLit, [4]uint8{Pe, 0x16, 0}},
	{APUSHW, Yds, Ynone, Ynone, movLit, [4]uint8{Pe, 0x1e, 0}},
	{APUSHW, Yes, Ynone, Ynone, movLit, [4]uint8{Pe, 0x06, 0}},
	{APUSHW, Yfs, Ynone, Ynone, movLit, [4]uint8{Pe, 0x0f, 0xa0, 0}},
	{APUSHW, Ygs, Ynone, Ynone, movLit, [4]uint8{Pe, 0x0f, 0xa8, 0}},

	// pop
	{APOPL, Ynone, Ynone, Yds, movLit, [4]uint8{0x1f, 0}},
	{APOPL, Ynone, Ynone, Yes, movLit, [4]uint8{0x07, 0}},
	{APOPL, Ynone, Ynone, Yss, movLit, [4]uint8{0x17, 0}},
	{APOPL, Ynone, Ynone, Yfs, movLit, [4]uint8{0x0f, 0xa1, 0}},
	{APOPL, Ynone, Ynone, Ygs, movLit, [4]uint8{0x0f, 0xa9, 0}},
	{APOPQ, Ynone, Ynone, Yfs, movLit, [4]uint8{0x0f, 0xa1, 0}},
	{APOPQ, Ynone, Ynone, Ygs, movLit, [4]uint8{0x0f, 0xa9, 0}},
	{APOPW, Ynone, Ynone, Yds, movLit, [4]uint8{Pe, 0x1f, 0}},
	{APOPW, Ynone, Ynone, Yes, movLit, [4]uint8{Pe, 0x07, 0}},
	{APOPW, Ynone, Ynone, Yss, movLit, [4]uint8{Pe, 0x17, 0}},
	{APOPW, Ynone, Ynone, Yfs, movLit, [4]uint8{Pe, 0x0f, 0xa1, 0}},
	{APOPW, Ynone, Ynone, Ygs, movLit, [4]uint8{Pe, 0x0f, 0xa9, 0}},

	// mov seg
	{AMOVW, Yes, Ynone, Yml, movRegMem, [4]uint8{0x8c, 0, 0, 0}},
	{AMOVW, Ycs, Ynone, Yml, movRegMem, [4]uint8{0x8c, 1, 0, 0}},
	{AMOVW, Yss, Ynone, Yml, movRegMem, [4]uint8{0x8c, 2, 0, 0}},
	{AMOVW, Yds, Ynone, Yml, movRegMem, [4]uint8{0x8c, 3, 0, 0}},
	{AMOVW, Yfs, Ynone, Yml, movRegMem, [4]uint8{0x8c, 4, 0, 0}},
	{AMOVW, Ygs, Ynone, Yml, movRegMem, [4]uint8{0x8c, 5, 0, 0}},
	{AMOVW, Yml, Ynone, Yes, movMemReg, [4]uint8{0x8e, 0, 0, 0}},
	{AMOVW, Yml, Ynone, Ycs, movMemReg, [4]uint8{0x8e, 1, 0, 0}},
	{AMOVW, Yml, Ynone, Yss, movMemReg, [4]uint8{0x8e, 2, 0, 0}},
	{AMOVW, Yml, Ynone, Yds, movMemReg, [4]uint8{0x8e, 3, 0, 0}},
	{AMOVW, Yml, Ynone, Yfs, movMemReg, [4]uint8{0x8e, 4, 0, 0}},
	{AMOVW, Yml, Ynone, Ygs, movMemReg, [4]uint8{0x8e, 5, 0, 0}},

	// mov cr
	{AMOVL, Ycr0, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 0, 0}},
	{AMOVL, Ycr2, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 2, 0}},
	{AMOVL, Ycr3, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 3, 0}},
	{AMOVL, Ycr4, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 4, 0}},
	{AMOVL, Ycr8, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 8, 0}},
	{AMOVQ, Ycr0, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 0, 0}},
	{AMOVQ, Ycr2, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 2, 0}},
	{AMOVQ, Ycr3, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 3, 0}},
	{AMOVQ, Ycr4, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 4, 0}},
	{AMOVQ, Ycr8, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 8, 0}},
	{AMOVL, Yrl, Ynone, Ycr0, movMemReg2op, [4]uint8{0x0f, 0x22, 0, 0}},
	{AMOVL, Yrl, Ynone, Ycr2, movMemReg2op, [4]uint8{0x0f, 0x22, 2, 0}},
	{AMOVL, Yrl, Ynone, Ycr3, movMemReg2op, [4]uint8{0x0f, 0x22, 3, 0}},
	{AMOVL, Yrl, Ynone, Ycr4, movMemReg2op, [4]uint8{0x0f, 0x22, 4, 0}},
	{AMOVL, Yrl, Ynone, Ycr8, movMemReg2op, [4]uint8{0x0f, 0x22, 8, 0}},
	{AMOVQ, Yrl, Ynone, Ycr0, movMemReg2op, [4]uint8{0x0f, 0x22, 0, 0}},
	{AMOVQ, Yrl, Ynone, Ycr2, movMemReg2op, [4]uint8{0x0f, 0x22, 2, 0}},
	{AMOVQ, Yrl, Ynone, Ycr3, movMemReg2op, [4]uint8{0x0f, 0x22, 3, 0}},
	{AMOVQ, Yrl, Ynone, Ycr4, movMemReg2op, [4]uint8{0x0f, 0x22, 4, 0}},
	{AMOVQ, Yrl, Ynone, Ycr8, movMemReg2op, [4]uint8{0x0f, 0x22, 8, 0}},

	// mov dr
	{AMOVL, Ydr0, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 0, 0}},
	{AMOVL, Ydr6, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 6, 0}},
	{AMOVL, Ydr7, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 7, 0}},
	{AMOVQ, Ydr0, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 0, 0}},
	{AMOVQ, Ydr2, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 2, 0}},
	{AMOVQ, Ydr3, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 3, 0}},
	{AMOVQ, Ydr6, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 6, 0}},
	{AMOVQ, Ydr7, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 7, 0}},
	{AMOVL, Yrl, Ynone, Ydr0, movMemReg2op, [4]uint8{0x0f, 0x23, 0, 0}},
	{AMOVL, Yrl, Ynone, Ydr6, movMemReg2op, [4]uint8{0x0f, 0x23, 6, 0}},
	{AMOVL, Yrl, Ynone, Ydr7, movMemReg2op, [4]uint8{0x0f, 0x23, 7, 0}},
	{AMOVQ, Yrl, Ynone, Ydr0, movMemReg2op, [4]uint8{0x0f, 0x23, 0, 0}},
	{AMOVQ, Yrl, Ynone, Ydr2, movMemReg2op, [4]uint8{0x0f, 0x23, 2, 0}},
	{AMOVQ, Yrl, Ynone, Ydr3, movMemReg2op, [4]uint8{0x0f, 0x23, 3, 0}},
	{AMOVQ, Yrl, Ynone, Ydr6, movMemReg2op, [4]uint8{0x0f, 0x23, 6, 0}},
	{AMOVQ, Yrl, Ynone, Ydr7, movMemReg2op, [4]uint8{0x0f, 0x23, 7, 0}},

	// mov tr
	{AMOVL, Ytr6, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x24, 6, 0}},
	{AMOVL, Ytr7, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x24, 7, 0}},
	{AMOVL, Yml, Ynone, Ytr6, movMemReg2op, [4]uint8{0x0f, 0x26, 6, 0xff}},
	{AMOVL, Yml, Ynone, Ytr7, movMemReg2op, [4]uint8{0x0f, 0x26, 7, 0xff}},

	// lgdt, sgdt, lidt, sidt
	{AMOVL, Ym, Ynone, Ygdtr, movMemReg2op, [4]uint8{0x0f, 0x01, 2, 0}},
	{AMOVL, Ygdtr, Ynone, Ym, movRegMem2op, [4]uint8{0x0f, 0x01, 0, 0}},
	{AMOVL, Ym, Ynone, Yidtr, movMemReg2op, [4]uint8{0x0f, 0x01, 3, 0}},
	{AMOVL, Yidtr, Ynone, Ym, movRegMem2op, [4]uint8{0x0f, 0x01, 1, 0}},
	{AMOVQ, Ym, Ynone, Ygdtr, movMemReg2op, [4]uint8{0x0f, 0x01, 2, 0}},
	{AMOVQ, Ygdtr, Ynone, Ym, movRegMem2op, [4]uint8{0x0f, 0x01, 0, 0}},
	{AMOVQ, Ym, Ynone, Yidtr, movMemReg2op, [4]uint8{0x0f, 0x01, 3, 0}},
	{AMOVQ, Yidtr, Ynone, Ym, movRegMem2op, [4]uint8{0x0f, 0x01, 1, 0}},

	// lldt, sldt
	{AMOVW, Yml, Ynone, Yldtr, movMemReg2op, [4]uint8{0x0f, 0x00, 2, 0}},
	{AMOVW, Yldtr, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x00, 0, 0}},

	// lmsw, smsw
	{AMOVW, Yml, Ynone, Ymsw, movMemReg2op, [4]uint8{0x0f, 0x01, 6, 0}},
	{AMOVW, Ymsw, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x01, 4, 0}},

	// ltr, str
	{AMOVW, Yml, Ynone, Ytask, movMemReg2op, [4]uint8{0x0f, 0x00, 3, 0}},
	{AMOVW, Ytask, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x00, 1, 0}},

	/* load full pointer - unsupported
	{AMOVL, Yml, Ycol, movFullPtr, [4]uint8{0, 0, 0, 0}},
	{AMOVW, Yml, Ycol, movFullPtr, [4]uint8{Pe, 0, 0, 0}},
	*/

	// double shift
	{ASHLL, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{0xa4, 0xa5, 0, 0}},
	{ASHLL, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{0xa4, 0xa5, 0, 0}},
	{ASHLL, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{0xa4, 0xa5, 0, 0}},
	{ASHRL, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{0xac, 0xad, 0, 0}},
	{ASHRL, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{0xac, 0xad, 0, 0}},
	{ASHRL, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{0xac, 0xad, 0, 0}},
	{ASHLQ, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xa4, 0xa5, 0}},
	{ASHLQ, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xa4, 0xa5, 0}},
	{ASHLQ, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xa4, 0xa5, 0}},
	{ASHRQ, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xac, 0xad, 0}},
	{ASHRQ, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xac, 0xad, 0}},
	{ASHRQ, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xac, 0xad, 0}},
	{ASHLW, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xa4, 0xa5, 0}},
	{ASHLW, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xa4, 0xa5, 0}},
	{ASHLW, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xa4, 0xa5, 0}},
	{ASHRW, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xac, 0xad, 0}},
	{ASHRW, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xac, 0xad, 0}},
	{ASHRW, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xac, 0xad, 0}},

	// load TLS base
	{AMOVL, Ytls, Ynone, Yrl, movTLSReg, [4]uint8{0, 0, 0, 0}},
	{AMOVQ, Ytls, Ynone, Yrl, movTLSReg, [4]uint8{0, 0, 0, 0}},
	{0, 0, 0, 0, 0, [4]uint8{}},
}
3936
3937func isax(a *obj.Addr) bool {
3938	switch a.Reg {
3939	case REG_AX, REG_AL, REG_AH:
3940		return true
3941	}
3942
3943	if a.Index == REG_AX {
3944		return true
3945	}
3946	return false
3947}
3948
3949func subreg(p *obj.Prog, from int, to int) {
3950	if false { /* debug['Q'] */
3951		fmt.Printf("\n%v\ts/%v/%v/\n", p, rconv(from), rconv(to))
3952	}
3953
3954	if int(p.From.Reg) == from {
3955		p.From.Reg = int16(to)
3956		p.Ft = 0
3957	}
3958
3959	if int(p.To.Reg) == from {
3960		p.To.Reg = int16(to)
3961		p.Tt = 0
3962	}
3963
3964	if int(p.From.Index) == from {
3965		p.From.Index = int16(to)
3966		p.Ft = 0
3967	}
3968
3969	if int(p.To.Index) == from {
3970		p.To.Index = int16(to)
3971		p.Tt = 0
3972	}
3973
3974	if false { /* debug['Q'] */
3975		fmt.Printf("%v\n", p)
3976	}
3977}
3978
3979func (ab *AsmBuf) mediaop(ctxt *obj.Link, o *Optab, op int, osize int, z int) int {
3980	switch op {
3981	case Pm, Pe, Pf2, Pf3:
3982		if osize != 1 {
3983			if op != Pm {
3984				ab.Put1(byte(op))
3985			}
3986			ab.Put1(Pm)
3987			z++
3988			op = int(o.op[z])
3989			break
3990		}
3991		fallthrough
3992
3993	default:
3994		if ab.Len() == 0 || ab.Last() != Pm {
3995			ab.Put1(Pm)
3996		}
3997	}
3998
3999	ab.Put1(byte(op))
4000	return z
4001}
4002
// Pre-encoded instruction sequences, listed one instruction per row.
var (
	// bpduff1 stores BP at -16(SP) and then points BP at that slot.
	bpduff1 = []byte{
		// MOVQ BP, -16(SP)
		0x48, 0x89, 0x6c, 0x24, 0xf0,
		// LEAQ -16(SP), BP
		0x48, 0x8d, 0x6c, 0x24, 0xf0,
	}

	// bpduff2 reloads BP from the location BP currently points to.
	bpduff2 = []byte{
		// MOVQ 0(BP), BP
		0x48, 0x8b, 0x6d, 0x00,
	}
)
4011
// asmevex emits EVEX prefix and opcode byte.
// In addition to the r/m, vvvv and reg operands that asmvex takes, it also
// accepts an optional K-masking register.
//
// rm, v, r and k may each be nil, in which case the corresponding fields
// keep their default values. The instruction suffix selected by p.Scond
// (zeroing, rounding, broadcast, SAE) is validated against the capabilities
// recorded in ab.evex; violations are reported through ctxt.Diag, but the
// prefix is still emitted.
//
// Expects asmbuf.evex to be properly initialized.
func (ab *AsmBuf) asmevex(ctxt *obj.Link, p *obj.Prog, rm, v, r, k *obj.Addr) {
	ab.evexflag = true
	evex := ab.evex

	// These four bits default to 1 and are cleared to select an extended
	// register, i.e. they are stored inverted.
	rexR := byte(1)
	evexR := byte(1)
	rexX := byte(1)
	rexB := byte(1)
	if r != nil {
		if regrex[r.Reg]&Rxr != 0 {
			rexR = 0 // "ModR/M.reg" selector 4th bit.
		}
		if regrex[r.Reg]&RxrEvex != 0 {
			evexR = 0 // "ModR/M.reg" selector 5th bit.
		}
	}
	if rm != nil {
		// Without an index register, the X bit is driven by the RxrEvex
		// flag of rm.Reg instead of by an index register.
		if rm.Index == REG_NONE && regrex[rm.Reg]&RxrEvex != 0 {
			rexX = 0
		} else if regrex[rm.Index]&Rxx != 0 {
			rexX = 0
		}
		if regrex[rm.Reg]&Rxb != 0 {
			rexB = 0
		}
	}
	// P0 = [R][X][B][R'][00][mm]
	p0 := (rexR << 7) |
		(rexX << 6) |
		(rexB << 5) |
		(evexR << 4) |
		(0 << 2) |
		(evex.M() << 0)

	vexV := byte(0)
	if v != nil {
		// 4bit-wide reg index.
		vexV = byte(reg[v.Reg]|(regrex[v.Reg]&Rxr)<<1) & 0xF
	}
	// The vvvv field is stored inverted.
	vexV ^= 0x0F
	// P1 = [W][vvvv][1][pp]
	p1 := (evex.W() << 7) |
		(vexV << 3) |
		(1 << 2) |
		(evex.P() << 0)

	suffix := evexSuffixMap[p.Scond]
	evexZ := byte(0)
	evexLL := evex.L()
	evexB := byte(0)
	evexV := byte(1)
	evexA := byte(0)
	if suffix.zeroing {
		if !evex.ZeroingEnabled() {
			ctxt.Diag("unsupported zeroing: %v", p)
		}
		evexZ = 1
	}
	// Rounding, broadcast and SAE all set the b bit; only the first
	// matching case is applied. Rounding also replaces the L'L field.
	switch {
	case suffix.rounding != rcUnset:
		if rm != nil && rm.Type == obj.TYPE_MEM {
			ctxt.Diag("illegal rounding with memory argument: %v", p)
		} else if !evex.RoundingEnabled() {
			ctxt.Diag("unsupported rounding: %v", p)
		}
		evexB = 1
		evexLL = suffix.rounding
	case suffix.broadcast:
		if rm == nil || rm.Type != obj.TYPE_MEM {
			ctxt.Diag("illegal broadcast without memory argument: %v", p)
		} else if !evex.BroadcastEnabled() {
			ctxt.Diag("unsupported broadcast: %v", p)
		}
		evexB = 1
	case suffix.sae:
		if rm != nil && rm.Type == obj.TYPE_MEM {
			ctxt.Diag("illegal SAE with memory argument: %v", p)
		} else if !evex.SaeEnabled() {
			ctxt.Diag("unsupported SAE: %v", p)
		}
		evexB = 1
	}
	// V' is cleared by an index register with the RxrEvex flag; otherwise
	// by a v register with that flag.
	if rm != nil && regrex[rm.Index]&RxrEvex != 0 {
		evexV = 0
	} else if v != nil && regrex[v.Reg]&RxrEvex != 0 {
		evexV = 0 // VSR selector 5th bit.
	}
	if k != nil {
		evexA = byte(reg[k.Reg])
	}
	// P2 = [z][L'L][b][V'][aaa]
	p2 := (evexZ << 7) |
		(evexLL << 5) |
		(evexB << 4) |
		(evexV << 3) |
		(evexA << 0)

	const evexEscapeByte = 0x62
	ab.Put4(evexEscapeByte, p0, p1, p2)
	ab.Put1(evex.opcode)
}
4118
4119// Emit VEX prefix and opcode byte.
4120// The three addresses are the r/m, vvvv, and reg fields.
4121// The reg and rm arguments appear in the same order as the
4122// arguments to asmand, which typically follows the call to asmvex.
4123// The final two arguments are the VEX prefix (see encoding above)
4124// and the opcode byte.
4125// For details about vex prefix see:
4126// https://en.wikipedia.org/wiki/VEX_prefix#Technical_description
4127func (ab *AsmBuf) asmvex(ctxt *obj.Link, rm, v, r *obj.Addr, vex, opcode uint8) {
4128	ab.vexflag = true
4129	rexR := 0
4130	if r != nil {
4131		rexR = regrex[r.Reg] & Rxr
4132	}
4133	rexB := 0
4134	rexX := 0
4135	if rm != nil {
4136		rexB = regrex[rm.Reg] & Rxb
4137		rexX = regrex[rm.Index] & Rxx
4138	}
4139	vexM := (vex >> 3) & 0x7
4140	vexWLP := vex & 0x87
4141	vexV := byte(0)
4142	if v != nil {
4143		vexV = byte(reg[v.Reg]|(regrex[v.Reg]&Rxr)<<1) & 0xF
4144	}
4145	vexV ^= 0xF
4146	if vexM == 1 && (rexX|rexB) == 0 && vex&vexW1 == 0 {
4147		// Can use 2-byte encoding.
4148		ab.Put2(0xc5, byte(rexR<<5)^0x80|vexV<<3|vexWLP)
4149	} else {
4150		// Must use 3-byte encoding.
4151		ab.Put3(0xc4,
4152			(byte(rexR|rexX|rexB)<<5)^0xE0|vexM,
4153			vexV<<3|vexWLP,
4154		)
4155	}
4156	ab.Put1(opcode)
4157}
4158
4159// regIndex returns register index that fits in 5 bits.
4160//
4161//	R         : 3 bit | legacy instructions     | N/A
4162//	[R/V]EX.R : 1 bit | REX / VEX extension bit | Rxr
4163//	EVEX.R    : 1 bit | EVEX extension bit      | RxrEvex
4164//
4165// Examples:
4166//	REG_Z30 => 30
4167//	REG_X15 => 15
4168//	REG_R9  => 9
4169//	REG_AX  => 0
4170//
4171func regIndex(r int16) int {
4172	lower3bits := reg[r]
4173	high4bit := regrex[r] & Rxr << 1
4174	high5bit := regrex[r] & RxrEvex << 0
4175	return lower3bits | high4bit | high5bit
4176}
4177
4178// avx2gatherValid reports whether p satisfies AVX2 gather constraints.
4179// Reports errors via ctxt.
4180func avx2gatherValid(ctxt *obj.Link, p *obj.Prog) bool {
4181	// If any pair of the index, mask, or destination registers
4182	// are the same, illegal instruction trap (#UD) is triggered.
4183	index := regIndex(p.GetFrom3().Index)
4184	mask := regIndex(p.From.Reg)
4185	dest := regIndex(p.To.Reg)
4186	if dest == mask || dest == index || mask == index {
4187		ctxt.Diag("mask, index, and destination registers should be distinct: %v", p)
4188		return false
4189	}
4190
4191	return true
4192}
4193
4194// avx512gatherValid reports whether p satisfies AVX512 gather constraints.
4195// Reports errors via ctxt.
4196func avx512gatherValid(ctxt *obj.Link, p *obj.Prog) bool {
4197	// Illegal instruction trap (#UD) is triggered if the destination vector
4198	// register is the same as index vector in VSIB.
4199	index := regIndex(p.From.Index)
4200	dest := regIndex(p.To.Reg)
4201	if dest == index {
4202		ctxt.Diag("index and destination registers should be distinct: %v", p)
4203		return false
4204	}
4205
4206	return true
4207}
4208
4209func (ab *AsmBuf) doasm(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog) {
4210	o := opindex[p.As&obj.AMask]
4211
4212	if o == nil {
4213		ctxt.Diag("asmins: missing op %v", p)
4214		return
4215	}
4216
4217	if pre := prefixof(ctxt, &p.From); pre != 0 {
4218		ab.Put1(byte(pre))
4219	}
4220	if pre := prefixof(ctxt, &p.To); pre != 0 {
4221		ab.Put1(byte(pre))
4222	}
4223
4224	// Checks to warn about instruction/arguments combinations that
4225	// will unconditionally trigger illegal instruction trap (#UD).
4226	switch p.As {
4227	case AVGATHERDPD,
4228		AVGATHERQPD,
4229		AVGATHERDPS,
4230		AVGATHERQPS,
4231		AVPGATHERDD,
4232		AVPGATHERQD,
4233		AVPGATHERDQ,
4234		AVPGATHERQQ:
4235		// AVX512 gather requires explicit K mask.
4236		if p.GetFrom3().Reg >= REG_K0 && p.GetFrom3().Reg <= REG_K7 {
4237			if !avx512gatherValid(ctxt, p) {
4238				return
4239			}
4240		} else {
4241			if !avx2gatherValid(ctxt, p) {
4242				return
4243			}
4244		}
4245	}
4246
4247	if p.Ft == 0 {
4248		p.Ft = uint8(oclass(ctxt, p, &p.From))
4249	}
4250	if p.Tt == 0 {
4251		p.Tt = uint8(oclass(ctxt, p, &p.To))
4252	}
4253
4254	ft := int(p.Ft) * Ymax
4255	var f3t int
4256	tt := int(p.Tt) * Ymax
4257
4258	xo := obj.Bool2int(o.op[0] == 0x0f)
4259	z := 0
4260	var a *obj.Addr
4261	var l int
4262	var op int
4263	var q *obj.Prog
4264	var r *obj.Reloc
4265	var rel obj.Reloc
4266	var v int64
4267
4268	args := make([]int, 0, argListMax)
4269	if ft != Ynone*Ymax {
4270		args = append(args, ft)
4271	}
4272	for i := range p.RestArgs {
4273		args = append(args, oclass(ctxt, p, &p.RestArgs[i].Addr)*Ymax)
4274	}
4275	if tt != Ynone*Ymax {
4276		args = append(args, tt)
4277	}
4278
4279	for _, yt := range o.ytab {
4280		// ytab matching is purely args-based,
4281		// but AVX512 suffixes like "Z" or "RU_SAE" will
4282		// add EVEX-only filter that will reject non-EVEX matches.
4283		//
4284		// Consider "VADDPD.BCST 2032(DX), X0, X0".
4285		// Without this rule, operands will lead to VEX-encoded form
4286		// and produce "c5b15813" encoding.
4287		if !yt.match(args) {
4288			// "xo" is always zero for VEX/EVEX encoded insts.
4289			z += int(yt.zoffset) + xo
4290		} else {
4291			if p.Scond != 0 && !evexZcase(yt.zcase) {
4292				// Do not signal error and continue to search
4293				// for matching EVEX-encoded form.
4294				z += int(yt.zoffset)
4295				continue
4296			}
4297
4298			switch o.prefix {
4299			case Px1: // first option valid only in 32-bit mode
4300				if ctxt.Arch.Family == sys.AMD64 && z == 0 {
4301					z += int(yt.zoffset) + xo
4302					continue
4303				}
4304			case Pq: // 16 bit escape and opcode escape
4305				ab.Put2(Pe, Pm)
4306
4307			case Pq3: // 16 bit escape and opcode escape + REX.W
4308				ab.rexflag |= Pw
4309				ab.Put2(Pe, Pm)
4310
4311			case Pq4: // 66 0F 38
4312				ab.Put3(0x66, 0x0F, 0x38)
4313
4314			case Pq4w: // 66 0F 38 + REX.W
4315				ab.rexflag |= Pw
4316				ab.Put3(0x66, 0x0F, 0x38)
4317
4318			case Pq5: // F3 0F 38
4319				ab.Put3(0xF3, 0x0F, 0x38)
4320
4321			case Pq5w: //  F3 0F 38 + REX.W
4322				ab.rexflag |= Pw
4323				ab.Put3(0xF3, 0x0F, 0x38)
4324
4325			case Pf2, // xmm opcode escape
4326				Pf3:
4327				ab.Put2(o.prefix, Pm)
4328
4329			case Pef3:
4330				ab.Put3(Pe, Pf3, Pm)
4331
4332			case Pfw: // xmm opcode escape + REX.W
4333				ab.rexflag |= Pw
4334				ab.Put2(Pf3, Pm)
4335
4336			case Pm: // opcode escape
4337				ab.Put1(Pm)
4338
4339			case Pe: // 16 bit escape
4340				ab.Put1(Pe)
4341
4342			case Pw: // 64-bit escape
4343				if ctxt.Arch.Family != sys.AMD64 {
4344					ctxt.Diag("asmins: illegal 64: %v", p)
4345				}
4346				ab.rexflag |= Pw
4347
4348			case Pw8: // 64-bit escape if z >= 8
4349				if z >= 8 {
4350					if ctxt.Arch.Family != sys.AMD64 {
4351						ctxt.Diag("asmins: illegal 64: %v", p)
4352					}
4353					ab.rexflag |= Pw
4354				}
4355
4356			case Pb: // botch
4357				if ctxt.Arch.Family != sys.AMD64 && (isbadbyte(&p.From) || isbadbyte(&p.To)) {
4358					goto bad
4359				}
4360				// NOTE(rsc): This is probably safe to do always,
4361				// but when enabled it chooses different encodings
4362				// than the old cmd/internal/obj/i386 code did,
4363				// which breaks our "same bits out" checks.
4364				// In particular, CMPB AX, $0 encodes as 80 f8 00
4365				// in the original obj/i386, and it would encode
4366				// (using a valid, shorter form) as 3c 00 if we enabled
4367				// the call to bytereg here.
4368				if ctxt.Arch.Family == sys.AMD64 {
4369					bytereg(&p.From, &p.Ft)
4370					bytereg(&p.To, &p.Tt)
4371				}
4372
4373			case P32: // 32 bit but illegal if 64-bit mode
4374				if ctxt.Arch.Family == sys.AMD64 {
4375					ctxt.Diag("asmins: illegal in 64-bit mode: %v", p)
4376				}
4377
4378			case Py: // 64-bit only, no prefix
4379				if ctxt.Arch.Family != sys.AMD64 {
4380					ctxt.Diag("asmins: illegal in %d-bit mode: %v", ctxt.Arch.RegSize*8, p)
4381				}
4382
4383			case Py1: // 64-bit only if z < 1, no prefix
4384				if z < 1 && ctxt.Arch.Family != sys.AMD64 {
4385					ctxt.Diag("asmins: illegal in %d-bit mode: %v", ctxt.Arch.RegSize*8, p)
4386				}
4387
4388			case Py3: // 64-bit only if z < 3, no prefix
4389				if z < 3 && ctxt.Arch.Family != sys.AMD64 {
4390					ctxt.Diag("asmins: illegal in %d-bit mode: %v", ctxt.Arch.RegSize*8, p)
4391				}
4392			}
4393
4394			if z >= len(o.op) {
4395				log.Fatalf("asmins bad table %v", p)
4396			}
4397			op = int(o.op[z])
4398			if op == 0x0f {
4399				ab.Put1(byte(op))
4400				z++
4401				op = int(o.op[z])
4402			}
4403
4404			switch yt.zcase {
4405			default:
4406				ctxt.Diag("asmins: unknown z %d %v", yt.zcase, p)
4407				return
4408
4409			case Zpseudo:
4410				break
4411
4412			case Zlit:
4413				ab.PutOpBytesLit(z, &o.op)
4414
4415			case Zlitr_m:
4416				ab.PutOpBytesLit(z, &o.op)
4417				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4418
4419			case Zlitm_r:
4420				ab.PutOpBytesLit(z, &o.op)
4421				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4422
4423			case Zlit_m_r:
4424				ab.PutOpBytesLit(z, &o.op)
4425				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
4426
4427			case Zmb_r:
4428				bytereg(&p.From, &p.Ft)
4429				fallthrough
4430
4431			case Zm_r:
4432				ab.Put1(byte(op))
4433				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4434
4435			case Z_m_r:
4436				ab.Put1(byte(op))
4437				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
4438
4439			case Zm2_r:
4440				ab.Put2(byte(op), o.op[z+1])
4441				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4442
4443			case Zm_r_xm:
4444				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
4445				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4446
4447			case Zm_r_xm_nr:
4448				ab.rexflag = 0
4449				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
4450				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4451
4452			case Zm_r_i_xm:
4453				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
4454				ab.asmand(ctxt, cursym, p, &p.From, p.GetFrom3())
4455				ab.Put1(byte(p.To.Offset))
4456
4457			case Zibm_r, Zibr_m:
4458				ab.PutOpBytesLit(z, &o.op)
4459				if yt.zcase == Zibr_m {
4460					ab.asmand(ctxt, cursym, p, &p.To, p.GetFrom3())
4461				} else {
4462					ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
4463				}
4464				switch {
4465				default:
4466					ab.Put1(byte(p.From.Offset))
4467				case yt.args[0] == Yi32 && o.prefix == Pe:
4468					ab.PutInt16(int16(p.From.Offset))
4469				case yt.args[0] == Yi32:
4470					ab.PutInt32(int32(p.From.Offset))
4471				}
4472
4473			case Zaut_r:
4474				ab.Put1(0x8d) // leal
4475				if p.From.Type != obj.TYPE_ADDR {
4476					ctxt.Diag("asmins: Zaut sb type ADDR")
4477				}
4478				p.From.Type = obj.TYPE_MEM
4479				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4480				p.From.Type = obj.TYPE_ADDR
4481
4482			case Zm_o:
4483				ab.Put1(byte(op))
4484				ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+1]))
4485
4486			case Zr_m:
4487				ab.Put1(byte(op))
4488				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4489
4490			case Zvex:
4491				ab.asmvex(ctxt, &p.From, p.GetFrom3(), &p.To, o.op[z], o.op[z+1])
4492
4493			case Zvex_rm_v_r:
4494				ab.asmvex(ctxt, &p.From, p.GetFrom3(), &p.To, o.op[z], o.op[z+1])
4495				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4496
4497			case Zvex_rm_v_ro:
4498				ab.asmvex(ctxt, &p.From, p.GetFrom3(), &p.To, o.op[z], o.op[z+1])
4499				ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+2]))
4500
4501			case Zvex_i_rm_vo:
4502				ab.asmvex(ctxt, p.GetFrom3(), &p.To, nil, o.op[z], o.op[z+1])
4503				ab.asmando(ctxt, cursym, p, p.GetFrom3(), int(o.op[z+2]))
4504				ab.Put1(byte(p.From.Offset))
4505
4506			case Zvex_i_r_v:
4507				ab.asmvex(ctxt, p.GetFrom3(), &p.To, nil, o.op[z], o.op[z+1])
4508				regnum := byte(0x7)
4509				if p.GetFrom3().Reg >= REG_X0 && p.GetFrom3().Reg <= REG_X15 {
4510					regnum &= byte(p.GetFrom3().Reg - REG_X0)
4511				} else {
4512					regnum &= byte(p.GetFrom3().Reg - REG_Y0)
4513				}
4514				ab.Put1(o.op[z+2] | regnum)
4515				ab.Put1(byte(p.From.Offset))
4516
4517			case Zvex_i_rm_v_r:
4518				imm, from, from3, to := unpackOps4(p)
4519				ab.asmvex(ctxt, from, from3, to, o.op[z], o.op[z+1])
4520				ab.asmand(ctxt, cursym, p, from, to)
4521				ab.Put1(byte(imm.Offset))
4522
4523			case Zvex_i_rm_r:
4524				ab.asmvex(ctxt, p.GetFrom3(), nil, &p.To, o.op[z], o.op[z+1])
4525				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
4526				ab.Put1(byte(p.From.Offset))
4527
4528			case Zvex_v_rm_r:
4529				ab.asmvex(ctxt, p.GetFrom3(), &p.From, &p.To, o.op[z], o.op[z+1])
4530				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
4531
4532			case Zvex_r_v_rm:
4533				ab.asmvex(ctxt, &p.To, p.GetFrom3(), &p.From, o.op[z], o.op[z+1])
4534				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4535
4536			case Zvex_rm_r_vo:
4537				ab.asmvex(ctxt, &p.From, &p.To, p.GetFrom3(), o.op[z], o.op[z+1])
4538				ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+2]))
4539
4540			case Zvex_i_r_rm:
4541				ab.asmvex(ctxt, &p.To, nil, p.GetFrom3(), o.op[z], o.op[z+1])
4542				ab.asmand(ctxt, cursym, p, &p.To, p.GetFrom3())
4543				ab.Put1(byte(p.From.Offset))
4544
4545			case Zvex_hr_rm_v_r:
4546				hr, from, from3, to := unpackOps4(p)
4547				ab.asmvex(ctxt, from, from3, to, o.op[z], o.op[z+1])
4548				ab.asmand(ctxt, cursym, p, from, to)
4549				ab.Put1(byte(regIndex(hr.Reg) << 4))
4550
4551			case Zevex_k_rmo:
4552				ab.evex = newEVEXBits(z, &o.op)
4553				ab.asmevex(ctxt, p, &p.To, nil, nil, &p.From)
4554				ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+3]))
4555
4556			case Zevex_i_rm_vo:
4557				ab.evex = newEVEXBits(z, &o.op)
4558				ab.asmevex(ctxt, p, p.GetFrom3(), &p.To, nil, nil)
4559				ab.asmando(ctxt, cursym, p, p.GetFrom3(), int(o.op[z+3]))
4560				ab.Put1(byte(p.From.Offset))
4561
4562			case Zevex_i_rm_k_vo:
4563				imm, from, kmask, to := unpackOps4(p)
4564				ab.evex = newEVEXBits(z, &o.op)
4565				ab.asmevex(ctxt, p, from, to, nil, kmask)
4566				ab.asmando(ctxt, cursym, p, from, int(o.op[z+3]))
4567				ab.Put1(byte(imm.Offset))
4568
4569			case Zevex_i_r_rm:
4570				ab.evex = newEVEXBits(z, &o.op)
4571				ab.asmevex(ctxt, p, &p.To, nil, p.GetFrom3(), nil)
4572				ab.asmand(ctxt, cursym, p, &p.To, p.GetFrom3())
4573				ab.Put1(byte(p.From.Offset))
4574
4575			case Zevex_i_r_k_rm:
4576				imm, from, kmask, to := unpackOps4(p)
4577				ab.evex = newEVEXBits(z, &o.op)
4578				ab.asmevex(ctxt, p, to, nil, from, kmask)
4579				ab.asmand(ctxt, cursym, p, to, from)
4580				ab.Put1(byte(imm.Offset))
4581
4582			case Zevex_i_rm_r:
4583				ab.evex = newEVEXBits(z, &o.op)
4584				ab.asmevex(ctxt, p, p.GetFrom3(), nil, &p.To, nil)
4585				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
4586				ab.Put1(byte(p.From.Offset))
4587
4588			case Zevex_i_rm_k_r:
4589				imm, from, kmask, to := unpackOps4(p)
4590				ab.evex = newEVEXBits(z, &o.op)
4591				ab.asmevex(ctxt, p, from, nil, to, kmask)
4592				ab.asmand(ctxt, cursym, p, from, to)
4593				ab.Put1(byte(imm.Offset))
4594
4595			case Zevex_i_rm_v_r:
4596				imm, from, from3, to := unpackOps4(p)
4597				ab.evex = newEVEXBits(z, &o.op)
4598				ab.asmevex(ctxt, p, from, from3, to, nil)
4599				ab.asmand(ctxt, cursym, p, from, to)
4600				ab.Put1(byte(imm.Offset))
4601
4602			case Zevex_i_rm_v_k_r:
4603				imm, from, from3, kmask, to := unpackOps5(p)
4604				ab.evex = newEVEXBits(z, &o.op)
4605				ab.asmevex(ctxt, p, from, from3, to, kmask)
4606				ab.asmand(ctxt, cursym, p, from, to)
4607				ab.Put1(byte(imm.Offset))
4608
4609			case Zevex_r_v_rm:
4610				ab.evex = newEVEXBits(z, &o.op)
4611				ab.asmevex(ctxt, p, &p.To, p.GetFrom3(), &p.From, nil)
4612				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4613
4614			case Zevex_rm_v_r:
4615				ab.evex = newEVEXBits(z, &o.op)
4616				ab.asmevex(ctxt, p, &p.From, p.GetFrom3(), &p.To, nil)
4617				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4618
4619			case Zevex_rm_k_r:
4620				ab.evex = newEVEXBits(z, &o.op)
4621				ab.asmevex(ctxt, p, &p.From, nil, &p.To, p.GetFrom3())
4622				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4623
4624			case Zevex_r_k_rm:
4625				ab.evex = newEVEXBits(z, &o.op)
4626				ab.asmevex(ctxt, p, &p.To, nil, &p.From, p.GetFrom3())
4627				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4628
4629			case Zevex_rm_v_k_r:
4630				from, from3, kmask, to := unpackOps4(p)
4631				ab.evex = newEVEXBits(z, &o.op)
4632				ab.asmevex(ctxt, p, from, from3, to, kmask)
4633				ab.asmand(ctxt, cursym, p, from, to)
4634
4635			case Zevex_r_v_k_rm:
4636				from, from3, kmask, to := unpackOps4(p)
4637				ab.evex = newEVEXBits(z, &o.op)
4638				ab.asmevex(ctxt, p, to, from3, from, kmask)
4639				ab.asmand(ctxt, cursym, p, to, from)
4640
4641			case Zr_m_xm:
4642				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
4643				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4644
4645			case Zr_m_xm_nr:
4646				ab.rexflag = 0
4647				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
4648				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4649
4650			case Zo_m:
4651				ab.Put1(byte(op))
4652				ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+1]))
4653
4654			case Zcallindreg:
4655				r = obj.Addrel(cursym)
4656				r.Off = int32(p.Pc)
4657				r.Type = objabi.R_CALLIND
4658				r.Siz = 0
4659				fallthrough
4660
4661			case Zo_m64:
4662				ab.Put1(byte(op))
4663				ab.asmandsz(ctxt, cursym, p, &p.To, int(o.op[z+1]), 0, 1)
4664
4665			case Zm_ibo:
4666				ab.Put1(byte(op))
4667				ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+1]))
4668				ab.Put1(byte(vaddr(ctxt, p, &p.To, nil)))
4669
4670			case Zibo_m:
4671				ab.Put1(byte(op))
4672				ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+1]))
4673				ab.Put1(byte(vaddr(ctxt, p, &p.From, nil)))
4674
4675			case Zibo_m_xm:
4676				z = ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
4677				ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+1]))
4678				ab.Put1(byte(vaddr(ctxt, p, &p.From, nil)))
4679
4680			case Z_ib, Zib_:
4681				if yt.zcase == Zib_ {
4682					a = &p.From
4683				} else {
4684					a = &p.To
4685				}
4686				ab.Put1(byte(op))
4687				if p.As == AXABORT {
4688					ab.Put1(o.op[z+1])
4689				}
4690				ab.Put1(byte(vaddr(ctxt, p, a, nil)))
4691
4692			case Zib_rp:
4693				ab.rexflag |= regrex[p.To.Reg] & (Rxb | 0x40)
4694				ab.Put2(byte(op+reg[p.To.Reg]), byte(vaddr(ctxt, p, &p.From, nil)))
4695
4696			case Zil_rp:
4697				ab.rexflag |= regrex[p.To.Reg] & Rxb
4698				ab.Put1(byte(op + reg[p.To.Reg]))
4699				if o.prefix == Pe {
4700					v = vaddr(ctxt, p, &p.From, nil)
4701					ab.PutInt16(int16(v))
4702				} else {
4703					ab.relput4(ctxt, cursym, p, &p.From)
4704				}
4705
4706			case Zo_iw:
4707				ab.Put1(byte(op))
4708				if p.From.Type != obj.TYPE_NONE {
4709					v = vaddr(ctxt, p, &p.From, nil)
4710					ab.PutInt16(int16(v))
4711				}
4712
4713			case Ziq_rp:
4714				v = vaddr(ctxt, p, &p.From, &rel)
4715				l = int(v >> 32)
4716				if l == 0 && rel.Siz != 8 {
4717					ab.rexflag &^= (0x40 | Rxw)
4718
4719					ab.rexflag |= regrex[p.To.Reg] & Rxb
4720					ab.Put1(byte(0xb8 + reg[p.To.Reg]))
4721					if rel.Type != 0 {
4722						r = obj.Addrel(cursym)
4723						*r = rel
4724						r.Off = int32(p.Pc + int64(ab.Len()))
4725					}
4726
4727					ab.PutInt32(int32(v))
4728				} else if l == -1 && uint64(v)&(uint64(1)<<31) != 0 { // sign extend
4729					ab.Put1(0xc7)
4730					ab.asmando(ctxt, cursym, p, &p.To, 0)
4731
4732					ab.PutInt32(int32(v)) // need all 8
4733				} else {
4734					ab.rexflag |= regrex[p.To.Reg] & Rxb
4735					ab.Put1(byte(op + reg[p.To.Reg]))
4736					if rel.Type != 0 {
4737						r = obj.Addrel(cursym)
4738						*r = rel
4739						r.Off = int32(p.Pc + int64(ab.Len()))
4740					}
4741
4742					ab.PutInt64(v)
4743				}
4744
4745			case Zib_rr:
4746				ab.Put1(byte(op))
4747				ab.asmand(ctxt, cursym, p, &p.To, &p.To)
4748				ab.Put1(byte(vaddr(ctxt, p, &p.From, nil)))
4749
4750			case Z_il, Zil_:
4751				if yt.zcase == Zil_ {
4752					a = &p.From
4753				} else {
4754					a = &p.To
4755				}
4756				ab.Put1(byte(op))
4757				if o.prefix == Pe {
4758					v = vaddr(ctxt, p, a, nil)
4759					ab.PutInt16(int16(v))
4760				} else {
4761					ab.relput4(ctxt, cursym, p, a)
4762				}
4763
4764			case Zm_ilo, Zilo_m:
4765				ab.Put1(byte(op))
4766				if yt.zcase == Zilo_m {
4767					a = &p.From
4768					ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+1]))
4769				} else {
4770					a = &p.To
4771					ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+1]))
4772				}
4773
4774				if o.prefix == Pe {
4775					v = vaddr(ctxt, p, a, nil)
4776					ab.PutInt16(int16(v))
4777				} else {
4778					ab.relput4(ctxt, cursym, p, a)
4779				}
4780
4781			case Zil_rr:
4782				ab.Put1(byte(op))
4783				ab.asmand(ctxt, cursym, p, &p.To, &p.To)
4784				if o.prefix == Pe {
4785					v = vaddr(ctxt, p, &p.From, nil)
4786					ab.PutInt16(int16(v))
4787				} else {
4788					ab.relput4(ctxt, cursym, p, &p.From)
4789				}
4790
4791			case Z_rp:
4792				ab.rexflag |= regrex[p.To.Reg] & (Rxb | 0x40)
4793				ab.Put1(byte(op + reg[p.To.Reg]))
4794
4795			case Zrp_:
4796				ab.rexflag |= regrex[p.From.Reg] & (Rxb | 0x40)
4797				ab.Put1(byte(op + reg[p.From.Reg]))
4798
4799			case Zcallcon, Zjmpcon:
4800				if yt.zcase == Zcallcon {
4801					ab.Put1(byte(op))
4802				} else {
4803					ab.Put1(o.op[z+1])
4804				}
4805				r = obj.Addrel(cursym)
4806				r.Off = int32(p.Pc + int64(ab.Len()))
4807				r.Type = objabi.R_PCREL
4808				r.Siz = 4
4809				r.Add = p.To.Offset
4810				ab.PutInt32(0)
4811
4812			case Zcallind:
4813				ab.Put2(byte(op), o.op[z+1])
4814				r = obj.Addrel(cursym)
4815				r.Off = int32(p.Pc + int64(ab.Len()))
4816				if ctxt.Arch.Family == sys.AMD64 {
4817					r.Type = objabi.R_PCREL
4818				} else {
4819					r.Type = objabi.R_ADDR
4820				}
4821				r.Siz = 4
4822				r.Add = p.To.Offset
4823				r.Sym = p.To.Sym
4824				ab.PutInt32(0)
4825
4826			case Zcall, Zcallduff:
4827				if p.To.Sym == nil {
4828					ctxt.Diag("call without target")
4829					ctxt.DiagFlush()
4830					log.Fatalf("bad code")
4831				}
4832
4833				if yt.zcase == Zcallduff && ctxt.Flag_dynlink {
4834					ctxt.Diag("directly calling duff when dynamically linking Go")
4835				}
4836
4837				if yt.zcase == Zcallduff && ctxt.Arch.Family == sys.AMD64 {
4838					// Maintain BP around call, since duffcopy/duffzero can't do it
4839					// (the call jumps into the middle of the function).
4840					// This makes it possible to see call sites for duffcopy/duffzero in
4841					// BP-based profiling tools like Linux perf (which is the
4842					// whole point of maintaining frame pointers in Go).
4843					// MOVQ BP, -16(SP)
4844					// LEAQ -16(SP), BP
4845					ab.Put(bpduff1)
4846				}
4847				ab.Put1(byte(op))
4848				r = obj.Addrel(cursym)
4849				r.Off = int32(p.Pc + int64(ab.Len()))
4850				r.Sym = p.To.Sym
4851				r.Add = p.To.Offset
4852				r.Type = objabi.R_CALL
4853				r.Siz = 4
4854				ab.PutInt32(0)
4855
4856				if yt.zcase == Zcallduff && ctxt.Arch.Family == sys.AMD64 {
4857					// Pop BP pushed above.
4858					// MOVQ 0(BP), BP
4859					ab.Put(bpduff2)
4860				}
4861
4862			// TODO: jump across functions needs reloc
4863			case Zbr, Zjmp, Zloop:
4864				if p.As == AXBEGIN {
4865					ab.Put1(byte(op))
4866				}
4867				if p.To.Sym != nil {
4868					if yt.zcase != Zjmp {
4869						ctxt.Diag("branch to ATEXT")
4870						ctxt.DiagFlush()
4871						log.Fatalf("bad code")
4872					}
4873
4874					ab.Put1(o.op[z+1])
4875					r = obj.Addrel(cursym)
4876					r.Off = int32(p.Pc + int64(ab.Len()))
4877					r.Sym = p.To.Sym
4878					// Note: R_CALL instead of R_PCREL. R_CALL is more permissive in that
4879					// it can point to a trampoline instead of the destination itself.
4880					r.Type = objabi.R_CALL
4881					r.Siz = 4
4882					ab.PutInt32(0)
4883					break
4884				}
4885
4886				// Assumes q is in this function.
4887				// TODO: Check in input, preserve in brchain.
4888
4889				// Fill in backward jump now.
4890				q = p.To.Target()
4891
4892				if q == nil {
4893					ctxt.Diag("jmp/branch/loop without target")
4894					ctxt.DiagFlush()
4895					log.Fatalf("bad code")
4896				}
4897
4898				if p.Back&branchBackwards != 0 {
4899					v = q.Pc - (p.Pc + 2)
4900					if v >= -128 && p.As != AXBEGIN {
4901						if p.As == AJCXZL {
4902							ab.Put1(0x67)
4903						}
4904						ab.Put2(byte(op), byte(v))
4905					} else if yt.zcase == Zloop {
4906						ctxt.Diag("loop too far: %v", p)
4907					} else {
4908						v -= 5 - 2
4909						if p.As == AXBEGIN {
4910							v--
4911						}
4912						if yt.zcase == Zbr {
4913							ab.Put1(0x0f)
4914							v--
4915						}
4916
4917						ab.Put1(o.op[z+1])
4918						ab.PutInt32(int32(v))
4919					}
4920
4921					break
4922				}
4923
4924				// Annotate target; will fill in later.
4925				p.Forwd = q.Rel
4926
4927				q.Rel = p
4928				if p.Back&branchShort != 0 && p.As != AXBEGIN {
4929					if p.As == AJCXZL {
4930						ab.Put1(0x67)
4931					}
4932					ab.Put2(byte(op), 0)
4933				} else if yt.zcase == Zloop {
4934					ctxt.Diag("loop too far: %v", p)
4935				} else {
4936					if yt.zcase == Zbr {
4937						ab.Put1(0x0f)
4938					}
4939					ab.Put1(o.op[z+1])
4940					ab.PutInt32(0)
4941				}
4942
4943			case Zbyte:
4944				v = vaddr(ctxt, p, &p.From, &rel)
4945				if rel.Siz != 0 {
4946					rel.Siz = uint8(op)
4947					r = obj.Addrel(cursym)
4948					*r = rel
4949					r.Off = int32(p.Pc + int64(ab.Len()))
4950				}
4951
4952				ab.Put1(byte(v))
4953				if op > 1 {
4954					ab.Put1(byte(v >> 8))
4955					if op > 2 {
4956						ab.PutInt16(int16(v >> 16))
4957						if op > 4 {
4958							ab.PutInt32(int32(v >> 32))
4959						}
4960					}
4961				}
4962			}
4963
4964			return
4965		}
4966	}
4967	f3t = Ynone * Ymax
4968	if p.GetFrom3() != nil {
4969		f3t = oclass(ctxt, p, p.GetFrom3()) * Ymax
4970	}
4971	for mo := ymovtab; mo[0].as != 0; mo = mo[1:] {
4972		var pp obj.Prog
4973		var t []byte
4974		if p.As == mo[0].as {
4975			if ycover[ft+int(mo[0].ft)] != 0 && ycover[f3t+int(mo[0].f3t)] != 0 && ycover[tt+int(mo[0].tt)] != 0 {
4976				t = mo[0].op[:]
4977				switch mo[0].code {
4978				default:
4979					ctxt.Diag("asmins: unknown mov %d %v", mo[0].code, p)
4980
4981				case movLit:
4982					for z = 0; t[z] != 0; z++ {
4983						ab.Put1(t[z])
4984					}
4985
4986				case movRegMem:
4987					ab.Put1(t[0])
4988					ab.asmando(ctxt, cursym, p, &p.To, int(t[1]))
4989
4990				case movMemReg:
4991					ab.Put1(t[0])
4992					ab.asmando(ctxt, cursym, p, &p.From, int(t[1]))
4993
4994				case movRegMem2op: // r,m - 2op
4995					ab.Put2(t[0], t[1])
4996					ab.asmando(ctxt, cursym, p, &p.To, int(t[2]))
4997					ab.rexflag |= regrex[p.From.Reg] & (Rxr | 0x40)
4998
4999				case movMemReg2op:
5000					ab.Put2(t[0], t[1])
5001					ab.asmando(ctxt, cursym, p, &p.From, int(t[2]))
5002					ab.rexflag |= regrex[p.To.Reg] & (Rxr | 0x40)
5003
5004				case movFullPtr:
5005					if t[0] != 0 {
5006						ab.Put1(t[0])
5007					}
5008					switch p.To.Index {
5009					default:
5010						goto bad
5011
5012					case REG_DS:
5013						ab.Put1(0xc5)
5014
5015					case REG_SS:
5016						ab.Put2(0x0f, 0xb2)
5017
5018					case REG_ES:
5019						ab.Put1(0xc4)
5020
5021					case REG_FS:
5022						ab.Put2(0x0f, 0xb4)
5023
5024					case REG_GS:
5025						ab.Put2(0x0f, 0xb5)
5026					}
5027
5028					ab.asmand(ctxt, cursym, p, &p.From, &p.To)
5029
5030				case movDoubleShift:
5031					if t[0] == Pw {
5032						if ctxt.Arch.Family != sys.AMD64 {
5033							ctxt.Diag("asmins: illegal 64: %v", p)
5034						}
5035						ab.rexflag |= Pw
5036						t = t[1:]
5037					} else if t[0] == Pe {
5038						ab.Put1(Pe)
5039						t = t[1:]
5040					}
5041
5042					switch p.From.Type {
5043					default:
5044						goto bad
5045
5046					case obj.TYPE_CONST:
5047						ab.Put2(0x0f, t[0])
5048						ab.asmandsz(ctxt, cursym, p, &p.To, reg[p.GetFrom3().Reg], regrex[p.GetFrom3().Reg], 0)
5049						ab.Put1(byte(p.From.Offset))
5050
5051					case obj.TYPE_REG:
5052						switch p.From.Reg {
5053						default:
5054							goto bad
5055
5056						case REG_CL, REG_CX:
5057							ab.Put2(0x0f, t[1])
5058							ab.asmandsz(ctxt, cursym, p, &p.To, reg[p.GetFrom3().Reg], regrex[p.GetFrom3().Reg], 0)
5059						}
5060					}
5061
5062				// NOTE: The systems listed here are the ones that use the "TLS initial exec" model,
5063				// where you load the TLS base register into a register and then index off that
5064				// register to access the actual TLS variables. Systems that allow direct TLS access
5065				// are handled in prefixof above and should not be listed here.
5066				case movTLSReg:
5067					if ctxt.Arch.Family == sys.AMD64 && p.As != AMOVQ || ctxt.Arch.Family == sys.I386 && p.As != AMOVL {
5068						ctxt.Diag("invalid load of TLS: %v", p)
5069					}
5070
5071					if ctxt.Arch.Family == sys.I386 {
5072						// NOTE: The systems listed here are the ones that use the "TLS initial exec" model,
5073						// where you load the TLS base register into a register and then index off that
5074						// register to access the actual TLS variables. Systems that allow direct TLS access
5075						// are handled in prefixof above and should not be listed here.
5076						switch ctxt.Headtype {
5077						default:
5078							log.Fatalf("unknown TLS base location for %v", ctxt.Headtype)
5079
5080						case objabi.Hlinux, objabi.Hfreebsd:
5081							if ctxt.Flag_shared {
5082								// Note that this is not generating the same insns as the other cases.
5083								//     MOV TLS, dst
5084								// becomes
5085								//     call __x86.get_pc_thunk.dst
5086								//     movl (gotpc + g@gotntpoff)(dst), dst
5087								// which is encoded as
5088								//     call __x86.get_pc_thunk.dst
5089								//     movq 0(dst), dst
5090								// and R_CALL & R_TLS_IE relocs. This all assumes the only tls variable we access
5091								// is g, which we can't check here, but will when we assemble the second
5092								// instruction.
5093								dst := p.To.Reg
5094								ab.Put1(0xe8)
5095								r = obj.Addrel(cursym)
5096								r.Off = int32(p.Pc + int64(ab.Len()))
5097								r.Type = objabi.R_CALL
5098								r.Siz = 4
5099								r.Sym = ctxt.Lookup("__x86.get_pc_thunk." + strings.ToLower(rconv(int(dst))))
5100								ab.PutInt32(0)
5101
5102								ab.Put2(0x8B, byte(2<<6|reg[dst]|(reg[dst]<<3)))
5103								r = obj.Addrel(cursym)
5104								r.Off = int32(p.Pc + int64(ab.Len()))
5105								r.Type = objabi.R_TLS_IE
5106								r.Siz = 4
5107								r.Add = 2
5108								ab.PutInt32(0)
5109							} else {
5110								// ELF TLS base is 0(GS).
5111								pp.From = p.From
5112
5113								pp.From.Type = obj.TYPE_MEM
5114								pp.From.Reg = REG_GS
5115								pp.From.Offset = 0
5116								pp.From.Index = REG_NONE
5117								pp.From.Scale = 0
5118								ab.Put2(0x65, // GS
5119									0x8B)
5120								ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
5121							}
5122						case objabi.Hplan9:
5123							pp.From = obj.Addr{}
5124							pp.From.Type = obj.TYPE_MEM
5125							pp.From.Name = obj.NAME_EXTERN
5126							pp.From.Sym = plan9privates
5127							pp.From.Offset = 0
5128							pp.From.Index = REG_NONE
5129							ab.Put1(0x8B)
5130							ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
5131
5132						case objabi.Hwindows:
5133							// Windows TLS base is always 0x14(FS).
5134							pp.From = p.From
5135
5136							pp.From.Type = obj.TYPE_MEM
5137							pp.From.Reg = REG_FS
5138							pp.From.Offset = 0x14
5139							pp.From.Index = REG_NONE
5140							pp.From.Scale = 0
5141							ab.Put2(0x64, // FS
5142								0x8B)
5143							ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
5144						}
5145						break
5146					}
5147
5148					switch ctxt.Headtype {
5149					default:
5150						log.Fatalf("unknown TLS base location for %v", ctxt.Headtype)
5151
5152					case objabi.Hlinux, objabi.Hfreebsd:
5153						if !ctxt.Flag_shared {
5154							log.Fatalf("unknown TLS base location for linux/freebsd without -shared")
5155						}
5156						// Note that this is not generating the same insn as the other cases.
5157						//     MOV TLS, R_to
5158						// becomes
5159						//     movq g@gottpoff(%rip), R_to
5160						// which is encoded as
5161						//     movq 0(%rip), R_to
5162						// and a R_TLS_IE reloc. This all assumes the only tls variable we access
5163						// is g, which we can't check here, but will when we assemble the second
5164						// instruction.
5165						ab.rexflag = Pw | (regrex[p.To.Reg] & Rxr)
5166
5167						ab.Put2(0x8B, byte(0x05|(reg[p.To.Reg]<<3)))
5168						r = obj.Addrel(cursym)
5169						r.Off = int32(p.Pc + int64(ab.Len()))
5170						r.Type = objabi.R_TLS_IE
5171						r.Siz = 4
5172						r.Add = -4
5173						ab.PutInt32(0)
5174
5175					case objabi.Hplan9:
5176						pp.From = obj.Addr{}
5177						pp.From.Type = obj.TYPE_MEM
5178						pp.From.Name = obj.NAME_EXTERN
5179						pp.From.Sym = plan9privates
5180						pp.From.Offset = 0
5181						pp.From.Index = REG_NONE
5182						ab.rexflag |= Pw
5183						ab.Put1(0x8B)
5184						ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
5185
5186					case objabi.Hsolaris: // TODO(rsc): Delete Hsolaris from list. Should not use this code. See progedit in obj6.c.
5187						// TLS base is 0(FS).
5188						pp.From = p.From
5189
5190						pp.From.Type = obj.TYPE_MEM
5191						pp.From.Name = obj.NAME_NONE
5192						pp.From.Reg = REG_NONE
5193						pp.From.Offset = 0
5194						pp.From.Index = REG_NONE
5195						pp.From.Scale = 0
5196						ab.rexflag |= Pw
5197						ab.Put2(0x64, // FS
5198							0x8B)
5199						ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
5200
5201					case objabi.Hwindows:
5202						// Windows TLS base is always 0x28(GS).
5203						pp.From = p.From
5204
5205						pp.From.Type = obj.TYPE_MEM
5206						pp.From.Name = obj.NAME_NONE
5207						pp.From.Reg = REG_GS
5208						pp.From.Offset = 0x28
5209						pp.From.Index = REG_NONE
5210						pp.From.Scale = 0
5211						ab.rexflag |= Pw
5212						ab.Put2(0x65, // GS
5213							0x8B)
5214						ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
5215					}
5216				}
5217				return
5218			}
5219		}
5220	}
5221	goto bad
5222
5223bad:
5224	if ctxt.Arch.Family != sys.AMD64 {
5225		// here, the assembly has failed.
5226		// if it's a byte instruction that has
5227		// unaddressable registers, try to
5228		// exchange registers and reissue the
5229		// instruction with the operands renamed.
5230		pp := *p
5231
5232		unbytereg(&pp.From, &pp.Ft)
5233		unbytereg(&pp.To, &pp.Tt)
5234
5235		z := int(p.From.Reg)
5236		if p.From.Type == obj.TYPE_REG && z >= REG_BP && z <= REG_DI {
5237			// TODO(rsc): Use this code for x86-64 too. It has bug fixes not present in the amd64 code base.
5238			// For now, different to keep bit-for-bit compatibility.
5239			if ctxt.Arch.Family == sys.I386 {
5240				breg := byteswapreg(ctxt, &p.To)
5241				if breg != REG_AX {
5242					ab.Put1(0x87) // xchg lhs,bx
5243					ab.asmando(ctxt, cursym, p, &p.From, reg[breg])
5244					subreg(&pp, z, breg)
5245					ab.doasm(ctxt, cursym, &pp)
5246					ab.Put1(0x87) // xchg lhs,bx
5247					ab.asmando(ctxt, cursym, p, &p.From, reg[breg])
5248				} else {
5249					ab.Put1(byte(0x90 + reg[z])) // xchg lsh,ax
5250					subreg(&pp, z, REG_AX)
5251					ab.doasm(ctxt, cursym, &pp)
5252					ab.Put1(byte(0x90 + reg[z])) // xchg lsh,ax
5253				}
5254				return
5255			}
5256
5257			if isax(&p.To) || p.To.Type == obj.TYPE_NONE {
5258				// We certainly don't want to exchange
5259				// with AX if the op is MUL or DIV.
5260				ab.Put1(0x87) // xchg lhs,bx
5261				ab.asmando(ctxt, cursym, p, &p.From, reg[REG_BX])
5262				subreg(&pp, z, REG_BX)
5263				ab.doasm(ctxt, cursym, &pp)
5264				ab.Put1(0x87) // xchg lhs,bx
5265				ab.asmando(ctxt, cursym, p, &p.From, reg[REG_BX])
5266			} else {
5267				ab.Put1(byte(0x90 + reg[z])) // xchg lsh,ax
5268				subreg(&pp, z, REG_AX)
5269				ab.doasm(ctxt, cursym, &pp)
5270				ab.Put1(byte(0x90 + reg[z])) // xchg lsh,ax
5271			}
5272			return
5273		}
5274
5275		z = int(p.To.Reg)
5276		if p.To.Type == obj.TYPE_REG && z >= REG_BP && z <= REG_DI {
5277			// TODO(rsc): Use this code for x86-64 too. It has bug fixes not present in the amd64 code base.
5278			// For now, different to keep bit-for-bit compatibility.
5279			if ctxt.Arch.Family == sys.I386 {
5280				breg := byteswapreg(ctxt, &p.From)
5281				if breg != REG_AX {
5282					ab.Put1(0x87) //xchg rhs,bx
5283					ab.asmando(ctxt, cursym, p, &p.To, reg[breg])
5284					subreg(&pp, z, breg)
5285					ab.doasm(ctxt, cursym, &pp)
5286					ab.Put1(0x87) // xchg rhs,bx
5287					ab.asmando(ctxt, cursym, p, &p.To, reg[breg])
5288				} else {
5289					ab.Put1(byte(0x90 + reg[z])) // xchg rsh,ax
5290					subreg(&pp, z, REG_AX)
5291					ab.doasm(ctxt, cursym, &pp)
5292					ab.Put1(byte(0x90 + reg[z])) // xchg rsh,ax
5293				}
5294				return
5295			}
5296
5297			if isax(&p.From) {
5298				ab.Put1(0x87) // xchg rhs,bx
5299				ab.asmando(ctxt, cursym, p, &p.To, reg[REG_BX])
5300				subreg(&pp, z, REG_BX)
5301				ab.doasm(ctxt, cursym, &pp)
5302				ab.Put1(0x87) // xchg rhs,bx
5303				ab.asmando(ctxt, cursym, p, &p.To, reg[REG_BX])
5304			} else {
5305				ab.Put1(byte(0x90 + reg[z])) // xchg rsh,ax
5306				subreg(&pp, z, REG_AX)
5307				ab.doasm(ctxt, cursym, &pp)
5308				ab.Put1(byte(0x90 + reg[z])) // xchg rsh,ax
5309			}
5310			return
5311		}
5312	}
5313
5314	ctxt.Diag("%s: invalid instruction: %v", cursym.Name, p)
5315}
5316
5317// byteswapreg returns a byte-addressable register (AX, BX, CX, DX)
5318// which is not referenced in a.
5319// If a is empty, it returns BX to account for MULB-like instructions
5320// that might use DX and AX.
5321func byteswapreg(ctxt *obj.Link, a *obj.Addr) int {
5322	cana, canb, canc, cand := true, true, true, true
5323	if a.Type == obj.TYPE_NONE {
5324		cana, cand = false, false
5325	}
5326
5327	if a.Type == obj.TYPE_REG || ((a.Type == obj.TYPE_MEM || a.Type == obj.TYPE_ADDR) && a.Name == obj.NAME_NONE) {
5328		switch a.Reg {
5329		case REG_NONE:
5330			cana, cand = false, false
5331		case REG_AX, REG_AL, REG_AH:
5332			cana = false
5333		case REG_BX, REG_BL, REG_BH:
5334			canb = false
5335		case REG_CX, REG_CL, REG_CH:
5336			canc = false
5337		case REG_DX, REG_DL, REG_DH:
5338			cand = false
5339		}
5340	}
5341
5342	if a.Type == obj.TYPE_MEM || a.Type == obj.TYPE_ADDR {
5343		switch a.Index {
5344		case REG_AX:
5345			cana = false
5346		case REG_BX:
5347			canb = false
5348		case REG_CX:
5349			canc = false
5350		case REG_DX:
5351			cand = false
5352		}
5353	}
5354
5355	switch {
5356	case cana:
5357		return REG_AX
5358	case canb:
5359		return REG_BX
5360	case canc:
5361		return REG_CX
5362	case cand:
5363		return REG_DX
5364	default:
5365		ctxt.Diag("impossible byte register")
5366		ctxt.DiagFlush()
5367		log.Fatalf("bad code")
5368		return 0
5369	}
5370}
5371
5372func isbadbyte(a *obj.Addr) bool {
5373	return a.Type == obj.TYPE_REG && (REG_BP <= a.Reg && a.Reg <= REG_DI || REG_BPB <= a.Reg && a.Reg <= REG_DIB)
5374}
5375
5376func (ab *AsmBuf) asmins(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog) {
5377	ab.Reset()
5378
5379	ab.rexflag = 0
5380	ab.vexflag = false
5381	ab.evexflag = false
5382	mark := ab.Len()
5383	ab.doasm(ctxt, cursym, p)
5384	if ab.rexflag != 0 && !ab.vexflag && !ab.evexflag {
5385		// as befits the whole approach of the architecture,
5386		// the rex prefix must appear before the first opcode byte
5387		// (and thus after any 66/67/f2/f3/26/2e/3e prefix bytes, but
5388		// before the 0f opcode escape!), or it might be ignored.
5389		// note that the handbook often misleadingly shows 66/f2/f3 in `opcode'.
5390		if ctxt.Arch.Family != sys.AMD64 {
5391			ctxt.Diag("asmins: illegal in mode %d: %v (%d %d)", ctxt.Arch.RegSize*8, p, p.Ft, p.Tt)
5392		}
5393		n := ab.Len()
5394		var np int
5395		for np = mark; np < n; np++ {
5396			c := ab.At(np)
5397			if c != 0xf2 && c != 0xf3 && (c < 0x64 || c > 0x67) && c != 0x2e && c != 0x3e && c != 0x26 {
5398				break
5399			}
5400		}
5401		ab.Insert(np, byte(0x40|ab.rexflag))
5402	}
5403
5404	n := ab.Len()
5405	for i := len(cursym.R) - 1; i >= 0; i-- {
5406		r := &cursym.R[i]
5407		if int64(r.Off) < p.Pc {
5408			break
5409		}
5410		if ab.rexflag != 0 && !ab.vexflag && !ab.evexflag {
5411			r.Off++
5412		}
5413		if r.Type == objabi.R_PCREL {
5414			if ctxt.Arch.Family == sys.AMD64 || p.As == obj.AJMP || p.As == obj.ACALL {
5415				// PC-relative addressing is relative to the end of the instruction,
5416				// but the relocations applied by the linker are relative to the end
5417				// of the relocation. Because immediate instruction
5418				// arguments can follow the PC-relative memory reference in the
5419				// instruction encoding, the two may not coincide. In this case,
5420				// adjust addend so that linker can keep relocating relative to the
5421				// end of the relocation.
5422				r.Add -= p.Pc + int64(n) - (int64(r.Off) + int64(r.Siz))
5423			} else if ctxt.Arch.Family == sys.I386 {
5424				// On 386 PC-relative addressing (for non-call/jmp instructions)
5425				// assumes that the previous instruction loaded the PC of the end
5426				// of that instruction into CX, so the adjustment is relative to
5427				// that.
5428				r.Add += int64(r.Off) - p.Pc + int64(r.Siz)
5429			}
5430		}
5431		if r.Type == objabi.R_GOTPCREL && ctxt.Arch.Family == sys.I386 {
5432			// On 386, R_GOTPCREL makes the same assumptions as R_PCREL.
5433			r.Add += int64(r.Off) - p.Pc + int64(r.Siz)
5434		}
5435
5436	}
5437}
5438
5439// unpackOps4 extracts 4 operands from p.
5440func unpackOps4(p *obj.Prog) (arg0, arg1, arg2, dst *obj.Addr) {
5441	return &p.From, &p.RestArgs[0].Addr, &p.RestArgs[1].Addr, &p.To
5442}
5443
5444// unpackOps5 extracts 5 operands from p.
5445func unpackOps5(p *obj.Prog) (arg0, arg1, arg2, arg3, dst *obj.Addr) {
5446	return &p.From, &p.RestArgs[0].Addr, &p.RestArgs[1].Addr, &p.RestArgs[2].Addr, &p.To
5447}
5448