xref: /qemu/target/i386/tcg/fpu_helper.c (revision f917eed3)
1 /*
2  *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include <math.h>
22 #include "cpu.h"
23 #include "exec/helper-proto.h"
24 #include "qemu/host-utils.h"
25 #include "exec/exec-all.h"
26 #include "exec/cpu_ldst.h"
27 #include "fpu/softfloat.h"
28 #include "fpu/softfloat-macros.h"
29 #include "helper-tcg.h"
30 
31 #ifdef CONFIG_SOFTMMU
32 #include "hw/irq.h"
33 #endif
34 
/*
 * Float register access macros.  Each one expands to an lvalue and
 * requires a CPUX86State pointer named "env" in scope where it is used.
 */
/* FT0: the separate operand register env->ft0. */
#define FT0    (env->ft0)
/* ST0: the register at the current top of stack, indexed by env->fpstt. */
#define ST0    (env->fpregs[env->fpstt].d)
/* ST(n): the register n slots away from the top of stack, modulo 8. */
#define ST(n)  (env->fpregs[(env->fpstt + (n)) & 7].d)
#define ST1    ST(1)
40 
/*
 * Rounding-control field of the FPU control word (env->fpuc), bits 10-11.
 * Every constant is the two-bit field value shifted into position; the
 * mode named in each comment is the one update_fp_status() selects for it.
 */
#define FPU_RC_MASK         (3 << 10)
#define FPU_RC_NEAR         (0 << 10)   /* round to nearest even */
#define FPU_RC_DOWN         (1 << 10)   /* round down */
#define FPU_RC_UP           (2 << 10)   /* round up */
#define FPU_RC_CHOP         (3 << 10)   /* round toward zero */
46 
/* 2^63 written as a floating-point literal; its users are not in this section. */
#define MAXTAN 9223372036854775808.0
48 
/*
 * Accessors for x86 80-bit extended-precision values.  "fp" is any lvalue
 * expression with .l.upper (sign + 15-bit exponent) and .l.lower (64-bit
 * significand) members, such as a CPU_LDoubleU.  The argument is fully
 * parenthesized so that expressions like *p can be passed safely.
 */
#define MAXEXPD 0x7fff
#define EXPBIAS 16383
/* Biased exponent: the low 15 bits of the upper word. */
#define EXPD(fp)        ((fp).l.upper & 0x7fff)
/* Sign bit: non-zero (0x8000) when the value is negative. */
#define SIGND(fp)       ((fp).l.upper & 0x8000)
/* 64-bit significand. */
#define MANTD(fp)       ((fp).l.lower)
/*
 * Replace the exponent field with EXPBIAS while keeping the sign bit.
 * Expands to a single parenthesized assignment expression, so it can be
 * used as a statement exactly as before.
 */
#define BIASEXPONENT(fp) \
    ((fp).l.upper = ((fp).l.upper & ~(0x7fff)) | EXPBIAS)
56 
/*
 * Bits of the FPU status word (env->fpus).  The low six bits are the
 * exception flags that merge_exception_flags() derives from the softfloat
 * exception flags.
 */
#define FPUS_IE 0x0001  /* invalid operation */
#define FPUS_DE 0x0002  /* denormal input operand */
#define FPUS_ZE 0x0004  /* divide by zero */
#define FPUS_OE 0x0008  /* overflow */
#define FPUS_UE 0x0010  /* underflow */
#define FPUS_PE 0x0020  /* inexact result */
#define FPUS_SF 0x0040  /* stack fault in the x87 layout; unused in this section */
#define FPUS_SE 0x0080  /* summary: set when an unmasked exception is flagged */
#define FPUS_B  0x8000  /* set together with FPUS_SE by fpu_set_exception() */

/* The low six control-word (env->fpuc) bits: one mask bit per exception. */
#define FPUC_EM 0x3f
68 
/*
 * floatx80 constants, each given as (sign/exponent word, 64-bit
 * significand).  Where a base constant and its _d / _u variant are both
 * defined here, the variant's significand is one lower (_d) or one higher
 * (_u) in the last place.  The fld* constant-loading helpers choose
 * between the base value and the variant from the rounding-control field
 * of the control word.
 */
#define floatx80_lg2 make_floatx80(0x3ffd, 0x9a209a84fbcff799LL)
#define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798LL)
#define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcLL)
#define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbLL)
#define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeLL)
#define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affLL)
#define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abLL)
#define floatx80_pi_d make_floatx80(0x4000, 0xc90fdaa22168c234LL)
77 
78 #if !defined(CONFIG_USER_ONLY)
/*
 * IRQ line used to report FPU errors: fpu_raise_exception() raises it when
 * CR0.NE is clear, and cpu_set_ignne() lowers it.
 */
static qemu_irq ferr_irq;

/* Record @irq as the line used to report FPU errors. */
void x86_register_ferr_irq(qemu_irq irq)
{
    ferr_irq = irq;
}
85 
86 static void cpu_clear_ignne(void)
87 {
88     CPUX86State *env = &X86_CPU(first_cpu)->env;
89     env->hflags2 &= ~HF2_IGNNE_MASK;
90 }
91 
92 void cpu_set_ignne(void)
93 {
94     CPUX86State *env = &X86_CPU(first_cpu)->env;
95     env->hflags2 |= HF2_IGNNE_MASK;
96     /*
97      * We get here in response to a write to port F0h.  The chipset should
98      * deassert FP_IRQ and FERR# instead should stay signaled until FPSW_SE is
99      * cleared, because FERR# and FP_IRQ are two separate pins on real
100      * hardware.  However, we don't model FERR# as a qemu_irq, so we just
101      * do directly what the chipset would do, i.e. deassert FP_IRQ.
102      */
103     qemu_irq_lower(ferr_irq);
104 }
105 #endif
106 
107 
108 static inline void fpush(CPUX86State *env)
109 {
110     env->fpstt = (env->fpstt - 1) & 7;
111     env->fptags[env->fpstt] = 0; /* validate stack entry */
112 }
113 
114 static inline void fpop(CPUX86State *env)
115 {
116     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
117     env->fpstt = (env->fpstt + 1) & 7;
118 }
119 
120 static inline floatx80 helper_fldt(CPUX86State *env, target_ulong ptr,
121                                    uintptr_t retaddr)
122 {
123     CPU_LDoubleU temp;
124 
125     temp.l.lower = cpu_ldq_data_ra(env, ptr, retaddr);
126     temp.l.upper = cpu_lduw_data_ra(env, ptr + 8, retaddr);
127     return temp.d;
128 }
129 
130 static inline void helper_fstt(CPUX86State *env, floatx80 f, target_ulong ptr,
131                                uintptr_t retaddr)
132 {
133     CPU_LDoubleU temp;
134 
135     temp.d = f;
136     cpu_stq_data_ra(env, ptr, temp.l.lower, retaddr);
137     cpu_stw_data_ra(env, ptr + 8, temp.l.upper, retaddr);
138 }
139 
140 /* x87 FPU helpers */
141 
142 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
143 {
144     union {
145         float64 f64;
146         double d;
147     } u;
148 
149     u.f64 = floatx80_to_float64(a, &env->fp_status);
150     return u.d;
151 }
152 
153 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
154 {
155     union {
156         float64 f64;
157         double d;
158     } u;
159 
160     u.d = a;
161     return float64_to_floatx80(u.f64, &env->fp_status);
162 }
163 
164 static void fpu_set_exception(CPUX86State *env, int mask)
165 {
166     env->fpus |= mask;
167     if (env->fpus & (~env->fpuc & FPUC_EM)) {
168         env->fpus |= FPUS_SE | FPUS_B;
169     }
170 }
171 
172 static inline uint8_t save_exception_flags(CPUX86State *env)
173 {
174     uint8_t old_flags = get_float_exception_flags(&env->fp_status);
175     set_float_exception_flags(0, &env->fp_status);
176     return old_flags;
177 }
178 
179 static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
180 {
181     uint8_t new_flags = get_float_exception_flags(&env->fp_status);
182     float_raise(old_flags, &env->fp_status);
183     fpu_set_exception(env,
184                       ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
185                        (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
186                        (new_flags & float_flag_overflow ? FPUS_OE : 0) |
187                        (new_flags & float_flag_underflow ? FPUS_UE : 0) |
188                        (new_flags & float_flag_inexact ? FPUS_PE : 0) |
189                        (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
190 }
191 
192 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
193 {
194     uint8_t old_flags = save_exception_flags(env);
195     floatx80 ret = floatx80_div(a, b, &env->fp_status);
196     merge_exception_flags(env, old_flags);
197     return ret;
198 }
199 
200 static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
201 {
202     if (env->cr[0] & CR0_NE_MASK) {
203         raise_exception_ra(env, EXCP10_COPR, retaddr);
204     }
205 #if !defined(CONFIG_USER_ONLY)
206     else if (ferr_irq && !(env->hflags2 & HF2_IGNNE_MASK)) {
207         qemu_irq_raise(ferr_irq);
208     }
209 #endif
210 }
211 
212 void helper_flds_FT0(CPUX86State *env, uint32_t val)
213 {
214     uint8_t old_flags = save_exception_flags(env);
215     union {
216         float32 f;
217         uint32_t i;
218     } u;
219 
220     u.i = val;
221     FT0 = float32_to_floatx80(u.f, &env->fp_status);
222     merge_exception_flags(env, old_flags);
223 }
224 
225 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
226 {
227     uint8_t old_flags = save_exception_flags(env);
228     union {
229         float64 f;
230         uint64_t i;
231     } u;
232 
233     u.i = val;
234     FT0 = float64_to_floatx80(u.f, &env->fp_status);
235     merge_exception_flags(env, old_flags);
236 }
237 
/*
 * Load FT0 with the 32-bit signed integer @val converted to floatx80.
 * Unlike the float loads, the exception flags are not saved and merged
 * around the conversion.
 */
void helper_fildl_FT0(CPUX86State *env, int32_t val)
{
    FT0 = int32_to_floatx80(val, &env->fp_status);
}
242 
243 void helper_flds_ST0(CPUX86State *env, uint32_t val)
244 {
245     uint8_t old_flags = save_exception_flags(env);
246     int new_fpstt;
247     union {
248         float32 f;
249         uint32_t i;
250     } u;
251 
252     new_fpstt = (env->fpstt - 1) & 7;
253     u.i = val;
254     env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
255     env->fpstt = new_fpstt;
256     env->fptags[new_fpstt] = 0; /* validate stack entry */
257     merge_exception_flags(env, old_flags);
258 }
259 
260 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
261 {
262     uint8_t old_flags = save_exception_flags(env);
263     int new_fpstt;
264     union {
265         float64 f;
266         uint64_t i;
267     } u;
268 
269     new_fpstt = (env->fpstt - 1) & 7;
270     u.i = val;
271     env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
272     env->fpstt = new_fpstt;
273     env->fptags[new_fpstt] = 0; /* validate stack entry */
274     merge_exception_flags(env, old_flags);
275 }
276 
277 void helper_fildl_ST0(CPUX86State *env, int32_t val)
278 {
279     int new_fpstt;
280 
281     new_fpstt = (env->fpstt - 1) & 7;
282     env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
283     env->fpstt = new_fpstt;
284     env->fptags[new_fpstt] = 0; /* validate stack entry */
285 }
286 
287 void helper_fildll_ST0(CPUX86State *env, int64_t val)
288 {
289     int new_fpstt;
290 
291     new_fpstt = (env->fpstt - 1) & 7;
292     env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
293     env->fpstt = new_fpstt;
294     env->fptags[new_fpstt] = 0; /* validate stack entry */
295 }
296 
297 uint32_t helper_fsts_ST0(CPUX86State *env)
298 {
299     uint8_t old_flags = save_exception_flags(env);
300     union {
301         float32 f;
302         uint32_t i;
303     } u;
304 
305     u.f = floatx80_to_float32(ST0, &env->fp_status);
306     merge_exception_flags(env, old_flags);
307     return u.i;
308 }
309 
310 uint64_t helper_fstl_ST0(CPUX86State *env)
311 {
312     uint8_t old_flags = save_exception_flags(env);
313     union {
314         float64 f;
315         uint64_t i;
316     } u;
317 
318     u.f = floatx80_to_float64(ST0, &env->fp_status);
319     merge_exception_flags(env, old_flags);
320     return u.i;
321 }
322 
323 int32_t helper_fist_ST0(CPUX86State *env)
324 {
325     uint8_t old_flags = save_exception_flags(env);
326     int32_t val;
327 
328     val = floatx80_to_int32(ST0, &env->fp_status);
329     if (val != (int16_t)val) {
330         set_float_exception_flags(float_flag_invalid, &env->fp_status);
331         val = -32768;
332     }
333     merge_exception_flags(env, old_flags);
334     return val;
335 }
336 
337 int32_t helper_fistl_ST0(CPUX86State *env)
338 {
339     uint8_t old_flags = save_exception_flags(env);
340     int32_t val;
341 
342     val = floatx80_to_int32(ST0, &env->fp_status);
343     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
344         val = 0x80000000;
345     }
346     merge_exception_flags(env, old_flags);
347     return val;
348 }
349 
350 int64_t helper_fistll_ST0(CPUX86State *env)
351 {
352     uint8_t old_flags = save_exception_flags(env);
353     int64_t val;
354 
355     val = floatx80_to_int64(ST0, &env->fp_status);
356     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
357         val = 0x8000000000000000ULL;
358     }
359     merge_exception_flags(env, old_flags);
360     return val;
361 }
362 
363 int32_t helper_fistt_ST0(CPUX86State *env)
364 {
365     uint8_t old_flags = save_exception_flags(env);
366     int32_t val;
367 
368     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
369     if (val != (int16_t)val) {
370         set_float_exception_flags(float_flag_invalid, &env->fp_status);
371         val = -32768;
372     }
373     merge_exception_flags(env, old_flags);
374     return val;
375 }
376 
377 int32_t helper_fisttl_ST0(CPUX86State *env)
378 {
379     uint8_t old_flags = save_exception_flags(env);
380     int32_t val;
381 
382     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
383     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
384         val = 0x80000000;
385     }
386     merge_exception_flags(env, old_flags);
387     return val;
388 }
389 
390 int64_t helper_fisttll_ST0(CPUX86State *env)
391 {
392     uint8_t old_flags = save_exception_flags(env);
393     int64_t val;
394 
395     val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
396     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
397         val = 0x8000000000000000ULL;
398     }
399     merge_exception_flags(env, old_flags);
400     return val;
401 }
402 
403 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
404 {
405     int new_fpstt;
406 
407     new_fpstt = (env->fpstt - 1) & 7;
408     env->fpregs[new_fpstt].d = helper_fldt(env, ptr, GETPC());
409     env->fpstt = new_fpstt;
410     env->fptags[new_fpstt] = 0; /* validate stack entry */
411 }
412 
/* Store ST0 as a 10-byte extended-precision value at guest address @ptr. */
void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
{
    helper_fstt(env, ST0, ptr, GETPC());
}
417 
/* Externally visible wrapper around the inline fpush(). */
void helper_fpush(CPUX86State *env)
{
    fpush(env);
}
422 
/* Externally visible wrapper around the inline fpop(). */
void helper_fpop(CPUX86State *env)
{
    fpop(env);
}
427 
428 void helper_fdecstp(CPUX86State *env)
429 {
430     env->fpstt = (env->fpstt - 1) & 7;
431     env->fpus &= ~0x4700;
432 }
433 
434 void helper_fincstp(CPUX86State *env)
435 {
436     env->fpstt = (env->fpstt + 1) & 7;
437     env->fpus &= ~0x4700;
438 }
439 
440 /* FPU move */
441 
442 void helper_ffree_STN(CPUX86State *env, int st_index)
443 {
444     env->fptags[(env->fpstt + st_index) & 7] = 1;
445 }
446 
/* Copy FT0 into ST0.  Tags and the status word are not modified. */
void helper_fmov_ST0_FT0(CPUX86State *env)
{
    ST0 = FT0;
}
451 
/* Copy ST(@st_index) into FT0. */
void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
{
    FT0 = ST(st_index);
}
456 
/* Copy ST(@st_index) into ST0.  Tags and the status word are not modified. */
void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
{
    ST0 = ST(st_index);
}
461 
/* Copy ST0 into ST(@st_index).  Tags and the status word are not modified. */
void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
{
    ST(st_index) = ST0;
}
466 
467 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
468 {
469     floatx80 tmp;
470 
471     tmp = ST(st_index);
472     ST(st_index) = ST0;
473     ST0 = tmp;
474 }
475 
476 /* FPU operations */
477 
/*
 * Status-word condition bits for a comparison result, indexed by the
 * FloatRelation value plus one.  The labels assume softfloat's ordering
 * of less = -1, equal = 0, greater = 1, unordered = 2 -- confirm against
 * the FloatRelation definition.
 */
static const int fcom_ccval[4] = {
    [0] = 0x0100,   /* less */
    [1] = 0x4000,   /* equal */
    [2] = 0x0000,   /* greater */
    [3] = 0x4500,   /* unordered */
};
479 
480 void helper_fcom_ST0_FT0(CPUX86State *env)
481 {
482     uint8_t old_flags = save_exception_flags(env);
483     FloatRelation ret;
484 
485     ret = floatx80_compare(ST0, FT0, &env->fp_status);
486     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
487     merge_exception_flags(env, old_flags);
488 }
489 
490 void helper_fucom_ST0_FT0(CPUX86State *env)
491 {
492     uint8_t old_flags = save_exception_flags(env);
493     FloatRelation ret;
494 
495     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
496     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
497     merge_exception_flags(env, old_flags);
498 }
499 
500 static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
501 
502 void helper_fcomi_ST0_FT0(CPUX86State *env)
503 {
504     uint8_t old_flags = save_exception_flags(env);
505     int eflags;
506     FloatRelation ret;
507 
508     ret = floatx80_compare(ST0, FT0, &env->fp_status);
509     eflags = cpu_cc_compute_all(env, CC_OP);
510     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
511     CC_SRC = eflags;
512     merge_exception_flags(env, old_flags);
513 }
514 
515 void helper_fucomi_ST0_FT0(CPUX86State *env)
516 {
517     uint8_t old_flags = save_exception_flags(env);
518     int eflags;
519     FloatRelation ret;
520 
521     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
522     eflags = cpu_cc_compute_all(env, CC_OP);
523     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
524     CC_SRC = eflags;
525     merge_exception_flags(env, old_flags);
526 }
527 
528 void helper_fadd_ST0_FT0(CPUX86State *env)
529 {
530     uint8_t old_flags = save_exception_flags(env);
531     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
532     merge_exception_flags(env, old_flags);
533 }
534 
535 void helper_fmul_ST0_FT0(CPUX86State *env)
536 {
537     uint8_t old_flags = save_exception_flags(env);
538     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
539     merge_exception_flags(env, old_flags);
540 }
541 
542 void helper_fsub_ST0_FT0(CPUX86State *env)
543 {
544     uint8_t old_flags = save_exception_flags(env);
545     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
546     merge_exception_flags(env, old_flags);
547 }
548 
549 void helper_fsubr_ST0_FT0(CPUX86State *env)
550 {
551     uint8_t old_flags = save_exception_flags(env);
552     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
553     merge_exception_flags(env, old_flags);
554 }
555 
/* ST0 = ST0 / FT0; helper_fdiv() records any exceptions. */
void helper_fdiv_ST0_FT0(CPUX86State *env)
{
    ST0 = helper_fdiv(env, ST0, FT0);
}
560 
/* Reversed divide: ST0 = FT0 / ST0; helper_fdiv() records any exceptions. */
void helper_fdivr_ST0_FT0(CPUX86State *env)
{
    ST0 = helper_fdiv(env, FT0, ST0);
}
565 
566 /* fp operations between STN and ST0 */
567 
568 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
569 {
570     uint8_t old_flags = save_exception_flags(env);
571     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
572     merge_exception_flags(env, old_flags);
573 }
574 
575 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
576 {
577     uint8_t old_flags = save_exception_flags(env);
578     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
579     merge_exception_flags(env, old_flags);
580 }
581 
582 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
583 {
584     uint8_t old_flags = save_exception_flags(env);
585     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
586     merge_exception_flags(env, old_flags);
587 }
588 
589 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
590 {
591     uint8_t old_flags = save_exception_flags(env);
592     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
593     merge_exception_flags(env, old_flags);
594 }
595 
596 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
597 {
598     floatx80 *p;
599 
600     p = &ST(st_index);
601     *p = helper_fdiv(env, *p, ST0);
602 }
603 
604 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
605 {
606     floatx80 *p;
607 
608     p = &ST(st_index);
609     *p = helper_fdiv(env, ST0, *p);
610 }
611 
612 /* misc FPU operations */
/* Replace ST0 with floatx80_chs(ST0); no fp_status is involved. */
void helper_fchs_ST0(CPUX86State *env)
{
    ST0 = floatx80_chs(ST0);
}
617 
/* Replace ST0 with floatx80_abs(ST0); no fp_status is involved. */
void helper_fabs_ST0(CPUX86State *env)
{
    ST0 = floatx80_abs(ST0);
}
622 
/*
 * Overwrite ST0 with floatx80_one.  This helper does not push; the stack
 * top is whatever env->fpstt already points to.
 */
void helper_fld1_ST0(CPUX86State *env)
{
    ST0 = floatx80_one;
}
627 
628 void helper_fldl2t_ST0(CPUX86State *env)
629 {
630     switch (env->fpuc & FPU_RC_MASK) {
631     case FPU_RC_UP:
632         ST0 = floatx80_l2t_u;
633         break;
634     default:
635         ST0 = floatx80_l2t;
636         break;
637     }
638 }
639 
640 void helper_fldl2e_ST0(CPUX86State *env)
641 {
642     switch (env->fpuc & FPU_RC_MASK) {
643     case FPU_RC_DOWN:
644     case FPU_RC_CHOP:
645         ST0 = floatx80_l2e_d;
646         break;
647     default:
648         ST0 = floatx80_l2e;
649         break;
650     }
651 }
652 
653 void helper_fldpi_ST0(CPUX86State *env)
654 {
655     switch (env->fpuc & FPU_RC_MASK) {
656     case FPU_RC_DOWN:
657     case FPU_RC_CHOP:
658         ST0 = floatx80_pi_d;
659         break;
660     default:
661         ST0 = floatx80_pi;
662         break;
663     }
664 }
665 
666 void helper_fldlg2_ST0(CPUX86State *env)
667 {
668     switch (env->fpuc & FPU_RC_MASK) {
669     case FPU_RC_DOWN:
670     case FPU_RC_CHOP:
671         ST0 = floatx80_lg2_d;
672         break;
673     default:
674         ST0 = floatx80_lg2;
675         break;
676     }
677 }
678 
679 void helper_fldln2_ST0(CPUX86State *env)
680 {
681     switch (env->fpuc & FPU_RC_MASK) {
682     case FPU_RC_DOWN:
683     case FPU_RC_CHOP:
684         ST0 = floatx80_ln2_d;
685         break;
686     default:
687         ST0 = floatx80_ln2;
688         break;
689     }
690 }
691 
/* Overwrite ST0 with floatx80_zero; this helper does not push. */
void helper_fldz_ST0(CPUX86State *env)
{
    ST0 = floatx80_zero;
}
696 
/* Overwrite FT0 with floatx80_zero. */
void helper_fldz_FT0(CPUX86State *env)
{
    FT0 = floatx80_zero;
}
701 
702 uint32_t helper_fnstsw(CPUX86State *env)
703 {
704     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
705 }
706 
/* Return the FPU control word. */
uint32_t helper_fnstcw(CPUX86State *env)
{
    return env->fpuc;
}
711 
712 void update_fp_status(CPUX86State *env)
713 {
714     int rnd_type;
715 
716     /* set rounding mode */
717     switch (env->fpuc & FPU_RC_MASK) {
718     default:
719     case FPU_RC_NEAR:
720         rnd_type = float_round_nearest_even;
721         break;
722     case FPU_RC_DOWN:
723         rnd_type = float_round_down;
724         break;
725     case FPU_RC_UP:
726         rnd_type = float_round_up;
727         break;
728     case FPU_RC_CHOP:
729         rnd_type = float_round_to_zero;
730         break;
731     }
732     set_float_rounding_mode(rnd_type, &env->fp_status);
733     switch ((env->fpuc >> 8) & 3) {
734     case 0:
735         rnd_type = 32;
736         break;
737     case 2:
738         rnd_type = 64;
739         break;
740     case 3:
741     default:
742         rnd_type = 80;
743         break;
744     }
745     set_floatx80_rounding_precision(rnd_type, &env->fp_status);
746 }
747 
/* Load the FPU control word with @val by delegating to cpu_set_fpuc(). */
void helper_fldcw(CPUX86State *env, uint32_t val)
{
    cpu_set_fpuc(env, val);
}
752 
/*
 * Clear status-word bits 0-7 (the exception flags, FPUS_SF and FPUS_SE)
 * and bit 15 (FPUS_B); mask 0x7f00 keeps bits 8-14 as they are.
 */
void helper_fclex(CPUX86State *env)
{
    env->fpus &= 0x7f00;
}
757 
758 void helper_fwait(CPUX86State *env)
759 {
760     if (env->fpus & FPUS_SE) {
761         fpu_raise_exception(env, GETPC());
762     }
763 }
764 
765 void helper_fninit(CPUX86State *env)
766 {
767     env->fpus = 0;
768     env->fpstt = 0;
769     cpu_set_fpuc(env, 0x37f);
770     env->fptags[0] = 1;
771     env->fptags[1] = 1;
772     env->fptags[2] = 1;
773     env->fptags[3] = 1;
774     env->fptags[4] = 1;
775     env->fptags[5] = 1;
776     env->fptags[6] = 1;
777     env->fptags[7] = 1;
778 }
779 
780 /* BCD ops */
781 
782 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
783 {
784     floatx80 tmp;
785     uint64_t val;
786     unsigned int v;
787     int i;
788 
789     val = 0;
790     for (i = 8; i >= 0; i--) {
791         v = cpu_ldub_data_ra(env, ptr + i, GETPC());
792         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
793     }
794     tmp = int64_to_floatx80(val, &env->fp_status);
795     if (cpu_ldub_data_ra(env, ptr + 9, GETPC()) & 0x80) {
796         tmp = floatx80_chs(tmp);
797     }
798     fpush(env);
799     ST0 = tmp;
800 }
801 
802 void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
803 {
804     uint8_t old_flags = save_exception_flags(env);
805     int v;
806     target_ulong mem_ref, mem_end;
807     int64_t val;
808     CPU_LDoubleU temp;
809 
810     temp.d = ST0;
811 
812     val = floatx80_to_int64(ST0, &env->fp_status);
813     mem_ref = ptr;
814     if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
815         set_float_exception_flags(float_flag_invalid, &env->fp_status);
816         while (mem_ref < ptr + 7) {
817             cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
818         }
819         cpu_stb_data_ra(env, mem_ref++, 0xc0, GETPC());
820         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
821         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
822         merge_exception_flags(env, old_flags);
823         return;
824     }
825     mem_end = mem_ref + 9;
826     if (SIGND(temp)) {
827         cpu_stb_data_ra(env, mem_end, 0x80, GETPC());
828         val = -val;
829     } else {
830         cpu_stb_data_ra(env, mem_end, 0x00, GETPC());
831     }
832     while (mem_ref < mem_end) {
833         if (val == 0) {
834             break;
835         }
836         v = val % 100;
837         val = val / 100;
838         v = ((v / 10) << 4) | (v % 10);
839         cpu_stb_data_ra(env, mem_ref++, v, GETPC());
840     }
841     while (mem_ref < mem_end) {
842         cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
843     }
844     merge_exception_flags(env, old_flags);
845 }
846 
/*
 * 128-bit significand of log(2), split into two 64-bit halves.  The high
 * half has the same value as the significand of floatx80_ln2_d.
 */
#define ln2_sig_high 0xb17217f7d1cf79abULL
#define ln2_sig_low 0xc9e3b39803f2f6afULL
850 
/*
 * Polynomial coefficients for an approximation to (2^x - 1) / x, on
 * the interval [-1/64, 1/64].
 *
 * f2xm1_coeff_0_low has the sign bit set in its exponent word, i.e. it is
 * a small negative value; presumably it is a low-order correction added
 * to f2xm1_coeff_0 -- confirm against the code that evaluates the
 * polynomial, which is outside this section.
 */
#define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
#define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
#define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
#define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
#define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
#define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
#define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
#define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
#define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
864 
/* One entry of f2xm1_table: a sample point t with 2^t and 2^t - 1. */
struct f2xm1_data {
    /*
     * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
     * are very close to exact floatx80 values.
     */
    floatx80 t;
    /* The value of 2^t.  */
    floatx80 exp2;
    /* The value of 2^t - 1.  */
    floatx80 exp2m1;
};
876 
877 static const struct f2xm1_data f2xm1_table[65] = {
878     { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
879       make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
880       make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
881     { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
882       make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
883       make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
884     { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
885       make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
886       make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
887     { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
888       make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
889       make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
890     { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
891       make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
892       make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
893     { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
894       make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
895       make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
896     { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
897       make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
898       make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
899     { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
900       make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
901       make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
902     { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
903       make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
904       make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
905     { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
906       make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
907       make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
908     { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
909       make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
910       make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
911     { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
912       make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
913       make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
914     { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
915       make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
916       make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
917     { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
918       make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
919       make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
920     { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
921       make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
922       make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
923     { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
924       make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
925       make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
926     { make_floatx80_init(0xbffe, 0x800000000000227dULL),
927       make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
928       make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
929     { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
930       make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
931       make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
932     { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
933       make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
934       make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
935     { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
936       make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
937       make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
938     { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
939       make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
940       make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
941     { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
942       make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
943       make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
944     { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
945       make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
946       make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
947     { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
948       make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
949       make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
950     { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
951       make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
952       make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
953     { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
954       make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
955       make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
956     { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
957       make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
958       make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
959     { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
960       make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
961       make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
962     { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
963       make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
964       make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
965     { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
966       make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
967       make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
968     { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
969       make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
970       make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
971     { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
972       make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
973       make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
974     { floatx80_zero_init,
975       make_floatx80_init(0x3fff, 0x8000000000000000ULL),
976       floatx80_zero_init },
977     { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
978       make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
979       make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
980     { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
981       make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
982       make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
983     { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
984       make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
985       make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
986     { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
987       make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
988       make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
989     { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
990       make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
991       make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
992     { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
993       make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
994       make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
995     { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
996       make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
997       make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
998     { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
999       make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
1000       make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
1001     { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
1002       make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
1003       make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
1004     { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
1005       make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
1006       make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
1007     { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
1008       make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
1009       make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
1010     { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
1011       make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
1012       make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
1013     { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
1014       make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
1015       make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
1016     { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
1017       make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
1018       make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
1019     { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
1020       make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
1021       make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
1022     { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
1023       make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
1024       make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
1025     { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
1026       make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
1027       make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
1028     { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
1029       make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
1030       make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
1031     { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
1032       make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
1033       make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
1034     { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
1035       make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1036       make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1037     { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1038       make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1039       make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1040     { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1041       make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1042       make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1043     { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1044       make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1045       make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1046     { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1047       make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1048       make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1049     { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1050       make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1051       make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1052     { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1053       make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1054       make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1055     { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1056       make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1057       make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1058     { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1059       make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1060       make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1061     { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1062       make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1063       make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1064     { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1065       make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1066       make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1067     { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1068       make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1069       make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1070     { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1071       make_floatx80_init(0x4000, 0x8000000000000000ULL),
1072       make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1073 };
1074 
1075 void helper_f2xm1(CPUX86State *env)
1076 {
1077     uint8_t old_flags = save_exception_flags(env);
1078     uint64_t sig = extractFloatx80Frac(ST0);
1079     int32_t exp = extractFloatx80Exp(ST0);
1080     bool sign = extractFloatx80Sign(ST0);
1081 
1082     if (floatx80_invalid_encoding(ST0)) {
1083         float_raise(float_flag_invalid, &env->fp_status);
1084         ST0 = floatx80_default_nan(&env->fp_status);
1085     } else if (floatx80_is_any_nan(ST0)) {
1086         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1087             float_raise(float_flag_invalid, &env->fp_status);
1088             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1089         }
1090     } else if (exp > 0x3fff ||
1091                (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1092         /* Out of range for the instruction, treat as invalid.  */
1093         float_raise(float_flag_invalid, &env->fp_status);
1094         ST0 = floatx80_default_nan(&env->fp_status);
1095     } else if (exp == 0x3fff) {
1096         /* Argument 1 or -1, exact result 1 or -0.5.  */
1097         if (sign) {
1098             ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1099         }
1100     } else if (exp < 0x3fb0) {
1101         if (!floatx80_is_zero(ST0)) {
1102             /*
1103              * Multiplying the argument by an extra-precision version
1104              * of log(2) is sufficiently precise.  Zero arguments are
1105              * returned unchanged.
1106              */
1107             uint64_t sig0, sig1, sig2;
1108             if (exp == 0) {
1109                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1110             }
1111             mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1112                             &sig2);
1113             /* This result is inexact.  */
1114             sig1 |= 1;
1115             ST0 = normalizeRoundAndPackFloatx80(80, sign, exp, sig0, sig1,
1116                                                 &env->fp_status);
1117         }
1118     } else {
1119         floatx80 tmp, y, accum;
1120         bool asign, bsign;
1121         int32_t n, aexp, bexp;
1122         uint64_t asig0, asig1, asig2, bsig0, bsig1;
1123         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1124         signed char save_prec = env->fp_status.floatx80_rounding_precision;
1125         env->fp_status.float_rounding_mode = float_round_nearest_even;
1126         env->fp_status.floatx80_rounding_precision = 80;
1127 
1128         /* Find the nearest multiple of 1/32 to the argument.  */
1129         tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1130         n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1131         y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1132 
1133         if (floatx80_is_zero(y)) {
1134             /*
1135              * Use the value of 2^t - 1 from the table, to avoid
1136              * needing to special-case zero as a result of
1137              * multiplication below.
1138              */
1139             ST0 = f2xm1_table[n].t;
1140             set_float_exception_flags(float_flag_inexact, &env->fp_status);
1141             env->fp_status.float_rounding_mode = save_mode;
1142         } else {
1143             /*
1144              * Compute the lower parts of a polynomial expansion for
1145              * (2^y - 1) / y.
1146              */
1147             accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1148             accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1149             accum = floatx80_mul(accum, y, &env->fp_status);
1150             accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1151             accum = floatx80_mul(accum, y, &env->fp_status);
1152             accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1153             accum = floatx80_mul(accum, y, &env->fp_status);
1154             accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1155             accum = floatx80_mul(accum, y, &env->fp_status);
1156             accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1157             accum = floatx80_mul(accum, y, &env->fp_status);
1158             accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1159             accum = floatx80_mul(accum, y, &env->fp_status);
1160             accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1161 
1162             /*
1163              * The full polynomial expansion is f2xm1_coeff_0 + accum
1164              * (where accum has much lower magnitude, and so, in
1165              * particular, carry out of the addition is not possible).
1166              * (This expansion is only accurate to about 70 bits, not
1167              * 128 bits.)
1168              */
1169             aexp = extractFloatx80Exp(f2xm1_coeff_0);
1170             asign = extractFloatx80Sign(f2xm1_coeff_0);
1171             shift128RightJamming(extractFloatx80Frac(accum), 0,
1172                                  aexp - extractFloatx80Exp(accum),
1173                                  &asig0, &asig1);
1174             bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1175             bsig1 = 0;
1176             if (asign == extractFloatx80Sign(accum)) {
1177                 add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1178             } else {
1179                 sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1180             }
1181             /* And thus compute an approximation to 2^y - 1.  */
1182             mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1183                             &asig0, &asig1, &asig2);
1184             aexp += extractFloatx80Exp(y) - 0x3ffe;
1185             asign ^= extractFloatx80Sign(y);
1186             if (n != 32) {
1187                 /*
1188                  * Multiply this by the precomputed value of 2^t and
1189                  * add that of 2^t - 1.
1190                  */
1191                 mul128By64To192(asig0, asig1,
1192                                 extractFloatx80Frac(f2xm1_table[n].exp2),
1193                                 &asig0, &asig1, &asig2);
1194                 aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1195                 bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1196                 bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1197                 bsig1 = 0;
1198                 if (bexp < aexp) {
1199                     shift128RightJamming(bsig0, bsig1, aexp - bexp,
1200                                          &bsig0, &bsig1);
1201                 } else if (aexp < bexp) {
1202                     shift128RightJamming(asig0, asig1, bexp - aexp,
1203                                          &asig0, &asig1);
1204                     aexp = bexp;
1205                 }
1206                 /* The sign of 2^t - 1 is always that of the result.  */
1207                 bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1208                 if (asign == bsign) {
1209                     /* Avoid possible carry out of the addition.  */
1210                     shift128RightJamming(asig0, asig1, 1,
1211                                          &asig0, &asig1);
1212                     shift128RightJamming(bsig0, bsig1, 1,
1213                                          &bsig0, &bsig1);
1214                     ++aexp;
1215                     add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1216                 } else {
1217                     sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1218                     asign = bsign;
1219                 }
1220             }
1221             env->fp_status.float_rounding_mode = save_mode;
1222             /* This result is inexact.  */
1223             asig1 |= 1;
1224             ST0 = normalizeRoundAndPackFloatx80(80, asign, aexp, asig0, asig1,
1225                                                 &env->fp_status);
1226         }
1227 
1228         env->fp_status.floatx80_rounding_precision = save_prec;
1229     }
1230     merge_exception_flags(env, old_flags);
1231 }
1232 
1233 void helper_fptan(CPUX86State *env)
1234 {
1235     double fptemp = floatx80_to_double(env, ST0);
1236 
1237     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1238         env->fpus |= 0x400;
1239     } else {
1240         fptemp = tan(fptemp);
1241         ST0 = double_to_floatx80(env, fptemp);
1242         fpush(env);
1243         ST0 = floatx80_one;
1244         env->fpus &= ~0x400; /* C2 <-- 0 */
1245         /* the above code is for |arg| < 2**52 only */
1246     }
1247 }
1248 
/*
 * Values of pi/4, pi/2, 3pi/4 and pi, with 128-bit precision.  Each is
 * described by an exponent and the high and low 64-bit halves of its
 * significand.  pi/4, pi/2 and pi differ only by a power of two, so
 * they share a single significand and differ in the exponent alone;
 * 3pi/4 has a significand of its own.
 */
#define pi_sig_high     0xc90fdaa22168c234ULL
#define pi_sig_low      0xc4c6628b80dc1cd1ULL

#define pi_4_exp        0x3ffe
#define pi_4_sig_high   pi_sig_high
#define pi_4_sig_low    pi_sig_low

#define pi_2_exp        0x3fff
#define pi_2_sig_high   pi_sig_high
#define pi_2_sig_low    pi_sig_low

#define pi_34_exp       0x4000
#define pi_34_sig_high  0x96cbe3f9990e91a7ULL
#define pi_34_sig_low   0x9394c9e8a0a5159dULL

#define pi_exp          0x4000
1262 
/*
 * Minimax polynomial approximation to atan(x) for x in the interval
 * [-1/16, 1/16], using odd powers of x only:
 *
 *   atan(x) ~= x * (c0 + c1 * x^2 + c2 * x^4 + ... + c6 * x^12)
 *
 * The leading coefficient of this approximation is so close to exactly
 * 1 that, unlike some other approximations, no separate low part is
 * needed for it to reach sufficient accuracy.  The remaining values
 * lie near the Taylor coefficients -1/3, 1/5, ..., 1/13 without being
 * equal to them.
 *
 * Listed from the highest-order term down, which is the order in which
 * helper_fpatan() consumes them.
 */
#define fpatan_coeff_6 make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL) /* x^13 */
#define fpatan_coeff_5 make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL) /* x^11 */
#define fpatan_coeff_4 make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL) /* x^9 */
#define fpatan_coeff_3 make_floatx80(0xbffc, 0x92492491fbab2e66ULL) /* x^7 */
#define fpatan_coeff_2 make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL) /* x^5 */
#define fpatan_coeff_1 make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL) /* x^3 */
#define fpatan_coeff_0 make_floatx80(0x3fff, 0x8000000000000000ULL) /* x */
1278 
1279 struct fpatan_data {
1280     /* High and low parts of atan(x).  */
1281     floatx80 atan_high, atan_low;
1282 };
1283 
1284 static const struct fpatan_data fpatan_table[9] = {
1285     { floatx80_zero_init,
1286       floatx80_zero_init },
1287     { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
1288       make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
1289     { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
1290       make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
1291     { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
1292       make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
1293     { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
1294       make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
1295     { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
1296       make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
1297     { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
1298       make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
1299     { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
1300       make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
1301     { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
1302       make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
1303 };
1304 
1305 void helper_fpatan(CPUX86State *env)
1306 {
1307     uint8_t old_flags = save_exception_flags(env);
1308     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1309     int32_t arg0_exp = extractFloatx80Exp(ST0);
1310     bool arg0_sign = extractFloatx80Sign(ST0);
1311     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1312     int32_t arg1_exp = extractFloatx80Exp(ST1);
1313     bool arg1_sign = extractFloatx80Sign(ST1);
1314 
1315     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1316         float_raise(float_flag_invalid, &env->fp_status);
1317         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1318     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1319         float_raise(float_flag_invalid, &env->fp_status);
1320         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1321     } else if (floatx80_invalid_encoding(ST0) ||
1322                floatx80_invalid_encoding(ST1)) {
1323         float_raise(float_flag_invalid, &env->fp_status);
1324         ST1 = floatx80_default_nan(&env->fp_status);
1325     } else if (floatx80_is_any_nan(ST0)) {
1326         ST1 = ST0;
1327     } else if (floatx80_is_any_nan(ST1)) {
1328         /* Pass this NaN through.  */
1329     } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1330         /* Pass this zero through.  */
1331     } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1332                  arg0_exp - arg1_exp >= 80) &&
1333                !arg0_sign) {
1334         /*
1335          * Dividing ST1 by ST0 gives the correct result up to
1336          * rounding, and avoids spurious underflow exceptions that
1337          * might result from passing some small values through the
1338          * polynomial approximation, but if a finite nonzero result of
1339          * division is exact, the result of fpatan is still inexact
1340          * (and underflowing where appropriate).
1341          */
1342         signed char save_prec = env->fp_status.floatx80_rounding_precision;
1343         env->fp_status.floatx80_rounding_precision = 80;
1344         ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1345         env->fp_status.floatx80_rounding_precision = save_prec;
1346         if (!floatx80_is_zero(ST1) &&
1347             !(get_float_exception_flags(&env->fp_status) &
1348               float_flag_inexact)) {
1349             /*
1350              * The mathematical result is very slightly closer to zero
1351              * than this exact result.  Round a value with the
1352              * significand adjusted accordingly to get the correct
1353              * exceptions, and possibly an adjusted result depending
1354              * on the rounding mode.
1355              */
1356             uint64_t sig = extractFloatx80Frac(ST1);
1357             int32_t exp = extractFloatx80Exp(ST1);
1358             bool sign = extractFloatx80Sign(ST1);
1359             if (exp == 0) {
1360                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1361             }
1362             ST1 = normalizeRoundAndPackFloatx80(80, sign, exp, sig - 1,
1363                                                 -1, &env->fp_status);
1364         }
1365     } else {
1366         /* The result is inexact.  */
1367         bool rsign = arg1_sign;
1368         int32_t rexp;
1369         uint64_t rsig0, rsig1;
1370         if (floatx80_is_zero(ST1)) {
1371             /*
1372              * ST0 is negative.  The result is pi with the sign of
1373              * ST1.
1374              */
1375             rexp = pi_exp;
1376             rsig0 = pi_sig_high;
1377             rsig1 = pi_sig_low;
1378         } else if (floatx80_is_infinity(ST1)) {
1379             if (floatx80_is_infinity(ST0)) {
1380                 if (arg0_sign) {
1381                     rexp = pi_34_exp;
1382                     rsig0 = pi_34_sig_high;
1383                     rsig1 = pi_34_sig_low;
1384                 } else {
1385                     rexp = pi_4_exp;
1386                     rsig0 = pi_4_sig_high;
1387                     rsig1 = pi_4_sig_low;
1388                 }
1389             } else {
1390                 rexp = pi_2_exp;
1391                 rsig0 = pi_2_sig_high;
1392                 rsig1 = pi_2_sig_low;
1393             }
1394         } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1395             rexp = pi_2_exp;
1396             rsig0 = pi_2_sig_high;
1397             rsig1 = pi_2_sig_low;
1398         } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1399             /* ST0 is negative.  */
1400             rexp = pi_exp;
1401             rsig0 = pi_sig_high;
1402             rsig1 = pi_sig_low;
1403         } else {
1404             /*
1405              * ST0 and ST1 are finite, nonzero and with exponents not
1406              * too far apart.
1407              */
1408             int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1409             int32_t azexp, axexp;
1410             bool adj_sub, ysign, zsign;
1411             uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1412             uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1413             uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1414             uint64_t azsig0, azsig1;
1415             uint64_t azsig2, azsig3, axsig0, axsig1;
1416             floatx80 x8;
1417             FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1418             signed char save_prec = env->fp_status.floatx80_rounding_precision;
1419             env->fp_status.float_rounding_mode = float_round_nearest_even;
1420             env->fp_status.floatx80_rounding_precision = 80;
1421 
1422             if (arg0_exp == 0) {
1423                 normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1424             }
1425             if (arg1_exp == 0) {
1426                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1427             }
1428             if (arg0_exp > arg1_exp ||
1429                 (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1430                 /* Work with abs(ST1) / abs(ST0).  */
1431                 num_exp = arg1_exp;
1432                 num_sig = arg1_sig;
1433                 den_exp = arg0_exp;
1434                 den_sig = arg0_sig;
1435                 if (arg0_sign) {
1436                     /* The result is subtracted from pi.  */
1437                     adj_exp = pi_exp;
1438                     adj_sig0 = pi_sig_high;
1439                     adj_sig1 = pi_sig_low;
1440                     adj_sub = true;
1441                 } else {
1442                     /* The result is used as-is.  */
1443                     adj_exp = 0;
1444                     adj_sig0 = 0;
1445                     adj_sig1 = 0;
1446                     adj_sub = false;
1447                 }
1448             } else {
1449                 /* Work with abs(ST0) / abs(ST1).  */
1450                 num_exp = arg0_exp;
1451                 num_sig = arg0_sig;
1452                 den_exp = arg1_exp;
1453                 den_sig = arg1_sig;
1454                 /* The result is added to or subtracted from pi/2.  */
1455                 adj_exp = pi_2_exp;
1456                 adj_sig0 = pi_2_sig_high;
1457                 adj_sig1 = pi_2_sig_low;
1458                 adj_sub = !arg0_sign;
1459             }
1460 
1461             /*
1462              * Compute x = num/den, where 0 < x <= 1 and x is not too
1463              * small.
1464              */
1465             xexp = num_exp - den_exp + 0x3ffe;
1466             remsig0 = num_sig;
1467             remsig1 = 0;
1468             if (den_sig <= remsig0) {
1469                 shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1470                 ++xexp;
1471             }
1472             xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1473             mul64To128(den_sig, xsig0, &msig0, &msig1);
1474             sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1475             while ((int64_t) remsig0 < 0) {
1476                 --xsig0;
1477                 add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1478             }
1479             xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1480             /*
1481              * No need to correct any estimation error in xsig1; even
1482              * with such error, it is accurate enough.
1483              */
1484 
1485             /*
1486              * Split x as x = t + y, where t = n/8 is the nearest
1487              * multiple of 1/8 to x.
1488              */
1489             x8 = normalizeRoundAndPackFloatx80(80, false, xexp + 3, xsig0,
1490                                                xsig1, &env->fp_status);
1491             n = floatx80_to_int32(x8, &env->fp_status);
1492             if (n == 0) {
1493                 ysign = false;
1494                 yexp = xexp;
1495                 ysig0 = xsig0;
1496                 ysig1 = xsig1;
1497                 texp = 0;
1498                 tsig = 0;
1499             } else {
1500                 int shift = clz32(n) + 32;
1501                 texp = 0x403b - shift;
1502                 tsig = n;
1503                 tsig <<= shift;
1504                 if (texp == xexp) {
1505                     sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1506                     if ((int64_t) ysig0 >= 0) {
1507                         ysign = false;
1508                         if (ysig0 == 0) {
1509                             if (ysig1 == 0) {
1510                                 yexp = 0;
1511                             } else {
1512                                 shift = clz64(ysig1) + 64;
1513                                 yexp = xexp - shift;
1514                                 shift128Left(ysig0, ysig1, shift,
1515                                              &ysig0, &ysig1);
1516                             }
1517                         } else {
1518                             shift = clz64(ysig0);
1519                             yexp = xexp - shift;
1520                             shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1521                         }
1522                     } else {
1523                         ysign = true;
1524                         sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1525                         if (ysig0 == 0) {
1526                             shift = clz64(ysig1) + 64;
1527                         } else {
1528                             shift = clz64(ysig0);
1529                         }
1530                         yexp = xexp - shift;
1531                         shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1532                     }
1533                 } else {
1534                     /*
1535                      * t's exponent must be greater than x's because t
1536                      * is positive and the nearest multiple of 1/8 to
1537                      * x, and if x has a greater exponent, the power
1538                      * of 2 with that exponent is also a multiple of
1539                      * 1/8.
1540                      */
1541                     uint64_t usig0, usig1;
1542                     shift128RightJamming(xsig0, xsig1, texp - xexp,
1543                                          &usig0, &usig1);
1544                     ysign = true;
1545                     sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1546                     if (ysig0 == 0) {
1547                         shift = clz64(ysig1) + 64;
1548                     } else {
1549                         shift = clz64(ysig0);
1550                     }
1551                     yexp = texp - shift;
1552                     shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1553                 }
1554             }
1555 
1556             /*
1557              * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1558              * arctan(z).
1559              */
1560             zsign = ysign;
1561             if (texp == 0 || yexp == 0) {
1562                 zexp = yexp;
1563                 zsig0 = ysig0;
1564                 zsig1 = ysig1;
1565             } else {
1566                 /*
1567                  * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1568                  */
1569                 int32_t dexp = texp + xexp - 0x3ffe;
1570                 uint64_t dsig0, dsig1, dsig2;
1571                 mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1572                 /*
1573                  * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1574                  * bit).  Add 1 to produce the denominator 1+tx.
1575                  */
1576                 shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1577                                      &dsig0, &dsig1);
1578                 dsig0 |= 0x8000000000000000ULL;
1579                 zexp = yexp - 1;
1580                 remsig0 = ysig0;
1581                 remsig1 = ysig1;
1582                 remsig2 = 0;
1583                 if (dsig0 <= remsig0) {
1584                     shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1585                     ++zexp;
1586                 }
1587                 zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1588                 mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1589                 sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1590                        &remsig0, &remsig1, &remsig2);
1591                 while ((int64_t) remsig0 < 0) {
1592                     --zsig0;
1593                     add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1594                            &remsig0, &remsig1, &remsig2);
1595                 }
1596                 zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1597                 /* No need to correct any estimation error in zsig1.  */
1598             }
1599 
1600             if (zexp == 0) {
1601                 azexp = 0;
1602                 azsig0 = 0;
1603                 azsig1 = 0;
1604             } else {
1605                 floatx80 z2, accum;
1606                 uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1607                 /* Compute z^2.  */
1608                 mul128To256(zsig0, zsig1, zsig0, zsig1,
1609                             &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1610                 z2 = normalizeRoundAndPackFloatx80(80, false,
1611                                                    zexp + zexp - 0x3ffe,
1612                                                    z2sig0, z2sig1,
1613                                                    &env->fp_status);
1614 
1615                 /* Compute the lower parts of the polynomial expansion.  */
1616                 accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1617                 accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1618                 accum = floatx80_mul(accum, z2, &env->fp_status);
1619                 accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1620                 accum = floatx80_mul(accum, z2, &env->fp_status);
1621                 accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1622                 accum = floatx80_mul(accum, z2, &env->fp_status);
1623                 accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1624                 accum = floatx80_mul(accum, z2, &env->fp_status);
1625                 accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1626                 accum = floatx80_mul(accum, z2, &env->fp_status);
1627 
1628                 /*
1629                  * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1630                  * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1631                  */
1632                 aexp = extractFloatx80Exp(fpatan_coeff_0);
1633                 shift128RightJamming(extractFloatx80Frac(accum), 0,
1634                                      aexp - extractFloatx80Exp(accum),
1635                                      &asig0, &asig1);
1636                 sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1637                        &asig0, &asig1);
1638                 /* Multiply by z to compute arctan(z).  */
1639                 azexp = aexp + zexp - 0x3ffe;
1640                 mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1641                             &azsig2, &azsig3);
1642             }
1643 
1644             /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1645             if (texp == 0) {
1646                 /* z is positive.  */
1647                 axexp = azexp;
1648                 axsig0 = azsig0;
1649                 axsig1 = azsig1;
1650             } else {
1651                 bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1652                 int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1653                 uint64_t low_sig0 =
1654                     extractFloatx80Frac(fpatan_table[n].atan_low);
1655                 uint64_t low_sig1 = 0;
1656                 axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1657                 axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1658                 axsig1 = 0;
1659                 shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1660                                      &low_sig0, &low_sig1);
1661                 if (low_sign) {
1662                     sub128(axsig0, axsig1, low_sig0, low_sig1,
1663                            &axsig0, &axsig1);
1664                 } else {
1665                     add128(axsig0, axsig1, low_sig0, low_sig1,
1666                            &axsig0, &axsig1);
1667                 }
1668                 if (azexp >= axexp) {
1669                     shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1670                                          &axsig0, &axsig1);
1671                     axexp = azexp + 1;
1672                     shift128RightJamming(azsig0, azsig1, 1,
1673                                          &azsig0, &azsig1);
1674                 } else {
1675                     shift128RightJamming(axsig0, axsig1, 1,
1676                                          &axsig0, &axsig1);
1677                     shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1678                                          &azsig0, &azsig1);
1679                     ++axexp;
1680                 }
1681                 if (zsign) {
1682                     sub128(axsig0, axsig1, azsig0, azsig1,
1683                            &axsig0, &axsig1);
1684                 } else {
1685                     add128(axsig0, axsig1, azsig0, azsig1,
1686                            &axsig0, &axsig1);
1687                 }
1688             }
1689 
1690             if (adj_exp == 0) {
1691                 rexp = axexp;
1692                 rsig0 = axsig0;
1693                 rsig1 = axsig1;
1694             } else {
1695                 /*
1696                  * Add or subtract arctan(x) (exponent axexp,
1697                  * significand axsig0 and axsig1, positive, not
1698                  * necessarily normalized) to the number given by
1699                  * adj_exp, adj_sig0 and adj_sig1, according to
1700                  * adj_sub.
1701                  */
1702                 if (adj_exp >= axexp) {
1703                     shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1704                                          &axsig0, &axsig1);
1705                     rexp = adj_exp + 1;
1706                     shift128RightJamming(adj_sig0, adj_sig1, 1,
1707                                          &adj_sig0, &adj_sig1);
1708                 } else {
1709                     shift128RightJamming(axsig0, axsig1, 1,
1710                                          &axsig0, &axsig1);
1711                     shift128RightJamming(adj_sig0, adj_sig1,
1712                                          axexp - adj_exp + 1,
1713                                          &adj_sig0, &adj_sig1);
1714                     rexp = axexp + 1;
1715                 }
1716                 if (adj_sub) {
1717                     sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1718                            &rsig0, &rsig1);
1719                 } else {
1720                     add128(adj_sig0, adj_sig1, axsig0, axsig1,
1721                            &rsig0, &rsig1);
1722                 }
1723             }
1724 
1725             env->fp_status.float_rounding_mode = save_mode;
1726             env->fp_status.floatx80_rounding_precision = save_prec;
1727         }
1728         /* This result is inexact.  */
1729         rsig1 |= 1;
1730         ST1 = normalizeRoundAndPackFloatx80(80, rsign, rexp,
1731                                             rsig0, rsig1, &env->fp_status);
1732     }
1733 
1734     fpop(env);
1735     merge_exception_flags(env, old_flags);
1736 }
1737 
1738 void helper_fxtract(CPUX86State *env)
1739 {
1740     uint8_t old_flags = save_exception_flags(env);
1741     CPU_LDoubleU temp;
1742 
1743     temp.d = ST0;
1744 
1745     if (floatx80_is_zero(ST0)) {
1746         /* Easy way to generate -inf and raising division by 0 exception */
1747         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1748                            &env->fp_status);
1749         fpush(env);
1750         ST0 = temp.d;
1751     } else if (floatx80_invalid_encoding(ST0)) {
1752         float_raise(float_flag_invalid, &env->fp_status);
1753         ST0 = floatx80_default_nan(&env->fp_status);
1754         fpush(env);
1755         ST0 = ST1;
1756     } else if (floatx80_is_any_nan(ST0)) {
1757         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1758             float_raise(float_flag_invalid, &env->fp_status);
1759             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1760         }
1761         fpush(env);
1762         ST0 = ST1;
1763     } else if (floatx80_is_infinity(ST0)) {
1764         fpush(env);
1765         ST0 = ST1;
1766         ST1 = floatx80_infinity;
1767     } else {
1768         int expdif;
1769 
1770         if (EXPD(temp) == 0) {
1771             int shift = clz64(temp.l.lower);
1772             temp.l.lower <<= shift;
1773             expdif = 1 - EXPBIAS - shift;
1774             float_raise(float_flag_input_denormal, &env->fp_status);
1775         } else {
1776             expdif = EXPD(temp) - EXPBIAS;
1777         }
1778         /* DP exponent bias */
1779         ST0 = int32_to_floatx80(expdif, &env->fp_status);
1780         fpush(env);
1781         BIASEXPONENT(temp);
1782         ST0 = temp.d;
1783     }
1784     merge_exception_flags(env, old_flags);
1785 }
1786 
1787 static void helper_fprem_common(CPUX86State *env, bool mod)
1788 {
1789     uint8_t old_flags = save_exception_flags(env);
1790     uint64_t quotient;
1791     CPU_LDoubleU temp0, temp1;
1792     int exp0, exp1, expdiff;
1793 
1794     temp0.d = ST0;
1795     temp1.d = ST1;
1796     exp0 = EXPD(temp0);
1797     exp1 = EXPD(temp1);
1798 
1799     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1800     if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1801         exp0 == 0x7fff || exp1 == 0x7fff ||
1802         floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1803         ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1804     } else {
1805         if (exp0 == 0) {
1806             exp0 = 1 - clz64(temp0.l.lower);
1807         }
1808         if (exp1 == 0) {
1809             exp1 = 1 - clz64(temp1.l.lower);
1810         }
1811         expdiff = exp0 - exp1;
1812         if (expdiff < 64) {
1813             ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1814             env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1815             env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1816             env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1817         } else {
1818             /*
1819              * Partial remainder.  This choice of how many bits to
1820              * process at once is specified in AMD instruction set
1821              * manuals, and empirically is followed by Intel
1822              * processors as well; it ensures that the final remainder
1823              * operation in a loop does produce the correct low three
1824              * bits of the quotient.  AMD manuals specify that the
1825              * flags other than C2 are cleared, and empirically Intel
1826              * processors clear them as well.
1827              */
1828             int n = 32 + (expdiff % 32);
1829             temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1830             ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1831             env->fpus |= 0x400;  /* C2 <-- 1 */
1832         }
1833     }
1834     merge_exception_flags(env, old_flags);
1835 }
1836 
1837 void helper_fprem1(CPUX86State *env)
1838 {
1839     helper_fprem_common(env, false);
1840 }
1841 
1842 void helper_fprem(CPUX86State *env)
1843 {
1844     helper_fprem_common(env, true);
1845 }
1846 
/*
 * 128-bit significand of log2(e), as high and low 64-bit halves.
 * helper_fyl2xp1 multiplies by this directly when ST0 is tiny.
 */
#define log2_e_sig_high 0xb8aa3b295c17f0bbULL
#define log2_e_sig_low 0xbe87fed0691d3e89ULL

/*
 * Polynomial coefficients for an approximation to log2((1+x)/(1-x)),
 * with only odd powers of x used, for x in the interval [2*sqrt(2)-3,
 * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the
 * interval [sqrt(2)/2, sqrt(2)].
 *
 * The leading coefficient is kept in two pieces: fyl2x_coeff_0 is the
 * 64-bit value and fyl2x_coeff_0_low is a much smaller negative
 * correction (note the sign bit in 0xbfbf) that helper_fyl2x_common
 * folds into the accumulated higher-order terms before the final
 * extra-precision addition.
 */
#define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
#define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
#define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
#define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
#define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
#define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
#define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
#define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
#define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
#define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
#define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1868 
1869 /*
1870  * Compute an approximation of log2(1+arg), where 1+arg is in the
1871  * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
1872  * function is called, rounding precision is set to 80 and the
1873  * round-to-nearest mode is in effect.  arg must not be exactly zero,
1874  * and must not be so close to zero that underflow might occur.
1875  */
1876 static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
1877                                 uint64_t *sig0, uint64_t *sig1)
1878 {
1879     uint64_t arg0_sig = extractFloatx80Frac(arg);
1880     int32_t arg0_exp = extractFloatx80Exp(arg);
1881     bool arg0_sign = extractFloatx80Sign(arg);
1882     bool asign;
1883     int32_t dexp, texp, aexp;
1884     uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
1885     uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
1886     uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
1887     floatx80 t2, accum;
1888 
1889     /*
1890      * Compute an approximation of arg/(2+arg), with extra precision,
1891      * as the argument to a polynomial approximation.  The extra
1892      * precision is only needed for the first term of the
1893      * approximation, with subsequent terms being significantly
1894      * smaller; the approximation only uses odd exponents, and the
1895      * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
1896      */
1897     if (arg0_sign) {
1898         dexp = 0x3fff;
1899         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1900         sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
1901     } else {
1902         dexp = 0x4000;
1903         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1904         dsig0 |= 0x8000000000000000ULL;
1905     }
1906     texp = arg0_exp - dexp + 0x3ffe;
1907     rsig0 = arg0_sig;
1908     rsig1 = 0;
1909     rsig2 = 0;
1910     if (dsig0 <= rsig0) {
1911         shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
1912         ++texp;
1913     }
1914     tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
1915     mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
1916     sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
1917            &rsig0, &rsig1, &rsig2);
1918     while ((int64_t) rsig0 < 0) {
1919         --tsig0;
1920         add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
1921                &rsig0, &rsig1, &rsig2);
1922     }
1923     tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
1924     /*
1925      * No need to correct any estimation error in tsig1; even with
1926      * such error, it is accurate enough.  Now compute the square of
1927      * that approximation.
1928      */
1929     mul128To256(tsig0, tsig1, tsig0, tsig1,
1930                 &t2sig0, &t2sig1, &t2sig2, &t2sig3);
1931     t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe,
1932                                        t2sig0, t2sig1, &env->fp_status);
1933 
1934     /* Compute the lower parts of the polynomial expansion.  */
1935     accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
1936     accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
1937     accum = floatx80_mul(accum, t2, &env->fp_status);
1938     accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
1939     accum = floatx80_mul(accum, t2, &env->fp_status);
1940     accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
1941     accum = floatx80_mul(accum, t2, &env->fp_status);
1942     accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
1943     accum = floatx80_mul(accum, t2, &env->fp_status);
1944     accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
1945     accum = floatx80_mul(accum, t2, &env->fp_status);
1946     accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
1947     accum = floatx80_mul(accum, t2, &env->fp_status);
1948     accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
1949     accum = floatx80_mul(accum, t2, &env->fp_status);
1950     accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
1951     accum = floatx80_mul(accum, t2, &env->fp_status);
1952     accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
1953 
1954     /*
1955      * The full polynomial expansion is fyl2x_coeff_0 + accum (where
1956      * accum has much lower magnitude, and so, in particular, carry
1957      * out of the addition is not possible), multiplied by t.  (This
1958      * expansion is only accurate to about 70 bits, not 128 bits.)
1959      */
1960     aexp = extractFloatx80Exp(fyl2x_coeff_0);
1961     asign = extractFloatx80Sign(fyl2x_coeff_0);
1962     shift128RightJamming(extractFloatx80Frac(accum), 0,
1963                          aexp - extractFloatx80Exp(accum),
1964                          &asig0, &asig1);
1965     bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
1966     bsig1 = 0;
1967     if (asign == extractFloatx80Sign(accum)) {
1968         add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1969     } else {
1970         sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1971     }
1972     /* Multiply by t to compute the required result.  */
1973     mul128To256(asig0, asig1, tsig0, tsig1,
1974                 &asig0, &asig1, &asig2, &asig3);
1975     aexp += texp - 0x3ffe;
1976     *exp = aexp;
1977     *sig0 = asig0;
1978     *sig1 = asig1;
1979 }
1980 
1981 void helper_fyl2xp1(CPUX86State *env)
1982 {
1983     uint8_t old_flags = save_exception_flags(env);
1984     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1985     int32_t arg0_exp = extractFloatx80Exp(ST0);
1986     bool arg0_sign = extractFloatx80Sign(ST0);
1987     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1988     int32_t arg1_exp = extractFloatx80Exp(ST1);
1989     bool arg1_sign = extractFloatx80Sign(ST1);
1990 
1991     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1992         float_raise(float_flag_invalid, &env->fp_status);
1993         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1994     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1995         float_raise(float_flag_invalid, &env->fp_status);
1996         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1997     } else if (floatx80_invalid_encoding(ST0) ||
1998                floatx80_invalid_encoding(ST1)) {
1999         float_raise(float_flag_invalid, &env->fp_status);
2000         ST1 = floatx80_default_nan(&env->fp_status);
2001     } else if (floatx80_is_any_nan(ST0)) {
2002         ST1 = ST0;
2003     } else if (floatx80_is_any_nan(ST1)) {
2004         /* Pass this NaN through.  */
2005     } else if (arg0_exp > 0x3ffd ||
2006                (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
2007                                                   0x95f619980c4336f7ULL :
2008                                                   0xd413cccfe7799211ULL))) {
2009         /*
2010          * Out of range for the instruction (ST0 must have absolute
2011          * value less than 1 - sqrt(2)/2 = 0.292..., according to
2012          * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
2013          * to sqrt(2) - 1, which we allow here), treat as invalid.
2014          */
2015         float_raise(float_flag_invalid, &env->fp_status);
2016         ST1 = floatx80_default_nan(&env->fp_status);
2017     } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
2018                arg1_exp == 0x7fff) {
2019         /*
2020          * One argument is zero, or multiplying by infinity; correct
2021          * result is exact and can be obtained by multiplying the
2022          * arguments.
2023          */
2024         ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
2025     } else if (arg0_exp < 0x3fb0) {
2026         /*
2027          * Multiplying both arguments and an extra-precision version
2028          * of log2(e) is sufficiently precise.
2029          */
2030         uint64_t sig0, sig1, sig2;
2031         int32_t exp;
2032         if (arg0_exp == 0) {
2033             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2034         }
2035         if (arg1_exp == 0) {
2036             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2037         }
2038         mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2039                         &sig0, &sig1, &sig2);
2040         exp = arg0_exp + 1;
2041         mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2042         exp += arg1_exp - 0x3ffe;
2043         /* This result is inexact.  */
2044         sig1 |= 1;
2045         ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, exp,
2046                                             sig0, sig1, &env->fp_status);
2047     } else {
2048         int32_t aexp;
2049         uint64_t asig0, asig1, asig2;
2050         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2051         signed char save_prec = env->fp_status.floatx80_rounding_precision;
2052         env->fp_status.float_rounding_mode = float_round_nearest_even;
2053         env->fp_status.floatx80_rounding_precision = 80;
2054 
2055         helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2056         /*
2057          * Multiply by the second argument to compute the required
2058          * result.
2059          */
2060         if (arg1_exp == 0) {
2061             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2062         }
2063         mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2064         aexp += arg1_exp - 0x3ffe;
2065         /* This result is inexact.  */
2066         asig1 |= 1;
2067         env->fp_status.float_rounding_mode = save_mode;
2068         ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, aexp,
2069                                             asig0, asig1, &env->fp_status);
2070         env->fp_status.floatx80_rounding_precision = save_prec;
2071     }
2072     fpop(env);
2073     merge_exception_flags(env, old_flags);
2074 }
2075 
2076 void helper_fyl2x(CPUX86State *env)
2077 {
2078     uint8_t old_flags = save_exception_flags(env);
2079     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2080     int32_t arg0_exp = extractFloatx80Exp(ST0);
2081     bool arg0_sign = extractFloatx80Sign(ST0);
2082     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2083     int32_t arg1_exp = extractFloatx80Exp(ST1);
2084     bool arg1_sign = extractFloatx80Sign(ST1);
2085 
2086     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2087         float_raise(float_flag_invalid, &env->fp_status);
2088         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2089     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2090         float_raise(float_flag_invalid, &env->fp_status);
2091         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2092     } else if (floatx80_invalid_encoding(ST0) ||
2093                floatx80_invalid_encoding(ST1)) {
2094         float_raise(float_flag_invalid, &env->fp_status);
2095         ST1 = floatx80_default_nan(&env->fp_status);
2096     } else if (floatx80_is_any_nan(ST0)) {
2097         ST1 = ST0;
2098     } else if (floatx80_is_any_nan(ST1)) {
2099         /* Pass this NaN through.  */
2100     } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2101         float_raise(float_flag_invalid, &env->fp_status);
2102         ST1 = floatx80_default_nan(&env->fp_status);
2103     } else if (floatx80_is_infinity(ST1)) {
2104         FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2105                                              &env->fp_status);
2106         switch (cmp) {
2107         case float_relation_less:
2108             ST1 = floatx80_chs(ST1);
2109             break;
2110         case float_relation_greater:
2111             /* Result is infinity of the same sign as ST1.  */
2112             break;
2113         default:
2114             float_raise(float_flag_invalid, &env->fp_status);
2115             ST1 = floatx80_default_nan(&env->fp_status);
2116             break;
2117         }
2118     } else if (floatx80_is_infinity(ST0)) {
2119         if (floatx80_is_zero(ST1)) {
2120             float_raise(float_flag_invalid, &env->fp_status);
2121             ST1 = floatx80_default_nan(&env->fp_status);
2122         } else if (arg1_sign) {
2123             ST1 = floatx80_chs(ST0);
2124         } else {
2125             ST1 = ST0;
2126         }
2127     } else if (floatx80_is_zero(ST0)) {
2128         if (floatx80_is_zero(ST1)) {
2129             float_raise(float_flag_invalid, &env->fp_status);
2130             ST1 = floatx80_default_nan(&env->fp_status);
2131         } else {
2132             /* Result is infinity with opposite sign to ST1.  */
2133             float_raise(float_flag_divbyzero, &env->fp_status);
2134             ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2135                                 0x8000000000000000ULL);
2136         }
2137     } else if (floatx80_is_zero(ST1)) {
2138         if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2139             ST1 = floatx80_chs(ST1);
2140         }
2141         /* Otherwise, ST1 is already the correct result.  */
2142     } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2143         if (arg1_sign) {
2144             ST1 = floatx80_chs(floatx80_zero);
2145         } else {
2146             ST1 = floatx80_zero;
2147         }
2148     } else {
2149         int32_t int_exp;
2150         floatx80 arg0_m1;
2151         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2152         signed char save_prec = env->fp_status.floatx80_rounding_precision;
2153         env->fp_status.float_rounding_mode = float_round_nearest_even;
2154         env->fp_status.floatx80_rounding_precision = 80;
2155 
2156         if (arg0_exp == 0) {
2157             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2158         }
2159         if (arg1_exp == 0) {
2160             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2161         }
2162         int_exp = arg0_exp - 0x3fff;
2163         if (arg0_sig > 0xb504f333f9de6484ULL) {
2164             ++int_exp;
2165         }
2166         arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2167                                                &env->fp_status),
2168                                floatx80_one, &env->fp_status);
2169         if (floatx80_is_zero(arg0_m1)) {
2170             /* Exact power of 2; multiply by ST1.  */
2171             env->fp_status.float_rounding_mode = save_mode;
2172             ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2173                                ST1, &env->fp_status);
2174         } else {
2175             bool asign = extractFloatx80Sign(arg0_m1);
2176             int32_t aexp;
2177             uint64_t asig0, asig1, asig2;
2178             helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2179             if (int_exp != 0) {
2180                 bool isign = (int_exp < 0);
2181                 int32_t iexp;
2182                 uint64_t isig;
2183                 int shift;
2184                 int_exp = isign ? -int_exp : int_exp;
2185                 shift = clz32(int_exp) + 32;
2186                 isig = int_exp;
2187                 isig <<= shift;
2188                 iexp = 0x403e - shift;
2189                 shift128RightJamming(asig0, asig1, iexp - aexp,
2190                                      &asig0, &asig1);
2191                 if (asign == isign) {
2192                     add128(isig, 0, asig0, asig1, &asig0, &asig1);
2193                 } else {
2194                     sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2195                 }
2196                 aexp = iexp;
2197                 asign = isign;
2198             }
2199             /*
2200              * Multiply by the second argument to compute the required
2201              * result.
2202              */
2203             if (arg1_exp == 0) {
2204                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2205             }
2206             mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2207             aexp += arg1_exp - 0x3ffe;
2208             /* This result is inexact.  */
2209             asig1 |= 1;
2210             env->fp_status.float_rounding_mode = save_mode;
2211             ST1 = normalizeRoundAndPackFloatx80(80, asign ^ arg1_sign, aexp,
2212                                                 asig0, asig1, &env->fp_status);
2213         }
2214 
2215         env->fp_status.floatx80_rounding_precision = save_prec;
2216     }
2217     fpop(env);
2218     merge_exception_flags(env, old_flags);
2219 }
2220 
2221 void helper_fsqrt(CPUX86State *env)
2222 {
2223     uint8_t old_flags = save_exception_flags(env);
2224     if (floatx80_is_neg(ST0)) {
2225         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2226         env->fpus |= 0x400;
2227     }
2228     ST0 = floatx80_sqrt(ST0, &env->fp_status);
2229     merge_exception_flags(env, old_flags);
2230 }
2231 
2232 void helper_fsincos(CPUX86State *env)
2233 {
2234     double fptemp = floatx80_to_double(env, ST0);
2235 
2236     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2237         env->fpus |= 0x400;
2238     } else {
2239         ST0 = double_to_floatx80(env, sin(fptemp));
2240         fpush(env);
2241         ST0 = double_to_floatx80(env, cos(fptemp));
2242         env->fpus &= ~0x400;  /* C2 <-- 0 */
2243         /* the above code is for |arg| < 2**63 only */
2244     }
2245 }
2246 
2247 void helper_frndint(CPUX86State *env)
2248 {
2249     uint8_t old_flags = save_exception_flags(env);
2250     ST0 = floatx80_round_to_int(ST0, &env->fp_status);
2251     merge_exception_flags(env, old_flags);
2252 }
2253 
2254 void helper_fscale(CPUX86State *env)
2255 {
2256     uint8_t old_flags = save_exception_flags(env);
2257     if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
2258         float_raise(float_flag_invalid, &env->fp_status);
2259         ST0 = floatx80_default_nan(&env->fp_status);
2260     } else if (floatx80_is_any_nan(ST1)) {
2261         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2262             float_raise(float_flag_invalid, &env->fp_status);
2263         }
2264         ST0 = ST1;
2265         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2266             float_raise(float_flag_invalid, &env->fp_status);
2267             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2268         }
2269     } else if (floatx80_is_infinity(ST1) &&
2270                !floatx80_invalid_encoding(ST0) &&
2271                !floatx80_is_any_nan(ST0)) {
2272         if (floatx80_is_neg(ST1)) {
2273             if (floatx80_is_infinity(ST0)) {
2274                 float_raise(float_flag_invalid, &env->fp_status);
2275                 ST0 = floatx80_default_nan(&env->fp_status);
2276             } else {
2277                 ST0 = (floatx80_is_neg(ST0) ?
2278                        floatx80_chs(floatx80_zero) :
2279                        floatx80_zero);
2280             }
2281         } else {
2282             if (floatx80_is_zero(ST0)) {
2283                 float_raise(float_flag_invalid, &env->fp_status);
2284                 ST0 = floatx80_default_nan(&env->fp_status);
2285             } else {
2286                 ST0 = (floatx80_is_neg(ST0) ?
2287                        floatx80_chs(floatx80_infinity) :
2288                        floatx80_infinity);
2289             }
2290         }
2291     } else {
2292         int n;
2293         signed char save = env->fp_status.floatx80_rounding_precision;
2294         uint8_t save_flags = get_float_exception_flags(&env->fp_status);
2295         set_float_exception_flags(0, &env->fp_status);
2296         n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2297         set_float_exception_flags(save_flags, &env->fp_status);
2298         env->fp_status.floatx80_rounding_precision = 80;
2299         ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2300         env->fp_status.floatx80_rounding_precision = save;
2301     }
2302     merge_exception_flags(env, old_flags);
2303 }
2304 
2305 void helper_fsin(CPUX86State *env)
2306 {
2307     double fptemp = floatx80_to_double(env, ST0);
2308 
2309     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2310         env->fpus |= 0x400;
2311     } else {
2312         ST0 = double_to_floatx80(env, sin(fptemp));
2313         env->fpus &= ~0x400;  /* C2 <-- 0 */
2314         /* the above code is for |arg| < 2**53 only */
2315     }
2316 }
2317 
2318 void helper_fcos(CPUX86State *env)
2319 {
2320     double fptemp = floatx80_to_double(env, ST0);
2321 
2322     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2323         env->fpus |= 0x400;
2324     } else {
2325         ST0 = double_to_floatx80(env, cos(fptemp));
2326         env->fpus &= ~0x400;  /* C2 <-- 0 */
2327         /* the above code is for |arg| < 2**63 only */
2328     }
2329 }
2330 
2331 void helper_fxam_ST0(CPUX86State *env)
2332 {
2333     CPU_LDoubleU temp;
2334     int expdif;
2335 
2336     temp.d = ST0;
2337 
2338     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2339     if (SIGND(temp)) {
2340         env->fpus |= 0x200; /* C1 <-- 1 */
2341     }
2342 
2343     if (env->fptags[env->fpstt]) {
2344         env->fpus |= 0x4100; /* Empty */
2345         return;
2346     }
2347 
2348     expdif = EXPD(temp);
2349     if (expdif == MAXEXPD) {
2350         if (MANTD(temp) == 0x8000000000000000ULL) {
2351             env->fpus |= 0x500; /* Infinity */
2352         } else if (MANTD(temp) & 0x8000000000000000ULL) {
2353             env->fpus |= 0x100; /* NaN */
2354         }
2355     } else if (expdif == 0) {
2356         if (MANTD(temp) == 0) {
2357             env->fpus |=  0x4000; /* Zero */
2358         } else {
2359             env->fpus |= 0x4400; /* Denormal */
2360         }
2361     } else if (MANTD(temp) & 0x8000000000000000ULL) {
2362         env->fpus |= 0x400;
2363     }
2364 }
2365 
2366 static void do_fstenv(CPUX86State *env, target_ulong ptr, int data32,
2367                       uintptr_t retaddr)
2368 {
2369     int fpus, fptag, exp, i;
2370     uint64_t mant;
2371     CPU_LDoubleU tmp;
2372 
2373     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2374     fptag = 0;
2375     for (i = 7; i >= 0; i--) {
2376         fptag <<= 2;
2377         if (env->fptags[i]) {
2378             fptag |= 3;
2379         } else {
2380             tmp.d = env->fpregs[i].d;
2381             exp = EXPD(tmp);
2382             mant = MANTD(tmp);
2383             if (exp == 0 && mant == 0) {
2384                 /* zero */
2385                 fptag |= 1;
2386             } else if (exp == 0 || exp == MAXEXPD
2387                        || (mant & (1LL << 63)) == 0) {
2388                 /* NaNs, infinity, denormal */
2389                 fptag |= 2;
2390             }
2391         }
2392     }
2393     if (data32) {
2394         /* 32 bit */
2395         cpu_stl_data_ra(env, ptr, env->fpuc, retaddr);
2396         cpu_stl_data_ra(env, ptr + 4, fpus, retaddr);
2397         cpu_stl_data_ra(env, ptr + 8, fptag, retaddr);
2398         cpu_stl_data_ra(env, ptr + 12, 0, retaddr); /* fpip */
2399         cpu_stl_data_ra(env, ptr + 16, 0, retaddr); /* fpcs */
2400         cpu_stl_data_ra(env, ptr + 20, 0, retaddr); /* fpoo */
2401         cpu_stl_data_ra(env, ptr + 24, 0, retaddr); /* fpos */
2402     } else {
2403         /* 16 bit */
2404         cpu_stw_data_ra(env, ptr, env->fpuc, retaddr);
2405         cpu_stw_data_ra(env, ptr + 2, fpus, retaddr);
2406         cpu_stw_data_ra(env, ptr + 4, fptag, retaddr);
2407         cpu_stw_data_ra(env, ptr + 6, 0, retaddr);
2408         cpu_stw_data_ra(env, ptr + 8, 0, retaddr);
2409         cpu_stw_data_ra(env, ptr + 10, 0, retaddr);
2410         cpu_stw_data_ra(env, ptr + 12, 0, retaddr);
2411     }
2412 }
2413 
2414 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2415 {
2416     do_fstenv(env, ptr, data32, GETPC());
2417 }
2418 
2419 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2420 {
2421     env->fpstt = (fpus >> 11) & 7;
2422     env->fpus = fpus & ~0x3800 & ~FPUS_B;
2423     env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2424 #if !defined(CONFIG_USER_ONLY)
2425     if (!(env->fpus & FPUS_SE)) {
2426         /*
2427          * Here the processor deasserts FERR#; in response, the chipset deasserts
2428          * IGNNE#.
2429          */
2430         cpu_clear_ignne();
2431     }
2432 #endif
2433 }
2434 
2435 static void do_fldenv(CPUX86State *env, target_ulong ptr, int data32,
2436                       uintptr_t retaddr)
2437 {
2438     int i, fpus, fptag;
2439 
2440     if (data32) {
2441         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2442         fpus = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2443         fptag = cpu_lduw_data_ra(env, ptr + 8, retaddr);
2444     } else {
2445         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2446         fpus = cpu_lduw_data_ra(env, ptr + 2, retaddr);
2447         fptag = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2448     }
2449     cpu_set_fpus(env, fpus);
2450     for (i = 0; i < 8; i++) {
2451         env->fptags[i] = ((fptag & 3) == 3);
2452         fptag >>= 2;
2453     }
2454 }
2455 
2456 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2457 {
2458     do_fldenv(env, ptr, data32, GETPC());
2459 }
2460 
2461 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2462 {
2463     floatx80 tmp;
2464     int i;
2465 
2466     do_fstenv(env, ptr, data32, GETPC());
2467 
2468     ptr += (14 << data32);
2469     for (i = 0; i < 8; i++) {
2470         tmp = ST(i);
2471         helper_fstt(env, tmp, ptr, GETPC());
2472         ptr += 10;
2473     }
2474 
2475     /* fninit */
2476     env->fpus = 0;
2477     env->fpstt = 0;
2478     cpu_set_fpuc(env, 0x37f);
2479     env->fptags[0] = 1;
2480     env->fptags[1] = 1;
2481     env->fptags[2] = 1;
2482     env->fptags[3] = 1;
2483     env->fptags[4] = 1;
2484     env->fptags[5] = 1;
2485     env->fptags[6] = 1;
2486     env->fptags[7] = 1;
2487 }
2488 
2489 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2490 {
2491     floatx80 tmp;
2492     int i;
2493 
2494     do_fldenv(env, ptr, data32, GETPC());
2495     ptr += (14 << data32);
2496 
2497     for (i = 0; i < 8; i++) {
2498         tmp = helper_fldt(env, ptr, GETPC());
2499         ST(i) = tmp;
2500         ptr += 10;
2501     }
2502 }
2503 
2504 #if defined(CONFIG_USER_ONLY)
2505 void cpu_x86_fsave(CPUX86State *env, target_ulong ptr, int data32)
2506 {
2507     helper_fsave(env, ptr, data32);
2508 }
2509 
2510 void cpu_x86_frstor(CPUX86State *env, target_ulong ptr, int data32)
2511 {
2512     helper_frstor(env, ptr, data32);
2513 }
2514 #endif
2515 
2516 #define XO(X)  offsetof(X86XSaveArea, X)
2517 
2518 static void do_xsave_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2519 {
2520     int fpus, fptag, i;
2521     target_ulong addr;
2522 
2523     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2524     fptag = 0;
2525     for (i = 0; i < 8; i++) {
2526         fptag |= (env->fptags[i] << i);
2527     }
2528 
2529     cpu_stw_data_ra(env, ptr + XO(legacy.fcw), env->fpuc, ra);
2530     cpu_stw_data_ra(env, ptr + XO(legacy.fsw), fpus, ra);
2531     cpu_stw_data_ra(env, ptr + XO(legacy.ftw), fptag ^ 0xff, ra);
2532 
2533     /* In 32-bit mode this is eip, sel, dp, sel.
2534        In 64-bit mode this is rip, rdp.
2535        But in either case we don't write actual data, just zeros.  */
2536     cpu_stq_data_ra(env, ptr + XO(legacy.fpip), 0, ra); /* eip+sel; rip */
2537     cpu_stq_data_ra(env, ptr + XO(legacy.fpdp), 0, ra); /* edp+sel; rdp */
2538 
2539     addr = ptr + XO(legacy.fpregs);
2540     for (i = 0; i < 8; i++) {
2541         floatx80 tmp = ST(i);
2542         helper_fstt(env, tmp, addr, ra);
2543         addr += 16;
2544     }
2545 }
2546 
2547 static void do_xsave_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2548 {
2549     update_mxcsr_from_sse_status(env);
2550     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr), env->mxcsr, ra);
2551     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr_mask), 0x0000ffff, ra);
2552 }
2553 
2554 static void do_xsave_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2555 {
2556     int i, nb_xmm_regs;
2557     target_ulong addr;
2558 
2559     if (env->hflags & HF_CS64_MASK) {
2560         nb_xmm_regs = 16;
2561     } else {
2562         nb_xmm_regs = 8;
2563     }
2564 
2565     addr = ptr + XO(legacy.xmm_regs);
2566     for (i = 0; i < nb_xmm_regs; i++) {
2567         cpu_stq_data_ra(env, addr, env->xmm_regs[i].ZMM_Q(0), ra);
2568         cpu_stq_data_ra(env, addr + 8, env->xmm_regs[i].ZMM_Q(1), ra);
2569         addr += 16;
2570     }
2571 }
2572 
2573 static void do_xsave_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2574 {
2575     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2576     int i;
2577 
2578     for (i = 0; i < 4; i++, addr += 16) {
2579         cpu_stq_data_ra(env, addr, env->bnd_regs[i].lb, ra);
2580         cpu_stq_data_ra(env, addr + 8, env->bnd_regs[i].ub, ra);
2581     }
2582 }
2583 
2584 static void do_xsave_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2585 {
2586     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2587                     env->bndcs_regs.cfgu, ra);
2588     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2589                     env->bndcs_regs.sts, ra);
2590 }
2591 
2592 static void do_xsave_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2593 {
2594     cpu_stq_data_ra(env, ptr, env->pkru, ra);
2595 }
2596 
2597 void helper_fxsave(CPUX86State *env, target_ulong ptr)
2598 {
2599     uintptr_t ra = GETPC();
2600 
2601     /* The operand must be 16 byte aligned */
2602     if (ptr & 0xf) {
2603         raise_exception_ra(env, EXCP0D_GPF, ra);
2604     }
2605 
2606     do_xsave_fpu(env, ptr, ra);
2607 
2608     if (env->cr[4] & CR4_OSFXSR_MASK) {
2609         do_xsave_mxcsr(env, ptr, ra);
2610         /* Fast FXSAVE leaves out the XMM registers */
2611         if (!(env->efer & MSR_EFER_FFXSR)
2612             || (env->hflags & HF_CPL_MASK)
2613             || !(env->hflags & HF_LMA_MASK)) {
2614             do_xsave_sse(env, ptr, ra);
2615         }
2616     }
2617 }
2618 
2619 static uint64_t get_xinuse(CPUX86State *env)
2620 {
2621     uint64_t inuse = -1;
2622 
2623     /* For the most part, we don't track XINUSE.  We could calculate it
2624        here for all components, but it's probably less work to simply
2625        indicate in use.  That said, the state of BNDREGS is important
2626        enough to track in HFLAGS, so we might as well use that here.  */
2627     if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2628        inuse &= ~XSTATE_BNDREGS_MASK;
2629     }
2630     return inuse;
2631 }
2632 
2633 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2634                      uint64_t inuse, uint64_t opt, uintptr_t ra)
2635 {
2636     uint64_t old_bv, new_bv;
2637 
2638     /* The OS must have enabled XSAVE.  */
2639     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2640         raise_exception_ra(env, EXCP06_ILLOP, ra);
2641     }
2642 
2643     /* The operand must be 64 byte aligned.  */
2644     if (ptr & 63) {
2645         raise_exception_ra(env, EXCP0D_GPF, ra);
2646     }
2647 
2648     /* Never save anything not enabled by XCR0.  */
2649     rfbm &= env->xcr0;
2650     opt &= rfbm;
2651 
2652     if (opt & XSTATE_FP_MASK) {
2653         do_xsave_fpu(env, ptr, ra);
2654     }
2655     if (rfbm & XSTATE_SSE_MASK) {
2656         /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2657         do_xsave_mxcsr(env, ptr, ra);
2658     }
2659     if (opt & XSTATE_SSE_MASK) {
2660         do_xsave_sse(env, ptr, ra);
2661     }
2662     if (opt & XSTATE_BNDREGS_MASK) {
2663         do_xsave_bndregs(env, ptr + XO(bndreg_state), ra);
2664     }
2665     if (opt & XSTATE_BNDCSR_MASK) {
2666         do_xsave_bndcsr(env, ptr + XO(bndcsr_state), ra);
2667     }
2668     if (opt & XSTATE_PKRU_MASK) {
2669         do_xsave_pkru(env, ptr + XO(pkru_state), ra);
2670     }
2671 
2672     /* Update the XSTATE_BV field.  */
2673     old_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2674     new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2675     cpu_stq_data_ra(env, ptr + XO(header.xstate_bv), new_bv, ra);
2676 }
2677 
2678 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2679 {
2680     do_xsave(env, ptr, rfbm, get_xinuse(env), -1, GETPC());
2681 }
2682 
2683 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2684 {
2685     uint64_t inuse = get_xinuse(env);
2686     do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2687 }
2688 
2689 static void do_xrstor_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2690 {
2691     int i, fpuc, fpus, fptag;
2692     target_ulong addr;
2693 
2694     fpuc = cpu_lduw_data_ra(env, ptr + XO(legacy.fcw), ra);
2695     fpus = cpu_lduw_data_ra(env, ptr + XO(legacy.fsw), ra);
2696     fptag = cpu_lduw_data_ra(env, ptr + XO(legacy.ftw), ra);
2697     cpu_set_fpuc(env, fpuc);
2698     cpu_set_fpus(env, fpus);
2699     fptag ^= 0xff;
2700     for (i = 0; i < 8; i++) {
2701         env->fptags[i] = ((fptag >> i) & 1);
2702     }
2703 
2704     addr = ptr + XO(legacy.fpregs);
2705     for (i = 0; i < 8; i++) {
2706         floatx80 tmp = helper_fldt(env, addr, ra);
2707         ST(i) = tmp;
2708         addr += 16;
2709     }
2710 }
2711 
2712 static void do_xrstor_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2713 {
2714     cpu_set_mxcsr(env, cpu_ldl_data_ra(env, ptr + XO(legacy.mxcsr), ra));
2715 }
2716 
2717 static void do_xrstor_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2718 {
2719     int i, nb_xmm_regs;
2720     target_ulong addr;
2721 
2722     if (env->hflags & HF_CS64_MASK) {
2723         nb_xmm_regs = 16;
2724     } else {
2725         nb_xmm_regs = 8;
2726     }
2727 
2728     addr = ptr + XO(legacy.xmm_regs);
2729     for (i = 0; i < nb_xmm_regs; i++) {
2730         env->xmm_regs[i].ZMM_Q(0) = cpu_ldq_data_ra(env, addr, ra);
2731         env->xmm_regs[i].ZMM_Q(1) = cpu_ldq_data_ra(env, addr + 8, ra);
2732         addr += 16;
2733     }
2734 }
2735 
2736 static void do_xrstor_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2737 {
2738     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2739     int i;
2740 
2741     for (i = 0; i < 4; i++, addr += 16) {
2742         env->bnd_regs[i].lb = cpu_ldq_data_ra(env, addr, ra);
2743         env->bnd_regs[i].ub = cpu_ldq_data_ra(env, addr + 8, ra);
2744     }
2745 }
2746 
2747 static void do_xrstor_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2748 {
2749     /* FIXME: Extend highest implemented bit of linear address.  */
2750     env->bndcs_regs.cfgu
2751         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu), ra);
2752     env->bndcs_regs.sts
2753         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts), ra);
2754 }
2755 
2756 static void do_xrstor_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2757 {
2758     env->pkru = cpu_ldq_data_ra(env, ptr, ra);
2759 }
2760 
2761 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2762 {
2763     uintptr_t ra = GETPC();
2764 
2765     /* The operand must be 16 byte aligned */
2766     if (ptr & 0xf) {
2767         raise_exception_ra(env, EXCP0D_GPF, ra);
2768     }
2769 
2770     do_xrstor_fpu(env, ptr, ra);
2771 
2772     if (env->cr[4] & CR4_OSFXSR_MASK) {
2773         do_xrstor_mxcsr(env, ptr, ra);
2774         /* Fast FXRSTOR leaves out the XMM registers */
2775         if (!(env->efer & MSR_EFER_FFXSR)
2776             || (env->hflags & HF_CPL_MASK)
2777             || !(env->hflags & HF_LMA_MASK)) {
2778             do_xrstor_sse(env, ptr, ra);
2779         }
2780     }
2781 }
2782 
#if defined(CONFIG_USER_ONLY)
/*
 * Entry points for user-mode emulation code.
 *
 * These perform the same steps as helper_fxsave()/helper_fxrstor() but
 * pass 0 as the unwind return address: they are not reached from
 * translated code, so GETPC() inside the helper_* functions would not
 * name a translated-code return address when called from here.
 */
void cpu_x86_fxsave(CPUX86State *env, target_ulong ptr)
{
    const uintptr_t ra = 0;

    /* The operand must be 16 byte aligned */
    if (ptr & 0xf) {
        raise_exception_ra(env, EXCP0D_GPF, ra);
    }

    do_xsave_fpu(env, ptr, ra);

    if (env->cr[4] & CR4_OSFXSR_MASK) {
        do_xsave_mxcsr(env, ptr, ra);
        /* Fast FXSAVE leaves out the XMM registers */
        if (!(env->efer & MSR_EFER_FFXSR)
            || (env->hflags & HF_CPL_MASK)
            || !(env->hflags & HF_LMA_MASK)) {
            do_xsave_sse(env, ptr, ra);
        }
    }
}

void cpu_x86_fxrstor(CPUX86State *env, target_ulong ptr)
{
    const uintptr_t ra = 0;

    /* The operand must be 16 byte aligned */
    if (ptr & 0xf) {
        raise_exception_ra(env, EXCP0D_GPF, ra);
    }

    do_xrstor_fpu(env, ptr, ra);

    if (env->cr[4] & CR4_OSFXSR_MASK) {
        do_xrstor_mxcsr(env, ptr, ra);
        /* Fast FXRSTOR leaves out the XMM registers */
        if (!(env->efer & MSR_EFER_FFXSR)
            || (env->hflags & HF_CPL_MASK)
            || !(env->hflags & HF_LMA_MASK)) {
            do_xrstor_sse(env, ptr, ra);
        }
    }
}
#endif
2794 
2795 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2796 {
2797     uintptr_t ra = GETPC();
2798     uint64_t xstate_bv, xcomp_bv, reserve0;
2799 
2800     rfbm &= env->xcr0;
2801 
2802     /* The OS must have enabled XSAVE.  */
2803     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2804         raise_exception_ra(env, EXCP06_ILLOP, ra);
2805     }
2806 
2807     /* The operand must be 64 byte aligned.  */
2808     if (ptr & 63) {
2809         raise_exception_ra(env, EXCP0D_GPF, ra);
2810     }
2811 
2812     xstate_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2813 
2814     if ((int64_t)xstate_bv < 0) {
2815         /* FIXME: Compact form.  */
2816         raise_exception_ra(env, EXCP0D_GPF, ra);
2817     }
2818 
2819     /* Standard form.  */
2820 
2821     /* The XSTATE_BV field must not set bits not present in XCR0.  */
2822     if (xstate_bv & ~env->xcr0) {
2823         raise_exception_ra(env, EXCP0D_GPF, ra);
2824     }
2825 
2826     /* The XCOMP_BV field must be zero.  Note that, as of the April 2016
2827        revision, the description of the XSAVE Header (Vol 1, Sec 13.4.2)
2828        describes only XCOMP_BV, but the description of the standard form
2829        of XRSTOR (Vol 1, Sec 13.8.1) checks bytes 23:8 for zero, which
2830        includes the next 64-bit field.  */
2831     xcomp_bv = cpu_ldq_data_ra(env, ptr + XO(header.xcomp_bv), ra);
2832     reserve0 = cpu_ldq_data_ra(env, ptr + XO(header.reserve0), ra);
2833     if (xcomp_bv || reserve0) {
2834         raise_exception_ra(env, EXCP0D_GPF, ra);
2835     }
2836 
2837     if (rfbm & XSTATE_FP_MASK) {
2838         if (xstate_bv & XSTATE_FP_MASK) {
2839             do_xrstor_fpu(env, ptr, ra);
2840         } else {
2841             helper_fninit(env);
2842             memset(env->fpregs, 0, sizeof(env->fpregs));
2843         }
2844     }
2845     if (rfbm & XSTATE_SSE_MASK) {
2846         /* Note that the standard form of XRSTOR loads MXCSR from memory
2847            whether or not the XSTATE_BV bit is set.  */
2848         do_xrstor_mxcsr(env, ptr, ra);
2849         if (xstate_bv & XSTATE_SSE_MASK) {
2850             do_xrstor_sse(env, ptr, ra);
2851         } else {
2852             /* ??? When AVX is implemented, we may have to be more
2853                selective in the clearing.  */
2854             memset(env->xmm_regs, 0, sizeof(env->xmm_regs));
2855         }
2856     }
2857     if (rfbm & XSTATE_BNDREGS_MASK) {
2858         if (xstate_bv & XSTATE_BNDREGS_MASK) {
2859             do_xrstor_bndregs(env, ptr + XO(bndreg_state), ra);
2860             env->hflags |= HF_MPX_IU_MASK;
2861         } else {
2862             memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
2863             env->hflags &= ~HF_MPX_IU_MASK;
2864         }
2865     }
2866     if (rfbm & XSTATE_BNDCSR_MASK) {
2867         if (xstate_bv & XSTATE_BNDCSR_MASK) {
2868             do_xrstor_bndcsr(env, ptr + XO(bndcsr_state), ra);
2869         } else {
2870             memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
2871         }
2872         cpu_sync_bndcs_hflags(env);
2873     }
2874     if (rfbm & XSTATE_PKRU_MASK) {
2875         uint64_t old_pkru = env->pkru;
2876         if (xstate_bv & XSTATE_PKRU_MASK) {
2877             do_xrstor_pkru(env, ptr + XO(pkru_state), ra);
2878         } else {
2879             env->pkru = 0;
2880         }
2881         if (env->pkru != old_pkru) {
2882             CPUState *cs = env_cpu(env);
2883             tlb_flush(cs);
2884         }
2885     }
2886 }
2887 
2888 #undef XO
2889 
2890 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
2891 {
2892     /* The OS must have enabled XSAVE.  */
2893     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2894         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2895     }
2896 
2897     switch (ecx) {
2898     case 0:
2899         return env->xcr0;
2900     case 1:
2901         if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
2902             return env->xcr0 & get_xinuse(env);
2903         }
2904         break;
2905     }
2906     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2907 }
2908 
2909 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
2910 {
2911     uint32_t dummy, ena_lo, ena_hi;
2912     uint64_t ena;
2913 
2914     /* The OS must have enabled XSAVE.  */
2915     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2916         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2917     }
2918 
2919     /* Only XCR0 is defined at present; the FPU may not be disabled.  */
2920     if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
2921         goto do_gpf;
2922     }
2923 
2924     /* Disallow enabling unimplemented features.  */
2925     cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
2926     ena = ((uint64_t)ena_hi << 32) | ena_lo;
2927     if (mask & ~ena) {
2928         goto do_gpf;
2929     }
2930 
2931     /* Disallow enabling only half of MPX.  */
2932     if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
2933         & XSTATE_BNDCSR_MASK) {
2934         goto do_gpf;
2935     }
2936 
2937     env->xcr0 = mask;
2938     cpu_sync_bndcs_hflags(env);
2939     return;
2940 
2941  do_gpf:
2942     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2943 }
2944 
2945 /* MMX/SSE */
2946 /* XXX: optimize by storing fptt and fptags in the static cpu state */
2947 
2948 #define SSE_DAZ             0x0040
2949 #define SSE_RC_MASK         0x6000
2950 #define SSE_RC_NEAR         0x0000
2951 #define SSE_RC_DOWN         0x2000
2952 #define SSE_RC_UP           0x4000
2953 #define SSE_RC_CHOP         0x6000
2954 #define SSE_FZ              0x8000
2955 
2956 void update_mxcsr_status(CPUX86State *env)
2957 {
2958     uint32_t mxcsr = env->mxcsr;
2959     int rnd_type;
2960 
2961     /* set rounding mode */
2962     switch (mxcsr & SSE_RC_MASK) {
2963     default:
2964     case SSE_RC_NEAR:
2965         rnd_type = float_round_nearest_even;
2966         break;
2967     case SSE_RC_DOWN:
2968         rnd_type = float_round_down;
2969         break;
2970     case SSE_RC_UP:
2971         rnd_type = float_round_up;
2972         break;
2973     case SSE_RC_CHOP:
2974         rnd_type = float_round_to_zero;
2975         break;
2976     }
2977     set_float_rounding_mode(rnd_type, &env->sse_status);
2978 
2979     /* Set exception flags.  */
2980     set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
2981                               (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
2982                               (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
2983                               (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
2984                               (mxcsr & FPUS_PE ? float_flag_inexact : 0),
2985                               &env->sse_status);
2986 
2987     /* set denormals are zero */
2988     set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
2989 
2990     /* set flush to zero */
2991     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
2992 }
2993 
2994 void update_mxcsr_from_sse_status(CPUX86State *env)
2995 {
2996     uint8_t flags = get_float_exception_flags(&env->sse_status);
2997     /*
2998      * The MXCSR denormal flag has opposite semantics to
2999      * float_flag_input_denormal (the softfloat code sets that flag
3000      * only when flushing input denormals to zero, but SSE sets it
3001      * only when not flushing them to zero), so is not converted
3002      * here.
3003      */
3004     env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
3005                    (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3006                    (flags & float_flag_overflow ? FPUS_OE : 0) |
3007                    (flags & float_flag_underflow ? FPUS_UE : 0) |
3008                    (flags & float_flag_inexact ? FPUS_PE : 0) |
3009                    (flags & float_flag_output_denormal ? FPUS_UE | FPUS_PE :
3010                     0));
3011 }
3012 
3013 void helper_update_mxcsr(CPUX86State *env)
3014 {
3015     update_mxcsr_from_sse_status(env);
3016 }
3017 
3018 void helper_ldmxcsr(CPUX86State *env, uint32_t val)
3019 {
3020     cpu_set_mxcsr(env, val);
3021 }
3022 
3023 void helper_enter_mmx(CPUX86State *env)
3024 {
3025     env->fpstt = 0;
3026     *(uint32_t *)(env->fptags) = 0;
3027     *(uint32_t *)(env->fptags + 4) = 0;
3028 }
3029 
3030 void helper_emms(CPUX86State *env)
3031 {
3032     /* set to empty state */
3033     *(uint32_t *)(env->fptags) = 0x01010101;
3034     *(uint32_t *)(env->fptags + 4) = 0x01010101;
3035 }
3036 
3037 /* XXX: suppress */
3038 void helper_movq(CPUX86State *env, void *d, void *s)
3039 {
3040     *(uint64_t *)d = *(uint64_t *)s;
3041 }
3042 
3043 #define SHIFT 0
3044 #include "ops_sse.h"
3045 
3046 #define SHIFT 1
3047 #include "ops_sse.h"
3048