```c
// Use the whole 128-bit v register: write the operand as %N.4s
// (four 32-bit lanes).
// a += b * c
float32x4_t _a = vld1q_f32(a);
float32x4_t _b = vld1q_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
    "fmla  %0.4s, %2.4s, %3.4s"
    : "=w"(_a) // %0, accumulator (output)
    : "0"(_a), // %1, tied to %0 so the accumulator is also read as input
      "w"(_b), // %2
      "w"(_c)  // %3
    :
);
```
```c
// Use only the low 64 bits of the v register: write the operand as %N.2s
// (two 32-bit lanes).
// a += b * c
float32x2_t _a = vld1_f32(a);
float32x2_t _b = vld1_f32(b);
float32x2_t _c = vld1_f32(c);
asm volatile(
    "fmla  %0.2s, %2.2s, %3.2s"
    : "=w"(_a) // %0, accumulator (output)
    : "0"(_a), // %1, tied to %0 so the accumulator is also read as input
      "w"(_b), // %2
      "w"(_c)  // %3
    :
);
```
```c
// Use a single 32-bit lane of a v register: write the operand as
// %N.s[0], %N.s[1], %N.s[2] or %N.s[3].
// a += b * c[0]
// a += b * c[1]
// a += b * c[2]
// a += b * c[3]
// The q-form load (vld1q_f32) is required here: the operands are declared
// float32x4_t and the asm below uses .4s and lanes 0..3.
float32x4_t _a = vld1q_f32(a);
float32x4_t _b = vld1q_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
    // Adjacent string literals are concatenated, so every instruction must
    // end with "\n"; otherwise the assembler sees all four on one line.
    "fmla  %0.4s, %2.4s, %3.s[0] \n"
    "fmla  %0.4s, %2.4s, %3.s[1] \n"
    "fmla  %0.4s, %2.4s, %3.s[2] \n"
    "fmla  %0.4s, %2.4s, %3.s[3] \n"
    : "=w"(_a) // %0, accumulator (output)
    : "0"(_a), // %1, tied to %0 so the accumulator is also read as input
      "w"(_b), // %2
      "w"(_c)  // %3
    :
);
```


qwq
58