```c
// Full 128-bit v register: write the operand as %N.4s (four float32 lanes).
// a += b * c
float32x4_t _a = vld1q_f32(a);
float32x4_t _b = vld1q_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
    "fmla   %0.4s, %2.4s, %3.4s"
    : "=w"(_a)  // %0  output accumulator
    : "0"(_a),  // %1  input accumulator, tied to the same register as %0
      "w"(_b),  // %2
      "w"(_c)   // %3
    :
);
```

```c
// Low 64 bits of a v register: write the operand as %N.2s (two float32 lanes).
// a += b * c
float32x2_t _a = vld1_f32(a);
float32x2_t _b = vld1_f32(b);
float32x2_t _c = vld1_f32(c);
asm volatile(
    "fmla   %0.2s, %2.2s, %3.2s"
    : "=w"(_a)  // %0  output accumulator
    : "0"(_a),  // %1  input accumulator, tied to the same register as %0
      "w"(_b),  // %2
      "w"(_c)   // %3
    :
);
```

```c
// Single 32-bit lane of a v register: write the operand as
// %N.s[0], %N.s[1], %N.s[2] or %N.s[3].
// a += b * c[0]
// a += b * c[1]
// a += b * c[2]
// a += b * c[3]
//
// The operands are 128-bit vectors, so they are loaded with vld1q_f32
// (vld1_f32 would yield a 64-bit float32x2_t).
float32x4_t _a = vld1q_f32(a);
float32x4_t _b = vld1q_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
    // Adjacent string literals are concatenated by the compiler, so every
    // instruction needs its own "\n" or the assembler sees one garbled line.
    "fmla   %0.4s, %2.4s, %3.s[0]   \n"
    "fmla   %0.4s, %2.4s, %3.s[1]   \n"
    "fmla   %0.4s, %2.4s, %3.s[2]   \n"
    "fmla   %0.4s, %2.4s, %3.s[3]   \n"
    : "=w"(_a)  // %0  output accumulator
    : "0"(_a),  // %1  input accumulator, tied to the same register as %0
      "w"(_b),  // %2
      "w"(_c)   // %3
    :
);
```


qwq