/ This Source Code Form is subject to the terms of the Mozilla Public
/ License, v. 2.0. If a copy of the MPL was not distributed with this
/ file, You can obtain one at http://mozilla.org/MPL/2.0/.


/ ------------------------------------------------------------------------
/
/  Implementation of s_mpv_mul_set_vec which exploits
/  the 64X64->128 bit  unsigned multiply instruction.
/
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ ABI:      SysV AMD64
/ In:       rdi = r (dst vector), rsi = a (src vector),
/           rdx = len (limb count), rcx = digit (multiplier)
/ Out:      rax = final carry limb
/ Clobbers: rdx, r8, r9, r11, flags (no callee-saved regs touched,
/           no stack use -- leaf function)
/
/ Main loop is unrolled 8x; a[i+1] is loaded into r11 one iteration
/ ahead of its mulq to hide load latency ("prefetch" below).

.text; .align 16; .globl s_mpv_mul_set_vec64; .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64:

	xorq	%rax, %rax		/ if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L17

	movq	%rdx, %r8		/ Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		/ cy = 0

.L15:					/ invariant: r8 = limbs left, r9 = carry in
	cmpq	$8, %r8			/ 8 - len
	jb	.L16
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi
	addq	$64, %rdi
	subq	$8, %r8

	jz	.L17			/ must not fall into .L16 with r8 == 0
	jmp	.L15

.L16:					/ remainder path: 1..7 limbs left
	movq	0(%rsi), %rax
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	8(%rsi), %rax
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	16(%rsi), %rax
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	24(%rsi), %rax
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	32(%rsi), %rax
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	40(%rsi), %rax
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	48(%rsi), %rax
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17
					/ at most 7 limbs can reach .L16, so
					/ falling through here implies r8 == 0

.L17:
	movq	%r9, %rax		/ return carry
	ret

.size s_mpv_mul_set_vec64, .-s_mpv_mul_set_vec64

/ ------------------------------------------------------------------------
/
/  Implementation of s_mpv_mul_add_vec which exploits
/  the 64X64->128 bit  unsigned multiply instruction.
/
/ ------------------------------------------------------------------------

/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ ABI:      SysV AMD64
/ In:       rdi = r (dst/accumulate vector), rsi = a (src vector),
/           rdx = len (limb count), rcx = digit (multiplier)
/ Out:      rax = final carry limb
/ Clobbers: rdx, r8, r9, r10, r11, flags (leaf function, no stack use)
/
/ Same 8x-unrolled structure as s_mpv_mul_set_vec64, with r10 holding
/ the prefetched r[i] that is added into each product.  Note hi(p)
/ cannot overflow from the two adcq $0 increments: hi(a[i]*digit) is
/ at most 2^64-2, so adding two carries stays within a limb.

.text; .align 16; .globl s_mpv_mul_add_vec64; .type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64:

	xorq	%rax, %rax		/ if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L27

	movq	%rdx, %r8		/ Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		/ cy = 0

.L25:					/ invariant: r8 = limbs left, r9 = carry in
	cmpq	$8, %r8			/ 8 - len
	jb	.L26
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	0(%rdi), %r10		/ r10 = r[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	movq	8(%rdi), %r10		/ prefetch r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	movq	16(%rdi), %r10		/ prefetch r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	movq	24(%rdi), %r10		/ prefetch r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	movq	32(%rdi), %r10		/ prefetch r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	movq	40(%rdi), %r10		/ prefetch r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	movq	48(%rdi), %r10		/ prefetch r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	movq	56(%rdi), %r10		/ prefetch r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi
	addq	$64, %rdi
	subq	$8, %r8

	jz	.L27			/ must not fall into .L26 with r8 == 0
	jmp	.L25

.L26:					/ remainder path: 1..7 limbs left
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27
					/ at most 7 limbs can reach .L26, so
					/ falling through here implies r8 == 0

.L27:
	movq	%r9, %rax		/ return carry
	ret

.size s_mpv_mul_add_vec64, .-s_mpv_mul_add_vec64