dnl ******************************************************************************
dnl   Copyright 2009 Paul Zimmermann and Alexander Kruppa.
dnl
dnl   This file is part of the ECM Library.
dnl
dnl   The ECM Library is free software; you can redistribute it and/or modify
dnl   it under the terms of the GNU Lesser General Public License as published by
dnl   the Free Software Foundation; either version 3 of the License, or (at your
dnl   option) any later version.
dnl
dnl   The ECM Library is distributed in the hope that it will be useful, but
dnl   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl   License for more details.
dnl
dnl   You should have received a copy of the GNU Lesser General Public License
dnl   along with the ECM Library; see the file COPYING.LIB.  If not, write to
dnl   the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
dnl   MA 02110-1301, USA.
dnl ******************************************************************************

define(C, `
dnl')

C mp_limb_t mulredc12(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
C                     const mp_limb_t *m, mp_limb_t inv_m);
C
C Montgomery multiplication with reduction (REDC) for 12-limb operands.
C
C arguments:
C r3  = ptr to result z least significant limb
C r4  = ptr to input x least significant limb
C r5  = ptr to input y least significant limb
C r6  = ptr to modulus m least significant limb
C r7  = -1/m mod 2^64
C
C final carry returned in r3
C
C NOTE: every comment in this file starts with the m4 macro C, which expands
C to dnl and therefore discards the rest of the physical line.  Each
C instruction, directive and label must stay on its own line; never write an
C opening parenthesis directly after the C.



include(`config.m4')

        GLOBL GSYM_PREFIX`'mulredc12
        GLOBL .GSYM_PREFIX`'mulredc12

        .section ".opd", "aw"
        .align  3
GSYM_PREFIX`'mulredc12:
        .quad   .GSYM_PREFIX`'mulredc12, .TOC.@tocbase, 0
        .size   GSYM_PREFIX`'mulredc12, 24


C Implements multiplication and REDC for two input numbers of 12 words

C The algorithm:
C   (Notation: a:b:c == a * 2^128 + b * 2^64 + c)
C
C   First pass, i = 0:
C
C T1:T0 = x[i]*y[0] ;
C u = (T0*invm) % 2^64 ;
C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */
C for (j = 1; j < len; j++)
C   {
C     cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ;
C        /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */
C     tmp[j-1] = T0;
C   }
C tmp[len-1] = T1 ;
C tmp[len] = cy ; /* cy <= 1 (see note 2) */
C for (i = 1; i < len; i++)
C   {
C     cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ;
C     u = (T0*invm) % 2^64 ;
C     cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */
C     for (j = 1; j < len; j++)
C       {
C         cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ;
C         /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3
C            for j = (len-1), result cy:T1 <= 2*2^64 - 1  (see note 4) */
C         tmp[j-1] = T0;
C       }
C     tmp[len-1] = T1 ;
C     tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */
C   }
C z[0 ... len-1] = tmp[0 ... len-1] ;
C return (tmp[len]) ;
C
C notes:
C
C 1:  m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2,
C     so cy:T1 <= 2*2^64 - 4.
C 2:  For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4
C                 <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2),
C     so cy:T1 <= 2*2^64 - 3. For j > 1,
C     x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1),
C     so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j.
C 3:  m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1,
C     so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4)
C 4:  For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1
C                  <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64
C                  <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2),
C     so cy:T1 <= 3*2^64 - 3. For j > 1,
C     x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1),
C     so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1.
C     For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0.
C     Assume this is true for index i-1, then
C                x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1
C                  <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64
C                  <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1),
C     so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction.
C
C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11
C                YP = r5, MP = r6, TP = r1 (stack ptr)
C                r16 holds the constant 0 (used with addze to capture carries)
C                r0, r8, r9 are scratch (operand limb and product halves)
C

C local variables: tmp[0 ... 12] array, having 12+1 8-byte words
C The tmp array needs 12+1 entries, but tmp[12] is stored in
C r15, so only 12 entries are used in the stack.
C
C Stack layout once the prologue has run (offsets from the lowered r1):
C      0 ..  88 : tmp[0] ... tmp[11]
C      96       : saved r16
C     104       : saved r15
C     112       : saved r14
C     120       : saved r13
C r1 is lowered by 128 bytes in total and restored before returning.
C
C NOTE (review): on 64-bit PowerPC ELF targets r13 is normally reserved as
C the thread pointer.  It is saved and restored here, but it holds T0 for
C the whole routine in between -- confirm this is acceptable for every ABI
C this file is built for.


        TEXT
        .align  5       C powerPC 32 byte alignment
.GSYM_PREFIX`'mulredc12:

C ########################################################################
C # i = 0 pass
C #########################################################################

C Pass for j = 0. We need to fetch x[i] from memory and compute the new u
C The register saves and the stack adjustment are interleaved with the
C multiplications.

        ld      r12, 0(r4)          C XI = x[0]
        ld      r0, 0(r5)           C y[0]
        stdu    r13, -8(r1)         C save r13
        mulld   r8, r0, r12         C x[0]*y[0] low half
        stdu    r14, -8(r1)         C save r14
        mulhdu  r9, r0, r12         C x[0]*y[0] high half
        ld      r0, 0(r6)           C m[0]
        mulld   r11, r7, r8         C U = T0*invm mod 2^64
        stdu    r15, -8(r1)         C save r15
        mulld   r13, r0, r11        C T0 = U*m[0] low
        stdu    r16, -8(r1)         C save r16
        li      r16, 0              C set r16 to zero for carry propagation
        subi    r1, r1, 96          C set tmp stack space
        mulhdu  r14, r0, r11        C T1 = U*m[0] high
        ld      r0, 8(r5)           C y[1]
        addc    r8, r8, r13         C low word cancels by choice of U; keep carry
        adde    r13, r9, r14        C T0 = initial tmp(0)
        addze   r10, r16            C carry to CY
        C CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence
        C CY:T1 <= 2*2^64 - 4

C Pass for j = 1

        mulld   r8, r0, r12         C x[i]*y[j] low half
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 8(r6)           C m[j]
        addc    r13, r8, r13        C add low word to T0
        adde    r14, r9, r10        C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11         C U*m[j] low
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 16(r5)          C y[j+1]
        adde    r13, r9, r14        C add high word with carry to T1
        addze   r10, r16            C carry to CY
        std     r8, 0(r1)           C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 2

        mulld   r8, r0, r12         C x[i]*y[j] low half
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 16(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        adde    r14, r9, r10        C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11         C U*m[j] low
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 24(r5)          C y[j+1]
        adde    r13, r9, r14        C add high word with carry to T1
        addze   r10, r16            C carry to CY
        std     r8, 8(r1)           C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 3

        mulld   r8, r0, r12         C x[i]*y[j] low half
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 24(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        adde    r14, r9, r10        C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11         C U*m[j] low
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 32(r5)          C y[j+1]
        adde    r13, r9, r14        C add high word with carry to T1
        addze   r10, r16            C carry to CY
        std     r8, 16(r1)          C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 4

        mulld   r8, r0, r12         C x[i]*y[j] low half
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 32(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        adde    r14, r9, r10        C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11         C U*m[j] low
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 40(r5)          C y[j+1]
        adde    r13, r9, r14        C add high word with carry to T1
        addze   r10, r16            C carry to CY
        std     r8, 24(r1)          C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 5

        mulld   r8, r0, r12         C x[i]*y[j] low half
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 40(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        adde    r14, r9, r10        C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11         C U*m[j] low
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 48(r5)          C y[j+1]
        adde    r13, r9, r14        C add high word with carry to T1
        addze   r10, r16            C carry to CY
        std     r8, 32(r1)          C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 6

        mulld   r8, r0, r12         C x[i]*y[j] low half
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 48(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        adde    r14, r9, r10        C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11         C U*m[j] low
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 56(r5)          C y[j+1]
        adde    r13, r9, r14        C add high word with carry to T1
        addze   r10, r16            C carry to CY
        std     r8, 40(r1)          C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 7

        mulld   r8, r0, r12         C x[i]*y[j] low half
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 56(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        adde    r14, r9, r10        C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11         C U*m[j] low
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 64(r5)          C y[j+1]
        adde    r13, r9, r14        C add high word with carry to T1
        addze   r10, r16            C carry to CY
        std     r8, 48(r1)          C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 8

        mulld   r8, r0, r12         C x[i]*y[j] low half
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 64(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        adde    r14, r9, r10        C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11         C U*m[j] low
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 72(r5)          C y[j+1]
        adde    r13, r9, r14        C add high word with carry to T1
        addze   r10, r16            C carry to CY
        std     r8, 56(r1)          C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 9

        mulld   r8, r0, r12         C x[i]*y[j] low half
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 72(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        adde    r14, r9, r10        C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11         C U*m[j] low
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 80(r5)          C y[j+1]
        adde    r13, r9, r14        C add high word with carry to T1
        addze   r10, r16            C carry to CY
        std     r8, 64(r1)          C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 10

        mulld   r8, r0, r12         C x[i]*y[j] low half
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 80(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        adde    r14, r9, r10        C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11         C U*m[j] low
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 88(r5)          C y[j+1]
        adde    r13, r9, r14        C add high word with carry to T1
        addze   r10, r16            C carry to CY
        std     r8, 72(r1)          C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 11. Don't fetch new data from y[j+1].

        mulld   r8, r0, r12         C x[i]*y[j] low half
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 88(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        adde    r14, r9, r10        C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11         C U*m[j] low
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        adde    r13, r9, r14        C add high word with carry to T1
        std     r8, 80(r1)          C store tmp[len-2]
        addze   r15, r16            C put carry in r15 (tmp[len] <= 1)
        std     r13, 88(r1)         C store tmp[len-1]


C #########################################################################
C # i > 0 passes
C #########################################################################


        li      r9, 11              C outer loop count
        mtctr   r9

1:

C Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory
C and compute the new u

        ldu     r12, 8(r4)          C x[i]
        ld      r0, 0(r5)           C y[0]
        ld      r13, 0(r1)          C tmp[0]
        mulld   r8, r0, r12         C x[i]*y[0] low half
        ld      r14, 8(r1)          C tmp[1]
        mulhdu  r9, r0, r12         C x[i]*y[0] high half
        addc    r13, r8, r13        C T0
        ld      r0, 0(r6)           C m[0]
        mulld   r11, r7, r13        C U = T0*invm mod 2^64
        adde    r14, r9, r14        C T1
        mulld   r8, r0, r11         C U*m[0] low
        addze   r10, r16            C CY
        mulhdu  r9, r0, r11         C U*m[0] high
        ld      r0, 8(r5)           C y[1]
        addc    r8, r8, r13         C result = 0
        adde    r13, r9, r14        C T0, carry pending
        C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1,
        C so cy:T1 <= 3*2^64 - 4

C Pass for j = 1

        ld      r14, 16(r1)         C tmp[j+1]
        mulld   r8, r0, r12         C x[i]*y[j] low half
        adde    r14, r14, r10       C tmp[j+1] + CY + pending carry
        addze   r10, r16            C carry to CY
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 8(r6)           C m[j]
        addc    r13, r8, r13        C add low word to T0
        mulld   r8, r0, r11         C U*m[j] low
        adde    r14, r9, r14        C add high to T1
        addze   r10, r10            C add carry to CY
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 16(r5)          C y[j+1]
        adde    r13, r9, r14        C T1, carry pending
        std     r8, 0(r1)           C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 2

        ld      r14, 24(r1)         C tmp[j+1]
        mulld   r8, r0, r12         C x[i]*y[j] low half
        adde    r14, r14, r10       C tmp[j+1] + CY + pending carry
        addze   r10, r16            C carry to CY
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 16(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        mulld   r8, r0, r11         C U*m[j] low
        adde    r14, r9, r14        C add high to T1
        addze   r10, r10            C add carry to CY
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 24(r5)          C y[j+1]
        adde    r13, r9, r14        C T1, carry pending
        std     r8, 8(r1)           C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 3

        ld      r14, 32(r1)         C tmp[j+1]
        mulld   r8, r0, r12         C x[i]*y[j] low half
        adde    r14, r14, r10       C tmp[j+1] + CY + pending carry
        addze   r10, r16            C carry to CY
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 24(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        mulld   r8, r0, r11         C U*m[j] low
        adde    r14, r9, r14        C add high to T1
        addze   r10, r10            C add carry to CY
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 32(r5)          C y[j+1]
        adde    r13, r9, r14        C T1, carry pending
        std     r8, 16(r1)          C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 4

        ld      r14, 40(r1)         C tmp[j+1]
        mulld   r8, r0, r12         C x[i]*y[j] low half
        adde    r14, r14, r10       C tmp[j+1] + CY + pending carry
        addze   r10, r16            C carry to CY
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 32(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        mulld   r8, r0, r11         C U*m[j] low
        adde    r14, r9, r14        C add high to T1
        addze   r10, r10            C add carry to CY
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 40(r5)          C y[j+1]
        adde    r13, r9, r14        C T1, carry pending
        std     r8, 24(r1)          C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 5

        ld      r14, 48(r1)         C tmp[j+1]
        mulld   r8, r0, r12         C x[i]*y[j] low half
        adde    r14, r14, r10       C tmp[j+1] + CY + pending carry
        addze   r10, r16            C carry to CY
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 40(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        mulld   r8, r0, r11         C U*m[j] low
        adde    r14, r9, r14        C add high to T1
        addze   r10, r10            C add carry to CY
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 48(r5)          C y[j+1]
        adde    r13, r9, r14        C T1, carry pending
        std     r8, 32(r1)          C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 6

        ld      r14, 56(r1)         C tmp[j+1]
        mulld   r8, r0, r12         C x[i]*y[j] low half
        adde    r14, r14, r10       C tmp[j+1] + CY + pending carry
        addze   r10, r16            C carry to CY
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 48(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        mulld   r8, r0, r11         C U*m[j] low
        adde    r14, r9, r14        C add high to T1
        addze   r10, r10            C add carry to CY
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 56(r5)          C y[j+1]
        adde    r13, r9, r14        C T1, carry pending
        std     r8, 40(r1)          C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 7

        ld      r14, 64(r1)         C tmp[j+1]
        mulld   r8, r0, r12         C x[i]*y[j] low half
        adde    r14, r14, r10       C tmp[j+1] + CY + pending carry
        addze   r10, r16            C carry to CY
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 56(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        mulld   r8, r0, r11         C U*m[j] low
        adde    r14, r9, r14        C add high to T1
        addze   r10, r10            C add carry to CY
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 64(r5)          C y[j+1]
        adde    r13, r9, r14        C T1, carry pending
        std     r8, 48(r1)          C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 8

        ld      r14, 72(r1)         C tmp[j+1]
        mulld   r8, r0, r12         C x[i]*y[j] low half
        adde    r14, r14, r10       C tmp[j+1] + CY + pending carry
        addze   r10, r16            C carry to CY
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 64(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        mulld   r8, r0, r11         C U*m[j] low
        adde    r14, r9, r14        C add high to T1
        addze   r10, r10            C add carry to CY
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 72(r5)          C y[j+1]
        adde    r13, r9, r14        C T1, carry pending
        std     r8, 56(r1)          C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 9

        ld      r14, 80(r1)         C tmp[j+1]
        mulld   r8, r0, r12         C x[i]*y[j] low half
        adde    r14, r14, r10       C tmp[j+1] + CY + pending carry
        addze   r10, r16            C carry to CY
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 72(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        mulld   r8, r0, r11         C U*m[j] low
        adde    r14, r9, r14        C add high to T1
        addze   r10, r10            C add carry to CY
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 80(r5)          C y[j+1]
        adde    r13, r9, r14        C T1, carry pending
        std     r8, 64(r1)          C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 10

        ld      r14, 88(r1)         C tmp[j+1]
        mulld   r8, r0, r12         C x[i]*y[j] low half
        adde    r14, r14, r10       C tmp[j+1] + CY + pending carry
        addze   r10, r16            C carry to CY
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 80(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        mulld   r8, r0, r11         C U*m[j] low
        adde    r14, r9, r14        C add high to T1
        addze   r10, r10            C add carry to CY
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        ld      r0, 88(r5)          C y[j+1]
        adde    r13, r9, r14        C T1, carry pending
        std     r8, 72(r1)          C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 11. Don't fetch new data from y[j+1].

        mulld   r8, r0, r12         C x[i]*y[j] low half
        adde    r14, r15, r10       C T1 = tmp[len] + CY + pending carry
        C since tmp[len] <= 1, T1 <= 3 and carry is zero
        mulhdu  r9, r0, r12         C x[i]*y[j] high half
        ld      r0, 88(r6)          C m[j]
        addc    r13, r8, r13        C add low word to T0
        mulld   r8, r0, r11         C U*m[j] low
        adde    r14, r9, r14        C add high to T1
        addze   r10, r16            C CY
        mulhdu  r9, r0, r11         C U*m[j] high
        addc    r8, r8, r13         C add T0 and low word
        adde    r13, r9, r14        C T1, carry pending
        std     r8, 80(r1)          C store tmp[len-2]
        addze   r15, r10            C store tmp[len] <= 1
        std     r13, 88(r1)         C store tmp[len-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64
        C          <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1)

        bdnz    1b

C Copy result from tmp memory to z
C The update-form loads and stores advance r1 and r3 by 8 bytes each, so
C after the twelfth limb r1 sits 8 bytes below the saved r16.

        ld      r8, 0(r1)
        ldu     r9, 8(r1)
        std     r8, 0(r3)
        stdu    r9, 8(r3)
        ldu     r8, 8(r1)
        ldu     r9, 8(r1)
        stdu    r8, 8(r3)
        stdu    r9, 8(r3)
        ldu     r8, 8(r1)
        ldu     r9, 8(r1)
        stdu    r8, 8(r3)
        stdu    r9, 8(r3)
        ldu     r8, 8(r1)
        ldu     r9, 8(r1)
        stdu    r8, 8(r3)
        stdu    r9, 8(r3)
        ldu     r8, 8(r1)
        ldu     r9, 8(r1)
        stdu    r8, 8(r3)
        stdu    r9, 8(r3)
        ldu     r8, 8(r1)
        ldu     r9, 8(r1)
        stdu    r8, 8(r3)
        stdu    r9, 8(r3)

C Return the final carry and restore the saved registers in reverse order
C of the prologue; the last addi puts r1 back at its value on entry.

        mr      r3, r15             C return tmp(len)
        ldu     r16, 8(r1)
        ldu     r15, 8(r1)
        ldu     r14, 8(r1)
        ldu     r13, 8(r1)
        addi    r1, r1, 8
        blr

        .size   .GSYM_PREFIX`'mulredc12, .-.GSYM_PREFIX`'mulredc12
