dnl ******************************************************************************
dnl   Copyright 2009 Paul Zimmermann and Alexander Kruppa.
dnl
dnl   This file is part of the ECM Library.
dnl
dnl   The ECM Library is free software; you can redistribute it and/or modify
dnl   it under the terms of the GNU Lesser General Public License as published by
dnl   the Free Software Foundation; either version 3 of the License, or (at your
dnl   option) any later version.
dnl
dnl   The ECM Library is distributed in the hope that it will be useful, but
dnl   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl   License for more details.
dnl
dnl   You should have received a copy of the GNU Lesser General Public License
dnl   along with the ECM Library; see the file COPYING.LIB.  If not, write to
dnl   the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
dnl   MA 02110-1301, USA.
dnl ******************************************************************************

define(C, `
dnl')

C mp_limb_t mulredc14(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
C                 const mp_limb_t *m, mp_limb_t inv_m);
C
C Multiplies the two 14-limb numbers x and y and applies REDC with respect
C to the 14-limb modulus m, one limb of x per outer pass.
C
C arguments:
C r3  = ptr to result z least significant limb
C r4  = ptr to input x least significant limb
C r5  = ptr to input y least significant limb
C r6  = ptr to modulus m least significant limb
C r7  = -1/m mod 2^64
C
C final carry returned in r3



include(`config.m4')

	GLOBL GSYM_PREFIX`'mulredc14
	GLOBL .GSYM_PREFIX`'mulredc14

	.section ".opd", "aw"
	.align	3
GSYM_PREFIX`'mulredc14:
	.quad	.GSYM_PREFIX`'mulredc14, .TOC.@tocbase, 0
	.size	GSYM_PREFIX`'mulredc14, 24


C Implements multiplication and REDC for two input numbers of 14 words

C The algorithm:
C   (Notation: a:b:c == a * 2^128 + b * 2^64 + c)
C
C   /* first pass, i = 0 */
C   T1:T0 = x[i]*y[0] ;
C   u = (T0*invm) % 2^64 ;
C   cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */
C   for (j = 1; j < len; j++)
C     {
C       cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ;
C          /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */
C       tmp[j-1] = T0;
C     }
C   tmp[len-1] = T1 ;
C   tmp[len] = cy ; /* cy <= 1 (see note 2) */
C   for (i = 1; i < len; i++)
C     {
C       cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ;
C       u = (T0*invm) % 2^64 ;
C       cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */
C       for (j = 1; j < len; j++)
C         {
C           cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ;
C           /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3
C              for j = (len-1), result cy:T1 <= 2*2^64 - 1 (see note 4) */
C           tmp[j-1] = T0;
C         }
C       tmp[len-1] = T1 ;
C       tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */
C     }
C   z[0 ... len-1] = tmp[0 ... len-1] ;
C   return (tmp[len]) ;
C
C notes:
C
C 1: m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2,
C    so cy:T1 <= 2*2^64 - 4.
C 2: For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4
C                 <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2),
C    so cy:T1 <= 2*2^64 - 3. For j > 1,
C    x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1),
C    so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j.
C 3: m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1,
C    so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4)
C 4: For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1
C                  <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64
C                  <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2),
C    so cy:T1 <= 3*2^64 - 3. For j > 1,
C    x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1),
C    so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1.
C    For j = len - 1, we know from note 2 that tmp[len] <= 1 for i = 0.
C    Assume this is true for index i-1. Then
C                x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1
C             <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64
C             <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1),
C    so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction.
C
C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11
C                YP = r5, MP = r6, TP = r1 (stack ptr)
C                r15 = tmp[len], r16 = constant zero (used with addze to
C                capture the carry bit), r0 = current y[j] or m[j] limb,
C                r8/r9 = low/high halves of the current product
C

C local variables: tmp[0 ... 14] array, having 14+1 8-byte words
C The tmp array needs 14+1 entries, but tmp[14] is stored in
C r15, so only 14 entries are used in the stack.
C
C Stack layout while the routine runs (r1 is lowered by 4*8 + 112 bytes):
C   0(r1) ... 104(r1)   tmp[0] ... tmp[13]
C   112(r1)             saved r16
C   120(r1)             saved r15
C   128(r1)             saved r14
C   136(r1)             saved r13
C The routine contains no calls; r1 is restored step by step by the ldu
C instructions of the copy loop and the epilogue.


	TEXT
	.align	5	C powerPC 32 byte alignment
.GSYM_PREFIX`'mulredc14:

C ########################################################################
C # i = 0 pass
C ########################################################################

C Pass for j = 0. We need to fetch x[i] from memory and compute the new u.
C The register saves are interleaved with the multiplications.

	ld	r12, 0(r4)		C XI = x[0]
	ld	r0, 0(r5)		C y[0]
	stdu	r13, -8(r1)		C save r13
	mulld	r8, r0, r12		C x[0]*y[0] low half
	stdu	r14, -8(r1)		C save r14
	mulhdu	r9, r0, r12		C x[0]*y[0] high half
	ld	r0, 0(r6)		C m[0]
	mulld	r11, r7, r8		C U = T0*invm mod 2^64
	stdu	r15, -8(r1)		C save r15
	mulld	r13, r0, r11		C T0 = U*m[0] low
	stdu	r16, -8(r1)		C save r16
	li	r16, 0			C set r16 to zero for carry propagation
	subi	r1, r1, 112		C set tmp stack space
	mulhdu	r14, r0, r11		C T1 = U*m[0] high
	ld	r0, 8(r5)		C y[1]
	addc	r8, r8, r13		C low word sum (zero mod 2^64 by choice of U), only the carry is used
	adde	r13, r9, r14		C T0 = initial tmp(0)
	addze	r10, r16		C carry to CY
	C CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence
	C CY:T1 <= 2*2^64 - 4

C Pass for j = 1

	mulld	r8, r0, r12		C x[i]*y[j] low half
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 8(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	adde	r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld	r8, r0, r11		C U*m[j] low
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 16(r5)		C y[j+1]
	adde	r13, r9, r14		C add high word with carry to T1
	addze	r10, r16		C carry to CY
	std	r8, 0(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 2

	mulld	r8, r0, r12		C x[i]*y[j] low half
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 16(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	adde	r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld	r8, r0, r11		C U*m[j] low
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 24(r5)		C y[j+1]
	adde	r13, r9, r14		C add high word with carry to T1
	addze	r10, r16		C carry to CY
	std	r8, 8(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 3

	mulld	r8, r0, r12		C x[i]*y[j] low half
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 24(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	adde	r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld	r8, r0, r11		C U*m[j] low
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 32(r5)		C y[j+1]
	adde	r13, r9, r14		C add high word with carry to T1
	addze	r10, r16		C carry to CY
	std	r8, 16(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 4

	mulld	r8, r0, r12		C x[i]*y[j] low half
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 32(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	adde	r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld	r8, r0, r11		C U*m[j] low
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 40(r5)		C y[j+1]
	adde	r13, r9, r14		C add high word with carry to T1
	addze	r10, r16		C carry to CY
	std	r8, 24(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 5

	mulld	r8, r0, r12		C x[i]*y[j] low half
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 40(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	adde	r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld	r8, r0, r11		C U*m[j] low
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 48(r5)		C y[j+1]
	adde	r13, r9, r14		C add high word with carry to T1
	addze	r10, r16		C carry to CY
	std	r8, 32(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 6

	mulld	r8, r0, r12		C x[i]*y[j] low half
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 48(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	adde	r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld	r8, r0, r11		C U*m[j] low
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 56(r5)		C y[j+1]
	adde	r13, r9, r14		C add high word with carry to T1
	addze	r10, r16		C carry to CY
	std	r8, 40(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 7

	mulld	r8, r0, r12		C x[i]*y[j] low half
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 56(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	adde	r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld	r8, r0, r11		C U*m[j] low
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 64(r5)		C y[j+1]
	adde	r13, r9, r14		C add high word with carry to T1
	addze	r10, r16		C carry to CY
	std	r8, 48(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 8

	mulld	r8, r0, r12		C x[i]*y[j] low half
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 64(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	adde	r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld	r8, r0, r11		C U*m[j] low
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 72(r5)		C y[j+1]
	adde	r13, r9, r14		C add high word with carry to T1
	addze	r10, r16		C carry to CY
	std	r8, 56(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 9

	mulld	r8, r0, r12		C x[i]*y[j] low half
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 72(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	adde	r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld	r8, r0, r11		C U*m[j] low
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 80(r5)		C y[j+1]
	adde	r13, r9, r14		C add high word with carry to T1
	addze	r10, r16		C carry to CY
	std	r8, 64(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 10

	mulld	r8, r0, r12		C x[i]*y[j] low half
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 80(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	adde	r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld	r8, r0, r11		C U*m[j] low
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 88(r5)		C y[j+1]
	adde	r13, r9, r14		C add high word with carry to T1
	addze	r10, r16		C carry to CY
	std	r8, 72(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 11

	mulld	r8, r0, r12		C x[i]*y[j] low half
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 88(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	adde	r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld	r8, r0, r11		C U*m[j] low
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 96(r5)		C y[j+1]
	adde	r13, r9, r14		C add high word with carry to T1
	addze	r10, r16		C carry to CY
	std	r8, 80(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 12

	mulld	r8, r0, r12		C x[i]*y[j] low half
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 96(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	adde	r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld	r8, r0, r11		C U*m[j] low
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 104(r5)		C y[j+1]
	adde	r13, r9, r14		C add high word with carry to T1
	addze	r10, r16		C carry to CY
	std	r8, 88(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 13. Don't fetch new data from y[j+1].

	mulld	r8, r0, r12		C x[i]*y[j] low half
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 104(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	adde	r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld	r8, r0, r11		C U*m[j] low
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	adde	r13, r9, r14		C add high word with carry to T1
	std	r8, 96(r1)		C store tmp[len-2]
	addze	r15, r16		C put carry in r15 (tmp[len] <= 1)
	std	r13, 104(r1)		C store tmp[len-1]


C ########################################################################
C # i > 0 passes
C ########################################################################


	li	r9, 13			C outer loop count (len - 1 remaining limbs of x)
	mtctr	r9

1:

C Pass for j = 0. We need to fetch x[i], tmp[0] and tmp[1] from memory
C and compute the new u

	ldu	r12, 8(r4)		C x[i] (advances the x pointer)
	ld	r0, 0(r5)		C y[0]
	ld	r13, 0(r1)		C tmp[0]
	mulld	r8, r0, r12		C x[i]*y[0] low half
	ld	r14, 8(r1)		C tmp[1]
	mulhdu	r9, r0, r12		C x[i]*y[0] high half
	addc	r13, r8, r13		C T0
	ld	r0, 0(r6)		C m[0]
	mulld	r11, r7, r13		C U = T0*invm mod 2^64
	adde	r14, r9, r14		C T1
	mulld	r8, r0, r11		C U*m[0] low
	addze	r10, r16		C CY
	mulhdu	r9, r0, r11		C U*m[0] high
	ld	r0, 8(r5)		C y[1]
	addc	r8, r8, r13		C result = 0
	adde	r13, r9, r14		C T0, carry pending
	C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1,
	C so cy:T1 <= 3*2^64 - 4

C Pass for j = 1

	ld	r14, 16(r1)		C tmp[j+1]
	mulld	r8, r0, r12		C x[i]*y[j] low half
	adde	r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze	r10, r16		C carry to CY
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 8(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	mulld	r8, r0, r11		C U*m[j] low
	adde	r14, r9, r14		C add high to T1
	addze	r10, r10		C add carry to CY
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 16(r5)		C y[j+1]
	adde	r13, r9, r14		C T1, carry pending
	std	r8, 0(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 2

	ld	r14, 24(r1)		C tmp[j+1]
	mulld	r8, r0, r12		C x[i]*y[j] low half
	adde	r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze	r10, r16		C carry to CY
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 16(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	mulld	r8, r0, r11		C U*m[j] low
	adde	r14, r9, r14		C add high to T1
	addze	r10, r10		C add carry to CY
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 24(r5)		C y[j+1]
	adde	r13, r9, r14		C T1, carry pending
	std	r8, 8(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 3

	ld	r14, 32(r1)		C tmp[j+1]
	mulld	r8, r0, r12		C x[i]*y[j] low half
	adde	r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze	r10, r16		C carry to CY
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 24(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	mulld	r8, r0, r11		C U*m[j] low
	adde	r14, r9, r14		C add high to T1
	addze	r10, r10		C add carry to CY
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 32(r5)		C y[j+1]
	adde	r13, r9, r14		C T1, carry pending
	std	r8, 16(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 4

	ld	r14, 40(r1)		C tmp[j+1]
	mulld	r8, r0, r12		C x[i]*y[j] low half
	adde	r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze	r10, r16		C carry to CY
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 32(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	mulld	r8, r0, r11		C U*m[j] low
	adde	r14, r9, r14		C add high to T1
	addze	r10, r10		C add carry to CY
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 40(r5)		C y[j+1]
	adde	r13, r9, r14		C T1, carry pending
	std	r8, 24(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 5

	ld	r14, 48(r1)		C tmp[j+1]
	mulld	r8, r0, r12		C x[i]*y[j] low half
	adde	r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze	r10, r16		C carry to CY
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 40(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	mulld	r8, r0, r11		C U*m[j] low
	adde	r14, r9, r14		C add high to T1
	addze	r10, r10		C add carry to CY
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 48(r5)		C y[j+1]
	adde	r13, r9, r14		C T1, carry pending
	std	r8, 32(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 6

	ld	r14, 56(r1)		C tmp[j+1]
	mulld	r8, r0, r12		C x[i]*y[j] low half
	adde	r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze	r10, r16		C carry to CY
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 48(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	mulld	r8, r0, r11		C U*m[j] low
	adde	r14, r9, r14		C add high to T1
	addze	r10, r10		C add carry to CY
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 56(r5)		C y[j+1]
	adde	r13, r9, r14		C T1, carry pending
	std	r8, 40(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 7

	ld	r14, 64(r1)		C tmp[j+1]
	mulld	r8, r0, r12		C x[i]*y[j] low half
	adde	r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze	r10, r16		C carry to CY
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 56(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	mulld	r8, r0, r11		C U*m[j] low
	adde	r14, r9, r14		C add high to T1
	addze	r10, r10		C add carry to CY
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 64(r5)		C y[j+1]
	adde	r13, r9, r14		C T1, carry pending
	std	r8, 48(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 8

	ld	r14, 72(r1)		C tmp[j+1]
	mulld	r8, r0, r12		C x[i]*y[j] low half
	adde	r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze	r10, r16		C carry to CY
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 64(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	mulld	r8, r0, r11		C U*m[j] low
	adde	r14, r9, r14		C add high to T1
	addze	r10, r10		C add carry to CY
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 72(r5)		C y[j+1]
	adde	r13, r9, r14		C T1, carry pending
	std	r8, 56(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 9

	ld	r14, 80(r1)		C tmp[j+1]
	mulld	r8, r0, r12		C x[i]*y[j] low half
	adde	r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze	r10, r16		C carry to CY
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 72(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	mulld	r8, r0, r11		C U*m[j] low
	adde	r14, r9, r14		C add high to T1
	addze	r10, r10		C add carry to CY
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 80(r5)		C y[j+1]
	adde	r13, r9, r14		C T1, carry pending
	std	r8, 64(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 10

	ld	r14, 88(r1)		C tmp[j+1]
	mulld	r8, r0, r12		C x[i]*y[j] low half
	adde	r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze	r10, r16		C carry to CY
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 80(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	mulld	r8, r0, r11		C U*m[j] low
	adde	r14, r9, r14		C add high to T1
	addze	r10, r10		C add carry to CY
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 88(r5)		C y[j+1]
	adde	r13, r9, r14		C T1, carry pending
	std	r8, 72(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 11

	ld	r14, 96(r1)		C tmp[j+1]
	mulld	r8, r0, r12		C x[i]*y[j] low half
	adde	r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze	r10, r16		C carry to CY
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 88(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	mulld	r8, r0, r11		C U*m[j] low
	adde	r14, r9, r14		C add high to T1
	addze	r10, r10		C add carry to CY
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 96(r5)		C y[j+1]
	adde	r13, r9, r14		C T1, carry pending
	std	r8, 80(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 12

	ld	r14, 104(r1)		C tmp[j+1]
	mulld	r8, r0, r12		C x[i]*y[j] low half
	adde	r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze	r10, r16		C carry to CY
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 96(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	mulld	r8, r0, r11		C U*m[j] low
	adde	r14, r9, r14		C add high to T1
	addze	r10, r10		C add carry to CY
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	ld	r0, 104(r5)		C y[j+1]
	adde	r13, r9, r14		C T1, carry pending
	std	r8, 88(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 13. Don't fetch new data from y[j+1].

	mulld	r8, r0, r12		C x[i]*y[j] low half
	adde	r14, r15, r10		C T1 = tmp[len] + CY + pending carry
	C since tmp[len] <= 1, T1 <= 3 and carry is zero
	mulhdu	r9, r0, r12		C x[i]*y[j] high half
	ld	r0, 104(r6)		C m[j]
	addc	r13, r8, r13		C add low word to T0
	mulld	r8, r0, r11		C U*m[j] low
	adde	r14, r9, r14		C add high to T1
	addze	r10, r16		C CY
	mulhdu	r9, r0, r11		C U*m[j] high
	addc	r8, r8, r13		C add T0 and low word
	adde	r13, r9, r14		C T1, carry pending
	std	r8, 96(r1)		C store tmp[len-2]
	addze	r15, r10		C store tmp[len] <= 1
	std	r13, 104(r1)		C store tmp[len-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64
	C          <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1)

	bdnz	1b

C Copy result from tmp memory to z.
C Two limbs are moved per group; the update-form loads and stores advance
C r1 and r3 by 8 bytes each, so after the last group r1 points at tmp[13].

	ld	r8, 0(r1)		C tmp[0]
	ldu	r9, 8(r1)		C tmp[1]
	std	r8, 0(r3)		C z[0]
	stdu	r9, 8(r3)		C z[1]
	ldu	r8, 8(r1)		C tmp[2]
	ldu	r9, 8(r1)		C tmp[3]
	stdu	r8, 8(r3)		C z[2]
	stdu	r9, 8(r3)		C z[3]
	ldu	r8, 8(r1)		C tmp[4]
	ldu	r9, 8(r1)		C tmp[5]
	stdu	r8, 8(r3)		C z[4]
	stdu	r9, 8(r3)		C z[5]
	ldu	r8, 8(r1)		C tmp[6]
	ldu	r9, 8(r1)		C tmp[7]
	stdu	r8, 8(r3)		C z[6]
	stdu	r9, 8(r3)		C z[7]
	ldu	r8, 8(r1)		C tmp[8]
	ldu	r9, 8(r1)		C tmp[9]
	stdu	r8, 8(r3)		C z[8]
	stdu	r9, 8(r3)		C z[9]
	ldu	r8, 8(r1)		C tmp[10]
	ldu	r9, 8(r1)		C tmp[11]
	stdu	r8, 8(r3)		C z[10]
	stdu	r9, 8(r3)		C z[11]
	ldu	r8, 8(r1)		C tmp[12]
	ldu	r9, 8(r1)		C tmp[13]
	stdu	r8, 8(r3)		C z[12]
	stdu	r9, 8(r3)		C z[13]

	mr	r3, r15			C return tmp(len)
	ldu	r16, 8(r1)		C restore saved registers in reverse order
	ldu	r15, 8(r1)
	ldu	r14, 8(r1)
	ldu	r13, 8(r1)
	addi	r1, r1, 8		C r1 back to its value at entry
	blr

	.size	.GSYM_PREFIX`'mulredc14, .-.GSYM_PREFIX`'mulredc14
