dnl ******************************************************************************
dnl   Copyright 2009 Paul Zimmermann and Alexander Kruppa.
dnl
dnl   This file is part of the ECM Library.
dnl
dnl   The ECM Library is free software; you can redistribute it and/or modify
dnl   it under the terms of the GNU Lesser General Public License as published by
dnl   the Free Software Foundation; either version 3 of the License, or (at your
dnl   option) any later version.
dnl
dnl   The ECM Library is distributed in the hope that it will be useful, but
dnl   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl   License for more details.
dnl
dnl   You should have received a copy of the GNU Lesser General Public License
dnl   along with the ECM Library; see the file COPYING.LIB.  If not, write to
dnl   the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
dnl   MA 02110-1301, USA.
dnl ******************************************************************************

define(C, `
dnl')

C The define above makes "C" an m4 comment marker: it expands to a newline
C followed by dnl, so everything after "C" on a line is discarded by m4.

C mp_limb_t mulredc15(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
C                 const mp_limb_t *m, mp_limb_t inv_m);
C
C arguments:
C r3  = ptr to result z least significant limb
C r4  = ptr to input x least significant limb
C r5  = ptr to input y least significant limb
C r6  = ptr to modulus m least significant limb
C r7  = -1/m mod 2^64
C
C final carry returned in r3
C
C Side effects visible in this file: z[0 ... 14] is overwritten, r4 and r3
C are advanced while the routine runs (r3 is finally replaced by the return
C value), and r13-r16 are saved on entry and restored before returning.



include(`config.m4')

        GLOBL GSYM_PREFIX`'mulredc15
        GLOBL .GSYM_PREFIX`'mulredc15

C Function descriptor in .opd: entry address, TOC base, environment word (0).
        .section ".opd", "aw"
        .align  3
GSYM_PREFIX`'mulredc15:
        .quad   .GSYM_PREFIX`'mulredc15, .TOC.@tocbase, 0
        .size   GSYM_PREFIX`'mulredc15, 24


C Implements multiplication and REDC for two input numbers of 15 words.
C Both loops of the algorithm below are specialised for len = 15; the inner
C (j) loop is fully unrolled, the outer (i) loop runs on the count register.

C The algorithm:
C   (Notation: a:b:c == a * 2^128 + b * 2^64 + c)
C
C T1:T0 = x[i]*y[0] ;
C u = (T0*invm) % 2^64 ;
C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */
C for (j = 1; j < len; j++)
C   {
C     cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ;
C        /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */
C     tmp[j-1] = T0;
C   }
C tmp[len-1] = T1 ;
C tmp[len] = cy ; /* cy <= 1 (see note 2) */
C for (i = 1; i < len; i++)
C   {
C     cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ;
C     u = (T0*invm) % 2^64 ;
C     cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */
C     for (j = 1; j < len; j++)
C       {
C         cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ;
C         /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3
C            for j = (len-1), result cy:T1 <= 2*2^64 - 1  (see note 4) */
C         tmp[j-1] = T0;
C       }
C     tmp[len-1] = T1 ;
C     tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */
C   }
C z[0 ... len-1] = tmp[0 ... len-1] ;
C return (tmp[len]) ;
C
C notes:
C
C 1:  m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2,
C     so cy:T1 <= 2*2^64 - 4.
C 2:  For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4
C                 <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2),
C     so cy:T1 <= 2*2^64 - 3. For j > 1,
C     x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1),
C     so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j.
C 3:  m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1,
C     so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4)
C 4:  For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1
C                  <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64
C                  <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2),
C     so cy:T1 <= 3*2^64 - 3. For j > 1,
C     x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1),
C     so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1.
C     For j = len - 1, we know from note 2 that tmp(len) <= 1 for i = 0.
C     Assume this is true for index i-1. Then
C                x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1
C                  <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64
C                  <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1),
C     so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction.
C
C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11
C                YP = r5, MP = r6, TP = r1 (stack ptr)
C Other registers: r0 = current y[j] or m[j] limb, r8/r9 = low/high half of
C                the current product, r16 = constant 0 (used with addze to
C                capture the carry bit), r15 = tmp[len], ctr = outer loop count.
C
C NOTE(review): r13 is used as a scratch register (it is saved and restored).
C Presumably the target ABI reserves r13 as the thread pointer -- confirm that
C temporarily clobbering it is acceptable on all supported platforms.

C local variables: tmp[0 ... 15] array, having 15+1 8-byte words
C The tmp array needs 15+1 entries, but tmp[15] is stored in
C r15, so only 15 entries are used in the stack.
C
C Stack layout after the prologue (addresses relative to the new r1):
C   0(r1) ... 112(r1)   tmp[0] ... tmp[14]
C   120(r1)             saved r16
C   128(r1)             saved r15
C   136(r1)             saved r14
C   144(r1)             saved r13
C   152(r1)             caller's r1 on entry


        TEXT
        .align  5       C powerPC 32 byte alignment
.GSYM_PREFIX`'mulredc15:

C ########################################################################
C # i = 0 pass
C #########################################################################

C Pass for j = 0. We need to fetch x[i] from memory and compute the new u
C The register saves (stdu pushes r13..r16 below r1) are interleaved with the
C first multiplications.

        ld      r12, 0(r4)      C XI = x[0]
        ld      r0, 0(r5)       C y[0]
        stdu    r13, -8(r1)     C save r13
        mulld   r8, r0, r12     C x[0]*y[0] low half
        stdu    r14, -8(r1)     C save r14
        mulhdu  r9, r0, r12     C x[0]*y[0] high half
        ld      r0, 0(r6)       C m[0]
        mulld   r11, r7, r8     C U = T0*invm mod 2^64
        stdu    r15, -8(r1)     C save r15
        mulld   r13, r0, r11    C T0 = U*m[0] low
        stdu    r16, -8(r1)     C save r16
        li      r16, 0          C set r16 to zero for carry propagation
        subi    r1, r1, 120     C set tmp stack space (15 words of 8 bytes)
        mulhdu  r14, r0, r11    C T1 = U*m[0] high
        ld      r0, 8(r5)       C y[1]
        addc    r8, r8, r13     C low word sum is discarded, only the carry is used
        adde    r13, r9, r14    C T0 = initial tmp(0)
        addze   r10, r16        C carry to CY
        C CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence
        C CY:T1 <= 2*2^64 - 4

C Pass for j = 1

        mulld   r8, r0, r12     C x[i]*y[j] low half
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 8(r6)       C m[j]
        addc    r13, r8, r13    C add low word to T0
        adde    r14, r9, r10    C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11     C U*m[j] low
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 16(r5)      C y[j+1]
        adde    r13, r9, r14    C add high word with carry to T1
        addze   r10, r16        C carry to CY
        std     r8, 0(r1)       C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 2

        mulld   r8, r0, r12     C x[i]*y[j] low half
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 16(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        adde    r14, r9, r10    C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11     C U*m[j] low
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 24(r5)      C y[j+1]
        adde    r13, r9, r14    C add high word with carry to T1
        addze   r10, r16        C carry to CY
        std     r8, 8(r1)       C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 3

        mulld   r8, r0, r12     C x[i]*y[j] low half
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 24(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        adde    r14, r9, r10    C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11     C U*m[j] low
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 32(r5)      C y[j+1]
        adde    r13, r9, r14    C add high word with carry to T1
        addze   r10, r16        C carry to CY
        std     r8, 16(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 4

        mulld   r8, r0, r12     C x[i]*y[j] low half
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 32(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        adde    r14, r9, r10    C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11     C U*m[j] low
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 40(r5)      C y[j+1]
        adde    r13, r9, r14    C add high word with carry to T1
        addze   r10, r16        C carry to CY
        std     r8, 24(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 5

        mulld   r8, r0, r12     C x[i]*y[j] low half
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 40(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        adde    r14, r9, r10    C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11     C U*m[j] low
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 48(r5)      C y[j+1]
        adde    r13, r9, r14    C add high word with carry to T1
        addze   r10, r16        C carry to CY
        std     r8, 32(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 6

        mulld   r8, r0, r12     C x[i]*y[j] low half
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 48(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        adde    r14, r9, r10    C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11     C U*m[j] low
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 56(r5)      C y[j+1]
        adde    r13, r9, r14    C add high word with carry to T1
        addze   r10, r16        C carry to CY
        std     r8, 40(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 7

        mulld   r8, r0, r12     C x[i]*y[j] low half
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 56(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        adde    r14, r9, r10    C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11     C U*m[j] low
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 64(r5)      C y[j+1]
        adde    r13, r9, r14    C add high word with carry to T1
        addze   r10, r16        C carry to CY
        std     r8, 48(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 8

        mulld   r8, r0, r12     C x[i]*y[j] low half
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 64(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        adde    r14, r9, r10    C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11     C U*m[j] low
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 72(r5)      C y[j+1]
        adde    r13, r9, r14    C add high word with carry to T1
        addze   r10, r16        C carry to CY
        std     r8, 56(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 9

        mulld   r8, r0, r12     C x[i]*y[j] low half
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 72(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        adde    r14, r9, r10    C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11     C U*m[j] low
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 80(r5)      C y[j+1]
        adde    r13, r9, r14    C add high word with carry to T1
        addze   r10, r16        C carry to CY
        std     r8, 64(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 10

        mulld   r8, r0, r12     C x[i]*y[j] low half
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 80(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        adde    r14, r9, r10    C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11     C U*m[j] low
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 88(r5)      C y[j+1]
        adde    r13, r9, r14    C add high word with carry to T1
        addze   r10, r16        C carry to CY
        std     r8, 72(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 11

        mulld   r8, r0, r12     C x[i]*y[j] low half
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 88(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        adde    r14, r9, r10    C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11     C U*m[j] low
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 96(r5)      C y[j+1]
        adde    r13, r9, r14    C add high word with carry to T1
        addze   r10, r16        C carry to CY
        std     r8, 80(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 12

        mulld   r8, r0, r12     C x[i]*y[j] low half
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 96(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        adde    r14, r9, r10    C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11     C U*m[j] low
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 104(r5)     C y[j+1]
        adde    r13, r9, r14    C add high word with carry to T1
        addze   r10, r16        C carry to CY
        std     r8, 88(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 13

        mulld   r8, r0, r12     C x[i]*y[j] low half
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 104(r6)     C m[j]
        addc    r13, r8, r13    C add low word to T0
        adde    r14, r9, r10    C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11     C U*m[j] low
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 112(r5)     C y[j+1]
        adde    r13, r9, r14    C add high word with carry to T1
        addze   r10, r16        C carry to CY
        std     r8, 96(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
        C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 14. Don't fetch new data from y[j+1].

        mulld   r8, r0, r12     C x[i]*y[j] low half
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 112(r6)     C m[j]
        addc    r13, r8, r13    C add low word to T0
        adde    r14, r9, r10    C add high word with carry + CY to T1
        C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

        mulld   r8, r0, r11     C U*m[j] low
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        adde    r13, r9, r14    C add high word with carry to T1
        std     r8, 104(r1)     C store tmp[len-2]
        addze   r15, r16        C put carry in r15 (tmp[len] <= 1)
        std     r13, 112(r1)    C store tmp[len-1]


C #########################################################################
C # i > 0 passes
C #########################################################################

C The count register is loaded with len-1 = 14; the bdnz at the end of the
C loop body decrements it and branches back to label 1 while it is non-zero.

        li      r9, 14          C outer loop count
        mtctr   r9

1:

C Pass for j = 0. We need to fetch x[i], tmp[0] and tmp[1] from memory
C and compute the new u
C ldu pre-increments r4 by 8, so r4 points at x[i] afterwards.

        ldu     r12, 8(r4)      C x[i]
        ld      r0, 0(r5)       C y[0]
        ld      r13, 0(r1)      C tmp[0]
        mulld   r8, r0, r12     C x[i]*y[0] low half
        ld      r14, 8(r1)      C tmp[1]
        mulhdu  r9, r0, r12     C x[i]*y[0] high half
        addc    r13, r8, r13    C T0
        ld      r0, 0(r6)       C m[0]
        mulld   r11, r7, r13    C U = T0*invm mod 2^64
        adde    r14, r9, r14    C T1
        mulld   r8, r0, r11     C U*m[0] low
        addze   r10, r16        C CY
        mulhdu  r9, r0, r11     C U*m[0] high
        ld      r0, 8(r5)       C y[1]
        addc    r8, r8, r13     C result = 0
        adde    r13, r9, r14    C T0, carry pending
        C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1,
        C so cy:T1 <= 3*2^64 - 4

C Pass for j = 1
C In each of the following passes the carry left pending by the last adde of
C the previous pass is consumed by the first adde of the pass.

        ld      r14, 16(r1)     C tmp[j+1]
        mulld   r8, r0, r12     C x[i]*y[j] low half
        adde    r14, r14, r10   C tmp[j+1] + CY + pending carry
        addze   r10, r16        C carry to CY
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 8(r6)       C m[j]
        addc    r13, r8, r13    C add low word to T0
        mulld   r8, r0, r11     C U*m[j] low
        adde    r14, r9, r14    C add high to T1
        addze   r10, r10        C add carry to CY
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 16(r5)      C y[j+1]
        adde    r13, r9, r14    C T1, carry pending
        std     r8, 0(r1)       C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 2

        ld      r14, 24(r1)     C tmp[j+1]
        mulld   r8, r0, r12     C x[i]*y[j] low half
        adde    r14, r14, r10   C tmp[j+1] + CY + pending carry
        addze   r10, r16        C carry to CY
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 16(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        mulld   r8, r0, r11     C U*m[j] low
        adde    r14, r9, r14    C add high to T1
        addze   r10, r10        C add carry to CY
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 24(r5)      C y[j+1]
        adde    r13, r9, r14    C T1, carry pending
        std     r8, 8(r1)       C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 3

        ld      r14, 32(r1)     C tmp[j+1]
        mulld   r8, r0, r12     C x[i]*y[j] low half
        adde    r14, r14, r10   C tmp[j+1] + CY + pending carry
        addze   r10, r16        C carry to CY
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 24(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        mulld   r8, r0, r11     C U*m[j] low
        adde    r14, r9, r14    C add high to T1
        addze   r10, r10        C add carry to CY
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 32(r5)      C y[j+1]
        adde    r13, r9, r14    C T1, carry pending
        std     r8, 16(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 4

        ld      r14, 40(r1)     C tmp[j+1]
        mulld   r8, r0, r12     C x[i]*y[j] low half
        adde    r14, r14, r10   C tmp[j+1] + CY + pending carry
        addze   r10, r16        C carry to CY
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 32(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        mulld   r8, r0, r11     C U*m[j] low
        adde    r14, r9, r14    C add high to T1
        addze   r10, r10        C add carry to CY
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 40(r5)      C y[j+1]
        adde    r13, r9, r14    C T1, carry pending
        std     r8, 24(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 5

        ld      r14, 48(r1)     C tmp[j+1]
        mulld   r8, r0, r12     C x[i]*y[j] low half
        adde    r14, r14, r10   C tmp[j+1] + CY + pending carry
        addze   r10, r16        C carry to CY
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 40(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        mulld   r8, r0, r11     C U*m[j] low
        adde    r14, r9, r14    C add high to T1
        addze   r10, r10        C add carry to CY
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 48(r5)      C y[j+1]
        adde    r13, r9, r14    C T1, carry pending
        std     r8, 32(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 6

        ld      r14, 56(r1)     C tmp[j+1]
        mulld   r8, r0, r12     C x[i]*y[j] low half
        adde    r14, r14, r10   C tmp[j+1] + CY + pending carry
        addze   r10, r16        C carry to CY
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 48(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        mulld   r8, r0, r11     C U*m[j] low
        adde    r14, r9, r14    C add high to T1
        addze   r10, r10        C add carry to CY
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 56(r5)      C y[j+1]
        adde    r13, r9, r14    C T1, carry pending
        std     r8, 40(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 7

        ld      r14, 64(r1)     C tmp[j+1]
        mulld   r8, r0, r12     C x[i]*y[j] low half
        adde    r14, r14, r10   C tmp[j+1] + CY + pending carry
        addze   r10, r16        C carry to CY
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 56(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        mulld   r8, r0, r11     C U*m[j] low
        adde    r14, r9, r14    C add high to T1
        addze   r10, r10        C add carry to CY
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 64(r5)      C y[j+1]
        adde    r13, r9, r14    C T1, carry pending
        std     r8, 48(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 8

        ld      r14, 72(r1)     C tmp[j+1]
        mulld   r8, r0, r12     C x[i]*y[j] low half
        adde    r14, r14, r10   C tmp[j+1] + CY + pending carry
        addze   r10, r16        C carry to CY
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 64(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        mulld   r8, r0, r11     C U*m[j] low
        adde    r14, r9, r14    C add high to T1
        addze   r10, r10        C add carry to CY
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 72(r5)      C y[j+1]
        adde    r13, r9, r14    C T1, carry pending
        std     r8, 56(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 9

        ld      r14, 80(r1)     C tmp[j+1]
        mulld   r8, r0, r12     C x[i]*y[j] low half
        adde    r14, r14, r10   C tmp[j+1] + CY + pending carry
        addze   r10, r16        C carry to CY
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 72(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        mulld   r8, r0, r11     C U*m[j] low
        adde    r14, r9, r14    C add high to T1
        addze   r10, r10        C add carry to CY
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 80(r5)      C y[j+1]
        adde    r13, r9, r14    C T1, carry pending
        std     r8, 64(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 10

        ld      r14, 88(r1)     C tmp[j+1]
        mulld   r8, r0, r12     C x[i]*y[j] low half
        adde    r14, r14, r10   C tmp[j+1] + CY + pending carry
        addze   r10, r16        C carry to CY
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 80(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        mulld   r8, r0, r11     C U*m[j] low
        adde    r14, r9, r14    C add high to T1
        addze   r10, r10        C add carry to CY
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 88(r5)      C y[j+1]
        adde    r13, r9, r14    C T1, carry pending
        std     r8, 72(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 11

        ld      r14, 96(r1)     C tmp[j+1]
        mulld   r8, r0, r12     C x[i]*y[j] low half
        adde    r14, r14, r10   C tmp[j+1] + CY + pending carry
        addze   r10, r16        C carry to CY
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 88(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        mulld   r8, r0, r11     C U*m[j] low
        adde    r14, r9, r14    C add high to T1
        addze   r10, r10        C add carry to CY
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 96(r5)      C y[j+1]
        adde    r13, r9, r14    C T1, carry pending
        std     r8, 80(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 12

        ld      r14, 104(r1)    C tmp[j+1]
        mulld   r8, r0, r12     C x[i]*y[j] low half
        adde    r14, r14, r10   C tmp[j+1] + CY + pending carry
        addze   r10, r16        C carry to CY
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 96(r6)      C m[j]
        addc    r13, r8, r13    C add low word to T0
        mulld   r8, r0, r11     C U*m[j] low
        adde    r14, r9, r14    C add high to T1
        addze   r10, r10        C add carry to CY
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 104(r5)     C y[j+1]
        adde    r13, r9, r14    C T1, carry pending
        std     r8, 88(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 13

        ld      r14, 112(r1)    C tmp[j+1]
        mulld   r8, r0, r12     C x[i]*y[j] low half
        adde    r14, r14, r10   C tmp[j+1] + CY + pending carry
        addze   r10, r16        C carry to CY
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 104(r6)     C m[j]
        addc    r13, r8, r13    C add low word to T0
        mulld   r8, r0, r11     C U*m[j] low
        adde    r14, r9, r14    C add high to T1
        addze   r10, r10        C add carry to CY
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        ld      r0, 112(r5)     C y[j+1]
        adde    r13, r9, r14    C T1, carry pending
        std     r8, 96(r1)      C store tmp[j-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
        C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 14. Don't fetch new data from y[j+1].
C tmp[len] is taken from r15 instead of the stack and written back to r15.

        mulld   r8, r0, r12     C x[i]*y[j] low half
        adde    r14, r15, r10   C T1 = tmp[len] + CY + pending carry
                                C since tmp[len] <= 1, T1 <= 3 and carry is zero
        mulhdu  r9, r0, r12     C x[i]*y[j] high half
        ld      r0, 112(r6)     C m[j]
        addc    r13, r8, r13    C add low word to T0
        mulld   r8, r0, r11     C U*m[j] low
        adde    r14, r9, r14    C add high to T1
        addze   r10, r16        C CY
        mulhdu  r9, r0, r11     C U*m[j] high
        addc    r8, r8, r13     C add T0 and low word
        adde    r13, r9, r14    C T1, carry pending
        std     r8, 104(r1)     C store tmp[len-2]
        addze   r15, r10        C store tmp[len] <= 1
        std     r13, 112(r1)    C store tmp[len-1]
        C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64
        C          <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1)

        bdnz    1b              C next i while the count register is non-zero

C Copy result from tmp memory to z
C The update forms ldu/stdu advance r1 and r3 by 8 bytes per limb; after the
C 15 limbs have been copied r1 points at tmp[14].

        ld      r8, 0(r1)
        ldu     r9, 8(r1)
        std     r8, 0(r3)
        stdu    r9, 8(r3)
        ldu     r8, 8(r1)
        ldu     r9, 8(r1)
        stdu    r8, 8(r3)
        stdu    r9, 8(r3)
        ldu     r8, 8(r1)
        ldu     r9, 8(r1)
        stdu    r8, 8(r3)
        stdu    r9, 8(r3)
        ldu     r8, 8(r1)
        ldu     r9, 8(r1)
        stdu    r8, 8(r3)
        stdu    r9, 8(r3)
        ldu     r8, 8(r1)
        ldu     r9, 8(r1)
        stdu    r8, 8(r3)
        stdu    r9, 8(r3)
        ldu     r8, 8(r1)
        ldu     r9, 8(r1)
        stdu    r8, 8(r3)
        stdu    r9, 8(r3)
        ldu     r8, 8(r1)
        ldu     r9, 8(r1)
        stdu    r8, 8(r3)
        stdu    r9, 8(r3)
        ldu     r8, 8(r1)
        stdu    r8, 8(r3)

C Epilogue: return tmp[len] and pop the saved registers in reverse push
C order; the final addi brings r1 back to its value on entry.

        mr      r3, r15         C return tmp(len)
        ldu     r16, 8(r1)      C restore r16
        ldu     r15, 8(r1)      C restore r15
        ldu     r14, 8(r1)      C restore r14
        ldu     r13, 8(r1)      C restore r13
        addi    r1, r1, 8       C release the last save slot
        blr

        .size   .GSYM_PREFIX`'mulredc15, .-.GSYM_PREFIX`'mulredc15
