;
; feilipu, 2020 March
;
; This Source Code Form is subject to the terms of the Mozilla Public
; License, v. 2.0. If a copy of the MPL was not distributed with this
; file, You can obtain one at http://mozilla.org/MPL/2.0/.
;
;------------------------------------------------------------------------------
;
; Using RC2014 LUT Module
;
;------------------------------------------------------------------------------

INCLUDE "config_private.inc"

SECTION code_clib
SECTION code_math

PUBLIC l_lut_mulu_64_32x32, l0_lut_mulu_64_32x32

l_lut_mulu_64_32x32:

    ; multiplication of two 32-bit numbers into a 64-bit product
    ;
    ; This entry point only rearranges the operands into the register
    ; layout expected by l0_lut_mulu_64_32x32 and then falls through.
    ;
    ; enter : dehl = 32-bit multiplicand (moved to bc'bc = y)
    ;         dehl'= 32-bit multiplier   (moved to de'de = x)
    ;
    ; exit  : dehl dehl' = 64-bit product
    ;         carry reset
    ;
    ; uses  : af, bc, de, hl, af', bc', de', hl'
    ;         (af' is listed as in the original header; no ex af,af' is
    ;          executed in the code visible in this file)

    ld c,l
    ld b,h                      ; bc  = low word of the dehl operand
    push de
    exx
    pop bc                      ; bc' = high word of the dehl operand
    push hl
    exx
    pop de                      ; de  = low word of the dehl' operand
                                ; de' still holds its high word

l0_lut_mulu_64_32x32:

    ; multiplication of two 32-bit numbers into a 64-bit product
    ;
    ; enter : de'de = 32-bit multiplier   = x  (x3 x2 in de', x1 x0 in de)
    ;         bc'bc = 32-bit multiplicand = y  (y3 y2 in bc', y1 y0 in bc)
    ;
    ; exit  : dehl dehl' = 64-bit product
    ;           dehl  (register set selected at ret) = p7 p6 p5 p4
    ;           dehl' (the other register set)       = p3 p2 p1 p0
    ;         carry reset
    ;
    ; uses  : af, bc, de, hl, af', bc', de', hl'
    ;         (af' is listed as in the original header; no ex af,af' is
    ;          executed in the code visible in this file)
    ;
    ; notes : * exx is executed an odd number of times between this label
    ;           and ret, so the set selected at ret is the one that was
    ;           the alternate set on entry.
    ;         * "set L" below is the set selected on entry here (x1 x0,
    ;           y1 y0; it ends up holding p3..p0).  "set H" is the other
    ;           set (x3 x2, y3 y2; it ends up holding p7..p4).
    ;         * 14 words of operand byte pairs are pushed up front, in the
    ;           reverse of the order in which the partial products consume
    ;           them.  ex (sp),hl is then used to swap the running sum
    ;           with the next operand pair, so every push is matched by a
    ;           pop before ret.
    ;         * each "MLT rr (xBC)" banner marks one 8x8->16 LUT access:
    ;           the two named registers supply the operands, the same two
    ;           registers receive the result (first named = MSB), and BC
    ;           is overwritten (B = operand Y, C = port address).
    ;         * numbers at the start of a comment are Z80 T-states.

    ; save material for the byte p7 p6 = x3*y3 + p5 carry
    exx                         ; 4  -> set H (d=x3 e=x2 b=y3 c=y2)
    ld h,d                      ; 4
    ld l,b                      ; 4
    push hl                     ; 11 x3 y3

    ; save material for the byte p5 = x3*y2 + x2*y3 + p4 carry
    ld l,c                      ; 4  h is still x3
    push hl                     ; 11 x3 y2
    ld h,b                      ; 4
    ld l,e                      ; 4
    push hl                     ; 11 y3 x2

    ; save material for the byte p4 = x3*y1 + x2*y2 + x1*y3 + p3 carry
    ld h,e                      ; 4
    ld l,c                      ; 4
    push hl                     ; 11 x2 y2
    ld h,d                      ; 4
    ld l,b                      ; 4
    push hl                     ; 11 x3 y3 (paired crosswise with x1 y1 later)
    exx                         ; 4  -> set L (d=x1 e=x0 b=y1 c=y0)
    ld l,b                      ; 4
    ld h,d                      ; 4
    push hl                     ; 11 x1 y1

    ; save material for the byte p3 = x3*y0 + x2*y1 + x1*y2 + x0*y3 + p2 carry
    push bc                     ; 11 y1 y0
    exx                         ; 4  -> set H
    push de                     ; 11 x3 x2
    push bc                     ; 11 y3 y2
    exx                         ; 4  -> set L
    push de                     ; 11 x1 x0

    ; save material for the byte p2 = x2*y0 + x0*y2 + x1*y1 + p1 carry
    ; start of 32_32x32
    exx                         ; 4  -> set H
    ld h,e                      ; 4
    ld l,c                      ; 4
    push hl                     ; 11 x2 y2

    exx                         ; 4  -> set L
    ld h,e                      ; 4
    ld l,c                      ; 4
    push hl                     ; 11 x0 y0

    ; start of 32_16x16          p1 = x1*y0 + x0*y1 + p0 carry
    ;                            p0 = x0*y0

    ld h,d                      ; 4
    ld l,b                      ; 4
    push hl                     ; 11 x1 y1

    ld h,e                      ; 4
    ld l,c                      ; 4
    push hl                     ; 11 x0 y0

    ld h,b                      ; 4  y1
    ld l,c                      ; 4  y0 (copy y out of bc; bc becomes the port register)

;;; MLT HE (xBC) ;;;;;;;;;;;;;;;; y1*x0
    ld c,__IO_LUT_OPERAND_LATCH ; 7  operand latch address
    ld b,h                      ; 4  operand Y in B
    out (c),e                   ; 12 operand X from E
    in e,(c)                    ; 12 result Z LSB to E
    inc c                       ; 4  result MSB address
    in h,(c)                    ; 12 result Z MSB to H

;;; MLT DL (xBC) ;;;;;;;;;;;;;;;; x1*y0
    dec c                       ; 4  operand latch address
    ld b,d                      ; 4  operand Y in B
    out (c),l                   ; 12 operand X from L
    in l,(c)                    ; 12 result Z LSB to L
    inc c                       ; 4  result MSB address
    in d,(c)                    ; 12 result Z MSB to D

    ; hl = MSB(y1*x0):LSB(x1*y0) and de = MSB(x1*y0):LSB(y1*x0), so
    ; hl + de equals the sum of the two cross products
    xor a                       ; 4  zero A
    add hl,de                   ; 11 add cross products
    adc a,a                     ; 4  capture carry

    pop de                      ; 10 restore y0*x0

;;; MLT DE (xBC) ;;;;;;;;;;;;;;;; y0*x0
    dec c                       ; 4  operand latch address
    ld b,d                      ; 4  operand Y in B
    out (c),e                   ; 12 operand X from E
    in e,(c)                    ; 12 result Z LSB to E
    inc c                       ; 4  result MSB address
    in d,(c)                    ; 12 result Z MSB to D

    ld b,a                      ; 4  carry from cross products

    ld a,d                      ; 4
    add a,l                     ; 4  p1 = MSB(x0*y0) + LSB(cross sum); C flag = carry into p2
    ld d,a                      ; 4  de = final LSW

    ld l,h                      ; 4  LSB of MSW from cross products
    ld h,b                      ; 4  carry from cross products

    ex (sp),hl                  ; 19 restore y1*x1, stack interim p3 p2

    ; the carry from add a,l above is consumed by adc hl,bc below; the
    ; instructions in between are relied on to leave the C flag untouched

;;; MLT HL (xBC) ;;;;;;;;;;;;;;;; x1*y1
    dec c                       ; 4  operand latch address
    ld b,h                      ; 4  operand Y in B
    out (c),l                   ; 12 operand X from L
    in l,(c)                    ; 12 result Z LSB to L
    inc c                       ; 4  result MSB address
    in h,(c)                    ; 12 result Z MSB to H

    pop bc                      ; 10 destack interim p3 p2

    adc hl,bc                   ; 15 HL = interim MSW p3 p2
    ex de,hl                    ; 4  DEHL = end of 32_16x16

    push de                     ; 11 stack interim p3 p2 (hand over to set H)

    ; continue doing the p2 byte

    exx                         ; 4  -> set H, now we're working in the high order bytes
                                ;    set L keeps hl = p1 p0
    pop hl                      ; 10 destack interim p3 p2

    pop de                      ; 10 x0 y0
    ex (sp),hl                  ; 19 x2 y2, stack interim p3 p2

;;; MLT HE (xBC) ;;;;;;;;;;;;;;;; x2*y0
    ld c,__IO_LUT_OPERAND_LATCH ; 7  operand latch address (this set's C has not been loaded with it yet)
    ld b,h                      ; 4  operand Y in B
    out (c),e                   ; 12 operand X from E
    in e,(c)                    ; 12 result Z LSB to E
    inc c                       ; 4  result MSB address
    in h,(c)                    ; 12 result Z MSB to H

;;; MLT DL (xBC) ;;;;;;;;;;;;;;;; x0*y2
    dec c                       ; 4  operand latch address
    ld b,d                      ; 4  operand Y in B
    out (c),l                   ; 12 operand X from L
    in l,(c)                    ; 12 result Z LSB to L
    inc c                       ; 4  result MSB address
    in d,(c)                    ; 12 result Z MSB to D

    xor a                       ; 4  zero A
    add hl,de                   ; 11 add cross products
    adc a,a                     ; 4  capture carry p4
    pop de                      ; 10 destack interim p3 p2
    add hl,de                   ; 11 p3 p2
    adc a,0                     ; 7  capture carry p4

    push hl                     ; 11 pass p3 p2 to set L through the stack

    exx                         ; 4  -> set L
    pop de                      ; 10 e = final p2 (d is replaced by final p3 later)

    exx                         ; 4  -> set H

    ld l,h                      ; 4  promote HL p4 p3
    ld h,a                      ; 4

    ; start doing the p3 byte

    pop de                      ; 10 x1 x0
    ex (sp),hl                  ; 19 y3 y2, stack interim p4 p3

;;; MLT HE (xBC) ;;;;;;;;;;;;;;;; y3*x0
    dec c                       ; 4  operand latch address
    ld b,h                      ; 4  operand Y in B
    out (c),e                   ; 12 operand X from E
    in e,(c)                    ; 12 result Z LSB to E
    inc c                       ; 4  result MSB address
    in h,(c)                    ; 12 result Z MSB to H
;;; MLT DL (xBC) ;;;;;;;;;;;;;;;; x1*y2
    dec c                       ; 4  operand latch address
    ld b,d                      ; 4  operand Y in B
    out (c),l                   ; 12 operand X from L
    in l,(c)                    ; 12 result Z LSB to L
    inc c                       ; 4  result MSB address
    in d,(c)                    ; 12 result Z MSB to D

    xor a                       ; 4  zero A
    add hl,de                   ; 11 p4 p3
    adc a,a                     ; 4  p5
    pop de                      ; 10 destack interim p4 p3
    add hl,de                   ; 11 p4 p3
    adc a,0                     ; 7  p5

    pop de                      ; 10 x3 x2
    ex (sp),hl                  ; 19 y1 y0, stack interim p4 p3

;;; MLT HE (xBC) ;;;;;;;;;;;;;;;; y1*x2
    dec c                       ; 4  operand latch address
    ld b,h                      ; 4  operand Y in B
    out (c),e                   ; 12 operand X from E
    in e,(c)                    ; 12 result Z LSB to E
    inc c                       ; 4  result MSB address
    in h,(c)                    ; 12 result Z MSB to H
;;; MLT DL (xBC) ;;;;;;;;;;;;;;;; x3*y0
    dec c                       ; 4  operand latch address
    ld b,d                      ; 4  operand Y in B
    out (c),l                   ; 12 operand X from L
    in l,(c)                    ; 12 result Z LSB to L
    inc c                       ; 4  result MSB address
    in d,(c)                    ; 12 result Z MSB to D

    ; A is deliberately not cleared here: it keeps accumulating the p5
    ; carries of all four p3 partial products
    add hl,de                   ; 11 p4 p3
    adc a,0                     ; 7  p5

    pop de                      ; 10 destack interim p4 p3
    add hl,de                   ; 11 p4 p3
    adc a,0                     ; 7  p5

    push hl                     ; 11 leave final p3 in L

    exx                         ; 4  -> set L
    pop bc                      ; 10 c = final p3 (bc of set L is no longer needed)
    ld d,c                      ; 4  put final p3 in D

    exx                         ; 4  -> set H; low 32 bits now complete in DEHL of set L

    ld l,h                      ; 4  prepare HL for next cycle
    ld h,a                      ; 4  promote HL p5 p4

    ; start doing the p4 byte

    pop de                      ; 10 x1 y1
    ex (sp),hl                  ; 19 x3 y3, stack interim p5 p4

;;; MLT HE (xBC) ;;;;;;;;;;;;;;;; x3*y1
    dec c                       ; 4  operand latch address
    ld b,h                      ; 4  operand Y in B
    out (c),e                   ; 12 operand X from E
    in e,(c)                    ; 12 result Z LSB to E
    inc c                       ; 4  result MSB address
    in h,(c)                    ; 12 result Z MSB to H
;;; MLT DL (xBC) ;;;;;;;;;;;;;;;; x1*y3
    dec c                       ; 4  operand latch address
    ld b,d                      ; 4  operand Y in B
    out (c),l                   ; 12 operand X from L
    in l,(c)                    ; 12 result Z LSB to L
    inc c                       ; 4  result MSB address
    in d,(c)                    ; 12 result Z MSB to D

    xor a                       ; 4  zero A
    add hl,de                   ; 11 p5 p4
    adc a,a                     ; 4  p6

    pop de                      ; 10 destack interim p5 p4
    add hl,de                   ; 11 p5 p4
    adc a,0                     ; 7  p6

    pop de                      ; 10 x2 y2

;;; MLT DE (xBC) ;;;;;;;;;;;;;;;; x2*y2
    dec c                       ; 4  operand latch address
    ld b,d                      ; 4  operand Y in B
    out (c),e                   ; 12 operand X from E
    in e,(c)                    ; 12 result Z LSB to E
    inc c                       ; 4  result MSB address
    in d,(c)                    ; 12 result Z MSB to D

    add hl,de                   ; 11 p5 p4
    adc a,0                     ; 7  p6

    ld e,l                      ; 4  final p4 byte in E
    ld l,h                      ; 4  prepare HL for next cycle
    ld h,a                      ; 4  promote HL p6 p5

    ; start doing the p5 byte

    ex (sp),hl                  ; 19 y3 x2, stack interim p6 p5

;;; MLT HL (xBC) ;;;;;;;;;;;;;;;; y3*x2
    dec c                       ; 4  operand latch address
    ld b,h                      ; 4  operand Y in B
    out (c),l                   ; 12 operand X from L
    in l,(c)                    ; 12 result Z LSB to L
    inc c                       ; 4  result MSB address
    in h,(c)                    ; 12 result Z MSB to H

    xor a                       ; 4  zero A
    pop bc                      ; 10 destack interim p6 p5 (overwrites the port address in C)
    add hl,bc                   ; 11 p6 p5
    adc a,a                     ; 4  p7

    ex (sp),hl                  ; 19 x3 y2, stack interim p6 p5

;;; MLT HL (xBC) ;;;;;;;;;;;;;;;; x3*y2
    ld c,__IO_LUT_OPERAND_LATCH ; 7  operand latch address (reload, C was overwritten by pop bc)
    ld b,h                      ; 4  operand Y in B
    out (c),l                   ; 12 operand X from L
    in l,(c)                    ; 12 result Z LSB to L
    inc c                       ; 4  result MSB address
    in h,(c)                    ; 12 result Z MSB to H

    pop bc                      ; 10 destack interim p6 p5 (overwrites the port address in C)
    add hl,bc                   ; 11 p6 p5
    adc a,0                     ; 7  p7

    ld d,l                      ; 4  final p5 byte in D
    ld l,h                      ; 4  prepare HL for next cycle
    ld h,a                      ; 4  promote HL p7 p6

    ; start doing the p6 p7 bytes

    ex (sp),hl                  ; 19 x3 y3, stack interim p7 p6

;;; MLT HL (xBC) ;;;;;;;;;;;;;;;; x3*y3
    ld c,__IO_LUT_OPERAND_LATCH ; 7  operand latch address (reload, C was overwritten by pop bc)
    ld b,h                      ; 4  operand Y in B
    out (c),l                   ; 12 operand X from L
    in l,(c)                    ; 12 result Z LSB to L
    inc c                       ; 4  result MSB address
    in h,(c)                    ; 12 result Z MSB to H

    pop bc                      ; 10 destack interim p7 p6
    add hl,bc                   ; 11 p7 p6
    ex de,hl                    ; 4  p7 p6 <-> p5 p4

    ret                         ; exit  : DEHL DEHL' = 64-bit product