1;
2;  feilipu, 2020 March
3;
4;  This Source Code Form is subject to the terms of the Mozilla Public
5;  License, v. 2.0. If a copy of the MPL was not distributed with this
6;  file, You can obtain one at http://mozilla.org/MPL/2.0/.
7;
8;------------------------------------------------------------------------------
9;
10; Using RC2014 LUT Module
11;
12;------------------------------------------------------------------------------
13
14INCLUDE "config_private.inc"
15
16SECTION code_clib
17SECTION code_math
18
19PUBLIC l_lut_mulu_64_32x32, l0_lut_mulu_64_32x32
20
l_lut_mulu_64_32x32:

    ; 32-bit x 32-bit unsigned multiply giving a 64-bit product.
    ; This entry point only rearranges the operands into the register
    ; layout wanted by l0_lut_mulu_64_32x32 and then falls through to it.
    ;
    ; enter : dehl = 32-bit multiplicand (becomes bc'bc = y)
    ;         dehl'= 32-bit multiplier   (becomes de'de = x)
    ;
    ; exit  : dehl dehl' = 64-bit product
    ;         carry reset
    ;
    ; uses  : af, bc, de, hl, af', bc', de', hl'

    push de                     ; stack the high word of the multiplicand
    ld b,h                      ; bc  = low word of the multiplicand (y1 y0)
    ld c,l
    exx                         ; ' switch to the alternate set
    pop bc                      ; ' bc' = high word of the multiplicand (y3 y2)
    push hl                     ; ' stack the low word of the multiplier
    exx                         ; back to the main set
    pop de                      ; de  = low word of the multiplier (x1 x0)
                                ; de' is already the multiplier high word (x3 x2)

    ; no ret: execution continues into l0_lut_mulu_64_32x32
41
l0_lut_mulu_64_32x32:

    ; multiplication of two 32-bit numbers into a 64-bit product
    ;
    ; enter : de'de = 32-bit multiplier    = x   (d'=x3 e'=x2 d=x1 e=x0)
    ;         bc'bc = 32-bit multiplicand  = y   (b'=y3 c'=y2 b=y1 c=y0)
    ;
    ; exit  : dehl dehl' = 64-bit product
    ;         dehl  (active set on return)    = p7 p6 p5 p4
    ;         dehl' (alternate set on return) = p3 p2 p1 p0
    ;         carry reset
    ;
    ;         The routine executes an odd number of exx, so the register
    ;         set that is active on return is the one that was the
    ;         alternate set on entry.
    ;
    ; uses  : af, bc, de, hl, af', bc', de', hl'
    ;
    ; Method: schoolbook multiplication on bytes. All 16 byte products
    ; xi*yj are formed with the LUT and accumulated column by column
    ; (p0 first, p7 last). The operand pairs are pushed up front, in the
    ; reverse of the order in which they are consumed, so that the
    ; multiply phase only needs pop / ex (sp),hl to fetch them.
    ;
    ; Each ";;; MLT rr (xBC)" group forms one 8x8 product. Per the
    ; comments on those groups, B carries operand Y, the byte written
    ; with out (c),r is operand X, the low result byte is read back from
    ; port C and the high result byte from port C+1.
    ;
    ; Stack after all pushes (top of stack last):
    ;   x3 y3 | x3 y2 | y3 x2 | x2 y2 | x3 y3 | x1 y1 | y1 y0 |
    ;   x3 x2 | y3 y2 | x1 x0 | x2 y2 | x0 y0 | x1 y1 | x0 y0

    ; save material for the byte p7 p6 = x3*y3 + p5 carry
    exx                         ; 4 '
    ld h,d                      ; 4 '
    ld l,b                      ; 4 '
    push hl                     ; 11'x3 y3

    ; save material for the byte p5 = x3*y2 + x2*y3 + p4 carry
    ld l,c                      ; 4 '
    push hl                     ; 11'x3 y2
    ld h,b                      ; 4 '
    ld l,e                      ; 4 '
    push hl                     ; 11'y3 x2

    ; save material for the byte p4 = x3*y1 + x2*y2 + x1*y3 + p3 carry
    ; (the x3 y3 and x1 y1 pairs are later multiplied crosswise to give
    ;  x3*y1 and x1*y3)
    ld h,e                      ; 4 '
    ld l,c                      ; 4 '
    push hl                     ; 11'x2 y2
    ld h,d                      ; 4 '
    ld l,b                      ; 4 '
    push hl                     ; 11'x3 y3
    exx                         ; 4
    ld l,b                      ; 4
    ld h,d                      ; 4
    push hl                     ; 11 x1 y1

    ; save material for the byte p3 = x3*y0 + x2*y1 + x1*y2 + x0*y3 + p2 carry
    push bc                     ; 11 y1 y0
    exx                         ; 4 '
    push de                     ; 11'x3 x2
    push bc                     ; 11'y3 y2
    exx                         ; 4
    push de                     ; 11 x1 x0

    ; save material for the byte p2 = x2*y0 + x0*y2 + x1*y1 + p1 carry
    ; start of 32_32x32
    exx                         ; 4 '
    ld h,e                      ; 4 '
    ld l,c                      ; 4 '
    push hl                     ; 11'x2 y2

    exx                         ; 4
    ld h,e                      ; 4
    ld l,c                      ; 4
    push hl                     ; 11 x0 y0

    ; start of 32_16x16          p1 = x1*y0 + x0*y1 + p0 carry
    ;                            p0 = x0*y0

    ld h,d                      ; 4
    ld l,b                      ; 4
    push hl                     ; 11 x1 y1

    ld h,e                      ; 4
    ld l,c                      ; 4
    push hl                     ; 11 x0 y0

    ld h,b                      ; 4  y1
    ld l,c                      ; 4  y0

    ; here hl = y1 y0 and de = x1 x0

;;; MLT HE (xBC) ;;;;;;;;;;;;;;;; y1*x0
    ld c,__IO_LUT_OPERAND_LATCH ; 7  operand latch address
    ld b,h                      ; 4  operand Y in B
    out (c),e                   ; 12 operand X from E
    in e,(c)                    ; 12 result Z LSB to E
    inc c                       ; 4  result MSB address
    in h,(c)                    ; 12 result Z MSB to H

;;; MLT DL (xBC) ;;;;;;;;;;;;;;;; x1*y0
    dec c                       ; 4  operand latch address
    ld b,d                      ; 4  operand Y in B
    out (c),l                   ; 12 operand X from L
    in l,(c)                    ; 12 result Z LSB to L
    inc c                       ; 4  result MSB address
    in d,(c)                    ; 12 result Z MSB to D

    ; The two products are now interleaved:
    ;   hl = MSB(y1*x0) : LSB(x1*y0)
    ;   de = MSB(x1*y0) : LSB(y1*x0)
    ; so a single 16-bit add still yields y1*x0 + x1*y0. The same
    ; HE / DL arrangement is used for every pair of cross products below.

    xor a                       ; 4  zero A
    add hl,de                   ; 11 add cross products
    adc a,a                     ; 4  capture carry

    pop de                      ; 10 restore y0*x0

;;; MLT DE (xBC) ;;;;;;;;;;;;;;;; y0*x0
    dec c                       ; 4  operand latch address
    ld b,d                      ; 4  operand Y in B
    out (c),e                   ; 12 operand X from E
    in e,(c)                    ; 12 result Z LSB to E
    inc c                       ; 4  result MSB address
    in d,(c)                    ; 12 result Z MSB to D

    ld b,a                      ; 4  carry from cross products

    ld a,d                      ; 4
    add a,l                     ; 4  carry out of p1 is consumed by adc hl,bc below
    ld d,a                      ; 4  de = final LSW

    ld l,h                      ; 4  LSB of MSW from cross products
    ld h,b                      ; 4  carry from cross products

    ex (sp),hl                  ; 19 restore y1*x1, stack interim p3 p2

    ; The code relies on the carry from "add a,l" above still being
    ; intact at "adc hl,bc": nothing in between is expected to alter it.

;;; MLT HL (xBC) ;;;;;;;;;;;;;;;; x1*y1
    dec c                       ; 4  operand latch address
    ld b,h                      ; 4  operand Y in B
    out (c),l                   ; 12 operand X from L
    in l,(c)                    ; 12 result Z LSB to L
    inc c                       ; 4  result MSB address
    in h,(c)                    ; 12 result Z MSB to H

    pop bc                      ; 10 destack interim p3 p2

    adc hl,bc                   ; 15 HL = interim MSW p3 p2
    ex de,hl                    ; 4  DEHL = end of 32_16x16

    push de                     ; 11 stack interim p3 p2

    ; continue doing the p2 byte

    exx                         ; 4  now we're working in the high order bytes
                                ;    DEHL' = end of 32_16x16
    pop hl                      ; 10 destack interim p3 p2

    pop de                      ; 10 x0 y0
    ex (sp),hl                  ; 19 x2 y2, stack interim p3 p2

;;; MLT HE (xBC) ;;;;;;;;;;;;;;;; x2*y0
    ld c,__IO_LUT_OPERAND_LATCH ; 7  operand latch address (C of this register set not yet set up)
    ld b,h                      ; 4  operand Y in B
    out (c),e                   ; 12 operand X from E
    in e,(c)                    ; 12 result Z LSB to E
    inc c                       ; 4  result MSB address
    in h,(c)                    ; 12 result Z MSB to H

;;; MLT DL (xBC) ;;;;;;;;;;;;;;;; x0*y2
    dec c                       ; 4  operand latch address
    ld b,d                      ; 4  operand Y in B
    out (c),l                   ; 12 operand X from L
    in l,(c)                    ; 12 result Z LSB to L
    inc c                       ; 4  result MSB address
    in d,(c)                    ; 12 result Z MSB to D

    xor a                       ; 4
    add hl,de                   ; 11
    adc a,a                     ; 4  capture carry p4
    pop de                      ; 10 destack interim p3 p2
    add hl,de                   ; 11
    adc a,0                     ; 7  capture carry p4

    push hl                     ; 11 pass interim p3 and final p2 to the other register set

    exx                         ; 4 '
    pop de                      ; 10'save p2 in E'

    exx                         ; 4

    ld l,h                      ; 4  promote HL p4 p3
    ld h,a                      ; 4

    ; start doing the p3 byte

    pop de                      ; 10 x1 x0
    ex (sp),hl                  ; 19 y3 y2, stack interim p4 p3

;;; MLT HE (xBC) ;;;;;;;;;;;;;;;; y3*x0
    dec c                       ; 4  operand latch address
    ld b,h                      ; 4  operand Y in B
    out (c),e                   ; 12 operand X from E
    in e,(c)                    ; 12 result Z LSB to E
    inc c                       ; 4  result MSB address
    in h,(c)                    ; 12 result Z MSB to H
;;; MLT DL (xBC) ;;;;;;;;;;;;;;;; x1*y2
    dec c                       ; 4  operand latch address
    ld b,d                      ; 4  operand Y in B
    out (c),l                   ; 12 operand X from L
    in l,(c)                    ; 12 result Z LSB to L
    inc c                       ; 4  result MSB address
    in d,(c)                    ; 12 result Z MSB to D

    xor a                       ; 4  zero A
    add hl,de                   ; 11 p4 p3
    adc a,a                     ; 4  p5
    pop de                      ; 10 destack interim p4 p3
    add hl,de                   ; 11 p4 p3
    adc a,0                     ; 7  p5

    pop de                      ; 10 x3 x2
    ex (sp),hl                  ; 19 y1 y0, stack interim p4 p3

;;; MLT HE (xBC) ;;;;;;;;;;;;;;;; y1*x2
    dec c                       ; 4  operand latch address
    ld b,h                      ; 4  operand Y in B
    out (c),e                   ; 12 operand X from E
    in e,(c)                    ; 12 result Z LSB to E
    inc c                       ; 4  result MSB address
    in h,(c)                    ; 12 result Z MSB to H
;;; MLT DL (xBC) ;;;;;;;;;;;;;;;; x3*y0
    dec c                       ; 4  operand latch address
    ld b,d                      ; 4  operand Y in B
    out (c),l                   ; 12 operand X from L
    in l,(c)                    ; 12 result Z LSB to L
    inc c                       ; 4  result MSB address
    in d,(c)                    ; 12 result Z MSB to D

    ; A is not cleared here: it keeps accumulating the p5 carries
    ; collected from the first pair of p3 products.
    add hl,de                   ; 11 p4 p3
    adc a,0                     ; 7  p5

    pop de                      ; 10 destack interim p4 p3
    add hl,de                   ; 11 p4 p3
    adc a,0                     ; 7  p5

    push hl                     ; 11 leave final p3 in L

    exx                         ; 4 '
    pop bc                      ; 10'
    ld d,c                      ; 4 'put final p3 in D

    exx                         ; 4  low 32bits in DEHL

    ld l,h                      ; 4  prepare HL for next cycle
    ld h,a                      ; 4  promote HL p5 p4

    ; start doing the p4 byte

    pop de                      ; 10 x1 y1
    ex (sp),hl                  ; 19 x3 y3, stack interim p5 p4

;;; MLT HE (xBC) ;;;;;;;;;;;;;;;; x3*y1
    dec c                       ; 4  operand latch address
    ld b,h                      ; 4  operand Y in B
    out (c),e                   ; 12 operand X from E
    in e,(c)                    ; 12 result Z LSB to E
    inc c                       ; 4  result MSB address
    in h,(c)                    ; 12 result Z MSB to H
;;; MLT DL (xBC) ;;;;;;;;;;;;;;;; x1*y3
    dec c                       ; 4  operand latch address
    ld b,d                      ; 4  operand Y in B
    out (c),l                   ; 12 operand X from L
    in l,(c)                    ; 12 result Z LSB to L
    inc c                       ; 4  result MSB address
    in d,(c)                    ; 12 result Z MSB to D

    xor a                       ; 4  zero A
    add hl,de                   ; 11 p5 p4
    adc a,a                     ; 4  p6

    pop de                      ; 10 destack interim p5 p4
    add hl,de                   ; 11 p5 p4
    adc a,0                     ; 7  p6

    pop de                      ; 10 x2 y2

;;; MLT DE (xBC) ;;;;;;;;;;;;;;;; x2*y2
    dec c                       ; 4  operand latch address
    ld b,d                      ; 4  operand Y in B
    out (c),e                   ; 12 operand X from E
    in e,(c)                    ; 12 result Z LSB to E
    inc c                       ; 4  result MSB address
    in d,(c)                    ; 12 result Z MSB to D

    add hl,de                   ; 11 p5 p4
    adc a,0                     ; 7  p6

    ld e,l                      ; 4  final p4 byte in E
    ld l,h                      ; 4  prepare HL for next cycle
    ld h,a                      ; 4  promote HL p6 p5

    ; start doing the p5 byte
    ; (E now holds final p4, so BC is used to destack the interim sums;
    ;  that overwrites the port address in C, hence the reloads of C below)

    ex (sp),hl                  ; 19 y3 x2, stack interim p6 p5

;;; MLT HL (xBC) ;;;;;;;;;;;;;;;; y3*x2
    dec c                       ; 4  operand latch address
    ld b,h                      ; 4  operand Y in B
    out (c),l                   ; 12 operand X from L
    in l,(c)                    ; 12 result Z LSB to L
    inc c                       ; 4  result MSB address
    in h,(c)                    ; 12 result Z MSB to H

    xor a                       ; 4  zero A
    pop bc                      ; 10 destack interim p6 p5 (C no longer holds the port address)
    add hl,bc                   ; 11 p6 p5
    adc a,a                     ; 4  p7

    ex (sp),hl                  ; 19 x3 y2, stack interim p6 p5

;;; MLT HL (xBC) ;;;;;;;;;;;;;;;; x3*y2
    ld c,__IO_LUT_OPERAND_LATCH ; 7  operand latch address (reloaded after pop bc)
    ld b,h                      ; 4  operand Y in B
    out (c),l                   ; 12 operand X from L
    in l,(c)                    ; 12 result Z LSB to L
    inc c                       ; 4  result MSB address
    in h,(c)                    ; 12 result Z MSB to H

    pop bc                      ; 10 destack interim p6 p5 (C no longer holds the port address)
    add hl,bc                   ; 11 p6 p5
    adc a,0                     ; 7  p7

    ld d,l                      ; 4  final p5 byte in D
    ld l,h                      ; 4  prepare HL for next cycle
    ld h,a                      ; 4  promote HL p7 p6

    ; start doing the p6 p7 bytes

    ex (sp),hl                  ; 19 x3 y3, stack interim p7 p6

;;; MLT HL (xBC) ;;;;;;;;;;;;;;;; x3*y3
    ld c,__IO_LUT_OPERAND_LATCH ; 7  operand latch address (reloaded after pop bc)
    ld b,h                      ; 4  operand Y in B
    out (c),l                   ; 12 operand X from L
    in l,(c)                    ; 12 result Z LSB to L
    inc c                       ; 4  result MSB address
    in h,(c)                    ; 12 result Z MSB to H

    pop bc                      ; 10 destack interim p7 p6
    add hl,bc                   ; 11 p7 p6 (last flag-setting instruction before ret)
    ex de,hl                    ; 4  p7 p6 <-> p5 p4

    ret                         ;    exit  : DEHL DEHL' = 64-bit product
371