1/*
2 * Copyright 2014 Martin Peres <martin.peres@free.fr>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 *
22 * Authors: Martin Peres
23 */
24
25/******************************************************************************
26 * arith data segment
27 *****************************************************************************/
28#ifdef INCLUDE_PROC
29#endif
30
31#ifdef INCLUDE_DATA
32#endif
33
34/******************************************************************************
35 * arith code segment
36 *****************************************************************************/
37#ifdef INCLUDE_CODE
38
// mulu32_32_64: unsigned 32 x 32 -> 64 bit multiply
//
// The 64-bit product is assembled from four partial products of the
// 16-bit halves of the operands (this relies on mulu consuming only the
// low 16 bits of each source operand):
//
//   A * B =   A_lo * B_lo
//         + ( A_lo * B_hi ) << 16
//         + ( A_hi * B_lo ) << 16
//         + ( A_hi * B_hi ) << 32
//
// Inputs:
//   $r14 - A
//   $r13 - B
// Outputs:
//   $r12 - mul_lo, low  32 bits of the product
//   $r11 - mul_hi, high 32 bits of the product
// Not touched by this routine:
//   $r15 - current
//   $r0  - zero
// Scratch ($r1-$r4) is saved on entry and restored before returning.
mulu32_32_64:
	push $r1 // scratch: A_hi
	push $r2 // scratch: B_hi
	push $r3 // scratch: partial product, then its low half
	push $r4 // scratch: high half of the partial product

	// start from an empty 64-bit accumulator ($r11:$r12)
	clear b32 $r11
	clear b32 $r12

	// split off the upper 16 bits of each operand
	shr b32 $r2 $r13 16 // B_hi
	shr b32 $r1 $r14 16 // A_hi

	// mul_lo = A_lo * B_lo
	mulu $r12 $r14 $r13

	// accumulator += ( A_lo * B_hi ) << 16
	mulu $r3 $r14 $r2
	shr b32 $r4 $r3 16 // upper half goes into the high word
	and $r3 0xffff
	shl b32 $r3 16 // lower half goes into the top of the low word
	add b32 $r12 $r3
	adc b32 $r11 $r4 // carry out of the low word is folded in here

	// accumulator += ( A_hi * B_lo ) << 16
	mulu $r3 $r1 $r13
	shr b32 $r4 $r3 16 // upper half goes into the high word
	and $r3 0xffff
	shl b32 $r3 16 // lower half goes into the top of the low word
	add b32 $r12 $r3
	adc b32 $r11 $r4 // carry out of the low word is folded in here

	// accumulator += ( A_hi * B_hi ) << 32, i.e. only the high word changes
	mulu $r3 $r1 $r2
	add b32 $r11 $r3

	// restore scratch registers in reverse push order
	pop $r4
	pop $r3
	pop $r2
	pop $r1
	ret
94#endif
95