#if 0
static void
matmul_sub_large_fbd(abobj_ptr x, abt ** sb, const abt * z, const uint8_t * q, unsigned int n)
{
    /* Dispatch data found in z[0]...z[n-1] such that z[i] goes to the
     * array pointed to by sb[q[i]]. Exactly n coefficients are expected.
     * The sb[] pointers are advanced as coefficients are written. */
    for(unsigned int c = 0 ; c < n ; c++) {
        abcopy(x, sb[q[c]], z, 1);
        sb[q[c]] += aboffset(x, 1);
        z += aboffset(x, 1);
    }
}
#endif
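
/* Illustrative caller sketch, kept under #if 0 like the reference loop
 * above. It shows how the sb[] bucket pointers might be set up before
 * dispatching; NBUCKETS and dispatch_into_buckets are hypothetical names
 * used only for this sketch, while abobj_ptr, abt and matmul_sub_large_fbd
 * are the ones used above. Because the function advances the sb[] pointers,
 * the caller hands it a scratch copy of its bucket pointers. */
#if 0
#define NBUCKETS 256    /* q holds uint8_t indices, so at most 256 buckets */
static void
dispatch_into_buckets(abobj_ptr x, abt * bucket_start[NBUCKETS],
                      const abt * z, const uint8_t * q, unsigned int n)
{
    abt * sb[NBUCKETS];
    for(unsigned int b = 0 ; b < NBUCKETS ; b++)
        sb[b] = bucket_start[b];        /* scratch copies; they get advanced */
    matmul_sub_large_fbd(x, sb, z, q, n);
    /* on return, sb[b] - bucket_start[b] is the amount written to bucket b */
}
#endif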
	.text
	.p2align 4,,15
.globl matmul_sub_large_fbd
	.type	matmul_sub_large_fbd, @function
matmul_sub_large_fbd:
        pushq %rbp
        pushq %rbx
        pushq %r15
        pushq %r14
        pushq %r13
        pushq %r12
        // sb pointer --> store in rdi
        movq    %rsi, %rdi
        // z pointer --> store in rsi
        movq    %rdx, %rsi
        // q pointer (bucket indices, taken from the matrix data) --> rbp
        movq    %rcx, %rbp

        movq    %r8, %r9
        test    %r9, %r9

        // at .Lfinished, rbp is shipped as a return value.
        je      .Lfinished

        // we want our reads aligned, we want to unroll our main loop,
        // and leave at least some space in the tail because the tight
        // loop does some readahead.

#define ALIGN_MASK      7

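// one_simple_move(reg2,reg3,reg4): dispatch one coefficient.
//   reg3 <- the 64-bit coefficient at (%rsi)           (current z entry)
//   reg2 <- the zero-extended bucket index at (%rbp)   (current q entry)
//   reg4 <- sb[reg2] ; the coefficient is stored there, sb[reg2] is bumped
//   by 8, then the q pointer (rbp) and the z pointer (rsi) are advanced.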
#define one_simple_move(reg2,reg3,reg4)                 \
        movq    (%rsi), % ## reg3                       ; \
        movzbl  (%rbp), % ## reg2 ## d                  ; \
        movq    (%rdi,% ## reg2,8), % ## reg4           ; \
        movq    % ## reg3, (% ## reg4)                  ; \
        addq    $8, (%rdi,% ## reg2,8)                  ; \
        addq    $1, %rbp                                ; \
        addq    $8, %rsi

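// Peel off coefficients one at a time until the q pointer (rbp) is 8-byte
// aligned (or the count runs out), so that the main loop below can fetch
// the bucket indices eight at a time with aligned 8-byte loads.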
.Lunaligned_loop:
        movq    %rbp, %rax
        andq    $ALIGN_MASK, %rax
        je      .Laligned
        one_simple_move(r8, r10, r11)
        subq    $1, %r9
        je      .Lfinished
        jmp     .Lunaligned_loop

.Laligned:
        // now rbp is aligned.

        // four bytes of readahead seem to provide timings a lot more
        // stable than with just two.

#define READAHEAD_COEFFS        32
        // we're reading READAHEAD_COEFFS coeffs ahead of time, so the
        // tail must contain at least that many coefficients.

        leaq    (%rbp,%r9,1), %r15

        cmpq    $READAHEAD_COEFFS, %r9

        // the tail loop is just simple and stupid. rbp points to a
        // location where exactly r9 coeffs are to be read. Note that
        // some coefficients are already in registers at this point. But
        // this does not matter much (we could avoid re-reading, but
        // really we don't care).
        jb      .Ltail_loop

        // we're going to advance rbp one loop iteration ahead. This
        // eases the handling of the tail.

#define LOOP_LENGTH     32

        subq    $READAHEAD_COEFFS, %r9
        // r15+1*READAHEAD_COEFFS : end data pointer.
        leaq    (%rbp,%r9,1), %r15

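        // r15 now equals q + n - READAHEAD_COEFFS, so the main loop keeps
        // going while at least READAHEAD_COEFFS coefficients remain past rbp.
        // Preload the first 32 bucket indices from q, 8 per register.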
        movq    0(%rbp), %rax
        movq    8(%rbp), %rbx
        movq    16(%rbp), %rcx
        movq    24(%rbp), %rdx

        addq    $LOOP_LENGTH, %rbp

        cmpq    %rbp, %r15
        jb      .Lfixup_before_tail
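        // rbp is now one LOOP_LENGTH ahead of the indices held in rax..rdx,
        // so the reload at the end of each eight_moves_and_reload below
        // fetches the next iteration's indices.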

.Lmain_loop:

// reg1 : one of a..d, containing 8 coeffs, one per byte (only the bucket
// number).
// reg2 scratch buffer for storing bucket number
// reg3 scratch buffer for the coefficient block in the source vector
// reg4 scratch buffer for the address in the bucket
#define use_one_coefficient(reg1,reg2,reg3,reg4,soffset)        \
        movq    soffset(%rsi), % ## reg3                        ; \
        movzbl  % ## reg1 ## l, % ## reg2 ## d                  ; \
        movq    (%rdi,% ## reg2,8), % ## reg4                   ; \
        movq    % ## reg3, (% ## reg4)                          ; \
        addq    $8, % ## reg4                                   ; \
        movq    % ## reg4, (%rdi,% ## reg2,8)

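// eight_moves_and_reload(reg1,...,offset,soffset): dispatch the eight
// coefficients whose bucket indices are packed one per byte in reg1,
// shifting reg1 right by 8 between coefficients, then reload reg1 with
// the next eight indices from offset(%rbp).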
#define eight_moves_and_reload(reg1,reg2,reg3,reg4,offset,soffset)       \
        use_one_coefficient(reg1,reg2,reg3,reg4, soffset + 0)   ; \
        shrq    $8, %r ## reg1 ## x                             ; \
        use_one_coefficient(reg1,reg2,reg3,reg4, soffset + 8)   ; \
        shrq    $8, %r ## reg1 ## x                             ; \
        use_one_coefficient(reg1,reg2,reg3,reg4, soffset + 16)  ; \
        shrq    $8, %r ## reg1 ## x                             ; \
        use_one_coefficient(reg1,reg2,reg3,reg4, soffset + 24)  ; \
        shrq    $8, %r ## reg1 ## x                             ; \
        use_one_coefficient(reg1,reg2,reg3,reg4, soffset + 32)  ; \
        shrl    $8, %e ## reg1 ## x                             ; \
        use_one_coefficient(reg1,reg2,reg3,reg4, soffset + 40)  ; \
        shrl    $8, %e ## reg1 ## x                             ; \
        use_one_coefficient(reg1,reg2,reg3,reg4, soffset + 48)  ; \
        shrl    $8, %e ## reg1 ## x                             ; \
        use_one_coefficient(reg1,reg2,reg3,reg4, soffset + 56)  ; \
        movq    offset (%rbp), %r ## reg1 ## x

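        // Each main loop iteration dispatches 32 coefficients (4 x 8) and
        // consumes 0x100 bytes of z; the prefetch pulls in the z data that
        // will be needed one iteration ahead.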
        prefetchnta     0x100(%rsi)
        eight_moves_and_reload(a,r8,r9,r10,,0)
        addq    $0x40, %rsi
        eight_moves_and_reload(b,r11,r12,r13,8,0)
        addq    $0x40, %rsi
        eight_moves_and_reload(c,r8,r9,r10,16,0)
        addq    $0x40, %rsi
        eight_moves_and_reload(d,r11,r12,r13,24,0)
        addq    $0x40, %rsi

        // note that we need 2*READAHEAD_COEFFS bytes of readahead at the
        // end of the array.

        addq    $LOOP_LENGTH, %rbp
        cmpq    %rbp, %r15
        ja      .Lmain_loop

.Lfixup_before_tail:
        // fix r15 for real value
        movq    $READAHEAD_COEFFS, %r9
        leaq    (%r15,%r9,1), %r15
        // also fix rbp
        subq    $LOOP_LENGTH, %rbp
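        // r15 had been lowered by READAHEAD_COEFFS and rbp advanced by one
        // LOOP_LENGTH for the main loop; undoing both leaves rbp pointing at
        // the first not-yet-dispatched index and r15 at the true end of q.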

.Ltail_loop:
        cmpq    %rbp, %r15
        je      .Lfinished
        one_simple_move(r8, r10, r11)
        jmp     .Ltail_loop

.Lfinished:
        movq    %rbp, %rax

        popq    %r12
        popq    %r13
        popq    %r14
        popq    %r15
        popq    %rbx
        popq    %rbp
        ret
        .size   matmul_sub_large_fbd, .-matmul_sub_large_fbd