#if 0
static inline void matmul_sub_large_fbi(abobj_ptr x, abt ** sb, const abt * z, const uint8_t * q, unsigned int n)
{
    /* Dispatch data found in z[0]...z[f(n-1)] such that z[f(i)] is in
     * the array pointed to by sb[q[2*i+1]]. The function f(i) is given
     * by the sum q[0]+q[2]+...+q[2*i]. Exactly 2n coefficients are
     * expected in q[]. All the sb[] pointers are increased. */
    for(unsigned int c = 0 ; c < n ; c++) {
        z += aboffset(x, *q);
        q++;
        abcopy(x, sb[*q], z, 1);
        sb[*q] += aboffset(x, 1);
        q++;
    }
}

/* Simplified equivalent, one ulong word per coefficient: */
static inline void matmul_sub_large_fbi(ulong ** sb, const ulong * z, const uint8_t * q, unsigned int n)
{
    for(unsigned int c = 0 ; c < n ; c++) {
        z += *q++;
        *sb[*q++]++ = *z;
    }
}
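
/* An illustrative (hypothetical) driver for the simplified variant
 * above, to make the q[] encoding concrete: each coefficient is a
 * (column index difference, bucket number) byte pair. The names below
 * (example_dispatch, buf0..buf2) are for illustration only. */
static void example_dispatch(void)
{
    ulong buf0[4], buf1[4], buf2[4];
    ulong * sb[3] = { buf0, buf1, buf2 };
    const ulong z[8] = { 0, 10, 20, 30, 40, 50, 60, 70 };
    /* pairs: (+1 -> bucket 2), (+3 -> bucket 0), (+0 -> bucket 1) */
    const uint8_t q[6] = { 1, 2, 3, 0, 0, 1 };
    matmul_sub_large_fbi(sb, z, q, 3);
    /* now buf2[0] == z[1] == 10, buf0[0] == z[4] == 40,
     * and buf1[0] == z[4] == 40 */
}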
#endif
        .text
        .p2align 4,,15
.globl matmul_sub_large_fbi
        .type   matmul_sub_large_fbi, @function
matmul_sub_large_fbi:
        pushq %rbp
        pushq %rbx
        pushq %r15
        pushq %r14
        pushq %r13
        pushq %r12
        // the first argument (the abobj pointer, in rdi) is unused and
        // overwritten right away.
        // sb pointer --> store in rdi
        movq    %rsi, %rdi
        // z pointer --> store in rsi
        movq    %rdx, %rsi
        // matrix data (q) pointer --> rbp
        movq    %rcx, %rbp

        // coefficient count n --> r9
        movq    %r8, %r9
        test    %r9,%r9

        // at .Lfinished, rbp is shipped as a return value.
        je      .Lfinished

        // we want our reads aligned, we want to unroll our main loop,
        // and we want to leave at least some space in the tail because
        // the tight loop does some readahead.

#define ALIGN_MASK      15

#define one_simple_move(reg2,reg3,reg4)   	\
        movzbl  (%rbp),  % ## reg2 ## d	        	        ; \
        movq    (%rsi,% ## reg2,8), % ## reg3			; \
        leaq    (%rsi,% ## reg2,8), %rsi			; \
        movzbl  1(%rbp),  % ## reg2 ## d		        ; \
        movq    (%rdi,% ## reg2,8), % ## reg4			; \
        movq    % ## reg3, (% ## reg4)		        	; \
        addq    $8, (%rdi,% ## reg2,8)                          ; \
        addq    $2, %rbp
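
        // one_simple_move handles a single (index difference, bucket)
        // byte pair read at rbp: advance the source pointer rsi by the
        // difference (counted in 8-byte words), store the word it then
        // points to into the bucket selected by the second byte, and
        // bump that bucket's pointer in the sb[] table by 8.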

.Lunaligned_loop:
        movq    %rbp, %rax
        andq    $ALIGN_MASK, %rax
        je .Laligned
        one_simple_move(r8, r10, r11)
        subq    $1, %r9
        je      .Lfinished
        jmp .Lunaligned_loop

.Laligned:
        // now rbp is aligned.

        // four bytes of readahead seem to provide timings that are a
        // lot more stable than with just two.

#define READAHEAD_COEFFS        16
        // we're reading READAHEAD_COEFFS coeffs ahead of time, so the
        // tail must contain at least that many coefficients.

        leaq    (%rbp,%r9,2),%r15

        cmpq $READAHEAD_COEFFS, %r9

        // the tail loop is just simple and stupid. rbp points to a
        // location where exactly r9 coeffs are to be read. Note that
        // some coefficients are already in registers at this point. But
        // this does not matter much (we could avoid re-reading, but
        // really we don't care).
        jb .Ltail_loop

        // we're going to advance rbp one loop iteration ahead. This
        // eases the handling of the tail.

#define LOOP_LENGTH     16

        subq $READAHEAD_COEFFS, %r9
        // r15+2*READAHEAD_COEFFS : end data pointer.
        leaq    (%rbp,%r9,2),%r15

// do the readahead. rbp is 16-byte aligned here, so movdqa is safe.

        // movq 0(%rbp), %rax
        movdqa (%rbp), %xmm0
        // movq 8(%rbp), %rbx
        // movq 16(%rbp), %rcx
        // movq 24(%rbp), %rdx

        addq    $LOOP_LENGTH, %rbp

        cmpq    %rbp, %r15
        jb      .Lfixup_before_tail

        movq    %rsp, %rcx
        subq    $16, %rsp
        andq    $-16, %rsp
        movq    %rcx, -8(%rsp)
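        // the stack is now 16-byte aligned for the movdqa spill in the
        // main loop; the previous rsp is stashed in the red zone at
        // -8(%rsp) and restored when the loop exits.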

.Lmain_loop:

// reg1 : one of a..d, containing 4 coeffs, each a two-byte pair
// (col index difference, bucket number).
// reg2 : scratch register for the index difference and (afterwards) the
// bucket number
// reg3 : scratch register for the coefficient block in the source vector
// reg4 : scratch register for the address in the bucket
#define use_one_coefficient(reg1,reg2,reg3,reg4)   	        \
        movzbl  % ## reg1 ## l, % ## reg2 ## d		        ; \
        movq    (%rsi,% ## reg2,8), % ## reg3			; \
        leaq    (%rsi,% ## reg2,8), %rsi			; \
        shrq    $8, %r ## reg1 ## x		               	; \
        movzbl  % ## reg1 ## l, % ## reg2 ## d		        ; \
        movq    (%rdi,% ## reg2,8), % ## reg4			; \
        movq    % ## reg3, (% ## reg4)		        	; \
        addq    $8, % ## reg4                                   ; \
        movq    % ## reg4, (%rdi,% ## reg2,8)
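
        // in C terms, one expansion of use_one_coefficient behaves
        // roughly like:
        //      z += reg1 & 0xff; reg1 >>= 8;
        //      *sb[reg1 & 0xff]++ = *z;
        // the caller performs the shift to the next byte pair.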

#define four_moves_and_reload(reg1,reg2,reg3,reg4,offset)       \
        use_one_coefficient(reg1,reg2,reg3,reg4)                ; \
        shrq    $8, %r ## reg1 ## x                             ; \
        use_one_coefficient(reg1,reg2,reg3,reg4)                ; \
        shrq    $8, %r ## reg1 ## x                             ; \
        use_one_coefficient(reg1,reg2,reg3,reg4)                ; \
        shrl    $8, %e ## reg1 ## x                             ; \
        use_one_coefficient(reg1,reg2,reg3,reg4)                ; \
        movq    offset (%rbp), %r ## reg1 ## x
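
        // four_moves_and_reload consumes the four byte pairs packed in
        // reg1, then reloads reg1 with 8 more bytes of q at the given
        // offset from rbp (which is kept one iteration ahead). The last
        // shift can be 32-bit since the remaining payload fits in 32
        // bits by then.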

        // on core2 U9400, using vstrip 6 of c135b, 0x200 and 0x400 are
        // good choices.
        // prefetchnta     0x200(%rsi)
        // four_moves_and_reload(a,r8,r9,r10,)
        // four_moves_and_reload(b,r11,r12,r13,8)
        // four_moves_and_reload(c,r8,r9,r10,16)
        // four_moves_and_reload(d,r11,r12,r13,24)

//////////////////////////

        // prefetcht1     0x40(%rsi)
        // prefetcht1     0x80(%rsi)
        // prefetcht1     0xc0(%rsi)
        // prefetcht1     0x100(%rsi)

        movdqa  %xmm0, (%rsp)
        movq    (%rsp), %rdx
        movq    8(%rsp), %rax
        movdqa  0(%rbp), %xmm0
        // movq    %rax, %rdx
        // movq    0(%rbp), %rax
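        // the 16 readahead bytes (8 coefficient pairs) are spilled to
        // the aligned scratch slot and split into rdx (low half) and
        // rax (high half), while xmm0 is refilled with the next
        // LOOP_LENGTH bytes for the following iteration.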

#define read_two_bytes_and_load(dreg,r8,r9d,mm0)              \
        movzbl  % ## dreg ## l, % ## r8 ## d       	; \
        leaq    (%rsi,% ## r8,8), %rsi  		; \
        movq    (%rsi), % ## mm0        		; \
        shrq    $8,%r ## dreg ## x                 	; \
        movzbl  % ## dreg ## l, % ## r9d       		; \
        shrq    $8,%r ## dreg ## x

#define read_two_bytes_and_load2(dreg,bx,r9d,mm0)              \
        movzbl  % ## dreg ## l, %e ## bx       		; \
        leaq    (%rsi,%r ## bx,8), %rsi  		; \
        movq    (%rsi), % ## mm0        		; \
        shrq    $8,%r ## dreg ## x                 	; \
        movzbl  % ## dreg ## l, % ## r9d       		; \
        shrq    $8,%r ## dreg ## x
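
        // read_two_bytes_and_load2 is the same operation; only the
        // token pasting differs, producing the legacy register names
        // %ebx/%rbx instead of %r8d/%r8.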

        read_two_bytes_and_load(d,r8,r8d,mm0)
        read_two_bytes_and_load(d,r10,r10d,mm1)
        read_two_bytes_and_load(d,r12,r12d,mm2)
        read_two_bytes_and_load(d,r14,r14d,mm3)

        read_two_bytes_and_load(a,r9,r9d,mm4)
        read_two_bytes_and_load(a,r11,r11d,mm5)
        read_two_bytes_and_load(a,r13,r13d,mm6)
        read_two_bytes_and_load2(a,bx,ebx,mm7)

        // now do stores

#define do_one_store(r9,mm0)                    \
        movq    (%rdi,% ## r9,8), %rcx          ; \
        movq    % ## mm0, (%rcx)                ; \
        addq    $8, %rcx                        ; \
        movq    %rcx, (%rdi,% ## r9,8)
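
        // do_one_store flushes one queued word: load the bucket pointer
        // from the sb[] table, store the 8-byte word held in the MMX
        // register there, and write the pointer back incremented by 8
        // (rcx is free as scratch at this point).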

        do_one_store(r8,mm0)
        do_one_store(r10,mm1)
        do_one_store(r12,mm2)
        do_one_store(r14,mm3)

        do_one_store(r9,mm4)
        do_one_store(r11,mm5)
        do_one_store(r13,mm6)
        do_one_store(rbx,mm7)


//////////////////////////

        // note that we need 2*READAHEAD_COEFFS bytes of readahead at the
        // end of the array.

        addq    $LOOP_LENGTH, %rbp
        cmpq    %rbp, %r15
        ja      .Lmain_loop

        emms

        movq    -8(%rsp), %rsp

.Lfixup_before_tail:
        // restore r15 to its real value (READAHEAD_COEFFS was
        // subtracted from the count before entering the main loop)
        movq    $READAHEAD_COEFFS, %r9
        leaq    (%r15,%r9,2), %r15
        // also undo the one-iteration advance of rbp
        subq    $LOOP_LENGTH, %rbp

.Ltail_loop:
        cmpq    %rbp, %r15
        je      .Lfinished
        one_simple_move(r8, r10, r11)
        jmp .Ltail_loop

.Lfinished:
        movq    %rbp, %rax

        popq %r12
        popq %r13
        popq %r14
        popq %r15
        popq %rbx
        popq %rbp
        ret
        .size   matmul_sub_large_fbi, .-matmul_sub_large_fbi