#if 0
/* Reference C implementations, kept for documentation only: the
 * assembly below is what actually gets assembled. */
static inline void matmul_sub_large_fbi(abobj_ptr x, abt ** sb, const abt * z, const uint8_t * q, unsigned int n)
{
    /* Dispatch data found in z[0]...z[f(n-1)] such that z[f(i)] is in
     * array pointed to by sb[q[2*i+1]]. The function f(i) is given by
     * the sum q[0]+q[2]+...+q[2*(i-1)]. Exactly 2n coefficients are
     * expected in q[] All the sb[] pointers are increased */
    for(unsigned int c = 0 ; c < n ; c++) {
        z += aboffset(x, *q);
        q++;
        abcopy(x, sb[*q], z, 1);
        sb[*q]+= aboffset(x, 1);
        q++;
    }
}

/* Same dispatch loop, specialized to one-word coefficients. */
static inline void matmul_sub_large_fbi(ulong ** sb, const ulong * z, const uint8_t * q, unsigned int n)
{   /* fixed: this opening brace was missing in the reference text */
    for(unsigned int c = 0 ; c < n ; c++) {
        z += *q++;
        *sb[*q++]++ = *z;
    }
}
#endif

// x86-64 SysV entry registers:
//   %rdi = x (not used by the asm), %rsi = sb, %rdx = z,
//   %rcx = q (stream of (offset, bucket) byte pairs), %r8 = n.
// Returns in %rax the advanced q pointer (q + 2*n) — see .Lfinished.
	.text
	.p2align 4,,15
.globl matmul_sub_large_fbi
	.type	matmul_sub_large_fbi, @function
matmul_sub_large_fbi:
	// save every callee-saved register we are going to clobber
	pushq %rbp
	pushq %rbx
	pushq %r15
	pushq %r14
	pushq %r13
	pushq %r12
	// sb pointer --> store in rdi
	movq	%rsi, %rdi
	// z pointer --> store in rsi
	movq	%rdx, %rsi
	// matrix pointer --> rbp
	movq	%rcx, %rbp

	// r9 = coefficient count; n == 0 returns immediately
	movq	%r8, %r9
	test	%r9,%r9

	// at .Lfinished, rbp is shipped as a return value.
	je .Lfinished

	// we want our reads aligned, we want to unroll our main loop,
	// and leave at least some space in the tail because the tight
	// loop does some readahead.

#define ALIGN_MASK	15

// Process one (offset, bucket) byte pair at (%rbp):
//   reg2 - scratch for the two bytes; reg3 - the 64-bit coefficient
//   read from the source vector; reg4 - the bucket's write pointer.
// Advances rsi by 8*offset, appends the coefficient to sb[bucket],
// bumps that bucket pointer in memory, and steps rbp past the pair.
#define one_simple_move(reg2,reg3,reg4)				        \
	movzbl	(%rbp), % ## reg2 ## d					; \
	movq	(%rsi,% ## reg2,8), % ## reg3				; \
	leaq	(%rsi,% ## reg2,8), %rsi				; \
	movzbl	1(%rbp), % ## reg2 ## d				        ; \
	movq	(%rdi,% ## reg2,8), % ## reg4				; \
	movq	% ## reg3, (% ## reg4)					; \
	addq	$8, (%rdi,% ## reg2,8)					; \
	addq	$2, %rbp

	// consume pairs one at a time until rbp reaches a 16-byte
	// boundary (required for the movdqa loads in the main loop)
.Lunaligned_loop:
	movq	%rbp, %rax
	andq	$ALIGN_MASK, %rax
	je .Laligned
	one_simple_move(r8, r10, r11)
	subq	$1, %r9
	je .Lfinished
	jmp .Lunaligned_loop

.Laligned:
	// now rbp is aligned.

	// four bytes of readahead seem to provide timings a lot more
	// stable than with just two.

#define READAHEAD_COEFFS	16
	// we're reading READAHEAD_COEFFS coeffs ahead of time, so the
	// tail must contain at least that many coefficients.

	// r15 = end of the coefficient stream (rbp + 2*r9)
	leaq	(%rbp,%r9,2),%r15

	cmpq	$READAHEAD_COEFFS, %r9

	// the tail loop is just simple and stupid. rbp points to a
	// location where exactly r9 coeffs are to be read. Note that
	// some coefficients are already in registers at this point. But
	// this does not matter much (we could avoid re-reading, but
	// really we don't care).
	jb .Ltail_loop

	// we're going to advance rbp one loop iteration ahead. This
	// eases the handling of the tail.

// bytes of q[] consumed per main-loop iteration (8 coefficient pairs)
#define LOOP_LENGTH	16

	subq	$READAHEAD_COEFFS, %r9
	// r15+2*READAHEAD_COEFFS : end data pointer.
	leaq	(%rbp,%r9,2),%r15

// do the readahead.

	// movq	0(%rbp), %rax
	movdqa	(%rbp), %xmm0
	// movq	8(%rbp), %rbx
	// movq	16(%rbp), %rcx
	// movq	24(%rbp), %rdx

	addq	$LOOP_LENGTH, %rbp

	cmpq	%rbp, %r15
	jb .Lfixup_before_tail

	// carve out a 16-byte-aligned scratch slot for spilling xmm0.
	// The previous rsp is stashed at -8(%rsp), i.e. in the red
	// zone just below the slot; it is reloaded after the main loop.
	movq	%rsp, %rcx
	subq	$16, %rsp
	andq	$-16, %rsp
	movq	%rcx, -8(%rsp)

.Lmain_loop:

// reg1 : one of a..d, containing 4 coeffs in two byte pairs (col index
// difference, bucket number).
// reg2 scratch buffer for storing index difference and (afterwards)
// bucket number
// reg3 scratch buffer for the coefficient block in the source vector
// reg4 scratch buffer for the address in the bucket
#define use_one_coefficient(reg1,reg2,reg3,reg4)		\
	movzbl	% ## reg1 ## l, % ## reg2 ## d			; \
	movq	(%rsi,% ## reg2,8), % ## reg3			; \
	leaq	(%rsi,% ## reg2,8), %rsi			; \
	shrq	$8, %r ## reg1 ## x				; \
	movzbl	% ## reg1 ## l, % ## reg2 ## d			; \
	movq	(%rdi,% ## reg2,8), % ## reg4			; \
	movq	% ## reg3, (% ## reg4)				; \
	addq	$8, % ## reg4					; \
	movq	% ## reg4, (%rdi,% ## reg2,8)

// consume the four pairs held in reg1 and reload reg1 with the next 8
// bytes at offset(%rbp). Kept for reference; the live loop below uses
// the MMX-based variant instead.
#define four_moves_and_reload(reg1,reg2,reg3,reg4,offset)	\
	use_one_coefficient(reg1,reg2,reg3,reg4)		; \
	shrq	$8, %r ## reg1 ## x				; \
	use_one_coefficient(reg1,reg2,reg3,reg4)		; \
	shrq	$8, %r ## reg1 ## x				; \
	use_one_coefficient(reg1,reg2,reg3,reg4)		; \
	shrl	$8, %e ## reg1 ## x				; \
	use_one_coefficient(reg1,reg2,reg3,reg4)		; \
	movq	offset (%rbp), %r ## reg1 ## x

	// on core2 U9400, using vstrip 6 of c135b, 0x200 and 0x400 are
	// good choices.
	// prefetchnta	0x200(%rsi)
	// four_moves_and_reload(a,r8,r9,r10,)
	// four_moves_and_reload(b,r11,r12,r13,8)
	// four_moves_and_reload(c,r8,r9,r10,16)
	// four_moves_and_reload(d,r11,r12,r13,24)

//////////////////////////

	// prefetcht1	0x40(%rsi)
	// prefetcht1	0x80(%rsi)
	// prefetcht1	0xc0(%rsi)
	// prefetcht1	0x100(%rsi)

	// spill the pre-loaded 16 bytes of q[] to the aligned scratch
	// slot, pick them up as rdx (pairs 0-3) and rax (pairs 4-7),
	// and immediately start the readahead for the next iteration.
	movdqa	%xmm0, (%rsp)
	movq	(%rsp), %rdx
	movq	8(%rsp), %rax
	movdqa	0(%rbp), %xmm0
	// movq	%rax, %rdx
	// movq	0(%rbp), %rax

// Consume one (offset, bucket) pair from the low byte of dreg:
// advance rsi by 8*offset, fetch the coefficient into MMX register
// mm0, and keep the bucket number in r9d for the later store.
#define read_two_bytes_and_load(dreg,r8,r9d,mm0)	\
	movzbl	% ## dreg ## l, % ## r8 ## d	; \
	leaq	(%rsi,% ## r8,8), %rsi		; \
	movq	(%rsi), % ## mm0		; \
	shrq	$8,%r ## dreg ## x		; \
	movzbl	% ## dreg ## l, % ## r9d	; \
	shrq	$8,%r ## dreg ## x

// same, but for %rbx whose sub-register names (bl/ebx/rbx) do not
// follow the %rN / %rNd numbering pattern used above.
#define read_two_bytes_and_load2(dreg,bx,r9d,mm0)	\
	movzbl	% ## dreg ## l, %e ## bx	; \
	leaq	(%rsi,%r ## bx,8), %rsi		; \
	movq	(%rsi), % ## mm0		; \
	shrq	$8,%r ## dreg ## x		; \
	movzbl	% ## dreg ## l, % ## r9d	; \
	shrq	$8,%r ## dreg ## x

	// all eight loads first: bucket indices land in
	// r8,r10,r12,r14 (from rdx) and r9,r11,r13,rbx (from rax),
	// coefficients in mm0..mm7
	read_two_bytes_and_load(d,r8,r8d,mm0)
	read_two_bytes_and_load(d,r10,r10d,mm1)
	read_two_bytes_and_load(d,r12,r12d,mm2)
	read_two_bytes_and_load(d,r14,r14d,mm3)

	read_two_bytes_and_load(a,r9,r9d,mm4)
	read_two_bytes_and_load(a,r11,r11d,mm5)
	read_two_bytes_and_load(a,r13,r13d,mm6)
	read_two_bytes_and_load2(a,bx,ebx,mm7)

	// now do stores

// Append the coefficient in mm0 to the bucket indexed by r9 and bump
// the bucket's write pointer. The pointer is re-read from memory each
// time, so repeated bucket numbers within one iteration stay correct.
#define do_one_store(r9,mm0)	\
	movq	(%rdi,% ## r9,8), %rcx		; \
	movq	% ## mm0, (%rcx)		; \
	addq	$8, %rcx			; \
	movq	%rcx, (%rdi,% ## r9,8)

	do_one_store(r8,mm0)
	do_one_store(r10,mm1)
	do_one_store(r12,mm2)
	do_one_store(r14,mm3)

	do_one_store(r9,mm4)
	do_one_store(r11,mm5)
	do_one_store(r13,mm6)
	do_one_store(rbx,mm7)


//////////////////////////

	// note that we need 2*READAHEAD_COEFFS bytes of readahead at the
	// end of the array.

	addq	$LOOP_LENGTH, %rbp
	cmpq	%rbp, %r15
	ja .Lmain_loop

	// leave MMX state so the x87 FPU is usable again
	emms

	// undo the stack alignment done before .Lmain_loop
	movq	-8(%rsp), %rsp

.Lfixup_before_tail:
	// fix r15 for real value
	movq	$READAHEAD_COEFFS, %r9
	leaq	(%r15,%r9,2), %r15
	// also fix rbp
	subq	$LOOP_LENGTH, %rbp

	// drain the remaining pairs one at a time; rbp advances by 2
	// per pair, so it meets r15 exactly
.Ltail_loop:
	cmpq	%rbp, %r15
	je .Lfinished
	one_simple_move(r8, r10, r11)
	jmp .Ltail_loop

.Lfinished:
	// return the advanced q pointer
	movq	%rbp, %rax

	popq %r12
	popq %r13
	popq %r14
	popq %r15
	popq %rbx
	popq %rbp
	ret
	.size	matmul_sub_large_fbi, .-matmul_sub_large_fbi