// matmul_sub_large_fbd -- hand-written x86-64 (SysV ABI) equivalent of the C
// reference kept under "#if 0" below: scatter the n source coefficients
// z[0..n-1] into the bucket arrays sb[q[0]], sb[q[1]], ..., advancing each
// bucket's write pointer as it is filled.  In this specialization one
// coefficient is one 64-bit word (every copy is a single movq, and both the
// z pointer and the sb[] slots advance by 8), and the abobj_ptr argument is
// unused (its register is overwritten immediately).
// Returns (in %rax) the final value of the q pointer, i.e. q + n.
#if 0
static void
matmul_sub_large_fbd(abobj_ptr x, abt ** sb, const abt * z, const uint8_t * q, unsigned int n)
{
    /* Dispatch data found in z[0]...z[n] such that z[i] is in array
     * pointed to by sb[q[i]]. Exactly n coefficients are expected. All
     * the sb[] pointers are increased */
    for(unsigned int c = 0 ; c < n ; c++) {
        abcopy(x, sb[q[c]], z, 1);
        sb[q[c]]+= aboffset(x, 1);
        z += aboffset(x, 1);
    }
}
#endif
	.text
	.p2align 4,,15
.globl matmul_sub_large_fbd
	.type	matmul_sub_large_fbd, @function
matmul_sub_large_fbd:
	// Save the callee-saved registers we clobber (SysV ABI).
	pushq	%rbp
	pushq	%rbx
	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	// Incoming arguments: rdi = x (unused), rsi = sb, rdx = z,
	// rcx = q, r8 = n.  Reshuffle them into the registers the loops use.
	// sb pointer --> store in rdi
	movq	%rsi, %rdi
	// z pointer --> store in rsi
	movq	%rdx, %rsi
	// matrix pointer (q, the per-coefficient bucket indices) --> rbp
	movq	%rcx, %rbp

	// coefficient count n --> r9; bail out straight to the epilogue
	// when n == 0.
	movq	%r8, %r9
	test	%r9, %r9

	// at .Lfinished, rbp is shipped as a return value.
	je	.Lfinished

	// we want our reads aligned, we want to unroll our main loop,
	// and leave at least some space in the tail because the tight
	// loop does some readahead.

#define ALIGN_MASK	7

// Move exactly one coefficient:
//   reg2 : scratch for the bucket number (one byte of *q, zero-extended)
//   reg3 : scratch for the 64-bit coefficient loaded from *z
//   reg4 : scratch for the bucket's current write address sb[bucket]
// Stores the coefficient at *sb[bucket], then advances the bucket slot,
// the q pointer (rbp, by 1 byte) and the z pointer (rsi, by 8 bytes).
#define one_simple_move(reg2,reg3,reg4)		\
	movq	(%rsi), % ## reg3	;	\
	movzbl	(%rbp), % ## reg2 ## d	;	\
	movq	(%rdi,% ## reg2,8), % ## reg4	;	\
	movq	% ## reg3, (% ## reg4)	;	\
	addq	$8, (%rdi,% ## reg2,8)	;	\
	addq	$1, %rbp		;	\
	addq	$8, %rsi

.Lunaligned_loop:
	// Peel coefficients one at a time until the q pointer (rbp) is
	// 8-byte aligned, so the main loop can fetch bucket indices with
	// aligned 64-bit loads.
	movq	%rbp, %rax
	andq	$ALIGN_MASK, %rax
	je	.Laligned
	one_simple_move(r8, r10, r11)
	subq	$1, %r9
	je	.Lfinished
	jmp	.Lunaligned_loop

.Laligned:
	// now rbp is aligned.

	// four bytes of readahead seem to provide timings a lot more
	// stable than with just two.
	// NOTE(review): the comment above appears to predate the current
	// setting -- the main loop below actually keeps READAHEAD_COEFFS
	// (32) index bytes in flight; confirm against version history.

#define READAHEAD_COEFFS	32
	// we're reading READAHEAD_COEFFS coeffs ahead of time, so the
	// tail must contain at least that many coefficients.

	// r15 = q + remaining count = one past the last index byte.
	leaq	(%rbp,%r9,1),%r15

	cmpq	$READAHEAD_COEFFS, %r9

	// the tail loop is just simple and stupid. rbp points to a
	// location where exactly r9 coeffs are to be read. Note that
	// some coefficients are already in registers at this point. But
	// this does not matter much (we could avoid re-reading, but
	// really we don't care).
	jb	.Ltail_loop

	// we're going to advance rbp one loop iteration ahead. This
	// eases the handling of the tail.

#define LOOP_LENGTH	32

	// Reserve the last READAHEAD_COEFFS coefficients for the tail:
	// r15 becomes the main loop's end pointer (real end minus the
	// readahead window).  After this, r9 is dead and its register is
	// reused as scratch inside the main loop.
	subq	$READAHEAD_COEFFS, %r9
	// r15+1*READAHEAD_COEFFS : end data pointer.
	leaq	(%rbp,%r9,1),%r15

	// Preload the first 32 bucket indices (4 x 8 bytes) into
	// rax/rbx/rcx/rdx; the main loop consumes them a byte at a time
	// and refills each register from one iteration ahead.
	movq	0(%rbp), %rax
	movq	8(%rbp), %rbx
	movq	16(%rbp), %rcx
	movq	24(%rbp), %rdx

	addq	$LOOP_LENGTH, %rbp

	cmpq	%rbp, %r15
	jb	.Lfixup_before_tail

.Lmain_loop:

// reg1 : one of a..d, containing 8 coeffs, one per byte (only the bucket
// number).
// reg2 scratch buffer for storing bucket number
// reg3 scratch buffer for the coefficient block in the source vector
// reg4 scratch buffer for the address in the bucket
// Same store-and-advance sequence as one_simple_move, except the bucket
// number comes from the low byte of reg1 and the coefficient from a fixed
// offset off the z pointer (rsi is only advanced once per 8 coefficients).
#define use_one_coefficient(reg1,reg2,reg3,reg4,soffset)	\
	movq	soffset(%rsi), % ## reg3	;	\
	movzbl	% ## reg1 ## l, % ## reg2 ## d	;	\
	movq	(%rdi,% ## reg2,8), % ## reg4	;	\
	movq	% ## reg3, (% ## reg4)	;	\
	addq	$8, % ## reg4		;	\
	movq	% ## reg4, (%rdi,% ## reg2,8)

// Consume the 8 bucket indices packed in reg1, shifting the next byte into
// the low position after each use (64-bit shifts while the high half still
// holds data, cheaper 32-bit shifts once it doesn't), then reload reg1 with
// 8 fresh indices from offset(%rbp) -- which, rbp being a full LOOP_LENGTH
// ahead, belong to the NEXT iteration (this is the readahead).
#define eight_moves_and_reload(reg1,reg2,reg3,reg4,offset,soffset)	\
	use_one_coefficient(reg1,reg2,reg3,reg4, soffset + 0)	;	\
	shrq	$8, %r ## reg1 ## x	;	\
	use_one_coefficient(reg1,reg2,reg3,reg4, soffset + 8)	;	\
	shrq	$8, %r ## reg1 ## x	;	\
	use_one_coefficient(reg1,reg2,reg3,reg4, soffset + 16)	;	\
	shrq	$8, %r ## reg1 ## x	;	\
	use_one_coefficient(reg1,reg2,reg3,reg4, soffset + 24)	;	\
	shrq	$8, %r ## reg1 ## x	;	\
	use_one_coefficient(reg1,reg2,reg3,reg4, soffset + 32)	;	\
	shrl	$8, %e ## reg1 ## x	;	\
	use_one_coefficient(reg1,reg2,reg3,reg4, soffset + 40)	;	\
	shrl	$8, %e ## reg1 ## x	;	\
	use_one_coefficient(reg1,reg2,reg3,reg4, soffset + 48)	;	\
	shrl	$8, %e ## reg1 ## x	;	\
	use_one_coefficient(reg1,reg2,reg3,reg4, soffset + 56)	;	\
	movq	offset (%rbp), %r ## reg1 ## x

	// One main-loop iteration moves 32 coefficients (4 x 8), i.e.
	// 32 index bytes from q and 256 bytes from z.
	prefetchnta	0x100(%rsi)
	eight_moves_and_reload(a,r8,r9,r10,,0)
	addq	$0x40, %rsi
	eight_moves_and_reload(b,r11,r12,r13,8,0)
	addq	$0x40, %rsi
	eight_moves_and_reload(c,r8,r9,r10,16,0)
	addq	$0x40, %rsi
	eight_moves_and_reload(d,r11,r12,r13,24,0)
	addq	$0x40, %rsi

	// note that we need 2*READAHEAD_COEFFS bytes of readahead at the
	// end of the array.

	addq	$LOOP_LENGTH, %rbp
	cmpq	%rbp, %r15
	ja	.Lmain_loop

.Lfixup_before_tail:
	// fix r15 for real value (undo the READAHEAD_COEFFS reservation)
	movq	$READAHEAD_COEFFS, %r9
	leaq	(%r15,%r9,1), %r15
	// also fix rbp (it was advanced one LOOP_LENGTH ahead of the data
	// actually consumed)
	subq	$LOOP_LENGTH, %rbp

.Ltail_loop:
	// Drain the remaining coefficients one at a time until the q
	// pointer reaches the true end (r15).
	cmpq	%rbp, %r15
	je	.Lfinished
	one_simple_move(r8, r10, r11)
	jmp	.Ltail_loop

.Lfinished:
	// Return the final q pointer.
	movq	%rbp, %rax

	popq	%r12
	popq	%r13
	popq	%r14
	popq	%r15
	popq	%rbx
	popq	%rbp
	ret
	.size	matmul_sub_large_fbd, .-matmul_sub_large_fbd