1#if 0 2static inline void matmul_sub_vsc_combine(abobj_ptr x, abt * dst, const abt * * mptrs, const uint8_t * q, unsigned int count) 3{ 4 for( ; count-- ; ) { 5 uint8_t c = *q++; 6 abadd(x, dst, mptrs[c]); 7 mptrs[c] += aboffset(x, c != 0); 8 dst += aboffset(x, c == 0); 9 } 10} 11#endif 12 .text 13 .p2align 4,,15 14.globl matmul_sub_vsc_combine 15 .type matmul_sub_vsc_combine, @function 16matmul_sub_vsc_combine: 17 pushq %rbp 18 pushq %rbx 19 pushq %r15 20 pushq %r14 21 pushq %r13 22 pushq %r12 23 // rdi ignored (abobj arg) 24 movq %rsi, %rdi 25 movq %rdx, %rsi 26 movq %rcx, %rbp 27 movl %r8d, %r9d 28 // r9 is ignored for now. 29 testq %r9, %r9 30 je .Lfinished 31 .p2align 4,,10 32 .p2align 3 33 34#define one_simple_loop \ 35 movzbl (%rbp), %eax ; \ 36 leaq (%rsi,%rax,8), %rdx ; \ 37 movq (%rdx), %rcx ; \ 38 movq (%rdi), %rbx ; \ 39 xorq (%rcx), %rbx ; \ 40 movq %rbx, (%rdi) ; \ 41 xorq %rbx, %rbx ; \ 42 testb %al, %al ; \ 43 setne %bl ; \ 44 leaq (%rcx, %rbx, 8), %rcx ; \ 45 movq %rcx, (%rdx) ; \ 46 xorb $1, %bl ; \ 47 leaq (%rdi, %rbx, 8), %rdi 48 49#define ALIGN_MASK 7 50 51.Lunaligned: 52 movq %rbp, %rax 53 andq $ALIGN_MASK, %rax 54 je .Laligned 55 one_simple_loop 56 addq $1, %rbp 57 subq $1, %r9 58 je .Lfinished 59 jmp .Lunaligned 60 61.Laligned: 62 leaq (%rbp,%r9), %r15 63 cmpq $8, %r9 64 jb .Ltail 65 subq $8, %r9 66 leaq (%rbp,%r9), %r15 67 68 movq (%rbp), %rax 69 addq $8, %rbp 70 cmpq %rbp, %r15 71 jb .Lfixup 72 73.Lmain: 74 // rax contains eight bytes ahead. 75 76#define inner_ops \ 77 leaq (%rsi,%rcx,8), %rdx ; \ 78 movq (%rdx), %r8 ; \ 79 movq (%rdi), %r9 ; \ 80 xorq (%r8), %r9 ; \ 81 movq %r9, (%rdi) ; \ 82 xorq %rbx, %rbx ; \ 83 testb %cl, %cl ; \ 84 setne %bl ; \ 85 leaq (%r8, %rbx, 8), %r8 ; \ 86 movq %r8, (%rdx) ; \ 87 xorb $1, %bl ; \ 88 leaq (%rdi, %rbx, 8), %rdi 89 90 // do this 8 times. 91 movzbq %al, %rcx 92 shrq $8, %rax 93 inner_ops 94 movzbq %al, %rcx 95 shrq $8, %rax 96 inner_ops 97 movzbq %al, %rcx 98 shrq $8, %rax 99 inner_ops 100 movzbq %al, %rcx 101 shrq $8, %rax 102 inner_ops 103 104 movzbq %al, %rcx 105 shrq $8, %rax 106 inner_ops 107 movzbq %al, %rcx 108 shrq $8, %rax 109 inner_ops 110 111 movzbq %al, %rcx 112 shrq $8, %rax 113 inner_ops 114 movzbq %al, %rcx 115 inner_ops 116 117 movq (%rbp), %rax 118 119 addq $8, %rbp 120 cmpq %rbp, %r15 121 ja .Lmain 122.Lfixup: 123 movq $8, %r9 124 leaq (%r15,%r9,1), %r15 125 subq $8, %rbp 126 127.Ltail: 128 one_simple_loop 129 addq $1, %rbp 130 cmpq %rbp, %r15 131 jne .Ltail 132 133.Lfinished: 134 popq %r12 135 popq %r13 136 popq %r14 137 popq %r15 138 popq %rbx 139 popq %rbp 140 ret 141 .size matmul_sub_vsc_combine, .-matmul_sub_vsc_combine 142 143