#if 0
static inline void matmul_sub_vsc_combine(abobj_ptr x, abt * dst, const abt * * mptrs, const uint8_t * q, unsigned int count)
{
    for( ; count-- ; ) {
        uint8_t c = *q++;
        abadd(x, dst, mptrs[c]);
        mptrs[c] += aboffset(x, c != 0);
        dst += aboffset(x, c == 0);
    }
}
#endif
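// The assembly below hand-codes the reference routine above.  The xorq and
// the 8-byte strides assume that an abt element is a single 64-bit word and
// that abadd() amounts to a plain XOR.
// SysV AMD64 argument registers on entry:
//   rdi = x (unused), rsi = dst, rdx = mptrs, rcx = q, r8d = count.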
	.text
	.p2align 4,,15
.globl matmul_sub_vsc_combine
	.type	matmul_sub_vsc_combine, @function
matmul_sub_vsc_combine:
        pushq %rbp
        pushq %rbx
        pushq %r15
        pushq %r14
        pushq %r13
        pushq %r12
        // rdi ignored (abobj arg)
	movq	%rsi, %rdi	// rdi = dst
	movq	%rdx, %rsi	// rsi = mptrs
	movq	%rcx, %rbp	// rbp = q
	movl	%r8d, %r9d	// r9 = count, zero-extended from r8d
        // the incoming value of r9 is unused (there is no sixth argument).
	testq	%r9, %r9
	je	.Lfinished	// nothing to do when count == 0
	.p2align 4,,10
	.p2align 3

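// one_simple_loop performs the work of one iteration of the C loop above on
// the coefficient byte at (%rbp):
//     c = *q;  *dst ^= *mptrs[c];  mptrs[c] += (c != 0);  dst += (c == 0);
// with rbp = q, rsi = mptrs, rdi = dst, and rax/rbx/rcx/rdx as scratch.
// Advancing rbp (the q++ part) is left to the callers.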
#define one_simple_loop							\
	movzbl	(%rbp), %eax						; \
	leaq	(%rsi,%rax,8), %rdx					; \
	movq	(%rdx), %rcx						; \
	movq	(%rdi), %rbx						; \
	xorq	(%rcx), %rbx						; \
	movq	%rbx, (%rdi)						; \
	xorq	%rbx, %rbx						; \
	testb	%al, %al						; \
	setne	%bl							; \
	leaq	(%rcx, %rbx, 8), %rcx					; \
	movq	%rcx, (%rdx)						; \
	xorb	$1, %bl							; \
	leaq	(%rdi, %rbx, 8), %rdi

#define ALIGN_MASK      7

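// Overall flow: .Lunaligned consumes bytes of q one at a time until rbp is
// 8-byte aligned, .Lmain then fetches eight bytes per movq and unrolls the
// per-byte work eight-fold, and .Ltail finishes whatever is left over.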
.Lunaligned:
        movq    %rbp, %rax
        andq    $ALIGN_MASK, %rax       // low three bits of q
        je .Laligned                    // q is 8-byte aligned
        one_simple_loop
        addq $1, %rbp
        subq $1, %r9
        je .Lfinished
        jmp .Lunaligned

.Laligned:
	leaq	(%rbp,%r9), %r15        // r15 = end of q
        cmpq    $8, %r9
        jb      .Ltail                  // fewer than eight bytes left
        subq    $8, %r9
	leaq	(%rbp,%r9), %r15        // r15 = end - 8: past this point an
                                        // eight-byte load could overrun q

        movq    (%rbp), %rax            // preload the first eight bytes
        addq    $8, %rbp
        cmpq    %rbp, %r15
        jb .Lfixup                      // under 16 bytes: leave it to the tail
.Lmain:
        // rax holds the next eight bytes of q, loaded one iteration ahead.

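// inner_ops does the same per-byte work as one_simple_loop, except that the
// coefficient byte has already been peeled off into cl from the eight-byte
// block held in rax; r8, r9 and rbx serve as scratch here (the count in r9
// is no longer needed once r15 has been set up).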
#define inner_ops							\
	leaq	(%rsi,%rcx,8), %rdx					; \
	movq	(%rdx), %r8						; \
	movq	(%rdi), %r9						; \
	xorq	(%r8), %r9						; \
	movq	%r9, (%rdi)						; \
	xorq	%rbx, %rbx						; \
	testb	%cl, %cl						; \
	setne	%bl							; \
	leaq	(%r8, %rbx, 8), %r8					; \
	movq	%r8, (%rdx)						; \
	xorb	$1, %bl							; \
	leaq	(%rdi, %rbx, 8), %rdi

        // Peel the eight bytes out of rax low byte first (memory order,
        // since x86 is little-endian), feeding each one to inner_ops.
        movzbq  %al, %rcx
        shrq    $8, %rax
        inner_ops
        movzbq  %al, %rcx
        shrq    $8, %rax
        inner_ops
        movzbq  %al, %rcx
        shrq    $8, %rax
        inner_ops
        movzbq  %al, %rcx
        shrq    $8, %rax
        inner_ops

        movzbq  %al, %rcx
        shrq    $8, %rax
        inner_ops
        movzbq  %al, %rcx
        shrq    $8, %rax
        inner_ops

        movzbq  %al, %rcx
        shrq    $8, %rax
        inner_ops
        movzbq  %al, %rcx               // last byte: nothing left to shift
        inner_ops

	movq    (%rbp), %rax            // preload the next eight bytes

        addq    $8, %rbp
        cmpq    %rbp, %r15
        ja      .Lmain
.Lfixup:
        // rbp has moved past bytes that were preloaded into rax but never
        // processed; step it back and restore r15 to the true end of q so
        // that the tail loop picks them up.
        movq    $8, %r9
        leaq    (%r15,%r9,1), %r15
        subq    $8, %rbp

.Ltail:
        // finish the remaining bytes, from rbp up to r15, one at a time.
        one_simple_loop
        addq    $1, %rbp
        cmpq    %rbp, %r15
        jne .Ltail

.Lfinished:
        popq %r12
        popq %r13
        popq %r14
        popq %r15
        popq %rbx
        popq %rbp
	ret
	.size	matmul_sub_vsc_combine, .-matmul_sub_vsc_combine