1// +build arm64,!gccgo,!appengine
2
3#include "textflag.h"
4
5
6// This implements union2by2 using golang's version of arm64 assembly
7// The algorithm is very similar to the generic one,
8// but makes better use of arm64 features so is notably faster.
9// The basic algorithm structure is as follows:
10// 1. If either set is empty, copy the other set into the buffer and return the length
11// 2. Otherwise, load the first element of each set into a variable (s1 and s2).
12// 3. a. Compare the values of s1 and s2.
13 // b. add the smaller one to the buffer.
14 // c. perform a bounds check before incrementing.
15 // If one set is finished, copy the rest of the other set over.
16 // d. update s1 and or s2 to the next value, continue loop.
17 //
18 // Past the fact of the algorithm, this code makes use of several arm64 features
19 // Condition Codes:
20 // arm64's CMP operation sets 4 bits that can be used for branching,
21 // rather than just true or false.
22 // As a consequence, a single comparison gives enough information to distinguish the three cases
23 //
24 // Post-increment pointers after load/store:
25 // Instructions like `MOVHU.P 2(R0), R6`
26 // increment the register by a specified amount, in this example 2.
27 // Because uint16's are exactly 2 bytes and the length of the slices
28 // is part of the slice header,
29 // there is no need to separately track the index into the slice.
30 // Instead, the code can calculate the final read value and compare against that,
31 // using the post-increment reads to move the pointers along.
32 //
33 // TODO: CALL out to memmove once the list is exhausted.
34 // Right now it moves the necessary shorts so that the remaining count
35 // is a multiple of 4 and then copies 64 bits at a time.
36
37TEXT ·union2by2(SB), NOSPLIT, $0-80
38	// R0, R1, and R2 for the pointers to the three slices
39	MOVD set1+0(FP), R0
40	MOVD set2+24(FP), R1
41	MOVD buffer+48(FP), R2
42
43	//R3 and R4 will be the values at which we will have finished reading set1 and set2.
44	// R3 should be R0 + 2 * set1_len+8(FP)
45	MOVD set1_len+8(FP), R3
46	MOVD set2_len+32(FP), R4
47
48	ADD R3<<1, R0, R3
49	ADD R4<<1, R1, R4
50
51
52	//Rather than counting the number of elements added separately
53	//Save the starting register of buffer.
54	MOVD buffer+48(FP), R5
55
56	// set1 is empty, just flush set2
57	CMP R0, R3
58	BEQ flush_right
59
60	// set2 is empty, just flush set1
61	CMP R1, R4
62	BEQ flush_left
63
64	// R6, R7 are the working space for s1 and s2
65	MOVD ZR, R6
66	MOVD ZR, R7
67
68	MOVHU.P 2(R0), R6
69	MOVHU.P 2(R1), R7
70loop:
71
72	CMP R6, R7
73	BEQ pop_both // R6 == R7
74	BLS pop_right // R6 > R7
75//pop_left: // R6 < R7
76	MOVHU.P R6, 2(R2)
77	CMP R0, R3
78	BEQ pop_then_flush_right
79	MOVHU.P 2(R0), R6
80	JMP loop
81pop_both:
82	MOVHU.P R6, 2(R2) //could also use R7, since they are equal
83	CMP R0, R3
84	BEQ flush_right
85	CMP R1, R4
86	BEQ flush_left
87	MOVHU.P 2(R0), R6
88	MOVHU.P 2(R1), R7
89	JMP loop
90pop_right:
91	MOVHU.P R7, 2(R2)
92	CMP R1, R4
93	BEQ pop_then_flush_left
94	MOVHU.P 2(R1), R7
95	JMP loop
96
97pop_then_flush_right:
98	MOVHU.P R7, 2(R2)
99flush_right:
100	MOVD R1, R0
101	MOVD R4, R3
102	JMP flush_left
103pop_then_flush_left:
104	MOVHU.P R6, 2(R2)
105flush_left:
106	CMP R0, R3
107	BEQ return
108	//figure out how many bytes to slough off. Must be a multiple of two
109	SUB R0, R3, R4
110	ANDS $6, R4
111	BEQ long_flush //handles the 0 mod 8 case
112	SUBS $4, R4, R4 // since possible values are 2, 4, 6, this splits evenly
113	BLT pop_single  // exactly the 2 case
114	MOVW.P 4(R0), R6
115	MOVW.P R6, 4(R2)
116	BEQ long_flush // we're now aligned by 64 bits, as R4==4, otherwise 2 more
117pop_single:
118	MOVHU.P 2(R0), R6
119	MOVHU.P R6, 2(R2)
120long_flush:
121	// at this point we know R3 - R0 is a multiple of 8.
122	CMP R0, R3
123	BEQ return
124	MOVD.P 8(R0), R6
125	MOVD.P R6, 8(R2)
126	JMP long_flush
127return:
128	// number of shorts written is (R5 - R2) >> 1
129	SUB R5, R2
130	LSR $1, R2, R2
131	MOVD R2, size+72(FP)
132	RET
133