1@ ARM procedure call convention:
2@ r0..r3, r12 (ip) and r14 (lr) are volatile.  Args are passed in r0..r3,
3@ and the return address in r14.
4@
5@ All other registers must be preserved by the callee.  r13 (sp) and r15 (pc)
6@ are as expected.
7@
8@ The usual convention is to push all the needed registers, including r14,
9@ on the stack, and the restore them at the end, but to r15 rather than r14.
10@ This, however, WILL NOT WORK for Thumb code.  You have to use the "bx"
11@ instruction for that, so you need one more trailing instruction.
12
13	.text
14	.align	2
15	.type	k_table, %object
16k_table:
17	.word	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b
18	.word	0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01
19	.word	0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7
20	.word	0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
21	.word	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 0x983e5152
22	.word	0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147
23	.word	0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc
24	.word	0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
25	.word	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819
26	.word	0xd6990624, 0xf40e3585, 0x106aa070, 0x19a4c116, 0x1e376c08
27	.word	0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f
28	.word	0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
29	.word	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
30	.size	k_table, .-k_table
31
32@ We use 13 local variables:
33pp	.req r0		@ The argument, points to the IV and w[] space
34aa	.req r1		@ Working variable.
35bb	.req r2
36cc	.req r3
37dd	.req r4
38ee	.req r5
39ff	.req r6
40gg	.req r7
41hh	.req r8
42ii	.req r9		@ Loop index
43tt	.req r10	@ General purpose temp
44kk	.req r11	@ k+64 (k_table+256)
45ww	.req r12	@ Actually, w+64 much of the time
46@ We could use r14 as well, but don't need to.
47@ (The names are doubled because a sigle b is "branch"!
48@
49@ This function takes a pointer to an array of 72 32-bit words:
50@ The first 8 are the state vector a..h
51@ The next 16 are the input data words w[0..15], in native byte order.
52@ The next 48 are used to hold the rest of the key schedule w[16..63].
53
54	.global	sha256_transform
55	.type	sha256_transform, %function
56sha256_transform:
57	stmfd	sp!, {r4,r5,r6,r7,r8,r9,r10,r11}
58	add	ww, pp, #4*(8+16)	@ w + 16 = p + 8 + 16
59	mov	ii, #64-16		@ loop counter
60
61	@ Fill in words 16..63 of the w[] array, at p+24..p+71
621:
63	@ ww[i] = w[i-16] + s0(w[i-15]) + w[i-7] + s1(w[i-2])
64	ldr	aa, [ww, #-64]		@ a = w[i-16]
65	ldr	bb, [ww, #-60]		@ b = w[i-15]
66	ldr	cc, [ww, #-28]		@ c = w[i-7]
67	add	aa, aa, cc		@ a += c (= w[i-7])
68
69	@ s0(x) = (x >>> 7) ^ (x >>> 18) ^ (x >> 3)
70	mov	cc, bb, ror #18		@ c = b>>>18
71	eor	cc, cc, bb, ror #7	@ c ^= b>>>7
72	eor	cc, cc, bb, lsr #3	@ c ^= b>>3
73	ldr	bb, [ww, #-8]		@ b = w[i-2]
74	add	aa, aa, cc		@ a += c (= s0(w[i-15]))
75	@ s1(x) = (x >>> 17) ^ (x >>> 19) ^ (x >> 10)
76	mov	cc, bb, ror #19		@ c = b>>>19
77	eor	cc, cc, bb, ror #17	@ c ^= b>>>17
78	eor	cc, cc, bb, lsr #10	@ c ^= b>>10
79	add	aa, aa, cc		@ a += c (= s1(w[i-2]))
80
81	subs	ii, ii, #1		@ --i
82	str	aa, [ww], #4		@ w[i++] = a
83	bne	1b
84
85
86	@ The main loop.  Arrays are indexed with i, which starts at -256
87	@ and counts up to 0.  In addition to t, we use h as a working
88	@ variable for the first part of the loop, until doing the
89	@ big register rotation, then a as a temp for the last part.
90
91	ldmia	pp, {aa,bb,cc,dd,ee,ff,gg,hh}	@ Load a..h
92	mov	ii, #-256		@ i = -64 (*4 strength-reduced)
93	adr	kk, k_table+256		@ Load up r12 to the END of k
942:
95	@ t = h + S1(e) + Ch(e,f,g) + k[i] + w[i]
96	@ Form t = Ch(e,f,g) = (g ^ (e & (f ^ g))
97	eor	tt, ff, gg		@ t = f^g
98	and	tt, tt, ee		@ t &= e
99	eor	tt, tt, gg		@ t ^= g
100
101	add	tt, tt, hh		@ t += h
102
103	@ Form t += S1(e) = (e >>> 6) ^ (e >>> 11) ^ (e >>> 25)
104	eor	hh, ee, ee, ror #25-6	@ h = e ^ e>>>(25-6)
105	eor	hh, hh, ee, ror #11-6	@ h = h ^ e>>>(11-6)
106	add	tt, tt, hh, ror #6	@ t += h>>>6
107
108	@ Add k[i] and w[i].  Note that -64 <= i < 0.
109	ldr	hh, [ww, ii]		@ h = w[64+i]
110	add	tt, tt, hh
111	ldr	hh, [kk, ii]		@ h = k[64+i]
112	add	tt, tt, hh
113	adds	ii, ii, #4		@ ++i (*4 strength-reduced)
114
115	@ Copy (h,g,f,e,d,c,b) = (g,f,e,d+t1,c,b,a)
116	@ This could be shrunk with aa big stm/ldm pair, but that
117	@ seems terribly wasteful...
118	mov	hh, gg			@ h = g
119	mov	gg, ff			@ g = f
120	mov	ff, ee			@ f = e
121	add	ee, dd, tt		@ e = d + t
122	mov	dd, cc			@ d = c
123	mov	cc, bb			@ c = b
124	mov	bb, aa			@ b = a
125
126	@ a = t + S0(b) + Maj(b,c,d)
127	@ Form t += S0(b) = (b >>> 2) ^ (b >>> 13) ^ (b >>> 22) */
128	eor	aa, bb, bb, ror #22-2	@ a = b ^ b>>>(22-2)
129	eor	aa, aa, bb, ror #13-2	@ a = a ^ b>>>(13-2)
130	add	tt, tt, aa, ror #2	@ t += a>>>2
131
132	@ Form a = t + Maj(b,c,d) = (c & d) + (b & (c ^ d))
133	and	aa, cc, dd		@ a = c & d
134	add	tt, tt, aa		@ t += a
135	eor	aa, cc, dd		@ a = c ^ d
136	and	aa, aa, bb		@ a &= b
137	add	aa, aa, tt		@ a += t
138
139	bne	2b			@ while (i != 0)
140
141	@ Now, the final summation.  Minimum code size is tricky...
142	ldmia	pp!, {ii,tt,kk,ww}	@ Load old iv[0..3]
143	add	aa, aa, ii		@ a += iv[0]
144	add	bb, bb, tt		@ b += iv[1]
145	add	cc, cc, kk		@ c += iv[2]
146	add	dd, dd, ww		@ d += iv[3]
147	ldmia	pp!, {ii,tt,kk,ww}	@ Load old iv[4..7]
148	add	ee, ee, ii		@ e += iv[4]
149	add	ff, ff, tt		@ f += iv[5]
150	add	gg, gg, kk		@ g += iv[6]
151	add	hh, hh, ww		@ h += iv[7]
152	stmfd	pp, {aa,bb,cc,dd,ee,ff,gg,hh}	@ Store new iv[0..7]
153
154	ldmfd	sp!, {r4,r5,r6,r7,r8,r9,r10,r11}
155	bx	lr
156
157	.size	sha256_transform, .-sha256_transform
158