1@ ARM procedure call convention: 2@ r0..r3, r12 (ip) and r14 (lr) are volatile. Args are passed in r0..r3, 3@ and the return address in r14. 4@ 5@ All other registers must be preserved by the callee. r13 (sp) and r15 (pc) 6@ are as expected. 7@ 8@ The usual convention is to push all the needed registers, including r14, 9@ on the stack, and the restore them at the end, but to r15 rather than r14. 10@ This, however, WILL NOT WORK for Thumb code. You have to use the "bx" 11@ instruction for that, so you need one more trailing instruction. 12 13 .text 14 .align 2 15 .type k_table, %object 16k_table: 17 .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b 18 .word 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01 19 .word 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7 20 .word 0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc 21 .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 0x983e5152 22 .word 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147 23 .word 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc 24 .word 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 25 .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819 26 .word 0xd6990624, 0xf40e3585, 0x106aa070, 0x19a4c116, 0x1e376c08 27 .word 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f 28 .word 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 29 .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 30 .size k_table, .-k_table 31 32@ We use 13 local variables: 33pp .req r0 @ The argument, points to the IV and w[] space 34aa .req r1 @ Working variable. 35bb .req r2 36cc .req r3 37dd .req r4 38ee .req r5 39ff .req r6 40gg .req r7 41hh .req r8 42ii .req r9 @ Loop index 43tt .req r10 @ General purpose temp 44kk .req r11 @ k+64 (k_table+256) 45ww .req r12 @ Actually, w+64 much of the time 46@ We could use r14 as well, but don't need to. 47@ (The names are doubled because a sigle b is "branch"! 48@ 49@ This function takes a pointer to an array of 72 32-bit words: 50@ The first 8 are the state vector a..h 51@ The next 16 are the input data words w[0..15], in native byte order. 52@ The next 48 are used to hold the rest of the key schedule w[16..63]. 53 54 .global sha256_transform 55 .type sha256_transform, %function 56sha256_transform: 57 stmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11} 58 add ww, pp, #4*(8+16) @ w + 16 = p + 8 + 16 59 mov ii, #64-16 @ loop counter 60 61 @ Fill in words 16..63 of the w[] array, at p+24..p+71 621: 63 @ ww[i] = w[i-16] + s0(w[i-15]) + w[i-7] + s1(w[i-2]) 64 ldr aa, [ww, #-64] @ a = w[i-16] 65 ldr bb, [ww, #-60] @ b = w[i-15] 66 ldr cc, [ww, #-28] @ c = w[i-7] 67 add aa, aa, cc @ a += c (= w[i-7]) 68 69 @ s0(x) = (x >>> 7) ^ (x >>> 18) ^ (x >> 3) 70 mov cc, bb, ror #18 @ c = b>>>18 71 eor cc, cc, bb, ror #7 @ c ^= b>>>7 72 eor cc, cc, bb, lsr #3 @ c ^= b>>3 73 ldr bb, [ww, #-8] @ b = w[i-2] 74 add aa, aa, cc @ a += c (= s0(w[i-15])) 75 @ s1(x) = (x >>> 17) ^ (x >>> 19) ^ (x >> 10) 76 mov cc, bb, ror #19 @ c = b>>>19 77 eor cc, cc, bb, ror #17 @ c ^= b>>>17 78 eor cc, cc, bb, lsr #10 @ c ^= b>>10 79 add aa, aa, cc @ a += c (= s1(w[i-2])) 80 81 subs ii, ii, #1 @ --i 82 str aa, [ww], #4 @ w[i++] = a 83 bne 1b 84 85 86 @ The main loop. Arrays are indexed with i, which starts at -256 87 @ and counts up to 0. In addition to t, we use h as a working 88 @ variable for the first part of the loop, until doing the 89 @ big register rotation, then a as a temp for the last part. 90 91 ldmia pp, {aa,bb,cc,dd,ee,ff,gg,hh} @ Load a..h 92 mov ii, #-256 @ i = -64 (*4 strength-reduced) 93 adr kk, k_table+256 @ Load up r12 to the END of k 942: 95 @ t = h + S1(e) + Ch(e,f,g) + k[i] + w[i] 96 @ Form t = Ch(e,f,g) = (g ^ (e & (f ^ g)) 97 eor tt, ff, gg @ t = f^g 98 and tt, tt, ee @ t &= e 99 eor tt, tt, gg @ t ^= g 100 101 add tt, tt, hh @ t += h 102 103 @ Form t += S1(e) = (e >>> 6) ^ (e >>> 11) ^ (e >>> 25) 104 eor hh, ee, ee, ror #25-6 @ h = e ^ e>>>(25-6) 105 eor hh, hh, ee, ror #11-6 @ h = h ^ e>>>(11-6) 106 add tt, tt, hh, ror #6 @ t += h>>>6 107 108 @ Add k[i] and w[i]. Note that -64 <= i < 0. 109 ldr hh, [ww, ii] @ h = w[64+i] 110 add tt, tt, hh 111 ldr hh, [kk, ii] @ h = k[64+i] 112 add tt, tt, hh 113 adds ii, ii, #4 @ ++i (*4 strength-reduced) 114 115 @ Copy (h,g,f,e,d,c,b) = (g,f,e,d+t1,c,b,a) 116 @ This could be shrunk with aa big stm/ldm pair, but that 117 @ seems terribly wasteful... 118 mov hh, gg @ h = g 119 mov gg, ff @ g = f 120 mov ff, ee @ f = e 121 add ee, dd, tt @ e = d + t 122 mov dd, cc @ d = c 123 mov cc, bb @ c = b 124 mov bb, aa @ b = a 125 126 @ a = t + S0(b) + Maj(b,c,d) 127 @ Form t += S0(b) = (b >>> 2) ^ (b >>> 13) ^ (b >>> 22) */ 128 eor aa, bb, bb, ror #22-2 @ a = b ^ b>>>(22-2) 129 eor aa, aa, bb, ror #13-2 @ a = a ^ b>>>(13-2) 130 add tt, tt, aa, ror #2 @ t += a>>>2 131 132 @ Form a = t + Maj(b,c,d) = (c & d) + (b & (c ^ d)) 133 and aa, cc, dd @ a = c & d 134 add tt, tt, aa @ t += a 135 eor aa, cc, dd @ a = c ^ d 136 and aa, aa, bb @ a &= b 137 add aa, aa, tt @ a += t 138 139 bne 2b @ while (i != 0) 140 141 @ Now, the final summation. Minimum code size is tricky... 142 ldmia pp!, {ii,tt,kk,ww} @ Load old iv[0..3] 143 add aa, aa, ii @ a += iv[0] 144 add bb, bb, tt @ b += iv[1] 145 add cc, cc, kk @ c += iv[2] 146 add dd, dd, ww @ d += iv[3] 147 ldmia pp!, {ii,tt,kk,ww} @ Load old iv[4..7] 148 add ee, ee, ii @ e += iv[4] 149 add ff, ff, tt @ f += iv[5] 150 add gg, gg, kk @ g += iv[6] 151 add hh, hh, ww @ h += iv[7] 152 stmfd pp, {aa,bb,cc,dd,ee,ff,gg,hh} @ Store new iv[0..7] 153 154 ldmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11} 155 bx lr 156 157 .size sha256_transform, .-sha256_transform 158