/* twofish-arm.S  -  ARM assembly implementation of Twofish cipher
 *
 * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>

/* Only built for little-endian ARM with a GCC-compatible platform
 * assembler; the C code falls back to the portable implementation
 * otherwise. */
#if defined(__ARMEL__)
#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS

.text

.syntax unified
.arm

/* Byte offsets into struct TWOFISH_context: four 256-entry u32
 * key-dependent s-boxes (s0..s3), followed by the 8 whitening
 * subkeys (w) and the 32 round subkeys (k). */
/* structure of TWOFISH_context: */
#define s0 0
#define s1 ((s0) + 4 * 256)
#define s2 ((s1) + 4 * 256)
#define s3 ((s2) + 4 * 256)
#define w ((s3) + 4 * 256)
#define k ((w) + 4 * 8)

/* register macros */
/* Three base pointers into the context are kept live so that any
 * s-box can be reached with a small immediate or register offset
 * (s2 is addressed as CTXs1 + (s2 - s1)). */
#define CTX %r0
#define CTXs0 %r0
#define CTXs1 %r1
#define CTXs3 %r7

/* The four 32-bit words of the cipher state. */
#define RA %r3
#define RB %r4
#define RC %r5
#define RD %r6

/* Round function outputs: RX = g(a)/X, RY = g(b)/Y. */
#define RX %r2
#define RY %ip

/* Byte-extraction mask, loaded with (0xff << 2) so that a masked,
 * pre-shifted byte is already scaled into a word-index offset. */
#define RMASK %lr

/* Scratch registers. */
#define RT0 %r8
#define RT1 %r9
#define RT2 %r10
#define RT3 %r11

/* helper macros */
/* Load a 32-bit little-endian value from a possibly unaligned
 * address, one byte at a time.  rout must differ from rtmp. */
#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
	ldrb rout, [rsrc, #((offs) + 0)]; \
	ldrb rtmp, [rsrc, #((offs) + 1)]; \
	orr rout, rout, rtmp, lsl #8; \
	ldrb rtmp, [rsrc, #((offs) + 2)]; \
	orr rout, rout, rtmp, lsl #16; \
	ldrb rtmp, [rsrc, #((offs) + 3)]; \
	orr rout, rout, rtmp, lsl #24;

/* Store a 32-bit value to a possibly unaligned address as
 * little-endian bytes.  rin is preserved; rtmp0/rtmp1 are clobbered. */
#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
	mov rtmp0, rin, lsr #8; \
	strb rin, [rdst, #((offs) + 0)]; \
	mov rtmp1, rin, lsr #16; \
	strb rtmp0, [rdst, #((offs) + 1)]; \
	mov rtmp0, rin, lsr #24; \
	strb rtmp1, [rdst, #((offs) + 2)]; \
	strb rtmp0, [rdst, #((offs) + 3)];

/* NOTE(review): the whole file is inside `#if defined(__ARMEL__)`,
 * so this big-endian branch is currently unreachable dead code —
 * presumably kept for symmetry with other ports; confirm before
 * relying on the rev path. */
#ifndef __ARMEL__
	/* bswap on big-endian */
	#define host_to_le(reg) \
		rev reg, reg;
	#define le_to_host(reg) \
		rev reg, reg;
#else
	/* nop on little-endian */
	#define host_to_le(reg) /*_*/
	#define le_to_host(reg) /*_*/
#endif

/* Load a 16-byte little-endian block from an aligned address into
 * four words; loads are interleaved with the (possible) byte swaps
 * to hide load latency. */
#define ldr_input_aligned_le(rin, a, b, c, d) \
	ldr a, [rin, #0]; \
	ldr b, [rin, #4]; \
	le_to_host(a); \
	ldr c, [rin, #8]; \
	le_to_host(b); \
	ldr d, [rin, #12]; \
	le_to_host(c); \
	le_to_host(d);

/* Store four words as a 16-byte little-endian block to an aligned
 * address, again interleaving swaps and stores. */
#define str_output_aligned_le(rout, a, b, c, d) \
	le_to_host(a); \
	le_to_host(b); \
	str a, [rout, #0]; \
	le_to_host(c); \
	str b, [rout, #4]; \
	le_to_host(d); \
	str c, [rout, #8]; \
	str d, [rout, #12];

#ifdef __ARM_FEATURE_UNALIGNED
	/* unaligned word reads/writes allowed */
	#define ldr_input_le(rin, ra, rb, rc, rd, rtmp) \
		ldr_input_aligned_le(rin, ra, rb, rc, rd)

	#define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
		str_output_aligned_le(rout, ra, rb, rc, rd)
#else
	/* need to handle unaligned reads/writes by byte reads */
	/* Runtime alignment dispatch: word accesses when the pointer is
	 * 4-byte aligned, byte-wise fallback otherwise.  Uses numeric
	 * local labels (1:/2:) so the macro may be expanded repeatedly. */
	#define ldr_input_le(rin, ra, rb, rc, rd, rtmp0) \
		tst rin, #3; \
		beq 1f; \
		ldr_unaligned_le(ra, rin, 0, rtmp0); \
		ldr_unaligned_le(rb, rin, 4, rtmp0); \
		ldr_unaligned_le(rc, rin, 8, rtmp0); \
		ldr_unaligned_le(rd, rin, 12, rtmp0); \
		b 2f; \
	1:;\
		ldr_input_aligned_le(rin, ra, rb, rc, rd); \
	2:;

	#define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
		tst rout, #3; \
		beq 1f; \
		str_unaligned_le(ra, rout, 0, rtmp0, rtmp1); \
		str_unaligned_le(rb, rout, 4, rtmp0, rtmp1); \
		str_unaligned_le(rc, rout, 8, rtmp0, rtmp1); \
		str_unaligned_le(rd, rout, 12, rtmp0, rtmp1); \
		b 2f; \
	1:;\
		str_output_aligned_le(rout, ra, rb, rc, rd); \
	2:;
#endif

/**********************************************************************
  1-way twofish
 **********************************************************************/

/* One Twofish encryption round (Feistel half-round pair input).
 *
 * Computes Y = g(b') from the four s-boxes (b' is b pre-rotated, hence
 * the s-box order s1/s2/s3/s0 relative to byte position), computes
 * X = g(a), combines them with the pseudo-Hadamard transform plus the
 * round subkeys k[2n]/k[2n+1], then:
 *     rd = (rd <<< 1) ^ (X + 2Y + k[2n+1])      (ror #31 == rol #1)
 *     rc =  rc ^ (X + Y + k[2n])
 * The rotate-right-by-1 that Twofish applies to rc is NOT done here:
 * it is deferred to the next round, which compensates by adding
 * adj_a to its shift amounts and finally applies ror_a(a) — this
 * saves one instruction per round.  s-box lookups use RMASK
 * (0xff << 2) on pre-shifted bytes so the byte value is already a
 * scaled word offset.  Clobbers RX, RY, RT0-RT3. */
#define encrypt_round(a, b, rc, rd, n, ror_a, adj_a) \
	and RT0, RMASK, b, lsr#(8 - 2); \
	and RY, RMASK, b, lsr#(16 - 2); \
	add RT0, RT0, #(s2 - s1); \
	and RT1, RMASK, b, lsr#(24 - 2); \
	ldr RY, [CTXs3, RY]; \
	and RT2, RMASK, b, lsl#(2); \
	ldr RT0, [CTXs1, RT0]; \
	and RT3, RMASK, a, lsr#(16 - 2 + (adj_a)); \
	ldr RT1, [CTXs0, RT1]; \
	and RX, RMASK, a, lsr#(8 - 2 + (adj_a)); \
	ldr RT2, [CTXs1, RT2]; \
	add RT3, RT3, #(s2 - s1); \
	ldr RX, [CTXs1, RX]; \
	ror_a(a); \
	\
	eor RY, RY, RT0; \
	ldr RT3, [CTXs1, RT3]; \
	and RT0, RMASK, a, lsl#(2); \
	eor RY, RY, RT1; \
	and RT1, RMASK, a, lsr#(24 - 2); \
	eor RY, RY, RT2; \
	ldr RT0, [CTXs0, RT0]; \
	eor RX, RX, RT3; \
	ldr RT1, [CTXs3, RT1]; \
	eor RX, RX, RT0; \
	\
	ldr RT3, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \
	eor RX, RX, RT1; \
	ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \
	\
	add RT0, RX, RY, lsl #1; \
	add RX, RX, RY; \
	add RT0, RT0, RT3; \
	add RX, RX, RT2; \
	eor rd, RT0, rd, ror #31; \
	eor rc, rc, RX;

/* No-op placeholder passed as ror_a/ror_b for the first round of a
 * block, where no rotation from a previous round is pending. */
#define dummy(x) /*_*/

/* Apply the rotate-right-by-1 deferred from the previous round. */
#define ror1(r) \
	ror r, r, #1;

/* One Twofish decryption round; mirror image of encrypt_round.
 * Here the deferred rotation lives on the b input (adj_b/ror_b) and
 * the final mixing is inverted:
 *     rd =  rd ^ (X + 2Y + k[2n+1])
 *     rc = (rc <<< 1) ^ (X + Y + k[2n])
 * Clobbers RX, RY, RT0-RT3. */
#define decrypt_round(a, b, rc, rd, n, ror_b, adj_b) \
	and RT3, RMASK, b, lsl#(2 - (adj_b)); \
	and RT1, RMASK, b, lsr#(8 - 2 + (adj_b)); \
	ror_b(b); \
	and RT2, RMASK, a, lsl#(2); \
	and RT0, RMASK, a, lsr#(8 - 2); \
	\
	ldr RY, [CTXs1, RT3]; \
	add RT1, RT1, #(s2 - s1); \
	ldr RX, [CTXs0, RT2]; \
	and RT3, RMASK, b, lsr#(16 - 2); \
	ldr RT1, [CTXs1, RT1]; \
	and RT2, RMASK, a, lsr#(16 - 2); \
	ldr RT0, [CTXs1, RT0]; \
	\
	add RT2, RT2, #(s2 - s1); \
	ldr RT3, [CTXs3, RT3]; \
	eor RY, RY, RT1; \
	\
	and RT1, RMASK, b, lsr#(24 - 2); \
	eor RX, RX, RT0; \
	ldr RT2, [CTXs1, RT2]; \
	and RT0, RMASK, a, lsr#(24 - 2); \
	\
	ldr RT1, [CTXs0, RT1]; \
	\
	eor RY, RY, RT3; \
	ldr RT0, [CTXs3, RT0]; \
	eor RX, RX, RT2; \
	eor RY, RY, RT1; \
	\
	ldr RT1, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \
	eor RX, RX, RT0; \
	ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \
	\
	add RT0, RX, RY, lsl #1; \
	add RX, RX, RY; \
	add RT0, RT0, RT1; \
	add RX, RX, RT2; \
	eor rd, rd, RT0; \
	eor rc, RX, rc, ror #31;

/* First cycle: no rotation is pending on entry, so the first round
 * uses dummy/0; the second round compensates for the rotation the
 * first round deferred (ror1/1). */
#define first_encrypt_cycle(nc) \
	encrypt_round(RA, RB, RC, RD, (nc) * 2, dummy, 0); \
	encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);

/* Steady-state cycle: two rounds, each absorbing the rotation
 * deferred by the round before it. */
#define encrypt_cycle(nc) \
	encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
	encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);

/* Last cycle: the final round still leaves one rotation pending on
 * RA, which is applied explicitly afterwards. */
#define last_encrypt_cycle(nc) \
	encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
	encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
	ror1(RA);

/* Decryption walks the subkey indices in reverse; same deferred-
 * rotation scheme as the encryption cycles. */
#define first_decrypt_cycle(nc) \
	decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, dummy, 0); \
	decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);

#define decrypt_cycle(nc) \
	decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
	decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);

#define last_decrypt_cycle(nc) \
	decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
	decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
	ror1(RD);

.align 3
.globl _gcry_twofish_arm_encrypt_block
.type   _gcry_twofish_arm_encrypt_block,%function;

/* void _gcry_twofish_arm_encrypt_block(ctx, byte *dst, const byte *src)
 *
 * Encrypts one 16-byte block.  dst/src may be unaligned and may
 * alias.  %r1 (dst) is pushed because CTXs1 reuses %r1 during the
 * rounds; it is popped back just before output whitening. */
_gcry_twofish_arm_encrypt_block:
	/* input:
	 *	%r0: ctx
	 *	%r1: dst
	 *	%r2: src
	 */
	push {%r1, %r4-%r11, %ip, %lr};

	add RY, CTXs0, #w;

	ldr_input_le(%r2, RA, RB, RC, RD, RT0);

	/* Input whitening */
	ldm RY, {RT0, RT1, RT2, RT3};
	add CTXs3, CTXs0, #(s3 - s0);
	add CTXs1, CTXs0, #(s1 - s0);
	mov RMASK, #(0xff << 2);
	eor RA, RA, RT0;
	eor RB, RB, RT1;
	eor RC, RC, RT2;
	eor RD, RD, RT3;

	/* 16 rounds = 8 two-round cycles. */
	first_encrypt_cycle(0);
	encrypt_cycle(1);
	encrypt_cycle(2);
	encrypt_cycle(3);
	encrypt_cycle(4);
	encrypt_cycle(5);
	encrypt_cycle(6);
	last_encrypt_cycle(7);

	/* Point RY at w[4..7], the output-whitening subkeys. */
	add RY, CTXs3, #(w + 4*4 - s3);
	pop {%r1}; /* dst */

	/* Output whitening; note the swapped halves (C,D,A,B) from the
	 * undone final-round swap. */
	ldm RY, {RT0, RT1, RT2, RT3};
	eor RC, RC, RT0;
	eor RD, RD, RT1;
	eor RA, RA, RT2;
	eor RB, RB, RT3;

	str_output_le(%r1, RC, RD, RA, RB, RT0, RT1);

	pop {%r4-%r11, %ip, %pc};
.ltorg
.size _gcry_twofish_arm_encrypt_block,.-_gcry_twofish_arm_encrypt_block;

.align 3
.globl _gcry_twofish_arm_decrypt_block
.type   _gcry_twofish_arm_decrypt_block,%function;

/* void _gcry_twofish_arm_decrypt_block(ctx, byte *dst, const byte *src)
 *
 * Decrypts one 16-byte block; inverse of the encrypt path, so input
 * whitening uses w[4..7] and output whitening uses w[0..3]. */
_gcry_twofish_arm_decrypt_block:
	/* input:
	 *	%r0: ctx
	 *	%r1: dst
	 *	%r2: src
	 */
	push {%r1, %r4-%r11, %ip, %lr};

	add CTXs3, CTXs0, #(s3 - s0);

	ldr_input_le(%r2, RC, RD, RA, RB, RT0);

	add RY, CTXs3, #(w + 4*4 - s3);
	/* NOTE(review): CTXs3 was already set above and ldr_input_le does
	 * not touch %r7, so this second add looks redundant — verify
	 * before removing. */
	add CTXs3, CTXs0, #(s3 - s0);

	/* Input whitening */
	ldm RY, {RT0, RT1, RT2, RT3};
	add CTXs1, CTXs0, #(s1 - s0);
	mov RMASK, #(0xff << 2);
	eor RC, RC, RT0;
	eor RD, RD, RT1;
	eor RA, RA, RT2;
	eor RB, RB, RT3;

	/* 16 rounds in reverse subkey order. */
	first_decrypt_cycle(7);
	decrypt_cycle(6);
	decrypt_cycle(5);
	decrypt_cycle(4);
	decrypt_cycle(3);
	decrypt_cycle(2);
	decrypt_cycle(1);
	last_decrypt_cycle(0);

	add RY, CTXs0, #w;
	pop {%r1}; /* dst */

	/* Output whitening */
	ldm RY, {RT0, RT1, RT2, RT3};
	eor RA, RA, RT0;
	eor RB, RB, RT1;
	eor RC, RC, RT2;
	eor RD, RD, RT3;

	str_output_le(%r1, RA, RB, RC, RD, RT0, RT1);

	pop {%r4-%r11, %ip, %pc};
.size _gcry_twofish_arm_decrypt_block,.-_gcry_twofish_arm_decrypt_block;

#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
#endif /*__ARMEL__*/