1 /* FFdecsa -- fast decsa algorithm
2  *
3  * Copyright (C) 2003-2004  fatih89r
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18  */
19 
20 
21 #include <sys/types.h>
22 #include <string.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 
/* Fallback null pointer constant for environments whose headers lack one. */
#if !defined(NULL)
#define NULL 0
#endif

/*
 * Debug tracing switch.
 * With DEBUG defined, DBG(stmt) expands to stmt; otherwise it expands to
 * nothing, so the wrapped statement is not compiled at all.
 */
//#define DEBUG
#if defined(DEBUG)
#define DBG(a) a
#else
#define DBG(a)
#endif
36 
37 //// parallelization stuff, large speed differences are possible
38 // possible choices
39 #define PARALLEL_32_4CHAR     320
40 #define PARALLEL_32_4CHARA    321
41 #define PARALLEL_32_INT       322
42 #define PARALLEL_64_8CHAR     640
43 #define PARALLEL_64_8CHARA    641
44 #define PARALLEL_64_2INT      642
45 #define PARALLEL_64_LONG      643
46 #define PARALLEL_64_MMX       644
47 #define PARALLEL_128_16CHAR  1280
48 #define PARALLEL_128_16CHARA 1281
49 #define PARALLEL_128_4INT    1282
50 #define PARALLEL_128_2LONG   1283
51 #define PARALLEL_128_2MMX    1284
52 #define PARALLEL_128_SSE     1285
53 #define PARALLEL_128_SSE2    1286
54 
55 #include "parallel_generic.h"
56 //// conditionals
57 #if PARALLEL_MODE==PARALLEL_32_4CHAR
58 #include "parallel_032_4char.h"
59 #elif PARALLEL_MODE==PARALLEL_32_4CHARA
60 #include "parallel_032_4charA.h"
61 #elif PARALLEL_MODE==PARALLEL_32_INT
62 #include "parallel_032_int.h"
63 #define FUNC(x) (x ## _32int)
64 #elif PARALLEL_MODE==PARALLEL_64_8CHAR
65 #include "parallel_064_8char.h"
66 #elif PARALLEL_MODE==PARALLEL_64_8CHARA
67 #include "parallel_064_8charA.h"
68 #elif PARALLEL_MODE==PARALLEL_64_2INT
69 #include "parallel_064_2int.h"
70 #elif PARALLEL_MODE==PARALLEL_64_LONG
71 #include "parallel_064_long.h"
72 #elif PARALLEL_MODE==PARALLEL_64_MMX
73 #include "parallel_064_mmx.h"
74 #define FUNC(x) (x ## _64mmx)
75 #elif PARALLEL_MODE==PARALLEL_128_16CHAR
76 #include "parallel_128_16char.h"
77 #elif PARALLEL_MODE==PARALLEL_128_16CHARA
78 #include "parallel_128_16charA.h"
79 #elif PARALLEL_MODE==PARALLEL_128_4INT
80 #include "parallel_128_4int.h"
81 #elif PARALLEL_MODE==PARALLEL_128_2LONG
82 #include "parallel_128_2long.h"
83 #elif PARALLEL_MODE==PARALLEL_128_2MMX
84 #include "parallel_128_2mmx.h"
85 #elif PARALLEL_MODE==PARALLEL_128_SSE
86 #include "parallel_128_sse.h"
87 #elif PARALLEL_MODE==PARALLEL_128_SSE2
88 #include "parallel_128_sse2.h"
89 #define FUNC(x) (x ## _128sse2)
90 #else
91 #error "unknown/undefined parallel mode"
92 #endif
93 
94 
95 // stuff depending on conditionals
96 
/* Group size expressed in bytes and in bits, with short aliases. */
#define BYTES_PER_GROUP (GROUP_PARALLELISM/8)
#define BYPG BYTES_PER_GROUP
#define BITS_PER_GROUP GROUP_PARALLELISM
#define BIPG BITS_PER_GROUP

/*
 * Defaults for hooks that a parallel_*.h header may already have defined:
 * allocator, deallocator, and an alignment decoration (empty by default).
 */
#if !defined(MALLOC)
#define MALLOC(X) malloc(X)
#endif
#if !defined(FREE)
#define FREE(X) free(X)
#endif
#if !defined(MEMALIGN)
#define MEMALIGN
#endif
111 
112 //// debug tool
113 
#ifdef DEBUG
/*
 * Hex-dump `len` bytes starting at `p` to stderr, `linelen` bytes per row.
 * Every row is prefixed with `string` and the byte offset of its first
 * element; extra spaces are inserted every 4 and every 8 bytes.
 *
 * Compiled only when DEBUG is defined, because that is the only
 * configuration in which the DBG(dump_mem(...)) call sites exist.
 * A non-positive `linelen` is treated as "everything on one row" so that
 * zero-length dumps (e.g. an empty residue) cannot divide by zero.
 */
static void dump_mem(const char *string, const unsigned char *p, int len, int linelen){
  int i;
  if(len<=0) return;              // nothing to show
  if(linelen<=0) linelen=len;     // avoid i%0 below
  for(i=0;i<len;i++){
    if(i%linelen==0){
      if(i) fprintf(stderr,"\n"); // close the previous row
      fprintf(stderr,"%s %08x:",string,i);
    }
    else{
      if(i%8==0) fprintf(stderr," ");
      if(i%4==0) fprintf(stderr," ");
    }
    fprintf(stderr," %02x",p[i]);
  }
  // always terminate the last row, even when it is only partially filled
  fprintf(stderr,"\n");
}
#endif
129 
130 //////////////////////////////////////////////////////////////////////////////////
131 
132 struct csa_key_t{
133 	unsigned char ck[8];
134 // used by stream
135         int iA[8];  // iA[0] is for A1, iA[7] is for A8
136         int iB[8];  // iB[0] is for B1, iB[7] is for B8
137 // used by stream (group)
138         MEMALIGN group ck_g[8][8]; // [byte][bit:0=LSB,7=MSB]
139         MEMALIGN group iA_g[8][4]; // [0 for A1][0 for LSB]
140         MEMALIGN group iB_g[8][4]; // [0 for B1][0 for LSB]
141 // used by block
142 	unsigned char kk[56];
143 // used by block (group)
144 	MEMALIGN batch kkmulti[56]; // many times the same byte in every batch
145 };
146 
147 struct csa_keys_t{
148   struct csa_key_t even;
149   struct csa_key_t odd;
150 };
151 
152 //-----stream cypher
153 
154 //-----key schedule for stream decypher
/*
 * Split the control word into nibbles for the stream cipher.
 * ck[0..3] feed iA[0..7] and ck[4..7] feed iB[0..7]; within each byte the
 * high nibble comes first.
 */
static void key_schedule_stream(
  unsigned char *ck,    // [In]  ck[0]-ck[7]   8 bytes   | Key.
  int *iA,              // [Out] iA[0]-iA[7]   8 nibbles | Key schedule.
  int *iB)              // [Out] iB[0]-iB[7]   8 nibbles | Key schedule.
{
  int pair;

  for(pair=0; pair<4; pair++){
    const unsigned char a_byte=ck[pair];
    const unsigned char b_byte=ck[pair+4];

    iA[2*pair]  =(a_byte>>4)&0xf;
    iA[2*pair+1]= a_byte    &0xf;
    iB[2*pair]  =(b_byte>>4)&0xf;
    iB[2*pair+1]= b_byte    &0xf;
  }
}
177 
178 //----- stream main function
179 
180 #define STREAM_INIT
181 #include "stream.c"
182 #undef STREAM_INIT
183 
184 #define STREAM_NORMAL
185 #include "stream.c"
186 #undef STREAM_NORMAL
187 
188 
189 //-----block decypher
190 
191 //-----key schedule for block decypher
192 
/*
 * Expand the 8-byte control word into the 56-byte block cipher key schedule.
 *
 * Seven 8-byte rows are produced: row 6 is the key itself and each lower row
 * is the 64-bit permutation `key_perm` applied to the row above it.  Every
 * byte of row i is finally XORed with i to give kk[i*8 .. i*8+7].
 */
static void key_schedule_block(
  unsigned char *ck,    // [In]  ck[0]-ck[7]   8 bytes | Key.
  unsigned char *kk)    // [Out] kk[0]-kk[55] 56 bytes | Key schedule.
{
  // key_perm[s] is the 1-based destination of source bit s (bit 0 = MSB of byte 0)
  static const unsigned char key_perm[0x40] = {
    0x12,0x24,0x09,0x07,0x2A,0x31,0x1D,0x15, 0x1C,0x36,0x3E,0x32,0x13,0x21,0x3B,0x40,
    0x18,0x14,0x25,0x27,0x02,0x35,0x1B,0x01, 0x22,0x04,0x0D,0x0E,0x39,0x28,0x1A,0x29,
    0x33,0x23,0x34,0x0C,0x16,0x30,0x1E,0x3A, 0x2D,0x1F,0x08,0x19,0x17,0x2F,0x3D,0x11,
    0x3C,0x05,0x38,0x2B,0x0B,0x06,0x0A,0x2C, 0x20,0x3F,0x2E,0x0F,0x03,0x26,0x10,0x37,
  };

  int kb[7][8];
  int permuted[64];
  int row,byte,pos,src;

  // top row is the key itself
  for(byte=0; byte<8; byte++){
    kb[6][byte] = ck[byte];
  }

  // derive rows 5..0, each from the row above it
  for(row=5; row>=0; row--){
    // scatter the 64 source bits to their permuted positions
    for(src=0; src<64; src++){
      permuted[key_perm[src]-1] = (kb[row+1][src>>3] >> (7-(src&7))) & 1;
    }
    // repack, MSB first
    for(byte=0; byte<8; byte++){
      int packed = 0;
      for(pos=0; pos<8; pos++){
        packed = (packed<<1) | permuted[byte*8+pos];
      }
      kb[row][byte] = packed;
    }
  }

  // xor each row with its own index to give kk
  for(row=0; row<7; row++){
    for(byte=0; byte<8; byte++){
      kk[row*8+byte] = kb[row][byte] ^ row;
    }
  }
}
247 
248 //-----block utils
249 
trasp_N_8(unsigned char * in,unsigned char * out,int count)250 static inline __attribute__((always_inline)) void trasp_N_8 (unsigned char *in,unsigned char* out,int count){
251   int *ri=(int *)in;
252   int *ibi=(int *)out;
253   int j,i,k,g;
254   // copy and first step
255   for(g=0;g<count;g++){
256     ri[g]=ibi[2*g];
257     ri[GROUP_PARALLELISM+g]=ibi[2*g+1];
258   }
259 //dump_mem("NE1 r[roff]",&r[roff],GROUP_PARALLELISM*8,GROUP_PARALLELISM);
260 // now 01230123
261 #define INTS_PER_ROW (GROUP_PARALLELISM/8*2)
262   for(j=0;j<8;j+=4){
263     for(i=0;i<2;i++){
264       for(k=0;k<INTS_PER_ROW;k++){
265         unsigned int t,b;
266         t=ri[INTS_PER_ROW*(j+i)+k];
267         b=ri[INTS_PER_ROW*(j+i+2)+k];
268         ri[INTS_PER_ROW*(j+i)+k]=     (t&0x0000ffff)      | ((b           )<<16);
269         ri[INTS_PER_ROW*(j+i+2)+k]=  ((t           )>>16) |  (b&0xffff0000) ;
270       }
271     }
272   }
273 //dump_mem("NE2 r[roff]",&r[roff],GROUP_PARALLELISM*8,GROUP_PARALLELISM);
274 // now 01010101
275   for(j=0;j<8;j+=2){
276     for(i=0;i<1;i++){
277       for(k=0;k<INTS_PER_ROW;k++){
278         unsigned int t,b;
279         t=ri[INTS_PER_ROW*(j+i)+k];
280         b=ri[INTS_PER_ROW*(j+i+1)+k];
281         ri[INTS_PER_ROW*(j+i)+k]=     (t&0x00ff00ff)     | ((b&0x00ff00ff)<<8);
282         ri[INTS_PER_ROW*(j+i+1)+k]=  ((t&0xff00ff00)>>8) |  (b&0xff00ff00);
283       }
284     }
285   }
286 //dump_mem("NE3 r[roff]",&r[roff],GROUP_PARALLELISM*8,GROUP_PARALLELISM);
287 // now 00000000
288 }
289 
trasp_8_N(unsigned char * in,unsigned char * out,int count)290 static inline __attribute__((always_inline)) void trasp_8_N (unsigned char *in,unsigned char* out,int count){
291   int *ri=(int *)in;
292   int *bdi=(int *)out;
293   int j,i,k,g;
294 #define INTS_PER_ROW (GROUP_PARALLELISM/8*2)
295 //dump_mem("NE1 r[roff]",&r[roff],GROUP_PARALLELISM*8,GROUP_PARALLELISM);
296 // now 00000000
297   for(j=0;j<8;j+=2){
298     for(i=0;i<1;i++){
299       for(k=0;k<INTS_PER_ROW;k++){
300         unsigned int t,b;
301         t=ri[INTS_PER_ROW*(j+i)+k];
302         b=ri[INTS_PER_ROW*(j+i+1)+k];
303         ri[INTS_PER_ROW*(j+i)+k]=     (t&0x00ff00ff)     | ((b&0x00ff00ff)<<8);
304         ri[INTS_PER_ROW*(j+i+1)+k]=  ((t&0xff00ff00)>>8) |  (b&0xff00ff00);
305       }
306     }
307   }
308 //dump_mem("NE2 r[roff]",&r[roff],GROUP_PARALLELISM*8,GROUP_PARALLELISM);
309 // now 01010101
310   for(j=0;j<8;j+=4){
311     for(i=0;i<2;i++){
312       for(k=0;k<INTS_PER_ROW;k++){
313         unsigned int t,b;
314         t=ri[INTS_PER_ROW*(j+i)+k];
315         b=ri[INTS_PER_ROW*(j+i+2)+k];
316         ri[INTS_PER_ROW*(j+i)+k]=     (t&0x0000ffff)      | ((b           )<<16);
317         ri[INTS_PER_ROW*(j+i+2)+k]=  ((t           )>>16) |  (b&0xffff0000) ;
318       }
319     }
320   }
321 //dump_mem("NE3 r[roff]",&r[roff],GROUP_PARALLELISM*8,GROUP_PARALLELISM);
322 // now 01230123
323   for(g=0;g<count;g++){
324     bdi[2*g]=ri[g];
325     bdi[2*g+1]=ri[GROUP_PARALLELISM+g];
326   }
327 }
328 
329 //-----block main function
330 
331 // block group
/*
 * Run the 56-round block decipher on a whole group of 8-byte blocks at once.
 *
 * The working state `r` holds 8 live rows of GROUP_PARALLELISM bytes each.
 * Instead of shifting the rows every round, the window start `roff` moves
 * down by one row per round, which is why `r` has 56 extra rows in front.
 *
 * Only the first `count` blocks are loaded from `ib` and stored to `bd`;
 * the rounds themselves always process all GROUP_PARALLELISM lanes (the
 * unused lanes start out zeroed).
 */
static void block_decypher_group (
  batch *kkmulti,       // [In]  kkmulti[0]-kkmulti[55] 56 batches | Key schedule (each batch has repeated equal bytes).
  unsigned char *ib,    // [In]  (ib0,ib1,...ib7)...x32 32*8 bytes | Initialization vector.
  unsigned char *bd,    // [Out] (bd0,bd1,...bd7)...x32 32*8 bytes | Block decipher.
  int count)            // [In]  number of blocks actually present in ib / wanted in bd
{
  static const unsigned char block_sbox[0x100] = {
    0x3A,0xEA,0x68,0xFE,0x33,0xE9,0x88,0x1A, 0x83,0xCF,0xE1,0x7F,0xBA,0xE2,0x38,0x12,
    0xE8,0x27,0x61,0x95,0x0C,0x36,0xE5,0x70, 0xA2,0x06,0x82,0x7C,0x17,0xA3,0x26,0x49,
    0xBE,0x7A,0x6D,0x47,0xC1,0x51,0x8F,0xF3, 0xCC,0x5B,0x67,0xBD,0xCD,0x18,0x08,0xC9,
    0xFF,0x69,0xEF,0x03,0x4E,0x48,0x4A,0x84, 0x3F,0xB4,0x10,0x04,0xDC,0xF5,0x5C,0xC6,
    0x16,0xAB,0xAC,0x4C,0xF1,0x6A,0x2F,0x3C, 0x3B,0xD4,0xD5,0x94,0xD0,0xC4,0x63,0x62,
    0x71,0xA1,0xF9,0x4F,0x2E,0xAA,0xC5,0x56, 0xE3,0x39,0x93,0xCE,0x65,0x64,0xE4,0x58,
    0x6C,0x19,0x42,0x79,0xDD,0xEE,0x96,0xF6, 0x8A,0xEC,0x1E,0x85,0x53,0x45,0xDE,0xBB,
    0x7E,0x0A,0x9A,0x13,0x2A,0x9D,0xC2,0x5E, 0x5A,0x1F,0x32,0x35,0x9C,0xA8,0x73,0x30,

    0x29,0x3D,0xE7,0x92,0x87,0x1B,0x2B,0x4B, 0xA5,0x57,0x97,0x40,0x15,0xE6,0xBC,0x0E,
    0xEB,0xC3,0x34,0x2D,0xB8,0x44,0x25,0xA4, 0x1C,0xC7,0x23,0xED,0x90,0x6E,0x50,0x00,
    0x99,0x9E,0x4D,0xD9,0xDA,0x8D,0x6F,0x5F, 0x3E,0xD7,0x21,0x74,0x86,0xDF,0x6B,0x05,
    0x8E,0x5D,0x37,0x11,0xD2,0x28,0x75,0xD6, 0xA7,0x77,0x24,0xBF,0xF0,0xB0,0x02,0xB7,
    0xF8,0xFC,0x81,0x09,0xB1,0x01,0x76,0x91, 0x7D,0x0F,0xC8,0xA0,0xF2,0xCB,0x78,0x60,
    0xD1,0xF7,0xE0,0xB5,0x98,0x22,0xB3,0x20, 0x1D,0xA6,0xDB,0x7B,0x59,0x9F,0xAE,0x31,
    0xFB,0xD3,0xB6,0xCA,0x43,0x72,0x07,0xF4, 0xD8,0x41,0x14,0x55,0x0D,0x54,0x8B,0xB9,
    0xAD,0x46,0x0B,0xAF,0x80,0x52,0x2C,0xFA, 0x8C,0x89,0x66,0xFD,0xB2,0xA9,0x9B,0xC0,
  };
  MEMALIGN unsigned char r[GROUP_PARALLELISM*(8+56)];  /* 8 live rows + 56 rows of room for the sliding window */
  MEMALIGN unsigned char sbox_in[GROUP_PARALLELISM],sbox_out[GROUP_PARALLELISM],perm_out[GROUP_PARALLELISM];
  const int lanes=GROUP_PARALLELISM;
  int roff=GROUP_PARALLELISM*56;
  int round,lane;

  // clear the 8 live rows so lanes beyond `count` hold defined data
  memset(r + roff, 0, sizeof(r) - roff);

  // load the input blocks into the row layout
#define FASTTRASP1
#ifdef FASTTRASP1
  trasp_N_8((unsigned char *)&r[roff],(unsigned char *)ib,count);
#else
  for(lane=0;lane<count;lane++){
    int row;
    for(row=0;row<8;row++){
      r[roff+GROUP_PARALLELISM*row+lane]=ib[8*lane+row];
    }
  }
#endif

  // rounds run over kk[55]..kk[0]
  for(round=55;round>=0;round--){
    // sbox input = round key byte xor row 6
    {
      MEMALIGN batch round_key=kkmulti[round];
      batch *xored=(batch *)sbox_in;
      batch *row6=(batch *)(r+roff+GROUP_PARALLELISM*6);
      int chunk;
      for(chunk=0;chunk<lanes/BYTES_PER_BATCH;chunk++){
        xored[chunk]=B_FFXOR(round_key,row6[chunk]);              //FIXME: introduce FASTBATCH?
      }
    }

    // table lookup: inherently one byte at a time, and 8 input bits are
    // too many to express the table as boolean terms
    for(lane=0;lane<lanes;lane++){
      sbox_out[lane]=block_sbox[sbox_in[lane]];
    }

    // bit permutation of the sbox output, one batch at a time
    for(lane=0;lane<lanes;lane+=BYTES_PER_BATCH){
      MEMALIGN batch in,out;
      in=*(batch *)&sbox_out[lane];

      out=B_FFSH8L(B_FFAND(in,B_FFN_ALL_29()),1);
      out=B_FFOR(out,B_FFSH8L(B_FFAND(in,B_FFN_ALL_02()),6));
      out=B_FFOR(out,B_FFSH8L(B_FFAND(in,B_FFN_ALL_04()),3));
      out=B_FFOR(out,B_FFSH8R(B_FFAND(in,B_FFN_ALL_10()),2));
      out=B_FFOR(out,B_FFSH8R(B_FFAND(in,B_FFN_ALL_40()),6));
      out=B_FFOR(out,B_FFSH8R(B_FFAND(in,B_FFN_ALL_80()),4));

      *(batch *)&perm_out[lane]=out;
    }

    roff-=GROUP_PARALLELISM; /* virtual shift of registers */

    // per lane: new row 0 = old last row xor sbox_out; row 6 ^= perm_out;
    // rows 4, 3 and 2 ^= new row 0
    for(lane=0;lane<lanes;lane+=BEST_SPAN){
      XOR_BEST_BY(&r[roff+GROUP_PARALLELISM*0+lane],&r[roff+GROUP_PARALLELISM*8+lane],&sbox_out[lane]);
      XOREQ_BEST_BY(&r[roff+GROUP_PARALLELISM*6+lane],&perm_out[lane]);
      XOREQ_BEST_BY(&r[roff+GROUP_PARALLELISM*4+lane],&r[roff+GROUP_PARALLELISM*0+lane]);
      XOREQ_BEST_BY(&r[roff+GROUP_PARALLELISM*3+lane],&r[roff+GROUP_PARALLELISM*0+lane]);
      XOREQ_BEST_BY(&r[roff+GROUP_PARALLELISM*2+lane],&r[roff+GROUP_PARALLELISM*0+lane]);
    }
  }

  // store the results back in element order
#define FASTTRASP2
#ifdef FASTTRASP2
  trasp_8_N((unsigned char *)&r[roff],(unsigned char *)bd,count);
#else
  for(lane=0;lane<count;lane++){
    int row;
    for(row=0;row<8;row++){
      bd[8*lane+row]=r[roff+GROUP_PARALLELISM*row+lane];
    }
  }
#endif
}
460 
461 //-----------------------------------EXTERNAL INTERFACE
462 
463 
464 //-----set control words
465 
schedule_key(struct csa_key_t * key,const unsigned char * pk)466 static void schedule_key(struct csa_key_t *key, const unsigned char *pk){
467   // could be made faster, but is not run often
468   int bi,by;
469   int i,j;
470 // key
471   memcpy(key->ck,pk,8);
472 // precalculations for stream
473   key_schedule_stream(key->ck,key->iA,key->iB);
474   for(by=0;by<8;by++){
475     for(bi=0;bi<8;bi++){
476       key->ck_g[by][bi]=(key->ck[by]&(1<<bi))?FF1():FF0();
477     }
478   }
479   for(by=0;by<8;by++){
480     for(bi=0;bi<4;bi++){
481       key->iA_g[by][bi]=(key->iA[by]&(1<<bi))?FF1():FF0();
482       key->iB_g[by][bi]=(key->iB[by]&(1<<bi))?FF1():FF0();
483     }
484   }
485 // precalculations for block
486   key_schedule_block(key->ck,key->kk);
487   for(i=0;i<56;i++){
488     for(j=0;j<BYTES_PER_BATCH;j++){
489       *(((unsigned char *)&key->kkmulti[i])+j)=key->kk[i];
490     }
491   }
492 }
493 
494 extern void FUNC(set_control_words)(void *keys, const unsigned char *ev, const unsigned char *od);
495 
FUNC(set_control_words)496 void FUNC(set_control_words)(void *keys, const unsigned char *ev, const unsigned char *od)
497 {
498   schedule_key(&((struct csa_keys_t *)keys)->even,ev);
499   schedule_key(&((struct csa_keys_t *)keys)->odd,od);
500 }
501 
502 extern void FUNC(set_even_control_word)(void *keys, const unsigned char *pk);
503 
FUNC(set_even_control_word)504 void FUNC(set_even_control_word)(void *keys, const unsigned char *pk)
505 {
506   schedule_key(&((struct csa_keys_t *)keys)->even,pk);
507 }
508 
509 extern void FUNC(set_odd_control_word)(void *keys, const unsigned char *pk);
510 
FUNC(set_odd_control_word)511 void FUNC(set_odd_control_word)(void *keys, const unsigned char *pk){
512   schedule_key(&((struct csa_keys_t *)keys)->odd,pk);
513 }
514 
515 //-----get internal parallelism
516 
517 extern int FUNC(get_internal_parallelism)(void);
518 
FUNC(get_internal_parallelism)519 int FUNC(get_internal_parallelism)(void)
520 {
521   return GROUP_PARALLELISM;
522 }
523 
524 //-----get suggested cluster size
525 
526 extern int FUNC(get_suggested_cluster_size)(void);
527 
FUNC(get_suggested_cluster_size)528 int FUNC(get_suggested_cluster_size)(void)
529 {
530   int r;
531   r=GROUP_PARALLELISM+GROUP_PARALLELISM/10;
532   if(r<GROUP_PARALLELISM+5) r=GROUP_PARALLELISM+5;
533   return r;
534 }
535 
536 //-----key structure
537 
538 extern void *FUNC(get_key_struct)(void);
FUNC(get_key_struct)539 void *FUNC(get_key_struct)(void)
540 {
541   struct csa_keys_t *keys=(struct csa_keys_t *)MALLOC(sizeof(struct csa_keys_t));
542   if(keys) {
543     static const unsigned char pk[8] = { 0,0,0,0,0,0,0,0 };
544     FUNC(set_control_words)(keys,pk,pk);
545     }
546   return keys;
547 }
548 
549 extern void FUNC(free_key_struct)(void *keys);
FUNC(free_key_struct)550 void FUNC(free_key_struct)(void *keys)
551 {
552   return FREE(keys);
553 }
554 
555 
556 
557 //-----get control words
#if 0
/* Disabled: copy the raw 8-byte even and odd control words out of `keys`. */
void get_control_words(void *keys, unsigned char *even, unsigned char *odd){
  struct csa_keys_t *pair=(struct csa_keys_t *)keys;

  memcpy(even,&pair->even.ck,8);
  memcpy(odd,&pair->odd.ck,8);
}
#endif
564 
565 //----- decrypt
566 
567 extern int FUNC(decrypt_packets)(void *keys, unsigned char **cluster);
FUNC(decrypt_packets)568 int FUNC(decrypt_packets)(void *keys, unsigned char **cluster)
569 {
570   // statistics, currently unused
571   int stat_no_scramble=0;
572   int stat_reserved=0;
573   int stat_decrypted[2]={0,0};
574   int stat_decrypted_mini=0;
575   unsigned char **clst;
576   unsigned char **clst2;
577   int grouped;
578   int group_ev_od;
579   int advanced;
580   int can_advance;
581   unsigned char *g_pkt[GROUP_PARALLELISM];
582   int g_len[GROUP_PARALLELISM];
583   int g_offset[GROUP_PARALLELISM];
584   int g_n[GROUP_PARALLELISM];
585   int g_residue[GROUP_PARALLELISM];
586   unsigned char *pkt;
587   int xc0,ev_od,len,offset,n,residue;
588   struct csa_key_t* k;
589   int i,j,iter,g;
590   int t23,tsmall;
591   int alive[24];
592 //icc craziness  int pad1=0; //////////align! FIXME
593   unsigned char *encp[GROUP_PARALLELISM];
594   MEMALIGN unsigned char stream_in[GROUP_PARALLELISM*8];
595   MEMALIGN unsigned char stream_out[GROUP_PARALLELISM*8];
596   MEMALIGN unsigned char ib[GROUP_PARALLELISM*8];
597   MEMALIGN unsigned char block_out[GROUP_PARALLELISM*8];
598   struct stream_regs regs;
599 
600 //icc craziness  i=(int)&pad1;//////////align!!! FIXME
601 
602   // build a list of packets to be processed
603   clst=cluster;
604   grouped=0;
605   advanced=0;
606   can_advance=1;
607   group_ev_od=-1; // silence incorrect compiler warning
608   pkt=*clst;
609   do{ // find a new packet
610     if(grouped==GROUP_PARALLELISM){
611       // full
612       break;
613     }
614     if(pkt==NULL){
615       // no more ranges
616       break;
617     }
618     if(pkt>=*(clst+1)){
619       // out of this range, try next
620       clst++;clst++;
621       pkt=*clst;
622       continue;
623     }
624 
625     do{ // handle this packet
626       xc0=pkt[3]&0xc0;
627       DBG(fprintf(stderr,"   exam pkt=%p, xc0=%02x, can_adv=%i\n",pkt,xc0,can_advance));
628       if(xc0==0x00){
629         DBG(fprintf(stderr,"skip clear pkt %p (can_advance is %i)\n",pkt,can_advance));
630         advanced+=can_advance;
631         stat_no_scramble++;
632         break;
633       }
634       if(xc0==0x40){
635         DBG(fprintf(stderr,"skip reserved pkt %p (can_advance is %i)\n",pkt,can_advance));
636         advanced+=can_advance;
637         stat_reserved++;
638         break;
639       }
640       if(xc0==0x80||xc0==0xc0){ // encrypted
641         ev_od=(xc0&0x40)>>6; // 0 even, 1 odd
642         if(grouped==0) group_ev_od=ev_od; // this group will be all even (or odd)
643         if(group_ev_od==ev_od){ // could be added to group
644           pkt[3]&=0x3f;  // consider it decrypted now
645           if(pkt[3]&0x20){ // incomplete packet
646             offset=4+pkt[4]+1;
647             len=188-offset;
648             n=len>>3;
649             residue=len-(n<<3);
650             if(n==0){ // decrypted==encrypted!
651               DBG(fprintf(stderr,"DECRYPTED MINI! (can_advance is %i)\n",can_advance));
652               advanced+=can_advance;
653               stat_decrypted_mini++;
654               break; // this doesn't need more processing
655             }
656           }else{
657             len=184;
658             offset=4;
659             n=23;
660             residue=0;
661           }
662           g_pkt[grouped]=pkt;
663           g_len[grouped]=len;
664           g_offset[grouped]=offset;
665           g_n[grouped]=n;
666           g_residue[grouped]=residue;
667           DBG(fprintf(stderr,"%2i: eo=%i pkt=%p len=%03i n=%2i residue=%i\n",grouped,ev_od,pkt,len,n,residue));
668           grouped++;
669           advanced+=can_advance;
670           stat_decrypted[ev_od]++;
671         }
672         else{
673           can_advance=0;
674           DBG(fprintf(stderr,"skip pkt %p and can_advance set to 0\n",pkt));
675           break; // skip and go on
676         }
677       }
678     } while(0);
679 
680     if(can_advance){
681       // move range start forward
682       *clst+=188;
683     }
684     // next packet, if there is one
685     pkt+=188;
686   } while(1);
687   DBG(fprintf(stderr,"-- result: grouped %i pkts, advanced %i pkts\n",grouped,advanced));
688 
689   // delete empty ranges and compact list
690   clst2=cluster;
691   for(clst=cluster;*clst!=NULL;clst+=2){
692     // if not empty
693     if(*clst<*(clst+1)){
694       // it will remain
695       *clst2=*clst;
696       *(clst2+1)=*(clst+1);
697       clst2+=2;
698     }
699   }
700   *clst2=NULL;
701 
702   if(grouped==0){
703     // no processing needed
704     return advanced;
705   }
706 
707   //  sort them, longest payload first
708   //  we expect many n=23 packets and a few n<23
709   DBG(fprintf(stderr,"PRESORTING\n"));
710   for(i=0;i<grouped;i++){
711     DBG(fprintf(stderr,"%2i of %2i: pkt=%p len=%03i n=%2i residue=%i\n",i,grouped,g_pkt[i],g_len[i],g_n[i],g_residue[i]));
712     }
713   // grouped is always <= GROUP_PARALLELISM
714 
715 #define g_swap(a,b) \
716     pkt=g_pkt[a]; \
717     g_pkt[a]=g_pkt[b]; \
718     g_pkt[b]=pkt; \
719 \
720     len=g_len[a]; \
721     g_len[a]=g_len[b]; \
722     g_len[b]=len; \
723 \
724     offset=g_offset[a]; \
725     g_offset[a]=g_offset[b]; \
726     g_offset[b]=offset; \
727 \
728     n=g_n[a]; \
729     g_n[a]=g_n[b]; \
730     g_n[b]=n; \
731 \
732     residue=g_residue[a]; \
733     g_residue[a]=g_residue[b]; \
734     g_residue[b]=residue;
735 
736   // step 1: move n=23 packets before small packets
737   t23=0;
738   tsmall=grouped-1;
739   for(;;){
740     for(;t23<grouped;t23++){
741       if(g_n[t23]!=23) break;
742     }
743 DBG(fprintf(stderr,"t23 after for =%i\n",t23));
744 
745     for(;tsmall>=0;tsmall--){
746       if(g_n[tsmall]==23) break;
747     }
748 DBG(fprintf(stderr,"tsmall after for =%i\n",tsmall));
749 
750     if(tsmall-t23<1) break;
751 
752 DBG(fprintf(stderr,"swap t23=%i,tsmall=%i\n",t23,tsmall));
753 
754     g_swap(t23,tsmall);
755 
756     t23++;
757     tsmall--;
758 DBG(fprintf(stderr,"new t23=%i,tsmall=%i\n\n",t23,tsmall));
759   }
760   DBG(fprintf(stderr,"packets with n=23, t23=%i   grouped=%i\n",t23,grouped));
761   DBG(fprintf(stderr,"MIDSORTING\n"));
762   for(i=0;i<grouped;i++){
763     DBG(fprintf(stderr,"%2i of %2i: pkt=%p len=%03i n=%2i residue=%i\n",i,grouped,g_pkt[i],g_len[i],g_n[i],g_residue[i]));
764     }
765 
766   // step 2: sort small packets in decreasing order of n (bubble sort is enough)
767   for(i=t23;i<grouped;i++){
768     for(j=i+1;j<grouped;j++){
769       if(g_n[j]>g_n[i]){
770         g_swap(i,j);
771       }
772     }
773   }
774   DBG(fprintf(stderr,"POSTSORTING\n"));
775   for(i=0;i<grouped;i++){
776     DBG(fprintf(stderr,"%2i of %2i: pkt=%p len=%03i n=%2i residue=%i\n",i,grouped,g_pkt[i],g_len[i],g_n[i],g_residue[i]));
777     }
778 
779   // we need to know how many packets need 23 iterations, how many 22...
780   for(i=0;i<=23;i++){
781     alive[i]=0;
782   }
783   // count
784   alive[23-1]=t23;
785   for(i=t23;i<grouped;i++){
786     alive[g_n[i]-1]++;
787   }
788   // integrate
789   for(i=22;i>=0;i--){
790     alive[i]+=alive[i+1];
791   }
792   DBG(fprintf(stderr,"ALIVE\n"));
793   for(i=0;i<=23;i++){
794     DBG(fprintf(stderr,"alive%2i=%i\n",i,alive[i]));
795     }
796 
797   // choose key
798   if(group_ev_od==0){
799     k=&((struct csa_keys_t *)keys)->even;
800   }
801   else{
802     k=&((struct csa_keys_t *)keys)->odd;
803   }
804 
805   //INIT
806 //#define INITIALIZE_UNUSED_INPUT
807 #ifdef INITIALIZE_UNUSED_INPUT
808 // unnecessary zeroing.
809 // without this, we operate on uninitialized memory
810 // when grouped<GROUP_PARALLELISM, but it's not a problem,
811 // as final results will be discarded.
812 // random data makes debugging sessions difficult.
813   for(j=0;j<GROUP_PARALLELISM*8;j++) stream_in[j]=0;
814 DBG(fprintf(stderr,"--- WARNING: you could gain speed by not initializing unused memory ---\n"));
815 #else
816 DBG(fprintf(stderr,"--- WARNING: DEBUGGING IS MORE DIFFICULT WHEN PROCESSING RANDOM DATA CHANGING AT EVERY RUN! ---\n"));
817 #endif
818 
819   for(g=0;g<grouped;g++){
820     encp[g]=g_pkt[g];
821     DBG(fprintf(stderr,"header[%i]=%p (%02x)\n",g,encp[g],*(encp[g])));
822     encp[g]+=g_offset[g]; // skip header
823     FFTABLEIN(stream_in,g,encp[g]);
824   }
825 //dump_mem("stream_in",stream_in,GROUP_PARALLELISM*8,BYPG);
826 
827 
828   // ITER 0
829 DBG(fprintf(stderr,">>>>>ITER 0\n"));
830   iter=0;
831   stream_cypher_group_init(&regs,k->iA_g,k->iB_g,stream_in);
832   // fill first ib
833   for(g=0;g<alive[iter];g++){
834     COPY_8_BY(ib+8*g,encp[g]);
835   }
836 DBG(dump_mem("IB ",ib,8*alive[iter],8));
837   // ITER 1..N-1
838   for (iter=1;iter<23&&alive[iter-1]>0;iter++){
839 DBG(fprintf(stderr,">>>>>ITER %i\n",iter));
840     // alive and just dead packets: calc block
841     block_decypher_group(k->kkmulti,ib,block_out,alive[iter-1]);
842 DBG(dump_mem("BLO_ib ",block_out,8*alive[iter-1],8));
843     // all packets (dead too): calc stream
844     stream_cypher_group_normal(&regs,stream_out);
845 //dump_mem("stream_out",stream_out,GROUP_PARALLELISM*8,BYPG);
846 
847     // alive packets: calc ib
848     for(g=0;g<alive[iter];g++){
849       FFTABLEOUT(ib+8*g,stream_out,g);
850 DBG(dump_mem("stream_out_ib ",ib+8*g,8,8));
851 // XOREQ8BY gcc bug? 2x4 ok, 8 ko    UPDATE: result ok but speed 1-2% slower (!!!???)
852 #if 1
853       XOREQ_4_BY(ib+8*g,encp[g]+8);
854       XOREQ_4_BY(ib+8*g+4,encp[g]+8+4);
855 #else
856       XOREQ_8_BY(ib+8*g,encp[g]+8);
857 #endif
858 DBG(dump_mem("after_stream_xor_ib ",ib+8*g,8,8));
859     }
860     // alive packets: decrypt data
861     for(g=0;g<alive[iter];g++){
862 DBG(dump_mem("before_ib_decrypt_data ",encp[g],8,8));
863       XOR_8_BY(encp[g],ib+8*g,block_out+8*g);
864 DBG(dump_mem("after_ib_decrypt_data ",encp[g],8,8));
865     }
866     // just dead packets: write decrypted data
867     for(g=alive[iter];g<alive[iter-1];g++){
868 DBG(dump_mem("jd_before_ib_decrypt_data ",encp[g],8,8));
869       COPY_8_BY(encp[g],block_out+8*g);
870 DBG(dump_mem("jd_after_ib_decrypt_data ",encp[g],8,8));
871     }
872     // just dead packets: decrypt residue
873     for(g=alive[iter];g<alive[iter-1];g++){
874 DBG(dump_mem("jd_before_decrypt_residue ",encp[g]+8,g_residue[g],g_residue[g]));
875       FFTABLEOUTXORNBY(g_residue[g],encp[g]+8,stream_out,g);
876 DBG(dump_mem("jd_after_decrypt_residue ",encp[g]+8,g_residue[g],g_residue[g]));
877     }
878     // alive packets: pointers++
879     for(g=0;g<alive[iter];g++) encp[g]+=8;
880   };
881   // ITER N
882 DBG(fprintf(stderr,">>>>>ITER 23\n"));
883   iter=23;
884   // calc block
885   block_decypher_group(k->kkmulti,ib,block_out,alive[iter-1]);
886 DBG(dump_mem("23BLO_ib ",block_out,8*alive[iter-1],8));
887   // just dead packets: write decrypted data
888   for(g=alive[iter];g<alive[iter-1];g++){
889 DBG(dump_mem("23jd_before_ib_decrypt_data ",encp[g],8,8));
890     COPY_8_BY(encp[g],block_out+8*g);
891 DBG(dump_mem("23jd_after_ib_decrypt_data ",encp[g],8,8));
892   }
893   // no residue possible
894   // so do nothing
895 
896   DBG(fprintf(stderr,"returning advanced=%i\n",advanced));
897 
898   M_EMPTY(); // restore CPU multimedia state
899 
900   return advanced;
901 }
902