1 #include "rar.hpp"
2 
3 // We used "Screaming Fast Galois Field Arithmetic Using Intel SIMD
4 // Instructions" paper by James S. Plank, Kevin M. Greenan
5 // and Ethan L. Miller for fast SSE based multiplication.
// Also we are grateful to Artem Drobanov and Bulat Ziganshin
// for samples and ideas that allowed us to make the Reed-Solomon codec more efficient.
8 
RSCoder16()9 RSCoder16::RSCoder16()
10 {
11   Decoding=false;
12   ND=NR=NE=0;
13   ValidFlags=NULL;
14   MX=NULL;
15   DataLog=NULL;
16   DataLogSize=0;
17 
18   gfInit();
19 }
20 
21 
~RSCoder16()22 RSCoder16::~RSCoder16()
23 {
24   delete[] gfExp;
25   delete[] gfLog;
26   delete[] DataLog;
27   delete[] MX;
28   delete[] ValidFlags;
29 }
30 
31 
// Initialize logarithms and exponents Galois field tables.
// gfExp needs 4*gfSize+1 entries: gfLog[0] is defined as 2*gfSize below,
// so the largest possible sum of two logarithms is 4*gfSize, used when
// both multiplication operands are 0.
void RSCoder16::gfInit()
{
  gfExp=new uint[4*gfSize+1];
  gfLog=new uint[gfSize+1];

  // Walk the multiplicative group: E runs through all non-zero field
  // elements as powers of the generator, L is the matching logarithm.
  for (uint L=0,E=1;L<gfSize;L++)
  {
    gfLog[E]=L;
    gfExp[L]=E;
    gfExp[L+gfSize]=E;  // Duplicate the table to avoid gfExp overflow check.
    E<<=1;
    if (E>gfSize)
      E^=0x1100B; // Irreducible field-generator polynomial.
  }

  // log(0)+log(x) must be outside of usual log table, so we can set it
  // to 0 and avoid check for 0 in multiplication parameters.
  gfLog[0]= 2*gfSize;
  for (uint I=2*gfSize;I<=4*gfSize;I++) // Results for log(0)+log(x).
    gfExp[I]=0;
}
54 
55 
gfAdd(uint a,uint b)56 uint RSCoder16::gfAdd(uint a,uint b) // Addition in Galois field.
57 {
58   return a^b;
59 }
60 
61 
gfMul(uint a,uint b)62 uint RSCoder16::gfMul(uint a,uint b) // Multiplication in Galois field.
63 {
64   return gfExp[gfLog[a]+gfLog[b]];
65 }
66 
67 
gfInv(uint a)68 uint RSCoder16::gfInv(uint a) // Inverse element in Galois field.
69 {
70   return a==0 ? 0:gfExp[gfSize-gfLog[a]];
71 }
72 
73 
Init(uint DataCount,uint RecCount,bool * ValidityFlags)74 bool RSCoder16::Init(uint DataCount, uint RecCount, bool *ValidityFlags)
75 {
76   ND = DataCount;
77   NR = RecCount;
78   NE = 0;
79 
80   Decoding=ValidityFlags!=NULL;
81   if (Decoding)
82   {
83     delete[] ValidFlags;
84     ValidFlags=new bool[ND + NR];
85 
86     for (uint I = 0; I < ND + NR; I++)
87       ValidFlags[I]=ValidityFlags[I];
88     for (uint I = 0; I < ND; I++)
89       if (!ValidFlags[I])
90         NE++;
91     uint ValidECC=0;
92     for (uint I = ND; I < ND + NR; I++)
93       if (ValidFlags[I])
94         ValidECC++;
95     if (NE > ValidECC || NE == 0 || ValidECC == 0)
96       return false;
97   }
98   if (ND + NR > gfSize || NR > ND || ND == 0 || NR == 0)
99     return false;
100 
101   delete[] MX;
102   if (Decoding)
103   {
104     MX=new uint[NE * ND];
105     MakeDecoderMatrix();
106     InvertDecoderMatrix();
107   }
108   else
109   {
110     MX=new uint[NR * ND];
111     MakeEncoderMatrix();
112   }
113   return true;
114 }
115 
116 
MakeEncoderMatrix()117 void RSCoder16::MakeEncoderMatrix()
118 {
119   // Create Cauchy encoder generator matrix. Skip trivial "1" diagonal rows,
120   // which would just copy source data to destination.
121   for (uint I = 0; I < NR; I++)
122     for (uint J = 0; J < ND; J++)
123       MX[I * ND + J] = gfInv( gfAdd( (I+ND), J) );
124 }
125 
126 
MakeDecoderMatrix()127 void RSCoder16::MakeDecoderMatrix()
128 {
129   // Create Cauchy decoder matrix. Skip trivial rows matching valid data
130   // units and containing "1" on main diagonal. Such rows would just copy
131   // source data to destination and they have no real value for us.
132   // Include rows only for broken data units and replace them by first
133   // available valid recovery code rows.
134   for (uint Flag=0, R=ND, Dest=0; Flag < ND; Flag++)
135     if (!ValidFlags[Flag]) // For every broken data unit.
136     {
137       while (!ValidFlags[R]) // Find a valid recovery unit.
138         R++;
139       for (uint J = 0; J < ND; J++) // And place its row to matrix.
140         MX[Dest*ND + J] = gfInv( gfAdd(R,J) );
141       Dest++;
142       R++;
143     }
144 }
145 
146 
147 // Apply Gauss�Jordan elimination to find inverse of decoder matrix.
148 // We have the square NDxND matrix, but we do not store its trivial
149 // diagonal "1" rows matching valid data, so we work with NExND matrix.
150 // Our original Cauchy matrix does not contain 0, so we skip search
151 // for non-zero pivot.
InvertDecoderMatrix()152 void RSCoder16::InvertDecoderMatrix()
153 {
154   uint *MI=new uint[NE * ND]; // We'll create inverse matrix here.
155   memset(MI, 0, ND * NE * sizeof(*MI)); // Initialize to identity matrix.
156   for (uint Kr = 0, Kf = 0; Kr < NE; Kr++, Kf++)
157   {
158     while (ValidFlags[Kf]) // Skip trivial rows.
159       Kf++;
160     MI[Kr * ND + Kf] = 1;  // Set diagonal 1.
161   }
162 
163   // Kr is the number of row in our actual reduced NE x ND matrix,
164   // which does not contain trivial diagonal 1 rows.
165   // Kf is the number of row in full ND x ND matrix with all trivial rows
166   // included.
167   for (uint Kr = 0, Kf = 0; Kf < ND; Kr++, Kf++) // Select pivot row.
168   {
169     while (ValidFlags[Kf] && Kf < ND)
170     {
171       // Here we process trivial diagonal 1 rows matching valid data units.
172       // Their processing can be simplified comparing to usual rows.
173       // In full version of elimination we would set MX[I * ND + Kf] to zero
174       // after MI[..]^=, but we do not need it for matrix inversion.
175       for (uint I = 0; I < NE; I++)
176         MI[I * ND + Kf] ^= MX[I * ND + Kf];
177       Kf++;
178     }
179 
180     if (Kf == ND)
181       break;
182 
183     uint *MXk = MX + Kr * ND; // k-th row of main matrix.
184     uint *MIk = MI + Kr * ND; // k-th row of inversion matrix.
185 
186     uint PInv = gfInv( MXk[Kf] ); // Pivot inverse.
187     // Divide the pivot row by pivot, so pivot cell contains 1.
188     for (uint I = 0; I < ND; I++)
189     {
190       MXk[I] = gfMul( MXk[I], PInv );
191       MIk[I] = gfMul( MIk[I], PInv );
192     }
193 
194     for (uint I = 0; I < NE; I++)
195       if (I != Kr) // For all rows except containing the pivot cell.
196       {
197         // Apply Gaussian elimination Mij -= Mkj * Mik / pivot.
198         // Since pivot is already 1, it is reduced to Mij -= Mkj * Mik.
199         uint *MXi = MX + I * ND; // i-th row of main matrix.
200         uint *MIi = MI + I * ND; // i-th row of inversion matrix.
201         uint Mik = MXi[Kf]; // Cell in pivot position.
202         for (uint J = 0; J < ND; J++)
203         {
204           MXi[J] ^= gfMul(MXk[J] , Mik);
205           MIi[J] ^= gfMul(MIk[J] , Mik);
206         }
207       }
208   }
209 
210   // Copy data to main matrix.
211   for (uint I = 0; I < NE * ND; I++)
212     MX[I] = MI[I];
213 
214   delete[] MI;
215 }
216 
217 
#if 0
// Multiply matrix to data vector. When encoding, it contains data in Data
// and stores error correction codes in Out. When decoding it contains
// broken data followed by ECC in Data and stores recovered data to Out.
// We do not use this function now, everything is moved to UpdateECC.
// NOTE(review): ProcData is a gfSize-element stack array (256 KB with
// 32-bit uint) — another reason this reference implementation is disabled.
void RSCoder16::Process(const uint *Data, uint *Out)
{
  uint ProcData[gfSize];

  for (uint I = 0; I < ND; I++)
    ProcData[I]=Data[I];

  if (Decoding)
  {
    // Replace broken data units with first available valid recovery codes.
    // 'Data' array must contain recovery codes after data.
    for (uint I=0, R=ND, Dest=0; I < ND; I++)
      if (!ValidFlags[I]) // For every broken data unit.
      {
        while (!ValidFlags[R]) // Find a valid recovery unit.
          R++;
        ProcData[I]=Data[R];
        R++;
      }
  }

  // Multiply every stored matrix row by the prepared data vector.
  uint H=Decoding ? NE : NR;
  for (uint I = 0; I < H; I++)
  {
    uint R = 0; // Result of matrix row multiplication to data.

    uint *MXi=MX + I * ND;
    for (uint J = 0; J < ND; J++)
      R ^= gfMul(MXi[J], ProcData[J]);

    Out[I] = R;
  }
}
#endif
257 
258 
// We update ECC in blocks by applying every data block to all ECC blocks.
// This function applies one data block to one ECC block.
// NOTE(review): DataLog caching below is rebuilt only when ECCNum==0,
// so for a given data block the ECC blocks appear to be expected in order
// starting from ECC number 0 — confirm against callers.
void RSCoder16::UpdateECC(uint DataNum, uint ECCNum, const byte *Data, byte *ECC, size_t BlockSize)
{
  if (DataNum==0) // Init ECC data.
    memset(ECC, 0, BlockSize);

  // Decide whether we may read/write 16-bit words straight from buffers.
  bool DirectAccess;
#ifdef LITTLE_ENDIAN
  // We can access data and ECC directly if we have little endian 16 bit uint.
  DirectAccess=sizeof(ushort)==2;
#else
  DirectAccess=false;
#endif

#ifdef USE_SSE
  // Try the SIMD path first; it declines (returns false) on bad alignment
  // or missing SSSE3 support, in which case we fall through to scalar code.
  if (DirectAccess && SSE_UpdateECC(DataNum,ECCNum,Data,ECC,BlockSize))
    return;
#endif

  if (ECCNum==0)
  {
    // (Re)allocate and fill the cache of logarithms of the data words.
    // Only even indices of DataLog are used, matching byte offsets of the
    // 16-bit words; half the array is unused but indexing stays simple.
    if (DataLogSize!=BlockSize)
    {
      delete[] DataLog;
      DataLog=new uint[BlockSize];
      DataLogSize=BlockSize;

    }
    if (DirectAccess)
      for (size_t I=0; I<BlockSize; I+=2)
        DataLog[I] = gfLog[ *(ushort*)(Data+I) ];
    else
      for (size_t I=0; I<BlockSize; I+=2)
      {
        // Assemble the 16-bit word from bytes in little endian order.
        uint D=Data[I]+Data[I+1]*256;
        DataLog[I] = gfLog[ D ];
      }
  }

  // Logarithm of the matrix coefficient for this (data, ECC) pair.
  uint ML = gfLog[ MX[ECCNum * ND + DataNum] ];

  // ECC word ^= coefficient * data word, via exp(log M + log D).
  if (DirectAccess)
    for (size_t I=0; I<BlockSize; I+=2)
      *(ushort*)(ECC+I) ^= gfExp[ ML + DataLog[I] ];
  else
    for (size_t I=0; I<BlockSize; I+=2)
    {
      uint R=gfExp[ ML + DataLog[I] ];
      ECC[I]^=byte(R);
      ECC[I+1]^=byte(R/256);
    }
}
312 
313 
#ifdef USE_SSE
// Data and ECC addresses must be properly aligned for SSE.
// AVX2 did not provide a noticeable speed gain on i7-6700K here.
// Applies one data block to one ECC block using SSSE3 table-lookup
// multiplication (PSHUFB), processing 32 bytes (16 field elements) per
// iteration. Returns false if alignment or CPU features are insufficient,
// so the caller can fall back to the scalar path.
bool RSCoder16::SSE_UpdateECC(uint DataNum, uint ECCNum, const byte *Data, byte *ECC, size_t BlockSize)
{
  // Check data alignment and SSSE3 support.
  if ((size_t(Data) & (SSE_ALIGNMENT-1))!=0 || (size_t(ECC) & (SSE_ALIGNMENT-1))!=0 ||
      _SSE_Version<SSE_SSSE3)
    return false;

  // Matrix coefficient multiplying every word of this data block.
  uint M=MX[ECCNum * ND + DataNum];

  // Prepare tables containing products of M and 4, 8, 12, 16 bit length
  // numbers, which have 4 high bits in 0..15 range and other bits set to 0.
  // Store high and low bytes of resulting 16 bit product in separate tables.
  __m128i T0L,T1L,T2L,T3L; // Low byte tables.
  __m128i T0H,T1H,T2H,T3H; // High byte tables.

  for (uint I=0;I<16;I++)
  {
    ((byte *)&T0L)[I]=gfMul(I,M);
    ((byte *)&T0H)[I]=gfMul(I,M)>>8;
    ((byte *)&T1L)[I]=gfMul(I<<4,M);
    ((byte *)&T1H)[I]=gfMul(I<<4,M)>>8;
    ((byte *)&T2L)[I]=gfMul(I<<8,M);
    ((byte *)&T2H)[I]=gfMul(I<<8,M)>>8;
    ((byte *)&T3L)[I]=gfMul(I<<12,M);
    ((byte *)&T3H)[I]=gfMul(I<<12,M)>>8;
  }

  size_t Pos=0;

  __m128i LowByteMask=_mm_set1_epi16(0xff);     // 00ff00ff...00ff
  __m128i Low4Mask=_mm_set1_epi8(0xf);          // 0f0f0f0f...0f0f
  __m128i High4Mask=_mm_slli_epi16(Low4Mask,4); // f0f0f0f0...f0f0

  for (; Pos+2*sizeof(__m128i)<=BlockSize; Pos+=2*sizeof(__m128i))
  {
    // We process two 128 bit chunks of source data at once.
    __m128i *D=(__m128i *)(Data+Pos);

    // Place high bytes of both chunks to one variable and low bytes to
    // another, so we can use the table lookup multiplication for 16 values
    // 4 bit length each at once.
    __m128i HighBytes0=_mm_srli_epi16(D[0],8);
    __m128i LowBytes0=_mm_and_si128(D[0],LowByteMask);
    __m128i HighBytes1=_mm_srli_epi16(D[1],8);
    __m128i LowBytes1=_mm_and_si128(D[1],LowByteMask);
    __m128i HighBytes=_mm_packus_epi16(HighBytes0,HighBytes1);
    __m128i LowBytes=_mm_packus_epi16(LowBytes0,LowBytes1);

    // Multiply bits 0..3 of low bytes. Store low and high product bytes
    // separately in cumulative sum variables.
    __m128i LowBytesLow4=_mm_and_si128(LowBytes,Low4Mask);
    __m128i LowBytesMultSum=_mm_shuffle_epi8(T0L,LowBytesLow4);
    __m128i HighBytesMultSum=_mm_shuffle_epi8(T0H,LowBytesLow4);

    // Multiply bits 4..7 of low bytes. Store low and high product bytes separately.
    __m128i LowBytesHigh4=_mm_and_si128(LowBytes,High4Mask);
            LowBytesHigh4=_mm_srli_epi16(LowBytesHigh4,4);
    __m128i LowBytesHigh4MultLow=_mm_shuffle_epi8(T1L,LowBytesHigh4);
    __m128i LowBytesHigh4MultHigh=_mm_shuffle_epi8(T1H,LowBytesHigh4);

    // Add new product to existing sum, low and high bytes separately.
    // Addition in GF(2) is XOR.
    LowBytesMultSum=_mm_xor_si128(LowBytesMultSum,LowBytesHigh4MultLow);
    HighBytesMultSum=_mm_xor_si128(HighBytesMultSum,LowBytesHigh4MultHigh);

    // Multiply bits 0..3 of high bytes. Store low and high product bytes separately.
    __m128i HighBytesLow4=_mm_and_si128(HighBytes,Low4Mask);
    __m128i HighBytesLow4MultLow=_mm_shuffle_epi8(T2L,HighBytesLow4);
    __m128i HighBytesLow4MultHigh=_mm_shuffle_epi8(T2H,HighBytesLow4);

    // Add new product to existing sum, low and high bytes separately.
    LowBytesMultSum=_mm_xor_si128(LowBytesMultSum,HighBytesLow4MultLow);
    HighBytesMultSum=_mm_xor_si128(HighBytesMultSum,HighBytesLow4MultHigh);

    // Multiply bits 4..7 of high bytes. Store low and high product bytes separately.
    __m128i HighBytesHigh4=_mm_and_si128(HighBytes,High4Mask);
            HighBytesHigh4=_mm_srli_epi16(HighBytesHigh4,4);
    __m128i HighBytesHigh4MultLow=_mm_shuffle_epi8(T3L,HighBytesHigh4);
    __m128i HighBytesHigh4MultHigh=_mm_shuffle_epi8(T3H,HighBytesHigh4);

    // Add new product to existing sum, low and high bytes separately.
    LowBytesMultSum=_mm_xor_si128(LowBytesMultSum,HighBytesHigh4MultLow);
    HighBytesMultSum=_mm_xor_si128(HighBytesMultSum,HighBytesHigh4MultHigh);

    // Combine separate low and high cumulative sum bytes to 16-bit words.
    __m128i HighBytesHigh4Mult0=_mm_unpacklo_epi8(LowBytesMultSum,HighBytesMultSum);
    __m128i HighBytesHigh4Mult1=_mm_unpackhi_epi8(LowBytesMultSum,HighBytesMultSum);

    // Add result to ECC.
    __m128i *StoreECC=(__m128i *)(ECC+Pos);

    StoreECC[0]=_mm_xor_si128(StoreECC[0],HighBytesHigh4Mult0);
    StoreECC[1]=_mm_xor_si128(StoreECC[1],HighBytesHigh4Mult1);
  }

  // If we have non 128 bit aligned data in the end of block, process them
  // in a usual way. We cannot do the same in the beginning of block,
  // because Data and ECC can have different alignment offsets.
  for (; Pos<BlockSize; Pos+=2)
    *(ushort*)(ECC+Pos) ^= gfMul( M, *(ushort*)(Data+Pos) );

  return true;
}
#endif
420