1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #include "U2Bits.h"
23 
24 #include <U2Core/U2OpStatus.h>
25 
26 namespace U2 {
27 
28 // static int getLenBits(int len) {
29 //     return len == 0 ? 3 : len < 0XFF ? 0 : len < 0xFFFF ? 1 : 2;
30 // }
getLenBitsSize(int len)31 static int getLenBitsSize(int len) {
32     return len == 0 ? 0 : len < 0XFF ? 8
33                       : len < 0xFFFF ? 16
34                                      : 32;
35 }
36 
writeLength(uchar * bits,int len,int lenBitsLen)37 static void writeLength(uchar *bits, int len, int lenBitsLen) {
38     if (lenBitsLen == 8) {  // 00
39         U2Bits::writeInt8(bits, 2, (qint8)len);
40     } else if (lenBitsLen == 16) {  // 01
41         U2Bits::setBit(bits, 0);
42         U2Bits::writeInt16(bits, 2, (qint16)len);
43     } else if (lenBitsLen == 32) {  // 10
44         U2Bits::setBit(bits, 1);
45         U2Bits::writeInt16(bits, 2, (qint32)len);
46     } else {
47         U2Bits::setBit(bits, 0);  // empty length => both bits set: 11
48         U2Bits::setBit(bits, 1);
49     }
50 }
51 
readLength(const uchar * bits,int & nBits)52 static int readLength(const uchar *bits, int &nBits) {
53     bool b0 = U2Bits::getBit(bits, 0);
54     bool b1 = U2Bits::getBit(bits, 1);
55     if (b0 == b1) {
56         if (!b0) {
57             nBits = 8;
58             return U2Bits::readInt8(bits, 2);
59         }
60         nBits = 0;
61         return 0;
62     } else if (b0) {
63         nBits = 16;
64         return U2Bits::readInt16(bits, 2);
65     } else {
66         nBits = 32;
67         return U2Bits::readInt32(bits, 2);
68     }
69 }
70 
compress(const char * text,int len,int alphabetSize,const int * alphabetCharNums,U2OpStatus & os)71 QByteArray U2BitCompression::compress(const char *text, int len, int alphabetSize, const int *alphabetCharNums, U2OpStatus &os) {
72     // algorithm:
73     // 1. compute chars freq -> derive number of bits per char
74     // 2. assign bit masks per char. Do not assign any bit masks for non-used alphabet chars
75     // 3. compress chars.
76     // 4. create header with used char mask
77     // Result bits [len type][len][used alpha bits][compressed text]
78     //  where [len type] is a type of length field: 00 -> empty, 01 -> 8 byte, 10 -> 16 bytes, 11 -> 32 bytes
79     //  [len] - length of the result sequence
80     //  [used alpha bits] bit is set if alpha char is used in the text.
81     //  [compressed text] the data in compressed form
82 
83     assert(alphabetSize <= 32);  // avoid this check in runtime -> use this method correctly
84 
85     // find all used chars in text
86     QVector<bool> visitVector(alphabetSize, false);
87     bool *visited = visitVector.data();
88     for (int i = 0; i < len; i++) {
89         uchar c = text[i];
90         int n = alphabetCharNums[c];
91         if (n == -1) {
92             os.setError(tr("Bit compression: illegal character in text '%1'").arg(char(c)));
93             return QByteArray();
94         }
95         if (!visited[n]) {
96             visited[n] = true;
97         }
98     }
99 
100     // assign sequential bit-mask for all used chars
101     QVector<uchar> maskVector(alphabetSize, 0);
102     uchar *mask = maskVector.data();
103     uchar visitedCharBitMask = 0;
104     for (int i = 0; i < alphabetSize; i++) {
105         if (visited[i]) {
106             mask[i] = visitedCharBitMask;
107             visitedCharBitMask++;
108         }
109     }
110     // store header and data to bit set
111     int bitsPerChar = U2Bits::getNumberOfBitsPerChar(visitedCharBitMask);
112     int compressedBitSize = len * bitsPerChar;
113     int lenBits = getLenBitsSize(len);
114     int headerSizeBits = 2 + lenBits + alphabetSize;
115     int resultSizeBits = headerSizeBits + compressedBitSize;
116     QByteArray bitSet = U2Bits::allocateBits(resultSizeBits);
117     uchar *bits = (uchar *)bitSet.data();
118     writeLength(bits, len, lenBits);
119     int pos = 2 + lenBits;
120     for (; pos < alphabetSize; pos++) {
121         if (visited[pos]) {
122             U2Bits::setBit(bits, pos);
123         }
124     }
125     for (int i = 0; i < len; i++, pos += bitsPerChar) {
126         uchar c = text[i];
127         int n = alphabetCharNums[c];
128         uchar alphabetCharBitMask = mask[n];
129         U2Bits::setBits(bits, pos, &alphabetCharBitMask, bitsPerChar);
130     }
131     return bitSet;
132 }
133 
uncompress(const char * data,const QByteArray & alphabetChars,U2OpStatus &)134 QByteArray U2BitCompression::uncompress(const char *data, const QByteArray &alphabetChars, U2OpStatus &) {
135     // algorithm
136     // 1. Derive all chars from header
137     // 2. Assign bit masks per chars that have signed bit in header
138     // 3. Unpack value
139 
140     int alphabetSize = alphabetChars.size();
141     const char *aChars = alphabetChars.data();
142     const uchar *bits = (const uchar *)data;
143 
144     int alphaMaskOffset = 0;
145     int len = readLength(bits, alphaMaskOffset);
146 
147     // restore bit masks
148     QVector<bool> visitVector(alphabetSize, false);
149     bool *visited = visitVector.data();
150     int nChars = 0;
151     for (int i = 0; i < alphabetSize; i++) {
152         if (U2Bits::getBit(bits, i + alphaMaskOffset)) {
153             visited[i] = true;
154             nChars++;
155         }
156     }
157     int bitsPerChar = U2Bits::getNumberOfBitsPerChar(nChars);
158 
159     QVector<char> mask2Char(nChars, 0);
160     uchar visitedCharBitMask = 0;
161     for (int i = 0; i < alphabetSize; i++) {
162         if (visited[i]) {
163             mask2Char[visitedCharBitMask] = aChars[i];
164             visitedCharBitMask++;
165         }
166     }
167     int pos = alphaMaskOffset + alphabetSize;
168     QByteArray result(len, Qt::Uninitialized);
169     char *res = result.data();
170     for (int i = 0; i < len; i++, pos += bitsPerChar) {
171         int bitsAsInt32 = U2Bits::bitsRange2Int32(bits, pos, bitsPerChar);
172         char c = mask2Char[bitsAsInt32];
173         assert(c != 0);
174         res[i] = c;
175     }
176     return result;
177 }
178 
179 //////////////////////////////////////////////////////////////////////////
180 // bits helper
181 
getNumberOfBitsPerChar(int nChars)182 int U2Bits::getNumberOfBitsPerChar(int nChars) {
183     int bitsPerChar = nChars <= 2 ? 1 : (nChars <= 4) ? 2
184                                     : (nChars <= 8)   ? 3
185                                     : (nChars <= 16)  ? 4
186                                                       : 5;
187     return bitsPerChar;
188 }
189 
allocateBits(int nBits)190 QByteArray U2Bits::allocateBits(int nBits) {
191     int nBytes = getNumberOfBytes(nBits);
192     return QByteArray(nBytes, Qt::Uninitialized);
193 }
194 
setBits(uchar * dstBits,int pos,const uchar * srcBits,int nBits)195 void U2Bits::setBits(uchar *dstBits, int pos, const uchar *srcBits, int nBits) {
196     // TODO: optimize
197     for (int i = 0; i < nBits; i++) {
198         bool val = getBit(srcBits, i);
199         setBit(dstBits, i + pos, val);
200     }
201 }
bitsRange2Int32(const uchar * bits,int pos,int len)202 int U2Bits::bitsRange2Int32(const uchar *bits, int pos, int len) {
203     // TODO: optimize
204     assert(len <= 32);
205     int res = 0;
206     for (int i = 0; i < len; i++) {
207         bool b = getBit(bits, pos + i);
208         if (b) {
209             res = res | (1 << i);
210         }
211     }
212     return res;
213 }
214 
readInt8(const uchar * bits,int pos)215 qint8 U2Bits::readInt8(const uchar *bits, int pos) {
216     int res = 0;
217     for (int i = 0; i < 8; i++) {
218         res = res << 1;
219         if (U2Bits::getBit(bits, pos + i)) {
220             res += 1;
221         }
222     }
223     return qint8(res);
224 }
225 
writeInt8(uchar * bits,int pos,qint8 val)226 void U2Bits::writeInt8(uchar *bits, int pos, qint8 val) {
227     const uchar *data = (const uchar *)&val;
228     for (int i = 0; i < 8; i++) {
229         if (U2Bits::getBit(data, i)) {
230             U2Bits::setBit(bits, pos + i);
231         } else {
232             U2Bits::clearBit(bits, pos + i);
233         }
234     }
235 }
236 
readInt16(const uchar * bits,int pos)237 qint8 U2Bits::readInt16(const uchar *bits, int pos) {
238     int res = (readInt8(bits, pos) << 8) + readInt8(bits, pos + 8);
239     return qint16(res);
240 }
241 
writeInt16(uchar * bits,int pos,qint16 val)242 void U2Bits::writeInt16(uchar *bits, int pos, qint16 val) {
243     writeInt8(bits, pos + 8, qint8(val));
244     writeInt8(bits, pos, qint8(val >> 8));
245 }
246 
readInt32(const uchar * bits,int pos)247 qint8 U2Bits::readInt32(const uchar *bits, int pos) {
248     int res = (readInt8(bits, pos) << 24) + (readInt8(bits, pos + 8) << 16) + (readInt8(bits, pos + 16) << 8) + readInt8(bits, pos + 24);
249     return qint16(res);
250 }
251 
writeInt32(uchar * bits,int pos,qint32 val)252 void U2Bits::writeInt32(uchar *bits, int pos, qint32 val) {
253     writeInt8(bits, pos + 24, qint8(val));
254     writeInt8(bits, pos + 16, qint8(val >> 8));
255     writeInt8(bits, pos + 8, qint8(val >> 16));
256     writeInt8(bits, pos, qint8(val >> 24));
257 }
258 
259 }  // namespace U2
260