1 /**
2 * UGENE - Integrated Bioinformatics Tools.
3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4 * http://ugene.net
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 * MA 02110-1301, USA.
20 */
21
22 #include "U2Bits.h"
23
24 #include <U2Core/U2OpStatus.h>
25
26 namespace U2 {
27
28 // static int getLenBits(int len) {
29 // return len == 0 ? 3 : len < 0XFF ? 0 : len < 0xFFFF ? 1 : 2;
30 // }
getLenBitsSize(int len)31 static int getLenBitsSize(int len) {
32 return len == 0 ? 0 : len < 0XFF ? 8
33 : len < 0xFFFF ? 16
34 : 32;
35 }
36
writeLength(uchar * bits,int len,int lenBitsLen)37 static void writeLength(uchar *bits, int len, int lenBitsLen) {
38 if (lenBitsLen == 8) { // 00
39 U2Bits::writeInt8(bits, 2, (qint8)len);
40 } else if (lenBitsLen == 16) { // 01
41 U2Bits::setBit(bits, 0);
42 U2Bits::writeInt16(bits, 2, (qint16)len);
43 } else if (lenBitsLen == 32) { // 10
44 U2Bits::setBit(bits, 1);
45 U2Bits::writeInt16(bits, 2, (qint32)len);
46 } else {
47 U2Bits::setBit(bits, 0); // empty length => both bits set: 11
48 U2Bits::setBit(bits, 1);
49 }
50 }
51
readLength(const uchar * bits,int & nBits)52 static int readLength(const uchar *bits, int &nBits) {
53 bool b0 = U2Bits::getBit(bits, 0);
54 bool b1 = U2Bits::getBit(bits, 1);
55 if (b0 == b1) {
56 if (!b0) {
57 nBits = 8;
58 return U2Bits::readInt8(bits, 2);
59 }
60 nBits = 0;
61 return 0;
62 } else if (b0) {
63 nBits = 16;
64 return U2Bits::readInt16(bits, 2);
65 } else {
66 nBits = 32;
67 return U2Bits::readInt32(bits, 2);
68 }
69 }
70
compress(const char * text,int len,int alphabetSize,const int * alphabetCharNums,U2OpStatus & os)71 QByteArray U2BitCompression::compress(const char *text, int len, int alphabetSize, const int *alphabetCharNums, U2OpStatus &os) {
72 // algorithm:
73 // 1. compute chars freq -> derive number of bits per char
74 // 2. assign bit masks per char. Do not assign any bit masks for non-used alphabet chars
75 // 3. compress chars.
76 // 4. create header with used char mask
77 // Result bits [len type][len][used alpha bits][compressed text]
78 // where [len type] is a type of length field: 00 -> empty, 01 -> 8 byte, 10 -> 16 bytes, 11 -> 32 bytes
79 // [len] - length of the result sequence
80 // [used alpha bits] bit is set if alpha char is used in the text.
81 // [compressed text] the data in compressed form
82
83 assert(alphabetSize <= 32); // avoid this check in runtime -> use this method correctly
84
85 // find all used chars in text
86 QVector<bool> visitVector(alphabetSize, false);
87 bool *visited = visitVector.data();
88 for (int i = 0; i < len; i++) {
89 uchar c = text[i];
90 int n = alphabetCharNums[c];
91 if (n == -1) {
92 os.setError(tr("Bit compression: illegal character in text '%1'").arg(char(c)));
93 return QByteArray();
94 }
95 if (!visited[n]) {
96 visited[n] = true;
97 }
98 }
99
100 // assign sequential bit-mask for all used chars
101 QVector<uchar> maskVector(alphabetSize, 0);
102 uchar *mask = maskVector.data();
103 uchar visitedCharBitMask = 0;
104 for (int i = 0; i < alphabetSize; i++) {
105 if (visited[i]) {
106 mask[i] = visitedCharBitMask;
107 visitedCharBitMask++;
108 }
109 }
110 // store header and data to bit set
111 int bitsPerChar = U2Bits::getNumberOfBitsPerChar(visitedCharBitMask);
112 int compressedBitSize = len * bitsPerChar;
113 int lenBits = getLenBitsSize(len);
114 int headerSizeBits = 2 + lenBits + alphabetSize;
115 int resultSizeBits = headerSizeBits + compressedBitSize;
116 QByteArray bitSet = U2Bits::allocateBits(resultSizeBits);
117 uchar *bits = (uchar *)bitSet.data();
118 writeLength(bits, len, lenBits);
119 int pos = 2 + lenBits;
120 for (; pos < alphabetSize; pos++) {
121 if (visited[pos]) {
122 U2Bits::setBit(bits, pos);
123 }
124 }
125 for (int i = 0; i < len; i++, pos += bitsPerChar) {
126 uchar c = text[i];
127 int n = alphabetCharNums[c];
128 uchar alphabetCharBitMask = mask[n];
129 U2Bits::setBits(bits, pos, &alphabetCharBitMask, bitsPerChar);
130 }
131 return bitSet;
132 }
133
uncompress(const char * data,const QByteArray & alphabetChars,U2OpStatus &)134 QByteArray U2BitCompression::uncompress(const char *data, const QByteArray &alphabetChars, U2OpStatus &) {
135 // algorithm
136 // 1. Derive all chars from header
137 // 2. Assign bit masks per chars that have signed bit in header
138 // 3. Unpack value
139
140 int alphabetSize = alphabetChars.size();
141 const char *aChars = alphabetChars.data();
142 const uchar *bits = (const uchar *)data;
143
144 int alphaMaskOffset = 0;
145 int len = readLength(bits, alphaMaskOffset);
146
147 // restore bit masks
148 QVector<bool> visitVector(alphabetSize, false);
149 bool *visited = visitVector.data();
150 int nChars = 0;
151 for (int i = 0; i < alphabetSize; i++) {
152 if (U2Bits::getBit(bits, i + alphaMaskOffset)) {
153 visited[i] = true;
154 nChars++;
155 }
156 }
157 int bitsPerChar = U2Bits::getNumberOfBitsPerChar(nChars);
158
159 QVector<char> mask2Char(nChars, 0);
160 uchar visitedCharBitMask = 0;
161 for (int i = 0; i < alphabetSize; i++) {
162 if (visited[i]) {
163 mask2Char[visitedCharBitMask] = aChars[i];
164 visitedCharBitMask++;
165 }
166 }
167 int pos = alphaMaskOffset + alphabetSize;
168 QByteArray result(len, Qt::Uninitialized);
169 char *res = result.data();
170 for (int i = 0; i < len; i++, pos += bitsPerChar) {
171 int bitsAsInt32 = U2Bits::bitsRange2Int32(bits, pos, bitsPerChar);
172 char c = mask2Char[bitsAsInt32];
173 assert(c != 0);
174 res[i] = c;
175 }
176 return result;
177 }
178
179 //////////////////////////////////////////////////////////////////////////
180 // bits helper
181
getNumberOfBitsPerChar(int nChars)182 int U2Bits::getNumberOfBitsPerChar(int nChars) {
183 int bitsPerChar = nChars <= 2 ? 1 : (nChars <= 4) ? 2
184 : (nChars <= 8) ? 3
185 : (nChars <= 16) ? 4
186 : 5;
187 return bitsPerChar;
188 }
189
allocateBits(int nBits)190 QByteArray U2Bits::allocateBits(int nBits) {
191 int nBytes = getNumberOfBytes(nBits);
192 return QByteArray(nBytes, Qt::Uninitialized);
193 }
194
setBits(uchar * dstBits,int pos,const uchar * srcBits,int nBits)195 void U2Bits::setBits(uchar *dstBits, int pos, const uchar *srcBits, int nBits) {
196 // TODO: optimize
197 for (int i = 0; i < nBits; i++) {
198 bool val = getBit(srcBits, i);
199 setBit(dstBits, i + pos, val);
200 }
201 }
bitsRange2Int32(const uchar * bits,int pos,int len)202 int U2Bits::bitsRange2Int32(const uchar *bits, int pos, int len) {
203 // TODO: optimize
204 assert(len <= 32);
205 int res = 0;
206 for (int i = 0; i < len; i++) {
207 bool b = getBit(bits, pos + i);
208 if (b) {
209 res = res | (1 << i);
210 }
211 }
212 return res;
213 }
214
readInt8(const uchar * bits,int pos)215 qint8 U2Bits::readInt8(const uchar *bits, int pos) {
216 int res = 0;
217 for (int i = 0; i < 8; i++) {
218 res = res << 1;
219 if (U2Bits::getBit(bits, pos + i)) {
220 res += 1;
221 }
222 }
223 return qint8(res);
224 }
225
writeInt8(uchar * bits,int pos,qint8 val)226 void U2Bits::writeInt8(uchar *bits, int pos, qint8 val) {
227 const uchar *data = (const uchar *)&val;
228 for (int i = 0; i < 8; i++) {
229 if (U2Bits::getBit(data, i)) {
230 U2Bits::setBit(bits, pos + i);
231 } else {
232 U2Bits::clearBit(bits, pos + i);
233 }
234 }
235 }
236
readInt16(const uchar * bits,int pos)237 qint8 U2Bits::readInt16(const uchar *bits, int pos) {
238 int res = (readInt8(bits, pos) << 8) + readInt8(bits, pos + 8);
239 return qint16(res);
240 }
241
writeInt16(uchar * bits,int pos,qint16 val)242 void U2Bits::writeInt16(uchar *bits, int pos, qint16 val) {
243 writeInt8(bits, pos + 8, qint8(val));
244 writeInt8(bits, pos, qint8(val >> 8));
245 }
246
readInt32(const uchar * bits,int pos)247 qint8 U2Bits::readInt32(const uchar *bits, int pos) {
248 int res = (readInt8(bits, pos) << 24) + (readInt8(bits, pos + 8) << 16) + (readInt8(bits, pos + 16) << 8) + readInt8(bits, pos + 24);
249 return qint16(res);
250 }
251
writeInt32(uchar * bits,int pos,qint32 val)252 void U2Bits::writeInt32(uchar *bits, int pos, qint32 val) {
253 writeInt8(bits, pos + 24, qint8(val));
254 writeInt8(bits, pos + 16, qint8(val >> 8));
255 writeInt8(bits, pos + 8, qint8(val >> 16));
256 writeInt8(bits, pos, qint8(val >> 24));
257 }
258
259 } // namespace U2
260