1 /*
2 * Copyright (c) 2016,2017 Genome Research Ltd.
3 * Author(s): James Bonfield
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above
12 * copyright notice, this list of conditions and the following
13 * disclaimer in the documentation and/or other materials provided
14 * with the distribution.
15 *
16 * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17 * Institute nor the names of its contributors may be used to endorse
18 * or promote products derived from this software without specific
19 * prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 // cc -I.. -g -O3 tokenise_name3.c rANS_static4x16pr.c pooled_alloc.c -pthread -DTEST_TOKENISER
35
36 // As per tokenise_name2 but has the entropy encoder built in already,
37 // so we just have a single encode and decode binary. (WIP; mainly TODO)
38
39 // TODO
40 //
41 // - Is it better when encoding 1, 2, 3, 3, 4, 5, 5, 6, 7, 9, 9, 10 to encode
42 // this as a mixture of MATCH and DELTA ops, or as entirely as DELTA ops
43 // with some delta values being zero? I suspect the latter, but it is
44 // not implemented here. See "last_token_delta" comments in code.
45 //
46 // - Consider variable size string implementations.
47 // Pascal style strings (length + str),
48 // C style strings (nul terminated),
49 // Or split blocks: length block and string contents block.
50 //
51 // - Is this one token-block or many serialised token-blocks?
52 // A) Lots of different models but feeding one bit-buffer emitted to
53 // by the entropy encoder => one block (fqzcomp).
54 // B) Lots of different models each feeding their own bit-buffers
55 // => many blocks.
56 //
57 // - multiple integer types depending on size; 1, 2, 4 byte long.
58 //
59 // - Consider token choice for isalnum instead of isalpha. Sometimes better.
60 //
61 // - Consider token synchronisation (eg on matching chr symbols?) incase of
62 // variable number. Eg consider foo:0999, foo:1000, foo:1001 (the leading
63 // zero adds an extra token).
64 //
65 // - Optimisation of tokens. Eg:
66 // HS25_09827:2:2102:11274:80442#49
67 // HS25_09827:2:2109:12941:31311#49
68 //
69 // We'll have tokens for HS 25 _ 09827 : 2 : that are entirely <MATCH>
70 // after the initial token. These 7 tokens could be one ALPHA instead
71 // of 7 distinct tokens, with 1 MATCH instead of 7. This is both a speed
72 // improvement for decoding as well as a space saving (fewer token-blocks
73 // and associated overhead).
74 //
75 // - XOR. Like ALPHA, but used when previous symbol is ALPHA or XOR
76 // and string lengths match. Useful when names are similar, eg:
77 // the sequence in 07.names:
78 //
79 // @VP2-06:112:H7LNDMCVY:1:1105:26919:1172 1:N:0:ATTCAGAA+AGGAGAAG
80 // @VP2-06:112:H7LNDMCVY:1:1105:27100:1172 1:N:0:ATTCAGAA+AGGCGAAG
81 // @VP2-06:112:H7LNDMCVY:1:1105:27172:1172 1:N:0:ATTCAGAA+AGGCTAAG
82
83
84
85 #include <stdio.h>
86 #include <stdlib.h>
87 #include <string.h>
88 #include <sys/types.h>
89 #include <sys/stat.h>
90 #include <unistd.h>
91 #include <limits.h>
92 #include <ctype.h>
93 #include <assert.h>
94 #include <inttypes.h>
95 #include <math.h>
96 #include <fcntl.h>
97 #include <errno.h>
98 #include <time.h>
99
100 #include "io_lib/pooled_alloc.h"
101 #include "io_lib/rANS_static4x16.h"
102 #include "io_lib/tokenise_name3.h"
103
104 // 128 is insufficient for SAM names (max 256 bytes) as
105 // we may alternate a0a0a0a0a0 etc. However if we fail,
106 // we just give up and switch to another codec, so this
107 // isn't a serious limit. Maybe up to 256 to permit all
108 // SAM names?
109 #define MAX_TOKENS 128
110 #define MAX_TBLOCKS (MAX_TOKENS<<4)
111
112 // Number of names per block
113 #define MAX_NAMES 1000000
114
115 enum name_type {N_ERR = -1, N_TYPE = 0, N_ALPHA, N_CHAR, N_DZLEN, N_DIGITS0, N_DUP, N_DIFF,
116 N_DIGITS, N_D1, N_D2, N_D3, N_DDELTA, N_DDELTA0, N_MATCH, N_END,N_ALL};
117
118 char *types[]={"TYPE", "ALPHA", "CHAR", "DZLEN", "DIG0", "DUP", "DIFF",
119 "DIGITS", "", "", "", "DDELTA", "DDELTA0", "MATCH", "END"};
120
121 typedef struct trie trie_t;
122
123 typedef struct {
124 char *last_name;
125 int last_ntok;
126 enum name_type last_token_type[MAX_TOKENS];
127 int last_token_int[MAX_TOKENS];
128 int last_token_str[MAX_TOKENS];
129 //int last_token_delta[MAX_TOKENS];
130 } last_context;
131
132 typedef struct {
133 uint8_t *buf;
134 size_t buf_a, buf_l; // alloc and used length.
135 int tnum, ttype;
136 int dup_from;
137 } descriptor;
138
139 typedef struct {
140 last_context *lc;
141
142 // For finding entire line dups
143 int counter;
144
145 // Trie used in encoder only
146 trie_t *t_head;
147 pool_alloc_t *pool;
148
149 // token blocks
150 descriptor desc[MAX_TBLOCKS];
151 } name_context;
152
create_context(int max_names)153 name_context *create_context(int max_names) {
154 if (max_names <= 0)
155 return NULL;
156
157 name_context *ctx = malloc(sizeof(*ctx) + ++max_names*sizeof(*ctx->lc));
158 if (!ctx) return NULL;
159
160 ctx->counter = 0;
161 ctx->t_head = NULL;
162
163 ctx->lc = (last_context *)(((char *)ctx) + sizeof(*ctx));
164 ctx->pool = NULL;
165
166 memset(&ctx->desc[0], 0, MAX_TBLOCKS * sizeof(ctx->desc[0]));
167
168 return ctx;
169 }
170
171 void free_trie(trie_t *t);
free_context(name_context * ctx)172 void free_context(name_context *ctx) {
173 if (!ctx)
174 return;
175
176 if (ctx->t_head)
177 free(ctx->t_head);
178 if (ctx->pool)
179 pool_destroy(ctx->pool);
180
181 free(ctx);
182 }
183
184 //-----------------------------------------------------------------------------
185 // Fast unsigned integer printing code.
186 // Returns number of bytes written.
append_uint32_fixed(char * cp,uint32_t i,uint8_t l)187 static int append_uint32_fixed(char *cp, uint32_t i, uint8_t l) {
188 switch (l) {
189 case 9:*cp++ = i / 100000000 + '0', i %= 100000000;
190 case 8:*cp++ = i / 10000000 + '0', i %= 10000000;
191 case 7:*cp++ = i / 1000000 + '0', i %= 1000000;
192 case 6:*cp++ = i / 100000 + '0', i %= 100000;
193 case 5:*cp++ = i / 10000 + '0', i %= 10000;
194 case 4:*cp++ = i / 1000 + '0', i %= 1000;
195 case 3:*cp++ = i / 100 + '0', i %= 100;
196 case 2:*cp++ = i / 10 + '0', i %= 10;
197 case 1:*cp++ = i + '0';
198 case 0:break;
199 }
200 return l;
201 }
202
append_uint32_var(char * cp,uint32_t i)203 static int append_uint32_var(char *cp, uint32_t i) {
204 char *op = cp;
205 uint32_t j;
206
207 //if (i < 10) goto b0;
208 if (i < 100) goto b1;
209 //if (i < 1000) goto b2;
210 if (i < 10000) goto b3;
211 //if (i < 100000) goto b4;
212 if (i < 1000000) goto b5;
213 //if (i < 10000000) goto b6;
214 if (i < 100000000) goto b7;
215
216 if ((j = i / 1000000000)) {*cp++ = j + '0'; i -= j*1000000000; goto x8;}
217 if ((j = i / 100000000)) {*cp++ = j + '0'; i -= j*100000000; goto x7;}
218 b7:if ((j = i / 10000000)) {*cp++ = j + '0'; i -= j*10000000; goto x6;}
219 if ((j = i / 1000000)) {*cp++ = j + '0', i -= j*1000000; goto x5;}
220 b5:if ((j = i / 100000)) {*cp++ = j + '0', i -= j*100000; goto x4;}
221 if ((j = i / 10000)) {*cp++ = j + '0', i -= j*10000; goto x3;}
222 b3:if ((j = i / 1000)) {*cp++ = j + '0', i -= j*1000; goto x2;}
223 if ((j = i / 100)) {*cp++ = j + '0', i -= j*100; goto x1;}
224 b1:if ((j = i / 10)) {*cp++ = j + '0', i -= j*10; goto x0;}
225 if (i) *cp++ = i + '0';
226 return cp-op;
227
228 x8:*cp++ = i / 100000000 + '0', i %= 100000000;
229 x7:*cp++ = i / 10000000 + '0', i %= 10000000;
230 x6:*cp++ = i / 1000000 + '0', i %= 1000000;
231 x5:*cp++ = i / 100000 + '0', i %= 100000;
232 x4:*cp++ = i / 10000 + '0', i %= 10000;
233 x3:*cp++ = i / 1000 + '0', i %= 1000;
234 x2:*cp++ = i / 100 + '0', i %= 100;
235 x1:*cp++ = i / 10 + '0', i %= 10;
236 x0:*cp++ = i + '0';
237
238 return cp-op;
239 }
240
241 //-----------------------------------------------------------------------------
242 // Simple variable sized unsigned integers
i7put(uint8_t * cp,uint64_t i)243 static int i7put(uint8_t *cp, uint64_t i) {
244 uint8_t *op = cp;
245 int s = 0;
246 uint64_t o = i;
247
248 do {
249 s += 7;
250 o >>= 7;
251 } while (o);
252
253 do {
254 s -= 7;
255 *cp++ = ((i>>s)&0x7f) + (s?128:0);
256 } while (s);
257
258 return cp-op;
259 }
260
i7get(uint8_t * cp,uint64_t * i)261 static int i7get(uint8_t *cp, uint64_t *i) {
262 uint8_t *op = cp, c;
263 uint64_t j = 0;
264
265 do {
266 c = *cp++;
267 j = (j<<7) | (c & 0x7f);
268 } while (c & 0x80);
269
270 *i = j;
271 return cp-op;
272 }
273
274 //-----------------------------------------------------------------------------
275 // Example descriptor encoding and IO.
276 //
277 // Here we just append to a buffer so we can dump out the results.
278 // These could then be passed through a static entropy encoder that
279 // encodes the entire buffer.
280 //
281 // Alternatively an adaptive entropy encoder could be place inline
282 // here to encode as it goes using additional knowledge from the
283 // supplied context.
284
285 // Ensure room for sz more bytes.
descriptor_grow(descriptor * fd,uint32_t sz)286 static int descriptor_grow(descriptor *fd, uint32_t sz) {
287 while (fd->buf_l + sz > fd->buf_a) {
288 size_t buf_a = fd->buf_a ? fd->buf_a*2 : 65536;
289 unsigned char *buf = realloc(fd->buf, buf_a);
290 if (!buf)
291 return -1;
292 fd->buf = buf;
293 fd->buf_a = buf_a;
294 }
295
296 return 0;
297 }
298
encode_token_type(name_context * ctx,int ntok,enum name_type type)299 static int encode_token_type(name_context *ctx, int ntok,
300 enum name_type type) {
301 int id = ntok<<4;
302
303 if (descriptor_grow(&ctx->desc[id], 1) < 0) return -1;
304
305 ctx->desc[id].buf[ctx->desc[id].buf_l++] = type;
306
307 return 0;
308 }
309
encode_token_match(name_context * ctx,int ntok)310 static int encode_token_match(name_context *ctx, int ntok) {
311 return encode_token_type(ctx, ntok, N_MATCH);
312 }
313
encode_token_end(name_context * ctx,int ntok)314 static int encode_token_end(name_context *ctx, int ntok) {
315 return encode_token_type(ctx, ntok, N_END);
316 }
317
decode_token_type(name_context * ctx,int ntok)318 static enum name_type decode_token_type(name_context *ctx, int ntok) {
319 int id = ntok<<4;
320 if (ctx->desc[id].buf_l >= ctx->desc[id].buf_a) return -1;
321 return ctx->desc[id].buf[ctx->desc[id].buf_l++];
322 }
323
324 // int stored as 32-bit quantities
encode_token_int(name_context * ctx,int ntok,enum name_type type,uint32_t val)325 static int encode_token_int(name_context *ctx, int ntok,
326 enum name_type type, uint32_t val) {
327 int id = (ntok<<4) | type;
328
329 if (encode_token_type(ctx, ntok, type) < 0) return -1;
330 if (descriptor_grow(&ctx->desc[id], 4) < 0) return -1;
331
332 // Assumes little endian and unalign access OK.
333 *(uint32_t *)(ctx->desc[id].buf + ctx->desc[id].buf_l) = val;
334 ctx->desc[id].buf_l += 4;
335
336 return 0;
337 }
338
339 // Return 0 on success, -1 on failure;
decode_token_int(name_context * ctx,int ntok,enum name_type type,uint32_t * val)340 static int decode_token_int(name_context *ctx, int ntok,
341 enum name_type type, uint32_t *val) {
342 int id = (ntok<<4) | type;
343
344 if (ctx->desc[id].buf_l + 4 > ctx->desc[id].buf_a)
345 return -1;
346
347 // Assumes little endian and unalign access OK.
348 *val = *(uint32_t *)(ctx->desc[id].buf + ctx->desc[id].buf_l);
349 ctx->desc[id].buf_l += 4;
350
351 return 0;
352 }
353
354 // 8 bit integer quantity
encode_token_int1(name_context * ctx,int ntok,enum name_type type,uint32_t val)355 static int encode_token_int1(name_context *ctx, int ntok,
356 enum name_type type, uint32_t val) {
357 int id = (ntok<<4) | type;
358
359 if (encode_token_type(ctx, ntok, type) < 0) return -1;
360 if (descriptor_grow(&ctx->desc[id], 1) < 0) return -1;
361
362 ctx->desc[id].buf[ctx->desc[id].buf_l++] = val;
363
364 return 0;
365 }
366
encode_token_int1_(name_context * ctx,int ntok,enum name_type type,uint32_t val)367 static int encode_token_int1_(name_context *ctx, int ntok,
368 enum name_type type, uint32_t val) {
369 int id = (ntok<<4) | type;
370
371 if (descriptor_grow(&ctx->desc[id], 1) < 0) return -1;
372
373 ctx->desc[id].buf[ctx->desc[id].buf_l++] = val;
374
375 return 0;
376 }
377
378 // Return 0 on success, -1 on failure;
decode_token_int1(name_context * ctx,int ntok,enum name_type type,uint32_t * val)379 static int decode_token_int1(name_context *ctx, int ntok,
380 enum name_type type, uint32_t *val) {
381 int id = (ntok<<4) | type;
382
383 if (ctx->desc[id].buf_l >= ctx->desc[id].buf_a)
384 return -1;
385 *val = ctx->desc[id].buf[ctx->desc[id].buf_l++];
386
387 return 0;
388 }
389
390
391 // Basic C-string style for now.
392 //
393 // Maybe XOR with previous string as context?
394 // This permits partial match to be encoded efficiently.
encode_token_alpha(name_context * ctx,int ntok,char * str,int len)395 static int encode_token_alpha(name_context *ctx, int ntok,
396 char *str, int len) {
397 int id = (ntok<<4) | N_ALPHA;
398
399 if (encode_token_type(ctx, ntok, N_ALPHA) < 0) return -1;
400 if (descriptor_grow(&ctx->desc[id], len+1) < 0) return -1;
401 memcpy(&ctx->desc[id].buf[ctx->desc[id].buf_l], str, len);
402 ctx->desc[id].buf[ctx->desc[id].buf_l+len] = 0;
403 ctx->desc[id].buf_l += len+1;
404
405 return 0;
406 }
407
408 // FIXME: need limit on string length for security.
409 // Return length on success, -1 on failure;
decode_token_alpha(name_context * ctx,int ntok,char * str,int max_len)410 static int decode_token_alpha(name_context *ctx, int ntok, char *str, int max_len) {
411 int id = (ntok<<4) | N_ALPHA;
412 char c;
413 int len = 0;
414 if (ctx->desc[id].buf_l >= ctx->desc[id].buf_a)
415 return -1;
416 do {
417 c = ctx->desc[id].buf[ctx->desc[id].buf_l++];
418 str[len++] = c;
419 } while(c && len < max_len && ctx->desc[id].buf_l < ctx->desc[id].buf_a);
420
421 return len-1;
422 }
423
encode_token_char(name_context * ctx,int ntok,char c)424 static int encode_token_char(name_context *ctx, int ntok, char c) {
425 int id = (ntok<<4) | N_CHAR;
426
427 if (encode_token_type(ctx, ntok, N_CHAR) < 0) return -1;
428 if (descriptor_grow(&ctx->desc[id], 1) < 0) return -1;
429 ctx->desc[id].buf[ctx->desc[id].buf_l++] = c;
430
431 return 0;
432 }
433
434 // FIXME: need limit on string length for security
435 // Return length on success, -1 on failure;
decode_token_char(name_context * ctx,int ntok,char * str)436 static int decode_token_char(name_context *ctx, int ntok, char *str) {
437 int id = (ntok<<4) | N_CHAR;
438
439 if (ctx->desc[id].buf_l >= ctx->desc[id].buf_a)
440 return -1;
441 *str = ctx->desc[id].buf[ctx->desc[id].buf_l++];
442
443 return 1;
444 }
445
446
447 // A duplicated name
encode_token_dup(name_context * ctx,uint32_t val)448 static int encode_token_dup(name_context *ctx, uint32_t val) {
449 return encode_token_int(ctx, 0, N_DUP, val);
450 }
451
452 // Which read to delta against
encode_token_diff(name_context * ctx,uint32_t val)453 static int encode_token_diff(name_context *ctx, uint32_t val) {
454 return encode_token_int(ctx, 0, N_DIFF, val);
455 }
456
457
458 //-----------------------------------------------------------------------------
459 // Trie implementation for tracking common name prefixes.
460 struct trie {
461 char c;
462 int count;
463 //struct trie *next[128];
464 struct trie *next, *sibling;
465 int n; // Nth line
466 };
467
468 //static trie_t *t_head = NULL;
469
free_trie(trie_t * t)470 void free_trie(trie_t *t) {
471 trie_t *x, *n;
472 for (x = t->next; x; x = n) {
473 n = x->sibling;
474 free_trie(x);
475 }
476 free(t);
477 }
478
build_trie(name_context * ctx,char * data,size_t len,int n)479 int build_trie(name_context *ctx, char *data, size_t len, int n) {
480 int nlines = 0;
481 size_t i;
482 trie_t *t;
483
484 if (!ctx->t_head) {
485 ctx->t_head = calloc(1, sizeof(*ctx->t_head));
486 if (!ctx->t_head)
487 return -1;
488 }
489
490 // Build our trie, also counting input lines
491 for (nlines = i = 0; i < len; i++, nlines++) {
492 t = ctx->t_head;
493 t->count++;
494 while (i < len && data[i] > '\n') {
495 unsigned char c = data[i++];
496 if (c & 0x80)
497 //fprintf(stderr, "8-bit ASCII is unsupported\n");
498 abort();
499 c &= 127;
500
501
502 trie_t *x = t->next, *l = NULL;
503 while (x && x->c != c) {
504 l = x; x = x->sibling;
505 }
506 if (!x) {
507 if (!ctx->pool)
508 ctx->pool = pool_create(sizeof(trie_t));
509 if (!(x = (trie_t *)pool_alloc(ctx->pool)))
510 return -1;
511 memset(x, 0, sizeof(*x));
512 if (!l)
513 x = t->next = x;
514 else
515 x = l->sibling = x;
516 x->n = n;
517 x->c = c;
518 }
519 t = x;
520 t->c = c;
521 t->count++;
522 }
523 }
524
525 return 0;
526 }
527
528 #if 0
529 void dump_trie(trie_t *t, int depth) {
530 if (depth == 0) {
531 printf("graph x_%p {\n splines = ortho\n ranksep=2\n", t);
532 printf(" p_%p [label=\"\"];\n", t);
533 dump_trie(t, 1);
534 printf("}\n");
535 } else {
536 int j, k, count;//, cj;
537 char label[100], *cp;
538 trie_t *tp = t;
539
540 // patricia:
541 // for (count = j = 0; j < 128; j++)
542 // if (t->next[j])
543 // count++, cj=j;
544 //
545 // if (count == 1) {
546 // t = t->next[cj];
547 // *cp++ = cj;
548 // goto patricia;
549 // }
550
551 trie_t *x;
552 for (x = t->next; x; x = x->sibling) {
553 printf(" p_%p [label=\"%c\"];\n", x, x->c);
554 printf(" p_%p -- p_%p [label=\"%d\", penwidth=\"%f\"];\n", tp, x, x->count, MAX((log(x->count)-3)*2,1));
555 //if (depth <= 11)
556 dump_trie(x, depth+1);
557 }
558
559 #if 0
560 for (j = 0; j < 128; j++) {
561 trie_t *tn;
562
563 if (!t->next[j])
564 continue;
565
566 cp = label;
567 tn = t->next[j];
568 *cp++ = j;
569 // patricia:
570
571 for (count = k = 0; k < 128; k++)
572 if (tn->next[k])
573 count++;//, cj=k;
574
575 // if (count == 1) {
576 // tn = tn->next[cj];
577 // *cp++ = cj;
578 // goto patricia;
579 // }
580 *cp++ = 0;
581
582 printf(" p_%p [label=\"%s\"];\n", tn, label);
583 printf(" p_%p -- p_%p [label=\"%d\", penwidth=\"%f\"];\n", tp, tn, tn->count, MAX((log(tn->count)-3)*2,1));
584 if (depth <= 11)
585 dump_trie(tn, depth+1);
586 }
587 #endif
588 }
589 }
590 #endif
591
search_trie(name_context * ctx,char * data,size_t len,int n,int * exact,int * is_fixed,int * fixed_len)592 int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, int *is_fixed, int *fixed_len) {
593 int nlines = 0;
594 size_t i;
595 trie_t *t;
596 int from = -1, p3 = -1;
597
598 // Horrid hack for the encoder only.
599 // We optimise per known name format here.
600 int prefix_len;
601 char *d = *data == '@' ? data+1 : data;
602 int l = *data == '@' ? len-1 : len;
603 int f = (*data == '>') ? 1 : 0;
604 if (l > 70 && d[f+0] == 'm' && d[7] == '_' && d[f+14] == '_' && d[f+61] == '/') {
605 prefix_len = 60; // PacBio
606 *is_fixed = 0;
607 } else if (l == 17 && d[f+5] == ':' && d[f+11] == ':') {
608 prefix_len = 7; // IonTorrent
609 *fixed_len = 7;
610 *is_fixed = 1;
611 } else if (l > 37 && d[f+8] == '-' && d[f+13] == '-' && d[f+18] == '-' && d[f+23] == '-' &&
612 ((d[f+0] >= '0' && d[f+0] <='9') || (d[f+0] >= 'a' && d[f+0] <= 'f')) &&
613 ((d[f+35] >= '0' && d[f+35] <='9') || (d[f+35] >= 'a' && d[f+35] <= 'f'))) {
614 // ONT: f33d30d5-6eb8-4115-8f46-154c2620a5da_Basecall_1D_template...
615 prefix_len = 37;
616 *fixed_len = 37;
617 *is_fixed = 1;
618 } else {
619 // Anything else we give up on the trie method, but we still want to search
620 // for exact matches;
621 prefix_len = INT_MAX;
622 *is_fixed = 0;
623 }
624 //prefix_len = INT_MAX;
625
626 if (!ctx->t_head) {
627 ctx->t_head = calloc(1, sizeof(*ctx->t_head));
628 if (!ctx->t_head)
629 return -1;
630 }
631
632 // Find an item in the trie
633 for (nlines = i = 0; i < len; i++, nlines++) {
634 t = ctx->t_head;
635 while (i < len && data[i] > '\n') {
636 unsigned char c = data[i++];
637 if (c & 0x80)
638 //fprintf(stderr, "8-bit ASCII is unsupported\n");
639 abort();
640 c &= 127;
641
642 trie_t *x = t->next;
643 while (x && x->c != c)
644 x = x->sibling;
645 t = x;
646
647 // t = t->next[c];
648
649 // if (!t)
650 // return -1;
651
652 from = t->n;
653 if (i == prefix_len) p3 = t->n;
654 //if (t->count >= .0035*ctx->t_head->count && t->n != n) p3 = t->n; // pacbio
655 //if (i == 60) p3 = t->n; // pacbio
656 //if (i == 7) p3 = t->n; // iontorrent
657 t->n = n;
658 }
659 }
660
661 //printf("Looked for %d, found %d, prefix %d\n", n, from, p3);
662
663 *exact = (n != from);
664 return *exact ? from : p3;
665 }
666
667
668 //-----------------------------------------------------------------------------
669 // Name encoder
670
671 /*
672 * Tokenises a read name using ctx as context as the previous
673 * tokenisation.
674 *
675 * Parsed elements are then emitted for encoding by calling the
676 * encode_token() function with the context, token number (Nth token
677 * in line), token type and token value.
678 *
679 * Returns 0 on success;
680 * -1 on failure.
681 */
encode_name(name_context * ctx,char * name,int len,int mode)682 static int encode_name(name_context *ctx, char *name, int len, int mode) {
683 int i, is_fixed, fixed_len;
684
685 int exact;
686 int cnum = ctx->counter++;
687 int pnum = search_trie(ctx, name, len, cnum, &exact, &is_fixed, &fixed_len);
688 if (pnum < 0) pnum = cnum ? cnum-1 : 0;
689 //pnum = pnum & (MAX_NAMES-1);
690 //cnum = cnum & (MAX_NAMES-1);
691 //if (pnum == cnum) {pnum = cnum ? cnum-1 : 0;}
692 #ifdef ENC_DEBUG
693 fprintf(stderr, "%d: pnum=%d (%d), exact=%d\n%s\n%s\n",
694 ctx->counter, pnum, cnum-pnum, exact, ctx->lc[pnum].last_name, name);
695 #endif
696
697 // Return DUP or DIFF switch, plus the distance.
698 if (exact && len == strlen(ctx->lc[pnum].last_name)) {
699 encode_token_dup(ctx, cnum-pnum);
700 ctx->lc[cnum].last_name = name;
701 ctx->lc[cnum].last_ntok = ctx->lc[pnum].last_ntok;
702 // FIXME: optimise this
703 int nc = ctx->lc[cnum].last_ntok ? ctx->lc[cnum].last_ntok : MAX_TOKENS;
704 memcpy(ctx->lc[cnum].last_token_type, ctx->lc[pnum].last_token_type, nc * sizeof(int));
705 memcpy(ctx->lc[cnum].last_token_int , ctx->lc[pnum].last_token_int , nc * sizeof(int));
706 memcpy(ctx->lc[cnum].last_token_str , ctx->lc[pnum].last_token_str , nc * sizeof(int));
707 return 0;
708 }
709
710 encode_token_diff(ctx, cnum-pnum);
711
712 int ntok = 1;
713 i = 0;
714 if (is_fixed) {
715 if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last_token_type[ntok] == N_ALPHA) {
716 if (ctx->lc[pnum].last_token_int[ntok] == fixed_len && memcmp(name, ctx->lc[pnum].last_name, fixed_len) == 0) {
717 encode_token_match(ctx, ntok);
718 } else {
719 encode_token_alpha(ctx, ntok, name, fixed_len);
720 }
721 } else {
722 encode_token_alpha(ctx, ntok, name, fixed_len);
723 }
724 ctx->lc[cnum].last_token_int[ntok] = fixed_len;
725 ctx->lc[cnum].last_token_str[ntok] = 0;
726 ctx->lc[cnum].last_token_type[ntok++] = N_ALPHA;
727 i = fixed_len;
728 }
729
730 for (; i < len; i++) {
731 /* Determine data type of this segment */
732 if (isalpha(name[i])) {
733 int s = i+1;
734 // int S = i+1;
735
736 // // FIXME: try which of these is best. alnum is good sometimes.
737 // while (s < len && isalpha(name[s]))
738 while (s < len && (isalpha(name[s]) || ispunct(name[s])))
739 // while (s < len && name[s] != ':')
740 // while (s < len && !isdigit(name[s]) && name[s] != ':')
741 s++;
742
743 // if (!is_fixed) {
744 // while (S < len && isalnum(name[S]))
745 // S++;
746 // if (s < S)
747 // s = S;
748 // }
749
750 // Single byte strings are better encoded as chars.
751 if (s-i == 1) goto n_char;
752
753 if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last_token_type[ntok] == N_ALPHA) {
754 if (s-i == ctx->lc[pnum].last_token_int[ntok] &&
755 memcmp(&name[i],
756 &ctx->lc[pnum].last_name[ctx->lc[pnum].last_token_str[ntok]],
757 s-i) == 0) {
758 #ifdef ENC_DEBUG
759 fprintf(stderr, "Tok %d (alpha-mat, %.*s)\n", N_MATCH, s-i, &name[i]);
760 #endif
761 if (encode_token_match(ctx, ntok) < 0) return -1;
762 } else {
763 #ifdef ENC_DEBUG
764 fprintf(stderr, "Tok %d (alpha, %.*s / %.*s)\n", N_ALPHA,
765 s-i, &ctx->lc[pnum].last_name[ctx->lc[pnum].last_token_str[ntok]], s-i, &name[i]);
766 #endif
767 // same token/length, but mismatches
768 if (encode_token_alpha(ctx, ntok, &name[i], s-i) < 0) return -1;
769 }
770 } else {
771 #ifdef ENC_DEBUG
772 fprintf(stderr, "Tok %d (new alpha, %.*s)\n", N_ALPHA, s-i, &name[i]);
773 #endif
774 if (encode_token_alpha(ctx, ntok, &name[i], s-i) < 0) return -1;
775 }
776
777 ctx->lc[cnum].last_token_int[ntok] = s-i;
778 ctx->lc[cnum].last_token_str[ntok] = i;
779 ctx->lc[cnum].last_token_type[ntok] = N_ALPHA;
780
781 i = s-1;
782 } else if (name[i] == '0') digits0: {
783 // Digits starting with zero; encode length + value
784 uint32_t s = i;
785 uint32_t v = 0;
786 int d = 0;
787
788 while (s < len && isdigit(name[s]) && s-i < 8) {
789 v = v*10 + name[s] - '0';
790 //putchar(name[s]);
791 s++;
792 }
793
794 // TODO: optimise choice over whether to switch from DIGITS to DELTA
795 // regularly vs all DIGITS, also MATCH vs DELTA 0.
796 if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last_token_type[ntok] == N_DIGITS0) {
797 d = v - ctx->lc[pnum].last_token_int[ntok];
798 if (d == 0 && ctx->lc[pnum].last_token_str[ntok] == s-i) {
799 #ifdef ENC_DEBUG
800 fprintf(stderr, "Tok %d (dig-mat, %d)\n", N_MATCH, v);
801 #endif
802 if (encode_token_match(ctx, ntok) < 0) return -1;
803 //ctx->lc[pnum].last_token_delta[ntok]=0;
804 } else if (mode == 1 && d < 256 && d >= 0 && ctx->lc[pnum].last_token_str[ntok] == s-i) {
805 #ifdef ENC_DEBUG
806 fprintf(stderr, "Tok %d (dig-delta, %d / %d)\n", N_DDELTA, ctx->lc[pnum].last_token_int[ntok], v);
807 #endif
808 //if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
809 if (encode_token_int1(ctx, ntok, N_DDELTA0, d) < 0) return -1;
810 //ctx->lc[pnum].last_token_delta[ntok]=1;
811 } else {
812 #ifdef ENC_DEBUG
813 fprintf(stderr, "Tok %d (dig, %d / %d)\n", N_DIGITS, ctx->lc[pnum].last_token_int[ntok], v);
814 #endif
815 if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
816 if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1;
817 //ctx->lc[pnum].last_token_delta[ntok]=0;
818 }
819 } else {
820 #ifdef ENC_DEBUG
821 fprintf(stderr, "Tok %d (new dig, %d)\n", N_DIGITS, v);
822 #endif
823 if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
824 if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1;
825 //ctx->lc[pnum].last_token_delta[ntok]=0;
826 }
827
828 ctx->lc[cnum].last_token_str[ntok] = s-i; // length
829 ctx->lc[cnum].last_token_int[ntok] = v;
830 ctx->lc[cnum].last_token_type[ntok] = N_DIGITS0;
831
832 i = s-1;
833 } else if (isdigit(name[i])) {
834 // digits starting 1-9; encode value
835 uint32_t s = i;
836 uint32_t v = 0;
837 int d = 0;
838
839 while (s < len && isdigit(name[s]) && s-i < 8) {
840 v = v*10 + name[s] - '0';
841 //putchar(name[s]);
842 s++;
843 }
844
845 // dataset/10/K562_cytosol_LID8465_TopHat_v2.names
846 // col 4 is Illumina lane - we don't want match & delta in there
847 // as it has multiple lanes (so not ALL match) and delta is just
848 // random chance, increasing entropy instead.
849 // if (ntok == 4 || ntok == 8 || ntok == 10) {
850 // encode_token_int(ctx, ntok, N_DIGITS, v);
851 // } else {
852
853 // If the last token was DIGITS0 and we are the same length, then encode
854 // using that method instead as it seems likely the entire column is fixed
855 // width, sometimes with leading zeros.
856 if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok &&
857 ctx->lc[pnum].last_token_type[ntok] == N_DIGITS0 &&
858 ctx->lc[pnum].last_token_str[ntok] == s-i)
859 goto digits0;
860
861 // TODO: optimise choice over whether to switch from DIGITS to DELTA
862 // regularly vs all DIGITS, also MATCH vs DELTA 0.
863 if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last_token_type[ntok] == N_DIGITS) {
864 d = v - ctx->lc[pnum].last_token_int[ntok];
865 if (d == 0) {
866 #ifdef ENC_DEBUG
867 fprintf(stderr, "Tok %d (dig-mat, %d)\n", N_MATCH, v);
868 #endif
869 if (encode_token_match(ctx, ntok) < 0) return -1;
870 //ctx->lc[pnum].last_token_delta[ntok]=0;
871 } else if (mode == 1 && d < 256 && d >= 0) {
872 #ifdef ENC_DEBUG
873 fprintf(stderr, "Tok %d (dig-delta, %d / %d)\n", N_DDELTA, ctx->lc[pnum].last_token_int[ntok], v);
874 #endif
875 if (encode_token_int1(ctx, ntok, N_DDELTA, d) < 0) return -1;
876 //ctx->lc[pnum].last_token_delta[ntok]=1;
877 } else {
878 #ifdef ENC_DEBUG
879 fprintf(stderr, "Tok %d (dig, %d / %d)\n", N_DIGITS, ctx->lc[pnum].last_token_int[ntok], v);
880 #endif
881 if (encode_token_int(ctx, ntok, N_DIGITS, v) < 0) return -1;
882 //ctx->lc[pnum].last_token_delta[ntok]=0;
883 }
884 } else {
885 #ifdef ENC_DEBUG
886 fprintf(stderr, "Tok %d (new dig, %d)\n", N_DIGITS, v);
887 #endif
888 if (encode_token_int(ctx, ntok, N_DIGITS, v) < 0) return -1;
889 //ctx->lc[pnum].last_token_delta[ntok]=0;
890 }
891 // }
892
893 ctx->lc[cnum].last_token_int[ntok] = v;
894 ctx->lc[cnum].last_token_type[ntok] = N_DIGITS;
895
896 i = s-1;
897 } else {
898 n_char:
899 //if (!isalpha(name[i])) putchar(name[i]);
900 if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last_token_type[ntok] == N_CHAR) {
901 if (name[i] == ctx->lc[pnum].last_token_int[ntok]) {
902 #ifdef ENC_DEBUG
903 fprintf(stderr, "Tok %d (chr-mat, %c)\n", N_MATCH, name[i]);
904 #endif
905 if (encode_token_match(ctx, ntok) < 0) return -1;
906 } else {
907 #ifdef ENC_DEBUG
908 fprintf(stderr, "Tok %d (chr, %c / %c)\n", N_CHAR, ctx->lc[pnum].last_token_int[ntok], name[i]);
909 #endif
910 if (encode_token_char(ctx, ntok, name[i]) < 0) return -1;
911 }
912 } else {
913 #ifdef ENC_DEBUG
914 fprintf(stderr, "Tok %d (new chr, %c)\n", N_CHAR, name[i]);
915 #endif
916 if (encode_token_char(ctx, ntok, name[i]) < 0) return -1;
917 }
918
919 ctx->lc[cnum].last_token_int[ntok] = name[i];
920 ctx->lc[cnum].last_token_type[ntok] = N_CHAR;
921 }
922
923 ntok++;
924 //putchar(' ');
925 }
926
927 #ifdef ENC_DEBUG
928 fprintf(stderr, "Tok %d (end)\n", N_END);
929 #endif
930 if (encode_token_end(ctx, ntok) < 0) return -1;
931
932 //printf("Encoded %.*s with %d tokens\n", len, name, ntok);
933
934 ctx->lc[cnum].last_name = name;
935 ctx->lc[cnum].last_ntok = ntok;
936
937 return 0;
938 }
939
940 //-----------------------------------------------------------------------------
941 // Name decoder
942
decode_name(name_context * ctx,char * name,int name_len)943 static int decode_name(name_context *ctx, char *name, int name_len) {
944 int t0 = decode_token_type(ctx, 0);
945 uint32_t dist;
946 int pnum, cnum = ctx->counter++;
947
948 if (t0 < 0)
949 return 0;
950
951 if (decode_token_int(ctx, 0, t0, &dist) < 0 || dist > cnum)
952 return -1;
953 if ((pnum = cnum - dist) < 0) pnum = 0;
954
955 //fprintf(stderr, "t0=%d, dist=%d, pnum=%d, cnum=%d\n", t0, dist, pnum, cnum);
956
957 if (t0 == N_DUP) {
958 if (strlen(ctx->lc[pnum].last_name) +1 >= name_len) return -1;
959 strcpy(name, ctx->lc[pnum].last_name);
960 // FIXME: optimise this
961 ctx->lc[cnum].last_name = name;
962 ctx->lc[cnum].last_ntok = ctx->lc[pnum].last_ntok;
963 int nc = ctx->lc[cnum].last_ntok ? ctx->lc[cnum].last_ntok : MAX_TOKENS;
964 memcpy(ctx->lc[cnum].last_token_type, ctx->lc[pnum].last_token_type, nc * sizeof(int));
965 memcpy(ctx->lc[cnum].last_token_int , ctx->lc[pnum].last_token_int , nc * sizeof(int));
966 memcpy(ctx->lc[cnum].last_token_str , ctx->lc[pnum].last_token_str , nc * sizeof(int));
967
968 return strlen(name)+1;
969 }
970
971 *name = 0;
972 int ntok, len = 0, len2;
973
974 for (ntok = 1; ntok < MAX_TOKENS; ntok++) {
975 uint32_t v, vl;
976 enum name_type tok;
977 tok = decode_token_type(ctx, ntok);
978 //fprintf(stderr, "Tok %d = %d\n", ntok, tok);
979
980 switch (tok) {
981 case N_CHAR:
982 if (len+1 >= name_len) return -1;
983 if (decode_token_char(ctx, ntok, &name[len]) < 0) return -1;
984 //fprintf(stderr, "Tok %d CHAR %c\n", ntok, name[len]);
985 ctx->lc[cnum].last_token_type[ntok] = N_CHAR;
986 ctx->lc[cnum].last_token_int [ntok] = name[len++];
987 break;
988
989 case N_ALPHA:
990 len2 = decode_token_alpha(ctx, ntok, &name[len], name_len - len);
991 //fprintf(stderr, "Tok %d ALPHA %.*s\n", ntok, len2, &name[len]);
992 ctx->lc[cnum].last_token_type[ntok] = N_ALPHA;
993 ctx->lc[cnum].last_token_str [ntok] = len;
994 ctx->lc[cnum].last_token_int [ntok] = len2;
995 len += len2;
996 break;
997
998 case N_DIGITS0: // [0-9]*
999 if (decode_token_int1(ctx, ntok, N_DZLEN, &vl) < 0) return -1;
1000 if (decode_token_int(ctx, ntok, N_DIGITS0, &v) < 0) return -1;
1001 if (len+20+vl >= name_len) return -1;
1002 len += append_uint32_fixed(&name[len], v, vl);
1003 //fprintf(stderr, "Tok %d DIGITS0 %0*d\n", ntok, vl, v);
1004 ctx->lc[cnum].last_token_type[ntok] = N_DIGITS0;
1005 ctx->lc[cnum].last_token_int [ntok] = v;
1006 ctx->lc[cnum].last_token_str [ntok] = vl;
1007 break;
1008
1009 case N_DDELTA0:
1010 if (decode_token_int1(ctx, ntok, N_DDELTA0, &v) < 0) return -1;
1011 v += ctx->lc[pnum].last_token_int[ntok];
1012 if (len+ctx->lc[pnum].last_token_str[ntok]+1 >= name_len) return -1;
1013 len += append_uint32_fixed(&name[len], v, ctx->lc[pnum].last_token_str[ntok]);
1014 //fprintf(stderr, "Tok %d DELTA0 %0*d\n", ntok, ctx->lc[pnum].last_token_str[ntok], v);
1015 ctx->lc[cnum].last_token_type[ntok] = N_DIGITS0;
1016 ctx->lc[cnum].last_token_int [ntok] = v;
1017 ctx->lc[cnum].last_token_str [ntok] = ctx->lc[pnum].last_token_str[ntok];
1018 break;
1019
1020 case N_DIGITS: // [1-9][0-9]*
1021 if (decode_token_int(ctx, ntok, N_DIGITS, &v) < 0) return -1;
1022 if (len+20 >= name_len) return -1;
1023 len += append_uint32_var(&name[len], v);
1024 //fprintf(stderr, "Tok %d DIGITS %d\n", ntok, v);
1025 ctx->lc[cnum].last_token_type[ntok] = N_DIGITS;
1026 ctx->lc[cnum].last_token_int [ntok] = v;
1027 break;
1028
1029 case N_DDELTA:
1030 if (decode_token_int1(ctx, ntok, N_DDELTA, &v) < 0) return -1;
1031 v += ctx->lc[pnum].last_token_int[ntok];
1032 if (len+20 >= name_len) return -1;
1033 len += append_uint32_var(&name[len], v);
1034 //fprintf(stderr, "Tok %d DELTA %d\n", ntok, v);
1035 ctx->lc[cnum].last_token_type[ntok] = N_DIGITS;
1036 ctx->lc[cnum].last_token_int [ntok] = v;
1037 break;
1038
1039 case N_MATCH:
1040 switch (ctx->lc[pnum].last_token_type[ntok]) {
1041 case N_CHAR:
1042 if (len+1 >= name_len) return -1;
1043 name[len++] = ctx->lc[pnum].last_token_int[ntok];
1044 //fprintf(stderr, "Tok %d MATCH CHAR %c\n", ntok, ctx->lc[pnum].last_token_int[ntok]);
1045 ctx->lc[cnum].last_token_type[ntok] = N_CHAR;
1046 ctx->lc[cnum].last_token_int [ntok] = ctx->lc[pnum].last_token_int[ntok];
1047 break;
1048
1049 case N_ALPHA:
1050 if (ctx->lc[pnum].last_token_int[ntok] < 0 ||
1051 len+ctx->lc[pnum].last_token_int[ntok] >= name_len) return -1;
1052 memcpy(&name[len],
1053 &ctx->lc[pnum].last_name[ctx->lc[pnum].last_token_str[ntok]],
1054 ctx->lc[pnum].last_token_int[ntok]);
1055 //fprintf(stderr, "Tok %d MATCH ALPHA %.*s\n", ntok, ctx->lc[pnum].last_token_int[ntok], &name[len]);
1056 ctx->lc[cnum].last_token_type[ntok] = N_ALPHA;
1057 ctx->lc[cnum].last_token_str [ntok] = len;
1058 ctx->lc[cnum].last_token_int [ntok] = ctx->lc[pnum].last_token_int[ntok];
1059 len += ctx->lc[pnum].last_token_int[ntok];
1060 break;
1061
1062 case N_DIGITS:
1063 if (len+20 >= name_len) return -1;
1064 len += append_uint32_var(&name[len], ctx->lc[pnum].last_token_int[ntok]);
1065 //fprintf(stderr, "Tok %d MATCH DIGITS %d\n", ntok, ctx->lc[pnum].last_token_int[ntok]);
1066 ctx->lc[cnum].last_token_type[ntok] = N_DIGITS;
1067 ctx->lc[cnum].last_token_int [ntok] = ctx->lc[pnum].last_token_int[ntok];
1068 break;
1069
1070 case N_DIGITS0:
1071 if (len+ctx->lc[pnum].last_token_str[ntok] >= name_len) return -1;
1072 len += append_uint32_fixed(&name[len], ctx->lc[pnum].last_token_int[ntok], ctx->lc[pnum].last_token_str[ntok]);
1073 //fprintf(stderr, "Tok %d MATCH DIGITS %0*d\n", ntok, ctx->lc[pnum].last_token_str[ntok], ctx->lc[pnum].last_token_int[ntok]);
1074 ctx->lc[cnum].last_token_type[ntok] = N_DIGITS0;
1075 ctx->lc[cnum].last_token_int [ntok] = ctx->lc[pnum].last_token_int[ntok];
1076 ctx->lc[cnum].last_token_str [ntok] = ctx->lc[pnum].last_token_str[ntok];
1077 break;
1078
1079 default:
1080 return -1;
1081 }
1082 break;
1083
1084 default: // an elided N_END
1085 case N_END:
1086 if (len+1 >= name_len) return -1;
1087 name[len++] = 0;
1088 ctx->lc[cnum].last_token_type[ntok] = N_END;
1089
1090 ctx->lc[cnum].last_name = name;
1091 ctx->lc[cnum].last_ntok = ntok;
1092
1093 return len;
1094 }
1095 }
1096
1097
1098 return -1;
1099 }
1100
1101
1102 //-----------------------------------------------------------------------------
1103 // rANS codec
rans_encode(uint8_t * in,uint64_t in_len,uint8_t * out,uint64_t * out_len,int method)1104 static int rans_encode(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len, int method) {
1105 unsigned int olen = *out_len-6, nb;
1106 if (rans_compress_to_4x16(in, in_len, out+6, &olen, method) == NULL)
1107 return -1;
1108
1109 nb = i7put(out, olen);
1110 memmove(out+nb, out+6, olen);
1111 *out_len = olen+nb;
1112 return 0;
1113 }
1114
1115 // Returns number of bytes read from 'in' on success,
1116 // -1 on failure.
rans_decode(uint8_t * in,uint64_t in_len,uint8_t * out,uint64_t * out_len)1117 static int64_t rans_decode(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len) {
1118 unsigned int olen = *out_len;
1119
1120 uint64_t clen;
1121 int nb = i7get(in, &clen);
1122 //fprintf(stderr, "Rans decode %x\n", in[nb]);
1123 if (rans_uncompress_to_4x16(in+nb, in_len-nb, out, &olen, 0) == NULL)
1124 return -1;
1125 //fprintf(stderr, " Stored clen=%d\n", (int)clen);
1126 return clen+nb;
1127 }
1128
compress(uint8_t * in,uint64_t in_len,uint8_t * out,uint64_t * out_len)1129 static int compress(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len) {
1130 uint64_t best_sz = UINT64_MAX;
1131 int best = 0;
1132 uint64_t olen = *out_len;
1133
1134 //fprintf(stderr, "=== try %d ===\n", (int)in_len);
1135
1136 if (in_len <= 3) {
1137 out[0] = in_len;
1138 memcpy(out+1, in, in_len);
1139 *out_len = in_len+1;
1140 return 0;
1141 }
1142
1143 //int rmethods[] = {0,1,128,129,64,65,192,193, 193+8}, m;
1144 //int rmethods[] = {0,1,128,129,64,65,192,193, 193+8, 0+4, 128+4}, m;
1145 // DO_DICT doesn't yet work in conjunction with DO_PACK.
1146 int rmethods[] = {0,1,128,129,64,65,192,193, 193+8, 0+4}, m;
1147 for (m = 0; m < sizeof(rmethods)/sizeof(*rmethods); m++) {
1148 *out_len = olen;
1149 if (rans_encode(in, in_len, out, out_len, rmethods[m]) < 0) return -1;
1150
1151 if (best_sz > *out_len) {
1152 best_sz = *out_len;
1153 best = rmethods[m];
1154 }
1155 }
1156
1157 *out_len = olen;
1158 if (rans_encode(in, in_len, out, out_len, best) < 0) return -1;
1159
1160 assert(*out_len > 2);
1161
1162 // uint64_t tmp;
1163 // fprintf(stderr, "%d -> %d via method %x, %x\n", (int)in_len, (int)best_sz, best, out[i7get(out,&tmp)]);
1164
1165 return 0;
1166 }
1167
uncompressed_size(uint8_t * in,uint64_t in_len)1168 static uint64_t uncompressed_size(uint8_t *in, uint64_t in_len) {
1169 uint64_t clen, ulen;
1170
1171 int nb = i7get(in, &clen);
1172 if (clen <= 3) return clen;
1173
1174 i7get(in+nb+1, &ulen);
1175
1176 return ulen;
1177 }
1178
uncompress(uint8_t * in,uint64_t in_len,uint8_t * out,uint64_t * out_len)1179 static int uncompress(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len) {
1180 uint64_t clen;
1181 i7get(in, &clen);
1182
1183 if (clen <= 3) {
1184 memcpy(out, in+1, clen);
1185 *out_len = clen;
1186 return clen+1;
1187
1188 //*out = in[1];
1189 //*out_len = 1;
1190 //return 2;
1191 }
1192
1193 return rans_decode(in, in_len, out, out_len);
1194 }
1195
1196 //-----------------------------------------------------------------------------
1197
1198 /*
1199 * Converts a line or \0 separated block of reading names to a compressed buffer.
1200 * The code can only encode whole lines and will not attempt a partial line.
1201 * Use the "last_start_p" return value to identify the partial line start
1202 * offset, for continuation purposes.
1203 *
1204 * Returns a malloced buffer holding compressed data of size *out_len,
1205 * or NULL on failure
1206 */
encode_names(char * blk,int len,int * out_len,int * last_start_p)1207 uint8_t *encode_names(char *blk, int len, int *out_len, int *last_start_p) {
1208 int last_start = 0, i, j, nreads;
1209
1210 // Count lines
1211 for (nreads = i = 0; i < len; i++)
1212 if (blk[i] <= '\n') // \n or \0 separated entries
1213 nreads++;
1214
1215 name_context *ctx = create_context(nreads);
1216 if (!ctx)
1217 return NULL;
1218
1219 // Construct trie
1220 int ctr = 0;
1221 for (i = j = 0; i < len; j=++i) {
1222 while (i < len && blk[i] > '\n')
1223 i++;
1224 if (i >= len)
1225 break;
1226
1227 //blk[i] = '\0';
1228 last_start = i+1;
1229 if (build_trie(ctx, &blk[j], i-j, ctr++) < 0) {
1230 free_context(ctx);
1231 return NULL;
1232 }
1233 }
1234 if (last_start_p)
1235 *last_start_p = last_start;
1236
1237 //fprintf(stderr, "Processed %d of %d in block, line %d\n", last_start, len, ctr);
1238
1239 // Encode name
1240 for (i = j = 0; i < len; j=++i) {
1241 while (i < len && blk[i] > '\n')
1242 i++;
1243 if (i >= len)
1244 break;
1245
1246 blk[i] = '\0';
1247 // try both 0 and 1 and pick best?
1248 if (encode_name(ctx, &blk[j], i-j, 1) < 0) {
1249 free_context(ctx);
1250 return NULL;
1251 }
1252 }
1253
1254 #if 0
1255 for (i = 0; i < MAX_TBLOCKS; i++) {
1256 char fn[1024];
1257 if (!ctx->desc[i].buf_l) continue;
1258 sprintf(fn, "_tok.%02d_%02d.%d", i>>4,i&15,i);
1259 FILE *fp = fopen(fn, "w");
1260 fwrite(ctx->desc[i].buf, 1, ctx->desc[i].buf_l, fp);
1261 fclose(fp);
1262 }
1263 #endif
1264
1265 //dump_trie(t_head, 0);
1266
1267 // FIXME: merge descriptors
1268 //
1269 // If we see foo7:1 foo7:12 foo7:7 etc then foo: is constant,
1270 // but it's encoded as alpha<foo>+dig<7>+char<:> instead of alpha<foo7:>.
1271 // Any time token type 0 is all match beyond the first location we have
1272 // a candidate for merging in string form.
1273 //
1274 // This saves around .1 to 1.3 percent on varying data sets.
1275 // Cruder hack is dedicated prefix/suffix matching to short-cut this.
1276
1277
1278 // Drop N_TYPE blocks if they all contain matches bar the first item,
1279 // as we can regenerate these from the subsequent blocks types during
1280 // decode.
1281 for (i = 0; i < MAX_TBLOCKS; i+=16) {
1282 if (!ctx->desc[i].buf_l) continue;
1283
1284 int z;
1285 for (z=1; z<ctx->desc[i].buf_l; z++) {
1286 if (ctx->desc[i].buf[z] != N_MATCH)
1287 break;
1288 }
1289 if (z == ctx->desc[i].buf_l) {
1290 int k;
1291 for (k=1; k<16; k++)
1292 if (ctx->desc[i+k].buf_l)
1293 break;
1294
1295 ctx->desc[i].buf_l = 0;
1296 free(ctx->desc[i].buf);
1297 }
1298 }
1299
1300 // Serialise descriptors
1301 uint32_t tot_size = 8;
1302 int ndesc = 0;
1303 for (i = 0; i < MAX_TBLOCKS; i++) {
1304 if (!ctx->desc[i].buf_l) continue;
1305
1306 ndesc++;
1307
1308 int tnum = i>>4;
1309 int ttype = i&15;
1310
1311 uint64_t out_len = 1.5 * rans_compress_bound_4x16(ctx->desc[i].buf_l, 1); // guesswork
1312 uint8_t *out = malloc(out_len);
1313 if (!out) {
1314 free_context(ctx);
1315 return NULL;
1316 }
1317
1318 if (compress(ctx->desc[i].buf, ctx->desc[i].buf_l, out, &out_len) < 0) {
1319 free_context(ctx);
1320 return NULL;
1321 }
1322
1323 free(ctx->desc[i].buf);
1324 ctx->desc[i].buf = out;
1325 ctx->desc[i].buf_l = out_len;
1326 ctx->desc[i].tnum = tnum;
1327 ctx->desc[i].ttype = ttype;
1328
1329 // Find dups
1330 int j;
1331 for (j = 0; j < i; j++) {
1332 if (!ctx->desc[j].buf)
1333 continue;
1334 if (ctx->desc[i].buf_l != ctx->desc[j].buf_l || ctx->desc[i].buf_l <= 4)
1335 continue;
1336 if (memcmp(ctx->desc[i].buf, ctx->desc[j].buf, ctx->desc[i].buf_l) == 0)
1337 break;
1338 }
1339 if (j < i) {
1340 ctx->desc[i].dup_from = j;
1341 tot_size += 4; // flag, dup_from, ttype
1342 } else {
1343 ctx->desc[i].dup_from = 0;
1344 tot_size += out_len + 1; // ttype
1345 }
1346 }
1347
1348 #if 0
1349 for (i = 0; i < MAX_TBLOCKS; i++) {
1350 char fn[1024];
1351 if (!ctx->desc[i].buf_l && !ctx->desc[i].dup_from) continue;
1352 sprintf(fn, "_tok.%02d_%02d.%d.comp", i>>4,i&15,i);
1353 FILE *fp = fopen(fn, "w");
1354 fwrite(ctx->desc[i].buf, 1, ctx->desc[i].buf_l, fp);
1355 fclose(fp);
1356 }
1357 #endif
1358
1359 // Write
1360 uint8_t *out = malloc(tot_size+12);
1361 if (!out) {
1362 free_context(ctx);
1363 return NULL;
1364 }
1365
1366 uint8_t *cp = out;
1367 //*out_len = tot_size+4;
1368 //*(uint32_t *)cp = tot_size; cp += 4;
1369
1370 *out_len = tot_size;
1371 *(uint32_t *)cp = last_start; cp += 4;
1372 *(uint32_t *)cp = nreads; cp += 4;
1373 //write(1, &nreads, 4);
1374 int last_tnum = -1;
1375 for (i = 0; i < MAX_TBLOCKS; i++) {
1376 if (!ctx->desc[i].buf_l) continue;
1377 uint8_t ttype8 = ctx->desc[i].ttype;
1378 if (ctx->desc[i].tnum != last_tnum) {
1379 ttype8 |= 128;
1380 last_tnum = ctx->desc[i].tnum;
1381 }
1382 if (ctx->desc[i].dup_from) {
1383 //fprintf(stderr, "Dup %d from %d, sz %d\n", i, ctx->desc[i].dup_from, ctx->desc[i].buf_l);
1384 //uint8_t x = 255;
1385 //write(1, &x, 1);
1386 *cp++ = 255;
1387 //uint16_t y = ctx->desc[i].dup_from;
1388 //write(1, &y, 2);
1389 *(uint16_t *)cp = ctx->desc[i].dup_from; cp+= 2;
1390 //write(1, &ttype8, 1);
1391 *cp++ = ttype8;
1392 } else {
1393 //write(1, &ttype8, 1);
1394 //write(1, ctx->desc[i].buf, ctx->desc[i].buf_l);
1395 *cp++ = ttype8;
1396 memcpy(cp, ctx->desc[i].buf, ctx->desc[i].buf_l);
1397 cp += ctx->desc[i].buf_l;
1398 }
1399 }
1400
1401 //assert(cp-out == tot_size);
1402
1403 for (i = 0; i < MAX_TBLOCKS; i++) {
1404 if (!ctx->desc[i].buf_l) continue;
1405 free(ctx->desc[i].buf);
1406 }
1407
1408 free_context(ctx);
1409
1410 return out;
1411 }
1412
1413 /*
1414 * Decodes a compressed block of read names into \0 separated names.
1415 * The size of the data returned (malloced) is in *out_len.
1416 *
1417 * Returns NULL on failure.
1418 */
decode_names(uint8_t * in,uint32_t sz,int * out_len)1419 uint8_t *decode_names(uint8_t *in, uint32_t sz, int *out_len) {
1420 if (sz < 8)
1421 return NULL;
1422
1423 int i, o = 8;
1424 int ulen = *(uint32_t *)in;
1425 int nreads = *(uint32_t *)(in+4);
1426 name_context *ctx = create_context(nreads);
1427 if (!ctx)
1428 return NULL;
1429
1430 // Unpack descriptors
1431 int tnum = -1;
1432 while (o < sz) {
1433 uint8_t ttype = in[o++];
1434 if (ttype == 255) {
1435 if (o+3 >= sz) return NULL;
1436 uint16_t j = *(uint16_t *)&in[o];
1437 o += 2;
1438 ttype = in[o++];
1439 // if (ttype == 0)
1440 if (ttype & 128)
1441 tnum++;
1442
1443 if ((ttype & 15) != 0 && (ttype & 128)) {
1444 ctx->desc[tnum<<4].buf = malloc(nreads);
1445 if (!ctx->desc[tnum<<4].buf)
1446 return NULL;
1447
1448 ctx->desc[tnum<<4].buf_a = nreads;
1449 ctx->desc[tnum<<4].buf[0] = ttype&15;
1450 if ((ttype&15) == N_DZLEN)
1451 ctx->desc[tnum<<4].buf[0] = N_DIGITS0;
1452 memset(&ctx->desc[tnum<<4].buf[1], N_MATCH, nreads-1);
1453 }
1454
1455 i = (tnum<<4) | (ttype&15);
1456 if (j >= i)
1457 return NULL;
1458
1459 ctx->desc[i].buf_l = 0;
1460 ctx->desc[i].buf_a = ctx->desc[j].buf_a;
1461 ctx->desc[i].buf = malloc(ctx->desc[i].buf_a);
1462 if (!ctx->desc[i].buf)
1463 return NULL;
1464
1465 memcpy(ctx->desc[i].buf, ctx->desc[j].buf, ctx->desc[i].buf_a);
1466 //fprintf(stderr, "Copy ttype %d, i=%d,j=%d, size %d\n", ttype, i, j, (int)ctx->desc[i].buf_a);
1467 continue;
1468 }
1469
1470 //if (ttype == 0)
1471 if (ttype & 128)
1472 tnum++;
1473
1474 if ((ttype & 15) != 0 && (ttype & 128)) {
1475 ctx->desc[tnum<<4].buf = malloc(nreads);
1476 if (!ctx->desc[tnum<<4].buf) {
1477 free_context(ctx);
1478 return NULL;
1479 }
1480 ctx->desc[tnum<<4].buf_a = nreads;
1481 ctx->desc[tnum<<4].buf[0] = ttype&15;
1482 if ((ttype&15) == N_DZLEN)
1483 ctx->desc[tnum<<4].buf[0] = N_DIGITS0;
1484 memset(&ctx->desc[tnum<<4].buf[1], N_MATCH, nreads-1);
1485 }
1486
1487 //fprintf(stderr, "Read %02x\n", c);
1488
1489 // Load compressed block
1490 int64_t clen, ulen = uncompressed_size(&in[o], sz-o);
1491 if (ulen < 0) {
1492 free_context(ctx);
1493 return NULL;
1494 }
1495 i = (tnum<<4) | (ttype&15);
1496
1497 if (i >= MAX_TBLOCKS || i < 0)
1498 return NULL;
1499
1500 ctx->desc[i].buf_l = 0;
1501 ctx->desc[i].buf = malloc(ulen);
1502 if (!ctx->desc[i].buf) {
1503 free_context(ctx);
1504 return NULL;
1505 }
1506
1507 ctx->desc[i].buf_a = ulen;
1508 clen = uncompress(&in[o], sz-o, ctx->desc[i].buf, &ctx->desc[i].buf_a);
1509 if (clen < 0) {
1510 free(ctx->desc[i].buf);
1511 free_context(ctx);
1512 return NULL;
1513 }
1514 assert(ctx->desc[i].buf_a == ulen);
1515
1516 // fprintf(stderr, "%d: Decode tnum %d type %d clen %d ulen %d via %d\n",
1517 // o, tnum, ttype, (int)clen, (int)ctx->desc[i].buf_a, ctx->desc[i].buf[0]);
1518
1519 o += clen;
1520
1521 // Encode tnum 0 type 0 ulen 100000 clen 12530 via 2
1522 // Encode tnum 0 type 6 ulen 196800 clen 43928 via 3
1523 // Encode tnum 0 type 7 ulen 203200 clen 17531 via 3
1524 // Encode tnum 1 type 0 ulen 50800 clen 10 via 1
1525 // Encode tnum 1 type 1 ulen 3 clen 5 via 0
1526 // Encode tnum 2 type 0 ulen 50800 clen 10 via 1
1527 //
1528 }
1529
1530 int ret;
1531 ulen += 1024; // for easy coding in decode_name.
1532 uint8_t *out = malloc(ulen);
1533 if (!out) {
1534 free_context(ctx);
1535 return NULL;
1536 }
1537
1538 size_t out_sz = 0;
1539 while ((ret = decode_name(ctx, (char *)out+out_sz, ulen)) > 0) {
1540 out_sz += ret;
1541 ulen -= ret;
1542 }
1543
1544 if (ret < 0)
1545 free(out);
1546
1547 for (i = 0; i < MAX_TBLOCKS; i++) {
1548 if (ctx->desc[i].buf) {
1549 free(ctx->desc[i].buf);
1550 ctx->desc[i].buf = 0;
1551 }
1552 }
1553
1554 free_context(ctx);
1555
1556 *out_len = out_sz;
1557 return ret == 0 ? out : NULL;
1558 }
1559
1560
1561
1562 #ifdef TEST_TOKENISER
1563 //-----------------------------------------------------------------------------
1564 // main() implementation for testing
1565
1566 // Large enough for whole file for now.
1567 #ifndef BLK_SIZE
1568 #define BLK_SIZE 1*1024*1024
1569 #endif
1570 static char blk[BLK_SIZE*2]; // temporary fix for decoder, which needs more space
1571
encode(int argc,char ** argv)1572 static int encode(int argc, char **argv) {
1573 FILE *fp;
1574 int len, i, j;
1575 name_context *ctx;
1576
1577 if (argc > 1) {
1578 fp = fopen(argv[1], "r");
1579 if (!fp) {
1580 perror(argv[1]);
1581 return 1;
1582 }
1583 } else {
1584 fp = stdin;
1585 }
1586
1587 int blk_offset = 0;
1588 int blk_num = 0;
1589 for (;;) {
1590 int last_start = 0;
1591
1592 len = fread(blk+blk_offset, 1, BLK_SIZE-blk_offset, fp);
1593 if (len <= 0)
1594 break;
1595 len += blk_offset;
1596
1597 int out_len;
1598 uint8_t *out = encode_names(blk, len, &out_len, &last_start);
1599 write(1, &out_len, 4);
1600 write(1, out, out_len); // encoded data
1601 free(out);
1602
1603 if (len > last_start)
1604 memmove(blk, &blk[last_start], len - last_start);
1605 blk_offset = len - last_start;
1606 blk_num++;
1607 }
1608
1609 if (fclose(fp) < 0) {
1610 perror("closing file");
1611 return 1;
1612 }
1613
1614 return 0;
1615 }
1616
decode(int argc,char ** argv)1617 static int decode(int argc, char **argv) {
1618 uint32_t in_sz, out_sz;
1619 while (fread(&in_sz, 1, 4, stdin) == 4) {
1620 uint8_t *in = malloc(in_sz), *out;
1621 if (!in)
1622 return -1;
1623
1624 if (fread(in, 1, in_sz, stdin) != in_sz) {
1625 free(in);
1626 return -1;
1627 }
1628
1629 if ((out = decode_names(in, in_sz, &out_sz)) == NULL) {
1630 free(in);
1631 return -1;
1632 }
1633
1634 write(1, out, out_sz);
1635
1636 free(in);
1637 free(out);
1638 }
1639
1640 return 0;
1641 }
1642
main(int argc,char ** argv)1643 int main(int argc, char **argv) {
1644
1645 if (argc > 1 && strcmp(argv[1], "-d") == 0)
1646 return decode(argc-1, argv+1);
1647 else
1648 return encode(argc, argv);
1649 }
1650
1651 #endif // TEST_TOKENISER
1652