1 /*
2  * Copyright (c) 2016,2017 Genome Research Ltd.
3  * Author(s): James Bonfield
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *    1. Redistributions of source code must retain the above copyright notice,
9  *       this list of conditions and the following disclaimer.
10  *
11  *    2. Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *
16  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17  *    Institute nor the names of its contributors may be used to endorse
18  *    or promote products derived from this software without specific
19  *    prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 // cc -I.. -g -O3 tokenise_name3.c rANS_static4x16pr.c pooled_alloc.c -pthread -DTEST_TOKENISER
35 
36 // As per tokenise_name2 but has the entropy encoder built in already,
37 // so we just have a single encode and decode binary.  (WIP; mainly TODO)
38 
39 // TODO
40 //
41 // - Is it better when encoding 1, 2, 3, 3, 4, 5, 5, 6, 7, 9, 9, 10 to encode
42 //   this as a mixture of MATCH and DELTA ops, or as entirely as DELTA ops
43 //   with some delta values being zero?  I suspect the latter, but it is
44 //   not implemented here.  See "last_token_delta" comments in code.
45 //
46 // - Consider variable size string implementations.
47 //   Pascal style strings (length + str),
48 //   C style strings (nul terminated),
49 //   Or split blocks: length block and string contents block.
50 //
51 // - Is this one token-block or many serialised token-blocks?
52 //   A) Lots of different models but feeding one bit-buffer emitted to
53 //      by the entropy encoder => one block (fqzcomp).
54 //   B) Lots of different models each feeding their own bit-buffers
55 //      => many blocks.
56 //
57 // - multiple integer types depending on size; 1, 2, 4 byte long.
58 //
59 // - Consider token choice for isalnum instead of isalpha.  Sometimes better.
60 //
61 // - Consider token synchronisation (eg on matching chr symbols?) incase of
62 //   variable number.  Eg consider foo:0999, foo:1000, foo:1001 (the leading
63 //   zero adds an extra token).
64 //
65 // - Optimisation of tokens.  Eg:
66 //     HS25_09827:2:2102:11274:80442#49
67 //     HS25_09827:2:2109:12941:31311#49
68 //
69 //   We'll have tokens for HS 25 _ 09827 : 2 : that are entirely <MATCH>
70 //   after the initial token.  These 7 tokens could be one ALPHA instead
71 //   of 7 distinct tokens, with 1 MATCH instead of 7.  This is both a speed
72 //   improvement for decoding as well as a space saving (fewer token-blocks
73 //   and associated overhead).
74 //
75 // - XOR.  Like ALPHA, but used when previous symbol is ALPHA or XOR
76 //   and string lengths match.  Useful when names are similar, eg:
77 //   the sequence in 07.names:
78 //
79 //   @VP2-06:112:H7LNDMCVY:1:1105:26919:1172 1:N:0:ATTCAGAA+AGGAGAAG
80 //   @VP2-06:112:H7LNDMCVY:1:1105:27100:1172 1:N:0:ATTCAGAA+AGGCGAAG
81 //   @VP2-06:112:H7LNDMCVY:1:1105:27172:1172 1:N:0:ATTCAGAA+AGGCTAAG
82 
83 
84 
85 #include <stdio.h>
86 #include <stdlib.h>
87 #include <string.h>
88 #include <sys/types.h>
89 #include <sys/stat.h>
90 #include <unistd.h>
91 #include <limits.h>
92 #include <ctype.h>
93 #include <assert.h>
94 #include <inttypes.h>
95 #include <math.h>
96 #include <fcntl.h>
97 #include <errno.h>
98 #include <time.h>
99 
100 #include "io_lib/pooled_alloc.h"
101 #include "io_lib/rANS_static4x16.h"
102 #include "io_lib/tokenise_name3.h"
103 
104 // 128 is insufficient for SAM names (max 256 bytes) as
105 // we may alternate a0a0a0a0a0 etc.  However if we fail,
106 // we just give up and switch to another codec, so this
107 // isn't a serious limit.  Maybe up to 256 to permit all
108 // SAM names?
109 #define MAX_TOKENS 128
110 #define MAX_TBLOCKS (MAX_TOKENS<<4)
111 
112 // Number of names per block
113 #define MAX_NAMES 1000000
114 
115 enum name_type {N_ERR = -1, N_TYPE = 0, N_ALPHA, N_CHAR, N_DZLEN, N_DIGITS0, N_DUP, N_DIFF,
116 		N_DIGITS, N_D1, N_D2, N_D3, N_DDELTA, N_DDELTA0, N_MATCH, N_END,N_ALL};
117 
118 char *types[]={"TYPE", "ALPHA", "CHAR", "DZLEN", "DIG0", "DUP", "DIFF",
119 	       "DIGITS", "", "", "", "DDELTA", "DDELTA0", "MATCH", "END"};
120 
121 typedef struct trie trie_t;
122 
123 typedef struct {
124     char *last_name;
125     int last_ntok;
126     enum name_type last_token_type[MAX_TOKENS];
127     int last_token_int[MAX_TOKENS];
128     int last_token_str[MAX_TOKENS];
129     //int last_token_delta[MAX_TOKENS];
130 } last_context;
131 
132 typedef struct {
133     uint8_t *buf;
134     size_t buf_a, buf_l; // alloc and used length.
135     int tnum, ttype;
136     int dup_from;
137 } descriptor;
138 
139 typedef struct {
140     last_context *lc;
141 
142     // For finding entire line dups
143     int counter;
144 
145     // Trie used in encoder only
146     trie_t *t_head;
147     pool_alloc_t *pool;
148 
149     // token blocks
150     descriptor desc[MAX_TBLOCKS];
151 } name_context;
152 
create_context(int max_names)153 name_context *create_context(int max_names) {
154     if (max_names <= 0)
155 	return NULL;
156 
157     name_context *ctx = malloc(sizeof(*ctx) + ++max_names*sizeof(*ctx->lc));
158     if (!ctx) return NULL;
159 
160     ctx->counter = 0;
161     ctx->t_head = NULL;
162 
163     ctx->lc = (last_context *)(((char *)ctx) + sizeof(*ctx));
164     ctx->pool = NULL;
165 
166     memset(&ctx->desc[0], 0, MAX_TBLOCKS * sizeof(ctx->desc[0]));
167 
168     return ctx;
169 }
170 
171 void free_trie(trie_t *t);
free_context(name_context * ctx)172 void free_context(name_context *ctx) {
173     if (!ctx)
174 	return;
175 
176     if (ctx->t_head)
177 	free(ctx->t_head);
178     if (ctx->pool)
179 	pool_destroy(ctx->pool);
180 
181     free(ctx);
182 }
183 
184 //-----------------------------------------------------------------------------
185 // Fast unsigned integer printing code.
186 // Returns number of bytes written.
append_uint32_fixed(char * cp,uint32_t i,uint8_t l)187 static int append_uint32_fixed(char *cp, uint32_t i, uint8_t l) {
188     switch (l) {
189     case 9:*cp++ = i / 100000000 + '0', i %= 100000000;
190     case 8:*cp++ = i / 10000000  + '0', i %= 10000000;
191     case 7:*cp++ = i / 1000000   + '0', i %= 1000000;
192     case 6:*cp++ = i / 100000    + '0', i %= 100000;
193     case 5:*cp++ = i / 10000     + '0', i %= 10000;
194     case 4:*cp++ = i / 1000      + '0', i %= 1000;
195     case 3:*cp++ = i / 100       + '0', i %= 100;
196     case 2:*cp++ = i / 10        + '0', i %= 10;
197     case 1:*cp++ = i             + '0';
198     case 0:break;
199     }
200     return l;
201 }
202 
append_uint32_var(char * cp,uint32_t i)203 static int append_uint32_var(char *cp, uint32_t i) {
204     char *op = cp;
205     uint32_t j;
206 
207     //if (i < 10)         goto b0;
208     if (i < 100)        goto b1;
209     //if (i < 1000)       goto b2;
210     if (i < 10000)      goto b3;
211     //if (i < 100000)     goto b4;
212     if (i < 1000000)    goto b5;
213     //if (i < 10000000)   goto b6;
214     if (i < 100000000)  goto b7;
215 
216     if ((j = i / 1000000000)) {*cp++ = j + '0'; i -= j*1000000000; goto x8;}
217     if ((j = i / 100000000))  {*cp++ = j + '0'; i -= j*100000000;  goto x7;}
218  b7:if ((j = i / 10000000))   {*cp++ = j + '0'; i -= j*10000000;   goto x6;}
219     if ((j = i / 1000000))    {*cp++ = j + '0', i -= j*1000000;    goto x5;}
220  b5:if ((j = i / 100000))     {*cp++ = j + '0', i -= j*100000;     goto x4;}
221     if ((j = i / 10000))      {*cp++ = j + '0', i -= j*10000;      goto x3;}
222  b3:if ((j = i / 1000))       {*cp++ = j + '0', i -= j*1000;       goto x2;}
223     if ((j = i / 100))        {*cp++ = j + '0', i -= j*100;        goto x1;}
224  b1:if ((j = i / 10))         {*cp++ = j + '0', i -= j*10;         goto x0;}
225     if (i)                     *cp++ = i + '0';
226     return cp-op;
227 
228  x8:*cp++ = i / 100000000 + '0', i %= 100000000;
229  x7:*cp++ = i / 10000000  + '0', i %= 10000000;
230  x6:*cp++ = i / 1000000   + '0', i %= 1000000;
231  x5:*cp++ = i / 100000    + '0', i %= 100000;
232  x4:*cp++ = i / 10000     + '0', i %= 10000;
233  x3:*cp++ = i / 1000      + '0', i %= 1000;
234  x2:*cp++ = i / 100       + '0', i %= 100;
235  x1:*cp++ = i / 10        + '0', i %= 10;
236  x0:*cp++ = i             + '0';
237 
238     return cp-op;
239 }
240 
241 //-----------------------------------------------------------------------------
242 // Simple variable sized unsigned integers
i7put(uint8_t * cp,uint64_t i)243 static int i7put(uint8_t *cp, uint64_t i) {
244     uint8_t *op = cp;
245     int s = 0;
246     uint64_t o = i;
247 
248     do {
249 	s += 7;
250 	o >>= 7;
251     } while (o);
252 
253     do {
254 	s -= 7;
255 	*cp++ = ((i>>s)&0x7f) + (s?128:0);
256     } while (s);
257 
258     return cp-op;
259 }
260 
i7get(uint8_t * cp,uint64_t * i)261 static int i7get(uint8_t *cp, uint64_t *i) {
262     uint8_t *op = cp, c;
263     uint64_t j = 0;
264 
265     do {
266 	c = *cp++;
267 	j = (j<<7) | (c & 0x7f);
268     } while (c & 0x80);
269 
270     *i = j;
271     return cp-op;
272 }
273 
274 //-----------------------------------------------------------------------------
275 // Example descriptor encoding and IO.
276 //
277 // Here we just append to a buffer so we can dump out the results.
278 // These could then be passed through a static entropy encoder that
279 // encodes the entire buffer.
280 //
281 // Alternatively an adaptive entropy encoder could be place inline
282 // here to encode as it goes using additional knowledge from the
283 // supplied context.
284 
285 // Ensure room for sz more bytes.
descriptor_grow(descriptor * fd,uint32_t sz)286 static int descriptor_grow(descriptor *fd, uint32_t sz) {
287     while (fd->buf_l + sz > fd->buf_a) {
288 	size_t buf_a = fd->buf_a ? fd->buf_a*2 : 65536;
289 	unsigned char *buf = realloc(fd->buf, buf_a);
290 	if (!buf)
291 	    return -1;
292 	fd->buf = buf;
293 	fd->buf_a = buf_a;
294     }
295 
296     return 0;
297 }
298 
encode_token_type(name_context * ctx,int ntok,enum name_type type)299 static int encode_token_type(name_context *ctx, int ntok,
300 			     enum name_type type) {
301     int id = ntok<<4;
302 
303     if (descriptor_grow(&ctx->desc[id], 1) < 0) return -1;
304 
305     ctx->desc[id].buf[ctx->desc[id].buf_l++] = type;
306 
307     return 0;
308 }
309 
encode_token_match(name_context * ctx,int ntok)310 static int encode_token_match(name_context *ctx, int ntok) {
311     return encode_token_type(ctx, ntok, N_MATCH);
312 }
313 
encode_token_end(name_context * ctx,int ntok)314 static int encode_token_end(name_context *ctx, int ntok) {
315     return encode_token_type(ctx, ntok, N_END);
316 }
317 
decode_token_type(name_context * ctx,int ntok)318 static enum name_type decode_token_type(name_context *ctx, int ntok) {
319     int id = ntok<<4;
320     if (ctx->desc[id].buf_l >= ctx->desc[id].buf_a) return -1;
321     return ctx->desc[id].buf[ctx->desc[id].buf_l++];
322 }
323 
324 // int stored as 32-bit quantities
encode_token_int(name_context * ctx,int ntok,enum name_type type,uint32_t val)325 static int encode_token_int(name_context *ctx, int ntok,
326 			    enum name_type type, uint32_t val) {
327     int id = (ntok<<4) | type;
328 
329     if (encode_token_type(ctx, ntok, type) < 0) return -1;
330     if (descriptor_grow(&ctx->desc[id], 4) < 0)	return -1;
331 
332     // Assumes little endian and unalign access OK.
333     *(uint32_t *)(ctx->desc[id].buf + ctx->desc[id].buf_l) = val;
334     ctx->desc[id].buf_l += 4;
335 
336     return 0;
337 }
338 
339 // Return 0 on success, -1 on failure;
decode_token_int(name_context * ctx,int ntok,enum name_type type,uint32_t * val)340 static int decode_token_int(name_context *ctx, int ntok,
341 			    enum name_type type, uint32_t *val) {
342     int id = (ntok<<4) | type;
343 
344     if (ctx->desc[id].buf_l + 4 > ctx->desc[id].buf_a)
345 	return -1;
346 
347     // Assumes little endian and unalign access OK.
348     *val = *(uint32_t *)(ctx->desc[id].buf + ctx->desc[id].buf_l);
349     ctx->desc[id].buf_l += 4;
350 
351     return 0;
352 }
353 
354 // 8 bit integer quantity
encode_token_int1(name_context * ctx,int ntok,enum name_type type,uint32_t val)355 static int encode_token_int1(name_context *ctx, int ntok,
356 			     enum name_type type, uint32_t val) {
357     int id = (ntok<<4) | type;
358 
359     if (encode_token_type(ctx, ntok, type) < 0) return -1;
360     if (descriptor_grow(&ctx->desc[id], 1) < 0)	return -1;
361 
362     ctx->desc[id].buf[ctx->desc[id].buf_l++] = val;
363 
364     return 0;
365 }
366 
encode_token_int1_(name_context * ctx,int ntok,enum name_type type,uint32_t val)367 static int encode_token_int1_(name_context *ctx, int ntok,
368 			      enum name_type type, uint32_t val) {
369     int id = (ntok<<4) | type;
370 
371     if (descriptor_grow(&ctx->desc[id], 1) < 0)	return -1;
372 
373     ctx->desc[id].buf[ctx->desc[id].buf_l++] = val;
374 
375     return 0;
376 }
377 
378 // Return 0 on success, -1 on failure;
decode_token_int1(name_context * ctx,int ntok,enum name_type type,uint32_t * val)379 static int decode_token_int1(name_context *ctx, int ntok,
380 			     enum name_type type, uint32_t *val) {
381     int id = (ntok<<4) | type;
382 
383     if (ctx->desc[id].buf_l  >= ctx->desc[id].buf_a)
384 	return -1;
385     *val = ctx->desc[id].buf[ctx->desc[id].buf_l++];
386 
387     return 0;
388 }
389 
390 
391 // Basic C-string style for now.
392 //
393 // Maybe XOR with previous string as context?
394 // This permits partial match to be encoded efficiently.
encode_token_alpha(name_context * ctx,int ntok,char * str,int len)395 static int encode_token_alpha(name_context *ctx, int ntok,
396 			      char *str, int len) {
397     int id = (ntok<<4) | N_ALPHA;
398 
399     if (encode_token_type(ctx, ntok, N_ALPHA) < 0)  return -1;
400     if (descriptor_grow(&ctx->desc[id], len+1) < 0) return -1;
401     memcpy(&ctx->desc[id].buf[ctx->desc[id].buf_l], str, len);
402     ctx->desc[id].buf[ctx->desc[id].buf_l+len] = 0;
403     ctx->desc[id].buf_l += len+1;
404 
405     return 0;
406 }
407 
408 // FIXME: need limit on string length for security.
409 // Return length on success, -1 on failure;
decode_token_alpha(name_context * ctx,int ntok,char * str,int max_len)410 static int decode_token_alpha(name_context *ctx, int ntok, char *str, int max_len) {
411     int id = (ntok<<4) | N_ALPHA;
412     char c;
413     int len = 0;
414     if (ctx->desc[id].buf_l  >= ctx->desc[id].buf_a)
415 	return -1;
416     do {
417 	c = ctx->desc[id].buf[ctx->desc[id].buf_l++];
418 	str[len++] = c;
419     } while(c && len < max_len && ctx->desc[id].buf_l < ctx->desc[id].buf_a);
420 
421     return len-1;
422 }
423 
encode_token_char(name_context * ctx,int ntok,char c)424 static int encode_token_char(name_context *ctx, int ntok, char c) {
425     int id = (ntok<<4) | N_CHAR;
426 
427     if (encode_token_type(ctx, ntok, N_CHAR) < 0) return -1;
428     if (descriptor_grow(&ctx->desc[id], 1) < 0)    return -1;
429     ctx->desc[id].buf[ctx->desc[id].buf_l++] = c;
430 
431     return 0;
432 }
433 
434 // FIXME: need limit on string length for security
435 // Return length on success, -1 on failure;
decode_token_char(name_context * ctx,int ntok,char * str)436 static int decode_token_char(name_context *ctx, int ntok, char *str) {
437     int id = (ntok<<4) | N_CHAR;
438 
439     if (ctx->desc[id].buf_l  >= ctx->desc[id].buf_a)
440 	return -1;
441     *str = ctx->desc[id].buf[ctx->desc[id].buf_l++];
442 
443     return 1;
444 }
445 
446 
447 // A duplicated name
encode_token_dup(name_context * ctx,uint32_t val)448 static int encode_token_dup(name_context *ctx, uint32_t val) {
449     return encode_token_int(ctx, 0, N_DUP, val);
450 }
451 
452 // Which read to delta against
encode_token_diff(name_context * ctx,uint32_t val)453 static int encode_token_diff(name_context *ctx, uint32_t val) {
454     return encode_token_int(ctx, 0, N_DIFF, val);
455 }
456 
457 
458 //-----------------------------------------------------------------------------
459 // Trie implementation for tracking common name prefixes.
460 struct trie {
461     char c;
462     int count;
463     //struct trie *next[128];
464     struct trie *next, *sibling;
465     int n; // Nth line
466 };
467 
468 //static trie_t *t_head = NULL;
469 
free_trie(trie_t * t)470 void free_trie(trie_t *t) {
471     trie_t *x, *n;
472     for (x = t->next; x; x = n) {
473 	n = x->sibling;
474 	free_trie(x);
475     }
476     free(t);
477 }
478 
build_trie(name_context * ctx,char * data,size_t len,int n)479 int build_trie(name_context *ctx, char *data, size_t len, int n) {
480     int nlines = 0;
481     size_t i;
482     trie_t *t;
483 
484     if (!ctx->t_head) {
485 	ctx->t_head = calloc(1, sizeof(*ctx->t_head));
486 	if (!ctx->t_head)
487 	    return -1;
488     }
489 
490     // Build our trie, also counting input lines
491     for (nlines = i = 0; i < len; i++, nlines++) {
492 	t = ctx->t_head;
493 	t->count++;
494 	while (i < len && data[i] > '\n') {
495 	    unsigned char c = data[i++];
496 	    if (c & 0x80)
497 		//fprintf(stderr, "8-bit ASCII is unsupported\n");
498 		abort();
499 	    c &= 127;
500 
501 
502 	    trie_t *x = t->next, *l = NULL;
503 	    while (x && x->c != c) {
504 		l = x; x = x->sibling;
505 	    }
506 	    if (!x) {
507 		if (!ctx->pool)
508 		    ctx->pool = pool_create(sizeof(trie_t));
509 		if (!(x = (trie_t *)pool_alloc(ctx->pool)))
510 		    return -1;
511 		memset(x, 0, sizeof(*x));
512 		if (!l)
513 		    x = t->next    = x;
514 		else
515 		    x = l->sibling = x;
516 		x->n = n;
517 		x->c = c;
518 	    }
519 	    t = x;
520 	    t->c = c;
521 	    t->count++;
522 	}
523     }
524 
525     return 0;
526 }
527 
528 #if 0
529 void dump_trie(trie_t *t, int depth) {
530     if (depth == 0) {
531 	printf("graph x_%p {\n    splines = ortho\n    ranksep=2\n", t);
532 	printf("    p_%p [label=\"\"];\n", t);
533 	dump_trie(t, 1);
534 	printf("}\n");
535     } else {
536 	int j, k, count;//, cj;
537 	char label[100], *cp;
538 	trie_t *tp = t;
539 
540 //    patricia:
541 //	for (count = j = 0; j < 128; j++)
542 //	    if (t->next[j])
543 //		count++, cj=j;
544 //
545 //	if (count == 1) {
546 //	    t = t->next[cj];
547 //	    *cp++ = cj;
548 //	    goto patricia;
549 //	}
550 
551 	trie_t *x;
552 	for (x = t->next; x; x = x->sibling) {
553 	    printf("    p_%p [label=\"%c\"];\n", x, x->c);
554 	    printf("    p_%p -- p_%p [label=\"%d\", penwidth=\"%f\"];\n", tp, x, x->count, MAX((log(x->count)-3)*2,1));
555 	    //if (depth <= 11)
556 		dump_trie(x, depth+1);
557 	}
558 
559 #if 0
560 	for (j = 0; j < 128; j++) {
561 	    trie_t *tn;
562 
563 	    if (!t->next[j])
564 		continue;
565 
566 	    cp = label;
567 	    tn = t->next[j];
568 	    *cp++ = j;
569 //	patricia:
570 
571 	    for (count = k = 0; k < 128; k++)
572 		if (tn->next[k])
573 		    count++;//, cj=k;
574 
575 //	    if (count == 1) {
576 //		tn = tn->next[cj];
577 //		*cp++ = cj;
578 //		goto patricia;
579 //	    }
580 	    *cp++ = 0;
581 
582 	    printf("    p_%p [label=\"%s\"];\n", tn, label);
583 	    printf("    p_%p -- p_%p [label=\"%d\", penwidth=\"%f\"];\n", tp, tn, tn->count, MAX((log(tn->count)-3)*2,1));
584 	    if (depth <= 11)
585 		dump_trie(tn, depth+1);
586 	}
587 #endif
588     }
589 }
590 #endif
591 
search_trie(name_context * ctx,char * data,size_t len,int n,int * exact,int * is_fixed,int * fixed_len)592 int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, int *is_fixed, int *fixed_len) {
593     int nlines = 0;
594     size_t i;
595     trie_t *t;
596     int from = -1, p3 = -1;
597 
598     // Horrid hack for the encoder only.
599     // We optimise per known name format here.
600     int prefix_len;
601     char *d = *data == '@' ? data+1 : data;
602     int l   = *data == '@' ? len-1  : len;
603     int f = (*data == '>') ? 1 : 0;
604     if (l > 70 && d[f+0] == 'm' && d[7] == '_' && d[f+14] == '_' && d[f+61] == '/') {
605 	prefix_len = 60; // PacBio
606 	*is_fixed = 0;
607     } else if (l == 17 && d[f+5] == ':' && d[f+11] == ':') {
608 	prefix_len = 7;  // IonTorrent
609 	*fixed_len = 7;
610 	*is_fixed = 1;
611     } else if (l > 37 && d[f+8] == '-' && d[f+13] == '-' && d[f+18] == '-' && d[f+23] == '-' &&
612 	       ((d[f+0] >= '0' && d[f+0] <='9') || (d[f+0] >= 'a' && d[f+0] <= 'f')) &&
613 	       ((d[f+35] >= '0' && d[f+35] <='9') || (d[f+35] >= 'a' && d[f+35] <= 'f'))) {
614 	// ONT: f33d30d5-6eb8-4115-8f46-154c2620a5da_Basecall_1D_template...
615 	prefix_len = 37;
616 	*fixed_len = 37;
617 	*is_fixed = 1;
618     } else {
619 	// Anything else we give up on the trie method, but we still want to search
620 	// for exact matches;
621 	prefix_len = INT_MAX;
622 	*is_fixed = 0;
623     }
624     //prefix_len = INT_MAX;
625 
626     if (!ctx->t_head) {
627 	ctx->t_head = calloc(1, sizeof(*ctx->t_head));
628 	if (!ctx->t_head)
629 	    return -1;
630     }
631 
632     // Find an item in the trie
633     for (nlines = i = 0; i < len; i++, nlines++) {
634 	t = ctx->t_head;
635 	while (i < len && data[i] > '\n') {
636 	    unsigned char c = data[i++];
637 	    if (c & 0x80)
638 		//fprintf(stderr, "8-bit ASCII is unsupported\n");
639 		abort();
640 	    c &= 127;
641 
642 	    trie_t *x = t->next;
643 	    while (x && x->c != c)
644 		x = x->sibling;
645 	    t = x;
646 
647 //	    t = t->next[c];
648 
649 //	    if (!t)
650 //		return -1;
651 
652 	    from = t->n;
653 	    if (i == prefix_len) p3 = t->n;
654 	    //if (t->count >= .0035*ctx->t_head->count && t->n != n) p3 = t->n; // pacbio
655 	    //if (i == 60) p3 = t->n; // pacbio
656 	    //if (i == 7) p3 = t->n; // iontorrent
657 	    t->n = n;
658 	}
659     }
660 
661     //printf("Looked for %d, found %d, prefix %d\n", n, from, p3);
662 
663     *exact = (n != from);
664     return *exact ? from : p3;
665 }
666 
667 
668 //-----------------------------------------------------------------------------
669 // Name encoder
670 
671 /*
672  * Tokenises a read name using ctx as context as the previous
673  * tokenisation.
674  *
675  * Parsed elements are then emitted for encoding by calling the
676  * encode_token() function with the context, token number (Nth token
677  * in line), token type and token value.
678  *
679  * Returns 0 on success;
680  *        -1 on failure.
681  */
encode_name(name_context * ctx,char * name,int len,int mode)682 static int encode_name(name_context *ctx, char *name, int len, int mode) {
683     int i, is_fixed, fixed_len;
684 
685     int exact;
686     int cnum = ctx->counter++;
687     int pnum = search_trie(ctx, name, len, cnum, &exact, &is_fixed, &fixed_len);
688     if (pnum < 0) pnum = cnum ? cnum-1 : 0;
689     //pnum = pnum & (MAX_NAMES-1);
690     //cnum = cnum & (MAX_NAMES-1);
691     //if (pnum == cnum) {pnum = cnum ? cnum-1 : 0;}
692 #ifdef ENC_DEBUG
693     fprintf(stderr, "%d: pnum=%d (%d), exact=%d\n%s\n%s\n",
694 	    ctx->counter, pnum, cnum-pnum, exact, ctx->lc[pnum].last_name, name);
695 #endif
696 
697     // Return DUP or DIFF switch, plus the distance.
698     if (exact && len == strlen(ctx->lc[pnum].last_name)) {
699 	encode_token_dup(ctx, cnum-pnum);
700 	ctx->lc[cnum].last_name = name;
701 	ctx->lc[cnum].last_ntok = ctx->lc[pnum].last_ntok;
702 	// FIXME: optimise this
703 	int nc = ctx->lc[cnum].last_ntok ? ctx->lc[cnum].last_ntok : MAX_TOKENS;
704 	memcpy(ctx->lc[cnum].last_token_type, ctx->lc[pnum].last_token_type, nc * sizeof(int));
705 	memcpy(ctx->lc[cnum].last_token_int , ctx->lc[pnum].last_token_int , nc * sizeof(int));
706 	memcpy(ctx->lc[cnum].last_token_str , ctx->lc[pnum].last_token_str , nc * sizeof(int));
707 	return 0;
708     }
709 
710     encode_token_diff(ctx, cnum-pnum);
711 
712     int ntok = 1;
713     i = 0;
714     if (is_fixed) {
715 	if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last_token_type[ntok] == N_ALPHA) {
716 	    if (ctx->lc[pnum].last_token_int[ntok] == fixed_len && memcmp(name, ctx->lc[pnum].last_name, fixed_len) == 0) {
717 		encode_token_match(ctx, ntok);
718 	    } else {
719 		encode_token_alpha(ctx, ntok, name, fixed_len);
720 	    }
721 	} else {
722 	    encode_token_alpha(ctx, ntok, name, fixed_len);
723 	}
724 	ctx->lc[cnum].last_token_int[ntok] = fixed_len;
725 	ctx->lc[cnum].last_token_str[ntok] = 0;
726 	ctx->lc[cnum].last_token_type[ntok++] = N_ALPHA;
727 	i = fixed_len;
728     }
729 
730     for (; i < len; i++) {
731 	/* Determine data type of this segment */
732 	if (isalpha(name[i])) {
733 	    int s = i+1;
734 //	    int S = i+1;
735 
736 //	    // FIXME: try which of these is best.  alnum is good sometimes.
737 //	    while (s < len && isalpha(name[s]))
738 	    while (s < len && (isalpha(name[s]) || ispunct(name[s])))
739 //	    while (s < len && name[s] != ':')
740 //	    while (s < len && !isdigit(name[s]) && name[s] != ':')
741 		s++;
742 
743 //	    if (!is_fixed) {
744 //		while (S < len && isalnum(name[S]))
745 //		    S++;
746 //		if (s < S)
747 //		    s = S;
748 //	    }
749 
750 	    // Single byte strings are better encoded as chars.
751 	    if (s-i == 1) goto n_char;
752 
753 	    if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last_token_type[ntok] == N_ALPHA) {
754 		if (s-i == ctx->lc[pnum].last_token_int[ntok] &&
755 		    memcmp(&name[i],
756 			   &ctx->lc[pnum].last_name[ctx->lc[pnum].last_token_str[ntok]],
757 			   s-i) == 0) {
758 #ifdef ENC_DEBUG
759 		    fprintf(stderr, "Tok %d (alpha-mat, %.*s)\n", N_MATCH, s-i, &name[i]);
760 #endif
761 		    if (encode_token_match(ctx, ntok) < 0) return -1;
762 		} else {
763 #ifdef ENC_DEBUG
764 		    fprintf(stderr, "Tok %d (alpha, %.*s / %.*s)\n", N_ALPHA,
765 		    	    s-i, &ctx->lc[pnum].last_name[ctx->lc[pnum].last_token_str[ntok]], s-i, &name[i]);
766 #endif
767 		    // same token/length, but mismatches
768 		    if (encode_token_alpha(ctx, ntok, &name[i], s-i) < 0) return -1;
769 		}
770 	    } else {
771 #ifdef ENC_DEBUG
772 		fprintf(stderr, "Tok %d (new alpha, %.*s)\n", N_ALPHA, s-i, &name[i]);
773 #endif
774 		if (encode_token_alpha(ctx, ntok, &name[i], s-i) < 0) return -1;
775 	    }
776 
777 	    ctx->lc[cnum].last_token_int[ntok] = s-i;
778 	    ctx->lc[cnum].last_token_str[ntok] = i;
779 	    ctx->lc[cnum].last_token_type[ntok] = N_ALPHA;
780 
781 	    i = s-1;
782 	} else if (name[i] == '0') digits0: {
783 	    // Digits starting with zero; encode length + value
784 	    uint32_t s = i;
785 	    uint32_t v = 0;
786 	    int d = 0;
787 
788 	    while (s < len && isdigit(name[s]) && s-i < 8) {
789 		v = v*10 + name[s] - '0';
790 		//putchar(name[s]);
791 		s++;
792 	    }
793 
794 	    // TODO: optimise choice over whether to switch from DIGITS to DELTA
795 	    // regularly vs all DIGITS, also MATCH vs DELTA 0.
796 	    if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last_token_type[ntok] == N_DIGITS0) {
797 		d = v - ctx->lc[pnum].last_token_int[ntok];
798 		if (d == 0 && ctx->lc[pnum].last_token_str[ntok] == s-i) {
799 #ifdef ENC_DEBUG
800 		    fprintf(stderr, "Tok %d (dig-mat, %d)\n", N_MATCH, v);
801 #endif
802 		    if (encode_token_match(ctx, ntok) < 0) return -1;
803 		    //ctx->lc[pnum].last_token_delta[ntok]=0;
804 		} else if (mode == 1 && d < 256 && d >= 0 && ctx->lc[pnum].last_token_str[ntok] == s-i) {
805 #ifdef ENC_DEBUG
806 		    fprintf(stderr, "Tok %d (dig-delta, %d / %d)\n", N_DDELTA, ctx->lc[pnum].last_token_int[ntok], v);
807 #endif
808 		    //if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
809 		    if (encode_token_int1(ctx, ntok, N_DDELTA0, d) < 0) return -1;
810 		    //ctx->lc[pnum].last_token_delta[ntok]=1;
811 		} else {
812 #ifdef ENC_DEBUG
813 		    fprintf(stderr, "Tok %d (dig, %d / %d)\n", N_DIGITS, ctx->lc[pnum].last_token_int[ntok], v);
814 #endif
815 		    if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
816 		    if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1;
817 		    //ctx->lc[pnum].last_token_delta[ntok]=0;
818 		}
819 	    } else {
820 #ifdef ENC_DEBUG
821 		fprintf(stderr, "Tok %d (new dig, %d)\n", N_DIGITS, v);
822 #endif
823 		if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
824 		if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1;
825 		//ctx->lc[pnum].last_token_delta[ntok]=0;
826 	    }
827 
828 	    ctx->lc[cnum].last_token_str[ntok] = s-i; // length
829 	    ctx->lc[cnum].last_token_int[ntok] = v;
830 	    ctx->lc[cnum].last_token_type[ntok] = N_DIGITS0;
831 
832 	    i = s-1;
833 	} else if (isdigit(name[i])) {
834 	    // digits starting 1-9; encode value
835 	    uint32_t s = i;
836 	    uint32_t v = 0;
837 	    int d = 0;
838 
839 	    while (s < len && isdigit(name[s]) && s-i < 8) {
840 		v = v*10 + name[s] - '0';
841 		//putchar(name[s]);
842 		s++;
843 	    }
844 
845 	    // dataset/10/K562_cytosol_LID8465_TopHat_v2.names
846 	    // col 4 is Illumina lane - we don't want match & delta in there
847 	    // as it has multiple lanes (so not ALL match) and delta is just
848 	    // random chance, increasing entropy instead.
849 //	    if (ntok == 4  || ntok == 8 || ntok == 10) {
850 //		encode_token_int(ctx, ntok, N_DIGITS, v);
851 //	    } else {
852 
853 	    // If the last token was DIGITS0 and we are the same length, then encode
854 	    // using that method instead as it seems likely the entire column is fixed
855 	    // width, sometimes with leading zeros.
856 	    if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok &&
857 		ctx->lc[pnum].last_token_type[ntok] == N_DIGITS0 &&
858 		ctx->lc[pnum].last_token_str[ntok] == s-i)
859 		goto digits0;
860 
861 	    // TODO: optimise choice over whether to switch from DIGITS to DELTA
862 	    // regularly vs all DIGITS, also MATCH vs DELTA 0.
863 	    if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last_token_type[ntok] == N_DIGITS) {
864 		d = v - ctx->lc[pnum].last_token_int[ntok];
865 		if (d == 0) {
866 #ifdef ENC_DEBUG
867 		    fprintf(stderr, "Tok %d (dig-mat, %d)\n", N_MATCH, v);
868 #endif
869 		    if (encode_token_match(ctx, ntok) < 0) return -1;
870 		    //ctx->lc[pnum].last_token_delta[ntok]=0;
871 		} else if (mode == 1 && d < 256 && d >= 0) {
872 #ifdef ENC_DEBUG
873 		    fprintf(stderr, "Tok %d (dig-delta, %d / %d)\n", N_DDELTA, ctx->lc[pnum].last_token_int[ntok], v);
874 #endif
875 		    if (encode_token_int1(ctx, ntok, N_DDELTA, d) < 0) return -1;
876 		    //ctx->lc[pnum].last_token_delta[ntok]=1;
877 		} else {
878 #ifdef ENC_DEBUG
879 		    fprintf(stderr, "Tok %d (dig, %d / %d)\n", N_DIGITS, ctx->lc[pnum].last_token_int[ntok], v);
880 #endif
881 		    if (encode_token_int(ctx, ntok, N_DIGITS, v) < 0) return -1;
882 		    //ctx->lc[pnum].last_token_delta[ntok]=0;
883 		}
884 	    } else {
885 #ifdef ENC_DEBUG
886 		fprintf(stderr, "Tok %d (new dig, %d)\n", N_DIGITS, v);
887 #endif
888 		if (encode_token_int(ctx, ntok, N_DIGITS, v) < 0) return -1;
889 		//ctx->lc[pnum].last_token_delta[ntok]=0;
890 	    }
891 //	    }
892 
893 	    ctx->lc[cnum].last_token_int[ntok] = v;
894 	    ctx->lc[cnum].last_token_type[ntok] = N_DIGITS;
895 
896 	    i = s-1;
897 	} else {
898 	n_char:
899 	    //if (!isalpha(name[i])) putchar(name[i]);
900 	    if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last_token_type[ntok] == N_CHAR) {
901 		if (name[i] == ctx->lc[pnum].last_token_int[ntok]) {
902 #ifdef ENC_DEBUG
903 		    fprintf(stderr, "Tok %d (chr-mat, %c)\n", N_MATCH, name[i]);
904 #endif
905 		    if (encode_token_match(ctx, ntok) < 0) return -1;
906 		} else {
907 #ifdef ENC_DEBUG
908 		    fprintf(stderr, "Tok %d (chr, %c / %c)\n", N_CHAR, ctx->lc[pnum].last_token_int[ntok], name[i]);
909 #endif
910 		    if (encode_token_char(ctx, ntok, name[i]) < 0) return -1;
911 		}
912 	    } else {
913 #ifdef ENC_DEBUG
914 		fprintf(stderr, "Tok %d (new chr, %c)\n", N_CHAR, name[i]);
915 #endif
916 		if (encode_token_char(ctx, ntok, name[i]) < 0) return -1;
917 	    }
918 
919 	    ctx->lc[cnum].last_token_int[ntok] = name[i];
920 	    ctx->lc[cnum].last_token_type[ntok] = N_CHAR;
921 	}
922 
923 	ntok++;
924 	//putchar(' ');
925     }
926 
927 #ifdef ENC_DEBUG
928     fprintf(stderr, "Tok %d (end)\n", N_END);
929 #endif
930     if (encode_token_end(ctx, ntok) < 0) return -1;
931 
932     //printf("Encoded %.*s with %d tokens\n", len, name, ntok);
933 
934     ctx->lc[cnum].last_name = name;
935     ctx->lc[cnum].last_ntok = ntok;
936 
937     return 0;
938 }
939 
940 //-----------------------------------------------------------------------------
941 // Name decoder
942 
decode_name(name_context * ctx,char * name,int name_len)943 static int decode_name(name_context *ctx, char *name, int name_len) {
944     int t0 = decode_token_type(ctx, 0);
945     uint32_t dist;
946     int pnum, cnum = ctx->counter++;
947 
948     if (t0 < 0)
949 	return 0;
950 
951     if (decode_token_int(ctx, 0, t0, &dist) < 0 || dist > cnum)
952 	return -1;
953     if ((pnum = cnum - dist) < 0) pnum = 0;
954 
955     //fprintf(stderr, "t0=%d, dist=%d, pnum=%d, cnum=%d\n", t0, dist, pnum, cnum);
956 
957     if (t0 == N_DUP) {
958 	if (strlen(ctx->lc[pnum].last_name) +1 >= name_len) return -1;
959 	strcpy(name, ctx->lc[pnum].last_name);
960 	// FIXME: optimise this
961 	ctx->lc[cnum].last_name = name;
962 	ctx->lc[cnum].last_ntok = ctx->lc[pnum].last_ntok;
963 	int nc = ctx->lc[cnum].last_ntok ? ctx->lc[cnum].last_ntok : MAX_TOKENS;
964 	memcpy(ctx->lc[cnum].last_token_type, ctx->lc[pnum].last_token_type, nc * sizeof(int));
965 	memcpy(ctx->lc[cnum].last_token_int , ctx->lc[pnum].last_token_int , nc * sizeof(int));
966 	memcpy(ctx->lc[cnum].last_token_str , ctx->lc[pnum].last_token_str , nc * sizeof(int));
967 
968 	return strlen(name)+1;
969     }
970 
971     *name = 0;
972     int ntok, len = 0, len2;
973 
974     for (ntok = 1; ntok < MAX_TOKENS; ntok++) {
975 	uint32_t v, vl;
976 	enum name_type tok;
977 	tok = decode_token_type(ctx, ntok);
978 	//fprintf(stderr, "Tok %d = %d\n", ntok, tok);
979 
980 	switch (tok) {
981 	case N_CHAR:
982 	    if (len+1 >= name_len) return -1;
983 	    if (decode_token_char(ctx, ntok, &name[len]) < 0) return -1;
984 	    //fprintf(stderr, "Tok %d CHAR %c\n", ntok, name[len]);
985 	    ctx->lc[cnum].last_token_type[ntok] = N_CHAR;
986 	    ctx->lc[cnum].last_token_int [ntok] = name[len++];
987 	    break;
988 
989 	case N_ALPHA:
990 	    len2 = decode_token_alpha(ctx, ntok, &name[len], name_len - len);
991 	    //fprintf(stderr, "Tok %d ALPHA %.*s\n", ntok, len2, &name[len]);
992 	    ctx->lc[cnum].last_token_type[ntok] = N_ALPHA;
993 	    ctx->lc[cnum].last_token_str [ntok] = len;
994 	    ctx->lc[cnum].last_token_int [ntok] = len2;
995 	    len += len2;
996 	    break;
997 
998 	case N_DIGITS0: // [0-9]*
999 	    if (decode_token_int1(ctx, ntok, N_DZLEN, &vl) < 0) return -1;
1000 	    if (decode_token_int(ctx, ntok, N_DIGITS0, &v) < 0) return -1;
1001 	    if (len+20+vl >= name_len) return -1;
1002 	    len += append_uint32_fixed(&name[len], v, vl);
1003 	    //fprintf(stderr, "Tok %d DIGITS0 %0*d\n", ntok, vl, v);
1004 	    ctx->lc[cnum].last_token_type[ntok] = N_DIGITS0;
1005 	    ctx->lc[cnum].last_token_int [ntok] = v;
1006 	    ctx->lc[cnum].last_token_str [ntok] = vl;
1007 	    break;
1008 
1009 	case N_DDELTA0:
1010 	    if (decode_token_int1(ctx, ntok, N_DDELTA0, &v) < 0) return -1;
1011 	    v += ctx->lc[pnum].last_token_int[ntok];
1012 	    if (len+ctx->lc[pnum].last_token_str[ntok]+1 >= name_len) return -1;
1013 	    len += append_uint32_fixed(&name[len], v, ctx->lc[pnum].last_token_str[ntok]);
1014 	    //fprintf(stderr, "Tok %d DELTA0 %0*d\n", ntok, ctx->lc[pnum].last_token_str[ntok], v);
1015 	    ctx->lc[cnum].last_token_type[ntok] = N_DIGITS0;
1016 	    ctx->lc[cnum].last_token_int [ntok] = v;
1017 	    ctx->lc[cnum].last_token_str [ntok] = ctx->lc[pnum].last_token_str[ntok];
1018 	    break;
1019 
1020 	case N_DIGITS: // [1-9][0-9]*
1021 	    if (decode_token_int(ctx, ntok, N_DIGITS, &v) < 0) return -1;
1022 	    if (len+20 >= name_len) return -1;
1023 	    len += append_uint32_var(&name[len], v);
1024 	    //fprintf(stderr, "Tok %d DIGITS %d\n", ntok, v);
1025 	    ctx->lc[cnum].last_token_type[ntok] = N_DIGITS;
1026 	    ctx->lc[cnum].last_token_int [ntok] = v;
1027 	    break;
1028 
1029 	case N_DDELTA:
1030 	    if (decode_token_int1(ctx, ntok, N_DDELTA, &v) < 0) return -1;
1031 	    v += ctx->lc[pnum].last_token_int[ntok];
1032 	    if (len+20 >= name_len) return -1;
1033 	    len += append_uint32_var(&name[len], v);
1034 	    //fprintf(stderr, "Tok %d DELTA %d\n", ntok, v);
1035 	    ctx->lc[cnum].last_token_type[ntok] = N_DIGITS;
1036 	    ctx->lc[cnum].last_token_int [ntok] = v;
1037 	    break;
1038 
1039 	case N_MATCH:
1040 	    switch (ctx->lc[pnum].last_token_type[ntok]) {
1041 	    case N_CHAR:
1042 		if (len+1 >= name_len) return -1;
1043 		name[len++] = ctx->lc[pnum].last_token_int[ntok];
1044 		//fprintf(stderr, "Tok %d MATCH CHAR %c\n", ntok, ctx->lc[pnum].last_token_int[ntok]);
1045 		ctx->lc[cnum].last_token_type[ntok] = N_CHAR;
1046 		ctx->lc[cnum].last_token_int [ntok] = ctx->lc[pnum].last_token_int[ntok];
1047 		break;
1048 
1049 	    case N_ALPHA:
1050 		if (ctx->lc[pnum].last_token_int[ntok] < 0 ||
1051 		    len+ctx->lc[pnum].last_token_int[ntok] >= name_len) return -1;
1052 		memcpy(&name[len],
1053 		       &ctx->lc[pnum].last_name[ctx->lc[pnum].last_token_str[ntok]],
1054 		       ctx->lc[pnum].last_token_int[ntok]);
1055 		//fprintf(stderr, "Tok %d MATCH ALPHA %.*s\n", ntok, ctx->lc[pnum].last_token_int[ntok], &name[len]);
1056 		ctx->lc[cnum].last_token_type[ntok] = N_ALPHA;
1057 		ctx->lc[cnum].last_token_str [ntok] = len;
1058 		ctx->lc[cnum].last_token_int [ntok] = ctx->lc[pnum].last_token_int[ntok];
1059 		len += ctx->lc[pnum].last_token_int[ntok];
1060 		break;
1061 
1062 	    case N_DIGITS:
1063 		if (len+20 >= name_len) return -1;
1064 		len += append_uint32_var(&name[len], ctx->lc[pnum].last_token_int[ntok]);
1065 		//fprintf(stderr, "Tok %d MATCH DIGITS %d\n", ntok, ctx->lc[pnum].last_token_int[ntok]);
1066 		ctx->lc[cnum].last_token_type[ntok] = N_DIGITS;
1067 		ctx->lc[cnum].last_token_int [ntok] = ctx->lc[pnum].last_token_int[ntok];
1068 		break;
1069 
1070 	    case N_DIGITS0:
1071 		if (len+ctx->lc[pnum].last_token_str[ntok] >= name_len) return -1;
1072 		len += append_uint32_fixed(&name[len], ctx->lc[pnum].last_token_int[ntok], ctx->lc[pnum].last_token_str[ntok]);
1073 		//fprintf(stderr, "Tok %d MATCH DIGITS %0*d\n", ntok, ctx->lc[pnum].last_token_str[ntok], ctx->lc[pnum].last_token_int[ntok]);
1074 		ctx->lc[cnum].last_token_type[ntok] = N_DIGITS0;
1075 		ctx->lc[cnum].last_token_int [ntok] = ctx->lc[pnum].last_token_int[ntok];
1076 		ctx->lc[cnum].last_token_str [ntok] = ctx->lc[pnum].last_token_str[ntok];
1077 		break;
1078 
1079 	    default:
1080 		return -1;
1081 	    }
1082 	    break;
1083 
1084 	default: // an elided N_END
1085 	case N_END:
1086 	    if (len+1 >= name_len) return -1;
1087 	    name[len++] = 0;
1088 	    ctx->lc[cnum].last_token_type[ntok] = N_END;
1089 
1090 	    ctx->lc[cnum].last_name = name;
1091 	    ctx->lc[cnum].last_ntok = ntok;
1092 
1093 	    return len;
1094 	}
1095     }
1096 
1097 
1098     return -1;
1099 }
1100 
1101 
1102 //-----------------------------------------------------------------------------
1103 // rANS codec
rans_encode(uint8_t * in,uint64_t in_len,uint8_t * out,uint64_t * out_len,int method)1104 static int rans_encode(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len, int method) {
1105     unsigned int olen = *out_len-6, nb;
1106     if (rans_compress_to_4x16(in, in_len, out+6, &olen, method) == NULL)
1107 	return -1;
1108 
1109     nb = i7put(out, olen);
1110     memmove(out+nb, out+6, olen);
1111     *out_len = olen+nb;
1112     return 0;
1113 }
1114 
1115 // Returns number of bytes read from 'in' on success,
1116 //        -1 on failure.
rans_decode(uint8_t * in,uint64_t in_len,uint8_t * out,uint64_t * out_len)1117 static int64_t rans_decode(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len) {
1118     unsigned int olen = *out_len;
1119 
1120     uint64_t clen;
1121     int nb = i7get(in, &clen);
1122     //fprintf(stderr, "Rans decode %x\n", in[nb]);
1123     if (rans_uncompress_to_4x16(in+nb, in_len-nb, out, &olen, 0) == NULL)
1124 	return -1;
1125     //fprintf(stderr, "    Stored clen=%d\n", (int)clen);
1126     return clen+nb;
1127 }
1128 
compress(uint8_t * in,uint64_t in_len,uint8_t * out,uint64_t * out_len)1129 static int compress(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len) {
1130     uint64_t best_sz = UINT64_MAX;
1131     int best = 0;
1132     uint64_t olen = *out_len;
1133 
1134     //fprintf(stderr, "=== try %d ===\n", (int)in_len);
1135 
1136     if (in_len <= 3) {
1137 	out[0] = in_len;
1138 	memcpy(out+1, in, in_len);
1139 	*out_len = in_len+1;
1140 	return 0;
1141     }
1142 
1143     //int rmethods[] = {0,1,128,129,64,65,192,193, 193+8}, m;
1144     //int rmethods[] = {0,1,128,129,64,65,192,193, 193+8, 0+4, 128+4}, m;
1145     // DO_DICT doesn't yet work in conjunction with DO_PACK.
1146     int rmethods[] = {0,1,128,129,64,65,192,193, 193+8, 0+4}, m;
1147     for (m = 0; m < sizeof(rmethods)/sizeof(*rmethods); m++) {
1148 	*out_len = olen;
1149 	if (rans_encode(in, in_len, out, out_len, rmethods[m]) < 0) return -1;
1150 
1151 	if (best_sz > *out_len) {
1152 	    best_sz = *out_len;
1153 	    best = rmethods[m];
1154 	}
1155     }
1156 
1157     *out_len = olen;
1158     if (rans_encode(in, in_len, out, out_len, best) < 0) return -1;
1159 
1160     assert(*out_len > 2);
1161 
1162 //    uint64_t tmp;
1163 //    fprintf(stderr, "%d -> %d via method %x, %x\n", (int)in_len, (int)best_sz, best, out[i7get(out,&tmp)]);
1164 
1165     return 0;
1166 }
1167 
uncompressed_size(uint8_t * in,uint64_t in_len)1168 static uint64_t uncompressed_size(uint8_t *in, uint64_t in_len) {
1169     uint64_t clen, ulen;
1170 
1171     int nb = i7get(in, &clen);
1172     if (clen <= 3) return clen;
1173 
1174     i7get(in+nb+1, &ulen);
1175 
1176     return ulen;
1177 }
1178 
uncompress(uint8_t * in,uint64_t in_len,uint8_t * out,uint64_t * out_len)1179 static int uncompress(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len) {
1180     uint64_t clen;
1181     i7get(in, &clen);
1182 
1183     if (clen <= 3) {
1184 	memcpy(out, in+1, clen);
1185 	*out_len = clen;
1186 	return clen+1;
1187 
1188 	//*out = in[1];
1189 	//*out_len = 1;
1190 	//return 2;
1191     }
1192 
1193     return rans_decode(in, in_len, out, out_len);
1194 }
1195 
1196 //-----------------------------------------------------------------------------
1197 
1198 /*
1199  * Converts a line or \0 separated block of reading names to a compressed buffer.
1200  * The code can only encode whole lines and will not attempt a partial line.
1201  * Use the "last_start_p" return value to identify the partial line start
1202  * offset, for continuation purposes.
1203  *
1204  * Returns a malloced buffer holding compressed data of size *out_len,
1205  *         or NULL on failure
1206  */
encode_names(char * blk,int len,int * out_len,int * last_start_p)1207 uint8_t *encode_names(char *blk, int len, int *out_len, int *last_start_p) {
1208     int last_start = 0, i, j, nreads;
1209 
1210     // Count lines
1211     for (nreads = i = 0; i < len; i++)
1212 	if (blk[i] <= '\n') // \n or \0 separated entries
1213 	    nreads++;
1214 
1215     name_context *ctx = create_context(nreads);
1216     if (!ctx)
1217 	return NULL;
1218 
1219     // Construct trie
1220     int ctr = 0;
1221     for (i = j = 0; i < len; j=++i) {
1222 	while (i < len && blk[i] > '\n')
1223 	    i++;
1224 	if (i >= len)
1225 	    break;
1226 
1227 	//blk[i] = '\0';
1228 	last_start = i+1;
1229 	if (build_trie(ctx, &blk[j], i-j, ctr++) < 0) {
1230 	    free_context(ctx);
1231 	    return NULL;
1232 	}
1233     }
1234     if (last_start_p)
1235 	*last_start_p = last_start;
1236 
1237     //fprintf(stderr, "Processed %d of %d in block, line %d\n", last_start, len, ctr);
1238 
1239     // Encode name
1240     for (i = j = 0; i < len; j=++i) {
1241 	while (i < len && blk[i] > '\n')
1242 	    i++;
1243 	if (i >= len)
1244 	    break;
1245 
1246 	blk[i] = '\0';
1247 	// try both 0 and 1 and pick best?
1248 	if (encode_name(ctx, &blk[j], i-j, 1) < 0) {
1249 	    free_context(ctx);
1250 	    return NULL;
1251 	}
1252     }
1253 
1254 #if 0
1255     for (i = 0; i < MAX_TBLOCKS; i++) {
1256 	char fn[1024];
1257 	if (!ctx->desc[i].buf_l) continue;
1258 	sprintf(fn, "_tok.%02d_%02d.%d", i>>4,i&15,i);
1259 	FILE *fp = fopen(fn, "w");
1260 	fwrite(ctx->desc[i].buf, 1, ctx->desc[i].buf_l, fp);
1261 	fclose(fp);
1262     }
1263 #endif
1264 
1265     //dump_trie(t_head, 0);
1266 
1267     // FIXME: merge descriptors
1268     //
1269     // If we see foo7:1 foo7:12 foo7:7 etc then foo: is constant,
1270     // but it's encoded as alpha<foo>+dig<7>+char<:> instead of alpha<foo7:>.
1271     // Any time token type 0 is all match beyond the first location we have
1272     // a candidate for merging in string form.
1273     //
1274     // This saves around .1 to 1.3 percent on varying data sets.
1275     // Cruder hack is dedicated prefix/suffix matching to short-cut this.
1276 
1277 
1278     // Drop N_TYPE blocks if they all contain matches bar the first item,
1279     // as we can regenerate these from the subsequent blocks types during
1280     // decode.
1281     for (i = 0; i < MAX_TBLOCKS; i+=16) {
1282 	if (!ctx->desc[i].buf_l) continue;
1283 
1284 	int z;
1285 	for (z=1; z<ctx->desc[i].buf_l; z++) {
1286 	    if (ctx->desc[i].buf[z] != N_MATCH)
1287 		break;
1288 	}
1289 	if (z == ctx->desc[i].buf_l) {
1290 	    int k;
1291 	    for (k=1; k<16; k++)
1292 		if (ctx->desc[i+k].buf_l)
1293 		    break;
1294 
1295 	    ctx->desc[i].buf_l = 0;
1296 	    free(ctx->desc[i].buf);
1297 	}
1298     }
1299 
1300     // Serialise descriptors
1301     uint32_t tot_size = 8;
1302     int ndesc = 0;
1303     for (i = 0; i < MAX_TBLOCKS; i++) {
1304 	if (!ctx->desc[i].buf_l) continue;
1305 
1306 	ndesc++;
1307 
1308 	int tnum = i>>4;
1309 	int ttype = i&15;
1310 
1311 	uint64_t out_len = 1.5 * rans_compress_bound_4x16(ctx->desc[i].buf_l, 1); // guesswork
1312 	uint8_t *out = malloc(out_len);
1313 	if (!out) {
1314 	    free_context(ctx);
1315 	    return NULL;
1316 	}
1317 
1318 	if (compress(ctx->desc[i].buf, ctx->desc[i].buf_l, out, &out_len) < 0) {
1319 	    free_context(ctx);
1320 	    return NULL;
1321 	}
1322 
1323 	free(ctx->desc[i].buf);
1324 	ctx->desc[i].buf = out;
1325 	ctx->desc[i].buf_l = out_len;
1326 	ctx->desc[i].tnum = tnum;
1327 	ctx->desc[i].ttype = ttype;
1328 
1329 	// Find dups
1330 	int j;
1331 	for (j = 0; j < i; j++) {
1332 	    if (!ctx->desc[j].buf)
1333 		continue;
1334 	    if (ctx->desc[i].buf_l != ctx->desc[j].buf_l || ctx->desc[i].buf_l <= 4)
1335 		continue;
1336 	    if (memcmp(ctx->desc[i].buf, ctx->desc[j].buf, ctx->desc[i].buf_l) == 0)
1337 		break;
1338 	}
1339 	if (j < i) {
1340 	    ctx->desc[i].dup_from = j;
1341 	    tot_size += 4; // flag, dup_from, ttype
1342 	} else {
1343 	    ctx->desc[i].dup_from = 0;
1344 	    tot_size += out_len + 1; // ttype
1345 	}
1346     }
1347 
1348 #if 0
1349     for (i = 0; i < MAX_TBLOCKS; i++) {
1350 	char fn[1024];
1351 	if (!ctx->desc[i].buf_l && !ctx->desc[i].dup_from) continue;
1352 	sprintf(fn, "_tok.%02d_%02d.%d.comp", i>>4,i&15,i);
1353 	FILE *fp = fopen(fn, "w");
1354 	fwrite(ctx->desc[i].buf, 1, ctx->desc[i].buf_l, fp);
1355 	fclose(fp);
1356     }
1357 #endif
1358 
1359     // Write
1360     uint8_t *out = malloc(tot_size+12);
1361     if (!out) {
1362 	free_context(ctx);
1363 	return NULL;
1364     }
1365 
1366     uint8_t *cp = out;
1367     //*out_len = tot_size+4;
1368     //*(uint32_t *)cp = tot_size;   cp += 4;
1369 
1370     *out_len = tot_size;
1371     *(uint32_t *)cp = last_start; cp += 4;
1372     *(uint32_t *)cp = nreads;     cp += 4;
1373     //write(1, &nreads, 4);
1374     int last_tnum = -1;
1375     for (i = 0; i < MAX_TBLOCKS; i++) {
1376 	if (!ctx->desc[i].buf_l) continue;
1377 	uint8_t ttype8 = ctx->desc[i].ttype;
1378 	if (ctx->desc[i].tnum != last_tnum) {
1379 	    ttype8 |= 128;
1380 	    last_tnum = ctx->desc[i].tnum;
1381 	}
1382 	if (ctx->desc[i].dup_from) {
1383 	    //fprintf(stderr, "Dup %d from %d, sz %d\n", i, ctx->desc[i].dup_from, ctx->desc[i].buf_l);
1384 	    //uint8_t x = 255;
1385 	    //write(1, &x, 1);
1386 	    *cp++ = 255;
1387 	    //uint16_t y = ctx->desc[i].dup_from;
1388 	    //write(1, &y, 2);
1389 	    *(uint16_t *)cp = ctx->desc[i].dup_from; cp+= 2;
1390 	    //write(1, &ttype8, 1);
1391 	    *cp++ = ttype8;
1392 	} else {
1393 	    //write(1, &ttype8, 1);
1394 	    //write(1, ctx->desc[i].buf, ctx->desc[i].buf_l);
1395 	    *cp++ = ttype8;
1396 	    memcpy(cp, ctx->desc[i].buf, ctx->desc[i].buf_l);
1397 	    cp += ctx->desc[i].buf_l;
1398 	}
1399     }
1400 
1401     //assert(cp-out == tot_size);
1402 
1403     for (i = 0; i < MAX_TBLOCKS; i++) {
1404 	if (!ctx->desc[i].buf_l) continue;
1405 	free(ctx->desc[i].buf);
1406     }
1407 
1408     free_context(ctx);
1409 
1410     return out;
1411 }
1412 
1413 /*
1414  * Decodes a compressed block of read names into \0 separated names.
1415  * The size of the data returned (malloced) is in *out_len.
1416  *
1417  * Returns NULL on failure.
1418  */
decode_names(uint8_t * in,uint32_t sz,int * out_len)1419 uint8_t *decode_names(uint8_t *in, uint32_t sz, int *out_len) {
1420     if (sz < 8)
1421 	return NULL;
1422 
1423     int i, o = 8;
1424     int ulen   = *(uint32_t *)in;
1425     int nreads = *(uint32_t *)(in+4);
1426     name_context *ctx = create_context(nreads);
1427     if (!ctx)
1428 	return NULL;
1429 
1430     // Unpack descriptors
1431     int tnum = -1;
1432     while (o < sz) {
1433 	uint8_t ttype = in[o++];
1434 	if (ttype == 255) {
1435 	    if (o+3 >= sz) return NULL;
1436 	    uint16_t j = *(uint16_t *)&in[o];
1437 	    o += 2;
1438 	    ttype = in[o++];
1439 	    //		if (ttype == 0)
1440 	    if (ttype & 128)
1441 		tnum++;
1442 
1443 	    if ((ttype & 15) != 0 && (ttype & 128)) {
1444 		ctx->desc[tnum<<4].buf = malloc(nreads);
1445 		if (!ctx->desc[tnum<<4].buf)
1446 		    return NULL;
1447 
1448 		ctx->desc[tnum<<4].buf_a = nreads;
1449 		ctx->desc[tnum<<4].buf[0] = ttype&15;
1450 		if ((ttype&15) == N_DZLEN)
1451 		    ctx->desc[tnum<<4].buf[0] = N_DIGITS0;
1452 		memset(&ctx->desc[tnum<<4].buf[1], N_MATCH, nreads-1);
1453 	    }
1454 
1455 	    i = (tnum<<4) | (ttype&15);
1456 	    if (j >= i)
1457 		return NULL;
1458 
1459 	    ctx->desc[i].buf_l = 0;
1460 	    ctx->desc[i].buf_a = ctx->desc[j].buf_a;
1461 	    ctx->desc[i].buf = malloc(ctx->desc[i].buf_a);
1462 	    if (!ctx->desc[i].buf)
1463 		return NULL;
1464 
1465 	    memcpy(ctx->desc[i].buf, ctx->desc[j].buf, ctx->desc[i].buf_a);
1466 	    //fprintf(stderr, "Copy ttype %d, i=%d,j=%d, size %d\n", ttype, i, j, (int)ctx->desc[i].buf_a);
1467 	    continue;
1468 	}
1469 
1470 	//if (ttype == 0)
1471 	if (ttype & 128)
1472 	    tnum++;
1473 
1474 	if ((ttype & 15) != 0 && (ttype & 128)) {
1475 	    ctx->desc[tnum<<4].buf = malloc(nreads);
1476 	    if (!ctx->desc[tnum<<4].buf) {
1477 		free_context(ctx);
1478 		return NULL;
1479 	    }
1480 	    ctx->desc[tnum<<4].buf_a = nreads;
1481 	    ctx->desc[tnum<<4].buf[0] = ttype&15;
1482 	    if ((ttype&15) == N_DZLEN)
1483 		ctx->desc[tnum<<4].buf[0] = N_DIGITS0;
1484 	    memset(&ctx->desc[tnum<<4].buf[1], N_MATCH, nreads-1);
1485 	}
1486 
1487 	//fprintf(stderr, "Read %02x\n", c);
1488 
1489 	// Load compressed block
1490 	int64_t clen, ulen = uncompressed_size(&in[o], sz-o);
1491 	if (ulen < 0) {
1492 	    free_context(ctx);
1493 	    return NULL;
1494 	}
1495 	i = (tnum<<4) | (ttype&15);
1496 
1497 	if (i >= MAX_TBLOCKS || i < 0)
1498 	    return NULL;
1499 
1500 	ctx->desc[i].buf_l = 0;
1501 	ctx->desc[i].buf = malloc(ulen);
1502 	if (!ctx->desc[i].buf) {
1503 	    free_context(ctx);
1504 	    return NULL;
1505 	}
1506 
1507 	ctx->desc[i].buf_a = ulen;
1508 	clen = uncompress(&in[o], sz-o, ctx->desc[i].buf, &ctx->desc[i].buf_a);
1509 	if (clen < 0) {
1510 	    free(ctx->desc[i].buf);
1511 	    free_context(ctx);
1512 	    return NULL;
1513 	}
1514 	assert(ctx->desc[i].buf_a == ulen);
1515 
1516 	//	    fprintf(stderr, "%d: Decode tnum %d type %d clen %d ulen %d via %d\n",
1517 	//		    o, tnum, ttype, (int)clen, (int)ctx->desc[i].buf_a, ctx->desc[i].buf[0]);
1518 
1519 	o += clen;
1520 
1521 	// Encode tnum 0 type 0 ulen 100000 clen 12530 via 2
1522 	// Encode tnum 0 type 6 ulen 196800 clen 43928 via 3
1523 	// Encode tnum 0 type 7 ulen 203200 clen 17531 via 3
1524 	// Encode tnum 1 type 0 ulen 50800 clen 10 via 1
1525 	// Encode tnum 1 type 1 ulen 3 clen 5 via 0
1526 	// Encode tnum 2 type 0 ulen 50800 clen 10 via 1
1527 	//
1528     }
1529 
1530     int ret;
1531     ulen += 1024; // for easy coding in decode_name.
1532     uint8_t *out = malloc(ulen);
1533     if (!out) {
1534 	free_context(ctx);
1535 	return NULL;
1536     }
1537 
1538     size_t out_sz = 0;
1539     while ((ret = decode_name(ctx, (char *)out+out_sz, ulen)) > 0) {
1540 	out_sz += ret;
1541 	ulen -= ret;
1542     }
1543 
1544     if (ret < 0)
1545 	free(out);
1546 
1547     for (i = 0; i < MAX_TBLOCKS; i++) {
1548 	if (ctx->desc[i].buf) {
1549 	    free(ctx->desc[i].buf);
1550 	    ctx->desc[i].buf = 0;
1551 	}
1552     }
1553 
1554     free_context(ctx);
1555 
1556     *out_len = out_sz;
1557     return ret == 0 ? out : NULL;
1558 }
1559 
1560 
1561 
1562 #ifdef TEST_TOKENISER
1563 //-----------------------------------------------------------------------------
1564 // main() implementation for testing
1565 
1566 // Large enough for whole file for now.
1567 #ifndef BLK_SIZE
1568 #define BLK_SIZE 1*1024*1024
1569 #endif
1570 static char blk[BLK_SIZE*2]; // temporary fix for decoder, which needs more space
1571 
encode(int argc,char ** argv)1572 static int encode(int argc, char **argv) {
1573     FILE *fp;
1574     int len, i, j;
1575     name_context *ctx;
1576 
1577     if (argc > 1) {
1578 	fp = fopen(argv[1], "r");
1579 	if (!fp) {
1580 	    perror(argv[1]);
1581 	    return 1;
1582 	}
1583     } else {
1584 	fp = stdin;
1585     }
1586 
1587     int blk_offset = 0;
1588     int blk_num = 0;
1589     for (;;) {
1590 	int last_start = 0;
1591 
1592 	len = fread(blk+blk_offset, 1, BLK_SIZE-blk_offset, fp);
1593 	if (len <= 0)
1594 	    break;
1595 	len += blk_offset;
1596 
1597 	int out_len;
1598 	uint8_t *out = encode_names(blk, len, &out_len, &last_start);
1599 	write(1, &out_len, 4);
1600 	write(1, out, out_len);   // encoded data
1601 	free(out);
1602 
1603 	if (len > last_start)
1604 	    memmove(blk, &blk[last_start], len - last_start);
1605 	blk_offset = len - last_start;
1606 	blk_num++;
1607     }
1608 
1609     if (fclose(fp) < 0) {
1610 	perror("closing file");
1611 	return 1;
1612     }
1613 
1614     return 0;
1615 }
1616 
decode(int argc,char ** argv)1617 static int decode(int argc, char **argv) {
1618     uint32_t in_sz, out_sz;
1619     while (fread(&in_sz, 1, 4, stdin) == 4) {
1620 	uint8_t *in = malloc(in_sz), *out;
1621 	if (!in)
1622 	    return -1;
1623 
1624 	if (fread(in, 1, in_sz, stdin) != in_sz) {
1625 	    free(in);
1626 	    return -1;
1627 	}
1628 
1629 	if ((out = decode_names(in, in_sz, &out_sz)) == NULL) {
1630 	    free(in);
1631 	    return -1;
1632 	}
1633 
1634 	write(1, out, out_sz);
1635 
1636 	free(in);
1637 	free(out);
1638     }
1639 
1640     return 0;
1641 }
1642 
main(int argc,char ** argv)1643 int main(int argc, char **argv) {
1644 
1645     if (argc > 1 && strcmp(argv[1], "-d") == 0)
1646 	return decode(argc-1, argv+1);
1647     else
1648 	return encode(argc, argv);
1649 }
1650 
1651 #endif // TEST_TOKENISER
1652