1 /*==============================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 */
25 #include <align/extern.h>
26 
27 #include <klib/log.h>
28 #include <klib/rc.h>
29 #include <klib/sort.h>
30 #include <klib/data-buffer.h>
31 #include <klib/container.h>
32 #include <klib/checksum.h>
33 #include <klib/text.h>
34 #include <kfs/mmap.h>
35 #include <kfs/file.h>
36 #include <kdb/manager.h>
37 #include <vdb/database.h>
38 #include <vdb/table.h>
39 #include <vdb/cursor.h>
40 #include <vdb/manager.h>
41 #include <vdb/vdb-priv.h>
42 #include <sra/sradb.h>
43 
44 #include <align/writer-reference.h>
45 #include <align/writer-refseq.h>
46 #include <align/refseq-mgr.h>
47 #include <align/align.h>
48 #include "refseq-mgr-priv.h"
49 #include "writer-ref.h"
50 #include "reader-cmn.h"
51 #include "reference-cmn.h"
52 #include "debug.h"
53 #include <os-native.h>
54 #include <sysalloc.h>
55 
56 #include <stdio.h>
57 #include <stdlib.h>
58 #include <string.h>
59 #include <ctype.h>
60 #include <assert.h>
61 
62 /*
63  * ReferenceSeq objects:
64  *  ReferenceSeq objects may be unattached, i.e. they might not yet represent an
65  *  actual sequence.
66  *
67  *  ReferenceSeq objects may be attached, i.e. they represent a sequence from
68  *  either RefSeq or a fasta file.
69  *
70  *  A ReferenceSeq object may be refered to by more than one id, but a
71  *  ReferenceSeq object has only one canonical id.
72  *
73  *  More than one ReferenceSeq object may be associated with the same seqId.
74  *
75  *  More than one ReferenceSeq object may be attached to the same sequence.
76  *  This will cause the REFERENCE table to have more than one copy of the
77  *  sequence.
78  *
79  *  ReferenceSeq objects may be created from the config file.  These objects
80  *  will have an id, a seqId, but no fastaSeqId.  These are unattached.
81  *
82  *  ReferenceSeq objects may be created from explicit fasta files.  These
83  *  objects will have a fastaSeqId, but *** NO id OR seqId ***. These are
84  *  attached.
85  *
86  *  ReferenceSeq objects may be created on the fly by requesting an id that
87  *  isn't already in the collection.  These objects will have the requested id.
88  *
89  * When a reference is requested (by id):
90  *  Resolve the id to a ReferenceSeq object.
91  *  If the object is unattached, attach it to a sequence.
92  *  If the sequence is not yet written to the REFERENCE table, write it to the
93  *  REFERENCE table.  NAME gets id; SEQID gets seqId unless seqId is null, then
94  *  SEQID gets id.
95  *
96  * Resolving id's to ReferenceSeq objects:
97  *  Search the id index and if the found object is attached, return it.
98  *  Search the identifiers in the fastaSeqIds or seqIds.
99  *  If different objects were found from both searches, use sequence length and
100  *  MD5 to break the tie (if both match then use RefSeq).  If no sequence length
101  *  or MD5 then fail.
102  *  If no objects were found from either search, then create a new unattached
103  *  ReferenceSeq object.
104  *  If the object's id is null, set it to the id.
105  *  It the object was not found in the id index, add it.
106  *
107  * Attaching ReferenceSeq objects to sequences:
108  *  Search RefSeq for seqId.
109  *  Else search RefSeq for id.
110  *  Else search for seqId in the loaded fasta files.
111  *  Else search data directory for id.fasta or id.fa; load it or fail.
112  *  Else search data directory for seqId.fasta or seqId.fa; load it or fail.
113  *  Else fail.
114  *  If failed, mark the object as dead.
115  *
116  *
117  *
118  * Config file:
119  *  The config file consists of lines containing whitespace (preferably tab)
120  *  seperated fields.  The fields are:
121  *      NAME (unique)
122  *      SEQID
123  *      extra (optional)
124  *
125  *  There is one ReferenceSeq object created per record in the config file.
126  *  NAME is stored in id; SEQID is stored in seqId; if extra contains the word
127  *  'circular' (case-insensitive), true is stored in circular.  These
128  *  ReferenceSeq object are created in the unattached state, i.e. not attached
129  *  to a fasta file or a RefSeq library object.
130  *
131  *  If SEQID is equal to UNMAPPED_SEQID_VALUE (see below) the Reference will be
132  *  considered to be unmapped.
133  *
134  * Fasta files:
135  *  Fasta file consists of one of more sequences.  A sequence in a fasta file
136  *  consists of a seqid line followed by lines containing the bases of the
137  *  sequence.  A seqid line starts with '>' and the next word (whitespace
138  *  delimited) is the seqid.  The seqid may consist of '|' delimited identifiers
139  *  (this is purposely vague).  The fasta seqid is stored in fastaSeqId.
140  *
141  * Fasta files may be loaded explicitly:
142  *  When a fasta file is loaded explicitly, a new ReferenceSeq object is created
143  *  (with id == NULL) for each sequence found in the file.
144  *
145  * Fasta files may be loaded implicitly:
146  *  When an id can't be found in the set of ReferenceSeq objects and can't be
147  *  found as an accession by RefSeq, an attempt is made to load a fasta file
148  *  named <id>.fasta or <id>.fa in the directory given to the constructor.  If
149  *  this succeeds, a new ReferenceSeq object with the given id is attached to
150  *  the sequence.  In this situation, to avoid ambiquity, there can be only one
151  *  sequence in the fasta file.
152  *
153  */
154 
155 #define UNMAPPED_SEQID_VALUE "*UNMAPPED"
156 
157 enum ReferenceSeqType {
158     rst_unattached,
159     rst_local,
160     rst_refSeqById,
161     rst_refSeqBySeqId,
162     rst_unmapped,
163     rst_renamed,
164     rst_dead
165 };
166 
167 struct ReferenceSeq {
168     ReferenceMgr *mgr;
169     char *id;
170     char *seqId;
171     char *fastaSeqId;
172 
173     char **used;
174     unsigned num_used;
175 
176     /* ref table position */
177     int64_t start_rowid;
178     /* total reference length */
179     INSDC_coord_len seq_len;
180     int type;
181     bool circular;
182     uint8_t md5[16];
183     union {
184         struct {
185             KDataBuffer buf;
186             uint8_t const *data;
187         } local;
188         RefSeq const *refseq;
189     } u;
190 };
191 
192 #define BUCKET_BITS (12U)
193 #define BUCKETS (1U << BUCKET_BITS)
194 #define BUCKET_MASK (BUCKETS - 1U)
195 
196 typedef struct {
197     unsigned count;
198     unsigned *index;
199 } Bucket;
200 
201 typedef struct {
202     int length, type;
203 } compress_buffer_t;
204 
205 struct ReferenceMgr {
206     TableWriterRef const *writer;
207     KDirectory const *dir;
208     RefSeqMgr const *rmgr;
209     VDatabase *db;
210     ReferenceSeq *refSeq;       /* == refSeqs.base      */
211 
212     int64_t ref_rowid;
213 
214     size_t cache;
215 
216     uint32_t options;
217     uint32_t num_open_max;
218     uint32_t num_open;
219     uint32_t max_seq_len;
220 
221     KDataBuffer compress;       /* [compress_buffer_t]  */
222     KDataBuffer seq;            /* [byte](max_seq_len)  */
223     KDataBuffer refSeqs;        /* [ReferenceSeq]       */
224 
225     Bucket used[BUCKETS];
226     Bucket ht[BUCKETS];
227 };
228 
hash(unsigned const length,char const key[])229 static unsigned hash(unsigned const length, char const key[])
230 {
231     /* FNV-1a hash with folding */
232     uint64_t h = 0xcbf29ce484222325ull;
233     unsigned i;
234 
235     for (i = 0; i < length; ++i) {
236         int const ch = key[i];
237         uint64_t const octet = (uint8_t)toupper(ch);
238 
239         h = (h ^ octet) * 0x100000001b3ull;
240     }
241     return (unsigned)(((h ^ (h >> 32))) & ((uint64_t)BUCKET_MASK));
242 }
243 
hash0(char const key[])244 static unsigned hash0(char const key[])
245 {
246     /* FNV-1a hash with folding */
247     uint64_t h = 0xcbf29ce484222325ull;
248     unsigned i;
249 
250     for (i = 0; ; ++i) {
251         int const ch = key[i];
252         if (ch != '\0') {
253             uint64_t const octet = (uint8_t)toupper(ch);
254             h = (h ^ octet) * 0x100000001b3ull;
255             continue;
256         }
257         break;
258     }
259     return (unsigned)(((h ^ (h >> 32))) & ((uint64_t)BUCKET_MASK));
260 }
261 
addToHashBucket(Bucket * const bucket,unsigned const index)262 static void addToHashBucket(Bucket *const bucket, unsigned const index)
263 {
264     unsigned i;
265     void *tmp = NULL;
266 
267     for (i = 0; i < bucket->count; ++i) {
268         if (bucket->index[i] == index)
269             return;
270     }
271 
272     tmp = realloc(bucket->index, (1 + bucket->count) * sizeof(bucket->index[0]));
273     assert(tmp != NULL);
274 	if (tmp == NULL)
275         abort();
276 
277     bucket->index = tmp;
278     bucket->index[bucket->count++] = index;
279 }
280 
addToHashTable(ReferenceMgr * const self,ReferenceSeq const * const rs)281 static void addToHashTable(ReferenceMgr *const self, ReferenceSeq const *const rs)
282 {
283     unsigned const index = rs - self->refSeq;
284 
285     if (rs->id)
286         addToHashBucket(&self->ht[hash0(rs->id)], index);
287     if (rs->seqId)
288         addToHashBucket(&self->ht[hash0(rs->seqId)], index);
289 }
290 
freeHashTableEntries(ReferenceMgr * const self)291 static void freeHashTableEntries(ReferenceMgr *const self)
292 {
293     unsigned i;
294     for (i = 0; i < BUCKETS; ++i) {
295         if (self->ht[i].count > 0)
296             free(self->ht[i].index);
297         if (self->used[i].count > 0)
298             free(self->used[i].index);
299     }
300 }
301 
302 static
ReferenceSeq_Whack(ReferenceSeq * self)303 void CC ReferenceSeq_Whack(ReferenceSeq *self)
304 {
305     unsigned i;
306 
307     for (i = 0; i < self->num_used; ++i)
308         free(self->used[i]);
309 
310     if (self->type == rst_local) {
311         KDataBufferWhack(&self->u.local.buf);
312     }
313     else if (self->type == rst_refSeqById || self->type == rst_refSeqBySeqId) {
314         RefSeq_Release(self->u.refseq);
315     }
316     free(self->id);
317     free(self->seqId);
318     free(self->fastaSeqId);
319     free(self->used);
320 }
321 
322 struct OpenConfigFile_ctx {
323     char const *name;
324     KDirectory const *dir;
325     KFile const **kfp;
326     rc_t rc;
327 };
328 
329 static
OpenConfigFile(char const server[],char const volume[],void * Ctx)330 bool OpenConfigFile(char const server[], char const volume[], void *Ctx)
331 {
332     struct OpenConfigFile_ctx *ctx = Ctx;
333     KDirectory const *dir;
334 
335     if(volume == NULL) {
336         ctx->rc = KDirectoryOpenDirRead(ctx->dir, &dir, false, "%s", server);
337     } else {
338         ctx->rc = KDirectoryOpenDirRead(ctx->dir, &dir, false, "%s/%s", server, volume);
339     }
340     if (ctx->rc == 0) {
341         ctx->rc = KDirectoryOpenFileRead(dir, ctx->kfp, "%s", ctx->name);
342         KDirectoryRelease(dir);
343         if (ctx->rc == 0) {
344             return true;
345         }
346     }
347     return false;
348 }
349 
350 static
FindAndOpenConfigFile(RefSeqMgr const * const rmgr,KDirectory const * const dir,KFile const ** const kfp,char const conf[])351 rc_t FindAndOpenConfigFile(RefSeqMgr const *const rmgr,
352                            KDirectory const *const dir,
353                            KFile const **const kfp, char const conf[])
354 {
355     rc_t rc = KDirectoryOpenFileRead(dir, kfp, "%s", conf);
356 
357     if (rc) {
358         struct OpenConfigFile_ctx ctx;
359 
360         ctx.name = conf;
361         ctx.dir = dir;
362         ctx.kfp = kfp;
363         ctx.rc = 0;
364 
365         rc = RefSeqMgr_ForEachVolume(rmgr, OpenConfigFile, &ctx);
366         if (rc == 0 && *kfp == NULL) {
367             rc = RC(rcAlign, rcIndex, rcConstructing, rcFile, rcNotFound);
368         }
369     }
370     return rc;
371 }
372 
373 enum comparison_weights {
374     no_match        = 0,
375     substring_match = (1u << 0),
376     expected_prefix = (1u << 1),
377     exact_match     = (1u << 2),
378     seqId_match     = (1u << 3),
379     id_match        = (1u << 4),
380     seq_len_match   = (1u << 5),
381     md5_match       = (1u << 6),
382     attached        = (1u << 7)
383 };
384 
385 static
str_weight(char const str[],char const qry[],unsigned const qry_len)386 unsigned str_weight(char const str[], char const qry[], unsigned const qry_len)
387 {
388     char const *const fnd = strcasestr(str, qry);
389     unsigned wt = no_match;
390 
391     if (fnd) {
392         unsigned const fnd_len = (unsigned)string_size(fnd);
393         unsigned const fndlen = (fnd_len > qry_len && fnd[qry_len] == '|') ? qry_len : fnd_len;
394 
395         if (fndlen == qry_len && (fnd == str || fnd[-1] == '|')) {
396             wt |= substring_match;
397 
398             if (fnd == str) {
399                 if (fnd[fndlen] == '\0')
400                     wt |= exact_match;
401             }
402             else {
403                 /* check for expected prefices */
404                 char const *ns = fnd - 1;
405 
406                 while (ns != str && ns[-1] != '|')
407                     --ns;
408 
409                 if (   memcmp(ns, "ref|", 4) == 0
410                     || memcmp(ns, "emb|", 4) == 0
411                     || memcmp(ns, "dbj|", 4) == 0
412                     || memcmp(ns, "tpg|", 4) == 0
413                     || memcmp(ns, "tpe|", 4) == 0
414                     || memcmp(ns, "tpd|", 4) == 0
415                     || memcmp(ns, "gpp|", 4) == 0
416                     || memcmp(ns,  "gb|", 3) == 0
417                    )
418                 {
419                     wt |= expected_prefix;
420                 }
421             }
422         }
423     }
424     return wt;
425 }
426 
addToIndex(ReferenceMgr * const self,char const ID[],ReferenceSeq * const rs)427 static void addToIndex(ReferenceMgr *const self, char const ID[],
428                        ReferenceSeq *const rs)
429 {
430     unsigned const n = rs->num_used;
431     unsigned i;
432 
433     for (i = 0; i < n; ++i) {
434         if (strcmp(ID, rs->used[i]) == 0)
435             goto SKIP_INSERT_ID;
436     }
437     {
438         unsigned const length = (unsigned)string_size(ID);
439         char *id = string_dup(ID, length);
440         void *tmp = realloc(rs->used, (n + 1) * sizeof(rs->used[0]));
441 
442         assert(tmp != NULL && id != NULL);
443         if (tmp == NULL || id == NULL) abort();
444 
445         ++rs->num_used;
446         rs->used = tmp;
447         rs->used[n] = id;
448     }
449 SKIP_INSERT_ID:
450     addToHashBucket(&self->used[hash0(ID)], rs - self->refSeq);
451 }
452 
findId(ReferenceMgr const * const self,char const id[])453 static int findId(ReferenceMgr const *const self, char const id[])
454 {
455     unsigned const hv = hash0(id);
456     Bucket const *const bucket = &self->used[hv];
457     unsigned const n = bucket->count;
458     unsigned i;
459 
460     for (i = 0; i < n; ++i) {
461         unsigned const index = bucket->index[i];
462         ReferenceSeq const *rs = &self->refSeq[index];
463         unsigned const m = rs->num_used;
464         unsigned j;
465 
466         if (rs->type == rst_dead)
467             continue;
468         for (j = 0; j < m; ++j) {
469             if (strcmp(id, rs->used[j]) == 0)
470                 return index;
471         }
472     }
473     return -1;
474 }
475 
newReferenceSeq(ReferenceMgr * const self,ReferenceSeq const * const template)476 static ReferenceSeq *newReferenceSeq(ReferenceMgr *const self, ReferenceSeq const *const template)
477 {
478     unsigned const last_rs = (unsigned)self->refSeqs.elem_count;
479     rc_t const rc = KDataBufferResize(&self->refSeqs, last_rs + 1);
480 
481     assert(rc == 0);
482     if (rc) abort();
483 
484     self->refSeq = self->refSeqs.base;
485     {
486         ReferenceSeq *const rslt = &self->refSeq[last_rs];
487         if (template)
488             *rslt = *template;
489         else
490             memset(rslt, 0, sizeof(*rslt));
491         rslt->mgr = self;
492         return rslt;
493     }
494 }
495 
496 static
config_cmp(void const * A,void const * B,void * Data)497 int64_t CC config_cmp(void const *A, void const *B, void *Data)
498 {
499     struct {
500         unsigned id;
501         unsigned seqId;
502         unsigned extra;
503         unsigned extralen;
504     } const *const a = A;
505     struct {
506         unsigned id;
507         unsigned seqId;
508         unsigned extra;
509         unsigned extralen;
510     } const *const b = B;
511     char const *const data = Data;
512 
513     return strcmp(&data[a->id], &data[b->id]);
514 }
515 
516 static
ReferenceMgr_ProcessConf(ReferenceMgr * const self,char Data[],unsigned const len)517 rc_t ReferenceMgr_ProcessConf(ReferenceMgr *const self, char Data[], unsigned const len)
518 {
519     rc_t rc;
520     unsigned i;
521     unsigned j;
522     struct {
523         unsigned id;
524         unsigned seqId;
525         unsigned extra;
526         unsigned extralen;
527     } *data, tmp;
528     KDataBuffer buf;
529 
530     memset(&buf, 0, sizeof(buf));
531     buf.elem_bits = sizeof(data[0]) * 8;
532 
533     for (j = i = 0; i < len; ++j) {
534         unsigned lineEnd;
535         unsigned id;
536         unsigned acc;
537         unsigned ii;
538 
539         for (lineEnd = i; lineEnd != len; ++lineEnd) {
540             int const ch = Data[lineEnd];
541 
542             if (ch == '\n')
543                 break;
544             if (ch == '\r')
545                 break;
546         }
547         if (i == lineEnd) {
548             ++i;
549             continue;
550         }
551         Data[lineEnd] = '\0';
552         for (id = i; id != lineEnd; ++id) {
553             int const ch = Data[id];
554 
555             if (!isspace(ch))
556                 break;
557         }
558         for (ii = id; ii != lineEnd; ++ii) {
559             int const ch = Data[ii];
560 
561             if (isspace(ch)) {
562                 Data[ii++] = '\0';
563                 break;
564             }
565         }
566         for (acc = ii; acc < lineEnd; ++acc) {
567             int const ch = Data[acc];
568 
569             if (!isspace(ch))
570                 break;
571         }
572         if (acc >= lineEnd)
573             return RC(rcAlign, rcFile, rcReading, rcFormat, rcInvalid);
574 
575         for (ii = acc; ii != lineEnd; ++ii) {
576             int const ch = Data[ii];
577 
578             if (isspace(ch)) {
579                 Data[ii++] = '\0';
580                 break;
581             }
582         }
583         tmp.id = id;
584         tmp.seqId = acc;
585         tmp.extra = ii;
586         tmp.extralen = lineEnd > ii ? lineEnd - ii : 0;
587 
588         if ((rc = KDataBufferResize(&buf, buf.elem_count + 1)) != 0) return rc;
589         data = buf.base;
590 
591         data[buf.elem_count-1] = tmp;
592         i = lineEnd + 1;
593     }
594 
595     /* check unique */
596     ksort(data, buf.elem_count, sizeof(data[0]), config_cmp, Data);
597     for (i = 1; i < buf.elem_count; ++i) {
598         if (strcmp(&Data[data[i-1].id], &Data[data[i].id]) == 0)
599             return RC(rcAlign, rcIndex, rcConstructing, rcItem, rcExists);
600     }
601 
602     for (i = 0; i != buf.elem_count; ++i) {
603         unsigned const extralen = data[i].extralen;
604         char const *const id    = &Data[data[i].id];
605         char const *const seqId = &Data[data[i].seqId];
606         char const *const extra = extralen ? &Data[data[i].extra] : NULL;
607         bool circular = false;
608         ReferenceSeq *rs;
609 
610         if (extra && extralen >= 8) {
611             char const *const circ = strcasestr(extra, "circular");
612 
613             circular = circ && (circ == extra || isspace(circ[-1])) &&
614                        (circ[8] == '\0' || isspace(circ[8]));
615         }
616         rs = newReferenceSeq(self, NULL);
617 
618         rs->id = string_dup(id, string_size(id));
619         if (rs->id == NULL)
620             return RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
621 
622         if (strcmp(seqId, UNMAPPED_SEQID_VALUE) == 0) {
623             rs->type = rst_unmapped;
624         }
625         else {
626             rs->seqId = string_dup(seqId, string_size(seqId));
627             if (rs->seqId == NULL)
628                 return RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
629         }
630         rs->circular = circular;
631 
632         addToHashTable(self, rs);
633     }
634     KDataBufferWhack(&buf);
635     return 0;
636 }
637 
638 static
ReferenceMgr_Conf(ReferenceMgr * const self,char const conf[])639 rc_t ReferenceMgr_Conf(ReferenceMgr *const self, char const conf[])
640 {
641     rc_t rc;
642     const KFile* kf = NULL;
643 
644     if (conf == NULL)
645         return 0;
646 
647     rc = FindAndOpenConfigFile(self->rmgr, self->dir, &kf, conf);
648     if (rc == 0) {
649         uint64_t sz;
650         KDataBuffer buf;
651 
652         rc = KFileSize(kf, &sz);
653         assert(rc == 0);
654         if (sz == 0)
655             (void)PLOGMSG(klogWarn, (klogWarn, "Configuration file '$(file)' is empty", "file=%s", conf));
656         else {
657             rc = KDataBufferMakeBytes(&buf, sz + 1);
658             if (rc == 0) {
659                 size_t nread;
660 
661                 rc = KFileReadAll(kf, 0, buf.base, sz, &nread);
662                 if (rc == 0) {
663                     assert(nread == sz);
664                     ((char *)buf.base)[sz] = '\n'; /* make sure that last line is terminated */
665                     rc = ReferenceMgr_ProcessConf(self, buf.base, (unsigned)(sz + 1));
666                 }
667                 KDataBufferWhack(&buf);
668             }
669         }
670         KFileRelease(kf);
671     }
672     return rc;
673 }
674 
675 static
FastaFile_GetSeqIds(KDataBuffer * const buf,char const data[],uint64_t const len)676 rc_t FastaFile_GetSeqIds(KDataBuffer *const buf, char const data[], uint64_t const len)
677 {
678     uint64_t pos;
679     int st = 0;
680 
681     for (pos = 0; pos < len; ++pos) {
682         int const ch = data[pos];
683 
684         if (st == 0) {
685             if (ch == '>') {
686                 uint64_t const n = buf->elem_count;
687                 rc_t rc = KDataBufferResize(buf, n + 1);
688 
689                 if (rc)
690                     return rc;
691                 ((uint64_t *)buf->base)[n] = pos;
692                 st = 1;
693             }
694         }
695         else if (ch == '\r' || ch == '\n')
696             st = 0;
697     }
698     return 0;
699 }
700 
701 static
ImportFasta(ReferenceSeq * const obj,KDataBuffer const * const buf)702 rc_t ImportFasta(ReferenceSeq *const obj, KDataBuffer const *const buf)
703 {
704     unsigned seqId;
705     unsigned seqIdLen;
706     unsigned ln;
707     unsigned src;
708     unsigned dst;
709     unsigned start=0;
710     char *const data = buf->base;
711     unsigned const len = buf->elem_count;
712     char *fastaSeqId = NULL;
713     rc_t rc;
714     MD5State mds;
715 
716     if (len == 0)
717         return 0;
718     assert(data[0] == '>');
719 
720     for (ln = 1; ln != len; ++ln) {
721         int const ch = data[ln];
722 
723         if (ch == '\r' || ch == '\n') {
724             data[ln] = '\0';
725             start = ln + 1;
726             break;
727         }
728     }
729     for (seqId = 1; seqId != ln; ++seqId) {
730         if (!isspace(data[seqId]))
731             break;
732     }
733     for (seqIdLen = 0; seqId + seqIdLen < ln; ++seqIdLen) {
734         if (isspace(data[seqId + seqIdLen])) {
735             ln = seqId + seqIdLen;
736             data[ln] = '\0';
737             break;
738         }
739     }
740     if (seqIdLen == 0)
741         return RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
742 
743     fastaSeqId = string_dup(&data[seqId], string_size(&data[seqId]));
744     if (fastaSeqId == NULL)
745         return RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
746 
747     MD5StateInit(&mds);
748     for (dst = src = start; src != len; ++src) {
749         int const ch = toupper(data[src]);
750 
751         if (isspace(ch))
752             continue;
753 
754         if (strchr(INSDC_4na_map_CHARSET, ch) == NULL && ch != 'X')
755             return RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
756 
757         data[dst] = ch == 'X' ? 'N' : ch;
758         MD5StateAppend(&mds, data + dst, 1);
759         ++dst;
760     }
761     MD5StateFinish(&mds, obj->md5);
762     rc = KDataBufferSub(buf, &obj->u.local.buf, start, dst - start);
763     if (rc == 0) {
764         obj->fastaSeqId = fastaSeqId;
765         obj->type = rst_local;
766         obj->seq_len = dst - start;
767     }
768     else
769         obj->type = rst_dead;
770     return rc;
771 }
772 
subBuffer(KDataBuffer const * const buf,uint64_t const offset,unsigned const length)773 static KDataBuffer subBuffer(KDataBuffer const *const buf, uint64_t const offset, unsigned const length)
774 {
775     KDataBuffer sub;
776     memset(&sub, 0, sizeof(sub));
777     KDataBufferSub(buf, &sub, offset, length);
778     return sub;
779 }
780 
compareSeqIdFields(unsigned const alen,char const a[],unsigned const blen,char const b[])781 static int compareSeqIdFields(unsigned const alen, char const a[], unsigned const blen, char const b[])
782 {
783     unsigned i;
784 
785     for (i = 0; i < alen && i < blen; ++i) {
786         int const cha = a[i];
787         int const chb = b[i];
788         int const diff = toupper(cha) - toupper(chb);
789 
790         if (diff < 0)
791             return -1;
792 
793         if (diff > 0)
794             return 1;
795 
796         if (cha == '\0')
797             return 0;
798     }
799     return alen < blen ? -1 : alen == blen ? 0 : 1;
800 }
801 
802 typedef struct {
803     unsigned length;
804     uint64_t offset;
805 } seqIdField_t;
806 
cmpSeqIdField(void const * A,void const * B,void * Ctx)807 static int64_t cmpSeqIdField(void const *A, void const *B, void *Ctx)
808 {
809     char const *const data = Ctx;
810     seqIdField_t const *a = A;
811     seqIdField_t const *b = B;
812     char const *avalue = &data[a->offset];
813     unsigned const alen = a->length;
814     char const *bvalue = &data[b->offset];
815     unsigned const blen = b->length;
816     int const diff = compareSeqIdFields(alen, avalue, blen, bvalue);
817 
818     return diff != 0 ? diff : a->offset < b->offset ? -1 : 1;
819 }
820 
isInKnownSet(unsigned const len,char const id[])821 static bool isInKnownSet(unsigned const len, char const id[])
822 {
823     if (len == 2 || len == 3) {
824         int const id1 = id[0];
825         int const id2 = id[1];
826 
827         if (len == 3) {
828             int const id3 = id[2];
829 
830             if (id1 == 'd' && id2 == 'b' && id3 == 'j') return true;
831             if (id1 == 'e' && id2 == 'm' && id3 == 'b') return true;
832             if (id1 == 'g' && id2 == 'p' && id3 == 'p') return true;
833             if (id1 == 'r' && id2 == 'e' && id3 == 'f') return true;
834             if (id1 == 't' && id2 == 'p') {
835                 if (id3 == 'd' || id3 == 'e' || id3 == 'g') return true;
836             }
837             return false;
838         }
839         if (id1 == 'g') {
840             if (id2 == 'b' || id2 == 'i')
841                 return true;
842         }
843     }
844     return false;
845 }
846 
ImportFastaFileMany(ReferenceMgr * const self,unsigned const seqIds,uint64_t const seqIdOffset[],KDataBuffer const * const buf)847 static rc_t ImportFastaFileMany(ReferenceMgr *const self,
848                                 unsigned const seqIds, uint64_t const seqIdOffset[],
849                                 KDataBuffer const *const buf)
850 {
851     unsigned const datalen = buf->elem_count;
852     char *const data = buf->base;
853     seqIdField_t *found = NULL;
854     seqIdField_t *ignore = NULL;
855     unsigned found_entries = 0;
856     unsigned found_entries_max = seqIds;
857     unsigned ignore_entries = 0;
858     unsigned ignore_entries_max = 0;
859     unsigned i;
860     rc_t rc = 0;
861 
862     found = malloc(found_entries_max * sizeof(found[0]));
863     assert(found);
864     if (found == NULL)
865         abort();
866 
867     for (i = 0; i < seqIds; ++i) {
868         uint64_t const ofs = seqIdOffset[i];
869         uint64_t const nxt = (i < seqIds - 1) ? seqIdOffset[i + 1] : datalen;
870         unsigned const len = (unsigned)(nxt - ofs);
871         unsigned j = 1;
872         unsigned k;
873 
874         for (k = j; k < len; ++k) {
875             int const ch = data[ofs + k];
876 
877             if (ch == '|' || isspace(ch)) {
878                 char const *const str = &data[ofs + j];
879                 unsigned const len = k - j;
880 
881                 if (len > 0 && (isspace(ch) || !isInKnownSet(len, str))) {
882                     if (found_entries == found_entries_max) {
883                         unsigned const new_max = found_entries_max * 2;
884                         void *const tmp = realloc(found, new_max * sizeof(found[0]));
885 
886                         assert(tmp);
887                         if (tmp == NULL)
888                             abort();
889 
890                         found = tmp;
891                         found_entries_max = new_max;
892                     }
893                     found[found_entries].length = len;
894                     found[found_entries].offset = ofs + j;
895                     ++found_entries;
896                 }
897                 j = k + 1;
898             }
899             if (isspace(ch))
900                 break;
901         }
902     }
903     ksort(found, found_entries, sizeof(found[0]), cmpSeqIdField, (void *)data);
904 
905     for (i = 1; i < found_entries; ++i) {
906         unsigned const llen = ignore_entries_max > 0 ? ignore[ignore_entries - 1].length : 0;
907         char const *const last = ignore_entries_max > 0 ? data + ignore[ignore_entries - 1].offset : 0;
908 
909         unsigned const plen = found[i - 1].length;
910         char const *const prv = data + found[i - 1].offset;
911 
912         unsigned const qlen = found[i].length;
913         char const *const qry = data + found[i].offset;
914 
915         bool const issame = compareSeqIdFields(plen, prv, qlen, qry) == 0;
916         bool const islast = llen > 0 ? compareSeqIdFields(llen, last, qlen, qry) == 0 : false;
917 
918         if (issame && !islast) {
919             if (ignore_entries == ignore_entries_max) {
920                 unsigned const new_max = ignore_entries_max == 0 ? (4096 / sizeof(ignore[0])) : (ignore_entries_max * 2);
921                 void *const tmp = realloc(ignore, new_max * sizeof(ignore[0]));
922 
923                 assert(tmp);
924                 if (tmp == NULL)
925                     abort();
926 
927                 ignore = tmp;
928                 ignore_entries_max = new_max;
929             }
930             ignore[ignore_entries] = found[i - 1];
931             ++ignore_entries;
932         }
933     }
934     free(found);
935 
936     for (i = 0; i < seqIds; ++i) {
937         unsigned index = 0;
938         uint64_t const ofs = seqIdOffset[i];
939         uint64_t const nxt = (i < seqIds - 1) ? seqIdOffset[i + 1] : datalen;
940         unsigned const len = (unsigned)(nxt - ofs);
941         unsigned j = 1;
942         unsigned k;
943         char const *const seqId = &data[ofs + 1];
944         {
945             ReferenceSeq new_seq;
946             KDataBuffer sub = subBuffer(buf, ofs, len);
947 
948             memset(&new_seq, 0, sizeof(new_seq));
949             rc = ImportFasta(&new_seq, &sub);
950             KDataBufferWhack(&sub);
951             if (rc) {
952                 PLOGERR(klogInfo, (klogInfo, rc, "Error reading '$(defline)'; ignored", "defline=%s", seqId));
953                 continue;
954             }
955             else {
956                 ReferenceSeq *p = newReferenceSeq(self, &new_seq);
957                 index = p - self->refSeq;
958                 addToHashBucket(&self->ht[hash0(seqId)], index);
959             }
960         }
961         for (k = j; k < len; ++k) {
962             int const ch = data[ofs + k];
963 
964             if (ch == '\0' || ch == '|' || isspace(ch)) {
965                 unsigned const length = k - j;
966                 uint64_t const offset = ofs + j;
967                 char const *const value = &data[offset];
968 
969                 if (length == 0 || (!isspace(ch) && isInKnownSet(length, value)))
970                     goto IGNORED;
971                 {
972                     unsigned f = 0;
973                     unsigned e = ignore_entries;
974 
975                     while (f < e) {
976                         unsigned const m = f + (e - f) / 2;
977                         unsigned const flen = ignore[m].length;
978                         char const *const fnd = data + ignore[m].offset;
979                         int const diff = compareSeqIdFields(length, value, flen, fnd);
980 
981                         if (diff == 0)
982                             goto IGNORED;
983                         if (diff < 0)
984                             e = m;
985                         else
986                             f = m + 1;
987                     }
988                 }
989                 addToHashBucket(&self->ht[hash(length, value)], index);
990             IGNORED:
991                 j = k + 1;
992                 if (ch == '\0' || isspace(ch))
993                     break;
994             }
995         }
996     }
997 
998     free(ignore);
999     return rc;
1000 }
1001 
1002 #define READ_CHUNK_SIZE (1024 * 1024)
1003 
1004 static
ImportFastaFile(ReferenceMgr * const self,KFile const * kf,ReferenceSeq * rslt)1005 rc_t ImportFastaFile(ReferenceMgr *const self, KFile const *kf,
1006                      ReferenceSeq *rslt)
1007 {
1008     uint64_t file_size;
1009     rc_t rc = KFileSize(kf, &file_size);
1010 
1011     if (rc == 0) {
1012         KDataBuffer fbuf;
1013 
1014         rc = KDataBufferMakeBytes(&fbuf, file_size);
1015         if (rc == 0) {
1016             char *const data = fbuf.base;
1017             uint64_t inbuf = 0;
1018 
1019             while (inbuf < file_size) {
1020                 uint64_t const remain = file_size - inbuf;
1021                 size_t nread = 0;
1022 
1023                 rc = KFileRead(kf, inbuf, &data[inbuf], remain > READ_CHUNK_SIZE ? READ_CHUNK_SIZE : remain, &nread);
1024                 if (rc != 0 || nread == 0)
1025                     break;
1026                 inbuf += nread;
1027             }
1028             if (rc == 0) {
1029                 char const *const base = fbuf.base;
1030                 KDataBuffer seqIdBuf;
1031 
1032                 memset(&seqIdBuf, 0, sizeof(seqIdBuf));
1033                 seqIdBuf.elem_bits = sizeof(file_size) * 8;
1034 
1035                 rc = FastaFile_GetSeqIds(&seqIdBuf, base, file_size);
1036                 if (rc == 0) {
1037                     uint64_t const *const seqIdOffset = seqIdBuf.base;
1038                     unsigned const seqIds = (unsigned)seqIdBuf.elem_count;
1039 
1040                     if (rslt) {
1041                         KDataBuffer sub = subBuffer(&fbuf, seqIdOffset[0], (unsigned)(file_size - seqIdOffset[0]));
1042 
1043                         if (seqIds > 1)
1044                             rc = RC(rcAlign, rcFile, rcReading, rcItem, rcUnexpected);
1045 
1046                         rc = ImportFasta(rslt, &sub);
1047                         KDataBufferWhack(&sub);
1048                         if (rc == 0)
1049                             addToHashTable(self, rslt);
1050                     }
1051                     else {
1052                         ImportFastaFileMany(self, seqIds, seqIdOffset, &fbuf);
1053                     }
1054                 }
1055                 KDataBufferWhack(&seqIdBuf);
1056             }
1057             KDataBufferWhack(&fbuf);
1058         }
1059     }
1060     return rc;
1061 }
1062 
1063 static
OpenFastaFile(KFile const ** const kf,KDirectory const * const dir,char const base[],unsigned const len)1064 rc_t OpenFastaFile(KFile const **const kf,
1065                    KDirectory const *const dir,
1066                    char const base[],
1067                    unsigned const len)
1068 {
1069     char fname_a[4096];
1070     char *fname_h = NULL;
1071     char *fname = fname_a;
1072     rc_t rc;
1073 
1074     if (len + 7 >= sizeof(fname_a)) {
1075         fname_h = malloc(len + 7);
1076         if (fname_h)
1077             fname = fname_h;
1078         else
1079             return RC(rcAlign, rcFile, rcOpening, rcMemory, rcExhausted);
1080     }
1081     memmove(fname, base, len);
1082     memmove(fname + len, ".fasta", 7);
1083 
1084     ALIGN_CF_DBGF(("trying to open fasta file: %.s\n", fname));
1085     rc = KDirectoryOpenFileRead(dir, kf, "%s", fname);
1086     if (rc) {
1087         fname[len + 3] = '\0'; /* base.fasta -> base.fa */
1088 
1089         ALIGN_CF_DBGF(("trying to open fasta file: %.s\n", fname));
1090         rc = KDirectoryOpenFileRead(dir, kf, "%s", fname);
1091     }
1092     free(fname_h);
1093     return rc;
1094 }
1095 
1096 #if 1
ReferenceSeq_Dump(ReferenceSeq const * const rs)1097 void ReferenceSeq_Dump(ReferenceSeq const *const rs)
1098 {
1099     static char const *types[] = {
1100         "'unattached'",
1101         "'fasta'",
1102         "'RefSeq-by-id'",
1103         "'RefSeq-by-seqid'",
1104         "'unmapped'",
1105         "'dead'"
1106     };
1107     unsigned j;
1108 
1109     ((void)types); /* stupid warning */
1110     ALIGN_CF_DBGF(("{ "));
1111     ALIGN_CF_DBGF(("type: %s, ", (rs->type < 0 || rs->type > rst_dead) ? "null" : types[rs->type]));
1112 
1113     if (rs->id)
1114         ALIGN_CF_DBGF(("id: '%s', ", rs->id));
1115     else
1116         ALIGN_CF_DBGF(("id: null, "));
1117 
1118     if (rs->seqId)
1119         ALIGN_CF_DBGF(("seqId: '%s', ", rs->seqId));
1120     else
1121         ALIGN_CF_DBGF(("seqId: null, "));
1122 
1123     if (rs->fastaSeqId)
1124         ALIGN_CF_DBGF(("fastaSeqId: '%s', ", rs->fastaSeqId));
1125     else
1126         ALIGN_CF_DBGF(("fastaSeqId: null, "));
1127 
1128     ALIGN_CF_DBGF(("seq-len: %u, ", rs->seq_len));
1129     ALIGN_CF_DBGF(("circular: %s, ", rs->circular ? "true" : "false"));
1130 
1131     ALIGN_CF_DBGF(("md5: '"));
1132     for (j = 0; j != 16; ++j)
1133         ALIGN_CF_DBGF(("%02X", rs->md5[j]));
1134     ALIGN_CF_DBGF(("', "));
1135 
1136     ALIGN_CF_DBGF(("keys: [ "));
1137     for (j = 0; j != rs->num_used; ++j) {
1138         char const *key = rs->used[j];
1139 
1140         ALIGN_CF_DBGF(("'%s', ", key));
1141     }
1142     ALIGN_CF_DBGF(("] }"));
1143 }
1144 
ReferenceMgr_DumpConfig(ReferenceMgr const * const self)1145 LIB_EXPORT void ReferenceMgr_DumpConfig(ReferenceMgr const *const self)
1146 {
1147     unsigned const n = (unsigned)self->refSeqs.elem_count;
1148     unsigned i;
1149 
1150     ALIGN_CF_DBGF(("config: [\n"));
1151     for (i = 0; i != n; ++i) {
1152         ALIGN_CF_DBGF(("\t"));
1153         ReferenceSeq_Dump(&self->refSeq[i]);
1154         ALIGN_CF_DBGF((",\n"));
1155     }
1156     ALIGN_CF_DBGF(("]\n"));
1157 }
1158 #endif
1159 
1160 static
ReferenceSeq_GetRefSeqInfo(ReferenceSeq * const self)1161 rc_t ReferenceSeq_GetRefSeqInfo(ReferenceSeq *const self)
1162 {
1163     rc_t rc;
1164     uint8_t const *md5;
1165 
1166     assert(self != NULL);
1167     assert(self->type == rst_refSeqById || self->type == rst_refSeqBySeqId);
1168 
1169     if ((rc = RefSeq_Circular(self->u.refseq, &self->circular)) != 0)
1170         return rc;
1171     if ((rc = RefSeq_SeqLength(self->u.refseq, &self->seq_len)) != 0)
1172         return rc;
1173     if ((rc = RefSeq_MD5(self->u.refseq, &md5)) != 0)
1174         return rc;
1175 
1176     if (md5)
1177         memmove(self->md5, md5, 16);
1178     else
1179         memset(self->md5, 0, 16);
1180     return 0;
1181 }
1182 
1183 
1184 /* Try to attach to a RefSeq or a fasta file */
1185 static
ReferenceSeq_Attach(ReferenceMgr * const self,ReferenceSeq * const rs)1186 rc_t ReferenceSeq_Attach(ReferenceMgr *const self, ReferenceSeq *const rs)
1187 {
1188     unsigned const seqid_len = rs->seqId ? (unsigned)string_size(rs->seqId) : 0;
1189     unsigned const id_len = rs->id ? (unsigned)string_size(rs->id) : 0;
1190     rc_t rc = 0;
1191     KFile const *kf = NULL;
1192 
1193     assert(rs->type == rst_unattached);
1194     assert(id_len != 0 || seqid_len != 0);
1195 
1196     if (seqid_len) {
1197         ALIGN_CF_DBGF(("trying to open refseq: %.*s\n", seqid_len, rs->seqId));
1198         if (RefSeqMgr_Exists(self->rmgr, rs->seqId, seqid_len, NULL) == 0) {
1199             rc = RefSeqMgr_GetSeq(self->rmgr, &rs->u.refseq, rs->seqId, seqid_len);
1200             if (rc == 0) {
1201                 rs->type = rst_refSeqBySeqId;
1202                 rc = ReferenceSeq_GetRefSeqInfo(rs);
1203             }
1204             return rc;
1205         }
1206     }
1207     if (id_len) {
1208         ALIGN_CF_DBGF(("trying to open refseq: %.*s\n", id_len, rs->id));
1209         if (RefSeqMgr_Exists(self->rmgr, rs->id, id_len, NULL) == 0) {
1210             rc = RefSeqMgr_GetSeq(self->rmgr, &rs->u.refseq, rs->id, id_len);
1211             if (rc == 0) {
1212                 rs->type = rst_refSeqById;
1213                 rc = ReferenceSeq_GetRefSeqInfo(rs);
1214             }
1215             return rc;
1216         }
1217     }
1218     if (id_len) {
1219         ALIGN_CF_DBGF(("trying to open fasta: %.*s\n", id_len, rs->id));
1220         rc = OpenFastaFile(&kf, self->dir, rs->id, id_len);
1221         if (rc && seqid_len) {
1222             ALIGN_CF_DBGF(("trying to open fasta: %.*s\n", seqid_len, rs->seqId));
1223             rc = OpenFastaFile(&kf, self->dir, rs->seqId, seqid_len);
1224         }
1225     }
1226     else {
1227         ALIGN_CF_DBGF(("trying to open fasta: %.*s\n", seqid_len, rs->seqId));
1228         rc = OpenFastaFile(&kf, self->dir, rs->seqId, seqid_len);
1229     }
1230     if (kf) {
1231         ReferenceSeq tmp = *rs;
1232 
1233         ALIGN_CF_DBGF(("importing fasta"));
1234         rc = ImportFastaFile(self, kf, rs);
1235         KFileRelease(kf);
1236         if (rc != 0)
1237             *rs = tmp;
1238         return rc;
1239     }
1240     return 0;
1241 }
1242 
1243 struct Candidate {
1244     int weight;
1245     unsigned index;
1246 };
1247 
cmpCandidate(void const * A,void const * B,void * Ctx)1248 static int64_t CC cmpCandidate(void const *A, void const *B, void *Ctx)
1249 {
1250     struct Candidate const *const a = A;
1251     struct Candidate const *const b = B;
1252 
1253     return (int64_t)(a->weight - b->weight);
1254 }
1255 
sortCandidateList(unsigned const len,struct Candidate * list)1256 static void sortCandidateList(unsigned const len, struct Candidate *list)
1257 {
1258     if (len > 1)
1259         ksort(list, len, sizeof(list[0]), cmpCandidate, NULL);
1260 }
1261 
candidates(ReferenceMgr * const self,unsigned * const count,struct Candidate ** const rslt,unsigned const idLen,char const id[],unsigned const seq_len,uint8_t const md5[16])1262 static void candidates(ReferenceMgr *const self,
1263                        unsigned *const count,
1264                        struct Candidate **const rslt,
1265                        unsigned const idLen,
1266                        char const id[],
1267                        unsigned const seq_len,
1268                        uint8_t const md5[16])
1269 {
1270     unsigned const hv = hash(idLen, id);
1271     Bucket const bucket = self->ht[hv];
1272     unsigned num_possible = 0;
1273     struct Candidate *possible = malloc(sizeof(possible[0]) * bucket.count);
1274     unsigned i;
1275 
1276     assert(possible != NULL);
1277     if (possible == NULL)
1278         abort();
1279     for (i = 0; i < bucket.count; ++i) {
1280         unsigned const index = bucket.index[i];
1281         ReferenceSeq *const rs = &self->refSeq[index];
1282 
1283         if (rs->type != rst_dead) {
1284             bool const sameId = rs->id != NULL && strcmp(rs->id, id) == 0;
1285             bool const sameSeqId = rs->seqId != NULL && strcasecmp(rs->seqId, id) == 0;
1286             int const sameFastaSeqId = rs->fastaSeqId != NULL ? str_weight(rs->fastaSeqId, id, idLen) : 0;
1287             struct Candidate nv;
1288 
1289             nv.weight = (sameId ? id_match : 0) | (sameSeqId ? seqId_match : 0) | sameFastaSeqId;
1290             nv.index = index;
1291 
1292             if (nv.weight > 0)
1293                 possible[num_possible++] = nv;
1294         }
1295     }
1296     if (num_possible == 0) {
1297         free(possible);
1298         possible = NULL;
1299     }
1300     *count = num_possible;
1301     *rslt = possible;
1302 }
1303 
tryFastaOrRefSeq(ReferenceMgr * const self,ReferenceSeq ** const rslt,unsigned const idLen,char const id[],bool * tryAgain)1304 static rc_t tryFastaOrRefSeq(ReferenceMgr *const self,
1305                      ReferenceSeq **const rslt,
1306                      unsigned const idLen,
1307                      char const id[],
1308                      bool *tryAgain)
1309 {
1310     KFile const *kf = NULL;
1311     rc_t rc = 0;
1312 
1313     if (OpenFastaFile(&kf, self->dir, id, idLen) == 0) {
1314         /* found a fasta file; load it and try again */
1315         rc = ImportFastaFile(self, kf, NULL);
1316         if (rc == 0) {
1317             *tryAgain = true;
1318             return 0;
1319         }
1320     }
1321     /* didn't find a fasta file; try RefSeq */
1322     ALIGN_CF_DBGF(("trying to open refseq: %s\n", id));
1323     if (RefSeqMgr_Exists(self->rmgr, id, idLen, NULL) == 0) {
1324         ReferenceSeq *const seq = newReferenceSeq(self, NULL);
1325 
1326         rc = RefSeqMgr_GetSeq(self->rmgr, &seq->u.refseq, id, idLen);
1327         if (rc == 0) {
1328             unsigned const hv = hash(idLen, id);
1329             seq->id = string_dup(id, idLen);
1330             seq->type = rst_refSeqById;
1331 
1332             addToHashBucket(&self->ht[hv], seq - self->refSeq);
1333 
1334             ReferenceSeq_GetRefSeqInfo(seq);
1335             *rslt = seq;
1336         }
1337     }
1338     else {
1339         /* nothing found and out of options */
1340         rc = RC(rcAlign, rcFile, rcConstructing, rcId, rcNotFound);
1341     }
1342     return rc;
1343 }
1344 
setSeqLenBit(ReferenceMgr * const self,unsigned const N,struct Candidate * const C,unsigned const seq_len)1345 static void setSeqLenBit(ReferenceMgr *const self, unsigned const N, struct Candidate *const C, unsigned const seq_len)
1346 {
1347     unsigned i;
1348     for (i = 0; i < N; ++i) {
1349         unsigned const index = C[i].index;
1350         ReferenceSeq const *const rs = &self->refSeq[index];
1351 
1352         if (seq_len == rs->seq_len)
1353             C[i].weight |= seq_len_match;
1354     }
1355 }
1356 
setMD5Bit(ReferenceMgr * const self,unsigned const N,struct Candidate * const C,uint8_t const md5[])1357 static void setMD5Bit(ReferenceMgr *const self, unsigned const N, struct Candidate *const C, uint8_t const md5[])
1358 {
1359     unsigned i;
1360     for (i = 0; i < N; ++i) {
1361         unsigned const index = C[i].index;
1362         ReferenceSeq const *const rs = &self->refSeq[index];
1363 
1364         if (memcmp(md5, rs->md5, 16) == 0)
1365             C[i].weight |= md5_match;
1366     }
1367 }
1368 
setAttachedBit(ReferenceMgr * const self,unsigned const N,struct Candidate * const C)1369 static void setAttachedBit(ReferenceMgr *const self, unsigned const N, struct Candidate *const C)
1370 {
1371     unsigned i;
1372     for (i = 0; i < N; ++i) {
1373         unsigned const index = C[i].index;
1374         ReferenceSeq const *const rs = &self->refSeq[index];
1375 
1376         if (rs->type != rst_unattached)
1377             C[i].weight |= attached;
1378     }
1379 }
1380 
checkForMultiMapping(ReferenceMgr * const self,ReferenceSeq * const chosen,bool * const wasRenamed)1381 static ReferenceSeq *checkForMultiMapping(ReferenceMgr *const self, ReferenceSeq *const chosen, bool *const wasRenamed)
1382 {
1383     unsigned const hv = hash0(chosen->seqId);
1384     Bucket const bucket = self->ht[hv];
1385     unsigned i;
1386 
1387     for (i = 0; i < bucket.count; ++i) {
1388         ReferenceSeq *qry = &self->refSeq[bucket.index[i]];
1389         if (qry == chosen || qry->type == rst_dead || qry->type == rst_unattached || qry->type == rst_renamed || qry->seqId == NULL)
1390             continue;
1391         if (strcasecmp(chosen->seqId, qry->seqId) == 0) {
1392             void *const oldId = chosen->id;
1393 
1394             chosen->type = rst_renamed;
1395             chosen->id = string_dup(qry->id, string_size(qry->id));
1396             *wasRenamed = true;
1397             free(oldId);
1398             return qry;
1399         }
1400     }
1401     return chosen;
1402 }
1403 
tryFasta(ReferenceMgr * const self,ReferenceSeq ** const rslt,char const seqId[],unsigned const seq_len,uint8_t const md5[16])1404 static rc_t tryFasta(ReferenceMgr *const self,
1405                      ReferenceSeq **const rslt,
1406                      char const seqId[],
1407                      unsigned const seq_len,
1408                      uint8_t const md5[16])
1409 {
1410     ReferenceSeq *const seq = *rslt;
1411     unsigned const hv = hash0(seqId);
1412     Bucket const bucket = self->ht[hv];
1413     unsigned best = bucket.count;
1414     unsigned best_score = 0;
1415     unsigned i;
1416 
1417     for (i = 0; i < bucket.count; ++i) {
1418         unsigned score = 0;
1419         ReferenceSeq *const qry = &self->refSeq[bucket.index[i]];
1420 
1421         if (qry != seq && qry->fastaSeqId != NULL) {
1422             char const *const substr = strcasestr(qry->fastaSeqId, seqId);
1423             if (substr != NULL) {
1424                 unsigned const at = substr - qry->fastaSeqId;
1425                 unsigned const slen = (unsigned)string_size(seqId);
1426                 unsigned const qlen = (unsigned)string_size(qry->fastaSeqId);
1427                 if (at == 0 && slen == qlen)
1428                     score |= exact_match;
1429                 else {
1430                     bool const leftEdge = at == 0 || qry->fastaSeqId[at - 1] == '|';
1431                     bool const rightEdge = (at + slen) == qlen || qry->fastaSeqId[at + slen] == '|';
1432 
1433                     if (leftEdge && rightEdge)
1434                         score |= substring_match;
1435                 }
1436             }
1437         }
1438         if (score == 0)
1439             continue;
1440 
1441         if (seq_len != 0 && qry->seq_len == seq_len)
1442             score |= seq_len_match;
1443         if (md5 != NULL && memcmp(md5, qry->md5, 16) == 0)
1444             score |= md5_match;
1445         if (score > best_score) {
1446             best = i;
1447             best_score = score;
1448         }
1449     }
1450     if (best != bucket.count) {
1451         *rslt = &self->refSeq[bucket.index[best]];
1452         return 0;
1453     }
1454     else {
1455         return RC(rcAlign, rcFile, rcConstructing, rcId, rcNotFound);
1456     }
1457 }
1458 
findSeq(ReferenceMgr * const self,ReferenceSeq ** const rslt,char const id[],unsigned const seq_len,uint8_t const md5[16],bool const allowMultiMapping,bool wasRenamed[])1459 static rc_t findSeq(ReferenceMgr *const self,
1460                     ReferenceSeq **const rslt,
1461                     char const id[],
1462                     unsigned const seq_len,
1463                     uint8_t const md5[16],
1464                     bool const allowMultiMapping,
1465                     bool wasRenamed[])
1466 {
1467     unsigned const idLen = (unsigned)string_size(id);
1468     unsigned num_possible = 0;
1469     struct Candidate *possible = NULL;
1470     ReferenceSeq *chosen = NULL;
1471     rc_t rc = 0;
1472 
1473     candidates(self, &num_possible, &possible, idLen, id, seq_len, md5);
1474     if (num_possible == 0) {
1475         /* nothing was found; try a fasta file */
1476         bool tryAgain = false;
1477         rc_t const rc = tryFastaOrRefSeq(self, rslt, idLen, id, &tryAgain);
1478 
1479         if (rc != 0 && !tryAgain)
1480             return rc;
1481 
1482         return findSeq(self, rslt, id, seq_len, md5, allowMultiMapping, wasRenamed);
1483     }
1484     if (num_possible > 1) {
1485         if (seq_len != 0)
1486             setSeqLenBit(self, num_possible, possible, seq_len);
1487         if (md5 != NULL)
1488             setMD5Bit(self, num_possible, possible, md5);
1489         sortCandidateList(num_possible, possible);
1490         setAttachedBit(self, num_possible, possible);
1491         sortCandidateList(num_possible - 1, possible);
1492     }
1493     /* if there is no second best OR the best is better than the second best
1494      * then we have a clear choice
1495      */
1496     if (num_possible == 1 || (possible[num_possible - 1].weight & ~attached) > (possible[num_possible - 2].weight & ~attached)) {
1497         unsigned const index = possible[num_possible - 1].index;
1498         chosen = &self->refSeq[index];
1499     }
1500     free(possible);
1501     if (chosen == NULL) {
1502         /* there was no clear choice */
1503         return RC(rcAlign, rcFile, rcConstructing, rcId, rcAmbiguous);
1504     }
1505     if (chosen->seqId != NULL && !allowMultiMapping) {
1506         chosen = checkForMultiMapping(self, chosen, wasRenamed);
1507     }
1508     if (chosen->type == rst_unattached) {
1509         rc = ReferenceSeq_Attach(self, chosen);
1510         if (rc == 0 && chosen->type == rst_unattached) {
1511             /* still not attached; try seqId fasta */
1512             char const *const seqId = chosen->seqId;
1513 
1514             chosen->type = rst_dead;
1515             if (seqId)
1516                 rc = tryFasta(self, &chosen, seqId, seq_len, md5);
1517             else
1518                 rc = RC(rcAlign, rcFile, rcConstructing, rcId, rcNotFound);
1519         }
1520     }
1521     if (rc == 0) {
1522         if (chosen->id == NULL)
1523             chosen->id = string_dup(id, idLen);
1524         if (chosen->seqId == NULL && chosen->fastaSeqId != NULL)
1525             chosen->seqId = string_dup(chosen->fastaSeqId, string_size(chosen->fastaSeqId));
1526         *rslt = chosen;
1527     }
1528     if (rc)
1529         chosen->type = rst_dead;
1530     addToIndex(self, id, chosen);
1531     return rc;
1532 }
1533 
ReferenceMgr_FindSeq(ReferenceMgr const * const self,char const id[])1534 static ReferenceSeq *ReferenceMgr_FindSeq(ReferenceMgr const *const self, char const id[])
1535 {
1536     int const fnd = findId(self, id);
1537     return (fnd >= 0) ? &self->refSeq[fnd] : NULL;
1538 }
1539 
1540 static
ReferenceMgr_OpenSeq(ReferenceMgr * const self,ReferenceSeq ** const rslt,char const id[],unsigned const seq_len,uint8_t const md5[16],bool const allowMultiMapping,bool wasRenamed[])1541 rc_t ReferenceMgr_OpenSeq(ReferenceMgr *const self,
1542                           ReferenceSeq **const rslt,
1543                           char const id[],
1544                           unsigned const seq_len,
1545                           uint8_t const md5[16],
1546                           bool const allowMultiMapping,
1547                           bool wasRenamed[])
1548 {
1549     ReferenceSeq *const obj = ReferenceMgr_FindSeq(self, id);
1550     if (obj) {
1551         assert(rslt != NULL);
1552         *rslt = NULL;
1553         if (obj->type == rst_dead)
1554             return RC(rcAlign, rcIndex, rcSearching, rcItem, rcInvalid);
1555         if (obj->type == rst_renamed) {
1556             bool dummy = false;
1557 
1558             *wasRenamed = true;
1559             return ReferenceMgr_OpenSeq(self, rslt, obj->id, seq_len, md5, allowMultiMapping, &dummy);
1560         }
1561         if (obj->type == rst_refSeqBySeqId || obj->type == rst_refSeqById) {
1562             RefSeq const *dummy = NULL;
1563             char const *const key = (obj->type == rst_refSeqById) ? obj->id : obj->seqId;
1564             rc_t const rc = RefSeqMgr_GetSeq(self->rmgr, &dummy, key, (unsigned)string_size(key));
1565 
1566             assert(rc == 0);
1567             assert(dummy == obj->u.refseq);
1568         }
1569         *rslt = obj;
1570         return 0;
1571     }
1572     return findSeq(self, rslt, id, seq_len, md5, allowMultiMapping, wasRenamed);
1573 }
1574 
ReferenceMgr_SetCache(ReferenceMgr const * const self,size_t cache,uint32_t num_open)1575 LIB_EXPORT rc_t CC ReferenceMgr_SetCache(ReferenceMgr const *const self, size_t cache, uint32_t num_open)
1576 {
1577     return RefSeqMgr_SetCache(self->rmgr, cache, num_open);
1578 }
1579 
1580 static
OpenDataDirectory(KDirectory const ** rslt,char const path[])1581 rc_t OpenDataDirectory(KDirectory const **rslt, char const path[])
1582 {
1583     KDirectory *dir;
1584     rc_t rc = KDirectoryNativeDir(&dir);
1585 
1586     if (rc == 0) {
1587         if (path) {
1588             rc = KDirectoryOpenDirRead(dir, rslt, false, "%s", path);
1589             KDirectoryRelease(dir);
1590         }
1591         else
1592             *rslt = dir;
1593     }
1594     return rc;
1595 }
1596 
ReferenceMgr_Make(ReferenceMgr const ** cself,VDatabase * db,VDBManager const * vmgr,const uint32_t options,char const conf[],char const path[],uint32_t max_seq_len,size_t cache,uint32_t num_open)1597 LIB_EXPORT rc_t CC ReferenceMgr_Make(ReferenceMgr const **cself, VDatabase *db,
1598                                      VDBManager const* vmgr,
1599                                      const uint32_t options, char const conf[], char const path[],
1600                                      uint32_t max_seq_len, size_t cache, uint32_t num_open)
1601 {
1602     rc_t rc;
1603     ReferenceMgr *self;
1604     uint32_t wopt = 0;
1605 
1606     if (cself == NULL)
1607         return RC(rcAlign, rcIndex, rcConstructing, rcParam, rcNull);
1608 
1609     wopt |= (options & ewrefmgr_co_allREADs) ? ewref_co_SaveRead : 0;
1610     wopt |= (options & ewrefmgr_co_Coverage) ? ewref_co_Coverage : 0;
1611 
1612     if (max_seq_len == 0)
1613         max_seq_len = TableWriterRefSeq_MAX_SEQ_LEN;
1614 
1615     self = calloc(1, sizeof(*self));
1616     if (self) {
1617         rc = KDataBufferMakeBytes(&self->seq, max_seq_len);
1618         if (rc == 0) {
1619             self->compress.elem_bits = sizeof(compress_buffer_t) * 8;
1620             self->refSeqs.elem_bits = sizeof(ReferenceSeq) * 8;
1621 
1622             self->options = options;
1623             self->cache = cache;
1624             self->num_open_max = num_open;
1625             self->max_seq_len = max_seq_len;
1626             if (db) VDatabaseAddRef(self->db = db);
1627             rc = OpenDataDirectory(&self->dir, path);
1628             if (rc == 0) {
1629                 rc = RefSeqMgr_Make(&self->rmgr, vmgr, 0, cache, num_open);
1630                 if (rc == 0) {
1631                     rc = ReferenceMgr_Conf(self, conf);
1632                     if (rc == 0) {
1633                         *cself = self;
1634                         ALIGN_DBG("conf %s, local path '%s'", conf ? conf : "", path ? path : "");
1635                         return 0;
1636                     }
1637                     (void)PLOGERR(klogErr, (klogErr, rc, "failed to open configuration $(file)", "file=%s/%s", path ? path : ".", conf));
1638                 }
1639             }
1640         }
1641         ReferenceMgr_Release(self, false, NULL, false, NULL);
1642     }
1643     else
1644         rc = RC(rcAlign, rcIndex, rcConstructing, rcMemory, rcExhausted);
1645 
1646     ALIGN_DBGERR(rc);
1647 
1648     return rc;
1649 }
1650 
1651 #define ID_CHUNK_SZ 256
1652 
1653 typedef struct TChunk32_struct {
1654     struct TChunk32_struct* next;
1655     uint32_t id[ID_CHUNK_SZ]; /*** will only work with positive ids **/
1656 } TChunk32;
1657 
1658 typedef struct AlignId32List_struct {
1659 	TChunk32* head;
1660 	TChunk32* tail;
1661 	uint32_t  tail_qty;  /** number elements in the last chunk **/
1662 	uint32_t  chunk_qty; /** number of chunks */
1663 } AlignId32List;
1664 typedef struct AlignIdList_struct {
1665 	AlignId32List **sub_list;
1666 	uint32_t        sub_list_count;
1667 } AlignIdList;
1668 
AlignId32ListCount(AlignId32List * l)1669 static uint64_t AlignId32ListCount(AlignId32List *l)
1670 { return (l->chunk_qty>0)?ID_CHUNK_SZ*(l->chunk_qty-1)+ l->tail_qty:0;}
1671 
AlignIdListCount(AlignIdList * l)1672 static uint64_t AlignIdListCount(AlignIdList *l)
1673 {
1674 	uint64_t ret=0;
1675 	if(l){
1676 		uint32_t i;
1677 		for(i=0;i<l->sub_list_count;i++){
1678 			if(l->sub_list[i]){
1679 				ret += AlignId32ListCount(l->sub_list[i]);
1680 			}
1681 		}
1682 	}
1683 	return ret;
1684 }
AlignIdListFlatCopy(AlignIdList * l,int64_t * buf,uint64_t num_elem,bool do_sort)1685 static uint64_t AlignIdListFlatCopy(AlignIdList *l,int64_t *buf,uint64_t num_elem,bool do_sort)
1686 {
1687 	uint64_t res=0;
1688 	uint32_t i,j;
1689 	AlignId32List* cl;
1690 	assert(l!=0);
1691 
1692 	if((cl = l->sub_list[0])!=NULL){
1693 		TChunk32* head  = cl->head;
1694 		while(head !=  cl->tail){
1695 			for(i=0;i<ID_CHUNK_SZ && res < num_elem;i++,res++){
1696 				buf[res] = head->id[i];
1697 			}
1698 			head = head->next;
1699 		}
1700 		for(i=0;i<cl->tail_qty && res < num_elem;i++,res++){
1701 			buf[res] = head->id[i];
1702 		}
1703 	}
1704 	for(j = 1; j< l->sub_list_count && res < num_elem;j++){
1705 		if((cl = l->sub_list[j])!=NULL){
1706 			TChunk32* head  = cl->head;
1707 			uint64_t  hi = ((uint64_t)j) << 32;
1708 			while(head !=  cl->tail){
1709 				for(i=0;i<ID_CHUNK_SZ && res < num_elem;i++,res++){
1710 					buf[res] = hi | head->id[i];
1711 				}
1712 				head = head->next;
1713 			}
1714 			for(i=0;i<cl->tail_qty && res < num_elem;i++,res++){
1715 				buf[res] = hi | head->id[i];
1716 			}
1717 		}
1718 	}
1719 	if(do_sort && res > 1)
1720 		ksort_int64_t(buf,res);
1721 	return res;
1722 }
1723 
AlignId32ListAddId(AlignId32List * l,const uint32_t id)1724 static rc_t AlignId32ListAddId(AlignId32List *l,const uint32_t id)
1725 {
1726 	if(l->tail  == NULL){
1727 		l->head = l->tail = malloc(sizeof(*l->tail));
1728 		if(l->tail == NULL) return RC(rcAlign, rcTable, rcCommitting, rcMemory, rcExhausted);
1729 		l->chunk_qty =1;
1730 		l->tail_qty = 0;
1731 	}
1732 	if(l->tail_qty == ID_CHUNK_SZ){/** chunk is full **/
1733 		l->tail->next = malloc(sizeof(*l->tail));
1734 		if(l->tail == NULL) return RC(rcAlign, rcTable, rcCommitting, rcMemory, rcExhausted);
1735 		l->tail = l->tail->next;
1736 		l->chunk_qty ++;
1737 		l->tail_qty = 0;
1738 	}
1739 	l->tail->id[l->tail_qty++]=id;
1740 	return 0;
1741 }
1742 
AlignIdListAddId(AlignIdList * l,const int64_t id)1743 static rc_t AlignIdListAddId(AlignIdList *l,const int64_t id)
1744 {
1745 	uint32_t  sub_id,id32;
1746 	if(id < 0) return RC(rcAlign, rcTable, rcCommitting, rcId, rcOutofrange);
1747 	id32 = (uint32_t) id;
1748 	sub_id = id >> 32;
1749 	if(sub_id >= l->sub_list_count) return RC(rcAlign, rcTable, rcCommitting, rcId, rcOutofrange);
1750 	if(l->sub_list[sub_id] == NULL){
1751 		l->sub_list[sub_id] = calloc(1,sizeof(AlignId32List));
1752 		if(l->sub_list[sub_id] == NULL) return RC(rcAlign, rcTable, rcCommitting, rcMemory, rcExhausted);
1753 	}
1754 	return AlignId32ListAddId(l->sub_list[sub_id],id32);
1755 }
1756 
AlignId32ListRelease(AlignId32List * l)1757 static void AlignId32ListRelease(AlignId32List *l)
1758 {
1759 	if(l){
1760 		while(l->head != l->tail){
1761 			TChunk32* head = l->head;
1762 			l->head = l->head->next;
1763 			free(head);
1764 		}
1765 		free(l->head);
1766 		free(l);
1767 	}
1768 }
AlignIdListRelease(AlignIdList * l)1769 static void AlignIdListRelease(AlignIdList *l)
1770 {
1771         if(l){
1772 		uint32_t i;
1773 		for(i=0;i<l->sub_list_count;i++){
1774 			AlignId32ListRelease(l->sub_list[i]);
1775 		}
1776 		free(l->sub_list);
1777 		free(l);
1778         }
1779 }
1780 
1781 
1782 typedef struct {
1783     AlignIdList*	idlist;
1784     ReferenceSeqCoverage cover;
1785     INSDC_coord_len bin_seq_len;
1786 } TCover;
1787 
1788 static
ReferenceMgr_TCoverRelease(TCover * c)1789 void ReferenceMgr_TCoverRelease(TCover* c)
1790 {
1791 	if(c){
1792 		AlignIdListRelease(c->idlist);
1793 		c->idlist = NULL;
1794 	}
1795 }
1796 static
ReferenceMgr_TCoverSetMaxId(TCover * c,int64_t id)1797 rc_t ReferenceMgr_TCoverSetMaxId(TCover* c,int64_t id)
1798 {
1799 	uint32_t  sub_id;
1800 	if(id < 0) return RC(rcAlign, rcTable, rcCommitting, rcId, rcOutofrange);
1801 	sub_id = id >> 32;
1802 	if(c->idlist == NULL){
1803 		c->idlist = calloc(1,sizeof(AlignIdList));
1804 		if(c->idlist==NULL) return RC(rcAlign, rcTable, rcCommitting, rcMemory, rcExhausted);
1805 		c->idlist->sub_list_count = sub_id+1;
1806 		c->idlist->sub_list = calloc(c->idlist->sub_list_count,sizeof(c->idlist->sub_list[0]));
1807 		if(c->idlist->sub_list == NULL) return RC(rcAlign, rcTable, rcCommitting, rcMemory, rcExhausted);
1808 	} else {
1809 		return RC(rcAlign, rcTable, rcCommitting, rcParam, rcUnexpected);
1810 	}
1811 	return 0;
1812 }
1813 
1814 static
CoverageGetSeqLen(ReferenceMgr const * const mgr,TCover data[],uint64_t const rows)1815 rc_t CoverageGetSeqLen(ReferenceMgr const *const mgr, TCover data[], uint64_t const rows)
1816 {
1817     TableReaderColumn acols[] =
1818     {
1819         {0, "(INSDC:coord:len)SEQ_LEN", {NULL}, 0, 0},
1820         {0, NULL, {NULL}, 0, 0}
1821     };
1822     VTable const *tbl;
1823     rc_t rc = VDatabaseOpenTableRead(mgr->db, &tbl, "REFERENCE");
1824 
1825     if (rc == 0) {
1826         TableReader const *reader;
1827 
1828         rc = TableReader_Make(&reader, tbl, acols, 0);
1829         if (rc == 0) {
1830             uint64_t i;
1831 
1832             for (i = 0; i != rows; ++i) {
1833                 rc = TableReader_ReadRow(reader, i + 1);
1834                 if (rc == 0 && acols->len > 0)
1835                     data[i].bin_seq_len = acols->base.coord_len[0];
1836             }
1837             TableReader_Whack(reader);
1838         }
1839         VTableRelease(tbl);
1840     }
1841     return rc;
1842 }
1843 
1844 static
ReferenceMgr_ReCover(const ReferenceMgr * cself,uint64_t ref_rows,rc_t (* const quitting)(void))1845 rc_t ReferenceMgr_ReCover(const ReferenceMgr* cself, uint64_t ref_rows, rc_t (*const quitting)(void))
1846 {
1847     rc_t rc = 0;
1848     uint64_t new_rows = 0;
1849     const TableWriterRefCoverage* cover_writer = NULL;
1850 
1851     TableReaderColumn acols[] =
1852     {
1853         {0, "REF_ID", {NULL}, 0, 0},
1854         {0, "REF_START", {NULL}, 0, 0},
1855         {0, "CIGAR_LONG",{NULL}, 0, 0},
1856         {0, "REF_POS",{NULL}, 0, 0},
1857         {0, NULL, {NULL}, 0, 0}
1858     };
1859     const int64_t** al_ref_id = &acols[0].base.i64;
1860     const INSDC_coord_zero** al_ref_start = &acols[1].base.coord0;
1861     const TableReaderColumn* cigar =  &acols[2];
1862     const INSDC_coord_zero** al_ref_pos = &acols[3].base.coord0;
1863     /* order is important see ReferenceSeqCoverage struct */
1864     struct {
1865         const char* nm;
1866 	    const char* col;
1867         bool ids_only;
1868     } tbls[] = { {"PRIMARY_ALIGNMENT", "PRIMARY_ALIGNMENT_IDS",false},
1869         {"SECONDARY_ALIGNMENT", "SECONDARY_ALIGNMENT_IDS",false},
1870         {"EVIDENCE_INTERVAL", "EVIDENCE_INTERVAL_IDS", true} };
1871 	int tbls_qty=(sizeof(tbls)/sizeof(tbls[0]));
1872     rc_t rc1 = 0;
1873     int64_t rr;
1874     uint32_t i;
1875     uint8_t* hilo=NULL;
1876     TCover* data = NULL;
1877 
1878     /* allocate mem for ref_rows of reference coverage*/
1879     if((data = calloc(ref_rows, (sizeof(*data) + cself->max_seq_len))) == NULL) {
1880 		rc = RC(rcAlign, rcTable, rcCommitting, rcMemory, rcExhausted);
1881     } else {
1882 		/** allocation for both data and hilo was done in 1 shot ***/
1883 		hilo = (uint8_t *)&data[ref_rows];
1884         rc = CoverageGetSeqLen(cself, data, ref_rows);
1885     }
1886     /* grep through tables for coverage data */
1887     ALIGN_R_DBG("covering REFERENCE rowid range [1:%ld]",ref_rows);
1888     for(i = 0; rc == 0 && i < tbls_qty; i++) { /* TABLE LOOP STARTS */
1889         const VTable* table = NULL;
1890         const TableReader* reader = NULL;
1891         int64_t al_from;
1892         uint64_t al_qty;
1893 
1894         ALIGN_R_DBG("covering REFERENCE with %s", tbls[i].nm);
1895         if((rc = VDatabaseOpenTableRead(cself->db, &table, "%s", tbls[i].nm)) != 0) {
1896             if(GetRCState(rc) == rcNotFound) {
1897                 ALIGN_R_DBG("table %s was not found, ignored", tbls[i].nm);
1898                 rc = 0;
1899                 continue;
1900             } else {
1901                 break;
1902             }
1903         }
1904         if((rc = TableReader_Make(&reader, table, acols, cself->cache)) == 0 &&
1905            (rc = TableReader_IdRange(reader, &al_from, &al_qty)) == 0) {
1906             int64_t al_rowid;
1907 
1908             for(al_rowid = al_from; rc == 0 && al_rowid < al_from + al_qty; al_rowid++) {
1909                 if((rc = TableReader_ReadRow(reader, al_rowid)) != 0) {
1910                     break;
1911                 }
1912                 rr    = **al_ref_id-1;
1913                 /**** Record ALIGNMENT_IDS ***/
1914                 if(  data[rr].idlist == NULL
1915                    && (rc = ReferenceMgr_TCoverSetMaxId(data+rr,al_from + al_qty))!=0){
1916                     break; /*** out-of-memory ***/
1917                 }
1918                 if((rc = AlignIdListAddId(data[rr].idlist,al_rowid))!=0){
1919                     break; /*** out-of-memory ***/
1920                 }
1921                 /**** Done alignment ids ***/
1922                 if(!tbls[i].ids_only) { /*** work on statistics ***/
1923                     char const *c = cigar->base.str;
1924                     const char *c_end = c + cigar->len;
1925                     int64_t const global_ref_pos = rr*cself->max_seq_len + **al_ref_start; /** global_ref_start **/
1926                     int64_t const global_refseq_start = global_ref_pos -  **al_ref_pos;   /** global_ref_start of current reference **/
1927                     unsigned const bin_no = (unsigned)(global_ref_pos / cself->max_seq_len);
1928                     TCover *const bin = &data[bin_no];
1929                     uint8_t *const cov = &hilo[global_ref_pos];
1930                     int64_t ref_offset = 0;
1931                     int64_t max_ref_offset = 0;
1932                     int64_t min_ref_offset = 0;
1933                     int64_t j;
1934 
1935                     while (rc == 0 && c < c_end) {
1936                         int op_len = (int)strtol(c, (char **)&c, 10);
1937                         int const op = *c++;
1938 
1939                         switch (op){
1940                         case 'I':/* extra bases in the read **/
1941                             ++bin->cover.indels;
1942                         case 'S':/* skip in the read */
1943                             break;
1944                         case 'B':/* back up in the sequence */
1945                             if (ref_offset > op_len)
1946                                 ref_offset -= op_len;
1947                             else
1948                                 ref_offset = 0;
1949                             break;
1950                         case 'D': /** delete in the reference ***/
1951                             ++bin->cover.indels;
1952                         case 'N': /** expected skip in the reference ***/
1953                             ref_offset += op_len;
1954                             break;
1955                         case 'X':
1956                             bin->cover.mismatches += op_len;
1957                         case '=':
1958                             ref_offset += op_len;
1959                             break;
1960                         default:
1961                             rc = RC(rcAlign, rcTable, rcCommitting, rcData, rcUnrecognized);
1962                         }
1963                         if (min_ref_offset > ref_offset)
1964                             min_ref_offset = ref_offset;
1965                         if (max_ref_offset < ref_offset)
1966                             max_ref_offset = ref_offset;
1967                     }
1968                     for (j = min_ref_offset; j < max_ref_offset; ++j) {
1969                         unsigned const hl = cov[j];
1970 
1971                         if (hl < UINT8_MAX)
1972                             cov[j] = hl + 1;
1973                     }
1974                     /*** check if OVERLAPS are needed ***/
1975                     {
1976                         int64_t min_rr = (global_ref_pos + min_ref_offset)/cself->max_seq_len;
1977                         int64_t max_rr = (global_ref_pos + max_ref_offset)/cself->max_seq_len;
1978 
1979                         if(min_rr < 0) min_rr = 0;
1980                         if(max_rr >= ref_rows) max_rr = ref_rows -1;
1981 
1982                         assert(min_rr<= max_rr);
1983 
1984                         if(min_rr < max_rr){
1985                             int64_t  overlap_ref_pos; /** relative the beginning of the reference **/
1986                             uint32_t overlap_ref_len = (global_ref_pos + max_ref_offset) % cself->max_seq_len ;
1987 
1988                             min_rr++;
1989                             if (global_ref_pos + min_ref_offset > global_refseq_start) {
1990                                 overlap_ref_pos = global_ref_pos + min_ref_offset - global_refseq_start;
1991                             }
1992                             else {
1993                                 overlap_ref_pos = 1;
1994                             }
1995                             for (; min_rr < max_rr; ++min_rr) {
1996                                 if (  data[min_rr].cover.overlap_ref_pos[i] == 0 /*** NOT SET***/
1997                                     || overlap_ref_pos < data[min_rr].cover.overlap_ref_pos[i])
1998                                 {
1999 									data[min_rr].cover.overlap_ref_pos[i] = (INSDC_coord_zero)overlap_ref_pos;
2000                                 }
2001                                 data[min_rr].cover.overlap_ref_len[i] = cself->max_seq_len; /*** in between chunks get full length of overlap **/
2002                             }
2003                             if (  data[min_rr].cover.overlap_ref_pos[i] == 0
2004                                 || overlap_ref_pos < data[min_rr].cover.overlap_ref_pos[i])
2005                             {
2006 								data[min_rr].cover.overlap_ref_pos[i] = (INSDC_coord_zero)overlap_ref_pos;
2007                             }
2008                             if (overlap_ref_len > data[min_rr].cover.overlap_ref_len[i])
2009 								data[min_rr].cover.overlap_ref_len[i] = overlap_ref_len;
2010                         }
2011                     }
2012                 } /**** DONE WITH WORK ON STATISTICS ***/
2013                 ALIGN_DBGERR(rc);
2014                 rc = rc ? rc : quitting();
2015             }
2016 		    /*** HAVE TO RELEASE **/
2017 		    TableReader_Whack(reader);
2018 		    VTableRelease(table);
2019 		    /*** NOW SAVE AND RELEASE THE COLUMN ***/
2020 		    if((rc = TableWriterRefCoverage_MakeIds(&cover_writer, cself->db, tbls[i].col)) == 0) {
2021                 for(rr=0; rc ==0 &&  rr < ref_rows; rr ++){
2022                     uint64_t num_elem = AlignIdListCount(data[rr].idlist);
2023                     uint64_t num_elem_copied = 0;
2024                     if(num_elem > 0){
2025 #define BUF_STACK_COUNT 128 * 1024
2026                         int64_t buf_stack[BUF_STACK_COUNT];
2027                         int64_t *buf_alloc = NULL;
2028                         int64_t *buf = buf_stack;
2029                         if(num_elem > BUF_STACK_COUNT){
2030                             buf=buf_alloc=malloc(num_elem*sizeof(buf[0]));
2031                             if(buf_alloc == NULL)
2032                                 rc = RC(rcAlign, rcTable, rcCommitting, rcMemory, rcExhausted);
2033                         }
2034                         if(rc == 0){
2035                             num_elem_copied = AlignIdListFlatCopy(data[rr].idlist,buf,num_elem,true);
2036                             assert(num_elem == num_elem_copied);
2037                         }
2038                         ReferenceMgr_TCoverRelease(data+rr); /** release memory ***/
2039                         if(rc == 0){
2040                             rc = TableWriterRefCoverage_WriteIds(cover_writer, rr+1, buf, (uint32_t)num_elem);
2041                         }
2042                         if(buf_alloc) free(buf_alloc);
2043                     } else {
2044                         rc = TableWriterRefCoverage_WriteIds(cover_writer, rr+1, NULL,0);
2045                     }
2046                 }
2047                 if(rc == 0){
2048                     rc = TableWriterRefCoverage_Whack(cover_writer, rc == 0, &new_rows);
2049                     if(rc == 0  && ref_rows != new_rows) {
2050                         rc = RC(rcAlign, rcTable, rcCommitting, rcData, rcInconsistent);
2051                     }
2052                 }
2053                 ALIGN_DBGERR(rc);
2054 		    }
2055 		} else {
2056 			TableReader_Whack(reader);
2057 			VTableRelease(table);
2058 		}
2059 	}/* TABLE LOOP ENDS **/
2060     /* prep and write coverage data */
2061 	if(rc == 0) {
2062         uint64_t k;
2063 
2064 		rc = TableWriterRefCoverage_MakeCoverage(&cover_writer, cself->db, 0);
2065 		for (rr = 0, k = 0; rc == 0 && rr != ref_rows; ++rr, k += cself->max_seq_len) {
2066             unsigned hi = 0;
2067             unsigned lo = 255;
2068 
2069 		    for (i = 0; i != data[rr].bin_seq_len; ++i) {
2070                 unsigned const depth = hilo[k + i];
2071 
2072                 if (hi < depth) hi = depth;
2073                 if (lo > depth) lo = depth;
2074 		    }
2075             data[rr].cover.high = hi;
2076             data[rr].cover.low  = lo;
2077 		    rc = TableWriterRefCoverage_WriteCoverage(cover_writer,rr+1, &data[rr].cover);
2078 		}
2079 		free(data);
2080 		rc1 = TableWriterRefCoverage_Whack(cover_writer, rc == 0, &new_rows);
2081 		rc = rc ? rc : rc1;
2082 		if(rc == 0 && ref_rows != new_rows) {
2083 		    rc = RC(rcAlign, rcTable, rcCommitting, rcData, rcInconsistent);
2084 		}
2085 	}
2086     ALIGN_DBGERR(rc);
2087 	return rc;
2088 }
2089 
ReferenceMgr_Release(const ReferenceMgr * cself,const bool commit,uint64_t * const Rows,const bool build_coverage,rc_t (* const quitting)(void))2090 LIB_EXPORT rc_t CC ReferenceMgr_Release(const ReferenceMgr *cself,
2091                                         const bool commit,
2092                                         uint64_t *const Rows,
2093                                         const bool build_coverage,
2094                                         rc_t (*const quitting)(void)
2095                                        )
2096 {
2097     rc_t rc = 0;
2098 
2099     if (cself != NULL) {
2100         ReferenceMgr *const self = (ReferenceMgr *)cself;
2101         uint64_t rows = 0;
2102         unsigned i;
2103 
2104         freeHashTableEntries(self);
2105 
2106         rc = TableWriterRef_Whack(self->writer, commit, &rows);
2107         if (Rows) *Rows = rows;
2108         KDirectoryRelease(self->dir);
2109 
2110         for (i = 0; i != self->refSeqs.elem_count; ++i)
2111             ReferenceSeq_Whack(&self->refSeq[i]);
2112 
2113         KDataBufferWhack(&self->compress);
2114         KDataBufferWhack(&self->seq);
2115         KDataBufferWhack(&self->refSeqs);
2116 
2117         if (rc == 0 && build_coverage && commit && rows > 0)
2118             rc = ReferenceMgr_ReCover(cself, rows, quitting);
2119 #if 0
2120         {
2121             VTable* t = NULL;
2122 
2123             if (VDatabaseOpenTableUpdate(self->db, &t, "SECONDARY_ALIGNMENT") == 0) {
2124                 VTableDropColumn(t, "TMP_MISMATCH");
2125                 VTableDropColumn(t, "TMP_HAS_MISMATCH");
2126             }
2127             VTableRelease(t);
2128         }
2129 #endif
2130         VDatabaseRelease(self->db);
2131         RefSeqMgr_Release(self->rmgr);
2132         free(self);
2133     }
2134     return rc;
2135 }
2136 
2137 static
ReferenceSeq_ReadDirect(ReferenceSeq * self,int offset,unsigned const len,bool read_circular,uint8_t buffer[],unsigned * written,bool force_linear)2138 rc_t ReferenceSeq_ReadDirect(ReferenceSeq *self,
2139                              int offset,
2140                              unsigned const len,
2141                              bool read_circular,
2142                              uint8_t buffer[],
2143                              unsigned* written,
2144                              bool force_linear)
2145 {
2146     *written = 0;
2147     if (len == 0)
2148         return 0;
2149 
2150     if (read_circular || self->circular) {
2151         if (offset < 0) {
2152             unsigned const n = (-offset) / self->seq_len;
2153             offset = (self->seq_len * (n + 1) + offset) % self->seq_len;
2154         }
2155         else if (offset > self->seq_len)
2156             offset %= self->seq_len;
2157     }
2158     else if (offset >= self->seq_len)
2159         return RC(rcAlign, rcType, rcReading, rcOffset, rcOutofrange);
2160 
2161     if (self->type == rst_local) {
2162         uint8_t const *const src = self->u.local.buf.base;
2163         unsigned dst_off = 0;
2164 
2165         while (dst_off < len) {
2166             unsigned const writable = len - dst_off;
2167             unsigned const readable = self->seq_len - offset;
2168             unsigned const to_write = readable < writable ? readable : writable;
2169 
2170             memmove(&buffer[dst_off], &src[offset], to_write);
2171             *written += to_write;
2172             if (!self->circular)
2173                 break;
2174             offset = 0;
2175             dst_off += to_write;
2176         }
2177         return 0;
2178     }
2179     else if (self->type == rst_refSeqById || self->type == rst_refSeqBySeqId) {
2180         unsigned to_write = len;
2181 
2182         if (!self->circular || force_linear) {
2183             unsigned const readable = self->seq_len - offset;
2184 
2185             if (to_write > readable)
2186                 to_write = readable;
2187         }
2188         return RefSeq_Read(self->u.refseq, offset, to_write, buffer, written);
2189     }
2190     return RC(rcAlign, rcType, rcReading, rcType, rcInvalid);
2191 }
2192 
2193 static
ReferenceMgr_LoadSeq(ReferenceMgr * const self,ReferenceSeq * obj)2194 rc_t ReferenceMgr_LoadSeq(ReferenceMgr *const self, ReferenceSeq *obj)
2195 {
2196     KDataBuffer readBuf;
2197     rc_t rc = KDataBufferMake(&readBuf, 8, self->max_seq_len);
2198 
2199     if (rc == 0) {
2200         char const *const id = obj->id;
2201         char const *const seqId = obj->seqId ? obj->seqId : id;
2202         TableWriterRefData data;
2203         INSDC_coord_zero offset = 0;
2204 
2205         obj->start_rowid = self->ref_rowid + 1;
2206         data.name.buffer = id;
2207         data.name.elements = string_size(id);
2208         data.read.buffer = readBuf.base;
2209         data.seq_id.buffer = seqId;
2210         data.seq_id.elements = string_size(seqId);
2211         data.force_READ_write = obj->type == rst_local || (self->options & ewrefmgr_co_allREADs);
2212         data.circular = obj->circular;
2213 
2214         if (self->writer == NULL) {
2215             uint32_t wopt = 0;
2216 
2217             wopt |= (self->options & ewrefmgr_co_allREADs) ? ewref_co_SaveRead : 0;
2218             wopt |= (self->options & ewrefmgr_co_Coverage) ? ewref_co_Coverage : 0;
2219             if((rc = TableWriterRef_Make(&self->writer, self->db, wopt)) == 0) {
2220                 TableWriterData mlen;
2221 
2222                 mlen.buffer = &self->max_seq_len;
2223                 mlen.elements = 1;
2224                 rc = TableWriterRef_WriteDefaultData(self->writer, ewrefd_cn_MAX_SEQ_LEN, &mlen);
2225             }
2226         }
2227         while (rc == 0 && offset < obj->seq_len) {
2228             unsigned row_len;
2229 
2230             rc = ReferenceSeq_ReadDirect(obj, offset, self->max_seq_len, false,
2231                                          readBuf.base, &row_len, true);
2232             if (rc != 0 || row_len == 0) break;
2233 
2234             data.read.elements = row_len;
2235             rc = TableWriterRef_Write(self->writer, &data, NULL);
2236             offset += row_len;
2237             ++self->ref_rowid;
2238         }
2239         KDataBufferWhack(&readBuf);
2240     }
2241     return rc;
2242 }
2243 
ReferenceMgr_GetSeq(ReferenceMgr const * const cself,ReferenceSeq const ** const seq,const char * const id,bool * const shouldUnmap,bool const allowMultiMapping,bool wasRenamed[])2244 LIB_EXPORT rc_t CC ReferenceMgr_GetSeq(ReferenceMgr const *const cself,
2245                                        ReferenceSeq const **const seq,
2246                                        const char *const id,
2247                                        bool *const shouldUnmap,
2248                                        bool const allowMultiMapping,
2249                                        bool wasRenamed[])
2250 {
2251     ReferenceMgr *const self = (ReferenceMgr *)cself;
2252 
2253     if  (self == NULL || seq == NULL || id == NULL)
2254         return RC(rcAlign, rcFile, rcConstructing, rcParam, rcNull);
2255 
2256     *seq = NULL;
2257     *shouldUnmap = false;
2258     {
2259         ReferenceSeq *obj;
2260         rc_t rc = ReferenceMgr_OpenSeq(self, &obj, id, 0, NULL, allowMultiMapping, wasRenamed);
2261 
2262         if (rc) return rc;
2263         if (obj->type == rst_unmapped) {
2264             *shouldUnmap = true;
2265             return 0;
2266         }
2267         if (obj->start_rowid == 0) {
2268             rc = ReferenceMgr_LoadSeq(self, obj);
2269             if (rc) return rc;
2270         }
2271         *seq = obj;
2272     }
2273     return 0;
2274 }
2275 
ReferenceMgr_Verify(ReferenceMgr const * const cself,char const id[],INSDC_coord_len const length,uint8_t const md5[16],bool const allowMultiMapping,bool wasRenamed[])2276 LIB_EXPORT rc_t CC ReferenceMgr_Verify(ReferenceMgr const *const cself,
2277                                        char const id[],
2278                                        INSDC_coord_len const length,
2279                                        uint8_t const md5[16],
2280                                        bool const allowMultiMapping,
2281                                        bool wasRenamed[])
2282 {
2283     if (cself == NULL || id == NULL)
2284         return RC(rcAlign, rcFile, rcValidating, rcParam, rcNull);
2285     {
2286         ReferenceMgr *self = (ReferenceMgr *)cself;
2287         ReferenceSeq *rseq;
2288         rc_t rc = ReferenceMgr_OpenSeq(self, &rseq, id, length, md5, allowMultiMapping, wasRenamed);
2289 
2290         if (rc) return rc;
2291         if (rseq->seq_len != length) {
2292             rc = RC(rcAlign, rcFile, rcValidating, rcSize, rcUnequal);
2293             ALIGN_DBGERRP("%s->%s SEQ_LEN verification", rc, id, rseq->seqId);
2294         }
2295         if (md5 && memcmp(md5, rseq->md5, sizeof(rseq->md5)) != 0) {
2296             unsigned i;
2297 
2298             rc = RC(rcAlign, rcTable, rcValidating, rcChecksum, rcUnequal);
2299             ALIGN_DBGERRP("%s->%s MD5 verification", rc, id, rseq->seqId);
2300             ALIGN_DBGF((" found '"));
2301             for(i = 0; i < sizeof(rseq->md5); i++) {
2302                 ALIGN_DBGF(("%02hx", rseq->md5[i]));
2303             }
2304             ALIGN_DBGF(("'  != requested '"));
2305             for(i = 0; i < sizeof(rseq->md5); i++) {
2306                 ALIGN_DBGF(("%02hx", md5[i]));
2307             }
2308             ALIGN_DBGF(("'\n"));
2309         } else {
2310             ALIGN_DBG("%s->%s MD5 verification ok", id, rseq->seqId);
2311         }
2312         if(rc == 0) {
2313             ALIGN_DBG("%s verification ok", id);
2314         } else {
2315             ALIGN_DBGERRP("%s verification", rc, id);
2316         }
2317         return rc;
2318     }
2319 }
2320 
ReferenceMgr_Get1stRow(const ReferenceMgr * cself,int64_t * row_id,char const id[])2321 LIB_EXPORT rc_t CC ReferenceMgr_Get1stRow(const ReferenceMgr* cself, int64_t* row_id, char const id[])
2322 {
2323     rc_t rc = 0;
2324     ReferenceSeq *seq;
2325 
2326     if (cself == NULL || row_id == NULL) {
2327         rc = RC(rcAlign, rcFile, rcReading, rcParam, rcNull);
2328     }
2329     else if ((seq = ReferenceMgr_FindSeq(cself, id)) == NULL)
2330         rc = RC(rcAlign, rcFile, rcReading, rcId, rcNotFound);
2331     else {
2332         *row_id = seq->start_rowid;
2333     }
2334     return rc;
2335 }
2336 
ReferenceMgr_FastaPath(const ReferenceMgr * cself,const char * fasta_path)2337 LIB_EXPORT rc_t CC ReferenceMgr_FastaPath(const ReferenceMgr* cself, const char* fasta_path)
2338 {
2339     rc_t rc = 0;
2340     KDirectory* dir;
2341 
2342     if(cself == NULL || fasta_path == NULL) {
2343         rc = RC(rcAlign, rcFile, rcConstructing, rcParam, rcNull);
2344     } else if((rc = KDirectoryNativeDir(&dir)) == 0) {
2345         const KFile* kf;
2346         if((rc = KDirectoryOpenFileRead(dir, &kf, "%s", fasta_path)) == 0) {
2347             rc = ReferenceMgr_FastaFile(cself, kf);
2348             KFileRelease(kf);
2349         }
2350         KDirectoryRelease(dir);
2351     }
2352     ALIGN_DBGERRP("from file %s", rc, fasta_path);
2353     return rc;
2354 }
2355 
ReferenceMgr_FastaFile(const ReferenceMgr * cself,const KFile * file)2356 LIB_EXPORT rc_t CC ReferenceMgr_FastaFile(const ReferenceMgr* cself, const KFile* file)
2357 {
2358     if(cself == NULL || file == NULL) {
2359         return RC(rcAlign, rcFile, rcConstructing, rcParam, rcNull);
2360     }
2361     return ImportFastaFile((ReferenceMgr *)cself, file, NULL);
2362 }
2363 
2364 typedef struct {
2365     unsigned length: 28, gentype:4, type: 8, code: 8;
2366 } cigar_bin_t;
2367 
2368 static
cigar2offset_2(unsigned const cigar_len,cigar_bin_t const cigar[],unsigned const out_sz,unsigned const out_used,compress_buffer_t out_offset[],INSDC_coord_len out_seq_len[],INSDC_coord_len out_ref_len[],INSDC_coord_len out_max_ref_len[])2369 rc_t cigar2offset_2(unsigned const cigar_len,
2370                     cigar_bin_t const cigar[],
2371                     unsigned const out_sz,
2372                     unsigned const out_used,
2373                     compress_buffer_t out_offset[],
2374                     INSDC_coord_len out_seq_len[],
2375                     INSDC_coord_len out_ref_len[],
2376                     INSDC_coord_len out_max_ref_len[])
2377 {
2378     unsigned i;
2379     INSDC_coord_len seq_len = 0;
2380     INSDC_coord_len ref_len = 0;
2381     INSDC_coord_len max_ref_len = 0;
2382 
2383     for (i = 0; i < cigar_len; ++i) {
2384         unsigned const op_len = cigar[i].length;
2385         char const op = cigar[i].code;
2386         uint8_t const type = cigar[i].type;
2387 
2388         switch(op) {
2389             case 'M':
2390             case '=':
2391             case 'X':
2392                 seq_len += op_len;
2393                 ref_len += op_len;
2394                 if(max_ref_len < ref_len)
2395                     max_ref_len = ref_len;
2396                 break;
2397             case 'B':
2398                 /* Complete Genomics CIGAR style specific:
2399                  overlap between consecutive reads
2400                  ex: sequence 6 bases: ACACTG, reference 2 bases: ACTG,
2401                  cigar will be: 2M2B2M
2402                  no need to move sequence position
2403                  */
2404             case 'S':
2405             case 'I':
2406                 if (seq_len < out_sz) {
2407                     out_offset[seq_len].length = -op_len;
2408                     out_offset[seq_len].type   = type;
2409                     ALIGN_C_DBGF(("%s:%u: seq_pos: %u, ref_pos: %u, offset: %i\n", __func__, __LINE__, seq_len, ref_len, -op_len));
2410                     if (op == 'B') ref_len -= op_len;
2411                     else           seq_len += op_len;
2412                 }
2413                 else
2414                     return RC(rcAlign, rcFile, rcProcessing, rcData, rcInconsistent);
2415                 break;
2416             case 'N':
2417             case 'D':
2418                 if (seq_len < out_sz) {
2419                     out_offset[seq_len].length = op_len;
2420                     out_offset[seq_len].type   = type;
2421                     ALIGN_C_DBGF(("%s:%u: seq_pos: %u, ref_pos: %u, offset: %i\n", __func__, __LINE__, seq_len, ref_len, op_len));
2422                 }
2423                 else {
2424                     out_offset[seq_len-1].length = op_len;
2425                     out_offset[seq_len-1].type   = type;
2426                     ALIGN_C_DBGF(("%s:%u: seq_pos: %u, ref_pos: %u, offset: %i\n", __func__, __LINE__, seq_len-1, ref_len, op_len));
2427                 }
2428                 ref_len += op_len;
2429                 if(max_ref_len < ref_len)
2430                     max_ref_len = ref_len;
2431                 break;
2432             default:
2433                 break;
2434         }
2435     }
2436     out_seq_len[0] = seq_len;
2437     out_ref_len[0] = ref_len;
2438     out_max_ref_len[0] = max_ref_len;
2439 
2440     ALIGN_C_DBGF(("%s:%u: SEQLEN: %u, REFLEN: %u, MAXREFLEN: %u\n", __func__, __LINE__, seq_len, ref_len, max_ref_len));
2441 
2442     return 0;
2443 }
2444 
2445 static char const cigar_op_codes[] = "MIDNSHP=XB";
2446 
2447 static NCBI_align_ro_type const cigar_op_types[] = {
2448     NCBI_align_ro_normal,			/* M */
2449     NCBI_align_ro_normal,			/* I */
2450     NCBI_align_ro_normal,			/* D */
2451     NCBI_align_ro_intron_unknown,	/* N */
2452     NCBI_align_ro_soft_clip,		/* S */
2453     NCBI_align_ro_normal,			/* H */
2454     NCBI_align_ro_normal,			/* P */
2455     NCBI_align_ro_normal,			/* = */
2456     NCBI_align_ro_normal,			/* X */
2457     NCBI_align_ro_complete_genomics	/* B */
2458 };
2459 
2460 enum {
2461     gen_match_type,
2462     gen_insert_type,
2463     gen_delete_type,
2464     gen_ignore_type
2465 };
2466 
2467 static int const cigar_op_gentypes[] = {
2468     gen_match_type,			/* M */
2469     gen_insert_type,		/* I */
2470     gen_delete_type,		/* D */
2471     gen_delete_type,		/* N */
2472     gen_insert_type,		/* S */
2473     gen_ignore_type,		/* H */
2474     gen_ignore_type,		/* P */
2475     gen_match_type,			/* = */
2476     gen_match_type,			/* X */
2477     gen_insert_type			/* B */
2478 };
2479 
2480 static
cigar_bin(cigar_bin_t cigar[],unsigned const cigar_len,void const * cigar_in)2481 rc_t cigar_bin(cigar_bin_t cigar[],
2482                unsigned const cigar_len,
2483                void const *cigar_in)
2484 {
2485     unsigned i;
2486     uint32_t const *const cigar_bin = cigar_in;
2487 
2488     ALIGN_C_DBGF(("%s:%u: '", __func__, __LINE__));
2489     for (i = 0; i < cigar_len; ++i) {
2490         uint32_t c;
2491 
2492         memmove(&c, cigar_bin + i, 4);
2493         {
2494             int const op = c & 0x0F;
2495             int const len = c >> 4;
2496 
2497             if (op >= sizeof(cigar_op_codes)) {
2498                 rc_t const rc = RC(rcAlign, rcFile, rcProcessing, rcData, rcUnrecognized);
2499                 (void)PLOGERR(klogErr, (klogErr, rc, "Invalid or unrecognized CIGAR operation (binary code: $(opbin))", "opbin=%u", op));
2500                 return rc;
2501             }
2502             ALIGN_C_DBGF(("%u%c", len, cigar_op_codes[op]));
2503             cigar[i].length = len;
2504             cigar[i].code = cigar_op_codes[op];
2505             cigar[i].type = cigar_op_types[op];
2506             cigar[i].gentype = cigar_op_gentypes[op];
2507         }
2508     }
2509     ALIGN_C_DBGF(("'[%u]\n", cigar_len));
2510     return 0;
2511 }
2512 
2513 static
cigar_string(cigar_bin_t cigar[],unsigned const cigar_len,void const * cigar_in)2514 rc_t cigar_string(cigar_bin_t cigar[],
2515                   unsigned const cigar_len,
2516                   void const *cigar_in)
2517 {
2518     unsigned i;
2519     unsigned j;
2520     char const *const cigar_string = cigar_in;
2521 
2522     ALIGN_C_DBGF(("%s:%u: '%s'[%u]\n", __func__, __LINE__, cigar_in, cigar_len));
2523     for (i = j = 0; j < cigar_len; ++j) {
2524         int len = 0;
2525 
2526         for (; ;) {
2527             int const ch = cigar_string[i++];
2528 
2529             if (isdigit(ch))
2530                 len = (len * 10) + (ch - '0');
2531             else {
2532                 int op;
2533 
2534                 for (op = 0; op < sizeof(cigar_op_codes); ++op) {
2535                     if (ch == cigar_op_codes[op])
2536                         break;
2537                 }
2538                 if (op == sizeof(cigar_op_codes)) {
2539                     rc_t const rc = RC(rcAlign, rcFile, rcProcessing, rcData, rcUnrecognized);
2540                     (void)PLOGERR(klogErr, (klogErr, rc, "Invalid or unrecognized CIGAR operation '$(opcode)'", "opcode=%c", ch));
2541                     return rc;
2542                 }
2543                 cigar[j].length = len;
2544                 cigar[j].code = cigar_op_codes[op];
2545                 cigar[j].type = cigar_op_types[op];
2546                 cigar[j].gentype = cigar_op_gentypes[op];
2547                 break;
2548             }
2549         }
2550     }
2551     return 0;
2552 }
2553 
cigar_string_op_count(char const cigar[])2554 static int cigar_string_op_count(char const cigar[])
2555 {
2556     unsigned n = 0;
2557     int st = 0;
2558     int i = 0;
2559 
2560     for (; ;) {
2561         int const ch = cigar[i];
2562 
2563         if (ch == '\0')
2564         break;
2565 
2566         switch (st) {
2567             case 0:
2568                 if (!isdigit(ch))
2569                 return -1;
2570                 ++st;
2571                 break;
2572             case 1:
2573                 if (!isdigit(ch)) {
2574                     ++n;
2575                     --st;
2576                 }
2577                 break;
2578         }
2579         ++i;
2580     }
2581     return st == 0 ? n : -1;
2582 }
2583 
cigar_remove_ignored(unsigned opcount,cigar_bin_t cigar[])2584 static unsigned cigar_remove_ignored(unsigned opcount, cigar_bin_t cigar[])
2585 {
2586     unsigned i = opcount;
2587 
2588     while (i) {
2589         unsigned const oi = i;
2590         unsigned const type = cigar[--i].gentype;
2591 
2592         if (type == gen_ignore_type) {
2593             memmove(cigar + i, cigar + oi, (opcount - oi) * sizeof(cigar[0]));
2594             --opcount;
2595         }
2596     }
2597     return opcount;
2598 }
2599 
2600 static
cigar2offset(int const options,unsigned const cigar_len,void const * in_cigar,unsigned const out_sz,unsigned const out_used,uint8_t const intron_type,compress_buffer_t out_offset[],INSDC_coord_len out_seq_len[],INSDC_coord_len out_ref_len[],INSDC_coord_len out_max_ref_len[],INSDC_coord_len out_adjust[])2601 rc_t cigar2offset(int const options,
2602                   unsigned const cigar_len,
2603                   void const *in_cigar,
2604                   unsigned const out_sz,
2605                   unsigned const out_used,
2606                   uint8_t const intron_type,
2607                   compress_buffer_t out_offset[],
2608                   INSDC_coord_len out_seq_len[],
2609                   INSDC_coord_len out_ref_len[],
2610                   INSDC_coord_len out_max_ref_len[],
2611                   INSDC_coord_len out_adjust[])
2612 {
2613     bool const binary = (options & ewrefmgr_cmp_Binary) ? true : false;
2614     int const maxopcount = binary ? cigar_len : cigar_string_op_count(in_cigar);
2615 
2616     memset(out_offset, 0, out_used * sizeof(*out_offset));
2617 
2618     if (maxopcount > 0) {
2619         cigar_bin_t  scigar[256];
2620         cigar_bin_t *hcigar = NULL;
2621         cigar_bin_t *cigar = scigar;
2622 
2623         if (maxopcount > sizeof(scigar)/sizeof(scigar[0])) {
2624             hcigar = malloc(maxopcount * sizeof(hcigar[0]));
2625 
2626             if (hcigar == NULL) {
2627                 rc_t const rc = RC(rcAlign, rcFile, rcProcessing, rcMemory, rcExhausted);
2628                 (void)PLOGERR(klogErr, (klogErr, rc, "out of memory trying to allocate $(bytes) bytes for CIGAR operations", "bytes=%u", (unsigned)(maxopcount * sizeof(hcigar[0]))));
2629                 return rc;
2630             }
2631             cigar = hcigar;
2632         }
2633         {
2634             rc_t const rc = (binary ? cigar_bin : cigar_string)(cigar,
2635                                                                 maxopcount,
2636                                                                 in_cigar);
2637             if (rc)
2638                 return rc;
2639         }
2640         /* check for hard clipping if not accepted */
2641         if ((options & ewrefmgr_co_AcceptHardClip) == 0) {
2642             unsigned i;
2643 
2644             for (i = 0; i < maxopcount; ++i) {
2645                 if (cigar[i].code == 'H') {
2646                     rc_t const rc = RC(rcAlign, rcFile, rcProcessing, rcData, rcNotAvailable);
2647                     (void)LOGERR(klogErr, rc, "Hard clipping of sequence data is not allowed");
2648                     return rc;
2649                 }
2650             }
2651         }
2652         {
2653             unsigned first = 0;
2654             unsigned opcount = cigar_remove_ignored(maxopcount, cigar);
2655 
2656             out_adjust[0] = 0;
2657             if ((options & ewrefmgr_cmp_Exact) == 0) {
2658                 /* remove any leading delete operations */
2659                 for (first = 0; first < opcount; ++first) {
2660                     if (cigar[first].gentype != gen_delete_type)
2661                         break;
2662                     out_adjust[0] += cigar[first].length;
2663                 }
2664                 /* make sure any adjacent deletes and inserts are ordered so that
2665                  * the delete follows the insert
2666                  */
2667                 {
2668                     unsigned i;
2669 #if 1
2670                     for (i = first; i < opcount - 1; ) {
2671                         cigar_bin_t const cur = cigar[i + 0];
2672                         cigar_bin_t const nxt = cigar[i + 1];
2673 
2674                         if (cur.gentype != gen_delete_type)
2675                             ;
2676                         else if (nxt.gentype == gen_delete_type) {
2677                             unsigned const type = (cur.type == NCBI_align_ro_normal && nxt.type == NCBI_align_ro_normal) ? NCBI_align_ro_normal : NCBI_align_ro_intron_unknown;
2678                             int const code = type == NCBI_align_ro_normal ? 'D' : 'N';
2679                             unsigned const length = cur.length + nxt.length;
2680 
2681                             --opcount;
2682                             memmove(cigar + i, cigar + i + 1, (opcount - i) * sizeof(cigar[0]));
2683 
2684                             cigar[i].type = type;
2685                             cigar[i].code = code;
2686                             cigar[i].length = length;
2687 
2688                             continue;
2689                         }
2690                         else if (nxt.gentype == gen_insert_type) {
2691                             if (nxt.type == NCBI_align_ro_complete_genomics) {
2692                                 assert(i + 2 < opcount);
2693                                 cigar[i + 0] = nxt;
2694                                 cigar[i + 1] = cigar[i + 2];
2695                                 cigar[i + 2] = cur;
2696                                 ++i;
2697                             }
2698                             else {
2699                                 cigar[i + 0] = nxt;
2700                                 cigar[i + 1] = cur;
2701                             }
2702                         }
2703                         ++i;
2704                     }
2705 #else
2706                     for (i = first + 1; i < opcount; ) {
2707                         if (cigar[i].gentype == gen_insert_type && cigar[i-1].gentype == gen_delete_type) {
2708                             cigar_bin_t const prv = cigar[i - 1];
2709                             cigar_bin_t const cur = cigar[i];
2710 
2711                             cigar[  i] = prv;
2712                             cigar[--i] = cur;
2713                             if (i <= first + 1)
2714                                 i  = first + 1;
2715                         }
2716                         else
2717                             ++i;
2718                     }
2719 #endif
2720                 }
2721                 /* merge adjacent delete type operations D+D -> D else becomes N */
2722                 {
2723                     unsigned i;
2724 
2725                     for (i = first + 1; i < opcount;) {
2726                         if (cigar[i].gentype == gen_delete_type && cigar[i-1].gentype == gen_delete_type) {
2727                             cigar[i].length += cigar[i-1].length;
2728                             if (cigar[i].type == NCBI_align_ro_normal && cigar[i-1].type == NCBI_align_ro_normal) {
2729                                 cigar[i].type = NCBI_align_ro_normal;
2730                                 cigar[i].code = 'D';
2731                             }
2732                             else {
2733                                 cigar[i].type = NCBI_align_ro_intron_unknown;
2734                                 cigar[i].code = 'N';
2735                             }
2736                             memmove(cigar + i - 1, cigar + i, (opcount - i) * sizeof(cigar[0]));
2737                             --opcount;
2738                         }
2739                         else
2740                             ++i;
2741                     }
2742                 }
2743             }
2744             /* remove any ignored operations */
2745             {
2746                 unsigned i = opcount;
2747 
2748                 while (i) {
2749                     unsigned const oi = i;
2750                     cigar_bin_t const op = cigar[--i];
2751 
2752                     if (op.gentype == gen_ignore_type) {
2753                         memmove(cigar + i, cigar + oi, (opcount - oi) * sizeof(cigar[0]));
2754                         --opcount;
2755                     }
2756                 }
2757             }
2758             /* make the intron the known type */
2759             {
2760                 unsigned i;
2761 
2762                 for (i = first; i < opcount; ++i) {
2763                     if (cigar[i].type == NCBI_align_ro_intron_unknown)
2764                         cigar[i].type = intron_type;
2765                 }
2766             }
2767             {
2768                 rc_t const rc = cigar2offset_2(opcount - first,
2769                                                cigar + first,
2770                                                out_sz,
2771                                                out_used,
2772                                                out_offset,
2773                                                out_seq_len,
2774                                                out_ref_len,
2775                                                out_max_ref_len);
2776                 if (hcigar)
2777                     free(hcigar);
2778                 return rc;
2779             }
2780         }
2781         {
2782         }
2783     }
2784     else {
2785         rc_t const rc = RC(rcAlign, rcFile, rcProcessing, rcData, rcUnrecognized);
2786         (void)PLOGERR(klogErr, (klogErr, rc, "Invalid CIGAR string '$(cigar)'", "cigar=%s", in_cigar));
2787         return rc;
2788     }
2789 }
2790 
ReferenceSeq_TranslateOffset_int(ReferenceSeq const * const cself,INSDC_coord_zero const offset,int64_t * const ref_id,INSDC_coord_zero * const ref_start,uint64_t * const global_ref_start)2791 LIB_EXPORT rc_t CC ReferenceSeq_TranslateOffset_int(ReferenceSeq const *const cself,
2792                                                     INSDC_coord_zero const offset,
2793                                                     int64_t *const ref_id,
2794                                                     INSDC_coord_zero *const ref_start,
2795                                                     uint64_t *const global_ref_start)
2796 {
2797     if(cself == NULL)
2798         return RC(rcAlign, rcFile, rcProcessing, rcSelf, rcNull);
2799 
2800     if (ref_id)
2801         *ref_id = cself->start_rowid + offset / cself->mgr->max_seq_len;
2802 
2803     if (ref_start)
2804         *ref_start = offset % cself->mgr->max_seq_len;
2805 
2806     if (global_ref_start)
2807         *global_ref_start = (cself->start_rowid - 1) * cself->mgr->max_seq_len + offset;
2808 
2809     return 0;
2810 }
2811 
ReferenceMgr_Compress(const ReferenceMgr * cself,uint32_t options,const char * id,INSDC_coord_zero offset,const char * seq,INSDC_coord_len seq_len,const void * cigar,uint32_t cigar_len,INSDC_coord_zero allele_offset,const char * allele,INSDC_coord_len allele_len,INSDC_coord_zero offset_in_allele,const void * allele_cigar,uint32_t allele_cigar_len,uint8_t const rna_orient,TableWriterAlgnData * data)2812 LIB_EXPORT rc_t CC ReferenceMgr_Compress(const ReferenceMgr* cself,
2813                                          uint32_t options,
2814                                          const char* id,
2815                                          INSDC_coord_zero offset,
2816                                          const char* seq,
2817                                          INSDC_coord_len seq_len,
2818                                          const void* cigar,
2819                                          uint32_t cigar_len,
2820                                          INSDC_coord_zero allele_offset,
2821                                          const char* allele,
2822                                          INSDC_coord_len allele_len,
2823                                          INSDC_coord_zero offset_in_allele,
2824                                          const void* allele_cigar,
2825                                          uint32_t allele_cigar_len,
2826                                          uint8_t const rna_orient,
2827                                          TableWriterAlgnData* data)
2828 {
2829     rc_t rc = 0;
2830     bool shouldUnmap = false;
2831     bool wasRenamed = false;
2832     const ReferenceSeq* refseq;
2833 
2834     if(cself == NULL || id == NULL) {
2835         rc = RC(rcAlign, rcFile, rcProcessing, rcParam, rcNull);
2836     }
2837     else if((rc = ReferenceMgr_GetSeq(cself, &refseq, id, &shouldUnmap, false, &wasRenamed)) == 0) {
2838         assert(shouldUnmap == false);
2839         assert(wasRenamed == false);
2840         rc = ReferenceSeq_Compress(refseq, options, offset, seq, seq_len, cigar, cigar_len,
2841                                    allele_offset, allele, allele_len,offset_in_allele,
2842                                    allele_cigar, allele_cigar_len, rna_orient, data);
2843         ReferenceSeq_Release(refseq);
2844     }
2845     ALIGN_C_DBGERR(rc);
2846     return rc;
2847 }
2848 
isMatchingBase(int const refBase,int const seqBase)2849 static bool isMatchingBase(int const refBase, int const seqBase)
2850 {
2851     return (refBase == seqBase || seqBase == '=' || (refBase == 'T' && seqBase == 'U'));
2852 }
2853 
ReferenceSeq_Compress(ReferenceSeq const * const cself,uint32_t options,INSDC_coord_zero offset,const char * seq,INSDC_coord_len seq_len,const void * cigar,uint32_t cigar_len,INSDC_coord_zero allele_offset,const char * allele,INSDC_coord_len allele_len,INSDC_coord_zero offset_in_allele,const void * allele_cigar,uint32_t allele_cigar_len,uint8_t const rna_orient,TableWriterAlgnData * data)2854 LIB_EXPORT rc_t CC ReferenceSeq_Compress(ReferenceSeq const *const cself,
2855                                          uint32_t options,
2856                                          INSDC_coord_zero offset,
2857                                          const char* seq, INSDC_coord_len seq_len,
2858                                          const void* cigar, uint32_t cigar_len,
2859                                          INSDC_coord_zero allele_offset, const char* allele,
2860                                          INSDC_coord_len allele_len,
2861                                          INSDC_coord_zero offset_in_allele,
2862                                          const void* allele_cigar, uint32_t allele_cigar_len,
2863                                          uint8_t const rna_orient,
2864                                          TableWriterAlgnData* data)
2865 {
2866     rc_t rc = 0;
2867     ReferenceSeq *const self = (ReferenceSeq *)cself;
2868 
2869     if (self == NULL || seq == NULL || cigar == NULL || cigar_len == 0 || data == NULL ||
2870         (!(allele == NULL && allele_len == 0 && allele_cigar == NULL && allele_cigar_len == 0) &&
2871          !(allele != NULL && allele_cigar != NULL && allele_cigar_len != 0)))
2872     {
2873         return RC(rcAlign, rcFile, rcProcessing, rcParam, rcInvalid);
2874     }
2875 
2876     if (seq_len > self->mgr->compress.elem_count) {
2877         rc = KDataBufferResize(&self->mgr->compress, seq_len);
2878         if (rc) return rc;
2879     }
2880     {
2881         INSDC_coord_len i, seq_pos = 0, allele_ref_end = 0, ref_len = 0, rl = 0, max_rl = 0;
2882         INSDC_coord_zero* read_start = &((INSDC_coord_zero*)(data->read_start.buffer))[data->ploidy];
2883         INSDC_coord_len* read_len = &((INSDC_coord_len*)(data->read_len.buffer))[data->ploidy];
2884         bool* has_ref_offset, *has_mismatch;
2885         int32_t *const ref_offset       = (int32_t *)data->ref_offset.buffer;
2886         uint8_t *const ref_offset_type  = (uint8_t *)data->ref_offset_type.buffer;
2887         uint8_t *mismatch;
2888         uint8_t sref_buf[64 * 1024];
2889         void *href_buf = NULL;
2890         uint8_t *ref_buf = sref_buf;
2891         compress_buffer_t allele_off_buf[1024];
2892         INSDC_coord_len position_adjust = 0;
2893 #if _DEBUGGING
2894         uint64_t i_ref_offset_elements, i_mismatch_elements;
2895         char x[4096];
2896 #endif
2897 
2898         if(data->ploidy == 0) {
2899             data->has_ref_offset.elements = seq_len;
2900             data->ref_offset.elements = 0;
2901             data->has_mismatch.elements = seq_len;
2902             data->mismatch.elements = 0;
2903             *read_start = 0;
2904         }
2905         else {
2906             data->has_ref_offset.elements += seq_len;
2907             data->has_mismatch.elements += seq_len;
2908             *read_start = read_start[-1] + read_len[-1];
2909         }
2910         *read_len = seq_len;
2911         has_ref_offset = &((bool*)(data->has_ref_offset.buffer))[*read_start];
2912         has_mismatch = &((bool*)(data->has_mismatch.buffer))[*read_start];
2913         mismatch = (uint8_t*)(data->mismatch.buffer);
2914 
2915 #if _DEBUGGING
2916         i_ref_offset_elements = data->ref_offset.elements;
2917         i_mismatch_elements = data->mismatch.elements;
2918         ALIGN_C_DBG("align%s '%.*s'[%u] to '%s:%s' at %i", (options & ewrefmgr_cmp_Exact) ? " EXACT" : "",
2919                     seq_len, seq, seq_len, cself->id, cself->seqId, offset);
2920 #endif
2921         if(allele != NULL) {
2922             /* determine length of reference for subst by allele */
2923             ALIGN_C_DBG("apply allele %.*s[%u] at %i w/cigar below",
2924                         allele_len, allele, allele_len, allele_offset);
2925             rc = cigar2offset(options|ewrefmgr_cmp_Exact,
2926                               allele_cigar_len,
2927                               allele_cigar,
2928                               sizeof(allele_off_buf) / sizeof(*allele_off_buf),
2929                               allele_len,
2930                               ' ',
2931                               allele_off_buf,
2932                               &seq_pos,
2933                               &allele_ref_end,
2934                               &max_rl,
2935                               &position_adjust);
2936             /* where allele ends on reference */
2937             allele_ref_end += allele_offset;
2938         }
2939         if(rc == 0) {
2940             rc = cigar2offset(options,
2941                               cigar_len,
2942                               cigar,
2943                               (unsigned)self->mgr->compress.elem_count,
2944                               seq_len,
2945                               rna_orient,
2946                               self->mgr->compress.base,
2947                               &seq_pos,
2948                               &rl,
2949                               &max_rl,
2950                               &position_adjust);
2951             offset += position_adjust;
2952         }
2953         if (allele != NULL) {
2954             if (allele_ref_end < offset || offset + rl < allele_offset) {
2955                 /* allele and alignment don't intersect             */
2956                 /* TODO: [VDB-1585] try to make this recoverable    */
2957                 rc = RC(rcAlign, rcFile, rcProcessing, rcData, rcInvalid);
2958                 (void)PLOGERR(klogWarn, (klogWarn, rc,
2959                     "allele $(a) offset $(ao) $(ac) is not within referenced region in $(id) at offset $(ro) $(rc)",
2960                     "a=%.*s,ao=%i,ac=%.*s,id=%s,ro=%i,rc=%.*s",
2961                     allele_len, allele, allele_offset, (options & ewrefmgr_cmp_Binary) ? 0 : allele_cigar_len, allele_cigar,
2962                     self->seqId, offset, (options & ewrefmgr_cmp_Binary) ? 0 : cigar_len, cigar));
2963             }
2964         }
2965         if(rc == 0) {
2966             ref_len = rl;
2967             if((offset + max_rl) > self->seq_len && !self->circular) {
2968                 max_rl = self->seq_len - offset;
2969                 if(max_rl < rl) {
2970                     /* ref_len used for compression cannot be shorter than ref_len derived from cigar,
2971                        if there is a shortage it will fail later here */
2972                     max_rl = rl;
2973                 }
2974                 ALIGN_C_DBG("max_ref_len truncated to %u cause it goes beyond refseq length %lu at offset %i",
2975                              max_rl, cself->seq_len, offset);
2976             }
2977             ALIGN_C_DBG("chosen REF_LEN %u, ref len for match %u", ref_len, max_rl);
2978 
2979             if (seq_len != seq_pos) {
2980                 rc = RC(rcAlign, rcFile, rcProcessing, rcData, rcInvalid);
2981             }
2982             if (rc == 0) {
2983                 if (max_rl > sizeof(sref_buf)) {
2984                     if (href_buf)
2985                         free(href_buf);
2986                     href_buf = malloc(max_rl);
2987                     if (href_buf == NULL)
2988                         rc = RC(rcAlign, rcFile, rcProcessing, rcMemory, rcExhausted);
2989                     ref_buf = href_buf;
2990                 }
2991             }
2992             if (rc == 0) {
2993                 if(allele != NULL) {
2994                     /* subst allele in reference */
2995                     if(allele_offset <= offset) {
2996                         /* move allele start inside referenced chunk */
2997                         allele     += offset_in_allele;
2998                         allele_len -= offset_in_allele;
2999                         rl = 0;
3000                     } else {
3001                         /* fetch portion of reference which comes before allele */
3002                         rl = allele_offset - offset;
3003                         rc = ReferenceSeq_ReadDirect(self, offset, rl, true, ref_buf, &i, false);
3004                         if(rc == 0 && rl != i) {
3005                             /* here we need to test it otherwise excessive portion of allele could be fetch in next if */
3006                             rc = RC(rcAlign, rcFile, rcProcessing, rcRange, rcExcessive);
3007                         }
3008                     }
3009                     if(rc == 0 && allele_len < (max_rl - rl)) {
3010                         memmove(&ref_buf[rl], allele, allele_len);
3011                         rl += allele_len;
3012                         /* append tail of actual reference */
3013                         rc = ReferenceSeq_ReadDirect(self, allele_ref_end, max_rl - rl, true, &ref_buf[rl], &i, false);
3014                         rl += i;
3015                     } else if(rc == 0) {
3016                         /* allele is longer than needed */
3017                         memmove(&ref_buf[rl], allele, max_rl - rl);
3018                         rl = max_rl;
3019                     }
3020                 }
3021                 else {
3022                     rc = ReferenceSeq_ReadDirect(self, offset, max_rl, true, ref_buf, &rl, false);
3023                 }
3024                 if (rc != 0 || max_rl != rl) {
3025                     rc = rc ? rc : RC(rcAlign, rcFile, rcProcessing, rcRange, rcExcessive);
3026                     ALIGN_C_DBGERRP("refseq is shorter: at offset %i need %u bases", rc, offset, max_rl);
3027                 }
3028                 else {
3029                     compress_buffer_t *const compress_buf = self->mgr->compress.base;
3030                     unsigned ro = (unsigned)data->ref_offset.elements;
3031                     int ref_pos;
3032 
3033                     for (seq_pos = 0, ref_pos = 0; seq_pos < seq_len; seq_pos++, ref_pos++) {
3034                         int const length = compress_buf[seq_pos].length;
3035                         int const type = compress_buf[seq_pos].type;
3036 
3037 #if 0
3038                         ALIGN_C_DBG("seq_pos: %u, ref_pos: %i, offset: %i, type: %i, ro: %u", seq_pos, ref_pos, length, type, ro);
3039 #endif
3040                         if (length == 0 && type == 0)
3041                             has_ref_offset[seq_pos] = 0;
3042                         else {
3043                             has_ref_offset[seq_pos] = 1;
3044                             ref_offset[ro] = length;
3045                             ref_offset_type[ro] = type;
3046                             ref_pos += length;
3047                             ++ro;
3048                         }
3049                         if (ref_pos < 0 || ref_pos >= max_rl
3050                             || !isMatchingBase(toupper(ref_buf[ref_pos]), toupper(seq[seq_pos])))
3051                         {
3052                             has_mismatch[seq_pos] = 1;
3053                             mismatch[data->mismatch.elements++] = seq[seq_pos];
3054                         }
3055                         else {
3056                             has_mismatch[seq_pos] = 0;
3057                         }
3058                     }
3059                     data->ref_offset.elements = data->ref_offset_type.elements = ro;
3060                 }
3061             }
3062         }
3063 #if _DEBUGGING
3064         if(rc == 0) {
3065             int32_t j;
3066             memset(x, '-', sizeof(x) - 1);
3067             x[sizeof(x) - 2] = '\0';
3068 
3069             ALIGN_C_DBG("ref: %.*s [%u]", max_rl, ref_buf, max_rl);
3070             ALIGN_C_DBGF(("%s:%u: ref: ", __func__, __LINE__));
3071             for(seq_pos = 0, j = 0, rl = 0, i = 0; seq_pos < seq_len; seq_pos++, j++) {
3072                 if(has_ref_offset[seq_pos]) {
3073                     if(ref_offset[i_ref_offset_elements + rl] > 0) {
3074                         ALIGN_C_DBGF(("%.*s", (uint32_t)(ref_offset[i_ref_offset_elements + rl]), &ref_buf[j]));
3075                     } else {
3076                         i = -ref_offset[i_ref_offset_elements + rl];
3077                     }
3078                     j += ref_offset[i_ref_offset_elements + rl];
3079                     rl++;
3080                 }
3081                 ALIGN_C_DBGF(("%c", (j < 0 || j >= max_rl) ? '-' : (i > 0) ? tolower(ref_buf[j]) : ref_buf[j]));
3082                 if(i > 0 ) {
3083                     i--;
3084                 }
3085             }
3086             ALIGN_C_DBGF(("\n%s:%u: seq: ", __func__, __LINE__));
3087             for(i = 0, j = 0; i < seq_len; i++) {
3088                 if(has_ref_offset[i] && ref_offset[i_ref_offset_elements + j++] > 0) {
3089                     ALIGN_C_DBGF(("%.*s", ref_offset[i_ref_offset_elements + j - 1], x));
3090                 }
3091                 ALIGN_C_DBGF(("%c", seq[i]));
3092             }
3093             ALIGN_C_DBGF((" [%u]\n", seq_len));
3094             ALIGN_C_DBGF(("%s:%u: hro: ", __func__, __LINE__));
3095             for(i = 0, j = 0; i < seq_len; i++) {
3096                 if(has_ref_offset[i] && ref_offset[i_ref_offset_elements + j++] > 0) {
3097                     ALIGN_C_DBGF(("%.*s", ref_offset[i_ref_offset_elements + j - 1], x));
3098                 }
3099                 ALIGN_C_DBGF(("%c", has_ref_offset[i] + '0'));
3100             }
3101             ALIGN_C_DBGF((", ro:"));
3102             for(i = i_ref_offset_elements; i < data->ref_offset.elements; i++) {
3103                 ALIGN_C_DBGF((" %i,", ref_offset[i]));
3104             }
3105             ALIGN_C_DBGF(("[%u]\n", data->ref_offset.elements - i_ref_offset_elements));
3106             ALIGN_C_DBGF(("%s:%u: hmm: ", __func__, __LINE__));
3107             for(i = 0, j = 0; i < seq_len; i++) {
3108                 if(has_ref_offset[i] && ref_offset[i_ref_offset_elements + j++] > 0) {
3109                     ALIGN_C_DBGF(("%.*s", ref_offset[i_ref_offset_elements + j - 1], x));
3110                 }
3111                 ALIGN_C_DBGF(("%c", has_mismatch[i] + '0'));
3112             }
3113             ALIGN_C_DBGF((", mm: '%.*s'[%u]\n", (int)(data->mismatch.elements - i_mismatch_elements),
3114                 &mismatch[i_mismatch_elements], data->mismatch.elements - i_mismatch_elements));
3115         }
3116 #endif
3117         if(rc == 0) {
3118             if(data->ploidy == 0) {
3119                 int64_t *const ref_id = (int64_t *)data->ref_id.buffer;
3120                 INSDC_coord_zero *const ref_start = (INSDC_coord_zero *)data->ref_start.buffer;
3121                 uint64_t *const global_ref_start = (uint64_t *)data->global_ref_start.buffer;
3122 
3123                 data->ref_1st_row_id = self->start_rowid;
3124                 data->effective_offset = offset;
3125                 data->ref_len = ref_len;
3126                 ALIGN_C_DBGF(("%s:%u: reference 1st ROW_ID %li OFFSET %i REF_LEN %u",
3127                     __func__, __LINE__, data->ref_1st_row_id, data->effective_offset, data->ref_len));
3128 
3129                 ReferenceSeq_TranslateOffset_int(self, offset, ref_id, ref_start, global_ref_start);
3130 
3131                 if (ref_id) {
3132                     data->ref_id.elements = 1;
3133                     ALIGN_C_DBGF((" REF_ID %li", ref_id[0]));
3134                 }
3135                 if (ref_start) {
3136                     data->ref_start.elements = 1;
3137                     ALIGN_C_DBGF((" REF_START %i", ref_start[0]));
3138                 }
3139                 if (global_ref_start) {
3140                     data->global_ref_start.elements = 1;
3141                     ALIGN_C_DBGF((" GLOBAL_REF_START %lu", global_ref_start[0]));
3142                 }
3143                 ALIGN_C_DBGF(("\n"));
3144             } else {
3145                 if(data->ref_1st_row_id != self->start_rowid || data->effective_offset != offset) {
3146                     rc = RC(rcAlign, rcFile, rcProcessing, rcData, rcInconsistent);
3147                     (void)PLOGERR(klogErr, (klogErr, rc,
3148                         "all reads in alignment record must align to same refseq at same location $(r1)@$(o1) <> $(r2):$(a2)@$(o2)",
3149                         "r1=%li,o1=%i,r2=%s,a2=%s,o2=%i", data->ref_1st_row_id, data->effective_offset, self->id, self->seqId, offset));
3150                 } else if(data->ref_len != ref_len) {
3151                     rc = RC(rcAlign, rcFile, rcProcessing, rcData, rcInconsistent);
3152                     (void)PLOGERR(klogErr, (klogErr, rc,
3153                         "all reads in alignment record must have same size projection on refseq $(rl1) <> $(rl2) $(r):$(a)@$(o)",
3154                         "rl1=%u,rl2=%u,r=%s,a=%s,o=%i", data->ref_len, ref_len, self->id, self->seqId, offset));
3155                 }
3156             }
3157         }
3158         if(rc == 0) {
3159             data->ploidy++;
3160             data->read_start.elements = data->ploidy;
3161             data->read_len.elements = data->ploidy;
3162         }
3163         if (href_buf)
3164             free(href_buf);
3165     }
3166     ALIGN_C_DBGERR(rc);
3167     return rc;
3168 }
3169 
ReferenceSeq_Read(const ReferenceSeq * cself,INSDC_coord_zero offset,INSDC_coord_len len,uint8_t * buffer,INSDC_coord_len * ref_len)3170 LIB_EXPORT rc_t CC ReferenceSeq_Read(const ReferenceSeq* cself, INSDC_coord_zero offset, INSDC_coord_len len,
3171                                      uint8_t* buffer, INSDC_coord_len* ref_len)
3172 {
3173     rc_t rc = 0;
3174 
3175     if(cself == NULL || buffer == NULL || ref_len == NULL) {
3176         rc = RC(rcAlign, rcFile, rcReading, rcParam, rcNull);
3177     } else {
3178         rc = ReferenceSeq_ReadDirect((ReferenceSeq*)cself, offset, len, true, buffer, ref_len, false);
3179     }
3180     ALIGN_DBGERR(rc);
3181     return rc;
3182 }
3183 
ReferenceSeq_Get1stRow(const ReferenceSeq * cself,int64_t * row_id)3184 LIB_EXPORT rc_t CC ReferenceSeq_Get1stRow(const ReferenceSeq* cself, int64_t* row_id)
3185 {
3186     rc_t rc = 0;
3187 
3188     if(cself == NULL || row_id == NULL) {
3189         rc = RC(rcAlign, rcFile, rcReading, rcParam, rcNull);
3190     } else {
3191         *row_id = cself->start_rowid;
3192     }
3193     return rc;
3194 }
3195 
ReferenceSeq_GetID(ReferenceSeq const * const self,char const ** const rslt)3196 LIB_EXPORT rc_t CC ReferenceSeq_GetID(ReferenceSeq const *const self, char const **const rslt)
3197 {
3198     assert(self != NULL);
3199     assert(rslt != NULL);
3200     *rslt = self->id;
3201     return 0;
3202 }
3203 
ReferenceSeq_AddCoverage(const ReferenceSeq * cself,INSDC_coord_zero offset,const ReferenceSeqCoverage * data)3204 LIB_EXPORT rc_t CC ReferenceSeq_AddCoverage(const ReferenceSeq* cself, INSDC_coord_zero offset, const ReferenceSeqCoverage* data)
3205 {
3206     rc_t rc = 0;
3207 
3208     if(cself == NULL || data == NULL) {
3209         rc = RC(rcAlign, rcFile, rcReading, rcParam, rcNull);
3210     } else if(!(cself->mgr->options & ewrefmgr_co_Coverage)) {
3211         rc = RC(rcAlign, rcType, rcWriting, rcData, rcUnexpected);
3212         ALIGN_R_DBGERRP("coverage %s", rc, "data");
3213     } else if((rc = ReferenceSeq_ReOffset(cself->circular, cself->seq_len, &offset)) == 0) {
3214         rc = TableWriterRef_WriteCoverage(cself->mgr->writer, cself->start_rowid, offset, data);
3215     }
3216     ALIGN_DBGERR(rc);
3217     return rc;
3218 }
3219 
ReferenceSeq_Release(const ReferenceSeq * cself)3220 LIB_EXPORT rc_t CC ReferenceSeq_Release(const ReferenceSeq *cself)
3221 {
3222     return 0;
3223 }
3224