1 /*==============================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 */
25 #include <align/extern.h>
26
27 #include <klib/log.h>
28 #include <klib/rc.h>
29 #include <klib/sort.h>
30 #include <klib/data-buffer.h>
31 #include <klib/container.h>
32 #include <klib/checksum.h>
33 #include <klib/text.h>
34 #include <kfs/mmap.h>
35 #include <kfs/file.h>
36 #include <kdb/manager.h>
37 #include <vdb/database.h>
38 #include <vdb/table.h>
39 #include <vdb/cursor.h>
40 #include <vdb/manager.h>
41 #include <vdb/vdb-priv.h>
42 #include <sra/sradb.h>
43
44 #include <align/writer-reference.h>
45 #include <align/writer-refseq.h>
46 #include <align/refseq-mgr.h>
47 #include <align/align.h>
48 #include "refseq-mgr-priv.h"
49 #include "writer-ref.h"
50 #include "reader-cmn.h"
51 #include "reference-cmn.h"
52 #include "debug.h"
53 #include <os-native.h>
54 #include <sysalloc.h>
55
56 #include <stdio.h>
57 #include <stdlib.h>
58 #include <string.h>
59 #include <ctype.h>
60 #include <assert.h>
61
62 /*
63 * ReferenceSeq objects:
64 * ReferenceSeq objects may be unattached, i.e. they might not yet represent an
65 * actual sequence.
66 *
67 * ReferenceSeq objects may be attached, i.e. they represent a sequence from
68 * either RefSeq or a fasta file.
69 *
 *  A ReferenceSeq object may be referred to by more than one id, but a
71 * ReferenceSeq object has only one canonical id.
72 *
73 * More than one ReferenceSeq object may be associated with the same seqId.
74 *
75 * More than one ReferenceSeq object may be attached to the same sequence.
76 * This will cause the REFERENCE table to have more than one copy of the
77 * sequence.
78 *
79 * ReferenceSeq objects may be created from the config file. These objects
80 * will have an id, a seqId, but no fastaSeqId. These are unattached.
81 *
82 * ReferenceSeq objects may be created from explicit fasta files. These
83 * objects will have a fastaSeqId, but *** NO id OR seqId ***. These are
84 * attached.
85 *
86 * ReferenceSeq objects may be created on the fly by requesting an id that
87 * isn't already in the collection. These objects will have the requested id.
88 *
89 * When a reference is requested (by id):
90 * Resolve the id to a ReferenceSeq object.
91 * If the object is unattached, attach it to a sequence.
92 * If the sequence is not yet written to the REFERENCE table, write it to the
93 * REFERENCE table. NAME gets id; SEQID gets seqId unless seqId is null, then
94 * SEQID gets id.
95 *
96 * Resolving id's to ReferenceSeq objects:
97 * Search the id index and if the found object is attached, return it.
98 * Search the identifiers in the fastaSeqIds or seqIds.
99 * If different objects were found from both searches, use sequence length and
100 * MD5 to break the tie (if both match then use RefSeq). If no sequence length
101 * or MD5 then fail.
102 * If no objects were found from either search, then create a new unattached
103 * ReferenceSeq object.
104 * If the object's id is null, set it to the id.
 *  If the object was not found in the id index, add it.
106 *
107 * Attaching ReferenceSeq objects to sequences:
108 * Search RefSeq for seqId.
109 * Else search RefSeq for id.
110 * Else search for seqId in the loaded fasta files.
111 * Else search data directory for id.fasta or id.fa; load it or fail.
112 * Else search data directory for seqId.fasta or seqId.fa; load it or fail.
113 * Else fail.
114 * If failed, mark the object as dead.
115 *
116 *
117 *
118 * Config file:
119 * The config file consists of lines containing whitespace (preferably tab)
 *  separated fields. The fields are:
121 * NAME (unique)
122 * SEQID
123 * extra (optional)
124 *
125 * There is one ReferenceSeq object created per record in the config file.
126 * NAME is stored in id; SEQID is stored in seqId; if extra contains the word
127 * 'circular' (case-insensitive), true is stored in circular. These
128 * ReferenceSeq object are created in the unattached state, i.e. not attached
129 * to a fasta file or a RefSeq library object.
130 *
131 * If SEQID is equal to UNMAPPED_SEQID_VALUE (see below) the Reference will be
132 * considered to be unmapped.
133 *
134 * Fasta files:
 *  A fasta file consists of one or more sequences. A sequence in a fasta file
136 * consists of a seqid line followed by lines containing the bases of the
137 * sequence. A seqid line starts with '>' and the next word (whitespace
138 * delimited) is the seqid. The seqid may consist of '|' delimited identifiers
139 * (this is purposely vague). The fasta seqid is stored in fastaSeqId.
140 *
141 * Fasta files may be loaded explicitly:
142 * When a fasta file is loaded explicitly, a new ReferenceSeq object is created
143 * (with id == NULL) for each sequence found in the file.
144 *
145 * Fasta files may be loaded implicitly:
146 * When an id can't be found in the set of ReferenceSeq objects and can't be
147 * found as an accession by RefSeq, an attempt is made to load a fasta file
148 * named <id>.fasta or <id>.fa in the directory given to the constructor. If
149 * this succeeds, a new ReferenceSeq object with the given id is attached to
 *  the sequence. In this situation, to avoid ambiguity, there can be only one
151 * sequence in the fasta file.
152 *
153 */
154
155 #define UNMAPPED_SEQID_VALUE "*UNMAPPED"
156
157 enum ReferenceSeqType {
158 rst_unattached,
159 rst_local,
160 rst_refSeqById,
161 rst_refSeqBySeqId,
162 rst_unmapped,
163 rst_renamed,
164 rst_dead
165 };
166
167 struct ReferenceSeq {
168 ReferenceMgr *mgr;
169 char *id;
170 char *seqId;
171 char *fastaSeqId;
172
173 char **used;
174 unsigned num_used;
175
176 /* ref table position */
177 int64_t start_rowid;
178 /* total reference length */
179 INSDC_coord_len seq_len;
180 int type;
181 bool circular;
182 uint8_t md5[16];
183 union {
184 struct {
185 KDataBuffer buf;
186 uint8_t const *data;
187 } local;
188 RefSeq const *refseq;
189 } u;
190 };
191
192 #define BUCKET_BITS (12U)
193 #define BUCKETS (1U << BUCKET_BITS)
194 #define BUCKET_MASK (BUCKETS - 1U)
195
196 typedef struct {
197 unsigned count;
198 unsigned *index;
199 } Bucket;
200
201 typedef struct {
202 int length, type;
203 } compress_buffer_t;
204
205 struct ReferenceMgr {
206 TableWriterRef const *writer;
207 KDirectory const *dir;
208 RefSeqMgr const *rmgr;
209 VDatabase *db;
210 ReferenceSeq *refSeq; /* == refSeqs.base */
211
212 int64_t ref_rowid;
213
214 size_t cache;
215
216 uint32_t options;
217 uint32_t num_open_max;
218 uint32_t num_open;
219 uint32_t max_seq_len;
220
221 KDataBuffer compress; /* [compress_buffer_t] */
222 KDataBuffer seq; /* [byte](max_seq_len) */
223 KDataBuffer refSeqs; /* [ReferenceSeq] */
224
225 Bucket used[BUCKETS];
226 Bucket ht[BUCKETS];
227 };
228
hash(unsigned const length,char const key[])229 static unsigned hash(unsigned const length, char const key[])
230 {
231 /* FNV-1a hash with folding */
232 uint64_t h = 0xcbf29ce484222325ull;
233 unsigned i;
234
235 for (i = 0; i < length; ++i) {
236 int const ch = key[i];
237 uint64_t const octet = (uint8_t)toupper(ch);
238
239 h = (h ^ octet) * 0x100000001b3ull;
240 }
241 return (unsigned)(((h ^ (h >> 32))) & ((uint64_t)BUCKET_MASK));
242 }
243
hash0(char const key[])244 static unsigned hash0(char const key[])
245 {
246 /* FNV-1a hash with folding */
247 uint64_t h = 0xcbf29ce484222325ull;
248 unsigned i;
249
250 for (i = 0; ; ++i) {
251 int const ch = key[i];
252 if (ch != '\0') {
253 uint64_t const octet = (uint8_t)toupper(ch);
254 h = (h ^ octet) * 0x100000001b3ull;
255 continue;
256 }
257 break;
258 }
259 return (unsigned)(((h ^ (h >> 32))) & ((uint64_t)BUCKET_MASK));
260 }
261
addToHashBucket(Bucket * const bucket,unsigned const index)262 static void addToHashBucket(Bucket *const bucket, unsigned const index)
263 {
264 unsigned i;
265 void *tmp = NULL;
266
267 for (i = 0; i < bucket->count; ++i) {
268 if (bucket->index[i] == index)
269 return;
270 }
271
272 tmp = realloc(bucket->index, (1 + bucket->count) * sizeof(bucket->index[0]));
273 assert(tmp != NULL);
274 if (tmp == NULL)
275 abort();
276
277 bucket->index = tmp;
278 bucket->index[bucket->count++] = index;
279 }
280
addToHashTable(ReferenceMgr * const self,ReferenceSeq const * const rs)281 static void addToHashTable(ReferenceMgr *const self, ReferenceSeq const *const rs)
282 {
283 unsigned const index = rs - self->refSeq;
284
285 if (rs->id)
286 addToHashBucket(&self->ht[hash0(rs->id)], index);
287 if (rs->seqId)
288 addToHashBucket(&self->ht[hash0(rs->seqId)], index);
289 }
290
freeHashTableEntries(ReferenceMgr * const self)291 static void freeHashTableEntries(ReferenceMgr *const self)
292 {
293 unsigned i;
294 for (i = 0; i < BUCKETS; ++i) {
295 if (self->ht[i].count > 0)
296 free(self->ht[i].index);
297 if (self->used[i].count > 0)
298 free(self->used[i].index);
299 }
300 }
301
302 static
ReferenceSeq_Whack(ReferenceSeq * self)303 void CC ReferenceSeq_Whack(ReferenceSeq *self)
304 {
305 unsigned i;
306
307 for (i = 0; i < self->num_used; ++i)
308 free(self->used[i]);
309
310 if (self->type == rst_local) {
311 KDataBufferWhack(&self->u.local.buf);
312 }
313 else if (self->type == rst_refSeqById || self->type == rst_refSeqBySeqId) {
314 RefSeq_Release(self->u.refseq);
315 }
316 free(self->id);
317 free(self->seqId);
318 free(self->fastaSeqId);
319 free(self->used);
320 }
321
322 struct OpenConfigFile_ctx {
323 char const *name;
324 KDirectory const *dir;
325 KFile const **kfp;
326 rc_t rc;
327 };
328
329 static
OpenConfigFile(char const server[],char const volume[],void * Ctx)330 bool OpenConfigFile(char const server[], char const volume[], void *Ctx)
331 {
332 struct OpenConfigFile_ctx *ctx = Ctx;
333 KDirectory const *dir;
334
335 if(volume == NULL) {
336 ctx->rc = KDirectoryOpenDirRead(ctx->dir, &dir, false, "%s", server);
337 } else {
338 ctx->rc = KDirectoryOpenDirRead(ctx->dir, &dir, false, "%s/%s", server, volume);
339 }
340 if (ctx->rc == 0) {
341 ctx->rc = KDirectoryOpenFileRead(dir, ctx->kfp, "%s", ctx->name);
342 KDirectoryRelease(dir);
343 if (ctx->rc == 0) {
344 return true;
345 }
346 }
347 return false;
348 }
349
350 static
FindAndOpenConfigFile(RefSeqMgr const * const rmgr,KDirectory const * const dir,KFile const ** const kfp,char const conf[])351 rc_t FindAndOpenConfigFile(RefSeqMgr const *const rmgr,
352 KDirectory const *const dir,
353 KFile const **const kfp, char const conf[])
354 {
355 rc_t rc = KDirectoryOpenFileRead(dir, kfp, "%s", conf);
356
357 if (rc) {
358 struct OpenConfigFile_ctx ctx;
359
360 ctx.name = conf;
361 ctx.dir = dir;
362 ctx.kfp = kfp;
363 ctx.rc = 0;
364
365 rc = RefSeqMgr_ForEachVolume(rmgr, OpenConfigFile, &ctx);
366 if (rc == 0 && *kfp == NULL) {
367 rc = RC(rcAlign, rcIndex, rcConstructing, rcFile, rcNotFound);
368 }
369 }
370 return rc;
371 }
372
373 enum comparison_weights {
374 no_match = 0,
375 substring_match = (1u << 0),
376 expected_prefix = (1u << 1),
377 exact_match = (1u << 2),
378 seqId_match = (1u << 3),
379 id_match = (1u << 4),
380 seq_len_match = (1u << 5),
381 md5_match = (1u << 6),
382 attached = (1u << 7)
383 };
384
385 static
str_weight(char const str[],char const qry[],unsigned const qry_len)386 unsigned str_weight(char const str[], char const qry[], unsigned const qry_len)
387 {
388 char const *const fnd = strcasestr(str, qry);
389 unsigned wt = no_match;
390
391 if (fnd) {
392 unsigned const fnd_len = (unsigned)string_size(fnd);
393 unsigned const fndlen = (fnd_len > qry_len && fnd[qry_len] == '|') ? qry_len : fnd_len;
394
395 if (fndlen == qry_len && (fnd == str || fnd[-1] == '|')) {
396 wt |= substring_match;
397
398 if (fnd == str) {
399 if (fnd[fndlen] == '\0')
400 wt |= exact_match;
401 }
402 else {
403 /* check for expected prefices */
404 char const *ns = fnd - 1;
405
406 while (ns != str && ns[-1] != '|')
407 --ns;
408
409 if ( memcmp(ns, "ref|", 4) == 0
410 || memcmp(ns, "emb|", 4) == 0
411 || memcmp(ns, "dbj|", 4) == 0
412 || memcmp(ns, "tpg|", 4) == 0
413 || memcmp(ns, "tpe|", 4) == 0
414 || memcmp(ns, "tpd|", 4) == 0
415 || memcmp(ns, "gpp|", 4) == 0
416 || memcmp(ns, "gb|", 3) == 0
417 )
418 {
419 wt |= expected_prefix;
420 }
421 }
422 }
423 }
424 return wt;
425 }
426
addToIndex(ReferenceMgr * const self,char const ID[],ReferenceSeq * const rs)427 static void addToIndex(ReferenceMgr *const self, char const ID[],
428 ReferenceSeq *const rs)
429 {
430 unsigned const n = rs->num_used;
431 unsigned i;
432
433 for (i = 0; i < n; ++i) {
434 if (strcmp(ID, rs->used[i]) == 0)
435 goto SKIP_INSERT_ID;
436 }
437 {
438 unsigned const length = (unsigned)string_size(ID);
439 char *id = string_dup(ID, length);
440 void *tmp = realloc(rs->used, (n + 1) * sizeof(rs->used[0]));
441
442 assert(tmp != NULL && id != NULL);
443 if (tmp == NULL || id == NULL) abort();
444
445 ++rs->num_used;
446 rs->used = tmp;
447 rs->used[n] = id;
448 }
449 SKIP_INSERT_ID:
450 addToHashBucket(&self->used[hash0(ID)], rs - self->refSeq);
451 }
452
findId(ReferenceMgr const * const self,char const id[])453 static int findId(ReferenceMgr const *const self, char const id[])
454 {
455 unsigned const hv = hash0(id);
456 Bucket const *const bucket = &self->used[hv];
457 unsigned const n = bucket->count;
458 unsigned i;
459
460 for (i = 0; i < n; ++i) {
461 unsigned const index = bucket->index[i];
462 ReferenceSeq const *rs = &self->refSeq[index];
463 unsigned const m = rs->num_used;
464 unsigned j;
465
466 if (rs->type == rst_dead)
467 continue;
468 for (j = 0; j < m; ++j) {
469 if (strcmp(id, rs->used[j]) == 0)
470 return index;
471 }
472 }
473 return -1;
474 }
475
newReferenceSeq(ReferenceMgr * const self,ReferenceSeq const * const template)476 static ReferenceSeq *newReferenceSeq(ReferenceMgr *const self, ReferenceSeq const *const template)
477 {
478 unsigned const last_rs = (unsigned)self->refSeqs.elem_count;
479 rc_t const rc = KDataBufferResize(&self->refSeqs, last_rs + 1);
480
481 assert(rc == 0);
482 if (rc) abort();
483
484 self->refSeq = self->refSeqs.base;
485 {
486 ReferenceSeq *const rslt = &self->refSeq[last_rs];
487 if (template)
488 *rslt = *template;
489 else
490 memset(rslt, 0, sizeof(*rslt));
491 rslt->mgr = self;
492 return rslt;
493 }
494 }
495
496 static
config_cmp(void const * A,void const * B,void * Data)497 int64_t CC config_cmp(void const *A, void const *B, void *Data)
498 {
499 struct {
500 unsigned id;
501 unsigned seqId;
502 unsigned extra;
503 unsigned extralen;
504 } const *const a = A;
505 struct {
506 unsigned id;
507 unsigned seqId;
508 unsigned extra;
509 unsigned extralen;
510 } const *const b = B;
511 char const *const data = Data;
512
513 return strcmp(&data[a->id], &data[b->id]);
514 }
515
516 static
ReferenceMgr_ProcessConf(ReferenceMgr * const self,char Data[],unsigned const len)517 rc_t ReferenceMgr_ProcessConf(ReferenceMgr *const self, char Data[], unsigned const len)
518 {
519 rc_t rc;
520 unsigned i;
521 unsigned j;
522 struct {
523 unsigned id;
524 unsigned seqId;
525 unsigned extra;
526 unsigned extralen;
527 } *data, tmp;
528 KDataBuffer buf;
529
530 memset(&buf, 0, sizeof(buf));
531 buf.elem_bits = sizeof(data[0]) * 8;
532
533 for (j = i = 0; i < len; ++j) {
534 unsigned lineEnd;
535 unsigned id;
536 unsigned acc;
537 unsigned ii;
538
539 for (lineEnd = i; lineEnd != len; ++lineEnd) {
540 int const ch = Data[lineEnd];
541
542 if (ch == '\n')
543 break;
544 if (ch == '\r')
545 break;
546 }
547 if (i == lineEnd) {
548 ++i;
549 continue;
550 }
551 Data[lineEnd] = '\0';
552 for (id = i; id != lineEnd; ++id) {
553 int const ch = Data[id];
554
555 if (!isspace(ch))
556 break;
557 }
558 for (ii = id; ii != lineEnd; ++ii) {
559 int const ch = Data[ii];
560
561 if (isspace(ch)) {
562 Data[ii++] = '\0';
563 break;
564 }
565 }
566 for (acc = ii; acc < lineEnd; ++acc) {
567 int const ch = Data[acc];
568
569 if (!isspace(ch))
570 break;
571 }
572 if (acc >= lineEnd)
573 return RC(rcAlign, rcFile, rcReading, rcFormat, rcInvalid);
574
575 for (ii = acc; ii != lineEnd; ++ii) {
576 int const ch = Data[ii];
577
578 if (isspace(ch)) {
579 Data[ii++] = '\0';
580 break;
581 }
582 }
583 tmp.id = id;
584 tmp.seqId = acc;
585 tmp.extra = ii;
586 tmp.extralen = lineEnd > ii ? lineEnd - ii : 0;
587
588 if ((rc = KDataBufferResize(&buf, buf.elem_count + 1)) != 0) return rc;
589 data = buf.base;
590
591 data[buf.elem_count-1] = tmp;
592 i = lineEnd + 1;
593 }
594
595 /* check unique */
596 ksort(data, buf.elem_count, sizeof(data[0]), config_cmp, Data);
597 for (i = 1; i < buf.elem_count; ++i) {
598 if (strcmp(&Data[data[i-1].id], &Data[data[i].id]) == 0)
599 return RC(rcAlign, rcIndex, rcConstructing, rcItem, rcExists);
600 }
601
602 for (i = 0; i != buf.elem_count; ++i) {
603 unsigned const extralen = data[i].extralen;
604 char const *const id = &Data[data[i].id];
605 char const *const seqId = &Data[data[i].seqId];
606 char const *const extra = extralen ? &Data[data[i].extra] : NULL;
607 bool circular = false;
608 ReferenceSeq *rs;
609
610 if (extra && extralen >= 8) {
611 char const *const circ = strcasestr(extra, "circular");
612
613 circular = circ && (circ == extra || isspace(circ[-1])) &&
614 (circ[8] == '\0' || isspace(circ[8]));
615 }
616 rs = newReferenceSeq(self, NULL);
617
618 rs->id = string_dup(id, string_size(id));
619 if (rs->id == NULL)
620 return RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
621
622 if (strcmp(seqId, UNMAPPED_SEQID_VALUE) == 0) {
623 rs->type = rst_unmapped;
624 }
625 else {
626 rs->seqId = string_dup(seqId, string_size(seqId));
627 if (rs->seqId == NULL)
628 return RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
629 }
630 rs->circular = circular;
631
632 addToHashTable(self, rs);
633 }
634 KDataBufferWhack(&buf);
635 return 0;
636 }
637
638 static
ReferenceMgr_Conf(ReferenceMgr * const self,char const conf[])639 rc_t ReferenceMgr_Conf(ReferenceMgr *const self, char const conf[])
640 {
641 rc_t rc;
642 const KFile* kf = NULL;
643
644 if (conf == NULL)
645 return 0;
646
647 rc = FindAndOpenConfigFile(self->rmgr, self->dir, &kf, conf);
648 if (rc == 0) {
649 uint64_t sz;
650 KDataBuffer buf;
651
652 rc = KFileSize(kf, &sz);
653 assert(rc == 0);
654 if (sz == 0)
655 (void)PLOGMSG(klogWarn, (klogWarn, "Configuration file '$(file)' is empty", "file=%s", conf));
656 else {
657 rc = KDataBufferMakeBytes(&buf, sz + 1);
658 if (rc == 0) {
659 size_t nread;
660
661 rc = KFileReadAll(kf, 0, buf.base, sz, &nread);
662 if (rc == 0) {
663 assert(nread == sz);
664 ((char *)buf.base)[sz] = '\n'; /* make sure that last line is terminated */
665 rc = ReferenceMgr_ProcessConf(self, buf.base, (unsigned)(sz + 1));
666 }
667 KDataBufferWhack(&buf);
668 }
669 }
670 KFileRelease(kf);
671 }
672 return rc;
673 }
674
675 static
FastaFile_GetSeqIds(KDataBuffer * const buf,char const data[],uint64_t const len)676 rc_t FastaFile_GetSeqIds(KDataBuffer *const buf, char const data[], uint64_t const len)
677 {
678 uint64_t pos;
679 int st = 0;
680
681 for (pos = 0; pos < len; ++pos) {
682 int const ch = data[pos];
683
684 if (st == 0) {
685 if (ch == '>') {
686 uint64_t const n = buf->elem_count;
687 rc_t rc = KDataBufferResize(buf, n + 1);
688
689 if (rc)
690 return rc;
691 ((uint64_t *)buf->base)[n] = pos;
692 st = 1;
693 }
694 }
695 else if (ch == '\r' || ch == '\n')
696 st = 0;
697 }
698 return 0;
699 }
700
701 static
ImportFasta(ReferenceSeq * const obj,KDataBuffer const * const buf)702 rc_t ImportFasta(ReferenceSeq *const obj, KDataBuffer const *const buf)
703 {
704 unsigned seqId;
705 unsigned seqIdLen;
706 unsigned ln;
707 unsigned src;
708 unsigned dst;
709 unsigned start=0;
710 char *const data = buf->base;
711 unsigned const len = buf->elem_count;
712 char *fastaSeqId = NULL;
713 rc_t rc;
714 MD5State mds;
715
716 if (len == 0)
717 return 0;
718 assert(data[0] == '>');
719
720 for (ln = 1; ln != len; ++ln) {
721 int const ch = data[ln];
722
723 if (ch == '\r' || ch == '\n') {
724 data[ln] = '\0';
725 start = ln + 1;
726 break;
727 }
728 }
729 for (seqId = 1; seqId != ln; ++seqId) {
730 if (!isspace(data[seqId]))
731 break;
732 }
733 for (seqIdLen = 0; seqId + seqIdLen < ln; ++seqIdLen) {
734 if (isspace(data[seqId + seqIdLen])) {
735 ln = seqId + seqIdLen;
736 data[ln] = '\0';
737 break;
738 }
739 }
740 if (seqIdLen == 0)
741 return RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
742
743 fastaSeqId = string_dup(&data[seqId], string_size(&data[seqId]));
744 if (fastaSeqId == NULL)
745 return RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
746
747 MD5StateInit(&mds);
748 for (dst = src = start; src != len; ++src) {
749 int const ch = toupper(data[src]);
750
751 if (isspace(ch))
752 continue;
753
754 if (strchr(INSDC_4na_map_CHARSET, ch) == NULL && ch != 'X')
755 return RC(rcAlign, rcFile, rcReading, rcData, rcInvalid);
756
757 data[dst] = ch == 'X' ? 'N' : ch;
758 MD5StateAppend(&mds, data + dst, 1);
759 ++dst;
760 }
761 MD5StateFinish(&mds, obj->md5);
762 rc = KDataBufferSub(buf, &obj->u.local.buf, start, dst - start);
763 if (rc == 0) {
764 obj->fastaSeqId = fastaSeqId;
765 obj->type = rst_local;
766 obj->seq_len = dst - start;
767 }
768 else
769 obj->type = rst_dead;
770 return rc;
771 }
772
subBuffer(KDataBuffer const * const buf,uint64_t const offset,unsigned const length)773 static KDataBuffer subBuffer(KDataBuffer const *const buf, uint64_t const offset, unsigned const length)
774 {
775 KDataBuffer sub;
776 memset(&sub, 0, sizeof(sub));
777 KDataBufferSub(buf, &sub, offset, length);
778 return sub;
779 }
780
compareSeqIdFields(unsigned const alen,char const a[],unsigned const blen,char const b[])781 static int compareSeqIdFields(unsigned const alen, char const a[], unsigned const blen, char const b[])
782 {
783 unsigned i;
784
785 for (i = 0; i < alen && i < blen; ++i) {
786 int const cha = a[i];
787 int const chb = b[i];
788 int const diff = toupper(cha) - toupper(chb);
789
790 if (diff < 0)
791 return -1;
792
793 if (diff > 0)
794 return 1;
795
796 if (cha == '\0')
797 return 0;
798 }
799 return alen < blen ? -1 : alen == blen ? 0 : 1;
800 }
801
802 typedef struct {
803 unsigned length;
804 uint64_t offset;
805 } seqIdField_t;
806
cmpSeqIdField(void const * A,void const * B,void * Ctx)807 static int64_t cmpSeqIdField(void const *A, void const *B, void *Ctx)
808 {
809 char const *const data = Ctx;
810 seqIdField_t const *a = A;
811 seqIdField_t const *b = B;
812 char const *avalue = &data[a->offset];
813 unsigned const alen = a->length;
814 char const *bvalue = &data[b->offset];
815 unsigned const blen = b->length;
816 int const diff = compareSeqIdFields(alen, avalue, blen, bvalue);
817
818 return diff != 0 ? diff : a->offset < b->offset ? -1 : 1;
819 }
820
isInKnownSet(unsigned const len,char const id[])821 static bool isInKnownSet(unsigned const len, char const id[])
822 {
823 if (len == 2 || len == 3) {
824 int const id1 = id[0];
825 int const id2 = id[1];
826
827 if (len == 3) {
828 int const id3 = id[2];
829
830 if (id1 == 'd' && id2 == 'b' && id3 == 'j') return true;
831 if (id1 == 'e' && id2 == 'm' && id3 == 'b') return true;
832 if (id1 == 'g' && id2 == 'p' && id3 == 'p') return true;
833 if (id1 == 'r' && id2 == 'e' && id3 == 'f') return true;
834 if (id1 == 't' && id2 == 'p') {
835 if (id3 == 'd' || id3 == 'e' || id3 == 'g') return true;
836 }
837 return false;
838 }
839 if (id1 == 'g') {
840 if (id2 == 'b' || id2 == 'i')
841 return true;
842 }
843 }
844 return false;
845 }
846
ImportFastaFileMany(ReferenceMgr * const self,unsigned const seqIds,uint64_t const seqIdOffset[],KDataBuffer const * const buf)847 static rc_t ImportFastaFileMany(ReferenceMgr *const self,
848 unsigned const seqIds, uint64_t const seqIdOffset[],
849 KDataBuffer const *const buf)
850 {
851 unsigned const datalen = buf->elem_count;
852 char *const data = buf->base;
853 seqIdField_t *found = NULL;
854 seqIdField_t *ignore = NULL;
855 unsigned found_entries = 0;
856 unsigned found_entries_max = seqIds;
857 unsigned ignore_entries = 0;
858 unsigned ignore_entries_max = 0;
859 unsigned i;
860 rc_t rc = 0;
861
862 found = malloc(found_entries_max * sizeof(found[0]));
863 assert(found);
864 if (found == NULL)
865 abort();
866
867 for (i = 0; i < seqIds; ++i) {
868 uint64_t const ofs = seqIdOffset[i];
869 uint64_t const nxt = (i < seqIds - 1) ? seqIdOffset[i + 1] : datalen;
870 unsigned const len = (unsigned)(nxt - ofs);
871 unsigned j = 1;
872 unsigned k;
873
874 for (k = j; k < len; ++k) {
875 int const ch = data[ofs + k];
876
877 if (ch == '|' || isspace(ch)) {
878 char const *const str = &data[ofs + j];
879 unsigned const len = k - j;
880
881 if (len > 0 && (isspace(ch) || !isInKnownSet(len, str))) {
882 if (found_entries == found_entries_max) {
883 unsigned const new_max = found_entries_max * 2;
884 void *const tmp = realloc(found, new_max * sizeof(found[0]));
885
886 assert(tmp);
887 if (tmp == NULL)
888 abort();
889
890 found = tmp;
891 found_entries_max = new_max;
892 }
893 found[found_entries].length = len;
894 found[found_entries].offset = ofs + j;
895 ++found_entries;
896 }
897 j = k + 1;
898 }
899 if (isspace(ch))
900 break;
901 }
902 }
903 ksort(found, found_entries, sizeof(found[0]), cmpSeqIdField, (void *)data);
904
905 for (i = 1; i < found_entries; ++i) {
906 unsigned const llen = ignore_entries_max > 0 ? ignore[ignore_entries - 1].length : 0;
907 char const *const last = ignore_entries_max > 0 ? data + ignore[ignore_entries - 1].offset : 0;
908
909 unsigned const plen = found[i - 1].length;
910 char const *const prv = data + found[i - 1].offset;
911
912 unsigned const qlen = found[i].length;
913 char const *const qry = data + found[i].offset;
914
915 bool const issame = compareSeqIdFields(plen, prv, qlen, qry) == 0;
916 bool const islast = llen > 0 ? compareSeqIdFields(llen, last, qlen, qry) == 0 : false;
917
918 if (issame && !islast) {
919 if (ignore_entries == ignore_entries_max) {
920 unsigned const new_max = ignore_entries_max == 0 ? (4096 / sizeof(ignore[0])) : (ignore_entries_max * 2);
921 void *const tmp = realloc(ignore, new_max * sizeof(ignore[0]));
922
923 assert(tmp);
924 if (tmp == NULL)
925 abort();
926
927 ignore = tmp;
928 ignore_entries_max = new_max;
929 }
930 ignore[ignore_entries] = found[i - 1];
931 ++ignore_entries;
932 }
933 }
934 free(found);
935
936 for (i = 0; i < seqIds; ++i) {
937 unsigned index = 0;
938 uint64_t const ofs = seqIdOffset[i];
939 uint64_t const nxt = (i < seqIds - 1) ? seqIdOffset[i + 1] : datalen;
940 unsigned const len = (unsigned)(nxt - ofs);
941 unsigned j = 1;
942 unsigned k;
943 char const *const seqId = &data[ofs + 1];
944 {
945 ReferenceSeq new_seq;
946 KDataBuffer sub = subBuffer(buf, ofs, len);
947
948 memset(&new_seq, 0, sizeof(new_seq));
949 rc = ImportFasta(&new_seq, &sub);
950 KDataBufferWhack(&sub);
951 if (rc) {
952 PLOGERR(klogInfo, (klogInfo, rc, "Error reading '$(defline)'; ignored", "defline=%s", seqId));
953 continue;
954 }
955 else {
956 ReferenceSeq *p = newReferenceSeq(self, &new_seq);
957 index = p - self->refSeq;
958 addToHashBucket(&self->ht[hash0(seqId)], index);
959 }
960 }
961 for (k = j; k < len; ++k) {
962 int const ch = data[ofs + k];
963
964 if (ch == '\0' || ch == '|' || isspace(ch)) {
965 unsigned const length = k - j;
966 uint64_t const offset = ofs + j;
967 char const *const value = &data[offset];
968
969 if (length == 0 || (!isspace(ch) && isInKnownSet(length, value)))
970 goto IGNORED;
971 {
972 unsigned f = 0;
973 unsigned e = ignore_entries;
974
975 while (f < e) {
976 unsigned const m = f + (e - f) / 2;
977 unsigned const flen = ignore[m].length;
978 char const *const fnd = data + ignore[m].offset;
979 int const diff = compareSeqIdFields(length, value, flen, fnd);
980
981 if (diff == 0)
982 goto IGNORED;
983 if (diff < 0)
984 e = m;
985 else
986 f = m + 1;
987 }
988 }
989 addToHashBucket(&self->ht[hash(length, value)], index);
990 IGNORED:
991 j = k + 1;
992 if (ch == '\0' || isspace(ch))
993 break;
994 }
995 }
996 }
997
998 free(ignore);
999 return rc;
1000 }
1001
1002 #define READ_CHUNK_SIZE (1024 * 1024)
1003
1004 static
ImportFastaFile(ReferenceMgr * const self,KFile const * kf,ReferenceSeq * rslt)1005 rc_t ImportFastaFile(ReferenceMgr *const self, KFile const *kf,
1006 ReferenceSeq *rslt)
1007 {
1008 uint64_t file_size;
1009 rc_t rc = KFileSize(kf, &file_size);
1010
1011 if (rc == 0) {
1012 KDataBuffer fbuf;
1013
1014 rc = KDataBufferMakeBytes(&fbuf, file_size);
1015 if (rc == 0) {
1016 char *const data = fbuf.base;
1017 uint64_t inbuf = 0;
1018
1019 while (inbuf < file_size) {
1020 uint64_t const remain = file_size - inbuf;
1021 size_t nread = 0;
1022
1023 rc = KFileRead(kf, inbuf, &data[inbuf], remain > READ_CHUNK_SIZE ? READ_CHUNK_SIZE : remain, &nread);
1024 if (rc != 0 || nread == 0)
1025 break;
1026 inbuf += nread;
1027 }
1028 if (rc == 0) {
1029 char const *const base = fbuf.base;
1030 KDataBuffer seqIdBuf;
1031
1032 memset(&seqIdBuf, 0, sizeof(seqIdBuf));
1033 seqIdBuf.elem_bits = sizeof(file_size) * 8;
1034
1035 rc = FastaFile_GetSeqIds(&seqIdBuf, base, file_size);
1036 if (rc == 0) {
1037 uint64_t const *const seqIdOffset = seqIdBuf.base;
1038 unsigned const seqIds = (unsigned)seqIdBuf.elem_count;
1039
1040 if (rslt) {
1041 KDataBuffer sub = subBuffer(&fbuf, seqIdOffset[0], (unsigned)(file_size - seqIdOffset[0]));
1042
1043 if (seqIds > 1)
1044 rc = RC(rcAlign, rcFile, rcReading, rcItem, rcUnexpected);
1045
1046 rc = ImportFasta(rslt, &sub);
1047 KDataBufferWhack(&sub);
1048 if (rc == 0)
1049 addToHashTable(self, rslt);
1050 }
1051 else {
1052 ImportFastaFileMany(self, seqIds, seqIdOffset, &fbuf);
1053 }
1054 }
1055 KDataBufferWhack(&seqIdBuf);
1056 }
1057 KDataBufferWhack(&fbuf);
1058 }
1059 }
1060 return rc;
1061 }
1062
1063 static
OpenFastaFile(KFile const ** const kf,KDirectory const * const dir,char const base[],unsigned const len)1064 rc_t OpenFastaFile(KFile const **const kf,
1065 KDirectory const *const dir,
1066 char const base[],
1067 unsigned const len)
1068 {
1069 char fname_a[4096];
1070 char *fname_h = NULL;
1071 char *fname = fname_a;
1072 rc_t rc;
1073
1074 if (len + 7 >= sizeof(fname_a)) {
1075 fname_h = malloc(len + 7);
1076 if (fname_h)
1077 fname = fname_h;
1078 else
1079 return RC(rcAlign, rcFile, rcOpening, rcMemory, rcExhausted);
1080 }
1081 memmove(fname, base, len);
1082 memmove(fname + len, ".fasta", 7);
1083
1084 ALIGN_CF_DBGF(("trying to open fasta file: %.s\n", fname));
1085 rc = KDirectoryOpenFileRead(dir, kf, "%s", fname);
1086 if (rc) {
1087 fname[len + 3] = '\0'; /* base.fasta -> base.fa */
1088
1089 ALIGN_CF_DBGF(("trying to open fasta file: %.s\n", fname));
1090 rc = KDirectoryOpenFileRead(dir, kf, "%s", fname);
1091 }
1092 free(fname_h);
1093 return rc;
1094 }
1095
1096 #if 1
void ReferenceSeq_Dump(ReferenceSeq const *const rs)
{
    /* Debug-print one ReferenceSeq record as a pseudo-JSON object. */
    static char const *types[] = {
        "'unattached'",
        "'fasta'",
        "'RefSeq-by-id'",
        "'RefSeq-by-seqid'",
        "'unmapped'",
        "'dead'"
    };
    unsigned i;

    ((void)types); /* silence unused warning when debug output is compiled out */
    ALIGN_CF_DBGF(("{ "));
    ALIGN_CF_DBGF(("type: %s, ", (rs->type < 0 || rs->type > rst_dead) ? "null" : types[rs->type]));

    if (rs->id == NULL)
        ALIGN_CF_DBGF(("id: null, "));
    else
        ALIGN_CF_DBGF(("id: '%s', ", rs->id));

    if (rs->seqId == NULL)
        ALIGN_CF_DBGF(("seqId: null, "));
    else
        ALIGN_CF_DBGF(("seqId: '%s', ", rs->seqId));

    if (rs->fastaSeqId == NULL)
        ALIGN_CF_DBGF(("fastaSeqId: null, "));
    else
        ALIGN_CF_DBGF(("fastaSeqId: '%s', ", rs->fastaSeqId));

    ALIGN_CF_DBGF(("seq-len: %u, ", rs->seq_len));
    ALIGN_CF_DBGF(("circular: %s, ", rs->circular ? "true" : "false"));

    ALIGN_CF_DBGF(("md5: '"));
    for (i = 0; i < 16; ++i)
        ALIGN_CF_DBGF(("%02X", rs->md5[i]));
    ALIGN_CF_DBGF(("', "));

    ALIGN_CF_DBGF(("keys: [ "));
    for (i = 0; i < rs->num_used; ++i)
        ALIGN_CF_DBGF(("'%s', ", rs->used[i]));
    ALIGN_CF_DBGF(("] }"));
}
1144
ReferenceMgr_DumpConfig(ReferenceMgr const * const self)1145 LIB_EXPORT void ReferenceMgr_DumpConfig(ReferenceMgr const *const self)
1146 {
1147 unsigned const n = (unsigned)self->refSeqs.elem_count;
1148 unsigned i;
1149
1150 ALIGN_CF_DBGF(("config: [\n"));
1151 for (i = 0; i != n; ++i) {
1152 ALIGN_CF_DBGF(("\t"));
1153 ReferenceSeq_Dump(&self->refSeq[i]);
1154 ALIGN_CF_DBGF((",\n"));
1155 }
1156 ALIGN_CF_DBGF(("]\n"));
1157 }
1158 #endif
1159
1160 static
ReferenceSeq_GetRefSeqInfo(ReferenceSeq * const self)1161 rc_t ReferenceSeq_GetRefSeqInfo(ReferenceSeq *const self)
1162 {
1163 rc_t rc;
1164 uint8_t const *md5;
1165
1166 assert(self != NULL);
1167 assert(self->type == rst_refSeqById || self->type == rst_refSeqBySeqId);
1168
1169 if ((rc = RefSeq_Circular(self->u.refseq, &self->circular)) != 0)
1170 return rc;
1171 if ((rc = RefSeq_SeqLength(self->u.refseq, &self->seq_len)) != 0)
1172 return rc;
1173 if ((rc = RefSeq_MD5(self->u.refseq, &md5)) != 0)
1174 return rc;
1175
1176 if (md5)
1177 memmove(self->md5, md5, 16);
1178 else
1179 memset(self->md5, 0, 16);
1180 return 0;
1181 }
1182
1183
/* Try to attach to a RefSeq or a fasta file.
 * Search order: RefSeq by seqId, RefSeq by id, fasta file by id
 * (falling back to seqId), or fasta file by seqId when there is no id.
 * Returns 0 with rs->type still rst_unattached when nothing was found:
 * "not found" is not an error here -- the caller decides what to do.
 */
static
rc_t ReferenceSeq_Attach(ReferenceMgr *const self, ReferenceSeq *const rs)
{
    unsigned const seqid_len = rs->seqId ? (unsigned)string_size(rs->seqId) : 0;
    unsigned const id_len = rs->id ? (unsigned)string_size(rs->id) : 0;
    rc_t rc = 0;
    KFile const *kf = NULL;

    assert(rs->type == rst_unattached);
    assert(id_len != 0 || seqid_len != 0);

    if (seqid_len) {
        ALIGN_CF_DBGF(("trying to open refseq: %.*s\n", seqid_len, rs->seqId));
        if (RefSeqMgr_Exists(self->rmgr, rs->seqId, seqid_len, NULL) == 0) {
            rc = RefSeqMgr_GetSeq(self->rmgr, &rs->u.refseq, rs->seqId, seqid_len);
            if (rc == 0) {
                rs->type = rst_refSeqBySeqId;
                rc = ReferenceSeq_GetRefSeqInfo(rs);
            }
            /* RefSeq exists for this seqId: success or failure, stop here */
            return rc;
        }
    }
    if (id_len) {
        ALIGN_CF_DBGF(("trying to open refseq: %.*s\n", id_len, rs->id));
        if (RefSeqMgr_Exists(self->rmgr, rs->id, id_len, NULL) == 0) {
            rc = RefSeqMgr_GetSeq(self->rmgr, &rs->u.refseq, rs->id, id_len);
            if (rc == 0) {
                rs->type = rst_refSeqById;
                rc = ReferenceSeq_GetRefSeqInfo(rs);
            }
            return rc;
        }
    }
    /* no RefSeq entry; look for a local fasta file */
    if (id_len) {
        ALIGN_CF_DBGF(("trying to open fasta: %.*s\n", id_len, rs->id));
        rc = OpenFastaFile(&kf, self->dir, rs->id, id_len);
        if (rc && seqid_len) {
            ALIGN_CF_DBGF(("trying to open fasta: %.*s\n", seqid_len, rs->seqId));
            rc = OpenFastaFile(&kf, self->dir, rs->seqId, seqid_len);
        }
    }
    else {
        ALIGN_CF_DBGF(("trying to open fasta: %.*s\n", seqid_len, rs->seqId));
        rc = OpenFastaFile(&kf, self->dir, rs->seqId, seqid_len);
    }
    if (kf) {
        /* snapshot so a failed import can be rolled back */
        ReferenceSeq tmp = *rs;

        ALIGN_CF_DBGF(("importing fasta"));
        rc = ImportFastaFile(self, kf, rs);
        KFileRelease(kf);
        if (rc != 0)
            *rs = tmp;
        return rc;
    }
    /* the open-failure rc is deliberately discarded: staying unattached is OK */
    return 0;
}
1242
/* One possible match for a reference lookup: weight is a bit-mask of
 * match-quality flags (id/seqId/fasta/seq-len/md5), index points into
 * self->refSeq. */
struct Candidate {
    int weight;
    unsigned index;
};
1247
cmpCandidate(void const * A,void const * B,void * Ctx)1248 static int64_t CC cmpCandidate(void const *A, void const *B, void *Ctx)
1249 {
1250 struct Candidate const *const a = A;
1251 struct Candidate const *const b = B;
1252
1253 return (int64_t)(a->weight - b->weight);
1254 }
1255
sortCandidateList(unsigned const len,struct Candidate * list)1256 static void sortCandidateList(unsigned const len, struct Candidate *list)
1257 {
1258 if (len > 1)
1259 ksort(list, len, sizeof(list[0]), cmpCandidate, NULL);
1260 }
1261
/* Collect the entries in id's hash bucket that match id by exact id,
 * case-insensitive seqId, or fasta-header substring.  On return *rslt is
 * a malloc'd array of *count candidates, or NULL when there are none.
 * seq_len/md5 are accepted for interface symmetry; scoring on them is
 * done later by setSeqLenBit/setMD5Bit. */
static void candidates(ReferenceMgr *const self,
                       unsigned *const count,
                       struct Candidate **const rslt,
                       unsigned const idLen,
                       char const id[],
                       unsigned const seq_len,
                       uint8_t const md5[16])
{
    unsigned const hv = hash(idLen, id);
    Bucket const bucket = self->ht[hv];
    unsigned num_possible = 0;
    struct Candidate *possible;
    unsigned i;

    (void)seq_len; (void)md5; /* scored later */

    *count = 0;
    *rslt = NULL;
    /* BUG FIX: malloc(0) may legally return NULL; an empty bucket must not
     * trip the allocation-failure abort() below */
    if (bucket.count == 0)
        return;

    possible = malloc(sizeof(possible[0]) * bucket.count);
    assert(possible != NULL);
    if (possible == NULL)
        abort();
    for (i = 0; i < bucket.count; ++i) {
        unsigned const index = bucket.index[i];
        ReferenceSeq *const rs = &self->refSeq[index];

        if (rs->type != rst_dead) {
            bool const sameId = rs->id != NULL && strcmp(rs->id, id) == 0;
            bool const sameSeqId = rs->seqId != NULL && strcasecmp(rs->seqId, id) == 0;
            int const sameFastaSeqId = rs->fastaSeqId != NULL ? str_weight(rs->fastaSeqId, id, idLen) : 0;
            struct Candidate nv;

            nv.weight = (sameId ? id_match : 0) | (sameSeqId ? seqId_match : 0) | sameFastaSeqId;
            nv.index = index;

            if (nv.weight > 0)
                possible[num_possible++] = nv;
        }
    }
    if (num_possible == 0) {
        free(possible);
        return;
    }
    *count = num_possible;
    *rslt = possible;
}
1303
tryFastaOrRefSeq(ReferenceMgr * const self,ReferenceSeq ** const rslt,unsigned const idLen,char const id[],bool * tryAgain)1304 static rc_t tryFastaOrRefSeq(ReferenceMgr *const self,
1305 ReferenceSeq **const rslt,
1306 unsigned const idLen,
1307 char const id[],
1308 bool *tryAgain)
1309 {
1310 KFile const *kf = NULL;
1311 rc_t rc = 0;
1312
1313 if (OpenFastaFile(&kf, self->dir, id, idLen) == 0) {
1314 /* found a fasta file; load it and try again */
1315 rc = ImportFastaFile(self, kf, NULL);
1316 if (rc == 0) {
1317 *tryAgain = true;
1318 return 0;
1319 }
1320 }
1321 /* didn't find a fasta file; try RefSeq */
1322 ALIGN_CF_DBGF(("trying to open refseq: %s\n", id));
1323 if (RefSeqMgr_Exists(self->rmgr, id, idLen, NULL) == 0) {
1324 ReferenceSeq *const seq = newReferenceSeq(self, NULL);
1325
1326 rc = RefSeqMgr_GetSeq(self->rmgr, &seq->u.refseq, id, idLen);
1327 if (rc == 0) {
1328 unsigned const hv = hash(idLen, id);
1329 seq->id = string_dup(id, idLen);
1330 seq->type = rst_refSeqById;
1331
1332 addToHashBucket(&self->ht[hv], seq - self->refSeq);
1333
1334 ReferenceSeq_GetRefSeqInfo(seq);
1335 *rslt = seq;
1336 }
1337 }
1338 else {
1339 /* nothing found and out of options */
1340 rc = RC(rcAlign, rcFile, rcConstructing, rcId, rcNotFound);
1341 }
1342 return rc;
1343 }
1344
static void setSeqLenBit(ReferenceMgr *const self, unsigned const N, struct Candidate *const C, unsigned const seq_len)
{
    /* flag every candidate whose recorded sequence length equals seq_len */
    unsigned k;

    for (k = 0; k < N; ++k) {
        ReferenceSeq const *const entry = &self->refSeq[C[k].index];

        if (entry->seq_len == seq_len)
            C[k].weight |= seq_len_match;
    }
}
1356
static void setMD5Bit(ReferenceMgr *const self, unsigned const N, struct Candidate *const C, uint8_t const md5[])
{
    /* flag every candidate whose recorded MD5 equals the given digest */
    unsigned k;

    for (k = 0; k < N; ++k) {
        ReferenceSeq const *const entry = &self->refSeq[C[k].index];

        if (memcmp(entry->md5, md5, 16) == 0)
            C[k].weight |= md5_match;
    }
}
1368
static void setAttachedBit(ReferenceMgr *const self, unsigned const N, struct Candidate *const C)
{
    /* flag every candidate that is already attached to a data source */
    unsigned k;

    for (k = 0; k < N; ++k) {
        ReferenceSeq const *const entry = &self->refSeq[C[k].index];

        if (entry->type != rst_unattached)
            C[k].weight |= attached;
    }
}
1380
static ReferenceSeq *checkForMultiMapping(ReferenceMgr *const self, ReferenceSeq *const chosen, bool *const wasRenamed)
{
    /* When multi-mapping is disallowed: if some other live entry shares
     * chosen's seqId, redirect to that entry, mark chosen as renamed to
     * the other entry's id, and report the rename. */
    Bucket const bucket = self->ht[hash0(chosen->seqId)];
    unsigned k;

    for (k = 0; k < bucket.count; ++k) {
        ReferenceSeq *const other = &self->refSeq[bucket.index[k]];

        if (other == chosen || other->seqId == NULL)
            continue;
        if (other->type == rst_dead || other->type == rst_unattached || other->type == rst_renamed)
            continue;
        if (strcasecmp(chosen->seqId, other->seqId) == 0) {
            void *const previousId = chosen->id;

            chosen->type = rst_renamed;
            chosen->id = string_dup(other->id, string_size(other->id));
            *wasRenamed = true;
            free(previousId);
            return other;
        }
    }
    return chosen;
}
1403
/* Search the hash bucket for a fasta entry whose header contains seqId.
 * Scoring: an exact header match beats a '|'-delimited token match
 * (NCBI-style multi-id headers); seq_len and md5 agreement add weight.
 * On success *rslt is redirected to the best-scoring entry. */
static rc_t tryFasta(ReferenceMgr *const self,
                     ReferenceSeq **const rslt,
                     char const seqId[],
                     unsigned const seq_len,
                     uint8_t const md5[16])
{
    ReferenceSeq *const seq = *rslt;    /* entry to exclude from the search */
    unsigned const hv = hash0(seqId);
    Bucket const bucket = self->ht[hv];
    unsigned best = bucket.count;       /* sentinel: "no match yet" */
    unsigned best_score = 0;
    unsigned i;

    for (i = 0; i < bucket.count; ++i) {
        unsigned score = 0;
        ReferenceSeq *const qry = &self->refSeq[bucket.index[i]];

        if (qry != seq && qry->fastaSeqId != NULL) {
            /* case-insensitive substring search in the fasta header */
            char const *const substr = strcasestr(qry->fastaSeqId, seqId);
            if (substr != NULL) {
                unsigned const at = substr - qry->fastaSeqId;
                unsigned const slen = (unsigned)string_size(seqId);
                unsigned const qlen = (unsigned)string_size(qry->fastaSeqId);
                if (at == 0 && slen == qlen)
                    score |= exact_match;
                else {
                    /* token match only if bounded by start/end or '|' */
                    bool const leftEdge = at == 0 || qry->fastaSeqId[at - 1] == '|';
                    bool const rightEdge = (at + slen) == qlen || qry->fastaSeqId[at + slen] == '|';

                    if (leftEdge && rightEdge)
                        score |= substring_match;
                }
            }
        }
        if (score == 0)
            continue;

        /* optional corroborating evidence */
        if (seq_len != 0 && qry->seq_len == seq_len)
            score |= seq_len_match;
        if (md5 != NULL && memcmp(md5, qry->md5, 16) == 0)
            score |= md5_match;
        if (score > best_score) {
            best = i;
            best_score = score;
        }
    }
    if (best != bucket.count) {
        *rslt = &self->refSeq[bucket.index[best]];
        return 0;
    }
    else {
        return RC(rcAlign, rcFile, rcConstructing, rcId, rcNotFound);
    }
}
1458
/* Resolve id to a ReferenceSeq entry.  Gathers candidates from the hash
 * table; if none, imports a fasta/RefSeq source and retries (single level
 * of recursion).  With multiple candidates, weights are refined by
 * seq_len/md5/attached bits and the clear winner is picked; ambiguity is
 * an error.  The result is recorded in the id index either way. */
static rc_t findSeq(ReferenceMgr *const self,
                    ReferenceSeq **const rslt,
                    char const id[],
                    unsigned const seq_len,
                    uint8_t const md5[16],
                    bool const allowMultiMapping,
                    bool wasRenamed[])
{
    unsigned const idLen = (unsigned)string_size(id);
    unsigned num_possible = 0;
    struct Candidate *possible = NULL;
    ReferenceSeq *chosen = NULL;
    rc_t rc = 0;

    candidates(self, &num_possible, &possible, idLen, id, seq_len, md5);
    if (num_possible == 0) {
        /* nothing was found; try a fasta file */
        bool tryAgain = false;
        rc_t const rc = tryFastaOrRefSeq(self, rslt, idLen, id, &tryAgain);

        if (rc != 0 && !tryAgain)
            return rc;

        /* a source was imported; re-run the lookup against it */
        return findSeq(self, rslt, id, seq_len, md5, allowMultiMapping, wasRenamed);
    }
    if (num_possible > 1) {
        if (seq_len != 0)
            setSeqLenBit(self, num_possible, possible, seq_len);
        if (md5 != NULL)
            setMD5Bit(self, num_possible, possible, md5);
        sortCandidateList(num_possible, possible);
        setAttachedBit(self, num_possible, possible);
        /* NOTE(review): this re-sorts all but the last (current best)
         * element -- looks intentional (keep the best in place); confirm */
        sortCandidateList(num_possible - 1, possible);
    }
    /* if there is no second best OR the best is better than the second best
     * then we have a clear choice
     */
    if (num_possible == 1 || (possible[num_possible - 1].weight & ~attached) > (possible[num_possible - 2].weight & ~attached)) {
        unsigned const index = possible[num_possible - 1].index;
        chosen = &self->refSeq[index];
    }
    free(possible);
    if (chosen == NULL) {
        /* there was no clear choice */
        return RC(rcAlign, rcFile, rcConstructing, rcId, rcAmbiguous);
    }
    if (chosen->seqId != NULL && !allowMultiMapping) {
        chosen = checkForMultiMapping(self, chosen, wasRenamed);
    }
    if (chosen->type == rst_unattached) {
        rc = ReferenceSeq_Attach(self, chosen);
        if (rc == 0 && chosen->type == rst_unattached) {
            /* still not attached; try seqId fasta */
            char const *const seqId = chosen->seqId;

            chosen->type = rst_dead;
            if (seqId)
                rc = tryFasta(self, &chosen, seqId, seq_len, md5);
            else
                rc = RC(rcAlign, rcFile, rcConstructing, rcId, rcNotFound);
        }
    }
    if (rc == 0) {
        /* fill in missing names from what was learned */
        if (chosen->id == NULL)
            chosen->id = string_dup(id, idLen);
        if (chosen->seqId == NULL && chosen->fastaSeqId != NULL)
            chosen->seqId = string_dup(chosen->fastaSeqId, string_size(chosen->fastaSeqId));
        *rslt = chosen;
    }
    if (rc)
        chosen->type = rst_dead;
    /* index the id either way so repeated lookups short-circuit */
    addToIndex(self, id, chosen);
    return rc;
}
1533
ReferenceMgr_FindSeq(ReferenceMgr const * const self,char const id[])1534 static ReferenceSeq *ReferenceMgr_FindSeq(ReferenceMgr const *const self, char const id[])
1535 {
1536 int const fnd = findId(self, id);
1537 return (fnd >= 0) ? &self->refSeq[fnd] : NULL;
1538 }
1539
/* Open (or find) the reference entry for id.  A previously-indexed entry
 * is returned directly; a renamed entry recurses under its new id and
 * reports the rename; otherwise the full search (findSeq) runs. */
static
rc_t ReferenceMgr_OpenSeq(ReferenceMgr *const self,
                          ReferenceSeq **const rslt,
                          char const id[],
                          unsigned const seq_len,
                          uint8_t const md5[16],
                          bool const allowMultiMapping,
                          bool wasRenamed[])
{
    ReferenceSeq *const obj = ReferenceMgr_FindSeq(self, id);
    if (obj) {
        assert(rslt != NULL);
        *rslt = NULL;
        if (obj->type == rst_dead)
            return RC(rcAlign, rcIndex, rcSearching, rcItem, rcInvalid);
        if (obj->type == rst_renamed) {
            bool dummy = false;

            /* follow the rename; the inner rename flag is discarded */
            *wasRenamed = true;
            return ReferenceMgr_OpenSeq(self, rslt, obj->id, seq_len, md5, allowMultiMapping, &dummy);
        }
        if (obj->type == rst_refSeqBySeqId || obj->type == rst_refSeqById) {
            /* sanity check: the RefSeq manager must hand back the same
             * object the entry is attached to.
             * NOTE(review): GetSeq here presumably bumps a reference that
             * is never released on this path -- confirm refcount semantics */
            RefSeq const *dummy = NULL;
            char const *const key = (obj->type == rst_refSeqById) ? obj->id : obj->seqId;
            rc_t const rc = RefSeqMgr_GetSeq(self->rmgr, &dummy, key, (unsigned)string_size(key));

            assert(rc == 0);
            assert(dummy == obj->u.refseq);
        }
        *rslt = obj;
        return 0;
    }
    return findSeq(self, rslt, id, seq_len, md5, allowMultiMapping, wasRenamed);
}
1574
ReferenceMgr_SetCache(ReferenceMgr const * const self,size_t cache,uint32_t num_open)1575 LIB_EXPORT rc_t CC ReferenceMgr_SetCache(ReferenceMgr const *const self, size_t cache, uint32_t num_open)
1576 {
1577 return RefSeqMgr_SetCache(self->rmgr, cache, num_open);
1578 }
1579
1580 static
OpenDataDirectory(KDirectory const ** rslt,char const path[])1581 rc_t OpenDataDirectory(KDirectory const **rslt, char const path[])
1582 {
1583 KDirectory *dir;
1584 rc_t rc = KDirectoryNativeDir(&dir);
1585
1586 if (rc == 0) {
1587 if (path) {
1588 rc = KDirectoryOpenDirRead(dir, rslt, false, "%s", path);
1589 KDirectoryRelease(dir);
1590 }
1591 else
1592 *rslt = dir;
1593 }
1594 return rc;
1595 }
1596
/* Create a ReferenceMgr: allocate the manager, set up buffers, attach the
 * database (optional), open the data directory, create the RefSeq manager
 * and load the configuration file.  On any failure everything acquired so
 * far is torn down via ReferenceMgr_Release.  *cself is set only on success. */
LIB_EXPORT rc_t CC ReferenceMgr_Make(ReferenceMgr const **cself, VDatabase *db,
                                     VDBManager const* vmgr,
                                     const uint32_t options, char const conf[], char const path[],
                                     uint32_t max_seq_len, size_t cache, uint32_t num_open)
{
    rc_t rc;
    ReferenceMgr *self;
    uint32_t wopt = 0;

    if (cself == NULL)
        return RC(rcAlign, rcIndex, rcConstructing, rcParam, rcNull);

    /* translate manager options into table-writer options */
    wopt |= (options & ewrefmgr_co_allREADs) ? ewref_co_SaveRead : 0;
    wopt |= (options & ewrefmgr_co_Coverage) ? ewref_co_Coverage : 0;

    if (max_seq_len == 0)
        max_seq_len = TableWriterRefSeq_MAX_SEQ_LEN;

    self = calloc(1, sizeof(*self));
    if (self) {
        rc = KDataBufferMakeBytes(&self->seq, max_seq_len);
        if (rc == 0) {
            /* element sizes are in bits for KDataBuffer */
            self->compress.elem_bits = sizeof(compress_buffer_t) * 8;
            self->refSeqs.elem_bits = sizeof(ReferenceSeq) * 8;

            self->options = options;
            self->cache = cache;
            self->num_open_max = num_open;
            self->max_seq_len = max_seq_len;
            if (db) VDatabaseAddRef(self->db = db);
            rc = OpenDataDirectory(&self->dir, path);
            if (rc == 0) {
                rc = RefSeqMgr_Make(&self->rmgr, vmgr, 0, cache, num_open);
                if (rc == 0) {
                    rc = ReferenceMgr_Conf(self, conf);
                    if (rc == 0) {
                        *cself = self;
                        ALIGN_DBG("conf %s, local path '%s'", conf ? conf : "", path ? path : "");
                        return 0;
                    }
                    (void)PLOGERR(klogErr, (klogErr, rc, "failed to open configuration $(file)", "file=%s/%s", path ? path : ".", conf));
                }
            }
        }
        /* failure: release whatever was acquired above */
        ReferenceMgr_Release(self, false, NULL, false, NULL);
    }
    else
        rc = RC(rcAlign, rcIndex, rcConstructing, rcMemory, rcExhausted);

    ALIGN_DBGERR(rc);

    return rc;
}
1650
#define ID_CHUNK_SZ 256

/* Singly-linked chunk of 32-bit alignment row ids. */
typedef struct TChunk32_struct {
    struct TChunk32_struct* next;
    uint32_t id[ID_CHUNK_SZ]; /*** will only work with positive ids **/
} TChunk32;

/* Chunked append-only list of 32-bit ids. */
typedef struct AlignId32List_struct {
    TChunk32* head;
    TChunk32* tail;
    uint32_t tail_qty;  /** number elements in the last chunk **/
    uint32_t chunk_qty; /** number of chunks */
} AlignId32List;
/* 64-bit id list: one AlignId32List per high 32-bit word of the id. */
typedef struct AlignIdList_struct {
    AlignId32List **sub_list;
    uint32_t sub_list_count;
} AlignIdList;
1668
static uint64_t AlignId32ListCount(AlignId32List *l)
{
    /* every chunk but the last is full; the last holds tail_qty ids */
    if (l->chunk_qty == 0)
        return 0;
    return ID_CHUNK_SZ * (l->chunk_qty - 1) + l->tail_qty;
}
1671
static uint64_t AlignIdListCount(AlignIdList *l)
{
    /* total ids across all sub-lists; tolerates NULL list and NULL slots */
    uint64_t total = 0;

    if (l != NULL) {
        uint32_t k;

        for (k = 0; k < l->sub_list_count; ++k) {
            AlignId32List *const sub = l->sub_list[k];

            if (sub != NULL)
                total += AlignId32ListCount(sub);
        }
    }
    return total;
}
/* Flatten an AlignIdList into buf (at most num_elem entries), optionally
 * sorting the result.  Sub-list 0 contributes raw 32-bit ids; sub-list j
 * contributes ids with the high word j OR'd back in.  Returns the number
 * of ids copied. */
static uint64_t AlignIdListFlatCopy(AlignIdList *l,int64_t *buf,uint64_t num_elem,bool do_sort)
{
    uint64_t res=0;
    uint32_t i,j;
    AlignId32List* cl;
    assert(l!=0);

    /* sub-list 0: ids fit in 32 bits, no high word to merge */
    if((cl = l->sub_list[0])!=NULL){
        TChunk32* head = cl->head;
        while(head != cl->tail){          /* full chunks */
            for(i=0;i<ID_CHUNK_SZ && res < num_elem;i++,res++){
                buf[res] = head->id[i];
            }
            head = head->next;
        }
        for(i=0;i<cl->tail_qty && res < num_elem;i++,res++){  /* last chunk */
            buf[res] = head->id[i];
        }
    }
    /* remaining sub-lists: reconstruct 64-bit ids with high word j */
    for(j = 1; j< l->sub_list_count && res < num_elem;j++){
        if((cl = l->sub_list[j])!=NULL){
            TChunk32* head = cl->head;
            uint64_t hi = ((uint64_t)j) << 32;
            while(head != cl->tail){
                for(i=0;i<ID_CHUNK_SZ && res < num_elem;i++,res++){
                    buf[res] = hi | head->id[i];
                }
                head = head->next;
            }
            for(i=0;i<cl->tail_qty && res < num_elem;i++,res++){
                buf[res] = hi | head->id[i];
            }
        }
    }
    if(do_sort && res > 1)
        ksort_int64_t(buf,res);
    return res;
}
1723
AlignId32ListAddId(AlignId32List * l,const uint32_t id)1724 static rc_t AlignId32ListAddId(AlignId32List *l,const uint32_t id)
1725 {
1726 if(l->tail == NULL){
1727 l->head = l->tail = malloc(sizeof(*l->tail));
1728 if(l->tail == NULL) return RC(rcAlign, rcTable, rcCommitting, rcMemory, rcExhausted);
1729 l->chunk_qty =1;
1730 l->tail_qty = 0;
1731 }
1732 if(l->tail_qty == ID_CHUNK_SZ){/** chunk is full **/
1733 l->tail->next = malloc(sizeof(*l->tail));
1734 if(l->tail == NULL) return RC(rcAlign, rcTable, rcCommitting, rcMemory, rcExhausted);
1735 l->tail = l->tail->next;
1736 l->chunk_qty ++;
1737 l->tail_qty = 0;
1738 }
1739 l->tail->id[l->tail_qty++]=id;
1740 return 0;
1741 }
1742
AlignIdListAddId(AlignIdList * l,const int64_t id)1743 static rc_t AlignIdListAddId(AlignIdList *l,const int64_t id)
1744 {
1745 uint32_t sub_id,id32;
1746 if(id < 0) return RC(rcAlign, rcTable, rcCommitting, rcId, rcOutofrange);
1747 id32 = (uint32_t) id;
1748 sub_id = id >> 32;
1749 if(sub_id >= l->sub_list_count) return RC(rcAlign, rcTable, rcCommitting, rcId, rcOutofrange);
1750 if(l->sub_list[sub_id] == NULL){
1751 l->sub_list[sub_id] = calloc(1,sizeof(AlignId32List));
1752 if(l->sub_list[sub_id] == NULL) return RC(rcAlign, rcTable, rcCommitting, rcMemory, rcExhausted);
1753 }
1754 return AlignId32ListAddId(l->sub_list[sub_id],id32);
1755 }
1756
static void AlignId32ListRelease(AlignId32List *l)
{
    /* free every chunk, then the list header; NULL is a no-op */
    if (l == NULL)
        return;
    while (l->head != l->tail) {
        TChunk32 *const dead = l->head;

        l->head = dead->next;
        free(dead);
    }
    free(l->head); /* the last (possibly NULL) chunk */
    free(l);
}
static void AlignIdListRelease(AlignIdList *l)
{
    /* release each sub-list, then the slot array, then the header */
    uint32_t k;

    if (l == NULL)
        return;
    for (k = 0; k < l->sub_list_count; ++k)
        AlignId32ListRelease(l->sub_list[k]);
    free(l->sub_list);
    free(l);
}
1780
1781
/* Per-REFERENCE-row coverage accumulator. */
typedef struct {
    AlignIdList* idlist;         /* ids of alignments touching this row */
    ReferenceSeqCoverage cover;  /* statistics written to the coverage column */
    INSDC_coord_len bin_seq_len; /* SEQ_LEN of this REFERENCE row */
} TCover;
1787
1788 static
ReferenceMgr_TCoverRelease(TCover * c)1789 void ReferenceMgr_TCoverRelease(TCover* c)
1790 {
1791 if(c){
1792 AlignIdListRelease(c->idlist);
1793 c->idlist = NULL;
1794 }
1795 }
1796 static
ReferenceMgr_TCoverSetMaxId(TCover * c,int64_t id)1797 rc_t ReferenceMgr_TCoverSetMaxId(TCover* c,int64_t id)
1798 {
1799 uint32_t sub_id;
1800 if(id < 0) return RC(rcAlign, rcTable, rcCommitting, rcId, rcOutofrange);
1801 sub_id = id >> 32;
1802 if(c->idlist == NULL){
1803 c->idlist = calloc(1,sizeof(AlignIdList));
1804 if(c->idlist==NULL) return RC(rcAlign, rcTable, rcCommitting, rcMemory, rcExhausted);
1805 c->idlist->sub_list_count = sub_id+1;
1806 c->idlist->sub_list = calloc(c->idlist->sub_list_count,sizeof(c->idlist->sub_list[0]));
1807 if(c->idlist->sub_list == NULL) return RC(rcAlign, rcTable, rcCommitting, rcMemory, rcExhausted);
1808 } else {
1809 return RC(rcAlign, rcTable, rcCommitting, rcParam, rcUnexpected);
1810 }
1811 return 0;
1812 }
1813
1814 static
CoverageGetSeqLen(ReferenceMgr const * const mgr,TCover data[],uint64_t const rows)1815 rc_t CoverageGetSeqLen(ReferenceMgr const *const mgr, TCover data[], uint64_t const rows)
1816 {
1817 TableReaderColumn acols[] =
1818 {
1819 {0, "(INSDC:coord:len)SEQ_LEN", {NULL}, 0, 0},
1820 {0, NULL, {NULL}, 0, 0}
1821 };
1822 VTable const *tbl;
1823 rc_t rc = VDatabaseOpenTableRead(mgr->db, &tbl, "REFERENCE");
1824
1825 if (rc == 0) {
1826 TableReader const *reader;
1827
1828 rc = TableReader_Make(&reader, tbl, acols, 0);
1829 if (rc == 0) {
1830 uint64_t i;
1831
1832 for (i = 0; i != rows; ++i) {
1833 rc = TableReader_ReadRow(reader, i + 1);
1834 if (rc == 0 && acols->len > 0)
1835 data[i].bin_seq_len = acols->base.coord_len[0];
1836 }
1837 TableReader_Whack(reader);
1838 }
1839 VTableRelease(tbl);
1840 }
1841 return rc;
1842 }
1843
/* Rebuild coverage for all REFERENCE rows: for each alignment table,
 * record which alignment rows touch each reference row, accumulate depth,
 * mismatch, indel and overlap statistics from the CIGAR strings, write the
 * per-table *_IDS columns, and finally write the coverage column itself. */
static
rc_t ReferenceMgr_ReCover(const ReferenceMgr* cself, uint64_t ref_rows, rc_t (*const quitting)(void))
{
    rc_t rc = 0;
    uint64_t new_rows = 0;
    const TableWriterRefCoverage* cover_writer = NULL;

    TableReaderColumn acols[] =
    {
        {0, "REF_ID", {NULL}, 0, 0},
        {0, "REF_START", {NULL}, 0, 0},
        {0, "CIGAR_LONG",{NULL}, 0, 0},
        {0, "REF_POS",{NULL}, 0, 0},
        {0, NULL, {NULL}, 0, 0}
    };
    /* aliases into the column array, re-read after every ReadRow */
    const int64_t** al_ref_id = &acols[0].base.i64;
    const INSDC_coord_zero** al_ref_start = &acols[1].base.coord0;
    const TableReaderColumn* cigar = &acols[2];
    const INSDC_coord_zero** al_ref_pos = &acols[3].base.coord0;
    /* order is important see ReferenceSeqCoverage struct */
    struct {
        const char* nm;
        const char* col;
        bool ids_only; /* EVIDENCE_INTERVAL contributes ids but no statistics */
    } tbls[] = { {"PRIMARY_ALIGNMENT", "PRIMARY_ALIGNMENT_IDS",false},
                 {"SECONDARY_ALIGNMENT", "SECONDARY_ALIGNMENT_IDS",false},
                 {"EVIDENCE_INTERVAL", "EVIDENCE_INTERVAL_IDS", true} };
    int tbls_qty=(sizeof(tbls)/sizeof(tbls[0]));
    rc_t rc1 = 0;
    int64_t rr;
    uint32_t i;
    uint8_t* hilo=NULL;  /* per-base depth counters, max_seq_len per row */
    TCover* data = NULL;

    /* allocate mem for ref_rows of reference coverage*/
    if((data = calloc(ref_rows, (sizeof(*data) + cself->max_seq_len))) == NULL) {
        rc = RC(rcAlign, rcTable, rcCommitting, rcMemory, rcExhausted);
    } else {
        /** allocation for both data and hilo was done in 1 shot ***/
        hilo = (uint8_t *)&data[ref_rows];
        rc = CoverageGetSeqLen(cself, data, ref_rows);
    }
    /* grep through tables for coverage data */
    ALIGN_R_DBG("covering REFERENCE rowid range [1:%ld]",ref_rows);
    for(i = 0; rc == 0 && i < tbls_qty; i++) { /* TABLE LOOP STARTS */
        const VTable* table = NULL;
        const TableReader* reader = NULL;
        int64_t al_from;
        uint64_t al_qty;

        ALIGN_R_DBG("covering REFERENCE with %s", tbls[i].nm);
        if((rc = VDatabaseOpenTableRead(cself->db, &table, "%s", tbls[i].nm)) != 0) {
            if(GetRCState(rc) == rcNotFound) {
                /* missing alignment tables are legal */
                ALIGN_R_DBG("table %s was not found, ignored", tbls[i].nm);
                rc = 0;
                continue;
            } else {
                break;
            }
        }
        if((rc = TableReader_Make(&reader, table, acols, cself->cache)) == 0 &&
           (rc = TableReader_IdRange(reader, &al_from, &al_qty)) == 0) {
            int64_t al_rowid;

            for(al_rowid = al_from; rc == 0 && al_rowid < al_from + al_qty; al_rowid++) {
                if((rc = TableReader_ReadRow(reader, al_rowid)) != 0) {
                    break;
                }
                rr = **al_ref_id-1; /* REF_ID is 1-based; rr indexes data[] */
                /**** Record ALIGNMENT_IDS ***/
                if( data[rr].idlist == NULL
                   && (rc = ReferenceMgr_TCoverSetMaxId(data+rr,al_from + al_qty))!=0){
                    break; /*** out-of-memory ***/
                }
                if((rc = AlignIdListAddId(data[rr].idlist,al_rowid))!=0){
                    break; /*** out-of-memory ***/
                }
                /**** Done alignment ids ***/
                if(!tbls[i].ids_only) { /*** work on statistics ***/
                    char const *c = cigar->base.str;
                    const char *c_end = c + cigar->len;
                    /* positions are linearized: row * max_seq_len + offset */
                    int64_t const global_ref_pos = rr*cself->max_seq_len + **al_ref_start; /** global_ref_start **/
                    int64_t const global_refseq_start = global_ref_pos - **al_ref_pos; /** global_ref_start of current reference **/
                    unsigned const bin_no = (unsigned)(global_ref_pos / cself->max_seq_len);
                    TCover *const bin = &data[bin_no];
                    uint8_t *const cov = &hilo[global_ref_pos];
                    int64_t ref_offset = 0;
                    int64_t max_ref_offset = 0;
                    int64_t min_ref_offset = 0;
                    int64_t j;

                    /* walk the long-form CIGAR, tracking reference extent */
                    while (rc == 0 && c < c_end) {
                        int op_len = (int)strtol(c, (char **)&c, 10);
                        int const op = *c++;

                        switch (op){
                        case 'I':/* extra bases in the read **/
                            ++bin->cover.indels;
                            /* fallthrough */
                        case 'S':/* skip in the read */
                            break;
                        case 'B':/* back up in the sequence */
                            if (ref_offset > op_len)
                                ref_offset -= op_len;
                            else
                                ref_offset = 0;
                            break;
                        case 'D': /** delete in the reference ***/
                            ++bin->cover.indels;
                            /* fallthrough */
                        case 'N': /** expected skip in the reference ***/
                            ref_offset += op_len;
                            break;
                        case 'X':
                            bin->cover.mismatches += op_len;
                            /* fallthrough */
                        case '=':
                            ref_offset += op_len;
                            break;
                        default:
                            rc = RC(rcAlign, rcTable, rcCommitting, rcData, rcUnrecognized);
                        }
                        if (min_ref_offset > ref_offset)
                            min_ref_offset = ref_offset;
                        if (max_ref_offset < ref_offset)
                            max_ref_offset = ref_offset;
                    }
                    /* bump per-base depth, saturating at UINT8_MAX */
                    for (j = min_ref_offset; j < max_ref_offset; ++j) {
                        unsigned const hl = cov[j];

                        if (hl < UINT8_MAX)
                            cov[j] = hl + 1;
                    }
                    /*** check if OVERLAPS are needed ***/
                    {
                        int64_t min_rr = (global_ref_pos + min_ref_offset)/cself->max_seq_len;
                        int64_t max_rr = (global_ref_pos + max_ref_offset)/cself->max_seq_len;

                        if(min_rr < 0) min_rr = 0;
                        if(max_rr >= ref_rows) max_rr = ref_rows -1;

                        assert(min_rr<= max_rr);

                        if(min_rr < max_rr){
                            /* the alignment spans chunk boundaries */
                            int64_t overlap_ref_pos; /** relative the beginning of the reference **/
                            uint32_t overlap_ref_len = (global_ref_pos + max_ref_offset) % cself->max_seq_len ;

                            min_rr++;
                            if (global_ref_pos + min_ref_offset > global_refseq_start) {
                                overlap_ref_pos = global_ref_pos + min_ref_offset - global_refseq_start;
                            }
                            else {
                                overlap_ref_pos = 1;
                            }
                            /* interior chunks overlap for their full length */
                            for (; min_rr < max_rr; ++min_rr) {
                                if ( data[min_rr].cover.overlap_ref_pos[i] == 0 /*** NOT SET***/
                                    || overlap_ref_pos < data[min_rr].cover.overlap_ref_pos[i])
                                {
                                    data[min_rr].cover.overlap_ref_pos[i] = (INSDC_coord_zero)overlap_ref_pos;
                                }
                                data[min_rr].cover.overlap_ref_len[i] = cself->max_seq_len; /*** in between chunks get full length of overlap **/
                            }
                            /* last chunk gets the partial overlap length */
                            if ( data[min_rr].cover.overlap_ref_pos[i] == 0
                                || overlap_ref_pos < data[min_rr].cover.overlap_ref_pos[i])
                            {
                                data[min_rr].cover.overlap_ref_pos[i] = (INSDC_coord_zero)overlap_ref_pos;
                            }
                            if (overlap_ref_len > data[min_rr].cover.overlap_ref_len[i])
                                data[min_rr].cover.overlap_ref_len[i] = overlap_ref_len;
                        }
                    }
                } /**** DONE WITH WORK ON STATISTICS ***/
                ALIGN_DBGERR(rc);
                rc = rc ? rc : quitting(); /* allow the caller to abort */
            }
            /*** HAVE TO RELEASE **/
            TableReader_Whack(reader);
            VTableRelease(table);
            /*** NOW SAVE AND RELEASE THE COLUMN ***/
            if((rc = TableWriterRefCoverage_MakeIds(&cover_writer, cself->db, tbls[i].col)) == 0) {
                for(rr=0; rc ==0 && rr < ref_rows; rr ++){
                    uint64_t num_elem = AlignIdListCount(data[rr].idlist);
                    uint64_t num_elem_copied = 0;
                    if(num_elem > 0){
#define BUF_STACK_COUNT 128 * 1024
                        int64_t buf_stack[BUF_STACK_COUNT];
                        int64_t *buf_alloc = NULL;
                        int64_t *buf = buf_stack;
                        /* spill to the heap for very dense rows */
                        if(num_elem > BUF_STACK_COUNT){
                            buf=buf_alloc=malloc(num_elem*sizeof(buf[0]));
                            if(buf_alloc == NULL)
                                rc = RC(rcAlign, rcTable, rcCommitting, rcMemory, rcExhausted);
                        }
                        if(rc == 0){
                            num_elem_copied = AlignIdListFlatCopy(data[rr].idlist,buf,num_elem,true);
                            assert(num_elem == num_elem_copied);
                        }
                        ReferenceMgr_TCoverRelease(data+rr); /** release memory ***/
                        if(rc == 0){
                            rc = TableWriterRefCoverage_WriteIds(cover_writer, rr+1, buf, (uint32_t)num_elem);
                        }
                        if(buf_alloc) free(buf_alloc);
                    } else {
                        rc = TableWriterRefCoverage_WriteIds(cover_writer, rr+1, NULL,0);
                    }
                }
                if(rc == 0){
                    rc = TableWriterRefCoverage_Whack(cover_writer, rc == 0, &new_rows);
                    if(rc == 0 && ref_rows != new_rows) {
                        rc = RC(rcAlign, rcTable, rcCommitting, rcData, rcInconsistent);
                    }
                }
                ALIGN_DBGERR(rc);
            }
        } else {
            TableReader_Whack(reader);
            VTableRelease(table);
        }
    }/* TABLE LOOP ENDS **/
    /* prep and write coverage data */
    /* NOTE(review): on an earlier failure `data` is not freed here -- confirm
     * whether the leak on the error path is intended */
    if(rc == 0) {
        uint64_t k;

        rc = TableWriterRefCoverage_MakeCoverage(&cover_writer, cself->db, 0);
        for (rr = 0, k = 0; rc == 0 && rr != ref_rows; ++rr, k += cself->max_seq_len) {
            unsigned hi = 0;
            unsigned lo = 255;

            /* min/max depth over this row's bases */
            for (i = 0; i != data[rr].bin_seq_len; ++i) {
                unsigned const depth = hilo[k + i];

                if (hi < depth) hi = depth;
                if (lo > depth) lo = depth;
            }
            data[rr].cover.high = hi;
            data[rr].cover.low = lo;
            rc = TableWriterRefCoverage_WriteCoverage(cover_writer,rr+1, &data[rr].cover);
        }
        free(data);
        rc1 = TableWriterRefCoverage_Whack(cover_writer, rc == 0, &new_rows);
        rc = rc ? rc : rc1;
        if(rc == 0 && ref_rows != new_rows) {
            rc = RC(rcAlign, rcTable, rcCommitting, rcData, rcInconsistent);
        }
    }
    ALIGN_DBGERR(rc);
    return rc;
}
2089
/* Flush and tear down a ReferenceMgr.
 * commit         - pass-through to the REFERENCE table writer: commit vs abandon
 * Rows           - if not NULL, receives the number of REFERENCE rows written
 * build_coverage - when true (and the whack succeeded, commit was requested,
 *                  and rows were written) run ReferenceMgr_ReCover to build
 *                  the coverage/id tables
 * quitting       - callback polled during the (potentially long) coverage pass
 * NOTE(review): all owned resources are released even when the writer whack
 * fails; only the coverage pass is conditional on rc == 0.
 */
LIB_EXPORT rc_t CC ReferenceMgr_Release(const ReferenceMgr *cself,
                                        const bool commit,
                                        uint64_t *const Rows,
                                        const bool build_coverage,
                                        rc_t (*const quitting)(void)
                                        )
{
    rc_t rc = 0;

    if (cself != NULL) {
        ReferenceMgr *const self = (ReferenceMgr *)cself;
        uint64_t rows = 0;
        unsigned i;

        freeHashTableEntries(self);

        /* close the REFERENCE table writer; rows = rows actually emitted */
        rc = TableWriterRef_Whack(self->writer, commit, &rows);
        if (Rows) *Rows = rows;
        KDirectoryRelease(self->dir);

        /* whack every cached sequence object before freeing their buffer */
        for (i = 0; i != self->refSeqs.elem_count; ++i)
            ReferenceSeq_Whack(&self->refSeq[i]);

        KDataBufferWhack(&self->compress);
        KDataBufferWhack(&self->seq);
        KDataBufferWhack(&self->refSeqs);

        /* coverage statistics only make sense for a committed, non-empty run */
        if (rc == 0 && build_coverage && commit && rows > 0)
            rc = ReferenceMgr_ReCover(cself, rows, quitting);
#if 0
        {
            VTable* t = NULL;

            if (VDatabaseOpenTableUpdate(self->db, &t, "SECONDARY_ALIGNMENT") == 0) {
                VTableDropColumn(t, "TMP_MISMATCH");
                VTableDropColumn(t, "TMP_HAS_MISMATCH");
            }
            VTableRelease(t);
        }
#endif
        VDatabaseRelease(self->db);
        RefSeqMgr_Release(self->rmgr);
        free(self);
    }
    return rc;
}
2136
2137 static
ReferenceSeq_ReadDirect(ReferenceSeq * self,int offset,unsigned const len,bool read_circular,uint8_t buffer[],unsigned * written,bool force_linear)2138 rc_t ReferenceSeq_ReadDirect(ReferenceSeq *self,
2139 int offset,
2140 unsigned const len,
2141 bool read_circular,
2142 uint8_t buffer[],
2143 unsigned* written,
2144 bool force_linear)
2145 {
2146 *written = 0;
2147 if (len == 0)
2148 return 0;
2149
2150 if (read_circular || self->circular) {
2151 if (offset < 0) {
2152 unsigned const n = (-offset) / self->seq_len;
2153 offset = (self->seq_len * (n + 1) + offset) % self->seq_len;
2154 }
2155 else if (offset > self->seq_len)
2156 offset %= self->seq_len;
2157 }
2158 else if (offset >= self->seq_len)
2159 return RC(rcAlign, rcType, rcReading, rcOffset, rcOutofrange);
2160
2161 if (self->type == rst_local) {
2162 uint8_t const *const src = self->u.local.buf.base;
2163 unsigned dst_off = 0;
2164
2165 while (dst_off < len) {
2166 unsigned const writable = len - dst_off;
2167 unsigned const readable = self->seq_len - offset;
2168 unsigned const to_write = readable < writable ? readable : writable;
2169
2170 memmove(&buffer[dst_off], &src[offset], to_write);
2171 *written += to_write;
2172 if (!self->circular)
2173 break;
2174 offset = 0;
2175 dst_off += to_write;
2176 }
2177 return 0;
2178 }
2179 else if (self->type == rst_refSeqById || self->type == rst_refSeqBySeqId) {
2180 unsigned to_write = len;
2181
2182 if (!self->circular || force_linear) {
2183 unsigned const readable = self->seq_len - offset;
2184
2185 if (to_write > readable)
2186 to_write = readable;
2187 }
2188 return RefSeq_Read(self->u.refseq, offset, to_write, buffer, written);
2189 }
2190 return RC(rcAlign, rcType, rcReading, rcType, rcInvalid);
2191 }
2192
2193 static
ReferenceMgr_LoadSeq(ReferenceMgr * const self,ReferenceSeq * obj)2194 rc_t ReferenceMgr_LoadSeq(ReferenceMgr *const self, ReferenceSeq *obj)
2195 {
2196 KDataBuffer readBuf;
2197 rc_t rc = KDataBufferMake(&readBuf, 8, self->max_seq_len);
2198
2199 if (rc == 0) {
2200 char const *const id = obj->id;
2201 char const *const seqId = obj->seqId ? obj->seqId : id;
2202 TableWriterRefData data;
2203 INSDC_coord_zero offset = 0;
2204
2205 obj->start_rowid = self->ref_rowid + 1;
2206 data.name.buffer = id;
2207 data.name.elements = string_size(id);
2208 data.read.buffer = readBuf.base;
2209 data.seq_id.buffer = seqId;
2210 data.seq_id.elements = string_size(seqId);
2211 data.force_READ_write = obj->type == rst_local || (self->options & ewrefmgr_co_allREADs);
2212 data.circular = obj->circular;
2213
2214 if (self->writer == NULL) {
2215 uint32_t wopt = 0;
2216
2217 wopt |= (self->options & ewrefmgr_co_allREADs) ? ewref_co_SaveRead : 0;
2218 wopt |= (self->options & ewrefmgr_co_Coverage) ? ewref_co_Coverage : 0;
2219 if((rc = TableWriterRef_Make(&self->writer, self->db, wopt)) == 0) {
2220 TableWriterData mlen;
2221
2222 mlen.buffer = &self->max_seq_len;
2223 mlen.elements = 1;
2224 rc = TableWriterRef_WriteDefaultData(self->writer, ewrefd_cn_MAX_SEQ_LEN, &mlen);
2225 }
2226 }
2227 while (rc == 0 && offset < obj->seq_len) {
2228 unsigned row_len;
2229
2230 rc = ReferenceSeq_ReadDirect(obj, offset, self->max_seq_len, false,
2231 readBuf.base, &row_len, true);
2232 if (rc != 0 || row_len == 0) break;
2233
2234 data.read.elements = row_len;
2235 rc = TableWriterRef_Write(self->writer, &data, NULL);
2236 offset += row_len;
2237 ++self->ref_rowid;
2238 }
2239 KDataBufferWhack(&readBuf);
2240 }
2241 return rc;
2242 }
2243
ReferenceMgr_GetSeq(ReferenceMgr const * const cself,ReferenceSeq const ** const seq,const char * const id,bool * const shouldUnmap,bool const allowMultiMapping,bool wasRenamed[])2244 LIB_EXPORT rc_t CC ReferenceMgr_GetSeq(ReferenceMgr const *const cself,
2245 ReferenceSeq const **const seq,
2246 const char *const id,
2247 bool *const shouldUnmap,
2248 bool const allowMultiMapping,
2249 bool wasRenamed[])
2250 {
2251 ReferenceMgr *const self = (ReferenceMgr *)cself;
2252
2253 if (self == NULL || seq == NULL || id == NULL)
2254 return RC(rcAlign, rcFile, rcConstructing, rcParam, rcNull);
2255
2256 *seq = NULL;
2257 *shouldUnmap = false;
2258 {
2259 ReferenceSeq *obj;
2260 rc_t rc = ReferenceMgr_OpenSeq(self, &obj, id, 0, NULL, allowMultiMapping, wasRenamed);
2261
2262 if (rc) return rc;
2263 if (obj->type == rst_unmapped) {
2264 *shouldUnmap = true;
2265 return 0;
2266 }
2267 if (obj->start_rowid == 0) {
2268 rc = ReferenceMgr_LoadSeq(self, obj);
2269 if (rc) return rc;
2270 }
2271 *seq = obj;
2272 }
2273 return 0;
2274 }
2275
/* Verify that the reference identified by id matches the expected length and
 * (when supplied) MD5 digest.
 * length - expected SEQ_LEN
 * md5    - expected digest, or NULL to skip the checksum comparison
 * Returns rcUnequal (rcSize or rcChecksum) on mismatch; 0 when everything
 * that could be checked agreed.
 */
LIB_EXPORT rc_t CC ReferenceMgr_Verify(ReferenceMgr const *const cself,
                                       char const id[],
                                       INSDC_coord_len const length,
                                       uint8_t const md5[16],
                                       bool const allowMultiMapping,
                                       bool wasRenamed[])
{
    if (cself == NULL || id == NULL)
        return RC(rcAlign, rcFile, rcValidating, rcParam, rcNull);
    {
        ReferenceMgr *self = (ReferenceMgr *)cself;
        ReferenceSeq *rseq;
        rc_t rc = ReferenceMgr_OpenSeq(self, &rseq, id, length, md5, allowMultiMapping, wasRenamed);

        if (rc) return rc;
        if (rseq->seq_len != length) {
            rc = RC(rcAlign, rcFile, rcValidating, rcSize, rcUnequal);
            ALIGN_DBGERRP("%s->%s SEQ_LEN verification", rc, id, rseq->seqId);
        }
        /* NOTE(review): when md5 == NULL the else branch still logs
         * "MD5 verification ok" although no digest was compared — the log
         * line is misleading but harmless (debug builds only) */
        if (md5 && memcmp(md5, rseq->md5, sizeof(rseq->md5)) != 0) {
            unsigned i;

            /* a checksum mismatch overrides any earlier SEQ_LEN rc;
             * both conditions are errors regardless of which is reported */
            rc = RC(rcAlign, rcTable, rcValidating, rcChecksum, rcUnequal);
            ALIGN_DBGERRP("%s->%s MD5 verification", rc, id, rseq->seqId);
            ALIGN_DBGF((" found '"));
            for(i = 0; i < sizeof(rseq->md5); i++) {
                ALIGN_DBGF(("%02hx", rseq->md5[i]));
            }
            ALIGN_DBGF(("' != requested '"));
            for(i = 0; i < sizeof(rseq->md5); i++) {
                ALIGN_DBGF(("%02hx", md5[i]));
            }
            ALIGN_DBGF(("'\n"));
        } else {
            ALIGN_DBG("%s->%s MD5 verification ok", id, rseq->seqId);
        }
        if(rc == 0) {
            ALIGN_DBG("%s verification ok", id);
        } else {
            ALIGN_DBGERRP("%s verification", rc, id);
        }
        return rc;
    }
}
2320
ReferenceMgr_Get1stRow(const ReferenceMgr * cself,int64_t * row_id,char const id[])2321 LIB_EXPORT rc_t CC ReferenceMgr_Get1stRow(const ReferenceMgr* cself, int64_t* row_id, char const id[])
2322 {
2323 rc_t rc = 0;
2324 ReferenceSeq *seq;
2325
2326 if (cself == NULL || row_id == NULL) {
2327 rc = RC(rcAlign, rcFile, rcReading, rcParam, rcNull);
2328 }
2329 else if ((seq = ReferenceMgr_FindSeq(cself, id)) == NULL)
2330 rc = RC(rcAlign, rcFile, rcReading, rcId, rcNotFound);
2331 else {
2332 *row_id = seq->start_rowid;
2333 }
2334 return rc;
2335 }
2336
ReferenceMgr_FastaPath(const ReferenceMgr * cself,const char * fasta_path)2337 LIB_EXPORT rc_t CC ReferenceMgr_FastaPath(const ReferenceMgr* cself, const char* fasta_path)
2338 {
2339 rc_t rc = 0;
2340 KDirectory* dir;
2341
2342 if(cself == NULL || fasta_path == NULL) {
2343 rc = RC(rcAlign, rcFile, rcConstructing, rcParam, rcNull);
2344 } else if((rc = KDirectoryNativeDir(&dir)) == 0) {
2345 const KFile* kf;
2346 if((rc = KDirectoryOpenFileRead(dir, &kf, "%s", fasta_path)) == 0) {
2347 rc = ReferenceMgr_FastaFile(cself, kf);
2348 KFileRelease(kf);
2349 }
2350 KDirectoryRelease(dir);
2351 }
2352 ALIGN_DBGERRP("from file %s", rc, fasta_path);
2353 return rc;
2354 }
2355
ReferenceMgr_FastaFile(const ReferenceMgr * cself,const KFile * file)2356 LIB_EXPORT rc_t CC ReferenceMgr_FastaFile(const ReferenceMgr* cself, const KFile* file)
2357 {
2358 if(cself == NULL || file == NULL) {
2359 return RC(rcAlign, rcFile, rcConstructing, rcParam, rcNull);
2360 }
2361 return ImportFastaFile((ReferenceMgr *)cself, file, NULL);
2362 }
2363
/* One decoded CIGAR operation.
 * length  - operation length (28 bits)
 * gentype - generalized class: gen_match_type / gen_insert_type /
 *           gen_delete_type / gen_ignore_type (see enum below)
 * type    - NCBI_align_ro_* ref-offset type for this operation
 * code    - the ASCII CIGAR letter ('M', 'I', 'D', ...)
 */
typedef struct {
    unsigned length: 28, gentype:4, type: 8, code: 8;
} cigar_bin_t;
2367
/* Expand a cleaned-up CIGAR into per-sequence-base reference offset records.
 * cigar_len/cigar - operations to expand (already normalized by the caller)
 * out_sz          - capacity of out_offset, in elements
 * out_used        - NOTE(review): not referenced anywhere in this function;
 *                   presumably kept for signature symmetry — confirm before
 *                   removing
 * out_offset      - per-base records: negative length = insertion/soft clip,
 *                   positive length = deletion/intron
 * out_seq_len     - receives the number of sequence bases consumed
 * out_ref_len     - receives the number of reference bases consumed
 * out_max_ref_len - receives the furthest reference position touched
 */
static
rc_t cigar2offset_2(unsigned const cigar_len,
                    cigar_bin_t const cigar[],
                    unsigned const out_sz,
                    unsigned const out_used,
                    compress_buffer_t out_offset[],
                    INSDC_coord_len out_seq_len[],
                    INSDC_coord_len out_ref_len[],
                    INSDC_coord_len out_max_ref_len[])
{
    unsigned i;
    INSDC_coord_len seq_len = 0;
    INSDC_coord_len ref_len = 0;
    INSDC_coord_len max_ref_len = 0;

    for (i = 0; i < cigar_len; ++i) {
        unsigned const op_len = cigar[i].length;
        char const op = cigar[i].code;
        uint8_t const type = cigar[i].type;

        switch(op) {
        case 'M':
        case '=':
        case 'X':
            /* match/mismatch consume both sequence and reference */
            seq_len += op_len;
            ref_len += op_len;
            if(max_ref_len < ref_len)
                max_ref_len = ref_len;
            break;
        case 'B':
            /* Complete Genomics CIGAR style specific:
               overlap between consecutive reads
               ex: sequence 6 bases: ACACTG, reference 2 bases: ACTG,
               cigar will be: 2M2B2M
               no need to move sequence position
            */
        case 'S':
        case 'I':
            /* insertions record a negative offset at the current sequence
               position; 'B' backs the reference position up instead of
               advancing the sequence */
            if (seq_len < out_sz) {
                out_offset[seq_len].length = -op_len;
                out_offset[seq_len].type = type;
                ALIGN_C_DBGF(("%s:%u: seq_pos: %u, ref_pos: %u, offset: %i\n", __func__, __LINE__, seq_len, ref_len, -op_len));
                if (op == 'B') ref_len -= op_len;
                else seq_len += op_len;
            }
            else
                return RC(rcAlign, rcFile, rcProcessing, rcData, rcInconsistent);
            break;
        case 'N':
        case 'D':
            /* deletions/introns record a positive offset; a delete landing
               exactly at out_sz is attached to the last sequence base */
            if (seq_len < out_sz) {
                out_offset[seq_len].length = op_len;
                out_offset[seq_len].type = type;
                ALIGN_C_DBGF(("%s:%u: seq_pos: %u, ref_pos: %u, offset: %i\n", __func__, __LINE__, seq_len, ref_len, op_len));
            }
            else {
                out_offset[seq_len-1].length = op_len;
                out_offset[seq_len-1].type = type;
                ALIGN_C_DBGF(("%s:%u: seq_pos: %u, ref_pos: %u, offset: %i\n", __func__, __LINE__, seq_len-1, ref_len, op_len));
            }
            ref_len += op_len;
            if(max_ref_len < ref_len)
                max_ref_len = ref_len;
            break;
        default:
            /* 'H'/'P' have been removed by the caller; anything else ignored */
            break;
        }
    }
    out_seq_len[0] = seq_len;
    out_ref_len[0] = ref_len;
    out_max_ref_len[0] = max_ref_len;

    ALIGN_C_DBGF(("%s:%u: SEQLEN: %u, REFLEN: %u, MAXREFLEN: %u\n", __func__, __LINE__, seq_len, ref_len, max_ref_len));

    return 0;
}
2444
/* Valid CIGAR operation letters, indexed by the 4-bit binary op code.
 * NB: sizeof(cigar_op_codes) includes the trailing NUL, so the number of
 * real codes is sizeof(cigar_op_codes) - 1 == 10. The two arrays below are
 * parallel tables and MUST stay in sync with this string. */
static char const cigar_op_codes[] = "MIDNSHP=XB";

/* NCBI ref-offset type for each op (parallel to cigar_op_codes). */
static NCBI_align_ro_type const cigar_op_types[] = {
    NCBI_align_ro_normal,           /* M */
    NCBI_align_ro_normal,           /* I */
    NCBI_align_ro_normal,           /* D */
    NCBI_align_ro_intron_unknown,   /* N */
    NCBI_align_ro_soft_clip,        /* S */
    NCBI_align_ro_normal,           /* H */
    NCBI_align_ro_normal,           /* P */
    NCBI_align_ro_normal,           /* = */
    NCBI_align_ro_normal,           /* X */
    NCBI_align_ro_complete_genomics /* B */
};

/* Generalized operation classes used by the normalization passes. */
enum {
    gen_match_type,
    gen_insert_type,
    gen_delete_type,
    gen_ignore_type
};

/* Generalized class for each op (parallel to cigar_op_codes). */
static int const cigar_op_gentypes[] = {
    gen_match_type,  /* M */
    gen_insert_type, /* I */
    gen_delete_type, /* D */
    gen_delete_type, /* N */
    gen_insert_type, /* S */
    gen_ignore_type, /* H */
    gen_ignore_type, /* P */
    gen_match_type,  /* = */
    gen_match_type,  /* X */
    gen_insert_type  /* B */
};
2479
2480 static
cigar_bin(cigar_bin_t cigar[],unsigned const cigar_len,void const * cigar_in)2481 rc_t cigar_bin(cigar_bin_t cigar[],
2482 unsigned const cigar_len,
2483 void const *cigar_in)
2484 {
2485 unsigned i;
2486 uint32_t const *const cigar_bin = cigar_in;
2487
2488 ALIGN_C_DBGF(("%s:%u: '", __func__, __LINE__));
2489 for (i = 0; i < cigar_len; ++i) {
2490 uint32_t c;
2491
2492 memmove(&c, cigar_bin + i, 4);
2493 {
2494 int const op = c & 0x0F;
2495 int const len = c >> 4;
2496
2497 if (op >= sizeof(cigar_op_codes)) {
2498 rc_t const rc = RC(rcAlign, rcFile, rcProcessing, rcData, rcUnrecognized);
2499 (void)PLOGERR(klogErr, (klogErr, rc, "Invalid or unrecognized CIGAR operation (binary code: $(opbin))", "opbin=%u", op));
2500 return rc;
2501 }
2502 ALIGN_C_DBGF(("%u%c", len, cigar_op_codes[op]));
2503 cigar[i].length = len;
2504 cigar[i].code = cigar_op_codes[op];
2505 cigar[i].type = cigar_op_types[op];
2506 cigar[i].gentype = cigar_op_gentypes[op];
2507 }
2508 }
2509 ALIGN_C_DBGF(("'[%u]\n", cigar_len));
2510 return 0;
2511 }
2512
2513 static
cigar_string(cigar_bin_t cigar[],unsigned const cigar_len,void const * cigar_in)2514 rc_t cigar_string(cigar_bin_t cigar[],
2515 unsigned const cigar_len,
2516 void const *cigar_in)
2517 {
2518 unsigned i;
2519 unsigned j;
2520 char const *const cigar_string = cigar_in;
2521
2522 ALIGN_C_DBGF(("%s:%u: '%s'[%u]\n", __func__, __LINE__, cigar_in, cigar_len));
2523 for (i = j = 0; j < cigar_len; ++j) {
2524 int len = 0;
2525
2526 for (; ;) {
2527 int const ch = cigar_string[i++];
2528
2529 if (isdigit(ch))
2530 len = (len * 10) + (ch - '0');
2531 else {
2532 int op;
2533
2534 for (op = 0; op < sizeof(cigar_op_codes); ++op) {
2535 if (ch == cigar_op_codes[op])
2536 break;
2537 }
2538 if (op == sizeof(cigar_op_codes)) {
2539 rc_t const rc = RC(rcAlign, rcFile, rcProcessing, rcData, rcUnrecognized);
2540 (void)PLOGERR(klogErr, (klogErr, rc, "Invalid or unrecognized CIGAR operation '$(opcode)'", "opcode=%c", ch));
2541 return rc;
2542 }
2543 cigar[j].length = len;
2544 cigar[j].code = cigar_op_codes[op];
2545 cigar[j].type = cigar_op_types[op];
2546 cigar[j].gentype = cigar_op_gentypes[op];
2547 break;
2548 }
2549 }
2550 }
2551 return 0;
2552 }
2553
/* Count the operations in a textual CIGAR string.
 * Returns the number of <length><op> pairs, or -1 when the string is
 * malformed (an op letter with no preceding digits, or trailing digits with
 * no op letter). The empty string counts as 0 operations. */
static int cigar_string_op_count(char const cigar[])
{
    unsigned count = 0;
    int in_length = 0;  /* currently inside a run of digits? */
    unsigned i;

    for (i = 0; cigar[i] != '\0'; ++i) {
        if (isdigit(cigar[i]))
            in_length = 1;
        else {
            if (!in_length)
                return -1;  /* op letter without a preceding length */
            ++count;
            in_length = 0;
        }
    }
    /* trailing digits mean a truncated final operation */
    return in_length ? -1 : (int)count;
}
2583
/* Compact the operation list in place, dropping every gen_ignore_type op
 * ('H' and 'P'); relative order of the remaining ops is preserved.
 * Returns the new operation count. */
static unsigned cigar_remove_ignored(unsigned opcount, cigar_bin_t cigar[])
{
    unsigned rd, wr;

    for (rd = wr = 0; rd < opcount; ++rd) {
        if (cigar[rd].gentype != gen_ignore_type) {
            if (wr != rd)
                cigar[wr] = cigar[rd];
            ++wr;
        }
    }
    return wr;
}
2599
2600 static
cigar2offset(int const options,unsigned const cigar_len,void const * in_cigar,unsigned const out_sz,unsigned const out_used,uint8_t const intron_type,compress_buffer_t out_offset[],INSDC_coord_len out_seq_len[],INSDC_coord_len out_ref_len[],INSDC_coord_len out_max_ref_len[],INSDC_coord_len out_adjust[])2601 rc_t cigar2offset(int const options,
2602 unsigned const cigar_len,
2603 void const *in_cigar,
2604 unsigned const out_sz,
2605 unsigned const out_used,
2606 uint8_t const intron_type,
2607 compress_buffer_t out_offset[],
2608 INSDC_coord_len out_seq_len[],
2609 INSDC_coord_len out_ref_len[],
2610 INSDC_coord_len out_max_ref_len[],
2611 INSDC_coord_len out_adjust[])
2612 {
2613 bool const binary = (options & ewrefmgr_cmp_Binary) ? true : false;
2614 int const maxopcount = binary ? cigar_len : cigar_string_op_count(in_cigar);
2615
2616 memset(out_offset, 0, out_used * sizeof(*out_offset));
2617
2618 if (maxopcount > 0) {
2619 cigar_bin_t scigar[256];
2620 cigar_bin_t *hcigar = NULL;
2621 cigar_bin_t *cigar = scigar;
2622
2623 if (maxopcount > sizeof(scigar)/sizeof(scigar[0])) {
2624 hcigar = malloc(maxopcount * sizeof(hcigar[0]));
2625
2626 if (hcigar == NULL) {
2627 rc_t const rc = RC(rcAlign, rcFile, rcProcessing, rcMemory, rcExhausted);
2628 (void)PLOGERR(klogErr, (klogErr, rc, "out of memory trying to allocate $(bytes) bytes for CIGAR operations", "bytes=%u", (unsigned)(maxopcount * sizeof(hcigar[0]))));
2629 return rc;
2630 }
2631 cigar = hcigar;
2632 }
2633 {
2634 rc_t const rc = (binary ? cigar_bin : cigar_string)(cigar,
2635 maxopcount,
2636 in_cigar);
2637 if (rc)
2638 return rc;
2639 }
2640 /* check for hard clipping if not accepted */
2641 if ((options & ewrefmgr_co_AcceptHardClip) == 0) {
2642 unsigned i;
2643
2644 for (i = 0; i < maxopcount; ++i) {
2645 if (cigar[i].code == 'H') {
2646 rc_t const rc = RC(rcAlign, rcFile, rcProcessing, rcData, rcNotAvailable);
2647 (void)LOGERR(klogErr, rc, "Hard clipping of sequence data is not allowed");
2648 return rc;
2649 }
2650 }
2651 }
2652 {
2653 unsigned first = 0;
2654 unsigned opcount = cigar_remove_ignored(maxopcount, cigar);
2655
2656 out_adjust[0] = 0;
2657 if ((options & ewrefmgr_cmp_Exact) == 0) {
2658 /* remove any leading delete operations */
2659 for (first = 0; first < opcount; ++first) {
2660 if (cigar[first].gentype != gen_delete_type)
2661 break;
2662 out_adjust[0] += cigar[first].length;
2663 }
2664 /* make sure any adjacent deletes and inserts are ordered so that
2665 * the delete follows the insert
2666 */
2667 {
2668 unsigned i;
2669 #if 1
2670 for (i = first; i < opcount - 1; ) {
2671 cigar_bin_t const cur = cigar[i + 0];
2672 cigar_bin_t const nxt = cigar[i + 1];
2673
2674 if (cur.gentype != gen_delete_type)
2675 ;
2676 else if (nxt.gentype == gen_delete_type) {
2677 unsigned const type = (cur.type == NCBI_align_ro_normal && nxt.type == NCBI_align_ro_normal) ? NCBI_align_ro_normal : NCBI_align_ro_intron_unknown;
2678 int const code = type == NCBI_align_ro_normal ? 'D' : 'N';
2679 unsigned const length = cur.length + nxt.length;
2680
2681 --opcount;
2682 memmove(cigar + i, cigar + i + 1, (opcount - i) * sizeof(cigar[0]));
2683
2684 cigar[i].type = type;
2685 cigar[i].code = code;
2686 cigar[i].length = length;
2687
2688 continue;
2689 }
2690 else if (nxt.gentype == gen_insert_type) {
2691 if (nxt.type == NCBI_align_ro_complete_genomics) {
2692 assert(i + 2 < opcount);
2693 cigar[i + 0] = nxt;
2694 cigar[i + 1] = cigar[i + 2];
2695 cigar[i + 2] = cur;
2696 ++i;
2697 }
2698 else {
2699 cigar[i + 0] = nxt;
2700 cigar[i + 1] = cur;
2701 }
2702 }
2703 ++i;
2704 }
2705 #else
2706 for (i = first + 1; i < opcount; ) {
2707 if (cigar[i].gentype == gen_insert_type && cigar[i-1].gentype == gen_delete_type) {
2708 cigar_bin_t const prv = cigar[i - 1];
2709 cigar_bin_t const cur = cigar[i];
2710
2711 cigar[ i] = prv;
2712 cigar[--i] = cur;
2713 if (i <= first + 1)
2714 i = first + 1;
2715 }
2716 else
2717 ++i;
2718 }
2719 #endif
2720 }
2721 /* merge adjacent delete type operations D+D -> D else becomes N */
2722 {
2723 unsigned i;
2724
2725 for (i = first + 1; i < opcount;) {
2726 if (cigar[i].gentype == gen_delete_type && cigar[i-1].gentype == gen_delete_type) {
2727 cigar[i].length += cigar[i-1].length;
2728 if (cigar[i].type == NCBI_align_ro_normal && cigar[i-1].type == NCBI_align_ro_normal) {
2729 cigar[i].type = NCBI_align_ro_normal;
2730 cigar[i].code = 'D';
2731 }
2732 else {
2733 cigar[i].type = NCBI_align_ro_intron_unknown;
2734 cigar[i].code = 'N';
2735 }
2736 memmove(cigar + i - 1, cigar + i, (opcount - i) * sizeof(cigar[0]));
2737 --opcount;
2738 }
2739 else
2740 ++i;
2741 }
2742 }
2743 }
2744 /* remove any ignored operations */
2745 {
2746 unsigned i = opcount;
2747
2748 while (i) {
2749 unsigned const oi = i;
2750 cigar_bin_t const op = cigar[--i];
2751
2752 if (op.gentype == gen_ignore_type) {
2753 memmove(cigar + i, cigar + oi, (opcount - oi) * sizeof(cigar[0]));
2754 --opcount;
2755 }
2756 }
2757 }
2758 /* make the intron the known type */
2759 {
2760 unsigned i;
2761
2762 for (i = first; i < opcount; ++i) {
2763 if (cigar[i].type == NCBI_align_ro_intron_unknown)
2764 cigar[i].type = intron_type;
2765 }
2766 }
2767 {
2768 rc_t const rc = cigar2offset_2(opcount - first,
2769 cigar + first,
2770 out_sz,
2771 out_used,
2772 out_offset,
2773 out_seq_len,
2774 out_ref_len,
2775 out_max_ref_len);
2776 if (hcigar)
2777 free(hcigar);
2778 return rc;
2779 }
2780 }
2781 {
2782 }
2783 }
2784 else {
2785 rc_t const rc = RC(rcAlign, rcFile, rcProcessing, rcData, rcUnrecognized);
2786 (void)PLOGERR(klogErr, (klogErr, rc, "Invalid CIGAR string '$(cigar)'", "cigar=%s", in_cigar));
2787 return rc;
2788 }
2789 }
2790
ReferenceSeq_TranslateOffset_int(ReferenceSeq const * const cself,INSDC_coord_zero const offset,int64_t * const ref_id,INSDC_coord_zero * const ref_start,uint64_t * const global_ref_start)2791 LIB_EXPORT rc_t CC ReferenceSeq_TranslateOffset_int(ReferenceSeq const *const cself,
2792 INSDC_coord_zero const offset,
2793 int64_t *const ref_id,
2794 INSDC_coord_zero *const ref_start,
2795 uint64_t *const global_ref_start)
2796 {
2797 if(cself == NULL)
2798 return RC(rcAlign, rcFile, rcProcessing, rcSelf, rcNull);
2799
2800 if (ref_id)
2801 *ref_id = cself->start_rowid + offset / cself->mgr->max_seq_len;
2802
2803 if (ref_start)
2804 *ref_start = offset % cself->mgr->max_seq_len;
2805
2806 if (global_ref_start)
2807 *global_ref_start = (cself->start_rowid - 1) * cself->mgr->max_seq_len + offset;
2808
2809 return 0;
2810 }
2811
ReferenceMgr_Compress(const ReferenceMgr * cself,uint32_t options,const char * id,INSDC_coord_zero offset,const char * seq,INSDC_coord_len seq_len,const void * cigar,uint32_t cigar_len,INSDC_coord_zero allele_offset,const char * allele,INSDC_coord_len allele_len,INSDC_coord_zero offset_in_allele,const void * allele_cigar,uint32_t allele_cigar_len,uint8_t const rna_orient,TableWriterAlgnData * data)2812 LIB_EXPORT rc_t CC ReferenceMgr_Compress(const ReferenceMgr* cself,
2813 uint32_t options,
2814 const char* id,
2815 INSDC_coord_zero offset,
2816 const char* seq,
2817 INSDC_coord_len seq_len,
2818 const void* cigar,
2819 uint32_t cigar_len,
2820 INSDC_coord_zero allele_offset,
2821 const char* allele,
2822 INSDC_coord_len allele_len,
2823 INSDC_coord_zero offset_in_allele,
2824 const void* allele_cigar,
2825 uint32_t allele_cigar_len,
2826 uint8_t const rna_orient,
2827 TableWriterAlgnData* data)
2828 {
2829 rc_t rc = 0;
2830 bool shouldUnmap = false;
2831 bool wasRenamed = false;
2832 const ReferenceSeq* refseq;
2833
2834 if(cself == NULL || id == NULL) {
2835 rc = RC(rcAlign, rcFile, rcProcessing, rcParam, rcNull);
2836 }
2837 else if((rc = ReferenceMgr_GetSeq(cself, &refseq, id, &shouldUnmap, false, &wasRenamed)) == 0) {
2838 assert(shouldUnmap == false);
2839 assert(wasRenamed == false);
2840 rc = ReferenceSeq_Compress(refseq, options, offset, seq, seq_len, cigar, cigar_len,
2841 allele_offset, allele, allele_len,offset_in_allele,
2842 allele_cigar, allele_cigar_len, rna_orient, data);
2843 ReferenceSeq_Release(refseq);
2844 }
2845 ALIGN_C_DBGERR(rc);
2846 return rc;
2847 }
2848
/* True when the read base agrees with the reference base:
 * '=' in the read always matches, and 'U' matches a reference 'T'
 * (RNA vs DNA spelling); otherwise the bases must be equal. */
static bool isMatchingBase(int const refBase, int const seqBase)
{
    if (seqBase == '=')
        return true;
    if (refBase == 'T' && seqBase == 'U')
        return true;
    return refBase == seqBase;
}
2853
ReferenceSeq_Compress(ReferenceSeq const * const cself,uint32_t options,INSDC_coord_zero offset,const char * seq,INSDC_coord_len seq_len,const void * cigar,uint32_t cigar_len,INSDC_coord_zero allele_offset,const char * allele,INSDC_coord_len allele_len,INSDC_coord_zero offset_in_allele,const void * allele_cigar,uint32_t allele_cigar_len,uint8_t const rna_orient,TableWriterAlgnData * data)2854 LIB_EXPORT rc_t CC ReferenceSeq_Compress(ReferenceSeq const *const cself,
2855 uint32_t options,
2856 INSDC_coord_zero offset,
2857 const char* seq, INSDC_coord_len seq_len,
2858 const void* cigar, uint32_t cigar_len,
2859 INSDC_coord_zero allele_offset, const char* allele,
2860 INSDC_coord_len allele_len,
2861 INSDC_coord_zero offset_in_allele,
2862 const void* allele_cigar, uint32_t allele_cigar_len,
2863 uint8_t const rna_orient,
2864 TableWriterAlgnData* data)
2865 {
2866 rc_t rc = 0;
2867 ReferenceSeq *const self = (ReferenceSeq *)cself;
2868
2869 if (self == NULL || seq == NULL || cigar == NULL || cigar_len == 0 || data == NULL ||
2870 (!(allele == NULL && allele_len == 0 && allele_cigar == NULL && allele_cigar_len == 0) &&
2871 !(allele != NULL && allele_cigar != NULL && allele_cigar_len != 0)))
2872 {
2873 return RC(rcAlign, rcFile, rcProcessing, rcParam, rcInvalid);
2874 }
2875
2876 if (seq_len > self->mgr->compress.elem_count) {
2877 rc = KDataBufferResize(&self->mgr->compress, seq_len);
2878 if (rc) return rc;
2879 }
2880 {
2881 INSDC_coord_len i, seq_pos = 0, allele_ref_end = 0, ref_len = 0, rl = 0, max_rl = 0;
2882 INSDC_coord_zero* read_start = &((INSDC_coord_zero*)(data->read_start.buffer))[data->ploidy];
2883 INSDC_coord_len* read_len = &((INSDC_coord_len*)(data->read_len.buffer))[data->ploidy];
2884 bool* has_ref_offset, *has_mismatch;
2885 int32_t *const ref_offset = (int32_t *)data->ref_offset.buffer;
2886 uint8_t *const ref_offset_type = (uint8_t *)data->ref_offset_type.buffer;
2887 uint8_t *mismatch;
2888 uint8_t sref_buf[64 * 1024];
2889 void *href_buf = NULL;
2890 uint8_t *ref_buf = sref_buf;
2891 compress_buffer_t allele_off_buf[1024];
2892 INSDC_coord_len position_adjust = 0;
2893 #if _DEBUGGING
2894 uint64_t i_ref_offset_elements, i_mismatch_elements;
2895 char x[4096];
2896 #endif
2897
2898 if(data->ploidy == 0) {
2899 data->has_ref_offset.elements = seq_len;
2900 data->ref_offset.elements = 0;
2901 data->has_mismatch.elements = seq_len;
2902 data->mismatch.elements = 0;
2903 *read_start = 0;
2904 }
2905 else {
2906 data->has_ref_offset.elements += seq_len;
2907 data->has_mismatch.elements += seq_len;
2908 *read_start = read_start[-1] + read_len[-1];
2909 }
2910 *read_len = seq_len;
2911 has_ref_offset = &((bool*)(data->has_ref_offset.buffer))[*read_start];
2912 has_mismatch = &((bool*)(data->has_mismatch.buffer))[*read_start];
2913 mismatch = (uint8_t*)(data->mismatch.buffer);
2914
2915 #if _DEBUGGING
2916 i_ref_offset_elements = data->ref_offset.elements;
2917 i_mismatch_elements = data->mismatch.elements;
2918 ALIGN_C_DBG("align%s '%.*s'[%u] to '%s:%s' at %i", (options & ewrefmgr_cmp_Exact) ? " EXACT" : "",
2919 seq_len, seq, seq_len, cself->id, cself->seqId, offset);
2920 #endif
2921 if(allele != NULL) {
2922 /* determine length of reference for subst by allele */
2923 ALIGN_C_DBG("apply allele %.*s[%u] at %i w/cigar below",
2924 allele_len, allele, allele_len, allele_offset);
2925 rc = cigar2offset(options|ewrefmgr_cmp_Exact,
2926 allele_cigar_len,
2927 allele_cigar,
2928 sizeof(allele_off_buf) / sizeof(*allele_off_buf),
2929 allele_len,
2930 ' ',
2931 allele_off_buf,
2932 &seq_pos,
2933 &allele_ref_end,
2934 &max_rl,
2935 &position_adjust);
2936 /* where allele ends on reference */
2937 allele_ref_end += allele_offset;
2938 }
2939 if(rc == 0) {
2940 rc = cigar2offset(options,
2941 cigar_len,
2942 cigar,
2943 (unsigned)self->mgr->compress.elem_count,
2944 seq_len,
2945 rna_orient,
2946 self->mgr->compress.base,
2947 &seq_pos,
2948 &rl,
2949 &max_rl,
2950 &position_adjust);
2951 offset += position_adjust;
2952 }
2953 if (allele != NULL) {
2954 if (allele_ref_end < offset || offset + rl < allele_offset) {
2955 /* allele and alignment don't intersect */
2956 /* TODO: [VDB-1585] try to make this recoverable */
2957 rc = RC(rcAlign, rcFile, rcProcessing, rcData, rcInvalid);
2958 (void)PLOGERR(klogWarn, (klogWarn, rc,
2959 "allele $(a) offset $(ao) $(ac) is not within referenced region in $(id) at offset $(ro) $(rc)",
2960 "a=%.*s,ao=%i,ac=%.*s,id=%s,ro=%i,rc=%.*s",
2961 allele_len, allele, allele_offset, (options & ewrefmgr_cmp_Binary) ? 0 : allele_cigar_len, allele_cigar,
2962 self->seqId, offset, (options & ewrefmgr_cmp_Binary) ? 0 : cigar_len, cigar));
2963 }
2964 }
2965 if(rc == 0) {
2966 ref_len = rl;
2967 if((offset + max_rl) > self->seq_len && !self->circular) {
2968 max_rl = self->seq_len - offset;
2969 if(max_rl < rl) {
2970 /* ref_len used for compression cannot be shorter than ref_len derived from cigar,
2971 if there is a shortage it will fail later here */
2972 max_rl = rl;
2973 }
2974 ALIGN_C_DBG("max_ref_len truncated to %u cause it goes beyond refseq length %lu at offset %i",
2975 max_rl, cself->seq_len, offset);
2976 }
2977 ALIGN_C_DBG("chosen REF_LEN %u, ref len for match %u", ref_len, max_rl);
2978
2979 if (seq_len != seq_pos) {
2980 rc = RC(rcAlign, rcFile, rcProcessing, rcData, rcInvalid);
2981 }
2982 if (rc == 0) {
2983 if (max_rl > sizeof(sref_buf)) {
2984 if (href_buf)
2985 free(href_buf);
2986 href_buf = malloc(max_rl);
2987 if (href_buf == NULL)
2988 rc = RC(rcAlign, rcFile, rcProcessing, rcMemory, rcExhausted);
2989 ref_buf = href_buf;
2990 }
2991 }
2992 if (rc == 0) {
2993 if(allele != NULL) {
2994 /* subst allele in reference */
2995 if(allele_offset <= offset) {
2996 /* move allele start inside referenced chunk */
2997 allele += offset_in_allele;
2998 allele_len -= offset_in_allele;
2999 rl = 0;
3000 } else {
3001 /* fetch portion of reference which comes before allele */
3002 rl = allele_offset - offset;
3003 rc = ReferenceSeq_ReadDirect(self, offset, rl, true, ref_buf, &i, false);
3004 if(rc == 0 && rl != i) {
3005 /* here we need to test it otherwise excessive portion of allele could be fetch in next if */
3006 rc = RC(rcAlign, rcFile, rcProcessing, rcRange, rcExcessive);
3007 }
3008 }
3009 if(rc == 0 && allele_len < (max_rl - rl)) {
3010 memmove(&ref_buf[rl], allele, allele_len);
3011 rl += allele_len;
3012 /* append tail of actual reference */
3013 rc = ReferenceSeq_ReadDirect(self, allele_ref_end, max_rl - rl, true, &ref_buf[rl], &i, false);
3014 rl += i;
3015 } else if(rc == 0) {
3016 /* allele is longer than needed */
3017 memmove(&ref_buf[rl], allele, max_rl - rl);
3018 rl = max_rl;
3019 }
3020 }
3021 else {
3022 rc = ReferenceSeq_ReadDirect(self, offset, max_rl, true, ref_buf, &rl, false);
3023 }
3024 if (rc != 0 || max_rl != rl) {
3025 rc = rc ? rc : RC(rcAlign, rcFile, rcProcessing, rcRange, rcExcessive);
3026 ALIGN_C_DBGERRP("refseq is shorter: at offset %i need %u bases", rc, offset, max_rl);
3027 }
3028 else {
3029 compress_buffer_t *const compress_buf = self->mgr->compress.base;
3030 unsigned ro = (unsigned)data->ref_offset.elements;
3031 int ref_pos;
3032
3033 for (seq_pos = 0, ref_pos = 0; seq_pos < seq_len; seq_pos++, ref_pos++) {
3034 int const length = compress_buf[seq_pos].length;
3035 int const type = compress_buf[seq_pos].type;
3036
3037 #if 0
3038 ALIGN_C_DBG("seq_pos: %u, ref_pos: %i, offset: %i, type: %i, ro: %u", seq_pos, ref_pos, length, type, ro);
3039 #endif
3040 if (length == 0 && type == 0)
3041 has_ref_offset[seq_pos] = 0;
3042 else {
3043 has_ref_offset[seq_pos] = 1;
3044 ref_offset[ro] = length;
3045 ref_offset_type[ro] = type;
3046 ref_pos += length;
3047 ++ro;
3048 }
3049 if (ref_pos < 0 || ref_pos >= max_rl
3050 || !isMatchingBase(toupper(ref_buf[ref_pos]), toupper(seq[seq_pos])))
3051 {
3052 has_mismatch[seq_pos] = 1;
3053 mismatch[data->mismatch.elements++] = seq[seq_pos];
3054 }
3055 else {
3056 has_mismatch[seq_pos] = 0;
3057 }
3058 }
3059 data->ref_offset.elements = data->ref_offset_type.elements = ro;
3060 }
3061 }
3062 }
3063 #if _DEBUGGING
3064 if(rc == 0) {
3065 int32_t j;
3066 memset(x, '-', sizeof(x) - 1);
3067 x[sizeof(x) - 2] = '\0';
3068
3069 ALIGN_C_DBG("ref: %.*s [%u]", max_rl, ref_buf, max_rl);
3070 ALIGN_C_DBGF(("%s:%u: ref: ", __func__, __LINE__));
3071 for(seq_pos = 0, j = 0, rl = 0, i = 0; seq_pos < seq_len; seq_pos++, j++) {
3072 if(has_ref_offset[seq_pos]) {
3073 if(ref_offset[i_ref_offset_elements + rl] > 0) {
3074 ALIGN_C_DBGF(("%.*s", (uint32_t)(ref_offset[i_ref_offset_elements + rl]), &ref_buf[j]));
3075 } else {
3076 i = -ref_offset[i_ref_offset_elements + rl];
3077 }
3078 j += ref_offset[i_ref_offset_elements + rl];
3079 rl++;
3080 }
3081 ALIGN_C_DBGF(("%c", (j < 0 || j >= max_rl) ? '-' : (i > 0) ? tolower(ref_buf[j]) : ref_buf[j]));
3082 if(i > 0 ) {
3083 i--;
3084 }
3085 }
3086 ALIGN_C_DBGF(("\n%s:%u: seq: ", __func__, __LINE__));
3087 for(i = 0, j = 0; i < seq_len; i++) {
3088 if(has_ref_offset[i] && ref_offset[i_ref_offset_elements + j++] > 0) {
3089 ALIGN_C_DBGF(("%.*s", ref_offset[i_ref_offset_elements + j - 1], x));
3090 }
3091 ALIGN_C_DBGF(("%c", seq[i]));
3092 }
3093 ALIGN_C_DBGF((" [%u]\n", seq_len));
3094 ALIGN_C_DBGF(("%s:%u: hro: ", __func__, __LINE__));
3095 for(i = 0, j = 0; i < seq_len; i++) {
3096 if(has_ref_offset[i] && ref_offset[i_ref_offset_elements + j++] > 0) {
3097 ALIGN_C_DBGF(("%.*s", ref_offset[i_ref_offset_elements + j - 1], x));
3098 }
3099 ALIGN_C_DBGF(("%c", has_ref_offset[i] + '0'));
3100 }
3101 ALIGN_C_DBGF((", ro:"));
3102 for(i = i_ref_offset_elements; i < data->ref_offset.elements; i++) {
3103 ALIGN_C_DBGF((" %i,", ref_offset[i]));
3104 }
3105 ALIGN_C_DBGF(("[%u]\n", data->ref_offset.elements - i_ref_offset_elements));
3106 ALIGN_C_DBGF(("%s:%u: hmm: ", __func__, __LINE__));
3107 for(i = 0, j = 0; i < seq_len; i++) {
3108 if(has_ref_offset[i] && ref_offset[i_ref_offset_elements + j++] > 0) {
3109 ALIGN_C_DBGF(("%.*s", ref_offset[i_ref_offset_elements + j - 1], x));
3110 }
3111 ALIGN_C_DBGF(("%c", has_mismatch[i] + '0'));
3112 }
3113 ALIGN_C_DBGF((", mm: '%.*s'[%u]\n", (int)(data->mismatch.elements - i_mismatch_elements),
3114 &mismatch[i_mismatch_elements], data->mismatch.elements - i_mismatch_elements));
3115 }
3116 #endif
3117 if(rc == 0) {
3118 if(data->ploidy == 0) {
3119 int64_t *const ref_id = (int64_t *)data->ref_id.buffer;
3120 INSDC_coord_zero *const ref_start = (INSDC_coord_zero *)data->ref_start.buffer;
3121 uint64_t *const global_ref_start = (uint64_t *)data->global_ref_start.buffer;
3122
3123 data->ref_1st_row_id = self->start_rowid;
3124 data->effective_offset = offset;
3125 data->ref_len = ref_len;
3126 ALIGN_C_DBGF(("%s:%u: reference 1st ROW_ID %li OFFSET %i REF_LEN %u",
3127 __func__, __LINE__, data->ref_1st_row_id, data->effective_offset, data->ref_len));
3128
3129 ReferenceSeq_TranslateOffset_int(self, offset, ref_id, ref_start, global_ref_start);
3130
3131 if (ref_id) {
3132 data->ref_id.elements = 1;
3133 ALIGN_C_DBGF((" REF_ID %li", ref_id[0]));
3134 }
3135 if (ref_start) {
3136 data->ref_start.elements = 1;
3137 ALIGN_C_DBGF((" REF_START %i", ref_start[0]));
3138 }
3139 if (global_ref_start) {
3140 data->global_ref_start.elements = 1;
3141 ALIGN_C_DBGF((" GLOBAL_REF_START %lu", global_ref_start[0]));
3142 }
3143 ALIGN_C_DBGF(("\n"));
3144 } else {
3145 if(data->ref_1st_row_id != self->start_rowid || data->effective_offset != offset) {
3146 rc = RC(rcAlign, rcFile, rcProcessing, rcData, rcInconsistent);
3147 (void)PLOGERR(klogErr, (klogErr, rc,
3148 "all reads in alignment record must align to same refseq at same location $(r1)@$(o1) <> $(r2):$(a2)@$(o2)",
3149 "r1=%li,o1=%i,r2=%s,a2=%s,o2=%i", data->ref_1st_row_id, data->effective_offset, self->id, self->seqId, offset));
3150 } else if(data->ref_len != ref_len) {
3151 rc = RC(rcAlign, rcFile, rcProcessing, rcData, rcInconsistent);
3152 (void)PLOGERR(klogErr, (klogErr, rc,
3153 "all reads in alignment record must have same size projection on refseq $(rl1) <> $(rl2) $(r):$(a)@$(o)",
3154 "rl1=%u,rl2=%u,r=%s,a=%s,o=%i", data->ref_len, ref_len, self->id, self->seqId, offset));
3155 }
3156 }
3157 }
3158 if(rc == 0) {
3159 data->ploidy++;
3160 data->read_start.elements = data->ploidy;
3161 data->read_len.elements = data->ploidy;
3162 }
3163 if (href_buf)
3164 free(href_buf);
3165 }
3166 ALIGN_C_DBGERR(rc);
3167 return rc;
3168 }
3169
ReferenceSeq_Read(const ReferenceSeq * cself,INSDC_coord_zero offset,INSDC_coord_len len,uint8_t * buffer,INSDC_coord_len * ref_len)3170 LIB_EXPORT rc_t CC ReferenceSeq_Read(const ReferenceSeq* cself, INSDC_coord_zero offset, INSDC_coord_len len,
3171 uint8_t* buffer, INSDC_coord_len* ref_len)
3172 {
3173 rc_t rc = 0;
3174
3175 if(cself == NULL || buffer == NULL || ref_len == NULL) {
3176 rc = RC(rcAlign, rcFile, rcReading, rcParam, rcNull);
3177 } else {
3178 rc = ReferenceSeq_ReadDirect((ReferenceSeq*)cself, offset, len, true, buffer, ref_len, false);
3179 }
3180 ALIGN_DBGERR(rc);
3181 return rc;
3182 }
3183
ReferenceSeq_Get1stRow(const ReferenceSeq * cself,int64_t * row_id)3184 LIB_EXPORT rc_t CC ReferenceSeq_Get1stRow(const ReferenceSeq* cself, int64_t* row_id)
3185 {
3186 rc_t rc = 0;
3187
3188 if(cself == NULL || row_id == NULL) {
3189 rc = RC(rcAlign, rcFile, rcReading, rcParam, rcNull);
3190 } else {
3191 *row_id = cself->start_rowid;
3192 }
3193 return rc;
3194 }
3195
ReferenceSeq_GetID(ReferenceSeq const * const self,char const ** const rslt)3196 LIB_EXPORT rc_t CC ReferenceSeq_GetID(ReferenceSeq const *const self, char const **const rslt)
3197 {
3198 assert(self != NULL);
3199 assert(rslt != NULL);
3200 *rslt = self->id;
3201 return 0;
3202 }
3203
ReferenceSeq_AddCoverage(const ReferenceSeq * cself,INSDC_coord_zero offset,const ReferenceSeqCoverage * data)3204 LIB_EXPORT rc_t CC ReferenceSeq_AddCoverage(const ReferenceSeq* cself, INSDC_coord_zero offset, const ReferenceSeqCoverage* data)
3205 {
3206 rc_t rc = 0;
3207
3208 if(cself == NULL || data == NULL) {
3209 rc = RC(rcAlign, rcFile, rcReading, rcParam, rcNull);
3210 } else if(!(cself->mgr->options & ewrefmgr_co_Coverage)) {
3211 rc = RC(rcAlign, rcType, rcWriting, rcData, rcUnexpected);
3212 ALIGN_R_DBGERRP("coverage %s", rc, "data");
3213 } else if((rc = ReferenceSeq_ReOffset(cself->circular, cself->seq_len, &offset)) == 0) {
3214 rc = TableWriterRef_WriteCoverage(cself->mgr->writer, cself->start_rowid, offset, data);
3215 }
3216 ALIGN_DBGERR(rc);
3217 return rc;
3218 }
3219
ReferenceSeq_Release(const ReferenceSeq * cself)3220 LIB_EXPORT rc_t CC ReferenceSeq_Release(const ReferenceSeq *cself)
3221 {
3222 return 0;
3223 }
3224