1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 /* #include "bam-load.vers.h" */
28
29 #include <klib/callconv.h>
30 #include <klib/data-buffer.h>
31 #include <klib/text.h>
32 #include <klib/log.h>
33 #include <klib/out.h>
34 #include <klib/status.h>
35 #include <klib/rc.h>
36 #include <klib/sort.h>
37 #include <klib/printf.h>
38
39 #include <kfs/directory.h>
40 #include <kfs/file.h>
41 #include <kdb/btree.h>
42 #include <kdb/manager.h>
43 #include <kdb/database.h>
44 #include <kdb/table.h>
45 #include <kdb/meta.h>
46
47 #include <vdb/manager.h>
48 #include <vdb/schema.h>
49 #include <vdb/database.h>
50 #include <vdb/table.h>
51 #include <vdb/cursor.h>
52 #include <vdb/vdb-priv.h>
53
54 #include <insdc/insdc.h>
55 #include <insdc/sra.h>
56 #include <align/dna-reverse-cmpl.h>
57 #include <align/align.h>
58
59 #include <kapp/main.h>
60 #include <kapp/args.h>
61 #include <kapp/loader-file.h>
62 #include <kapp/loader-meta.h>
63 #include <kapp/log-xml.h>
64 #include <kapp/progressbar.h>
65
66 #include <kproc/queue.h>
67 #include <kproc/thread.h>
68 #include <kproc/timeout.h>
69 #include <os-native.h>
70
71 #include <sysalloc.h>
72 #include <atomic32.h>
73
74 #include <sys/stat.h>
75 #include <unistd.h>
76 #include <fcntl.h>
77 #include <sys/mman.h>
78 #include <stdlib.h>
79 #include <stdio.h>
80 #include <string.h>
81 #include <ctype.h>
82 #include <assert.h>
83 #include <limits.h>
84 #include <time.h>
85 #include <zlib.h>
86 #include "bam.h"
87 #include "bam-alignment.h"
88 #include "Globals.h"
89 #include "sequence-writer.h"
90 #include "reference-writer.h"
91 #include "alignment-writer.h"
92 #include "mem-bank.h"
93 #include "low-match-count.h"
94
95 #define NUM_ID_SPACES (256u)
96
97 #define MMA_NUM_CHUNKS_BITS (20u)
98 #define MMA_NUM_SUBCHUNKS_BITS ((32u)-(MMA_NUM_CHUNKS_BITS))
99 #define MMA_SUBCHUNK_SIZE (1u << MMA_NUM_CHUNKS_BITS)
100 #define MMA_SUBCHUNK_COUNT (1u << MMA_NUM_SUBCHUNKS_BITS)
101
102 typedef struct {
103 int fd;
104 size_t elemSize;
105 off_t fsize;
106 uint8_t *current;
107 struct mma_map_s {
108 struct mma_submap_s {
109 uint8_t *base;
110 } submap[MMA_SUBCHUNK_COUNT];
111 } map[NUM_ID_SPACES];
112 } MMArray;
113
114 typedef struct {
115 uint32_t primaryId[2];
116 uint32_t spotId;
117 uint32_t fragmentId;
118 uint8_t fragment_len[2]; /*** lowest byte of fragment length to prevent different sizes of primary and secondary alignments **/
119 uint8_t platform;
120 uint8_t pId_ext[2];
121 uint8_t spotId_ext;
122 uint8_t alignmentCount[2]; /* 0..254; 254: saturated max; 255: special meaning "too many" */
123 uint8_t unmated: 1,
124 pcr_dup: 1,
125 unaligned_1: 1,
126 unaligned_2: 1,
127 hardclipped: 1,
128 primary_is_set: 1;
129 } ctx_value_t;
130
131 #define CTX_VALUE_SET_P_ID(O,N,V) do { int64_t tv = (V); (O).primaryId[N] = (uint32_t)tv; (O).pId_ext[N] = tv >> 32; } while(0);
132 #define CTX_VALUE_GET_P_ID(O,N) ((((int64_t)((O).pId_ext[N])) << 32) | (O).primaryId[N])
133
134 #define CTX_VALUE_SET_S_ID(O,V) do { int64_t tv = (V); (O).spotId = (uint32_t)tv; (O).spotId_ext = tv >> 32; } while(0);
135 #define CTX_VALUE_GET_S_ID(O) ((((int64_t)(O).spotId_ext) << 32) | (O).spotId)
136
137 typedef struct FragmentInfo {
138 uint64_t ti;
139 uint32_t readlen;
140 uint8_t aligned;
141 uint8_t is_bad;
142 uint8_t orientation;
143 uint8_t readNo;
144 uint8_t sglen;
145 uint8_t lglen;
146 uint8_t cskey;
147 } FragmentInfo;
148
149 typedef struct KeyToID {
150 KBTree *key2id[NUM_ID_SPACES];
151 char *key2id_names;
152
153 uint32_t idCount[NUM_ID_SPACES];
154 uint32_t key2id_hash[NUM_ID_SPACES];
155
156 unsigned key2id_max;
157 unsigned key2id_name_max;
158 unsigned key2id_name_alloc;
159 unsigned key2id_count;
160
161 unsigned key2id_name[NUM_ID_SPACES];
162 /* this array is kept in name order */
163 /* this maps the names to key2id and idCount */
164 unsigned key2id_oid[NUM_ID_SPACES];
165 } KeyToID;
166
167 typedef struct context_t {
168 KeyToID keyToID;
169 const KLoadProgressbar *progress[4];
170 MMArray *id2value;
171 MemBank *frags;
172 int64_t spotId;
173 int64_t primaryId;
174 int64_t secondId;
175 uint64_t alignCount;
176
177 unsigned pass;
178 bool isColorSpace;
179 } context_t;
180
181 #if 0
182 static char const *Print_ctx_value_t(ctx_value_t const *const self)
183 {
184 static char buffer[16384];
185 rc_t rc = string_printf(buffer, sizeof(buffer), NULL, "pid: { %lu, %lu }, sid: %lu, fid: %u, alc: { %u, %u }, flg: %x", CTX_VALUE_GET_P_ID(*self, 0), CTX_VALUE_GET_P_ID(*self, 1), CTX_VALUE_GET_S_ID(*self), self->fragmentId, self->alignmentCount[0], self->alignmentCount[1], *(self->alignmentCount + sizeof(self->alignmentCount)/sizeof(self->alignmentCount[0])));
186
187 if (rc)
188 return 0;
189 return buffer;
190 }
191 #endif
192
MMArrayMake(MMArray ** rslt,int fd,uint32_t elemSize)193 static rc_t MMArrayMake(MMArray **rslt, int fd, uint32_t elemSize)
194 {
195 MMArray *const self = calloc(1, sizeof(*self));
196
197 if (self == NULL)
198 return RC(rcExe, rcMemMap, rcConstructing, rcMemory, rcExhausted);
199 self->elemSize = (elemSize + 3) & ~(3u); /** align to 4 byte **/
200 self->fd = fd;
201 *rslt = self;
202 return 0;
203 }
204
205 #define PERF 0
206 #define PROT 0
207
MMArrayGet(MMArray * const self,void ** const value,uint64_t const element)208 static rc_t MMArrayGet(MMArray *const self, void **const value, uint64_t const element)
209 {
210 size_t const chunk = MMA_SUBCHUNK_SIZE * self->elemSize;
211 unsigned const bin_no = element >> 32;
212 unsigned const subbin = ((uint32_t)element) >> MMA_NUM_CHUNKS_BITS;
213 unsigned const in_bin = (uint32_t)element & (MMA_SUBCHUNK_SIZE - 1);
214
215 if (bin_no >= sizeof(self->map)/sizeof(self->map[0]))
216 return RC(rcExe, rcMemMap, rcConstructing, rcId, rcExcessive);
217
218 if (self->map[bin_no].submap[subbin].base == NULL) {
219 off_t const cur_fsize = self->fsize;
220 off_t const new_fsize = cur_fsize + chunk;
221
222 if (ftruncate(self->fd, new_fsize) != 0)
223 return RC(rcExe, rcFile, rcResizing, rcSize, rcExcessive);
224 else {
225 void *const base = mmap(NULL, chunk, PROT_READ|PROT_WRITE,
226 MAP_FILE|MAP_SHARED, self->fd, cur_fsize);
227
228 self->fsize = new_fsize;
229 if (base == MAP_FAILED) {
230 PLOGMSG(klogErr, (klogErr, "Failed to construct map for bin $(bin), subbin $(subbin)", "bin=%u,subbin=%u", bin_no, subbin));
231 return RC(rcExe, rcMemMap, rcConstructing, rcMemory, rcExhausted);
232 }
233 else {
234 #if PERF
235 static unsigned mapcount = 0;
236
237 (void)PLOGMSG(klogInfo, (klogInfo, "Number of mmaps: $(cnt)", "cnt=%u", ++mapcount));
238 #endif
239 self->map[bin_no].submap[subbin].base = base;
240 }
241 }
242 }
243 uint8_t *const next = self->map[bin_no].submap[subbin].base;
244 #if PROT
245 if (next != self->current) {
246 void *const current = self->current;
247
248 if (current)
249 mprotect(current, chunk, PROT_NONE);
250
251 mprotect(self->current = next, chunk, PROT_READ|PROT_WRITE);
252 }
253 #endif
254 *value = &next[(size_t)in_bin * self->elemSize];
255 return 0;
256 }
257
258 #if 0
259 static rc_t MMArrayGetRead(MMArray *const self, void const **const value, uint64_t const element)
260 {
261 unsigned const bin_no = element >> 32;
262 unsigned const subbin = ((uint32_t)element) >> MMA_NUM_CHUNKS_BITS;
263 unsigned const in_bin = (uint32_t)element & (MMA_SUBCHUNK_SIZE - 1);
264
265 if (bin_no >= sizeof(self->map)/sizeof(self->map[0]))
266 return RC(rcExe, rcMemMap, rcConstructing, rcId, rcExcessive);
267
268 if (self->map[bin_no].submap[subbin].base == NULL)
269 return RC(rcExe, rcMemMap, rcReading, rcId, rcInvalid);
270
271 uint8_t *const next = self->map[bin_no].submap[subbin].base;
272 #if PROT
273 size_t const chunk = MMA_SUBCHUNK_SIZE * self->elemSize;
274 if (next != self->current) {
275 void *const current = self->current;
276
277 if (current)
278 mprotect(current, chunk, PROT_NONE);
279
280 mprotect(self->current = next, chunk, PROT_READ);
281 }
282 #endif
283 *value = &next[(size_t)in_bin * self->elemSize];
284 return 0;
285 }
286 #endif
287
MMArrayLock(MMArray * const self)288 static void MMArrayLock(MMArray *const self)
289 {
290 #if PROT
291 size_t const chunk = MMA_SUBCHUNK_SIZE * self->elemSize;
292 void *const current = self->current;
293
294 self->current = NULL;
295 if (current)
296 mprotect(current, chunk, PROT_NONE);
297 #endif
298 }
299
MMArrayClear(MMArray * self)300 static void MMArrayClear(MMArray *self)
301 {
302 size_t const chunk = MMA_SUBCHUNK_SIZE * self->elemSize;
303 unsigned i;
304
305 for (i = 0; i != sizeof(self->map)/sizeof(self->map[0]); ++i) {
306 unsigned j;
307
308 for (j = 0; j != sizeof(self->map[0].submap)/sizeof(self->map[0].submap[0]); ++j) {
309 if (self->map[i].submap[j].base) {
310 #if PROT
311 mprotect(self->map[i].submap[j].base, chunk, PROT_READ|PROT_WRITE);
312 #endif
313 memset(self->map[i].submap[j].base, 0, chunk);
314 #if PROT
315 mprotect(self->map[i].submap[j].base, chunk, PROT_NONE);
316 #endif
317 }
318 }
319 }
320 #if PROT
321 self->current = NULL;
322 #endif
323 }
324
MMArrayWhack(MMArray * self)325 static void MMArrayWhack(MMArray *self)
326 {
327 size_t const chunk = MMA_SUBCHUNK_SIZE * self->elemSize;
328 unsigned i;
329
330 for (i = 0; i != sizeof(self->map)/sizeof(self->map[0]); ++i) {
331 unsigned j;
332
333 for (j = 0; j != sizeof(self->map[0].submap)/sizeof(self->map[0].submap[0]); ++j) {
334 if (self->map[i].submap[j].base)
335 munmap(self->map[i].submap[j].base, chunk);
336 }
337 }
338 close(self->fd);
339 free(self);
340 }
341
OpenKBTree(KBTree ** const rslt,unsigned n,unsigned max)342 static rc_t OpenKBTree(KBTree **const rslt, unsigned n, unsigned max)
343 {
344 size_t const cacheSize = (((G.cache_size - (G.cache_size / 2) - (G.cache_size / 8)) / max)
345 + 0xFFFFF) & ~((size_t)0xFFFFF);
346 KFile *file = NULL;
347 KDirectory *dir;
348 char fname[4096];
349 rc_t rc;
350
351 rc = KDirectoryNativeDir(&dir);
352 if (rc)
353 return rc;
354
355 rc = string_printf(fname, sizeof(fname), NULL, "%s/key2id.%u.%u", G.tmpfs, G.pid, n); if (rc) return rc;
356 rc = KDirectoryCreateFile(dir, &file, true, 0600, kcmInit, "%s", fname);
357 KDirectoryRemove(dir, 0, "%s", fname);
358 KDirectoryRelease(dir);
359 if (rc == 0) {
360 rc = KBTreeMakeUpdate(rslt, file, cacheSize,
361 false, kbtOpaqueKey,
362 1, 255, sizeof ( uint32_t ),
363 NULL
364 );
365 KFileRelease(file);
366 #if PERF
367 if (rc == 0) {
368 static unsigned treecount = 0;
369
370 (void)PLOGMSG(klogInfo, (klogInfo, "Number of trees: $(cnt)", "cnt=%u", ++treecount));
371 }
372 #endif
373 }
374 return rc;
375 }
376
GetKeyIDOld(KeyToID * const ctx,uint64_t * const rslt,bool * const wasInserted,char const key[],char const name[],unsigned const namelen)377 static rc_t GetKeyIDOld(KeyToID *const ctx, uint64_t *const rslt, bool *const wasInserted, char const key[], char const name[], unsigned const namelen)
378 {
379 unsigned const keylen = strlen(key);
380 rc_t rc;
381 uint64_t tmpKey;
382
383 if (ctx->key2id_count == 0) {
384 rc = OpenKBTree(&ctx->key2id[0], 1, 1);
385 if (rc) return rc;
386 ctx->key2id_count = 1;
387 }
388 if (memcmp(key, name, keylen) == 0) {
389 /* qname starts with read group; no append */
390 tmpKey = ctx->idCount[0];
391 rc = KBTreeEntry(ctx->key2id[0], &tmpKey, wasInserted, name, namelen);
392 }
393 else {
394 char sbuf[4096];
395 char *buf = sbuf;
396 char *hbuf = NULL;
397 size_t bsize = sizeof(sbuf);
398 size_t actsize;
399
400 if (keylen + namelen + 2 > bsize) {
401 hbuf = malloc(bsize = keylen + namelen + 2);
402 if (hbuf == NULL)
403 return RC(rcExe, rcName, rcAllocating, rcMemory, rcExhausted);
404 buf = hbuf;
405 }
406 rc = string_printf(buf, bsize, &actsize, "%s\t%.*s", key, (int)namelen, name);
407
408 tmpKey = ctx->idCount[0];
409 rc = KBTreeEntry(ctx->key2id[0], &tmpKey, wasInserted, buf, actsize);
410 if (hbuf)
411 free(hbuf);
412 }
413 if (rc == 0) {
414 *rslt = tmpKey;
415 if (*wasInserted)
416 ++ctx->idCount[0];
417 }
418 return rc;
419 }
420
HashKey(void const * const key,unsigned const keylen)421 static unsigned HashKey(void const *const key, unsigned const keylen)
422 {
423 #if 0
424 /* There is nothing special about this hash. It was randomly generated. */
425 static const uint8_t T1[] = {
426 64, 186, 39, 203, 54, 211, 98, 32, 26, 23, 219, 94, 77, 60, 56, 184,
427 129, 242, 10, 91, 84, 192, 19, 197, 231, 133, 125, 244, 48, 176, 160, 164,
428 17, 41, 57, 137, 44, 196, 116, 146, 105, 40, 122, 47, 220, 226, 213, 212,
429 107, 191, 52, 144, 9, 145, 81, 101, 217, 206, 85, 134, 143, 58, 128, 20,
430 236, 102, 83, 149, 148, 180, 167, 163, 12, 239, 31, 0, 73, 152, 1, 15,
431 75, 200, 4, 165, 5, 66, 25, 111, 255, 70, 174, 151, 96, 126, 147, 34,
432 112, 161, 127, 181, 237, 78, 37, 74, 222, 123, 21, 132, 95, 51, 141, 45,
433 61, 131, 193, 68, 62, 249, 178, 33, 7, 195, 228, 82, 27, 46, 254, 90,
434 185, 240, 246, 124, 205, 182, 42, 22, 198, 69, 166, 92, 169, 136, 223, 245,
435 118, 97, 115, 80, 252, 209, 49, 79, 221, 38, 28, 35, 36, 208, 187, 248,
436 158, 201, 202, 168, 2, 18, 189, 119, 216, 214, 11, 6, 89, 16, 229, 109,
437 120, 43, 162, 106, 204, 8, 199, 235, 142, 210, 86, 153, 227, 230, 24, 100,
438 224, 113, 190, 243, 218, 215, 225, 173, 99, 238, 138, 59, 183, 154, 171, 232,
439 157, 247, 233, 67, 88, 50, 253, 251, 140, 104, 156, 170, 150, 103, 117, 110,
440 155, 72, 207, 250, 159, 194, 177, 130, 135, 87, 71, 175, 14, 55, 172, 121,
441 234, 13, 30, 241, 93, 188, 53, 114, 76, 29, 65, 3, 179, 108, 63, 139
442 };
443 unsigned h = 0x55;
444 unsigned i = keylen;
445
446 do { h = T1[h ^ ((uint8_t)i)]; } while ((i >>= 8) != 0);
447
448 for (i = 0; i != keylen; ++i)
449 h = T1[h ^ ((uint8_t const *)key)[i]];
450
451 return h;
452 #else
453 /* FNV-1a hash with folding */
454 uint64_t h = 0xcbf29ce484222325;
455 unsigned i;
456
457 for (i = 0; i < keylen; ++i) {
458 uint8_t const octet = ((uint8_t const *)key)[i];
459 h = (h ^ octet) * 0x100000001b3ull;
460 }
461 return ((uint32_t)(h ^ (h >> 32))) % NUM_ID_SPACES;
462 #endif
463 }
464
465 #define USE_ILLUMINA_NAMING_CORRECTION 1
466
GetFixedNameLength(char const name[],size_t const namelen)467 static size_t GetFixedNameLength(char const name[], size_t const namelen)
468 {
469 #if USE_ILLUMINA_NAMING_CORRECTION
470 /*** Check for possible fixes to illumina names ****/
471 size_t newlen=namelen;
472 /*** First get rid of possible "/1" "/2" "/3" at the end - violates SAM spec **/
473 if(newlen > 2 && name[newlen-2] == '/' && (name[newlen-1] == '1' || name[newlen-1] == '2' || name[newlen-1] == '3')){
474 newlen -=2;
475 }
476 if(newlen > 2 && name[newlen-2] == '#' && (name[newlen-1] == '0')){ /*** Now, find "#0" ***/
477 newlen -=2;
478 } else if(newlen>10){ /*** find #ACGT ***/
479 int i=newlen;
480 for(i--;i>4;i--){ /*** stopping at 4 since the rest of record should still contain :x:y ***/
481 char a=toupper(name[i]);
482 if(a != 'A' && a != 'C' && a !='G' && a !='T'){
483 break;
484 }
485 }
486 if (name[i] == '#'){
487 switch (newlen-i) { /** allowed values for illumina barcodes :5,6,8 **/
488 case 5:
489 case 6:
490 case 8:
491 newlen=i;
492 break;
493 default:
494 break;
495 }
496 }
497 }
498 if(newlen < namelen){ /*** check for :x:y at the end now - to make sure it is illumina **/
499 int i=newlen;
500 for(i--;i>0 && isdigit(name[i]);i--){}
501 if(name[i]==':'){
502 for(i--;i>0 && isdigit(name[i]);i--){}
503 if(name[i]==':' && newlen > 0){ /*** some name before :x:y should still exist **/
504 /*** looks like illumina ***/
505 return newlen;
506 }
507 }
508 }
509 #endif
510 return namelen;
511 }
512
513 static
GetKeyID(KeyToID * const ctx,uint64_t * const rslt,bool * const wasInserted,char const key[],char const name[],size_t const o_namelen)514 rc_t GetKeyID(KeyToID *const ctx,
515 uint64_t *const rslt,
516 bool *const wasInserted,
517 char const key[],
518 char const name[],
519 size_t const o_namelen)
520 {
521 size_t const namelen = GetFixedNameLength(name, o_namelen);
522
523 if (ctx->key2id_max == 1)
524 return GetKeyIDOld(ctx, rslt, wasInserted, key, name, namelen);
525 else {
526 unsigned const keylen = strlen(key);
527 unsigned const h = HashKey(key, keylen);
528 unsigned f;
529 unsigned e = ctx->key2id_count;
530 uint64_t tmpKey;
531
532 *rslt = 0;
533 {{
534 uint32_t const bucket_value = ctx->key2id_hash[h];
535 unsigned const n = (uint8_t) bucket_value;
536 unsigned const i1 = (uint8_t)(bucket_value >> 8);
537 unsigned const i2 = (uint8_t)(bucket_value >> 16);
538 unsigned const i3 = (uint8_t)(bucket_value >> 24);
539
540 if (n > 0 && strcmp(key, ctx->key2id_names + ctx->key2id_name[i1]) == 0) {
541 f = i1;
542 /*
543 ctx->key2id_hash[h] = (i3 << 24) | (i2 << 16) | (i1 << 8) | n;
544 */
545 goto GET_ID;
546 }
547 if (n > 1 && strcmp(key, ctx->key2id_names + ctx->key2id_name[i2]) == 0) {
548 f = i2;
549 ctx->key2id_hash[h] = (i3 << 24) | (i1 << 16) | (i2 << 8) | n;
550 goto GET_ID;
551 }
552 if (n > 2 && strcmp(key, ctx->key2id_names + ctx->key2id_name[i3]) == 0) {
553 f = i3;
554 ctx->key2id_hash[h] = (i2 << 24) | (i1 << 16) | (i3 << 8) | n;
555 goto GET_ID;
556 }
557 }}
558 f = 0;
559 while (f < e) {
560 unsigned const m = (f + e) / 2;
561 unsigned const oid = ctx->key2id_oid[m];
562 int const diff = strcmp(key, ctx->key2id_names + ctx->key2id_name[oid]);
563
564 if (diff < 0)
565 e = m;
566 else if (diff > 0)
567 f = m + 1;
568 else {
569 f = oid;
570 goto GET_ID;
571 }
572 }
573 if (ctx->key2id_count < ctx->key2id_max) {
574 unsigned const name_max = ctx->key2id_name_max + keylen + 1;
575 KBTree *tree;
576 rc_t rc = OpenKBTree(&tree, ctx->key2id_count + 1, 1); /* ctx->key2id_max); */
577
578 if (rc) return rc;
579
580 if (ctx->key2id_name_alloc < name_max) {
581 unsigned alloc = ctx->key2id_name_alloc;
582 void *tmp;
583
584 if (alloc == 0)
585 alloc = 4096;
586 while (alloc < name_max)
587 alloc <<= 1;
588 tmp = realloc(ctx->key2id_names, alloc);
589 if (tmp == NULL)
590 return RC(rcExe, rcName, rcAllocating, rcMemory, rcExhausted);
591 ctx->key2id_names = tmp;
592 ctx->key2id_name_alloc = alloc;
593 }
594 if (f < ctx->key2id_count) {
595 memmove(&ctx->key2id_oid[f + 1], &ctx->key2id_oid[f], (ctx->key2id_count - f) * sizeof(ctx->key2id_oid[f]));
596 }
597 ctx->key2id_oid[f] = ctx->key2id_count;
598 ++ctx->key2id_count;
599 f = ctx->key2id_oid[f];
600 ctx->key2id_name[f] = ctx->key2id_name_max;
601 ctx->key2id_name_max = name_max;
602
603 memmove(&ctx->key2id_names[ctx->key2id_name[f]], key, keylen + 1);
604 ctx->key2id[f] = tree;
605 ctx->idCount[f] = 0;
606 if ((uint8_t)ctx->key2id_hash[h] < 3) {
607 unsigned const n = (uint8_t)ctx->key2id_hash[h] + 1;
608
609 ctx->key2id_hash[h] = (((ctx->key2id_hash[h] & ~(0xFFu)) | f) << 8) | n;
610 }
611 else {
612 /* the hash function isn't working too well
613 * keep the 3 mru
614 */
615 ctx->key2id_hash[h] = (((ctx->key2id_hash[h] & ~(0xFFu)) | f) << 8) | 3;
616 }
617 GET_ID:
618 tmpKey = ctx->idCount[f];
619 rc = KBTreeEntry(ctx->key2id[f], &tmpKey, wasInserted, name, namelen);
620 if (rc == 0) {
621 *rslt = (((uint64_t)f) << 32) | tmpKey;
622 if (*wasInserted)
623 ++ctx->idCount[f];
624 if (!(tmpKey < ctx->idCount[f])) {
625 (void)PLOGMSG(klogErr, (klogErr, "too many spots for read group '$(rg)'; ran out of 32-bit ids", "rg=%s", key));
626 return RC(rcExe, rcTree, rcAllocating, rcId, rcExhausted);
627 }
628 }
629 return rc;
630 }
631 (void)PLOGMSG(klogErr, (klogErr, "too many read groups: max is $(max)", "max=%d", (int)ctx->key2id_max));
632 return RC(rcExe, rcTree, rcAllocating, rcConstraint, rcViolated);
633 }
634 }
635
OpenMMapFile(context_t * const ctx,KDirectory * const dir)636 static rc_t OpenMMapFile(context_t *const ctx, KDirectory *const dir)
637 {
638 int fd;
639 char fname[4096];
640 rc_t rc = string_printf(fname, sizeof(fname), NULL, "%s/id2value.%u", G.tmpfs, G.pid);
641
642 if (rc)
643 return rc;
644
645 fd = open(fname, O_RDWR|O_TRUNC|O_CREAT, S_IRUSR|S_IWUSR);
646 if (fd < 0)
647 return RC(rcExe, rcFile, rcCreating, rcFile, rcNotFound);
648 unlink(fname);
649 return MMArrayMake(&ctx->id2value, fd, sizeof(ctx_value_t));
650 }
651
TmpfsDirectory(KDirectory ** const rslt)652 static rc_t TmpfsDirectory(KDirectory **const rslt)
653 {
654 KDirectory *dir;
655 rc_t rc = KDirectoryNativeDir(&dir);
656 if (rc == 0) {
657 rc = KDirectoryOpenDirUpdate(dir, rslt, false, "%s", G.tmpfs);
658 KDirectoryRelease(dir);
659 }
660 return rc;
661 }
662
SetupContext(context_t * ctx,unsigned numfiles)663 static rc_t SetupContext(context_t *ctx, unsigned numfiles)
664 {
665 rc_t rc = 0;
666
667 // memset(ctx, 0, sizeof(*ctx));
668
669 if (G.mode == mode_Archive) {
670 KDirectory *dir;
671 size_t fragSize[2];
672
673 fragSize[1] = (G.cache_size / 8);
674 fragSize[0] = fragSize[1] * 4;
675
676 rc = TmpfsDirectory(&dir);
677 if (rc == 0)
678 rc = OpenMMapFile(ctx, dir);
679 if (rc == 0)
680 rc = MemBankMake(&ctx->frags, dir, G.pid, fragSize);
681 KDirectoryRelease(dir);
682 }
683 else if (G.mode == mode_Remap) {
684 KeyToID const save1 = ctx->keyToID;
685 MMArray *const save2 = ctx->id2value;
686 int64_t const save3 = ctx->spotId;
687
688 memset(ctx, 0, sizeof(*ctx));
689 ctx->keyToID = save1;
690 ctx->id2value = save2;
691 ctx->spotId = save3;
692 }
693
694 rc = KLoadProgressbar_Make(&ctx->progress[0], 0); if (rc) return rc;
695 rc = KLoadProgressbar_Make(&ctx->progress[1], 0); if (rc) return rc;
696 rc = KLoadProgressbar_Make(&ctx->progress[2], 0); if (rc) return rc;
697 rc = KLoadProgressbar_Make(&ctx->progress[3], 0); if (rc) return rc;
698
699 KLoadProgressbar_Append(ctx->progress[0], 100 * numfiles);
700
701 return rc;
702 }
703
ContextReleaseMemBank(context_t * ctx)704 static void ContextReleaseMemBank(context_t *ctx)
705 {
706 MemBankRelease(ctx->frags);
707 ctx->frags = NULL;
708 }
709
ContextRelease(context_t * ctx,bool continuing)710 static void ContextRelease(context_t *ctx, bool continuing)
711 {
712 KLoadProgressbar_Release(ctx->progress[0], true);
713 KLoadProgressbar_Release(ctx->progress[1], true);
714 KLoadProgressbar_Release(ctx->progress[2], true);
715 KLoadProgressbar_Release(ctx->progress[3], true);
716 if (!continuing)
717 MMArrayWhack(ctx->id2value);
718 else
719 MMArrayClear(ctx->id2value);
720 }
721
722 static
COPY_QUAL(uint8_t D[],uint8_t const S[],unsigned const L,bool const R)723 void COPY_QUAL(uint8_t D[], uint8_t const S[], unsigned const L, bool const R)
724 {
725 if (R) {
726 unsigned i;
727 unsigned j;
728
729 for (i = 0, j = L - 1; i != L; ++i, --j)
730 D[i] = S[j];
731 }
732 else
733 memmove(D, S, L);
734 }
735
736 static
COPY_READ(INSDC_dna_text D[],INSDC_dna_text const S[],unsigned const L,bool const R)737 void COPY_READ(INSDC_dna_text D[], INSDC_dna_text const S[], unsigned const L, bool const R)
738 {
739 static INSDC_dna_text const compl[] = {
740 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
741 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
742 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
743 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
744 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
745 0 , 0 , 0 , 0 , 0 , 0 , '.', 0 ,
746 '0', '1', '2', '3', 0 , 0 , 0 , 0 ,
747 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
748 0 , 'T', 'V', 'G', 'H', 0 , 0 , 'C',
749 'D', 0 , 0 , 'M', 0 , 'K', 'N', 0 ,
750 0 , 0 , 'Y', 'S', 'A', 'A', 'B', 'W',
751 0 , 'R', 0 , 0 , 0 , 0 , 0 , 0 ,
752 0 , 'T', 'V', 'G', 'H', 0 , 0 , 'C',
753 'D', 0 , 0 , 'M', 0 , 'K', 'N', 0 ,
754 0 , 0 , 'Y', 'S', 'A', 'A', 'B', 'W',
755 0 , 'R', 0 , 0 , 0 , 0 , 0 , 0 ,
756 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
757 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
758 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
759 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
760 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
761 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
762 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
763 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
764 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
765 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
766 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
767 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
768 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
769 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
770 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
771 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0
772 };
773 if (R) {
774 unsigned i;
775 unsigned j;
776
777 for (i = 0, j = L - 1; i != L; ++i, --j)
778 D[i] = compl[((uint8_t const *)S)[j]];
779 }
780 else
781 memmove(D, S, L);
782 }
783
MakeDeferralFile()784 static KFile *MakeDeferralFile() {
785 if (G.deferSecondary) {
786 char template[4096];
787 int fd;
788 KFile *f;
789 KDirectory *d;
790 size_t nwrit;
791
792 KDirectoryNativeDir(&d);
793 string_printf(template, sizeof(template), &nwrit, "%s/defer.XXXXXX", G.tmpfs);
794 fd = mkstemp(template);
795 KDirectoryOpenFileWrite(d, &f, true, template);
796 KDirectoryRelease(d);
797 close(fd);
798 unlink(template);
799 return f;
800 }
801 return NULL;
802 }
803
OpenBAM(const BAM_File ** bam,VDatabase * db,const char bamFile[])804 static rc_t OpenBAM(const BAM_File **bam, VDatabase *db, const char bamFile[])
805 {
806 rc_t rc = 0;
807 KFile *defer = MakeDeferralFile();
808
809 if (strcmp(bamFile, "/dev/stdin") == 0) {
810 rc = BAM_FileMake(bam, defer, G.headerText, "/dev/stdin");
811 }
812 else {
813 rc = BAM_FileMake(bam, defer, G.headerText, "%s", bamFile);
814 }
815 KFileRelease(defer); /* it was retained by BAM file */
816
817 if (rc) {
818 (void)PLOGERR(klogErr, (klogErr, rc, "Failed to open '$(file)'", "file=%s", bamFile));
819 }
820 else if (db) {
821 KMetadata *dbmeta;
822
823 rc = VDatabaseOpenMetadataUpdate(db, &dbmeta);
824 if (rc == 0) {
825 KMDataNode *node;
826
827 rc = KMetadataOpenNodeUpdate(dbmeta, &node, "BAM_HEADER");
828 KMetadataRelease(dbmeta);
829 if (rc == 0) {
830 char const *header;
831 size_t size;
832
833 rc = BAM_FileGetHeaderText(*bam, &header, &size);
834 if (rc == 0) {
835 rc = KMDataNodeWrite(node, header, size);
836 }
837 KMDataNodeRelease(node);
838 }
839 }
840 }
841
842 return rc;
843 }
844
VerifyReferences(BAM_File const * bam,Reference const * ref)845 static rc_t VerifyReferences(BAM_File const *bam, Reference const *ref)
846 {
847 rc_t rc = 0;
848 uint32_t n;
849 unsigned i;
850
851 BAM_FileGetRefSeqCount(bam, &n);
852 for (i = 0; i != n; ++i) {
853 BAMRefSeq const *refSeq;
854
855 BAM_FileGetRefSeq(bam, i, &refSeq);
856 if (G.refFilter && strcmp(refSeq->name, G.refFilter) != 0)
857 continue;
858
859 rc = ReferenceVerify(ref, refSeq->name, refSeq->length, refSeq->checksum);
860 if (rc) {
861 if (GetRCObject(rc) == rcChecksum && GetRCState(rc) == rcUnequal) {
862 #if NCBI
863 (void)PLOGMSG(klogWarn, (klogWarn, "Reference: '$(name)', Length: $(len); checksums do not match", "name=%s,len=%u", refSeq->name, (unsigned)refSeq->length));
864 #endif
865 }
866 else if (GetRCObject(rc) == rcSize && GetRCState(rc) == rcUnequal) {
867 (void)PLOGMSG(klogWarn, (klogWarn, "Reference: '$(name)', Length: $(len); lengths do not match", "name=%s,len=%u", refSeq->name, (unsigned)refSeq->length));
868 }
869 else if (GetRCObject(rc) == rcSize && GetRCState(rc) == rcEmpty) {
870 (void)PLOGMSG(klogWarn, (klogWarn, "Reference: '$(name)', Length: $(len); fasta file is empty", "name=%s,len=%u", refSeq->name, (unsigned)refSeq->length));
871 }
872 else if (GetRCObject(rc) == rcId && GetRCState(rc) == rcNotFound) {
873 (void)PLOGMSG(klogWarn, (klogWarn, "Reference: '$(name)', Length: $(len); no match found", "name=%s,len=%u", refSeq->name, (unsigned)refSeq->length));
874 }
875 else {
876 (void)PLOGERR(klogWarn, (klogWarn, rc, "Reference: '$(name)', Length: $(len); error", "name=%s,len=%u", refSeq->name, (unsigned)refSeq->length));
877 }
878 }
879 else if (G.onlyVerifyReferences) {
880 (void)PLOGMSG(klogInfo, (klogInfo, "Reference: '$(name)', Length: $(len); match found", "name=%s,len=%u", refSeq->name, (unsigned)refSeq->length));
881 }
882 }
883 return 0;
884 }
885
GetMapQ(BAM_Alignment const * rec)886 static uint8_t GetMapQ(BAM_Alignment const *rec)
887 {
888 uint8_t mapQ;
889
890 BAM_AlignmentGetMapQuality(rec, &mapQ);
891 return mapQ;
892 }
893
894 #if 0
895 static bool EditAlignedQualities(uint8_t qual[], bool const hasMismatch[], unsigned readlen)
896 {
897 unsigned i;
898 bool changed = false;
899
900 for (i = 0; i < readlen; ++i) {
901 uint8_t const q_0 = qual[i];
902 uint8_t const q_1= hasMismatch[i] ? G.alignedQualValue : q_0;
903
904 if (q_0 != q_1) {
905 changed = true;
906 break;
907 }
908 }
909 if (!changed)
910 return false;
911 for (i = 0; i < readlen; ++i) {
912 uint8_t const q_0 = qual[i];
913 uint8_t const q_1= hasMismatch[i] ? G.alignedQualValue : q_0;
914
915 qual[i] = q_1;
916 }
917 return true;
918 }
919 #endif
920
921 #if 0
922 static bool EditUnalignedQualities(uint8_t qual[], bool const hasMismatch[], unsigned readlen)
923 {
924 unsigned i;
925 bool changed = false;
926
927 for (i = 0; i < readlen; ++i) {
928 uint8_t const q_0 = qual[i];
929 uint8_t const q_1 = (q_0 & 0x7F) | (hasMismatch[i] ? 0x80 : 0);
930
931 if (q_0 != q_1) {
932 changed = true;
933 break;
934 }
935 }
936 if (!changed)
937 return false;
938 for (i = 0; i < readlen; ++i) {
939 uint8_t const q_0 = qual[i];
940 uint8_t const q_1 = (q_0 & 0x7F) | (hasMismatch[i] ? 0x80 : 0);
941
942 qual[i] = q_1;
943 }
944 return true;
945 }
946 #endif
947
platform_cmp(char const platform[],char const test[])948 static bool platform_cmp(char const platform[], char const test[])
949 {
950 unsigned i;
951
952 for (i = 0; ; ++i) {
953 int ch1 = test[i];
954 int ch2 = toupper(platform[i]);
955
956 if (ch1 != ch2)
957 break;
958 if (ch1 == 0)
959 return true;
960 }
961 return false;
962 }
963
964 static
GetINSDCPlatform(BAM_File const * bam,char const name[])965 INSDC_SRA_platform_id GetINSDCPlatform(BAM_File const *bam, char const name[]) {
966 if (name) {
967 BAMReadGroup const *rg;
968
969 BAM_FileGetReadGroupByName(bam, name, &rg);
970 if (rg && rg->platform) {
971 switch (toupper(rg->platform[0])) {
972 case 'C':
973 if (platform_cmp(rg->platform, "COMPLETE GENOMICS"))
974 return SRA_PLATFORM_COMPLETE_GENOMICS;
975 if (platform_cmp(rg->platform, "CAPILLARY"))
976 return SRA_PLATFORM_CAPILLARY;
977 break;
978 case 'H':
979 if (platform_cmp(rg->platform, "HELICOS"))
980 return SRA_PLATFORM_HELICOS;
981 break;
982 case 'I':
983 if (platform_cmp(rg->platform, "ILLUMINA"))
984 return SRA_PLATFORM_ILLUMINA;
985 if (platform_cmp(rg->platform, "IONTORRENT"))
986 return SRA_PLATFORM_ION_TORRENT;
987 break;
988 case 'L':
989 if (platform_cmp(rg->platform, "LS454"))
990 return SRA_PLATFORM_454;
991 break;
992 case 'N':
993 if (platform_cmp(name, "NANOPORE"))
994 return SRA_PLATFORM_OXFORD_NANOPORE;
995 break;
996 case 'O':
997 if (platform_cmp(name, "OXFORD_NANOPORE"))
998 return SRA_PLATFORM_OXFORD_NANOPORE;
999 break;
1000 case 'P':
1001 if (platform_cmp(rg->platform, "PACBIO"))
1002 return SRA_PLATFORM_PACBIO_SMRT;
1003 break;
1004 case 'S':
1005 if (platform_cmp(rg->platform, "SOLID"))
1006 return SRA_PLATFORM_ABSOLID;
1007 if (platform_cmp(name, "SANGER"))
1008 return SRA_PLATFORM_CAPILLARY;
1009 break;
1010 default:
1011 break;
1012 }
1013 }
1014 }
1015 return SRA_PLATFORM_UNDEFINED;
1016 }
1017
1018 static
CheckLimitAndLogError(void)1019 rc_t CheckLimitAndLogError(void)
1020 {
1021 unsigned const count = ++G.errCount;
1022
1023 if (G.maxErrCount > 0 && count > G.maxErrCount) {
1024 (void)PLOGERR(klogErr, (klogErr, SILENT_RC(rcAlign, rcFile, rcReading, rcError, rcExcessive), "Number of errors $(cnt) exceeds limit of $(max): Exiting", "cnt=%u,max=%u", count, G.maxErrCount));
1025 return RC(rcAlign, rcFile, rcReading, rcError, rcExcessive);
1026 }
1027 return 0;
1028 }
1029
1030 static
RecordNoMatch(char const readName[],char const refName[],uint32_t const refPos)1031 void RecordNoMatch(char const readName[], char const refName[], uint32_t const refPos)
1032 {
1033 if (G.noMatchLog) {
1034 static uint64_t lpos = 0;
1035 char logbuf[256];
1036 size_t len;
1037
1038 if (string_printf(logbuf, sizeof(logbuf), &len, "%s\t%s\t%u\n", readName, refName, refPos) == 0) {
1039 KFileWrite(G.noMatchLog, lpos, logbuf, len, NULL);
1040 lpos += len;
1041 }
1042 }
1043 }
1044
1045 static LowMatchCounter *lmc = NULL;
1046
1047 static
LogNoMatch(char const readName[],char const refName[],unsigned rpos,unsigned matches)1048 rc_t LogNoMatch(char const readName[], char const refName[], unsigned rpos, unsigned matches)
1049 {
1050 rc_t const rc = CheckLimitAndLogError();
1051 static unsigned count = 0;
1052
1053 if (lmc == NULL)
1054 lmc = LowMatchCounterMake();
1055 assert(lmc != NULL);
1056 LowMatchCounterAdd(lmc, refName);
1057
1058 ++count;
1059 if (rc) {
1060 (void)PLOGMSG(klogInfo, (klogInfo, "This is the last warning; this class of warning occurred $(occurred) times",
1061 "occurred=%u", count));
1062 (void)PLOGMSG(klogErr, (klogErr, "Spot '$(name)' contains too few ($(count)) matching bases to reference '$(ref)' at $(pos)",
1063 "name=%s,ref=%s,pos=%u,count=%u", readName, refName, rpos, matches));
1064 return rc;
1065 }
1066 if (G.maxWarnCount_NoMatch == 0 || count < G.maxWarnCount_NoMatch)
1067 (void)PLOGMSG(klogWarn, (klogWarn, "Spot '$(name)' contains too few ($(count)) matching bases to reference '$(ref)' at $(pos)",
1068 "name=%s,ref=%s,pos=%u,count=%u", readName, refName, rpos, matches));
1069 return 0;
1070 }
1071
1072 struct rlmc_context {
1073 KMDataNode *node;
1074 unsigned node_number;
1075 rc_t rc;
1076 };
1077
RecordLowMatchCount(void * Ctx,char const name[],unsigned const count)1078 static void RecordLowMatchCount(void *Ctx, char const name[], unsigned const count)
1079 {
1080 struct rlmc_context *const ctx = Ctx;
1081
1082 if (ctx->rc == 0) {
1083 KMDataNode *sub = NULL;
1084
1085 ctx->rc = KMDataNodeOpenNodeUpdate(ctx->node, &sub, "LOW_MATCH_COUNT_%u", ++ctx->node_number);
1086 if (ctx->rc == 0) {
1087 uint32_t const count_temp = count;
1088 ctx->rc = KMDataNodeWriteAttr(sub, "REFNAME", name);
1089 if (ctx->rc == 0)
1090 ctx->rc = KMDataNodeWriteB32(sub, &count_temp);
1091
1092 KMDataNodeRelease(sub);
1093 }
1094 }
1095 }
1096
RecordLowMatchCounts(KMDataNode * const node)1097 static rc_t RecordLowMatchCounts(KMDataNode *const node)
1098 {
1099 struct rlmc_context ctx;
1100
1101 assert(lmc != NULL);
1102 if (node) {
1103 ctx.node = node;
1104 ctx.node_number = 0;
1105 ctx.rc = 0;
1106
1107 LowMatchCounterEach(lmc, &ctx, RecordLowMatchCount);
1108 }
1109 return ctx.rc;
1110 }
1111
1112 #if 0
1113 static
1114 rc_t LogDupConflict(char const readName[])
1115 {
1116 rc_t const rc = CheckLimitAndLogError();
1117 static unsigned count = 0;
1118
1119 ++count;
1120 if (rc) {
1121 (void)PLOGMSG(klogInfo, (klogInfo, "This is the last warning; this class of warning occurred $(occurred) times",
1122 "occurred=%u", count));
1123 (void)PLOGERR(klogWarn, (klogWarn, SILENT_RC(rcApp, rcFile, rcReading, rcData, rcInconsistent),
1124 "Spot '$(name)' is both a duplicate and NOT a duplicate!",
1125 "name=%s", readName));
1126 }
1127 else if (G.maxWarnCount_DupConflict == 0 || count < G.maxWarnCount_DupConflict)
1128 (void)PLOGERR(klogWarn, (klogWarn, SILENT_RC(rcApp, rcFile, rcReading, rcData, rcInconsistent),
1129 "Spot '$(name)' is both a duplicate and NOT a duplicate!",
1130 "name=%s", readName));
1131 return rc;
1132 }
1133 #endif
1134
1135 static char const *const CHANGED[] = {
1136 "FLAG changed",
1137 "QUAL changed",
1138 "SEQ changed",
1139 "record made unaligned",
1140 "record made unfragmented",
1141 "mate alignment lost",
1142 "record discarded",
1143 "reference name changed",
1144 "CIGAR changed"
1145 };
1146
1147 #define FLAG_CHANGED (0)
1148 #define QUAL_CHANGED (1)
1149 #define SEQ_CHANGED (2)
1150 #define MAKE_UNALIGNED (3)
1151 #define MAKE_UNFRAGMENTED (4)
1152 #define MATE_LOST (5)
1153 #define DISCARDED (6)
1154 #define REF_NAME_CHANGED (7)
1155 #define CIGAR_CHANGED (8)
1156
1157 static char const *const REASONS[] = {
1158 /* FLAG changed */
1159 "0x400 and 0x200 both set", /* 0 */
1160 "conflicting PCR Dup flags", /* 1 */
1161 "primary alignment already exists", /* 2 */
1162 "was already recorded as unaligned", /* 3 */
1163 /* QUAL changed */
1164 "original quality used", /* 4 */
1165 "unaligned colorspace", /* 5 */
1166 "aligned bases", /* 6 */
1167 "unaligned bases", /* 7 */
1168 "reversed", /* 8 */
1169 /* unaligned */
1170 "low MAPQ", /* 9 */
1171 "low match count", /* 10 */
1172 "missing alignment info", /* 11 */
1173 "missing reference position", /* 12 */
1174 "invalid alignment info", /* 13 */
1175 "invalid reference position", /* 14 */
1176 "invalid reference", /* 15 */
1177 "unaligned reference", /* 16 */
1178 "unknown reference", /* 17 */
1179 "hard-clipped colorspace", /* 18 */
1180 /* unfragmented */
1181 "missing fragment info", /* 19 */
1182 "too many fragments", /* 20 */
1183 /* mate info lost */
1184 "invalid mate reference", /* 21 */
1185 "missing mate alignment info", /* 22 */
1186 "unknown mate reference", /* 23 */
1187 /* discarded */
1188 "conflicting PCR duplicate", /* 24 */
1189 "conflicting fragment info", /* 25 */
1190 "reference is skipped", /* 26 */
1191 /* reference name changed */
1192 "reference was named more than once", /* 27 */
1193 /* CIGAR changed */
1194 "alignment overhanging end of reference", /* 28 */
1195 /* discarded */
1196 "hard-clipped secondary alignment", /* 29 */
1197 "low-matching secondary alignment", /* 30 */
1198 };
1199
1200 static struct {
1201 unsigned what, why;
1202 } const CHANGES[] = {
1203 {FLAG_CHANGED, 0},
1204 {FLAG_CHANGED, 1},
1205 {FLAG_CHANGED, 2},
1206 {FLAG_CHANGED, 3},
1207 {QUAL_CHANGED, 4},
1208 {QUAL_CHANGED, 5},
1209 {QUAL_CHANGED, 6},
1210 {QUAL_CHANGED, 7},
1211 {QUAL_CHANGED, 8},
1212 {SEQ_CHANGED, 8},
1213 {MAKE_UNALIGNED, 9},
1214 {MAKE_UNALIGNED, 10},
1215 {MAKE_UNALIGNED, 11},
1216 {MAKE_UNALIGNED, 12},
1217 {MAKE_UNALIGNED, 13},
1218 {MAKE_UNALIGNED, 14},
1219 {MAKE_UNALIGNED, 15},
1220 {MAKE_UNALIGNED, 16},
1221 {MAKE_UNALIGNED, 17},
1222 {MAKE_UNALIGNED, 18},
1223 {MAKE_UNFRAGMENTED, 19},
1224 {MAKE_UNFRAGMENTED, 20},
1225 {MATE_LOST, 21},
1226 {MATE_LOST, 22},
1227 {MATE_LOST, 23},
1228 {DISCARDED, 24},
1229 {DISCARDED, 25},
1230 {DISCARDED, 26},
1231 {DISCARDED, 17},
1232 {REF_NAME_CHANGED, 27},
1233 {CIGAR_CHANGED, 28},
1234 {DISCARDED, 29},
1235 {DISCARDED, 30},
1236 };
1237
1238 #define NUMBER_OF_CHANGES ((unsigned)(sizeof(CHANGES)/sizeof(CHANGES[0])))
1239 static unsigned change_counter[NUMBER_OF_CHANGES];
1240
LOG_CHANGE(unsigned const change)1241 static void LOG_CHANGE(unsigned const change)
1242 {
1243 ++change_counter[change];
1244 }
1245
PrintChangeReport(void)1246 static void PrintChangeReport(void)
1247 {
1248 unsigned i;
1249
1250 for (i = 0; i != NUMBER_OF_CHANGES; ++i) {
1251 if (change_counter[i] > 0) {
1252 char const *const what = CHANGED[CHANGES[i].what];
1253 char const *const why = REASONS[CHANGES[i].why];
1254
1255 PLOGMSG(klogInfo, (klogInfo, "$(what) $(times) times because $(reason)", "what=%s,reason=%s,times=%u", what, why, change_counter[i]));
1256 }
1257 }
1258 }
1259
RecordChange(KMDataNode * const node,char const node_name[],unsigned const node_number,char const what[],char const why[],unsigned const count)1260 static rc_t RecordChange(KMDataNode *const node,
1261 char const node_name[],
1262 unsigned const node_number,
1263 char const what[],
1264 char const why[],
1265 unsigned const count)
1266 {
1267 KMDataNode *sub = NULL;
1268 rc_t const rc_sub = KMDataNodeOpenNodeUpdate(node, &sub, "%s_%u", node_name, node_number);
1269
1270 if (rc_sub) return rc_sub;
1271 {
1272 uint32_t const count_temp = count;
1273 rc_t const rc_attr1 = KMDataNodeWriteAttr(sub, "change", what);
1274 rc_t const rc_attr2 = KMDataNodeWriteAttr(sub, "reason", why);
1275 rc_t const rc_value = KMDataNodeWriteB32(sub, &count_temp);
1276
1277 KMDataNodeRelease(sub);
1278 if (rc_attr1) return rc_attr1;
1279 if (rc_attr2) return rc_attr2;
1280 if (rc_value) return rc_value;
1281
1282 return 0;
1283 }
1284 }
1285
RecordChanges(KMDataNode * const node,char const name[])1286 static rc_t RecordChanges(KMDataNode *const node, char const name[])
1287 {
1288 if (node) {
1289 unsigned i;
1290 unsigned j = 0;
1291
1292 for (i = 0; i != NUMBER_OF_CHANGES; ++i) {
1293 if (change_counter[i] > 0) {
1294 char const *const what = CHANGED[CHANGES[i].what];
1295 char const *const why = REASONS[CHANGES[i].why];
1296 rc_t const rc = RecordChange(node, name, ++j, what, why, change_counter[i]);
1297
1298 if (rc) return rc;
1299 }
1300 }
1301 }
1302 return 0;
1303 }
1304
1305 #define FLAG_CHANGED_400_AND_200 do { LOG_CHANGE( 0); } while(0)
1306 #define FLAG_CHANGED_PCR_DUP do { LOG_CHANGE( 1); } while(0)
1307 #define FLAG_CHANGED_PRIMARY_DUP do { LOG_CHANGE( 2); } while(0)
1308 #define FLAG_CHANGED_WAS_UNALIGNED do { LOG_CHANGE( 3); } while(0)
1309 #define QUAL_CHANGED_OQ do { LOG_CHANGE( 4); } while(0)
1310 #define QUAL_CHANGED_UNALIGNED_CS do { LOG_CHANGE( 5); } while(0)
1311 #define QUAL_CHANGED_ALIGNED_EDIT do { LOG_CHANGE( 6); } while(0)
1312 #define QUAL_CHANGED_UNALIGN_EDIT do { LOG_CHANGE( 7); } while(0)
1313 #define QUAL_CHANGED_REVERSED do { LOG_CHANGE( 8); } while(0)
1314 #define SEQ__CHANGED_REV_COMP do { LOG_CHANGE( 9); } while(0)
1315 #define UNALIGNED_LOW_MAPQ do { LOG_CHANGE(10); } while(0)
1316 #define UNALIGNED_LOW_MATCH_COUNT do { LOG_CHANGE(11); } while(0)
1317 #define UNALIGNED_MISSING_INFO do { LOG_CHANGE(12); } while(0)
1318 #define UNALIGNED_MISSING_REF_POS do { LOG_CHANGE(13); } while(0)
1319 #define UNALIGNED_INVALID_INFO do { LOG_CHANGE(14); } while(0)
1320 #define UNALIGNED_INVALID_REF_POS do { LOG_CHANGE(15); } while(0)
1321 #define UNALIGNED_INVALID_REF do { LOG_CHANGE(16); } while(0)
1322 #define UNALIGNED_UNALIGNED_REF do { LOG_CHANGE(17); } while(0)
1323 #define UNALIGNED_UNKNOWN_REF do { LOG_CHANGE(18); } while(0)
1324 #define UNALIGNED_HARD_CLIPPED_CS do { LOG_CHANGE(19); } while(0)
1325 #define UNFRAGMENT_MISSING_INFO do { LOG_CHANGE(20); } while(0)
1326 #define UNFRAGMENT_TOO_MANY do { LOG_CHANGE(21); } while(0)
1327 #define MATE_INFO_LOST_INVALID do { LOG_CHANGE(22); } while(0)
1328 #define MATE_INFO_LOST_MISSING do { LOG_CHANGE(23); } while(0)
1329 #define MATE_INFO_LOST_UNKNOWN_REF do { LOG_CHANGE(24); } while(0)
1330 #define DISCARD_PCR_DUP do { LOG_CHANGE(25); } while(0)
1331 #define DISCARD_BAD_FRAGMENT_INFO do { LOG_CHANGE(26); } while(0)
1332 #define DISCARD_SKIP_REFERENCE do { LOG_CHANGE(27); } while(0)
1333 #define DISCARD_UNKNOWN_REFERENCE do { LOG_CHANGE(28); } while(0)
1334 #define RENAMED_REFERENCE do { LOG_CHANGE(29); } while(0)
1335 #define OVERHANGING_ALIGNMENT do { LOG_CHANGE(30); } while(0)
1336 #define DISCARD_HARDCLIP_SECONDARY do { LOG_CHANGE(31); } while(0)
1337 #define DISCARD_BAD_SECONDARY do { LOG_CHANGE(32); } while(0)
1338
isHardClipped(unsigned const ops,uint32_t const cigar[])1339 static bool isHardClipped(unsigned const ops, uint32_t const cigar[/* ops */])
1340 {
1341 unsigned i;
1342
1343 for (i = 0; i < ops; ++i) {
1344 uint32_t const op = cigar[i];
1345 int const code = op & 0x0F;
1346
1347 if (code == 5)
1348 return true;
1349 }
1350 return false;
1351 }
1352
FixOverhangingAlignment(KDataBuffer * cigBuf,uint32_t * opCount,uint32_t refPos,uint32_t refLen,uint32_t readlen)1353 static rc_t FixOverhangingAlignment(KDataBuffer *cigBuf, uint32_t *opCount, uint32_t refPos, uint32_t refLen, uint32_t readlen)
1354 {
1355 uint32_t const *cigar = cigBuf->base;
1356 int refend = refPos;
1357 int seqpos = 0;
1358 unsigned i;
1359
1360 for (i = 0; i < *opCount; ++i) {
1361 uint32_t const op = cigar[i];
1362 int const len = op >> 4;
1363 int const code = op & 0x0F;
1364
1365 switch (code) {
1366 case 0: /* M */
1367 case 7: /* = */
1368 case 8: /* X */
1369 seqpos += len;
1370 refend += len;
1371 break;
1372 case 2: /* D */
1373 case 3: /* N */
1374 refend += len;
1375 break;
1376 case 1: /* I */
1377 case 4: /* S */
1378 case 9: /* B */
1379 seqpos += len;
1380 default:
1381 break;
1382 }
1383 if (refend > refLen) {
1384 int const chop = refend - refLen;
1385 int const newlen = len - chop;
1386 int const left = seqpos - chop;
1387 if (left * 2 > readlen) {
1388 int const clip = readlen - left;
1389 rc_t rc;
1390
1391 *opCount = i + 2;
1392 rc = KDataBufferResize(cigBuf, *opCount);
1393 if (rc) return rc;
1394 ((uint32_t *)cigBuf->base)[i ] = (newlen << 4) | code;
1395 ((uint32_t *)cigBuf->base)[i+1] = (clip << 4) | 4;
1396 OVERHANGING_ALIGNMENT;
1397 break;
1398 }
1399 }
1400 }
1401 return 0;
1402 }
1403
1404 static context_t GlobalContext;
1405 static KQueue *bamq;
1406 static KThread *bamread_thread;
1407
BAM_FileReadDetached(BAM_File const * self,BAM_Alignment ** rec)1408 static rc_t BAM_FileReadDetached(BAM_File const *self, BAM_Alignment **rec)
1409 {
1410 BAM_Alignment const *crec = NULL;
1411 rc_t const rc = BAM_FileRead2(self, &crec);
1412 if (rc == 0) {
1413 if ((*rec = BAM_AlignmentDetach(crec)) != NULL)
1414 return 0;
1415 return RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
1416 }
1417 BAM_AlignmentRelease(crec);
1418 return rc;
1419 }
1420
run_bamread_thread(const KThread * self,void * const file)1421 static rc_t run_bamread_thread(const KThread *self, void *const file)
1422 {
1423 rc_t rc = 0;
1424 size_t NR = 0;
1425
1426 while (rc == 0) {
1427 BAM_Alignment *rec = NULL;
1428
1429 ++NR;
1430 rc = BAM_FileReadDetached(file, &rec);
1431 if ((int)GetRCObject(rc) == rcRow && (int)GetRCState(rc) == rcEmpty) {
1432 rc = CheckLimitAndLogError();
1433 continue;
1434 }
1435 if ((int)GetRCObject(rc) == rcRow && (int)GetRCState(rc) == rcNotFound) {
1436 /* EOF */
1437 rc = 0;
1438 --NR;
1439 break;
1440 }
1441 if (rc) break;
1442
1443 {
1444 static char const dummy[] = "";
1445 char const *spotGroup;
1446 char const *name;
1447 size_t namelen;
1448
1449 BAM_AlignmentGetReadName2(rec, &name, &namelen);
1450 BAM_AlignmentGetReadGroupName(rec, &spotGroup);
1451 rc = GetKeyID(&GlobalContext.keyToID, &rec->keyId, &rec->wasInserted, spotGroup ? spotGroup : dummy, name, namelen);
1452 if (rc) break;
1453 }
1454
1455 for ( ; ; ) {
1456 timeout_t tm;
1457 TimeoutInit(&tm, 1000);
1458 rc = KQueuePush(bamq, rec, &tm);
1459 if (rc == 0 || (int)GetRCObject(rc) != rcTimeout)
1460 break;
1461 }
1462 }
1463 KQueueSeal(bamq);
1464 if (rc) {
1465 (void)LOGERR(klogErr, rc, "bamread_thread done");
1466 }
1467 else {
1468 (void)PLOGMSG(klogInfo, (klogInfo, "bamread_thread done; read $(NR) records", "NR=%lu", NR));
1469 }
1470 return rc;
1471 }
1472
1473 /* call on main thread only */
getNextRecord(BAM_File const * const bam,rc_t * const rc)1474 static BAM_Alignment const *getNextRecord(BAM_File const *const bam, rc_t *const rc)
1475 {
1476 if (bamq == NULL) {
1477 *rc = KQueueMake(&bamq, 4096);
1478 if (*rc) return NULL;
1479 *rc = KThreadMake(&bamread_thread, run_bamread_thread, (void *)bam);
1480 if (*rc) {
1481 KQueueRelease(bamq);
1482 bamq = NULL;
1483 return NULL;
1484 }
1485 }
1486 while (*rc == 0 && (*rc = Quitting()) == 0) {
1487 BAM_Alignment const *rec = NULL;
1488 timeout_t tm;
1489
1490 TimeoutInit(&tm, 10000);
1491 *rc = KQueuePop(bamq, (void **)&rec, &tm);
1492 if (*rc == 0)
1493 return rec; /* this is the normal return */
1494
1495 if ((int)GetRCObject(*rc) == rcTimeout)
1496 *rc = 0;
1497 else {
1498 if ((int)GetRCObject(*rc) == rcData && (int)GetRCState(*rc) == rcDone)
1499 (void)LOGMSG(klogDebug, "KQueuePop Done");
1500 else
1501 (void)PLOGERR(klogWarn, (klogWarn, *rc, "KQueuePop Error", NULL));
1502 }
1503 }
1504 {
1505 rc_t rc2 = 0;
1506 KThreadWait(bamread_thread, &rc2);
1507 if (rc2 != 0)
1508 *rc = rc2; // return the rc from the reader thread
1509 }
1510 KThreadRelease(bamread_thread);
1511 bamread_thread = NULL;
1512 KQueueRelease(bamq);
1513 bamq = NULL;
1514 return NULL;
1515 }
1516
getSpotGroup(BAM_Alignment const * const rec,char spotGroup[])1517 static void getSpotGroup(BAM_Alignment const *const rec, char spotGroup[])
1518 {
1519 char const *rgname;
1520
1521 BAM_AlignmentGetReadGroupName(rec, &rgname);
1522 if (rgname)
1523 strcpy(spotGroup, rgname);
1524 else
1525 spotGroup[0] = '\0';
1526 }
1527
getLinkageGroup(BAM_Alignment const * const rec)1528 static char const *getLinkageGroup(BAM_Alignment const *const rec)
1529 {
1530 static char linkageGroup[1024];
1531 char const *BX = NULL;
1532 char const *CB = NULL;
1533 char const *UB = NULL;
1534
1535 linkageGroup[0] = '\0';
1536 BAM_AlignmentGetLinkageGroup(rec, &BX, &CB, &UB);
1537 if (BX == NULL) {
1538 if (CB != NULL && UB != NULL) {
1539 unsigned const cblen = strlen(CB);
1540 unsigned const ublen = strlen(UB);
1541 if (cblen + ublen + 8 < sizeof(linkageGroup)) {
1542 memmove(&linkageGroup[ 0], "CB:", 3);
1543 memmove(&linkageGroup[ 3], CB, cblen);
1544 memmove(&linkageGroup[cblen + 3], "|UB:", 4);
1545 memmove(&linkageGroup[cblen + 7], UB, ublen + 1);
1546 }
1547 }
1548 }
1549 else {
1550 unsigned const bxlen = strlen(BX);
1551 if (bxlen + 1 < sizeof(linkageGroup))
1552 memmove(linkageGroup, BX, bxlen + 1);
1553 }
1554 return linkageGroup;
1555 }
1556
ProcessBAM(char const bamFile[],context_t * ctx,VDatabase * db,Reference * ref,Sequence * seq,Alignment * align,bool * had_alignments,bool * had_sequences)1557 static rc_t ProcessBAM(char const bamFile[], context_t *ctx, VDatabase *db,
1558 /* data outputs */
1559 Reference *ref, Sequence *seq, Alignment *align,
1560 /* output parameters */
1561 bool *had_alignments, bool *had_sequences)
1562 {
1563 const BAM_File *bam;
1564 const BAM_Alignment *rec;
1565 KDataBuffer buf;
1566 KDataBuffer fragBuf;
1567 KDataBuffer cigBuf;
1568 rc_t rc;
1569 const BAMRefSeq *refSeq = NULL;
1570 int32_t lastRefSeqId = -1;
1571 bool wasRenamed = false;
1572 size_t rsize;
1573 uint64_t keyId = 0;
1574 uint64_t reccount = 0;
1575 char spotGroup[512];
1576 size_t namelen;
1577 float progress = 0.0;
1578 unsigned warned = 0;
1579 long fcountBoth=0;
1580 long fcountOne=0;
1581 int skipRefSeqID = -1;
1582 int unmapRefSeqId = -1;
1583 uint64_t recordsRead = 0;
1584 uint64_t recordsProcessed = 0;
1585 uint64_t filterFlagConflictRecords=0; /*** counts number of conflicts between flags 0x400 and 0x200 ***/
1586 #define MAX_WARNINGS_FLAG_CONFLICT 10000 /*** maximum errors to report ***/
1587
1588 bool isColorSpace = false;
1589 bool isNotColorSpace = G.noColorSpace;
1590 char alignGroup[32];
1591 size_t alignGroupLen;
1592 AlignmentRecord data;
1593 KDataBuffer seqBuffer;
1594 KDataBuffer qualBuffer;
1595 SequenceRecord srec;
1596 SequenceRecordStorage srecStorage;
1597
1598 /* setting up buffers */
1599 memset(&data, 0, sizeof(data));
1600 memset(&srec, 0, sizeof(srec));
1601
1602 srec.ti = srecStorage.ti;
1603 srec.readStart = srecStorage.readStart;
1604 srec.readLen = srecStorage.readLen;
1605 srec.orientation = srecStorage.orientation;
1606 srec.is_bad = srecStorage.is_bad;
1607 srec.alignmentCount = srecStorage.alignmentCount;
1608 srec.aligned = srecStorage.aligned;
1609 srec.cskey = srecStorage. cskey;
1610
1611 rc = OpenBAM(&bam, db, bamFile);
1612 if (rc) return rc;
1613 if (!G.noVerifyReferences && ref != NULL) {
1614 rc = VerifyReferences(bam, ref);
1615 if (G.onlyVerifyReferences) {
1616 BAM_FileRelease(bam);
1617 return rc;
1618 }
1619 }
1620 if (ctx->keyToID.key2id_max == 0) {
1621 uint32_t rgcount;
1622 unsigned rgi;
1623
1624 BAM_FileGetReadGroupCount(bam, &rgcount);
1625 if (rgcount > (sizeof(ctx->keyToID.key2id)/sizeof(ctx->keyToID.key2id[0]) - 1))
1626 ctx->keyToID.key2id_max = 1;
1627 else
1628 ctx->keyToID.key2id_max = sizeof(ctx->keyToID.key2id)/sizeof(ctx->keyToID.key2id[0]);
1629
1630 for (rgi = 0; rgi != rgcount; ++rgi) {
1631 BAMReadGroup const *rg;
1632
1633 BAM_FileGetReadGroup(bam, rgi, &rg);
1634 if (rg && rg->platform && platform_cmp(rg->platform, "CAPILLARY")) {
1635 G.hasTI = true;
1636 break;
1637 }
1638 }
1639 }
1640
1641 /* setting up more buffers */
1642 rc = KDataBufferMake(&cigBuf, 32, 0);
1643 if (rc)
1644 return rc;
1645
1646 rc = KDataBufferMake(&fragBuf, 8, 1024);
1647 if (rc)
1648 return rc;
1649
1650 rc = KDataBufferMake(&buf, 16, 0);
1651 if (rc)
1652 return rc;
1653
1654 rc = KDataBufferMake(&seqBuffer, 8, 4096);
1655 if (rc)
1656 return rc;
1657
1658 rc = KDataBufferMake(&qualBuffer, 8, 4096);
1659 if (rc)
1660 return rc;
1661
1662 if (rc == 0) {
1663 (void)PLOGMSG(klogInfo, (klogInfo, "Loading '$(file)'", "file=%s", bamFile));
1664 }
1665
1666 while ((rec = getNextRecord(bam, &rc)) != NULL) {
1667 bool aligned;
1668 uint32_t readlen;
1669 uint16_t flags;
1670 int64_t rpos=0;
1671 char *seqDNA;
1672 ctx_value_t *value;
1673 bool wasInserted;
1674 int32_t refSeqId=-1;
1675 uint8_t *qual;
1676 bool mated;
1677 const char *name;
1678 char cskey = 0;
1679 bool originally_aligned;
1680 bool isPrimary;
1681 uint32_t opCount;
1682 bool hasCG = false;
1683 uint64_t ti = 0;
1684 uint32_t csSeqLen = 0;
1685 int lpad = 0;
1686 int rpad = 0;
1687 bool hardclipped = false;
1688 bool revcmp = false;
1689 unsigned readNo = 0;
1690 bool wasPromoted = false;
1691 char const *barCode = NULL;
1692 char const *linkageGroup;
1693
1694 ++recordsRead;
1695
1696 BAM_AlignmentGetReadName2(rec, &name, &namelen);
1697
1698 keyId = rec->keyId;
1699 wasInserted = rec->wasInserted;
1700
1701 rc = MMArrayGet(ctx->id2value, (void **)&value, keyId);
1702 if (rc) {
1703 (void)PLOGERR(klogErr, (klogErr, rc, "MMArrayGet: failed on id '$(id)'", "id=%u", keyId));
1704 goto LOOP_END;
1705 }
1706
1707 {
1708 float const new_value = BAM_FileGetProportionalPosition(bam) * 100.0;
1709 float const delta = new_value - progress;
1710 if (delta > 1.0) {
1711 KLoadProgressbar_Process(ctx->progress[0], delta, false);
1712 progress = new_value;
1713 }
1714 }
1715
1716 BAM_AlignmentGetBarCode(rec, &barCode);
1717 linkageGroup = getLinkageGroup(rec);
1718
1719 if (!G.noColorSpace) {
1720 if (BAM_AlignmentHasColorSpace(rec)) {
1721 if (isNotColorSpace) {
1722 MIXED_BASE_AND_COLOR:
1723 rc = RC(rcApp, rcFile, rcReading, rcData, rcInconsistent);
1724 (void)PLOGERR(klogErr, (klogErr, rc, "File '$(file)' contains base space and color space", "file=%s", bamFile));
1725 goto LOOP_END;
1726 }
1727 /* COLORSPACE is disabled!
1728 * ctx->isColorSpace = isColorSpace = true; */
1729 }
1730 else if (isColorSpace)
1731 goto MIXED_BASE_AND_COLOR;
1732 else
1733 isNotColorSpace = true;
1734 }
1735 BAM_AlignmentGetFlags(rec, &flags);
1736
1737 originally_aligned = (flags & BAMFlags_SelfIsUnmapped) == 0;
1738 aligned = originally_aligned;
1739
1740 mated = false;
1741 if (flags & BAMFlags_WasPaired) {
1742 if ((flags & BAMFlags_IsFirst) != 0)
1743 readNo |= 1;
1744 if ((flags & BAMFlags_IsSecond) != 0)
1745 readNo |= 2;
1746 switch (readNo) {
1747 case 1:
1748 case 2:
1749 mated = true;
1750 break;
1751 case 0:
1752 if ((warned & 1) == 0) {
1753 (void)LOGMSG(klogWarn, "Spots without fragment info have been encountered");
1754 warned |= 1;
1755 }
1756 UNFRAGMENT_MISSING_INFO;
1757 break;
1758 case 3:
1759 if ((warned & 2) == 0) {
1760 (void)LOGMSG(klogWarn, "Spots with more than two fragments have been encountered");
1761 warned |= 2;
1762 }
1763 UNFRAGMENT_TOO_MANY;
1764 break;
1765 }
1766 }
1767 if (!mated)
1768 readNo = 1;
1769
1770 isPrimary = (flags & (BAMFlags_IsNotPrimary|BAMFlags_IsSupplemental)) == 0 ? true : false;
1771 if (G.deferSecondary && !isPrimary && aligned && CTX_VALUE_GET_P_ID(*value, readNo - 1) == 0) {
1772 /* promote to primary alignment */
1773 isPrimary = true;
1774 wasPromoted = true;
1775 }
1776
1777 getSpotGroup(rec, spotGroup);
1778 if (wasInserted) {
1779 if (G.mode == mode_Remap) {
1780 (void)PLOGERR(klogErr, (klogErr, rc = RC(rcApp, rcFile, rcReading, rcData, rcInconsistent),
1781 "Spot '$(name)' is a new spot, not a remapping",
1782 "name=%s", name));
1783 goto LOOP_END;
1784 }
1785 /* first time spot is seen */
1786 /* need to make sure that every goto LOOP_END */
1787 /* above this point is with rc != 0 */
1788 /* else this structure won't get initialized */
1789 memset(value, 0, sizeof(*value));
1790 value->unmated = !mated;
1791 if (isPrimary || G.assembleWithSecondary || G.deferSecondary) {
1792 value->pcr_dup = (flags & BAMFlags_IsDuplicate) == 0 ? 0 : 1;
1793 value->platform = GetINSDCPlatform(bam, spotGroup);
1794 value->primary_is_set = 1;
1795 }
1796 }
1797 if (!isPrimary && G.noSecondary)
1798 goto LOOP_END;
1799
1800 rc = BAM_AlignmentCGReadLength(rec, &readlen);
1801 if (rc != 0 && GetRCState(rc) != rcNotFound) {
1802 (void)LOGERR(klogErr, rc, "Invalid CG data");
1803 goto LOOP_END;
1804 }
1805 if (rc == 0) {
1806 hasCG = true;
1807 BAM_AlignmentGetCigarCount(rec, &opCount);
1808 rc = KDataBufferResize(&cigBuf, opCount * 2 + 5);
1809 if (rc) {
1810 (void)LOGERR(klogErr, rc, "Failed to resize CIGAR buffer");
1811 goto LOOP_END;
1812 }
1813 rc = AlignmentRecordInit(&data, readlen);
1814 if (rc == 0)
1815 rc = KDataBufferResize(&buf, readlen);
1816 if (rc) {
1817 (void)LOGERR(klogErr, rc, "Failed to resize record buffer");
1818 goto LOOP_END;
1819 }
1820
1821 seqDNA = buf.base;
1822 qual = (uint8_t *)&seqDNA[readlen];
1823
1824 rc = BAM_AlignmentGetCGSeqQual(rec, seqDNA, qual);
1825 if (rc == 0) {
1826 rc = BAM_AlignmentGetCGCigar(rec, cigBuf.base, cigBuf.elem_count, &opCount);
1827 }
1828 if (rc) {
1829 (void)LOGERR(klogErr, rc, "Failed to read CG data");
1830 goto LOOP_END;
1831 }
1832 data.data.align_group.elements = 0;
1833 data.data.align_group.buffer = alignGroup;
1834 if (BAM_AlignmentGetCGAlignGroup(rec, alignGroup, sizeof(alignGroup), &alignGroupLen) == 0)
1835 data.data.align_group.elements = alignGroupLen;
1836 }
1837 else {
1838 /* normal flow i.e. NOT CG */
1839 uint32_t const *tmp;
1840
1841 /* resize buffers */
1842 BAM_AlignmentGetReadLength(rec, &readlen);
1843 BAM_AlignmentGetRawCigar(rec, &tmp, &opCount);
1844 rc = KDataBufferResize(&cigBuf, opCount);
1845 assert(rc == 0);
1846 if (rc) {
1847 (void)LOGERR(klogErr, rc, "Failed to resize CIGAR buffer");
1848 goto LOOP_END;
1849 }
1850 memmove(cigBuf.base, tmp, opCount * sizeof(uint32_t));
1851
1852 hardclipped = isHardClipped(opCount, cigBuf.base);
1853 if (hardclipped) {
1854 if (isPrimary && !wasPromoted) {
1855 /* when we promote a secondary to primary and it is hardclipped, we want to "fix" it */
1856 if (!G.acceptHardClip) {
1857 rc = RC(rcApp, rcFile, rcReading, rcConstraint, rcViolated);
1858 (void)PLOGERR(klogErr, (klogErr, rc, "File '$(file)' contains hard clipped primary alignments", "file=%s", bamFile));
1859 goto LOOP_END;
1860 }
1861 }
1862 else if (!G.acceptHardClip) { /* convert to soft clip */
1863 uint32_t *const cigar = cigBuf.base;
1864 uint32_t const lOp = cigar[0];
1865 uint32_t const rOp = cigar[opCount - 1];
1866
1867 lpad = (lOp & 0xF) == 5 ? (lOp >> 4) : 0;
1868 rpad = (rOp & 0xF) == 5 ? (rOp >> 4) : 0;
1869
1870 if (lpad + rpad == 0) {
1871 rc = RC(rcApp, rcFile, rcReading, rcData, rcInvalid);
1872 (void)PLOGERR(klogErr, (klogErr, rc, "File '$(file)' contains invalid CIGAR", "file=%s", bamFile));
1873 goto LOOP_END;
1874 }
1875 if (lpad != 0) {
1876 uint32_t const new_lOp = (((uint32_t)lpad) << 4) | 4;
1877 cigar[0] = new_lOp;
1878 }
1879 if (rpad != 0) {
1880 uint32_t const new_rOp = (((uint32_t)rpad) << 4) | 4;
1881 cigar[opCount - 1] = new_rOp;
1882 }
1883 }
1884 }
1885
1886 if (G.deferSecondary && !isPrimary) {
1887 /*** try to see if hard-clipped secondary alignment can be salvaged **/
1888 if (readlen + lpad + rpad < 256 && readlen + lpad + rpad < value->fragment_len[readNo - 1]) {
1889 rc = KDataBufferResize(&cigBuf, opCount + 1);
1890 assert(rc == 0);
1891 if (rc) {
1892 (void)LOGERR(klogErr, rc, "Failed to resize CIGAR buffer");
1893 goto LOOP_END;
1894 }
1895 if (rpad > 0 && lpad == 0) {
1896 uint32_t *const cigar = cigBuf.base;
1897 lpad = value->fragment_len[readNo - 1] - readlen - rpad;
1898 memmove(cigar + 1, cigar, opCount * sizeof(*cigar));
1899 cigar[0] = (uint32_t)((lpad << 4) | 4);
1900 }
1901 else {
1902 uint32_t *const cigar = cigBuf.base;
1903 rpad += value->fragment_len[readNo - 1] - readlen - lpad;
1904 cigar[opCount] = (uint32_t)((rpad << 4) | 4);
1905 }
1906 opCount++;
1907 }
1908 }
1909 rc = AlignmentRecordInit(&data, readlen + lpad + rpad);
1910 assert(rc == 0);
1911 if (rc == 0)
1912 rc = KDataBufferResize(&buf, readlen + lpad + rpad);
1913 assert(rc == 0);
1914 if (rc) {
1915 (void)LOGERR(klogErr, rc, "Failed to resize record buffer");
1916 goto LOOP_END;
1917 }
1918
1919 seqDNA = buf.base;
1920 qual = (uint8_t *)&seqDNA[(readlen | csSeqLen) + lpad + rpad];
1921 memset(seqDNA, 'N', (readlen | csSeqLen) + lpad + rpad);
1922 memset(qual, 0, (readlen | csSeqLen) + lpad + rpad);
1923
1924 BAM_AlignmentGetSequence(rec, seqDNA + lpad);
1925 if (G.useQUAL) {
1926 uint8_t const *squal;
1927
1928 BAM_AlignmentGetQuality(rec, &squal);
1929 memmove(qual + lpad, squal, readlen);
1930 }
1931 else {
1932 uint8_t const *squal;
1933 uint8_t qoffset = 0;
1934 unsigned i;
1935
1936 rc = BAM_AlignmentGetQuality2(rec, &squal, &qoffset);
1937 if (rc) {
1938 (void)PLOGERR(klogErr, (klogErr, rc, "Spot '$(name)': length of original quality does not match sequence", "name=%s", name));
1939 goto LOOP_END;
1940 }
1941 if (qoffset) {
1942 for (i = 0; i != readlen; ++i)
1943 qual[i + lpad] = squal[i] - qoffset;
1944 QUAL_CHANGED_OQ;
1945 }
1946 else
1947 memmove(qual + lpad, squal, readlen);
1948 }
1949 readlen = readlen + lpad + rpad;
1950 data.data.align_group.elements = 0;
1951 data.data.align_group.buffer = alignGroup;
1952 }
1953 if (G.hasTI) {
1954 rc = BAM_AlignmentGetTI(rec, &ti);
1955 if (rc)
1956 ti = 0;
1957 rc = 0;
1958 }
1959
1960 rc = KDataBufferResize(&seqBuffer, readlen);
1961 if (rc) {
1962 (void)LOGERR(klogErr, rc, "Failed to resize record buffer");
1963 goto LOOP_END;
1964 }
1965 rc = KDataBufferResize(&qualBuffer, readlen);
1966 if (rc) {
1967 (void)LOGERR(klogErr, rc, "Failed to resize record buffer");
1968 goto LOOP_END;
1969 }
1970 AR_REF_ORIENT(data) = (flags & BAMFlags_SelfIsReverse) == 0 ? false : true;
1971
1972 rpos = -1;
1973 if (aligned) {
1974 BAM_AlignmentGetPosition(rec, &rpos);
1975 BAM_AlignmentGetRefSeqId(rec, &refSeqId);
1976 if (refSeqId != lastRefSeqId) {
1977 refSeq = NULL;
1978 BAM_FileGetRefSeqById(bam, refSeqId, &refSeq);
1979 }
1980 }
1981
1982 revcmp = (isColorSpace && !aligned) ? false : AR_REF_ORIENT(data);
1983 (void)PLOGMSG(klogDebug, (klogDebug, "Read '$(name)' is $(or) at $(ref):$(pos)", "name=%s,or=%s,ref=%s,pos=%i", name, revcmp ? "reverse" : "forward", refSeq ? refSeq->name : "(none)", rpos));
1984 COPY_READ(seqBuffer.base, seqDNA, readlen, revcmp);
1985 COPY_QUAL(qualBuffer.base, qual, readlen, revcmp);
1986
1987 AR_MAPQ(data) = GetMapQ(rec);
1988 if (!isPrimary && AR_MAPQ(data) < G.minMapQual)
1989 goto LOOP_END;
1990
1991 if (aligned && align == NULL) {
1992 rc = RC(rcApp, rcFile, rcReading, rcData, rcInconsistent);
1993 (void)PLOGERR(klogErr, (klogErr, rc, "File '$(file)' contains aligned records", "file=%s", bamFile));
1994 goto LOOP_END;
1995 }
1996 while (aligned) {
1997 if (rpos >= 0 && refSeqId >= 0) {
1998 if (refSeqId == skipRefSeqID) {
1999 DISCARD_SKIP_REFERENCE;
2000 goto LOOP_END;
2001 }
2002 if (refSeqId == unmapRefSeqId) {
2003 aligned = false;
2004 UNALIGNED_UNALIGNED_REF;
2005 break;
2006 }
2007 unmapRefSeqId = -1;
2008 if (refSeq == NULL) {
2009 rc = SILENT_RC(rcApp, rcFile, rcReading, rcData, rcInconsistent);
2010 (void)PLOGERR(klogWarn, (klogWarn, rc, "File '$(file)': Spot '$(name)' refers to an unknown Reference number $(refSeqId)", "file=%s,refSeqId=%i,name=%s", bamFile, (int)refSeqId, name));
2011 rc = CheckLimitAndLogError();
2012 DISCARD_UNKNOWN_REFERENCE;
2013 goto LOOP_END;
2014 }
2015 else {
2016 bool shouldUnmap = false;
2017
2018 if (G.refFilter && strcmp(G.refFilter, refSeq->name) != 0) {
2019 (void)PLOGMSG(klogInfo, (klogInfo, "Skipping Reference '$(name)'", "name=%s", refSeq->name));
2020 skipRefSeqID = refSeqId;
2021 DISCARD_SKIP_REFERENCE;
2022 goto LOOP_END;
2023 }
2024
2025 rc = ReferenceSetFile(ref, refSeq->name, refSeq->length, refSeq->checksum, &shouldUnmap, &wasRenamed);
2026 if (rc == 0) {
2027 lastRefSeqId = refSeqId;
2028 if (shouldUnmap) {
2029 aligned = false;
2030 unmapRefSeqId = refSeqId;
2031 UNALIGNED_UNALIGNED_REF;
2032 }
2033 break;
2034 }
2035 if (GetRCObject(rc) == rcConstraint && GetRCState(rc) == rcViolated) {
2036 int const level = G.limit2config ? klogWarn : klogErr;
2037
2038 (void)PLOGMSG(level, (level, "Could not find a Reference to match { name: '$(name)', length: $(rlen) }", "name=%s,rlen=%u", refSeq->name, (unsigned)refSeq->length));
2039 }
2040 else if (!G.limit2config)
2041 (void)PLOGERR(klogErr, (klogErr, rc, "File '$(file)': Spot '$(sname)' refers to an unknown Reference '$(rname)'", "file=%s,rname=%s,sname=%s", bamFile, refSeq->name, name));
2042 if (G.limit2config) {
2043 rc = 0;
2044 UNALIGNED_UNKNOWN_REF;
2045 }
2046 goto LOOP_END;
2047 }
2048 }
2049 else if (refSeqId < 0) {
2050 (void)PLOGMSG(klogWarn, (klogWarn, "Spot '$(name)' was marked aligned, but reference id = $(id) is invalid", "name=%.*s,id=%i", namelen, name, refSeqId));
2051 if ((rc = CheckLimitAndLogError()) != 0) goto LOOP_END;
2052 UNALIGNED_INVALID_REF;
2053 }
2054 else {
2055 (void)PLOGMSG(klogWarn, (klogWarn, "Spot '$(name)' was marked aligned, but reference position = $(pos) is invalid", "name=%.*s,pos=%i", namelen, name, rpos));
2056 if ((rc = CheckLimitAndLogError()) != 0) goto LOOP_END;
2057 UNALIGNED_INVALID_REF_POS;
2058 }
2059
2060 aligned = false;
2061 }
2062 if (!aligned && (G.refFilter != NULL || G.limit2config)) {
2063 assert(!"this shouldn't happen");
2064 goto LOOP_END;
2065 }
2066
2067 AR_KEY(data) = keyId;
2068 AR_READNO(data) = readNo;
2069
2070 if (wasInserted) {
2071 }
2072 else if (isPrimary || G.assembleWithSecondary || G.deferSecondary) {
2073 /* other times */
2074 int o_pcr_dup = value->pcr_dup;
2075 int const n_pcr_dup = (flags & BAMFlags_IsDuplicate) == 0 ? 0 : 1;
2076
2077 if (!value->primary_is_set) {
2078 o_pcr_dup = n_pcr_dup;
2079 value->primary_is_set = 1;
2080 }
2081
2082 value->pcr_dup = o_pcr_dup & n_pcr_dup;
2083 if (o_pcr_dup != (o_pcr_dup & n_pcr_dup)) {
2084 FLAG_CHANGED_PCR_DUP;
2085 }
2086 if (mated && value->unmated) {
2087 (void)PLOGERR(klogWarn, (klogWarn, SILENT_RC(rcApp, rcFile, rcReading, rcData, rcInconsistent),
2088 "Spot '$(name)', which was first seen without mate info, now has mate info",
2089 "name=%s", name));
2090 rc = CheckLimitAndLogError();
2091 DISCARD_BAD_FRAGMENT_INFO;
2092 goto LOOP_END;
2093 }
2094 else if (!mated && !value->unmated) {
2095 (void)PLOGERR(klogWarn, (klogWarn, SILENT_RC(rcApp, rcFile, rcReading, rcData, rcInconsistent),
2096 "Spot '$(name)', which was first seen with mate info, now has no mate info",
2097 "name=%s", name));
2098 rc = CheckLimitAndLogError();
2099 DISCARD_BAD_FRAGMENT_INFO;
2100 goto LOOP_END;
2101 }
2102 }
2103 if (isPrimary) {
2104 switch (readNo) {
2105 case 1:
2106 if (CTX_VALUE_GET_P_ID(*value, 0) != 0) {
2107 isPrimary = false;
2108 FLAG_CHANGED_PRIMARY_DUP;
2109 }
2110 else if (aligned && value->unaligned_1) {
2111 (void)PLOGMSG(klogWarn, (klogWarn, "Read 1 of spot '$(name)', which was unmapped, is now being mapped at position $(pos) on reference '$(ref)'; this alignment will be considered as secondary", "name=%s,ref=%s,pos=%u", name, refSeq->name, rpos));
2112 isPrimary = false;
2113 FLAG_CHANGED_WAS_UNALIGNED;
2114 }
2115 break;
2116 case 2:
2117 if (CTX_VALUE_GET_P_ID(*value, 1) != 0) {
2118 isPrimary = false;
2119 FLAG_CHANGED_PRIMARY_DUP;
2120 }
2121 else if (aligned && value->unaligned_2) {
2122 (void)PLOGMSG(klogWarn, (klogWarn, "Read 2 of spot '$(name)', which was unmapped, is now being mapped at position $(pos) on reference '$(ref)'; this alignment will be considered as secondary", "name=%s,ref=%s,pos=%u", name, refSeq->name, rpos));
2123 isPrimary = false;
2124 FLAG_CHANGED_WAS_UNALIGNED;
2125 }
2126 break;
2127 default:
2128 break;
2129 }
2130 }
2131 if (hardclipped) {
2132 value->hardclipped = 1;
2133 }
2134 #if 0 /** EY TO REVIEW **/
2135 if (!isPrimary && value->hardclipped) {
2136 DISCARD_HARDCLIP_SECONDARY;
2137 goto LOOP_END;
2138 }
2139 #endif
2140
2141 /* input is clean */
2142 ++recordsProcessed;
2143
2144 data.isPrimary = isPrimary;
2145 if (aligned) {
2146 uint32_t matches = 0;
2147 uint32_t misses = 0;
2148 uint8_t rna_orient = ' ';
2149
2150 FixOverhangingAlignment(&cigBuf, &opCount, rpos, refSeq->length, readlen);
2151 BAM_AlignmentGetRNAStrand(rec, &rna_orient);
2152 {
2153 int const intronType = rna_orient == '+' ? NCBI_align_ro_intron_plus :
2154 rna_orient == '-' ? NCBI_align_ro_intron_minus :
2155 hasCG ? NCBI_align_ro_complete_genomics :
2156 NCBI_align_ro_intron_unknown;
2157 rc = ReferenceRead(ref, &data, rpos, cigBuf.base, opCount, seqDNA, readlen, intronType, &matches, &misses);
2158 }
2159 if (rc == 0) {
2160 int const i = readNo - 1;
2161 int const clipped_rl = readlen < 255 ? readlen : 255;
2162 if (i >= 0 && i < 2) {
2163 int const rl = value->fragment_len[i];
2164
2165 if (rl == 0)
2166 value->fragment_len[i] = clipped_rl;
2167 else if (rl != clipped_rl) {
2168 if (isPrimary) {
2169 rc = RC(rcApp, rcFile, rcReading, rcConstraint, rcViolated);
2170 (void)PLOGERR(klogErr, (klogErr, rc, "Primary alignment for '$(name)' has different length ($(len)) than previously recorded secondary alignment. Try to defer secondary alignment processing.",
2171 "name=%s,len=%d", name, readlen));
2172 }
2173 else {
2174 rc = SILENT_RC(rcApp, rcFile, rcReading, rcConstraint, rcViolated);
2175 (void)PLOGERR(klogWarn, (klogWarn, rc, "Secondary alignment for '$(name)' has different length ($(len)) than previously recorded primary alignment; discarding secondary alignment.",
2176 "name=%s,len=%d", name, readlen));
2177 DISCARD_BAD_SECONDARY;
2178 rc = CheckLimitAndLogError();
2179 }
2180 goto LOOP_END;
2181 }
2182 }
2183 }
2184 if (rc == 0 && (matches < G.minMatchCount || (matches == 0 && !G.acceptNoMatch))) {
2185 if (isPrimary) {
2186 if (misses > matches) {
2187 RecordNoMatch(name, refSeq->name, rpos);
2188 rc = LogNoMatch(name, refSeq->name, (unsigned)rpos, (unsigned)matches);
2189 if (rc)
2190 goto LOOP_END;
2191 }
2192 }
2193 else {
2194 (void)PLOGMSG(klogWarn, (klogWarn, "Spot '$(name)' contains too few ($(count)) matching bases to reference '$(ref)' at $(pos); discarding secondary alignment",
2195 "name=%s,ref=%s,pos=%u,count=%u", name, refSeq->name, (unsigned)rpos, (unsigned)matches));
2196 DISCARD_BAD_SECONDARY;
2197 rc = 0;
2198 goto LOOP_END;
2199 }
2200 }
2201 if (rc) {
2202 aligned = false;
2203
2204 if (((int)GetRCObject(rc)) == ((int)rcData) && GetRCState(rc) == rcNotAvailable) {
2205 /* because of code above converting hard clips to soft clips, this should be unreachable */
2206 abort();
2207 }
2208 else if (((int)GetRCObject(rc)) == ((int)rcData)) {
2209 UNALIGNED_INVALID_INFO;
2210 (void)PLOGERR(klogWarn, (klogWarn, rc, "Spot '$(name)': bad alignment to reference '$(ref)' at $(pos)", "name=%s,ref=%s,pos=%u", name, refSeq->name, rpos));
2211 /* Data errors may get reset; alignment will be unmapped at any rate */
2212 rc = CheckLimitAndLogError();
2213 }
2214 else {
2215 UNALIGNED_INVALID_REF_POS;
2216 (void)PLOGERR(klogWarn, (klogWarn, rc, "Spot '$(name)': error reading reference '$(ref)' at $(pos)", "name=%s,ref=%s,pos=%u", name, refSeq->name, rpos));
2217 rc = CheckLimitAndLogError();
2218 }
2219 if (rc) goto LOOP_END;
2220 }
2221 }
2222
2223 if (!aligned && isPrimary) {
2224 switch (readNo) {
2225 case 1:
2226 value->unaligned_1 = 1;
2227 break;
2228 case 2:
2229 value->unaligned_2 = 1;
2230 break;
2231 default:
2232 break;
2233 }
2234 }
2235 if (isPrimary) {
2236 switch (readNo) {
2237 case 1:
2238 if (CTX_VALUE_GET_P_ID(*value, 0) == 0 && aligned) {
2239 data.alignId = ++ctx->primaryId;
2240 CTX_VALUE_SET_P_ID(*value, 0, data.alignId);
2241 }
2242 break;
2243 case 2:
2244 if (CTX_VALUE_GET_P_ID(*value, 1) == 0 && aligned) {
2245 data.alignId = ++ctx->primaryId;
2246 CTX_VALUE_SET_P_ID(*value, 1, data.alignId);
2247 }
2248 break;
2249 default:
2250 break;
2251 }
2252 }
2253 if (G.mode == mode_Archive)
2254 goto WRITE_SEQUENCE;
2255 else
2256 goto WRITE_ALIGNMENT;
2257 if (0) {
2258 WRITE_SEQUENCE:
2259 if (barCode) {
2260 if (spotGroup[0] != '\0' && value->platform == SRA_PLATFORM_UNDEFINED) {
2261 /* don't use bar code */
2262 }
2263 else {
2264 unsigned const sglen = strlen(barCode);
2265 if (sglen + 1 < sizeof(spotGroup))
2266 memmove(spotGroup, barCode, sglen + 1);
2267 }
2268 }
2269 if (mated) {
2270 int64_t const spotId = CTX_VALUE_GET_S_ID(*value);
2271 uint32_t const fragmentId = value->fragmentId;
2272 bool const spotHasBeenWritten = (spotId != 0);
2273 bool const spotHasFragmentInfo = (fragmentId != 0);
2274 bool const spotIsFirstSeen = (spotHasBeenWritten || spotHasFragmentInfo) ? false : true;
2275
2276 if (spotHasBeenWritten) {
2277 /* do nothing */
2278 }
2279 else if (spotIsFirstSeen) {
2280 /* start spot assembly */
2281 unsigned sz;
2282 FragmentInfo fi;
2283 int32_t mate_refSeqId = -1;
2284 int64_t pnext = 0;
2285
2286 if (!isPrimary) {
2287 if ( (!G.assembleWithSecondary || hardclipped) && !G.deferSecondary ) {
2288 goto WRITE_ALIGNMENT;
2289 }
2290 (void)PLOGMSG(klogDebug, (klogDebug, "Spot '$(name)' (id $(id)) is being constructed from secondary alignment information", "id=%lx,name=%s", keyId, name));
2291 }
2292 memset(&fi, 0, sizeof(fi));
2293
2294 fi.aligned = isPrimary ? aligned : 0;
2295 fi.ti = ti;
2296 fi.orientation = AR_REF_ORIENT(data);
2297 fi.readNo = readNo;
2298 fi.sglen = strlen(spotGroup);
2299 fi.lglen = strlen(linkageGroup);
2300
2301 fi.readlen = readlen;
2302 fi.cskey = cskey;
2303 fi.is_bad = (flags & BAMFlags_IsLowQuality) != 0;
2304 sz = sizeof(fi) + 2*fi.readlen + fi.sglen + fi.lglen;
2305 if (align) {
2306 BAM_AlignmentGetMateRefSeqId(rec, &mate_refSeqId);
2307 BAM_AlignmentGetMatePosition(rec, &pnext);
2308 }
2309 if(align && mate_refSeqId == refSeqId && pnext > 0 && pnext!=rpos /*** weird case in some bams**/){
2310 rc = MemBankAlloc(ctx->frags, &value->fragmentId, sz, 0, false);
2311 fcountBoth++;
2312 } else {
2313 rc = MemBankAlloc(ctx->frags, &value->fragmentId, sz, 0, true);
2314 fcountOne++;
2315 }
2316 if (rc) {
2317 (void)LOGERR(klogErr, rc, "KMemBankAlloc failed");
2318 goto LOOP_END;
2319 }
2320 /*printf("IN:%10d\tcnt2=%ld\tcnt1=%ld\n",value->fragmentId,fcountBoth,fcountOne);*/
2321
2322 rc = KDataBufferResize(&fragBuf, sz);
2323 if (rc) {
2324 (void)LOGERR(klogErr, rc, "Failed to resize fragment buffer");
2325 goto LOOP_END;
2326 }
2327 {{
2328 uint8_t *dst = (uint8_t*) fragBuf.base;
2329
2330 memmove(dst,&fi,sizeof(fi));
2331 dst += sizeof(fi);
2332 memmove(dst, seqBuffer.base, readlen);
2333 dst += readlen;
2334 memmove(dst, qualBuffer.base, readlen);
2335 dst += fi.readlen;
2336 memmove(dst, spotGroup, fi.sglen);
2337 dst += fi.sglen;
2338 memmove(dst, linkageGroup, fi.lglen);
2339 dst += fi.lglen;
2340 }}
2341 rc = MemBankWrite(ctx->frags, value->fragmentId, 0, fragBuf.base, sz, &rsize);
2342 if (rc) {
2343 (void)PLOGERR(klogErr, (klogErr, rc, "KMemBankWrite failed writing fragment $(id)", "id=%u", value->fragmentId));
2344 goto LOOP_END;
2345 }
2346 if (revcmp) {
2347 QUAL_CHANGED_REVERSED;
2348 SEQ__CHANGED_REV_COMP;
2349 }
2350 }
2351 else if (spotHasFragmentInfo) {
2352 /* continue spot assembly */
2353 FragmentInfo *fip;
2354 {
2355 size_t size1;
2356 size_t size2;
2357
2358 rc = MemBankSize(ctx->frags, fragmentId, &size1);
2359 if (rc) {
2360 (void)PLOGERR(klogErr, (klogErr, rc, "KMemBankSize failed on fragment $(id)", "id=%u", fragmentId));
2361 goto LOOP_END;
2362 }
2363
2364 rc = KDataBufferResize(&fragBuf, size1);
2365 fip = (FragmentInfo *)fragBuf.base;
2366 if (rc) {
2367 (void)PLOGERR(klogErr, (klogErr, rc, "Failed to resize fragment buffer", ""));
2368 goto LOOP_END;
2369 }
2370
2371 rc = MemBankRead(ctx->frags, fragmentId, 0, fragBuf.base, size1, &size2);
2372 if (rc) {
2373 (void)PLOGERR(klogErr, (klogErr, rc, "KMemBankRead failed on fragment $(id)", "id=%u", fragmentId));
2374 goto LOOP_END;
2375 }
2376 assert(size1 == size2);
2377 }
2378 if (readNo == fip->readNo) {
2379 /* is a repeat of the same read; do nothing */
2380 }
2381 else {
2382 /* mate found; finish spot assembly */
2383 unsigned read1 = 0;
2384 unsigned read2 = 1;
2385 char const *const seq1 = (void *)&fip[1];
2386 char const *const qual1 = (void *)(seq1 + fip->readlen);
2387 char const *const sg1 = (void *)(qual1 + fip->readlen);
2388 char const *const bx1 = (void *)(sg1 + fip->sglen);
2389
2390 if (!isPrimary) {
2391 if ((!G.assembleWithSecondary || hardclipped) && !G.deferSecondary ) {
2392 goto WRITE_ALIGNMENT;
2393 }
2394 (void)PLOGMSG(klogDebug, (klogDebug, "Spot '$(name)' (id $(id)) is being constructed from secondary alignment information", "id=%lx,name=%s", keyId, name));
2395 }
2396 rc = KDataBufferResize(&seqBuffer, readlen + fip->readlen);
2397 if (rc) {
2398 (void)LOGERR(klogErr, rc, "Failed to resize record buffer");
2399 goto LOOP_END;
2400 }
2401 rc = KDataBufferResize(&qualBuffer, readlen + fip->readlen);
2402 if (rc) {
2403 (void)LOGERR(klogErr, rc, "Failed to resize record buffer");
2404 goto LOOP_END;
2405 }
2406 if (readNo < fip->readNo) {
2407 read1 = 1;
2408 read2 = 0;
2409 }
2410
2411 memset(&srecStorage, 0, sizeof(srecStorage));
2412 srec.numreads = 2;
2413 srec.readLen[read1] = fip->readlen;
2414 srec.readLen[read2] = readlen;
2415 srec.readStart[1] = srec.readLen[0];
2416 {
2417 char const *const s1 = seq1;
2418 char const *const s2 = seqBuffer.base;
2419 char *const d = seqBuffer.base;
2420 char *const d1 = d + srec.readStart[read1];
2421 char *const d2 = d + srec.readStart[read2];
2422
2423 srec.seq = seqBuffer.base;
2424 if (d2 != s2) {
2425 memmove(d2, s2, readlen);
2426 }
2427 memmove(d1, s1, fip->readlen);
2428 }
2429 {
2430 char const *const s1 = qual1;
2431 char const *const s2 = qualBuffer.base;
2432 char *const d = qualBuffer.base;
2433 char *const d1 = d + srec.readStart[read1];
2434 char *const d2 = d + srec.readStart[read2];
2435
2436 srec.qual = qualBuffer.base;
2437 if (d2 != s2) {
2438 memmove(d2, s2, readlen);
2439 }
2440 memmove(d1, s1, fip->readlen);
2441 }
2442
2443 srec.ti[read1] = fip->ti;
2444 srec.ti[read2] = ti;
2445
2446 srec.aligned[read1] = fip->aligned;
2447 srec.aligned[read2] = isPrimary ? aligned : 0;
2448
2449 srec.is_bad[read1] = fip->is_bad;
2450 srec.is_bad[read2] = (flags & BAMFlags_IsLowQuality) != 0;
2451
2452 srec.orientation[read1] = fip->orientation;
2453 srec.orientation[read2] = AR_REF_ORIENT(data);
2454
2455 srec.cskey[read1] = fip->cskey;
2456 srec.cskey[read2] = cskey;
2457
2458 srec.keyId = keyId;
2459
2460 srec.spotGroup = sg1;
2461 srec.spotGroupLen = fip->sglen;
2462
2463 srec.linkageGroup = bx1;
2464 srec.linkageGroupLen = fip->lglen;
2465
2466 srec.seq = seqBuffer.base;
2467 srec.qual = qualBuffer.base;
2468
2469 rc = SequenceWriteRecord(seq, &srec, isColorSpace, value->pcr_dup, value->platform);
2470 if (rc) {
2471 (void)LOGERR(klogErr, rc, "SequenceWriteRecord failed");
2472 goto LOOP_END;
2473 }
2474 CTX_VALUE_SET_S_ID(*value, ++ctx->spotId);
2475 if(fragmentId & 1){
2476 fcountOne--;
2477 } else {
2478 fcountBoth--;
2479 }
2480 /* printf("OUT:%9d\tcnt2=%ld\tcnt1=%ld\n",fragmentId,fcountBoth,fcountOne);*/
2481 rc = MemBankFree(ctx->frags, fragmentId);
2482 if (rc) {
2483 (void)PLOGERR(klogErr, (klogErr, rc, "KMemBankFree failed on fragment $(id)", "id=%u", fragmentId));
2484 goto LOOP_END;
2485 }
2486 value->fragmentId = 0;
2487 if (revcmp) {
2488 QUAL_CHANGED_REVERSED;
2489 SEQ__CHANGED_REV_COMP;
2490 }
2491 if (value->pcr_dup && (srec.is_bad[0] || srec.is_bad[1])) {
2492 FLAG_CHANGED_400_AND_200;
2493 filterFlagConflictRecords++;
2494 if (filterFlagConflictRecords < MAX_WARNINGS_FLAG_CONFLICT) {
2495 (void)PLOGMSG(klogWarn, (klogWarn, "Spot '$(name)': both 0x400 and 0x200 flag bits set, only 0x400 will be saved", "name=%s", name));
2496 }
2497 else if (filterFlagConflictRecords == MAX_WARNINGS_FLAG_CONFLICT) {
2498 (void)PLOGMSG(klogWarn, (klogWarn, "Last reported warning: Spot '$(name)': both 0x400 and 0x200 flag bits set, only 0x400 will be saved", "name=%s", name));
2499 }
2500 }
2501 }
2502 }
2503 else {
2504 (void)PLOGMSG(klogErr, (klogErr, "Spot '$(name)' has caused the loader to enter an illogical state", "name=%s", name));
2505 assert("this should never happen");
2506 }
2507 }
2508 else if (CTX_VALUE_GET_S_ID(*value) == 0) {
2509 /* new unmated fragment - no spot assembly */
2510 if (!isPrimary) {
2511 if ((!G.assembleWithSecondary || hardclipped) && !G.deferSecondary ) {
2512 goto WRITE_ALIGNMENT;
2513 }
2514 (void)PLOGMSG(klogDebug, (klogDebug, "Spot '$(name)' (id $(id)) is being constructed from secondary alignment information", "id=%lx,name=%s", keyId, name));
2515 }
2516 memset(&srecStorage, 0, sizeof(srecStorage));
2517 srec.numreads = 1;
2518
2519 srec.readLen[0] = readlen;
2520 srec.ti[0] = ti;
2521 srec.aligned[0] = isPrimary ? aligned : 0;
2522 srec.is_bad[0] = (flags & BAMFlags_IsLowQuality) != 0;
2523 srec.orientation[0] = AR_REF_ORIENT(data);
2524 srec.cskey[0] = cskey;
2525
2526 srec.keyId = keyId;
2527
2528 srec.spotGroup = spotGroup;
2529 srec.spotGroupLen = strlen(spotGroup);
2530
2531 srec.linkageGroup = linkageGroup;
2532 srec.linkageGroupLen = strlen(linkageGroup);
2533
2534 srec.seq = seqBuffer.base;
2535 srec.qual = qualBuffer.base;
2536
2537 rc = SequenceWriteRecord(seq, &srec, isColorSpace, value->pcr_dup, value->platform);
2538 if (rc) {
2539 (void)PLOGERR(klogErr, (klogErr, rc, "SequenceWriteRecord failed", ""));
2540 goto LOOP_END;
2541 }
2542 CTX_VALUE_SET_S_ID(*value, ++ctx->spotId);
2543 value->fragmentId = 0;
2544 if (value->pcr_dup && srec.is_bad[0]) {
2545 FLAG_CHANGED_400_AND_200;
2546 filterFlagConflictRecords++;
2547 if (filterFlagConflictRecords < MAX_WARNINGS_FLAG_CONFLICT) {
2548 (void)PLOGMSG(klogWarn, (klogWarn, "Spot '$(name)': both 0x400 and 0x200 flag bits set, only 0x400 will be saved", "name=%s", name));
2549 }
2550 else if (filterFlagConflictRecords == MAX_WARNINGS_FLAG_CONFLICT) {
2551 (void)PLOGMSG(klogWarn, (klogWarn, "Last reported warning: Spot '$(name)': both 0x400 and 0x200 flag bits set, only 0x400 will be saved", "name=%s", name));
2552 }
2553 }
2554 if (revcmp) {
2555 QUAL_CHANGED_REVERSED;
2556 SEQ__CHANGED_REV_COMP;
2557 }
2558 }
2559 }
2560 WRITE_ALIGNMENT:
2561 if (aligned) {
2562 if (mated && !isPrimary) {
2563 int32_t bam_mrid;
2564 int64_t mpos;
2565 int64_t mrid = 0;
2566 int64_t tlen;
2567
2568 BAM_AlignmentGetMatePosition(rec, &mpos);
2569 BAM_AlignmentGetMateRefSeqId(rec, &bam_mrid);
2570 BAM_AlignmentGetInsertSize(rec, &tlen);
2571
2572 if (mpos >= 0 && bam_mrid >= 0 && tlen != 0) {
2573 BAMRefSeq const *mref;
2574
2575 BAM_FileGetRefSeq(bam, bam_mrid, &mref);
2576 if (mref) {
2577 rc_t rc_temp = ReferenceGet1stRow(ref, &mrid, mref->name);
2578 if (rc_temp == 0) {
2579 data.mate_ref_pos = mpos;
2580 data.template_len = tlen;
2581 data.mate_ref_orientation = (flags & BAMFlags_MateIsReverse) ? 1 : 0;
2582 }
2583 else {
2584 (void)PLOGERR(klogWarn, (klogWarn, rc_temp, "Failed to get refID for $(name)", "name=%s", mref->name));
2585 MATE_INFO_LOST_UNKNOWN_REF;
2586 }
2587 data.mate_ref_id = mrid;
2588 }
2589 else {
2590 MATE_INFO_LOST_INVALID;
2591 }
2592 }
2593 else if (mpos >= 0 || bam_mrid >= 0 || tlen != 0) {
2594 MATE_INFO_LOST_MISSING;
2595 }
2596 }
2597
2598 if (wasRenamed) {
2599 RENAMED_REFERENCE;
2600 }
2601 if (value->alignmentCount[readNo - 1] < 254)
2602 ++value->alignmentCount[readNo - 1];
2603 ++ctx->alignCount;
2604
2605 assert(keyId >> 32 < ctx->keyToID.key2id_count);
2606 assert((uint32_t)keyId < ctx->keyToID.idCount[keyId >> 32]);
2607
2608 if (linkageGroup[0] != '\0') {
2609 AR_LINKAGE_GROUP(data).elements = strlen(linkageGroup);
2610 AR_LINKAGE_GROUP(data).buffer = linkageGroup;
2611 }
2612
2613 rc = AlignmentWriteRecord(align, &data);
2614 if (rc == 0) {
2615 if (!isPrimary)
2616 data.alignId = ++ctx->secondId;
2617
2618 rc = ReferenceAddAlignId(ref, data.alignId, isPrimary);
2619 if (rc) {
2620 (void)PLOGERR(klogErr, (klogErr, rc, "ReferenceAddAlignId failed", ""));
2621 }
2622 else {
2623 *had_alignments = true;
2624 }
2625 }
2626 else {
2627 (void)PLOGERR(klogErr, (klogErr, rc, "AlignmentWriteRecord failed", ""));
2628 }
2629 }
2630 /**************************************************************/
2631
2632 LOOP_END:
2633 BAM_AlignmentRelease(rec);
2634 ++reccount;
2635 if (G.maxAlignCount > 0 && reccount >= G.maxAlignCount)
2636 break;
2637 if (rc == 0)
2638 *had_sequences = true;
2639 else
2640 break;
2641 }
2642 if (bamread_thread != NULL && bamq != NULL) {
2643 KQueueSeal(bamq);
2644 for ( ; ; ) {
2645 timeout_t tm;
2646 void *rr = NULL;
2647 rc_t rc2;
2648
2649 TimeoutInit(&tm, 1000);
2650 rc2 = KQueuePop(bamq, &rr, &tm);
2651 if (rc2) break;
2652 BAM_AlignmentRelease((BAM_Alignment *)rr);
2653 }
2654 KThreadWait(bamread_thread, NULL);
2655 }
2656 KThreadRelease(bamread_thread);
2657 KQueueRelease(bamq);
2658
2659 if (rc) {
2660 if ( (GetRCModule(rc) == rcCont && (int)GetRCObject(rc) == rcData && GetRCState(rc) == rcDone)
2661 || (GetRCModule(rc) == rcAlign && GetRCObject(rc) == rcRow && GetRCState(rc) == rcNotFound))
2662 {
2663 (void)PLOGMSG(klogInfo, (klogInfo, "EOF '$(file)'; processed $(proc)", "file=%s,read=%lu,proc=%lu", bamFile, (unsigned long)recordsRead, (unsigned long)recordsProcessed));
2664 rc = 0;
2665 }
2666 else {
2667 (void)PLOGERR(klogInfo, (klogInfo, rc, "Error '$(file)'; read $(read); processed $(proc)", "file=%s,read=%lu,proc=%lu", bamFile, (unsigned long)recordsRead, (unsigned long)recordsProcessed));
2668 }
2669 }
2670 if (filterFlagConflictRecords > 0) {
2671 (void)PLOGMSG(klogWarn, (klogWarn, "$(cnt1) out of $(cnt2) records contained warning : both 0x400 and 0x200 flag bits set, only 0x400 will be saved", "cnt1=%lu,cnt2=%lu", filterFlagConflictRecords,recordsProcessed));
2672 }
2673 if (rc == 0 && recordsProcessed == 0) {
2674 (void)LOGMSG(klogWarn, (G.limit2config || G.refFilter != NULL) ?
2675 "All records from the file were filtered out" :
2676 "The file contained no records that were processed.");
2677 rc = RC(rcAlign, rcFile, rcReading, rcData, rcEmpty);
2678 }
2679
2680 BAM_FileRelease(bam);
2681 MMArrayLock(ctx->id2value);
2682 KDataBufferWhack(&seqBuffer);
2683 KDataBufferWhack(&qualBuffer);
2684 KDataBufferWhack(&buf);
2685 KDataBufferWhack(&fragBuf);
2686 KDataBufferWhack(&cigBuf);
2687 KDataBufferWhack(&data.buffer);
2688 return rc;
2689 }
2690
WriteSoloFragments(context_t * ctx,Sequence * seq)2691 static rc_t WriteSoloFragments(context_t *ctx, Sequence *seq)
2692 {
2693 uint32_t i;
2694 unsigned j;
2695 uint64_t idCount = 0;
2696 rc_t rc;
2697 KDataBuffer fragBuf;
2698 SequenceRecordStorage srecStorage;
2699 SequenceRecord srec;
2700
2701 ++ctx->pass;
2702 memset(&srec, 0, sizeof(srec));
2703
2704 srec.ti = srecStorage.ti;
2705 srec.readStart = srecStorage.readStart;
2706 srec.readLen = srecStorage.readLen;
2707 srec.orientation = srecStorage.orientation;
2708 srec.is_bad = srecStorage.is_bad;
2709 srec.alignmentCount = srecStorage.alignmentCount;
2710 srec.aligned = srecStorage.aligned;
2711 srec.cskey = srecStorage. cskey;
2712
2713 rc = KDataBufferMake(&fragBuf, 8, 0);
2714 if (rc) {
2715 (void)LOGERR(klogErr, rc, "KDataBufferMake failed");
2716 return rc;
2717 }
2718 for (idCount = 0, j = 0; j < ctx->keyToID.key2id_count; ++j) {
2719 idCount += ctx->keyToID.idCount[j];
2720 }
2721 KLoadProgressbar_Append(ctx->progress[ctx->pass - 1], idCount);
2722
2723 for (idCount = 0, j = 0; j < ctx->keyToID.key2id_count; ++j) {
2724 for (i = 0; i != ctx->keyToID.idCount[j]; ++i, ++idCount) {
2725 uint64_t const keyId = ((uint64_t)j << 32) | i;
2726 ctx_value_t *value;
2727 size_t rsize;
2728 size_t sz;
2729 char const *src;
2730 FragmentInfo const *fip;
2731
2732 rc = MMArrayGet(ctx->id2value, (void **)&value, keyId);
2733 if (rc)
2734 break;
2735 KLoadProgressbar_Process(ctx->progress[ctx->pass - 1], 1, false);
2736 if (value->fragmentId == 0)
2737 continue;
2738
2739 rc = MemBankSize(ctx->frags, value->fragmentId, &sz);
2740 if (rc) {
2741 (void)LOGERR(klogErr, rc, "KMemBankSize failed");
2742 break;
2743 }
2744 rc = KDataBufferResize(&fragBuf, (size_t)sz);
2745 if (rc) {
2746 (void)LOGERR(klogErr, rc, "KDataBufferResize failed");
2747 break;
2748 }
2749 rc = MemBankRead(ctx->frags, value->fragmentId, 0, fragBuf.base, sz, &rsize);
2750 if (rc) {
2751 (void)LOGERR(klogErr, rc, "KMemBankRead failed");
2752 break;
2753 }
2754 assert( rsize == sz );
2755
2756 fip = fragBuf.base;
2757 src = (char const *)&fip[1];
2758
2759 memset(&srecStorage, 0, sizeof(srecStorage));
2760 if (value->unmated) {
2761 srec.numreads = 1;
2762 srec.readLen[0] = fip->readlen;
2763 srec.ti[0] = fip->ti;
2764 srec.aligned[0] = fip->aligned;
2765 srec.is_bad[0] = fip->is_bad;
2766 srec.orientation[0] = fip->orientation;
2767 srec.cskey[0] = fip->cskey;
2768 }
2769 else {
2770 unsigned const read = ((fip->aligned && CTX_VALUE_GET_P_ID(*value, 0) == 0) || value->unaligned_2) ? 1 : 0;
2771
2772 srec.numreads = 2;
2773 srec.readLen[read] = fip->readlen;
2774 srec.readStart[1] = srec.readLen[0];
2775 srec.ti[read] = fip->ti;
2776 srec.aligned[read] = fip->aligned;
2777 srec.is_bad[read] = fip->is_bad;
2778 srec.orientation[read] = fip->orientation;
2779 srec.cskey[0] = srec.cskey[1] = 'N';
2780 srec.cskey[read] = fip->cskey;
2781 }
2782 srec.seq = (char *)src;
2783 srec.qual = (uint8_t *)(src + fip->readlen);
2784 srec.spotGroup = (char *)(src + 2 * fip->readlen);
2785 srec.spotGroupLen = fip->sglen;
2786 srec.linkageGroup = (char *)(src + 2 * fip->readlen * fip->sglen);
2787 srec.linkageGroupLen = fip->lglen;
2788 srec.keyId = keyId;
2789 rc = SequenceWriteRecord(seq, &srec, ctx->isColorSpace, value->pcr_dup, value->platform);
2790 if (rc) {
2791 (void)LOGERR(klogErr, rc, "SequenceWriteRecord failed");
2792 break;
2793 }
2794 /*rc = KMemBankFree(frags, id);*/
2795 CTX_VALUE_SET_S_ID(*value, ++ctx->spotId);
2796 }
2797 }
2798 MMArrayLock(ctx->id2value);
2799 KDataBufferWhack(&fragBuf);
2800 return rc;
2801 }
2802
SequenceUpdateAlignInfo(context_t * ctx,Sequence * seq)2803 static rc_t SequenceUpdateAlignInfo(context_t *ctx, Sequence *seq)
2804 {
2805 rc_t rc = 0;
2806 uint64_t row;
2807 uint64_t keyId;
2808
2809 ++ctx->pass;
2810 KLoadProgressbar_Append(ctx->progress[ctx->pass - 1], ctx->spotId + 1);
2811
2812 for (row = 1; row <= ctx->spotId; ++row) {
2813 ctx_value_t *value;
2814
2815 rc = SequenceReadKey(seq, row, &keyId);
2816 if (rc) {
2817 (void)PLOGERR(klogErr, (klogErr, rc, "Failed to get key for row $(row)", "row=%u", (unsigned)row));
2818 break;
2819 }
2820 rc = MMArrayGet(ctx->id2value, (void **)&value, keyId);
2821 if (rc) {
2822 (void)PLOGERR(klogErr, (klogErr, rc, "Failed to read info for row $(row), index $(idx)", "row=%u,idx=%u", (unsigned)row, (unsigned)keyId));
2823 break;
2824 }
2825 if (G.mode == mode_Remap) {
2826 CTX_VALUE_SET_S_ID(*value, row);
2827 }
2828 if (row != CTX_VALUE_GET_S_ID(*value)) {
2829 rc = RC(rcApp, rcTable, rcWriting, rcData, rcUnexpected);
2830 (void)PLOGMSG(klogErr, (klogErr, "Unexpected spot id $(spotId) for row $(row), index $(idx)", "spotId=%u,row=%u,idx=%u", (unsigned)CTX_VALUE_GET_S_ID(*value), (unsigned)row, (unsigned)keyId));
2831 break;
2832 }
2833 {{
2834 int64_t primaryId[2];
2835 int const logLevel = klogWarn; /*G.assembleWithSecondary ? klogWarn : klogErr;*/
2836
2837 primaryId[0] = CTX_VALUE_GET_P_ID(*value, 0);
2838 primaryId[1] = CTX_VALUE_GET_P_ID(*value, 1);
2839
2840 if (primaryId[0] == 0 && value->alignmentCount[0] != 0) {
2841 rc = RC(rcApp, rcTable, rcWriting, rcConstraint, rcViolated);
2842 (void)PLOGERR(logLevel, (logLevel, rc, "Spot id $(id) read 1 never had a primary alignment", "id=%lx", keyId));
2843 }
2844 if (!value->unmated && primaryId[1] == 0 && value->alignmentCount[1] != 0) {
2845 rc = RC(rcApp, rcTable, rcWriting, rcConstraint, rcViolated);
2846 (void)PLOGERR(logLevel, (logLevel, rc, "Spot id $(id) read 2 never had a primary alignment", "id=%lx", keyId));
2847 }
2848 if (rc != 0 && logLevel == klogErr)
2849 break;
2850
2851 rc = SequenceUpdateAlignData(seq, row, value->unmated ? 1 : 2,
2852 primaryId,
2853 value->alignmentCount);
2854 }}
2855 if (rc) {
2856 (void)LOGERR(klogErr, rc, "Failed updating Alignment data in sequence table");
2857 break;
2858 }
2859 KLoadProgressbar_Process(ctx->progress[ctx->pass - 1], 1, false);
2860 }
2861 MMArrayLock(ctx->id2value);
2862 return rc;
2863 }
2864
AlignmentUpdateSpotInfo(context_t * ctx,Alignment * align)2865 static rc_t AlignmentUpdateSpotInfo(context_t *ctx, Alignment *align)
2866 {
2867 rc_t rc;
2868 uint64_t keyId;
2869
2870 ++ctx->pass;
2871
2872 KLoadProgressbar_Append(ctx->progress[ctx->pass - 1], ctx->alignCount);
2873
2874 rc = AlignmentStartUpdatingSpotIds(align);
2875 while (rc == 0 && (rc = Quitting()) == 0) {
2876 ctx_value_t *value;
2877
2878 rc = AlignmentGetSpotKey(align, &keyId);
2879 if (rc) {
2880 if (GetRCObject(rc) == rcRow && GetRCState(rc) == rcNotFound)
2881 rc = 0;
2882 break;
2883 }
2884 assert(keyId >> 32 < ctx->keyToID.key2id_count);
2885 assert((uint32_t)keyId < ctx->keyToID.idCount[keyId >> 32]);
2886 rc = MMArrayGet(ctx->id2value, (void **)&value, keyId);
2887 if (rc == 0) {
2888 int64_t const spotId = CTX_VALUE_GET_S_ID(*value);
2889
2890 if (spotId == 0) {
2891 rc = RC(rcApp, rcTable, rcWriting, rcConstraint, rcViolated);
2892 (void)PLOGERR(klogErr, (klogErr, rc, "Spot '$(id)' was never assigned a spot id, probably has no primary alignments", "id=%lx", keyId));
2893 break;
2894 }
2895 rc = AlignmentWriteSpotId(align, spotId);
2896 }
2897 KLoadProgressbar_Process(ctx->progress[ctx->pass - 1], 1, false);
2898 }
2899 MMArrayLock(ctx->id2value);
2900 return rc;
2901 }
2902
2903
ArchiveBAM(VDBManager * mgr,VDatabase * db,unsigned bamFiles,char const * bamFile[],unsigned seqFiles,char const * seqFile[],bool * has_alignments,bool continuing)2904 static rc_t ArchiveBAM(VDBManager *mgr, VDatabase *db,
2905 unsigned bamFiles, char const *bamFile[],
2906 unsigned seqFiles, char const *seqFile[],
2907 bool *has_alignments,
2908 bool continuing)
2909 {
2910 rc_t rc = 0;
2911 rc_t rc2;
2912 Reference ref;
2913 Sequence seq;
2914 Alignment *align;
2915 static context_t *ctx = &GlobalContext;
2916 bool has_sequences = false;
2917 unsigned i;
2918
2919 *has_alignments = false;
2920 rc = ReferenceInit(&ref, mgr, db);
2921 if (rc)
2922 return rc;
2923
2924 if (G.onlyVerifyReferences) {
2925 for (i = 0; i < bamFiles && rc == 0; ++i) {
2926 rc = ProcessBAM(bamFile[i], NULL, db, &ref, NULL, NULL, NULL, NULL);
2927 }
2928 ReferenceWhack(&ref, false);
2929 return rc;
2930 }
2931 SequenceInit(&seq, db);
2932 align = AlignmentMake(db);
2933
2934 rc = SetupContext(ctx, bamFiles + seqFiles);
2935 if (rc)
2936 return rc;
2937
2938 ctx->pass = 1;
2939 for (i = 0; i < bamFiles && rc == 0; ++i) {
2940 bool this_has_alignments = false;
2941 bool this_has_sequences = false;
2942
2943 rc = ProcessBAM(bamFile[i], ctx, db, &ref, &seq, align, &this_has_alignments, &this_has_sequences);
2944 *has_alignments |= this_has_alignments;
2945 has_sequences |= this_has_sequences;
2946 }
2947 for (i = 0; i < seqFiles && rc == 0; ++i) {
2948 bool this_has_alignments = false;
2949 bool this_has_sequences = false;
2950
2951 rc = ProcessBAM(seqFile[i], ctx, db, &ref, &seq, align, &this_has_alignments, &this_has_sequences);
2952 *has_alignments |= this_has_alignments;
2953 has_sequences |= this_has_sequences;
2954 }
2955 if (!continuing) {
2956 /*** No longer need memory for key2id ***/
2957 for (i = 0; i != ctx->keyToID.key2id_count; ++i) {
2958 KBTreeDropBacking(ctx->keyToID.key2id[i]);
2959 KBTreeRelease(ctx->keyToID.key2id[i]);
2960 ctx->keyToID.key2id[i] = NULL;
2961 }
2962 free(ctx->keyToID.key2id_names);
2963 ctx->keyToID.key2id_names = NULL;
2964 /*******************/
2965 }
2966
2967 if (has_sequences) {
2968 if (rc == 0 && (rc = Quitting()) == 0) {
2969 if (G.mode == mode_Archive) {
2970 (void)LOGMSG(klogInfo, "Writing unpaired sequences");
2971 rc = WriteSoloFragments(ctx, &seq);
2972 ContextReleaseMemBank(ctx);
2973 }
2974 if (rc == 0) {
2975 rc = SequenceDoneWriting(&seq);
2976 if (rc == 0) {
2977 (void)LOGMSG(klogInfo, "Updating sequence alignment info");
2978 rc = SequenceUpdateAlignInfo(ctx, &seq);
2979 }
2980 }
2981 }
2982 }
2983
2984 if (*has_alignments && rc == 0 && (rc = Quitting()) == 0) {
2985 (void)LOGMSG(klogInfo, "Writing alignment spot ids");
2986 rc = AlignmentUpdateSpotInfo(ctx, align);
2987 }
2988 rc2 = AlignmentWhack(align, *has_alignments && rc == 0 && (rc = Quitting()) == 0);
2989 if (rc == 0)
2990 rc = rc2;
2991
2992 rc2 = ReferenceWhack(&ref, *has_alignments && rc == 0 && (rc = Quitting()) == 0);
2993 if (rc == 0)
2994 rc = rc2;
2995
2996 SequenceWhack(&seq, rc == 0);
2997
2998 ContextRelease(ctx, continuing);
2999
3000 if (rc == 0) {
3001 (void)LOGMSG(klogInfo, "Successfully loaded all files");
3002 }
3003 return rc;
3004 }
3005
WriteLoaderSignature(KMetadata * meta,char const progName[])3006 rc_t WriteLoaderSignature(KMetadata *meta, char const progName[])
3007 {
3008 KMDataNode *node;
3009 rc_t rc = KMetadataOpenNodeUpdate(meta, &node, "/");
3010
3011 if (rc == 0) {
3012 rc = KLoaderMeta_Write(node, progName, __DATE__, "BAM", KAppVersion());
3013 KMDataNodeRelease(node);
3014 }
3015 if (rc) {
3016 (void)LOGERR(klogErr, rc, "Cannot update loader meta");
3017 }
3018 return rc;
3019 }
3020
OpenPath(char const path[],KDirectory ** dir)3021 rc_t OpenPath(char const path[], KDirectory **dir)
3022 {
3023 KDirectory *p;
3024 rc_t rc = KDirectoryNativeDir(&p);
3025
3026 if (rc == 0) {
3027 rc = KDirectoryOpenDirUpdate(p, dir, false, "%s", path);
3028 KDirectoryRelease(p);
3029 }
3030 return rc;
3031 }
3032
3033 static
ConvertDatabaseToUnmapped(VDatabase * db)3034 rc_t ConvertDatabaseToUnmapped(VDatabase *db)
3035 {
3036 VTable* tbl;
3037 rc_t rc = VDatabaseOpenTableUpdate(db, &tbl, "SEQUENCE");
3038 if (rc == 0)
3039 {
3040 VTableRenameColumn(tbl, false, "CMP_ALTREAD", "ALTREAD");
3041 VTableRenameColumn(tbl, false, "CMP_READ", "READ");
3042 VTableRenameColumn(tbl, false, "CMP_ALTCSREAD", "ALTCSREAD");
3043 VTableRenameColumn(tbl, false, "CMP_CSREAD", "CSREAD");
3044 rc = VTableRelease(tbl);
3045 }
3046 return rc;
3047 }
3048
run(char const progName[],unsigned bamFiles,char const * bamFile[],unsigned seqFiles,char const * seqFile[],bool continuing)3049 rc_t run(char const progName[],
3050 unsigned bamFiles, char const *bamFile[],
3051 unsigned seqFiles, char const *seqFile[],
3052 bool continuing)
3053 {
3054 VDBManager *mgr;
3055 rc_t rc;
3056 rc_t rc2;
3057 char const *db_type = G.expectUnsorted ? "NCBI:align:db:alignment_unsorted" : "NCBI:align:db:alignment_sorted";
3058
3059 rc = VDBManagerMakeUpdate(&mgr, NULL);
3060 if (rc) {
3061 (void)LOGERR (klogErr, rc, "failed to create VDB Manager!");
3062 }
3063 else {
3064 bool has_alignments = false;
3065
3066 /* VDBManagerDisableFlushThread(mgr); */
3067 rc = VDBManagerDisablePagemapThread(mgr);
3068 if (rc == 0)
3069 {
3070 if (G.onlyVerifyReferences) {
3071 rc = ArchiveBAM(mgr, NULL, bamFiles, bamFile, 0, NULL, &has_alignments, continuing);
3072 }
3073 else {
3074 VSchema *schema;
3075
3076 rc = VDBManagerMakeSchema(mgr, &schema);
3077 if (rc) {
3078 (void)LOGERR (klogErr, rc, "failed to create schema");
3079 }
3080 else {
3081 (void)(rc = VSchemaAddIncludePath(schema, "%s", G.schemaIncludePath));
3082 rc = VSchemaParseFile(schema, "%s", G.schemaPath);
3083 if (rc) {
3084 (void)PLOGERR(klogErr, (klogErr, rc, "failed to parse schema file $(file)", "file=%s", G.schemaPath));
3085 }
3086 else {
3087 VDatabase *db;
3088
3089 rc = VDBManagerCreateDB(mgr, &db, schema, db_type,
3090 kcmInit + kcmMD5, "%s", G.outpath);
3091 VSchemaRelease(schema);
3092 if (rc == 0) {
3093 rc = ArchiveBAM(mgr, db, bamFiles, bamFile, seqFiles, seqFile, &has_alignments, continuing);
3094 if (rc == 0)
3095 PrintChangeReport();
3096 if (rc == 0 && !has_alignments) {
3097 rc = ConvertDatabaseToUnmapped(db);
3098 }
3099 else if (rc == 0 && lmc != NULL) {
3100 VTable *tbl = NULL;
3101 KTable *ktbl = NULL;
3102 KMetadata *meta = NULL;
3103 KMDataNode *node = NULL;
3104
3105 VDatabaseOpenTableUpdate(db, &tbl, "REFERENCE");
3106 VTableOpenKTableUpdate(tbl, &ktbl);
3107 VTableRelease(tbl);
3108
3109 KTableOpenMetadataUpdate(ktbl, &meta);
3110 KTableRelease(ktbl);
3111
3112 KMetadataOpenNodeUpdate(meta, &node, "LOW_MATCH_COUNT");
3113 KMetadataRelease(meta);
3114
3115 RecordLowMatchCounts(node);
3116
3117 KMDataNodeRelease(node);
3118
3119 LowMatchCounterFree(lmc);
3120 lmc = NULL;
3121 }
3122 VDatabaseRelease(db);
3123
3124 if (rc == 0 && G.globalMode == mode_Remap && !continuing) {
3125 VTable *tbl = NULL;
3126
3127 VDBManagerOpenDBUpdate(mgr, &db, NULL, G.firstOut);
3128 VDatabaseOpenTableUpdate(db, &tbl, "SEQUENCE");
3129 VDatabaseRelease(db);
3130 VTableDropColumn(tbl, "TMP_KEY_ID");
3131 VTableDropColumn(tbl, "READ");
3132 VTableDropColumn(tbl, "ALTREAD");
3133 VTableRelease(tbl);
3134 }
3135
3136 if (rc == 0) {
3137 KMetadata *meta = NULL;
3138
3139 {
3140 KDBManager *kmgr = NULL;
3141
3142 rc = VDBManagerOpenKDBManagerUpdate(mgr, &kmgr);
3143 if (rc == 0) {
3144 KDatabase *kdb;
3145
3146 rc = KDBManagerOpenDBUpdate(kmgr, &kdb, "%s", G.outpath);
3147 if (rc == 0) {
3148 rc = KDatabaseOpenMetadataUpdate(kdb, &meta);
3149 KDatabaseRelease(kdb);
3150 }
3151 KDBManagerRelease(kmgr);
3152 }
3153 }
3154 if (rc == 0) {
3155 rc = WriteLoaderSignature(meta, progName);
3156 if (rc == 0) {
3157 KMDataNode *changes = NULL;
3158
3159 rc = KMetadataOpenNodeUpdate(meta, &changes, "CHANGES");
3160 if (rc == 0)
3161 RecordChanges(changes, "CHANGE");
3162 KMDataNodeRelease(changes);
3163 }
3164 KMetadataRelease(meta);
3165 }
3166 }
3167 }
3168 }
3169 }
3170 }
3171 }
3172 rc2 = VDBManagerRelease(mgr);
3173 if (rc2)
3174 (void)LOGERR(klogWarn, rc2, "Failed to release VDB Manager");
3175 if (rc == 0)
3176 rc = rc2;
3177 }
3178 return rc;
3179 }
3180