1 /*===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 
27 /* #include "bam-load.vers.h" */
28 
29 #include <klib/callconv.h>
30 #include <klib/data-buffer.h>
31 #include <klib/text.h>
32 #include <klib/log.h>
33 #include <klib/out.h>
34 #include <klib/status.h>
35 #include <klib/rc.h>
36 #include <klib/sort.h>
37 #include <klib/printf.h>
38 
39 #include <kfs/directory.h>
40 #include <kfs/file.h>
41 #include <kdb/btree.h>
42 #include <kdb/manager.h>
43 #include <kdb/database.h>
44 #include <kdb/table.h>
45 #include <kdb/meta.h>
46 
47 #include <vdb/manager.h>
48 #include <vdb/schema.h>
49 #include <vdb/database.h>
50 #include <vdb/table.h>
51 #include <vdb/cursor.h>
52 #include <vdb/vdb-priv.h>
53 
54 #include <insdc/insdc.h>
55 #include <insdc/sra.h>
56 #include <align/dna-reverse-cmpl.h>
57 #include <align/align.h>
58 
59 #include <kapp/main.h>
60 #include <kapp/args.h>
61 #include <kapp/loader-file.h>
62 #include <kapp/loader-meta.h>
63 #include <kapp/log-xml.h>
64 #include <kapp/progressbar.h>
65 
66 #include <kproc/queue.h>
67 #include <kproc/thread.h>
68 #include <kproc/timeout.h>
69 #include <os-native.h>
70 
71 #include <sysalloc.h>
72 #include <atomic32.h>
73 
74 #include <sys/stat.h>
75 #include <unistd.h>
76 #include <fcntl.h>
77 #include <sys/mman.h>
78 #include <stdlib.h>
79 #include <stdio.h>
80 #include <string.h>
81 #include <ctype.h>
82 #include <assert.h>
83 #include <limits.h>
84 #include <time.h>
85 #include <zlib.h>
86 #include "bam.h"
87 #include "bam-alignment.h"
88 #include "Globals.h"
89 #include "sequence-writer.h"
90 #include "reference-writer.h"
91 #include "alignment-writer.h"
92 #include "mem-bank.h"
93 #include "low-match-count.h"
94 
95 #define NUM_ID_SPACES (256u)
96 
97 #define MMA_NUM_CHUNKS_BITS (20u)
98 #define MMA_NUM_SUBCHUNKS_BITS ((32u)-(MMA_NUM_CHUNKS_BITS))
99 #define MMA_SUBCHUNK_SIZE (1u << MMA_NUM_CHUNKS_BITS)
100 #define MMA_SUBCHUNK_COUNT (1u << MMA_NUM_SUBCHUNKS_BITS)
101 
102 typedef struct {
103     int fd;
104     size_t elemSize;
105     off_t fsize;
106     uint8_t *current;
107     struct mma_map_s {
108         struct mma_submap_s {
109             uint8_t *base;
110         } submap[MMA_SUBCHUNK_COUNT];
111     } map[NUM_ID_SPACES];
112 } MMArray;
113 
114 typedef struct {
115     uint32_t primaryId[2];
116     uint32_t spotId;
117     uint32_t fragmentId;
118 	uint8_t  fragment_len[2]; /*** lowest byte of fragment length to prevent different sizes of primary and secondary alignments **/
119     uint8_t  platform;
120     uint8_t  pId_ext[2];
121     uint8_t  spotId_ext;
122     uint8_t  alignmentCount[2]; /* 0..254; 254: saturated max; 255: special meaning "too many" */
123     uint8_t  unmated: 1,
124              pcr_dup: 1,
125              unaligned_1: 1,
126              unaligned_2: 1,
127              hardclipped: 1,
128 			 primary_is_set: 1;
129 } ctx_value_t;
130 
131 #define CTX_VALUE_SET_P_ID(O,N,V) do { int64_t tv = (V); (O).primaryId[N] = (uint32_t)tv; (O).pId_ext[N] = tv >> 32; } while(0);
132 #define CTX_VALUE_GET_P_ID(O,N) ((((int64_t)((O).pId_ext[N])) << 32) | (O).primaryId[N])
133 
134 #define CTX_VALUE_SET_S_ID(O,V) do { int64_t tv = (V); (O).spotId = (uint32_t)tv; (O).spotId_ext = tv >> 32; } while(0);
135 #define CTX_VALUE_GET_S_ID(O) ((((int64_t)(O).spotId_ext) << 32) | (O).spotId)
136 
137 typedef struct FragmentInfo {
138     uint64_t ti;
139     uint32_t readlen;
140     uint8_t  aligned;
141     uint8_t  is_bad;
142     uint8_t  orientation;
143     uint8_t  readNo;
144     uint8_t  sglen;
145     uint8_t  lglen;
146     uint8_t  cskey;
147 } FragmentInfo;
148 
149 typedef struct KeyToID {
150     KBTree *key2id[NUM_ID_SPACES];
151     char *key2id_names;
152 
153     uint32_t idCount[NUM_ID_SPACES];
154     uint32_t key2id_hash[NUM_ID_SPACES];
155 
156     unsigned key2id_max;
157     unsigned key2id_name_max;
158     unsigned key2id_name_alloc;
159     unsigned key2id_count;
160 
161     unsigned key2id_name[NUM_ID_SPACES];
162     /* this array is kept in name order */
163     /* this maps the names to key2id and idCount */
164     unsigned key2id_oid[NUM_ID_SPACES];
165 } KeyToID;
166 
167 typedef struct context_t {
168     KeyToID keyToID;
169     const KLoadProgressbar *progress[4];
170     MMArray *id2value;
171     MemBank *frags;
172     int64_t spotId;
173     int64_t primaryId;
174     int64_t secondId;
175     uint64_t alignCount;
176 
177     unsigned pass;
178     bool isColorSpace;
179 } context_t;
180 
181 #if 0
182 static char const *Print_ctx_value_t(ctx_value_t const *const self)
183 {
184     static char buffer[16384];
185     rc_t rc = string_printf(buffer, sizeof(buffer), NULL, "pid: { %lu, %lu }, sid: %lu, fid: %u, alc: { %u, %u }, flg: %x", CTX_VALUE_GET_P_ID(*self, 0), CTX_VALUE_GET_P_ID(*self, 1), CTX_VALUE_GET_S_ID(*self), self->fragmentId, self->alignmentCount[0], self->alignmentCount[1], *(self->alignmentCount + sizeof(self->alignmentCount)/sizeof(self->alignmentCount[0])));
186 
187     if (rc)
188         return 0;
189     return buffer;
190 }
191 #endif
192 
MMArrayMake(MMArray ** rslt,int fd,uint32_t elemSize)193 static rc_t MMArrayMake(MMArray **rslt, int fd, uint32_t elemSize)
194 {
195     MMArray *const self = calloc(1, sizeof(*self));
196 
197     if (self == NULL)
198         return RC(rcExe, rcMemMap, rcConstructing, rcMemory, rcExhausted);
199     self->elemSize = (elemSize + 3) & ~(3u); /** align to 4 byte **/
200     self->fd = fd;
201     *rslt = self;
202     return 0;
203 }
204 
205 #define PERF 0
206 #define PROT 0
207 
MMArrayGet(MMArray * const self,void ** const value,uint64_t const element)208 static rc_t MMArrayGet(MMArray *const self, void **const value, uint64_t const element)
209 {
210     size_t const chunk = MMA_SUBCHUNK_SIZE * self->elemSize;
211     unsigned const bin_no = element >> 32;
212     unsigned const subbin = ((uint32_t)element) >> MMA_NUM_CHUNKS_BITS;
213     unsigned const in_bin = (uint32_t)element & (MMA_SUBCHUNK_SIZE - 1);
214 
215     if (bin_no >= sizeof(self->map)/sizeof(self->map[0]))
216         return RC(rcExe, rcMemMap, rcConstructing, rcId, rcExcessive);
217 
218     if (self->map[bin_no].submap[subbin].base == NULL) {
219         off_t const cur_fsize = self->fsize;
220         off_t const new_fsize = cur_fsize + chunk;
221 
222         if (ftruncate(self->fd, new_fsize) != 0)
223             return RC(rcExe, rcFile, rcResizing, rcSize, rcExcessive);
224         else {
225             void *const base = mmap(NULL, chunk, PROT_READ|PROT_WRITE,
226                                     MAP_FILE|MAP_SHARED, self->fd, cur_fsize);
227 
228             self->fsize = new_fsize;
229             if (base == MAP_FAILED) {
230                 PLOGMSG(klogErr, (klogErr, "Failed to construct map for bin $(bin), subbin $(subbin)", "bin=%u,subbin=%u", bin_no, subbin));
231                 return RC(rcExe, rcMemMap, rcConstructing, rcMemory, rcExhausted);
232             }
233             else {
234 #if PERF
235                 static unsigned mapcount = 0;
236 
237                 (void)PLOGMSG(klogInfo, (klogInfo, "Number of mmaps: $(cnt)", "cnt=%u", ++mapcount));
238 #endif
239                 self->map[bin_no].submap[subbin].base = base;
240             }
241         }
242     }
243     uint8_t *const next = self->map[bin_no].submap[subbin].base;
244 #if PROT
245     if (next != self->current) {
246         void *const current = self->current;
247 
248         if (current)
249             mprotect(current, chunk, PROT_NONE);
250 
251         mprotect(self->current = next, chunk, PROT_READ|PROT_WRITE);
252     }
253 #endif
254     *value = &next[(size_t)in_bin * self->elemSize];
255     return 0;
256 }
257 
258 #if 0
259 static rc_t MMArrayGetRead(MMArray *const self, void const **const value, uint64_t const element)
260 {
261     unsigned const bin_no = element >> 32;
262     unsigned const subbin = ((uint32_t)element) >> MMA_NUM_CHUNKS_BITS;
263     unsigned const in_bin = (uint32_t)element & (MMA_SUBCHUNK_SIZE - 1);
264 
265     if (bin_no >= sizeof(self->map)/sizeof(self->map[0]))
266         return RC(rcExe, rcMemMap, rcConstructing, rcId, rcExcessive);
267 
268     if (self->map[bin_no].submap[subbin].base == NULL)
269         return RC(rcExe, rcMemMap, rcReading, rcId, rcInvalid);
270 
271     uint8_t *const next = self->map[bin_no].submap[subbin].base;
272 #if PROT
273     size_t const chunk = MMA_SUBCHUNK_SIZE * self->elemSize;
274     if (next != self->current) {
275         void *const current = self->current;
276 
277         if (current)
278             mprotect(current, chunk, PROT_NONE);
279 
280         mprotect(self->current = next, chunk, PROT_READ);
281     }
282 #endif
283     *value = &next[(size_t)in_bin * self->elemSize];
284     return 0;
285 }
286 #endif
287 
MMArrayLock(MMArray * const self)288 static void MMArrayLock(MMArray *const self)
289 {
290 #if PROT
291     size_t const chunk = MMA_SUBCHUNK_SIZE * self->elemSize;
292     void *const current = self->current;
293 
294     self->current = NULL;
295     if (current)
296         mprotect(current, chunk, PROT_NONE);
297 #endif
298 }
299 
MMArrayClear(MMArray * self)300 static void MMArrayClear(MMArray *self)
301 {
302     size_t const chunk = MMA_SUBCHUNK_SIZE * self->elemSize;
303     unsigned i;
304 
305     for (i = 0; i != sizeof(self->map)/sizeof(self->map[0]); ++i) {
306         unsigned j;
307 
308         for (j = 0; j != sizeof(self->map[0].submap)/sizeof(self->map[0].submap[0]); ++j) {
309             if (self->map[i].submap[j].base) {
310 #if PROT
311                 mprotect(self->map[i].submap[j].base, chunk, PROT_READ|PROT_WRITE);
312 #endif
313             	memset(self->map[i].submap[j].base, 0, chunk);
314 #if PROT
315                 mprotect(self->map[i].submap[j].base, chunk, PROT_NONE);
316 #endif
317             }
318         }
319     }
320 #if PROT
321     self->current = NULL;
322 #endif
323 }
324 
MMArrayWhack(MMArray * self)325 static void MMArrayWhack(MMArray *self)
326 {
327     size_t const chunk = MMA_SUBCHUNK_SIZE * self->elemSize;
328     unsigned i;
329 
330     for (i = 0; i != sizeof(self->map)/sizeof(self->map[0]); ++i) {
331         unsigned j;
332 
333         for (j = 0; j != sizeof(self->map[0].submap)/sizeof(self->map[0].submap[0]); ++j) {
334             if (self->map[i].submap[j].base)
335             	munmap(self->map[i].submap[j].base, chunk);
336         }
337     }
338     close(self->fd);
339     free(self);
340 }
341 
OpenKBTree(KBTree ** const rslt,unsigned n,unsigned max)342 static rc_t OpenKBTree(KBTree **const rslt, unsigned n, unsigned max)
343 {
344     size_t const cacheSize = (((G.cache_size - (G.cache_size / 2) - (G.cache_size / 8)) / max)
345                             + 0xFFFFF) & ~((size_t)0xFFFFF);
346     KFile *file = NULL;
347     KDirectory *dir;
348     char fname[4096];
349     rc_t rc;
350 
351     rc = KDirectoryNativeDir(&dir);
352     if (rc)
353         return rc;
354 
355     rc = string_printf(fname, sizeof(fname), NULL, "%s/key2id.%u.%u", G.tmpfs, G.pid, n); if (rc) return rc;
356     rc = KDirectoryCreateFile(dir, &file, true, 0600, kcmInit, "%s", fname);
357     KDirectoryRemove(dir, 0, "%s", fname);
358     KDirectoryRelease(dir);
359     if (rc == 0) {
360         rc = KBTreeMakeUpdate(rslt, file, cacheSize,
361                               false, kbtOpaqueKey,
362                               1, 255, sizeof ( uint32_t ),
363                               NULL
364                               );
365         KFileRelease(file);
366 #if PERF
367         if (rc == 0) {
368             static unsigned treecount = 0;
369 
370             (void)PLOGMSG(klogInfo, (klogInfo, "Number of trees: $(cnt)", "cnt=%u", ++treecount));
371         }
372 #endif
373     }
374     return rc;
375 }
376 
GetKeyIDOld(KeyToID * const ctx,uint64_t * const rslt,bool * const wasInserted,char const key[],char const name[],unsigned const namelen)377 static rc_t GetKeyIDOld(KeyToID *const ctx, uint64_t *const rslt, bool *const wasInserted, char const key[], char const name[], unsigned const namelen)
378 {
379     unsigned const keylen = strlen(key);
380     rc_t rc;
381     uint64_t tmpKey;
382 
383     if (ctx->key2id_count == 0) {
384         rc = OpenKBTree(&ctx->key2id[0], 1, 1);
385         if (rc) return rc;
386         ctx->key2id_count = 1;
387     }
388     if (memcmp(key, name, keylen) == 0) {
389         /* qname starts with read group; no append */
390         tmpKey = ctx->idCount[0];
391         rc = KBTreeEntry(ctx->key2id[0], &tmpKey, wasInserted, name, namelen);
392     }
393     else {
394         char sbuf[4096];
395         char *buf = sbuf;
396         char *hbuf = NULL;
397         size_t bsize = sizeof(sbuf);
398         size_t actsize;
399 
400         if (keylen + namelen + 2 > bsize) {
401             hbuf = malloc(bsize = keylen + namelen + 2);
402             if (hbuf == NULL)
403                 return RC(rcExe, rcName, rcAllocating, rcMemory, rcExhausted);
404             buf = hbuf;
405         }
406         rc = string_printf(buf, bsize, &actsize, "%s\t%.*s", key, (int)namelen, name);
407 
408         tmpKey = ctx->idCount[0];
409         rc = KBTreeEntry(ctx->key2id[0], &tmpKey, wasInserted, buf, actsize);
410         if (hbuf)
411             free(hbuf);
412     }
413     if (rc == 0) {
414         *rslt = tmpKey;
415         if (*wasInserted)
416             ++ctx->idCount[0];
417     }
418     return rc;
419 }
420 
HashKey(void const * const key,unsigned const keylen)421 static unsigned HashKey(void const *const key, unsigned const keylen)
422 {
423 #if 0
424     /* There is nothing special about this hash. It was randomly generated. */
425     static const uint8_t T1[] = {
426          64, 186,  39, 203,  54, 211,  98,  32,  26,  23, 219,  94,  77,  60,  56, 184,
427         129, 242,  10,  91,  84, 192,  19, 197, 231, 133, 125, 244,  48, 176, 160, 164,
428          17,  41,  57, 137,  44, 196, 116, 146, 105,  40, 122,  47, 220, 226, 213, 212,
429         107, 191,  52, 144,   9, 145,  81, 101, 217, 206,  85, 134, 143,  58, 128,  20,
430         236, 102,  83, 149, 148, 180, 167, 163,  12, 239,  31,   0,  73, 152,   1,  15,
431          75, 200,   4, 165,   5,  66,  25, 111, 255,  70, 174, 151,  96, 126, 147,  34,
432         112, 161, 127, 181, 237,  78,  37,  74, 222, 123,  21, 132,  95,  51, 141,  45,
433          61, 131, 193,  68,  62, 249, 178,  33,   7, 195, 228,  82,  27,  46, 254,  90,
434         185, 240, 246, 124, 205, 182,  42,  22, 198,  69, 166,  92, 169, 136, 223, 245,
435         118,  97, 115,  80, 252, 209,  49,  79, 221,  38,  28,  35,  36, 208, 187, 248,
436         158, 201, 202, 168,   2,  18, 189, 119, 216, 214,  11,   6,  89,  16, 229, 109,
437         120,  43, 162, 106, 204,   8, 199, 235, 142, 210,  86, 153, 227, 230,  24, 100,
438         224, 113, 190, 243, 218, 215, 225, 173,  99, 238, 138,  59, 183, 154, 171, 232,
439         157, 247, 233,  67,  88,  50, 253, 251, 140, 104, 156, 170, 150, 103, 117, 110,
440         155,  72, 207, 250, 159, 194, 177, 130, 135,  87,  71, 175,  14,  55, 172, 121,
441         234,  13,  30, 241,  93, 188,  53, 114,  76,  29,  65,   3, 179, 108,  63, 139
442     };
443     unsigned h = 0x55;
444     unsigned i = keylen;
445 
446     do { h = T1[h ^ ((uint8_t)i)]; } while ((i >>= 8) != 0);
447 
448     for (i = 0; i != keylen; ++i)
449         h = T1[h ^ ((uint8_t const *)key)[i]];
450 
451     return h;
452 #else
453     /* FNV-1a hash with folding */
454     uint64_t h = 0xcbf29ce484222325;
455     unsigned i;
456 
457     for (i = 0; i < keylen; ++i) {
458         uint8_t const octet = ((uint8_t const *)key)[i];
459         h = (h ^ octet) * 0x100000001b3ull;
460     }
461     return ((uint32_t)(h ^ (h >> 32))) % NUM_ID_SPACES;
462 #endif
463 }
464 
465 #define USE_ILLUMINA_NAMING_CORRECTION 1
466 
GetFixedNameLength(char const name[],size_t const namelen)467 static size_t GetFixedNameLength(char const name[], size_t const namelen)
468 {
469 #if USE_ILLUMINA_NAMING_CORRECTION
470     /*** Check for possible fixes to illumina names ****/
471     size_t newlen=namelen;
472     /*** First get rid of possible "/1" "/2" "/3" at the end - violates SAM spec **/
473     if(newlen > 2  && name[newlen-2] == '/' &&  (name[newlen-1] == '1' || name[newlen-1] == '2' || name[newlen-1] == '3')){
474         newlen -=2;
475     }
476     if(newlen > 2 && name[newlen-2] == '#' &&  (name[newlen-1] == '0')){ /*** Now, find "#0" ***/
477         newlen -=2;
478     } else if(newlen>10){ /*** find #ACGT ***/
479         int i=newlen;
480         for(i--;i>4;i--){ /*** stopping at 4 since the rest of record should still contain :x:y ***/
481             char a=toupper(name[i]);
482             if(a != 'A' && a != 'C' && a !='G' && a !='T'){
483                 break;
484             }
485         }
486         if (name[i] == '#'){
487             switch (newlen-i) { /** allowed values for illumina barcodes :5,6,8 **/
488                 case 5:
489                 case 6:
490                 case 8:
491                     newlen=i;
492                     break;
493                 default:
494                     break;
495             }
496         }
497     }
498     if(newlen < namelen){ /*** check for :x:y at the end now - to make sure it is illumina **/
499         int i=newlen;
500         for(i--;i>0 && isdigit(name[i]);i--){}
501         if(name[i]==':'){
502             for(i--;i>0 && isdigit(name[i]);i--){}
503             if(name[i]==':' && newlen > 0){ /*** some name before :x:y should still exist **/
504                 /*** looks like illumina ***/
505                 return newlen;
506             }
507         }
508     }
509 #endif
510     return namelen;
511 }
512 
513 static
GetKeyID(KeyToID * const ctx,uint64_t * const rslt,bool * const wasInserted,char const key[],char const name[],size_t const o_namelen)514 rc_t GetKeyID(KeyToID *const ctx,
515               uint64_t *const rslt,
516               bool *const wasInserted,
517               char const key[],
518               char const name[],
519               size_t const o_namelen)
520 {
521     size_t const namelen = GetFixedNameLength(name, o_namelen);
522 
523     if (ctx->key2id_max == 1)
524         return GetKeyIDOld(ctx, rslt, wasInserted, key, name, namelen);
525     else {
526         unsigned const keylen = strlen(key);
527         unsigned const h = HashKey(key, keylen);
528         unsigned f;
529         unsigned e = ctx->key2id_count;
530         uint64_t tmpKey;
531 
532         *rslt = 0;
533         {{
534             uint32_t const bucket_value = ctx->key2id_hash[h];
535             unsigned const n  = (uint8_t) bucket_value;
536             unsigned const i1 = (uint8_t)(bucket_value >>  8);
537             unsigned const i2 = (uint8_t)(bucket_value >> 16);
538             unsigned const i3 = (uint8_t)(bucket_value >> 24);
539 
540             if (n > 0 && strcmp(key, ctx->key2id_names + ctx->key2id_name[i1]) == 0) {
541                 f = i1;
542                 /*
543                 ctx->key2id_hash[h] = (i3 << 24) | (i2 << 16) | (i1 << 8) | n;
544                  */
545                 goto GET_ID;
546             }
547             if (n > 1 && strcmp(key, ctx->key2id_names + ctx->key2id_name[i2]) == 0) {
548                 f = i2;
549                 ctx->key2id_hash[h] = (i3 << 24) | (i1 << 16) | (i2 << 8) | n;
550                 goto GET_ID;
551             }
552             if (n > 2 && strcmp(key, ctx->key2id_names + ctx->key2id_name[i3]) == 0) {
553                 f = i3;
554                 ctx->key2id_hash[h] = (i2 << 24) | (i1 << 16) | (i3 << 8) | n;
555                 goto GET_ID;
556             }
557         }}
558         f = 0;
559         while (f < e) {
560             unsigned const m = (f + e) / 2;
561             unsigned const oid = ctx->key2id_oid[m];
562             int const diff = strcmp(key, ctx->key2id_names + ctx->key2id_name[oid]);
563 
564             if (diff < 0)
565                 e = m;
566             else if (diff > 0)
567                 f = m + 1;
568             else {
569                 f = oid;
570                 goto GET_ID;
571             }
572         }
573         if (ctx->key2id_count < ctx->key2id_max) {
574             unsigned const name_max = ctx->key2id_name_max + keylen + 1;
575             KBTree *tree;
576             rc_t rc = OpenKBTree(&tree, ctx->key2id_count + 1, 1); /* ctx->key2id_max); */
577 
578             if (rc) return rc;
579 
580             if (ctx->key2id_name_alloc < name_max) {
581                 unsigned alloc = ctx->key2id_name_alloc;
582                 void *tmp;
583 
584                 if (alloc == 0)
585                     alloc = 4096;
586                 while (alloc < name_max)
587                     alloc <<= 1;
588                 tmp = realloc(ctx->key2id_names, alloc);
589                 if (tmp == NULL)
590                     return RC(rcExe, rcName, rcAllocating, rcMemory, rcExhausted);
591                 ctx->key2id_names = tmp;
592                 ctx->key2id_name_alloc = alloc;
593             }
594             if (f < ctx->key2id_count) {
595                 memmove(&ctx->key2id_oid[f + 1], &ctx->key2id_oid[f], (ctx->key2id_count - f) * sizeof(ctx->key2id_oid[f]));
596             }
597             ctx->key2id_oid[f] = ctx->key2id_count;
598             ++ctx->key2id_count;
599             f = ctx->key2id_oid[f];
600             ctx->key2id_name[f] = ctx->key2id_name_max;
601             ctx->key2id_name_max = name_max;
602 
603             memmove(&ctx->key2id_names[ctx->key2id_name[f]], key, keylen + 1);
604             ctx->key2id[f] = tree;
605             ctx->idCount[f] = 0;
606             if ((uint8_t)ctx->key2id_hash[h] < 3) {
607                 unsigned const n = (uint8_t)ctx->key2id_hash[h] + 1;
608 
609                 ctx->key2id_hash[h] = (((ctx->key2id_hash[h] & ~(0xFFu)) | f) << 8) | n;
610             }
611             else {
612                 /* the hash function isn't working too well
613                  * keep the 3 mru
614                  */
615                 ctx->key2id_hash[h] = (((ctx->key2id_hash[h] & ~(0xFFu)) | f) << 8) | 3;
616             }
617         GET_ID:
618             tmpKey = ctx->idCount[f];
619             rc = KBTreeEntry(ctx->key2id[f], &tmpKey, wasInserted, name, namelen);
620             if (rc == 0) {
621                 *rslt = (((uint64_t)f) << 32) | tmpKey;
622                 if (*wasInserted)
623                     ++ctx->idCount[f];
624                 if (!(tmpKey < ctx->idCount[f])) {
625                     (void)PLOGMSG(klogErr, (klogErr, "too many spots for read group '$(rg)'; ran out of 32-bit ids", "rg=%s", key));
626                     return RC(rcExe, rcTree, rcAllocating, rcId, rcExhausted);
627                 }
628             }
629             return rc;
630         }
631         (void)PLOGMSG(klogErr, (klogErr, "too many read groups: max is $(max)", "max=%d", (int)ctx->key2id_max));
632         return RC(rcExe, rcTree, rcAllocating, rcConstraint, rcViolated);
633     }
634 }
635 
OpenMMapFile(context_t * const ctx,KDirectory * const dir)636 static rc_t OpenMMapFile(context_t *const ctx, KDirectory *const dir)
637 {
638     int fd;
639     char fname[4096];
640     rc_t rc = string_printf(fname, sizeof(fname), NULL, "%s/id2value.%u", G.tmpfs, G.pid);
641 
642     if (rc)
643         return rc;
644 
645     fd = open(fname, O_RDWR|O_TRUNC|O_CREAT, S_IRUSR|S_IWUSR);
646     if (fd < 0)
647         return RC(rcExe, rcFile, rcCreating, rcFile, rcNotFound);
648     unlink(fname);
649     return MMArrayMake(&ctx->id2value, fd, sizeof(ctx_value_t));
650 }
651 
TmpfsDirectory(KDirectory ** const rslt)652 static rc_t TmpfsDirectory(KDirectory **const rslt)
653 {
654     KDirectory *dir;
655     rc_t rc = KDirectoryNativeDir(&dir);
656     if (rc == 0) {
657         rc = KDirectoryOpenDirUpdate(dir, rslt, false, "%s", G.tmpfs);
658         KDirectoryRelease(dir);
659     }
660     return rc;
661 }
662 
SetupContext(context_t * ctx,unsigned numfiles)663 static rc_t SetupContext(context_t *ctx, unsigned numfiles)
664 {
665     rc_t rc = 0;
666 
667     // memset(ctx, 0, sizeof(*ctx));
668 
669     if (G.mode == mode_Archive) {
670         KDirectory *dir;
671         size_t fragSize[2];
672 
673         fragSize[1] = (G.cache_size / 8);
674         fragSize[0] = fragSize[1] * 4;
675 
676         rc = TmpfsDirectory(&dir);
677         if (rc == 0)
678             rc = OpenMMapFile(ctx, dir);
679         if (rc == 0)
680             rc = MemBankMake(&ctx->frags, dir, G.pid, fragSize);
681         KDirectoryRelease(dir);
682     }
683     else if (G.mode == mode_Remap) {
684         KeyToID const save1 = ctx->keyToID;
685         MMArray *const save2 = ctx->id2value;
686         int64_t const save3 = ctx->spotId;
687 
688         memset(ctx, 0, sizeof(*ctx));
689         ctx->keyToID = save1;
690         ctx->id2value = save2;
691         ctx->spotId = save3;
692     }
693 
694     rc = KLoadProgressbar_Make(&ctx->progress[0], 0); if (rc) return rc;
695     rc = KLoadProgressbar_Make(&ctx->progress[1], 0); if (rc) return rc;
696     rc = KLoadProgressbar_Make(&ctx->progress[2], 0); if (rc) return rc;
697     rc = KLoadProgressbar_Make(&ctx->progress[3], 0); if (rc) return rc;
698 
699     KLoadProgressbar_Append(ctx->progress[0], 100 * numfiles);
700 
701     return rc;
702 }
703 
ContextReleaseMemBank(context_t * ctx)704 static void ContextReleaseMemBank(context_t *ctx)
705 {
706     MemBankRelease(ctx->frags);
707     ctx->frags = NULL;
708 }
709 
ContextRelease(context_t * ctx,bool continuing)710 static void ContextRelease(context_t *ctx, bool continuing)
711 {
712     KLoadProgressbar_Release(ctx->progress[0], true);
713     KLoadProgressbar_Release(ctx->progress[1], true);
714     KLoadProgressbar_Release(ctx->progress[2], true);
715     KLoadProgressbar_Release(ctx->progress[3], true);
716     if (!continuing)
717         MMArrayWhack(ctx->id2value);
718     else
719         MMArrayClear(ctx->id2value);
720 }
721 
722 static
COPY_QUAL(uint8_t D[],uint8_t const S[],unsigned const L,bool const R)723 void COPY_QUAL(uint8_t D[], uint8_t const S[], unsigned const L, bool const R)
724 {
725     if (R) {
726         unsigned i;
727         unsigned j;
728 
729         for (i = 0, j = L - 1; i != L; ++i, --j)
730             D[i] = S[j];
731     }
732     else
733         memmove(D, S, L);
734 }
735 
736 static
COPY_READ(INSDC_dna_text D[],INSDC_dna_text const S[],unsigned const L,bool const R)737 void COPY_READ(INSDC_dna_text D[], INSDC_dna_text const S[], unsigned const L, bool const R)
738 {
739     static INSDC_dna_text const compl[] = {
740          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
741          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
742          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
743          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
744          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
745          0 ,  0 ,  0 ,  0 ,  0 ,  0 , '.',  0 ,
746         '0', '1', '2', '3',  0 ,  0 ,  0 ,  0 ,
747          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
748          0 , 'T', 'V', 'G', 'H',  0 ,  0 , 'C',
749         'D',  0 ,  0 , 'M',  0 , 'K', 'N',  0 ,
750          0 ,  0 , 'Y', 'S', 'A', 'A', 'B', 'W',
751          0 , 'R',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
752          0 , 'T', 'V', 'G', 'H',  0 ,  0 , 'C',
753         'D',  0 ,  0 , 'M',  0 , 'K', 'N',  0 ,
754          0 ,  0 , 'Y', 'S', 'A', 'A', 'B', 'W',
755          0 , 'R',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
756          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
757          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
758          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
759          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
760          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
761          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
762          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
763          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
764          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
765          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
766          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
767          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
768          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
769          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
770          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
771          0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0
772     };
773     if (R) {
774         unsigned i;
775         unsigned j;
776 
777         for (i = 0, j = L - 1; i != L; ++i, --j)
778             D[i] = compl[((uint8_t const *)S)[j]];
779     }
780     else
781         memmove(D, S, L);
782 }
783 
MakeDeferralFile()784 static KFile *MakeDeferralFile() {
785     if (G.deferSecondary) {
786         char template[4096];
787         int fd;
788         KFile *f;
789         KDirectory *d;
790         size_t nwrit;
791 
792         KDirectoryNativeDir(&d);
793         string_printf(template, sizeof(template), &nwrit, "%s/defer.XXXXXX", G.tmpfs);
794         fd = mkstemp(template);
795         KDirectoryOpenFileWrite(d, &f, true, template);
796         KDirectoryRelease(d);
797         close(fd);
798         unlink(template);
799         return f;
800     }
801     return NULL;
802 }
803 
OpenBAM(const BAM_File ** bam,VDatabase * db,const char bamFile[])804 static rc_t OpenBAM(const BAM_File **bam, VDatabase *db, const char bamFile[])
805 {
806     rc_t rc = 0;
807     KFile *defer = MakeDeferralFile();
808 
809     if (strcmp(bamFile, "/dev/stdin") == 0) {
810         rc = BAM_FileMake(bam, defer, G.headerText, "/dev/stdin");
811     }
812     else {
813         rc = BAM_FileMake(bam, defer, G.headerText, "%s", bamFile);
814     }
815     KFileRelease(defer); /* it was retained by BAM file */
816 
817     if (rc) {
818         (void)PLOGERR(klogErr, (klogErr, rc, "Failed to open '$(file)'", "file=%s", bamFile));
819     }
820     else if (db) {
821         KMetadata *dbmeta;
822 
823         rc = VDatabaseOpenMetadataUpdate(db, &dbmeta);
824         if (rc == 0) {
825             KMDataNode *node;
826 
827             rc = KMetadataOpenNodeUpdate(dbmeta, &node, "BAM_HEADER");
828             KMetadataRelease(dbmeta);
829             if (rc == 0) {
830                 char const *header;
831                 size_t size;
832 
833                 rc = BAM_FileGetHeaderText(*bam, &header, &size);
834                 if (rc == 0) {
835                     rc = KMDataNodeWrite(node, header, size);
836                 }
837                 KMDataNodeRelease(node);
838             }
839         }
840     }
841 
842     return rc;
843 }
844 
VerifyReferences(BAM_File const * bam,Reference const * ref)845 static rc_t VerifyReferences(BAM_File const *bam, Reference const *ref)
846 {
847     rc_t rc = 0;
848     uint32_t n;
849     unsigned i;
850 
851     BAM_FileGetRefSeqCount(bam, &n);
852     for (i = 0; i != n; ++i) {
853         BAMRefSeq const *refSeq;
854 
855         BAM_FileGetRefSeq(bam, i, &refSeq);
856         if (G.refFilter && strcmp(refSeq->name, G.refFilter) != 0)
857             continue;
858 
859         rc = ReferenceVerify(ref, refSeq->name, refSeq->length, refSeq->checksum);
860         if (rc) {
861             if (GetRCObject(rc) == rcChecksum && GetRCState(rc) == rcUnequal) {
862 #if NCBI
863                 (void)PLOGMSG(klogWarn, (klogWarn, "Reference: '$(name)', Length: $(len); checksums do not match", "name=%s,len=%u", refSeq->name, (unsigned)refSeq->length));
864 #endif
865             }
866             else if (GetRCObject(rc) == rcSize && GetRCState(rc) == rcUnequal) {
867                 (void)PLOGMSG(klogWarn, (klogWarn, "Reference: '$(name)', Length: $(len); lengths do not match", "name=%s,len=%u", refSeq->name, (unsigned)refSeq->length));
868             }
869             else if (GetRCObject(rc) == rcSize && GetRCState(rc) == rcEmpty) {
870                 (void)PLOGMSG(klogWarn, (klogWarn, "Reference: '$(name)', Length: $(len); fasta file is empty", "name=%s,len=%u", refSeq->name, (unsigned)refSeq->length));
871             }
872             else if (GetRCObject(rc) == rcId && GetRCState(rc) == rcNotFound) {
873                 (void)PLOGMSG(klogWarn, (klogWarn, "Reference: '$(name)', Length: $(len); no match found", "name=%s,len=%u", refSeq->name, (unsigned)refSeq->length));
874             }
875             else {
876                 (void)PLOGERR(klogWarn, (klogWarn, rc, "Reference: '$(name)', Length: $(len); error", "name=%s,len=%u", refSeq->name, (unsigned)refSeq->length));
877             }
878         }
879         else if (G.onlyVerifyReferences) {
880             (void)PLOGMSG(klogInfo, (klogInfo, "Reference: '$(name)', Length: $(len); match found", "name=%s,len=%u", refSeq->name, (unsigned)refSeq->length));
881         }
882     }
883     return 0;
884 }
885 
GetMapQ(BAM_Alignment const * rec)886 static uint8_t GetMapQ(BAM_Alignment const *rec)
887 {
888     uint8_t mapQ;
889 
890     BAM_AlignmentGetMapQuality(rec, &mapQ);
891     return mapQ;
892 }
893 
894 #if 0
895 static bool EditAlignedQualities(uint8_t qual[], bool const hasMismatch[], unsigned readlen)
896 {
897     unsigned i;
898     bool changed = false;
899 
900     for (i = 0; i < readlen; ++i) {
901         uint8_t const q_0 = qual[i];
902         uint8_t const q_1= hasMismatch[i] ? G.alignedQualValue : q_0;
903 
904         if (q_0 != q_1) {
905             changed = true;
906             break;
907         }
908     }
909     if (!changed)
910         return false;
911     for (i = 0; i < readlen; ++i) {
912         uint8_t const q_0 = qual[i];
913         uint8_t const q_1= hasMismatch[i] ? G.alignedQualValue : q_0;
914 
915         qual[i] = q_1;
916     }
917     return true;
918 }
919 #endif
920 
921 #if 0
922 static bool EditUnalignedQualities(uint8_t qual[], bool const hasMismatch[], unsigned readlen)
923 {
924     unsigned i;
925     bool changed = false;
926 
927     for (i = 0; i < readlen; ++i) {
928         uint8_t const q_0 = qual[i];
929         uint8_t const q_1 = (q_0 & 0x7F) | (hasMismatch[i] ? 0x80 : 0);
930 
931         if (q_0 != q_1) {
932             changed = true;
933             break;
934         }
935     }
936     if (!changed)
937         return false;
938     for (i = 0; i < readlen; ++i) {
939         uint8_t const q_0 = qual[i];
940         uint8_t const q_1 = (q_0 & 0x7F) | (hasMismatch[i] ? 0x80 : 0);
941 
942         qual[i] = q_1;
943     }
944     return true;
945 }
946 #endif
947 
platform_cmp(char const platform[],char const test[])948 static bool platform_cmp(char const platform[], char const test[])
949 {
950     unsigned i;
951 
952     for (i = 0; ; ++i) {
953         int ch1 = test[i];
954         int ch2 = toupper(platform[i]);
955 
956         if (ch1 != ch2)
957             break;
958         if (ch1 == 0)
959             return true;
960     }
961     return false;
962 }
963 
964 static
GetINSDCPlatform(BAM_File const * bam,char const name[])965 INSDC_SRA_platform_id GetINSDCPlatform(BAM_File const *bam, char const name[]) {
966     if (name) {
967         BAMReadGroup const *rg;
968 
969         BAM_FileGetReadGroupByName(bam, name, &rg);
970         if (rg && rg->platform) {
971             switch (toupper(rg->platform[0])) {
972             case 'C':
973                 if (platform_cmp(rg->platform, "COMPLETE GENOMICS"))
974                     return SRA_PLATFORM_COMPLETE_GENOMICS;
975                 if (platform_cmp(rg->platform, "CAPILLARY"))
976                     return SRA_PLATFORM_CAPILLARY;
977                 break;
978             case 'H':
979                 if (platform_cmp(rg->platform, "HELICOS"))
980                     return SRA_PLATFORM_HELICOS;
981                 break;
982             case 'I':
983                 if (platform_cmp(rg->platform, "ILLUMINA"))
984                     return SRA_PLATFORM_ILLUMINA;
985                 if (platform_cmp(rg->platform, "IONTORRENT"))
986                     return SRA_PLATFORM_ION_TORRENT;
987                 break;
988             case 'L':
989                 if (platform_cmp(rg->platform, "LS454"))
990                     return SRA_PLATFORM_454;
991                 break;
992             case 'N':
993                 if (platform_cmp(name, "NANOPORE"))
994                     return SRA_PLATFORM_OXFORD_NANOPORE;
995                 break;
996             case 'O':
997                 if (platform_cmp(name, "OXFORD_NANOPORE"))
998                     return SRA_PLATFORM_OXFORD_NANOPORE;
999                 break;
1000             case 'P':
1001                 if (platform_cmp(rg->platform, "PACBIO"))
1002                     return SRA_PLATFORM_PACBIO_SMRT;
1003                 break;
1004             case 'S':
1005                 if (platform_cmp(rg->platform, "SOLID"))
1006                     return SRA_PLATFORM_ABSOLID;
1007                 if (platform_cmp(name, "SANGER"))
1008                     return SRA_PLATFORM_CAPILLARY;
1009                 break;
1010             default:
1011                 break;
1012             }
1013         }
1014     }
1015     return SRA_PLATFORM_UNDEFINED;
1016 }
1017 
1018 static
CheckLimitAndLogError(void)1019 rc_t CheckLimitAndLogError(void)
1020 {
1021     unsigned const count = ++G.errCount;
1022 
1023     if (G.maxErrCount > 0 && count > G.maxErrCount) {
1024         (void)PLOGERR(klogErr, (klogErr, SILENT_RC(rcAlign, rcFile, rcReading, rcError, rcExcessive), "Number of errors $(cnt) exceeds limit of $(max): Exiting", "cnt=%u,max=%u", count, G.maxErrCount));
1025         return RC(rcAlign, rcFile, rcReading, rcError, rcExcessive);
1026     }
1027     return 0;
1028 }
1029 
1030 static
RecordNoMatch(char const readName[],char const refName[],uint32_t const refPos)1031 void RecordNoMatch(char const readName[], char const refName[], uint32_t const refPos)
1032 {
1033     if (G.noMatchLog) {
1034         static uint64_t lpos = 0;
1035         char logbuf[256];
1036         size_t len;
1037 
1038         if (string_printf(logbuf, sizeof(logbuf), &len, "%s\t%s\t%u\n", readName, refName, refPos) == 0) {
1039             KFileWrite(G.noMatchLog, lpos, logbuf, len, NULL);
1040             lpos += len;
1041         }
1042     }
1043 }
1044 
1045 static LowMatchCounter *lmc = NULL;
1046 
1047 static
LogNoMatch(char const readName[],char const refName[],unsigned rpos,unsigned matches)1048 rc_t LogNoMatch(char const readName[], char const refName[], unsigned rpos, unsigned matches)
1049 {
1050     rc_t const rc = CheckLimitAndLogError();
1051     static unsigned count = 0;
1052 
1053     if (lmc == NULL)
1054         lmc = LowMatchCounterMake();
1055     assert(lmc != NULL);
1056     LowMatchCounterAdd(lmc, refName);
1057 
1058     ++count;
1059     if (rc) {
1060         (void)PLOGMSG(klogInfo, (klogInfo, "This is the last warning; this class of warning occurred $(occurred) times",
1061                                  "occurred=%u", count));
1062         (void)PLOGMSG(klogErr, (klogErr, "Spot '$(name)' contains too few ($(count)) matching bases to reference '$(ref)' at $(pos)",
1063                                  "name=%s,ref=%s,pos=%u,count=%u", readName, refName, rpos, matches));
1064         return rc;
1065     }
1066     if (G.maxWarnCount_NoMatch == 0 || count < G.maxWarnCount_NoMatch)
1067         (void)PLOGMSG(klogWarn, (klogWarn, "Spot '$(name)' contains too few ($(count)) matching bases to reference '$(ref)' at $(pos)",
1068                                  "name=%s,ref=%s,pos=%u,count=%u", readName, refName, rpos, matches));
1069     return 0;
1070 }
1071 
1072 struct rlmc_context {
1073     KMDataNode *node;
1074     unsigned node_number;
1075     rc_t rc;
1076 };
1077 
RecordLowMatchCount(void * Ctx,char const name[],unsigned const count)1078 static void RecordLowMatchCount(void *Ctx, char const name[], unsigned const count)
1079 {
1080     struct rlmc_context *const ctx = Ctx;
1081 
1082     if (ctx->rc == 0) {
1083         KMDataNode *sub = NULL;
1084 
1085         ctx->rc = KMDataNodeOpenNodeUpdate(ctx->node, &sub, "LOW_MATCH_COUNT_%u", ++ctx->node_number);
1086         if (ctx->rc == 0) {
1087             uint32_t const count_temp = count;
1088             ctx->rc = KMDataNodeWriteAttr(sub, "REFNAME", name);
1089             if (ctx->rc == 0)
1090                 ctx->rc = KMDataNodeWriteB32(sub, &count_temp);
1091 
1092             KMDataNodeRelease(sub);
1093         }
1094     }
1095 }
1096 
RecordLowMatchCounts(KMDataNode * const node)1097 static rc_t RecordLowMatchCounts(KMDataNode *const node)
1098 {
1099     struct rlmc_context ctx;
1100 
1101     assert(lmc != NULL);
1102     if (node) {
1103         ctx.node = node;
1104         ctx.node_number = 0;
1105         ctx.rc = 0;
1106 
1107         LowMatchCounterEach(lmc, &ctx, RecordLowMatchCount);
1108     }
1109     return ctx.rc;
1110 }
1111 
1112 #if 0
1113 static
1114 rc_t LogDupConflict(char const readName[])
1115 {
1116     rc_t const rc = CheckLimitAndLogError();
1117     static unsigned count = 0;
1118 
1119     ++count;
1120     if (rc) {
1121         (void)PLOGMSG(klogInfo, (klogInfo, "This is the last warning; this class of warning occurred $(occurred) times",
1122                                  "occurred=%u", count));
1123         (void)PLOGERR(klogWarn, (klogWarn, SILENT_RC(rcApp, rcFile, rcReading, rcData, rcInconsistent),
1124                                  "Spot '$(name)' is both a duplicate and NOT a duplicate!",
1125                                  "name=%s", readName));
1126     }
1127     else if (G.maxWarnCount_DupConflict == 0 || count < G.maxWarnCount_DupConflict)
1128         (void)PLOGERR(klogWarn, (klogWarn, SILENT_RC(rcApp, rcFile, rcReading, rcData, rcInconsistent),
1129                                  "Spot '$(name)' is both a duplicate and NOT a duplicate!",
1130                                  "name=%s", readName));
1131     return rc;
1132 }
1133 #endif
1134 
1135 static char const *const CHANGED[] = {
1136     "FLAG changed",
1137     "QUAL changed",
1138     "SEQ changed",
1139     "record made unaligned",
1140     "record made unfragmented",
1141     "mate alignment lost",
1142     "record discarded",
1143     "reference name changed",
1144     "CIGAR changed"
1145 };
1146 
1147 #define FLAG_CHANGED (0)
1148 #define QUAL_CHANGED (1)
1149 #define SEQ_CHANGED (2)
1150 #define MAKE_UNALIGNED (3)
1151 #define MAKE_UNFRAGMENTED (4)
1152 #define MATE_LOST (5)
1153 #define DISCARDED (6)
1154 #define REF_NAME_CHANGED (7)
1155 #define CIGAR_CHANGED (8)
1156 
1157 static char const *const REASONS[] = {
1158 /* FLAG changed */
1159     "0x400 and 0x200 both set",                 /*  0 */
1160     "conflicting PCR Dup flags",                /*  1 */
1161     "primary alignment already exists",         /*  2 */
1162     "was already recorded as unaligned",        /*  3 */
1163 /* QUAL changed */
1164     "original quality used",                    /*  4 */
1165     "unaligned colorspace",                     /*  5 */
1166     "aligned bases",                            /*  6 */
1167     "unaligned bases",                          /*  7 */
1168     "reversed",                                 /*  8 */
1169 /* unaligned */
1170     "low MAPQ",                                 /*  9 */
1171     "low match count",                          /* 10 */
1172     "missing alignment info",                   /* 11 */
1173     "missing reference position",               /* 12 */
1174     "invalid alignment info",                   /* 13 */
1175     "invalid reference position",               /* 14 */
1176     "invalid reference",                        /* 15 */
1177     "unaligned reference",                      /* 16 */
1178     "unknown reference",                        /* 17 */
1179     "hard-clipped colorspace",                  /* 18 */
1180 /* unfragmented */
1181     "missing fragment info",                    /* 19 */
1182     "too many fragments",                       /* 20 */
1183 /* mate info lost */
1184     "invalid mate reference",                   /* 21 */
1185     "missing mate alignment info",              /* 22 */
1186     "unknown mate reference",                   /* 23 */
1187 /* discarded */
1188     "conflicting PCR duplicate",                /* 24 */
1189     "conflicting fragment info",                /* 25 */
1190     "reference is skipped",                     /* 26 */
1191 /* reference name changed */
1192     "reference was named more than once",       /* 27 */
1193 /* CIGAR changed */
1194     "alignment overhanging end of reference",   /* 28 */
1195 /* discarded */
1196     "hard-clipped secondary alignment",         /* 29 */
1197     "low-matching secondary alignment",         /* 30 */
1198 };
1199 
1200 static struct {
1201     unsigned what, why;
1202 } const CHANGES[] = {
1203     {FLAG_CHANGED,  0},
1204     {FLAG_CHANGED,  1},
1205     {FLAG_CHANGED,  2},
1206     {FLAG_CHANGED,  3},
1207     {QUAL_CHANGED,  4},
1208     {QUAL_CHANGED,  5},
1209     {QUAL_CHANGED,  6},
1210     {QUAL_CHANGED,  7},
1211     {QUAL_CHANGED,  8},
1212     {SEQ_CHANGED,  8},
1213     {MAKE_UNALIGNED,  9},
1214     {MAKE_UNALIGNED, 10},
1215     {MAKE_UNALIGNED, 11},
1216     {MAKE_UNALIGNED, 12},
1217     {MAKE_UNALIGNED, 13},
1218     {MAKE_UNALIGNED, 14},
1219     {MAKE_UNALIGNED, 15},
1220     {MAKE_UNALIGNED, 16},
1221     {MAKE_UNALIGNED, 17},
1222     {MAKE_UNALIGNED, 18},
1223     {MAKE_UNFRAGMENTED, 19},
1224     {MAKE_UNFRAGMENTED, 20},
1225     {MATE_LOST, 21},
1226     {MATE_LOST, 22},
1227     {MATE_LOST, 23},
1228     {DISCARDED, 24},
1229     {DISCARDED, 25},
1230     {DISCARDED, 26},
1231     {DISCARDED, 17},
1232     {REF_NAME_CHANGED, 27},
1233     {CIGAR_CHANGED, 28},
1234     {DISCARDED, 29},
1235     {DISCARDED, 30},
1236 };
1237 
1238 #define NUMBER_OF_CHANGES ((unsigned)(sizeof(CHANGES)/sizeof(CHANGES[0])))
1239 static unsigned change_counter[NUMBER_OF_CHANGES];
1240 
LOG_CHANGE(unsigned const change)1241 static void LOG_CHANGE(unsigned const change)
1242 {
1243     ++change_counter[change];
1244 }
1245 
PrintChangeReport(void)1246 static void PrintChangeReport(void)
1247 {
1248     unsigned i;
1249 
1250     for (i = 0; i != NUMBER_OF_CHANGES; ++i) {
1251         if (change_counter[i] > 0) {
1252             char const *const what = CHANGED[CHANGES[i].what];
1253             char const *const why  = REASONS[CHANGES[i].why];
1254 
1255             PLOGMSG(klogInfo, (klogInfo, "$(what) $(times) times because $(reason)", "what=%s,reason=%s,times=%u", what, why, change_counter[i]));
1256         }
1257     }
1258 }
1259 
RecordChange(KMDataNode * const node,char const node_name[],unsigned const node_number,char const what[],char const why[],unsigned const count)1260 static rc_t RecordChange(KMDataNode *const node,
1261                          char const node_name[],
1262                          unsigned const node_number,
1263                          char const what[],
1264                          char const why[],
1265                          unsigned const count)
1266 {
1267     KMDataNode *sub = NULL;
1268     rc_t const rc_sub = KMDataNodeOpenNodeUpdate(node, &sub, "%s_%u", node_name, node_number);
1269 
1270     if (rc_sub) return rc_sub;
1271     {
1272         uint32_t const count_temp = count;
1273         rc_t const rc_attr1 = KMDataNodeWriteAttr(sub, "change", what);
1274         rc_t const rc_attr2 = KMDataNodeWriteAttr(sub, "reason", why);
1275         rc_t const rc_value = KMDataNodeWriteB32(sub, &count_temp);
1276 
1277         KMDataNodeRelease(sub);
1278         if (rc_attr1) return rc_attr1;
1279         if (rc_attr2) return rc_attr2;
1280         if (rc_value) return rc_value;
1281 
1282         return 0;
1283     }
1284 }
1285 
RecordChanges(KMDataNode * const node,char const name[])1286 static rc_t RecordChanges(KMDataNode *const node, char const name[])
1287 {
1288     if (node) {
1289         unsigned i;
1290         unsigned j = 0;
1291 
1292         for (i = 0; i != NUMBER_OF_CHANGES; ++i) {
1293             if (change_counter[i] > 0) {
1294                 char const *const what = CHANGED[CHANGES[i].what];
1295                 char const *const why  = REASONS[CHANGES[i].why];
1296                 rc_t const rc = RecordChange(node, name, ++j, what, why, change_counter[i]);
1297 
1298                 if (rc) return rc;
1299             }
1300         }
1301     }
1302     return 0;
1303 }
1304 
1305 #define FLAG_CHANGED_400_AND_200   do { LOG_CHANGE( 0); } while(0)
1306 #define FLAG_CHANGED_PCR_DUP       do { LOG_CHANGE( 1); } while(0)
1307 #define FLAG_CHANGED_PRIMARY_DUP   do { LOG_CHANGE( 2); } while(0)
1308 #define FLAG_CHANGED_WAS_UNALIGNED do { LOG_CHANGE( 3); } while(0)
1309 #define QUAL_CHANGED_OQ            do { LOG_CHANGE( 4); } while(0)
1310 #define QUAL_CHANGED_UNALIGNED_CS  do { LOG_CHANGE( 5); } while(0)
1311 #define QUAL_CHANGED_ALIGNED_EDIT  do { LOG_CHANGE( 6); } while(0)
1312 #define QUAL_CHANGED_UNALIGN_EDIT  do { LOG_CHANGE( 7); } while(0)
1313 #define QUAL_CHANGED_REVERSED      do { LOG_CHANGE( 8); } while(0)
1314 #define SEQ__CHANGED_REV_COMP      do { LOG_CHANGE( 9); } while(0)
1315 #define UNALIGNED_LOW_MAPQ         do { LOG_CHANGE(10); } while(0)
1316 #define UNALIGNED_LOW_MATCH_COUNT  do { LOG_CHANGE(11); } while(0)
1317 #define UNALIGNED_MISSING_INFO     do { LOG_CHANGE(12); } while(0)
1318 #define UNALIGNED_MISSING_REF_POS  do { LOG_CHANGE(13); } while(0)
1319 #define UNALIGNED_INVALID_INFO     do { LOG_CHANGE(14); } while(0)
1320 #define UNALIGNED_INVALID_REF_POS  do { LOG_CHANGE(15); } while(0)
1321 #define UNALIGNED_INVALID_REF      do { LOG_CHANGE(16); } while(0)
1322 #define UNALIGNED_UNALIGNED_REF    do { LOG_CHANGE(17); } while(0)
1323 #define UNALIGNED_UNKNOWN_REF      do { LOG_CHANGE(18); } while(0)
1324 #define UNALIGNED_HARD_CLIPPED_CS  do { LOG_CHANGE(19); } while(0)
1325 #define UNFRAGMENT_MISSING_INFO    do { LOG_CHANGE(20); } while(0)
1326 #define UNFRAGMENT_TOO_MANY        do { LOG_CHANGE(21); } while(0)
1327 #define MATE_INFO_LOST_INVALID     do { LOG_CHANGE(22); } while(0)
1328 #define MATE_INFO_LOST_MISSING     do { LOG_CHANGE(23); } while(0)
1329 #define MATE_INFO_LOST_UNKNOWN_REF do { LOG_CHANGE(24); } while(0)
1330 #define DISCARD_PCR_DUP            do { LOG_CHANGE(25); } while(0)
1331 #define DISCARD_BAD_FRAGMENT_INFO  do { LOG_CHANGE(26); } while(0)
1332 #define DISCARD_SKIP_REFERENCE     do { LOG_CHANGE(27); } while(0)
1333 #define DISCARD_UNKNOWN_REFERENCE  do { LOG_CHANGE(28); } while(0)
1334 #define RENAMED_REFERENCE          do { LOG_CHANGE(29); } while(0)
1335 #define OVERHANGING_ALIGNMENT      do { LOG_CHANGE(30); } while(0)
1336 #define DISCARD_HARDCLIP_SECONDARY do { LOG_CHANGE(31); } while(0)
1337 #define DISCARD_BAD_SECONDARY      do { LOG_CHANGE(32); } while(0)
1338 
isHardClipped(unsigned const ops,uint32_t const cigar[])1339 static bool isHardClipped(unsigned const ops, uint32_t const cigar[/* ops */])
1340 {
1341     unsigned i;
1342 
1343     for (i = 0; i < ops; ++i) {
1344         uint32_t const op = cigar[i];
1345         int const code = op & 0x0F;
1346 
1347         if (code == 5)
1348             return true;
1349     }
1350     return false;
1351 }
1352 
FixOverhangingAlignment(KDataBuffer * cigBuf,uint32_t * opCount,uint32_t refPos,uint32_t refLen,uint32_t readlen)1353 static rc_t FixOverhangingAlignment(KDataBuffer *cigBuf, uint32_t *opCount, uint32_t refPos, uint32_t refLen, uint32_t readlen)
1354 {
1355     uint32_t const *cigar = cigBuf->base;
1356     int refend = refPos;
1357     int seqpos = 0;
1358     unsigned i;
1359 
1360     for (i = 0; i < *opCount; ++i) {
1361         uint32_t const op = cigar[i];
1362         int const len = op >> 4;
1363         int const code = op & 0x0F;
1364 
1365         switch (code) {
1366         case 0: /* M */
1367         case 7: /* = */
1368         case 8: /* X */
1369             seqpos += len;
1370             refend += len;
1371             break;
1372         case 2: /* D */
1373         case 3: /* N */
1374             refend += len;
1375             break;
1376         case 1: /* I */
1377         case 4: /* S */
1378         case 9: /* B */
1379             seqpos += len;
1380         default:
1381             break;
1382         }
1383         if (refend > refLen) {
1384             int const chop = refend - refLen;
1385             int const newlen = len - chop;
1386             int const left = seqpos - chop;
1387             if (left * 2 > readlen) {
1388                 int const clip = readlen - left;
1389                 rc_t rc;
1390 
1391                 *opCount = i + 2;
1392                 rc = KDataBufferResize(cigBuf, *opCount);
1393                 if (rc) return rc;
1394                 ((uint32_t *)cigBuf->base)[i  ] = (newlen << 4) | code;
1395                 ((uint32_t *)cigBuf->base)[i+1] = (clip << 4) | 4;
1396                 OVERHANGING_ALIGNMENT;
1397                 break;
1398             }
1399         }
1400     }
1401     return 0;
1402 }
1403 
1404 static context_t GlobalContext;
1405 static KQueue *bamq;
1406 static KThread *bamread_thread;
1407 
BAM_FileReadDetached(BAM_File const * self,BAM_Alignment ** rec)1408 static rc_t BAM_FileReadDetached(BAM_File const *self, BAM_Alignment **rec)
1409 {
1410     BAM_Alignment const *crec = NULL;
1411     rc_t const rc = BAM_FileRead2(self, &crec);
1412     if (rc == 0) {
1413         if ((*rec = BAM_AlignmentDetach(crec)) != NULL)
1414             return 0;
1415         return RC(rcAlign, rcFile, rcReading, rcMemory, rcExhausted);
1416     }
1417     BAM_AlignmentRelease(crec);
1418     return rc;
1419 }
1420 
run_bamread_thread(const KThread * self,void * const file)1421 static rc_t run_bamread_thread(const KThread *self, void *const file)
1422 {
1423     rc_t rc = 0;
1424     size_t NR = 0;
1425 
1426     while (rc == 0) {
1427         BAM_Alignment *rec = NULL;
1428 
1429         ++NR;
1430         rc = BAM_FileReadDetached(file, &rec);
1431         if ((int)GetRCObject(rc) == rcRow && (int)GetRCState(rc) == rcEmpty) {
1432             rc = CheckLimitAndLogError();
1433             continue;
1434         }
1435         if ((int)GetRCObject(rc) == rcRow && (int)GetRCState(rc) == rcNotFound) {
1436             /* EOF */
1437             rc = 0;
1438             --NR;
1439             break;
1440         }
1441         if (rc) break;
1442 
1443         {
1444             static char const dummy[] = "";
1445             char const *spotGroup;
1446             char const *name;
1447             size_t namelen;
1448 
1449             BAM_AlignmentGetReadName2(rec, &name, &namelen);
1450             BAM_AlignmentGetReadGroupName(rec, &spotGroup);
1451             rc = GetKeyID(&GlobalContext.keyToID, &rec->keyId, &rec->wasInserted, spotGroup ? spotGroup : dummy, name, namelen);
1452             if (rc) break;
1453         }
1454 
1455         for ( ; ; ) {
1456             timeout_t tm;
1457             TimeoutInit(&tm, 1000);
1458             rc = KQueuePush(bamq, rec, &tm);
1459             if (rc == 0 || (int)GetRCObject(rc) != rcTimeout)
1460                 break;
1461         }
1462     }
1463     KQueueSeal(bamq);
1464     if (rc) {
1465         (void)LOGERR(klogErr, rc, "bamread_thread done");
1466     }
1467     else {
1468         (void)PLOGMSG(klogInfo, (klogInfo, "bamread_thread done; read $(NR) records", "NR=%lu", NR));
1469     }
1470     return rc;
1471 }
1472 
1473 /* call on main thread only */
getNextRecord(BAM_File const * const bam,rc_t * const rc)1474 static BAM_Alignment const *getNextRecord(BAM_File const *const bam, rc_t *const rc)
1475 {
1476     if (bamq == NULL) {
1477         *rc = KQueueMake(&bamq, 4096);
1478         if (*rc) return NULL;
1479         *rc = KThreadMake(&bamread_thread, run_bamread_thread, (void *)bam);
1480         if (*rc) {
1481             KQueueRelease(bamq);
1482             bamq = NULL;
1483             return NULL;
1484         }
1485     }
1486     while (*rc == 0 && (*rc = Quitting()) == 0) {
1487         BAM_Alignment const *rec = NULL;
1488         timeout_t tm;
1489 
1490         TimeoutInit(&tm, 10000);
1491         *rc = KQueuePop(bamq, (void **)&rec, &tm);
1492         if (*rc == 0)
1493             return rec; /* this is the normal return */
1494 
1495         if ((int)GetRCObject(*rc) == rcTimeout)
1496             *rc = 0;
1497         else {
1498             if ((int)GetRCObject(*rc) == rcData && (int)GetRCState(*rc) == rcDone)
1499                 (void)LOGMSG(klogDebug, "KQueuePop Done");
1500             else
1501                 (void)PLOGERR(klogWarn, (klogWarn, *rc, "KQueuePop Error", NULL));
1502         }
1503     }
1504     {
1505         rc_t rc2 = 0;
1506         KThreadWait(bamread_thread, &rc2);
1507         if (rc2 != 0)
1508             *rc = rc2; // return the rc from the reader thread
1509     }
1510     KThreadRelease(bamread_thread);
1511     bamread_thread = NULL;
1512 	KQueueRelease(bamq);
1513     bamq = NULL;
1514     return NULL;
1515 }
1516 
getSpotGroup(BAM_Alignment const * const rec,char spotGroup[])1517 static void getSpotGroup(BAM_Alignment const *const rec, char spotGroup[])
1518 {
1519     char const *rgname;
1520 
1521     BAM_AlignmentGetReadGroupName(rec, &rgname);
1522     if (rgname)
1523         strcpy(spotGroup, rgname);
1524     else
1525         spotGroup[0] = '\0';
1526 }
1527 
getLinkageGroup(BAM_Alignment const * const rec)1528 static char const *getLinkageGroup(BAM_Alignment const *const rec)
1529 {
1530     static char linkageGroup[1024];
1531     char const *BX = NULL;
1532     char const *CB = NULL;
1533     char const *UB = NULL;
1534 
1535     linkageGroup[0] = '\0';
1536     BAM_AlignmentGetLinkageGroup(rec, &BX, &CB, &UB);
1537     if (BX == NULL) {
1538         if (CB != NULL && UB != NULL) {
1539             unsigned const cblen = strlen(CB);
1540             unsigned const ublen = strlen(UB);
1541             if (cblen + ublen + 8 < sizeof(linkageGroup)) {
1542                 memmove(&linkageGroup[        0], "CB:", 3);
1543                 memmove(&linkageGroup[        3], CB, cblen);
1544                 memmove(&linkageGroup[cblen + 3], "|UB:", 4);
1545                 memmove(&linkageGroup[cblen + 7], UB, ublen + 1);
1546             }
1547         }
1548     }
1549     else {
1550         unsigned const bxlen = strlen(BX);
1551         if (bxlen + 1 < sizeof(linkageGroup))
1552             memmove(linkageGroup, BX, bxlen + 1);
1553     }
1554     return linkageGroup;
1555 }
1556 
ProcessBAM(char const bamFile[],context_t * ctx,VDatabase * db,Reference * ref,Sequence * seq,Alignment * align,bool * had_alignments,bool * had_sequences)1557 static rc_t ProcessBAM(char const bamFile[], context_t *ctx, VDatabase *db,
1558                         /* data outputs */
1559                        Reference *ref, Sequence *seq, Alignment *align,
1560                        /* output parameters */
1561                        bool *had_alignments, bool *had_sequences)
1562 {
1563     const BAM_File *bam;
1564     const BAM_Alignment *rec;
1565     KDataBuffer buf;
1566     KDataBuffer fragBuf;
1567     KDataBuffer cigBuf;
1568     rc_t rc;
1569     const BAMRefSeq *refSeq = NULL;
1570     int32_t lastRefSeqId = -1;
1571     bool wasRenamed = false;
1572     size_t rsize;
1573     uint64_t keyId = 0;
1574     uint64_t reccount = 0;
1575     char spotGroup[512];
1576     size_t namelen;
1577     float progress = 0.0;
1578     unsigned warned = 0;
1579     long     fcountBoth=0;
1580     long     fcountOne=0;
1581     int skipRefSeqID = -1;
1582     int unmapRefSeqId = -1;
1583     uint64_t recordsRead = 0;
1584     uint64_t recordsProcessed = 0;
1585     uint64_t filterFlagConflictRecords=0; /*** counts number of conflicts between flags 0x400 and 0x200 ***/
1586 #define MAX_WARNINGS_FLAG_CONFLICT 10000 /*** maximum errors to report ***/
1587 
1588     bool isColorSpace = false;
1589     bool isNotColorSpace = G.noColorSpace;
1590     char alignGroup[32];
1591     size_t alignGroupLen;
1592     AlignmentRecord data;
1593     KDataBuffer seqBuffer;
1594     KDataBuffer qualBuffer;
1595     SequenceRecord srec;
1596     SequenceRecordStorage srecStorage;
1597 
1598     /* setting up buffers */
1599     memset(&data, 0, sizeof(data));
1600     memset(&srec, 0, sizeof(srec));
1601 
1602     srec.ti             = srecStorage.ti;
1603     srec.readStart      = srecStorage.readStart;
1604     srec.readLen        = srecStorage.readLen;
1605     srec.orientation    = srecStorage.orientation;
1606     srec.is_bad         = srecStorage.is_bad;
1607     srec.alignmentCount = srecStorage.alignmentCount;
1608     srec.aligned        = srecStorage.aligned;
1609     srec.cskey          = srecStorage. cskey;
1610 
1611     rc = OpenBAM(&bam, db, bamFile);
1612     if (rc) return rc;
1613     if (!G.noVerifyReferences && ref != NULL) {
1614         rc = VerifyReferences(bam, ref);
1615         if (G.onlyVerifyReferences) {
1616             BAM_FileRelease(bam);
1617             return rc;
1618         }
1619     }
1620     if (ctx->keyToID.key2id_max == 0) {
1621         uint32_t rgcount;
1622         unsigned rgi;
1623 
1624         BAM_FileGetReadGroupCount(bam, &rgcount);
1625         if (rgcount > (sizeof(ctx->keyToID.key2id)/sizeof(ctx->keyToID.key2id[0]) - 1))
1626             ctx->keyToID.key2id_max = 1;
1627         else
1628             ctx->keyToID.key2id_max = sizeof(ctx->keyToID.key2id)/sizeof(ctx->keyToID.key2id[0]);
1629 
1630         for (rgi = 0; rgi != rgcount; ++rgi) {
1631             BAMReadGroup const *rg;
1632 
1633             BAM_FileGetReadGroup(bam, rgi, &rg);
1634             if (rg && rg->platform && platform_cmp(rg->platform, "CAPILLARY")) {
1635                 G.hasTI = true;
1636                 break;
1637             }
1638         }
1639     }
1640 
1641     /* setting up more buffers */
1642     rc = KDataBufferMake(&cigBuf, 32, 0);
1643     if (rc)
1644         return rc;
1645 
1646     rc = KDataBufferMake(&fragBuf, 8, 1024);
1647     if (rc)
1648         return rc;
1649 
1650     rc = KDataBufferMake(&buf, 16, 0);
1651     if (rc)
1652         return rc;
1653 
1654     rc = KDataBufferMake(&seqBuffer, 8, 4096);
1655     if (rc)
1656         return rc;
1657 
1658     rc = KDataBufferMake(&qualBuffer, 8, 4096);
1659     if (rc)
1660         return rc;
1661 
1662     if (rc == 0) {
1663         (void)PLOGMSG(klogInfo, (klogInfo, "Loading '$(file)'", "file=%s", bamFile));
1664     }
1665 
1666     while ((rec = getNextRecord(bam, &rc)) != NULL) {
1667         bool aligned;
1668         uint32_t readlen;
1669         uint16_t flags;
1670         int64_t rpos=0;
1671         char *seqDNA;
1672         ctx_value_t *value;
1673         bool wasInserted;
1674         int32_t refSeqId=-1;
1675         uint8_t *qual;
1676         bool mated;
1677         const char *name;
1678         char cskey = 0;
1679         bool originally_aligned;
1680         bool isPrimary;
1681         uint32_t opCount;
1682         bool hasCG = false;
1683         uint64_t ti = 0;
1684         uint32_t csSeqLen = 0;
1685         int lpad = 0;
1686         int rpad = 0;
1687         bool hardclipped = false;
1688         bool revcmp = false;
1689         unsigned readNo = 0;
1690         bool wasPromoted = false;
1691         char const *barCode = NULL;
1692         char const *linkageGroup;
1693 
1694         ++recordsRead;
1695 
1696         BAM_AlignmentGetReadName2(rec, &name, &namelen);
1697 
1698         keyId = rec->keyId;
1699         wasInserted = rec->wasInserted;
1700 
1701         rc = MMArrayGet(ctx->id2value, (void **)&value, keyId);
1702         if (rc) {
1703             (void)PLOGERR(klogErr, (klogErr, rc, "MMArrayGet: failed on id '$(id)'", "id=%u", keyId));
1704             goto LOOP_END;
1705         }
1706 
1707         {
1708             float const new_value = BAM_FileGetProportionalPosition(bam) * 100.0;
1709             float const delta = new_value - progress;
1710             if (delta > 1.0) {
1711                 KLoadProgressbar_Process(ctx->progress[0], delta, false);
1712                 progress = new_value;
1713             }
1714         }
1715 
1716         BAM_AlignmentGetBarCode(rec, &barCode);
1717         linkageGroup = getLinkageGroup(rec);
1718 
1719         if (!G.noColorSpace) {
1720             if (BAM_AlignmentHasColorSpace(rec)) {
1721                 if (isNotColorSpace) {
1722 MIXED_BASE_AND_COLOR:
1723                     rc = RC(rcApp, rcFile, rcReading, rcData, rcInconsistent);
1724                     (void)PLOGERR(klogErr, (klogErr, rc, "File '$(file)' contains base space and color space", "file=%s", bamFile));
1725                     goto LOOP_END;
1726                 }
1727                 /* COLORSPACE is disabled!
1728                  * ctx->isColorSpace = isColorSpace = true; */
1729             }
1730             else if (isColorSpace)
1731                 goto MIXED_BASE_AND_COLOR;
1732             else
1733                 isNotColorSpace = true;
1734         }
1735         BAM_AlignmentGetFlags(rec, &flags);
1736 
1737         originally_aligned = (flags & BAMFlags_SelfIsUnmapped) == 0;
1738         aligned = originally_aligned;
1739 
1740         mated = false;
1741         if (flags & BAMFlags_WasPaired) {
1742             if ((flags & BAMFlags_IsFirst) != 0)
1743                 readNo |= 1;
1744             if ((flags & BAMFlags_IsSecond) != 0)
1745                 readNo |= 2;
1746             switch (readNo) {
1747             case 1:
1748             case 2:
1749                 mated = true;
1750                 break;
1751             case 0:
1752                 if ((warned & 1) == 0) {
1753                     (void)LOGMSG(klogWarn, "Spots without fragment info have been encountered");
1754                     warned |= 1;
1755                 }
1756                 UNFRAGMENT_MISSING_INFO;
1757                 break;
1758             case 3:
1759                 if ((warned & 2) == 0) {
1760                     (void)LOGMSG(klogWarn, "Spots with more than two fragments have been encountered");
1761                     warned |= 2;
1762                 }
1763                 UNFRAGMENT_TOO_MANY;
1764                 break;
1765             }
1766         }
1767         if (!mated)
1768             readNo = 1;
1769 
1770         isPrimary = (flags & (BAMFlags_IsNotPrimary|BAMFlags_IsSupplemental)) == 0 ? true : false;
1771         if (G.deferSecondary && !isPrimary && aligned && CTX_VALUE_GET_P_ID(*value, readNo - 1) == 0) {
1772             /* promote to primary alignment */
1773             isPrimary = true;
1774             wasPromoted = true;
1775         }
1776 
1777         getSpotGroup(rec, spotGroup);
1778         if (wasInserted) {
1779             if (G.mode == mode_Remap) {
1780                 (void)PLOGERR(klogErr, (klogErr, rc = RC(rcApp, rcFile, rcReading, rcData, rcInconsistent),
1781                                          "Spot '$(name)' is a new spot, not a remapping",
1782                                          "name=%s", name));
1783                 goto LOOP_END;
1784             }
1785             /* first time spot is seen                    */
1786             /* need to make sure that every goto LOOP_END */
1787             /* above this point is with rc != 0           */
1788             /* else this structure won't get initialized  */
1789             memset(value, 0, sizeof(*value));
1790             value->unmated = !mated;
1791             if (isPrimary || G.assembleWithSecondary || G.deferSecondary) {
1792                 value->pcr_dup = (flags & BAMFlags_IsDuplicate) == 0 ? 0 : 1;
1793                 value->platform = GetINSDCPlatform(bam, spotGroup);
1794                 value->primary_is_set = 1;
1795             }
1796         }
1797         if (!isPrimary && G.noSecondary)
1798             goto LOOP_END;
1799 
1800         rc = BAM_AlignmentCGReadLength(rec, &readlen);
1801         if (rc != 0 && GetRCState(rc) != rcNotFound) {
1802             (void)LOGERR(klogErr, rc, "Invalid CG data");
1803             goto LOOP_END;
1804         }
1805         if (rc == 0) {
1806             hasCG = true;
1807             BAM_AlignmentGetCigarCount(rec, &opCount);
1808             rc = KDataBufferResize(&cigBuf, opCount * 2 + 5);
1809             if (rc) {
1810                 (void)LOGERR(klogErr, rc, "Failed to resize CIGAR buffer");
1811                 goto LOOP_END;
1812             }
1813             rc = AlignmentRecordInit(&data, readlen);
1814             if (rc == 0)
1815                 rc = KDataBufferResize(&buf, readlen);
1816             if (rc) {
1817                 (void)LOGERR(klogErr, rc, "Failed to resize record buffer");
1818                 goto LOOP_END;
1819             }
1820 
1821             seqDNA = buf.base;
1822             qual = (uint8_t *)&seqDNA[readlen];
1823 
1824             rc = BAM_AlignmentGetCGSeqQual(rec, seqDNA, qual);
1825             if (rc == 0) {
1826                 rc = BAM_AlignmentGetCGCigar(rec, cigBuf.base, cigBuf.elem_count, &opCount);
1827             }
1828             if (rc) {
1829                 (void)LOGERR(klogErr, rc, "Failed to read CG data");
1830                 goto LOOP_END;
1831             }
1832             data.data.align_group.elements = 0;
1833             data.data.align_group.buffer = alignGroup;
1834             if (BAM_AlignmentGetCGAlignGroup(rec, alignGroup, sizeof(alignGroup), &alignGroupLen) == 0)
1835                 data.data.align_group.elements = alignGroupLen;
1836         }
1837         else {
1838             /* normal flow i.e. NOT CG */
1839             uint32_t const *tmp;
1840 
1841             /* resize buffers */
1842             BAM_AlignmentGetReadLength(rec, &readlen);
1843             BAM_AlignmentGetRawCigar(rec, &tmp, &opCount);
1844             rc = KDataBufferResize(&cigBuf, opCount);
1845             assert(rc == 0);
1846             if (rc) {
1847                 (void)LOGERR(klogErr, rc, "Failed to resize CIGAR buffer");
1848                 goto LOOP_END;
1849             }
1850             memmove(cigBuf.base, tmp, opCount * sizeof(uint32_t));
1851 
1852             hardclipped = isHardClipped(opCount, cigBuf.base);
1853             if (hardclipped) {
1854                 if (isPrimary && !wasPromoted) {
1855                     /* when we promote a secondary to primary and it is hardclipped, we want to "fix" it */
1856                     if (!G.acceptHardClip) {
1857                         rc = RC(rcApp, rcFile, rcReading, rcConstraint, rcViolated);
1858                         (void)PLOGERR(klogErr, (klogErr, rc, "File '$(file)' contains hard clipped primary alignments", "file=%s", bamFile));
1859                         goto LOOP_END;
1860                     }
1861                 }
1862                 else if (!G.acceptHardClip) { /* convert to soft clip */
1863                     uint32_t *const cigar = cigBuf.base;
1864                     uint32_t const lOp = cigar[0];
1865                     uint32_t const rOp = cigar[opCount - 1];
1866 
1867                     lpad = (lOp & 0xF) == 5 ? (lOp >> 4) : 0;
1868                     rpad = (rOp & 0xF) == 5 ? (rOp >> 4) : 0;
1869 
1870                     if (lpad + rpad == 0) {
1871                         rc = RC(rcApp, rcFile, rcReading, rcData, rcInvalid);
1872                         (void)PLOGERR(klogErr, (klogErr, rc, "File '$(file)' contains invalid CIGAR", "file=%s", bamFile));
1873                         goto LOOP_END;
1874                     }
1875                     if (lpad != 0) {
1876                         uint32_t const new_lOp = (((uint32_t)lpad) << 4) | 4;
1877                         cigar[0] = new_lOp;
1878                     }
1879                     if (rpad != 0) {
1880                         uint32_t const new_rOp = (((uint32_t)rpad) << 4) | 4;
1881                         cigar[opCount - 1] = new_rOp;
1882                     }
1883                 }
1884             }
1885 
1886             if (G.deferSecondary && !isPrimary) {
1887                 /*** try to see if hard-clipped secondary alignment can be salvaged **/
1888                 if (readlen + lpad + rpad < 256 && readlen + lpad + rpad < value->fragment_len[readNo - 1]) {
1889                     rc = KDataBufferResize(&cigBuf, opCount + 1);
1890                     assert(rc == 0);
1891                     if (rc) {
1892                         (void)LOGERR(klogErr, rc, "Failed to resize CIGAR buffer");
1893                         goto LOOP_END;
1894                     }
1895                     if (rpad > 0 && lpad == 0) {
1896                         uint32_t *const cigar = cigBuf.base;
1897                         lpad =  value->fragment_len[readNo - 1] - readlen - rpad;
1898                         memmove(cigar + 1, cigar, opCount * sizeof(*cigar));
1899                         cigar[0] = (uint32_t)((lpad << 4) | 4);
1900                     }
1901                     else {
1902                         uint32_t *const cigar = cigBuf.base;
1903                         rpad += value->fragment_len[readNo - 1] - readlen - lpad;
1904                         cigar[opCount] = (uint32_t)((rpad << 4) | 4);
1905                     }
1906                     opCount++;
1907                 }
1908             }
1909             rc = AlignmentRecordInit(&data, readlen + lpad + rpad);
1910             assert(rc == 0);
1911             if (rc == 0)
1912                 rc = KDataBufferResize(&buf, readlen + lpad + rpad);
1913             assert(rc == 0);
1914             if (rc) {
1915                 (void)LOGERR(klogErr, rc, "Failed to resize record buffer");
1916                 goto LOOP_END;
1917             }
1918 
1919             seqDNA = buf.base;
1920             qual = (uint8_t *)&seqDNA[(readlen | csSeqLen) + lpad + rpad];
1921             memset(seqDNA, 'N', (readlen | csSeqLen) + lpad + rpad);
1922             memset(qual, 0, (readlen | csSeqLen) + lpad + rpad);
1923 
1924             BAM_AlignmentGetSequence(rec, seqDNA + lpad);
1925             if (G.useQUAL) {
1926                 uint8_t const *squal;
1927 
1928                 BAM_AlignmentGetQuality(rec, &squal);
1929                 memmove(qual + lpad, squal, readlen);
1930             }
1931             else {
1932                 uint8_t const *squal;
1933                 uint8_t qoffset = 0;
1934                 unsigned i;
1935 
1936                 rc = BAM_AlignmentGetQuality2(rec, &squal, &qoffset);
1937                 if (rc) {
1938                     (void)PLOGERR(klogErr, (klogErr, rc, "Spot '$(name)': length of original quality does not match sequence", "name=%s", name));
1939                     goto LOOP_END;
1940                 }
1941                 if (qoffset) {
1942                     for (i = 0; i != readlen; ++i)
1943                         qual[i + lpad] = squal[i] - qoffset;
1944                     QUAL_CHANGED_OQ;
1945                 }
1946                 else
1947                     memmove(qual + lpad, squal, readlen);
1948             }
1949             readlen = readlen + lpad + rpad;
1950             data.data.align_group.elements = 0;
1951             data.data.align_group.buffer = alignGroup;
1952         }
1953         if (G.hasTI) {
1954             rc = BAM_AlignmentGetTI(rec, &ti);
1955             if (rc)
1956                 ti = 0;
1957             rc = 0;
1958         }
1959 
1960         rc = KDataBufferResize(&seqBuffer, readlen);
1961         if (rc) {
1962             (void)LOGERR(klogErr, rc, "Failed to resize record buffer");
1963             goto LOOP_END;
1964         }
1965         rc = KDataBufferResize(&qualBuffer, readlen);
1966         if (rc) {
1967             (void)LOGERR(klogErr, rc, "Failed to resize record buffer");
1968             goto LOOP_END;
1969         }
1970         AR_REF_ORIENT(data) = (flags & BAMFlags_SelfIsReverse) == 0 ? false : true;
1971 
1972         rpos = -1;
1973         if (aligned) {
1974             BAM_AlignmentGetPosition(rec, &rpos);
1975             BAM_AlignmentGetRefSeqId(rec, &refSeqId);
1976             if (refSeqId != lastRefSeqId) {
1977                 refSeq = NULL;
1978                 BAM_FileGetRefSeqById(bam, refSeqId, &refSeq);
1979             }
1980         }
1981 
1982         revcmp = (isColorSpace && !aligned) ? false : AR_REF_ORIENT(data);
1983         (void)PLOGMSG(klogDebug, (klogDebug, "Read '$(name)' is $(or) at $(ref):$(pos)", "name=%s,or=%s,ref=%s,pos=%i", name, revcmp ? "reverse" : "forward", refSeq ? refSeq->name : "(none)", rpos));
1984         COPY_READ(seqBuffer.base, seqDNA, readlen, revcmp);
1985         COPY_QUAL(qualBuffer.base, qual, readlen, revcmp);
1986 
1987         AR_MAPQ(data) = GetMapQ(rec);
1988         if (!isPrimary && AR_MAPQ(data) < G.minMapQual)
1989             goto LOOP_END;
1990 
1991         if (aligned && align == NULL) {
1992             rc = RC(rcApp, rcFile, rcReading, rcData, rcInconsistent);
1993             (void)PLOGERR(klogErr, (klogErr, rc, "File '$(file)' contains aligned records", "file=%s", bamFile));
1994             goto LOOP_END;
1995         }
1996         while (aligned) {
1997             if (rpos >= 0 && refSeqId >= 0) {
1998                 if (refSeqId == skipRefSeqID) {
1999                     DISCARD_SKIP_REFERENCE;
2000                     goto LOOP_END;
2001                 }
2002                 if (refSeqId == unmapRefSeqId) {
2003                     aligned = false;
2004                     UNALIGNED_UNALIGNED_REF;
2005                     break;
2006                 }
2007                 unmapRefSeqId = -1;
2008                 if (refSeq == NULL) {
2009                     rc = SILENT_RC(rcApp, rcFile, rcReading, rcData, rcInconsistent);
2010                     (void)PLOGERR(klogWarn, (klogWarn, rc, "File '$(file)': Spot '$(name)' refers to an unknown Reference number $(refSeqId)", "file=%s,refSeqId=%i,name=%s", bamFile, (int)refSeqId, name));
2011                     rc = CheckLimitAndLogError();
2012                     DISCARD_UNKNOWN_REFERENCE;
2013                     goto LOOP_END;
2014                 }
2015                 else {
2016                     bool shouldUnmap = false;
2017 
2018                     if (G.refFilter && strcmp(G.refFilter, refSeq->name) != 0) {
2019                         (void)PLOGMSG(klogInfo, (klogInfo, "Skipping Reference '$(name)'", "name=%s", refSeq->name));
2020                         skipRefSeqID = refSeqId;
2021                         DISCARD_SKIP_REFERENCE;
2022                         goto LOOP_END;
2023                     }
2024 
2025                     rc = ReferenceSetFile(ref, refSeq->name, refSeq->length, refSeq->checksum, &shouldUnmap, &wasRenamed);
2026                     if (rc == 0) {
2027                         lastRefSeqId = refSeqId;
2028                         if (shouldUnmap) {
2029                             aligned = false;
2030                             unmapRefSeqId = refSeqId;
2031                             UNALIGNED_UNALIGNED_REF;
2032                         }
2033                         break;
2034                     }
2035                     if (GetRCObject(rc) == rcConstraint && GetRCState(rc) == rcViolated) {
2036                         int const level = G.limit2config ? klogWarn : klogErr;
2037 
2038                         (void)PLOGMSG(level, (level, "Could not find a Reference to match { name: '$(name)', length: $(rlen) }", "name=%s,rlen=%u", refSeq->name, (unsigned)refSeq->length));
2039                     }
2040                     else if (!G.limit2config)
2041                         (void)PLOGERR(klogErr, (klogErr, rc, "File '$(file)': Spot '$(sname)' refers to an unknown Reference '$(rname)'", "file=%s,rname=%s,sname=%s", bamFile, refSeq->name, name));
2042                     if (G.limit2config) {
2043                         rc = 0;
2044                         UNALIGNED_UNKNOWN_REF;
2045                     }
2046                     goto LOOP_END;
2047                 }
2048             }
2049             else if (refSeqId < 0) {
2050                 (void)PLOGMSG(klogWarn, (klogWarn, "Spot '$(name)' was marked aligned, but reference id = $(id) is invalid", "name=%.*s,id=%i", namelen, name, refSeqId));
2051                 if ((rc = CheckLimitAndLogError()) != 0) goto LOOP_END;
2052                 UNALIGNED_INVALID_REF;
2053             }
2054             else {
2055                 (void)PLOGMSG(klogWarn, (klogWarn, "Spot '$(name)' was marked aligned, but reference position = $(pos) is invalid", "name=%.*s,pos=%i", namelen, name, rpos));
2056                 if ((rc = CheckLimitAndLogError()) != 0) goto LOOP_END;
2057                 UNALIGNED_INVALID_REF_POS;
2058             }
2059 
2060             aligned = false;
2061         }
2062         if (!aligned && (G.refFilter != NULL || G.limit2config)) {
2063             assert(!"this shouldn't happen");
2064             goto LOOP_END;
2065         }
2066 
2067         AR_KEY(data) = keyId;
2068         AR_READNO(data) = readNo;
2069 
2070         if (wasInserted) {
2071         }
2072         else if (isPrimary || G.assembleWithSecondary || G.deferSecondary) {
2073             /* other times */
2074             int o_pcr_dup = value->pcr_dup;
2075             int const n_pcr_dup = (flags & BAMFlags_IsDuplicate) == 0 ? 0 : 1;
2076 
2077             if (!value->primary_is_set) {
2078                 o_pcr_dup = n_pcr_dup;
2079                 value->primary_is_set = 1;
2080             }
2081 
2082             value->pcr_dup = o_pcr_dup & n_pcr_dup;
2083             if (o_pcr_dup != (o_pcr_dup & n_pcr_dup)) {
2084                 FLAG_CHANGED_PCR_DUP;
2085             }
2086             if (mated && value->unmated) {
2087                 (void)PLOGERR(klogWarn, (klogWarn, SILENT_RC(rcApp, rcFile, rcReading, rcData, rcInconsistent),
2088                                          "Spot '$(name)', which was first seen without mate info, now has mate info",
2089                                          "name=%s", name));
2090                 rc = CheckLimitAndLogError();
2091                 DISCARD_BAD_FRAGMENT_INFO;
2092                 goto LOOP_END;
2093             }
2094             else if (!mated && !value->unmated) {
2095                 (void)PLOGERR(klogWarn, (klogWarn, SILENT_RC(rcApp, rcFile, rcReading, rcData, rcInconsistent),
2096                                          "Spot '$(name)', which was first seen with mate info, now has no mate info",
2097                                          "name=%s", name));
2098                 rc = CheckLimitAndLogError();
2099                 DISCARD_BAD_FRAGMENT_INFO;
2100                 goto LOOP_END;
2101             }
2102         }
2103         if (isPrimary) {
2104             switch (readNo) {
2105             case 1:
2106                 if (CTX_VALUE_GET_P_ID(*value, 0) != 0) {
2107                     isPrimary = false;
2108                     FLAG_CHANGED_PRIMARY_DUP;
2109                 }
2110                 else if (aligned && value->unaligned_1) {
2111                     (void)PLOGMSG(klogWarn, (klogWarn, "Read 1 of spot '$(name)', which was unmapped, is now being mapped at position $(pos) on reference '$(ref)'; this alignment will be considered as secondary", "name=%s,ref=%s,pos=%u", name, refSeq->name, rpos));
2112                     isPrimary = false;
2113                     FLAG_CHANGED_WAS_UNALIGNED;
2114                 }
2115                 break;
2116             case 2:
2117                 if (CTX_VALUE_GET_P_ID(*value, 1) != 0) {
2118                     isPrimary = false;
2119                     FLAG_CHANGED_PRIMARY_DUP;
2120                 }
2121                 else if (aligned && value->unaligned_2) {
2122                     (void)PLOGMSG(klogWarn, (klogWarn, "Read 2 of spot '$(name)', which was unmapped, is now being mapped at position $(pos) on reference '$(ref)'; this alignment will be considered as secondary", "name=%s,ref=%s,pos=%u", name, refSeq->name, rpos));
2123                     isPrimary = false;
2124                     FLAG_CHANGED_WAS_UNALIGNED;
2125                 }
2126                 break;
2127             default:
2128                 break;
2129             }
2130         }
2131         if (hardclipped) {
2132             value->hardclipped = 1;
2133         }
2134 #if 0 /** EY TO REVIEW **/
2135         if (!isPrimary && value->hardclipped) {
2136             DISCARD_HARDCLIP_SECONDARY;
2137             goto LOOP_END;
2138         }
2139 #endif
2140 
2141         /* input is clean */
2142         ++recordsProcessed;
2143 
2144         data.isPrimary = isPrimary;
2145         if (aligned) {
2146             uint32_t matches = 0;
2147             uint32_t misses = 0;
2148             uint8_t rna_orient = ' ';
2149 
2150             FixOverhangingAlignment(&cigBuf, &opCount, rpos, refSeq->length, readlen);
2151             BAM_AlignmentGetRNAStrand(rec, &rna_orient);
2152             {
2153                 int const intronType = rna_orient == '+' ? NCBI_align_ro_intron_plus :
2154                                        rna_orient == '-' ? NCBI_align_ro_intron_minus :
2155                                                    hasCG ? NCBI_align_ro_complete_genomics :
2156                                                            NCBI_align_ro_intron_unknown;
2157                 rc = ReferenceRead(ref, &data, rpos, cigBuf.base, opCount, seqDNA, readlen, intronType, &matches, &misses);
2158             }
2159             if (rc == 0) {
2160                 int const i = readNo - 1;
2161                 int const clipped_rl = readlen < 255 ? readlen : 255;
2162                 if (i >= 0 && i < 2) {
2163                     int const rl = value->fragment_len[i];
2164 
2165                     if (rl == 0)
2166                         value->fragment_len[i] = clipped_rl;
2167                     else if (rl != clipped_rl) {
2168                         if (isPrimary) {
2169                             rc = RC(rcApp, rcFile, rcReading, rcConstraint, rcViolated);
2170                             (void)PLOGERR(klogErr, (klogErr, rc, "Primary alignment for '$(name)' has different length ($(len)) than previously recorded secondary alignment. Try to defer secondary alignment processing.",
2171                                                     "name=%s,len=%d", name, readlen));
2172                         }
2173                         else {
2174                             rc = SILENT_RC(rcApp, rcFile, rcReading, rcConstraint, rcViolated);
2175                             (void)PLOGERR(klogWarn, (klogWarn, rc, "Secondary alignment for '$(name)' has different length ($(len)) than previously recorded primary alignment; discarding secondary alignment.",
2176                                                      "name=%s,len=%d", name, readlen));
2177                             DISCARD_BAD_SECONDARY;
2178                             rc = CheckLimitAndLogError();
2179                         }
2180                         goto LOOP_END;
2181                     }
2182                 }
2183             }
2184             if (rc == 0 && (matches < G.minMatchCount || (matches == 0 && !G.acceptNoMatch))) {
2185                 if (isPrimary) {
2186                     if (misses > matches) {
2187                         RecordNoMatch(name, refSeq->name, rpos);
2188                         rc = LogNoMatch(name, refSeq->name, (unsigned)rpos, (unsigned)matches);
2189                         if (rc)
2190                             goto LOOP_END;
2191                     }
2192                 }
2193                 else {
2194                     (void)PLOGMSG(klogWarn, (klogWarn, "Spot '$(name)' contains too few ($(count)) matching bases to reference '$(ref)' at $(pos); discarding secondary alignment",
2195                                              "name=%s,ref=%s,pos=%u,count=%u", name, refSeq->name, (unsigned)rpos, (unsigned)matches));
2196                     DISCARD_BAD_SECONDARY;
2197                     rc = 0;
2198                     goto LOOP_END;
2199                 }
2200             }
2201             if (rc) {
2202                 aligned = false;
2203 
2204                 if (((int)GetRCObject(rc)) == ((int)rcData) && GetRCState(rc) == rcNotAvailable) {
2205                     /* because of code above converting hard clips to soft clips, this should be unreachable */
2206                     abort();
2207                 }
2208                 else if (((int)GetRCObject(rc)) == ((int)rcData)) {
2209                     UNALIGNED_INVALID_INFO;
2210                     (void)PLOGERR(klogWarn, (klogWarn, rc, "Spot '$(name)': bad alignment to reference '$(ref)' at $(pos)", "name=%s,ref=%s,pos=%u", name, refSeq->name, rpos));
2211                     /* Data errors may get reset; alignment will be unmapped at any rate */
2212                     rc = CheckLimitAndLogError();
2213                 }
2214                 else {
2215                     UNALIGNED_INVALID_REF_POS;
2216                     (void)PLOGERR(klogWarn, (klogWarn, rc, "Spot '$(name)': error reading reference '$(ref)' at $(pos)", "name=%s,ref=%s,pos=%u", name, refSeq->name, rpos));
2217                     rc = CheckLimitAndLogError();
2218                 }
2219                 if (rc) goto LOOP_END;
2220             }
2221         }
2222 
2223         if (!aligned && isPrimary) {
2224             switch (readNo) {
2225             case 1:
2226                 value->unaligned_1 = 1;
2227                 break;
2228             case 2:
2229                 value->unaligned_2 = 1;
2230                 break;
2231             default:
2232                 break;
2233             }
2234         }
2235         if (isPrimary) {
2236             switch (readNo) {
2237             case 1:
2238                 if (CTX_VALUE_GET_P_ID(*value, 0) == 0 && aligned) {
2239                     data.alignId = ++ctx->primaryId;
2240                     CTX_VALUE_SET_P_ID(*value, 0, data.alignId);
2241                 }
2242                 break;
2243             case 2:
2244                 if (CTX_VALUE_GET_P_ID(*value, 1) == 0 && aligned) {
2245                     data.alignId = ++ctx->primaryId;
2246                     CTX_VALUE_SET_P_ID(*value, 1, data.alignId);
2247                 }
2248                 break;
2249             default:
2250                 break;
2251             }
2252         }
2253         if (G.mode == mode_Archive)
2254             goto WRITE_SEQUENCE;
2255         else
2256             goto WRITE_ALIGNMENT;
2257         if (0) {
2258 WRITE_SEQUENCE:
2259             if (barCode) {
2260                 if (spotGroup[0] != '\0' && value->platform == SRA_PLATFORM_UNDEFINED) {
2261                     /* don't use bar code */
2262                 }
2263                 else {
2264                     unsigned const sglen = strlen(barCode);
2265                     if (sglen + 1 < sizeof(spotGroup))
2266                         memmove(spotGroup, barCode, sglen + 1);
2267                 }
2268             }
2269             if (mated) {
2270                 int64_t const spotId = CTX_VALUE_GET_S_ID(*value);
2271                 uint32_t const fragmentId = value->fragmentId;
2272                 bool const spotHasBeenWritten = (spotId != 0);
2273                 bool const spotHasFragmentInfo = (fragmentId != 0);
2274                 bool const spotIsFirstSeen = (spotHasBeenWritten || spotHasFragmentInfo) ? false : true;
2275 
2276                 if (spotHasBeenWritten) {
2277                     /* do nothing */
2278                 }
2279                 else if (spotIsFirstSeen) {
2280                     /* start spot assembly */
2281                     unsigned sz;
2282                     FragmentInfo fi;
2283                     int32_t mate_refSeqId = -1;
2284                     int64_t pnext = 0;
2285 
2286                     if (!isPrimary) {
2287                         if ( (!G.assembleWithSecondary || hardclipped) && !G.deferSecondary ) {
2288                             goto WRITE_ALIGNMENT;
2289                         }
2290                         (void)PLOGMSG(klogDebug, (klogDebug, "Spot '$(name)' (id $(id)) is being constructed from secondary alignment information", "id=%lx,name=%s", keyId, name));
2291                     }
2292                     memset(&fi, 0, sizeof(fi));
2293 
2294                     fi.aligned = isPrimary ? aligned : 0;
2295                     fi.ti = ti;
2296                     fi.orientation = AR_REF_ORIENT(data);
2297                     fi.readNo = readNo;
2298                     fi.sglen = strlen(spotGroup);
2299                     fi.lglen = strlen(linkageGroup);
2300 
2301                     fi.readlen = readlen;
2302                     fi.cskey = cskey;
2303                     fi.is_bad = (flags & BAMFlags_IsLowQuality) != 0;
2304                     sz = sizeof(fi) + 2*fi.readlen + fi.sglen + fi.lglen;
2305                     if (align) {
2306                         BAM_AlignmentGetMateRefSeqId(rec, &mate_refSeqId);
2307                         BAM_AlignmentGetMatePosition(rec, &pnext);
2308                     }
2309                     if(align && mate_refSeqId == refSeqId && pnext > 0 && pnext!=rpos /*** weird case in some bams**/){
2310                         rc = MemBankAlloc(ctx->frags, &value->fragmentId, sz, 0, false);
2311                         fcountBoth++;
2312                     } else {
2313                         rc = MemBankAlloc(ctx->frags, &value->fragmentId, sz, 0, true);
2314                         fcountOne++;
2315                     }
2316                     if (rc) {
2317                         (void)LOGERR(klogErr, rc, "KMemBankAlloc failed");
2318                         goto LOOP_END;
2319                     }
2320                     /*printf("IN:%10d\tcnt2=%ld\tcnt1=%ld\n",value->fragmentId,fcountBoth,fcountOne);*/
2321 
2322                     rc = KDataBufferResize(&fragBuf, sz);
2323                     if (rc) {
2324                         (void)LOGERR(klogErr, rc, "Failed to resize fragment buffer");
2325                         goto LOOP_END;
2326                     }
2327                     {{
2328                         uint8_t *dst = (uint8_t*) fragBuf.base;
2329 
2330                         memmove(dst,&fi,sizeof(fi));
2331                         dst += sizeof(fi);
2332                         memmove(dst, seqBuffer.base, readlen);
2333                         dst += readlen;
2334                         memmove(dst, qualBuffer.base, readlen);
2335                         dst += fi.readlen;
2336                         memmove(dst, spotGroup, fi.sglen);
2337                         dst += fi.sglen;
2338                         memmove(dst, linkageGroup, fi.lglen);
2339                         dst += fi.lglen;
2340                     }}
2341                     rc = MemBankWrite(ctx->frags, value->fragmentId, 0, fragBuf.base, sz, &rsize);
2342                     if (rc) {
2343                         (void)PLOGERR(klogErr, (klogErr, rc, "KMemBankWrite failed writing fragment $(id)", "id=%u", value->fragmentId));
2344                         goto LOOP_END;
2345                     }
2346                     if (revcmp) {
2347                         QUAL_CHANGED_REVERSED;
2348                         SEQ__CHANGED_REV_COMP;
2349                     }
2350                 }
2351                 else if (spotHasFragmentInfo) {
2352                     /* continue spot assembly */
2353                     FragmentInfo *fip;
2354                     {
2355                         size_t size1;
2356                         size_t size2;
2357 
2358                         rc = MemBankSize(ctx->frags, fragmentId, &size1);
2359                         if (rc) {
2360                             (void)PLOGERR(klogErr, (klogErr, rc, "KMemBankSize failed on fragment $(id)", "id=%u", fragmentId));
2361                             goto LOOP_END;
2362                         }
2363 
2364                         rc = KDataBufferResize(&fragBuf, size1);
2365                         fip = (FragmentInfo *)fragBuf.base;
2366                         if (rc) {
2367                             (void)PLOGERR(klogErr, (klogErr, rc, "Failed to resize fragment buffer", ""));
2368                             goto LOOP_END;
2369                         }
2370 
2371                         rc = MemBankRead(ctx->frags, fragmentId, 0, fragBuf.base, size1, &size2);
2372                         if (rc) {
2373                             (void)PLOGERR(klogErr, (klogErr, rc, "KMemBankRead failed on fragment $(id)", "id=%u", fragmentId));
2374                             goto LOOP_END;
2375                         }
2376                         assert(size1 == size2);
2377                     }
2378                     if (readNo == fip->readNo) {
2379                         /* is a repeat of the same read; do nothing */
2380                     }
2381                     else {
2382                         /* mate found; finish spot assembly */
2383                         unsigned read1 = 0;
2384                         unsigned read2 = 1;
2385                         char const *const seq1 = (void *)&fip[1];
2386                         char const *const qual1 = (void *)(seq1 + fip->readlen);
2387                         char const *const sg1 = (void *)(qual1 + fip->readlen);
2388                         char const *const bx1 = (void *)(sg1 + fip->sglen);
2389 
2390                         if (!isPrimary) {
2391                             if ((!G.assembleWithSecondary || hardclipped) && !G.deferSecondary ) {
2392                                 goto WRITE_ALIGNMENT;
2393                             }
2394                             (void)PLOGMSG(klogDebug, (klogDebug, "Spot '$(name)' (id $(id)) is being constructed from secondary alignment information", "id=%lx,name=%s", keyId, name));
2395                         }
2396                         rc = KDataBufferResize(&seqBuffer, readlen + fip->readlen);
2397                         if (rc) {
2398                             (void)LOGERR(klogErr, rc, "Failed to resize record buffer");
2399                             goto LOOP_END;
2400                         }
2401                         rc = KDataBufferResize(&qualBuffer, readlen + fip->readlen);
2402                         if (rc) {
2403                             (void)LOGERR(klogErr, rc, "Failed to resize record buffer");
2404                             goto LOOP_END;
2405                         }
2406                         if (readNo < fip->readNo) {
2407                             read1 = 1;
2408                             read2 = 0;
2409                         }
2410 
2411                         memset(&srecStorage, 0, sizeof(srecStorage));
2412                         srec.numreads = 2;
2413                         srec.readLen[read1] = fip->readlen;
2414                         srec.readLen[read2] = readlen;
2415                         srec.readStart[1] = srec.readLen[0];
2416                         {
2417                             char const *const s1 = seq1;
2418                             char const *const s2 = seqBuffer.base;
2419                             char *const d = seqBuffer.base;
2420                             char *const d1 = d + srec.readStart[read1];
2421                             char *const d2 = d + srec.readStart[read2];
2422 
2423                             srec.seq = seqBuffer.base;
2424                             if (d2 != s2) {
2425                                 memmove(d2, s2, readlen);
2426                             }
2427                             memmove(d1, s1, fip->readlen);
2428                         }
2429                         {
2430                             char const *const s1 = qual1;
2431                             char const *const s2 = qualBuffer.base;
2432                             char *const d = qualBuffer.base;
2433                             char *const d1 = d + srec.readStart[read1];
2434                             char *const d2 = d + srec.readStart[read2];
2435 
2436                             srec.qual = qualBuffer.base;
2437                             if (d2 != s2) {
2438                                 memmove(d2, s2, readlen);
2439                             }
2440                             memmove(d1, s1, fip->readlen);
2441                         }
2442 
2443                         srec.ti[read1] = fip->ti;
2444                         srec.ti[read2] = ti;
2445 
2446                         srec.aligned[read1] = fip->aligned;
2447                         srec.aligned[read2] = isPrimary ? aligned : 0;
2448 
2449                         srec.is_bad[read1] = fip->is_bad;
2450                         srec.is_bad[read2] = (flags & BAMFlags_IsLowQuality) != 0;
2451 
2452                         srec.orientation[read1] = fip->orientation;
2453                         srec.orientation[read2] = AR_REF_ORIENT(data);
2454 
2455                         srec.cskey[read1] = fip->cskey;
2456                         srec.cskey[read2] = cskey;
2457 
2458                         srec.keyId = keyId;
2459 
2460                         srec.spotGroup = sg1;
2461                         srec.spotGroupLen = fip->sglen;
2462 
2463                         srec.linkageGroup = bx1;
2464                         srec.linkageGroupLen = fip->lglen;
2465 
2466                         srec.seq = seqBuffer.base;
2467                         srec.qual = qualBuffer.base;
2468 
2469                         rc = SequenceWriteRecord(seq, &srec, isColorSpace, value->pcr_dup, value->platform);
2470                         if (rc) {
2471                             (void)LOGERR(klogErr, rc, "SequenceWriteRecord failed");
2472                             goto LOOP_END;
2473                         }
2474                         CTX_VALUE_SET_S_ID(*value, ++ctx->spotId);
2475                         if(fragmentId & 1){
2476                             fcountOne--;
2477                         } else {
2478                             fcountBoth--;
2479                         }
2480                         /*	printf("OUT:%9d\tcnt2=%ld\tcnt1=%ld\n",fragmentId,fcountBoth,fcountOne);*/
2481                         rc = MemBankFree(ctx->frags, fragmentId);
2482                         if (rc) {
2483                             (void)PLOGERR(klogErr, (klogErr, rc, "KMemBankFree failed on fragment $(id)", "id=%u", fragmentId));
2484                             goto LOOP_END;
2485                         }
2486                         value->fragmentId = 0;
2487                         if (revcmp) {
2488                             QUAL_CHANGED_REVERSED;
2489                             SEQ__CHANGED_REV_COMP;
2490                         }
2491                         if (value->pcr_dup && (srec.is_bad[0] || srec.is_bad[1])) {
2492                             FLAG_CHANGED_400_AND_200;
2493                             filterFlagConflictRecords++;
2494                             if (filterFlagConflictRecords < MAX_WARNINGS_FLAG_CONFLICT) {
2495                                 (void)PLOGMSG(klogWarn, (klogWarn, "Spot '$(name)': both 0x400 and 0x200 flag bits set, only 0x400 will be saved", "name=%s", name));
2496                             }
2497                             else if (filterFlagConflictRecords == MAX_WARNINGS_FLAG_CONFLICT) {
2498                                 (void)PLOGMSG(klogWarn, (klogWarn, "Last reported warning: Spot '$(name)': both 0x400 and 0x200 flag bits set, only 0x400 will be saved", "name=%s", name));
2499                             }
2500                         }
2501                     }
2502                 }
2503                 else {
2504                     (void)PLOGMSG(klogErr, (klogErr, "Spot '$(name)' has caused the loader to enter an illogical state", "name=%s", name));
2505                     assert("this should never happen");
2506                 }
2507             }
2508             else if (CTX_VALUE_GET_S_ID(*value) == 0) {
2509                 /* new unmated fragment - no spot assembly */
2510                 if (!isPrimary) {
2511                     if ((!G.assembleWithSecondary || hardclipped) && !G.deferSecondary ) {
2512                         goto WRITE_ALIGNMENT;
2513                     }
2514                     (void)PLOGMSG(klogDebug, (klogDebug, "Spot '$(name)' (id $(id)) is being constructed from secondary alignment information", "id=%lx,name=%s", keyId, name));
2515                 }
2516                 memset(&srecStorage, 0, sizeof(srecStorage));
2517                 srec.numreads = 1;
2518 
2519                 srec.readLen[0] = readlen;
2520                 srec.ti[0] = ti;
2521                 srec.aligned[0] = isPrimary ? aligned : 0;
2522                 srec.is_bad[0] = (flags & BAMFlags_IsLowQuality) != 0;
2523                 srec.orientation[0] = AR_REF_ORIENT(data);
2524                 srec.cskey[0] = cskey;
2525 
2526                 srec.keyId = keyId;
2527 
2528                 srec.spotGroup = spotGroup;
2529                 srec.spotGroupLen = strlen(spotGroup);
2530 
2531                 srec.linkageGroup = linkageGroup;
2532                 srec.linkageGroupLen = strlen(linkageGroup);
2533 
2534                 srec.seq = seqBuffer.base;
2535                 srec.qual = qualBuffer.base;
2536 
2537                 rc = SequenceWriteRecord(seq, &srec, isColorSpace, value->pcr_dup, value->platform);
2538                 if (rc) {
2539                     (void)PLOGERR(klogErr, (klogErr, rc, "SequenceWriteRecord failed", ""));
2540                     goto LOOP_END;
2541                 }
2542                 CTX_VALUE_SET_S_ID(*value, ++ctx->spotId);
2543                 value->fragmentId = 0;
2544                 if (value->pcr_dup && srec.is_bad[0]) {
2545                     FLAG_CHANGED_400_AND_200;
2546                     filterFlagConflictRecords++;
2547                     if (filterFlagConflictRecords < MAX_WARNINGS_FLAG_CONFLICT) {
2548                         (void)PLOGMSG(klogWarn, (klogWarn, "Spot '$(name)': both 0x400 and 0x200 flag bits set, only 0x400 will be saved", "name=%s", name));
2549                     }
2550                     else if (filterFlagConflictRecords == MAX_WARNINGS_FLAG_CONFLICT) {
2551                         (void)PLOGMSG(klogWarn, (klogWarn, "Last reported warning: Spot '$(name)': both 0x400 and 0x200 flag bits set, only 0x400 will be saved", "name=%s", name));
2552                     }
2553                 }
2554                 if (revcmp) {
2555                     QUAL_CHANGED_REVERSED;
2556                     SEQ__CHANGED_REV_COMP;
2557                 }
2558             }
2559         }
2560 WRITE_ALIGNMENT:
2561         if (aligned) {
2562             if (mated && !isPrimary) {
2563                 int32_t bam_mrid;
2564                 int64_t mpos;
2565                 int64_t mrid = 0;
2566                 int64_t tlen;
2567 
2568                 BAM_AlignmentGetMatePosition(rec, &mpos);
2569                 BAM_AlignmentGetMateRefSeqId(rec, &bam_mrid);
2570                 BAM_AlignmentGetInsertSize(rec, &tlen);
2571 
2572                 if (mpos >= 0 && bam_mrid >= 0 && tlen != 0) {
2573                     BAMRefSeq const *mref;
2574 
2575                     BAM_FileGetRefSeq(bam, bam_mrid, &mref);
2576                     if (mref) {
2577                         rc_t rc_temp = ReferenceGet1stRow(ref, &mrid, mref->name);
2578                         if (rc_temp == 0) {
2579                             data.mate_ref_pos = mpos;
2580                             data.template_len = tlen;
2581                             data.mate_ref_orientation = (flags & BAMFlags_MateIsReverse) ? 1 : 0;
2582                         }
2583                         else {
2584                             (void)PLOGERR(klogWarn, (klogWarn, rc_temp, "Failed to get refID for $(name)", "name=%s", mref->name));
2585                             MATE_INFO_LOST_UNKNOWN_REF;
2586                         }
2587                         data.mate_ref_id = mrid;
2588                     }
2589                     else {
2590                         MATE_INFO_LOST_INVALID;
2591                     }
2592                 }
2593                 else if (mpos >= 0 || bam_mrid >= 0 || tlen != 0) {
2594                     MATE_INFO_LOST_MISSING;
2595                 }
2596             }
2597 
2598             if (wasRenamed) {
2599                 RENAMED_REFERENCE;
2600             }
2601             if (value->alignmentCount[readNo - 1] < 254)
2602                 ++value->alignmentCount[readNo - 1];
2603             ++ctx->alignCount;
2604 
2605             assert(keyId >> 32 < ctx->keyToID.key2id_count);
2606             assert((uint32_t)keyId < ctx->keyToID.idCount[keyId >> 32]);
2607 
2608             if (linkageGroup[0] != '\0') {
2609                 AR_LINKAGE_GROUP(data).elements = strlen(linkageGroup);
2610                 AR_LINKAGE_GROUP(data).buffer = linkageGroup;
2611             }
2612 
2613             rc = AlignmentWriteRecord(align, &data);
2614             if (rc == 0) {
2615                 if (!isPrimary)
2616                     data.alignId = ++ctx->secondId;
2617 
2618                 rc = ReferenceAddAlignId(ref, data.alignId, isPrimary);
2619                 if (rc) {
2620                     (void)PLOGERR(klogErr, (klogErr, rc, "ReferenceAddAlignId failed", ""));
2621                 }
2622                 else {
2623                     *had_alignments = true;
2624                 }
2625             }
2626             else {
2627                 (void)PLOGERR(klogErr, (klogErr, rc, "AlignmentWriteRecord failed", ""));
2628             }
2629         }
2630         /**************************************************************/
2631 
2632     LOOP_END:
2633         BAM_AlignmentRelease(rec);
2634         ++reccount;
2635         if (G.maxAlignCount > 0 && reccount >= G.maxAlignCount)
2636             break;
2637         if (rc == 0)
2638             *had_sequences = true;
2639         else
2640             break;
2641     }
2642     if (bamread_thread != NULL && bamq != NULL) {
2643         KQueueSeal(bamq);
2644         for ( ; ; ) {
2645             timeout_t tm;
2646             void *rr = NULL;
2647             rc_t rc2;
2648 
2649             TimeoutInit(&tm, 1000);
2650             rc2 = KQueuePop(bamq, &rr, &tm);
2651             if (rc2) break;
2652             BAM_AlignmentRelease((BAM_Alignment *)rr);
2653         }
2654         KThreadWait(bamread_thread, NULL);
2655     }
2656     KThreadRelease(bamread_thread);
2657     KQueueRelease(bamq);
2658 
2659     if (rc) {
2660         if (   (GetRCModule(rc) == rcCont && (int)GetRCObject(rc) == rcData && GetRCState(rc) == rcDone)
2661             || (GetRCModule(rc) == rcAlign && GetRCObject(rc) == rcRow && GetRCState(rc) == rcNotFound))
2662         {
2663             (void)PLOGMSG(klogInfo, (klogInfo, "EOF '$(file)'; processed $(proc)", "file=%s,read=%lu,proc=%lu", bamFile, (unsigned long)recordsRead, (unsigned long)recordsProcessed));
2664             rc = 0;
2665         }
2666         else {
2667             (void)PLOGERR(klogInfo, (klogInfo, rc, "Error '$(file)'; read $(read); processed $(proc)", "file=%s,read=%lu,proc=%lu", bamFile, (unsigned long)recordsRead, (unsigned long)recordsProcessed));
2668         }
2669     }
2670     if (filterFlagConflictRecords > 0) {
2671         (void)PLOGMSG(klogWarn, (klogWarn, "$(cnt1) out of $(cnt2) records contained warning : both 0x400 and 0x200 flag bits set, only 0x400 will be saved", "cnt1=%lu,cnt2=%lu", filterFlagConflictRecords,recordsProcessed));
2672     }
2673     if (rc == 0 && recordsProcessed == 0) {
2674         (void)LOGMSG(klogWarn, (G.limit2config || G.refFilter != NULL) ?
2675                      "All records from the file were filtered out" :
2676                      "The file contained no records that were processed.");
2677         rc = RC(rcAlign, rcFile, rcReading, rcData, rcEmpty);
2678     }
2679 
2680     BAM_FileRelease(bam);
2681     MMArrayLock(ctx->id2value);
2682     KDataBufferWhack(&seqBuffer);
2683     KDataBufferWhack(&qualBuffer);
2684     KDataBufferWhack(&buf);
2685     KDataBufferWhack(&fragBuf);
2686     KDataBufferWhack(&cigBuf);
2687     KDataBufferWhack(&data.buffer);
2688     return rc;
2689 }
2690 
WriteSoloFragments(context_t * ctx,Sequence * seq)2691 static rc_t WriteSoloFragments(context_t *ctx, Sequence *seq)
2692 {
2693     uint32_t i;
2694     unsigned j;
2695     uint64_t idCount = 0;
2696     rc_t rc;
2697     KDataBuffer fragBuf;
2698     SequenceRecordStorage srecStorage;
2699     SequenceRecord srec;
2700 
2701     ++ctx->pass;
2702     memset(&srec, 0, sizeof(srec));
2703 
2704     srec.ti             = srecStorage.ti;
2705     srec.readStart      = srecStorage.readStart;
2706     srec.readLen        = srecStorage.readLen;
2707     srec.orientation    = srecStorage.orientation;
2708     srec.is_bad         = srecStorage.is_bad;
2709     srec.alignmentCount = srecStorage.alignmentCount;
2710     srec.aligned        = srecStorage.aligned;
2711     srec.cskey          = srecStorage. cskey;
2712 
2713     rc = KDataBufferMake(&fragBuf, 8, 0);
2714     if (rc) {
2715         (void)LOGERR(klogErr, rc, "KDataBufferMake failed");
2716         return rc;
2717     }
2718     for (idCount = 0, j = 0; j < ctx->keyToID.key2id_count; ++j) {
2719         idCount += ctx->keyToID.idCount[j];
2720     }
2721     KLoadProgressbar_Append(ctx->progress[ctx->pass - 1], idCount);
2722 
2723     for (idCount = 0, j = 0; j < ctx->keyToID.key2id_count; ++j) {
2724         for (i = 0; i != ctx->keyToID.idCount[j]; ++i, ++idCount) {
2725             uint64_t const keyId = ((uint64_t)j << 32) | i;
2726             ctx_value_t *value;
2727             size_t rsize;
2728             size_t sz;
2729             char const *src;
2730             FragmentInfo const *fip;
2731 
2732             rc = MMArrayGet(ctx->id2value, (void **)&value, keyId);
2733             if (rc)
2734                 break;
2735             KLoadProgressbar_Process(ctx->progress[ctx->pass - 1], 1, false);
2736             if (value->fragmentId == 0)
2737                 continue;
2738 
2739             rc = MemBankSize(ctx->frags, value->fragmentId, &sz);
2740             if (rc) {
2741                 (void)LOGERR(klogErr, rc, "KMemBankSize failed");
2742                 break;
2743             }
2744             rc = KDataBufferResize(&fragBuf, (size_t)sz);
2745             if (rc) {
2746                 (void)LOGERR(klogErr, rc, "KDataBufferResize failed");
2747                 break;
2748             }
2749             rc = MemBankRead(ctx->frags, value->fragmentId, 0, fragBuf.base, sz, &rsize);
2750             if (rc) {
2751                 (void)LOGERR(klogErr, rc, "KMemBankRead failed");
2752                 break;
2753             }
2754             assert( rsize == sz );
2755 
2756             fip = fragBuf.base;
2757             src = (char const *)&fip[1];
2758 
2759             memset(&srecStorage, 0, sizeof(srecStorage));
2760             if (value->unmated) {
2761                 srec.numreads = 1;
2762                 srec.readLen[0] = fip->readlen;
2763                 srec.ti[0] = fip->ti;
2764                 srec.aligned[0] = fip->aligned;
2765                 srec.is_bad[0] = fip->is_bad;
2766                 srec.orientation[0] = fip->orientation;
2767                 srec.cskey[0] = fip->cskey;
2768             }
2769             else {
2770                 unsigned const read = ((fip->aligned && CTX_VALUE_GET_P_ID(*value, 0) == 0) || value->unaligned_2) ? 1 : 0;
2771 
2772                 srec.numreads = 2;
2773                 srec.readLen[read] = fip->readlen;
2774                 srec.readStart[1] = srec.readLen[0];
2775                 srec.ti[read] = fip->ti;
2776                 srec.aligned[read] = fip->aligned;
2777                 srec.is_bad[read] = fip->is_bad;
2778                 srec.orientation[read] = fip->orientation;
2779                 srec.cskey[0] = srec.cskey[1] = 'N';
2780                 srec.cskey[read] = fip->cskey;
2781             }
2782             srec.seq = (char *)src;
2783             srec.qual = (uint8_t *)(src + fip->readlen);
2784             srec.spotGroup = (char *)(src + 2 * fip->readlen);
2785             srec.spotGroupLen = fip->sglen;
2786             srec.linkageGroup = (char *)(src + 2 * fip->readlen * fip->sglen);
2787             srec.linkageGroupLen = fip->lglen;
2788             srec.keyId = keyId;
2789             rc = SequenceWriteRecord(seq, &srec, ctx->isColorSpace, value->pcr_dup, value->platform);
2790             if (rc) {
2791                 (void)LOGERR(klogErr, rc, "SequenceWriteRecord failed");
2792                 break;
2793             }
2794             /*rc = KMemBankFree(frags, id);*/
2795             CTX_VALUE_SET_S_ID(*value, ++ctx->spotId);
2796         }
2797     }
2798     MMArrayLock(ctx->id2value);
2799     KDataBufferWhack(&fragBuf);
2800     return rc;
2801 }
2802 
SequenceUpdateAlignInfo(context_t * ctx,Sequence * seq)2803 static rc_t SequenceUpdateAlignInfo(context_t *ctx, Sequence *seq)
2804 {
2805     rc_t rc = 0;
2806     uint64_t row;
2807     uint64_t keyId;
2808 
2809     ++ctx->pass;
2810     KLoadProgressbar_Append(ctx->progress[ctx->pass - 1], ctx->spotId + 1);
2811 
2812     for (row = 1; row <= ctx->spotId; ++row) {
2813         ctx_value_t *value;
2814 
2815         rc = SequenceReadKey(seq, row, &keyId);
2816         if (rc) {
2817             (void)PLOGERR(klogErr, (klogErr, rc, "Failed to get key for row $(row)", "row=%u", (unsigned)row));
2818             break;
2819         }
2820         rc = MMArrayGet(ctx->id2value, (void **)&value, keyId);
2821         if (rc) {
2822             (void)PLOGERR(klogErr, (klogErr, rc, "Failed to read info for row $(row), index $(idx)", "row=%u,idx=%u", (unsigned)row, (unsigned)keyId));
2823             break;
2824         }
2825         if (G.mode == mode_Remap) {
2826             CTX_VALUE_SET_S_ID(*value, row);
2827         }
2828         if (row != CTX_VALUE_GET_S_ID(*value)) {
2829             rc = RC(rcApp, rcTable, rcWriting, rcData, rcUnexpected);
2830             (void)PLOGMSG(klogErr, (klogErr, "Unexpected spot id $(spotId) for row $(row), index $(idx)", "spotId=%u,row=%u,idx=%u", (unsigned)CTX_VALUE_GET_S_ID(*value), (unsigned)row, (unsigned)keyId));
2831             break;
2832         }
2833         {{
2834             int64_t primaryId[2];
2835             int const logLevel = klogWarn; /*G.assembleWithSecondary ? klogWarn : klogErr;*/
2836 
2837             primaryId[0] = CTX_VALUE_GET_P_ID(*value, 0);
2838             primaryId[1] = CTX_VALUE_GET_P_ID(*value, 1);
2839 
2840             if (primaryId[0] == 0 && value->alignmentCount[0] != 0) {
2841                 rc = RC(rcApp, rcTable, rcWriting, rcConstraint, rcViolated);
2842                 (void)PLOGERR(logLevel, (logLevel, rc, "Spot id $(id) read 1 never had a primary alignment", "id=%lx", keyId));
2843             }
2844             if (!value->unmated && primaryId[1] == 0 && value->alignmentCount[1] != 0) {
2845                 rc = RC(rcApp, rcTable, rcWriting, rcConstraint, rcViolated);
2846                 (void)PLOGERR(logLevel, (logLevel, rc, "Spot id $(id) read 2 never had a primary alignment", "id=%lx", keyId));
2847             }
2848             if (rc != 0 && logLevel == klogErr)
2849                 break;
2850 
2851             rc = SequenceUpdateAlignData(seq, row, value->unmated ? 1 : 2,
2852                                          primaryId,
2853                                          value->alignmentCount);
2854         }}
2855         if (rc) {
2856             (void)LOGERR(klogErr, rc, "Failed updating Alignment data in sequence table");
2857             break;
2858         }
2859         KLoadProgressbar_Process(ctx->progress[ctx->pass - 1], 1, false);
2860     }
2861     MMArrayLock(ctx->id2value);
2862     return rc;
2863 }
2864 
AlignmentUpdateSpotInfo(context_t * ctx,Alignment * align)2865 static rc_t AlignmentUpdateSpotInfo(context_t *ctx, Alignment *align)
2866 {
2867     rc_t rc;
2868     uint64_t keyId;
2869 
2870     ++ctx->pass;
2871 
2872     KLoadProgressbar_Append(ctx->progress[ctx->pass - 1], ctx->alignCount);
2873 
2874     rc = AlignmentStartUpdatingSpotIds(align);
2875     while (rc == 0 && (rc = Quitting()) == 0) {
2876         ctx_value_t *value;
2877 
2878         rc = AlignmentGetSpotKey(align, &keyId);
2879         if (rc) {
2880             if (GetRCObject(rc) == rcRow && GetRCState(rc) == rcNotFound)
2881                 rc = 0;
2882             break;
2883         }
2884         assert(keyId >> 32 < ctx->keyToID.key2id_count);
2885         assert((uint32_t)keyId < ctx->keyToID.idCount[keyId >> 32]);
2886         rc = MMArrayGet(ctx->id2value, (void **)&value, keyId);
2887         if (rc == 0) {
2888             int64_t const spotId = CTX_VALUE_GET_S_ID(*value);
2889 
2890             if (spotId == 0) {
2891                 rc = RC(rcApp, rcTable, rcWriting, rcConstraint, rcViolated);
2892                 (void)PLOGERR(klogErr, (klogErr, rc, "Spot '$(id)' was never assigned a spot id, probably has no primary alignments", "id=%lx", keyId));
2893                 break;
2894             }
2895             rc = AlignmentWriteSpotId(align, spotId);
2896         }
2897         KLoadProgressbar_Process(ctx->progress[ctx->pass - 1], 1, false);
2898     }
2899     MMArrayLock(ctx->id2value);
2900     return rc;
2901 }
2902 
2903 
ArchiveBAM(VDBManager * mgr,VDatabase * db,unsigned bamFiles,char const * bamFile[],unsigned seqFiles,char const * seqFile[],bool * has_alignments,bool continuing)2904 static rc_t ArchiveBAM(VDBManager *mgr, VDatabase *db,
2905                        unsigned bamFiles, char const *bamFile[],
2906                        unsigned seqFiles, char const *seqFile[],
2907                        bool *has_alignments,
2908                        bool continuing)
2909 {
2910     rc_t rc = 0;
2911     rc_t rc2;
2912     Reference ref;
2913     Sequence seq;
2914     Alignment *align;
2915     static context_t *ctx = &GlobalContext;
2916     bool has_sequences = false;
2917     unsigned i;
2918 
2919     *has_alignments = false;
2920     rc = ReferenceInit(&ref, mgr, db);
2921     if (rc)
2922         return rc;
2923 
2924     if (G.onlyVerifyReferences) {
2925         for (i = 0; i < bamFiles && rc == 0; ++i) {
2926             rc = ProcessBAM(bamFile[i], NULL, db, &ref, NULL, NULL, NULL, NULL);
2927         }
2928         ReferenceWhack(&ref, false);
2929         return rc;
2930     }
2931     SequenceInit(&seq, db);
2932     align = AlignmentMake(db);
2933 
2934     rc = SetupContext(ctx, bamFiles + seqFiles);
2935     if (rc)
2936         return rc;
2937 
2938     ctx->pass = 1;
2939     for (i = 0; i < bamFiles && rc == 0; ++i) {
2940         bool this_has_alignments = false;
2941         bool this_has_sequences = false;
2942 
2943         rc = ProcessBAM(bamFile[i], ctx, db, &ref, &seq, align, &this_has_alignments, &this_has_sequences);
2944         *has_alignments |= this_has_alignments;
2945         has_sequences |= this_has_sequences;
2946     }
2947     for (i = 0; i < seqFiles && rc == 0; ++i) {
2948         bool this_has_alignments = false;
2949         bool this_has_sequences = false;
2950 
2951         rc = ProcessBAM(seqFile[i], ctx, db, &ref, &seq, align, &this_has_alignments, &this_has_sequences);
2952         *has_alignments |= this_has_alignments;
2953         has_sequences |= this_has_sequences;
2954     }
2955     if (!continuing) {
2956 /*** No longer need memory for key2id ***/
2957         for (i = 0; i != ctx->keyToID.key2id_count; ++i) {
2958             KBTreeDropBacking(ctx->keyToID.key2id[i]);
2959             KBTreeRelease(ctx->keyToID.key2id[i]);
2960             ctx->keyToID.key2id[i] = NULL;
2961         }
2962         free(ctx->keyToID.key2id_names);
2963         ctx->keyToID.key2id_names = NULL;
2964 /*******************/
2965     }
2966 
2967     if (has_sequences) {
2968         if (rc == 0 && (rc = Quitting()) == 0) {
2969             if (G.mode == mode_Archive) {
2970                 (void)LOGMSG(klogInfo, "Writing unpaired sequences");
2971                 rc = WriteSoloFragments(ctx, &seq);
2972                 ContextReleaseMemBank(ctx);
2973             }
2974             if (rc == 0) {
2975                 rc = SequenceDoneWriting(&seq);
2976                 if (rc == 0) {
2977                     (void)LOGMSG(klogInfo, "Updating sequence alignment info");
2978                     rc = SequenceUpdateAlignInfo(ctx, &seq);
2979                 }
2980             }
2981         }
2982     }
2983 
2984     if (*has_alignments && rc == 0 && (rc = Quitting()) == 0) {
2985         (void)LOGMSG(klogInfo, "Writing alignment spot ids");
2986         rc = AlignmentUpdateSpotInfo(ctx, align);
2987     }
2988     rc2 = AlignmentWhack(align, *has_alignments && rc == 0 && (rc = Quitting()) == 0);
2989     if (rc == 0)
2990         rc = rc2;
2991 
2992     rc2 = ReferenceWhack(&ref, *has_alignments && rc == 0 && (rc = Quitting()) == 0);
2993     if (rc == 0)
2994         rc = rc2;
2995 
2996     SequenceWhack(&seq, rc == 0);
2997 
2998     ContextRelease(ctx, continuing);
2999 
3000     if (rc == 0) {
3001         (void)LOGMSG(klogInfo, "Successfully loaded all files");
3002     }
3003     return rc;
3004 }
3005 
WriteLoaderSignature(KMetadata * meta,char const progName[])3006 rc_t WriteLoaderSignature(KMetadata *meta, char const progName[])
3007 {
3008     KMDataNode *node;
3009     rc_t rc = KMetadataOpenNodeUpdate(meta, &node, "/");
3010 
3011     if (rc == 0) {
3012         rc = KLoaderMeta_Write(node, progName, __DATE__, "BAM", KAppVersion());
3013         KMDataNodeRelease(node);
3014     }
3015     if (rc) {
3016         (void)LOGERR(klogErr, rc, "Cannot update loader meta");
3017     }
3018     return rc;
3019 }
3020 
OpenPath(char const path[],KDirectory ** dir)3021 rc_t OpenPath(char const path[], KDirectory **dir)
3022 {
3023     KDirectory *p;
3024     rc_t rc = KDirectoryNativeDir(&p);
3025 
3026     if (rc == 0) {
3027         rc = KDirectoryOpenDirUpdate(p, dir, false, "%s", path);
3028         KDirectoryRelease(p);
3029     }
3030     return rc;
3031 }
3032 
3033 static
ConvertDatabaseToUnmapped(VDatabase * db)3034 rc_t ConvertDatabaseToUnmapped(VDatabase *db)
3035 {
3036     VTable* tbl;
3037     rc_t rc = VDatabaseOpenTableUpdate(db, &tbl, "SEQUENCE");
3038     if (rc == 0)
3039     {
3040         VTableRenameColumn(tbl, false, "CMP_ALTREAD", "ALTREAD");
3041         VTableRenameColumn(tbl, false, "CMP_READ", "READ");
3042         VTableRenameColumn(tbl, false, "CMP_ALTCSREAD", "ALTCSREAD");
3043         VTableRenameColumn(tbl, false, "CMP_CSREAD", "CSREAD");
3044         rc = VTableRelease(tbl);
3045     }
3046     return rc;
3047 }
3048 
run(char const progName[],unsigned bamFiles,char const * bamFile[],unsigned seqFiles,char const * seqFile[],bool continuing)3049 rc_t run(char const progName[],
3050          unsigned bamFiles, char const *bamFile[],
3051          unsigned seqFiles, char const *seqFile[],
3052          bool continuing)
3053 {
3054     VDBManager *mgr;
3055     rc_t rc;
3056     rc_t rc2;
3057     char const *db_type = G.expectUnsorted ? "NCBI:align:db:alignment_unsorted" : "NCBI:align:db:alignment_sorted";
3058 
3059     rc = VDBManagerMakeUpdate(&mgr, NULL);
3060     if (rc) {
3061         (void)LOGERR (klogErr, rc, "failed to create VDB Manager!");
3062     }
3063     else {
3064         bool has_alignments = false;
3065 
3066         /* VDBManagerDisableFlushThread(mgr); */
3067         rc = VDBManagerDisablePagemapThread(mgr);
3068         if (rc == 0)
3069         {
3070             if (G.onlyVerifyReferences) {
3071                 rc = ArchiveBAM(mgr, NULL, bamFiles, bamFile, 0, NULL, &has_alignments, continuing);
3072             }
3073             else {
3074                 VSchema *schema;
3075 
3076                 rc = VDBManagerMakeSchema(mgr, &schema);
3077                 if (rc) {
3078                     (void)LOGERR (klogErr, rc, "failed to create schema");
3079                 }
3080                 else {
3081                     (void)(rc = VSchemaAddIncludePath(schema, "%s", G.schemaIncludePath));
3082                     rc = VSchemaParseFile(schema, "%s", G.schemaPath);
3083                     if (rc) {
3084                         (void)PLOGERR(klogErr, (klogErr, rc, "failed to parse schema file $(file)", "file=%s", G.schemaPath));
3085                     }
3086                     else {
3087                         VDatabase *db;
3088 
3089                         rc = VDBManagerCreateDB(mgr, &db, schema, db_type,
3090                                                 kcmInit + kcmMD5, "%s", G.outpath);
3091                         VSchemaRelease(schema);
3092                         if (rc == 0) {
3093                             rc = ArchiveBAM(mgr, db, bamFiles, bamFile, seqFiles, seqFile, &has_alignments, continuing);
3094                             if (rc == 0)
3095                                 PrintChangeReport();
3096                             if (rc == 0 && !has_alignments) {
3097                                 rc = ConvertDatabaseToUnmapped(db);
3098                             }
3099                             else if (rc == 0 && lmc != NULL) {
3100                                 VTable *tbl = NULL;
3101                                 KTable *ktbl = NULL;
3102                                 KMetadata *meta = NULL;
3103                                 KMDataNode *node = NULL;
3104 
3105                                 VDatabaseOpenTableUpdate(db, &tbl, "REFERENCE");
3106                                 VTableOpenKTableUpdate(tbl, &ktbl);
3107                                 VTableRelease(tbl);
3108 
3109                                 KTableOpenMetadataUpdate(ktbl, &meta);
3110                                 KTableRelease(ktbl);
3111 
3112                                 KMetadataOpenNodeUpdate(meta, &node, "LOW_MATCH_COUNT");
3113                                 KMetadataRelease(meta);
3114 
3115                                 RecordLowMatchCounts(node);
3116 
3117                                 KMDataNodeRelease(node);
3118 
3119                                 LowMatchCounterFree(lmc);
3120                                 lmc = NULL;
3121                             }
3122                             VDatabaseRelease(db);
3123 
3124                             if (rc == 0 && G.globalMode == mode_Remap && !continuing) {
3125                                 VTable *tbl = NULL;
3126 
3127                                 VDBManagerOpenDBUpdate(mgr, &db, NULL, G.firstOut);
3128                                 VDatabaseOpenTableUpdate(db, &tbl, "SEQUENCE");
3129                                 VDatabaseRelease(db);
3130                                 VTableDropColumn(tbl, "TMP_KEY_ID");
3131                                 VTableDropColumn(tbl, "READ");
3132                                 VTableDropColumn(tbl, "ALTREAD");
3133                                 VTableRelease(tbl);
3134                             }
3135 
3136                             if (rc == 0) {
3137                                 KMetadata *meta = NULL;
3138 
3139                                 {
3140                                     KDBManager *kmgr = NULL;
3141 
3142                                     rc = VDBManagerOpenKDBManagerUpdate(mgr, &kmgr);
3143                                     if (rc == 0) {
3144                                         KDatabase *kdb;
3145 
3146                                         rc = KDBManagerOpenDBUpdate(kmgr, &kdb, "%s", G.outpath);
3147                                         if (rc == 0) {
3148                                             rc = KDatabaseOpenMetadataUpdate(kdb, &meta);
3149                                             KDatabaseRelease(kdb);
3150                                         }
3151                                         KDBManagerRelease(kmgr);
3152                                     }
3153                                 }
3154                                 if (rc == 0) {
3155                                     rc = WriteLoaderSignature(meta, progName);
3156                                     if (rc == 0) {
3157                                         KMDataNode *changes = NULL;
3158 
3159                                         rc = KMetadataOpenNodeUpdate(meta, &changes, "CHANGES");
3160                                         if (rc == 0)
3161                                             RecordChanges(changes, "CHANGE");
3162                                         KMDataNodeRelease(changes);
3163                                     }
3164                                     KMetadataRelease(meta);
3165                                 }
3166                             }
3167                         }
3168                     }
3169                 }
3170             }
3171         }
3172         rc2 = VDBManagerRelease(mgr);
3173         if (rc2)
3174             (void)LOGERR(klogWarn, rc2, "Failed to release VDB Manager");
3175         if (rc == 0)
3176             rc = rc2;
3177     }
3178     return rc;
3179 }
3180