1 /*===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 
27 #include "spot-assembler.h"
28 
29 #include <sys/stat.h>
30 
31 #include <klib/status.h>
32 #include <klib/printf.h>
33 
34 #include <kfs/file.h>
35 #include <kfs/directory.h>
36 
37 #include <kdb/btree.h>
38 
39 #include <kapp/progressbar.h>
40 
41 #include "sequence-writer.h"
42 
43 #define MMA_ELEM_T ctx_value_t
44 #include "mmarray.c"
45 #undef MMA_ELEM_T
46 
SpotAssemblerGetCtxValue(SpotAssembler * self,rc_t * const prc,uint64_t const keyId)47 ctx_value_t * SpotAssemblerGetCtxValue(SpotAssembler * self, rc_t *const prc, uint64_t const keyId)
48 {
49     return MMArrayGet(self->id2value, prc, keyId);
50 }
51 
OpenKBTree(size_t cache_size,const char * tmpfs,uint64_t pid,struct KBTree ** const rslt,size_t const n,size_t const max)52 rc_t OpenKBTree(size_t cache_size, const char * tmpfs, uint64_t pid, struct KBTree **const rslt, size_t const n, size_t const max)
53 {
54     size_t const cacheSize = (((cache_size - (cache_size / 2) - (cache_size / 8)) / max)
55                             + 0xFFFFF) & ~((size_t)0xFFFFF);
56     KFile *file = NULL;
57     KDirectory *dir;
58     char fname[4096];
59     rc_t rc;
60 
61     rc = KDirectoryNativeDir(&dir);
62     if (rc)
63         return rc;
64 
65     rc = string_printf(fname, sizeof(fname), NULL, "%s/key2id.%u.%u", tmpfs, pid, n); if (rc) return rc;
66     STSMSG(1, ("Path for scratch files: %s\n", fname));
67     rc = KDirectoryCreateFile(dir, &file, true, 0600, kcmInit, "%s", fname);
68     KDirectoryRemove(dir, 0, "%s", fname);
69     KDirectoryRelease(dir);
70     if (rc == 0) {
71         rc = KBTreeMakeUpdate(rslt, file, cacheSize,
72                               false, kbtOpaqueKey,
73                               1, 255, sizeof ( uint32_t ),
74                               NULL
75                               );
76         KFileRelease(file);
77     }
78     return rc;
79 }
80 
GetKeyIDOld(SpotAssembler * const ctx,uint64_t * const rslt,bool * const wasInserted,char const key[],char const name[],size_t const namelen)81 rc_t GetKeyIDOld(SpotAssembler* const ctx, uint64_t *const rslt, bool *const wasInserted, char const key[], char const name[], size_t const namelen)
82 {
83     size_t const keylen = strlen(key);
84     rc_t rc;
85     uint64_t tmpKey;
86 
87     if (ctx->key2id_count == 0) {
88         rc = OpenKBTree(ctx->cache_size, ctx->tmpfs, ctx->pid, &ctx->key2id[0], 1, 1);
89         if (rc) return rc;
90         ctx->key2id_count = 1;
91     }
92     if (keylen == 0 || memcmp(key, name, keylen) == 0) {
93         /* qname starts with read group; no append */
94         tmpKey = ctx->idCount[0];
95         rc = KBTreeEntry(ctx->key2id[0], &tmpKey, wasInserted, name, namelen);
96     }
97     else {
98         char sbuf[4096];
99         char *buf = sbuf;
100         char *hbuf = NULL;
101         size_t bsize = sizeof(sbuf);
102         size_t actsize;
103 
104         if (keylen + namelen + 2 > bsize) {
105             hbuf = malloc(bsize = keylen + namelen + 2);
106             if (hbuf == NULL)
107                 return RC(rcExe, rcName, rcAllocating, rcMemory, rcExhausted);
108             buf = hbuf;
109         }
110         rc = string_printf(buf, bsize, &actsize, "%s\t%.*s", key, (int)namelen, name);
111 
112         tmpKey = ctx->idCount[0];
113         rc = KBTreeEntry(ctx->key2id[0], &tmpKey, wasInserted, buf, actsize);
114         if (hbuf)
115             free(hbuf);
116     }
117     if (rc == 0) {
118         *rslt = tmpKey;
119         if (*wasInserted)
120             ++ctx->idCount[0];
121     }
122 
123     return rc;
124 }
125 
HashValue(unsigned const len,unsigned char const value[])126 static unsigned HashValue(unsigned const len, unsigned char const value[])
127 {
128     /* FNV-1a hash with folding */
129     unsigned long long h = 0xcbf29ce484222325ull;
130     unsigned i;
131 
132     for (i = 0; i < len; ++i) {
133         int const octet = value[i];
134         h = (h ^ octet) * 0x100000001b3ull;
135     }
136     return (unsigned)(h ^ (h >> 32));
137 }
138 
HashKey(void const * const key,size_t const keylen)139 static unsigned HashKey(void const *const key, size_t const keylen)
140 {
141     return HashValue(keylen, key) % NUM_ID_SPACES;
142 }
143 
SeqHashKey(void const * const key,size_t const keylen)144 unsigned SeqHashKey(void const *const key, size_t const keylen)
145 {
146     return HashValue(keylen, key) % 0x10000;
147 }
148 
149 #define USE_ILLUMINA_NAMING_POUND_NUMBER_SLASH_HACK 1
150 
GetFixedNameLength(char const name[],size_t const namelen)151 static size_t GetFixedNameLength(char const name[], size_t const namelen)
152 {
153 #if USE_ILLUMINA_NAMING_POUND_NUMBER_SLASH_HACK
154     char const *const pound = string_chr(name, namelen, '#');
155 
156     if (pound && pound + 2u < name + namelen && pound[1] >= '0' && pound[1] <= '9' && pound[2] == '/') {
157         return (size_t)(pound - name) + 2u;
158     }
159 #endif
160     return namelen;
161 }
162 
SpotAssemblerGetKeyID(SpotAssembler * const ctx,uint64_t * const rslt,bool * const wasInserted,char const key[],char const name[],size_t const o_namelen)163 rc_t SpotAssemblerGetKeyID(SpotAssembler *const ctx,
164                            uint64_t *const rslt,
165                            bool *const wasInserted,
166                            char const key[],
167                            char const name[],
168                            size_t const o_namelen)
169 {
170     rc_t rc;
171     size_t const namelen = GetFixedNameLength(name, o_namelen);
172 
173     if (ctx->key2id_max == 1)
174     {
175         rc = GetKeyIDOld(ctx, rslt, wasInserted, key, name, namelen);
176     }
177     else
178     {
179         size_t const keylen = strlen(key);
180         unsigned const h = HashKey(key, keylen);
181         size_t f;
182         size_t e = ctx->key2id_count;
183         uint64_t tmpKey;
184 
185         *rslt = 0;
186         {{
187             uint32_t const bucket_value = ctx->key2id_hash[h];
188             unsigned const n  = (uint8_t) bucket_value;
189             unsigned const i1 = (uint8_t)(bucket_value >>  8);
190             unsigned const i2 = (uint8_t)(bucket_value >> 16);
191             unsigned const i3 = (uint8_t)(bucket_value >> 24);
192 
193             if (n > 0 && strcmp(key, ctx->key2id_names + ctx->key2id_name[i1]) == 0) {
194                 f = i1;
195                 /*
196                 ctx->key2id_hash[h] = (i3 << 24) | (i2 << 16) | (i1 << 8) | n;
197                  */
198                 goto GET_ID;
199             }
200             if (n > 1 && strcmp(key, ctx->key2id_names + ctx->key2id_name[i2]) == 0) {
201                 f = i2;
202                 ctx->key2id_hash[h] = (i3 << 24) | (i1 << 16) | (i2 << 8) | n;
203                 goto GET_ID;
204             }
205             if (n > 2 && strcmp(key, ctx->key2id_names + ctx->key2id_name[i3]) == 0) {
206                 f = i3;
207                 ctx->key2id_hash[h] = (i2 << 24) | (i1 << 16) | (i3 << 8) | n;
208                 goto GET_ID;
209             }
210         }}
211         f = 0;
212         while (f < e) {
213             size_t const m = (f + e) / 2;
214             size_t const oid = ctx->key2id_oid[m];
215             int const diff = strcmp(key, ctx->key2id_names + ctx->key2id_name[oid]);
216 
217             if (diff < 0)
218                 e = m;
219             else if (diff > 0)
220                 f = m + 1;
221             else {
222                 f = oid;
223                 goto GET_ID;
224             }
225         }
226         if (ctx->key2id_count < ctx->key2id_max) {
227             size_t const name_max = ctx->key2id_name_max + keylen + 1;
228             KBTree *tree;
229             rc = OpenKBTree(ctx->cache_size, ctx->tmpfs, ctx->pid, &tree, ctx->key2id_count + 1, 1); /* ctx->key2id_max); */
230 
231             if (rc) return rc;
232 
233             if (ctx->key2id_name_alloc < name_max) {
234                 size_t alloc = ctx->key2id_name_alloc;
235                 void *tmp;
236 
237                 if (alloc == 0)
238                     alloc = 4096;
239                 while (alloc < name_max)
240                     alloc <<= 1;
241                 tmp = realloc(ctx->key2id_names, alloc);
242                 if (tmp == NULL)
243                     return RC(rcExe, rcName, rcAllocating, rcMemory, rcExhausted);
244                 ctx->key2id_names = tmp;
245                 ctx->key2id_name_alloc = alloc;
246             }
247             if (f < ctx->key2id_count) {
248                 memmove(&ctx->key2id_oid[f + 1], &ctx->key2id_oid[f], (ctx->key2id_count - f) * sizeof(ctx->key2id_oid[f]));
249             }
250             ctx->key2id_oid[f] = ctx->key2id_count;
251             ++ctx->key2id_count;
252             f = ctx->key2id_oid[f];
253             ctx->key2id_name[f] = ctx->key2id_name_max;
254             ctx->key2id_name_max = name_max;
255 
256             memmove(&ctx->key2id_names[ctx->key2id_name[f]], key, keylen + 1);
257             ctx->key2id[f] = tree;
258             ctx->idCount[f] = 0;
259             if ((uint8_t)ctx->key2id_hash[h] < 3) {
260                 unsigned const n = (uint8_t)ctx->key2id_hash[h] + 1;
261 
262                 ctx->key2id_hash[h] = (uint32_t)((((ctx->key2id_hash[h] & ~(0xFFu)) | f) << 8) | n);
263             }
264             else {
265                 /* the hash function isn't working too well
266                  * keep the 3 mru
267                  */
268                 ctx->key2id_hash[h] = (uint32_t)((((ctx->key2id_hash[h] & ~(0xFFu)) | f) << 8) | 3);
269             }
270         GET_ID:
271             tmpKey = ctx->idCount[f];
272             rc = KBTreeEntry(ctx->key2id[f], &tmpKey, wasInserted, name, namelen);
273             if (rc == 0) {
274                 *rslt = (((uint64_t)f) << 32) | tmpKey;
275                 if (*wasInserted)
276                 {
277                     ++ctx->idCount[f];
278                 }
279                 assert(tmpKey < ctx->idCount[f]);
280             }
281         }
282         else
283         {
284             rc = RC(rcExe, rcTree, rcAllocating, rcConstraint, rcViolated);
285         }
286     }
287 
288     if ( rc == 0 && *wasInserted )
289     {   /* save the read name, to be used when the mate shows up, or in SpotAssemblerWriteSoloFragments() */
290         rc = Id2Name_Add ( & ctx->id2name, *rslt, name );
291     }
292 
293     return rc;
294 }
295 
openTempFile(char const path[])296 static int openTempFile(char const path[])
297 {
298     int const fd = open(path, O_RDWR|O_TRUNC|O_CREAT, S_IRUSR|S_IWUSR);
299     unlink(path);
300     return fd;
301 }
302 
OpenMMapFile(SpotAssembler * const ctx,KDirectory * const dir)303 static rc_t OpenMMapFile(SpotAssembler *const ctx, KDirectory *const dir)
304 {
305     char fname[4096];
306     rc_t rc = string_printf(fname, sizeof(fname), NULL, "%s/id2value.%u", ctx->tmpfs, ctx->pid);
307 
308     if (rc == 0) {
309         int const fd = openTempFile(fname);
310         if (fd >= 0)
311             ctx->id2value = MMArrayMake(&rc, fd);
312         else
313             rc = RC(rcExe, rcFile, rcCreating, rcFile, rcNotFound);
314     }
315     return rc;
316 }
317 
OpenMBankFile(SpotAssembler * const ctx,KDirectory * const dir)318 static rc_t OpenMBankFile(SpotAssembler *const ctx, KDirectory *const dir)
319 {
320     char fname[4096];
321     rc_t rc = string_printf(fname, sizeof(fname), NULL, "%s/fragments.%u", ctx->tmpfs, ctx->pid);
322 
323     if (rc == 0) {
324         int const fd = openTempFile(fname);
325         if (fd < 0)
326             rc = RC(rcExe, rcFile, rcCreating, rcFile, rcNotFound);
327         else
328             ctx->fragmentFd = fd;
329     }
330     return rc;
331 }
332 
SpotAssemblerMake(SpotAssembler ** p_self,size_t cache_size,const char * tmpfs,uint64_t pid)333 rc_t SpotAssemblerMake(SpotAssembler **p_self, size_t cache_size, const char * tmpfs, uint64_t pid)
334 {
335     rc_t rc = 0;
336 
337     assert ( p_self != NULL );
338 
339     SpotAssembler * self = calloc ( 1, sizeof ( * self ) );
340     if ( self == NULL )
341     {
342         return RC ( rcExe, rcName, rcAllocating, rcMemory, rcExhausted );
343     }
344 
345     self -> fragment = calloc ( FRAGMENT_HOT_COUNT, sizeof ( self -> fragment [ 0 ] ) );
346     if ( self -> fragment == NULL )
347     {
348         free(self);
349         return RC ( rcExe, rcName, rcAllocating, rcMemory, rcExhausted );
350     }
351 
352     self -> cache_size = cache_size;
353     self -> tmpfs = tmpfs;
354     self -> pid = pid;
355     self -> key2id_max = 1; /* make sure to use GetKeyIDOld() */
356 
357     STSMSG(1, ("Cache size: %uM\n", cache_size / 1024 / 1024));
358 
359     {
360         KDirectory *dir;
361         rc = KDirectoryNativeDir(&dir);
362         if (rc == 0)
363             rc = OpenMMapFile(self, dir);
364         if (rc == 0)
365             rc = OpenMBankFile(self, dir);
366 
367         KDirectoryRelease(dir);
368     }
369 
370     if ( rc == 0 )
371     {
372         rc = Id2Name_Init ( & self -> id2name );
373     }
374 
375     if ( rc == 0 )
376     {
377         * p_self = self;
378     }
379     else
380     {
381         SpotAssemblerRelease ( self );
382     }
383 
384     return rc;
385 }
386 
SpotAssemblerReleaseMemBank(SpotAssembler * self)387 void SpotAssemblerReleaseMemBank(SpotAssembler *self)
388 {
389     int i;
390 
391     for (i = 0; i < FRAGMENT_HOT_COUNT; ++i)
392         free(self->fragment[i].data);
393 
394     close(self->fragmentFd);
395 }
396 
SpotAssemblerRelease(SpotAssembler * self)397 void SpotAssemblerRelease(SpotAssembler * self)
398 {
399     size_t i;
400     for (i = 0; i != self->key2id_count; ++i)
401     {
402         KBTreeDropBacking ( self->key2id[i] );
403         KBTreeRelease ( self->key2id[i]) ;
404         self->key2id[i] = NULL;
405     }
406     free ( self->key2id_names );
407     self->key2id_names = NULL;
408 
409     MMArrayWhack ( self->id2value );
410     Id2Name_Whack ( & self->id2name );
411 
412     free(self->fragment);
413     free(self);
414 }
415 
SpotAssemblerWriteSoloFragments(SpotAssembler * ctx,bool isColorSpace,INSDC_SRA_platform_id platform,bool keepMismatchQual,bool no_real_output,bool hasTI,const char * QualQuantizer,bool dropReadnames,SequenceWriter * seq,const struct KLoadProgressbar * progress)416 rc_t SpotAssemblerWriteSoloFragments(SpotAssembler* ctx,
417                                      bool isColorSpace,
418                                      INSDC_SRA_platform_id platform,
419                                      bool keepMismatchQual,
420                                      bool no_real_output,
421                                      bool hasTI,
422                                      const char * QualQuantizer,
423                                      bool dropReadnames,
424                                      SequenceWriter *seq,
425                                      const struct KLoadProgressbar *progress)
426 {
427     uint32_t i;
428     unsigned j;
429     uint64_t idCount = 0;
430     rc_t rc;
431     KDataBuffer fragBuf;
432     SequenceRecord srec;
433 
434     memset(&srec, 0, sizeof(srec));
435 
436     rc = KDataBufferMake(&fragBuf, 8, 0);
437     if (rc) {
438         (void)LOGERR(klogErr, rc, "KDataBufferMake failed");
439         return rc;
440     }
441     for (idCount = 0, j = 0; j < ctx->key2id_count; ++j) {
442         idCount += ctx->idCount[j];
443     }
444     KLoadProgressbar_Append(progress, idCount);
445 
446     for (idCount = 0, j = 0; j < ctx->key2id_count; ++j) {
447         for (i = 0; i != ctx->idCount[j]; ++i, ++idCount) {
448             uint64_t const keyId = ((uint64_t)j << 32) | i;
449             ctx_value_t *value;
450             unsigned readLen[2];
451             unsigned read = 0;
452             FragmentInfo const *fip;
453             uint8_t const *src;
454 
455             value = MMArrayGet(ctx->id2value, &rc, keyId);
456             if (value == NULL)
457                 break;
458             KLoadProgressbar_Process(progress, 1, false);
459 
460             if (value->written)
461                 continue;
462 
463             assert(!value->unmated);
464             if (ctx->fragment[keyId % FRAGMENT_HOT_COUNT].id == keyId) {
465                 fip = (FragmentInfo const *)ctx->fragment[keyId % FRAGMENT_HOT_COUNT].data;
466             }
467             else {
468                 rc = KDataBufferResize(&fragBuf, (size_t)value->fragmentSize);
469                 if (rc == 0) {
470                     int64_t const nread = pread(ctx->fragmentFd, fragBuf.base, value->fragmentSize, value->fragmentOffset);
471                     if (nread == value->fragmentSize)
472                         fip = (FragmentInfo const *)fragBuf.base;
473                     else {
474                         (void)LOGERR(klogErr, rc = RC(rcExe, rcFile, rcReading, rcData, rcNotFound), "KMemBankRead failed");
475                         break;
476                     }
477                 }
478                 else {
479                     (void)LOGERR(klogErr, rc, "KDataBufferResize failed");
480                     break;
481                 }
482             }
483             src = (uint8_t const *)&fip[1];
484 
485             readLen[0] = readLen[1] = 0;
486             read = fip->otherReadNo - 1;
487 
488             readLen[read] = fip->readlen;
489             rc = SequenceRecordInit(&srec, 2, readLen);
490             if (rc) {
491                 (void)LOGERR(klogErr, rc, "SequenceRecordInit failed");
492                 break;
493             }
494 
495             srec.is_bad[read] = fip->is_bad;
496             srec.orientation[read] = fip->orientation;
497             srec.cskey[read] = fip->cskey;
498             memmove(srec.seq + srec.readStart[read], src, srec.readLen[read]);
499             src += fip->readlen;
500 
501             memmove(srec.qual + srec.readStart[read], src, srec.readLen[read]);
502             src += fip->readlen;
503             srec.spotGroup = (char *)src;
504             srec.spotGroupLen = fip->sglen;
505             srec.keyId = keyId;
506 
507             rc = Id2Name_Get ( & ctx->id2name, keyId, (const char**) & srec.spotName );
508             if (rc)
509             {
510                 (void)LOGERR(klogErr, rc, "Is2Name_Get failed");
511                 break;
512             }
513             srec.spotNameLen = strlen(srec.spotName);
514 
515             rc = SequenceWriteRecord(seq, &srec, isColorSpace, false, platform, keepMismatchQual, no_real_output, hasTI, QualQuantizer, dropReadnames);
516             if (rc) {
517                 (void)LOGERR(klogErr, rc, "SequenceWriteRecord failed");
518                 break;
519             }
520             /*rc = KMemBankFree(frags, id);*/
521             CTX_VALUE_SET_S_ID(*value, ++ctx->spotId);
522             value->written = 1;
523         }
524     }
525     /*printf("DONE_SOLO:\tcnt2=%d\tcnt1=%d\n",fcountBoth,fcountOne);*/
526     KDataBufferWhack(&fragBuf);
527     KDataBufferWhack(&srec.storage);
528     return rc;
529 }
530 
531 FragmentEntry *
SpotAssemblerGetFragmentEntry(SpotAssembler * self,uint64_t keyId)532 SpotAssemblerGetFragmentEntry(SpotAssembler * self, uint64_t keyId)
533 {
534     return & self -> fragment [ keyId % FRAGMENT_HOT_COUNT ];
535 }
536