1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 #include "spot-assembler.h"
28
29 #include <sys/stat.h>
30
31 #include <klib/status.h>
32 #include <klib/printf.h>
33
34 #include <kfs/file.h>
35 #include <kfs/directory.h>
36
37 #include <kdb/btree.h>
38
39 #include <kapp/progressbar.h>
40
41 #include "sequence-writer.h"
42
43 #define MMA_ELEM_T ctx_value_t
44 #include "mmarray.c"
45 #undef MMA_ELEM_T
46
SpotAssemblerGetCtxValue(SpotAssembler * self,rc_t * const prc,uint64_t const keyId)47 ctx_value_t * SpotAssemblerGetCtxValue(SpotAssembler * self, rc_t *const prc, uint64_t const keyId)
48 {
49 return MMArrayGet(self->id2value, prc, keyId);
50 }
51
OpenKBTree(size_t cache_size,const char * tmpfs,uint64_t pid,struct KBTree ** const rslt,size_t const n,size_t const max)52 rc_t OpenKBTree(size_t cache_size, const char * tmpfs, uint64_t pid, struct KBTree **const rslt, size_t const n, size_t const max)
53 {
54 size_t const cacheSize = (((cache_size - (cache_size / 2) - (cache_size / 8)) / max)
55 + 0xFFFFF) & ~((size_t)0xFFFFF);
56 KFile *file = NULL;
57 KDirectory *dir;
58 char fname[4096];
59 rc_t rc;
60
61 rc = KDirectoryNativeDir(&dir);
62 if (rc)
63 return rc;
64
65 rc = string_printf(fname, sizeof(fname), NULL, "%s/key2id.%u.%u", tmpfs, pid, n); if (rc) return rc;
66 STSMSG(1, ("Path for scratch files: %s\n", fname));
67 rc = KDirectoryCreateFile(dir, &file, true, 0600, kcmInit, "%s", fname);
68 KDirectoryRemove(dir, 0, "%s", fname);
69 KDirectoryRelease(dir);
70 if (rc == 0) {
71 rc = KBTreeMakeUpdate(rslt, file, cacheSize,
72 false, kbtOpaqueKey,
73 1, 255, sizeof ( uint32_t ),
74 NULL
75 );
76 KFileRelease(file);
77 }
78 return rc;
79 }
80
GetKeyIDOld(SpotAssembler * const ctx,uint64_t * const rslt,bool * const wasInserted,char const key[],char const name[],size_t const namelen)81 rc_t GetKeyIDOld(SpotAssembler* const ctx, uint64_t *const rslt, bool *const wasInserted, char const key[], char const name[], size_t const namelen)
82 {
83 size_t const keylen = strlen(key);
84 rc_t rc;
85 uint64_t tmpKey;
86
87 if (ctx->key2id_count == 0) {
88 rc = OpenKBTree(ctx->cache_size, ctx->tmpfs, ctx->pid, &ctx->key2id[0], 1, 1);
89 if (rc) return rc;
90 ctx->key2id_count = 1;
91 }
92 if (keylen == 0 || memcmp(key, name, keylen) == 0) {
93 /* qname starts with read group; no append */
94 tmpKey = ctx->idCount[0];
95 rc = KBTreeEntry(ctx->key2id[0], &tmpKey, wasInserted, name, namelen);
96 }
97 else {
98 char sbuf[4096];
99 char *buf = sbuf;
100 char *hbuf = NULL;
101 size_t bsize = sizeof(sbuf);
102 size_t actsize;
103
104 if (keylen + namelen + 2 > bsize) {
105 hbuf = malloc(bsize = keylen + namelen + 2);
106 if (hbuf == NULL)
107 return RC(rcExe, rcName, rcAllocating, rcMemory, rcExhausted);
108 buf = hbuf;
109 }
110 rc = string_printf(buf, bsize, &actsize, "%s\t%.*s", key, (int)namelen, name);
111
112 tmpKey = ctx->idCount[0];
113 rc = KBTreeEntry(ctx->key2id[0], &tmpKey, wasInserted, buf, actsize);
114 if (hbuf)
115 free(hbuf);
116 }
117 if (rc == 0) {
118 *rslt = tmpKey;
119 if (*wasInserted)
120 ++ctx->idCount[0];
121 }
122
123 return rc;
124 }
125
HashValue(unsigned const len,unsigned char const value[])126 static unsigned HashValue(unsigned const len, unsigned char const value[])
127 {
128 /* FNV-1a hash with folding */
129 unsigned long long h = 0xcbf29ce484222325ull;
130 unsigned i;
131
132 for (i = 0; i < len; ++i) {
133 int const octet = value[i];
134 h = (h ^ octet) * 0x100000001b3ull;
135 }
136 return (unsigned)(h ^ (h >> 32));
137 }
138
HashKey(void const * const key,size_t const keylen)139 static unsigned HashKey(void const *const key, size_t const keylen)
140 {
141 return HashValue(keylen, key) % NUM_ID_SPACES;
142 }
143
SeqHashKey(void const * const key,size_t const keylen)144 unsigned SeqHashKey(void const *const key, size_t const keylen)
145 {
146 return HashValue(keylen, key) % 0x10000;
147 }
148
149 #define USE_ILLUMINA_NAMING_POUND_NUMBER_SLASH_HACK 1
150
GetFixedNameLength(char const name[],size_t const namelen)151 static size_t GetFixedNameLength(char const name[], size_t const namelen)
152 {
153 #if USE_ILLUMINA_NAMING_POUND_NUMBER_SLASH_HACK
154 char const *const pound = string_chr(name, namelen, '#');
155
156 if (pound && pound + 2u < name + namelen && pound[1] >= '0' && pound[1] <= '9' && pound[2] == '/') {
157 return (size_t)(pound - name) + 2u;
158 }
159 #endif
160 return namelen;
161 }
162
SpotAssemblerGetKeyID(SpotAssembler * const ctx,uint64_t * const rslt,bool * const wasInserted,char const key[],char const name[],size_t const o_namelen)163 rc_t SpotAssemblerGetKeyID(SpotAssembler *const ctx,
164 uint64_t *const rslt,
165 bool *const wasInserted,
166 char const key[],
167 char const name[],
168 size_t const o_namelen)
169 {
170 rc_t rc;
171 size_t const namelen = GetFixedNameLength(name, o_namelen);
172
173 if (ctx->key2id_max == 1)
174 {
175 rc = GetKeyIDOld(ctx, rslt, wasInserted, key, name, namelen);
176 }
177 else
178 {
179 size_t const keylen = strlen(key);
180 unsigned const h = HashKey(key, keylen);
181 size_t f;
182 size_t e = ctx->key2id_count;
183 uint64_t tmpKey;
184
185 *rslt = 0;
186 {{
187 uint32_t const bucket_value = ctx->key2id_hash[h];
188 unsigned const n = (uint8_t) bucket_value;
189 unsigned const i1 = (uint8_t)(bucket_value >> 8);
190 unsigned const i2 = (uint8_t)(bucket_value >> 16);
191 unsigned const i3 = (uint8_t)(bucket_value >> 24);
192
193 if (n > 0 && strcmp(key, ctx->key2id_names + ctx->key2id_name[i1]) == 0) {
194 f = i1;
195 /*
196 ctx->key2id_hash[h] = (i3 << 24) | (i2 << 16) | (i1 << 8) | n;
197 */
198 goto GET_ID;
199 }
200 if (n > 1 && strcmp(key, ctx->key2id_names + ctx->key2id_name[i2]) == 0) {
201 f = i2;
202 ctx->key2id_hash[h] = (i3 << 24) | (i1 << 16) | (i2 << 8) | n;
203 goto GET_ID;
204 }
205 if (n > 2 && strcmp(key, ctx->key2id_names + ctx->key2id_name[i3]) == 0) {
206 f = i3;
207 ctx->key2id_hash[h] = (i2 << 24) | (i1 << 16) | (i3 << 8) | n;
208 goto GET_ID;
209 }
210 }}
211 f = 0;
212 while (f < e) {
213 size_t const m = (f + e) / 2;
214 size_t const oid = ctx->key2id_oid[m];
215 int const diff = strcmp(key, ctx->key2id_names + ctx->key2id_name[oid]);
216
217 if (diff < 0)
218 e = m;
219 else if (diff > 0)
220 f = m + 1;
221 else {
222 f = oid;
223 goto GET_ID;
224 }
225 }
226 if (ctx->key2id_count < ctx->key2id_max) {
227 size_t const name_max = ctx->key2id_name_max + keylen + 1;
228 KBTree *tree;
229 rc = OpenKBTree(ctx->cache_size, ctx->tmpfs, ctx->pid, &tree, ctx->key2id_count + 1, 1); /* ctx->key2id_max); */
230
231 if (rc) return rc;
232
233 if (ctx->key2id_name_alloc < name_max) {
234 size_t alloc = ctx->key2id_name_alloc;
235 void *tmp;
236
237 if (alloc == 0)
238 alloc = 4096;
239 while (alloc < name_max)
240 alloc <<= 1;
241 tmp = realloc(ctx->key2id_names, alloc);
242 if (tmp == NULL)
243 return RC(rcExe, rcName, rcAllocating, rcMemory, rcExhausted);
244 ctx->key2id_names = tmp;
245 ctx->key2id_name_alloc = alloc;
246 }
247 if (f < ctx->key2id_count) {
248 memmove(&ctx->key2id_oid[f + 1], &ctx->key2id_oid[f], (ctx->key2id_count - f) * sizeof(ctx->key2id_oid[f]));
249 }
250 ctx->key2id_oid[f] = ctx->key2id_count;
251 ++ctx->key2id_count;
252 f = ctx->key2id_oid[f];
253 ctx->key2id_name[f] = ctx->key2id_name_max;
254 ctx->key2id_name_max = name_max;
255
256 memmove(&ctx->key2id_names[ctx->key2id_name[f]], key, keylen + 1);
257 ctx->key2id[f] = tree;
258 ctx->idCount[f] = 0;
259 if ((uint8_t)ctx->key2id_hash[h] < 3) {
260 unsigned const n = (uint8_t)ctx->key2id_hash[h] + 1;
261
262 ctx->key2id_hash[h] = (uint32_t)((((ctx->key2id_hash[h] & ~(0xFFu)) | f) << 8) | n);
263 }
264 else {
265 /* the hash function isn't working too well
266 * keep the 3 mru
267 */
268 ctx->key2id_hash[h] = (uint32_t)((((ctx->key2id_hash[h] & ~(0xFFu)) | f) << 8) | 3);
269 }
270 GET_ID:
271 tmpKey = ctx->idCount[f];
272 rc = KBTreeEntry(ctx->key2id[f], &tmpKey, wasInserted, name, namelen);
273 if (rc == 0) {
274 *rslt = (((uint64_t)f) << 32) | tmpKey;
275 if (*wasInserted)
276 {
277 ++ctx->idCount[f];
278 }
279 assert(tmpKey < ctx->idCount[f]);
280 }
281 }
282 else
283 {
284 rc = RC(rcExe, rcTree, rcAllocating, rcConstraint, rcViolated);
285 }
286 }
287
288 if ( rc == 0 && *wasInserted )
289 { /* save the read name, to be used when the mate shows up, or in SpotAssemblerWriteSoloFragments() */
290 rc = Id2Name_Add ( & ctx->id2name, *rslt, name );
291 }
292
293 return rc;
294 }
295
openTempFile(char const path[])296 static int openTempFile(char const path[])
297 {
298 int const fd = open(path, O_RDWR|O_TRUNC|O_CREAT, S_IRUSR|S_IWUSR);
299 unlink(path);
300 return fd;
301 }
302
OpenMMapFile(SpotAssembler * const ctx,KDirectory * const dir)303 static rc_t OpenMMapFile(SpotAssembler *const ctx, KDirectory *const dir)
304 {
305 char fname[4096];
306 rc_t rc = string_printf(fname, sizeof(fname), NULL, "%s/id2value.%u", ctx->tmpfs, ctx->pid);
307
308 if (rc == 0) {
309 int const fd = openTempFile(fname);
310 if (fd >= 0)
311 ctx->id2value = MMArrayMake(&rc, fd);
312 else
313 rc = RC(rcExe, rcFile, rcCreating, rcFile, rcNotFound);
314 }
315 return rc;
316 }
317
OpenMBankFile(SpotAssembler * const ctx,KDirectory * const dir)318 static rc_t OpenMBankFile(SpotAssembler *const ctx, KDirectory *const dir)
319 {
320 char fname[4096];
321 rc_t rc = string_printf(fname, sizeof(fname), NULL, "%s/fragments.%u", ctx->tmpfs, ctx->pid);
322
323 if (rc == 0) {
324 int const fd = openTempFile(fname);
325 if (fd < 0)
326 rc = RC(rcExe, rcFile, rcCreating, rcFile, rcNotFound);
327 else
328 ctx->fragmentFd = fd;
329 }
330 return rc;
331 }
332
SpotAssemblerMake(SpotAssembler ** p_self,size_t cache_size,const char * tmpfs,uint64_t pid)333 rc_t SpotAssemblerMake(SpotAssembler **p_self, size_t cache_size, const char * tmpfs, uint64_t pid)
334 {
335 rc_t rc = 0;
336
337 assert ( p_self != NULL );
338
339 SpotAssembler * self = calloc ( 1, sizeof ( * self ) );
340 if ( self == NULL )
341 {
342 return RC ( rcExe, rcName, rcAllocating, rcMemory, rcExhausted );
343 }
344
345 self -> fragment = calloc ( FRAGMENT_HOT_COUNT, sizeof ( self -> fragment [ 0 ] ) );
346 if ( self -> fragment == NULL )
347 {
348 free(self);
349 return RC ( rcExe, rcName, rcAllocating, rcMemory, rcExhausted );
350 }
351
352 self -> cache_size = cache_size;
353 self -> tmpfs = tmpfs;
354 self -> pid = pid;
355 self -> key2id_max = 1; /* make sure to use GetKeyIDOld() */
356
357 STSMSG(1, ("Cache size: %uM\n", cache_size / 1024 / 1024));
358
359 {
360 KDirectory *dir;
361 rc = KDirectoryNativeDir(&dir);
362 if (rc == 0)
363 rc = OpenMMapFile(self, dir);
364 if (rc == 0)
365 rc = OpenMBankFile(self, dir);
366
367 KDirectoryRelease(dir);
368 }
369
370 if ( rc == 0 )
371 {
372 rc = Id2Name_Init ( & self -> id2name );
373 }
374
375 if ( rc == 0 )
376 {
377 * p_self = self;
378 }
379 else
380 {
381 SpotAssemblerRelease ( self );
382 }
383
384 return rc;
385 }
386
SpotAssemblerReleaseMemBank(SpotAssembler * self)387 void SpotAssemblerReleaseMemBank(SpotAssembler *self)
388 {
389 int i;
390
391 for (i = 0; i < FRAGMENT_HOT_COUNT; ++i)
392 free(self->fragment[i].data);
393
394 close(self->fragmentFd);
395 }
396
SpotAssemblerRelease(SpotAssembler * self)397 void SpotAssemblerRelease(SpotAssembler * self)
398 {
399 size_t i;
400 for (i = 0; i != self->key2id_count; ++i)
401 {
402 KBTreeDropBacking ( self->key2id[i] );
403 KBTreeRelease ( self->key2id[i]) ;
404 self->key2id[i] = NULL;
405 }
406 free ( self->key2id_names );
407 self->key2id_names = NULL;
408
409 MMArrayWhack ( self->id2value );
410 Id2Name_Whack ( & self->id2name );
411
412 free(self->fragment);
413 free(self);
414 }
415
SpotAssemblerWriteSoloFragments(SpotAssembler * ctx,bool isColorSpace,INSDC_SRA_platform_id platform,bool keepMismatchQual,bool no_real_output,bool hasTI,const char * QualQuantizer,bool dropReadnames,SequenceWriter * seq,const struct KLoadProgressbar * progress)416 rc_t SpotAssemblerWriteSoloFragments(SpotAssembler* ctx,
417 bool isColorSpace,
418 INSDC_SRA_platform_id platform,
419 bool keepMismatchQual,
420 bool no_real_output,
421 bool hasTI,
422 const char * QualQuantizer,
423 bool dropReadnames,
424 SequenceWriter *seq,
425 const struct KLoadProgressbar *progress)
426 {
427 uint32_t i;
428 unsigned j;
429 uint64_t idCount = 0;
430 rc_t rc;
431 KDataBuffer fragBuf;
432 SequenceRecord srec;
433
434 memset(&srec, 0, sizeof(srec));
435
436 rc = KDataBufferMake(&fragBuf, 8, 0);
437 if (rc) {
438 (void)LOGERR(klogErr, rc, "KDataBufferMake failed");
439 return rc;
440 }
441 for (idCount = 0, j = 0; j < ctx->key2id_count; ++j) {
442 idCount += ctx->idCount[j];
443 }
444 KLoadProgressbar_Append(progress, idCount);
445
446 for (idCount = 0, j = 0; j < ctx->key2id_count; ++j) {
447 for (i = 0; i != ctx->idCount[j]; ++i, ++idCount) {
448 uint64_t const keyId = ((uint64_t)j << 32) | i;
449 ctx_value_t *value;
450 unsigned readLen[2];
451 unsigned read = 0;
452 FragmentInfo const *fip;
453 uint8_t const *src;
454
455 value = MMArrayGet(ctx->id2value, &rc, keyId);
456 if (value == NULL)
457 break;
458 KLoadProgressbar_Process(progress, 1, false);
459
460 if (value->written)
461 continue;
462
463 assert(!value->unmated);
464 if (ctx->fragment[keyId % FRAGMENT_HOT_COUNT].id == keyId) {
465 fip = (FragmentInfo const *)ctx->fragment[keyId % FRAGMENT_HOT_COUNT].data;
466 }
467 else {
468 rc = KDataBufferResize(&fragBuf, (size_t)value->fragmentSize);
469 if (rc == 0) {
470 int64_t const nread = pread(ctx->fragmentFd, fragBuf.base, value->fragmentSize, value->fragmentOffset);
471 if (nread == value->fragmentSize)
472 fip = (FragmentInfo const *)fragBuf.base;
473 else {
474 (void)LOGERR(klogErr, rc = RC(rcExe, rcFile, rcReading, rcData, rcNotFound), "KMemBankRead failed");
475 break;
476 }
477 }
478 else {
479 (void)LOGERR(klogErr, rc, "KDataBufferResize failed");
480 break;
481 }
482 }
483 src = (uint8_t const *)&fip[1];
484
485 readLen[0] = readLen[1] = 0;
486 read = fip->otherReadNo - 1;
487
488 readLen[read] = fip->readlen;
489 rc = SequenceRecordInit(&srec, 2, readLen);
490 if (rc) {
491 (void)LOGERR(klogErr, rc, "SequenceRecordInit failed");
492 break;
493 }
494
495 srec.is_bad[read] = fip->is_bad;
496 srec.orientation[read] = fip->orientation;
497 srec.cskey[read] = fip->cskey;
498 memmove(srec.seq + srec.readStart[read], src, srec.readLen[read]);
499 src += fip->readlen;
500
501 memmove(srec.qual + srec.readStart[read], src, srec.readLen[read]);
502 src += fip->readlen;
503 srec.spotGroup = (char *)src;
504 srec.spotGroupLen = fip->sglen;
505 srec.keyId = keyId;
506
507 rc = Id2Name_Get ( & ctx->id2name, keyId, (const char**) & srec.spotName );
508 if (rc)
509 {
510 (void)LOGERR(klogErr, rc, "Is2Name_Get failed");
511 break;
512 }
513 srec.spotNameLen = strlen(srec.spotName);
514
515 rc = SequenceWriteRecord(seq, &srec, isColorSpace, false, platform, keepMismatchQual, no_real_output, hasTI, QualQuantizer, dropReadnames);
516 if (rc) {
517 (void)LOGERR(klogErr, rc, "SequenceWriteRecord failed");
518 break;
519 }
520 /*rc = KMemBankFree(frags, id);*/
521 CTX_VALUE_SET_S_ID(*value, ++ctx->spotId);
522 value->written = 1;
523 }
524 }
525 /*printf("DONE_SOLO:\tcnt2=%d\tcnt1=%d\n",fcountBoth,fcountOne);*/
526 KDataBufferWhack(&fragBuf);
527 KDataBufferWhack(&srec.storage);
528 return rc;
529 }
530
531 FragmentEntry *
SpotAssemblerGetFragmentEntry(SpotAssembler * self,uint64_t keyId)532 SpotAssemblerGetFragmentEntry(SpotAssembler * self, uint64_t keyId)
533 {
534 return & self -> fragment [ keyId % FRAGMENT_HOT_COUNT ];
535 }
536