1 /*
2
3 VSEARCH: a versatile open source tool for metagenomics
4
5 Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6 All rights reserved.
7
8 Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9 Department of Informatics, University of Oslo,
10 PO Box 1080 Blindern, NO-0316 Oslo, Norway
11
12 This software is dual-licensed and available under a choice
13 of one of two licenses, either under the terms of the GNU
14 General Public License version 3 or the BSD 2-Clause License.
15
16
17 GNU General Public License version 3
18
19 This program is free software: you can redistribute it and/or modify
20 it under the terms of the GNU General Public License as published by
21 the Free Software Foundation, either version 3 of the License, or
22 (at your option) any later version.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License for more details.
28
29 You should have received a copy of the GNU General Public License
30 along with this program. If not, see <http://www.gnu.org/licenses/>.
31
32
33 The BSD 2-Clause License
34
35 Redistribution and use in source and binary forms, with or without
36 modification, are permitted provided that the following conditions
37 are met:
38
39 1. Redistributions of source code must retain the above copyright
40 notice, this list of conditions and the following disclaimer.
41
42 2. Redistributions in binary form must reproduce the above copyright
43 notice, this list of conditions and the following disclaimer in the
44 documentation and/or other materials provided with the distribution.
45
46 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50 COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57 POSSIBILITY OF SUCH DAMAGE.
58
59 */
60
61 #include "vsearch.h"
62
63 #define BLOCKSIZE (4096 * 4096)
64
65 static unsigned int udb_dbaccel = 0;
66
67 typedef struct wordfreq
68 {
69 unsigned int kmer;
70 unsigned int count;
71 } wordfreq_t;
72
wc_compare(const void * a,const void * b)73 int wc_compare(const void * a, const void * b)
74 {
75 auto * x = (wordfreq_t *) a;
76 auto * y = (wordfreq_t *) b;
77 if (x->count < y->count) {
78 return -1;
79 } else if (x->count > y->count) {
80 return +1;
81 } else
82 {
83 if (x->kmer < y->kmer) {
84 return +1;
85 } else if (x->kmer > y->kmer) {
86 return -1;
87 } else {
88 return 0;
89
90 }
91 }
92 }
93
largeread(int fd,void * buf,uint64_t nbyte,uint64_t offset)94 uint64_t largeread(int fd, void * buf, uint64_t nbyte, uint64_t offset)
95 {
96 /* call pread multiple times and update progress */
97
98 uint64_t progress = offset;
99 for(uint64_t i = 0; i < nbyte; i += BLOCKSIZE)
100 {
101 uint64_t res = xlseek(fd, offset + i, SEEK_SET);
102 if (res != offset + i) {
103 fatal("Unable to seek in UDB file or invalid UDB file");
104
105 }
106
107 uint64 rem = MIN(BLOCKSIZE, nbyte - i);
108 uint64_t bytesread = read(fd, ((char*)buf) + i, rem);
109 if (bytesread != rem) {
110 fatal("Unable to read from UDB file or invalid UDB file");
111
112 }
113
114 progress += rem;
115 progress_update(progress);
116 }
117 return nbyte;
118 }
119
largewrite(int fd,void * buf,uint64_t nbyte,uint64_t offset)120 uint64_t largewrite(int fd, void * buf, uint64_t nbyte, uint64_t offset)
121 {
122 /* call write multiple times and update progress */
123
124 uint64_t progress = offset;
125 for(uint64_t i = 0; i < nbyte; i += BLOCKSIZE)
126 {
127 uint64_t res = xlseek(fd, offset + i, SEEK_SET);
128 if (res != offset + i) {
129 fatal("Unable to seek in UDB file or invalid UDB file");
130
131 }
132
133 uint64 rem = MIN(BLOCKSIZE, nbyte - i);
134 uint64_t byteswritten = write(fd, ((char*)buf) + i, rem);
135 if (byteswritten != rem) {
136 fatal("Unable to write to UDB file");
137
138 }
139
140 progress += rem;
141 progress_update(progress);
142 }
143 return nbyte;
144 }
145
udb_detect_isudb(const char * filename)146 bool udb_detect_isudb(const char * filename)
147 {
148 /*
149 Detect whether the given filename seems to refer to an UDB file.
150 It must be an uncompressed regular file, not a pipe.
151 */
152
153 xstat_t fs;
154
155 if (xstat(filename, & fs)) {
156 fatal("Unable to get status for input file (%s)", filename);
157 }
158
159 bool is_pipe = S_ISFIFO(fs.st_mode);
160 if (is_pipe) {
161 return false;
162 }
163
164 int fd = 0;
165 fd = xopen_read(filename);
166 if (!fd) {
167 fatal("Unable to open input file for reading (%s)", filename);
168 }
169
170 unsigned int magic = 0;
171 uint64_t bytesread = read(fd, & magic, 4);
172 close(fd);
173
174 if ((bytesread == 4) && (magic == 0x55444246)) {
175 return true;
176 }
177
178 return false;
179 }
180
udb_info()181 void udb_info()
182 {
183 /* Read UDB header and show basic info */
184
185 unsigned int buffer[50];
186
187 int fd_udbinfo = 0;
188
189 fd_udbinfo = xopen_read(opt_udbinfo);
190 if (! fd_udbinfo) {
191 fatal("Unable to open UDB file for reading");
192
193 }
194
195 uint64_t bytesread = read(fd_udbinfo, buffer, 4 * 50);
196 if (bytesread != 4 * 50) {
197 fatal("Unable to read from UDB file or invalid UDB file");
198
199 }
200
201 if ((buffer[0] != 0x55444246) ||
202 (buffer[2] != 32) ||
203 (buffer[4] < 3) ||
204 (buffer[4] > 15) ||
205 (buffer[13] == 0) ||
206 (buffer[17] != 0x0000746e) ||
207 (buffer[49] != 0x55444266)) {
208 fatal("Invalid UDB file");
209
210 }
211
212 if (!opt_quiet)
213 {
214 fprintf(stderr, " Seqs %u\n", buffer[13]);
215 fprintf(stderr, " SeqIx bits %u\n", buffer[2]);
216 fprintf(stderr, " Alpha nt (4)\n");
217 fprintf(stderr, " Word width %u\n", buffer[4]);
218 fprintf(stderr, " Slots %u\n", buffer[11]);
219 fprintf(stderr, " Dict size %u (%.1fk)\n",
220 (1 << (2 * buffer[4])),
221 (1 << (2 * buffer[4])) * 1.0 / 1000.0);
222 fprintf(stderr, " DBstep %u\n", buffer[5]);
223 fprintf(stderr, " DBAccel %u%%\n", buffer[6]);
224 }
225
226 if (opt_log)
227 {
228 fprintf(fp_log, " Seqs %u\n", buffer[13]);
229 fprintf(fp_log, " SeqIx bits %u\n", buffer[2]);
230 fprintf(fp_log, " Alpha nt (4)\n");
231 fprintf(fp_log, " Word width %u\n", buffer[4]);
232 fprintf(fp_log, " Slots %u\n", buffer[11]);
233 fprintf(fp_log, " Dict size %u (%.1fk)\n",
234 (1 << (2 * buffer[4])),
235 (1 << (2 * buffer[4])) * 1.0 / 1000.0);
236 fprintf(fp_log, " DBstep %u\n", buffer[5]);
237 fprintf(fp_log, " DBAccel %u%%\n", buffer[6]);
238 }
239
240 close(fd_udbinfo);
241 }
242
udb_read(const char * filename,bool create_bitmaps,bool parse_abundances)243 void udb_read(const char * filename,
244 bool create_bitmaps,
245 bool parse_abundances)
246 {
247 /* read UDB as indexed database */
248
249 unsigned int seqcount = 0;
250 unsigned int udb_wordlength = 0;
251 uint64 nucleotides = 0;
252
253 xstat_t fs;
254 if (xstat(filename, & fs)) {
255 fatal("Unable to get status for input file (%s)", filename);
256
257 }
258
259 bool is_pipe = S_ISFIFO(fs.st_mode);
260 if (is_pipe) {
261 fatal("Cannot read UDB file from a pipe");
262
263 }
264
265 /* get file size */
266
267 uint64_t filesize = fs.st_size;
268
269 /* open UDB file */
270
271 int fd_udb = 0;
272
273 fd_udb = xopen_read(filename);
274 if (! fd_udb) {
275 fatal("Unable to open UDB file for reading");
276
277 }
278
279 char * prompt = nullptr;
280 if (xsprintf(& prompt, "Reading UDB file %s", filename) == -1) {
281 fatal("Out of memory");
282
283 }
284
285 progress_init(prompt, filesize);
286
287 /* header */
288
289 unsigned int buffer[50];
290 uint64_t pos = 0;
291
292 pos += largeread(fd_udb, buffer, 4 * 50, pos);
293
294 if ((buffer[0] != 0x55444246) ||
295 (buffer[2] != 32) ||
296 (buffer[4] < 3) ||
297 (buffer[4] > 15) ||
298 (buffer[13] == 0) ||
299 (buffer[17] != 0x0000746e) ||
300 (buffer[49] != 0x55444266)) {
301 fatal("Invalid UDB file");
302
303 }
304
305 udb_wordlength = buffer[4];
306 seqcount = buffer[13];
307 udb_dbaccel = buffer[6];
308
309 if (udb_wordlength != opt_wordlength)
310 {
311 fprintf(stderr, "\nWARNING: Wordlength adjusted to %u as indicated in UDB file\n", udb_wordlength);
312 opt_wordlength = udb_wordlength;
313 }
314
315 /* word match counts */
316
317 kmerhashsize = 1 << (2 * udb_wordlength);
318 kmercount = (unsigned int*) xmalloc(kmerhashsize * sizeof(unsigned int));
319 kmerhash = (uint64_t *) xmalloc(kmerhashsize * sizeof(uint64_t));
320 kmerbitmap = (bitmap_t * *) xmalloc(kmerhashsize * sizeof(bitmap_t**));
321
322 memset(kmerbitmap, 0, kmerhashsize * sizeof(bitmap_t**));
323
324 pos += largeread(fd_udb, kmercount, 4 * kmerhashsize, pos);
325
326 kmerindexsize = 0;
327 for(uint64_t i = 0; i < kmerhashsize; i++)
328 {
329 kmerhash[i] = kmerindexsize;
330 kmerindexsize += kmercount[i];
331 }
332
333 /* signature */
334
335 pos += largeread(fd_udb, buffer, 4, pos);
336
337 if (buffer[0] != 0x55444233) {
338 fatal("Invalid UDB file");
339
340 }
341
342 /* sequence numbers for word matches */
343
344 kmerindex = (unsigned int *) xmalloc(kmerindexsize * 4);
345
346 pos += largeread(fd_udb, kmerindex, 4 * kmerindexsize, pos);
347
348 /* new header */
349
350 pos += largeread(fd_udb, buffer, 4 * 8, pos);
351
352 if ((buffer[0] != 0x55444234) ||
353 (buffer[1] != 0x005e0db3) ||
354 (buffer[2] != seqcount) ||
355 (buffer[7] != 0x005e0db4)) {
356 fatal("Invalid UDB file");
357
358 }
359
360 nucleotides = (((uint64_t) buffer[4]) << 32) | buffer[3];
361 uint64_t udb_headerchars = (((uint64_t) buffer[6]) << 32) | buffer[5];
362
363 /* header index */
364
365 seqindex = (seqinfo_t *) xmalloc(seqcount * sizeof(seqinfo_t));
366
367 int * header_index = (int *) xmalloc(4 * (seqcount+1));
368
369 pos += largeread(fd_udb, header_index, 4 * seqcount, pos);
370
371 header_index[seqcount] = udb_headerchars;
372
373 unsigned last = 0;
374 for(unsigned int i = 0; i < seqcount; i++)
375 {
376 unsigned int x = header_index[i];
377 if ((x < last) || (x >= udb_headerchars)) {
378 fatal("Invalid UDB file");
379
380 }
381 seqindex[i].header_p = x;
382 seqindex[i].headerlen = header_index[i+1] - x - 1;
383 seqindex[i].size = 1;
384 last = x;
385 }
386
387 xfree(header_index);
388
389 /* headers */
390
391 datap = (char *) xmalloc(udb_headerchars + nucleotides + seqcount);
392
393 pos += largeread(fd_udb, datap, udb_headerchars, pos);
394
395 uint64_t longestheader = 0;
396 for(unsigned int i = 0; i < seqcount; i++)
397 {
398 if (seqindex[i].headerlen > longestheader) {
399 longestheader = seqindex[i].headerlen;
400
401 }
402 }
403
404 /* sequence lengths */
405
406 int * sequence_lengths = (int *) xmalloc(4 * seqcount);
407
408 pos += largeread(fd_udb, sequence_lengths, 4 * seqcount, pos);
409
410 uint64_t sum = 0;
411 unsigned int shortest = UINT_MAX;
412 unsigned int longest = 0;
413
414 for(unsigned int i = 0; i < seqcount; i++)
415 {
416 unsigned int x = sequence_lengths[i];
417
418 seqindex[i].seq_p = udb_headerchars + sum;
419 seqindex[i].seqlen = x;
420 seqindex[i].qual_p = 0;
421
422 if (x < shortest) {
423 shortest = x;
424
425 }
426
427 if (x > longest) {
428 longest = x;
429
430 }
431
432 sum += x;
433
434 if (sum > nucleotides) {
435 fatal("Invalid UDB file");
436
437 }
438 }
439
440 xfree(sequence_lengths);
441
442 if (sum != nucleotides) {
443 fatal("Invalid UDB file");
444
445 }
446
447 /* sequences */
448
449 pos += largeread(fd_udb, datap + udb_headerchars, nucleotides, pos);
450
451 if (pos != filesize) {
452 fatal("Incorrect UDB file size");
453
454 }
455
456 /* close UDB file */
457
458 close(fd_udb);
459
460 progress_done();
461 xfree(prompt);
462
463 /* move sequences and insert zero at end of each sequence */
464
465 progress_init("Reorganizing data in memory", seqcount);
466 for(unsigned int i = seqcount-1; i > 0; i--)
467 {
468 size_t old_p = seqindex[i].seq_p;
469 size_t new_p = seqindex[i].seq_p + i;
470 size_t len = seqindex[i].seqlen;
471 memmove(datap + new_p, datap + old_p, len);
472 *(datap + new_p + len) = 0;
473 seqindex[i].seq_p = new_p;
474 progress_update(seqcount - i);
475 }
476 *(datap + seqindex[0].seq_p + seqindex[0].seqlen) = 0;
477 progress_done();
478
479 /* Create bitmaps for the most frequent words */
480
481 if (create_bitmaps)
482 {
483 progress_init("Creating bitmaps", kmerhashsize);
484 unsigned int bitmap_mincount = seqcount / 8;
485 for(unsigned int i = 0; i < kmerhashsize; i++)
486 {
487 if (kmercount[i] >= bitmap_mincount)
488 {
489 kmerbitmap[i] = bitmap_init(seqcount+127); // pad for xmm
490 bitmap_reset_all(kmerbitmap[i]);
491 for(unsigned j = 0; j < kmercount[i]; j++) {
492 bitmap_set(kmerbitmap[i], kmerindex[kmerhash[i]+j]);
493
494 }
495 }
496 progress_update(i+1);
497 }
498 progress_done();
499 }
500
501 /* get abundances and longest header */
502
503 if (parse_abundances)
504 {
505 progress_init("Parsing abundances", seqcount);
506 for(unsigned int i = 0; i < seqcount; i++)
507 {
508 int64_t size = header_get_size(datap + seqindex[i].header_p,
509 seqindex[i].headerlen);
510 if (size > 0) {
511 seqindex[i].size = size;
512 } else {
513 seqindex[i].size = 1;
514
515 }
516 progress_update(i+1);
517 }
518 progress_done();
519 }
520
521 /* set database info */
522
523 dbindex_uh = unique_init();
524
525 db_setinfo(false,
526 seqcount,
527 nucleotides,
528 longest,
529 shortest,
530 longestheader);
531
532 /* make mapping from indexno to seqno */
533
534 dbindex_map = (unsigned int *) xmalloc(seqcount * sizeof(unsigned int));
535 dbindex_count = seqcount;
536
537 for (unsigned int i = 0; i < seqcount; i++) {
538 dbindex_map[i] = i;
539
540 }
541
542 /* done */
543
544 /* some stats */
545
546 if (!opt_quiet)
547 {
548 if (seqcount > 0) {
549 fprintf(stderr,
550 "%'" PRIu64 " nt in %'" PRIu64 " seqs, min %'" PRIu64 ", max %'" PRIu64 ", avg %'.0f\n",
551 db_getnucleotidecount(),
552 db_getsequencecount(),
553 db_getshortestsequence(),
554 db_getlongestsequence(),
555 db_getnucleotidecount() * 1.0 / db_getsequencecount());
556 } else {
557 fprintf(stderr,
558 "%'" PRIu64 " nt in %'" PRIu64 " seqs\n",
559 db_getnucleotidecount(),
560 db_getsequencecount());
561
562 }
563 }
564
565 if (opt_log)
566 {
567 if (seqcount > 0) {
568 fprintf(fp_log,
569 "%'" PRIu64 " nt in %'" PRIu64 " seqs, min %'" PRIu64 ", max %'" PRIu64 ", avg %'.0f\n\n",
570 db_getnucleotidecount(),
571 db_getsequencecount(),
572 db_getshortestsequence(),
573 db_getlongestsequence(),
574 db_getnucleotidecount() * 1.0 / db_getsequencecount());
575 } else {
576 fprintf(fp_log,
577 "%'" PRIu64 " nt in %'" PRIu64 " seqs\n\n",
578 db_getnucleotidecount(),
579 db_getsequencecount());
580
581 }
582 }
583 }
584
udb_fasta()585 void udb_fasta()
586 {
587 /* open FASTA file for writing */
588
589 FILE * fp_output = fopen_output(opt_output);
590 if (!fp_output) {
591 fatal("Unable to open FASTA output file for writing");
592
593 }
594
595 /* read UDB file */
596
597 udb_read(opt_udb2fasta, false, false);
598
599 /* dump fasta */
600
601 unsigned int seqcount = db_getsequencecount();
602 progress_init("Writing FASTA file", seqcount);
603 for(unsigned int i = 0; i < seqcount; i++)
604 {
605 fasta_print_db_relabel(fp_output, i, i+1);
606 progress_update(i+1);
607 }
608 progress_done();
609 fclose(fp_output);
610
611 dbindex_free();
612 db_free();
613 }
614
udb_stats()615 void udb_stats()
616 {
617 /* show word statistics for an UDB file */
618
619 /* read UDB file */
620
621 udb_read(opt_udbstats, false, false);
622
623 /* analyze word counts */
624
625 auto * freqtable = (wordfreq_t *) xmalloc
626 (sizeof(wordfreq_t) * kmerhashsize);
627
628 for(unsigned int i = 0; i < kmerhashsize; i++)
629 {
630 freqtable[i].kmer = i;
631 freqtable[i].count = kmercount[i];
632 }
633
634 qsort(freqtable, kmerhashsize, sizeof(wordfreq_t), wc_compare);
635
636 unsigned int wcmax = freqtable[kmerhashsize-1].count;
637 unsigned int wcmedian = ( freqtable[(kmerhashsize / 2) - 1].count +
638 freqtable[kmerhashsize / 2].count ) / 2;
639
640 unsigned int seqcount = db_getsequencecount();
641 uint64_t nt = db_getnucleotidecount();
642
643 /* show stats */
644
645 if (opt_log)
646 {
647 fprintf(fp_log, " Alphabet nt\n");
648 fprintf(fp_log, " Word width %" PRIu64 "\n", opt_wordlength);
649 fprintf(fp_log, " Word ones %" PRIu64 "\n", opt_wordlength);
650 fprintf(fp_log, " Spaced No\n");
651 fprintf(fp_log, " Hashed No\n");
652 fprintf(fp_log, " Coded No\n");
653 fprintf(fp_log, " Stepped No\n");
654 fprintf(fp_log,
655 " Slots %u (%.1fk)\n",
656 kmerhashsize,
657 1.0 * kmerhashsize / 1000.0);
658 fprintf(fp_log, " DBAccel %u%%\n", udb_dbaccel);
659 fprintf(fp_log, "\n");
660
661 fprintf(fp_log,
662 "%10" PRIu64 " DB size (%.1fk)\n",
663 nt,
664 1.0 * nt / 1000.0);
665 fprintf(fp_log, "%10" PRIu64 " Words\n", kmerindexsize);
666 fprintf(fp_log, "%10u Median size\n", wcmedian);
667 fprintf(fp_log,
668 "%10.1f Mean size\n",
669 1.0 * kmerindexsize / kmerhashsize);
670 fprintf(fp_log, "\n");
671
672 fprintf(fp_log,
673 " iWord sWord Cap Size Row\n");
674 fprintf(fp_log,
675 "---------- ------------ ---------- ---------- ---\n");
676
677 for(unsigned int i = 0; i < kmerhashsize; i++)
678 {
679 fprintf(fp_log,
680 "%10u ",
681 freqtable[kmerhashsize-1-i].kmer);
682
683 fprintf(fp_log,
684 "%.*s", MAX(12 - (int)(opt_wordlength), 0), " ");
685
686 fprint_kmer(fp_log, opt_wordlength, freqtable[kmerhashsize-1-i].kmer);
687
688 fprintf(fp_log,
689 " %10u %10u",
690 0,
691 freqtable[kmerhashsize-1-i].count);
692
693 fprintf(fp_log, " ");
694
695 for(unsigned j = 0; j < freqtable[kmerhashsize-1-i].count; j++)
696 {
697 fprintf(fp_log,
698 " %u", kmerindex[kmerhash[freqtable[kmerhashsize-1-i].kmer]+j]);
699
700 if (j == 7) {
701 break;
702
703 }
704 }
705
706
707 if (freqtable[kmerhashsize-1-i].count > 8) {
708 fprintf(fp_log, "...");
709
710 }
711
712 fprintf(fp_log, "\n");
713
714 if (i == 10) {
715 break;
716
717 }
718 }
719
720 fprintf(fp_log, "\n\n");
721
722 fprintf(fp_log, "Word width %" PRIu64 "\n", opt_wordlength);
723 fprintf(fp_log, "Slots %u\n", kmerhashsize);
724 fprintf(fp_log, "Words %" PRIu64 "\n", kmerindexsize);
725 fprintf(fp_log, "Max size %u (", wcmax);
726 fprint_kmer(fp_log, opt_wordlength, freqtable[kmerhashsize-1].kmer);
727 fprintf(fp_log, ")\n\n");
728
729 fprintf(fp_log, " Size lo Size hi Total size Nr. Words Pct TotPct\n");
730 fprintf(fp_log, "---------- ---------- ---------- ---------- ------ ------\n");
731
732
733 unsigned int size_lo = 0;
734 unsigned int size_hi = 0;
735 unsigned int x = 0;
736 double totpct = 0.0;
737
738 while (size_lo < seqcount)
739 {
740
741 int count = 0;
742 int size = 0;
743 while((x < kmerhashsize) && (freqtable[x].count <= size_hi))
744 {
745 count++;
746 size += freqtable[x].count;
747 x++;
748 }
749
750 double pct = 100.0 * count / kmerhashsize;
751 totpct += pct;
752
753 if (size_lo < size_hi) {
754 fprintf(fp_log, "%10u", size_lo);
755 } else {
756 fprintf(fp_log, " ");
757
758 }
759
760 fprintf(fp_log, " %10u", size_hi);
761
762 if (size >= 10000) {
763 fprintf(fp_log, " %9.1fk", size * 0.001);
764 } else {
765 fprintf(fp_log, " %10.1f", size * 1.0);
766
767 }
768
769 if (count >= 10000) {
770 fprintf(fp_log, " %9.1fk", count * 0.001);
771 } else {
772 fprintf(fp_log, " %10.1f", count * 1.0);
773
774 }
775
776 fprintf(fp_log, " %5.1f%% %5.1f%%", pct, totpct);
777
778 int dots = int (pct / 3.0 + 0.5);
779
780 if (dots > 0) {
781 fprintf(fp_log, " ");
782
783 }
784
785 for (int i = 0; i < dots ; i++) {
786 fprintf(fp_log, "*");
787
788 }
789
790 fprintf(fp_log, "\n");
791
792 size_lo = size_hi + 1;
793 if (size_hi > 0) {
794 size_hi *= 2;
795 } else {
796 size_hi = 1;
797
798 }
799 if (size_hi > seqcount) {
800 size_hi = seqcount;
801
802 }
803 }
804
805 fprintf(fp_log, "---------- ---------- ---------- ----------\n");
806 fprintf(fp_log, " ");
807
808 if (kmerindexsize >= 10000) {
809 fprintf(fp_log, " %9.1fk", kmerindexsize * 0.001);
810 } else {
811 fprintf(fp_log, " %10.1f", kmerindexsize * 1.0);
812
813 }
814
815 if (kmerhashsize >= 10000) {
816 fprintf(fp_log, " %9.1fk", kmerhashsize * 0.001);
817 } else {
818 fprintf(fp_log, " %10.1f", kmerhashsize * 1.0);
819
820 }
821
822 fprintf(fp_log, "\n\n");
823
824 fprintf(fp_log, "%10" PRIu64 " Upper\n", nt);
825 fprintf(fp_log, "%10u Lower (%.1f%%)\n", 0, 0.0);
826 fprintf(fp_log, "%10" PRIu64 " Total\n", nt);
827 fprintf(fp_log, "%10" PRIu64 " Indexed words\n", kmerindexsize);
828 }
829
830 xfree(freqtable);
831 dbindex_free();
832 db_free();
833 }
834
udb_make()835 void udb_make()
836 {
837 int fd_output = 0;
838
839 fd_output = xopen_write(opt_output);
840 if (!fd_output) {
841 fatal("Unable to open output file for writing");
842
843 }
844
845 db_read(opt_makeudb_usearch, 1);
846
847 if (opt_dbmask == MASK_DUST) {
848 dust_all();
849 } else if ((opt_dbmask == MASK_SOFT) && (opt_hardmask)) {
850 hardmask_all();
851
852 }
853
854 dbindex_prepare(1, opt_dbmask);
855 dbindex_addallsequences(opt_dbmask);
856
857 unsigned int seqcount = db_getsequencecount();
858 uint64_t ntcount = db_getnucleotidecount();
859
860 uint64_t header_characters = 0;
861 for (unsigned int i=0; i<seqcount; i++) {
862 header_characters += db_getheaderlen(i) + 1;
863
864 }
865
866 uint64_t kmerhashsize = 1 << (2 * opt_wordlength);
867
868 /* count word matches */
869 uint64_t wordmatches = 0;
870 for(unsigned int i = 0; i < kmerhashsize; i++) {
871 wordmatches += kmercount[i];
872
873 }
874
875 uint64_t pos = 0;
876 uint64_t progress_all =
877 4 * 50 +
878 4 * kmerhashsize +
879 4 * 1 +
880 4 * wordmatches +
881 4 * 8 +
882 4 * seqcount +
883 header_characters +
884 4 * seqcount +
885 ntcount;
886
887 progress_init("Writing UDB file", progress_all);
888
889 uint64_t buffersize = 4 * MAX(50, seqcount);
890 auto * buffer = (unsigned int *) xmalloc(buffersize);
891 memset(buffer, 0, buffersize);
892
893 /* Header */
894 buffer[0] = 0x55444246; /* FBDU UDBF */
895 buffer[2] = 32; /* bits */
896 buffer[4] = opt_wordlength; /* default 8 */
897 buffer[5] = 1; /* dbstep */
898 buffer[6] = 100; /* dbaccelpct % */
899 buffer[11] = 0; /* slots */
900 buffer[13] = (unsigned int) seqcount; /* number of sequences */
901 buffer[17] = 0x0000746e; /* alphabet: "nt" */
902 buffer[49] = 0x55444266; /* fBDU UDBf */
903 pos += largewrite(fd_output, buffer, 50 * 4, 0);
904
905 /* write 4^wordlength uint32's with word match counts */
906 pos += largewrite(fd_output, kmercount, 4 * kmerhashsize, pos);
907
908 /* 3BDU */
909 buffer[0] = 0x55444233; /* 3BDU UDB3 */
910 pos += largewrite(fd_output, buffer, 1 * 4, pos);
911
912 /* lists of sequence no's with matches for all words */
913 for(unsigned int i = 0; i < kmerhashsize; i++)
914 {
915 if (kmerbitmap[i])
916 {
917 memset(buffer, 0, 4 * kmercount[i]);
918 unsigned int elements = 0;
919 for (unsigned int j = 0; j < seqcount; j++) {
920 if (bitmap_get(kmerbitmap[i], j)) {
921 buffer[elements++] = j;
922
923 }
924
925 }
926 pos += largewrite(fd_output, buffer, 4 * elements, pos);
927 }
928 else
929 {
930 if (kmercount[i] > 0)
931 {
932 pos += largewrite(fd_output,
933 kmerindex + kmerhash[i],
934 4 * kmercount[i],
935 pos);
936 }
937 }
938 }
939
940 /* New header */
941 buffer[0] = 0x55444234; /* 4BDU UDB4 */
942 /* 0x005e0db3 */
943 buffer[1] = 0x005e0db3;
944 /* number of sequences, uint32 */
945 buffer[2] = (unsigned int) seqcount;
946 /* total number of nucleotides, uint64 */
947 buffer[3] = (unsigned int)(ntcount & 0xffffffff);
948 buffer[4] = (unsigned int)(ntcount >> 32);
949 /* total number of header characters, incl zero-terminator, uint64 */
950 buffer[5] = (unsigned int)(header_characters & 0xffffffff);
951 buffer[6] = (unsigned int)(header_characters >> 32);
952 /* 0x005e0db4 */
953 buffer[7] = 0x005e0db4;
954 pos += largewrite(fd_output, buffer, 4 * 8, pos);
955
956 /* indices to headers (uint32) */
957 unsigned int sum = 0;
958 for (unsigned int i = 0; i < seqcount; i++)
959 {
960 buffer[i] = sum;
961 sum += db_getheaderlen(i) + 1;
962 }
963 pos += largewrite(fd_output, buffer, 4 * seqcount, pos);
964
965 /* headers (ascii, zero terminated, not padded) */
966 for (unsigned int i = 0; i < seqcount; i++)
967 {
968 unsigned int len = db_getheaderlen(i);
969 pos += largewrite(fd_output, db_getheader(i), len + 1, pos);
970 }
971
972 /* sequence lengths (uint32) */
973 for (unsigned int i = 0; i < seqcount; i++) {
974 buffer[i] = db_getsequencelen(i);
975
976 }
977 pos += largewrite(fd_output, buffer, 4 * seqcount, pos);
978
979 /* sequences (ascii, no term, no pad) */
980 for (unsigned int i = 0; i < seqcount; i++)
981 {
982 unsigned int len = db_getsequencelen(i);
983 pos += largewrite(fd_output, db_getsequence(i), len, pos);
984 }
985
986 if (close(fd_output) != 0) {
987 fatal("Unable to close UDB file");
988
989 }
990
991 progress_done();
992 dbindex_free();
993 db_free();
994 xfree(buffer);
995 }
996