1 /*
2 
3   VSEARCH: a versatile open source tool for metagenomics
4 
5   Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6   All rights reserved.
7 
8   Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9   Department of Informatics, University of Oslo,
10   PO Box 1080 Blindern, NO-0316 Oslo, Norway
11 
12   This software is dual-licensed and available under a choice
13   of one of two licenses, either under the terms of the GNU
14   General Public License version 3 or the BSD 2-Clause License.
15 
16 
17   GNU General Public License version 3
18 
19   This program is free software: you can redistribute it and/or modify
20   it under the terms of the GNU General Public License as published by
21   the Free Software Foundation, either version 3 of the License, or
22   (at your option) any later version.
23 
24   This program is distributed in the hope that it will be useful,
25   but WITHOUT ANY WARRANTY; without even the implied warranty of
26   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27   GNU General Public License for more details.
28 
29   You should have received a copy of the GNU General Public License
30   along with this program.  If not, see <http://www.gnu.org/licenses/>.
31 
32 
33   The BSD 2-Clause License
34 
35   Redistribution and use in source and binary forms, with or without
36   modification, are permitted provided that the following conditions
37   are met:
38 
39   1. Redistributions of source code must retain the above copyright
40   notice, this list of conditions and the following disclaimer.
41 
42   2. Redistributions in binary form must reproduce the above copyright
43   notice, this list of conditions and the following disclaimer in the
44   documentation and/or other materials provided with the distribution.
45 
46   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50   COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51   INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52   BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57   POSSIBILITY OF SUCH DAMAGE.
58 
59 */
60 
61 #include "vsearch.h"
62 
63 #define BLOCKSIZE (4096 * 4096)
64 
65 static unsigned int udb_dbaccel = 0;
66 
67 typedef struct wordfreq
68 {
69   unsigned int kmer;
70   unsigned int count;
71 } wordfreq_t;
72 
wc_compare(const void * a,const void * b)73 int wc_compare(const void * a, const void * b)
74 {
75   auto * x = (wordfreq_t *) a;
76   auto * y = (wordfreq_t *) b;
77   if (x->count < y->count) {
78     return -1;
79   } else if (x->count > y->count) {
80     return +1;
81   } else
82     {
83       if (x->kmer < y->kmer) {
84         return +1;
85       } else if (x->kmer > y->kmer) {
86         return -1;
87       } else {
88         return 0;
89 
90         }
91     }
92 }
93 
largeread(int fd,void * buf,uint64_t nbyte,uint64_t offset)94 uint64_t largeread(int fd, void * buf, uint64_t nbyte, uint64_t offset)
95 {
96   /* call pread multiple times and update progress */
97 
98   uint64_t progress = offset;
99   for(uint64_t i = 0; i < nbyte; i += BLOCKSIZE)
100     {
101       uint64_t res = xlseek(fd, offset + i, SEEK_SET);
102       if (res != offset + i) {
103         fatal("Unable to seek in UDB file or invalid UDB file");
104 
105         }
106 
107       uint64 rem = MIN(BLOCKSIZE, nbyte - i);
108       uint64_t bytesread = read(fd, ((char*)buf) + i, rem);
109       if (bytesread != rem) {
110         fatal("Unable to read from UDB file or invalid UDB file");
111 
112         }
113 
114       progress += rem;
115       progress_update(progress);
116     }
117   return nbyte;
118 }
119 
largewrite(int fd,void * buf,uint64_t nbyte,uint64_t offset)120 uint64_t largewrite(int fd, void * buf, uint64_t nbyte, uint64_t offset)
121 {
122   /* call write multiple times and update progress */
123 
124   uint64_t progress = offset;
125   for(uint64_t i = 0; i < nbyte; i += BLOCKSIZE)
126     {
127       uint64_t res = xlseek(fd, offset + i, SEEK_SET);
128       if (res != offset + i) {
129         fatal("Unable to seek in UDB file or invalid UDB file");
130 
131         }
132 
133       uint64 rem = MIN(BLOCKSIZE, nbyte - i);
134       uint64_t byteswritten = write(fd, ((char*)buf) + i, rem);
135       if (byteswritten != rem) {
136         fatal("Unable to write to UDB file");
137 
138         }
139 
140       progress += rem;
141       progress_update(progress);
142     }
143   return nbyte;
144 }
145 
udb_detect_isudb(const char * filename)146 bool udb_detect_isudb(const char * filename)
147 {
148   /*
149     Detect whether the given filename seems to refer to an UDB file.
150     It must be an uncompressed regular file, not a pipe.
151   */
152 
153   xstat_t fs;
154 
155   if (xstat(filename, & fs)) {
156     fatal("Unable to get status for input file (%s)", filename);
157   }
158 
159   bool is_pipe = S_ISFIFO(fs.st_mode);
160   if (is_pipe) {
161     return false;
162   }
163 
164   int fd = 0;
165   fd = xopen_read(filename);
166   if (!fd) {
167     fatal("Unable to open input file for reading (%s)", filename);
168   }
169 
170   unsigned int magic = 0;
171   uint64_t bytesread = read(fd, & magic, 4);
172   close(fd);
173 
174   if ((bytesread == 4) && (magic == 0x55444246)) {
175     return true;
176   }
177 
178   return false;
179 }
180 
udb_info()181 void udb_info()
182 {
183   /* Read UDB header and show basic info */
184 
185   unsigned int buffer[50];
186 
187   int fd_udbinfo = 0;
188 
189   fd_udbinfo = xopen_read(opt_udbinfo);
190   if (! fd_udbinfo) {
191     fatal("Unable to open UDB file for reading");
192 
193         }
194 
195   uint64_t bytesread = read(fd_udbinfo, buffer, 4 * 50);
196   if (bytesread != 4 * 50) {
197     fatal("Unable to read from UDB file or invalid UDB file");
198 
199         }
200 
201   if ((buffer[0]  != 0x55444246) ||
202       (buffer[2] != 32) ||
203       (buffer[4] < 3) ||
204       (buffer[4] > 15) ||
205       (buffer[13] == 0) ||
206       (buffer[17] != 0x0000746e) ||
207       (buffer[49] != 0x55444266)) {
208     fatal("Invalid UDB file");
209 
210         }
211 
212   if (!opt_quiet)
213     {
214       fprintf(stderr, "           Seqs  %u\n", buffer[13]);
215       fprintf(stderr, "     SeqIx bits  %u\n", buffer[2]);
216       fprintf(stderr, "          Alpha  nt (4)\n");
217       fprintf(stderr, "     Word width  %u\n", buffer[4]);
218       fprintf(stderr, "          Slots  %u\n", buffer[11]);
219       fprintf(stderr, "      Dict size  %u (%.1fk)\n",
220               (1 << (2 * buffer[4])),
221               (1 << (2 * buffer[4])) * 1.0 / 1000.0);
222       fprintf(stderr, "         DBstep  %u\n", buffer[5]);
223       fprintf(stderr, "        DBAccel  %u%%\n", buffer[6]);
224     }
225 
226   if (opt_log)
227     {
228       fprintf(fp_log, "           Seqs  %u\n", buffer[13]);
229       fprintf(fp_log, "     SeqIx bits  %u\n", buffer[2]);
230       fprintf(fp_log, "          Alpha  nt (4)\n");
231       fprintf(fp_log, "     Word width  %u\n", buffer[4]);
232       fprintf(fp_log, "          Slots  %u\n", buffer[11]);
233       fprintf(fp_log, "      Dict size  %u (%.1fk)\n",
234               (1 << (2 * buffer[4])),
235               (1 << (2 * buffer[4])) * 1.0 / 1000.0);
236       fprintf(fp_log, "         DBstep  %u\n", buffer[5]);
237       fprintf(fp_log, "        DBAccel  %u%%\n", buffer[6]);
238     }
239 
240   close(fd_udbinfo);
241 }
242 
udb_read(const char * filename,bool create_bitmaps,bool parse_abundances)243 void udb_read(const char * filename,
244               bool create_bitmaps,
245               bool parse_abundances)
246 {
247   /* read UDB as indexed database */
248 
249   unsigned int seqcount = 0;
250   unsigned int udb_wordlength = 0;
251   uint64 nucleotides = 0;
252 
253   xstat_t fs;
254   if (xstat(filename, & fs)) {
255     fatal("Unable to get status for input file (%s)", filename);
256 
257         }
258 
259   bool is_pipe = S_ISFIFO(fs.st_mode);
260   if (is_pipe) {
261     fatal("Cannot read UDB file from a pipe");
262 
263         }
264 
265   /* get file size */
266 
267   uint64_t filesize = fs.st_size;
268 
269   /* open UDB file */
270 
271   int fd_udb = 0;
272 
273   fd_udb = xopen_read(filename);
274   if (! fd_udb) {
275     fatal("Unable to open UDB file for reading");
276 
277         }
278 
279   char * prompt = nullptr;
280   if (xsprintf(& prompt, "Reading UDB file %s", filename) == -1) {
281     fatal("Out of memory");
282 
283         }
284 
285   progress_init(prompt, filesize);
286 
287   /* header */
288 
289   unsigned int buffer[50];
290   uint64_t pos = 0;
291 
292   pos += largeread(fd_udb, buffer, 4 * 50, pos);
293 
294   if ((buffer[0]  != 0x55444246) ||
295       (buffer[2] != 32) ||
296       (buffer[4] < 3) ||
297       (buffer[4] > 15) ||
298       (buffer[13] == 0) ||
299       (buffer[17] != 0x0000746e) ||
300       (buffer[49] != 0x55444266)) {
301     fatal("Invalid UDB file");
302 
303         }
304 
305   udb_wordlength = buffer[4];
306   seqcount = buffer[13];
307   udb_dbaccel = buffer[6];
308 
309   if (udb_wordlength != opt_wordlength)
310     {
311       fprintf(stderr, "\nWARNING: Wordlength adjusted to %u as indicated in UDB file\n", udb_wordlength);
312       opt_wordlength = udb_wordlength;
313     }
314 
315   /* word match counts */
316 
317   kmerhashsize = 1 << (2 * udb_wordlength);
318   kmercount = (unsigned int*) xmalloc(kmerhashsize * sizeof(unsigned int));
319   kmerhash = (uint64_t *) xmalloc(kmerhashsize * sizeof(uint64_t));
320   kmerbitmap = (bitmap_t * *) xmalloc(kmerhashsize * sizeof(bitmap_t**));
321 
322   memset(kmerbitmap, 0, kmerhashsize * sizeof(bitmap_t**));
323 
324   pos += largeread(fd_udb, kmercount, 4 * kmerhashsize, pos);
325 
326   kmerindexsize = 0;
327   for(uint64_t i = 0; i < kmerhashsize; i++)
328     {
329       kmerhash[i] = kmerindexsize;
330       kmerindexsize += kmercount[i];
331     }
332 
333   /* signature */
334 
335   pos += largeread(fd_udb, buffer, 4, pos);
336 
337   if (buffer[0] != 0x55444233) {
338     fatal("Invalid UDB file");
339 
340         }
341 
342   /* sequence numbers for word matches */
343 
344   kmerindex = (unsigned int *) xmalloc(kmerindexsize * 4);
345 
346   pos += largeread(fd_udb, kmerindex, 4 * kmerindexsize, pos);
347 
348   /* new header */
349 
350   pos += largeread(fd_udb, buffer, 4 * 8, pos);
351 
352   if ((buffer[0] != 0x55444234) ||
353       (buffer[1] != 0x005e0db3) ||
354       (buffer[2] != seqcount) ||
355       (buffer[7] != 0x005e0db4)) {
356     fatal("Invalid UDB file");
357 
358         }
359 
360   nucleotides = (((uint64_t) buffer[4]) << 32) | buffer[3];
361   uint64_t udb_headerchars = (((uint64_t) buffer[6]) << 32) | buffer[5];
362 
363   /* header index */
364 
365   seqindex = (seqinfo_t *) xmalloc(seqcount * sizeof(seqinfo_t));
366 
367   int * header_index = (int *) xmalloc(4 * (seqcount+1));
368 
369   pos += largeread(fd_udb, header_index, 4 * seqcount, pos);
370 
371   header_index[seqcount] = udb_headerchars;
372 
373   unsigned last = 0;
374   for(unsigned int i = 0; i < seqcount; i++)
375     {
376       unsigned int x = header_index[i];
377       if ((x < last) || (x >= udb_headerchars)) {
378         fatal("Invalid UDB file");
379 
380         }
381       seqindex[i].header_p = x;
382       seqindex[i].headerlen = header_index[i+1] - x - 1;
383       seqindex[i].size = 1;
384       last = x;
385     }
386 
387   xfree(header_index);
388 
389   /* headers */
390 
391   datap = (char *) xmalloc(udb_headerchars + nucleotides + seqcount);
392 
393   pos += largeread(fd_udb, datap, udb_headerchars, pos);
394 
395   uint64_t longestheader = 0;
396   for(unsigned int i = 0; i < seqcount; i++)
397     {
398       if (seqindex[i].headerlen > longestheader) {
399         longestheader = seqindex[i].headerlen;
400 
401         }
402     }
403 
404   /* sequence lengths */
405 
406   int * sequence_lengths = (int *) xmalloc(4 * seqcount);
407 
408   pos += largeread(fd_udb, sequence_lengths, 4 * seqcount, pos);
409 
410   uint64_t sum = 0;
411   unsigned int shortest = UINT_MAX;
412   unsigned int longest = 0;
413 
414   for(unsigned int i = 0; i < seqcount; i++)
415     {
416       unsigned int x = sequence_lengths[i];
417 
418       seqindex[i].seq_p = udb_headerchars + sum;
419       seqindex[i].seqlen = x;
420       seqindex[i].qual_p = 0;
421 
422       if (x < shortest) {
423         shortest = x;
424 
425         }
426 
427       if (x > longest) {
428         longest = x;
429 
430         }
431 
432       sum += x;
433 
434       if (sum > nucleotides) {
435         fatal("Invalid UDB file");
436 
437         }
438     }
439 
440   xfree(sequence_lengths);
441 
442   if (sum != nucleotides) {
443     fatal("Invalid UDB file");
444 
445         }
446 
447   /* sequences */
448 
449   pos += largeread(fd_udb, datap + udb_headerchars, nucleotides, pos);
450 
451   if (pos != filesize) {
452     fatal("Incorrect UDB file size");
453 
454         }
455 
456   /* close UDB file */
457 
458   close(fd_udb);
459 
460   progress_done();
461   xfree(prompt);
462 
463   /* move sequences and insert zero at end of each sequence */
464 
465   progress_init("Reorganizing data in memory", seqcount);
466   for(unsigned int i = seqcount-1; i > 0; i--)
467     {
468       size_t old_p = seqindex[i].seq_p;
469       size_t new_p = seqindex[i].seq_p + i;
470       size_t len   = seqindex[i].seqlen;
471       memmove(datap + new_p, datap + old_p, len);
472       *(datap + new_p + len) = 0;
473       seqindex[i].seq_p = new_p;
474       progress_update(seqcount - i);
475     }
476   *(datap + seqindex[0].seq_p + seqindex[0].seqlen) = 0;
477   progress_done();
478 
479   /* Create bitmaps for the most frequent words */
480 
481   if (create_bitmaps)
482     {
483       progress_init("Creating bitmaps", kmerhashsize);
484       unsigned int bitmap_mincount = seqcount / 8;
485       for(unsigned int i = 0; i < kmerhashsize; i++)
486         {
487           if (kmercount[i] >= bitmap_mincount)
488             {
489               kmerbitmap[i] = bitmap_init(seqcount+127); // pad for xmm
490               bitmap_reset_all(kmerbitmap[i]);
491               for(unsigned j = 0; j < kmercount[i]; j++) {
492                 bitmap_set(kmerbitmap[i], kmerindex[kmerhash[i]+j]);
493 
494         }
495             }
496           progress_update(i+1);
497         }
498       progress_done();
499     }
500 
501   /* get abundances and longest header */
502 
503   if (parse_abundances)
504     {
505       progress_init("Parsing abundances", seqcount);
506       for(unsigned int i = 0; i < seqcount; i++)
507         {
508           int64_t size = header_get_size(datap + seqindex[i].header_p,
509                                          seqindex[i].headerlen);
510           if (size > 0) {
511             seqindex[i].size = size;
512           } else {
513             seqindex[i].size = 1;
514 
515         }
516           progress_update(i+1);
517         }
518       progress_done();
519     }
520 
521   /* set database info */
522 
523   dbindex_uh = unique_init();
524 
525   db_setinfo(false,
526              seqcount,
527              nucleotides,
528              longest,
529              shortest,
530              longestheader);
531 
532   /* make mapping from indexno to seqno */
533 
534   dbindex_map = (unsigned int *) xmalloc(seqcount * sizeof(unsigned int));
535   dbindex_count = seqcount;
536 
537   for (unsigned int i = 0; i < seqcount; i++) {
538     dbindex_map[i] = i;
539 
540         }
541 
542   /* done */
543 
544   /* some stats */
545 
546   if (!opt_quiet)
547     {
548       if (seqcount > 0) {
549         fprintf(stderr,
550                 "%'" PRIu64 " nt in %'" PRIu64 " seqs, min %'" PRIu64 ", max %'" PRIu64 ", avg %'.0f\n",
551                 db_getnucleotidecount(),
552                 db_getsequencecount(),
553                 db_getshortestsequence(),
554                 db_getlongestsequence(),
555                 db_getnucleotidecount() * 1.0 / db_getsequencecount());
556       } else {
557         fprintf(stderr,
558                 "%'" PRIu64 " nt in %'" PRIu64 " seqs\n",
559                 db_getnucleotidecount(),
560                 db_getsequencecount());
561 
562         }
563     }
564 
565   if (opt_log)
566     {
567       if (seqcount > 0) {
568         fprintf(fp_log,
569                 "%'" PRIu64 " nt in %'" PRIu64 " seqs, min %'" PRIu64 ", max %'" PRIu64 ", avg %'.0f\n\n",
570                 db_getnucleotidecount(),
571                 db_getsequencecount(),
572                 db_getshortestsequence(),
573                 db_getlongestsequence(),
574                 db_getnucleotidecount() * 1.0 / db_getsequencecount());
575       } else {
576         fprintf(fp_log,
577                 "%'" PRIu64 " nt in %'" PRIu64 " seqs\n\n",
578                 db_getnucleotidecount(),
579                 db_getsequencecount());
580 
581         }
582     }
583 }
584 
udb_fasta()585 void udb_fasta()
586 {
587   /* open FASTA file for writing */
588 
589   FILE * fp_output = fopen_output(opt_output);
590   if (!fp_output) {
591     fatal("Unable to open FASTA output file for writing");
592 
593         }
594 
595   /* read UDB file */
596 
597   udb_read(opt_udb2fasta, false, false);
598 
599   /* dump fasta */
600 
601   unsigned int seqcount = db_getsequencecount();
602   progress_init("Writing FASTA file", seqcount);
603   for(unsigned int i = 0; i < seqcount; i++)
604     {
605       fasta_print_db_relabel(fp_output, i, i+1);
606       progress_update(i+1);
607     }
608   progress_done();
609   fclose(fp_output);
610 
611   dbindex_free();
612   db_free();
613 }
614 
udb_stats()615 void udb_stats()
616 {
617   /* show word statistics for an UDB file */
618 
619   /* read UDB file */
620 
621   udb_read(opt_udbstats, false, false);
622 
623   /* analyze word counts */
624 
625   auto * freqtable = (wordfreq_t *) xmalloc
626     (sizeof(wordfreq_t) * kmerhashsize);
627 
628   for(unsigned int i = 0; i < kmerhashsize; i++)
629     {
630       freqtable[i].kmer = i;
631       freqtable[i].count = kmercount[i];
632     }
633 
634   qsort(freqtable, kmerhashsize, sizeof(wordfreq_t), wc_compare);
635 
636   unsigned int wcmax = freqtable[kmerhashsize-1].count;
637   unsigned int wcmedian = ( freqtable[(kmerhashsize / 2) - 1].count +
638                             freqtable[kmerhashsize / 2].count ) / 2;
639 
640   unsigned int seqcount = db_getsequencecount();
641   uint64_t nt = db_getnucleotidecount();
642 
643   /* show stats */
644 
645   if (opt_log)
646     {
647       fprintf(fp_log, "      Alphabet  nt\n");
648       fprintf(fp_log, "    Word width  %" PRIu64 "\n", opt_wordlength);
649       fprintf(fp_log, "     Word ones  %" PRIu64 "\n", opt_wordlength);
650       fprintf(fp_log, "        Spaced  No\n");
651       fprintf(fp_log, "        Hashed  No\n");
652       fprintf(fp_log, "         Coded  No\n");
653       fprintf(fp_log, "       Stepped  No\n");
654       fprintf(fp_log,
655               "         Slots  %u (%.1fk)\n",
656               kmerhashsize,
657               1.0 * kmerhashsize / 1000.0);
658       fprintf(fp_log, "       DBAccel  %u%%\n", udb_dbaccel);
659       fprintf(fp_log, "\n");
660 
661       fprintf(fp_log,
662               "%10" PRIu64 "  DB size (%.1fk)\n",
663               nt,
664               1.0 * nt / 1000.0);
665       fprintf(fp_log, "%10" PRIu64 "  Words\n", kmerindexsize);
666       fprintf(fp_log, "%10u  Median size\n", wcmedian);
667       fprintf(fp_log,
668               "%10.1f  Mean size\n",
669               1.0 * kmerindexsize / kmerhashsize);
670       fprintf(fp_log, "\n");
671 
672       fprintf(fp_log,
673               "     iWord         sWord         Cap        Size  Row\n");
674       fprintf(fp_log,
675               "----------  ------------  ----------  ----------  ---\n");
676 
677       for(unsigned int i = 0; i < kmerhashsize; i++)
678         {
679           fprintf(fp_log,
680                   "%10u  ",
681                   freqtable[kmerhashsize-1-i].kmer);
682 
683           fprintf(fp_log,
684                   "%.*s", MAX(12 - (int)(opt_wordlength), 0), "            ");
685 
686           fprint_kmer(fp_log, opt_wordlength, freqtable[kmerhashsize-1-i].kmer);
687 
688           fprintf(fp_log,
689                   "  %10u  %10u",
690                   0,
691                   freqtable[kmerhashsize-1-i].count);
692 
693           fprintf(fp_log, " ");
694 
695           for(unsigned j = 0; j < freqtable[kmerhashsize-1-i].count; j++)
696             {
697               fprintf(fp_log,
698                       " %u", kmerindex[kmerhash[freqtable[kmerhashsize-1-i].kmer]+j]);
699 
700               if (j == 7) {
701                 break;
702 
703         }
704             }
705 
706 
707           if (freqtable[kmerhashsize-1-i].count > 8) {
708             fprintf(fp_log, "...");
709 
710         }
711 
712           fprintf(fp_log, "\n");
713 
714           if (i == 10) {
715             break;
716 
717         }
718         }
719 
720       fprintf(fp_log, "\n\n");
721 
722       fprintf(fp_log, "Word width  %" PRIu64 "\n", opt_wordlength);
723       fprintf(fp_log, "Slots       %u\n", kmerhashsize);
724       fprintf(fp_log, "Words       %" PRIu64 "\n", kmerindexsize);
725       fprintf(fp_log, "Max size    %u (", wcmax);
726       fprint_kmer(fp_log, opt_wordlength, freqtable[kmerhashsize-1].kmer);
727       fprintf(fp_log, ")\n\n");
728 
729       fprintf(fp_log, "   Size lo     Size hi  Total size   Nr. Words     Pct  TotPct\n");
730       fprintf(fp_log, "----------  ----------  ----------  ----------  ------  ------\n");
731 
732 
733       unsigned int size_lo = 0;
734       unsigned int size_hi = 0;
735       unsigned int x = 0;
736       double totpct = 0.0;
737 
738       while (size_lo < seqcount)
739         {
740 
741           int count = 0;
742           int size = 0;
743           while((x < kmerhashsize) && (freqtable[x].count <= size_hi))
744             {
745               count++;
746               size += freqtable[x].count;
747               x++;
748             }
749 
750           double pct = 100.0 * count / kmerhashsize;
751           totpct += pct;
752 
753           if (size_lo < size_hi) {
754             fprintf(fp_log, "%10u", size_lo);
755           } else {
756             fprintf(fp_log, "          ");
757 
758         }
759 
760           fprintf(fp_log, "  %10u", size_hi);
761 
762           if (size >= 10000) {
763             fprintf(fp_log, "  %9.1fk", size * 0.001);
764           } else {
765             fprintf(fp_log, "  %10.1f", size * 1.0);
766 
767         }
768 
769           if (count >= 10000) {
770             fprintf(fp_log, "  %9.1fk", count * 0.001);
771           } else {
772             fprintf(fp_log, "  %10.1f", count * 1.0);
773 
774         }
775 
776           fprintf(fp_log, "  %5.1f%%  %5.1f%%", pct, totpct);
777 
778           int dots = int (pct / 3.0 + 0.5);
779 
780           if (dots > 0) {
781             fprintf(fp_log, "  ");
782 
783         }
784 
785           for (int i = 0; i < dots ; i++) {
786             fprintf(fp_log, "*");
787 
788         }
789 
790           fprintf(fp_log, "\n");
791 
792           size_lo = size_hi + 1;
793           if (size_hi > 0) {
794             size_hi *= 2;
795           } else {
796             size_hi = 1;
797 
798         }
799           if (size_hi > seqcount) {
800             size_hi = seqcount;
801 
802         }
803         }
804 
805       fprintf(fp_log, "----------  ----------  ----------  ----------\n");
806       fprintf(fp_log, "                      ");
807 
808       if (kmerindexsize >= 10000) {
809         fprintf(fp_log, "  %9.1fk", kmerindexsize * 0.001);
810       } else {
811         fprintf(fp_log, "  %10.1f", kmerindexsize * 1.0);
812 
813         }
814 
815       if (kmerhashsize >= 10000) {
816         fprintf(fp_log, "  %9.1fk", kmerhashsize * 0.001);
817       } else {
818         fprintf(fp_log, "  %10.1f", kmerhashsize * 1.0);
819 
820         }
821 
822       fprintf(fp_log, "\n\n");
823 
824       fprintf(fp_log, "%10" PRIu64 "  Upper\n", nt);
825       fprintf(fp_log, "%10u  Lower (%.1f%%)\n", 0, 0.0);
826       fprintf(fp_log, "%10" PRIu64 "  Total\n", nt);
827       fprintf(fp_log, "%10" PRIu64 "  Indexed words\n", kmerindexsize);
828   }
829 
830   xfree(freqtable);
831   dbindex_free();
832   db_free();
833 }
834 
udb_make()835 void udb_make()
836 {
837   int fd_output = 0;
838 
839   fd_output = xopen_write(opt_output);
840   if (!fd_output) {
841     fatal("Unable to open output file for writing");
842 
843         }
844 
845   db_read(opt_makeudb_usearch, 1);
846 
847   if (opt_dbmask == MASK_DUST) {
848     dust_all();
849   } else if ((opt_dbmask == MASK_SOFT) && (opt_hardmask)) {
850     hardmask_all();
851 
852         }
853 
854   dbindex_prepare(1, opt_dbmask);
855   dbindex_addallsequences(opt_dbmask);
856 
857   unsigned int seqcount = db_getsequencecount();
858   uint64_t ntcount = db_getnucleotidecount();
859 
860   uint64_t header_characters = 0;
861   for (unsigned int i=0; i<seqcount; i++) {
862     header_characters += db_getheaderlen(i) + 1;
863 
864         }
865 
866   uint64_t kmerhashsize = 1 << (2 * opt_wordlength);
867 
868   /* count word matches */
869   uint64_t wordmatches = 0;
870   for(unsigned int i = 0; i < kmerhashsize; i++) {
871     wordmatches += kmercount[i];
872 
873         }
874 
875   uint64_t pos = 0;
876   uint64_t progress_all =
877     4 * 50 +
878     4 * kmerhashsize +
879     4 * 1 +
880     4 * wordmatches +
881     4 * 8 +
882     4 * seqcount +
883     header_characters +
884     4 * seqcount +
885     ntcount;
886 
887   progress_init("Writing UDB file", progress_all);
888 
889   uint64_t buffersize = 4 * MAX(50, seqcount);
890   auto * buffer = (unsigned int *) xmalloc(buffersize);
891   memset(buffer, 0, buffersize);
892 
893   /* Header */
894   buffer[0]  = 0x55444246; /* FBDU UDBF */
895   buffer[2]  = 32; /* bits */
896   buffer[4]  = opt_wordlength; /* default 8 */
897   buffer[5]  = 1; /* dbstep */
898   buffer[6]  = 100; /* dbaccelpct % */
899   buffer[11] = 0; /* slots */
900   buffer[13] = (unsigned int) seqcount; /* number of sequences */
901   buffer[17] = 0x0000746e; /* alphabet: "nt" */
902   buffer[49] = 0x55444266; /* fBDU UDBf */
903   pos += largewrite(fd_output, buffer, 50 * 4, 0);
904 
905   /* write 4^wordlength uint32's with word match counts */
906   pos += largewrite(fd_output, kmercount, 4 * kmerhashsize, pos);
907 
908   /* 3BDU */
909   buffer[0] = 0x55444233; /* 3BDU UDB3 */
910   pos += largewrite(fd_output, buffer, 1 * 4, pos);
911 
912   /* lists of sequence no's with matches for all words */
913   for(unsigned int i = 0; i < kmerhashsize; i++)
914     {
915       if (kmerbitmap[i])
916         {
917           memset(buffer, 0, 4 * kmercount[i]);
918           unsigned int elements = 0;
919           for (unsigned int j = 0; j < seqcount; j++) {
920             if (bitmap_get(kmerbitmap[i], j)) {
921               buffer[elements++] = j;
922 
923         }
924 
925         }
926           pos += largewrite(fd_output, buffer, 4 * elements, pos);
927         }
928       else
929         {
930           if (kmercount[i] > 0)
931             {
932               pos += largewrite(fd_output,
933                                 kmerindex + kmerhash[i],
934                                 4 * kmercount[i],
935                                 pos);
936             }
937         }
938     }
939 
940   /* New header */
941   buffer[0] = 0x55444234; /* 4BDU UDB4 */
942   /* 0x005e0db3 */
943   buffer[1] = 0x005e0db3;
944   /* number of sequences, uint32 */
945   buffer[2] = (unsigned int) seqcount;
946   /* total number of nucleotides, uint64 */
947   buffer[3] = (unsigned int)(ntcount & 0xffffffff);
948   buffer[4] = (unsigned int)(ntcount >> 32);
949   /* total number of header characters, incl zero-terminator, uint64 */
950   buffer[5] = (unsigned int)(header_characters & 0xffffffff);
951   buffer[6] = (unsigned int)(header_characters >> 32);
952   /* 0x005e0db4 */
953   buffer[7] = 0x005e0db4;
954   pos += largewrite(fd_output, buffer, 4 * 8, pos);
955 
956   /* indices to headers (uint32) */
957   unsigned int sum = 0;
958   for (unsigned int i = 0; i < seqcount; i++)
959     {
960       buffer[i] = sum;
961       sum += db_getheaderlen(i) + 1;
962     }
963   pos += largewrite(fd_output, buffer, 4 * seqcount, pos);
964 
965   /* headers (ascii, zero terminated, not padded) */
966   for (unsigned int i = 0; i < seqcount; i++)
967     {
968       unsigned int len = db_getheaderlen(i);
969       pos += largewrite(fd_output, db_getheader(i), len + 1, pos);
970     }
971 
972   /* sequence lengths (uint32) */
973   for (unsigned int i = 0; i < seqcount; i++) {
974     buffer[i] = db_getsequencelen(i);
975 
976         }
977   pos += largewrite(fd_output, buffer, 4 * seqcount, pos);
978 
979   /* sequences (ascii, no term, no pad) */
980   for (unsigned int i = 0; i < seqcount; i++)
981     {
982       unsigned int len = db_getsequencelen(i);
983       pos += largewrite(fd_output, db_getsequence(i), len, pos);
984     }
985 
986   if (close(fd_output) != 0) {
987     fatal("Unable to close UDB file");
988 
989         }
990 
991   progress_done();
992   dbindex_free();
993   db_free();
994   xfree(buffer);
995 }
996