1 /*
2
3 VSEARCH: a versatile open source tool for metagenomics
4
5 Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6 All rights reserved.
7
8 Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9 Department of Informatics, University of Oslo,
10 PO Box 1080 Blindern, NO-0316 Oslo, Norway
11
12 This software is dual-licensed and available under a choice
13 of one of two licenses, either under the terms of the GNU
14 General Public License version 3 or the BSD 2-Clause License.
15
16
17 GNU General Public License version 3
18
19 This program is free software: you can redistribute it and/or modify
20 it under the terms of the GNU General Public License as published by
21 the Free Software Foundation, either version 3 of the License, or
22 (at your option) any later version.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License for more details.
28
29 You should have received a copy of the GNU General Public License
30 along with this program. If not, see <http://www.gnu.org/licenses/>.
31
32
33 The BSD 2-Clause License
34
35 Redistribution and use in source and binary forms, with or without
36 modification, are permitted provided that the following conditions
37 are met:
38
39 1. Redistributions of source code must retain the above copyright
40 notice, this list of conditions and the following disclaimer.
41
42 2. Redistributions in binary form must reproduce the above copyright
43 notice, this list of conditions and the following disclaimer in the
44 documentation and/or other materials provided with the distribution.
45
46 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50 COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57 POSSIBILITY OF SUCH DAMAGE.
58
59 */
60
61 /*
62
63 Implements the Sintax algorithm as desribed in Robert Edgar's preprint:
64
65 Robert Edgar (2016)
66 SINTAX: a simple non-Bayesian taxonomy classifier for 16S and ITS sequences
67 BioRxiv, 074161
68 doi: https://doi.org/10.1101/074161
69
70 Further details:
71
72 https://www.drive5.com/usearch/manual/cmd_sintax.html
73
74 */
75
76 #include "vsearch.h"
77
78 static struct searchinfo_s * si_plus;
79 static struct searchinfo_s * si_minus;
80 static pthread_t * pthread;
81
82 /* global constants/data, no need for synchronization */
83 static int tophits; /* the maximum number of hits to keep */
84 static int seqcount; /* number of database sequences */
85 static pthread_attr_t attr;
86 static fastx_handle query_fastx_h;
87
88 const int tax_levels = 8;
89 const char * tax_letters = "dkpcofgs";
90 const int subset_size = 32;
91 const int bootstrap_count = 100;
92
93 /* global data protected by mutex */
94 static pthread_mutex_t mutex_input;
95 static pthread_mutex_t mutex_output;
96 static FILE * fp_tabbedout;
97 static int queries = 0;
98 static int classified = 0;
99
sintax_parse_tax(const char * header,int header_length,int * tax_start,int * tax_end)100 bool sintax_parse_tax(const char * header,
101 int header_length,
102 int * tax_start,
103 int * tax_end)
104 {
105 /*
106 Identify the first occurence of the pattern (^|;)tax=([^;]*)(;|$)
107 */
108
109 if (! header) {
110 return false;
111
112 }
113
114 const char * attribute = "tax=";
115
116 int hlen = header_length;
117 int alen = strlen(attribute);
118
119 int i = 0;
120
121 while (i < hlen - alen)
122 {
123 char * r = (char *) strstr(header + i, attribute);
124
125 /* no match */
126 if (r == nullptr) {
127 break;
128
129 }
130
131 i = r - header;
132
133 /* check for ';' in front */
134 if ((i > 0) && (header[i-1] != ';'))
135 {
136 i += alen + 1;
137 continue;
138 }
139
140 * tax_start = i;
141
142 /* find end (semicolon or end of header) */
143 const char * s = strchr(header+i+alen, ';');
144 if (s == nullptr) {
145 * tax_end = hlen;
146 } else {
147 * tax_end = s - header;
148
149 }
150
151 return true;
152 }
153 return false;
154 }
155
sintax_split(int seqno,int * level_start,int * level_len)156 void sintax_split(int seqno, int * level_start, int * level_len)
157 {
158 /* Parse taxonomy string into the following parts
159 d domain
160 k kingdom
161 p phylum
162 c class
163 o order
164 f family
165 g genus
166 s species
167 */
168
169 for (int i = 0; i < tax_levels; i++)
170 {
171 level_start[i] = 0;
172 level_len[i] = 0;
173 }
174
175 int tax_start, tax_end;
176 char * h = db_getheader(seqno);
177 int hlen = db_getheaderlen(seqno);
178 if (sintax_parse_tax(h, hlen, & tax_start, & tax_end))
179 {
180 int t = tax_start + 4;
181
182 while (t < tax_end)
183 {
184 /* Is the next char a recogized tax level letter? */
185 const char * r = strchr(tax_letters, tolower(h[t]));
186 if (r)
187 {
188 int level = r - tax_letters;
189
190 /* Is there a colon after it? */
191 if (h[t + 1] == ':')
192 {
193 level_start[level] = t + 2;
194
195 char * z = strchr(h + t + 2, ',');
196 if (z) {
197 level_len[level] = z - h - t - 2;
198 } else {
199 level_len[level] = tax_end - t - 2;
200
201 }
202 }
203 }
204
205 /* skip past next comma */
206 char * x = strchr(h + t, ',');
207 if (x) {
208 t = x - h + 1;
209 } else {
210 t = tax_end;
211
212 }
213 }
214 }
215 }
216
sintax_analyse(char * query_head,int strand,int best_seqno,int best_count,int * all_seqno,int count)217 void sintax_analyse(char * query_head,
218 int strand,
219 int best_seqno,
220 int best_count,
221 int * all_seqno,
222 int count)
223 {
224 int best_level_start[tax_levels];
225 int best_level_len[tax_levels];
226 int level_match[tax_levels];
227
228 /* check number of successful bootstraps */
229 if (count >= (bootstrap_count+1) / 2)
230 {
231 char * best_h = db_getheader(best_seqno);
232
233 sintax_split(best_seqno, best_level_start, best_level_len);
234
235 for (int & j :
236 level_match) {
237 j = 0;
238
239 }
240
241 for (int i = 0; i < count; i++)
242 {
243 /* For each bootstrap experiment */
244
245 int level_start[tax_levels];
246 int level_len[tax_levels];
247 sintax_split(all_seqno[i], level_start, level_len);
248
249 char * h = db_getheader(all_seqno[i]);
250
251 for (int j = 0; j < tax_levels; j++)
252 {
253 /* For each taxonomic level */
254
255 if ((level_len[j] == best_level_len[j]) &&
256 (strncmp(best_h + best_level_start[j],
257 h + level_start[j],
258 level_len[j]) == 0))
259 {
260 level_match[j]++;
261 }
262 }
263 }
264 }
265
266 /* write to tabbedout file */
267 xpthread_mutex_lock(&mutex_output);
268 fprintf(fp_tabbedout, "%s\t", query_head);
269
270 queries++;
271
272 if (count >= bootstrap_count / 2)
273 {
274 char * best_h = db_getheader(best_seqno);
275
276 classified++;
277
278 bool comma = false;
279 for (int j = 0; j < tax_levels; j++)
280 {
281 if (best_level_len[j] > 0)
282 {
283 fprintf(fp_tabbedout,
284 "%s%c:%.*s(%.2f)",
285 (comma ? "," : ""),
286 tax_letters[j],
287 best_level_len[j],
288 best_h + best_level_start[j],
289 1.0 * level_match[j] / count);
290 comma = true;
291 }
292 }
293
294 fprintf(fp_tabbedout, "\t%c", strand ? '-' : '+');
295
296 if (opt_sintax_cutoff > 0.0)
297 {
298 fprintf(fp_tabbedout, "\t");
299 bool comma = false;
300 for (int j = 0; j < tax_levels; j++)
301 {
302 if ((best_level_len[j] > 0) &&
303 (1.0 * level_match[j] / count >= opt_sintax_cutoff))
304 {
305 fprintf(fp_tabbedout,
306 "%s%c:%.*s",
307 (comma ? "," : ""),
308 tax_letters[j],
309 best_level_len[j],
310 best_h + best_level_start[j]);
311 comma = true;
312 }
313 }
314 }
315 }
316 else
317 {
318 if (opt_sintax_cutoff > 0.0) {
319 fprintf(fp_tabbedout, "\t\t\t");
320 } else {
321 fprintf(fp_tabbedout, "\t\t");
322
323 }
324 }
325
326 #if 0
327 fprintf(fp_tabbedout, "\t%d\t%d", best_count, count);
328 #endif
329
330 fprintf(fp_tabbedout, "\n");
331 xpthread_mutex_unlock(&mutex_output);
332 }
333
sintax_query(int64_t t)334 void sintax_query(int64_t t)
335 {
336 int all_seqno[2][bootstrap_count];
337 int best_seqno[2] = {0, 0};
338 int boot_count[2] = {0, 0};
339 unsigned int best_count[2] = {0, 0};
340 int qseqlen = si_plus[t].qseqlen;
341 char * query_head = si_plus[t].query_head;
342
343 bitmap_t * b = bitmap_init(qseqlen);
344
345 for (int s = 0; s < opt_strand; s++)
346 {
347 struct searchinfo_s * si = s ? si_minus+t : si_plus+t;
348
349 /* perform search */
350
351 unsigned int kmersamplecount;
352 unsigned int * kmersample;
353
354 /* find unique kmers */
355 unique_count(si->uh, opt_wordlength,
356 si->qseqlen, si->qsequence,
357 & kmersamplecount, & kmersample, MASK_NONE);
358
359 /* perform 100 bootstraps */
360
361 if (kmersamplecount >= subset_size)
362 {
363 for (int i = 0; i < bootstrap_count ; i++)
364 {
365 /* subsample 32 kmers */
366 unsigned int kmersample_subset[subset_size];
367 int subsamples = 0;
368 bitmap_reset_all(b);
369 for(int j = 0; j < subset_size ; j++)
370 {
371 int64_t x = random_int(kmersamplecount);
372 if (! bitmap_get(b, x))
373 {
374 kmersample_subset[subsamples++] = kmersample[x];
375 bitmap_set(b, x);
376 }
377 }
378
379 si->kmersamplecount = subsamples;
380 si->kmersample = kmersample_subset;
381
382 search_topscores(si);
383
384 while(!minheap_isempty(si->m))
385 {
386 elem_t e = minheap_poplast(si->m);
387
388 all_seqno[s][boot_count[s]++] = e.seqno;
389
390 if (e.count > best_count[s])
391 {
392 best_count[s] = e.count;
393 best_seqno[s] = e.seqno;
394 }
395 }
396 }
397 }
398 }
399
400 int best_strand;
401
402 if (opt_strand == 1) {
403 best_strand = 0;
404 } else
405 {
406 if (best_count[0] > best_count[1]) {
407 best_strand = 0;
408 } else if (best_count[1] > best_count[0]) {
409 best_strand = 1;
410 } else
411 {
412 if (boot_count[0] >= boot_count[1]) {
413 best_strand = 0;
414 } else {
415 best_strand = 1;
416
417 }
418 }
419 }
420
421 sintax_analyse(query_head,
422 best_strand,
423 best_seqno[best_strand],
424 best_count[best_strand],
425 all_seqno[best_strand],
426 boot_count[best_strand]);
427
428 bitmap_free(b);
429 }
430
sintax_thread_run(int64_t t)431 void sintax_thread_run(int64_t t)
432 {
433 while (true)
434 {
435 xpthread_mutex_lock(&mutex_input);
436
437 if (fastx_next(query_fastx_h,
438 ! opt_notrunclabels,
439 chrmap_no_change))
440 {
441 char * qhead = fastx_get_header(query_fastx_h);
442 int query_head_len = fastx_get_header_length(query_fastx_h);
443 char * qseq = fastx_get_sequence(query_fastx_h);
444 int qseqlen = fastx_get_sequence_length(query_fastx_h);
445 int query_no = fastx_get_seqno(query_fastx_h);
446 int qsize = fastx_get_abundance(query_fastx_h);
447
448 for (int s = 0; s < opt_strand; s++)
449 {
450 struct searchinfo_s * si = s ? si_minus+t : si_plus+t;
451
452 si->query_head_len = query_head_len;
453 si->qseqlen = qseqlen;
454 si->query_no = query_no;
455 si->qsize = qsize;
456 si->strand = s;
457
458 /* allocate more memory for header and sequence, if necessary */
459
460 if (si->query_head_len + 1 > si->query_head_alloc)
461 {
462 si->query_head_alloc = si->query_head_len + 2001;
463 si->query_head = (char*)
464 xrealloc(si->query_head, (size_t)(si->query_head_alloc));
465 }
466
467 if (si->qseqlen + 1 > si->seq_alloc)
468 {
469 si->seq_alloc = si->qseqlen + 2001;
470 si->qsequence = (char*)
471 xrealloc(si->qsequence, (size_t)(si->seq_alloc));
472 }
473 }
474
475 /* plus strand: copy header and sequence */
476 strcpy(si_plus[t].query_head, qhead);
477 strcpy(si_plus[t].qsequence, qseq);
478
479 /* get progress as amount of input file read */
480 uint64_t progress = fastx_get_position(query_fastx_h);
481
482 /* let other threads read input */
483 xpthread_mutex_unlock(&mutex_input);
484
485 /* minus strand: copy header and reverse complementary sequence */
486 if (opt_strand > 1)
487 {
488 strcpy(si_minus[t].query_head, si_plus[t].query_head);
489 reverse_complement(si_minus[t].qsequence,
490 si_plus[t].qsequence,
491 si_plus[t].qseqlen);
492 }
493
494 sintax_query(t);
495
496 /* lock mutex for update of global data and output */
497 xpthread_mutex_lock(&mutex_output);
498
499 /* show progress */
500 progress_update(progress);
501
502 xpthread_mutex_unlock(&mutex_output);
503 }
504 else
505 {
506 xpthread_mutex_unlock(&mutex_input);
507 break;
508 }
509 }
510 }
511
sintax_thread_init(struct searchinfo_s * si)512 void sintax_thread_init(struct searchinfo_s * si)
513 {
514 /* thread specific initialiation */
515 si->uh = unique_init();
516 si->kmers = (count_t *) xmalloc(seqcount * sizeof(count_t) + 32);
517 si->m = minheap_init(tophits);
518 si->hits = nullptr;
519 si->qsize = 1;
520 si->query_head_alloc = 0;
521 si->query_head = nullptr;
522 si->seq_alloc = 0;
523 si->qsequence = nullptr;
524 si->nw = nullptr;
525 si->s = nullptr;
526 }
527
sintax_thread_exit(struct searchinfo_s * si)528 void sintax_thread_exit(struct searchinfo_s * si)
529 {
530 /* thread specific clean up */
531 unique_exit(si->uh);
532 minheap_exit(si->m);
533 xfree(si->kmers);
534 if (si->query_head) {
535 xfree(si->query_head);
536
537 }
538 if (si->qsequence) {
539 xfree(si->qsequence);
540
541 }
542 }
543
sintax_thread_worker(void * vp)544 void * sintax_thread_worker(void * vp)
545 {
546 auto t = (int64_t) vp;
547 sintax_thread_run(t);
548 return nullptr;
549 }
550
sintax_thread_worker_run()551 void sintax_thread_worker_run()
552 {
553 /* initialize threads, start them, join them and return */
554
555 xpthread_attr_init(&attr);
556 xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
557
558 /* init and create worker threads, put them into stand-by mode */
559 for(int t=0; t<opt_threads; t++)
560 {
561 sintax_thread_init(si_plus+t);
562 if (si_minus) {
563 sintax_thread_init(si_minus+t);
564
565 }
566 xpthread_create(pthread+t, &attr,
567 sintax_thread_worker, (void*)(int64_t)t);
568 }
569
570 /* finish and clean up worker threads */
571 for(int t=0; t<opt_threads; t++)
572 {
573 xpthread_join(pthread[t], nullptr);
574 sintax_thread_exit(si_plus+t);
575 if (si_minus) {
576 sintax_thread_exit(si_minus+t);
577
578 }
579 }
580
581 xpthread_attr_destroy(&attr);
582 }
583
sintax()584 void sintax()
585 {
586 /* tophits = the maximum number of hits we need to store */
587
588 tophits = 1;
589
590 /* open output files */
591
592 if (! opt_db) {
593 fatal("No database file specified with --db");
594
595 }
596
597 if (opt_tabbedout)
598 {
599 fp_tabbedout = fopen_output(opt_tabbedout);
600 if (! fp_tabbedout) {
601 fatal("Unable to open tabbedout output file for writing");
602
603 }
604 }
605 else {
606 fatal("No output file specified with --tabbedout");
607
608 }
609
610 /* check if db may be an UDB file */
611
612 bool is_udb = udb_detect_isudb(opt_db);
613
614 if (is_udb) {
615 udb_read(opt_db, true, true);
616 } else {
617 db_read(opt_db, 0);
618
619 }
620
621 seqcount = db_getsequencecount();
622
623 if (!is_udb)
624 {
625 dbindex_prepare(1, opt_dbmask);
626 dbindex_addallsequences(opt_dbmask);
627 }
628
629 /* prepare reading of queries */
630
631 query_fastx_h = fastx_open(opt_sintax);
632
633 /* allocate memory for thread info */
634
635 si_plus = (struct searchinfo_s *) xmalloc(opt_threads *
636 sizeof(struct searchinfo_s));
637 if (opt_strand > 1) {
638 si_minus = (struct searchinfo_s *) xmalloc(opt_threads *
639 sizeof(struct searchinfo_s));
640 } else {
641 si_minus = nullptr;
642
643 }
644
645 pthread = (pthread_t *) xmalloc(opt_threads * sizeof(pthread_t));
646
647 /* init mutexes for input and output */
648 xpthread_mutex_init(&mutex_input, nullptr);
649 xpthread_mutex_init(&mutex_output, nullptr);
650
651 /* run */
652
653 progress_init("Classifying sequences", fastx_get_size(query_fastx_h));
654 sintax_thread_worker_run();
655 progress_done();
656
657 if (! opt_quiet)
658 {
659 fprintf(stderr, "Classified %d of %d sequences", classified, queries);
660 if (queries > 0) {
661 fprintf(stderr, " (%.2f%%)", 100.0 * classified / queries);
662
663 }
664 fprintf(stderr, "\n");
665 }
666
667 if (opt_log)
668 {
669 fprintf(fp_log, "Classified %d of %d sequences", classified, queries);
670 if (queries > 0) {
671 fprintf(fp_log, " (%.2f%%)", 100.0 * classified / queries);
672
673 }
674 fprintf(fp_log, "\n");
675 }
676
677 /* clean up */
678
679 xpthread_mutex_destroy(&mutex_output);
680 xpthread_mutex_destroy(&mutex_input);
681
682 xfree(pthread);
683 xfree(si_plus);
684 if (si_minus) {
685 xfree(si_minus);
686
687 }
688
689 fastx_close(query_fastx_h);
690 fclose(fp_tabbedout);
691
692 dbindex_free();
693 db_free();
694 }
695