1 /*
2 
3   VSEARCH: a versatile open source tool for metagenomics
4 
5   Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6   All rights reserved.
7 
8   Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9   Department of Informatics, University of Oslo,
10   PO Box 1080 Blindern, NO-0316 Oslo, Norway
11 
12   This software is dual-licensed and available under a choice
13   of one of two licenses, either under the terms of the GNU
14   General Public License version 3 or the BSD 2-Clause License.
15 
16 
17   GNU General Public License version 3
18 
19   This program is free software: you can redistribute it and/or modify
20   it under the terms of the GNU General Public License as published by
21   the Free Software Foundation, either version 3 of the License, or
22   (at your option) any later version.
23 
24   This program is distributed in the hope that it will be useful,
25   but WITHOUT ANY WARRANTY; without even the implied warranty of
26   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27   GNU General Public License for more details.
28 
29   You should have received a copy of the GNU General Public License
30   along with this program.  If not, see <http://www.gnu.org/licenses/>.
31 
32 
33   The BSD 2-Clause License
34 
35   Redistribution and use in source and binary forms, with or without
36   modification, are permitted provided that the following conditions
37   are met:
38 
39   1. Redistributions of source code must retain the above copyright
40   notice, this list of conditions and the following disclaimer.
41 
42   2. Redistributions in binary form must reproduce the above copyright
43   notice, this list of conditions and the following disclaimer in the
44   documentation and/or other materials provided with the distribution.
45 
46   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50   COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51   INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52   BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57   POSSIBILITY OF SUCH DAMAGE.
58 
59 */
60 
61 /*
62 
63   Implements the Sintax algorithm as desribed in Robert Edgar's preprint:
64 
65   Robert Edgar (2016)
66   SINTAX: a simple non-Bayesian taxonomy classifier for 16S and ITS sequences
67   BioRxiv, 074161
68   doi: https://doi.org/10.1101/074161
69 
70   Further details:
71 
72   https://www.drive5.com/usearch/manual/cmd_sintax.html
73 
74 */
75 
76 #include "vsearch.h"
77 
78 static struct searchinfo_s * si_plus;
79 static struct searchinfo_s * si_minus;
80 static pthread_t * pthread;
81 
82 /* global constants/data, no need for synchronization */
83 static int tophits; /* the maximum number of hits to keep */
84 static int seqcount; /* number of database sequences */
85 static pthread_attr_t attr;
86 static fastx_handle query_fastx_h;
87 
88 const int tax_levels = 8;
89 const char * tax_letters = "dkpcofgs";
90 const int subset_size = 32;
91 const int bootstrap_count = 100;
92 
93 /* global data protected by mutex */
94 static pthread_mutex_t mutex_input;
95 static pthread_mutex_t mutex_output;
96 static FILE * fp_tabbedout;
97 static int queries = 0;
98 static int classified = 0;
99 
sintax_parse_tax(const char * header,int header_length,int * tax_start,int * tax_end)100 bool sintax_parse_tax(const char * header,
101                       int header_length,
102                       int * tax_start,
103                       int * tax_end)
104 {
105   /*
106     Identify the first occurence of the pattern (^|;)tax=([^;]*)(;|$)
107   */
108 
109   if (! header) {
110     return false;
111 
112         }
113 
114   const char * attribute = "tax=";
115 
116   int hlen = header_length;
117   int alen = strlen(attribute);
118 
119   int i = 0;
120 
121   while (i < hlen - alen)
122     {
123       char * r = (char *) strstr(header + i, attribute);
124 
125       /* no match */
126       if (r == nullptr) {
127         break;
128 
129         }
130 
131       i = r - header;
132 
133       /* check for ';' in front */
134       if ((i > 0) && (header[i-1] != ';'))
135         {
136           i += alen + 1;
137           continue;
138         }
139 
140       * tax_start = i;
141 
142       /* find end (semicolon or end of header) */
143       const char * s = strchr(header+i+alen, ';');
144       if (s == nullptr) {
145         * tax_end = hlen;
146       } else {
147         * tax_end = s - header;
148 
149         }
150 
151       return true;
152     }
153   return false;
154 }
155 
sintax_split(int seqno,int * level_start,int * level_len)156 void sintax_split(int seqno, int * level_start, int * level_len)
157 {
158   /* Parse taxonomy string into the following parts
159      d domain
160      k kingdom
161      p phylum
162      c class
163      o order
164      f family
165      g genus
166      s species
167   */
168 
169   for (int i = 0; i < tax_levels; i++)
170     {
171       level_start[i] = 0;
172       level_len[i] = 0;
173     }
174 
175   int tax_start, tax_end;
176   char * h = db_getheader(seqno);
177   int hlen = db_getheaderlen(seqno);
178   if (sintax_parse_tax(h, hlen, & tax_start, & tax_end))
179     {
180       int t = tax_start + 4;
181 
182       while (t < tax_end)
183         {
184           /* Is the next char a recogized tax level letter? */
185           const char * r = strchr(tax_letters, tolower(h[t]));
186           if (r)
187             {
188               int level = r - tax_letters;
189 
190               /* Is there a colon after it? */
191               if (h[t + 1] == ':')
192                 {
193                   level_start[level] = t + 2;
194 
195                   char * z = strchr(h + t + 2, ',');
196                   if (z) {
197                     level_len[level] = z - h - t - 2;
198                   } else {
199                     level_len[level] = tax_end - t - 2;
200 
201         }
202                 }
203             }
204 
205           /* skip past next comma */
206           char * x = strchr(h + t, ',');
207           if (x) {
208             t = x - h + 1;
209           } else {
210             t = tax_end;
211 
212         }
213         }
214     }
215 }
216 
sintax_analyse(char * query_head,int strand,int best_seqno,int best_count,int * all_seqno,int count)217 void sintax_analyse(char * query_head,
218                     int strand,
219                     int best_seqno,
220                     int best_count,
221                     int * all_seqno,
222                     int count)
223 {
224   int best_level_start[tax_levels];
225   int best_level_len[tax_levels];
226   int level_match[tax_levels];
227 
228   /* check number of successful bootstraps */
229   if (count >= (bootstrap_count+1) / 2)
230     {
231       char * best_h = db_getheader(best_seqno);
232 
233       sintax_split(best_seqno, best_level_start, best_level_len);
234 
235       for (int & j :
236         level_match) {
237         j = 0;
238 
239         }
240 
241       for (int i = 0; i < count; i++)
242         {
243           /* For each bootstrap experiment */
244 
245           int level_start[tax_levels];
246           int level_len[tax_levels];
247           sintax_split(all_seqno[i], level_start, level_len);
248 
249           char * h = db_getheader(all_seqno[i]);
250 
251           for (int j = 0; j < tax_levels; j++)
252             {
253               /* For each taxonomic level */
254 
255               if ((level_len[j] == best_level_len[j]) &&
256                   (strncmp(best_h + best_level_start[j],
257                            h + level_start[j],
258                            level_len[j]) == 0))
259                 {
260                   level_match[j]++;
261                 }
262             }
263         }
264     }
265 
266   /* write to tabbedout file */
267   xpthread_mutex_lock(&mutex_output);
268   fprintf(fp_tabbedout, "%s\t", query_head);
269 
270   queries++;
271 
272   if (count >= bootstrap_count / 2)
273     {
274       char * best_h = db_getheader(best_seqno);
275 
276       classified++;
277 
278       bool comma = false;
279       for (int j = 0; j < tax_levels; j++)
280         {
281           if (best_level_len[j] > 0)
282             {
283               fprintf(fp_tabbedout,
284                       "%s%c:%.*s(%.2f)",
285                       (comma ? "," : ""),
286                       tax_letters[j],
287                       best_level_len[j],
288                       best_h + best_level_start[j],
289                       1.0 * level_match[j] / count);
290               comma = true;
291             }
292         }
293 
294       fprintf(fp_tabbedout, "\t%c", strand ? '-' : '+');
295 
296       if (opt_sintax_cutoff > 0.0)
297         {
298           fprintf(fp_tabbedout, "\t");
299           bool comma = false;
300           for (int j = 0; j < tax_levels; j++)
301             {
302               if ((best_level_len[j] > 0) &&
303                   (1.0 * level_match[j] / count >= opt_sintax_cutoff))
304                 {
305                   fprintf(fp_tabbedout,
306                           "%s%c:%.*s",
307                           (comma ? "," : ""),
308                           tax_letters[j],
309                           best_level_len[j],
310                           best_h + best_level_start[j]);
311                   comma = true;
312                 }
313             }
314         }
315     }
316   else
317     {
318       if (opt_sintax_cutoff > 0.0) {
319         fprintf(fp_tabbedout, "\t\t\t");
320       } else {
321         fprintf(fp_tabbedout, "\t\t");
322 
323         }
324     }
325 
326 #if 0
327   fprintf(fp_tabbedout, "\t%d\t%d", best_count, count);
328 #endif
329 
330   fprintf(fp_tabbedout, "\n");
331   xpthread_mutex_unlock(&mutex_output);
332 }
333 
sintax_query(int64_t t)334 void sintax_query(int64_t t)
335 {
336   int all_seqno[2][bootstrap_count];
337   int best_seqno[2] = {0, 0};
338   int boot_count[2] = {0, 0};
339   unsigned int best_count[2] = {0, 0};
340   int qseqlen = si_plus[t].qseqlen;
341   char * query_head = si_plus[t].query_head;
342 
343   bitmap_t * b = bitmap_init(qseqlen);
344 
345   for (int s = 0; s < opt_strand; s++)
346     {
347       struct searchinfo_s * si = s ? si_minus+t : si_plus+t;
348 
349       /* perform search */
350 
351       unsigned int kmersamplecount;
352       unsigned int * kmersample;
353 
354       /* find unique kmers */
355       unique_count(si->uh, opt_wordlength,
356                    si->qseqlen, si->qsequence,
357                    & kmersamplecount, & kmersample, MASK_NONE);
358 
359       /* perform 100 bootstraps */
360 
361       if (kmersamplecount >= subset_size)
362         {
363           for (int i = 0; i < bootstrap_count ; i++)
364             {
365               /* subsample 32 kmers */
366               unsigned int kmersample_subset[subset_size];
367               int subsamples = 0;
368               bitmap_reset_all(b);
369               for(int j = 0; j < subset_size ; j++)
370                 {
371                   int64_t x = random_int(kmersamplecount);
372                   if (! bitmap_get(b, x))
373                     {
374                       kmersample_subset[subsamples++] = kmersample[x];
375                       bitmap_set(b, x);
376                     }
377                 }
378 
379               si->kmersamplecount = subsamples;
380               si->kmersample = kmersample_subset;
381 
382               search_topscores(si);
383 
384               while(!minheap_isempty(si->m))
385                 {
386                   elem_t e = minheap_poplast(si->m);
387 
388                   all_seqno[s][boot_count[s]++] = e.seqno;
389 
390                   if (e.count > best_count[s])
391                     {
392                       best_count[s] = e.count;
393                       best_seqno[s] = e.seqno;
394                     }
395                 }
396             }
397         }
398     }
399 
400   int best_strand;
401 
402   if (opt_strand == 1) {
403     best_strand = 0;
404   } else
405     {
406       if (best_count[0] > best_count[1]) {
407         best_strand = 0;
408       } else if (best_count[1] > best_count[0]) {
409         best_strand = 1;
410       } else
411         {
412           if (boot_count[0] >= boot_count[1]) {
413             best_strand = 0;
414           } else {
415             best_strand = 1;
416 
417         }
418         }
419     }
420 
421   sintax_analyse(query_head,
422                  best_strand,
423                  best_seqno[best_strand],
424                  best_count[best_strand],
425                  all_seqno[best_strand],
426                  boot_count[best_strand]);
427 
428   bitmap_free(b);
429 }
430 
sintax_thread_run(int64_t t)431 void sintax_thread_run(int64_t t)
432 {
433   while (true)
434     {
435       xpthread_mutex_lock(&mutex_input);
436 
437       if (fastx_next(query_fastx_h,
438                      ! opt_notrunclabels,
439                      chrmap_no_change))
440         {
441           char * qhead = fastx_get_header(query_fastx_h);
442           int query_head_len = fastx_get_header_length(query_fastx_h);
443           char * qseq = fastx_get_sequence(query_fastx_h);
444           int qseqlen = fastx_get_sequence_length(query_fastx_h);
445           int query_no = fastx_get_seqno(query_fastx_h);
446           int qsize = fastx_get_abundance(query_fastx_h);
447 
448           for (int s = 0; s < opt_strand; s++)
449             {
450               struct searchinfo_s * si = s ? si_minus+t : si_plus+t;
451 
452               si->query_head_len = query_head_len;
453               si->qseqlen = qseqlen;
454               si->query_no = query_no;
455               si->qsize = qsize;
456               si->strand = s;
457 
458               /* allocate more memory for header and sequence, if necessary */
459 
460               if (si->query_head_len + 1 > si->query_head_alloc)
461                 {
462                   si->query_head_alloc = si->query_head_len + 2001;
463                   si->query_head = (char*)
464                     xrealloc(si->query_head, (size_t)(si->query_head_alloc));
465                 }
466 
467               if (si->qseqlen + 1 > si->seq_alloc)
468                 {
469                   si->seq_alloc = si->qseqlen + 2001;
470                   si->qsequence = (char*)
471                     xrealloc(si->qsequence, (size_t)(si->seq_alloc));
472                 }
473             }
474 
475           /* plus strand: copy header and sequence */
476           strcpy(si_plus[t].query_head, qhead);
477           strcpy(si_plus[t].qsequence, qseq);
478 
479           /* get progress as amount of input file read */
480           uint64_t progress = fastx_get_position(query_fastx_h);
481 
482           /* let other threads read input */
483           xpthread_mutex_unlock(&mutex_input);
484 
485           /* minus strand: copy header and reverse complementary sequence */
486           if (opt_strand > 1)
487             {
488               strcpy(si_minus[t].query_head, si_plus[t].query_head);
489               reverse_complement(si_minus[t].qsequence,
490                                  si_plus[t].qsequence,
491                                  si_plus[t].qseqlen);
492             }
493 
494           sintax_query(t);
495 
496           /* lock mutex for update of global data and output */
497           xpthread_mutex_lock(&mutex_output);
498 
499           /* show progress */
500           progress_update(progress);
501 
502           xpthread_mutex_unlock(&mutex_output);
503         }
504       else
505         {
506           xpthread_mutex_unlock(&mutex_input);
507           break;
508         }
509     }
510 }
511 
sintax_thread_init(struct searchinfo_s * si)512 void sintax_thread_init(struct searchinfo_s * si)
513 {
514   /* thread specific initialiation */
515   si->uh = unique_init();
516   si->kmers = (count_t *) xmalloc(seqcount * sizeof(count_t) + 32);
517   si->m = minheap_init(tophits);
518   si->hits = nullptr;
519   si->qsize = 1;
520   si->query_head_alloc = 0;
521   si->query_head = nullptr;
522   si->seq_alloc = 0;
523   si->qsequence = nullptr;
524   si->nw = nullptr;
525   si->s = nullptr;
526 }
527 
sintax_thread_exit(struct searchinfo_s * si)528 void sintax_thread_exit(struct searchinfo_s * si)
529 {
530   /* thread specific clean up */
531   unique_exit(si->uh);
532   minheap_exit(si->m);
533   xfree(si->kmers);
534   if (si->query_head) {
535     xfree(si->query_head);
536 
537         }
538   if (si->qsequence) {
539     xfree(si->qsequence);
540 
541         }
542 }
543 
sintax_thread_worker(void * vp)544 void * sintax_thread_worker(void * vp)
545 {
546   auto t = (int64_t) vp;
547   sintax_thread_run(t);
548   return nullptr;
549 }
550 
sintax_thread_worker_run()551 void sintax_thread_worker_run()
552 {
553   /* initialize threads, start them, join them and return */
554 
555   xpthread_attr_init(&attr);
556   xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
557 
558   /* init and create worker threads, put them into stand-by mode */
559   for(int t=0; t<opt_threads; t++)
560     {
561       sintax_thread_init(si_plus+t);
562       if (si_minus) {
563         sintax_thread_init(si_minus+t);
564 
565         }
566       xpthread_create(pthread+t, &attr,
567                       sintax_thread_worker, (void*)(int64_t)t);
568     }
569 
570   /* finish and clean up worker threads */
571   for(int t=0; t<opt_threads; t++)
572     {
573       xpthread_join(pthread[t], nullptr);
574       sintax_thread_exit(si_plus+t);
575       if (si_minus) {
576         sintax_thread_exit(si_minus+t);
577 
578         }
579     }
580 
581   xpthread_attr_destroy(&attr);
582 }
583 
sintax()584 void sintax()
585 {
586   /* tophits = the maximum number of hits we need to store */
587 
588   tophits = 1;
589 
590   /* open output files */
591 
592   if (! opt_db) {
593     fatal("No database file specified with --db");
594 
595         }
596 
597   if (opt_tabbedout)
598     {
599       fp_tabbedout = fopen_output(opt_tabbedout);
600       if (! fp_tabbedout) {
601         fatal("Unable to open tabbedout output file for writing");
602 
603         }
604     }
605   else {
606     fatal("No output file specified with --tabbedout");
607 
608         }
609 
610   /* check if db may be an UDB file */
611 
612   bool is_udb = udb_detect_isudb(opt_db);
613 
614   if (is_udb) {
615     udb_read(opt_db, true, true);
616   } else {
617     db_read(opt_db, 0);
618 
619         }
620 
621   seqcount = db_getsequencecount();
622 
623   if (!is_udb)
624     {
625       dbindex_prepare(1, opt_dbmask);
626       dbindex_addallsequences(opt_dbmask);
627     }
628 
629   /* prepare reading of queries */
630 
631   query_fastx_h = fastx_open(opt_sintax);
632 
633   /* allocate memory for thread info */
634 
635   si_plus = (struct searchinfo_s *) xmalloc(opt_threads *
636                                             sizeof(struct searchinfo_s));
637   if (opt_strand > 1) {
638     si_minus = (struct searchinfo_s *) xmalloc(opt_threads *
639                                                sizeof(struct searchinfo_s));
640   } else {
641     si_minus = nullptr;
642 
643         }
644 
645   pthread = (pthread_t *) xmalloc(opt_threads * sizeof(pthread_t));
646 
647   /* init mutexes for input and output */
648   xpthread_mutex_init(&mutex_input, nullptr);
649   xpthread_mutex_init(&mutex_output, nullptr);
650 
651   /* run */
652 
653   progress_init("Classifying sequences", fastx_get_size(query_fastx_h));
654   sintax_thread_worker_run();
655   progress_done();
656 
657   if (! opt_quiet)
658     {
659       fprintf(stderr, "Classified %d of %d sequences", classified, queries);
660       if (queries > 0) {
661         fprintf(stderr, " (%.2f%%)", 100.0 * classified / queries);
662 
663         }
664       fprintf(stderr, "\n");
665     }
666 
667   if (opt_log)
668     {
669       fprintf(fp_log, "Classified %d of %d sequences", classified, queries);
670       if (queries > 0) {
671         fprintf(fp_log, " (%.2f%%)", 100.0 * classified / queries);
672 
673         }
674       fprintf(fp_log, "\n");
675     }
676 
677   /* clean up */
678 
679   xpthread_mutex_destroy(&mutex_output);
680   xpthread_mutex_destroy(&mutex_input);
681 
682   xfree(pthread);
683   xfree(si_plus);
684   if (si_minus) {
685     xfree(si_minus);
686 
687         }
688 
689   fastx_close(query_fastx_h);
690   fclose(fp_tabbedout);
691 
692   dbindex_free();
693   db_free();
694 }
695