1 /*
2 
3   VSEARCH: a versatile open source tool for metagenomics
4 
5   Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6   All rights reserved.
7 
8   Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9   Department of Informatics, University of Oslo,
10   PO Box 1080 Blindern, NO-0316 Oslo, Norway
11 
12   This software is dual-licensed and available under a choice
13   of one of two licenses, either under the terms of the GNU
14   General Public License version 3 or the BSD 2-Clause License.
15 
16 
17   GNU General Public License version 3
18 
19   This program is free software: you can redistribute it and/or modify
20   it under the terms of the GNU General Public License as published by
21   the Free Software Foundation, either version 3 of the License, or
22   (at your option) any later version.
23 
24   This program is distributed in the hope that it will be useful,
25   but WITHOUT ANY WARRANTY; without even the implied warranty of
26   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27   GNU General Public License for more details.
28 
29   You should have received a copy of the GNU General Public License
30   along with this program.  If not, see <http://www.gnu.org/licenses/>.
31 
32 
33   The BSD 2-Clause License
34 
35   Redistribution and use in source and binary forms, with or without
36   modification, are permitted provided that the following conditions
37   are met:
38 
39   1. Redistributions of source code must retain the above copyright
40   notice, this list of conditions and the following disclaimer.
41 
42   2. Redistributions in binary form must reproduce the above copyright
43   notice, this list of conditions and the following disclaimer in the
44   documentation and/or other materials provided with the distribution.
45 
46   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50   COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51   INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52   BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57   POSSIBILITY OF SUCH DAMAGE.
58 
59 */
60 
61 #include "vsearch.h"
62 
63 static pthread_t * pthread;
64 
65 /* global constants/data, no need for synchronization */
66 static int seqcount; /* number of database sequences */
67 static pthread_attr_t attr;
68 
69 /* global data protected by mutex */
70 static pthread_mutex_t mutex_input;
71 static pthread_mutex_t mutex_output;
72 static int qmatches;
73 static int queries;
74 static int64_t progress = 0;
75 static FILE * fp_alnout = nullptr;
76 static FILE * fp_samout = nullptr;
77 static FILE * fp_userout = nullptr;
78 static FILE * fp_blast6out = nullptr;
79 static FILE * fp_uc = nullptr;
80 static FILE * fp_fastapairs = nullptr;
81 static FILE * fp_matched = nullptr;
82 static FILE * fp_notmatched = nullptr;
83 
84 static int count_matched = 0;
85 static int count_notmatched = 0;
86 
allpairs_hit_compare_typed(struct hit * x,struct hit * y)87 inline int allpairs_hit_compare_typed(struct hit * x, struct hit * y)
88 {
89   // high id, then low id
90   // early target, then late target
91 
92   if (x->id > y->id) {
93     return -1;
94   } else
95     if (x->id < y->id) {
96       return +1;
97     } else
98       if (x->target < y->target) {
99         return -1;
100       } else
101         if (x->target > y->target) {
102           return +1;
103         } else {
104           return 0;
105 
106         }
107 }
108 
allpairs_hit_compare(const void * a,const void * b)109 int allpairs_hit_compare(const void * a, const void * b)
110 {
111   return allpairs_hit_compare_typed((struct hit *) a, (struct hit *) b);
112 }
113 
allpairs_output_results(int hit_count,struct hit * hits,char * query_head,int qseqlen,char * qsequence,char * qsequence_rc)114 void allpairs_output_results(int hit_count,
115                              struct hit * hits,
116                              char * query_head,
117                              int qseqlen,
118                              char * qsequence,
119                              char * qsequence_rc)
120 {
121   /* show results */
122   int64_t toreport = MIN(opt_maxhits, hit_count);
123 
124   if (fp_alnout) {
125     results_show_alnout(fp_alnout,
126                         hits,
127                         toreport,
128                         query_head,
129                         qsequence,
130                         qseqlen,
131                         qsequence_rc);
132 
133         }
134 
135   if (fp_samout) {
136     results_show_samout(fp_samout,
137                         hits,
138                         toreport,
139                         query_head,
140                         qsequence,
141                         qseqlen,
142                         qsequence_rc);
143 
144         }
145 
146   if (toreport)
147     {
148       double top_hit_id = hits[0].id;
149 
150       for(int t = 0; t < toreport; t++)
151         {
152           struct hit * hp = hits + t;
153 
154           if (opt_top_hits_only && (hp->id < top_hit_id)) {
155             break;
156 
157         }
158 
159           if (fp_fastapairs) {
160             results_show_fastapairs_one(fp_fastapairs,
161                                         hp,
162                                         query_head,
163                                         qsequence,
164                                         qseqlen,
165                                         qsequence_rc);
166 
167         }
168 
169           if (fp_uc) {
170             if ((t==0) || opt_uc_allhits) {
171               results_show_uc_one(fp_uc,
172                                   hp,
173                                   query_head,
174                                   qsequence,
175                                   qseqlen,
176                                   qsequence_rc,
177                                   hp->target);
178 
179         }
180 
181         }
182 
183           if (fp_userout) {
184             results_show_userout_one(fp_userout,
185                                      hp,
186                                      query_head,
187                                      qsequence,
188                                      qseqlen,
189                                      qsequence_rc);
190 
191         }
192 
193           if (fp_blast6out) {
194             results_show_blast6out_one(fp_blast6out,
195                                        hp,
196                                        query_head,
197                                        qsequence,
198                                        qseqlen,
199                                        qsequence_rc);
200 
201         }
202         }
203     }
204   else
205     {
206       if (fp_uc) {
207         results_show_uc_one(fp_uc,
208                             nullptr,
209                             query_head,
210                             qsequence,
211                             qseqlen,
212                             qsequence_rc,
213                             0);
214 
215         }
216 
217       if (opt_output_no_hits)
218         {
219           if (fp_userout) {
220             results_show_userout_one(fp_userout,
221                                      nullptr,
222                                      query_head,
223                                      qsequence,
224                                      qseqlen,
225                                      qsequence_rc);
226 
227         }
228 
229           if (fp_blast6out) {
230             results_show_blast6out_one(fp_blast6out,
231                                        nullptr,
232                                        query_head,
233                                        qsequence,
234                                        qseqlen,
235                                        qsequence_rc);
236 
237         }
238         }
239     }
240 
241   if (hit_count)
242     {
243       count_matched++;
244       if (opt_matched) {
245         fasta_print_general(fp_matched,
246                             nullptr,
247                             qsequence,
248                             qseqlen,
249                             query_head,
250                             strlen(query_head),
251                             0,
252                             count_matched,
253                             -1.0,
254                             -1, -1, nullptr, 0.0);
255 
256         }
257     }
258   else
259     {
260       count_notmatched++;
261       if (opt_notmatched) {
262         fasta_print_general(fp_notmatched,
263                             nullptr,
264                             qsequence,
265                             qseqlen,
266                             query_head,
267                             strlen(query_head),
268                             0,
269                             count_notmatched,
270                             -1.0,
271                             -1, -1, nullptr, 0.0);
272 
273         }
274     }
275 }
276 
allpairs_thread_run(int64_t t)277 void allpairs_thread_run(int64_t t)
278 {
279   (void) t;
280 
281   struct searchinfo_s sia;
282 
283   struct searchinfo_s * si = & sia;
284 
285   si->strand = 0;
286   si->query_head_alloc = 0;
287   si->seq_alloc = 0;
288   si->kmersamplecount = 0;
289   si->kmers = nullptr;
290   si->m = nullptr;
291   si->finalized = 0;
292 
293   si->hits = (struct hit *) xmalloc(sizeof(struct hit) * seqcount);
294 
295   struct nwinfo_s * nw = nw_init();
296 
297   si->s = search16_init(opt_match,
298                         opt_mismatch,
299                         opt_gap_open_query_left,
300                         opt_gap_open_target_left,
301                         opt_gap_open_query_interior,
302                         opt_gap_open_target_interior,
303                         opt_gap_open_query_right,
304                         opt_gap_open_target_right,
305                         opt_gap_extension_query_left,
306                         opt_gap_extension_target_left,
307                         opt_gap_extension_query_interior,
308                         opt_gap_extension_target_interior,
309                         opt_gap_extension_query_right,
310                         opt_gap_extension_target_right);
311 
312 
313   LinearMemoryAligner lma;
314 
315   int64_t * scorematrix = lma.scorematrix_create(opt_match, opt_mismatch);
316 
317   lma.set_parameters(scorematrix,
318                      opt_gap_open_query_left,
319                      opt_gap_open_target_left,
320                      opt_gap_open_query_interior,
321                      opt_gap_open_target_interior,
322                      opt_gap_open_query_right,
323                      opt_gap_open_target_right,
324                      opt_gap_extension_query_left,
325                      opt_gap_extension_target_left,
326                      opt_gap_extension_query_interior,
327                      opt_gap_extension_target_interior,
328                      opt_gap_extension_query_right,
329                      opt_gap_extension_target_right);
330 
331   /* allocate memory for alignment results */
332   unsigned int maxhits = seqcount;
333   auto * pseqnos =
334     (unsigned int *) xmalloc(sizeof(unsigned int) * maxhits);
335   CELL * pscores =
336     (CELL*) xmalloc(sizeof(CELL) * maxhits);
337   auto * paligned =
338     (unsigned short*) xmalloc(sizeof(unsigned short) * maxhits);
339   auto * pmatches =
340     (unsigned short*) xmalloc(sizeof(unsigned short) * maxhits);
341   auto * pmismatches =
342     (unsigned short*) xmalloc(sizeof(unsigned short) * maxhits);
343   auto * pgaps =
344     (unsigned short*) xmalloc(sizeof(unsigned short) * maxhits);
345   char** pcigar = (char**) xmalloc(sizeof(char*) * maxhits);
346 
347   auto * finalhits
348     = (struct hit *) xmalloc(sizeof(struct hit) * seqcount);
349 
350   bool cont = true;
351 
352   while (cont)
353     {
354       xpthread_mutex_lock(&mutex_input);
355 
356       int query_no = queries;
357 
358       if (query_no < seqcount)
359         {
360           queries++;
361 
362           /* let other threads read input */
363           xpthread_mutex_unlock(&mutex_input);
364 
365           /* init search info */
366           si->query_no = query_no;
367           si->qsize = db_getabundance(query_no);
368           si->query_head_len = db_getheaderlen(query_no);
369           si->query_head = db_getheader(query_no);
370           si->qseqlen = db_getsequencelen(query_no);
371           si->qsequence = db_getsequence(query_no);
372           si->rejects = 0;
373           si->accepts = 0;
374           si->hit_count = 0;
375 
376           for(int target = si->query_no + 1;
377               target < seqcount; target++)
378             {
379               if (opt_acceptall || search_acceptable_unaligned(si, target)) {
380                 pseqnos[si->hit_count++] = target;
381 
382         }
383             }
384 
385           if (si->hit_count)
386             {
387               /* perform alignments */
388 
389               search16_qprep(si->s, si->qsequence, si->qseqlen);
390 
391               search16(si->s,
392                        si->hit_count,
393                        pseqnos,
394                        pscores,
395                        paligned,
396                        pmatches,
397                        pmismatches,
398                        pgaps,
399                        pcigar);
400 
401               /* convert to hit structure */
402               for (int h = 0; h < si->hit_count; h++)
403                 {
404                   struct hit * hit = si->hits + h;
405 
406                   unsigned int target = pseqnos[h];
407                   int64_t nwscore = pscores[h];
408 
409                   char * nwcigar {nullptr};
410                   int64_t nwalignmentlength {0};
411                   int64_t nwmatches {0};
412                   int64_t nwmismatches {0};
413                   int64_t nwgaps {0};
414 
415                   if (nwscore == SHRT_MAX)
416                     {
417                       /* In case the SIMD aligner cannot align,
418                          perform a new alignment with the
419                          linear memory aligner */
420 
421                       char * tseq = db_getsequence(target);
422                       int64_t tseqlen = db_getsequencelen(target);
423 
424                       if (pcigar[h]) {
425                         xfree(pcigar[h]);
426 
427         }
428 
429                       nwcigar = xstrdup(lma.align(si->qsequence,
430                                                  tseq,
431                                                  si->qseqlen,
432                                                  tseqlen));
433                       lma.alignstats(nwcigar,
434                                      si->qsequence,
435                                      tseq,
436                                      & nwscore,
437                                      & nwalignmentlength,
438                                      & nwmatches,
439                                      & nwmismatches,
440                                      & nwgaps);
441                     }
442                   else
443                     {
444                       nwcigar = pcigar[h];
445                       nwalignmentlength = paligned[h];
446                       nwmatches = pmatches[h];
447                       nwmismatches = pmismatches[h];
448                       nwgaps = pgaps[h];
449                     }
450 
451                   hit->target = target;
452                   hit->strand = 0;
453                   hit->count = 0;
454 
455                   hit->accepted = false;
456                   hit->rejected = false;
457                   hit->aligned = true;
458                   hit->weak = false;
459 
460                   hit->nwscore = nwscore;
461                   hit->nwdiff = nwalignmentlength - nwmatches;
462                   hit->nwgaps = nwgaps;
463                   hit->nwindels = nwalignmentlength - nwmatches - nwmismatches;
464                   hit->nwalignmentlength = nwalignmentlength;
465                   hit->nwid = 100.0 * (nwalignmentlength - hit->nwdiff) /
466                     nwalignmentlength;
467                   hit->nwalignment = nwcigar;
468                   hit->matches = nwalignmentlength - hit->nwdiff;
469                   hit->mismatches = hit->nwdiff - hit->nwindels;
470 
471                   int64_t dseqlen = db_getsequencelen(target);
472                   hit->shortest = MIN(si->qseqlen, dseqlen);
473                   hit->longest = MAX(si->qseqlen, dseqlen);
474 
475                   /* trim alignment, compute numbers excluding terminal gaps */
476                   align_trim(hit);
477 
478                   /* test accept/reject criteria after alignment */
479                   if (opt_acceptall || search_acceptable_aligned(si, hit)) {
480                     finalhits[si->accepts++] = *hit;
481 
482         }
483                 }
484 
485               /* sort hits */
486               qsort(finalhits, si->accepts,
487                     sizeof(struct hit), allpairs_hit_compare);
488             }
489 
490           /* lock mutex for update of global data and output */
491           xpthread_mutex_lock(&mutex_output);
492 
493           /* output results */
494           allpairs_output_results(si->accepts,
495                                   finalhits,
496                                   si->query_head,
497                                   si->qseqlen,
498                                   si->qsequence,
499                                   nullptr);
500 
501           /* update stats */
502           if (si->accepts) {
503             qmatches++;
504 
505         }
506 
507           /* show progress */
508           progress += seqcount - query_no - 1;
509           progress_update(progress);
510 
511           xpthread_mutex_unlock(&mutex_output);
512 
513           /* free memory for alignment strings */
514           for(int i=0; i < si->hit_count; i++) {
515             if (si->hits[i].aligned) {
516               xfree(si->hits[i].nwalignment);
517 
518         }
519 
520         }
521         }
522       else
523         {
524           /* let other threads read input */
525           xpthread_mutex_unlock(&mutex_input);
526 
527           cont = false;
528         }
529     }
530 
531   xfree(finalhits);
532 
533   xfree(pcigar);
534   xfree(pgaps);
535   xfree(pmismatches);
536   xfree(pmatches);
537   xfree(paligned);
538   xfree(pscores);
539   xfree(pseqnos);
540 
541   search16_exit(si->s);
542 
543   nw_exit(nw);
544 
545   xfree(scorematrix);
546 
547   xfree(si->hits);
548 }
549 
allpairs_thread_worker(void * vp)550 void * allpairs_thread_worker(void * vp)
551 {
552   auto t = (int64_t) vp;
553   allpairs_thread_run(t);
554   return nullptr;
555 }
556 
allpairs_thread_worker_run()557 void allpairs_thread_worker_run()
558 {
559   /* initialize threads, start them, join them and return */
560 
561   xpthread_attr_init(&attr);
562   xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
563 
564   /* init and create worker threads, put them into stand-by mode */
565   for(int t=0; t<opt_threads; t++) {
566     xpthread_create(pthread+t, &attr,
567                     allpairs_thread_worker, (void*)(int64_t)t);
568 
569         }
570 
571   /* finish and clean up worker threads */
572   for(int t=0; t<opt_threads; t++) {
573     xpthread_join(pthread[t], nullptr);
574 
575         }
576 
577   xpthread_attr_destroy(&attr);
578 }
579 
580 
allpairs_global(char * cmdline,char * progheader)581 void allpairs_global(char * cmdline, char * progheader)
582 {
583   opt_strand = 1;
584   opt_uc_allhits = 1;
585 
586   /* open output files */
587 
588   if (opt_alnout)
589     {
590       fp_alnout = fopen_output(opt_alnout);
591       if (! fp_alnout) {
592         fatal("Unable to open alignment output file for writing");
593 
594         }
595 
596       fprintf(fp_alnout, "%s\n", cmdline);
597       fprintf(fp_alnout, "%s\n", progheader);
598     }
599 
600   if (opt_samout)
601     {
602       fp_samout = fopen_output(opt_samout);
603       if (! fp_samout) {
604         fatal("Unable to open SAM output file for writing");
605 
606         }
607     }
608 
609   if (opt_userout)
610     {
611       fp_userout = fopen_output(opt_userout);
612       if (! fp_userout) {
613         fatal("Unable to open user-defined output file for writing");
614 
615         }
616     }
617 
618   if (opt_blast6out)
619     {
620       fp_blast6out = fopen_output(opt_blast6out);
621       if (! fp_blast6out) {
622         fatal("Unable to open blast6-like output file for writing");
623 
624         }
625     }
626 
627   if (opt_uc)
628     {
629       fp_uc = fopen_output(opt_uc);
630       if (! fp_uc) {
631         fatal("Unable to open uc output file for writing");
632 
633         }
634     }
635 
636   if (opt_fastapairs)
637     {
638       fp_fastapairs = fopen_output(opt_fastapairs);
639       if (! fp_fastapairs) {
640         fatal("Unable to open fastapairs output file for writing");
641 
642         }
643     }
644 
645   if (opt_matched)
646     {
647       fp_matched = fopen_output(opt_matched);
648       if (! fp_matched) {
649         fatal("Unable to open matched output file for writing");
650 
651         }
652     }
653 
654   if (opt_notmatched)
655     {
656       fp_notmatched = fopen_output(opt_notmatched);
657       if (! fp_notmatched) {
658         fatal("Unable to open notmatched output file for writing");
659 
660         }
661     }
662 
663   db_read(opt_allpairs_global, 0);
664 
665   results_show_samheader(fp_samout, cmdline, opt_allpairs_global);
666 
667   if (opt_qmask == MASK_DUST) {
668     dust_all();
669   } else if ((opt_qmask == MASK_SOFT) && (opt_hardmask)) {
670     hardmask_all();
671 
672         }
673 
674   show_rusage();
675 
676   seqcount = db_getsequencecount();
677 
678   /* prepare reading of queries */
679   qmatches = 0;
680   queries = 0;
681 
682   pthread = (pthread_t *) xmalloc(opt_threads * sizeof(pthread_t));
683 
684   /* init mutexes for input and output */
685   xpthread_mutex_init(&mutex_input, nullptr);
686   xpthread_mutex_init(&mutex_output, nullptr);
687 
688   progress = 0;
689   progress_init("Aligning", MAX(0,((int64_t)seqcount)*((int64_t)seqcount-1))/2);
690   allpairs_thread_worker_run();
691   progress_done();
692 
693   if (!opt_quiet)
694     {
695       fprintf(stderr, "Matching query sequences: %d of %d",
696               qmatches, queries);
697       if (queries > 0) {
698         fprintf(stderr, " (%.2f%%)", 100.0 * qmatches / queries);
699 
700         }
701       fprintf(stderr, "\n");
702     }
703 
704   if (opt_log)
705     {
706       fprintf(fp_log, "Matching query sequences: %d of %d",
707               qmatches, queries);
708       if (queries > 0) {
709         fprintf(fp_log, " (%.2f%%)", 100.0 * qmatches / queries);
710 
711         }
712       fprintf(fp_log, "\n\n");
713     }
714 
715   xpthread_mutex_destroy(&mutex_output);
716   xpthread_mutex_destroy(&mutex_input);
717 
718   xfree(pthread);
719 
720   /* clean up, global */
721   db_free();
722   if (opt_matched) {
723     fclose(fp_matched);
724 
725         }
726   if (opt_notmatched) {
727     fclose(fp_notmatched);
728 
729         }
730   if (opt_fastapairs) {
731     fclose(fp_fastapairs);
732 
733         }
734   if (fp_uc) {
735     fclose(fp_uc);
736 
737         }
738   if (fp_blast6out) {
739     fclose(fp_blast6out);
740 
741         }
742   if (fp_userout) {
743     fclose(fp_userout);
744 
745         }
746   if (fp_alnout) {
747     fclose(fp_alnout);
748 
749         }
750   if (fp_samout) {
751     fclose(fp_samout);
752 
753         }
754   show_rusage();
755 }
756