1 /*
2
3 VSEARCH: a versatile open source tool for metagenomics
4
5 Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6 All rights reserved.
7
8 Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9 Department of Informatics, University of Oslo,
10 PO Box 1080 Blindern, NO-0316 Oslo, Norway
11
12 This software is dual-licensed and available under a choice
13 of one of two licenses, either under the terms of the GNU
14 General Public License version 3 or the BSD 2-Clause License.
15
16
17 GNU General Public License version 3
18
19 This program is free software: you can redistribute it and/or modify
20 it under the terms of the GNU General Public License as published by
21 the Free Software Foundation, either version 3 of the License, or
22 (at your option) any later version.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License for more details.
28
29 You should have received a copy of the GNU General Public License
30 along with this program. If not, see <http://www.gnu.org/licenses/>.
31
32
33 The BSD 2-Clause License
34
35 Redistribution and use in source and binary forms, with or without
36 modification, are permitted provided that the following conditions
37 are met:
38
39 1. Redistributions of source code must retain the above copyright
40 notice, this list of conditions and the following disclaimer.
41
42 2. Redistributions in binary form must reproduce the above copyright
43 notice, this list of conditions and the following disclaimer in the
44 documentation and/or other materials provided with the distribution.
45
46 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50 COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57 POSSIBILITY OF SUCH DAMAGE.
58
59 */
60
61 #include "vsearch.h"
62
63 static pthread_t * pthread;
64
65 /* global constants/data, no need for synchronization */
66 static int seqcount; /* number of database sequences */
67 static pthread_attr_t attr;
68
69 /* global data protected by mutex */
70 static pthread_mutex_t mutex_input;
71 static pthread_mutex_t mutex_output;
72 static int qmatches;
73 static int queries;
74 static int64_t progress = 0;
75 static FILE * fp_alnout = nullptr;
76 static FILE * fp_samout = nullptr;
77 static FILE * fp_userout = nullptr;
78 static FILE * fp_blast6out = nullptr;
79 static FILE * fp_uc = nullptr;
80 static FILE * fp_fastapairs = nullptr;
81 static FILE * fp_matched = nullptr;
82 static FILE * fp_notmatched = nullptr;
83
84 static int count_matched = 0;
85 static int count_notmatched = 0;
86
allpairs_hit_compare_typed(struct hit * x,struct hit * y)87 inline int allpairs_hit_compare_typed(struct hit * x, struct hit * y)
88 {
89 // high id, then low id
90 // early target, then late target
91
92 if (x->id > y->id) {
93 return -1;
94 } else
95 if (x->id < y->id) {
96 return +1;
97 } else
98 if (x->target < y->target) {
99 return -1;
100 } else
101 if (x->target > y->target) {
102 return +1;
103 } else {
104 return 0;
105
106 }
107 }
108
allpairs_hit_compare(const void * a,const void * b)109 int allpairs_hit_compare(const void * a, const void * b)
110 {
111 return allpairs_hit_compare_typed((struct hit *) a, (struct hit *) b);
112 }
113
allpairs_output_results(int hit_count,struct hit * hits,char * query_head,int qseqlen,char * qsequence,char * qsequence_rc)114 void allpairs_output_results(int hit_count,
115 struct hit * hits,
116 char * query_head,
117 int qseqlen,
118 char * qsequence,
119 char * qsequence_rc)
120 {
121 /* show results */
122 int64_t toreport = MIN(opt_maxhits, hit_count);
123
124 if (fp_alnout) {
125 results_show_alnout(fp_alnout,
126 hits,
127 toreport,
128 query_head,
129 qsequence,
130 qseqlen,
131 qsequence_rc);
132
133 }
134
135 if (fp_samout) {
136 results_show_samout(fp_samout,
137 hits,
138 toreport,
139 query_head,
140 qsequence,
141 qseqlen,
142 qsequence_rc);
143
144 }
145
146 if (toreport)
147 {
148 double top_hit_id = hits[0].id;
149
150 for(int t = 0; t < toreport; t++)
151 {
152 struct hit * hp = hits + t;
153
154 if (opt_top_hits_only && (hp->id < top_hit_id)) {
155 break;
156
157 }
158
159 if (fp_fastapairs) {
160 results_show_fastapairs_one(fp_fastapairs,
161 hp,
162 query_head,
163 qsequence,
164 qseqlen,
165 qsequence_rc);
166
167 }
168
169 if (fp_uc) {
170 if ((t==0) || opt_uc_allhits) {
171 results_show_uc_one(fp_uc,
172 hp,
173 query_head,
174 qsequence,
175 qseqlen,
176 qsequence_rc,
177 hp->target);
178
179 }
180
181 }
182
183 if (fp_userout) {
184 results_show_userout_one(fp_userout,
185 hp,
186 query_head,
187 qsequence,
188 qseqlen,
189 qsequence_rc);
190
191 }
192
193 if (fp_blast6out) {
194 results_show_blast6out_one(fp_blast6out,
195 hp,
196 query_head,
197 qsequence,
198 qseqlen,
199 qsequence_rc);
200
201 }
202 }
203 }
204 else
205 {
206 if (fp_uc) {
207 results_show_uc_one(fp_uc,
208 nullptr,
209 query_head,
210 qsequence,
211 qseqlen,
212 qsequence_rc,
213 0);
214
215 }
216
217 if (opt_output_no_hits)
218 {
219 if (fp_userout) {
220 results_show_userout_one(fp_userout,
221 nullptr,
222 query_head,
223 qsequence,
224 qseqlen,
225 qsequence_rc);
226
227 }
228
229 if (fp_blast6out) {
230 results_show_blast6out_one(fp_blast6out,
231 nullptr,
232 query_head,
233 qsequence,
234 qseqlen,
235 qsequence_rc);
236
237 }
238 }
239 }
240
241 if (hit_count)
242 {
243 count_matched++;
244 if (opt_matched) {
245 fasta_print_general(fp_matched,
246 nullptr,
247 qsequence,
248 qseqlen,
249 query_head,
250 strlen(query_head),
251 0,
252 count_matched,
253 -1.0,
254 -1, -1, nullptr, 0.0);
255
256 }
257 }
258 else
259 {
260 count_notmatched++;
261 if (opt_notmatched) {
262 fasta_print_general(fp_notmatched,
263 nullptr,
264 qsequence,
265 qseqlen,
266 query_head,
267 strlen(query_head),
268 0,
269 count_notmatched,
270 -1.0,
271 -1, -1, nullptr, 0.0);
272
273 }
274 }
275 }
276
allpairs_thread_run(int64_t t)277 void allpairs_thread_run(int64_t t)
278 {
279 (void) t;
280
281 struct searchinfo_s sia;
282
283 struct searchinfo_s * si = & sia;
284
285 si->strand = 0;
286 si->query_head_alloc = 0;
287 si->seq_alloc = 0;
288 si->kmersamplecount = 0;
289 si->kmers = nullptr;
290 si->m = nullptr;
291 si->finalized = 0;
292
293 si->hits = (struct hit *) xmalloc(sizeof(struct hit) * seqcount);
294
295 struct nwinfo_s * nw = nw_init();
296
297 si->s = search16_init(opt_match,
298 opt_mismatch,
299 opt_gap_open_query_left,
300 opt_gap_open_target_left,
301 opt_gap_open_query_interior,
302 opt_gap_open_target_interior,
303 opt_gap_open_query_right,
304 opt_gap_open_target_right,
305 opt_gap_extension_query_left,
306 opt_gap_extension_target_left,
307 opt_gap_extension_query_interior,
308 opt_gap_extension_target_interior,
309 opt_gap_extension_query_right,
310 opt_gap_extension_target_right);
311
312
313 LinearMemoryAligner lma;
314
315 int64_t * scorematrix = lma.scorematrix_create(opt_match, opt_mismatch);
316
317 lma.set_parameters(scorematrix,
318 opt_gap_open_query_left,
319 opt_gap_open_target_left,
320 opt_gap_open_query_interior,
321 opt_gap_open_target_interior,
322 opt_gap_open_query_right,
323 opt_gap_open_target_right,
324 opt_gap_extension_query_left,
325 opt_gap_extension_target_left,
326 opt_gap_extension_query_interior,
327 opt_gap_extension_target_interior,
328 opt_gap_extension_query_right,
329 opt_gap_extension_target_right);
330
331 /* allocate memory for alignment results */
332 unsigned int maxhits = seqcount;
333 auto * pseqnos =
334 (unsigned int *) xmalloc(sizeof(unsigned int) * maxhits);
335 CELL * pscores =
336 (CELL*) xmalloc(sizeof(CELL) * maxhits);
337 auto * paligned =
338 (unsigned short*) xmalloc(sizeof(unsigned short) * maxhits);
339 auto * pmatches =
340 (unsigned short*) xmalloc(sizeof(unsigned short) * maxhits);
341 auto * pmismatches =
342 (unsigned short*) xmalloc(sizeof(unsigned short) * maxhits);
343 auto * pgaps =
344 (unsigned short*) xmalloc(sizeof(unsigned short) * maxhits);
345 char** pcigar = (char**) xmalloc(sizeof(char*) * maxhits);
346
347 auto * finalhits
348 = (struct hit *) xmalloc(sizeof(struct hit) * seqcount);
349
350 bool cont = true;
351
352 while (cont)
353 {
354 xpthread_mutex_lock(&mutex_input);
355
356 int query_no = queries;
357
358 if (query_no < seqcount)
359 {
360 queries++;
361
362 /* let other threads read input */
363 xpthread_mutex_unlock(&mutex_input);
364
365 /* init search info */
366 si->query_no = query_no;
367 si->qsize = db_getabundance(query_no);
368 si->query_head_len = db_getheaderlen(query_no);
369 si->query_head = db_getheader(query_no);
370 si->qseqlen = db_getsequencelen(query_no);
371 si->qsequence = db_getsequence(query_no);
372 si->rejects = 0;
373 si->accepts = 0;
374 si->hit_count = 0;
375
376 for(int target = si->query_no + 1;
377 target < seqcount; target++)
378 {
379 if (opt_acceptall || search_acceptable_unaligned(si, target)) {
380 pseqnos[si->hit_count++] = target;
381
382 }
383 }
384
385 if (si->hit_count)
386 {
387 /* perform alignments */
388
389 search16_qprep(si->s, si->qsequence, si->qseqlen);
390
391 search16(si->s,
392 si->hit_count,
393 pseqnos,
394 pscores,
395 paligned,
396 pmatches,
397 pmismatches,
398 pgaps,
399 pcigar);
400
401 /* convert to hit structure */
402 for (int h = 0; h < si->hit_count; h++)
403 {
404 struct hit * hit = si->hits + h;
405
406 unsigned int target = pseqnos[h];
407 int64_t nwscore = pscores[h];
408
409 char * nwcigar {nullptr};
410 int64_t nwalignmentlength {0};
411 int64_t nwmatches {0};
412 int64_t nwmismatches {0};
413 int64_t nwgaps {0};
414
415 if (nwscore == SHRT_MAX)
416 {
417 /* In case the SIMD aligner cannot align,
418 perform a new alignment with the
419 linear memory aligner */
420
421 char * tseq = db_getsequence(target);
422 int64_t tseqlen = db_getsequencelen(target);
423
424 if (pcigar[h]) {
425 xfree(pcigar[h]);
426
427 }
428
429 nwcigar = xstrdup(lma.align(si->qsequence,
430 tseq,
431 si->qseqlen,
432 tseqlen));
433 lma.alignstats(nwcigar,
434 si->qsequence,
435 tseq,
436 & nwscore,
437 & nwalignmentlength,
438 & nwmatches,
439 & nwmismatches,
440 & nwgaps);
441 }
442 else
443 {
444 nwcigar = pcigar[h];
445 nwalignmentlength = paligned[h];
446 nwmatches = pmatches[h];
447 nwmismatches = pmismatches[h];
448 nwgaps = pgaps[h];
449 }
450
451 hit->target = target;
452 hit->strand = 0;
453 hit->count = 0;
454
455 hit->accepted = false;
456 hit->rejected = false;
457 hit->aligned = true;
458 hit->weak = false;
459
460 hit->nwscore = nwscore;
461 hit->nwdiff = nwalignmentlength - nwmatches;
462 hit->nwgaps = nwgaps;
463 hit->nwindels = nwalignmentlength - nwmatches - nwmismatches;
464 hit->nwalignmentlength = nwalignmentlength;
465 hit->nwid = 100.0 * (nwalignmentlength - hit->nwdiff) /
466 nwalignmentlength;
467 hit->nwalignment = nwcigar;
468 hit->matches = nwalignmentlength - hit->nwdiff;
469 hit->mismatches = hit->nwdiff - hit->nwindels;
470
471 int64_t dseqlen = db_getsequencelen(target);
472 hit->shortest = MIN(si->qseqlen, dseqlen);
473 hit->longest = MAX(si->qseqlen, dseqlen);
474
475 /* trim alignment, compute numbers excluding terminal gaps */
476 align_trim(hit);
477
478 /* test accept/reject criteria after alignment */
479 if (opt_acceptall || search_acceptable_aligned(si, hit)) {
480 finalhits[si->accepts++] = *hit;
481
482 }
483 }
484
485 /* sort hits */
486 qsort(finalhits, si->accepts,
487 sizeof(struct hit), allpairs_hit_compare);
488 }
489
490 /* lock mutex for update of global data and output */
491 xpthread_mutex_lock(&mutex_output);
492
493 /* output results */
494 allpairs_output_results(si->accepts,
495 finalhits,
496 si->query_head,
497 si->qseqlen,
498 si->qsequence,
499 nullptr);
500
501 /* update stats */
502 if (si->accepts) {
503 qmatches++;
504
505 }
506
507 /* show progress */
508 progress += seqcount - query_no - 1;
509 progress_update(progress);
510
511 xpthread_mutex_unlock(&mutex_output);
512
513 /* free memory for alignment strings */
514 for(int i=0; i < si->hit_count; i++) {
515 if (si->hits[i].aligned) {
516 xfree(si->hits[i].nwalignment);
517
518 }
519
520 }
521 }
522 else
523 {
524 /* let other threads read input */
525 xpthread_mutex_unlock(&mutex_input);
526
527 cont = false;
528 }
529 }
530
531 xfree(finalhits);
532
533 xfree(pcigar);
534 xfree(pgaps);
535 xfree(pmismatches);
536 xfree(pmatches);
537 xfree(paligned);
538 xfree(pscores);
539 xfree(pseqnos);
540
541 search16_exit(si->s);
542
543 nw_exit(nw);
544
545 xfree(scorematrix);
546
547 xfree(si->hits);
548 }
549
allpairs_thread_worker(void * vp)550 void * allpairs_thread_worker(void * vp)
551 {
552 auto t = (int64_t) vp;
553 allpairs_thread_run(t);
554 return nullptr;
555 }
556
allpairs_thread_worker_run()557 void allpairs_thread_worker_run()
558 {
559 /* initialize threads, start them, join them and return */
560
561 xpthread_attr_init(&attr);
562 xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
563
564 /* init and create worker threads, put them into stand-by mode */
565 for(int t=0; t<opt_threads; t++) {
566 xpthread_create(pthread+t, &attr,
567 allpairs_thread_worker, (void*)(int64_t)t);
568
569 }
570
571 /* finish and clean up worker threads */
572 for(int t=0; t<opt_threads; t++) {
573 xpthread_join(pthread[t], nullptr);
574
575 }
576
577 xpthread_attr_destroy(&attr);
578 }
579
580
allpairs_global(char * cmdline,char * progheader)581 void allpairs_global(char * cmdline, char * progheader)
582 {
583 opt_strand = 1;
584 opt_uc_allhits = 1;
585
586 /* open output files */
587
588 if (opt_alnout)
589 {
590 fp_alnout = fopen_output(opt_alnout);
591 if (! fp_alnout) {
592 fatal("Unable to open alignment output file for writing");
593
594 }
595
596 fprintf(fp_alnout, "%s\n", cmdline);
597 fprintf(fp_alnout, "%s\n", progheader);
598 }
599
600 if (opt_samout)
601 {
602 fp_samout = fopen_output(opt_samout);
603 if (! fp_samout) {
604 fatal("Unable to open SAM output file for writing");
605
606 }
607 }
608
609 if (opt_userout)
610 {
611 fp_userout = fopen_output(opt_userout);
612 if (! fp_userout) {
613 fatal("Unable to open user-defined output file for writing");
614
615 }
616 }
617
618 if (opt_blast6out)
619 {
620 fp_blast6out = fopen_output(opt_blast6out);
621 if (! fp_blast6out) {
622 fatal("Unable to open blast6-like output file for writing");
623
624 }
625 }
626
627 if (opt_uc)
628 {
629 fp_uc = fopen_output(opt_uc);
630 if (! fp_uc) {
631 fatal("Unable to open uc output file for writing");
632
633 }
634 }
635
636 if (opt_fastapairs)
637 {
638 fp_fastapairs = fopen_output(opt_fastapairs);
639 if (! fp_fastapairs) {
640 fatal("Unable to open fastapairs output file for writing");
641
642 }
643 }
644
645 if (opt_matched)
646 {
647 fp_matched = fopen_output(opt_matched);
648 if (! fp_matched) {
649 fatal("Unable to open matched output file for writing");
650
651 }
652 }
653
654 if (opt_notmatched)
655 {
656 fp_notmatched = fopen_output(opt_notmatched);
657 if (! fp_notmatched) {
658 fatal("Unable to open notmatched output file for writing");
659
660 }
661 }
662
663 db_read(opt_allpairs_global, 0);
664
665 results_show_samheader(fp_samout, cmdline, opt_allpairs_global);
666
667 if (opt_qmask == MASK_DUST) {
668 dust_all();
669 } else if ((opt_qmask == MASK_SOFT) && (opt_hardmask)) {
670 hardmask_all();
671
672 }
673
674 show_rusage();
675
676 seqcount = db_getsequencecount();
677
678 /* prepare reading of queries */
679 qmatches = 0;
680 queries = 0;
681
682 pthread = (pthread_t *) xmalloc(opt_threads * sizeof(pthread_t));
683
684 /* init mutexes for input and output */
685 xpthread_mutex_init(&mutex_input, nullptr);
686 xpthread_mutex_init(&mutex_output, nullptr);
687
688 progress = 0;
689 progress_init("Aligning", MAX(0,((int64_t)seqcount)*((int64_t)seqcount-1))/2);
690 allpairs_thread_worker_run();
691 progress_done();
692
693 if (!opt_quiet)
694 {
695 fprintf(stderr, "Matching query sequences: %d of %d",
696 qmatches, queries);
697 if (queries > 0) {
698 fprintf(stderr, " (%.2f%%)", 100.0 * qmatches / queries);
699
700 }
701 fprintf(stderr, "\n");
702 }
703
704 if (opt_log)
705 {
706 fprintf(fp_log, "Matching query sequences: %d of %d",
707 qmatches, queries);
708 if (queries > 0) {
709 fprintf(fp_log, " (%.2f%%)", 100.0 * qmatches / queries);
710
711 }
712 fprintf(fp_log, "\n\n");
713 }
714
715 xpthread_mutex_destroy(&mutex_output);
716 xpthread_mutex_destroy(&mutex_input);
717
718 xfree(pthread);
719
720 /* clean up, global */
721 db_free();
722 if (opt_matched) {
723 fclose(fp_matched);
724
725 }
726 if (opt_notmatched) {
727 fclose(fp_notmatched);
728
729 }
730 if (opt_fastapairs) {
731 fclose(fp_fastapairs);
732
733 }
734 if (fp_uc) {
735 fclose(fp_uc);
736
737 }
738 if (fp_blast6out) {
739 fclose(fp_blast6out);
740
741 }
742 if (fp_userout) {
743 fclose(fp_userout);
744
745 }
746 if (fp_alnout) {
747 fclose(fp_alnout);
748
749 }
750 if (fp_samout) {
751 fclose(fp_samout);
752
753 }
754 show_rusage();
755 }
756