1 /*
2 
3   VSEARCH: a versatile open source tool for metagenomics
4 
5   Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6   All rights reserved.
7 
8   Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9   Department of Informatics, University of Oslo,
10   PO Box 1080 Blindern, NO-0316 Oslo, Norway
11 
12   This software is dual-licensed and available under a choice
13   of one of two licenses, either under the terms of the GNU
14   General Public License version 3 or the BSD 2-Clause License.
15 
16 
17   GNU General Public License version 3
18 
19   This program is free software: you can redistribute it and/or modify
20   it under the terms of the GNU General Public License as published by
21   the Free Software Foundation, either version 3 of the License, or
22   (at your option) any later version.
23 
24   This program is distributed in the hope that it will be useful,
25   but WITHOUT ANY WARRANTY; without even the implied warranty of
26   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27   GNU General Public License for more details.
28 
29   You should have received a copy of the GNU General Public License
30   along with this program.  If not, see <http://www.gnu.org/licenses/>.
31 
32 
33   The BSD 2-Clause License
34 
35   Redistribution and use in source and binary forms, with or without
36   modification, are permitted provided that the following conditions
37   are met:
38 
39   1. Redistributions of source code must retain the above copyright
40   notice, this list of conditions and the following disclaimer.
41 
42   2. Redistributions in binary form must reproduce the above copyright
43   notice, this list of conditions and the following disclaimer in the
44   documentation and/or other materials provided with the distribution.
45 
46   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50   COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51   INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52   BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57   POSSIBILITY OF SUCH DAMAGE.
58 
59 */
60 
61 //#define COMPARENONVECTORIZED
62 
63 /* the number of alignments that can be delayed */
64 #define MAXDELAYED 8
65 
66 /* Default minimum number of word matches for word lengths 3-15 */
67 const int minwordmatches_defaults[] =
68   { -1, -1, -1, 18, 17, 16, 15, 14, 12, 11, 10,  9,  8,  7,  5,  3 };
69 
70 struct hit
71 {
72   int target;
73   int strand;
74 
75   /* candidate info */
76   unsigned int count;     /* number of unique kmers shared with query */
77 
78   bool accepted;          /* is it accepted? */
79   bool rejected;          /* is it rejected? */
80   bool aligned;           /* has this hit been aligned */
81   bool weak;              /* weak hits are aligned with id > weak_id */
82 
83   /* info about global alignment, including terminal gaps */
84 
85   int nwscore;           /* alignment score */
86   int nwdiff;            /* indels and mismatches in global alignment */
87   int nwgaps;            /* gaps in global alignment */
88   int nwindels;          /* indels in global alignment */
89   int nwalignmentlength; /* length of global alignment */
90   double nwid;           /* percent identity of global alignment */
91   char * nwalignment;    /* alignment string (cigar) of global alignment */
92   int matches;
93   int mismatches;
94 
95   /* info about alignment excluding terminal gaps */
96 
97   int internal_alignmentlength;
98   int internal_gaps;
99   int internal_indels;
100   int trim_q_left;
101   int trim_q_right;
102   int trim_t_left;
103   int trim_t_right;
104   int trim_aln_left;
105   int trim_aln_right;
106 
107   /* more info */
108 
109   double id;             /* identity used for ranking */
110   double id0, id1, id2, id3, id4;
111 
112   int shortest;          /* length of shortest of query and target */
113   int longest;           /* length of longest of query and target */
114 };
115 
116 /* type of kmer hit counter element remember possibility of overflow */
117 typedef unsigned short count_t;
118 
119 struct searchinfo_s
120 {
121   int query_no;                 /* query number, zero-based */
122   int strand;                   /* strand of query being analysed */
123   int qsize;                    /* query abundance */
124   int query_head_len;           /* query header length */
125   int query_head_alloc;         /* bytes allocated for the header */
126   char * query_head;            /* query header */
127   int qseqlen;                  /* query length */
128   int seq_alloc;                /* bytes allocated for the query sequence */
129   char * qsequence;             /* query sequence */
130   unsigned int kmersamplecount; /* number of kmer samples from query */
131   unsigned int * kmersample;    /* list of kmers sampled from query */
132   count_t * kmers;              /* list of kmer counts for each db seq */
133   struct hit * hits;            /* list of hits */
134   int hit_count;                /* number of hits in the above list */
135   struct uhandle_s * uh;        /* unique kmer finder instance */
136   struct s16info_s * s;         /* SIMD aligner instance */
137   struct nwinfo_s * nw;         /* NW aligner instance */
138   LinearMemoryAligner * lma;    /* Linear memory aligner instance pointer */
139   int accepts;                  /* number of accepts */
140   int rejects;                  /* number of rejects */
141   minheap_t * m;                /* min heap with the top kmer db seqs */
142   int finalized;
143 };
144 
145 void search_topscores(struct searchinfo_s * si);
146 
147 void search_onequery(struct searchinfo_s * si, int seqmask);
148 
149 struct hit * search_findbest2_byid(struct searchinfo_s * si_p,
150                                    struct searchinfo_s * si_m);
151 
152 struct hit * search_findbest2_bysize(struct searchinfo_s * si_p,
153                                      struct searchinfo_s * si_m);
154 
155 int search_acceptable_unaligned(struct searchinfo_s * si, int target);
156 
157 int search_acceptable_aligned(struct searchinfo_s * si,
158                               struct hit * hit);
159 
160 void align_trim(struct hit * hit);
161 
162 void search_joinhits(struct searchinfo_s * si_p,
163                      struct searchinfo_s * si_m,
164                      struct hit * * hits,
165                      int * hit_count);
166 
167 bool search_enough_kmers(struct searchinfo_s * si,
168                          unsigned int count);
169