1 /* $Id: blast_lookup.c 523540 2017-01-04 16:58:25Z boratyng $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 */
26
27 /** @file blast_lookup.c
28 * Functions that provide generic services for BLAST lookup tables
29 */
30
31 #include <algo/blast/core/blast_lookup.h>
32
BlastLookupAddWordHit(Int4 ** backbone,Int4 wordsize,Int4 charsize,Uint1 * seq,Int4 query_offset)33 void BlastLookupAddWordHit(Int4 **backbone, Int4 wordsize,
34 Int4 charsize, Uint1* seq,
35 Int4 query_offset)
36 {
37 Int4 index;
38 Int4 *chain = NULL;
39 Int4 chain_size = 0; /* total number of elements in the chain */
40 Int4 hits_in_chain = 0; /* number of occupied elements in the chain,
41 not including the zeroth and first
42 positions */
43
44 /* compute the backbone cell to update */
45
46 index = ComputeTableIndex(wordsize, charsize, seq);
47
48 /* if backbone cell is null, initialize a new chain */
49 if (backbone[index] == NULL) {
50 chain_size = 8;
51 hits_in_chain = 0;
52 chain = (Int4 *) malloc(chain_size * sizeof(Int4));
53 ASSERT(chain != NULL);
54 chain[0] = chain_size;
55 chain[1] = hits_in_chain;
56 backbone[index] = chain;
57 } else {
58 /* otherwise, use the existing chain */
59 chain = backbone[index];
60 chain_size = chain[0];
61 hits_in_chain = chain[1];
62 }
63
64 /* if the chain is full, allocate more room */
65 if ((hits_in_chain + 2) == chain_size) {
66 chain_size = chain_size * 2;
67 chain = (Int4 *) realloc(chain, chain_size * sizeof(Int4));
68 ASSERT(chain != NULL);
69
70 backbone[index] = chain;
71 chain[0] = chain_size;
72 }
73
74 /* add the hit */
75 chain[chain[1] + 2] = query_offset;
76 chain[1]++;
77 }
78
BlastLookupIndexQueryExactMatches(Int4 ** backbone,Int4 word_length,Int4 charsize,Int4 lut_word_length,BLAST_SequenceBlk * query,BlastSeqLoc * locations)79 void BlastLookupIndexQueryExactMatches(Int4 **backbone,
80 Int4 word_length,
81 Int4 charsize,
82 Int4 lut_word_length,
83 BLAST_SequenceBlk * query,
84 BlastSeqLoc * locations)
85 {
86 BlastSeqLoc *loc;
87 Int4 offset;
88 Uint1 *seq;
89 Uint1 *word_target;
90 Uint1 invalid_mask = 0xff << charsize;
91
92 for (loc = locations; loc; loc = loc->next) {
93 Int4 from = loc->ssr->left;
94 Int4 to = loc->ssr->right;
95
96 /* if this location is too small to fit a complete word, skip the
97 location */
98
99 if (word_length > to - from + 1)
100 continue;
101
102 /* Indexing proceeds from the start point to the last offset
103 such that a full lookup table word can be created. word_target
104 points to the letter beyond which indexing is allowed */
105 seq = query->sequence + from;
106 word_target = seq + lut_word_length;
107
108 for (offset = from; offset <= to; offset++, seq++) {
109
110 if (seq >= word_target) {
111 BlastLookupAddWordHit(backbone,
112 lut_word_length, charsize,
113 seq - lut_word_length,
114 offset - lut_word_length);
115 }
116
117 /* if the current word contains an ambiguity, skip all the
118 words that would contain that ambiguity */
119 if (*seq & invalid_mask)
120 word_target = seq + lut_word_length + 1;
121 }
122
123 /* handle the last word, without loading *seq */
124 if (seq >= word_target) {
125 BlastLookupAddWordHit(backbone,
126 lut_word_length, charsize,
127 seq - lut_word_length,
128 offset - lut_word_length);
129 }
130
131 }
132 }
133
134
BackboneCellFree(BackboneCell * cell)135 BackboneCell* BackboneCellFree(BackboneCell* cell)
136 {
137 BackboneCell* b = cell;
138 while (b) {
139 BackboneCell* next = b->next;
140 sfree(b);
141 b = next;
142 }
143
144 return NULL;
145 }
146
147
BackboneCellNew(Uint4 word,Int4 offset)148 BackboneCell* BackboneCellNew(Uint4 word, Int4 offset)
149 {
150 BackboneCell* cell = calloc(1, sizeof(BackboneCell));
151 if (!cell) {
152 BackboneCellFree(cell);
153 return NULL;
154 }
155
156 BackboneCellInit(cell, word, offset);
157 return cell;
158 }
159
160
BackboneCellInit(BackboneCell * cell,Uint4 word,Int4 offset)161 Int4 BackboneCellInit(BackboneCell* cell, Uint4 word, Int4 offset)
162 {
163 if (!cell) {
164 return -1;
165 }
166
167 cell->word = word;
168 cell->offset = offset;
169 cell->num_offsets = 1;
170
171 return 0;
172 }
173
s_AddWordHit(BackboneCell * backbone,Int4 * offsets,Int4 wordsize,Int4 charsize,Uint1 * seq,Int4 offset,TNaLookupHashFunction hash_func,Uint4 mask,PV_ARRAY_TYPE * pv_array)174 static Int2 s_AddWordHit(BackboneCell* backbone, Int4* offsets, Int4 wordsize,
175 Int4 charsize, Uint1* seq, Int4 offset,
176 TNaLookupHashFunction hash_func, Uint4 mask,
177 PV_ARRAY_TYPE* pv_array)
178 {
179 Uint4 large_index;
180 Int8 index;
181 Int4 i;
182
183 /* convert a sequence from 4NA to 2NA */
184 large_index = 0;
185 for (i = 0;i < wordsize;i++) {
186 large_index = (large_index << charsize) | seq[i];
187 }
188
189 /* if filtering by database word count, then do not add words
190 that do not appear in the database or appear to many times */
191 if (pv_array && !PV_TEST(pv_array, large_index, PV_ARRAY_BTS)) {
192 return 0;
193 }
194
195 index = (Int8)hash_func((Uint1*)&large_index, mask);
196
197 /* offset zero indicates end of offset list, so we are storing offset + 1
198 values */
199 offset++;
200
201 /* if the hash table entry is emtpy, initialize a new cell */
202 if (backbone[index].num_offsets == 0) {
203 BackboneCellInit(&backbone[index], large_index, offset);
204 }
205 else {
206 /* otherwise check if the word was already added */
207 BackboneCell* b = &backbone[index];
208 while (b->next && b->word != large_index) {
209 b = b->next;
210 }
211
212 /* if word was already added, add the new offset to an existing cell */
213 if (b->word == large_index) {
214 /* word offsets are all sored in a linear array where next offset =
215 offsets[current offset] until offset is zero (similarily to
216 next_pos array in MBLookupTable) */
217
218 ASSERT(offsets[offset] == 0);
219 offsets[offset] = b->offset;
220 b->offset = offset;
221 b->num_offsets++;
222 }
223 else {
224 /* otherwise creare a new cell */
225 ASSERT(!b->next);
226 b->next = BackboneCellNew(large_index, offset);
227 if (!b->next) {
228 return -1;
229 }
230 }
231 }
232
233 return 0;
234 }
235
236
BlastHashLookupIndexQueryExactMatches(BackboneCell * backbone,Int4 * offsets,Int4 word_length,Int4 charsize,Int4 lut_word_length,BLAST_SequenceBlk * query,BlastSeqLoc * locations,TNaLookupHashFunction hash_func,Uint4 mask,PV_ARRAY_TYPE * pv_array)237 void BlastHashLookupIndexQueryExactMatches(BackboneCell *backbone,
238 Int4* offsets,
239 Int4 word_length,
240 Int4 charsize,
241 Int4 lut_word_length,
242 BLAST_SequenceBlk* query,
243 BlastSeqLoc* locations,
244 TNaLookupHashFunction hash_func,
245 Uint4 mask,
246 PV_ARRAY_TYPE* pv_array)
247 {
248 BlastSeqLoc *loc;
249 Int4 offset;
250 Uint1 *seq;
251 Uint1 *word_target;
252 Uint1 invalid_mask = 0xff << charsize;
253
254 for (loc = locations; loc; loc = loc->next) {
255 Int4 from = loc->ssr->left;
256 Int4 to = loc->ssr->right;
257
258 /* if this location is too small to fit a complete word, skip the
259 location */
260
261 if (word_length > to - from + 1)
262 continue;
263
264 /* Indexing proceeds from the start point to the last offset
265 such that a full lookup table word can be created. word_target
266 points to the letter beyond which indexing is allowed */
267 seq = query->sequence + from;
268 word_target = seq + lut_word_length;
269
270 for (offset = from; offset <= to; offset++, seq++) {
271
272 if (seq >= word_target) {
273 s_AddWordHit(backbone,
274 offsets,
275 lut_word_length, charsize,
276 seq - lut_word_length,
277 offset - lut_word_length,
278 hash_func, mask,
279 pv_array);
280 }
281
282 /* if the current word contains an ambiguity, skip all the
283 words that would contain that ambiguity */
284 if (*seq & invalid_mask)
285 word_target = seq + lut_word_length + 1;
286 }
287
288 /* handle the last word, without loading *seq */
289 if (seq >= word_target) {
290 s_AddWordHit(backbone,
291 offsets,
292 lut_word_length, charsize,
293 seq - lut_word_length,
294 offset - lut_word_length,
295 hash_func, mask,
296 pv_array);
297 }
298
299 }
300 }
301
302