1 /* $Id: blast_lookup.c 523540 2017-01-04 16:58:25Z boratyng $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  */
26 
27 /** @file blast_lookup.c
28  * Functions that provide generic services for BLAST lookup tables
29  */
30 
31 #include <algo/blast/core/blast_lookup.h>
32 
BlastLookupAddWordHit(Int4 ** backbone,Int4 wordsize,Int4 charsize,Uint1 * seq,Int4 query_offset)33 void BlastLookupAddWordHit(Int4 **backbone, Int4 wordsize,
34                            Int4 charsize, Uint1* seq,
35                            Int4 query_offset)
36 {
37     Int4 index;
38     Int4 *chain = NULL;
39     Int4 chain_size = 0;        /* total number of elements in the chain */
40     Int4 hits_in_chain = 0;     /* number of occupied elements in the chain,
41                                    not including the zeroth and first
42                                    positions */
43 
44     /* compute the backbone cell to update */
45 
46     index = ComputeTableIndex(wordsize, charsize, seq);
47 
48     /* if backbone cell is null, initialize a new chain */
49     if (backbone[index] == NULL) {
50         chain_size = 8;
51         hits_in_chain = 0;
52         chain = (Int4 *) malloc(chain_size * sizeof(Int4));
53         ASSERT(chain != NULL);
54         chain[0] = chain_size;
55         chain[1] = hits_in_chain;
56         backbone[index] = chain;
57     } else {
58         /* otherwise, use the existing chain */
59         chain = backbone[index];
60         chain_size = chain[0];
61         hits_in_chain = chain[1];
62     }
63 
64     /* if the chain is full, allocate more room */
65     if ((hits_in_chain + 2) == chain_size) {
66         chain_size = chain_size * 2;
67         chain = (Int4 *) realloc(chain, chain_size * sizeof(Int4));
68         ASSERT(chain != NULL);
69 
70         backbone[index] = chain;
71         chain[0] = chain_size;
72     }
73 
74     /* add the hit */
75     chain[chain[1] + 2] = query_offset;
76     chain[1]++;
77 }
78 
BlastLookupIndexQueryExactMatches(Int4 ** backbone,Int4 word_length,Int4 charsize,Int4 lut_word_length,BLAST_SequenceBlk * query,BlastSeqLoc * locations)79 void BlastLookupIndexQueryExactMatches(Int4 **backbone,
80                                       Int4 word_length,
81                                       Int4 charsize,
82                                       Int4 lut_word_length,
83                                       BLAST_SequenceBlk * query,
84                                       BlastSeqLoc * locations)
85 {
86     BlastSeqLoc *loc;
87     Int4 offset;
88     Uint1 *seq;
89     Uint1 *word_target;
90     Uint1 invalid_mask = 0xff << charsize;
91 
92     for (loc = locations; loc; loc = loc->next) {
93         Int4 from = loc->ssr->left;
94         Int4 to = loc->ssr->right;
95 
96         /* if this location is too small to fit a complete word, skip the
97            location */
98 
99         if (word_length > to - from + 1)
100             continue;
101 
102         /* Indexing proceeds from the start point to the last offset
103            such that a full lookup table word can be created. word_target
104            points to the letter beyond which indexing is allowed */
105         seq = query->sequence + from;
106         word_target = seq + lut_word_length;
107 
108         for (offset = from; offset <= to; offset++, seq++) {
109 
110             if (seq >= word_target) {
111                 BlastLookupAddWordHit(backbone,
112                                       lut_word_length, charsize,
113                                       seq - lut_word_length,
114                                       offset - lut_word_length);
115             }
116 
117             /* if the current word contains an ambiguity, skip all the
118                words that would contain that ambiguity */
119             if (*seq & invalid_mask)
120                 word_target = seq + lut_word_length + 1;
121         }
122 
123         /* handle the last word, without loading *seq */
124         if (seq >= word_target) {
125             BlastLookupAddWordHit(backbone,
126                                   lut_word_length, charsize,
127                                   seq - lut_word_length,
128                                   offset - lut_word_length);
129         }
130 
131     }
132 }
133 
134 
BackboneCellFree(BackboneCell * cell)135 BackboneCell* BackboneCellFree(BackboneCell* cell)
136 {
137     BackboneCell* b = cell;
138     while (b) {
139         BackboneCell* next = b->next;
140         sfree(b);
141         b = next;
142     }
143 
144     return NULL;
145 }
146 
147 
BackboneCellNew(Uint4 word,Int4 offset)148 BackboneCell* BackboneCellNew(Uint4 word, Int4 offset)
149 {
150     BackboneCell* cell = calloc(1, sizeof(BackboneCell));
151     if (!cell) {
152         BackboneCellFree(cell);
153         return NULL;
154     }
155 
156     BackboneCellInit(cell, word, offset);
157     return cell;
158 }
159 
160 
BackboneCellInit(BackboneCell * cell,Uint4 word,Int4 offset)161 Int4 BackboneCellInit(BackboneCell* cell, Uint4 word, Int4 offset)
162 {
163     if (!cell) {
164         return -1;
165     }
166 
167     cell->word = word;
168     cell->offset = offset;
169     cell->num_offsets = 1;
170 
171     return 0;
172 }
173 
s_AddWordHit(BackboneCell * backbone,Int4 * offsets,Int4 wordsize,Int4 charsize,Uint1 * seq,Int4 offset,TNaLookupHashFunction hash_func,Uint4 mask,PV_ARRAY_TYPE * pv_array)174 static Int2 s_AddWordHit(BackboneCell* backbone, Int4* offsets, Int4 wordsize,
175                          Int4 charsize, Uint1* seq, Int4 offset,
176                          TNaLookupHashFunction hash_func, Uint4 mask,
177                          PV_ARRAY_TYPE* pv_array)
178 {
179     Uint4 large_index;
180     Int8 index;
181     Int4 i;
182 
183     /* convert a sequence from 4NA to 2NA */
184     large_index = 0;
185     for (i = 0;i < wordsize;i++) {
186         large_index = (large_index << charsize) | seq[i];
187     }
188 
189     /* if filtering by database word count, then do not add words
190        that do not appear in the database or appear to many times */
191     if (pv_array && !PV_TEST(pv_array, large_index, PV_ARRAY_BTS)) {
192         return 0;
193     }
194 
195     index = (Int8)hash_func((Uint1*)&large_index, mask);
196 
197     /* offset zero indicates end of offset list, so we are storing offset + 1
198        values */
199     offset++;
200 
201     /* if the hash table entry is emtpy, initialize a new cell */
202     if (backbone[index].num_offsets == 0) {
203         BackboneCellInit(&backbone[index], large_index, offset);
204     }
205     else {
206         /* otherwise check if the word was already added */
207         BackboneCell* b = &backbone[index];
208         while (b->next && b->word != large_index) {
209             b = b->next;
210         }
211 
212         /* if word was already added, add the new offset to an existing cell */
213         if (b->word == large_index) {
214             /* word offsets are all sored in a linear array where next offset =
215                offsets[current offset] until offset is zero (similarily to
216                next_pos array in MBLookupTable) */
217 
218             ASSERT(offsets[offset] == 0);
219             offsets[offset] = b->offset;
220             b->offset = offset;
221             b->num_offsets++;
222         }
223         else {
224             /* otherwise creare a new cell */
225             ASSERT(!b->next);
226             b->next = BackboneCellNew(large_index, offset);
227             if (!b->next) {
228                 return -1;
229             }
230         }
231     }
232 
233     return 0;
234 }
235 
236 
BlastHashLookupIndexQueryExactMatches(BackboneCell * backbone,Int4 * offsets,Int4 word_length,Int4 charsize,Int4 lut_word_length,BLAST_SequenceBlk * query,BlastSeqLoc * locations,TNaLookupHashFunction hash_func,Uint4 mask,PV_ARRAY_TYPE * pv_array)237 void BlastHashLookupIndexQueryExactMatches(BackboneCell *backbone,
238                                            Int4* offsets,
239                                            Int4 word_length,
240                                            Int4 charsize,
241                                            Int4 lut_word_length,
242                                            BLAST_SequenceBlk* query,
243                                            BlastSeqLoc* locations,
244                                            TNaLookupHashFunction hash_func,
245                                            Uint4 mask,
246                                            PV_ARRAY_TYPE* pv_array)
247 {
248     BlastSeqLoc *loc;
249     Int4 offset;
250     Uint1 *seq;
251     Uint1 *word_target;
252     Uint1 invalid_mask = 0xff << charsize;
253 
254     for (loc = locations; loc; loc = loc->next) {
255         Int4 from = loc->ssr->left;
256         Int4 to = loc->ssr->right;
257 
258         /* if this location is too small to fit a complete word, skip the
259            location */
260 
261         if (word_length > to - from + 1)
262             continue;
263 
264         /* Indexing proceeds from the start point to the last offset
265            such that a full lookup table word can be created. word_target
266            points to the letter beyond which indexing is allowed */
267         seq = query->sequence + from;
268         word_target = seq + lut_word_length;
269 
270         for (offset = from; offset <= to; offset++, seq++) {
271 
272             if (seq >= word_target) {
273                 s_AddWordHit(backbone,
274                              offsets,
275                              lut_word_length, charsize,
276                              seq - lut_word_length,
277                              offset - lut_word_length,
278                              hash_func, mask,
279                              pv_array);
280             }
281 
282             /* if the current word contains an ambiguity, skip all the
283                words that would contain that ambiguity */
284             if (*seq & invalid_mask)
285                 word_target = seq + lut_word_length + 1;
286         }
287 
288         /* handle the last word, without loading *seq */
289         if (seq >= word_target) {
290             s_AddWordHit(backbone,
291                          offsets,
292                          lut_word_length, charsize,
293                          seq - lut_word_length,
294                          offset - lut_word_length,
295                          hash_func, mask,
296                          pv_array);
297         }
298 
299     }
300 }
301 
302