1 /*  $Id: jumper.h,v 1.2 2016/06/21 13:54:13 fukanchi Exp $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Greg Boratyn
27  *
28  * Jumper alignment
29  *
30  */
31 
32 #include <algo/blast/core/ncbi_std.h>
33 #include <algo/blast/core/blast_util.h>
34 #include <algo/blast/core/blast_def.h>
35 #include <algo/blast/core/blast_gapalign.h>
36 #include <algo/blast/core/blast_parameters.h>
37 #include <algo/blast/core/blast_extend.h>
38 #include <algo/blast/core/gapinfo.h>
39 #include <algo/blast/core/blast_nalookup.h>
40 
41 
42 #ifdef __cplusplus
43 extern "C" {
44 #endif
45 
46 typedef struct { int dcp, dcq, lng, ok; } JUMP;
47 
48 /* Simple edit script for jumper: represented as integer n:
49    n > 0: n matches
50    n == 0: a single mismatch
51    n < 0: a single insertion or deletion */
52 #define JUMPER_MISMATCH (0)
53 #define JUMPER_INSERTION (-1)
54 #define JUMPER_DELETION (-2)
55 
56 /** Jumper edit script operation */
57 typedef Int2 JumperOpType;
58 
59 /** Internal alignment edit script */
60 typedef struct JumperPrelimEditBlock
61 {
62     JumperOpType* edit_ops;
63     Int4 num_ops;
64     Int4 num_allocated;
65 } JumperPrelimEditBlock;
66 
67 
68 /** Gapped alignment data needed for jumper */
69 struct JumperGapAlign
70 {
71     JumperPrelimEditBlock* left_prelim_block;
72     JumperPrelimEditBlock* right_prelim_block;
73     Uint4* table; /**< Table used for matching 4 bases in compressed subject
74                        to 4 bases in uncompressed query */
75 };
76 
77 
78 JumperGapAlign* JumperGapAlignFree(JumperGapAlign* jumper_align);
79 JumperGapAlign* JumperGapAlignNew(Int4 size);
80 Int4 JumperPrelimEditBlockAdd(JumperPrelimEditBlock* block, JumperOpType op);
81 
82 
83 /** Single alignment error */
84 typedef struct JumperEdit
85 {
86     Int4 query_pos;      /**< Query position*/
87     Uint1 query_base;    /**< Query base at this position */
88     Uint1 subject_base;  /**< Subject base at this position */
89 } JumperEdit;
90 
91 
92 /** Alignment edit script for gapped alignment */
93 struct JumperEditsBlock
94 {
95     JumperEdit* edits;
96     Int4 num_edits;
97 };
98 
99 
100 JumperEditsBlock* JumperEditsBlockFree(JumperEditsBlock* block);
101 JumperEditsBlock* JumperEditsBlockDup(const JumperEditsBlock* block);
102 JumperEditsBlock* JumperFindEdits(const Uint1* query, const Uint1* subject,
103                                   BlastGapAlignStruct* gap_align);
104 
105 /** Convert Jumper's preliminary edit script to GapEditScript */
106 GapEditScript* JumperPrelimEditBlockToGapEditScript(
107                                       JumperPrelimEditBlock* rev_prelim_block,
108                                       JumperPrelimEditBlock* fwd_prelim_block);
109 
110 
111 /** Test whether an HSP should be saved */
112 Boolean JumperGoodAlign(const BlastGapAlignStruct* gap_align,
113                         const BlastHitSavingParameters* hit_params,
114                         Int4 num_identical,
115                         BlastContextInfo* range_info);
116 
117 
118 /** Right extension with traceback */
119 Int4 JumperExtendRightWithTraceback(
120                                  const Uint1* query, const Uint1* subject,
121                                  int query_length, int subject_length,
122                                  Int4 match_score, Int4 mismatch_score,
123                                  Int4 gap_open, Int4 gap_extend,
124                                  int max_mismatches, int window,
125                                  Int4* query_ext_len, Int4* subject_ext_len,
126                                  JumperPrelimEditBlock* edit_script,
127                                  Int4* num_identical,
128                                  Boolean left_extension,
129                                  Int4* ungapped_ext_len,
130                                  JUMP* jumper);
131 
132 
133 
134 /** Jumper gapped alignment with traceback; 1 base per byte in query, 4 bases
135     per byte in subject */
136 int JumperGappedAlignmentCompressedWithTraceback(const Uint1* query,
137                                const Uint1* subject,
138                                Int4 query_length, Int4 subject_length,
139                                Int4 query_start, Int4 subject_start,
140                                BlastGapAlignStruct* gap_align,
141                                const BlastScoringParameters* score_params,
142                                Int4* num_identical,
143                                Int4* right_ungapped_ext_len);
144 
145 
146 #define SUBJECT_INDEX_WORD_LENGTH (4)
147 
148 /** Index for a chunk of a subject sequence */
149 typedef struct SubjectIndex
150 {
151     /** Array of lookup tables */
152     BlastNaLookupTable** lookups;
153     /** Number of bases covered by each lookup table */
154     Int4 width;
155     /** Number of lookup tables used */
156     Int4 num_lookups;
157 } SubjectIndex;
158 
159 /** Free subject index structure */
160 SubjectIndex* SubjectIndexFree(SubjectIndex* sindex);
161 
162 /** Index a sequence, used for indexing compressed nucleotide subject
163  *  sequence. The index consists of a list of lookup tables, each covering
164  *  width number of bases.
165  *
166  * @param subject Sequence to be indexed [in]
167  * @param width Number of bases covered by a single lookup table [in]
168  * @param word_size Word size to be used in the lookup table [in]
169  * @return Pointer to the created index
170  */
171 SubjectIndex* SubjectIndexNew(BLAST_SequenceBlk* subject, Int4 width,
172                               Int4 word_size);
173 
174 
175 /** Iterator over word locations in subject index */
176 typedef struct SubjectIndexIterator
177 {
178     SubjectIndex* subject_index;
179     Int4 word;
180     Int4 from;
181     Int4 to;
182     Int4 lookup_index;
183     Int4* lookup_pos;
184     Int4 num_words;
185     Int4 word_index;
186 } SubjectIndexIterator;
187 
188 
189 /** Free memory reserved for subject index word iterator */
190 SubjectIndexIterator* SubjectIndexIteratorFree(SubjectIndexIterator* it);
191 
192 /** Create an iterator for locations of a given word
193  * @param s_index Sequence index [in]
194  * @param word Nucleotide word to be searched [in]
195  * @param from Sequence location to begin search [in]
196  * @param to Sequence location to end search [in]
197  * @return Word location iterator
198  */
199 SubjectIndexIterator* SubjectIndexIteratorNew(SubjectIndex* s_index, Int4 word,
200                                               Int4 from, Int4 to);
201 
202 /** Return the next location of a word in an indexed sequence
203  * @param it Iterator [in|out]
204  * @return Word location or value less than zero if no more words are found
205  */
206 Int4 SubjectIndexIteratorNext(SubjectIndexIterator* it);
207 
208 /** Return the previous location of a word in an indexed sequence
209  * @param it Iterator [in|out]
210  * @return Word location or value less than zero if no more words are found
211  */
212 Int4 SubjectIndexIteratorPrev(SubjectIndexIterator* it);
213 
214 
215 
216 /** Extend a list of word hits */
217 Int4 BlastNaExtendJumper(BlastOffsetPair* offset_pairs, Int4 num_hits,
218                          const BlastInitialWordParameters* word_params,
219                          const BlastScoringParameters* score_params,
220                          const BlastHitSavingParameters* hit_params,
221                          LookupTableWrap* lookup_wrap,
222                          BLAST_SequenceBlk* query,
223                          BLAST_SequenceBlk* subject,
224                          BlastQueryInfo* query_info,
225                          BlastGapAlignStruct* gap_align,
226                          BlastHSPList* hsp_list,
227                          Uint4 s_range,
228                          SubjectIndex* s_index);
229 
230 
231 #define MAPPER_SPLICE_SIGNAL (0x80)
232 #define MAPPER_EXON (0x40)
233 #define MAPPER_POLY_A (0x20)
234 #define MAPPER_ADAPTER (0x10)
235 
236 /** Find splice signals at the edges of an HSP and save them in the HSP
237  * @param hsp HSP [in|out]
238  * @param query_len Length of the query/read [in]
239  * @param subject Subject sequence [in]
240  * @param subject_len Subject length [in]
241  * @return Zero on success
242  */
243 int JumperFindSpliceSignals(BlastHSP* hsp, Int4 query_len,
244                             const Uint1* subject, Int4 subject_len);
245 
246 
247 #define MAPPER_FLAGS_PAIR_CONVERGENT (1)
248 #define MAPPER_FLAGS_PAIR_DIVERGENT  (1 << 1)
249 #define MAPPER_FLAGS_PAIR_REARRANGED (1 << 2)
250 
251 #define MAPPER_FLAGS_PAIR (MAPPER_FLAGS_PAIR_CONVERGENT | MAPPER_FLAGS_PAIR_DIVERGENT | MAPPER_FLAGS_PAIR_REARRANGED)
252 
253 
254 /* Combine edit scripts into one. Returns the combined edit script, deletes
255    input scripts. */
256 GapEditScript* GapEditScriptCombine(GapEditScript** edit_script,
257                                     GapEditScript** append);
258 
259 /* Combine two edits blocks into one. Returns the combined block, deletes input
260    blocks */
261 JumperEditsBlock* JumperEditsBlockCombine(JumperEditsBlock** block,
262                                           JumperEditsBlock** append);
263 
264 /** Structure to save short unaligned subsequences outside an HSP */
265 struct SequenceOverhangs
266 {
267     Int4 left_len;    /**< Length of the left subsequence */
268     Int4 right_len;   /**< Length of the right subsequence */
269     Uint1* left;      /**< Left subsequence */
270     Uint1* right;     /**< Rught subsequence */
271 };
272 
273 SequenceOverhangs* SequenceOverhangsFree(SequenceOverhangs* overhangs);
274 
275 
276 #ifdef __cplusplus
277 }
278 #endif
279 
280