1 /* $Id: jumper.h,v 1.2 2016/06/21 13:54:13 fukanchi Exp $ 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government have not placed any restriction on its use or reproduction. 13 * 14 * Although all reasonable efforts have been taken to ensure the accuracy 15 * and reliability of the software and data, the NLM and the U.S. 16 * Government do not and cannot warrant the performance or results that 17 * may be obtained by using this software or data. The NLM and the U.S. 18 * Government disclaim all warranties, express or implied, including 19 * warranties of performance, merchantability or fitness for any particular 20 * purpose. 21 * 22 * Please cite the author in any work or product based on this material. 23 * 24 * =========================================================================== 25 * 26 * Author: Greg Boratyn 27 * 28 * Jumper alignment 29 * 30 */ 31 32 #include <algo/blast/core/ncbi_std.h> 33 #include <algo/blast/core/blast_util.h> 34 #include <algo/blast/core/blast_def.h> 35 #include <algo/blast/core/blast_gapalign.h> 36 #include <algo/blast/core/blast_parameters.h> 37 #include <algo/blast/core/blast_extend.h> 38 #include <algo/blast/core/gapinfo.h> 39 #include <algo/blast/core/blast_nalookup.h> 40 41 42 #ifdef __cplusplus 43 extern "C" { 44 #endif 45 46 typedef struct { int dcp, dcq, lng, ok; } JUMP; 47 48 /* Simple edit script for jumper: represented as integer n: 49 n > 0: n matches 50 n == 0: a single mismatch 51 n < 0: a single insertion or deletion */ 52 #define JUMPER_MISMATCH (0) 53 #define JUMPER_INSERTION (-1) 54 #define JUMPER_DELETION (-2) 55 56 /** Jumper edit script operation */ 57 typedef Int2 JumperOpType; 58 59 /** Internal alignment edit script */ 60 typedef struct JumperPrelimEditBlock 61 { 62 JumperOpType* edit_ops; 63 Int4 num_ops; 64 Int4 num_allocated; 65 } JumperPrelimEditBlock; 66 67 68 /** Gapped alignment data needed for jumper */ 69 struct JumperGapAlign 70 { 71 JumperPrelimEditBlock* left_prelim_block; 72 JumperPrelimEditBlock* right_prelim_block; 73 Uint4* table; /**< Table used for matching 4 bases in compressed subject 74 to 4 bases in uncompressed query */ 75 }; 76 77 78 JumperGapAlign* JumperGapAlignFree(JumperGapAlign* jumper_align); 79 JumperGapAlign* JumperGapAlignNew(Int4 size); 80 Int4 JumperPrelimEditBlockAdd(JumperPrelimEditBlock* block, JumperOpType op); 81 82 83 /** Single alignment error */ 84 typedef struct JumperEdit 85 { 86 Int4 query_pos; /**< Query position*/ 87 Uint1 query_base; /**< Query base at this position */ 88 Uint1 subject_base; /**< Subject base at this position */ 89 } JumperEdit; 90 91 92 /** Alignment edit script for gapped alignment */ 93 struct JumperEditsBlock 94 { 95 JumperEdit* edits; 96 Int4 num_edits; 97 }; 98 99 100 JumperEditsBlock* JumperEditsBlockFree(JumperEditsBlock* block); 101 JumperEditsBlock* JumperEditsBlockDup(const JumperEditsBlock* block); 102 JumperEditsBlock* JumperFindEdits(const Uint1* query, const Uint1* subject, 103 BlastGapAlignStruct* gap_align); 104 105 /** Convert Jumper's preliminary edit script to GapEditScript */ 106 GapEditScript* JumperPrelimEditBlockToGapEditScript( 107 JumperPrelimEditBlock* rev_prelim_block, 108 JumperPrelimEditBlock* fwd_prelim_block); 109 110 111 /** Test whether an HSP should be saved */ 112 Boolean JumperGoodAlign(const BlastGapAlignStruct* gap_align, 113 const BlastHitSavingParameters* hit_params, 114 Int4 num_identical, 115 BlastContextInfo* range_info); 116 117 118 /** Right extension with traceback */ 119 Int4 JumperExtendRightWithTraceback( 120 const Uint1* query, const Uint1* subject, 121 int query_length, int subject_length, 122 Int4 match_score, Int4 mismatch_score, 123 Int4 gap_open, Int4 gap_extend, 124 int max_mismatches, int window, 125 Int4* query_ext_len, Int4* subject_ext_len, 126 JumperPrelimEditBlock* edit_script, 127 Int4* num_identical, 128 Boolean left_extension, 129 Int4* ungapped_ext_len, 130 JUMP* jumper); 131 132 133 134 /** Jumper gapped alignment with traceback; 1 base per byte in query, 4 bases 135 per byte in subject */ 136 int JumperGappedAlignmentCompressedWithTraceback(const Uint1* query, 137 const Uint1* subject, 138 Int4 query_length, Int4 subject_length, 139 Int4 query_start, Int4 subject_start, 140 BlastGapAlignStruct* gap_align, 141 const BlastScoringParameters* score_params, 142 Int4* num_identical, 143 Int4* right_ungapped_ext_len); 144 145 146 #define SUBJECT_INDEX_WORD_LENGTH (4) 147 148 /** Index for a chunk of a subject sequence */ 149 typedef struct SubjectIndex 150 { 151 /** Array of lookup tables */ 152 BlastNaLookupTable** lookups; 153 /** Number of bases covered by each lookup table */ 154 Int4 width; 155 /** Number of lookup tables used */ 156 Int4 num_lookups; 157 } SubjectIndex; 158 159 /** Free subject index structure */ 160 SubjectIndex* SubjectIndexFree(SubjectIndex* sindex); 161 162 /** Index a sequence, used for indexing compressed nucleotide subject 163 * sequence. The index consists of a list of lookup tables, each covering 164 * width number of bases. 165 * 166 * @param subject Sequence to be indexed [in] 167 * @param width Number of bases covered by a single lookup table [in] 168 * @param word_size Word size to be used in the lookup table [in] 169 * @return Pointer to the created index 170 */ 171 SubjectIndex* SubjectIndexNew(BLAST_SequenceBlk* subject, Int4 width, 172 Int4 word_size); 173 174 175 /** Iterator over word locations in subject index */ 176 typedef struct SubjectIndexIterator 177 { 178 SubjectIndex* subject_index; 179 Int4 word; 180 Int4 from; 181 Int4 to; 182 Int4 lookup_index; 183 Int4* lookup_pos; 184 Int4 num_words; 185 Int4 word_index; 186 } SubjectIndexIterator; 187 188 189 /** Free memory reserved for subject index word iterator */ 190 SubjectIndexIterator* SubjectIndexIteratorFree(SubjectIndexIterator* it); 191 192 /** Create an iterator for locations of a given word 193 * @param s_index Sequence index [in] 194 * @param word Nucleotide word to be searched [in] 195 * @param from Sequence location to begin search [in] 196 * @param to Sequence location to end search [in] 197 * @return Word location iterator 198 */ 199 SubjectIndexIterator* SubjectIndexIteratorNew(SubjectIndex* s_index, Int4 word, 200 Int4 from, Int4 to); 201 202 /** Return the next location of a word in an indexed sequence 203 * @param it Iterator [in|out] 204 * @return Word location or value less than zero if no more words are found 205 */ 206 Int4 SubjectIndexIteratorNext(SubjectIndexIterator* it); 207 208 /** Return the previous location of a word in an indexed sequence 209 * @param it Iterator [in|out] 210 * @return Word location or value less than zero if no more words are found 211 */ 212 Int4 SubjectIndexIteratorPrev(SubjectIndexIterator* it); 213 214 215 216 /** Extend a list of word hits */ 217 Int4 BlastNaExtendJumper(BlastOffsetPair* offset_pairs, Int4 num_hits, 218 const BlastInitialWordParameters* word_params, 219 const BlastScoringParameters* score_params, 220 const BlastHitSavingParameters* hit_params, 221 LookupTableWrap* lookup_wrap, 222 BLAST_SequenceBlk* query, 223 BLAST_SequenceBlk* subject, 224 BlastQueryInfo* query_info, 225 BlastGapAlignStruct* gap_align, 226 BlastHSPList* hsp_list, 227 Uint4 s_range, 228 SubjectIndex* s_index); 229 230 231 #define MAPPER_SPLICE_SIGNAL (0x80) 232 #define MAPPER_EXON (0x40) 233 #define MAPPER_POLY_A (0x20) 234 #define MAPPER_ADAPTER (0x10) 235 236 /** Find splice signals at the edges of an HSP and save them in the HSP 237 * @param hsp HSP [in|out] 238 * @param query_len Length of the query/read [in] 239 * @param subject Subject sequence [in] 240 * @param subject_len Subject length [in] 241 * @return Zero on success 242 */ 243 int JumperFindSpliceSignals(BlastHSP* hsp, Int4 query_len, 244 const Uint1* subject, Int4 subject_len); 245 246 247 #define MAPPER_FLAGS_PAIR_CONVERGENT (1) 248 #define MAPPER_FLAGS_PAIR_DIVERGENT (1 << 1) 249 #define MAPPER_FLAGS_PAIR_REARRANGED (1 << 2) 250 251 #define MAPPER_FLAGS_PAIR (MAPPER_FLAGS_PAIR_CONVERGENT | MAPPER_FLAGS_PAIR_DIVERGENT | MAPPER_FLAGS_PAIR_REARRANGED) 252 253 254 /* Combine edit scripts into one. Returns the combined edit script, deletes 255 input scripts. */ 256 GapEditScript* GapEditScriptCombine(GapEditScript** edit_script, 257 GapEditScript** append); 258 259 /* Combine two edits blocks into one. Returns the combined block, deletes input 260 blocks */ 261 JumperEditsBlock* JumperEditsBlockCombine(JumperEditsBlock** block, 262 JumperEditsBlock** append); 263 264 /** Structure to save short unaligned subsequences outside an HSP */ 265 struct SequenceOverhangs 266 { 267 Int4 left_len; /**< Length of the left subsequence */ 268 Int4 right_len; /**< Length of the right subsequence */ 269 Uint1* left; /**< Left subsequence */ 270 Uint1* right; /**< Rught subsequence */ 271 }; 272 273 SequenceOverhangs* SequenceOverhangsFree(SequenceOverhangs* overhangs); 274 275 276 #ifdef __cplusplus 277 } 278 #endif 279 280