1 /* $Id: stage3.h 222484 2020-04-22 18:12:24Z twu $ */ 2 #ifndef STAGE3_INCLUDED 3 #define STAGE3_INCLUDED 4 5 typedef struct Stage3middle_T *Stage3middle_T; 6 typedef struct Stage3_T *Stage3_T; 7 8 #include "bool.h" 9 #include "sense.h" 10 #include "chrnum.h" 11 #include "genomicpos.h" 12 #include "types.h" 13 #include "list.h" 14 #include "sequence.h" 15 #include "genome.h" 16 #include "stage2.h" 17 #include "pairpool.h" 18 #include "diagpool.h" 19 #include "cellpool.h" 20 #include "dynprog.h" 21 #include "iit-read-univ.h" 22 #include "iit-read.h" 23 #include "reader.h" /* For cDNAEnd_T */ 24 #include "chimera.h" 25 #include "stopwatch.h" 26 #ifdef PMAP 27 #include "oligoindex_pmap.h" 28 #else 29 #include "oligoindex_hr.h" 30 #endif 31 #include "filestring.h" 32 #include "output.h" /* For Printtype_T */ 33 34 #ifndef GSNAP 35 #include "gregion.h" 36 #endif 37 38 #define EXTRAQUERYGAP 20 39 40 /* POST_CANONICAL is the path_compute_final() step */ 41 /* POST_TRIM is the path_trim() step */ 42 typedef enum {NO_STAGE3DEBUG, POST_STAGE2, POST_SINGLES, POST_INTRONS, 43 POST_HMM, POST_SMOOTHING, POST_DUAL_INTRONS, POST_CYCLES, POST_DUAL_BREAKS, 44 POST_MIDDLE, POST_ENDS, POST_CANONICAL, POST_TRIM, POST_CHANGEPOINT, POST_DISTAL_MEDIAL} Stage3debug_T; 45 typedef enum {NO_ANNOTATION, INSERT_ANNOTATION, KEYVALUE_ANNOTATION} GFF3_fasta_annotation_T; 46 47 48 #define T Stage3_T 49 50 extern void 51 Stage3_setup (bool splicingp_in, bool novelsplicingp_in, bool require_splicedir_p_in, 52 Chrpos_T shortsplicedist_novelend, 53 IIT_T splicesites_iit_in, int *splicesites_divint_crosstable_in, 54 int donor_typeint_in, int acceptor_typeint_in, 55 Univcoord_T *splicesites_in, bool *circularp_in, bool *altlocp_in, 56 Univcoord_T *alias_starts_in, Univcoord_T *alias_ends_in, 57 int min_intronlength_in, int max_deletionlength_in, int min_indel_end_matches_in, 58 int maxpeelback_distalmedial_in, int nullgap_in, 59 int extramaterial_end_in, int extramaterial_paired_in, 60 int extraband_single_in, int extraband_end_in, int extraband_paired_in, 61 int ngap_in, int maxintronlen_in, int maxintronlen_ends_in, int minendexon_in, 62 bool homopolymerp_in, GFF3_fasta_annotation_T gff3_fasta_annotation_type_in, 63 Stage3debug_T stage3debug_in, Univcoord_T genome_totallength_in); 64 65 extern Chrnum_T 66 Stage3middle_chrnum (Stage3middle_T this); 67 extern Univcoord_T 68 Stage3middle_chroffset (Stage3middle_T this); 69 extern Univcoord_T 70 Stage3middle_chrhigh (Stage3middle_T this); 71 extern Chrpos_T 72 Stage3middle_chrlength (Stage3middle_T this); 73 74 extern bool 75 Stage3middle_watsonp (Stage3middle_T this); 76 extern int 77 Stage3middle_genestrand (Stage3middle_T this); 78 79 extern int 80 Stage3middle_goodness (Stage3middle_T this); 81 extern void 82 Stage3middle_free (Stage3middle_T *old); 83 extern int 84 Stage3middle_cmp (const void *a, const void *b); 85 86 extern bool 87 Stage3_chimera_left_p (T this); 88 extern bool 89 Stage3_chimera_right_p (T this); 90 extern bool 91 Stage3_watsonp (T this); 92 extern int 93 Stage3_genestrand (T this); 94 extern int 95 Stage3_cdna_direction (T this); 96 extern int 97 Stage3_sensedir (T this); 98 extern int 99 Stage3_straintype (T this); 100 extern int 101 Stage3_goodness (T this); 102 extern int 103 Stage3_absmq_score (T this); 104 extern int 105 Stage3_mapq_score (T this); 106 extern List_T 107 Stage3_pairs (T this); 108 extern struct Pair_T * 109 Stage3_pairarray (T this); 110 extern int 111 Stage3_npairs (T this); 112 extern int 113 Stage3_matches (T this); 114 extern int 115 Stage3_mismatches (T this); 116 extern int 117 Stage3_indels (T this); 118 119 extern int 120 Stage3_querystart (T this); 121 extern int 122 Stage3_queryend (T this); 123 124 extern bool 125 Stage3_joinable_left_p (T this); 126 extern bool 127 Stage3_joinable_right_p (T this); 128 extern void 129 Stage3_clear_joinable (T this); 130 extern void 131 Stage3_set_joinable_left (T this); 132 extern void 133 Stage3_set_joinable_right (T this); 134 135 extern void 136 Stage3_print_ends (T this); 137 extern Chrnum_T 138 Stage3_chrnum (T this); 139 extern Univcoord_T 140 Stage3_chroffset (T this); 141 extern Univcoord_T 142 Stage3_chrhigh (T this); 143 extern Chrpos_T 144 Stage3_chrlength (T this); 145 extern bool 146 Stage3_altloc_chr (Univcoord_T *alias_start, Univcoord_T *alias_end, T this); 147 extern Chrpos_T 148 Stage3_chrstart (T this); 149 extern Chrpos_T 150 Stage3_chrend (T this); 151 extern Univcoord_T 152 Stage3_genomicstart (T this); 153 extern Univcoord_T 154 Stage3_genomicend (T this); 155 extern void 156 Stage3_set_genomicend (T this, Univcoord_T genomicend); 157 extern int 158 Stage3_circularpos (T this); 159 160 extern int 161 Stage3_translation_start (T this); 162 extern int 163 Stage3_translation_end (T this); 164 extern int 165 Stage3_domain (T this); 166 extern int 167 Stage3_largemargin (int *newstart, int *newend, T this, int queryntlength); 168 169 extern double 170 Stage3_fracidentity (T this); 171 extern Univcoord_T 172 Stage3_genomicpos (T this, int querypos, bool headp); 173 extern int 174 Stage3_chimeric_goodness (int *matches1, int *matches2, T part1, T part2, int breakpoint); 175 176 extern bool 177 Stage3_passes_filter (T this, double min_trimmed_coverage, double min_identity); 178 extern bool 179 Stage3_passes_filter_chimera (Chimera_T chimera, double min_trimmed_coverage, double min_identity); 180 extern int 181 Stage3_cmp (const void *a, const void *b); 182 extern Chrpos_T 183 Stage3_genomiclength (T this); 184 extern int 185 Stage3_position_cmp (const void *a, const void *b); 186 extern int 187 Stage3_querystart_cmp (const void *a, const void *b); 188 extern int 189 Stage3_queryend_cmp (const void *a, const void *b); 190 extern int 191 Stage3_chrnum_cmp (const void *a, const void *b); 192 extern int 193 Stage3_chrnum_querystart_cmp (const void *a, const void *b); 194 extern int 195 Stage3_chrnum_queryend_cmp (const void *a, const void *b); 196 extern int 197 Stage3_identity_cmp (const void *a, const void *b); 198 extern bool 199 Stage3_overlap (T x, T y); 200 201 extern void 202 Stage3_compute_mapq (List_T stage3list); 203 extern void 204 Stage3_count_paths (int *npaths_primary, int *npaths_altloc, List_T stage3list); 205 extern void 206 Stage3_recompute_goodness (List_T stage3list); 207 extern void 208 Stage3_recompute_coverage (List_T stage3list, Sequence_T queryseq); 209 extern void 210 Stage3_free (T *old); 211 212 extern bool 213 Stage3_test_bounds (T this, int minpos, int maxpos); 214 215 #ifdef PMAP 216 extern void 217 Stage3_translate_cdna (T this, Sequence_T queryaaseq, bool strictp); 218 extern void 219 Stage3_backtranslate_cdna (T this); 220 #else 221 extern void 222 Stage3_translate_genomic (T this, int npairs, bool fulllengthp, int cds_startpos, int querylength, 223 bool truncatep, bool strictp); 224 #endif 225 extern void 226 Stage3_translate (T this, 227 #ifdef PMAP 228 Sequence_T queryseq, 229 #endif 230 int querylength, bool fulllengthp, 231 int cds_startpos, bool truncatep, bool strictp); 232 extern void 233 Stage3_translate_chimera (T this, T mate, 234 #ifdef PMAP 235 Sequence_T queryseq, 236 #endif 237 int querylength, bool fulllengthp, 238 int cds_startpos, bool truncatep, bool strictp); 239 extern void 240 Stage3_print_pathsummary (Filestring_T fp, T this, int pathnum, Univ_IIT_T chromosome_iit, Univ_IIT_T contig_iit, 241 IIT_T altstrain_iit, Sequence_T queryseq, char *dbversion); 242 extern void 243 Stage3_print_pslformat_nt (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, Sequence_T usersegment, Sequence_T queryseq); 244 #ifdef PMAP 245 extern void 246 Stage3_print_pslformat_pro (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, Sequence_T usersegment, Sequence_T queryseq, bool strictp); 247 #endif 248 extern void 249 Stage3_print_gff3 (Filestring_T fp, T this, int pathnum, Univ_IIT_T chromosome_iit, Sequence_T usersegment, 250 Sequence_T queryseq, int querylength, Printtype_T printtype, char *sourcename); 251 #ifndef PMAP 252 extern void 253 Stage3_print_bedpe (Filestring_T fp, T this, Univ_IIT_T chromosome_iit); 254 extern void 255 Stage3_print_sam (Filestring_T fp, char *abbrev, T this, int pathnum, int npaths_primary, int npaths_altloc, 256 int absmq_score, int second_absmq, int mapq_score, 257 Univ_IIT_T chromosome_iit, Sequence_T usersegment, 258 Sequence_T queryseq, int chimera_part, Chimera_T chimera, 259 int quality_shift, bool sam_paired_p, char *sam_read_group_id); 260 #endif 261 extern void 262 Stage3_print_iit_map (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, Sequence_T queryseq); 263 extern void 264 Stage3_print_iit_exon_map (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, Sequence_T queryseq); 265 extern void 266 Stage3_print_splicesites (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, Sequence_T queryseq); 267 extern void 268 Stage3_print_introns (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, Sequence_T queryseq); 269 270 extern void 271 Stage3_print_map (Filestring_T fp, T this, IIT_T map_iit, int *map_divint_crosstable, Univ_IIT_T chromosome_iit, 272 int pathnum, bool map_exons_p, bool map_bothstrands_p, int nflanking, bool print_comment_p); 273 extern void 274 Stage3_print_alignment (Filestring_T fp, T this, Genome_T genome, 275 Univ_IIT_T chromosome_iit, Printtype_T printtype, 276 bool continuousp, bool continuous_by_exon_p, bool genomefirstp, 277 int invertmode, bool nointronlenp, int wraplength); 278 279 extern void 280 Stage3_print_coordinates (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, int invertmode); 281 extern void 282 Stage3_print_cdna (Filestring_T fp, T this, int wraplength); 283 284 extern void 285 Stage3_print_protein_genomic (Filestring_T fp, T this, int wraplength); 286 287 extern void 288 Stage3_print_compressed (Filestring_T fp, T this, Sequence_T queryseq, Univ_IIT_T chromosome_iit, 289 char *dbversion, Sequence_T usersegment, int pathnum, int npaths, 290 bool checksump, int chimerapos, int chimeraequivpos, 291 double donor_prob, double acceptor_prob, int chimera_cdna_direction); 292 293 294 extern T 295 Stage3_new (struct Pair_T *pairarray, List_T pairs, int npairs, int goodness, 296 int cdna_direction, int sensedir, 297 int matches, int unknowns, int mismatches, int qopens, int qindels, 298 int topens, int tindels, int ncanonical, int nsemicanonical, int nnoncanonical, 299 Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength, 300 bool watsonp, int genestrand, int querylength, int skiplength, int trimlength, 301 int straintype, char *strain, IIT_T altstrain_iit); 302 303 extern T 304 Stage3_new_from_pairs (List_T pairs, int cdna_direction, bool watsonp, int genestrand, int sensedir, 305 Pairpool_T pairpool, Sequence_T queryseq, int query_subseq_offset, 306 Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength); 307 308 extern bool 309 Stage3_short_alignment_p (struct Pair_T *pairarray, int npairs, int querylength); 310 311 extern bool 312 Stage3_bad_stretch_p (struct Pair_T *pairarray, int npairs, int pos5, int pos3); 313 314 extern int 315 Stage3_good_part (struct Pair_T *pairarray, int npairs, int pos5, int pos3); 316 317 extern Stage3middle_T 318 Stage3_compute_middle (List_T stage2pairs, List_T all_stage2_starts, List_T all_stage2_ends, 319 #ifdef PMAP 320 char *queryaaseq_ptr, 321 #endif 322 char *queryseq_ptr, char *queryuc_ptr, int querylength, 323 Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength, 324 bool watsonp, int genestrand, bool jump_late_p, int maxpeelback, 325 #ifndef GSNAP 326 Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool, 327 #endif 328 Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR, 329 int sense_try); 330 331 extern struct Pair_T * 332 Stage3_compute_ends (int *cdna_direction, int *sensedir, List_T *finalpairs1, int *npairs1, int *goodness1, 333 int *matches1, int *nmatches_posttrim_1, int *max_match_length_1, 334 int *ambig_end_length_5_1, int *ambig_end_length_3_1, 335 Splicetype_T *ambig_splicetype_5_1, Splicetype_T *ambig_splicetype_3_1, 336 double *ambig_prob_5_1, double *ambig_prob_3_1, 337 int *unknowns1, int *mismatches1, int *qopens1, int *qindels1, int *topens1, int *tindels1, 338 int *ncanonical1, int *nsemicanonical1, int *nnoncanonical1, double *avg_splice_score_1, 339 #ifdef GSNAP 340 struct Pair_T **pairarray2, List_T *finalpairs2, int *npairs2, int *goodness2, 341 int *matches2, int *nmatches_posttrim_2, int *max_match_length_2, 342 int *ambig_end_length_5_2, int *ambig_end_length_3_2, 343 Splicetype_T *ambig_splicetype_5_2, Splicetype_T *ambig_splicetype_3_2, 344 double *ambig_prob_5_2, double *ambig_prob_3_2, 345 int *unknowns2, int *mismatches2, int *qopens2, int *qindels2, int *topens2, int *tindels2, 346 int *ncanonical2, int *nsemicanonical2, int *nnoncanonical2, double *avg_splice_score_2, 347 #endif 348 349 Stage3middle_T stage3middle, 350 #ifdef PMAP 351 char *queryaaseq_ptr, 352 #endif 353 char *queryseq_ptr, char *queryuc_ptr, int querylength, 354 int skiplength, int query_subseq_offset, 355 Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high, 356 int maxpeelback, Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR, 357 int sense_filter, Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool); 358 359 extern struct Pair_T * 360 Stage3_compute_one (int *cdna_direction, int *sensedir, 361 List_T *pairs1, int *npairs1, int *goodness1, 362 int *matches1, int *nmatches_posttrim_1, int *max_match_length_1, 363 int *ambig_end_length_5_1, int *ambig_end_length_3_1, 364 Splicetype_T *ambig_splicetype_5_1, Splicetype_T *ambig_splicetype_3_1, 365 double *ambig_prob_5_1, double *ambig_prob_3_1, 366 int *unknowns1, int *mismatches1, int *qopens1, int *qindels1, int *topens1, int *tindels1, 367 int *ncanonical1, int *nsemicanonical1, int *nnoncanonical1, double *avg_splice_score_1, 368 #ifdef GSNAP 369 struct Pair_T **pairarray2, List_T *pairs2, int *npairs2, int *goodness2, 370 int *matches2, int *nmatches_posttrim_2, int *max_match_length_2, 371 int *ambig_end_length_5_2, int *ambig_end_length_3_2, 372 Splicetype_T *ambig_splicetype_5_2, Splicetype_T *ambig_splicetype_3_2, 373 double *ambig_prob_5_2, double *ambig_prob_3_2, 374 int *unknowns2, int *mismatches2, int *qopens2, int *qindels2, int *topens2, int *tindels2, 375 int *ncanonical2, int *nsemicanonical2, int *nnoncanonical2, double *avg_splice_score_2, 376 #endif 377 378 List_T stage2pairs, List_T all_stage2_starts, List_T all_stage2_ends, 379 #ifdef PMAP 380 char *queryaaseq_ptr, 381 #endif 382 char *queryseq_ptr, char *queryuc_ptr, int querylength, 383 int skiplength, int query_subseq_offset, 384 Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, 385 Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high, 386 bool watsonp, int genestrand, bool jump_late_p, 387 int maxpeelback, 388 #ifndef GSNAP 389 Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool, 390 #endif 391 Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR, 392 int sense_try, int sense_filter); 393 394 #ifndef GSNAP 395 extern T 396 Stage3_direct (Gregion_T gregion, 397 #ifdef PMAP 398 Sequence_T queryaaseq, 399 #endif 400 Sequence_T queryseq, Sequence_T queryuc, Pairpool_T pairpool, Genome_T genome, 401 Chrnum_T chrnum, Univcoord_T chroffset, Chrpos_T chrpos, bool watsonp, 402 int ngap, Dynprog_T dynprogL, Dynprog_T dynprogR, 403 int extramaterial_end, int extraband_end); 404 #endif 405 406 extern bool 407 Stage3_mergeable (Stage3_T firstpart, Stage3_T secondpart, int exonexonpos, int queryntlength); 408 409 extern bool 410 Stage3_merge_chimera (T *new_left, T *new_right, T old_left, T old_right, 411 int minpos1, int maxpos1, int minpos2, int maxpos2, 412 Sequence_T queryseq, 413 char *queryseq_ptr, char *queryuc_ptr, Pairpool_T pairpool, 414 Dynprog_T dynprogL, Dynprog_T dynprogR, int maxpeelback); 415 extern void 416 Stage3_extend_right (T this, int goal, int querylength, 417 char *queryseq_ptr, char *queryuc_ptr, 418 bool max_extend_p, Pairpool_T pairpool, 419 int genestrand, int maxpeelback); 420 extern void 421 Stage3_extend_left (T this, int goal, 422 char *queryseq_ptr, char *queryuc_ptr, 423 bool max_extend_p, Pairpool_T pairpool, 424 int genestrand, int maxpeelback); 425 426 extern void 427 Stage3_trim_right (T this, int goal, char *queryseq_ptr, Pairpool_T pairpool); 428 extern void 429 Stage3_trim_left (T this, int goal, char *queryseq_ptr, Pairpool_T pairpool); 430 431 extern T 432 Stage3_merge_local (T old_left, T old_right, 433 int minpos1, int maxpos1, int minpos2, int maxpos2, 434 Sequence_T queryseq, 435 #ifdef PMAP 436 char *queryaaseq_ptr, 437 #endif 438 char *queryseq_ptr, char *queryuc_ptr, 439 #ifndef GSNAP 440 Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool, 441 #endif 442 Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR, 443 int maxpeelback); 444 445 extern List_T 446 Stage3_split (T this, Sequence_T queryseq, Pairpool_T pairpool); 447 448 #ifndef PMAP 449 extern void 450 Stage3_guess_cdna_direction (T this); 451 #endif 452 453 #undef T 454 #endif 455