1 /* $Id: stage3.h 222484 2020-04-22 18:12:24Z twu $ */
2 #ifndef STAGE3_INCLUDED
3 #define STAGE3_INCLUDED
4 
/* Pointer typedefs for the two object types declared by this header.  No
   struct members are declared here, so code including this header sees only
   the pointer types. */
5 typedef struct Stage3middle_T *Stage3middle_T;
6 typedef struct Stage3_T *Stage3_T;
7 
8 #include "bool.h"
9 #include "sense.h"
10 #include "chrnum.h"
11 #include "genomicpos.h"
12 #include "types.h"
13 #include "list.h"
14 #include "sequence.h"
15 #include "genome.h"
16 #include "stage2.h"
17 #include "pairpool.h"
18 #include "diagpool.h"
19 #include "cellpool.h"
20 #include "dynprog.h"
21 #include "iit-read-univ.h"
22 #include "iit-read.h"
23 #include "reader.h"		/* For cDNAEnd_T */
24 #include "chimera.h"
25 #include "stopwatch.h"
26 #ifdef PMAP
27 #include "oligoindex_pmap.h"
28 #else
29 #include "oligoindex_hr.h"
30 #endif
31 #include "filestring.h"
32 #include "output.h"		/* For Printtype_T */
33 
34 #ifndef GSNAP
35 #include "gregion.h"
36 #endif
37 
/* Integer constant 20.  NOTE(review): its use is not visible in this header;
   presumably an allowance on gaps in the query -- confirm against the
   implementation. */
38 #define EXTRAQUERYGAP 20
39 
/* Stage3debug_T: NO_STAGE3DEBUG is the first enumerator; the POST_* values
   name individual steps (see the two notes below).  A value of this type is
   passed to Stage3_setup() as stage3debug_in.  NOTE(review): presumably
   selects the step after which processing stops for debugging -- confirm. */
40 /* POST_CANONICAL is the path_compute_final() step */
41 /* POST_TRIM is the path_trim() step */
42 typedef enum {NO_STAGE3DEBUG, POST_STAGE2, POST_SINGLES, POST_INTRONS,
43 	      POST_HMM, POST_SMOOTHING, POST_DUAL_INTRONS, POST_CYCLES, POST_DUAL_BREAKS,
44 	      POST_MIDDLE, POST_ENDS, POST_CANONICAL, POST_TRIM, POST_CHANGEPOINT, POST_DISTAL_MEDIAL} Stage3debug_T;
/* GFF3_fasta_annotation_T: three-valued option passed to Stage3_setup() as
   gff3_fasta_annotation_type_in.  NOTE(review): presumably controls how
   annotation is written in GFF3 output -- confirm. */
45 typedef enum {NO_ANNOTATION, INSERT_ANNOTATION, KEYVALUE_ANNOTATION} GFF3_fasta_annotation_T;
46 
47 
/* Within this header, T abbreviates Stage3_T in the prototypes that follow;
   it is #undef'd again just before the closing include guard. */
48 #define T Stage3_T
49 
/* Stage3_setup: receives configuration values (splicing flags, splice-site
   IIT and tables, circular/altloc arrays, length and band limits, the GFF3
   annotation mode, the debug step, and the total genome length) and returns
   nothing.
   NOTE(review): presumably copies these into state private to the
   implementation and must be called before other Stage3 functions --
   confirm.  Ownership of the pointer arguments is not visible here. */
50 extern void
51 Stage3_setup (bool splicingp_in, bool novelsplicingp_in, bool require_splicedir_p_in,
52 	      Chrpos_T shortsplicedist_novelend,
53 	      IIT_T splicesites_iit_in, int *splicesites_divint_crosstable_in,
54 	      int donor_typeint_in, int acceptor_typeint_in,
55 	      Univcoord_T *splicesites_in, bool *circularp_in, bool *altlocp_in,
56 	      Univcoord_T *alias_starts_in, Univcoord_T *alias_ends_in,
57 	      int min_intronlength_in, int max_deletionlength_in, int min_indel_end_matches_in,
58 	      int maxpeelback_distalmedial_in, int nullgap_in,
59 	      int extramaterial_end_in, int extramaterial_paired_in,
60 	      int extraband_single_in, int extraband_end_in, int extraband_paired_in,
61 	      int ngap_in, int maxintronlen_in, int maxintronlen_ends_in, int minendexon_in,
62 	      bool homopolymerp_in, GFF3_fasta_annotation_T gff3_fasta_annotation_type_in,
63 	      Stage3debug_T stage3debug_in, Univcoord_T genome_totallength_in);
64 
/* Functions on a Stage3middle_T.
   The first six take the object and return one value of the declared type
   (chromosome number, offset, high coordinate, length, strand flag, gene
   strand).  NOTE(review): presumably plain field accessors -- confirm. */
65 extern Chrnum_T
66 Stage3middle_chrnum (Stage3middle_T this);
67 extern Univcoord_T
68 Stage3middle_chroffset (Stage3middle_T this);
69 extern Univcoord_T
70 Stage3middle_chrhigh (Stage3middle_T this);
71 extern Chrpos_T
72 Stage3middle_chrlength (Stage3middle_T this);
73
74 extern bool
75 Stage3middle_watsonp (Stage3middle_T this);
76 extern int
77 Stage3middle_genestrand (Stage3middle_T this);
78
/* Stage3middle_free takes the address of the handle.  Stage3middle_cmp has
   the (const void *, const void *) -> int signature accepted by qsort().
   NOTE(review): what the void pointers point at (objects or handles) and the
   sort order are not visible here -- check the implementation. */
79 extern int
80 Stage3middle_goodness (Stage3middle_T this);
81 extern void
82 Stage3middle_free (Stage3middle_T *old);
83 extern int
84 Stage3middle_cmp (const void *a, const void *b);
85 
/* Single-argument queries on a Stage3_T (T).  Each takes the object and
   returns one value of the declared type: chimera flags, strand and
   direction values, scores, the pair list / pair array and its length, and
   match/mismatch/indel counts.
   NOTE(review): presumably side-effect-free field accessors, and the
   returned List_T / struct Pair_T * presumably remain owned by the object --
   confirm against the implementation. */
86 extern bool
87 Stage3_chimera_left_p (T this);
88 extern bool
89 Stage3_chimera_right_p (T this);
90 extern bool
91 Stage3_watsonp (T this);
92 extern int
93 Stage3_genestrand (T this);
94 extern int
95 Stage3_cdna_direction (T this);
96 extern int
97 Stage3_sensedir (T this);
98 extern int
99 Stage3_straintype (T this);
100 extern int
101 Stage3_goodness (T this);
102 extern int
103 Stage3_absmq_score (T this);
104 extern int
105 Stage3_mapq_score (T this);
106 extern List_T
107 Stage3_pairs (T this);
108 extern struct Pair_T *
109 Stage3_pairarray (T this);
110 extern int
111 Stage3_npairs (T this);
112 extern int
113 Stage3_matches (T this);
114 extern int
115 Stage3_mismatches (T this);
116 extern int
117 Stage3_indels (T this);
118 
/* Return an int for the given Stage3_T.  NOTE(review): presumably the first
   and last query positions covered by the alignment; whether they are 0- or
   1-based is not visible here -- confirm. */
119 extern int
120 Stage3_querystart (T this);
121 extern int
122 Stage3_queryend (T this);
123 
/* "Joinable" flags on a Stage3_T: two bool queries (left, right), one clear
   function and two set functions, all taking only the object.
   NOTE(review): presumably Stage3_clear_joinable resets both sides --
   confirm. */
124 extern bool
125 Stage3_joinable_left_p (T this);
126 extern bool
127 Stage3_joinable_right_p (T this);
128 extern void
129 Stage3_clear_joinable (T this);
130 extern void
131 Stage3_set_joinable_left (T this);
132 extern void
133 Stage3_set_joinable_right (T this);
134 
/* Chromosome and genomic-position functions on a Stage3_T.
   - Stage3_print_ends takes only the object and returns nothing; its output
     destination is not visible here.
   - Stage3_altloc_chr returns bool and takes two Univcoord_T pointers ahead
     of the object.  NOTE(review): presumably out-parameters filled when the
     result is true -- confirm.
   - Stage3_set_genomicend is the only function in this group that takes a
     value to store.
   Chrpos_T results are presumably chromosome-relative and Univcoord_T results
   genome-wide coordinates -- TODO confirm. */
135 extern void
136 Stage3_print_ends (T this);
137 extern Chrnum_T
138 Stage3_chrnum (T this);
139 extern Univcoord_T
140 Stage3_chroffset (T this);
141 extern Univcoord_T
142 Stage3_chrhigh (T this);
143 extern Chrpos_T
144 Stage3_chrlength (T this);
145 extern bool
146 Stage3_altloc_chr (Univcoord_T *alias_start, Univcoord_T *alias_end, T this);
147 extern Chrpos_T
148 Stage3_chrstart (T this);
149 extern Chrpos_T
150 Stage3_chrend (T this);
151 extern Univcoord_T
152 Stage3_genomicstart (T this);
153 extern Univcoord_T
154 Stage3_genomicend (T this);
155 extern void
156 Stage3_set_genomicend (T this, Univcoord_T genomicend);
157 extern int
158 Stage3_circularpos (T this);
159 
/* Integer queries on a Stage3_T.  Stage3_largemargin additionally takes two
   int pointers (newstart, newend) and the query length in nucleotides.
   NOTE(review): newstart/newend are presumably out-parameters and the
   return value presumably the size of the larger unaligned margin --
   confirm. */
160 extern int
161 Stage3_translation_start (T this);
162 extern int
163 Stage3_translation_end (T this);
164 extern int
165 Stage3_domain (T this);
166 extern int
167 Stage3_largemargin (int *newstart, int *newend, T this, int queryntlength);
168 
/* Stage3_fracidentity returns a double for the object.
   Stage3_genomicpos returns a Univcoord_T for a query position; headp is a
   bool selector whose meaning is not visible here.
   Stage3_chimeric_goodness takes two Stage3_T parts and a breakpoint, returns
   an int, and has two int pointers (matches1, matches2).  NOTE(review):
   presumably out-parameters giving per-part match counts -- confirm. */
169 extern double
170 Stage3_fracidentity (T this);
171 extern Univcoord_T
172 Stage3_genomicpos (T this, int querypos, bool headp);
173 extern int
174 Stage3_chimeric_goodness (int *matches1, int *matches2, T part1, T part2, int breakpoint);
175 
/* Filters and comparators.
   - Stage3_passes_filter / Stage3_passes_filter_chimera return bool given a
     minimum trimmed coverage and a minimum identity (both double); the second
     takes a Chimera_T instead of a Stage3_T.
   - The *_cmp functions all have the (const void *, const void *) -> int
     signature accepted by qsort().  NOTE(review): whether the void pointers
     address Stage3_T handles or objects, and the sort direction, are not
     visible here -- check the implementation before use.
   - Stage3_genomiclength returns a Chrpos_T for one object.
   - Stage3_overlap returns bool for a pair of objects. */
176 extern bool
177 Stage3_passes_filter (T this, double min_trimmed_coverage, double min_identity);
178 extern bool
179 Stage3_passes_filter_chimera (Chimera_T chimera, double min_trimmed_coverage, double min_identity);
180 extern int
181 Stage3_cmp (const void *a, const void *b);
182 extern Chrpos_T
183 Stage3_genomiclength (T this);
184 extern int
185 Stage3_position_cmp (const void *a, const void *b);
186 extern int
187 Stage3_querystart_cmp (const void *a, const void *b);
188 extern int
189 Stage3_queryend_cmp (const void *a, const void *b);
190 extern int
191 Stage3_chrnum_cmp (const void *a, const void *b);
192 extern int
193 Stage3_chrnum_querystart_cmp (const void *a, const void *b);
194 extern int
195 Stage3_chrnum_queryend_cmp (const void *a, const void *b);
196 extern int
197 Stage3_identity_cmp (const void *a, const void *b);
198 extern bool
199 Stage3_overlap (T x, T y);
200 
/* Operations over a List_T named stage3list (presumably a list of Stage3_T --
   confirm).  All return void; Stage3_count_paths reports through two int
   pointers (npaths_primary, npaths_altloc), and Stage3_recompute_coverage
   also takes the query sequence.
   Stage3_free takes the address of a handle.  NOTE(review): presumably
   releases the object and resets *old -- confirm. */
201 extern void
202 Stage3_compute_mapq (List_T stage3list);
203 extern void
204 Stage3_count_paths (int *npaths_primary, int *npaths_altloc, List_T stage3list);
205 extern void
206 Stage3_recompute_goodness (List_T stage3list);
207 extern void
208 Stage3_recompute_coverage (List_T stage3list, Sequence_T queryseq);
209 extern void
210 Stage3_free (T *old);
211 
/* Returns bool for an object and an int range [minpos, maxpos].
   NOTE(review): which coordinate the bounds refer to, and whether they are
   inclusive, is not visible here -- confirm. */
212 extern bool
213 Stage3_test_bounds (T this, int minpos, int maxpos);
214 
/* Translation entry points.  The set of declarations and their parameter
   lists depend on PMAP:
   - with PMAP: Stage3_translate_cdna and Stage3_backtranslate_cdna are
     declared, and Stage3_translate / Stage3_translate_chimera take an extra
     Sequence_T queryseq parameter;
   - without PMAP: Stage3_translate_genomic is declared instead.
   All return void.  NOTE(review): presumably they store the translation in
   the object(s) passed -- confirm. */
215 #ifdef PMAP
216 extern void
217 Stage3_translate_cdna (T this, Sequence_T queryaaseq, bool strictp);
218 extern void
219 Stage3_backtranslate_cdna (T this);
220 #else
221 extern void
222 Stage3_translate_genomic (T this, int npairs, bool fulllengthp, int cds_startpos, int querylength,
223 			  bool truncatep, bool strictp);
224 #endif
225 extern void
226 Stage3_translate (T this,
227 #ifdef PMAP
228 		  Sequence_T queryseq,
229 #endif
230 		  int querylength, bool fulllengthp,
231 		  int cds_startpos, bool truncatep, bool strictp);
232 extern void
233 Stage3_translate_chimera (T this, T mate,
234 #ifdef PMAP
235 			  Sequence_T queryseq,
236 #endif
237 			  int querylength, bool fulllengthp,
238 			  int cds_startpos, bool truncatep, bool strictp);
/* Output functions, first set.  Each returns void and takes a Filestring_T fp
   as its first parameter (presumably the output destination -- confirm)
   followed by the Stage3_T, except Stage3_print_sam, which takes
   char *abbrev between fp and the object.
   Conditional declarations: Stage3_print_pslformat_pro only with PMAP;
   Stage3_print_bedpe and Stage3_print_sam only without PMAP. */
239 extern void
240 Stage3_print_pathsummary (Filestring_T fp, T this, int pathnum, Univ_IIT_T chromosome_iit, Univ_IIT_T contig_iit,
241 			  IIT_T altstrain_iit, Sequence_T queryseq, char *dbversion);
242 extern void
243 Stage3_print_pslformat_nt (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, Sequence_T usersegment, Sequence_T queryseq);
244 #ifdef PMAP
245 extern void
246 Stage3_print_pslformat_pro (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, Sequence_T usersegment, Sequence_T queryseq, bool strictp);
247 #endif
248 extern void
249 Stage3_print_gff3 (Filestring_T fp, T this, int pathnum, Univ_IIT_T chromosome_iit, Sequence_T usersegment,
250 		   Sequence_T queryseq, int querylength, Printtype_T printtype, char *sourcename);
251 #ifndef PMAP
252 extern void
253 Stage3_print_bedpe (Filestring_T fp, T this, Univ_IIT_T chromosome_iit);
254 extern void
255 Stage3_print_sam (Filestring_T fp, char *abbrev, T this, int pathnum, int npaths_primary, int npaths_altloc,
256 		  int absmq_score, int second_absmq, int mapq_score,
257 		  Univ_IIT_T chromosome_iit, Sequence_T usersegment,
258 		  Sequence_T queryseq, int chimera_part, Chimera_T chimera,
259 		  int quality_shift, bool sam_paired_p, char *sam_read_group_id);
260 #endif
261 extern void
262 Stage3_print_iit_map (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, Sequence_T queryseq);
263 extern void
264 Stage3_print_iit_exon_map (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, Sequence_T queryseq);
265 extern void
266 Stage3_print_splicesites (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, Sequence_T queryseq);
267 extern void
268 Stage3_print_introns (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, Sequence_T queryseq);
269 
/* Output functions, second set.  Each returns void and takes a Filestring_T fp
   followed by the Stage3_T; the remaining parameters are formatting options
   and lookup tables (IITs, genome, wrap length, invert mode, ...).
   NOTE(review): the accepted values of invertmode and the meaning of the
   chimera-related arguments of Stage3_print_compressed are not visible in
   this header -- see the implementation. */
270 extern void
271 Stage3_print_map (Filestring_T fp, T this, IIT_T map_iit, int *map_divint_crosstable, Univ_IIT_T chromosome_iit,
272 		  int pathnum, bool map_exons_p, bool map_bothstrands_p, int nflanking, bool print_comment_p);
273 extern void
274 Stage3_print_alignment (Filestring_T fp, T this, Genome_T genome,
275 			Univ_IIT_T chromosome_iit, Printtype_T printtype,
276 			bool continuousp, bool continuous_by_exon_p, bool genomefirstp,
277 			int invertmode, bool nointronlenp, int wraplength);
278
279 extern void
280 Stage3_print_coordinates (Filestring_T fp, T this, Univ_IIT_T chromosome_iit, int invertmode);
281 extern void
282 Stage3_print_cdna (Filestring_T fp, T this, int wraplength);
283
284 extern void
285 Stage3_print_protein_genomic (Filestring_T fp, T this, int wraplength);
286
287 extern void
288 Stage3_print_compressed (Filestring_T fp, T this, Sequence_T queryseq, Univ_IIT_T chromosome_iit,
289 			 char *dbversion, Sequence_T usersegment, int pathnum, int npaths,
290 			 bool checksump, int chimerapos, int chimeraequivpos,
291 			 double donor_prob, double acceptor_prob, int chimera_cdna_direction);
292 
293 
/* Constructors returning a Stage3_T.
   Stage3_new takes a pair array plus list, precomputed counts (matches,
   unknowns, mismatches, query/target opens and indels, intron class counts),
   chromosome information, strand, lengths and strain data.
   NOTE(review): whether the object takes ownership of pairarray / pairs /
   strain, and whether NULL can be returned, is not visible here -- confirm. */
294 extern T
295 Stage3_new (struct Pair_T *pairarray, List_T pairs, int npairs, int goodness,
296 	    int cdna_direction, int sensedir,
297 	    int matches, int unknowns, int mismatches, int qopens, int qindels,
298 	    int topens, int tindels, int ncanonical, int nsemicanonical, int nnoncanonical,
299 	    Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
300 	    bool watsonp, int genestrand, int querylength, int skiplength, int trimlength,
301 	    int straintype, char *strain, IIT_T altstrain_iit);
302
/* Stage3_new_from_pairs takes only a pair list (no counts or pair array) plus
   a Pairpool_T and the query sequence.  NOTE(review): presumably derives the
   counts itself -- confirm. */
303 extern T
304 Stage3_new_from_pairs (List_T pairs, int cdna_direction, bool watsonp, int genestrand, int sensedir,
305 		       Pairpool_T pairpool, Sequence_T queryseq, int query_subseq_offset,
306 		       Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength);
307 
/* Functions over a raw pair array (struct Pair_T * with npairs entries)
   rather than a Stage3_T.  The last two take a pos5/pos3 range.
   NOTE(review): the thresholds used, the coordinate system of pos5/pos3 and
   the unit of Stage3_good_part's int result are not visible here --
   confirm. */
308 extern bool
309 Stage3_short_alignment_p (struct Pair_T *pairarray, int npairs, int querylength);
310
311 extern bool
312 Stage3_bad_stretch_p (struct Pair_T *pairarray, int npairs, int pos5, int pos3);
313
314 extern int
315 Stage3_good_part (struct Pair_T *pairarray, int npairs, int pos5, int pos3);
316 
/* Stage3_compute_middle: returns a Stage3middle_T built from stage-2 pair
   lists, the query sequence pointers, chromosome information and dynamic
   programming workspaces.
   Conditional parameters: queryaaseq_ptr only with PMAP; oligoindices_minor,
   diagpool and cellpool only when GSNAP is not defined.
   NOTE(review): whether NULL can be returned and who frees the result
   (presumably the caller, via Stage3middle_free) is not visible here --
   confirm. */
317 extern Stage3middle_T
318 Stage3_compute_middle (List_T stage2pairs, List_T all_stage2_starts, List_T all_stage2_ends,
319 #ifdef PMAP
320 		       char *queryaaseq_ptr,
321 #endif
322 		       char *queryseq_ptr, char *queryuc_ptr, int querylength,
323 		       Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
324 		       bool watsonp, int genestrand, bool jump_late_p, int maxpeelback,
325 #ifndef GSNAP
326 		       Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool,
327 #endif
328 		       Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
329 		       int sense_try);
330 
/* Stage3_compute_ends: takes a Stage3middle_T plus query and workspace
   arguments and returns a struct Pair_T *.  The leading pointer parameters
   (cdna_direction, sensedir, and the *1 group) are presumably out-parameters
   describing the returned alignment -- confirm.
   With GSNAP defined, a second group (*2, including struct Pair_T
   **pairarray2) is added, presumably describing a second alignment.
   queryaaseq_ptr is present only with PMAP.
   NOTE(review): unlike Stage3_compute_middle and Stage3_compute_one, the
   oligoindices_minor / diagpool / cellpool parameters here are NOT wrapped in
   #ifndef GSNAP -- confirm this is intentional and matches the definition. */
331 extern struct Pair_T *
332 Stage3_compute_ends (int *cdna_direction, int *sensedir, List_T *finalpairs1, int *npairs1, int *goodness1,
333 		     int *matches1, int *nmatches_posttrim_1, int *max_match_length_1,
334 		     int *ambig_end_length_5_1, int *ambig_end_length_3_1,
335 		     Splicetype_T *ambig_splicetype_5_1, Splicetype_T *ambig_splicetype_3_1,
336 		     double *ambig_prob_5_1, double *ambig_prob_3_1,
337 		     int *unknowns1, int *mismatches1, int *qopens1, int *qindels1, int *topens1, int *tindels1,
338 		     int *ncanonical1, int *nsemicanonical1, int *nnoncanonical1, double *avg_splice_score_1,
339 #ifdef GSNAP
340 		     struct Pair_T **pairarray2, List_T *finalpairs2, int *npairs2, int *goodness2,
341 		     int *matches2, int *nmatches_posttrim_2, int *max_match_length_2,
342 		     int *ambig_end_length_5_2, int *ambig_end_length_3_2,
343 		     Splicetype_T *ambig_splicetype_5_2, Splicetype_T *ambig_splicetype_3_2,
344 		     double *ambig_prob_5_2, double *ambig_prob_3_2,
345 		     int *unknowns2, int *mismatches2, int *qopens2, int *qindels2, int *topens2, int *tindels2,
346 		     int *ncanonical2, int *nsemicanonical2, int *nnoncanonical2, double *avg_splice_score_2,
347 #endif
348
349 		     Stage3middle_T stage3middle,
350 #ifdef PMAP
351 		     char *queryaaseq_ptr,
352 #endif
353 		     char *queryseq_ptr, char *queryuc_ptr, int querylength,
354 		     int skiplength, int query_subseq_offset,
355 		     Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
356 		     int maxpeelback, Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
357 		     int sense_filter, Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool);
358 
/* Stage3_compute_one: returns a struct Pair_T * and has the same leading
   pointer groups as Stage3_compute_ends (*1 always, *2 only with GSNAP;
   presumably out-parameters -- confirm), but starts from stage-2 pair lists
   and chromosome information instead of a Stage3middle_T, and takes both
   sense_try and sense_filter.
   Conditional parameters: queryaaseq_ptr only with PMAP; oligoindices_minor,
   diagpool and cellpool only when GSNAP is not defined.
   NOTE(review): presumably combines the middle and ends computations in one
   call; no chrlength parameter is taken here, unlike
   Stage3_compute_middle -- confirm against the implementation. */
359 extern struct Pair_T *
360 Stage3_compute_one (int *cdna_direction, int *sensedir,
361 		    List_T *pairs1, int *npairs1, int *goodness1,
362 		    int *matches1, int *nmatches_posttrim_1, int *max_match_length_1,
363 		    int *ambig_end_length_5_1, int *ambig_end_length_3_1,
364 		    Splicetype_T *ambig_splicetype_5_1, Splicetype_T *ambig_splicetype_3_1,
365 		    double *ambig_prob_5_1, double *ambig_prob_3_1,
366 		    int *unknowns1, int *mismatches1, int *qopens1, int *qindels1, int *topens1, int *tindels1,
367 		    int *ncanonical1, int *nsemicanonical1, int *nnoncanonical1, double *avg_splice_score_1,
368 #ifdef GSNAP
369 		    struct Pair_T **pairarray2, List_T *pairs2, int *npairs2, int *goodness2,
370 		    int *matches2, int *nmatches_posttrim_2, int *max_match_length_2,
371 		    int *ambig_end_length_5_2, int *ambig_end_length_3_2,
372 		    Splicetype_T *ambig_splicetype_5_2, Splicetype_T *ambig_splicetype_3_2,
373 		    double *ambig_prob_5_2, double *ambig_prob_3_2,
374 		    int *unknowns2, int *mismatches2, int *qopens2, int *qindels2, int *topens2, int *tindels2,
375 		    int *ncanonical2, int *nsemicanonical2, int *nnoncanonical2, double *avg_splice_score_2,
376 #endif
377
378 		    List_T stage2pairs, List_T all_stage2_starts, List_T all_stage2_ends,
379 #ifdef PMAP
380 		    char *queryaaseq_ptr,
381 #endif
382 		    char *queryseq_ptr, char *queryuc_ptr, int querylength,
383 		    int skiplength, int query_subseq_offset,
384 		    Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh,
385 		    Univcoord_T knownsplice_limit_low, Univcoord_T knownsplice_limit_high,
386 		    bool watsonp, int genestrand, bool jump_late_p,
387 		    int maxpeelback,
388 #ifndef GSNAP
389 		    Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool,
390 #endif
391 		    Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
392 		    int sense_try, int sense_filter);
393 
/* Stage3_direct: declared only when GSNAP is not defined (Gregion_T comes
   from gregion.h, which is included under the same condition).  Returns a
   Stage3_T from a Gregion_T, the query sequences, a genome and left/right
   dynamic programming workspaces; queryaaseq is taken only with PMAP.
   NOTE(review): presumably builds an alignment directly from the region
   without a stage-2 step -- confirm. */
394 #ifndef GSNAP
395 extern T
396 Stage3_direct (Gregion_T gregion,
397 #ifdef PMAP
398 	       Sequence_T queryaaseq,
399 #endif
400 	       Sequence_T queryseq, Sequence_T queryuc, Pairpool_T pairpool, Genome_T genome,
401 	       Chrnum_T chrnum,  Univcoord_T chroffset, Chrpos_T chrpos, bool watsonp,
402 	       int ngap, Dynprog_T dynprogL, Dynprog_T dynprogR,
403 	       int extramaterial_end, int extraband_end);
404 #endif
405 
/* Returns bool for two Stage3_T parts, an exon-exon position and the query
   length in nucleotides.  The parameters are spelled Stage3_T rather than T;
   the two are the same type while the T macro is defined.
   NOTE(review): the merge criteria are not visible here. */
406 extern bool
407 Stage3_mergeable (Stage3_T firstpart, Stage3_T secondpart, int exonexonpos, int queryntlength);
408 
/* Stage3_merge_chimera: returns bool; new_left / new_right are pointers to
   Stage3_T (presumably outputs set on success -- confirm), old_left /
   old_right are the inputs, and minpos/maxpos pairs bound each part. */
409 extern bool
410 Stage3_merge_chimera (T *new_left, T *new_right, T old_left, T old_right,
411 		      int minpos1, int maxpos1, int minpos2, int maxpos2,
412 		      Sequence_T queryseq,
413 		      char *queryseq_ptr, char *queryuc_ptr, Pairpool_T pairpool,
414 		      Dynprog_T dynprogL, Dynprog_T dynprogR, int maxpeelback);
/* Stage3_extend_right / Stage3_extend_left: return void and take the object
   plus a goal position (presumably modifying the object in place --
   confirm).  Only the right-hand variant takes querylength. */
415 extern void
416 Stage3_extend_right (T this, int goal, int querylength,
417 		     char *queryseq_ptr, char *queryuc_ptr,
418 		     bool max_extend_p, Pairpool_T pairpool,
419 		     int genestrand, int maxpeelback);
420 extern void
421 Stage3_extend_left (T this, int goal,
422 		    char *queryseq_ptr, char *queryuc_ptr,
423 		    bool max_extend_p, Pairpool_T pairpool,
424 		    int genestrand, int maxpeelback);
425 
/* Trim functions: return void and take the object, a goal position, the query
   sequence pointer and a Pairpool_T.  NOTE(review): presumably shorten the
   alignment in place down to goal -- confirm. */
426 extern void
427 Stage3_trim_right (T this, int goal, char *queryseq_ptr, Pairpool_T pairpool);
428 extern void
429 Stage3_trim_left (T this, int goal, char *queryseq_ptr, Pairpool_T pairpool);
430 
/* Stage3_merge_local: returns a Stage3_T from two existing objects and their
   minpos/maxpos ranges.  queryaaseq_ptr is taken only with PMAP;
   oligoindices_minor, diagpool and cellpool only when GSNAP is not defined.
   NOTE(review): whether the result is a new object or one of the inputs, and
   what happens to old_left / old_right, is not visible here -- confirm
   before freeing either. */
431 extern T
432 Stage3_merge_local (T old_left, T old_right,
433 		    int minpos1, int maxpos1, int minpos2, int maxpos2,
434 		    Sequence_T queryseq,
435 #ifdef PMAP
436 		    char *queryaaseq_ptr,
437 #endif
438 		    char *queryseq_ptr, char *queryuc_ptr,
439 #ifndef GSNAP
440 		    Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool,
441 #endif
442 		    Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
443 		    int maxpeelback);
444 
/* Returns a List_T for one object.  NOTE(review): presumably a list of
   Stage3_T pieces, possibly NULL when no split is made -- confirm. */
445 extern List_T
446 Stage3_split (T this, Sequence_T queryseq, Pairpool_T pairpool);
447 
/* Declared only when PMAP is not defined.  Takes the object and returns
   nothing.  NOTE(review): presumably updates the object's cDNA direction --
   confirm. */
448 #ifndef PMAP
449 extern void
450 Stage3_guess_cdna_direction (T this);
451 #endif
452 
453 #undef T
454 #endif
455