1 /* $Id: stage3hrdef.h 222929 2020-06-29 16:09:28Z twu $ */
2 #ifndef STAGE3HRDEF_INCLUDED
3 #define STAGE3HRDEF_INCLUDED
4 
5 #include "bool.h"
6 #include "types.h"
7 #include "genomicpos.h"
8 #include "method.h"
9 #include "chrnum.h"
10 #include "iit-read.h"		/* For Overlap_T */
11 #include "stage3hr.h"		/* For Hittype_T */
12 #include "list.h"
13 #include "substring.h"
14 #include "resulthr.h"		/* For Pairtype_T */
15 
16 
17 /* Note: Substring_T has genomiclength.  (This note formerly said that Stage3end_T does not, but the struct below declares a genomiclength field.) */
18 
19 /* TODO: Allow a Stage3end_T object to hold solutions for both
20    sensedirs.  Then the pairing operations can select the sensedirs
21    that are best */
22 #define T Stage3end_T
23 struct T {			/* Body of struct Stage3end_T (T is #defined to Stage3end_T just above) */
24   Hittype_T hittype;		/* Hittype_T comes from stage3hr.h */
25   Method_T method;
26   int level;
27 
28   int querylength;		/* Needed for overlap and pairlength calculations */
29   int querylength_adj;		/* Adjusted for insertions */
30 
31   /* For transcriptome alignment.  For fusions, transcripts corresponding to substring_for_concordance */
32   List_T transcripts;
33 
34   /* For fusions, transcripts corresponding to substring_other */
35   List_T transcripts_other;
36 
37   List_T substrings_1toN;	/* query position 1 to N */
38   List_T substrings_Nto1;	/* query position N to 1.  Keeps only pointers to the substrings. */
39 
40   List_T junctions_1toN;
41   List_T junctions_Nto1;
42 
43   /* If trim_querystart_splicep (respectively, trim_queryend_splicep) is true, then the trim at that end is of type "unknown amb" */
44   /* If trim_querystart_splicep (respectively, trim_queryend_splicep) is false, then the trim at that end is of type "unknown" */
45 
46   int trim_querystart; /* Used by Stage3end_optimal_score for comparing terminals and non-terminals */
47   int trim_queryend;
48 
49   /* Not clear if we should use mandatory_trims or querystart_chrbound and (querylength - queryend_chrbound) */
50   int mandatory_trim_querystart; /* Trim that is due to extension past chromosomal bounds.  Needed for computing coverage */
51   int mandatory_trim_queryend;
52 
53   bool trim_querystart_splicep;
54   bool trim_queryend_splicep;
55 
56   int querystart_chrbound;
57   int queryend_chrbound;
58 
59   Univcoord_T genomicstart;
60   Univcoord_T genomicend;
61 
62   Univcoord_T low;
63   Univcoord_T high;
64 
65   Chrpos_T genomiclength;
66   Chrpos_T guided_insertlength; /* Used only by Stage3end_eval_and_sort_guided */
67 
68   bool distant_splice_p;	/* Indicates a distant splice (same
69 				   chromosome or different ones).
70 				   Used for filtering (e.g., by
71 				   DISTANT_SPLICE_SPECIAL) */
72 
73   Chrnum_T chrnum; /* Needed for printing paired-end results.  A
74 		      chrnum of 0 indicates a translocation (two
75 		      different chromosomes), or an alignment that
76 		      needs to be printed as a translocation
77 		      (samechr_splice unless --merge_samechr is
78 		      selected).  Used for printing. */
79 
80   Chrnum_T effective_chrnum;	/* For determining concordance */
81   Chrnum_T other_chrnum;	/* 0 for non-translocations, and other chrnum besides effective_chrnum for translocations */
82   Univcoord_T chroffset;
83   Univcoord_T chrhigh;
84   Chrpos_T chrlength;
85 
86   bool plusp;
87   int genestrand;
88 
89   /* For spliced alignments */
90   int sensedir;			/* a private value */
91   int sensedir_for_concordance;
92   /* Possibilities:
93      not spliced: sensedir_for_concordance is NULL.  sensedir(private) is NULL.
94      regular or transloc splice with certain sense: sensedir_for_concordance is {FORWARD,ANTI}.  sensedir(private) is the same {FORWARD,ANTI}.
95      distant splice with uncertain sense: sensedir_for_concordance is NULL.  sensedir(private) is {FORWARD,ANTI}.
96      (NULL above presumably denotes the "no sense" value rather than a null pointer, since both fields are ints -- TODO confirm.)  */
97 
98   int nsplices;
99   double splice_score;		/* Used by various SPLICE types */
100   int nindels;			/* for indels */
101 
102   int nmismatches_bothdiff;
103   int nmismatches_refdiff;	/* Set only for display */
104   int nsegments;
105 
106   int refalt_nmatches_to_trims;
107   int ref_nmatches_to_trims;
108 
109   /* Use refalt_score_overall for filtering, because a mismatch in the masked region does not change refalt_score */
110   /* Also use refalt_score_overall for ranking, so only mismatches in the unmasked regions count */
111   /* Is there any role for ref_score_overall? */
112   int ref_score_overall;  /* Over entire query, so trimming raises the score */
113   int refalt_score_overall;  /* Over entire query, so trimming raises the score */
114   int refalt_score_within_trims; /* From one trim to the other */
115 
116   int refalt_nmatches_plus_spliced_trims;  /* Includes alts and ambiguous parts after good splice ends */
117   int ref_nmatches_plus_spliced_trims;
118 
119   float mapq_loglik;
120   int mapq_score;
121   int absmq_score;		/* Absolute MAPQ, for XQ and X2 flags */
122 
123 
124   /* score = querylength - nmatches - penalties */
125   /* score_posttrim = querylength - nmatches_posttrim - penalties */
126   /* score (best case) <= score_posttrim (worst case) */
127   /* In computing concordance, can rank by score to get ambiguous
128      alignments, but keep going until we reach score_posttrim to give
129      complete alignments a chance */
130 
131   /* query: ------ACGTACGaACGa------ (length 24, spliced on left) */
132   /*              ||||||| |||:       */
133   /* ref:   ----agACGTACGTACgt------ */
134   /* mask:  nnnnnnACGTACGTACnnnnnnnn */
135 
136   /*     refalt_score_overall: -13 = -6 for left trim, -6 for right trim, -1 for mismatch (remove)
137      (*) ref_score_overall: -14 = -6 for left trim, -6 for right trim, -2 for mismatches (keep)
138 
139 
140      (*) refalt_score_within_trims: -1 for mismatches (keep)
141          ref_score_within_trims: -2 for mismatches (remove)
142 
143          refalt_score_allowing_spliced_trims: -7 = -6 for right trim, -1 for mismatches (remove)
144          ref_score_allowing_spliced_trims: -8 = -6 for right trim, -2 for mismatches (remove)
145      (*) refalt_nmatches_plus_spliced_trims: 17 = 6 for left trim + 12 in alignment - 1 mismatch
146      (*) ref_nmatches_plus_spliced_trims: 16 = 6 for left trim + 12 in alignment - 2 mismatches
147 
148      (*) refalt_nmatches_to_trims: 11 = 12 in alignment - 1 mismatch (keep)
149      (*) ref_nmatches_to_trims: 10 = 12 in alignment - 2 mismatches (keep)
150 
151          Use scores to monitor progress.  Use nmatches to compare hits.
152 	 Order nmatches to favor spliced trims, and then shorter trims
153 	 Use nmatches_to_trims for optimal_score_final within loci
154   */
155 
156   /* Current */
157   /* found_score: score_overall */
158   /* found_score_within_trims: score_within_trims */
159   /* Stage3end_output_cmp: nmatches_plus_spliced_trims */
160   /* Stage3pair_output_cmp: nmatches_plus_spliced_trims, score_within_trims */
161   /* hit_sort_cmp: score_within_trims, nmatches_plus_spliced_trims */
162   /* hit_equiv_cmp: score_within_trims, nmatches_plus_spliced_trims */
163   /* Stage3end_hit_goodness_cmp: nmatches_plus_spliced_trims */
164   /* hitpair_sort_cmp: score_within_trims, nmatches_plus_spliced_trims */
165   /* hitpair_equiv_cmp: nmatches_plus_spliced_trims */
166   /* hitpair_goodness_cmp: nmatches_plus_spliced_trims */
167   /* Stage3end_optimal_score_final: nmatches_to_trims */
168 
169   /* Desired */
170   /* found_score: ref_score_overall */
171   /* found_score_within_trims: refalt_score_within_trims */
172 
173   /* Stage3end_output_cmp: refalt_nmatches_plus_spliced_trims, ref_nmismatches_plus_spliced_trims, refalt_score_within_trims */
174   /* Stage3pair_output_cmp: refalt_nmatches_plus_spliced_trims, ref_nmismatches_plus_spliced_trims, refalt_score_within_trims */
175   /* hit_sort_cmp: refalt_score_within_trims, ref_score_within_trims, refalt_nmatches_plus_spliced_trims */
176   /* hit_equiv_cmp: refalt_score_within_trims, ref_score_within_trims, refalt_nmatches_plus_spliced_trims */
177   /* Stage3end_hit_goodness_cmp: refalt_nmatches_plus_spliced_trims */
178   /* hitpair_sort_cmp: refalt_score_within_trims, ref_score_within_trims, refalt_nmatches_plus_spliced_trims */
179   /* hitpair_equiv_cmp: nmatches_plus_spliced_trims */
180   /* hitpair_goodness_cmp: nmatches_plus_spliced_trims */
181   /* Stage3end_optimal_score_final: refalt_nmatches_plus_spliced_trims */
182   /* Stage3pair_optimal_score_final: refalt_nmatches_plus_spliced_trims */
183 
184 
185   int score_eventrim;		/* Temporary storage used by Stage3end_optimal_score */
186 
187   bool paired_usedp;
188 
189   int query_splicepos;		/* For splices.  Relative to querystart, so different from circularpos */
190 
191   int circularalias;			/* -1 if all below chrlength, 0 if straddles or NA (e.g., transloc), and +1 if above */
192                                 /* -2 if extends below beginning of circular chromosome, +2 if extends beyond end of second copy */
193   int circularpos;		/* if circularalias == 0, then amount of queryseq below chrlength.  Defined relative to low */
194 
195   bool altlocp;
196 };
197 
198 
199 struct Stage3pair_T {		/* Pair-level record that holds two T (Stage3end_T) values, hit5 and hit3, plus values for the pair */
200   Pairtype_T pairtype;		/* Pairtype_T comes from resulthr.h */
201   int genestrand;
202   int sensedir;
203 
204   T hit5;			/* Always a copy from the original */
205   T hit3;			/* Always a copy from the original */
206 
207   Univcoord_T low;
208   Univcoord_T high;
209   Chrpos_T insertlength;
210   int pair_relationship;
211   int insertlength_expected_sign;	/* 1 if in (expected_pairlength_low, expected_pairlength_high),
212 					   0 if in (expected_pairlength_low, expected_pairlength_very_high), and
213 					   -1 if < expected_pairlength_low or > expected_pairlength_very_high.  NOTE(review): the ranges for 1 and 0 overlap as written; presumably 1 takes precedence -- confirm */
214 
215   Chrpos_T outerlength;
216 
217   float mapq_loglik;
218   int mapq_score;
219   int absmq_score;
220 
221   /* Add values from hit5 and hit3 (the fields listed below are commented out, so they are not members of this struct): */
222   /* int refalt_nmatches_to_trims; */
223   /* int ref_nmatches_to_trims; */
224   /* int ref_score_overall; */
225   /* int refalt_score_within_trims; */
226   /* int refalt_nmatches_plus_spliced_trims; */
227   /* int ref_nmatches_plus_spliced_trims; */
228 
229   int nmismatches;		/* querylength - sum of nmatches */
230   int score_eventrim;		/* for storage */
231 
232   /* Overlap_T gene_overlap; */
233   long int tally;
234 
235 #ifdef USE_ABSDIFFLENGTH
236   Chrpos_T absdifflength;
237 #endif
238 #ifdef USE_BINGO
239   bool absdifflength_bingo_p;
240 #endif
241   int dir;			/* -1, 0, or +1 */
242   bool sense_consistent_p;
243 
244   int nsplices;
245 
246   bool circularp;		/* If either hit5 or hit3 is circular */
247   int alts_status_inside;
248 };
249 
250 #undef T
251 #endif
252 
253