1 /* $Id: stage3hrdef.h 222929 2020-06-29 16:09:28Z twu $ */ 2 #ifndef STAGE3HRDEF_INCLUDED 3 #define STAGE3HRDEF_INCLUDED 4 5 #include "bool.h" 6 #include "types.h" 7 #include "genomicpos.h" 8 #include "method.h" 9 #include "chrnum.h" 10 #include "iit-read.h" /* For Overlap_T */ 11 #include "stage3hr.h" /* For Hittype_T */ 12 #include "list.h" 13 #include "substring.h" 14 #include "resulthr.h" /* For Pairtype_T */ 15 16 17 /* Note: Substring_T has genomiclength, but not Stage3end_T */ 18 19 /* TODO: Allow a Stage3end_T object to hold solutions for both 20 sensedirs. Then the pairing operations can select the sensedirs 21 that are best */ 22 #define T Stage3end_T 23 struct T { 24 Hittype_T hittype; 25 Method_T method; 26 int level; 27 28 int querylength; /* Needed for overlap and pairlength calculations */ 29 int querylength_adj; /* Adjusted for insertions */ 30 31 /* For transcriptome alignment. For fusions, transcripts corresponding to substring_for_concordance */ 32 List_T transcripts; 33 34 /* For fusions, transcripts corresponding to substring_other */ 35 List_T transcripts_other; 36 37 List_T substrings_1toN; /* query position 1 to N */ 38 List_T substrings_Nto1; /* query position N to 1. Keeps only pointers to the substrings. */ 39 40 List_T junctions_1toN; 41 List_T junctions_Nto1; 42 43 /* if trim_querystart_splicep or trim_queryend_splicep is true, then trim is of type "unknown amb" */ 44 /* if trim_querystart_splicep or trim_queryend_splicep is false, then trim is of type "unknown" */ 45 46 int trim_querystart; /* Used by Stage3end_optimal_score for comparing terminals and non-terminals */ 47 int trim_queryend; 48 49 /* Not clear if we should use mandatory_trims or querystart_chrbound and (querylength - queryend_chrbound) */ 50 int mandatory_trim_querystart; /* trim that is due to extension past chromosomal bounds. Needed for computing coverage */ 51 int mandatory_trim_queryend; 52 53 bool trim_querystart_splicep; 54 bool trim_queryend_splicep; 55 56 int querystart_chrbound; 57 int queryend_chrbound; 58 59 Univcoord_T genomicstart; 60 Univcoord_T genomicend; 61 62 Univcoord_T low; 63 Univcoord_T high; 64 65 Chrpos_T genomiclength; 66 Chrpos_T guided_insertlength; /* Used only by Stage3end_eval_and_sort_guided */ 67 68 bool distant_splice_p; /* Indicates a distant splice (same 69 chromosome or different ones). 70 Used for filtering (e.g., by 71 DISTANT_SPLICE_SPECIAL) */ 72 73 Chrnum_T chrnum; /* Needed for printing paired-end results. A 74 chrnum of 0 indicates a translocation (two 75 different chromosomes), or an alignment that 76 needs to be printed as a translocation 77 (samechr_splice unless --merge_samechr is 78 selected). Used for printing. */ 79 80 Chrnum_T effective_chrnum; /* For determining concordance */ 81 Chrnum_T other_chrnum; /* 0 for non-translocations, and other chrnum besides effective_chrnum for translocations */ 82 Univcoord_T chroffset; 83 Univcoord_T chrhigh; 84 Chrpos_T chrlength; 85 86 bool plusp; 87 int genestrand; 88 89 /* For spliced alignments */ 90 int sensedir; /* a private value */ 91 int sensedir_for_concordance; 92 /* Possibilities: 93 not spliced: sensedir_for_concordance is NULL. sensedir_private is NULL. 94 regular or transloc splice with certain sense: sensedir_for_concordance is {FORWARD,ANTI}. sensedir(private) is the same {FORWARD,ANTI}. 95 distant splice with uncertain sense: sensedir_for_concordance is NULL. sensedir(private) is {FORWARD,ANTI}. 96 */ 97 98 int nsplices; 99 double splice_score; /* Used by various SPLICE types */ 100 int nindels; /* for indels */ 101 102 int nmismatches_bothdiff; 103 int nmismatches_refdiff; /* Set only for display */ 104 int nsegments; 105 106 int refalt_nmatches_to_trims; 107 int ref_nmatches_to_trims; 108 109 /* Use refalt_score_overall for filtering, because a mismatch in the masked region does not change refalt_score */ 110 /* Also use refalt_score_overall for ranking, so only mismatches in the unmasked regions count */ 111 /* Is there any role for ref_score_overall? */ 112 int ref_score_overall; /* Over entire query, so trimming raises the score */ 113 int refalt_score_overall; /* Over entire query, so trimming raises the score */ 114 int refalt_score_within_trims; /* From one trim to the other */ 115 116 int refalt_nmatches_plus_spliced_trims; /* Includes alts and ambiguous parts after good splice ends */ 117 int ref_nmatches_plus_spliced_trims; 118 119 float mapq_loglik; 120 int mapq_score; 121 int absmq_score; /* Absolute MAPQ, for XQ and X2 flags */ 122 123 124 /* score = querylength - nmatches - penalties */ 125 /* score_posttrim = querylength - nmatches_posttrim - penalties */ 126 /* score (best case) <= score_posttrim (worst case) */ 127 /* In computing concordance, can rank by score to get ambiguous 128 alignments, but keep going until we reach score_posttrim to give 129 complete alignments a chance */ 130 131 /* query: ------ACGTACGaACGa------ (length 24, spliced on left) */ 132 /* ||||||| |||: */ 133 /* ref: ----agACGTACGTACgt------ */ 134 /* mask: nnnnnnACGTACGTACnnnnnnnn */ 135 136 /* refalt_score_overall: -13: -6 for left trim, -6 for right trim, -1 for mismatch (remove) 137 (*) ref_score_overall: -14 = -6 for left trim, -6 for right trim, -2 for mismatches (keep) 138 139 140 (*) refalt_score_within_trims: -1 for mismatches (keep) 141 ref_score_within_trims: -2 for mismatches (remove) 142 143 refalt_score_allowing_spliced_trims: -7 = -6 for right trim, -1 for mismatches (remove) 144 ref_score_allowing_spliced_trims: -8 = -6 for right trim, -2 for mismatches (remove) 145 (*) refalt_nmatches_plus_spliced_trims: 17 = 6 for left trim + 12 in alignment - 1 mismatch 146 (*) ref_nmatches_plus_spliced_trims: 16 = 6 for left trim + 12 in alignment - 2 mismatches 147 148 (*) refalt_nmatches_to_trims: 11 = 12 in alignment - 1 mismatch => (keep) 149 (*) ref_nmatches_to_trims: 10 = 12 in alignment - 2 mismatches (keep) 150 151 Use scores to monitor progress. Use nmatches to compare hits. 152 Order nmatches to favor spliced trims, and then shorter trims 153 Use nmatches_to_trims for optimal_score_final within loci 154 */ 155 156 /* Current */ 157 /* found_score: score_overall */ 158 /* found_score_within_trims: score_within_trims */ 159 /* Stage3end_output_cmp: nmatches_plus_spliced_trims */ 160 /* Stage3pair_output_cmp: nmatches_plus_spliced_trims, score_within_trims */ 161 /* hit_sort_cmp: score_within_trims, nmatches_plus_spliced_trims */ 162 /* hit_equiv_cmp: score_within_trims, nmatches_plus_spliced_trims */ 163 /* Stage3end_hit_goodness_cmp: nmatches_plus_spliced_trims */ 164 /* hitpair_sort_cmp: score_within_trims, nmatches_plus_spliced_trims */ 165 /* hitpair_equiv_cmp: nmatches_plus_spliced_trims */ 166 /* hitpair_goodness_cmp: nmatches_plus_spliced_trims */ 167 /* Stage3end_optimal_score_final: nmatches_to_trims */ 168 169 /* Desired */ 170 /* found_score: ref_score_overall */ 171 /* found_score_within_trims: refalt_score_within_trims */ 172 173 /* Stage3end_output_cmp: refalt_nmatches_plus_spliced_trims, ref_nmismatches_plus_spliced_trims, refalt_score_within_trims */ 174 /* Stage3pair_output_cmp: refalt_nmatches_plus_spliced_trims, ref_nmismatches_plus_spliced_trims, refalt_score_within_trims */ 175 /* hit_sort_cmp: refalt_score_within_trims, ref_score_within_trims, refalt_nmatches_plus_spliced_trims */ 176 /* hit_equiv_cmp: refalt_score_within_trims, ref_score_within_trims, refalt_nmatches_plus_spliced_trims */ 177 /* Stage3end_hit_goodness_cmp: refalt_nmatches_plus_spliced_trims */ 178 /* hitpair_sort_cmp: refalt_score_within_trims, ref_score_within_trims, refalt_nmatches_plus_spliced_trims */ 179 /* hitpair_equiv_cmp: nmatches_plus_spliced_trims */ 180 /* hitpair_goodness_cmp: nmatches_plus_spliced_trims */ 181 /* Stage3end_optimal_score_final: refalt_nmatches_plus_spliced_trims */ 182 /* Stage3pair_optimal_score_final: refalt_nmatches_plus_spliced_trims */ 183 184 185 int score_eventrim; /* Temporary storage used by Stage3end_optimal_score */ 186 187 bool paired_usedp; 188 189 int query_splicepos; /* For splices. Relative to querystart, so different from circularpos */ 190 191 int circularalias; /* -1 if all below chrlength, 0 if straddles or NA (e.g., transloc), and +1 if above */ 192 /* -2 if extends below beginning of circular chromosome, +2 if extends beyond end of second copy */ 193 int circularpos; /* if circularalias == 0, then amount of queryseq below chrlength. Defined relative to low */ 194 195 bool altlocp; 196 }; 197 198 199 struct Stage3pair_T { 200 Pairtype_T pairtype; 201 int genestrand; 202 int sensedir; 203 204 T hit5; /* Always a copy from the original */ 205 T hit3; /* Always a copy from the original */ 206 207 Univcoord_T low; 208 Univcoord_T high; 209 Chrpos_T insertlength; 210 int pair_relationship; 211 int insertlength_expected_sign; /* 1 if in (expected_pairlength_low, expected_pairlength_high), 212 0 if in (expected_pairlength_low, expected_pairlength_very_high), and 213 -1 if < expected_pairlength_low or > expected_pairlength_very_high */ 214 215 Chrpos_T outerlength; 216 217 float mapq_loglik; 218 int mapq_score; 219 int absmq_score; 220 221 /* Add values from hit5 and hit3: */ 222 /* int refalt_nmatches_to_trims; */ 223 /* int ref_nmatches_to_trims; */ 224 /* int ref_score_overall; */ 225 /* int refalt_score_within_trims; */ 226 /* int refalt_nmatches_plus_spliced_trims; */ 227 /* int ref_nmatches_plus_spliced_trims; */ 228 229 int nmismatches; /* querylength - sum of nmatches */ 230 int score_eventrim; /* for storage */ 231 232 /* Overlap_T gene_overlap; */ 233 long int tally; 234 235 #ifdef USE_ABSDIFFLENGTH 236 Chrpos_T absdifflength; 237 #endif 238 #ifdef USE_BINGO 239 bool absdifflength_bingo_p; 240 #endif 241 int dir; /* -1, 0, or +1 */ 242 bool sense_consistent_p; 243 244 int nsplices; 245 246 bool circularp; /* If either hit5 or hit3 are circular */ 247 int alts_status_inside; 248 }; 249 250 #undef T 251 #endif 252 253