1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "overlapInCore.H"
19
20 // Output the overlap between strings S_ID and T_ID which
21 // have lengths S_Len and T_Len , respectively.
22 // The overlap information is in (* olap) .
23 // S_Dir indicates the orientation of S .
24 // T is always forward.
25
26 void
Output_Overlap(uint32 S_ID,int S_Len,Direction_t S_Dir,uint32 T_ID,int T_Len,Olap_Info_t * olap,Work_Area_t * WA)27 Output_Overlap(uint32 S_ID, int S_Len, Direction_t S_Dir,
28 uint32 T_ID, int T_Len, Olap_Info_t *olap,
29 Work_Area_t *WA) {
30
31 ovOverlap *ovs = WA->overlaps + WA->overlapsLen++;
32
33 // Overlap is good for UTG only.
34
35 ovs->dat.ovl.forUTG = true;
36 ovs->dat.ovl.forOBT = false;
37 ovs->dat.ovl.forDUP = false;
38
39 // Set the span of the alignment.
40
41 ovs->dat.ovl.span = (olap->s_hi - olap->s_lo);
42 ovs->dat.ovl.span += (olap->t_hi - olap->t_lo);
43 ovs->dat.ovl.span += (olap->delta_ct);
44 assert((ovs->dat.ovl.span % 2) == 0);
45 ovs->dat.ovl.span /= 2;
46
47 assert (S_ID < T_ID);
48
49 int32 S_Right_Hang = S_Len - olap->s_hi - 1;
50 int32 T_Right_Hang = T_Len - olap->t_hi - 1;
51
52 bool Sleft;
53
54 char orient = 0;
55 int32 ahg = 0;
56 int32 bhg = 0;
57
58 //fprintf(stderr, "S %d %d = %d len %d T %d %d = %d len %d delta_ct %d delta %d\n",
59 // olap->s_lo, olap->s_hi, olap->s_hi - olap->s_lo, S_Len,
60 // olap->t_lo, olap->t_hi, olap->t_hi - olap->t_lo, T_Len,
61 // olap->delta_ct, olap->delta[0]);
62
63 assert(olap->s_lo < olap->s_hi);
64 assert(olap->t_lo < olap->t_hi);
65
66 // Ensure this is a dovetail or contained overlap.
67 assert((olap->s_lo == 0) || (olap->t_lo == 0));
68 assert((olap->s_hi == S_Len - 1) || (olap->t_hi == T_Len - 1));
69
70
71 if ((olap->s_lo > olap->t_lo) ||
72 ((olap->s_lo == olap->t_lo) && (S_Right_Hang > T_Right_Hang))) {
73 assert(olap->t_lo == 0);
74 assert((olap->s_hi == S_Len - 1) || (olap->t_hi == T_Len - 1));
75 Sleft = true;
76 } else {
77 assert(olap->s_lo == 0);
78 assert((olap->s_hi == S_Len - 1) || (olap->t_hi == T_Len - 1));
79 Sleft = false;
80 }
81
82 if (Sleft) {
83 ovs->a_iid = S_ID;
84 ovs->b_iid = T_ID;
85 } else {
86 ovs->a_iid = T_ID;
87 ovs->b_iid = S_ID;
88 }
89
90 //if (Sleft) {
91 // if (S_Right_Hang >= T_Right_Hang)
92 // overlap_type = AS_CONTAINMENT;
93 // else
94 // overlap_type = AS_DOVETAIL;
95 //} else {
96 // if (T_Right_Hang >= S_Right_Hang)
97 // overlap_type = AS_CONTAINMENT;
98 // else
99 // overlap_type = AS_DOVETAIL;
100 //}
101
102 if (Sleft) {
103 orient = (S_Dir == FORWARD) ? 'N' : 'O';
104 ahg = olap->s_lo;
105 bhg = T_Right_Hang - S_Right_Hang;
106
107 } else {
108 orient = (S_Dir == FORWARD) ? 'N' : 'I';
109 ahg = olap->t_lo;
110 bhg = S_Right_Hang - T_Right_Hang;
111 }
112
113 // CMM: Regularize the reverse orientated containment overlaps to a common orientation.
114 //
115 // This catches the case where a reverse orient S (T is always fowrard) is placed
116 // in the A position; we flip the overlap to make S be forward and T be reverse.
117 //
118 if ((orient == 'O') && (S_Right_Hang >= T_Right_Hang)) {
119 orient = 'I';
120 ahg = -(T_Right_Hang - S_Right_Hang);
121 bhg = -(olap->s_lo);
122 }
123
124 ovs->erate(olap->quality);
125
126 switch (orient) {
127 case 'N':
128 ovs->a_hang(ahg);
129 ovs->b_hang(bhg);
130 ovs->dat.ovl.flipped = false;
131 break;
132
133 case 'I':
134 ovs->a_hang(ahg);
135 ovs->b_hang(bhg);
136 ovs->dat.ovl.flipped = true;
137 break;
138
139 case 'O':
140 ovs->a_hang(-bhg);
141 ovs->b_hang(-ahg);
142 ovs->dat.ovl.flipped = true;
143 break;
144
145 case 'A': // Never reached.
146 ovs->a_hang(-bhg);
147 ovs->b_hang(-ahg);
148 ovs->dat.ovl.flipped = false;
149 break;
150 }
151
152 #if OUTPUT_OVERLAP_DELTAS
153 signed char deltas[2 * AS_READ_MAX_NORMAL_LEN];
154 signed char *deltaCursor = deltas;
155
156 if (Sleft == false)
157 for (i = 0; i < olap->delta_ct; i ++)
158 olap->delta [i] *= -1;
159
160 for (int i = 0; i < olap->delta_ct; i ++) {
161 for (int j = abs (olap->delta [i]); j > 0; j -= AS_LONGEST_DELTA) {
162 if (j > AS_LONGEST_DELTA)
163 *deltaCursor++ = AS_LONG_DELTA_CODE;
164 else
165 *deltaCursor++ = j * Sign (olap->delta [i]);
166 }
167 }
168
169 *deltaCursor = AS_ENDOF_DELTA_CODE;
170 #endif
171
172
173 WA->Total_Overlaps ++;
174
175 if (bhg <= 0)
176 WA->Contained_Overlap_Ct ++;
177 else
178 WA->Dovetail_Overlap_Ct ++;
179
180 // Write overlaps if we've saved too many.
181 // They're also written at the end of the thread.
182
183 if (WA->overlapsLen >= WA->overlapsMax)
184 #pragma omp critical
185 {
186 for (int32 zz=0; zz<WA->overlapsLen; zz++)
187 Out_BOF->writeOverlap(WA->overlaps + zz);
188
189 WA->overlapsLen = 0;
190 }
191 }
192
193
194
195 void
Output_Partial_Overlap(uint32 s_id,uint32 t_id,Direction_t dir,const Olap_Info_t * olap,int s_len,int t_len,Work_Area_t * WA)196 Output_Partial_Overlap(uint32 s_id,
197 uint32 t_id,
198 Direction_t dir,
199 const Olap_Info_t *olap,
200 int s_len,
201 int t_len,
202 Work_Area_t *WA) {
203
204 Total_Overlaps++;
205
206 ovOverlap *ovl = WA->overlaps + WA->overlapsLen++;
207
208 assert(s_id < t_id);
209
210 ovl->a_iid = s_id;
211 ovl->b_iid = t_id;
212
213 // Overlap is good for OBT or DUP. It will be refined more during the store build.
214
215 ovl->dat.ovl.forUTG = false;
216 ovl->dat.ovl.forOBT = true;
217 ovl->dat.ovl.forDUP = true;
218
219 // Set the span of the alignment.
220
221 ovl->dat.ovl.span = (olap->s_hi - olap->s_lo);
222 ovl->dat.ovl.span += (olap->t_hi - olap->t_lo);
223 ovl->dat.ovl.span += (olap->delta_ct);
224 assert((ovl->dat.ovl.span % 2) == 0);
225 ovl->dat.ovl.span /= 2;
226
227 // Convert to canonical form with s forward and use space-based
228 // coordinates
229
230 if (dir == FORWARD) {
231 ovl->dat.ovl.ahg5 = (olap->s_lo);
232 ovl->dat.ovl.ahg3 = s_len - (olap->s_hi + 1);
233 ovl->dat.ovl.bhg5 = (olap->t_lo);
234 ovl->dat.ovl.bhg3 = t_len - (olap->t_hi + 1);
235
236 ovl->dat.ovl.flipped = false;
237
238 //assert(a < b);
239 //assert(c < d);
240
241 } else {
242 ovl->dat.ovl.ahg5 = s_len - (olap->s_hi + 1);
243 ovl->dat.ovl.ahg3 = (olap->s_lo);
244 ovl->dat.ovl.bhg5 = t_len - (olap->t_hi + 1);
245 ovl->dat.ovl.bhg3 = (olap->t_lo);
246
247 ovl->dat.ovl.flipped = true;
248
249 //assert(a < b);
250 //assert(c > d); // Reverse!
251 }
252
253 ovl->erate(olap->quality);
254
255 // We also flush the file at the end of a thread
256
257 if (WA->overlapsLen >= WA->overlapsMax) {
258 #pragma omp critical
259 for (int32 zz=0; zz<WA->overlapsLen; zz++)
260 Out_BOF->writeOverlap(WA->overlaps + zz);
261
262 WA->overlapsLen = 0;
263 }
264 }
265
266