1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "AS_BAT_ReadInfo.H"
19 #include "AS_BAT_BestOverlapGraph.H"
20 #include "AS_BAT_Logging.H"
21
22 #include "AS_BAT_FindCircular.H"
23
24
25 class readCheck {
26 public:
init(ufNode * node)27 void init(ufNode *node) {
28 read = node;
29
30 ovlLen = 0;
31 ovl = OC->getOverlaps(read->ident, ovlLen);
32 };
33
34 ufNode *read;
35
36 uint32 ovlLen;
37 BAToverlap *ovl;
38 };
39
40
41 // Checks if all the overlaps implied by circularizing the tig actually
42 // exist.
43 //
44 // This checks evry read in the path
45 // rdFid .. the end of the tig --circular-> the other end .. rdTid
46 // and ensures that the tested read has an overlap to all later reads.
47 //
48 // [OLD COMMENT - names are incorrect, but the idea is the same]
49 //
50 // Decide if the external edge in externalSco can plausibly circularize the tig.
51 // For this to happen, the external egdge read must overlap all the reads in the path
52 // to rdA.
53 //
54 // ======================= <- another copy of the extern read
55 // ----------------------- <- another copy of the last read
56 // 1-------------------- <- the first read
57 // 2--------------------- <- rdA
58 // 3-------------------------
59 // . . . . . . . . . . . . . . .
60 // 8============================== <- extern
61 // 9------------------------------ <- the last read
62 // ------------- ------
63 //
64 // Here, the extern read is causing rdA to look confused on it's low end.
65 // In order for this to circularize, we need to check that all the
66 // (non-contained) reads from the extern read to the last read have overlaps
67 // to all the reads between itself and rdA (going around the circle).
68 //
69 // This will correcly not find a circle if the extern read is too far from
70 // the end, or if rdA is too far from the beginning.
71 //
72
73 bool
isCircularizingEdge(Unitig * tig,uint32 rdFid,uint32 rdTid)74 isCircularizingEdge(Unitig *tig,
75 uint32 rdFid, // read From
76 uint32 rdTid) { // read To
77 bool isCircular = true;
78
79 if ((rdTid == 0) || (tig->inUnitig(rdTid) != tig->id()) || // Quick sanity checks:
80 (rdFid == 0) || (tig->inUnitig(rdFid) != tig->id())) // the reads must exist.
81 return(false); // the reads must be in the supplied tig.
82
83 // Based on the position of the reads in the tig, decide if we are
84 // checking overlaps from end-to-start or from start-to-end. Then build a
85 // list of the reads we need to check.
86 //
87 // forward == true if we iterate:
88 // from the F read to the high end of the tig
89 // across the circle
90 // to the low end of the tig
91 // and up to the T read.
92 //
93 // forward == false if we iterate:
94 // from the F read to the low end of the tig
95 // across the circle
96 // to the high end of the tig
97 // and down to the T read.
98
99 uint32 rdTidx = tig->ufpathIdx(rdTid);
100 uint32 rdFidx = tig->ufpathIdx(rdFid);
101 bool forward = false;
102
103 uint32 rcMax = 0;
104 uint32 rcLen = 0;
105 readCheck *rc = nullptr;
106
107 // -- rdT is at the start of the tig, rdF is at the end.
108 if (rdTidx < rdFidx) {
109 rcMax = tig->ufpath.size() - rdFidx + 1 + rdTidx + 1;
110 rc = new readCheck [rcMax];
111 forward = true;
112
113 for (uint32 rd=rdFidx; rd < tig->ufpath.size(); rd++)
114 if (OG->isContained(tig->ufpath[rd].ident) == false)
115 rc[rcLen++].init(&tig->ufpath[rd]);
116
117 for (uint32 rd=0; rd <= rdTidx; rd++)
118 if (OG->isContained(tig->ufpath[rd].ident) == false)
119 rc[rcLen++].init(&tig->ufpath[rd]);
120 }
121
122 // -- rdF is at the start of the tig, rdT is at the end.
123 else {
124 rcMax = tig->ufpath.size() - rdTidx + 1 + rdFidx + 1;
125 rc = new readCheck [rcMax];
126 forward = false;
127
128 for (uint32 rd=rdFidx+1; rd-- > 0; )
129 if (OG->isContained(tig->ufpath[rd].ident) == false)
130 rc[rcLen++].init(&tig->ufpath[rd]);
131
132 for (uint32 rd=tig->ufpath.size(); rd-- > rdTidx; )
133 if (OG->isContained(tig->ufpath[rd].ident) == false)
134 rc[rcLen++].init(&tig->ufpath[rd]);
135 }
136
137 assert(rcLen < rcMax);
138
139 // Iterate over all the reads, checking for an overlap to all the later
140 // reads in the list.
141 //
142 // Once we get a pair, decide which read ends the overlap should be on.
143 //
144 // In the 'forward' case, we start with rdA being the read near the high
145 // end of the tig (passed in as rdF) and check for overlaps to each rdB
146 // 'after' it in the circular tig.
147 //
148 // rc[2] ----------
149 // rc[3] ------------ <- B read
150 // A read -> ---------> rc[0]
151 // <----------- rc[1]
152 //
153 // In the 'reverse' case, we start with rdA being the read near the low
154 // end of the tig (passed in as rdF) and check for overlaps to each rdB
155 // 'before; it in the circular tig.
156 //
157 // rc[1] --------->
158 // rc[0] -----------> <- A read
159 // B read -> ---------- rc[3]
160 // ------------ rc[2]
161 //
162 // In both cases, we check for an overlap between rc[i] and rc[j], i<j.
163 // 'forward' or 'reverse' is used to decide which end of the read we need
164 // to find the overlap on. The orientation on the read in the picture is
165 // to help visualizing ovlA3 and ovlB3 below).
166 //
167 // For each rc[] pair, we just scan all the overlaps for rdA and check that
168 // the overlap is to the correct rdB read
169 // the overlap is on the correct ends of each read
170 // the oveerlap is of good quality.
171
172 for (uint32 ai=0; (isCircular == true) && (ai<rcLen); ai++) {
173 ufNode *rdA = rc[ai].read;
174 uint32 ovlLen = rc[ai].ovlLen;
175 BAToverlap *ovl = rc[ai].ovl;
176 uint32 rdAid = rdA->ident;
177
178 for (uint32 bi=ai+1; (isCircular == true) && (bi<rcLen); bi++) {
179 ufNode *rdB = rc[bi].read;
180 uint32 rdBid = rdB->ident;
181
182 bool ovlA3 = (forward) ? (rdA->isForward()) : (rdA->isReverse()); // Need overlap on 3' end of A read.
183 bool ovlB3 = (forward) ? (rdB->isReverse()) : (rdB->isForward());
184 bool found = false;
185
186 for (uint32 oo=0; (oo<ovlLen) && (ovl[oo].b_iid <= rdBid) && (found == false); oo++) {
187 if ((ovl[oo].b_iid == rdBid) &&
188 (ovl[oo].isDovetail() == true) &&
189 (ovl[oo].AEndIs3prime() == ovlA3) &&
190 (ovl[oo].BEndIs3prime() == ovlB3) &&
191 (ovl[oo].erate() <= OG->reportErrorLimit())) {
192 found = true;
193 writeLog(" overlap from read %9u %s idx=%-7u to read %9u %s idx=%-7u at %6.4f%% <= %6.4f%%\n",
194 rdAid, (rdA->isForward()) ? "->" : "<-", tig->ufpathIdx(rdAid),
195 rdBid, (rdB->isForward()) ? "->" : "<-", tig->ufpathIdx(rdBid), 100.0 * ovl[oo].erate(), 100.0 * OG->reportErrorLimit());
196 }
197 }
198
199 // If no overlap found, we cannot circularize this tig.
200 if (found == false) {
201 writeLog(" overlap from read %9u %s idx=%-7u to read %9u %s idx=%-7u not found\n",
202 rdAid, (rdA->isForward()) ? "->" : "<-", tig->ufpathIdx(rdAid),
203 rdBid, (rdB->isForward()) ? "->" : "<-", tig->ufpathIdx(rdBid));
204 isCircular = false;
205 }
206 }
207 }
208
209 delete [] rc;
210
211 return(isCircular);
212 }
213
214
215
216 void
findCircularContigs(TigVector & tigs,const char * prefix)217 findCircularContigs(TigVector &tigs,
218 const char *prefix) {
219 FILE *CIRC = AS_UTL_openOutputFile(prefix, '.', "circular");
220
221 fprintf(CIRC, " ---------------------------------------------------------------------- ----------------------------------------------------------------------\n");
222 fprintf(CIRC, " first overlap last overlap\n");
223 fprintf(CIRC, " tigID readID bgn-end readID bgn-end length readID bgn-end readID bgn-end length\n");
224 fprintf(CIRC, "-------- --------- ---------- ---------- --------- ---------- ---------- ------ --------- ---------- ---------- --------- ---------- ---------- ------\n");
225
226 for (uint32 ti=1; ti<tigs.size(); ti++) {
227 Unitig *tig = tigs[ti];
228
229 if ((tig == NULL) ||
230 (tig->_isUnassembled == true))
231 continue;
232
233 // Grab the first and last reads in the tig, then find the edge that
234 // points out of the tig.
235
236 ufNode *fRead = tig->firstBackboneRead();
237 ufNode *lRead = tig->lastBackboneRead();
238
239 bool invert = (fRead->position.isForward() != lRead->position.isForward());
240 uint32 circularLength = 0;
241 uint32 ovlLen = 0;
242 BAToverlap *ovl = OC->getOverlaps(lRead->ident, ovlLen);
243
244 for (uint32 oo=0; oo<ovlLen; oo++) {
245 if ((ovl[oo].b_iid == fRead->ident) &&
246 (((lRead->position.isForward() == true) && (ovl[oo].AEndIs3prime() == true) && (ovl[oo].flipped == invert)) ||
247 ((lRead->position.isForward() == false) && (ovl[oo].AEndIs5prime() == true) && (ovl[oo].flipped == invert))))
248
249 // Circular!
250 circularLength = RI->overlapLength(lRead->ident, fRead->ident, ovl[oo].a_hang, ovl[oo].b_hang);
251 }
252
253 if (circularLength == 0)
254 continue;
255
256 tig->_isCircular = true;
257 tig->_circularLength = circularLength;
258
259 ufNode *foRead = &tig->ufpath[ tig->ufpathIdx(lRead->ident) ];
260 ufNode *loRead = &tig->ufpath[ tig->ufpathIdx(fRead->ident) ];
261
262 fprintf(CIRC, "%8u %9u %10u-%-10u %9u %10u-%-10u %6u %9u %10u-%-10u %9u %10u-%-10u %6u\n",
263 ti,
264 fRead->ident, fRead->position.bgn, fRead->position.end, foRead->ident, foRead->position.bgn, foRead->position.end, tig->_circularLength,
265 lRead->ident, lRead->position.bgn, lRead->position.end, loRead->ident, loRead->position.bgn, loRead->position.end, tig->_circularLength);
266 }
267
268 AS_UTL_closeFile(CIRC, prefix, '.', "circular");
269 }
270