1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "AS_BAT_ReadInfo.H"
19 #include "AS_BAT_BestOverlapGraph.H"
20 #include "AS_BAT_Logging.H"
21 
22 #include "AS_BAT_FindCircular.H"
23 
24 
25 class readCheck {
26 public:
init(ufNode * node)27   void         init(ufNode *node) {
28     read    = node;
29 
30     ovlLen  = 0;
31     ovl     = OC->getOverlaps(read->ident, ovlLen);
32   };
33 
34   ufNode      *read;
35 
36   uint32       ovlLen;
37   BAToverlap  *ovl;
38 };
39 
40 
41 //  Checks if all the overlaps implied by circularizing the tig actually
42 //  exist.
43 //
44 //  This checks evry read in the path
45 //      rdFid .. the end of the tig --circular-> the other end .. rdTid
46 //  and ensures that the tested read has an overlap to all later reads.
47 //
48 //  [OLD COMMENT - names are incorrect, but the idea is the same]
49 //
50 //  Decide if the external edge in externalSco can plausibly circularize the tig.
51 //  For this to happen, the external egdge read must overlap all the reads in the path
52 //  to rdA.
53 //
54 //         =======================         <- another copy of the extern read
55 //          -----------------------        <- another copy of the last read
56 //            1--------------------        <- the first read
57 //                 2---------------------  <- rdA
58 //                    3-------------------------
59 //                       . . . . . . . . . . . . . . .
60 //                            8==============================  <- extern
61 //                             9------------------------------ <- the last read
62 //                                -------------    ------
63 //
64 //  Here, the extern read is causing rdA to look confused on it's low end.
65 //  In order for this to circularize, we need to check that all the
66 //  (non-contained) reads from the extern read to the last read have overlaps
67 //  to all the reads between itself and rdA (going around the circle).
68 //
69 //  This will correcly not find a circle if the extern read is too far from
70 //  the end, or if rdA is too far from the beginning.
71 //
72 
73 bool
isCircularizingEdge(Unitig * tig,uint32 rdFid,uint32 rdTid)74 isCircularizingEdge(Unitig   *tig,
75                     uint32    rdFid,     //  read From
76                     uint32    rdTid) {   //  read To
77   bool   isCircular = true;
78 
79   if ((rdTid == 0) || (tig->inUnitig(rdTid) != tig->id()) ||   //  Quick sanity checks:
80       (rdFid == 0) || (tig->inUnitig(rdFid) != tig->id()))     //    the reads must exist.
81     return(false);                                             //    the reads must be in the supplied tig.
82 
83   //  Based on the position of the reads in the tig, decide if we are
84   //  checking overlaps from end-to-start or from start-to-end.  Then build a
85   //  list of the reads we need to check.
86   //
87   //  forward == true if we iterate:
88   //    from the F read to the high end of the tig
89   //                    across the circle
90   //                    to the low end of the tig
91   //                    and up to the T read.
92   //
93   //  forward == false if we iterate:
94   //    from the F read to the low end of the tig
95   //                    across the circle
96   //                    to the high end of the tig
97   //                    and down to the T read.
98 
99   uint32      rdTidx  = tig->ufpathIdx(rdTid);
100   uint32      rdFidx  = tig->ufpathIdx(rdFid);
101   bool        forward = false;
102 
103   uint32      rcMax   = 0;
104   uint32      rcLen   = 0;
105   readCheck  *rc      = nullptr;
106 
107   //  -- rdT is at the start of the tig, rdF is at the end.
108   if (rdTidx < rdFidx) {
109     rcMax   = tig->ufpath.size() - rdFidx + 1 + rdTidx + 1;
110     rc      = new readCheck [rcMax];
111     forward = true;
112 
113     for (uint32 rd=rdFidx; rd < tig->ufpath.size(); rd++)
114       if (OG->isContained(tig->ufpath[rd].ident) == false)
115         rc[rcLen++].init(&tig->ufpath[rd]);
116 
117     for (uint32 rd=0; rd <= rdTidx; rd++)
118       if (OG->isContained(tig->ufpath[rd].ident) == false)
119         rc[rcLen++].init(&tig->ufpath[rd]);
120   }
121 
122   //  -- rdF is at the start of the tig, rdT is at the end.
123   else {
124     rcMax   = tig->ufpath.size() - rdTidx + 1 + rdFidx + 1;
125     rc      = new readCheck [rcMax];
126     forward = false;
127 
128     for (uint32 rd=rdFidx+1; rd-- > 0; )
129       if (OG->isContained(tig->ufpath[rd].ident) == false)
130         rc[rcLen++].init(&tig->ufpath[rd]);
131 
132     for (uint32 rd=tig->ufpath.size(); rd-- > rdTidx; )
133       if (OG->isContained(tig->ufpath[rd].ident) == false)
134         rc[rcLen++].init(&tig->ufpath[rd]);
135   }
136 
137   assert(rcLen < rcMax);
138 
139   //  Iterate over all the reads, checking for an overlap to all the later
140   //  reads in the list.
141   //
142   //  Once we get a pair, decide which read ends the overlap should be on.
143   //
144   //  In the 'forward' case, we start with rdA being the read near the high
145   //  end of the tig (passed in as rdF) and check for overlaps to each rdB
146   //  'after' it in the circular tig.
147   //
148   //    rc[2]  ----------
149   //    rc[3]     ------------ <- B read
150   //                                A read -> --------->       rc[0]
151   //                                             <-----------  rc[1]
152   //
153   //  In the 'reverse' case, we start with rdA being the read near the low
154   //  end of the tig (passed in as rdF) and check for overlaps to each rdB
155   //  'before; it in the circular tig.
156   //
157   //    rc[1]  --------->
158   //    rc[0]     -----------> <- A read
159   //                                B read -> ----------       rc[3]
160   //                                             ------------  rc[2]
161   //
162   //  In both cases, we check for an overlap between rc[i] and rc[j], i<j.
163   //  'forward' or 'reverse' is used to decide which end of the read we need
164   //  to find the overlap on.  The orientation on the read in the picture is
165   //  to help visualizing ovlA3 and ovlB3 below).
166   //
167   //  For each rc[] pair, we just scan all the overlaps for rdA and check that
168   //    the overlap is to the correct rdB read
169   //    the overlap is on the correct ends of each read
170   //    the oveerlap is of good quality.
171 
172   for (uint32 ai=0; (isCircular == true) && (ai<rcLen); ai++) {
173     ufNode     *rdA    = rc[ai].read;
174     uint32      ovlLen = rc[ai].ovlLen;
175     BAToverlap *ovl    = rc[ai].ovl;
176     uint32      rdAid  = rdA->ident;
177 
178     for (uint32 bi=ai+1; (isCircular == true) && (bi<rcLen); bi++) {
179       ufNode   *rdB    = rc[bi].read;
180       uint32    rdBid  = rdB->ident;
181 
182       bool  ovlA3 = (forward) ? (rdA->isForward()) : (rdA->isReverse());   //  Need overlap on 3' end of A read.
183       bool  ovlB3 = (forward) ? (rdB->isReverse()) : (rdB->isForward());
184       bool  found = false;
185 
186       for (uint32 oo=0; (oo<ovlLen) && (ovl[oo].b_iid <= rdBid) && (found == false); oo++) {
187         if ((ovl[oo].b_iid == rdBid) &&
188             (ovl[oo].isDovetail() == true) &&
189             (ovl[oo].AEndIs3prime() == ovlA3) &&
190             (ovl[oo].BEndIs3prime() == ovlB3) &&
191             (ovl[oo].erate() <= OG->reportErrorLimit())) {
192           found = true;
193           writeLog("                    overlap from read %9u %s idx=%-7u to read %9u %s idx=%-7u at %6.4f%% <= %6.4f%%\n",
194                    rdAid, (rdA->isForward()) ? "->" : "<-", tig->ufpathIdx(rdAid),
195                    rdBid, (rdB->isForward()) ? "->" : "<-", tig->ufpathIdx(rdBid), 100.0 * ovl[oo].erate(), 100.0 * OG->reportErrorLimit());
196         }
197       }
198 
199       //  If no overlap found, we cannot circularize this tig.
200       if (found == false) {
201         writeLog("                    overlap from read %9u %s idx=%-7u to read %9u %s idx=%-7u not found\n",
202                  rdAid, (rdA->isForward()) ? "->" : "<-", tig->ufpathIdx(rdAid),
203                  rdBid, (rdB->isForward()) ? "->" : "<-", tig->ufpathIdx(rdBid));
204         isCircular = false;
205       }
206     }
207   }
208 
209   delete [] rc;
210 
211   return(isCircular);
212 }
213 
214 
215 
216 void
findCircularContigs(TigVector & tigs,const char * prefix)217 findCircularContigs(TigVector &tigs,
218                     const char *prefix) {
219   FILE *CIRC = AS_UTL_openOutputFile(prefix, '.', "circular");
220 
221   fprintf(CIRC, "          ----------------------------------------------------------------------  ----------------------------------------------------------------------\n");
222   fprintf(CIRC, "              first                         overlap                                    last                         overlap\n");
223   fprintf(CIRC, "   tigID     readID        bgn-end           readID        bgn-end        length     readID        bgn-end           readID        bgn-end        length\n");
224   fprintf(CIRC, "--------  --------- ---------- ---------- --------- ---------- ---------- ------  --------- ---------- ---------- --------- ---------- ---------- ------\n");
225 
226   for (uint32 ti=1; ti<tigs.size(); ti++) {
227     Unitig  *tig = tigs[ti];
228 
229     if ((tig == NULL) ||
230         (tig->_isUnassembled == true))
231       continue;
232 
233     //  Grab the first and last reads in the tig, then find the edge that
234     //  points out of the tig.
235 
236     ufNode          *fRead = tig->firstBackboneRead();
237     ufNode          *lRead = tig->lastBackboneRead();
238 
239     bool        invert = (fRead->position.isForward() != lRead->position.isForward());
240     uint32      circularLength = 0;
241     uint32      ovlLen = 0;
242     BAToverlap *ovl    = OC->getOverlaps(lRead->ident, ovlLen);
243 
244     for (uint32 oo=0; oo<ovlLen; oo++) {
245       if ((ovl[oo].b_iid == fRead->ident) &&
246           (((lRead->position.isForward() ==  true) && (ovl[oo].AEndIs3prime() ==  true) && (ovl[oo].flipped == invert))  ||
247            ((lRead->position.isForward() == false) && (ovl[oo].AEndIs5prime() ==  true) && (ovl[oo].flipped == invert))))
248 
249         //  Circular!
250         circularLength = RI->overlapLength(lRead->ident, fRead->ident, ovl[oo].a_hang, ovl[oo].b_hang);
251     }
252 
253     if (circularLength == 0)
254        continue;
255 
256     tig->_isCircular     = true;
257     tig->_circularLength = circularLength;
258 
259     ufNode *foRead = &tig->ufpath[ tig->ufpathIdx(lRead->ident) ];
260     ufNode *loRead = &tig->ufpath[ tig->ufpathIdx(fRead->ident) ];
261 
262     fprintf(CIRC, "%8u  %9u %10u-%-10u %9u %10u-%-10u %6u  %9u %10u-%-10u %9u %10u-%-10u %6u\n",
263             ti,
264             fRead->ident, fRead->position.bgn, fRead->position.end, foRead->ident, foRead->position.bgn, foRead->position.end, tig->_circularLength,
265             lRead->ident, lRead->position.bgn, lRead->position.end, loRead->ident, loRead->position.bgn, loRead->position.end, tig->_circularLength);
266   }
267 
268   AS_UTL_closeFile(CIRC, prefix, '.', "circular");
269 }
270