1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "runtime.H"
19 #include "files.H"
20 
21 #include "gfa.H"
22 
23 
24 
25 template<typename TT>
26 static
27 bool
findGFAtokenI(char const * features,char const * token,TT & value)28 findGFAtokenI(char const *features, char const *token, TT &value) {
29   char const *p = NULL;
30 
31   p = strstr(features, token);
32 
33   if (p == NULL)
34     return(false);
35 
36   p += strlen(token);  //  Skip over the token...
37 
38   if (*p == ':')       //  ...and any :, if the user forgot to include it.
39     p++;
40 
41   value = (TT)strtoll(p, NULL, 10);
42 
43   //fprintf(stderr, "FOUND feature '%s' in '%s' -> '%s' %u\n", token, features, p, value);
44 
45   return(true);
46 }
47 
48 
49 //  Search for canu-specific names, and convert to tigID's.
50 //    Allow either 'tig', 'utg' or 'ctg'.
51 static
52 uint32
nameToCanuID(char * name)53 nameToCanuID(char *name) {
54   uint32   id = UINT32_MAX;
55 
56   if (((name[0] == 't') && (name[1] == 'i') && (name[2] == 'g')) ||
57       ((name[0] == 'u') && (name[1] == 't') && (name[2] == 'g')) ||
58       ((name[0] == 'c') && (name[1] == 't') && (name[2] == 'g')))
59     id = strtoll(name + 3, NULL, 10);
60 
61   return(id);
62 }
63 
64 
65 
gfaSequence()66 gfaSequence::gfaSequence() {
67   _name     = NULL;
68   _id       = UINT32_MAX;
69   _sequence = NULL;
70   _features = NULL;
71   _length   = 0;
72 }
73 
74 
gfaSequence(char * inLine)75 gfaSequence::gfaSequence(char *inLine) {
76   load(inLine);
77 }
78 
79 
gfaSequence(char * name,uint32 id,uint32 len)80 gfaSequence::gfaSequence(char *name, uint32 id, uint32 len) {
81   _name     = new char [strlen(name) + 1];
82   _id       = id;
83   _sequence = NULL;
84   _features = NULL;
85   _length   = len;
86 
87   strcpy(_name, name);
88 }
89 
90 
~gfaSequence()91 gfaSequence::~gfaSequence() {
92   delete [] _name;
93   delete [] _sequence;
94   delete [] _features;
95 }
96 
97 
98 void
load(char * inLine)99 gfaSequence::load(char *inLine) {
100   splitToWords W(inLine);
101 
102   _name     = new char [strlen(W[1]) + 1];
103   _id       = UINT32_MAX;
104   _sequence = new char [strlen(W[2]) + 1];
105   _features = new char [strlen(W[3]) + 1];
106 
107   _length   = 0;
108 
109   strcpy(_name,     W[1]);
110   strcpy(_sequence, W[2]);
111   strcpy(_features, W[3]);
112 
113   //  Scan the _features for a length.
114 
115   findGFAtokenI(_features, "LN:i:", _length);
116 
117   //  And any canu ID
118 
119   _id = nameToCanuID(_name);
120 }
121 
122 
123 void
save(FILE * outFile)124 gfaSequence::save(FILE *outFile) {
125   fprintf(outFile, "S\t%s\t%s\tLN:i:%u\n",
126           _name,
127           _sequence ? _sequence : "*",
128           _length);
129 }
130 
131 
gfaLink()132 gfaLink::gfaLink() {
133   _Aname    = NULL;
134   _Aid      = UINT32_MAX;
135   _Afwd     = false;
136 
137   _Bname    = NULL;
138   _Bid      = UINT32_MAX;
139   _Bfwd     = false;
140 
141   _cigar    = NULL;
142   _features = NULL;
143 }
144 
145 
gfaLink(char * inLine)146 gfaLink::gfaLink(char *inLine) {
147   load(inLine);
148 }
149 
150 
gfaLink(char * Aname,uint32 Aid,bool Afwd,char * Bname,uint32 Bid,bool Bfwd,char * cigar)151 gfaLink::gfaLink(char *Aname, uint32 Aid, bool Afwd,
152                  char *Bname, uint32 Bid, bool Bfwd, char *cigar) {
153   _Aname    = new char [strlen(Aname) + 1];
154   _Aid      = Aid;
155   _Afwd     = Afwd;
156 
157   _Bname    = new char [strlen(Bname) + 1];
158   _Bid      = Bid;
159   _Bfwd     = Bfwd;
160 
161   _cigar    = new char [strlen(cigar) + 1];
162   _features = NULL;
163 
164   strcpy(_Aname,    Aname);
165   strcpy(_Bname,    Bname);
166   strcpy(_cigar,    cigar);
167 
168   _Aid = nameToCanuID(_Aname);    //  Search for canu-specific names, and convert to tigID's.
169   _Bid = nameToCanuID(_Bname);
170 }
171 
172 
~gfaLink()173 gfaLink::~gfaLink() {
174   delete [] _Aname;
175   delete [] _Bname;
176   delete [] _cigar;
177   delete [] _features;
178 }
179 
180 
181 void
load(char * inLine)182 gfaLink::load(char *inLine) {
183   splitToWords W(inLine);
184 
185   _Aname    = new char [strlen(W[1]) + 1];
186   _Aid      = UINT32_MAX;
187   _Afwd     = W[2][0] == '+';
188 
189   _Bname    = new char [strlen(W[3]) + 1];
190   _Bid      = UINT32_MAX;
191   _Bfwd     = W[4][0] == '+';
192 
193   _cigar    = new char [strlen(W[5]) + 1];
194 
195   _features = new char [(W[6]) ? strlen(W[6]) + 1 : 1];
196 
197   strcpy(_Aname,    W[1]);
198   strcpy(_Bname,    W[3]);
199   strcpy(_cigar,    W[5]);
200   strcpy(_features, (W[6]) ? W[6] : "");
201 
202   _Aid = nameToCanuID(_Aname);    //  Search for canu-specific names, and convert to tigID's.
203   _Bid = nameToCanuID(_Bname);
204 }
205 
206 
207 void
save(FILE * outFile)208 gfaLink::save(FILE *outFile) {
209   fprintf(outFile, "L\t%s\t%c\t%s\t%c\t%s\n",
210           _Aname, (_Afwd == true) ? '+' : '-',
211           _Bname, (_Bfwd == true) ? '+' : '-',
212           (_cigar == NULL) ? "*" : _cigar);
213 }
214 
215 
216 void
alignmentLength(int32 & queryLen,int32 & refceLen,int32 & alignLen)217 gfaLink::alignmentLength(int32 &queryLen, int32 &refceLen, int32 &alignLen) {
218   char  *cp = _cigar;
219 
220   refceLen = 0;  //  Bases on the reference involved in the alignment
221   queryLen = 0;  //  Bases on the query     involved in the alignment
222   alignLen = 0;  //  Length of the alignment
223 
224   if (cp == NULL)
225     return;
226 
227   if (*cp == '*')
228     return;
229 
230   do {
231     int64  val  = strtoll(cp, &cp, 10);
232     char   code = *cp++;
233 
234     switch (code) {
235       case 'M':  //  Alignment, either match or mismatch
236         refceLen += val;
237         queryLen += val;
238         alignLen += val;
239         break;
240       case 'I':  //  Insertion to the reference - gap in query
241         queryLen += val;
242         alignLen += val;
243         break;
244       case 'D':  //  Deletion from the reference - gap in reference
245         refceLen += val;
246         alignLen += val;
247        break;
248       case 'N':  //  Skipped in the reference (e.g., intron)
249         refceLen += val;
250         fprintf(stderr, "warning - unsupported CIGAR code '%c' in '%s'\n", code, _cigar);
251         break;
252       case 'S':  //  Soft-clipped from the query - not part of the alignment
253         fprintf(stderr, "warning - unsupported CIGAR code '%c' in '%s'\n", code, _cigar);
254         break;
255       case 'H':  //  Hard-clipped from the query - not part of the alignment, and removed from the read as input
256         fprintf(stderr, "warning - unsupported CIGAR code '%c' in '%s'\n", code, _cigar);
257         break;
258       case 'P':  //  Padding - "silent deletion from padded reference" - ???
259         fprintf(stderr, "warning - unsupported CIGAR code '%c' in '%s'\n", code, _cigar);
260         break;
261       case '=':  //  Alignment, match
262         refceLen += val;
263         queryLen += val;
264         alignLen += val;
265         break;
266       case 'X':  //  Alignment, mismatch
267         refceLen += val;
268         queryLen += val;
269         alignLen += val;
270         break;
271       default:
272         fprintf(stderr, "unknown CIGAR code '%c' in '%s'\n", code, _cigar);
273         break;
274     }
275 
276   } while (*cp != 0);
277 }
278 
279 
280 
281 
gfaFile()282 gfaFile::gfaFile() {
283   _header = NULL;
284 }
285 
286 
gfaFile(char const * inName)287 gfaFile::gfaFile(char const *inName) {
288   _header = NULL;
289 
290   if ((inName[0] == 'H') && (inName[1] == '\t')) {
291     _header = new char [strlen(inName) + 1];
292     strcpy(_header, inName);
293   }
294 
295   else {
296     loadFile(inName);
297   }
298 }
299 
300 
~gfaFile()301 gfaFile::~gfaFile() {
302   delete [] _header;
303 
304   for (uint32 ii=0; ii<_sequences.size(); ii++)
305     delete _sequences[ii];
306 
307   for (uint32 ii=0; ii<_links.size(); ii++)
308     delete _links[ii];
309 }
310 
311 
312 bool
loadFile(char const * inName)313 gfaFile::loadFile(char const *inName) {
314   char  *L    = NULL;
315   uint32 Llen = 0;
316   uint32 Lmax = 0;
317 
318   FILE *F = AS_UTL_openInputFile(inName);
319 
320   while (AS_UTL_readLine(L, Llen, Lmax, F)) {
321     char  type = L[0];
322 
323     if (L[1] != '\t')
324       fprintf(stderr, "gfaFile::loadFile()-- misformed file; second letter must be tab in line '%s'\n", L), exit(1);
325 
326     if      (type == 'H') {
327       delete [] _header;
328       _header = new char [Llen];
329       strcpy(_header, L+2);
330     }
331 
332     else if (type == 'S') {
333       _sequences.push_back(new gfaSequence(L));
334     }
335 
336     else if (type == 'L') {
337       _links.push_back(new gfaLink(L));
338     }
339 
340     else {
341       fprintf(stderr, "gfaFile::loadFile()-- unrecognized line '%s'\n", L), exit(1);
342     }
343   }
344 
345   AS_UTL_closeFile(F, inName);
346 
347   delete [] L;
348 
349   fprintf(stderr, "gfa:  Loaded " F_SIZE_T " sequences and " F_SIZE_T " links.\n", _sequences.size(), _links.size());
350 
351   return(true);
352 }
353 
354 
355 
356 
357 bool
saveFile(char const * outName)358 gfaFile::saveFile(char const *outName) {
359 
360   FILE *F = AS_UTL_openOutputFile(outName);
361 
362   fprintf(F, "H\t%s\n", _header);
363 
364   for (uint32 ii=0; ii<_sequences.size(); ii++)
365     if (_sequences[ii])
366       _sequences[ii]->save(F);
367 
368   for (uint32 ii=0; ii<_links.size(); ii++)
369     if (_links[ii])
370       _links[ii]->save(F);
371 
372   AS_UTL_closeFile(F, outName);
373 
374   return(true);
375 }
376 
377