1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "runtime.H"
19 #include "files.H"
20
21 #include "gfa.H"
22
23
24
25 template<typename TT>
26 static
27 bool
findGFAtokenI(char const * features,char const * token,TT & value)28 findGFAtokenI(char const *features, char const *token, TT &value) {
29 char const *p = NULL;
30
31 p = strstr(features, token);
32
33 if (p == NULL)
34 return(false);
35
36 p += strlen(token); // Skip over the token...
37
38 if (*p == ':') // ...and any :, if the user forgot to include it.
39 p++;
40
41 value = (TT)strtoll(p, NULL, 10);
42
43 //fprintf(stderr, "FOUND feature '%s' in '%s' -> '%s' %u\n", token, features, p, value);
44
45 return(true);
46 }
47
48
49 // Search for canu-specific names, and convert to tigID's.
50 // Allow either 'tig', 'utg' or 'ctg'.
51 static
52 uint32
nameToCanuID(char * name)53 nameToCanuID(char *name) {
54 uint32 id = UINT32_MAX;
55
56 if (((name[0] == 't') && (name[1] == 'i') && (name[2] == 'g')) ||
57 ((name[0] == 'u') && (name[1] == 't') && (name[2] == 'g')) ||
58 ((name[0] == 'c') && (name[1] == 't') && (name[2] == 'g')))
59 id = strtoll(name + 3, NULL, 10);
60
61 return(id);
62 }
63
64
65
gfaSequence()66 gfaSequence::gfaSequence() {
67 _name = NULL;
68 _id = UINT32_MAX;
69 _sequence = NULL;
70 _features = NULL;
71 _length = 0;
72 }
73
74
gfaSequence(char * inLine)75 gfaSequence::gfaSequence(char *inLine) {
76 load(inLine);
77 }
78
79
gfaSequence(char * name,uint32 id,uint32 len)80 gfaSequence::gfaSequence(char *name, uint32 id, uint32 len) {
81 _name = new char [strlen(name) + 1];
82 _id = id;
83 _sequence = NULL;
84 _features = NULL;
85 _length = len;
86
87 strcpy(_name, name);
88 }
89
90
~gfaSequence()91 gfaSequence::~gfaSequence() {
92 delete [] _name;
93 delete [] _sequence;
94 delete [] _features;
95 }
96
97
98 void
load(char * inLine)99 gfaSequence::load(char *inLine) {
100 splitToWords W(inLine);
101
102 _name = new char [strlen(W[1]) + 1];
103 _id = UINT32_MAX;
104 _sequence = new char [strlen(W[2]) + 1];
105 _features = new char [strlen(W[3]) + 1];
106
107 _length = 0;
108
109 strcpy(_name, W[1]);
110 strcpy(_sequence, W[2]);
111 strcpy(_features, W[3]);
112
113 // Scan the _features for a length.
114
115 findGFAtokenI(_features, "LN:i:", _length);
116
117 // And any canu ID
118
119 _id = nameToCanuID(_name);
120 }
121
122
123 void
save(FILE * outFile)124 gfaSequence::save(FILE *outFile) {
125 fprintf(outFile, "S\t%s\t%s\tLN:i:%u\n",
126 _name,
127 _sequence ? _sequence : "*",
128 _length);
129 }
130
131
gfaLink()132 gfaLink::gfaLink() {
133 _Aname = NULL;
134 _Aid = UINT32_MAX;
135 _Afwd = false;
136
137 _Bname = NULL;
138 _Bid = UINT32_MAX;
139 _Bfwd = false;
140
141 _cigar = NULL;
142 _features = NULL;
143 }
144
145
gfaLink(char * inLine)146 gfaLink::gfaLink(char *inLine) {
147 load(inLine);
148 }
149
150
gfaLink(char * Aname,uint32 Aid,bool Afwd,char * Bname,uint32 Bid,bool Bfwd,char * cigar)151 gfaLink::gfaLink(char *Aname, uint32 Aid, bool Afwd,
152 char *Bname, uint32 Bid, bool Bfwd, char *cigar) {
153 _Aname = new char [strlen(Aname) + 1];
154 _Aid = Aid;
155 _Afwd = Afwd;
156
157 _Bname = new char [strlen(Bname) + 1];
158 _Bid = Bid;
159 _Bfwd = Bfwd;
160
161 _cigar = new char [strlen(cigar) + 1];
162 _features = NULL;
163
164 strcpy(_Aname, Aname);
165 strcpy(_Bname, Bname);
166 strcpy(_cigar, cigar);
167
168 _Aid = nameToCanuID(_Aname); // Search for canu-specific names, and convert to tigID's.
169 _Bid = nameToCanuID(_Bname);
170 }
171
172
~gfaLink()173 gfaLink::~gfaLink() {
174 delete [] _Aname;
175 delete [] _Bname;
176 delete [] _cigar;
177 delete [] _features;
178 }
179
180
181 void
load(char * inLine)182 gfaLink::load(char *inLine) {
183 splitToWords W(inLine);
184
185 _Aname = new char [strlen(W[1]) + 1];
186 _Aid = UINT32_MAX;
187 _Afwd = W[2][0] == '+';
188
189 _Bname = new char [strlen(W[3]) + 1];
190 _Bid = UINT32_MAX;
191 _Bfwd = W[4][0] == '+';
192
193 _cigar = new char [strlen(W[5]) + 1];
194
195 _features = new char [(W[6]) ? strlen(W[6]) + 1 : 1];
196
197 strcpy(_Aname, W[1]);
198 strcpy(_Bname, W[3]);
199 strcpy(_cigar, W[5]);
200 strcpy(_features, (W[6]) ? W[6] : "");
201
202 _Aid = nameToCanuID(_Aname); // Search for canu-specific names, and convert to tigID's.
203 _Bid = nameToCanuID(_Bname);
204 }
205
206
207 void
save(FILE * outFile)208 gfaLink::save(FILE *outFile) {
209 fprintf(outFile, "L\t%s\t%c\t%s\t%c\t%s\n",
210 _Aname, (_Afwd == true) ? '+' : '-',
211 _Bname, (_Bfwd == true) ? '+' : '-',
212 (_cigar == NULL) ? "*" : _cigar);
213 }
214
215
216 void
alignmentLength(int32 & queryLen,int32 & refceLen,int32 & alignLen)217 gfaLink::alignmentLength(int32 &queryLen, int32 &refceLen, int32 &alignLen) {
218 char *cp = _cigar;
219
220 refceLen = 0; // Bases on the reference involved in the alignment
221 queryLen = 0; // Bases on the query involved in the alignment
222 alignLen = 0; // Length of the alignment
223
224 if (cp == NULL)
225 return;
226
227 if (*cp == '*')
228 return;
229
230 do {
231 int64 val = strtoll(cp, &cp, 10);
232 char code = *cp++;
233
234 switch (code) {
235 case 'M': // Alignment, either match or mismatch
236 refceLen += val;
237 queryLen += val;
238 alignLen += val;
239 break;
240 case 'I': // Insertion to the reference - gap in query
241 queryLen += val;
242 alignLen += val;
243 break;
244 case 'D': // Deletion from the reference - gap in reference
245 refceLen += val;
246 alignLen += val;
247 break;
248 case 'N': // Skipped in the reference (e.g., intron)
249 refceLen += val;
250 fprintf(stderr, "warning - unsupported CIGAR code '%c' in '%s'\n", code, _cigar);
251 break;
252 case 'S': // Soft-clipped from the query - not part of the alignment
253 fprintf(stderr, "warning - unsupported CIGAR code '%c' in '%s'\n", code, _cigar);
254 break;
255 case 'H': // Hard-clipped from the query - not part of the alignment, and removed from the read as input
256 fprintf(stderr, "warning - unsupported CIGAR code '%c' in '%s'\n", code, _cigar);
257 break;
258 case 'P': // Padding - "silent deletion from padded reference" - ???
259 fprintf(stderr, "warning - unsupported CIGAR code '%c' in '%s'\n", code, _cigar);
260 break;
261 case '=': // Alignment, match
262 refceLen += val;
263 queryLen += val;
264 alignLen += val;
265 break;
266 case 'X': // Alignment, mismatch
267 refceLen += val;
268 queryLen += val;
269 alignLen += val;
270 break;
271 default:
272 fprintf(stderr, "unknown CIGAR code '%c' in '%s'\n", code, _cigar);
273 break;
274 }
275
276 } while (*cp != 0);
277 }
278
279
280
281
gfaFile()282 gfaFile::gfaFile() {
283 _header = NULL;
284 }
285
286
gfaFile(char const * inName)287 gfaFile::gfaFile(char const *inName) {
288 _header = NULL;
289
290 if ((inName[0] == 'H') && (inName[1] == '\t')) {
291 _header = new char [strlen(inName) + 1];
292 strcpy(_header, inName);
293 }
294
295 else {
296 loadFile(inName);
297 }
298 }
299
300
~gfaFile()301 gfaFile::~gfaFile() {
302 delete [] _header;
303
304 for (uint32 ii=0; ii<_sequences.size(); ii++)
305 delete _sequences[ii];
306
307 for (uint32 ii=0; ii<_links.size(); ii++)
308 delete _links[ii];
309 }
310
311
312 bool
loadFile(char const * inName)313 gfaFile::loadFile(char const *inName) {
314 char *L = NULL;
315 uint32 Llen = 0;
316 uint32 Lmax = 0;
317
318 FILE *F = AS_UTL_openInputFile(inName);
319
320 while (AS_UTL_readLine(L, Llen, Lmax, F)) {
321 char type = L[0];
322
323 if (L[1] != '\t')
324 fprintf(stderr, "gfaFile::loadFile()-- misformed file; second letter must be tab in line '%s'\n", L), exit(1);
325
326 if (type == 'H') {
327 delete [] _header;
328 _header = new char [Llen];
329 strcpy(_header, L+2);
330 }
331
332 else if (type == 'S') {
333 _sequences.push_back(new gfaSequence(L));
334 }
335
336 else if (type == 'L') {
337 _links.push_back(new gfaLink(L));
338 }
339
340 else {
341 fprintf(stderr, "gfaFile::loadFile()-- unrecognized line '%s'\n", L), exit(1);
342 }
343 }
344
345 AS_UTL_closeFile(F, inName);
346
347 delete [] L;
348
349 fprintf(stderr, "gfa: Loaded " F_SIZE_T " sequences and " F_SIZE_T " links.\n", _sequences.size(), _links.size());
350
351 return(true);
352 }
353
354
355
356
357 bool
saveFile(char const * outName)358 gfaFile::saveFile(char const *outName) {
359
360 FILE *F = AS_UTL_openOutputFile(outName);
361
362 fprintf(F, "H\t%s\n", _header);
363
364 for (uint32 ii=0; ii<_sequences.size(); ii++)
365 if (_sequences[ii])
366 _sequences[ii]->save(F);
367
368 for (uint32 ii=0; ii<_links.size(); ii++)
369 if (_links[ii])
370 _links[ii]->save(F);
371
372 AS_UTL_closeFile(F, outName);
373
374 return(true);
375 }
376
377