1 /**
2 * Author: Mark Larkin
3 *
4 * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
5 */
6 /**
7 * Changes:
8 *
9 * 10-02-07,Nigel Brown(EMBL): changed ifstream to InFileStream to handle
10 * cross-platform end-of-lines.
11 */
12
13 #ifdef HAVE_CONFIG_H
14 #include "config.h"
15 #endif
16 #include "RSFFileParser.h"
17
18 namespace clustalw
19 {
20
21 /**
22 * Constructor sets up the chartab array.
23 * @param filePath
24 */
RSFFileParser(string filePath)25 RSFFileParser::RSFFileParser(string filePath)
26 {
27 fileName = filePath;
28 fillCharTab();
29 }
30
getSeqRange(int firstSeq,int no,string * offendingSeq)31 vector<Sequence> RSFFileParser::getSeqRange(int firstSeq, int no, string *offendingSeq)
32 {
33 vector<Sequence> seqRangeVector;
34 int i;
35
36 for (i=0; i<no; i++)
37 {
38 Sequence tempSeq = getSeq(firstSeq + i, offendingSeq);
39 if (parseExitCode!=OK) {
40 seqRangeVector.clear();
41 return seqRangeVector;
42 }
43 seqRangeVector.push_back(tempSeq);
44 }
45 return seqRangeVector;
46 }
47
48
49 /**
50 * get the sequence seqNum from the file.
51 * @param seqNum The number of the sequence to get.
52 * @return The sequence seqNum.
53 */
getSeq(int seqNum,string * offendingSeq)54 Sequence RSFFileParser::getSeq(int seqNum, string *offendingSeq)
55 {
56 char _line[MAXLINE + 1];
57 char _sname[MAXNAMES + 1];
58 string characterSeq = "";
59 string name = "";
60 string title = "";
61 string blank = "";
62 _line[0] = EOS;
63
64 int i;
65 unsigned char c;
66 int _currentSeqNum = 0; // Not at any sequence yet!
67
68 try
69 {
70 _fileIn = new InFileStream; //nige
71 _fileIn->open(fileName.c_str()); //nige
72 _fileIn->seekg(0, std::ios::beg); // start at the beginning
73
74 // Need to get the cursor to the begining of the correct sequence.
75 // This will be the case when we get to the seqNum {
76 while (_currentSeqNum != seqNum)
77 {
78 while(*_line != '{')
79 {
80 if(!_fileIn->getline(_line, MAXLINE + 1)) // If we cannot get anymore!
81 {
82 _fileIn->close();
83 return Sequence(blank, blank, blank);
84 }
85 }
86 ++_currentSeqNum;
87 if(_currentSeqNum == seqNum) // Found the sequence
88 {
89 break;
90 }
91 // Get next line so that we are past the '{' line
92 _fileIn->getline(_line, MAXLINE + 1);
93 }
94
95 while (!keyword(_line, "name"))
96 {
97 if (!_fileIn->getline(_line, MAXLINE + 1))
98 {
99 _fileIn->close();
100 return Sequence(blank, blank, blank);
101 }
102 }
103 for (i = 5; i <= (int)strlen(_line); i++)
104 {
105 if (_line[i] != ' ')
106 {
107 break;
108 }
109 }
110 strncpy(_sname, _line + i, MAXNAMES); // remember entryname
111 for (i = 0; i <= (int)strlen(_sname); i++)
112 {
113 if (_sname[i] == ' ')
114 {
115 _sname[i] = EOS;
116 break;
117 }
118 }
119
120 _sname[MAXNAMES] = EOS;
121 utilityObject->rTrim(_sname);
122 utilityObject->blankToUnderscore(_sname); // replace blanks with '_'
123 name = string(_sname);
124
125
126 while (!keyword(_line, "sequence"))
127 {
128 if (!_fileIn->getline(_line, MAXLINE + 1))
129 {
130 _fileIn->close();
131 return Sequence(blank, blank, blank);
132 }
133 }
134
135 while (_fileIn->getline(_line, MAXLINE + 1))
136 {
137 for (i = 0; i <= MAXLINE; i++)
138 {
139 c = _line[i];
140 if (c == EOS || c == '}')
141 {
142 break;
143 }
144 // EOL
145 if (c == '.')
146 {
147 characterSeq += '-';
148 }
149 c = chartab[c];
150 if (c)
151 {
152 characterSeq += c;
153 }
154 }
155 if (c == '}')
156 {
157 break;
158 }
159 }
160 _fileIn->close();
161
162 if ((int)characterSeq.length() > userParameters->getMaxAllowedSeqLength())
163 {
164 parseExitCode=SEQUENCETOOBIG;
165 if (offendingSeq!=NULL)
166 offendingSeq->assign(name);
167 // return empty seq
168 return Sequence(blank, blank, blank);
169 }
170 return Sequence(characterSeq, name, title);
171 }
172 catch(...)
173 {
174 _fileIn->close();
175 cerr << "There was an exception in the RSFFileParser::getSeq function.\n"
176 << "Need to end program\n";
177 exit(1);
178 }
179 }
180
181 /**
182 * count the number of sequences in a GCG RSF alignment file
183 * @return The number of sequences in the file.
184 */
countSeqs()185 int RSFFileParser::countSeqs()
186 {
187 char _line[MAXLINE + 1];
188 int numSeqs;
189
190 try
191 {
192 numSeqs = 0;
193 _fileIn = new InFileStream; //nige
194 _fileIn->open(fileName.c_str()); //nige
195 _fileIn->seekg(0, std::ios::beg); // start at the beginning
196
197 if(!_fileIn->is_open())
198 {
199 return 0; // No sequences found!
200 }
201
202 // skip the comments
203 while (_fileIn->getline(_line, MAXLINE + 1))
204 {
205 // NOTE needed to change to -1 and -2 (it was -2 and -3)
206 // This is because getline does not put the \n in!
207 if (_line[strlen(_line) - 1] == '.' && _line[strlen(_line) - 2] == '.')
208 {
209 break;
210 }
211 }
212
213 while (_fileIn->getline(_line, MAXLINE + 1))
214 {
215 if (*_line == '{')
216 {
217 numSeqs++;
218 }
219 }
220 _fileIn->close();
221 return numSeqs;
222 }
223 catch(...)
224 {
225 _fileIn->close();
226 cerr << "An exception has occured in the function RSFFileParser::countSeqs()\n"
227 << "Program needs to terminate.\nPlease contact the Clustal developers\n";
228 exit(1);
229 }
230 }
231
232 /**
233 * Get the secondary structure information from the file.
234 * @param gapPenaltyMask
235 * @param secStructMask
236 * @param secStructName
237 * @param structPenalties
238 * @param length
239 */
getSecStructure(vector<char> & gapPenaltyMask,vector<char> & secStructMask,string & secStructName,int & structPenalties,int length)240 void RSFFileParser::getSecStructure(vector<char>& gapPenaltyMask, vector<char>& secStructMask,
241 string& secStructName, int &structPenalties, int length)
242 {
243 bool guigetss = false;
244 if(userParameters->getProfileNum() == 1 && userParameters->getStructPenalties1())
245 guigetss = true;
246 if(userParameters->getProfileNum() == 2 && userParameters->getStructPenalties2())
247 guigetss = true;
248
249 char _title[MAXLINE + 1];
250 char _line[MAXLINE + 1];
251 char _lin2[MAXLINE + 1];
252 char _sname[MAXNAMES + 1];
253 int i;
254 _line[0] = EOS;
255
256 try
257 {
258 secStructMask.clear();
259 secStructMask.assign(length, '.');
260 _fileIn = new InFileStream; //nige
261 _fileIn->open(fileName.c_str()); //nige
262 _fileIn->seekg(0, std::ios::beg); // Need to start at begining
263
264 // skip the comments
265 while (_fileIn->getline(_line, MAXLINE + 1))
266 {
267 if (_line[strlen(_line) - 1] == '.' && _line[strlen(_line) - 2] == '.')
268 {
269 break;
270 }
271 }
272
273 // find the start of the sequence entry
274 for (;;)
275 {
276 while (_fileIn->getline(_line, MAXLINE + 1))
277 if (*_line == '{')
278 {
279 break;
280 }
281
282 while (!keyword(_line, "name"))
283 {
284 if (!_fileIn->getline(_line, MAXLINE + 1))
285 {
286 _fileIn->close();
287 return;
288 }
289 }
290
291 for (i = 5; i <= (int)strlen(_line); i++)
292 {
293 if (_line[i] != ' ')
294 {
295 break;
296 }
297 }
298 strncpy(_sname, _line + i, MAXNAMES); // remember entryname
299 for (i = 0; i <= (int)strlen(_sname); i++)
300 {
301 if (_sname[i] == ' ')
302 {
303 _sname[i] = EOS;
304 break;
305 }
306 }
307 _sname[MAXNAMES] = EOS;
308 utilityObject->rTrim(_sname);
309 utilityObject->blankToUnderscore(_sname); // replace blanks with '_'
310
311 // look for secondary structure feature table / gap penalty mask
312 while (_fileIn->getline(_line, MAXLINE + 1))
313 {
314 if (keyword(_line, "feature"))
315 {
316 if (userParameters->getInteractive() && !userParameters->getGui())
317 {
318 strcpy(_title, "Found secondary structure in alignment file: ");
319 strcat(_title, _sname);
320 (*_lin2) = utilityObject->promptForYesNo(_title,
321 "Use it to set local gap penalties ");
322 }
323 else
324 {
325 (*_lin2) = 'y';
326 }
327 if (guigetss || ((*_lin2 != 'n') && (*_lin2 != 'N')))
328 {
329 structPenalties = SECST;
330 secStructMask.assign(length, '.');
331 do
332 {
333 if (keyword(_line, "feature"))
334 {
335 getRSFFeature(&_line[7], secStructMask, length);
336 }
337 _fileIn->getline(_line, MAXLINE + 1);
338 }
339 while (!keyword(_line, "sequence"));
340 }
341 else
342 {
343 do
344 {
345 _fileIn->getline(_line, MAXLINE + 1);
346 }
347 while (!keyword(_line, "sequence"));
348 }
349 secStructName = string(_sname);
350 }
351 else if (keyword(_line, "sequence"))
352 {
353 break;
354 }
355
356 if (structPenalties != NONE)
357 {
358 break;
359 }
360 }
361 }
362 _fileIn->close();
363 }
364 catch(...)
365 {
366 _fileIn->close();
367 cerr << "An exception has occured in the function RSFFileParser::getSecStructure()\n"
368 << "Program needs to terminate.\nPlease contact the Clustal developers\n";
369 exit(1);
370 }
371 }
372
373 /**
374 * get a feature from the file. Called by getSecStructure
375 * @param line
376 * @param secStructMask
377 * @param length
378 */
getRSFFeature(char * line,vector<char> & secStructMask,int length)379 void RSFFileParser::getRSFFeature(char* line, vector<char>& secStructMask, int length)
380 {
381 char c, s;
382 char str1[MAXLINE + 1], str2[MAXLINE + 1], feature[MAXLINE + 1];
383 int i, tmp, startPos, endPos;
384
385 try
386 {
387 if (sscanf(line, "%d%d%d%s%s%s", &startPos, &endPos, &tmp, str1, str2,
388 feature) != 6)
389 {
390 return;
391 }
392
393 if (strcmp(feature, "HELIX") == 0)
394 {
395 c = 'A';
396 s = '$';
397 }
398 else if (strcmp(feature, "STRAND") == 0)
399 {
400 c = 'B';
401 s = '%';
402 }
403 else
404 {
405 return ;
406 }
407
408 if (startPos >= length || endPos >= length)
409 {
410 return ;
411 }
412 secStructMask[startPos - 1] = s;
413 for (i = startPos; i < endPos - 1; i++)
414 {
415 secStructMask[i] = c;
416 }
417 secStructMask[endPos - 1] = s;
418 }
419 catch(...)
420 {
421 cerr << "An exception has occured in the function RSFFileParser::getRSFFeature()\n"
422 << "Program needs to terminate.\nPlease contact the Clustal developers\n";
423 exit(1);
424 }
425 }
426
427 /**
428 * keyword checks if code is on the line!
429 * @param line
430 * @param code
431 * @return
432 */
keyword(char * line,const char * code)433 bool RSFFileParser::keyword(char *line, const char *code)
434 {
435 int i;
436 char key[MAXLINE];
437
438 for (i = 0; !isspace(line[i]) && line[i] != EOS; i++)
439 {
440 key[i] = line[i];
441 }
442 key[i] = EOS;
443 return (strcmp(key, code) == 0);
444 }
445
446 }
447
448
449