1 /**
2  * Author: Mark Larkin
3  *
4  * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
5  */
6 /**
7  * Changes:
8  *
9  * 10-02-07,Nigel Brown(EMBL): changed ifstream to InFileStream to handle
10  * cross-platform end-of-lines.
11  */
12 
13 #ifdef HAVE_CONFIG_H
14     #include "config.h"
15 #endif
16 #include "RSFFileParser.h"
17 
18 namespace clustalw
19 {
20 
21 /**
22  * Constructor sets up the chartab array.
23  * @param filePath
24  */
RSFFileParser(string filePath)25 RSFFileParser::RSFFileParser(string filePath)
26 {
27     fileName = filePath;
28     fillCharTab();
29 }
30 
getSeqRange(int firstSeq,int no,string * offendingSeq)31     vector<Sequence> RSFFileParser::getSeqRange(int firstSeq, int no, string *offendingSeq)
32 {
33     vector<Sequence> seqRangeVector;
34     int i;
35 
36     for (i=0; i<no; i++)
37     {
38         Sequence tempSeq = getSeq(firstSeq + i, offendingSeq);
39         if (parseExitCode!=OK) {
40             seqRangeVector.clear();
41             return seqRangeVector;
42         }
43         seqRangeVector.push_back(tempSeq);
44     }
45     return seqRangeVector;
46 }
47 
48 
49 /**
50  * get the sequence seqNum from the file.
51  * @param seqNum The number of the sequence to get.
52  * @return The sequence seqNum.
53  */
getSeq(int seqNum,string * offendingSeq)54     Sequence RSFFileParser::getSeq(int seqNum, string *offendingSeq)
55 {
56     char _line[MAXLINE + 1];
57     char _sname[MAXNAMES + 1];
58     string characterSeq = "";
59     string name = "";
60     string title = "";
61     string blank = "";
62     _line[0] = EOS;
63 
64     int i;
65     unsigned char c;
66     int _currentSeqNum = 0; // Not at any sequence yet!
67 
68     try
69     {
70         _fileIn = new InFileStream;  //nige
71         _fileIn->open(fileName.c_str());  //nige
72         _fileIn->seekg(0, std::ios::beg); // start at the beginning
73 
74         // Need to get the cursor to the begining of the correct sequence.
75         // This will be the case when we get to the seqNum {
76         while (_currentSeqNum != seqNum)
77         {
78             while(*_line != '{')
79             {
80                 if(!_fileIn->getline(_line, MAXLINE + 1)) // If we cannot get anymore!
81                 {
82                     _fileIn->close();
83                     return Sequence(blank, blank, blank);
84                 }
85             }
86             ++_currentSeqNum;
87             if(_currentSeqNum == seqNum) // Found the sequence
88             {
89                 break;
90             }
91             // Get next line so that we are past the '{' line
92             _fileIn->getline(_line, MAXLINE + 1);
93         }
94 
95         while (!keyword(_line, "name"))
96         {
97             if (!_fileIn->getline(_line, MAXLINE + 1))
98             {
99                 _fileIn->close();
100                 return Sequence(blank, blank, blank);
101             }
102         }
103         for (i = 5; i <= (int)strlen(_line); i++)
104         {
105             if (_line[i] != ' ')
106             {
107                 break;
108             }
109         }
110         strncpy(_sname, _line + i, MAXNAMES); // remember entryname
111         for (i = 0; i <= (int)strlen(_sname); i++)
112         {
113             if (_sname[i] == ' ')
114             {
115                 _sname[i] = EOS;
116                 break;
117             }
118         }
119 
120         _sname[MAXNAMES] = EOS;
121         utilityObject->rTrim(_sname);
122         utilityObject->blankToUnderscore(_sname); // replace blanks with '_'
123         name = string(_sname);
124 
125 
126         while (!keyword(_line, "sequence"))
127         {
128             if (!_fileIn->getline(_line, MAXLINE + 1))
129             {
130                 _fileIn->close();
131                 return Sequence(blank, blank, blank);
132             }
133         }
134 
135         while (_fileIn->getline(_line, MAXLINE + 1))
136         {
137             for (i = 0; i <= MAXLINE; i++)
138             {
139                 c = _line[i];
140                 if (c == EOS || c == '}')
141                 {
142                     break;
143                 }
144                  // EOL
145                 if (c == '.')
146                 {
147                     characterSeq += '-';
148                 }
149                 c = chartab[c];
150                 if (c)
151                 {
152                     characterSeq += c;
153                 }
154             }
155             if (c == '}')
156             {
157                 break;
158             }
159         }
160         _fileIn->close();
161 
162         if ((int)characterSeq.length() > userParameters->getMaxAllowedSeqLength())
163         {
164             parseExitCode=SEQUENCETOOBIG;
165             if (offendingSeq!=NULL)
166                 offendingSeq->assign(name);
167             // return empty seq
168             return Sequence(blank, blank, blank);
169         }
170         return Sequence(characterSeq, name, title);
171     }
172     catch(...)
173     {
174         _fileIn->close();
175         cerr << "There was an exception in the RSFFileParser::getSeq function.\n"
176              << "Need to end program\n";
177         exit(1);
178     }
179 }
180 
181 /**
182  * count the number of sequences in a GCG RSF alignment file
183  * @return The number of sequences in the file.
184  */
countSeqs()185 int RSFFileParser::countSeqs()
186 {
187     char _line[MAXLINE + 1];
188     int numSeqs;
189 
190     try
191     {
192         numSeqs = 0;
193         _fileIn = new InFileStream;  //nige
194         _fileIn->open(fileName.c_str());  //nige
195         _fileIn->seekg(0, std::ios::beg); // start at the beginning
196 
197         if(!_fileIn->is_open())
198         {
199             return 0; // No sequences found!
200         }
201 
202         // skip the comments
203         while (_fileIn->getline(_line, MAXLINE + 1))
204         {
205             // NOTE needed to change to -1 and -2 (it was -2 and -3)
206             // This is because getline does not put the \n in!
207             if (_line[strlen(_line) - 1] == '.' && _line[strlen(_line) - 2] == '.')
208             {
209                 break;
210             }
211         }
212 
213         while (_fileIn->getline(_line, MAXLINE + 1))
214         {
215             if (*_line == '{')
216             {
217                 numSeqs++;
218             }
219         }
220         _fileIn->close();
221         return numSeqs;
222     }
223     catch(...)
224     {
225         _fileIn->close();
226         cerr << "An exception has occured in the function RSFFileParser::countSeqs()\n"
227              << "Program needs to terminate.\nPlease contact the Clustal developers\n";
228         exit(1);
229     }
230 }
231 
232 /**
233  * Get the secondary structure information from the file.
234  * @param gapPenaltyMask
235  * @param secStructMask
236  * @param secStructName
237  * @param structPenalties
238  * @param length
239  */
getSecStructure(vector<char> & gapPenaltyMask,vector<char> & secStructMask,string & secStructName,int & structPenalties,int length)240 void RSFFileParser::getSecStructure(vector<char>& gapPenaltyMask, vector<char>& secStructMask,
241                      string& secStructName, int &structPenalties, int length)
242 {
243     bool guigetss = false;
244     if(userParameters->getProfileNum() == 1 && userParameters->getStructPenalties1())
245          guigetss = true;
246     if(userParameters->getProfileNum() == 2 && userParameters->getStructPenalties2())
247          guigetss = true;
248 
249     char _title[MAXLINE + 1];
250     char _line[MAXLINE + 1];
251     char _lin2[MAXLINE + 1];
252     char _sname[MAXNAMES + 1];
253     int i;
254     _line[0] = EOS;
255 
256     try
257     {
258         secStructMask.clear();
259         secStructMask.assign(length, '.');
260         _fileIn = new InFileStream;  //nige
261         _fileIn->open(fileName.c_str());  //nige
262         _fileIn->seekg(0, std::ios::beg); // Need to start at begining
263 
264         // skip the comments
265         while (_fileIn->getline(_line, MAXLINE + 1))
266         {
267             if (_line[strlen(_line) - 1] == '.' && _line[strlen(_line) - 2] == '.')
268             {
269                 break;
270             }
271         }
272 
273         // find the start of the sequence entry
274         for (;;)
275         {
276             while (_fileIn->getline(_line, MAXLINE + 1))
277                 if (*_line == '{')
278                 {
279                     break;
280                 }
281 
282             while (!keyword(_line, "name"))
283             {
284                 if (!_fileIn->getline(_line, MAXLINE + 1))
285                 {
286                     _fileIn->close();
287                     return;
288                 }
289             }
290 
291             for (i = 5; i <= (int)strlen(_line); i++)
292             {
293                 if (_line[i] != ' ')
294                 {
295                     break;
296                 }
297             }
298             strncpy(_sname, _line + i, MAXNAMES); // remember entryname
299             for (i = 0; i <= (int)strlen(_sname); i++)
300             {
301                 if (_sname[i] == ' ')
302                 {
303                     _sname[i] = EOS;
304                     break;
305                 }
306             }
307             _sname[MAXNAMES] = EOS;
308             utilityObject->rTrim(_sname);
309             utilityObject->blankToUnderscore(_sname); // replace blanks with '_'
310 
311             // look for secondary structure feature table / gap penalty mask
312             while (_fileIn->getline(_line, MAXLINE + 1))
313             {
314                 if (keyword(_line, "feature"))
315                 {
316                     if (userParameters->getInteractive() && !userParameters->getGui())
317                     {
318                         strcpy(_title, "Found secondary structure in alignment file: ");
319                         strcat(_title, _sname);
320                         (*_lin2) = utilityObject->promptForYesNo(_title,
321                             "Use it to set local gap penalties ");
322                     }
323                     else
324                     {
325                         (*_lin2) = 'y';
326                     }
327                     if (guigetss || ((*_lin2 != 'n') && (*_lin2 != 'N')))
328                     {
329                         structPenalties = SECST;
330                         secStructMask.assign(length, '.');
331                         do
332                         {
333                             if (keyword(_line, "feature"))
334                             {
335                                 getRSFFeature(&_line[7], secStructMask, length);
336                             }
337                             _fileIn->getline(_line, MAXLINE + 1);
338                         }
339                         while (!keyword(_line, "sequence"));
340                     }
341                     else
342                     {
343                         do
344                         {
345                             _fileIn->getline(_line, MAXLINE + 1);
346                         }
347                         while (!keyword(_line, "sequence"));
348                     }
349                     secStructName = string(_sname);
350                 }
351                 else if (keyword(_line, "sequence"))
352                 {
353                     break;
354                 }
355 
356                 if (structPenalties != NONE)
357                 {
358                     break;
359                 }
360             }
361         }
362         _fileIn->close();
363     }
364     catch(...)
365     {
366         _fileIn->close();
367         cerr << "An exception has occured in the function RSFFileParser::getSecStructure()\n"
368              << "Program needs to terminate.\nPlease contact the Clustal developers\n";
369         exit(1);
370     }
371 }
372 
373 /**
374  * get a feature from the file. Called by getSecStructure
375  * @param line
376  * @param secStructMask
377  * @param length
378  */
getRSFFeature(char * line,vector<char> & secStructMask,int length)379 void RSFFileParser::getRSFFeature(char* line, vector<char>& secStructMask, int length)
380 {
381     char c, s;
382     char str1[MAXLINE + 1], str2[MAXLINE + 1], feature[MAXLINE + 1];
383     int i, tmp, startPos, endPos;
384 
385     try
386     {
387         if (sscanf(line, "%d%d%d%s%s%s", &startPos, &endPos, &tmp, str1, str2,
388             feature) != 6)
389         {
390             return;
391         }
392 
393         if (strcmp(feature, "HELIX") == 0)
394         {
395             c = 'A';
396             s = '$';
397         }
398         else if (strcmp(feature, "STRAND") == 0)
399         {
400             c = 'B';
401             s = '%';
402         }
403         else
404         {
405             return ;
406         }
407 
408         if (startPos >= length || endPos >= length)
409         {
410             return ;
411         }
412         secStructMask[startPos - 1] = s;
413         for (i = startPos; i < endPos - 1; i++)
414         {
415             secStructMask[i] = c;
416         }
417         secStructMask[endPos - 1] = s;
418     }
419     catch(...)
420     {
421         cerr << "An exception has occured in the function RSFFileParser::getRSFFeature()\n"
422              << "Program needs to terminate.\nPlease contact the Clustal developers\n";
423         exit(1);
424     }
425 }
426 
427 /**
428  * keyword checks if code is on the line!
429  * @param line
430  * @param code
431  * @return
432  */
keyword(char * line,const char * code)433 bool RSFFileParser::keyword(char *line, const char *code)
434 {
435     int i;
436     char key[MAXLINE];
437 
438     for (i = 0; !isspace(line[i]) && line[i] != EOS; i++)
439     {
440         key[i] = line[i];
441     }
442     key[i] = EOS;
443     return (strcmp(key, code) == 0);
444 }
445 
446 }
447 
448 
449