1 #include "muscle.h"
2 #include "seq.h"
3 #include "textfile.h"
4 #include "msa.h"
5 //#include <ctype.h>
6
7 const size_t MAX_FASTA_LINE = 16000;
8
SetName(const char * ptrName)9 void Seq::SetName(const char *ptrName)
10 {
11 delete[] m_ptrName;
12 size_t n = strlen(ptrName) + 1;
13 m_ptrName = new char[n];
14 strcpy(m_ptrName, ptrName);
15 }
16
ToFASTAFile(TextFile & File) const17 void Seq::ToFASTAFile(TextFile &File) const
18 {
19 File.PutFormat(">%s\n", m_ptrName);
20 unsigned uColCount = Length();
21 for (unsigned n = 0; n < uColCount; ++n)
22 {
23 if (n > 0 && n%60 == 0)
24 File.PutString("\n");
25 File.PutChar(at(n));
26 }
27 File.PutString("\n");
28 }
29
30 // Return true on end-of-file
FromFASTAFile(TextFile & File)31 bool Seq::FromFASTAFile(TextFile &File)
32 {
33 Clear();
34
35 char szLine[MAX_FASTA_LINE];
36 bool bEof = File.GetLine(szLine, sizeof(szLine));
37 if (bEof)
38 return true;
39 if ('>' != szLine[0])
40 Quit("Expecting '>' in FASTA file %s line %u",
41 File.GetFileName(), File.GetLineNr());
42
43 size_t n = strlen(szLine);
44 if (1 == n)
45 Quit("Missing annotation following '>' in FASTA file %s line %u",
46 File.GetFileName(), File.GetLineNr());
47
48 m_ptrName = new char[n];
49 strcpy(m_ptrName, szLine + 1);
50
51 TEXTFILEPOS Pos = File.GetPos();
52 for (;;)
53 {
54 bEof = File.GetLine(szLine, sizeof(szLine));
55 if (bEof)
56 {
57 if (0 == size())
58 {
59 Quit("Empty sequence in FASTA file %s line %u",
60 File.GetFileName(), File.GetLineNr());
61 return true;
62 }
63 return false;
64 }
65 if ('>' == szLine[0])
66 {
67 if (0 == size())
68 Quit("Empty sequence in FASTA file %s line %u",
69 File.GetFileName(), File.GetLineNr());
70 // Rewind to beginning of this line, it's the start of the
71 // next sequence.
72 File.SetPos(Pos);
73 return false;
74 }
75 const char *ptrChar = szLine;
76 while (char c = *ptrChar++)
77 {
78 if (isspace(c))
79 continue;
80 if (IsGapChar(c))
81 continue;
82 if (!IsResidueChar(c))
83 {
84 if (isprint(c))
85 {
86 char w = GetWildcardChar();
87 Warning("Invalid residue '%c' in FASTA file %s line %d, replaced by '%c'",
88 c, File.GetFileName(), File.GetLineNr(), w);
89 c = w;
90 }
91 else
92 Quit("Invalid byte hex %02x in FASTA file %s line %d",
93 (unsigned char) c, File.GetFileName(), File.GetLineNr());
94 }
95 c = toupper(c);
96 push_back(c);
97 }
98 Pos = File.GetPos();
99 }
100 }
101
ExtractUngapped(MSA & msa) const102 void Seq::ExtractUngapped(MSA &msa) const
103 {
104 msa.Clear();
105 unsigned uColCount = Length();
106 msa.SetSize(1, 1);
107 unsigned uUngappedPos = 0;
108 for (unsigned n = 0; n < uColCount; ++n)
109 {
110 char c = at(n);
111 if (!IsGapChar(c))
112 msa.SetChar(0, uUngappedPos++, c);
113 }
114 msa.SetSeqName(0, m_ptrName);
115 }
116
Copy(const Seq & rhs)117 void Seq::Copy(const Seq &rhs)
118 {
119 clear();
120 const unsigned uLength = rhs.Length();
121 for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex)
122 push_back(rhs.at(uColIndex));
123 const char *ptrName = rhs.GetName();
124 size_t n = strlen(ptrName) + 1;
125 m_ptrName = new char[n];
126 strcpy(m_ptrName, ptrName);
127 SetId(rhs.GetId());
128 }
129
CopyReversed(const Seq & rhs)130 void Seq::CopyReversed(const Seq &rhs)
131 {
132 clear();
133 const unsigned uLength = rhs.Length();
134 const unsigned uBase = rhs.Length() - 1;
135 for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex)
136 push_back(rhs.at(uBase - uColIndex));
137 const char *ptrName = rhs.GetName();
138 size_t n = strlen(ptrName) + 1;
139 m_ptrName = new char[n];
140 strcpy(m_ptrName, ptrName);
141 }
142
StripGaps()143 void Seq::StripGaps()
144 {
145 for (CharVect::iterator p = begin(); p != end(); )
146 {
147 char c = *p;
148 if (IsGapChar(c))
149 erase(p);
150 else
151 ++p;
152 }
153 }
154
StripGapsAndWhitespace()155 void Seq::StripGapsAndWhitespace()
156 {
157 for (CharVect::iterator p = begin(); p != end(); )
158 {
159 char c = *p;
160 if (isspace(c) || IsGapChar(c))
161 erase(p);
162 else
163 ++p;
164 }
165 }
166
ToUpper()167 void Seq::ToUpper()
168 {
169 for (CharVect::iterator p = begin(); p != end(); ++p)
170 {
171 char c = *p;
172 if (islower(c))
173 *p = toupper(c);
174 }
175 }
176
GetLetter(unsigned uIndex) const177 unsigned Seq::GetLetter(unsigned uIndex) const
178 {
179 assert(uIndex < Length());
180 char c = operator[](uIndex);
181 return CharToLetter(c);
182 }
183
EqIgnoreCase(const Seq & s) const184 bool Seq::EqIgnoreCase(const Seq &s) const
185 {
186 const unsigned n = Length();
187 if (n != s.Length())
188 return false;
189 for (unsigned i = 0; i < n; ++i)
190 {
191 const char c1 = at(i);
192 const char c2 = s.at(i);
193 if (IsGapChar(c1))
194 {
195 if (!IsGapChar(c2))
196 return false;
197 }
198 else
199 {
200 if (toupper(c1) != toupper(c2))
201 return false;
202 }
203 }
204 return true;
205 }
206
Eq(const Seq & s) const207 bool Seq::Eq(const Seq &s) const
208 {
209 const unsigned n = Length();
210 if (n != s.Length())
211 return false;
212 for (unsigned i = 0; i < n; ++i)
213 {
214 const char c1 = at(i);
215 const char c2 = s.at(i);
216 if (c1 != c2)
217 return false;
218 }
219 return true;
220 }
221
EqIgnoreCaseAndGaps(const Seq & s) const222 bool Seq::EqIgnoreCaseAndGaps(const Seq &s) const
223 {
224 const unsigned uThisLength = Length();
225 const unsigned uOtherLength = s.Length();
226
227 unsigned uThisPos = 0;
228 unsigned uOtherPos = 0;
229
230 int cThis;
231 int cOther;
232 for (;;)
233 {
234 if (uThisPos == uThisLength && uOtherPos == uOtherLength)
235 break;
236
237 // Set cThis to next non-gap character in this string
238 // or -1 if end-of-string.
239 for (;;)
240 {
241 if (uThisPos == uThisLength)
242 {
243 cThis = -1;
244 break;
245 }
246 else
247 {
248 cThis = at(uThisPos);
249 ++uThisPos;
250 if (!IsGapChar(cThis))
251 {
252 cThis = toupper(cThis);
253 break;
254 }
255 }
256 }
257
258 // Set cOther to next non-gap character in s
259 // or -1 if end-of-string.
260 for (;;)
261 {
262 if (uOtherPos == uOtherLength)
263 {
264 cOther = -1;
265 break;
266 }
267 else
268 {
269 cOther = s.at(uOtherPos);
270 ++uOtherPos;
271 if (!IsGapChar(cOther))
272 {
273 cOther = toupper(cOther);
274 break;
275 }
276 }
277 }
278
279 // Compare characters are corresponding ungapped position
280 if (cThis != cOther)
281 return false;
282 }
283 return true;
284 }
285
GetUngappedLength() const286 unsigned Seq::GetUngappedLength() const
287 {
288 unsigned uUngappedLength = 0;
289 for (CharVect::const_iterator p = begin(); p != end(); ++p)
290 {
291 char c = *p;
292 if (!IsGapChar(c))
293 ++uUngappedLength;
294 }
295 return uUngappedLength;
296 }
297
LogMe() const298 void Seq::LogMe() const
299 {
300 Log(">%s\n", m_ptrName);
301 const unsigned n = Length();
302 for (unsigned i = 0; i < n; ++i)
303 Log("%c", at(i));
304 Log("\n");
305 }
306
FromString(const char * pstrSeq,const char * pstrName)307 void Seq::FromString(const char *pstrSeq, const char *pstrName)
308 {
309 clear();
310 const unsigned uLength = (unsigned) strlen(pstrSeq);
311 for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex)
312 push_back(pstrSeq[uColIndex]);
313 size_t n = strlen(pstrName) + 1;
314 m_ptrName = new char[n];
315 strcpy(m_ptrName, pstrName);
316 }
317
HasGap() const318 bool Seq::HasGap() const
319 {
320 for (CharVect::const_iterator p = begin(); p != end(); ++p)
321 {
322 char c = *p;
323 if (IsGapChar(c))
324 return true;
325 }
326 return false;
327 }
328
FixAlpha()329 void Seq::FixAlpha()
330 {
331 for (CharVect::iterator p = begin(); p != end(); ++p)
332 {
333 char c = *p;
334 if (!IsResidueChar(c))
335 {
336 char w = GetWildcardChar();
337 // Warning("Invalid residue '%c', replaced by '%c'", c, w);
338 InvalidLetterWarning(c, w);
339 *p = w;
340 }
341 }
342 }
343