1 #include "muscle.h"
2 #include "seq.h"
3 #include "textfile.h"
4 #include "msa.h"
5 //#include <ctype.h>
6 
7 const size_t MAX_FASTA_LINE = 16000;
8 
SetName(const char * ptrName)9 void Seq::SetName(const char *ptrName)
10 	{
11 	delete[] m_ptrName;
12 	size_t n = strlen(ptrName) + 1;
13 	m_ptrName = new char[n];
14 	strcpy(m_ptrName, ptrName);
15 	}
16 
ToFASTAFile(TextFile & File) const17 void Seq::ToFASTAFile(TextFile &File) const
18 	{
19 	File.PutFormat(">%s\n", m_ptrName);
20 	unsigned uColCount = Length();
21 	for (unsigned n = 0; n < uColCount; ++n)
22 		{
23 		if (n > 0 && n%60 == 0)
24 			File.PutString("\n");
25 		File.PutChar(at(n));
26 		}
27 	File.PutString("\n");
28 	}
29 
30 // Return true on end-of-file
FromFASTAFile(TextFile & File)31 bool Seq::FromFASTAFile(TextFile &File)
32 	{
33 	Clear();
34 
35 	char szLine[MAX_FASTA_LINE];
36 	bool bEof = File.GetLine(szLine, sizeof(szLine));
37 	if (bEof)
38 		return true;
39 	if ('>' != szLine[0])
40 		Quit("Expecting '>' in FASTA file %s line %u",
41 		  File.GetFileName(), File.GetLineNr());
42 
43 	size_t n = strlen(szLine);
44 	if (1 == n)
45 		Quit("Missing annotation following '>' in FASTA file %s line %u",
46 		  File.GetFileName(), File.GetLineNr());
47 
48 	m_ptrName = new char[n];
49 	strcpy(m_ptrName, szLine + 1);
50 
51 	TEXTFILEPOS Pos = File.GetPos();
52 	for (;;)
53 		{
54 		bEof = File.GetLine(szLine, sizeof(szLine));
55 		if (bEof)
56 			{
57 			if (0 == size())
58 				{
59 				Quit("Empty sequence in FASTA file %s line %u",
60 				  File.GetFileName(), File.GetLineNr());
61 				return true;
62 				}
63 			return false;
64 			}
65 		if ('>' == szLine[0])
66 			{
67 			if (0 == size())
68 				Quit("Empty sequence in FASTA file %s line %u",
69 				  File.GetFileName(), File.GetLineNr());
70 		// Rewind to beginning of this line, it's the start of the
71 		// next sequence.
72 			File.SetPos(Pos);
73 			return false;
74 			}
75 		const char *ptrChar = szLine;
76 		while (char c = *ptrChar++)
77 			{
78 			if (isspace(c))
79 				continue;
80 			if (IsGapChar(c))
81 				continue;
82 			if (!IsResidueChar(c))
83 				{
84 				if (isprint(c))
85 					{
86 					char w = GetWildcardChar();
87 					Warning("Invalid residue '%c' in FASTA file %s line %d, replaced by '%c'",
88 					  c, File.GetFileName(), File.GetLineNr(), w);
89 					c = w;
90 					}
91 				else
92 					Quit("Invalid byte hex %02x in FASTA file %s line %d",
93 					  (unsigned char) c, File.GetFileName(), File.GetLineNr());
94 				}
95 			c = toupper(c);
96 			push_back(c);
97 			}
98 		Pos = File.GetPos();
99 		}
100 	}
101 
ExtractUngapped(MSA & msa) const102 void Seq::ExtractUngapped(MSA &msa) const
103 	{
104 	msa.Clear();
105 	unsigned uColCount = Length();
106 	msa.SetSize(1, 1);
107 	unsigned uUngappedPos = 0;
108 	for (unsigned n = 0; n < uColCount; ++n)
109 		{
110 		char c = at(n);
111 		if (!IsGapChar(c))
112 			msa.SetChar(0, uUngappedPos++, c);
113 		}
114 	msa.SetSeqName(0, m_ptrName);
115 	}
116 
Copy(const Seq & rhs)117 void Seq::Copy(const Seq &rhs)
118 	{
119 	clear();
120 	const unsigned uLength = rhs.Length();
121 	for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex)
122 		push_back(rhs.at(uColIndex));
123 	const char *ptrName = rhs.GetName();
124 	size_t n = strlen(ptrName) + 1;
125 	m_ptrName = new char[n];
126 	strcpy(m_ptrName, ptrName);
127 	SetId(rhs.GetId());
128 	}
129 
CopyReversed(const Seq & rhs)130 void Seq::CopyReversed(const Seq &rhs)
131 	{
132 	clear();
133 	const unsigned uLength = rhs.Length();
134 	const unsigned uBase = rhs.Length() - 1;
135 	for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex)
136 		push_back(rhs.at(uBase - uColIndex));
137 	const char *ptrName = rhs.GetName();
138 	size_t n = strlen(ptrName) + 1;
139 	m_ptrName = new char[n];
140 	strcpy(m_ptrName, ptrName);
141 	}
142 
StripGaps()143 void Seq::StripGaps()
144 	{
145 	for (CharVect::iterator p = begin(); p != end(); )
146 		{
147 		char c = *p;
148 		if (IsGapChar(c))
149 			erase(p);
150 		else
151 			++p;
152 		}
153 	}
154 
StripGapsAndWhitespace()155 void Seq::StripGapsAndWhitespace()
156 	{
157 	for (CharVect::iterator p = begin(); p != end(); )
158 		{
159 		char c = *p;
160 		if (isspace(c) || IsGapChar(c))
161 			erase(p);
162 		else
163 			++p;
164 		}
165 	}
166 
ToUpper()167 void Seq::ToUpper()
168 	{
169 	for (CharVect::iterator p = begin(); p != end(); ++p)
170 		{
171 		char c = *p;
172 		if (islower(c))
173 			*p = toupper(c);
174 		}
175 	}
176 
GetLetter(unsigned uIndex) const177 unsigned Seq::GetLetter(unsigned uIndex) const
178 	{
179 	assert(uIndex < Length());
180 	char c = operator[](uIndex);
181 	return CharToLetter(c);
182 	}
183 
EqIgnoreCase(const Seq & s) const184 bool Seq::EqIgnoreCase(const Seq &s) const
185 	{
186 	const unsigned n = Length();
187 	if (n != s.Length())
188 		return false;
189 	for (unsigned i = 0; i < n; ++i)
190 		{
191 		const char c1 = at(i);
192 		const char c2 = s.at(i);
193 		if (IsGapChar(c1))
194 			{
195 			if (!IsGapChar(c2))
196 				return false;
197 			}
198 		else
199 			{
200 			if (toupper(c1) != toupper(c2))
201 				return false;
202 			}
203 		}
204 	return true;
205 	}
206 
Eq(const Seq & s) const207 bool Seq::Eq(const Seq &s) const
208 	{
209 	const unsigned n = Length();
210 	if (n != s.Length())
211 		return false;
212 	for (unsigned i = 0; i < n; ++i)
213 		{
214 		const char c1 = at(i);
215 		const char c2 = s.at(i);
216 		if (c1 != c2)
217 			return false;
218 		}
219 	return true;
220 	}
221 
EqIgnoreCaseAndGaps(const Seq & s) const222 bool Seq::EqIgnoreCaseAndGaps(const Seq &s) const
223 	{
224 	const unsigned uThisLength = Length();
225 	const unsigned uOtherLength = s.Length();
226 
227 	unsigned uThisPos = 0;
228 	unsigned uOtherPos = 0;
229 
230 	int cThis;
231 	int cOther;
232 	for (;;)
233 		{
234 		if (uThisPos == uThisLength && uOtherPos == uOtherLength)
235 			break;
236 
237 	// Set cThis to next non-gap character in this string
238 	// or -1 if end-of-string.
239 		for (;;)
240 			{
241 			if (uThisPos == uThisLength)
242 				{
243 				cThis = -1;
244 				break;
245 				}
246 			else
247 				{
248 				cThis = at(uThisPos);
249 				++uThisPos;
250 				if (!IsGapChar(cThis))
251 					{
252 					cThis = toupper(cThis);
253 					break;
254 					}
255 				}
256 			}
257 
258 	// Set cOther to next non-gap character in s
259 	// or -1 if end-of-string.
260 		for (;;)
261 			{
262 			if (uOtherPos == uOtherLength)
263 				{
264 				cOther = -1;
265 				break;
266 				}
267 			else
268 				{
269 				cOther = s.at(uOtherPos);
270 				++uOtherPos;
271 				if (!IsGapChar(cOther))
272 					{
273 					cOther = toupper(cOther);
274 					break;
275 					}
276 				}
277 			}
278 
279 	// Compare characters are corresponding ungapped position
280 		if (cThis != cOther)
281 			return false;
282 		}
283 	return true;
284 	}
285 
GetUngappedLength() const286 unsigned Seq::GetUngappedLength() const
287 	{
288 	unsigned uUngappedLength = 0;
289 	for (CharVect::const_iterator p = begin(); p != end(); ++p)
290 		{
291 		char c = *p;
292 		if (!IsGapChar(c))
293 			++uUngappedLength;
294 		}
295 	return uUngappedLength;
296 	}
297 
LogMe() const298 void Seq::LogMe() const
299 	{
300 	Log(">%s\n", m_ptrName);
301 	const unsigned n = Length();
302 	for (unsigned i = 0; i < n; ++i)
303 		Log("%c", at(i));
304 	Log("\n");
305 	}
306 
FromString(const char * pstrSeq,const char * pstrName)307 void Seq::FromString(const char *pstrSeq, const char *pstrName)
308 	{
309 	clear();
310 	const unsigned uLength = (unsigned) strlen(pstrSeq);
311 	for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex)
312 		push_back(pstrSeq[uColIndex]);
313 	size_t n = strlen(pstrName) + 1;
314 	m_ptrName = new char[n];
315 	strcpy(m_ptrName, pstrName);
316 	}
317 
HasGap() const318 bool Seq::HasGap() const
319 	{
320 	for (CharVect::const_iterator p = begin(); p != end(); ++p)
321 		{
322 		char c = *p;
323 		if (IsGapChar(c))
324 			return true;
325 		}
326 	return false;
327 	}
328 
FixAlpha()329 void Seq::FixAlpha()
330 	{
331 	for (CharVect::iterator p = begin(); p != end(); ++p)
332 		{
333 		char c = *p;
334 		if (!IsResidueChar(c))
335 			{
336 			char w = GetWildcardChar();
337 			// Warning("Invalid residue '%c', replaced by '%c'", c, w);
338 			InvalidLetterWarning(c, w);
339 			*p = w;
340 			}
341 		}
342 	}
343