1 /**
2 * Author: Mark Larkin
3 *
4 * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
5 */
6 #ifdef HAVE_CONFIG_H
7 #include "config.h"
8 #endif
9 #include <algorithm>
10 #include "Sequence.h"
11
12 using namespace std;
13
14 namespace clustalw
15 {
16
17 /**
18 *
19 * @param seq
20 * @param name
21 * @param title
22 * @return
23 */
Sequence(string & seq,string & name,string & title)24 Sequence::Sequence(string& seq, string& name, string& title)
25 {
26 copyStringIntoVector(&_sequence, &seq);
27 encodeSequence();
28 _name = name;
29 _title = title;
30 identifier = utilityObject->getUniqueSequenceIdentifier();
31 }
32
Sequence(std::string & seq,std::string & name,std::string & title,unsigned long id)33 Sequence::Sequence(std::string& seq, std::string& name, std::string& title, unsigned long id)
34 {
35 copyStringIntoVector(&_sequence, &seq);
36 encodeSequence();
37 _name = name;
38 _title = title;
39 identifier = id;
40 }
41
42 /**
43 * This is an overloaded contructor that is used to construct a seq object from an
44 * encoded sequenced instead of a string.
45 * @param encodedSequence
46 * @param name
47 * @param title
48 * @param id The unique identifier from the previous sequence!!!
49 * @return
50 */
Sequence(std::vector<int> * encodedSequence,std::string & name,std::string & title,unsigned long id)51 Sequence::Sequence(std::vector<int>* encodedSequence, std::string& name, std::string& title,
52 unsigned long id)
53 {
54 _encodedSequence = *encodedSequence;
55 _name = name;
56 _title = title;
57 identifier = id;
58 }
59
60 /**
61 *
62 */
encodeSequence()63 void Sequence::encodeSequence()
64 {
65 /* code seq as ints .. use gapPos2 for gap */
66 std::vector<char>::iterator it;
67
68 _encodedSequence.push_back(0);
69
70 for(it = _sequence.begin(); it != _sequence.end(); ++it)
71 {
72 if (*it == '-')
73 {
74 _encodedSequence.push_back(userParameters->getGapPos2());
75 }
76 else
77 {
78 _encodedSequence.push_back(userParameters->resIndex(
79 userParameters->getAminoAcidCodes(), *it));
80 }
81 }
82 }
83
84 /**
85 *
86 * @param _vectorTo
87 * @param _stringFrom
88 */
copyStringIntoVector(vector<char> * _vectorTo,string * _stringFrom)89 void Sequence::copyStringIntoVector(vector<char>* _vectorTo, string* _stringFrom)
90 {
91 _vectorTo->clear();
92
93 for(int i = 0; i < (int)_stringFrom->size(); i++)
94 {
95 _vectorTo->push_back(_stringFrom->at(i));
96 }
97
98 if(_vectorTo->size() != _stringFrom->size())
99 {
100 std::cerr << "Error: In function copyStringIntoVector. Strings different length!\n";
101 exit(1);
102 }
103 }
104
105 /**
106 *
107 */
printSequence()108 void Sequence::printSequence()
109 {
110 std::cout << "This is the sequence and the encoded sequence " << _name << std::endl;
111
112 std::vector<char>::iterator itChar;
113 for(itChar = _sequence.begin(); itChar != _sequence.end(); ++itChar)
114 {
115 cout << *itChar;
116 }
117 cout << std::endl;
118
119 std::vector<int>::iterator itInt;
120 for(itInt = _encodedSequence.begin(); itInt != _encodedSequence.end(); ++itInt)
121 {
122 cout << " " << *itInt;
123 }
124 cout << std::endl;
125 }
126
127 /**
128 *
129 */
checkIntegrity()130 void Sequence::checkIntegrity()
131 {
132 // The sequences should be the same length.
133 if(_sequence.size() != _encodedSequence.size())
134 {
135 std::cerr << "Error: _sequence is not same size as _encodedSequence\n";
136 exit(1);
137 }
138 }
139
140 /**
141 *
142 * @return the encoded sequence, this is what is used in the pairwise!
143 */
getSequence()144 std::vector<int>* Sequence::getSequence()
145 {
146 return &_encodedSequence;
147 }
148
149 /**
150 *
151 * @return
152 */
getName()153 std::string Sequence::getName()
154 {
155 return _name;
156 }
157
158 /**
159 *
160 * @return
161 */
getTitle()162 std::string Sequence::getTitle()
163 {
164 return _title;
165 }
166
167 /**
168 *
169 * @return
170 */
isEmpty()171 bool Sequence::isEmpty()
172 {
173 if(_sequence.size() == 0)
174 {
175 return true;
176 }
177 else
178 {
179 return false;
180 }
181
182 }
183
184 /**
185 *
186 * @return
187 */
checkDNAFlag()188 bool Sequence::checkDNAFlag()
189 // check if DNA or Protein
190 // The decision is based on counting all A,C,G,T,U or N.
191 // If >= 85% of all characters (except -) are as above => DNA
192 {
193 int c, numResidues, numBases;
194 float ratio;
195 string dna_codes = "ACGTUN";
196
197 numResidues = numBases = 0;
198
199 vector<char>::iterator seqIterator = _sequence.begin();
200
201 while (seqIterator != _sequence.end())
202 {
203 if (*seqIterator != '-')
204 {
205 numResidues++;
206 if (*seqIterator == 'N')
207 {
208 numBases++;
209 }
210 else
211 {
212 c = userParameters->resIndex(dna_codes, *seqIterator);
213 if (c >= 0)
214 {
215 numBases++;
216 }
217 }
218 }
219 seqIterator++;
220 }
221
222 if ((numBases == 0) || (numResidues == 0))
223 {
224 return false;
225 }
226 ratio = (float)numBases / (float)numResidues;
227
228 if (ratio >= 0.85)
229 {
230 return true;
231 }
232 else
233 {
234 return false;
235 }
236 }
237
238 }
239
240