1 /*
2 * sequencedb.cpp
3 * Mothur
4 *
5 * Created by Thomas Ryabin on 4/13/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7 *
8 */
9
10 #include "sequencedb.h"
11 #include "sequence.hpp"
12 #include "mothur.h"
13 #include "calculator.h"
14 #include "kmer.hpp"
15
16 /***********************************************************************/
17
SequenceDB()18 SequenceDB::SequenceDB() : StorageDatabase() {}
19 /***********************************************************************/
20 //the clear function free's the memory
~SequenceDB()21 SequenceDB::~SequenceDB() { data.clear(); }
22
23 /***********************************************************************/
24
SequenceDB(int newSize)25 SequenceDB::SequenceDB(int newSize) : StorageDatabase() { data.resize(newSize, Sequence()); }
26
27 /***********************************************************************/
28 //kmerDB[0] = vector<int> maxKmers long, contains kmer counts
SequenceDB(ifstream & filehandle,int kmerSize,vector<vector<int>> & kmerDB,vector<int> & lengths)29 SequenceDB::SequenceDB(ifstream& filehandle, int kmerSize, vector< vector< int > >& kmerDB, vector< int >& lengths) {
30 try{
31 Utils util; length = 0; samelength = true; lengths.clear(); kmerDB.clear();
32
33 int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
34
35 int maxKmer = power4s[kmerSize];
36
37 Kmer kmer(kmerSize);
38
39 while (!filehandle.eof()) {
40 //input sequence info into sequencedb
41 Sequence newSequence(filehandle); util.gobble(filehandle);
42
43 if (newSequence.getName() != "") {
44 if (length == 0) { length = newSequence.getAligned().length(); }
45 if (length != newSequence.getAligned().length()) { samelength = false; }
46
47 data.push_back(newSequence);
48
49 vector<int> kmerLocations; kmerLocations.resize(maxKmer+1, 0);
50
51 int numKmers = newSequence.getNumBases() - kmerSize + 1;
52
53 for(int i=0;i<numKmers;i++){
54 int kmerNumber = kmer.getKmerNumber(newSequence.getUnaligned(), i);
55
56 kmerLocations[kmerNumber]++; //ok, we've seen the kmer now
57 }
58
59 kmerDB.push_back(kmerLocations);
60 lengths.push_back(newSequence.getNumBases());
61 }
62 }
63
64 filehandle.close();
65
66 }
67 catch(exception& e) {
68 m->errorOut(e, "SequenceDB", "SequenceDB");
69 exit(1);
70 }
71 }
72 /***********************************************************************/
SequenceDB(ifstream & filehandle)73 SequenceDB::SequenceDB(ifstream& filehandle) : StorageDatabase() {
74 try{
75
76 //read through file
77 while (!filehandle.eof()) {
78 //input sequence info into sequencedb
79 Sequence newSequence(filehandle);
80
81 if (newSequence.getName() != "") {
82 if (length == 0) { length = newSequence.getAligned().length(); }
83 if (length != newSequence.getAligned().length()) { samelength = false; }
84 data.push_back(newSequence);
85 }
86
87 //takes care of white space
88 util.gobble(filehandle);
89 }
90
91 filehandle.close();
92
93 }
94 catch(exception& e) {
95 m->errorOut(e, "SequenceDB", "SequenceDB");
96 exit(1);
97 }
98 }
99 /***********************************************************************/
100
SequenceDB(const SequenceDB & sdb,set<string> names)101 SequenceDB::SequenceDB(const SequenceDB& sdb, set<string> names) : StorageDatabase() {
102 try{
103
104 int numSeqs = sdb.data.size();
105
106 for (int i = 0; i < numSeqs; i++) {
107
108 Sequence seqI = sdb.data[i];
109 if (names.count(seqI.getName()) != 0) {
110 if (length == 0) { length = seqI.getAligned().length(); }
111 if (length != seqI.getAligned().length()) { samelength = false; }
112 data.push_back(seqI);
113 }
114 }
115 }
116 catch(exception& e) {
117 m->errorOut(e, "SequenceDB", "SequenceDB");
118 exit(1);
119 }
120 }
121 /***********************************************************************/
122
SequenceDB(const SequenceDB & sdb,set<string> names,int kmerSize,vector<vector<int>> & kmerDB,vector<int> & lengths)123 SequenceDB::SequenceDB(const SequenceDB& sdb, set<string> names, int kmerSize, vector< vector< int > >& kmerDB, vector< int >& lengths) : StorageDatabase() {
124 try{
125
126 int numSeqs = sdb.data.size();
127
128 Utils util; length = 0; samelength = true; lengths.clear(); kmerDB.clear();
129
130 int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
131
132 int maxKmer = power4s[kmerSize];
133
134 Kmer kmer(kmerSize);
135
136 for (int i = 0; i < numSeqs; i++) {
137
138 Sequence newSequence = sdb.data[i];
139
140 if (names.count(newSequence.getName()) != 0) {
141 if (length == 0) { length = newSequence.getAligned().length(); }
142 if (length != newSequence.getAligned().length()) { samelength = false; }
143
144 data.push_back(newSequence);
145
146 vector<int> kmerLocations; kmerLocations.resize(maxKmer+1, 0);
147
148 int numKmers = newSequence.getNumBases() - kmerSize + 1;
149
150 for(int i=0;i<numKmers;i++){
151 int kmerNumber = kmer.getKmerNumber(newSequence.getUnaligned(), i);
152
153 kmerLocations[kmerNumber]++; //ok, we've seen the kmer now
154 }
155
156 kmerDB.push_back(kmerLocations);
157 lengths.push_back(newSequence.getNumBases());
158 }
159 }
160 }
161 catch(exception& e) {
162 m->errorOut(e, "SequenceDB", "SequenceDB");
163 exit(1);
164 }
165 }
166 /***********************************************************************/
167
getNumSeqs()168 int SequenceDB::getNumSeqs() {
169 return data.size();
170 }
171
172 /***********************************************************************/
getSeq(int index)173 Sequence SequenceDB::getSeq(int index) {
174 return data[index];
175 }
176 /***********************************************************************/
177
push_back(Sequence newSequence)178 void SequenceDB::push_back(Sequence newSequence) {
179 try {
180 if (length == 0) { length = newSequence.getAligned().length(); }
181 if (length != newSequence.getAligned().length()) { samelength = false; }
182
183 data.push_back(newSequence);
184 }
185 catch(exception& e) {
186 m->errorOut(e, "SequenceDB", "push_back");
187 exit(1);
188 }
189 }
190 /***********************************************************************/
191
print(string outputFileName)192 void SequenceDB::print(string outputFileName) {
193 try {
194 ofstream out; util.openOutputFile(outputFileName, out);
195
196 for (int i = 0; i < data.size(); i++) {
197 data[i].printSequence(out);
198 }
199 out.close();
200 }
201 catch(exception& e) {
202 m->errorOut(e, "SequenceDB", "print");
203 exit(1);
204 }
205 }
206
207 /***********************************************************************/
208
209