1 /*
2  *  sequencedb.cpp
3  *  Mothur
4  *
5  *  Created by Thomas Ryabin on 4/13/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9 
10 #include "sequencedb.h"
11 #include "sequence.hpp"
12 #include "mothur.h"
13 #include "calculator.h"
14 #include "kmer.hpp"
15 
16 /***********************************************************************/
17 
SequenceDB()18 SequenceDB::SequenceDB() : StorageDatabase() {}
19 /***********************************************************************/
20 //the clear function free's the memory
~SequenceDB()21 SequenceDB::~SequenceDB() { data.clear(); }
22 
23 /***********************************************************************/
24 
SequenceDB(int newSize)25 SequenceDB::SequenceDB(int newSize) : StorageDatabase() { data.resize(newSize, Sequence()); }
26 
27 /***********************************************************************/
28 //kmerDB[0] = vector<int> maxKmers long, contains kmer counts
SequenceDB(ifstream & filehandle,int kmerSize,vector<vector<int>> & kmerDB,vector<int> & lengths)29 SequenceDB::SequenceDB(ifstream& filehandle, int kmerSize, vector< vector< int > >& kmerDB, vector< int >& lengths) {
30     try{
31         Utils util; length = 0; samelength = true; lengths.clear(); kmerDB.clear();
32 
33         int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
34 
35         int maxKmer = power4s[kmerSize];
36 
37         Kmer kmer(kmerSize);
38 
39         while (!filehandle.eof()) {
40             //input sequence info into sequencedb
41             Sequence newSequence(filehandle); util.gobble(filehandle);
42 
43             if (newSequence.getName() != "") {
44                 if (length == 0) { length = newSequence.getAligned().length(); }
45                 if (length != newSequence.getAligned().length()) { samelength = false;  }
46 
47                 data.push_back(newSequence);
48 
49                 vector<int> kmerLocations; kmerLocations.resize(maxKmer+1, 0);
50 
51                 int numKmers = newSequence.getNumBases() - kmerSize + 1;
52 
53                 for(int i=0;i<numKmers;i++){
54                     int kmerNumber = kmer.getKmerNumber(newSequence.getUnaligned(), i);
55 
56                     kmerLocations[kmerNumber]++; //ok, we've seen the kmer now
57                 }
58 
59                 kmerDB.push_back(kmerLocations);
60                 lengths.push_back(newSequence.getNumBases());
61             }
62         }
63 
64         filehandle.close();
65 
66     }
67     catch(exception& e) {
68         m->errorOut(e, "SequenceDB", "SequenceDB");
69         exit(1);
70     }
71 }
72 /***********************************************************************/
SequenceDB(ifstream & filehandle)73 SequenceDB::SequenceDB(ifstream& filehandle) : StorageDatabase() {
74 	try{
75 
76 		//read through file
77 		while (!filehandle.eof()) {
78 			//input sequence info into sequencedb
79 			Sequence newSequence(filehandle);
80 
81 			if (newSequence.getName() != "") {
82 				if (length == 0) { length = newSequence.getAligned().length(); }
83                 if (length != newSequence.getAligned().length()) { samelength = false;  }
84 				data.push_back(newSequence);
85 			}
86 
87 			//takes care of white space
88 			util.gobble(filehandle);
89 		}
90 
91 		filehandle.close();
92 
93 	}
94 	catch(exception& e) {
95 		m->errorOut(e, "SequenceDB", "SequenceDB");
96 		exit(1);
97 	}
98 }
99 /***********************************************************************/
100 
SequenceDB(const SequenceDB & sdb,set<string> names)101 SequenceDB::SequenceDB(const SequenceDB& sdb, set<string> names) : StorageDatabase() {
102     try{
103 
104         int numSeqs = sdb.data.size();
105 
106         for (int i = 0; i < numSeqs; i++) {
107 
108             Sequence seqI = sdb.data[i];
109             if (names.count(seqI.getName()) != 0) {
110                 if (length == 0) { length = seqI.getAligned().length(); }
111                 if (length != seqI.getAligned().length()) { samelength = false;  }
112                 data.push_back(seqI);
113             }
114         }
115     }
116     catch(exception& e) {
117         m->errorOut(e, "SequenceDB", "SequenceDB");
118         exit(1);
119     }
120 }
121 /***********************************************************************/
122 
SequenceDB(const SequenceDB & sdb,set<string> names,int kmerSize,vector<vector<int>> & kmerDB,vector<int> & lengths)123 SequenceDB::SequenceDB(const SequenceDB& sdb, set<string> names, int kmerSize, vector< vector< int > >& kmerDB, vector< int >& lengths) : StorageDatabase() {
124     try{
125 
126         int numSeqs = sdb.data.size();
127 
128         Utils util; length = 0; samelength = true; lengths.clear(); kmerDB.clear();
129 
130         int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
131 
132         int maxKmer = power4s[kmerSize];
133 
134         Kmer kmer(kmerSize);
135 
136         for (int i = 0; i < numSeqs; i++) {
137 
138             Sequence newSequence = sdb.data[i];
139 
140             if (names.count(newSequence.getName()) != 0) {
141                 if (length == 0) { length = newSequence.getAligned().length(); }
142                 if (length != newSequence.getAligned().length()) { samelength = false;  }
143 
144                 data.push_back(newSequence);
145 
146                 vector<int> kmerLocations; kmerLocations.resize(maxKmer+1, 0);
147 
148                 int numKmers = newSequence.getNumBases() - kmerSize + 1;
149 
150                 for(int i=0;i<numKmers;i++){
151                     int kmerNumber = kmer.getKmerNumber(newSequence.getUnaligned(), i);
152 
153                     kmerLocations[kmerNumber]++; //ok, we've seen the kmer now
154                 }
155 
156                 kmerDB.push_back(kmerLocations);
157                 lengths.push_back(newSequence.getNumBases());
158             }
159         }
160     }
161     catch(exception& e) {
162         m->errorOut(e, "SequenceDB", "SequenceDB");
163         exit(1);
164     }
165 }
166 /***********************************************************************/
167 
getNumSeqs()168 int SequenceDB::getNumSeqs() {
169 	return data.size();
170 }
171 
172 /***********************************************************************/
getSeq(int index)173 Sequence SequenceDB::getSeq(int index) {
174 	return data[index];
175 }
176 /***********************************************************************/
177 
push_back(Sequence newSequence)178 void SequenceDB::push_back(Sequence newSequence) {
179 	try {
180 		if (length == 0) { length = newSequence.getAligned().length(); }
181 		if (length != newSequence.getAligned().length()) { samelength = false; }
182 
183 		data.push_back(newSequence);
184 	}
185 	catch(exception& e) {
186 		m->errorOut(e, "SequenceDB", "push_back");
187 		exit(1);
188 	}
189 }
190 /***********************************************************************/
191 
print(string outputFileName)192 void SequenceDB::print(string outputFileName) {
193     try {
194         ofstream out; util.openOutputFile(outputFileName, out);
195 
196         for (int i = 0; i < data.size(); i++) {
197             data[i].printSequence(out);
198         }
199         out.close();
200     }
201     catch(exception& e) {
202         m->errorOut(e, "SequenceDB", "print");
203         exit(1);
204     }
205 }
206 
207 /***********************************************************************/
208 
209