1 #include "Parameters.h"
2 #include "Util.h"
3 #include "Debug.h"
4 #include "DBReader.h"
5 #include "DBWriter.h"
6 #include "CompressedA3M.h"
7 #include "MathUtil.h"
8 #include "Domain.h"
9
10 #include <fstream>
11 #include <iomanip>
12 #include <map>
13
14 #ifdef OPENMP
15 #include <omp.h>
16 #endif
17
getOverlap(const std::vector<bool> & covered,unsigned int qStart,unsigned int qEnd)18 static inline float getOverlap(const std::vector<bool>& covered, unsigned int qStart, unsigned int qEnd) {
19 size_t counter = 0;
20 for (size_t i = qStart; i < qEnd; ++i) {
21 counter += covered[i];
22 }
23 return static_cast<float>(counter) / static_cast<float>(qEnd - qStart + 1);
24 }
25
26
mapDomains(const std::vector<Domain> & input,float overlap,float minCoverage,double eValThreshold)27 std::vector<Domain> mapDomains(const std::vector<Domain> &input, float overlap, float minCoverage,
28 double eValThreshold) {
29 std::vector<Domain> result;
30 if (input.empty()) {
31 return result;
32 }
33
34 std::vector<bool> covered(input[0].qLength, false);
35 for (size_t i = 0; i < input.size(); i++) {
36 Domain domain = input[i];
37 if(domain.qStart > domain.qLength || domain.qEnd > domain.qLength){
38 Debug(Debug::WARNING) << "Query alignment start or end is greater than query length in set "
39 << domain.query << "! Skipping line.\n";
40 continue;
41 }
42 if(domain.qStart > domain.qEnd){
43 Debug(Debug::WARNING) << "Query alignment end is greater than start in set "
44 << domain.query << "! Skipping line.\n";
45 continue;
46 }
47 float percentageOverlap = getOverlap(covered, domain.qStart, domain.qEnd);
48 if(domain.tStart > domain.tEnd){
49 Debug(Debug::WARNING) << "Target alignment end is greater than start in set "
50 << domain.query << "! Skipping line.\n";
51 continue;
52 }
53 if(domain.tStart > domain.tLength || domain.tEnd > domain.tLength){
54 Debug(Debug::WARNING) << "Target alignment start or end is greater than target length in set "
55 << domain.query << "! Skipping line.\n";
56 continue;
57 }
58 float targetCov = MathUtil::getCoverage(domain.tStart, domain.tEnd, domain.tLength);
59 if (percentageOverlap <= overlap && targetCov > minCoverage && domain.eValue < eValThreshold) {
60 for (unsigned int j = domain.qStart; j < domain.qEnd; ++j) {
61 covered[j] = true;
62 }
63 result.push_back(domain);
64 }
65 }
66 return result;
67 }
68
readLength(const std::string & file)69 std::map<std::string, unsigned int> readLength(const std::string &file) {
70 std::ifstream mappingStream(file);
71 if (mappingStream.fail()) {
72 Debug(Debug::ERROR) << "File " << file << " not found!\n";
73 EXIT(EXIT_FAILURE);
74 }
75 std::map<std::string, unsigned int> mapping;
76 std::string line;
77 while (std::getline(mappingStream, line)) {
78 std::vector<std::string> split = Util::split(line, "\t");
79 unsigned int length = static_cast<unsigned int>(strtoul(split[1].c_str(), NULL, 10));
80 mapping.emplace(split[0], length);
81 }
82 return mapping;
83 }
84
getEntries(unsigned int queryId,char * data,size_t length,const std::map<std::string,unsigned int> & lengths)85 std::vector<Domain> getEntries(unsigned int queryId, char *data, size_t length, const std::map<std::string, unsigned int> &lengths) {
86 std::vector<Domain> result;
87
88 std::string query = SSTR(queryId);
89
90 std::string line;
91 std::istringstream iss(std::string(data, length));
92 while (std::getline(iss, line)) {
93 std::vector<std::string> fields = Util::split(line.c_str(), "\t");
94
95 unsigned int qStart = static_cast<unsigned int>(strtoul(fields[6].c_str(), NULL, 10)) - 1;
96 unsigned int qEnd = static_cast<unsigned int>(strtoul(fields[7].c_str(), NULL, 10)) - 1;
97
98 std::map<std::string, unsigned int>::const_iterator it = lengths.lower_bound(query);
99 unsigned int qLength;
100 if(it != lengths.end()) {
101 qLength = (*it).second;
102 } else {
103 Debug(Debug::WARNING) << "Missing query length! Skipping line.\n";
104 continue;
105 }
106
107 unsigned int tStart = static_cast<unsigned int>(strtoul(fields[8].c_str(), NULL, 10)) - 1;
108 unsigned int tEnd = static_cast<unsigned int>(strtoul(fields[9].c_str(), NULL, 10)) - 1;
109 it = lengths.lower_bound(fields[1]);
110 unsigned int tLength;
111 if(it != lengths.end()) {
112 tLength = (*it).second;
113 } else {
114 Debug(Debug::WARNING) << "Missing target length! Skipping line.\n";
115 continue;
116 }
117
118 double eValue = strtod(fields[10].c_str(), NULL);
119
120 result.push_back(Domain(query, qStart, qEnd, qLength, fields[1], tStart, tEnd, tLength, eValue));
121 }
122
123 std::stable_sort(result.begin(), result.end());
124
125 return result;
126 }
127
doAnnotate(Parameters & par,DBReader<unsigned int> & blastTabReader,const std::pair<std::string,std::string> & resultdb,const size_t dbFrom,const size_t dbSize,bool merge)128 int doAnnotate(Parameters &par, DBReader<unsigned int> &blastTabReader,
129 const std::pair<std::string, std::string>& resultdb,
130 const size_t dbFrom, const size_t dbSize, bool merge) {
131 DBWriter writer(resultdb.first.c_str(), resultdb.second.c_str(), static_cast<unsigned int>(par.threads), par.compressed, Parameters::DBTYPE_ALIGNMENT_RES);
132 writer.open();
133
134 std::map<std::string, unsigned int> lengths = readLength(par.db2);
135
136 Debug::Progress progress(dbSize);
137
138 #pragma omp parallel
139 {
140 unsigned int thread_idx = 0;
141 #ifdef OPENMP
142 thread_idx = static_cast<unsigned int>(omp_get_thread_num());
143 #endif
144 #pragma omp for schedule(dynamic, 100)
145 for (size_t i = dbFrom; i < dbFrom + dbSize; ++i) {
146 progress.updateProgress();
147 unsigned int id = blastTabReader.getDbKey(i);
148
149 char *tabData = blastTabReader.getData(i, thread_idx);
150 size_t tabLength = blastTabReader.getEntryLen(i) - 1;
151 const std::vector<Domain> entries = getEntries(id, tabData, tabLength, lengths);
152 if (entries.empty()) {
153 Debug(Debug::WARNING) << "Can not map any entries for entry " << id << "!\n";
154 continue;
155 }
156
157 std::vector<Domain> result = mapDomains(entries, par.overlap, par.covThr, par.evalThr);
158 if (result.empty()) {
159 Debug(Debug::WARNING) << "Can not map any domains for entry " << id << "!\n";
160 continue;
161 }
162
163 std::ostringstream oss;
164 oss << std::setprecision(std::numeric_limits<float>::digits10);
165
166 for (size_t j = 0; j < result.size(); j++) {
167 Domain d = result[j];
168 d.writeResult(oss);
169 oss << "\n";
170 }
171
172 std::string annotation = oss.str();
173 writer.writeData(annotation.c_str(), annotation.length(), id, thread_idx);
174 }
175 }
176
177 writer.close(merge);
178
179 return EXIT_SUCCESS;
180 }
181
doAnnotate(Parameters & par,const unsigned int mpiRank,const unsigned int mpiNumProc)182 int doAnnotate(Parameters &par, const unsigned int mpiRank, const unsigned int mpiNumProc) {
183 DBReader<unsigned int> reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
184 reader.open(DBReader<unsigned int>::LINEAR_ACCCESS);
185
186 size_t dbFrom = 0;
187 size_t dbSize = 0;
188 reader.decomposeDomainByAminoAcid(mpiRank, mpiNumProc, &dbFrom, &dbSize);
189 std::pair<std::string, std::string> tmpOutput = Util::createTmpFileNames(par.db3, par.db3Index, mpiRank);
190
191 int status = doAnnotate(par, reader, tmpOutput, dbFrom, dbSize, true);
192
193 reader.close();
194
195 #ifdef HAVE_MPI
196 MPI_Barrier(MPI_COMM_WORLD);
197 #endif
198 // master reduces results
199 if(mpiRank == 0) {
200 std::vector<std::pair<std::string, std::string>> splitFiles;
201 for(unsigned int proc = 0; proc < mpiNumProc; ++proc){
202 std::pair<std::string, std::string> tmpFile = Util::createTmpFileNames(par.db3, par.db3Index, proc);
203 splitFiles.push_back(std::make_pair(tmpFile.first, tmpFile.second));
204 }
205 DBWriter::mergeResults(par.db3, par.db3Index, splitFiles);
206 }
207 return status;
208 }
209
doAnnotate(Parameters & par)210 int doAnnotate(Parameters &par) {
211 DBReader<unsigned int> reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
212 reader.open(DBReader<unsigned int>::LINEAR_ACCCESS);
213 size_t resultSize = reader.getSize();
214 int status = doAnnotate(par, reader, std::make_pair(par.db3, par.db3Index), 0, resultSize, false);
215 reader.close();
216 return status;
217 }
218
summarizetabs(int argc,const char ** argv,const Command & command)219 int summarizetabs(int argc, const char **argv, const Command& command) {
220 Parameters& par = Parameters::getInstance();
221 par.parseParameters(argc, argv, command, true, 0, 0);
222
223 MMseqsMPI::init(argc, argv);
224
225 #ifdef HAVE_MPI
226 int status = doAnnotate(par, MMseqsMPI::rank, MMseqsMPI::numProc);
227 #else
228 int status = doAnnotate(par);
229 #endif
230
231 return status;
232 }
233