1 #include "Parameters.h"
2 #include "Util.h"
3 #include "Debug.h"
4 #include "DBReader.h"
5 #include "DBWriter.h"
6 #include "CompressedA3M.h"
7 #include "MathUtil.h"
8 #include "Domain.h"
9 
10 #include <fstream>
11 #include <iomanip>
12 #include <map>
13 
14 #ifdef OPENMP
15 #include <omp.h>
16 #endif
17 
getOverlap(const std::vector<bool> & covered,unsigned int qStart,unsigned int qEnd)18 static inline float getOverlap(const std::vector<bool>& covered, unsigned int qStart, unsigned int qEnd) {
19     size_t counter = 0;
20     for (size_t i = qStart; i < qEnd; ++i) {
21         counter += covered[i];
22     }
23     return static_cast<float>(counter) / static_cast<float>(qEnd - qStart + 1);
24 }
25 
26 
mapDomains(const std::vector<Domain> & input,float overlap,float minCoverage,double eValThreshold)27 std::vector<Domain> mapDomains(const std::vector<Domain> &input, float overlap, float minCoverage,
28                                double eValThreshold) {
29     std::vector<Domain> result;
30     if (input.empty()) {
31         return result;
32     }
33 
34     std::vector<bool> covered(input[0].qLength, false);
35     for (size_t i = 0; i  < input.size(); i++) {
36         Domain domain = input[i];
37         if(domain.qStart > domain.qLength || domain.qEnd > domain.qLength){
38             Debug(Debug::WARNING) << "Query alignment start or end is greater than query length in set "
39             << domain.query << "! Skipping line.\n";
40             continue;
41         }
42         if(domain.qStart > domain.qEnd){
43             Debug(Debug::WARNING) << "Query alignment end is greater than start in set "
44             << domain.query << "! Skipping line.\n";
45             continue;
46         }
47         float percentageOverlap = getOverlap(covered, domain.qStart, domain.qEnd);
48         if(domain.tStart > domain.tEnd){
49             Debug(Debug::WARNING) << "Target alignment end is greater than start in set "
50             << domain.query << "! Skipping line.\n";
51             continue;
52         }
53         if(domain.tStart > domain.tLength || domain.tEnd > domain.tLength){
54             Debug(Debug::WARNING) << "Target alignment start or end is greater than target length in set "
55             << domain.query << "! Skipping line.\n";
56             continue;
57         }
58         float targetCov = MathUtil::getCoverage(domain.tStart, domain.tEnd, domain.tLength);
59         if (percentageOverlap <= overlap && targetCov > minCoverage && domain.eValue < eValThreshold) {
60             for (unsigned int j = domain.qStart; j < domain.qEnd; ++j) {
61                 covered[j] = true;
62             }
63             result.push_back(domain);
64         }
65     }
66     return result;
67 }
68 
readLength(const std::string & file)69 std::map<std::string, unsigned int> readLength(const std::string &file) {
70     std::ifstream mappingStream(file);
71     if (mappingStream.fail()) {
72         Debug(Debug::ERROR) << "File " << file << " not found!\n";
73         EXIT(EXIT_FAILURE);
74     }
75     std::map<std::string, unsigned int> mapping;
76     std::string line;
77     while (std::getline(mappingStream, line)) {
78         std::vector<std::string> split = Util::split(line, "\t");
79         unsigned int length = static_cast<unsigned int>(strtoul(split[1].c_str(), NULL, 10));
80         mapping.emplace(split[0], length);
81     }
82     return mapping;
83 }
84 
getEntries(unsigned int queryId,char * data,size_t length,const std::map<std::string,unsigned int> & lengths)85 std::vector<Domain> getEntries(unsigned int queryId, char *data, size_t length, const std::map<std::string, unsigned int> &lengths) {
86     std::vector<Domain> result;
87 
88     std::string query = SSTR(queryId);
89 
90     std::string line;
91     std::istringstream iss(std::string(data, length));
92     while (std::getline(iss, line)) {
93         std::vector<std::string> fields = Util::split(line.c_str(), "\t");
94 
95         unsigned int qStart = static_cast<unsigned int>(strtoul(fields[6].c_str(), NULL, 10)) - 1;
96         unsigned int qEnd = static_cast<unsigned int>(strtoul(fields[7].c_str(), NULL, 10)) - 1;
97 
98         std::map<std::string, unsigned int>::const_iterator it = lengths.lower_bound(query);
99         unsigned int qLength;
100         if(it != lengths.end()) {
101             qLength = (*it).second;
102         } else {
103             Debug(Debug::WARNING) << "Missing query length! Skipping line.\n";
104             continue;
105         }
106 
107         unsigned int tStart = static_cast<unsigned int>(strtoul(fields[8].c_str(), NULL, 10)) - 1;
108         unsigned int tEnd = static_cast<unsigned int>(strtoul(fields[9].c_str(), NULL, 10)) - 1;
109         it = lengths.lower_bound(fields[1]);
110         unsigned int tLength;
111         if(it != lengths.end()) {
112             tLength = (*it).second;
113         } else {
114             Debug(Debug::WARNING) << "Missing target length! Skipping line.\n";
115             continue;
116         }
117 
118         double eValue = strtod(fields[10].c_str(), NULL);
119 
120         result.push_back(Domain(query, qStart, qEnd, qLength, fields[1], tStart, tEnd, tLength, eValue));
121     }
122 
123     std::stable_sort(result.begin(), result.end());
124 
125     return result;
126 }
127 
doAnnotate(Parameters & par,DBReader<unsigned int> & blastTabReader,const std::pair<std::string,std::string> & resultdb,const size_t dbFrom,const size_t dbSize,bool merge)128 int doAnnotate(Parameters &par, DBReader<unsigned int> &blastTabReader,
129                const std::pair<std::string, std::string>& resultdb,
130                const size_t dbFrom, const size_t dbSize, bool merge) {
131     DBWriter writer(resultdb.first.c_str(), resultdb.second.c_str(), static_cast<unsigned int>(par.threads), par.compressed, Parameters::DBTYPE_ALIGNMENT_RES);
132     writer.open();
133 
134     std::map<std::string, unsigned int> lengths = readLength(par.db2);
135 
136     Debug::Progress progress(dbSize);
137 
138 #pragma omp parallel
139     {
140         unsigned int thread_idx = 0;
141 #ifdef OPENMP
142         thread_idx = static_cast<unsigned int>(omp_get_thread_num());
143 #endif
144 #pragma omp for schedule(dynamic, 100)
145         for (size_t i = dbFrom; i < dbFrom + dbSize; ++i) {
146             progress.updateProgress();
147             unsigned int id = blastTabReader.getDbKey(i);
148 
149             char *tabData = blastTabReader.getData(i, thread_idx);
150             size_t tabLength = blastTabReader.getEntryLen(i) - 1;
151             const std::vector<Domain> entries = getEntries(id, tabData, tabLength, lengths);
152             if (entries.empty()) {
153                 Debug(Debug::WARNING) << "Can not map any entries for entry " << id << "!\n";
154                 continue;
155             }
156 
157             std::vector<Domain> result = mapDomains(entries, par.overlap, par.covThr, par.evalThr);
158             if (result.empty()) {
159                 Debug(Debug::WARNING) << "Can not map any domains for entry " << id << "!\n";
160                 continue;
161             }
162 
163             std::ostringstream oss;
164             oss << std::setprecision(std::numeric_limits<float>::digits10);
165 
166             for (size_t j = 0; j < result.size(); j++) {
167                 Domain d = result[j];
168                 d.writeResult(oss);
169                 oss << "\n";
170             }
171 
172             std::string annotation = oss.str();
173             writer.writeData(annotation.c_str(), annotation.length(), id, thread_idx);
174         }
175     }
176 
177     writer.close(merge);
178 
179     return EXIT_SUCCESS;
180 }
181 
doAnnotate(Parameters & par,const unsigned int mpiRank,const unsigned int mpiNumProc)182 int doAnnotate(Parameters &par, const unsigned int mpiRank, const unsigned int mpiNumProc) {
183     DBReader<unsigned int> reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
184     reader.open(DBReader<unsigned int>::LINEAR_ACCCESS);
185 
186     size_t dbFrom = 0;
187     size_t dbSize = 0;
188     reader.decomposeDomainByAminoAcid(mpiRank, mpiNumProc, &dbFrom, &dbSize);
189     std::pair<std::string, std::string> tmpOutput = Util::createTmpFileNames(par.db3, par.db3Index, mpiRank);
190 
191     int status = doAnnotate(par, reader, tmpOutput, dbFrom, dbSize, true);
192 
193     reader.close();
194 
195 #ifdef HAVE_MPI
196     MPI_Barrier(MPI_COMM_WORLD);
197 #endif
198     // master reduces results
199     if(mpiRank == 0) {
200         std::vector<std::pair<std::string, std::string>> splitFiles;
201         for(unsigned int proc = 0; proc < mpiNumProc; ++proc){
202             std::pair<std::string, std::string> tmpFile = Util::createTmpFileNames(par.db3, par.db3Index, proc);
203             splitFiles.push_back(std::make_pair(tmpFile.first,  tmpFile.second));
204         }
205         DBWriter::mergeResults(par.db3, par.db3Index, splitFiles);
206     }
207     return status;
208 }
209 
doAnnotate(Parameters & par)210 int doAnnotate(Parameters &par) {
211     DBReader<unsigned int> reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
212     reader.open(DBReader<unsigned int>::LINEAR_ACCCESS);
213     size_t resultSize = reader.getSize();
214     int status = doAnnotate(par, reader, std::make_pair(par.db3, par.db3Index), 0, resultSize, false);
215     reader.close();
216     return status;
217 }
218 
summarizetabs(int argc,const char ** argv,const Command & command)219 int summarizetabs(int argc, const char **argv, const Command& command) {
220     Parameters& par = Parameters::getInstance();
221     par.parseParameters(argc, argv, command, true, 0, 0);
222 
223     MMseqsMPI::init(argc, argv);
224 
225 #ifdef HAVE_MPI
226     int status = doAnnotate(par, MMseqsMPI::rank, MMseqsMPI::numProc);
227 #else
228     int status = doAnnotate(par);
229 #endif
230 
231     return status;
232 }
233