1 #include "Util.h"
2 #include "Parameters.h"
3 #include "Matcher.h"
4 #include "Debug.h"
5 #include "DBReader.h"
6 #include "DBWriter.h"
7 #include "IndexReader.h"
8 #include "FileUtil.h"
9 #include "TranslateNucl.h"
10 #include "Sequence.h"
11 #include "Orf.h"
12 #include "MemoryMapped.h"
13 #include "NcbiTaxonomy.h"
14
15 #define ZSTD_STATIC_LINKING_ONLY
16 #include <zstd.h>
17 #include "result_viz_prelude.html.zst.h"
18
19 #include <map>
20
21 #ifdef OPENMP
22 #include <omp.h>
23 #endif
24
25
printSeqBasedOnAln(std::string & out,const char * seq,unsigned int offset,const std::string & bt,bool reverse,bool isReverseStrand,bool translateSequence,const TranslateNucl & translateNucl)26 void printSeqBasedOnAln(std::string &out, const char *seq, unsigned int offset,
27 const std::string &bt, bool reverse, bool isReverseStrand,
28 bool translateSequence, const TranslateNucl &translateNucl) {
29 unsigned int seqPos = 0;
30 char codon[3];
31 for (uint32_t i = 0; i < bt.size(); ++i) {
32 char seqChar = (isReverseStrand == true) ? Orf::complement(seq[offset - seqPos]) : seq[offset + seqPos];
33 if (translateSequence) {
34 codon[0] = (isReverseStrand == true) ? Orf::complement(seq[offset - seqPos]) : seq[offset + seqPos];
35 codon[1] = (isReverseStrand == true) ? Orf::complement(seq[offset - (seqPos+1)]) : seq[offset + (seqPos+1)];
36 codon[2] = (isReverseStrand == true) ? Orf::complement(seq[offset - (seqPos+2)]) : seq[offset + (seqPos+2)];
37 seqChar = translateNucl.translateSingleCodon(codon);
38 }
39 switch (bt[i]) {
40 case 'M':
41 out.append(1, seqChar);
42 seqPos += (translateSequence) ? 3 : 1;
43 break;
44 case 'I':
45 if (reverse == true) {
46 out.append(1, '-');
47 } else {
48 out.append(1, seqChar);
49 seqPos += (translateSequence) ? 3 : 1;
50 }
51 break;
52 case 'D':
53 if (reverse == true) {
54 out.append(1, seqChar);
55 seqPos += (translateSequence) ? 3 : 1;
56 } else {
57 out.append(1, '-');
58 }
59 break;
60 }
61 }
62 }
63
64 /*
65 query Query sequence label
66 target Target sequenc label
67 evalue E-value
68 gapopen Number of gap opens
69 pident Percentage of identical matches
70 nident Number of identical matches
71 qstart 1-based start position of alignment in query sequence
72 qend 1-based end position of alignment in query sequence
73 qlen Query sequence length
74 tstart 1-based start position of alignment in target sequence
75 tend 1-based end position of alignment in target sequence
76 tlen Target sequence length
77 alnlen Number of alignment columns
78 raw Raw alignment score
79 bits Bit score
80 cigar Alignment as string M=letter pair, D=delete (gap in query), I=insert (gap in target)
81 qseq Full-length query sequence
82 tseq Full-length target sequence
83 qheader Header of Query sequence
84 theader Header of Target sequence
85 qaln Aligned query sequence with gaps
86 taln Aligned target sequence with gaps
87 qframe Query frame (-3 to +3)
88 tframe Target frame (-3 to +3)
89 mismatch Number of mismatches
90 qcov Fraction of query sequence covered by alignment
91 tcov Fraction of target sequence covered by alignment
92 qset Query set
93 tset Target set
94 */
95
readKeyToSet(const std::string & file)96 std::map<unsigned int, unsigned int> readKeyToSet(const std::string& file) {
97 std::map<unsigned int, unsigned int> mapping;
98 if (file.length() == 0) {
99 return mapping;
100 }
101
102 MemoryMapped lookup(file, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
103 char* data = (char *) lookup.getData();
104 const char* entry[255];
105 while (*data != '\0') {
106 const size_t columns = Util::getWordsOfLine(data, entry, 255);
107 if (columns < 3) {
108 Debug(Debug::WARNING) << "Not enough columns in lookup file " << file << "\n";
109 continue;
110 }
111 mapping.emplace(Util::fast_atoi<unsigned int>(entry[0]), Util::fast_atoi<unsigned int>(entry[2]));
112 data = Util::skipLine(data);
113 }
114 lookup.close();
115 return mapping;
116 }
117
118
readSetToSource(const std::string & file)119 std::map<unsigned int, std::string> readSetToSource(const std::string& file) {
120 std::map<unsigned int, std::string> mapping;
121 if (file.length() == 0) {
122 return mapping;
123 }
124
125 MemoryMapped source(file, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
126 char* data = (char *) source.getData();
127 const char* entry[255];
128 while (*data != '\0') {
129 const size_t columns = Util::getWordsOfLine(data, entry, 255);
130 if (columns < 2) {
131 Debug(Debug::WARNING) << "Not enough columns in lookup file " << file << "\n";
132 continue;
133 }
134 data = Util::skipLine(data);
135 std::string source(entry[1], data - entry[1] - 1);
136 mapping.emplace(Util::fast_atoi<unsigned int>(entry[0]), source);
137 }
138 source.close();
139 return mapping;
140 }
141
compareToFirstInt(const std::pair<unsigned int,unsigned int> & lhs,const std::pair<unsigned int,unsigned int> & rhs)142 static bool compareToFirstInt(const std::pair<unsigned int, unsigned int>& lhs, const std::pair<unsigned int, unsigned int>& rhs){
143 return (lhs.first <= rhs.first);
144 }
145
convertalignments(int argc,const char ** argv,const Command & command)146 int convertalignments(int argc, const char **argv, const Command &command) {
147 Parameters &par = Parameters::getInstance();
148 par.parseParameters(argc, argv, command, true, 0, 0);
149
150 const bool sameDB = par.db1.compare(par.db2) == 0 ? true : false;
151 const int format = par.formatAlignmentMode;
152 const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP);
153
154 bool needSequenceDB = false;
155 bool needBacktrace = false;
156 bool needFullHeaders = false;
157 bool needLookup = false;
158 bool needSource = false;
159 bool needTaxonomy = false;
160 bool needTaxonomyMapping = false;
161 const std::vector<int> outcodes = Parameters::getOutputFormat(format, par.outfmt, needSequenceDB, needBacktrace, needFullHeaders,
162 needLookup, needSource, needTaxonomyMapping, needTaxonomy);
163
164 NcbiTaxonomy * t = NULL;
165 std::vector<std::pair<unsigned int, unsigned int>> mapping;
166 if(needTaxonomy){
167 std::string db2NoIndexName = PrefilteringIndexReader::dbPathWithoutIndex(par.db2);
168 t = NcbiTaxonomy::openTaxonomy(db2NoIndexName);
169 }
170 if(needTaxonomy || needTaxonomyMapping){
171 std::string db2NoIndexName = PrefilteringIndexReader::dbPathWithoutIndex(par.db2);
172 if(FileUtil::fileExists(std::string(db2NoIndexName + "_mapping").c_str()) == false){
173 Debug(Debug::ERROR) << db2NoIndexName + "_mapping" << " does not exist. Please create the taxonomy mapping!\n";
174 EXIT(EXIT_FAILURE);
175 }
176 bool isSorted = Util::readMapping( db2NoIndexName + "_mapping", mapping);
177 if(isSorted == false){
178 std::stable_sort(mapping.begin(), mapping.end(), compareToFirstInt);
179 }
180 }
181
182 bool isTranslatedSearch = false;
183
184 int dbaccessMode = needSequenceDB ? (DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA) : (DBReader<unsigned int>::USE_INDEX);
185
186 std::map<unsigned int, unsigned int> qKeyToSet;
187 std::map<unsigned int, unsigned int> tKeyToSet;
188 if (needLookup) {
189 std::string file1 = par.db1 + ".lookup";
190 std::string file2 = par.db2 + ".lookup";
191 qKeyToSet = readKeyToSet(file1);
192 tKeyToSet = readKeyToSet(file2);
193 }
194
195 std::map<unsigned int, std::string> qSetToSource;
196 std::map<unsigned int, std::string> tSetToSource;
197 if (needSource) {
198 std::string file1 = par.db1 + ".source";
199 std::string file2 = par.db2 + ".source";
200 qSetToSource = readSetToSource(file1);
201 tSetToSource = readSetToSource(file2);
202 }
203
204 IndexReader qDbr(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode);
205 IndexReader qDbrHeader(par.db1, par.threads, IndexReader::SRC_HEADERS , (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0);
206
207 IndexReader *tDbr;
208 IndexReader *tDbrHeader;
209 if (sameDB) {
210 tDbr = &qDbr;
211 tDbrHeader= &qDbrHeader;
212 } else {
213 tDbr = new IndexReader(par.db2, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode);
214 tDbrHeader = new IndexReader(par.db2, par.threads, IndexReader::SRC_HEADERS, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0);
215 }
216
217 bool queryNucs = Parameters::isEqualDbtype(qDbr.sequenceReader->getDbtype(), Parameters::DBTYPE_NUCLEOTIDES);
218 bool targetNucs = Parameters::isEqualDbtype(tDbr->sequenceReader->getDbtype(), Parameters::DBTYPE_NUCLEOTIDES);
219 if (needSequenceDB) {
220 // try to figure out if search was translated. This is can not be solved perfectly.
221 bool seqtargetAA = false;
222 if(Parameters::isEqualDbtype(tDbr->getDbtype(), Parameters::DBTYPE_INDEX_DB)){
223 IndexReader tseqDbr(par.db2, par.threads, IndexReader::SEQUENCES, 0, IndexReader::PRELOAD_INDEX);
224 seqtargetAA = Parameters::isEqualDbtype(tseqDbr.sequenceReader->getDbtype(), Parameters::DBTYPE_AMINO_ACIDS);
225 } else if(targetNucs == true && queryNucs == true && par.searchType == Parameters::SEARCH_TYPE_AUTO){
226 Debug(Debug::WARNING) << "It is unclear from the input if a translated or nucleotide search was performed\n "
227 "Please provide the parameter --search-type 2 (translated) or 3 (nucleotide)\n";
228 EXIT(EXIT_FAILURE);
229 } else if(par.searchType == Parameters::SEARCH_TYPE_TRANSLATED){
230 seqtargetAA = true;
231 }
232
233 if((targetNucs == true && queryNucs == false ) || (targetNucs == false && queryNucs == true ) || (targetNucs == true && seqtargetAA == true && queryNucs == true ) ){
234 isTranslatedSearch = true;
235 }
236 }
237
238 int gapOpen, gapExtend;
239 SubstitutionMatrix * subMat= NULL;
240 if (targetNucs == true && queryNucs == true && isTranslatedSearch == false) {
241 subMat = new NucleotideMatrix(par.scoringMatrixFile.nucleotides, 1.0, 0.0);
242 gapOpen = par.gapOpen.nucleotides;
243 gapExtend = par.gapExtend.nucleotides;
244 }else{
245 subMat = new SubstitutionMatrix(par.scoringMatrixFile.aminoacids, 2.0, 0.0);
246 gapOpen = par.gapOpen.aminoacids;
247 gapExtend = par.gapExtend.aminoacids;
248 }
249 EvalueComputation *evaluer = NULL;
250 bool queryProfile = false;
251 bool targetProfile = false;
252 if (needSequenceDB) {
253 queryProfile = Parameters::isEqualDbtype(qDbr.sequenceReader->getDbtype(), Parameters::DBTYPE_HMM_PROFILE);
254 targetProfile = Parameters::isEqualDbtype(tDbr->sequenceReader->getDbtype(), Parameters::DBTYPE_HMM_PROFILE);
255 evaluer = new EvalueComputation(tDbr->sequenceReader->getAminoAcidDBSize(), subMat, gapOpen, gapExtend);
256 }
257
258 DBReader<unsigned int> alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
259 alnDbr.open(DBReader<unsigned int>::LINEAR_ACCCESS);
260
261 unsigned int localThreads = 1;
262 #ifdef OPENMP
263 localThreads = std::min((unsigned int)par.threads, (unsigned int)alnDbr.getSize());
264 #endif
265
266 const bool shouldCompress = par.dbOut == true && par.compressed == true;
267 const int dbType = par.dbOut == true ? Parameters::DBTYPE_GENERIC_DB : Parameters::DBTYPE_OMIT_FILE;
268 DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), localThreads, shouldCompress, dbType);
269 resultWriter.open();
270
271 const bool isDb = par.dbOut;
272 TranslateNucl translateNucl(static_cast<TranslateNucl::GenCode>(par.translationTable));
273
274 if (format == Parameters::FORMAT_ALIGNMENT_SAM) {
275 char buffer[1024];
276 unsigned int lastKey = tDbr->sequenceReader->getLastKey();
277 bool *headerWritten = new bool[lastKey + 1];
278 memset(headerWritten, 0, sizeof(bool) * (lastKey + 1));
279 resultWriter.writeStart(0);
280 std::string header = "@HD\tVN:1.4\tSO:queryname\n";
281 resultWriter.writeAdd(header.c_str(), header.size(), 0);
282
283 for (size_t i = 0; i < alnDbr.getSize(); i++) {
284 char *data = alnDbr.getData(i, 0);
285 while (*data != '\0') {
286 char dbKeyBuffer[255 + 1];
287 Util::parseKey(data, dbKeyBuffer);
288 const unsigned int dbKey = (unsigned int) strtoul(dbKeyBuffer, NULL, 10);
289 if (headerWritten[dbKey] == false) {
290 headerWritten[dbKey] = true;
291 unsigned int tId = tDbr->sequenceReader->getId(dbKey);
292 unsigned int seqLen = tDbr->sequenceReader->getSeqLen(tId);
293 unsigned int tHeaderId = tDbrHeader->sequenceReader->getId(dbKey);
294 const char *tHeader = tDbrHeader->sequenceReader->getData(tHeaderId, 0);
295 std::string targetId = Util::parseFastaHeader(tHeader);
296 int count = snprintf(buffer, sizeof(buffer), "@SQ\tSN:%s\tLN:%d\n", targetId.c_str(),
297 (int32_t) seqLen);
298 if (count < 0 || static_cast<size_t>(count) >= sizeof(buffer)) {
299 Debug(Debug::WARNING) << "Truncated line in header " << i << "!\n";
300 continue;
301 }
302 resultWriter.writeAdd(buffer, count, 0);
303 }
304 resultWriter.writeEnd(0, 0, false, 0);
305 data = Util::skipLine(data);
306 }
307 }
308 delete[] headerWritten;
309 } else if (format == Parameters::FORMAT_ALIGNMENT_HTML) {
310 size_t dstSize = ZSTD_findDecompressedSize(result_viz_prelude_html_zst, result_viz_prelude_html_zst_len);
311 char* dst = (char*)malloc(sizeof(char) * dstSize);
312 size_t realSize = ZSTD_decompress(dst, dstSize, result_viz_prelude_html_zst, result_viz_prelude_html_zst_len);
313 resultWriter.writeData(dst, realSize, 0, 0, false, false);
314 const char* scriptBlock = "<script>render([";
315 resultWriter.writeData(scriptBlock, strlen(scriptBlock), 0, 0, false, false);
316 free(dst);
317 }
318
319 Debug::Progress progress(alnDbr.getSize());
320 #pragma omp parallel num_threads(localThreads)
321 {
322 unsigned int thread_idx = 0;
323 #ifdef OPENMP
324 thread_idx = static_cast<unsigned int>(omp_get_thread_num());
325 #endif
326 char buffer[1024];
327
328 std::string result;
329 result.reserve(1024*1024);
330
331 std::string queryProfData;
332 queryProfData.reserve(1024);
333
334 std::string queryBuffer;
335 queryBuffer.reserve(1024);
336
337 std::string queryHeaderBuffer;
338 queryHeaderBuffer.reserve(1024);
339
340 std::string targetProfData;
341 targetProfData.reserve(1024);
342
343 std::string newBacktrace;
344 newBacktrace.reserve(1024);
345
346 const TaxonNode * taxonNode = NULL;
347
348 #pragma omp for schedule(dynamic, 10)
349 for (size_t i = 0; i < alnDbr.getSize(); i++) {
350 progress.updateProgress();
351
352 const unsigned int queryKey = alnDbr.getDbKey(i);
353 char *querySeqData = NULL;
354 size_t querySeqLen = 0;
355 queryProfData.clear();
356 if (needSequenceDB) {
357 size_t qId = qDbr.sequenceReader->getId(queryKey);
358 querySeqData = qDbr.sequenceReader->getData(qId, thread_idx);
359 querySeqLen = qDbr.sequenceReader->getSeqLen(qId);
360 if(sameDB && qDbr.sequenceReader->isCompressed()){
361 queryBuffer.assign(querySeqData, querySeqLen);
362 querySeqData = (char*) queryBuffer.c_str();
363 }
364 if (queryProfile) {
365 Sequence::extractProfileConsensus(querySeqData, *subMat, queryProfData);
366 }
367 }
368
369 size_t qHeaderId = qDbrHeader.sequenceReader->getId(queryKey);
370 const char *qHeader = qDbrHeader.sequenceReader->getData(qHeaderId, thread_idx);
371 size_t qHeaderLen = qDbrHeader.sequenceReader->getSeqLen(qHeaderId);
372 std::string queryId = Util::parseFastaHeader(qHeader);
373 if (sameDB && needFullHeaders) {
374 queryHeaderBuffer.assign(qHeader, qHeaderLen);
375 qHeader = (char*) queryHeaderBuffer.c_str();
376 }
377
378 if (format == Parameters::FORMAT_ALIGNMENT_HTML) {
379 const char* jsStart = "{\"query\": {\"accession\": \"%s\",\"sequence\": \"";
380 int count = snprintf(buffer, sizeof(buffer), jsStart, queryId.c_str(), querySeqData);
381 if (count < 0 || static_cast<size_t>(count) >= sizeof(buffer)) {
382 Debug(Debug::WARNING) << "Truncated line in entry" << i << "!\n";
383 continue;
384 }
385 result.append(buffer, count);
386 if (queryProfile) {
387 result.append(queryProfData);
388 } else {
389 result.append(querySeqData, querySeqLen);
390 }
391 result.append("\"}, \"alignments\": [\n");
392 }
393
394 char *data = alnDbr.getData(i, thread_idx);
395 while (*data != '\0') {
396 Matcher::result_t res = Matcher::parseAlignmentRecord(data, true);
397 data = Util::skipLine(data);
398
399 if (res.backtrace.empty() && needBacktrace == true) {
400 Debug(Debug::ERROR) << "Backtrace cigar is missing in the alignment result. Please recompute the alignment with the -a flag.\n"
401 "Command: mmseqs align " << par.db1 << " " << par.db2 << " " << par.db3 << " " << "alnNew -a\n";
402 EXIT(EXIT_FAILURE);
403 }
404
405 size_t tHeaderId = tDbrHeader->sequenceReader->getId(res.dbKey);
406 const char *tHeader = tDbrHeader->sequenceReader->getData(tHeaderId, thread_idx);
407 size_t tHeaderLen = tDbrHeader->sequenceReader->getSeqLen(tHeaderId);
408 std::string targetId = Util::parseFastaHeader(tHeader);
409
410 unsigned int gapOpenCount = 0;
411 unsigned int alnLen = res.alnLength;
412 unsigned int missMatchCount = 0;
413 unsigned int identical = 0;
414 if (res.backtrace.empty() == false) {
415 size_t matchCount = 0;
416 alnLen = 0;
417 for (size_t pos = 0; pos < res.backtrace.size(); pos++) {
418 int cnt = 0;
419 if (isdigit(res.backtrace[pos])) {
420 cnt += Util::fast_atoi<int>(res.backtrace.c_str() + pos);
421 while (isdigit(res.backtrace[pos])) {
422 pos++;
423 }
424 }
425 alnLen += cnt;
426
427 switch (res.backtrace[pos]) {
428 case 'M':
429 matchCount += cnt;
430 break;
431 case 'D':
432 case 'I':
433 gapOpenCount += 1;
434 break;
435 }
436 }
437 // res.seqId = X / alnLen;
438 identical = static_cast<unsigned int>(res.seqId * static_cast<float>(alnLen) + 0.5);
439 //res.alnLength = alnLen;
440 missMatchCount = static_cast<unsigned int>( matchCount - identical);
441 } else {
442 const int adjustQstart = (res.qStartPos == -1) ? 0 : res.qStartPos;
443 const int adjustDBstart = (res.dbStartPos == -1) ? 0 : res.dbStartPos;
444 const float bestMatchEstimate = static_cast<float>(std::min(abs(res.qEndPos - adjustQstart), abs(res.dbEndPos - adjustDBstart)));
445 missMatchCount = static_cast<unsigned int>(bestMatchEstimate * (1.0f - res.seqId) + 0.5);
446 }
447
448 switch (format) {
449 case Parameters::FORMAT_ALIGNMENT_BLAST_TAB: {
450 if (outcodes.empty()) {
451 int count = snprintf(buffer, sizeof(buffer),
452 "%s\t%s\t%1.3f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%.2E\t%d\n",
453 queryId.c_str(), targetId.c_str(), res.seqId, alnLen,
454 missMatchCount, gapOpenCount,
455 res.qStartPos + 1, res.qEndPos + 1,
456 res.dbStartPos + 1, res.dbEndPos + 1,
457 res.eval, res.score);
458 if (count < 0 || static_cast<size_t>(count) >= sizeof(buffer)) {
459 Debug(Debug::WARNING) << "Truncated line in entry" << i << "!\n";
460 continue;
461 }
462 result.append(buffer, count);
463 } else {
464 char *targetSeqData = NULL;
465 targetProfData.clear();
466 unsigned int taxon = 0;
467
468 if(needTaxonomy || needTaxonomyMapping) {
469 std::pair<unsigned int, unsigned int> val;
470 val.first = res.dbKey;
471 std::vector<std::pair<unsigned int, unsigned int>>::iterator mappingIt;
472 mappingIt = std::upper_bound(mapping.begin(), mapping.end(), val, compareToFirstInt);
473 if (mappingIt == mapping.end() || mappingIt->first != val.first) {
474 taxon = 0;
475 taxonNode = NULL;
476 }else{
477 taxon = mappingIt->second;
478 if(needTaxonomy){
479 taxonNode = t->taxonNode(taxon, false);
480 }
481 }
482
483 }
484
485 if (needSequenceDB) {
486 size_t tId = tDbr->sequenceReader->getId(res.dbKey);
487 targetSeqData = tDbr->sequenceReader->getData(tId, thread_idx);
488 if (targetProfile) {
489 Sequence::extractProfileConsensus(targetSeqData, *subMat, targetProfData);
490 }
491 }
492 for(size_t i = 0; i < outcodes.size(); i++) {
493 switch (outcodes[i]) {
494 case Parameters::OUTFMT_QUERY:
495 result.append(queryId);
496 break;
497 case Parameters::OUTFMT_TARGET:
498 result.append(targetId);
499 break;
500 case Parameters::OUTFMT_EVALUE:
501 result.append(SSTR(res.eval));
502 break;
503 case Parameters::OUTFMT_GAPOPEN:
504 result.append(SSTR(gapOpenCount));
505 break;
506 case Parameters::OUTFMT_FIDENT:
507 result.append(SSTR(res.seqId));
508 break;
509 case Parameters::OUTFMT_PIDENT:
510 result.append(SSTR(res.seqId*100));
511 break;
512 case Parameters::OUTFMT_NIDENT:
513 result.append(SSTR(identical));
514 break;
515 case Parameters::OUTFMT_QSTART:
516 result.append(SSTR(res.qStartPos + 1));
517 break;
518 case Parameters::OUTFMT_QEND:
519 result.append(SSTR(res.qEndPos + 1));
520 break;
521 case Parameters::OUTFMT_QLEN:
522 result.append(SSTR(res.qLen));
523 break;
524 case Parameters::OUTFMT_TSTART:
525 result.append(SSTR(res.dbStartPos + 1));
526 break;
527 case Parameters::OUTFMT_TEND:
528 result.append(SSTR(res.dbEndPos + 1));
529 break;
530 case Parameters::OUTFMT_TLEN:
531 result.append(SSTR(res.dbLen));
532 break;
533 case Parameters::OUTFMT_ALNLEN:
534 result.append(SSTR(alnLen));
535 break;
536 case Parameters::OUTFMT_RAW:
537 result.append(SSTR(static_cast<int>(evaluer->computeRawScoreFromBitScore(res.score) + 0.5)));
538 break;
539 case Parameters::OUTFMT_BITS:
540 result.append(SSTR(res.score));
541 break;
542 case Parameters::OUTFMT_CIGAR:
543 if(isTranslatedSearch == true && targetNucs == true && queryNucs == true ){
544 Matcher::result_t::protein2nucl(res.backtrace, newBacktrace);
545 res.backtrace = newBacktrace;
546 }
547 result.append(SSTR(res.backtrace));
548 newBacktrace.clear();
549 break;
550 case Parameters::OUTFMT_QSEQ:
551 if (queryProfile) {
552 result.append(queryProfData.c_str(), res.qLen);
553 } else {
554 result.append(querySeqData, res.qLen);
555 }
556 break;
557 case Parameters::OUTFMT_TSEQ:
558 if (targetProfile) {
559 result.append(targetProfData.c_str(), res.dbLen);
560 } else {
561 result.append(targetSeqData, res.dbLen);
562 }
563 break;
564 case Parameters::OUTFMT_QHEADER:
565 result.append(qHeader, qHeaderLen);
566 break;
567 case Parameters::OUTFMT_THEADER:
568 result.append(tHeader, tHeaderLen);
569 break;
570 case Parameters::OUTFMT_QALN:
571 if (queryProfile) {
572 printSeqBasedOnAln(result, queryProfData.c_str(), res.qStartPos,
573 Matcher::uncompressAlignment(res.backtrace), false, (res.qStartPos > res.qEndPos),
574 (isTranslatedSearch == true && queryNucs == true), translateNucl);
575 } else {
576 printSeqBasedOnAln(result, querySeqData, res.qStartPos,
577 Matcher::uncompressAlignment(res.backtrace), false, (res.qStartPos > res.qEndPos),
578 (isTranslatedSearch == true && queryNucs == true), translateNucl);
579 }
580 break;
581 case Parameters::OUTFMT_TALN: {
582 if (targetProfile) {
583 printSeqBasedOnAln(result, targetProfData.c_str(), res.dbStartPos,
584 Matcher::uncompressAlignment(res.backtrace), true,
585 (res.dbStartPos > res.dbEndPos),
586 (isTranslatedSearch == true && targetNucs == true), translateNucl);
587 } else {
588 printSeqBasedOnAln(result, targetSeqData, res.dbStartPos,
589 Matcher::uncompressAlignment(res.backtrace), true,
590 (res.dbStartPos > res.dbEndPos),
591 (isTranslatedSearch == true && targetNucs == true), translateNucl);
592 }
593 break;
594 }
595 case Parameters::OUTFMT_MISMATCH:
596 result.append(SSTR(missMatchCount));
597 break;
598 case Parameters::OUTFMT_QCOV:
599 result.append(SSTR(res.qcov));
600 break;
601 case Parameters::OUTFMT_TCOV:
602 result.append(SSTR(res.dbcov));
603 break;
604 case Parameters::OUTFMT_QSET:
605 result.append(SSTR(qSetToSource[qKeyToSet[queryKey]]));
606 break;
607 case Parameters::OUTFMT_QSETID:
608 result.append(SSTR(qKeyToSet[queryKey]));
609 break;
610 case Parameters::OUTFMT_TSET:
611 result.append(SSTR(tSetToSource[tKeyToSet[res.dbKey]]));
612 break;
613 case Parameters::OUTFMT_TSETID:
614 result.append(SSTR(tKeyToSet[res.dbKey]));
615 break;
616 case Parameters::OUTFMT_TAXID:
617 result.append(SSTR(taxon));
618 break;
619 case Parameters::OUTFMT_TAXNAME:
620 result.append((taxonNode != NULL) ? t->getString(taxonNode->nameIdx) : "unclassified");
621 break;
622 case Parameters::OUTFMT_TAXLIN:
623 result.append((taxonNode != NULL) ? t->taxLineage(taxonNode, true) : "unclassified");
624 break;
625 case Parameters::OUTFMT_EMPTY:
626 result.push_back('-');
627 break;
628 case Parameters::OUTFMT_QORFSTART:
629 result.append(SSTR(res.queryOrfStartPos));
630 break;
631 case Parameters::OUTFMT_QORFEND:
632 result.append(SSTR(res.queryOrfEndPos));
633 break;
634 case Parameters::OUTFMT_TORFSTART:
635 result.append(SSTR(res.dbOrfStartPos));
636 break;
637 case Parameters::OUTFMT_TORFEND:
638 result.append(SSTR(res.dbOrfEndPos));
639 break;
640 }
641 if (i < outcodes.size() - 1) {
642 result.push_back('\t');
643 }
644 }
645 result.push_back('\n');
646 }
647 break;
648 }
649 case Parameters::FORMAT_ALIGNMENT_BLAST_WITH_LEN: {
650 int count = snprintf(buffer, sizeof(buffer),
651 "%s\t%s\t%1.3f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%.2E\t%d\t%d\t%d\n",
652 queryId.c_str(), targetId.c_str(), res.seqId, alnLen,
653 missMatchCount, gapOpenCount,
654 res.qStartPos + 1, res.qEndPos + 1,
655 res.dbStartPos + 1, res.dbEndPos + 1,
656 res.eval, res.score,
657 res.qLen, res.dbLen);
658
659 if (count < 0 || static_cast<size_t>(count) >= sizeof(buffer)) {
660 Debug(Debug::WARNING) << "Truncated line in entry" << i << "!\n";
661 continue;
662 }
663
664 result.append(buffer, count);
665 break;
666 }
667 case Parameters::FORMAT_ALIGNMENT_SAM: {
668 bool strand = res.qEndPos > res.qStartPos;
669 int rawScore = static_cast<int>(evaluer->computeRawScoreFromBitScore(res.score) + 0.5);
670 uint32_t mapq = -4.343 * log(exp(static_cast<double>(-rawScore)));
671 mapq = (uint32_t) (mapq + 4.99);
672 mapq = mapq < 254 ? mapq : 254;
673 int count = snprintf(buffer, sizeof(buffer), "%s\t%d\t%s\t%d\t%d\t", queryId.c_str(), (strand) ? 16: 0, targetId.c_str(), res.dbStartPos + 1, mapq);
674 if (count < 0 || static_cast<size_t>(count) >= sizeof(buffer)) {
675 Debug(Debug::WARNING) << "Truncated line in entry" << i << "!\n";
676 continue;
677 }
678 result.append(buffer, count);
679 if (isTranslatedSearch == true && targetNucs == true && queryNucs == true) {
680 Matcher::result_t::protein2nucl(res.backtrace, newBacktrace);
681 result.append(newBacktrace);
682 newBacktrace.clear();
683
684 } else {
685 result.append(res.backtrace);
686 }
687 result.append("\t*\t0\t0\t");
688 int start = std::min(res.qStartPos, res.qEndPos);
689 int end = std::max(res.qStartPos, res.qEndPos);
690 if (queryProfile) {
691 result.append(queryProfData.c_str() + start, (end + 1) - start);
692 } else {
693 result.append(querySeqData + start, (end + 1) - start);
694 }
695 count = snprintf(buffer, sizeof(buffer), "\t*\tAS:i:%d\tNM:i:%d\n", rawScore, missMatchCount);
696 if (count < 0 || static_cast<size_t>(count) >= sizeof(buffer)) {
697 Debug(Debug::WARNING) << "Truncated line in entry" << i << "!\n";
698 continue;
699 }
700 result.append(buffer, count);
701 break;
702 }
703 case Parameters::FORMAT_ALIGNMENT_HTML: {
704 const char* jsAln = "{\"target\": \"%s\", \"seqId\": %1.3f, \"alnLen\": %d, \"mismatch\": %d, \"gapopen\": %d, \"qStartPos\": %d, \"qEndPos\": %d, \"dbStartPos\": %d, \"dbEndPos\": %d, \"eval\": %.2E, \"score\": %d, \"qLen\": %d, \"dbLen\": %d, \"qAln\": \"";
705 int count = snprintf(buffer, sizeof(buffer), jsAln,
706 targetId.c_str(), res.seqId, alnLen,
707 missMatchCount, gapOpenCount,
708 res.qStartPos + 1, res.qEndPos + 1,
709 res.dbStartPos + 1, res.dbEndPos + 1,
710 res.eval, res.score,
711 res.qLen, res.dbLen);
712
713 if (count < 0 || static_cast<size_t>(count) >= sizeof(buffer)) {
714 Debug(Debug::WARNING) << "Truncated line in entry" << i << "!\n";
715 continue;
716 }
717 result.append(buffer, count);
718 if (queryProfile) {
719 printSeqBasedOnAln(result, queryProfData.c_str(), res.qStartPos,
720 Matcher::uncompressAlignment(res.backtrace), false, (res.qStartPos > res.qEndPos),
721 (isTranslatedSearch == true && queryNucs == true), translateNucl);
722 } else {
723 printSeqBasedOnAln(result, querySeqData, res.qStartPos,
724 Matcher::uncompressAlignment(res.backtrace), false, (res.qStartPos > res.qEndPos),
725 (isTranslatedSearch == true && queryNucs == true), translateNucl);
726 }
727 result.append("\", \"dbAln\": \"");
728 size_t tId = tDbr->sequenceReader->getId(res.dbKey);
729 char* targetSeqData = tDbr->sequenceReader->getData(tId, thread_idx);
730 if (targetProfile) {
731 Sequence::extractProfileConsensus(targetSeqData, *subMat, targetProfData);
732 printSeqBasedOnAln(result, targetProfData.c_str(), res.dbStartPos,
733 Matcher::uncompressAlignment(res.backtrace), true,
734 (res.dbStartPos > res.dbEndPos),
735 (isTranslatedSearch == true && targetNucs == true), translateNucl);
736 } else {
737 printSeqBasedOnAln(result, targetSeqData, res.dbStartPos,
738 Matcher::uncompressAlignment(res.backtrace), true,
739 (res.dbStartPos > res.dbEndPos),
740 (isTranslatedSearch == true && targetNucs == true), translateNucl);
741 }
742 result.append("\" },\n");
743 break;
744 }
745
746 // case Parameters::FORMAT_ALIGNMENT_GFF:{
747 // // for TBLASTX
748 // bool strand = res.qEndPos > res.qStartPos;
749 // int currStart = std::min(res.qStartPos, res.qEndPos);
750 // int currEnd = std::max(res.qStartPos, res.qEndPos);
751 // int currLen = currEnd - currStart;
752 // result.append(queryId);
753 // result.append("\tconserve\tprotein_match\t");
754 // result.append(SSTR(currStart+1));
755 // result.push_back('\t');
756 // result.append(SSTR(currEnd+1));
757 // result.push_back('\t');
758 // result.append(SSTR(currLen));
759 // result.push_back('\t');
760 // result.push_back((strand) ? '-' : '+');
761 // result.append("\t.\t");
762 // result.append("ID=");
763 // result.append(queryId);
764 // result.append(":hsp:");
765 // result.append(SSTR(counter));
766 // result.append(";");
767 // break;
768 // }
769 default:
770 Debug(Debug::ERROR) << "Not implemented yet";
771 EXIT(EXIT_FAILURE);
772 }
773 }
774
775 if (format == Parameters::FORMAT_ALIGNMENT_HTML) {
776 result.append("]},\n");
777 }
778 resultWriter.writeData(result.c_str(), result.size(), queryKey, thread_idx, isDb);
779 result.clear();
780 }
781 }
782 if (format == Parameters::FORMAT_ALIGNMENT_HTML) {
783 const char* endBlock = "]);</script>";
784 resultWriter.writeData(endBlock, strlen(endBlock), 0, localThreads - 1, false, false);
785 }
786 // tsv output
787 resultWriter.close(true);
788 if (isDb == false) {
789 FileUtil::remove(par.db4Index.c_str());
790 }
791 if(needTaxonomy){
792 delete t;
793 }
794 alnDbr.close();
795 if (sameDB == false) {
796 delete tDbr;
797 delete tDbrHeader;
798 }
799 if (needSequenceDB) {
800 delete evaluer;
801 }
802 delete subMat;
803
804 return EXIT_SUCCESS;
805 }
806
807