1 #include "NucleotideMatrix.h"
2 #include "SubstitutionMatrix.h"
3 #include "tantan.h"
4 #include "DBReader.h"
5 #include "DBWriter.h"
6 #include "Debug.h"
7 #include "Util.h"
8 #include "FileUtil.h"
9 
10 #ifdef OPENMP
11 #include <omp.h>
12 #endif
13 
masksequence(int argc,const char ** argv,const Command & command)14 int masksequence(int argc, const char **argv, const Command& command) {
15     Parameters &par = Parameters::getInstance();
16     par.parseParameters(argc, argv, command, true, 0, 0);
17 
18     DBReader<unsigned int> reader(par.db1.c_str(), par.db1Index.c_str(), par.threads,
19                                   DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_INDEX);
20     reader.open(DBReader<unsigned int>::NOSORT);
21 
22     BaseMatrix *subMat;
23     if (Parameters::isEqualDbtype(reader.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) {
24         subMat = new NucleotideMatrix(par.scoringMatrixFile.nucleotides, 1.0, 0.0);
25     } else {
26         // keep score bias at 0.0 (improved ROC)
27         subMat = new SubstitutionMatrix(par.scoringMatrixFile.aminoacids, 2.0, 0.0);
28     }
29     size_t maxSeqLen = 0;
30 
31     for (size_t i = 0; i < reader.getSize(); i++) {
32         maxSeqLen = std::max(reader.getSeqLen(i), maxSeqLen);
33     }
34     // need to prune low scoring k-mers through masking
35     ProbabilityMatrix probMatrix(*subMat);
36 
37     DBWriter writer(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, reader.getDbtype());
38     writer.open();
39 #pragma omp parallel
40     {
41 
42         unsigned int thread_idx = 0;
43 #ifdef OPENMP
44         thread_idx = (unsigned int) omp_get_thread_num();
45 #endif
46         char *charSequence = new char[maxSeqLen + 1];
47 
48 #pragma omp for schedule(dynamic, 1)
49         for (size_t id = 0; id < reader.getSize(); ++id) {
50             char *seqData = reader.getData(id, thread_idx);
51             unsigned int seqLen = 0;
52             while (seqData[seqLen] != '\0') {
53                 charSequence[seqLen] = (char) subMat->aa2num[static_cast<int>(seqData[seqLen])];
54                 seqLen++;
55             }
56             tantan::maskSequences(charSequence,
57                                   charSequence + seqLen,
58                                   50 /*options.maxCycleLength*/,
59                                   probMatrix.probMatrixPointers,
60                                   0.005 /*options.repeatProb*/,
61                                   0.05 /*options.repeatEndProb*/,
62                                   0.9 /*options.repeatOffsetProbDecay*/,
63                                   0, 0,
64                                   0.5 /*options.minMaskProb*/,
65                                   probMatrix.hardMaskTable);
66 
67             for (unsigned int pos = 0; pos < seqLen; pos++) {
68                 char aa = seqData[pos];
69                 charSequence[pos] = (charSequence[pos] == probMatrix.hardMaskTable[0]) ? tolower(aa) : toupper(aa);
70             }
71             writer.writeData(charSequence, seqLen, reader.getDbKey(id), thread_idx);
72         }
73         delete[] charSequence;
74     }
75     writer.close(true);
76     DBReader<unsigned int>::softlinkDb(par.db1, par.db2, DBFiles::SEQUENCE_ANCILLARY);
77     reader.close();
78 
79     delete subMat;
80     return EXIT_SUCCESS;
81 }
82