1 #include "NucleotideMatrix.h"
2 #include "SubstitutionMatrix.h"
3 #include "tantan.h"
4 #include "DBReader.h"
5 #include "DBWriter.h"
6 #include "Debug.h"
7 #include "Util.h"
8 #include "FileUtil.h"
9
10 #ifdef OPENMP
11 #include <omp.h>
12 #endif
13
masksequence(int argc,const char ** argv,const Command & command)14 int masksequence(int argc, const char **argv, const Command& command) {
15 Parameters &par = Parameters::getInstance();
16 par.parseParameters(argc, argv, command, true, 0, 0);
17
18 DBReader<unsigned int> reader(par.db1.c_str(), par.db1Index.c_str(), par.threads,
19 DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_INDEX);
20 reader.open(DBReader<unsigned int>::NOSORT);
21
22 BaseMatrix *subMat;
23 if (Parameters::isEqualDbtype(reader.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) {
24 subMat = new NucleotideMatrix(par.scoringMatrixFile.nucleotides, 1.0, 0.0);
25 } else {
26 // keep score bias at 0.0 (improved ROC)
27 subMat = new SubstitutionMatrix(par.scoringMatrixFile.aminoacids, 2.0, 0.0);
28 }
29 size_t maxSeqLen = 0;
30
31 for (size_t i = 0; i < reader.getSize(); i++) {
32 maxSeqLen = std::max(reader.getSeqLen(i), maxSeqLen);
33 }
34 // need to prune low scoring k-mers through masking
35 ProbabilityMatrix probMatrix(*subMat);
36
37 DBWriter writer(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, reader.getDbtype());
38 writer.open();
39 #pragma omp parallel
40 {
41
42 unsigned int thread_idx = 0;
43 #ifdef OPENMP
44 thread_idx = (unsigned int) omp_get_thread_num();
45 #endif
46 char *charSequence = new char[maxSeqLen + 1];
47
48 #pragma omp for schedule(dynamic, 1)
49 for (size_t id = 0; id < reader.getSize(); ++id) {
50 char *seqData = reader.getData(id, thread_idx);
51 unsigned int seqLen = 0;
52 while (seqData[seqLen] != '\0') {
53 charSequence[seqLen] = (char) subMat->aa2num[static_cast<int>(seqData[seqLen])];
54 seqLen++;
55 }
56 tantan::maskSequences(charSequence,
57 charSequence + seqLen,
58 50 /*options.maxCycleLength*/,
59 probMatrix.probMatrixPointers,
60 0.005 /*options.repeatProb*/,
61 0.05 /*options.repeatEndProb*/,
62 0.9 /*options.repeatOffsetProbDecay*/,
63 0, 0,
64 0.5 /*options.minMaskProb*/,
65 probMatrix.hardMaskTable);
66
67 for (unsigned int pos = 0; pos < seqLen; pos++) {
68 char aa = seqData[pos];
69 charSequence[pos] = (charSequence[pos] == probMatrix.hardMaskTable[0]) ? tolower(aa) : toupper(aa);
70 }
71 writer.writeData(charSequence, seqLen, reader.getDbKey(id), thread_idx);
72 }
73 delete[] charSequence;
74 }
75 writer.close(true);
76 DBReader<unsigned int>::softlinkDb(par.db1, par.db2, DBFiles::SEQUENCE_ANCILLARY);
77 reader.close();
78
79 delete subMat;
80 return EXIT_SUCCESS;
81 }
82