1 #include "Parameters.h"
2 #include "Util.h"
3 #include "DBWriter.h"
4 #include "CommandCaller.h"
5 #include "Debug.h"
6 #include "FileUtil.h"
7 
8 #include "cascaded_clustering.sh.h"
9 #include "nucleotide_clustering.sh.h"
10 #include "clustering.sh.h"
11 
12 #include <cassert>
13 
setWorkflowDefaults(Parameters * p)14 void setWorkflowDefaults(Parameters *p) {
15     p->spacedKmer = true;
16     p->covThr = 0.8;
17     p->evalThr = 0.001;
18     p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID;
19     p->maxResListLen = 20;
20 }
21 
setAutomaticThreshold(float seqId)22 float setAutomaticThreshold(float seqId) {
23     float sens;
24     if (seqId <= 0.3) {
25         sens = 6;
26     } else if (seqId > 0.8) {
27         sens = 1.0;
28     } else {
29         sens = 1.0 + (1.0 * (0.7 - seqId) * 10);
30     }
31     return sens;
32 }
33 
setAutomaticIterations(float sens)34 int setAutomaticIterations(float sens){
35     if (sens <= 2.0) {
36         return 1;
37     } else {
38         return 3;
39     }
40 }
41 
setNuclClusterDefaults(Parameters * p)42 void setNuclClusterDefaults(Parameters *p) {
43     // leave ungapped alignment untouched
44     if (p->alignmentMode != Parameters::ALIGNMENT_MODE_UNGAPPED) {
45         p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID;
46     }
47     //p->orfLongest = true;
48     p->exactKmerMatching = true;
49     if (p->PARAM_DIAGONAL_SCORING.wasSet == false) {
50         p->diagonalScoring = 0;
51     }
52     if (p->PARAM_STRAND.wasSet == false) {
53         p->strand = 2;
54     }
55     if (p->PARAM_K.wasSet == false) {
56         p->kmerSize = 15;
57     }
58     if (p->PARAM_MAX_SEQ_LEN.wasSet == false) {
59         p->maxSeqLen = 10000;
60     }
61 }
62 
setClusterAutomagicParameters(Parameters & par)63 void setClusterAutomagicParameters(Parameters& par) {
64     if (par.PARAM_NO_COMP_BIAS_CORR.wasSet == false && par.seqIdThr >= 0.7) {
65         par.compBiasCorrection = 0;
66         par.PARAM_NO_COMP_BIAS_CORR.wasSet = true;
67     }
68 
69     if (par.PARAM_MIN_DIAG_SCORE.wasSet == false && par.seqIdThr >= 0.7) {
70         par.minDiagScoreThr = 60;
71         par.PARAM_MIN_DIAG_SCORE.wasSet = true;
72     }
73 
74     if (par.PARAM_S.wasSet == false) {
75         par.sensitivity = setAutomaticThreshold(par.seqIdThr);
76         par.PARAM_S.wasSet = true;
77         Debug(Debug::INFO) << "Set cluster sensitivity to -s " << par.sensitivity << "\n";
78     }
79 
80     const bool nonsymetric = (par.covMode == Parameters::COV_MODE_TARGET || par.covMode == Parameters::COV_MODE_QUERY);
81     if (par.PARAM_CLUSTER_MODE.wasSet == false) {
82         if (nonsymetric) {
83             par.clusteringMode = Parameters::GREEDY_MEM;
84         } else {
85             par.clusteringMode = Parameters::SET_COVER;
86         }
87         par.PARAM_CLUSTER_MODE.wasSet = true;
88         Debug(Debug::INFO) << "Set cluster mode " << ((par.clusteringMode == Parameters::GREEDY_MEM) ? "GREEDY MEM" : "SET COVER") << "\n";
89     }
90     if (nonsymetric && par.clusteringMode != Parameters::GREEDY && par.clusteringMode != Parameters::GREEDY_MEM) {
91         Debug(Debug::WARNING) << "Combining cluster mode " << par.clusteringMode
92                               << " in combination with coverage mode " << par.covMode << " can produce wrong results.\n"
93                               << "Please use --cov-mode 2\n";
94     }
95     if (par.singleStepClustering == false && par.clusteringMode == Parameters::CONNECTED_COMPONENT) {
96         Debug(Debug::WARNING) << "Connected component clustering produces less clusters in a single step clustering.\n"
97                               << "Please use --single-step-cluster";
98     }
99     if (par.PARAM_CLUSTER_STEPS.wasSet == false) {
100         par.clusterSteps = setAutomaticIterations(par.sensitivity);
101         par.PARAM_CLUSTER_STEPS.wasSet = true;
102         Debug(Debug::INFO) << "Set cluster iterations to " << par.clusterSteps << "\n";
103     }
104 }
105 
clusteringworkflow(int argc,const char ** argv,const Command & command)106 int clusteringworkflow(int argc, const char **argv, const Command& command) {
107     Parameters &par = Parameters::getInstance();
108     setWorkflowDefaults(&par);
109     par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT);
110     par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
111     par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT);
112     par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT);
113     par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT);
114     par.PARAM_KMER_PER_SEQ.addCategory(MMseqsParameter::COMMAND_EXPERT);
115     par.PARAM_S.addCategory(MMseqsParameter::COMMAND_EXPERT);
116     par.PARAM_INCLUDE_ONLY_EXTENDABLE.addCategory(MMseqsParameter::COMMAND_EXPERT);
117     par.parseParameters(argc, argv, command, false, 0, 0);
118     const int dbType = FileUtil::parseDbType(par.db1.c_str());
119     bool isNucleotideDb = (Parameters::isEqualDbtype(dbType, Parameters::DBTYPE_NUCLEOTIDES));
120     if (isNucleotideDb) {
121         setNuclClusterDefaults(&par);
122     }
123     par.printParameters(command.cmd, argc, argv, *command.params);
124 
125     const bool isUngappedMode = par.alignmentMode == Parameters::ALIGNMENT_MODE_UNGAPPED;
126     if (isUngappedMode && Parameters::isEqualDbtype(dbType, Parameters::DBTYPE_HMM_PROFILE)) {
127         Debug(Debug::ERROR) << "Cannot use ungapped alignment mode with profile databases\n";
128         EXIT(EXIT_FAILURE);
129     }
130     setClusterAutomagicParameters(par);
131 
132     std::string tmpDir = par.db3;
133     std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, par.clusterworkflow));
134     if (par.reuseLatest) {
135         hash = FileUtil::getHashFromSymLink(tmpDir + "/latest");
136     }
137     tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash);
138     par.filenames.pop_back();
139     par.filenames.push_back(tmpDir);
140 
141     CommandCaller cmd;
142     cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL);
143     const int originalRescoreMode = par.rescoreMode;
144     par.rescoreMode = Parameters::RESCORE_MODE_ALIGNMENT;
145     cmd.addVariable("ALIGN_MODULE", isUngappedMode ? "rescorediagonal" : "align");
146     par.rescoreMode = originalRescoreMode;
147     cmd.addVariable("RUNNER", par.runner.c_str());
148     cmd.addVariable("MERGECLU_PAR", par.createParameterString(par.threadsandcompression).c_str());
149     cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str());
150 
151     if (isNucleotideDb) {
152         par.forwardFrames = "1";
153         par.reverseFrames = "1";
154         par.searchType = 3;
155         cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str());
156         int oldKmer = par.kmerSize;
157         par.kmerSize = 0;
158         cmd.addVariable("LINCLUST_PAR", par.createParameterString(par.linclustworkflow).c_str());
159         par.kmerSize = oldKmer;
160         if (par.PARAM_MAX_SEQS.wasSet == false) {
161             par.maxResListLen = 300;
162         }
163 
164         cmd.addVariable("PREFILTER_PAR", par.createParameterString(par.prefilter).c_str());
165         if (isUngappedMode) {
166             par.rescoreMode = Parameters::RESCORE_MODE_ALIGNMENT;
167             cmd.addVariable("ALIGNMENT_PAR", par.createParameterString(par.rescorediagonal).c_str());
168             par.rescoreMode = originalRescoreMode;
169         } else {
170             cmd.addVariable("ALIGNMENT_MODE_NOT_SET", "TRUE");
171             par.rescoreMode = Parameters::RESCORE_MODE_ALIGNMENT;
172             cmd.addVariable("RESCORE_ALN_PAR", par.createParameterString(par.rescorediagonal).c_str());
173             cmd.addVariable("THREADSANDCOMPRESS_PAR", par.createParameterString(par.threadsandcompression).c_str());
174             cmd.addVariable("ALIGNMENT_PAR", par.createParameterString(par.align).c_str());
175         }
176         cmd.addVariable("CLUSTER_PAR", par.createParameterString(par.clust).c_str());
177         cmd.addVariable("OFFSETALIGNMENT_PAR", par.createParameterString(par.offsetalignment).c_str());
178         std::string program = tmpDir + "/nucleotide_clustering.sh";
179         FileUtil::writeFile(program, nucleotide_clustering_sh, nucleotide_clustering_sh_len);
180         cmd.execProgram(program.c_str(), par.filenames);
181     } else if (par.singleStepClustering == false) {
182         // save some values to restore them later
183         float targetSensitivity = par.sensitivity;
184         MultiParam<int> alphabetSize = par.alphabetSize;
185         par.alphabetSize = MultiParam<int>(Parameters::CLUST_LINEAR_DEFAULT_ALPH_SIZE, 5);
186         int kmerSize = par.kmerSize;
187         par.kmerSize = Parameters::CLUST_LINEAR_DEFAULT_K;
188         int maskMode = par.maskMode;
189         par.maskMode = 0;
190         cmd.addVariable("LINCLUST_PAR", par.createParameterString(par.linclustworkflow).c_str());
191         par.alphabetSize = alphabetSize;
192         par.kmerSize = kmerSize;
193         par.maskMode = maskMode;
194         // 1 is lowest sens
195         par.sensitivity = ((par.clusterSteps - 1) == 0) ? par.sensitivity : 1;
196         int minDiagScoreThr = par.minDiagScoreThr;
197         par.minDiagScoreThr = 0;
198         par.diagonalScoring = 0;
199         par.compBiasCorrection = 0;
200         cmd.addVariable("PREFILTER0_PAR", par.createParameterString(par.prefilter).c_str());
201         if (isUngappedMode) {
202             par.rescoreMode = Parameters::RESCORE_MODE_ALIGNMENT;
203             cmd.addVariable("ALIGNMENT0_PAR", par.createParameterString(par.rescorediagonal).c_str());
204             par.rescoreMode = originalRescoreMode;
205         } else {
206             cmd.addVariable("ALIGNMENT0_PAR", par.createParameterString(par.align).c_str());
207         }
208         cmd.addVariable("CLUSTER0_PAR", par.createParameterString(par.clust).c_str());
209         par.diagonalScoring = 1;
210         par.compBiasCorrection = 1;
211         par.minDiagScoreThr = minDiagScoreThr;
212         float sensStepSize = (targetSensitivity - 1) / (static_cast<float>(par.clusterSteps) - 1);
213         for (int step = 1; step < par.clusterSteps; step++) {
214             par.sensitivity = 1.0 + sensStepSize * step;
215 
216             cmd.addVariable(std::string("PREFILTER" + SSTR(step) + "_PAR").c_str(), par.createParameterString(par.prefilter).c_str());
217             if (isUngappedMode) {
218                 par.rescoreMode = Parameters::RESCORE_MODE_ALIGNMENT;
219                 cmd.addVariable(std::string("ALIGNMENT" + SSTR(step) + "_PAR").c_str(), par.createParameterString(par.rescorediagonal).c_str());
220                 par.rescoreMode = originalRescoreMode;
221             } else {
222                 cmd.addVariable(std::string("ALIGNMENT" + SSTR(step) + "_PAR").c_str(), par.createParameterString(par.align).c_str());
223             }
224             cmd.addVariable(std::string("CLUSTER" + SSTR(step) + "_PAR").c_str(), par.createParameterString(par.clust).c_str());
225         }
226         cmd.addVariable("STEPS", SSTR(par.clusterSteps).c_str());
227         // correct for cascading clustering errors
228         if (par.clusterReassignment) {
229             cmd.addVariable("REASSIGN", "TRUE");
230         }
231         cmd.addVariable("THREADSANDCOMPRESS", par.createParameterString(par.threadsandcompression).c_str());
232         cmd.addVariable("VERBCOMPRESS", par.createParameterString(par.verbandcompression).c_str());
233         int swapedCovMode = Util::swapCoverageMode(par.covMode);
234         int tmpCovMode = par.covMode;
235         par.covMode = swapedCovMode;
236         cmd.addVariable("PREFILTER_REASSIGN_PAR", par.createParameterString(par.prefilter).c_str());
237         par.covMode = tmpCovMode;
238         cmd.addVariable("ALIGNMENT_REASSIGN_PAR", par.createParameterString(par.align).c_str());
239         cmd.addVariable("MERGEDBS_PAR", par.createParameterString(par.mergedbs).c_str());
240 
241         std::string program = tmpDir + "/cascaded_clustering.sh";
242         FileUtil::writeFile(program, cascaded_clustering_sh, cascaded_clustering_sh_len);
243         cmd.execProgram(program.c_str(), par.filenames);
244     } else {
245         // same as above, clusthash needs a smaller alphabetsize
246         MultiParam<int> alphabetSize = par.alphabetSize;
247         par.alphabetSize = MultiParam<int>(Parameters::CLUST_HASH_DEFAULT_ALPH_SIZE, 5);
248         float seqIdThr = par.seqIdThr;
249         par.seqIdThr = (float) Parameters::CLUST_HASH_DEFAULT_MIN_SEQ_ID / 100.0f;
250         cmd.addVariable("DETECTREDUNDANCY_PAR", par.createParameterString(par.clusthash).c_str());
251         par.alphabetSize = alphabetSize;
252         par.seqIdThr = seqIdThr;
253         cmd.addVariable("PREFILTER_PAR", par.createParameterString(par.prefilter).c_str());
254         if (isUngappedMode) {
255             cmd.addVariable("ALIGNMENT_PAR", par.createParameterString(par.rescorediagonal).c_str());
256         } else {
257             cmd.addVariable("ALIGNMENT_PAR", par.createParameterString(par.align).c_str());
258         }
259         cmd.addVariable("CLUSTER_PAR", par.createParameterString(par.clust).c_str());
260         std::string program = tmpDir + "/clustering.sh";
261         FileUtil::writeFile(program, clustering_sh, clustering_sh_len);
262         cmd.execProgram(program.c_str(), par.filenames);
263     }
264 
265     // Unreachable
266     assert(false);
267     return EXIT_FAILURE;
268 }
269