1 #include "Parameters.h"
2 #include "Util.h"
3 #include "DBWriter.h"
4 #include "CommandCaller.h"
5 #include "Debug.h"
6 #include "FileUtil.h"
7
8 #include "cascaded_clustering.sh.h"
9 #include "nucleotide_clustering.sh.h"
10 #include "clustering.sh.h"
11
12 #include <cassert>
13
setWorkflowDefaults(Parameters * p)14 void setWorkflowDefaults(Parameters *p) {
15 p->spacedKmer = true;
16 p->covThr = 0.8;
17 p->evalThr = 0.001;
18 p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID;
19 p->maxResListLen = 20;
20 }
21
setAutomaticThreshold(float seqId)22 float setAutomaticThreshold(float seqId) {
23 float sens;
24 if (seqId <= 0.3) {
25 sens = 6;
26 } else if (seqId > 0.8) {
27 sens = 1.0;
28 } else {
29 sens = 1.0 + (1.0 * (0.7 - seqId) * 10);
30 }
31 return sens;
32 }
33
setAutomaticIterations(float sens)34 int setAutomaticIterations(float sens){
35 if (sens <= 2.0) {
36 return 1;
37 } else {
38 return 3;
39 }
40 }
41
setNuclClusterDefaults(Parameters * p)42 void setNuclClusterDefaults(Parameters *p) {
43 // leave ungapped alignment untouched
44 if (p->alignmentMode != Parameters::ALIGNMENT_MODE_UNGAPPED) {
45 p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID;
46 }
47 //p->orfLongest = true;
48 p->exactKmerMatching = true;
49 if (p->PARAM_DIAGONAL_SCORING.wasSet == false) {
50 p->diagonalScoring = 0;
51 }
52 if (p->PARAM_STRAND.wasSet == false) {
53 p->strand = 2;
54 }
55 if (p->PARAM_K.wasSet == false) {
56 p->kmerSize = 15;
57 }
58 if (p->PARAM_MAX_SEQ_LEN.wasSet == false) {
59 p->maxSeqLen = 10000;
60 }
61 }
62
setClusterAutomagicParameters(Parameters & par)63 void setClusterAutomagicParameters(Parameters& par) {
64 if (par.PARAM_NO_COMP_BIAS_CORR.wasSet == false && par.seqIdThr >= 0.7) {
65 par.compBiasCorrection = 0;
66 par.PARAM_NO_COMP_BIAS_CORR.wasSet = true;
67 }
68
69 if (par.PARAM_MIN_DIAG_SCORE.wasSet == false && par.seqIdThr >= 0.7) {
70 par.minDiagScoreThr = 60;
71 par.PARAM_MIN_DIAG_SCORE.wasSet = true;
72 }
73
74 if (par.PARAM_S.wasSet == false) {
75 par.sensitivity = setAutomaticThreshold(par.seqIdThr);
76 par.PARAM_S.wasSet = true;
77 Debug(Debug::INFO) << "Set cluster sensitivity to -s " << par.sensitivity << "\n";
78 }
79
80 const bool nonsymetric = (par.covMode == Parameters::COV_MODE_TARGET || par.covMode == Parameters::COV_MODE_QUERY);
81 if (par.PARAM_CLUSTER_MODE.wasSet == false) {
82 if (nonsymetric) {
83 par.clusteringMode = Parameters::GREEDY_MEM;
84 } else {
85 par.clusteringMode = Parameters::SET_COVER;
86 }
87 par.PARAM_CLUSTER_MODE.wasSet = true;
88 Debug(Debug::INFO) << "Set cluster mode " << ((par.clusteringMode == Parameters::GREEDY_MEM) ? "GREEDY MEM" : "SET COVER") << "\n";
89 }
90 if (nonsymetric && par.clusteringMode != Parameters::GREEDY && par.clusteringMode != Parameters::GREEDY_MEM) {
91 Debug(Debug::WARNING) << "Combining cluster mode " << par.clusteringMode
92 << " in combination with coverage mode " << par.covMode << " can produce wrong results.\n"
93 << "Please use --cov-mode 2\n";
94 }
95 if (par.singleStepClustering == false && par.clusteringMode == Parameters::CONNECTED_COMPONENT) {
96 Debug(Debug::WARNING) << "Connected component clustering produces less clusters in a single step clustering.\n"
97 << "Please use --single-step-cluster";
98 }
99 if (par.PARAM_CLUSTER_STEPS.wasSet == false) {
100 par.clusterSteps = setAutomaticIterations(par.sensitivity);
101 par.PARAM_CLUSTER_STEPS.wasSet = true;
102 Debug(Debug::INFO) << "Set cluster iterations to " << par.clusterSteps << "\n";
103 }
104 }
105
clusteringworkflow(int argc,const char ** argv,const Command & command)106 int clusteringworkflow(int argc, const char **argv, const Command& command) {
107 Parameters &par = Parameters::getInstance();
108 setWorkflowDefaults(&par);
109 par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT);
110 par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
111 par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT);
112 par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT);
113 par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT);
114 par.PARAM_KMER_PER_SEQ.addCategory(MMseqsParameter::COMMAND_EXPERT);
115 par.PARAM_S.addCategory(MMseqsParameter::COMMAND_EXPERT);
116 par.PARAM_INCLUDE_ONLY_EXTENDABLE.addCategory(MMseqsParameter::COMMAND_EXPERT);
117 par.parseParameters(argc, argv, command, false, 0, 0);
118 const int dbType = FileUtil::parseDbType(par.db1.c_str());
119 bool isNucleotideDb = (Parameters::isEqualDbtype(dbType, Parameters::DBTYPE_NUCLEOTIDES));
120 if (isNucleotideDb) {
121 setNuclClusterDefaults(&par);
122 }
123 par.printParameters(command.cmd, argc, argv, *command.params);
124
125 const bool isUngappedMode = par.alignmentMode == Parameters::ALIGNMENT_MODE_UNGAPPED;
126 if (isUngappedMode && Parameters::isEqualDbtype(dbType, Parameters::DBTYPE_HMM_PROFILE)) {
127 Debug(Debug::ERROR) << "Cannot use ungapped alignment mode with profile databases\n";
128 EXIT(EXIT_FAILURE);
129 }
130 setClusterAutomagicParameters(par);
131
132 std::string tmpDir = par.db3;
133 std::string hash = SSTR(par.hashParameter(command.databases, par.filenames, par.clusterworkflow));
134 if (par.reuseLatest) {
135 hash = FileUtil::getHashFromSymLink(tmpDir + "/latest");
136 }
137 tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash);
138 par.filenames.pop_back();
139 par.filenames.push_back(tmpDir);
140
141 CommandCaller cmd;
142 cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL);
143 const int originalRescoreMode = par.rescoreMode;
144 par.rescoreMode = Parameters::RESCORE_MODE_ALIGNMENT;
145 cmd.addVariable("ALIGN_MODULE", isUngappedMode ? "rescorediagonal" : "align");
146 par.rescoreMode = originalRescoreMode;
147 cmd.addVariable("RUNNER", par.runner.c_str());
148 cmd.addVariable("MERGECLU_PAR", par.createParameterString(par.threadsandcompression).c_str());
149 cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str());
150
151 if (isNucleotideDb) {
152 par.forwardFrames = "1";
153 par.reverseFrames = "1";
154 par.searchType = 3;
155 cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str());
156 int oldKmer = par.kmerSize;
157 par.kmerSize = 0;
158 cmd.addVariable("LINCLUST_PAR", par.createParameterString(par.linclustworkflow).c_str());
159 par.kmerSize = oldKmer;
160 if (par.PARAM_MAX_SEQS.wasSet == false) {
161 par.maxResListLen = 300;
162 }
163
164 cmd.addVariable("PREFILTER_PAR", par.createParameterString(par.prefilter).c_str());
165 if (isUngappedMode) {
166 par.rescoreMode = Parameters::RESCORE_MODE_ALIGNMENT;
167 cmd.addVariable("ALIGNMENT_PAR", par.createParameterString(par.rescorediagonal).c_str());
168 par.rescoreMode = originalRescoreMode;
169 } else {
170 cmd.addVariable("ALIGNMENT_MODE_NOT_SET", "TRUE");
171 par.rescoreMode = Parameters::RESCORE_MODE_ALIGNMENT;
172 cmd.addVariable("RESCORE_ALN_PAR", par.createParameterString(par.rescorediagonal).c_str());
173 cmd.addVariable("THREADSANDCOMPRESS_PAR", par.createParameterString(par.threadsandcompression).c_str());
174 cmd.addVariable("ALIGNMENT_PAR", par.createParameterString(par.align).c_str());
175 }
176 cmd.addVariable("CLUSTER_PAR", par.createParameterString(par.clust).c_str());
177 cmd.addVariable("OFFSETALIGNMENT_PAR", par.createParameterString(par.offsetalignment).c_str());
178 std::string program = tmpDir + "/nucleotide_clustering.sh";
179 FileUtil::writeFile(program, nucleotide_clustering_sh, nucleotide_clustering_sh_len);
180 cmd.execProgram(program.c_str(), par.filenames);
181 } else if (par.singleStepClustering == false) {
182 // save some values to restore them later
183 float targetSensitivity = par.sensitivity;
184 MultiParam<int> alphabetSize = par.alphabetSize;
185 par.alphabetSize = MultiParam<int>(Parameters::CLUST_LINEAR_DEFAULT_ALPH_SIZE, 5);
186 int kmerSize = par.kmerSize;
187 par.kmerSize = Parameters::CLUST_LINEAR_DEFAULT_K;
188 int maskMode = par.maskMode;
189 par.maskMode = 0;
190 cmd.addVariable("LINCLUST_PAR", par.createParameterString(par.linclustworkflow).c_str());
191 par.alphabetSize = alphabetSize;
192 par.kmerSize = kmerSize;
193 par.maskMode = maskMode;
194 // 1 is lowest sens
195 par.sensitivity = ((par.clusterSteps - 1) == 0) ? par.sensitivity : 1;
196 int minDiagScoreThr = par.minDiagScoreThr;
197 par.minDiagScoreThr = 0;
198 par.diagonalScoring = 0;
199 par.compBiasCorrection = 0;
200 cmd.addVariable("PREFILTER0_PAR", par.createParameterString(par.prefilter).c_str());
201 if (isUngappedMode) {
202 par.rescoreMode = Parameters::RESCORE_MODE_ALIGNMENT;
203 cmd.addVariable("ALIGNMENT0_PAR", par.createParameterString(par.rescorediagonal).c_str());
204 par.rescoreMode = originalRescoreMode;
205 } else {
206 cmd.addVariable("ALIGNMENT0_PAR", par.createParameterString(par.align).c_str());
207 }
208 cmd.addVariable("CLUSTER0_PAR", par.createParameterString(par.clust).c_str());
209 par.diagonalScoring = 1;
210 par.compBiasCorrection = 1;
211 par.minDiagScoreThr = minDiagScoreThr;
212 float sensStepSize = (targetSensitivity - 1) / (static_cast<float>(par.clusterSteps) - 1);
213 for (int step = 1; step < par.clusterSteps; step++) {
214 par.sensitivity = 1.0 + sensStepSize * step;
215
216 cmd.addVariable(std::string("PREFILTER" + SSTR(step) + "_PAR").c_str(), par.createParameterString(par.prefilter).c_str());
217 if (isUngappedMode) {
218 par.rescoreMode = Parameters::RESCORE_MODE_ALIGNMENT;
219 cmd.addVariable(std::string("ALIGNMENT" + SSTR(step) + "_PAR").c_str(), par.createParameterString(par.rescorediagonal).c_str());
220 par.rescoreMode = originalRescoreMode;
221 } else {
222 cmd.addVariable(std::string("ALIGNMENT" + SSTR(step) + "_PAR").c_str(), par.createParameterString(par.align).c_str());
223 }
224 cmd.addVariable(std::string("CLUSTER" + SSTR(step) + "_PAR").c_str(), par.createParameterString(par.clust).c_str());
225 }
226 cmd.addVariable("STEPS", SSTR(par.clusterSteps).c_str());
227 // correct for cascading clustering errors
228 if (par.clusterReassignment) {
229 cmd.addVariable("REASSIGN", "TRUE");
230 }
231 cmd.addVariable("THREADSANDCOMPRESS", par.createParameterString(par.threadsandcompression).c_str());
232 cmd.addVariable("VERBCOMPRESS", par.createParameterString(par.verbandcompression).c_str());
233 int swapedCovMode = Util::swapCoverageMode(par.covMode);
234 int tmpCovMode = par.covMode;
235 par.covMode = swapedCovMode;
236 cmd.addVariable("PREFILTER_REASSIGN_PAR", par.createParameterString(par.prefilter).c_str());
237 par.covMode = tmpCovMode;
238 cmd.addVariable("ALIGNMENT_REASSIGN_PAR", par.createParameterString(par.align).c_str());
239 cmd.addVariable("MERGEDBS_PAR", par.createParameterString(par.mergedbs).c_str());
240
241 std::string program = tmpDir + "/cascaded_clustering.sh";
242 FileUtil::writeFile(program, cascaded_clustering_sh, cascaded_clustering_sh_len);
243 cmd.execProgram(program.c_str(), par.filenames);
244 } else {
245 // same as above, clusthash needs a smaller alphabetsize
246 MultiParam<int> alphabetSize = par.alphabetSize;
247 par.alphabetSize = MultiParam<int>(Parameters::CLUST_HASH_DEFAULT_ALPH_SIZE, 5);
248 float seqIdThr = par.seqIdThr;
249 par.seqIdThr = (float) Parameters::CLUST_HASH_DEFAULT_MIN_SEQ_ID / 100.0f;
250 cmd.addVariable("DETECTREDUNDANCY_PAR", par.createParameterString(par.clusthash).c_str());
251 par.alphabetSize = alphabetSize;
252 par.seqIdThr = seqIdThr;
253 cmd.addVariable("PREFILTER_PAR", par.createParameterString(par.prefilter).c_str());
254 if (isUngappedMode) {
255 cmd.addVariable("ALIGNMENT_PAR", par.createParameterString(par.rescorediagonal).c_str());
256 } else {
257 cmd.addVariable("ALIGNMENT_PAR", par.createParameterString(par.align).c_str());
258 }
259 cmd.addVariable("CLUSTER_PAR", par.createParameterString(par.clust).c_str());
260 std::string program = tmpDir + "/clustering.sh";
261 FileUtil::writeFile(program, clustering_sh, clustering_sh_len);
262 cmd.execProgram(program.c_str(), par.filenames);
263 }
264
265 // Unreachable
266 assert(false);
267 return EXIT_FAILURE;
268 }
269