1 /*
2 * HHblits.cpp
3 *
4 * Created on: Apr 1, 2014
5 * Author: meiermark
6 */
7
8 #include "hhblits.h"
9 #include "hhsuite_config.h"
10
HHblits(Parameters & par,std::vector<HHblitsDatabase * > & databases)11 HHblits::HHblits(Parameters& par, std::vector<HHblitsDatabase*>& databases) : par(par) {
12 dbs = databases;
13
14 context_lib = NULL;
15 crf = NULL;
16 pc_hhm_context_engine = NULL;
17 pc_hhm_context_mode = NULL;
18 pc_prefilter_context_engine = NULL;
19 pc_prefilter_context_mode = NULL;
20
21 Qali = NULL;
22 Qali_allseqs = NULL;
23
24 q = NULL;
25 q_tmp = NULL;
26
27 // Set (global variable) substitution matrix and derived matrices
28 SetSubstitutionMatrix(par.matrix, pb, P, R, S, Sim);
29
30 // Set secondary structure substitution matrix
31 if (par.ssm)
32 SetSecStrucSubstitutionMatrix(par.ssa, S73, S37, S33);
33
34 // Prepare pseudocounts
35 if (!par.nocontxt) {
36 InitializePseudocountsEngine(par, context_lib, crf, pc_hhm_context_engine, pc_hhm_context_mode, pc_prefilter_context_engine, pc_prefilter_context_mode);
37 }
38
39 // Prepare multi-threading - reserve memory for threads, intialize, etc.
40 viterbiMatrices = new ViterbiMatrix*[par.threads];
41 posteriorMatrices = new PosteriorMatrix*[par.threads];
42 for (int bin = 0; bin < par.threads; bin++) {
43 viterbiMatrices[bin] = new ViterbiMatrix();
44 posteriorMatrices[bin] = new PosteriorMatrix();
45 }
46 }
47
~HHblits()48 HHblits::~HHblits() {
49 Reset();
50
51 for (int bin = 0; bin < par.threads; bin++) {
52 delete viterbiMatrices[bin];
53 delete posteriorMatrices[bin];
54 }
55 delete[] viterbiMatrices;
56 delete[] posteriorMatrices;
57
58 DeletePseudocountsEngine(context_lib, crf, pc_hhm_context_engine, pc_hhm_context_mode, pc_prefilter_context_engine, pc_prefilter_context_mode);
59 }
60
prepareDatabases(Parameters & par,std::vector<HHblitsDatabase * > & databases)61 void HHblits::prepareDatabases(Parameters& par,
62 std::vector<HHblitsDatabase*>& databases) {
63 for (size_t i = 0; i < par.db_bases.size(); i++) {
64 HHblitsDatabase* db = new HHblitsDatabase(par.db_bases[i].c_str());
65 databases.push_back(db);
66 }
67
68 par.dbsize = 0;
69 for (size_t i = 0; i < databases.size(); i++) {
70 par.dbsize += databases[i]->cs219_database->db_index->n_entries;
71 }
72
73 if (par.prefilter) {
74 for (size_t i = 0; i < databases.size(); i++) {
75 databases[i]->initPrefilter(par.cs_library);
76 }
77 }
78 }
79
ProcessAllArguments(Parameters & par)80 void HHblits::ProcessAllArguments(Parameters& par) {
81 const int argc = par.argc;
82 const char** argv = par.argv;
83
84 par.Ndiff = 1000;
85 par.prefilter = true;
86
87 par.early_stopping_filter = true;
88 par.filter_thresh = 0.01;
89
90 ProcessArguments(par);
91
92 // Check needed files
93 if (!*par.infile || !strcmp(par.infile, "")) {
94 help(par);
95 HH_LOG(ERROR) << "Input file is missing! (see -i)" << std::endl;
96 exit(4);
97 }
98 if (par.db_bases.size() == 0) {
99 help(par);
100 HH_LOG(ERROR) << "Database is missing! (see -d)" << std::endl;
101 exit(4);
102 }
103
104 if (par.loc == 0 && par.num_rounds >= 2) {
105 HH_LOG(WARNING)
106 << "In " << __FILE__ << ":" << __LINE__ << ": " << __func__ << ":" << "\n"
107 << "\tusing -global alignment for iterative searches is deprecated "
108 "since non-homologous sequence segments can easily enter the "
109 "MSA and corrupt it."
110 << std::endl;
111 }
112
113 if (par.num_rounds < 1)
114 par.num_rounds = 1;
115 else if (par.num_rounds > 8) {
116 HH_LOG(WARNING) << "Number of iterations (" << par.num_rounds << ") too large => Set to 8 iterations\n";
117 par.num_rounds = 8;
118 }
119
120 // Check option compatibilities
121 if (par.nseqdis > MAXSEQDIS - 3 - par.showcons)
122 par.nseqdis = MAXSEQDIS - 3 - par.showcons; //3 reserved for secondary structure
123 if (par.aliwidth < 20)
124 par.aliwidth = 20;
125 if (par.pc_hhm_context_engine.pca < 0.001)
126 par.pc_hhm_context_engine.pca = 0.001; // to avoid log(0)
127 if (par.pc_prefilter_context_engine.pca < 0.001)
128 par.pc_prefilter_context_engine.pca = 0.001; // to avoid log(0)
129 if (par.b > par.B)
130 par.B = par.b;
131 if (par.z > par.Z)
132 par.Z = par.z;
133 if (par.maxmem < 1.0) {
134 HH_LOG(WARNING) << "Setting -maxmem to its minimum allowed value of 1.0" << std::endl;
135 par.maxmem = 1.0;
136 }
137 if (par.mact >= 1.0)
138 par.mact = 0.999;
139 else if (par.mact < 0)
140 par.mact = 0.0;
141 if (par.altali < 1)
142 par.altali = 1;
143 }
144
Reset()145 void HHblits::Reset() {
146 if (q) {
147 delete q;
148 q = NULL;
149 }
150
151 if (q_tmp) {
152 delete q_tmp;
153 q_tmp = NULL;
154 }
155
156 if (Qali) {
157 delete Qali;
158 Qali = NULL;
159 }
160
161 if (Qali_allseqs) {
162 delete Qali_allseqs;
163 Qali_allseqs = NULL;
164 }
165
166 hitlist.Reset();
167 while (!hitlist.End())
168 hitlist.Delete().Delete();
169 hitlist.Reset();
170
171 std::map<int, Alignment*>::iterator it;
172 for (it = alis.begin(); it != alis.end(); it++) {
173 delete (*it).second;
174 }
175 alis.clear();
176
177 // Prepare multi-threading - reserve memory for threads, intialize, etc.
178 for (int bin = 0; bin < par.threads; bin++) {
179 viterbiMatrices[bin]->DeleteBacktraceMatrix();
180 posteriorMatrices[bin]->DeleteProbabilityMatrix();
181 }
182 }
183
184 /////////////////////////////////////////////////////////////////////////////////////
185 // Help functions
186 /////////////////////////////////////////////////////////////////////////////////////
help(Parameters & par,char all)187 void HHblits::help(Parameters& par, char all) {
188 printf("HHblits %i.%i.%i:\nHMM-HMM-based lightning-fast iterative sequence search\n", HHSUITE_VERSION_MAJOR, HHSUITE_VERSION_MINOR, HHSUITE_VERSION_PATCH);
189 printf("HHblits is a sensitive, general-purpose, iterative sequence search tool that represents\n");
190 printf("both query and database sequences by HMMs. You can search HHblits databases starting\n");
191 printf("with a single query sequence, a multiple sequence alignment (MSA), or an HMM. HHblits\n");
192 printf("prints out a ranked list of database HMMs/MSAs and can also generate an MSA by merging\n");
193 printf("the significant database HMMs/MSAs onto the query MSA.\n");
194 printf("\n");
195 printf("%s", REFERENCE);
196 printf("%s", COPYRIGHT);
197 printf("\n");
198 printf("Usage: hhblits -i query [options] \n");
199 printf(" -i <file> input/query: single sequence or multiple sequence alignment (MSA)\n");
200 printf(" in a3m, a2m, or FASTA format, or HMM in hhm format\n");
201
202 if (all) {
203 printf("\n");
204 printf("<file> may be 'stdin' or 'stdout' throughout.\n");
205 }
206 printf("\n");
207
208 printf("Options: \n");
209 printf(" -d <name> database name (e.g. uniprot20_29Feb2012) \n");
210 printf(" Multiple databases may be specified with '-d <db1> -d <db2> ...'\n");
211 printf(" -n [1,8] number of iterations (default=%i) \n", par.num_rounds);
212 printf(" -e [0,1] E-value cutoff for inclusion in result alignment (def=%G) \n", par.e);
213 printf("\n");
214
215 printf("Input alignment format: \n");
216 printf(" -M a2m use A2M/A3M (default): upper case = Match; lower case = Insert;\n");
217 printf(" ' -' = Delete; '.' = gaps aligned to inserts (may be omitted) \n");
218 printf(" -M first use FASTA: columns with residue in 1st sequence are match states\n");
219 printf(" -M [0,100] use FASTA: columns with fewer than X%% gaps are match states \n");
220 printf(" -tags/-notags do NOT / do neutralize His-, C-myc-, FLAG-tags, and trypsin \n");
221 printf(" recognition sequence to background distribution (def=-notags) \n");
222 printf("\n");
223
224 printf("Output options: \n");
225 printf(" -o <file> write results in standard format to file (default=<infile.hhr>)\n");
226 printf(" -oa3m <file> write result MSA with significant matches in a3m format\n");
227 if (!all) {
228 printf(" Analogous for -opsi and -ohhm\n");
229 }
230 if (all) {
231 printf(" -opsi <file> write result MSA of significant matches in PSI-BLAST format\n");
232 printf(" -ohhm <file> write HHM file for result MSA of significant matches\n");
233 }
234 printf(" -oalis <name> write MSAs in A3M format after each iteration\n");
235 printf(" -blasttab <name> write result in tabular BLAST format (compatible to -m 8 or -outfmt 6 output)\n");
236 printf(" 1 2 3 4 5 6 8 9 10 11 12\n");
237 printf(" 'query target #match/tLen #mismatch #gapOpen qstart qend tstart tend eval score'\n");
238 printf(" -add_cons generate consensus sequence as master sequence of query MSA (default=don't)\n");
239 printf(" -hide_cons don't show consensus sequence in alignments (default=show) \n");
240 printf(" -hide_pred don't show predicted 2ndary structure in alignments (default=show)\n");
241 printf(" -hide_dssp don't show DSSP 2ndary structure in alignments (default=show) \n");
242 printf(" -show_ssconf show confidences for predicted 2ndary structure in alignments\n");
243 if (all) {
244 printf(" -Ofas <file> write pairwise alignments in FASTA xor A2M (-Oa2m) xor A3M (-Oa3m) format \n");
245 printf(" -seq <int> max. number of query/template sequences displayed (default=%i) \n", par.nseqdis);
246 printf(" -aliw <int> number of columns per line in alignment list (default=%i) \n", par.aliwidth);
247 printf(" -p [0,100] minimum probability in summary and alignment list (default=%G) \n", par.p);
248 printf(" -E [0,inf[ maximum E-value in summary and alignment list (default=%G) \n", par.E);
249 printf(" -Z <int> maximum number of lines in summary hit list (default=%i) \n", par.Z);
250 printf(" -z <int> minimum number of lines in summary hit list (default=%i) \n", par.z);
251 printf(" -B <int> maximum number of alignments in alignment list (default=%i) \n", par.B);
252 printf(" -b <int> minimum number of alignments in alignment list (default=%i) \n", par.b);
253 printf("\n");
254 }
255
256
257 if(all) {
258 printf("Prefilter options \n");
259 printf(" -noprefilt disable all filter steps \n");
260 printf(" -noaddfilter disable all filter steps (except for fast prefiltering) \n");
261 printf(" -maxfilt max number of hits allowed to pass 2nd prefilter (default=%i) \n", par.maxnumdb);
262 printf(" -min_prefilter_hits min number of hits to pass prefilter (default=%i) \n", par.min_prefilter_hits);
263 printf(" -prepre_smax_thresh min score threshold of ungapped prefilter (default=%i) \n", par.preprefilter_smax_thresh);
264 printf(" -pre_evalue_thresh max E-value threshold of Smith-Waterman prefilter score (default=%.1f)\n", par.prefilter_evalue_thresh);
265 printf(" -pre_bitfactor prefilter scores are in units of 1 bit / pre_bitfactor (default=%i)\n", par.prefilter_bit_factor);
266 printf(" -pre_gap_open gap open penalty in prefilter Smith-Waterman alignment (default=%i)\n", par.prefilter_gap_open);
267 printf(" -pre_gap_extend gap extend penalty in prefilter Smith-Waterman alignment (default=%i)\n", par.prefilter_gap_extend);
268 printf(" -pre_score_offset offset on sequence profile scores in prefilter S-W alignment (default=%i)\n", par.prefilter_score_offset);
269
270 printf("\n");
271 }
272
273 printf("Filter options applied to query MSA, database MSAs, and result MSA \n");
274 printf(" -all show all sequences in result MSA; do not filter result MSA \n");
275 printf(" -interim_filter NONE|FULL \n");
276 printf(" filter sequences of query MSA during merging to avoid early stop (default: FULL)\n");
277 printf(" NONE: disables the intermediate filter \n");
278 printf(" FULL: if an early stop occurs compare filter seqs in an all vs. all comparison\n");
279 printf(" -id [0,100] maximum pairwise sequence identity (def=%i)\n", par.max_seqid);
280 printf(" -diff [0,inf[ filter MSAs by selecting most diverse set of sequences, keeping \n");
281 printf(" at least this many seqs in each MSA block of length 50 \n");
282 printf(" Zero and non-numerical values turn off the filtering. (def=%i) \n", par.Ndiff);
283 printf(" -cov [0,100] minimum coverage with master sequence (%%) (def=%i) \n", par.coverage);
284 printf(" -qid [0,100] minimum sequence identity with master sequence (%%) (def=%i) \n", par.qid);
285 printf(" -qsc [0,100] minimum score per column with master sequence (default=%.1f) \n", par.qsc);
286 printf(" -neff [1,inf] target diversity of multiple sequence alignment (default=off) \n");
287 printf(" -mark do not filter out sequences marked by \">@\"in their name line \n");
288 printf("\n");
289
290 printf("HMM-HMM alignment options: \n");
291 printf(" -norealign do NOT realign displayed hits with MAC algorithm (def=realign) \n");
292 printf(" -realign_old_hits realign hits from previous iterations \n");
293 printf(" -mact [0,1[ posterior prob threshold for MAC realignment controlling greedi- \n");
294 printf(" ness at alignment ends: 0:global >0.1:local (default=%.2f) \n", par.mact);
295 printf(" -glob/-loc use global/local alignment mode for searching/ranking (def=local)\n");
296 if (all) {
297 printf(" -realign realign displayed hits with max. accuracy (MAC) algorithm \n");
298 printf(" -realign_max <int> realign max. <int> hits (default=%i) \n", par.realign_max);
299 printf(" -ovlp <int> banded alignment: forbid <ovlp> largest diagonals |i-j| of DP matrix (def=%i)\n", par.min_overlap);
300 printf(" -alt <int> show up to this many alternative alignments with raw score > smin(def=%i) \n", par.altali);
301 printf(" -premerge <int> merge <int> hits to query MSA before aligning remaining hits (def=%i)\n",par.premerge);
302 printf(" -smin <float> minimum raw score for alternative alignments (def=%.1f) \n", par.smin);
303 printf(" -shift [-1,1] profile-profile score offset (def=%-.2f) \n", par.shift);
304 printf(" -corr [0,1] weight of term for pair correlations (def=%.2f) \n", par.corr);
305 printf(" -sc <int> amino acid score (tja: template HMM at column j) (def=%i)\n", par.columnscore);
306 printf(" 0 = log2 Sum(tja*qia/pa) (pa: aa background frequencies) \n");
307 printf(" 1 = log2 Sum(tja*qia/pqa) (pqa = 1/2*(pa+ta) ) \n");
308 printf(" 2 = log2 Sum(tja*qia/ta) (ta: av. aa freqs in template) \n");
309 printf(" 3 = log2 Sum(tja*qia/qa) (qa: av. aa freqs in query) \n");
310 printf(" 5 local amino acid composition correction \n");
311 printf(" -ssm {0,..,4} 0: no ss scoring \n");
312 printf(" 1,2: ss scoring after or during alignment [default=%1i] \n", par.ssm);
313 printf(" 3,4: ss scoring after or during alignment, predicted vs. predicted\n");
314 printf(" -ssw [0,1] weight of ss score (def=%-.2f) \n", par.ssw);
315 printf(" -ssa [0,1] ss confusion matrix = (1-ssa)*I + ssa*psipred-confusion-matrix [def=%-.2f)\n", par.ssa);
316 printf(" -wg use global sequence weighting for realignment! \n");
317 printf("\n");
318
319 printf("Gap cost options: \n");
320 printf(" -gapb [0,inf[ Transition pseudocount admixture (def=%-.2f) \n", par.gapb);
321 printf(" -gapd [0,inf[ Transition pseudocount admixture for open gap (default=%-.2f) \n", par.gapd);
322 printf(" -gape [0,1.5] Transition pseudocount admixture for extend gap (def=%-.2f) \n", par.gape);
323 printf(" -gapf ]0,inf] factor to increase/reduce gap open penalty for deletes (def=%-.2f) \n", par.gapf);
324 printf(" -gapg ]0,inf] factor to increase/reduce gap open penalty for inserts (def=%-.2f) \n", par.gapg);
325 printf(" -gaph ]0,inf] factor to increase/reduce gap extend penalty for deletes(def=%-.2f)\n", par.gaph);
326 printf(" -gapi ]0,inf] factor to increase/reduce gap extend penalty for inserts(def=%-.2f)\n", par.gapi);
327 printf(" -egq [0,inf[ penalty (bits) for end gaps aligned to query residues (def=%-.2f) \n", par.egq);
328 printf(" -egt [0,inf[ penalty (bits) for end gaps aligned to template residues (def=%-.2f)\n", par.egt);
329 printf("\n");
330
331 printf("Pseudocount (pc) options: \n");
332 printf(" Context specific hhm pseudocounts:\n");
333 printf(" -pc_hhm_contxt_mode {0,..,3} position dependence of pc admixture 'tau' (pc mode, default=%-i) \n", par.pc_hhm_context_engine.admix);
334 printf(" 0: no pseudo counts: tau = 0 \n");
335 printf(" 1: constant tau = a \n");
336 printf(" 2: diversity-dependent: tau = a/(1+((Neff[i]-1)/b)^c) \n");
337 printf(" 3: CSBlast admixture: tau = a(1+b)/(Neff[i]+b) \n");
338 printf(" (Neff[i]: number of effective seqs in local MSA around column i) \n");
339 printf(" -pc_hhm_contxt_a [0,1] overall pseudocount admixture (def=%-.1f) \n", par.pc_hhm_context_engine.pca);
340 printf(" -pc_hhm_contxt_b [1,inf[ Neff threshold value for mode 2 (def=%-.1f) \n", par.pc_hhm_context_engine.pcb);
341 printf(" -pc_hhm_contxt_c [0,3] extinction exponent c for mode 2 (def=%-.1f) \n", par.pc_hhm_context_engine.pcc);
342 printf("\n");
343
344 printf(" Context independent hhm pseudocounts (used for templates; used for query if contxt file is not available):\n");
345 printf(" -pc_hhm_nocontxt_mode {0,..,3} position dependence of pc admixture 'tau' (pc mode, default=%-i) \n", par.pc_hhm_nocontext_mode);
346 printf(" 0: no pseudo counts: tau = 0 \n");
347 printf(" 1: constant tau = a \n");
348 printf(" 2: diversity-dependent: tau = a/(1+((Neff[i]-1)/b)^c) \n");
349 printf(" (Neff[i]: number of effective seqs in local MSA around column i) \n");
350 printf(" -pc_hhm_nocontxt_a [0,1] overall pseudocount admixture (def=%-.1f) \n", par.pc_hhm_nocontext_a);
351 printf(" -pc_hhm_nocontxt_b [1,inf[ Neff threshold value for mode 2 (def=%-.1f) \n", par.pc_hhm_nocontext_b);
352 printf(" -pc_hhm_nocontxt_c [0,3] extinction exponent c for mode 2 (def=%-.1f) \n", par.pc_hhm_nocontext_c);
353 printf("\n");
354
355 printf(" Context specific prefilter pseudocounts:\n");
356 printf(" -pc_prefilter_contxt_mode {0,..,3} position dependence of pc admixture 'tau' (pc mode, default=%-i) \n", par.pc_prefilter_context_engine.admix);
357 printf(" 0: no pseudo counts: tau = 0 \n");
358 printf(" 1: constant tau = a \n");
359 printf(" 2: diversity-dependent: tau = a/(1+((Neff[i]-1)/b)^c) \n");
360 printf(" 3: CSBlast admixture: tau = a(1+b)/(Neff[i]+b) \n");
361 printf(" (Neff[i]: number of effective seqs in local MSA around column i) \n");
362 printf(" -pc_prefilter_contxt_a [0,1] overall pseudocount admixture (def=%-.1f) \n", par.pc_prefilter_context_engine.pca);
363 printf(" -pc_prefilter_contxt_b [1,inf[ Neff threshold value for mode 2 (def=%-.1f) \n", par.pc_prefilter_context_engine.pcb);
364 printf(" -pc_prefilter_contxt_c [0,3] extinction exponent c for mode 2 (def=%-.1f) \n", par.pc_prefilter_context_engine.pcc);
365 printf("\n");
366
367 printf(" Context independent prefilter pseudocounts (used if context file is not available):\n");
368 printf(" -pc_prefilter_nocontxt_mode {0,..,3} position dependence of pc admixture 'tau' (pc mode, default=%-i) \n", par.pc_prefilter_nocontext_mode);
369 printf(" 0: no pseudo counts: tau = 0 \n");
370 printf(" 1: constant tau = a \n");
371 printf(" 2: diversity-dependent: tau = a/(1+((Neff[i]-1)/b)^c) \n");
372 printf(" (Neff[i]: number of effective seqs in local MSA around column i) \n");
373 printf(" -pc_prefilter_nocontxt_a [0,1] overall pseudocount admixture (def=%-.1f) \n", par.pc_prefilter_nocontext_a);
374 printf(" -pc_prefilter_nocontxt_b [1,inf[ Neff threshold value for mode 2 (def=%-.1f) \n", par.pc_prefilter_nocontext_b);
375 printf(" -pc_prefilter_nocontxt_c [0,3] extinction exponent c for mode 2 (def=%-.1f) \n", par.pc_prefilter_nocontext_c);
376 printf("\n");
377
378 printf(" Context-specific pseudo-counts: \n");
379 printf(" -nocontxt use substitution-matrix instead of context-specific pseudocounts \n");
380 printf(" -contxt <file> context file for computing context-specific pseudocounts (default=%s)\n", par.clusterfile.c_str());
381 printf(" -csw [0,inf] weight of central position in cs pseudocount mode (def=%.1f)\n", par.csw);
382 printf(" -csb [0,1] weight decay parameter for positions in cs pc mode (def=%.1f)\n", par.csb);
383 printf("\n");
384 }
385
386 printf("Other options: \n");
387 printf(" -v <int> verbose mode: 0:no screen output 1:only warings 2: verbose (def=%i)\n", par.v);
388 printf(" -neffmax ]1,20] skip further search iterations when diversity Neff of query MSA \n");
389 printf(" becomes larger than neffmax (default=%.1f)\n", par.neffmax);
390 printf(" -cpu <int> number of CPUs to use (for shared memory SMPs) (default=%i) \n", par.threads);
391 if (all) {
392 printf(" -scores <file> write scores for all pairwise comparisons to file \n");
393 printf(" -filter_matrices filter matrices for similarity to output at most 100 matrices\n");
394 printf(" -atab <file> write all alignments in tabular layout to file \n");
395 printf(" -maxseq <int> max number of input rows (def=%5i)\n", par.maxseq);
396 printf(" -maxres <int> max number of HMM columns (def=%5i)\n", par.maxres);
397 printf(" -maxmem [1,inf[ limit memory for realignment (in GB) (def=%.1f) \n", par.maxmem);
398 }
399 printf("\n");
400
401 if (!all) {
402 printf("An extended list of options can be obtained by calling 'hhblits -h all'\n");
403 printf("\n");
404 }
405 printf("Examples:\n");
406 printf("hhblits -i query.fas -o query.hhr -d ./uniclust30\n");
407 printf("\n");
408 printf("hhblits -i query.fas -o query.hhr -oa3m query.a3m -n 1 -d ./uniclust30\n");
409 printf("\n");
410 printf("Download databases from <http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/>.\n");
411 }
412
413
ProcessArguments(Parameters & par)414 void HHblits::ProcessArguments(Parameters& par) {
415 const int argc = par.argc;
416 const char** argv = par.argv;
417
418 //Processing command line input
419 for (int i = 1; i < argc; i++) {
420 HH_LOG(DEBUG1) << i << " " << argv[i] << std::endl;
421 if (!strcmp(argv[i], "-i")) {
422 if (++i >= argc || argv[i][0] == '-') {
423 help(par);
424 HH_LOG(ERROR) << "No query file following -i" << std::endl;
425 exit(4);
426 } else
427 strcpy(par.infile, argv[i]);
428 } else if (!strcmp(argv[i], "-d")) {
429 if (++i >= argc || argv[i][0] == '-') {
430 help(par);
431 HH_LOG(ERROR) << "No database basename following -d" << std::endl;
432 exit(4);
433 } else {
434 std::string db(argv[i]);
435 if (HHDatabase::checkDatabaseConflicts(argv[i])) {
436 HH_LOG(ERROR) << "Ambiguous database basename. Choose either a A3M or CA3M database." << std::endl;
437 exit(4);
438 }
439 par.db_bases.push_back(db);
440 }
441 } else if (!strcmp(argv[i], "-contxt")
442 || !strcmp(argv[i], "-context_data")) {
443 if (++i >= argc || argv[i][0] == '-') {
444 help(par);
445 HH_LOG(ERROR) << "No lib following -contxt" << std::endl;
446 exit(4);
447 } else
448 par.clusterfile = argv[i];
449 } else if (!strcmp(argv[i], "-cslib") || !strcmp(argv[i], "-cs_lib")) {
450 if (++i >= argc || argv[i][0] == '-') {
451 help(par);
452 HH_LOG(ERROR) << "No lib following -cslib" << std::endl;
453 exit(4);
454 } else
455 par.cs_library = argv[i];
456 } else if (!strcmp(argv[i], "-o")) {
457 if (++i >= argc || argv[i][0] == '-') {
458 help(par);
459 HH_LOG(ERROR) << "No output file following -o" << std::endl;
460 exit(4);
461 } else
462 strcpy(par.outfile, argv[i]);
463 }
464 //no help required
465 else if (!strcmp(argv[i], "-omat")) {
466 if (++i >= argc || argv[i][0] == '-') {
467 help(par);
468 HH_LOG(ERROR) << "No output file following -omat" << std::endl;
469 exit(4);
470 } else
471 strcpy(par.matrices_output_file, argv[i]);
472 } else if (!strcmp(argv[i], "-oa3m")) {
473 if (++i >= argc || argv[i][0] == '-') {
474 help(par);
475 HH_LOG(ERROR) << "No output file following -oa3m" << std::endl;
476 exit(4);
477 } else
478 strcpy(par.alnfile, argv[i]);
479 } else if (!strcmp(argv[i], "-filter_matrices")) {
480 par.filter_matrices = true;
481 } else if (!strcmp(argv[i], "-ohhm")) {
482 if (++i >= argc || argv[i][0] == '-') {
483 help(par);
484 HH_LOG(ERROR) << "No output file following -ohhm" << std::endl;
485 exit(4);
486 } else
487 strcpy(par.hhmfile, argv[i]);
488 } else if (!strcmp(argv[i], "-opsi")) {
489 if (++i >= argc || argv[i][0] == '-') {
490 help(par);
491 HH_LOG(ERROR) << "No output file following -opsi" << std::endl;
492 exit(4);
493 } else
494 strcpy(par.psifile, argv[i]);
495 } else if (!strcmp(argv[i], "-oalis")) {
496 if (++i >= argc || argv[i][0] == '-') {
497 help(par);
498 HH_LOG(ERROR) << "No file basename following -oalis" << std::endl;
499 exit(4);
500 } else
501 strcpy(par.alisbasename, argv[i]);
502 }
503 else if (!strcmp(argv[i], "-Ofas")) {
504 par.append = 0;
505 par.outformat = 1;
506 if (++i >= argc || argv[i][0] == '-') {
507 help(par);
508 HH_LOG(ERROR) << "No output file following -o" << std::endl;
509 exit(4);
510 } else
511 strcpy(par.pairwisealisfile, argv[i]);
512 } else if (!strcmp(argv[i], "-Oa2m")) {
513 par.append = 0;
514 par.outformat = 2;
515 if (++i >= argc || argv[i][0] == '-') {
516 help(par);
517 HH_LOG(ERROR) << "No output file following -o" << std::endl;
518 exit(4);
519 } else
520 strcpy(par.pairwisealisfile, argv[i]);
521 } else if (!strcmp(argv[i], "-Oa3m")) {
522 par.append = 0;
523 par.outformat = 3;
524 if (++i >= argc || argv[i][0] == '-') {
525 help(par);
526 HH_LOG(ERROR) << "No output file following -o" << std::endl;
527 exit(4);
528 } else
529 strcpy(par.pairwisealisfile, argv[i]);
530 }
531 else if (!strcmp(argv[i], "-scores")) {
532 if (++i >= argc || argv[i][0] == '-') {
533 help(par);
534 HH_LOG(ERROR) << "No file following -scores" << std::endl;
535 exit(4);
536 } else {
537 strcpy(par.scorefile, argv[i]);
538 }
539 }
540 else if (!strcmp(argv[i], "-blasttab")) {
541 if (++i >= argc || argv[i][0] == '-') {
542 help(par);
543 HH_LOG(ERROR) << "No file following -blasttab" << std::endl;
544 exit(4);
545 } else {
546 strcpy(par.m8file, argv[i]);
547 }
548 }
549 else if (!strcmp(argv[i], "-atab")) {
550 if (++i >= argc || argv[i][0] == '-') {
551 help(par);
552 HH_LOG(ERROR) << "No file following -atab" << std::endl;
553 exit(4);
554 } else {
555 strcpy(par.alitabfile, argv[i]);
556 }
557 }
558 else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "-help")) {
559 help(par, 1);
560 exit(0);
561 } else if (!strcmp(argv[i], "-v") && (i < argc - 1)
562 && argv[i + 1][0] != '-') {
563 int v = atoi(argv[++i]);
564 par.v = Log::from_int(v);
565 Log::reporting_level() = par.v;
566 } else if (!strcmp(argv[i], "-n") && (i < argc - 1))
567 par.num_rounds = atoi(argv[++i]);
568 //no help required
569 else if (!strncmp(argv[i], "-BLOSUM", 7)
570 || !strncmp(argv[i], "-Blosum", 7)) {
571 if (!strcmp(argv[i] + 7, "30"))
572 par.matrix = 30;
573 else if (!strcmp(argv[i] + 7, "40"))
574 par.matrix = 40;
575 else if (!strcmp(argv[i] + 7, "50"))
576 par.matrix = 50;
577 else if (!strcmp(argv[i] + 7, "62"))
578 par.matrix = 62;
579 else if (!strcmp(argv[i] + 7, "65"))
580 par.matrix = 65;
581 else if (!strcmp(argv[i] + 7, "80"))
582 par.matrix = 80;
583 else
584 HH_LOG(WARNING) << "Ignoring unknown option " << argv[i] << std::endl;
585 } else if (!strcmp(argv[i], "-M") && (i < argc - 1)) {
586 if (!strcmp(argv[++i], "a2m") || !strcmp(argv[i], "a3m"))
587 par.M = 1;
588 else if (!strcmp(argv[i], "first"))
589 par.M = 3;
590 else if (argv[i][0] >= '0' && argv[i][0] <= '9') {
591 par.Mgaps = atoi(argv[i]);
592 par.M = 2;
593 } else
594 HH_LOG(WARNING) << "Ignoring unknown argument: -M " << argv[i] << std::endl;
595 }
596 else if (!strcmp(argv[i], "-p") && (i < argc - 1))
597 par.p = atof(argv[++i]);
598 else if (!strcmp(argv[i], "-E") && (i < argc - 1))
599 par.E = atof(argv[++i]);
600 else if (!strcmp(argv[i], "-b") && (i < argc - 1))
601 par.b = atoi(argv[++i]);
602 else if (!strcmp(argv[i], "-B") && (i < argc - 1))
603 par.B = atoi(argv[++i]);
604 else if (!strcmp(argv[i], "-z") && (i < argc - 1))
605 par.z = atoi(argv[++i]);
606 else if (!strcmp(argv[i], "-Z") && (i < argc - 1))
607 par.Z = atoi(argv[++i]);
608 else if (!strncmp(argv[i], "-hide_cons", 7))
609 par.showcons = 0;
610 else if (!strncmp(argv[i], "-hide_pred", 7))
611 par.showpred = 0;
612 else if (!strncmp(argv[i], "-hide_dssp", 7))
613 par.showdssp = 0;
614 else if (!strncmp(argv[i], "-show_ssconf", 7))
615 par.showconf = 1;
616 else if (!strncmp(argv[i], "-mark", 7))
617 par.mark = 1;
618 else if (!strncmp(argv[i], "-add_cons", 5))
619 par.cons = 1;
620 else if (!strcmp(argv[i], "-realign_max") && (i < argc - 1))
621 par.realign_max = atoi(argv[++i]);
622 else if (!strcmp(argv[i], "-e") && (i < argc - 1))
623 par.e = atof(argv[++i]);
624 else if (!strcmp(argv[i], "-seq") && (i < argc - 1))
625 par.nseqdis = atoi(argv[++i]);
626 else if (!strcmp(argv[i], "-aliw") && (i < argc - 1))
627 par.aliwidth = atoi(argv[++i]);
628 else if (!strcmp(argv[i], "-id") && (i < argc - 1))
629 par.max_seqid = atoi(argv[++i]);
630 else if (!strcmp(argv[i], "-qid") && (i < argc - 1))
631 par.qid = atoi(argv[++i]);
632 else if (!strcmp(argv[i], "-qsc") && (i < argc - 1))
633 par.qsc = atof(argv[++i]);
634 else if (!strcmp(argv[i], "-cov") && (i < argc - 1))
635 par.coverage = atoi(argv[++i]);
636 else if (!strcmp(argv[i], "-diff") && (i < argc - 1))
637 //Will be zero if "Inf" or other non-numeric values are passed to atoi
638 par.Ndiff = atoi(argv[++i]);
639 else if (!strcmp(argv[i], "-all") || !strcmp(argv[i], "-nodiff")) {
640 par.allseqs = true;
641 } else if (!strcmp(argv[i], "-neffmax") && (i < argc - 1))
642 par.neffmax = atof(argv[++i]);
643 else if ((!strcmp(argv[i], "-neff") || !strcmp(argv[i], "-Neff"))
644 && (i < argc - 1))
645 par.Neff = atof(argv[++i]);
646 //pc hhm context variables
647 else if (!strcmp(argv[i], "-pc_hhm_contxt_mode") && (i < argc - 1))
648 par.pc_hhm_context_engine.admix = (Pseudocounts::Admix) atoi(argv[++i]);
649 else if (!strcmp(argv[i], "-pc_hhm_contxt_a") && (i < argc - 1))
650 par.pc_hhm_context_engine.pca = atof(argv[++i]);
651 else if (!strcmp(argv[i], "-pc_hhm_contxt_b") && (i < argc - 1))
652 par.pc_hhm_context_engine.pcb = atof(argv[++i]);
653 else if (!strcmp(argv[i], "-pc_hhm_contxt_c") && (i < argc - 1))
654 par.pc_hhm_context_engine.pcc = atof(argv[++i]);
655 //pc prefilter context variables
656 else if (!strcmp(argv[i], "-pc_prefilter_contxt_mode") && (i < argc - 1))
657 par.pc_prefilter_context_engine.admix = (Pseudocounts::Admix) atoi(
658 argv[++i]);
659 else if (!strcmp(argv[i], "-pc_prefilter_contxt_a") && (i < argc - 1))
660 par.pc_prefilter_context_engine.pca = atof(argv[++i]);
661 else if (!strcmp(argv[i], "-pc_prefilter_contxt_b") && (i < argc - 1))
662 par.pc_prefilter_context_engine.pcb = atof(argv[++i]);
663 else if (!strcmp(argv[i], "-pc_prefilter_contxt_c") && (i < argc - 1))
664 par.pc_prefilter_context_engine.pcc = atof(argv[++i]);
665 //pc hhm nocontext variables
666 else if (!strcmp(argv[i], "-pc_hhm_nocontxt_mode") && (i < argc - 1))
667 par.pc_hhm_nocontext_mode = atoi(argv[++i]);
668 else if (!strcmp(argv[i], "-pc_hhm_nocontxt_a") && (i < argc - 1))
669 par.pc_hhm_nocontext_a = atof(argv[++i]);
670 else if (!strcmp(argv[i], "-pc_hhm_nocontxt_b") && (i < argc - 1))
671 par.pc_hhm_nocontext_b = atof(argv[++i]);
672 else if (!strcmp(argv[i], "-pc_hhm_nocontxt_c") && (i < argc - 1))
673 par.pc_hhm_nocontext_c = atof(argv[++i]);
674 //pc prefilter nocontext variables
675 else if (!strcmp(argv[i], "-pc_prefilter_nocontxt_mode") && (i < argc - 1))
676 par.pc_prefilter_nocontext_mode = atoi(argv[++i]);
677 else if (!strcmp(argv[i], "-pc_prefilter_nocontxt_a") && (i < argc - 1))
678 par.pc_hhm_nocontext_a = atof(argv[++i]);
679 else if (!strcmp(argv[i], "-pc_prefilter_nocontxt_b") && (i < argc - 1))
680 par.pc_hhm_nocontext_b = atof(argv[++i]);
681 else if (!strcmp(argv[i], "-pc_prefilter_nocontxt_c") && (i < argc - 1))
682 par.pc_hhm_nocontext_c = atof(argv[++i]);
683 else if (!strcmp(argv[i], "-gapb") && (i < argc - 1)) {
684 par.gapb = atof(argv[++i]);
685 if (par.gapb <= 0.01)
686 par.gapb = 0.01;
687 } else if (!strcmp(argv[i], "-gapd") && (i < argc - 1))
688 par.gapd = atof(argv[++i]);
689 else if (!strcmp(argv[i], "-gape") && (i < argc - 1))
690 par.gape = atof(argv[++i]);
691 else if (!strcmp(argv[i], "-gapf") && (i < argc - 1))
692 par.gapf = atof(argv[++i]);
693 else if (!strcmp(argv[i], "-gapg") && (i < argc - 1))
694 par.gapg = atof(argv[++i]);
695 else if (!strcmp(argv[i], "-gaph") && (i < argc - 1))
696 par.gaph = atof(argv[++i]);
697 else if (!strcmp(argv[i], "-gapi") && (i < argc - 1))
698 par.gapi = atof(argv[++i]);
699 else if (!strcmp(argv[i], "-egq") && (i < argc - 1))
700 par.egq = atof(argv[++i]);
701 else if (!strcmp(argv[i], "-egt") && (i < argc - 1))
702 par.egt = atof(argv[++i]);
703 //no help required
704 else if (!strcmp(argv[i], "-alphaa") && (i < argc - 1))
705 par.alphaa = atof(argv[++i]);
706 //no help required
707 else if (!strcmp(argv[i], "-alphab") && (i < argc - 1))
708 par.alphab = atof(argv[++i]);
709 //no help required
710 else if (!strcmp(argv[i], "-alphac") && (i < argc - 1))
711 par.alphac = atof(argv[++i]);
712 else if (!strcmp(argv[i], "-noprefilt")) {
713 par.prefilter = false;
714 par.already_seen_filter = false;
715 } else if (!strcmp(argv[i], "-noaddfilter")) {
716 par.already_seen_filter = false;
717 } else if (!strcmp(argv[i], "-min_prefilter_hits") && (i < argc - 1))
718 par.min_prefilter_hits = atoi(argv[++i]);
719 else if (!strcmp(argv[i], "-prepre_smax_thresh") && (i < argc - 1))
720 par.preprefilter_smax_thresh = atoi(argv[++i]);
721 else if (!strcmp(argv[i], "-pre_evalue_thresh") && (i < argc - 1))
722 par.prefilter_evalue_thresh = atof(argv[++i]);
723 else if (!strcmp(argv[i], "-pre_bitfactor") && (i < argc - 1))
724 par.prefilter_bit_factor = atoi(argv[++i]);
725 else if (!strcmp(argv[i], "-pre_gap_open") && (i < argc - 1))
726 par.prefilter_gap_open = atoi(argv[++i]);
727 else if (!strcmp(argv[i], "-pre_gap_extend") && (i < argc - 1))
728 par.prefilter_gap_extend = atoi(argv[++i]);
729 else if (!strcmp(argv[i], "-pre_score_offset") && (i < argc - 1))
730 par.prefilter_score_offset = atoi(argv[++i]);
731 else if (!strcmp(argv[i], "-realign_old_hits"))
732 par.realign_old_hits = true;
733 else if (!strcmp(argv[i], "-realign"))
734 par.realign = 1;
735 else if (!strcmp(argv[i], "-norealign"))
736 par.realign = 0;
737 else if (!strcmp(argv[i], "-ssm") && (i < argc - 1))
738 par.ssm = atoi(argv[++i]);
739 else if (!strcmp(argv[i], "-ssw") && (i < argc - 1))
740 par.ssw = atof(argv[++i]);
741 else if (!strcmp(argv[i], "-ssa") && (i < argc - 1))
742 par.ssa = atof(argv[++i]);
743 else if (!strcmp(argv[i], "-wg"))
744 par.wg = 1;
745 else if (!strcmp(argv[i], "-maxseq") && (i < argc - 1))
746 par.maxseq = atoi(argv[++i]);
747 else if (!strcmp(argv[i], "-maxres") && (i < argc - 1)) {
748 par.maxres = atoi(argv[++i]);
749 par.maxcol = 2 * par.maxres;
750 } else if (!strncmp(argv[i], "-glob", 5)) {
751 par.loc = 0;
752 if (par.mact > 0.35 && par.mact < 0.3502) {
753 par.mact = 0;
754 }
755 } else if (!strncmp(argv[i], "-loc", 4))
756 par.loc = 1;
757 else if (!strncmp(argv[i], "-alt", 4) && (i < argc - 1))
758 par.altali = atoi(argv[++i]);
759 else if (!strncmp(argv[i], "-smin", 4) && (i < argc - 1))
760 par.smin = atof(argv[++i]);
761 else if (!strcmp(argv[i], "-shift") && (i < argc - 1))
762 par.shift = atof(argv[++i]);
763 else if ((!strcmp(argv[i], "-mact") || !strcmp(argv[i], "-mapt"))
764 && (i < argc - 1))
765 par.mact = atof(argv[++i]);
766 else if (!strcmp(argv[i], "-sc") && (i < argc - 1))
767 par.columnscore = atoi(argv[++i]);
768 //no help required
769 else if (!strcmp(argv[i], "-scwin") && (i < argc - 1)) {
770 par.columnscore = 5;
771 par.half_window_size_local_aa_bg_freqs = std::max(1, atoi(argv[++i]));
772 } else if (!strncmp(argv[i], "-cpu", 4) && (i < argc - 1)) {
773 par.threads = atoi(argv[++i]);
774 } else if (!strcmp(argv[i], "-maxmem") && (i < argc - 1)) {
775 par.maxmem = atof(argv[++i]);
776 }
777 else if (!strncmp(argv[i],"-premerge",9) && (i<argc-1)) {
778 par.premerge=atoi(argv[++i]);
779 }
780 else if (!strcmp(argv[i], "-nocontxt"))
781 par.nocontxt = 1;
782 else if (!strcmp(argv[i], "-csb") && (i < argc - 1))
783 par.csb = atof(argv[++i]);
784 else if (!strcmp(argv[i], "-csw") && (i < argc - 1))
785 par.csw = atof(argv[++i]);
786 else if (!strcmp(argv[i], "-corr") && (i < argc - 1))
787 par.corr = atof(argv[++i]);
788 else if (!strcmp(argv[i], "-ovlp") && (i < argc - 1))
789 par.min_overlap = atoi(argv[++i]);
790 else if (!strcmp(argv[i], "-tags"))
791 par.notags = 0;
792 else if (!strcmp(argv[i], "-notags"))
793 par.notags = 1;
794 else if (!strcmp(argv[i], "-maxfilt") && (i < argc - 1))
795 par.maxnumdb = atoi(argv[++i]);
796 else if (!strcmp(argv[i], "-interim_filter")){
797 if (++i >= argc || argv[i][0] == '-') {
798 help(par);
799 HH_LOG(ERROR) << "No state following -interim_filter" << std::endl;
800 exit(4);
801 } else {
802 if(!strcmp(argv[i], "NONE")) {
803 par.interim_filter = INTERIM_FILTER_NONE;
804 } else if(!strcmp(argv[i], "FULL")) {
805 par.interim_filter = INTERIM_FILTER_FULL;
806 } else {
807 help(par);
808 HH_LOG(ERROR) << "No state out of NONE|FULL following -interim_filter" << std::endl;
809 exit(4);
810 }
811 }
812 }
813 else {
814 HH_LOG(WARNING) << "Ignoring unknown option " << argv[i] << std::endl;
815 }
816
817 HH_LOG(DEBUG1) << i << " " << argv[i] << std::endl;
818 } // end of for-loop for command line input
819 }
820
mergeHitsToQuery(HitList & hitlist,Hash<Hit> * previous_hits,Hash<Hit> * premerged_hits,int & seqs_found,int & cluster_found,int min_col_realign)821 void HHblits::mergeHitsToQuery(HitList &hitlist, Hash<Hit>* previous_hits,Hash<Hit>* premerged_hits,
822 int& seqs_found, int& cluster_found, int min_col_realign) {
823
824 // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two)
825 const float COV_ABS = 25; // min. number of aligned residues
826 int cov_tot = std::max(std::min((int) (COV_ABS / Qali->L * 100 + 0.5), 70), par.coverage);
827
828 // For each template below threshold
829 hitlist.Reset();
830 while (!hitlist.End()) {
831 Hit hit_cur = hitlist.ReadNext();
832
833 if (hit_cur.Eval > 100.0 * par.e)
834 break; // E-value much too large
835 if (hit_cur.Eval > par.e)
836 continue; // E-value too large
837 if (hit_cur.matched_cols < min_col_realign)
838 continue; // leave out too short alignments
839
840 // Already in alignment
841 std::stringstream ss_tmp;
842 ss_tmp << hit_cur.file << "__" << hit_cur.irep;
843 if (previous_hits->Contains((char*) ss_tmp.str().c_str()))
844 continue;
845
846 // Add number of sequences in this cluster to total found
847 seqs_found += SequencesInCluster(hit_cur.name); // read number after second '|'
848 cluster_found++;
849
850 // Skip merging this hit if hit alignment was already merged during premerging
851 if (premerged_hits->Contains((char*)ss_tmp.str().c_str()))
852 continue;
853
854 // Read a3m alignment of hit from <file>.a3m file
855 // Reading in next db MSA and merging it onto Qali
856 Alignment Tali(par.maxseq, par.maxres);
857 hit_cur.entry->getTemplateA3M(par, pb, S, Sim, Tali);
858
859 if (par.allseqs) // need to keep *all* sequences in Qali_allseqs? => merge before filtering
860 Qali_allseqs->MergeMasterSlave(hit_cur, Tali, hit_cur.name, par.maxcol);
861
862 Tali.N_filtered = Tali.Filter(par.max_seqid_db, S, par.coverage_db,
863 par.qid_db, par.qsc_db, par.Ndiff_db);
864
865 if(par.interim_filter == INTERIM_FILTER_FULL && Tali.N_filtered + Qali->N_in >= par.maxseq) {
866 Qali->N_filtered = Qali->Filter(par.max_seqid, S, cov_tot, par.qid, par.qsc, par.Ndiff);
867 Qali->Shrink();
868 }
869
870 Qali->MergeMasterSlave(hit_cur, Tali, hit_cur.name, par.maxcol);
871
872 if (Qali->N_in >= par.maxseq)
873 break; // Maximum number of sequences reached
874 }
875
876 // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i]
877 Qali->Compress("merged A3M file", par.cons, par.maxcol, 1, par.Mgaps);
878
879 // Sort out the nseqdis most dissimilacd r sequences for display in the result alignments
880 Qali->FilterForDisplay(par.max_seqid, par.mark, S, par.coverage, par.qid, par.qsc, par.nseqdis);
881
882
883 HH_LOG(DEBUG) << "Filter new alignment with cov " << cov_tot
884 << std::endl;
885
886 Qali->N_filtered = Qali->Filter(par.max_seqid, S, cov_tot, par.qid, par.qsc,
887 par.Ndiff);
888 }
889
add_hits_to_hitlist(std::vector<Hit> & hits,HitList & hitlist)890 void HHblits::add_hits_to_hitlist(std::vector<Hit>& hits, HitList& hitlist) {
891 for (std::vector<Hit>::size_type i = 0; i != hits.size(); i++) {
892 hitlist.Push(hits[i]);
893 }
894
895 // Sort list according to sortscore
896 HH_LOG(DEBUG) << "Sorting hit list ...\n";
897 hitlist.SortList();
898
899 // Use NN prediction of lamda and mu
900 hitlist.CalculatePvalues(q, par.loc, par.ssm, par.ssw);
901
902 // Calculate E-values as combination of P-value for Viterbi HMM-HMM comparison and prefilter E-value: E = Ndb P (Epre/Ndb)^alpha
903 if (par.prefilter)
904 hitlist.CalculateHHblitsEvalues(q, par.dbsize, par.alphaa, par.alphab, par.alphac, par.prefilter_evalue_thresh);
905 }
906
907 /////////////////////////////////////////////////////////////////////////////////////////////////////////////
908 // Variant of ViterbiSearch() function for rescoring previously found HMMs with Viterbi algorithm.
909 // Perform Viterbi search on each hit object in global hash previous_hits, but keep old alignment
910 /////////////////////////////////////////////////////////////////////////////////////////////////////////////
RescoreWithViterbiKeepAlignment(HMMSimd & q_vec,Hash<Hit> * previous_hits)911 void HHblits::RescoreWithViterbiKeepAlignment(HMMSimd& q_vec,
912 Hash<Hit>* previous_hits) {
913 // Initialize
914 std::vector<HHEntry*> hits_to_rescore;
915
916 // Get dbfiles of previous hits
917 previous_hits->Reset();
918 while (!previous_hits->End()) {
919 Hit hit_cur = previous_hits->ReadNext();
920 if (hit_cur.irep == 1) {
921 hits_to_rescore.push_back(hit_cur.entry);
922 }
923 }
924
925 //////////////////////////////////////////////////////////
926 // Start Viterbi search through db HMMs listed in dbfiles
927 // DoViterbiSearch(hits_to_rescore, previous_hits, false);
928
929 ViterbiRunner viterbirunner(viterbiMatrices, dbs, par.threads);
930 std::vector<Hit> hits_to_add = viterbirunner.alignment(par, &q_vec,
931 hits_to_rescore,
932 par.qsc_db, pb, S, Sim,
933 R, par.ssm, S73, S33, S37);
934
935 for (std::vector<Hit>::size_type i = 0; i != hits_to_add.size(); i++) {
936 std::stringstream ss_tmp;
937 ss_tmp << hits_to_add[i].file << "__" << hits_to_add[i].irep;
938 if (previous_hits->Contains((char*) ss_tmp.str().c_str())) {
939 Hit hit_cur = previous_hits->Remove((char*) ss_tmp.str().c_str());
940 previous_hits->Add((char*) ss_tmp.str().c_str(), hits_to_add[i]);
941 // Overwrite *hit[bin] with alignment, etc. of hit_cur
942 hit_cur.score = hits_to_add[i].score;
943 hit_cur.score_aass = hits_to_add[i].score_aass;
944 hit_cur.score_ss = hits_to_add[i].score_ss;
945 hit_cur.Pval = hits_to_add[i].Pval;
946 hit_cur.Pvalt = hits_to_add[i].Pvalt;
947 hit_cur.logPval = hits_to_add[i].logPval;
948 hit_cur.logPvalt = hits_to_add[i].logPvalt;
949 hit_cur.Eval = hits_to_add[i].Eval;
950 hit_cur.logEval = hits_to_add[i].logEval;
951 hit_cur.Probab = hits_to_add[i].Probab;
952 hitlist.Push(hit_cur); // insert hit at beginning of list (last repeats first!)
953 } else {
954 hits_to_add[i].Delete();
955 }
956 }
957
958 // Sort list according to sortscore
959 HH_LOG(DEBUG) << "Sorting hit list ..." << std::endl;
960
961 hitlist.SortList();
962
963 hitlist.CalculatePvalues(q, par.loc, par.ssm, par.ssw); // Use NN prediction of lamda and mu
964
965 if (par.prefilter)
966 hitlist.CalculateHHblitsEvalues(q, par.dbsize, par.alphaa, par.alphab,
967 par.alphac, par.prefilter_evalue_thresh);
968 }
969
970 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
971 // Realign hits with MAC algorithm
972 /////////////////////////////////////////////////////////////////////////////////////////////////////////////
perform_realign(HMMSimd & q_vec,const char input_format,std::vector<HHEntry * > & hits_to_realign,int min_col_realign)973 void HHblits::perform_realign(HMMSimd& q_vec, const char input_format,
974 std::vector<HHEntry*>& hits_to_realign, int min_col_realign) {
975
976 // 19/02/2014: F/B-algos are calculated in log-space
977 // q->Log2LinTransitionProbs(1.0); // transform transition freqs to lin space if not already done
978 int nhits = 0;
979
980 // Longest allowable length of database HMM (backtrace: 5 chars, fwd: 1 double, bwd: 1 double
981 long int Lmaxmem = ((par.maxmem - 0.5) * 1024 * 1024 * 1024)
982 / (2 * sizeof(double) + 8) / q->L / par.threads;
983 int Lmax = 0; // length of longest HMM to be realigned
984
985 /////////////////////////////////////////////////////////////////////////////////////////////////
986 // Categorize the hits: first irep, then dbfile
987 int n_realignments = 0;
988 std::map<short int, std::vector<Hit *> > alignments;
989 hitlist.Reset();
990
991 std::vector<Hit *> hit_vector;
992
993 while (!hitlist.End()) {
994 Hit hit_cur = hitlist.ReadNext();
995 if (n_realignments >= par.realign_max
996 && n_realignments >= imax(par.B, par.Z))
997 break;
998 if (hit_cur.Eval > par.e) {
999 if (n_realignments >= imax(par.B, par.Z))
1000 continue;
1001 if (n_realignments >= imax(par.b, par.z) && hit_cur.Probab < par.p)
1002 continue;
1003 if (n_realignments >= imax(par.b, par.z) && hit_cur.Eval > par.E)
1004 continue;
1005 }
1006
1007 if (hit_cur.L > Lmax) {
1008 Lmax = hit_cur.L;
1009 }
1010 if (hit_cur.L > Lmaxmem) {
1011 nhits++;
1012 continue;
1013 }
1014
1015 hit_vector.push_back(hitlist.ReadCurrentAddress());
1016 n_realignments++;
1017 nhits++;
1018 }
1019
1020 int t_maxres = Lmax + 2;
1021 for (int i = 0; i < par.threads; i++) {
1022 posteriorMatrices[i]->allocateMatrix(q->L, t_maxres);
1023 }
1024
1025 // Initialize a Null-value as a return value if not items are available anymore
1026 PosteriorDecoderRunner runner(posteriorMatrices, viterbiMatrices, par.threads, par.ssw, S73, S33, S37);
1027
1028 HH_LOG(INFO)
1029 << "Realigning " << nhits
1030 << " HMM-HMM alignments using Maximum Accuracy algorithm" << std::endl;
1031
1032 runner.executeComputation(*q, hit_vector, par, par.qsc_db, pb, S, Sim, R);
1033
1034
1035 // Delete all hitlist entries with too short alignments
1036 nhits = 0;
1037 hitlist.Reset();
1038 while (!hitlist.End()) {
1039 Hit hit_cur = hitlist.ReadNext();
1040 //printf("Deleting alignment of %s with length %i? irep=%i nhits=%-2i par.B=%-3i par.Z=%-3i par.e=%.2g par.b=%-3i par.z=%-3i par.p=%.2g\n",hit_cur.name,hit_cur.matched_cols,hit_cur.irep,nhits,par.B,par.Z,par.e,par.b,par.z,par.p);
1041
1042 if (nhits > par.realign_max && nhits >= imax(par.B, par.Z))
1043 break;
1044 if (hit_cur.Eval > par.e) {
1045 if (nhits >= imax(par.B, par.Z))
1046 continue;
1047 if (nhits >= imax(par.b, par.z) && hit_cur.Probab < par.p)
1048 continue;
1049 if (nhits >= imax(par.b, par.z) && hit_cur.Eval > par.E)
1050 continue;
1051 }
1052
1053 if (hit_cur.matched_cols < min_col_realign) {
1054 HH_LOG(DEBUG) << "Deleting alignment of " << hit_cur.name
1055 << " with length " << hit_cur.matched_cols << std::endl;
1056 hitlist.Delete().Delete(); // delete the list record and hit object
1057 // // Make sure only realigned alignments get displayed! JS: Why? better unrealigned than none.
1058 // if (last_round)
1059 // if (par.B>par.Z) par.B--; else if (par.B==par.Z) {par.B--; par.Z--;} else par.Z--;
1060 }
1061 nhits++;
1062 }
1063 }
1064
run(FILE * query_fh,char * query_path)1065 void HHblits::run(FILE* query_fh, char* query_path) {
1066 int cluster_found = 0;
1067 int seqs_found = 0;
1068
1069 std::set<std::string> search_counter;
1070
1071 Hit hit_cur;
1072 Hash<Hit> * previous_hits = new Hash<Hit>(1631, hit_cur);
1073 Hash<Hit> * premerged_hits = new Hash<Hit>(1631, hit_cur);
1074 Qali = new Alignment(par.maxseq, par.maxres);
1075 Qali_allseqs = new Alignment(par.maxseq, par.maxres);
1076
1077 q = new HMM(MAXSEQDIS, par.maxres);
1078 HMMSimd q_vec(par.maxres);
1079 q_tmp = new HMM(MAXSEQDIS, par.maxres);
1080
1081 // Read query input file (HHM or alignment format) without adding pseudocounts
1082 Qali->N_in = 0;
1083 char input_format;
1084 ReadQueryFile(par, query_fh, input_format, par.wg, q, Qali, query_path, pb, S,
1085 Sim);
1086
1087 if (par.allseqs) {
1088 *Qali_allseqs = *Qali; // make a *deep* copy of Qali!
1089 for (int k = 0; k < Qali_allseqs->N_in; ++k)
1090 Qali_allseqs->keep[k] = 1; // keep *all* sequences (reset filtering in Qali)
1091 }
1092
1093 // Set query columns in His-tags etc to Null model distribution
1094 if (par.notags)
1095 q->NeutralizeTags(pb);
1096
1097 //save all entries pointer in this vector to delete, when it's safe
1098 std::vector<HHEntry*> all_entries;
1099 std::vector<HHEntry*> new_entries;
1100 std::vector<HHEntry*> old_entries;
1101
1102 if (!par.prefilter) {
1103 for (size_t i = 0; i < dbs.size(); i++) {
1104 dbs[i]->initNoPrefilter(new_entries);
1105 }
1106 all_entries.insert(all_entries.end(), new_entries.begin(),
1107 new_entries.end());
1108
1109 for (size_t i = 0; i < all_entries.size(); i++) {
1110 search_counter.insert(all_entries[i]->getName());
1111 }
1112 }
1113
1114 //////////////////////////////////////////////////////////////////////////////////
1115 // Main loop overs search iterations
1116 //////////////////////////////////////////////////////////////////////////////////
1117
1118 for (int round = 1; round <= par.num_rounds; round++) {
1119 HH_LOG(INFO) << "Iteration " << round << std::endl;
1120 if (par.premerge > 0 && round > 1 && previous_hits->Size() >= par.premerge)
1121 {
1122 HH_LOG(INFO) << "Set premerge to 0! (premerge: " << par.premerge << " iteration: " <<round << " hits.Size: " << previous_hits->Size() << ")\n";
1123 par.premerge = 0;
1124 } else{
1125 par.premerge -= previous_hits->Size();
1126 }
1127 // Save HMM without pseudocounts for prefilter query-profile
1128 *q_tmp = *q;
1129 HMM* q_rescore = new HMM(MAXSEQDIS, par.maxres);
1130
1131
1132 PrepareQueryHMM(par, input_format, q, pc_hhm_context_engine,
1133 pc_hhm_context_mode, pb, R);
1134 //deep copy for rescoring of q
1135 *q_rescore = *q;
1136 q_vec.MapOneHMM(q_rescore);
1137
1138 ////////////////////////////////////////////
1139 // Prefiltering
1140 ////////////////////////////////////////////
1141
1142 if (par.prefilter) {
1143 HH_LOG(INFO) << "Prefiltering database" << std::endl;
1144
1145 new_entries.clear();
1146 old_entries.clear();
1147
1148 // Add Pseudocounts to q_tmp
1149 if (par.nocontxt) {
1150 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
1151 q_tmp->PreparePseudocounts(R);
1152 // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
1153 q_tmp->AddAminoAcidPseudocounts(par.pc_prefilter_nocontext_mode,
1154 par.pc_prefilter_nocontext_a,
1155 par.pc_prefilter_nocontext_b,
1156 par.pc_prefilter_nocontext_c);
1157 } else {
1158 // Add context specific pseudocounts (now always used, because clusterfile is necessary)
1159 q_tmp->AddContextSpecificPseudocounts(pc_prefilter_context_engine,
1160 pc_prefilter_context_mode);
1161 }
1162
1163 q_tmp->CalculateAminoAcidBackground(pb);
1164
1165 for (size_t i = 0; i < dbs.size(); i++) {
1166 dbs[i]->prefilter_db(q_tmp, previous_hits, par.threads,
1167 par.prefilter_gap_open, par.prefilter_gap_extend,
1168 par.prefilter_score_offset,
1169 par.prefilter_bit_factor,
1170 par.prefilter_evalue_thresh,
1171 par.prefilter_evalue_coarse_thresh,
1172 par.preprefilter_smax_thresh,
1173 par.min_prefilter_hits, par.maxnumdb, R,
1174 new_entries, old_entries);
1175 }
1176
1177 for (size_t i = 0; i < new_entries.size(); i++) {
1178 search_counter.insert(new_entries[i]->getName());
1179 }
1180
1181 all_entries.insert(all_entries.end(), new_entries.begin(),
1182 new_entries.end());
1183 all_entries.insert(all_entries.end(), old_entries.begin(),
1184 old_entries.end());
1185 }
1186
1187 int max_template_length = getMaxTemplateLength(new_entries);
1188 if (max_template_length > par.maxres) {
1189 HH_LOG(WARNING)
1190 << "database contains sequences that exceed maximum allowed size (maxres = "
1191 << par.maxres << "). Max sequence length can be increased with parameter -maxres."
1192 << std::endl;
1193 }
1194 max_template_length = std::min(max_template_length, par.maxres);
1195 for (int i = 0; i < par.threads; i++) {
1196 viterbiMatrices[i]->AllocateBacktraceMatrix(q->L, max_template_length);
1197 }
1198
1199 hitlist.N_searched = search_counter.size();
1200
1201 if (new_entries.size() == 0) {
1202 HH_LOG(INFO) << "No HMMs pass prefilter => Stop searching!" << std::endl;
1203 break;
1204 }
1205
1206 if (par.prefilter) {
1207 // Search datbases
1208 HH_LOG(INFO)
1209 << "HMMs passed 2nd prefilter (gapped profile-profile alignment) : " << new_entries.size() + old_entries.size() << std::endl;
1210 HH_LOG(INFO)
1211 << "HMMs passed 2nd prefilter and not found in previous iterations : " << new_entries.size() << std::endl;
1212 }
1213 HH_LOG(INFO) << "Scoring " << new_entries.size() << " HMMs using HMM-HMM Viterbi alignment" << std::endl;
1214 // Main Viterbi HMM-HMM search
1215 ViterbiRunner viterbirunner(viterbiMatrices, dbs, par.threads);
1216 std::vector<Hit> hits_to_add = viterbirunner.alignment(par, &q_vec,
1217 new_entries,
1218 par.qsc_db, pb, S,
1219 Sim, R, par.ssm, S73, S33, S37);
1220
1221 add_hits_to_hitlist(hits_to_add, hitlist);
1222
1223 // check for new hits or end with iteration
1224 int new_hits = 0;
1225 hitlist.Reset();
1226 while (!hitlist.End()) {
1227 Hit hit_cur = hitlist.ReadNext();
1228 if (hit_cur.Eval > 100.0 * par.e)
1229 break; // E-value much too large
1230 if (hit_cur.Eval > par.e)
1231 continue; // E-value too large
1232 new_hits++;
1233 }
1234
1235 if (new_hits == 0 || round == par.num_rounds) {
1236 if (round < par.num_rounds) {
1237 HH_LOG(INFO) << "No new hits found in iteration " << round
1238 << " => Stop searching" << std::endl;
1239 }
1240
1241 if (old_entries.size() > 0 && par.realign_old_hits) {
1242 HH_LOG(INFO)
1243 << "Rescoring previously found HMMs with Viterbi algorithm"
1244 << std::endl;
1245
1246 ViterbiRunner viterbirunner(viterbiMatrices, dbs, par.threads);
1247 std::vector<Hit> hits_to_add = viterbirunner.alignment(par, &q_vec,
1248 old_entries,
1249 par.qsc_db, pb,
1250 S, Sim, R, par.ssm, S73, S33, S37);
1251 add_hits_to_hitlist(hits_to_add, hitlist);
1252 // Add dbfiles_old to dbfiles_new for realign
1253 new_entries.insert(new_entries.end(), old_entries.begin(),
1254 old_entries.end());
1255 } else if (!par.realign_old_hits && previous_hits->Size() > 0) {
1256 HH_LOG(INFO)
1257 << "Rescoring previously found HMMs with Viterbi algorithm"
1258 << std::endl;
1259 RescoreWithViterbiKeepAlignment(q_vec, previous_hits);
1260 }
1261 }
1262 if (par.premerge){
1263 premerge(previous_hits, premerged_hits, seqs_found, cluster_found, MINCOLS_REALIGN);
1264 }
1265 // Realign hits with MAC algorithm
1266 if (par.realign) {
1267 perform_realign(q_vec, input_format, new_entries, MINCOLS_REALIGN);
1268 }
1269 // Generate alignment for next iteration
1270 if (round < par.num_rounds || *par.alnfile || *par.psifile || *par.hhmfile || *par.alisbasename) {
1271 if (new_hits > 0) {
1272 mergeHitsToQuery(hitlist, previous_hits, premerged_hits, seqs_found, cluster_found, MINCOLS_REALIGN);
1273 }
1274
1275 // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a]
1276 Qali->FrequenciesAndTransitions(q, par.wg, par.mark, par.cons, par.showcons, pb, Sim, NULL, true);
1277
1278 if (par.notags)
1279 q->NeutralizeTags(pb);
1280
1281 if (*par.alisbasename) {
1282 Alignment* tmp = new Alignment(par.maxseq, par.maxres);
1283 if (par.allseqs) {
1284 (*tmp) = (*Qali_allseqs);
1285 } else {
1286 (*tmp) = (*Qali);
1287 }
1288
1289 alis[round] = tmp;
1290 }
1291 }
1292 // Update counts for log
1293 else if (round == par.num_rounds) {
1294 hitlist.Reset();
1295 while (!hitlist.End()) {
1296 Hit hit_cur = hitlist.ReadNext();
1297
1298 // E-value much too large
1299 if (hit_cur.Eval > 100.0 * par.e)
1300 break;
1301
1302 // E-value too large
1303 if (hit_cur.Eval > par.e)
1304 continue;
1305
1306 std::stringstream ss_tmp;
1307 ss_tmp << hit_cur.file << "__" << hit_cur.irep;
1308 // Already in alignment?
1309 if (previous_hits->Contains((char*) ss_tmp.str().c_str()))
1310 continue;
1311
1312 // Add number of sequences in this cluster to total found
1313 // read number after second '|'
1314 seqs_found += SequencesInCluster(hit_cur.name);
1315 cluster_found++;
1316 }
1317 }
1318
1319 HH_LOG(INFO) << seqs_found << " sequences belonging to "
1320 << cluster_found
1321 << " database HMMs found with an E-value < " << par.e
1322 << std::endl;
1323
1324 if (round < par.num_rounds || *par.alnfile || *par.psifile || *par.hhmfile
1325 || *par.alisbasename) {
1326 HH_LOG(INFO)
1327 << "Number of effective sequences of resulting query HMM: Neff = "
1328 << q->Neff_HMM << std::endl;
1329 }
1330
1331 if (q->Neff_HMM > par.neffmax && round < par.num_rounds) {
1332 HH_LOG(INFO)
1333 << "Diversity is above threshold (" << par.neffmax
1334 << "). Stop searching! (Change threshold using -neffmax <float>.)"
1335 << std::endl;
1336 }
1337
1338 if (Qali->N_in >= par.maxseq) {
1339 HH_LOG(INFO)
1340 << "Maximun number of sequences in query alignment reached (" << par.maxseq << "). Stop searching!" << std::endl;
1341 }
1342
1343 if (new_hits == 0 || round == par.num_rounds || q->Neff_HMM > par.neffmax || Qali->N_in >= par.maxseq) {
1344 // if (new_hits == 0 && round < par.num_rounds) {
1345 // HH_LOG(INFO) << "No new hits found in iteration " << round
1346 // << " => Stop searching" << std::endl;
1347 // }
1348 //
1349 // if (old_entries.size() > 0 && par.realign_old_hits) {
1350 // HH_LOG(INFO)
1351 // << "Recalculating previously found HMMs with Viterbi algorithm"
1352 // << std::endl;
1353 //
1354 // ViterbiRunner viterbirunner(viterbiMatrices, dbs, par.threads);
1355 // std::vector<Hit> hits_to_add = viterbirunner.alignment(par, &q_vec,
1356 // old_entries,
1357 // par.qsc_db, pb,
1358 // S, Sim, R, par.ssm, S73, S33, S37);
1359 //
1360 // add_hits_to_hitlist(hits_to_add, hitlist);
1361 //
1362 // if (par.realign)
1363 // perform_realign(q_vec, input_format, old_entries);
1364 // } else if (!par.realign_old_hits && previous_hits->Size() > 0) {
1365 // HH_LOG(INFO)
1366 // << "Rescoring previously found HMMs with Viterbi algorithm"
1367 // << std::endl;
1368 // RescoreWithViterbiKeepAlignment(q_vec, previous_hits);
1369 // }
1370
1371 delete q_rescore;
1372 break;
1373 }
1374
1375
1376 // Write good hits to previous_hits hash and clear hitlist
1377 hitlist.Reset();
1378 while (!hitlist.End()) {
1379 Hit hit_cur = hitlist.ReadNext();
1380
1381 std::stringstream ss_tmp;
1382 ss_tmp << hit_cur.file << "__" << hit_cur.irep;
1383
1384 if (!par.already_seen_filter || hit_cur.Eval > par.e
1385 || previous_hits->Contains((char*) ss_tmp.str().c_str()))
1386 hit_cur.Delete(); // Delete hit object (deep delete with Hit::Delete())
1387 else {
1388 previous_hits->Add((char*) ss_tmp.str().c_str(), hit_cur);
1389 }
1390
1391 hitlist.Delete(); // Delete list record (flat delete)
1392 }
1393
1394 delete q_rescore;
1395 }
1396
1397 // Warn, if HMMER files were used
1398 if (par.hmmer_used) {
1399 HH_LOG(WARNING)
1400 << "Using HMMER files results in a drastically reduced sensitivity (>10%%).\n"
1401 << " We recommend to use HHMs build by hhmake." << std::endl;
1402 }
1403
1404 for (size_t i = 0; i < all_entries.size(); i++) {
1405 delete all_entries[i];
1406 }
1407 all_entries.clear();
1408
1409 previous_hits->Reset();
1410 while (!previous_hits->End())
1411 previous_hits->ReadNext().Delete(); // Delete hit object
1412 delete previous_hits;
1413 delete premerged_hits;
1414 }
1415
1416
run(ffindex_entry_t * entry,char * data,ffindex_index_t * sequence_index,char * seq,ffindex_index_t * header_index,char * header)1417 void HHblits::run(ffindex_entry_t* entry, char* data,
1418 ffindex_index_t* sequence_index, char* seq,
1419 ffindex_index_t* header_index, char* header) {
1420
1421
1422 q=new HMM(MAXSEQDIS, par.maxres);
1423
1424 Qali = new Alignment(par.maxseq, par.maxres);
1425 Qali_allseqs = new Alignment(par.maxseq, par.maxres);
1426
1427
1428
1429 // Read alignment from infile into matrix X[k][l] as ASCII (and supply first line as extra argument)
1430 Qali->ReadCompressed(entry,data,
1431 sequence_index, seq,
1432 header_index, header,
1433 par.mark, par.maxcol);
1434
1435
1436 // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i]
1437 // and store marked sequences in name[k] and seq[k]
1438 Qali->Compress(par.infile, par.cons, par.maxcol, par.M, par.Mgaps);
1439
1440 Qali->Shrink();
1441
1442 // Sort out the nseqdis most dissimilar sequences for display in the output alignments
1443 Qali->FilterForDisplay(par.max_seqid, par.mark, S, par.coverage, par.qid, par.qsc, par.nseqdis);
1444
1445
1446 // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two)
1447 Qali->N_filtered = Qali->Filter(par.max_seqid, S, par.coverage, par.qid, par.qsc, par.Ndiff);
1448
1449 if (par.Neff >= 0.999)
1450 Qali->FilterNeff(par.wg, par.mark, par.cons, par.showcons, par.max_seqid, par.coverage, par.Neff, pb, S, Sim);
1451
1452 // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a]
1453 Qali->FrequenciesAndTransitions(q, par.wg, par.mark, par.cons,par.showcons, pb, Sim);
1454
1455
1456
1457 int cluster_found = 0;
1458 int seqs_found = 0;
1459
1460 std::set<std::string> search_counter;
1461
1462 Hit hit_cur;
1463 Hash<Hit>* previous_hits = new Hash<Hit>(1631, hit_cur);
1464 Hash<Hit>* premerged_hits = new Hash<Hit>(1631, hit_cur);
1465
1466
1467 HMMSimd q_vec(par.maxres);
1468 q_tmp = new HMM(MAXSEQDIS, par.maxres);
1469
1470 // Read query input file (HHM or alignment format) without adding pseudocounts
1471 Qali->N_in = 0;
1472 char input_format;
1473
1474 /*ReadQueryFile(par, query_fh, input_format, par.wg, q, Qali, query_path, pb, S,
1475 Sim);
1476 */
1477
1478
1479 if (par.allseqs) {
1480 *Qali_allseqs = *Qali; // make a *deep* copy of Qali!
1481 for (int k = 0; k < Qali_allseqs->N_in; ++k)
1482 Qali_allseqs->keep[k] = 1; // keep *all* sequences (reset filtering in Qali)
1483 }
1484
1485 // Set query columns in His-tags etc to Null model distribution
1486 if (par.notags)
1487 q->NeutralizeTags(pb);
1488
1489 //save all entries pointer in this vector to delete, when it's safe
1490 std::vector<HHEntry*> all_entries;
1491 std::vector<HHEntry*> new_entries;
1492 std::vector<HHEntry*> old_entries;
1493
1494 if (!par.prefilter) {
1495 for (size_t i = 0; i < dbs.size(); i++) {
1496 dbs[i]->initNoPrefilter(new_entries);
1497 }
1498 all_entries.insert(all_entries.end(), new_entries.begin(),
1499 new_entries.end());
1500
1501 for (size_t i = 0; i < all_entries.size(); i++) {
1502 search_counter.insert(all_entries[i]->getName());
1503 }
1504 }
1505
1506 //////////////////////////////////////////////////////////////////////////////////
1507 // Main loop overs search iterations
1508 //////////////////////////////////////////////////////////////////////////////////
1509
1510 for (int round = 1; round <= par.num_rounds; round++) {
1511 HH_LOG(INFO) << "Iteration " << round << std::endl;
1512 // Settings for different rounds
1513 if (par.premerge > 0 && round > 1 && previous_hits->Size() >= par.premerge)
1514 {
1515 HH_LOG(INFO) << "Set premerge to 0! (premerge: " << par.premerge << " iteration: " <<round << " hits.Size: " << previous_hits->Size() << ")\n";
1516 par.premerge = 0;
1517 } else{
1518 par.premerge -= previous_hits->Size();
1519 }
1520 // Save HMM without pseudocounts for prefilter query-profile
1521 *q_tmp = *q;
1522 HMM* q_rescore = new HMM(MAXSEQDIS, par.maxres);
1523
1524
1525 PrepareQueryHMM(par, input_format, q, pc_hhm_context_engine,
1526 pc_hhm_context_mode, pb, R);
1527 //deep copy for rescoring of q
1528 *q_rescore = *q;
1529 q_vec.MapOneHMM(q_rescore);
1530
1531 ////////////////////////////////////////////
1532 // Prefiltering
1533 ////////////////////////////////////////////
1534
1535 if (par.prefilter) {
1536 HH_LOG(INFO) << "Prefiltering database" << std::endl;
1537
1538 new_entries.clear();
1539 old_entries.clear();
1540
1541 // Add Pseudocounts to q_tmp
1542 if (par.nocontxt) {
1543 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
1544 q_tmp->PreparePseudocounts(R);
1545 // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
1546 q_tmp->AddAminoAcidPseudocounts(par.pc_prefilter_nocontext_mode,
1547 par.pc_prefilter_nocontext_a,
1548 par.pc_prefilter_nocontext_b,
1549 par.pc_prefilter_nocontext_c);
1550 } else {
1551 // Add context specific pseudocounts (now always used, because clusterfile is necessary)
1552 q_tmp->AddContextSpecificPseudocounts(pc_prefilter_context_engine,
1553 pc_prefilter_context_mode);
1554 }
1555
1556 q_tmp->CalculateAminoAcidBackground(pb);
1557
1558 for (size_t i = 0; i < dbs.size(); i++) {
1559 dbs[i]->prefilter_db(q_tmp, previous_hits, par.threads,
1560 par.prefilter_gap_open, par.prefilter_gap_extend,
1561 par.prefilter_score_offset,
1562 par.prefilter_bit_factor,
1563 par.prefilter_evalue_thresh,
1564 par.prefilter_evalue_coarse_thresh,
1565 par.preprefilter_smax_thresh,
1566 par.min_prefilter_hits, par.maxnumdb, R,
1567 new_entries, old_entries);
1568 }
1569
1570 for (size_t i = 0; i < new_entries.size(); i++) {
1571 search_counter.insert(new_entries[i]->getName());
1572 }
1573
1574 all_entries.insert(all_entries.end(), new_entries.begin(),
1575 new_entries.end());
1576 all_entries.insert(all_entries.end(), old_entries.begin(),
1577 old_entries.end());
1578 }
1579
1580 int max_template_length = getMaxTemplateLength(new_entries);
1581 if (max_template_length > par.maxres) {
1582 HH_LOG(WARNING)
1583 << "database contains sequences that exceeds maximum allowed size (maxres = "
1584 << par.maxres << "). Maxres can be increased with parameter -maxres."
1585 << std::endl;
1586 }
1587 max_template_length = std::min(max_template_length, par.maxres);
1588 for (int i = 0; i < par.threads; i++) {
1589 viterbiMatrices[i]->AllocateBacktraceMatrix(q->L, max_template_length);
1590 }
1591
1592 hitlist.N_searched = search_counter.size();
1593
1594 if (new_entries.size() == 0) {
1595 HH_LOG(INFO) << "No HMMs pass prefilter => Stop searching!"
1596 << std::endl;
1597 break;
1598 }
1599
1600 // Search datbases
1601 HH_LOG(INFO)
1602 << "HMMs passed 2nd prefilter (gapped profile-profile alignment) : "
1603 << new_entries.size() + old_entries.size() << std::endl;
1604 HH_LOG(INFO)
1605 << "HMMs passed 2nd prefilter and not found in previous iterations : "
1606 << new_entries.size() << std::endl;
1607 HH_LOG(INFO) << "Scoring " << new_entries.size()
1608 << " HMMs using HMM-HMM Viterbi alignment"
1609 << std::endl;
1610
1611 // Main Viterbi HMM-HMM search
1612 ViterbiRunner viterbirunner(viterbiMatrices, dbs, par.threads);
1613 std::vector<Hit> hits_to_add = viterbirunner.alignment(par, &q_vec,
1614 new_entries,
1615 par.qsc_db, pb, S,
1616 Sim, R, par.ssm, S73, S33, S37);
1617
1618 add_hits_to_hitlist(hits_to_add, hitlist);
1619
1620 // check for new hits or end with iteration
1621 int new_hits = 0;
1622 hitlist.Reset();
1623 while (!hitlist.End()) {
1624 Hit hit_cur = hitlist.ReadNext();
1625 if (hit_cur.Eval > 100.0 * par.e)
1626 break; // E-value much too large
1627 if (hit_cur.Eval > par.e)
1628 continue; // E-value too large
1629 new_hits++;
1630 }
1631
1632 if (new_hits == 0 || round == par.num_rounds) {
1633 if (round < par.num_rounds) {
1634 HH_LOG(INFO) << "No new hits found in iteration " << round
1635 << " => Stop searching" << std::endl;
1636 }
1637
1638 if (old_entries.size() > 0 && par.realign_old_hits) {
1639 HH_LOG(INFO)
1640 << "Rescoring previously found HMMs with Viterbi algorithm"
1641 << std::endl;
1642
1643 ViterbiRunner viterbirunner(viterbiMatrices, dbs, par.threads);
1644 std::vector<Hit> hits_to_add = viterbirunner.alignment(par, &q_vec,
1645 old_entries,
1646 par.qsc_db, pb,
1647 S, Sim, R, par.ssm, S73, S33, S37);
1648 add_hits_to_hitlist(hits_to_add, hitlist);
1649 // Add dbfiles_old to dbfiles_new for realign
1650 new_entries.insert(new_entries.end(), old_entries.begin(),
1651 old_entries.end());
1652 } else if (!par.realign_old_hits && previous_hits->Size() > 0) {
1653 HH_LOG(INFO)
1654 << "Rescoring previously found HMMs with Viterbi algorithm"
1655 << std::endl;
1656 RescoreWithViterbiKeepAlignment(q_vec, previous_hits);
1657 }
1658 }
1659 if (par.premerge){
1660 premerge(previous_hits, premerged_hits, seqs_found, cluster_found, MINCOLS_REALIGN);
1661 }
1662 // Realign hits with MAC algorithm
1663 if (par.realign)
1664 perform_realign(q_vec, input_format, new_entries, MINCOLS_REALIGN);
1665
1666 // Generate alignment for next iteration
1667 if (round < par.num_rounds || *par.alnfile || *par.psifile || *par.hhmfile || *par.alisbasename) {
1668 if (new_hits > 0) {
1669 mergeHitsToQuery(hitlist, previous_hits, premerged_hits, seqs_found, cluster_found, MINCOLS_REALIGN);
1670 }
1671
1672 // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a]
1673 Qali->FrequenciesAndTransitions(q, par.wg, par.mark, par.cons,par.showcons, pb, Sim, NULL,true);
1674
1675 if (par.notags)
1676 q->NeutralizeTags(pb);
1677
1678 if (*par.alisbasename) {
1679 Alignment* tmp = new Alignment(par.maxseq, par.maxres);
1680 if (par.allseqs) {
1681 (*tmp) = (*Qali_allseqs);
1682 } else {
1683 (*tmp) = (*Qali);
1684 }
1685
1686 alis[round] = tmp;
1687 }
1688 }
1689 // Update counts for log
1690 else if (round == par.num_rounds) {
1691 hitlist.Reset();
1692 while (!hitlist.End()) {
1693 Hit hit_cur = hitlist.ReadNext();
1694
1695 // E-value much too large
1696 if (hit_cur.Eval > 100.0 * par.e)
1697 break;
1698
1699 // E-value too large
1700 if (hit_cur.Eval > par.e)
1701 continue;
1702
1703 std::stringstream ss_tmp;
1704 ss_tmp << hit_cur.file << "__" << hit_cur.irep;
1705 // Already in alignment?
1706 if (previous_hits->Contains((char*) ss_tmp.str().c_str()))
1707 continue;
1708
1709 // Add number of sequences in this cluster to total found
1710 // read number after second '|'
1711 seqs_found += SequencesInCluster(hit_cur.name);
1712 cluster_found++;
1713 }
1714 }
1715
1716 HH_LOG(INFO) << seqs_found << " sequences belonging to "
1717 << cluster_found
1718 << " database HMMs found with an E-value < " << par.e
1719 << std::endl;
1720
1721 if (round < par.num_rounds || *par.alnfile || *par.psifile || *par.hhmfile
1722 || *par.alisbasename) {
1723 HH_LOG(INFO)
1724 << "Number of effective sequences of resulting query HMM: Neff = "
1725 << q->Neff_HMM << std::endl;
1726 }
1727
1728 if (q->Neff_HMM > par.neffmax && round < par.num_rounds) {
1729 HH_LOG(INFO)
1730 << "Diversity is above threshold (" << par.neffmax
1731 << "). Stop searching! (Change threshold using -neffmax <float>.)"
1732 << std::endl;
1733 }
1734
1735 if (Qali->N_in >= par.maxseq) {
1736 HH_LOG(INFO)
1737 << "Maximun number of sequences in query alignment reached ("
1738 << par.maxseq << "). Stop searching!" << std::endl;
1739 }
1740
1741 if (new_hits == 0 || round == par.num_rounds || q->Neff_HMM > par.neffmax || Qali->N_in >= par.maxseq) {
1742 // if (new_hits == 0 && round < par.num_rounds) {
1743 // HH_LOG(INFO) << "No new hits found in iteration " << round
1744 // << " => Stop searching" << std::endl;
1745 // }
1746 //
1747 // if (old_entries.size() > 0 && par.realign_old_hits) {
1748 // HH_LOG(INFO)
1749 // << "Recalculating previously found HMMs with Viterbi algorithm"
1750 // << std::endl;
1751 //
1752 // ViterbiRunner viterbirunner(viterbiMatrices, dbs, par.threads);
1753 // std::vector<Hit> hits_to_add = viterbirunner.alignment(par, &q_vec,
1754 // old_entries,
1755 // par.qsc_db, pb,
1756 // S, Sim, R, par.ssm, S73, S33, S37);
1757 //
1758 // add_hits_to_hitlist(hits_to_add, hitlist);
1759 //
1760 // if (par.realign)
1761 // perform_realign(q_vec, input_format, old_entries);
1762 // } else if (!par.realign_old_hits && previous_hits->Size() > 0) {
1763 // HH_LOG(INFO)
1764 // << "Rescoring previously found HMMs with Viterbi algorithm"
1765 // << std::endl;
1766 // RescoreWithViterbiKeepAlignment(q_vec, previous_hits);
1767 // }
1768
1769 delete q_rescore;
1770 break;
1771 }
1772
1773
1774 // Write good hits to previous_hits hash and clear hitlist
1775 hitlist.Reset();
1776 while (!hitlist.End()) {
1777 Hit hit_cur = hitlist.ReadNext();
1778
1779 std::stringstream ss_tmp;
1780 ss_tmp << hit_cur.file << "__" << hit_cur.irep;
1781
1782 if (!par.already_seen_filter || hit_cur.Eval > par.e
1783 || previous_hits->Contains((char*) ss_tmp.str().c_str()))
1784 hit_cur.Delete(); // Delete hit object (deep delete with Hit::Delete())
1785 else {
1786 previous_hits->Add((char*) ss_tmp.str().c_str(), hit_cur);
1787 }
1788
1789 hitlist.Delete(); // Delete list record (flat delete)
1790 }
1791
1792 delete q_rescore;
1793 }
1794
1795 // Warn, if HMMER files were used
1796 if (par.hmmer_used) {
1797 HH_LOG(WARNING)
1798 << "Using HMMER files results in a drastically reduced sensitivity (>10%%).\n"
1799 << " We recommend to use HHMs build by hhmake." << std::endl;
1800 }
1801
1802 for (size_t i = 0; i < all_entries.size(); i++) {
1803 delete all_entries[i];
1804 }
1805 all_entries.clear();
1806
1807 previous_hits->Reset();
1808 while (!previous_hits->End())
1809 previous_hits->ReadNext().Delete(); // Delete hit object
1810 delete previous_hits;
1811 delete premerged_hits;
1812 }
1813
1814
1815
writeHHRFile(char * hhrFile)1816 void HHblits::writeHHRFile(char* hhrFile) {
1817 if (*hhrFile) {
1818 hitlist.PrintHHR(q_tmp, hhrFile, par.maxdbstrlen, par.showconf,
1819 par.showcons, par.showdssp, par.showpred, par.b, par.B,
1820 par.z, par.Z, par.aliwidth, par.nseqdis, par.p, par.E,
1821 par.argc, par.argv, S, par.maxseq);
1822 }
1823 }
1824
writeAlisFile(char * basename)1825 void HHblits::writeAlisFile(char* basename) {
1826 if (*basename) {
1827 std::map<int, Alignment*>::iterator it;
1828 for (it = alis.begin(); it != alis.end(); it++) {
1829 std::stringstream ss_tmp;
1830 ss_tmp << basename << "_" << (*it).first << ".a3m";
1831 std::string id = ss_tmp.str();
1832
1833 (*it).second->WriteToFile(id.c_str(), par.append, "a3m");
1834 }
1835 }
1836 }
1837
writeScoresFile(char * scoresFile)1838 void HHblits::writeScoresFile(char* scoresFile) {
1839 if (*scoresFile) {
1840 hitlist.PrintScoreFile(q, scoresFile);
1841 }
1842 }
1843
writeM8(char * m8File)1844 void HHblits::writeM8(char* m8File) {
1845 if (*m8File) {
1846 hitlist.PrintM8File(q, m8File, par.nseqdis, par.p, par.b, par.E);
1847 }
1848 }
1849
writePairwiseAlisFile(char * pairwiseAlisFile,char outformat)1850 void HHblits::writePairwiseAlisFile(char* pairwiseAlisFile, char outformat) {
1851 if (*pairwiseAlisFile) {
1852 hitlist.PrintAlignments(q, pairwiseAlisFile, par.showconf, par.showcons,
1853 par.showdssp, par.showpred, par.p, par.aliwidth,
1854 par.nseqdis, par.b, par.B, par.E, S, par.maxseq, outformat);
1855 }
1856 }
1857
writeAlitabFile(char * alitabFile)1858 void HHblits::writeAlitabFile(char* alitabFile) {
1859 if (*alitabFile) {
1860 hitlist.WriteToAlifile(q, alitabFile, par.b, par.B, par.z, par.Z, par.p,
1861 par.E);
1862 }
1863 }
1864
writePsiFile(char * psiFile)1865 void HHblits::writePsiFile(char* psiFile) {
1866 // Write output PSI-BLAST-formatted alignment?
1867 if (*psiFile) {
1868 if (par.allseqs)
1869 Qali_allseqs->WriteToFile(psiFile, par.append, "psi");
1870 else
1871 Qali->WriteToFile(psiFile, par.append, "psi");
1872 }
1873 }
1874
writeHMMFile(char * HMMFile)1875 void HHblits::writeHMMFile(char* HMMFile) {
1876 // Write output HHM file?
1877 if (*HMMFile) {
1878 // Add *no* amino acid pseudocounts to query. This is necessary to copy f[i][a] to p[i][a]
1879 q->AddAminoAcidPseudocounts(0, 0.0, 0.0, 1.0);
1880 q->CalculateAminoAcidBackground(pb);
1881
1882 q->WriteToFile(HMMFile, par.append, par.max_seqid, par.coverage, par.qid,
1883 par.Ndiff, par.qsc, par.argc, par.argv, pb);
1884 }
1885 }
1886
writeA3MFile(char * A3MFile)1887 void HHblits::writeA3MFile(char* A3MFile) {
1888 // Write output A3M alignment?
1889 if (*A3MFile) {
1890 if (par.allseqs)
1891 Qali_allseqs->WriteToFile(A3MFile, par.append, "a3m");
1892 else
1893 Qali->WriteToFile(A3MFile, par.append, "a3m");
1894 }
1895 }
1896
writeHHRFile(HHblits & hhblits,std::stringstream & out)1897 void HHblits::writeHHRFile(HHblits& hhblits, std::stringstream& out) {
1898 hhblits.hitlist.PrintHHR(hhblits.q_tmp, out, hhblits.par.maxdbstrlen,
1899 hhblits.par.showconf, hhblits.par.showcons,
1900 hhblits.par.showdssp, hhblits.par.showpred,
1901 hhblits.par.b, hhblits.par.B, hhblits.par.z,
1902 hhblits.par.Z, hhblits.par.aliwidth,
1903 hhblits.par.nseqdis, hhblits.par.p, hhblits.par.E,
1904 hhblits.par.argc, hhblits.par.argv, hhblits.S,
1905 hhblits.par.maxseq);
1906 }
1907
printHitList()1908 void HHblits::printHitList() {
1909 char output[] = {"stdout"};
1910 hitlist.PrintHitList(q, output, par.maxdbstrlen, par.z, par.Z, par.p, par.E, par.argc, par.argv);
1911 }
1912
printHHRFile()1913 void HHblits::printHHRFile() {
1914 char output[] = {"stdout"};
1915 hitlist.PrintHHR(q_tmp, output, par.maxdbstrlen,
1916 par.showconf, par.showcons,
1917 par.showdssp, par.showpred,
1918 par.b, par.B, par.z,
1919 par.Z, par.aliwidth,
1920 par.nseqdis, par.p, par.E,
1921 par.argc, par.argv, S, par.maxseq);
1922 }
1923
writeScoresFile(HHblits & hhblits,std::stringstream & out)1924 void HHblits::writeScoresFile(HHblits& hhblits, std::stringstream& out) {
1925 hhblits.hitlist.PrintScoreFile(hhblits.q, out);
1926 }
1927
writeM8(HHblits & hhblits,std::stringstream & out)1928 void HHblits::writeM8(HHblits& hhblits, std::stringstream& out) {
1929 hhblits.hitlist.PrintM8File(hhblits.q, out, hhblits.par.nseqdis, hhblits.par.p, hhblits.par.b, hhblits.par.E);
1930 }
1931
writePairwiseAlisFile(HHblits & hhblits,std::stringstream & out)1932 void HHblits::writePairwiseAlisFile(HHblits& hhblits, std::stringstream& out) {
1933 hhblits.hitlist.PrintAlignments(hhblits.q, out, hhblits.par.showconf,
1934 hhblits.par.showcons, hhblits.par.showdssp,
1935 hhblits.par.showpred, hhblits.par.p,
1936 hhblits.par.aliwidth, hhblits.par.nseqdis,
1937 hhblits.par.b, hhblits.par.B, hhblits.par.E,
1938 hhblits.S, hhblits.par.maxseq, hhblits.par.outformat);
1939 }
1940
writeAlitabFile(HHblits & hhblits,std::stringstream & out)1941 void HHblits::writeAlitabFile(HHblits& hhblits, std::stringstream& out) {
1942 hhblits.hitlist.WriteToAlifile(hhblits.q, out, hhblits.par.b, hhblits.par.B,
1943 hhblits.par.z, hhblits.par.Z, hhblits.par.p,
1944 hhblits.par.E);
1945 }
1946
writePsiFile(HHblits & hhblits,std::stringstream & out)1947 void HHblits::writePsiFile(HHblits& hhblits, std::stringstream& out) {
1948 if (hhblits.par.allseqs)
1949 hhblits.Qali_allseqs->WriteToFile(out, "psi");
1950 else
1951 hhblits.Qali->WriteToFile(out, "psi");
1952 }
1953
writeHMMFile(HHblits & hhblits,std::stringstream & out)1954 void HHblits::writeHMMFile(HHblits& hhblits, std::stringstream& out) {
1955 // Add *no* amino acid pseudocounts to query. This is necessary to copy f[i][a] to p[i][a]
1956 hhblits.q->AddAminoAcidPseudocounts(0, 0.0, 0.0, 1.0);
1957 hhblits.q->CalculateAminoAcidBackground(hhblits.pb);
1958
1959 hhblits.q->WriteToFile(out, hhblits.par.max_seqid, hhblits.par.coverage,
1960 hhblits.par.qid, hhblits.par.Ndiff, hhblits.par.qsc,
1961 hhblits.par.argc, hhblits.par.argv, hhblits.pb);
1962 }
1963
writeA3MFile(HHblits & hhblits,std::stringstream & out)1964 void HHblits::writeA3MFile(HHblits& hhblits, std::stringstream& out) {
1965 if (hhblits.par.allseqs)
1966 hhblits.Qali_allseqs->WriteToFile(out, "a3m");
1967 else
1968 hhblits.Qali->WriteToFile(out, "a3m");
1969 }
1970
writeMatricesFile(char * matricesOutputFileName)1971 void HHblits::writeMatricesFile(char* matricesOutputFileName) {
1972 if (*matricesOutputFileName) {
1973 hitlist.PrintMatrices(q, matricesOutputFileName, par.filter_matrices,
1974 par.max_number_matrices, S);
1975 }
1976 }
1977
writeMatricesFile(HHblits & hhblits,std::stringstream & out)1978 void HHblits::writeMatricesFile(HHblits& hhblits, std::stringstream& out) {
1979 hhblits.hitlist.PrintMatrices(hhblits.q, out, hhblits.par.filter_matrices,
1980 hhblits.par.max_number_matrices,
1981 hhblits.S);
1982 }
1983
premerge(Hash<Hit> * previous_hits,Hash<Hit> * premerged_hits,int & seqs_found,int & cluster_found,int min_col_realign)1984 void HHblits::premerge(Hash<Hit>* previous_hits, Hash<Hit>* premerged_hits,
1985 int& seqs_found, int& cluster_found, int min_col_realign) {
1986 int Lmax = 0; // length of longest HMM to be realigned
1987 long int Lmaxmem = ((par.maxmem - 0.5) * 1024 * 1024 * 1024)
1988 / (2 * sizeof(double) + 8) / q->L / par.threads;
1989
1990 // Initialize a Null-value as a return value if not items are available anymore
1991 hitlist.Reset();
1992 while (!hitlist.End()) {
1993 Hit hit_cur = hitlist.ReadNext();
1994 if (hit_cur.L > Lmax) {
1995 Lmax = hit_cur.L;
1996 }
1997 if (hit_cur.L > Lmaxmem) {
1998 continue;
1999 }
2000 }
2001 int t_maxres = Lmax + 2;
2002 for (int i = 0; i < par.threads; i++) {
2003 posteriorMatrices[i]->allocateMatrix(q->L, t_maxres);
2004 }
2005 PosteriorDecoderRunner runner(posteriorMatrices, viterbiMatrices, par.threads, par.ssw, S73, S33, S37);
2006
2007 int prermergedHits = 0;
2008 std::vector<Hit *> tmpHits;
2009 hitlist.Reset();
2010
2011 while (!hitlist.End() && prermergedHits<par.premerge)
2012 {
2013 HitList tmpHitList;
2014 Hit hit_cur = hitlist.ReadNext();
2015 if (hit_cur.L > Lmaxmem) {
2016 continue;
2017 }
2018
2019 tmpHits.clear();
2020 tmpHits.push_back(&hit_cur);
2021 if (prermergedHits>=imax(par.B,par.Z)) break;
2022 if (prermergedHits>=imax(par.b,par.z) && hit_cur.Probab < par.p) break;
2023 if (prermergedHits>=imax(par.b,par.z) && hit_cur.Eval > par.E) continue;
2024 prermergedHits++;
2025
2026 if (hit_cur.Eval > par.e) continue; // Don't align hits with an E-value below the inclusion threshold
2027 // Replace original hit in hitlist with realigned hit
2028 //hitlist.ReadCurrent().Delete();
2029 runner.executeComputation(*q, tmpHits, par, par.qsc_db, pb, S, Sim, R);
2030 hitlist.Delete();
2031 hitlist.Insert(*tmpHits[0]);
2032 tmpHitList.Push(*tmpHits[0]);
2033 mergeHitsToQuery(tmpHitList, previous_hits, premerged_hits, seqs_found, cluster_found, min_col_realign);
2034 std::stringstream ss_tmp;
2035 ss_tmp << hit_cur.file << "__" << hit_cur.irep;
2036 premerged_hits->Add((char*)ss_tmp.str().c_str());
2037
2038 // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a]
2039 Qali->FrequenciesAndTransitions(q, par.wg, par.mark, par.cons,par.showcons, pb, Sim, NULL,true);
2040
2041 if (par.notags) q->NeutralizeTags(pb);
2042
2043 if (par.nocontxt) {
2044 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
2045 q->PreparePseudocounts(R);
2046 // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
2047 q->AddAminoAcidPseudocounts(par.pc_prefilter_nocontext_mode,
2048 par.pc_prefilter_nocontext_a,
2049 par.pc_prefilter_nocontext_b,
2050 par.pc_prefilter_nocontext_c);
2051 } else {
2052 // Add context specific pseudocounts (now always used, because clusterfile is necessary)
2053 q->AddContextSpecificPseudocounts(pc_prefilter_context_engine,
2054 pc_prefilter_context_mode);
2055 }
2056 q->CalculateAminoAcidBackground(pb);
2057
2058 // Transform transition freqs to lin space if not already done
2059 q->AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg,
2060 par.gaph, par.gapi, par.gapb, par.gapb);
2061 q->Log2LinTransitionProbs(1.0); // transform transition freqs to lin space if not already done
2062
2063 }
2064 HH_LOG(INFO) << "Premerge done\n";
2065
2066 }
2067
2068