1 /*
2  * HHblits.cpp
3  *
4  *  Created on: Apr 1, 2014
5  *      Author: meiermark
6  */
7 
8 #include "hhblits.h"
9 #include "hhsuite_config.h"
10 
HHblits(Parameters & par,std::vector<HHblitsDatabase * > & databases)11 HHblits::HHblits(Parameters& par, std::vector<HHblitsDatabase*>& databases) : par(par) {
12     dbs = databases;
13 
14     context_lib = NULL;
15     crf = NULL;
16     pc_hhm_context_engine = NULL;
17     pc_hhm_context_mode = NULL;
18     pc_prefilter_context_engine = NULL;
19     pc_prefilter_context_mode = NULL;
20 
21     Qali = NULL;
22     Qali_allseqs = NULL;
23 
24     q = NULL;
25     q_tmp = NULL;
26 
27     // Set (global variable) substitution matrix and derived matrices
28     SetSubstitutionMatrix(par.matrix, pb, P, R, S, Sim);
29 
30     // Set secondary structure substitution matrix
31     if (par.ssm)
32         SetSecStrucSubstitutionMatrix(par.ssa, S73, S37, S33);
33 
34     // Prepare pseudocounts
35     if (!par.nocontxt) {
36         InitializePseudocountsEngine(par, context_lib, crf, pc_hhm_context_engine, pc_hhm_context_mode, pc_prefilter_context_engine, pc_prefilter_context_mode);
37     }
38 
39     // Prepare multi-threading - reserve memory for threads, intialize, etc.
40     viterbiMatrices = new ViterbiMatrix*[par.threads];
41     posteriorMatrices = new PosteriorMatrix*[par.threads];
42     for (int bin = 0; bin < par.threads; bin++) {
43         viterbiMatrices[bin] = new ViterbiMatrix();
44         posteriorMatrices[bin] = new PosteriorMatrix();
45     }
46 }
47 
~HHblits()48 HHblits::~HHblits() {
49     Reset();
50 
51     for (int bin = 0; bin < par.threads; bin++) {
52         delete viterbiMatrices[bin];
53         delete posteriorMatrices[bin];
54     }
55     delete[] viterbiMatrices;
56     delete[] posteriorMatrices;
57 
58     DeletePseudocountsEngine(context_lib, crf, pc_hhm_context_engine, pc_hhm_context_mode, pc_prefilter_context_engine, pc_prefilter_context_mode);
59 }
60 
prepareDatabases(Parameters & par,std::vector<HHblitsDatabase * > & databases)61 void HHblits::prepareDatabases(Parameters& par,
62                                std::vector<HHblitsDatabase*>& databases) {
63     for (size_t i = 0; i < par.db_bases.size(); i++) {
64         HHblitsDatabase* db = new HHblitsDatabase(par.db_bases[i].c_str());
65         databases.push_back(db);
66     }
67 
68     par.dbsize = 0;
69     for (size_t i = 0; i < databases.size(); i++) {
70         par.dbsize += databases[i]->cs219_database->db_index->n_entries;
71     }
72 
73     if (par.prefilter) {
74         for (size_t i = 0; i < databases.size(); i++) {
75             databases[i]->initPrefilter(par.cs_library);
76         }
77     }
78 }
79 
ProcessAllArguments(Parameters & par)80 void HHblits::ProcessAllArguments(Parameters& par) {
81     const int argc = par.argc;
82     const char** argv = par.argv;
83 
84     par.Ndiff = 1000;
85     par.prefilter = true;
86 
87     par.early_stopping_filter = true;
88     par.filter_thresh = 0.01;
89 
90     ProcessArguments(par);
91 
92     // Check needed files
93     if (!*par.infile || !strcmp(par.infile, "")) {
94         help(par);
95         HH_LOG(ERROR) << "Input file is missing! (see -i)" << std::endl;
96         exit(4);
97     }
98     if (par.db_bases.size() == 0) {
99         help(par);
100         HH_LOG(ERROR) << "Database is missing! (see -d)" << std::endl;
101         exit(4);
102     }
103 
104     if (par.loc == 0 && par.num_rounds >= 2) {
105         HH_LOG(WARNING)
106             << "In " << __FILE__ << ":" << __LINE__ << ": " << __func__ << ":" << "\n"
107             << "\tusing -global alignment for iterative searches is deprecated "
108                "since non-homologous sequence segments can easily enter the "
109                "MSA and corrupt it."
110             << std::endl;
111     }
112 
113     if (par.num_rounds < 1)
114         par.num_rounds = 1;
115     else if (par.num_rounds > 8) {
116         HH_LOG(WARNING) << "Number of iterations (" << par.num_rounds << ") too large => Set to 8 iterations\n";
117         par.num_rounds = 8;
118     }
119 
120     // Check option compatibilities
121     if (par.nseqdis > MAXSEQDIS - 3 - par.showcons)
122         par.nseqdis = MAXSEQDIS - 3 - par.showcons;  //3 reserved for secondary structure
123     if (par.aliwidth < 20)
124         par.aliwidth = 20;
125     if (par.pc_hhm_context_engine.pca < 0.001)
126         par.pc_hhm_context_engine.pca = 0.001;  // to avoid log(0)
127     if (par.pc_prefilter_context_engine.pca < 0.001)
128         par.pc_prefilter_context_engine.pca = 0.001;  // to avoid log(0)
129     if (par.b > par.B)
130         par.B = par.b;
131     if (par.z > par.Z)
132         par.Z = par.z;
133     if (par.maxmem < 1.0) {
134         HH_LOG(WARNING) << "Setting -maxmem to its minimum allowed value of 1.0" << std::endl;
135         par.maxmem = 1.0;
136     }
137     if (par.mact >= 1.0)
138         par.mact = 0.999;
139     else if (par.mact < 0)
140         par.mact = 0.0;
141     if (par.altali < 1)
142         par.altali = 1;
143 }
144 
Reset()145 void HHblits::Reset() {
146     if (q) {
147         delete q;
148         q = NULL;
149     }
150 
151     if (q_tmp) {
152         delete q_tmp;
153         q_tmp = NULL;
154     }
155 
156     if (Qali) {
157         delete Qali;
158         Qali = NULL;
159     }
160 
161     if (Qali_allseqs) {
162         delete Qali_allseqs;
163         Qali_allseqs = NULL;
164     }
165 
166     hitlist.Reset();
167     while (!hitlist.End())
168         hitlist.Delete().Delete();
169     hitlist.Reset();
170 
171     std::map<int, Alignment*>::iterator it;
172     for (it = alis.begin(); it != alis.end(); it++) {
173         delete (*it).second;
174     }
175     alis.clear();
176 
177     // Prepare multi-threading - reserve memory for threads, intialize, etc.
178     for (int bin = 0; bin < par.threads; bin++) {
179         viterbiMatrices[bin]->DeleteBacktraceMatrix();
180         posteriorMatrices[bin]->DeleteProbabilityMatrix();
181     }
182 }
183 
184 /////////////////////////////////////////////////////////////////////////////////////
185 // Help functions
186 /////////////////////////////////////////////////////////////////////////////////////
help(Parameters & par,char all)187 void HHblits::help(Parameters& par, char all) {
188     printf("HHblits %i.%i.%i:\nHMM-HMM-based lightning-fast iterative sequence search\n", HHSUITE_VERSION_MAJOR, HHSUITE_VERSION_MINOR, HHSUITE_VERSION_PATCH);
189     printf("HHblits is a sensitive, general-purpose, iterative sequence search tool that represents\n");
190     printf("both query and database sequences by HMMs. You can search HHblits databases starting\n");
191     printf("with a single query sequence, a multiple sequence alignment (MSA), or an HMM. HHblits\n");
192     printf("prints out a ranked list of database HMMs/MSAs and can also generate an MSA by merging\n");
193     printf("the significant database HMMs/MSAs onto the query MSA.\n");
194     printf("\n");
195     printf("%s", REFERENCE);
196     printf("%s", COPYRIGHT);
197     printf("\n");
198     printf("Usage: hhblits -i query [options] \n");
199     printf(" -i <file>      input/query: single sequence or multiple sequence alignment (MSA)\n");
200     printf("                in a3m, a2m, or FASTA format, or HMM in hhm format\n");
201 
202     if (all) {
203         printf("\n");
204         printf("<file> may be 'stdin' or 'stdout' throughout.\n");
205     }
206     printf("\n");
207 
208     printf("Options:                                                                        \n");
209     printf(" -d <name>      database name (e.g. uniprot20_29Feb2012)                        \n");
210     printf("                Multiple databases may be specified with '-d <db1> -d <db2> ...'\n");
211     printf(" -n     [1,8]   number of iterations (default=%i)                               \n", par.num_rounds);
212     printf(" -e     [0,1]   E-value cutoff for inclusion in result alignment (def=%G)       \n", par.e);
213     printf("\n");
214 
215     printf("Input alignment format:                                                       \n");
216     printf(" -M a2m         use A2M/A3M (default): upper case = Match; lower case = Insert;\n");
217     printf("               ' -' = Delete; '.' = gaps aligned to inserts (may be omitted)   \n");
218     printf(" -M first       use FASTA: columns with residue in 1st sequence are match states\n");
219     printf(" -M [0,100]     use FASTA: columns with fewer than X%% gaps are match states   \n");
220     printf(" -tags/-notags  do NOT / do neutralize His-, C-myc-, FLAG-tags, and trypsin \n");
221     printf("                recognition sequence to background distribution (def=-notags)  \n");
222     printf("\n");
223 
224     printf("Output options: \n");
225     printf(" -o <file>      write results in standard format to file (default=<infile.hhr>)\n");
226     printf(" -oa3m <file>   write result MSA with significant matches in a3m format\n");
227     if (!all) {
228         printf("                Analogous for -opsi and -ohhm\n");
229     }
230     if (all) {
231         printf(" -opsi <file>   write result MSA of significant matches in PSI-BLAST format\n");
232         printf(" -ohhm <file>   write HHM file for result MSA of significant matches\n");
233     }
234     printf(" -oalis <name>  write MSAs in A3M format after each iteration\n");
235     printf(" -blasttab <name> write result in tabular BLAST format (compatible to -m 8 or -outfmt 6 output)\n");
236     printf("                  1      2      3           4         5        6      8    9      10   11   12\n");
237     printf("                  'query target #match/tLen #mismatch #gapOpen qstart qend tstart tend eval score'\n");
238     printf(" -add_cons      generate consensus sequence as master sequence of query MSA (default=don't)\n");
239     printf(" -hide_cons     don't show consensus sequence in alignments (default=show)     \n");
240     printf(" -hide_pred     don't show predicted 2ndary structure in alignments (default=show)\n");
241     printf(" -hide_dssp     don't show DSSP 2ndary structure in alignments (default=show)  \n");
242     printf(" -show_ssconf   show confidences for predicted 2ndary structure in alignments\n");
243     if (all) {
244         printf(" -Ofas <file>   write pairwise alignments in FASTA xor A2M (-Oa2m) xor A3M (-Oa3m) format   \n");
245         printf(" -seq <int>     max. number of query/template sequences displayed (default=%i)  \n", par.nseqdis);
246         printf(" -aliw <int>    number of columns per line in alignment list (default=%i)       \n", par.aliwidth);
247         printf(" -p [0,100]     minimum probability in summary and alignment list (default=%G)  \n", par.p);
248         printf(" -E [0,inf[     maximum E-value in summary and alignment list (default=%G)      \n", par.E);
249         printf(" -Z <int>       maximum number of lines in summary hit list (default=%i)        \n", par.Z);
250         printf(" -z <int>       minimum number of lines in summary hit list (default=%i)        \n", par.z);
251         printf(" -B <int>       maximum number of alignments in alignment list (default=%i)     \n", par.B);
252         printf(" -b <int>       minimum number of alignments in alignment list (default=%i)     \n", par.b);
253         printf("\n");
254     }
255 
256 
257     if(all) {
258         printf("Prefilter options                                                               \n");
259         printf(" -noprefilt                disable all filter steps                                        \n");
260         printf(" -noaddfilter              disable all filter steps (except for fast prefiltering)         \n");
261         printf(" -maxfilt                  max number of hits allowed to pass 2nd prefilter (default=%i)   \n", par.maxnumdb);
262         printf(" -min_prefilter_hits       min number of hits to pass prefilter (default=%i)               \n", par.min_prefilter_hits);
263         printf(" -prepre_smax_thresh       min score threshold of ungapped prefilter (default=%i)               \n", par.preprefilter_smax_thresh);
264         printf(" -pre_evalue_thresh        max E-value threshold of Smith-Waterman prefilter score (default=%.1f)\n", par.prefilter_evalue_thresh);
265         printf(" -pre_bitfactor            prefilter scores are in units of 1 bit / pre_bitfactor (default=%i)\n", par.prefilter_bit_factor);
266         printf(" -pre_gap_open             gap open penalty in prefilter Smith-Waterman alignment (default=%i)\n", par.prefilter_gap_open);
267         printf(" -pre_gap_extend           gap extend penalty in prefilter Smith-Waterman alignment (default=%i)\n", par.prefilter_gap_extend);
268         printf(" -pre_score_offset         offset on sequence profile scores in prefilter S-W alignment (default=%i)\n", par.prefilter_score_offset);
269 
270         printf("\n");
271     }
272 
273     printf("Filter options applied to query MSA, database MSAs, and result MSA              \n");
274     printf(" -all           show all sequences in result MSA; do not filter result MSA      \n");
275     printf(" -interim_filter NONE|FULL  \n");
276     printf("                filter sequences of query MSA during merging to avoid early stop (default: FULL)\n");
277     printf("                  NONE: disables the intermediate filter \n");
278     printf("                  FULL: if an early stop occurs compare filter seqs in an all vs. all comparison\n");
279     printf(" -id   [0,100]  maximum pairwise sequence identity (def=%i)\n", par.max_seqid);
280     printf(" -diff [0,inf[  filter MSAs by selecting most diverse set of sequences, keeping \n");
281     printf("                at least this many seqs in each MSA block of length 50 \n");
282     printf("                Zero and non-numerical values turn off the filtering. (def=%i) \n", par.Ndiff);
283     printf(" -cov  [0,100]  minimum coverage with master sequence (%%) (def=%i)             \n", par.coverage);
284     printf(" -qid  [0,100]  minimum sequence identity with master sequence (%%) (def=%i)    \n", par.qid);
285     printf(" -qsc  [0,100]  minimum score per column with master sequence (default=%.1f)    \n", par.qsc);
286     printf(" -neff [1,inf]  target diversity of multiple sequence alignment (default=off)   \n");
287     printf(" -mark          do not filter out sequences marked by \">@\"in their name line  \n");
288     printf("\n");
289 
290     printf("HMM-HMM alignment options:                                                       \n");
291     printf(" -norealign           do NOT realign displayed hits with MAC algorithm (def=realign)   \n");
292     printf(" -realign_old_hits    realign hits from previous iterations                          \n");
293     printf(" -mact [0,1[          posterior prob threshold for MAC realignment controlling greedi- \n");
294     printf("                      ness at alignment ends: 0:global >0.1:local (default=%.2f)       \n", par.mact);
295     printf(" -glob/-loc           use global/local alignment mode for searching/ranking (def=local)\n");
296     if (all) {
297         printf(" -realign             realign displayed hits with max. accuracy (MAC) algorithm \n");
298         printf(" -realign_max <int>   realign max. <int> hits (default=%i)                        \n", par.realign_max);
299         printf(" -ovlp <int>          banded alignment: forbid <ovlp> largest diagonals |i-j| of DP matrix (def=%i)\n", par.min_overlap);
300         printf(" -alt <int>           show up to this many alternative alignments with raw score > smin(def=%i)  \n", par.altali);
301         printf(" -premerge <int> merge <int> hits to query MSA before aligning remaining hits (def=%i)\n",par.premerge);
302         printf(" -smin <float>        minimum raw score for alternative alignments (def=%.1f)  \n", par.smin);
303         printf(" -shift [-1,1]        profile-profile score offset (def=%-.2f)                         \n", par.shift);
304         printf(" -corr [0,1]          weight of term for pair correlations (def=%.2f)                \n", par.corr);
305         printf(" -sc   <int>          amino acid score         (tja: template HMM at column j) (def=%i)\n", par.columnscore);
306         printf("              0       = log2 Sum(tja*qia/pa)   (pa: aa background frequencies)    \n");
307         printf("              1       = log2 Sum(tja*qia/pqa)  (pqa = 1/2*(pa+ta) )               \n");
308         printf("              2       = log2 Sum(tja*qia/ta)   (ta: av. aa freqs in template)     \n");
309         printf("              3       = log2 Sum(tja*qia/qa)   (qa: av. aa freqs in query)        \n");
310         printf("              5       local amino acid composition correction                     \n");
311         printf(" -ssm {0,..,4}        0:   no ss scoring                                             \n");
312         printf("                      1,2: ss scoring after or during alignment  [default=%1i]         \n", par.ssm);
313         printf("                      3,4: ss scoring after or during alignment, predicted vs. predicted\n");
314         printf(" -ssw [0,1]           weight of ss score  (def=%-.2f)                                  \n", par.ssw);
315         printf(" -ssa [0,1]           ss confusion matrix = (1-ssa)*I + ssa*psipred-confusion-matrix [def=%-.2f)\n", par.ssa);
316         printf(" -wg                  use global sequence weighting for realignment!                   \n");
317         printf("\n");
318 
319         printf("Gap cost options:                                                                \n");
320         printf(" -gapb [0,inf[  Transition pseudocount admixture (def=%-.2f)                     \n", par.gapb);
321         printf(" -gapd [0,inf[  Transition pseudocount admixture for open gap (default=%-.2f)    \n", par.gapd);
322         printf(" -gape [0,1.5]  Transition pseudocount admixture for extend gap (def=%-.2f)      \n", par.gape);
323         printf(" -gapf ]0,inf]  factor to increase/reduce gap open penalty for deletes (def=%-.2f) \n", par.gapf);
324         printf(" -gapg ]0,inf]  factor to increase/reduce gap open penalty for inserts (def=%-.2f) \n", par.gapg);
325         printf(" -gaph ]0,inf]  factor to increase/reduce gap extend penalty for deletes(def=%-.2f)\n", par.gaph);
326         printf(" -gapi ]0,inf]  factor to increase/reduce gap extend penalty for inserts(def=%-.2f)\n", par.gapi);
327         printf(" -egq  [0,inf[  penalty (bits) for end gaps aligned to query residues (def=%-.2f) \n",  par.egq);
328         printf(" -egt  [0,inf[  penalty (bits) for end gaps aligned to template residues (def=%-.2f)\n", par.egt);
329         printf("\n");
330 
331         printf("Pseudocount (pc) options:                                                        \n");
332         printf(" Context specific hhm pseudocounts:\n");
333         printf("  -pc_hhm_contxt_mode {0,..,3}   position dependence of pc admixture 'tau' (pc mode, default=%-i) \n", par.pc_hhm_context_engine.admix);
334         printf("               0: no pseudo counts:    tau = 0                                  \n");
335         printf("               1: constant             tau = a                                  \n");
336         printf("               2: diversity-dependent: tau = a/(1+((Neff[i]-1)/b)^c)            \n");
337         printf("               3: CSBlast admixture:   tau = a(1+b)/(Neff[i]+b)                 \n");
338         printf("               (Neff[i]: number of effective seqs in local MSA around column i) \n");
339         printf("  -pc_hhm_contxt_a  [0,1]        overall pseudocount admixture (def=%-.1f)                        \n", par.pc_hhm_context_engine.pca);
340         printf("  -pc_hhm_contxt_b  [1,inf[      Neff threshold value for mode 2 (def=%-.1f)                      \n", par.pc_hhm_context_engine.pcb);
341         printf("  -pc_hhm_contxt_c  [0,3]        extinction exponent c for mode 2 (def=%-.1f)                     \n", par.pc_hhm_context_engine.pcc);
342         printf("\n");
343 
344         printf(" Context independent hhm pseudocounts (used for templates; used for query if contxt file is not available):\n");
345         printf("  -pc_hhm_nocontxt_mode {0,..,3}   position dependence of pc admixture 'tau' (pc mode, default=%-i) \n", par.pc_hhm_nocontext_mode);
346         printf("               0: no pseudo counts:    tau = 0                                  \n");
347         printf("               1: constant             tau = a                                  \n");
348         printf("               2: diversity-dependent: tau = a/(1+((Neff[i]-1)/b)^c)            \n");
349         printf("               (Neff[i]: number of effective seqs in local MSA around column i) \n");
350         printf("  -pc_hhm_nocontxt_a  [0,1]        overall pseudocount admixture (def=%-.1f)                        \n", par.pc_hhm_nocontext_a);
351         printf("  -pc_hhm_nocontxt_b  [1,inf[      Neff threshold value for mode 2 (def=%-.1f)                      \n", par.pc_hhm_nocontext_b);
352         printf("  -pc_hhm_nocontxt_c  [0,3]        extinction exponent c for mode 2 (def=%-.1f)                     \n", par.pc_hhm_nocontext_c);
353         printf("\n");
354 
355         printf(" Context specific prefilter pseudocounts:\n");
356         printf("  -pc_prefilter_contxt_mode {0,..,3}   position dependence of pc admixture 'tau' (pc mode, default=%-i) \n", par.pc_prefilter_context_engine.admix);
357         printf("               0: no pseudo counts:    tau = 0                                  \n");
358         printf("               1: constant             tau = a                                  \n");
359         printf("               2: diversity-dependent: tau = a/(1+((Neff[i]-1)/b)^c)            \n");
360         printf("               3: CSBlast admixture:   tau = a(1+b)/(Neff[i]+b)                 \n");
361         printf("               (Neff[i]: number of effective seqs in local MSA around column i) \n");
362         printf("  -pc_prefilter_contxt_a  [0,1]        overall pseudocount admixture (def=%-.1f)                        \n", par.pc_prefilter_context_engine.pca);
363         printf("  -pc_prefilter_contxt_b  [1,inf[      Neff threshold value for mode 2 (def=%-.1f)                      \n", par.pc_prefilter_context_engine.pcb);
364         printf("  -pc_prefilter_contxt_c  [0,3]        extinction exponent c for mode 2 (def=%-.1f)                     \n", par.pc_prefilter_context_engine.pcc);
365         printf("\n");
366 
367         printf(" Context independent prefilter pseudocounts (used if context file is not available):\n");
368         printf("  -pc_prefilter_nocontxt_mode {0,..,3}   position dependence of pc admixture 'tau' (pc mode, default=%-i) \n", par.pc_prefilter_nocontext_mode);
369         printf("               0: no pseudo counts:    tau = 0                                  \n");
370         printf("               1: constant             tau = a                                  \n");
371         printf("               2: diversity-dependent: tau = a/(1+((Neff[i]-1)/b)^c)            \n");
372         printf("               (Neff[i]: number of effective seqs in local MSA around column i) \n");
373         printf("  -pc_prefilter_nocontxt_a  [0,1]        overall pseudocount admixture (def=%-.1f)                        \n", par.pc_prefilter_nocontext_a);
374         printf("  -pc_prefilter_nocontxt_b  [1,inf[      Neff threshold value for mode 2 (def=%-.1f)                      \n", par.pc_prefilter_nocontext_b);
375         printf("  -pc_prefilter_nocontxt_c  [0,3]        extinction exponent c for mode 2 (def=%-.1f)                     \n", par.pc_prefilter_nocontext_c);
376         printf("\n");
377 
378         printf(" Context-specific pseudo-counts:                                                  \n");
379         printf("  -nocontxt      use substitution-matrix instead of context-specific pseudocounts \n");
380         printf("  -contxt <file> context file for computing context-specific pseudocounts (default=%s)\n", par.clusterfile.c_str());
381         printf("  -csw  [0,inf]  weight of central position in cs pseudocount mode (def=%.1f)\n", par.csw);
382         printf("  -csb  [0,1]    weight decay parameter for positions in cs pc mode (def=%.1f)\n", par.csb);
383         printf("\n");
384     }
385 
386     printf("Other options:                                                                   \n");
387     printf(" -v <int>       verbose mode: 0:no screen output  1:only warings  2: verbose (def=%i)\n", par.v);
388     printf(" -neffmax ]1,20] skip further search iterations when diversity Neff of query MSA \n");
389     printf("                becomes larger than neffmax (default=%.1f)\n", par.neffmax);
390     printf(" -cpu <int>     number of CPUs to use (for shared memory SMPs) (default=%i)      \n", par.threads);
391     if (all) {
392         printf(" -scores <file> write scores for all pairwise comparisons to file               \n");
393         printf(" -filter_matrices filter matrices for similarity to output at most 100 matrices\n");
394         printf(" -atab   <file> write all alignments in tabular layout to file                   \n");
395         printf(" -maxseq <int>  max number of input rows (def=%5i)\n", par.maxseq);
396         printf(" -maxres <int>  max number of HMM columns (def=%5i)\n", par.maxres);
397         printf(" -maxmem [1,inf[ limit memory for realignment (in GB) (def=%.1f)          \n", par.maxmem);
398     }
399     printf("\n");
400 
401     if (!all) {
402         printf("An extended list of options can be obtained by calling 'hhblits -h all'\n");
403         printf("\n");
404     }
405     printf("Examples:\n");
406     printf("hhblits -i query.fas -o query.hhr -d ./uniclust30\n");
407     printf("\n");
408     printf("hhblits -i query.fas -o query.hhr -oa3m query.a3m -n 1 -d ./uniclust30\n");
409     printf("\n");
410     printf("Download databases from <http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/>.\n");
411 }
412 
413 
ProcessArguments(Parameters & par)414 void HHblits::ProcessArguments(Parameters& par) {
415     const int argc = par.argc;
416     const char** argv = par.argv;
417 
418     //Processing command line input
419     for (int i = 1; i < argc; i++) {
420         HH_LOG(DEBUG1) << i << "  " << argv[i] << std::endl;
421         if (!strcmp(argv[i], "-i")) {
422             if (++i >= argc || argv[i][0] == '-') {
423                 help(par);
424                 HH_LOG(ERROR) << "No query file following -i" << std::endl;
425                 exit(4);
426             } else
427                 strcpy(par.infile, argv[i]);
428         } else if (!strcmp(argv[i], "-d")) {
429             if (++i >= argc || argv[i][0] == '-') {
430                 help(par);
431                 HH_LOG(ERROR) << "No database basename following -d" << std::endl;
432                 exit(4);
433             } else {
434                 std::string db(argv[i]);
435                 if (HHDatabase::checkDatabaseConflicts(argv[i])) {
436                     HH_LOG(ERROR) << "Ambiguous database basename. Choose either a A3M or CA3M database." << std::endl;
437                     exit(4);
438                 }
439                 par.db_bases.push_back(db);
440             }
441         } else if (!strcmp(argv[i], "-contxt")
442                    || !strcmp(argv[i], "-context_data")) {
443             if (++i >= argc || argv[i][0] == '-') {
444                 help(par);
445                 HH_LOG(ERROR) << "No lib following -contxt" << std::endl;
446                 exit(4);
447             } else
448                 par.clusterfile = argv[i];
449         } else if (!strcmp(argv[i], "-cslib") || !strcmp(argv[i], "-cs_lib")) {
450             if (++i >= argc || argv[i][0] == '-') {
451                 help(par);
452                 HH_LOG(ERROR) << "No lib following -cslib" << std::endl;
453                 exit(4);
454             } else
455                 par.cs_library = argv[i];
456         } else if (!strcmp(argv[i], "-o")) {
457             if (++i >= argc || argv[i][0] == '-') {
458                 help(par);
459                 HH_LOG(ERROR) << "No output file following -o" << std::endl;
460                 exit(4);
461             } else
462                 strcpy(par.outfile, argv[i]);
463         }
464             //no help required
465         else if (!strcmp(argv[i], "-omat")) {
466             if (++i >= argc || argv[i][0] == '-') {
467                 help(par);
468                 HH_LOG(ERROR) << "No output file following -omat" << std::endl;
469                 exit(4);
470             } else
471                 strcpy(par.matrices_output_file, argv[i]);
472         } else if (!strcmp(argv[i], "-oa3m")) {
473             if (++i >= argc || argv[i][0] == '-') {
474                 help(par);
475                 HH_LOG(ERROR) << "No output file following -oa3m" << std::endl;
476                 exit(4);
477             } else
478                 strcpy(par.alnfile, argv[i]);
479         } else if (!strcmp(argv[i], "-filter_matrices")) {
480             par.filter_matrices = true;
481         } else if (!strcmp(argv[i], "-ohhm")) {
482             if (++i >= argc || argv[i][0] == '-') {
483                 help(par);
484                 HH_LOG(ERROR) << "No output file following -ohhm" << std::endl;
485                 exit(4);
486             } else
487                 strcpy(par.hhmfile, argv[i]);
488         } else if (!strcmp(argv[i], "-opsi")) {
489             if (++i >= argc || argv[i][0] == '-') {
490                 help(par);
491                 HH_LOG(ERROR) << "No output file following -opsi" << std::endl;
492                 exit(4);
493             } else
494                 strcpy(par.psifile, argv[i]);
495         } else if (!strcmp(argv[i], "-oalis")) {
496             if (++i >= argc || argv[i][0] == '-') {
497                 help(par);
498                 HH_LOG(ERROR) << "No file basename following -oalis" << std::endl;
499                 exit(4);
500             } else
501                 strcpy(par.alisbasename, argv[i]);
502         }
503         else if (!strcmp(argv[i], "-Ofas")) {
504             par.append = 0;
505             par.outformat = 1;
506             if (++i >= argc || argv[i][0] == '-') {
507                 help(par);
508                 HH_LOG(ERROR) << "No output file following -o" << std::endl;
509                 exit(4);
510             } else
511                 strcpy(par.pairwisealisfile, argv[i]);
512         } else if (!strcmp(argv[i], "-Oa2m")) {
513             par.append = 0;
514             par.outformat = 2;
515             if (++i >= argc || argv[i][0] == '-') {
516                 help(par);
517                 HH_LOG(ERROR) << "No output file following -o" << std::endl;
518                 exit(4);
519             } else
520                 strcpy(par.pairwisealisfile, argv[i]);
521         } else if (!strcmp(argv[i], "-Oa3m")) {
522             par.append = 0;
523             par.outformat = 3;
524             if (++i >= argc || argv[i][0] == '-') {
525                 help(par);
526                 HH_LOG(ERROR) << "No output file following -o" << std::endl;
527                 exit(4);
528             } else
529                 strcpy(par.pairwisealisfile, argv[i]);
530         }
531         else if (!strcmp(argv[i], "-scores")) {
532             if (++i >= argc || argv[i][0] == '-') {
533                 help(par);
534                 HH_LOG(ERROR) << "No file following -scores" << std::endl;
535                 exit(4);
536             } else {
537                 strcpy(par.scorefile, argv[i]);
538             }
539         }
540         else if (!strcmp(argv[i], "-blasttab")) {
541             if (++i >= argc || argv[i][0] == '-') {
542                 help(par);
543                 HH_LOG(ERROR) << "No file following -blasttab" << std::endl;
544                 exit(4);
545             } else {
546                 strcpy(par.m8file, argv[i]);
547             }
548         }
549         else if (!strcmp(argv[i], "-atab")) {
550             if (++i >= argc || argv[i][0] == '-') {
551                 help(par);
552                 HH_LOG(ERROR) << "No file following -atab" << std::endl;
553                 exit(4);
554             } else {
555                 strcpy(par.alitabfile, argv[i]);
556             }
557         }
558         else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "-help")) {
559             help(par, 1);
560             exit(0);
561         } else if (!strcmp(argv[i], "-v") && (i < argc - 1)
562                    && argv[i + 1][0] != '-') {
563             int v = atoi(argv[++i]);
564             par.v = Log::from_int(v);
565             Log::reporting_level() = par.v;
566         } else if (!strcmp(argv[i], "-n") && (i < argc - 1))
567             par.num_rounds = atoi(argv[++i]);
568             //no help required
569         else if (!strncmp(argv[i], "-BLOSUM", 7)
570                  || !strncmp(argv[i], "-Blosum", 7)) {
571             if (!strcmp(argv[i] + 7, "30"))
572                 par.matrix = 30;
573             else if (!strcmp(argv[i] + 7, "40"))
574                 par.matrix = 40;
575             else if (!strcmp(argv[i] + 7, "50"))
576                 par.matrix = 50;
577             else if (!strcmp(argv[i] + 7, "62"))
578                 par.matrix = 62;
579             else if (!strcmp(argv[i] + 7, "65"))
580                 par.matrix = 65;
581             else if (!strcmp(argv[i] + 7, "80"))
582                 par.matrix = 80;
583             else
584             HH_LOG(WARNING) << "Ignoring unknown option " << argv[i] << std::endl;
585         } else if (!strcmp(argv[i], "-M") && (i < argc - 1)) {
586             if (!strcmp(argv[++i], "a2m") || !strcmp(argv[i], "a3m"))
587                 par.M = 1;
588             else if (!strcmp(argv[i], "first"))
589                 par.M = 3;
590             else if (argv[i][0] >= '0' && argv[i][0] <= '9') {
591                 par.Mgaps = atoi(argv[i]);
592                 par.M = 2;
593             } else
594             HH_LOG(WARNING) << "Ignoring unknown argument: -M " << argv[i] << std::endl;
595         }
596         else if (!strcmp(argv[i], "-p") && (i < argc - 1))
597             par.p = atof(argv[++i]);
598         else if (!strcmp(argv[i], "-E") && (i < argc - 1))
599             par.E = atof(argv[++i]);
600         else if (!strcmp(argv[i], "-b") && (i < argc - 1))
601             par.b = atoi(argv[++i]);
602         else if (!strcmp(argv[i], "-B") && (i < argc - 1))
603             par.B = atoi(argv[++i]);
604         else if (!strcmp(argv[i], "-z") && (i < argc - 1))
605             par.z = atoi(argv[++i]);
606         else if (!strcmp(argv[i], "-Z") && (i < argc - 1))
607             par.Z = atoi(argv[++i]);
608         else if (!strncmp(argv[i], "-hide_cons", 7))
609             par.showcons = 0;
610         else if (!strncmp(argv[i], "-hide_pred", 7))
611             par.showpred = 0;
612         else if (!strncmp(argv[i], "-hide_dssp", 7))
613             par.showdssp = 0;
614         else if (!strncmp(argv[i], "-show_ssconf", 7))
615             par.showconf = 1;
616         else if (!strncmp(argv[i], "-mark", 7))
617             par.mark = 1;
618         else if (!strncmp(argv[i], "-add_cons", 5))
619             par.cons = 1;
620         else if (!strcmp(argv[i], "-realign_max") && (i < argc - 1))
621             par.realign_max = atoi(argv[++i]);
622         else if (!strcmp(argv[i], "-e") && (i < argc - 1))
623             par.e = atof(argv[++i]);
624         else if (!strcmp(argv[i], "-seq") && (i < argc - 1))
625             par.nseqdis = atoi(argv[++i]);
626         else if (!strcmp(argv[i], "-aliw") && (i < argc - 1))
627             par.aliwidth = atoi(argv[++i]);
628         else if (!strcmp(argv[i], "-id") && (i < argc - 1))
629             par.max_seqid = atoi(argv[++i]);
630         else if (!strcmp(argv[i], "-qid") && (i < argc - 1))
631             par.qid = atoi(argv[++i]);
632         else if (!strcmp(argv[i], "-qsc") && (i < argc - 1))
633             par.qsc = atof(argv[++i]);
634         else if (!strcmp(argv[i], "-cov") && (i < argc - 1))
635             par.coverage = atoi(argv[++i]);
636         else if (!strcmp(argv[i], "-diff") && (i < argc - 1))
637             //Will be zero if "Inf" or other non-numeric values are passed to atoi
638             par.Ndiff = atoi(argv[++i]);
639         else if (!strcmp(argv[i], "-all") || !strcmp(argv[i], "-nodiff")) {
640             par.allseqs = true;
641         } else if (!strcmp(argv[i], "-neffmax") && (i < argc - 1))
642             par.neffmax = atof(argv[++i]);
643         else if ((!strcmp(argv[i], "-neff") || !strcmp(argv[i], "-Neff"))
644                  && (i < argc - 1))
645             par.Neff = atof(argv[++i]);
646             //pc hhm context variables
647         else if (!strcmp(argv[i], "-pc_hhm_contxt_mode") && (i < argc - 1))
648             par.pc_hhm_context_engine.admix = (Pseudocounts::Admix) atoi(argv[++i]);
649         else if (!strcmp(argv[i], "-pc_hhm_contxt_a") && (i < argc - 1))
650             par.pc_hhm_context_engine.pca = atof(argv[++i]);
651         else if (!strcmp(argv[i], "-pc_hhm_contxt_b") && (i < argc - 1))
652             par.pc_hhm_context_engine.pcb = atof(argv[++i]);
653         else if (!strcmp(argv[i], "-pc_hhm_contxt_c") && (i < argc - 1))
654             par.pc_hhm_context_engine.pcc = atof(argv[++i]);
655             //pc prefilter context variables
656         else if (!strcmp(argv[i], "-pc_prefilter_contxt_mode") && (i < argc - 1))
657             par.pc_prefilter_context_engine.admix = (Pseudocounts::Admix) atoi(
658                     argv[++i]);
659         else if (!strcmp(argv[i], "-pc_prefilter_contxt_a") && (i < argc - 1))
660             par.pc_prefilter_context_engine.pca = atof(argv[++i]);
661         else if (!strcmp(argv[i], "-pc_prefilter_contxt_b") && (i < argc - 1))
662             par.pc_prefilter_context_engine.pcb = atof(argv[++i]);
663         else if (!strcmp(argv[i], "-pc_prefilter_contxt_c") && (i < argc - 1))
664             par.pc_prefilter_context_engine.pcc = atof(argv[++i]);
665             //pc hhm nocontext variables
666         else if (!strcmp(argv[i], "-pc_hhm_nocontxt_mode") && (i < argc - 1))
667             par.pc_hhm_nocontext_mode = atoi(argv[++i]);
668         else if (!strcmp(argv[i], "-pc_hhm_nocontxt_a") && (i < argc - 1))
669             par.pc_hhm_nocontext_a = atof(argv[++i]);
670         else if (!strcmp(argv[i], "-pc_hhm_nocontxt_b") && (i < argc - 1))
671             par.pc_hhm_nocontext_b = atof(argv[++i]);
672         else if (!strcmp(argv[i], "-pc_hhm_nocontxt_c") && (i < argc - 1))
673             par.pc_hhm_nocontext_c = atof(argv[++i]);
674             //pc prefilter nocontext variables
675         else if (!strcmp(argv[i], "-pc_prefilter_nocontxt_mode") && (i < argc - 1))
676             par.pc_prefilter_nocontext_mode = atoi(argv[++i]);
677         else if (!strcmp(argv[i], "-pc_prefilter_nocontxt_a") && (i < argc - 1))
678             par.pc_hhm_nocontext_a = atof(argv[++i]);
679         else if (!strcmp(argv[i], "-pc_prefilter_nocontxt_b") && (i < argc - 1))
680             par.pc_hhm_nocontext_b = atof(argv[++i]);
681         else if (!strcmp(argv[i], "-pc_prefilter_nocontxt_c") && (i < argc - 1))
682             par.pc_hhm_nocontext_c = atof(argv[++i]);
683         else if (!strcmp(argv[i], "-gapb") && (i < argc - 1)) {
684             par.gapb = atof(argv[++i]);
685             if (par.gapb <= 0.01)
686                 par.gapb = 0.01;
687         } else if (!strcmp(argv[i], "-gapd") && (i < argc - 1))
688             par.gapd = atof(argv[++i]);
689         else if (!strcmp(argv[i], "-gape") && (i < argc - 1))
690             par.gape = atof(argv[++i]);
691         else if (!strcmp(argv[i], "-gapf") && (i < argc - 1))
692             par.gapf = atof(argv[++i]);
693         else if (!strcmp(argv[i], "-gapg") && (i < argc - 1))
694             par.gapg = atof(argv[++i]);
695         else if (!strcmp(argv[i], "-gaph") && (i < argc - 1))
696             par.gaph = atof(argv[++i]);
697         else if (!strcmp(argv[i], "-gapi") && (i < argc - 1))
698             par.gapi = atof(argv[++i]);
699         else if (!strcmp(argv[i], "-egq") && (i < argc - 1))
700             par.egq = atof(argv[++i]);
701         else if (!strcmp(argv[i], "-egt") && (i < argc - 1))
702             par.egt = atof(argv[++i]);
703             //no help required
704         else if (!strcmp(argv[i], "-alphaa") && (i < argc - 1))
705             par.alphaa = atof(argv[++i]);
706             //no help required
707         else if (!strcmp(argv[i], "-alphab") && (i < argc - 1))
708             par.alphab = atof(argv[++i]);
709             //no help required
710         else if (!strcmp(argv[i], "-alphac") && (i < argc - 1))
711             par.alphac = atof(argv[++i]);
712         else if (!strcmp(argv[i], "-noprefilt")) {
713             par.prefilter = false;
714             par.already_seen_filter = false;
715         } else if (!strcmp(argv[i], "-noaddfilter")) {
716             par.already_seen_filter = false;
717         } else if (!strcmp(argv[i], "-min_prefilter_hits") && (i < argc - 1))
718             par.min_prefilter_hits = atoi(argv[++i]);
719         else if (!strcmp(argv[i], "-prepre_smax_thresh") && (i < argc - 1))
720             par.preprefilter_smax_thresh = atoi(argv[++i]);
721         else if (!strcmp(argv[i], "-pre_evalue_thresh") && (i < argc - 1))
722             par.prefilter_evalue_thresh = atof(argv[++i]);
723         else if (!strcmp(argv[i], "-pre_bitfactor") && (i < argc - 1))
724             par.prefilter_bit_factor = atoi(argv[++i]);
725         else if (!strcmp(argv[i], "-pre_gap_open") && (i < argc - 1))
726             par.prefilter_gap_open = atoi(argv[++i]);
727         else if (!strcmp(argv[i], "-pre_gap_extend") && (i < argc - 1))
728             par.prefilter_gap_extend = atoi(argv[++i]);
729         else if (!strcmp(argv[i], "-pre_score_offset") && (i < argc - 1))
730             par.prefilter_score_offset = atoi(argv[++i]);
731         else if (!strcmp(argv[i], "-realign_old_hits"))
732             par.realign_old_hits = true;
733         else if (!strcmp(argv[i], "-realign"))
734             par.realign = 1;
735         else if (!strcmp(argv[i], "-norealign"))
736             par.realign = 0;
737         else if (!strcmp(argv[i], "-ssm") && (i < argc - 1))
738             par.ssm = atoi(argv[++i]);
739         else if (!strcmp(argv[i], "-ssw") && (i < argc - 1))
740             par.ssw = atof(argv[++i]);
741         else if (!strcmp(argv[i], "-ssa") && (i < argc - 1))
742             par.ssa = atof(argv[++i]);
743         else if (!strcmp(argv[i], "-wg"))
744             par.wg = 1;
745         else if (!strcmp(argv[i], "-maxseq") && (i < argc - 1))
746             par.maxseq = atoi(argv[++i]);
747         else if (!strcmp(argv[i], "-maxres") && (i < argc - 1)) {
748             par.maxres = atoi(argv[++i]);
749             par.maxcol = 2 * par.maxres;
750         } else if (!strncmp(argv[i], "-glob", 5)) {
751             par.loc = 0;
752             if (par.mact > 0.35 && par.mact < 0.3502) {
753                 par.mact = 0;
754             }
755         } else if (!strncmp(argv[i], "-loc", 4))
756             par.loc = 1;
757         else if (!strncmp(argv[i], "-alt", 4) && (i < argc - 1))
758             par.altali = atoi(argv[++i]);
759         else if (!strncmp(argv[i], "-smin", 4) && (i < argc - 1))
760             par.smin = atof(argv[++i]);
761         else if (!strcmp(argv[i], "-shift") && (i < argc - 1))
762             par.shift = atof(argv[++i]);
763         else if ((!strcmp(argv[i], "-mact") || !strcmp(argv[i], "-mapt"))
764                  && (i < argc - 1))
765             par.mact = atof(argv[++i]);
766         else if (!strcmp(argv[i], "-sc") && (i < argc - 1))
767             par.columnscore = atoi(argv[++i]);
768             //no help required
769         else if (!strcmp(argv[i], "-scwin") && (i < argc - 1)) {
770             par.columnscore = 5;
771             par.half_window_size_local_aa_bg_freqs = std::max(1, atoi(argv[++i]));
772         } else if (!strncmp(argv[i], "-cpu", 4) && (i < argc - 1)) {
773             par.threads = atoi(argv[++i]);
774         } else if (!strcmp(argv[i], "-maxmem") && (i < argc - 1)) {
775             par.maxmem = atof(argv[++i]);
776         }
777         else if (!strncmp(argv[i],"-premerge",9) && (i<argc-1)) {
778             par.premerge=atoi(argv[++i]);
779         }
780         else if (!strcmp(argv[i], "-nocontxt"))
781             par.nocontxt = 1;
782         else if (!strcmp(argv[i], "-csb") && (i < argc - 1))
783             par.csb = atof(argv[++i]);
784         else if (!strcmp(argv[i], "-csw") && (i < argc - 1))
785             par.csw = atof(argv[++i]);
786         else if (!strcmp(argv[i], "-corr") && (i < argc - 1))
787             par.corr = atof(argv[++i]);
788         else if (!strcmp(argv[i], "-ovlp") && (i < argc - 1))
789             par.min_overlap = atoi(argv[++i]);
790         else if (!strcmp(argv[i], "-tags"))
791             par.notags = 0;
792         else if (!strcmp(argv[i], "-notags"))
793             par.notags = 1;
794         else if (!strcmp(argv[i], "-maxfilt") && (i < argc - 1))
795             par.maxnumdb = atoi(argv[++i]);
796         else if (!strcmp(argv[i], "-interim_filter")){
797             if (++i >= argc || argv[i][0] == '-') {
798                 help(par);
799                 HH_LOG(ERROR) << "No state following -interim_filter" << std::endl;
800                 exit(4);
801             } else {
802                 if(!strcmp(argv[i], "NONE")) {
803                     par.interim_filter = INTERIM_FILTER_NONE;
804                 } else if(!strcmp(argv[i], "FULL")) {
805                     par.interim_filter = INTERIM_FILTER_FULL;
806                 } else {
807                     help(par);
808                     HH_LOG(ERROR) << "No state out of NONE|FULL following -interim_filter" << std::endl;
809                     exit(4);
810                 }
811             }
812         }
813         else {
814             HH_LOG(WARNING) << "Ignoring unknown option " << argv[i] << std::endl;
815         }
816 
817         HH_LOG(DEBUG1) << i << "  " << argv[i] << std::endl;
818     }  // end of for-loop for command line input
819 }
820 
mergeHitsToQuery(HitList & hitlist,Hash<Hit> * previous_hits,Hash<Hit> * premerged_hits,int & seqs_found,int & cluster_found,int min_col_realign)821 void HHblits::mergeHitsToQuery(HitList &hitlist, Hash<Hit>* previous_hits,Hash<Hit>*  premerged_hits,
822                                int& seqs_found, int& cluster_found, int min_col_realign) {
823 
824     // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two)
825     const float COV_ABS = 25;     // min. number of aligned residues
826     int cov_tot = std::max(std::min((int) (COV_ABS / Qali->L * 100 + 0.5), 70), par.coverage);
827 
828     // For each template below threshold
829     hitlist.Reset();
830     while (!hitlist.End()) {
831         Hit hit_cur = hitlist.ReadNext();
832 
833         if (hit_cur.Eval > 100.0 * par.e)
834             break;  // E-value much too large
835         if (hit_cur.Eval > par.e)
836             continue;  // E-value too large
837         if (hit_cur.matched_cols < min_col_realign)
838             continue;  // leave out too short alignments
839 
840         // Already in alignment
841         std::stringstream ss_tmp;
842         ss_tmp << hit_cur.file << "__" << hit_cur.irep;
843         if (previous_hits->Contains((char*) ss_tmp.str().c_str()))
844             continue;
845 
846         // Add number of sequences in this cluster to total found
847         seqs_found += SequencesInCluster(hit_cur.name);  // read number after second '|'
848         cluster_found++;
849 
850         // Skip merging this hit if hit alignment was already merged during premerging
851         if (premerged_hits->Contains((char*)ss_tmp.str().c_str()))
852             continue;
853 
854         // Read a3m alignment of hit from <file>.a3m file
855         // Reading in next db MSA and merging it onto Qali
856         Alignment Tali(par.maxseq, par.maxres);
857         hit_cur.entry->getTemplateA3M(par, pb, S, Sim, Tali);
858 
859         if (par.allseqs)  // need to keep *all* sequences in Qali_allseqs? => merge before filtering
860             Qali_allseqs->MergeMasterSlave(hit_cur, Tali, hit_cur.name, par.maxcol);
861 
862         Tali.N_filtered = Tali.Filter(par.max_seqid_db, S, par.coverage_db,
863                                       par.qid_db, par.qsc_db, par.Ndiff_db);
864 
865         if(par.interim_filter == INTERIM_FILTER_FULL && Tali.N_filtered + Qali->N_in >= par.maxseq) {
866             Qali->N_filtered = Qali->Filter(par.max_seqid, S, cov_tot, par.qid, par.qsc, par.Ndiff);
867             Qali->Shrink();
868         }
869 
870         Qali->MergeMasterSlave(hit_cur, Tali, hit_cur.name, par.maxcol);
871 
872         if (Qali->N_in >= par.maxseq)
873             break;  // Maximum number of sequences reached
874     }
875 
876     // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i]
877     Qali->Compress("merged A3M file", par.cons, par.maxcol, 1, par.Mgaps);
878 
879     // Sort out the nseqdis most dissimilacd r sequences for display in the result alignments
880     Qali->FilterForDisplay(par.max_seqid, par.mark, S, par.coverage, par.qid, par.qsc, par.nseqdis);
881 
882 
883     HH_LOG(DEBUG) << "Filter new alignment with cov " << cov_tot
884                   << std::endl;
885 
886     Qali->N_filtered = Qali->Filter(par.max_seqid, S, cov_tot, par.qid, par.qsc,
887                                     par.Ndiff);
888 }
889 
add_hits_to_hitlist(std::vector<Hit> & hits,HitList & hitlist)890 void HHblits::add_hits_to_hitlist(std::vector<Hit>& hits, HitList& hitlist) {
891     for (std::vector<Hit>::size_type i = 0; i != hits.size(); i++) {
892         hitlist.Push(hits[i]);
893     }
894 
895     // Sort list according to sortscore
896     HH_LOG(DEBUG) << "Sorting hit list ...\n";
897     hitlist.SortList();
898 
899     // Use NN prediction of lamda and mu
900     hitlist.CalculatePvalues(q, par.loc, par.ssm, par.ssw);
901 
902     // Calculate E-values as combination of P-value for Viterbi HMM-HMM comparison and prefilter E-value: E = Ndb P (Epre/Ndb)^alpha
903     if (par.prefilter)
904         hitlist.CalculateHHblitsEvalues(q, par.dbsize, par.alphaa, par.alphab, par.alphac, par.prefilter_evalue_thresh);
905 }
906 
907 /////////////////////////////////////////////////////////////////////////////////////////////////////////////
908 // Variant of ViterbiSearch() function for rescoring previously found HMMs with Viterbi algorithm.
909 // Perform Viterbi search on each hit object in global hash previous_hits, but keep old alignment
910 /////////////////////////////////////////////////////////////////////////////////////////////////////////////
RescoreWithViterbiKeepAlignment(HMMSimd & q_vec,Hash<Hit> * previous_hits)911 void HHblits::RescoreWithViterbiKeepAlignment(HMMSimd& q_vec,
912                                               Hash<Hit>* previous_hits) {
913     // Initialize
914     std::vector<HHEntry*> hits_to_rescore;
915 
916     // Get dbfiles of previous hits
917     previous_hits->Reset();
918     while (!previous_hits->End()) {
919         Hit hit_cur = previous_hits->ReadNext();
920         if (hit_cur.irep == 1) {
921             hits_to_rescore.push_back(hit_cur.entry);
922         }
923     }
924 
925     //////////////////////////////////////////////////////////
926     // Start Viterbi search through db HMMs listed in dbfiles
927 //	DoViterbiSearch(hits_to_rescore, previous_hits, false);
928 
929     ViterbiRunner viterbirunner(viterbiMatrices, dbs, par.threads);
930     std::vector<Hit> hits_to_add = viterbirunner.alignment(par, &q_vec,
931                                                            hits_to_rescore,
932                                                            par.qsc_db, pb, S, Sim,
933                                                            R, par.ssm, S73, S33, S37);
934 
935     for (std::vector<Hit>::size_type i = 0; i != hits_to_add.size(); i++) {
936         std::stringstream ss_tmp;
937         ss_tmp << hits_to_add[i].file << "__" << hits_to_add[i].irep;
938         if (previous_hits->Contains((char*) ss_tmp.str().c_str())) {
939             Hit hit_cur = previous_hits->Remove((char*) ss_tmp.str().c_str());
940             previous_hits->Add((char*) ss_tmp.str().c_str(), hits_to_add[i]);
941             // Overwrite *hit[bin] with alignment, etc. of hit_cur
942             hit_cur.score = hits_to_add[i].score;
943             hit_cur.score_aass = hits_to_add[i].score_aass;
944             hit_cur.score_ss = hits_to_add[i].score_ss;
945             hit_cur.Pval = hits_to_add[i].Pval;
946             hit_cur.Pvalt = hits_to_add[i].Pvalt;
947             hit_cur.logPval = hits_to_add[i].logPval;
948             hit_cur.logPvalt = hits_to_add[i].logPvalt;
949             hit_cur.Eval = hits_to_add[i].Eval;
950             hit_cur.logEval = hits_to_add[i].logEval;
951             hit_cur.Probab = hits_to_add[i].Probab;
952             hitlist.Push(hit_cur);  // insert hit at beginning of list (last repeats first!)
953         } else {
954             hits_to_add[i].Delete();
955         }
956     }
957 
958     // Sort list according to sortscore
959     HH_LOG(DEBUG) << "Sorting hit list ..." << std::endl;
960 
961     hitlist.SortList();
962 
963     hitlist.CalculatePvalues(q, par.loc, par.ssm, par.ssw);  // Use NN prediction of lamda and mu
964 
965     if (par.prefilter)
966         hitlist.CalculateHHblitsEvalues(q, par.dbsize, par.alphaa, par.alphab,
967                                         par.alphac, par.prefilter_evalue_thresh);
968 }
969 
970 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
971 // Realign hits with MAC algorithm
972 /////////////////////////////////////////////////////////////////////////////////////////////////////////////
perform_realign(HMMSimd & q_vec,const char input_format,std::vector<HHEntry * > & hits_to_realign,int min_col_realign)973 void HHblits::perform_realign(HMMSimd& q_vec, const char input_format,
974                               std::vector<HHEntry*>& hits_to_realign, int min_col_realign) {
975 
976     // 19/02/2014: F/B-algos are calculated in log-space
977     //  q->Log2LinTransitionProbs(1.0); // transform transition freqs to lin space if not already done
978     int nhits = 0;
979 
980     // Longest allowable length of database HMM (backtrace: 5 chars, fwd: 1 double, bwd: 1 double
981     long int Lmaxmem = ((par.maxmem - 0.5) * 1024 * 1024 * 1024)
982                        / (2 * sizeof(double) + 8) / q->L / par.threads;
983     int Lmax = 0;      // length of longest HMM to be realigned
984 
985     /////////////////////////////////////////////////////////////////////////////////////////////////
986     // Categorize the hits: first irep, then dbfile
987     int n_realignments = 0;
988     std::map<short int, std::vector<Hit *> > alignments;
989     hitlist.Reset();
990 
991     std::vector<Hit *> hit_vector;
992 
993     while (!hitlist.End()) {
994         Hit hit_cur = hitlist.ReadNext();
995         if (n_realignments >= par.realign_max
996             && n_realignments >= imax(par.B, par.Z))
997             break;
998         if (hit_cur.Eval > par.e) {
999             if (n_realignments >= imax(par.B, par.Z))
1000                 continue;
1001             if (n_realignments >= imax(par.b, par.z) && hit_cur.Probab < par.p)
1002                 continue;
1003             if (n_realignments >= imax(par.b, par.z) && hit_cur.Eval > par.E)
1004                 continue;
1005         }
1006 
1007         if (hit_cur.L > Lmax) {
1008             Lmax = hit_cur.L;
1009         }
1010         if (hit_cur.L > Lmaxmem) {
1011             nhits++;
1012             continue;
1013         }
1014 
1015         hit_vector.push_back(hitlist.ReadCurrentAddress());
1016         n_realignments++;
1017         nhits++;
1018     }
1019 
1020     int t_maxres = Lmax + 2;
1021     for (int i = 0; i < par.threads; i++) {
1022         posteriorMatrices[i]->allocateMatrix(q->L, t_maxres);
1023     }
1024 
1025     // Initialize a Null-value as a return value if not items are available anymore
1026     PosteriorDecoderRunner runner(posteriorMatrices, viterbiMatrices, par.threads, par.ssw, S73, S33, S37);
1027 
1028     HH_LOG(INFO)
1029         << "Realigning " << nhits
1030         << " HMM-HMM alignments using Maximum Accuracy algorithm" << std::endl;
1031 
1032     runner.executeComputation(*q, hit_vector, par, par.qsc_db, pb, S, Sim, R);
1033 
1034 
1035     // Delete all hitlist entries with too short alignments
1036     nhits = 0;
1037     hitlist.Reset();
1038     while (!hitlist.End()) {
1039         Hit hit_cur = hitlist.ReadNext();
1040         //printf("Deleting alignment of %s with length %i? irep=%i nhits=%-2i  par.B=%-3i  par.Z=%-3i par.e=%.2g par.b=%-3i  par.z=%-3i par.p=%.2g\n",hit_cur.name,hit_cur.matched_cols,hit_cur.irep,nhits,par.B,par.Z,par.e,par.b,par.z,par.p);
1041 
1042         if (nhits > par.realign_max && nhits >= imax(par.B, par.Z))
1043             break;
1044         if (hit_cur.Eval > par.e) {
1045             if (nhits >= imax(par.B, par.Z))
1046                 continue;
1047             if (nhits >= imax(par.b, par.z) && hit_cur.Probab < par.p)
1048                 continue;
1049             if (nhits >= imax(par.b, par.z) && hit_cur.Eval > par.E)
1050                 continue;
1051         }
1052 
1053         if (hit_cur.matched_cols < min_col_realign) {
1054           HH_LOG(DEBUG) << "Deleting alignment of " << hit_cur.name
1055               << " with length " << hit_cur.matched_cols << std::endl;
1056           hitlist.Delete().Delete();        // delete the list record and hit object
1057           // // Make sure only realigned alignments get displayed! JS: Why? better unrealigned than none.
1058           // if (last_round)
1059           // if (par.B>par.Z) par.B--; else if (par.B==par.Z) {par.B--; par.Z--;} else par.Z--;
1060         }
1061     nhits++;
1062   }
1063 }
1064 
run(FILE * query_fh,char * query_path)1065 void HHblits::run(FILE* query_fh, char* query_path) {
1066     int cluster_found = 0;
1067     int seqs_found = 0;
1068 
1069     std::set<std::string> search_counter;
1070 
1071     Hit hit_cur;
1072     Hash<Hit> * previous_hits = new Hash<Hit>(1631, hit_cur);
1073     Hash<Hit> * premerged_hits = new Hash<Hit>(1631, hit_cur);
1074     Qali = new Alignment(par.maxseq, par.maxres);
1075     Qali_allseqs = new Alignment(par.maxseq, par.maxres);
1076 
1077     q = new HMM(MAXSEQDIS, par.maxres);
1078     HMMSimd q_vec(par.maxres);
1079     q_tmp = new HMM(MAXSEQDIS, par.maxres);
1080 
1081     // Read query input file (HHM or alignment format) without adding pseudocounts
1082     Qali->N_in = 0;
1083     char input_format;
1084     ReadQueryFile(par, query_fh, input_format, par.wg, q, Qali, query_path, pb, S,
1085                   Sim);
1086 
1087     if (par.allseqs) {
1088         *Qali_allseqs = *Qali;  // make a *deep* copy of Qali!
1089         for (int k = 0; k < Qali_allseqs->N_in; ++k)
1090             Qali_allseqs->keep[k] = 1;  // keep *all* sequences (reset filtering in Qali)
1091     }
1092 
1093     // Set query columns in His-tags etc to Null model distribution
1094     if (par.notags)
1095         q->NeutralizeTags(pb);
1096 
1097     //save all entries pointer in this vector to delete, when it's safe
1098     std::vector<HHEntry*> all_entries;
1099     std::vector<HHEntry*> new_entries;
1100     std::vector<HHEntry*> old_entries;
1101 
1102     if (!par.prefilter) {
1103         for (size_t i = 0; i < dbs.size(); i++) {
1104             dbs[i]->initNoPrefilter(new_entries);
1105         }
1106         all_entries.insert(all_entries.end(), new_entries.begin(),
1107                            new_entries.end());
1108 
1109         for (size_t i = 0; i < all_entries.size(); i++) {
1110             search_counter.insert(all_entries[i]->getName());
1111         }
1112     }
1113 
1114     //////////////////////////////////////////////////////////////////////////////////
1115     // Main loop overs search iterations
1116     //////////////////////////////////////////////////////////////////////////////////
1117 
1118     for (int round = 1; round <= par.num_rounds; round++) {
1119         HH_LOG(INFO) << "Iteration " << round << std::endl;
1120         if (par.premerge > 0 && round > 1 && previous_hits->Size() >= par.premerge)
1121         {
1122             HH_LOG(INFO) << "Set premerge to 0! (premerge: " << par.premerge << " iteration: " <<round << " hits.Size: " << previous_hits->Size() << ")\n";
1123             par.premerge = 0;
1124         } else{
1125             par.premerge -= previous_hits->Size();
1126         }
1127         // Save HMM without pseudocounts for prefilter query-profile
1128         *q_tmp = *q;
1129         HMM* q_rescore = new HMM(MAXSEQDIS, par.maxres);
1130 
1131 
1132         PrepareQueryHMM(par, input_format, q, pc_hhm_context_engine,
1133                         pc_hhm_context_mode, pb, R);
1134         //deep copy for rescoring of q
1135         *q_rescore = *q;
1136         q_vec.MapOneHMM(q_rescore);
1137 
1138         ////////////////////////////////////////////
1139         // Prefiltering
1140         ////////////////////////////////////////////
1141 
1142         if (par.prefilter) {
1143             HH_LOG(INFO) << "Prefiltering database" << std::endl;
1144 
1145             new_entries.clear();
1146             old_entries.clear();
1147 
1148             // Add Pseudocounts to q_tmp
1149             if (par.nocontxt) {
1150                 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
1151                 q_tmp->PreparePseudocounts(R);
1152                 // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
1153                 q_tmp->AddAminoAcidPseudocounts(par.pc_prefilter_nocontext_mode,
1154                                                 par.pc_prefilter_nocontext_a,
1155                                                 par.pc_prefilter_nocontext_b,
1156                                                 par.pc_prefilter_nocontext_c);
1157             } else {
1158                 // Add context specific pseudocounts (now always used, because clusterfile is necessary)
1159                 q_tmp->AddContextSpecificPseudocounts(pc_prefilter_context_engine,
1160                                                       pc_prefilter_context_mode);
1161             }
1162 
1163             q_tmp->CalculateAminoAcidBackground(pb);
1164 
1165             for (size_t i = 0; i < dbs.size(); i++) {
1166                 dbs[i]->prefilter_db(q_tmp, previous_hits, par.threads,
1167                                      par.prefilter_gap_open, par.prefilter_gap_extend,
1168                                      par.prefilter_score_offset,
1169                                      par.prefilter_bit_factor,
1170                                      par.prefilter_evalue_thresh,
1171                                      par.prefilter_evalue_coarse_thresh,
1172                                      par.preprefilter_smax_thresh,
1173                                      par.min_prefilter_hits, par.maxnumdb, R,
1174                                      new_entries, old_entries);
1175             }
1176 
1177             for (size_t i = 0; i < new_entries.size(); i++) {
1178                 search_counter.insert(new_entries[i]->getName());
1179             }
1180 
1181             all_entries.insert(all_entries.end(), new_entries.begin(),
1182                                new_entries.end());
1183             all_entries.insert(all_entries.end(), old_entries.begin(),
1184                                old_entries.end());
1185         }
1186 
1187         int max_template_length = getMaxTemplateLength(new_entries);
1188         if (max_template_length > par.maxres) {
1189           HH_LOG(WARNING)
1190               << "database contains sequences that exceed maximum allowed size (maxres = "
1191               << par.maxres << "). Max sequence length can be increased with parameter -maxres."
1192               << std::endl;
1193         }
1194         max_template_length = std::min(max_template_length, par.maxres);
1195         for (int i = 0; i < par.threads; i++) {
1196           viterbiMatrices[i]->AllocateBacktraceMatrix(q->L, max_template_length);
1197         }
1198 
1199         hitlist.N_searched = search_counter.size();
1200 
1201         if (new_entries.size() == 0) {
1202             HH_LOG(INFO) << "No HMMs pass prefilter => Stop searching!" << std::endl;
1203             break;
1204         }
1205 
1206         if (par.prefilter) {
1207             // Search datbases
1208             HH_LOG(INFO)
1209                 << "HMMs passed 2nd prefilter (gapped profile-profile alignment)   : " << new_entries.size() + old_entries.size() << std::endl;
1210             HH_LOG(INFO)
1211                 << "HMMs passed 2nd prefilter and not found in previous iterations : " << new_entries.size() << std::endl;
1212         }
1213         HH_LOG(INFO) << "Scoring " << new_entries.size() << " HMMs using HMM-HMM Viterbi alignment" << std::endl;
1214         // Main Viterbi HMM-HMM search
1215         ViterbiRunner viterbirunner(viterbiMatrices, dbs, par.threads);
1216         std::vector<Hit> hits_to_add = viterbirunner.alignment(par, &q_vec,
1217                                                                new_entries,
1218                                                                par.qsc_db, pb, S,
1219                                                                Sim, R, par.ssm, S73, S33, S37);
1220 
1221         add_hits_to_hitlist(hits_to_add, hitlist);
1222 
1223         // check for new hits or end with iteration
1224         int new_hits = 0;
1225         hitlist.Reset();
1226         while (!hitlist.End()) {
1227             Hit hit_cur = hitlist.ReadNext();
1228             if (hit_cur.Eval > 100.0 * par.e)
1229                 break;  // E-value much too large
1230             if (hit_cur.Eval > par.e)
1231                 continue;  // E-value too large
1232             new_hits++;
1233         }
1234 
1235         if (new_hits == 0 || round == par.num_rounds) {
1236             if (round < par.num_rounds) {
1237                 HH_LOG(INFO) << "No new hits found in iteration " << round
1238                              << " => Stop searching" << std::endl;
1239             }
1240 
1241             if (old_entries.size() > 0 && par.realign_old_hits) {
1242                 HH_LOG(INFO)
1243                     << "Rescoring previously found HMMs with Viterbi algorithm"
1244                     << std::endl;
1245 
1246                 ViterbiRunner viterbirunner(viterbiMatrices, dbs, par.threads);
1247                 std::vector<Hit> hits_to_add = viterbirunner.alignment(par, &q_vec,
1248                                                                        old_entries,
1249                                                                        par.qsc_db, pb,
1250                                                                        S, Sim, R, par.ssm, S73, S33, S37);
1251                 add_hits_to_hitlist(hits_to_add, hitlist);
1252                 // Add dbfiles_old to dbfiles_new for realign
1253                 new_entries.insert(new_entries.end(), old_entries.begin(),
1254                                    old_entries.end());
1255             } else if (!par.realign_old_hits && previous_hits->Size() > 0) {
1256                 HH_LOG(INFO)
1257                     << "Rescoring previously found HMMs with Viterbi algorithm"
1258                     << std::endl;
1259                 RescoreWithViterbiKeepAlignment(q_vec, previous_hits);
1260             }
1261         }
1262         if (par.premerge){
1263             premerge(previous_hits, premerged_hits, seqs_found, cluster_found, MINCOLS_REALIGN);
1264         }
1265         // Realign hits with MAC algorithm
1266         if (par.realign) {
1267             perform_realign(q_vec, input_format, new_entries, MINCOLS_REALIGN);
1268         }
1269         // Generate alignment for next iteration
1270         if (round < par.num_rounds || *par.alnfile || *par.psifile || *par.hhmfile || *par.alisbasename) {
1271             if (new_hits > 0) {
1272                 mergeHitsToQuery(hitlist, previous_hits, premerged_hits, seqs_found, cluster_found, MINCOLS_REALIGN);
1273             }
1274 
1275             // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a]
1276             Qali->FrequenciesAndTransitions(q, par.wg, par.mark, par.cons, par.showcons, pb, Sim, NULL, true);
1277 
1278             if (par.notags)
1279                 q->NeutralizeTags(pb);
1280 
1281             if (*par.alisbasename) {
1282                 Alignment* tmp = new Alignment(par.maxseq, par.maxres);
1283                 if (par.allseqs) {
1284                     (*tmp) = (*Qali_allseqs);
1285                 } else {
1286                     (*tmp) = (*Qali);
1287                 }
1288 
1289                 alis[round] = tmp;
1290             }
1291         }
1292             // Update counts for log
1293         else if (round == par.num_rounds) {
1294             hitlist.Reset();
1295             while (!hitlist.End()) {
1296                 Hit hit_cur = hitlist.ReadNext();
1297 
1298                 // E-value much too large
1299                 if (hit_cur.Eval > 100.0 * par.e)
1300                     break;
1301 
1302                 // E-value too large
1303                 if (hit_cur.Eval > par.e)
1304                     continue;
1305 
1306                 std::stringstream ss_tmp;
1307                 ss_tmp << hit_cur.file << "__" << hit_cur.irep;
1308                 // Already in alignment?
1309                 if (previous_hits->Contains((char*) ss_tmp.str().c_str()))
1310                     continue;
1311 
1312                 // Add number of sequences in this cluster to total found
1313                 // read number after second '|'
1314                 seqs_found += SequencesInCluster(hit_cur.name);
1315                 cluster_found++;
1316             }
1317         }
1318 
1319         HH_LOG(INFO) << seqs_found << " sequences belonging to "
1320                      << cluster_found
1321                      << " database HMMs found with an E-value < " << par.e
1322                      << std::endl;
1323 
1324         if (round < par.num_rounds || *par.alnfile || *par.psifile || *par.hhmfile
1325             || *par.alisbasename) {
1326             HH_LOG(INFO)
1327                 << "Number of effective sequences of resulting query HMM: Neff = "
1328                 << q->Neff_HMM << std::endl;
1329         }
1330 
1331         if (q->Neff_HMM > par.neffmax && round < par.num_rounds) {
1332             HH_LOG(INFO)
1333                 << "Diversity is above threshold (" << par.neffmax
1334                 << "). Stop searching! (Change threshold using -neffmax <float>.)"
1335                 << std::endl;
1336         }
1337 
1338         if (Qali->N_in >= par.maxseq) {
1339             HH_LOG(INFO)
1340                 << "Maximun number of sequences in query alignment reached (" << par.maxseq << "). Stop searching!" << std::endl;
1341         }
1342 
1343         if (new_hits == 0 || round == par.num_rounds || q->Neff_HMM > par.neffmax || Qali->N_in >= par.maxseq) {
1344 //      if (new_hits == 0 && round < par.num_rounds) {
1345 //        HH_LOG(INFO) << "No new hits found in iteration " << round
1346 //                               << " => Stop searching" << std::endl;
1347 //      }
1348 //
1349 //      if (old_entries.size() > 0 && par.realign_old_hits) {
1350 //        HH_LOG(INFO)
1351 //            << "Recalculating previously found HMMs with Viterbi algorithm"
1352 //            << std::endl;
1353 //
1354 //        ViterbiRunner viterbirunner(viterbiMatrices, dbs, par.threads);
1355 //        std::vector<Hit> hits_to_add = viterbirunner.alignment(par, &q_vec,
1356 //                                                               old_entries,
1357 //                                                               par.qsc_db, pb,
1358 //                                                               S, Sim, R, par.ssm, S73, S33, S37);
1359 //
1360 //        add_hits_to_hitlist(hits_to_add, hitlist);
1361 //
1362 //        if (par.realign)
1363 //          perform_realign(q_vec, input_format, old_entries);
1364 //      } else if (!par.realign_old_hits && previous_hits->Size() > 0) {
1365 //        HH_LOG(INFO)
1366 //            << "Rescoring previously found HMMs with Viterbi algorithm"
1367 //            << std::endl;
1368 //        RescoreWithViterbiKeepAlignment(q_vec, previous_hits);
1369 //      }
1370 
1371             delete q_rescore;
1372             break;
1373         }
1374 
1375 
1376         // Write good hits to previous_hits hash and clear hitlist
1377         hitlist.Reset();
1378         while (!hitlist.End()) {
1379             Hit hit_cur = hitlist.ReadNext();
1380 
1381             std::stringstream ss_tmp;
1382             ss_tmp << hit_cur.file << "__" << hit_cur.irep;
1383 
1384             if (!par.already_seen_filter || hit_cur.Eval > par.e
1385                 || previous_hits->Contains((char*) ss_tmp.str().c_str()))
1386                 hit_cur.Delete();  // Delete hit object (deep delete with Hit::Delete())
1387             else {
1388                 previous_hits->Add((char*) ss_tmp.str().c_str(), hit_cur);
1389             }
1390 
1391             hitlist.Delete();  // Delete list record (flat delete)
1392         }
1393 
1394         delete q_rescore;
1395     }
1396 
1397     // Warn, if HMMER files were used
1398     if (par.hmmer_used) {
1399         HH_LOG(WARNING)
1400             << "Using HMMER files results in a drastically reduced sensitivity (>10%%).\n"
1401             << " We recommend to use HHMs build by hhmake." << std::endl;
1402     }
1403 
1404     for (size_t i = 0; i < all_entries.size(); i++) {
1405         delete all_entries[i];
1406     }
1407     all_entries.clear();
1408 
1409     previous_hits->Reset();
1410     while (!previous_hits->End())
1411         previous_hits->ReadNext().Delete();  // Delete hit object
1412     delete previous_hits;
1413     delete premerged_hits;
1414 }
1415 
1416 
run(ffindex_entry_t * entry,char * data,ffindex_index_t * sequence_index,char * seq,ffindex_index_t * header_index,char * header)1417 void HHblits::run(ffindex_entry_t* entry, char* data,
1418                   ffindex_index_t* sequence_index, char* seq,
1419                   ffindex_index_t* header_index, char* header) {
1420 
1421 
1422     q=new HMM(MAXSEQDIS, par.maxres);
1423 
1424     Qali = new Alignment(par.maxseq, par.maxres);
1425     Qali_allseqs = new Alignment(par.maxseq, par.maxres);
1426 
1427 
1428 
1429     // Read alignment from infile into matrix X[k][l] as ASCII (and supply first line as extra argument)
1430     Qali->ReadCompressed(entry,data,
1431                          sequence_index, seq,
1432                          header_index, header,
1433                          par.mark, par.maxcol);
1434 
1435 
1436     // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i]
1437     // and store marked sequences in name[k] and seq[k]
1438     Qali->Compress(par.infile, par.cons, par.maxcol, par.M, par.Mgaps);
1439 
1440     Qali->Shrink();
1441 
1442     // Sort out the nseqdis most dissimilar sequences for display in the output alignments
1443     Qali->FilterForDisplay(par.max_seqid, par.mark, S, par.coverage, par.qid, par.qsc, par.nseqdis);
1444 
1445 
1446     // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two)
1447     Qali->N_filtered = Qali->Filter(par.max_seqid, S, par.coverage, par.qid, par.qsc, par.Ndiff);
1448 
1449     if (par.Neff >= 0.999)
1450         Qali->FilterNeff(par.wg, par.mark, par.cons, par.showcons, par.max_seqid, par.coverage, par.Neff, pb, S, Sim);
1451 
1452     // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a]
1453     Qali->FrequenciesAndTransitions(q, par.wg, par.mark, par.cons,par.showcons, pb, Sim);
1454 
1455 
1456 
1457     int cluster_found = 0;
1458     int seqs_found = 0;
1459 
1460     std::set<std::string> search_counter;
1461 
1462     Hit hit_cur;
1463     Hash<Hit>* previous_hits = new Hash<Hit>(1631, hit_cur);
1464     Hash<Hit>* premerged_hits = new Hash<Hit>(1631, hit_cur);
1465 
1466 
1467     HMMSimd q_vec(par.maxres);
1468     q_tmp = new HMM(MAXSEQDIS, par.maxres);
1469 
1470     // Read query input file (HHM or alignment format) without adding pseudocounts
1471     Qali->N_in = 0;
1472     char input_format;
1473 
1474     /*ReadQueryFile(par, query_fh, input_format, par.wg, q, Qali, query_path, pb, S,
1475                   Sim);
1476                   */
1477 
1478 
1479     if (par.allseqs) {
1480         *Qali_allseqs = *Qali;  // make a *deep* copy of Qali!
1481         for (int k = 0; k < Qali_allseqs->N_in; ++k)
1482             Qali_allseqs->keep[k] = 1;  // keep *all* sequences (reset filtering in Qali)
1483     }
1484 
1485     // Set query columns in His-tags etc to Null model distribution
1486     if (par.notags)
1487         q->NeutralizeTags(pb);
1488 
1489     //save all entries pointer in this vector to delete, when it's safe
1490     std::vector<HHEntry*> all_entries;
1491     std::vector<HHEntry*> new_entries;
1492     std::vector<HHEntry*> old_entries;
1493 
1494     if (!par.prefilter) {
1495         for (size_t i = 0; i < dbs.size(); i++) {
1496             dbs[i]->initNoPrefilter(new_entries);
1497         }
1498         all_entries.insert(all_entries.end(), new_entries.begin(),
1499                            new_entries.end());
1500 
1501         for (size_t i = 0; i < all_entries.size(); i++) {
1502             search_counter.insert(all_entries[i]->getName());
1503         }
1504     }
1505 
1506     //////////////////////////////////////////////////////////////////////////////////
1507     // Main loop overs search iterations
1508     //////////////////////////////////////////////////////////////////////////////////
1509 
1510     for (int round = 1; round <= par.num_rounds; round++) {
1511         HH_LOG(INFO) << "Iteration " << round << std::endl;
1512         // Settings for different rounds
1513         if (par.premerge > 0 && round > 1 && previous_hits->Size() >= par.premerge)
1514         {
1515             HH_LOG(INFO) << "Set premerge to 0! (premerge: " << par.premerge << " iteration: " <<round << " hits.Size: " << previous_hits->Size() << ")\n";
1516             par.premerge = 0;
1517         } else{
1518             par.premerge -= previous_hits->Size();
1519         }
1520         // Save HMM without pseudocounts for prefilter query-profile
1521         *q_tmp = *q;
1522         HMM* q_rescore = new HMM(MAXSEQDIS, par.maxres);
1523 
1524 
1525         PrepareQueryHMM(par, input_format, q, pc_hhm_context_engine,
1526                         pc_hhm_context_mode, pb, R);
1527         //deep copy for rescoring of q
1528         *q_rescore = *q;
1529         q_vec.MapOneHMM(q_rescore);
1530 
1531         ////////////////////////////////////////////
1532         // Prefiltering
1533         ////////////////////////////////////////////
1534 
1535         if (par.prefilter) {
1536             HH_LOG(INFO) << "Prefiltering database" << std::endl;
1537 
1538             new_entries.clear();
1539             old_entries.clear();
1540 
1541             // Add Pseudocounts to q_tmp
1542             if (par.nocontxt) {
1543                 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
1544                 q_tmp->PreparePseudocounts(R);
1545                 // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
1546                 q_tmp->AddAminoAcidPseudocounts(par.pc_prefilter_nocontext_mode,
1547                                                 par.pc_prefilter_nocontext_a,
1548                                                 par.pc_prefilter_nocontext_b,
1549                                                 par.pc_prefilter_nocontext_c);
1550             } else {
1551                 // Add context specific pseudocounts (now always used, because clusterfile is necessary)
1552                 q_tmp->AddContextSpecificPseudocounts(pc_prefilter_context_engine,
1553                                                       pc_prefilter_context_mode);
1554             }
1555 
1556             q_tmp->CalculateAminoAcidBackground(pb);
1557 
1558             for (size_t i = 0; i < dbs.size(); i++) {
1559                 dbs[i]->prefilter_db(q_tmp, previous_hits, par.threads,
1560                                      par.prefilter_gap_open, par.prefilter_gap_extend,
1561                                      par.prefilter_score_offset,
1562                                      par.prefilter_bit_factor,
1563                                      par.prefilter_evalue_thresh,
1564                                      par.prefilter_evalue_coarse_thresh,
1565                                      par.preprefilter_smax_thresh,
1566                                      par.min_prefilter_hits, par.maxnumdb, R,
1567                                      new_entries, old_entries);
1568             }
1569 
1570             for (size_t i = 0; i < new_entries.size(); i++) {
1571                 search_counter.insert(new_entries[i]->getName());
1572             }
1573 
1574             all_entries.insert(all_entries.end(), new_entries.begin(),
1575                                new_entries.end());
1576             all_entries.insert(all_entries.end(), old_entries.begin(),
1577                                old_entries.end());
1578         }
1579 
1580         int max_template_length = getMaxTemplateLength(new_entries);
1581         if (max_template_length > par.maxres) {
1582             HH_LOG(WARNING)
1583                 << "database contains sequences that exceeds maximum allowed size (maxres = "
1584                 << par.maxres << "). Maxres can be increased with parameter -maxres."
1585                 << std::endl;
1586         }
1587         max_template_length = std::min(max_template_length, par.maxres);
1588         for (int i = 0; i < par.threads; i++) {
1589             viterbiMatrices[i]->AllocateBacktraceMatrix(q->L, max_template_length);
1590         }
1591 
1592         hitlist.N_searched = search_counter.size();
1593 
1594         if (new_entries.size() == 0) {
1595             HH_LOG(INFO) << "No HMMs pass prefilter => Stop searching!"
1596                          << std::endl;
1597             break;
1598         }
1599 
1600         // Search datbases
1601         HH_LOG(INFO)
1602             << "HMMs passed 2nd prefilter (gapped profile-profile alignment)   : "
1603             << new_entries.size() + old_entries.size() << std::endl;
1604         HH_LOG(INFO)
1605             << "HMMs passed 2nd prefilter and not found in previous iterations : "
1606             << new_entries.size() << std::endl;
1607         HH_LOG(INFO) << "Scoring " << new_entries.size()
1608                      << " HMMs using HMM-HMM Viterbi alignment"
1609                      << std::endl;
1610 
1611         // Main Viterbi HMM-HMM search
1612         ViterbiRunner viterbirunner(viterbiMatrices, dbs, par.threads);
1613         std::vector<Hit> hits_to_add = viterbirunner.alignment(par, &q_vec,
1614                                                                new_entries,
1615                                                                par.qsc_db, pb, S,
1616                                                                Sim, R, par.ssm, S73, S33, S37);
1617 
1618         add_hits_to_hitlist(hits_to_add, hitlist);
1619 
1620         // check for new hits or end with iteration
1621         int new_hits = 0;
1622         hitlist.Reset();
1623         while (!hitlist.End()) {
1624             Hit hit_cur = hitlist.ReadNext();
1625             if (hit_cur.Eval > 100.0 * par.e)
1626                 break;  // E-value much too large
1627             if (hit_cur.Eval > par.e)
1628                 continue;  // E-value too large
1629             new_hits++;
1630         }
1631 
1632         if (new_hits == 0 || round == par.num_rounds) {
1633             if (round < par.num_rounds) {
1634                 HH_LOG(INFO) << "No new hits found in iteration " << round
1635                              << " => Stop searching" << std::endl;
1636             }
1637 
1638             if (old_entries.size() > 0 && par.realign_old_hits) {
1639                 HH_LOG(INFO)
1640                     << "Rescoring previously found HMMs with Viterbi algorithm"
1641                     << std::endl;
1642 
1643                 ViterbiRunner viterbirunner(viterbiMatrices, dbs, par.threads);
1644                 std::vector<Hit> hits_to_add = viterbirunner.alignment(par, &q_vec,
1645                                                                        old_entries,
1646                                                                        par.qsc_db, pb,
1647                                                                        S, Sim, R, par.ssm, S73, S33, S37);
1648                 add_hits_to_hitlist(hits_to_add, hitlist);
1649                 // Add dbfiles_old to dbfiles_new for realign
1650                 new_entries.insert(new_entries.end(), old_entries.begin(),
1651                                    old_entries.end());
1652             } else if (!par.realign_old_hits && previous_hits->Size() > 0) {
1653                 HH_LOG(INFO)
1654                     << "Rescoring previously found HMMs with Viterbi algorithm"
1655                     << std::endl;
1656                 RescoreWithViterbiKeepAlignment(q_vec, previous_hits);
1657             }
1658         }
1659         if (par.premerge){
1660             premerge(previous_hits, premerged_hits, seqs_found, cluster_found, MINCOLS_REALIGN);
1661         }
1662         // Realign hits with MAC algorithm
1663         if (par.realign)
1664             perform_realign(q_vec, input_format, new_entries, MINCOLS_REALIGN);
1665 
1666         // Generate alignment for next iteration
1667         if (round < par.num_rounds || *par.alnfile || *par.psifile || *par.hhmfile || *par.alisbasename) {
1668             if (new_hits > 0) {
1669                 mergeHitsToQuery(hitlist, previous_hits, premerged_hits, seqs_found, cluster_found, MINCOLS_REALIGN);
1670             }
1671 
1672             // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a]
1673             Qali->FrequenciesAndTransitions(q, par.wg, par.mark, par.cons,par.showcons, pb, Sim, NULL,true);
1674 
1675             if (par.notags)
1676                 q->NeutralizeTags(pb);
1677 
1678             if (*par.alisbasename) {
1679                 Alignment* tmp = new Alignment(par.maxseq, par.maxres);
1680                 if (par.allseqs) {
1681                     (*tmp) = (*Qali_allseqs);
1682                 } else {
1683                     (*tmp) = (*Qali);
1684                 }
1685 
1686                 alis[round] = tmp;
1687             }
1688         }
1689             // Update counts for log
1690         else if (round == par.num_rounds) {
1691             hitlist.Reset();
1692             while (!hitlist.End()) {
1693                 Hit hit_cur = hitlist.ReadNext();
1694 
1695                 // E-value much too large
1696                 if (hit_cur.Eval > 100.0 * par.e)
1697                     break;
1698 
1699                 // E-value too large
1700                 if (hit_cur.Eval > par.e)
1701                     continue;
1702 
1703                 std::stringstream ss_tmp;
1704                 ss_tmp << hit_cur.file << "__" << hit_cur.irep;
1705                 // Already in alignment?
1706                 if (previous_hits->Contains((char*) ss_tmp.str().c_str()))
1707                     continue;
1708 
1709                 // Add number of sequences in this cluster to total found
1710                 // read number after second '|'
1711                 seqs_found += SequencesInCluster(hit_cur.name);
1712                 cluster_found++;
1713             }
1714         }
1715 
1716         HH_LOG(INFO) << seqs_found << " sequences belonging to "
1717                      << cluster_found
1718                      << " database HMMs found with an E-value < " << par.e
1719                      << std::endl;
1720 
1721         if (round < par.num_rounds || *par.alnfile || *par.psifile || *par.hhmfile
1722             || *par.alisbasename) {
1723             HH_LOG(INFO)
1724                 << "Number of effective sequences of resulting query HMM: Neff = "
1725                 << q->Neff_HMM << std::endl;
1726         }
1727 
1728         if (q->Neff_HMM > par.neffmax && round < par.num_rounds) {
1729             HH_LOG(INFO)
1730                 << "Diversity is above threshold (" << par.neffmax
1731                 << "). Stop searching! (Change threshold using -neffmax <float>.)"
1732                 << std::endl;
1733         }
1734 
1735         if (Qali->N_in >= par.maxseq) {
1736             HH_LOG(INFO)
1737                 << "Maximun number of sequences in query alignment reached ("
1738                 << par.maxseq << "). Stop searching!" << std::endl;
1739         }
1740 
1741         if (new_hits == 0 || round == par.num_rounds || q->Neff_HMM > par.neffmax || Qali->N_in >= par.maxseq) {
1742 //      if (new_hits == 0 && round < par.num_rounds) {
1743 //        HH_LOG(INFO) << "No new hits found in iteration " << round
1744 //                               << " => Stop searching" << std::endl;
1745 //      }
1746 //
1747 //      if (old_entries.size() > 0 && par.realign_old_hits) {
1748 //        HH_LOG(INFO)
1749 //            << "Recalculating previously found HMMs with Viterbi algorithm"
1750 //            << std::endl;
1751 //
1752 //        ViterbiRunner viterbirunner(viterbiMatrices, dbs, par.threads);
1753 //        std::vector<Hit> hits_to_add = viterbirunner.alignment(par, &q_vec,
1754 //                                                               old_entries,
1755 //                                                               par.qsc_db, pb,
1756 //                                                               S, Sim, R, par.ssm, S73, S33, S37);
1757 //
1758 //        add_hits_to_hitlist(hits_to_add, hitlist);
1759 //
1760 //        if (par.realign)
1761 //          perform_realign(q_vec, input_format, old_entries);
1762 //      } else if (!par.realign_old_hits && previous_hits->Size() > 0) {
1763 //        HH_LOG(INFO)
1764 //            << "Rescoring previously found HMMs with Viterbi algorithm"
1765 //            << std::endl;
1766 //        RescoreWithViterbiKeepAlignment(q_vec, previous_hits);
1767 //      }
1768 
1769             delete q_rescore;
1770             break;
1771         }
1772 
1773 
1774         // Write good hits to previous_hits hash and clear hitlist
1775         hitlist.Reset();
1776         while (!hitlist.End()) {
1777             Hit hit_cur = hitlist.ReadNext();
1778 
1779             std::stringstream ss_tmp;
1780             ss_tmp << hit_cur.file << "__" << hit_cur.irep;
1781 
1782             if (!par.already_seen_filter || hit_cur.Eval > par.e
1783                 || previous_hits->Contains((char*) ss_tmp.str().c_str()))
1784                 hit_cur.Delete();  // Delete hit object (deep delete with Hit::Delete())
1785             else {
1786                 previous_hits->Add((char*) ss_tmp.str().c_str(), hit_cur);
1787             }
1788 
1789             hitlist.Delete();  // Delete list record (flat delete)
1790         }
1791 
1792         delete q_rescore;
1793     }
1794 
1795     // Warn, if HMMER files were used
1796     if (par.hmmer_used) {
1797         HH_LOG(WARNING)
1798             << "Using HMMER files results in a drastically reduced sensitivity (>10%%).\n"
1799             << " We recommend to use HHMs build by hhmake." << std::endl;
1800     }
1801 
1802     for (size_t i = 0; i < all_entries.size(); i++) {
1803         delete all_entries[i];
1804     }
1805     all_entries.clear();
1806 
1807     previous_hits->Reset();
1808     while (!previous_hits->End())
1809         previous_hits->ReadNext().Delete();  // Delete hit object
1810     delete previous_hits;
1811     delete premerged_hits;
1812 }
1813 
1814 
1815 
writeHHRFile(char * hhrFile)1816 void HHblits::writeHHRFile(char* hhrFile) {
1817     if (*hhrFile) {
1818         hitlist.PrintHHR(q_tmp, hhrFile, par.maxdbstrlen, par.showconf,
1819                          par.showcons, par.showdssp, par.showpred, par.b, par.B,
1820                          par.z, par.Z, par.aliwidth, par.nseqdis, par.p, par.E,
1821                          par.argc, par.argv, S, par.maxseq);
1822     }
1823 }
1824 
writeAlisFile(char * basename)1825 void HHblits::writeAlisFile(char* basename) {
1826     if (*basename) {
1827         std::map<int, Alignment*>::iterator it;
1828         for (it = alis.begin(); it != alis.end(); it++) {
1829             std::stringstream ss_tmp;
1830             ss_tmp << basename << "_" << (*it).first << ".a3m";
1831             std::string id = ss_tmp.str();
1832 
1833             (*it).second->WriteToFile(id.c_str(), par.append, "a3m");
1834         }
1835     }
1836 }
1837 
writeScoresFile(char * scoresFile)1838 void HHblits::writeScoresFile(char* scoresFile) {
1839     if (*scoresFile) {
1840         hitlist.PrintScoreFile(q, scoresFile);
1841     }
1842 }
1843 
writeM8(char * m8File)1844 void HHblits::writeM8(char* m8File) {
1845     if (*m8File) {
1846         hitlist.PrintM8File(q, m8File, par.nseqdis, par.p, par.b, par.E);
1847     }
1848 }
1849 
writePairwiseAlisFile(char * pairwiseAlisFile,char outformat)1850 void HHblits::writePairwiseAlisFile(char* pairwiseAlisFile, char outformat) {
1851     if (*pairwiseAlisFile) {
1852         hitlist.PrintAlignments(q, pairwiseAlisFile, par.showconf, par.showcons,
1853                                 par.showdssp, par.showpred, par.p, par.aliwidth,
1854                                 par.nseqdis, par.b, par.B, par.E, S, par.maxseq, outformat);
1855     }
1856 }
1857 
writeAlitabFile(char * alitabFile)1858 void HHblits::writeAlitabFile(char* alitabFile) {
1859     if (*alitabFile) {
1860         hitlist.WriteToAlifile(q, alitabFile, par.b, par.B, par.z, par.Z, par.p,
1861                                par.E);
1862     }
1863 }
1864 
writePsiFile(char * psiFile)1865 void HHblits::writePsiFile(char* psiFile) {
1866     // Write output PSI-BLAST-formatted alignment?
1867     if (*psiFile) {
1868         if (par.allseqs)
1869             Qali_allseqs->WriteToFile(psiFile, par.append, "psi");
1870         else
1871             Qali->WriteToFile(psiFile, par.append, "psi");
1872     }
1873 }
1874 
writeHMMFile(char * HMMFile)1875 void HHblits::writeHMMFile(char* HMMFile) {
1876     // Write output HHM file?
1877     if (*HMMFile) {
1878         // Add *no* amino acid pseudocounts to query. This is necessary to copy f[i][a] to p[i][a]
1879         q->AddAminoAcidPseudocounts(0, 0.0, 0.0, 1.0);
1880         q->CalculateAminoAcidBackground(pb);
1881 
1882         q->WriteToFile(HMMFile, par.append, par.max_seqid, par.coverage, par.qid,
1883                        par.Ndiff, par.qsc, par.argc, par.argv, pb);
1884     }
1885 }
1886 
writeA3MFile(char * A3MFile)1887 void HHblits::writeA3MFile(char* A3MFile) {
1888     // Write output A3M alignment?
1889     if (*A3MFile) {
1890         if (par.allseqs)
1891             Qali_allseqs->WriteToFile(A3MFile, par.append, "a3m");
1892         else
1893             Qali->WriteToFile(A3MFile, par.append, "a3m");
1894     }
1895 }
1896 
writeHHRFile(HHblits & hhblits,std::stringstream & out)1897 void HHblits::writeHHRFile(HHblits& hhblits, std::stringstream& out) {
1898     hhblits.hitlist.PrintHHR(hhblits.q_tmp, out, hhblits.par.maxdbstrlen,
1899                              hhblits.par.showconf, hhblits.par.showcons,
1900                              hhblits.par.showdssp, hhblits.par.showpred,
1901                              hhblits.par.b, hhblits.par.B, hhblits.par.z,
1902                              hhblits.par.Z, hhblits.par.aliwidth,
1903                              hhblits.par.nseqdis, hhblits.par.p, hhblits.par.E,
1904                              hhblits.par.argc, hhblits.par.argv, hhblits.S,
1905                              hhblits.par.maxseq);
1906 }
1907 
printHitList()1908 void HHblits::printHitList() {
1909     char output[] = {"stdout"};
1910     hitlist.PrintHitList(q, output, par.maxdbstrlen, par.z, par.Z, par.p, par.E, par.argc, par.argv);
1911 }
1912 
printHHRFile()1913 void HHblits::printHHRFile() {
1914     char output[] = {"stdout"};
1915     hitlist.PrintHHR(q_tmp, output, par.maxdbstrlen,
1916                      par.showconf, par.showcons,
1917                      par.showdssp, par.showpred,
1918                      par.b, par.B, par.z,
1919                      par.Z, par.aliwidth,
1920                      par.nseqdis, par.p, par.E,
1921                      par.argc, par.argv, S, par.maxseq);
1922 }
1923 
writeScoresFile(HHblits & hhblits,std::stringstream & out)1924 void HHblits::writeScoresFile(HHblits& hhblits, std::stringstream& out) {
1925     hhblits.hitlist.PrintScoreFile(hhblits.q, out);
1926 }
1927 
writeM8(HHblits & hhblits,std::stringstream & out)1928 void HHblits::writeM8(HHblits& hhblits, std::stringstream& out) {
1929     hhblits.hitlist.PrintM8File(hhblits.q, out, hhblits.par.nseqdis, hhblits.par.p, hhblits.par.b, hhblits.par.E);
1930 }
1931 
writePairwiseAlisFile(HHblits & hhblits,std::stringstream & out)1932 void HHblits::writePairwiseAlisFile(HHblits& hhblits, std::stringstream& out) {
1933     hhblits.hitlist.PrintAlignments(hhblits.q, out, hhblits.par.showconf,
1934                                     hhblits.par.showcons, hhblits.par.showdssp,
1935                                     hhblits.par.showpred, hhblits.par.p,
1936                                     hhblits.par.aliwidth, hhblits.par.nseqdis,
1937                                     hhblits.par.b, hhblits.par.B, hhblits.par.E,
1938                                     hhblits.S, hhblits.par.maxseq, hhblits.par.outformat);
1939 }
1940 
writeAlitabFile(HHblits & hhblits,std::stringstream & out)1941 void HHblits::writeAlitabFile(HHblits& hhblits, std::stringstream& out) {
1942     hhblits.hitlist.WriteToAlifile(hhblits.q, out, hhblits.par.b, hhblits.par.B,
1943                                    hhblits.par.z, hhblits.par.Z, hhblits.par.p,
1944                                    hhblits.par.E);
1945 }
1946 
writePsiFile(HHblits & hhblits,std::stringstream & out)1947 void HHblits::writePsiFile(HHblits& hhblits, std::stringstream& out) {
1948     if (hhblits.par.allseqs)
1949         hhblits.Qali_allseqs->WriteToFile(out, "psi");
1950     else
1951         hhblits.Qali->WriteToFile(out, "psi");
1952 }
1953 
writeHMMFile(HHblits & hhblits,std::stringstream & out)1954 void HHblits::writeHMMFile(HHblits& hhblits, std::stringstream& out) {
1955     // Add *no* amino acid pseudocounts to query. This is necessary to copy f[i][a] to p[i][a]
1956     hhblits.q->AddAminoAcidPseudocounts(0, 0.0, 0.0, 1.0);
1957     hhblits.q->CalculateAminoAcidBackground(hhblits.pb);
1958 
1959     hhblits.q->WriteToFile(out, hhblits.par.max_seqid, hhblits.par.coverage,
1960                            hhblits.par.qid, hhblits.par.Ndiff, hhblits.par.qsc,
1961                            hhblits.par.argc, hhblits.par.argv, hhblits.pb);
1962 }
1963 
writeA3MFile(HHblits & hhblits,std::stringstream & out)1964 void HHblits::writeA3MFile(HHblits& hhblits, std::stringstream& out) {
1965     if (hhblits.par.allseqs)
1966         hhblits.Qali_allseqs->WriteToFile(out, "a3m");
1967     else
1968         hhblits.Qali->WriteToFile(out, "a3m");
1969 }
1970 
writeMatricesFile(char * matricesOutputFileName)1971 void HHblits::writeMatricesFile(char* matricesOutputFileName) {
1972     if (*matricesOutputFileName) {
1973         hitlist.PrintMatrices(q, matricesOutputFileName, par.filter_matrices,
1974                               par.max_number_matrices, S);
1975     }
1976 }
1977 
writeMatricesFile(HHblits & hhblits,std::stringstream & out)1978 void HHblits::writeMatricesFile(HHblits& hhblits, std::stringstream& out) {
1979     hhblits.hitlist.PrintMatrices(hhblits.q, out, hhblits.par.filter_matrices,
1980                                   hhblits.par.max_number_matrices,
1981                                   hhblits.S);
1982 }
1983 
premerge(Hash<Hit> * previous_hits,Hash<Hit> * premerged_hits,int & seqs_found,int & cluster_found,int min_col_realign)1984 void HHblits::premerge(Hash<Hit>* previous_hits, Hash<Hit>* premerged_hits,
1985                        int& seqs_found, int& cluster_found, int min_col_realign) {
1986     int Lmax = 0;      // length of longest HMM to be realigned
1987     long int Lmaxmem = ((par.maxmem - 0.5) * 1024 * 1024 * 1024)
1988                        / (2 * sizeof(double) + 8) / q->L / par.threads;
1989 
1990     // Initialize a Null-value as a return value if not items are available anymore
1991     hitlist.Reset();
1992     while (!hitlist.End()) {
1993         Hit hit_cur = hitlist.ReadNext();
1994         if (hit_cur.L > Lmax) {
1995             Lmax = hit_cur.L;
1996         }
1997         if (hit_cur.L > Lmaxmem) {
1998             continue;
1999         }
2000     }
2001     int t_maxres = Lmax + 2;
2002     for (int i = 0; i < par.threads; i++) {
2003         posteriorMatrices[i]->allocateMatrix(q->L, t_maxres);
2004     }
2005     PosteriorDecoderRunner runner(posteriorMatrices, viterbiMatrices, par.threads, par.ssw, S73, S33, S37);
2006 
2007     int prermergedHits = 0;
2008     std::vector<Hit *> tmpHits;
2009     hitlist.Reset();
2010 
2011     while (!hitlist.End() && prermergedHits<par.premerge)
2012     {
2013         HitList tmpHitList;
2014         Hit hit_cur = hitlist.ReadNext();
2015         if (hit_cur.L > Lmaxmem) {
2016             continue;
2017         }
2018 
2019         tmpHits.clear();
2020         tmpHits.push_back(&hit_cur);
2021         if (prermergedHits>=imax(par.B,par.Z)) break;
2022         if (prermergedHits>=imax(par.b,par.z) && hit_cur.Probab < par.p) break;
2023         if (prermergedHits>=imax(par.b,par.z) && hit_cur.Eval > par.E) continue;
2024         prermergedHits++;
2025 
2026         if (hit_cur.Eval > par.e) continue; // Don't align hits with an E-value below the inclusion threshold
2027         // Replace original hit in hitlist with realigned hit
2028         //hitlist.ReadCurrent().Delete();
2029         runner.executeComputation(*q, tmpHits, par, par.qsc_db, pb, S, Sim, R);
2030         hitlist.Delete();
2031         hitlist.Insert(*tmpHits[0]);
2032         tmpHitList.Push(*tmpHits[0]);
2033         mergeHitsToQuery(tmpHitList, previous_hits, premerged_hits, seqs_found, cluster_found, min_col_realign);
2034         std::stringstream ss_tmp;
2035         ss_tmp << hit_cur.file << "__" << hit_cur.irep;
2036         premerged_hits->Add((char*)ss_tmp.str().c_str());
2037 
2038         // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a]
2039         Qali->FrequenciesAndTransitions(q, par.wg, par.mark, par.cons,par.showcons, pb, Sim, NULL,true);
2040 
2041         if (par.notags) q->NeutralizeTags(pb);
2042 
2043         if (par.nocontxt) {
2044             // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
2045             q->PreparePseudocounts(R);
2046             // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
2047             q->AddAminoAcidPseudocounts(par.pc_prefilter_nocontext_mode,
2048                                         par.pc_prefilter_nocontext_a,
2049                                         par.pc_prefilter_nocontext_b,
2050                                         par.pc_prefilter_nocontext_c);
2051         } else {
2052             // Add context specific pseudocounts (now always used, because clusterfile is necessary)
2053             q->AddContextSpecificPseudocounts(pc_prefilter_context_engine,
2054                                               pc_prefilter_context_mode);
2055         }
2056         q->CalculateAminoAcidBackground(pb);
2057 
2058         // Transform transition freqs to lin space if not already done
2059         q->AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg,
2060                                      par.gaph, par.gapi, par.gapb, par.gapb);
2061         q->Log2LinTransitionProbs(1.0); // transform transition freqs to lin space if not already done
2062 
2063     }
2064     HH_LOG(INFO) << "Premerge done\n";
2065 
2066 }
2067 
2068