1 #include "hhfunc.h"
2 #include "ext/fmemopen.h"
3 #include "context_data.crf.h"
4 
5 #include <sstream>
6 
7 /////////////////////////////////////////////////////////////////////////////////////
8 // Read input file (HMM, HHM, or alignment format)
9 /////////////////////////////////////////////////////////////////////////////////////
ReadQueryFile(Parameters & par,FILE * inf,char & input_format,char use_global_weights,HMM * q,Alignment * qali,char infile[],float * pb,const float S[20][20],const float Sim[20][20])10 void ReadQueryFile(Parameters& par, FILE* inf, char& input_format,
11     char use_global_weights, HMM* q, Alignment* qali, char infile[], float* pb,
12     const float S[20][20], const float Sim[20][20]) {
13   char line[LINELEN];
14 
15   if (!fgetline(line, LINELEN, inf)) {
16 	HH_LOG(ERROR) << "Error in " << __FILE__ << ":" << __LINE__ << ": " << __func__ << ":" << std::endl;
17 	HH_LOG(ERROR) << "\t" << infile << " is empty!\n";
18     exit(4);
19   }
20   while (strscn(line) == NULL)
21     fgetline(line, LINELEN, inf); // skip lines that contain only white space
22 
23   // Is infile a HMMER file?
24   if (!strncmp(line, "HMMER", 5)) {
25     // Uncomment this line to allow HMMER2/HMMER3 models as queries:
26     HH_LOG(ERROR) << "Use of HMMER format as input will result in severe loss of sensitivity!\n";
27   }
28   // ... or is it an hhm file?
29   else if (!strncmp(line, "NAME", 4) || !strncmp(line, "HH", 2)) {
30     char path[NAMELEN];
31     Pathname(path, infile);
32 
33     HH_LOG(INFO) << "Query file is in HHM format\n";
34 
35     // Rewind to beginning of line and read query hhm file
36     rewind(inf);
37     q->Read(inf, par.maxcol, par.nseqdis, pb, path);
38     input_format = 0;
39 
40     // HHM format
41     if (input_format == 0) {
42       HH_LOG(INFO) << "Extracting representative sequences from " << infile << " to merge later with matched database sequences\n";
43     }
44 
45     Alignment ali_tmp(par.maxseq, par.maxres);
46     ali_tmp.GetSeqsFromHMM(q);
47     ali_tmp.Compress(infile, par.cons, par.maxcol, par.M, par.Mgaps);
48     *qali = ali_tmp;
49   }
50   // ... or is it an alignment file
51   else if (line[0] == '#' || line[0] == '>') {
52     if (strcmp(infile, "stdin")) {
53     	HH_LOG(INFO) << infile << " is in A2M, A3M or FASTA format\n";
54     }
55 
56     Alignment ali_tmp(par.maxseq, par.maxres);
57 
58     // Read alignment from infile into matrix X[k][l] as ASCII (and supply first line as extra argument)
59     ali_tmp.Read(inf, infile, par.mark, par.maxcol, par.nseqdis, line);
60 
61     // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i]
62     // and store marked sequences in name[k] and seq[k]
63     ali_tmp.Compress(infile, par.cons, par.maxcol, par.M, par.Mgaps);
64 
65     ali_tmp.Shrink();
66 
67     // Sort out the nseqdis most dissimilar sequences for display in the output alignments
68     ali_tmp.FilterForDisplay(par.max_seqid, par.mark, S, par.coverage, par.qid, par.qsc, par.nseqdis);
69 
70     // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two)
71     ali_tmp.N_filtered = ali_tmp.Filter(par.max_seqid, S, par.coverage, par.qid, par.qsc, par.Ndiff);
72 
73     if (par.Neff >= 0.999)
74     	ali_tmp.FilterNeff(use_global_weights, par.mark, par.cons, par.showcons, par.max_seqid, par.coverage, par.Neff, pb, S, Sim);
75 
76     // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a]
77     ali_tmp.FrequenciesAndTransitions(q, use_global_weights, par.mark, par.cons, par.showcons, pb, Sim);
78 
79     *qali = ali_tmp;
80     input_format = 0;
81   }
82   else {
83 	HH_LOG(ERROR) << "Error in " << __FILE__ << ":" << __LINE__ << ": " << __func__ << ":" << std::endl;
84 	HH_LOG(ERROR) << "\tunrecognized input file format in \'" << infile << "\'\n";
85 	HH_LOG(ERROR) << "\tline = " << line << "\n";
86     exit(1);
87   }
88 
89   if (input_format == 0 && q->Neff_HMM > 11.0) {
90 	  HH_LOG(WARNING) << "MSA " << q->name << " looks too diverse (Neff=" << q->Neff_HMM << ">11). Better check it with an alignment viewer for non-homologous segments. Also consider building the MSA with hhblits using the - option to limit MSA diversity.\n";
91   }
92 }
93 
ReadQueryFile(Parameters & par,char * infile,char & input_format,char use_global_weights,HMM * q,Alignment * qali,float * pb,const float S[20][20],const float Sim[20][20])94 void ReadQueryFile(Parameters& par, char* infile, char& input_format,
95     char use_global_weights, HMM* q, Alignment* qali, float* pb,
96     const float S[20][20], const float Sim[20][20]) {
97   // Open query file and determine file type
98   char path[NAMELEN]; // path of input file (is needed to write full path and file name to HMM FILE record)
99   FILE* inf = NULL;
100   if (strcmp(infile, "stdin") == 0) {
101     inf = stdin;
102     HH_LOG(INFO) << "Reading HMM / multiple alignment from standard input ...\n";
103     path[0] = '\0';
104   }
105   else {
106     inf = fopen(infile, "r");
107     if (!inf)
108       OpenFileError(infile, __FILE__, __LINE__, __func__);
109     Pathname(path, infile);
110   }
111 
112   ReadQueryFile(par, inf, input_format, use_global_weights, q, qali, infile, pb, S, Sim);
113 
114   fclose(inf);
115 }
116 
117 /////////////////////////////////////////////////////////////////////////////////////
118 // Add transition and amino acid pseudocounts to query HMM, calculate aa background etc.
119 /////////////////////////////////////////////////////////////////////////////////////
PrepareQueryHMM(Parameters & par,char & input_format,HMM * q,cs::Pseudocounts<cs::AA> * pc_hhm_context_engine,cs::Admix * pc_hhm_context_mode,const float * pb,const float R[20][20])120 void PrepareQueryHMM(Parameters& par, char& input_format, HMM* q,
121     cs::Pseudocounts<cs::AA>* pc_hhm_context_engine,
122     cs::Admix* pc_hhm_context_mode, const float* pb, const float R[20][20]) {
123   // Was query an HHsearch formatted file or MSA (no pseudocounts added yet)?
124   if (input_format == 0) {
125     // Add transition pseudocounts to query -> q->p[i][a]
126     q->AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg,
127         par.gaph, par.gapi, par.gapb, par.gapb);
128 
129     // Compute substitutino matrix pseudocounts?
130     if (par.nocontxt) {
131       // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
132       q->PreparePseudocounts(R);
133       // Add amino acid pseudocounts to query:  q->p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
134       q->AddAminoAcidPseudocounts(par.pc_hhm_nocontext_mode,
135           par.pc_hhm_nocontext_a, par.pc_hhm_nocontext_b,
136           par.pc_hhm_nocontext_c);
137     }
138     else {
139       // Add context specific pseudocount to query
140       q->AddContextSpecificPseudocounts(pc_hhm_context_engine,
141           pc_hhm_context_mode);
142     }
143   }
144   // or was query a HMMER file? (pseudocounts already added!)
145   else if (input_format == 1) {
146     // Don't add transition pseudocounts to query!!
147     // DON'T ADD amino acid pseudocounts to query: pcm=0!  q->p[i][a] = f[i][a]
148     q->AddAminoAcidPseudocounts(0, par.pc_hhm_nocontext_a,
149         par.pc_hhm_nocontext_b, par.pc_hhm_nocontext_c);
150   }
151 
152   q->CalculateAminoAcidBackground(pb);
153 
154   // if (par.addss==1) CalculateSS(q);
155 
156   if (par.columnscore == 5 && !q->divided_by_local_bg_freqs)
157     q->DivideBySqrtOfLocalBackgroundFreqs(
158         par.half_window_size_local_aa_bg_freqs, pb);
159 }
160 
161 /////////////////////////////////////////////////////////////////////////////////////
162 // Do precalculations for q and t to prepare comparison
163 /////////////////////////////////////////////////////////////////////////////////////
PrepareTemplateHMM(Parameters & par,HMM * q,HMM * t,int format,float linear_tranistion_probs,const float * pb,const float R[20][20])164 void PrepareTemplateHMM(Parameters& par, HMM* q, HMM* t, int format, float linear_tranistion_probs,
165     const float* pb, const float R[20][20]) {
166   // HHM format
167   if (format == 0) {
168     // Add transition pseudocounts to template
169     t->AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg,
170         par.gaph, par.gapi, par.gapb, par.gapb);
171 
172     // Don't use CS-pseudocounts because of runtime!!!
173     // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
174     t->PreparePseudocounts(R);
175 
176     // Add amino acid pseudocounts to query:  p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
177     t->AddAminoAcidPseudocounts(par.pc_hhm_nocontext_mode,
178         par.pc_hhm_nocontext_a, par.pc_hhm_nocontext_b, par.pc_hhm_nocontext_c);
179   }
180   // HHMER format
181   else {
182     // Don't add transition pseudocounts to template
183     // t->AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg, par.gaph, par.gapi, 0.0);
184 
185     // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
186     // t->PreparePseudocounts();
187 
188     // DON'T ADD amino acid pseudocounts to temlate: pcm=0!  t->p[i][a] = t->f[i][a]
189     t->AddAminoAcidPseudocounts(0, par.pc_hhm_nocontext_a,
190         par.pc_hhm_nocontext_b, par.pc_hhm_nocontext_c);
191   }
192   t->CalculateAminoAcidBackground(pb);
193 
194   if (linear_tranistion_probs)
195     t->Log2LinTransitionProbs(1.0);
196 
197   // Factor Null model into HMM t
198   // ATTENTION! t->p[i][a] is divided by pnul[a] (for reasons of efficiency) => do not reuse t->p
199   t->IncludeNullModelInHMM(q, t, par.columnscore,
200       par.half_window_size_local_aa_bg_freqs, pb); // Can go BEFORE the loop if not dependent on template
201 }
202 
InitializePseudocountsEngine(Parameters & par,cs::ContextLibrary<cs::AA> * & context_lib,cs::Crf<cs::AA> * & crf,cs::Pseudocounts<cs::AA> * & pc_hhm_context_engine,cs::Admix * & pc_hhm_context_mode,cs::Pseudocounts<cs::AA> * & pc_prefilter_context_engine,cs::Admix * & pc_prefilter_context_mode)203 void InitializePseudocountsEngine(Parameters& par,
204     cs::ContextLibrary<cs::AA>*& context_lib, cs::Crf<cs::AA>*& crf,
205     cs::Pseudocounts<cs::AA>*& pc_hhm_context_engine,
206     cs::Admix*& pc_hhm_context_mode,
207     cs::Pseudocounts<cs::AA>*& pc_prefilter_context_engine,
208     cs::Admix*& pc_prefilter_context_mode) {
209   // Prepare pseudocounts engine
210   FILE* fin;
211   char ext[100];
212   if (par.clusterfile != "") {
213     fin = fopen(par.clusterfile.c_str(), "r");
214     if (!fin) {
215       HH_LOG(ERROR) << "Could not open file \'" << par.clusterfile << "\'\n";
216       exit(2);
217     }
218     Extension(ext, par.clusterfile.c_str());
219   } else {
220     fin = fmemopen((void*)context_data_crf, context_data_crf_len, "r");
221     strcpy(ext, "crf");
222   }
223   if (strcmp(ext, "crf") == 0) {
224     // Newer discriminative approach by Angermueller & Soeding Bioinformatics 2012
225     crf = new cs::Crf<cs::AA>(fin);
226     pc_hhm_context_engine = new cs::CrfPseudocounts<cs::AA>(*crf);
227     pc_prefilter_context_engine = new cs::CrfPseudocounts<cs::AA>(*crf);
228   }
229   else {
230     // Older generative approach by Biegert & Soeding PNAS 2009
231     context_lib = new cs::ContextLibrary<cs::AA>(fin);
232     cs::TransformToLog(*context_lib);
233     pc_hhm_context_engine = new cs::LibraryPseudocounts<cs::AA>(*context_lib, par.csw, par.csb);
234     pc_prefilter_context_engine = new cs::LibraryPseudocounts<cs::AA>(*context_lib, par.csw, par.csb);
235   }
236   fclose(fin);
237   pc_hhm_context_engine->SetTargetNeff(par.pc_hhm_context_engine.target_neff);
238   pc_prefilter_context_engine->SetTargetNeff(par.pc_prefilter_context_engine.target_neff);
239 
240   // Prepare pseudocounts admixture method
241   pc_hhm_context_mode = par.pc_hhm_context_engine.CreateAdmix();
242   pc_prefilter_context_mode = par.pc_prefilter_context_engine.CreateAdmix();
243 }
244 
DeletePseudocountsEngine(cs::ContextLibrary<cs::AA> * context_lib,cs::Crf<cs::AA> * crf,cs::Pseudocounts<cs::AA> * pc_hhm_context_engine,cs::Admix * pc_hhm_context_mode,cs::Pseudocounts<cs::AA> * pc_prefilter_context_engine,cs::Admix * pc_prefilter_context_mode)245 void DeletePseudocountsEngine(cs::ContextLibrary<cs::AA>* context_lib,
246     cs::Crf<cs::AA>* crf, cs::Pseudocounts<cs::AA>* pc_hhm_context_engine,
247     cs::Admix* pc_hhm_context_mode,
248     cs::Pseudocounts<cs::AA>* pc_prefilter_context_engine,
249     cs::Admix* pc_prefilter_context_mode) {
250 
251   if (context_lib != NULL)
252     delete context_lib;
253   if (crf != NULL)
254     delete crf;
255   if (pc_hhm_context_engine != NULL)
256     delete pc_hhm_context_engine;
257   if (pc_hhm_context_mode != NULL)
258     delete pc_hhm_context_mode;
259   if (pc_prefilter_context_engine != NULL)
260     delete pc_prefilter_context_engine;
261   if (pc_prefilter_context_mode != NULL)
262     delete pc_prefilter_context_mode;
263 }
264 
265 
266 /////////////////////////////////////////////////////////////////////////////////////
267 // Read number of sequences in annotation, after second '|'
268 /////////////////////////////////////////////////////////////////////////////////////
SequencesInCluster(char * name)269 int SequencesInCluster(char* name) {
270   int num = 1;
271   char *ptr = strchr(name, '|');
272   if (!strncmp(name, "cl|", 3) || !strncmp(name, "UP20|", 5)
273       || !strncmp(name, "NR20|", 5))   // kClust formatted database (NR20, ...)
274           {
275     if (*ptr == '|')
276       ptr = strchr(ptr, '|');
277     if (*ptr == '|') {
278       num = strint(ptr);
279       if (num < 0)
280         num = 1;
281     }
282   }
283   return num;
284 }
285 
286 
287