1 #include "hhfunc.h"
2 #include "ext/fmemopen.h"
3 #include "context_data.crf.h"
4
5 #include <sstream>
6
7 /////////////////////////////////////////////////////////////////////////////////////
8 // Read input file (HMM, HHM, or alignment format)
9 /////////////////////////////////////////////////////////////////////////////////////
ReadQueryFile(Parameters & par,FILE * inf,char & input_format,char use_global_weights,HMM * q,Alignment * qali,char infile[],float * pb,const float S[20][20],const float Sim[20][20])10 void ReadQueryFile(Parameters& par, FILE* inf, char& input_format,
11 char use_global_weights, HMM* q, Alignment* qali, char infile[], float* pb,
12 const float S[20][20], const float Sim[20][20]) {
13 char line[LINELEN];
14
15 if (!fgetline(line, LINELEN, inf)) {
16 HH_LOG(ERROR) << "Error in " << __FILE__ << ":" << __LINE__ << ": " << __func__ << ":" << std::endl;
17 HH_LOG(ERROR) << "\t" << infile << " is empty!\n";
18 exit(4);
19 }
20 while (strscn(line) == NULL)
21 fgetline(line, LINELEN, inf); // skip lines that contain only white space
22
23 // Is infile a HMMER file?
24 if (!strncmp(line, "HMMER", 5)) {
25 // Uncomment this line to allow HMMER2/HMMER3 models as queries:
26 HH_LOG(ERROR) << "Use of HMMER format as input will result in severe loss of sensitivity!\n";
27 }
28 // ... or is it an hhm file?
29 else if (!strncmp(line, "NAME", 4) || !strncmp(line, "HH", 2)) {
30 char path[NAMELEN];
31 Pathname(path, infile);
32
33 HH_LOG(INFO) << "Query file is in HHM format\n";
34
35 // Rewind to beginning of line and read query hhm file
36 rewind(inf);
37 q->Read(inf, par.maxcol, par.nseqdis, pb, path);
38 input_format = 0;
39
40 // HHM format
41 if (input_format == 0) {
42 HH_LOG(INFO) << "Extracting representative sequences from " << infile << " to merge later with matched database sequences\n";
43 }
44
45 Alignment ali_tmp(par.maxseq, par.maxres);
46 ali_tmp.GetSeqsFromHMM(q);
47 ali_tmp.Compress(infile, par.cons, par.maxcol, par.M, par.Mgaps);
48 *qali = ali_tmp;
49 }
50 // ... or is it an alignment file
51 else if (line[0] == '#' || line[0] == '>') {
52 if (strcmp(infile, "stdin")) {
53 HH_LOG(INFO) << infile << " is in A2M, A3M or FASTA format\n";
54 }
55
56 Alignment ali_tmp(par.maxseq, par.maxres);
57
58 // Read alignment from infile into matrix X[k][l] as ASCII (and supply first line as extra argument)
59 ali_tmp.Read(inf, infile, par.mark, par.maxcol, par.nseqdis, line);
60
61 // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i]
62 // and store marked sequences in name[k] and seq[k]
63 ali_tmp.Compress(infile, par.cons, par.maxcol, par.M, par.Mgaps);
64
65 ali_tmp.Shrink();
66
67 // Sort out the nseqdis most dissimilar sequences for display in the output alignments
68 ali_tmp.FilterForDisplay(par.max_seqid, par.mark, S, par.coverage, par.qid, par.qsc, par.nseqdis);
69
70 // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two)
71 ali_tmp.N_filtered = ali_tmp.Filter(par.max_seqid, S, par.coverage, par.qid, par.qsc, par.Ndiff);
72
73 if (par.Neff >= 0.999)
74 ali_tmp.FilterNeff(use_global_weights, par.mark, par.cons, par.showcons, par.max_seqid, par.coverage, par.Neff, pb, S, Sim);
75
76 // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a]
77 ali_tmp.FrequenciesAndTransitions(q, use_global_weights, par.mark, par.cons, par.showcons, pb, Sim);
78
79 *qali = ali_tmp;
80 input_format = 0;
81 }
82 else {
83 HH_LOG(ERROR) << "Error in " << __FILE__ << ":" << __LINE__ << ": " << __func__ << ":" << std::endl;
84 HH_LOG(ERROR) << "\tunrecognized input file format in \'" << infile << "\'\n";
85 HH_LOG(ERROR) << "\tline = " << line << "\n";
86 exit(1);
87 }
88
89 if (input_format == 0 && q->Neff_HMM > 11.0) {
90 HH_LOG(WARNING) << "MSA " << q->name << " looks too diverse (Neff=" << q->Neff_HMM << ">11). Better check it with an alignment viewer for non-homologous segments. Also consider building the MSA with hhblits using the - option to limit MSA diversity.\n";
91 }
92 }
93
ReadQueryFile(Parameters & par,char * infile,char & input_format,char use_global_weights,HMM * q,Alignment * qali,float * pb,const float S[20][20],const float Sim[20][20])94 void ReadQueryFile(Parameters& par, char* infile, char& input_format,
95 char use_global_weights, HMM* q, Alignment* qali, float* pb,
96 const float S[20][20], const float Sim[20][20]) {
97 // Open query file and determine file type
98 char path[NAMELEN]; // path of input file (is needed to write full path and file name to HMM FILE record)
99 FILE* inf = NULL;
100 if (strcmp(infile, "stdin") == 0) {
101 inf = stdin;
102 HH_LOG(INFO) << "Reading HMM / multiple alignment from standard input ...\n";
103 path[0] = '\0';
104 }
105 else {
106 inf = fopen(infile, "r");
107 if (!inf)
108 OpenFileError(infile, __FILE__, __LINE__, __func__);
109 Pathname(path, infile);
110 }
111
112 ReadQueryFile(par, inf, input_format, use_global_weights, q, qali, infile, pb, S, Sim);
113
114 fclose(inf);
115 }
116
117 /////////////////////////////////////////////////////////////////////////////////////
118 // Add transition and amino acid pseudocounts to query HMM, calculate aa background etc.
119 /////////////////////////////////////////////////////////////////////////////////////
PrepareQueryHMM(Parameters & par,char & input_format,HMM * q,cs::Pseudocounts<cs::AA> * pc_hhm_context_engine,cs::Admix * pc_hhm_context_mode,const float * pb,const float R[20][20])120 void PrepareQueryHMM(Parameters& par, char& input_format, HMM* q,
121 cs::Pseudocounts<cs::AA>* pc_hhm_context_engine,
122 cs::Admix* pc_hhm_context_mode, const float* pb, const float R[20][20]) {
123 // Was query an HHsearch formatted file or MSA (no pseudocounts added yet)?
124 if (input_format == 0) {
125 // Add transition pseudocounts to query -> q->p[i][a]
126 q->AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg,
127 par.gaph, par.gapi, par.gapb, par.gapb);
128
129 // Compute substitutino matrix pseudocounts?
130 if (par.nocontxt) {
131 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
132 q->PreparePseudocounts(R);
133 // Add amino acid pseudocounts to query: q->p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
134 q->AddAminoAcidPseudocounts(par.pc_hhm_nocontext_mode,
135 par.pc_hhm_nocontext_a, par.pc_hhm_nocontext_b,
136 par.pc_hhm_nocontext_c);
137 }
138 else {
139 // Add context specific pseudocount to query
140 q->AddContextSpecificPseudocounts(pc_hhm_context_engine,
141 pc_hhm_context_mode);
142 }
143 }
144 // or was query a HMMER file? (pseudocounts already added!)
145 else if (input_format == 1) {
146 // Don't add transition pseudocounts to query!!
147 // DON'T ADD amino acid pseudocounts to query: pcm=0! q->p[i][a] = f[i][a]
148 q->AddAminoAcidPseudocounts(0, par.pc_hhm_nocontext_a,
149 par.pc_hhm_nocontext_b, par.pc_hhm_nocontext_c);
150 }
151
152 q->CalculateAminoAcidBackground(pb);
153
154 // if (par.addss==1) CalculateSS(q);
155
156 if (par.columnscore == 5 && !q->divided_by_local_bg_freqs)
157 q->DivideBySqrtOfLocalBackgroundFreqs(
158 par.half_window_size_local_aa_bg_freqs, pb);
159 }
160
161 /////////////////////////////////////////////////////////////////////////////////////
162 // Do precalculations for q and t to prepare comparison
163 /////////////////////////////////////////////////////////////////////////////////////
PrepareTemplateHMM(Parameters & par,HMM * q,HMM * t,int format,float linear_tranistion_probs,const float * pb,const float R[20][20])164 void PrepareTemplateHMM(Parameters& par, HMM* q, HMM* t, int format, float linear_tranistion_probs,
165 const float* pb, const float R[20][20]) {
166 // HHM format
167 if (format == 0) {
168 // Add transition pseudocounts to template
169 t->AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg,
170 par.gaph, par.gapi, par.gapb, par.gapb);
171
172 // Don't use CS-pseudocounts because of runtime!!!
173 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
174 t->PreparePseudocounts(R);
175
176 // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
177 t->AddAminoAcidPseudocounts(par.pc_hhm_nocontext_mode,
178 par.pc_hhm_nocontext_a, par.pc_hhm_nocontext_b, par.pc_hhm_nocontext_c);
179 }
180 // HHMER format
181 else {
182 // Don't add transition pseudocounts to template
183 // t->AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg, par.gaph, par.gapi, 0.0);
184
185 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
186 // t->PreparePseudocounts();
187
188 // DON'T ADD amino acid pseudocounts to temlate: pcm=0! t->p[i][a] = t->f[i][a]
189 t->AddAminoAcidPseudocounts(0, par.pc_hhm_nocontext_a,
190 par.pc_hhm_nocontext_b, par.pc_hhm_nocontext_c);
191 }
192 t->CalculateAminoAcidBackground(pb);
193
194 if (linear_tranistion_probs)
195 t->Log2LinTransitionProbs(1.0);
196
197 // Factor Null model into HMM t
198 // ATTENTION! t->p[i][a] is divided by pnul[a] (for reasons of efficiency) => do not reuse t->p
199 t->IncludeNullModelInHMM(q, t, par.columnscore,
200 par.half_window_size_local_aa_bg_freqs, pb); // Can go BEFORE the loop if not dependent on template
201 }
202
InitializePseudocountsEngine(Parameters & par,cs::ContextLibrary<cs::AA> * & context_lib,cs::Crf<cs::AA> * & crf,cs::Pseudocounts<cs::AA> * & pc_hhm_context_engine,cs::Admix * & pc_hhm_context_mode,cs::Pseudocounts<cs::AA> * & pc_prefilter_context_engine,cs::Admix * & pc_prefilter_context_mode)203 void InitializePseudocountsEngine(Parameters& par,
204 cs::ContextLibrary<cs::AA>*& context_lib, cs::Crf<cs::AA>*& crf,
205 cs::Pseudocounts<cs::AA>*& pc_hhm_context_engine,
206 cs::Admix*& pc_hhm_context_mode,
207 cs::Pseudocounts<cs::AA>*& pc_prefilter_context_engine,
208 cs::Admix*& pc_prefilter_context_mode) {
209 // Prepare pseudocounts engine
210 FILE* fin;
211 char ext[100];
212 if (par.clusterfile != "") {
213 fin = fopen(par.clusterfile.c_str(), "r");
214 if (!fin) {
215 HH_LOG(ERROR) << "Could not open file \'" << par.clusterfile << "\'\n";
216 exit(2);
217 }
218 Extension(ext, par.clusterfile.c_str());
219 } else {
220 fin = fmemopen((void*)context_data_crf, context_data_crf_len, "r");
221 strcpy(ext, "crf");
222 }
223 if (strcmp(ext, "crf") == 0) {
224 // Newer discriminative approach by Angermueller & Soeding Bioinformatics 2012
225 crf = new cs::Crf<cs::AA>(fin);
226 pc_hhm_context_engine = new cs::CrfPseudocounts<cs::AA>(*crf);
227 pc_prefilter_context_engine = new cs::CrfPseudocounts<cs::AA>(*crf);
228 }
229 else {
230 // Older generative approach by Biegert & Soeding PNAS 2009
231 context_lib = new cs::ContextLibrary<cs::AA>(fin);
232 cs::TransformToLog(*context_lib);
233 pc_hhm_context_engine = new cs::LibraryPseudocounts<cs::AA>(*context_lib, par.csw, par.csb);
234 pc_prefilter_context_engine = new cs::LibraryPseudocounts<cs::AA>(*context_lib, par.csw, par.csb);
235 }
236 fclose(fin);
237 pc_hhm_context_engine->SetTargetNeff(par.pc_hhm_context_engine.target_neff);
238 pc_prefilter_context_engine->SetTargetNeff(par.pc_prefilter_context_engine.target_neff);
239
240 // Prepare pseudocounts admixture method
241 pc_hhm_context_mode = par.pc_hhm_context_engine.CreateAdmix();
242 pc_prefilter_context_mode = par.pc_prefilter_context_engine.CreateAdmix();
243 }
244
DeletePseudocountsEngine(cs::ContextLibrary<cs::AA> * context_lib,cs::Crf<cs::AA> * crf,cs::Pseudocounts<cs::AA> * pc_hhm_context_engine,cs::Admix * pc_hhm_context_mode,cs::Pseudocounts<cs::AA> * pc_prefilter_context_engine,cs::Admix * pc_prefilter_context_mode)245 void DeletePseudocountsEngine(cs::ContextLibrary<cs::AA>* context_lib,
246 cs::Crf<cs::AA>* crf, cs::Pseudocounts<cs::AA>* pc_hhm_context_engine,
247 cs::Admix* pc_hhm_context_mode,
248 cs::Pseudocounts<cs::AA>* pc_prefilter_context_engine,
249 cs::Admix* pc_prefilter_context_mode) {
250
251 if (context_lib != NULL)
252 delete context_lib;
253 if (crf != NULL)
254 delete crf;
255 if (pc_hhm_context_engine != NULL)
256 delete pc_hhm_context_engine;
257 if (pc_hhm_context_mode != NULL)
258 delete pc_hhm_context_mode;
259 if (pc_prefilter_context_engine != NULL)
260 delete pc_prefilter_context_engine;
261 if (pc_prefilter_context_mode != NULL)
262 delete pc_prefilter_context_mode;
263 }
264
265
266 /////////////////////////////////////////////////////////////////////////////////////
267 // Read number of sequences in annotation, after second '|'
268 /////////////////////////////////////////////////////////////////////////////////////
SequencesInCluster(char * name)269 int SequencesInCluster(char* name) {
270 int num = 1;
271 char *ptr = strchr(name, '|');
272 if (!strncmp(name, "cl|", 3) || !strncmp(name, "UP20|", 5)
273 || !strncmp(name, "NR20|", 5)) // kClust formatted database (NR20, ...)
274 {
275 if (*ptr == '|')
276 ptr = strchr(ptr, '|');
277 if (*ptr == '|') {
278 num = strint(ptr);
279 if (num < 0)
280 num = 1;
281 }
282 }
283 return num;
284 }
285
286
287