1 /* Sequence weighting algorithms.
2 *
3 * Implementations of ad hoc sequence weighting algorithms for multiple
4 * sequence alignments:
5 * PB weights: Henikoff and Henikoff, JMB 243:574-578, 1994.
6 * GSC weights: Gerstein et al., JMB 236:1067-1078, 1994.
7 * BLOSUM weights: Henikoff and Henikoff, PNAS 89:10915-10919, 1992.
8 *
9 * and also, filtering an alignment to remove "redundant" sequences >=
10 * a given pairwise % identity cutoff, which is sort of a weighting
11 * method.
12 *
13 * The most work has gone into PB weights, the default in HMMER and
14 * Infernal. PB weights are the only one of the weighting schemes that
15 * scales practically to deep alignments.
16 *
17 * Contents:
18 * 1. Position-based (PB) weighting
19 * 2. Optional config, stats collection for advanced PB weighting
20 * 3. Other weighting algorithms (GSC, BLOSUM)
21 * 4. %id filtering
22 * 5. Benchmark
23 * 6. Stats driver
24 * 7. Unit tests
25 * 8. Test driver
26 * 9. Example
27 */
28 #include "esl_config.h"
29
30 #include <math.h>
31 #include <string.h>
32 #include <ctype.h>
33
34 #include "easel.h"
35 #include "esl_alphabet.h"
36 #include "esl_distance.h"
37 #include "esl_dmatrix.h"
38 #include "esl_matrixops.h"
39 #include "esl_msa.h"
40 #include "esl_msacluster.h"
41 #include "esl_quicksort.h"
42 #include "esl_tree.h"
43 #include "esl_vectorops.h"
44
45 #include "esl_msaweight.h"
46
47
48 /*****************************************************************
49 * 1. Position-based (PB) weighting
50 *****************************************************************/
51 /* Mar 2019: PB weighting algorithm revised to use only consensus
52 * columns, not all columns. HMMER benchmarks show improved
53 * discrimination especially with deep alignments. Also optimized
54 * order of access, trading off O(K) -> O(KL) memory in return for
55 * ~40x acceleration. These changes were only introduced for digital
56 * mode alignments; text mode PB algorithm remains as it was.
57 */
58
59 static int consensus_by_rf (const ESL_MSA *msa, int *conscols, int *ret_ncons, ESL_MSAWEIGHT_DAT *dat);
60 static int consensus_by_sample(const ESL_MSAWEIGHT_CFG *cfg, const ESL_MSA *msa, int **ct, int *conscols, int *ret_ncons, ESL_MSAWEIGHT_DAT *dat);
61 static int consensus_by_all (const ESL_MSAWEIGHT_CFG *cfg, const ESL_MSA *msa, int **ct, int *conscols, int *ret_ncons, ESL_MSAWEIGHT_DAT *dat);
62 static int collect_counts (const ESL_MSAWEIGHT_CFG *cfg, const ESL_MSA *msa, const int *conscols, int ncons, int **ct, ESL_MSAWEIGHT_DAT *dat);
63 static int msaweight_PB_txt(ESL_MSA *msa);
64
65 /* Function: esl_msaweight_PB()
66 * Synopsis: PB (position-based) weights.
67 *
68 * Purpose: Given a multiple alignment <msa>, calculate sequence
69 * weights according to the position-based weighting
70 * algorithm (Henikoff and Henikoff, JMB 243:574-578,
71 * 1994). These weights are stored internally in the <msa>
72 * object, replacing any weights that may have already been
73 * there. Weights are $\geq 0$ and they sum to <msa->nseq>.
74 *
75 * The Henikoffs' algorithm is defined for ungapped
76 * alignments. It does not give rules for dealing with
77 * gaps, nor for degenerate residue symbols. The rule here
78 * is to ignore these, and moreover (in digital mode
79 * alignments, in order to deal with deep gappy alignments)
80 * to only consider alignment columns that are "consensus"
81 * (defined below). This means that longer sequences
82 * initially get more weight; hence we do a "double
83 * normalization" in which the weights are first divided by
84 * sequence length in canonical residues (to get the
85 * average weight per residue, to compensate for that
86 * effect), then normalized to sum to nseq.
87
88 * The <msa> may be in either digitized or text mode. The
89 * weighting algorithm is subtly different depending on
90 * this mode. Digital mode is preferred. The digital mode
91 * algorithm handles deep gappy alignments better (it only
92 * considers consensus columns, not all columns), it deals
93 * with degenerate residue symbols (by ignoring them, like
94 * it ignores gaps), and it's much faster. In text mode,
95 * the algorithm simply uses the 26 letters as "residues"
96 * (case-insensitively), and ignores all other residues as
97 * gaps.
98 *
99 * PB weights require $O(NL)$ time and $O(LK)$ memory, for
100 * an alignment of $L$ columns, $N$ sequences, and alphabet
101 * size $K$.
102 *
103 * Consensus columns (for the digital alignment version)
104 * are determined as follows:
105 *
106 * 1. if the MSA has RF consensus column annotation, use it.
107 *
108 * 2. else, use HMMER's rules (fragthresh, symfrac):
109 * - define sequence fragments as those with aligned
110 * lengths where fractional span (l-r+1)/L < fragthresh
111 * for leftmost residue position l and rightmost r;
112 * - count % residue occupancy per column ignoring
113 * external gaps for fragments;
114 * - define consensus columns as those with $\geq$
115 * symfrac residue occupancy.
116 * Defaults are fragthresh = 0.5, symfrac = 0.5.
117 *
118 * 3. if all else fails, use all columns.
119 *
120 * Additionally, at step 2, if the alignment is very deep
121 * (>50000 sequences by default), we attempt to define
122 * consensus columns from a statistical sample (of 10,000
123 * seqs by default). This is a speed optimization. It can
124 * fail in a pathological case where the alignment contains
125 * a ridiculous number of fragments piled up on one
126 * subsequence. (Some DNA repeats can be this way.) If we
127 * get >5000 fragments in the sample (by default), we use
128 * all sequences, not a subsample.
129 *
130 * All the parameters mentioned with default parameters can
131 * be changed using the <esl_msaweight_PB_adv()> "advanced"
132 * version.
133 *
134 * Returns: <eslOK> on success. <msa->wgt[]> now contains weights for
135 * each sequence. <eslMSA_HASWGTS> flag is raised in
136 * <msa->flags>.
137 *
138 * Throws: <eslEMEM> on allocation error, in which case <msa> is
139 * returned unmodified.
140 *
141 * Xref: [Henikoff94b]; squid::weight.c::PositionBasedWeights().
142 */
143 int
esl_msaweight_PB(ESL_MSA * msa)144 esl_msaweight_PB(ESL_MSA *msa)
145 {
146 if (msa->flags & eslMSA_DIGITAL) return esl_msaweight_PB_adv(NULL, msa, NULL);
147 else return msaweight_PB_txt(msa);
148 }
149
150
151 /* Function: esl_msaweight_PB_adv()
152 * Synopsis: Advanced version of PB weights; digital MSAs only.
153 * Incept: SRE, Wed 20 Mar 2019
154 *
155 * Purpose: Same as <esl_msaweight_PB()> but only takes digital-mode
156 * MSAs, and can optionally take customized parameters
157 * <cfg>, and optionally collect data about the computation
158 * in <dat>.
159 *
160 * Args: cfg - optional customized parameters, or NULL to use defaults.
161 * msa - MSA to weight; weights stored in msa->wgt[]
162 * dat - optional data collection, or NULL.
163 *
164 * Returns: <eslOK> on success. <msa->wgt[]> now contains weights for
165 * each sequence. <eslMSA_HASWGTS> flag is raised in
166 * <msa->flags>. <dat>, if provided, contains data about
167 * stuff that happened during the weight computation.
168 *
169 * Throws: (no abnormal error conditions)
170 */
171 int
esl_msaweight_PB_adv(const ESL_MSAWEIGHT_CFG * cfg,ESL_MSA * msa,ESL_MSAWEIGHT_DAT * dat)172 esl_msaweight_PB_adv(const ESL_MSAWEIGHT_CFG *cfg, ESL_MSA *msa, ESL_MSAWEIGHT_DAT *dat)
173 {
174 int ignore_rf = (cfg? cfg->ignore_rf : eslMSAWEIGHT_IGNORE_RF); // default is FALSE: use RF annotation as consensus definition, if RF is present
175 int allow_samp = (cfg? cfg->allow_samp : eslMSAWEIGHT_ALLOW_SAMP); // default is TRUE: allow subsampling speed optimization
176 int sampthresh = (cfg? cfg->sampthresh : eslMSAWEIGHT_SAMPTHRESH); // if nseq > sampthresh, try to determine consensus on a subsample of seqs
177 int **ct = NULL; // matrix of symbol counts in each column. ct[apos=(0).1..alen][a=0..Kp-1]
178 int *r = NULL; // number of different canonical residues used in each consensus column. r[j=0..ncons-1]
179 int *conscols = NULL; // list of consensus column indices [0..ncons-1]
180 int ncons = 0; // number of consensus column indices in <conscols> list
181 int idx, apos, j, a; // indices over sequences, original columns, consensus columns, symbols
182 int rlen; // number of canonical residues in a seq; used for first PB normalization
183 int status = eslOK;
184
185 /* Contract checks & bailouts */
186 ESL_DASSERT1(( msa->nseq >= 1 && msa->alen >= 1));
187 ESL_DASSERT1(( msa->flags & eslMSA_DIGITAL ));
188 if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }
189
190 /* Allocations */
191 ct = esl_mat_ICreate( msa->alen+1, msa->abc->Kp ); // (0).1..alen; 0..Kp-1
192 ESL_ALLOC(conscols, sizeof(int) * msa->alen);
193
194 /* Determine consensus columns early if we can. (ncons stays = 0 if neither way gets used.) */
195 if (! ignore_rf && msa->rf) consensus_by_rf(msa, conscols, &ncons, dat);
196 else if (allow_samp && msa->nseq > sampthresh) consensus_by_sample(cfg, msa, ct, conscols, &ncons, dat);
197
198 /* Collect count matrix ct[apos][a] (either all columns, or if we have consensus already, only consensus columns) */
199 collect_counts(cfg, msa, conscols, ncons, ct, dat);
200
201 /* If we still haven't determined consensus columns yet, do it now, using <ct> */
202 if (! ncons) consensus_by_all(cfg, msa, ct, conscols, &ncons, dat);
203
204 /* If we *still* have no consensus columns, that's pretty pathological -- use 'em all */
205 if (! ncons)
206 {
207 for (apos = 1; apos <= msa->alen; apos++) conscols[apos-1] = apos;
208 ncons = msa->alen;
209 if (dat) dat->cons_allcols = TRUE;
210 }
211
212 /* Count how many different canonical residues are used in each consensus column: r[j] */
213 ESL_ALLOC(r, sizeof(int) * ncons);
214 esl_vec_ISet(r, ncons, 0);
215 for (j = 0; j < ncons; j++)
216 {
217 apos = conscols[j];
218 for (a = 0; a < msa->abc->K; a++)
219 if (ct[apos][a] > 0) r[j]++;
220 }
221
222 /* Bump sequence weights using PB weighting rule */
223 esl_vec_DSet(msa->wgt, msa->nseq, 0.0);
224 for (idx = 0; idx < msa->nseq; idx++)
225 {
226 rlen = 0;
227 for (j = 0; j < ncons; j++)
228 {
229 apos = conscols[j];
230 a = msa->ax[idx][apos];
231 msa->wgt[idx] += (a >= msa->abc->K ? 0. : 1. / (double) (r[j] * ct[apos][a])); // <= This is the PB weight rule.
232 rlen += (a >= msa->abc->K ? 0 : 1); // (ternary is faster than an if)
233 }
234 if (rlen > 0) msa->wgt[idx] /= (double) rlen; // first normalization, by unaligned seq length
235 }
236
237 /* Normalize weights to sum to N */
238 esl_vec_DNorm(msa->wgt, msa->nseq);
239 esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);
240 msa->flags |= eslMSA_HASWGTS;
241
242 ERROR:
243 esl_mat_IDestroy(ct);
244 free(r);
245 if (dat) dat->ncons = ncons;
246 if (dat) dat->conscols = conscols; else free(conscols);
247 return status;
248 }
249
250 /* consensus_by_rf()
251 * Use RF annotation to define consensus columns.
252 *
253 * <conscols> is allocated for up to <msa->alen> indices of consensus columns.
254 * Upon return, it contains a list of indices, each 1..alen, and
255 * <ret_ncons> is the length of the list (the number of consensus cols).
256 * (1-based, not 0-, because it's a consensus for a digital mode alignment).
257 *
258 * Returns <eslOK> on success.
259 */
260 static int
consensus_by_rf(const ESL_MSA * msa,int * conscols,int * ret_ncons,ESL_MSAWEIGHT_DAT * dat)261 consensus_by_rf(const ESL_MSA *msa, int *conscols, int *ret_ncons, ESL_MSAWEIGHT_DAT *dat)
262 {
263 int ncons = 0;
264 int apos;
265
266 for (apos = 1; apos <= msa->alen; apos++)
267 {
268 if (esl_abc_CIsGap(msa->abc, msa->rf[apos-1])) continue;
269 conscols[ncons] = apos;
270 ncons++;
271 }
272 if (dat) dat->cons_by_rf = TRUE;
273 *ret_ncons = ncons;
274 return eslOK;
275 }
276
277 /* consensus_by_sample()
278 * Use a statistical sample of seqs to define consensus columns.
279 *
280 * On deep alignments, it's usually overkill to look at all sequences
281 * just to determine which columns to call consensus. We can speed the
282 * decision up by looking at a subsample instead.
283 *
284 * We want results to be reproducible; by default, the random number
285 * generator (used for taking the sample) uses a fixed seed.
286 *
287 * We have to watch out for a pathological case where there could be a
288 * ridiculous number of fragments piled up on just one piece of the
289 * alignment. This can happen with some DNA repeat elements, for
290 * example. If the sample contains too many fragments, reject this
291 * approach, the caller will have to determine consensus on all
292 * sequences instead.
293 *
294 * In: cfg - optional configuration options, or NULL to use all defaults
295 * msa - MSA to determine consensus for
296 * ct - allocated space for [(0).1..alen][0..Kp-1] observed symbol counts; contents irrelevant
297 * conscols - allocated space for up to <alen> consensus column indices
298 *
299 * Out: ct - [apos=1..alen][a=0..Kp-1] are counts of observed symbol <a> in column <alen> in sample. [0][] are all 0's.
300 * conscols - [0..ncons-1] list of consensus column indices 1..alen
301 * ncons - number of consensus columns in <conscols>, or 0 if consensus determination was rejected or failed
302 *
303 * Returns <eslOK> on success.
304 *
305 * Returns <eslFAIL> if there were too many fragments. Now <ct> still has the counts
306 * observed in the sample, but <conscols> has no valid indices and <ncons> is 0.
307 *
308 * Throws <eslEMEM> on allocation failure.
309 */
310 static int
consensus_by_sample(const ESL_MSAWEIGHT_CFG * cfg,const ESL_MSA * msa,int ** ct,int * conscols,int * ret_ncons,ESL_MSAWEIGHT_DAT * dat)311 consensus_by_sample(const ESL_MSAWEIGHT_CFG *cfg, const ESL_MSA *msa, int **ct, int *conscols, int *ret_ncons, ESL_MSAWEIGHT_DAT *dat)
312 {
313 float fragthresh = (cfg? cfg->fragthresh : eslMSAWEIGHT_FRAGTHRESH);
314 float symfrac = (cfg? cfg->symfrac : eslMSAWEIGHT_SYMFRAC);
315 int nsamp = (cfg? cfg->nsamp : eslMSAWEIGHT_NSAMP);
316 int maxfrag = (cfg? cfg->maxfrag : eslMSAWEIGHT_MAXFRAG);
317 ESL_RAND64 *rng = (cfg? esl_rand64_Create(cfg->seed) : esl_rand64_Create(eslMSAWEIGHT_RNGSEED)); // fixed seed for default reproducibility
318 int64_t *sampidx = NULL;
319 int nfrag = 0; // number of fragments in sample
320 int ncons = 0; // number of consensus columns defined
321 int tot; // total # of residues+gaps in a column
322 int minspan;
323 int apos, a;
324 int lpos, rpos;
325 int i, idx;
326 int status = eslOK;
327
328 ESL_ALLOC(sampidx, sizeof(int64_t) * nsamp);
329 esl_mat_ISet(ct, msa->alen+1, msa->abc->Kp, 0);
330 if (dat) dat->seed = esl_rand64_GetSeed(rng);
331
332 esl_rand64_Deal(rng, nsamp, (int64_t) msa->nseq, sampidx); // <sampidx> is now an ordered list of <nsamp> indices in range 0..nseq-1
333
334 minspan = (int) ceil( fragthresh * (float) msa->alen ); // define alispan as aligned length from first to last non-gap. If alispan < minspan, define seq as a fragment
335 for (i = 0; i < nsamp; i++)
336 {
337 idx = (int) sampidx[i];
338
339 for (lpos = 1; lpos <= msa->alen; lpos++) if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][lpos])) break;
340 for (rpos = msa->alen; rpos >= 1; rpos--) if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][rpos])) break;
341 if (rpos - lpos + 1 < minspan) nfrag++; else { lpos = 1; rpos = msa->alen; }
342
343 for (apos = lpos; apos <= rpos; apos++)
344 ct[apos][msa->ax[idx][apos]]++;
345 }
346
347 if (dat) dat->samp_nfrag = nfrag;
348
349 if (nfrag <= maxfrag)
350 {
351 for (apos = 1; apos <= msa->alen; apos++)
352 {
353 for (tot = 0, a = 0; a < msa->abc->Kp-2; a++) tot += ct[apos][a]; // i.e. <tot> symbols over esl_abc_XIsResidue() || esl_abc_XIsGap(); inclusive of degeneracies, exclusive of missing | nonresidue.
354 if ( ((float) ct[apos][msa->abc->K] / (float) tot) < symfrac) // This is the rule for determining a consensus column: if the frequency of residues/(residues+gaps) >= symfrac
355 conscols[ncons++] = apos;
356 }
357 if (dat) dat->cons_by_sample = TRUE;
358 }
359 else
360 {
361 if (dat) dat->rejected_sample = TRUE;
362 status = eslFAIL;
363 }
364
365 ERROR:
366 free(sampidx);
367 esl_rand64_Destroy(rng);
368 *ret_ncons = ncons; // will be 0 if we saw too many fragments.
369 return status;
370 }
371
372
373 /* consensus_by_all()
374 * Use counts from all sequences to determine consensus.
375 *
376 * If we weren't able to define a consensus before collecting observed
377 * counts in just those columns, then we had to collect observed
378 * counts on all columns, and now we define the consensus.
379 *
380 * Because we do this from counts, not the MSA, <msa> here is only
381 * used for dimensions <alen> and <Kp>.
382 *
383 * Returns <eslOK> on success.
384 *
385 * Obscure C note: <ct> is input only here, so it might look like
386 * you can declare it <const int **ct>, but don't. <const> syntax
387 * is arcane and confusing with double indirection.
388 */
389 static int
consensus_by_all(const ESL_MSAWEIGHT_CFG * cfg,const ESL_MSA * msa,int ** ct,int * conscols,int * ret_ncons,ESL_MSAWEIGHT_DAT * dat)390 consensus_by_all(const ESL_MSAWEIGHT_CFG *cfg, const ESL_MSA *msa, int **ct, int *conscols, int *ret_ncons, ESL_MSAWEIGHT_DAT *dat)
391 {
392 float symfrac = (cfg? cfg->symfrac : eslMSAWEIGHT_SYMFRAC);
393 int ncons = 0;
394 int apos;
395 int tot;
396 int a;
397
398 for (apos = 1; apos <= msa->alen; apos++)
399 {
400 tot = 0;
401 for (a = 0; a < msa->abc->Kp-2; a++) // i.e. over esl_abc_XIsResidue() || esl_abc_XIsGap()
402 tot += ct[apos][a];
403 if ( ((float) ct[apos][msa->abc->K] / (float) tot) < symfrac)
404 conscols[ncons++] = apos;
405 }
406 if (dat) dat->cons_by_all = TRUE;
407 *ret_ncons = ncons;
408 return eslOK;
409 }
410
411
412 /* collect_counts()
413 * Collect a matrix of observed symbol counts in each column: ct[apos=1..alen][a=0..Kp-1].
414 * - use HMMER's fragment-definition rule, and don't count external gaps of fragments.
415 * - This matrix is O(KL) for alignment length L, alphabet size K.
416 * It's responsible for the asymptotic memory complexity of PB weights.
417 * - "What if I've already marked fragments", you say (maybe in hmmbuild), "can I
418 * save some time here?" Not really. Even if you know a seq is a fragment, you'd
419 * need to run the lpos and rpos loops to find its start/end.
420 * - If we already know what the consensus columns are, only collect counts in them,
421 * leaving counts in nonconsensus columns zero. This is a time optimization.
422 */
423 static int
collect_counts(const ESL_MSAWEIGHT_CFG * cfg,const ESL_MSA * msa,const int * conscols,int ncons,int ** ct,ESL_MSAWEIGHT_DAT * dat)424 collect_counts(const ESL_MSAWEIGHT_CFG *cfg, const ESL_MSA *msa, const int *conscols, int ncons, int **ct, ESL_MSAWEIGHT_DAT *dat)
425 {
426 float fragthresh = (cfg? cfg->fragthresh : eslMSAWEIGHT_FRAGTHRESH); // seq is fragment if (length from 1st to last aligned residue)/alen < fragthresh (i.e. span < minspan)
427 int minspan = (int) ceil( fragthresh * (float) msa->alen ); // precalculated span length threshold using <fragthresh>
428 int lpos, rpos; // leftmost, rightmost aligned residue (1..alen)
429 int idx, apos, j;
430
431
432 esl_mat_ISet(ct, msa->alen+1, msa->abc->Kp, 0);
433 for (idx = 0; idx < msa->nseq; idx++)
434 {
435 // HMMER mark_fragments() rule. Count "span" from first to last aligned residue. If alispan/alen < fragthresh, it's a fragment.
436 for (lpos = 1; lpos <= msa->alen; lpos++) if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][lpos])) break;
437 for (rpos = msa->alen; rpos >= 1; rpos--) if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][rpos])) break;
438 // L=0 seq or alen=0? then lpos == msa->alen+1, rpos == 0 => lpos > rpos. rpos-lpos-1 <= 0 but test below still works.
439 if (rpos - lpos + 1 >= minspan) { lpos = 1; rpos = msa->alen; } else if (dat) dat->all_nfrag++; // full len seqs count cols 1..alen; fragments only count lpos..rpos.
440
441 if (ncons) // if we have consensus columns already, only count symbols in those columns (faster)...
442 {
443 for (j = 0; j < ncons && conscols[j] <= rpos; j++)
444 {
445 apos = conscols[j];
446 if (apos < lpos) continue;
447 ct[apos][msa->ax[idx][apos]]++;
448 }
449 }
450 else // ... else, count symbols in all columns.
451 {
452 for (apos = lpos; apos <= rpos; apos++)
453 ct[apos][msa->ax[idx][apos]]++;
454 }
455 }
456 return eslOK;
457 }
458
459
460 /* msaweight_PB_txt()
461 * PB weighting for text-mode alignment.
462 *
463 * All columns are counted. Symbols [a-zA-Z] are counted as
464 * "residues"; anything else is ignored.
465 *
466 * O(LN) time for alignment length L, number of sequences N.
467 * O(K) memory for K=26 residues.
468 *
469 * You should use digital mode PB weights if at all possible. They're
470 * better (with improved methods for gappy alignments), faster
471 * (because the implementation is optimized for access patterns), and
472 * have more options for customization and monitoring (via the
473 * ESL_MSAWEIGHT_CFG and ESL_MSAWEIGHT_STATS optional objects).
474 */
475 static int
msaweight_PB_txt(ESL_MSA * msa)476 msaweight_PB_txt(ESL_MSA *msa)
477 {
478 int *ct = NULL; // counts of each residue observed in a column
479 int K = 26; // alphabet size. In text mode, we count [a-zA-z] as "residues"
480 int r; // number of different symbols observed in a column
481 int idx, apos, a, rlen;
482 int status = eslOK;
483
484 ESL_DASSERT1( (msa->nseq >= 1) );
485 ESL_DASSERT1( (msa->alen >= 1) );
486 ESL_DASSERT1( (! (msa->flags & eslMSA_DIGITAL )));
487 if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }
488
489 ESL_ALLOC(ct, sizeof(int) * K);
490 esl_vec_DSet(msa->wgt, msa->nseq, 0.0);
491
492 for (apos = 0; apos < msa->alen; apos++)
493 {
494 /* Collect # of letters A..Z (case insensitively) in this column, and total */
495 esl_vec_ISet(ct, K, 0.);
496 for (idx = 0; idx < msa->nseq; idx++)
497 if (isalpha((int) msa->aseq[idx][apos]))
498 ct[toupper((int) msa->aseq[idx][apos]) - 'A'] ++;
499 for (r = 0, a = 0; a < K; a++) if (ct[a] > 0) r++;
500
501 /* Bump weight on each seq by PB rule */
502 if (r > 0) {
503 for (idx = 0; idx < msa->nseq; idx++) {
504 if (isalpha((int) msa->aseq[idx][apos]))
505 msa->wgt[idx] += 1. /
506 (double) (r * ct[toupper((int) msa->aseq[idx][apos]) - 'A'] );
507 }
508 }
509 }
510
511 /* first normalization by # of residues counted in each seq */
512 for (idx = 0; idx < msa->nseq; idx++) {
513 for (rlen = 0, apos = 0; apos < msa->alen; apos++)
514 if (isalpha((int) msa->aseq[idx][apos])) rlen++;
515 if (rlen > 0) msa->wgt[idx] /= (double) rlen;
516 /* if rlen == 0 for this seq, its weight is still 0.0, as initialized. */
517 }
518
519 /* Make weights normalize up to nseq, and return. In pathological
520 * case where all wgts were 0 (no seqs contain any unambiguous
521 * residues), weights become 1.0.
522 */
523 esl_vec_DNorm(msa->wgt, msa->nseq);
524 esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);
525 msa->flags |= eslMSA_HASWGTS;
526
527 ERROR:
528 free(ct);
529 return status;
530 }
531
532
533
534
535
536 /*****************************************************************
537 * 2. Optional config, stats collection for advanced PB weighting
538 *****************************************************************/
539
540 /* Function: esl_msaweight_cfg_Create()
541 * Synopsis: Create config options structure for PB weights, %id filter
542 * Incept: SRE, Thu 21 Mar 2019 [Drive By Truckers, Guns of Umpqua]
543 */
544 ESL_MSAWEIGHT_CFG *
esl_msaweight_cfg_Create(void)545 esl_msaweight_cfg_Create(void)
546 {
547 ESL_MSAWEIGHT_CFG *cfg = NULL;
548 int status;
549
550 ESL_ALLOC(cfg, sizeof(ESL_MSAWEIGHT_CFG));
551
552 cfg->fragthresh = eslMSAWEIGHT_FRAGTHRESH;
553 cfg->symfrac = eslMSAWEIGHT_SYMFRAC;
554 cfg->ignore_rf = eslMSAWEIGHT_IGNORE_RF;
555 cfg->allow_samp = eslMSAWEIGHT_ALLOW_SAMP;
556 cfg->sampthresh = eslMSAWEIGHT_SAMPTHRESH;
557 cfg->nsamp = eslMSAWEIGHT_NSAMP;
558 cfg->maxfrag = eslMSAWEIGHT_MAXFRAG;
559 cfg->seed = eslMSAWEIGHT_RNGSEED;
560 cfg->filterpref = eslMSAWEIGHT_FILT_CONSCOVER;
561
562 ERROR:
563 return cfg;
564 }
565
566 /* Function: esl_msaweight_cfg_Destroy()
567 * Synopsis: Destroy an <ESL_MSAWEIGHT_CFG>
568 */
569 void
esl_msaweight_cfg_Destroy(ESL_MSAWEIGHT_CFG * cfg)570 esl_msaweight_cfg_Destroy(ESL_MSAWEIGHT_CFG *cfg)
571 {
572 free(cfg);
573 }
574
575
576 /* Function: esl_msaweight_dat_Create()
577 * Synopsis: Create data collection structure for PB weighting
578 * Incept: SRE, Thu 21 Mar 2019
579 */
580 ESL_MSAWEIGHT_DAT *
esl_msaweight_dat_Create(void)581 esl_msaweight_dat_Create(void)
582 {
583 ESL_MSAWEIGHT_DAT *dat = NULL;
584 int status;
585
586 ESL_ALLOC(dat, sizeof(ESL_MSAWEIGHT_DAT));
587
588 dat->seed = eslMSAWEIGHT_RNGSEED;
589
590 dat->cons_by_rf = FALSE;
591 dat->cons_by_sample = FALSE;
592 dat->cons_by_all = FALSE;
593 dat->cons_allcols = FALSE;
594 dat->rejected_sample = FALSE;
595
596 dat->ncons = 0;
597 dat->conscols = NULL;
598
599 dat->all_nfrag = 0;
600 dat->samp_nfrag = 0;
601
602 ERROR:
603 return dat;
604 }
605
606 /* Function: esl_msaweight_dat_Reuse()
607 * Synopsis: Reuse an <ESL_MSAWEIGHT_DAT> structure.
608 */
609 int
esl_msaweight_dat_Reuse(ESL_MSAWEIGHT_DAT * dat)610 esl_msaweight_dat_Reuse(ESL_MSAWEIGHT_DAT *dat)
611 {
612 if (dat->conscols) { free(dat->conscols); }
613 dat->seed = eslMSAWEIGHT_RNGSEED;
614 dat->cons_by_rf = FALSE;
615 dat->cons_by_sample = FALSE;
616 dat->cons_by_all = FALSE;
617 dat->cons_allcols = FALSE;
618 dat->rejected_sample = FALSE;
619 dat->ncons = 0;
620 dat->conscols = NULL;
621 dat->all_nfrag = 0;
622 dat->samp_nfrag = 0;
623 return eslOK;
624 }
625
626
627 /* Function: esl_msaweight_dat_Destroy()
628 * Synopsis: Free an <ESL_MSAWEIGHT_DAT> structure.
629 */
630 void
esl_msaweight_dat_Destroy(ESL_MSAWEIGHT_DAT * dat)631 esl_msaweight_dat_Destroy(ESL_MSAWEIGHT_DAT *dat)
632 {
633 if (dat)
634 {
635 if (dat->conscols) free(dat->conscols);
636 free(dat);
637 }
638 }
639
640
641 /*****************************************************************
642 * 3. Other weighting algorithms
643 *****************************************************************/
644
645 /* Function: esl_msaweight_GSC()
646 * Synopsis: GSC weights.
647 * Incept: SRE, Fri Nov 3 13:31:14 2006 [Janelia]
648 *
649 * Purpose: Given a multiple sequence alignment <msa>, calculate
650 * sequence weights according to the
651 * Gerstein/Sonnhammer/Chothia algorithm. These weights
652 * are stored internally in the <msa> object, replacing
653 * any weights that may have already been there. Weights
654 * are $\geq 0$ and they sum to <msa->nseq>.
655 *
656 * The <msa> may be in either digitized or text mode.
657 * Digital mode is preferred, so that distance calculations
658 * used by the GSC algorithm are robust against degenerate
659 * residue symbols.
660 *
661 * This is an implementation of Gerstein et al., "A method to
662 * weight protein sequences to correct for unequal
663 * representation", JMB 236:1067-1078, 1994.
664 *
665 * The algorithm is $O(N^2)$ memory (it requires a pairwise
666 * distance matrix) and $O(N^3 + LN^2)$ time ($N^3$ for a UPGMA
667 * tree building step, $LN^2$ for distance matrix construction)
668 * for an alignment of N sequences and L columns.
669 *
670 * In the current implementation, the actual memory
671 * requirement is dominated by two full NxN distance
672 * matrices (one tmp copy in UPGMA, and one here): for
673 * 8-byte doubles, that's $16N^2$ bytes. To keep the
674 * calculation under memory limits, don't process large
675 * alignments: max 1400 sequences for 32 MB, max 4000
676 * sequences for 256 MB, max 8000 seqs for 1 GB. Watch
677 * out, because Pfam alignments can easily blow this up.
678 *
679 * Note: Memory usage could be improved. UPGMA consumes a distance
680 * matrix, but that can be D itself, not a copy, if the
681 * caller doesn't mind the destruction of D. Also, D is
682 * symmetrical, so we could use upper or lower triangular
683 * matrices if we rewrote dmatrix to allow them.
684 *
685 * I also think UPGMA can be reduced to O(N^2) time, by
686 * being more tricky about rapidly identifying the minimum
687 * element: could keep min of each row, and update that,
688 * I think.
689 *
690 * Returns: <eslOK> on success, and the weights inside <msa> have been
691 * modified.
692 *
693 * Throws: <eslEINVAL> if the alignment data are somehow invalid and
694 * distance matrices can't be calculated. <eslEMEM> on an
695 * allocation error. In either case, the original <msa> is
696 * left unmodified.
697 *
698 * Xref: [Gerstein94]; squid::weight.c::GSCWeights(); STL11/81.
699 */
700 int
esl_msaweight_GSC(ESL_MSA * msa)701 esl_msaweight_GSC(ESL_MSA *msa)
702 {
703 ESL_DMATRIX *D = NULL; /* distance matrix */
704 ESL_TREE *T = NULL; /* UPGMA tree */
705 double *x = NULL; /* storage per node, 0..N-2 */
706 double lw, rw; /* total branchlen on left, right subtrees */
707 double lx, rx; /* distribution of weight to left, right side */
708 int i; /* counter over nodes */
709 int status;
710
711 /* Contract checks
712 */
713 ESL_DASSERT1( (msa != NULL) );
714 ESL_DASSERT1( (msa->nseq >= 1) );
715 ESL_DASSERT1( (msa->alen >= 1) );
716 ESL_DASSERT1( (msa->wgt != NULL) );
717 if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }
718
719 /* GSC weights use a rooted tree with "branch lengths" calculated by
720 * UPGMA on a fractional difference matrix - pretty crude.
721 */
722 if (! (msa->flags & eslMSA_DIGITAL))
723 {
724 if ((status = esl_dst_CDiffMx(msa->aseq, msa->nseq, &D)) != eslOK) goto ERROR;
725 }
726 else
727 {
728 if ((status = esl_dst_XDiffMx(msa->abc, msa->ax, msa->nseq, &D)) != eslOK) goto ERROR;
729 }
730
731 /* oi, look out here. UPGMA is correct, but old squid library uses
732 * single linkage, so for regression tests ONLY, we use single link.
733 */
734 #ifdef eslMSAWEIGHT_REGRESSION
735 if ((status = esl_tree_SingleLinkage(D, &T)) != eslOK) goto ERROR;
736 #else
737 if ((status = esl_tree_UPGMA(D, &T)) != eslOK) goto ERROR;
738 #endif
739 esl_tree_SetCladesizes(T);
740
741 ESL_ALLOC(x, sizeof(double) * (T->N-1));
742
743 /* Postorder traverse (leaves to root) to calculate the total branch
744 * length under each internal node; store this in x[]. Remember the
745 * total branch length (x[0]) for a future sanity check.
746 */
747 for (i = T->N-2; i >= 0; i--)
748 {
749 x[i] = T->ld[i] + T->rd[i];
750 if (T->left[i] > 0) x[i] += x[T->left[i]];
751 if (T->right[i] > 0) x[i] += x[T->right[i]];
752 }
753
754 /* Preorder traverse (root to leaves) to calculate the weights. Now
755 * we use x[] to mean, the total weight *above* this node that we will
756 * apportion to the node's left and right children. The two
757 * meanings of x[] never cross: every x[] beneath x[i] is still a
758 * total branch length.
759 *
760 * Because the API guarantees that msa is returned unmodified in case
761 * of an exception, and we're touching msa->wgt here, no exceptions
762 * may be thrown from now on in this function.
763 */
764 x[0] = 0; /* initialize: no branch to the root. */
765 for (i = 0; i <= T->N-2; i++)
766 {
767 lw = T->ld[i]; if (T->left[i] > 0) lw += x[T->left[i]];
768 rw = T->rd[i]; if (T->right[i] > 0) rw += x[T->right[i]];
769
770 if (lw+rw == 0.)
771 {
772 /* A special case arises in GSC weights when all branch lengths in a subtree are 0.
773 * In this case, all seqs in this clade should get equal weights, sharing x[i] equally.
774 * So, split x[i] in proportion to cladesize, not to branch weight.
775 */
776 if (T->left[i] > 0) lx = x[i] * ((double) T->cladesize[T->left[i]] / (double) T->cladesize[i]);
777 else lx = x[i] / (double) T->cladesize[i];
778
779 if (T->right[i] > 0) rx = x[i] * ((double) T->cladesize[T->right[i]] / (double) T->cladesize[i]);
780 else rx = x[i] / (double) T->cladesize[i];
781 }
782 else /* normal case: x[i] split in proportion to branch weight. */
783 {
784 lx = x[i] * lw/(lw+rw);
785 rx = x[i] * rw/(lw+rw);
786 }
787
788 if (T->left[i] <= 0) msa->wgt[-(T->left[i])] = lx + T->ld[i];
789 else x[T->left[i]] = lx + T->ld[i];
790
791 if (T->right[i] <= 0) msa->wgt[-(T->right[i])] = rx + T->rd[i];
792 else x[T->right[i]] = rx + T->rd[i];
793 }
794
795 /* Renormalize weights to sum to N.
796 */
797 esl_vec_DNorm(msa->wgt, msa->nseq);
798 esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);
799 msa->flags |= eslMSA_HASWGTS;
800
801 free(x);
802 esl_tree_Destroy(T);
803 esl_dmatrix_Destroy(D);
804 return eslOK;
805
806 ERROR:
807 if (x != NULL) free(x);
808 if (T != NULL) esl_tree_Destroy(T);
809 if (D != NULL) esl_dmatrix_Destroy(D);
810 return status;
811 }
812
813
814
815 /* Function: esl_msaweight_BLOSUM()
816 * Synopsis: BLOSUM weights.
817 * Incept: SRE, Sun Nov 5 09:52:41 2006 [Janelia]
818 *
819 * Purpose: Given a multiple sequence alignment <msa> and an identity
820 * threshold <maxid>, calculate sequence weights using the
821 * BLOSUM algorithm (Henikoff and Henikoff, PNAS
822 * 89:10915-10919, 1992). These weights are stored
823 * internally in the <msa> object, replacing any weights
824 * that may have already been there. Weights are $\geq 0$
825 * and they sum to <msa->nseq>.
826 *
827 * The algorithm does a single linkage clustering by
828 * fractional id, defines clusters such that no two clusters
829 * have a pairwise link $\geq$ <maxid>), and assigns
830 * weights of $\frac{1}{M_i}$ to each of the $M_i$
831 * sequences in each cluster $i$. The <maxid> threshold
832 * is a fractional pairwise identity, in the range
833 * $0..1$.
834 *
835 * The <msa> may be in either digitized or text mode.
836 * Digital mode is preferred, so that the pairwise identity
837 * calculations deal with degenerate residue symbols
838 * properly.
839 *
840 * Returns: <eslOK> on success, and the weights inside <msa> have been
841 * modified.
842 *
843 * Throws: <eslEMEM> on allocation error. <eslEINVAL> if a pairwise
844 * identity calculation fails because of corrupted sequence
845 * data. In either case, the <msa> is unmodified.
846 *
847 * Xref: [Henikoff92]; squid::weight.c::BlosumWeights().
848 */
849 int
esl_msaweight_BLOSUM(ESL_MSA * msa,double maxid)850 esl_msaweight_BLOSUM(ESL_MSA *msa, double maxid)
851 {
852 int *c = NULL; /* cluster assignments for each sequence */
853 int *nmem = NULL; /* number of seqs in each cluster */
854 int nc; /* number of clusters */
855 int i; /* loop counter */
856 int status;
857
858 /* Contract checks
859 */
860 ESL_DASSERT1( (maxid >= 0. && maxid <= 1.) );
861 ESL_DASSERT1( (msa->nseq >= 1) );
862 ESL_DASSERT1( (msa->alen >= 1) );
863 if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }
864
865 if ((status = esl_msacluster_SingleLinkage(msa, maxid, &c, NULL, &nc)) != eslOK) goto ERROR;
866 ESL_ALLOC(nmem, sizeof(int) * nc);
867 esl_vec_ISet(nmem, nc, 0);
868 for (i = 0; i < msa->nseq; i++) nmem[c[i]]++;
869 for (i = 0; i < msa->nseq; i++) msa->wgt[i] = 1. / (double) nmem[c[i]];
870
871 /* Make weights normalize up to nseq, and return.
872 */
873 esl_vec_DNorm(msa->wgt, msa->nseq);
874 esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);
875 msa->flags |= eslMSA_HASWGTS;
876
877 free(nmem);
878 free(c);
879 return eslOK;
880
881 ERROR:
882 if (c != NULL) free(c);
883 if (nmem != NULL) free(nmem);
884 return status;
885 }
886
887
888
889 /*****************************************************************
890 * 4. %id filtering
891 *****************************************************************/
892 /* Mar 2019: After revising the PB weighting algorithm, I also went
893 * ahead and revised ER's %id filtering along related lines, because TAJ
894 * coincidentally complained about how filtering could keep a fragment
895 * and throw out a closer-to-consensus sequence.
896 */
897
898 /* In addition to some of the internal functions used by PB weighting above... */
899 static int sort_doubles_decreasing(const void *data, int e1, int e2);
900 static int set_preference_conscover(const ESL_MSA *msa, const int *conscols, int ncons, double *sortwgt);
901 static int set_preference_randomly(const ESL_MSAWEIGHT_CFG *cfg, int nseq, double *sortwgt);
902 static int set_preference_origorder(int nseq, double *sortwgt);
903 static int msaweight_IDFilter_txt(const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa);
904
905 /* Function: esl_msaweight_IDFilter()
906 * Synopsis: Filter by %ID.
907 * Incept: ER, Wed Oct 29 10:06:43 2008 [Janelia]
908 *
909 * Purpose: Remove sequences in <msa> that are $\geq$ <maxid>
910 * fractional identity to another aligned sequence. Return
911 * the filtered alignment in <*ret_newmsa>. The original
912 * <msa> is not changed.
913 *
914 * When a pair of sequences has $\geq$ <maxid> fractional
915 * identity, the default rule for which of them we keep
916 * differs depending on whether the <msa> is in digital or
917 * text mode. We typically manipulate biosequence
918 * alignments in digital mode. In digital mode, the default
919 * rule (called "conscover") is to prefer the sequence with
920 * an alignment span that covers more consensus columns.
921 * (Where "span" is defined by the column coordinates of
922 * the leftmost and rightmost aligned residues of the
923 * sequence; fragment definition rules also use the concept
924 * of "span".) In the simpler text mode, the rule is to
925 * keep the earlier sequence and discard the later, in the
926 * order the seqs come in the alignment. The conscover rule
927 * is designed to avoid keeping fragments, while trying not
928 * to bias the insertion/deletion statistics of the
929 * filtered alignment. To use different preference rules in
930 * digital mode alignments, see the more customizable
931 * <esl_msaweight_IDFilter_adv()>.
932 *
933 * Fractional identity is calculated by
934 * <esl_dst_{CX}PairID()>, which defines the denominator of
935 * the calculation as MIN(rlen1, rlen2).
936 *
937 * Unparsed Stockholm markup is not propagated into the new
938 * filtered alignment.
939 *
940 * Return: <eslOK> on success, and <*ret_newmsa> is the filtered
941 * alignment.
942 *
943 * Throws: <eslEMEM> on allocation error. <eslEINVAL> if a pairwise
944 * identity calculation fails because of corrupted sequence
945 * data. In either case, the <msa> is unmodified.
946 *
947 * Xref: squid::weight.c::FilterAlignment().
948 */
949 int
esl_msaweight_IDFilter(const ESL_MSA * msa,double maxid,ESL_MSA ** ret_newmsa)950 esl_msaweight_IDFilter(const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa)
951 {
952 if (msa->flags & eslMSA_DIGITAL) return esl_msaweight_IDFilter_adv(NULL, msa, maxid, ret_newmsa);
953 else return msaweight_IDFilter_txt ( msa, maxid, ret_newmsa);
954 }
955
956
957 /* Function: esl_msaweight_IDFilter_adv()
958 * Synopsis: Advanced version of %id filtering; digital MSAs only.
959 * Incept: SRE, Sat 30 Mar 2019
960 *
961 * Purpose: Same as <esl_msaweight_IDFilter()> but customizable with
962 * optional custom parameters in <cfg>. Requires digital
963 * mode alignment.
964 *
965 * In particular, <cfg> can be used to select two other
966 * rules for sequence preference, besides the default
967 * "conscover" rule: random preference, or preferring the
968 * lower-index sequence (same as the rule used in text mode
969 * alignments).
970 *
971 * For the "conscover" rule, consensus column determination
972 * can be customized the same way as in PB weighting.
973 */
974 int
esl_msaweight_IDFilter_adv(const ESL_MSAWEIGHT_CFG * cfg,const ESL_MSA * msa,double maxid,ESL_MSA ** ret_newmsa)975 esl_msaweight_IDFilter_adv(const ESL_MSAWEIGHT_CFG *cfg, const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa)
976 {
977 int ignore_rf = (cfg? cfg->ignore_rf : eslMSAWEIGHT_IGNORE_RF); // default is FALSE: use RF annotation as consensus definition, if RF is present
978 int allow_samp = (cfg? cfg->allow_samp : eslMSAWEIGHT_ALLOW_SAMP); // default is TRUE: allow subsampling speed optimization
979 int sampthresh = (cfg? cfg->sampthresh : eslMSAWEIGHT_SAMPTHRESH); // if nseq > sampthresh, try to determine consensus on a subsample of seqs
980 int filterpref = (cfg? cfg->filterpref : eslMSAWEIGHT_FILT_CONSCOVER); // default preference rule is "conscover"
981 int **ct = NULL; // matrix of symbol counts in each column. ct[apos=(0).1..alen][a=0..Kp-1]
982 int *conscols = NULL; // list of consensus column indices [0..ncons-1]
983 double *sortwgt = NULL; // when pair of seqs is >= maxid, retain seq w/ higher <sortwgt>
984 int *ranked_at = NULL; // sorted seq indices 0..nseq-1
985 int *list = NULL; // array of seq indices in new msa
986 int *useme = NULL; // useme[i] is TRUE if seq[i] is kept in new msa
987 int ncons = 0; // number of consensus column indices in <conscols> list
988 int nnew = 0; // how many seqs have been added to <list> and <useme> so far
989 int apos, r, i; // index over columns, ranked seq indices, seqs in <list>
990 double ident; // pairwise fractional id (0..1)
991 int status = eslOK;
992
993 ESL_DASSERT1(( msa->nseq >= 1 && msa->alen >= 1));
994 ESL_DASSERT1(( msa->flags & eslMSA_DIGITAL ));
995
996 /* Allocations that we always need*/
997 ESL_ALLOC(sortwgt, sizeof(double) * msa->nseq);
998 ESL_ALLOC(ranked_at, sizeof(double) * msa->nseq);
999 ESL_ALLOC(list, sizeof(int) * msa->nseq);
1000 ESL_ALLOC(useme, sizeof(int) * msa->nseq);
1001 esl_vec_ISet(useme, msa->nseq, 0); /* initialize array */
1002
1003 /* Determine consensus columns with same procedure as advanced PB weights.
1004 * 1. If RF annotation is provided, use it.
1005 * (...unless cfg->ignore_rf has been set TRUE)
1006 * 2. If it's a deep alignment, subsample and apply fragthresh/symfrac rules to the sample.
1007 * (...unless cfg->allow_samp has been set FALSE)
1008 * 3. Otherwise, use standard rule on all sequences:
1009 * collect observed counts in <ct> (applying fragthresh rule) and
1010 * determine consensus (applying symfrac rule)
1011 * 4. If all else fails, define all columns as consensus.
1012 *
1013 * We only need to do this if we're using the default "conscover" rule.
1014 */
1015 if (filterpref == eslMSAWEIGHT_FILT_CONSCOVER)
1016 {
1017 /* allocations only needed for consensus determination */
1018 ct = esl_mat_ICreate( msa->alen+1, msa->abc->Kp ); // (0).1..alen; 0..Kp-1
1019 ESL_ALLOC(conscols, sizeof(int) * msa->alen);
1020
1021 if (! ignore_rf && msa->rf) consensus_by_rf(msa, conscols, &ncons, NULL);
1022 else if (allow_samp && msa->nseq > sampthresh) consensus_by_sample(cfg, msa, ct, conscols, &ncons, NULL);
1023 else {
1024 collect_counts(cfg, msa, conscols, ncons, ct, NULL);
1025 consensus_by_all(cfg, msa, ct, conscols, &ncons, NULL);
1026 }
1027 if (!ncons) {
1028 for (apos = 1; apos <= msa->alen; apos++) conscols[apos-1] = apos;
1029 ncons = msa->alen;
1030 }
1031 }
1032
1033 /* Assign preferences for which sequences to retain.
1034 * Higher <sortwgt> means higher preference.
1035 */
1036 if (filterpref == eslMSAWEIGHT_FILT_CONSCOVER) set_preference_conscover(msa, conscols, ncons, sortwgt);
1037 else if (filterpref == eslMSAWEIGHT_FILT_RANDOM) set_preference_randomly (cfg, msa->nseq, sortwgt);
1038 else if (filterpref == eslMSAWEIGHT_FILT_ORIGORDER) set_preference_origorder( msa->nseq, sortwgt);
1039 else esl_fatal("bad filterpref");
1040
1041 /* Sort sequence indices by these preferences.
1042 * <ranked_at[]> is a ranked list of sequence indices 0..nseq-1.
1043 */
1044 esl_quicksort(sortwgt, msa->nseq, sort_doubles_decreasing, ranked_at);
1045
1046 /* Determine which seqs will be kept, favoring highest ranked ones.
1047 */
1048 for (r = 0; r < msa->nseq; r++) // Try to include each sequence, starting with highest ranked ones
1049 {
1050 for (i = 0; i < nnew; i++) // Test it against all the seqs we've already put in the list.
1051 {
1052 if ((status = esl_dst_XPairId(msa->abc, msa->ax[ranked_at[r]], msa->ax[list[i]], &ident, NULL, NULL)) != eslOK) goto ERROR;
1053 if (ident >= maxid) break;
1054 }
1055
1056 if (i == nnew) // if the seq made it past comparisons with all <nnew> current seqs in the list...
1057 {
1058 list[nnew++] = ranked_at[r]; // ... add it to the growing list.
1059 useme[ranked_at[r]] = TRUE;
1060 }
1061 }
1062
1063 /* Filter the input MSA.
1064 */
1065 if ((status = esl_msa_SequenceSubset(msa, useme, ret_newmsa)) != eslOK) goto ERROR;
1066
1067 ERROR:
1068 free(useme);
1069 free(list);
1070 free(ranked_at);
1071 free(sortwgt);
1072 free(conscols);
1073 esl_mat_IDestroy(ct);
1074 return status;
1075 }
1076
1077
1078
1079 /* Sorting function for the esl_quicksort() in the
1080 * esl_msaweight_IDFilter_adv() implementation below:
1081 * rank seqs in order of decreasing sortwgt.
1082 */
1083 static int
sort_doubles_decreasing(const void * data,int e1,int e2)1084 sort_doubles_decreasing(const void *data, int e1, int e2)
1085 {
1086 double *sortwgt = (double *) data;
1087 if (sortwgt[e1] > sortwgt[e2]) return -1;
1088 if (sortwgt[e1] < sortwgt[e2]) return 1;
1089 return 0;
1090 }
1091
1092 /* Preference-setting function for which seqs we prefer to retain. The
1093 * default "conscover" rule counts how many consensus columns are
1094 * covered by the "span" of this sequence from its first to last
1095 * residue.
1096 * - set lpos, rpos to the column coord (1..alen) of the leftmost,
1097 * rightmost residue in this seq;
1098 * - count how many consensus columns are covered by that span.
1099 *
1100 * This is not the same as counting how many consensus columns have
1101 * a residue in them. We're trying not to bias (too much) against
1102 * deletions when we select representative sequences.
1103 */
1104 static int
set_preference_conscover(const ESL_MSA * msa,const int * conscols,int ncons,double * sortwgt)1105 set_preference_conscover(const ESL_MSA *msa, const int *conscols, int ncons, double *sortwgt)
1106 {
1107 int idx, apos, lpos, rpos, j;
1108
1109 for (idx = 0; idx < msa->nseq; idx++)
1110 {
1111 for (lpos = 1; lpos <= msa->alen; lpos++) if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][lpos])) break;
1112 for (rpos = msa->alen; rpos >= 1; rpos--) if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][rpos])) break;
1113
1114 sortwgt[idx] = 0.;
1115 for (j = 0; j < ncons && conscols[j] <= rpos; j++)
1116 {
1117 apos = conscols[j];
1118 if (apos < lpos) continue;
1119 sortwgt[idx] += 1.;
1120 }
1121 }
1122 return eslOK;
1123 }
1124
1125 /* Preference-setting function where we assign random preference.
1126 */
1127 static int
set_preference_randomly(const ESL_MSAWEIGHT_CFG * cfg,int nseq,double * sortwgt)1128 set_preference_randomly(const ESL_MSAWEIGHT_CFG *cfg, int nseq, double *sortwgt)
1129 {
1130 ESL_RAND64 *rng = (cfg? esl_rand64_Create(cfg->seed) : esl_rand64_Create(eslMSAWEIGHT_RNGSEED));
1131 int idx;
1132
1133 for (idx = 0; idx < nseq; idx++)
1134 sortwgt[idx] = esl_rand64_double(rng);
1135
1136 esl_rand64_Destroy(rng);
1137 return eslOK;
1138 }
1139
1140 /* Preference-setting function that favors the earlier sequence
1141 * in the alignment; same as the rule used for text mode alignments.
1142 */
1143 static int
set_preference_origorder(int nseq,double * sortwgt)1144 set_preference_origorder(int nseq, double *sortwgt)
1145 {
1146 int idx;
1147
1148 for (idx = 0; idx < nseq; idx++)
1149 sortwgt[idx] = (double) (nseq - idx); // sets them to nseq..1
1150 return eslOK;
1151 }
1152
1153
1154 /* msaweight_IDFilter_txt()
1155 * %id filtering for text-mode MSAs.
1156 */
1157 static int
msaweight_IDFilter_txt(const ESL_MSA * msa,double maxid,ESL_MSA ** ret_newmsa)1158 msaweight_IDFilter_txt(const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa)
1159 {
1160 int *list = NULL; /* array of seqs in new msa */
1161 int *useme = NULL; /* TRUE if seq is kept in new msa */
1162 int nnew; /* number of seqs in new alignment */
1163 double ident; /* pairwise percentage id */
1164 int i,j; /* seqs counters*/
1165 int remove; /* TRUE if sq is to be removed */
1166 int status;
1167
1168 ESL_DASSERT1(( msa ));
1169 ESL_DASSERT1(( msa->nseq >= 1 ));
1170 ESL_DASSERT1(( msa->alen >= 1 ));
1171 ESL_DASSERT1(( ! (msa->flags & eslMSA_DIGITAL )));
1172
1173 /* allocate */
1174 ESL_ALLOC(list, sizeof(int) * msa->nseq);
1175 ESL_ALLOC(useme, sizeof(int) * msa->nseq);
1176 esl_vec_ISet(useme, msa->nseq, 0); /* initialize array */
1177
1178 /* find which seqs to keep (list) */
1179 nnew = 0;
1180 for (i = 0; i < msa->nseq; i++)
1181 {
1182 remove = FALSE;
1183 for (j = 0; j < nnew; j++)
1184 {
1185 if (! (msa->flags & eslMSA_DIGITAL)) {
1186 if ((status = esl_dst_CPairId(msa->aseq[i], msa->aseq[list[j]], &ident, NULL, NULL)) != eslOK) goto ERROR;
1187 }
1188 else {
1189 if ((status = esl_dst_XPairId(msa->abc, msa->ax[i], msa->ax[list[j]], &ident, NULL, NULL)) != eslOK) goto ERROR;
1190 }
1191
1192 if (ident >= maxid)
1193 {
1194 remove = TRUE;
1195 break;
1196 }
1197 }
1198 if (remove == FALSE) {
1199 list[nnew++] = i;
1200 useme[i] = TRUE;
1201 }
1202 }
1203 if ((status = esl_msa_SequenceSubset(msa, useme, ret_newmsa)) != eslOK) goto ERROR;
1204
1205 free(list);
1206 free(useme);
1207 return eslOK;
1208
1209 ERROR:
1210 if (list != NULL) free(list);
1211 if (useme != NULL) free(useme);
1212 return status;
1213 }
1214
1215
1216
1217 /*****************************************************************
1218 * 5. Benchmark
1219 *****************************************************************/
1220 #ifdef eslMSAWEIGHT_BENCHMARK
1221 /* gcc -g -Wall -o benchmark -I. -L. -DeslMSAWEIGHT_BENCHMARK esl_msaweight.c -leasel -lm
1222 * ./benchmark <MSA file>
1223 *
1224 * Script for benchmarks on Pfam:
1225 * ./benchmark --gsc --maxN 4000 /misc/data0/databases/Pfam/Pfam-A.full
1226 * ./benchmark --blosum /misc/data0/databases/Pfam/Pfam-A.full
1227 * ./benchmark --pb /misc/data0/databases/Pfam/Pfam-A.full
1228 */
1229 #include "easel.h"
1230 #include "esl_getopts.h"
1231 #include "esl_msa.h"
1232 #include "esl_msafile.h"
1233 #include "esl_msaweight.h"
1234 #include "esl_vectorops.h"
1235 #include "esl_stopwatch.h"
1236
1237 #define WGROUP "--blosum,--gsc,--pb"
1238
1239 static ESL_OPTIONS options[] = {
1240 /* name type deflt env rng togs req incmpt help docgrp */
1241 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0 },
1242 { "--blosum", eslARG_NONE, FALSE, NULL, NULL, WGROUP, NULL, NULL, "use BLOSUM weights", 0 },
1243 { "--gsc", eslARG_NONE,"default",NULL,NULL, WGROUP, NULL, NULL, "use GSC weights", 0 },
1244 { "--pb", eslARG_NONE, FALSE, NULL, NULL, WGROUP, NULL, NULL, "use position-based weights", 0 },
1245 { "--id", eslARG_REAL, "0.62", NULL,"0<=x<=1",NULL,"--blosum",NULL, "id threshold for --blosum", 0 },
1246 { "--maxN", eslARG_INT, "0", NULL,"n>=0", NULL, NULL, NULL, "skip alignments w/ > <n> seqs", 0 },
1247 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1248 };
1249
1250 static char usage[] = "Usage: ./benchmark [-options] <msa_file>";
1251
1252 int
main(int argc,char ** argv)1253 main(int argc, char **argv)
1254 {
1255 ESL_STOPWATCH *w;
1256 ESL_GETOPTS *go;
1257 char *msafile;
1258 ESL_MSAFILE *afp;
1259 ESL_MSA *msa;
1260 int do_gsc;
1261 int do_pb;
1262 int do_blosum;
1263 int maxN;
1264 double maxid;
1265 double cpu;
1266 int status;
1267
1268 /* Process command line
1269 */
1270 go = esl_getopts_Create(options);
1271 if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("failed to parse cmd line: %s", go->errbuf);
1272 if (esl_opt_VerifyConfig(go) != eslOK) esl_fatal("failed to parse cmd line: %s", go->errbuf);
1273 if (esl_opt_GetBoolean(go, "-h") == TRUE) {
1274 puts(usage);
1275 puts("\n where options are:");
1276 esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */
1277 return 0;
1278 }
1279 do_blosum = esl_opt_GetBoolean(go, "--blosum");
1280 do_gsc = esl_opt_GetBoolean(go, "--gsc");
1281 do_pb = esl_opt_GetBoolean(go, "--pb");
1282 maxid = esl_opt_GetReal (go, "--id");
1283 maxN = esl_opt_GetInteger(go, "--maxN");
1284 if (esl_opt_ArgNumber(go) != 1) {
1285 puts("Incorrect number of command line arguments.");
1286 puts(usage);
1287 return 1;
1288 }
1289 if ((msafile = esl_opt_GetArg(go, 1)) == NULL) esl_fatal("failed to parse cmd line: %s", go->errbuf);
1290 esl_getopts_Destroy(go);
1291
1292 w = esl_stopwatch_Create();
1293
1294 /* Weight one or more alignments from input file
1295 */
1296 if ((status = esl_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK)
1297 esl_msafile_OpenFailure(afp, status);
1298
1299 while ( (status = esl_msafile_Read(afp, &msa)) != eslEOF)
1300 {
1301 if (status != eslOK) esl_msafile_ReadFailure(afp, status);
1302 if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; }
1303
1304 esl_stopwatch_Start(w);
1305
1306 if (do_gsc) esl_msaweight_GSC(msa);
1307 else if (do_pb) esl_msaweight_PB(msa);
1308 else if (do_blosum) esl_msaweight_BLOSUM(msa, maxid);
1309
1310 esl_stopwatch_Stop(w);
1311 cpu = w->user;
1312 printf("%-20s %6d %6d %.3f\n", msa->name, msa->alen, msa->nseq, cpu);
1313 esl_msa_Destroy(msa);
1314 }
1315 esl_msafile_Close(afp);
1316
1317 esl_stopwatch_Destroy(w);
1318 return eslOK;
1319 }
1320 #endif /* eslMSAWEIGHT_BENCHMARK */
1321 /*-------------------- end, benchmark --------------------------*/
1322
1323
1324
1325 /*****************************************************************
1326 * 6. Statistics driver
1327 *****************************************************************/
1328 #ifdef eslMSAWEIGHT_STATS
1329 /* gcc -g -Wall -o stats -I. -L. -DeslMSAWEIGHT_STATS esl_msaweight.c -leasel -lm
1330 * ./stats <MSA file>
1331 *
1332 * Script for weight statistics on Pfam:
1333 * ./stats --gsc --maxN 4000 /misc/data0/databases/Pfam/Pfam-A.full
1334 * ./stats --blosum /misc/data0/databases/Pfam/Pfam-A.full
1335 * ./stats --pb /misc/data0/databases/Pfam/Pfam-A.full
1336 */
1337 #include "easel.h"
1338 #include "esl_getopts.h"
1339 #include "esl_msa.h"
1340 #include "esl_msafile.h"
1341 #include "esl_msaweight.h"
1342 #include "esl_vectorops.h"
1343
1344 #define WGROUP "--blosum,--gsc,--pb"
1345
1346 static ESL_OPTIONS options[] = {
1347 /* name type deflt env rng togs req incmpt help docgrp */
1348 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0 },
1349 { "--blosum", eslARG_NONE, FALSE, NULL, NULL, WGROUP, NULL, NULL, "use BLOSUM weights", 0 },
1350 { "--gsc", eslARG_NONE,"default",NULL,NULL, WGROUP, NULL, NULL, "use GSC weights", 0 },
1351 { "--pb", eslARG_NONE, FALSE, NULL, NULL, WGROUP, NULL, NULL, "use position-based weights", 0 },
1352 { "--id", eslARG_REAL, "0.62", NULL,"0<=x<=1",NULL,"--blosum",NULL, "id threshold for --blosum", 0 },
1353 { "--maxN", eslARG_INT, "0", NULL,"n>=0", NULL, NULL, NULL, "skip alignments w/ > <n> seqs", 0 },
1354 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1355 };
1356
1357 static char usage[] = "Usage: ./stats [-options] <msa_file>";
1358
1359 int
main(int argc,char ** argv)1360 main(int argc, char **argv)
1361 {
1362 ESL_GETOPTS *go;
1363 char *msafile;
1364 ESL_MSAFILE *afp;
1365 ESL_MSA *msa;
1366 int do_gsc;
1367 int do_pb;
1368 int do_blosum;
1369 int maxN;
1370 double maxid;
1371 int nsmall, nbig;
1372 int i;
1373 int status;
1374
1375 /* Process command line */
1376 go = esl_getopts_Create(options);
1377 if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("%s", go->errbuf);
1378 if (esl_opt_VerifyConfig(go) != eslOK) esl_fatal("%s", go->errbuf);
1379 if (esl_opt_GetBoolean(go, "-h") == TRUE){
1380 puts(usage);
1381 puts("\n where options are:");
1382 esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */
1383 return 0;
1384 }
1385 do_blosum = esl_opt_GetBoolean(go, "--blosum");
1386 do_gsc = esl_opt_GetBoolean(go, "--gsc");
1387 do_pb = esl_opt_GetBoolean(go, "--pb");
1388 maxid = esl_opt_GetReal (go, "--id");
1389 maxN = esl_opt_GetInteger(go, "--maxN");
1390 if (esl_opt_ArgNumber(go) != 1) {
1391 puts("Incorrect number of command line arguments.");
1392 puts(usage);
1393 return 1;
1394 }
1395 if ((msafile = esl_opt_GetArg(go, 1)) == NULL) esl_fatal("%s", go->errbuf);
1396 esl_getopts_Destroy(go);
1397
1398 /* Weight one or more alignments from input file
1399 */
1400 if ((status = esl_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK)
1401 esl_msafile_OpenFailure(afp, status);
1402
1403 while ( (status = esl_msafile_Read(afp, &msa)) != eslEOF)
1404 {
1405 if (status != eslOK) esl_msafile_ReadFailure(afp, status);
1406 if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; }
1407
1408 if (do_gsc) esl_msaweight_GSC(msa);
1409 else if (do_pb) esl_msaweight_PB(msa);
1410 else if (do_blosum) esl_msaweight_BLOSUM(msa, maxid);
1411
1412 for (nsmall = 0, nbig = 0, i = 0; i < msa->nseq; i++) {
1413 if (msa->wgt[i] < 0.2) nsmall++;
1414 if (msa->wgt[i] > 5.0) nbig++;
1415 }
1416
1417 printf("%-20s %5d %5d %8.4f %8.4f %5d %5d\n",
1418 msa->name,
1419 msa->nseq,
1420 msa->alen,
1421 esl_vec_DMin(msa->wgt, msa->nseq),
1422 esl_vec_DMax(msa->wgt, msa->nseq),
1423 nsmall,
1424 nbig);
1425 esl_msa_Destroy(msa);
1426 }
1427 esl_msafile_Close(afp);
1428 return eslOK;
1429 }
1430 #endif /* eslMSAWEIGHT_STATS */
1431 /*---------------- end, statistics driver ----------------------*/
1432
1433
1434
1435
1436 /*****************************************************************
1437 * 7. Unit tests
1438 *****************************************************************/
1439 #ifdef eslMSAWEIGHT_TESTDRIVE
1440
1441 #include "esl_msafile.h"
1442
1443 /* GSC weighting test on text-mode alignment <msa>, where we expect
1444 * the weights to be <expect[0]..expect[nseq-1]>.
1445 *
1446 * If <abc> is non-NULL, digitize alignment and run test again in
1447 * digital alignment mode.
1448 *
1449 * On failure, caller utest provides <msg> as the error msg to print.
1450 */
1451 static void
do_GSC(ESL_ALPHABET * abc,ESL_MSA * msa,double * expect,char * msg)1452 do_GSC(ESL_ALPHABET *abc, ESL_MSA *msa, double *expect, char *msg)
1453 {
1454 /* text mode */
1455 if (esl_msaweight_GSC(msa) != eslOK) esl_fatal(msg);
1456 if (esl_vec_DCompare(msa->wgt, expect, msa->nseq, 0.001) != eslOK) esl_fatal(msg);
1457
1458 /* digital mode */
1459 if (abc)
1460 {
1461 if (esl_msa_Digitize(abc, msa, NULL) != eslOK) esl_fatal(msg);
1462 if (esl_msaweight_GSC(msa) != eslOK) esl_fatal(msg);
1463 if (esl_vec_DCompare(msa->wgt, expect, msa->nseq, 0.001) != eslOK) esl_fatal(msg);
1464 if (esl_msa_Textize(msa) != eslOK) esl_fatal(msg);
1465 }
1466 }
1467
1468 /* As above, but for default PB weights */
1469 static void
do_PB(ESL_ALPHABET * abc,ESL_MSA * msa,double * expect,char * msg)1470 do_PB(ESL_ALPHABET *abc, ESL_MSA *msa, double *expect, char *msg)
1471 {
1472 if (esl_msaweight_PB(msa) != eslOK) esl_fatal(msg);
1473 if (esl_vec_DCompare(msa->wgt, expect, msa->nseq, 0.001) != eslOK) esl_fatal(msg);
1474
1475 if (abc)
1476 {
1477 if (esl_msa_Digitize(abc, msa, NULL) != eslOK) esl_fatal(msg);
1478 if (esl_msaweight_PB(msa) != eslOK) esl_fatal(msg);
1479 if (esl_vec_DCompare(msa->wgt, expect, msa->nseq, 0.001) != eslOK) esl_fatal(msg);
1480 if (esl_msa_Textize(msa) != eslOK) esl_fatal(msg);
1481 }
1482 }
1483
1484 /* As above, but for BLOSUM weights with fractional identity threshold <maxid> */
1485 static void
do_BLOSUM(ESL_ALPHABET * abc,ESL_MSA * msa,double maxid,double * expect,char * msg)1486 do_BLOSUM(ESL_ALPHABET *abc, ESL_MSA *msa, double maxid, double *expect, char *msg)
1487 {
1488 if (esl_msaweight_BLOSUM(msa, maxid) != eslOK) esl_fatal(msg);
1489 if (esl_vec_DCompare(msa->wgt, expect, msa->nseq, 0.001) != eslOK) esl_fatal(msg);
1490
1491 if (abc)
1492 {
1493 if (esl_msa_Digitize(abc, msa, NULL) != eslOK) esl_fatal(msg);
1494 if (esl_msaweight_BLOSUM(msa, maxid) != eslOK) esl_fatal(msg);
1495 if (esl_vec_DCompare(msa->wgt, expect, msa->nseq, 0.001) != eslOK) esl_fatal(msg);
1496 if (esl_msa_Textize(msa) != eslOK) esl_fatal(msg);
1497 }
1498 }
1499
1500 /* utest_wgt_identical_seqs
1501 * For all identical sequences, weights are uniform, all 1.0.
1502 */
1503 static void
utest_identical_seqs(void)1504 utest_identical_seqs(void)
1505 {
1506 char msg[] = "identical_seqs test failed";
1507 ESL_MSA *msa = esl_msa_CreateFromString("# STOCKHOLM 1.0\n\nseq1 AAAAA\nseq2 AAAAA\nseq3 AAAAA\nseq4 AAAAA\nseq5 AAAAA\n//\n", eslMSAFILE_STOCKHOLM);
1508 ESL_ALPHABET *aa_abc = esl_alphabet_Create(eslAMINO);
1509 ESL_ALPHABET *nt_abc = esl_alphabet_Create(eslDNA);
1510 double uniform[5] = { 1.0, 1.0, 1.0, 1.0, 1.0 };
1511
1512 do_GSC (aa_abc, msa, uniform, msg);
1513 do_GSC (nt_abc, msa, uniform, msg);
1514 do_PB (aa_abc, msa, uniform, msg);
1515 do_PB (nt_abc, msa, uniform, msg);
1516 do_BLOSUM(aa_abc, msa, 0.62, uniform, msg);
1517 do_BLOSUM(nt_abc, msa, 0.62, uniform, msg);
1518
1519 esl_alphabet_Destroy(nt_abc);
1520 esl_alphabet_Destroy(aa_abc);
1521 esl_msa_Destroy(msa);
1522 }
1523
1524 /* The "contrived" example of [Henikoff94b].
1525 * "Correct" solution is 1==2, 3==4, 5==2x others.
1526 * PB, GSC, BLOSUM weights all agree.
1527 */
1528 static void
utest_henikoff_contrived(void)1529 utest_henikoff_contrived(void)
1530 {
1531 char msg[] = "henikoff_contrived test failed";
1532 ESL_MSA *msa = esl_msa_CreateFromString("# STOCKHOLM 1.0\n\nseq1 AAAAA\nseq2 AAAAA\nseq3 CCCCC\nseq4 CCCCC\nseq5 TTTTT\n//\n", eslMSAFILE_STOCKHOLM);
1533 ESL_ALPHABET *aa_abc = esl_alphabet_Create(eslAMINO);
1534 ESL_ALPHABET *nt_abc = esl_alphabet_Create(eslDNA);
1535 double ewgt[5] = { 0.833333, 0.833333, 0.833333, 0.833333, 1.66667 };
1536
1537 do_GSC (aa_abc, msa, ewgt, msg);
1538 do_GSC (nt_abc, msa, ewgt, msg);
1539 do_PB (aa_abc, msa, ewgt, msg);
1540 do_PB (nt_abc, msa, ewgt, msg);
1541 do_BLOSUM(aa_abc, msa, 0.62, ewgt, msg);
1542 do_BLOSUM(nt_abc, msa, 0.62, ewgt, msg);
1543
1544 esl_alphabet_Destroy(nt_abc);
1545 esl_alphabet_Destroy(aa_abc);
1546 esl_msa_Destroy(msa);
1547 }
1548
1549 /* The "nitrogenase segments" example of [Henikoff94b].
1550 * PB, GSC, BLOSUM wgts differ on this example.
1551 */
1552 static void
utest_nitrogenase(void)1553 utest_nitrogenase(void)
1554 {
1555 char msg[] = "nitrogenase test failed";
1556 ESL_MSA *msa = esl_msa_CreateFromString("# STOCKHOLM 1.0\n\nNIFE_CLOPA GYVGS\nNIFD_AZOVI GFDGF\nNIFD_BRAJA GYDGF\nNIFK_ANASP GYQGG\n//\n", eslMSAFILE_STOCKHOLM);
1557 ESL_ALPHABET *aa_abc = esl_alphabet_Create(eslAMINO);
1558 double egsc[4] = { 1.125000, 0.875000, 0.875000, 1.125000 };
1559 double epb[4] = { 1.066667, 1.066667, 0.800000, 1.066667 };
1560 double eblo[4] = { 1.333333, 0.666667, 0.666667, 1.333333 };
1561
1562 do_GSC (aa_abc, msa, egsc, msg);
1563 do_PB (aa_abc, msa, epb, msg);
1564 do_BLOSUM(aa_abc, msa, 0.62, eblo, msg);
1565
1566 esl_alphabet_Destroy(aa_abc);
1567 esl_msa_Destroy(msa);
1568 }
1569
1570 /* gerstein4 test:
1571 * alignment that makes the same distances as Figure 4 from [Gerstein94].
1572 */
1573 static void
utest_gerstein4(void)1574 utest_gerstein4(void)
1575 {
1576 char msg[] = "gerstein4 test failed";
1577 ESL_MSA *msa = esl_msa_CreateFromString("# STOCKHOLM 1.0\n\nA AAAAAAAAAA\nB TTAAAAAAAA\nC ATAAAACCCC\nD GGGAAGGGGG\n//\n", eslMSAFILE_STOCKHOLM);
1578 ESL_ALPHABET *aa_abc = esl_alphabet_Create(eslAMINO);
1579 ESL_ALPHABET *nt_abc = esl_alphabet_Create(eslDNA);
1580 double egsc[4] = { 0.760870, 0.760870, 1.086957, 1.391304 };
1581 double epb[4] = { 0.800000, 0.800000, 1.000000, 1.400000 };
1582 double eblo[4] = { 0.666667, 0.666667, 1.333333, 1.333333 };
1583 double uni[4] = { 1.0, 1.0, 1.0, 1.0 };
1584
1585 do_GSC (aa_abc, msa, egsc, msg);
1586 do_GSC (nt_abc, msa, egsc, msg);
1587 do_PB (aa_abc, msa, epb, msg);
1588 do_PB (nt_abc, msa, epb, msg);
1589 do_BLOSUM(aa_abc, msa, 0.62, eblo, msg);
1590 do_BLOSUM(nt_abc, msa, 0.62, eblo, msg);
1591
1592 /* BLOSUM weights have the peculiar property of going flat at maxid=0.0
1593 * (when everyone clusters) or maxid=1.0 (nobody clusters). Test that here.
1594 */
1595 do_BLOSUM(aa_abc, msa, 0.0, uni, msg);
1596 do_BLOSUM(aa_abc, msa, 1.0, uni, msg);
1597
1598 esl_alphabet_Destroy(nt_abc);
1599 esl_alphabet_Destroy(aa_abc);
1600 esl_msa_Destroy(msa);
1601 }
1602
1603 /* pathologs test
1604 * A gappy alignment where no seqs overlap, and weighting methods
1605 * have to resort to uniform weights.
1606 */
1607 static void
utest_pathologs(void)1608 utest_pathologs(void)
1609 {
1610 char msg[] = "pathologs test failed";
1611 ESL_MSA *msa = esl_msa_CreateFromString("# STOCKHOLM 1.0\n\nA A----\nB -C---\nC --G--\nD ---T-\nE ----T\n//\n", eslMSAFILE_STOCKHOLM);
1612 ESL_ALPHABET *aa_abc = esl_alphabet_Create(eslAMINO);
1613 ESL_ALPHABET *nt_abc = esl_alphabet_Create(eslDNA);
1614 double uni[5] = { 1.0, 1.0, 1.0, 1.0, 1.0 };
1615
1616 do_GSC (aa_abc, msa, uni, msg);
1617 do_GSC (nt_abc, msa, uni, msg);
1618 do_PB (aa_abc, msa, uni, msg);
1619 do_PB (nt_abc, msa, uni, msg);
1620 do_BLOSUM(aa_abc, msa, 0.62, uni, msg);
1621 do_BLOSUM(nt_abc, msa, 0.62, uni, msg);
1622
1623 esl_alphabet_Destroy(nt_abc);
1624 esl_alphabet_Destroy(aa_abc);
1625 esl_msa_Destroy(msa);
1626 }
1627
1628
1629 /* Simple test of the %id filter.
1630 * seq1 and seq2 are 100% identical, but seq1 is shorter.
1631 * seq2 is preferred in the default conscover rule;
1632 * seq1 is preferred in the origorder rule;
1633 * either seq1 or seq2 will be selected with the random rule.
1634 */
1635 static void
utest_idfilter(void)1636 utest_idfilter(void)
1637 {
1638 char msg[] = "idfilter test failed";
1639 ESL_MSAWEIGHT_CFG *cfg = esl_msaweight_cfg_Create();
1640 ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO);
1641 ESL_MSA *msa = esl_msa_CreateFromString("# STOCKHOLM 1.0\n\nseq1 ..AAAAAAAA\nseq2 AAAAAAAAAA\nseq3 CCCCCCCCCC\nseq4 GGGGGGGGGG\n//\n", eslMSAFILE_STOCKHOLM);
1642 ESL_MSA *msa2 = NULL;
1643
1644 /* Default text mode rule is origorder */
1645 if (esl_msaweight_IDFilter(msa, 1.0, &msa2) != eslOK) esl_fatal(msg);
1646 if (msa2->nseq != 3 || strcmp(msa2->sqname[0], "seq1") != 0) esl_fatal(msg);
1647 esl_msa_Destroy(msa2);
1648
1649 /* Default digital mode rule is conscover */
1650 if (esl_msa_Digitize(abc, msa, NULL) != eslOK) esl_fatal(msg);
1651 if (esl_msaweight_IDFilter(msa, 1.0, &msa2) != eslOK) esl_fatal(msg);
1652 if (msa2->nseq != 3 || strcmp(msa2->sqname[0], "seq2") != 0) esl_fatal(msg);
1653 esl_msa_Destroy(msa2);
1654
1655 /* Ditto with _adv too */
1656 if (esl_msaweight_IDFilter_adv(cfg, msa, 1.0, &msa2) != eslOK) esl_fatal(msg);
1657 if (msa2->nseq != 3 || strcmp(msa2->sqname[0], "seq2") != 0) esl_fatal(msg);
1658 esl_msa_Destroy(msa2);
1659
1660 /* Custom setting _adv to origorder */
1661 cfg->filterpref = eslMSAWEIGHT_FILT_ORIGORDER;
1662 if (esl_msaweight_IDFilter_adv(cfg, msa, 1.0, &msa2) != eslOK) esl_fatal(msg);
1663 if (msa2->nseq != 3 || strcmp(msa2->sqname[0], "seq1") != 0) esl_fatal(msg);
1664 esl_msa_Destroy(msa2);
1665
1666 /* Custom setting _adv to randorder */
1667 cfg->filterpref = eslMSAWEIGHT_FILT_RANDOM;
1668 if (esl_msaweight_IDFilter_adv(cfg, msa, 1.0, &msa2) != eslOK) esl_fatal(msg);
1669 if (msa2->nseq != 3) esl_fatal(msg);
1670 if (strcmp(msa2->sqname[0], "seq1") != 0 && strcmp(msa2->sqname[0], "seq2") != 0) esl_fatal(msg);
1671 esl_msa_Destroy(msa2);
1672
1673 esl_msaweight_cfg_Destroy(cfg);
1674 esl_alphabet_Destroy(abc);
1675 esl_msa_Destroy(msa);
1676 }
1677
1678 #endif /*eslMSAWEIGHT_TESTDRIVE*/
1679 /*-------------------- end, unit tests -------------------------*/
1680
1681
1682
1683 /*****************************************************************
1684 * 8. Test driver
1685 *****************************************************************/
1686 #ifdef eslMSAWEIGHT_TESTDRIVE
1687
1688 #include "easel.h"
1689 #include "esl_getopts.h"
1690 #include "esl_msa.h"
1691 #include "esl_msafile.h"
1692 #include "esl_msaweight.h"
1693
1694 static ESL_OPTIONS options[] = {
1695 /* name type default env range toggles reqs incomp help docgroup*/
1696 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
1697 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1698 };
1699 static char usage[] = "[-options]";
1700 static char banner[] = "test driver for msaweight module";
1701
1702 int
main(int argc,char ** argv)1703 main(int argc, char **argv)
1704 {
1705 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
1706
1707 fprintf(stderr, "## %s\n", argv[0]);
1708
1709 utest_identical_seqs();
1710 utest_henikoff_contrived();
1711 utest_nitrogenase();
1712 utest_gerstein4();
1713 utest_pathologs();
1714
1715 utest_idfilter();
1716
1717 fprintf(stderr, "# status = ok\n");
1718
1719 esl_getopts_Destroy(go);
1720 exit(0);
1721 }
1722 #endif /*eslMSAWEIGHT_TESTDRIVE*/
1723 /*-------------------- end, test driver -------------------------*/
1724
1725
1726
1727
1728
1729 /*****************************************************************
1730 * 9. Example
1731 *****************************************************************/
1732 #ifdef eslMSAWEIGHT_EXAMPLE
1733
1734 #include "easel.h"
1735 #include "esl_getopts.h"
1736 #include "esl_msa.h"
1737 #include "esl_msafile.h"
1738 #include "esl_msaweight.h"
1739
1740 static ESL_OPTIONS options[] = {
1741 /* name type default env range toggles reqs incomp help docgroup*/
1742 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 1 },
1743 { "--informat", eslARG_STRING, NULL, NULL, NULL, NULL, NULL, NULL, "specify the input MSA file is in format <s>", 1 },
1744 { "--dna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use DNA alphabet", 1 },
1745 { "--rna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use RNA alphabet", 1 },
1746 { "--amino", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use protein alphabet", 1 },
1747
1748 { "--ignore-rf", eslARG_NONE, eslMSAWEIGHT_IGNORE_RF, NULL, NULL, NULL, NULL, NULL, "ignore any RF line; always determine our own consensus", 2 },
1749 { "--fragthresh", eslARG_REAL, ESL_STR(eslMSAWEIGHT_FRAGTHRESH), NULL, "0<=x<=1", NULL, NULL, NULL, "seq is fragment if aspan/alen < fragthresh", 2 }, // 0.0 = no fragments; 1.0 = everything is a frag except 100% full-span aseq
1750 { "--symfrac", eslARG_REAL, ESL_STR(eslMSAWEIGHT_SYMFRAC), NULL, "0<=x<=1", NULL, NULL, NULL, "col is consensus if nres/(nres+ngap) >= symfrac", 2 }, // 0.0 = all cols are consensus; 1.0 = only 100% all-residue cols are consensus
1751
1752 { "--no-sampling", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "never use subsampling to determine consensus", 3 },
1753 { "--nsamp", eslARG_INT, ESL_STR(eslMSAWEIGHT_NSAMP), NULL, "n>=0", NULL, NULL, "--no-sampling", "number of seqs to sample (if using sampling)", 3 },
1754 { "--sampthresh", eslARG_INT, ESL_STR(eslMSAWEIGHT_SAMPTHRESH), NULL, "n>=0", NULL, NULL, "--no-sampling", "switch to using sampling when nseq > nsamp", 3 },
1755 { "--maxfrag", eslARG_INT, ESL_STR(eslMSAWEIGHT_MAXFRAG), NULL, "n>=0", NULL, NULL, "--no-sampling", "if sample has > maxfrag fragments, don't use sample", 3 },
1756 { "-s", eslARG_INT, ESL_STR(eslMSAWEIGHT_RNGSEED), NULL, "n>=0", NULL, NULL, NULL, "set random number seed to <n>", 3 },
1757
1758 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1759 };
1760
1761 static char usage[] = "[-options] <msafile>";
1762 static char banner[] = "esl_msaweight example";
1763
1764 int
main(int argc,char ** argv)1765 main(int argc, char **argv)
1766 {
1767 ESL_GETOPTS *go = esl_getopts_Create(options);
1768 char *msafile = NULL;
1769 int infmt = eslMSAFILE_UNKNOWN;
1770 ESL_MSAWEIGHT_CFG *cfg = esl_msaweight_cfg_Create();
1771 ESL_MSAWEIGHT_DAT *dat = esl_msaweight_dat_Create();
1772 ESL_ALPHABET *abc = NULL;
1773 ESL_MSAFILE *afp = NULL;
1774 ESL_MSA *msa = NULL;
1775 int nali = 0;
1776 int idx;
1777 int status;
1778
1779 /* Process command line and options
1780 */
1781 if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
1782 esl_opt_VerifyConfig(go) != eslOK)
1783 esl_fatal("Failed to parse command line: %s\n", go->errbuf);
1784 if (esl_opt_GetBoolean(go, "-h") == TRUE)
1785 {
1786 esl_banner(stdout, argv[0], banner);
1787 esl_usage (stdout, argv[0], usage);
1788 puts("\noptions:");
1789 esl_opt_DisplayHelp(stdout, go, 1, 2, 80); /* 1=docgroup; 2=indentation; 80=width */
1790 puts("\noptions for deriving consensus:");
1791 esl_opt_DisplayHelp(stdout, go, 2, 2, 80); /* 1=docgroup; 2=indentation; 80=width */
1792 puts("\noptions for deriving consensus by sampling (on deep MSAs):");
1793 esl_opt_DisplayHelp(stdout, go, 3, 2, 80); /* 1=docgroup; 2=indentation; 80=width */
1794 return 0;
1795 }
1796
1797 if (esl_opt_ArgNumber(go) != 1) esl_fatal("Incorrect number of command line arguments.\n%s\n", usage);
1798 msafile = esl_opt_GetArg(go, 1);
1799
1800 if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA);
1801 else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA);
1802 else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
1803
1804 if (esl_opt_IsOn(go, "--informat") &&
1805 (infmt = esl_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN)
1806 esl_fatal("%s is not a valid MSA file format for --informat", esl_opt_GetString(go, "--informat"));
1807
1808 cfg->fragthresh = esl_opt_GetReal (go, "--fragthresh");
1809 cfg->symfrac = esl_opt_GetReal (go, "--symfrac");
1810 cfg->ignore_rf = esl_opt_GetBoolean(go, "--ignore-rf");
1811 cfg->allow_samp = !esl_opt_GetBoolean(go, "--no-sampling");
1812 cfg->sampthresh = esl_opt_GetInteger(go, "--sampthresh");
1813 cfg->nsamp = esl_opt_GetInteger(go, "--nsamp");
1814 cfg->maxfrag = esl_opt_GetInteger(go, "--maxfrag");
1815 cfg->seed = esl_opt_GetInteger(go, "-s");
1816
1817 if ((status = esl_msafile_Open(&abc, msafile, NULL, infmt, NULL, &afp)) != eslOK)
1818 esl_msafile_OpenFailure(afp, status);
1819
1820 while ((status = esl_msafile_Read(afp, &msa)) == eslOK)
1821 {
1822 nali++;
1823
1824 if ((status = esl_msaweight_PB_adv(cfg, msa, dat)) != eslOK)
1825 esl_fatal("weighting failed");
1826
1827 for (idx = 0; idx < msa->nseq; idx++)
1828 printf("%-25s %.4f\n", msa->sqname[idx], msa->wgt[idx]);
1829
1830
1831 if (dat->cons_by_rf) printf("# consensus by: RF annotation\n");
1832 else if (dat->cons_by_sample) printf("# consensus by: subsampling\n");
1833 else if (dat->cons_by_all) printf("# consensus by: standard rules\n");
1834 else if (dat->cons_allcols) printf("# consensus by: all columns\n");
1835
1836 if (dat->rejected_sample) printf("# (attempted sampling but rejected, too many frags\n");
1837 printf("# sampling allowed? %s\n", cfg->allow_samp ? "yes" : "NO");
1838
1839 printf("# fragthresh: %.4f\n", cfg->fragthresh);
1840 printf("# symfrac: %.4f\n", cfg->symfrac);
1841 if (dat->cons_by_sample)
1842 {
1843 printf("# Info on sampling:\n");
1844 printf("# RNG seed: %" PRIu64 "\n", dat->seed);
1845 printf("# Sample size: %d\n", cfg->nsamp);
1846 printf("# Fragments: %d (<= maxfrag of %d)\n", dat->samp_nfrag, cfg->maxfrag);
1847 }
1848
1849 printf("# number of consensus cols: %d out of %d\n", dat->ncons, (int) msa->alen);
1850 printf("# number of fragments: %d out of %d\n", dat->all_nfrag, msa->nseq);
1851
1852 esl_msa_Destroy(msa);
1853 }
1854 if (nali == 0 || status != eslEOF) esl_msafile_ReadFailure(afp, status); /* a convenience, like esl_msafile_OpenFailure() */
1855
1856 esl_msaweight_cfg_Destroy(cfg);
1857 esl_msaweight_dat_Destroy(dat);
1858 esl_alphabet_Destroy(abc);
1859 esl_msafile_Close(afp);
1860 esl_getopts_Destroy(go);
1861 exit(0);
1862 }
1863 #endif /*eslMSAWEIGHT_EXAMPLE*/
1864