1 /* Sequence weighting algorithms.
2  *
3  * Implementations of ad hoc sequence weighting algorithms for multiple
4  * sequence alignments:
5  *   PB weights:     Henikoff and Henikoff, JMB 243:574-578, 1994.
6  *   GSC weights:    Gerstein et al., JMB 236:1067-1078, 1994.
7  *   BLOSUM weights: Henikoff and Henikoff, PNAS 89:10915-10919, 1992.
8  *
9  * and also, filtering an alignment to remove "redundant" sequences >=
10  * a given pairwise % identity cutoff, which is sort of a weighting
11  * method.
12  *
13  * The most work has gone into PB weights, the default in HMMER and
14  * Infernal. PB weights are the only one of the weighting schemes that
15  * scales practically to deep alignments.
16  *
17  * Contents:
18  *   1. Position-based (PB) weighting
19  *   2. Optional config, stats collection for advanced PB weighting
20  *   3. Other weighting algorithms (GSC, BLOSUM)
21  *   4. %id filtering
22  *   5. Benchmark
23  *   6. Stats driver
24  *   7. Unit tests
25  *   8. Test driver
26  *   9. Example
27  */
28 #include "esl_config.h"
29 
30 #include <math.h>
31 #include <string.h>
32 #include <ctype.h>
33 
34 #include "easel.h"
35 #include "esl_alphabet.h"
36 #include "esl_distance.h"
37 #include "esl_dmatrix.h"
38 #include "esl_matrixops.h"
39 #include "esl_msa.h"
40 #include "esl_msacluster.h"
41 #include "esl_quicksort.h"
42 #include "esl_tree.h"
43 #include "esl_vectorops.h"
44 
45 #include "esl_msaweight.h"
46 
47 
48 /*****************************************************************
49  * 1. Position-based (PB) weighting
50  *****************************************************************/
51 /* Mar 2019: PB weighting algorithm revised to use only consensus
52  *   columns, not all columns. HMMER benchmarks show improved
53  *   discrimination especially with deep alignments. Also optimized
54  *   order of access, trading off O(K) -> O(KL) memory in return for
55  *   ~40x acceleration. These changes were only introduced for digital
56  *   mode alignments; text mode PB algorithm remains as it was.
57  */
58 
59 static int  consensus_by_rf    (const ESL_MSA *msa, int *conscols, int *ret_ncons, ESL_MSAWEIGHT_DAT *dat);
60 static int  consensus_by_sample(const ESL_MSAWEIGHT_CFG *cfg, const ESL_MSA *msa, int **ct, int *conscols, int *ret_ncons, ESL_MSAWEIGHT_DAT *dat);
61 static int  consensus_by_all   (const ESL_MSAWEIGHT_CFG *cfg, const ESL_MSA *msa, int **ct, int *conscols, int *ret_ncons, ESL_MSAWEIGHT_DAT *dat);
62 static int  collect_counts     (const ESL_MSAWEIGHT_CFG *cfg, const ESL_MSA *msa, const int *conscols, int ncons, int **ct, ESL_MSAWEIGHT_DAT *dat);
63 static int  msaweight_PB_txt(ESL_MSA *msa);
64 
65 /* Function:  esl_msaweight_PB()
66  * Synopsis:  PB (position-based) weights.
67  *
68  * Purpose:   Given a multiple alignment <msa>, calculate sequence
69  *            weights according to the position-based weighting
70  *            algorithm (Henikoff and Henikoff, JMB 243:574-578,
71  *            1994). These weights are stored internally in the <msa>
72  *            object, replacing any weights that may have already been
73  *            there. Weights are $\geq 0$ and they sum to <msa->nseq>.
74  *
75  *            The Henikoffs' algorithm is defined for ungapped
76  *            alignments. It does not give rules for dealing with
77  *            gaps, nor for degenerate residue symbols. The rule here
78  *            is to ignore these, and moreover (in digital mode
79  *            alignments, in order to deal with deep gappy alignments)
80  *            to only consider alignment columns that are "consensus"
81  *            (defined below).  This means that longer sequences
82  *            initially get more weight; hence we do a "double
83  *            normalization" in which the weights are first divided by
84  *            sequence length in canonical residues (to get the
85  *            average weight per residue, to compensate for that
86  *            effect), then normalized to sum to nseq.
87 
88  *            The <msa> may be in either digitized or text mode. The
89  *            weighting algorithm is subtly different depending on
90  *            this mode. Digital mode is preferred. The digital mode
91  *            algorithm handles deep gappy alignments better (it only
92  *            considers consensus columns, not all columns), it deals
93  *            with degenerate residue symbols (by ignoring them, like
94  *            it ignores gaps), and it's much faster. In text mode,
95  *            the algorithm simply uses the 26 letters as "residues"
96  *            (case-insensitively), and ignores all other residues as
97  *            gaps.
98  *
99  *            PB weights require $O(NL)$ time and $O(LK)$ memory, for
100  *            an alignment of $L$ columns, $N$ sequences, and alphabet
101  *            size $K$.
102  *
103  *            Consensus columns (for the digital alignment version)
104  *            are determined as follows:
105  *
106  *              1. if the MSA has RF consensus column annotation, use it.
107  *
108  *              2. else, use HMMER's rules (fragthresh, symfrac):
109  *                    - define sequence fragments as those with aligned
110  *                      lengths where fractional span (l-r+1)/L < fragthresh
111  *                      for leftmost residue position l and rightmost r;
112  *                    - count % residue occupancy per column ignoring
113  *                      external gaps for fragments;
114  *                    - define consensus columns as those with $\geq$
115  *                      symfrac residue occupancy.
116  *                Defaults are fragthresh = 0.5, symfrac = 0.5.
117  *
118  *              3. if all else fails, use all columns.
119  *
120  *            Additionally, at step 2, if the alignment is very deep
121  *            (>50000 sequences by default), we attempt to define
122  *            consensus columns from a statistical sample (of 10,000
123  *            seqs by default). This is a speed optimization. It can
124  *            fail in a pathological case where the alignment contains
125  *            a ridiculous number of fragments piled up on one
126  *            subsequence. (Some DNA repeats can be this way.)  If we
127  *            get >5000 fragments in the sample (by default), we use
128  *            all sequences, not a subsample.
129  *
130  *            All the parameters mentioned with default parameters can
131  *            be changed using the <esl_msaweight_PB_adv()> "advanced"
132  *            version.
133  *
134  * Returns:   <eslOK> on success. <msa->wgt[]> now contains weights for
135  *            each sequence. <eslMSA_HASWGTS> flag is raised in
136  *            <msa->flags>.
137  *
138  * Throws:    <eslEMEM> on allocation error, in which case <msa> is
139  *            returned unmodified.
140  *
141  * Xref:      [Henikoff94b]; squid::weight.c::PositionBasedWeights().
142  */
143 int
esl_msaweight_PB(ESL_MSA * msa)144 esl_msaweight_PB(ESL_MSA *msa)
145 {
146   if (msa->flags & eslMSA_DIGITAL) return esl_msaweight_PB_adv(NULL, msa, NULL);
147   else                             return msaweight_PB_txt(msa);
148 }
149 
150 
151 /* Function:  esl_msaweight_PB_adv()
152  * Synopsis:  Advanced version of PB weights; digital MSAs only.
153  * Incept:    SRE, Wed 20 Mar 2019
154  *
155  * Purpose:   Same as <esl_msaweight_PB()> but only takes digital-mode
156  *            MSAs, and can optionally take customized parameters
157  *            <cfg>, and optionally collect data about the computation
158  *            in <dat>.
159  *
160  * Args:      cfg - optional customized parameters, or NULL to use defaults.
161  *            msa - MSA to weight; weights stored in msa->wgt[]
162  *            dat - optional data collection, or NULL.
163  *
164  * Returns:   <eslOK> on success. <msa->wgt[]> now contains weights for
165  *            each sequence. <eslMSA_HASWGTS> flag is raised in
166  *            <msa->flags>.  <dat>, if provided, contains data about
167  *            stuff that happened during the weight computation.
168  *
169  * Throws:    (no abnormal error conditions)
170  */
171 int
esl_msaweight_PB_adv(const ESL_MSAWEIGHT_CFG * cfg,ESL_MSA * msa,ESL_MSAWEIGHT_DAT * dat)172 esl_msaweight_PB_adv(const ESL_MSAWEIGHT_CFG *cfg, ESL_MSA *msa, ESL_MSAWEIGHT_DAT *dat)
173 {
174   int   ignore_rf   = (cfg? cfg->ignore_rf  : eslMSAWEIGHT_IGNORE_RF);      // default is FALSE: use RF annotation as consensus definition, if RF is present
175   int   allow_samp  = (cfg? cfg->allow_samp : eslMSAWEIGHT_ALLOW_SAMP);     // default is TRUE: allow subsampling speed optimization
176   int   sampthresh  = (cfg? cfg->sampthresh : eslMSAWEIGHT_SAMPTHRESH);     // if nseq > sampthresh, try to determine consensus on a subsample of seqs
177   int **ct          = NULL;     // matrix of symbol counts in each column. ct[apos=(0).1..alen][a=0..Kp-1]
178   int  *r           = NULL;     // number of different canonical residues used in each consensus column. r[j=0..ncons-1]
179   int  *conscols    = NULL;     // list of consensus column indices [0..ncons-1]
180   int   ncons       = 0;        // number of consensus column indices in <conscols> list
181   int   idx, apos, j, a;        // indices over sequences, original columns, consensus columns, symbols
182   int   rlen;                   // number of canonical residues in a seq; used for first PB normalization
183   int   status = eslOK;
184 
185   /* Contract checks & bailouts */
186   ESL_DASSERT1(( msa->nseq >= 1 && msa->alen >= 1));
187   ESL_DASSERT1(( msa->flags & eslMSA_DIGITAL ));
188   if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }
189 
190   /* Allocations */
191   ct = esl_mat_ICreate( msa->alen+1, msa->abc->Kp );      // (0).1..alen; 0..Kp-1
192   ESL_ALLOC(conscols, sizeof(int) * msa->alen);
193 
194   /* Determine consensus columns early if we can. (ncons stays = 0 if neither way gets used.) */
195   if      (! ignore_rf && msa->rf)                consensus_by_rf(msa, conscols, &ncons, dat);
196   else if (allow_samp  && msa->nseq > sampthresh) consensus_by_sample(cfg, msa, ct, conscols, &ncons, dat);
197 
198   /* Collect count matrix ct[apos][a]  (either all columns, or if we have consensus already, only consensus columns) */
199   collect_counts(cfg, msa, conscols, ncons, ct, dat);
200 
201   /* If we still haven't determined consensus columns yet, do it now, using <ct> */
202   if (! ncons) consensus_by_all(cfg, msa, ct, conscols, &ncons, dat);
203 
204   /* If we *still* have no consensus columns, that's pretty pathological -- use 'em all */
205   if (! ncons)
206     {
207       for (apos = 1; apos <= msa->alen; apos++) conscols[apos-1] = apos;
208       ncons = msa->alen;
209       if (dat) dat->cons_allcols = TRUE;
210     }
211 
212   /* Count how many different canonical residues are used in each consensus column: r[j] */
213   ESL_ALLOC(r, sizeof(int) * ncons);
214   esl_vec_ISet(r, ncons, 0);
215   for (j = 0; j < ncons; j++)
216     {
217       apos = conscols[j];
218       for (a = 0; a < msa->abc->K; a++)
219 	if (ct[apos][a] > 0) r[j]++;
220     }
221 
222   /* Bump sequence weights using PB weighting rule */
223   esl_vec_DSet(msa->wgt, msa->nseq, 0.0);
224   for (idx = 0; idx < msa->nseq; idx++)
225     {
226       rlen = 0;
227       for (j = 0; j < ncons; j++)
228 	{
229 	  apos = conscols[j];
230 	  a    = msa->ax[idx][apos];
231 	  msa->wgt[idx] += (a >= msa->abc->K ? 0. : 1. / (double) (r[j] * ct[apos][a])); // <= This is the PB weight rule.
232 	  rlen          += (a >= msa->abc->K ? 0  : 1);                                  //    (ternary is faster than an if)
233 	}
234       if (rlen > 0) msa->wgt[idx] /= (double) rlen;  // first normalization, by unaligned seq length
235     }
236 
237   /* Normalize weights to sum to N */
238   esl_vec_DNorm(msa->wgt, msa->nseq);
239   esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);
240   msa->flags |= eslMSA_HASWGTS;
241 
242  ERROR:
243   esl_mat_IDestroy(ct);
244   free(r);
245   if (dat) dat->ncons    = ncons;
246   if (dat) dat->conscols = conscols; else free(conscols);
247   return status;
248 }
249 
250 /* consensus_by_rf()
251  * Use RF annotation to define consensus columns.
252  *
253  * <conscols> is allocated for up to <msa->alen> indices of consensus columns.
254  * Upon return, it contains a list of indices, each 1..alen, and
255  * <ret_ncons> is the length of the list (the number of consensus cols).
256  * (1-based, not 0-, because it's a consensus for a digital mode alignment).
257  *
258  * Returns <eslOK> on success.
259  */
260 static int
consensus_by_rf(const ESL_MSA * msa,int * conscols,int * ret_ncons,ESL_MSAWEIGHT_DAT * dat)261 consensus_by_rf(const ESL_MSA *msa, int *conscols, int *ret_ncons, ESL_MSAWEIGHT_DAT *dat)
262 {
263   int ncons = 0;
264   int apos;
265 
266   for (apos = 1; apos <= msa->alen; apos++)
267     {
268       if (esl_abc_CIsGap(msa->abc, msa->rf[apos-1])) continue;
269       conscols[ncons] = apos;
270       ncons++;
271     }
272   if (dat) dat->cons_by_rf = TRUE;
273   *ret_ncons = ncons;
274   return eslOK;
275 }
276 
277 /* consensus_by_sample()
278  * Use a statistical sample of seqs to define consensus columns.
279  *
280  * On deep alignments, it's usually overkill to look at all sequences
281  * just to determine which columns to call consensus. We can speed the
282  * decision up by looking at a subsample instead.
283  *
284  * We want results to be reproducible; by default, the random number
285  * generator (used for taking the sample) uses a fixed seed.
286  *
287  * We have to watch out for a pathological case where there could be a
288  * ridiculous number of fragments piled up on just one piece of the
289  * alignment. This can happen with some DNA repeat elements, for
290  * example. If the sample contains too many fragments, reject this
291  * approach, the caller will have to determine consensus on all
292  * sequences instead.
293  *
294  * In:   cfg      - optional configuration options, or NULL to use all defaults
295  *       msa      - MSA to determine consensus for
296  *       ct       - allocated space for [(0).1..alen][0..Kp-1] observed symbol counts; contents irrelevant
297  *       conscols - allocated space for up to <alen> consensus column indices
298  *
299  * Out:  ct       - [apos=1..alen][a=0..Kp-1] are counts of observed symbol <a> in column <alen> in sample. [0][] are all 0's.
300  *       conscols - [0..ncons-1] list of consensus column indices 1..alen
301  *       ncons    - number of consensus columns in <conscols>, or 0 if consensus determination was rejected or failed
302  *
303  * Returns <eslOK> on success.
304  *
305  * Returns <eslFAIL> if there were too many fragments. Now <ct> still has the counts
306  * observed in the sample, but <conscols> has no valid indices and <ncons> is 0.
307  *
308  * Throws <eslEMEM> on allocation failure.
309  */
310 static int
consensus_by_sample(const ESL_MSAWEIGHT_CFG * cfg,const ESL_MSA * msa,int ** ct,int * conscols,int * ret_ncons,ESL_MSAWEIGHT_DAT * dat)311 consensus_by_sample(const ESL_MSAWEIGHT_CFG *cfg, const ESL_MSA *msa, int **ct, int *conscols, int *ret_ncons, ESL_MSAWEIGHT_DAT *dat)
312 {
313   float       fragthresh  = (cfg? cfg->fragthresh : eslMSAWEIGHT_FRAGTHRESH);
314   float       symfrac     = (cfg? cfg->symfrac    : eslMSAWEIGHT_SYMFRAC);
315   int         nsamp       = (cfg? cfg->nsamp      : eslMSAWEIGHT_NSAMP);
316   int         maxfrag     = (cfg? cfg->maxfrag    : eslMSAWEIGHT_MAXFRAG);
317   ESL_RAND64 *rng         = (cfg? esl_rand64_Create(cfg->seed) : esl_rand64_Create(eslMSAWEIGHT_RNGSEED));   // fixed seed for default reproducibility
318   int64_t    *sampidx     = NULL;
319   int         nfrag       = 0;        // number of fragments in sample
320   int         ncons       = 0;        // number of consensus columns defined
321   int         tot;                    // total # of residues+gaps in a column
322   int         minspan;
323   int         apos, a;
324   int         lpos, rpos;
325   int         i, idx;
326   int         status      = eslOK;
327 
328   ESL_ALLOC(sampidx, sizeof(int64_t) * nsamp);
329   esl_mat_ISet(ct, msa->alen+1, msa->abc->Kp, 0);
330   if (dat) dat->seed = esl_rand64_GetSeed(rng);
331 
332   esl_rand64_Deal(rng, nsamp, (int64_t) msa->nseq, sampidx);  // <sampidx> is now an ordered list of <nsamp> indices in range 0..nseq-1
333 
334   minspan = (int) ceil( fragthresh * (float) msa->alen );     // define alispan as aligned length from first to last non-gap. If alispan < minspan, define seq as a fragment
335   for (i = 0; i < nsamp; i++)
336     {
337       idx = (int) sampidx[i];
338 
339       for (lpos = 1;         lpos <= msa->alen; lpos++) if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][lpos])) break;
340       for (rpos = msa->alen; rpos >= 1;         rpos--) if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][rpos])) break;
341       if  (rpos - lpos + 1 < minspan) nfrag++; else { lpos = 1; rpos = msa->alen; }
342 
343       for (apos = lpos; apos <= rpos; apos++)
344 	ct[apos][msa->ax[idx][apos]]++;
345     }
346 
347   if (dat) dat->samp_nfrag = nfrag;
348 
349   if (nfrag <= maxfrag)
350     {
351       for (apos = 1; apos <= msa->alen; apos++)
352 	{
353 	  for (tot = 0, a = 0; a < msa->abc->Kp-2; a++) tot += ct[apos][a];  // i.e. <tot> symbols over esl_abc_XIsResidue() || esl_abc_XIsGap(); inclusive of degeneracies, exclusive of missing | nonresidue.
354 	  if ( ((float) ct[apos][msa->abc->K] / (float) tot) < symfrac)      // This is the rule for determining a consensus column: if the frequency of residues/(residues+gaps) >= symfrac
355 	    conscols[ncons++] = apos;
356 	}
357       if (dat) dat->cons_by_sample = TRUE;
358     }
359   else
360     {
361       if (dat) dat->rejected_sample = TRUE;
362       status = eslFAIL;
363     }
364 
365  ERROR:
366   free(sampidx);
367   esl_rand64_Destroy(rng);
368   *ret_ncons = ncons;   // will be 0 if we saw too many fragments.
369   return status;
370 }
371 
372 
373 /* consensus_by_all()
374  * Use counts from all sequences to determine consensus.
375  *
376  * If we weren't able to define a consensus before collecting observed
377  * counts in just those columns, then we had to collect observed
378  * counts on all columns, and now we define the consensus.
379  *
380  * Because we do this from counts, not the MSA, <msa> here is only
381  * used for dimensions <alen> and <Kp>.
382  *
383  * Returns <eslOK> on success.
384  *
385  * Obscure C note: <ct> is input only here, so it might look like
386  * you can declare it <const int **ct>, but don't. <const> syntax
387  * is arcane and confusing with double indirection.
388  */
389 static int
consensus_by_all(const ESL_MSAWEIGHT_CFG * cfg,const ESL_MSA * msa,int ** ct,int * conscols,int * ret_ncons,ESL_MSAWEIGHT_DAT * dat)390 consensus_by_all(const ESL_MSAWEIGHT_CFG *cfg, const ESL_MSA *msa, int **ct, int *conscols, int *ret_ncons, ESL_MSAWEIGHT_DAT *dat)
391 {
392   float symfrac = (cfg? cfg->symfrac : eslMSAWEIGHT_SYMFRAC);
393   int   ncons   = 0;
394   int   apos;
395   int   tot;
396   int   a;
397 
398   for (apos = 1; apos <= msa->alen; apos++)
399     {
400       tot = 0;
401       for (a = 0; a < msa->abc->Kp-2; a++)  // i.e. over esl_abc_XIsResidue() || esl_abc_XIsGap()
402 	tot += ct[apos][a];
403       if ( ((float) ct[apos][msa->abc->K] / (float) tot) < symfrac)
404 	conscols[ncons++] = apos;
405     }
406   if (dat) dat->cons_by_all = TRUE;
407   *ret_ncons = ncons;
408   return eslOK;
409 }
410 
411 
412  /* collect_counts()
413   * Collect a matrix of observed symbol counts in each column: ct[apos=1..alen][a=0..Kp-1].
414   *   - use HMMER's fragment-definition rule, and don't count external gaps of fragments.
415   *   - This matrix is O(KL) for alignment length L, alphabet size K.
416   *     It's responsible for the asymptotic memory complexity of PB weights.
417   *   - "What if I've already marked fragments", you say (maybe in hmmbuild), "can I
418   *     save some time here?" Not really. Even if you know a seq is a fragment, you'd
419   *     need to run the lpos and rpos loops to find its start/end.
420   *   - If we already know what the consensus columns are, only collect counts in them,
421   *     leaving counts in nonconsensus columns zero. This is a time optimization.
422   */
423 static int
collect_counts(const ESL_MSAWEIGHT_CFG * cfg,const ESL_MSA * msa,const int * conscols,int ncons,int ** ct,ESL_MSAWEIGHT_DAT * dat)424 collect_counts(const ESL_MSAWEIGHT_CFG *cfg, const ESL_MSA *msa, const int *conscols, int ncons, int **ct, ESL_MSAWEIGHT_DAT *dat)
425 {
426   float fragthresh  = (cfg? cfg->fragthresh : eslMSAWEIGHT_FRAGTHRESH);     // seq is fragment if (length from 1st to last aligned residue)/alen < fragthresh (i.e. span < minspan)
427   int   minspan     = (int) ceil( fragthresh * (float) msa->alen );         // precalculated span length threshold using <fragthresh>
428   int   lpos, rpos;     // leftmost, rightmost aligned residue (1..alen)
429   int   idx, apos, j;
430 
431 
432   esl_mat_ISet(ct, msa->alen+1, msa->abc->Kp, 0);
433   for (idx = 0; idx < msa->nseq; idx++)
434     {
435       // HMMER mark_fragments() rule. Count "span" from first to last aligned residue. If alispan/alen < fragthresh, it's a fragment.
436       for (lpos = 1;         lpos <= msa->alen; lpos++) if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][lpos])) break;
437       for (rpos = msa->alen; rpos >= 1;         rpos--) if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][rpos])) break;
438       // L=0 seq or alen=0? then lpos == msa->alen+1, rpos == 0 => lpos > rpos. rpos-lpos-1 <= 0 but test below still works.
439       if (rpos - lpos + 1 >= minspan) { lpos = 1; rpos = msa->alen; } else if (dat) dat->all_nfrag++;   // full len seqs count cols 1..alen; fragments only count lpos..rpos.
440 
441       if (ncons) // if we have consensus columns already, only count symbols in those columns (faster)...
442 	{
443 	  for (j = 0; j < ncons && conscols[j] <= rpos; j++)
444 	    {
445 	      apos = conscols[j];
446 	      if (apos < lpos) continue;
447 	      ct[apos][msa->ax[idx][apos]]++;
448 	    }
449 	}
450       else      // ... else, count symbols in all columns.
451 	{
452 	  for (apos = lpos; apos <= rpos; apos++)
453 	    ct[apos][msa->ax[idx][apos]]++;
454 	}
455     }
456   return eslOK;
457 }
458 
459 
460 /* msaweight_PB_txt()
461  * PB weighting for text-mode alignment.
462  *
463  * All columns are counted. Symbols [a-zA-Z] are counted as
464  * "residues"; anything else is ignored.
465  *
466  * O(LN) time for alignment length L, number of sequences N.
467  * O(K) memory for K=26 residues.
468  *
469  * You should use digital mode PB weights if at all possible. They're
470  * better (with improved methods for gappy alignments), faster
471  * (because the implementation is optimized for access patterns), and
472  * have more options for customization and monitoring (via the
473  * ESL_MSAWEIGHT_CFG and ESL_MSAWEIGHT_STATS optional objects).
474  */
475 static int
msaweight_PB_txt(ESL_MSA * msa)476 msaweight_PB_txt(ESL_MSA *msa)
477 {
478   int    *ct   = NULL;   	// counts of each residue observed in a column
479   int     K    = 26;		// alphabet size. In text mode, we count [a-zA-z] as "residues"
480   int     r;           		// number of different symbols observed in a column
481   int     idx, apos, a, rlen;
482   int     status = eslOK;
483 
484   ESL_DASSERT1( (msa->nseq >= 1) );
485   ESL_DASSERT1( (msa->alen >= 1) );
486   ESL_DASSERT1( (! (msa->flags & eslMSA_DIGITAL )));
487   if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }
488 
489   ESL_ALLOC(ct, sizeof(int) * K);
490   esl_vec_DSet(msa->wgt, msa->nseq, 0.0);
491 
492   for (apos = 0; apos < msa->alen; apos++)
493     {
494       /* Collect # of letters A..Z (case insensitively) in this column, and total */
495       esl_vec_ISet(ct, K, 0.);
496       for (idx = 0; idx < msa->nseq; idx++)
497 	if (isalpha((int) msa->aseq[idx][apos]))
498 	  ct[toupper((int) msa->aseq[idx][apos]) - 'A'] ++;
499       for (r = 0, a = 0; a < K; a++) if (ct[a] > 0) r++;
500 
501       /* Bump weight on each seq by PB rule */
502       if (r > 0) {
503 	for (idx = 0; idx < msa->nseq; idx++) {
504 	  if (isalpha((int) msa->aseq[idx][apos]))
505 	    msa->wgt[idx] += 1. /
506 	      (double) (r * ct[toupper((int) msa->aseq[idx][apos]) - 'A'] );
507 	}
508       }
509     }
510 
511   /* first normalization by # of residues counted in each seq */
512   for (idx = 0; idx < msa->nseq; idx++) {
513     for (rlen = 0, apos = 0; apos < msa->alen; apos++)
514       if (isalpha((int) msa->aseq[idx][apos])) rlen++;
515     if (rlen > 0) msa->wgt[idx] /= (double) rlen;
516     /* if rlen == 0 for this seq, its weight is still 0.0, as initialized. */
517   }
518 
519   /* Make weights normalize up to nseq, and return.  In pathological
520    * case where all wgts were 0 (no seqs contain any unambiguous
521    * residues), weights become 1.0.
522    */
523   esl_vec_DNorm(msa->wgt, msa->nseq);
524   esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);
525   msa->flags |= eslMSA_HASWGTS;
526 
527  ERROR:
528   free(ct);
529   return status;
530 }
531 
532 
533 
534 
535 
536 /*****************************************************************
537  * 2. Optional config, stats collection for advanced PB weighting
538  *****************************************************************/
539 
540 /* Function:  esl_msaweight_cfg_Create()
541  * Synopsis:  Create config options structure for PB weights, %id filter
542  * Incept:    SRE, Thu 21 Mar 2019 [Drive By Truckers, Guns of Umpqua]
543  */
544 ESL_MSAWEIGHT_CFG *
esl_msaweight_cfg_Create(void)545 esl_msaweight_cfg_Create(void)
546 {
547   ESL_MSAWEIGHT_CFG *cfg = NULL;
548   int                status;
549 
550   ESL_ALLOC(cfg, sizeof(ESL_MSAWEIGHT_CFG));
551 
552   cfg->fragthresh = eslMSAWEIGHT_FRAGTHRESH;
553   cfg->symfrac    = eslMSAWEIGHT_SYMFRAC;
554   cfg->ignore_rf  = eslMSAWEIGHT_IGNORE_RF;
555   cfg->allow_samp = eslMSAWEIGHT_ALLOW_SAMP;
556   cfg->sampthresh = eslMSAWEIGHT_SAMPTHRESH;
557   cfg->nsamp      = eslMSAWEIGHT_NSAMP;
558   cfg->maxfrag    = eslMSAWEIGHT_MAXFRAG;
559   cfg->seed       = eslMSAWEIGHT_RNGSEED;
560   cfg->filterpref = eslMSAWEIGHT_FILT_CONSCOVER;
561 
562  ERROR:
563   return cfg;
564 }
565 
566 /* Function:  esl_msaweight_cfg_Destroy()
567  * Synopsis:  Destroy an <ESL_MSAWEIGHT_CFG>
568  */
569 void
esl_msaweight_cfg_Destroy(ESL_MSAWEIGHT_CFG * cfg)570 esl_msaweight_cfg_Destroy(ESL_MSAWEIGHT_CFG *cfg)
571 {
572   free(cfg);
573 }
574 
575 
576 /* Function:  esl_msaweight_dat_Create()
577  * Synopsis:  Create data collection structure for PB weighting
578  * Incept:    SRE, Thu 21 Mar 2019
579  */
580 ESL_MSAWEIGHT_DAT *
esl_msaweight_dat_Create(void)581 esl_msaweight_dat_Create(void)
582 {
583   ESL_MSAWEIGHT_DAT *dat = NULL;
584   int status;
585 
586   ESL_ALLOC(dat, sizeof(ESL_MSAWEIGHT_DAT));
587 
588   dat->seed            = eslMSAWEIGHT_RNGSEED;
589 
590   dat->cons_by_rf      = FALSE;
591   dat->cons_by_sample  = FALSE;
592   dat->cons_by_all     = FALSE;
593   dat->cons_allcols    = FALSE;
594   dat->rejected_sample = FALSE;
595 
596   dat->ncons      = 0;
597   dat->conscols   = NULL;
598 
599   dat->all_nfrag  = 0;
600   dat->samp_nfrag = 0;
601 
602  ERROR:
603   return dat;
604 }
605 
606 /* Function:  esl_msaweight_dat_Reuse()
607  * Synopsis:  Reuse an <ESL_MSAWEIGHT_DAT> structure.
608  */
609 int
esl_msaweight_dat_Reuse(ESL_MSAWEIGHT_DAT * dat)610 esl_msaweight_dat_Reuse(ESL_MSAWEIGHT_DAT *dat)
611 {
612   if (dat->conscols) { free(dat->conscols); }
613   dat->seed            = eslMSAWEIGHT_RNGSEED;
614   dat->cons_by_rf      = FALSE;
615   dat->cons_by_sample  = FALSE;
616   dat->cons_by_all     = FALSE;
617   dat->cons_allcols    = FALSE;
618   dat->rejected_sample = FALSE;
619   dat->ncons           = 0;
620   dat->conscols        = NULL;
621   dat->all_nfrag       = 0;
622   dat->samp_nfrag      = 0;
623   return eslOK;
624 }
625 
626 
627 /* Function:  esl_msaweight_dat_Destroy()
628  * Synopsis:  Free an <ESL_MSAWEIGHT_DAT> structure.
629  */
630 void
esl_msaweight_dat_Destroy(ESL_MSAWEIGHT_DAT * dat)631 esl_msaweight_dat_Destroy(ESL_MSAWEIGHT_DAT *dat)
632 {
633   if (dat)
634     {
635       if (dat->conscols) free(dat->conscols);
636       free(dat);
637     }
638 }
639 
640 
641 /*****************************************************************
642  * 3. Other weighting algorithms
643  *****************************************************************/
644 
645 /* Function:  esl_msaweight_GSC()
646  * Synopsis:  GSC weights.
647  * Incept:    SRE, Fri Nov  3 13:31:14 2006 [Janelia]
648  *
649  * Purpose:   Given a multiple sequence alignment <msa>, calculate
650  *            sequence weights according to the
651  *            Gerstein/Sonnhammer/Chothia algorithm. These weights
652  *            are stored internally in the <msa> object, replacing
653  *            any weights that may have already been there. Weights
654  *            are $\geq 0$ and they sum to <msa->nseq>.
655  *
656  *            The <msa> may be in either digitized or text mode.
657  *            Digital mode is preferred, so that distance calculations
658  *            used by the GSC algorithm are robust against degenerate
659  *            residue symbols.
660  *
661  *            This is an implementation of Gerstein et al., "A method to
662  *            weight protein sequences to correct for unequal
663  *            representation", JMB 236:1067-1078, 1994.
664  *
665  *            The algorithm is $O(N^2)$ memory (it requires a pairwise
666  *            distance matrix) and $O(N^3 + LN^2)$ time ($N^3$ for a UPGMA
667  *            tree building step, $LN^2$ for distance matrix construction)
668  *            for an alignment of N sequences and L columns.
669  *
670  *            In the current implementation, the actual memory
671  *            requirement is dominated by two full NxN distance
672  *            matrices (one tmp copy in UPGMA, and one here): for
673  *            8-byte doubles, that's $16N^2$ bytes. To keep the
674  *            calculation under memory limits, don't process large
675  *            alignments: max 1400 sequences for 32 MB, max 4000
676  *            sequences for 256 MB, max 8000 seqs for 1 GB. Watch
677  *            out, because Pfam alignments can easily blow this up.
678  *
679  * Note:      Memory usage could be improved. UPGMA consumes a distance
680  *            matrix, but that can be D itself, not a copy, if the
681  *            caller doesn't mind the destruction of D. Also, D is
682  *            symmetrical, so we could use upper or lower triangular
683  *            matrices if we rewrote dmatrix to allow them.
684  *
685  *            I also think UPGMA can be reduced to O(N^2) time, by
686  *            being more tricky about rapidly identifying the minimum
687  *            element: could keep min of each row, and update that,
688  *            I think.
689  *
690  * Returns:   <eslOK> on success, and the weights inside <msa> have been
691  *            modified.
692  *
693  * Throws:    <eslEINVAL> if the alignment data are somehow invalid and
694  *            distance matrices can't be calculated. <eslEMEM> on an
695  *            allocation error. In either case, the original <msa> is
696  *            left unmodified.
697  *
698  * Xref:      [Gerstein94]; squid::weight.c::GSCWeights(); STL11/81.
699  */
700 int
esl_msaweight_GSC(ESL_MSA * msa)701 esl_msaweight_GSC(ESL_MSA *msa)
702 {
703   ESL_DMATRIX *D = NULL;     /* distance matrix */
704   ESL_TREE    *T = NULL;     /* UPGMA tree */
705   double      *x = NULL;     /* storage per node, 0..N-2 */
706   double       lw, rw;       /* total branchlen on left, right subtrees */
707   double       lx, rx;	     /* distribution of weight to left, right side */
708   int i;		     /* counter over nodes */
709   int status;
710 
711   /* Contract checks
712    */
713   ESL_DASSERT1( (msa       != NULL) );
714   ESL_DASSERT1( (msa->nseq >= 1)    );
715   ESL_DASSERT1( (msa->alen >= 1)    );
716   ESL_DASSERT1( (msa->wgt  != NULL) );
717   if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }
718 
719   /* GSC weights use a rooted tree with "branch lengths" calculated by
720    * UPGMA on a fractional difference matrix - pretty crude.
721    */
722   if (! (msa->flags & eslMSA_DIGITAL))
723     {
724       if ((status = esl_dst_CDiffMx(msa->aseq, msa->nseq, &D))         != eslOK) goto ERROR;
725     }
726   else
727     {
728       if ((status = esl_dst_XDiffMx(msa->abc, msa->ax, msa->nseq, &D)) != eslOK) goto ERROR;
729     }
730 
731   /* oi, look out here.  UPGMA is correct, but old squid library uses
732    * single linkage, so for regression tests ONLY, we use single link.
733    */
734 #ifdef  eslMSAWEIGHT_REGRESSION
735   if ((status = esl_tree_SingleLinkage(D, &T)) != eslOK) goto ERROR;
736 #else
737   if ((status = esl_tree_UPGMA(D, &T)) != eslOK) goto ERROR;
738 #endif
739   esl_tree_SetCladesizes(T);
740 
741   ESL_ALLOC(x, sizeof(double) * (T->N-1));
742 
743   /* Postorder traverse (leaves to root) to calculate the total branch
744    * length under each internal node; store this in x[].  Remember the
745    * total branch length (x[0]) for a future sanity check.
746    */
747   for (i = T->N-2; i >= 0; i--)
748     {
749       x[i] = T->ld[i] + T->rd[i];
750       if (T->left[i]  > 0) x[i] += x[T->left[i]];
751       if (T->right[i] > 0) x[i] += x[T->right[i]];
752     }
753 
754   /* Preorder traverse (root to leaves) to calculate the weights.  Now
755    * we use x[] to mean, the total weight *above* this node that we will
756    * apportion to the node's left and right children. The two
757    * meanings of x[] never cross: every x[] beneath x[i] is still a
758    * total branch length.
759    *
760    * Because the API guarantees that msa is returned unmodified in case
761    * of an exception, and we're touching msa->wgt here, no exceptions
762    * may be thrown from now on in this function.
763    */
764   x[0] = 0;			/* initialize: no branch to the root. */
765   for (i = 0; i <= T->N-2; i++)
766     {
767       lw = T->ld[i];   if (T->left[i]  > 0) lw += x[T->left[i]];
768       rw = T->rd[i];   if (T->right[i] > 0) rw += x[T->right[i]];
769 
770       if (lw+rw == 0.)
771 	{
772 	  /* A special case arises in GSC weights when all branch lengths in a subtree are 0.
773 	   * In this case, all seqs in this clade should get equal weights, sharing x[i] equally.
774            * So, split x[i] in proportion to cladesize, not to branch weight.
775 	   */
776 	  if (T->left[i] > 0)  lx =  x[i] * ((double) T->cladesize[T->left[i]]  / (double) T->cladesize[i]);
777 	  else                 lx =  x[i] / (double) T->cladesize[i];
778 
779 	  if (T->right[i] > 0) rx =  x[i] * ((double) T->cladesize[T->right[i]] / (double) T->cladesize[i]);
780 	  else                 rx =  x[i] / (double) T->cladesize[i];
781 	}
782       else /* normal case: x[i] split in proportion to branch weight. */
783 	{
784 	  lx = x[i] * lw/(lw+rw);
785 	  rx = x[i] * rw/(lw+rw);
786 	}
787 
788       if (T->left[i]  <= 0) msa->wgt[-(T->left[i])] = lx + T->ld[i];
789       else                  x[T->left[i]] = lx + T->ld[i];
790 
791       if (T->right[i] <= 0) msa->wgt[-(T->right[i])] = rx + T->rd[i];
792       else                  x[T->right[i]] = rx + T->rd[i];
793     }
794 
795   /* Renormalize weights to sum to N.
796    */
797   esl_vec_DNorm(msa->wgt, msa->nseq);
798   esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);
799   msa->flags |= eslMSA_HASWGTS;
800 
801   free(x);
802   esl_tree_Destroy(T);
803   esl_dmatrix_Destroy(D);
804   return eslOK;
805 
806  ERROR:
807   if (x != NULL) free(x);
808   if (T != NULL) esl_tree_Destroy(T);
809   if (D != NULL) esl_dmatrix_Destroy(D);
810   return status;
811 }
812 
813 
814 
815 /* Function:  esl_msaweight_BLOSUM()
816  * Synopsis:  BLOSUM weights.
817  * Incept:    SRE, Sun Nov  5 09:52:41 2006 [Janelia]
818  *
819  * Purpose:   Given a multiple sequence alignment <msa> and an identity
820  *            threshold <maxid>, calculate sequence weights using the
821  *            BLOSUM algorithm (Henikoff and Henikoff, PNAS
822  *            89:10915-10919, 1992). These weights are stored
823  *            internally in the <msa> object, replacing any weights
824  *            that may have already been there. Weights are $\geq 0$
825  *            and they sum to <msa->nseq>.
826  *
827  *            The algorithm does a single linkage clustering by
828  *            fractional id, defines clusters such that no two clusters
829  *            have a pairwise link $\geq$ <maxid>), and assigns
830  *            weights of $\frac{1}{M_i}$ to each of the $M_i$
831  *            sequences in each cluster $i$. The <maxid> threshold
832  *            is a fractional pairwise identity, in the range
833  *            $0..1$.
834  *
835  *            The <msa> may be in either digitized or text mode.
836  *            Digital mode is preferred, so that the pairwise identity
837  *            calculations deal with degenerate residue symbols
838  *            properly.
839  *
840  * Returns:   <eslOK> on success, and the weights inside <msa> have been
841  *            modified.
842  *
843  * Throws:    <eslEMEM> on allocation error. <eslEINVAL> if a pairwise
844  *            identity calculation fails because of corrupted sequence
845  *            data. In either case, the <msa> is unmodified.
846  *
847  * Xref:      [Henikoff92]; squid::weight.c::BlosumWeights().
848  */
849 int
esl_msaweight_BLOSUM(ESL_MSA * msa,double maxid)850 esl_msaweight_BLOSUM(ESL_MSA *msa, double maxid)
851 {
852   int  *c    = NULL; /* cluster assignments for each sequence */
853   int  *nmem = NULL; /* number of seqs in each cluster */
854   int   nc;	     /* number of clusters  */
855   int   i;           /* loop counter */
856   int   status;
857 
858   /* Contract checks
859    */
860   ESL_DASSERT1( (maxid >= 0. && maxid <= 1.) );
861   ESL_DASSERT1( (msa->nseq >= 1) );
862   ESL_DASSERT1( (msa->alen >= 1) );
863   if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }
864 
865   if ((status = esl_msacluster_SingleLinkage(msa, maxid, &c, NULL, &nc)) != eslOK) goto ERROR;
866   ESL_ALLOC(nmem, sizeof(int) * nc);
867   esl_vec_ISet(nmem, nc, 0);
868   for (i = 0; i < msa->nseq; i++) nmem[c[i]]++;
869   for (i = 0; i < msa->nseq; i++) msa->wgt[i] = 1. / (double) nmem[c[i]];
870 
871   /* Make weights normalize up to nseq, and return.
872    */
873   esl_vec_DNorm(msa->wgt, msa->nseq);
874   esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);
875   msa->flags |= eslMSA_HASWGTS;
876 
877   free(nmem);
878   free(c);
879   return eslOK;
880 
881  ERROR:
882   if (c    != NULL) free(c);
883   if (nmem != NULL) free(nmem);
884   return status;
885 }
886 
887 
888 
889 /*****************************************************************
890  * 4. %id filtering
891  *****************************************************************/
892 /* Mar 2019: After revising the PB weighting algorithm, I also went
893  * ahead and revised ER's %id filtering along related lines, because TAJ
894  * coincidentally complained about how filtering could keep a fragment
895  * and throw out a closer-to-consensus sequence.
896  */
897 
898 /* In addition to some of the internal functions used by PB weighting above... */
899 static int sort_doubles_decreasing(const void *data, int e1, int e2);
900 static int set_preference_conscover(const ESL_MSA *msa, const int *conscols, int ncons, double *sortwgt);
901 static int set_preference_randomly(const ESL_MSAWEIGHT_CFG *cfg, int nseq, double *sortwgt);
902 static int set_preference_origorder(int nseq, double *sortwgt);
903 static int msaweight_IDFilter_txt(const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa);
904 
905 /* Function:  esl_msaweight_IDFilter()
906  * Synopsis:  Filter by %ID.
907  * Incept:    ER, Wed Oct 29 10:06:43 2008 [Janelia]
908  *
909  * Purpose:   Remove sequences in <msa> that are $\geq$ <maxid>
910  *            fractional identity to another aligned sequence.  Return
911  *            the filtered alignment in <*ret_newmsa>.  The original
912  *            <msa> is not changed.
913  *
914  *            When a pair of sequences has $\geq$ <maxid> fractional
915  *            identity, the default rule for which of them we keep
916  *            differs depending on whether the <msa> is in digital or
917  *            text mode. We typically manipulate biosequence
918  *            alignments in digital mode. In digital mode, the default
919  *            rule (called "conscover") is to prefer the sequence with
920  *            an alignment span that covers more consensus columns.
921  *            (Where "span" is defined by the column coordinates of
922  *            the leftmost and rightmost aligned residues of the
923  *            sequence; fragment definition rules also use the concept
924  *            of "span".)  In the simpler text mode, the rule is to
925  *            keep the earlier sequence and discard the later, in the
926  *            order the seqs come in the alignment. The conscover rule
927  *            is designed to avoid keeping fragments, while trying not
928  *            to bias the insertion/deletion statistics of the
929  *            filtered alignment. To use different preference rules in
930  *            digital mode alignments, see the more customizable
931  *            <esl_msaweight_IDFilter_adv()>.
932  *
933  *            Fractional identity is calculated by
934  *            <esl_dst_{CX}PairID()>, which defines the denominator of
935  *            the calculation as MIN(rlen1, rlen2).
936  *
937  *            Unparsed Stockholm markup is not propagated into the new
938  *            filtered alignment.
939  *
940  * Return:    <eslOK> on success, and <*ret_newmsa> is the filtered
941  *            alignment.
942  *
943  * Throws:    <eslEMEM> on allocation error. <eslEINVAL> if a pairwise
944  *            identity calculation fails because of corrupted sequence
945  *            data. In either case, the <msa> is unmodified.
946  *
947  * Xref:      squid::weight.c::FilterAlignment().
948  */
949 int
esl_msaweight_IDFilter(const ESL_MSA * msa,double maxid,ESL_MSA ** ret_newmsa)950 esl_msaweight_IDFilter(const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa)
951 {
952   if (msa->flags & eslMSA_DIGITAL) return esl_msaweight_IDFilter_adv(NULL, msa, maxid, ret_newmsa);
953   else                             return msaweight_IDFilter_txt    (      msa, maxid, ret_newmsa);
954 }
955 
956 
957 /* Function:  esl_msaweight_IDFilter_adv()
958  * Synopsis:  Advanced version of %id filtering; digital MSAs only.
959  * Incept:    SRE, Sat 30 Mar 2019
960  *
961  * Purpose:   Same as <esl_msaweight_IDFilter()> but customizable with
962  *            optional custom parameters in <cfg>. Requires digital
963  *            mode alignment.
964  *
965  *            In particular, <cfg> can be used to select two other
966  *            rules for sequence preference, besides the default
967  *            "conscover" rule: random preference, or preferring the
968  *            lower-index sequence (same as the rule used in text mode
969  *            alignments).
970  *
971  *            For the "conscover" rule, consensus column determination
972  *            can be customized the same way as in PB weighting.
973  */
974 int
esl_msaweight_IDFilter_adv(const ESL_MSAWEIGHT_CFG * cfg,const ESL_MSA * msa,double maxid,ESL_MSA ** ret_newmsa)975 esl_msaweight_IDFilter_adv(const ESL_MSAWEIGHT_CFG *cfg, const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa)
976 {
977   int     ignore_rf   = (cfg? cfg->ignore_rf  : eslMSAWEIGHT_IGNORE_RF);      // default is FALSE: use RF annotation as consensus definition, if RF is present
978   int     allow_samp  = (cfg? cfg->allow_samp : eslMSAWEIGHT_ALLOW_SAMP);     // default is TRUE: allow subsampling speed optimization
979   int     sampthresh  = (cfg? cfg->sampthresh : eslMSAWEIGHT_SAMPTHRESH);     // if nseq > sampthresh, try to determine consensus on a subsample of seqs
980   int     filterpref  = (cfg? cfg->filterpref : eslMSAWEIGHT_FILT_CONSCOVER); // default preference rule is "conscover"
981   int   **ct          = NULL;     // matrix of symbol counts in each column. ct[apos=(0).1..alen][a=0..Kp-1]
982   int    *conscols    = NULL;     // list of consensus column indices [0..ncons-1]
983   double *sortwgt     = NULL;     // when pair of seqs is >= maxid, retain seq w/ higher <sortwgt>
984   int    *ranked_at   = NULL;     // sorted seq indices 0..nseq-1
985   int    *list        = NULL;     // array of seq indices in new msa
986   int    *useme       = NULL;     // useme[i] is TRUE if seq[i] is kept in new msa
987   int     ncons       = 0;        // number of consensus column indices in <conscols> list
988   int     nnew        = 0;        // how many seqs have been added to <list> and <useme> so far
989   int     apos, r, i;             // index over columns, ranked seq indices, seqs in <list>
990   double  ident;                  // pairwise fractional id (0..1)
991   int     status      = eslOK;
992 
993   ESL_DASSERT1(( msa->nseq >= 1 && msa->alen >= 1));
994   ESL_DASSERT1(( msa->flags & eslMSA_DIGITAL ));
995 
996   /* Allocations that we always need*/
997   ESL_ALLOC(sortwgt,   sizeof(double) * msa->nseq);
998   ESL_ALLOC(ranked_at, sizeof(double) * msa->nseq);
999   ESL_ALLOC(list,      sizeof(int)    * msa->nseq);
1000   ESL_ALLOC(useme,     sizeof(int)    * msa->nseq);
1001   esl_vec_ISet(useme, msa->nseq, 0); /* initialize array */
1002 
1003   /* Determine consensus columns with same procedure as advanced PB weights.
1004    *   1. If RF annotation is provided, use it.
1005    *         (...unless cfg->ignore_rf has been set TRUE)
1006    *   2. If it's a deep alignment, subsample and apply fragthresh/symfrac rules to the sample.
1007    *         (...unless cfg->allow_samp has been set FALSE)
1008    *   3. Otherwise, use standard rule on all sequences:
1009    *      collect observed counts in <ct> (applying fragthresh rule) and
1010    *      determine consensus (applying symfrac rule)
1011    *   4. If all else fails, define all columns as consensus.
1012    *
1013    * We only need to do this if we're using the default "conscover" rule.
1014    */
1015   if (filterpref == eslMSAWEIGHT_FILT_CONSCOVER)
1016     {
1017       /* allocations only needed for consensus determination */
1018       ct = esl_mat_ICreate( msa->alen+1, msa->abc->Kp );      // (0).1..alen; 0..Kp-1
1019       ESL_ALLOC(conscols,  sizeof(int)    * msa->alen);
1020 
1021       if      (! ignore_rf && msa->rf)                consensus_by_rf(msa, conscols, &ncons, NULL);
1022       else if (allow_samp  && msa->nseq > sampthresh) consensus_by_sample(cfg, msa, ct, conscols, &ncons, NULL);
1023       else {
1024 	collect_counts(cfg, msa, conscols, ncons, ct, NULL);
1025 	consensus_by_all(cfg, msa, ct, conscols, &ncons, NULL);
1026       }
1027       if (!ncons) {
1028 	for (apos = 1; apos <= msa->alen; apos++) conscols[apos-1] = apos;
1029 	ncons = msa->alen;
1030       }
1031     }
1032 
1033   /* Assign preferences for which sequences to retain.
1034    * Higher <sortwgt> means higher preference.
1035    */
1036   if      (filterpref == eslMSAWEIGHT_FILT_CONSCOVER) set_preference_conscover(msa, conscols, ncons, sortwgt);
1037   else if (filterpref == eslMSAWEIGHT_FILT_RANDOM)    set_preference_randomly (cfg, msa->nseq,       sortwgt);
1038   else if (filterpref == eslMSAWEIGHT_FILT_ORIGORDER) set_preference_origorder(     msa->nseq,       sortwgt);
1039   else esl_fatal("bad filterpref");
1040 
1041   /* Sort sequence indices by these preferences.
1042    * <ranked_at[]> is a ranked list of sequence indices 0..nseq-1.
1043    */
1044   esl_quicksort(sortwgt, msa->nseq, sort_doubles_decreasing, ranked_at);
1045 
1046   /* Determine which seqs will be kept, favoring highest ranked ones.
1047    */
1048   for (r = 0; r < msa->nseq; r++)  // Try to include each sequence, starting with highest ranked ones
1049     {
1050       for (i = 0; i < nnew; i++)   // Test it against all the seqs we've already put in the list.
1051 	{
1052 	  if ((status = esl_dst_XPairId(msa->abc, msa->ax[ranked_at[r]], msa->ax[list[i]], &ident, NULL, NULL)) != eslOK) goto ERROR;
1053 	  if (ident >= maxid) break;
1054 	}
1055 
1056       if (i == nnew)  // if the seq made it past comparisons with all <nnew> current seqs in the list...
1057 	{
1058 	  list[nnew++]        = ranked_at[r];  // ... add it to the growing list.
1059 	  useme[ranked_at[r]] = TRUE;
1060 	}
1061     }
1062 
1063   /* Filter the input MSA.
1064    */
1065   if ((status = esl_msa_SequenceSubset(msa, useme, ret_newmsa)) != eslOK) goto ERROR;
1066 
1067  ERROR:
1068   free(useme);
1069   free(list);
1070   free(ranked_at);
1071   free(sortwgt);
1072   free(conscols);
1073   esl_mat_IDestroy(ct);
1074   return status;
1075 }
1076 
1077 
1078 
1079 /* Sorting function for the esl_quicksort() in the
1080  * esl_msaweight_IDFilter_adv() implementation below:
1081  * rank seqs in order of decreasing sortwgt.
1082  */
1083 static int
sort_doubles_decreasing(const void * data,int e1,int e2)1084 sort_doubles_decreasing(const void *data, int e1, int e2)
1085 {
1086   double *sortwgt = (double *) data;
1087   if (sortwgt[e1] > sortwgt[e2]) return -1;
1088   if (sortwgt[e1] < sortwgt[e2]) return 1;
1089   return 0;
1090 }
1091 
1092 /* Preference-setting function for which seqs we prefer to retain. The
1093  * default "conscover" rule counts how many consensus columns are
1094  * covered by the "span" of this sequence from its first to last
1095  * residue.
1096  *    - set lpos, rpos to the column coord (1..alen) of the leftmost,
1097  *      rightmost residue in this seq;
1098  *    - count how many consensus columns are covered by that span.
1099  *
1100  * This is not the same as counting how many consensus columns have
1101  * a residue in them. We're trying not to bias (too much) against
1102  * deletions when we select representative sequences.
1103  */
1104 static int
set_preference_conscover(const ESL_MSA * msa,const int * conscols,int ncons,double * sortwgt)1105 set_preference_conscover(const ESL_MSA *msa, const int *conscols, int ncons, double *sortwgt)
1106 {
1107   int idx, apos, lpos, rpos, j;
1108 
1109   for (idx = 0; idx < msa->nseq; idx++)
1110     {
1111       for (lpos = 1;         lpos <= msa->alen; lpos++) if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][lpos])) break;
1112       for (rpos = msa->alen; rpos >= 1;         rpos--) if (esl_abc_XIsResidue(msa->abc, msa->ax[idx][rpos])) break;
1113 
1114       sortwgt[idx] = 0.;
1115       for (j = 0; j < ncons && conscols[j] <= rpos; j++)
1116 	{
1117 	  apos = conscols[j];
1118 	  if (apos < lpos) continue;
1119 	  sortwgt[idx] += 1.;
1120 	}
1121     }
1122   return eslOK;
1123 }
1124 
1125 /* Preference-setting function where we assign random preference.
1126  */
1127 static int
set_preference_randomly(const ESL_MSAWEIGHT_CFG * cfg,int nseq,double * sortwgt)1128 set_preference_randomly(const ESL_MSAWEIGHT_CFG *cfg, int nseq, double *sortwgt)
1129 {
1130   ESL_RAND64 *rng = (cfg? esl_rand64_Create(cfg->seed) : esl_rand64_Create(eslMSAWEIGHT_RNGSEED));
1131   int         idx;
1132 
1133   for (idx = 0; idx < nseq; idx++)
1134     sortwgt[idx] = esl_rand64_double(rng);
1135 
1136   esl_rand64_Destroy(rng);
1137   return eslOK;
1138 }
1139 
1140 /* Preference-setting function that favors the earlier sequence
1141  * in the alignment; same as the rule used for text mode alignments.
1142  */
1143 static int
set_preference_origorder(int nseq,double * sortwgt)1144 set_preference_origorder(int nseq, double *sortwgt)
1145 {
1146   int idx;
1147 
1148   for (idx = 0; idx < nseq; idx++)
1149     sortwgt[idx] = (double) (nseq - idx); // sets them to nseq..1
1150   return eslOK;
1151 }
1152 
1153 
1154 /* msaweight_IDFilter_txt()
1155  * %id filtering for text-mode MSAs.
1156  */
1157 static int
msaweight_IDFilter_txt(const ESL_MSA * msa,double maxid,ESL_MSA ** ret_newmsa)1158 msaweight_IDFilter_txt(const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa)
1159 {
1160   int     *list   = NULL;               /* array of seqs in new msa */
1161   int     *useme  = NULL;               /* TRUE if seq is kept in new msa */
1162   int      nnew;			/* number of seqs in new alignment */
1163   double   ident;                       /* pairwise percentage id */
1164   int      i,j;                         /* seqs counters*/
1165   int      remove;                      /* TRUE if sq is to be removed */
1166   int      status;
1167 
1168   ESL_DASSERT1(( msa ));
1169   ESL_DASSERT1(( msa->nseq >= 1 ));
1170   ESL_DASSERT1(( msa->alen >= 1 ));
1171   ESL_DASSERT1(( ! (msa->flags & eslMSA_DIGITAL )));
1172 
1173   /* allocate */
1174   ESL_ALLOC(list,  sizeof(int) * msa->nseq);
1175   ESL_ALLOC(useme, sizeof(int) * msa->nseq);
1176   esl_vec_ISet(useme, msa->nseq, 0); /* initialize array */
1177 
1178   /* find which seqs to keep (list) */
1179   nnew = 0;
1180   for (i = 0; i < msa->nseq; i++)
1181     {
1182       remove = FALSE;
1183       for (j = 0; j < nnew; j++)
1184 	{
1185 	  if (! (msa->flags & eslMSA_DIGITAL)) {
1186 	    if ((status = esl_dst_CPairId(msa->aseq[i], msa->aseq[list[j]], &ident, NULL, NULL))       != eslOK) goto ERROR;
1187 	  }
1188 	  else {
1189 	    if ((status = esl_dst_XPairId(msa->abc, msa->ax[i], msa->ax[list[j]], &ident, NULL, NULL)) != eslOK) goto ERROR;
1190 	  }
1191 
1192 	  if (ident >= maxid)
1193 	    {
1194 	      remove = TRUE;
1195 	      break;
1196 	    }
1197 	}
1198       if (remove == FALSE) {
1199 	list[nnew++] = i;
1200 	useme[i]     = TRUE;
1201       }
1202     }
1203   if ((status = esl_msa_SequenceSubset(msa, useme, ret_newmsa)) != eslOK) goto ERROR;
1204 
1205   free(list);
1206   free(useme);
1207   return eslOK;
1208 
1209  ERROR:
1210   if (list  != NULL) free(list);
1211   if (useme != NULL) free(useme);
1212   return status;
1213 }
1214 
1215 
1216 
1217 /*****************************************************************
1218  * 5. Benchmark
1219  *****************************************************************/
1220 #ifdef eslMSAWEIGHT_BENCHMARK
1221 /* gcc -g -Wall -o benchmark -I. -L. -DeslMSAWEIGHT_BENCHMARK esl_msaweight.c -leasel -lm
1222  * ./benchmark <MSA file>
1223  *
1224  * Script for benchmarks on Pfam:
1225  *     ./benchmark --gsc --maxN 4000 /misc/data0/databases/Pfam/Pfam-A.full
1226  *     ./benchmark --blosum          /misc/data0/databases/Pfam/Pfam-A.full
1227  *     ./benchmark --pb              /misc/data0/databases/Pfam/Pfam-A.full
1228  */
1229 #include "easel.h"
1230 #include "esl_getopts.h"
1231 #include "esl_msa.h"
1232 #include "esl_msafile.h"
1233 #include "esl_msaweight.h"
1234 #include "esl_vectorops.h"
1235 #include "esl_stopwatch.h"
1236 
1237 #define WGROUP "--blosum,--gsc,--pb"
1238 
1239 static ESL_OPTIONS options[] = {
1240     /* name     type         deflt   env   rng   togs    req      incmpt   help                          docgrp */
1241   { "-h",       eslARG_NONE, FALSE,  NULL, NULL, NULL,   NULL,      NULL, "show help and usage",             0 },
1242   { "--blosum", eslARG_NONE, FALSE,  NULL, NULL, WGROUP, NULL,      NULL, "use BLOSUM weights",              0 },
1243   { "--gsc",    eslARG_NONE,"default",NULL,NULL, WGROUP, NULL,      NULL, "use GSC weights",                 0 },
1244   { "--pb",     eslARG_NONE, FALSE,  NULL, NULL, WGROUP, NULL,      NULL, "use position-based weights",      0 },
1245   { "--id",     eslARG_REAL, "0.62", NULL,"0<=x<=1",NULL,"--blosum",NULL, "id threshold for --blosum",       0 },
1246   { "--maxN",   eslARG_INT,    "0",  NULL,"n>=0",  NULL,  NULL,     NULL, "skip alignments w/ > <n> seqs",   0 },
1247   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1248 };
1249 
1250 static char usage[] = "Usage: ./benchmark [-options] <msa_file>";
1251 
1252 int
main(int argc,char ** argv)1253 main(int argc, char **argv)
1254 {
1255   ESL_STOPWATCH *w;
1256   ESL_GETOPTS   *go;
1257   char          *msafile;
1258   ESL_MSAFILE   *afp;
1259   ESL_MSA       *msa;
1260   int            do_gsc;
1261   int            do_pb;
1262   int            do_blosum;
1263   int            maxN;
1264   double         maxid;
1265   double         cpu;
1266   int            status;
1267 
1268   /* Process command line
1269    */
1270   go = esl_getopts_Create(options);
1271   if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("failed to parse cmd line: %s", go->errbuf);
1272   if (esl_opt_VerifyConfig(go)               != eslOK) esl_fatal("failed to parse cmd line: %s", go->errbuf);
1273   if (esl_opt_GetBoolean(go, "-h") == TRUE) {
1274     puts(usage);
1275     puts("\n  where options are:");
1276     esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */
1277     return 0;
1278   }
1279   do_blosum = esl_opt_GetBoolean(go, "--blosum");
1280   do_gsc    = esl_opt_GetBoolean(go, "--gsc");
1281   do_pb     = esl_opt_GetBoolean(go, "--pb");
1282   maxid     = esl_opt_GetReal   (go, "--id");
1283   maxN      = esl_opt_GetInteger(go, "--maxN");
1284   if (esl_opt_ArgNumber(go) != 1) {
1285     puts("Incorrect number of command line arguments.");
1286     puts(usage);
1287     return 1;
1288   }
1289   if ((msafile = esl_opt_GetArg(go, 1)) == NULL) esl_fatal("failed to parse cmd line: %s", go->errbuf);
1290   esl_getopts_Destroy(go);
1291 
1292   w = esl_stopwatch_Create();
1293 
1294   /* Weight one or more alignments from input file
1295    */
1296   if ((status = esl_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK)
1297     esl_msafile_OpenFailure(afp, status);
1298 
1299   while ( (status = esl_msafile_Read(afp, &msa)) != eslEOF)
1300     {
1301       if (status != eslOK) esl_msafile_ReadFailure(afp, status);
1302       if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; }
1303 
1304       esl_stopwatch_Start(w);
1305 
1306       if      (do_gsc) 	  esl_msaweight_GSC(msa);
1307       else if (do_pb) 	  esl_msaweight_PB(msa);
1308       else if (do_blosum) esl_msaweight_BLOSUM(msa, maxid);
1309 
1310       esl_stopwatch_Stop(w);
1311       cpu = w->user;
1312       printf("%-20s %6d  %6d  %.3f\n", msa->name, msa->alen, msa->nseq, cpu);
1313       esl_msa_Destroy(msa);
1314     }
1315   esl_msafile_Close(afp);
1316 
1317   esl_stopwatch_Destroy(w);
1318   return eslOK;
1319 }
1320 #endif /* eslMSAWEIGHT_BENCHMARK */
1321 /*-------------------- end, benchmark  --------------------------*/
1322 
1323 
1324 
1325 /*****************************************************************
1326  * 6. Statistics driver
1327  *****************************************************************/
1328 #ifdef eslMSAWEIGHT_STATS
1329 /* gcc -g -Wall -o stats -I. -L. -DeslMSAWEIGHT_STATS esl_msaweight.c -leasel -lm
1330  * ./stats <MSA file>
1331  *
1332  * Script for weight statistics on Pfam:
1333  *     ./stats --gsc --maxN 4000 /misc/data0/databases/Pfam/Pfam-A.full
1334  *     ./stats --blosum          /misc/data0/databases/Pfam/Pfam-A.full
1335  *     ./stats --pb              /misc/data0/databases/Pfam/Pfam-A.full
1336  */
1337 #include "easel.h"
1338 #include "esl_getopts.h"
1339 #include "esl_msa.h"
1340 #include "esl_msafile.h"
1341 #include "esl_msaweight.h"
1342 #include "esl_vectorops.h"
1343 
1344 #define WGROUP "--blosum,--gsc,--pb"
1345 
1346 static ESL_OPTIONS options[] = {
1347     /* name     type         deflt   env   rng   togs    req      incmpt   help                          docgrp */
1348   { "-h",       eslARG_NONE, FALSE,  NULL, NULL, NULL,   NULL,      NULL, "show help and usage",             0 },
1349   { "--blosum", eslARG_NONE, FALSE,  NULL, NULL, WGROUP, NULL,      NULL, "use BLOSUM weights",              0 },
1350   { "--gsc",    eslARG_NONE,"default",NULL,NULL, WGROUP, NULL,      NULL, "use GSC weights",                 0 },
1351   { "--pb",     eslARG_NONE, FALSE,  NULL, NULL, WGROUP, NULL,      NULL, "use position-based weights",      0 },
1352   { "--id",     eslARG_REAL, "0.62", NULL,"0<=x<=1",NULL,"--blosum",NULL, "id threshold for --blosum",       0 },
1353   { "--maxN",   eslARG_INT,    "0",  NULL,"n>=0",  NULL,  NULL,     NULL, "skip alignments w/ > <n> seqs",   0 },
1354   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1355 };
1356 
1357 static char usage[] = "Usage: ./stats [-options] <msa_file>";
1358 
1359 int
main(int argc,char ** argv)1360 main(int argc, char **argv)
1361 {
1362   ESL_GETOPTS  *go;
1363   char         *msafile;
1364   ESL_MSAFILE  *afp;
1365   ESL_MSA      *msa;
1366   int           do_gsc;
1367   int           do_pb;
1368   int           do_blosum;
1369   int           maxN;
1370   double        maxid;
1371   int           nsmall, nbig;
1372   int           i;
1373   int           status;
1374 
1375   /* Process command line  */
1376   go = esl_getopts_Create(options);
1377   if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("%s", go->errbuf);
1378   if (esl_opt_VerifyConfig(go)               != eslOK) esl_fatal("%s", go->errbuf);
1379   if (esl_opt_GetBoolean(go, "-h") == TRUE){
1380     puts(usage);
1381     puts("\n  where options are:");
1382     esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */
1383     return 0;
1384   }
1385   do_blosum = esl_opt_GetBoolean(go, "--blosum");
1386   do_gsc    = esl_opt_GetBoolean(go, "--gsc");
1387   do_pb     = esl_opt_GetBoolean(go, "--pb");
1388   maxid     = esl_opt_GetReal   (go, "--id");
1389   maxN      = esl_opt_GetInteger(go, "--maxN");
1390   if (esl_opt_ArgNumber(go) != 1) {
1391     puts("Incorrect number of command line arguments.");
1392     puts(usage);
1393     return 1;
1394   }
1395   if ((msafile = esl_opt_GetArg(go, 1)) == NULL) esl_fatal("%s", go->errbuf);
1396   esl_getopts_Destroy(go);
1397 
1398   /* Weight one or more alignments from input file
1399    */
1400   if ((status = esl_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK)
1401     esl_msafile_OpenFailure(afp, status);
1402 
1403   while ( (status = esl_msafile_Read(afp, &msa)) != eslEOF)
1404     {
1405       if (status != eslOK) esl_msafile_ReadFailure(afp, status);
1406       if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; }
1407 
1408       if      (do_gsc) 	  esl_msaweight_GSC(msa);
1409       else if (do_pb) 	  esl_msaweight_PB(msa);
1410       else if (do_blosum) esl_msaweight_BLOSUM(msa, maxid);
1411 
1412       for (nsmall = 0, nbig = 0, i = 0; i < msa->nseq; i++) {
1413 	if (msa->wgt[i] < 0.2) nsmall++;
1414 	if (msa->wgt[i] > 5.0) nbig++;
1415       }
1416 
1417       printf("%-20s  %5d %5d %8.4f  %8.4f  %5d  %5d\n",
1418 	     msa->name,
1419 	     msa->nseq,
1420 	     msa->alen,
1421 	     esl_vec_DMin(msa->wgt, msa->nseq),
1422 	     esl_vec_DMax(msa->wgt, msa->nseq),
1423 	     nsmall,
1424 	     nbig);
1425       esl_msa_Destroy(msa);
1426     }
1427   esl_msafile_Close(afp);
1428   return eslOK;
1429 }
1430 #endif /* eslMSAWEIGHT_STATS */
1431 /*---------------- end, statistics driver  ----------------------*/
1432 
1433 
1434 
1435 
1436 /*****************************************************************
1437  * 7. Unit tests
1438  *****************************************************************/
1439 #ifdef eslMSAWEIGHT_TESTDRIVE
1440 
1441 #include "esl_msafile.h"
1442 
1443 /* GSC weighting test on text-mode alignment <msa>, where we expect
1444  * the weights to be <expect[0]..expect[nseq-1]>.
1445  *
1446  * If <abc> is non-NULL, digitize alignment and run test again in
1447  * digital alignment mode.
1448  *
1449  * On failure, caller utest provides <msg> as the error msg to print.
1450  */
1451 static void
do_GSC(ESL_ALPHABET * abc,ESL_MSA * msa,double * expect,char * msg)1452 do_GSC(ESL_ALPHABET *abc, ESL_MSA *msa, double *expect, char *msg)
1453 {
1454   /* text mode */
1455   if (esl_msaweight_GSC(msa)                               != eslOK) esl_fatal(msg);
1456   if (esl_vec_DCompare(msa->wgt, expect, msa->nseq, 0.001) != eslOK) esl_fatal(msg);
1457 
1458   /* digital mode */
1459   if (abc)
1460     {
1461       if (esl_msa_Digitize(abc, msa, NULL)                     != eslOK) esl_fatal(msg);
1462       if (esl_msaweight_GSC(msa)                               != eslOK) esl_fatal(msg);
1463       if (esl_vec_DCompare(msa->wgt, expect, msa->nseq, 0.001) != eslOK) esl_fatal(msg);
1464       if (esl_msa_Textize(msa)                                 != eslOK) esl_fatal(msg);
1465     }
1466 }
1467 
1468 /* As above, but for default PB weights */
1469 static void
do_PB(ESL_ALPHABET * abc,ESL_MSA * msa,double * expect,char * msg)1470 do_PB(ESL_ALPHABET *abc, ESL_MSA *msa, double *expect, char *msg)
1471 {
1472   if (esl_msaweight_PB(msa)                                != eslOK) esl_fatal(msg);
1473   if (esl_vec_DCompare(msa->wgt, expect, msa->nseq, 0.001) != eslOK) esl_fatal(msg);
1474 
1475   if (abc)
1476     {
1477       if (esl_msa_Digitize(abc, msa, NULL)                     != eslOK) esl_fatal(msg);
1478       if (esl_msaweight_PB(msa)                                != eslOK) esl_fatal(msg);
1479       if (esl_vec_DCompare(msa->wgt, expect, msa->nseq, 0.001) != eslOK) esl_fatal(msg);
1480       if (esl_msa_Textize(msa)                                 != eslOK) esl_fatal(msg);
1481     }
1482 }
1483 
1484 /* As above, but for BLOSUM weights with fractional identity threshold <maxid> */
1485 static void
do_BLOSUM(ESL_ALPHABET * abc,ESL_MSA * msa,double maxid,double * expect,char * msg)1486 do_BLOSUM(ESL_ALPHABET *abc, ESL_MSA *msa, double maxid, double *expect, char *msg)
1487 {
1488   if (esl_msaweight_BLOSUM(msa, maxid)                     != eslOK) esl_fatal(msg);
1489   if (esl_vec_DCompare(msa->wgt, expect, msa->nseq, 0.001) != eslOK) esl_fatal(msg);
1490 
1491   if (abc)
1492     {
1493       if (esl_msa_Digitize(abc, msa, NULL)                     != eslOK) esl_fatal(msg);
1494       if (esl_msaweight_BLOSUM(msa, maxid)                     != eslOK) esl_fatal(msg);
1495       if (esl_vec_DCompare(msa->wgt, expect, msa->nseq, 0.001) != eslOK) esl_fatal(msg);
1496       if (esl_msa_Textize(msa)                                 != eslOK) esl_fatal(msg);
1497     }
1498 }
1499 
1500 /* utest_wgt_identical_seqs
1501  * For all identical sequences, weights are uniform, all 1.0.
1502  */
1503 static void
utest_identical_seqs(void)1504 utest_identical_seqs(void)
1505 {
1506   char          msg[]      = "identical_seqs test failed";
1507   ESL_MSA      *msa        = esl_msa_CreateFromString("# STOCKHOLM 1.0\n\nseq1 AAAAA\nseq2 AAAAA\nseq3 AAAAA\nseq4 AAAAA\nseq5 AAAAA\n//\n", eslMSAFILE_STOCKHOLM);
1508   ESL_ALPHABET *aa_abc     = esl_alphabet_Create(eslAMINO);
1509   ESL_ALPHABET *nt_abc     = esl_alphabet_Create(eslDNA);
1510   double        uniform[5] = { 1.0, 1.0, 1.0, 1.0, 1.0 };
1511 
1512   do_GSC   (aa_abc, msa,       uniform, msg);
1513   do_GSC   (nt_abc, msa,       uniform, msg);
1514   do_PB    (aa_abc, msa,       uniform, msg);
1515   do_PB    (nt_abc, msa,       uniform, msg);
1516   do_BLOSUM(aa_abc, msa, 0.62, uniform, msg);
1517   do_BLOSUM(nt_abc, msa, 0.62, uniform, msg);
1518 
1519   esl_alphabet_Destroy(nt_abc);
1520   esl_alphabet_Destroy(aa_abc);
1521   esl_msa_Destroy(msa);
1522 }
1523 
1524 /* The "contrived" example of [Henikoff94b].
1525  * "Correct" solution is 1==2, 3==4, 5==2x others.
1526  * PB, GSC, BLOSUM weights all agree.
1527  */
1528 static void
utest_henikoff_contrived(void)1529 utest_henikoff_contrived(void)
1530 {
1531   char          msg[]      = "henikoff_contrived test failed";
1532   ESL_MSA      *msa        = esl_msa_CreateFromString("# STOCKHOLM 1.0\n\nseq1 AAAAA\nseq2 AAAAA\nseq3 CCCCC\nseq4 CCCCC\nseq5 TTTTT\n//\n", eslMSAFILE_STOCKHOLM);
1533   ESL_ALPHABET *aa_abc     = esl_alphabet_Create(eslAMINO);
1534   ESL_ALPHABET *nt_abc     = esl_alphabet_Create(eslDNA);
1535   double        ewgt[5]  = { 0.833333, 0.833333, 0.833333, 0.833333, 1.66667 };
1536 
1537   do_GSC   (aa_abc, msa,       ewgt, msg);
1538   do_GSC   (nt_abc, msa,       ewgt, msg);
1539   do_PB    (aa_abc, msa,       ewgt, msg);
1540   do_PB    (nt_abc, msa,       ewgt, msg);
1541   do_BLOSUM(aa_abc, msa, 0.62, ewgt, msg);
1542   do_BLOSUM(nt_abc, msa, 0.62, ewgt, msg);
1543 
1544   esl_alphabet_Destroy(nt_abc);
1545   esl_alphabet_Destroy(aa_abc);
1546   esl_msa_Destroy(msa);
1547 }
1548 
1549 /* The "nitrogenase segments" example of [Henikoff94b].
1550  * PB, GSC, BLOSUM wgts differ on this example.
1551  */
1552 static void
utest_nitrogenase(void)1553 utest_nitrogenase(void)
1554 {
1555   char          msg[]   = "nitrogenase test failed";
1556   ESL_MSA      *msa     = esl_msa_CreateFromString("# STOCKHOLM 1.0\n\nNIFE_CLOPA GYVGS\nNIFD_AZOVI GFDGF\nNIFD_BRAJA GYDGF\nNIFK_ANASP GYQGG\n//\n", eslMSAFILE_STOCKHOLM);
1557   ESL_ALPHABET *aa_abc  = esl_alphabet_Create(eslAMINO);
1558   double        egsc[4] = { 1.125000, 0.875000, 0.875000, 1.125000 };
1559   double        epb[4]  = { 1.066667, 1.066667, 0.800000, 1.066667 };
1560   double        eblo[4] = { 1.333333, 0.666667, 0.666667, 1.333333 };
1561 
1562   do_GSC   (aa_abc, msa,       egsc, msg);
1563   do_PB    (aa_abc, msa,       epb,  msg);
1564   do_BLOSUM(aa_abc, msa, 0.62, eblo, msg);
1565 
1566   esl_alphabet_Destroy(aa_abc);
1567   esl_msa_Destroy(msa);
1568 }
1569 
1570 /* gerstein4 test:
1571  * alignment that makes the same distances as Figure 4 from [Gerstein94].
1572  */
1573 static void
utest_gerstein4(void)1574 utest_gerstein4(void)
1575 {
1576   char          msg[]   = "gerstein4 test failed";
1577   ESL_MSA      *msa     = esl_msa_CreateFromString("# STOCKHOLM 1.0\n\nA  AAAAAAAAAA\nB  TTAAAAAAAA\nC  ATAAAACCCC\nD  GGGAAGGGGG\n//\n", eslMSAFILE_STOCKHOLM);
1578   ESL_ALPHABET *aa_abc  = esl_alphabet_Create(eslAMINO);
1579   ESL_ALPHABET *nt_abc  = esl_alphabet_Create(eslDNA);
1580   double        egsc[4] = { 0.760870, 0.760870, 1.086957, 1.391304 };
1581   double        epb[4]  = { 0.800000, 0.800000, 1.000000, 1.400000 };
1582   double        eblo[4] = { 0.666667, 0.666667, 1.333333, 1.333333 };
1583   double        uni[4]  = { 1.0, 1.0, 1.0, 1.0 };
1584 
1585   do_GSC   (aa_abc, msa,       egsc, msg);
1586   do_GSC   (nt_abc, msa,       egsc, msg);
1587   do_PB    (aa_abc, msa,       epb,  msg);
1588   do_PB    (nt_abc, msa,       epb,  msg);
1589   do_BLOSUM(aa_abc, msa, 0.62, eblo, msg);
1590   do_BLOSUM(nt_abc, msa, 0.62, eblo, msg);
1591 
1592   /* BLOSUM weights have the peculiar property of going flat at maxid=0.0
1593    * (when everyone clusters) or maxid=1.0 (nobody clusters). Test that here.
1594    */
1595   do_BLOSUM(aa_abc, msa, 0.0,  uni, msg);
1596   do_BLOSUM(aa_abc, msa, 1.0,  uni, msg);
1597 
1598   esl_alphabet_Destroy(nt_abc);
1599   esl_alphabet_Destroy(aa_abc);
1600   esl_msa_Destroy(msa);
1601 }
1602 
1603 /* pathologs test
1604  * A gappy alignment where no seqs overlap, and weighting methods
1605  * have to resort to uniform weights.
1606  */
1607 static void
utest_pathologs(void)1608 utest_pathologs(void)
1609 {
1610   char          msg[]   = "pathologs test failed";
1611   ESL_MSA      *msa     = esl_msa_CreateFromString("# STOCKHOLM 1.0\n\nA  A----\nB  -C---\nC  --G--\nD  ---T-\nE  ----T\n//\n", eslMSAFILE_STOCKHOLM);
1612   ESL_ALPHABET *aa_abc  = esl_alphabet_Create(eslAMINO);
1613   ESL_ALPHABET *nt_abc  = esl_alphabet_Create(eslDNA);
1614   double        uni[5]  = { 1.0, 1.0, 1.0, 1.0, 1.0 };
1615 
1616   do_GSC   (aa_abc, msa,       uni, msg);
1617   do_GSC   (nt_abc, msa,       uni, msg);
1618   do_PB    (aa_abc, msa,       uni, msg);
1619   do_PB    (nt_abc, msa,       uni, msg);
1620   do_BLOSUM(aa_abc, msa, 0.62, uni, msg);
1621   do_BLOSUM(nt_abc, msa, 0.62, uni, msg);
1622 
1623   esl_alphabet_Destroy(nt_abc);
1624   esl_alphabet_Destroy(aa_abc);
1625   esl_msa_Destroy(msa);
1626 }
1627 
1628 
1629 /* Simple test of the %id filter.
1630  * seq1 and seq2 are 100% identical, but seq1 is shorter.
1631  * seq2 is preferred in the default conscover rule;
1632  * seq1 is preferred in the origorder rule;
1633  * either seq1 or seq2 will be selected with the random rule.
1634  */
1635 static void
utest_idfilter(void)1636 utest_idfilter(void)
1637 {
1638   char msg[] = "idfilter test failed";
1639   ESL_MSAWEIGHT_CFG *cfg  = esl_msaweight_cfg_Create();
1640   ESL_ALPHABET      *abc  = esl_alphabet_Create(eslAMINO);
1641   ESL_MSA           *msa  = esl_msa_CreateFromString("# STOCKHOLM 1.0\n\nseq1  ..AAAAAAAA\nseq2  AAAAAAAAAA\nseq3  CCCCCCCCCC\nseq4  GGGGGGGGGG\n//\n", eslMSAFILE_STOCKHOLM);
1642   ESL_MSA           *msa2 = NULL;
1643 
1644   /* Default text mode rule is origorder */
1645   if (esl_msaweight_IDFilter(msa, 1.0, &msa2)           != eslOK) esl_fatal(msg);
1646   if (msa2->nseq != 3 || strcmp(msa2->sqname[0], "seq1") != 0)     esl_fatal(msg);
1647   esl_msa_Destroy(msa2);
1648 
1649   /* Default digital mode rule is conscover */
1650   if (esl_msa_Digitize(abc, msa, NULL)                  != eslOK) esl_fatal(msg);
1651   if (esl_msaweight_IDFilter(msa, 1.0, &msa2)           != eslOK) esl_fatal(msg);
1652   if (msa2->nseq != 3 || strcmp(msa2->sqname[0], "seq2") != 0)     esl_fatal(msg);
1653   esl_msa_Destroy(msa2);
1654 
1655   /* Ditto with _adv too */
1656   if (esl_msaweight_IDFilter_adv(cfg, msa, 1.0, &msa2)  != eslOK) esl_fatal(msg);
1657   if (msa2->nseq != 3 || strcmp(msa2->sqname[0], "seq2") != 0)     esl_fatal(msg);
1658   esl_msa_Destroy(msa2);
1659 
1660   /* Custom setting _adv to origorder */
1661   cfg->filterpref = eslMSAWEIGHT_FILT_ORIGORDER;
1662   if (esl_msaweight_IDFilter_adv(cfg, msa, 1.0, &msa2)  != eslOK) esl_fatal(msg);
1663   if (msa2->nseq != 3 || strcmp(msa2->sqname[0], "seq1") != 0)     esl_fatal(msg);
1664   esl_msa_Destroy(msa2);
1665 
1666   /* Custom setting _adv to randorder */
1667   cfg->filterpref = eslMSAWEIGHT_FILT_RANDOM;
1668   if (esl_msaweight_IDFilter_adv(cfg, msa, 1.0, &msa2)  != eslOK) esl_fatal(msg);
1669   if (msa2->nseq != 3)                                            esl_fatal(msg);
1670   if (strcmp(msa2->sqname[0], "seq1") != 0 && strcmp(msa2->sqname[0], "seq2") != 0) esl_fatal(msg);
1671   esl_msa_Destroy(msa2);
1672 
1673   esl_msaweight_cfg_Destroy(cfg);
1674   esl_alphabet_Destroy(abc);
1675   esl_msa_Destroy(msa);
1676 }
1677 
1678 #endif /*eslMSAWEIGHT_TESTDRIVE*/
1679 /*-------------------- end, unit tests  -------------------------*/
1680 
1681 
1682 
1683 /*****************************************************************
1684  * 8. Test driver
1685  *****************************************************************/
1686 #ifdef eslMSAWEIGHT_TESTDRIVE
1687 
1688 #include "easel.h"
1689 #include "esl_getopts.h"
1690 #include "esl_msa.h"
1691 #include "esl_msafile.h"
1692 #include "esl_msaweight.h"
1693 
1694 static ESL_OPTIONS options[] = {
1695   /* name           type      default  env  range toggles reqs incomp  help                                       docgroup*/
1696   { "-h",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",             0 },
1697   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1698 };
1699 static char usage[]  = "[-options]";
1700 static char banner[] = "test driver for msaweight module";
1701 
1702 int
main(int argc,char ** argv)1703 main(int argc, char **argv)
1704 {
1705   ESL_GETOPTS  *go   = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
1706 
1707   fprintf(stderr, "## %s\n", argv[0]);
1708 
1709   utest_identical_seqs();
1710   utest_henikoff_contrived();
1711   utest_nitrogenase();
1712   utest_gerstein4();
1713   utest_pathologs();
1714 
1715   utest_idfilter();
1716 
1717   fprintf(stderr, "#  status = ok\n");
1718 
1719   esl_getopts_Destroy(go);
1720   exit(0);
1721 }
1722 #endif /*eslMSAWEIGHT_TESTDRIVE*/
1723 /*-------------------- end, test driver  -------------------------*/
1724 
1725 
1726 
1727 
1728 
1729 /*****************************************************************
1730  * 9. Example
1731  *****************************************************************/
1732 #ifdef eslMSAWEIGHT_EXAMPLE
1733 
1734 #include "easel.h"
1735 #include "esl_getopts.h"
1736 #include "esl_msa.h"
1737 #include "esl_msafile.h"
1738 #include "esl_msaweight.h"
1739 
1740 static ESL_OPTIONS options[] = {
1741   /* name             type          default                           env  range       toggles reqs incomp             help                                                    docgroup*/
1742   { "-h",            eslARG_NONE,   FALSE,                            NULL, NULL,       NULL,  NULL, NULL,            "show brief help on version and usage",                      1 },
1743   { "--informat",    eslARG_STRING, NULL,                             NULL, NULL,       NULL,  NULL, NULL,            "specify the input MSA file is in format <s>",               1 },
1744   { "--dna",         eslARG_NONE,   FALSE,                            NULL, NULL,       NULL,  NULL, NULL,            "use DNA alphabet",                                          1 },
1745   { "--rna",         eslARG_NONE,   FALSE,                            NULL, NULL,       NULL,  NULL, NULL,            "use RNA alphabet",                                          1 },
1746   { "--amino",       eslARG_NONE,   FALSE,                            NULL, NULL,       NULL,  NULL, NULL,            "use protein alphabet",                                      1 },
1747 
1748   { "--ignore-rf",   eslARG_NONE,   eslMSAWEIGHT_IGNORE_RF,           NULL, NULL,       NULL,  NULL, NULL,            "ignore any RF line; always determine our own consensus",    2 },
1749   { "--fragthresh",  eslARG_REAL,   ESL_STR(eslMSAWEIGHT_FRAGTHRESH), NULL, "0<=x<=1",  NULL,  NULL, NULL,            "seq is fragment if aspan/alen < fragthresh",                2 },	// 0.0 = no fragments; 1.0 = everything is a frag except 100% full-span aseq
1750   { "--symfrac",     eslARG_REAL,   ESL_STR(eslMSAWEIGHT_SYMFRAC),    NULL, "0<=x<=1",  NULL,  NULL, NULL,            "col is consensus if nres/(nres+ngap) >= symfrac",           2 },	// 0.0 = all cols are consensus; 1.0 = only 100% all-residue cols are consensus
1751 
1752   { "--no-sampling", eslARG_NONE,   FALSE,                            NULL, NULL,       NULL,  NULL, NULL,            "never use subsampling to determine consensus",              3 },
1753   { "--nsamp",       eslARG_INT,    ESL_STR(eslMSAWEIGHT_NSAMP),      NULL, "n>=0",     NULL,  NULL, "--no-sampling", "number of seqs to sample (if using sampling)",              3 },
1754   { "--sampthresh",  eslARG_INT,    ESL_STR(eslMSAWEIGHT_SAMPTHRESH), NULL, "n>=0",     NULL,  NULL, "--no-sampling", "switch to using sampling when nseq > nsamp",                3 },
1755   { "--maxfrag",     eslARG_INT,    ESL_STR(eslMSAWEIGHT_MAXFRAG),    NULL, "n>=0",     NULL,  NULL, "--no-sampling", "if sample has > maxfrag fragments, don't use sample",       3 },
1756   { "-s",            eslARG_INT,    ESL_STR(eslMSAWEIGHT_RNGSEED),    NULL, "n>=0",     NULL,  NULL, NULL,            "set random number seed to <n>",                             3 },
1757 
1758   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1759 };
1760 
1761 static char usage[]  = "[-options] <msafile>";
1762 static char banner[] = "esl_msaweight example";
1763 
1764 int
main(int argc,char ** argv)1765 main(int argc, char **argv)
1766 {
1767   ESL_GETOPTS  *go           = esl_getopts_Create(options);
1768   char         *msafile      = NULL;
1769   int           infmt        = eslMSAFILE_UNKNOWN;
1770   ESL_MSAWEIGHT_CFG *cfg     = esl_msaweight_cfg_Create();
1771   ESL_MSAWEIGHT_DAT *dat     = esl_msaweight_dat_Create();
1772   ESL_ALPHABET *abc          = NULL;
1773   ESL_MSAFILE  *afp          = NULL;
1774   ESL_MSA      *msa          = NULL;
1775   int           nali         = 0;
1776   int           idx;
1777   int           status;
1778 
1779   /* Process command line and options
1780    */
1781   if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
1782       esl_opt_VerifyConfig(go)               != eslOK)
1783     esl_fatal("Failed to parse command line: %s\n", go->errbuf);
1784   if (esl_opt_GetBoolean(go, "-h") == TRUE)
1785     {
1786       esl_banner(stdout, argv[0], banner);
1787       esl_usage (stdout, argv[0], usage);
1788       puts("\noptions:");
1789       esl_opt_DisplayHelp(stdout, go, 1, 2, 80); /* 1=docgroup; 2=indentation; 80=width */
1790       puts("\noptions for deriving consensus:");
1791       esl_opt_DisplayHelp(stdout, go, 2, 2, 80); /* 1=docgroup; 2=indentation; 80=width */
1792       puts("\noptions for deriving consensus by sampling (on deep MSAs):");
1793       esl_opt_DisplayHelp(stdout, go, 3, 2, 80); /* 1=docgroup; 2=indentation; 80=width */
1794       return 0;
1795     }
1796 
1797   if (esl_opt_ArgNumber(go) != 1) esl_fatal("Incorrect number of command line arguments.\n%s\n", usage);
1798   msafile = esl_opt_GetArg(go, 1);
1799 
1800   if      (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);
1801   else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);
1802   else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
1803 
1804   if (esl_opt_IsOn(go, "--informat") &&
1805       (infmt = esl_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN)
1806     esl_fatal("%s is not a valid MSA file format for --informat", esl_opt_GetString(go, "--informat"));
1807 
1808   cfg->fragthresh =  esl_opt_GetReal   (go, "--fragthresh");
1809   cfg->symfrac    =  esl_opt_GetReal   (go, "--symfrac");
1810   cfg->ignore_rf  =  esl_opt_GetBoolean(go, "--ignore-rf");
1811   cfg->allow_samp = !esl_opt_GetBoolean(go, "--no-sampling");
1812   cfg->sampthresh =  esl_opt_GetInteger(go, "--sampthresh");
1813   cfg->nsamp      =  esl_opt_GetInteger(go, "--nsamp");
1814   cfg->maxfrag    =  esl_opt_GetInteger(go, "--maxfrag");
1815   cfg->seed       =  esl_opt_GetInteger(go, "-s");
1816 
1817   if ((status = esl_msafile_Open(&abc, msafile, NULL, infmt, NULL, &afp)) != eslOK)
1818     esl_msafile_OpenFailure(afp, status);
1819 
1820   while ((status = esl_msafile_Read(afp, &msa)) == eslOK)
1821     {
1822       nali++;
1823 
1824       if ((status = esl_msaweight_PB_adv(cfg, msa, dat)) != eslOK)
1825 	esl_fatal("weighting failed");
1826 
1827       for (idx = 0; idx < msa->nseq; idx++)
1828 	printf("%-25s %.4f\n", msa->sqname[idx], msa->wgt[idx]);
1829 
1830 
1831       if      (dat->cons_by_rf)     printf("# consensus by:             RF annotation\n");
1832       else if (dat->cons_by_sample) printf("# consensus by:             subsampling\n");
1833       else if (dat->cons_by_all)    printf("# consensus by:             standard rules\n");
1834       else if (dat->cons_allcols)   printf("# consensus by:             all columns\n");
1835 
1836       if (dat->rejected_sample)     printf("# (attempted sampling but rejected, too many frags\n");
1837       printf("# sampling allowed?         %s\n", cfg->allow_samp ? "yes" : "NO");
1838 
1839       printf("# fragthresh:               %.4f\n", cfg->fragthresh);
1840       printf("# symfrac:                  %.4f\n", cfg->symfrac);
1841       if (dat->cons_by_sample)
1842 	{
1843 	  printf("# Info on sampling:\n");
1844 	  printf("#    RNG seed:     %" PRIu64 "\n", dat->seed);
1845 	  printf("#    Sample size:  %d\n", cfg->nsamp);
1846 	  printf("#    Fragments:    %d  (<= maxfrag of %d)\n", dat->samp_nfrag, cfg->maxfrag);
1847 	}
1848 
1849       printf("# number of consensus cols: %d out of %d\n", dat->ncons,     (int) msa->alen);
1850       printf("# number of fragments:      %d out of %d\n", dat->all_nfrag, msa->nseq);
1851 
1852       esl_msa_Destroy(msa);
1853     }
1854   if (nali == 0 || status != eslEOF) esl_msafile_ReadFailure(afp, status); /* a convenience, like esl_msafile_OpenFailure() */
1855 
1856   esl_msaweight_cfg_Destroy(cfg);
1857   esl_msaweight_dat_Destroy(dat);
1858   esl_alphabet_Destroy(abc);
1859   esl_msafile_Close(afp);
1860   esl_getopts_Destroy(go);
1861   exit(0);
1862 }
1863 #endif /*eslMSAWEIGHT_EXAMPLE*/
1864