1 /* rnamat.c
2 *
3 * Routines for dealing with RNA subsitution/transition matrix.
4 *
5 * Robert J. Klein
6 * February 25, 2002
7 */
8 #include "esl_config.h"
9 #include "p7_config.h"
10 #include "config.h"
11
12 #include <stdlib.h>
13 #include <string.h>
14 #include <ctype.h>
15 #include <math.h>
16
17 #include "easel.h"
18 #include "esl_msa.h"
19 #include "esl_stack.h"
20 #include "esl_vectorops.h"
21 #include "esl_wuss.h"
22
23 #include "hmmer.h"
24
25 #include "infernal.h"
26
27 static matrix_t *setup_matrix (int size);
28 static float simple_identity(const ESL_ALPHABET *abc, char *s1, char *s2);
29 /*
30 static void print_matrix (FILE *fp, fullmat_t *fullmat);
31 static float get_min_alpha_beta_sum (fullmat_t *fullmat);
32 static void count_matrix (ESL_MSA *msa, fullmat_t *fullmat, double *background_nt, int cutoff_perc, int product_weights);
33 static int unpaired_res (int i);
34 */
35
36 /*
37 * Maps c as follows:
38 * A->0
39 * C->1
40 * G->2
41 * T->3
42 * U->3
43 * else->-1
44 */
numbered_nucleotide(char c)45 int numbered_nucleotide (char c) {
46 switch (c) {
47 case 'A':
48 case 'a':
49 return (0);
50 case 'C':
51 case 'c':
52 return (1);
53 case 'G':
54 case 'g':
55 return (2);
56 case 'T':
57 case 't':
58 case 'U':
59 case 'u':
60 return (3);
61 }
62 return (-1);
63 }
64
65 /*
66 * Maps base pair c,d as follows:
67 *
68 * AA -> 0
69 * AC -> 1
70 * ....
71 * TG -> 14
72 * TT -> 15 (T==U)
73 * Anything else maps to -1
74 */
numbered_basepair(char c,char d)75 int numbered_basepair (char c, char d) {
76 int c_num, d_num;
77 c_num = numbered_nucleotide (c);
78 d_num = numbered_nucleotide (d);
79 if (c_num < 0 || d_num < 0) {
80 return (-1);
81 } else {
82 return ((c_num << 2) | d_num);
83 }
84 }
85
86 /* Function: rjk_KHS2ct()
87 * Incept: SRE 29 Feb 2000 [Seattle]; from COVE 1.0 code
88 * Modified: RJK 27 Feb 2002 [St. Louis]; from Infernal code (rna_ops.c)
89 * Purpose: Convert a secondary structure string (0..len-1) to an array of
90 * integers representing what position each position is base-paired
91 * to (0..len-1) or -1 if none. This is a change from what Sean
92 * did in the Infernal code back towards the original way it was
93 * done in the Squid code (compstruct_main.c). In this case, the
94 * numbering scheme does not match Zuker's .ct files, but does
95 * match the way the MSA is stored using the SQUID library
96 * functions.
97 *
98 * This version does not allow pseudoknots. Thus ">" and "<" are
99 * used for base pairs, and all other characters, including white
100 * space, are taken to mean unpaired nucleotides.
101 *
102 * Return: ret_ct is allocated here and must be free'd by caller.
103 * Returns pointer to ret_ct, or NULL if ss is somehow inconsistent.
104 */
rjk_KHS2ct(char * ss,int len)105 int *rjk_KHS2ct(char *ss, int len) {
106 ESL_STACK *pda;
107 int *ct;
108 int pos, pair;
109 int status;
110
111 /* Initialization: always initialize the main pda (0),
112 */
113 pda = esl_stack_ICreate();
114
115 ESL_ALLOC(ct, len * sizeof(int));
116 for (pos = 0; pos < len; pos++)
117 ct[pos] = -1;
118
119 for (pos = 0; pos < len; pos++) {
120 if (!isprint(ss[pos])) { /* armor against garbage strings */
121 free (ct);
122 esl_stack_Destroy(pda);
123 return (NULL);
124 } else if (ss[pos] == '>') { /* left side of a pair: push onto stack */
125 if((status = esl_stack_IPush(pda, pos)) != eslOK) goto ERROR;
126 } else if (ss[pos] == '<') { /* right side of a pair; resolve pair */
127 if (esl_stack_IPop(pda, &pair) == eslEOD) {
128 free (ct);
129 esl_stack_Destroy(pda);
130 return (NULL);
131 } else {
132 if(status != eslOK) goto ERROR;
133 ct[pos] = pair;
134 ct[pair] = pos;
135 }
136 }
137 }
138 /* nothing should be left on stacks */
139 if (esl_stack_ObjectCount(pda) != 0) {
140 free (ct);
141 esl_stack_Destroy(pda);
142 return (NULL);
143 }
144 esl_stack_Destroy(pda);
145 return (ct);
146
147 ERROR:
148 cm_Fail("Memory allocation error.");
149 return NULL; /* never reached */
150 }
151
152 /*
153 * Setup the matrix by allocating matrix in two dimensions as triangle.
154 * Initialize to 0.0
155 */
setup_matrix(int size)156 matrix_t *setup_matrix (int size) {
157 int c;
158 matrix_t *mat;
159 int status;
160
161 ESL_ALLOC(mat, sizeof(matrix_t));
162 mat->edge_size = size;
163 mat->full_size = matrix_index((size-1),(size-1)) + 1;
164 mat->E = 0.0;
165 mat->H = 0.0;
166 ESL_ALLOC(mat->matrix, sizeof(double) * mat->full_size);
167 for (c=0; c<mat->full_size; c++) {
168 mat->matrix[c] = 0.0;
169 }
170 return(mat);
171
172 ERROR:
173 cm_Fail("Memory allocation error.");
174 return NULL; /* never reached */
175 }
176
177 /*
178 * middle_of_stem
179 *
180 * Boolean function, returns TRUE if the gap is in the middle of a stem,
181 * false if otherwise.
182 *
183 * Inputs:
184 * msa -- the msa
185 * i, j -- indices of the two seqs
186 * pos -- the position of the gap in question
187 * ct -- array of ct arrays
188 */
middle_of_stem(ESL_MSA * msa,int i,int j,int pos,int ** ct)189 int middle_of_stem (ESL_MSA *msa, int i, int j, int pos, int **ct) {
190 int gap_start_pos, gap_stop_pos;
191
192 if (esl_abc_CIsGap(msa->abc, msa->aseq[i][pos]) &&
193 esl_abc_CIsGap(msa->abc, msa->aseq[j][pos]))
194 /* Double gap -- exit */
195 return (0);
196
197 if (esl_abc_CIsGap(msa->abc, msa->aseq[j][pos])) {
198 /* Swap so gap is in i */
199 gap_start_pos = i;
200 i = j;
201 j = gap_start_pos;
202 }
203
204 /* Now, find start and end positions of the gap */
205 for (gap_start_pos = pos; gap_start_pos >= 0; gap_start_pos--) {
206 if (!esl_abc_CIsGap(msa->abc, msa->aseq[i][gap_start_pos]))
207 break;
208 }
209 for (gap_stop_pos = pos; gap_stop_pos < msa->alen; gap_stop_pos++) {
210 if (!esl_abc_CIsGap(msa->abc, msa->aseq[i][gap_stop_pos]))
211 break;
212 }
213 if (gap_start_pos < 0 || gap_start_pos >= msa->alen)
214 /* Gap at end of alignment; can't be internal to stem */
215 return (0);
216
217 if (ct[i][gap_start_pos] == 0 || ct[j][gap_start_pos] == 0 ||
218 ct[i][gap_stop_pos] == 0 || ct[j][gap_stop_pos] == 0)
219 /* Either of the ends not paired */
220 return (0);
221
222 if (ct[i][gap_start_pos] != ct[j][gap_start_pos] ||
223 ct[i][gap_stop_pos] != ct[j][gap_stop_pos])
224 /* The ends not paired homologously */
225 return (0);
226
227 return (1);
228 }
229
230 /* TAKEN FROM SQUID's weight.c's simple_distance, but rewritten to
231 * be simple_identity
232 * Function: simple_identity()
233 *
234 * Purpose: For two identical-length null-terminated strings, return
235 * the fractional identity between them. (0..1)
236 * (Gaps don't count toward anything.)
237 */
238 float
simple_identity(const ESL_ALPHABET * abc,char * s1,char * s2)239 simple_identity(const ESL_ALPHABET *abc, char *s1, char *s2)
240 {
241 int diff = 0;
242 int valid = 0;
243
244 for (; *s1 != '\0'; s1++, s2++)
245 {
246 if (esl_abc_CIsGap(abc, *s1) || esl_abc_CIsGap(abc, *s2)) continue;
247 if (*s1 == *s2) diff++;
248 valid++;
249 }
250 return (valid > 0 ? ((float) diff / (float) valid) : 0.0);
251 }
252
253 /*
254 * count_matrix
255 *
256 * Given an MSA and two matrices, counts pairs of column(s) from two sequences
257 * at a time into the matrices using the BLOSUM algorithm.
258 *
259 * Fills in paired matrix (for basepairs), unpaired, background nt count in
260 * aligned regions
261 *
262 * If product weights is false, weight of a pair is average of their weights.
263 * If it is true, weight of a pair is product of their weights
264 *
265 * Each nucleotide at each position can be:
266 * one of gap, unknown character (not ACGTU), known character
267 * one of left bp, right bp, not base paired
268 * If both characters are gaps:
269 * Skip
270 * If both characters are known:
271 * If both are left bps and match to same right bps
272 * continue
273 * If both are right bps and match to same left bps
274 * add to pairedmat
275 * Otherwise
276 * Add to unpairedmat
277 *
278 * Note: Never used in current implementation. Here for reference,
279 * in case new RIBOSUM matrices are trained someday.
280 * EPN, Tue Aug 7 15:10:33 2007
281 */
count_matrix(ESL_MSA * msa,fullmat_t * fullmat,double * background_nt,int cutoff_perc,int product_weights)282 void count_matrix (ESL_MSA *msa, fullmat_t *fullmat, double *background_nt,
283 int cutoff_perc, int product_weights) {
284
285 /* contract check */
286 if(msa->abc->type != eslRNA) cm_Fail("In count_matrix, MSA's alphabet is not RNA.");
287 if(msa->flags & eslMSA_DIGITAL) cm_Fail("In count_matrix, MSA must be text, not digitized.");
288
289 int status;
290 int i, j; /* Seqs we're checking */
291 int pos; /* Column we're counting */
292 int prev_pos; /* Last column we counted */
293 float cur_wgt;
294 int **ct; /* ct matrix for all the seqs */
295 #ifdef REPORT_COUNTED
296 int totpair = 0;
297 int totsing = 0;
298 #endif
299
300 /*****************************************
301 * 1. Allocate and fill in ct array
302 *****************************************/
303 ESL_ALLOC(ct, sizeof(int *)*msa->nseq);
304 if (msa->ss_cons != NULL) {
305 ct[0] = rjk_KHS2ct (msa->ss_cons, msa->alen);
306 } else {
307 ct[0] = rjk_KHS2ct (msa->ss[0], msa->alen);
308 }
309 for (i=1; i<msa->nseq; i++) {
310 if (msa->ss_cons != NULL) {
311 ct[i] = ct[0];
312 } else {
313 ct[i] = rjk_KHS2ct (msa->ss[i], msa->alen);
314 }
315 }
316 for (i=0; i<msa->nseq; i++) {
317 if (ct[i] == NULL) {
318 cm_Fail("CT string %d is NULL\n", i);
319 }
320 }
321
322 /**********************************
323 * 2. Count
324 **********************************/
325 for (i=0; i<msa->nseq; i++) {
326 for (j=0; j<i; j++) {
327 /* First, make sure it's above the cutoff */
328 if (simple_identity(msa->abc, msa->aseq[i], msa->aseq[j]) <
329 0.01*(float)cutoff_perc)
330 continue; /* Not above cutoff */
331
332 if (product_weights == 1) {
333 cur_wgt = msa->wgt[i] * msa->wgt[j];
334 } else {
335 cur_wgt = (msa->wgt[i] + msa->wgt[j])/2.;
336 }
337
338 /* First, skip starting double gaps */
339 for (prev_pos = 0; prev_pos <msa->alen &&
340 is_rna_gap (msa, i, prev_pos) && is_rna_gap (msa, j, prev_pos);
341 prev_pos++);
342
343 for (pos=prev_pos; pos<msa->alen; pos++) {
344 if (is_defined_rna_nucleotide(msa, i, pos) &&
345 is_defined_rna_nucleotide (msa, j, pos)) {
346 /* If both positions are defined nucleotides */
347 /* If both are left bps and match to same right bps, continue
348 If both are right bps and match to same left bps, add to \
349 pairedmat. Otherwise, add to unpairedmat */
350 if (ct[i][pos] >= 0 && ct[j][pos] >= 0) { /* Base pairs */
351 if (ct[i][pos] == ct[j][pos]) { /* Both equal */
352 if (is_defined_rna_nucleotide(msa, i, ct[i][pos]) && \
353 is_defined_rna_nucleotide (msa, j, ct[j][pos])) {
354 /* Both are RNA nucleotides */
355 if (pos < ct[i][pos] && pos <ct[j][pos]) { /* Both left bps */
356 continue;
357 } else { /* Both right bps */
358 fullmat->paired->matrix\
359 [matrix_index(numbered_basepair(msa->aseq[i][ct[i][pos]],
360 msa->aseq[i][pos]),
361 numbered_basepair(msa->aseq[j][ct[j][pos]],
362 msa->aseq[j][pos]))] +=
363 cur_wgt;
364 background_nt[numbered_nucleotide
365 (msa->aseq[i][ct[i][pos]])] += cur_wgt;
366 background_nt[numbered_nucleotide
367 (msa->aseq[i][pos])] += cur_wgt;
368 background_nt[numbered_nucleotide
369 (msa->aseq[j][ct[j][pos]])] += cur_wgt;
370 background_nt[numbered_nucleotide
371 (msa->aseq[j][pos])] += cur_wgt;
372 #ifdef REPORT_COUNTED
373 totpair++;
374 #endif
375 continue;
376 }
377 }
378 }
379 }
380
381 fullmat->unpaired->matrix
382 [matrix_index(numbered_nucleotide(msa->aseq[i][pos]),
383 numbered_nucleotide(msa->aseq[j][pos]))] += cur_wgt;
384 background_nt[numbered_nucleotide(msa->aseq[i][pos])] += cur_wgt;
385 background_nt[numbered_nucleotide(msa->aseq[j][pos])] += cur_wgt;
386 #ifdef REPORT_COUNTED
387 totsing++;
388 #endif
389 }
390 }
391 #ifdef REPORT_COUNTED
392 printf ("%d pairs counted\n%d singles counted\n", totpair, totsing);
393 totpair = 0;
394 totsing = 0;
395 #endif
396 }
397 }
398
399 /* Free ct arrays */
400 if (ct[0] == ct[1]) {
401 free (ct[0]);
402 } else {
403 for (i=0; i<msa->nseq; i++) {
404 free (ct[i]);
405 }
406 }
407 free (ct);
408 return;
409
410 ERROR:
411 cm_Fail("Memory allocation error.");
412 }
413
414 /*
415 * print_matrix
416 *
417 * Dumps the paired and unpaired matrices and gap penalties
418 */
print_matrix(FILE * fp,fullmat_t * fullmat)419 void print_matrix (FILE *fp, fullmat_t *fullmat)
420 {
421 int i, j;
422
423 fprintf (fp, "%s\n\n", fullmat->name);
424
425 /* EPN: print background NT frequencies */
426 fprintf (fp, " ");
427 for (i=0; i < fullmat->abc->K; i++) {
428 fprintf (fp, "%c ", fullmat->abc->sym[i]);
429 }
430 fprintf (fp, "\n");
431
432 fprintf (fp, " ");
433 for (i=0; i < fullmat->abc->K; i++) {
434 fprintf (fp, "%-11f ", fullmat->g[i]);
435 }
436 fprintf (fp, "\n\n");
437
438 fprintf (fp, " ");
439 for (i=0; i < fullmat->abc->K; i++) {
440 fprintf (fp, "%c ", fullmat->abc->sym[i]);
441 }
442 fprintf (fp, "\n");
443
444 for (i=0; i < fullmat->abc->K; i++) {
445 fprintf (fp, "%c ", fullmat->abc->sym[i]);
446 for (j=0; j<=i; j++) {
447 fprintf (fp, "%-11f ", fullmat->unpaired->matrix[matrix_index(fullmat->abc->inmap[(int) fullmat->abc->sym[i]], fullmat->abc->inmap[(int)fullmat->abc->sym[j]])]);
448 }
449 fprintf (fp, "\n");
450 }
451 if (strstr (fullmat->name, "RIBOPROB") == NULL) /* Not probability mat */
452 fprintf (fp, "H: %.4f\nE: %.4f\n", fullmat->unpaired->H, fullmat->unpaired->E);
453
454 fprintf (fp, "\n ");
455 for (i=0; i < (fullmat->abc->K * fullmat->abc->K); i++) {
456 fprintf (fp, "%c%c ", RNAPAIR_ALPHABET[i], RNAPAIR_ALPHABET2[i]);
457 }
458 fprintf (fp, "\n");
459 for (i=0; i < (fullmat->abc->K * fullmat->abc->K); i++) {
460 fprintf (fp, "%c%c ", RNAPAIR_ALPHABET[i], RNAPAIR_ALPHABET2[i]);
461 for (j=0; j<=i; j++) {
462 /* ORIGINAL: fprintf (fp, "%-9.2f ", fullmat->paired->matrix[matrix_index(numbered_basepair(RNAPAIR_ALPHABET[i], RNAPAIR_ALPHABET2[i]), numbered_basepair (RNAPAIR_ALPHABET[j], RNAPAIR_ALPHABET2[j]))]);*/
463 /*EPN: */
464 fprintf (fp, "%-11f ", fullmat->paired->matrix[matrix_index(numbered_basepair(RNAPAIR_ALPHABET[i], RNAPAIR_ALPHABET2[i]), numbered_basepair (RNAPAIR_ALPHABET[j], RNAPAIR_ALPHABET2[j]))]);
465 }
466 fprintf (fp, "\n");
467 }
468
469 if (strstr (fullmat->name, "RIBOPROB") == NULL) /* Not probability mat */
470 fprintf (fp, "H: %.4f\nE: %.4f\n", fullmat->paired->H, fullmat->paired->E);
471 fprintf (fp, "\n");
472 }
473
474 /*
475 * Read the matrix from a file.
476 * New EPN version, expects background freqs in file.
477 */
ReadMatrix(const ESL_ALPHABET * abc,FILE * matfp)478 fullmat_t *ReadMatrix(const ESL_ALPHABET *abc, FILE *matfp) {
479
480 /* Contract check */
481 if(abc->type != eslRNA)
482 cm_Fail("Trying to read RIBOSUM matrix from RSEARCH, but alphabet is not eslRNA.");
483
484 int status;
485 char linebuf[256];
486 char fullbuf[16384];
487 int fullbuf_used = 0;
488 fullmat_t *fullmat;
489 int i;
490 char *cp, *end_mat_pos;
491
492 ESL_ALLOC(fullmat, (sizeof(fullmat_t)));
493 fullmat->abc = abc; /* just a pointer */
494 fullmat->unpaired = setup_matrix (fullmat->abc->K);
495 fullmat->paired = setup_matrix (fullmat->abc->K * fullmat->abc->K);
496 ESL_ALLOC(fullmat->g, sizeof(float) * fullmat->abc->K);
497
498 while (fgets (linebuf, 255, matfp)) {
499 strncpy (fullbuf+fullbuf_used, linebuf, 16384-fullbuf_used-1);
500 fullbuf_used += strlen(linebuf);
501 if (fullbuf_used >= 16384) {
502 cm_Fail ("ERROR: Matrix file bigger than 16kb\n");
503 }
504 }
505
506 /* First, find RIBO, and copy matrix name to fullmat->name */
507 cp = strstr (fullbuf, "RIBO");
508 for (i = 0; cp[i] && !isspace(cp[i]); i++); /* Find space after RIBO */
509 ESL_ALLOC(fullmat->name, sizeof(char)*(i+1));
510 strncpy (fullmat->name, cp, i);
511 fullmat->name[i] = '\0';
512 cp = cp + i;
513 if(strstr (fullmat->name, "SUM")) { fullmat->scores_flag = TRUE; fullmat->probs_flag = FALSE; }
514 else if(strstr (fullmat->name, "PROB")) { fullmat->scores_flag = FALSE; fullmat->probs_flag = TRUE; }
515 else cm_Fail("ERROR reading matrix, name does not include SUM or PROB.\n");
516
517 /* Now, find the first A */
518 cp = strchr (cp, 'A');
519 fullmat->unpaired->edge_size = 0;
520 /* And count how edge size of the matrix */
521 while (*cp != '\n' && cp-fullbuf < fullbuf_used) {
522 if (!isspace (cp[0]) && isspace (cp[1])) {
523 fullmat->unpaired->edge_size++;
524 }
525 cp++;
526 }
527 /* EPN added to read background freqs to store in g vector */
528
529 /* Read background freqs until we hit the next A */
530 end_mat_pos = strchr (cp, 'A');
531 for (i=0; cp - fullbuf < end_mat_pos-fullbuf; i++) {
532 while (!isdigit(*cp) && *cp != '-' && *cp != '.' && \
533 cp-fullbuf < fullbuf_used && cp != end_mat_pos) {
534 cp++;
535 }
536 if (cp == end_mat_pos)
537 break;
538 if (cp-fullbuf < fullbuf_used) {
539 fullmat->g[i] = atof(cp);
540 while ((isdigit (*cp) || *cp == '-' || *cp == '.') &&\
541 (cp-fullbuf <fullbuf_used)) {
542 cp++;
543 }
544 }
545 }
546 /* we've read the background, normalize it */
547 esl_vec_FNorm(fullmat->g, fullmat->unpaired->edge_size);
548
549 /* We've already found the next A */
550 /* end EPN block */
551
552 /* Take numbers until we hit the H: */
553 end_mat_pos = strstr (cp, "H:");
554 for (i=0; cp - fullbuf < end_mat_pos-fullbuf; i++) {
555 while (!isdigit(*cp) && *cp != '-' && *cp != '.' && \
556 cp-fullbuf < fullbuf_used && cp != end_mat_pos) {
557 cp++;
558 }
559 if (cp == end_mat_pos)
560 break;
561 if (cp-fullbuf < fullbuf_used) {
562 fullmat->unpaired->matrix[i] = atof(cp);
563 while ((isdigit (*cp) || *cp == '-' || *cp == '.') &&\
564 (cp-fullbuf <fullbuf_used)) {
565 cp++;
566 }
567 }
568 }
569 fullmat->unpaired->full_size = i;
570
571 /* Skip the H: */
572 cp += 2;
573 fullmat->unpaired->H = atof(cp);
574
575 /* Now, go past the E: */
576 cp = strstr (cp, "E:") + 2;
577 fullmat->unpaired->E = atof(cp);
578
579 /********* PAIRED MATRIX ************/
580 /* Now, find the first A */
581 cp = strchr (cp, 'A');
582 fullmat->paired->edge_size = 0;
583 /* And count how edge size of the matrix */
584 while (*cp != '\n') {
585 if (!isspace (cp[0]) && isspace (cp[1])) {
586 fullmat->paired->edge_size++;
587 }
588 cp++;
589 }
590
591 /* Find next A */
592 while (*cp != 'A' && (cp-fullbuf) < fullbuf_used) cp++;
593
594 /* Take numbers until we hit the H: */
595 end_mat_pos = strstr (cp, "H:");
596 for (i=0; cp - fullbuf < end_mat_pos-fullbuf; i++) {
597 while (!isdigit(*cp) && *cp != '-' && *cp != '.' && \
598 cp-fullbuf < fullbuf_used && cp != end_mat_pos) {
599 cp++;
600 }
601 if (cp == end_mat_pos)
602 break;
603 if (cp-fullbuf < fullbuf_used) {
604 fullmat->paired->matrix[i] = atof(cp);
605 while ((isdigit (*cp) || *cp == '-' || *cp == '.') &&\
606 (cp-fullbuf <fullbuf_used)) {
607 cp++;
608 }
609 }
610 }
611 fullmat->paired->full_size = i;
612
613 /* Skip the H: */
614 cp += 2;
615 fullmat->paired->H = atof(cp);
616
617 /* Now, go past the E: */
618 cp = strstr (cp, "E:") + 2;
619 fullmat->paired->E = atof(cp);
620
621 /*print_matrix(stdout, fullmat);*/
622 return (fullmat);
623
624 ERROR:
625 cm_Fail("Memory allocation error.");
626 return NULL; /* never reached */
627 }
628
629 /*
630 * MatFileOpen
631 *
632 * Given name of matrix file, open it
633 *
634 */
MatFileOpen(char * matfile)635 FILE *MatFileOpen (char *matfile)
636 {
637 FILE *fp;
638
639 if (matfile == NULL)
640 return NULL;
641
642 fp = fopen (matfile, "r");
643 if (fp != NULL) return (fp);
644
645 return (NULL);
646 }
647
648 /*
649 * Function: get_min_alpha_beta_sum()
650 * Date: RJK, Mon Apr 29, 2002 [St. Louis]
651 * Purpose: Given a full matrix, reports minimum sum allowed
652 * for alpha and beta (or alpha' and beta')
653 * The maximum for alpha+beta is found by
654 * min Sp(i,j)(k,l) - max{Su(i,k), Su(j,l)}
655 * for all i,j,k.l
656 */
get_min_alpha_beta_sum(fullmat_t * fullmat)657 float get_min_alpha_beta_sum (fullmat_t *fullmat) {
658 float max_sum = 9999999.9; /* max allowed value of alpha+beta */
659 float cur_dif;
660 int i,j,k,l;
661 int pair_ij, pair_kl;
662
663 for (i=0; i<fullmat->abc->K; i++)
664 for (j=0; j<fullmat->abc->K; j++)
665 for (k=0; k<fullmat->abc->K; k++)
666 for (l=0; l<fullmat->abc->K; l++) {
667 pair_ij = numbered_basepair(fullmat->abc->sym[i], fullmat->abc->sym[j]);
668 pair_kl = numbered_basepair(fullmat->abc->sym[k], fullmat->abc->sym[l]);
669 /* First check for i,k paired */
670 cur_dif = fullmat->paired->matrix[matrix_index(pair_ij, pair_kl)] -
671 fullmat->unpaired->matrix[matrix_index(i,k)];
672 if (cur_dif < max_sum)
673 max_sum = cur_dif;
674 /* And repeat for j,l */
675 cur_dif = fullmat->paired->matrix[matrix_index(pair_ij, pair_kl)] -
676 fullmat->unpaired->matrix[matrix_index(j,l)];
677 if (cur_dif < max_sum)
678 max_sum = cur_dif;
679 }
680 return (-1. * max_sum);
681 }
682
683 /* EPN, Tue Feb 6 15:34:00 2007 */
FreeMat(fullmat_t * fullmat)684 void FreeMat(fullmat_t *fullmat)
685 {
686 if(fullmat->unpaired != NULL)
687 {
688 free(fullmat->unpaired->matrix);
689 free(fullmat->unpaired);
690 }
691 if(fullmat->paired != NULL)
692 {
693 free(fullmat->paired->matrix);
694 free(fullmat->paired);
695 }
696 if(fullmat->name != NULL)
697 free(fullmat->name);
698 if(fullmat->g != NULL)
699 free(fullmat->g);
700 free(fullmat);
701 }
702
703
704 /* Function: ribosum_calc_targets()
705 * Incept: EPN, Wed Mar 14 06:01:11 2007
706 *
707 * Purpose: Given a RIBOSUM score matrix data structure (fullmat_t) with
708 * log odds scores (as read in from a RIBOSUM file) and a background
709 * model (fullmat->g), overwrite the log-odds scores with target
710 * probabilities f_ij that satisfy:
711 *
712 * sum_ij f_ij = 1.0
713 *
714 * NOTE: these target probs are not the same target probs
715 * dumped by makernamat -p in RSEARCH, those *off-diagonals*
716 * are double what I'm calculating here.
717 * (so that sum_ii f'_ii + sum_j<i f'_ij = 1.0)
718 *
719 *
720 * Returns: <eslOK> on success.
721 *
722 */
ribosum_calc_targets(fullmat_t * fullmat)723 int ribosum_calc_targets(fullmat_t *fullmat)
724 {
725 int idx;
726 int a,b,i,j,k,l;
727
728 /* Check the contract. */
729 if(!(fullmat->scores_flag)) ESL_EXCEPTION(eslEINVAL, "in ribosum_calc_targets(), matrix is not in log odds mode");
730 if(fullmat->probs_flag) ESL_EXCEPTION(eslEINVAL, "in ribosum_calc_targets(), matrix is already in probs mode");
731
732 /*printf("\nbeginning of ribosum_calc_targets, printing mx:\n");
733 print_matrix(stdout, fullmat);*/
734
735 /* convert log odds score s_ij, to target (f_ij)
736 * using background freqs (g), by:
737 * f_ij = g_i * g_j * 2^{s_ij} */
738
739 /* first convert the unpaired (singlet) matrix,
740 * remember matrix is set up as a vector */
741 idx = 0;
742 for(i = 0; i < fullmat->abc->K; i++)
743 for(j = 0; j <= i; j++)
744 {
745 fullmat->unpaired->matrix[idx] =
746 fullmat->g[i] * fullmat->g[j] * sreEXP2(fullmat->unpaired->matrix[idx]);
747 idx++;
748 }
749 /* and the paired matrix, careful about for loops here, we
750 * use 4 nested ones just to keep track of which backgrounds to multiply
751 * (the g[i] * g[j] * g[k] * g[l] part) */
752 idx = 0;
753 for(a = 0; a < sizeof(RNAPAIR_ALPHABET)-1; a++)
754 for(b = 0; b <= a; b++)
755 {
756 i = a / fullmat->abc->K;
757 j = a % fullmat->abc->K;
758 k = b / fullmat->abc->K;
759 l = b % fullmat->abc->K;
760
761 fullmat->paired->matrix[idx] =
762 fullmat->g[i] * fullmat->g[j] * fullmat->g[k] * fullmat->g[l] *
763 sreEXP2(fullmat->paired->matrix[idx]);
764 idx++;
765 }
766
767 /* We have to be careful with normalizing the matrices b/c they are
768 * symmetric and fullmat_t only stores f_ij i<=j. So we double
769 * the f_ij if i!=j, normalize it (it should then sum to 1.)
770 * and then halve the f_ij i != j's. */
771 /* normalize the unpaired matrix */
772 idx = 0;
773 for(i = 0; i < fullmat->abc->K; i++)
774 for(j = 0; j <= i; j++)
775 {
776 if(i != j) fullmat->unpaired->matrix[idx] *= 2.;
777 idx++;
778 }
779 esl_vec_DNorm(fullmat->unpaired->matrix, fullmat->unpaired->full_size);
780 idx = 0;
781 for(i = 0; i < fullmat->abc->K; i++)
782 for(j = 0; j <= i; j++)
783 {
784 if(i != j) fullmat->unpaired->matrix[idx] *= 0.5;
785 idx++;
786 }
787
788 /* normalize the paired matrix */
789 idx = 0;
790 for(a = 0; a < sizeof(RNAPAIR_ALPHABET)-1; a++)
791 for(b = 0; b <= a; b++)
792 {
793 if(a != b) fullmat->paired->matrix[idx] *= 2.;
794 idx++;
795 }
796 esl_vec_DNorm(fullmat->paired->matrix, fullmat->paired->full_size);
797 idx = 0;
798 for(a = 0; a < sizeof(RNAPAIR_ALPHABET)-1; a++)
799 for(b = 0; b <= a; b++)
800 {
801 if(a != b) fullmat->paired->matrix[idx] *= 0.5;
802 idx++;
803 }
804
805 /* Lower the scores_flag, raise probs_flag */
806 fullmat->scores_flag = FALSE;
807 fullmat->probs_flag = TRUE;
808
809 /*printf("\nend of ribosum_calc_targets, printing mx:\n");
810 print_matrix(stdout, fullmat);*/
811 return eslOK;
812 }
813
814 /* Function: ribosum_MSA_resolve_degeneracies
815 *
816 * Incept: EPN, Thu Mar 15 05:37:22 2007
817 *
818 * Purpose: Given a RIBOSUM score matrix data structure (fullmat_t) with
819 * target probabilites and a MSA with SS markup, remove all
820 * ambiguous bases. Do this by selecting the most likely
821 * singlet or base pair that matches each ambiguity given the
822 * RIBOSUM matrix.
823 * (ex: (G|A) for 'R', or (GG|GC|AG|AC) bp for 'RS' base pair)
824 *
825 *
826 * Returns: <eslOK> on success.
827 *
828 * The degenerate code used here is:
829 * (taken from http://www.neb.com/neb/products/REs/RE_code.html
830 *
831 * X = A or C or G or T
832 * R = G or A
833 * Y = C or T
834 * M = A or C
835 * K = G or T
836 * S = G or C
837 * W = A or T
838 * H = not G (A or C or T)
839 * B = not A (C or G or T)
840 * V = not T (A or C or G)
841 * D = not C (A or G or T)
842 * N = A or C or G or T
843 */
ribosum_MSA_resolve_degeneracies(fullmat_t * fullmat,ESL_MSA * msa)844 int ribosum_MSA_resolve_degeneracies(fullmat_t *fullmat, ESL_MSA *msa)
845 {
846 int idx;
847 int i,j;
848 int *ct; /* 0..alen-1 base pair partners array */
849 int apos;
850 char c; /* tmp char for current degeneracy, uppercase */
851 char *cp; /* tmp char pointer for finding c in degen_string */
852 char c_m; /* tmp char for current bp mate's degeneracy, uppercase */
853 char *cp_m; /* tmp char pointer for finding c_m in degen_string */
854 char degen_string[13] = "XRYMKSWHBVDN\0";
855 char rna_string[5] = "ACGU\0";
856 int **degen_mx;
857 float *unpaired_marginals;
858 float *paired_marginals;
859 float *cur_unpaired_marginals;
860 float *cur_paired_marginals;
861 char *aseq;
862 int dpos; /* position of c within degen_string */
863 int dpos_m; /* position of c_m within degen_string */
864 int argmax;
865 int mate; /* used as ct[apos-1] */
866 int status;
867 ESL_ALPHABET *msa_abc = msa->abc; /* when we textize this, msa->abc will be set to NULL! */
868
869 /* Check the contract. */
870 if(!(fullmat->probs_flag)) ESL_EXCEPTION(eslEINVAL, "in ribosum_MSA_resolve_degeneracies(), matrix is not in probs mode");
871 if(fullmat->scores_flag) ESL_EXCEPTION(eslEINVAL, "in ribosum_MSA_resolve_degeneracies(), matrix is in scores mode");
872 if(msa->nseq != 1) ESL_EXCEPTION(eslEINVAL, "MSA does not have exactly 1 seq");
873 if(fullmat->abc->type != eslRNA) ESL_EXCEPTION(eslEINVAL, " matrix alphabet not RNA");
874 if(! (msa->flags & eslMSA_DIGITAL)) ESL_EXCEPTION(eslEINVAL, " MSA is not digitized");
875 if(msa->abc->type != eslRNA && msa->abc->type != eslDNA) ESL_EXCEPTION(eslEINVAL, " MSA alphabet not DNA or RNA");
876
877 /*printf("in ribosum_MSA_resolve_degeneracies()\n");*/
878
879 ESL_ALLOC(unpaired_marginals, sizeof(float) * fullmat->abc->K);
880 ESL_ALLOC(paired_marginals, sizeof(float) * (fullmat->abc->K * fullmat->abc->K));
881 ESL_ALLOC(cur_unpaired_marginals, sizeof(float) * fullmat->abc->K);
882 ESL_ALLOC(cur_paired_marginals, sizeof(float) * (fullmat->abc->K * fullmat->abc->K));
883
884 /* Laboriously fill in degen_mx, NOTE: this will fall over if alphabet is not RNA! */
885 /* This is somewhat unnec, now that we use esl_alphabet.c, but I didnt' want to redo it, so I left it */
886 ESL_ALLOC(degen_mx, sizeof(int *) * 12);
887 for(i = 0; i < 12; i++)
888 {
889 ESL_ALLOC(degen_mx[i], sizeof(int) * fullmat->abc->K);
890 esl_vec_ISet(degen_mx[i], fullmat->abc->K, 0.);
891 }
892 /* 'X' = A|C|G|U */
893 degen_mx[0][0] = degen_mx[0][1] = degen_mx[0][2] = degen_mx[0][3] = 1;
894 /* 'R' = A|G */
895 degen_mx[1][0] = degen_mx[1][2] = 1;
896 /* 'Y' = C|U */
897 degen_mx[2][1] = degen_mx[2][3] = 1;
898 /* 'M' = A|C */
899 degen_mx[3][0] = degen_mx[3][1] = 1;
900 /* 'K' = G|U */
901 degen_mx[4][2] = degen_mx[4][3] = 1;
902 /* 'S' = C|G */
903 degen_mx[5][1] = degen_mx[5][2] = 1;
904 /* 'W' = A|U */
905 degen_mx[6][0] = degen_mx[6][3] = 1;
906 /* 'H' = A|C|U */
907 degen_mx[7][0] = degen_mx[7][1] = degen_mx[7][3] = 1;
908 /* 'B' = C|G|U */
909 degen_mx[8][1] = degen_mx[8][2] = degen_mx[8][3] = 1;
910 /* 'V' = A|C|G */
911 degen_mx[9][0] = degen_mx[9][1] = degen_mx[9][2] = 1;
912 /* 'D' = A|G|U */
913 degen_mx[10][0] = degen_mx[10][2] = degen_mx[10][3] = 1;
914 /* 'N' = A|C|G|U */
915 degen_mx[11][0] = degen_mx[11][1] = degen_mx[11][2] = degen_mx[11][3] = 1;
916
917 /* calculate paired_marginals and unpaired_marginals as:
918 * marginal[x] = sum_y P(x,y)
919 */
920 esl_vec_FSet(unpaired_marginals, fullmat->abc->K, 0.);
921 esl_vec_FSet(paired_marginals, fullmat->abc->K * fullmat->abc->K, 0.);
922
923 for(i = 0; i < fullmat->abc->K; i++)
924 for(j = 0; j < fullmat->abc->K; j++)
925 unpaired_marginals[i] += fullmat->unpaired->matrix[matrix_index(i,j)];
926 idx = 0;
927 for(i = 0; i < (fullmat->abc->K*fullmat->abc->K); i++)
928 for(j = 0; j < (fullmat->abc->K*fullmat->abc->K); j++)
929 paired_marginals[i] += fullmat->paired->matrix[matrix_index(i,j)];
930
931 /*for(i = 0; i < (fullmat->abc->K); i++)
932 printf("unpaired_marginals[i:%d]: %f\n", i, unpaired_marginals[i]);
933 for(i = 0; i < (fullmat->abc->K*fullmat->abc->K); i++)
934 printf("paired_marginals[i:%d]: %f\n", i, paired_marginals[i]);*/
935
936 esl_vec_FNorm(unpaired_marginals, fullmat->abc->K);
937 esl_vec_FNorm(paired_marginals, fullmat->abc->K*fullmat->abc->K);
938
939 /* get ct array, indexed 1..alen while apos is 0..alen-1 */
940 ESL_ALLOC(ct, (msa->alen+1) * sizeof(int));
941 esl_wuss2ct(msa->ss_cons, msa->alen, ct);
942
943 ESL_ALLOC(aseq, sizeof(char) * (msa->alen+1));
944 status = esl_msa_Textize(msa);
945 if(status == eslECORRUPT) cm_Fail("esl_msa_Textize() returned status: %d, the msa must contain invalid digitized chars.", status);
946 else if(status != eslOK) goto ERROR;
947
948 /* remember we only have 1 seq in the MSA */
949 for(apos = 0; apos < msa->alen; apos++)
950 {
951 if (esl_abc_CIsGap(fullmat->abc, msa->aseq[0][apos])) continue; /* we can still have gaps in 1 seq MSA, they'll
952 * be dealt with (ignored) in
953 * modelmaker.c:HandModelmaker() */
954 mate = ct[(apos+1)]; /* apos is 0..alen-1, ct is 1..alen, so mate will be 1..alen now */
955 if(mate != 0 && esl_abc_CIsGap(msa_abc, msa->aseq[0][(mate-1)])) mate = 0;
956 /* apos is a base paired res, but mate is a gap, pretend apos is SS for our purposes here */
957 else if(mate != 0 && ((mate-1) < apos)) continue;
958 /* apos is a base paired res, but we've already changed him to an unambiguous res when we changed his mate (which was not a gap) */
959
960 c = toupper(msa->aseq[0][apos]);
961 if(c == 'T') c = 'U';
962 cp = strchr(rna_string, c);
963 if(cp == NULL)
964 {
965 /* a degeneracy */
966 if((cp = strchr(degen_string, c)) == NULL) ESL_XEXCEPTION(eslEINVAL, "character is not ACGTU or a recognized ambiguity code");
967 dpos = cp-degen_string;
968 if(mate == 0) /* single stranded */
969 {
970 /*printf("\nCASE 1 SS AMBIG\n");
971 printf("apos: %d c: %c\n", apos, c);*/
972 /* of possible residues, find the one with the highest marginal
973 * in RIBOSUM: argmax_x sum_Y P(x,y) */
974 for(i = 0; i < fullmat->abc->K; i++)
975 cur_unpaired_marginals[i] = degen_mx[dpos][i] * unpaired_marginals[i];
976 argmax = esl_vec_FArgMax(cur_unpaired_marginals, fullmat->abc->K);
977 msa->aseq[0][apos] = rna_string[argmax];
978 /*printf("c: %c at posn %d argmax: %d msa[apos:%d]: %c\n", c, (int) (cp-degen_string), argmax, apos, msa->aseq[0][apos]);
979 printf("new ss: %c\n", rna_string[argmax]);*/
980 }
981 else /* paired */
982 {
983 /* is mate ambiguous? */
984 c_m = toupper(msa->aseq[0][(mate-1)]);
985 if(c_m == 'T') c_m = 'U';
986 cp_m = strchr(rna_string, c_m);
987 if(cp_m == NULL)
988 {
989 /* mate is ambiguous */
990 /*printf("\nCASE 4 PAIR, BOTH AMBIG\n");
991 printf("left (apos) %d c: %c mate %d c_m: %c\n", apos, c, (mate-1), c_m);
992 */
993 if((cp_m = strchr(degen_string, c_m)) == NULL) ESL_XEXCEPTION(eslEINVAL, "character is not ACGTU or a recognized ambiguity code");
994 dpos_m = cp_m-degen_string;
995 /* we know that mate != 0 and (mate-1) >= apos, we continued above if that was false */
996 idx = 0;
997 for(i = 0; i < (fullmat->abc->K); i++)
998 for(j = 0; j < (fullmat->abc->K); j++)
999 {
1000 cur_paired_marginals[idx] = degen_mx[dpos][i] * degen_mx[dpos_m][j] *
1001 paired_marginals[idx];
1002 /*printf("degen_mx[dpos: %d][i:%d]: %d\n", dpos, i, degen_mx[dpos][i]);
1003 printf("degen_mx[dpos_m:%d][j:%d]: %d\n", dpos_m, j, degen_mx[dpos_m][j]);
1004 printf("cur_paired_marginals[idx:%d]: %f\n", idx, cur_paired_marginals[idx]);*/
1005 idx++;
1006 }
1007 argmax = esl_vec_FArgMax(cur_paired_marginals, (fullmat->abc->K*fullmat->abc->K));
1008 msa->aseq[0][apos] = RNAPAIR_ALPHABET[argmax];
1009 msa->aseq[0][(mate-1)] = RNAPAIR_ALPHABET2[argmax];
1010 /*printf("new bp: left: %c right: %c\n", RNAPAIR_ALPHABET[argmax], RNAPAIR_ALPHABET2[argmax]);*/
1011
1012 }
1013 else /* mate is unambiguous */
1014 {
1015 /*printf("\nCASE 2 PAIR, LEFT AMBIG, RIGHT NOT\n");
1016 printf("left (apos) %d c: %c mate %d c_m: %c\n", apos, c, (mate-1), c_m);*/
1017 cp_m = strchr(rna_string, c_m);
1018 dpos_m = cp_m - rna_string;
1019 /* cp_m is 0 for A, 1 for C, 2 for G, 3 for U in mate-1 */
1020 idx = 0;
1021 for(i = 0; i < (fullmat->abc->K); i++)
1022 for(j = 0; j < (fullmat->abc->K); j++)
1023 {
1024 cur_paired_marginals[idx] = degen_mx[dpos][i] * (j == dpos_m) *
1025 paired_marginals[idx];
1026 /*printf("cur_paired_marginals[idx:%d]: %f\n", idx, cur_paired_marginals[idx]);*/
1027 idx++;
1028 }
1029 argmax = esl_vec_FArgMax(cur_paired_marginals, (fullmat->abc->K*fullmat->abc->K));
1030 msa->aseq[0][apos] = RNAPAIR_ALPHABET[argmax];
1031 msa->aseq[0][(mate-1)] = RNAPAIR_ALPHABET2[argmax];
1032 /*printf("new bp: left: %c right: %c\n", RNAPAIR_ALPHABET[argmax], RNAPAIR_ALPHABET2[argmax]);*/
1033 }
1034 }
1035 }
1036 /* we could still have unambiguous apos, but an ambiguous mate, which we deal
1037 * with here: */
1038 if(mate != 0)
1039 {
1040 c_m = toupper(msa->aseq[0][(mate-1)]);
1041 if(c_m == 'T') c = 'U';
1042 cp_m = strchr(rna_string, c_m);
1043 if(cp_m == NULL)
1044 {
1045 /* mate is ambiguous */
1046 if((cp_m = strchr(degen_string, c_m)) == NULL) ESL_XEXCEPTION(eslEINVAL, "character is not ACGTU or a recognized ambiguity code");
1047 dpos_m = cp_m - degen_string;
1048 /*printf("\nCASE 3 PAIR, LEFT NOT, RIGHT AMBIG\n");
1049 printf("left (apos) %d c: %c mate %d c_m: %c dpos_m\n", apos, c, (mate-1), c_m, dpos);*/
1050 cp = strchr(rna_string, c);
1051 if(cp == NULL) ESL_XEXCEPTION(eslEINVAL, "character is not ACGTU or a recognized ambiguity code");
1052 dpos = cp - rna_string;
1053 idx = 0;
1054 for(i = 0; i < (fullmat->abc->K); i++)
1055 for(j = 0; j < (fullmat->abc->K); j++)
1056 {
1057 cur_paired_marginals[idx] = (i == dpos) * degen_mx[dpos_m][j] *
1058 paired_marginals[idx];
1059 /*printf("cur_paired_marginals[idx:%d]: %f\n", idx, cur_paired_marginals[idx]);*/
1060 idx++;
1061 }
1062 argmax = esl_vec_FArgMax(cur_paired_marginals, (fullmat->abc->K*fullmat->abc->K));
1063 msa->aseq[0][apos] = RNAPAIR_ALPHABET[argmax];
1064 msa->aseq[0][(mate-1)] = RNAPAIR_ALPHABET2[argmax];
1065 /*printf("new bp: left: %c right: %c\n", RNAPAIR_ALPHABET[argmax], RNAPAIR_ALPHABET2[argmax]);*/
1066 }
1067 }
1068 }
1069 /* go through the sequence again, there should be no ambiguities now */
1070 for(apos = 0; apos < msa->alen; apos++) {
1071 if(esl_abc_CIsGap(msa_abc, msa->aseq[0][apos])) continue;
1072 c = toupper(msa->aseq[0][apos]);
1073 if(c == 'T') c = 'U';
1074 cp = strchr(rna_string, c);
1075 if(cp == NULL) ESL_XEXCEPTION(eslEINVAL, "ribosum_MSA_resolve_degeneracies(), second pass check character %d (%c) is still ambiguous!\n", apos, c);
1076 }
1077
1078 if((status = esl_msa_Digitize(msa_abc, msa, NULL)) != eslOK) goto ERROR;
1079 free(unpaired_marginals);
1080 free(paired_marginals);
1081 free(cur_unpaired_marginals);
1082 free(cur_paired_marginals);
1083 for(i = 0; i < 12; i++) free(degen_mx[i]);
1084 free(degen_mx);
1085 free(ct);
1086 free(aseq);
1087
1088 return eslOK;
1089
1090 ERROR:
1091 return eslEINVAL;
1092 }
1093
1094 /*
1095 * Maps i as follows:
1096 * 0->A
1097 * 1->C
1098 * 2->G
1099 * 3->U
1100 * else->-1
1101 */
unpaired_res(int i)1102 int unpaired_res (int i)
1103 {
1104 switch (i) {
1105 case 0:
1106 return ('A');
1107 case 1:
1108 return ('C');
1109 case 2:
1110 return ('G');
1111 case 3:
1112 return ('U');
1113 }
1114 return (-1);
1115 }
1116
1117
1118