1 /* Formatting, transmitting, and printing single alignments to a
2 * profile.
3 *
4 * Contents:
5 * 1. The P7_ALIDISPLAY object.
6 * 2. The P7_ALIDISPLAY API.
7 * 3. Debugging/dev code.
8 * 4. Benchmark driver.
9 * 5. Unit tests.
10 * 6. Test driver.
11 * 7. Example.
12 */
13 #include "p7_config.h"
14
15 #include <stdlib.h>
16 #include <stdio.h>
17 #include <string.h>
18 #include <ctype.h>
19 #include <string.h>
20
21 #include "easel.h"
22 #include "hmmer.h"
23
24 // Define bit-vector constants used in the _Serialize() and _Deserialize routines
25 #define RFLINE_PRESENT (1 << 0)
26 #define MMLINE_PRESENT (1 << 1)
27 #define CSLINE_PRESENT (1 << 2)
28 #define PPLINE_PRESENT (1 << 3)
29 #define ASEQ_PRESENT (1 << 4)
30 #define NTSEQ_PRESENT (1 << 5)
31
32 /*****************************************************************
33 * 1. The P7_ALIDISPLAY object
34 *****************************************************************/
35
36 /* A P7_ALIDISPLAY structure originally had two possible states:
37 * "serialized", in which all of the variable-length fields of the structure
38 * were pointers into one block of allocated memory, which was pointed to by the
39 * structure's "mem" field, and
40 * "deserialized", in which each of the variable-length fields of the structure
41 * were separate regions of allocated memory. The intent of this was that
42 * "serialized" structures would be faster to create, as they only require one
43 * memory allocation, and easier to send over sockets, but that "deserialized"
44 * structures would be easier to modify and/or reuse. However, HMMER currently
45 * does not ever modify or reuse P7_ALIDISPLAY structures.
46 *
47 * P7_ALIDISPLAY structures are created in "serialized" mode. The only time they
48 * were ever converted into "deserialized" mode was in hmmpgmd2msa, and this was
49 * somewhat unnecessary.
50 *
51 * Now, we have added _Serialize() and _Deserialize() functions that have the more
52 * traditional behavior of converting P7_ALIDISPLAY structures to and from
53 * contiguous blocks of bytes for transmission over sockets. With this change,
54 * P7_ALIDISPLAY structures will always be in "serialized" mode, and will never be
55 * converted to their "deserialized" mode. (Yes, this terminology is confusing.)
56 *
57 * For H3, we will leave the existing code to handle the "serialized" and "deserialized"
58 * modes in place. HMMER's output code is being heavily revised for H4, and we will
59 * revisit how these modes are handled in H4 once that portion of the code stabilizes.
60 */
61
62 /* Function: p7_alidisplay_Create()
63 * Synopsis: Create an alignment display, from trace and oprofile.
64 *
65 * Purpose: Creates and returns an alignment display for domain number
66 * <which> in traceback <tr>, where the traceback
67 * corresponds to an alignment of optimized profile <om> to digital sequence
68 * <dsq>, and the unique name of that target
69 * sequence <dsq> is <sqname>. The <which> index starts at 0.
70 *
71 * It will be a little faster if the trace is indexed with
72 * <p7_trace_Index()> first. The number of domains is then
73 * in <tr->ndom>. If the caller wants to create alidisplays
74 * for all of these, it would loop <which> from
75 * <0..tr->ndom-1>.
76 *
77 * However, even without an index, the routine will work fine.
78 *
79 * Args: tr - traceback
80 * which - domain number, 0..tr->ndom-1
81 * om - optimized profile (query)
82 * sq - digital sequence (target)
83 * ntsq - text sequence (original nucleotide target in the case of translated search)
84 * ddef_app - optional posterior prob alignment line; only nhmmer sends a not-NULL value
85 *
86 * Returns: <eslOK> on success.
87 *
88 * Throws: <NULL> on allocation failure, or if something's internally corrupt
89 * in the data.
90 */
91 P7_ALIDISPLAY *
p7_alidisplay_Create(const P7_TRACE * tr,int which,const P7_OPROFILE * om,const ESL_SQ * sq,const ESL_SQ * ntsq)92 p7_alidisplay_Create(const P7_TRACE *tr, int which, const P7_OPROFILE *om, const ESL_SQ *sq, const ESL_SQ *ntsq)
93 {
94 P7_ALIDISPLAY *ad = NULL;
95 char *Alphabet = om->abc->sym;
96 int n, pos, z;
97 int z1,z2;
98 int k,x,i,s;
99 int hmm_namelen, hmm_acclen, hmm_desclen;
100 int sq_namelen, sq_acclen, sq_desclen;
101 int status;
102 ESL_SQ *ntorfseqtxt = NULL;
103
104 /* First figure out which piece of the trace (from first match to last match)
105 * we're going to represent, and how big it is.
106 */
107 if (tr->ndom > 0) { /* if we have an index, this is a little faster: */
108 for (z1 = tr->tfrom[which]; z1 < tr->N; z1++) if (tr->st[z1] == p7T_M) break; /* find next M state */
109 if (z1 == tr->N) return NULL; /* no M? corrupt trace */
110 for (z2 = tr->tto[which]; z2 >= 0 ; z2--) if (tr->st[z2] == p7T_M) break; /* find prev M state */
111 if (z2 == -1) return NULL; /* no M? corrupt trace */
112 } else { /* without an index, we can still do it fine: */
113 for (z1 = 0; which >= 0 && z1 < tr->N; z1++) if (tr->st[z1] == p7T_B) which--; /* find the right B state */
114 if (z1 == tr->N) return NULL; /* no such domain <which> */
115 for (; z1 < tr->N; z1++) if (tr->st[z1] == p7T_M) break; /* find next M state */
116 if (z1 == tr->N) return NULL; /* no M? corrupt trace */
117 for (z2 = z1; z2 < tr->N; z2++) if (tr->st[z2] == p7T_E) break; /* find the next E state */
118 for (; z2 >= 0; z2--) if (tr->st[z2] == p7T_M) break; /* find prev M state */
119 if (z2 == -1) return NULL; /* no M? corrupt trace */
120 }
121
122 /* Now we know that z1..z2 in the trace will be represented in the
123 * alidisplay; that's z2-z1+1 positions. We need a \0 trailer on all
124 * our display lines, so allocate z2-z1+2. We know each position is
125 * M, D, or I, so there's a 1:1 correspondence of trace positions
126 * with alignment display positions. We also know the display
127 * starts and ends with M states.
128 *
129 * So now let's allocate. The alidisplay is packed into a single
130 * memory space, so this appears to be intricate, but it's just
131 * bookkeeping.
132 */
133 n = (z2-z1+2) * 3; /* model, mline, aseq mandatory */
134
135 if (om->rf[0] != 0) n += z2-z1+2; /* optional reference line */
136 if (om->mm[0] != 0) n += z2-z1+2; /* optional reference line */
137 if (om->cs[0] != 0) n += z2-z1+2; /* optional structure line */
138 if (tr->pp != NULL) n += z2-z1+2; /* optional posterior prob line */
139 hmm_namelen = strlen(om->name); n += hmm_namelen + 1;
140 hmm_acclen = (om->acc != NULL ? strlen(om->acc) : 0); n += hmm_acclen + 1;
141 hmm_desclen = (om->desc != NULL ? strlen(om->desc) : 0); n += hmm_desclen + 1;
142
143 sq_namelen = strlen(sq->name); n += sq_namelen + 1;
144 sq_acclen = strlen(sq->acc); n += sq_acclen + 1; /* sq->acc is "\0" when unset */
145 sq_desclen = strlen(sq->desc); n += sq_desclen + 1; /* same for desc */
146
147 ESL_ALLOC(ad, sizeof(P7_ALIDISPLAY));
148 ad->mem = NULL;
149
150 pos = 0;
151 ad->memsize = sizeof(char) * n;
152 ESL_ALLOC(ad->mem, ad->memsize);
153 if (om->rf[0] != 0) { ad->rfline = ad->mem + pos; pos += z2-z1+2; } else { ad->rfline = NULL; }
154 //if (om->mm[0] != 0) { ad->mmline = ad->mem + pos; pos += z2-z1+2; } else { ad->mmline = NULL; }
155 ad->mmline = NULL;
156 if (om->cs[0] != 0) { ad->csline = ad->mem + pos; pos += z2-z1+2; } else { ad->csline = NULL; }
157 ad->model = ad->mem + pos; pos += z2-z1+2;
158 ad->mline = ad->mem + pos; pos += z2-z1+2;
159 ad->aseq = ad->mem + pos; pos += z2-z1+2;
160
161 if (tr->pp != NULL) { ad->ppline = ad->mem + pos; pos += z2-z1+2;} else { ad->ppline = NULL; }
162 ad->hmmname = ad->mem + pos; pos += hmm_namelen +1;
163 ad->hmmacc = ad->mem + pos; pos += hmm_acclen +1;
164 ad->hmmdesc = ad->mem + pos; pos += hmm_desclen +1;
165 ad->sqname = ad->mem + pos; pos += sq_namelen +1;
166 ad->sqacc = ad->mem + pos; pos += sq_acclen +1;
167 ad->sqdesc = ad->mem + pos; pos += sq_desclen +1;
168
169 strcpy(ad->hmmname, om->name);
170 if (om->acc != NULL) strcpy(ad->hmmacc, om->acc); else ad->hmmacc[0] = 0;
171 if (om->desc != NULL) strcpy(ad->hmmdesc, om->desc); else ad->hmmdesc[0] = 0;
172
173 strcpy(ad->sqname, sq->name);
174 strcpy(ad->sqacc, sq->acc);
175 strcpy(ad->sqdesc, sq->desc);
176
177 /* Determine hit coords */
178 ad->hmmfrom = tr->k[z1];
179 ad->hmmto = tr->k[z2];
180 ad->M = om->M;
181
182 ad->sqfrom = tr->i[z1];
183 ad->sqto = tr->i[z2];
184 ad->L = sq->n;
185
186 /* optional rf line */
187 if (ad->rfline != NULL) {
188 for (z = z1; z <= z2; z++) ad->rfline[z-z1] = ((tr->st[z] == p7T_I) ? '.' : om->rf[tr->k[z]]);
189 ad->rfline[z-z1] = '\0';
190 }
191
192 /* optional mm line */
193 if (ad->mmline != NULL) {
194 for (z = z1; z <= z2; z++) ad->mmline[z-z1] = ((tr->st[z] == p7T_I) ? '.' : om->mm[tr->k[z]]);
195 ad->mmline[z-z1] = '\0';
196 }
197
198 /* optional cs line */
199 if (ad->csline != NULL) {
200 for (z = z1; z <= z2; z++) ad->csline[z-z1] = ((tr->st[z] == p7T_I) ? '.' : om->cs[tr->k[z]]);
201 ad->csline[z-z1] = '\0';
202 }
203
204 /* optional pp line */
205 if (ad->ppline != NULL) {
206 for (z = z1; z <= z2; z++) ad->ppline[z-z1] = ( (tr->st[z] == p7T_D) ? '.' : p7_alidisplay_EncodePostProb(tr->pp[z]));
207 ad->ppline[z-z1] = '\0';
208 }
209
210
211 /* mandatory three alignment display lines: model, mline, aseq */
212 for (z = z1; z <= z2; z++)
213 {
214 k = tr->k[z];
215 i = tr->i[z];
216 x = sq->dsq[i];
217 s = tr->st[z];
218
219 switch (s) {
220 case p7T_M:
221 ad->model[z-z1] = om->consensus[k];
222 if (x == esl_abc_DigitizeSymbol(om->abc, om->consensus[k])) ad->mline[z-z1] = ad->model[z-z1];
223 else if (p7_oprofile_FGetEmission(om, k, x) > 1.0) ad->mline[z-z1] = '+'; /* >1 not >0; om has odds ratios, not scores */
224 else ad->mline[z-z1] = ' ';
225 ad->aseq [z-z1] = toupper(Alphabet[x]);
226 break;
227
228 case p7T_I:
229 ad->model [z-z1] = '.';
230 ad->mline [z-z1] = ' ';
231 ad->aseq [z-z1] = tolower(Alphabet[x]);
232 break;
233
234 case p7T_D:
235 ad->model [z-z1] = om->consensus[k];
236 ad->mline [z-z1] = ' ';
237 ad->aseq [z-z1] = '-';
238
239 break;
240
241 default: ESL_XEXCEPTION(eslEINVAL, "invalid state in trace: not M,D,I");
242 }
243 }
244 ad->model [z2-z1+1] = '\0';
245 ad->mline [z2-z1+1] = '\0';
246 ad->aseq [z2-z1+1] = '\0';
247 ad->N = z2-z1+1;
248 ad->ntseq = NULL; // Mark this NULL so it doesn't cause problems with serialization.
249 // nhmmer can reset it if it wants later
250 esl_sq_Destroy(ntorfseqtxt);
251
252
253 return ad;
254
255 ERROR:
256 p7_alidisplay_Destroy(ad);
257 return NULL;
258 }
259
260 /* Function: p7_alidisplay_Create_empty()
261 * Synopsis: Creates an empty P7_ALIDISPLAY object
262 *
263 * Purpose: Creates an empty P7_ALIDISPLAY object, one that does not contain an alignment to
264 * display but has been initialized to reasonable values (NULL for all strings, 0 for the amount
265 * of memory allocated in its buffer, etc.). This is mainly intended to be used to create a structure
266 * that a serialized P7_ALIDISPLAY object can be deserialized into.
267 *
268 * Returns: Pointer to the new <P7_ALIDISPLAY>
269 *
270 * Throws: Returns NULL if unable to allocate memory
271 */
p7_alidisplay_Create_empty()272 extern P7_ALIDISPLAY *p7_alidisplay_Create_empty()
273 {
274 P7_ALIDISPLAY *new_obj;
275 int status; // standard ESL_ALLOC error condition
276
277 ESL_ALLOC(new_obj, sizeof(P7_ALIDISPLAY));
278
279 // Init fields to sensible "empty" values
280 new_obj->rfline = NULL;
281 new_obj->mmline = NULL;
282 new_obj->csline = NULL;
283 new_obj->model = NULL;
284 new_obj->mline = NULL;
285 new_obj->aseq = NULL;
286 new_obj->ntseq = NULL;
287 new_obj->ppline = NULL;
288 new_obj->N = 0;
289
290 new_obj->hmmname = NULL;
291 new_obj->hmmacc = NULL;
292 new_obj->hmmdesc = NULL;
293 new_obj->hmmfrom = 0;
294 new_obj->hmmto = 0;
295 new_obj->M = 0;
296
297 new_obj->sqname = NULL;
298 new_obj->sqacc = NULL;
299 new_obj->sqdesc = NULL;
300 new_obj->sqfrom = 0;
301 new_obj->sqto = 0;
302 new_obj->L = 0;
303
304 new_obj->memsize = 0;
305 new_obj->mem = NULL;
306
307 return new_obj;
308
309 ERROR: // only get here if the ESL_ALLOC fails
310 return NULL;
311 }
312 /* Function: p7_alidisplay_Clone()
313 * Synopsis: Make a duplicate of an ALIDISPLAY.
314 *
315 * Purpose: Create a duplicate of alignment display <ad>.
316 * Return a pointer to the duplicate. Caller
317 * is responsible for freeing the new object.
318 *
319 * Returns: pointer to new <P7_ALIDISPLAY>
320 *
321 * Throws: <NULL> on allocation failure.
322 */
323 P7_ALIDISPLAY *
p7_alidisplay_Clone(const P7_ALIDISPLAY * ad)324 p7_alidisplay_Clone(const P7_ALIDISPLAY *ad)
325 {
326 P7_ALIDISPLAY *ad2 = NULL;
327 int status;
328
329 ESL_ALLOC(ad2, sizeof(P7_ALIDISPLAY));
330 ad2->rfline = ad2->mmline = ad2->csline = ad2->model = ad2->mline = ad2->aseq = ad2->ntseq = ad2->ppline = NULL;
331 ad2->hmmname = ad2->hmmacc = ad2->hmmdesc = NULL;
332 ad2->sqname = ad2->sqacc = ad2->sqdesc = NULL;
333 ad2->mem = NULL;
334 ad2->memsize = 0;
335
336 if (ad->memsize) /* serialized */
337 {
338 ESL_ALLOC(ad2->mem, sizeof(char) * ad->memsize);
339 ad2->memsize = ad->memsize;
340 memcpy(ad2->mem, ad->mem, ad->memsize);
341
342 ad2->rfline = (ad->rfline ? ad2->mem + (ad->rfline - ad->mem) : NULL );
343 ad2->mmline = (ad->mmline ? ad2->mem + (ad->mmline - ad->mem) : NULL );
344 ad2->csline = (ad->csline ? ad2->mem + (ad->csline - ad->mem) : NULL );
345 ad2->model = ad2->mem + (ad->model - ad->mem);
346 ad2->mline = ad2->mem + (ad->mline - ad->mem);
347 ad2->aseq = ad2->mem + (ad->aseq - ad->mem);
348 ad2->ntseq = (ad->ntseq ? ad2->mem + (ad->ntseq - ad->mem) : NULL );
349 ad2->ppline = (ad->ppline ? ad2->mem + (ad->ppline - ad->mem) : NULL );
350 ad2->N = ad->N;
351
352 ad2->hmmname = ad2->mem + (ad->hmmname - ad->mem);
353 ad2->hmmacc = ad2->mem + (ad->hmmacc - ad->mem);
354 ad2->hmmdesc = ad2->mem + (ad->hmmdesc - ad->mem);
355 ad2->hmmfrom = ad->hmmfrom;
356 ad2->hmmto = ad->hmmto;
357 ad2->M = ad->M;
358
359 ad2->sqname = ad2->mem + (ad->sqname - ad->mem);
360 ad2->sqacc = ad2->mem + (ad->sqacc - ad->mem);
361 ad2->sqdesc = ad2->mem + (ad->sqdesc - ad->mem);
362 ad2->sqfrom = ad->sqfrom;
363 ad2->sqto = ad->sqto;
364 ad2->L = ad->L;
365 }
366 else /* deserialized */
367 {
368 if ( esl_strdup(ad->rfline, -1, &(ad2->rfline)) != eslOK) goto ERROR;
369 if ( esl_strdup(ad->mmline, -1, &(ad2->mmline)) != eslOK) goto ERROR;
370 if ( esl_strdup(ad->csline, -1, &(ad2->csline)) != eslOK) goto ERROR;
371 if ( esl_strdup(ad->model, -1, &(ad2->model)) != eslOK) goto ERROR;
372 if ( esl_strdup(ad->mline, -1, &(ad2->mline)) != eslOK) goto ERROR;
373 if ( esl_strdup(ad->aseq, -1, &(ad2->aseq)) != eslOK) goto ERROR;
374 if ( esl_strdup(ad->ntseq, -1, &(ad2->ntseq)) != eslOK) goto ERROR;
375 if ( esl_strdup(ad->ppline, -1, &(ad2->ppline)) != eslOK) goto ERROR;
376 ad2->N = ad->N;
377
378 if ( esl_strdup(ad->hmmname, -1, &(ad2->hmmname)) != eslOK) goto ERROR;
379 if ( esl_strdup(ad->hmmacc, -1, &(ad2->hmmacc)) != eslOK) goto ERROR;
380 if ( esl_strdup(ad->hmmdesc, -1, &(ad2->hmmdesc)) != eslOK) goto ERROR;
381 ad2->hmmfrom = ad->hmmfrom;
382 ad2->hmmto = ad->hmmto;
383 ad2->M = ad->M;
384
385 if ( esl_strdup(ad->sqname, -1, &(ad2->sqname)) != eslOK) goto ERROR;
386 if ( esl_strdup(ad->sqacc, -1, &(ad2->sqacc)) != eslOK) goto ERROR;
387 if ( esl_strdup(ad->sqdesc, -1, &(ad2->sqdesc)) != eslOK) goto ERROR;
388 ad2->sqfrom = ad->sqfrom;
389 ad2->sqto = ad->sqto;
390 ad2->L = ad->L;
391 }
392
393 return ad2;
394
395 ERROR:
396 if (ad2) p7_alidisplay_Destroy(ad2);
397 return NULL;
398 }
399
400
401 /* Function: p7_alidisplay_Sizeof()
402 * Synopsis: Returns the total size of a P7_ALIDISPLAY, in bytes.
403 *
404 * Purpose: Return the total size of <P7_ALIDISPLAY> <ad>, in bytes.
405 *
406 * Note that <ad->memsize = p7_alidisplay_Sizeof(ad) - sizeof(P7_ALIDISPLAY)>,
407 * for a serialized object, because <ad->memsize> only refers to the sum
408 * of the variable-length allocated fields.
409 *
410 * Args: ad - P7_ALIDISPLAY to get the size of
411 *
412 * Returns: size of <ad> in bytes
413 */
414 size_t
p7_alidisplay_Sizeof(const P7_ALIDISPLAY * ad)415 p7_alidisplay_Sizeof(const P7_ALIDISPLAY *ad)
416 {
417 size_t n = sizeof(P7_ALIDISPLAY);
418
419 if (ad->rfline) n += ad->N+1; /* +1 for \0 */
420 if (ad->mmline) n += ad->N+1;
421 if (ad->csline) n += ad->N+1;
422 if (ad->ppline) n += ad->N+1;
423 n += 3 * (ad->N+1); /* model, mline, aseq */
424 if (ad->ntseq) n += (3 * ad->N) + 1; /* ntseq */
425 n += 1 + strlen(ad->hmmname);
426 n += 1 + strlen(ad->hmmacc); /* optional acc, desc fields: when not present, just "" ("\0") */
427 n += 1 + strlen(ad->hmmdesc);
428 n += 1 + strlen(ad->sqname);
429 n += 1 + strlen(ad->sqacc);
430 n += 1 + strlen(ad->sqdesc);
431
432 return n;
433 }
434
435
436 #define SER_BASE_SIZE ((5 * sizeof(int)) + (3 * sizeof(int64_t)) +1) // Total size of the fixed-length fields in a
437 // serialized P7_ALIDISPLAY
438
439 /* Function: p7_alidisplay_Serialize
440 * Synopsis: Serializes a HMMD_SEARCH_STATS object into a stream of bytes
441 *. that can be reliably transmitted over internet sockets
442 *
443 * Purpose: Converts an architecture-dependent P7_SEARCH_STATS object into a contiguous stream
444 * of bytes with each field of the data structure in network byte order for transmission
445 * over sockets. The serialized byte stream may be part of a larger allocated buffer.
446 * If the provided buffer is NULL, allocates a new buffer large enough for the serialized object
447 * If the provided buffer is not large enough to hold the serialized object and its existing data, re-allocates
448 * a larger buffer
449 *
450 * Inputs: obj: A pointer to the HMMD_SEARCH_STATS object to be serialized
451 * buf: Handle to the buffer that the object should be serialized into. If *buf is NULL,
452 * a new buffer will be allocated. buf == NULL is not allowed.
453 * n: Offset (in bytes) from the start of the buffer to where the serialized object should start.
454 * nalloc: size (in bytes) of the buffer passed in buf
455 *
456 *Returns: On success: returns eslOK, sets *buf to the base of the buffer containing the object
457 * if allocation or re-allocation was requried, sets *n to the offset from the start of the buffer
458 * to the first position after the serialized object and sets *nalloc to the new size of the buffer
459 * if allocation or re-allocation was required.
460 *
461 * Throws: Returns eslEMEM if unable to allocate or re-allocate memory. Returns eslEINVAL if obj == NULL, n == NULL, or buf == NULL
462 */
p7_alidisplay_Serialize(const P7_ALIDISPLAY * obj,uint8_t ** buf,uint32_t * n,uint32_t * nalloc)463 int p7_alidisplay_Serialize(const P7_ALIDISPLAY *obj, uint8_t **buf, uint32_t *n, uint32_t *nalloc){
464
465 int status; // error variable used by ESL_ALLOC
466 uint32_t ser_size; // size of the structure when serialized
467 uint8_t *ptr; // current position within the buffer
468 uint32_t network_32bit; // hold 32-bit fields after conversion to network order
469 uint64_t network_64bit; // hold 64-bit fields after conversion to network order
470 uint8_t presence_flags = 0; // Bit-vector that records presence or absence of optional strings
471 uint32_t hmmname_length, hmmacc_length, hmmdesc_length, sqname_length, sqacc_length, sqdesc_length;
472
473 // check to make sure we were passed a valid pointer
474 if(obj == NULL || buf == NULL || n == NULL){ // no object to serialize or nowhere to put a buffer pointer
475 return(eslEINVAL);
476 }
477
478 // Pass 1: Compute size of the serialized data structure
479 /* 11 ints: 4 int fields in P7_ALIDISPLAY + lengths of 6 variable-length strings + total length of serialized structure
480 3 int64_t fields in P7_ALIDISPLAY
481 1 byte for presence/absence bit vector for rfline, mmline, csline, ppline, aseq, ntseq
482 */
483 ser_size = SER_BASE_SIZE;
484
485 // This method will work regardless of whether the object is in "serialized" or "deserialized" format
486 // it's also blatantly stolen from the _Sizeof() routine.
487 // Note that we can't just call _Sizeof() to figure out how big the serialized data structure will be because
488 // the serialized structure has some fields that differ from the base P7_ALIDISPLAY structure
489
490 if (obj->rfline){
491 presence_flags += RFLINE_PRESENT;
492 ser_size += obj->N+1; /* +1 for \0 */
493 }
494
495 if (obj->mmline) {
496 presence_flags += MMLINE_PRESENT;
497 ser_size += obj->N+1;
498 }
499
500 if (obj->csline){
501 presence_flags += CSLINE_PRESENT;
502 ser_size += obj->N+1;
503 }
504
505 ser_size += 2 * (obj->N+1); /* model, mline */
506
507 if (obj->aseq){
508 presence_flags += ASEQ_PRESENT;
509 ser_size += obj->N+1;
510 }
511
512 if (obj->ntseq){
513 presence_flags += NTSEQ_PRESENT;
514 ser_size += (3 * obj->N) + 1; /* ntseq */
515 }
516
517 if (obj->ppline){
518 presence_flags += PPLINE_PRESENT;
519 ser_size += obj->N+1;
520 }
521
522 hmmname_length = strlen(obj->hmmname);
523 ser_size += 1 + hmmname_length;
524
525 hmmacc_length = strlen(obj->hmmacc);
526 ser_size += 1 + hmmacc_length; /* optional acc, desc fields: when not present, just "" ("\0") */
527
528 hmmdesc_length = strlen(obj->hmmdesc);
529 ser_size += 1 + hmmdesc_length;
530
531 sqname_length = strlen(obj->sqname);
532 ser_size += 1 + sqname_length;
533
534 sqacc_length = strlen(obj->sqacc);
535 ser_size += 1 + sqacc_length;
536
537 sqdesc_length = strlen(obj->sqdesc);
538 ser_size += 1 + sqdesc_length;
539
540 // Now that we know how big the serialized data structure will be, determine if we have enough buffer space to hold it
541 if(*buf == NULL){ // have no buffer, so allocate one
542 ESL_ALLOC(*buf, ser_size);
543 *nalloc = ser_size;
544 }
545
546 if((*n + ser_size) > *nalloc){ //have a buffer, but it's not big enough
547 ESL_REALLOC(*buf, (*n + ser_size));
548 *nalloc = *n + ser_size;
549 }
550
551 // Pass 2: serialize the structure
552
553 //First, the fixed-length field
554 ptr = *buf + *n; // point to the start of empty space in the buffer
555
556 network_32bit = esl_hton32(ser_size); // first field in the serialized object is its size
557 memcpy(ptr, &network_32bit, sizeof(uint32_t)); // Write size of the serialized object into the buffer
558 ptr += sizeof(uint32_t);
559
560 // Field 2: N
561 network_32bit = esl_hton32(obj->N);
562 memcpy(ptr, &network_32bit, sizeof(uint32_t));
563 ptr += sizeof(uint32_t);
564
565 // Field 3: Hmmfrom field
566 network_32bit = esl_hton32(obj->hmmfrom);
567 memcpy(ptr, &network_32bit, sizeof(uint32_t));
568 ptr += sizeof(uint32_t);
569
570 // Field 4: Hmmto field
571 network_32bit = esl_hton32(obj->hmmto);
572 memcpy(ptr, &network_32bit, sizeof(uint32_t));
573 ptr += sizeof(uint32_t);
574
575 // Field 5: M field
576 network_32bit = esl_hton32(obj->M);
577 memcpy(ptr, &network_32bit, sizeof(uint32_t));
578 ptr += sizeof(uint32_t);
579
580 // Field 6: Sqfrom
581 network_64bit = esl_hton64(obj->sqfrom);
582 memcpy(ptr, &network_64bit, sizeof(int64_t));
583 ptr += sizeof(int64_t);
584
585 // Field 7: Sqto
586 network_64bit = esl_hton64(obj->sqto);
587 memcpy(ptr, &network_64bit, sizeof(int64_t));
588 ptr += sizeof(int64_t);
589
590 // Field 8: L
591 network_64bit = esl_hton64(obj->L);
592 memcpy(ptr, &network_64bit, sizeof(int64_t));
593 ptr += sizeof(int64_t);
594
595 // Field 9: presence_flags
596 memcpy(ptr, &presence_flags, sizeof(uint8_t));
597 ptr += sizeof(uint8_t);
598
599 //Now, the strings, some of which are optional
600 // Note that many of these strings are fixed-length if they are present
601
602 // Field 10: Rfline
603 if(presence_flags & RFLINE_PRESENT){
604 strcpy((char *) ptr, obj->rfline);
605 ptr += obj->N + 1;
606 }
607
608 // Field 11: mmline
609 if(presence_flags & MMLINE_PRESENT){
610 strcpy((char *) ptr, obj->mmline);
611 ptr += obj->N + 1;
612 }
613
614 // Field 12: csline
615 if(presence_flags & CSLINE_PRESENT){
616 strcpy((char *) ptr, obj->csline);
617 ptr += obj->N + 1;
618 }
619
620 // Field 13: Model
621 strcpy((char *) ptr, obj->model);
622 ptr += obj->N + 1;
623
624 // Field 14: Mline
625 strcpy((char *) ptr, obj->mline);
626 ptr += obj->N + 1;
627
628 // Field 15: Aseq
629 if(presence_flags & ASEQ_PRESENT){
630 strcpy((char *) ptr, obj->aseq);
631 ptr += obj->N + 1;
632 }
633
634 // Field 16: ntseq
635 if(presence_flags & NTSEQ_PRESENT){
636 strcpy((char *) ptr, obj->ntseq);
637 ptr += (3 * obj->N) + 1;
638 }
639
640 // Field 17: PPline
641 if(presence_flags & PPLINE_PRESENT){
642 strcpy((char *) ptr, obj->ppline);
643 ptr += obj->N + 1;
644 }
645
646 // Field 18: Hmmname
647 strcpy((char *) ptr, obj->hmmname);
648 ptr += hmmname_length + 1;
649
650 // Field 19: Hmmacc
651 strcpy((char *) ptr, obj->hmmacc);
652 ptr += hmmacc_length + 1;
653
654 // Field 20: Hmmdesc
655 strcpy((char *) ptr, obj->hmmdesc);
656 ptr += hmmdesc_length + 1;
657
658 // Field 21: Sqname
659 strcpy((char *) ptr, obj->sqname);
660 ptr += sqname_length +1;
661
662 // Field 22: Sqacc
663 strcpy((char *) ptr, obj->sqacc);
664 ptr += sqacc_length +1 ;
665
666 // Field 23: Sqdesc
667 strcpy((char *) ptr, obj->sqdesc);
668 ptr += sqdesc_length +1 ;
669
670 // sanity-check that we computed the length correctly
671 if(ptr != *buf + *n + ser_size){
672 printf("Serialized object length did not match computed length in p7_alidisplay_Serialize\n");
673 return eslEINVAL;
674 }
675
676 *n = ptr - *buf; // update n to be just past the serialized object
677 return eslOK; // If we make it here, everything succeeded, so return a pass
678
679 ERROR:
680 return eslEMEM;
681 }
682
683 /* Function: p7_alidisplay_Deserialize
684 * Synopsis: Derializes a P7_ALIDISPLAY object from a stream of bytes in network order into
685 * a valid data structure
686 *
687 * Purpose: Deserializes a serialized P7_ALIDISPLAY object from
688 *. buf starting at position position *pos.
689 *
690 * Inputs: buf: the buffer that the object should be de-serialized from
691 * pos: a pointer to the offset from the start of buf to the beginning of the object
692 * ret_obj: a P7_ALIDISPLAY structure to deserialize the object into. May not be NULL. May either be an
693 * "empty" object created with p7_alidisplay_Create_empty, or a P7_ALIDISPLAY object containing valid data
694 *
695 * Returns: On success: returns eslOK, deserializes the P7_ALIDISPLAY object into ret_object, and updates
696 *. pos to point to the position after the end of the P7_ALIDISPLAY object.
697 *
698 * Throws: Returns eslEINVAL if ret_obj == NULL, buf == NULL, or N == NULL. Returnts eslEMEM if unable to increase
699 * the buffer in ret_obj to match the size of the deserialized object.
700 */
p7_alidisplay_Deserialize(const uint8_t * buf,uint32_t * n,P7_ALIDISPLAY * ret_obj)701 extern int p7_alidisplay_Deserialize(const uint8_t *buf, uint32_t *n, P7_ALIDISPLAY *ret_obj){
702 int status; // Standard Easel error code variable
703
704 uint8_t *ptr;
705 char *mem_ptr;
706 uint64_t network_64bit; // holds 64-bit values in network order
707 uint32_t network_32bit; // holds 64-bit values in network order
708 uint32_t obj_size; // How much space does the variable-length portion of the serialized object take up?
709 uint8_t presence_flags; // bit-vector that tells us which strings are present in the object
710 int string_length; // used to hold the length of strings copied out of serialized object
711
712 if ((buf == NULL) || (ret_obj == NULL) || (n == NULL)){ // check to make sure we've been passed valid objects
713 return(eslEINVAL);
714 }
715
716 mem_ptr = NULL; // set this for safety
717 ptr = (uint8_t *) buf + *n; // Get pointer to start of the object
718 //First field: Size of the serialized object. Copy out of buffer into scalar variable to deal with memory alignment, convert to
719 // host machine order
720 memcpy(&network_32bit, ptr, sizeof(uint32_t)); // Grab the bytes out of the buffer
721 obj_size = esl_ntoh32(network_32bit);
722 ptr += sizeof(uint32_t);
723
724 if(ret_obj->memsize < (obj_size - SER_BASE_SIZE)){ // ret_obj doesn't have enough space for this P7_ALIDISPLAY
725 if(ret_obj->mem != NULL){
726 ESL_REALLOC(ret_obj->mem, (obj_size - SER_BASE_SIZE));
727 }
728 else{
729 ESL_ALLOC(ret_obj->mem, (obj_size - SER_BASE_SIZE));
730 }
731 ret_obj->memsize = obj_size - SER_BASE_SIZE;
732 }
733
734 // Second field: N
735 memcpy(&network_32bit, ptr, sizeof(uint32_t)); // Grab the bytes out of the buffer
736 ret_obj->N = esl_ntoh32(network_32bit);
737 ptr += sizeof(uint32_t);
738
739 // Third field: Hmmfrom
740 memcpy(&network_32bit, ptr, sizeof(uint32_t));
741 ret_obj->hmmfrom = esl_ntoh32(network_32bit);
742 ptr += sizeof(uint32_t);
743
744 // Fourth field: Hmmto
745 memcpy(&network_32bit, ptr, sizeof(uint32_t));
746 ret_obj->hmmto = esl_ntoh32(network_32bit);
747 ptr += sizeof(uint32_t);
748
749 // Fifth field: M
750 memcpy(&network_32bit, ptr, sizeof(uint32_t));
751 ret_obj->M = esl_ntoh32(network_32bit);
752 ptr += sizeof(uint32_t);
753
754 // Sixth field: sqfrom
755 memcpy(&network_64bit, ptr, sizeof(uint64_t));
756 ret_obj->sqfrom = esl_ntoh64(network_64bit);
757 ptr += sizeof(uint64_t);
758
759 // Seventh field: sqto
760 memcpy(&network_64bit, ptr, sizeof(uint64_t));
761 ret_obj->sqto = esl_ntoh64(network_64bit);
762 ptr += sizeof(uint64_t);
763
764 // Eighth field: L
765 memcpy(&network_64bit, ptr, sizeof(uint64_t));
766 ret_obj->L = esl_ntoh64(network_64bit);
767 ptr += sizeof(uint64_t);
768
769 // Ninth field: presence flags
770 presence_flags = *ptr; // no need for memcpy with one-byte field
771 ptr += sizeof(uint8_t);
772
773 // Bulk copy the strings into the alidisplay's mem field
774 memcpy(ret_obj->mem, ptr, (obj_size - SER_BASE_SIZE));
775 mem_ptr = ret_obj->mem;
776 ptr += (obj_size - SER_BASE_SIZE);
777 // Tenth field: rfline, if present
778
779 if(ptr != buf + *n + obj_size){
780 printf("Error: in p7_alidisplay_Deserialize, found object (ptr) to be of size %ld, expected %u.\n", (long int) (ptr - (buf + *n)), obj_size);
781 //return eslEINVAL;
782 }
783
784 if(presence_flags & RFLINE_PRESENT){
785 ret_obj->rfline = mem_ptr;
786 string_length = strlen(ret_obj->rfline);
787 mem_ptr+= string_length +1; // + 1 to account for end-of-string character
788 }
789 else{ // not present
790 ret_obj->rfline = NULL;
791 }
792
793 // Eleventh field: mmline, if present
794 if(presence_flags & MMLINE_PRESENT){
795 ret_obj->mmline = mem_ptr;
796 string_length = strlen(ret_obj->mmline);
797 mem_ptr+= string_length + 1;
798 }
799 else{ // not present
800 ret_obj->mmline = NULL;
801 }
802
803 // Twelfth field: csline, if present
804 if(presence_flags & CSLINE_PRESENT){
805 ret_obj->csline = mem_ptr;
806 string_length = strlen(ret_obj->csline);
807 mem_ptr+= string_length + 1;
808 }
809 else{ // not present
810 ret_obj->csline = NULL;
811 }
812
813
814 // Thirteenth field: model
815 ret_obj->model = mem_ptr;
816 string_length = strlen(ret_obj->model);
817 mem_ptr+= string_length + 1;
818
819 // Thirteenth field: mline
820 ret_obj->mline = mem_ptr;
821 string_length = strlen(ret_obj->mline);
822 mem_ptr+= string_length + 1;
823
824 // Fourteenth field: aseq, if present
825 if(presence_flags & ASEQ_PRESENT){
826 ret_obj->aseq = mem_ptr;
827 string_length = strlen(ret_obj->aseq);
828 mem_ptr+= string_length + 1;
829 }
830 else{ // not present
831 ret_obj->aseq = NULL;
832 }
833
834 // Fifteenth field: ntseq, if present
835 if(presence_flags & NTSEQ_PRESENT){
836 ret_obj->ntseq = mem_ptr;
837 string_length = strlen(ret_obj->ntseq);
838 mem_ptr+= string_length + 1;
839 }
840 else{ // not present
841 ret_obj->ntseq = NULL;
842 }
843
844 // Sixteenth field: ppline, if present
845 if(presence_flags & PPLINE_PRESENT){
846 ret_obj->ppline = mem_ptr;
847 string_length = strlen(ret_obj->ppline);
848 mem_ptr+= string_length + 1;
849 }
850 else{ // not present
851 ret_obj->ppline = NULL;
852 }
853
854 // Seventeenth field: hmmname
855 ret_obj->hmmname = mem_ptr;
856 string_length = strlen(ret_obj->hmmname);
857 mem_ptr+= string_length + 1;
858
859 // Eighteenth field: hmmacc
860 ret_obj->hmmacc = mem_ptr;
861 string_length = strlen(ret_obj->hmmacc);
862 mem_ptr+= string_length + 1;
863
864 // Nineteenth field: hmmdesc
865 ret_obj->hmmdesc = mem_ptr;
866 string_length = strlen(ret_obj->hmmdesc);
867 mem_ptr+= string_length + 1;
868
869 // Twentyith field: sqname
870 ret_obj->sqname = mem_ptr;
871 string_length = strlen(ret_obj->sqname);
872 mem_ptr+= string_length + 1;
873
874 // Twentyfirst field: sqacc
875 ret_obj->sqacc = mem_ptr;
876 string_length = strlen(ret_obj->sqacc);
877 mem_ptr+= string_length + 1;
878
879 // Twentysecond field: sqdesc
880 ret_obj->sqdesc = mem_ptr;
881 string_length = strlen(ret_obj->sqdesc);
882 mem_ptr+= string_length +1;
883
884 // Sanity-check that we got the length right
885 if(mem_ptr - ret_obj->mem != (obj_size - SER_BASE_SIZE)){
886 printf("Error: at end of p7_alidisplay_Deserialize, found strings to be of size %ld, expected %ld.\n", (long)(mem_ptr - ret_obj->mem), (long)(obj_size - SER_BASE_SIZE));
887 return eslEINVAL;
888 }
889 *n += obj_size;
890 return eslOK; // as usual, if we get to the end of the routine without failing, return success
891
892 ERROR: // only get here if we can't allocate memory
893 return eslEMEM;
894 }
895
896 /* Function: p7_alidisplay_Serialize()
897 * Synopsis: Serialize a P7_ALIDISPLAY, using internal memory.
898 *
899 * Purpose: Serialize the <P7_ALIDISPLAY> <ad>, internally converting
900 * all its variable-length allocations to a single
901 * contiguous memory allocation. Serialization aids
902 * interprocess communication.
903 *
904 * If <ad> is already serialized, do nothing.
905 *
906 * Args: ad - alidisplay to serialize
907 *
908 * Returns: <eslOK> on success.
909 *
910 * Throws: <eslEMEM> on allocation failure, and <ad> is restored to
911 * its original (deserialized) state.
912 */
913 int
p7_alidisplay_Serialize_old(P7_ALIDISPLAY * ad)914 p7_alidisplay_Serialize_old(P7_ALIDISPLAY *ad)
915 {
916 int pos;
917 int n;
918 int status;
919
920 if (ad->mem) return eslOK; /* already serialized, so no-op */
921 ad->memsize = p7_alidisplay_Sizeof(ad) - sizeof(P7_ALIDISPLAY);
922 ESL_ALLOC(ad->mem, ad->memsize);
923
924 /* allow no exceptions past this point, because API guarantees restore of original state upon error */
925
926 pos = 0;
927 if (ad->rfline) { memcpy(ad->mem+pos, ad->rfline, ad->N+1); free(ad->rfline); ad->rfline = ad->mem+pos; pos += ad->N+1; }
928 if (ad->mmline) { memcpy(ad->mem+pos, ad->mmline, ad->N+1); free(ad->mmline); ad->mmline = ad->mem+pos; pos += ad->N+1; }
929 if (ad->csline) { memcpy(ad->mem+pos, ad->csline, ad->N+1); free(ad->csline); ad->csline = ad->mem+pos; pos += ad->N+1; }
930 memcpy(ad->mem+pos, ad->model, ad->N+1); free(ad->model); ad->model = ad->mem+pos; pos += ad->N+1;
931 memcpy(ad->mem+pos, ad->mline, ad->N+1); free(ad->mline); ad->mline = ad->mem+pos; pos += ad->N+1;
932 memcpy(ad->mem+pos, ad->aseq, ad->N+1); free(ad->aseq); ad->aseq = ad->mem+pos; pos += ad->N+1;
933 if (ad->ntseq) { memcpy(ad->mem+pos, ad->ntseq, (3*ad->N)+1); free(ad->ntseq); ad->ntseq = ad->mem+pos; pos += (3*ad->N)+1; }
934 if (ad->ppline) { memcpy(ad->mem+pos, ad->ppline, ad->N+1); free(ad->ppline); ad->ppline = ad->mem+pos; pos += ad->N+1; }
935 n = 1 + strlen(ad->hmmname); memcpy(ad->mem + pos, ad->hmmname, n); free(ad->hmmname); ad->hmmname = ad->mem+pos; pos += n;
936 n = 1 + strlen(ad->hmmacc); memcpy(ad->mem + pos, ad->hmmacc, n); free(ad->hmmacc); ad->hmmacc = ad->mem+pos; pos += n;
937 n = 1 + strlen(ad->hmmdesc); memcpy(ad->mem + pos, ad->hmmdesc, n); free(ad->hmmdesc); ad->hmmdesc = ad->mem+pos; pos += n;
938 n = 1 + strlen(ad->sqname); memcpy(ad->mem + pos, ad->sqname, n); free(ad->sqname); ad->sqname = ad->mem+pos; pos += n;
939 n = 1 + strlen(ad->sqacc); memcpy(ad->mem + pos, ad->sqacc, n); free(ad->sqacc); ad->sqacc = ad->mem+pos; pos += n;
940 n = 1 + strlen(ad->sqdesc); memcpy(ad->mem + pos, ad->sqdesc, n); free(ad->sqdesc); ad->sqdesc = ad->mem+pos; pos += n;
941
942 return eslOK;
943
944 ERROR:
945 if (ad->mem) { free(ad->mem); ad->mem = NULL; }
946 return status;
947 }
948
949 /* Function: p7_alidisplay_Deserialize()
950 * Synopsis: Deserialize a P7_ALIDISPLAY, using internal memory.
951 *
952 * Purpose: Deserialize the <P7_ALIDISPLAY> <ad>, converting its internal
953 * allocations from a single contiguous memory chunk to individual
954 * variable-length allocations. Deserialization facilitates
955 * reallocation/editing of individual elements of the display.
956 *
957 * If <ad> is already deserialized, do nothing.
958 *
959 * Args: ad - alidisplay to serialize
960 *
961 * Returns: <eslOK> on success
962 *
963 * Throws: <eslEMEM> on allocation failure, and <ad> is restored to
964 * its original (serialized) state.
965 */
966 int
p7_alidisplay_Deserialize_old(P7_ALIDISPLAY * ad)967 p7_alidisplay_Deserialize_old(P7_ALIDISPLAY *ad)
968 {
969 int pos;
970 int n;
971 int status;
972
973 if (ad->mem == NULL) return eslOK; /* already deserialized, so no-op */
974
975 pos = 0;
976 if (ad->rfline) { ESL_ALLOC(ad->rfline, sizeof(char) * ad->N+1); memcpy(ad->rfline, ad->mem+pos, ad->N+1); pos += ad->N+1; }
977 if (ad->mmline) { ESL_ALLOC(ad->mmline, sizeof(char) * ad->N+1); memcpy(ad->mmline, ad->mem+pos, ad->N+1); pos += ad->N+1; }
978 if (ad->csline) { ESL_ALLOC(ad->csline, sizeof(char) * ad->N+1); memcpy(ad->csline, ad->mem+pos, ad->N+1); pos += ad->N+1; }
979 ESL_ALLOC(ad->model, sizeof(char) * ad->N+1); memcpy(ad->model, ad->mem+pos, ad->N+1); pos += ad->N+1;
980 ESL_ALLOC(ad->mline, sizeof(char) * ad->N+1); memcpy(ad->mline, ad->mem+pos, ad->N+1); pos += ad->N+1;
981 ESL_ALLOC(ad->aseq, sizeof(char) * ad->N+1); memcpy(ad->aseq, ad->mem+pos, ad->N+1); pos += ad->N+1;
982 if (ad->ntseq) { ESL_ALLOC(ad->ntseq, sizeof(char) * (3*ad->N)+1); memcpy(ad->ntseq, ad->mem+pos, (3*ad->N)+1); pos += (3*ad->N)+1; }
983 if (ad->ppline) { ESL_ALLOC(ad->ppline, sizeof(char) * ad->N+1); memcpy(ad->ppline, ad->mem+pos, ad->N+1); pos += ad->N+1; }
984 n = 1 + strlen(ad->mem+pos); ESL_ALLOC(ad->hmmname, sizeof(char) * n); memcpy(ad->hmmname, ad->mem+pos, n); pos += n;
985 n = 1 + strlen(ad->mem+pos); ESL_ALLOC(ad->hmmacc, sizeof(char) * n); memcpy(ad->hmmacc, ad->mem+pos, n); pos += n;
986 n = 1 + strlen(ad->mem+pos); ESL_ALLOC(ad->hmmdesc, sizeof(char) * n); memcpy(ad->hmmdesc, ad->mem+pos, n); pos += n;
987 n = 1 + strlen(ad->mem+pos); ESL_ALLOC(ad->sqname, sizeof(char) * n); memcpy(ad->sqname, ad->mem+pos, n); pos += n;
988 n = 1 + strlen(ad->mem+pos); ESL_ALLOC(ad->sqacc, sizeof(char) * n); memcpy(ad->sqacc, ad->mem+pos, n); pos += n;
989 n = 1 + strlen(ad->mem+pos); ESL_ALLOC(ad->sqdesc, sizeof(char) * n); memcpy(ad->sqdesc, ad->mem+pos, n); pos += n;
990
991 free(ad->mem);
992 ad->mem = NULL;
993 ad->memsize = 0;
994 return eslOK;
995
996 ERROR:
997 /* restore serialized state, if an alloc fails. tedious, if not nontrivial. */
998 /* the pointers are non-NULL whether we just allocated them or if they're pointing into mem, so we have to check against mem+pos */
999 pos = 0;
1000 if (ad->rfline) { if (ad->rfline != ad->mem+pos) { free(ad->rfline); ad->rfline = ad->mem+pos; } pos += ad->N+1; }
1001 if (ad->mmline) { if (ad->mmline != ad->mem+pos) { free(ad->mmline); ad->mmline = ad->mem+pos; } pos += ad->N+1; }
1002 if (ad->csline) { if (ad->csline != ad->mem+pos) { free(ad->csline); ad->csline = ad->mem+pos; } pos += ad->N+1; }
1003 if (ad->model != ad->mem+pos) { free(ad->model); ad->model = ad->mem+pos; } pos += ad->N+1;
1004 if (ad->mline != ad->mem+pos) { free(ad->mline); ad->mline = ad->mem+pos; } pos += ad->N+1;
1005 if (ad->aseq != ad->mem+pos) { free(ad->aseq); ad->aseq = ad->mem+pos; } pos += ad->N+1;
1006 if (ad->ntseq) { if (ad->ntseq != ad->mem+pos) { free(ad->ntseq); ad->ntseq = ad->mem+pos; } pos += (3*ad->N)+1; }
1007 if (ad->ppline) { if (ad->ppline != ad->mem+pos) { free(ad->ppline); ad->ppline = ad->mem+pos; } pos += ad->N+1; }
1008
1009 n = 1 + strlen(ad->hmmname); if (ad->hmmname != ad->mem+pos) { free(ad->hmmname); ad->hmmname = ad->mem+pos; } pos += n;
1010 n = 1 + strlen(ad->hmmacc); if (ad->hmmacc != ad->mem+pos) { free(ad->hmmacc); ad->hmmacc = ad->mem+pos; } pos += n;
1011 n = 1 + strlen(ad->hmmname); if (ad->hmmdesc != ad->mem+pos) { free(ad->hmmdesc); ad->hmmdesc = ad->mem+pos; } pos += n;
1012 n = 1 + strlen(ad->sqname); if (ad->sqname != ad->mem+pos) { free(ad->sqname); ad->sqname = ad->mem+pos; } pos += n;
1013 n = 1 + strlen(ad->sqacc); if (ad->sqacc != ad->mem+pos) { free(ad->sqacc); ad->sqacc = ad->mem+pos; } pos += n;
1014 n = 1 + strlen(ad->sqname); if (ad->sqdesc != ad->mem+pos) { free(ad->sqdesc); ad->sqdesc = ad->mem+pos; } pos += n;
1015 return status;
1016 }
1017
1018
1019 /* Function: p7_alidisplay_Destroy()
1020 * Synopsis: Frees a <P7_ALIDISPLAY>
1021 */
1022 void
p7_alidisplay_Destroy(P7_ALIDISPLAY * ad)1023 p7_alidisplay_Destroy(P7_ALIDISPLAY *ad)
1024 {
1025 if (ad == NULL) return;
1026 if (ad->mem)
1027 { /* serialized form */
1028 free(ad->mem);
1029 }
1030 else
1031 { /* deserialized form */
1032 if (ad->rfline) free(ad->rfline);
1033 if (ad->mmline) free(ad->mmline);
1034 if (ad->csline) free(ad->csline);
1035 if (ad->model) free(ad->model);
1036 if (ad->mline) free(ad->mline);
1037 if (ad->aseq) free(ad->aseq);
1038 if (ad->ntseq) free(ad->ntseq);
1039 if (ad->ppline) free(ad->ppline);
1040 if (ad->hmmname) free(ad->hmmname);
1041 if (ad->hmmacc) free(ad->hmmacc);
1042 if (ad->hmmdesc) free(ad->hmmdesc);
1043 if (ad->sqname) free(ad->sqname);
1044 if (ad->sqacc) free(ad->sqacc);
1045 if (ad->sqdesc) free(ad->sqdesc);
1046 }
1047 free(ad);
1048 }
1049 /*---------------- end, alidisplay object -----------------------*/
1050
1051
1052
1053 /*****************************************************************
1054 * 2. The P7_ALIDISPLAY API
1055 *****************************************************************/
1056
1057 static int
integer_textwidth(long n)1058 integer_textwidth(long n)
1059 {
1060 int w = (n < 0)? 1 : 0;
1061 while (n != 0) { n /= 10; w++; }
1062 return w;
1063 }
1064
1065
1066 /* Function: p7_alidisplay_EncodePostProb()
1067 * Synopsis: Convert a posterior probability to a char code.
1068 *
1069 * Purpose: Convert the posterior probability <p> to
1070 * a character code suitable for Stockholm format
1071 * <#=GC PP_cons> and <#=GR seqname PP> annotation
1072 * lines. HMMER uses the same codes in alignment
1073 * output.
1074 *
1075 * Characters <0-9*> are used; $0.0 \leq p < 0.05$
1076 * is coded as 0, $0.05 \leq p < 0.15$ is coded as
1077 * 1, ... and so on ..., $0.85 \leq p < 0.95$ is
1078 * coded as 9, and $0.95 \leq p \leq 1.0$ is coded
1079 * as '*'.
1080 *
1081 * Returns: the encoded character.
1082 */
1083 char
p7_alidisplay_EncodePostProb(float p)1084 p7_alidisplay_EncodePostProb(float p)
1085 {
1086 return (p + 0.05 >= 1.0) ? '*' : (char) ((p + 0.05) * 10.0) + '0';
1087 }
1088
1089 /* Function: p7_alidisplay_DecodePostProb()
1090 * Synopsis: Convert a char code post prob to an approx float.
1091 *
1092 * Purpose: Convert posterior probability code <pc>, which
1093 * is [0-9*], to an approximate floating point probability.
1094 *
1095 * The result is crude, because <pc> has already discretized
1096 * with loss of precision. We require that
1097 * <p7_alidisplay_EncodePostProb(p7_alidisplay_DecodePostProb(pc)) == pc>,
1098 * and that <pc=='0'> decodes to a nonzero probability just to
1099 * avoid any possible absorbing-zero artifacts.
1100 *
1101 * Returns: the decoded real-valued approximate probability.
1102 */
1103 float
p7_alidisplay_DecodePostProb(char pc)1104 p7_alidisplay_DecodePostProb(char pc)
1105 {
1106 if (pc == '0') return 0.01;
1107 else if (pc == '*') return 1.0;
1108 else if (pc == '.') return 0.0;
1109 else return ((float) (pc - '0') / 10.);
1110 }
1111
1112
1113 /* Function: p7_alidisplay_Print()
1114 * Synopsis: Human readable output of <P7_ALIDISPLAY>
1115 *
1116 * Purpose: Prints alignment <ad> to stream <fp>.
1117 *
1118 * Put at least <min_aliwidth> alignment characters per
1119 * line; try to make lines no longer than <linewidth>
1120 * characters, including name, coords, and spacing. The
1121 * width of lines may exceed <linewidth>, if that's what it
1122 * takes to put a name, coords, and <min_aliwidth>
1123 * characters of alignment on a line.
1124 *
1125 * As a special case, if <linewidth> is negative or 0, then
1126 * alignments are formatted in a single block of unlimited
1127 * line length.
1128 *
1129 * Returns: <eslOK> on success.
1130 *
1131 * Throws: <eslEWRITE> on write error, such as filling the disk.
1132 */
1133 int
p7_alidisplay_Print(FILE * fp,P7_ALIDISPLAY * ad,int min_aliwidth,int linewidth,P7_PIPELINE * pli)1134 p7_alidisplay_Print(FILE *fp, P7_ALIDISPLAY *ad, int min_aliwidth, int linewidth, P7_PIPELINE *pli)
1135 {
1136 int status;
1137 if ((status = p7_nontranslated_alidisplay_Print(fp, ad, min_aliwidth, linewidth, pli->show_accessions)) != eslOK) return status;
1138
1139 return status;
1140 }
1141
1142 /* Function: p7_nontranslated_alidisplay_Print()
1143 * Synopsis: Human readable output of <P7_ALIDISPLAY>
1144 *
1145 * Purpose: Prints alignment <ad> to stream <fp>.
1146 *
1147 * Put at least <min_aliwidth> alignment characters per
1148 * line; try to make lines no longer than <linewidth>
1149 * characters, including name, coords, and spacing. The
1150 * width of lines may exceed <linewidth>, if that's what it
1151 * takes to put a name, coords, and <min_aliwidth>
1152 * characters of alignment on a line.
1153 *
1154 * As a special case, if <linewidth> is negative or 0, then
1155 * alignments are formatted in a single block of unlimited
1156 * line length.
1157 *
1158 * Returns: <eslOK> on success.
1159 *
1160 * Throws: <eslEWRITE> on write error, such as filling the disk.
1161 */
1162 int
p7_nontranslated_alidisplay_Print(FILE * fp,P7_ALIDISPLAY * ad,int min_aliwidth,int linewidth,int show_accessions)1163 p7_nontranslated_alidisplay_Print(FILE *fp, P7_ALIDISPLAY *ad, int min_aliwidth, int linewidth, int show_accessions)
1164 {
1165 char *buf = NULL;
1166 char *show_hmmname = NULL;
1167 char *show_seqname = NULL;
1168 int namewidth, coordwidth, aliwidth;
1169 int pos;
1170 int status;
1171 int ni, nk;
1172 int z;
1173 long i1,i2;
1174 int k1,k2;
1175
1176 /* implement the --acc option for preferring accessions over names in output */
1177 show_hmmname = (show_accessions && ad->hmmacc[0] != '\0') ? ad->hmmacc : ad->hmmname;
1178 show_seqname = (show_accessions && ad->sqacc[0] != '\0') ? ad->sqacc : ad->sqname;
1179
1180 /* dynamically size the output lines */
1181 namewidth = ESL_MAX(strlen(show_hmmname), strlen(show_seqname));
1182 coordwidth = ESL_MAX(ESL_MAX(integer_textwidth(ad->hmmfrom),
1183 integer_textwidth(ad->hmmto)),
1184 ESL_MAX(integer_textwidth(ad->sqfrom),
1185 integer_textwidth(ad->sqto)));
1186
1187 aliwidth = (linewidth > 0) ? linewidth - namewidth - 2*coordwidth - 5 : ad->N;
1188 if (aliwidth < ad->N && aliwidth < min_aliwidth) aliwidth = min_aliwidth; /* at least, regardless of some silly linewidth setting */
1189 ESL_ALLOC(buf, sizeof(char) * (aliwidth+1));
1190 buf[aliwidth] = 0;
1191
1192 /* Break the alignment into multiple blocks of width aliwidth for printing */
1193 i1 = ad->sqfrom;
1194 k1 = ad->hmmfrom;
1195 for (pos = 0; pos < ad->N; pos += aliwidth)
1196 {
1197 if (pos > 0) { if (fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "alignment display write failed"); } /* blank line betweeen blocks */
1198
1199 ni = nk = 0;
1200 for (z = pos; z < pos + aliwidth && z < ad->N; z++) {
1201 if (ad->model[z] != '.') nk++; /* k advances except on insert states */
1202 if (ad->aseq[z] != '-') ni++; /* i advances except on delete states */
1203 }
1204
1205 k2 = k1+nk-1;
1206 if (ad->sqfrom < ad->sqto) i2 = i1+ni-1;
1207 else i2 = i1-ni+1; // revcomp hit for DNA
1208
1209 if (ad->csline != NULL) { strncpy(buf, ad->csline+pos, aliwidth); if (fprintf(fp, " %*s %s CS\n", namewidth+coordwidth+1, "", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "alignment display write failed"); }
1210 if (ad->rfline != NULL) { strncpy(buf, ad->rfline+pos, aliwidth); if (fprintf(fp, " %*s %s RF\n", namewidth+coordwidth+1, "", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "alignment display write failed"); }
1211 if (ad->mmline != NULL) { strncpy(buf, ad->mmline+pos, aliwidth); if (fprintf(fp, " %*s %s MM\n", namewidth+coordwidth+1, "", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "alignment display write failed"); }
1212
1213 strncpy(buf, ad->model+pos, aliwidth); if (fprintf(fp, " %*s %*d %s %-*d\n", namewidth, show_hmmname, coordwidth, k1, buf, coordwidth, k2) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "alignment display write failed");
1214 strncpy(buf, ad->mline+pos, aliwidth); if (fprintf(fp, " %*s %s\n", namewidth+coordwidth+1, " ", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "alignment display write failed");
1215
1216 if (ni > 0) { strncpy(buf, ad->aseq+pos, aliwidth); if (fprintf(fp, " %*s %*ld %s %-*ld\n", namewidth, show_seqname, coordwidth, i1, buf, coordwidth, i2) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "alignment display write failed"); }
1217 else { strncpy(buf, ad->aseq+pos, aliwidth); if (fprintf(fp, " %*s %*s %s %*s\n", namewidth, show_seqname, coordwidth, "-", buf, coordwidth, "-") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "alignment display write failed"); }
1218
1219 if (ad->ppline != NULL) { strncpy(buf, ad->ppline+pos, aliwidth); if (fprintf(fp, " %*s %s PP\n", namewidth+coordwidth+1, "", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "alignment display write failed"); }
1220
1221 k1 += nk;
1222 if (ad->sqfrom < ad->sqto) i1 += ni;
1223 else i1 -= ni; // revcomp hit for DNA
1224 }
1225 fflush(fp);
1226 free(buf);
1227 return eslOK;
1228
1229 ERROR:
1230 if (buf) free(buf);
1231 return status;
1232 }
1233
1234 /* Function: p7_alidisplay_Backconvert()
1235 * Synopsis: Convert an alidisplay to a faux trace and subsequence.
1236 *
1237 * Purpose: Convert alignment display object <ad> to a faux subsequence
1238 * and faux subsequence trace, returning them in <ret_sq> and
1239 * <ret_tr>.
1240 *
1241 * The subsequence <*ret_sq> is digital; ascii residues in
1242 * <ad> are digitized using digital alphabet <abc>.
1243 *
1244 * The subsequence and trace are suitable for passing as
1245 * array elements to <p7_tracealign_Seqs>. This is the
1246 * main purpose of backconversion. Results of a profile
1247 * search are stored in a hit list as a processed
1248 * <P7_ALIDISPLAY>, not as a <P7_TRACE> and <ESL_SQ>, to
1249 * reduce space and to reduce communication overhead in
1250 * parallelized search implementations. After reduction
1251 * to a final hit list, a master may want to construct a
1252 * multiple alignment of all the significant hits.
1253 *
1254 * Returns: <eslOK> on success.
1255 *
1256 * Throws: <eslEMEM> on allocation failures. <eslECORRUPT> on unexpected internal
1257 * data corruption. On any exception, <*ret_sq> and <*ret_tr> are
1258 * <NULL>.
1259 *
1260 * Xref: SRE:J4/29.
1261 */
1262 int
p7_alidisplay_Backconvert(const P7_ALIDISPLAY * ad,const ESL_ALPHABET * abc,ESL_SQ ** ret_sq,P7_TRACE ** ret_tr)1263 p7_alidisplay_Backconvert(const P7_ALIDISPLAY *ad, const ESL_ALPHABET *abc, ESL_SQ **ret_sq, P7_TRACE **ret_tr)
1264 {
1265 ESL_SQ *sq = NULL; /* RETURN: faux subsequence */
1266 P7_TRACE *tr = NULL; /* RETURN: faux trace */
1267 int subL = 0; /* subsequence length in the <ad> */
1268 int a, i, k; /* coords for <ad>, <sq->dsq>, model */
1269 char s; /* current state type: MDI */
1270 int status;
1271
1272 /* Make a first pass over <ad> just to calculate subseq length */
1273 for (a = 0; a < ad->N; a++)
1274 if (! esl_abc_CIsGap(abc, ad->aseq[a])) subL++;
1275
1276 /* Allocations */
1277 if ((sq = esl_sq_CreateDigital(abc)) == NULL) { status = eslEMEM; goto ERROR; }
1278 if ((status = esl_sq_GrowTo(sq, subL)) != eslOK) goto ERROR;
1279
1280 if ((tr = (ad->ppline == NULL) ? p7_trace_Create() : p7_trace_CreateWithPP()) == NULL) { status = eslEMEM; goto ERROR; }
1281 if ((status = p7_trace_GrowTo(tr, subL+6)) != eslOK) goto ERROR; /* +6 is for SNB/ECT */
1282
1283 /* Construction of dsq, trace */
1284 sq->dsq[0] = eslDSQ_SENTINEL;
1285 if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_S, 0, 0) : p7_trace_AppendWithPP(tr, p7T_S, 0, 0, 0.0))) != eslOK) goto ERROR;
1286 if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_N, 0, 0) : p7_trace_AppendWithPP(tr, p7T_N, 0, 0, 0.0))) != eslOK) goto ERROR;
1287 if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_B, 0, 0) : p7_trace_AppendWithPP(tr, p7T_B, 0, 0, 0.0))) != eslOK) goto ERROR;
1288 k = ad->hmmfrom - 1; // -1 so the first M causes k to advance to <hmmfrom>.
1289 i = 1; // ... which assumes <ad> always starts with M; currently true, all alis are local alis.
1290 for (a = 0; a < ad->N; a++)
1291 {
1292 /* here, anything that could appear in the original input
1293 * sequence, as opposed to an alignment gap, needs to be
1294 * reconstructed. So, do not test for IsResidue(), because that
1295 * will fail to reconstruct * chars. use ! IsGap() instead.
1296 * [xref iss#135]
1297 */
1298 if (! esl_abc_CIsGap(abc, ad->model[a])) { k++; s = (! esl_abc_CIsGap(abc, ad->aseq[a]) ? p7T_M : p7T_D); }
1299 else s = p7T_I;
1300
1301 if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, s, k, i) : p7_trace_AppendWithPP(tr, s, k, i, p7_alidisplay_DecodePostProb(ad->ppline[a])))) != eslOK) goto ERROR;
1302
1303 switch (s) {
1304 case p7T_M: sq->dsq[i] = esl_abc_DigitizeSymbol(abc, ad->aseq[a]); i++; break;
1305 case p7T_I: sq->dsq[i] = esl_abc_DigitizeSymbol(abc, ad->aseq[a]); i++; break;
1306 case p7T_D: break;
1307 }
1308 }
1309 if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_E, 0, 0) : p7_trace_AppendWithPP(tr, p7T_E, 0, 0, 0.0))) != eslOK) goto ERROR;
1310 if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_C, 0, 0) : p7_trace_AppendWithPP(tr, p7T_C, 0, 0, 0.0))) != eslOK) goto ERROR;
1311 if ((status = ((ad->ppline == NULL) ? p7_trace_Append(tr, p7T_T, 0, 0) : p7_trace_AppendWithPP(tr, p7T_T, 0, 0, 0.0))) != eslOK) goto ERROR;
1312 sq->dsq[i] = eslDSQ_SENTINEL;
1313
1314 /* some sanity checks */
1315 if (tr->N != ad->N + 6) ESL_XEXCEPTION(eslECORRUPT, "backconverted trace ended up with unexpected size (%s/%s)", ad->sqname, ad->hmmname);
1316 if (k != ad->hmmto) ESL_XEXCEPTION(eslECORRUPT, "backconverted trace didn't end at expected place on model (%s/%s)", ad->sqname, ad->hmmname);
1317 if (i != subL+1) ESL_XEXCEPTION(eslECORRUPT, "backconverted subseq didn't end at expected length (%s/%s)", ad->sqname, ad->hmmname);
1318
1319 /* Set up <sq> annotation as a subseq of a source sequence */
1320 if ((status = esl_sq_FormatName(sq, "%s/%" PRId64 "-%" PRId64 "", ad->sqname, ad->sqfrom, ad->sqto)) != eslOK) goto ERROR;
1321 if ((status = esl_sq_FormatDesc(sq, "[subseq from] %s", ad->sqdesc[0] != '\0' ? ad->sqdesc : ad->sqname)) != eslOK) goto ERROR;
1322 if ((status = esl_sq_SetSource (sq, ad->sqname)) != eslOK) goto ERROR;
1323 if (ad->sqacc[0] != '\0') { if ((status = esl_sq_SetAccession (sq, ad->sqacc)) != eslOK) goto ERROR; }
1324 sq->n = subL;
1325 sq->start = ad->sqfrom;
1326 sq->end = ad->sqto;
1327 sq->C = 0;
1328 sq->W = subL;
1329 sq->L = ad->L;
1330
1331 tr->M = ad->M;
1332 tr->L = ad->L;
1333
1334 *ret_sq = sq;
1335 *ret_tr = tr;
1336 return eslOK;
1337
1338 ERROR:
1339 if (sq != NULL) esl_sq_Destroy(sq);
1340 if (tr != NULL) p7_trace_Destroy(tr);
1341 *ret_sq = NULL;
1342 *ret_tr = NULL;
1343 return status;
1344 }
1345 /*------------------- end, alidisplay API -----------------------*/
1346
1347
1348 /*****************************************************************
1349 * 3. Debugging/dev code
1350 *****************************************************************/
1351
1352
1353 /* Function: p7_alidisplay_Sample()
1354 * Synopsis: Sample a random, ugly <P7_ALIDISPLAY> for test purposes
1355 * Incept: SRE, Wed Feb 28 14:22:12 2018 [Caravan Palace, Dragons]
1356 *
1357 * Purpose: Sample a random, dirty <P7_ALIDISPLAY> of length <N> for
1358 * testing purposes, using random number generator <rng>.
1359 * Return it through <ret_ad>. Caller frees.
1360 *
1361 * P7_ALIDISPLAY is assumed to be a _local_ alignment.
1362 * Must start with M, and end with M|D.
1363 *
1364 * Args: rng - random number generator
1365 * N - length of alignment
1366 * ret_ad - RETURN: random sampled <P7_ALIDISPLAY>
1367 *
1368 * Returns: <eslOK> on success, and <ret_ad> points to the new
1369 * <P7_ALIDISPLAY>
1370 *
1371 * Throws: <eslEMEM> on allocation error, and <ret_ad> is NULL.
1372 */
1373 int
p7_alidisplay_Sample(ESL_RANDOMNESS * rng,int N,P7_ALIDISPLAY ** ret_ad)1374 p7_alidisplay_Sample(ESL_RANDOMNESS *rng, int N, P7_ALIDISPLAY **ret_ad)
1375 {
1376 P7_ALIDISPLAY *ad = NULL;
1377 char *guidestring = NULL; /* string [0..N-1] composed of MDI */
1378 int nM = 0;
1379 int nD = 0;
1380 int nI = 0;
1381 enum p7t_statetype_e last_st;
1382 int pos;
1383 int status;
1384
1385 ESL_ALLOC(guidestring, sizeof(char) * (N+1));
1386
1387 guidestring[0] = 'M'; nM++; last_st = p7T_M; /* local alignments must start with M */
1388 for (pos = 1; pos < N-1; pos++)
1389 {
1390 switch (last_st)
1391 {
1392 case p7T_M:
1393 switch (esl_rnd_Roll(rng, 3))
1394 {
1395 case 0: guidestring[pos] = 'M'; nM++; last_st = p7T_M; break;
1396 case 1: guidestring[pos] = 'D'; nD++; last_st = p7T_D; break;
1397 case 2: guidestring[pos] = 'I'; nI++; last_st = p7T_I; break;
1398 }
1399 break;
1400
1401 case p7T_I:
1402 switch (esl_rnd_Roll(rng, 2))
1403 {
1404 case 0: guidestring[pos] = 'M'; nM++; last_st = p7T_M; break;
1405 case 1: guidestring[pos] = 'I'; nI++; last_st = p7T_I; break;
1406 }
1407 break;
1408
1409 case p7T_D:
1410 switch (esl_rnd_Roll(rng, 2))
1411 {
1412 case 0: guidestring[pos] = 'M'; nM++; last_st = p7T_M; break;
1413 case 1: guidestring[pos] = 'D'; nD++; last_st = p7T_D; break;
1414 }
1415 break;
1416
1417 default:
1418 break;
1419 }
1420 }
1421 /* local alignments can end on M or D. (optimal local alignments can only end on M) */
1422 switch (last_st) {
1423 case p7T_I:
1424 guidestring[N-1] = 'M'; nM++; break;
1425 default:
1426 switch (esl_rnd_Roll(rng, 2)) {
1427 case 0: guidestring[N-1] = 'M'; nM++; break;
1428 case 1: guidestring[N-1] = 'D'; nD++; break;
1429 }
1430 break;
1431 }
1432 guidestring[N] = '\0';
1433
1434 ESL_ALLOC(ad, sizeof(P7_ALIDISPLAY));
1435 ad->rfline = ad->mmline = ad->csline = ad->model = ad->mline = ad->aseq = ad->ntseq = ad->ppline = NULL;
1436 ad->hmmname = ad->hmmacc = ad->hmmdesc = NULL;
1437 ad->sqname = ad->sqacc = ad->sqdesc = NULL;
1438 ad->mem = NULL;
1439 ad->memsize = 0;
1440
1441 /* Optional lines are added w/ 50% chance */
1442 if (esl_rnd_Roll(rng, 2) == 0) ESL_ALLOC(ad->rfline, sizeof(char) * (N+1));
1443 if (esl_rnd_Roll(rng, 2) == 0) ESL_ALLOC(ad->mmline, sizeof(char) * (N+1));
1444 if (esl_rnd_Roll(rng, 2) == 0) ESL_ALLOC(ad->csline, sizeof(char) * (N+1));
1445 if (esl_rnd_Roll(rng, 2) == 0) ESL_ALLOC(ad->ppline, sizeof(char) * (N+1));
1446 ESL_ALLOC(ad->model, sizeof(char) * (N+1));
1447 ESL_ALLOC(ad->mline, sizeof(char) * (N+1));
1448 ESL_ALLOC(ad->aseq, sizeof(char) * (N+1));
1449 ad->N = N;
1450
1451 esl_strdup("my_hmm", -1, &(ad->hmmname));
1452 if (esl_rnd_Roll(rng, 2) == 0) esl_strdup("PF000007", -1, &(ad->hmmacc)); else esl_strdup("", -1, &(ad->hmmacc));
1453 if (esl_rnd_Roll(rng, 2) == 0) esl_strdup("(hmm description)", -1, &(ad->hmmdesc)); else esl_strdup("", -1, &(ad->hmmdesc));
1454
1455 esl_strdup("my_seq", -1, &(ad->sqname));
1456 if (esl_rnd_Roll(rng, 2) == 0) esl_strdup("ABC000001.42", -1, &(ad->sqacc)); else esl_strdup("", -1, &(ad->sqacc));
1457 if (esl_rnd_Roll(rng, 2) == 0) esl_strdup("(sequence description)", -1, &(ad->sqdesc)); else esl_strdup("", -1, &(ad->sqdesc));
1458
1459 /* model, seq coords must look valid. */
1460 ad->hmmfrom = 100;
1461 ad->hmmto = ad->hmmfrom + nM + nD - 1;
1462 ad->M = ad->hmmto + esl_rnd_Roll(rng, 2);
1463
1464 ad->sqfrom = 1000;
1465 ad->sqto = ad->sqfrom + nM + nI - 1;
1466 ad->L = ad->sqto + esl_rnd_Roll(rng, 2);
1467
1468 /* rfline is free-char "reference annotation" on consensus; H3 puts '.' for inserts. */
1469 if (ad->rfline) {
1470 for (pos = 0; pos < N; pos++)
1471 ad->rfline[pos] = (guidestring[pos] == 'I' ? '.' : 'x');
1472 ad->rfline[pos] = '\0';
1473 }
1474
1475 /* mmline indicates which columns should be masked (assigned background distribution), '.' indicates no mask; H3 puts '.' for inserts. */
1476 if (ad->mmline) {
1477 for (pos = 0; pos < N; pos++)
1478 ad->mmline[pos] = (guidestring[pos] == 'I' ? '.' : '.');
1479 ad->mmline[pos] = '\0';
1480 }
1481
1482 /* csline is optional. It has free-char "consensus structure annotation" on consensus positions. H3 puts '.' on inserts. */
1483 if (ad->csline) {
1484 for (pos = 0; pos < N; pos++)
1485 ad->csline[pos] = (guidestring[pos] == 'I' ? '.' : 'X');
1486 ad->csline[pos] = '\0';
1487 }
1488
1489 /* the mandatory three-line alignment display:
1490 *
1491 * guidestring: MMMDI
1492 * model: XXXX.
1493 * mline: A+
1494 * aseq: AAA-a
1495 */
1496 for (pos = 0; pos < N; pos++)
1497 {
1498 switch (guidestring[pos]) {
1499 case 'M':
1500 ad->model[pos] = 'X';
1501 switch (esl_rnd_Roll(rng, 3)) {
1502 case 0: ad->mline[pos] = 'A';
1503 case 1: ad->mline[pos] = '+';
1504 case 2: ad->mline[pos] = ' ';
1505 }
1506 if (ad->mline[pos] == ' ' && esl_rnd_Roll(rng, 50) == 0) ad->aseq[pos] = '*'; // dirty aligned sequence up with nasty * stop codons, about 1/(3*50) of the time.
1507 else ad->aseq[pos] = 'A'; // ... they would only be aligned to ' ' on an mline.
1508 break; // ... they might appear aligned to a match or insert state (see iss#135)
1509
1510 case 'D':
1511 ad->model[pos] = 'X';
1512 ad->mline[pos] = ' ';
1513 ad->aseq[pos] = '-';
1514 break;
1515
1516 case 'I':
1517 ad->model[pos] = '.';
1518 ad->mline[pos] = ' ';
1519 ad->aseq[pos] = 'a';
1520 break;
1521 }
1522 }
1523 ad->model[pos] = '\0';
1524 ad->mline[pos] = '\0';
1525 ad->aseq[pos] = '\0';
1526
1527 /* ppline is optional */
1528 if (ad->ppline) {
1529 for (pos = 0; pos < N; pos++)
1530 ad->ppline[pos] = (guidestring[pos] == 'D' ? '.' : p7_alidisplay_EncodePostProb(esl_random(rng)));
1531 ad->ppline[pos] = '\0';
1532 }
1533
1534 free(guidestring);
1535 *ret_ad = ad;
1536 return eslOK;
1537
1538 ERROR:
1539 if (guidestring) free(guidestring);
1540 if (ad) p7_alidisplay_Destroy(ad);
1541 *ret_ad = NULL;
1542 return status;
1543 }
1544
1545
1546
1547 /* Function: p7_alidisplay_Dump()
1548 * Synopsis: Print contents of P7_ALIDISPLAY for inspection.
1549 *
1550 * Purpose: Print contents of the <P7_ALIDISPLAY> <ad> to
1551 * stream <fp> for inspection. Includes all elements
1552 * of the structure, whether the object is allocated
1553 * in serialized or deserialized form, and the total
1554 * size of the object in bytes.
1555 *
1556 * Returns: <eslOK>
1557 */
1558 int
p7_alidisplay_Dump(FILE * fp,const P7_ALIDISPLAY * ad)1559 p7_alidisplay_Dump(FILE *fp, const P7_ALIDISPLAY *ad)
1560 {
1561 fprintf(fp, "P7_ALIDISPLAY dump\n");
1562 fprintf(fp, "------------------\n");
1563
1564 fprintf(fp, "rfline = %s\n", ad->rfline ? ad->rfline : "[none]");
1565 fprintf(fp, "mmline = %s\n", ad->mmline ? ad->mmline : "[none]");
1566 fprintf(fp, "csline = %s\n", ad->csline ? ad->csline : "[none]");
1567 fprintf(fp, "model = %s\n", ad->model);
1568 fprintf(fp, "mline = %s\n", ad->mline);
1569 fprintf(fp, "aseq = %s\n", ad->aseq);
1570 fprintf(fp, "N = %d\n", ad->N);
1571 fprintf(fp, "\n");
1572
1573 fprintf(fp, "hmmname = %s\n", ad->hmmname);
1574 fprintf(fp, "hmmacc = %s\n", ad->hmmacc[0] == '\0' ? "[none]" : ad->hmmacc);
1575 fprintf(fp, "hmmdesc = %s\n", ad->hmmdesc[0] == '\0' ? "[none]" : ad->hmmdesc);
1576 fprintf(fp, "hmmfrom = %d\n", ad->hmmfrom);
1577 fprintf(fp, "hmmto = %d\n", ad->hmmto);
1578 fprintf(fp, "M = %d\n", ad->M);
1579 fprintf(fp, "\n");
1580
1581 fprintf(fp, "sqname = %s\n", ad->sqname);
1582 fprintf(fp, "sqacc = %s\n", ad->sqacc[0] == '\0' ? "[none]" : ad->sqacc);
1583 fprintf(fp, "sqdesc = %s\n", ad->sqdesc[0] == '\0' ? "[none]" : ad->sqdesc);
1584 fprintf(fp, "sqfrom = %" PRId64 "\n", ad->sqfrom);
1585 fprintf(fp, "sqto = %" PRId64 "\n", ad->sqto);
1586 fprintf(fp, "L = %" PRId64 "\n", ad->L);
1587 fprintf(fp, "\n");
1588
1589 fprintf(fp, "size = %d bytes\n", (int) p7_alidisplay_Sizeof(ad));
1590 fprintf(fp, "%s\n", ad->mem ? "serialized" : "not serialized");
1591 return eslOK;
1592 }
1593
1594 /* Function: p7_alidisplay_Compare()
1595 * Synopsis: Compare two <P7_ALIDISPLAY> objects for equality
1596 *
1597 * Purpose: Compare alignment displays <ad1> and <ad2> for
1598 * equality. Return <eslOK> if they have identical
1599 * contents; <eslFAIL> if not.
1600 *
1601 * Only contents matter, not serialization status;
1602 * a serialized and deserialized version of the same
1603 * alidisplay will compare identical.
1604 */
1605 int
p7_alidisplay_Compare(const P7_ALIDISPLAY * ad1,const P7_ALIDISPLAY * ad2)1606 p7_alidisplay_Compare(const P7_ALIDISPLAY *ad1, const P7_ALIDISPLAY *ad2)
1607 {
1608 if (ad1->mem && ad2->mem) /* both objects serialized */
1609 {
1610 if (ad1->memsize != ad2->memsize) return eslFAIL;
1611 if (memcmp(ad1->mem, ad2->mem, ad1->memsize) != 0) return eslFAIL;
1612 }
1613
1614 if (esl_strcmp(ad1->rfline, ad2->rfline) != eslOK) return eslFAIL;
1615 if (esl_strcmp(ad1->mmline, ad2->mmline) != eslOK) return eslFAIL;
1616 if (esl_strcmp(ad1->csline, ad2->csline) != eslOK) return eslFAIL;
1617 if (esl_strcmp(ad1->model, ad2->model) != eslOK) return eslFAIL;
1618 if (esl_strcmp(ad1->mline, ad2->mline) != eslOK) return eslFAIL;
1619 if (esl_strcmp(ad1->aseq, ad2->aseq) != eslOK) return eslFAIL;
1620 if (esl_strcmp(ad1->ntseq, ad2->ntseq) != eslOK) return eslFAIL;
1621 if (esl_strcmp(ad1->ppline, ad2->ppline) != eslOK) return eslFAIL;
1622 if (ad1->N != ad2->N) return eslFAIL;
1623
1624 if (esl_strcmp(ad1->hmmname, ad2->hmmname) != eslOK) return eslFAIL;
1625 if (esl_strcmp(ad1->hmmacc, ad2->hmmacc) != eslOK) return eslFAIL;
1626 if (esl_strcmp(ad1->hmmdesc, ad2->hmmdesc) != eslOK) return eslFAIL;
1627 if (ad1->hmmfrom != ad2->hmmfrom) return eslFAIL;
1628 if (ad1->hmmto != ad2->hmmto) return eslFAIL;
1629 if (ad1->M != ad2->M) return eslFAIL;
1630
1631 if (esl_strcmp(ad1->sqname, ad2->sqname) != eslOK) return eslFAIL;
1632 if (esl_strcmp(ad1->sqacc, ad2->sqacc) != eslOK) return eslFAIL;
1633 if (esl_strcmp(ad1->sqdesc, ad2->sqdesc) != eslOK) return eslFAIL;
1634 if (ad1->sqfrom != ad2->sqfrom) return eslFAIL;
1635 if (ad1->sqto != ad2->sqto) return eslFAIL;
1636 if (ad1->M != ad2->M) return eslFAIL;
1637
1638 return eslOK;
1639 }
1640
1641
1642 /*-------------- end, debugging/dev code ------------------------*/
1643
1644
1645
1646 /*****************************************************************
1647 * 4. Benchmark driver.
1648 *****************************************************************/
1649 #ifdef p7ALIDISPLAY_BENCHMARK
1650 /*
1651 gcc -o benchmark-alidisplay -std=gnu99 -g -Wall -O2 -I. -L. -I../easel -L../easel -Dp7ALIDISPLAY_BENCHMARK p7_alidisplay.c -lhmmer -leasel -lm
1652
1653 ./benchmark-alidisplay <hmmfile> runs benchmark
1654 ./benchmark-alidisplay -b <hmmfile> gets baseline time to subtract: just random trace generation
1655 */
1656 #include "p7_config.h"
1657
1658 #include "easel.h"
1659 #include "esl_alphabet.h"
1660 #include "esl_getopts.h"
1661 #include "esl_random.h"
1662 #include "esl_stopwatch.h"
1663
1664 #include "hmmer.h"
1665
1666 static ESL_OPTIONS options[] = {
1667 /* name type default env range toggles reqs incomp help docgroup*/
1668 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
1669 { "-b", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "baseline timing", 0 },
1670 { "-p", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "include fake PP line, just to see how it looks", 0 },
1671 { "-s", eslARG_INT, "42", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0 },
1672 { "-N", eslARG_INT, "50000", NULL, "n>0", NULL, NULL, NULL, "number of traces to generate", 0 },
1673 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1674 };
1675 static char usage[] = "[-options] <hmmfile>";
1676 static char banner[] = "benchmark driver for P7_ALIDISPLAY";
1677
1678 int
main(int argc,char ** argv)1679 main(int argc, char **argv)
1680 {
1681 ESL_GETOPTS *go = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage);
1682 char *hmmfile = esl_opt_GetArg(go, 1);
1683 int N = esl_opt_GetInteger(go, "-N");
1684 ESL_STOPWATCH *w = esl_stopwatch_Create();
1685 ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s"));
1686 ESL_ALPHABET *abc = NULL;
1687 P7_HMMFILE *hfp = NULL;
1688 P7_HMM *hmm = NULL;
1689 P7_BG *bg = NULL;
1690 P7_PROFILE *gm = NULL;
1691 P7_OPROFILE *om = NULL;
1692 P7_TRACE *tr = NULL;
1693 ESL_SQ *sq = NULL;
1694 P7_ALIDISPLAY *ad = NULL;
1695 int i,z;
1696
1697 if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile);
1698 if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM");
1699 p7_hmmfile_Close(hfp);
1700
1701
1702 bg = p7_bg_Create(abc);
1703 p7_bg_SetLength(bg, 0);
1704 gm = p7_profile_Create(hmm->M, abc);
1705 p7_ProfileConfig(hmm, bg, gm, 0, p7_UNIGLOCAL); /* that sets N,C,J to generate nothing */
1706 om = p7_oprofile_Create(gm->M, abc);
1707 p7_oprofile_Convert(gm, om);
1708
1709 if (esl_opt_GetBoolean(go, "-p")) tr = p7_trace_CreateWithPP();
1710 else tr = p7_trace_Create();
1711
1712 sq = esl_sq_CreateDigital(abc);
1713
1714 esl_stopwatch_Start(w);
1715 for (i = 0; i < N; i++)
1716 {
1717 p7_ProfileEmit(r, hmm, gm, bg, sq, tr);
1718 esl_sq_SetName(sq, "random");
1719
1720 if (! esl_opt_GetBoolean(go, "-b"))
1721 {
1722 if (esl_opt_GetBoolean(go, "-p"))
1723 for (z = 0; z < tr->N; z++)
1724 if (tr->i[z] > 0) tr->pp[z] = esl_random(r);
1725
1726 ad = p7_alidisplay_Create(tr, 0, om, sq, NULL);
1727 p7_alidisplay_Print(stdout, ad, 40, 80, FALSE);
1728 p7_alidisplay_Destroy(ad);
1729 }
1730 p7_trace_Reuse(tr);
1731 esl_sq_Reuse(sq);
1732 }
1733 esl_stopwatch_Stop(w);
1734 esl_stopwatch_Display(stdout, w, "# CPU time: ");
1735
1736 esl_sq_Destroy(sq);
1737 p7_trace_Destroy(tr);
1738 p7_oprofile_Destroy(om);
1739 p7_profile_Destroy(gm);
1740 p7_bg_Destroy(bg);
1741 p7_hmm_Destroy(hmm);
1742 esl_alphabet_Destroy(abc);
1743 esl_randomness_Destroy(r);
1744 esl_stopwatch_Destroy(w);
1745 esl_getopts_Destroy(go);
1746 return 0;
1747 }
1748 #endif /*p7ALIDISPLAY_BENCHMARK*/
1749 /*--------------------- end, benchmark driver -------------------*/
1750
1751 /****************************************************************
1752 * 5. Unit tests.
1753 ****************************************************************/
1754 #ifdef p7ALIDISPLAY_TESTDRIVE
1755
1756 /*Testing function that generates a P7_ALIDISPLAY containing a nucleotide sequence string rather than an amino
1757 *sequence string. This function should only be used for testing the serialization/deserializaton code. No attempt
1758 *is made to make the nucleotide string be reasonable or even valid -- it's just a valid C string of the correct length
1759 * Like p7_alidisplay_Sample, which it is based on, it randomly selects which of the optional fields should be present in the
1760 * alidisplay */
1761 static int
alidisplay_SampleFake_ntseq(ESL_RANDOMNESS * rng,int N,P7_ALIDISPLAY ** ret_ad)1762 alidisplay_SampleFake_ntseq(ESL_RANDOMNESS *rng, int N, P7_ALIDISPLAY **ret_ad)
1763 {
1764 P7_ALIDISPLAY *ad = NULL;
1765 char *guidestring = NULL; /* string [0..N-1] composed of MDI */
1766 int nM = 0;
1767 int nD = 0;
1768 int nI = 0;
1769 enum p7t_statetype_e last_st;
1770 int pos;
1771 int status;
1772
1773 ESL_ALLOC(guidestring, sizeof(char) * (N+1));
1774
1775 guidestring[0] = 'M'; nM++; last_st = p7T_M; /* local alignments must start with M */
1776 for (pos = 1; pos < N-1; pos++)
1777 {
1778 switch (last_st)
1779 {
1780 case p7T_M:
1781 switch (esl_rnd_Roll(rng, 3))
1782 {
1783 case 0: guidestring[pos] = 'M'; nM++; last_st = p7T_M; break;
1784 case 1: guidestring[pos] = 'D'; nD++; last_st = p7T_D; break;
1785 case 2: guidestring[pos] = 'I'; nI++; last_st = p7T_I; break;
1786 }
1787 break;
1788
1789 case p7T_I:
1790 switch (esl_rnd_Roll(rng, 2))
1791 {
1792 case 0: guidestring[pos] = 'M'; nM++; last_st = p7T_M; break;
1793 case 1: guidestring[pos] = 'I'; nI++; last_st = p7T_I; break;
1794 }
1795 break;
1796
1797 case p7T_D:
1798 switch (esl_rnd_Roll(rng, 2))
1799 {
1800 case 0: guidestring[pos] = 'M'; nM++; last_st = p7T_M; break;
1801 case 1: guidestring[pos] = 'D'; nD++; last_st = p7T_D; break;
1802 }
1803 break;
1804
1805 default:
1806 break;
1807 }
1808 }
1809 /* local alignments can end on M or D. (optimal local alignments can only end on M) */
1810 switch (last_st) {
1811 case p7T_I:
1812 guidestring[N-1] = 'M'; nM++; break;
1813 default:
1814 switch (esl_rnd_Roll(rng, 2)) {
1815 case 0: guidestring[N-1] = 'M'; nM++; break;
1816 case 1: guidestring[N-1] = 'D'; nD++; break;
1817 }
1818 break;
1819 }
1820 guidestring[N] = '\0';
1821
1822 ESL_ALLOC(ad, sizeof(P7_ALIDISPLAY));
1823 ad->rfline = ad->mmline = ad->csline = ad->model = ad->mline = ad->aseq = ad->ntseq = ad->ppline = NULL;
1824 ad->hmmname = ad->hmmacc = ad->hmmdesc = NULL;
1825 ad->sqname = ad->sqacc = ad->sqdesc = NULL;
1826 ad->mem = NULL;
1827 ad->memsize = 0;
1828
1829 /* Optional lines are added w/ 50% chance */
1830 if (esl_rnd_Roll(rng, 2) == 0) ESL_ALLOC(ad->rfline, sizeof(char) * (N+1));
1831 if (esl_rnd_Roll(rng, 2) == 0) ESL_ALLOC(ad->mmline, sizeof(char) * (N+1));
1832 if (esl_rnd_Roll(rng, 2) == 0) ESL_ALLOC(ad->csline, sizeof(char) * (N+1));
1833 if (esl_rnd_Roll(rng, 2) == 0) ESL_ALLOC(ad->ppline, sizeof(char) * (N+1));
1834 ESL_ALLOC(ad->model, sizeof(char) * (N+1));
1835 ESL_ALLOC(ad->mline, sizeof(char) * (N+1));
1836 ESL_ALLOC(ad->ntseq, sizeof(char) * ((3 *N)+1));
1837 ad->N = N;
1838
1839 esl_strdup("my_hmm", -1, &(ad->hmmname));
1840 if (esl_rnd_Roll(rng, 2) == 0) esl_strdup("PF000007", -1, &(ad->hmmacc)); else esl_strdup("", -1, &(ad->hmmacc));
1841 if (esl_rnd_Roll(rng, 2) == 0) esl_strdup("(hmm description)", -1, &(ad->hmmdesc)); else esl_strdup("", -1, &(ad->hmmdesc));
1842
1843 esl_strdup("my_seq", -1, &(ad->sqname));
1844 if (esl_rnd_Roll(rng, 2) == 0) esl_strdup("ABC000001.42", -1, &(ad->sqacc)); else esl_strdup("", -1, &(ad->sqacc));
1845 if (esl_rnd_Roll(rng, 2) == 0) esl_strdup("(sequence description)", -1, &(ad->sqdesc)); else esl_strdup("", -1, &(ad->sqdesc));
1846
1847 /* model, seq coords must look valid. */
1848 ad->hmmfrom = 100;
1849 ad->hmmto = ad->hmmfrom + nM + nD - 1;
1850 ad->M = ad->hmmto + esl_rnd_Roll(rng, 2);
1851
1852 ad->sqfrom = 1000;
1853 ad->sqto = ad->sqfrom + nM + nI - 1;
1854 ad->L = ad->sqto + esl_rnd_Roll(rng, 2);
1855
1856 /* rfline is free-char "reference annotation" on consensus; H3 puts '.' for inserts. */
1857 if (ad->rfline) {
1858 for (pos = 0; pos < N; pos++)
1859 ad->rfline[pos] = (guidestring[pos] == 'I' ? '.' : 'x');
1860 ad->rfline[pos] = '\0';
1861 }
1862
1863 /* mmline indicates which columns should be masked (assigned background distribution), '.' indicates no mask; H3 puts '.' for inserts. */
1864 if (ad->mmline) {
1865 for (pos = 0; pos < N; pos++)
1866 ad->mmline[pos] = (guidestring[pos] == 'I' ? '.' : '.');
1867 ad->mmline[pos] = '\0';
1868 }
1869
1870 /* csline is optional. It has free-char "consensus structure annotation" on consensus positions. H3 puts '.' on inserts. */
1871 if (ad->csline) {
1872 for (pos = 0; pos < N; pos++)
1873 ad->csline[pos] = (guidestring[pos] == 'I' ? '.' : 'X');
1874 ad->csline[pos] = '\0';
1875 }
1876
1877 /* the mandatory three-line alignment display:
1878 *
1879 * guidestring: MMMDI
1880 * model: XXXX.
1881 * mline: A+
1882 * ntseq: AAA-a
1883 */
1884 for (pos = 0; pos < N; pos++)
1885 {
1886 switch (guidestring[pos]) {
1887 case 'M':
1888 ad->model[pos] = 'X';
1889 switch (esl_rnd_Roll(rng, 3)) {
1890 case 0: ad->mline[pos] = 'A';
1891 case 1: ad->mline[pos] = '+';
1892 case 2: ad->mline[pos] = ' ';
1893 }
1894 if (ad->mline[pos] == ' ' && esl_rnd_Roll(rng, 50) == 0) ad->ntseq[pos] = '*'; // dirty aligned sequence up with nasty * stop codons, about 1/(3*50) of the time.
1895 else ad->ntseq[pos] = 'A'; // ... they would only be aligned to ' ' on an mline.
1896 break; // ... they might appear aligned to a match or insert state (see iss#135)
1897
1898 case 'D':
1899 ad->model[pos] = 'X';
1900 ad->mline[pos] = ' ';
1901 ad->ntseq[pos] = '-';
1902 break;
1903
1904 case 'I':
1905 ad->model[pos] = '.';
1906 ad->mline[pos] = ' ';
1907 ad->ntseq[pos] = 'a';
1908 break;
1909 }
1910 }
1911 ad->model[pos] = '\0';
1912 ad->mline[pos] = '\0';
1913
1914 // pad out ntseq to the 3N length it needs
1915 for (pos = N; pos < (3 * N); pos++){
1916 if(esl_rnd_Roll(rng, 50) == 0) ad->ntseq[pos] = '*'; // dirty aligned sequence up with nasty * stop codons, about 1/(3*50) of the time.
1917 else ad->ntseq[pos] = 'A'; // ... they would only be aligned to ' ' on an mline.
1918 }
1919 ad->ntseq[pos] = '\0';
1920
1921 /* ppline is optional */
1922 if (ad->ppline) {
1923 for (pos = 0; pos < N; pos++)
1924 ad->ppline[pos] = (guidestring[pos] == 'D' ? '.' : p7_alidisplay_EncodePostProb(esl_random(rng)));
1925 ad->ppline[pos] = '\0';
1926 }
1927
1928 free(guidestring);
1929 *ret_ad = ad;
1930 return eslOK;
1931
1932 ERROR:
1933 if (guidestring) free(guidestring);
1934 if (ad) p7_alidisplay_Destroy(ad);
1935 *ret_ad = NULL;
1936 return status;
1937 }
1938
1939
1940
1941
1942 static void
utest_Serialize(ESL_RANDOMNESS * rng,int ntrials)1943 utest_Serialize(ESL_RANDOMNESS *rng, int ntrials)
1944 {
1945 char msg[] = "utest_Serialize failed";
1946 P7_ALIDISPLAY **serial = malloc(ntrials * sizeof(P7_ALIDISPLAY *));
1947 P7_ALIDISPLAY **deserial = malloc(ntrials * sizeof(P7_ALIDISPLAY *));
1948 uint8_t **buf = malloc(sizeof(uint8_t *));
1949 uint32_t n = 0;
1950 uint32_t nalloc = 0;
1951 int alignment_length;
1952 int i;
1953
1954 *buf = NULL;
1955 for (i = 0; i < ntrials; i++)
1956 {
1957 // Create random alignment to serialize
1958 alignment_length = (esl_random_uint32(rng) %300) + 50;
1959 if (esl_rnd_Roll(rng, 2) == 0)
1960 { // 50% chance of alidisplay with an amino sequence, 50% chance of alidisplay with nucleotide seq.
1961 if (p7_alidisplay_Sample(rng, alignment_length, &(serial[i])) != eslOK) esl_fatal(msg);
1962 }
1963 else
1964 {
1965 if (alidisplay_SampleFake_ntseq(rng, alignment_length, &(serial[i])) != eslOK) esl_fatal(msg);
1966 }
1967 if (p7_alidisplay_Serialize(serial[i], buf, &n, &nalloc) != eslOK) esl_fatal(msg);
1968 }
1969
1970 n = 0; // reset to start of buffer
1971 for (i = 0; i < ntrials; i++)
1972 {
1973 if ((deserial[i] = p7_alidisplay_Create_empty()) == NULL) esl_fatal(msg);
1974 if( p7_alidisplay_Deserialize(*buf, &n, deserial[i]) != eslOK) esl_fatal(msg);
1975 }
1976
1977 // free the buffer here to make sure we've actually copied all the data out of it into the new structures
1978 free(*buf);
1979 free(buf);
1980 for (i = 0; i < ntrials; i++)
1981 if (p7_alidisplay_Compare(serial[i], deserial[i]) != eslOK) esl_fatal(msg); // deserialized structure didn't match serialized
1982 // haven't failed yet, so we've succeeded. Clean up and exit
1983
1984 for (i = 0; i < ntrials; i++) {
1985 p7_alidisplay_Destroy(serial[i]);
1986 p7_alidisplay_Destroy(deserial[i]);
1987 }
1988 free(serial);
1989 free(deserial);
1990 return;
1991 }
1992
1993 // Test that the _Serialize() function generates the correct errors when passed invalid arguments
1994 static void
utest_serialize_error_conditions(ESL_RANDOMNESS * rng)1995 utest_serialize_error_conditions(ESL_RANDOMNESS *rng)
1996 {
1997 char msg[] = "utest_serialize_error_conditions failed";
1998 P7_ALIDISPLAY *foo = NULL;
1999 uint8_t **buf = malloc(sizeof(uint8_t *));
2000 uint32_t n = 0;
2001 uint32_t nalloc = 0;
2002
2003 // Create an alisplay to work with. Don't really care about its contents -- other tests will verify
2004 // correct serialization and deserialization
2005 *buf = NULL; // set buf to valid value
2006 if ( p7_alidisplay_Sample(rng, 100, &foo) != eslOK) esl_fatal(msg);
2007 if ( p7_alidisplay_Serialize(foo, NULL, &n, &nalloc) != eslEINVAL) esl_fatal(msg); // Test 1: _Serialize returns error if passed NULL buffer
2008 if ( p7_alidisplay_Serialize(foo, buf, NULL, &nalloc) != eslEINVAL) esl_fatal(msg); // Test 2: error on NULL n ptr
2009 if ( p7_alidisplay_Serialize(NULL, buf, &n, &nalloc) != eslEINVAL) esl_fatal(msg); // Test 3: error on NULL object ptr
2010
2011 if (buf) { free(*buf); free(buf); }
2012 p7_alidisplay_Destroy(foo);
2013 return;
2014 }
2015
2016 static void
utest_deserialize_error_conditions(ESL_RANDOMNESS * rng)2017 utest_deserialize_error_conditions(ESL_RANDOMNESS *rng)
2018 {
2019 char msg[] = "utest_deserialize_error_conditions failed";
2020 P7_ALIDISPLAY *sampled = NULL; // sampled alidisplay that we'll serialze
2021 P7_ALIDISPLAY *deserial = NULL; // alidisplay to hold the deserialized object
2022 uint8_t *buf = NULL;
2023 uint32_t n = 0, nalloc = 0;
2024
2025 if ((deserial = p7_alidisplay_Create_empty()) == NULL) esl_fatal(msg);
2026 if ( p7_alidisplay_Sample(rng, 100, &sampled) != eslOK) esl_fatal(msg);
2027 if ( p7_alidisplay_Serialize(sampled, &buf, &n, &nalloc) != eslOK) esl_fatal(msg);
2028
2029
2030 if ( p7_alidisplay_Deserialize(NULL, &n, deserial) != eslEINVAL) esl_fatal(msg); // Test 1: error on buf == NULL;
2031 if ( p7_alidisplay_Deserialize(buf, NULL, deserial) != eslEINVAL) esl_fatal(msg); // Test 2: error on n == NULL
2032 if ( p7_alidisplay_Deserialize(buf, &n, NULL) != eslEINVAL) esl_fatal(msg); // Test 3: error on serialized object == NULL
2033
2034 free(buf);
2035 p7_alidisplay_Destroy(deserial);
2036 p7_alidisplay_Destroy(sampled);
2037 return;
2038 }
2039
2040
2041 static void
utest_Backconvert(int be_verbose,ESL_RANDOMNESS * rng,ESL_ALPHABET * abc,int ntrials,int N)2042 utest_Backconvert(int be_verbose, ESL_RANDOMNESS *rng, ESL_ALPHABET *abc, int ntrials, int N)
2043 {
2044 char msg[] = "utest_Backconvert failed";
2045 P7_ALIDISPLAY *ad = NULL;
2046 ESL_SQ *sq = NULL;
2047 P7_TRACE *tr = NULL;
2048 int trial;
2049
2050 for (trial = 0; trial < ntrials; trial++)
2051 {
2052 if ( p7_alidisplay_Sample(rng, N, &ad) != eslOK) esl_fatal(msg);
2053 if ( p7_alidisplay_Serialize_old(ad) != eslOK) esl_fatal(msg);
2054 if (be_verbose && p7_alidisplay_Dump(stdout, ad) != eslOK) esl_fatal(msg);
2055 if ( p7_alidisplay_Backconvert(ad, abc, &sq, &tr) != eslOK) esl_fatal(msg);
2056 if (be_verbose && p7_trace_Dump(stdout, tr, NULL, sq->dsq) != eslOK) esl_fatal(msg);
2057 if ( p7_trace_Validate(tr, abc, sq->dsq, NULL) != eslOK) esl_fatal(msg);
2058
2059 p7_alidisplay_Destroy(ad);
2060 esl_sq_Destroy(sq);
2061 p7_trace_Destroy(tr);
2062 }
2063 return;
2064 }
2065
2066
2067 #endif /*p7ALIDISPLAY_TESTDRIVE*/
2068 /*------------------- end, unit tests ---------------------------*/
2069
2070 /*****************************************************************
2071 * 6. Test driver.
2072 *****************************************************************/
2073 #ifdef p7ALIDISPLAY_TESTDRIVE
2074
2075 #include <stdlib.h>
2076 #include <string.h>
2077
2078 #include "easel.h"
2079 #include "esl_getopts.h"
2080 #include "esl_random.h"
2081
2082 #include "hmmer.h"
2083
2084 static ESL_OPTIONS options[] = {
2085 /* name type default env range togs reqs incomp help docgrp */
2086 {"-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0},
2087 {"-N", eslARG_INT, "10", NULL, NULL, NULL, NULL, NULL, "number of random-sampled alidisplays to test", 0},
2088 {"-L", eslARG_INT, "20", NULL, NULL, NULL, NULL, NULL, "length of random-sampled alidisplays to test", 0},
2089 {"-s", eslARG_INT, "0", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0},
2090 {"-v", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show verbose commentary/output", 0},
2091 { 0,0,0,0,0,0,0,0,0,0},
2092 };
2093 static char usage[] = "[-options]";
2094 static char banner[] = "test driver for p7_alidisplay.c";
2095
2096 int
main(int argc,char ** argv)2097 main(int argc, char **argv)
2098 {
2099 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
2100 ESL_RANDOMNESS *rng = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s"));
2101 ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO);
2102 int N = esl_opt_GetInteger(go, "-N");
2103 int L = esl_opt_GetInteger(go, "-L");
2104 int be_verbose = esl_opt_GetBoolean(go, "-v");
2105
2106 //utest_Serialize_old ( rng, N, L);
2107 utest_Serialize(rng, 100);
2108 utest_Backconvert(be_verbose, rng, abc, N, L);
2109 utest_serialize_error_conditions(rng);
2110 utest_deserialize_error_conditions(rng);
2111
2112 esl_alphabet_Destroy(abc);
2113 esl_randomness_Destroy(rng);
2114 esl_getopts_Destroy(go);
2115 return 0;
2116 }
2117 #endif /*p7ALIDISPLAY_TESTDRIVE*/
2118 /*------------------- end, test driver --------------------------*/
2119
2120
2121 /*****************************************************************
2122 * 7. Example.
2123 *****************************************************************/
2124 /*
2125 gcc -o p7_alidisplay_example -std=gnu99 -g -Wall -I. -L. -I../easel -L../easel -Dp7ALIDISPLAY_EXAMPLE p7_alidisplay.c -lhmmer -leasel -lm
2126 */
2127 #ifdef p7ALIDISPLAY_EXAMPLE
2128 #include "p7_config.h"
2129
2130 #include "easel.h"
2131 #include "esl_alphabet.h"
2132 #include "esl_getopts.h"
2133 #include "esl_sq.h"
2134 #include "esl_sqio.h"
2135
2136 #include "hmmer.h"
2137
2138 static ESL_OPTIONS options[] = {
2139 /* name type default env range toggles reqs incomp help docgroup*/
2140 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
2141 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
2142 };
2143 static char usage[] = "[-options] <hmmfile> <seqfile>";
2144 static char banner[] = "example driver for P7_ALIDISPLAY";
2145
2146 int
main(int argc,char ** argv)2147 main(int argc, char **argv)
2148 {
2149 ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage);
2150 char *hmmfile = esl_opt_GetArg(go, 1);
2151 char *seqfile = esl_opt_GetArg(go, 2);
2152 ESL_ALPHABET *abc = NULL;
2153 P7_HMMFILE *hfp = NULL;
2154 P7_HMM *hmm = NULL;
2155 P7_BG *bg = NULL;
2156 P7_PROFILE *gm = NULL;
2157 P7_OPROFILE *om = NULL;
2158 P7_TRACE *tr1 = NULL;
2159 P7_TRACE *tr2 = NULL;
2160 ESL_SQFILE *sqfp = NULL;
2161 ESL_SQ *sq = NULL;
2162 ESL_SQ *sq2 = NULL;
2163 P7_ALIDISPLAY *ad = NULL;
2164 P7_PIPELINE *pli = NULL;
2165 P7_TOPHITS *hitlist = NULL;
2166
2167 p7_FLogsumInit();
2168
2169 /* Read a single HMM from a file */
2170 if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile);
2171 if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM");
2172 p7_hmmfile_Close(hfp);
2173
2174 /* Read a single sequence from a file */
2175 if (esl_sqfile_Open(seqfile, eslSQFILE_UNKNOWN, NULL, &sqfp) != eslOK) p7_Fail("Failed to open sequence file %s", seqfile);
2176 sq = esl_sq_CreateDigital(abc);
2177 if (esl_sqio_Read(sqfp, sq) != eslOK) p7_Fail("Failed to read sequence");
2178
2179 /* Configure a profile from the HMM */
2180 bg = p7_bg_Create(abc);
2181 p7_bg_SetLength(bg, 0);
2182 gm = p7_profile_Create(hmm->M, abc);
2183 p7_ProfileConfig(hmm, bg, gm, 0, p7_UNILOCAL);
2184 om = p7_oprofile_Create(gm->M, abc);
2185 p7_oprofile_Convert(gm, om);
2186
2187 /* Create a pipeline and a top hits list */
2188 pli = p7_pipeline_Create(NULL/*=default*/, hmm->M, 400, FALSE, p7_SEARCH_SEQS);
2189 hitlist = p7_tophits_Create();
2190
2191 /* Run the pipeline */
2192 p7_pli_NewSeq(pli, sq);
2193 p7_bg_SetLength(bg, sq->n);
2194 p7_oprofile_ReconfigLength(om, sq->n);
2195 p7_Pipeline(pli, om, bg, sq, NULL, hitlist);
2196
2197 if (hitlist->N == 0) { p7_Fail("target sequence doesn't hit"); }
2198
2199 if (p7_alidisplay_Backconvert(hitlist->hit[0]->dcl[0].ad, abc, &sq2, &tr2) != eslOK) p7_Fail("backconvert failed");
2200
2201 p7_trace_Dump(stdout, tr2, gm, sq2->dsq);
2202
2203 p7_tophits_Destroy(hitlist);
2204 p7_alidisplay_Destroy(ad);
2205 esl_sq_Destroy(sq);
2206 esl_sq_Destroy(sq2);
2207 p7_trace_Destroy(tr2);
2208 p7_trace_Destroy(tr1);
2209 p7_oprofile_Destroy(om);
2210 p7_profile_Destroy(gm);
2211 p7_bg_Destroy(bg);
2212 p7_hmm_Destroy(hmm);
2213 esl_alphabet_Destroy(abc);
2214 esl_getopts_Destroy(go);
2215 return 0;
2216 }
2217 #endif /*p7ALIDISPLAY_EXAMPLE*/
2218
2219
2220