1 #ifndef VIENNA_RNA_PACKAGE_UNSTRUCTURED_DOMAIN_H
2 #define VIENNA_RNA_PACKAGE_UNSTRUCTURED_DOMAIN_H
3 
4 /**
5  *  @file unstructured_domains.h
6  *  @ingroup domains_up
7  *  @brief    Functions to modify unstructured domains, e.g. to incorporate ligands binding to unpaired stretches
8  */
9 
10 /**
11  *  @addtogroup domains_up
12  *
13  *  @brief  Add and modify unstructured domains to the RNA folding grammar
14  *
15  *  This module provides the tools to add and modify unstructured domains to the production rules of the RNA folding grammar.
16  *  Usually this functionality is utilized for incorporating ligand binding to unpaired stretches of an RNA.
17  *
18  *  @bug  Although the additional production rule(s) for unstructured domains as described in @ref sec_domains_up
19  *        are always treated as 'segments possibly bound to one or more ligands', the current implementation requires
20  *        that at least one ligand is bound. The default implementation already takes care of the required changes,
21  *        however, upon using callback functions other than the default ones, one has to take care of this fact.
22  *        Please also note, that this behavior might change in one of the next releases, such that the decomposition
23  *        schemes as shown above comply with the actual implementation.
24  *
25  *  A default implementation allows one to readily use this feature by simply adding sequence motifs and corresponding
26  *  binding free energies with the function vrna_ud_add_motif() (see also @ref ligands_up).
27  *
28  *  The grammar extension is realized using a callback function that
29  *  - evaluates the binding free energy of a ligand to its target sequence segment (white boxes in the figures above), or
30  *  - returns the free energy of an unpaired stretch possibly bound by a ligand, stored in the additional @em U DP matrix.
31  *
32  *  The callback is passed the segment positions, the loop context, and which of the two above mentioned
33  *  evaluations is required. A second callback implements the pre-processing step that
34  *  prepares the @em U DP matrix by evaluating all possible cases of the additional production rule.
35  *  Both callbacks have a default implementation in @em RNAlib, but may be over-written by a
36  *  user-implementation, making it fully user-customizable.
37  *
38  *  For equilibrium probability computations, two additional callbacks exist. One to store/add and one to retrieve the
39  *  probability of unstructured domains at particular positions. Our implementation already takes care of computing
40  *  the probabilities, but users of the unstructured domain feature are required to provide a mechanism to efficiently
41  *  store/add the corresponding values into some external data structure.
42  */
43 
44 
45 /**
46  *  @addtogroup ligands_up
47  *
48  *  @brief  Add ligand binding to loop regions using the @ref domains_up feature
49  *
50  *  Sometimes, certain ligands, like single strand binding (SSB) proteins, compete with intramolecular
51  *  base pairing of the RNA. In situations, where the dissociation constant of the ligand is known and
52  *  the ligand binds to a consecutive stretch of single-stranded nucleotides we can use the @ref domains_up
53  *  functionality to extend the RNA folding grammar. This module provides a convenience default implementation
54  *  that covers most of the application scenarios.
55  *
56  *  The function vrna_ud_add_motif() attaches a ligand's sequence motif and corresponding binding free energy
57  *  to the list of known ligand motifs within a #vrna_fold_compound_t.domains_up attribute. The first call to
58  *  this function initializes the @ref domains_up feature with our default implementation. Subsequent calls of
59  *  secondary structure prediction algorithms with the modified #vrna_fold_compound_t then directly include
60  *  the competition of the ligand with regular base pairing. Since we utilize the unstructured domain extension,
61  *  the ligand binding model can be removed again using the vrna_ud_remove() function.
62  *
63  */
64 
65 
66 /** @brief Typename for the ligand binding extension data structure #vrna_unstructured_domain_s
67  *  @ingroup domains_up
 *
 *  @note This typedef is declared ahead of the @c #include directives that follow it;
 *        the structure itself is defined further down in this header.
68  */
69 typedef struct vrna_unstructured_domain_s vrna_ud_t;
70 
/** @brief Typename for the unstructured domain motif list entry #vrna_unstructured_domain_motif_s
 *  @ingroup domains_up
 */
71 typedef struct vrna_unstructured_domain_motif_s vrna_ud_motif_t;
72 
73 #include <ViennaRNA/datastructures/basic.h>
74 #include <ViennaRNA/fold_compound.h>
75 #include <ViennaRNA/utils/structures.h>
76 
77 /**
78  *  @brief Callback to retrieve binding free energy of a ligand bound to an unpaired sequence segment
79  *
80  *  @ingroup domains_up
81  *
82  *  @callback
83  *  @parblock
84  *  This function will be called to determine the additional energy contribution of a specific unstructured
85  *  domain, e.g. the binding free energy of some ligand.
86  *  @endparblock
87  *
 *  @see vrna_ud_set_prod_rule_cb()
 *
88  *  @param  vc        The current #vrna_fold_compound_t
89  *  @param  i         The start of the unstructured domain (5' end)
90  *  @param  j         The end of the unstructured domain (3' end)
91  *  @param  loop_type The loop context of the unstructured domain (see the VRNA_UNSTRUCTURED_DOMAIN_* flags)
92  *  @param  data      Auxiliary data, i.e. the pointer stored in #vrna_unstructured_domain_s.data
93  *  @return           The auxiliary energy contribution in deka-cal/mol
94  */
95 typedef int (vrna_callback_ud_energy)(vrna_fold_compound_t  *vc,
96                                       int                   i,
97                                       int                   j,
98                                       unsigned int          loop_type,
99                                       void                  *data);
100 
101 /**
102  *  @brief Callback to retrieve Boltzmann factor of the binding free energy of a ligand bound to an unpaired sequence segment
103  *  @ingroup domains_up
104  *
105  *  @callback
106  *  @parblock
107  *  This function will be called to determine the additional energy contribution of a specific unstructured
108  *  domain, e.g. the binding free energy of some ligand (Partition function variant, i.e. the Boltzmann factors
109  *  instead of actual free energies).
110  *  @endparblock
111  *
 *  @see vrna_ud_set_exp_prod_rule_cb(), #vrna_callback_ud_energy
 *
112  *  @param  vc        The current #vrna_fold_compound_t
113  *  @param  i         The start of the unstructured domain (5' end)
114  *  @param  j         The end of the unstructured domain (3' end)
115  *  @param  loop_type The loop context of the unstructured domain (see the VRNA_UNSTRUCTURED_DOMAIN_* flags)
116  *  @param  data      Auxiliary data, i.e. the pointer stored in #vrna_unstructured_domain_s.data
117  *  @return           The auxiliary energy contribution as Boltzmann factor
118  */
119 typedef FLT_OR_DBL (vrna_callback_ud_exp_energy)(vrna_fold_compound_t *vc,
120                                                  int                  i,
121                                                  int                  j,
122                                                  unsigned int         loop_type,
123                                                  void                 *data);
124 
125 /**
126  *  @brief Callback for pre-processing the production rule of the ligand binding to unpaired stretches feature
127  *
128  *  @ingroup domains_up
129  *
130  *  @callback
131  *  @parblock
132  *  The production rule for the unstructured domain grammar extension
133  *  @endparblock
 *
 *  @see vrna_ud_set_prod_rule_cb()
 *
 *  @param  vc    The current #vrna_fold_compound_t
 *  @param  data  Auxiliary data (presumably the pointer stored in #vrna_unstructured_domain_s.data; TODO confirm)
134  */
135 typedef void (vrna_callback_ud_production)(vrna_fold_compound_t *vc,
136                                            void                 *data);
137 
138 /**
139  *  @brief Callback for pre-processing the production rule of the ligand binding to unpaired stretches feature (partition function variant)
140  *
141  *  @ingroup domains_up
142  *
143  *  @callback
144  *  @parblock
145  *  The production rule for the unstructured domain grammar extension (Partition function variant)
146  *  @endparblock
 *
 *  @see vrna_ud_set_exp_prod_rule_cb(), #vrna_callback_ud_production
 *
 *  @param  vc    The current #vrna_fold_compound_t
 *  @param  data  Auxiliary data (presumably the pointer stored in #vrna_unstructured_domain_s.data; TODO confirm)
147  */
148 typedef void (vrna_callback_ud_exp_production)(vrna_fold_compound_t *vc,
149                                                void                 *data);
150 
151 
152 /**
153  *  @brief Callback to store/add equilibrium probability for a ligand bound to an unpaired sequence segment
154  *  @ingroup domains_up
155  *
156  *  @callback
157  *  @parblock
158  *  A callback function to store equilibrium probabilities for the unstructured domain feature
159  *  @endparblock
 *
 *  @see vrna_ud_set_prob_cb(), #vrna_callback_ud_probs_get
 *
 *  @param  vc          The current #vrna_fold_compound_t
 *  @param  i           The start of the unstructured domain (5' end)
 *  @param  j           The end of the unstructured domain (3' end)
 *  @param  loop_type   The loop context of the unstructured domain (see the VRNA_UNSTRUCTURED_DOMAIN_* flags)
 *  @param  exp_energy  The value to store/add for the segment
 *                      (NOTE(review): exact meaning is not visible from this header; confirm against the implementation)
 *  @param  data        Auxiliary data (presumably the pointer stored in #vrna_unstructured_domain_s.data; TODO confirm)
160  */
161 typedef void (vrna_callback_ud_probs_add)(vrna_fold_compound_t  *vc,
162                                           int                   i,
163                                           int                   j,
164                                           unsigned int          loop_type,
165                                           FLT_OR_DBL            exp_energy,
166                                           void                  *data);
167 
168 /**
169  *  @brief Callback to retrieve equilibrium probability for a ligand bound to an unpaired sequence segment
170  *  @ingroup domains_up
171  *
172  *  @callback
173  *  @parblock
174  *  A callback function to retrieve equilibrium probabilities for the unstructured domain feature
175  *  @endparblock
 *
 *  @see vrna_ud_set_prob_cb(), #vrna_callback_ud_probs_add
 *
 *  @param  vc        The current #vrna_fold_compound_t
 *  @param  i         The start of the unstructured domain (5' end)
 *  @param  j         The end of the unstructured domain (3' end)
 *  @param  loop_type The loop context of the unstructured domain (see the VRNA_UNSTRUCTURED_DOMAIN_* flags)
 *  @param  motif     The motif the probability is requested for
 *                    (presumably the motif number as used in #vrna_unstructured_domain_s; TODO confirm)
 *  @param  data      Auxiliary data (presumably the pointer stored in #vrna_unstructured_domain_s.data; TODO confirm)
 *  @return           The equilibrium probability previously stored for the requested segment
176  */
177 typedef FLT_OR_DBL (vrna_callback_ud_probs_get)(vrna_fold_compound_t  *vc,
178                                                 int                   i,
179                                                 int                   j,
180                                                 unsigned int          loop_type,
181                                                 int                   motif,
182                                                 void                  *data);
183 
184 
185 /**
186  *  @brief Flag to indicate ligand bound to unpaired stretch in the exterior loop
187  *  @ingroup domains_up
188  */
189 #define VRNA_UNSTRUCTURED_DOMAIN_EXT_LOOP    1U
190 
191 /**
192  *  @brief Flag to indicate ligand bound to unpaired stretch in a hairpin loop
193  *  @ingroup domains_up
 *  @see #VRNA_UNSTRUCTURED_DOMAIN_ALL_LOOPS, vrna_ud_add_motif()
194  */
195 #define VRNA_UNSTRUCTURED_DOMAIN_HP_LOOP     2U
196 
197 /**
198  *  @brief Flag to indicate ligand bound to unpaired stretch in an interior loop
199  *  @ingroup domains_up
200  */
201 #define VRNA_UNSTRUCTURED_DOMAIN_INT_LOOP    4U
202 
203 /**
204  *  @brief Flag to indicate ligand bound to unpaired stretch in a multibranch loop
205  *  @ingroup domains_up
206  */
207 #define VRNA_UNSTRUCTURED_DOMAIN_MB_LOOP     8U
208 
209 /**
210  *  @brief Flag to indicate ligand binding without additional unbound nucleotides (motif-only)
211  *  @ingroup domains_up
 *
 *  When an energy callback receives this flag via its @p loop_type parameter, the contribution of
 *  any ligand that consecutively binds from position @p i to @p j is requested
 *  (see vrna_ud_set_prod_rule_cb()).
212  */
213 #define VRNA_UNSTRUCTURED_DOMAIN_MOTIF       16U
214 
215 /**
216  *  @brief Flag to indicate ligand bound to unpaired stretch in any loop (convenience macro)
217  *  @ingroup domains_up
 *
 *  Bitwise OR of the exterior, hairpin, interior, and multibranch loop flags.
 *  #VRNA_UNSTRUCTURED_DOMAIN_MOTIF is not part of this mask.
218  */
219 #define VRNA_UNSTRUCTURED_DOMAIN_ALL_LOOPS   (VRNA_UNSTRUCTURED_DOMAIN_EXT_LOOP | \
220                                               VRNA_UNSTRUCTURED_DOMAIN_HP_LOOP | \
221                                               VRNA_UNSTRUCTURED_DOMAIN_INT_LOOP | \
222                                               VRNA_UNSTRUCTURED_DOMAIN_MB_LOOP)
223 
224 /**
225  *  @brief  Data structure to store all functionality for ligand binding
226  *  @ingroup domains_up
 *
 *  The first group of members keeps track of the motifs that were added, the second
 *  group holds the callbacks (and their auxiliary data) that implement the grammar extension.
227  */
228 struct vrna_unstructured_domain_s {
229   /*
230    **********************************
231    * Keep track of all motifs added
232    **********************************
233    */
234   int           uniq_motif_count;                   /**<  @brief The unique number of motifs of different lengths */
235   unsigned int  *uniq_motif_size;                   /**<  @brief An array storing a unique list of motif lengths */
236 
237   int           motif_count;                        /**<  @brief Total number of distinguished motifs */
238   char          **motif;                            /**<  @brief Motif sequences */
239   char          **motif_name;                       /**<  @brief Motif identifier/name */
240   unsigned int  *motif_size;                        /**<  @brief Motif lengths */
241   double        *motif_en;                          /**<  @brief Ligand binding free energy contribution (presumably in kcal/mol, as passed to vrna_ud_add_motif(); TODO confirm) */
242   unsigned int  *motif_type;                        /**<  @brief Type of motif, i.e. loop type the ligand binds to */
243 
244   /*
245    **********************************
246    * Grammar extension for ligand
247    * binding
248    **********************************
249    */
250   vrna_callback_ud_production     *prod_cb;       /**<  @brief Callback to ligand binding production rule, i.e. create/fill DP free energy matrices
251                                                    *    @details This callback will be executed right before the actual secondary structure decompositions,
252                                                    *    and, therefore, any implementation must not interleave with the regular DP matrices.
253                                                    */
254   vrna_callback_ud_exp_production *exp_prod_cb;   /**<  @brief Callback to ligand binding production rule, i.e. create/fill DP partition function matrices */
255   vrna_callback_ud_energy         *energy_cb;     /**<  @brief Callback to evaluate free energy of ligand binding to a particular unpaired stretch */
256   vrna_callback_ud_exp_energy     *exp_energy_cb; /**<  @brief Callback to evaluate Boltzmann factor of ligand binding to a particular unpaired stretch */
257   void                            *data;          /**<  @brief Auxiliary data structure passed to energy evaluation callbacks */
258   vrna_callback_free_auxdata      *free_data;     /**<  @brief Callback to free auxiliary data structure */
259   vrna_callback_ud_probs_add      *probs_add;     /**<  @brief Callback to store/add outside partition function */
260   vrna_callback_ud_probs_get      *probs_get;     /**<  @brief Callback to retrieve outside partition function */
261 };
262 
263 
/**
 *  @brief  List entry describing a single unstructured domain motif found in a structure
 *
 *  Lists of this type are returned by vrna_ud_motifs_centroid(), vrna_ud_motifs_MEA(), and
 *  vrna_ud_motifs_MFE(), where the last element is marked by @p start=0, @p number=-1.
 *
 *  @ingroup domains_up
 */
264 struct vrna_unstructured_domain_motif_s {
265   int start;    /**< @brief Start of the motif; 0 in the list terminator (presumably a 1-based sequence position; TODO confirm) */
266   int number;   /**< @brief Motif number; -1 in the list terminator (presumably an index into the motif arrays of #vrna_unstructured_domain_s; TODO confirm) */
267 };
268 
269 
270 /**
271  *  @brief Detect unstructured domains in centroid structure
272  *
273  *  Given a centroid structure and a set of unstructured domains compute
274  *  the list of unstructured domain motifs present in the centroid.
275  *  Since we do not explicitly annotate unstructured domain motifs in
276  *  dot-bracket strings, this function can be used to check for the
277  *  presence and location of unstructured domain motifs under the
278  *  assumption that the dot-bracket string is the centroid structure
279  *  of the equilibrium ensemble.
280  *
281  *  @see vrna_centroid()
282  *  @ingroup domains_up
283  *
284  *  @param  fc        The fold_compound data structure with pre-computed equilibrium probabilities and model settings
285  *  @param  structure The centroid structure in dot-bracket notation
286  *  @return           A list of unstructured domain motifs (possibly NULL). The last element terminates the list with
287  *                    @p start=0, @p number=-1
288  */
289 vrna_ud_motif_t *
290 vrna_ud_motifs_centroid(vrna_fold_compound_t  *fc,
291                         const char            *structure);
292 
293 
294 /**
295  *  @brief Detect unstructured domains in MEA structure
296  *
297  *  Given an MEA structure and a set of unstructured domains compute
298  *  the list of unstructured domain motifs present in the MEA structure.
299  *  Since we do not explicitly annotate unstructured domain motifs in
300  *  dot-bracket strings, this function can be used to check for the
301  *  presence and location of unstructured domain motifs under the
302  *  assumption that the dot-bracket string is the MEA structure
303  *  of the equilibrium ensemble.
304  *
305  *  @see MEA()
306  *  @ingroup domains_up
307  *
308  *  @param  fc                The fold_compound data structure with pre-computed equilibrium probabilities and model settings
309  *  @param  structure         The MEA structure in dot-bracket notation
310  *  @param  probability_list  The list of probabilities to extract the MEA structure from
311  *  @return                   A list of unstructured domain motifs (possibly NULL). The last element terminates the list
312  *                            with @p start=0, @p number=-1
313  */
314 vrna_ud_motif_t *
315 vrna_ud_motifs_MEA(vrna_fold_compound_t *fc,
316                    const char           *structure,
317                    vrna_ep_t            *probability_list);
318 
319 
320 /**
321  *  @brief Detect unstructured domains in MFE structure
322  *
323  *  Given an MFE structure and a set of unstructured domains compute
324  *  the list of unstructured domain motifs present in the MFE structure.
325  *  Since we do not explicitly annotate unstructured domain motifs in
326  *  dot-bracket strings, this function can be used to check for the
327  *  presence and location of unstructured domain motifs under the
328  *  assumption that the dot-bracket string is the MFE structure
329  *  of the equilibrium ensemble.
330  *
331  *  @see vrna_mfe()
332  *  @ingroup domains_up
333  *
334  *  @param  fc        The fold_compound data structure with model settings
335  *  @param  structure The MFE structure in dot-bracket notation
336  *  @return           A list of unstructured domain motifs (possibly NULL). The last element terminates the list with @p start=0, @p number=-1
337  */
338 vrna_ud_motif_t *
339 vrna_ud_motifs_MFE(vrna_fold_compound_t *fc,
340                    const char           *structure);
341 
342 
343 /**
344  *  @brief  Add an unstructured domain motif, e.g. for ligand binding
345  *
346  *  This function adds a ligand binding motif and the associated binding free energy
347  *  to the #vrna_ud_t attribute of a #vrna_fold_compound_t. The motif data
348  *  will then be used in subsequent secondary structure predictions. Multiple calls
349  *  to this function with different motifs append all additional data to a list of
350  *  ligands, which all will be evaluated. Ligand motif data can be removed from the
351  *  #vrna_fold_compound_t again using the vrna_ud_remove() function. The loop
352  *  type parameter allows one to limit the ligand binding to particular loop type,
353  *  such as the exterior loop, hairpin loops, interior loops, or multibranch loops.
354  *
 *  @note The first call to this function initializes the @ref domains_up feature
 *        with the default implementation.
 *
355  *  @see  #VRNA_UNSTRUCTURED_DOMAIN_EXT_LOOP, #VRNA_UNSTRUCTURED_DOMAIN_HP_LOOP,
356  *  #VRNA_UNSTRUCTURED_DOMAIN_INT_LOOP, #VRNA_UNSTRUCTURED_DOMAIN_MB_LOOP, #VRNA_UNSTRUCTURED_DOMAIN_ALL_LOOPS,
357  *  vrna_ud_remove()
358  *
359  *  @ingroup domains_up
360  *
361  *  @param  vc          The #vrna_fold_compound_t data structure the ligand motif should be bound to
362  *  @param  motif       The sequence motif the ligand binds to
363  *  @param  motif_en    The binding free energy of the ligand in kcal/mol
364  *  @param  motif_name  The name/id of the motif (may be @p NULL)
365  *  @param  loop_type   The loop type the ligand binds to, given as one of the loop type flags or a
 *                      combination of them such as #VRNA_UNSTRUCTURED_DOMAIN_ALL_LOOPS
366  *
367  */
368 void  vrna_ud_add_motif(vrna_fold_compound_t  *vc,
369                         const char            *motif,
370                         double                motif_en,
371                         const char            *motif_name,
372                         unsigned int          loop_type);
373 
374 
375 /**
376  *  @brief  Get a list of unique motif sizes that start at a certain position within the sequence
377  *
 *  @ingroup domains_up
 *
 *  @param  vc        The #vrna_fold_compound_t data structure holding the unstructured domain motifs
 *  @param  i         The sequence position the motifs have to start at
 *  @param  loop_type The loop context to consider (see the VRNA_UNSTRUCTURED_DOMAIN_* flags)
 *  @return           A list of motif sizes
 *                    (NOTE(review): list termination and ownership of the returned memory are not
 *                    visible from this header; confirm against the implementation)
378  */
379 int *vrna_ud_get_motif_size_at(vrna_fold_compound_t *vc,
380                                int                  i,
381                                unsigned int         loop_type);
382 
383 
/**
 *  @brief  Get a list of motifs at a certain position within the sequence
 *
 *  NOTE(review): this declaration was undocumented; the description below is inferred from the
 *  name and from the sibling vrna_ud_get_motif_size_at(). Confirm against the implementation.
 *
 *  @ingroup domains_up
 *
 *  @param  vc        The #vrna_fold_compound_t data structure holding the unstructured domain motifs
 *  @param  i         The sequence position of interest (presumably the start of the motifs)
 *  @param  loop_type The loop context to consider (see the VRNA_UNSTRUCTURED_DOMAIN_* flags)
 *  @return           A list of integers, presumably motif numbers; list termination and ownership
 *                    of the returned memory are not visible from this header
 */
384 int *
385 vrna_ud_get_motifs_at(vrna_fold_compound_t  *vc,
386                       int                   i,
387                       unsigned int          loop_type);
388 
389 
/**
 *  @brief  Detect unstructured domain motifs in a given structure
 *
 *  NOTE(review): this declaration was undocumented; the description is inferred from the name
 *  and from vrna_ud_motifs_centroid(), vrna_ud_motifs_MEA(), and vrna_ud_motifs_MFE().
 *  Confirm against the implementation.
 *
 *  @ingroup domains_up
 *
 *  @param  vc        The #vrna_fold_compound_t data structure holding the unstructured domain motifs
 *  @param  structure The structure to inspect (presumably in dot-bracket notation)
 *  @return           A list of unstructured domain motifs; presumably terminated like the lists of
 *                    the functions named above (@p start=0, @p number=-1) and possibly NULL
 */
390 vrna_ud_motif_t *
391 vrna_ud_detect_motifs(vrna_fold_compound_t  *vc,
392                       const char            *structure);
393 
394 
395 /**
396  *  @brief Remove ligand binding to unpaired stretches
397  *
398  *  This function removes all ligand motifs that were bound to a #vrna_fold_compound_t using
399  *  the vrna_ud_add_motif() function.
400  *
 *  @see vrna_ud_add_motif()
 *
401  *  @ingroup domains_up
402  *
403  *  @param vc The #vrna_fold_compound_t data structure the ligand motif data should be removed from
404  */
405 void  vrna_ud_remove(vrna_fold_compound_t *vc);
406 
407 
408 /**
409  *  @brief  Attach an auxiliary data structure
410  *
411  *  This function binds an arbitrary, auxiliary data structure for user-implemented ligand binding.
412  *  The optional callback @p free_cb will be passed the bound data structure whenever the #vrna_fold_compound_t
413  *  is removed from memory to avoid memory leaks.
414  *
415  *  @see vrna_ud_set_prod_rule_cb(), vrna_ud_set_exp_prod_rule_cb(),
416  *  vrna_ud_remove()
417  *
418  *  @ingroup domains_up
419  *
420  *  @param  vc      The #vrna_fold_compound_t data structure the auxiliary data structure should be bound to
421  *  @param  data    A pointer to the auxiliary data structure
422  *  @param  free_cb A pointer to a callback function that frees memory occupied by @p data (optional)
423  */
424 void  vrna_ud_set_data(vrna_fold_compound_t       *vc,
425                        void                       *data,
426                        vrna_callback_free_auxdata *free_cb);
427 
428 
429 /**
430  *  @brief Attach production rule callbacks for free energy computations
431  *
432  *  Use this function to bind a user-implemented grammar extension for unstructured
433  *  domains.
434  *
435  *  The callback @p e_cb needs to evaluate the free energy contribution @f$f(i,j)@f$ of
436  *  the unpaired segment @f$[i,j]@f$. It will be executed in each of the regular secondary
437  *  structure production rules. Whenever the callback is passed the #VRNA_UNSTRUCTURED_DOMAIN_MOTIF
438  *  flag via its @p loop_type parameter the contribution of any ligand that consecutively
439  *  binds from position @f$i@f$ to @f$j@f$ (the white box) is requested. Otherwise, the callback
440  *  usually performs a lookup in the precomputed @p B matrices. Which @p B matrix is
441  *  addressed will be indicated by the flags #VRNA_UNSTRUCTURED_DOMAIN_EXT_LOOP, #VRNA_UNSTRUCTURED_DOMAIN_HP_LOOP,
442  *  #VRNA_UNSTRUCTURED_DOMAIN_INT_LOOP, and #VRNA_UNSTRUCTURED_DOMAIN_MB_LOOP. As their names already imply,
443  *  they specify exterior loops (@p F production rule), hairpin loops and interior loops
444  *  (@p C production rule), and multibranch loops (@p M and @p M1 production rule).
445  *
446  *  @image html   ligands_up_callback.svg
447  *  @image latex  ligands_up_callback.eps
448  *
449  *  The @p pre_cb callback will be executed as a pre-processing step right before the
450  *  regular secondary structure rules. Usually one would use this callback to fill the
451  *  dynamic programming matrices @p U and to prepare the auxiliary data structure
452  *  #vrna_unstructured_domain_s.data.
453  *
454  *  @image html   B_prod_rule.svg
455  *  @image latex  B_prod_rule.eps
456  *
457  *  @ingroup domains_up
458  *
459  *  @param  vc      The #vrna_fold_compound_t data structure the callback will be bound to
460  *  @param  pre_cb  A pointer to a callback function for the @p B production rule
461  *  @param  e_cb    A pointer to a callback function for free energy evaluation
462  */
463 void vrna_ud_set_prod_rule_cb(vrna_fold_compound_t        *vc,
464                               vrna_callback_ud_production *pre_cb,
465                               vrna_callback_ud_energy     *e_cb);
466 
467 
468 /**
469  *  @brief Attach production rule for partition function
470  *
471  *  This function is the partition function companion of vrna_ud_set_prod_rule_cb().
472  *
473  *  Use it to bind callbacks to (i) fill the @p U production rule dynamic programming
474  *  matrices and/or prepare the #vrna_unstructured_domain_s.data, and (ii) provide a callback
475  *  to retrieve partition functions for subsegments @f$ [i,j] @f$.
476  *
477  *  @image html   B_prod_rule.svg
478  *  @image latex  B_prod_rule.eps
479  *
480  *  @image html   ligands_up_callback.svg
481  *  @image latex  ligands_up_callback.eps
482  *
483  *  @ingroup domains_up
484  *
485  *  @see vrna_ud_set_prod_rule_cb(), #vrna_callback_ud_exp_production, #vrna_callback_ud_exp_energy
486  *
487  *  @param  vc        The #vrna_fold_compound_t data structure the callback will be bound to
488  *  @param  pre_cb    A pointer to a callback function for the @p B production rule
489  *  @param  exp_e_cb  A pointer to a callback function that retrieves the partition function
490  *                    for a segment @f$[i,j]@f$ that may be bound by one or more ligands.
491  */
492 void  vrna_ud_set_exp_prod_rule_cb(vrna_fold_compound_t             *vc,
493                                    vrna_callback_ud_exp_production  *pre_cb,
494                                    vrna_callback_ud_exp_energy      *exp_e_cb);
495 
496 
/**
 *  @brief Attach callbacks to store/add and to retrieve equilibrium probabilities of unstructured domains
 *
 *  NOTE(review): this declaration was undocumented. Presumably the callbacks end up in the
 *  @p probs_add and @p probs_get members of #vrna_unstructured_domain_s; confirm against the
 *  implementation.
 *
 *  @ingroup domains_up
 *
 *  @see #vrna_callback_ud_probs_add, #vrna_callback_ud_probs_get
 *
 *  @param  vc      The #vrna_fold_compound_t data structure the callbacks will be bound to
 *  @param  setter  A pointer to a callback function that stores/adds probabilities
 *  @param  getter  A pointer to a callback function that retrieves probabilities
 */
497 void  vrna_ud_set_prob_cb(vrna_fold_compound_t        *vc,
498                           vrna_callback_ud_probs_add  *setter,
499                           vrna_callback_ud_probs_get  *getter);
500 
501 
502 #endif
503