1 #ifndef VIENNA_RNA_PACKAGE_STRING_UTILS_H
2 #define VIENNA_RNA_PACKAGE_STRING_UTILS_H
3 
/**
 *  @brief Mark a declaration as deprecated
 *
 *  When VRNA_WARN_DEPRECATED is defined and the compiler is clang or a
 *  GCC-compatible compiler, this expands to @p func followed by a deprecation
 *  attribute carrying @p msg. In every other case it expands to plain @p func.
 *
 *  The hint is passed as the single message argument for both compilers.
 *  clang's two-argument form deprecated(message, replacement) interprets the
 *  second string as a Fix-It replacement for the identifier, which is not what
 *  a free-text hint such as "Use foo() instead" is meant to be.
 *
 *  NOTE(review): if other headers of the package define DEPRECATED with a
 *  different expansion, keep them in sync to avoid macro-redefinition warnings.
 *
 *  @param  func  The declaration to annotate (without the trailing semicolon)
 *  @param  msg   String literal shown with the deprecation warning
 */
#if defined(VRNA_WARN_DEPRECATED) && (defined(__clang__) || defined(__GNUC__))
# define DEPRECATED(func, msg) func __attribute__ ((deprecated(msg)))
#else
# define DEPRECATED(func, msg) func
#endif
15 
16 /**
17  *  @file     ViennaRNA/utils/strings.h
18  *  @ingroup  utils, string_utils
19  *  @brief    General utility- and helper-functions for RNA sequence and structure strings used throughout the ViennaRNA Package
20  */
21 
22 /**
23  *  @addtogroup   string_utils
24  *  @{
25  *  @brief  Functions to parse, convert, manipulate, create, and compare (nucleic acid sequence) strings.
26  */
27 
28 #include <stdarg.h>
29 #include <ViennaRNA/datastructures/basic.h>
30 
/**
 * @brief Stringify a macro argument
 *
 * Turns the argument into a string literal exactly as written, i.e. macros
 * contained in the argument are not expanded.
 */
#define STR(s) #s

/**
 * @brief Stringify a macro after expansion
 *
 * The additional level of indirection lets the preprocessor expand the
 * argument before STR() converts the result into a string literal.
 */
#define XSTR(s) STR(s)
40 
#ifndef FILENAME_MAX_LENGTH

/**
 *  @brief Maximum length of filenames that are generated by our programs
 *
 *  This definition should be used throughout the complete ViennaRNA package
 *  wherever a static array holding filenames of output files is declared.
 */
#define FILENAME_MAX_LENGTH   80

#endif

/*
 * Guarded separately so that FILENAME_ID_LENGTH is still available when
 * FILENAME_MAX_LENGTH has already been defined elsewhere.
 */
#ifndef FILENAME_ID_LENGTH

/**
 *  @brief Maximum length of an ID taken from a FASTA header for filename generation
 *
 *  This has to be smaller than #FILENAME_MAX_LENGTH since, in most cases,
 *  some suffix will be appended to the ID.
 */
#define FILENAME_ID_LENGTH    42

#endif
60 
#ifdef HAVE_CONFIG_H
#include <config.h>
#ifndef HAVE_STRDUP
/*
 * The build configuration does not define HAVE_STRDUP, so the prototype of
 * strdup() is declared here.
 * NOTE(review): presumably a replacement implementation is supplied elsewhere
 * in the package - confirm.
 */
char *strdup(const char *s);


#endif
#endif
69 
/**
 *  @brief Safely create a formatted string
 *
 *  This function is a safe implementation for creating a formatted character array,
 *  similar to @em sprintf.
 *  Internally, it uses the @em asprintf function, if available, to dynamically allocate
 *  a character array large enough to store the supplied content. If @em asprintf is
 *  not available, its behavior is mimicked using @em vsnprintf.
 *
 *  @note The pointer returned by this function should always be passed to @em free() to
 *  release the allocated memory
 *
 *  @see vrna_strdup_vprintf(), vrna_strcat_printf()
 *
 *  @param  format  The format string (See also asprintf)
 *  @param  ...     The list of variables used to fill the format string
 *  @return         The formatted, null-terminated string, or NULL if something has gone wrong
 */
char *
vrna_strdup_printf(const char *format,
                   ...);
91 
92 
/**
 *  @brief Safely create a formatted string
 *
 *  This function is the @em va_list version of vrna_strdup_printf()
 *
 *  @note The pointer returned by this function should always be passed to @em free() to
 *  release the allocated memory
 *
 *  @see vrna_strdup_printf(), vrna_strcat_printf(), vrna_strcat_vprintf()
 *
 *  @param  format  The format string (See also asprintf)
 *  @param  argp    The list of arguments used to fill the format string
 *  @return         The formatted, null-terminated string, or NULL if something has gone wrong
 */
char *
vrna_strdup_vprintf(const char  *format,
                    va_list     argp);
110 
111 
/**
 *  @brief Safely append a formatted string to another string
 *
 *  This function is a safe implementation for appending a formatted character array,
 *  similar to a combination of @em strcat and @em sprintf.
 *  The function automatically allocates enough memory to store both the previous
 *  content stored at @p dest and the appended format string. If the @p dest pointer
 *  is NULL, the function allocates memory only for the format string.
 *  The function returns the number of characters in the resulting string, or -1
 *  in case of an error.
 *
 *  @see vrna_strcat_vprintf(), vrna_strdup_printf(), vrna_strdup_vprintf()
 *
 *  @param  dest    The address of a char *pointer where the formatted string is to be appended
 *  @param  format  The format string (See also sprintf)
 *  @param  ...     The list of variables used to fill the format string
 *  @return         The number of characters in the final string, or -1 on error
 */
int
vrna_strcat_printf(char       **dest,
                   const char *format,
                   ...);
134 
135 
/**
 *  @brief Safely append a formatted string to another string
 *
 *  This function is the @em va_list version of vrna_strcat_printf()
 *
 *  @see vrna_strcat_printf(), vrna_strdup_printf(), vrna_strdup_vprintf()
 *
 *  @param  dest    The address of a char *pointer where the formatted string is to be appended
 *  @param  format  The format string (See also sprintf)
 *  @param  args    The list of arguments used to fill the format string
 *  @return         The number of characters in the final string, or -1 on error
 */
int
vrna_strcat_vprintf(char        **dest,
                    const char  *format,
                    va_list     args);
152 
153 
/**
 *  @brief Split a string into tokens using a delimiting character
 *
 *  This function splits a string into an array of strings using a single
 *  character that delimits the elements within the string. The default
 *  delimiter is the ampersand @c '&' and will be used when @c NULL is
 *  passed as a second argument. The returned list is NULL terminated, i.e.
 *  the last element is @c NULL. If the delimiter is not found, the returned
 *  list contains exactly one element: the input string.
 *
 *  For instance, the following code:
 *
 * @code{.c}
 * char **tok = vrna_strsplit("GGGG&CCCC&AAAAA", NULL);
 *
 * for (char **ptr = tok; *ptr; ptr++) {
 *  printf("%s\n", *ptr);
 *  free(*ptr);
 * }
 * free(tok);
 * @endcode
 *  produces this output:
 *
 * @verbatim
 * GGGG
 * CCCC
 * AAAAA
 * @endverbatim
 *  and properly frees the memory occupied by the returned element array.
 *
 *  @note This function internally uses @em strtok_r() and is therefore
 *  considered to be thread-safe. Also note that it is the user's responsibility
 *  to free the memory of the array and that of the individual element strings!
 *
 *  @param  string    The input string that should be split into elements
 *  @param  delimiter The delimiting character. If @c NULL, the delimiter is @c "&"
 *  @return           A @c NULL terminated list of the elements in the string
 */
char **
vrna_strsplit(const char  *string,
              const char  *delimiter);
195 
196 
/**
 *  @brief Join a list of strings into a single string (presumed from name and signature)
 *
 *  NOTE(review): only the prototype is visible in this header. Judging by its
 *  name and signature, this is presumably the counterpart of vrna_strsplit(),
 *  i.e. the elements of @p strings are concatenated with @p delimiter placed
 *  between them. How the end of @p strings is detected (presumably a
 *  terminating @c NULL entry), how a @c NULL @p delimiter is treated, and who
 *  owns the returned memory must be confirmed against the implementation.
 *
 *  @see vrna_strsplit()
 *
 *  @param  strings   The list of strings to join
 *  @param  delimiter The delimiter string
 *  @return           The joined string
 */
char *
vrna_strjoin(const char **strings,
             const char *delimiter);
200 
/**
 *  @brief Create a random string using characters from a specified symbol set
 *
 *  @param l        The length of the sequence
 *  @param symbols  The symbol set
 *  @return         A random string of length @p l containing characters from the symbol set
 */
char *
vrna_random_string(int        l,
                   const char symbols[]);
211 
212 
/**
 *  @brief Calculate the Hamming distance between two sequences
 *
 *  @see vrna_hamming_distance_bound()
 *
 *  @param s1   The first sequence
 *  @param s2   The second sequence
 *  @return     The Hamming distance between @p s1 and @p s2
 */
int
vrna_hamming_distance(const char  *s1,
                      const char  *s2);
223 
224 
/**
 *  @brief Calculate the Hamming distance between two sequences up to a specified length
 *
 *  This function is similar to vrna_hamming_distance(), but instead of comparing both
 *  sequences up to their actual length, only the first @p n characters are taken into account.
 *
 *  @see vrna_hamming_distance()
 *
 *  @param  s1  The first sequence
 *  @param  s2  The second sequence
 *  @param  n   The length of the subsequences to consider (starting from the 5' end)
 *  @return     The Hamming distance between @p s1 and @p s2
 */
int
vrna_hamming_distance_bound(const char  *s1,
                            const char  *s2,
                            int         n);
239 
240 
/**
 *  @brief Convert an input sequence (possibly containing DNA alphabet characters) to RNA alphabet
 *
 *  This function substitutes <i>T</i> and <i>t</i> with <i>U</i> and <i>u</i>, respectively
 *
 *  @param sequence The sequence to be converted
 */
void
vrna_seq_toRNA(char *sequence);
250 
251 
/**
 *  @brief Convert an input sequence to uppercase
 *
 *  @param sequence The sequence to be converted
 */
void
vrna_seq_toupper(char *sequence);
259 
260 
/**
 *  @brief  Reverse a string in-place
 *
 *  This function reverses a character string, given as an array of
 *  characters, in-place, i.e. it changes the input parameter.
 *
 *  @post After execution, the input @p sequence consists of the
 *        reverse of the string it held prior to the execution.
 *
 *  @see vrna_DNA_complement()
 *
 *  @param  sequence  The string to reverse
 */
void
vrna_seq_reverse(char *sequence);
277 
278 
/**
 *  @brief  Retrieve a DNA sequence which resembles the complement of the input sequence
 *
 *  This function returns a new DNA string which is the complement
 *  of the input, i.e. the nucleotide letters `A`,`C`,`G`, and `T`
 *  are substituted by their complements `T`,`G`,`C`, and `A`, respectively.
 *
 *  Any characters not belonging to the alphabet of the 4 canonical
 *  bases of DNA are not altered.
 *
 *  @note This function also handles lower-case input sequences and
 *        treats `U` of the RNA alphabet equally to `T`
 *
 *  @see vrna_seq_reverse()
 *
 *  @param  sequence  The input DNA sequence
 *  @return           The complement of the input DNA sequence
 */
char *
vrna_DNA_complement(const char *sequence);
299 
300 
/**
 *  @brief  Remove gap characters from a nucleotide sequence
 *
 *  NOTE(review): which characters count as gaps is not visible from this
 *  header - confirm against the implementation.
 *
 *  @param  sequence  The original, null-terminated nucleotide sequence
 *  @return           A copy of the input sequence with all gap characters removed
 */
char *
vrna_seq_ungapped(const char *sequence);
309 
310 
/**
 *  @brief Add a separating '&' character into a string according to cut-point position
 *
 *  If the cut-point position is less than or equal to zero, this function just
 *  returns a copy of the provided string. Otherwise, the cut-point character
 *  is set at the corresponding position.
 *
 *  @see vrna_cut_point_remove()
 *
 *  @param  string    The original string
 *  @param  cp        The cut-point position
 *  @return           A copy of the provided string including the cut-point character
 */
char *
vrna_cut_point_insert(const char  *string,
                      int         cp);
325 
326 
/**
 *  @brief  Remove a separating '&' character from a string
 *
 *  This function removes the cut-point indicating '&' character from a string
 *  and memorizes its position in a provided integer variable. If no '&' is
 *  found in the input, the integer variable is set to -1. The function returns
 *  a copy of the input string with the '&' being sliced out.
 *
 *  @see vrna_cut_point_insert()
 *
 *  @param  string  The original string
 *  @param  cp      Address of the integer variable that receives the cut-point position
 *  @return         A copy of the input string with the '&' being sliced out
 */
char *
vrna_cut_point_remove(const char  *string,
                      int         *cp);
342 
343 
344 /**
345  *  @}
346  */
347 
348 #ifndef VRNA_DISABLE_BACKWARD_COMPATIBILITY
349 
350 /**
351  *  @brief Convert an input sequence to uppercase
352  *  @deprecated   Use vrna_seq_toupper() instead!
353  */
354 DEPRECATED(void  str_uppercase(char *sequence), "Use vrna_seq_toupper() instead");
355 
356 /**
357  *  @brief Convert a DNA input sequence to RNA alphabet
358  *
359  *  @deprecated Use vrna_seq_toRNA() instead!
360  */
361 DEPRECATED(void str_DNA2RNA(char *sequence), "Use vrna_seq_toRNA() instead");
362 
363 /**
364  *  @brief Create a random string using characters from a specified symbol set
365  *
366  *  @deprecated Use vrna_random_string() instead!
367  */
368 DEPRECATED(char *random_string(int        l,
369                                const char symbols[]),
370            "Use vrna_random_string() instead");
371 
372 /**
373  *  @brief Calculate hamming distance between two sequences
374  *
375  *  @deprecated Use vrna_hamming_distance() instead!
376  */
377 DEPRECATED(int hamming(const char *s1,
378                        const char *s2),
379            "Use vrna_hamming_distance() instead");
380 
381 /**
382  *  @brief Calculate hamming distance between two sequences up to a specified length
383  *
384  *  @deprecated Use vrna_hamming_distance_bound() instead!
385  */
386 DEPRECATED(int hamming_bound(const char *s1,
387                              const char *s2,
388                              int        n),
389            "Use vrna_hamming_distance_bound() instead");
390 
391 #endif
392 
393 #endif
394