1 #ifndef VIENNA_RNA_PACKAGE_STRING_UTILS_H 2 #define VIENNA_RNA_PACKAGE_STRING_UTILS_H 3 4 #ifdef VRNA_WARN_DEPRECATED 5 # if defined(__clang__) 6 # define DEPRECATED(func, msg) func __attribute__ ((deprecated("", msg))) 7 # elif defined(__GNUC__) 8 # define DEPRECATED(func, msg) func __attribute__ ((deprecated(msg))) 9 # else 10 # define DEPRECATED(func, msg) func 11 # endif 12 #else 13 # define DEPRECATED(func, msg) func 14 #endif 15 16 /** 17 * @file ViennaRNA/utils/strings.h 18 * @ingroup utils, string_utils 19 * @brief General utility- and helper-functions for RNA sequence and structure strings used throughout the ViennaRNA Package 20 */ 21 22 /** 23 * @addtogroup string_utils 24 * @{ 25 * @brief Functions to parse, convert, manipulate, create, and compare (nucleic acid sequence) strings. 26 */ 27 28 #include <stdarg.h> 29 #include <ViennaRNA/datastructures/basic.h> 30 31 /** 32 * @brief Stringify a macro after expansion 33 */ 34 #define XSTR(s) STR(s) 35 36 /** 37 * @brief Stringify a macro argument 38 */ 39 #define STR(s) #s 40 41 #ifndef FILENAME_MAX_LENGTH 42 43 /** 44 * @brief Maximum length of filenames that are generated by our programs 45 * 46 * This definition should be used throughout the complete ViennaRNA package 47 * wherever a static array holding filenames of output files is declared. 48 */ 49 #define FILENAME_MAX_LENGTH 80 50 51 /** 52 * @brief Maximum length of id taken from fasta header for filename generation 53 * 54 * this has to be smaller than FILENAME_MAX_LENGTH since in most cases, 55 * some suffix will be appended to the ID 56 */ 57 #define FILENAME_ID_LENGTH 42 58 59 #endif 60 61 #ifdef HAVE_CONFIG_H 62 #include <config.h> 63 #ifndef HAVE_STRDUP 64 char *strdup(const char *s); 65 66 67 #endif 68 #endif 69 70 /** 71 * @brief Safely create a formatted string 72 * 73 * This function is a safe implementation for creating a formatted character array, 74 * similar to @em sprintf. 75 * Internally, it uses the @em asprintf function if available to dynamically allocate 76 * a large enough character array to store the supplied content. If @em asprintf is 77 * not available, mimic it's behavior using @em vsnprintf. 78 * 79 * @note The returned pointer of this function should always be passed to @em free() to 80 * release the allocated memory 81 * 82 * @see vrna_strdup_vprintf(), vrna_strcat_printf() 83 * 84 * @param format The format string (See also asprintf) 85 * @param ... The list of variables used to fill the format string 86 * @return The formatted, null-terminated string, or NULL if something has gone wrong 87 */ 88 char * 89 vrna_strdup_printf(const char *format, 90 ...); 91 92 93 /** 94 * @brief Safely create a formatted string 95 * 96 * This function is the @em va_list version of vrna_strdup_printf() 97 * 98 * @note The returned pointer of this function should always be passed to @em free() to 99 * release the allocated memory 100 * 101 * @see vrna_strdup_printf(), vrna_strcat_printf(), vrna_strcat_vprintf() 102 * 103 * @param format The format string (See also asprintf) 104 * @param argp The list of arguments to fill the format string 105 * @return The formatted, null-terminated string, or NULL if something has gone wrong 106 */ 107 char * 108 vrna_strdup_vprintf(const char *format, 109 va_list argp); 110 111 112 /** 113 * @brief Safely append a formatted string to another string 114 * 115 * This function is a safe implementation for appending a formatted character array, 116 * similar to a cobination of @em strcat and @em sprintf. 117 * The function automatically allocates enough memory to store both, the previous 118 * content stored at @p dest and the appended format string. If the @p dest pointer 119 * is NULL, the function allocate memory only for the format string. 120 * The function returns the number of characters in the resulting string or -1 121 * in case of an error. 122 * 123 * @see vrna_strcat_vprintf(), vrna_strdup_printf(), vrna_strdup_vprintf() 124 * 125 * @param dest The address of a char *pointer where the formatted string is to be appended 126 * @param format The format string (See also sprintf) 127 * @param ... The list of variables used to fill the format string 128 * @return The number of characters in the final string, or -1 on error 129 */ 130 int 131 vrna_strcat_printf(char **dest, 132 const char *format, 133 ...); 134 135 136 /** 137 * @brief Safely append a formatted string to another string 138 * 139 * This function is the @em va_list version of vrna_strcat_printf() 140 * 141 * @see vrna_strcat_printf(), vrna_strdup_printf(), vrna_strdup_vprintf() 142 * 143 * @param dest The address of a char *pointer where the formatted string is to be appended 144 * @param format The format string (See also sprintf) 145 * @param args The list of argument to fill the format string 146 * @return The number of characters in the final string, or -1 on error 147 */ 148 int 149 vrna_strcat_vprintf(char **dest, 150 const char *format, 151 va_list args); 152 153 154 /** 155 * @brief Split a string into tokens using a delimiting character 156 * 157 * This function splits a string into an array of strings using a single 158 * character that delimits the elements within the string. The default 159 * delimiter is the ampersand @c '&' and will be used when @c NULL is 160 * passed as a second argument. The returned list is NULL terminated, i.e. 161 * the last element is @c NULL. If the delimiter is not found, the returned 162 * list contains exactly one element: the input string. 163 * 164 * For instance, the following code: 165 * 166 * @code{.c} 167 * char **tok = vrna_strsplit("GGGG&CCCC&AAAAA", NULL); 168 * 169 * for (char **ptr = tok; *ptr; ptr++) { 170 * printf("%s\n", *ptr); 171 * free(*ptr); 172 * } 173 * free(tok); 174 * @endcode 175 * produces this output: 176 * 177 * @verbatim 178 * GGGG 179 * CCCC 180 * AAAAA 181 * @endverbatim 182 * and properly free's the memory occupied by the returned element array. 183 * 184 * @note This function internally uses @em strtok_r() and is therefore 185 * considered to be thread-safe. Also note, that it is the users responsibility 186 * to free the memory of the array and that of the individual element strings! 187 * 188 * @param string The input string that should be split into elements 189 * @param delimiter The delimiting character. If @c NULL, the delimiter is @c "&" 190 * @return A @c NULL terminated list of the elements in the string 191 */ 192 char ** 193 vrna_strsplit(const char *string, 194 const char *delimiter); 195 196 197 char * 198 vrna_strjoin(const char **strings, 199 const char *delimiter); 200 201 /** 202 * @brief Create a random string using characters from a specified symbol set 203 * 204 * @param l The length of the sequence 205 * @param symbols The symbol set 206 * @return A random string of length 'l' containing characters from the symbolset 207 */ 208 char * 209 vrna_random_string(int l, 210 const char symbols[]); 211 212 213 /** 214 * @brief Calculate hamming distance between two sequences 215 * 216 * @param s1 The first sequence 217 * @param s2 The second sequence 218 * @return The hamming distance between s1 and s2 219 */ 220 int 221 vrna_hamming_distance(const char *s1, 222 const char *s2); 223 224 225 /** 226 * @brief Calculate hamming distance between two sequences up to a specified length 227 * 228 * This function is similar to vrna_hamming_distance() but instead of comparing both sequences 229 * up to their actual length only the first 'n' characters are taken into account 230 * @param s1 The first sequence 231 * @param s2 The second sequence 232 * @param n The length of the subsequences to consider (starting from the 5' end) 233 * @return The hamming distance between s1 and s2 234 */ 235 int 236 vrna_hamming_distance_bound(const char *s1, 237 const char *s2, 238 int n); 239 240 241 /** 242 * @brief Convert an input sequence (possibly containing DNA alphabet characters) to RNA alphabet 243 * 244 * This function substitudes <i>T</i> and <i>t</i> with <i>U</i> and <i>u</i>, respectively 245 * 246 * @param sequence The sequence to be converted 247 */ 248 void 249 vrna_seq_toRNA(char *sequence); 250 251 252 /** 253 * @brief Convert an input sequence to uppercase 254 * 255 * @param sequence The sequence to be converted 256 */ 257 void 258 vrna_seq_toupper(char *sequence); 259 260 261 /** 262 * @brief Reverse a string in-place 263 * 264 * This function reverses a character string in the form of 265 * an array of characters in-place, i.e. it changes the input 266 * parameter. 267 * 268 * @post After execution, the input @p sequence consists of the 269 * reverse string prior to the execution. 270 * 271 * @see vrna_DNA_complement() 272 * 273 * @param sequence The string to reverse 274 */ 275 void 276 vrna_seq_reverse(char *sequence); 277 278 279 /** 280 * @brief Retrieve a DNA sequence which resembles the complement of the input sequence 281 * 282 * This function returns a mew DNA string which is the complement 283 * of the input, i.e. the nucleotide letters `A`,`C`,`G`, and `T` 284 * are substituted by their complements `T`,`G`,`C`, and `A`, respectively. 285 * 286 * Any characters not belonging to the alphabet of the 4 canonical 287 * bases of DNA are not altered. 288 * 289 * @note This function also handles lower-case input sequences and 290 * treats `U` of the RNA alphabet equally to `T` 291 * 292 * @see vrna_seq_reverse() 293 * 294 * @param sequence the input DNA sequence 295 * @return The complement of the input DNA sequence 296 */ 297 char * 298 vrna_DNA_complement(const char *sequence); 299 300 301 /** 302 * @brief Remove gap characters from a nucleotide sequence 303 * 304 * @param sequence The original, null-terminated nucleotide sequence 305 * @return A copy of the input sequence with all gap characters removed 306 */ 307 char * 308 vrna_seq_ungapped(const char *sequence); 309 310 311 /** 312 * @brief Add a separating '&' character into a string according to cut-point position 313 * 314 * If the cut-point position is less or equal to zero, this function just 315 * returns a copy of the provided string. Otherwise, the cut-point character 316 * is set at the corresponding position 317 * 318 * @param string The original string 319 * @param cp The cut-point position 320 * @return A copy of the provided string including the cut-point character 321 */ 322 char * 323 vrna_cut_point_insert(const char *string, 324 int cp); 325 326 327 /** 328 * @brief Remove a separating '&' character from a string 329 * 330 * This function removes the cut-point indicating '&' character from a string 331 * and memorizes its position in a provided integer variable. If not '&' is 332 * found in the input, the integer variable is set to -1. The function returns 333 * a copy of the input string with the '&' being sliced out. 334 * 335 * @param string The original string 336 * @param cp The cut-point position 337 * @return A copy of the input string with the '&' being sliced out 338 */ 339 char * 340 vrna_cut_point_remove(const char *string, 341 int *cp); 342 343 344 /** 345 * @} 346 */ 347 348 #ifndef VRNA_DISABLE_BACKWARD_COMPATIBILITY 349 350 /** 351 * @brief Convert an input sequence to uppercase 352 * @deprecated Use vrna_seq_toupper() instead! 353 */ 354 DEPRECATED(void str_uppercase(char *sequence), "Use vrna_seq_toupper() instead"); 355 356 /** 357 * @brief Convert a DNA input sequence to RNA alphabet 358 * 359 * @deprecated Use vrna_seq_toRNA() instead! 360 */ 361 DEPRECATED(void str_DNA2RNA(char *sequence), "Use vrna_seq_toRNA() instead"); 362 363 /** 364 * @brief Create a random string using characters from a specified symbol set 365 * 366 * @deprecated Use vrna_random_string() instead! 367 */ 368 DEPRECATED(char *random_string(int l, 369 const char symbols[]), 370 "Use vrna_random_string() instead"); 371 372 /** 373 * @brief Calculate hamming distance between two sequences 374 * 375 * @deprecated Use vrna_hamming_distance() instead! 376 */ 377 DEPRECATED(int hamming(const char *s1, 378 const char *s2), 379 "Use vrna_hamming_distance() instead"); 380 381 /** 382 * @brief Calculate hamming distance between two sequences up to a specified length 383 * 384 * @deprecated Use vrna_hamming_distance_bound() instead! 385 */ 386 DEPRECATED(int hamming_bound(const char *s1, 387 const char *s2, 388 int n), 389 "Use vrna_hamming_distance_bound() instead"); 390 391 #endif 392 393 #endif 394