1 /*
2 * ViennaRNA/utils/strings.c
3 *
4 * c Ivo L Hofacker and Walter Fontana
5 * Vienna RNA package
6 */
7
8 #ifdef HAVE_CONFIG_H
9 #include "config.h"
10 #endif
11
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <time.h>
#include <string.h>
#include <sys/types.h>
#include <stdint.h>
#include <stdarg.h>
21
22 #include "ViennaRNA/utils/basic.h"
23 #include "ViennaRNA/utils/strings.h"
24
25 /*
26 #################################
27 # PRIVATE FUNCTION DECLARATIONS #
28 #################################
29 */
30
31 /*
32 #################################
33 # BEGIN OF FUNCTION DEFINITIONS #
34 #################################
35 */
36
37 #ifndef HAVE_STRDUP
/*
 * Fallback strdup(), only compiled when HAVE_STRDUP is not defined.
 *
 * Returns a copy of the NUL-terminated string s in memory obtained
 * from vrna_alloc().
 */
char *
strdup(const char *s)
{
  size_t  size  = strlen(s) + 1;      /* characters plus terminating NUL */
  char    *copy = vrna_alloc(size);

  memcpy(copy, s, size);

  return copy;
}
47
48
49 #endif
50
51 PUBLIC char *
vrna_strdup_printf(const char * format,...)52 vrna_strdup_printf(const char *format,
53 ...)
54 {
55 char *result;
56 va_list argp;
57
58 va_start(argp, format);
59 result = vrna_strdup_vprintf(format, argp);
60 va_end(argp); /* Each va_start() or va_copy() needs a va_end() */
61
62 return result;
63 }
64
65
66 PUBLIC char *
vrna_strdup_vprintf(const char * format,va_list argp)67 vrna_strdup_vprintf(const char *format,
68 va_list argp)
69 {
70 char *result;
71 int r;
72
73 result = NULL;
74
75 #ifndef HAVE_VASPRINTF
76 int count;
77 va_list copy;
78 va_copy(copy, argp);
79
80 r = -1;
81
82 /* retrieve the number of characters that the string requires */
83 #ifdef _WIN32
84 /*
85 * vsnprintf() in Windows is not ANSI compliant, although it's
86 * "...included for compliance to the ANSI standard"
87 * Thus, we use _vscprintf() that explicitly counts characters
88 */
89 count = _vscprintf(format, argp);
90 #else
91 count = vsnprintf(NULL, 0, format, argp);
92 #endif
93
94 if ((count >= 0) && (count < INT_MAX)) {
95 char *buf = (char *)vrna_alloc(sizeof(char) * (count + 1));
96 if (buf == NULL)
97 r = -1;
98 else if ((r = vsnprintf(buf, count + 1, format, copy)) < 0)
99 free(buf);
100 else
101 result = buf;
102 }
103
104 va_end(copy); /* Each va_start() or va_copy() needs a va_end() */
105 #else
106 /* the default is to use vasprintf() if available */
107 r = vasprintf(&result, format, argp);
108 #endif
109
110 /* check for any memory allocation error indicated by r == -1 */
111 if (r == -1) {
112 vrna_message_warning("vrna_strdup_printf: memory allocation failure!");
113 result = NULL;
114 }
115
116 return result;
117 }
118
119
120 PUBLIC int
vrna_strcat_printf(char ** dest,const char * format,...)121 vrna_strcat_printf(char **dest,
122 const char *format,
123 ...)
124 {
125 int r;
126 va_list argp;
127
128 va_start(argp, format);
129 r = vrna_strcat_vprintf(dest, format, argp);
130 va_end(argp); /* Each va_start() or va_copy() needs a va_end() */
131
132 return r;
133 }
134
135
136 PUBLIC int
vrna_strcat_vprintf(char ** dest,const char * format,va_list args)137 vrna_strcat_vprintf(char **dest,
138 const char *format,
139 va_list args)
140 {
141 char *buf;
142 int r, l1, l2;
143 size_t old_count, new_count;
144
145 if ((!dest) || (!format))
146 return -1;
147
148 va_list copy;
149 va_copy(copy, args);
150
151 r = -1;
152 buf = *dest;
153 old_count = (buf) ? strlen(buf) : 0;
154
155 /* retrieve the number of characters that the string requires */
156 #ifdef _WIN32
157 /*
158 * vsnprintf() in Windows is not ANSI compliant, although it's
159 * "...included for compliance to the ANSI standard"
160 * Thus, we use _vscprintf() that explicitly counts characters
161 */
162 new_count = _vscprintf(format, args);
163 #else
164 new_count = vsnprintf(NULL, 0, format, args);
165 #endif
166
167 /* determine longer and shorter part of new string for INT overflow protection */
168 if (old_count > new_count) {
169 l1 = old_count;
170 l2 = new_count;
171 } else {
172 l1 = new_count;
173 l2 = old_count;
174 }
175
176 if ((new_count > 0) && (l1 < SIZE_MAX) && ((SIZE_MAX - l1) > l2)) {
177 buf = (char *)vrna_realloc(buf, sizeof(char) * (old_count + new_count + 1));
178 if (buf == NULL) {
179 r = -1;
180 } else if ((r = vsnprintf(buf + old_count, new_count + 1, format, copy)) < 0) {
181 free(buf);
182 } else {
183 *dest = buf;
184 r = old_count + new_count;
185 }
186 } else if (new_count == 0) {
187 /* we do not treat empty format string as error */
188 r = (int)old_count;
189 }
190
191 va_end(copy); /* Each va_start() or va_copy() needs a va_end() */
192
193 /* check for any memory allocation error indicated by r == -1 */
194 if (r == -1) {
195 vrna_message_warning("vrna_strcat_printf: memory allocation failure!");
196 *dest = NULL;
197 }
198
199 return r;
200 }
201
202
203 PUBLIC char *
vrna_random_string(int l,const char symbols[])204 vrna_random_string(int l,
205 const char symbols[])
206 {
207 char *r;
208 int i, rn, base;
209
210 base = (int)strlen(symbols);
211 r = (char *)vrna_alloc(sizeof(char) * (l + 1));
212
213 for (i = 0; i < l; i++) {
214 rn = (int)(vrna_urn() * base); /* [0, base-1] */
215 r[i] = symbols[rn];
216 }
217 r[l] = '\0';
218 return r;
219 }
220
221
222 /*-----------------------------------------------------------------*/
223
/*
 * Count the positions at which s1 and s2 differ.
 *
 * The comparison stops at the end of the shorter string; surplus
 * characters of the longer one are not counted.
 */
PUBLIC int
vrna_hamming_distance(const char  *s1,
                      const char  *s2)
{
  int     mismatches = 0;
  size_t  pos;

  for (pos = 0; (s1[pos] != '\0') && (s2[pos] != '\0'); pos++)
    mismatches += (s1[pos] != s2[pos]);

  return mismatches;
}
236
237
/*
 * Count the positions at which s1 and s2 differ, looking at no more than
 * the first boundary positions.
 *
 * The scan ends at the end of either string or as soon as the remaining
 * budget reaches zero. A negative boundary never reaches zero before the
 * strings end, so it acts as "no limit".
 */
PUBLIC int
vrna_hamming_distance_bound(const char  *s1,
                            const char  *s2,
                            int         boundary)
{
  int mismatches  = 0;
  int remaining   = boundary;

  while ((*s1 != '\0') && (*s2 != '\0') && (remaining != 0)) {
    if (*s1 != *s2)
      mismatches++;

    s1++;
    s2++;
    remaining--;
  }

  return mismatches;
}
251
252
/*
 * Convert a DNA sequence into RNA in place.
 *
 * Every 'T' becomes 'U' and every 't' becomes 'u'; all other characters
 * are left alone. A NULL pointer is ignored.
 */
PUBLIC void
vrna_seq_toRNA(char *sequence)
{
  char *c;

  if (!sequence)
    return;

  for (c = sequence; *c != '\0'; c++) {
    switch (*c) {
      case 'T':
        *c = 'U';
        break;

      case 't':
        *c = 'u';
        break;

      default:
        break;
    }
  }
}
268
269
/*
 * Convert all characters of a sequence to upper case, in place.
 *
 * A NULL pointer is ignored.
 */
PUBLIC void
vrna_seq_toupper(char *sequence)
{
  char *c;

  if (sequence)
    for (c = sequence; *c; c++)
      /*
       * toupper() is only defined for EOF and values representable as
       * unsigned char; a plain char may be negative, hence the cast
       */
      *c = (char)toupper((unsigned char)*c);
}
279
280
/*
 * Reverse a sequence in place.
 *
 * A NULL pointer is ignored; strings with fewer than two characters are
 * returned as they are.
 */
PUBLIC void
vrna_seq_reverse(char *sequence)
{
  size_t  left, right;
  char    tmp;

  if (!sequence)
    return;

  right = strlen(sequence);

  /*
   * nothing to swap; this also keeps us from computing an index or
   * pointer in front of the buffer for the empty string
   */
  if (right < 2)
    return;

  for (left = 0, right--; left < right; left++, right--) {
    tmp             = sequence[left];
    sequence[left]  = sequence[right];
    sequence[right] = tmp;
  }
}
295
296
297 PUBLIC char *
vrna_DNA_complement(const char * sequence)298 vrna_DNA_complement(const char *sequence)
299 {
300 char *complement, *ptr;
301 size_t n;
302
303 complement = NULL;
304
305 if (sequence) {
306 n = strlen(sequence);
307 complement = (char *)vrna_alloc(sizeof(char) * (n + 1));
308 /* copy the input string */
309 complement = memcpy(complement, sequence, sizeof(char) * n);
310
311 /* complement characters */
312 for (ptr = complement; *ptr; ptr++) {
313 switch (*ptr) {
314 case 'A':
315 *ptr = 'T';
316 break;
317
318 case 'a':
319 *ptr = 't';
320 break;
321
322 case 'C':
323 *ptr = 'G';
324 break;
325
326 case 'c':
327 *ptr = 'g';
328 break;
329
330 case 'G':
331 *ptr = 'C';
332 break;
333
334 case 'g':
335 *ptr = 'c';
336 break;
337
338 case 'T': /* fall through */
339 case 'U':
340 *ptr = 'A';
341 break;
342
343 case 't': /* fall through */
344 case 'u':
345 *ptr = 'a';
346 break;
347
348 default:
349 break;
350 }
351 }
352
353 complement[n] = '\0';
354 }
355
356 return complement;
357 }
358
359
360 PUBLIC char *
vrna_cut_point_insert(const char * string,int cp)361 vrna_cut_point_insert(const char *string,
362 int cp)
363 {
364 char *ctmp;
365 int len;
366
367 if (cp > 0) {
368 len = strlen(string);
369 ctmp = (char *)vrna_alloc((len + 2) * sizeof(char));
370 /* first sequence */
371 (void)strncpy(ctmp, string, cp - 1);
372 /* spacer */
373 ctmp[cp - 1] = '&';
374 /* second sequence */
375 (void)strcat(ctmp, string + cp - 1);
376 } else {
377 ctmp = strdup(string);
378 }
379
380 return ctmp;
381 }
382
383
384 PUBLIC char *
vrna_cut_point_remove(const char * string,int * cp)385 vrna_cut_point_remove(const char *string,
386 int *cp)
387 {
388 char *pos, *copy = NULL;
389 unsigned int len;
390
391 *cp = -1;
392
393 if (string) {
394 len = strlen(string);
395 copy = strdup(string);
396 if ((pos = strchr(copy, '&'))) {
397 *cp = (int)(pos - copy) + 1;
398 if (*cp >= len)
399 *cp = -1;
400
401 if (strchr(pos + 1, '&'))
402 vrna_message_error("more than one cut-point in input");
403
404 for (; *pos; pos++)
405 *pos = *(pos + 1); /* splice out the & */
406 }
407 }
408
409 return copy;
410 }
411
412
413 PUBLIC char **
vrna_strsplit(const char * string,const char * delimiter)414 vrna_strsplit(const char *string,
415 const char *delimiter)
416 {
417 char delim[2], *ptr, *ptr2, *token, *save, **split;
418 unsigned int n;
419
420 split = NULL;
421 n = 0;
422
423 if (string) {
424 if ((delimiter) && (*delimiter))
425 delim[0] = *delimiter;
426 else
427 delim[0] = '&';
428
429 delim[1] = '\0';
430
431 /* copy string such that we can alter it via strtok() */
432 ptr2 = strdup(string);
433
434 /* count how many elements we'll extract */
435 ptr = ptr2;
436
437 while (*ptr++)
438 if (*ptr == *delim)
439 n++;
440
441 /*
442 * allocate (n + 1) + 1 elements in split list
443 * n + 1 elements plus 1 additional element to indicate
444 * the last element in split
445 */
446 split = (char **)vrna_alloc(sizeof(char *) * (n + 2));
447
448 n = 0;
449 token = strtok_r(ptr2, delim, &save);
450
451 while (token != NULL) {
452 split[n++] = vrna_strdup_printf("%s", token);
453 token = strtok_r(NULL, delim, &save);
454 }
455
456 split[n] = NULL;
457
458 free(ptr2);
459 }
460
461 return split;
462 }
463
464
465 PUBLIC char *
vrna_strjoin(const char ** strings,const char * delimiter)466 vrna_strjoin(const char **strings,
467 const char *delimiter)
468 {
469 char *s = NULL;
470 size_t n, offset, *lengths, num_strings, mem_strings, total_length;
471
472 if (strings) {
473 total_length = 0;
474 mem_strings = 32;
475 lengths = (size_t *)vrna_alloc(sizeof(size_t) * mem_strings);
476
477 for (n = 0; strings[n]; n++) {
478 lengths[n] = strlen(strings[n]);
479 total_length += lengths[n];
480
481 if (n == mem_strings) {
482 mem_strings += 32;
483 lengths = (size_t *)vrna_realloc(lengths, sizeof(size_t) * mem_strings);
484 }
485 }
486
487 if ((delimiter) && (*delimiter))
488 total_length += (n - 1);
489
490 /* finally, glue the strings together */
491 s = (char *)vrna_alloc(sizeof(char) * (total_length + 1));
492
493 for (offset = 0, n = 0; strings[n]; n++) {
494 memcpy(s + offset, strings[n], sizeof(char) * lengths[n]);
495 offset += lengths[n];
496
497 if ((delimiter) &&
498 (*delimiter) &&
499 (strings[n + 1]))
500 s[offset++] = *delimiter;
501 }
502
503 s[total_length] = '\0';
504
505 free(lengths);
506 }
507
508 return s;
509 }
510
511
512 #if 0
/*
 * NOTE: unfinished draft. This function sits inside an '#if 0' section
 * and is therefore never compiled. It cannot be enabled as it stands:
 *  - the memcpy() call in the loop of the splice-in branch is cut off
 *    in the middle of its argument list,
 *  - 'n' is declared in the init clause of a for statement but used
 *    after that statement,
 *  - the VRNA_STRSPLICE_OUT branch for a given delimiter is empty.
 *
 * Visible behaviour of the finished parts: a NULL string yields NULL;
 * without a delimiter a copy of string is returned and, if
 * VRNA_STRSPLICE_OUT is set and positions is not NULL, *positions is set
 * to a one-element array holding 0.
 */
PUBLIC char *
vrna_strsplice(const char *string,
               const char *delimiter,
               unsigned int **positions,
               unsigned int options)
{
  char *result = NULL;

  if (string) {
    if (delimiter) {
      if (options & VRNA_STRSPLICE_IN){
        if (positions) {
          /* count how many more characters we require for the fully spliced string */
          for (size_t n = 0; positions[n] != 0; n++);

          size_t dl = strlen(delimiter);
          size_t l = strlen(string);

          result = (char *)vrna_alloc(sizeof(char) * (l + dl * n + 1));

          /* finally, construct the spliced sequence */
          size_t start = 0;
          size_t end = 0;
          size_t last_pos = 0;
          /* handle first case separately */
          memcpy(result, string, sizeof(char) * ((*positions)[0] - 1));
          memcpy(result + (*positions)[0] - 1, delimiter, sizeof(char) * dl);
          start += (*positions)[0] - 1;
          end += (*positions)[0] - 1 + dl;

          for (size_t i = 1; i < n; i++) {
            memcpy(result + end, string + start, sizeof(char) * positions
          }

        } else {
          result = strdup(string);
        }
      } else if (options & VRNA_STRSPLICE_OUT) {

      }
    } else {
      /* no delimiter specified, so we don't need to do anything */
      result = strdup(string);
      if ((options & VRNA_STRSPLICE_OUT) &&
          (positions)) {
        *positions = (unsigned int *)vrna_alloc(sizeof(unsigned int));
        (*positions)[0] = 0;
      }
    }
  }

  return result;
}
566
567 #endif
568
569 PUBLIC char *
vrna_seq_ungapped(const char * seq)570 vrna_seq_ungapped(const char *seq)
571 {
572 char *tmp_sequence, *b;
573 int i;
574
575 tmp_sequence = NULL;
576
577 if (seq) {
578 tmp_sequence = strdup(seq);
579
580 b = tmp_sequence;
581 i = 0;
582 do {
583 if ((*b == '-') || (*b == '_') || (*b == '~') || (*b == '.'))
584 continue;
585
586 tmp_sequence[i] = *b;
587 i++;
588 } while (*(++b));
589
590 tmp_sequence = (char *)vrna_realloc(tmp_sequence, (i + 1) * sizeof(char));
591 tmp_sequence[i] = '\0';
592 }
593
594 return tmp_sequence;
595 }
596
597
598 #ifndef VRNA_DISABLE_BACKWARD_COMPATIBILITY
599
600 /*###########################################*/
601 /*# deprecated functions below #*/
602 /*###########################################*/
603
604 PUBLIC void
str_uppercase(char * sequence)605 str_uppercase(char *sequence)
606 {
607 vrna_seq_toupper(sequence);
608 }
609
610
611 PUBLIC void
str_DNA2RNA(char * sequence)612 str_DNA2RNA(char *sequence)
613 {
614 vrna_seq_toRNA(sequence);
615 }
616
617
618 PUBLIC char *
random_string(int l,const char symbols[])619 random_string(int l,
620 const char symbols[])
621 {
622 return vrna_random_string(l, symbols);
623 }
624
625
626 PUBLIC int
hamming(const char * s1,const char * s2)627 hamming(const char *s1,
628 const char *s2)
629 {
630 return vrna_hamming_distance(s1, s2);
631 }
632
633
634 PUBLIC int
hamming_bound(const char * s1,const char * s2,int boundary)635 hamming_bound(const char *s1,
636 const char *s2,
637 int boundary)
638 {
639 return vrna_hamming_distance_bound(s1, s2, boundary);
640 }
641
642
643 #endif
644