1 /*
2  *    ViennaRNA/utils/strings.c
3  *
4  *               c  Ivo L Hofacker and Walter Fontana
5  *                        Vienna RNA package
6  */
7 
8 #ifdef HAVE_CONFIG_H
9 #include "config.h"
10 #endif
11 
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <ctype.h>
15 #include <errno.h>
16 #include <time.h>
17 #include <string.h>
18 #include <sys/types.h>
19 #include <stdint.h>
20 #include <stdarg.h>
21 
22 #include "ViennaRNA/utils/basic.h"
23 #include "ViennaRNA/utils/strings.h"
24 
25 /*
26  #################################
27  # PRIVATE FUNCTION DECLARATIONS #
28  #################################
29  */
30 
31 /*
32  #################################
33  # BEGIN OF FUNCTION DEFINITIONS #
34  #################################
35  */
36 
37 #ifndef HAVE_STRDUP
/*
 * Fallback used when the platform provides no strdup(): returns a copy of
 * the NUL-terminated string s in storage obtained from vrna_alloc().
 */
char *
strdup(const char *s)
{
  size_t  size  = strlen(s) + 1;  /* characters plus the terminating NUL */
  char    *copy = vrna_alloc(size);

  memcpy(copy, s, size);

  return copy;
}
47 
48 
49 #endif
50 
51 PUBLIC char *
vrna_strdup_printf(const char * format,...)52 vrna_strdup_printf(const char *format,
53                    ...)
54 {
55   char    *result;
56   va_list argp;
57 
58   va_start(argp, format);
59   result = vrna_strdup_vprintf(format, argp);
60   va_end(argp); /* Each va_start() or va_copy() needs a va_end() */
61 
62   return result;
63 }
64 
65 
66 PUBLIC char *
vrna_strdup_vprintf(const char * format,va_list argp)67 vrna_strdup_vprintf(const char  *format,
68                     va_list     argp)
69 {
70   char    *result;
71   int     r;
72 
73   result = NULL;
74 
75 #ifndef HAVE_VASPRINTF
76   int     count;
77   va_list copy;
78   va_copy(copy, argp);
79 
80   r = -1;
81 
82   /* retrieve the number of characters that the string requires */
83 #ifdef _WIN32
84   /*
85    * vsnprintf() in Windows is not ANSI compliant, although it's
86    * "...included for compliance to the ANSI standard"
87    * Thus, we use _vscprintf() that explicitly counts characters
88    */
89   count = _vscprintf(format, argp);
90 #else
91   count = vsnprintf(NULL, 0, format, argp);
92 #endif
93 
94   if ((count >= 0) && (count < INT_MAX)) {
95     char *buf = (char *)vrna_alloc(sizeof(char) * (count + 1));
96     if (buf == NULL)
97       r = -1;
98     else if ((r = vsnprintf(buf, count + 1, format, copy)) < 0)
99       free(buf);
100     else
101       result = buf;
102   }
103 
104   va_end(copy);  /* Each va_start() or va_copy() needs a va_end() */
105 #else
106   /* the default is to use vasprintf() if available */
107   r = vasprintf(&result, format, argp);
108 #endif
109 
110   /* check for any memory allocation error indicated by r == -1 */
111   if (r == -1) {
112     vrna_message_warning("vrna_strdup_printf: memory allocation failure!");
113     result = NULL;
114   }
115 
116   return result;
117 }
118 
119 
120 PUBLIC int
vrna_strcat_printf(char ** dest,const char * format,...)121 vrna_strcat_printf(char       **dest,
122                    const char *format,
123                    ...)
124 {
125   int     r;
126   va_list argp;
127 
128   va_start(argp, format);
129   r = vrna_strcat_vprintf(dest, format, argp);
130   va_end(argp); /* Each va_start() or va_copy() needs a va_end() */
131 
132   return r;
133 }
134 
135 
136 PUBLIC int
vrna_strcat_vprintf(char ** dest,const char * format,va_list args)137 vrna_strcat_vprintf(char        **dest,
138                     const char  *format,
139                     va_list     args)
140 {
141   char    *buf;
142   int     r, l1, l2;
143   size_t  old_count, new_count;
144 
145   if ((!dest) || (!format))
146     return -1;
147 
148   va_list copy;
149   va_copy(copy, args);
150 
151   r         = -1;
152   buf       = *dest;
153   old_count = (buf) ? strlen(buf) : 0;
154 
155   /* retrieve the number of characters that the string requires */
156 #ifdef _WIN32
157   /*
158    * vsnprintf() in Windows is not ANSI compliant, although it's
159    * "...included for compliance to the ANSI standard"
160    * Thus, we use _vscprintf() that explicitly counts characters
161    */
162   new_count = _vscprintf(format, args);
163 #else
164   new_count = vsnprintf(NULL, 0, format, args);
165 #endif
166 
167   /* determine longer and shorter part of new string for INT overflow protection */
168   if (old_count > new_count) {
169     l1  = old_count;
170     l2  = new_count;
171   } else {
172     l1  = new_count;
173     l2  = old_count;
174   }
175 
176   if ((new_count > 0) && (l1 < SIZE_MAX) && ((SIZE_MAX - l1) > l2)) {
177     buf = (char *)vrna_realloc(buf, sizeof(char) * (old_count + new_count + 1));
178     if (buf == NULL) {
179       r = -1;
180     } else if ((r = vsnprintf(buf + old_count, new_count + 1, format, copy)) < 0) {
181       free(buf);
182     } else {
183       *dest = buf;
184       r     = old_count + new_count;
185     }
186   } else if (new_count == 0) {
187     /* we do not treat empty format string as error */
188     r = (int)old_count;
189   }
190 
191   va_end(copy);  /* Each va_start() or va_copy() needs a va_end() */
192 
193   /* check for any memory allocation error indicated by r == -1 */
194   if (r == -1) {
195     vrna_message_warning("vrna_strcat_printf: memory allocation failure!");
196     *dest = NULL;
197   }
198 
199   return r;
200 }
201 
202 
203 PUBLIC char *
vrna_random_string(int l,const char symbols[])204 vrna_random_string(int        l,
205                    const char symbols[])
206 {
207   char  *r;
208   int   i, rn, base;
209 
210   base  = (int)strlen(symbols);
211   r     = (char *)vrna_alloc(sizeof(char) * (l + 1));
212 
213   for (i = 0; i < l; i++) {
214     rn    = (int)(vrna_urn() * base); /* [0, base-1] */
215     r[i]  = symbols[rn];
216   }
217   r[l] = '\0';
218   return r;
219 }
220 
221 
222 /*-----------------------------------------------------------------*/
223 
/*
 * Count the positions at which s1 and s2 hold different characters.
 *
 * The comparison stops at the end of the shorter string, so surplus
 * characters of the longer one are not counted. If either argument is
 * NULL there is nothing to compare and 0 is returned.
 */
PUBLIC int
vrna_hamming_distance(const char  *s1,
                      const char  *s2)
{
  int h = 0;

  if ((!s1) || (!s2))
    return 0;

  for (; *s1 && *s2; s1++, s2++)
    if (*s1 != *s2)
      h++;

  return h;
}
236 
237 
/*
 * Count the positions at which s1 and s2 hold different characters,
 * looking at no more than the first boundary positions.
 *
 * The comparison also stops at the end of the shorter string. The loop
 * runs while the remaining boundary is non-zero, so a boundary of 0
 * compares nothing, while a negative boundary is only limited by the
 * string ends. If either string is NULL, 0 is returned.
 */
PUBLIC int
vrna_hamming_distance_bound(const char  *s1,
                            const char  *s2,
                            int         boundary)
{
  int h = 0;

  if ((!s1) || (!s2))
    return 0;

  for (; *s1 && *s2 && boundary; s1++, s2++, boundary--)
    if (*s1 != *s2)
      h++;

  return h;
}
251 
252 
/*
 * Convert a DNA sequence to RNA in place: every 'T' becomes 'U' and every
 * 't' becomes 'u', all other characters are left alone.
 * A NULL sequence is ignored.
 */
PUBLIC void
vrna_seq_toRNA(char *sequence)
{
  char *nt;

  if (!sequence)
    return;

  /* walk by pointer so that the string length is not limited by an index type */
  for (nt = sequence; *nt != '\0'; nt++) {
    switch (*nt) {
      case 'T':
        *nt = 'U';
        break;

      case 't':
        *nt = 'u';
        break;

      default:
        break;
    }
  }
}
268 
269 
/*
 * Convert all characters of sequence to upper case in place, using
 * toupper(). A NULL sequence is ignored.
 */
PUBLIC void
vrna_seq_toupper(char *sequence)
{
  char *c;

  if (!sequence)
    return;

  /*
   * toupper() is only defined for EOF and values representable as
   * unsigned char, so a possibly negative plain char must be converted
   * before the call
   */
  for (c = sequence; *c != '\0'; c++)
    *c = (char)toupper((unsigned char)*c);
}
279 
280 
/*
 * Reverse the order of the characters of sequence in place.
 * A NULL or empty sequence is left untouched.
 */
PUBLIC void
vrna_seq_reverse(char *sequence)
{
  size_t  left, right;
  char    tmp;

  /*
   * the empty string is handled up front: computing the position of its
   * "last" character would point in front of the array
   */
  if ((!sequence) || (sequence[0] == '\0'))
    return;

  left  = 0;
  right = strlen(sequence) - 1;

  /* swap the outermost pair and move inwards until the indices meet */
  while (left < right) {
    tmp               = sequence[left];
    sequence[left++]  = sequence[right];
    sequence[right--] = tmp;
  }
}
295 
296 
297 PUBLIC char *
vrna_DNA_complement(const char * sequence)298 vrna_DNA_complement(const char *sequence)
299 {
300   char    *complement, *ptr;
301   size_t  n;
302 
303   complement = NULL;
304 
305   if (sequence) {
306     n           = strlen(sequence);
307     complement  = (char *)vrna_alloc(sizeof(char) * (n + 1));
308     /* copy the input string */
309     complement  = memcpy(complement, sequence, sizeof(char) * n);
310 
311     /* complement characters */
312     for (ptr = complement; *ptr; ptr++) {
313       switch (*ptr) {
314         case 'A':
315           *ptr = 'T';
316           break;
317 
318         case 'a':
319           *ptr = 't';
320           break;
321 
322         case 'C':
323           *ptr = 'G';
324           break;
325 
326         case 'c':
327           *ptr = 'g';
328           break;
329 
330         case 'G':
331           *ptr = 'C';
332           break;
333 
334         case 'g':
335           *ptr = 'c';
336           break;
337 
338         case 'T': /* fall through */
339         case 'U':
340           *ptr = 'A';
341           break;
342 
343         case 't': /* fall through */
344         case 'u':
345           *ptr = 'a';
346           break;
347 
348         default:
349           break;
350       }
351     }
352 
353     complement[n] = '\0';
354   }
355 
356   return complement;
357 }
358 
359 
360 PUBLIC char *
vrna_cut_point_insert(const char * string,int cp)361 vrna_cut_point_insert(const char  *string,
362                       int         cp)
363 {
364   char  *ctmp;
365   int   len;
366 
367   if (cp > 0) {
368     len   = strlen(string);
369     ctmp  = (char *)vrna_alloc((len + 2) * sizeof(char));
370     /* first sequence */
371     (void)strncpy(ctmp, string, cp - 1);
372     /* spacer */
373     ctmp[cp - 1] = '&';
374     /* second sequence */
375     (void)strcat(ctmp, string + cp - 1);
376   } else {
377     ctmp = strdup(string);
378   }
379 
380   return ctmp;
381 }
382 
383 
384 PUBLIC char *
vrna_cut_point_remove(const char * string,int * cp)385 vrna_cut_point_remove(const char  *string,
386                       int         *cp)
387 {
388   char *pos, *copy = NULL;
389   unsigned int len;
390 
391   *cp = -1;
392 
393   if (string) {
394     len = strlen(string);
395     copy = strdup(string);
396     if ((pos = strchr(copy, '&'))) {
397       *cp = (int)(pos - copy) + 1;
398       if (*cp >= len)
399         *cp = -1;
400 
401       if (strchr(pos + 1, '&'))
402         vrna_message_error("more than one cut-point in input");
403 
404       for (; *pos; pos++)
405         *pos = *(pos + 1);               /* splice out the & */
406     }
407   }
408 
409   return copy;
410 }
411 
412 
413 PUBLIC char **
vrna_strsplit(const char * string,const char * delimiter)414 vrna_strsplit(const char  *string,
415               const char  *delimiter)
416 {
417   char          delim[2], *ptr, *ptr2, *token, *save, **split;
418   unsigned int  n;
419 
420   split = NULL;
421   n     = 0;
422 
423   if (string) {
424     if ((delimiter) && (*delimiter))
425       delim[0] = *delimiter;
426     else
427       delim[0] = '&';
428 
429     delim[1] = '\0';
430 
431     /* copy string such that we can alter it via strtok() */
432     ptr2 = strdup(string);
433 
434     /* count how many elements we'll extract */
435     ptr = ptr2;
436 
437     while (*ptr++)
438       if (*ptr == *delim)
439         n++;
440 
441     /*
442      * allocate (n + 1) + 1 elements in split list
443      * n + 1 elements plus 1 additional element to indicate
444      * the last element in split
445      */
446     split = (char **)vrna_alloc(sizeof(char *) * (n + 2));
447 
448     n     = 0;
449     token = strtok_r(ptr2, delim, &save);
450 
451     while (token != NULL) {
452       split[n++]  = vrna_strdup_printf("%s", token);
453       token       = strtok_r(NULL, delim, &save);
454     }
455 
456     split[n] = NULL;
457 
458     free(ptr2);
459   }
460 
461   return split;
462 }
463 
464 
465 PUBLIC char *
vrna_strjoin(const char ** strings,const char * delimiter)466 vrna_strjoin(const char **strings,
467              const char *delimiter)
468 {
469   char        *s = NULL;
470   size_t      n, offset, *lengths, num_strings, mem_strings, total_length;
471 
472   if (strings) {
473     total_length  = 0;
474     mem_strings   = 32;
475     lengths       = (size_t *)vrna_alloc(sizeof(size_t) * mem_strings);
476 
477     for (n = 0; strings[n]; n++) {
478       lengths[n]    = strlen(strings[n]);
479       total_length += lengths[n];
480 
481       if (n == mem_strings) {
482         mem_strings += 32;
483         lengths      = (size_t *)vrna_realloc(lengths, sizeof(size_t) * mem_strings);
484       }
485     }
486 
487     if ((delimiter) && (*delimiter))
488       total_length += (n - 1);
489 
490     /* finally, glue the strings together */
491     s = (char *)vrna_alloc(sizeof(char) * (total_length + 1));
492 
493     for (offset = 0, n = 0; strings[n]; n++) {
494       memcpy(s + offset, strings[n], sizeof(char) * lengths[n]);
495       offset += lengths[n];
496 
497       if ((delimiter) &&
498           (*delimiter) &&
499           (strings[n + 1]))
500         s[offset++] = *delimiter;
501     }
502 
503     s[total_length] = '\0';
504 
505     free(lengths);
506   }
507 
508   return s;
509 }
510 
511 
512 #if 0
/*
 * NOTE(review): unfinished draft that is compiled out by the enclosing
 * "#if 0" and must not be enabled as it stands:
 *  - the counter n is declared in the init clause of the counting loop
 *    but is used again after that loop has ended
 *  - positions is dereferenced as (*positions)[0] in some places and
 *    indexed as positions[n] in others
 *  - the memcpy() call inside the second loop is not completed
 *  - the VRNA_STRSPLICE_OUT branch for a given delimiter is empty
 *  - start, end and last_pos are set up, but the loop that would use
 *    them is incomplete
 * Without a delimiter a strdup() copy of string is returned, and with
 * VRNA_STRSPLICE_OUT *positions receives a one-element list holding 0.
 */
513 PUBLIC char *
514 vrna_strsplice(const char   *string,
515                const char   *delimiter,
516                unsigned int **positions,
517                unsigned int options)
518 {
519   char *result = NULL;
520 
521   if (string) {
522     if (delimiter) {
523       if (options & VRNA_STRSPLICE_IN){
524         if (positions) {
525           /* count how many more characters we require for the fully spliced string */
526           for (size_t n = 0; positions[n] != 0; n++);
527 
528           size_t dl = strlen(delimiter);
529           size_t l  = strlen(string);
530 
531           result = (char *)vrna_alloc(sizeof(char) * (l + dl * n + 1));
532 
533           /* finally, construct the spliced sequence */
534           size_t start = 0;
535           size_t end   = 0;
536           size_t last_pos = 0;
537           /* handle first case separately */
538           memcpy(result, string, sizeof(char) * ((*positions)[0] - 1));
539           memcpy(result + (*positions)[0] - 1, delimiter, sizeof(char) * dl);
540           start += (*positions)[0] - 1;
541           end   += (*positions)[0] - 1 + dl;
542 
543           for (size_t i = 1; i < n; i++) {
544             memcpy(result + end, string + start, sizeof(char) * positions
545           }
546 
547         } else {
548           result = strdup(string);
549         }
550       } else if (options & VRNA_STRSPLICE_OUT) {
551 
552       }
553     } else {
554       /* no delimiter specified, so we don't need to do anything */
555       result = strdup(string);
556       if ((options & VRNA_STRSPLICE_OUT) &&
557           (positions)) {
558         *positions = (unsigned int *)vrna_alloc(sizeof(unsigned int));
559         (*positions)[0] = 0;
560       }
561     }
562   }
563 
564   return result;
565 }
566 
567 #endif
568 
569 PUBLIC char *
vrna_seq_ungapped(const char * seq)570 vrna_seq_ungapped(const char *seq)
571 {
572   char  *tmp_sequence, *b;
573   int   i;
574 
575   tmp_sequence = NULL;
576 
577   if (seq) {
578     tmp_sequence = strdup(seq);
579 
580     b = tmp_sequence;
581     i = 0;
582     do {
583       if ((*b == '-') || (*b == '_') || (*b == '~') || (*b == '.'))
584         continue;
585 
586       tmp_sequence[i] = *b;
587       i++;
588     } while (*(++b));
589 
590     tmp_sequence    = (char *)vrna_realloc(tmp_sequence, (i + 1) * sizeof(char));
591     tmp_sequence[i] = '\0';
592   }
593 
594   return tmp_sequence;
595 }
596 
597 
598 #ifndef VRNA_DISABLE_BACKWARD_COMPATIBILITY
599 
600 /*###########################################*/
601 /*# deprecated functions below              #*/
602 /*###########################################*/
603 
604 PUBLIC void
str_uppercase(char * sequence)605 str_uppercase(char *sequence)
606 {
607   vrna_seq_toupper(sequence);
608 }
609 
610 
611 PUBLIC void
str_DNA2RNA(char * sequence)612 str_DNA2RNA(char *sequence)
613 {
614   vrna_seq_toRNA(sequence);
615 }
616 
617 
618 PUBLIC char *
random_string(int l,const char symbols[])619 random_string(int         l,
620               const char  symbols[])
621 {
622   return vrna_random_string(l, symbols);
623 }
624 
625 
626 PUBLIC int
hamming(const char * s1,const char * s2)627 hamming(const char  *s1,
628         const char  *s2)
629 {
630   return vrna_hamming_distance(s1, s2);
631 }
632 
633 
634 PUBLIC int
hamming_bound(const char * s1,const char * s2,int boundary)635 hamming_bound(const char  *s1,
636               const char  *s2,
637               int         boundary)
638 {
639   return vrna_hamming_distance_bound(s1, s2, boundary);
640 }
641 
642 
643 #endif
644