1 #include "rlang.h"
2 #include <R_ext/GraphicsEngine.h>
3 #include <string.h>
4 #include <stdlib.h>
5 
6 // Interface functions ---------------------------------------------------------
7 
8 void copy_character(sexp* tgt, sexp* src, R_xlen_t len);
9 R_xlen_t unescape_character_in_copy(sexp* tgt, sexp* src, R_xlen_t i);
10 
// Fetches element 0 of `chr` with r_chr_get() and hands it to
// r_str_as_symbol(), returning whatever that call produces.
sexp* rlang_symbol(sexp* chr) {
  sexp* first = r_chr_get(chr, 0);
  return r_str_as_symbol(first);
}
14 
// Takes the print name of `sym`, runs it through
// r_str_unserialise_unicode() so `<U+xxxx>` escapes are decoded, and
// converts the resulting string with r_str_as_character().
sexp* rlang_sym_as_character(sexp* sym) {
  sexp* name = PRINTNAME(sym);

  // Keep the (possibly freshly created) string protected while it is
  // being converted.
  sexp* unescaped = KEEP(r_str_unserialise_unicode(name));
  sexp* res = r_str_as_character(unescaped);

  FREE(1);
  return res;
}
21 
// Decodes `<U+xxxx>` escapes in every element of the character vector
// `chr`.
//
// A dry run (null target) first locates the first element that would
// change. If the dry run reaches the end, `chr` itself is returned and
// nothing is allocated. Otherwise a new STRSXP of the same length is
// created: elements before that position are copied as-is, the
// remaining ones are passed through the unescaping routine.
sexp* rlang_unescape_character(sexp* chr) {
  R_xlen_t n = Rf_xlength(chr);

  R_xlen_t first_changed = unescape_character_in_copy(r_null, chr, 0);
  if (first_changed == n) {
    return chr;
  }

  sexp* out = KEEP(r_new_vector(STRSXP, n));
  copy_character(out, chr, first_changed);
  unescape_character_in_copy(out, chr, first_changed);

  FREE(1);
  return out;
}
33 
34 // Private functions -----------------------------------------------------------
35 
36 static sexp* unescape_char_to_sexp(char* tmp);
37 static bool has_unicode_escape(const char* chr);
38 static int unescape_char(char* chr);
39 static int unescape_char_found(char* chr);
40 static int process_byte(char* tgt, char* const src, int* len_processed);
41 static bool has_codepoint(const char* src);
42 static bool is_hex(const char chr);
43 
// Copies the first `len` string elements of `src` into the same
// positions of `tgt`.
//
// The index has the same type as `len` (R_xlen_t) so that counts above
// INT_MAX, which R_xlen_t may hold for long vectors, do not overflow
// the loop counter.
void copy_character(sexp* tgt, sexp* src, R_xlen_t len) {
  for (R_xlen_t i = 0; i < len; ++i) {
    SET_STRING_ELT(tgt, i, STRING_ELT(src, i));
  }
}
49 
// Walks `src` from position `i` to its end, unescaping each element
// with r_str_unserialise_unicode().
//
// - When `tgt` is the null object (dry run), nothing is written; the
//   function returns the index of the first element whose unescaped
//   string is a different object from the original, or the length of
//   `src` when there is none.
// - Otherwise each unescaped element is stored in `tgt` at the same
//   index, and the length of `src` is returned.
R_xlen_t unescape_character_in_copy(sexp* tgt, sexp* src, R_xlen_t i) {
  const R_xlen_t n = r_length(src);
  const int dry_run = Rf_isNull(tgt);

  while (i < n) {
    sexp* elt = STRING_ELT(src, i);
    sexp* unescaped = r_str_unserialise_unicode(elt);

    if (!dry_run) {
      SET_STRING_ELT(tgt, i, unescaped);
    } else if (unescaped != elt) {
      // Pointer comparison: the unescaping routine hands back its
      // input unchanged when there was nothing to decode.
      return i;
    }

    ++i;
  }

  return i;
}
66 
// Returns a string in which every `<U+xxxx>` escape of `r_string` has
// been replaced by the UTF-8 encoding of that codepoint.
//
// If `r_string` contains no such escape it is returned as is.
// Otherwise the contents are re-encoded to UTF-8, unescaped in a
// writable buffer, and a new CE_UTF8 string is created from the result.
sexp* r_str_unserialise_unicode(sexp* r_string) {
  int ce = Rf_getCharCE(r_string);
  const char* src = CHAR(r_string);

  if (!has_unicode_escape(src)) {
    return r_string;
  }

  const char* re_enc = Rf_reEnc(src, ce, CE_UTF8, 0);

  if (re_enc != src) {
    // Re-encoding produced a separate copy, so that copy can be
    // modified in place.
    return unescape_char_to_sexp((char*) re_enc);
  }

  // Re-encoding returned the original pointer (no copy was made), and
  // the contents of `r_string` must not be modified: unescape a local
  // duplicate instead, terminator included.
  int n_bytes = strlen(re_enc);
  char buf[n_bytes + 1];
  memcpy(buf, re_enc, n_bytes + 1);
  return unescape_char_to_sexp(buf);
}
90 
unescape_char_to_sexp(char * tmp)91 static sexp* unescape_char_to_sexp(char* tmp) {
92   int len = unescape_char(tmp);
93   return Rf_mkCharLenCE(tmp, len, CE_UTF8);
94 }
95 
// Tells whether a `<U+xxxx>` escape starts at any position of the
// NUL-terminated string `chr`.
static bool has_unicode_escape(const char* chr) {
  for (const char* p = chr; *p != '\0'; ++p) {
    if (has_codepoint(p)) {
      return true;
    }
  }

  return false;
}
106 
// Unescapes the NUL-terminated buffer `chr` in place and returns the
// length in bytes of the result.
//
// Bytes before the first `<U+xxxx>` escape are only counted, since they
// stay where they are; rewriting starts at the first escape and is
// delegated to unescape_char_found().
static int unescape_char(char* chr) {
  int n_prefix = 0;

  for (char* p = chr; *p != '\0'; ++p, ++n_prefix) {
    if (has_codepoint(p)) {
      return n_prefix + unescape_char_found(p);
    }
  }

  return n_prefix;
}
121 
// Rewrites the NUL-terminated buffer `chr` in place, one process_byte()
// step at a time, and terminates the result with a NUL byte.
//
// Both cursors start at `chr`; the input cursor advances by the number
// of bytes consumed and the output cursor by the number of bytes
// produced. Returns the number of bytes produced (terminator excluded).
static int unescape_char_found(char* chr) {
  char* in = chr;
  char* out = chr;

  while (*in != '\0') {
    int consumed;
    int produced = process_byte(out, in, &consumed);
    in += consumed;
    out += produced;
  }

  *out = '\0';

  // The output cursor moved by exactly the sum of the produced counts.
  return (int) (out - chr);
}
138 
// Performs one unescaping step from `src` to `tgt`.
//
// - If a `<U+xxxx>` escape starts at `src`, its hexadecimal codepoint
//   is parsed and written to `tgt` by Rf_ucstoutf8(); `*len_processed`
//   is set to the length of the escape and the value returned by
//   Rf_ucstoutf8() (cast to int) is returned.
// - Otherwise the single byte at `src` is copied to `tgt`,
//   `*len_processed` is set to 1 and 1 is returned.
static int process_byte(char* tgt, char* const src, int* len_processed) {
  if (has_codepoint(src)) {
    // Skip the "<U+" prefix; parsing stops at the closing '>'.
    const char* digits = src + strlen("<U+");
    unsigned int codepoint = strtoul(digits, NULL, 16);
    *len_processed = strlen("<U+xxxx>");

    // The escape occupies 8 bytes and the encoded codepoint needs fewer
    // than that, so writing at `tgt` stays within consumed input.
    return (int) Rf_ucstoutf8(tgt, codepoint);
  }

  // Ordinary byte (an angle bracket that does not open an escape
  // included): copy it through unchanged.
  *tgt = *src;
  *len_processed = 1;
  return 1;
}
153 
// Tells whether `src` starts with an escape of the exact form
// `<U+xxxx>`, where each `x` is a character accepted by is_hex().
//
// The checks run left to right and stop at the first mismatch, so a
// NUL terminator inside the first 8 bytes ends the test before any
// byte past it is read.
static bool has_codepoint(const char* src) {
  return src[0] == '<'
    && src[1] == 'U'
    && src[2] == '+'
    && is_hex(src[3])
    && is_hex(src[4])
    && is_hex(src[5])
    && is_hex(src[6])
    && src[7] == '>';
}
164 
// Tells whether `chr` is a decimal digit or an uppercase letter from
// 'A' to 'F'. Lowercase 'a'-'f' are not accepted.
static bool is_hex(const char chr) {
  const bool is_digit = (chr >= '0') && (chr <= '9');
  const bool is_upper_af = (chr >= 'A') && (chr <= 'F');
  return is_digit || is_upper_af;
}
170