1 #include "rlang.h"
2 #include <R_ext/GraphicsEngine.h>
3 #include <string.h>
4 #include <stdlib.h>
5
6 // Interface functions ---------------------------------------------------------
7
8 void copy_character(sexp* tgt, sexp* src, R_xlen_t len);
9 R_xlen_t unescape_character_in_copy(sexp* tgt, sexp* src, R_xlen_t i);
10
// Converts the first element of the character vector `chr` into a
// symbol by handing that element to r_str_as_symbol().
sexp* rlang_symbol(sexp* chr) {
  sexp* first_elt = r_chr_get(chr, 0);
  return r_str_as_symbol(first_elt);
}
14
// Turns the print name of `sym` into a character vector via
// r_str_as_character(), after translating any `<U+xxxx>` escapes it
// contains with r_str_unserialise_unicode().
sexp* rlang_sym_as_character(sexp* sym) {
  sexp* print_name = PRINTNAME(sym);

  // The unescaped string is kept protected while the result is built.
  sexp* unescaped = KEEP(r_str_unserialise_unicode(print_name));
  sexp* chr = r_str_as_character(unescaped);

  FREE(1);
  return chr;
}
21
// Returns `chr` with `<U+xxxx>` escapes translated in every element.
//
// A first pass with a null target only scans for the first element that
// would change. When that scan reaches the end, `chr` itself is returned
// and no new vector is allocated. Otherwise a fresh character vector is
// created, the untouched leading elements are copied verbatim, and the
// remaining elements are unescaped into it.
sexp* rlang_unescape_character(sexp* chr) {
  const R_xlen_t n = Rf_xlength(chr);

  // Scan-only pass: index of the first element needing unescaping
  const R_xlen_t first_escaped = unescape_character_in_copy(r_null, chr, 0);
  if (first_escaped == n) {
    return chr;
  }

  sexp* out = KEEP(r_new_vector(STRSXP, n));

  // Elements before `first_escaped` are known to be unchanged
  copy_character(out, chr, first_escaped);
  unescape_character_in_copy(out, chr, first_escaped);

  FREE(1);
  return out;
}
33
34 // Private functions -----------------------------------------------------------
35
36 static sexp* unescape_char_to_sexp(char* tmp);
37 static bool has_unicode_escape(const char* chr);
38 static int unescape_char(char* chr);
39 static int unescape_char_found(char* chr);
40 static int process_byte(char* tgt, char* const src, int* len_processed);
41 static bool has_codepoint(const char* src);
42 static bool is_hex(const char chr);
43
// Copies the first `len` string elements of `src` into the same
// positions of `tgt`.
//
// The counter has the same type as `len` (R_xlen_t) so that prefixes
// longer than INT_MAX, possible with long vectors, do not overflow a
// narrower signed counter.
void copy_character(sexp* tgt, sexp* src, R_xlen_t len) {
  for (R_xlen_t i = 0; i < len; ++i) {
    SET_STRING_ELT(tgt, i, STRING_ELT(src, i));
  }
}
49
// Walks `src` starting at index `i` and unescapes each element with
// r_str_unserialise_unicode().
//
// - When `tgt` is null this is a scan-only pass: nothing is written and
//   the index of the first element whose unescaped string differs (by
//   pointer) from the original is returned.
// - Otherwise each unescaped element is stored at the same index in
//   `tgt`.
//
// Returns the length of `src` when the walk runs to completion.
R_xlen_t unescape_character_in_copy(sexp* tgt, sexp* src, R_xlen_t i) {
  const R_xlen_t n = r_length(src);
  const int scan_only = Rf_isNull(tgt);

  while (i < n) {
    sexp* elt = STRING_ELT(src, i);
    sexp* unescaped = r_str_unserialise_unicode(elt);

    if (!scan_only) {
      SET_STRING_ELT(tgt, i, unescaped);
    } else if (unescaped != elt) {
      // First element that actually contains an escape
      return i;
    }

    ++i;
  }

  return i;
}
66
// Translates `<U+xxxx>` escapes found in the string `r_string`.
//
// Returns `r_string` itself when it contains no such escape. Otherwise
// the bytes are re-encoded to UTF-8 with Rf_reEnc(), the escapes are
// replaced in a writable buffer, and a new UTF-8 string is returned.
sexp* r_str_unserialise_unicode(sexp* r_string) {
  int encoding = Rf_getCharCE(r_string);
  const char* bytes = CHAR(r_string);

  if (!has_unicode_escape(bytes)) {
    return r_string;
  }

  const char* utf8 = Rf_reEnc(bytes, encoding, CE_UTF8, 0);

  if (utf8 != bytes) {
    // Rf_reEnc() handed back a different buffer, so it can be
    // overwritten in place without touching the original string.
    return unescape_char_to_sexp((char*) utf8);
  }

  // Rf_reEnc() returned its input unchanged: these are still the bytes
  // of `r_string`, which must not be modified. Work on a stack copy
  // (terminator included) instead.
  int n_bytes = strlen(utf8);
  char buffer[n_bytes + 1];
  memcpy(buffer, utf8, n_bytes + 1);
  return unescape_char_to_sexp(buffer);
}
90
unescape_char_to_sexp(char * tmp)91 static sexp* unescape_char_to_sexp(char* tmp) {
92 int len = unescape_char(tmp);
93 return Rf_mkCharLenCE(tmp, len, CE_UTF8);
94 }
95
// Reports whether a `<U+xxxx>` escape starts at any position of the
// NUL-terminated string `chr`.
static bool has_unicode_escape(const char* chr) {
  for (const char* cursor = chr; *cursor != '\0'; ++cursor) {
    if (has_codepoint(cursor)) {
      return true;
    }
  }

  return false;
}
106
// Unescapes the NUL-terminated buffer `chr` in place and returns the
// resulting length in bytes.
//
// Bytes before the first escape are left alone and only counted. From
// the first escape onwards the work is delegated to
// unescape_char_found(). Without any escape the buffer is unchanged and
// its length is returned.
static int unescape_char(char* chr) {
  int n_prefix = 0;

  for (; chr[n_prefix] != '\0'; ++n_prefix) {
    char* here = chr + n_prefix;
    if (has_codepoint(here)) {
      return n_prefix + unescape_char_found(here);
    }
  }

  return n_prefix;
}
121
// Rewrites the NUL-terminated buffer `chr` in place, one
// process_byte() step at a time, and returns the number of bytes
// written.
//
// A read cursor and a write cursor both start at `chr`. Each step
// reports how far to advance each of them. The result is
// NUL-terminated at the final write position.
static int unescape_char_found(char* chr) {
  char* in = chr;
  char* out = chr;

  while (*in != '\0') {
    int n_consumed;
    int n_written = process_byte(out, in, &n_consumed);
    in += n_consumed;
    out += n_written;
  }

  *out = '\0';

  // Total written is the distance travelled by the write cursor
  return (int) (out - chr);
}
138
// Performs one unescaping step from `src` into `tgt`.
//
// - If a `<U+xxxx>` escape starts at `src`, its hexadecimal code point
//   is parsed and written to `tgt` through Rf_ucstoutf8(). The whole
//   escape is reported as consumed.
// - Otherwise the single byte at `src` is copied to `tgt`.
//
// `*len_processed` receives the number of source bytes consumed; the
// return value is the number of bytes produced at `tgt`.
static int process_byte(char* tgt, char* const src, int* len_processed) {
  if (has_codepoint(src)) {
    // Parse the hex digits first: `tgt` may point into the same buffer
    unsigned int codepoint = strtoul(src + strlen("<U+"), NULL, 16);
    *len_processed = strlen("<U+xxxx>");

    // The escape spans 8 bytes; the encoded code point needs fewer
    return (int) Rf_ucstoutf8(tgt, codepoint);
  }

  // Plain byte (an angle bracket or anything else): copy and advance
  *tgt = *src;
  *len_processed = 1;
  return 1;
}
153
// Reports whether `src` begins with an escape of the exact form
// `<U+xxxx>`, where each `x` is accepted by is_hex().
//
// The checks run left to right and stop at the first mismatch. A NUL
// terminator never matches, so no byte past the end of a
// NUL-terminated string is read.
static bool has_codepoint(const char* src) {
  if (src[0] != '<' || src[1] != 'U' || src[2] != '+') {
    return false;
  }

  const char* const digits_end = src + 7;
  for (const char* digit = src + 3; digit != digits_end; ++digit) {
    if (!is_hex(*digit)) {
      return false;
    }
  }

  return src[7] == '>';
}
164
// Reports whether `chr` is a decimal digit or an upper-case
// hexadecimal letter (`A` to `F`). Lower-case letters are rejected.
static bool is_hex(const char chr) {
  const bool is_digit = ('0' <= chr) && (chr <= '9');
  const bool is_upper_letter = ('A' <= chr) && (chr <= 'F');
  return is_digit || is_upper_letter;
}
170