1 /*
2 This code is adapted from Alex Couture-Beil's <rjson_pkg@mofo.ca>
3 rjson package. It converts strings containing Unicode of the form
4 \u<4 hex characters> to R's format, i.e. by mapping them to
5 1, 2, 3 or 4 bytes.
6
7 This is adapted so that it can be used independently of a JSON string
8 and just converts an arbitrary character.
9
10 It is distributed under the GPL-2 license.
11 */
12
13 #if 0
14 static int x;
15 #else
16 #include <Rdefines.h>
17 #include <Rinternals.h>
18
19 #include <stdlib.h>
20
/* Bit patterns used to assemble UTF-8 byte sequences. */
#define MASKBITS 0x3F    /* low six payload bits carried by a continuation byte */
#define MASKBYTE 0x80    /* 10xxxxxx - continuation byte marker */
#define MASK2BYTES 0xC0  /* 110xxxxx - lead byte of a two-byte sequence */
#define MASK3BYTES 0xE0  /* 1110xxxx - lead byte of a three-byte sequence */


/*
  Encode one 16-bit code unit as UTF-8.
    input - the code unit to encode.
    s     - destination; the caller must provide room for up to 3 bytes.
            No terminating NUL is written.
  Returns the number of bytes stored in s (1, 2 or 3).
*/
int UTF8Encode2BytesUnicode( unsigned short input, char * s )
{
    int nbytes;

    /* Classify first; the thresholds are the usual UTF-8 range limits. */
    if( input >= 0x800 )
        nbytes = 3;
    else if( input >= 0x80 )
        nbytes = 2;
    else
        nbytes = 1;

    switch( nbytes ) {
    case 1:
        /* 0xxxxxxx */
        s[ 0 ] = (char) input;
        break;
    case 2:
        /* 110xxxxx 10xxxxxx */
        s[ 0 ] = (char)( MASK2BYTES | ( input >> 6 ) );
        s[ 1 ] = (char)( MASKBYTE | ( input & MASKBITS ) );
        break;
    default:
        /* 1110xxxx 10xxxxxx 10xxxxxx - an unsigned short cannot need more */
        s[ 0 ] = (char)( MASK3BYTES | ( input >> 12 ) );
        s[ 1 ] = (char)( MASKBYTE | ( ( input >> 6 ) & MASKBITS ) );
        s[ 2 ] = (char)( MASKBYTE | ( input & MASKBITS ) );
        break;
    }

    return nbytes;
}
53
54
55 #define ASSERT(cond) if(!(cond)) error("overrunning buffers in mapString");
56
57
58 /* Convert a string with \unnnn elements to its Unicode form.
59 s - the input string
60 nchar - the number of characters in s.
61 buf - the output string
62 bufLen - the number of bytes available in buf.
63 */
mapString(const char * s,int nchar,char * buf,size_t bufLen)64 SEXP mapString(const char *s, int nchar, char *buf, size_t bufLen)
65 {
66 int i = 0;
67 buf[0] = '\0';
68 char *cur = buf;
69
70 while( i < nchar ) {
71 while(i < nchar && cur < buf + bufLen &&
72 s[ i ] != '\\' && s[ i ] != '\0') {
73 cur[0] = s[i];
74 i++; cur++;
75 }
76
77 if(i >= nchar || cur >= buf + bufLen )
78 break;
79
80 if(s[i] == '\0')
81 break;
82
83 if( s[ i ] == '\\' ) {
84 i++;
85 if(i >= nchar) {
86 Rf_warning("ending string with an escape: %d > %d", (int) i, (int) nchar);
87 break;
88 }
89
90 switch( s[ i ] ) {
91 case '"':
92 cur[0] = '\\';
93 cur[1] = '"';
94 cur+=2;
95 break;
96 case '\\':
97 case '/':
98 cur[ 0 ] = s[ i ];
99 cur++;
100 break;
101 case 'r':
102 cur[0] = '\r'; cur++;
103 break;
104 case 'n':
105 cur[0] = '\n'; cur++;
106 break;
107 case 'b':
108 cur[0] = '\b'; cur++;
109 break;
110 case 't':
111 cur[0] = '\t'; cur++;
112 break;
113 case 'f':
114 cur[0] = '\f'; cur++;
115 break;
116 case 'u':
117 {
118 int j;
119 if(i > nchar - 3) {
120 Rf_error("walking passed the end");
121 }
122 for(j = 1; j <= 4; j++ )
123 if( (i + j >= nchar) || ( ( s[ i + j ] >= 'a' && s[ i + j ] <= 'f' ) ||
124 ( s[ i + j ] >= 'A' && s[ i + j ] <= 'F' ) ||
125 ( s[ i + j ] >= '0' && s[ i + j ] <= '9' ) ) == FALSE ) {
126 Rf_error("unexpected unicode escaped char '%c'; 4 hex digits should follow the \\u (found %i valid digits)", s[ i + j ], j - 1);
127 }
128
129 unsigned short unicode;
130 char unicode_buf[ 5 ]; /* to hold 4 digit hex (to prevent scanning a 5th digit accidentally */
131 strncpy( unicode_buf, s + i + 1, 5 );
132 unicode_buf[ 4 ] = '\0';
133 sscanf( unicode_buf, "%hx", &unicode);
134 cur += UTF8Encode2BytesUnicode( unicode, cur);
135
136 i += 4; /* skip the four digits - actually point to last digit, which is then incremented outside of switch */
137 }
138 break;
139 default:
140 cur[ 0 ] = s[ i ];
141 cur++;
142 break;
143 }
144
145 i++; /* move to next char */
146 }
147 }
148 cur[0] = '\0';
149
150 ASSERT(i <= nchar && cur < buf + bufLen)
151
152
153 return(mkCharCE( buf, CE_UTF8 ));
154 }
155
156
157 /* R interface to mapString. Takes a vector of strings and a
158 a same length integer vector of lengths. */
R_mapString(SEXP str,SEXP suggestedLen)159 SEXP R_mapString(SEXP str, SEXP suggestedLen)
160 {
161 int numEls = Rf_length(str), i;
162 SEXP ans;
163 PROTECT(ans = NEW_CHARACTER(numEls));
164 for(i = 0; i < numEls; i++) {
165
166 size_t num;
167 if(Rf_length(suggestedLen))
168 num = INTEGER(suggestedLen)[i];
169 else
170 num = 4 * strlen(CHAR(STRING_ELT(str, i)));
171
172 char * buf = (char *) R_alloc(num, sizeof(char));
173 if(!buf) {
174 Rf_error("can't allocate memory for working buffer");
175 }
176
177 const char *tmp;
178 tmp = CHAR(STRING_ELT(str, i));
179 SET_STRING_ELT(ans, i,
180 mapString(tmp, (int)strlen(tmp), buf,
181 INTEGER(suggestedLen)[i]));
182 }
183
184 UNPROTECT(1);
185 return(ans);
186 }
187
188 #endif
189
190
191