/*
  This code is adapted from Alex Couture-Beil's <rjson_pkg@mofo.ca>
  rjson package.  It converts strings containing Unicode escapes of the
  form \u<4 hex characters> to R's format, i.e. by mapping each of them
  to 1, 2, 3 or 4 bytes.

  It has been adapted so that it can be used independently of a JSON
  string: it simply converts an arbitrary character string.

  It is distributed under the GPL-2 license.
 */
12 
13 #if 0
14 static int x;
15 #else
16 #include <Rdefines.h>
17 #include <Rinternals.h>
18 
19 #include <stdlib.h>
20 
/* Bit patterns used to build UTF-8 sequences. */
#define         MASKBITS                0x3F	/* low 6 payload bits of a byte */
#define         MASKBYTE                0x80	/* 10xxxxxx: continuation byte   */
#define         MASK2BYTES              0xC0	/* 110xxxxx: lead of 2-byte form */
#define         MASK3BYTES              0xE0	/* 1110xxxx: lead of 3-byte form */


/*
  Encode one 16-bit value as UTF-8.

  input - the value to encode (0x0000 .. 0xFFFF).
  s     - destination; up to 3 bytes are written starting at s[0].
          No terminating '\0' is written.

  Returns the number of bytes stored in s: 1, 2 or 3.

  The bytes are stored through an unsigned char pointer so that values
  >= 0x80 are written without narrowing an int into a (possibly signed)
  plain char.
 */
int UTF8Encode2BytesUnicode(unsigned short input, char *s)
{
	unsigned char *out = (unsigned char *) s;

	/* 0xxxxxxx */
	if (input < 0x80) {
		out[0] = (unsigned char) input;
		return 1;
	}

	/* 110xxxxx 10xxxxxx */
	if (input < 0x800) {
		out[0] = (unsigned char) (MASK2BYTES | (input >> 6));
		out[1] = (unsigned char) (MASKBYTE | (input & MASKBITS));
		return 2;
	}

	/* 1110xxxx 10xxxxxx 10xxxxxx
	   (input is an unsigned short, so it is always < 0x10000) */
	out[0] = (unsigned char) (MASK3BYTES | (input >> 12));
	out[1] = (unsigned char) (MASKBYTE | ((input >> 6) & MASKBITS));
	out[2] = (unsigned char) (MASKBYTE | (input & MASKBITS));
	return 3;
}
53 
54 
55 #define ASSERT(cond)  if(!(cond)) error("overrunning buffers in mapString");
56 
57 
58 /* Convert a string with \unnnn elements to its Unicode form.
59   s - the input string
60   nchar - the number of characters in s.
61   buf - the output string
62   bufLen - the number of bytes available in buf.
63  */
mapString(const char * s,int nchar,char * buf,size_t bufLen)64 SEXP mapString(const char *s, int nchar, char *buf, size_t bufLen)
65 {
66 	int i = 0;
67 	buf[0] = '\0';
68 	char *cur = buf;
69 
70 	while( i < nchar ) {
71 	    while(i < nchar && cur < buf + bufLen &&
72                      s[ i ] != '\\' && s[ i ] != '\0') {
73 		cur[0] = s[i];
74 		i++; cur++;
75 	    }
76 
77             if(i >= nchar || cur >= buf + bufLen )
78 		break;
79 
80 	    if(s[i] == '\0')
81 		break;
82 
83 	    if( s[ i ] == '\\' ) {
84 		i++;
85 		if(i >= nchar) {
86 		    Rf_warning("ending string with an escape: %d > %d", (int) i, (int) nchar);
87 		    break;
88 		}
89 
90 		switch( s[ i ] ) {
91 		case '"':
92 		    cur[0] = '\\';
93 		    cur[1] = '"';
94 		    cur+=2;
95 		    break;
96 		case '\\':
97 		case '/':
98 		    cur[ 0 ] = s[ i ];
99 		    cur++;
100 		    break;
101 		case 'r':
102 		    cur[0] = '\r'; cur++;
103 		    break;
104 		case 'n':
105 		    cur[0] = '\n'; cur++;
106 		    break;
107 		case 'b':
108 		    cur[0] = '\b'; cur++;
109 		    break;
110 		case 't':
111 		    cur[0] = '\t'; cur++;
112 		    break;
113 		case 'f':
114 		    cur[0] = '\f'; cur++;
115 		    break;
116 		case 'u':
117 		    {
118                       int j;
119 		      if(i > nchar - 3) {
120 			  Rf_error("walking passed the end");
121 		      }
122   		      for(j = 1; j <= 4; j++ )
123 			if( (i + j >= nchar) || ( ( s[ i + j ] >= 'a' && s[ i + j ] <= 'f' ) ||
124 			      ( s[ i + j ] >= 'A' && s[ i + j ] <= 'F' ) ||
125 			      ( s[ i + j ] >= '0' && s[ i + j ] <= '9' ) ) == FALSE ) {
126 			    Rf_error("unexpected unicode escaped char '%c'; 4 hex digits should follow the \\u (found %i valid digits)", s[ i + j ], j - 1);
127 			}
128 
129 		    unsigned short unicode;
130 		    char unicode_buf[ 5 ]; /* to hold 4 digit hex (to prevent scanning a 5th digit accidentally */
131 		    strncpy( unicode_buf, s + i + 1, 5 );
132 		    unicode_buf[ 4 ] = '\0';
133 		    sscanf( unicode_buf, "%hx", &unicode);
134 		    cur += UTF8Encode2BytesUnicode( unicode, cur);
135 
136 		    i += 4; /* skip the four digits - actually point to last digit, which is then incremented outside of switch */
137 		    }
138 		    break;
139 		default:
140 		    cur[ 0 ] = s[ i ];
141 		    cur++;
142 		    break;
143 		}
144 
145 		i++; /* move to next char */
146 	    }
147 	}
148 	cur[0] = '\0';
149 
150 	ASSERT(i <= nchar && cur < buf + bufLen)
151 
152 
153 	return(mkCharCE( buf, CE_UTF8 ));
154 }
155 
156 
157 /* R interface to mapString.  Takes a vector of strings and a
158    a same length integer vector of lengths. */
R_mapString(SEXP str,SEXP suggestedLen)159 SEXP R_mapString(SEXP str, SEXP suggestedLen)
160 {
161     int numEls = Rf_length(str), i;
162     SEXP ans;
163     PROTECT(ans = NEW_CHARACTER(numEls));
164     for(i = 0; i < numEls; i++) {
165 
166 	size_t num;
167 	if(Rf_length(suggestedLen))
168 	    num = INTEGER(suggestedLen)[i];
169 	else
170 	    num = 4 * strlen(CHAR(STRING_ELT(str, i)));
171 
172 	char * buf = (char *) R_alloc(num, sizeof(char));
173 	if(!buf) {
174 	    Rf_error("can't allocate memory for working buffer");
175 	}
176 
177 	const char *tmp;
178 	tmp = CHAR(STRING_ELT(str, i));
179 	SET_STRING_ELT(ans, i,
180 		       mapString(tmp, (int)strlen(tmp), buf,
181 				 INTEGER(suggestedLen)[i]));
182     }
183 
184     UNPROTECT(1);
185     return(ans);
186 }
187 
188 #endif
189 
190 
191