1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 #include <stdio.h>
5 #include <string>
6 #include <stdlib.h>
7 #include <errno.h>
8 #include <string.h>
9 #include <iostream>
10 #include <fstream>
11 
12 // We only use U8_* macros, which are entirely inline.
13 #include "unicode/utf8.h"
14 
15 // This contains a codepage and ISO 14882:1998 illegality table.
16 // Use "make gen-table" to rebuild it.
17 #include "cptbl.h"
18 
19 /**
20  * What is this?
21  *
22  * "This" is a preprocessor that makes an attempt to convert fully valid C++11 source code
23  * in utf-8 into something consumable by certain compilers (Solaris, xlC)
24  * which aren't quite standards compliant.
25  *
26  * - u"<unicode>" or u'<unicode>' gets converted to u"\uNNNN" or u'\uNNNN'
27  * - u8"<unicode>" gets converted to "\xAA\xBB\xCC\xDD" etc.
28  *   (some compilers do not support the u8 prefix correctly.)
29  * - if the system is EBCDIC-based, that is used to correct the input characters.
30  *
31  * Usage:
32  *   escapesrc infile.cpp outfile.cpp
33  * Normally this is invoked by the build stage, with a rule such as:
34  *
35  * _%.cpp: $(srcdir)/%.cpp
36  *       @$(BINDIR)/escapesrc$(EXEEXT) $< $@
37  * %.o: _%.cpp
38  *       $(COMPILE.cc) ... $@ $<
39  *
40  * In the Makefiles, SKIP_ESCAPING=YES is used to prevent escapesrc.cpp
41  * from being itself escaped.
42  */
43 
44 
// Whitespace characters, given by their numeric code rather than as character
// literals.
static const char kSPACE = 0x20;
static const char kTAB   = 0x09;
static const char kLF    = 0x0A;
static const char kCR    = 0x0D;

// Shorthand: look a byte up in the cp1047_8859_1 table.
#define cp1047_to_8859(c) cp1047_8859_1[c]

// Name this program was invoked as (set in main(), used in diagnostics).
std::string prog;
56 
57 /**
58  * Give the usual 1-line documentation and exit
59  */
usage()60 void usage() {
61   fprintf(stderr, "%s: usage: %s infile.cpp outfile.cpp\n", prog.c_str(), prog.c_str());
62 }
63 
64 /**
65  * Delete the output file (if any)
66  * We want to delete even if we didn't generate, because it might be stale.
67  */
cleanup(const std::string & outfile)68 int cleanup(const std::string &outfile) {
69   const char *outstr = outfile.c_str();
70   if(outstr && *outstr) {
71     int rc = std::remove(outstr);
72     if(rc == 0) {
73       fprintf(stderr, "%s: deleted %s\n", prog.c_str(), outstr);
74       return 0;
75     } else {
76       if( errno == ENOENT ) {
77         return 0; // File did not exist - no error.
78       } else {
79         perror("std::remove");
80         return 1;
81       }
82     }
83   }
84   return 0;
85 }
86 
87 /**
88  * Skip across any known whitespace.
89  * @param p startpoint
90  * @param e limit
91  * @return first non-whitespace char
92  */
skipws(const char * p,const char * e)93 inline const char *skipws(const char *p, const char *e) {
94   for(;p<e;p++) {
95     switch(*p) {
96     case kSPACE:
97     case kTAB:
98     case kLF:
99     case kCR:
100       break;
101     default:
102       return p; // non ws
103     }
104   }
105   return p;
106 }
107 
/**
 * Append one byte to a string as a C hex escape: a backslash, an 'x',
 * and exactly two uppercase hex digits.
 * @param outstr string to append to
 * @param byte the byte value to encode
 */
void appendByte(std::string &outstr,
                uint8_t byte) {
    static const char kHexDigits[] = "0123456789ABCDEF";
    const unsigned int highNibble = (byte >> 4) & 0x0F;
    const unsigned int lowNibble = byte & 0x0F;
    outstr += '\\';
    outstr += 'x';
    outstr += kHexDigits[highNibble];
    outstr += kHexDigits[lowNibble];
}
119 
120 /**
121  * Append the bytes from 'linestr' into outstr, with escaping
122  * @param outstr the output buffer
123  * @param linestr the input buffer
124  * @param pos in/out: the current char under consideration
125  * @param chars the number of chars to consider
126  * @return true on failure
127  */
appendUtf8(std::string & outstr,const std::string & linestr,size_t & pos,size_t chars)128 bool appendUtf8(std::string &outstr,
129                 const std::string &linestr,
130                 size_t &pos,
131                 size_t chars) {
132   char tmp[9];
133   for(size_t i=0;i<chars;i++) {
134     tmp[i] = linestr[++pos];
135   }
136   tmp[chars] = 0;
137   unsigned int c;
138   sscanf(tmp, "%X", &c);
139   UChar32 ch = c & 0x1FFFFF;
140 
141   // now to append \\x%% etc
142   uint8_t bytesNeeded = U8_LENGTH(ch);
143   if(bytesNeeded == 0) {
144     fprintf(stderr, "Illegal code point U+%X\n", ch);
145     return true;
146   }
147   uint8_t bytes[4];
148   uint8_t *s = bytes;
149   size_t i = 0;
150   U8_APPEND_UNSAFE(s, i, ch);
151   for(size_t t = 0; t<i; t++) {
152     appendByte(outstr, s[t]);
153   }
154   return false;
155 }
156 
157 /**
158  * Fixup u8"x"
159  * @param linestr string to mutate. Already escaped into \u format.
160  * @param origpos beginning, points to 'u8"'
161  * @param pos end, points to "
162  * @return false for no-problem, true for failure!
163  */
fixu8(std::string & linestr,size_t origpos,size_t & endpos)164 bool fixu8(std::string &linestr, size_t origpos, size_t &endpos) {
165   size_t pos = origpos + 3;
166   std::string outstr;
167   outstr += '\"'; // local encoding
168   for(;pos<endpos;pos++) {
169     char c = linestr[pos];
170     if(c == '\\') {
171       char c2 = linestr[++pos];
172       switch(c2) {
173       case '\'':
174       case '"':
175 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
176         c2 = cp1047_to_8859(c2);
177 #endif
178         appendByte(outstr, c2);
179         break;
180       case 'u':
181         appendUtf8(outstr, linestr, pos, 4);
182         break;
183       case 'U':
184         appendUtf8(outstr, linestr, pos, 8);
185         break;
186       }
187     } else {
188 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
189       c = cp1047_to_8859(c);
190 #endif
191       appendByte(outstr, c);
192     }
193   }
194   outstr += ('\"');
195 
196   linestr.replace(origpos, (endpos-origpos+1), outstr);
197 
198   return false; // OK
199 }
200 
201 /**
202  * fix the u"x"/u'x'/u8"x" string at the position
203  * u8'x' is not supported, sorry.
204  * @param linestr the input string
205  * @param pos the position
206  * @return false = no err, true = had err
207  */
fixAt(std::string & linestr,size_t pos)208 bool fixAt(std::string &linestr, size_t pos) {
209   size_t origpos = pos;
210 
211   if(linestr[pos] != 'u') {
212     fprintf(stderr, "Not a 'u'?");
213     return true;
214   }
215 
216   pos++; // past 'u'
217 
218   bool utf8 = false;
219 
220   if(linestr[pos] == '8') { // u8"
221     utf8 = true;
222     pos++;
223   }
224 
225   char quote = linestr[pos];
226 
227   if(quote != '\'' && quote != '\"') {
228     fprintf(stderr, "Quote is '%c' - not sure what to do.\n", quote);
229     return true;
230   }
231 
232   if(quote == '\'' && utf8) {
233     fprintf(stderr, "Cannot do u8'...'\n");
234     return true;
235   }
236 
237   pos ++;
238 
239   //printf("u%c…%c\n", quote, quote);
240 
241   for(; pos < linestr.size(); pos++) {
242     if(linestr[pos] == quote) {
243       if(utf8) {
244         return fixu8(linestr, origpos, pos); // fix u8"..."
245       } else {
246         return false; // end of quote
247       }
248     }
249     if(linestr[pos] == '\\') {
250       pos++;
251       if(linestr[pos] == quote) continue; // quoted quote
252       if(linestr[pos] == 'u') continue; // for now ... unicode escape
253       if(linestr[pos] == '\\') continue;
254       // some other escape… ignore
255     } else {
256       size_t old_pos = pos;
257       int32_t i = pos;
258 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
259       // mogrify 1-4 bytes from 1047 'back' to utf-8
260       char old_byte = linestr[pos];
261       linestr[pos] = cp1047_to_8859(linestr[pos]);
262       // how many more?
263       int32_t trail = U8_COUNT_TRAIL_BYTES(linestr[pos]);
264       for(size_t pos2 = pos+1; trail>0; pos2++,trail--) {
265         linestr[pos2] = cp1047_to_8859(linestr[pos2]);
266         if(linestr[pos2] == 0x0A) {
267           linestr[pos2] = 0x85; // NL is ambiguous here
268         }
269       }
270 #endif
271 
272       // Proceed to decode utf-8
273       const uint8_t *s = (const uint8_t*) (linestr.c_str());
274       int32_t length = linestr.size();
275       UChar32 c;
276       if(U8_IS_SINGLE((uint8_t)s[i]) && oldIllegal[s[i]]) {
277 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
278         linestr[pos] = old_byte; // put it back
279 #endif
280         continue; // single code point not previously legal for \u escaping
281       }
282 
283       // otherwise, convert it to \u / \U
284       {
285         U8_NEXT(s, i, length, c);
286       }
287       if(c<0) {
288         fprintf(stderr, "Illegal utf-8 sequence at Column: %d\n", (int)old_pos);
289         fprintf(stderr, "Line: >>%s<<\n", linestr.c_str());
290         return true;
291       }
292 
293       size_t seqLen = (i-pos);
294 
295       //printf("U+%04X pos %d [len %d]\n", c, pos, seqLen);fflush(stdout);
296 
297       char newSeq[20];
298       if( c <= 0xFFFF) {
299         sprintf(newSeq, "\\u%04X", c);
300       } else {
301         sprintf(newSeq, "\\U%08X", c);
302       }
303       linestr.replace(pos, seqLen, newSeq);
304       pos += strlen(newSeq) - 1;
305     }
306   }
307 
308   return false;
309 }
310 
311 /**
312  * Fixup an entire line
313  * false = no err
314  * true = had err
315  * @param no the line number (not used)
316  * @param linestr the string to fix
317  * @return true if any err, else false
318  */
fixLine(int,std::string & linestr)319 bool fixLine(int /*no*/, std::string &linestr) {
320   const char *line = linestr.c_str();
321   size_t len = linestr.size();
322 
323   // no u' in the line?
324   if(!strstr(line, "u'") && !strstr(line, "u\"") && !strstr(line, "u8\"")) {
325     return false; // Nothing to do. No u' or u" detected
326   }
327 
328   // start from the end and find all u" cases
329   size_t pos = len = linestr.size();
330   if(len>INT32_MAX/2) {
331     return true;
332   }
333   while((pos>0) && (pos = linestr.rfind("u\"", pos)) != std::string::npos) {
334     //printf("found doublequote at %d\n", pos);
335     if(fixAt(linestr, pos)) return true;
336     if(pos == 0) break;
337     pos--;
338   }
339 
340   // reset and find all u' cases
341   pos = len = linestr.size();
342   while((pos>0) && (pos = linestr.rfind("u'", pos)) != std::string::npos) {
343     //printf("found singlequote at %d\n", pos);
344     if(fixAt(linestr, pos)) return true;
345     if(pos == 0) break;
346     pos--;
347   }
348 
349   // reset and find all u8" cases
350   pos = len = linestr.size();
351   while((pos>0) && (pos = linestr.rfind("u8\"", pos)) != std::string::npos) {
352     if(fixAt(linestr, pos)) return true;
353     if(pos == 0) break;
354     pos--;
355   }
356 
357   //fprintf(stderr, "%d - fixed\n", no);
358   return false;
359 }
360 
361 /**
362  * Convert a whole file
363  * @param infile
364  * @param outfile
365  * @return 1 on err, 0 otherwise
366  */
convert(const std::string & infile,const std::string & outfile)367 int convert(const std::string &infile, const std::string &outfile) {
368   fprintf(stderr, "escapesrc: %s -> %s\n", infile.c_str(), outfile.c_str());
369 
370   std::ifstream inf;
371 
372   inf.open(infile.c_str(), std::ios::in);
373 
374   if(!inf.is_open()) {
375     fprintf(stderr, "%s: could not open input file %s\n", prog.c_str(), infile.c_str());
376     cleanup(outfile);
377     return 1;
378   }
379 
380   std::ofstream outf;
381 
382   outf.open(outfile.c_str(), std::ios::out);
383 
384   if(!outf.is_open()) {
385     fprintf(stderr, "%s: could not open output file %s\n", prog.c_str(), outfile.c_str());
386     return 1;
387   }
388 
389   // TODO: any platform variations of #line?
390   outf << "#line 1 \"" << infile << "\"" << '\n';
391 
392   int no = 0;
393   std::string linestr;
394   while( getline( inf, linestr)) {
395     no++;
396     if(fixLine(no, linestr)) {
397       goto fail;
398     }
399     outf << linestr << '\n';
400   }
401 
402   if(inf.eof()) {
403     return 0;
404   }
405 fail:
406   outf.close();
407   fprintf(stderr, "%s:%d: Fixup failed by %s\n", infile.c_str(), no, prog.c_str());
408   cleanup(outfile);
409   return 1;
410 }
411 
412 /**
413  * Main function
414  */
main(int argc,const char * argv[])415 int main(int argc, const char *argv[]) {
416   prog = argv[0];
417 
418   if(argc != 3) {
419     usage();
420     return 1;
421   }
422 
423   std::string infile = argv[1];
424   std::string outfile = argv[2];
425 
426   return convert(infile, outfile);
427 }
428