1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *   Copyright (C) 2011-2014, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 *   file name:  ppucd.cpp
9 *   encoding:   UTF-8
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2011dec11
14 *   created by: Markus W. Scherer
15 */
16 
17 #include "unicode/utypes.h"
18 #include "unicode/uchar.h"
19 #include "charstr.h"
20 #include "cstring.h"
21 #include "ppucd.h"
22 #include "uassert.h"
23 #include "uparse.h"
24 
25 #include <stdio.h>
26 #include <string.h>
27 
28 U_NAMESPACE_BEGIN
29 
~PropertyNames()30 PropertyNames::~PropertyNames() {}
31 
32 int32_t
getPropertyEnum(const char * name) const33 PropertyNames::getPropertyEnum(const char *name) const {
34     return u_getPropertyEnum(name);
35 }
36 
37 int32_t
getPropertyValueEnum(int32_t property,const char * name) const38 PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
39     return u_getPropertyValueEnum((UProperty)property, name);
40 }
41 
UniProps()42 UniProps::UniProps()
43         : start(U_SENTINEL), end(U_SENTINEL),
44           bmg(U_SENTINEL), bpb(U_SENTINEL),
45           scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL),
46           digitValue(-1), numericValue(NULL),
47           name(NULL), nameAlias(NULL) {
48     memset(binProps, 0, sizeof(binProps));
49     memset(intProps, 0, sizeof(intProps));
50     memset(age, 0, 4);
51 }
52 
~UniProps()53 UniProps::~UniProps() {}
54 
55 const int32_t PreparsedUCD::kNumLineBuffers;
56 
PreparsedUCD(const char * filename,UErrorCode & errorCode)57 PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode)
58         : icuPnames(new PropertyNames()), pnames(icuPnames),
59           file(NULL),
60           defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
61           lineNumber(0),
62           lineType(NO_LINE),
63           fieldLimit(NULL), lineLimit(NULL) {
64     if(U_FAILURE(errorCode)) { return; }
65 
66     if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
67         filename=NULL;
68         file=stdin;
69     } else {
70         file=fopen(filename, "r");
71     }
72     if(file==NULL) {
73         perror("error opening preparsed UCD");
74         fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\"");
75         errorCode=U_FILE_ACCESS_ERROR;
76         return;
77     }
78 
79     memset(ucdVersion, 0, 4);
80     lines[0][0]=0;
81 }
82 
~PreparsedUCD()83 PreparsedUCD::~PreparsedUCD() {
84     if(file!=stdin) {
85         fclose(file);
86     }
87     delete icuPnames;
88 }
89 
// First-field keywords of the preparsed UCD lines, indexed by LineType value
// (same order as the LineType values).
// The two leading NULL entries are never compared against input:
// readLine() starts matching at EMPTY_LINE+1.
static const char *lineTypeStrings[]={
    NULL,             // no keyword
    NULL,             // no keyword
    "ucd",
    "property",
    "binary",
    "value",
    "defaults",
    "block",
    "cp",
    "unassigned",
    "algnamesrange"
};
104 
105 PreparsedUCD::LineType
readLine(UErrorCode & errorCode)106 PreparsedUCD::readLine(UErrorCode &errorCode) {
107     if(U_FAILURE(errorCode)) { return NO_LINE; }
108     // Select the next available line buffer.
109     while(!isLineBufferAvailable(lineIndex)) {
110         ++lineIndex;
111         if (lineIndex == kNumLineBuffers) {
112             lineIndex = 0;
113         }
114     }
115     char *line=lines[lineIndex];
116     *line=0;
117     lineLimit=fieldLimit=line;
118     lineType=NO_LINE;
119     char *result=fgets(line, sizeof(lines[0]), file);
120     if(result==NULL) {
121         if(ferror(file)) {
122             perror("error reading preparsed UCD");
123             fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber);
124             errorCode=U_FILE_ACCESS_ERROR;
125         }
126         return NO_LINE;
127     }
128     ++lineNumber;
129     if(*line=='#') {
130         fieldLimit=strchr(line, 0);
131         return lineType=EMPTY_LINE;
132     }
133     // Remove trailing /r/n.
134     char c;
135     char *limit=strchr(line, 0);
136     while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; }
137     // Remove trailing white space.
138     while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; }
139     *limit=0;
140     lineLimit=limit;
141     if(line==limit) {
142         fieldLimit=limit;
143         return lineType=EMPTY_LINE;
144     }
145     // Split by ';'.
146     char *semi=line;
147     while((semi=strchr(semi, ';'))!=NULL) { *semi++=0; }
148     fieldLimit=strchr(line, 0);
149     // Determine the line type.
150     int32_t type;
151     for(type=EMPTY_LINE+1;; ++type) {
152         if(type==LINE_TYPE_COUNT) {
153             fprintf(stderr,
154                     "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
155                     line, (long)lineNumber);
156             errorCode=U_PARSE_ERROR;
157             return NO_LINE;
158         }
159         if(0==strcmp(line, lineTypeStrings[type])) {
160             break;
161         }
162     }
163     lineType=(LineType)type;
164     if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) {
165         u_versionFromString(ucdVersion, fieldLimit+1);
166     }
167     return lineType;
168 }
169 
170 const char *
firstField()171 PreparsedUCD::firstField() {
172     char *field=lines[lineIndex];
173     fieldLimit=strchr(field, 0);
174     return field;
175 }
176 
177 const char *
nextField()178 PreparsedUCD::nextField() {
179     if(fieldLimit==lineLimit) { return NULL; }
180     char *field=fieldLimit+1;
181     fieldLimit=strchr(field, 0);
182     return field;
183 }
184 
185 const UniProps *
getProps(UnicodeSet & newValues,UErrorCode & errorCode)186 PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) {
187     if(U_FAILURE(errorCode)) { return NULL; }
188     newValues.clear();
189     if(!lineHasPropertyValues()) {
190         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
191         return NULL;
192     }
193     firstField();
194     const char *field=nextField();
195     if(field==NULL) {
196         // No range field after the type.
197         fprintf(stderr,
198                 "error in preparsed UCD: missing default/block/cp range field "
199                 "(no second field) on line %ld\n",
200                 (long)lineNumber);
201         errorCode=U_PARSE_ERROR;
202         return NULL;
203     }
204     UChar32 start, end;
205     if(!parseCodePointRange(field, start, end, errorCode)) { return NULL; }
206     UniProps *props;
207     UBool insideBlock=FALSE;  // TRUE if cp or unassigned range inside the block range.
208     switch(lineType) {
209     case DEFAULTS_LINE:
210         // Should occur before any block/cp/unassigned line.
211         if(blockLineIndex>=0) {
212             fprintf(stderr,
213                     "error in preparsed UCD: default line %ld after one or more block lines\n",
214                     (long)lineNumber);
215             errorCode=U_PARSE_ERROR;
216             return NULL;
217         }
218         if(defaultLineIndex>=0) {
219             fprintf(stderr,
220                     "error in preparsed UCD: second line with default properties on line %ld\n",
221                     (long)lineNumber);
222             errorCode=U_PARSE_ERROR;
223             return NULL;
224         }
225         if(start!=0 || end!=0x10ffff) {
226             fprintf(stderr,
227                     "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
228                     field, (long)lineNumber);
229             errorCode=U_PARSE_ERROR;
230             return NULL;
231         }
232         props=&defaultProps;
233         defaultLineIndex=lineIndex;
234         break;
235     case BLOCK_LINE:
236         blockProps=defaultProps;  // Block inherits default properties.
237         props=&blockProps;
238         blockLineIndex=lineIndex;
239         break;
240     case CP_LINE:
241     case UNASSIGNED_LINE:
242         if(blockProps.start<=start && end<=blockProps.end) {
243             insideBlock=TRUE;
244             if(lineType==CP_LINE) {
245                 // Code point range fully inside the last block inherits the block properties.
246                 cpProps=blockProps;
247             } else {
248                 // Unassigned line inside the block is based on default properties
249                 // which override block properties.
250                 cpProps=defaultProps;
251                 newValues=blockValues;
252                 // Except, it inherits the one blk=Block property.
253                 int32_t blkIndex=UCHAR_BLOCK-UCHAR_INT_START;
254                 cpProps.intProps[blkIndex]=blockProps.intProps[blkIndex];
255                 newValues.remove((UChar32)UCHAR_BLOCK);
256             }
257         } else if(start>blockProps.end || end<blockProps.start) {
258             // Code point range fully outside the last block inherits the default properties.
259             cpProps=defaultProps;
260         } else {
261             // Code point range partially overlapping with the last block is illegal.
262             fprintf(stderr,
263                     "error in preparsed UCD: cp range %s on line %ld only "
264                     "partially overlaps with block range %04lX..%04lX\n",
265                     field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end);
266             errorCode=U_PARSE_ERROR;
267             return NULL;
268         }
269         props=&cpProps;
270         break;
271     default:
272         // Will not occur because of the range check above.
273         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
274         return NULL;
275     }
276     props->start=start;
277     props->end=end;
278     while((field=nextField())!=NULL) {
279         if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; }
280     }
281     if(lineType==BLOCK_LINE) {
282         blockValues=newValues;
283     } else if(lineType==UNASSIGNED_LINE && insideBlock) {
284         // Unset newValues for values that are the same as the block values.
285         for(int32_t prop=0; prop<UCHAR_BINARY_LIMIT; ++prop) {
286             if(newValues.contains(prop) && cpProps.binProps[prop]==blockProps.binProps[prop]) {
287                 newValues.remove(prop);
288             }
289         }
290         for(int32_t prop=UCHAR_INT_START; prop<UCHAR_INT_LIMIT; ++prop) {
291             int32_t index=prop-UCHAR_INT_START;
292             if(newValues.contains(prop) && cpProps.intProps[index]==blockProps.intProps[index]) {
293                 newValues.remove(prop);
294             }
295         }
296     }
297     return props;
298 }
299 
300 static const struct {
301     const char *name;
302     int32_t prop;
303 } ppucdProperties[]={
304     { "Name_Alias", PPUCD_NAME_ALIAS },
305     { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS },
306     { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING }
307 };
308 
309 // Returns TRUE for "ok to continue parsing fields".
310 UBool
parseProperty(UniProps & props,const char * field,UnicodeSet & newValues,UErrorCode & errorCode)311 PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
312                             UErrorCode &errorCode) {
313     CharString pBuffer;
314     const char *p=field;
315     const char *v=strchr(p, '=');
316     int binaryValue;
317     if(*p=='-') {
318         if(v!=NULL) {
319             fprintf(stderr,
320                     "error in preparsed UCD: mix of binary-property-no and "
321                     "enum-property syntax '%s' on line %ld\n",
322                     field, (long)lineNumber);
323             errorCode=U_PARSE_ERROR;
324             return FALSE;
325         }
326         binaryValue=0;
327         ++p;
328     } else if(v==NULL) {
329         binaryValue=1;
330     } else {
331         binaryValue=-1;
332         // Copy out the property name rather than modifying the field (writing a NUL).
333         pBuffer.append(p, (int32_t)(v-p), errorCode);
334         p=pBuffer.data();
335         ++v;
336     }
337     int32_t prop=pnames->getPropertyEnum(p);
338     if(prop<0) {
339         for(int32_t i=0;; ++i) {
340             if(i==UPRV_LENGTHOF(ppucdProperties)) {
341                 // Ignore unknown property names.
342                 return TRUE;
343             }
344             if(0==uprv_stricmp(p, ppucdProperties[i].name)) {
345                 prop=ppucdProperties[i].prop;
346                 U_ASSERT(prop>=0);
347                 break;
348             }
349         }
350     }
351     if(prop<UCHAR_BINARY_LIMIT) {
352         if(binaryValue>=0) {
353             props.binProps[prop]=(UBool)binaryValue;
354         } else {
355             // No binary value for a binary property.
356             fprintf(stderr,
357                     "error in preparsed UCD: enum-property syntax '%s' "
358                     "for binary property on line %ld\n",
359                     field, (long)lineNumber);
360             errorCode=U_PARSE_ERROR;
361         }
362     } else if(binaryValue>=0) {
363         // Binary value for a non-binary property.
364         fprintf(stderr,
365                 "error in preparsed UCD: binary-property syntax '%s' "
366                 "for non-binary property on line %ld\n",
367                 field, (long)lineNumber);
368         errorCode=U_PARSE_ERROR;
369     } else if (prop < UCHAR_INT_START) {
370         fprintf(stderr,
371                 "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
372                 prop, (long)lineNumber);
373         errorCode=U_PARSE_ERROR;
374     } else if(prop<UCHAR_INT_LIMIT) {
375         int32_t value=pnames->getPropertyValueEnum(prop, v);
376         if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) {
377             // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
378             char *end;
379             unsigned long ccc=uprv_strtoul(v, &end, 10);
380             if(v<end && *end==0 && ccc<=254) {
381                 value=(int32_t)ccc;
382             }
383         }
384         if(value==UCHAR_INVALID_CODE) {
385             fprintf(stderr,
386                     "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
387                     field, (long)lineNumber);
388             errorCode=U_PARSE_ERROR;
389         } else {
390             props.intProps[prop-UCHAR_INT_START]=value;
391         }
392     } else if(*v=='<') {
393         // Do not parse default values like <code point>, just set null values.
394         switch(prop) {
395         case UCHAR_BIDI_MIRRORING_GLYPH:
396             props.bmg=U_SENTINEL;
397             break;
398         case UCHAR_BIDI_PAIRED_BRACKET:
399             props.bpb=U_SENTINEL;
400             break;
401         case UCHAR_SIMPLE_CASE_FOLDING:
402             props.scf=U_SENTINEL;
403             break;
404         case UCHAR_SIMPLE_LOWERCASE_MAPPING:
405             props.slc=U_SENTINEL;
406             break;
407         case UCHAR_SIMPLE_TITLECASE_MAPPING:
408             props.stc=U_SENTINEL;
409             break;
410         case UCHAR_SIMPLE_UPPERCASE_MAPPING:
411             props.suc=U_SENTINEL;
412             break;
413         case UCHAR_CASE_FOLDING:
414             props.cf.remove();
415             break;
416         case UCHAR_LOWERCASE_MAPPING:
417             props.lc.remove();
418             break;
419         case UCHAR_TITLECASE_MAPPING:
420             props.tc.remove();
421             break;
422         case UCHAR_UPPERCASE_MAPPING:
423             props.uc.remove();
424             break;
425         case UCHAR_SCRIPT_EXTENSIONS:
426             props.scx.clear();
427             break;
428         default:
429             fprintf(stderr,
430                     "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
431                     field, (long)lineNumber);
432             errorCode=U_PARSE_ERROR;
433         }
434     } else {
435         char c;
436         switch(prop) {
437         case UCHAR_NUMERIC_VALUE:
438             props.numericValue=v;
439             c=*v;
440             if('0'<=c && c<='9' && v[1]==0) {
441                 props.digitValue=c-'0';
442             } else {
443                 props.digitValue=-1;
444             }
445             break;
446         case UCHAR_NAME:
447             props.name=v;
448             break;
449         case UCHAR_AGE:
450             u_versionFromString(props.age, v);  // Writes 0.0.0.0 if v is not numeric.
451             break;
452         case UCHAR_BIDI_MIRRORING_GLYPH:
453             props.bmg=parseCodePoint(v, errorCode);
454             break;
455         case UCHAR_BIDI_PAIRED_BRACKET:
456             props.bpb=parseCodePoint(v, errorCode);
457             break;
458         case UCHAR_SIMPLE_CASE_FOLDING:
459             props.scf=parseCodePoint(v, errorCode);
460             break;
461         case UCHAR_SIMPLE_LOWERCASE_MAPPING:
462             props.slc=parseCodePoint(v, errorCode);
463             break;
464         case UCHAR_SIMPLE_TITLECASE_MAPPING:
465             props.stc=parseCodePoint(v, errorCode);
466             break;
467         case UCHAR_SIMPLE_UPPERCASE_MAPPING:
468             props.suc=parseCodePoint(v, errorCode);
469             break;
470         case UCHAR_CASE_FOLDING:
471             parseString(v, props.cf, errorCode);
472             break;
473         case UCHAR_LOWERCASE_MAPPING:
474             parseString(v, props.lc, errorCode);
475             break;
476         case UCHAR_TITLECASE_MAPPING:
477             parseString(v, props.tc, errorCode);
478             break;
479         case UCHAR_UPPERCASE_MAPPING:
480             parseString(v, props.uc, errorCode);
481             break;
482         case PPUCD_NAME_ALIAS:
483             props.nameAlias=v;
484             break;
485         case PPUCD_CONDITIONAL_CASE_MAPPINGS:
486         case PPUCD_TURKIC_CASE_FOLDING:
487             // No need to parse their values: They are hardcoded in the runtime library.
488             break;
489         case UCHAR_SCRIPT_EXTENSIONS:
490             parseScriptExtensions(v, props.scx, errorCode);
491             break;
492         default:
493             // Ignore unhandled properties.
494             return TRUE;
495         }
496     }
497     if(U_SUCCESS(errorCode)) {
498         newValues.add((UChar32)prop);
499         return TRUE;
500     } else {
501         return FALSE;
502     }
503 }
504 
505 UBool
getRangeForAlgNames(UChar32 & start,UChar32 & end,UErrorCode & errorCode)506 PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
507     if(U_FAILURE(errorCode)) { return FALSE; }
508     if(lineType!=ALG_NAMES_RANGE_LINE) {
509         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
510         return FALSE;
511     }
512     firstField();
513     const char *field=nextField();
514     if(field==NULL) {
515         // No range field after the type.
516         fprintf(stderr,
517                 "error in preparsed UCD: missing algnamesrange range field "
518                 "(no second field) on line %ld\n",
519                 (long)lineNumber);
520         errorCode=U_PARSE_ERROR;
521         return FALSE;
522     }
523     return parseCodePointRange(field, start, end, errorCode);
524 }
525 
526 UChar32
parseCodePoint(const char * s,UErrorCode & errorCode)527 PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) {
528     char *end;
529     uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16);
530     if(end<=s || *end!=0 || value>=0x110000) {
531         fprintf(stderr,
532                 "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
533                 s, (long)lineNumber);
534         errorCode=U_PARSE_ERROR;
535         return U_SENTINEL;
536     }
537     return (UChar32)value;
538 }
539 
540 UBool
parseCodePointRange(const char * s,UChar32 & start,UChar32 & end,UErrorCode & errorCode)541 PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
542     uint32_t st, e;
543     u_parseCodePointRange(s, &st, &e, &errorCode);
544     if(U_FAILURE(errorCode)) {
545         fprintf(stderr,
546                 "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
547                 s, (long)lineNumber);
548         return FALSE;
549     }
550     start=(UChar32)st;
551     end=(UChar32)e;
552     return TRUE;
553 }
554 
555 void
parseString(const char * s,UnicodeString & uni,UErrorCode & errorCode)556 PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) {
557     UChar *buffer=toUCharPtr(uni.getBuffer(-1));
558     int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
559     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
560         errorCode=U_ZERO_ERROR;
561         uni.releaseBuffer(0);
562         buffer=toUCharPtr(uni.getBuffer(length));
563         length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
564     }
565     uni.releaseBuffer(length);
566     if(U_FAILURE(errorCode)) {
567         fprintf(stderr,
568                 "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
569                 s, (long)lineNumber);
570     }
571 }
572 
573 void
parseScriptExtensions(const char * s,UnicodeSet & scx,UErrorCode & errorCode)574 PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
575     if(U_FAILURE(errorCode)) { return; }
576     scx.clear();
577     CharString scString;
578     for(;;) {
579         const char *scs;
580         const char *scLimit=strchr(s, ' ');
581         if(scLimit!=NULL) {
582             scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data();
583             if(U_FAILURE(errorCode)) { return; }
584         } else {
585             scs=s;
586         }
587         int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs);
588         if(script==UCHAR_INVALID_CODE) {
589             fprintf(stderr,
590                     "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
591                     scs, (long)lineNumber);
592             errorCode=U_PARSE_ERROR;
593             return;
594         } else if(scx.contains(script)) {
595             fprintf(stderr,
596                     "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
597                     scs, (long)lineNumber);
598             errorCode=U_PARSE_ERROR;
599             return;
600         } else {
601             scx.add(script);
602         }
603         if(scLimit!=NULL) {
604             s=scLimit+1;
605         } else {
606             break;
607         }
608     }
609     if(scx.isEmpty()) {
610         fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
611         errorCode=U_PARSE_ERROR;
612     }
613 }
614 
615 U_NAMESPACE_END
616