1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *   Copyright (C) 2011-2014, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 *   file name:  ppucd.cpp
9 *   encoding:   UTF-8
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2011dec11
14 *   created by: Markus W. Scherer
15 */
16 
17 #include "unicode/utypes.h"
18 #include "unicode/uchar.h"
19 #include "charstr.h"
20 #include "cstring.h"
21 #include "ppucd.h"
22 #include "uassert.h"
23 #include "uparse.h"
24 
25 #include <stdio.h>
26 #include <string.h>
27 
28 U_NAMESPACE_BEGIN
29 
~PropertyNames()30 PropertyNames::~PropertyNames() {}
31 
32 // TODO: Create a concrete subclass for the default PropertyNames implementation
33 // using the ICU library built-in property names API & data.
// Currently only the genprops tool uses PreparsedUCD, and provides its own
// PropertyNames implementation using its just-built property names data and its own code.
36 // At some point, we should use PreparsedUCD in tests, and then we will need the
37 // default implementation somewhere.
38 #if 0
39 int32_t
40 PropertyNames::getPropertyEnum(const char *name) const {
41     return u_getPropertyEnum(name);
42 }
43 
44 int32_t
45 PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
46     return u_getPropertyValueEnum((UProperty)property, name);
47 }
48 #endif
49 
UniProps()50 UniProps::UniProps()
51         : start(U_SENTINEL), end(U_SENTINEL),
52           bmg(U_SENTINEL), bpb(U_SENTINEL),
53           scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL),
54           digitValue(-1), numericValue(NULL),
55           name(NULL), nameAlias(NULL) {
56     memset(binProps, 0, sizeof(binProps));
57     memset(intProps, 0, sizeof(intProps));
58     memset(age, 0, 4);
59 }
60 
~UniProps()61 UniProps::~UniProps() {}
62 
// Namespace-scope definition of the static member constant.
// There is no initializer here, so its value is presumably supplied in the
// class declaration -- confirm in ppucd.h.
const int32_t PreparsedUCD::kNumLineBuffers;
64 
PreparsedUCD(const char * filename,UErrorCode & errorCode)65 PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode)
66         : pnames(nullptr),
67           file(NULL),
68           defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
69           lineNumber(0),
70           lineType(NO_LINE),
71           fieldLimit(NULL), lineLimit(NULL) {
72     if(U_FAILURE(errorCode)) { return; }
73 
74     if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
75         filename=NULL;
76         file=stdin;
77     } else {
78         file=fopen(filename, "r");
79     }
80     if(file==NULL) {
81         perror("error opening preparsed UCD");
82         fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\"");
83         errorCode=U_FILE_ACCESS_ERROR;
84         return;
85     }
86 
87     memset(ucdVersion, 0, 4);
88     lines[0][0]=0;
89 }
90 
~PreparsedUCD()91 PreparsedUCD::~PreparsedUCD() {
92     if(file!=stdin) {
93         fclose(file);
94     }
95 }
96 
// Line type keywords, indexed by LineType value (same order as the LineType values).
// readLine() compares the first field of each line against the entries
// starting at index EMPTY_LINE+1, so the leading NULL entries are never
// passed to strcmp(); presumably they stand for NO_LINE and EMPTY_LINE,
// which have no keyword -- confirm against the enum in ppucd.h.
static const char *lineTypeStrings[]={
    NULL,
    NULL,
    "ucd",
    "property",
    "binary",
    "value",
    "defaults",
    "block",
    "cp",
    "unassigned",
    "algnamesrange"
};
111 
112 PreparsedUCD::LineType
readLine(UErrorCode & errorCode)113 PreparsedUCD::readLine(UErrorCode &errorCode) {
114     if(U_FAILURE(errorCode)) { return NO_LINE; }
115     // Select the next available line buffer.
116     while(!isLineBufferAvailable(lineIndex)) {
117         ++lineIndex;
118         if (lineIndex == kNumLineBuffers) {
119             lineIndex = 0;
120         }
121     }
122     char *line=lines[lineIndex];
123     *line=0;
124     lineLimit=fieldLimit=line;
125     lineType=NO_LINE;
126     char *result=fgets(line, sizeof(lines[0]), file);
127     if(result==NULL) {
128         if(ferror(file)) {
129             perror("error reading preparsed UCD");
130             fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber);
131             errorCode=U_FILE_ACCESS_ERROR;
132         }
133         return NO_LINE;
134     }
135     ++lineNumber;
136     if(*line=='#') {
137         fieldLimit=strchr(line, 0);
138         return lineType=EMPTY_LINE;
139     }
140     // Remove trailing /r/n.
141     char c;
142     char *limit=strchr(line, 0);
143     while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; }
144     // Remove trailing white space.
145     while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; }
146     *limit=0;
147     lineLimit=limit;
148     if(line==limit) {
149         fieldLimit=limit;
150         return lineType=EMPTY_LINE;
151     }
152     // Split by ';'.
153     char *semi=line;
154     while((semi=strchr(semi, ';'))!=NULL) { *semi++=0; }
155     fieldLimit=strchr(line, 0);
156     // Determine the line type.
157     int32_t type;
158     for(type=EMPTY_LINE+1;; ++type) {
159         if(type==LINE_TYPE_COUNT) {
160             fprintf(stderr,
161                     "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
162                     line, (long)lineNumber);
163             errorCode=U_PARSE_ERROR;
164             return NO_LINE;
165         }
166         if(0==strcmp(line, lineTypeStrings[type])) {
167             break;
168         }
169     }
170     lineType=(LineType)type;
171     if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) {
172         u_versionFromString(ucdVersion, fieldLimit+1);
173     }
174     return lineType;
175 }
176 
177 const char *
firstField()178 PreparsedUCD::firstField() {
179     char *field=lines[lineIndex];
180     fieldLimit=strchr(field, 0);
181     return field;
182 }
183 
184 const char *
nextField()185 PreparsedUCD::nextField() {
186     if(fieldLimit==lineLimit) { return NULL; }
187     char *field=fieldLimit+1;
188     fieldLimit=strchr(field, 0);
189     return field;
190 }
191 
// Parses the property values of the current defaults/block/cp/unassigned line.
//
// newValues: cleared on entry; on return it holds the property enum constants
//            whose values were set by this line (see parseProperty()), adjusted
//            for unassigned lines inside a block as described below.
// errorCode: in/out ICU error code. Set to U_ILLEGAL_ARGUMENT_ERROR if the
//            current line has no property values, and to U_PARSE_ERROR for
//            malformed or out-of-order lines (a diagnostic goes to stderr).
//
// Returns a pointer to the internal defaultProps, blockProps or cpProps object
// that was filled in, or NULL on error.
const UniProps *
PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return NULL; }
    newValues.clear();
    if(!lineHasPropertyValues()) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    // Skip the line type keyword; the second field is the code point range.
    firstField();
    const char *field=nextField();
    if(field==NULL) {
        // No range field after the type.
        fprintf(stderr,
                "error in preparsed UCD: missing default/block/cp range field "
                "(no second field) on line %ld\n",
                (long)lineNumber);
        errorCode=U_PARSE_ERROR;
        return NULL;
    }
    UChar32 start, end;
    if(!parseCodePointRange(field, start, end, errorCode)) { return NULL; }
    UniProps *props;
    UBool insideBlock=FALSE;  // TRUE if cp or unassigned range inside the block range.
    // Choose the target object and seed it with the values it inherits.
    switch(lineType) {
    case DEFAULTS_LINE:
        // Should occur before any block/cp/unassigned line.
        if(blockLineIndex>=0) {
            fprintf(stderr,
                    "error in preparsed UCD: default line %ld after one or more block lines\n",
                    (long)lineNumber);
            errorCode=U_PARSE_ERROR;
            return NULL;
        }
        if(defaultLineIndex>=0) {
            fprintf(stderr,
                    "error in preparsed UCD: second line with default properties on line %ld\n",
                    (long)lineNumber);
            errorCode=U_PARSE_ERROR;
            return NULL;
        }
        if(start!=0 || end!=0x10ffff) {
            fprintf(stderr,
                    "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
                    field, (long)lineNumber);
            errorCode=U_PARSE_ERROR;
            return NULL;
        }
        props=&defaultProps;
        // Remember which line buffer holds the defaults line.
        defaultLineIndex=lineIndex;
        break;
    case BLOCK_LINE:
        blockProps=defaultProps;  // Block inherits default properties.
        props=&blockProps;
        // Remember which line buffer holds the current block line.
        blockLineIndex=lineIndex;
        break;
    case CP_LINE:
    case UNASSIGNED_LINE:
        if(blockProps.start<=start && end<=blockProps.end) {
            insideBlock=TRUE;
            if(lineType==CP_LINE) {
                // Code point range fully inside the last block inherits the block properties.
                cpProps=blockProps;
            } else {
                // Unassigned line inside the block is based on default properties
                // which override block properties.
                cpProps=defaultProps;
                newValues=blockValues;
                // Except, it inherits the one blk=Block property.
                int32_t blkIndex=UCHAR_BLOCK-UCHAR_INT_START;
                cpProps.intProps[blkIndex]=blockProps.intProps[blkIndex];
                newValues.remove((UChar32)UCHAR_BLOCK);
            }
        } else if(start>blockProps.end || end<blockProps.start) {
            // Code point range fully outside the last block inherits the default properties.
            cpProps=defaultProps;
        } else {
            // Code point range partially overlapping with the last block is illegal.
            fprintf(stderr,
                    "error in preparsed UCD: cp range %s on line %ld only "
                    "partially overlaps with block range %04lX..%04lX\n",
                    field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end);
            errorCode=U_PARSE_ERROR;
            return NULL;
        }
        props=&cpProps;
        break;
    default:
        // Will not occur because of the range check above.
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    props->start=start;
    props->end=end;
    // Apply each remaining field (one property each) on top of the inherited values.
    while((field=nextField())!=NULL) {
        if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; }
    }
    if(lineType==BLOCK_LINE) {
        // Keep the set of properties set by the block line for later unassigned lines.
        blockValues=newValues;
    } else if(lineType==UNASSIGNED_LINE && insideBlock) {
        // Unset newValues for values that are the same as the block values.
        for(int32_t prop=0; prop<UCHAR_BINARY_LIMIT; ++prop) {
            if(newValues.contains(prop) && cpProps.binProps[prop]==blockProps.binProps[prop]) {
                newValues.remove(prop);
            }
        }
        for(int32_t prop=UCHAR_INT_START; prop<UCHAR_INT_LIMIT; ++prop) {
            int32_t index=prop-UCHAR_INT_START;
            if(newValues.contains(prop) && cpProps.intProps[index]==blockProps.intProps[index]) {
                newValues.remove(prop);
            }
        }
    }
    return props;
}
306 
// Extra property names mapped to PPUCD_* constants.
// parseProperty() consults this table (comparing names with uprv_stricmp())
// only when pnames->getPropertyEnum() returns a negative value for a name.
static const struct {
    const char *name;  // Property name as written in the file.
    int32_t prop;      // Corresponding PPUCD_* constant.
} ppucdProperties[]={
    { "Name_Alias", PPUCD_NAME_ALIAS },
    { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS },
    { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING }
};
315 
316 // Returns TRUE for "ok to continue parsing fields".
317 UBool
parseProperty(UniProps & props,const char * field,UnicodeSet & newValues,UErrorCode & errorCode)318 PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
319                             UErrorCode &errorCode) {
320     CharString pBuffer;
321     const char *p=field;
322     const char *v=strchr(p, '=');
323     int binaryValue;
324     if(*p=='-') {
325         if(v!=NULL) {
326             fprintf(stderr,
327                     "error in preparsed UCD: mix of binary-property-no and "
328                     "enum-property syntax '%s' on line %ld\n",
329                     field, (long)lineNumber);
330             errorCode=U_PARSE_ERROR;
331             return FALSE;
332         }
333         binaryValue=0;
334         ++p;
335     } else if(v==NULL) {
336         binaryValue=1;
337     } else {
338         binaryValue=-1;
339         // Copy out the property name rather than modifying the field (writing a NUL).
340         pBuffer.append(p, (int32_t)(v-p), errorCode);
341         p=pBuffer.data();
342         ++v;
343     }
344     int32_t prop=pnames->getPropertyEnum(p);
345     if(prop<0) {
346         for(int32_t i=0;; ++i) {
347             if(i==UPRV_LENGTHOF(ppucdProperties)) {
348                 // Ignore unknown property names.
349                 return TRUE;
350             }
351             if(0==uprv_stricmp(p, ppucdProperties[i].name)) {
352                 prop=ppucdProperties[i].prop;
353                 U_ASSERT(prop>=0);
354                 break;
355             }
356         }
357     }
358     if(prop<UCHAR_BINARY_LIMIT) {
359         if(binaryValue>=0) {
360             props.binProps[prop]=(UBool)binaryValue;
361         } else {
362             // No binary value for a binary property.
363             fprintf(stderr,
364                     "error in preparsed UCD: enum-property syntax '%s' "
365                     "for binary property on line %ld\n",
366                     field, (long)lineNumber);
367             errorCode=U_PARSE_ERROR;
368         }
369     } else if(binaryValue>=0) {
370         // Binary value for a non-binary property.
371         fprintf(stderr,
372                 "error in preparsed UCD: binary-property syntax '%s' "
373                 "for non-binary property on line %ld\n",
374                 field, (long)lineNumber);
375         errorCode=U_PARSE_ERROR;
376     } else if (prop < UCHAR_INT_START) {
377         fprintf(stderr,
378                 "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
379                 prop, (long)lineNumber);
380         errorCode=U_PARSE_ERROR;
381     } else if(prop<UCHAR_INT_LIMIT) {
382         int32_t value=pnames->getPropertyValueEnum(prop, v);
383         if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) {
384             // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
385             char *end;
386             unsigned long ccc=uprv_strtoul(v, &end, 10);
387             if(v<end && *end==0 && ccc<=254) {
388                 value=(int32_t)ccc;
389             }
390         }
391         if(value==UCHAR_INVALID_CODE) {
392             fprintf(stderr,
393                     "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
394                     field, (long)lineNumber);
395             errorCode=U_PARSE_ERROR;
396         } else {
397             props.intProps[prop-UCHAR_INT_START]=value;
398         }
399     } else if(*v=='<') {
400         // Do not parse default values like <code point>, just set null values.
401         switch(prop) {
402         case UCHAR_BIDI_MIRRORING_GLYPH:
403             props.bmg=U_SENTINEL;
404             break;
405         case UCHAR_BIDI_PAIRED_BRACKET:
406             props.bpb=U_SENTINEL;
407             break;
408         case UCHAR_SIMPLE_CASE_FOLDING:
409             props.scf=U_SENTINEL;
410             break;
411         case UCHAR_SIMPLE_LOWERCASE_MAPPING:
412             props.slc=U_SENTINEL;
413             break;
414         case UCHAR_SIMPLE_TITLECASE_MAPPING:
415             props.stc=U_SENTINEL;
416             break;
417         case UCHAR_SIMPLE_UPPERCASE_MAPPING:
418             props.suc=U_SENTINEL;
419             break;
420         case UCHAR_CASE_FOLDING:
421             props.cf.remove();
422             break;
423         case UCHAR_LOWERCASE_MAPPING:
424             props.lc.remove();
425             break;
426         case UCHAR_TITLECASE_MAPPING:
427             props.tc.remove();
428             break;
429         case UCHAR_UPPERCASE_MAPPING:
430             props.uc.remove();
431             break;
432         case UCHAR_SCRIPT_EXTENSIONS:
433             props.scx.clear();
434             break;
435         default:
436             fprintf(stderr,
437                     "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
438                     field, (long)lineNumber);
439             errorCode=U_PARSE_ERROR;
440         }
441     } else {
442         char c;
443         switch(prop) {
444         case UCHAR_NUMERIC_VALUE:
445             props.numericValue=v;
446             c=*v;
447             if('0'<=c && c<='9' && v[1]==0) {
448                 props.digitValue=c-'0';
449             } else {
450                 props.digitValue=-1;
451             }
452             break;
453         case UCHAR_NAME:
454             props.name=v;
455             break;
456         case UCHAR_AGE:
457             u_versionFromString(props.age, v);  // Writes 0.0.0.0 if v is not numeric.
458             break;
459         case UCHAR_BIDI_MIRRORING_GLYPH:
460             props.bmg=parseCodePoint(v, errorCode);
461             break;
462         case UCHAR_BIDI_PAIRED_BRACKET:
463             props.bpb=parseCodePoint(v, errorCode);
464             break;
465         case UCHAR_SIMPLE_CASE_FOLDING:
466             props.scf=parseCodePoint(v, errorCode);
467             break;
468         case UCHAR_SIMPLE_LOWERCASE_MAPPING:
469             props.slc=parseCodePoint(v, errorCode);
470             break;
471         case UCHAR_SIMPLE_TITLECASE_MAPPING:
472             props.stc=parseCodePoint(v, errorCode);
473             break;
474         case UCHAR_SIMPLE_UPPERCASE_MAPPING:
475             props.suc=parseCodePoint(v, errorCode);
476             break;
477         case UCHAR_CASE_FOLDING:
478             parseString(v, props.cf, errorCode);
479             break;
480         case UCHAR_LOWERCASE_MAPPING:
481             parseString(v, props.lc, errorCode);
482             break;
483         case UCHAR_TITLECASE_MAPPING:
484             parseString(v, props.tc, errorCode);
485             break;
486         case UCHAR_UPPERCASE_MAPPING:
487             parseString(v, props.uc, errorCode);
488             break;
489         case PPUCD_NAME_ALIAS:
490             props.nameAlias=v;
491             break;
492         case PPUCD_CONDITIONAL_CASE_MAPPINGS:
493         case PPUCD_TURKIC_CASE_FOLDING:
494             // No need to parse their values: They are hardcoded in the runtime library.
495             break;
496         case UCHAR_SCRIPT_EXTENSIONS:
497             parseScriptExtensions(v, props.scx, errorCode);
498             break;
499         default:
500             // Ignore unhandled properties.
501             return TRUE;
502         }
503     }
504     if(U_SUCCESS(errorCode)) {
505         newValues.add((UChar32)prop);
506         return TRUE;
507     } else {
508         return FALSE;
509     }
510 }
511 
512 UBool
getRangeForAlgNames(UChar32 & start,UChar32 & end,UErrorCode & errorCode)513 PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
514     if(U_FAILURE(errorCode)) { return FALSE; }
515     if(lineType!=ALG_NAMES_RANGE_LINE) {
516         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
517         return FALSE;
518     }
519     firstField();
520     const char *field=nextField();
521     if(field==NULL) {
522         // No range field after the type.
523         fprintf(stderr,
524                 "error in preparsed UCD: missing algnamesrange range field "
525                 "(no second field) on line %ld\n",
526                 (long)lineNumber);
527         errorCode=U_PARSE_ERROR;
528         return FALSE;
529     }
530     return parseCodePointRange(field, start, end, errorCode);
531 }
532 
533 UChar32
parseCodePoint(const char * s,UErrorCode & errorCode)534 PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) {
535     char *end;
536     uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16);
537     if(end<=s || *end!=0 || value>=0x110000) {
538         fprintf(stderr,
539                 "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
540                 s, (long)lineNumber);
541         errorCode=U_PARSE_ERROR;
542         return U_SENTINEL;
543     }
544     return (UChar32)value;
545 }
546 
547 UBool
parseCodePointRange(const char * s,UChar32 & start,UChar32 & end,UErrorCode & errorCode)548 PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
549     uint32_t st, e;
550     u_parseCodePointRange(s, &st, &e, &errorCode);
551     if(U_FAILURE(errorCode)) {
552         fprintf(stderr,
553                 "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
554                 s, (long)lineNumber);
555         return FALSE;
556     }
557     start=(UChar32)st;
558     end=(UChar32)e;
559     return TRUE;
560 }
561 
562 void
parseString(const char * s,UnicodeString & uni,UErrorCode & errorCode)563 PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) {
564     UChar *buffer=toUCharPtr(uni.getBuffer(-1));
565     int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
566     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
567         errorCode=U_ZERO_ERROR;
568         uni.releaseBuffer(0);
569         buffer=toUCharPtr(uni.getBuffer(length));
570         length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
571     }
572     uni.releaseBuffer(length);
573     if(U_FAILURE(errorCode)) {
574         fprintf(stderr,
575                 "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
576                 s, (long)lineNumber);
577     }
578 }
579 
580 void
parseScriptExtensions(const char * s,UnicodeSet & scx,UErrorCode & errorCode)581 PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
582     if(U_FAILURE(errorCode)) { return; }
583     scx.clear();
584     CharString scString;
585     for(;;) {
586         const char *scs;
587         const char *scLimit=strchr(s, ' ');
588         if(scLimit!=NULL) {
589             scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data();
590             if(U_FAILURE(errorCode)) { return; }
591         } else {
592             scs=s;
593         }
594         int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs);
595         if(script==UCHAR_INVALID_CODE) {
596             fprintf(stderr,
597                     "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
598                     scs, (long)lineNumber);
599             errorCode=U_PARSE_ERROR;
600             return;
601         } else if(scx.contains(script)) {
602             fprintf(stderr,
603                     "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
604                     scs, (long)lineNumber);
605             errorCode=U_PARSE_ERROR;
606             return;
607         } else {
608             scx.add(script);
609         }
610         if(scLimit!=NULL) {
611             s=scLimit+1;
612         } else {
613             break;
614         }
615     }
616     if(scx.isEmpty()) {
617         fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
618         errorCode=U_PARSE_ERROR;
619     }
620 }
621 
622 U_NAMESPACE_END
623