1 /* ------------------------------------------------------------------------
2 
3    unicodedata -- Provides access to the Unicode database.
4 
5    Data was extracted from the UnicodeData.txt file.
6    The current version number is reported in the unidata_version constant.
7 
8    Written by Marc-Andre Lemburg (mal@lemburg.com).
9    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
10    Modified by Martin v. Löwis (martin@v.loewis.de)
11 
12    Copyright (c) Corporation for National Research Initiatives.
13 
14    ------------------------------------------------------------------------ */
15 
16 #define PY_SSIZE_T_CLEAN
17 
18 #include "Python.h"
19 #include "ucnhash.h"
20 #include "structmember.h"
21 
22 #include <stdbool.h>
23 
/* Static identifiers for the four normalization form names.  They are
   referenced further down as &PyId_NFC, &PyId_NFKC, &PyId_NFD and
   &PyId_NFKD when the caller-supplied form string is matched. */
_Py_IDENTIFIER(NFC);
_Py_IDENTIFIER(NFKC);
_Py_IDENTIFIER(NFD);
_Py_IDENTIFIER(NFKD);
28 
29 /*[clinic input]
30 module unicodedata
31 class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
32 [clinic start generated code]*/
33 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
34 
35 /* character properties */
36 
/* One row of per-character properties; rows live in the generated
   _PyUnicode_Database_Records table and are found via _getrecord_ex(). */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* packed two-bit quick-check values; see is_normalized_quickcheck() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
48 
/* Per-character difference between the current database and an older
   database version.

   As used in this file: for the unsigned char fields the value 0xFF means
   "same as the current database", and category_changed == 0 means the
   character was unassigned in the older version. */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    const double numeric_changed;
} change_record;
58 
59 /* data file generated by Tools/unicode/makeunicodedata.py */
60 #include "unicodedata_db.h"
61 
62 static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)63 _getrecord_ex(Py_UCS4 code)
64 {
65     int index;
66     if (code >= 0x110000)
67         index = 0;
68     else {
69         index = index1[(code>>SHIFT)];
70         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
71     }
72 
73     return &_PyUnicode_Database_Records[index];
74 }
75 
76 /* ------------- Previous-version API ------------------------------------- */
77 typedef struct previous_version {
78     PyObject_HEAD
79     const char *name;
80     const change_record* (*getrecord)(Py_UCS4);
81     Py_UCS4 (*normalization)(Py_UCS4);
82 } PreviousDBVersion;
83 
84 #include "clinic/unicodedata.c.h"
85 
/* Fetch the change record of code point v from the old-database object
   self.  The caller must already know that self is a PreviousDBVersion
   (see UCD_Check).  The argument is parenthesized before the cast so that
   any pointer expression can be passed safely. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))
87 
88 static PyMemberDef DB_members[] = {
89         {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
90         {NULL}
91 };
92 
93 /* forward declaration */
94 static PyTypeObject UCD_Type;
95 #define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
96 
97 static PyObject*
new_previous_version(const char * name,const change_record * (* getrecord)(Py_UCS4),Py_UCS4 (* normalization)(Py_UCS4))98 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
99                      Py_UCS4 (*normalization)(Py_UCS4))
100 {
101         PreviousDBVersion *self;
102         self = PyObject_New(PreviousDBVersion, &UCD_Type);
103         if (self == NULL)
104                 return NULL;
105         self->name = name;
106         self->getrecord = getrecord;
107         self->normalization = normalization;
108         return (PyObject*)self;
109 }
110 
111 
112 /* --- Module API --------------------------------------------------------- */
113 
114 /*[clinic input]
115 unicodedata.UCD.decimal
116 
117     self: self
118     chr: int(accept={str})
119     default: object=NULL
120     /
121 
122 Converts a Unicode character into its equivalent decimal value.
123 
124 Returns the decimal value assigned to the character chr as integer.
125 If no such value is defined, default is returned, or, if not given,
126 ValueError is raised.
127 [clinic start generated code]*/
128 
129 static PyObject *
unicodedata_UCD_decimal_impl(PyObject * self,int chr,PyObject * default_value)130 unicodedata_UCD_decimal_impl(PyObject *self, int chr,
131                              PyObject *default_value)
132 /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
133 {
134     int have_old = 0;
135     long rc;
136     Py_UCS4 c = (Py_UCS4)chr;
137 
138     if (self && UCD_Check(self)) {
139         const change_record *old = get_old_record(self, c);
140         if (old->category_changed == 0) {
141             /* unassigned */
142             have_old = 1;
143             rc = -1;
144         }
145         else if (old->decimal_changed != 0xFF) {
146             have_old = 1;
147             rc = old->decimal_changed;
148         }
149     }
150 
151     if (!have_old)
152         rc = Py_UNICODE_TODECIMAL(c);
153     if (rc < 0) {
154         if (default_value == NULL) {
155             PyErr_SetString(PyExc_ValueError,
156                             "not a decimal");
157             return NULL;
158         }
159         else {
160             Py_INCREF(default_value);
161             return default_value;
162         }
163     }
164     return PyLong_FromLong(rc);
165 }
166 
167 /*[clinic input]
168 unicodedata.UCD.digit
169 
170     self: self
171     chr: int(accept={str})
172     default: object=NULL
173     /
174 
175 Converts a Unicode character into its equivalent digit value.
176 
177 Returns the digit value assigned to the character chr as integer.
178 If no such value is defined, default is returned, or, if not given,
179 ValueError is raised.
180 [clinic start generated code]*/
181 
182 static PyObject *
unicodedata_UCD_digit_impl(PyObject * self,int chr,PyObject * default_value)183 unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
184 /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
185 {
186     long rc;
187     Py_UCS4 c = (Py_UCS4)chr;
188     rc = Py_UNICODE_TODIGIT(c);
189     if (rc < 0) {
190         if (default_value == NULL) {
191             PyErr_SetString(PyExc_ValueError, "not a digit");
192             return NULL;
193         }
194         else {
195             Py_INCREF(default_value);
196             return default_value;
197         }
198     }
199     return PyLong_FromLong(rc);
200 }
201 
202 /*[clinic input]
203 unicodedata.UCD.numeric
204 
205     self: self
206     chr: int(accept={str})
207     default: object=NULL
208     /
209 
210 Converts a Unicode character into its equivalent numeric value.
211 
212 Returns the numeric value assigned to the character chr as float.
213 If no such value is defined, default is returned, or, if not given,
214 ValueError is raised.
215 [clinic start generated code]*/
216 
217 static PyObject *
unicodedata_UCD_numeric_impl(PyObject * self,int chr,PyObject * default_value)218 unicodedata_UCD_numeric_impl(PyObject *self, int chr,
219                              PyObject *default_value)
220 /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
221 {
222     int have_old = 0;
223     double rc;
224     Py_UCS4 c = (Py_UCS4)chr;
225 
226     if (self && UCD_Check(self)) {
227         const change_record *old = get_old_record(self, c);
228         if (old->category_changed == 0) {
229             /* unassigned */
230             have_old = 1;
231             rc = -1.0;
232         }
233         else if (old->decimal_changed != 0xFF) {
234             have_old = 1;
235             rc = old->decimal_changed;
236         }
237     }
238 
239     if (!have_old)
240         rc = Py_UNICODE_TONUMERIC(c);
241     if (rc == -1.0) {
242         if (default_value == NULL) {
243             PyErr_SetString(PyExc_ValueError, "not a numeric character");
244             return NULL;
245         }
246         else {
247             Py_INCREF(default_value);
248             return default_value;
249         }
250     }
251     return PyFloat_FromDouble(rc);
252 }
253 
254 /*[clinic input]
255 unicodedata.UCD.category
256 
257     self: self
258     chr: int(accept={str})
259     /
260 
261 Returns the general category assigned to the character chr as string.
262 [clinic start generated code]*/
263 
264 static PyObject *
unicodedata_UCD_category_impl(PyObject * self,int chr)265 unicodedata_UCD_category_impl(PyObject *self, int chr)
266 /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
267 {
268     int index;
269     Py_UCS4 c = (Py_UCS4)chr;
270     index = (int) _getrecord_ex(c)->category;
271     if (self && UCD_Check(self)) {
272         const change_record *old = get_old_record(self, c);
273         if (old->category_changed != 0xFF)
274             index = old->category_changed;
275     }
276     return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
277 }
278 
279 /*[clinic input]
280 unicodedata.UCD.bidirectional
281 
282     self: self
283     chr: int(accept={str})
284     /
285 
286 Returns the bidirectional class assigned to the character chr as string.
287 
288 If no such value is defined, an empty string is returned.
289 [clinic start generated code]*/
290 
291 static PyObject *
unicodedata_UCD_bidirectional_impl(PyObject * self,int chr)292 unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
293 /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
294 {
295     int index;
296     Py_UCS4 c = (Py_UCS4)chr;
297     index = (int) _getrecord_ex(c)->bidirectional;
298     if (self && UCD_Check(self)) {
299         const change_record *old = get_old_record(self, c);
300         if (old->category_changed == 0)
301             index = 0; /* unassigned */
302         else if (old->bidir_changed != 0xFF)
303             index = old->bidir_changed;
304     }
305     return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
306 }
307 
308 /*[clinic input]
309 unicodedata.UCD.combining -> int
310 
311     self: self
312     chr: int(accept={str})
313     /
314 
315 Returns the canonical combining class assigned to the character chr as integer.
316 
317 Returns 0 if no combining class is defined.
318 [clinic start generated code]*/
319 
320 static int
unicodedata_UCD_combining_impl(PyObject * self,int chr)321 unicodedata_UCD_combining_impl(PyObject *self, int chr)
322 /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
323 {
324     int index;
325     Py_UCS4 c = (Py_UCS4)chr;
326     index = (int) _getrecord_ex(c)->combining;
327     if (self && UCD_Check(self)) {
328         const change_record *old = get_old_record(self, c);
329         if (old->category_changed == 0)
330             index = 0; /* unassigned */
331     }
332     return index;
333 }
334 
335 /*[clinic input]
336 unicodedata.UCD.mirrored -> int
337 
338     self: self
339     chr: int(accept={str})
340     /
341 
342 Returns the mirrored property assigned to the character chr as integer.
343 
344 Returns 1 if the character has been identified as a "mirrored"
345 character in bidirectional text, 0 otherwise.
346 [clinic start generated code]*/
347 
348 static int
unicodedata_UCD_mirrored_impl(PyObject * self,int chr)349 unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
350 /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
351 {
352     int index;
353     Py_UCS4 c = (Py_UCS4)chr;
354     index = (int) _getrecord_ex(c)->mirrored;
355     if (self && UCD_Check(self)) {
356         const change_record *old = get_old_record(self, c);
357         if (old->category_changed == 0)
358             index = 0; /* unassigned */
359         else if (old->mirrored_changed != 0xFF)
360             index = old->mirrored_changed;
361     }
362     return index;
363 }
364 
365 /*[clinic input]
366 unicodedata.UCD.east_asian_width
367 
368     self: self
369     chr: int(accept={str})
370     /
371 
372 Returns the east asian width assigned to the character chr as string.
373 [clinic start generated code]*/
374 
375 static PyObject *
unicodedata_UCD_east_asian_width_impl(PyObject * self,int chr)376 unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
377 /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
378 {
379     int index;
380     Py_UCS4 c = (Py_UCS4)chr;
381     index = (int) _getrecord_ex(c)->east_asian_width;
382     if (self && UCD_Check(self)) {
383         const change_record *old = get_old_record(self, c);
384         if (old->category_changed == 0)
385             index = 0; /* unassigned */
386         else if (old->east_asian_width_changed != 0xFF)
387             index = old->east_asian_width_changed;
388     }
389     return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
390 }
391 
392 /*[clinic input]
393 unicodedata.UCD.decomposition
394 
395     self: self
396     chr: int(accept={str})
397     /
398 
399 Returns the character decomposition mapping assigned to the character chr as string.
400 
401 An empty string is returned in case no such mapping is defined.
402 [clinic start generated code]*/
403 
404 static PyObject *
unicodedata_UCD_decomposition_impl(PyObject * self,int chr)405 unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
406 /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
407 {
408     char decomp[256];
409     int code, index, count;
410     size_t i;
411     unsigned int prefix_index;
412     Py_UCS4 c = (Py_UCS4)chr;
413 
414     code = (int)c;
415 
416     if (self && UCD_Check(self)) {
417         const change_record *old = get_old_record(self, c);
418         if (old->category_changed == 0)
419             return PyUnicode_FromString(""); /* unassigned */
420     }
421 
422     if (code < 0 || code >= 0x110000)
423         index = 0;
424     else {
425         index = decomp_index1[(code>>DECOMP_SHIFT)];
426         index = decomp_index2[(index<<DECOMP_SHIFT)+
427                              (code&((1<<DECOMP_SHIFT)-1))];
428     }
429 
430     /* high byte is number of hex bytes (usually one or two), low byte
431        is prefix code (from*/
432     count = decomp_data[index] >> 8;
433 
434     /* XXX: could allocate the PyString up front instead
435        (strlen(prefix) + 5 * count + 1 bytes) */
436 
437     /* Based on how index is calculated above and decomp_data is generated
438        from Tools/unicode/makeunicodedata.py, it should not be possible
439        to overflow decomp_prefix. */
440     prefix_index = decomp_data[index] & 255;
441     assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
442 
443     /* copy prefix */
444     i = strlen(decomp_prefix[prefix_index]);
445     memcpy(decomp, decomp_prefix[prefix_index], i);
446 
447     while (count-- > 0) {
448         if (i)
449             decomp[i++] = ' ';
450         assert(i < sizeof(decomp));
451         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
452                       decomp_data[++index]);
453         i += strlen(decomp + i);
454     }
455     return PyUnicode_FromStringAndSize(decomp, i);
456 }
457 
458 static void
get_decomp_record(PyObject * self,Py_UCS4 code,int * index,int * prefix,int * count)459 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
460 {
461     if (code >= 0x110000) {
462         *index = 0;
463     } else if (self && UCD_Check(self) &&
464                get_old_record(self, code)->category_changed==0) {
465         /* unassigned in old version */
466         *index = 0;
467     }
468     else {
469         *index = decomp_index1[(code>>DECOMP_SHIFT)];
470         *index = decomp_index2[(*index<<DECOMP_SHIFT)+
471                                (code&((1<<DECOMP_SHIFT)-1))];
472     }
473 
474     /* high byte is number of hex bytes (usually one or two), low byte
475        is prefix code (from*/
476     *count = decomp_data[*index] >> 8;
477     *prefix = decomp_data[*index] & 255;
478 
479     (*index)++;
480 }
481 
/* Constants for arithmetic Hangul (de)composition, as used by nfd_nfkd()
   and nfc_nfkc() below. */
#define SBase   0xAC00              /* first composed syllable */
#define LBase   0x1100              /* first leading consonant */
#define VBase   0x1161              /* first vowel */
#define TBase   0x11A7              /* one before the first trailing
                                       consonant; T == TBase means "none" */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* syllables per leading consonant */
#define SCount  (LCount*NCount)     /* total number of composed syllables */
491 
492 static PyObject*
nfd_nfkd(PyObject * self,PyObject * input,int k)493 nfd_nfkd(PyObject *self, PyObject *input, int k)
494 {
495     PyObject *result;
496     Py_UCS4 *output;
497     Py_ssize_t i, o, osize;
498     int kind;
499     void *data;
500     /* Longest decomposition in Unicode 3.2: U+FDFA */
501     Py_UCS4 stack[20];
502     Py_ssize_t space, isize;
503     int index, prefix, count, stackptr;
504     unsigned char prev, cur;
505 
506     stackptr = 0;
507     isize = PyUnicode_GET_LENGTH(input);
508     space = isize;
509     /* Overallocate at most 10 characters. */
510     if (space > 10) {
511         if (space <= PY_SSIZE_T_MAX - 10)
512             space += 10;
513     }
514     else {
515         space *= 2;
516     }
517     osize = space;
518     output = PyMem_NEW(Py_UCS4, space);
519     if (!output) {
520         PyErr_NoMemory();
521         return NULL;
522     }
523     i = o = 0;
524     kind = PyUnicode_KIND(input);
525     data = PyUnicode_DATA(input);
526 
527     while (i < isize) {
528         stack[stackptr++] = PyUnicode_READ(kind, data, i++);
529         while(stackptr) {
530             Py_UCS4 code = stack[--stackptr];
531             /* Hangul Decomposition adds three characters in
532                a single step, so we need at least that much room. */
533             if (space < 3) {
534                 Py_UCS4 *new_output;
535                 osize += 10;
536                 space += 10;
537                 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
538                 if (new_output == NULL) {
539                     PyMem_Free(output);
540                     PyErr_NoMemory();
541                     return NULL;
542                 }
543                 output = new_output;
544             }
545             /* Hangul Decomposition. */
546             if (SBase <= code && code < (SBase+SCount)) {
547                 int SIndex = code - SBase;
548                 int L = LBase + SIndex / NCount;
549                 int V = VBase + (SIndex % NCount) / TCount;
550                 int T = TBase + SIndex % TCount;
551                 output[o++] = L;
552                 output[o++] = V;
553                 space -= 2;
554                 if (T != TBase) {
555                     output[o++] = T;
556                     space --;
557                 }
558                 continue;
559             }
560             /* normalization changes */
561             if (self && UCD_Check(self)) {
562                 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
563                 if (value != 0) {
564                     stack[stackptr++] = value;
565                     continue;
566                 }
567             }
568 
569             /* Other decompositions. */
570             get_decomp_record(self, code, &index, &prefix, &count);
571 
572             /* Copy character if it is not decomposable, or has a
573                compatibility decomposition, but we do NFD. */
574             if (!count || (prefix && !k)) {
575                 output[o++] = code;
576                 space--;
577                 continue;
578             }
579             /* Copy decomposition onto the stack, in reverse
580                order.  */
581             while(count) {
582                 code = decomp_data[index + (--count)];
583                 stack[stackptr++] = code;
584             }
585         }
586     }
587 
588     result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
589                                        output, o);
590     PyMem_Free(output);
591     if (!result)
592         return NULL;
593     /* result is guaranteed to be ready, as it is compact. */
594     kind = PyUnicode_KIND(result);
595     data = PyUnicode_DATA(result);
596 
597     /* Sort canonically. */
598     i = 0;
599     prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
600     for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
601         cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
602         if (prev == 0 || cur == 0 || prev <= cur) {
603             prev = cur;
604             continue;
605         }
606         /* Non-canonical order. Need to switch *i with previous. */
607         o = i - 1;
608         while (1) {
609             Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
610             PyUnicode_WRITE(kind, data, o+1,
611                             PyUnicode_READ(kind, data, o));
612             PyUnicode_WRITE(kind, data, o, tmp);
613             o--;
614             if (o < 0)
615                 break;
616             prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
617             if (prev == 0 || prev <= cur)
618                 break;
619         }
620         prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
621     }
622     return result;
623 }
624 
625 static int
find_nfc_index(PyObject * self,struct reindex * nfc,Py_UCS4 code)626 find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
627 {
628     unsigned int index;
629     for (index = 0; nfc[index].start; index++) {
630         unsigned int start = nfc[index].start;
631         if (code < start)
632             return -1;
633         if (code <= start + nfc[index].count) {
634             unsigned int delta = code - start;
635             return nfc[index].index + delta;
636         }
637     }
638     return -1;
639 }
640 
641 static PyObject*
nfc_nfkc(PyObject * self,PyObject * input,int k)642 nfc_nfkc(PyObject *self, PyObject *input, int k)
643 {
644     PyObject *result;
645     int kind;
646     void *data;
647     Py_UCS4 *output;
648     Py_ssize_t i, i1, o, len;
649     int f,l,index,index1,comb;
650     Py_UCS4 code;
651     Py_ssize_t skipped[20];
652     int cskipped = 0;
653 
654     result = nfd_nfkd(self, input, k);
655     if (!result)
656         return NULL;
657     /* result will be "ready". */
658     kind = PyUnicode_KIND(result);
659     data = PyUnicode_DATA(result);
660     len = PyUnicode_GET_LENGTH(result);
661 
662     /* We allocate a buffer for the output.
663        If we find that we made no changes, we still return
664        the NFD result. */
665     output = PyMem_NEW(Py_UCS4, len);
666     if (!output) {
667         PyErr_NoMemory();
668         Py_DECREF(result);
669         return 0;
670     }
671     i = o = 0;
672 
673   again:
674     while (i < len) {
675       for (index = 0; index < cskipped; index++) {
676           if (skipped[index] == i) {
677               /* *i character is skipped.
678                  Remove from list. */
679               skipped[index] = skipped[cskipped-1];
680               cskipped--;
681               i++;
682               goto again; /* continue while */
683           }
684       }
685       /* Hangul Composition. We don't need to check for <LV,T>
686          pairs, since we always have decomposed data. */
687       code = PyUnicode_READ(kind, data, i);
688       if (LBase <= code && code < (LBase+LCount) &&
689           i + 1 < len &&
690           VBase <= PyUnicode_READ(kind, data, i+1) &&
691           PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
692           /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
693              and V character is a modern vowel (0x1161 ~ 0x1175). */
694           int LIndex, VIndex;
695           LIndex = code - LBase;
696           VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
697           code = SBase + (LIndex*VCount+VIndex)*TCount;
698           i+=2;
699           if (i < len &&
700               TBase < PyUnicode_READ(kind, data, i) &&
701               PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
702               /* check T character is a modern trailing consonant
703                  (0x11A8 ~ 0x11C2). */
704               code += PyUnicode_READ(kind, data, i)-TBase;
705               i++;
706           }
707           output[o++] = code;
708           continue;
709       }
710 
711       /* code is still input[i] here */
712       f = find_nfc_index(self, nfc_first, code);
713       if (f == -1) {
714           output[o++] = code;
715           i++;
716           continue;
717       }
718       /* Find next unblocked character. */
719       i1 = i+1;
720       comb = 0;
721       /* output base character for now; might be updated later. */
722       output[o] = PyUnicode_READ(kind, data, i);
723       while (i1 < len) {
724           Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
725           int comb1 = _getrecord_ex(code1)->combining;
726           if (comb) {
727               if (comb1 == 0)
728                   break;
729               if (comb >= comb1) {
730                   /* Character is blocked. */
731                   i1++;
732                   continue;
733               }
734           }
735           l = find_nfc_index(self, nfc_last, code1);
736           /* i1 cannot be combined with i. If i1
737              is a starter, we don't need to look further.
738              Otherwise, record the combining class. */
739           if (l == -1) {
740             not_combinable:
741               if (comb1 == 0)
742                   break;
743               comb = comb1;
744               i1++;
745               continue;
746           }
747           index = f*TOTAL_LAST + l;
748           index1 = comp_index[index >> COMP_SHIFT];
749           code = comp_data[(index1<<COMP_SHIFT)+
750                            (index&((1<<COMP_SHIFT)-1))];
751           if (code == 0)
752               goto not_combinable;
753 
754           /* Replace the original character. */
755           output[o] = code;
756           /* Mark the second character unused. */
757           assert(cskipped < 20);
758           skipped[cskipped++] = i1;
759           i1++;
760           f = find_nfc_index(self, nfc_first, output[o]);
761           if (f == -1)
762               break;
763       }
764       /* Output character was already written.
765          Just advance the indices. */
766       o++; i++;
767     }
768     if (o == len) {
769         /* No changes. Return original string. */
770         PyMem_Free(output);
771         return result;
772     }
773     Py_DECREF(result);
774     result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
775                                        output, o);
776     PyMem_Free(output);
777     return result;
778 }
779 
// This needs to match the logic in makeunicodedata.py
// which constructs the quickcheck data.
typedef enum {
    YES = 0,
    MAYBE = 1,
    NO = 2
} QuickcheckResult;
783 
784 /* Run the Unicode normalization "quickcheck" algorithm.
785  *
786  * Return YES or NO if quickcheck determines the input is certainly
787  * normalized or certainly not, and MAYBE if quickcheck is unable to
788  * tell.
789  *
790  * If `yes_only` is true, then return MAYBE as soon as we determine
791  * the answer is not YES.
792  *
793  * For background and details on the algorithm, see UAX #15:
794  *   https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
795  */
796 static QuickcheckResult
is_normalized_quickcheck(PyObject * self,PyObject * input,int nfc,int k,bool yes_only)797 is_normalized_quickcheck(PyObject *self, PyObject *input,
798                          int nfc, int k, bool yes_only)
799 {
800     /* An older version of the database is requested, quickchecks must be
801        disabled. */
802     if (self && UCD_Check(self))
803         return NO;
804 
805     Py_ssize_t i, len;
806     int kind;
807     void *data;
808     unsigned char prev_combining = 0;
809 
810     /* The two quickcheck bits at this shift have type QuickcheckResult. */
811     int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
812 
813     QuickcheckResult result = YES; /* certainly normalized, unless we find something */
814 
815     i = 0;
816     kind = PyUnicode_KIND(input);
817     data = PyUnicode_DATA(input);
818     len = PyUnicode_GET_LENGTH(input);
819     while (i < len) {
820         Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
821         const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
822 
823         unsigned char combining = record->combining;
824         if (combining && prev_combining > combining)
825             return NO; /* non-canonical sort order, not normalized */
826         prev_combining = combining;
827 
828         unsigned char quickcheck_whole = record->normalization_quick_check;
829         if (yes_only) {
830             if (quickcheck_whole & (3 << quickcheck_shift))
831                 return MAYBE;
832         } else {
833             switch ((quickcheck_whole >> quickcheck_shift) & 3) {
834             case NO:
835               return NO;
836             case MAYBE:
837               result = MAYBE; /* this string might need normalization */
838             }
839         }
840     }
841     return result;
842 }
843 
844 /*[clinic input]
845 unicodedata.UCD.is_normalized
846 
847     self: self
848     form: unicode
849     unistr as input: unicode
850     /
851 
852 Return whether the Unicode string unistr is in the normal form 'form'.
853 
854 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
855 [clinic start generated code]*/
856 
857 static PyObject *
unicodedata_UCD_is_normalized_impl(PyObject * self,PyObject * form,PyObject * input)858 unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
859                                    PyObject *input)
860 /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
861 {
862     if (PyUnicode_READY(input) == -1) {
863         return NULL;
864     }
865 
866     if (PyUnicode_GET_LENGTH(input) == 0) {
867         /* special case empty input strings. */
868         Py_RETURN_TRUE;
869     }
870 
871     PyObject *result;
872     int nfc = 0;
873     int k = 0;
874     QuickcheckResult m;
875 
876     PyObject *cmp;
877     int match = 0;
878 
879     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
880         nfc = 1;
881     }
882     else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
883         nfc = 1;
884         k = 1;
885     }
886     else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
887         /* matches default values for `nfc` and `k` */
888     }
889     else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
890         k = 1;
891     }
892     else {
893         PyErr_SetString(PyExc_ValueError, "invalid normalization form");
894         return NULL;
895     }
896 
897     m = is_normalized_quickcheck(self, input, nfc, k, false);
898 
899     if (m == MAYBE) {
900         cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
901         if (cmp == NULL) {
902             return NULL;
903         }
904         match = PyUnicode_Compare(input, cmp);
905         Py_DECREF(cmp);
906         result = (match == 0) ? Py_True : Py_False;
907     }
908     else {
909         result = (m == YES) ? Py_True : Py_False;
910     }
911 
912     Py_INCREF(result);
913     return result;
914 }
915 
916 
917 /*[clinic input]
918 unicodedata.UCD.normalize
919 
920     self: self
921     form: unicode
922     unistr as input: unicode
923     /
924 
925 Return the normal form 'form' for the Unicode string unistr.
926 
927 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
928 [clinic start generated code]*/
929 
930 static PyObject *
unicodedata_UCD_normalize_impl(PyObject * self,PyObject * form,PyObject * input)931 unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
932                                PyObject *input)
933 /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
934 {
935     if (PyUnicode_GET_LENGTH(input) == 0) {
936         /* Special case empty input strings, since resizing
937            them  later would cause internal errors. */
938         Py_INCREF(input);
939         return input;
940     }
941 
942     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
943         if (is_normalized_quickcheck(self, input, 1, 0, true) == YES) {
944             Py_INCREF(input);
945             return input;
946         }
947         return nfc_nfkc(self, input, 0);
948     }
949     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
950         if (is_normalized_quickcheck(self, input, 1, 1, true) == YES) {
951             Py_INCREF(input);
952             return input;
953         }
954         return nfc_nfkc(self, input, 1);
955     }
956     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
957         if (is_normalized_quickcheck(self, input, 0, 0, true) == YES) {
958             Py_INCREF(input);
959             return input;
960         }
961         return nfd_nfkd(self, input, 0);
962     }
963     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
964         if (is_normalized_quickcheck(self, input, 0, 1, true) == YES) {
965             Py_INCREF(input);
966             return input;
967         }
968         return nfd_nfkd(self, input, 1);
969     }
970     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
971     return NULL;
972 }
973 
974 /* -------------------------------------------------------------------- */
975 /* unicode character name tables */
976 
977 /* data file generated by Tools/unicode/makeunicodedata.py */
978 #include "unicodename_db.h"
979 
980 /* -------------------------------------------------------------------- */
981 /* database code (cut and pasted from the unidb package) */
982 
static unsigned long
_gethash(const char *s, int len, int scale)
{
    /* Case-insensitive hash of the first `len` bytes of `s`.  Each byte is
       upper-cased, folded in with multiplier `scale`, and whenever any of
       bits 24-31 become set they are XOR-ed back into the low byte and the
       value is reduced to 24 bits. */
    unsigned long acc = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        acc = (acc * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        high = acc & 0xff000000;
        if (high != 0) {
            acc = (acc ^ ((high >> 24) & 0xff)) & 0x00ffffff;
        }
        pos++;
    }
    return acc;
}
997 
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
   Column 0 is indexed by L, column 1 by V and column 2 by T (see
   _getucname and find_syllable).  Column 0 has 19 entries, column 1 has 21
   and column 2 has 28; the unused cells of the shorter columns are NULL.
   Note that "" is a real entry (row 11 of column 0, row 0 of column 2). */
static const char * const hangul_syllables[][3] = {
    { "G",   "A",    ""   },
    { "GG",  "AE",   "G"  },
    { "N",   "YA",   "GG" },
    { "D",   "YAE",  "GS" },
    { "DD",  "EO",   "N"  },
    { "R",   "E",    "NJ" },
    { "M",   "YEO",  "NH" },
    { "B",   "YE",   "D"  },
    { "BB",  "O",    "L"  },
    { "S",   "WA",   "LG" },
    { "SS",  "WAE",  "LM" },
    { "",    "OE",   "LB" },
    { "J",   "YO",   "LS" },
    { "JJ",  "U",    "LT" },
    { "C",   "WEO",  "LP" },
    { "K",   "WE",   "LH" },
    { "T",   "WI",   "M"  },
    { "P",   "YU",   "B"  },
    { "H",   "EU",   "BS" },
    { NULL,  "YI",   "S"  },
    { NULL,  "I",    "SS" },
    { NULL,  NULL,   "NG" },
    { NULL,  NULL,   "J"  },
    { NULL,  NULL,   "C"  },
    { NULL,  NULL,   "K"  },
    { NULL,  NULL,   "T"  },
    { NULL,  NULL,   "P"  },
    { NULL,  NULL,   "H"  }
};
1028 
1029 /* These ranges need to match makeunicodedata.py:cjk_ranges. */
1030 static int
is_unified_ideograph(Py_UCS4 code)1031 is_unified_ideograph(Py_UCS4 code)
1032 {
1033     return
1034         (0x3400 <= code && code <= 0x4DB5)   || /* CJK Ideograph Extension A */
1035         (0x4E00 <= code && code <= 0x9FEF)   || /* CJK Ideograph */
1036         (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
1037         (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
1038         (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
1039         (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
1040         (0x2CEB0 <= code && code <= 0x2EBEF);   /* CJK Ideograph Extension F */
1041 }
1042 
/* Macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases (IS_ALIAS) and named sequences
 * (IS_NAMED_SEQ).  Both ranges are half-open: start <= cp < end.
 * The argument is parenthesized so that any expression can be passed; it is
 * evaluated twice, so it must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
1048 
1049 static int
_getucname(PyObject * self,Py_UCS4 code,char * buffer,int buflen,int with_alias_and_seq)1050 _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
1051            int with_alias_and_seq)
1052 {
1053     /* Find the name associated with the given code point.
1054      * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1055      * that we are using for aliases and named sequences. */
1056     int offset;
1057     int i;
1058     int word;
1059     const unsigned char* w;
1060 
1061     if (code >= 0x110000)
1062         return 0;
1063 
1064     /* XXX should we just skip all the code points in the PUAs here? */
1065     if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1066         return 0;
1067 
1068     if (self && UCD_Check(self)) {
1069         /* in 3.2.0 there are no aliases and named sequences */
1070         const change_record *old;
1071         if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1072             return 0;
1073         old = get_old_record(self, code);
1074         if (old->category_changed == 0) {
1075             /* unassigned */
1076             return 0;
1077         }
1078     }
1079 
1080     if (SBase <= code && code < SBase+SCount) {
1081         /* Hangul syllable. */
1082         int SIndex = code - SBase;
1083         int L = SIndex / NCount;
1084         int V = (SIndex % NCount) / TCount;
1085         int T = SIndex % TCount;
1086 
1087         if (buflen < 27)
1088             /* Worst case: HANGUL SYLLABLE <10chars>. */
1089             return 0;
1090         strcpy(buffer, "HANGUL SYLLABLE ");
1091         buffer += 16;
1092         strcpy(buffer, hangul_syllables[L][0]);
1093         buffer += strlen(hangul_syllables[L][0]);
1094         strcpy(buffer, hangul_syllables[V][1]);
1095         buffer += strlen(hangul_syllables[V][1]);
1096         strcpy(buffer, hangul_syllables[T][2]);
1097         buffer += strlen(hangul_syllables[T][2]);
1098         *buffer = '\0';
1099         return 1;
1100     }
1101 
1102     if (is_unified_ideograph(code)) {
1103         if (buflen < 28)
1104             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1105             return 0;
1106         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1107         return 1;
1108     }
1109 
1110     /* get offset into phrasebook */
1111     offset = phrasebook_offset1[(code>>phrasebook_shift)];
1112     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1113                                (code&((1<<phrasebook_shift)-1))];
1114     if (!offset)
1115         return 0;
1116 
1117     i = 0;
1118 
1119     for (;;) {
1120         /* get word index */
1121         word = phrasebook[offset] - phrasebook_short;
1122         if (word >= 0) {
1123             word = (word << 8) + phrasebook[offset+1];
1124             offset += 2;
1125         } else
1126             word = phrasebook[offset++];
1127         if (i) {
1128             if (i > buflen)
1129                 return 0; /* buffer overflow */
1130             buffer[i++] = ' ';
1131         }
1132         /* copy word string from lexicon.  the last character in the
1133            word has bit 7 set.  the last word in a string ends with
1134            0x80 */
1135         w = lexicon + lexicon_offset[word];
1136         while (*w < 128) {
1137             if (i >= buflen)
1138                 return 0; /* buffer overflow */
1139             buffer[i++] = *w++;
1140         }
1141         if (i >= buflen)
1142             return 0; /* buffer overflow */
1143         buffer[i++] = *w & 127;
1144         if (*w == 128)
1145             break; /* end of word */
1146     }
1147 
1148     return 1;
1149 }
1150 
1151 static int
_cmpname(PyObject * self,int code,const char * name,int namelen)1152 _cmpname(PyObject *self, int code, const char* name, int namelen)
1153 {
1154     /* check if code corresponds to the given name */
1155     int i;
1156     char buffer[NAME_MAXLEN+1];
1157     if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
1158         return 0;
1159     for (i = 0; i < namelen; i++) {
1160         if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
1161             return 0;
1162     }
1163     return buffer[namelen] == '\0';
1164 }
1165 
1166 static void
find_syllable(const char * str,int * len,int * pos,int count,int column)1167 find_syllable(const char *str, int *len, int *pos, int count, int column)
1168 {
1169     int i, len1;
1170     *len = -1;
1171     for (i = 0; i < count; i++) {
1172         const char *s = hangul_syllables[i][column];
1173         len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1174         if (len1 <= *len)
1175             continue;
1176         if (strncmp(str, s, len1) == 0) {
1177             *len = len1;
1178             *pos = i;
1179         }
1180     }
1181     if (*len == -1) {
1182         *len = 0;
1183     }
1184 }
1185 
1186 static int
_check_alias_and_seq(unsigned int cp,Py_UCS4 * code,int with_named_seq)1187 _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
1188 {
1189     /* check if named sequences are allowed */
1190     if (!with_named_seq && IS_NAMED_SEQ(cp))
1191         return 0;
1192     /* if the code point is in the PUA range that we use for aliases,
1193      * convert it to obtain the right code point */
1194     if (IS_ALIAS(cp))
1195         *code = name_aliases[cp-aliases_start];
1196     else
1197         *code = cp;
1198     return 1;
1199 }
1200 
1201 static int
_getcode(PyObject * self,const char * name,int namelen,Py_UCS4 * code,int with_named_seq)1202 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1203          int with_named_seq)
1204 {
1205     /* Return the code point associated with the given name.
1206      * Named aliases are resolved too (unless self != NULL (i.e. we are using
1207      * 3.2.0)).  If with_named_seq is 1, returns the PUA code point that we are
1208      * using for the named sequence, and the caller must then convert it. */
1209     unsigned int h, v;
1210     unsigned int mask = code_size-1;
1211     unsigned int i, incr;
1212 
1213     /* Check for hangul syllables. */
1214     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1215         int len, L = -1, V = -1, T = -1;
1216         const char *pos = name + 16;
1217         find_syllable(pos, &len, &L, LCount, 0);
1218         pos += len;
1219         find_syllable(pos, &len, &V, VCount, 1);
1220         pos += len;
1221         find_syllable(pos, &len, &T, TCount, 2);
1222         pos += len;
1223         if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1224             *code = SBase + (L*VCount+V)*TCount + T;
1225             return 1;
1226         }
1227         /* Otherwise, it's an illegal syllable name. */
1228         return 0;
1229     }
1230 
1231     /* Check for unified ideographs. */
1232     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1233         /* Four or five hexdigits must follow. */
1234         v = 0;
1235         name += 22;
1236         namelen -= 22;
1237         if (namelen != 4 && namelen != 5)
1238             return 0;
1239         while (namelen--) {
1240             v *= 16;
1241             if (*name >= '0' && *name <= '9')
1242                 v += *name - '0';
1243             else if (*name >= 'A' && *name <= 'F')
1244                 v += *name - 'A' + 10;
1245             else
1246                 return 0;
1247             name++;
1248         }
1249         if (!is_unified_ideograph(v))
1250             return 0;
1251         *code = v;
1252         return 1;
1253     }
1254 
1255     /* the following is the same as python's dictionary lookup, with
1256        only minor changes.  see the makeunicodedata script for more
1257        details */
1258 
1259     h = (unsigned int) _gethash(name, namelen, code_magic);
1260     i = (~h) & mask;
1261     v = code_hash[i];
1262     if (!v)
1263         return 0;
1264     if (_cmpname(self, v, name, namelen))
1265         return _check_alias_and_seq(v, code, with_named_seq);
1266     incr = (h ^ (h >> 3)) & mask;
1267     if (!incr)
1268         incr = mask;
1269     for (;;) {
1270         i = (i + incr) & mask;
1271         v = code_hash[i];
1272         if (!v)
1273             return 0;
1274         if (_cmpname(self, v, name, namelen))
1275             return _check_alias_and_seq(v, code, with_named_seq);
1276         incr = incr << 1;
1277         if (incr > mask)
1278             incr = incr ^ code_poly;
1279     }
1280 }
1281 
/* Name lookup table exported to other C code: the size of the struct,
   followed by the code-point-to-name function (_getucname) and the
   name-to-code-point function (_getcode).  PyInit_unicodedata() wraps a
   pointer to this table in a capsule and stores it in the module as
   "ucnhash_CAPI". */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};
1288 
1289 /* -------------------------------------------------------------------- */
1290 /* Python bindings */
1291 
1292 /*[clinic input]
1293 unicodedata.UCD.name
1294 
1295     self: self
1296     chr: int(accept={str})
1297     default: object=NULL
1298     /
1299 
1300 Returns the name assigned to the character chr as a string.
1301 
1302 If no name is defined, default is returned, or, if not given,
1303 ValueError is raised.
1304 [clinic start generated code]*/
1305 
1306 static PyObject *
unicodedata_UCD_name_impl(PyObject * self,int chr,PyObject * default_value)1307 unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
1308 /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
1309 {
1310     char name[NAME_MAXLEN+1];
1311     Py_UCS4 c = (Py_UCS4)chr;
1312 
1313     if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
1314         if (default_value == NULL) {
1315             PyErr_SetString(PyExc_ValueError, "no such name");
1316             return NULL;
1317         }
1318         else {
1319             Py_INCREF(default_value);
1320             return default_value;
1321         }
1322     }
1323 
1324     return PyUnicode_FromString(name);
1325 }
1326 
1327 /*[clinic input]
1328 unicodedata.UCD.lookup
1329 
1330     self: self
1331     name: str(accept={str, robuffer}, zeroes=True)
1332     /
1333 
1334 Look up character by name.
1335 
1336 If a character with the given name is found, return the
1337 corresponding character.  If not found, KeyError is raised.
1338 [clinic start generated code]*/
1339 
1340 static PyObject *
unicodedata_UCD_lookup_impl(PyObject * self,const char * name,Py_ssize_clean_t name_length)1341 unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1342                             Py_ssize_clean_t name_length)
1343 /*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
1344 {
1345     Py_UCS4 code;
1346     unsigned int index;
1347     if (name_length > NAME_MAXLEN) {
1348         PyErr_SetString(PyExc_KeyError, "name too long");
1349         return NULL;
1350     }
1351 
1352     if (!_getcode(self, name, (int)name_length, &code, 1)) {
1353         PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1354         return NULL;
1355     }
1356     /* check if code is in the PUA range that we use for named sequences
1357        and convert it */
1358     if (IS_NAMED_SEQ(code)) {
1359         index = code-named_sequences_start;
1360         return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1361                                          named_sequences[index].seq,
1362                                          named_sequences[index].seqlen);
1363     }
1364     return PyUnicode_FromOrdinal(code);
1365 }
1366 
/* XXX Add doc strings. */

/* Method table built from the Argument Clinic *_METHODDEF macros.  The same
   table is used twice: as the methods of UCD_Type objects (tp_methods) and
   as the module-level functions of unicodedatamodule. */
static PyMethodDef unicodedata_functions[] = {
    UNICODEDATA_UCD_DECIMAL_METHODDEF
    UNICODEDATA_UCD_DIGIT_METHODDEF
    UNICODEDATA_UCD_NUMERIC_METHODDEF
    UNICODEDATA_UCD_CATEGORY_METHODDEF
    UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
    UNICODEDATA_UCD_COMBINING_METHODDEF
    UNICODEDATA_UCD_MIRRORED_METHODDEF
    UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
    UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
    UNICODEDATA_UCD_NAME_METHODDEF
    UNICODEDATA_UCD_LOOKUP_METHODDEF
    UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
    UNICODEDATA_UCD_NORMALIZE_METHODDEF
    {NULL, NULL}                /* sentinel */
};
1385 
/* Type object exposed by the module as "UCD".  Instances have the size of a
   PreviousDBVersion struct, offer unicodedata_functions as methods and
   DB_members as members, and are released with PyObject_Del.  No tp_new or
   tp_init is supplied here. */
static PyTypeObject UCD_Type = {
        /* The ob_type field must be initialized in the module init function
         * to be portable to Windows without using C++. */
        PyVarObject_HEAD_INIT(NULL, 0)
        "unicodedata.UCD",              /*tp_name*/
        sizeof(PreviousDBVersion),      /*tp_basicsize*/
        0,                      /*tp_itemsize*/
        /* methods */
        (destructor)PyObject_Del, /*tp_dealloc*/
        0,                      /*tp_vectorcall_offset*/
        0,                      /*tp_getattr*/
        0,                      /*tp_setattr*/
        0,                      /*tp_as_async*/
        0,                      /*tp_repr*/
        0,                      /*tp_as_number*/
        0,                      /*tp_as_sequence*/
        0,                      /*tp_as_mapping*/
        0,                      /*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        PyObject_GenericGetAttr,/*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        unicodedata_functions,  /*tp_methods*/
        DB_members,             /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};
1431 
/* Docstring of the module (referenced from unicodedatamodule).  The
   UNIDATA_VERSION string is spliced in twice through string literal
   concatenation. */
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
" UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format " UNIDATA_VERSION ".");
1440 
/* Module definition handed to PyModule_Create() in PyInit_unicodedata().
   The field names in the comments below follow the documented layout of
   PyModuleDef. */
static struct PyModuleDef unicodedatamodule = {
        PyModuleDef_HEAD_INIT,
        "unicodedata",                  /* m_name */
        unicodedata_docstring,          /* m_doc */
        -1,                             /* m_size */
        unicodedata_functions,          /* m_methods */
        NULL,                           /* m_slots */
        NULL,                           /* m_traverse */
        NULL,                           /* m_clear */
        NULL                            /* m_free */
};
1452 
1453 PyMODINIT_FUNC
PyInit_unicodedata(void)1454 PyInit_unicodedata(void)
1455 {
1456     PyObject *m, *v;
1457 
1458     Py_TYPE(&UCD_Type) = &PyType_Type;
1459 
1460     m = PyModule_Create(&unicodedatamodule);
1461     if (!m)
1462         return NULL;
1463 
1464     PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1465     Py_INCREF(&UCD_Type);
1466     PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1467 
1468     /* Previous versions */
1469     v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1470     if (v != NULL)
1471         PyModule_AddObject(m, "ucd_3_2_0", v);
1472 
1473     /* Export C API */
1474     v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
1475     if (v != NULL)
1476         PyModule_AddObject(m, "ucnhash_CAPI", v);
1477     return m;
1478 }
1479 
1480 /*
1481 Local variables:
1482 c-basic-offset: 4
1483 indent-tabs-mode: nil
1484 End:
1485 */
1486