1 /* ------------------------------------------------------------------------
2
3 unicodedata -- Provides access to the Unicode database.
4
5 Data was extracted from the UnicodeData.txt file.
6 The current version number is reported in the unidata_version constant.
7
8 Written by Marc-Andre Lemburg (mal@lemburg.com).
9 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
10 Modified by Martin v. Löwis (martin@v.loewis.de)
11
12 Copyright (c) Corporation for National Research Initiatives.
13
14 ------------------------------------------------------------------------ */
15
16 #define PY_SSIZE_T_CLEAN
17
18 #include "Python.h"
19 #include "ucnhash.h"
20 #include "structmember.h"
21
22 #include <stdbool.h>
23
24 _Py_IDENTIFIER(NFC);
25 _Py_IDENTIFIER(NFD);
26 _Py_IDENTIFIER(NFKC);
27 _Py_IDENTIFIER(NFKD);
28
29 /*[clinic input]
30 module unicodedata
31 class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
32 [clinic start generated code]*/
33 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
34
35 /* character properties */
36
/* Property record for one class of characters.  Records live in
   _PyUnicode_Database_Records and are looked up with _getrecord_ex(). */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                                   _PyUnicode_EastAsianWidthNames */
    const unsigned char normalization_quick_check; /* two quick-check bits per
                                                      normalization form; see
                                                      is_normalized_quickcheck() */
} _PyUnicode_DatabaseRecord;
48
/* Differences between the current database and an older Unicode version
   for one code point.  The accessors in this file treat 0xFF in an
   unsigned char field as "no override, use the current value", and treat
   category_changed == 0 as "unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    /* NOTE(review): not read by any code visible in this part of the file;
       its "unchanged" encoding should be confirmed against the generator. */
    const double numeric_changed;
} change_record;
58
59 /* data file generated by Tools/unicode/makeunicodedata.py */
60 #include "unicodedata_db.h"
61
62 static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)63 _getrecord_ex(Py_UCS4 code)
64 {
65 int index;
66 if (code >= 0x110000)
67 index = 0;
68 else {
69 index = index1[(code>>SHIFT)];
70 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
71 }
72
73 return &_PyUnicode_Database_Records[index];
74 }
75
76 /* ------------- Previous-version API ------------------------------------- */
/* Object that answers queries against an older version of the Unicode
   database.  Instances are created by new_previous_version(). */
typedef struct previous_version {
    PyObject_HEAD
    /* version string, exposed read-only as "unidata_version" */
    const char *name;
    /* returns the differences from the current database for a code point */
    const change_record* (*getrecord)(Py_UCS4);
    /* nfd_nfkd() substitutes the returned code point for its argument when
       the result is non-zero */
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;
83
84 #include "clinic/unicodedata.c.h"
85
/* Fetch the change record for code point v from an old-version object.
   self is cast to PreviousDBVersion* without any check, so callers test
   UCD_Check(self) first.  The argument is parenthesized so that the cast
   applies to the whole expression passed in. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))
87
/* Member table: exposes PreviousDBVersion.name as the read-only string
   attribute "unidata_version".  The {NULL} entry terminates the table. */
static PyMemberDef DB_members[] = {
        {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
        {NULL}
};
92
/* forward declaration */
static PyTypeObject UCD_Type;
/* True when o's type is exactly UCD_Type (pointer comparison, so subclasses
   do not match).  Throughout this file a true result selects the
   old-database code path. */
#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
96
97 static PyObject*
new_previous_version(const char * name,const change_record * (* getrecord)(Py_UCS4),Py_UCS4 (* normalization)(Py_UCS4))98 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
99 Py_UCS4 (*normalization)(Py_UCS4))
100 {
101 PreviousDBVersion *self;
102 self = PyObject_New(PreviousDBVersion, &UCD_Type);
103 if (self == NULL)
104 return NULL;
105 self->name = name;
106 self->getrecord = getrecord;
107 self->normalization = normalization;
108 return (PyObject*)self;
109 }
110
111
112 /* --- Module API --------------------------------------------------------- */
113
114 /*[clinic input]
115 unicodedata.UCD.decimal
116
117 self: self
118 chr: int(accept={str})
119 default: object=NULL
120 /
121
122 Converts a Unicode character into its equivalent decimal value.
123
124 Returns the decimal value assigned to the character chr as integer.
125 If no such value is defined, default is returned, or, if not given,
126 ValueError is raised.
127 [clinic start generated code]*/
128
129 static PyObject *
unicodedata_UCD_decimal_impl(PyObject * self,int chr,PyObject * default_value)130 unicodedata_UCD_decimal_impl(PyObject *self, int chr,
131 PyObject *default_value)
132 /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
133 {
134 int have_old = 0;
135 long rc;
136 Py_UCS4 c = (Py_UCS4)chr;
137
138 if (self && UCD_Check(self)) {
139 const change_record *old = get_old_record(self, c);
140 if (old->category_changed == 0) {
141 /* unassigned */
142 have_old = 1;
143 rc = -1;
144 }
145 else if (old->decimal_changed != 0xFF) {
146 have_old = 1;
147 rc = old->decimal_changed;
148 }
149 }
150
151 if (!have_old)
152 rc = Py_UNICODE_TODECIMAL(c);
153 if (rc < 0) {
154 if (default_value == NULL) {
155 PyErr_SetString(PyExc_ValueError,
156 "not a decimal");
157 return NULL;
158 }
159 else {
160 Py_INCREF(default_value);
161 return default_value;
162 }
163 }
164 return PyLong_FromLong(rc);
165 }
166
167 /*[clinic input]
168 unicodedata.UCD.digit
169
170 self: self
171 chr: int(accept={str})
172 default: object=NULL
173 /
174
175 Converts a Unicode character into its equivalent digit value.
176
177 Returns the digit value assigned to the character chr as integer.
178 If no such value is defined, default is returned, or, if not given,
179 ValueError is raised.
180 [clinic start generated code]*/
181
182 static PyObject *
unicodedata_UCD_digit_impl(PyObject * self,int chr,PyObject * default_value)183 unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
184 /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
185 {
186 long rc;
187 Py_UCS4 c = (Py_UCS4)chr;
188 rc = Py_UNICODE_TODIGIT(c);
189 if (rc < 0) {
190 if (default_value == NULL) {
191 PyErr_SetString(PyExc_ValueError, "not a digit");
192 return NULL;
193 }
194 else {
195 Py_INCREF(default_value);
196 return default_value;
197 }
198 }
199 return PyLong_FromLong(rc);
200 }
201
202 /*[clinic input]
203 unicodedata.UCD.numeric
204
205 self: self
206 chr: int(accept={str})
207 default: object=NULL
208 /
209
210 Converts a Unicode character into its equivalent numeric value.
211
212 Returns the numeric value assigned to the character chr as float.
213 If no such value is defined, default is returned, or, if not given,
214 ValueError is raised.
215 [clinic start generated code]*/
216
217 static PyObject *
unicodedata_UCD_numeric_impl(PyObject * self,int chr,PyObject * default_value)218 unicodedata_UCD_numeric_impl(PyObject *self, int chr,
219 PyObject *default_value)
220 /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
221 {
222 int have_old = 0;
223 double rc;
224 Py_UCS4 c = (Py_UCS4)chr;
225
226 if (self && UCD_Check(self)) {
227 const change_record *old = get_old_record(self, c);
228 if (old->category_changed == 0) {
229 /* unassigned */
230 have_old = 1;
231 rc = -1.0;
232 }
233 else if (old->decimal_changed != 0xFF) {
234 have_old = 1;
235 rc = old->decimal_changed;
236 }
237 }
238
239 if (!have_old)
240 rc = Py_UNICODE_TONUMERIC(c);
241 if (rc == -1.0) {
242 if (default_value == NULL) {
243 PyErr_SetString(PyExc_ValueError, "not a numeric character");
244 return NULL;
245 }
246 else {
247 Py_INCREF(default_value);
248 return default_value;
249 }
250 }
251 return PyFloat_FromDouble(rc);
252 }
253
254 /*[clinic input]
255 unicodedata.UCD.category
256
257 self: self
258 chr: int(accept={str})
259 /
260
261 Returns the general category assigned to the character chr as string.
262 [clinic start generated code]*/
263
264 static PyObject *
unicodedata_UCD_category_impl(PyObject * self,int chr)265 unicodedata_UCD_category_impl(PyObject *self, int chr)
266 /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
267 {
268 int index;
269 Py_UCS4 c = (Py_UCS4)chr;
270 index = (int) _getrecord_ex(c)->category;
271 if (self && UCD_Check(self)) {
272 const change_record *old = get_old_record(self, c);
273 if (old->category_changed != 0xFF)
274 index = old->category_changed;
275 }
276 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
277 }
278
279 /*[clinic input]
280 unicodedata.UCD.bidirectional
281
282 self: self
283 chr: int(accept={str})
284 /
285
286 Returns the bidirectional class assigned to the character chr as string.
287
288 If no such value is defined, an empty string is returned.
289 [clinic start generated code]*/
290
291 static PyObject *
unicodedata_UCD_bidirectional_impl(PyObject * self,int chr)292 unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
293 /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
294 {
295 int index;
296 Py_UCS4 c = (Py_UCS4)chr;
297 index = (int) _getrecord_ex(c)->bidirectional;
298 if (self && UCD_Check(self)) {
299 const change_record *old = get_old_record(self, c);
300 if (old->category_changed == 0)
301 index = 0; /* unassigned */
302 else if (old->bidir_changed != 0xFF)
303 index = old->bidir_changed;
304 }
305 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
306 }
307
308 /*[clinic input]
309 unicodedata.UCD.combining -> int
310
311 self: self
312 chr: int(accept={str})
313 /
314
315 Returns the canonical combining class assigned to the character chr as integer.
316
317 Returns 0 if no combining class is defined.
318 [clinic start generated code]*/
319
320 static int
unicodedata_UCD_combining_impl(PyObject * self,int chr)321 unicodedata_UCD_combining_impl(PyObject *self, int chr)
322 /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
323 {
324 int index;
325 Py_UCS4 c = (Py_UCS4)chr;
326 index = (int) _getrecord_ex(c)->combining;
327 if (self && UCD_Check(self)) {
328 const change_record *old = get_old_record(self, c);
329 if (old->category_changed == 0)
330 index = 0; /* unassigned */
331 }
332 return index;
333 }
334
335 /*[clinic input]
336 unicodedata.UCD.mirrored -> int
337
338 self: self
339 chr: int(accept={str})
340 /
341
342 Returns the mirrored property assigned to the character chr as integer.
343
344 Returns 1 if the character has been identified as a "mirrored"
345 character in bidirectional text, 0 otherwise.
346 [clinic start generated code]*/
347
348 static int
unicodedata_UCD_mirrored_impl(PyObject * self,int chr)349 unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
350 /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
351 {
352 int index;
353 Py_UCS4 c = (Py_UCS4)chr;
354 index = (int) _getrecord_ex(c)->mirrored;
355 if (self && UCD_Check(self)) {
356 const change_record *old = get_old_record(self, c);
357 if (old->category_changed == 0)
358 index = 0; /* unassigned */
359 else if (old->mirrored_changed != 0xFF)
360 index = old->mirrored_changed;
361 }
362 return index;
363 }
364
365 /*[clinic input]
366 unicodedata.UCD.east_asian_width
367
368 self: self
369 chr: int(accept={str})
370 /
371
372 Returns the east asian width assigned to the character chr as string.
373 [clinic start generated code]*/
374
375 static PyObject *
unicodedata_UCD_east_asian_width_impl(PyObject * self,int chr)376 unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
377 /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
378 {
379 int index;
380 Py_UCS4 c = (Py_UCS4)chr;
381 index = (int) _getrecord_ex(c)->east_asian_width;
382 if (self && UCD_Check(self)) {
383 const change_record *old = get_old_record(self, c);
384 if (old->category_changed == 0)
385 index = 0; /* unassigned */
386 else if (old->east_asian_width_changed != 0xFF)
387 index = old->east_asian_width_changed;
388 }
389 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
390 }
391
392 /*[clinic input]
393 unicodedata.UCD.decomposition
394
395 self: self
396 chr: int(accept={str})
397 /
398
399 Returns the character decomposition mapping assigned to the character chr as string.
400
401 An empty string is returned in case no such mapping is defined.
402 [clinic start generated code]*/
403
404 static PyObject *
unicodedata_UCD_decomposition_impl(PyObject * self,int chr)405 unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
406 /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
407 {
408 char decomp[256];
409 int code, index, count;
410 size_t i;
411 unsigned int prefix_index;
412 Py_UCS4 c = (Py_UCS4)chr;
413
414 code = (int)c;
415
416 if (self && UCD_Check(self)) {
417 const change_record *old = get_old_record(self, c);
418 if (old->category_changed == 0)
419 return PyUnicode_FromString(""); /* unassigned */
420 }
421
422 if (code < 0 || code >= 0x110000)
423 index = 0;
424 else {
425 index = decomp_index1[(code>>DECOMP_SHIFT)];
426 index = decomp_index2[(index<<DECOMP_SHIFT)+
427 (code&((1<<DECOMP_SHIFT)-1))];
428 }
429
430 /* high byte is number of hex bytes (usually one or two), low byte
431 is prefix code (from*/
432 count = decomp_data[index] >> 8;
433
434 /* XXX: could allocate the PyString up front instead
435 (strlen(prefix) + 5 * count + 1 bytes) */
436
437 /* Based on how index is calculated above and decomp_data is generated
438 from Tools/unicode/makeunicodedata.py, it should not be possible
439 to overflow decomp_prefix. */
440 prefix_index = decomp_data[index] & 255;
441 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
442
443 /* copy prefix */
444 i = strlen(decomp_prefix[prefix_index]);
445 memcpy(decomp, decomp_prefix[prefix_index], i);
446
447 while (count-- > 0) {
448 if (i)
449 decomp[i++] = ' ';
450 assert(i < sizeof(decomp));
451 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
452 decomp_data[++index]);
453 i += strlen(decomp + i);
454 }
455 return PyUnicode_FromStringAndSize(decomp, i);
456 }
457
458 static void
get_decomp_record(PyObject * self,Py_UCS4 code,int * index,int * prefix,int * count)459 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
460 {
461 if (code >= 0x110000) {
462 *index = 0;
463 } else if (self && UCD_Check(self) &&
464 get_old_record(self, code)->category_changed==0) {
465 /* unassigned in old version */
466 *index = 0;
467 }
468 else {
469 *index = decomp_index1[(code>>DECOMP_SHIFT)];
470 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
471 (code&((1<<DECOMP_SHIFT)-1))];
472 }
473
474 /* high byte is number of hex bytes (usually one or two), low byte
475 is prefix code (from*/
476 *count = decomp_data[*index] >> 8;
477 *prefix = decomp_data[*index] & 255;
478
479 (*index)++;
480 }
481
/* Constants for arithmetic Hangul syllable composition/decomposition.
   SBase is the first precomposed syllable; LBase and VBase are the first
   leading consonant and vowel.  TBase is one below the first trailing
   consonant, so a trailing index of 0 means "no trailing consonant". */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)   /* syllables per leading consonant */
#define SCount  (LCount*NCount)   /* total precomposed syllables */
491
492 static PyObject*
nfd_nfkd(PyObject * self,PyObject * input,int k)493 nfd_nfkd(PyObject *self, PyObject *input, int k)
494 {
495 PyObject *result;
496 Py_UCS4 *output;
497 Py_ssize_t i, o, osize;
498 int kind;
499 void *data;
500 /* Longest decomposition in Unicode 3.2: U+FDFA */
501 Py_UCS4 stack[20];
502 Py_ssize_t space, isize;
503 int index, prefix, count, stackptr;
504 unsigned char prev, cur;
505
506 stackptr = 0;
507 isize = PyUnicode_GET_LENGTH(input);
508 space = isize;
509 /* Overallocate at most 10 characters. */
510 if (space > 10) {
511 if (space <= PY_SSIZE_T_MAX - 10)
512 space += 10;
513 }
514 else {
515 space *= 2;
516 }
517 osize = space;
518 output = PyMem_NEW(Py_UCS4, space);
519 if (!output) {
520 PyErr_NoMemory();
521 return NULL;
522 }
523 i = o = 0;
524 kind = PyUnicode_KIND(input);
525 data = PyUnicode_DATA(input);
526
527 while (i < isize) {
528 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
529 while(stackptr) {
530 Py_UCS4 code = stack[--stackptr];
531 /* Hangul Decomposition adds three characters in
532 a single step, so we need at least that much room. */
533 if (space < 3) {
534 Py_UCS4 *new_output;
535 osize += 10;
536 space += 10;
537 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
538 if (new_output == NULL) {
539 PyMem_Free(output);
540 PyErr_NoMemory();
541 return NULL;
542 }
543 output = new_output;
544 }
545 /* Hangul Decomposition. */
546 if (SBase <= code && code < (SBase+SCount)) {
547 int SIndex = code - SBase;
548 int L = LBase + SIndex / NCount;
549 int V = VBase + (SIndex % NCount) / TCount;
550 int T = TBase + SIndex % TCount;
551 output[o++] = L;
552 output[o++] = V;
553 space -= 2;
554 if (T != TBase) {
555 output[o++] = T;
556 space --;
557 }
558 continue;
559 }
560 /* normalization changes */
561 if (self && UCD_Check(self)) {
562 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
563 if (value != 0) {
564 stack[stackptr++] = value;
565 continue;
566 }
567 }
568
569 /* Other decompositions. */
570 get_decomp_record(self, code, &index, &prefix, &count);
571
572 /* Copy character if it is not decomposable, or has a
573 compatibility decomposition, but we do NFD. */
574 if (!count || (prefix && !k)) {
575 output[o++] = code;
576 space--;
577 continue;
578 }
579 /* Copy decomposition onto the stack, in reverse
580 order. */
581 while(count) {
582 code = decomp_data[index + (--count)];
583 stack[stackptr++] = code;
584 }
585 }
586 }
587
588 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
589 output, o);
590 PyMem_Free(output);
591 if (!result)
592 return NULL;
593 /* result is guaranteed to be ready, as it is compact. */
594 kind = PyUnicode_KIND(result);
595 data = PyUnicode_DATA(result);
596
597 /* Sort canonically. */
598 i = 0;
599 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
600 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
601 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
602 if (prev == 0 || cur == 0 || prev <= cur) {
603 prev = cur;
604 continue;
605 }
606 /* Non-canonical order. Need to switch *i with previous. */
607 o = i - 1;
608 while (1) {
609 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
610 PyUnicode_WRITE(kind, data, o+1,
611 PyUnicode_READ(kind, data, o));
612 PyUnicode_WRITE(kind, data, o, tmp);
613 o--;
614 if (o < 0)
615 break;
616 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
617 if (prev == 0 || prev <= cur)
618 break;
619 }
620 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
621 }
622 return result;
623 }
624
625 static int
find_nfc_index(PyObject * self,struct reindex * nfc,Py_UCS4 code)626 find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
627 {
628 unsigned int index;
629 for (index = 0; nfc[index].start; index++) {
630 unsigned int start = nfc[index].start;
631 if (code < start)
632 return -1;
633 if (code <= start + nfc[index].count) {
634 unsigned int delta = code - start;
635 return nfc[index].index + delta;
636 }
637 }
638 return -1;
639 }
640
641 static PyObject*
nfc_nfkc(PyObject * self,PyObject * input,int k)642 nfc_nfkc(PyObject *self, PyObject *input, int k)
643 {
644 PyObject *result;
645 int kind;
646 void *data;
647 Py_UCS4 *output;
648 Py_ssize_t i, i1, o, len;
649 int f,l,index,index1,comb;
650 Py_UCS4 code;
651 Py_ssize_t skipped[20];
652 int cskipped = 0;
653
654 result = nfd_nfkd(self, input, k);
655 if (!result)
656 return NULL;
657 /* result will be "ready". */
658 kind = PyUnicode_KIND(result);
659 data = PyUnicode_DATA(result);
660 len = PyUnicode_GET_LENGTH(result);
661
662 /* We allocate a buffer for the output.
663 If we find that we made no changes, we still return
664 the NFD result. */
665 output = PyMem_NEW(Py_UCS4, len);
666 if (!output) {
667 PyErr_NoMemory();
668 Py_DECREF(result);
669 return 0;
670 }
671 i = o = 0;
672
673 again:
674 while (i < len) {
675 for (index = 0; index < cskipped; index++) {
676 if (skipped[index] == i) {
677 /* *i character is skipped.
678 Remove from list. */
679 skipped[index] = skipped[cskipped-1];
680 cskipped--;
681 i++;
682 goto again; /* continue while */
683 }
684 }
685 /* Hangul Composition. We don't need to check for <LV,T>
686 pairs, since we always have decomposed data. */
687 code = PyUnicode_READ(kind, data, i);
688 if (LBase <= code && code < (LBase+LCount) &&
689 i + 1 < len &&
690 VBase <= PyUnicode_READ(kind, data, i+1) &&
691 PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
692 /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
693 and V character is a modern vowel (0x1161 ~ 0x1175). */
694 int LIndex, VIndex;
695 LIndex = code - LBase;
696 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
697 code = SBase + (LIndex*VCount+VIndex)*TCount;
698 i+=2;
699 if (i < len &&
700 TBase < PyUnicode_READ(kind, data, i) &&
701 PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
702 /* check T character is a modern trailing consonant
703 (0x11A8 ~ 0x11C2). */
704 code += PyUnicode_READ(kind, data, i)-TBase;
705 i++;
706 }
707 output[o++] = code;
708 continue;
709 }
710
711 /* code is still input[i] here */
712 f = find_nfc_index(self, nfc_first, code);
713 if (f == -1) {
714 output[o++] = code;
715 i++;
716 continue;
717 }
718 /* Find next unblocked character. */
719 i1 = i+1;
720 comb = 0;
721 /* output base character for now; might be updated later. */
722 output[o] = PyUnicode_READ(kind, data, i);
723 while (i1 < len) {
724 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
725 int comb1 = _getrecord_ex(code1)->combining;
726 if (comb) {
727 if (comb1 == 0)
728 break;
729 if (comb >= comb1) {
730 /* Character is blocked. */
731 i1++;
732 continue;
733 }
734 }
735 l = find_nfc_index(self, nfc_last, code1);
736 /* i1 cannot be combined with i. If i1
737 is a starter, we don't need to look further.
738 Otherwise, record the combining class. */
739 if (l == -1) {
740 not_combinable:
741 if (comb1 == 0)
742 break;
743 comb = comb1;
744 i1++;
745 continue;
746 }
747 index = f*TOTAL_LAST + l;
748 index1 = comp_index[index >> COMP_SHIFT];
749 code = comp_data[(index1<<COMP_SHIFT)+
750 (index&((1<<COMP_SHIFT)-1))];
751 if (code == 0)
752 goto not_combinable;
753
754 /* Replace the original character. */
755 output[o] = code;
756 /* Mark the second character unused. */
757 assert(cskipped < 20);
758 skipped[cskipped++] = i1;
759 i1++;
760 f = find_nfc_index(self, nfc_first, output[o]);
761 if (f == -1)
762 break;
763 }
764 /* Output character was already written.
765 Just advance the indices. */
766 o++; i++;
767 }
768 if (o == len) {
769 /* No changes. Return original string. */
770 PyMem_Free(output);
771 return result;
772 }
773 Py_DECREF(result);
774 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
775 output, o);
776 PyMem_Free(output);
777 return result;
778 }
779
// This needs to match the logic in makeunicodedata.py
// which constructs the quickcheck data.
// YES: certainly normalized; NO: certainly not; MAYBE: quickcheck cannot
// tell.  The same numeric values are stored, two bits per normalization
// form, in _PyUnicode_DatabaseRecord.normalization_quick_check.
typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
783
784 /* Run the Unicode normalization "quickcheck" algorithm.
785 *
786 * Return YES or NO if quickcheck determines the input is certainly
787 * normalized or certainly not, and MAYBE if quickcheck is unable to
788 * tell.
789 *
790 * If `yes_only` is true, then return MAYBE as soon as we determine
791 * the answer is not YES.
792 *
793 * For background and details on the algorithm, see UAX #15:
794 * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
795 */
796 static QuickcheckResult
is_normalized_quickcheck(PyObject * self,PyObject * input,int nfc,int k,bool yes_only)797 is_normalized_quickcheck(PyObject *self, PyObject *input,
798 int nfc, int k, bool yes_only)
799 {
800 /* An older version of the database is requested, quickchecks must be
801 disabled. */
802 if (self && UCD_Check(self))
803 return NO;
804
805 Py_ssize_t i, len;
806 int kind;
807 void *data;
808 unsigned char prev_combining = 0;
809
810 /* The two quickcheck bits at this shift have type QuickcheckResult. */
811 int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
812
813 QuickcheckResult result = YES; /* certainly normalized, unless we find something */
814
815 i = 0;
816 kind = PyUnicode_KIND(input);
817 data = PyUnicode_DATA(input);
818 len = PyUnicode_GET_LENGTH(input);
819 while (i < len) {
820 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
821 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
822
823 unsigned char combining = record->combining;
824 if (combining && prev_combining > combining)
825 return NO; /* non-canonical sort order, not normalized */
826 prev_combining = combining;
827
828 unsigned char quickcheck_whole = record->normalization_quick_check;
829 if (yes_only) {
830 if (quickcheck_whole & (3 << quickcheck_shift))
831 return MAYBE;
832 } else {
833 switch ((quickcheck_whole >> quickcheck_shift) & 3) {
834 case NO:
835 return NO;
836 case MAYBE:
837 result = MAYBE; /* this string might need normalization */
838 }
839 }
840 }
841 return result;
842 }
843
844 /*[clinic input]
845 unicodedata.UCD.is_normalized
846
847 self: self
848 form: unicode
849 unistr as input: unicode
850 /
851
852 Return whether the Unicode string unistr is in the normal form 'form'.
853
854 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
855 [clinic start generated code]*/
856
857 static PyObject *
unicodedata_UCD_is_normalized_impl(PyObject * self,PyObject * form,PyObject * input)858 unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
859 PyObject *input)
860 /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
861 {
862 if (PyUnicode_READY(input) == -1) {
863 return NULL;
864 }
865
866 if (PyUnicode_GET_LENGTH(input) == 0) {
867 /* special case empty input strings. */
868 Py_RETURN_TRUE;
869 }
870
871 PyObject *result;
872 int nfc = 0;
873 int k = 0;
874 QuickcheckResult m;
875
876 PyObject *cmp;
877 int match = 0;
878
879 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
880 nfc = 1;
881 }
882 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
883 nfc = 1;
884 k = 1;
885 }
886 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
887 /* matches default values for `nfc` and `k` */
888 }
889 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
890 k = 1;
891 }
892 else {
893 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
894 return NULL;
895 }
896
897 m = is_normalized_quickcheck(self, input, nfc, k, false);
898
899 if (m == MAYBE) {
900 cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
901 if (cmp == NULL) {
902 return NULL;
903 }
904 match = PyUnicode_Compare(input, cmp);
905 Py_DECREF(cmp);
906 result = (match == 0) ? Py_True : Py_False;
907 }
908 else {
909 result = (m == YES) ? Py_True : Py_False;
910 }
911
912 Py_INCREF(result);
913 return result;
914 }
915
916
917 /*[clinic input]
918 unicodedata.UCD.normalize
919
920 self: self
921 form: unicode
922 unistr as input: unicode
923 /
924
925 Return the normal form 'form' for the Unicode string unistr.
926
927 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
928 [clinic start generated code]*/
929
930 static PyObject *
unicodedata_UCD_normalize_impl(PyObject * self,PyObject * form,PyObject * input)931 unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
932 PyObject *input)
933 /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
934 {
935 if (PyUnicode_GET_LENGTH(input) == 0) {
936 /* Special case empty input strings, since resizing
937 them later would cause internal errors. */
938 Py_INCREF(input);
939 return input;
940 }
941
942 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
943 if (is_normalized_quickcheck(self, input, 1, 0, true) == YES) {
944 Py_INCREF(input);
945 return input;
946 }
947 return nfc_nfkc(self, input, 0);
948 }
949 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
950 if (is_normalized_quickcheck(self, input, 1, 1, true) == YES) {
951 Py_INCREF(input);
952 return input;
953 }
954 return nfc_nfkc(self, input, 1);
955 }
956 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
957 if (is_normalized_quickcheck(self, input, 0, 0, true) == YES) {
958 Py_INCREF(input);
959 return input;
960 }
961 return nfd_nfkd(self, input, 0);
962 }
963 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
964 if (is_normalized_quickcheck(self, input, 0, 1, true) == YES) {
965 Py_INCREF(input);
966 return input;
967 }
968 return nfd_nfkd(self, input, 1);
969 }
970 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
971 return NULL;
972 }
973
974 /* -------------------------------------------------------------------- */
975 /* unicode character name tables */
976
977 /* data file generated by Tools/unicode/makeunicodedata.py */
978 #include "unicodename_db.h"
979
980 /* -------------------------------------------------------------------- */
981 /* database code (cut and pasted from the unidb package) */
982
/* Hash the first len bytes of s, ignoring case (each byte is upper-cased
   with Py_TOUPPER before it is mixed in).  Whenever the running value
   spills into bits 24-31, those bits are XORed back into the low byte and
   the value is cut down to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos;

    for (pos = 0; pos < len; pos++) {
        unsigned long spill;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        spill = hash & 0xff000000;
        if (spill != 0) {
            hash = (hash ^ ((spill>>24) & 0xff)) & 0x00ffffff;
        }
    }
    return hash;
}
997
/* Name fragments used to spell Hangul syllable names.  Column 0 is indexed
   by the leading-consonant index, column 1 by the vowel index and column 2
   by the trailing-consonant index (see _getucname()); row 0 of column 2 is
   the empty string for "no trailing consonant".  The columns have
   different lengths, so unused cells are 0. */
static const char * const hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N", },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};
1028
1029 /* These ranges need to match makeunicodedata.py:cjk_ranges. */
1030 static int
is_unified_ideograph(Py_UCS4 code)1031 is_unified_ideograph(Py_UCS4 code)
1032 {
1033 return
1034 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
1035 (0x4E00 <= code && code <= 0x9FEF) || /* CJK Ideograph */
1036 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
1037 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
1038 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
1039 (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
1040 (0x2CEB0 <= code && code <= 0x2EBEF); /* CJK Ideograph Extension F */
1041 }
1042
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.  Each range is
 * half-open: start inclusive, end exclusive.  The argument is parenthesized
 * so any expression can be passed, but it is evaluated twice, so it must
 * not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
1048
1049 static int
_getucname(PyObject * self,Py_UCS4 code,char * buffer,int buflen,int with_alias_and_seq)1050 _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
1051 int with_alias_and_seq)
1052 {
1053 /* Find the name associated with the given code point.
1054 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1055 * that we are using for aliases and named sequences. */
1056 int offset;
1057 int i;
1058 int word;
1059 const unsigned char* w;
1060
1061 if (code >= 0x110000)
1062 return 0;
1063
1064 /* XXX should we just skip all the code points in the PUAs here? */
1065 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1066 return 0;
1067
1068 if (self && UCD_Check(self)) {
1069 /* in 3.2.0 there are no aliases and named sequences */
1070 const change_record *old;
1071 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1072 return 0;
1073 old = get_old_record(self, code);
1074 if (old->category_changed == 0) {
1075 /* unassigned */
1076 return 0;
1077 }
1078 }
1079
1080 if (SBase <= code && code < SBase+SCount) {
1081 /* Hangul syllable. */
1082 int SIndex = code - SBase;
1083 int L = SIndex / NCount;
1084 int V = (SIndex % NCount) / TCount;
1085 int T = SIndex % TCount;
1086
1087 if (buflen < 27)
1088 /* Worst case: HANGUL SYLLABLE <10chars>. */
1089 return 0;
1090 strcpy(buffer, "HANGUL SYLLABLE ");
1091 buffer += 16;
1092 strcpy(buffer, hangul_syllables[L][0]);
1093 buffer += strlen(hangul_syllables[L][0]);
1094 strcpy(buffer, hangul_syllables[V][1]);
1095 buffer += strlen(hangul_syllables[V][1]);
1096 strcpy(buffer, hangul_syllables[T][2]);
1097 buffer += strlen(hangul_syllables[T][2]);
1098 *buffer = '\0';
1099 return 1;
1100 }
1101
1102 if (is_unified_ideograph(code)) {
1103 if (buflen < 28)
1104 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1105 return 0;
1106 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1107 return 1;
1108 }
1109
1110 /* get offset into phrasebook */
1111 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1112 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1113 (code&((1<<phrasebook_shift)-1))];
1114 if (!offset)
1115 return 0;
1116
1117 i = 0;
1118
1119 for (;;) {
1120 /* get word index */
1121 word = phrasebook[offset] - phrasebook_short;
1122 if (word >= 0) {
1123 word = (word << 8) + phrasebook[offset+1];
1124 offset += 2;
1125 } else
1126 word = phrasebook[offset++];
1127 if (i) {
1128 if (i > buflen)
1129 return 0; /* buffer overflow */
1130 buffer[i++] = ' ';
1131 }
1132 /* copy word string from lexicon. the last character in the
1133 word has bit 7 set. the last word in a string ends with
1134 0x80 */
1135 w = lexicon + lexicon_offset[word];
1136 while (*w < 128) {
1137 if (i >= buflen)
1138 return 0; /* buffer overflow */
1139 buffer[i++] = *w++;
1140 }
1141 if (i >= buflen)
1142 return 0; /* buffer overflow */
1143 buffer[i++] = *w & 127;
1144 if (*w == 128)
1145 break; /* end of word */
1146 }
1147
1148 return 1;
1149 }
1150
1151 static int
_cmpname(PyObject * self,int code,const char * name,int namelen)1152 _cmpname(PyObject *self, int code, const char* name, int namelen)
1153 {
1154 /* check if code corresponds to the given name */
1155 int i;
1156 char buffer[NAME_MAXLEN+1];
1157 if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
1158 return 0;
1159 for (i = 0; i < namelen; i++) {
1160 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
1161 return 0;
1162 }
1163 return buffer[namelen] == '\0';
1164 }
1165
1166 static void
find_syllable(const char * str,int * len,int * pos,int count,int column)1167 find_syllable(const char *str, int *len, int *pos, int count, int column)
1168 {
1169 int i, len1;
1170 *len = -1;
1171 for (i = 0; i < count; i++) {
1172 const char *s = hangul_syllables[i][column];
1173 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1174 if (len1 <= *len)
1175 continue;
1176 if (strncmp(str, s, len1) == 0) {
1177 *len = len1;
1178 *pos = i;
1179 }
1180 }
1181 if (*len == -1) {
1182 *len = 0;
1183 }
1184 }
1185
1186 static int
_check_alias_and_seq(unsigned int cp,Py_UCS4 * code,int with_named_seq)1187 _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
1188 {
1189 /* check if named sequences are allowed */
1190 if (!with_named_seq && IS_NAMED_SEQ(cp))
1191 return 0;
1192 /* if the code point is in the PUA range that we use for aliases,
1193 * convert it to obtain the right code point */
1194 if (IS_ALIAS(cp))
1195 *code = name_aliases[cp-aliases_start];
1196 else
1197 *code = cp;
1198 return 1;
1199 }
1200
1201 static int
_getcode(PyObject * self,const char * name,int namelen,Py_UCS4 * code,int with_named_seq)1202 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1203 int with_named_seq)
1204 {
1205 /* Return the code point associated with the given name.
1206 * Named aliases are resolved too (unless self != NULL (i.e. we are using
1207 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
1208 * using for the named sequence, and the caller must then convert it. */
1209 unsigned int h, v;
1210 unsigned int mask = code_size-1;
1211 unsigned int i, incr;
1212
1213 /* Check for hangul syllables. */
1214 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1215 int len, L = -1, V = -1, T = -1;
1216 const char *pos = name + 16;
1217 find_syllable(pos, &len, &L, LCount, 0);
1218 pos += len;
1219 find_syllable(pos, &len, &V, VCount, 1);
1220 pos += len;
1221 find_syllable(pos, &len, &T, TCount, 2);
1222 pos += len;
1223 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1224 *code = SBase + (L*VCount+V)*TCount + T;
1225 return 1;
1226 }
1227 /* Otherwise, it's an illegal syllable name. */
1228 return 0;
1229 }
1230
1231 /* Check for unified ideographs. */
1232 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1233 /* Four or five hexdigits must follow. */
1234 v = 0;
1235 name += 22;
1236 namelen -= 22;
1237 if (namelen != 4 && namelen != 5)
1238 return 0;
1239 while (namelen--) {
1240 v *= 16;
1241 if (*name >= '0' && *name <= '9')
1242 v += *name - '0';
1243 else if (*name >= 'A' && *name <= 'F')
1244 v += *name - 'A' + 10;
1245 else
1246 return 0;
1247 name++;
1248 }
1249 if (!is_unified_ideograph(v))
1250 return 0;
1251 *code = v;
1252 return 1;
1253 }
1254
1255 /* the following is the same as python's dictionary lookup, with
1256 only minor changes. see the makeunicodedata script for more
1257 details */
1258
1259 h = (unsigned int) _gethash(name, namelen, code_magic);
1260 i = (~h) & mask;
1261 v = code_hash[i];
1262 if (!v)
1263 return 0;
1264 if (_cmpname(self, v, name, namelen))
1265 return _check_alias_and_seq(v, code, with_named_seq);
1266 incr = (h ^ (h >> 3)) & mask;
1267 if (!incr)
1268 incr = mask;
1269 for (;;) {
1270 i = (i + incr) & mask;
1271 v = code_hash[i];
1272 if (!v)
1273 return 0;
1274 if (_cmpname(self, v, name, namelen))
1275 return _check_alias_and_seq(v, code, with_named_seq);
1276 incr = incr << 1;
1277 if (incr > mask)
1278 incr = incr ^ code_poly;
1279 }
1280 }
1281
/* Name lookup function table.  PyInit_unicodedata() wraps a pointer to it
 * in a capsule and publishes it as the module attribute "ucnhash_CAPI".
 * The first entry is the size of the structure itself; the remaining
 * entries are the code point -> name and name -> code point functions. */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};
1288
1289 /* -------------------------------------------------------------------- */
1290 /* Python bindings */
1291
1292 /*[clinic input]
1293 unicodedata.UCD.name
1294
1295 self: self
1296 chr: int(accept={str})
1297 default: object=NULL
1298 /
1299
1300 Returns the name assigned to the character chr as a string.
1301
1302 If no name is defined, default is returned, or, if not given,
1303 ValueError is raised.
1304 [clinic start generated code]*/
1305
1306 static PyObject *
unicodedata_UCD_name_impl(PyObject * self,int chr,PyObject * default_value)1307 unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
1308 /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
1309 {
1310 char name[NAME_MAXLEN+1];
1311 Py_UCS4 c = (Py_UCS4)chr;
1312
1313 if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
1314 if (default_value == NULL) {
1315 PyErr_SetString(PyExc_ValueError, "no such name");
1316 return NULL;
1317 }
1318 else {
1319 Py_INCREF(default_value);
1320 return default_value;
1321 }
1322 }
1323
1324 return PyUnicode_FromString(name);
1325 }
1326
1327 /*[clinic input]
1328 unicodedata.UCD.lookup
1329
1330 self: self
1331 name: str(accept={str, robuffer}, zeroes=True)
1332 /
1333
1334 Look up character by name.
1335
1336 If a character with the given name is found, return the
1337 corresponding character. If not found, KeyError is raised.
1338 [clinic start generated code]*/
1339
1340 static PyObject *
unicodedata_UCD_lookup_impl(PyObject * self,const char * name,Py_ssize_clean_t name_length)1341 unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1342 Py_ssize_clean_t name_length)
1343 /*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
1344 {
1345 Py_UCS4 code;
1346 unsigned int index;
1347 if (name_length > NAME_MAXLEN) {
1348 PyErr_SetString(PyExc_KeyError, "name too long");
1349 return NULL;
1350 }
1351
1352 if (!_getcode(self, name, (int)name_length, &code, 1)) {
1353 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1354 return NULL;
1355 }
1356 /* check if code is in the PUA range that we use for named sequences
1357 and convert it */
1358 if (IS_NAMED_SEQ(code)) {
1359 index = code-named_sequences_start;
1360 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1361 named_sequences[index].seq,
1362 named_sequences[index].seqlen);
1363 }
1364 return PyUnicode_FromOrdinal(code);
1365 }
1366
1367 /* XXX Add doc strings. */
1368
/* Method table.  It is used twice: as the tp_methods of UCD_Type and as the
 * function table of the unicodedata module definition.  Each entry is a
 * *_METHODDEF macro; no commas are written between the macros here. */
static PyMethodDef unicodedata_functions[] = {
    UNICODEDATA_UCD_DECIMAL_METHODDEF
    UNICODEDATA_UCD_DIGIT_METHODDEF
    UNICODEDATA_UCD_NUMERIC_METHODDEF
    UNICODEDATA_UCD_CATEGORY_METHODDEF
    UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
    UNICODEDATA_UCD_COMBINING_METHODDEF
    UNICODEDATA_UCD_MIRRORED_METHODDEF
    UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
    UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
    UNICODEDATA_UCD_NAME_METHODDEF
    UNICODEDATA_UCD_LOOKUP_METHODDEF
    UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
    UNICODEDATA_UCD_NORMALIZE_METHODDEF
    {NULL, NULL}                /* sentinel */
};
1385
/* Type object exposed as unicodedata.UCD.  Its instances have the size of
 * PreviousDBVersion, are released with PyObject_Del, and offer the methods
 * of unicodedata_functions plus the members listed in DB_members.
 * NOTE(review): presumably new_previous_version() creates the instances
 * (e.g. ucd_3_2_0) -- confirm against its definition. */
static PyTypeObject UCD_Type = {
    /* The ob_type field must be initialized in the module init function
     * to be portable to Windows without using C++. */
    PyVarObject_HEAD_INIT(NULL, 0)
    "unicodedata.UCD",          /*tp_name*/
    sizeof(PreviousDBVersion),  /*tp_basicsize*/
    0,                          /*tp_itemsize*/
    /* methods */
    (destructor)PyObject_Del,   /*tp_dealloc*/
    0,                          /*tp_vectorcall_offset*/
    0,                          /*tp_getattr*/
    0,                          /*tp_setattr*/
    0,                          /*tp_as_async*/
    0,                          /*tp_repr*/
    0,                          /*tp_as_number*/
    0,                          /*tp_as_sequence*/
    0,                          /*tp_as_mapping*/
    0,                          /*tp_hash*/
    0,                          /*tp_call*/
    0,                          /*tp_str*/
    PyObject_GenericGetAttr,/*tp_getattro*/
    0,                          /*tp_setattro*/
    0,                          /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
    0,                          /*tp_doc*/
    0,                          /*tp_traverse*/
    0,                          /*tp_clear*/
    0,                          /*tp_richcompare*/
    0,                          /*tp_weaklistoffset*/
    0,                          /*tp_iter*/
    0,                          /*tp_iternext*/
    unicodedata_functions,      /*tp_methods*/
    DB_members,                 /*tp_members*/
    0,                          /*tp_getset*/
    0,                          /*tp_base*/
    0,                          /*tp_dict*/
    0,                          /*tp_descr_get*/
    0,                          /*tp_descr_set*/
    0,                          /*tp_dictoffset*/
    0,                          /*tp_init*/
    0,                          /*tp_alloc*/
    0,                          /*tp_new*/
    0,                          /*tp_free*/
    0,                          /*tp_is_gc*/
};
1431
/* Module docstring.  UNIDATA_VERSION is spliced into the text twice by
   adjacent string literal concatenation. */
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
" UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format " UNIDATA_VERSION ".");
1440
/* Module definition passed to PyModule_Create() in PyInit_unicodedata().
 * The module-level functions are the same table that UCD_Type uses for
 * its methods. */
static struct PyModuleDef unicodedatamodule = {
    PyModuleDef_HEAD_INIT,
    "unicodedata",              /* m_name */
    unicodedata_docstring,      /* m_doc */
    -1,                         /* m_size */
    unicodedata_functions,      /* m_methods */
    NULL,                       /* m_slots */
    NULL,                       /* m_traverse */
    NULL,                       /* m_clear */
    NULL                        /* m_free */
};
1452
1453 PyMODINIT_FUNC
PyInit_unicodedata(void)1454 PyInit_unicodedata(void)
1455 {
1456 PyObject *m, *v;
1457
1458 Py_TYPE(&UCD_Type) = &PyType_Type;
1459
1460 m = PyModule_Create(&unicodedatamodule);
1461 if (!m)
1462 return NULL;
1463
1464 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1465 Py_INCREF(&UCD_Type);
1466 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1467
1468 /* Previous versions */
1469 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1470 if (v != NULL)
1471 PyModule_AddObject(m, "ucd_3_2_0", v);
1472
1473 /* Export C API */
1474 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
1475 if (v != NULL)
1476 PyModule_AddObject(m, "ucnhash_CAPI", v);
1477 return m;
1478 }
1479
1480 /*
1481 Local variables:
1482 c-basic-offset: 4
1483 indent-tabs-mode: nil
1484 End:
1485 */
1486