1 #ifndef Py_CPYTHON_UNICODEOBJECT_H
2 #  error "this header file must not be included directly"
3 #endif
4 
/* Py_UNICODE used to be the native storage unit (one code unit) of the
   Unicode type.  Since PEP 393 it is deprecated and is nothing more than
   an alias for wchar_t. */
#define PY_UNICODE_TYPE wchar_t
/* Py_DEPRECATED(3.3) */ typedef PY_UNICODE_TYPE Py_UNICODE;
11 
12 /* --- Internal Unicode Operations ---------------------------------------- */
13 
/* Default USE_UNICODE_WCHAR_CACHE to 1 unless it was already defined. */
#if !defined(USE_UNICODE_WCHAR_CACHE)
#  define USE_UNICODE_WCHAR_CACHE 1
#endif /* !defined(USE_UNICODE_WCHAR_CACHE) */
17 
/* Whitespace test.  Splitting on whitespace is an important use case and
   the whitespace met in practice is almost always ASCII, so code points
   below 128 are answered by an inlined lookup in the _Py_ascii_whitespace
   table; everything else is delegated to _PyUnicode_IsWhitespace().
   Note that ch is evaluated more than once. */
#define Py_UNICODE_ISSPACE(ch) \
    ((Py_UCS4)(ch) >= 128U ? _PyUnicode_IsWhitespace(ch) : _Py_ascii_whitespace[(ch)])
26 
/* Case and line-break predicates: thin aliases for the _PyUnicode_Is*()
   helpers. */
#define Py_UNICODE_ISLOWER(ch)      _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch)      _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch)      _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch)  _PyUnicode_IsLinebreak(ch)

/* Case conversions. */
#define Py_UNICODE_TOLOWER(ch)      _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch)      _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch)      _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch)    _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch)      _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch)    _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch)  _PyUnicode_IsPrintable(ch)

/* Numeric value extraction. */
#define Py_UNICODE_TODECIMAL(ch)    _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch)      _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch)    _PyUnicode_ToNumeric(ch)

/* Alphabetic predicate. */
#define Py_UNICODE_ISALPHA(ch)      _PyUnicode_IsAlpha(ch)
46 
/* True when ch is alphabetic, a decimal digit, a digit or numeric.  The
   four predicates are tried in that order and evaluation stops at the first
   one that succeeds, so ch may be evaluated up to four times. */
#define Py_UNICODE_ISALNUM(ch)          \
    (_PyUnicode_IsAlpha(ch)          || \
     _PyUnicode_IsDecimalDigit(ch)   || \
     _PyUnicode_IsDigit(ch)          || \
     _PyUnicode_IsNumeric(ch))
52 
/* Surrogate range tests.  ch is evaluated twice in each macro.
     any surrogate:   U+D800..U+DFFF
     high surrogate:  U+D800..U+DBFF
     low surrogate:   U+DC00..U+DFFF */
#define Py_UNICODE_IS_SURROGATE(ch)      ((ch) >= 0xD800 && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) ((ch) >= 0xD800 && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch)  ((ch) >= 0xDC00 && (ch) <= 0xDFFF)
/* Combine a high and a low surrogate into the single Py_UCS4 code point
   they encode: the low 10 bits of each half are concatenated and the
   result is offset by 0x10000. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)       \
    (0x10000 +                                      \
     ((((Py_UCS4)(high) & 0x03FF) << 10) |          \
      ((Py_UCS4)(low) & 0x03FF)))
/* Split a supplementary code point into its surrogate halves.
   High half: the top bits (ch >> 10), rebased so that U+10000 maps to D800. */
#define Py_UNICODE_HIGH_SURROGATE(ch) (((ch) >> 10) + (0xD800 - (0x10000 >> 10)))
/* Low half: the bottom 10 bits placed on top of DC00. */
#define Py_UNICODE_LOW_SURROGATE(ch) (((ch) & 0x3FF) + 0xDC00)
65 
66 /* --- Unicode Type ------------------------------------------------------- */
67 
68 /* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
69    structure. state.ascii and state.compact are set, and the data
70    immediately follow the structure. utf8_length and wstr_length can be found
71    in the length field; the utf8 pointer is equal to the data pointer. */
72 typedef struct {
73     /* There are 4 forms of Unicode strings:
74 
75        - compact ascii:
76 
77          * structure = PyASCIIObject
78          * test: PyUnicode_IS_COMPACT_ASCII(op)
79          * kind = PyUnicode_1BYTE_KIND
80          * compact = 1
81          * ascii = 1
82          * ready = 1
83          * (length is the length of the utf8 and wstr strings)
84          * (data starts just after the structure)
85          * (since ASCII is decoded from UTF-8, the utf8 string are the data)
86 
87        - compact:
88 
89          * structure = PyCompactUnicodeObject
90          * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
91          * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
92            PyUnicode_4BYTE_KIND
93          * compact = 1
94          * ready = 1
95          * ascii = 0
96          * utf8 is not shared with data
97          * utf8_length = 0 if utf8 is NULL
98          * wstr is shared with data and wstr_length=length
99            if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
100            or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
101          * wstr_length = 0 if wstr is NULL
102          * (data starts just after the structure)
103 
104        - legacy string, not ready:
105 
106          * structure = PyUnicodeObject
107          * test: kind == PyUnicode_WCHAR_KIND
108          * length = 0 (use wstr_length)
109          * hash = -1
110          * kind = PyUnicode_WCHAR_KIND
111          * compact = 0
112          * ascii = 0
113          * ready = 0
114          * interned = SSTATE_NOT_INTERNED
115          * wstr is not NULL
116          * data.any is NULL
117          * utf8 is NULL
118          * utf8_length = 0
119 
120        - legacy string, ready:
121 
122          * structure = PyUnicodeObject structure
123          * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
124          * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
125            PyUnicode_4BYTE_KIND
126          * compact = 0
127          * ready = 1
128          * data.any is not NULL
129          * utf8 is shared and utf8_length = length with data.any if ascii = 1
130          * utf8_length = 0 if utf8 is NULL
131          * wstr is shared with data.any and wstr_length = length
132            if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
133            or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
134          * wstr_length = 0 if wstr is NULL
135 
136        Compact strings use only one memory block (structure + characters),
137        whereas legacy strings use one block for the structure and one block
138        for characters.
139 
140        Legacy strings are created by PyUnicode_FromUnicode() and
141        PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
142        when PyUnicode_READY() is called.
143 
144        See also _PyUnicode_CheckConsistency().
145     */
146     PyObject_HEAD
147     Py_ssize_t length;          /* Number of code points in the string */
148     Py_hash_t hash;             /* Hash value; -1 if not set */
149     struct {
150         /*
151            SSTATE_NOT_INTERNED (0)
152            SSTATE_INTERNED_MORTAL (1)
153            SSTATE_INTERNED_IMMORTAL (2)
154 
155            If interned != SSTATE_NOT_INTERNED, the two references from the
156            dictionary to this object are *not* counted in ob_refcnt.
157          */
158         unsigned int interned:2;
159         /* Character size:
160 
161            - PyUnicode_WCHAR_KIND (0):
162 
163              * character type = wchar_t (16 or 32 bits, depending on the
164                platform)
165 
166            - PyUnicode_1BYTE_KIND (1):
167 
168              * character type = Py_UCS1 (8 bits, unsigned)
169              * all characters are in the range U+0000-U+00FF (latin1)
170              * if ascii is set, all characters are in the range U+0000-U+007F
171                (ASCII), otherwise at least one character is in the range
172                U+0080-U+00FF
173 
174            - PyUnicode_2BYTE_KIND (2):
175 
176              * character type = Py_UCS2 (16 bits, unsigned)
177              * all characters are in the range U+0000-U+FFFF (BMP)
178              * at least one character is in the range U+0100-U+FFFF
179 
180            - PyUnicode_4BYTE_KIND (4):
181 
182              * character type = Py_UCS4 (32 bits, unsigned)
183              * all characters are in the range U+0000-U+10FFFF
184              * at least one character is in the range U+10000-U+10FFFF
185          */
186         unsigned int kind:3;
187         /* Compact is with respect to the allocation scheme. Compact unicode
188            objects only require one memory block while non-compact objects use
189            one block for the PyUnicodeObject struct and another for its data
190            buffer. */
191         unsigned int compact:1;
192         /* The string only contains characters in the range U+0000-U+007F (ASCII)
193            and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
194            set, use the PyASCIIObject structure. */
195         unsigned int ascii:1;
196         /* The ready flag indicates whether the object layout is initialized
197            completely. This means that this is either a compact object, or
198            the data pointer is filled out. The bit is redundant, and helps
199            to minimize the test in PyUnicode_IS_READY(). */
200         unsigned int ready:1;
201         /* Padding to ensure that PyUnicode_DATA() is always aligned to
202            4 bytes (see issue #19537 on m68k). */
203         unsigned int :24;
204     } state;
205     wchar_t *wstr;              /* wchar_t representation (null-terminated) */
206 } PyASCIIObject;
207 
208 /* Non-ASCII strings allocated through PyUnicode_New use the
209    PyCompactUnicodeObject structure. state.compact is set, and the data
210    immediately follow the structure. */
211 typedef struct {
212     PyASCIIObject _base;
213     Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
214                                  * terminating \0. */
215     char *utf8;                 /* UTF-8 representation (null-terminated) */
216     Py_ssize_t wstr_length;     /* Number of code points in wstr, possible
217                                  * surrogates count as two code points. */
218 } PyCompactUnicodeObject;
219 
220 /* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
221    PyUnicodeObject structure. The actual string data is initially in the wstr
222    block, and copied into the data block using _PyUnicode_Ready. */
223 typedef struct {
224     PyCompactUnicodeObject _base;
225     union {
226         void *any;
227         Py_UCS1 *latin1;
228         Py_UCS2 *ucs2;
229         Py_UCS4 *ucs4;
230     } data;                     /* Canonical, smallest-form Unicode buffer */
231 } PyUnicodeObject;
232 
233 PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
234     PyObject *op,
235     int check_content);
236 
237 /* Fast access macros */
238 
/* Size of the deprecated Py_UNICODE representation, in code units (a
   surrogate pair counts as 2 units).  When that representation does not
   exist yet it is computed on request through PyUnicode_AsUnicode().
   Use PyUnicode_GET_LENGTH() for the length in code points.
   op is evaluated several times. */

/* Py_DEPRECATED(3.3) */
#define PyUnicode_GET_SIZE(op)                              \
    (assert(PyUnicode_Check(op)),                           \
     !(((PyASCIIObject *)(op))->wstr) ?                     \
      ((void)PyUnicode_AsUnicode(_PyObject_CAST(op)),       \
       assert(((PyASCIIObject *)(op))->wstr),               \
       PyUnicode_WSTR_LENGTH(op)) :                         \
      PyUnicode_WSTR_LENGTH(op))

/* Same size expressed in bytes: code units times Py_UNICODE_SIZE. */
/* Py_DEPRECATED(3.3) */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
256 
/* Alias for PyUnicode_AsUnicode(): yields the wchar_t/Py_UNICODE
   representation, creating it on demand when the wstr pointer is not set.
   This macro is very inefficient now; port code to the
   PyUnicode_*BYTE_DATA() macros or to PyUnicode_WRITE() and
   PyUnicode_READ() instead.  op is evaluated several times. */

/* Py_DEPRECATED(3.3) */
#define PyUnicode_AS_UNICODE(op)                        \
    (assert(PyUnicode_Check(op)),                       \
     !(((PyASCIIObject *)(op))->wstr) ?                 \
      PyUnicode_AsUnicode(_PyObject_CAST(op)) :         \
      (((PyASCIIObject *)(op))->wstr))

/* The same buffer viewed as raw bytes. */
/* Py_DEPRECATED(3.3) */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
271 
272 
273 /* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
274 
/* Values stored in PyASCIIObject.state.interned (interning state). */
#define SSTATE_NOT_INTERNED         0
#define SSTATE_INTERNED_MORTAL      1
#define SSTATE_INTERNED_IMMORTAL    2
281 
/* Return true if the string contains only ASCII characters, or 0 if not.
   The string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must
   be ready.  In builds with assertions enabled, both the type and the
   ready state are asserted, so op is evaluated more than once.
   The argument is parenthesized before the cast so that an expression
   argument (for example `p + 1`) is cast as a whole. */
#define PyUnicode_IS_ASCII(op)                   \
    (assert(PyUnicode_Check(op)),                \
     assert(PyUnicode_IS_READY(op)),             \
     ((PyASCIIObject*)(op))->state.ascii)
289 
/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (i.e. it uses the
   PyASCIIObject structure), or 0 if not.  No type checks or Ready calls
   are performed.  op is evaluated twice.
   The argument is parenthesized before the cast so that an expression
   argument (for example `p + 1`) is cast as a whole. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (((PyASCIIObject*)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
299 
/* Character storage kinds.  The three *BYTE_KIND values are what the
   PyUnicode_KIND() macro returns. */
enum PyUnicode_Kind {
    /* Only the wstr representation exists.  This happens solely for
       strings created with a legacy API on which _PyUnicode_Ready() has
       not been called yet. */
    PyUnicode_WCHAR_KIND = 0,
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
310 
/* Typed views of the canonical buffer for direct character access: the
   pointer returned by PyUnicode_DATA() cast to Py_UCS1, Py_UCS2 or
   Py_UCS4.  Nothing is checked here; call PyUnicode_KIND() first to pick
   the macro matching the string. */

#define PyUnicode_1BYTE_DATA(op)    ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op)    ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op)    ((Py_UCS4*)PyUnicode_DATA(op))
319 
/* Read state.kind, i.e. one of the PyUnicode_*_KIND values defined above.
   The type and the ready state of op are asserted first. */
#define PyUnicode_KIND(op)                      \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->state.kind)
325 
/* Raw buffer of a compact string, as a void pointer: the data sit right
   behind the PyASCIIObject header for ASCII strings and right behind the
   PyCompactUnicodeObject header otherwise. */
#define _PyUnicode_COMPACT_DATA(op)                     \
    (!PyUnicode_IS_ASCII(op) ?                          \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)) :     \
     ((void*)((PyASCIIObject*)(op) + 1)))

/* Raw buffer of a non-compact string: the data.any pointer, which is
   asserted to be set. */
#define _PyUnicode_NONCOMPACT_DATA(op)                  \
    (assert(((PyUnicodeObject*)(op))->data.any),        \
     ((((PyUnicodeObject *)(op))->data.any)))

/* Raw buffer of any unicode object: dispatches on the compact flag after
   asserting the type of op. */
#define PyUnicode_DATA(op)                                          \
    (assert(PyUnicode_Check(op)),                                   \
     !PyUnicode_IS_COMPACT(op) ? _PyUnicode_NONCOMPACT_DATA(op) :   \
     _PyUnicode_COMPACT_DATA(op))
340 
341 /* In the access macros below, "kind" may be evaluated more than once.
342    All other macro parameters are evaluated exactly once, so it is safe
343    to put side effects into them (such as increasing the index). */
344 
/* Store one code point into the canonical representation.  No sanity
   checks are made: the macro is meant for loops, with the caller caching
   the kind and data pointer obtained from other macro calls.
   `index` is the zero-based position in the string and `value` the code
   point to store there; the value is narrowed to the character type that
   matches `kind`.  Any kind other than 1BYTE or 2BYTE is asserted to be
   PyUnicode_4BYTE_KIND. */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
367 
/* Fetch the code point at `index` from a canonical buffer of the given
   kind, widened to Py_UCS4.  No checks or ready calls are performed; any
   kind other than 1BYTE or 2BYTE is read as Py_UCS4. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) != PyUnicode_1BYTE_KIND ? \
        ((kind) != PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS4 *)(data))[(index)] : \
            ((const Py_UCS2 *)(data))[(index)] \
        ) : \
        ((const Py_UCS1 *)(data))[(index)] \
    ))
379 
/* Read the code point at `index` of a unicode object.  This is less
   efficient than PyUnicode_READ() because PyUnicode_KIND() is called, and
   possibly called twice.  Use this macro for a single read; for several
   consecutive reads, cache the kind and data pointer and use
   PyUnicode_READ() instead.  The type and ready state of the object are
   asserted first. */
#define PyUnicode_READ_CHAR(unicode, index)         \
    (assert(PyUnicode_Check(unicode)),              \
     assert(PyUnicode_IS_READY(unicode)),           \
     PyUnicode_READ(PyUnicode_KIND((unicode)),      \
                    PyUnicode_DATA((unicode)),      \
                    (index)))
395 
/* Length of the unicode string, read from the `length` field.  The caller
   must make sure the string has its canonical representation set before
   using this macro; call PyUnicode_(FAST_)Ready to ensure that. */
#define PyUnicode_GET_LENGTH(op)                    \
    (assert(PyUnicode_Check(op)),                   \
     assert(PyUnicode_IS_READY(op)),                \
     ((PyASCIIObject *)(op))->length)
403 
404 
/* Fast check to determine whether an object is ready: reads the
   state.ready bit.  Equivalent to
   PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any.
   The argument is parenthesized before the cast so that an expression
   argument (for example `p + 1`) is cast as a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
409 
/* Make sure the canonical representation of op is set.  In the best case
   (object already ready) this does less work than _PyUnicode_Ready() and
   just yields 0; otherwise it calls _PyUnicode_Ready().
   Returns 0 on success and -1 on errors. */
#define PyUnicode_READY(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     (!PyUnicode_IS_READY(op) ?                         \
      _PyUnicode_Ready(_PyObject_CAST(op)) : 0))
418 
/* Return a maximum character value suitable for creating another string
   based on op.  The value is always an approximation derived from the
   ascii flag and the kind, which is cheaper than iterating over the
   string:
     ascii -> 0x7f, 1-byte kind -> 0xff, 2-byte kind -> 0xffff,
     anything else -> 0x10ffff. */
#define PyUnicode_MAX_CHAR_VALUE(op)                                \
    (assert(PyUnicode_IS_READY(op)),                                \
     PyUnicode_IS_ASCII(op) ? (0x7f)                                \
     : PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? (0xffU)         \
     : PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? (0xffffU)       \
     : (0x10ffffU))
431 
432 Py_DEPRECATED(3.3)
_PyUnicode_get_wstr_length(PyObject * op)433 static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {
434     return PyUnicode_IS_COMPACT_ASCII(op) ?
435             ((PyASCIIObject*)op)->length :
436             ((PyCompactUnicodeObject*)op)->wstr_length;
437 }
438 #define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)
439 
440 /* === Public API ========================================================= */
441 
442 /* --- Plain Py_UNICODE --------------------------------------------------- */
443 
444 /* With PEP 393, this is the recommended way to allocate a new unicode object.
445    This function will allocate the object and its buffer in a single memory
446    block.  Objects created using this function are not resizable. */
447 PyAPI_FUNC(PyObject*) PyUnicode_New(
448     Py_ssize_t size,            /* Number of code points in the new string */
449     Py_UCS4 maxchar             /* maximum code point value in the string */
450     );
451 
452 /* Initializes the canonical string representation from the deprecated
453    wstr/Py_UNICODE representation. This function is used to convert Unicode
454    objects which were created using the old API to the new flexible format
455    introduced with PEP 393.
456 
457    Don't call this function directly, use the public PyUnicode_READY() macro
458    instead. */
459 PyAPI_FUNC(int) _PyUnicode_Ready(
460     PyObject *unicode           /* Unicode object */
461     );
462 
463 /* Get a copy of a Unicode string. */
464 PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
465     PyObject *unicode
466     );
467 
468 /* Copy character from one unicode object into another, this function performs
469    character conversion when necessary and falls back to memcpy() if possible.
470 
471    Fail if to is too small (smaller than *how_many* or smaller than
472    len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
473    kind(to), or if *to* has more than 1 reference.
474 
475    Return the number of written character, or return -1 and raise an exception
476    on error.
477 
478    Pseudo-code:
479 
480        how_many = min(how_many, len(from) - from_start)
481        to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
482        return how_many
483 
484    Note: The function doesn't write a terminating null character.
485    */
486 PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
487     PyObject *to,
488     Py_ssize_t to_start,
489     PyObject *from,
490     Py_ssize_t from_start,
491     Py_ssize_t how_many
492     );
493 
494 /* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
495    may crash if parameters are invalid (e.g. if the output string
496    is too short). */
497 PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
498     PyObject *to,
499     Py_ssize_t to_start,
500     PyObject *from,
501     Py_ssize_t from_start,
502     Py_ssize_t how_many
503     );
504 
505 /* Fill a string with a character: write fill_char into
506    unicode[start:start+length].
507 
508    Fail if fill_char is bigger than the string maximum character, or if the
509    string has more than 1 reference.
510 
511    Return the number of written character, or return -1 and raise an exception
512    on error. */
513 PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
514     PyObject *unicode,
515     Py_ssize_t start,
516     Py_ssize_t length,
517     Py_UCS4 fill_char
518     );
519 
520 /* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
521    if parameters are invalid (e.g. if length is longer than the string). */
522 PyAPI_FUNC(void) _PyUnicode_FastFill(
523     PyObject *unicode,
524     Py_ssize_t start,
525     Py_ssize_t length,
526     Py_UCS4 fill_char
527     );
528 
529 /* Create a Unicode Object from the Py_UNICODE buffer u of the given
530    size.
531 
532    u may be NULL which causes the contents to be undefined. It is the
533    user's responsibility to fill in the needed data afterwards. Note
534    that modifying the Unicode object contents after construction is
535    only allowed if u was set to NULL.
536 
537    The buffer is copied into the new object. */
538 Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
539     const Py_UNICODE *u,        /* Unicode buffer */
540     Py_ssize_t size             /* size of buffer */
541     );
542 
543 /* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
544    Scan the string to find the maximum character. */
545 PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
546     int kind,
547     const void *buffer,
548     Py_ssize_t size);
549 
550 /* Create a new string from a buffer of ASCII characters.
551    WARNING: Don't check if the string contains any non-ASCII character. */
552 PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
553     const char *buffer,
554     Py_ssize_t size);
555 
556 /* Compute the maximum character of the substring unicode[start:end].
557    Return 127 for an empty string. */
558 PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
559     PyObject *unicode,
560     Py_ssize_t start,
561     Py_ssize_t end);
562 
563 /* Return a read-only pointer to the Unicode object's internal
564    Py_UNICODE buffer.
565    If the wchar_t/Py_UNICODE representation is not yet available, this
566    function will calculate it. */
567 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
568     PyObject *unicode           /* Unicode object */
569     );
570 
571 /* Similar to PyUnicode_AsUnicode(), but raises a ValueError if the string
572    contains null characters. */
573 PyAPI_FUNC(const Py_UNICODE *) _PyUnicode_AsUnicode(
574     PyObject *unicode           /* Unicode object */
575     );
576 
577 /* Return a read-only pointer to the Unicode object's internal
578    Py_UNICODE buffer and save the length at size.
579    If the wchar_t/Py_UNICODE representation is not yet available, this
580    function will calculate it. */
581 
582 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
583     PyObject *unicode,          /* Unicode object */
584     Py_ssize_t *size            /* location where to save the length */
585     );
586 
587 
588 /* --- _PyUnicodeWriter API ----------------------------------------------- */
589 
590 typedef struct {
591     PyObject *buffer;
592     void *data;
593     enum PyUnicode_Kind kind;
594     Py_UCS4 maxchar;
595     Py_ssize_t size;
596     Py_ssize_t pos;
597 
598     /* minimum number of allocated characters (default: 0) */
599     Py_ssize_t min_length;
600 
601     /* minimum character (default: 127, ASCII) */
602     Py_UCS4 min_char;
603 
604     /* If non-zero, overallocate the buffer (default: 0). */
605     unsigned char overallocate;
606 
607     /* If readonly is 1, buffer is a shared string (cannot be modified)
608        and size is set to 0. */
609     unsigned char readonly;
610 } _PyUnicodeWriter ;
611 
612 /* Initialize a Unicode writer.
613  *
614  * By default, the minimum buffer size is 0 character and overallocation is
615  * disabled. Set min_length, min_char and overallocate attributes to control
616  * the allocation of the buffer. */
617 PyAPI_FUNC(void)
618 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
619 
/* Prepare the buffer to receive LENGTH more characters whose maximum
   character is MAXCHAR.

   Nothing is done (and 0 is produced) when the writer already accepts
   MAXCHAR and has at least LENGTH free slots, or when LENGTH is 0; any
   other case is handed to _PyUnicodeWriter_PrepareInternal().

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
    ((((MAXCHAR) <= (WRITER)->maxchar                                 \
       && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                 \
      || (LENGTH) == 0)                                               \
     ? 0                                                              \
     : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR)))
631 
632 /* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
633    instead. */
634 PyAPI_FUNC(int)
635 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
636                                  Py_ssize_t length, Py_UCS4 maxchar);
637 
/* Prepare the buffer so that its kind is at least KIND.  For example,
   kind=PyUnicode_2BYTE_KIND ensures that the writer will support
   characters in the range U+0000-U+FFFF.  KIND is asserted not to be
   PyUnicode_WCHAR_KIND; when the writer's kind is already large enough
   nothing is done.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
    (assert((KIND) != PyUnicode_WCHAR_KIND),                          \
     (KIND) > (WRITER)->kind                                          \
     ? _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND))         \
     : 0)
648 
649 /* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
650    macro instead. */
651 PyAPI_FUNC(int)
652 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
653                                      enum PyUnicode_Kind kind);
654 
655 /* Append a Unicode character.
656    Return 0 on success, raise an exception and return -1 on error. */
657 PyAPI_FUNC(int)
658 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
659     Py_UCS4 ch
660     );
661 
662 /* Append a Unicode string.
663    Return 0 on success, raise an exception and return -1 on error. */
664 PyAPI_FUNC(int)
665 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
666     PyObject *str               /* Unicode string */
667     );
668 
669 /* Append a substring of a Unicode string.
670    Return 0 on success, raise an exception and return -1 on error. */
671 PyAPI_FUNC(int)
672 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
673     PyObject *str,              /* Unicode string */
674     Py_ssize_t start,
675     Py_ssize_t end
676     );
677 
678 /* Append an ASCII-encoded byte string.
679    Return 0 on success, raise an exception and return -1 on error. */
680 PyAPI_FUNC(int)
681 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
682     const char *str,           /* ASCII-encoded byte string */
683     Py_ssize_t len             /* number of bytes, or -1 if unknown */
684     );
685 
686 /* Append a latin1-encoded byte string.
687    Return 0 on success, raise an exception and return -1 on error. */
688 PyAPI_FUNC(int)
689 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
690     const char *str,           /* latin1-encoded byte string */
691     Py_ssize_t len             /* length in bytes */
692     );
693 
694 /* Get the value of the writer as a Unicode string. Clear the
695    buffer of the writer. Raise an exception and return NULL
696    on error. */
697 PyAPI_FUNC(PyObject *)
698 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
699 
700 /* Deallocate memory of a writer (clear its internal buffer). */
701 PyAPI_FUNC(void)
702 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
703 
704 
705 /* Format the object based on the format_spec, as defined in PEP 3101
706    (Advanced String Formatting). */
707 PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
708     _PyUnicodeWriter *writer,
709     PyObject *obj,
710     PyObject *format_spec,
711     Py_ssize_t start,
712     Py_ssize_t end);
713 
714 /* --- Manage the default encoding ---------------------------------------- */
715 
716 /* Returns a pointer to the default encoding (UTF-8) of the
717    Unicode object unicode.
718 
719    Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
720    in the unicodeobject.
721 
722    _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
723    support the previous internal function with the same behaviour.
724 
725    Use of this API is DEPRECATED since no size information can be
726    extracted from the returned data.
727 */
728 
729 PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
730 
731 #define _PyUnicode_AsString PyUnicode_AsUTF8
732 
733 /* --- UTF-7 Codecs ------------------------------------------------------- */
734 
735 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
736     PyObject *unicode,          /* Unicode object */
737     int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
738     int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
739     const char *errors          /* error handling */
740     );
741 
742 /* --- UTF-8 Codecs ------------------------------------------------------- */
743 
744 PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
745     PyObject *unicode,
746     const char *errors);
747 
748 /* --- UTF-32 Codecs ------------------------------------------------------ */
749 
750 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
751     PyObject *object,           /* Unicode object */
752     const char *errors,         /* error handling */
753     int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
754     );
755 
756 /* --- UTF-16 Codecs ------------------------------------------------------ */
757 
758 /* Returns a Python string object holding the UTF-16 encoded value of
759    the Unicode data.
760 
761    If byteorder is not 0, output is written according to the following
762    byte order:
763 
764    byteorder == -1: little endian
765    byteorder == 0:  native byte order (writes a BOM mark)
766    byteorder == 1:  big endian
767 
768    If byteorder is 0, the output string will always start with the
769    Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
770    prepended.
771 */
772 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
773     PyObject* unicode,          /* Unicode object */
774     const char *errors,         /* error handling */
775     int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
776     );
777 
778 /* --- Unicode-Escape Codecs ---------------------------------------------- */
779 
780 /* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding. */
781 PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
782         const char *string,     /* Unicode-Escape encoded string */
783         Py_ssize_t length,      /* size of string */
784         const char *errors,     /* error handling */
785         Py_ssize_t *consumed    /* bytes consumed */
786 );
787 /* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
788    chars. */
789 PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
790         const char *string,     /* Unicode-Escape encoded string */
791         Py_ssize_t length,      /* size of string */
792         const char *errors,     /* error handling */
793         Py_ssize_t *consumed,   /* bytes consumed */
794         const char **first_invalid_escape  /* on return, points to first
795                                               invalid escaped char in
796                                               string. */
797 );
798 
/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
800 
801 /* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding. */
802 PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful(
803         const char *string,     /* Unicode-Escape encoded string */
804         Py_ssize_t length,      /* size of string */
805         const char *errors,     /* error handling */
806         Py_ssize_t *consumed    /* bytes consumed */
807 );
808 
809 /* --- Latin-1 Codecs ----------------------------------------------------- */
810 
811 PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
812     PyObject* unicode,
813     const char* errors);
814 
815 /* --- ASCII Codecs ------------------------------------------------------- */
816 
817 PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
818     PyObject* unicode,
819     const char* errors);
820 
821 /* --- Character Map Codecs ----------------------------------------------- */
822 
823 /* Translate an Unicode object by applying a character mapping table to
824    it and return the resulting Unicode object.
825 
826    The mapping table must map Unicode ordinal integers to Unicode strings,
827    Unicode ordinal integers or None (causing deletion of the character).
828 
829    Mapping tables may be dictionaries or sequences. Unmapped character
830    ordinals (ones which cause a LookupError) are left untouched and
831    are copied as-is.
832 */
833 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
834     PyObject *unicode,          /* Unicode object */
835     PyObject *mapping,          /* encoding mapping */
836     const char *errors          /* error handling */
837     );
838 
839 /* --- Decimal Encoder ---------------------------------------------------- */
840 
841 /* Coverts a Unicode object holding a decimal value to an ASCII string
842    for using in int, float and complex parsers.
843    Transforms code points that have decimal digit property to the
844    corresponding ASCII digit code points.  Transforms spaces to ASCII.
845    Transforms code points starting from the first non-ASCII code point that
846    is neither a decimal digit nor a space to the end into '?'. */
847 
848 PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
849     PyObject *unicode           /* Unicode object */
850     );
851 
852 /* --- Methods & Slots ---------------------------------------------------- */
853 
854 PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray(
855     PyObject *separator,
856     PyObject *const *items,
857     Py_ssize_t seqlen
858     );
859 
860 /* Test whether a unicode is equal to ASCII identifier.  Return 1 if true,
861    0 otherwise.  The right argument must be ASCII identifier.
862    Any error occurs inside will be cleared before return. */
863 PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId(
864     PyObject *left,             /* Left string */
865     _Py_Identifier *right       /* Right identifier */
866     );
867 
868 /* Test whether a unicode is equal to ASCII string.  Return 1 if true,
869    0 otherwise.  The right argument must be ASCII-encoded string.
870    Any error occurs inside will be cleared before return. */
871 PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
872     PyObject *left,
873     const char *right           /* ASCII-encoded string */
874     );
875 
876 /* Externally visible for str.strip(unicode) */
877 PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
878     PyObject *self,
879     int striptype,
880     PyObject *sepobj
881     );
882 
883 /* Using explicit passed-in values, insert the thousands grouping
884    into the string pointed to by buffer.  For the argument descriptions,
885    see Objects/stringlib/localeutil.h */
886 PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
887     _PyUnicodeWriter *writer,
888     Py_ssize_t n_buffer,
889     PyObject *digits,
890     Py_ssize_t d_pos,
891     Py_ssize_t n_digits,
892     Py_ssize_t min_width,
893     const char *grouping,
894     PyObject *thousands_sep,
895     Py_UCS4 *maxchar);
896 
897 /* === Characters Type APIs =============================================== */
898 
/* Helper array used by Py_UNICODE_ISSPACE(): the macro indexes it
   directly with the character when the character is below 128. */

PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
902 
903 /* These should not be used directly. Use the Py_UNICODE_IS* and
904    Py_UNICODE_TO* macros instead.
905 
906    These APIs are implemented in Objects/unicodectype.c.
907 
908 */
909 
910 PyAPI_FUNC(int) _PyUnicode_IsLowercase(
911     Py_UCS4 ch       /* Unicode character */
912     );
913 
914 PyAPI_FUNC(int) _PyUnicode_IsUppercase(
915     Py_UCS4 ch       /* Unicode character */
916     );
917 
918 PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
919     Py_UCS4 ch       /* Unicode character */
920     );
921 
922 PyAPI_FUNC(int) _PyUnicode_IsXidStart(
923     Py_UCS4 ch       /* Unicode character */
924     );
925 
926 PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
927     Py_UCS4 ch       /* Unicode character */
928     );
929 
930 PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
931     const Py_UCS4 ch         /* Unicode character */
932     );
933 
934 PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
935     const Py_UCS4 ch         /* Unicode character */
936     );
937 
938 /* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
939     Py_UCS4 ch       /* Unicode character */
940     );
941 
942 /* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
943     Py_UCS4 ch       /* Unicode character */
944     );
945 
946 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
947     Py_UCS4 ch       /* Unicode character */
948     );
949 
950 PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
951     Py_UCS4 ch,       /* Unicode character */
952     Py_UCS4 *res
953     );
954 
955 PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
956     Py_UCS4 ch,       /* Unicode character */
957     Py_UCS4 *res
958     );
959 
960 PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
961     Py_UCS4 ch,       /* Unicode character */
962     Py_UCS4 *res
963     );
964 
965 PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
966     Py_UCS4 ch,       /* Unicode character */
967     Py_UCS4 *res
968     );
969 
970 PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
971     Py_UCS4 ch         /* Unicode character */
972     );
973 
974 PyAPI_FUNC(int) _PyUnicode_IsCased(
975     Py_UCS4 ch         /* Unicode character */
976     );
977 
978 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
979     Py_UCS4 ch       /* Unicode character */
980     );
981 
982 PyAPI_FUNC(int) _PyUnicode_ToDigit(
983     Py_UCS4 ch       /* Unicode character */
984     );
985 
986 PyAPI_FUNC(double) _PyUnicode_ToNumeric(
987     Py_UCS4 ch       /* Unicode character */
988     );
989 
990 PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
991     Py_UCS4 ch       /* Unicode character */
992     );
993 
994 PyAPI_FUNC(int) _PyUnicode_IsDigit(
995     Py_UCS4 ch       /* Unicode character */
996     );
997 
998 PyAPI_FUNC(int) _PyUnicode_IsNumeric(
999     Py_UCS4 ch       /* Unicode character */
1000     );
1001 
1002 PyAPI_FUNC(int) _PyUnicode_IsPrintable(
1003     Py_UCS4 ch       /* Unicode character */
1004     );
1005 
1006 PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1007     Py_UCS4 ch       /* Unicode character */
1008     );
1009 
/* Takes an object and three ints, returns an object.

   NOTE(review): the parameters are unnamed in this declaration;
   presumably this formats an integer object for %-style formatting --
   look up the meaning of the three int arguments at the definition. */
PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
1011 
/* Return an interned Unicode object for an Identifier; may fail if
   there is no memory. */
PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
1014 
/* Fast equality check for use when both inputs are known to be exact
   unicode types and their hash values are equal (i.e. a match is very
   probable). */
PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
1018 
/* Equality check of two objects. Returns -1 on failure.

   NOTE(review): the values returned for "equal" and "not equal" are
   not stated here; presumably 1 and 0 -- confirm at the definition. */
PyAPI_FUNC(int) _PyUnicode_Equal(PyObject *, PyObject *);
1021 
/* Takes an object and an untyped output pointer, returns an int.

   NOTE(review): the signature matches an "O&"-style argument
   converter; presumably it produces a wide-character string from a
   str object -- confirm at the definition, including who owns the
   result. */
PyAPI_FUNC(int) _PyUnicode_WideCharString_Converter(PyObject *, void *);
/* Same signature as _PyUnicode_WideCharString_Converter.

   NOTE(review): "Opt" presumably means the argument is optional
   (e.g. None is accepted) -- confirm at the definition. */
PyAPI_FUNC(int) _PyUnicode_WideCharString_Opt_Converter(PyObject *, void *);
1024 
/* Takes an object and returns a Py_ssize_t.

   NOTE(review): presumably returns how many leading characters of the
   string form a valid identifier -- confirm at the definition,
   including the value returned on error. */
PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);
1026