1 /*
2 punycode-sample.c 2.0.0 (2004-Mar-21-Sun)
3 http://www.nicemice.net/idn/
4 Adam M. Costello
5 http://www.nicemice.net/amc/
6 
7 This is ANSI C code (C89) implementing Punycode 1.0.x.
8 
9 This single file contains three sections (an interface, an
10 implementation, and a wrapper for testing) that would normally belong
11 in three separate files (punycode.h, punycode.c, punycode-test.c), but
12 here they are bundled into one file (punycode-sample.c) for convenient
13 testing.  Anyone wishing to reuse this code will probably want to split
14 it apart.
15 
16 */
17 
18 /************************************************************/
19 /* Public interface (would normally go in its own .h file): */
20 
21 #include <limits.h>
22 #include <stddef.h>
23 
24 enum punycode_status { /* Status codes returned by punycode_encode() and punycode_decode(). */
25   punycode_success    = 0, /* Conversion succeeded; outputs are valid.             */
26   punycode_bad_input  = 1, /* Input is invalid (never returned by the encoder).    */
27   punycode_big_output = 2, /* Output would exceed the space provided.              */
28   punycode_overflow   = 3  /* Wider integers needed to process input.              */
29 };
30 
31 /* punycode_uint needs to be unsigned and needs to be */
32 /* at least 26 bits wide.  The particular type can be */
33 /* specified by defining PUNYCODE_UINT, otherwise a   */
34 /* suitable type will be chosen automatically.        */
35 
36 #ifdef PUNYCODE_UINT /* A caller-specified type takes precedence; it must be unsigned and >= 26 bits. */
37   typedef PUNYCODE_UINT punycode_uint;
38 #elif UINT_MAX >= (1 << 26) - 1 /* unsigned int can hold every 26-bit value, so use it. */
39   typedef unsigned int punycode_uint;
40 #else /* unsigned int is too narrow; unsigned long is at least 32 bits in ISO C. */
41   typedef unsigned long punycode_uint;
42 #endif
43 
44 enum punycode_status punycode_encode( /* Converts a sequence of code points to Punycode. */
45   size_t,                 /* input_length:  number of code points in input (and of flags in case_flags) */
46   const punycode_uint [], /* input:         array of code points to encode                              */
47   const unsigned char [], /* case_flags:    null pointer, or booleans parallel to input                 */
48   size_t *,               /* output_length: in = capacity of output; out (on success) = count written   */
49   char []                 /* output:        receives ASCII code points; not null-terminated             */
50 );
51 
52 /*
53     punycode_encode() converts a sequence of code points (presumed to be
54     Unicode code points) to Punycode.
55 
56     Input arguments (to be supplied by the caller):
57 
58         input_length
59             The number of code points in the input array and the number
60             of flags in the case_flags array.
61 
62         input
63             An array of code points.  They are presumed to be Unicode
64             code points, but that is not strictly necessary.  The
65             array contains code points, not code units.  UTF-16 uses
66             code units D800 through DFFF to refer to code points
67             10000..10FFFF.  The code points D800..DFFF do not occur in
68             any valid Unicode string.  The code points that can occur in
69             Unicode strings (0..D7FF and E000..10FFFF) are also called
70             Unicode scalar values.
71 
72         case_flags
73             A null pointer or an array of boolean values parallel to
74             the input array.  Nonzero (true, flagged) suggests that the
75             corresponding Unicode character be forced to uppercase after
76             being decoded (if possible), and zero (false, unflagged)
77             suggests that it be forced to lowercase (if possible).
78             ASCII code points (0..7F) are encoded literally, except that
79             ASCII letters are forced to uppercase or lowercase according
80             to the corresponding case flags.  If case_flags is a null
81             pointer then ASCII letters are left as they are, and other
82             code points are treated as unflagged.
83 
84     Output arguments (to be filled in by the function):
85 
86         output
87             An array of ASCII code points.  It is *not* null-terminated;
88             it will contain zeros if and only if the input contains
89             zeros.  (Of course the caller can leave room for a
90             terminator and add one if needed.)
91 
92     Input/output arguments (to be supplied by the caller and overwritten
93     by the function):
94 
95         output_length
96             The caller passes in the maximum number of ASCII code points
97             that it can receive.  On successful return it will contain
98             the number of ASCII code points actually output.
99 
100     Return value:
101 
102         Can be any of the punycode_status values defined above except
103         punycode_bad_input.  If not punycode_success, then output_length
104         and output might contain garbage.
105 */
106 
107 enum punycode_status punycode_decode( /* Converts Punycode to a sequence of code points. */
108   size_t,           /* input_length:  number of ASCII code points in input                          */
109   const char [],    /* input:         array of ASCII code points (0..7F) to decode                  */
110   size_t *,         /* output_length: in = capacity of output; out (on success) = count written     */
111   punycode_uint [], /* output:        receives the decoded code points                              */
112   unsigned char []  /* case_flags:    null pointer, or receives booleans parallel to output         */
113 );
114 
115 /*
116     punycode_decode() converts Punycode to a sequence of code points
117     (presumed to be Unicode code points).
118 
119     Input arguments (to be supplied by the caller):
120 
121         input_length
122             The number of ASCII code points in the input array.
123 
124         input
125             An array of ASCII code points (0..7F).
126 
127     Output arguments (to be filled in by the function):
128 
129         output
130             An array of code points like the input argument of
131             punycode_encode() (see above).
132 
133         case_flags
134             A null pointer (if the flags are not needed by the caller)
135             or an array of boolean values parallel to the output array.
136             Nonzero (true, flagged) suggests that the corresponding
137             Unicode character be forced to uppercase by the caller (if
138             possible), and zero (false, unflagged) suggests that it
139             be forced to lowercase (if possible).  ASCII code points
140             (0..7F) are output already in the proper case, but their
141             flags will be set appropriately so that applying the flags
142             would be harmless.
143 
144     Input/output arguments (to be supplied by the caller and overwritten
145     by the function):
146 
147         output_length
148             The caller passes in the maximum number of code points
149             that it can receive into the output array (which is also
150             the maximum number of flags that it can receive into the
151             case_flags array, if case_flags is not a null pointer).  On
152             successful return it will contain the number of code points
153             actually output (which is also the number of flags actually
154             output, if case_flags is not a null pointer).  The decoder
155             will never need to output more code points than the number
156             of ASCII code points in the input, because of the way the
157             encoding is defined.  The number of code points output
158             cannot exceed the maximum possible value of a punycode_uint,
159             even if the supplied output_length is greater than that.
160 
161     Return value:
162 
163         Can be any of the punycode_status values defined above.  If not
164         punycode_success, then output_length, output, and case_flags
165         might contain garbage.
166 */
167 
168