/*
punycode-sample.c 2.0.0 (2004-Mar-21-Sun)
http://www.nicemice.net/idn/
Adam M. Costello
http://www.nicemice.net/amc/

This is ANSI C code (C89) implementing Punycode 1.0.x.

This single file contains three sections (an interface, an
implementation, and a wrapper for testing) that would normally belong
in three separate files (punycode.h, punycode.c, punycode-test.c), but
here they are bundled into one file (punycode-sample.c) for convenient
testing.  Anyone wishing to reuse this code will probably want to split
it apart.
*/

/************************************************************/
/* Public interface (would normally go in its own .h file): */

#include <limits.h>
#include <stddef.h>

/* Status codes returned by punycode_encode() and punycode_decode(). */
/* punycode_success is zero; every other value reports a failure.    */

enum punycode_status {
  punycode_success = 0,
  punycode_bad_input = 1,   /* Input is invalid.                       */
  punycode_big_output = 2,  /* Output would exceed the space provided. */
  punycode_overflow = 3     /* Wider integers needed to process input. */
};

/* punycode_uint needs to be unsigned and needs to be */
/* at least 26 bits wide.  The particular type can be */
/* specified by defining PUNYCODE_UINT, otherwise a   */
/* suitable type will be chosen automatically.        */

/* The automatic choice is unsigned int when UINT_MAX shows it can */
/* hold 26 bits, and unsigned long otherwise.                      */

#ifdef PUNYCODE_UINT
  typedef PUNYCODE_UINT punycode_uint;
#elif UINT_MAX >= (1 << 26) - 1
  typedef unsigned int punycode_uint;
#else
  typedef unsigned long punycode_uint;
#endif

enum punycode_status punycode_encode(
  size_t input_length,
  const punycode_uint input[],
  const unsigned char case_flags[],
  size_t *output_length,
  char output[]
);

/*
    punycode_encode() converts a sequence of code points (presumed to be
    Unicode code points) to Punycode.

    Input arguments (to be supplied by the caller):

        input_length
            The number of code points in the input array and the number
            of flags in the case_flags array.

        input
            An array of code points.  They are presumed to be Unicode
            code points, but that is not strictly necessary.  The
            array contains code points, not code units.  UTF-16 uses
            code units D800 through DFFF to refer to code points
            10000..10FFFF.  The code points D800..DFFF do not occur in
            any valid Unicode string.  The code points that can occur in
            Unicode strings (0..D7FF and E000..10FFFF) are also called
            Unicode scalar values.

        case_flags
            A null pointer or an array of boolean values parallel to
            the input array.  Nonzero (true, flagged) suggests that the
            corresponding Unicode character be forced to uppercase after
            being decoded (if possible), and zero (false, unflagged)
            suggests that it be forced to lowercase (if possible).
            ASCII code points (0..7F) are encoded literally, except that
            ASCII letters are forced to uppercase or lowercase according
            to the corresponding case flags.  If case_flags is a null
            pointer then ASCII letters are left as they are, and other
            code points are treated as unflagged.

    Output arguments (to be filled in by the function):

        output
            An array of ASCII code points.  It is *not* null-terminated;
            it will contain zeros if and only if the input contains
            zeros.  (Of course the caller can leave room for a
            terminator and add one if needed.)

    Input/output arguments (to be supplied by the caller and overwritten
    by the function):

        output_length
            The caller passes in the maximum number of ASCII code points
            that it can receive.  On successful return it will contain
            the number of ASCII code points actually output.

    Return value:

        Can be any of the punycode_status values defined above except
        punycode_bad_input.  If not punycode_success, then output_length
        and output might contain garbage.
*/

enum punycode_status punycode_decode(
  size_t input_length,
  const char input[],
  size_t *output_length,
  punycode_uint output[],
  unsigned char case_flags[]
);

/*
    punycode_decode() converts Punycode to a sequence of code points
    (presumed to be Unicode code points).

    Input arguments (to be supplied by the caller):

        input_length
            The number of ASCII code points in the input array.

        input
            An array of ASCII code points (0..7F).

    Output arguments (to be filled in by the function):

        output
            An array of code points like the input argument of
            punycode_encode() (see above).

        case_flags
            A null pointer (if the flags are not needed by the caller)
            or an array of boolean values parallel to the output array.
            Nonzero (true, flagged) suggests that the corresponding
            Unicode character be forced to uppercase by the caller (if
            possible), and zero (false, unflagged) suggests that it
            be forced to lowercase (if possible).  ASCII code points
            (0..7F) are output already in the proper case, but their
            flags will be set appropriately so that applying the flags
            would be harmless.

    Input/output arguments (to be supplied by the caller and overwritten
    by the function):

        output_length
            The caller passes in the maximum number of code points
            that it can receive into the output array (which is also
            the maximum number of flags that it can receive into the
            case_flags array, if case_flags is not a null pointer).  On
            successful return it will contain the number of code points
            actually output (which is also the number of flags actually
            output, if case_flags is not a null pointer).  The decoder
            will never need to output more code points than the number
            of ASCII code points in the input, because of the way the
            encoding is defined.  The number of code points output
            cannot exceed the maximum possible value of a punycode_uint,
            even if the supplied output_length is greater than that.

    Return value:

        Can be any of the punycode_status values defined above.  If not
        punycode_success, then output_length, output, and case_flags
        might contain garbage.
*/
