1 // Example use of charset.hh in ../include/charset.hh.
2 //
3 // To compile, you need to make charset.hh accessible on the include path.
4 // Also, charset.hh includes my iconv wrapper, which is in ../include/Incover.hh,
5 // and depends on other things in that directory; those dependencies could be
6 // made to go away.
7
8
9 // This source file should be viewed using a UTF8 editor, and its output when run should be
10 // viewed on a UTF8 terminal.
11
12
13 #include "charset.hh"
14
15 #include <iostream>
16 #include <algorithm>
17 #include <list>
18
19 using namespace pbe;
20 using namespace std;
21
22
compile_time_tagged_strings_example()23 void compile_time_tagged_strings_example()
24 {
25 // This example declares strings with compile-time-fixed character sets, converts
26 // them to other compile-time-fixed character sets, combines them, and checks for
27 // consistency:
28
29 cout << "\ncompile_time_tagged_strings_example:\n";
30
31 utf8_string french = "Le traité simplifié prêt à être soumis "
32 "à l'approbation des gouvernements";
33 latin1_string french_fixed = french.recode<latin1>();
34
35 utf8_string icelandic = "Smjörið er brætt og hveitið smátt og smátt hrært út í það";
36 latin1_string icelandic_fixed = icelandic.recode<latin1>();
37
38 utf8_string all = french + icelandic;
39
40 latin1_string all_fixed = french_fixed + icelandic_fixed;
41
42 if ((all.recode<latin1>() == all_fixed)
43 && (all == all_fixed.recode<utf8>())) {
44 cout << "Pass, both strings are '" << all << "'\n";
45 }
46 }
47
48
utf8_const_iterator_example()49 void utf8_const_iterator_example()
50 {
51 // This example shows how a string with a variable-width
52 // character set can be iterated over character-at-a-time
53 // or "unit"-at-a-time.
54
55 cout << "\nutf8_const_iterator_example:\n";
56
57 utf8_string s = "Théâtre"; // My editor stores UTF8.
58
59 // Iterate "unit" (byte) at a time:
60 cout << "Here are the bytes of '" << s << "': " << hex;
61 for (utf8_string::const_iterator i = s.begin();
62 i != s.end(); ++i) {
63 char8_t c = *i;
64 cout << static_cast<unsigned int>(static_cast<uint8_t>(c)) << " ";
65 }
66
67 // Iterate character at a time:
68 cout << "\nHere are the characters of '" << s << "': ";
69 for (utf8_string::const_character_iterator i = s.begin();
70 i != utf8_string::const_character_iterator( s.end() ); ++i) {
71 utf8_char_t c = *i; // A 32-bit decoded Unicode character
72 cout << static_cast<unsigned int>(c) << " ";
73 }
74 cout << dec << "\n";
75 }
76
77
utf8_output_iterator_example()78 void utf8_output_iterator_example()
79 {
80 // This example shows how a string with a variable-width
81 // character set can be appended to using push_back and
82 // an output iterator.
83
84 cout << "\nutf8_output_iterator_example:\n";
85
86 utf8_string s;
87
88 for (utf8_char_t c=64; c<96; ++c) {
89 s.push_back(c);
90 }
91
92 utf8_string::character_output_iterator i(s);
93
94 for (utf8_char_t c=150; c<200; ++c) {
95 *i++ = c;
96 // s.push_back(c);
97 }
98
99 cout << "Unicode characters 64 to 95 and 150 to 199:\n"
100 << s << "\n";
101 }
102
103
utf8_word_split_example()104 void utf8_word_split_example()
105 {
106 // This example demonstrates a case where a "unit" rather than a character iterator for a
107 // UTF8 string is useful: because bytes < 128 can only ever represent single characters in
108 // UTF8, we can treat a UTF8 string as a sequence of bytes when spliting at spaces.
109
110 cout << "\nutf8_word_split_example:\n";
111
112 utf8_string s = "Yo también quemo la Corona española";
113 utf8_string::const_iterator i = s.begin();
114 utf8_string::const_iterator e = s.end();
115 utf8_string::const_iterator j;
116 do {
117 j = find(i,e,' ');
118 utf8_string word(i,j);
119 cout << word << "\n";
120 i = j+1;
121 } while (j != e);
122 }
123
124
ucs4_line_wrap_example()125 void ucs4_line_wrap_example()
126 {
127 // Sometimes a random-access character iterator is needed, but an iso_8859 or similar byte
128 // character set can't be used because the characters in the content are not restricted.
129 // In this case, ucs4 is normally the best choice - though its requirement for 4 bytes per
130 // character may be considered a disadvantage in memory-limited applications.
131 // This example uses random access to break a string into lines of <=40 characters each.
132
133 cout << "\nucs4_line_wrap_example:\n";
134
135 utf8_string text_var = "Партия Единая Россия отказалась от формирования первой "
136 "тройки федерального списка - его возглавил только президент "
137 "Владимир Путин. Такое решение было принято на съезде Единой "
138 "России во вторник. Накануне президент России дал согласие "
139 "возглавить список Единой России на выборах в Госдуму.";
140
141 ucs4_string text_fixed = text_var.recode<ucs4>();
142
143 for (unsigned int i=39; i<text_fixed.length(); i+=40) {
144 while (text_fixed[i]!=' ') {
145 --i;
146 }
147 text_fixed[i] = '\n';
148 }
149
150 cout << text_fixed.recode<utf8>() << "\n";
151 }
152
153
154 // This example shows how a library-user can make a new character set available.
155 // The example is the KOI8 character set, a fixed-width byte character set containing
156 // cyrillic and latin characters.
157
158 ////// This section needs some attention from a preprocessor expert; I want to use
159 ////// a counter of some sort to allocate new charset_t values with a macro:
160 ////// PBE_DEFINE_CHARSET(koi8);
161 ////// But I can't see a good way to do it. For the time being, I'll choose a value
162 ////// manually:
163 const charset_t koi8 = static_cast<charset_t>(25);
164
165 // Define charset_traits for KOI8:
166 namespace pbe {
167 template <>
168 struct charset_traits<koi8> {
169 typedef char8_t unit_t;
170 typedef char8_t char_t;
171 };
172 };
173 typedef tagged_string<koi8> koi8_string;
174
user_defined_charset_example()175 void user_defined_charset_example()
176 {
177 charset_names[koi8] = "koi8";
178
179 cout << "\nuser_defined_charset_example:\n";
180
181 // We'll convert a string back and forth between utf8 and koi8:
182 utf8_string u = "Код Обмена Информацией, 8 бит";
183 koi8_string k = u.recode<koi8>();
184 utf8_string u2 = k.recode<utf8>();
185
186 // KOI8 is a more compact encoiding than UTF8 for cyrillic:
187 cout << "Length of UTF8 string = " << u2.length()
188 << ", length of KOI8 string = " << k.length() << "\n";
189 }
190
191
runtime_tagged_example()192 void runtime_tagged_example()
193 {
194 // This example shows how character sets known only at run-time can be used.
195 // This is motivated by multipart MIME email, where each part can have a different
196 // character set. But since MIME is rather complex to parse, this example uses
197 // the following simpler format: the input byte sequence consists of a character
198 // set name (in ascii) followed by data using that character set enclosed in {},
199 // followed by further content in another character set, and so on.
200 // This example first creates such a message and then decomposes it.
201
202 cout << "\nruntime_tagged_example:\n";
203
204 // We'll store the hybrid message in a std::string.
205 string message =
206 string("utf8{") + "El catalán, moneda lingüística" + "}"
207 + "iso-8859-1{" + utf8_string("får årets Nobelpris i litteratur.").recode<latin1>() + "}";
208 // + "ucs2{" + utf8_string("Директором СВР назначен Михаил Фрадков").recode<ucs2>() + "}";
209
210 // Now parse it into a list of run-time-tagged strings:
211 typedef list<rt_tagged_string> strings_t;
212 strings_t strings;
213 string::const_iterator i = message.begin();
214 string::const_iterator e = message.end();
215 while (i != e) {
216 string::const_iterator j = find(i,e,'{');
217 string charset_name(i,j);
218 string::const_iterator k = find(j,e,'}');
219 string content(j+1,k);
220 rt_tagged_string s(lookup_charset(charset_name),content);
221 strings.push_back(s);
222 i = k+1;
223 }
224
225 // Output the parsed strings, converting to UTF8 to do so:
226 for (strings_t::const_iterator a = strings.begin();
227 a != strings.end(); ++a) {
228 utf8_string u = a->recode<utf8>();
229 cout << u << "\n";
230 }
231
232 }
233
234
235
236 // The following examples illustrate planned functionality that's not yet implemented:
237
238 #if 1
239
240 #endif
241
242
243
main()244 int main()
245 {
246 // These examples work:
247 compile_time_tagged_strings_example();
248 utf8_const_iterator_example();
249 utf8_output_iterator_example();
250 utf8_word_split_example();
251 ucs4_line_wrap_example();
252
253 runtime_tagged_example();
254
255 // These examples don't yet work:
256 #if 1
257 user_defined_charset_example();
258 #endif
259
260 return 0;
261 }
262
263