1 // Example use of charset.hh in ../include/charset.hh.
2 //
3 // To compile, you need to make charset.hh accessible on the include path.
4 // Also, charset.hh includes my iconv wrapper, which is in ../include/Incover.hh,
5 // and depends on other things in that directory; those dependencies could be
6 // made to go away.
7 
8 
9 // This source file should be viewed using a UTF8 editor, and its output when run should be
10 // viewed on a UTF8 terminal.
11 
12 
13 #include "charset.hh"
14 
15 #include <iostream>
16 #include <algorithm>
17 #include <list>
18 
19 using namespace pbe;
20 using namespace std;
21 
22 
compile_time_tagged_strings_example()23 void compile_time_tagged_strings_example()
24 {
25   // This example declares strings with compile-time-fixed character sets, converts
26   // them to other compile-time-fixed character sets, combines them, and checks for
27   // consistency:
28 
29   cout << "\ncompile_time_tagged_strings_example:\n";
30 
31   utf8_string french = "Le traité simplifié prêt à être soumis "
32                        "à l'approbation des gouvernements";
33   latin1_string french_fixed = french.recode<latin1>();
34 
35   utf8_string icelandic = "Smjörið er brætt og hveitið smátt og smátt hrært út í það";
36   latin1_string icelandic_fixed = icelandic.recode<latin1>();
37 
38   utf8_string all = french + icelandic;
39 
40   latin1_string all_fixed = french_fixed + icelandic_fixed;
41 
42   if ((all.recode<latin1>() == all_fixed)
43       && (all == all_fixed.recode<utf8>())) {
44     cout << "Pass, both strings are '" << all << "'\n";
45   }
46 }
47 
48 
utf8_const_iterator_example()49 void utf8_const_iterator_example()
50 {
51   // This example shows how a string with a variable-width
52   // character set can be iterated over character-at-a-time
53   // or "unit"-at-a-time.
54 
55   cout << "\nutf8_const_iterator_example:\n";
56 
57   utf8_string s = "Théâtre";  // My editor stores UTF8.
58 
59   // Iterate "unit" (byte) at a time:
60   cout << "Here are the bytes of '" << s << "': " << hex;
61   for (utf8_string::const_iterator i = s.begin();
62        i != s.end(); ++i) {
63     char8_t c = *i;
64     cout << static_cast<unsigned int>(static_cast<uint8_t>(c)) << " ";
65   }
66 
67   // Iterate character at a time:
68   cout << "\nHere are the characters of '" << s << "': ";
69   for (utf8_string::const_character_iterator i = s.begin();
70        i != utf8_string::const_character_iterator( s.end() ); ++i) {
71     utf8_char_t c = *i;  // A 32-bit decoded Unicode character
72     cout << static_cast<unsigned int>(c) << " ";
73   }
74   cout << dec << "\n";
75 }
76 
77 
utf8_output_iterator_example()78 void utf8_output_iterator_example()
79 {
80   // This example shows how a string with a variable-width
81   // character set can be appended to using push_back and
82   // an output iterator.
83 
84   cout << "\nutf8_output_iterator_example:\n";
85 
86   utf8_string s;
87 
88   for (utf8_char_t c=64; c<96; ++c) {
89     s.push_back(c);
90   }
91 
92   utf8_string::character_output_iterator i(s);
93 
94   for (utf8_char_t c=150; c<200; ++c) {
95     *i++ = c;
96 //    s.push_back(c);
97   }
98 
99   cout << "Unicode characters 64 to 95 and 150 to 199:\n"
100        << s << "\n";
101 }
102 
103 
utf8_word_split_example()104 void utf8_word_split_example()
105 {
106   // This example demonstrates a case where a "unit" rather than a character iterator for a
107   // UTF8 string is useful: because bytes < 128 can only ever represent single characters in
108   // UTF8, we can treat a UTF8 string as a sequence of bytes when spliting at spaces.
109 
110   cout << "\nutf8_word_split_example:\n";
111 
112   utf8_string s = "Yo también quemo la Corona española";
113   utf8_string::const_iterator i = s.begin();
114   utf8_string::const_iterator e = s.end();
115   utf8_string::const_iterator j;
116   do {
117     j = find(i,e,' ');
118     utf8_string word(i,j);
119     cout << word << "\n";
120     i = j+1;
121   } while (j != e);
122 }
123 
124 
ucs4_line_wrap_example()125 void ucs4_line_wrap_example()
126 {
127   // Sometimes a random-access character iterator is needed, but an iso_8859 or similar byte
128   // character set can't be used because the characters in the content are not restricted.
129   // In this case, ucs4 is normally the best choice - though its requirement for 4 bytes per
130   // character may be considered a disadvantage in memory-limited applications.
131   // This example uses random access to break a string into lines of <=40 characters each.
132 
133   cout << "\nucs4_line_wrap_example:\n";
134 
135   utf8_string text_var = "Партия Единая Россия отказалась от формирования первой "
136                          "тройки федерального списка - его возглавил только президент "
137                          "Владимир Путин.  Такое решение было принято на съезде Единой "
138                          "России во вторник.  Накануне президент России дал согласие "
139                          "возглавить список Единой России на выборах в Госдуму.";
140 
141   ucs4_string text_fixed = text_var.recode<ucs4>();
142 
143   for (unsigned int i=39; i<text_fixed.length(); i+=40) {
144     while (text_fixed[i]!=' ') {
145       --i;
146     }
147     text_fixed[i] = '\n';
148   }
149 
150   cout << text_fixed.recode<utf8>() << "\n";
151 }
152 
153 
154 // This example shows how a library-user can make a new character set available.
155 // The example is the KOI8 character set, a fixed-width byte character set containing
156 // cyrillic and latin characters.
157 
158 ////// This section needs some attention from a preprocessor expert; I want to use
159 ////// a counter of some sort to allocate new charset_t values with a macro:
160 ////// PBE_DEFINE_CHARSET(koi8);
161 ////// But I can't see a good way to do it.  For the time being, I'll choose a value
162 ////// manually:
163 const charset_t koi8 = static_cast<charset_t>(25);
164 
165 // Define charset_traits for KOI8:
166 namespace pbe {
167   template <>
168   struct charset_traits<koi8> {
169     typedef char8_t unit_t;
170     typedef char8_t char_t;
171   };
172 };
173 typedef tagged_string<koi8> koi8_string;
174 
user_defined_charset_example()175 void user_defined_charset_example()
176 {
177   charset_names[koi8] = "koi8";
178 
179   cout << "\nuser_defined_charset_example:\n";
180 
181   // We'll convert a string back and forth between utf8 and koi8:
182   utf8_string u = "Код Обмена Информацией, 8 бит";
183   koi8_string k = u.recode<koi8>();
184   utf8_string u2 = k.recode<utf8>();
185 
186   // KOI8 is a more compact encoiding than UTF8 for cyrillic:
187   cout << "Length of UTF8 string = " << u2.length()
188        << ", length of KOI8 string = " << k.length() << "\n";
189 }
190 
191 
runtime_tagged_example()192 void runtime_tagged_example()
193 {
194   // This example shows how character sets known only at run-time can be used.
195   // This is motivated by multipart MIME email, where each part can have a different
196   // character set.  But since MIME is rather complex to parse, this example uses
197   // the following simpler format: the input byte sequence consists of a character
198   // set name (in ascii) followed by data using that character set enclosed in {},
199   // followed by further content in another character set, and so on.
200   // This example first creates such a message and then decomposes it.
201 
202   cout << "\nruntime_tagged_example:\n";
203 
204   // We'll store the hybrid message in a std::string.
205   string message =
206     string("utf8{")  + "El catalán, moneda lingüística" + "}"
207          + "iso-8859-1{" + utf8_string("får årets Nobelpris i litteratur.").recode<latin1>() + "}";
208 //       + "ucs2{"   + utf8_string("Директором СВР назначен Михаил Фрадков").recode<ucs2>() + "}";
209 
210   // Now parse it into a list of run-time-tagged strings:
211   typedef list<rt_tagged_string> strings_t;
212   strings_t strings;
213   string::const_iterator i = message.begin();
214   string::const_iterator e = message.end();
215   while (i != e) {
216     string::const_iterator j = find(i,e,'{');
217     string charset_name(i,j);
218     string::const_iterator k = find(j,e,'}');
219     string content(j+1,k);
220     rt_tagged_string s(lookup_charset(charset_name),content);
221     strings.push_back(s);
222     i = k+1;
223   }
224 
225   // Output the parsed strings, converting to UTF8 to do so:
226   for (strings_t::const_iterator a = strings.begin();
227        a != strings.end(); ++a) {
228     utf8_string u = a->recode<utf8>();
229     cout << u << "\n";
230   }
231 
232 }
233 
234 
235 
236 // The following examples illustrate planned functionality that's not yet implemented:
237 
238 #if 1
239 
240 #endif
241 
242 
243 
main()244 int main()
245 {
246   // These examples work:
247   compile_time_tagged_strings_example();
248   utf8_const_iterator_example();
249   utf8_output_iterator_example();
250   utf8_word_split_example();
251   ucs4_line_wrap_example();
252 
253   runtime_tagged_example();
254 
255   // These examples don't yet work:
256 #if 1
257   user_defined_charset_example();
258 #endif
259 
260   return 0;
261 }
262 
263