1 /*=============================================================================
2     Copyright (c) 2001-2011 Joel de Guzman
3 
4     Distributed under the Boost Software License, Version 1.0. (See accompanying
5     file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 =============================================================================*/
7 #include <boost/config/warning_disable.hpp>
8 #include <boost/spirit/include/qi.hpp>
9 #include <boost/spirit/include/phoenix.hpp>
10 #include <boost/unordered_map.hpp>
11 #include <boost/algorithm/string/trim.hpp>
12 #include <boost/cstdint.hpp>
13 #include <boost/foreach.hpp>
14 #include <boost/array.hpp>
15 #include <boost/scoped_array.hpp>
16 #include <boost/range/iterator_range.hpp>
17 
18 #include <iostream>
19 #include <iomanip>
20 #include <fstream>
21 #include <vector>
22 #include <algorithm>
23 #include <string>
24 #include <map>
25 
26 // We place the data here. Each line comprises various fields
27 typedef std::vector<std::string> ucd_line;
28 typedef std::vector<ucd_line> ucd_vector;
29 typedef std::vector<ucd_line>::iterator ucd_iterator;
30 
31 // spirit and phoenix using declarations
32 using boost::spirit::qi::parse;
33 using boost::spirit::qi::hex;
34 using boost::spirit::qi::char_;
35 using boost::spirit::qi::eol;
36 using boost::spirit::qi::rule;
37 using boost::spirit::qi::omit;
38 using boost::spirit::qi::_1;
39 using boost::spirit::qi::_val;
40 using boost::phoenix::push_back;
41 using boost::phoenix::ref;
42 
43 // basic unsigned types
44 using boost::uint8_t;
45 using boost::uint16_t;
46 using boost::uint32_t;
47 
48 // a char range
49 struct ucd_range
50 {
ucd_rangeucd_range51     ucd_range(uint32_t start, uint32_t finish)
52         : start(start), finish(finish) {}
53 
54     // we need this so we can use ucd_range as a multimap key
operator <(ucd_range const & a,ucd_range const & b)55     friend bool operator<(ucd_range const& a, ucd_range const& b)
56     {
57         return a.start < b.start;
58     }
59 
60     uint32_t start;
61     uint32_t finish;
62 };
63 
64 class ucd_info
65 {
66 public:
67 
ucd_info(char const * filename)68     ucd_info(char const* filename)
69     {
70         std::ifstream in(filename, std::ios_base::in);
71         if (!in)
72         {
73             std::cerr << "Error: Could not open input file: "
74                 << filename << std::endl;
75         }
76         else
77         {
78             std::string data;               // We will read the contents here.
79             in.unsetf(std::ios::skipws);    // No white space skipping!
80             std::copy(
81                 std::istream_iterator<char>(in),
82                 std::istream_iterator<char>(),
83                 std::back_inserter(data));
84 
85             typedef std::string::const_iterator iterator_type;
86             iterator_type f = data.begin();
87             iterator_type l = data.end();
88 
89             rule<iterator_type> endl = -('#' >> *(char_-eol)) >> eol;
90             rule<iterator_type, std::string()> field = *(char_-(';'|endl)) >> (';'|&endl);
91             rule<iterator_type, ucd_line()> line = +(field-endl) >> endl;
92             rule<iterator_type, std::vector<ucd_line>()> file = +(endl | line[push_back(_val, _1)]);
93 
94             parse(f, l, file, info);
95         }
96     }
97 
98     template <typename Array>
collect(Array & data,int field,bool collect_properties=true) const99     void collect(Array& data, int field, bool collect_properties = true) const
100     {
101         BOOST_ASSERT(!info.empty());
102         ucd_vector::const_iterator current = info.begin();
103         ucd_vector::const_iterator end = info.end();
104 
105         while (current != end)
106         {
107             std::string range = (*current)[0];
108             boost::trim(range);
109 
110             std::string::const_iterator f = range.begin();
111             std::string::const_iterator l = range.end();
112 
113             // get the code-point range
114             uint32_t start;
115             uint32_t finish;
116             parse(f, l, hex[ref(start) = ref(finish) = _1] >> -(".." >> hex[ref(finish) = _1]));
117 
118             // special case for UnicodeData.txt ranges:
119             if ((*current)[1].find("First>") != std::string::npos)
120             {
121                 ++current;
122                 BOOST_ASSERT(current != end);
123                 BOOST_ASSERT((*current)[1].find("Last>") != std::string::npos);
124 
125                 std::string range = (*current)[0];
126                 boost::trim(range);
127                 f = range.begin();
128                 l = range.end();
129 
130                 parse(f, l, hex[ref(finish) = _1]);
131             }
132 
133             std::string code;
134             if (field < int(current->size()))
135                 code = (*current)[field];
136             boost::trim(code);
137             // Only collect properties we are interested in
138             if (collect_properties) // code for properties
139             {
140                 if (!ignore_property(code))
141                 {
142                     for (uint32_t i = start; i <= finish; ++i)
143                         data[i] |= map_property(code);
144                 }
145             }
146             else // code for actual numeric values
147             {
148                 for (uint32_t i = start; i <= finish; ++i)
149                 {
150                     if (code.empty())
151                     {
152                         data[i] = 0; // signal that this code maps to itself
153                     }
154                     else
155                     {
156                         f = code.begin();
157                         l = code.end();
158                         parse(f, l, hex, data[i]);
159                     }
160                 }
161             }
162             ++current;
163         }
164     }
165 
166 private:
167 
ignore_property(std::string const & p)168     static bool ignore_property(std::string const& p)
169     {
170         // We don't handle all properties
171         std::map<std::string, int>& pm = get_property_map();
172         std::map<std::string, int>::iterator i = pm.find(p);
173         return i == pm.end();
174     }
175 
176     static int
map_property(std::string const & p)177     map_property(std::string const& p)
178     {
179         std::map<std::string, int>& pm = get_property_map();
180         std::map<std::string, int>::iterator i = pm.find(p);
181         BOOST_ASSERT(i != pm.end());
182         return i->second;
183     }
184 
185     static std::map<std::string, int>&
get_property_map()186     get_property_map()
187     {
188         // The properties we are interested in:
189         static std::map<std::string, int> map;
190         if (map.empty())
191         {
192             // General_Category
193             map["Lu"] = 0;
194             map["Ll"] = 1;
195             map["Lt"] = 2;
196             map["Lm"] = 3;
197             map["Lo"] = 4;
198 
199             map["Mn"] = 8;
200             map["Me"] = 9;
201             map["Mc"] = 10;
202 
203             map["Nd"] = 16;
204             map["Nl"] = 17;
205             map["No"] = 18;
206 
207             map["Zs"] = 24;
208             map["Zl"] = 25;
209             map["Zp"] = 26;
210 
211             map["Cc"] = 32;
212             map["Cf"] = 33;
213             map["Co"] = 34;
214             map["Cs"] = 35;
215             map["Cn"] = 36;
216 
217             map["Pd"] = 40;
218             map["Ps"] = 41;
219             map["Pe"] = 42;
220             map["Pc"] = 43;
221             map["Po"] = 44;
222             map["Pi"] = 45;
223             map["Pf"] = 46;
224 
225             map["Sm"] = 48;
226             map["Sc"] = 49;
227             map["Sk"] = 50;
228             map["So"] = 51;
229 
230             // Derived Properties.
231             map["Alphabetic"] = 64;
232             map["Uppercase"] = 128;
233             map["Lowercase"] = 256;
234             map["White_Space"] = 512;
235             map["Hex_Digit"] = 1024;
236             map["Noncharacter_Code_Point"] = 2048;
237             map["Default_Ignorable_Code_Point"] = 4096;
238 
239             // Script
240             map["Arabic"] = 0;
241             map["Imperial_Aramaic"] = 1;
242             map["Armenian"] = 2;
243             map["Avestan"] = 3;
244             map["Balinese"] = 4;
245             map["Bamum"] = 5;
246             map["Bengali"] = 6;
247             map["Bopomofo"] = 7;
248             map["Braille"] = 8;
249             map["Buginese"] = 9;
250             map["Buhid"] = 10;
251             map["Canadian_Aboriginal"] = 11;
252             map["Carian"] = 12;
253             map["Cham"] = 13;
254             map["Cherokee"] = 14;
255             map["Coptic"] = 15;
256             map["Cypriot"] = 16;
257             map["Cyrillic"] = 17;
258             map["Devanagari"] = 18;
259             map["Deseret"] = 19;
260             map["Egyptian_Hieroglyphs"] = 20;
261             map["Ethiopic"] = 21;
262             map["Georgian"] = 22;
263             map["Glagolitic"] = 23;
264             map["Gothic"] = 24;
265             map["Greek"] = 25;
266             map["Gujarati"] = 26;
267             map["Gurmukhi"] = 27;
268             map["Hangul"] = 28;
269             map["Han"] = 29;
270             map["Hanunoo"] = 30;
271             map["Hebrew"] = 31;
272             map["Hiragana"] = 32;
273             map["Katakana_Or_Hiragana"] = 33;
274             map["Old_Italic"] = 34;
275             map["Javanese"] = 35;
276             map["Kayah_Li"] = 36;
277             map["Katakana"] = 37;
278             map["Kharoshthi"] = 38;
279             map["Khmer"] = 39;
280             map["Kannada"] = 40;
281             map["Kaithi"] = 41;
282             map["Tai_Tham"] = 42;
283             map["Lao"] = 43;
284             map["Latin"] = 44;
285             map["Lepcha"] = 45;
286             map["Limbu"] = 46;
287             map["Linear_B"] = 47;
288             map["Lisu"] = 48;
289             map["Lycian"] = 49;
290             map["Lydian"] = 50;
291             map["Malayalam"] = 51;
292             map["Mongolian"] = 52;
293             map["Meetei_Mayek"] = 53;
294             map["Myanmar"] = 54;
295             map["Nko"] = 55;
296             map["Ogham"] = 56;
297             map["Ol_Chiki"] = 57;
298             map["Old_Turkic"] = 58;
299             map["Oriya"] = 59;
300             map["Osmanya"] = 60;
301             map["Phags_Pa"] = 61;
302             map["Inscriptional_Pahlavi"] = 62;
303             map["Phoenician"] = 63;
304             map["Inscriptional_Parthian"] = 64;
305             map["Rejang"] = 65;
306             map["Runic"] = 66;
307             map["Samaritan"] = 67;
308             map["Old_South_Arabian"] = 68;
309             map["Saurashtra"] = 69;
310             map["Shavian"] = 70;
311             map["Sinhala"] = 71;
312             map["Sundanese"] = 72;
313             map["Syloti_Nagri"] = 73;
314             map["Syriac"] = 74;
315             map["Tagbanwa"] = 75;
316             map["Tai_Le"] = 76;
317             map["New_Tai_Lue"] = 77;
318             map["Tamil"] = 78;
319             map["Tai_Viet"] = 79;
320             map["Telugu"] = 80;
321             map["Tifinagh"] = 81;
322             map["Tagalog"] = 82;
323             map["Thaana"] = 83;
324             map["Thai"] = 84;
325             map["Tibetan"] = 85;
326             map["Ugaritic"] = 86;
327             map["Vai"] = 87;
328             map["Old_Persian"] = 88;
329             map["Cuneiform"] = 89;
330             map["Yi"] = 90;
331             map["Inherited"] = 91;
332             map["Common"] = 92;
333             map["Unknown"] = 93;
334         }
335         return map;
336     }
337 
338     ucd_vector info;
339 };
340 
341 template <typename T, uint32_t block_size_ = 256>
342 class ucd_table_builder
343 {
344 public:
345 
346     static uint32_t const block_size = block_size_;
347     static uint32_t const full_span = 0x110000;
348     typedef T value_type;
349 
ucd_table_builder()350     ucd_table_builder() : p(new T[full_span])
351     {
352         for (uint32_t i = 0; i < full_span; ++i)
353             p[i] = 0;
354     }
355 
collect(char const * filename,int field,bool collect_properties=true)356     void collect(char const* filename, int field, bool collect_properties = true)
357     {
358         std::cout << "collecting " << filename << std::endl;
359         ucd_info info(filename);
360         info.collect(p, field, collect_properties);
361     }
362 
build(std::vector<uint8_t> & stage1,std::vector<T const * > & stage2)363     void build(std::vector<uint8_t>& stage1, std::vector<T const*>& stage2)
364     {
365         std::cout << "building tables" << std::endl;
366         std::map<block_ptr, std::vector<T const*> > blocks;
367         for (T const* i = p.get(); i < (p.get() + full_span); i += block_size)
368             blocks[block_ptr(i)].push_back(i);
369 
370         // Not enough bits to store the block indices.
371         BOOST_ASSERT(blocks.size() < (1 << (sizeof(uint8_t) * 8)));
372 
373         typedef std::pair<block_ptr, std::vector<T const*> > blocks_value_type;
374         std::map<T const*, std::vector<T const*> > sorted_blocks;
375         BOOST_FOREACH(blocks_value_type const& val, blocks)
376         {
377             sorted_blocks[val.first.p] = val.second;
378         }
379 
380         stage1.clear();
381         stage1.reserve(full_span / block_size);
382         stage1.resize(full_span / block_size);
383         stage2.clear();
384         stage2.reserve(blocks.size());
385 
386         typedef std::pair<T const*, std::vector<T const*> > sorted_blocks_value_type;
387         BOOST_FOREACH(sorted_blocks_value_type const& val, sorted_blocks)
388         {
389             stage2.push_back(val.first);
390             BOOST_FOREACH(T const* val2, val.second)
391             {
392                 stage1[(val2 - p.get()) / block_size] = stage2.size() - 1;
393             }
394         }
395     }
396 
397 private:
398 
399     struct block_ptr
400     {
block_ptrucd_table_builder::block_ptr401         block_ptr(T const* p) : p(p) {}
402 
operator <(block_ptr a,block_ptr b)403         friend bool operator<(block_ptr a, block_ptr b)
404         {
405             return std::lexicographical_compare(
406                 a.p, a.p + block_size, b.p, b.p + block_size);
407         }
408 
409         T const* p;
410     };
411 
412     boost::scoped_array<T> p;
413 };
414 
415 template <typename Out>
print_tab(Out & out,int tab)416 void print_tab(Out& out, int tab)
417 {
418     for (int i = 0; i < tab; ++i)
419         out << ' ';
420 }
421 
422 template <typename Out, typename C>
print_table(Out & out,C const & c,bool trailing_comma,int width=4,int group=16)423 void print_table(Out& out, C const& c, bool trailing_comma, int width = 4, int group = 16)
424 {
425     int const tab = 4;
426     typename C::size_type size = c.size();
427     BOOST_ASSERT(size > 1);
428     print_tab(out, tab);
429     out << std::setw(width) << int(c[0]);
430     for (C::size_type i = 1; i < size; ++i)
431     {
432         out << ", ";
433         if ((i % group) == 0)
434         {
435             out << std::endl;
436             print_tab(out, tab);
437         }
438         out << std::setw(width) << int(c[i]);
439     }
440 
441     if (trailing_comma)
442         out << ", " << std::endl;
443 }
444 
445 template <typename Out>
print_head(Out & out)446 void print_head(Out& out)
447 {
448     out
449         << "/*=============================================================================\n"
450         << "    Copyright (c) 2001-2011 Joel de Guzman\n"
451         << "\n"
452         << "    Distributed under the Boost Software License, Version 1.0. (See accompanying\n"
453         << "    file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)\n"
454         << "\n"
455         << "    AUTOGENERATED. DO NOT EDIT!!!\n"
456         << "==============================================================================*/\n"
457         << "#include <boost/cstdint.hpp>\n"
458         << "\n"
459         << "namespace boost { namespace spirit { namespace ucd { namespace detail\n"
460         << "{"
461         ;
462 }
463 
464 template <typename Out>
print_tail(Out & out)465 void print_tail(Out& out)
466 {
467     out
468         << "\n"
469         << "}}}} // namespace boost::spirit::unicode::detail\n"
470         ;
471 }
472 
get_int_type_name(int size)473 char const* get_int_type_name(int size)
474 {
475     switch (size)
476     {
477         case 1: return "::boost::uint8_t";
478         case 2: return "::boost::uint16_t";
479         case 4: return "::boost::uint32_t";
480         case 5: return "::boost::uint64_t";
481         default: BOOST_ASSERT(false); return 0; // invalid size
482     };
483 }
484 
485 template <typename Out, typename Builder>
print_file(Out & out,Builder & builder,int field_width,char const * name)486 void print_file(Out& out, Builder& builder, int field_width, char const* name)
487 {
488     std::cout << "Generating " << name << " tables" << std::endl;
489 
490     uint32_t const block_size = Builder::block_size;
491     typedef typename Builder::value_type value_type;
492     print_head(out);
493 
494     std::vector<uint8_t> stage1;
495     std::vector<value_type const*> stage2;
496     builder.build(stage1, stage2);
497     std::cout << "Block Size: " << block_size << std::endl;
498     std::cout << "Total Bytes: "
499         << stage1.size()+(stage2.size()*block_size*sizeof(value_type))
500         << std::endl;
501 
502     out
503         << "\n"
504         << "    static const ::boost::uint8_t " << name << "_stage1[] = {\n"
505         << "\n"
506         ;
507 
508     print_table(out, stage1, false, 3);
509     char const* int_name = get_int_type_name(sizeof(value_type));
510 
511     out
512         << "\n"
513         << "    };"
514         << "\n"
515         << "\n"
516         << "    static const " << int_name << ' ' << name << "_stage2[] = {"
517         ;
518 
519     int block_n = 0;
520     for (int i = 0; i < int(stage2.size()); ++i)
521     {
522         value_type const* p = stage2[i];
523         bool last = (i+1 == stage2.size());
524         out << "\n\n    // block " << block_n++ << std::endl;
525         print_table(out,
526             boost::iterator_range<value_type const*>(p, p+block_size), !last, field_width);
527     }
528 
529     out
530         << "\n"
531         << "    };"
532         << "\n"
533         ;
534 
535     out
536         << "\n"
537         << "    inline " << int_name << ' ' << name << "_lookup(::boost::uint32_t ch)\n"
538         << "    {\n"
539         << "        ::boost::uint32_t block_offset = " << name << "_stage1[ch / " << block_size << "] * " << block_size << ";\n"
540         << "        return " << name << "_stage2[block_offset + ch % " << block_size << "];\n"
541         << "    }\n"
542         ;
543 
544     print_tail(out);
545 }
546 
main()547 int main()
548 {
549     // The category tables
550     {
551         std::ofstream out("category_table.hpp");
552         ucd_table_builder<uint16_t, 256> builder;
553         builder.collect("UnicodeData.txt", 2);
554         builder.collect("DerivedCoreProperties.txt", 1);
555         builder.collect("PropList.txt", 1);
556         print_file(out, builder, 4, "category");
557     }
558 
559     // The script tables
560     {
561         std::ofstream out("script_table.hpp");
562         ucd_table_builder<uint8_t, 256> builder;
563         builder.collect("Scripts.txt", 1);
564         print_file(out, builder, 3, "script");
565     }
566 
567     // The lowercase tables
568     {
569         std::ofstream out("lowercase_table.hpp");
570         ucd_table_builder<uint32_t, 256> builder;
571         builder.collect("UnicodeData.txt", 13, false);
572         print_file(out, builder, 6, "lowercase");
573     }
574 
575     // The uppercase tables
576     {
577         std::ofstream out("uppercase_table.hpp");
578         ucd_table_builder<uint32_t, 256> builder;
579         builder.collect("UnicodeData.txt", 12, false);
580         print_file(out, builder, 6, "uppercase");
581     }
582 
583     return 0;
584 }
585