1 //
2 //  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3 //
4 //  Distributed under the Boost Software License, Version 1.0. (See
5 //  accompanying file LICENSE_1_0.txt or copy at
6 //  http://www.boost.org/LICENSE_1_0.txt)
7 //
8 #define BOOST_LOCALE_SOURCE
9 #include <boost/locale/boundary.hpp>
10 #include <boost/locale/generator.hpp>
11 #include <unicode/uversion.h>
12 #if U_ICU_VERSION_MAJOR_NUM*100 + U_ICU_VERSION_MINOR_NUM >= 306
13 #include <unicode/utext.h>
14 #endif
15 #include <unicode/brkiter.h>
16 #include <unicode/rbbi.h>
17 
18 #include "cdata.hpp"
19 #include "all_generator.hpp"
20 #include "icu_util.hpp"
21 #include "uconv.hpp"
22 
23 namespace boost {
24 namespace locale {
25 namespace boundary {
26 namespace impl_icu {
27 
28 using namespace boost::locale::impl_icu;
29 
map_direct(boundary_type t,icu::BreakIterator * it,int reserve)30 index_type map_direct(boundary_type t,icu::BreakIterator *it,int reserve)
31 {
32     index_type indx;
33     indx.reserve(reserve);
34     icu::RuleBasedBreakIterator *rbbi=dynamic_cast<icu::RuleBasedBreakIterator *>(it);
35 
36     indx.push_back(break_info());
37     it->first();
38     int pos=0;
39     while((pos=it->next())!=icu::BreakIterator::DONE) {
40         indx.push_back(break_info(pos));
41         /// Character does not have any specific break types
42         if(t!=character && rbbi) {
43             //
44             // There is a collapse for MSVC: int32_t defined by both boost::cstdint and icu...
45             // So need to pick one ;(
46             //
47             std::vector< ::int32_t> buffer;
48             ::int32_t membuf[8]={0}; // try not to use memory allocation if possible
49             ::int32_t *buf=membuf;
50 
51             UErrorCode err=U_ZERO_ERROR;
52             int n = rbbi->getRuleStatusVec(buf,8,err);
53 
54             if(err == U_BUFFER_OVERFLOW_ERROR) {
55                 buf=&buffer.front();
56                 buffer.resize(n,0);
57                 n = rbbi->getRuleStatusVec(buf,buffer.size(),err);
58             }
59 
60             check_and_throw_icu_error(err);
61 
62             for(int i=0;i<n;i++) {
63                 switch(t) {
64                 case word:
65                     if(UBRK_WORD_NONE<=buf[i] && buf[i]<UBRK_WORD_NONE_LIMIT)
66                         indx.back().rule |= word_none;
67                     else if(UBRK_WORD_NUMBER<=buf[i] && buf[i]<UBRK_WORD_NUMBER_LIMIT)
68                         indx.back().rule |= word_number;
69                     else if(UBRK_WORD_LETTER<=buf[i] && buf[i]<UBRK_WORD_LETTER_LIMIT)
70                         indx.back().rule |= word_letter;
71                     else if(UBRK_WORD_KANA<=buf[i] && buf[i]<UBRK_WORD_KANA_LIMIT)
72                         indx.back().rule |= word_kana;
73                     else if(UBRK_WORD_IDEO<=buf[i] && buf[i]<UBRK_WORD_IDEO_LIMIT)
74                         indx.back().rule |= word_ideo;
75                     break;
76 
77                 case line:
78                     if(UBRK_LINE_SOFT<=buf[i] && buf[i]<UBRK_LINE_SOFT_LIMIT)
79                         indx.back().rule |= line_soft;
80                     else if(UBRK_LINE_HARD<=buf[i] && buf[i]<UBRK_LINE_HARD_LIMIT)
81                         indx.back().rule |= line_hard;
82                     break;
83 
84                 case sentence:
85                     if(UBRK_SENTENCE_TERM<=buf[i] && buf[i]<UBRK_SENTENCE_TERM_LIMIT)
86                         indx.back().rule |= sentence_term;
87                     else if(UBRK_SENTENCE_SEP<=buf[i] && buf[i]<UBRK_SENTENCE_SEP_LIMIT)
88                         indx.back().rule |= sentence_sep;
89                     break;
90                 default:
91                     ;
92                 }
93             }
94         }
95         else {
96             indx.back().rule |=character_any; // Baisc mark... for character
97         }
98     }
99     return indx;
100 }
101 
get_iterator(boundary_type t,icu::Locale const & loc)102 std::auto_ptr<icu::BreakIterator> get_iterator(boundary_type t,icu::Locale const &loc)
103 {
104     UErrorCode err=U_ZERO_ERROR;
105     std::auto_ptr<icu::BreakIterator> bi;
106     switch(t) {
107     case character:
108         bi.reset(icu::BreakIterator::createCharacterInstance(loc,err));
109         break;
110     case word:
111         bi.reset(icu::BreakIterator::createWordInstance(loc,err));
112         break;
113     case sentence:
114         bi.reset(icu::BreakIterator::createSentenceInstance(loc,err));
115         break;
116     case line:
117         bi.reset(icu::BreakIterator::createLineInstance(loc,err));
118         break;
119     default:
120         throw std::runtime_error("Invalid iteration type");
121     }
122     check_and_throw_icu_error(err);
123     if(!bi.get())
124         throw std::runtime_error("Failed to create break iterator");
125     return bi;
126 }
127 
128 
129 template<typename CharType>
do_map(boundary_type t,CharType const * begin,CharType const * end,icu::Locale const & loc,std::string const & encoding)130 index_type do_map(boundary_type t,CharType const *begin,CharType const *end,icu::Locale const &loc,std::string const &encoding)
131 {
132     index_type indx;
133     std::auto_ptr<icu::BreakIterator> bi(get_iterator(t,loc));
134 
135 #if U_ICU_VERSION_MAJOR_NUM*100 + U_ICU_VERSION_MINOR_NUM >= 306
136     UErrorCode err=U_ZERO_ERROR;
137     if(sizeof(CharType) == 2 || (sizeof(CharType)==1 && encoding=="UTF-8"))
138     {
139         UText *ut=0;
140         try {
141             if(sizeof(CharType)==1)
142                 ut=utext_openUTF8(0,reinterpret_cast<char const *>(begin),end-begin,&err);
143             else // sizeof(CharType)==2
144                 ut=utext_openUChars(0,reinterpret_cast<UChar const *>(begin),end-begin,&err);
145 
146             check_and_throw_icu_error(err);
147             err=U_ZERO_ERROR;
148             if(!ut) throw std::runtime_error("Failed to create UText");
149             bi->setText(ut,err);
150             check_and_throw_icu_error(err);
151             index_type res=map_direct(t,bi.get(),end-begin);
152             indx.swap(res);
153         }
154         catch(...) {
155             if(ut)
156                 utext_close(ut);
157             throw;
158         }
159         if(ut) utext_close(ut);
160     }
161     else
162 #endif
163     {
164         icu_std_converter<CharType> cvt(encoding);
165         icu::UnicodeString str=cvt.icu(begin,end);
166         bi->setText(str);
167         index_type indirect = map_direct(t,bi.get(),str.length());
168         indx=indirect;
169         for(size_t i=1;i<indirect.size();i++) {
170             size_t offset_inderect=indirect[i-1].offset;
171             size_t diff = indirect[i].offset - offset_inderect;
172             size_t offset_direct=indx[i-1].offset;
173             indx[i].offset=offset_direct + cvt.cut(str,begin,end,diff,offset_inderect,offset_direct);
174         }
175     }
176     return indx;
177 } // do_map
178 
179 template<typename CharType>
180 class boundary_indexing_impl : public boundary_indexing<CharType> {
181 public:
boundary_indexing_impl(cdata const & data)182     boundary_indexing_impl(cdata const &data) :
183         locale_(data.locale),
184         encoding_(data.encoding)
185     {
186     }
map(boundary_type t,CharType const * begin,CharType const * end) const187     index_type map(boundary_type t,CharType const *begin,CharType const *end) const
188     {
189         return do_map<CharType>(t,begin,end,locale_,encoding_);
190     }
191 private:
192     icu::Locale locale_;
193     std::string encoding_;
194 };
195 
196 
197 
198 } // impl_icu
199 } // boundary
200 
201 namespace impl_icu {
create_boundary(std::locale const & in,cdata const & cd,character_facet_type type)202     std::locale create_boundary(std::locale const &in,cdata const &cd,character_facet_type type)
203     {
204         using namespace boost::locale::boundary::impl_icu;
205         switch(type) {
206         case char_facet:
207             return std::locale(in,new boundary_indexing_impl<char>(cd));
208         case wchar_t_facet:
209             return std::locale(in,new boundary_indexing_impl<wchar_t>(cd));
210         #ifdef BOOST_HAS_CHAR16_T
211         case char16_t_facet:
212             return std::locale(in,new boundary_indexing_impl<char16_t>(cd));
213         #endif
214         #ifdef BOOST_HAS_CHAR32_T
215         case char32_t_facet:
216             return std::locale(in,new boundary_indexing_impl<char32_t>(cd));
217         #endif
218         default:
219             return in;
220         }
221     }
222 } // impl_icu
223 
224 } // locale
225 } // boost
226 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
227