1 //
2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See
5 // accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt)
7 //
8 #define BOOST_LOCALE_SOURCE
9 #include <boost/locale/boundary.hpp>
10 #include <boost/locale/generator.hpp>
11 #include <unicode/uversion.h>
12 #if U_ICU_VERSION_MAJOR_NUM*100 + U_ICU_VERSION_MINOR_NUM >= 306
13 #include <unicode/utext.h>
14 #endif
15 #include <unicode/brkiter.h>
16 #include <unicode/rbbi.h>
17
18 #include "cdata.hpp"
19 #include "all_generator.hpp"
20 #include "icu_util.hpp"
21 #include "uconv.hpp"
22
23 namespace boost {
24 namespace locale {
25 namespace boundary {
26 namespace impl_icu {
27
28 using namespace boost::locale::impl_icu;
29
map_direct(boundary_type t,icu::BreakIterator * it,int reserve)30 index_type map_direct(boundary_type t,icu::BreakIterator *it,int reserve)
31 {
32 index_type indx;
33 indx.reserve(reserve);
34 icu::RuleBasedBreakIterator *rbbi=dynamic_cast<icu::RuleBasedBreakIterator *>(it);
35
36 indx.push_back(break_info());
37 it->first();
38 int pos=0;
39 while((pos=it->next())!=icu::BreakIterator::DONE) {
40 indx.push_back(break_info(pos));
41 /// Character does not have any specific break types
42 if(t!=character && rbbi) {
43 //
44 // There is a collapse for MSVC: int32_t defined by both boost::cstdint and icu...
45 // So need to pick one ;(
46 //
47 std::vector< ::int32_t> buffer;
48 ::int32_t membuf[8]={0}; // try not to use memory allocation if possible
49 ::int32_t *buf=membuf;
50
51 UErrorCode err=U_ZERO_ERROR;
52 int n = rbbi->getRuleStatusVec(buf,8,err);
53
54 if(err == U_BUFFER_OVERFLOW_ERROR) {
55 buf=&buffer.front();
56 buffer.resize(n,0);
57 n = rbbi->getRuleStatusVec(buf,buffer.size(),err);
58 }
59
60 check_and_throw_icu_error(err);
61
62 for(int i=0;i<n;i++) {
63 switch(t) {
64 case word:
65 if(UBRK_WORD_NONE<=buf[i] && buf[i]<UBRK_WORD_NONE_LIMIT)
66 indx.back().rule |= word_none;
67 else if(UBRK_WORD_NUMBER<=buf[i] && buf[i]<UBRK_WORD_NUMBER_LIMIT)
68 indx.back().rule |= word_number;
69 else if(UBRK_WORD_LETTER<=buf[i] && buf[i]<UBRK_WORD_LETTER_LIMIT)
70 indx.back().rule |= word_letter;
71 else if(UBRK_WORD_KANA<=buf[i] && buf[i]<UBRK_WORD_KANA_LIMIT)
72 indx.back().rule |= word_kana;
73 else if(UBRK_WORD_IDEO<=buf[i] && buf[i]<UBRK_WORD_IDEO_LIMIT)
74 indx.back().rule |= word_ideo;
75 break;
76
77 case line:
78 if(UBRK_LINE_SOFT<=buf[i] && buf[i]<UBRK_LINE_SOFT_LIMIT)
79 indx.back().rule |= line_soft;
80 else if(UBRK_LINE_HARD<=buf[i] && buf[i]<UBRK_LINE_HARD_LIMIT)
81 indx.back().rule |= line_hard;
82 break;
83
84 case sentence:
85 if(UBRK_SENTENCE_TERM<=buf[i] && buf[i]<UBRK_SENTENCE_TERM_LIMIT)
86 indx.back().rule |= sentence_term;
87 else if(UBRK_SENTENCE_SEP<=buf[i] && buf[i]<UBRK_SENTENCE_SEP_LIMIT)
88 indx.back().rule |= sentence_sep;
89 break;
90 default:
91 ;
92 }
93 }
94 }
95 else {
96 indx.back().rule |=character_any; // Baisc mark... for character
97 }
98 }
99 return indx;
100 }
101
get_iterator(boundary_type t,icu::Locale const & loc)102 std::auto_ptr<icu::BreakIterator> get_iterator(boundary_type t,icu::Locale const &loc)
103 {
104 UErrorCode err=U_ZERO_ERROR;
105 std::auto_ptr<icu::BreakIterator> bi;
106 switch(t) {
107 case character:
108 bi.reset(icu::BreakIterator::createCharacterInstance(loc,err));
109 break;
110 case word:
111 bi.reset(icu::BreakIterator::createWordInstance(loc,err));
112 break;
113 case sentence:
114 bi.reset(icu::BreakIterator::createSentenceInstance(loc,err));
115 break;
116 case line:
117 bi.reset(icu::BreakIterator::createLineInstance(loc,err));
118 break;
119 default:
120 throw std::runtime_error("Invalid iteration type");
121 }
122 check_and_throw_icu_error(err);
123 if(!bi.get())
124 throw std::runtime_error("Failed to create break iterator");
125 return bi;
126 }
127
128
129 template<typename CharType>
do_map(boundary_type t,CharType const * begin,CharType const * end,icu::Locale const & loc,std::string const & encoding)130 index_type do_map(boundary_type t,CharType const *begin,CharType const *end,icu::Locale const &loc,std::string const &encoding)
131 {
132 index_type indx;
133 std::auto_ptr<icu::BreakIterator> bi(get_iterator(t,loc));
134
135 #if U_ICU_VERSION_MAJOR_NUM*100 + U_ICU_VERSION_MINOR_NUM >= 306
136 UErrorCode err=U_ZERO_ERROR;
137 if(sizeof(CharType) == 2 || (sizeof(CharType)==1 && encoding=="UTF-8"))
138 {
139 UText *ut=0;
140 try {
141 if(sizeof(CharType)==1)
142 ut=utext_openUTF8(0,reinterpret_cast<char const *>(begin),end-begin,&err);
143 else // sizeof(CharType)==2
144 ut=utext_openUChars(0,reinterpret_cast<UChar const *>(begin),end-begin,&err);
145
146 check_and_throw_icu_error(err);
147 err=U_ZERO_ERROR;
148 if(!ut) throw std::runtime_error("Failed to create UText");
149 bi->setText(ut,err);
150 check_and_throw_icu_error(err);
151 index_type res=map_direct(t,bi.get(),end-begin);
152 indx.swap(res);
153 }
154 catch(...) {
155 if(ut)
156 utext_close(ut);
157 throw;
158 }
159 if(ut) utext_close(ut);
160 }
161 else
162 #endif
163 {
164 icu_std_converter<CharType> cvt(encoding);
165 icu::UnicodeString str=cvt.icu(begin,end);
166 bi->setText(str);
167 index_type indirect = map_direct(t,bi.get(),str.length());
168 indx=indirect;
169 for(size_t i=1;i<indirect.size();i++) {
170 size_t offset_inderect=indirect[i-1].offset;
171 size_t diff = indirect[i].offset - offset_inderect;
172 size_t offset_direct=indx[i-1].offset;
173 indx[i].offset=offset_direct + cvt.cut(str,begin,end,diff,offset_inderect,offset_direct);
174 }
175 }
176 return indx;
177 } // do_map
178
179 template<typename CharType>
180 class boundary_indexing_impl : public boundary_indexing<CharType> {
181 public:
boundary_indexing_impl(cdata const & data)182 boundary_indexing_impl(cdata const &data) :
183 locale_(data.locale),
184 encoding_(data.encoding)
185 {
186 }
map(boundary_type t,CharType const * begin,CharType const * end) const187 index_type map(boundary_type t,CharType const *begin,CharType const *end) const
188 {
189 return do_map<CharType>(t,begin,end,locale_,encoding_);
190 }
191 private:
192 icu::Locale locale_;
193 std::string encoding_;
194 };
195
196
197
198 } // impl_icu
199 } // boundary
200
201 namespace impl_icu {
create_boundary(std::locale const & in,cdata const & cd,character_facet_type type)202 std::locale create_boundary(std::locale const &in,cdata const &cd,character_facet_type type)
203 {
204 using namespace boost::locale::boundary::impl_icu;
205 switch(type) {
206 case char_facet:
207 return std::locale(in,new boundary_indexing_impl<char>(cd));
208 case wchar_t_facet:
209 return std::locale(in,new boundary_indexing_impl<wchar_t>(cd));
210 #ifdef BOOST_HAS_CHAR16_T
211 case char16_t_facet:
212 return std::locale(in,new boundary_indexing_impl<char16_t>(cd));
213 #endif
214 #ifdef BOOST_HAS_CHAR32_T
215 case char32_t_facet:
216 return std::locale(in,new boundary_indexing_impl<char32_t>(cd));
217 #endif
218 default:
219 return in;
220 }
221 }
222 } // impl_icu
223
224 } // locale
225 } // boost
226 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
227