1 // 2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) 3 // 4 // Distributed under the Boost Software License, Version 1.0. (See 5 // accompanying file LICENSE_1_0.txt or copy at 6 // http://www.boost.org/LICENSE_1_0.txt) 7 // 8 #ifndef BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED 9 #define BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED 10 11 #include <boost/locale/config.hpp> 12 #include <boost/locale/boundary/types.hpp> 13 #include <boost/locale/boundary/facets.hpp> 14 #include <boost/locale/boundary/segment.hpp> 15 #include <boost/locale/boundary/boundary_point.hpp> 16 #include <boost/iterator/iterator_facade.hpp> 17 #include <boost/type_traits/is_same.hpp> 18 #include <boost/shared_ptr.hpp> 19 #include <boost/cstdint.hpp> 20 #include <boost/assert.hpp> 21 #ifdef BOOST_MSVC 22 # pragma warning(push) 23 # pragma warning(disable : 4275 4251 4231 4660) 24 #endif 25 #include <string> 26 #include <locale> 27 #include <vector> 28 #include <iterator> 29 #include <algorithm> 30 #include <stdexcept> 31 32 #include <iostream> 33 34 namespace boost { 35 36 namespace locale { 37 38 namespace boundary { 39 /// 40 /// \defgroup boundary Boundary Analysis 41 /// 42 /// This module contains all operations required for %boundary analysis of text: character, word, like and sentence boundaries 43 /// 44 /// @{ 45 /// 46 47 /// \cond INTERNAL 48 49 namespace details { 50 51 template<typename IteratorType,typename CategoryType = typename std::iterator_traits<IteratorType>::iterator_category> 52 struct mapping_traits { 53 typedef typename std::iterator_traits<IteratorType>::value_type char_type; mapboost::locale::boundary::details::mapping_traits54 static index_type map(boundary_type t,IteratorType b,IteratorType e,std::locale const &l) 55 { 56 std::basic_string<char_type> str(b,e); 57 return std::use_facet<boundary_indexing<char_type> >(l).map(t,str.c_str(),str.c_str()+str.size()); 58 } 59 }; 60 61 template<typename CharType,typename SomeIteratorType> 62 struct linear_iterator_traits { 63 static const bool is_linear = 64 is_same<SomeIteratorType,CharType*>::value 65 || is_same<SomeIteratorType,CharType const*>::value 66 || is_same<SomeIteratorType,typename std::basic_string<CharType>::iterator>::value 67 || is_same<SomeIteratorType,typename std::basic_string<CharType>::const_iterator>::value 68 || is_same<SomeIteratorType,typename std::vector<CharType>::iterator>::value 69 || is_same<SomeIteratorType,typename std::vector<CharType>::const_iterator>::value 70 ; 71 }; 72 73 74 75 template<typename IteratorType> 76 struct mapping_traits<IteratorType,std::random_access_iterator_tag> { 77 78 typedef typename std::iterator_traits<IteratorType>::value_type char_type; 79 80 81 mapboost::locale::boundary::details::mapping_traits82 static index_type map(boundary_type t,IteratorType b,IteratorType e,std::locale const &l) 83 { 84 index_type result; 85 86 // 87 // Optimize for most common cases 88 // 89 // C++0x requires that string is continious in memory and all known 90 // string implementations 91 // do this because of c_str() support. 92 // 93 94 if(linear_iterator_traits<char_type,IteratorType>::is_linear && b!=e) 95 { 96 char_type const *begin = &*b; 97 char_type const *end = begin + (e-b); 98 index_type tmp=std::use_facet<boundary_indexing<char_type> >(l).map(t,begin,end); 99 result.swap(tmp); 100 } 101 else { 102 std::basic_string<char_type> str(b,e); 103 index_type tmp = std::use_facet<boundary_indexing<char_type> >(l).map(t,str.c_str(),str.c_str()+str.size()); 104 result.swap(tmp); 105 } 106 return result; 107 } 108 }; 109 110 template<typename BaseIterator> 111 class mapping { 112 public: 113 typedef BaseIterator base_iterator; 114 typedef typename std::iterator_traits<base_iterator>::value_type char_type; 115 116 mapping(boundary_type type,base_iterator begin,base_iterator end,std::locale const & loc)117 mapping(boundary_type type, 118 base_iterator begin, 119 base_iterator end, 120 std::locale const &loc) 121 : 122 index_(new index_type()), 123 begin_(begin), 124 end_(end) 125 { 126 index_type idx=details::mapping_traits<base_iterator>::map(type,begin,end,loc); 127 index_->swap(idx); 128 } 129 mapping()130 mapping() 131 { 132 } 133 index() const134 index_type const &index() const 135 { 136 return *index_; 137 } 138 begin() const139 base_iterator begin() const 140 { 141 return begin_; 142 } 143 end() const144 base_iterator end() const 145 { 146 return end_; 147 } 148 149 private: 150 boost::shared_ptr<index_type> index_; 151 base_iterator begin_,end_; 152 }; 153 154 template<typename BaseIterator> 155 class segment_index_iterator : 156 public boost::iterator_facade< 157 segment_index_iterator<BaseIterator>, 158 segment<BaseIterator>, 159 boost::bidirectional_traversal_tag, 160 segment<BaseIterator> const & 161 > 162 { 163 public: 164 typedef BaseIterator base_iterator; 165 typedef mapping<base_iterator> mapping_type; 166 typedef segment<base_iterator> segment_type; 167 segment_index_iterator()168 segment_index_iterator() : current_(0,0),map_(0) 169 { 170 } 171 segment_index_iterator(base_iterator p,mapping_type const * map,rule_type mask,bool full_select)172 segment_index_iterator(base_iterator p,mapping_type const *map,rule_type mask,bool full_select) : 173 map_(map), 174 mask_(mask), 175 full_select_(full_select) 176 { 177 set(p); 178 } segment_index_iterator(bool is_begin,mapping_type const * map,rule_type mask,bool full_select)179 segment_index_iterator(bool is_begin,mapping_type const *map,rule_type mask,bool full_select) : 180 map_(map), 181 mask_(mask), 182 full_select_(full_select) 183 { 184 if(is_begin) 185 set_begin(); 186 else 187 set_end(); 188 } 189 dereference() const190 segment_type const &dereference() const 191 { 192 return value_; 193 } 194 equal(segment_index_iterator const & other) const195 bool equal(segment_index_iterator const &other) const 196 { 197 return map_ == other.map_ && current_.second == other.current_.second; 198 } 199 increment()200 void increment() 201 { 202 std::pair<size_t,size_t> next = current_; 203 if(full_select_) { 204 next.first = next.second; 205 while(next.second < size()) { 206 next.second++; 207 if(valid_offset(next.second)) 208 break; 209 } 210 if(next.second == size()) 211 next.first = next.second - 1; 212 } 213 else { 214 while(next.second < size()) { 215 next.first = next.second; 216 next.second++; 217 if(valid_offset(next.second)) 218 break; 219 } 220 } 221 update_current(next); 222 } 223 decrement()224 void decrement() 225 { 226 std::pair<size_t,size_t> next = current_; 227 if(full_select_) { 228 while(next.second >1) { 229 next.second--; 230 if(valid_offset(next.second)) 231 break; 232 } 233 next.first = next.second; 234 while(next.first >0) { 235 next.first--; 236 if(valid_offset(next.first)) 237 break; 238 } 239 } 240 else { 241 while(next.second >1) { 242 next.second--; 243 if(valid_offset(next.second)) 244 break; 245 } 246 next.first = next.second - 1; 247 } 248 update_current(next); 249 } 250 251 private: 252 set_end()253 void set_end() 254 { 255 current_.first = size() - 1; 256 current_.second = size(); 257 value_ = segment_type(map_->end(),map_->end(),0); 258 } set_begin()259 void set_begin() 260 { 261 current_.first = current_.second = 0; 262 value_ = segment_type(map_->begin(),map_->begin(),0); 263 increment(); 264 } 265 set(base_iterator p)266 void set(base_iterator p) 267 { 268 size_t dist=std::distance(map_->begin(),p); 269 index_type::const_iterator b=map_->index().begin(),e=map_->index().end(); 270 index_type::const_iterator 271 boundary_point=std::upper_bound(b,e,break_info(dist)); 272 while(boundary_point != e && (boundary_point->rule & mask_)==0) 273 boundary_point++; 274 275 current_.first = current_.second = boundary_point - b; 276 277 if(full_select_) { 278 while(current_.first > 0) { 279 current_.first --; 280 if(valid_offset(current_.first)) 281 break; 282 } 283 } 284 else { 285 if(current_.first > 0) 286 current_.first --; 287 } 288 value_.first = map_->begin(); 289 std::advance(value_.first,get_offset(current_.first)); 290 value_.second = value_.first; 291 std::advance(value_.second,get_offset(current_.second) - get_offset(current_.first)); 292 293 update_rule(); 294 } 295 update_current(std::pair<size_t,size_t> pos)296 void update_current(std::pair<size_t,size_t> pos) 297 { 298 std::ptrdiff_t first_diff = get_offset(pos.first) - get_offset(current_.first); 299 std::ptrdiff_t second_diff = get_offset(pos.second) - get_offset(current_.second); 300 std::advance(value_.first,first_diff); 301 std::advance(value_.second,second_diff); 302 current_ = pos; 303 update_rule(); 304 } 305 update_rule()306 void update_rule() 307 { 308 if(current_.second != size()) { 309 value_.rule(index()[current_.second].rule); 310 } 311 } get_offset(size_t ind) const312 size_t get_offset(size_t ind) const 313 { 314 if(ind == size()) 315 return index().back().offset; 316 return index()[ind].offset; 317 } 318 valid_offset(size_t offset) const319 bool valid_offset(size_t offset) const 320 { 321 return offset == 0 322 || offset == size() // make sure we not acess index[size] 323 || (index()[offset].rule & mask_)!=0; 324 } 325 size() const326 size_t size() const 327 { 328 return index().size(); 329 } 330 index() const331 index_type const &index() const 332 { 333 return map_->index(); 334 } 335 336 337 segment_type value_; 338 std::pair<size_t,size_t> current_; 339 mapping_type const *map_; 340 rule_type mask_; 341 bool full_select_; 342 }; 343 344 template<typename BaseIterator> 345 class boundary_point_index_iterator : 346 public boost::iterator_facade< 347 boundary_point_index_iterator<BaseIterator>, 348 boundary_point<BaseIterator>, 349 boost::bidirectional_traversal_tag, 350 boundary_point<BaseIterator> const & 351 > 352 { 353 public: 354 typedef BaseIterator base_iterator; 355 typedef mapping<base_iterator> mapping_type; 356 typedef boundary_point<base_iterator> boundary_point_type; 357 boundary_point_index_iterator()358 boundary_point_index_iterator() : current_(0),map_(0) 359 { 360 } 361 boundary_point_index_iterator(bool is_begin,mapping_type const * map,rule_type mask)362 boundary_point_index_iterator(bool is_begin,mapping_type const *map,rule_type mask) : 363 map_(map), 364 mask_(mask) 365 { 366 if(is_begin) 367 set_begin(); 368 else 369 set_end(); 370 } boundary_point_index_iterator(base_iterator p,mapping_type const * map,rule_type mask)371 boundary_point_index_iterator(base_iterator p,mapping_type const *map,rule_type mask) : 372 map_(map), 373 mask_(mask) 374 { 375 set(p); 376 } 377 dereference() const378 boundary_point_type const &dereference() const 379 { 380 return value_; 381 } 382 equal(boundary_point_index_iterator const & other) const383 bool equal(boundary_point_index_iterator const &other) const 384 { 385 return map_ == other.map_ && current_ == other.current_; 386 } 387 increment()388 void increment() 389 { 390 size_t next = current_; 391 while(next < size()) { 392 next++; 393 if(valid_offset(next)) 394 break; 395 } 396 update_current(next); 397 } 398 decrement()399 void decrement() 400 { 401 size_t next = current_; 402 while(next>0) { 403 next--; 404 if(valid_offset(next)) 405 break; 406 } 407 update_current(next); 408 } 409 410 private: set_end()411 void set_end() 412 { 413 current_ = size(); 414 value_ = boundary_point_type(map_->end(),0); 415 } set_begin()416 void set_begin() 417 { 418 current_ = 0; 419 value_ = boundary_point_type(map_->begin(),0); 420 } 421 set(base_iterator p)422 void set(base_iterator p) 423 { 424 size_t dist = std::distance(map_->begin(),p); 425 426 index_type::const_iterator b=index().begin(); 427 index_type::const_iterator e=index().end(); 428 index_type::const_iterator ptr = std::lower_bound(b,e,break_info(dist)); 429 430 if(ptr==index().end()) 431 current_=size()-1; 432 else 433 current_=ptr - index().begin(); 434 435 while(!valid_offset(current_)) 436 current_ ++; 437 438 std::ptrdiff_t diff = get_offset(current_) - dist; 439 std::advance(p,diff); 440 value_.iterator(p); 441 update_rule(); 442 } 443 update_current(size_t pos)444 void update_current(size_t pos) 445 { 446 std::ptrdiff_t diff = get_offset(pos) - get_offset(current_); 447 base_iterator i=value_.iterator(); 448 std::advance(i,diff); 449 current_ = pos; 450 value_.iterator(i); 451 update_rule(); 452 } 453 update_rule()454 void update_rule() 455 { 456 if(current_ != size()) { 457 value_.rule(index()[current_].rule); 458 } 459 } get_offset(size_t ind) const460 size_t get_offset(size_t ind) const 461 { 462 if(ind == size()) 463 return index().back().offset; 464 return index()[ind].offset; 465 } 466 valid_offset(size_t offset) const467 bool valid_offset(size_t offset) const 468 { 469 return offset == 0 470 || offset + 1 >= size() // last and first are always valid regardless of mark 471 || (index()[offset].rule & mask_)!=0; 472 } 473 size() const474 size_t size() const 475 { 476 return index().size(); 477 } 478 index() const479 index_type const &index() const 480 { 481 return map_->index(); 482 } 483 484 485 boundary_point_type value_; 486 size_t current_; 487 mapping_type const *map_; 488 rule_type mask_; 489 }; 490 491 492 } // details 493 494 /// \endcond 495 496 template<typename BaseIterator> 497 class segment_index; 498 499 template<typename BaseIterator> 500 class boundary_point_index; 501 502 503 /// 504 /// \brief This class holds an index of segments in the text range and allows to iterate over them 505 /// 506 /// This class is provides \ref begin() and \ref end() member functions that return bidirectional iterators 507 /// to the \ref segment objects. 508 /// 509 /// It provides two options on way of selecting segments: 510 /// 511 /// - \ref rule(rule_type mask) - a mask that allows to select only specific types of segments according to 512 /// various masks %as \ref word_any. 513 /// \n 514 /// The default is to select any types of boundaries. 515 /// \n 516 /// For example: using word %boundary analysis, when the provided mask is \ref word_kana then the iterators 517 /// would iterate only over the words containing Kana letters and \ref word_any would select all types of 518 /// words excluding ranges that consist of white space and punctuation marks. So iterating over the text 519 /// "to be or not to be?" with \ref word_any rule would return segments "to", "be", "or", "not", "to", "be", instead 520 /// of default "to", " ", "be", " ", "or", " ", "not", " ", "to", " ", "be", "?". 521 /// - \ref full_select(bool how) - a flag that defines the way a range is selected if the rule of the previous 522 /// %boundary point does not fit the selected rule. 523 /// \n 524 /// For example: We want to fetch all sentences from the following text: "Hello! How\nare you?". 525 /// \n 526 /// This text contains three %boundary points separating it to sentences by different rules: 527 /// - The exclamation mark "!" ends the sentence "Hello!" 528 /// - The line feed that splits the sentence "How\nare you?" into two parts. 529 /// - The question mark that ends the second sentence. 530 /// \n 531 /// If you would only change the \ref rule() to \ref sentence_term then the segment_index would 532 /// provide two sentences "Hello!" and "are you?" %as only them actually terminated with required 533 /// terminator "!" or "?". But changing \ref full_select() to true, the selected segment would include 534 /// all the text up to previous valid %boundary point and would return two expected sentences: 535 /// "Hello!" and "How\nare you?". 536 /// 537 /// This class allows to find a segment according to the given iterator in range using \ref find() member 538 /// function. 539 /// 540 /// \note 541 /// 542 /// - Changing any of the options - \ref rule() or \ref full_select() and of course re-indexing the text 543 /// invalidates existing iterators and they can't be used any more. 544 /// - segment_index can be created from boundary_point_index or other segment_index that was created with 545 /// same \ref boundary_type. This is very fast operation %as they shared same index 546 /// and it does not require its regeneration. 547 /// 548 /// \see 549 /// 550 /// - \ref boundary_point_index 551 /// - \ref segment 552 /// - \ref boundary_point 553 /// 554 555 template<typename BaseIterator> 556 class segment_index { 557 public: 558 559 /// 560 /// The type of the iterator used to iterate over the original text 561 /// 562 typedef BaseIterator base_iterator; 563 #ifdef BOOST_LOCALE_DOXYGEN 564 /// 565 /// The bidirectional iterator that iterates over \ref value_type objects. 566 /// 567 /// - The iterators may be invalidated by use of any non-const member function 568 /// including but not limited to \ref rule(rule_type) and \ref full_select(bool). 569 /// - The returned value_type object is valid %as long %as iterator points to it. 570 /// So this following code is wrong %as t used after p was updated: 571 /// \code 572 /// segment_index<some_iterator>::iterator p=index.begin(); 573 /// segment<some_iterator> &t = *p; 574 /// ++p; 575 /// cout << t.str() << endl; 576 /// \endcode 577 /// 578 typedef unspecified_iterator_type iterator; 579 /// 580 /// \copydoc iterator 581 /// 582 typedef unspecified_iterator_type const_iterator; 583 #else 584 typedef details::segment_index_iterator<base_iterator> iterator; 585 typedef details::segment_index_iterator<base_iterator> const_iterator; 586 #endif 587 /// 588 /// The type dereferenced by the \ref iterator and \ref const_iterator. It is 589 /// an object that represents selected segment. 590 /// 591 typedef segment<base_iterator> value_type; 592 593 /// 594 /// Default constructor. 595 /// 596 /// \note 597 /// 598 /// When this object is constructed by default it does not include a valid index, thus 599 /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined 600 /// behavior 601 /// segment_index()602 segment_index() : mask_(0xFFFFFFFFu),full_select_(false) 603 { 604 } 605 /// 606 /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text 607 /// in range [begin,end) using a rule \a mask for locale \a loc. 608 /// segment_index(boundary_type type,base_iterator begin,base_iterator end,rule_type mask,std::locale const & loc=std::locale ())609 segment_index(boundary_type type, 610 base_iterator begin, 611 base_iterator end, 612 rule_type mask, 613 std::locale const &loc=std::locale()) 614 : 615 map_(type,begin,end,loc), 616 mask_(mask), 617 full_select_(false) 618 { 619 } 620 /// 621 /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text 622 /// in range [begin,end) selecting all possible segments (full mask) for locale \a loc. 623 /// segment_index(boundary_type type,base_iterator begin,base_iterator end,std::locale const & loc=std::locale ())624 segment_index(boundary_type type, 625 base_iterator begin, 626 base_iterator end, 627 std::locale const &loc=std::locale()) 628 : 629 map_(type,begin,end,loc), 630 mask_(0xFFFFFFFFu), 631 full_select_(false) 632 { 633 } 634 635 /// 636 /// Create a segment_index from a \ref boundary_point_index. It copies all indexing information 637 /// and used default rule (all possible segments) 638 /// 639 /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text 640 /// range it is much better to create one from another rather then indexing the same 641 /// range twice. 642 /// 643 /// \note \ref rule() flags are not copied 644 /// 645 segment_index(boundary_point_index<base_iterator> const &); 646 /// 647 /// Copy an index from a \ref boundary_point_index. It copies all indexing information 648 /// and uses the default rule (all possible segments) 649 /// 650 /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text 651 /// range it is much better to create one from another rather then indexing the same 652 /// range twice. 653 /// 654 /// \note \ref rule() flags are not copied 655 /// 656 segment_index const &operator = (boundary_point_index<base_iterator> const &); 657 658 659 /// 660 /// Create a new index for %boundary analysis \ref boundary_type "type" of the text 661 /// in range [begin,end) for locale \a loc. 662 /// 663 /// \note \ref rule() and \ref full_select() remain unchanged. 664 /// map(boundary_type type,base_iterator begin,base_iterator end,std::locale const & loc=std::locale ())665 void map(boundary_type type,base_iterator begin,base_iterator end,std::locale const &loc=std::locale()) 666 { 667 map_ = mapping_type(type,begin,end,loc); 668 } 669 670 /// 671 /// Get the \ref iterator on the beginning of the segments range. 672 /// 673 /// Preconditions: the segment_index should have a mapping 674 /// 675 /// \note 676 /// 677 /// The returned iterator is invalidated by access to any non-const member functions of this object 678 /// begin() const679 iterator begin() const 680 { 681 return iterator(true,&map_,mask_,full_select_); 682 } 683 684 /// 685 /// Get the \ref iterator on the ending of the segments range. 686 /// 687 /// Preconditions: the segment_index should have a mapping 688 /// 689 /// The returned iterator is invalidated by access to any non-const member functions of this object 690 /// end() const691 iterator end() const 692 { 693 return iterator(false,&map_,mask_,full_select_); 694 } 695 696 /// 697 /// Find a first valid segment following a position \a p. 698 /// 699 /// If \a p is inside a valid segment this segment is selected: 700 /// 701 /// For example: For \ref word %boundary analysis with \ref word_any rule(): 702 /// 703 /// - "to| be or ", would point to "be", 704 /// - "t|o be or ", would point to "to", 705 /// - "to be or| ", would point to end. 706 /// 707 /// 708 /// Preconditions: the segment_index should have a mapping and \a p should be valid iterator 709 /// to the text in the mapped range. 710 /// 711 /// The returned iterator is invalidated by access to any non-const member functions of this object 712 /// find(base_iterator p) const713 iterator find(base_iterator p) const 714 { 715 return iterator(p,&map_,mask_,full_select_); 716 } 717 718 /// 719 /// Get the mask of rules that are used 720 /// rule() const721 rule_type rule() const 722 { 723 return mask_; 724 } 725 /// 726 /// Set the mask of rules that are used 727 /// rule(rule_type v)728 void rule(rule_type v) 729 { 730 mask_ = v; 731 } 732 733 /// 734 /// Get the full_select property value - should segment include in the range 735 /// values that not belong to specific \ref rule() or not. 736 /// 737 /// The default value is false. 738 /// 739 /// For example for \ref sentence %boundary with rule \ref sentence_term the segments 740 /// of text "Hello! How\nare you?" are "Hello!\", "are you?" when full_select() is false 741 /// because "How\n" is selected %as sentence by a rule spits the text by line feed. If full_select() 742 /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the 743 /// following part "are you?" 744 /// 745 full_select() const746 bool full_select() const 747 { 748 return full_select_; 749 } 750 751 /// 752 /// Set the full_select property value - should segment include in the range 753 /// values that not belong to specific \ref rule() or not. 754 /// 755 /// The default value is false. 756 /// 757 /// For example for \ref sentence %boundary with rule \ref sentence_term the segments 758 /// of text "Hello! How\nare you?" are "Hello!\", "are you?" when full_select() is false 759 /// because "How\n" is selected %as sentence by a rule spits the text by line feed. If full_select() 760 /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the 761 /// following part "are you?" 762 /// 763 full_select(bool v)764 void full_select(bool v) 765 { 766 full_select_ = v; 767 } 768 769 private: 770 friend class boundary_point_index<base_iterator>; 771 typedef details::mapping<base_iterator> mapping_type; 772 mapping_type map_; 773 rule_type mask_; 774 bool full_select_; 775 }; 776 777 /// 778 /// \brief This class holds an index of \ref boundary_point "boundary points" and allows iterating 779 /// over them. 780 /// 781 /// This class is provides \ref begin() and \ref end() member functions that return bidirectional iterators 782 /// to the \ref boundary_point objects. 783 /// 784 /// It provides an option that affects selecting %boundary points according to different rules: 785 /// using \ref rule(rule_type mask) member function. It allows to set a mask that select only specific 786 /// types of %boundary points like \ref sentence_term. 787 /// 788 /// For example for a sentence %boundary analysis of a text "Hello! How\nare you?" when the default 789 /// rule is used the %boundary points would be: 790 /// 791 /// - "|Hello! How\nare you?" 792 /// - "Hello! |How\nare you?" 793 /// - "Hello! How\n|are you?" 794 /// - "Hello! How\nare you?|" 795 /// 796 /// However if \ref rule() is set to \ref sentence_term then the selected %boundary points would be: 797 /// 798 /// - "|Hello! How\nare you?" 799 /// - "Hello! |How\nare you?" 800 /// - "Hello! How\nare you?|" 801 /// 802 /// Such that a %boundary point defined by a line feed character would be ignored. 803 /// 804 /// This class allows to find a boundary_point according to the given iterator in range using \ref find() member 805 /// function. 806 /// 807 /// \note 808 /// - Even an empty text range [x,x) considered to have a one %boundary point x. 809 /// - \a a and \a b points of the range [a,b) are always considered %boundary points 810 /// regardless the rules used. 811 /// - Changing any of the option \ref rule() or course re-indexing the text 812 /// invalidates existing iterators and they can't be used any more. 813 /// - boundary_point_index can be created from segment_index or other boundary_point_index that was created with 814 /// same \ref boundary_type. This is very fast operation %as they shared same index 815 /// and it does not require its regeneration. 816 /// 817 /// \see 818 /// 819 /// - \ref segment_index 820 /// - \ref boundary_point 821 /// - \ref segment 822 /// 823 824 825 template<typename BaseIterator> 826 class boundary_point_index { 827 public: 828 /// 829 /// The type of the iterator used to iterate over the original text 830 /// 831 typedef BaseIterator base_iterator; 832 #ifdef BOOST_LOCALE_DOXYGEN 833 /// 834 /// The bidirectional iterator that iterates over \ref value_type objects. 835 /// 836 /// - The iterators may be invalidated by use of any non-const member function 837 /// including but not limited to \ref rule(rule_type) member function. 838 /// - The returned value_type object is valid %as long %as iterator points to it. 839 /// So this following code is wrong %as t used after p was updated: 840 /// \code 841 /// boundary_point_index<some_iterator>::iterator p=index.begin(); 842 /// boundary_point<some_iterator> &t = *p; 843 /// ++p; 844 /// rule_type r = t->rule(); 845 /// \endcode 846 /// 847 typedef unspecified_iterator_type iterator; 848 /// 849 /// \copydoc iterator 850 /// 851 typedef unspecified_iterator_type const_iterator; 852 #else 853 typedef details::boundary_point_index_iterator<base_iterator> iterator; 854 typedef details::boundary_point_index_iterator<base_iterator> const_iterator; 855 #endif 856 /// 857 /// The type dereferenced by the \ref iterator and \ref const_iterator. It is 858 /// an object that represents the selected \ref boundary_point "boundary point". 859 /// 860 typedef boundary_point<base_iterator> value_type; 861 862 /// 863 /// Default constructor. 864 /// 865 /// \note 866 /// 867 /// When this object is constructed by default it does not include a valid index, thus 868 /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined 869 /// behavior 870 /// boundary_point_index()871 boundary_point_index() : mask_(0xFFFFFFFFu) 872 { 873 } 874 875 /// 876 /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text 877 /// in range [begin,end) using a rule \a mask for locale \a loc. 878 /// boundary_point_index(boundary_type type,base_iterator begin,base_iterator end,rule_type mask,std::locale const & loc=std::locale ())879 boundary_point_index(boundary_type type, 880 base_iterator begin, 881 base_iterator end, 882 rule_type mask, 883 std::locale const &loc=std::locale()) 884 : 885 map_(type,begin,end,loc), 886 mask_(mask) 887 { 888 } 889 /// 890 /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text 891 /// in range [begin,end) selecting all possible %boundary points (full mask) for locale \a loc. 892 /// boundary_point_index(boundary_type type,base_iterator begin,base_iterator end,std::locale const & loc=std::locale ())893 boundary_point_index(boundary_type type, 894 base_iterator begin, 895 base_iterator end, 896 std::locale const &loc=std::locale()) 897 : 898 map_(type,begin,end,loc), 899 mask_(0xFFFFFFFFu) 900 { 901 } 902 903 /// 904 /// Create a boundary_point_index from a \ref segment_index. It copies all indexing information 905 /// and uses the default rule (all possible %boundary points) 906 /// 907 /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text 908 /// range it is much better to create one from another rather then indexing the same 909 /// range twice. 910 /// 911 /// \note \ref rule() flags are not copied 912 /// 913 boundary_point_index(segment_index<base_iterator> const &other); 914 /// 915 /// Copy a boundary_point_index from a \ref segment_index. It copies all indexing information 916 /// and keeps the current \ref rule() unchanged 917 /// 918 /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text 919 /// range it is much better to create one from another rather then indexing the same 920 /// range twice. 921 /// 922 /// \note \ref rule() flags are not copied 923 /// 924 boundary_point_index const &operator=(segment_index<base_iterator> const &other); 925 926 /// 927 /// Create a new index for %boundary analysis \ref boundary_type "type" of the text 928 /// in range [begin,end) for locale \a loc. 929 /// 930 /// \note \ref rule() remains unchanged. 931 /// map(boundary_type type,base_iterator begin,base_iterator end,std::locale const & loc=std::locale ())932 void map(boundary_type type,base_iterator begin,base_iterator end,std::locale const &loc=std::locale()) 933 { 934 map_ = mapping_type(type,begin,end,loc); 935 } 936 937 /// 938 /// Get the \ref iterator on the beginning of the %boundary points range. 939 /// 940 /// Preconditions: this boundary_point_index should have a mapping 941 /// 942 /// \note 943 /// 944 /// The returned iterator is invalidated by access to any non-const member functions of this object 945 /// begin() const946 iterator begin() const 947 { 948 return iterator(true,&map_,mask_); 949 } 950 951 /// 952 /// Get the \ref iterator on the ending of the %boundary points range. 953 /// 954 /// Preconditions: this boundary_point_index should have a mapping 955 /// 956 /// \note 957 /// 958 /// The returned iterator is invalidated by access to any non-const member functions of this object 959 /// end() const960 iterator end() const 961 { 962 return iterator(false,&map_,mask_); 963 } 964 965 /// 966 /// Find a first valid %boundary point on a position \a p or following it. 967 /// 968 /// For example: For \ref word %boundary analysis of the text "to be or" 969 /// 970 /// - "|to be", would return %boundary point at "|to be", 971 /// - "t|o be", would point to "to| be" 972 /// 973 /// Preconditions: the boundary_point_index should have a mapping and \a p should be valid iterator 974 /// to the text in the mapped range. 975 /// 976 /// The returned iterator is invalidated by access to any non-const member functions of this object 977 /// find(base_iterator p) const978 iterator find(base_iterator p) const 979 { 980 return iterator(p,&map_,mask_); 981 } 982 983 /// 984 /// Get the mask of rules that are used 985 /// rule() const986 rule_type rule() const 987 { 988 return mask_; 989 } 990 /// 991 /// Set the mask of rules that are used 992 /// rule(rule_type v)993 void rule(rule_type v) 994 { 995 mask_ = v; 996 } 997 998 private: 999 1000 friend class segment_index<base_iterator>; 1001 typedef details::mapping<base_iterator> mapping_type; 1002 mapping_type map_; 1003 rule_type mask_; 1004 }; 1005 1006 /// \cond INTERNAL 1007 template<typename BaseIterator> segment_index(boundary_point_index<BaseIterator> const & other)1008 segment_index<BaseIterator>::segment_index(boundary_point_index<BaseIterator> const &other) : 1009 map_(other.map_), 1010 mask_(0xFFFFFFFFu), 1011 full_select_(false) 1012 { 1013 } 1014 1015 template<typename BaseIterator> boundary_point_index(segment_index<BaseIterator> const & other)1016 boundary_point_index<BaseIterator>::boundary_point_index(segment_index<BaseIterator> const &other) : 1017 map_(other.map_), 1018 mask_(0xFFFFFFFFu) 1019 { 1020 } 1021 1022 template<typename BaseIterator> operator =(boundary_point_index<BaseIterator> const & other)1023 segment_index<BaseIterator> const &segment_index<BaseIterator>::operator=(boundary_point_index<BaseIterator> const &other) 1024 { 1025 map_ = other.map_; 1026 return *this; 1027 } 1028 1029 template<typename BaseIterator> operator =(segment_index<BaseIterator> const & other)1030 boundary_point_index<BaseIterator> const &boundary_point_index<BaseIterator>::operator=(segment_index<BaseIterator> const &other) 1031 { 1032 map_ = other.map_; 1033 return *this; 1034 } 1035 /// \endcond 1036 1037 typedef segment_index<std::string::const_iterator> ssegment_index; ///< convenience typedef 1038 typedef segment_index<std::wstring::const_iterator> wssegment_index; ///< convenience typedef 1039 #ifdef BOOST_LOCALE_ENABLE_CHAR16_T 1040 typedef segment_index<std::u16string::const_iterator> u16ssegment_index;///< convenience typedef 1041 #endif 1042 #ifdef BOOST_LOCALE_ENABLE_CHAR32_T 1043 typedef segment_index<std::u32string::const_iterator> u32ssegment_index;///< convenience typedef 1044 #endif 1045 1046 typedef segment_index<char const *> csegment_index; ///< convenience typedef 1047 typedef segment_index<wchar_t const *> wcsegment_index; ///< convenience typedef 1048 #ifdef BOOST_LOCALE_ENABLE_CHAR16_T 1049 typedef segment_index<char16_t const *> u16csegment_index; ///< convenience typedef 1050 #endif 1051 #ifdef BOOST_LOCALE_ENABLE_CHAR32_T 1052 typedef segment_index<char32_t const *> u32csegment_index; ///< convenience typedef 1053 #endif 1054 1055 typedef boundary_point_index<std::string::const_iterator> sboundary_point_index;///< convenience typedef 1056 typedef boundary_point_index<std::wstring::const_iterator> wsboundary_point_index;///< convenience typedef 1057 #ifdef BOOST_LOCALE_ENABLE_CHAR16_T 1058 typedef boundary_point_index<std::u16string::const_iterator> u16sboundary_point_index;///< convenience typedef 1059 #endif 1060 #ifdef BOOST_LOCALE_ENABLE_CHAR32_T 1061 typedef boundary_point_index<std::u32string::const_iterator> u32sboundary_point_index;///< convenience typedef 1062 #endif 1063 1064 typedef boundary_point_index<char const *> cboundary_point_index; ///< convenience typedef 1065 typedef boundary_point_index<wchar_t const *> wcboundary_point_index; ///< convenience typedef 1066 #ifdef BOOST_LOCALE_ENABLE_CHAR16_T 1067 typedef boundary_point_index<char16_t const *> u16cboundary_point_index;///< convenience typedef 1068 #endif 1069 #ifdef BOOST_LOCALE_ENABLE_CHAR32_T 1070 typedef boundary_point_index<char32_t const *> u32cboundary_point_index;///< convenience typedef 1071 #endif 1072 1073 1074 1075 } // boundary 1076 1077 } // locale 1078 } // boost 1079 1080 /// 1081 /// \example boundary.cpp 1082 /// Example of using segment_index 1083 /// \example wboundary.cpp 1084 /// Example of using segment_index over wide strings 1085 /// 1086 1087 #ifdef BOOST_MSVC 1088 #pragma warning(pop) 1089 #endif 1090 1091 #endif 1092 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 1093