1 //
2 //  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3 //
4 //  Distributed under the Boost Software License, Version 1.0. (See
5 //  accompanying file LICENSE_1_0.txt or copy at
6 //  http://www.boost.org/LICENSE_1_0.txt)
7 //
8 #ifndef BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED
9 #define BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED
10 
11 #include <boost/locale/config.hpp>
12 #include <boost/locale/boundary/types.hpp>
13 #include <boost/locale/boundary/facets.hpp>
14 #include <boost/locale/boundary/segment.hpp>
15 #include <boost/locale/boundary/boundary_point.hpp>
16 #include <boost/iterator/iterator_facade.hpp>
17 #include <boost/type_traits/is_same.hpp>
18 #include <boost/shared_ptr.hpp>
19 #include <boost/cstdint.hpp>
20 #include <boost/assert.hpp>
21 #ifdef BOOST_MSVC
22 #  pragma warning(push)
23 #  pragma warning(disable : 4275 4251 4231 4660)
24 #endif
25 #include <string>
26 #include <locale>
27 #include <vector>
28 #include <iterator>
29 #include <algorithm>
30 #include <stdexcept>
31 
32 #include <iostream>
33 
34 namespace boost {
35 
36     namespace locale {
37 
38         namespace boundary {
39             ///
40             /// \defgroup boundary Boundary Analysis
41             ///
            /// This module contains all operations required for %boundary analysis of text: character, word, line and sentence boundaries
43             ///
44             /// @{
45             ///
46 
47             /// \cond INTERNAL
48 
49             namespace details {
50 
51                 template<typename IteratorType,typename CategoryType = typename std::iterator_traits<IteratorType>::iterator_category>
52                 struct mapping_traits {
53                     typedef typename std::iterator_traits<IteratorType>::value_type char_type;
mapboost::locale::boundary::details::mapping_traits54                     static index_type map(boundary_type t,IteratorType b,IteratorType e,std::locale const &l)
55                     {
56                         std::basic_string<char_type> str(b,e);
57                         return std::use_facet<boundary_indexing<char_type> >(l).map(t,str.c_str(),str.c_str()+str.size());
58                     }
59                 };
60 
61                 template<typename CharType,typename SomeIteratorType>
62                 struct linear_iterator_traits {
63                     static const bool is_linear =
64                         is_same<SomeIteratorType,CharType*>::value
65                         || is_same<SomeIteratorType,CharType const*>::value
66                         || is_same<SomeIteratorType,typename std::basic_string<CharType>::iterator>::value
67                         || is_same<SomeIteratorType,typename std::basic_string<CharType>::const_iterator>::value
68                         || is_same<SomeIteratorType,typename std::vector<CharType>::iterator>::value
69                         || is_same<SomeIteratorType,typename std::vector<CharType>::const_iterator>::value
70                         ;
71                 };
72 
73 
74 
75                 template<typename IteratorType>
76                 struct mapping_traits<IteratorType,std::random_access_iterator_tag> {
77 
78                     typedef typename std::iterator_traits<IteratorType>::value_type char_type;
79 
80 
81 
mapboost::locale::boundary::details::mapping_traits82                     static index_type map(boundary_type t,IteratorType b,IteratorType e,std::locale const &l)
83                     {
84                         index_type result;
85 
86                         //
87                         // Optimize for most common cases
88                         //
89                         // C++0x requires that string is continious in memory and all known
90                         // string implementations
91                         // do this because of c_str() support.
92                         //
93 
94                         if(linear_iterator_traits<char_type,IteratorType>::is_linear && b!=e)
95                         {
96                             char_type const *begin = &*b;
97                             char_type const *end = begin + (e-b);
98                             index_type tmp=std::use_facet<boundary_indexing<char_type> >(l).map(t,begin,end);
99                             result.swap(tmp);
100                         }
101                         else {
102                             std::basic_string<char_type> str(b,e);
103                             index_type tmp = std::use_facet<boundary_indexing<char_type> >(l).map(t,str.c_str(),str.c_str()+str.size());
104                             result.swap(tmp);
105                         }
106                         return result;
107                     }
108                 };
109 
110                 template<typename BaseIterator>
111                 class mapping {
112                 public:
113                     typedef BaseIterator base_iterator;
114                     typedef typename std::iterator_traits<base_iterator>::value_type char_type;
115 
116 
mapping(boundary_type type,base_iterator begin,base_iterator end,std::locale const & loc)117                     mapping(boundary_type type,
118                             base_iterator begin,
119                             base_iterator end,
120                             std::locale const &loc)
121                         :
122                             index_(new index_type()),
123                             begin_(begin),
124                             end_(end)
125                     {
126                         index_type idx=details::mapping_traits<base_iterator>::map(type,begin,end,loc);
127                         index_->swap(idx);
128                     }
129 
mapping()130                     mapping()
131                     {
132                     }
133 
index() const134                     index_type const &index() const
135                     {
136                         return *index_;
137                     }
138 
begin() const139                     base_iterator begin() const
140                     {
141                         return begin_;
142                     }
143 
end() const144                     base_iterator end() const
145                     {
146                         return end_;
147                     }
148 
149                 private:
150                     boost::shared_ptr<index_type> index_;
151                     base_iterator begin_,end_;
152                 };
153 
154                 template<typename BaseIterator>
155                 class segment_index_iterator :
156                     public boost::iterator_facade<
157                         segment_index_iterator<BaseIterator>,
158                         segment<BaseIterator>,
159                         boost::bidirectional_traversal_tag,
160                         segment<BaseIterator> const &
161                     >
162                 {
163                 public:
164                     typedef BaseIterator base_iterator;
165                     typedef mapping<base_iterator> mapping_type;
166                     typedef segment<base_iterator> segment_type;
167 
segment_index_iterator()168                     segment_index_iterator() : current_(0,0),map_(0)
169                     {
170                     }
171 
segment_index_iterator(base_iterator p,mapping_type const * map,rule_type mask,bool full_select)172                     segment_index_iterator(base_iterator p,mapping_type const *map,rule_type mask,bool full_select) :
173                         map_(map),
174                         mask_(mask),
175                         full_select_(full_select)
176                     {
177                         set(p);
178                     }
segment_index_iterator(bool is_begin,mapping_type const * map,rule_type mask,bool full_select)179                     segment_index_iterator(bool is_begin,mapping_type const *map,rule_type mask,bool full_select) :
180                         map_(map),
181                         mask_(mask),
182                         full_select_(full_select)
183                     {
184                         if(is_begin)
185                             set_begin();
186                         else
187                             set_end();
188                     }
189 
dereference() const190                     segment_type const &dereference() const
191                     {
192                         return value_;
193                     }
194 
equal(segment_index_iterator const & other) const195                     bool equal(segment_index_iterator const &other) const
196                     {
197                         return map_ == other.map_ && current_.second == other.current_.second;
198                     }
199 
increment()200                     void increment()
201                     {
202                         std::pair<size_t,size_t> next = current_;
203                         if(full_select_) {
204                             next.first = next.second;
205                             while(next.second < size()) {
206                                 next.second++;
207                                 if(valid_offset(next.second))
208                                     break;
209                             }
210                             if(next.second == size())
211                                 next.first = next.second - 1;
212                         }
213                         else {
214                             while(next.second < size()) {
215                                 next.first = next.second;
216                                 next.second++;
217                                 if(valid_offset(next.second))
218                                     break;
219                             }
220                         }
221                         update_current(next);
222                     }
223 
decrement()224                     void decrement()
225                     {
226                         std::pair<size_t,size_t> next = current_;
227                         if(full_select_) {
228                             while(next.second >1) {
229                                 next.second--;
230                                 if(valid_offset(next.second))
231                                     break;
232                             }
233                             next.first = next.second;
234                             while(next.first >0) {
235                                 next.first--;
236                                 if(valid_offset(next.first))
237                                     break;
238                             }
239                         }
240                         else {
241                             while(next.second >1) {
242                                 next.second--;
243                                 if(valid_offset(next.second))
244                                     break;
245                             }
246                             next.first = next.second - 1;
247                         }
248                         update_current(next);
249                     }
250 
251                 private:
252 
set_end()253                     void set_end()
254                     {
255                         current_.first  = size() - 1;
256                         current_.second = size();
257                         value_ = segment_type(map_->end(),map_->end(),0);
258                     }
set_begin()259                     void set_begin()
260                     {
261                         current_.first = current_.second = 0;
262                         value_ = segment_type(map_->begin(),map_->begin(),0);
263                         increment();
264                     }
265 
set(base_iterator p)266                     void set(base_iterator p)
267                     {
268                         size_t dist=std::distance(map_->begin(),p);
269                         index_type::const_iterator b=map_->index().begin(),e=map_->index().end();
270                         index_type::const_iterator
271                             boundary_point=std::upper_bound(b,e,break_info(dist));
272                         while(boundary_point != e && (boundary_point->rule & mask_)==0)
273                             boundary_point++;
274 
275                         current_.first = current_.second = boundary_point - b;
276 
277                         if(full_select_) {
278                             while(current_.first > 0) {
279                                 current_.first --;
280                                 if(valid_offset(current_.first))
281                                     break;
282                             }
283                         }
284                         else {
285                             if(current_.first > 0)
286                                 current_.first --;
287                         }
288                         value_.first = map_->begin();
289                         std::advance(value_.first,get_offset(current_.first));
290                         value_.second = value_.first;
291                         std::advance(value_.second,get_offset(current_.second) - get_offset(current_.first));
292 
293                         update_rule();
294                     }
295 
update_current(std::pair<size_t,size_t> pos)296                     void update_current(std::pair<size_t,size_t> pos)
297                     {
298                         std::ptrdiff_t first_diff = get_offset(pos.first) - get_offset(current_.first);
299                         std::ptrdiff_t second_diff = get_offset(pos.second) - get_offset(current_.second);
300                         std::advance(value_.first,first_diff);
301                         std::advance(value_.second,second_diff);
302                         current_ = pos;
303                         update_rule();
304                     }
305 
update_rule()306                     void update_rule()
307                     {
308                         if(current_.second != size()) {
309                             value_.rule(index()[current_.second].rule);
310                         }
311                     }
get_offset(size_t ind) const312                     size_t get_offset(size_t ind) const
313                     {
314                         if(ind == size())
315                             return index().back().offset;
316                         return index()[ind].offset;
317                     }
318 
valid_offset(size_t offset) const319                     bool valid_offset(size_t offset) const
320                     {
321                         return  offset == 0
322                                 || offset == size() // make sure we not acess index[size]
323                                 || (index()[offset].rule & mask_)!=0;
324                     }
325 
size() const326                     size_t size() const
327                     {
328                         return index().size();
329                     }
330 
index() const331                     index_type const &index() const
332                     {
333                         return map_->index();
334                     }
335 
336 
337                     segment_type value_;
338                     std::pair<size_t,size_t> current_;
339                     mapping_type const *map_;
340                     rule_type mask_;
341                     bool full_select_;
342                 };
343 
344                 template<typename BaseIterator>
345                 class boundary_point_index_iterator :
346                     public boost::iterator_facade<
347                         boundary_point_index_iterator<BaseIterator>,
348                         boundary_point<BaseIterator>,
349                         boost::bidirectional_traversal_tag,
350                         boundary_point<BaseIterator> const &
351                     >
352                 {
353                 public:
354                     typedef BaseIterator base_iterator;
355                     typedef mapping<base_iterator> mapping_type;
356                     typedef boundary_point<base_iterator> boundary_point_type;
357 
boundary_point_index_iterator()358                     boundary_point_index_iterator() : current_(0),map_(0)
359                     {
360                     }
361 
boundary_point_index_iterator(bool is_begin,mapping_type const * map,rule_type mask)362                     boundary_point_index_iterator(bool is_begin,mapping_type const *map,rule_type mask) :
363                         map_(map),
364                         mask_(mask)
365                     {
366                         if(is_begin)
367                             set_begin();
368                         else
369                             set_end();
370                     }
boundary_point_index_iterator(base_iterator p,mapping_type const * map,rule_type mask)371                     boundary_point_index_iterator(base_iterator p,mapping_type const *map,rule_type mask) :
372                         map_(map),
373                         mask_(mask)
374                     {
375                         set(p);
376                     }
377 
dereference() const378                     boundary_point_type const &dereference() const
379                     {
380                         return value_;
381                     }
382 
equal(boundary_point_index_iterator const & other) const383                     bool equal(boundary_point_index_iterator const &other) const
384                     {
385                         return map_ == other.map_ && current_ == other.current_;
386                     }
387 
increment()388                     void increment()
389                     {
390                         size_t next = current_;
391                         while(next < size()) {
392                             next++;
393                             if(valid_offset(next))
394                                 break;
395                         }
396                         update_current(next);
397                     }
398 
decrement()399                     void decrement()
400                     {
401                         size_t next = current_;
402                         while(next>0) {
403                             next--;
404                             if(valid_offset(next))
405                                 break;
406                         }
407                         update_current(next);
408                     }
409 
410                 private:
set_end()411                     void set_end()
412                     {
413                         current_ = size();
414                         value_ = boundary_point_type(map_->end(),0);
415                     }
set_begin()416                     void set_begin()
417                     {
418                         current_ = 0;
419                         value_ = boundary_point_type(map_->begin(),0);
420                     }
421 
set(base_iterator p)422                     void set(base_iterator p)
423                     {
424                         size_t dist =  std::distance(map_->begin(),p);
425 
426                         index_type::const_iterator b=index().begin();
427                         index_type::const_iterator e=index().end();
428                         index_type::const_iterator ptr = std::lower_bound(b,e,break_info(dist));
429 
430                         if(ptr==index().end())
431                             current_=size()-1;
432                         else
433                             current_=ptr - index().begin();
434 
435                         while(!valid_offset(current_))
436                             current_ ++;
437 
438                         std::ptrdiff_t diff = get_offset(current_) - dist;
439                         std::advance(p,diff);
440                         value_.iterator(p);
441                         update_rule();
442                     }
443 
update_current(size_t pos)444                     void update_current(size_t pos)
445                     {
446                         std::ptrdiff_t diff = get_offset(pos) - get_offset(current_);
447                         base_iterator i=value_.iterator();
448                         std::advance(i,diff);
449                         current_ = pos;
450                         value_.iterator(i);
451                         update_rule();
452                     }
453 
update_rule()454                     void update_rule()
455                     {
456                         if(current_ != size()) {
457                             value_.rule(index()[current_].rule);
458                         }
459                     }
get_offset(size_t ind) const460                     size_t get_offset(size_t ind) const
461                     {
462                         if(ind == size())
463                             return index().back().offset;
464                         return index()[ind].offset;
465                     }
466 
valid_offset(size_t offset) const467                     bool valid_offset(size_t offset) const
468                     {
469                         return  offset == 0
470                                 || offset + 1 >= size() // last and first are always valid regardless of mark
471                                 || (index()[offset].rule & mask_)!=0;
472                     }
473 
size() const474                     size_t size() const
475                     {
476                         return index().size();
477                     }
478 
index() const479                     index_type const &index() const
480                     {
481                         return map_->index();
482                     }
483 
484 
485                     boundary_point_type value_;
486                     size_t current_;
487                     mapping_type const *map_;
488                     rule_type mask_;
489                 };
490 
491 
492             } // details
493 
494             /// \endcond
495 
496             template<typename BaseIterator>
497             class segment_index;
498 
499             template<typename BaseIterator>
500             class boundary_point_index;
501 
502 
503             ///
504             /// \brief This class holds an index of segments in the text range and allows to iterate over them
505             ///
            /// This class provides \ref begin() and \ref end() member functions that return bidirectional iterators
            /// to the \ref segment objects.
            ///
            /// It provides two options for selecting segments:
510             ///
511             /// -   \ref rule(rule_type mask) - a mask that allows to select only specific types of segments according to
512             ///     various masks %as \ref word_any.
513             ///     \n
514             ///     The default is to select any types of boundaries.
515             ///     \n
516             ///     For example: using word %boundary analysis, when the provided mask is \ref word_kana then the iterators
517             ///     would iterate only over the words containing Kana letters and \ref word_any would select all types of
518             ///     words excluding ranges that consist of white space and punctuation marks. So iterating over the text
519             ///     "to be or not to be?" with \ref word_any rule would return segments "to", "be", "or", "not", "to", "be", instead
520             ///     of default "to", " ", "be", " ", "or", " ", "not", " ", "to", " ", "be", "?".
521             /// -   \ref full_select(bool how) - a flag that defines the way a range is selected if the rule of the previous
522             ///     %boundary point does not fit the selected rule.
523             ///     \n
524             ///     For example: We want to fetch all sentences from the following text: "Hello! How\nare you?".
525             ///     \n
526             ///     This text contains three %boundary points separating it to sentences by different rules:
527             ///     - The exclamation mark "!" ends the sentence "Hello!"
528             ///     - The line feed that splits the sentence "How\nare you?" into two parts.
529             ///     - The question mark that ends the second sentence.
530             ///     \n
531             ///     If you would only change the \ref rule() to \ref sentence_term then the segment_index would
532             ///     provide two sentences "Hello!" and "are you?" %as only them actually terminated with required
533             ///     terminator "!" or "?". But changing \ref full_select() to true, the selected segment would include
534             ///     all the text up to previous valid %boundary point and would return two expected sentences:
535             ///     "Hello!" and "How\nare you?".
536             ///
537             /// This class allows to find a segment according to the given iterator in range using \ref find() member
538             /// function.
539             ///
540             /// \note
541             ///
542             /// -   Changing any of the options - \ref rule() or \ref full_select() and of course re-indexing the text
543             ///     invalidates existing iterators and they can't be used any more.
            /// -   A segment_index can be created from a boundary_point_index or another segment_index that was created with
            ///     the same \ref boundary_type. This is a very fast operation %as they share the same index
            ///     and it does not require its regeneration.
547             ///
548             /// \see
549             ///
550             /// - \ref boundary_point_index
551             /// - \ref segment
552             /// - \ref boundary_point
553             ///
554 
555             template<typename BaseIterator>
556             class segment_index {
557             public:
558 
559                 ///
560                 /// The type of the iterator used to iterate over the original text
561                 ///
562                 typedef BaseIterator base_iterator;
563                 #ifdef BOOST_LOCALE_DOXYGEN
564                 ///
565                 /// The bidirectional iterator that iterates over \ref value_type objects.
566                 ///
567                 /// -   The iterators may be invalidated by use of any non-const member function
568                 ///     including but not limited to \ref rule(rule_type) and \ref full_select(bool).
                /// -   The returned value_type object is valid %as long %as the iterator points to it.
                ///     So the following code is wrong %as t is used after p was updated:
571                 ///     \code
572                 ///     segment_index<some_iterator>::iterator p=index.begin();
573                 ///     segment<some_iterator> &t = *p;
574                 ///     ++p;
575                 ///     cout << t.str() << endl;
576                 ///     \endcode
577                 ///
578                 typedef unspecified_iterator_type iterator;
579                 ///
580                 /// \copydoc iterator
581                 ///
582                 typedef unspecified_iterator_type const_iterator;
583                 #else
584                 typedef details::segment_index_iterator<base_iterator> iterator;
585                 typedef details::segment_index_iterator<base_iterator> const_iterator;
586                 #endif
587                 ///
588                 /// The type dereferenced by the \ref iterator and \ref const_iterator. It is
589                 /// an object that represents selected segment.
590                 ///
591                 typedef segment<base_iterator> value_type;
592 
593                 ///
594                 /// Default constructor.
595                 ///
596                 /// \note
597                 ///
598                 /// When this object is constructed by default it does not include a valid index, thus
599                 /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined
600                 /// behavior
601                 ///
segment_index()602                 segment_index() : mask_(0xFFFFFFFFu),full_select_(false)
603                 {
604                 }
605                 ///
606                 /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text
607                 /// in range [begin,end) using a rule \a mask for locale \a loc.
608                 ///
segment_index(boundary_type type,base_iterator begin,base_iterator end,rule_type mask,std::locale const & loc=std::locale ())609                 segment_index(boundary_type type,
610                             base_iterator begin,
611                             base_iterator end,
612                             rule_type mask,
613                             std::locale const &loc=std::locale())
614                     :
615                         map_(type,begin,end,loc),
616                         mask_(mask),
617                         full_select_(false)
618                 {
619                 }
620                 ///
621                 /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text
622                 /// in range [begin,end) selecting all possible segments (full mask) for locale \a loc.
623                 ///
segment_index(boundary_type type,base_iterator begin,base_iterator end,std::locale const & loc=std::locale ())624                 segment_index(boundary_type type,
625                             base_iterator begin,
626                             base_iterator end,
627                             std::locale const &loc=std::locale())
628                     :
629                         map_(type,begin,end,loc),
630                         mask_(0xFFFFFFFFu),
631                         full_select_(false)
632                 {
633                 }
634 
635                 ///
                /// Create a segment_index from a \ref boundary_point_index. It copies all indexing information
                /// and uses the default rule (all possible segments)
                ///
                /// This operation is very cheap, so if you use boundary_point_index and segment_index on the same text
                /// range it is much better to create one from another rather than indexing the same
                /// range twice.
642                 ///
643                 /// \note \ref rule() flags are not copied
644                 ///
645                 segment_index(boundary_point_index<base_iterator> const &);
646                 ///
647                 /// Copy an index from a \ref boundary_point_index. It copies all indexing information
648                 /// and uses the default rule (all possible segments)
649                 ///
                /// This operation is very cheap, so if you use boundary_point_index and segment_index on the same text
                /// range it is much better to create one from another rather than indexing the same
                /// range twice.
653                 ///
654                 /// \note \ref rule() flags are not copied
655                 ///
656                 segment_index const &operator = (boundary_point_index<base_iterator> const &);
657 
658 
659                 ///
660                 /// Create a new index for %boundary analysis \ref boundary_type "type" of the text
661                 /// in range [begin,end) for locale \a loc.
662                 ///
663                 /// \note \ref rule() and \ref full_select() remain unchanged.
664                 ///
map(boundary_type type,base_iterator begin,base_iterator end,std::locale const & loc=std::locale ())665                 void map(boundary_type type,base_iterator begin,base_iterator end,std::locale const &loc=std::locale())
666                 {
667                     map_ = mapping_type(type,begin,end,loc);
668                 }
669 
670                 ///
671                 /// Get the \ref iterator on the beginning of the segments range.
672                 ///
673                 /// Preconditions: the segment_index should have a mapping
674                 ///
675                 /// \note
676                 ///
677                 /// The returned iterator is invalidated by access to any non-const member functions of this object
678                 ///
begin() const679                 iterator begin() const
680                 {
681                     return iterator(true,&map_,mask_,full_select_);
682                 }
683 
684                 ///
685                 /// Get the \ref iterator on the ending of the segments range.
686                 ///
687                 /// Preconditions: the segment_index should have a mapping
688                 ///
689                 /// The returned iterator is invalidated by access to any non-const member functions of this object
690                 ///
end() const691                 iterator end() const
692                 {
693                     return iterator(false,&map_,mask_,full_select_);
694                 }
695 
696                 ///
697                 /// Find a first valid segment following a position \a p.
698                 ///
699                 /// If \a p is inside a valid segment this segment is selected:
700                 ///
701                 /// For example: For \ref word %boundary analysis with \ref word_any rule():
702                 ///
703                 /// - "to| be or ", would point to "be",
704                 /// - "t|o be or ", would point to "to",
705                 /// - "to be or| ", would point to end.
706                 ///
707                 ///
708                 /// Preconditions: the segment_index should have a mapping and \a p should be valid iterator
709                 /// to the text in the mapped range.
710                 ///
711                 /// The returned iterator is invalidated by access to any non-const member functions of this object
712                 ///
find(base_iterator p) const713                 iterator find(base_iterator p) const
714                 {
715                     return iterator(p,&map_,mask_,full_select_);
716                 }
717 
718                 ///
719                 /// Get the mask of rules that are used
720                 ///
rule() const721                 rule_type rule() const
722                 {
723                     return mask_;
724                 }
725                 ///
726                 /// Set the mask of rules that are used
727                 ///
rule(rule_type v)728                 void rule(rule_type v)
729                 {
730                     mask_ = v;
731                 }
732 
733                 ///
734                 /// Get the full_select property value - whether a segment should include in its range
735                 /// the values that do not belong to the specific \ref rule() or not.
736                 ///
737                 /// The default value is false.
738                 ///
739                 /// For example for \ref sentence %boundary with rule \ref sentence_term the segments
740                 /// of text "Hello! How\nare you?" are "Hello! ", "are you?" when full_select() is false
741                 /// because "How\n" is selected %as a sentence by a rule that splits the text by line feed. If full_select()
742                 /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the
743                 /// following part "are you?"
744                 ///
745 
full_select() const746                 bool full_select()  const
747                 {
748                     return full_select_;
749                 }
750 
751                 ///
752                 /// Set the full_select property value - whether a segment should include in its range
753                 /// the values that do not belong to the specific \ref rule() or not.
754                 ///
755                 /// The default value is false.
756                 ///
757                 /// For example for \ref sentence %boundary with rule \ref sentence_term the segments
758                 /// of text "Hello! How\nare you?" are "Hello! ", "are you?" when full_select() is false
759                 /// because "How\n" is selected %as a sentence by a rule that splits the text by line feed. If full_select()
760                 /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the
761                 /// following part "are you?"
762                 ///
763 
full_select(bool v)764                 void full_select(bool v)
765                 {
766                     full_select_ = v;
767                 }
768 
769             private:
770                 friend class boundary_point_index<base_iterator>;
771                 typedef details::mapping<base_iterator> mapping_type;
772                 mapping_type  map_;
773                 rule_type mask_;
774                 bool full_select_;
775             };
776 
777             ///
778             /// \brief This class holds an index of \ref boundary_point "boundary points" and allows iterating
779             /// over them.
780             ///
781             /// This class provides \ref begin() and \ref end() member functions that return bidirectional iterators
782             /// to the \ref boundary_point objects.
783             ///
784             /// It provides an option that affects selecting %boundary points according to different rules:
785             /// using \ref rule(rule_type mask) member function. It allows to set a mask that select only specific
786             /// types of %boundary points like \ref sentence_term.
787             ///
788             /// For example for a sentence %boundary analysis of a text "Hello! How\nare you?" when the default
789             /// rule is used the %boundary points would be:
790             ///
791             /// - "|Hello! How\nare you?"
792             /// - "Hello! |How\nare you?"
793             /// - "Hello! How\n|are you?"
794             /// - "Hello! How\nare you?|"
795             ///
796             /// However if \ref rule() is set to \ref sentence_term then the selected %boundary points would be:
797             ///
798             /// - "|Hello! How\nare you?"
799             /// - "Hello! |How\nare you?"
800             /// - "Hello! How\nare you?|"
801             ///
802             /// Such that a %boundary point defined by a line feed character would be ignored.
803             ///
804             /// This class allows to find a boundary_point according to the given iterator in range using \ref find() member
805             /// function.
806             ///
807             /// \note
808             /// -   Even an empty text range [x,x) is considered to have one %boundary point x.
809             /// -   \a a and \a b points of the range [a,b) are always considered %boundary points
810             ///     regardless of the rules used.
811             /// -   Changing the \ref rule() option or re-indexing the text
812             ///     invalidates existing iterators and they can't be used any more.
813             /// -   boundary_point_index can be created from segment_index or other boundary_point_index that was created with
814             ///     the same \ref boundary_type.  This is a very fast operation %as they share the same index
815             ///     and it does not require its regeneration.
816             ///
817             /// \see
818             ///
819             /// - \ref segment_index
820             /// - \ref boundary_point
821             /// - \ref segment
822             ///
823 
824 
825             template<typename BaseIterator>
826             class boundary_point_index {
827             public:
828                 ///
829                 /// The type of the iterator used to iterate over the original text
830                 ///
831                 typedef BaseIterator base_iterator;
832                 #ifdef BOOST_LOCALE_DOXYGEN
833                 ///
834                 /// The bidirectional iterator that iterates over \ref value_type objects.
835                 ///
836                 /// -   The iterators may be invalidated by use of any non-const member function
837                 ///     including but not limited to \ref rule(rule_type) member function.
838                 /// -   The returned value_type object is valid %as long %as iterator points to it.
839                 ///     So this following code is wrong %as t used after p was updated:
840                 ///     \code
841                 ///     boundary_point_index<some_iterator>::iterator p=index.begin();
842                 ///     boundary_point<some_iterator> &t = *p;
843                 ///     ++p;
844                 ///     rule_type r = t->rule();
845                 ///     \endcode
846                 ///
847                 typedef unspecified_iterator_type iterator;
848                 ///
849                 /// \copydoc iterator
850                 ///
851                 typedef unspecified_iterator_type const_iterator;
852                 #else
853                 typedef details::boundary_point_index_iterator<base_iterator> iterator;
854                 typedef details::boundary_point_index_iterator<base_iterator> const_iterator;
855                 #endif
856                 ///
857                 /// The type dereferenced by the \ref iterator and \ref const_iterator. It is
858                 /// an object that represents the selected \ref boundary_point "boundary point".
859                 ///
860                 typedef boundary_point<base_iterator> value_type;
861 
862                 ///
863                 /// Default constructor.
864                 ///
865                 /// \note
866                 ///
867                 /// When this object is constructed by default it does not include a valid index, thus
868                 /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined
869                 /// behavior
870                 ///
boundary_point_index()871                 boundary_point_index() : mask_(0xFFFFFFFFu)
872                 {
873                 }
874 
875                 ///
876                 /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text
877                 /// in range [begin,end) using a rule \a mask for locale \a loc.
878                 ///
boundary_point_index(boundary_type type,base_iterator begin,base_iterator end,rule_type mask,std::locale const & loc=std::locale ())879                 boundary_point_index(boundary_type type,
880                             base_iterator begin,
881                             base_iterator end,
882                             rule_type mask,
883                             std::locale const &loc=std::locale())
884                     :
885                         map_(type,begin,end,loc),
886                         mask_(mask)
887                 {
888                 }
889                 ///
890                 /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text
891                 /// in range [begin,end) selecting all possible %boundary points (full mask) for locale \a loc.
892                 ///
boundary_point_index(boundary_type type,base_iterator begin,base_iterator end,std::locale const & loc=std::locale ())893                 boundary_point_index(boundary_type type,
894                             base_iterator begin,
895                             base_iterator end,
896                             std::locale const &loc=std::locale())
897                     :
898                         map_(type,begin,end,loc),
899                         mask_(0xFFFFFFFFu)
900                 {
901                 }
902 
903                 ///
904                 /// Create a boundary_point_index from a \ref segment_index. It copies all indexing information
905                 /// and uses the default rule (all possible %boundary points)
906                 ///
907                 /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text
908                 /// range it is much better to create one from another rather then indexing the same
909                 /// range twice.
910                 ///
911                 /// \note \ref rule() flags are not copied
912                 ///
913                 boundary_point_index(segment_index<base_iterator> const &other);
914                 ///
915                 /// Copy a boundary_point_index from a \ref segment_index. It copies all indexing information
916                 /// and keeps the current \ref rule() unchanged
917                 ///
918                 /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text
919                 /// range it is much better to create one from another rather then indexing the same
920                 /// range twice.
921                 ///
922                 /// \note \ref rule() flags are not copied
923                 ///
924                 boundary_point_index const &operator=(segment_index<base_iterator> const &other);
925 
926                 ///
927                 /// Create a new index for %boundary analysis \ref boundary_type "type" of the text
928                 /// in range [begin,end) for locale \a loc.
929                 ///
930                 /// \note \ref rule() remains unchanged.
931                 ///
map(boundary_type type,base_iterator begin,base_iterator end,std::locale const & loc=std::locale ())932                 void map(boundary_type type,base_iterator begin,base_iterator end,std::locale const &loc=std::locale())
933                 {
934                     map_ = mapping_type(type,begin,end,loc);
935                 }
936 
937                 ///
938                 /// Get the \ref iterator on the beginning of the %boundary points range.
939                 ///
940                 /// Preconditions: this boundary_point_index should have a mapping
941                 ///
942                 /// \note
943                 ///
944                 /// The returned iterator is invalidated by access to any non-const member functions of this object
945                 ///
begin() const946                 iterator begin() const
947                 {
948                     return iterator(true,&map_,mask_);
949                 }
950 
951                 ///
952                 /// Get the \ref iterator on the ending of the %boundary points range.
953                 ///
954                 /// Preconditions: this boundary_point_index should have a mapping
955                 ///
956                 /// \note
957                 ///
958                 /// The returned iterator is invalidated by access to any non-const member functions of this object
959                 ///
end() const960                 iterator end() const
961                 {
962                     return iterator(false,&map_,mask_);
963                 }
964 
965                 ///
966                 /// Find a first valid %boundary point on a position \a p or following it.
967                 ///
968                 /// For example: For \ref word %boundary analysis of the text "to be or"
969                 ///
970                 /// - "|to be", would return %boundary point at "|to be",
971                 /// - "t|o be", would point to "to| be"
972                 ///
973                 /// Preconditions: the boundary_point_index should have a mapping and \a p should be valid iterator
974                 /// to the text in the mapped range.
975                 ///
976                 /// The returned iterator is invalidated by access to any non-const member functions of this object
977                 ///
find(base_iterator p) const978                 iterator find(base_iterator p) const
979                 {
980                     return iterator(p,&map_,mask_);
981                 }
982 
983                 ///
984                 /// Get the mask of rules that are used
985                 ///
rule() const986                 rule_type rule() const
987                 {
988                     return mask_;
989                 }
990                 ///
991                 /// Set the mask of rules that are used
992                 ///
rule(rule_type v)993                 void rule(rule_type v)
994                 {
995                     mask_ = v;
996                 }
997 
998             private:
999 
1000                 friend class segment_index<base_iterator>;
1001                 typedef details::mapping<base_iterator> mapping_type;
1002                 mapping_type  map_;
1003                 rule_type mask_;
1004             };
1005 
1006             /// \cond INTERNAL
1007             template<typename BaseIterator>
segment_index(boundary_point_index<BaseIterator> const & other)1008             segment_index<BaseIterator>::segment_index(boundary_point_index<BaseIterator> const &other) :
1009                 map_(other.map_),
1010                 mask_(0xFFFFFFFFu),
1011                 full_select_(false)
1012             {
1013             }
1014 
1015             template<typename BaseIterator>
boundary_point_index(segment_index<BaseIterator> const & other)1016             boundary_point_index<BaseIterator>::boundary_point_index(segment_index<BaseIterator> const &other) :
1017                 map_(other.map_),
1018                 mask_(0xFFFFFFFFu)
1019             {
1020             }
1021 
1022             template<typename BaseIterator>
operator =(boundary_point_index<BaseIterator> const & other)1023             segment_index<BaseIterator> const &segment_index<BaseIterator>::operator=(boundary_point_index<BaseIterator> const &other)
1024             {
1025                 map_ = other.map_;
1026                 return *this;
1027             }
1028 
1029             template<typename BaseIterator>
operator =(segment_index<BaseIterator> const & other)1030             boundary_point_index<BaseIterator> const &boundary_point_index<BaseIterator>::operator=(segment_index<BaseIterator> const &other)
1031             {
1032                 map_ = other.map_;
1033                 return *this;
1034             }
1035             /// \endcond
1036 
1037             typedef segment_index<std::string::const_iterator> ssegment_index;      ///< convenience typedef
1038             typedef segment_index<std::wstring::const_iterator> wssegment_index;    ///< convenience typedef
1039             #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
1040             typedef segment_index<std::u16string::const_iterator> u16ssegment_index;///< convenience typedef
1041             #endif
1042             #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
1043             typedef segment_index<std::u32string::const_iterator> u32ssegment_index;///< convenience typedef
1044             #endif
1045 
1046             typedef segment_index<char const *> csegment_index;                     ///< convenience typedef
1047             typedef segment_index<wchar_t const *> wcsegment_index;                 ///< convenience typedef
1048             #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
1049             typedef segment_index<char16_t const *> u16csegment_index;              ///< convenience typedef
1050             #endif
1051             #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
1052             typedef segment_index<char32_t const *> u32csegment_index;              ///< convenience typedef
1053             #endif
1054 
1055             typedef boundary_point_index<std::string::const_iterator> sboundary_point_index;///< convenience typedef
1056             typedef boundary_point_index<std::wstring::const_iterator> wsboundary_point_index;///< convenience typedef
1057             #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
1058             typedef boundary_point_index<std::u16string::const_iterator> u16sboundary_point_index;///< convenience typedef
1059             #endif
1060             #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
1061             typedef boundary_point_index<std::u32string::const_iterator> u32sboundary_point_index;///< convenience typedef
1062             #endif
1063 
1064             typedef boundary_point_index<char const *> cboundary_point_index;       ///< convenience typedef
1065             typedef boundary_point_index<wchar_t const *> wcboundary_point_index;   ///< convenience typedef
1066             #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
1067             typedef boundary_point_index<char16_t const *> u16cboundary_point_index;///< convenience typedef
1068             #endif
1069             #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
1070             typedef boundary_point_index<char32_t const *> u32cboundary_point_index;///< convenience typedef
1071             #endif
1072 
1073 
1074 
1075         } // boundary
1076 
1077     } // locale
1078 } // boost
1079 
1080 ///
1081 /// \example boundary.cpp
1082 /// Example of using segment_index
1083 /// \example wboundary.cpp
1084 /// Example of using segment_index over wide strings
1085 ///
1086 
1087 #ifdef BOOST_MSVC
1088 #pragma warning(pop)
1089 #endif
1090 
1091 #endif
1092 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
1093