1 #ifndef CORELIB___STR_UTIL__HPP
2 #define CORELIB___STR_UTIL__HPP
3 
4 /*  $Id: ncbistr_util.hpp 563021 2018-05-01 14:45:23Z ivanov $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors:  Eugene Vasilchenko, Aaron Ucko, Denis Vakatov, Anatoliy Kuznetsov
30  *
31  * File Description:
32  *      String algorithms
33  *
34  */
35 
36 /// @file ncbistr_util.hpp
37 /// Algorithms for string processing
38 
39 #include <corelib/ncbistr.hpp>
40 #include <corelib/ncbiexpt.hpp>
41 
42 
43 BEGIN_NCBI_SCOPE
44 
45 
46 /** @addtogroup String
47  *
48  * @{
49  */
50 
51 class CStrTokenizeBase;
52 
53 /// Do-nothing token position container
54 ///
55 struct CStrDummyTokenPos
56 {
push_backCStrDummyTokenPos57     void push_back(string::size_type /*pos*/) {}
reserveCStrDummyTokenPos58     void reserve(string::size_type) {}
59 };
60 
61 /// Do nothing token counter
62 ///
63 struct CStrDummyTokenCount
64 {
65     static
CountCStrDummyTokenCount66     size_t Count(CStrTokenizeBase& /*tokenizer*/)
67     {
68         return 0;
69     }
70 };
71 
72 /// Do nothing target reservation trait
73 ///
74 /// applies for list<> or cases when reservation only makes things slower
75 /// (may be the case if we use deque<> as a target container and tokenize
76 /// large text)
77 ///
78 template<class TV, class TP>
79 struct CStrDummyTargetReserve
80 {
81     static
ReserveCStrDummyTargetReserve82     void Reserve(CStrTokenizeBase& /*tokenizer*/,
83                  TV&               /*target*/,
84                  TP&               /*token_pos*/)
85     {}
86 };
87 
88 
89 /// Singly-linked list of substrings that will constitute a single
90 /// Split/Tokenize piece, optimized for the typical one-node case.
91 class CTempStringList
92 {
93 public:
CTempStringList(CTempString_Storage * storage)94     CTempStringList(CTempString_Storage* storage)
95         : m_LastNode(NULL), m_Storage(storage) { }
96 
97     void   Add(const CTempString& s);
98     void   Clear(void);
99     void   Join(string* s) const;
100     void   Join(CTempString* s) const;
101     void   Join(CTempStringEx* s) const;
102     size_t GetSize(void) const;
103 
104 private:
105     struct SNode
106     {
SNodeCTempStringList::SNode107         SNode() { }
SNodeCTempStringList::SNode108         SNode(const CTempString& s) : str(s) { }
109 
110         CTempString       str;
111         unique_ptr<SNode> next;
112     };
113 
114     SNode  m_FirstNode;
115     SNode *m_LastNode;
116     CTempString_Storage* m_Storage;
117 };
118 
119 inline
Add(const CTempString & s)120 void CTempStringList::Add(const CTempString& s)
121 {
122     if (m_LastNode == NULL) {
123         m_FirstNode.str = s;
124         m_LastNode = &m_FirstNode;
125     } else {
126         m_LastNode->next.reset(new SNode(s));
127         m_LastNode = m_LastNode->next.get();
128     }
129 }
130 
131 inline
Clear(void)132 void CTempStringList::Clear(void)
133 {
134     m_FirstNode.str.clear();
135     m_FirstNode.next.reset();
136     m_LastNode = NULL;
137 }
138 
139 class CStrTokenizeBase
140 {
141 public:
142     typedef NStr::TSplitFlags TFlags;
143 
144     CStrTokenizeBase(const CTempString& str, const CTempString& delim,
145                      TFlags flags, CTempString_Storage* storage);
146 
GetPos(void) const147     SIZE_TYPE GetPos(void) const { return m_Pos; }
AtEnd(void) const148     bool      AtEnd (void) const { return m_Pos == NPOS; }
149 
150     /// Return TRUE if it found some text and put it into collector.
151     /// The collector can have empty strings, even if function returns FALSE.
152     /// @note
153     ///   This method don't honor NStr::fSplit_Truncate_Begin flag,
154     ///   due it's stream nature, so you should process it in the calling code.
155     ///   NStr::fSplit_Truncate_Begin is honored.
Advance(CTempStringList * part_collector)156     bool Advance(CTempStringList* part_collector) {
157         return Advance(part_collector, NULL, NULL);
158     };
159     bool Advance(CTempStringList* part_collector,
160                  SIZE_TYPE* ptr_part_start /* out */,
161                  SIZE_TYPE* ptr_delim_pos  /* out */);
162 
163     /// Skip all delimiters starting from current position.
SkipDelims(void)164     void SkipDelims(void) { x_SkipDelims(true); }
165 
166     /// Assumes that we already have a delimiter on the previous
167     /// position, so just skip all subsequent, depending on flags.
MergeDelims(void)168     void MergeDelims(void) { x_SkipDelims(false); }
169 
170     // Set new delimiters.
171     void SetDelim(const CTempString& delim);
172 
173 protected:
174     const CTempString&   m_Str;
175     CTempString          m_Delim;
176     SIZE_TYPE            m_Pos;
177     TFlags               m_Flags;
178     CTempString_Storage* m_Storage;
179 
180 private:
181     void x_ExtendInternalDelim();
182     void x_SkipDelims(bool force_skip);
183 
184     CTempStringEx        m_InternalDelim;
185     CTempString_Storage  m_DelimStorage;
186 };
187 
188 inline
CStrTokenizeBase(const CTempString & str,const CTempString & delim,TFlags flags,CTempString_Storage * storage)189 CStrTokenizeBase::CStrTokenizeBase(const CTempString& str,
190                                    const CTempString& delim,
191                                    TFlags flags,
192                                    CTempString_Storage* storage)
193     : m_Str(str), m_Pos(0), m_Flags(flags), m_Storage(storage)
194 {
195     SetDelim(delim);
196 }
197 
198 inline
SetDelim(const CTempString & delim)199 void CStrTokenizeBase::SetDelim(const CTempString& delim)
200 {
201     m_Delim = delim;
202 
203     if ((m_Flags & NStr::fSplit_ByPattern) == 0) {
204         m_InternalDelim = m_Delim;
205     } else {
206         m_InternalDelim.assign(m_Delim, 0, 1);
207     }
208     if ((m_Flags & (NStr::fSplit_CanEscape | NStr::fSplit_CanQuote)) != 0) {
209         x_ExtendInternalDelim();
210     }
211 }
212 
213 /// Main tokenization algorithm
214 ///
215 /// TStr    - string type (must follow std::string interface)
216 /// TV      - container type for tokens (std::vector)
217 /// TP      - target container type for token positions (vector<size_t>)
218 /// TCount  - token count trait for the target container space reservation
219 /// TReserve- target space reservation trait
220 ///
221 template<class TStr,
222          class TV,
223          class TP = CStrDummyTokenPos,
224          class TCount = CStrDummyTokenCount,
225          class TReserve = CStrDummyTargetReserve<TV, TP> >
226 class CStrTokenize : public CStrTokenizeBase
227 {
228 public:
229     typedef TStr     TString;
230     typedef TV       TContainer;
231     typedef TP       TPosContainer;
232     typedef TCount   TCountTrait;
233     typedef TReserve TReserveTrait;
234 
235     /// Constructor
236     ///
237     /// @param str
238     ///   String to be tokenized.
239     /// @param delim
240     ///   Set of char delimiters used to tokenize string "str".
241     ///   If delimiter is empty, then input string is appended to "arr" as is.
242     /// @param flags
243     ///   Flags governing splitting.
244     ///   Without NStr::fSplit_MergeDelimiters, delimiters that immediately follow
245     ///   each other are treated as separate delimiters - empty string(s)
246     ///   appear in the target output.
247     ///
CStrTokenize(const TString & str,const TString & delim,TFlags flags,CTempString_Storage * storage)248     CStrTokenize(const TString& str, const TString& delim, TFlags flags, CTempString_Storage* storage)
249         : CStrTokenizeBase(str, delim, flags, storage)
250         { }
251 
252     /// Tokenize the string using the specified set of char delimiters.
253     ///
254     /// @param target
255     ///   Output container.
256     ///   Tokens defined in "str" by using symbols from "delim" are added
257     ///   to the list "target".
258     /// @param token_pos
259     ///   Container for the tokens' positions in "str".
260     /// @param empty_str
261     ///   String added to target when there are no other characters between
262     ///   delimiters
263     ///
Do(TContainer & target,TPosContainer & token_pos,const TString & empty_str=TString ())264     void Do(TContainer&     target,
265             TPosContainer&  token_pos,
266             const TString&  empty_str = TString())
267     {
268         auto target_initial_size = target.size();
269 
270         // Special cases
271         if (m_Str.empty()) {
272             return;
273         } else if (m_Delim.empty()) {
274             target.push_back(m_Str);
275             token_pos.push_back(0);
276             return;
277         }
278         // Do target space reservation (if applicable)
279         TReserveTrait::Reserve(*this, target, token_pos);
280 
281         // Tokenization
282 
283         CTempStringList part_collector(m_Storage);
284         SIZE_TYPE prev_pos;
285         SIZE_TYPE delim_pos = NPOS;
286         m_Pos = 0;
287         do {
288             Advance(&part_collector, &prev_pos, &delim_pos);
289             target.push_back(empty_str); // reserve space for value added next line
290             part_collector.Join(&target.back());
291             part_collector.Clear();
292             token_pos.push_back(prev_pos);
293         } while ( !AtEnd() );
294 
295         if ( (m_Flags & NStr::fSplit_Truncate_End) == 0 ) {
296             // account training delimiter
297             if (delim_pos != NPOS) {
298                 // add empty token after last delimiter
299                 target.push_back(empty_str);
300                 token_pos.push_back(delim_pos + 1);
301             }
302         }
303         else {
304             // truncate trailing delimiters
305 
306             // number of newly added items
307             SIZE_TYPE num_new = target.size() - target_initial_size;
308             // number of empty trailing items to delete
309             SIZE_TYPE num_del = 0;
310             for (auto i = target.rbegin(); i != target.rend() && num_new--; ++i) {
311                 if (!i->empty()) {
312                     break;
313                 }
314                 num_del++;
315             }
316             if (num_del) {
317                 target.resize(target.size()-num_del);
318                 token_pos.resize(token_pos.size()-num_del);
319             }
320         }
321     }
322 };
323 
324 
325 /// token count trait for std::string
326 ///
327 struct CStringTokenCount
328 {
329     static
CountCStringTokenCount330     size_t Count(CStrTokenizeBase& tokenizer)
331     {
332         size_t tokens = 0;
333 
334         do {
335             if (tokenizer.Advance(NULL, NULL, NULL)) {
336                 ++tokens;
337             }
338         } while ( !tokenizer.AtEnd() );
339 
340         return tokens;
341     }
342 };
343 
344 
345 /// Target reservation trait (applies for vector<>)
346 ///
347 template<class TV, class TP, class TCount>
348 struct CStrTargetReserve
349 {
350     static
ReserveCStrTargetReserve351     void Reserve(CStrTokenizeBase& tokenizer,
352                  TV&               target,
353                  TP&               token_pos)
354     {
355         // Reserve vector size only for empty vectors.  For vectors which
356         // already have items this has been known to work more slowly.
357         if ( target.empty() ) {
358             size_t tokens = TCount::Count(tokenizer);
359             if (tokens) {
360                 token_pos.reserve(tokens);
361                 target.reserve(tokens);
362             }
363         }
364     }
365 };
366 
367 
368 /// Adapter for token position container pointer(NULL legal)
369 /// Makes pointer to a container look as a legal container
370 ///
371 template<class TPosContainer>
372 class CStrTokenPosAdapter
373 {
374 public:
375     /// If token_pos construction parameter is NULL all calls are ignored
CStrTokenPosAdapter(TPosContainer * token_pos)376     CStrTokenPosAdapter(TPosContainer* token_pos)
377         : m_TokenPos(token_pos)
378     {}
379 
push_back(string::size_type pos)380     void push_back(string::size_type pos)
381     {
382         if (m_TokenPos) m_TokenPos->push_back(pos);
383     }
reserve(string::size_type capacity)384     void reserve(string::size_type capacity)
385     {
386         if (m_TokenPos) m_TokenPos->reserve(capacity);
387     }
size()388     string::size_type size()
389     {
390         return m_TokenPos ? m_TokenPos->size() : 0;
391     }
resize(string::size_type newsize)392     void resize(string::size_type newsize)
393     {
394         if (m_TokenPos) m_TokenPos->resize(newsize);
395     }
396 private:
397     TPosContainer* m_TokenPos;
398 };
399 
400 
401 
402  /*
403  * @}
404  */
405 
406 /////////////////////////////////////////////////////////////////////////////
407 
408 
409 END_NCBI_SCOPE
410 
411 #endif  // CORELIB___STR_UTIL__HPP
412