1 #ifndef CORELIB___STR_UTIL__HPP
2 #define CORELIB___STR_UTIL__HPP
3
4 /* $Id: ncbistr_util.hpp 563021 2018-05-01 14:45:23Z ivanov $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Authors: Eugene Vasilchenko, Aaron Ucko, Denis Vakatov, Anatoliy Kuznetsov
30 *
31 * File Description:
32 * String algorithms
33 *
34 */
35
36 /// @file ncbistr_util.hpp
37 /// Algorithms for string processing
38
39 #include <corelib/ncbistr.hpp>
40 #include <corelib/ncbiexpt.hpp>
41
42
43 BEGIN_NCBI_SCOPE
44
45
46 /** @addtogroup String
47 *
48 * @{
49 */
50
51 class CStrTokenizeBase;
52
53 /// Do-nothing token position container
54 ///
55 struct CStrDummyTokenPos
56 {
push_backCStrDummyTokenPos57 void push_back(string::size_type /*pos*/) {}
reserveCStrDummyTokenPos58 void reserve(string::size_type) {}
59 };
60
61 /// Do nothing token counter
62 ///
63 struct CStrDummyTokenCount
64 {
65 static
CountCStrDummyTokenCount66 size_t Count(CStrTokenizeBase& /*tokenizer*/)
67 {
68 return 0;
69 }
70 };
71
72 /// Do nothing target reservation trait
73 ///
74 /// applies for list<> or cases when reservation only makes things slower
75 /// (may be the case if we use deque<> as a target container and tokenize
76 /// large text)
77 ///
78 template<class TV, class TP>
79 struct CStrDummyTargetReserve
80 {
81 static
ReserveCStrDummyTargetReserve82 void Reserve(CStrTokenizeBase& /*tokenizer*/,
83 TV& /*target*/,
84 TP& /*token_pos*/)
85 {}
86 };
87
88
89 /// Singly-linked list of substrings that will constitute a single
90 /// Split/Tokenize piece, optimized for the typical one-node case.
91 class CTempStringList
92 {
93 public:
CTempStringList(CTempString_Storage * storage)94 CTempStringList(CTempString_Storage* storage)
95 : m_LastNode(NULL), m_Storage(storage) { }
96
97 void Add(const CTempString& s);
98 void Clear(void);
99 void Join(string* s) const;
100 void Join(CTempString* s) const;
101 void Join(CTempStringEx* s) const;
102 size_t GetSize(void) const;
103
104 private:
105 struct SNode
106 {
SNodeCTempStringList::SNode107 SNode() { }
SNodeCTempStringList::SNode108 SNode(const CTempString& s) : str(s) { }
109
110 CTempString str;
111 unique_ptr<SNode> next;
112 };
113
114 SNode m_FirstNode;
115 SNode *m_LastNode;
116 CTempString_Storage* m_Storage;
117 };
118
119 inline
Add(const CTempString & s)120 void CTempStringList::Add(const CTempString& s)
121 {
122 if (m_LastNode == NULL) {
123 m_FirstNode.str = s;
124 m_LastNode = &m_FirstNode;
125 } else {
126 m_LastNode->next.reset(new SNode(s));
127 m_LastNode = m_LastNode->next.get();
128 }
129 }
130
131 inline
Clear(void)132 void CTempStringList::Clear(void)
133 {
134 m_FirstNode.str.clear();
135 m_FirstNode.next.reset();
136 m_LastNode = NULL;
137 }
138
139 class CStrTokenizeBase
140 {
141 public:
142 typedef NStr::TSplitFlags TFlags;
143
144 CStrTokenizeBase(const CTempString& str, const CTempString& delim,
145 TFlags flags, CTempString_Storage* storage);
146
GetPos(void) const147 SIZE_TYPE GetPos(void) const { return m_Pos; }
AtEnd(void) const148 bool AtEnd (void) const { return m_Pos == NPOS; }
149
150 /// Return TRUE if it found some text and put it into collector.
151 /// The collector can have empty strings, even if function returns FALSE.
152 /// @note
153 /// This method don't honor NStr::fSplit_Truncate_Begin flag,
154 /// due it's stream nature, so you should process it in the calling code.
155 /// NStr::fSplit_Truncate_Begin is honored.
Advance(CTempStringList * part_collector)156 bool Advance(CTempStringList* part_collector) {
157 return Advance(part_collector, NULL, NULL);
158 };
159 bool Advance(CTempStringList* part_collector,
160 SIZE_TYPE* ptr_part_start /* out */,
161 SIZE_TYPE* ptr_delim_pos /* out */);
162
163 /// Skip all delimiters starting from current position.
SkipDelims(void)164 void SkipDelims(void) { x_SkipDelims(true); }
165
166 /// Assumes that we already have a delimiter on the previous
167 /// position, so just skip all subsequent, depending on flags.
MergeDelims(void)168 void MergeDelims(void) { x_SkipDelims(false); }
169
170 // Set new delimiters.
171 void SetDelim(const CTempString& delim);
172
173 protected:
174 const CTempString& m_Str;
175 CTempString m_Delim;
176 SIZE_TYPE m_Pos;
177 TFlags m_Flags;
178 CTempString_Storage* m_Storage;
179
180 private:
181 void x_ExtendInternalDelim();
182 void x_SkipDelims(bool force_skip);
183
184 CTempStringEx m_InternalDelim;
185 CTempString_Storage m_DelimStorage;
186 };
187
188 inline
CStrTokenizeBase(const CTempString & str,const CTempString & delim,TFlags flags,CTempString_Storage * storage)189 CStrTokenizeBase::CStrTokenizeBase(const CTempString& str,
190 const CTempString& delim,
191 TFlags flags,
192 CTempString_Storage* storage)
193 : m_Str(str), m_Pos(0), m_Flags(flags), m_Storage(storage)
194 {
195 SetDelim(delim);
196 }
197
198 inline
SetDelim(const CTempString & delim)199 void CStrTokenizeBase::SetDelim(const CTempString& delim)
200 {
201 m_Delim = delim;
202
203 if ((m_Flags & NStr::fSplit_ByPattern) == 0) {
204 m_InternalDelim = m_Delim;
205 } else {
206 m_InternalDelim.assign(m_Delim, 0, 1);
207 }
208 if ((m_Flags & (NStr::fSplit_CanEscape | NStr::fSplit_CanQuote)) != 0) {
209 x_ExtendInternalDelim();
210 }
211 }
212
213 /// Main tokenization algorithm
214 ///
215 /// TStr - string type (must follow std::string interface)
216 /// TV - container type for tokens (std::vector)
217 /// TP - target container type for token positions (vector<size_t>)
218 /// TCount - token count trait for the target container space reservation
219 /// TReserve- target space reservation trait
220 ///
221 template<class TStr,
222 class TV,
223 class TP = CStrDummyTokenPos,
224 class TCount = CStrDummyTokenCount,
225 class TReserve = CStrDummyTargetReserve<TV, TP> >
226 class CStrTokenize : public CStrTokenizeBase
227 {
228 public:
229 typedef TStr TString;
230 typedef TV TContainer;
231 typedef TP TPosContainer;
232 typedef TCount TCountTrait;
233 typedef TReserve TReserveTrait;
234
235 /// Constructor
236 ///
237 /// @param str
238 /// String to be tokenized.
239 /// @param delim
240 /// Set of char delimiters used to tokenize string "str".
241 /// If delimiter is empty, then input string is appended to "arr" as is.
242 /// @param flags
243 /// Flags governing splitting.
244 /// Without NStr::fSplit_MergeDelimiters, delimiters that immediately follow
245 /// each other are treated as separate delimiters - empty string(s)
246 /// appear in the target output.
247 ///
CStrTokenize(const TString & str,const TString & delim,TFlags flags,CTempString_Storage * storage)248 CStrTokenize(const TString& str, const TString& delim, TFlags flags, CTempString_Storage* storage)
249 : CStrTokenizeBase(str, delim, flags, storage)
250 { }
251
252 /// Tokenize the string using the specified set of char delimiters.
253 ///
254 /// @param target
255 /// Output container.
256 /// Tokens defined in "str" by using symbols from "delim" are added
257 /// to the list "target".
258 /// @param token_pos
259 /// Container for the tokens' positions in "str".
260 /// @param empty_str
261 /// String added to target when there are no other characters between
262 /// delimiters
263 ///
Do(TContainer & target,TPosContainer & token_pos,const TString & empty_str=TString ())264 void Do(TContainer& target,
265 TPosContainer& token_pos,
266 const TString& empty_str = TString())
267 {
268 auto target_initial_size = target.size();
269
270 // Special cases
271 if (m_Str.empty()) {
272 return;
273 } else if (m_Delim.empty()) {
274 target.push_back(m_Str);
275 token_pos.push_back(0);
276 return;
277 }
278 // Do target space reservation (if applicable)
279 TReserveTrait::Reserve(*this, target, token_pos);
280
281 // Tokenization
282
283 CTempStringList part_collector(m_Storage);
284 SIZE_TYPE prev_pos;
285 SIZE_TYPE delim_pos = NPOS;
286 m_Pos = 0;
287 do {
288 Advance(&part_collector, &prev_pos, &delim_pos);
289 target.push_back(empty_str); // reserve space for value added next line
290 part_collector.Join(&target.back());
291 part_collector.Clear();
292 token_pos.push_back(prev_pos);
293 } while ( !AtEnd() );
294
295 if ( (m_Flags & NStr::fSplit_Truncate_End) == 0 ) {
296 // account training delimiter
297 if (delim_pos != NPOS) {
298 // add empty token after last delimiter
299 target.push_back(empty_str);
300 token_pos.push_back(delim_pos + 1);
301 }
302 }
303 else {
304 // truncate trailing delimiters
305
306 // number of newly added items
307 SIZE_TYPE num_new = target.size() - target_initial_size;
308 // number of empty trailing items to delete
309 SIZE_TYPE num_del = 0;
310 for (auto i = target.rbegin(); i != target.rend() && num_new--; ++i) {
311 if (!i->empty()) {
312 break;
313 }
314 num_del++;
315 }
316 if (num_del) {
317 target.resize(target.size()-num_del);
318 token_pos.resize(token_pos.size()-num_del);
319 }
320 }
321 }
322 };
323
324
325 /// token count trait for std::string
326 ///
327 struct CStringTokenCount
328 {
329 static
CountCStringTokenCount330 size_t Count(CStrTokenizeBase& tokenizer)
331 {
332 size_t tokens = 0;
333
334 do {
335 if (tokenizer.Advance(NULL, NULL, NULL)) {
336 ++tokens;
337 }
338 } while ( !tokenizer.AtEnd() );
339
340 return tokens;
341 }
342 };
343
344
345 /// Target reservation trait (applies for vector<>)
346 ///
347 template<class TV, class TP, class TCount>
348 struct CStrTargetReserve
349 {
350 static
ReserveCStrTargetReserve351 void Reserve(CStrTokenizeBase& tokenizer,
352 TV& target,
353 TP& token_pos)
354 {
355 // Reserve vector size only for empty vectors. For vectors which
356 // already have items this has been known to work more slowly.
357 if ( target.empty() ) {
358 size_t tokens = TCount::Count(tokenizer);
359 if (tokens) {
360 token_pos.reserve(tokens);
361 target.reserve(tokens);
362 }
363 }
364 }
365 };
366
367
368 /// Adapter for token position container pointer(NULL legal)
369 /// Makes pointer to a container look as a legal container
370 ///
371 template<class TPosContainer>
372 class CStrTokenPosAdapter
373 {
374 public:
375 /// If token_pos construction parameter is NULL all calls are ignored
CStrTokenPosAdapter(TPosContainer * token_pos)376 CStrTokenPosAdapter(TPosContainer* token_pos)
377 : m_TokenPos(token_pos)
378 {}
379
push_back(string::size_type pos)380 void push_back(string::size_type pos)
381 {
382 if (m_TokenPos) m_TokenPos->push_back(pos);
383 }
reserve(string::size_type capacity)384 void reserve(string::size_type capacity)
385 {
386 if (m_TokenPos) m_TokenPos->reserve(capacity);
387 }
size()388 string::size_type size()
389 {
390 return m_TokenPos ? m_TokenPos->size() : 0;
391 }
resize(string::size_type newsize)392 void resize(string::size_type newsize)
393 {
394 if (m_TokenPos) m_TokenPos->resize(newsize);
395 }
396 private:
397 TPosContainer* m_TokenPos;
398 };
399
400
401
402 /*
403 * @}
404 */
405
406 /////////////////////////////////////////////////////////////////////////////
407
408
409 END_NCBI_SCOPE
410
411 #endif // CORELIB___STR_UTIL__HPP
412