1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/strings/string_split.h"
6 
7 #include <stddef.h>
8 
9 #include "base/logging.h"
10 #include "base/strings/string_util.h"
11 #include "base/third_party/icu/icu_utf.h"
12 
13 namespace base {
14 
15 namespace {
16 
17 // Returns either the ASCII or UTF-16 whitespace.
18 template <typename char_type>
19 std::basic_string_view<char_type> WhitespaceForType();
20 template <>
WhitespaceForType()21 std::u16string_view WhitespaceForType<char16_t>() {
22   return kWhitespaceUTF16;
23 }
24 template <>
WhitespaceForType()25 std::string_view WhitespaceForType<char>() {
26   return kWhitespaceASCII;
27 }
28 
// FindFirstOf() is overloaded on the delimiter type. The single-character
// overloads call find() on the view, since a lone delimiter is the common case
// and can be searched faster; the multi-character overloads call
// find_first_of(). Plain overloads were chosen over template specialization
// for clarity.
//
// No FindFirstNotOf counterpart exists because std::string_view already
// provides the optimized variants of that search itself.
//
// Returns the index of the first |c| in |piece| at or after |pos|, or
// std::string_view::npos when there is none.
size_t FindFirstOf(std::string_view piece, char c, size_t pos) {
  const size_t match = piece.find(c, pos);
  return match;
}
// 16-bit single-character search: index of the first |c| in |piece| at or
// after |pos|, or std::u16string_view::npos when there is none.
size_t FindFirstOf(std::u16string_view piece, char16_t c, size_t pos) {
  const size_t match = piece.find(c, pos);
  return match;
}
// Multi-character search: index of the first character of |piece| at or after
// |pos| that appears anywhere in |one_of|, or std::string_view::npos when no
// such character exists.
size_t FindFirstOf(std::string_view piece,
                   std::string_view one_of,
                   size_t pos) {
  const size_t match = piece.find_first_of(one_of, pos);
  return match;
}
// 16-bit multi-character search: index of the first character of |piece| at or
// after |pos| that appears anywhere in |one_of|, or std::u16string_view::npos
// when no such character exists.
size_t FindFirstOf(std::u16string_view piece,
                   std::u16string_view one_of,
                   size_t pos) {
  const size_t match = piece.find_first_of(one_of, pos);
  return match;
}
51 
52 // General string splitter template. Can take 8- or 16-bit input, can produce
53 // the corresponding string or std::string_view output, and can take single- or
54 // multiple-character delimiters.
55 //
56 // DelimiterType is either a character (Str::value_type) or a string piece of
57 // multiple characters (std::basic_string_view<char>). std::string_view has a
58 // version of find for both of these cases, and the single-character version is
59 // the most common and can be implemented faster, which is why this is a
60 // template.
61 template <typename char_type, typename OutputStringType, typename DelimiterType>
SplitStringT(std::basic_string_view<char_type> str,DelimiterType delimiter,WhitespaceHandling whitespace,SplitResult result_type)62 static std::vector<OutputStringType> SplitStringT(
63     std::basic_string_view<char_type> str,
64     DelimiterType delimiter,
65     WhitespaceHandling whitespace,
66     SplitResult result_type) {
67   std::vector<OutputStringType> result;
68   if (str.empty())
69     return result;
70 
71   using ViewType = std::basic_string_view<char_type>;
72 
73   size_t start = 0;
74   while (start != ViewType::npos) {
75     size_t end = FindFirstOf(str, delimiter, start);
76 
77     ViewType piece;
78     if (end == ViewType::npos) {
79       piece = str.substr(start);
80       start = ViewType::npos;
81     } else {
82       piece = str.substr(start, end - start);
83       start = end + 1;
84     }
85 
86     if (whitespace == TRIM_WHITESPACE)
87       piece = TrimString(piece, WhitespaceForType<char_type>(), TRIM_ALL);
88 
89     if (result_type == SPLIT_WANT_ALL || !piece.empty())
90       result.emplace_back(piece);
91   }
92   return result;
93 }
94 
AppendStringKeyValue(std::string_view input,char delimiter,StringPairs * result)95 bool AppendStringKeyValue(std::string_view input,
96                           char delimiter,
97                           StringPairs* result) {
98   // Always append a new item regardless of success (it might be empty). The
99   // below code will copy the strings directly into the result pair.
100   result->resize(result->size() + 1);
101   auto& result_pair = result->back();
102 
103   // Find the delimiter.
104   size_t end_key_pos = input.find_first_of(delimiter);
105   if (end_key_pos == std::string::npos) {
106     return false;  // No delimiter.
107   }
108   result_pair.first.assign(input.substr(0, end_key_pos));
109 
110   // Find the value string.
111   std::string_view remains =
112       input.substr(end_key_pos, input.size() - end_key_pos);
113   size_t begin_value_pos = remains.find_first_not_of(delimiter);
114   if (begin_value_pos == std::string_view::npos) {
115     return false;  // No value.
116   }
117   result_pair.second.assign(
118       remains.substr(begin_value_pos, remains.size() - begin_value_pos));
119 
120   return true;
121 }
122 
123 template <typename char_type, typename OutputStringType>
SplitStringUsingSubstrT(std::basic_string_view<char_type> input,std::basic_string_view<char_type> delimiter,WhitespaceHandling whitespace,SplitResult result_type,std::vector<OutputStringType> * result)124 void SplitStringUsingSubstrT(std::basic_string_view<char_type> input,
125                              std::basic_string_view<char_type> delimiter,
126                              WhitespaceHandling whitespace,
127                              SplitResult result_type,
128                              std::vector<OutputStringType>* result) {
129   using Piece = std::basic_string_view<char_type>;
130   using size_type = typename Piece::size_type;
131 
132   result->clear();
133   for (size_type begin_index = 0, end_index = 0; end_index != Piece::npos;
134        begin_index = end_index + delimiter.size()) {
135     end_index = input.find(delimiter, begin_index);
136     Piece term = end_index == Piece::npos
137                      ? input.substr(begin_index)
138                      : input.substr(begin_index, end_index - begin_index);
139 
140     if (whitespace == TRIM_WHITESPACE)
141       term = TrimString(term, WhitespaceForType<char_type>(), TRIM_ALL);
142 
143     if (result_type == SPLIT_WANT_ALL || !term.empty())
144       result->emplace_back(term);
145   }
146 }
147 
148 }  // namespace
149 
SplitString(std::string_view input,std::string_view separators,WhitespaceHandling whitespace,SplitResult result_type)150 std::vector<std::string> SplitString(std::string_view input,
151                                      std::string_view separators,
152                                      WhitespaceHandling whitespace,
153                                      SplitResult result_type) {
154   if (separators.size() == 1) {
155     return SplitStringT<char, std::string, char>(input, separators[0],
156                                                  whitespace, result_type);
157   }
158   return SplitStringT<char, std::string, std::string_view>(
159       input, separators, whitespace, result_type);
160 }
161 
SplitString(std::u16string_view input,std::u16string_view separators,WhitespaceHandling whitespace,SplitResult result_type)162 std::vector<std::u16string> SplitString(std::u16string_view input,
163                                         std::u16string_view separators,
164                                         WhitespaceHandling whitespace,
165                                         SplitResult result_type) {
166   if (separators.size() == 1) {
167     return SplitStringT<char16_t, std::u16string, char16_t>(
168         input, separators[0], whitespace, result_type);
169   }
170   return SplitStringT<char16_t, std::u16string, std::u16string_view>(
171       input, separators, whitespace, result_type);
172 }
173 
SplitStringPiece(std::string_view input,std::string_view separators,WhitespaceHandling whitespace,SplitResult result_type)174 std::vector<std::string_view> SplitStringPiece(std::string_view input,
175                                                std::string_view separators,
176                                                WhitespaceHandling whitespace,
177                                                SplitResult result_type) {
178   if (separators.size() == 1) {
179     return SplitStringT<char, std::string_view, char>(input, separators[0],
180                                                       whitespace, result_type);
181   }
182   return SplitStringT<char, std::string_view, std::string_view>(
183       input, separators, whitespace, result_type);
184 }
185 
SplitStringPiece(std::u16string_view input,std::u16string_view separators,WhitespaceHandling whitespace,SplitResult result_type)186 std::vector<std::u16string_view> SplitStringPiece(
187     std::u16string_view input,
188     std::u16string_view separators,
189     WhitespaceHandling whitespace,
190     SplitResult result_type) {
191   if (separators.size() == 1) {
192     return SplitStringT<char16_t, std::u16string_view, char16_t>(
193         input, separators[0], whitespace, result_type);
194   }
195   return SplitStringT<char16_t, std::u16string_view, std::u16string_view>(
196       input, separators, whitespace, result_type);
197 }
198 
SplitStringIntoKeyValuePairs(std::string_view input,char key_value_delimiter,char key_value_pair_delimiter,StringPairs * key_value_pairs)199 bool SplitStringIntoKeyValuePairs(std::string_view input,
200                                   char key_value_delimiter,
201                                   char key_value_pair_delimiter,
202                                   StringPairs* key_value_pairs) {
203   key_value_pairs->clear();
204 
205   std::vector<std::string_view> pairs =
206       SplitStringPiece(input, std::string(1, key_value_pair_delimiter),
207                        TRIM_WHITESPACE, SPLIT_WANT_NONEMPTY);
208   key_value_pairs->reserve(pairs.size());
209 
210   bool success = true;
211   for (const std::string_view& pair : pairs) {
212     if (!AppendStringKeyValue(pair, key_value_delimiter, key_value_pairs)) {
213       // Don't return here, to allow for pairs without associated
214       // value or key; just record that the split failed.
215       success = false;
216     }
217   }
218   return success;
219 }
220 
SplitStringUsingSubstr(std::u16string_view input,std::u16string_view delimiter,WhitespaceHandling whitespace,SplitResult result_type)221 std::vector<std::u16string> SplitStringUsingSubstr(
222     std::u16string_view input,
223     std::u16string_view delimiter,
224     WhitespaceHandling whitespace,
225     SplitResult result_type) {
226   std::vector<std::u16string> result;
227   SplitStringUsingSubstrT(input, delimiter, whitespace, result_type, &result);
228   return result;
229 }
230 
SplitStringUsingSubstr(std::string_view input,std::string_view delimiter,WhitespaceHandling whitespace,SplitResult result_type)231 std::vector<std::string> SplitStringUsingSubstr(std::string_view input,
232                                                 std::string_view delimiter,
233                                                 WhitespaceHandling whitespace,
234                                                 SplitResult result_type) {
235   std::vector<std::string> result;
236   SplitStringUsingSubstrT(input, delimiter, whitespace, result_type, &result);
237   return result;
238 }
239 
SplitStringPieceUsingSubstr(std::u16string_view input,std::u16string_view delimiter,WhitespaceHandling whitespace,SplitResult result_type)240 std::vector<std::u16string_view> SplitStringPieceUsingSubstr(
241     std::u16string_view input,
242     std::u16string_view delimiter,
243     WhitespaceHandling whitespace,
244     SplitResult result_type) {
245   std::vector<std::u16string_view> result;
246   SplitStringUsingSubstrT(input, delimiter, whitespace, result_type, &result);
247   return result;
248 }
249 
SplitStringPieceUsingSubstr(std::string_view input,std::string_view delimiter,WhitespaceHandling whitespace,SplitResult result_type)250 std::vector<std::string_view> SplitStringPieceUsingSubstr(
251     std::string_view input,
252     std::string_view delimiter,
253     WhitespaceHandling whitespace,
254     SplitResult result_type) {
255   std::vector<std::string_view> result;
256   SplitStringUsingSubstrT(input, delimiter, whitespace, result_type, &result);
257   return result;
258 }
259 
260 }  // namespace base
261