1 //===-- StringRef.cpp - Lightweight String References ---------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9
10 #include "llvm/ADT/StringRef.h"
11 #include "llvm/ADT/APInt.h"
12 #include <bitset>
13
14 using namespace llvm;
15
16 // MSVC emits references to this into the translation units which reference it.
17 #ifndef _MSC_VER
18 const size_t StringRef::npos;
19 #endif
20
ascii_tolower(char x)21 static char ascii_tolower(char x) {
22 if (x >= 'A' && x <= 'Z')
23 return x - 'A' + 'a';
24 return x;
25 }
26
ascii_isdigit(char x)27 static bool ascii_isdigit(char x) {
28 return x >= '0' && x <= '9';
29 }
30
31 /// compare_lower - Compare strings, ignoring case.
compare_lower(StringRef RHS) const32 int StringRef::compare_lower(StringRef RHS) const {
33 for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
34 unsigned char LHC = ascii_tolower(Data[I]);
35 unsigned char RHC = ascii_tolower(RHS.Data[I]);
36 if (LHC != RHC)
37 return LHC < RHC ? -1 : 1;
38 }
39
40 if (Length == RHS.Length)
41 return 0;
42 return Length < RHS.Length ? -1 : 1;
43 }
44
45 /// compare_numeric - Compare strings, handle embedded numbers.
compare_numeric(StringRef RHS) const46 int StringRef::compare_numeric(StringRef RHS) const {
47 for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
48 if (Data[I] == RHS.Data[I])
49 continue;
50 if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) {
51 // The longer sequence of numbers is larger. This doesn't really handle
52 // prefixed zeros well.
53 for (size_t J = I+1; J != E+1; ++J) {
54 bool ld = J < Length && ascii_isdigit(Data[J]);
55 bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]);
56 if (ld != rd)
57 return rd ? -1 : 1;
58 if (!rd)
59 break;
60 }
61 }
62 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
63 }
64 if (Length == RHS.Length)
65 return 0;
66 return Length < RHS.Length ? -1 : 1;
67 }
68
69 // Compute the edit distance between the two given strings.
edit_distance(llvm::StringRef Other,bool AllowReplacements)70 unsigned StringRef::edit_distance(llvm::StringRef Other,
71 bool AllowReplacements) {
72 // The algorithm implemented below is the "classic"
73 // dynamic-programming algorithm for computing the Levenshtein
74 // distance, which is described here:
75 //
76 // http://en.wikipedia.org/wiki/Levenshtein_distance
77 //
78 // Although the algorithm is typically described using an m x n
79 // array, only two rows are used at a time, so this implemenation
80 // just keeps two separate vectors for those two rows.
81 size_type m = size();
82 size_type n = Other.size();
83
84 const unsigned SmallBufferSize = 64;
85 unsigned SmallBuffer[SmallBufferSize];
86 unsigned *Allocated = 0;
87 unsigned *previous = SmallBuffer;
88 if (2*(n + 1) > SmallBufferSize)
89 Allocated = previous = new unsigned [2*(n+1)];
90 unsigned *current = previous + (n + 1);
91
92 for (unsigned i = 0; i <= n; ++i)
93 previous[i] = i;
94
95 for (size_type y = 1; y <= m; ++y) {
96 current[0] = y;
97 for (size_type x = 1; x <= n; ++x) {
98 if (AllowReplacements) {
99 current[x] = min(previous[x-1] + ((*this)[y-1] == Other[x-1]? 0u:1u),
100 min(current[x-1], previous[x])+1);
101 }
102 else {
103 if ((*this)[y-1] == Other[x-1]) current[x] = previous[x-1];
104 else current[x] = min(current[x-1], previous[x]) + 1;
105 }
106 }
107
108 unsigned *tmp = current;
109 current = previous;
110 previous = tmp;
111 }
112
113 unsigned Result = previous[n];
114 delete [] Allocated;
115
116 return Result;
117 }
118
119 //===----------------------------------------------------------------------===//
120 // String Searching
121 //===----------------------------------------------------------------------===//
122
123
124 /// find - Search for the first string \arg Str in the string.
125 ///
126 /// \return - The index of the first occurence of \arg Str, or npos if not
127 /// found.
find(StringRef Str,size_t From) const128 size_t StringRef::find(StringRef Str, size_t From) const {
129 size_t N = Str.size();
130 if (N > Length)
131 return npos;
132 for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i)
133 if (substr(i, N).equals(Str))
134 return i;
135 return npos;
136 }
137
138 /// rfind - Search for the last string \arg Str in the string.
139 ///
140 /// \return - The index of the last occurence of \arg Str, or npos if not
141 /// found.
rfind(StringRef Str) const142 size_t StringRef::rfind(StringRef Str) const {
143 size_t N = Str.size();
144 if (N > Length)
145 return npos;
146 for (size_t i = Length - N + 1, e = 0; i != e;) {
147 --i;
148 if (substr(i, N).equals(Str))
149 return i;
150 }
151 return npos;
152 }
153
154 /// find_first_of - Find the first character in the string that is in \arg
155 /// Chars, or npos if not found.
156 ///
157 /// Note: O(size() + Chars.size())
find_first_of(StringRef Chars,size_t From) const158 StringRef::size_type StringRef::find_first_of(StringRef Chars,
159 size_t From) const {
160 std::bitset<1 << CHAR_BIT> CharBits;
161 for (size_type i = 0; i != Chars.size(); ++i)
162 CharBits.set((unsigned char)Chars[i]);
163
164 for (size_type i = min(From, Length), e = Length; i != e; ++i)
165 if (CharBits.test((unsigned char)Data[i]))
166 return i;
167 return npos;
168 }
169
170 /// find_first_not_of - Find the first character in the string that is not
171 /// \arg C or npos if not found.
find_first_not_of(char C,size_t From) const172 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
173 for (size_type i = min(From, Length), e = Length; i != e; ++i)
174 if (Data[i] != C)
175 return i;
176 return npos;
177 }
178
179 /// find_first_not_of - Find the first character in the string that is not
180 /// in the string \arg Chars, or npos if not found.
181 ///
182 /// Note: O(size() + Chars.size())
find_first_not_of(StringRef Chars,size_t From) const183 StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
184 size_t From) const {
185 std::bitset<1 << CHAR_BIT> CharBits;
186 for (size_type i = 0; i != Chars.size(); ++i)
187 CharBits.set((unsigned char)Chars[i]);
188
189 for (size_type i = min(From, Length), e = Length; i != e; ++i)
190 if (!CharBits.test((unsigned char)Data[i]))
191 return i;
192 return npos;
193 }
194
195
196 //===----------------------------------------------------------------------===//
197 // Helpful Algorithms
198 //===----------------------------------------------------------------------===//
199
200 /// count - Return the number of non-overlapped occurrences of \arg Str in
201 /// the string.
count(StringRef Str) const202 size_t StringRef::count(StringRef Str) const {
203 size_t Count = 0;
204 size_t N = Str.size();
205 if (N > Length)
206 return 0;
207 for (size_t i = 0, e = Length - N + 1; i != e; ++i)
208 if (substr(i, N).equals(Str))
209 ++Count;
210 return Count;
211 }
212
GetAutoSenseRadix(StringRef & Str)213 static unsigned GetAutoSenseRadix(StringRef &Str) {
214 if (Str.startswith("0x")) {
215 Str = Str.substr(2);
216 return 16;
217 } else if (Str.startswith("0b")) {
218 Str = Str.substr(2);
219 return 2;
220 } else if (Str.startswith("0")) {
221 return 8;
222 } else {
223 return 10;
224 }
225 }
226
227
228 /// GetAsUnsignedInteger - Workhorse method that converts a integer character
229 /// sequence of radix up to 36 to an unsigned long long value.
GetAsUnsignedInteger(StringRef Str,unsigned Radix,unsigned long long & Result)230 static bool GetAsUnsignedInteger(StringRef Str, unsigned Radix,
231 unsigned long long &Result) {
232 // Autosense radix if not specified.
233 if (Radix == 0)
234 Radix = GetAutoSenseRadix(Str);
235
236 // Empty strings (after the radix autosense) are invalid.
237 if (Str.empty()) return true;
238
239 // Parse all the bytes of the string given this radix. Watch for overflow.
240 Result = 0;
241 while (!Str.empty()) {
242 unsigned CharVal;
243 if (Str[0] >= '0' && Str[0] <= '9')
244 CharVal = Str[0]-'0';
245 else if (Str[0] >= 'a' && Str[0] <= 'z')
246 CharVal = Str[0]-'a'+10;
247 else if (Str[0] >= 'A' && Str[0] <= 'Z')
248 CharVal = Str[0]-'A'+10;
249 else
250 return true;
251
252 // If the parsed value is larger than the integer radix, the string is
253 // invalid.
254 if (CharVal >= Radix)
255 return true;
256
257 // Add in this character.
258 unsigned long long PrevResult = Result;
259 Result = Result*Radix+CharVal;
260
261 // Check for overflow.
262 if (Result < PrevResult)
263 return true;
264
265 Str = Str.substr(1);
266 }
267
268 return false;
269 }
270
getAsInteger(unsigned Radix,unsigned long long & Result) const271 bool StringRef::getAsInteger(unsigned Radix, unsigned long long &Result) const {
272 return GetAsUnsignedInteger(*this, Radix, Result);
273 }
274
275
getAsInteger(unsigned Radix,long long & Result) const276 bool StringRef::getAsInteger(unsigned Radix, long long &Result) const {
277 unsigned long long ULLVal;
278
279 // Handle positive strings first.
280 if (empty() || front() != '-') {
281 if (GetAsUnsignedInteger(*this, Radix, ULLVal) ||
282 // Check for value so large it overflows a signed value.
283 (long long)ULLVal < 0)
284 return true;
285 Result = ULLVal;
286 return false;
287 }
288
289 // Get the positive part of the value.
290 if (GetAsUnsignedInteger(substr(1), Radix, ULLVal) ||
291 // Reject values so large they'd overflow as negative signed, but allow
292 // "-0". This negates the unsigned so that the negative isn't undefined
293 // on signed overflow.
294 (long long)-ULLVal > 0)
295 return true;
296
297 Result = -ULLVal;
298 return false;
299 }
300
getAsInteger(unsigned Radix,int & Result) const301 bool StringRef::getAsInteger(unsigned Radix, int &Result) const {
302 long long Val;
303 if (getAsInteger(Radix, Val) ||
304 (int)Val != Val)
305 return true;
306 Result = Val;
307 return false;
308 }
309
getAsInteger(unsigned Radix,unsigned & Result) const310 bool StringRef::getAsInteger(unsigned Radix, unsigned &Result) const {
311 unsigned long long Val;
312 if (getAsInteger(Radix, Val) ||
313 (unsigned)Val != Val)
314 return true;
315 Result = Val;
316 return false;
317 }
318
getAsInteger(unsigned Radix,APInt & Result) const319 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
320 StringRef Str = *this;
321
322 // Autosense radix if not specified.
323 if (Radix == 0)
324 Radix = GetAutoSenseRadix(Str);
325
326 assert(Radix > 1 && Radix <= 36);
327
328 // Empty strings (after the radix autosense) are invalid.
329 if (Str.empty()) return true;
330
331 // Skip leading zeroes. This can be a significant improvement if
332 // it means we don't need > 64 bits.
333 while (!Str.empty() && Str.front() == '0')
334 Str = Str.substr(1);
335
336 // If it was nothing but zeroes....
337 if (Str.empty()) {
338 Result = APInt(64, 0);
339 return false;
340 }
341
342 // (Over-)estimate the required number of bits.
343 unsigned Log2Radix = 0;
344 while ((1U << Log2Radix) < Radix) Log2Radix++;
345 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix);
346
347 unsigned BitWidth = Log2Radix * Str.size();
348 if (BitWidth < Result.getBitWidth())
349 BitWidth = Result.getBitWidth(); // don't shrink the result
350 else
351 Result.zext(BitWidth);
352
353 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
354 if (!IsPowerOf2Radix) {
355 // These must have the same bit-width as Result.
356 RadixAP = APInt(BitWidth, Radix);
357 CharAP = APInt(BitWidth, 0);
358 }
359
360 // Parse all the bytes of the string given this radix.
361 Result = 0;
362 while (!Str.empty()) {
363 unsigned CharVal;
364 if (Str[0] >= '0' && Str[0] <= '9')
365 CharVal = Str[0]-'0';
366 else if (Str[0] >= 'a' && Str[0] <= 'z')
367 CharVal = Str[0]-'a'+10;
368 else if (Str[0] >= 'A' && Str[0] <= 'Z')
369 CharVal = Str[0]-'A'+10;
370 else
371 return true;
372
373 // If the parsed value is larger than the integer radix, the string is
374 // invalid.
375 if (CharVal >= Radix)
376 return true;
377
378 // Add in this character.
379 if (IsPowerOf2Radix) {
380 Result <<= Log2Radix;
381 Result |= CharVal;
382 } else {
383 Result *= RadixAP;
384 CharAP = CharVal;
385 Result += CharAP;
386 }
387
388 Str = Str.substr(1);
389 }
390
391 return false;
392 }
393