1// Copyright 2015 Huan Du. All rights reserved. 2// Licensed under the MIT license that can be found in the LICENSE file. 3 4package xstrings 5 6import ( 7 "unicode" 8 "unicode/utf8" 9) 10 11type runeRangeMap struct { 12 FromLo rune // Lower bound of range map. 13 FromHi rune // An inclusive higher bound of range map. 14 ToLo rune 15 ToHi rune 16} 17 18type runeDict struct { 19 Dict [unicode.MaxASCII + 1]rune 20} 21 22type runeMap map[rune]rune 23 24// Translator can translate string with pre-compiled from and to patterns. 25// If a from/to pattern pair needs to be used more than once, it's recommended 26// to create a Translator and reuse it. 27type Translator struct { 28 quickDict *runeDict // A quick dictionary to look up rune by index. Only available for latin runes. 29 runeMap runeMap // Rune map for translation. 30 ranges []*runeRangeMap // Ranges of runes. 31 mappedRune rune // If mappedRune >= 0, all matched runes are translated to the mappedRune. 32 reverted bool // If to pattern is empty, all matched characters will be deleted. 33 hasPattern bool 34} 35 36// NewTranslator creates new Translator through a from/to pattern pair. 37func NewTranslator(from, to string) *Translator { 38 tr := &Translator{} 39 40 if from == "" { 41 return tr 42 } 43 44 reverted := from[0] == '^' 45 deletion := len(to) == 0 46 47 if reverted { 48 from = from[1:] 49 } 50 51 var fromStart, fromEnd, fromRangeStep rune 52 var toStart, toEnd, toRangeStep rune 53 var fromRangeSize, toRangeSize rune 54 var singleRunes []rune 55 56 // Update the to rune range. 57 updateRange := func() { 58 // No more rune to read in the to rune pattern. 59 if toEnd == utf8.RuneError { 60 return 61 } 62 63 if toRangeStep == 0 { 64 to, toStart, toEnd, toRangeStep = nextRuneRange(to, toEnd) 65 return 66 } 67 68 // Current range is not empty. Consume 1 rune from start. 69 if toStart != toEnd { 70 toStart += toRangeStep 71 return 72 } 73 74 // No more rune. Repeat the last rune. 75 if to == "" { 76 toEnd = utf8.RuneError 77 return 78 } 79 80 // Both start and end are used. Read two more runes from the to pattern. 81 to, toStart, toEnd, toRangeStep = nextRuneRange(to, utf8.RuneError) 82 } 83 84 if deletion { 85 toStart = utf8.RuneError 86 toEnd = utf8.RuneError 87 } else { 88 // If from pattern is reverted, only the last rune in the to pattern will be used. 89 if reverted { 90 var size int 91 92 for len(to) > 0 { 93 toStart, size = utf8.DecodeRuneInString(to) 94 to = to[size:] 95 } 96 97 toEnd = utf8.RuneError 98 } else { 99 to, toStart, toEnd, toRangeStep = nextRuneRange(to, utf8.RuneError) 100 } 101 } 102 103 fromEnd = utf8.RuneError 104 105 for len(from) > 0 { 106 from, fromStart, fromEnd, fromRangeStep = nextRuneRange(from, fromEnd) 107 108 // fromStart is a single character. Just map it with a rune in the to pattern. 109 if fromRangeStep == 0 { 110 singleRunes = tr.addRune(fromStart, toStart, singleRunes) 111 updateRange() 112 continue 113 } 114 115 for toEnd != utf8.RuneError && fromStart != fromEnd { 116 // If mapped rune is a single character instead of a range, simply shift first 117 // rune in the range. 118 if toRangeStep == 0 { 119 singleRunes = tr.addRune(fromStart, toStart, singleRunes) 120 updateRange() 121 fromStart += fromRangeStep 122 continue 123 } 124 125 fromRangeSize = (fromEnd - fromStart) * fromRangeStep 126 toRangeSize = (toEnd - toStart) * toRangeStep 127 128 // Not enough runes in the to pattern. Need to read more. 129 if fromRangeSize > toRangeSize { 130 fromStart, toStart = tr.addRuneRange(fromStart, fromStart+toRangeSize*fromRangeStep, toStart, toEnd, singleRunes) 131 fromStart += fromRangeStep 132 updateRange() 133 134 // Edge case: If fromRangeSize == toRangeSize + 1, the last fromStart value needs be considered 135 // as a single rune. 136 if fromStart == fromEnd { 137 singleRunes = tr.addRune(fromStart, toStart, singleRunes) 138 updateRange() 139 } 140 141 continue 142 } 143 144 fromStart, toStart = tr.addRuneRange(fromStart, fromEnd, toStart, toStart+fromRangeSize*toRangeStep, singleRunes) 145 updateRange() 146 break 147 } 148 149 if fromStart == fromEnd { 150 fromEnd = utf8.RuneError 151 continue 152 } 153 154 _, toStart = tr.addRuneRange(fromStart, fromEnd, toStart, toStart, singleRunes) 155 fromEnd = utf8.RuneError 156 } 157 158 if fromEnd != utf8.RuneError { 159 tr.addRune(fromEnd, toStart, singleRunes) 160 } 161 162 tr.reverted = reverted 163 tr.mappedRune = -1 164 tr.hasPattern = true 165 166 // Translate RuneError only if in deletion or reverted mode. 167 if deletion || reverted { 168 tr.mappedRune = toStart 169 } 170 171 return tr 172} 173 174func (tr *Translator) addRune(from, to rune, singleRunes []rune) []rune { 175 if from <= unicode.MaxASCII { 176 if tr.quickDict == nil { 177 tr.quickDict = &runeDict{} 178 } 179 180 tr.quickDict.Dict[from] = to 181 } else { 182 if tr.runeMap == nil { 183 tr.runeMap = make(runeMap) 184 } 185 186 tr.runeMap[from] = to 187 } 188 189 singleRunes = append(singleRunes, from) 190 return singleRunes 191} 192 193func (tr *Translator) addRuneRange(fromLo, fromHi, toLo, toHi rune, singleRunes []rune) (rune, rune) { 194 var r rune 195 var rrm *runeRangeMap 196 197 if fromLo < fromHi { 198 rrm = &runeRangeMap{ 199 FromLo: fromLo, 200 FromHi: fromHi, 201 ToLo: toLo, 202 ToHi: toHi, 203 } 204 } else { 205 rrm = &runeRangeMap{ 206 FromLo: fromHi, 207 FromHi: fromLo, 208 ToLo: toHi, 209 ToHi: toLo, 210 } 211 } 212 213 // If there is any single rune conflicts with this rune range, clear single rune record. 214 for _, r = range singleRunes { 215 if rrm.FromLo <= r && r <= rrm.FromHi { 216 if r <= unicode.MaxASCII { 217 tr.quickDict.Dict[r] = 0 218 } else { 219 delete(tr.runeMap, r) 220 } 221 } 222 } 223 224 tr.ranges = append(tr.ranges, rrm) 225 return fromHi, toHi 226} 227 228func nextRuneRange(str string, last rune) (remaining string, start, end rune, rangeStep rune) { 229 var r rune 230 var size int 231 232 remaining = str 233 escaping := false 234 isRange := false 235 236 for len(remaining) > 0 { 237 r, size = utf8.DecodeRuneInString(remaining) 238 remaining = remaining[size:] 239 240 // Parse special characters. 241 if !escaping { 242 if r == '\\' { 243 escaping = true 244 continue 245 } 246 247 if r == '-' { 248 // Ignore slash at beginning of string. 249 if last == utf8.RuneError { 250 continue 251 } 252 253 start = last 254 isRange = true 255 continue 256 } 257 } 258 259 escaping = false 260 261 if last != utf8.RuneError { 262 // This is a range which start and end are the same. 263 // Considier it as a normal character. 264 if isRange && last == r { 265 isRange = false 266 continue 267 } 268 269 start = last 270 end = r 271 272 if isRange { 273 if start < end { 274 rangeStep = 1 275 } else { 276 rangeStep = -1 277 } 278 } 279 280 return 281 } 282 283 last = r 284 } 285 286 start = last 287 end = utf8.RuneError 288 return 289} 290 291// Translate str with a from/to pattern pair. 292// 293// See comment in Translate function for usage and samples. 294func (tr *Translator) Translate(str string) string { 295 if !tr.hasPattern || str == "" { 296 return str 297 } 298 299 var r rune 300 var size int 301 var needTr bool 302 303 orig := str 304 305 var output *stringBuilder 306 307 for len(str) > 0 { 308 r, size = utf8.DecodeRuneInString(str) 309 r, needTr = tr.TranslateRune(r) 310 311 if needTr && output == nil { 312 output = allocBuffer(orig, str) 313 } 314 315 if r != utf8.RuneError && output != nil { 316 output.WriteRune(r) 317 } 318 319 str = str[size:] 320 } 321 322 // No character is translated. 323 if output == nil { 324 return orig 325 } 326 327 return output.String() 328} 329 330// TranslateRune return translated rune and true if r matches the from pattern. 331// If r doesn't match the pattern, original r is returned and translated is false. 332func (tr *Translator) TranslateRune(r rune) (result rune, translated bool) { 333 switch { 334 case tr.quickDict != nil: 335 if r <= unicode.MaxASCII { 336 result = tr.quickDict.Dict[r] 337 338 if result != 0 { 339 translated = true 340 341 if tr.mappedRune >= 0 { 342 result = tr.mappedRune 343 } 344 345 break 346 } 347 } 348 349 fallthrough 350 351 case tr.runeMap != nil: 352 var ok bool 353 354 if result, ok = tr.runeMap[r]; ok { 355 translated = true 356 357 if tr.mappedRune >= 0 { 358 result = tr.mappedRune 359 } 360 361 break 362 } 363 364 fallthrough 365 366 default: 367 var rrm *runeRangeMap 368 ranges := tr.ranges 369 370 for i := len(ranges) - 1; i >= 0; i-- { 371 rrm = ranges[i] 372 373 if rrm.FromLo <= r && r <= rrm.FromHi { 374 translated = true 375 376 if tr.mappedRune >= 0 { 377 result = tr.mappedRune 378 break 379 } 380 381 if rrm.ToLo < rrm.ToHi { 382 result = rrm.ToLo + r - rrm.FromLo 383 } else if rrm.ToLo > rrm.ToHi { 384 // ToHi can be smaller than ToLo if range is from higher to lower. 385 result = rrm.ToLo - r + rrm.FromLo 386 } else { 387 result = rrm.ToLo 388 } 389 390 break 391 } 392 } 393 } 394 395 if tr.reverted { 396 if !translated { 397 result = tr.mappedRune 398 } 399 400 translated = !translated 401 } 402 403 if !translated { 404 result = r 405 } 406 407 return 408} 409 410// HasPattern returns true if Translator has one pattern at least. 411func (tr *Translator) HasPattern() bool { 412 return tr.hasPattern 413} 414 415// Translate str with the characters defined in from replaced by characters defined in to. 416// 417// From and to are patterns representing a set of characters. Pattern is defined as following. 418// 419// * Special characters 420// * '-' means a range of runes, e.g. 421// * "a-z" means all characters from 'a' to 'z' inclusive; 422// * "z-a" means all characters from 'z' to 'a' inclusive. 423// * '^' as first character means a set of all runes excepted listed, e.g. 424// * "^a-z" means all characters except 'a' to 'z' inclusive. 425// * '\' escapes special characters. 426// * Normal character represents itself, e.g. "abc" is a set including 'a', 'b' and 'c'. 427// 428// Translate will try to find a 1:1 mapping from from to to. 429// If to is smaller than from, last rune in to will be used to map "out of range" characters in from. 430// 431// Note that '^' only works in the from pattern. It will be considered as a normal character in the to pattern. 432// 433// If the to pattern is an empty string, Translate works exactly the same as Delete. 434// 435// Samples: 436// Translate("hello", "aeiou", "12345") => "h2ll4" 437// Translate("hello", "a-z", "A-Z") => "HELLO" 438// Translate("hello", "z-a", "a-z") => "svool" 439// Translate("hello", "aeiou", "*") => "h*ll*" 440// Translate("hello", "^l", "*") => "**ll*" 441// Translate("hello ^ world", `\^lo`, "*") => "he*** * w*r*d" 442func Translate(str, from, to string) string { 443 tr := NewTranslator(from, to) 444 return tr.Translate(str) 445} 446 447// Delete runes in str matching the pattern. 448// Pattern is defined in Translate function. 449// 450// Samples: 451// Delete("hello", "aeiou") => "hll" 452// Delete("hello", "a-k") => "llo" 453// Delete("hello", "^a-k") => "he" 454func Delete(str, pattern string) string { 455 tr := NewTranslator(pattern, "") 456 return tr.Translate(str) 457} 458 459// Count how many runes in str match the pattern. 460// Pattern is defined in Translate function. 461// 462// Samples: 463// Count("hello", "aeiou") => 3 464// Count("hello", "a-k") => 3 465// Count("hello", "^a-k") => 2 466func Count(str, pattern string) int { 467 if pattern == "" || str == "" { 468 return 0 469 } 470 471 var r rune 472 var size int 473 var matched bool 474 475 tr := NewTranslator(pattern, "") 476 cnt := 0 477 478 for len(str) > 0 { 479 r, size = utf8.DecodeRuneInString(str) 480 str = str[size:] 481 482 if _, matched = tr.TranslateRune(r); matched { 483 cnt++ 484 } 485 } 486 487 return cnt 488} 489 490// Squeeze deletes adjacent repeated runes in str. 491// If pattern is not empty, only runes matching the pattern will be squeezed. 492// 493// Samples: 494// Squeeze("hello", "") => "helo" 495// Squeeze("hello", "m-z") => "hello" 496// Squeeze("hello world", " ") => "hello world" 497func Squeeze(str, pattern string) string { 498 var last, r rune 499 var size int 500 var skipSqueeze, matched bool 501 var tr *Translator 502 var output *stringBuilder 503 504 orig := str 505 last = -1 506 507 if len(pattern) > 0 { 508 tr = NewTranslator(pattern, "") 509 } 510 511 for len(str) > 0 { 512 r, size = utf8.DecodeRuneInString(str) 513 514 // Need to squeeze the str. 515 if last == r && !skipSqueeze { 516 if tr != nil { 517 if _, matched = tr.TranslateRune(r); !matched { 518 skipSqueeze = true 519 } 520 } 521 522 if output == nil { 523 output = allocBuffer(orig, str) 524 } 525 526 if skipSqueeze { 527 output.WriteRune(r) 528 } 529 } else { 530 if output != nil { 531 output.WriteRune(r) 532 } 533 534 last = r 535 skipSqueeze = false 536 } 537 538 str = str[size:] 539 } 540 541 if output == nil { 542 return orig 543 } 544 545 return output.String() 546} 547