1package syntax 2 3import ( 4 "bytes" 5 "encoding/binary" 6 "fmt" 7 "sort" 8 "unicode" 9 "unicode/utf8" 10) 11 12// CharSet combines start-end rune ranges and unicode categories representing a set of characters 13type CharSet struct { 14 ranges []singleRange 15 categories []category 16 sub *CharSet //optional subtractor 17 negate bool 18 anything bool 19} 20 21type category struct { 22 negate bool 23 cat string 24} 25 26type singleRange struct { 27 first rune 28 last rune 29} 30 31const ( 32 spaceCategoryText = " " 33 wordCategoryText = "W" 34) 35 36var ( 37 ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00} 38 ecmaWord = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b} 39 ecmaDigit = []rune{0x0030, 0x003a} 40) 41 42var ( 43 AnyClass = getCharSetFromOldString([]rune{0}, false) 44 ECMAAnyClass = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false) 45 NoneClass = getCharSetFromOldString(nil, false) 46 ECMAWordClass = getCharSetFromOldString(ecmaWord, false) 47 NotECMAWordClass = getCharSetFromOldString(ecmaWord, true) 48 ECMASpaceClass = getCharSetFromOldString(ecmaSpace, false) 49 NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true) 50 ECMADigitClass = getCharSetFromOldString(ecmaDigit, false) 51 NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true) 52 53 WordClass = getCharSetFromCategoryString(false, false, wordCategoryText) 54 NotWordClass = getCharSetFromCategoryString(true, false, wordCategoryText) 55 SpaceClass = getCharSetFromCategoryString(false, false, spaceCategoryText) 56 NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText) 57 DigitClass = getCharSetFromCategoryString(false, false, "Nd") 58 NotDigitClass = getCharSetFromCategoryString(false, true, "Nd") 59) 60 61var unicodeCategories = func() map[string]*unicode.RangeTable { 62 retVal := make(map[string]*unicode.RangeTable) 63 for k, v := range unicode.Scripts { 64 retVal[k] = v 65 } 66 for k, v := range unicode.Categories { 67 retVal[k] = v 68 } 69 for k, v := range unicode.Properties { 70 retVal[k] = v 71 } 72 return retVal 73}() 74 75func getCharSetFromCategoryString(negateSet bool, negateCat bool, cats ...string) func() *CharSet { 76 if negateCat && negateSet { 77 panic("BUG! You should only negate the set OR the category in a constant setup, but not both") 78 } 79 80 c := CharSet{negate: negateSet} 81 82 c.categories = make([]category, len(cats)) 83 for i, cat := range cats { 84 c.categories[i] = category{cat: cat, negate: negateCat} 85 } 86 return func() *CharSet { 87 //make a copy each time 88 local := c 89 //return that address 90 return &local 91 } 92} 93 94func getCharSetFromOldString(setText []rune, negate bool) func() *CharSet { 95 c := CharSet{} 96 if len(setText) > 0 { 97 fillFirst := false 98 l := len(setText) 99 if negate { 100 if setText[0] == 0 { 101 setText = setText[1:] 102 } else { 103 l++ 104 fillFirst = true 105 } 106 } 107 108 if l%2 == 0 { 109 c.ranges = make([]singleRange, l/2) 110 } else { 111 c.ranges = make([]singleRange, l/2+1) 112 } 113 114 first := true 115 if fillFirst { 116 c.ranges[0] = singleRange{first: 0} 117 first = false 118 } 119 120 i := 0 121 for _, r := range setText { 122 if first { 123 // lower bound in a new range 124 c.ranges[i] = singleRange{first: r} 125 first = false 126 } else { 127 c.ranges[i].last = r - 1 128 i++ 129 first = true 130 } 131 } 132 if !first { 133 c.ranges[i].last = utf8.MaxRune 134 } 135 } 136 137 return func() *CharSet { 138 local := c 139 return &local 140 } 141} 142 143// Copy makes a deep copy to prevent accidental mutation of a set 144func (c CharSet) Copy() CharSet { 145 ret := CharSet{ 146 anything: c.anything, 147 negate: c.negate, 148 } 149 150 ret.ranges = append(ret.ranges, c.ranges...) 151 ret.categories = append(ret.categories, c.categories...) 152 153 if c.sub != nil { 154 sub := c.sub.Copy() 155 ret.sub = &sub 156 } 157 158 return ret 159} 160 161// gets a human-readable description for a set string 162func (c CharSet) String() string { 163 buf := &bytes.Buffer{} 164 buf.WriteRune('[') 165 166 if c.IsNegated() { 167 buf.WriteRune('^') 168 } 169 170 for _, r := range c.ranges { 171 172 buf.WriteString(CharDescription(r.first)) 173 if r.first != r.last { 174 if r.last-r.first != 1 { 175 //groups that are 1 char apart skip the dash 176 buf.WriteRune('-') 177 } 178 buf.WriteString(CharDescription(r.last)) 179 } 180 } 181 182 for _, c := range c.categories { 183 buf.WriteString(c.String()) 184 } 185 186 if c.sub != nil { 187 buf.WriteRune('-') 188 buf.WriteString(c.sub.String()) 189 } 190 191 buf.WriteRune(']') 192 193 return buf.String() 194} 195 196// mapHashFill converts a charset into a buffer for use in maps 197func (c CharSet) mapHashFill(buf *bytes.Buffer) { 198 if c.negate { 199 buf.WriteByte(0) 200 } else { 201 buf.WriteByte(1) 202 } 203 204 binary.Write(buf, binary.LittleEndian, len(c.ranges)) 205 binary.Write(buf, binary.LittleEndian, len(c.categories)) 206 for _, r := range c.ranges { 207 buf.WriteRune(r.first) 208 buf.WriteRune(r.last) 209 } 210 for _, ct := range c.categories { 211 buf.WriteString(ct.cat) 212 if ct.negate { 213 buf.WriteByte(1) 214 } else { 215 buf.WriteByte(0) 216 } 217 } 218 219 if c.sub != nil { 220 c.sub.mapHashFill(buf) 221 } 222} 223 224// CharIn returns true if the rune is in our character set (either ranges or categories). 225// It handles negations and subtracted sub-charsets. 226func (c CharSet) CharIn(ch rune) bool { 227 val := false 228 // in s && !s.subtracted 229 230 //check ranges 231 for _, r := range c.ranges { 232 if ch < r.first { 233 continue 234 } 235 if ch <= r.last { 236 val = true 237 break 238 } 239 } 240 241 //check categories if we haven't already found a range 242 if !val && len(c.categories) > 0 { 243 for _, ct := range c.categories { 244 // special categories...then unicode 245 if ct.cat == spaceCategoryText { 246 if unicode.IsSpace(ch) { 247 // we found a space so we're done 248 // negate means this is a "bad" thing 249 val = !ct.negate 250 break 251 } else if ct.negate { 252 val = true 253 break 254 } 255 } else if ct.cat == wordCategoryText { 256 if IsWordChar(ch) { 257 val = !ct.negate 258 break 259 } else if ct.negate { 260 val = true 261 break 262 } 263 } else if unicode.Is(unicodeCategories[ct.cat], ch) { 264 // if we're in this unicode category then we're done 265 // if negate=true on this category then we "failed" our test 266 // otherwise we're good that we found it 267 val = !ct.negate 268 break 269 } else if ct.negate { 270 val = true 271 break 272 } 273 } 274 } 275 276 // negate the whole char set 277 if c.negate { 278 val = !val 279 } 280 281 // get subtracted recurse 282 if val && c.sub != nil { 283 val = !c.sub.CharIn(ch) 284 } 285 286 //log.Printf("Char '%v' in %v == %v", string(ch), c.String(), val) 287 return val 288} 289 290func (c category) String() string { 291 switch c.cat { 292 case spaceCategoryText: 293 if c.negate { 294 return "\\S" 295 } 296 return "\\s" 297 case wordCategoryText: 298 if c.negate { 299 return "\\W" 300 } 301 return "\\w" 302 } 303 if _, ok := unicodeCategories[c.cat]; ok { 304 305 if c.negate { 306 return "\\P{" + c.cat + "}" 307 } 308 return "\\p{" + c.cat + "}" 309 } 310 return "Unknown category: " + c.cat 311} 312 313// CharDescription Produces a human-readable description for a single character. 314func CharDescription(ch rune) string { 315 /*if ch == '\\' { 316 return "\\\\" 317 } 318 319 if ch > ' ' && ch <= '~' { 320 return string(ch) 321 } else if ch == '\n' { 322 return "\\n" 323 } else if ch == ' ' { 324 return "\\ " 325 }*/ 326 327 b := &bytes.Buffer{} 328 escape(b, ch, false) //fmt.Sprintf("%U", ch) 329 return b.String() 330} 331 332// According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/) 333// RL 1.4 Simple Word Boundaries The class of <word_character> includes all Alphabetic 334// values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C 335// ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER. 336func IsWordChar(r rune) bool { 337 //"L", "Mn", "Nd", "Pc" 338 return unicode.In(r, 339 unicode.Categories["L"], unicode.Categories["Mn"], 340 unicode.Categories["Nd"], unicode.Categories["Pc"]) || r == '\u200D' || r == '\u200C' 341 //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' 342} 343 344func IsECMAWordChar(r rune) bool { 345 return unicode.In(r, 346 unicode.Categories["L"], unicode.Categories["Mn"], 347 unicode.Categories["Nd"], unicode.Categories["Pc"]) 348 349 //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' 350} 351 352// SingletonChar will return the char from the first range without validation. 353// It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input 354func (c CharSet) SingletonChar() rune { 355 return c.ranges[0].first 356} 357 358func (c CharSet) IsSingleton() bool { 359 return !c.negate && //negated is multiple chars 360 len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars 361 c.sub == nil && // subtraction means we've got multiple chars 362 c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char 363} 364 365func (c CharSet) IsSingletonInverse() bool { 366 return c.negate && //same as above, but requires negated 367 len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars 368 c.sub == nil && // subtraction means we've got multiple chars 369 c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char 370} 371 372func (c CharSet) IsMergeable() bool { 373 return !c.IsNegated() && !c.HasSubtraction() 374} 375 376func (c CharSet) IsNegated() bool { 377 return c.negate 378} 379 380func (c CharSet) HasSubtraction() bool { 381 return c.sub != nil 382} 383 384func (c CharSet) IsEmpty() bool { 385 return len(c.ranges) == 0 && len(c.categories) == 0 && c.sub == nil 386} 387 388func (c *CharSet) addDigit(ecma, negate bool, pattern string) { 389 if ecma { 390 if negate { 391 c.addRanges(NotECMADigitClass().ranges) 392 } else { 393 c.addRanges(ECMADigitClass().ranges) 394 } 395 } else { 396 c.addCategories(category{cat: "Nd", negate: negate}) 397 } 398} 399 400func (c *CharSet) addChar(ch rune) { 401 c.addRange(ch, ch) 402} 403 404func (c *CharSet) addSpace(ecma, negate bool) { 405 if ecma { 406 if negate { 407 c.addRanges(NotECMASpaceClass().ranges) 408 } else { 409 c.addRanges(ECMASpaceClass().ranges) 410 } 411 } else { 412 c.addCategories(category{cat: spaceCategoryText, negate: negate}) 413 } 414} 415 416func (c *CharSet) addWord(ecma, negate bool) { 417 if ecma { 418 if negate { 419 c.addRanges(NotECMAWordClass().ranges) 420 } else { 421 c.addRanges(ECMAWordClass().ranges) 422 } 423 } else { 424 c.addCategories(category{cat: wordCategoryText, negate: negate}) 425 } 426} 427 428// Add set ranges and categories into ours -- no deduping or anything 429func (c *CharSet) addSet(set CharSet) { 430 if c.anything { 431 return 432 } 433 if set.anything { 434 c.makeAnything() 435 return 436 } 437 // just append here to prevent double-canon 438 c.ranges = append(c.ranges, set.ranges...) 439 c.addCategories(set.categories...) 440 c.canonicalize() 441} 442 443func (c *CharSet) makeAnything() { 444 c.anything = true 445 c.categories = []category{} 446 c.ranges = AnyClass().ranges 447} 448 449func (c *CharSet) addCategories(cats ...category) { 450 // don't add dupes and remove positive+negative 451 if c.anything { 452 // if we've had a previous positive+negative group then 453 // just return, we're as broad as we can get 454 return 455 } 456 457 for _, ct := range cats { 458 found := false 459 for _, ct2 := range c.categories { 460 if ct.cat == ct2.cat { 461 if ct.negate != ct2.negate { 462 // oposite negations...this mean we just 463 // take us as anything and move on 464 c.makeAnything() 465 return 466 } 467 found = true 468 break 469 } 470 } 471 472 if !found { 473 c.categories = append(c.categories, ct) 474 } 475 } 476} 477 478// Merges new ranges to our own 479func (c *CharSet) addRanges(ranges []singleRange) { 480 if c.anything { 481 return 482 } 483 c.ranges = append(c.ranges, ranges...) 484 c.canonicalize() 485} 486 487// Merges everything but the new ranges into our own 488func (c *CharSet) addNegativeRanges(ranges []singleRange) { 489 if c.anything { 490 return 491 } 492 493 var hi rune 494 495 // convert incoming ranges into opposites, assume they are in order 496 for _, r := range ranges { 497 if hi < r.first { 498 c.ranges = append(c.ranges, singleRange{hi, r.first - 1}) 499 } 500 hi = r.last + 1 501 } 502 503 if hi < utf8.MaxRune { 504 c.ranges = append(c.ranges, singleRange{hi, utf8.MaxRune}) 505 } 506 507 c.canonicalize() 508} 509 510func isValidUnicodeCat(catName string) bool { 511 _, ok := unicodeCategories[catName] 512 return ok 513} 514 515func (c *CharSet) addCategory(categoryName string, negate, caseInsensitive bool, pattern string) { 516 if !isValidUnicodeCat(categoryName) { 517 // unknown unicode category, script, or property "blah" 518 panic(fmt.Errorf("Unknown unicode category, script, or property '%v'", categoryName)) 519 520 } 521 522 if caseInsensitive && (categoryName == "Ll" || categoryName == "Lu" || categoryName == "Lt") { 523 // when RegexOptions.IgnoreCase is specified then {Ll} {Lu} and {Lt} cases should all match 524 c.addCategories( 525 category{cat: "Ll", negate: negate}, 526 category{cat: "Lu", negate: negate}, 527 category{cat: "Lt", negate: negate}) 528 } 529 c.addCategories(category{cat: categoryName, negate: negate}) 530} 531 532func (c *CharSet) addSubtraction(sub *CharSet) { 533 c.sub = sub 534} 535 536func (c *CharSet) addRange(chMin, chMax rune) { 537 c.ranges = append(c.ranges, singleRange{first: chMin, last: chMax}) 538 c.canonicalize() 539} 540 541func (c *CharSet) addNamedASCII(name string, negate bool) bool { 542 var rs []singleRange 543 544 switch name { 545 case "alnum": 546 rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'Z'}, singleRange{'a', 'z'}} 547 case "alpha": 548 rs = []singleRange{singleRange{'A', 'Z'}, singleRange{'a', 'z'}} 549 case "ascii": 550 rs = []singleRange{singleRange{0, 0x7f}} 551 case "blank": 552 rs = []singleRange{singleRange{'\t', '\t'}, singleRange{' ', ' '}} 553 case "cntrl": 554 rs = []singleRange{singleRange{0, 0x1f}, singleRange{0x7f, 0x7f}} 555 case "digit": 556 c.addDigit(false, negate, "") 557 case "graph": 558 rs = []singleRange{singleRange{'!', '~'}} 559 case "lower": 560 rs = []singleRange{singleRange{'a', 'z'}} 561 case "print": 562 rs = []singleRange{singleRange{' ', '~'}} 563 case "punct": //[!-/:-@[-`{-~] 564 rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}} 565 case "space": 566 c.addSpace(true, negate) 567 case "upper": 568 rs = []singleRange{singleRange{'A', 'Z'}} 569 case "word": 570 c.addWord(true, negate) 571 case "xdigit": 572 rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'F'}, singleRange{'a', 'f'}} 573 default: 574 return false 575 } 576 577 if len(rs) > 0 { 578 if negate { 579 c.addNegativeRanges(rs) 580 } else { 581 c.addRanges(rs) 582 } 583 } 584 585 return true 586} 587 588type singleRangeSorter []singleRange 589 590func (p singleRangeSorter) Len() int { return len(p) } 591func (p singleRangeSorter) Less(i, j int) bool { return p[i].first < p[j].first } 592func (p singleRangeSorter) Swap(i, j int) { p[i], p[j] = p[j], p[i] } 593 594// Logic to reduce a character class to a unique, sorted form. 595func (c *CharSet) canonicalize() { 596 var i, j int 597 var last rune 598 599 // 600 // Find and eliminate overlapping or abutting ranges 601 // 602 603 if len(c.ranges) > 1 { 604 sort.Sort(singleRangeSorter(c.ranges)) 605 606 done := false 607 608 for i, j = 1, 0; ; i++ { 609 for last = c.ranges[j].last; ; i++ { 610 if i == len(c.ranges) || last == utf8.MaxRune { 611 done = true 612 break 613 } 614 615 CurrentRange := c.ranges[i] 616 if CurrentRange.first > last+1 { 617 break 618 } 619 620 if last < CurrentRange.last { 621 last = CurrentRange.last 622 } 623 } 624 625 c.ranges[j] = singleRange{first: c.ranges[j].first, last: last} 626 627 j++ 628 629 if done { 630 break 631 } 632 633 if j < i { 634 c.ranges[j] = c.ranges[i] 635 } 636 } 637 638 c.ranges = append(c.ranges[:j], c.ranges[len(c.ranges):]...) 639 } 640} 641 642// Adds to the class any lowercase versions of characters already 643// in the class. Used for case-insensitivity. 644func (c *CharSet) addLowercase() { 645 if c.anything { 646 return 647 } 648 toAdd := []singleRange{} 649 for i := 0; i < len(c.ranges); i++ { 650 r := c.ranges[i] 651 if r.first == r.last { 652 lower := unicode.ToLower(r.first) 653 c.ranges[i] = singleRange{first: lower, last: lower} 654 } else { 655 toAdd = append(toAdd, r) 656 } 657 } 658 659 for _, r := range toAdd { 660 c.addLowercaseRange(r.first, r.last) 661 } 662 c.canonicalize() 663} 664 665/************************************************************************** 666 Let U be the set of Unicode character values and let L be the lowercase 667 function, mapping from U to U. To perform case insensitive matching of 668 character sets, we need to be able to map an interval I in U, say 669 670 I = [chMin, chMax] = { ch : chMin <= ch <= chMax } 671 672 to a set A such that A contains L(I) and A is contained in the union of 673 I and L(I). 674 675 The table below partitions U into intervals on which L is non-decreasing. 676 Thus, for any interval J = [a, b] contained in one of these intervals, 677 L(J) is contained in [L(a), L(b)]. 678 679 It is also true that for any such J, [L(a), L(b)] is contained in the 680 union of J and L(J). This does not follow from L being non-decreasing on 681 these intervals. It follows from the nature of the L on each interval. 682 On each interval, L has one of the following forms: 683 684 (1) L(ch) = constant (LowercaseSet) 685 (2) L(ch) = ch + offset (LowercaseAdd) 686 (3) L(ch) = ch | 1 (LowercaseBor) 687 (4) L(ch) = ch + (ch & 1) (LowercaseBad) 688 689 It is easy to verify that for any of these forms [L(a), L(b)] is 690 contained in the union of [a, b] and L([a, b]). 691***************************************************************************/ 692 693const ( 694 LowercaseSet = 0 // Set to arg. 695 LowercaseAdd = 1 // Add arg. 696 LowercaseBor = 2 // Bitwise or with 1. 697 LowercaseBad = 3 // Bitwise and with 1 and add original. 698) 699 700type lcMap struct { 701 chMin, chMax rune 702 op, data int32 703} 704 705var lcTable = []lcMap{ 706 lcMap{'\u0041', '\u005A', LowercaseAdd, 32}, 707 lcMap{'\u00C0', '\u00DE', LowercaseAdd, 32}, 708 lcMap{'\u0100', '\u012E', LowercaseBor, 0}, 709 lcMap{'\u0130', '\u0130', LowercaseSet, 0x0069}, 710 lcMap{'\u0132', '\u0136', LowercaseBor, 0}, 711 lcMap{'\u0139', '\u0147', LowercaseBad, 0}, 712 lcMap{'\u014A', '\u0176', LowercaseBor, 0}, 713 lcMap{'\u0178', '\u0178', LowercaseSet, 0x00FF}, 714 lcMap{'\u0179', '\u017D', LowercaseBad, 0}, 715 lcMap{'\u0181', '\u0181', LowercaseSet, 0x0253}, 716 lcMap{'\u0182', '\u0184', LowercaseBor, 0}, 717 lcMap{'\u0186', '\u0186', LowercaseSet, 0x0254}, 718 lcMap{'\u0187', '\u0187', LowercaseSet, 0x0188}, 719 lcMap{'\u0189', '\u018A', LowercaseAdd, 205}, 720 lcMap{'\u018B', '\u018B', LowercaseSet, 0x018C}, 721 lcMap{'\u018E', '\u018E', LowercaseSet, 0x01DD}, 722 lcMap{'\u018F', '\u018F', LowercaseSet, 0x0259}, 723 lcMap{'\u0190', '\u0190', LowercaseSet, 0x025B}, 724 lcMap{'\u0191', '\u0191', LowercaseSet, 0x0192}, 725 lcMap{'\u0193', '\u0193', LowercaseSet, 0x0260}, 726 lcMap{'\u0194', '\u0194', LowercaseSet, 0x0263}, 727 lcMap{'\u0196', '\u0196', LowercaseSet, 0x0269}, 728 lcMap{'\u0197', '\u0197', LowercaseSet, 0x0268}, 729 lcMap{'\u0198', '\u0198', LowercaseSet, 0x0199}, 730 lcMap{'\u019C', '\u019C', LowercaseSet, 0x026F}, 731 lcMap{'\u019D', '\u019D', LowercaseSet, 0x0272}, 732 lcMap{'\u019F', '\u019F', LowercaseSet, 0x0275}, 733 lcMap{'\u01A0', '\u01A4', LowercaseBor, 0}, 734 lcMap{'\u01A7', '\u01A7', LowercaseSet, 0x01A8}, 735 lcMap{'\u01A9', '\u01A9', LowercaseSet, 0x0283}, 736 lcMap{'\u01AC', '\u01AC', LowercaseSet, 0x01AD}, 737 lcMap{'\u01AE', '\u01AE', LowercaseSet, 0x0288}, 738 lcMap{'\u01AF', '\u01AF', LowercaseSet, 0x01B0}, 739 lcMap{'\u01B1', '\u01B2', LowercaseAdd, 217}, 740 lcMap{'\u01B3', '\u01B5', LowercaseBad, 0}, 741 lcMap{'\u01B7', '\u01B7', LowercaseSet, 0x0292}, 742 lcMap{'\u01B8', '\u01B8', LowercaseSet, 0x01B9}, 743 lcMap{'\u01BC', '\u01BC', LowercaseSet, 0x01BD}, 744 lcMap{'\u01C4', '\u01C5', LowercaseSet, 0x01C6}, 745 lcMap{'\u01C7', '\u01C8', LowercaseSet, 0x01C9}, 746 lcMap{'\u01CA', '\u01CB', LowercaseSet, 0x01CC}, 747 lcMap{'\u01CD', '\u01DB', LowercaseBad, 0}, 748 lcMap{'\u01DE', '\u01EE', LowercaseBor, 0}, 749 lcMap{'\u01F1', '\u01F2', LowercaseSet, 0x01F3}, 750 lcMap{'\u01F4', '\u01F4', LowercaseSet, 0x01F5}, 751 lcMap{'\u01FA', '\u0216', LowercaseBor, 0}, 752 lcMap{'\u0386', '\u0386', LowercaseSet, 0x03AC}, 753 lcMap{'\u0388', '\u038A', LowercaseAdd, 37}, 754 lcMap{'\u038C', '\u038C', LowercaseSet, 0x03CC}, 755 lcMap{'\u038E', '\u038F', LowercaseAdd, 63}, 756 lcMap{'\u0391', '\u03AB', LowercaseAdd, 32}, 757 lcMap{'\u03E2', '\u03EE', LowercaseBor, 0}, 758 lcMap{'\u0401', '\u040F', LowercaseAdd, 80}, 759 lcMap{'\u0410', '\u042F', LowercaseAdd, 32}, 760 lcMap{'\u0460', '\u0480', LowercaseBor, 0}, 761 lcMap{'\u0490', '\u04BE', LowercaseBor, 0}, 762 lcMap{'\u04C1', '\u04C3', LowercaseBad, 0}, 763 lcMap{'\u04C7', '\u04C7', LowercaseSet, 0x04C8}, 764 lcMap{'\u04CB', '\u04CB', LowercaseSet, 0x04CC}, 765 lcMap{'\u04D0', '\u04EA', LowercaseBor, 0}, 766 lcMap{'\u04EE', '\u04F4', LowercaseBor, 0}, 767 lcMap{'\u04F8', '\u04F8', LowercaseSet, 0x04F9}, 768 lcMap{'\u0531', '\u0556', LowercaseAdd, 48}, 769 lcMap{'\u10A0', '\u10C5', LowercaseAdd, 48}, 770 lcMap{'\u1E00', '\u1EF8', LowercaseBor, 0}, 771 lcMap{'\u1F08', '\u1F0F', LowercaseAdd, -8}, 772 lcMap{'\u1F18', '\u1F1F', LowercaseAdd, -8}, 773 lcMap{'\u1F28', '\u1F2F', LowercaseAdd, -8}, 774 lcMap{'\u1F38', '\u1F3F', LowercaseAdd, -8}, 775 lcMap{'\u1F48', '\u1F4D', LowercaseAdd, -8}, 776 lcMap{'\u1F59', '\u1F59', LowercaseSet, 0x1F51}, 777 lcMap{'\u1F5B', '\u1F5B', LowercaseSet, 0x1F53}, 778 lcMap{'\u1F5D', '\u1F5D', LowercaseSet, 0x1F55}, 779 lcMap{'\u1F5F', '\u1F5F', LowercaseSet, 0x1F57}, 780 lcMap{'\u1F68', '\u1F6F', LowercaseAdd, -8}, 781 lcMap{'\u1F88', '\u1F8F', LowercaseAdd, -8}, 782 lcMap{'\u1F98', '\u1F9F', LowercaseAdd, -8}, 783 lcMap{'\u1FA8', '\u1FAF', LowercaseAdd, -8}, 784 lcMap{'\u1FB8', '\u1FB9', LowercaseAdd, -8}, 785 lcMap{'\u1FBA', '\u1FBB', LowercaseAdd, -74}, 786 lcMap{'\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3}, 787 lcMap{'\u1FC8', '\u1FCB', LowercaseAdd, -86}, 788 lcMap{'\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3}, 789 lcMap{'\u1FD8', '\u1FD9', LowercaseAdd, -8}, 790 lcMap{'\u1FDA', '\u1FDB', LowercaseAdd, -100}, 791 lcMap{'\u1FE8', '\u1FE9', LowercaseAdd, -8}, 792 lcMap{'\u1FEA', '\u1FEB', LowercaseAdd, -112}, 793 lcMap{'\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5}, 794 lcMap{'\u1FF8', '\u1FF9', LowercaseAdd, -128}, 795 lcMap{'\u1FFA', '\u1FFB', LowercaseAdd, -126}, 796 lcMap{'\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3}, 797 lcMap{'\u2160', '\u216F', LowercaseAdd, 16}, 798 lcMap{'\u24B6', '\u24D0', LowercaseAdd, 26}, 799 lcMap{'\uFF21', '\uFF3A', LowercaseAdd, 32}, 800} 801 802func (c *CharSet) addLowercaseRange(chMin, chMax rune) { 803 var i, iMax, iMid int 804 var chMinT, chMaxT rune 805 var lc lcMap 806 807 for i, iMax = 0, len(lcTable); i < iMax; { 808 iMid = (i + iMax) / 2 809 if lcTable[iMid].chMax < chMin { 810 i = iMid + 1 811 } else { 812 iMax = iMid 813 } 814 } 815 816 for ; i < len(lcTable); i++ { 817 lc = lcTable[i] 818 if lc.chMin > chMax { 819 return 820 } 821 chMinT = lc.chMin 822 if chMinT < chMin { 823 chMinT = chMin 824 } 825 826 chMaxT = lc.chMax 827 if chMaxT > chMax { 828 chMaxT = chMax 829 } 830 831 switch lc.op { 832 case LowercaseSet: 833 chMinT = rune(lc.data) 834 chMaxT = rune(lc.data) 835 break 836 case LowercaseAdd: 837 chMinT += lc.data 838 chMaxT += lc.data 839 break 840 case LowercaseBor: 841 chMinT |= 1 842 chMaxT |= 1 843 break 844 case LowercaseBad: 845 chMinT += (chMinT & 1) 846 chMaxT += (chMaxT & 1) 847 break 848 } 849 850 if chMinT < chMin || chMaxT > chMax { 851 c.addRange(chMinT, chMaxT) 852 } 853 } 854} 855