1"""Case-folding and whitespace normalization""" 2# Unicode Case Folding table has been derived from the following work: 3# 4# CaseFolding-12.0.0.txt 5# Date: 2019-01-22, 08:18:22 GMT 6# (c) 2019 Unicode(R) Inc. 7# Unicode and the Unicode Logo are registered trademarks 8# of Unicode, Inc. in the U.S. and other countries. 9# For terms of use, see http://www.unicode.org/terms_of_use.html 10# 11# Unicode Character Database 12# For documentation, see http://www.unicode.org/reports/tr44/ 13 14import re 15import sys 16from builtins import str, chr 17 18__all__ = ["normalize_reference"] 19 20if sys.version_info < (3,) and sys.maxunicode <= 0xffff: 21 # shim for Python 2.x UCS2 build 22 _unichr = chr 23 24 def chr(cdp): 25 if 0x10000 <= cdp < 0x110000: 26 cdp -= 0x10000 27 return (_unichr(0xd800 | (cdp >> 10)) + 28 _unichr(0xdc00 | (cdp & 0x3ff))) 29 return _unichr(cdp) 30 31 32def _parse_table(tbl): 33 xlat = {} 34 cur_i, cur_j = -1, 0 35 for entry in tbl.split(';'): 36 arr = entry.split(',') 37 info = [int(x, 36) if x else 0 for x in arr[0].split(':')] 38 arr = [int(x, 36) for x in arr[1:]] 39 assert not any(x in xlat for x in arr) 40 sfx = ''.join(map(chr, arr)) 41 streak, stride = 0, 1 42 if len(info) == 2: 43 fdt, delta = info 44 elif len(info) == 3: 45 fdt, streak, delta = info 46 else: 47 fdt, streak, delta, stride = info 48 assert streak >= 0 and stride >= 1 49 cur_i += fdt + 1 50 cur_j -= delta 51 assert cur_j != 0 52 i = cur_i 53 last = cur_i + streak 54 while i <= last: 55 # uniqueness and idempotency 56 assert i not in xlat and i + cur_j not in xlat 57 assert i not in arr 58 xlat[i] = chr(i + cur_j) + sfx 59 i += stride 60 return xlat 61 62 63XLAT = _parse_table( 64 # ===== Start of Unicode Case Folding table ===== 65 '1t:p:-w;37:-kn;a:m:kn;n:6:;6:3w,37;w:1a:-31:2;1b:5k,lj;1:4:-5k:2;6:e::' 66 '2;f:-aa,32;:18:aa:2;19:3e;:4:-3e:2;5:7h;1:-da;:2:5t:2;3:-5p;:5p;1:1:-5' 67 'o;1:5o;2:-26;:-3f;:-1;:5m;1:-5o;:-2;1:-4;:2;:5s;3:-5u;:-2;1:-1;:4:5x:2' 68 ';5:-61;:61;1:-61;2:61;1:-61;:61;1:1:-60;1:2:60:2;3:-62;:4:62:4;b:-1;:1' 69 ';1:-1;:1;1:-1;:g:1:2;i:g::2;h:av,lo;:-aw;:2:1:2;3:2q;:-15;:12:-1l:2;13' 70 ':3n;1:g:-3n:2;n:-8bu;:8bu;1:4k;:-8gb;2:8br;1:5g;:-7c;:-2;:8:1y:2;72:-3' 71 '7;16:2:37:2;5:;8:-37;6:26;1:2:1;3:-r;1:1:1;1:m,lk,ld;:g:9;h:8:;c:b,lk,' 72 'ld;h:k;c:-7;:12;:-5;3:-a;:7;1:m:-n:2;n:1j;:-6;2:c;:4;1:-1t;1:8;:-8;2:2' 73 ':3n;2:f:-5u;f:v:1c;27:w:v:2;15:1g::2;1h:-e;:c:e:2;e:2m::2;2o:11:-1b;2d' 74 ':2a,136;26w:11:-5mq;12:6::6;mo:5:5m0;1on:4sm;:-1;:-9;:1:-2;1:1;:-7;:-o' 75 ';:-vzb;7:16:tj7;18:2:;8y:44:-2bl:2;45:5yn,mp;:-b,lk;:-2,lm;:-1,lm;:p,j' 76 'i;:-5xb;2:5wx,37;1:2m:-5yk:2;2v:7:9;f:5:;f:7:;f:7:;f:5:;7:5fn,lv;1:2,l' 77 'v,lc;1:2,lv,ld;1:2,lv,n6;2:6:-5ft:2;e:7:;n:7:3c,qh;7:7:8,qh;7:7:-o,qh;' 78 '7:7:8,qh;7:7:-1k,qh;7:7:8,qh;9:-6,qh;:5hc,qh;:6,qh;1:-3,n6;:1,n6,qh;:1' 79 ':-5j2;1:1:1u;1:5hd,qh;1:-6;3:-5h3,qh;:5ha,qh;:a,qh;1:-7,n6;:1,n6,qh;:3' 80 ':-5h6;3:5hb,qh;5:4,lk,lc;:1,lk,ld;2:3,n6;:1,lk,n6;:1:-5jq;1:1:2k;7:5h5' 81 ',lk,lc;:1,lk,ld;:5,lv;1:-2,n6;:1,lk,n6;:1:-5ju;1:1:2w;1:-2x;5:33,qh;:5' 82 'h0,qh;:-4,qh;1:7,n6;:1,n6,qh;:1:-5gu;1:1:-2;1:5h1,qh;89:8a;3:o2;:-3d;6' 83 ':-6ea;19:f:c;y:f;mq:p:-p;1ft:1a:-m;2n:1b;1:8ag;:-5ch;:5c1;2:4:-8a0:2;5' 84 ':8bh;:-v;:y;:-1;1:3:-8bj:3;b:1:8cg;1:2q:-8cg:2;2y:2::2;6:nym::nym;nyn:' 85 '16::2;1p:q::2;4h:c::2;f:1o::2;1y:2::2;3:r9h;:8:-r9h:2;c:;1:wmh;2:2:-wm' 86 'h:2;5:i::2;j:wn9;:b;:-4;:-a;:3;1:-1e;:o;:-l;:-xbp;:a:pr:2;d:;1:1d;:wlv' 87 ';:-5cb;q1:27:2oo;fpr:jii,2u;:1,2x;:1,30;:1,2u,2x;:1,2u,30;:-c,38;:1,38' 88 ';c:-z8,12u;:1,12d;:1,12j;:-9,12u;:b,12l;sp:p:-1cjn;ym:13:-8;4v:z:;1jj:' 89 '1e:-o;2e7:v:w;gwv:v:;o8v:x:-2' 90 # ===== End of Unicode Case Folding table ===== 91) 92 93 94def _check_native(tbl): 95 """ 96 Determine if Python's own native implementation 97 subsumes the supplied case folding table 98 """ 99 try: 100 for i in tbl: 101 stv = chr(i) 102 if stv.casefold() == stv: 103 return False 104 except AttributeError: 105 return False 106 return True 107 108 109# Hoist version check out of function for performance 110SPACE_RE = re.compile(r'[ \t\r\n]+') 111if _check_native(XLAT): 112 def normalize_reference(string): 113 """ 114 Normalize reference label: collapse internal whitespace 115 to single space, remove leading/trailing whitespace, case fold. 116 """ 117 return SPACE_RE.sub(' ', string[1:-1].strip()).casefold() 118elif sys.version_info >= (3,) or sys.maxunicode > 0xffff: 119 def normalize_reference(string): 120 """ 121 Normalize reference label: collapse internal whitespace 122 to single space, remove leading/trailing whitespace, case fold. 123 """ 124 return SPACE_RE.sub(' ', string[1:-1].strip()).translate(XLAT) 125else: 126 def _get_smp_regex(): 127 xls = sorted(x - 0x10000 for x in XLAT if x >= 0x10000) 128 xls.append(-1) 129 fmt, (dsh, opn, pip, cse) = str('\\u%04x'), str('-[|]') 130 rga, srk, erk = [str(r'[ \t\r\n]+')], 0, -2 131 for k in xls: 132 new_hir = (erk ^ k) >> 10 != 0 133 if new_hir or erk + 1 != k: 134 if erk >= 0 and srk != erk: 135 if srk + 1 != erk: 136 rga.append(dsh) 137 rga.append(fmt % (0xdc00 + (erk & 0x3ff))) 138 if new_hir: 139 if erk >= 0: 140 rga.append(cse) 141 if k < 0: 142 break 143 rga.append(pip) 144 rga.append(fmt % (0xd800 + (k >> 10))) 145 rga.append(opn) 146 srk = k 147 rga.append(fmt % (0xdc00 + (srk & 0x3ff))) 148 erk = k 149 return re.compile(str().join(rga)) 150 151 def _subst_handler(matchobj): 152 src = matchobj.group(0) 153 hiv = ord(src[0]) 154 if hiv < 0xd800: 155 return ' ' 156 return XLAT[0x10000 + ((hiv & 0x3ff) << 10) | (ord(src[1]) & 0x3ff)] 157 158 SMP_RE = _get_smp_regex() 159 160 def normalize_reference(string): 161 """ 162 Normalize reference label: collapse internal whitespace 163 to single space, remove leading/trailing whitespace, case fold. 164 """ 165 return SMP_RE.sub(_subst_handler, string[1:-1].strip()).translate(XLAT) 166