1//go:generate go run ./gen.go 2 3// Package unidata contains information about Unicode characters. 4package unidata 5 6import ( 7 "errors" 8 "fmt" 9 "strconv" 10 "strings" 11 "unicode" 12 "unicode/utf16" 13 "unicode/utf8" 14 15 "zgo.at/zstd/zstring" 16) 17 18const UnknownCodepoint = "CODEPOINT NOT IN UNICODE" 19 20const ( 21 GenderNone = 0 22 GenderSign = 1 23 GenderRole = 2 24) 25 26// Codepoint is a single codepoint. 27type Codepoint struct { 28 Codepoint rune 29 Width uint8 30 Cat uint8 31 Name string 32 Digraph string 33 HTML string 34 KeySym string // TODO: []string? 35} 36 37// Emoji is an emoji sequence. 38type Emoji struct { 39 Codepoints []rune 40 Name string 41 Group, Subgroup int 42 CLDR []string 43 SkinTones bool 44 Genders int 45} 46 47func (e Emoji) GroupName() string { 48 return EmojiGroups[e.Group] 49} 50 51func (e Emoji) SubgroupName() string { 52 return EmojiSubgroups[e.GroupName()][e.Subgroup] 53} 54 55// Find a codepoint. 56func Find(cp rune) (Codepoint, bool) { 57 info, ok := Codepoints[cp] 58 if ok { 59 return info, true 60 } 61 62 // The UnicodeData.txt file doesn't list every character; some are included as a 63 // range: 64 // 65 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; 66 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; 67 for i, r := range ranges { 68 if cp >= r[0] && cp <= r[1] { 69 info, ok := Codepoints[r[0]] 70 if !ok { 71 panic("unidata.Find: '" + string(r) + "' not found; this should never happen") 72 } 73 74 info.Codepoint = cp 75 info.Name = rangeNames[i] 76 return info, true 77 } 78 } 79 80 return Codepoint{Codepoint: cp, Name: UnknownCodepoint}, false 81} 82 83// ToRune converts a human input string to a rune. 84// 85// The input can be as U+41, U+0041, U41, 0x41, 0o101, 0b1000001 86func ToRune(s string) (rune, error) { 87 os := s 88 s = strings.ToUpper(s) 89 var base = 16 90 switch { 91 case zstring.HasPrefixes(s, "0X", "U+"): 92 s = s[2:] 93 case strings.HasPrefix(s, "0D"): 94 s = s[2:] 95 base = 10 96 case strings.HasPrefix(s, "0O"): 97 s = s[2:] 98 base = 8 99 case strings.HasPrefix(s, "0B"): 100 s = s[2:] 101 base = 2 102 103 case zstring.HasPrefixes(s, "X", "U"): 104 s = s[1:] 105 case strings.HasPrefix(s, "O"): 106 s = s[1:] 107 base = 8 108 } 109 i, err := strconv.ParseInt(s, base, 32) 110 if err != nil { 111 if errors.Is(err, strconv.ErrRange) { 112 return 0, fmt.Errorf("out of range: %q", os) 113 } 114 if errors.Is(err, strconv.ErrSyntax) { 115 return 0, fmt.Errorf("not a number or codepoint: %q", os) 116 } 117 return 0, err 118 } 119 return rune(i), nil 120} 121 122// CanonicalCategory transforms a category name to the canonical representation. 123func CanonicalCategory(cat string) string { 124 // TODO: improve. 125 cat = strings.Replace(cat, " ", "", -1) 126 cat = strings.Replace(cat, ",", "", -1) 127 cat = strings.Replace(cat, "_", "", -1) 128 cat = strings.ToLower(cat) 129 return cat 130} 131 132func (c Codepoint) String() string { 133 return c.Repr(false) + ": " + c.FormatCodepoint() + " " + c.Name 134} 135 136func (c Codepoint) FormatCodepoint() string { 137 return fmt.Sprintf("U+%04X", c.Codepoint) 138} 139 140func (c Codepoint) Format(base int) string { 141 return strconv.FormatUint(uint64(c.Codepoint), base) 142} 143 144func (c Codepoint) Plane() string { 145 for p, r := range Planes { 146 if c.Codepoint >= r[0] && c.Codepoint <= r[1] { 147 return p 148 } 149 } 150 return "" 151} 152 153func (c Codepoint) WidthName() string { 154 return WidthNames[c.Width] 155} 156 157func (c Codepoint) Category() string { 158 return Catnames[c.Cat] 159} 160 161func (c Codepoint) Block() string { 162 for b, r := range Blocks { 163 if c.Codepoint >= r[0] && c.Codepoint <= r[1] { 164 return b 165 } 166 } 167 return "" 168} 169 170func (c Codepoint) UTF8() string { 171 buf := make([]byte, 4) 172 n := utf8.EncodeRune(buf, c.Codepoint) 173 return fmt.Sprintf("% x", buf[:n]) 174} 175 176func (c Codepoint) UTF16(bigEndian bool) string { 177 var p []byte 178 if c.Codepoint <= 0xffff { 179 p = []byte{byte(c.Codepoint % 256), byte(c.Codepoint >> 8)} 180 if bigEndian { 181 p[1], p[0] = p[0], p[1] 182 } 183 } else { 184 a, b := utf16.EncodeRune(c.Codepoint) 185 p = []byte{byte(a % 256), byte(a >> 8), byte(b % 256), byte(b >> 8)} 186 if bigEndian { 187 p[1], p[0], p[3], p[2] = p[0], p[1], p[2], p[3] 188 } 189 } 190 return fmt.Sprintf(`% x`, p) 191} 192 193func (c Codepoint) XMLEntity() string { 194 return "&#x" + strconv.FormatInt(int64(c.Codepoint), 16) + ";" 195} 196 197func (c Codepoint) JSON() string { 198 u := strings.ReplaceAll(c.UTF16(true), " ", "") 199 if len(u) == 4 { 200 return `\u` + u 201 } 202 return `\u` + u[:4] + `\u` + u[4:] 203} 204 205func (c Codepoint) HTMLEntity() string { 206 if c.HTML != "" { 207 return "&" + c.HTML + ";" 208 } 209 return c.XMLEntity() 210} 211 212func (c Codepoint) Repr(raw bool) string { 213 if raw { 214 return string(c.Codepoint) 215 } 216 217 cp := c.Codepoint 218 219 // Display combining characters with ◌. 220 if unicode.In(cp, unicode.Mn, unicode.Mc, unicode.Me) { 221 return "\u25cc" + string(cp) 222 } 223 224 switch { 225 case unicode.IsControl(cp): 226 switch { 227 case cp < 0x20: // C0; use "Control Pictures" block 228 cp += 0x2400 229 case cp == 0x7f: // DEL 230 cp = 0x2421 231 // No control pictures for C1 or anything else, use "open box". 232 default: 233 cp = 0x2423 234 } 235 // "Other, Format" category except the soft hyphen and spaces. 236 case !unicode.IsPrint(cp) && cp != 0x00ad && !unicode.In(cp, unicode.Zs): 237 cp = 0xfffd 238 } 239 240 return string(cp) 241} 242 243func (e Emoji) String() string { 244 var c string 245 246 // Flags 247 // 1F1FF 1F1FC # E2.0 flag: Zimbabwe 248 // 1F3F4 E0067 E0062 E0065 E006E E0067 E007F # E5.0 flag: England 249 if (e.Codepoints[0] >= 0x1f1e6 && e.Codepoints[0] <= 0x1f1ff) || 250 (len(e.Codepoints) > 1 && e.Codepoints[1] == 0xe0067) { 251 for _, cp := range e.Codepoints { 252 c += string(rune(cp)) 253 } 254 return c 255 } 256 257 for i, cp := range e.Codepoints { 258 c += string(rune(cp)) 259 260 // Don't add ZWJ as last item. 261 if i == len(e.Codepoints)-1 { 262 continue 263 } 264 265 switch e.Codepoints[i+1] { 266 // Never add ZWJ before variation selector or skin tone. 267 case 0xfe0f, 0x1f3fb, 0x1f3fc, 0x1f3fd, 0x1f3fe, 0x1f3ff: 268 continue 269 // Keycap: join with 0xfe0f 270 case 0x20e3: 271 continue 272 } 273 274 c += "\u200d" 275 } 276 return c 277} 278