1//go:generate go run ./gen.go
2
3// Package unidata contains information about Unicode characters.
4package unidata
5
6import (
7	"errors"
8	"fmt"
9	"strconv"
10	"strings"
11	"unicode"
12	"unicode/utf16"
13	"unicode/utf8"
14
15	"zgo.at/zstd/zstring"
16)
17
// UnknownCodepoint is the Name set on the Codepoint that Find returns when the
// requested value is neither in Codepoints nor inside any of the known ranges.
18const UnknownCodepoint = "CODEPOINT NOT IN UNICODE"
19
// Gender constants.
//
// NOTE(review): presumably these are the possible values of Emoji.Genders (an
// int field); nothing in this file reads them, so confirm against the
// generator before relying on the descriptions below.
20const (
21	GenderNone = 0 // NOTE(review): looks like "no gender variants" — confirm.
22	GenderSign = 1 // NOTE(review): name suggests variants built with a gender sign — confirm.
23	GenderRole = 2 // NOTE(review): name suggests variants built with a person/role codepoint — confirm.
24)
25
26// Codepoint is a single codepoint together with the data recorded for it.
27type Codepoint struct {
28	Codepoint rune // The codepoint value; Find sets it to the requested value for range matches.
29	Width     uint8 // Key into WidthNames (see WidthName).
30	Cat       uint8 // Key into Catnames (see Category).
31	Name      string // Character name; UnknownCodepoint when Find has no data for the value.
32	Digraph   string // NOTE(review): presumably an input digraph for this character — confirm.
33	HTML      string // Named HTML entity without the surrounding "&" and ";"; empty if none (see HTMLEntity).
34	KeySym    string // NOTE(review): presumably an X11 keysym name — confirm. TODO: []string?
35}
36
37// Emoji is an emoji sequence.
38type Emoji struct {
39	Codepoints      []rune // Codepoints of the sequence; String inserts U+200D joiners between them where needed.
40	Name            string
41	Group, Subgroup int // Group is the key into EmojiGroups; Subgroup indexes EmojiSubgroups[GroupName()].
42	CLDR            []string // NOTE(review): presumably CLDR annotation keywords — confirm.
43	SkinTones       bool // NOTE(review): presumably whether skin tone modifiers apply — confirm.
44	Genders         int // NOTE(review): presumably one of the Gender* constants — confirm.
45}
46
47func (e Emoji) GroupName() string {
48	return EmojiGroups[e.Group]
49}
50
51func (e Emoji) SubgroupName() string {
52	return EmojiSubgroups[e.GroupName()][e.Subgroup]
53}
54
55// Find a codepoint.
56func Find(cp rune) (Codepoint, bool) {
57	info, ok := Codepoints[cp]
58	if ok {
59		return info, true
60	}
61
62	// The UnicodeData.txt file doesn't list every character; some are included as a
63	// range:
64	//
65	//   3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
66	//   4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
67	for i, r := range ranges {
68		if cp >= r[0] && cp <= r[1] {
69			info, ok := Codepoints[r[0]]
70			if !ok {
71				panic("unidata.Find: '" + string(r) + "' not found; this should never happen")
72			}
73
74			info.Codepoint = cp
75			info.Name = rangeNames[i]
76			return info, true
77		}
78	}
79
80	return Codepoint{Codepoint: cp, Name: UnknownCodepoint}, false
81}
82
83// ToRune converts a human input string to a rune.
84//
85// The input can be as U+41, U+0041, U41, 0x41, 0o101, 0b1000001
86func ToRune(s string) (rune, error) {
87	os := s
88	s = strings.ToUpper(s)
89	var base = 16
90	switch {
91	case zstring.HasPrefixes(s, "0X", "U+"):
92		s = s[2:]
93	case strings.HasPrefix(s, "0D"):
94		s = s[2:]
95		base = 10
96	case strings.HasPrefix(s, "0O"):
97		s = s[2:]
98		base = 8
99	case strings.HasPrefix(s, "0B"):
100		s = s[2:]
101		base = 2
102
103	case zstring.HasPrefixes(s, "X", "U"):
104		s = s[1:]
105	case strings.HasPrefix(s, "O"):
106		s = s[1:]
107		base = 8
108	}
109	i, err := strconv.ParseInt(s, base, 32)
110	if err != nil {
111		if errors.Is(err, strconv.ErrRange) {
112			return 0, fmt.Errorf("out of range: %q", os)
113		}
114		if errors.Is(err, strconv.ErrSyntax) {
115			return 0, fmt.Errorf("not a number or codepoint: %q", os)
116		}
117		return 0, err
118	}
119	return rune(i), nil
120}
121
// CanonicalCategory transforms a category name to the canonical
// representation: spaces, commas, and underscores are removed and the result
// is lower-cased.
func CanonicalCategory(cat string) string {
	// TODO: improve.
	strip := strings.NewReplacer(" ", "", ",", "", "_", "")
	return strings.ToLower(strip.Replace(cat))
}
131
132func (c Codepoint) String() string {
133	return c.Repr(false) + ": " + c.FormatCodepoint() + " " + c.Name
134}
135
136func (c Codepoint) FormatCodepoint() string {
137	return fmt.Sprintf("U+%04X", c.Codepoint)
138}
139
140func (c Codepoint) Format(base int) string {
141	return strconv.FormatUint(uint64(c.Codepoint), base)
142}
143
144func (c Codepoint) Plane() string {
145	for p, r := range Planes {
146		if c.Codepoint >= r[0] && c.Codepoint <= r[1] {
147			return p
148		}
149	}
150	return ""
151}
152
153func (c Codepoint) WidthName() string {
154	return WidthNames[c.Width]
155}
156
157func (c Codepoint) Category() string {
158	return Catnames[c.Cat]
159}
160
161func (c Codepoint) Block() string {
162	for b, r := range Blocks {
163		if c.Codepoint >= r[0] && c.Codepoint <= r[1] {
164			return b
165		}
166	}
167	return ""
168}
169
170func (c Codepoint) UTF8() string {
171	buf := make([]byte, 4)
172	n := utf8.EncodeRune(buf, c.Codepoint)
173	return fmt.Sprintf("% x", buf[:n])
174}
175
176func (c Codepoint) UTF16(bigEndian bool) string {
177	var p []byte
178	if c.Codepoint <= 0xffff {
179		p = []byte{byte(c.Codepoint % 256), byte(c.Codepoint >> 8)}
180		if bigEndian {
181			p[1], p[0] = p[0], p[1]
182		}
183	} else {
184		a, b := utf16.EncodeRune(c.Codepoint)
185		p = []byte{byte(a % 256), byte(a >> 8), byte(b % 256), byte(b >> 8)}
186		if bigEndian {
187			p[1], p[0], p[3], p[2] = p[0], p[1], p[2], p[3]
188		}
189	}
190	return fmt.Sprintf(`% x`, p)
191}
192
193func (c Codepoint) XMLEntity() string {
194	return "&#x" + strconv.FormatInt(int64(c.Codepoint), 16) + ";"
195}
196
197func (c Codepoint) JSON() string {
198	u := strings.ReplaceAll(c.UTF16(true), " ", "")
199	if len(u) == 4 {
200		return `\u` + u
201	}
202	return `\u` + u[:4] + `\u` + u[4:]
203}
204
205func (c Codepoint) HTMLEntity() string {
206	if c.HTML != "" {
207		return "&" + c.HTML + ";"
208	}
209	return c.XMLEntity()
210}
211
212func (c Codepoint) Repr(raw bool) string {
213	if raw {
214		return string(c.Codepoint)
215	}
216
217	cp := c.Codepoint
218
219	// Display combining characters with ◌.
220	if unicode.In(cp, unicode.Mn, unicode.Mc, unicode.Me) {
221		return "\u25cc" + string(cp)
222	}
223
224	switch {
225	case unicode.IsControl(cp):
226		switch {
227		case cp < 0x20: // C0; use "Control Pictures" block
228			cp += 0x2400
229		case cp == 0x7f: // DEL
230			cp = 0x2421
231		// No control pictures for C1 or anything else, use "open box".
232		default:
233			cp = 0x2423
234		}
235	// "Other, Format" category except the soft hyphen and spaces.
236	case !unicode.IsPrint(cp) && cp != 0x00ad && !unicode.In(cp, unicode.Zs):
237		cp = 0xfffd
238	}
239
240	return string(cp)
241}
242
243func (e Emoji) String() string {
244	var c string
245
246	// Flags
247	// 1F1FF 1F1FC                                 # ���� E2.0 flag: Zimbabwe
248	// 1F3F4 E0067 E0062 E0065 E006E E0067 E007F   # �������������� E5.0 flag: England
249	if (e.Codepoints[0] >= 0x1f1e6 && e.Codepoints[0] <= 0x1f1ff) ||
250		(len(e.Codepoints) > 1 && e.Codepoints[1] == 0xe0067) {
251		for _, cp := range e.Codepoints {
252			c += string(rune(cp))
253		}
254		return c
255	}
256
257	for i, cp := range e.Codepoints {
258		c += string(rune(cp))
259
260		// Don't add ZWJ as last item.
261		if i == len(e.Codepoints)-1 {
262			continue
263		}
264
265		switch e.Codepoints[i+1] {
266		// Never add ZWJ before variation selector or skin tone.
267		case 0xfe0f, 0x1f3fb, 0x1f3fc, 0x1f3fd, 0x1f3fe, 0x1f3ff:
268			continue
269		// Keycap: join with 0xfe0f
270		case 0x20e3:
271			continue
272		}
273
274		c += "\u200d"
275	}
276	return c
277}
278