1package textseg
2
3import (
4    "errors"
5    "unicode/utf8"
6)
7
8// Generated from grapheme_clusters.rl. DO NOT EDIT
// Ragel section declaring the state machine used by this file: it names the
// machine "graphclust" and asks Ragel to write the machine's static data at
// this point in the generated output.
9%%{
10  # (except you are actually in grapheme_clusters.rl here, so edit away!)
11
12  machine graphclust;
13  write data;
14}%%
15
// Error is a package-level error value whose message is "invalid UTF8 text".
// NOTE(review): nothing in the code visible here returns it; confirm where
// (or whether) it is used before relying on it.
16var Error = errors.New("invalid UTF8 text")
17
18// ScanGraphemeClusters is a split function for bufio.Scanner that splits
19// on grapheme cluster boundaries.
//
// The results are, in order, the number of bytes to advance, the bytes of the
// token, and an error. Every return path visible in this function yields a
// nil error.
//
// Return paths:
//   - data is empty: (0, nil, nil).
//   - the scanner completes a pattern: the emit action returns
//     data[startPos:endPos+1] and advances by endPos+1.
//   - the scanner falls through and atEOF is false: (0, nil, nil), which
//     asks the caller for more data.
//   - the scanner falls through and atEOF is true: the first UTF-8 sequence
//     in data, as measured by utf8.DecodeRune.
20func ScanGraphemeClusters(data []byte, atEOF bool) (int, []byte, error) {
21    if len(data) == 0 {
22        return 0, nil, nil
23    }
24
25    // Ragel state
26	cs := 0 // Current State
27	p := 0  // "Pointer" into data
28	pe := len(data) // End-of-data "pointer"
    // ts, te and act are the variables a Ragel scanner (the |* ... *| block
    // below) expects the host code to declare: token start, token end and
    // the most recently matched action.
29    ts := 0
30    te := 0
31    act := 0
    // NOTE(review): eof is set to pe regardless of atEOF, so the machine
    // presumably treats the end of this buffer as end of input even when the
    // caller has more data. Confirm how clusters that straddle a buffer
    // boundary behave.
32    eof := pe
33
34    // Make Go compiler happy
35    _ = ts
36    _ = te
37    _ = act
38    _ = eof
39
    // startPos and endPos are written by the "start" and "end" actions in
    // the machine definition below and read by the "emit" action to slice
    // the matched bytes out of data.
40    startPos := 0
41    endPos := 0
42
43    %%{
44        include GraphemeCluster "grapheme_clusters_table.rl";
45        include Emoji "emoji_table.rl";
46
47        action start {
48            startPos = p
49        }
50
51        action end {
52            endPos = p
53        }
54
55        action emit {
56            return endPos+1, data[startPos:endPos+1], nil
57        }
58
59        ZWJGlue = ZWJ (Extended_Pictographic Extend*)?;
60        AnyExtender = Extend | ZWJGlue | SpacingMark;
61        Extension = AnyExtender*;
62        ReplacementChar = (0xEF 0xBF 0xBD);
63
64        CRLFSeq = CR LF;
65        ControlSeq = Control | ReplacementChar;
66        HangulSeq = (
67            L+ (((LV? V+ | LVT) T*)?|LV?) |
68            LV V* T* |
69            V+ T* |
70            LVT T* |
71            T+
72        ) Extension;
73        EmojiSeq = Extended_Pictographic Extend* Extension;
74        ZWJSeq = ZWJ (ZWJ | Extend | SpacingMark)*;
75        EmojiFlagSeq = Regional_Indicator Regional_Indicator? Extension;
76
77        UTF8Cont = 0x80 .. 0xBF;
78        AnyUTF8 = (
79            0x00..0x7F |
80            0xC0..0xDF . UTF8Cont |
81            0xE0..0xEF . UTF8Cont . UTF8Cont |
82            0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont
83        );
84
85        # OtherSeq is any character that isn't at the start of one of the extended sequences above, followed by extension
86        OtherSeq = (AnyUTF8 - (CR|LF|Control|ReplacementChar|L|LV|V|LVT|T|Extended_Pictographic|ZWJ|Regional_Indicator|Prepend)) (Extend | ZWJ | SpacingMark)*;
87
88        # PrependSeq is prepend followed by any of the other patterns above, except control characters which explicitly break
89        PrependSeq = Prepend+ (HangulSeq|EmojiSeq|ZWJSeq|EmojiFlagSeq|OtherSeq)?;
90
91        CRLFTok = CRLFSeq >start @end;
92        ControlTok = ControlSeq >start @end;
93        HangulTok = HangulSeq >start @end;
94        EmojiTok = EmojiSeq >start @end;
95        ZWJTok = ZWJSeq >start @end;
96        EmojiFlagTok = EmojiFlagSeq >start @end;
97        OtherTok = OtherSeq >start @end;
98        PrependTok = PrependSeq >start @end;
99
100        main := |*
101            CRLFTok => emit;
102            ControlTok => emit;
103            HangulTok => emit;
104            EmojiTok => emit;
105            ZWJTok => emit;
106            EmojiFlagTok => emit;
107            PrependTok => emit;
108            OtherTok => emit;
109
110            # any single valid UTF-8 character would also be valid per spec,
111            # but we'll handle that separately after the loop so we can deal
112            # with requesting more bytes if we're not at EOF.
113        *|;
114
115        write init;
116        write exec;
117    }%%
118
119    // If we fall out here then we were unable to complete a sequence.
120    // If we weren't able to complete a sequence then either we've
121    // reached the end of a partial buffer (so there's more data to come)
122    // or we have an isolated symbol that would normally be part of a
123    // grapheme cluster but has appeared in isolation here.
124
125    if !atEOF {
126        // Request more
127        return 0, nil, nil
128    }
129
130    // Just take the first UTF-8 sequence and return that.
    // data is non-empty here because of the guard at the top of the
    // function. utf8.DecodeRune reports a width of 1 for an invalid
    // encoding, so malformed input still advances by one byte.
131    _, seqLen := utf8.DecodeRune(data)
132    return seqLen, data[:seqLen], nil
133}