package textseg

import (
	"errors"
	"unicode/utf8"
)

// Generated from grapheme_clusters.rl. DO NOT EDIT
%%{
	# (except you are actually in grapheme_clusters.rl here, so edit away!)

	machine graphclust;
	write data;
}%%

// Error is an error value with the message "invalid UTF8 text".
//
// ScanGraphemeClusters, as written in this file, never returns it: every
// return path of that function yields a nil error.
var Error = errors.New("invalid UTF8 text")

// ScanGraphemeClusters is a split function for bufio.Scanner that splits
// on grapheme cluster boundaries.
//
// It returns the number of bytes to advance, the bytes of the first grapheme
// cluster found in data, and a nil error. Only the first cluster in data is
// ever returned; call again with the remaining bytes to get the next one.
//
// When atEOF is false and the end of the first cluster cannot be confirmed
// from the bytes available (because the following bytes might still extend
// it), the function returns (0, nil, nil) to request more data. When atEOF
// is true and no cluster pattern matches the start of data, the first UTF-8
// sequence as decoded by utf8.DecodeRune is returned on its own.
func ScanGraphemeClusters(data []byte, atEOF bool) (int, []byte, error) {
	if len(data) == 0 {
		return 0, nil, nil
	}

	// Ragel state
	cs := 0         // Current State
	p := 0          // "Pointer" into data
	pe := len(data) // End-of-data "pointer"
	ts := 0
	te := 0
	act := 0

	// A Ragel scanner flushes its last matched token when p reaches eof.
	// That is only correct when data really is the end of the input. For a
	// partial buffer the bytes still to come may extend the current cluster
	// (for example a combining mark after a base letter, or LF after CR),
	// so eof is left at -1 there and the scanner stops at pe without
	// emitting, which leads to the "request more" return below.
	eof := -1
	if atEOF {
		eof = pe
	}

	// Make Go compiler happy
	_ = ts
	_ = te
	_ = act
	_ = eof

	// Byte offsets of the token currently being matched, maintained by the
	// start and end actions below. endPos is the offset of the last byte of
	// the token, so the token is data[startPos : endPos+1].
	startPos := 0
	endPos := 0

	%%{
		include GraphemeCluster "grapheme_clusters_table.rl";
		include Emoji "emoji_table.rl";

		# Entering action (attached with ">"): remember where the token begins.
		action start {
			startPos = p
		}

		# Finishing action (attached with "@"): runs on every transition into a
		# final state, so endPos always tracks the last byte of the longest
		# match seen so far.
		action end {
			endPos = p
		}

		# Token action: hand the first complete token straight back to the
		# caller. Returning from here means each call yields at most one token.
		action emit {
			return endPos+1, data[startPos:endPos+1], nil
		}

		ZWJGlue = ZWJ (Extended_Pictographic Extend*)?;
		AnyExtender = Extend | ZWJGlue | SpacingMark;
		Extension = AnyExtender*;
		ReplacementChar = (0xEF 0xBF 0xBD);

		CRLFSeq = CR LF;
		ControlSeq = Control | ReplacementChar;
		HangulSeq = (
			L+ (((LV? V+ | LVT) T*)?|LV?) |
			LV V* T* |
			V+ T* |
			LVT T* |
			T+
		) Extension;
		EmojiSeq = Extended_Pictographic Extend* Extension;
		ZWJSeq = ZWJ (ZWJ | Extend | SpacingMark)*;
		EmojiFlagSeq = Regional_Indicator Regional_Indicator? Extension;

		UTF8Cont = 0x80 .. 0xBF;
		AnyUTF8 = (
			0x00..0x7F |
			0xC0..0xDF . UTF8Cont |
			0xE0..0xEF . UTF8Cont . UTF8Cont |
			0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont
		);

		# OtherSeq is any character that isn't at the start of one of the extended sequences above, followed by extension
		OtherSeq = (AnyUTF8 - (CR|LF|Control|ReplacementChar|L|LV|V|LVT|T|Extended_Pictographic|ZWJ|Regional_Indicator|Prepend)) (Extend | ZWJ | SpacingMark)*;

		# PrependSeq is prepend followed by any of the other patterns above, except control characters which explicitly break
		PrependSeq = Prepend+ (HangulSeq|EmojiSeq|ZWJSeq|EmojiFlagSeq|OtherSeq)?;

		CRLFTok = CRLFSeq >start @end;
		ControlTok = ControlSeq >start @end;
		HangulTok = HangulSeq >start @end;
		EmojiTok = EmojiSeq >start @end;
		ZWJTok = ZWJSeq >start @end;
		EmojiFlagTok = EmojiFlagSeq >start @end;
		OtherTok = OtherSeq >start @end;
		PrependTok = PrependSeq >start @end;

		main := |*
			CRLFTok => emit;
			ControlTok => emit;
			HangulTok => emit;
			EmojiTok => emit;
			ZWJTok => emit;
			EmojiFlagTok => emit;
			PrependTok => emit;
			OtherTok => emit;

			# any single valid UTF-8 character would also be valid per spec,
			# but we'll handle that separately after the loop so we can deal
			# with requesting more bytes if we're not at EOF.
		*|;

		write init;
		write exec;
	}%%

	// If we fall out here then we were unable to complete a sequence.
	// If we weren't able to complete a sequence then either we've
	// reached the end of a partial buffer (so there's more data to come)
	// or we have an isolated symbol that would normally be part of a
	// grapheme cluster but has appeared in isolation here.

	if !atEOF {
		// Request more
		return 0, nil, nil
	}

	// Just take the first UTF-8 sequence and return that.
	_, seqLen := utf8.DecodeRune(data)
	return seqLen, data[:seqLen], nil
}