// Copyright (c) 2015 Eric Bower
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

package tokenize

import (
	"regexp"
	"strings"

	"github.com/jdkato/prose/internal/util"
	"gopkg.in/neurosnap/sentences.v1"
	"gopkg.in/neurosnap/sentences.v1/data"
)

// PunktSentenceTokenizer is an extension of the Go implementation of the Punkt
// sentence tokenizer (https://github.com/neurosnap/sentences), with a few
// minor improvements (see https://github.com/neurosnap/sentences/pull/18).
type PunktSentenceTokenizer struct {
	tokenizer *sentences.DefaultSentenceTokenizer
}

// NewPunktSentenceTokenizer creates a new PunktSentenceTokenizer and loads
// its English model.
func NewPunktSentenceTokenizer() *PunktSentenceTokenizer {
	var pt PunktSentenceTokenizer
	var err error

	pt.tokenizer, err = newSentenceTokenizer(nil)
	util.CheckError(err)

	return &pt
}

// Tokenize splits text into sentences.
func (p PunktSentenceTokenizer) Tokenize(text string) []string {
	sents := []string{}
	for _, s := range p.tokenizer.Tokenize(text) {
		sents = append(sents, s.Text)
	}
	return sents
}

type wordTokenizer struct {
	sentences.DefaultWordTokenizer
}

var reAbbr = regexp.MustCompile(`((?:[\w]\.)+[\w]*\.)`)
var reLooksLikeEllipsis = regexp.MustCompile(`(?:\.\s?){2,}\.`)
var reEntities = regexp.MustCompile(`Yahoo!`)
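// A rough sketch of what the expressions above match (illustrative inputs
// only, not drawn from this package's tests):
//
//	reAbbr.MatchString("F.B.I.")              // true: multi-period abbreviation
//	reLooksLikeEllipsis.MatchString(". . .")  // true: spaced or unspaced ellipsis
//	reEntities.MatchString("Yahoo!")          // true: the "!" belongs to the entity, not a sentence ender
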
// newSentenceTokenizer creates an English-customized sentence tokenizer. If s
// is nil, the bundled English training data is loaded.
func newSentenceTokenizer(s *sentences.Storage) (*sentences.DefaultSentenceTokenizer, error) {
	training := s

	if training == nil {
		b, err := data.Asset("data/english.json")
		if err != nil {
			return nil, err
		}

		training, err = sentences.LoadTraining(b)
		if err != nil {
			return nil, err
		}
	}

	// Add custom abbreviations to the training data.
	abbrevs := []string{"sgt", "gov", "no", "mt"}
	for _, abbr := range abbrevs {
		training.AbbrevTypes.Add(abbr)
	}

	lang := sentences.NewPunctStrings()
	word := newWordTokenizer(lang)
	annotations := sentences.NewAnnotations(training, lang, word)

	ortho := &sentences.OrthoContext{
		Storage:      training,
		PunctStrings: lang,
		TokenType:    word,
		TokenFirst:   word,
	}

	multiPunct := &multiPunctWordAnnotation{
		Storage:      training,
		TokenParser:  word,
		TokenGrouper: &sentences.DefaultTokenGrouper{},
		Ortho:        ortho,
	}

	annotations = append(annotations, multiPunct)

	tokenizer := &sentences.DefaultSentenceTokenizer{
		Storage:       training,
		PunctStrings:  lang,
		WordTokenizer: word,
		Annotations:   annotations,
	}

	return tokenizer, nil
}

func newWordTokenizer(p sentences.PunctStrings) *wordTokenizer {
	word := &wordTokenizer{}
	word.PunctStrings = p

	return word
}

// HasSentEndChars reports whether the token ends with sentence-ending
// punctuation (optionally followed by a closing quote, bracket, or
// parenthesis), excluding known entities such as "Yahoo!".
func (e *wordTokenizer) HasSentEndChars(t *sentences.Token) bool {
	enders := []string{
		`."`, `.)`, `.’`, `.”`,
		`?`, `?"`, `?'`, `?)`, `?’`, `?”`,
		`!`, `!"`, `!'`, `!)`, `!’`, `!”`,
	}

	for _, ender := range enders {
		if strings.HasSuffix(t.Tok, ender) && !reEntities.MatchString(t.Tok) {
			return true
		}
	}

	parens := []string{
		`.[`, `.(`, `."`,
		`?[`, `?(`,
		`![`, `!(`,
	}

	for _, paren := range parens {
		if strings.Contains(t.Tok, paren) {
			return true
		}
	}

	return false
}

// multiPunctWordAnnotation attempts to tease out custom abbreviations such as
// "F.B.I."
type multiPunctWordAnnotation struct {
	*sentences.Storage
	sentences.TokenParser
	sentences.TokenGrouper
	sentences.Ortho
}

// Annotate adjusts the sentence-break annotations for each pair of grouped
// tokens.
func (a *multiPunctWordAnnotation) Annotate(tokens []*sentences.Token) []*sentences.Token {
	for _, tokPair := range a.TokenGrouper.Group(tokens) {
		if len(tokPair) < 2 || tokPair[1] == nil {
			tok := tokPair[0].Tok
			if strings.Contains(tok, "\n") && strings.Contains(tok, " ") {
				// We've mislabeled due to an errant newline.
				tokPair[0].SentBreak = false
			}
			continue
		}

		a.tokenAnnotation(tokPair[0], tokPair[1])
	}

	return tokens
}

// looksInternal determines if tok's punctuation could appear
// sentence-internally (i.e., parentheses or quotations).
func looksInternal(tok string) bool {
	internal := []string{")", `’`, `”`, `"`, `'`}
	for _, punc := range internal {
		if strings.HasSuffix(tok, punc) {
			return true
		}
	}
	return false
}

func (a *multiPunctWordAnnotation) tokenAnnotation(tokOne, tokTwo *sentences.Token) {
	// This is an expensive calculation, so we only want to do it once.
	var nextTyp string

	// If both tokOne and tokTwo are periods, we're probably in an ellipsis
	// that wasn't properly tokenized by `WordTokenizer`.
	if strings.HasSuffix(tokOne.Tok, ".") && tokTwo.Tok == "." {
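		// For example (an illustrative case, not drawn from this package's
		// tests), a token like "Wait.." followed by a lone "." is treated as
		// a single ellipsis rather than a sentence break.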
		tokOne.SentBreak = false
		tokTwo.SentBreak = false
		return
	}

	isNonBreak := strings.HasSuffix(tokOne.Tok, ".") && !tokOne.SentBreak
	isEllipsis := reLooksLikeEllipsis.MatchString(tokOne.Tok)
	isInternal := tokOne.SentBreak && looksInternal(tokOne.Tok)

	if isNonBreak || isEllipsis || isInternal {
		nextTyp = a.TokenParser.TypeNoSentPeriod(tokTwo)
		isStarter := a.SentStarters[nextTyp]

		// If tokOne looks like an ellipsis and tokTwo is either capitalized
		// or a frequent sentence starter, break the sentence.
		if isEllipsis {
			if a.TokenParser.FirstUpper(tokTwo) || isStarter != 0 {
				tokOne.SentBreak = true
				return
			}
		}

		// If tokOne's sentence-breaking punctuation looks like it could
		// occur sentence-internally, ensure that the following word is either
		// capitalized or a frequent sentence starter.
		if isInternal {
			if a.TokenParser.FirstLower(tokTwo) && isStarter == 0 {
				tokOne.SentBreak = false
				return
			}
		}

		// If tokOne ends with a period but isn't marked as a sentence break,
		// mark it if tokTwo is capitalized and can occur in _ORTHO_LC.
		if isNonBreak && a.TokenParser.FirstUpper(tokTwo) {
			if a.Storage.OrthoContext[nextTyp]&112 != 0 {
				tokOne.SentBreak = true
			}
		}
	}

	if len(reAbbr.FindAllString(tokOne.Tok, 1)) == 0 {
		return
	}

	if a.IsInitial(tokOne) {
		return
	}

	tokOne.Abbr = true
	tokOne.SentBreak = false

	// [4.1.1. Orthographic Heuristic] Check if there's orthographic evidence
	// about whether the next word starts a sentence or not.
	isSentStarter := a.Ortho.Heuristic(tokTwo)
	if isSentStarter == 1 {
		tokOne.SentBreak = true
		return
	}

	if nextTyp == "" {
		nextTyp = a.TokenParser.TypeNoSentPeriod(tokTwo)
	}

	// [4.1.3. Frequent Sentence Starter Heuristic] If the next word is
	// capitalized and is a member of the frequent-sentence-starters list,
	// then label tokOne as a sentence break.
	if a.TokenParser.FirstUpper(tokTwo) && a.SentStarters[nextTyp] != 0 {
		tokOne.SentBreak = true
		return
	}
}
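
// A minimal usage sketch of the exported API (the input text is only
// illustrative and is not part of this package's tests):
//
//	tok := NewPunktSentenceTokenizer()
//	sents := tok.Tokenize("The F.B.I. met with Sgt. Smith. He declined to comment.")
//	// "F.B.I." and "Sgt." are treated as abbreviations rather than sentence
//	// ends, so sents should contain two entries.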