1// Copyright (c) 2015 Eric Bower
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in all
11// copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19// SOFTWARE.
20
21package tokenize
22
23import (
24	"regexp"
25	"strings"
26
27	"github.com/jdkato/prose/internal/util"
28	"gopkg.in/neurosnap/sentences.v1"
29	"gopkg.in/neurosnap/sentences.v1/data"
30)
31
// PunktSentenceTokenizer is an extension of the Go implementation of the Punkt
// sentence tokenizer (https://github.com/neurosnap/sentences), with a few
// minor improvements (see https://github.com/neurosnap/sentences/pull/18).
type PunktSentenceTokenizer struct {
	// tokenizer is the underlying Punkt tokenizer; it is initialized with
	// the bundled English model by NewPunktSentenceTokenizer.
	tokenizer *sentences.DefaultSentenceTokenizer
}
38
39// NewPunktSentenceTokenizer creates a new PunktSentenceTokenizer and loads
40// its English model.
41func NewPunktSentenceTokenizer() *PunktSentenceTokenizer {
42	var pt PunktSentenceTokenizer
43	var err error
44
45	pt.tokenizer, err = newSentenceTokenizer(nil)
46	util.CheckError(err)
47
48	return &pt
49}
50
51// Tokenize splits text into sentences.
52func (p PunktSentenceTokenizer) Tokenize(text string) []string {
53	sents := []string{}
54	for _, s := range p.tokenizer.Tokenize(text) {
55		sents = append(sents, s.Text)
56	}
57	return sents
58}
59
// wordTokenizer embeds the sentences package's default word tokenizer,
// overriding HasSentEndChars with customized sentence-ending detection.
type wordTokenizer struct {
	sentences.DefaultWordTokenizer
}
63
// reAbbr matches dotted abbreviations such as "F.B.I." or "U.S.".
var reAbbr = regexp.MustCompile(`((?:[\w]\.)+[\w]*\.)`)

// reLooksLikeEllipsis matches three or more periods, each optionally
// followed by a whitespace character (e.g., "...", ". . .").
var reLooksLikeEllipsis = regexp.MustCompile(`(?:\.\s?){2,}\.`)

// reEntities matches known entity names whose trailing punctuation should
// not be treated as a sentence boundary.
var reEntities = regexp.MustCompile(`Yahoo!`)
67
// newSentenceTokenizer builds the English-customized sentence tokenizer.
// When s is nil, the pre-trained English model bundled with the sentences
// package is loaded; otherwise the provided storage is used as-is.
func newSentenceTokenizer(s *sentences.Storage) (*sentences.DefaultSentenceTokenizer, error) {
	training := s

	// Fall back to the bundled English training data.
	if training == nil {
		b, err := data.Asset("data/english.json")
		if err != nil {
			return nil, err
		}

		training, err = sentences.LoadTraining(b)
		if err != nil {
			return nil, err
		}
	}

	// Extra abbreviations not covered by the stock training data.
	abbrevs := []string{"sgt", "gov", "no", "mt"}
	for _, abbr := range abbrevs {
		training.AbbrevTypes.Add(abbr)
	}

	lang := sentences.NewPunctStrings()
	word := newWordTokenizer(lang)
	annotations := sentences.NewAnnotations(training, lang, word)

	// Orthographic context consulted by the custom annotator below.
	ortho := &sentences.OrthoContext{
		Storage:      training,
		PunctStrings: lang,
		TokenType:    word,
		TokenFirst:   word,
	}

	// Our custom abbreviation/ellipsis annotator; appended after the default
	// annotations so it runs last.
	multiPunct := &multiPunctWordAnnotation{
		Storage:      training,
		TokenParser:  word,
		TokenGrouper: &sentences.DefaultTokenGrouper{},
		Ortho:        ortho,
	}

	annotations = append(annotations, multiPunct)

	tokenizer := &sentences.DefaultSentenceTokenizer{
		Storage:       training,
		PunctStrings:  lang,
		WordTokenizer: word,
		Annotations:   annotations,
	}

	return tokenizer, nil
}
119
120func newWordTokenizer(p sentences.PunctStrings) *wordTokenizer {
121	word := &wordTokenizer{}
122	word.PunctStrings = p
123
124	return word
125}
126
127func (e *wordTokenizer) HasSentEndChars(t *sentences.Token) bool {
128	enders := []string{
129		`."`, `.)`, `.’`, `.”`,
130		`?`, `?"`, `?'`, `?)`, `?’`, `?”`,
131		`!`, `!"`, `!'`, `!)`, `!’`, `!”`,
132	}
133
134	for _, ender := range enders {
135		if strings.HasSuffix(t.Tok, ender) && !reEntities.MatchString(t.Tok) {
136			return true
137		}
138	}
139
140	parens := []string{
141		`.[`, `.(`, `."`,
142		`?[`, `?(`,
143		`![`, `!(`,
144	}
145
146	for _, paren := range parens {
147		if strings.Contains(t.Tok, paren) {
148			return true
149		}
150	}
151
152	return false
153}
154
// multiPunctWordAnnotation attempts to tease out custom abbreviations such
// as "F.B.I.". It composes the Punkt storage plus the token-parsing,
// token-grouping, and orthographic-heuristic components it needs.
type multiPunctWordAnnotation struct {
	*sentences.Storage
	sentences.TokenParser
	sentences.TokenGrouper
	sentences.Ortho
}
163
164func (a *multiPunctWordAnnotation) Annotate(tokens []*sentences.Token) []*sentences.Token {
165	for _, tokPair := range a.TokenGrouper.Group(tokens) {
166		if len(tokPair) < 2 || tokPair[1] == nil {
167			tok := tokPair[0].Tok
168			if strings.Contains(tok, "\n") && strings.Contains(tok, " ") {
169				// We've mislabeled due to an errant newline.
170				tokPair[0].SentBreak = false
171			}
172			continue
173		}
174
175		a.tokenAnnotation(tokPair[0], tokPair[1])
176	}
177
178	return tokens
179}
180
// looksInternal reports whether tok ends with punctuation that can occur
// sentence-internally, such as a closing parenthesis or quotation mark.
func looksInternal(tok string) bool {
	switch {
	case strings.HasSuffix(tok, ")"),
		strings.HasSuffix(tok, "’"),
		strings.HasSuffix(tok, "”"),
		strings.HasSuffix(tok, `"`),
		strings.HasSuffix(tok, "'"):
		return true
	default:
		return false
	}
}
192
// tokenAnnotation inspects an adjacent token pair and adjusts tokOne's
// sentence-break and abbreviation flags using ellipsis, sentence-internal
// punctuation, and custom-abbreviation heuristics.
func (a *multiPunctWordAnnotation) tokenAnnotation(tokOne, tokTwo *sentences.Token) {
	// This is an expensive calculation, so we only want to do it once.
	var nextTyp string

	// If both tokOne and tokTwo are periods, we're probably in an ellipsis
	// that wasn't properly tokenized by `WordTokenizer`.
	if strings.HasSuffix(tokOne.Tok, ".") && tokTwo.Tok == "." {
		tokOne.SentBreak = false
		tokTwo.SentBreak = false
		return
	}

	isNonBreak := strings.HasSuffix(tokOne.Tok, ".") && !tokOne.SentBreak
	isEllipsis := reLooksLikeEllipsis.MatchString(tokOne.Tok)
	isInternal := tokOne.SentBreak && looksInternal(tokOne.Tok)

	if isNonBreak || isEllipsis || isInternal {
		nextTyp = a.TokenParser.TypeNoSentPeriod(tokTwo)
		isStarter := a.SentStarters[nextTyp]

		// If the tokOne looks like an ellipsis and tokTwo is either
		// capitalized or a frequent sentence starter, break the sentence.
		if isEllipsis {
			if a.TokenParser.FirstUpper(tokTwo) || isStarter != 0 {
				tokOne.SentBreak = true
				return
			}
		}

		// If the tokOne's sentence-breaking punctuation looks like it could
		// occur sentence-internally, ensure that the following word is either
		// capitalized or a frequent sentence starter.
		if isInternal {
			if a.TokenParser.FirstLower(tokTwo) && isStarter == 0 {
				tokOne.SentBreak = false
				return
			}
		}

		// If the tokOne ends with a period but isn't marked as a sentence
		// break, mark it if tokTwo is capitalized and can occur in _ORTHO_LC.
		// NOTE(review): 112 is a bitmask over the ortho-context flags in the
		// sentences package — presumably the _ORTHO_LC-related bits; confirm
		// against that package's constants.
		if isNonBreak && a.TokenParser.FirstUpper(tokTwo) {
			if a.Storage.OrthoContext[nextTyp]&112 != 0 {
				tokOne.SentBreak = true
			}
		}
	}

	// From here on we only care about tokens containing a dotted
	// abbreviation (e.g., "F.B.I.") per reAbbr.
	if len(reAbbr.FindAllString(tokOne.Tok, 1)) == 0 {
		return
	}

	// Skip initials — presumably handled by the stock Punkt annotations.
	if a.IsInitial(tokOne) {
		return
	}

	tokOne.Abbr = true
	tokOne.SentBreak = false

	// [4.1.1. Orthographic Heuristic] Check if there's
	// orthographic evidence about whether the next word
	// starts a sentence or not.
	isSentStarter := a.Ortho.Heuristic(tokTwo)
	if isSentStarter == 1 {
		tokOne.SentBreak = true
		return
	}

	// nextTyp is only computed above when one of the earlier heuristics
	// applied, so compute it lazily here if it is still unset.
	if nextTyp == "" {
		nextTyp = a.TokenParser.TypeNoSentPeriod(tokTwo)
	}

	// [4.1.3. Frequent Sentence Starter Heuristic] If the
	// next word is capitalized, and is a member of the
	// frequent-sentence-starters list, then label tok as a
	// sentence break.
	if a.TokenParser.FirstUpper(tokTwo) && a.SentStarters[nextTyp] != 0 {
		tokOne.SentBreak = true
		return
	}
}
274