1package en
2
3import (
4	"github.com/blevesearch/bleve/analysis"
5	"github.com/blevesearch/bleve/registry"
6)
7
// StopName is the name under which this package registers the English stop
// word token map.
const StopName = "stop_en"
9
// EnglishStopWords is the built-in English stop word list that
// TokenMapConstructor loads into the "stop_en" token map.
//
// The list uses the Snowball stop word format described in its own header:
// comments begin with a vertical bar and each stop word is at the start of a
// line. Entries prefixed with a vertical bar are therefore commented out.
//
// The content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// Backticks (`) in the upstream text were changed to apostrophes (') so that
// the list can be embedded in a Go raw string literal.
var EnglishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/english/stop.txt
 | This file is distributed under the BSD License.
 | See http://snowball.tartarus.org/license.php
 | Also see http://www.opensource.org/licenses/bsd-license.html
 |  - Encoding was converted to UTF-8.
 |  - This notice was added.
 |
 | NOTE: To use this file with StopFilterFactory, you must specify format="snowball"

 | An English stop word list. Comments begin with vertical bar. Each stop
 | word is at the start of a line.

 | Many of the forms below are quite rare (e.g. "yourselves") but included for
 |  completeness.

           | PRONOUNS FORMS
             | 1st person sing

i              | subject, always in upper case of course

me             | object
my             | possessive adjective
               | the possessive pronoun 'mine' is best suppressed, because of the
               | sense of coal-mine etc.
myself         | reflexive
             | 1st person plural
we             | subject

| us           | object
               | care is required here because US = United States. It is usually
               | safe to remove it if it is in lower case.
our            | possessive adjective
ours           | possessive pronoun
ourselves      | reflexive
             | second person (archaic 'thou' forms not included)
you            | subject and object
your           | possessive adjective
yours          | possessive pronoun
yourself       | reflexive (singular)
yourselves     | reflexive (plural)
             | third person singular
he             | subject
him            | object
his            | possessive adjective and pronoun
himself        | reflexive

she            | subject
her            | object and possessive adjective
hers           | possessive pronoun
herself        | reflexive

it             | subject and object
its            | possessive adjective
itself         | reflexive
             | third person plural
they           | subject
them           | object
their          | possessive adjective
theirs         | possessive pronoun
themselves     | reflexive
             | other forms (demonstratives, interrogatives)
what
which
who
whom
this
that
these
those

           | VERB FORMS (using F.R. Palmer's nomenclature)
             | BE
am             | 1st person, present
is             | -s form (3rd person, present)
are            | present
was            | 1st person, past
were           | past
be             | infinitive
been           | past participle
being          | -ing form
             | HAVE
have           | simple
has            | -s form
had            | past
having         | -ing form
             | DO
do             | simple
does           | -s form
did            | past
doing          | -ing form

 | The forms below are, I believe, best omitted, because of the significant
 | homonym forms:

 |  He made a WILL
 |  old tin CAN
 |  merry month of MAY
 |  a smell of MUST
 |  fight the good fight with all thy MIGHT

 | would, could, should, ought might however be included

 |          | AUXILIARIES
 |            | WILL
 |will

would

 |            | SHALL
 |shall

should

 |            | CAN
 |can

could

 |            | MAY
 |may
 |might
 |            | MUST
 |must
 |            | OUGHT

ought

           | COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing
              | pronoun + verb

i'm
you're
he's
she's
it's
we're
they're
i've
you've
we've
they've
i'd
you'd
he'd
she'd
we'd
they'd
i'll
you'll
he'll
she'll
we'll
they'll

              | verb + negation

isn't
aren't
wasn't
weren't
hasn't
haven't
hadn't
doesn't
don't
didn't

              | auxiliary + negation

won't
wouldn't
shan't
shouldn't
can't
cannot
couldn't
mustn't

             | miscellaneous forms

let's
that's
who's
what's
here's
there's
when's
where's
why's
how's

              | rarer forms

 | daren't needn't

              | doubtful forms

 | oughtn't mightn't

           | ARTICLES
a
an
the

           | THE REST (Overlap among prepositions, conjunctions, adverbs etc is so
           | high, that classification is pointless.)
and
but
if
or
because
as
until
while

of
at
by
for
with
about
against
between
into
through
during
before
after
above
below
to
from
up
down
in
out
on
off
over
under

again
further
then
once

here
there
when
where
why
how

all
any
both
each
few
more
most
other
some
such

no
nor
not
only
own
same
so
than
too
very

 | Just for the record, the following words are among the commonest in English

    | one
    | every
    | least
    | less
    | many
    | now
    | ever
    | never
    | say
    | says
    | said
    | also
    | get
    | go
    | goes
    | just
    | made
    | make
    | put
    | see
    | seen
    | whether
    | like
    | well
    | back
    | even
    | still
    | way
    | take
    | since
    | another
    | however
    | two
    | three
    | four
    | five
    | first
    | second
    | new
    | old
    | high
    | long
`)
335
336func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
337	rv := analysis.NewTokenMap()
338	err := rv.LoadBytes(EnglishStopWords)
339	return rv, err
340}
341
342func init() {
343	registry.RegisterTokenMap(StopName, TokenMapConstructor)
344}
345