1package en 2 3import ( 4 "github.com/blevesearch/bleve/analysis" 5 "github.com/blevesearch/bleve/registry" 6) 7 8const StopName = "stop_en" 9 10// EnglishStopWords is the built-in list of stopwords used by the "stop_en" TokenFilter. 11// 12// this content was obtained from: 13// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ 14// ` was changed to ' to allow for literal string 15var EnglishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/english/stop.txt 16 | This file is distributed under the BSD License. 17 | See http://snowball.tartarus.org/license.php 18 | Also see http://www.opensource.org/licenses/bsd-license.html 19 | - Encoding was converted to UTF-8. 20 | - This notice was added. 21 | 22 | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" 23 24 | An English stop word list. Comments begin with vertical bar. Each stop 25 | word is at the start of a line. 26 27 | Many of the forms below are quite rare (e.g. "yourselves") but included for 28 | completeness. 29 30 | PRONOUNS FORMS 31 | 1st person sing 32 33i | subject, always in upper case of course 34 35me | object 36my | possessive adjective 37 | the possessive pronoun 'mine' is best suppressed, because of the 38 | sense of coal-mine etc. 39myself | reflexive 40 | 1st person plural 41we | subject 42 43| us | object 44 | care is required here because US = United States. It is usually 45 | safe to remove it if it is in lower case. 46our | possessive adjective 47ours | possessive pronoun 48ourselves | reflexive 49 | second person (archaic 'thou' forms not included) 50you | subject and object 51your | possessive adjective 52yours | possessive pronoun 53yourself | reflexive (singular) 54yourselves | reflexive (plural) 55 | third person singular 56he | subject 57him | object 58his | possessive adjective and pronoun 59himself | reflexive 60 61she | subject 62her | object and possessive adjective 63hers | possessive pronoun 64herself | reflexive 65 66it | subject and object 67its | possessive adjective 68itself | reflexive 69 | third person plural 70they | subject 71them | object 72their | possessive adjective 73theirs | possessive pronoun 74themselves | reflexive 75 | other forms (demonstratives, interrogatives) 76what 77which 78who 79whom 80this 81that 82these 83those 84 85 | VERB FORMS (using F.R. Palmer's nomenclature) 86 | BE 87am | 1st person, present 88is | -s form (3rd person, present) 89are | present 90was | 1st person, past 91were | past 92be | infinitive 93been | past participle 94being | -ing form 95 | HAVE 96have | simple 97has | -s form 98had | past 99having | -ing form 100 | DO 101do | simple 102does | -s form 103did | past 104doing | -ing form 105 106 | The forms below are, I believe, best omitted, because of the significant 107 | homonym forms: 108 109 | He made a WILL 110 | old tin CAN 111 | merry month of MAY 112 | a smell of MUST 113 | fight the good fight with all thy MIGHT 114 115 | would, could, should, ought might however be included 116 117 | | AUXILIARIES 118 | | WILL 119 |will 120 121would 122 123 | | SHALL 124 |shall 125 126should 127 128 | | CAN 129 |can 130 131could 132 133 | | MAY 134 |may 135 |might 136 | | MUST 137 |must 138 | | OUGHT 139 140ought 141 142 | COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing 143 | pronoun + verb 144 145i'm 146you're 147he's 148she's 149it's 150we're 151they're 152i've 153you've 154we've 155they've 156i'd 157you'd 158he'd 159she'd 160we'd 161they'd 162i'll 163you'll 164he'll 165she'll 166we'll 167they'll 168 169 | verb + negation 170 171isn't 172aren't 173wasn't 174weren't 175hasn't 176haven't 177hadn't 178doesn't 179don't 180didn't 181 182 | auxiliary + negation 183 184won't 185wouldn't 186shan't 187shouldn't 188can't 189cannot 190couldn't 191mustn't 192 193 | miscellaneous forms 194 195let's 196that's 197who's 198what's 199here's 200there's 201when's 202where's 203why's 204how's 205 206 | rarer forms 207 208 | daren't needn't 209 210 | doubtful forms 211 212 | oughtn't mightn't 213 214 | ARTICLES 215a 216an 217the 218 219 | THE REST (Overlap among prepositions, conjunctions, adverbs etc is so 220 | high, that classification is pointless.) 221and 222but 223if 224or 225because 226as 227until 228while 229 230of 231at 232by 233for 234with 235about 236against 237between 238into 239through 240during 241before 242after 243above 244below 245to 246from 247up 248down 249in 250out 251on 252off 253over 254under 255 256again 257further 258then 259once 260 261here 262there 263when 264where 265why 266how 267 268all 269any 270both 271each 272few 273more 274most 275other 276some 277such 278 279no 280nor 281not 282only 283own 284same 285so 286than 287too 288very 289 290 | Just for the record, the following words are among the commonest in English 291 292 | one 293 | every 294 | least 295 | less 296 | many 297 | now 298 | ever 299 | never 300 | say 301 | says 302 | said 303 | also 304 | get 305 | go 306 | goes 307 | just 308 | made 309 | make 310 | put 311 | see 312 | seen 313 | whether 314 | like 315 | well 316 | back 317 | even 318 | still 319 | way 320 | take 321 | since 322 | another 323 | however 324 | two 325 | three 326 | four 327 | five 328 | first 329 | second 330 | new 331 | old 332 | high 333 | long 334`) 335 336func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { 337 rv := analysis.NewTokenMap() 338 err := rv.LoadBytes(EnglishStopWords) 339 return rv, err 340} 341 342func init() { 343 registry.RegisterTokenMap(StopName, TokenMapConstructor) 344} 345