1 // Copyright (c) 2001, Dr Martin Porter 2 // Copyright (c) 2002, Richard Boulton 3 // Copyright (c) 2015, Cesar Souza 4 // Copyright (c) 2018, Olly Betts 5 // All rights reserved. 6 // 7 // Redistribution and use in source and binary forms, with or without 8 // modification, are permitted provided that the following conditions are met: 9 // 10 // * Redistributions of source code must retain the above copyright notice, 11 // * this list of conditions and the following disclaimer. 12 // * Redistributions in binary form must reproduce the above copyright 13 // * notice, this list of conditions and the following disclaimer in the 14 // * documentation and/or other materials provided with the distribution. 15 // * Neither the name of the copyright holders nor the names of its contributors 16 // * may be used to endorse or promote products derived from this software 17 // * without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE 23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 30 namespace Snowball 31 { 32 using System; 33 using System.Linq; 34 using System.Text; 35 36 /// <summary> 37 /// Class holding current state. 38 /// </summary> 39 /// 40 public class Env 41 { 42 /// <summary> 43 /// Initializes a new instance of the <see cref="Env"/> class. 44 /// </summary> 45 /// Env()46 protected Env() 47 { 48 } 49 50 /// <summary> 51 /// Gets the current string. 52 /// </summary> 53 /// 54 protected StringBuilder current; 55 56 /// <summary> 57 /// Current cursor position. 58 /// </summary> 59 /// 60 protected int cursor; 61 62 /// <summary> 63 /// Forward limit for inspecting the buffer. 64 /// </summary> 65 /// 66 protected int limit; 67 68 /// <summary> 69 /// Backward limit for inspecting the buffer. 70 /// </summary> 71 /// 72 protected int limit_backward; 73 74 /// <summary> 75 /// Starting bracket position. 76 /// </summary> 77 /// 78 protected int bra; 79 80 /// <summary> 81 /// Ending bracket position. 82 /// </summary> 83 /// 84 protected int ket; 85 86 /// <summary> 87 /// Copy another Env object. 88 /// </summary> 89 /// Env(Env other)90 public Env(Env other) 91 { 92 copy_from(other); 93 } 94 95 /// <summary> 96 /// Copy another Env object. 97 /// </summary> 98 /// copy_from(Env other)99 protected void copy_from(Env other) 100 { 101 current = other.current; 102 cursor = other.cursor; 103 limit = other.limit; 104 limit_backward = other.limit_backward; 105 bra = other.bra; 106 ket = other.ket; 107 } 108 } 109 110 111 /// <summary> 112 /// Base class for Snowball's stemmer algorithms. 113 /// </summary> 114 /// 115 public abstract class Stemmer : Env 116 { 117 /// <summary> 118 /// Initializes a new instance of the <see cref="Stemmer"/> class. 119 /// </summary> 120 /// Stemmer()121 protected Stemmer() 122 { 123 current = new StringBuilder(); 124 setBufferContents(""); 125 } 126 127 128 /// <summary> 129 /// Calls the stemmer to process the next word. 130 /// </summary> 131 /// stem()132 protected abstract bool stem(); 133 134 135 /// <summary> 136 /// Stems the buffer's contents. 137 /// </summary> 138 /// Stem()139 public bool Stem() 140 { 141 return this.stem(); 142 } 143 144 /// <summary> 145 /// Stems a given word. 146 /// </summary> 147 /// 148 /// <param name="word">The word to be stemmed.</param> 149 /// 150 /// <returns>The stemmed word.</returns> 151 /// Stem(string word)152 public string Stem(string word) 153 { 154 setBufferContents(word); 155 this.stem(); 156 return current.ToString(); 157 } 158 159 160 /// <summary> 161 /// Gets the current processing buffer. 162 /// </summary> 163 /// 164 public StringBuilder Buffer 165 { 166 get { return current; } 167 } 168 169 /// <summary> 170 /// Gets or sets the current word to be stemmed 171 /// or the stemmed word, if the stemmer has been 172 /// processed. 173 /// </summary> 174 /// 175 public string Current 176 { 177 get { return current.ToString(); } 178 set { setBufferContents(value); } 179 } 180 setBufferContents(string value)181 private void setBufferContents(string value) 182 { 183 current.Clear(); 184 current.Insert(0, value); 185 186 cursor = 0; 187 limit = current.Length; 188 limit_backward = 0; 189 bra = cursor; 190 ket = limit; 191 } 192 193 194 /// <summary> 195 /// Determines whether the current character is 196 /// inside a given group of characters <c>s</c>. 197 /// </summary> in_grouping(string s, int min, int max, bool repeat)198 protected int in_grouping(string s, int min, int max, bool repeat) 199 { 200 do 201 { 202 if (cursor >= limit) 203 return -1; 204 205 char ch = current[cursor]; 206 if (ch > max || ch < min) 207 return 1; 208 209 if (!s.Contains(ch)) 210 return 1; 211 212 cursor++; 213 } 214 while (repeat); 215 216 return 0; 217 } 218 219 /// <summary> 220 /// Determines whether the current character is 221 /// inside a given group of characters <c>s</c>. 222 /// </summary> in_grouping_b(string s, int min, int max, bool repeat)223 protected int in_grouping_b(string s, int min, int max, bool repeat) 224 { 225 do 226 { 227 if (cursor <= limit_backward) 228 return -1; 229 230 char ch = current[cursor - 1]; 231 if (ch > max || ch < min) 232 return 1; 233 234 if (!s.Contains(ch)) 235 return 1; 236 237 cursor--; 238 } while (repeat); 239 240 return 0; 241 } 242 243 /// <summary> 244 /// Determines whether the current character is 245 /// outside a given group of characters <c>s</c>. 246 /// </summary> out_grouping(string s, int min, int max, bool repeat)247 protected int out_grouping(string s, int min, int max, bool repeat) 248 { 249 do 250 { 251 if (cursor >= limit) 252 return -1; 253 254 char ch = current[cursor]; 255 if (ch > max || ch < min) 256 { 257 cursor++; 258 continue; 259 } 260 261 if (!s.Contains(ch)) 262 { 263 cursor++; 264 continue; 265 } 266 267 return 1; 268 269 } while (repeat); 270 271 return 0; 272 } 273 274 /// <summary> 275 /// Determines whether the current character is 276 /// outside a given group of characters <c>s</c>. 277 /// </summary> out_grouping_b(string s, int min, int max, bool repeat)278 protected int out_grouping_b(string s, int min, int max, bool repeat) 279 { 280 do 281 { 282 if (cursor <= limit_backward) 283 return -1; 284 285 char ch = current[cursor - 1]; 286 if (ch > max || ch < min) 287 { 288 cursor--; 289 continue; 290 } 291 292 if (!s.Contains(ch)) 293 { 294 cursor--; 295 continue; 296 } 297 298 return 1; 299 } 300 while (repeat); 301 302 return 0; 303 } 304 305 306 /// <summary> 307 /// Determines if the current buffer contains the 308 /// string s, starting from the current position and 309 /// going forward. 310 /// </summary> eq_s(String s)311 protected bool eq_s(String s) 312 { 313 if (limit - cursor < s.Length) 314 return false; 315 316 for (int i = 0; i != s.Length; i++) 317 { 318 if (current[cursor + i] != s[i]) 319 return false; 320 } 321 322 cursor += s.Length; 323 return true; 324 } 325 326 /// <summary> 327 /// Determines if the current buffer contains the 328 /// string s, starting from the current position and 329 /// going backwards. 330 /// </summary> eq_s_b(String s)331 protected bool eq_s_b(String s) 332 { 333 if (cursor - limit_backward < s.Length) 334 return false; 335 336 for (int i = 0; i != s.Length; i++) 337 { 338 if (current[cursor - s.Length + i] != s[i]) 339 return false; 340 } 341 342 cursor -= s.Length; 343 return true; 344 } 345 346 /// <summary> 347 /// Determines if the current buffer contains the 348 /// string s, starting from the current position and 349 /// going backwards. 350 /// </summary> eq_s_b(StringBuilder s)351 protected bool eq_s_b(StringBuilder s) 352 { 353 if (cursor - limit_backward < s.Length) 354 return false; 355 356 for (int i = 0; i != s.Length; i++) 357 { 358 if (current[cursor - s.Length + i] != s[i]) 359 return false; 360 } 361 362 cursor -= s.Length; 363 return true; 364 } 365 366 367 /// <summary> 368 /// Searches if the current buffer matches against one of the 369 /// amongs, starting from the current cursor position and going 370 /// forward. 371 /// </summary> 372 /// find_among(Among[] v)373 protected int find_among(Among[] v) 374 { 375 int i = 0; 376 int j = v.Length; 377 378 int c = cursor; 379 int l = limit; 380 381 int common_i = 0; 382 int common_j = 0; 383 384 bool first_key_inspected = false; 385 386 while (true) 387 { 388 int k = i + ((j - i) >> 1); 389 int diff = 0; 390 int common = common_i < common_j ? common_i : common_j; // smaller 391 392 Among w = v[k]; 393 394 for (int i2 = common; i2 < w.SearchString.Length; i2++) 395 { 396 if (c + common == l) 397 { 398 diff = -1; 399 break; 400 } 401 402 diff = current[c + common] - w.SearchString[i2]; 403 404 if (diff != 0) 405 break; 406 407 common++; 408 } 409 410 if (diff < 0) 411 { 412 j = k; 413 common_j = common; 414 } 415 else 416 { 417 i = k; 418 common_i = common; 419 } 420 421 if (j - i <= 1) 422 { 423 if (i > 0) 424 break; // v->s has been inspected 425 426 if (j == i) 427 break; // only one item in v 428 429 // - but now we need to go round once more to get 430 // v->s inspected. This looks messy, but is actually 431 // the optimal approach. 432 433 if (first_key_inspected) 434 break; 435 436 first_key_inspected = true; 437 } 438 } 439 440 while (true) 441 { 442 Among w = v[i]; 443 444 if (common_i >= w.SearchString.Length) 445 { 446 cursor = c + w.SearchString.Length; 447 448 if (w.Action == null) 449 return w.Result; 450 451 bool res = w.Action(); 452 cursor = c + w.SearchString.Length; 453 454 if (res) 455 return w.Result; 456 } 457 458 i = w.MatchIndex; 459 460 if (i < 0) 461 return 0; 462 } 463 } 464 465 /// <summary> 466 /// Searches if the current buffer matches against one of the 467 /// amongs, starting from the current cursor position and going 468 /// backwards. 469 /// </summary> 470 /// find_among_b(Among[] v)471 protected int find_among_b(Among[] v) 472 { 473 int i = 0; 474 int j = v.Length; 475 476 int c = cursor; 477 int lb = limit_backward; 478 479 int common_i = 0; 480 int common_j = 0; 481 482 bool first_key_inspected = false; 483 484 while (true) 485 { 486 int k = i + ((j - i) >> 1); 487 int diff = 0; 488 int common = common_i < common_j ? common_i : common_j; 489 Among w = v[k]; 490 491 for (int i2 = w.SearchString.Length - 1 - common; i2 >= 0; i2--) 492 { 493 if (c - common == lb) 494 { 495 diff = -1; 496 break; 497 } 498 499 diff = current[c - 1 - common] - w.SearchString[i2]; 500 501 if (diff != 0) 502 break; 503 504 common++; 505 } 506 507 if (diff < 0) 508 { 509 j = k; 510 common_j = common; 511 } 512 else 513 { 514 i = k; 515 common_i = common; 516 } 517 518 if (j - i <= 1) 519 { 520 if (i > 0) 521 break; 522 523 if (j == i) 524 break; 525 526 if (first_key_inspected) 527 break; 528 529 first_key_inspected = true; 530 } 531 } 532 533 while (true) 534 { 535 Among w = v[i]; 536 537 if (common_i >= w.SearchString.Length) 538 { 539 cursor = c - w.SearchString.Length; 540 541 if (w.Action == null) 542 return w.Result; 543 544 bool res = w.Action(); 545 cursor = c - w.SearchString.Length; 546 547 if (res) 548 return w.Result; 549 } 550 551 i = w.MatchIndex; 552 553 if (i < 0) 554 return 0; 555 } 556 } 557 558 /// <summary> 559 /// Replaces the characters between <c>c_bra</c> 560 /// and <c>c_ket</c> by the characters in s. 561 /// </summary> 562 /// replace_s(int c_bra, int c_ket, String s)563 protected int replace_s(int c_bra, int c_ket, String s) 564 { 565 int adjustment = s.Length - (c_ket - c_bra); 566 Replace(current, c_bra, c_ket, s); 567 limit += adjustment; 568 if (cursor >= c_ket) 569 cursor += adjustment; 570 else if (cursor > c_bra) 571 cursor = c_bra; 572 return adjustment; 573 } 574 575 /// <summary> 576 /// Checks if a slicing can be done. 577 /// </summary> slice_check()578 protected void slice_check() 579 { 580 if (bra < 0 || bra > ket || ket > limit || limit > current.Length) 581 { 582 System.Diagnostics.Trace.WriteLine("faulty slice operation"); 583 } 584 } 585 586 /// <summary> 587 /// Replaces the contents of the bracket with the string s. 588 /// </summary> 589 /// 590 /// <param name="s">The s.</param> slice_from(String s)591 protected void slice_from(String s) 592 { 593 slice_check(); 594 replace_s(bra, ket, s); 595 } 596 597 /// <summary> 598 /// Removes the current bracket contents. 599 /// </summary> 600 /// slice_del()601 protected void slice_del() 602 { 603 slice_from(""); 604 } 605 606 /// <summary> 607 /// Replaces the contents of the bracket with the string s. 608 /// </summary> 609 /// insert(int c_bra, int c_ket, String s)610 protected void insert(int c_bra, int c_ket, String s) 611 { 612 int adjustment = replace_s(c_bra, c_ket, s); 613 if (c_bra <= bra) bra += adjustment; 614 if (c_bra <= ket) ket += adjustment; 615 } 616 617 /// <summary> 618 /// Replaces the contents of the bracket with the string s. 619 /// </summary> 620 /// insert(int c_bra, int c_ket, StringBuilder s)621 protected void insert(int c_bra, int c_ket, StringBuilder s) 622 { 623 int adjustment = replace_s(c_bra, c_ket, s.ToString()); 624 if (c_bra <= bra) bra += adjustment; 625 if (c_bra <= ket) ket += adjustment; 626 } 627 628 /// <summary> 629 /// Replaces the contents of the bracket with the string s. 630 /// </summary> 631 /// slice_to(StringBuilder s)632 protected void slice_to(StringBuilder s) 633 { 634 slice_check(); 635 Replace(s, 0, s.Length, current.ToString(bra, ket - bra)); 636 } 637 638 /// <summary> 639 /// Replaces the contents of the bracket with the string s. 640 /// </summary> 641 /// assign_to(StringBuilder s)642 protected void assign_to(StringBuilder s) 643 { 644 Replace(s, 0, s.Length, current.ToString(0, limit)); 645 } 646 647 648 649 /// <summary> 650 /// Replaces a specific region of the buffer with another text. 651 /// </summary> Replace(StringBuilder sb, int index, int length, string text)652 public static StringBuilder Replace(StringBuilder sb, int index, int length, string text) 653 { 654 sb.Remove(index, length - index); 655 sb.Insert(index, text); 656 return sb; 657 } 658 659 } 660 } 661