1 // Copyright (c) 2001, Dr Martin Porter
2 // Copyright (c) 2002, Richard Boulton
3 // Copyright (c) 2015, Cesar Souza
4 // Copyright (c) 2018, Olly Betts
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright notice,
11 //     * this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //     * notice, this list of conditions and the following disclaimer in the
14 //     * documentation and/or other materials provided with the distribution.
15 //     * Neither the name of the copyright holders nor the names of its contributors
16 //     * may be used to endorse or promote products derived from this software
17 //     * without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 namespace Snowball
31 {
32     using System;
33     using System.Linq;
34     using System.Text;
35 
36     /// <summary>
37     ///   Class holding current state.
38     /// </summary>
39     ///
40     public class Env
41     {
42         /// <summary>
43         ///   Initializes a new instance of the <see cref="Env"/> class.
44         /// </summary>
45         ///
Env()46         protected Env()
47         {
48         }
49 
50         /// <summary>
51         ///   Gets the current string.
52         /// </summary>
53         ///
54         protected StringBuilder current;
55 
56         /// <summary>
57         ///   Current cursor position.
58         /// </summary>
59         ///
60         protected int cursor;
61 
62         /// <summary>
63         ///   Forward limit for inspecting the buffer.
64         /// </summary>
65         ///
66         protected int limit;
67 
68         /// <summary>
69         ///   Backward limit for inspecting the buffer.
70         /// </summary>
71         ///
72         protected int limit_backward;
73 
74         /// <summary>
75         ///   Starting bracket position.
76         /// </summary>
77         ///
78         protected int bra;
79 
80         /// <summary>
81         ///   Ending bracket position.
82         /// </summary>
83         ///
84         protected int ket;
85 
86         /// <summary>
87         ///   Copy another Env object.
88         /// </summary>
89         ///
Env(Env other)90         public Env(Env other)
91         {
92             copy_from(other);
93         }
94 
95         /// <summary>
96         ///   Copy another Env object.
97         /// </summary>
98         ///
copy_from(Env other)99         protected void copy_from(Env other)
100         {
101             current          = other.current;
102             cursor           = other.cursor;
103             limit            = other.limit;
104             limit_backward   = other.limit_backward;
105             bra              = other.bra;
106             ket              = other.ket;
107         }
108     }
109 
110 
111     /// <summary>
112     ///   Base class for Snowball's stemmer algorithms.
113     /// </summary>
114     ///
115     public abstract class Stemmer : Env
116     {
117         /// <summary>
118         ///   Initializes a new instance of the <see cref="Stemmer"/> class.
119         /// </summary>
120         ///
Stemmer()121         protected Stemmer()
122         {
123             current = new StringBuilder();
124             setBufferContents("");
125         }
126 
127 
128         /// <summary>
129         ///   Calls the stemmer to process the next word.
130         /// </summary>
131         ///
stem()132         protected abstract bool stem();
133 
134 
135         /// <summary>
136         ///   Stems the buffer's contents.
137         /// </summary>
138         ///
Stem()139         public bool Stem()
140         {
141             return this.stem();
142         }
143 
144         /// <summary>
145         ///   Stems a given word.
146         /// </summary>
147         ///
148         /// <param name="word">The word to be stemmed.</param>
149         ///
150         /// <returns>The stemmed word.</returns>
151         ///
Stem(string word)152         public string Stem(string word)
153         {
154             setBufferContents(word);
155             this.stem();
156             return current.ToString();
157         }
158 
159 
160         /// <summary>
161         ///   Gets the current processing buffer.
162         /// </summary>
163         ///
164         public StringBuilder Buffer
165         {
166             get { return current; }
167         }
168 
169         /// <summary>
170         ///   Gets or sets the current word to be stemmed
171         ///   or the stemmed word, if the stemmer has been
172         ///   processed.
173         /// </summary>
174         ///
175         public string Current
176         {
177             get { return current.ToString(); }
178             set { setBufferContents(value); }
179         }
180 
setBufferContents(string value)181         private void setBufferContents(string value)
182         {
183             current.Clear();
184             current.Insert(0, value);
185 
186             cursor = 0;
187             limit = current.Length;
188             limit_backward = 0;
189             bra = cursor;
190             ket = limit;
191         }
192 
193 
194         /// <summary>
195         ///   Determines whether the current character is
196         ///   inside a given group of characters <c>s</c>.
197         /// </summary>
in_grouping(string s, int min, int max, bool repeat)198         protected int in_grouping(string s, int min, int max, bool repeat)
199         {
200             do
201             {
202                 if (cursor >= limit)
203                     return -1;
204 
205                 char ch = current[cursor];
206                 if (ch > max || ch < min)
207                     return 1;
208 
209                 if (!s.Contains(ch))
210                     return 1;
211 
212                 cursor++;
213             }
214             while (repeat);
215 
216             return 0;
217         }
218 
219         /// <summary>
220         ///   Determines whether the current character is
221         ///   inside a given group of characters <c>s</c>.
222         /// </summary>
in_grouping_b(string s, int min, int max, bool repeat)223         protected int in_grouping_b(string s, int min, int max, bool repeat)
224         {
225             do
226             {
227                 if (cursor <= limit_backward)
228                     return -1;
229 
230                 char ch = current[cursor - 1];
231                 if (ch > max || ch < min)
232                     return 1;
233 
234                 if (!s.Contains(ch))
235                     return 1;
236 
237                 cursor--;
238             } while (repeat);
239 
240             return 0;
241         }
242 
243         /// <summary>
244         ///   Determines whether the current character is
245         ///   outside a given group of characters <c>s</c>.
246         /// </summary>
out_grouping(string s, int min, int max, bool repeat)247         protected int out_grouping(string s, int min, int max, bool repeat)
248         {
249             do
250             {
251                 if (cursor >= limit)
252                     return -1;
253 
254                 char ch = current[cursor];
255                 if (ch > max || ch < min)
256                 {
257                     cursor++;
258                     continue;
259                 }
260 
261                 if (!s.Contains(ch))
262                 {
263                     cursor++;
264                     continue;
265                 }
266 
267                 return 1;
268 
269             } while (repeat);
270 
271             return 0;
272         }
273 
274         /// <summary>
275         ///   Determines whether the current character is
276         ///   outside a given group of characters <c>s</c>.
277         /// </summary>
out_grouping_b(string s, int min, int max, bool repeat)278         protected int out_grouping_b(string s, int min, int max, bool repeat)
279         {
280             do
281             {
282                 if (cursor <= limit_backward)
283                     return -1;
284 
285                 char ch = current[cursor - 1];
286                 if (ch > max || ch < min)
287                 {
288                     cursor--;
289                     continue;
290                 }
291 
292                 if (!s.Contains(ch))
293                 {
294                     cursor--;
295                     continue;
296                 }
297 
298                 return 1;
299             }
300             while (repeat);
301 
302             return 0;
303         }
304 
305 
306         /// <summary>
307         ///   Determines if the current buffer contains the
308         ///   string s, starting from the current position and
309         ///   going forward.
310         /// </summary>
eq_s(String s)311         protected bool eq_s(String s)
312         {
313             if (limit - cursor < s.Length)
314                 return false;
315 
316             for (int i = 0; i != s.Length; i++)
317             {
318                 if (current[cursor + i] != s[i])
319                     return false;
320             }
321 
322             cursor += s.Length;
323             return true;
324         }
325 
326         /// <summary>
327         ///   Determines if the current buffer contains the
328         ///   string s, starting from the current position and
329         ///   going backwards.
330         /// </summary>
eq_s_b(String s)331         protected bool eq_s_b(String s)
332         {
333             if (cursor - limit_backward < s.Length)
334                 return false;
335 
336             for (int i = 0; i != s.Length; i++)
337             {
338                 if (current[cursor - s.Length + i] != s[i])
339                     return false;
340             }
341 
342             cursor -= s.Length;
343             return true;
344         }
345 
346         /// <summary>
347         ///   Determines if the current buffer contains the
348         ///   string s, starting from the current position and
349         ///   going backwards.
350         /// </summary>
eq_s_b(StringBuilder s)351         protected bool eq_s_b(StringBuilder s)
352         {
353             if (cursor - limit_backward < s.Length)
354                 return false;
355 
356             for (int i = 0; i != s.Length; i++)
357             {
358                 if (current[cursor - s.Length + i] != s[i])
359                     return false;
360             }
361 
362             cursor -= s.Length;
363             return true;
364         }
365 
366 
367         /// <summary>
368         ///   Searches if the current buffer matches against one of the
369         ///   amongs, starting from the current cursor position and going
370         ///   forward.
371         /// </summary>
372         ///
find_among(Among[] v)373         protected int find_among(Among[] v)
374         {
375             int i = 0;
376             int j = v.Length;
377 
378             int c = cursor;
379             int l = limit;
380 
381             int common_i = 0;
382             int common_j = 0;
383 
384             bool first_key_inspected = false;
385 
386             while (true)
387             {
388                 int k = i + ((j - i) >> 1);
389                 int diff = 0;
390                 int common = common_i < common_j ? common_i : common_j; // smaller
391 
392                 Among w = v[k];
393 
394                 for (int i2 = common; i2 < w.SearchString.Length; i2++)
395                 {
396                     if (c + common == l)
397                     {
398                         diff = -1;
399                         break;
400                     }
401 
402                     diff = current[c + common] - w.SearchString[i2];
403 
404                     if (diff != 0)
405                         break;
406 
407                     common++;
408                 }
409 
410                 if (diff < 0)
411                 {
412                     j = k;
413                     common_j = common;
414                 }
415                 else
416                 {
417                     i = k;
418                     common_i = common;
419                 }
420 
421                 if (j - i <= 1)
422                 {
423                     if (i > 0)
424                         break; // v->s has been inspected
425 
426                     if (j == i)
427                         break; // only one item in v
428 
429                     // - but now we need to go round once more to get
430                     // v->s inspected. This looks messy, but is actually
431                     // the optimal approach.
432 
433                     if (first_key_inspected)
434                         break;
435 
436                     first_key_inspected = true;
437                 }
438             }
439 
440             while (true)
441             {
442                 Among w = v[i];
443 
444                 if (common_i >= w.SearchString.Length)
445                 {
446                     cursor = c + w.SearchString.Length;
447 
448                     if (w.Action == null)
449                         return w.Result;
450 
451                     bool res = w.Action();
452                     cursor = c + w.SearchString.Length;
453 
454                     if (res)
455                         return w.Result;
456                 }
457 
458                 i = w.MatchIndex;
459 
460                 if (i < 0)
461                     return 0;
462             }
463         }
464 
465         /// <summary>
466         ///   Searches if the current buffer matches against one of the
467         ///   amongs, starting from the current cursor position and going
468         ///   backwards.
469         /// </summary>
470         ///
find_among_b(Among[] v)471         protected int find_among_b(Among[] v)
472         {
473             int i = 0;
474             int j = v.Length;
475 
476             int c = cursor;
477             int lb = limit_backward;
478 
479             int common_i = 0;
480             int common_j = 0;
481 
482             bool first_key_inspected = false;
483 
484             while (true)
485             {
486                 int k = i + ((j - i) >> 1);
487                 int diff = 0;
488                 int common = common_i < common_j ? common_i : common_j;
489                 Among w = v[k];
490 
491                 for (int i2 = w.SearchString.Length - 1 - common; i2 >= 0; i2--)
492                 {
493                     if (c - common == lb)
494                     {
495                         diff = -1;
496                         break;
497                     }
498 
499                     diff = current[c - 1 - common] - w.SearchString[i2];
500 
501                     if (diff != 0)
502                         break;
503 
504                     common++;
505                 }
506 
507                 if (diff < 0)
508                 {
509                     j = k;
510                     common_j = common;
511                 }
512                 else
513                 {
514                     i = k;
515                     common_i = common;
516                 }
517 
518                 if (j - i <= 1)
519                 {
520                     if (i > 0)
521                         break;
522 
523                     if (j == i)
524                         break;
525 
526                     if (first_key_inspected)
527                         break;
528 
529                     first_key_inspected = true;
530                 }
531             }
532 
533             while (true)
534             {
535                 Among w = v[i];
536 
537                 if (common_i >= w.SearchString.Length)
538                 {
539                     cursor = c - w.SearchString.Length;
540 
541                     if (w.Action == null)
542                         return w.Result;
543 
544                     bool res = w.Action();
545                     cursor = c - w.SearchString.Length;
546 
547                     if (res)
548                         return w.Result;
549                 }
550 
551                 i = w.MatchIndex;
552 
553                 if (i < 0)
554                     return 0;
555             }
556         }
557 
558         /// <summary>
559         ///   Replaces the characters between <c>c_bra</c>
560         ///   and <c>c_ket</c> by the characters in s.
561         /// </summary>
562         ///
replace_s(int c_bra, int c_ket, String s)563         protected int replace_s(int c_bra, int c_ket, String s)
564         {
565             int adjustment = s.Length - (c_ket - c_bra);
566             Replace(current, c_bra, c_ket, s);
567             limit += adjustment;
568             if (cursor >= c_ket)
569                 cursor += adjustment;
570             else if (cursor > c_bra)
571                 cursor = c_bra;
572             return adjustment;
573         }
574 
575         /// <summary>
576         ///   Checks if a slicing can be done.
577         /// </summary>
slice_check()578         protected void slice_check()
579         {
580             if (bra < 0 || bra > ket || ket > limit || limit > current.Length)
581             {
582                 System.Diagnostics.Trace.WriteLine("faulty slice operation");
583             }
584         }
585 
586         /// <summary>
587         ///   Replaces the contents of the bracket with the string s.
588         /// </summary>
589         ///
590         /// <param name="s">The s.</param>
slice_from(String s)591         protected void slice_from(String s)
592         {
593             slice_check();
594             replace_s(bra, ket, s);
595         }
596 
597         /// <summary>
598         ///   Removes the current bracket contents.
599         /// </summary>
600         ///
slice_del()601         protected void slice_del()
602         {
603             slice_from("");
604         }
605 
606         /// <summary>
607         ///   Replaces the contents of the bracket with the string s.
608         /// </summary>
609         ///
insert(int c_bra, int c_ket, String s)610         protected void insert(int c_bra, int c_ket, String s)
611         {
612             int adjustment = replace_s(c_bra, c_ket, s);
613             if (c_bra <= bra) bra += adjustment;
614             if (c_bra <= ket) ket += adjustment;
615         }
616 
617         /// <summary>
618         ///   Replaces the contents of the bracket with the string s.
619         /// </summary>
620         ///
insert(int c_bra, int c_ket, StringBuilder s)621         protected void insert(int c_bra, int c_ket, StringBuilder s)
622         {
623             int adjustment = replace_s(c_bra, c_ket, s.ToString());
624             if (c_bra <= bra) bra += adjustment;
625             if (c_bra <= ket) ket += adjustment;
626         }
627 
628         /// <summary>
629         ///   Replaces the contents of the bracket with the string s.
630         /// </summary>
631         ///
slice_to(StringBuilder s)632         protected void slice_to(StringBuilder s)
633         {
634             slice_check();
635             Replace(s, 0, s.Length, current.ToString(bra, ket - bra));
636         }
637 
638         /// <summary>
639         ///   Replaces the contents of the bracket with the string s.
640         /// </summary>
641         ///
assign_to(StringBuilder s)642         protected void assign_to(StringBuilder s)
643         {
644             Replace(s, 0, s.Length, current.ToString(0, limit));
645         }
646 
647 
648 
649         /// <summary>
650         ///   Replaces a specific region of the buffer with another text.
651         /// </summary>
Replace(StringBuilder sb, int index, int length, string text)652         public static StringBuilder Replace(StringBuilder sb, int index, int length, string text)
653         {
654             sb.Remove(index, length - index);
655             sb.Insert(index, text);
656             return sb;
657         }
658 
659     }
660 }
661