//
// $Id$
//

5#undef LOC_TABLE_ENTRY
6#undef LOC_TABLE_INDEX
7#define LOC_TABLE_ENTRY		LOC_PREFIX(stem_table_entry_)
8#define LOC_TABLE_INDEX		LOC_PREFIX(stem_table_index_)
9
10
11struct LOC_TABLE_ENTRY
12{
13	LOC_CHAR_TYPE	suffix[8];
14	int				remove, len;
15};
16
17
18struct LOC_TABLE_INDEX
19{
20	LOC_CHAR_TYPE	first;
21	int				count;
22};
23
24
25// TableStringN, where N is a number of chars
26#undef TS1
27#undef TS2
28#undef TS3
29#undef TS4
30#undef TS5
31#define TS1(c1) { RUS::c1 }
32#define TS2(c1,c2) { RUS::c1, RUS::c2 }
33#define TS3(c1,c2,c3) { RUS::c1, RUS::c2, RUS::c3 }
34#define TS4(c1,c2,c3,c4) { RUS::c1, RUS::c2, RUS::c3, RUS::c4 }
35#define TS5(c1,c2,c3,c4,c5) { RUS::c1, RUS::c2, RUS::c3, RUS::c4, RUS::c5 }
36
37
38static LOC_TABLE_INDEX LOC_PREFIX(ru_adj_i)[] =
39{
40	{ RUS::E,	4 },
41	{ RUS::I,	2 },
42	{ RUS::IY,	4 },
43	{ RUS::M,	7 },
44	{ RUS::O,	2 },
45	{ RUS::U,	2 },
46	{ RUS::H,	2 },
47	{ RUS::YU,	4 },
48	{ RUS::YA,	2 },
49};
50
51
52static LOC_TABLE_ENTRY LOC_PREFIX(ru_adj)[] =
53{
54	{ TS2(E,E),		2, -1 },
55	{ TS2(I,E),		2, -1 },
56	{ TS2(Y,E),		2, -1 },
57	{ TS2(O,E),		2, -1 },
58
59	{ TS3(I,M,I),	3, -1 },
60	{ TS3(Y,M,I),	3, -1 },
61
62	{ TS2(E,IY),	2, -1 },
63	{ TS2(I,IY),	2, -1 },
64	{ TS2(Y,IY),	2, -1 },
65	{ TS2(O,IY),	2, -1 },
66
67	{ TS3(A,E,M),	0, -1 },
68	{ TS3(U,E,M),	0, -1 },
69	{ TS3(YA,E,M),	0, -1 },
70	{ TS2(E,M),		2, -1 },
71	{ TS2(I,M),		2, -1 },
72	{ TS2(Y,M),		2, -1 },
73	{ TS2(O,M),		2, -1 },
74
75	{ TS3(E,G,O),	3, -1 },
76	{ TS3(O,G,O),	3, -1 },
77
78	{ TS3(E,M,U),	3, -1 },
79	{ TS3(O,M,U),	3, -1 },
80
81	{ TS2(I,H),		2, -1 },
82	{ TS2(Y,H),		2, -1 },
83
84	{ TS2(E,YU),	2, -1 },
85	{ TS2(O,YU),	2, -1 },
86	{ TS2(U,YU),	2, -1 },
87	{ TS2(YU,YU),	2, -1 },
88
89	{ TS2(A,YA),	2, -1 },
90	{ TS2(YA,YA),	2, -1 }
91};
92
93
94static LOC_TABLE_INDEX LOC_PREFIX(ru_part_i)[] =
95{
96	{ RUS::A,	3 },
97	{ RUS::M,	1 },
98	{ RUS::N,	3 },
99	{ RUS::O,	3 },
100	{ RUS::Y,	3 },
101	{ RUS::SH,	4 },
102	{ RUS::SCH,	5 }
103};
104
105
106static LOC_TABLE_ENTRY LOC_PREFIX(ru_part)[] =
107{
108	{ TS4(A,N,N,A),		2, -1 },
109	{ TS4(E,N,N,A),		2, -1 },
110	{ TS4(YA,N,N,A),	2, -1 },
111
112	{ TS3(YA,E,M),		2, -1 },
113
114	{ TS3(A,N,N),		1, -1 },
115	{ TS3(E,N,N),		1, -1 },
116	{ TS3(YA,N,N),		1, -1 },
117
118	{ TS4(A,N,N,O),		2, -1 },
119	{ TS4(E,N,N,O),		2, -1 },
120	{ TS4(YA,N,N,O),	2, -1 },
121
122	{ TS4(A,N,N,Y),		2, -1 },
123	{ TS4(E,N,N,Y),		2, -1 },
124	{ TS4(YA,N,N,Y),	2, -1 },
125
126	{ TS3(A,V,SH),		2, -1 },
127	{ TS3(I,V,SH),		3, -1 },
128	{ TS3(Y,V,SH),		3, -1 },
129	{ TS3(YA,V,SH),		2, -1 },
130
131	{ TS3(A,YU,SCH),	2, -1 },
132	{ TS2(A,SCH),		1, -1 },
133	{ TS3(YA,YU,SCH),	2, -1 },
134	{ TS2(YA,SCH),		1, -1 },
135	{ TS3(U,YU,SCH),	3, -1 }
136};
137
138
139static LOC_TABLE_INDEX LOC_PREFIX(ru_verb_i)[] =
140{
141	{ RUS::A,	7 },
142	{ RUS::E,	9 },
143	{ RUS::I,	4 },
144	{ RUS::IY,	4 },
145	{ RUS::L,	4 },
146	{ RUS::M,	5 },
147	{ RUS::O,	7 },
148	{ RUS::T,	9 },
149	{ RUS::Y,	3 },
150	{ RUS::MYA,	10 },
151	{ RUS::YU,	4 },
152	{ RUS::YA,	1 }
153};
154
155
156static LOC_TABLE_ENTRY LOC_PREFIX(ru_verb)[] =
157{
158	{ TS3(A,L,A),	3, -1 },
159	{ TS3(A,N,A),	3, -1 },
160	{ TS3(YA,L,A),	3, -1 },
161	{ TS3(YA,N,A),	3, -1 },
162	{ TS3(I,L,A),	3, -1 },
163	{ TS3(Y,L,A),	3, -1 },
164	{ TS3(E,N,A),	3, -1 },
165
166	{ TS4(A,E,T,E),		4, -1 },
167	{ TS4(A,IY,T,E),	4, -1 },
168	{ TS3(MYA,T,E),		3, -1 },
169	{ TS4(U,E,T,E),		4, -1 },
170	{ TS4(YA,E,T,E),	4, -1 },
171	{ TS4(YA,IY,T,E),	4, -1 },
172	{ TS4(E,IY,T,E),	4, -1 },
173	{ TS4(U,IY,T,E),	4, -1 },
174	{ TS3(I,T,E),		3, -1 },
175
176	{ TS3(A,L,I),	3, -1 },
177	{ TS3(YA,L,I),	3, -1 },
178	{ TS3(I,L,I),	3, -1 },
179	{ TS3(Y,L,I),	3, -1 },
180
181	{ TS2(A,IY),	2, -1 },
182	{ TS2(YA,IY),	2, -1 },
183	{ TS2(E,IY),	2, -1 },
184	{ TS2(U,IY),	2, -1 },
185
186	{ TS2(A,L),		2, -1 },
187	{ TS2(YA,L),	2, -1 },
188	{ TS2(I,L),		2, -1 },
189	{ TS2(Y,L),		2, -1 },
190
191	{ TS3(A,E,M),	3, -1 },
192	{ TS3(YA,E,M),	3, -1 },
193	{ TS3(U,E,M),	3, -1 },
194	{ TS2(I,M),		2, -1 },
195	{ TS2(Y,M),		2, -1 },
196
197	{ TS3(A,L,O),	3, -1 },
198	{ TS3(A,N,O),	3, -1 },
199	{ TS3(YA,L,O),	3, -1 },
200	{ TS3(YA,N,O),	3, -1 },
201	{ TS3(I,L,O),	3, -1 },
202	{ TS3(Y,L,O),	3, -1 },
203	{ TS3(E,N,O),	3, -1 },
204
205	{ TS3(A,E,T),	3, -1 },
206	{ TS3(A,YU,T),	3, -1 },
207	{ TS3(YA,E,T),	3, -1 },
208	{ TS3(YA,YU,T),	3, -1 },
209	{ TS2(YA,T),	2, -1 },
210	{ TS3(U,E,T),	3, -1 },
211	{ TS3(U,YU,T),	3, -1 },
212	{ TS2(I,T),		2, -1 },
213	{ TS2(Y,T),		2, -1 },
214
215	{ TS3(A,N,Y),	3, -1 },
216	{ TS3(YA,N,Y),	3, -1 },
217	{ TS3(E,N,Y),	3, -1 },
218
219	{ TS4(A,E,SH,MYA),	4, -1 },
220	{ TS4(U,E,SH,MYA),	4, -1 },
221	{ TS4(YA,E,SH,MYA),	4, -1 },
222	{ TS3(A,T,MYA),		3, -1 },
223	{ TS3(E,T,MYA),		3, -1 },
224	{ TS3(I,T,MYA),		3, -1 },
225	{ TS3(U,T,MYA),		3, -1 },
226	{ TS3(Y,T,MYA),		3, -1 },
227	{ TS3(I,SH,MYA),	3, -1 },
228	{ TS3(YA,T,MYA),	3, -1 },
229
230	{ TS2(A,YU),	2, -1 },
231	{ TS2(U,YU),	2, -1 },
232	{ TS2(YA,YU),	2, -1 },
233	{ TS1(YU),		1, -1 },
234
235	{ TS2(U,YA),	2, -1 }
236};
237
238
239static LOC_TABLE_INDEX LOC_PREFIX(ru_dear_i)[] =
240{
241	{ RUS::K,	3 },
242	{ RUS::A,	2 },
243	{ RUS::V,	2 },
244	{ RUS::E,	2 },
245	{ RUS::I,	4 },
246	{ RUS::IY,	2 },
247	{ RUS::M,	4 },
248	{ RUS::O,	2 },
249	{ RUS::U,	2 },
250	{ RUS::H,	2 },
251	{ RUS::YU,	2 }
252};
253
254
255static LOC_TABLE_ENTRY LOC_PREFIX(ru_dear)[] =
256{
257	{ TS3(CH,E,K),		3, -1 },
258	{ TS3(CH,O,K),		3, -1 },
259	{ TS3(N,O,K),		3, -1 },
260
261	{ TS3(CH,K,A),		3, -1 },
262	{ TS3(N, K,A),		3, -1 },
263	{ TS4(CH,K,O,V),	4, -1 },
264	{ TS4(N, K,O,V),	4, -1 },
265	{ TS3(CH,K,E),		3, -1 },
266	{ TS3(N, K,E),		3, -1 },
267	{ TS3(CH,K,I),		3, -1 },
268	{ TS3(N, K,I),		3, -1 },
269	{ TS5(CH,K,A,M,I),	5, -1 },
270	{ TS5(N, K,A,M,I),	5, -1 },
271	{ TS4(CH,K,O,IY),	4, -1 },
272	{ TS4(N, K,O,IY),	4, -1 },
273	{ TS4(CH,K,A,M),	4, -1 },
274	{ TS4(N, K,A,M),	4, -1 },
275	{ TS4(CH,K,O,M),	4, -1 },
276	{ TS4(N, K,O,M),	4, -1 },
277	{ TS3(CH,K,O),		3, -1 },
278	{ TS3(N, K,O),		3, -1 },
279	{ TS3(CH,K,U),		3, -1 },
280	{ TS3(N, K,U),		3, -1 },
281	{ TS4(CH,K,A,H),	4, -1 },
282	{ TS4(N, K,A,H),	4, -1 },
283	{ TS4(CH,K,O,YU),	4, -1 },
284	{ TS4(N, K,O,YU),	4, -1 }
285};
286
287
288static LOC_TABLE_INDEX LOC_PREFIX(ru_noun_i)[] =
289{
290	{ RUS::A,	1 },
291	{ RUS::V,	2 },
292	{ RUS::E,	3 },
293	{ RUS::I,	6 },
294	{ RUS::IY,	4 },
295	{ RUS::M,	5 },
296	{ RUS::O,	1 },
297	{ RUS::U,	1 },
298	{ RUS::H,	3 },
299	{ RUS::Y,	1 },
300	{ RUS::MYA,	1 },
301	{ RUS::YU,	3 },
302	{ RUS::YA,	3 }
303};
304
305
306static LOC_TABLE_ENTRY LOC_PREFIX(ru_noun)[] =
307{
308	{ TS1(A),		1, -1 },
309
310	{ TS2(E,V),		2, -1 },
311	{ TS2(O,V),		2, -1 },
312
313	{ TS2(I,E),		2, -1 },
314	{ TS2(MYA,E),	2, -1 },
315	{ TS1(E),		1, -1 },
316
317	{ TS4(I,YA,M,I),4, -1 },
318	{ TS3(YA,M,I),	3, -1 },
319	{ TS3(A,M,I),	3, -1 },
320	{ TS2(E,I),		2, -1 },
321	{ TS2(I,I),		2, -1 },
322	{ TS1(I),		1, -1 },
323
324	{ TS3(I,E,IY),	3, -1 },
325	{ TS2(E,IY),	2, -1 },
326	{ TS2(O,IY),	2, -1 },
327	{ TS2(I,IY),	2, -1 },
328
329	{ TS3(I,YA,M),	3, -1 },
330	{ TS2(YA,M),	2, -1 },
331	{ TS3(I,E,M),	3, -1 },
332	{ TS2(A,M),		2, -1 },
333	{ TS2(O,M),		2, -1 },
334
335	{ TS1(O),		1, -1 },
336
337	{ TS1(U),		1, -1 },
338
339	{ TS2(A,H),		2, -1 },
340	{ TS3(I,YA,H),	3, -1 },
341	{ TS2(YA,H),	2, -1 },
342
343	{ TS1(Y),		1, -1 },
344
345	{ TS1(MYA),		1, -1 },
346
347	{ TS2(I,YU),	2, -1 },
348	{ TS2(MYA,YU),	2, -1 },
349	{ TS1(YU),		1, -1 },
350
351	{ TS2(I,YA),	2, -1 },
352	{ TS2(MYA,YA),	2, -1 },
353	{ TS1(YA),		1, -1 }
354};
355
356
357int stem_ru_table_i ( LOC_CHAR_TYPE * word, int len, LOC_TABLE_ENTRY * table, LOC_TABLE_INDEX * itable, int icount )
358{
359	int i, j, k, m;
360	LOC_CHAR_TYPE l = word[--len];
361
362	for ( i=0, j=0; i<icount; i++ )
363	{
364		if ( l==itable[i].first )
365		{
366			m = itable[i].count;
367			i = j-1;
368			while ( m-- )
369			{
370				i++;
371				j = table[i].len;
372				k = len;
373				if ( j>k )
374					continue;
375				for ( ; j>=0; k--, j-- )
376					if ( word[k]!=table[i].suffix[j] )
377						break;
378				if ( j>=0 )
379					continue;
380				return table[i].remove;
381			}
382			return 0;
383		}
384		j += itable[i].count;
385	}
386	return 0;
387}
388
389
390#undef STEM_RU_FUNC
391#define STEM_RU_FUNC(func,table) \
392	int func ( LOC_CHAR_TYPE * word, int len ) \
393	{ \
394		return stem_ru_table ( word, len, LOC_PREFIX(table), \
395			sizeof(LOC_PREFIX(table))/sizeof(LOC_TABLE_ENTRY) ); \
396	}
397
398#undef STEM_RU_FUNC_I
399#define STEM_RU_FUNC_I(table) \
400	int LOC_PREFIX(stem_##table##_i) ( LOC_CHAR_TYPE * word, int len ) \
401	{ \
402		return stem_ru_table_i ( word, len, LOC_PREFIX(table), LOC_PREFIX(table##_i), \
403			sizeof(LOC_PREFIX(table##_i))/sizeof(LOC_TABLE_INDEX) ); \
404	}
405
406
407STEM_RU_FUNC_I(ru_adj)
408STEM_RU_FUNC_I(ru_part)
409STEM_RU_FUNC_I(ru_dear)
410STEM_RU_FUNC_I(ru_verb)
411STEM_RU_FUNC_I(ru_noun)
412
413
414static int LOC_PREFIX(stem_ru_adjectival) ( LOC_CHAR_TYPE * word, int len )
415{
416	int i = LOC_PREFIX(stem_ru_adj_i) ( word, len );
417	if ( i )
418		i += LOC_PREFIX(stem_ru_part_i) ( word, len-i );
419	return i;
420}
421
422
423static int LOC_PREFIX(stem_ru_verb_ov) ( LOC_CHAR_TYPE * word, int len )
424{
425	int i = LOC_PREFIX(stem_ru_verb_i) ( word, len );
426	if ( i && (len>=i+2) && word[len-i-2] == RUS::O && word[len-i-1] == RUS::V )
427		return i+2;
428	return i;
429}
430
431
432void LOC_PREFIX(stem_ru_init) ()
433{
434	int i;
435
436	#undef STEM_RU_INIT_TABLE
437	#define STEM_RU_INIT_TABLE(table) \
438		for ( i=0; i<int(sizeof(LOC_PREFIX(table))/sizeof(LOC_TABLE_ENTRY)); i++ ) \
439			LOC_PREFIX(table)[i].len = ((int)strlen((char*)LOC_PREFIX(table)[i].suffix)/sizeof(LOC_CHAR_TYPE))- 1;
440
441	STEM_RU_INIT_TABLE(ru_adj)
442	STEM_RU_INIT_TABLE(ru_part)
443	STEM_RU_INIT_TABLE(ru_verb)
444	STEM_RU_INIT_TABLE(ru_noun)
445	STEM_RU_INIT_TABLE(ru_dear)
446}
447
448
449void LOC_PREFIX(stem_ru) ( LOC_CHAR_TYPE * word )
450{
451	int r1, r2;
452	int i, len;
453
454	// IsVowel
455	#undef IV
456	#define IV(c) ( \
457		c==RUS::A || c==RUS::E || c==RUS::YO || c==RUS::I || c==RUS::O || \
458		c==RUS::U || c==RUS::Y || c==RUS::EE || c==RUS::YU || c==RUS::YA )
459
460	// EndOfWord
461	#undef EOW
462	#define EOW(_arg) (!(*((unsigned char*)(_arg))))
463
464	while ( !EOW(word) ) if ( IV(*word) ) break; else word++;
465	if ( !EOW(word) ) word++; else return;
466	len = 0; while ( !EOW(word+len) ) len++;
467
468	r1 = r2 = len;
469	for ( i=-1; i<len-1; i++ ) if ( IV(word[i]) && !IV(word[i+1]) ) { r1 = i+2; break; }
470	for ( i=r1; i<len-1; i++ ) if ( IV(word[i]) && !IV(word[i+1]) ) { r2 = i+2; break; }
471
472	#define C(p) word[len-p]
473	#define W(p,c) ( C(p)==c )
474	#define XSUFF2(c2,c1) ( W(1,c1) && W(2,c2) )
475	#define XSUFF3(c3,c2,c1) ( W(1,c1) && W(2,c2) && W(3,c3) )
476	#define XSUFF4(c4,c3,c2,c1) ( W(1,c1) && W(2,c2) && W(3,c3) && W(4,c4) )
477	#define XSUFF5(c5,c4,c3,c2,c1) ( W(1,c1) && W(2,c2) && W(3,c3) && W(4,c4) && W(5,c5) )
478	#define BRK(_arg) { len -= _arg; break; }
479	#define CHK(_func) { i = LOC_PREFIX(_func) ( word, len ); if ( i ) BRK ( i ); }
480
481	for ( ;; )
482	{
483		CHK ( stem_ru_dear_i );
484
485		if ( C(1)==RUS::V && len>=2 )
486		{
487			if ( C(2)==RUS::I || C(2)==RUS::Y || C(2)==RUS::YA )
488				BRK(2);
489
490			if ( C(2)==RUS::A )
491			{
492				if ( C(3)==RUS::V && C(4)==RUS::A )
493					BRK(4);
494				BRK(2);
495			}
496		}
497
498		if ( len>=3 && XSUFF3 ( RUS::V, RUS::SH, RUS::I )
499			&& ( C(4)==RUS::A || C(4)==RUS::I || C(4)==RUS::Y || C(4)==RUS::YA ) )
500				BRK(4);
501
502		if ( len>=5 && XSUFF5 ( RUS::V, RUS::SH, RUS::I, RUS::S, RUS::MYA )
503			&& ( C(6)==RUS::A || C(6)==RUS::I || C(6)==RUS::Y || C(6)==RUS::YA ) )
504				BRK(6);
505
506		CHK ( stem_ru_adjectival );
507
508		if ( len>=2 && ( XSUFF2 ( RUS::S, RUS::MYA ) || XSUFF2 ( RUS::S, RUS::YA ) ) )
509		{
510			len -= 2;
511			CHK ( stem_ru_adjectival );
512			CHK ( stem_ru_verb_ov );
513		} else
514		{
515			CHK ( stem_ru_verb_ov );
516		}
517
518		CHK ( stem_ru_noun_i );
519		break;
520	}
521
522	if ( len>0 && ( W(1,RUS::IY) || W(1,RUS::I) ) )
523		len--;
524
525	if ( len-r2>=3 && XSUFF3 ( RUS::O, RUS::S, RUS::T ) )
526		len -= 3;
527	else if ( len-r2>=4 && XSUFF4 ( RUS::O, RUS::S, RUS::T, RUS::MYA ) )
528		len -= 4;
529
530	if ( len>=3 && XSUFF3 ( RUS::E, RUS::IY, RUS::SH ) )
531		len -= 3;
532	else if ( len>=4 && XSUFF4 ( RUS::E, RUS::IY, RUS::SH, RUS::E ) )
533		len -= 4;
534
535	if ( len>=2 && XSUFF2 ( RUS::N, RUS::N ) )
536		len--;
537
538	if ( len>0 && W(1,RUS::MYA) )
539		len--;
540
541	*((unsigned char*)(word+len)) = '\0';
542}
543
544// undefine externally defined stuff
545#undef LOC_CHAR_TYPE
546#undef LOC_PREFIX
547#undef RUS
548
//
// $Id$
//
552