1 //
2 // $Id$
3 //
4
5 //
6 // Copyright (c) 2011-2016, Andrew Aksyonoff
7 // Copyright (c) 2011-2016, Sphinx Technologies Inc
8 // All rights reserved
9 //
10 // This program is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU General Public License. You should have
12 // received a copy of the GPL license along with this program; if you
13 // did not, you can find it at http://www.gnu.org/
14 //
15 // Based on AOT lemmatizer, http://aot.ru/
16 // Copyright (c) 2004-2014, Alexey Sokirko and others
17 //
18
19 #include "sphinx.h"
20 #include "sphinxint.h"
21 #include "sphinxutils.h"
22
23 //////////////////////////////////////////////////////////////////////////
24 // LEMMATIZER
25 //////////////////////////////////////////////////////////////////////////
26
const BYTE AOT_POS_UNKNOWN = 0xff;           ///< part-of-speech id meaning "unknown POS"
const int AOT_MIN_PREDICTION_SUFFIX = 3;     ///< shortest suffix usable for out-of-dictionary prediction
const BYTE AOT_MORPH_ANNOT_CHAR = '+';       ///< separator between word text and annotation in automaton paths
const int AOT_MAX_ALPHABET_SIZE = 54;        ///< hard cap on the alphabet size supported by the codecs
const DWORD AOT_NOFORM = 0xffffffffUL;       ///< "no form" marker; terminates result lists
const DWORD AOT_ORIGFORM = 0xfffffffeUL;     ///< "original form" marker

static int g_iCacheSize = 262144; // in bytes, so 256K


// packed morph info layout: [ model:14 | item:9 | prefix:9 ]
#define AOT_MODEL_NO(_a) ((_a)>>18)
#define AOT_ITEM_NO(_a) (((_a)&0x3FFFF)>>9)
#define AOT_PREFIX_NO(_a) ((_a)&0x1FF)
40
41
/// morphological form info
struct CMorphForm
{
	BYTE m_FlexiaLen;    ///< flexia (inflection suffix) length, bytes
	BYTE m_PrefixLen;    ///< prefix length, bytes
	BYTE m_POS;          ///< part-of-speech id
	BYTE m_Dummy;        ///< padding
	char m_Prefix[4];    ///< zero-terminated form prefix (terminated on load)
	char m_Flexia[24];   ///< zero-terminated form flexia (terminated on load)
};
52
53
/// alphabet descriptor
struct AlphabetDesc_t
{
	int m_iSize;                                    ///< number of letters in the alphabet
	BYTE m_dCode2Alpha [ AOT_MAX_ALPHABET_SIZE ];   ///< code to letter, annotator char included
	BYTE m_dCode2AlphaWA [ AOT_MAX_ALPHABET_SIZE ]; ///< code to letter, without the annotator char
};
61
62
/// alphabet codec
class CABCEncoder : public ISphNoncopyable
{
public:
	int m_AlphabetSize;                       ///< number of letters (annotator included)
	int m_Alphabet2Code[256];                 ///< byte -> code, -1 if not in the alphabet
	int m_Alphabet2CodeWithoutAnnotator[256]; ///< byte -> code in the annotator-less alphabet, -1 if not a letter

	void InitAlphabet ( const AlphabetDesc_t & tDesc );
	bool CheckABCWithoutAnnotator ( const BYTE * pWord ) const;
	DWORD DecodeFromAlphabet ( const BYTE * sPath, int iPath ) const;
};
75
/// morphology automaton node, 1:31
/// 1 bit for "final or not" flag
/// 31 bits for index to relations (pointer to the first child)
struct CMorphAutomNode
{
	DWORD m_Data; ///< packed final flag + children start index
	DWORD GetChildrenStart() const { return m_Data&(0x80000000-1); }
	bool IsFinal() const { return (m_Data&0x80000000) > 0; }
};
85
86
/// morphology automaton relation, 8:24
/// 8 bits for relational char (aka next char in current form)
/// 24 bites for index to nodes (pointer to the next level node)
struct CMorphAutomRelation
{
	DWORD m_Data; ///< packed relational char + child node index
	DWORD GetChildNo() const { return m_Data & 0xffffff; }
	BYTE GetRelationalChar() const { return (BYTE)(m_Data>>24); }
};
96
97
98 /// morphology automaton
99 class CMorphAutomat : public CABCEncoder
100 {
101 protected:
102 CMorphAutomNode * m_pNodes;
103 int m_NodesCount;
104 CMorphAutomRelation * m_pRelations;
105 int m_RelationsCount;
106
107 int m_iCacheSize;
108 CSphTightVector<int> m_ChildrenCache;
109
110 void BuildChildrenCache ( int iCacheSize );
111 int FindStringAndPassAnnotChar ( const BYTE * pText ) const;
112
113 public:
CMorphAutomat()114 CMorphAutomat ()
115 : m_pNodes ( NULL )
116 , m_NodesCount ( 0 )
117 , m_pRelations ( NULL )
118 , m_RelationsCount ( 0 )
119 , m_iCacheSize ( 0 )
120 {}
121
~CMorphAutomat()122 ~CMorphAutomat ()
123 {
124 SafeDelete ( m_pNodes );
125 SafeDelete ( m_pRelations );
126 }
127
GetChildrenCount(int i) const128 int GetChildrenCount ( int i ) const { return m_pNodes[i+1].GetChildrenStart() - m_pNodes[i].GetChildrenStart(); }
GetChildren(int i) const129 const CMorphAutomRelation * GetChildren ( int i ) const { return m_pRelations + m_pNodes[i].GetChildrenStart(); }
GetNode(int i) const130 const CMorphAutomNode GetNode ( int i ) const { return m_pNodes[i]; }
131
132 public:
133 bool LoadPak ( CSphReader & rd, int iCacheSize );
134 void GetInnerMorphInfos ( const BYTE * pText, DWORD * Infos ) const;
135 int NextNode ( int NodeNo, BYTE Child ) const;
136 };
137
138
/// prediction data tuple
struct CPredictTuple
{
	WORD m_ItemNo;         ///< form index within the flexia model (packed into the item field)
	DWORD m_LemmaInfoNo;   ///< index into the lemma-to-flexia-model mapping
	BYTE m_PartOfSpeechNo; ///< part-of-speech id
};
146
147
/// flexia model is basically a vector of morphology forms
/// (there is other meta stuff like per-model comments but that is now stripped)
typedef CSphVector<CMorphForm> CFlexiaModel;
151
152
/// lemmatizer
class CLemmatizer
{
protected:
	static const int MAX_PREFIX_LEN = 12;           ///< longest supported known-prefix length, bytes
	static const bool m_bMaximalPrediction = false; ///< if true, keep every prediction per POS instead of the most frequent one
	bool m_bIsGerman;

	BYTE m_UC[256];                                 ///< case folding (and translation) table

	CMorphAutomat m_FormAutomat;                    ///< forms automaton; word -> packed morph infos
	CSphVector<WORD> m_LemmaFlexiaModel;            ///< lemma id to flexia model id mapping
	CSphVector<BYTE> m_NPSs;
	int m_PrefixLen [ MAX_PREFIX_LEN ];             ///< per-length offsets into m_PrefixBlob, negative if none
	CSphVector<BYTE> m_PrefixBlob;                  ///< known prefixes, stored as consecutive (len, data) records

	CMorphAutomat m_SuffixAutomat;                  ///< suffix automaton, used for prediction
	CSphVector<DWORD> m_ModelFreq;                  ///< per-flexia-model frequency (lemma count)

	bool IsPrefix ( const BYTE * sPrefix, int iLen ) const;

	/// pack a prediction tuple into the [model|item|prefix] result DWORD layout
	DWORD PredictPack ( const CPredictTuple & t ) const { return ( m_LemmaFlexiaModel [ t.m_LemmaInfoNo ]<<18 ) + ( t.m_ItemNo<<9 ); }
	bool PredictFind ( const BYTE * pWord, int iLen, CSphVector<CPredictTuple> & res ) const;
	void PredictFindRecursive ( int r, BYTE * sPath, int iPath, CSphVector<CPredictTuple> & Infos ) const;
	void PredictByDataBase ( const BYTE * pWord, int iLen, DWORD * results, bool is_cap ) const;

public:
	explicit CLemmatizer ( bool IsGerman = false )
		: m_bIsGerman ( IsGerman )
	{}
	CSphVector<CFlexiaModel> m_FlexiaModels; ///< flexia models
	int m_iLang; ///< my language

	bool LemmatizeWord ( BYTE * pWord, DWORD * results ) const;
	bool LoadPak ( CSphReader & rd );
};
189
190 //////////////////////////////////////////////////////////////////////////
191
DecodeFromAlphabet(const BYTE * sPath,int iPath) const192 DWORD CABCEncoder::DecodeFromAlphabet ( const BYTE * sPath, int iPath ) const
193 {
194 DWORD c = 1;
195 DWORD Result = 0;
196 for ( const BYTE * sMax = sPath+iPath; sPath<sMax; sPath++ )
197 {
198 Result += m_Alphabet2CodeWithoutAnnotator[*sPath] * c;
199 c *= m_AlphabetSize - 1;
200 }
201 return Result;
202 }
203
204
CheckABCWithoutAnnotator(const BYTE * pWord) const205 bool CABCEncoder::CheckABCWithoutAnnotator ( const BYTE * pWord ) const
206 {
207 while ( *pWord )
208 if ( m_Alphabet2CodeWithoutAnnotator [ *pWord++ ]==-1 )
209 return false;
210 return true;
211 }
212
213
InitAlphabet(const AlphabetDesc_t & tDesc)214 void CABCEncoder::InitAlphabet ( const AlphabetDesc_t & tDesc )
215 {
216 m_AlphabetSize = tDesc.m_iSize;
217 for ( int i=0; i<256; i++ )
218 {
219 m_Alphabet2Code[i] = -1;
220 m_Alphabet2CodeWithoutAnnotator[i] = -1;
221 }
222 for ( int i=0; i<m_AlphabetSize; i++ )
223 m_Alphabet2Code [ tDesc.m_dCode2Alpha[i] ] = i;
224 for ( int i=0; i<m_AlphabetSize-1; i++ )
225 m_Alphabet2CodeWithoutAnnotator [ tDesc.m_dCode2AlphaWA[i] ] = i;
226 }
227
228 //////////////////////////////////////////////////////////////////////////
229
/// precompute a direct (node, letter code) -> child lookup table
/// for the first m_iCacheSize nodes; iCacheSize is a byte budget
void CMorphAutomat::BuildChildrenCache ( int iCacheSize )
{
	// each cached node costs AOT_MAX_ALPHABET_SIZE 4-byte cells
	iCacheSize /= AOT_MAX_ALPHABET_SIZE*4;
	iCacheSize = Max ( iCacheSize, 0 );
	m_iCacheSize = Min ( m_NodesCount, iCacheSize );

	m_ChildrenCache.Resize ( m_iCacheSize*AOT_MAX_ALPHABET_SIZE );
	m_ChildrenCache.Fill ( -1 ); // -1 means "no child for this letter"
	for ( int NodeNo=0; NodeNo<m_iCacheSize; NodeNo++ )
	{
		const CMorphAutomRelation * pStart = m_pRelations + m_pNodes [ NodeNo ].GetChildrenStart();
		const CMorphAutomRelation * pEnd = pStart + GetChildrenCount ( NodeNo );
		for ( ; pStart!=pEnd; pStart++ )
		{
			const CMorphAutomRelation & p = *pStart;
			m_ChildrenCache [ NodeNo*AOT_MAX_ALPHABET_SIZE + m_Alphabet2Code [ p.GetRelationalChar() ] ] = p.GetChildNo();
		}
	}
}
249
250
/// load the automaton from a serialized "pak" dictionary
/// @param rd          reader positioned at the automaton section
/// @param iCacheSize  children cache budget, bytes
/// @return false on read error
bool CMorphAutomat::LoadPak ( CSphReader & rd, int iCacheSize )
{
	rd.Tag ( "automaton-nodes" );
	m_NodesCount = rd.UnzipInt();
	m_pNodes = new CMorphAutomNode [ m_NodesCount+1 ]; // +1 for the sentinel node
	rd.GetBytes ( m_pNodes, m_NodesCount*sizeof(CMorphAutomNode) );

	rd.Tag ( "automaton-relations" );
	m_RelationsCount = rd.UnzipInt();
	m_pRelations = new CMorphAutomRelation [ m_RelationsCount ];
	rd.GetBytes ( m_pRelations, m_RelationsCount*sizeof(CMorphAutomRelation) );

	if ( rd.GetErrorFlag() )
		return false;

	// sentinel, so that GetChildrenCount() works for the last real node too
	m_pNodes [ m_NodesCount ].m_Data = m_RelationsCount;

#if !USE_LITTLE_ENDIAN
	// pak files store little-endian DWORDs; flip on big-endian hosts
	for ( int i=0; i< m_NodesCount; ++i )
		FlipEndianess ( &m_pNodes[i].m_Data );
	for ( int i=0; i< m_RelationsCount; ++i )
		FlipEndianess ( &m_pRelations[i].m_Data );
#endif

	BuildChildrenCache ( iCacheSize );
	return true;
}
278
279
NextNode(int NodeNo,BYTE RelationChar) const280 int CMorphAutomat::NextNode ( int NodeNo, BYTE RelationChar ) const
281 {
282 if ( NodeNo<m_iCacheSize )
283 {
284 int z = m_Alphabet2Code [ RelationChar ];
285 if ( z==-1 )
286 return -1;
287 return m_ChildrenCache [ NodeNo*AOT_MAX_ALPHABET_SIZE + z ];
288 } else
289 {
290 const CMorphAutomRelation * pStart = m_pRelations + m_pNodes [ NodeNo ].GetChildrenStart();
291 const CMorphAutomRelation * pEnd = pStart + GetChildrenCount ( NodeNo );
292 for ( ; pStart!=pEnd; pStart++ )
293 {
294 const CMorphAutomRelation & p = *pStart;
295 if ( RelationChar==p.GetRelationalChar() )
296 return p.GetChildNo();
297 }
298 return -1;
299 }
300 }
301
302
FindStringAndPassAnnotChar(const BYTE * pText) const303 int CMorphAutomat::FindStringAndPassAnnotChar ( const BYTE * pText ) const
304 {
305 int r = 0;
306 while ( *pText )
307 {
308 int nd = NextNode ( r, *pText++ );
309 if ( nd==-1 )
310 return -1;
311 r = nd;
312 }
313 return NextNode ( r, AOT_MORPH_ANNOT_CHAR ); // passing annotation char
314 }
315
316
/// collect all annotation interpretations for a given word
/// Infos receives packed DWORDs and is always AOT_NOFORM-terminated
void CMorphAutomat::GetInnerMorphInfos ( const BYTE * pText, DWORD * Infos ) const
{
	*Infos = AOT_NOFORM;

	int r = FindStringAndPassAnnotChar ( pText );
	if ( r==-1 )
		return;

	// recursively get all interpretations
	// (iterative DFS; sPath holds the annotation chars on the current path)
	const int MAX_DEPTH = 32;
	int iLevel = 0;
	BYTE sPath[MAX_DEPTH];
	int iChild[MAX_DEPTH];    // next relation index to visit, per level
	int iChildMax[MAX_DEPTH]; // end of relations range, per level

	iChild[0] = m_pNodes[r].GetChildrenStart();
	iChildMax[0] = m_pNodes[r+1].GetChildrenStart();

	while ( iLevel>=0 )
	{
		while ( iChild[iLevel]<iChildMax[iLevel] )
		{
			CMorphAutomRelation Rel = m_pRelations[iChild[iLevel]];
			int NodeNo = Rel.GetChildNo();
			sPath[iLevel] = Rel.GetRelationalChar();
			iChild[iLevel]++;
			if ( m_pNodes[NodeNo].IsFinal() )
			{
				// complete annotation; decode and emit it
				*Infos++ = DecodeFromAlphabet ( sPath, iLevel+1 );
			} else
			{
				// descend one level deeper
				iLevel++;
				assert ( iLevel<MAX_DEPTH );
				iChild[iLevel] = m_pNodes[NodeNo].GetChildrenStart();
				iChildMax[iLevel] = m_pNodes[NodeNo+1].GetChildrenStart();
			}
		}
		iLevel--;
	}
	*Infos = AOT_NOFORM; // terminator
}
358
359 //////////////////////////////////////////////////////////////////////////
360
/// depth-first walk over the suffix automaton, collecting a prediction tuple
/// from every final node; sPath accumulates the path text (caller provides storage)
void CLemmatizer::PredictFindRecursive ( int NodeNo, BYTE * sPath, int iPath, CSphVector<CPredictTuple> & Infos ) const
{
	const CMorphAutomNode & N = m_SuffixAutomat.GetNode ( NodeNo );
	if ( N.IsFinal() )
	{
		// the path contains three annot-char separated numbers:
		// part of speech, lemma info, and item; find the separators
		int i = 0;
		while ( i<iPath && sPath[i]!=AOT_MORPH_ANNOT_CHAR )
			i++;

		int j = i+1;
		while ( j<iPath && sPath[j]!=AOT_MORPH_ANNOT_CHAR )
			j++;

		int k = j+1;
		while ( k<iPath && sPath[k]!=AOT_MORPH_ANNOT_CHAR )
			k++;

		// decode the three fields between the separators
		CPredictTuple & A = Infos.Add();
		A.m_PartOfSpeechNo = (BYTE) m_SuffixAutomat.DecodeFromAlphabet ( sPath+i+1, j-i-1 );
		A.m_LemmaInfoNo = m_SuffixAutomat.DecodeFromAlphabet ( sPath+j+1, k-j-1 );
		A.m_ItemNo = (WORD) m_SuffixAutomat.DecodeFromAlphabet ( sPath+k+1, iPath-k-1 );
	}

	// recurse into every child
	int Count = m_SuffixAutomat.GetChildrenCount ( NodeNo );
	for ( int i=0; i<Count; i++ )
	{
		const CMorphAutomRelation & p = m_SuffixAutomat.GetChildren ( NodeNo )[i];
		sPath[iPath] = p.GetRelationalChar();
		PredictFindRecursive ( p.GetChildNo(), sPath, iPath+1, Infos );
	}
}
392
393
PredictFind(const BYTE * pWord,int iLen,CSphVector<CPredictTuple> & res) const394 bool CLemmatizer::PredictFind ( const BYTE * pWord, int iLen, CSphVector<CPredictTuple> & res ) const
395 {
396 // FIXME? we might not want to predict words with annot char inside
397 // was: if (ReversedWordForm.find(AnnotChar) != string::npos) return false;
398
399 int r = 0;
400 int i = 0;
401 const BYTE * p = pWord + iLen;
402 for ( ; i<iLen; i++ )
403 {
404 int nd = m_SuffixAutomat.NextNode ( r, *--p );
405 if ( nd==-1 )
406 break;
407 r = nd;
408 }
409
410 // no prediction by suffix which is less than 3
411 if ( i<AOT_MIN_PREDICTION_SUFFIX )
412 return false;
413
414 assert ( r!=-1 );
415 BYTE sPath[128];
416 PredictFindRecursive ( r, sPath, 0, res );
417 return true;
418 }
419
420
IsPrefix(const BYTE * sPrefix,int iLen) const421 bool CLemmatizer::IsPrefix ( const BYTE * sPrefix, int iLen ) const
422 {
423 // empty prefix is a prefix
424 if ( !iLen )
425 return true;
426 if ( iLen>=MAX_PREFIX_LEN || m_PrefixLen[iLen]<0 )
427 return false;
428
429 const BYTE * p = &m_PrefixBlob [ m_PrefixLen[iLen] ];
430 while ( *p==iLen )
431 {
432 if ( !memcmp ( p+1, sPrefix, iLen ) )
433 return true;
434 p += 1+iLen;
435 }
436 return false;
437 }
438
439
/// case fold and lemmatize one word
/// @param pWord    zero-terminated word; case folded in place via m_UC
/// @param results  receives packed morph info DWORDs, AOT_NOFORM-terminated
/// @return true if matched in dictionary, false if predicted (or no result)
bool CLemmatizer::LemmatizeWord ( BYTE * pWord, DWORD * results ) const
{
	const bool bCap = false; // maybe when we manage to drag this all the way from tokenizer
	const bool bPredict = true;

	// uppercase (and maybe other translations), check, and compute length
	BYTE * p;
	if ( m_iLang==AOT_RU )
	{
		for ( p = pWord; *p; p++ )
		{
			BYTE b = m_UC[*p];
			// russian chars are in 0xC0..0xDF range
			// avoid lemmatizing words with other chars in them
			if ( ( b>>5 )!=6 )
			{
				*results = AOT_NOFORM;
				return false;
			}
			// uppercase
			*p = b;
		}
	} else ///< use the alphabet to reduce another letters
	{
		for ( p = pWord; *p; p++ )
		{
			BYTE b = m_UC[*p];
			// english chars are in 0x61..0x7A range
			// avoid lemmatizing words with other chars in them
			if ( m_FormAutomat.m_Alphabet2CodeWithoutAnnotator[b]<0 )
			{
				*results = AOT_NOFORM;
				return false;
			}
			// uppercase
			*p = b;
		}
	}

	int iLen = (int)( p-pWord );

	// do dictionary lookup
	m_FormAutomat.GetInnerMorphInfos ( pWord, results );
	if ( *results!=AOT_NOFORM )
		return true;
	if_const ( !bPredict )
		return false;

	// attempt prediction by keyword suffix
	// find the longest suffix that finds dictionary results
	// require that suffix to be 4+ chars too
	int iSuffix;
	for ( iSuffix=1; iSuffix<=iLen-4; iSuffix++ )
	{
		m_FormAutomat.GetInnerMorphInfos ( pWord+iSuffix, results );
		if ( *results!=AOT_NOFORM )
			break;
	}

	// cancel suffix predictions with no hyphens, short enough
	// known postfixes, and unknown prefixes
	if ( pWord [ iSuffix-1 ]!='-'
		&& ( iLen-iSuffix )<6
		&& !IsPrefix ( pWord, iSuffix ) )
	{
		*results = AOT_NOFORM;
	}

	// cancel predictions by pronouns, eg [Sem'ykin'ym]
	for ( DWORD * pRes=results; *pRes!=AOT_NOFORM; pRes++ )
		if ( m_NPSs[ AOT_MODEL_NO ( *pRes ) ]==AOT_POS_UNKNOWN )
	{
		*results = AOT_NOFORM;
		break;
	}

	// what, still no results?
	if ( *results==AOT_NOFORM )
	{
		// attempt prediction by database
		PredictByDataBase ( pWord, iLen, results, bCap );

		// filter out too short flexias
		DWORD * s = results;
		DWORD * d = s;
		while ( *s!=AOT_NOFORM )
		{
			const CMorphForm & F = m_FlexiaModels [ AOT_MODEL_NO(*s) ][ AOT_ITEM_NO(*s) ];
			if ( F.m_FlexiaLen<iLen )
				*d++ = *s;
			s++;
		}
		*d = AOT_NOFORM; // keep the list terminated after compaction
	}

	return false;
}
538
539
/// predict morph infos for an out-of-dictionary word via the suffix automaton
/// FindResults must be AOT_NOFORM-terminated (empty) on entry; stays terminated on exit
void CLemmatizer::PredictByDataBase ( const BYTE * pWord, int iLen, DWORD * FindResults, bool is_cap ) const
{
	// FIXME? handle all-consonant abbreviations anyway?
	// was: if ( CheckAbbreviation ( InputWordStr, FindResults, is_cap ) ) return;

	assert ( *FindResults==AOT_NOFORM );
	DWORD * pOut = FindResults;
	CSphVector<CPredictTuple> res;

	// if the ABC is wrong this prediction yields too many variants
	if ( m_FormAutomat.CheckABCWithoutAnnotator ( pWord ) )
		PredictFind ( pWord, iLen, res );

	// assume not more than 32 different pos
	// has_nps[pos] is the output index of the result kept for that pos, or -1
	int has_nps[32];
	for ( int i=0; i<32; i++ )
		has_nps[i] = -1;

	ARRAY_FOREACH ( j, res )
	{
		BYTE PartOfSpeechNo = res[j].m_PartOfSpeechNo;
		// default mode keeps a single result per pos, preferring the more frequent flexia model
		if_const ( !m_bMaximalPrediction && has_nps[PartOfSpeechNo]!=-1 )
		{
			int iOldFreq = m_ModelFreq [ AOT_MODEL_NO ( FindResults[has_nps[PartOfSpeechNo]] ) ];
			int iNewFreq = m_ModelFreq [ m_LemmaFlexiaModel [ res[j].m_LemmaInfoNo ] ];
			if ( iOldFreq < iNewFreq )
				FindResults [ has_nps [ PartOfSpeechNo ] ] = PredictPack ( res[j] );
			continue;
		}

		has_nps [ PartOfSpeechNo ] = (int)( pOut-FindResults );
		*pOut++ = PredictPack ( res[j] );
		*pOut = AOT_NOFORM;
	}

	if ( has_nps[0]==-1 // no noun
		|| ( is_cap && !m_bIsGerman ) ) // or can be a proper noun (except German, where all nouns are written uppercase)
	{
		// force in at least one noun interpretation
		static BYTE CriticalNounLetterPack[4] = "+++";
		PredictFind ( CriticalNounLetterPack, AOT_MIN_PREDICTION_SUFFIX, res );
		*pOut++ = PredictPack ( res.Last() );
		*pOut = AOT_NOFORM;
	}
}
584
585
/// load the full lemmatizer dictionary ("pak" format, version 1)
/// @return false on version mismatch or read error
bool CLemmatizer::LoadPak ( CSphReader & rd )
{
	rd.Tag ( "sphinx-aot" );
	int iVer = rd.UnzipInt();
	if ( iVer!=1 )
		return false;

	rd.Tag ( "alphabet-desc" );
	AlphabetDesc_t tDesc;
	tDesc.m_iSize = rd.UnzipInt();
	rd.GetBytes ( tDesc.m_dCode2Alpha, tDesc.m_iSize );
	rd.GetBytes ( tDesc.m_dCode2AlphaWA, tDesc.m_iSize );

	// both automatons share one alphabet
	m_FormAutomat.InitAlphabet ( tDesc );
	m_SuffixAutomat.InitAlphabet ( tDesc );

	rd.Tag ( "uc-table" );
	rd.GetBytes ( m_UC, 256 );

	// caching forms can help a lot (from 4% with 256K cache to 13% with 110M cache)
	rd.Tag ( "forms-automaton" );
	m_FormAutomat.LoadPak ( rd, g_iCacheSize );

	rd.Tag ( "flexia-models" );
	m_FlexiaModels.Resize ( rd.UnzipInt() );
	ARRAY_FOREACH ( i, m_FlexiaModels )
	{
		m_FlexiaModels[i].Resize ( rd.UnzipInt() );
		ARRAY_FOREACH ( j, m_FlexiaModels[i] )
		{
			CMorphForm & F = m_FlexiaModels[i][j];
			F.m_FlexiaLen = (BYTE) rd.GetByte();
			rd.GetBytes ( F.m_Flexia, F.m_FlexiaLen );
			F.m_PrefixLen = (BYTE) rd.GetByte();
			rd.GetBytes ( F.m_Prefix, F.m_PrefixLen );
			F.m_POS = (BYTE) rd.GetByte();

			// zero-terminate both strings in place
			assert ( F.m_FlexiaLen<sizeof(F.m_Flexia) );
			assert ( F.m_PrefixLen<sizeof(F.m_Prefix) );
			F.m_Flexia[F.m_FlexiaLen] = 0;
			F.m_Prefix[F.m_PrefixLen] = 0;
		}
	}

	rd.Tag ( "prefixes" );
	for ( int i=0; i<MAX_PREFIX_LEN; i++ )
		m_PrefixLen[i] = rd.UnzipInt();
	m_PrefixBlob.Resize ( rd.UnzipInt() );
	rd.GetBytes ( m_PrefixBlob.Begin(), m_PrefixBlob.GetLength() );

	rd.Tag ( "lemma-flexia-models" );
	m_LemmaFlexiaModel.Resize ( rd.UnzipInt() );
	ARRAY_FOREACH ( i, m_LemmaFlexiaModel )
		m_LemmaFlexiaModel[i] = (WORD) rd.UnzipInt();

	// build model freqs
	m_ModelFreq.Resize ( m_FlexiaModels.GetLength() );
	m_ModelFreq.Fill ( 0 );
	ARRAY_FOREACH ( i, m_LemmaFlexiaModel )
		m_ModelFreq [ m_LemmaFlexiaModel[i] ]++;

	rd.Tag ( "nps-vector" );
	m_NPSs.Resize ( rd.UnzipInt() );
	rd.GetBytes ( m_NPSs.Begin(), m_NPSs.GetLength() );

	// caching predictions does not measurably affect performance though
	rd.Tag ( "prediction-automaton" );
	m_SuffixAutomat.LoadPak ( rd, 0 );

	rd.Tag ( "eof" );
	return !rd.GetErrorFlag();
}
658
659 //////////////////////////////////////////////////////////////////////////
660 // SPHINX MORPHOLOGY INTERFACE
661 //////////////////////////////////////////////////////////////////////////
662
/// language codes, index matches the AOT_* language constants
const char* AOT_LANGUAGES[AOT_LENGTH] = {"ru", "en", "de" };

static CLemmatizer * g_pLemmatizers[AOT_LENGTH] = {0}; ///< per-language singletons, created by AotInit()
static CSphNamedInt g_tDictinfos[AOT_LENGTH];          ///< per-language dictionary basename and CRC32
667
sphAotSetCacheSize(int iCacheSize)668 void sphAotSetCacheSize ( int iCacheSize )
669 {
670 g_iCacheSize = Max ( iCacheSize, 0 );
671 }
672
/// load (once) the dictionary for the given language
/// also records the dictionary basename and CRC32 into g_tDictinfos
/// @return true on success or if already loaded; false (with sError set) otherwise
bool AotInit ( const CSphString & sDictFile, CSphString & sError, int iLang )
{
	// already loaded? bail out
	if ( g_pLemmatizers[iLang] )
		return true;

	CSphAutofile rdFile;
	if ( rdFile.Open ( sDictFile, SPH_O_READ, sError )<0 )
		return false;

	g_pLemmatizers[iLang] = new CLemmatizer ( iLang==AOT_DE );
	g_pLemmatizers[iLang]->m_iLang = iLang;

	CSphReader rd;
	rd.SetFile ( rdFile );
	if ( !g_pLemmatizers[iLang]->LoadPak(rd) )
	{
		sError.SetSprintf ( "failed to load lemmatizer dictionary: %s", rd.GetErrorMessage().cstr() );
		SafeDelete ( g_pLemmatizers[iLang] );
		return false;
	}

	// track dictionary crc
	DWORD uCrc;
	if ( !sphCalcFileCRC32 ( sDictFile.cstr(), uCrc ) )
	{
		sError.SetSprintf ( "failed to crc32 lemmatizer dictionary %s", sDictFile.cstr() );
		SafeDelete ( g_pLemmatizers[iLang] );
		return false;
	}

	// extract basename (strip everything up to the last path separator)
	const char * a = sDictFile.cstr();
	const char * b = a + strlen(a) - 1;
	while ( b>a && b[-1]!='/' && b[-1]!='\\' )
		b--;

	g_tDictinfos[iLang].m_sName = b;
	g_tDictinfos[iLang].m_iValue = (int)uCrc;
	return true;
}
713
/// public wrapper over AotInit(); loads the per-language dictionary once
bool sphAotInit ( const CSphString & sDictFile, CSphString & sError, int iLang )
{
	return AotInit ( sDictFile, sError, iLang );
}
718
IsAlpha1251(BYTE c)719 static inline bool IsAlpha1251 ( BYTE c )
720 {
721 return ( c>=0xC0 || c==0xA8 || c==0xB8 );
722 }
723
IsGermanAlpha1252(BYTE c)724 static inline bool IsGermanAlpha1252 ( BYTE c )
725 {
726 if ( c==0xb5 || c==0xdf )
727 return true;
728
729 BYTE lc = c | 0x20;
730 switch ( lc )
731 {
732 case 0xe2:
733 case 0xe4:
734 case 0xe7:
735 case 0xe8:
736 case 0xe9:
737 case 0xea:
738 case 0xf1:
739 case 0xf4:
740 case 0xf6:
741 case 0xfb:
742 case 0xfc:
743 return true;
744 default:
745 return ( lc>0x60 && lc<0x7b );
746 }
747 }
748
IsAlphaAscii(BYTE c)749 static inline bool IsAlphaAscii ( BYTE c )
750 {
751 BYTE lc = c | 0x20;
752 return ( lc>0x60 && lc<0x7b );
753 }
754
/// output encodings for generated lemmas
enum EMMITERS {EMIT_1BYTE, EMIT_UTF8RU, EMIT_UTF8};

/// emit one char into sOut (single-byte encoding); skips dashes,
/// folds to lowercase; returns the advanced output pointer
template < EMMITERS >
inline BYTE * Emit ( BYTE * sOut, BYTE uChar )
{
	if ( uChar=='-' )
		return sOut;
	*sOut++ = uChar | 0x20;
	return sOut;
}
764
/// emit one Russian cp1251 char as UTF-8; skips dashes, folds to lowercase
template<>
inline BYTE * Emit<EMIT_UTF8RU> ( BYTE * sOut, BYTE uChar )
{
	if ( uChar=='-' )
		return sOut;
	assert ( uChar!=0xA8 && uChar!=0xB8 ); // no country for yo
	uChar |= 0x20; // lowercase, E0..FF range now
	if ( uChar & 0x10 )
	{
		// F0..FF -> D1 80..D1 8F
		*sOut++ = 0xD1;
		*sOut++ = uChar - 0x70;
	} else
	{
		// E0..EF -> D0 B0..D0 BF
		*sOut++ = 0xD0;
		*sOut++ = uChar - 0x30;
	}
	return sOut;
}
785
/// emit one single-byte char as UTF-8; skips dashes, folds to lowercase
/// (except sharp s); high-half bytes are encoded as two UTF-8 bytes,
/// assuming a Latin-1-compatible codepoint mapping
template<>
inline BYTE * Emit<EMIT_UTF8> ( BYTE * sOut, BYTE uChar )
{
	if ( uChar=='-' )
		return sOut;

	if ( uChar!=0xDF ) // don't touch 'ss' umlaut
		uChar |= 0x20;

	if ( uChar & 0x80 )
	{
		// two-byte UTF-8 sequence for the 0x80..0xFF codepoints
		*sOut++ = 0xC0 | (uChar>>6);
		*sOut++ = 0x80 | (uChar&0x3F); // NOLINT
	} else
		*sOut++ = uChar;
	return sOut;
}
803
/// build the lemma text from a word form and its matched morph form
/// @param sOut      output buffer, gets zero-terminated
/// @param sBase     case-folded source word
/// @param iBaseLen  source word length, bytes
/// @param bFound    true if the form came from the dictionary (not predicted)
/// @param M         flexia model the form belongs to; M[0] is the lemma form
/// @param F         the matched form within M
template < EMMITERS IS_UTF8 >
inline void CreateLemma ( BYTE * sOut, const BYTE * sBase, int iBaseLen, bool bFound, const CFlexiaModel & M, const CMorphForm & F )
{
	// cut the form prefix
	int PrefixLen = F.m_PrefixLen;
	if ( bFound || strncmp ( (const char*)sBase, F.m_Prefix, PrefixLen )==0 )
	{
		sBase += PrefixLen;
		iBaseLen -= PrefixLen;
	}

	// FIXME! maybe handle these lemma wide prefixes too?
#if 0
	const string & LemmPrefix = m_pParent->m_Prefixes[m_InnerAnnot.m_PrefixNo];
	if ( m_bFound
		|| (
		( m_InputWordBase.substr ( 0, LemmPrefix.length() )==LemmPrefix ) &&
		( m_InputWordBase.substr ( LemmPrefix.length(), F.m_PrefixStr.length() )==F.m_PrefixStr ) ) )
	{
		m_InputWordBase.erase ( 0, LemmPrefix.length()+ M.m_PrefixStr.length() );
		m_bPrefixesWereCut = true;
	}
#endif

	// cut the form suffix and append the lemma suffix
	// UNLESS this was a predicted form, and form suffix does not fully match!
	// eg. word=GUBARIEVICHA, flexion=IEIVICHA, so this is not really a matching lemma
	int iSuff = F.m_FlexiaLen;
	if ( bFound || ( iBaseLen>=iSuff && strncmp ( (const char*)sBase+iBaseLen-iSuff, F.m_Flexia, iSuff )==0 ) )
	{
		// ok, found and/or suffix matches, the usual route
		// emit the stem, then the lemma flexia (M[0]), capped at SPH_MAX_WORD_LEN chars
		int iCodePoints = 0;
		iBaseLen -= iSuff;
		while ( iBaseLen-- && iCodePoints<SPH_MAX_WORD_LEN )
		{
			sOut = Emit<IS_UTF8> ( sOut, *sBase++ );
			iCodePoints++;
		}

		int iLemmaSuff = M[0].m_FlexiaLen;
		const char * sFlexia = M[0].m_Flexia;
		while ( iLemmaSuff-- && iCodePoints<SPH_MAX_WORD_LEN ) // OPTIMIZE? can remove len here
		{
			sOut = Emit<IS_UTF8> ( sOut, *sFlexia++ );
			iCodePoints++;
		}
	} else
	{
		// whoops, no suffix match, just copy and lowercase the current base
		while ( iBaseLen-- )
			sOut = Emit<IS_UTF8> ( sOut, *sBase++ );
	}
	*sOut = '\0';
}
858
IsRuFreq2(BYTE * pWord)859 static inline bool IsRuFreq2 ( BYTE * pWord )
860 {
861 if ( pWord[2]!=0 )
862 return false;
863
864 int iCode = ( ( pWord[0]<<8 ) + pWord[1] ) | 0x2020;
865 switch ( iCode )
866 {
867 case 0xEDE0: // na
868 case 0xEFEE: // po
869 case 0xEDE5: // ne
870 case 0xEEF2: // ot
871 case 0xE7E0: // za
872 case 0xEEE1: // ob
873 case 0xE4EE: // do
874 case 0xF1EE: // so
875 case 0xE8E7: // iz
876 case 0xE8F5: // ih
877 case 0xF8F2: // sht
878 case 0xF3EB: // ul
879 return true;
880 }
881 return false;
882 }
883
/// frequent 2-char English word check; not implemented yet
static inline bool IsEnFreq2 ( BYTE * )
{
	// stub
	return false;
}
889
/// frequent 2-char German word check; not implemented yet
static inline bool IsDeFreq2 ( BYTE * )
{
	// stub
	return false;
}
895
IsRuFreq3(BYTE * pWord)896 static inline bool IsRuFreq3 ( BYTE * pWord )
897 {
898 if ( pWord[3]!=0 )
899 return false;
900 int iCode = ( ( pWord[0]<<16 ) + ( pWord[1]<<8 ) + pWord[2] ) | 0x202020;
901 return ( iCode==0xE8EBE8 || iCode==0xE4EBFF || iCode==0xEFF0E8 // ili, dlya, pri
902 || iCode==0xE3EEE4 || iCode==0xF7F2EE || iCode==0xE1E5E7 ); // god, chto, bez
903 }
904
/// frequent 3-char English word check; not implemented yet
static inline bool IsEnFreq3 ( BYTE * )
{
	// stub
	return false;
}
910
/// frequent 3-char German word check; not implemented yet
static inline bool IsDeFreq3 ( BYTE * )
{
	// stub
	return false;
}
916
/// lemmatize a Windows-1251 Russian word in place
/// replaces pWord with a single chosen lemma; leaves it untouched if
/// the word is non-Russian, too short, a frequent stopword, or has no forms
void sphAotLemmatizeRu1251 ( BYTE * pWord )
{
	// i must be initialized
	assert ( g_pLemmatizers[AOT_RU] );

	// pass-through 1-char words, and non-Russian words
	if ( !IsAlpha1251(*pWord) || !pWord[1] )
		return;

	// handle a few most frequent 2-char, 3-char pass-through words
	if ( IsRuFreq2(pWord) || IsRuFreq3(pWord) )
		return;

	// do lemmatizing
	// input keyword moves into sForm; LemmatizeWord() will also case fold sForm
	// we will generate results using sForm into pWord; so we need this extra copy
	BYTE sForm [ SPH_MAX_WORD_LEN*3+4 ]; // aka MAX_KEYWORD_BYTES
	int iFormLen = 0;

	// faster than strlen and strcpy..
	for ( BYTE * p=pWord; *p; )
		sForm[iFormLen++] = *p++;
	sForm[iFormLen] = '\0';

	DWORD FindResults[12]; // max results is like 6
	bool bFound = g_pLemmatizers[AOT_RU]->LemmatizeWord ( (BYTE*)sForm, FindResults );
	if ( FindResults[0]==AOT_NOFORM )
		return;

	// pick a single form
	// picks a noun, if possible, and otherwise prefers shorter forms
	bool bNoun = false;
	for ( int i=0; FindResults[i]!=AOT_NOFORM; i++ )
	{
		const CFlexiaModel & M = g_pLemmatizers[AOT_RU]->m_FlexiaModels [ AOT_MODEL_NO ( FindResults[i] ) ];
		const CMorphForm & F = M [ AOT_ITEM_NO ( FindResults[i] ) ];

		bool bNewNoun = ( F.m_POS==0 );
		if ( i==0 || ( !bNoun && bNewNoun ) )
		{
			// first form, or first noun; take it
			CreateLemma<EMIT_1BYTE> ( pWord, sForm, iFormLen, bFound, M, F );
			bNoun = bNewNoun;
		} else if ( bNoun==bNewNoun )
		{
			// same preference class; keep the lexicographically smaller lemma
			BYTE sBuf[256];
			CreateLemma<EMIT_1BYTE> ( sBuf, sForm, iFormLen, bFound, M, F );
			if ( strcmp ( (char*)sBuf, (char*)pWord )<0 )
				strcpy ( (char*)pWord, (char*)sBuf ); // NOLINT
		}
	}
}
968
/// lemmatize an ASCII (English/German) word in place
/// replaces pWord with a single chosen lemma; leaves it untouched if
/// the word is non-alphabetic, too short, a frequent stopword, or has no forms
void sphAotLemmatize ( BYTE * pWord, int iLang )
{
	// i must be initialized
	assert ( g_pLemmatizers[iLang] );

	// pass-through 1-char words, and non-Russian words
	if ( !IsAlphaAscii(*pWord) || !pWord[1] )
		return;

	// handle a few most frequent 2-char, 3-char pass-through words
	if ( iLang==AOT_EN && ( IsEnFreq2(pWord) || IsEnFreq3(pWord) ) )
		return;

	if ( iLang==AOT_DE && ( IsDeFreq2(pWord) || IsDeFreq3(pWord) ) )
		return;

	// do lemmatizing
	// input keyword moves into sForm; LemmatizeWord() will also case fold sForm
	// we will generate results using sForm into pWord; so we need this extra copy
	BYTE sForm [ SPH_MAX_WORD_LEN*3+4 ]; // aka MAX_KEYWORD_BYTES
	int iFormLen = 0;

	// faster than strlen and strcpy..
	for ( BYTE * p=pWord; *p; )
		sForm[iFormLen++] = *p++;
	sForm[iFormLen] = '\0';

	// do nothing with one-char words
	if ( iFormLen<=1 )
		return;

	DWORD FindResults[12]; // max results is like 6
	bool bFound = g_pLemmatizers[iLang]->LemmatizeWord ( (BYTE*)sForm, FindResults );
	if ( FindResults[0]==AOT_NOFORM )
		return;

	// pick a single form
	// picks a noun, if possible, and otherwise prefers shorter forms
	bool bNoun = false;
	for ( int i=0; FindResults[i]!=AOT_NOFORM; i++ )
	{
		const CFlexiaModel & M = g_pLemmatizers[iLang]->m_FlexiaModels [ AOT_MODEL_NO ( FindResults[i] ) ];
		const CMorphForm & F = M [ AOT_ITEM_NO ( FindResults[i] ) ];

		bool bNewNoun = ( F.m_POS==0 );
		if ( i==0 || ( !bNoun && bNewNoun ) )
		{
			// first form, or first noun; take it
			CreateLemma<EMIT_1BYTE> ( pWord, sForm, iFormLen, bFound, M, F );
			bNoun = bNewNoun;
		} else if ( bNoun==bNewNoun )
		{
			// same preference class; keep the lexicographically smaller lemma
			BYTE sBuf[256];
			CreateLemma<EMIT_1BYTE> ( sBuf, sForm, iFormLen, bFound, M, F );
			if ( strcmp ( (char*)sBuf, (char*)pWord )<0 )
				strcpy ( (char*)pWord, (char*)sBuf ); // NOLINT
		}
	}
}
1027
/// check whether a UTF-8 sequence starts with a Russian letter
static inline bool IsRussianAlphaUtf8 ( const BYTE * pWord )
{
	// letters, windows-1251, utf-8
	// A..YA, C0..DF, D0 90..D0 AF
	// a..p, E0..EF, D0 B0..D0 BF
	// r..ya, F0..FF, D1 80..D1 8F
	// YO, A8, D0 81
	// yo, B8, D1 91
	switch ( pWord[0] )
	{
		case 0xD0:
			return ( pWord[1]==0x81 || ( pWord[1]>=0x90 && pWord[1]<0xC0 ) );
		case 0xD1:
			return ( pWord[1]>=0x80 && pWord[1]<=0x91 && pWord[1]!=0x90 );
	}
	return false;
}
1044
/// lemmatize a Windows-1252 German word in place
/// replaces pWord with a single chosen lemma; leaves it untouched if
/// the word is non-German, too short, a frequent stopword, or has no forms
void sphAotLemmatizeDe1252 ( BYTE * pWord )
{
	// i must be initialized
	assert ( g_pLemmatizers[AOT_DE] );

	// pass-through 1-char words, and non-German words
	if ( !IsGermanAlpha1252(*pWord) || !pWord[1] )
		return;

	// handle a few most frequent 2-char, 3-char pass-through words
	if ( IsDeFreq2(pWord) || IsDeFreq3(pWord) )
		return;

	// do lemmatizing
	// input keyword moves into sForm; LemmatizeWord() will also case fold sForm
	// we will generate results using sForm into pWord; so we need this extra copy
	BYTE sForm [ SPH_MAX_WORD_LEN*3+4 ]; // aka MAX_KEYWORD_BYTES
	int iFormLen = 0;

	// faster than strlen and strcpy..
	for ( BYTE * p=pWord; *p; )
		sForm[iFormLen++] = *p++;
	sForm[iFormLen] = '\0';

	DWORD FindResults[12]; // max results is like 6
	bool bFound = g_pLemmatizers[AOT_DE]->LemmatizeWord ( (BYTE*)sForm, FindResults );
	if ( FindResults[0]==AOT_NOFORM )
		return;

	// pick a single form
	// picks a noun, if possible, and otherwise prefers shorter forms
	bool bNoun = false;
	for ( int i=0; FindResults[i]!=AOT_NOFORM; i++ )
	{
		const CFlexiaModel & M = g_pLemmatizers[AOT_DE]->m_FlexiaModels [ AOT_MODEL_NO ( FindResults[i] ) ];
		const CMorphForm & F = M [ AOT_ITEM_NO ( FindResults[i] ) ];

		bool bNewNoun = ( F.m_POS==0 );
		if ( i==0 || ( !bNoun && bNewNoun ) )
		{
			// first form, or first noun; take it
			CreateLemma<EMIT_1BYTE> ( pWord, sForm, iFormLen, bFound, M, F );
			bNoun = bNewNoun;
		} else if ( bNoun==bNewNoun )
		{
			// same preference class; keep the lexicographically smaller lemma
			BYTE sBuf[256];
			CreateLemma<EMIT_1BYTE> ( sBuf, sForm, iFormLen, bFound, M, F );
			if ( strcmp ( (char*)sBuf, (char*)pWord )<0 )
				strcpy ( (char*)pWord, (char*)sBuf ); // NOLINT
		}
	}
}
1096
1097 /// returns length in bytes (aka chars) if all letters were russian and converted
1098 /// returns 0 and aborts early if non-russian letters are encountered
Utf8ToWin1251(BYTE * pOut,const BYTE * pWord)1099 static inline int Utf8ToWin1251 ( BYTE * pOut, const BYTE * pWord )
1100 {
1101 // YO, win A8, utf D0 81
1102 // A..YA, win C0..DF, utf D0 90..D0 AF
1103 // a..p, win E0..EF, utf D0 B0..D0 BF
1104 // r..ya, win F0..FF, utf D1 80..D1 8F
1105 // yo, win B8, utf D1 91
1106 static const BYTE dTable[128] =
1107 {
1108 0, 0xa8, 0, 0, 0, 0, 0, 0, // 00
1109 0, 0, 0, 0, 0, 0, 0, 0, // 08
1110 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, // 10
1111 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, // 18
1112 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, // 20
1113 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, // 28
1114 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, // 30
1115 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, // 38
1116 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, // 40
1117 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, // 48
1118 0, 0xb8, 0, 0, 0, 0, 0, 0, // 50
1119 0, 0, 0, 0, 0, 0, 0, 0, // 58
1120 0, 0, 0, 0, 0, 0, 0, 0, // 60
1121 0, 0, 0, 0, 0, 0, 0, 0, // 68
1122 0, 0, 0, 0, 0, 0, 0, 0, // 70
1123 0, 0, 0, 0, 0, 0, 0, 0 // 78
1124 };
1125
1126 BYTE * pStart = pOut;
1127 while ( *pWord )
1128 {
1129 // russian utf-8 letters begin with either D0 or D1
1130 // and any valid 2nd utf-8 byte must be in 80..BF range
1131 if ( ( *pWord & 0xFE )!=0xD0 )
1132 return 0;
1133 assert ( pWord[1]>=0x80 && pWord[1]<0xC0 );
1134
1135 // table index D0 80..BF to 0..3F, and D1 80..BF to 40..7F
1136 BYTE uWin = dTable [ ( pWord[1] & 0x7F ) + ( ( pWord[0] & 1 )<<6 ) ];
1137 pWord += 2;
1138
1139 if ( !uWin )
1140 return 0;
1141 *pOut++ = uWin;
1142 }
1143
1144 *pOut = '\0';
1145 return (int)( pOut-pStart );
1146 }
1147
1148 /// returns length in bytes (aka chars) if all letters were converted
1149 /// returns 0 and aborts early if non-western letters are encountered
Utf8ToWin1252(BYTE * pOut,const BYTE * pWord)1150 static inline int Utf8ToWin1252 ( BYTE * pOut, const BYTE * pWord )
1151 {
1152 BYTE * pStart = pOut;
1153 while ( *pWord )
1154 {
1155 if ( (*pWord)&0x80 )
1156 {
1157 if ( ((*pWord)&0xFC)==0xC0 )
1158 {
1159 *pOut++ = ( pWord[1] & 0x7F ) + ( ( pWord[0] & 3 )<<6 );
1160 pWord += 2;
1161 } else
1162 return 0;
1163 } else
1164 *pOut++ = *pWord++;
1165 }
1166
1167 *pOut = '\0';
1168 return (int)( pOut-pStart );
1169 }
1170
IsGermanAlphaUtf8(const BYTE * pWord)1171 static inline bool IsGermanAlphaUtf8 ( const BYTE * pWord )
1172 {
1173 // letters, windows-1252, utf-8
1174 // A..Z, trivial
1175 if ( pWord[0]>0x40 && pWord[0]<0x5b )
1176 return true;
1177
1178 // a..z, also trivial
1179 if ( pWord[0]>0x60 && pWord[0]<0x7b )
1180 return true;
1181
1182 // mu, 0xb5
1183 if ( pWord[0]==0xC2 && pWord[1]==0xB5 )
1184 return true;
1185
1186 // some upper
1187 if ( pWord[0]==0xC3 )
1188 {
1189 if ( pWord[1]==0X9F ) // ss umlaut
1190 return true;
1191 switch ( pWord[1] | 0x20 )
1192 {
1193 case 0xA2: // umlauts
1194 case 0xA4:
1195 case 0xA7:
1196 case 0xA8:
1197 case 0xA9:
1198 case 0xAa:
1199 case 0xB1:
1200 case 0xB4:
1201 case 0xB6:
1202 case 0xBb:
1203 case 0xBc:
1204 return true;
1205 }
1206 }
1207 return false;
1208 }
1209
Win1251ToLowercaseUtf8(BYTE * pOut,const BYTE * pWord)1210 static inline void Win1251ToLowercaseUtf8 ( BYTE * pOut, const BYTE * pWord )
1211 {
1212 while ( *pWord )
1213 {
1214 // a..p, E0..EF maps to D0 B0..D0 BF
1215 // r..ya, F0..FF maps to D1 80..D1 8F
1216 // yo maps to D1 91
1217 if ( *pWord>=0xC0 )
1218 {
1219 BYTE iCh = ( *pWord | 0x20 ); // lowercase
1220 BYTE iF = ( iCh>>4 ) & 1; // 0xE? or 0xF? value
1221 *pOut++ = 0xD0 + iF;
1222 *pOut++ = iCh - 0x30 - ( iF<<6 );
1223 } else if ( *pWord==0xA8 || *pWord==0xB8 )
1224 {
1225 *pOut++ = 0xD1;
1226 *pOut++ = 0x91;
1227 } else
1228 assert ( false );
1229 pWord++;
1230 }
1231 *pOut++ = '\0';
1232 }
1233
Win1252ToLowercaseUtf8(BYTE * pOut,const BYTE * pWord)1234 static inline void Win1252ToLowercaseUtf8 ( BYTE * pOut, const BYTE * pWord )
1235 {
1236 while ( *pWord )
1237 {
1238 if ( !((*pWord)&0x80) )
1239 *pOut++ = *pWord | 0x20;
1240 else
1241 {
1242 *pOut++ = 0xC0 | ((*pWord)>>6);
1243 *pOut++ = 0x80 | ((*pWord)&0x3F);
1244 }
1245 ++pWord;
1246 }
1247 *pOut++ = '\0';
1248 }
1249
sphAotLemmatizeRuUTF8(BYTE * pWord)1250 void sphAotLemmatizeRuUTF8 ( BYTE * pWord )
1251 {
1252 // i must be initialized
1253 assert ( g_pLemmatizers[AOT_RU] );
1254
1255 // only if the word is russian
1256 if ( !IsRussianAlphaUtf8(pWord) )
1257 return;
1258
1259 // convert to Windows-1251
1260 // failure means we should not lemmatize this
1261 BYTE sBuf [ SPH_MAX_WORD_LEN+4 ];
1262 if ( !Utf8ToWin1251 ( sBuf, pWord ) )
1263 return;
1264
1265 // lemmatize, convert back, done!
1266 sphAotLemmatizeRu1251 ( sBuf );
1267 Win1251ToLowercaseUtf8 ( pWord, sBuf );
1268 }
1269
sphAotLemmatizeDeUTF8(BYTE * pWord)1270 void sphAotLemmatizeDeUTF8 ( BYTE * pWord )
1271 {
1272 // i must be initialized
1273 assert ( g_pLemmatizers[AOT_DE] );
1274
1275 // only if the word is german
1276 if ( !IsGermanAlphaUtf8(pWord) )
1277 return;
1278
1279 // convert to Windows-1252
1280 // failure means we should not lemmatize this
1281 BYTE sBuf [ SPH_MAX_WORD_LEN+4 ];
1282 if ( !Utf8ToWin1252 ( sBuf, pWord ) )
1283 return;
1284
1285 // lemmatize, convert back, done!
1286 sphAotLemmatizeDe1252 ( sBuf );
1287 Win1252ToLowercaseUtf8 ( pWord, sBuf );
1288 }
1289
sphAotLemmatizeRu(CSphVector<CSphString> & dLemmas,const BYTE * pWord)1290 void sphAotLemmatizeRu ( CSphVector<CSphString> & dLemmas, const BYTE * pWord )
1291 {
1292 assert ( g_pLemmatizers[AOT_RU] );
1293 if ( !IsRussianAlphaUtf8(pWord) )
1294 return;
1295
1296 BYTE sForm [ SPH_MAX_WORD_LEN+4 ];
1297 int iFormLen = 0;
1298 iFormLen = Utf8ToWin1251 ( sForm, pWord );
1299
1300 if ( iFormLen<2 || IsRuFreq2(sForm) )
1301 return;
1302 if ( iFormLen<3 || IsRuFreq3(sForm) )
1303 return;
1304
1305 DWORD FindResults[12]; // max results is like 6
1306 bool bFound = g_pLemmatizers[AOT_RU]->LemmatizeWord ( (BYTE*)sForm, FindResults );
1307 if ( FindResults[0]==AOT_NOFORM )
1308 return;
1309
1310 for ( int i=0; FindResults[i]!=AOT_NOFORM; i++ )
1311 {
1312 const CFlexiaModel & M = g_pLemmatizers[AOT_RU]->m_FlexiaModels [ AOT_MODEL_NO ( FindResults[i] ) ];
1313 const CMorphForm & F = M [ AOT_ITEM_NO ( FindResults[i] ) ];
1314
1315 BYTE sRes [ 3*SPH_MAX_WORD_LEN+4 ];
1316
1317 CreateLemma<EMIT_UTF8RU> ( sRes, sForm, iFormLen, bFound, M, F );
1318 dLemmas.Add ( (const char*)sRes );
1319 }
1320
1321 // OPTIMIZE?
1322 dLemmas.Uniq();
1323 }
1324
sphAotLemmatizeDe(CSphVector<CSphString> & dLemmas,const BYTE * pWord)1325 void sphAotLemmatizeDe ( CSphVector<CSphString> & dLemmas, const BYTE * pWord )
1326 {
1327 assert ( g_pLemmatizers[AOT_DE] );
1328 if ( !IsGermanAlphaUtf8(pWord) )
1329 return;
1330
1331 BYTE sForm [ SPH_MAX_WORD_LEN+4 ];
1332 int iFormLen = 0;
1333 iFormLen = Utf8ToWin1252 ( sForm, pWord );
1334
1335 if ( iFormLen<=1 )
1336 return;
1337
1338 if ( IsDeFreq2(sForm) || IsDeFreq3(sForm) )
1339 return;
1340
1341 DWORD FindResults[12]; // max results is like 6
1342 bool bFound = g_pLemmatizers[AOT_DE]->LemmatizeWord ( (BYTE*)sForm, FindResults );
1343 if ( FindResults[0]==AOT_NOFORM )
1344 return;
1345
1346 for ( int i=0; FindResults[i]!=AOT_NOFORM; i++ )
1347 {
1348 const CFlexiaModel & M = g_pLemmatizers[AOT_DE]->m_FlexiaModels [ AOT_MODEL_NO ( FindResults[i] ) ];
1349 const CMorphForm & F = M [ AOT_ITEM_NO ( FindResults[i] ) ];
1350
1351 BYTE sRes [ 3*SPH_MAX_WORD_LEN+4 ];
1352
1353 CreateLemma<EMIT_UTF8> ( sRes, sForm, iFormLen, bFound, M, F );
1354 dLemmas.Add ( (const char*)sRes );
1355 }
1356
1357 // OPTIMIZE?
1358 dLemmas.Uniq();
1359 }
1360
1361 // generic lemmatize for other languages
sphAotLemmatize(CSphVector<CSphString> & dLemmas,const BYTE * pWord,int iLang)1362 void sphAotLemmatize ( CSphVector<CSphString> & dLemmas, const BYTE * pWord, int iLang )
1363 {
1364 assert ( iLang!=AOT_RU ); // must be processed by the specialized function
1365 assert ( g_pLemmatizers[iLang] );
1366
1367 if ( !IsAlphaAscii(*pWord) )
1368 return;
1369
1370 BYTE sForm [ SPH_MAX_WORD_LEN+4 ];
1371 int iFormLen = 0;
1372
1373 while ( *pWord )
1374 sForm [ iFormLen++ ] = *pWord++;
1375 sForm [ iFormLen ] = '\0';
1376
1377 if ( iFormLen<=1 )
1378 return;
1379
1380 if ( iLang==AOT_EN && ( IsEnFreq2(sForm) || IsEnFreq3(sForm) ) )
1381 return;
1382
1383 if ( iLang==AOT_DE && ( IsDeFreq2(sForm) || IsDeFreq3(sForm) ) )
1384 return;
1385
1386 DWORD FindResults[12]; // max results is like 6
1387 bool bFound = g_pLemmatizers[iLang]->LemmatizeWord ( (BYTE*)sForm, FindResults );
1388 if ( FindResults[0]==AOT_NOFORM )
1389 return;
1390
1391 for ( int i=0; FindResults[i]!=AOT_NOFORM; i++ )
1392 {
1393 const CFlexiaModel & M = g_pLemmatizers[iLang]->m_FlexiaModels [ AOT_MODEL_NO ( FindResults[i] ) ];
1394 const CMorphForm & F = M [ AOT_ITEM_NO ( FindResults[i] ) ];
1395
1396 BYTE sRes [ 3*SPH_MAX_WORD_LEN+4 ];
1397 CreateLemma<EMIT_1BYTE> ( sRes, sForm, iFormLen, bFound, M, F );
1398
1399 dLemmas.Add ( (const char*)sRes );
1400 }
1401
1402 // OPTIMIZE?
1403 dLemmas.Uniq();
1404 }
1405
1406
/// access the per-language dictionary info record (the g_tDictinfos entry;
/// presumably a name/checksum pair given the CSphNamedInt type -- see loader)
const CSphNamedInt & sphAotDictinfo ( int iLang )
{
	return g_tDictinfos[iLang];
}
1411
1412 //////////////////////////////////////////////////////////////////////////
1413
1414 /// token filter for AOT morphology indexing
1415 /// AOT may return multiple (!) morphological hypotheses for a single token
1416 /// we return such additional hypotheses as blended tokens
class CSphAotTokenizerTmpl : public CSphTokenFilter
{
protected:
	// shared state for the per-language GetToken() implementations in subclasses
	BYTE m_sForm [ SPH_MAX_WORD_LEN*3+4 ]; ///< aka MAX_KEYWORD_BYTES
	int m_iFormLen; ///< in bytes, but in windows-1251 that is characters, too
	bool m_bFound; ///< found or predicted?
	DWORD m_FindResults[12]; ///< max results is like 6
	int m_iCurrent; ///< index in m_FindResults that was just returned, -1 means no blending
	BYTE m_sToken [ SPH_MAX_WORD_LEN*3+4 ]; ///< to hold generated lemmas
	BYTE m_sOrigToken [ SPH_MAX_WORD_LEN*3+4 ]; ///< to hold original token
	bool m_bIndexExact; ///< whether to also emit the original form (AOT_ORIGFORM) after the lemmas

	const CSphWordforms * m_pWordforms; ///< wordforms taken over from the dict; shared with clones (see Clone() in subclasses)

public:
	/// wrap pTok; takes over (and disables) the dict wordforms if a dict is given
	/// iLang is only used to assert that the matching lemmatizer is loaded
	CSphAotTokenizerTmpl ( ISphTokenizer * pTok, CSphDict * pDict, bool bIndexExact, int DEBUGARG(iLang) )
		: CSphTokenFilter ( pTok )
	{
		assert ( pTok );
		assert ( g_pLemmatizers[iLang] );
		m_iCurrent = -1;
		m_FindResults[0] = AOT_NOFORM; // no pending lemmas yet
		m_pWordforms = NULL;
		if ( pDict )
		{
			// tricky bit
			// one does not simply take over the wordforms from the dict
			// that would break saving of the (embedded) wordforms data
			// but as this filter replaces wordforms
			m_pWordforms = pDict->GetWordforms();
			pDict->DisableWordforms();
		}
		m_bIndexExact = bIndexExact;
	}

	/// pass the input buffer through to the wrapped tokenizer
	void SetBuffer ( const BYTE * sBuffer, int iLength )
	{
		m_pTokenizer->SetBuffer ( sBuffer, iLength );
	}

	/// report blended while we still have pending lemmas queued for return
	bool TokenIsBlended() const
	{
		return m_iCurrent>=0 || m_pTokenizer->TokenIsBlended();
	}

	/// settings hash; mixes in the wordforms pointer and the index-exact flag
	uint64_t GetSettingsFNV () const
	{
		uint64_t uHash = CSphTokenFilter::GetSettingsFNV();
		uHash ^= (uint64_t)m_pWordforms;
		DWORD uFlags = m_bIndexExact ? 1 : 0;
		uHash = sphFNV64 ( &uFlags, sizeof(uFlags), uHash );
		return uHash;
	}
};
1471
/// Russian-specific AOT token filter; lemmatizes via the windows-1251 space
class CSphAotTokenizerRu : public CSphAotTokenizerTmpl
{
public:
	CSphAotTokenizerRu ( ISphTokenizer * pTok, CSphDict * pDict, bool bIndexExact )
		: CSphAotTokenizerTmpl ( pTok, pDict, bIndexExact, AOT_RU )
	{}

	ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const
	{
		// this token filter must NOT be created as escaped
		// it must only be used during indexing time, NEVER in searching time
		assert ( eMode==SPH_CLONE_INDEX );
		// pass NULL dict so the clone does not disable wordforms again;
		// share the wordforms pointer explicitly instead
		CSphAotTokenizerRu * pClone = new CSphAotTokenizerRu ( m_pTokenizer->Clone ( eMode ), NULL, m_bIndexExact );
		if ( m_pWordforms )
			pClone->m_pWordforms = m_pWordforms;
		return pClone;
	}

	/// return the next token; a word with N lemmatizer hypotheses is emitted
	/// as N blended tokens (plus the original form when index_exact is on)
	BYTE * GetToken()
	{
		m_eTokenMorph = SPH_TOKEN_MORPH_RAW;

		// any pending lemmas left?
		// (m_iCurrent>=0 means a previous call scheduled extra hypotheses)
		if ( m_iCurrent>=0 )
		{
			++m_iCurrent;
			assert ( m_FindResults[m_iCurrent]!=AOT_NOFORM );

			// return original token
			// (AOT_ORIGFORM is the sentinel appended when m_bIndexExact is set)
			if ( m_FindResults[m_iCurrent]==AOT_ORIGFORM )
			{
				assert ( m_FindResults[m_iCurrent+1]==AOT_NOFORM );
				// NOTE(review): strncpy does not NUL-terminate when the source
				// fills the whole buffer; relies on tokens being shorter than
				// the buffer (MAX_KEYWORD_BYTES) -- confirm
				strncpy ( (char*)m_sToken, (char*)m_sOrigToken, sizeof(m_sToken) );
				m_iCurrent = -1;
				m_eTokenMorph = SPH_TOKEN_MORPH_ORIGINAL;
				return m_sToken;
			}

			// generate that lemma
			const CFlexiaModel & M = g_pLemmatizers[AOT_RU]->m_FlexiaModels [ AOT_MODEL_NO ( m_FindResults [ m_iCurrent ] ) ];
			const CMorphForm & F = M [ AOT_ITEM_NO ( m_FindResults [ m_iCurrent ] ) ];
			CreateLemma<EMIT_UTF8RU> ( m_sToken, m_sForm, m_iFormLen, m_bFound, M, F );

			// is this the last one? gotta tag it non-blended
			if ( m_FindResults [ m_iCurrent+1 ]==AOT_NOFORM )
				m_iCurrent = -1;

			// apply post-morphology wordforms, if any
			if ( m_pWordforms && m_pWordforms->m_bHavePostMorphNF )
				m_pWordforms->ToNormalForm ( m_sToken, false );

			m_eTokenMorph = SPH_TOKEN_MORPH_GUESS;
			return m_sToken;
		}

		// ok, time to work on a next word
		assert ( m_iCurrent<0 );
		BYTE * pToken = m_pTokenizer->GetToken();
		if ( !pToken )
			return NULL;

		// pass-through blended parts
		if ( m_pTokenizer->TokenIsBlended() )
			return pToken;

		// pass-through matched wordforms
		if ( m_pWordforms && m_pWordforms->ToNormalForm ( pToken, true ) )
			return pToken;

		// pass-through 1-char "words"
		if ( pToken[1]=='\0' )
			return pToken;

		// pass-through non-Russian words
		if ( !IsRussianAlphaUtf8 ( pToken ) )
			return pToken;

		// convert or copy regular tokens
		// (0 length means conversion failed, which <=1 below also catches)
		m_iFormLen = Utf8ToWin1251 ( m_sForm, pToken );

		// do nothing with one-char words
		if ( m_iFormLen<=1 )
			return pToken;

		// handle a few most frequent 2-char, 3-char pass-through words
		// OPTIMIZE? move up?
		if ( IsRuFreq2 ( m_sForm ) || IsRuFreq3 ( m_sForm ) )
			return pToken;

		// lemmatize
		m_bFound = g_pLemmatizers[AOT_RU]->LemmatizeWord ( m_sForm, m_FindResults );
		if ( m_FindResults[0]==AOT_NOFORM )
		{
			assert ( m_iCurrent<0 );
			return pToken;
		}

		// schedule original form for return, if needed
		// append the AOT_ORIGFORM sentinel after the last hypothesis
		if ( m_bIndexExact )
		{
			int i = 1;
			while ( m_FindResults[i]!=AOT_NOFORM )
				i++;
			m_FindResults[i] = AOT_ORIGFORM;
			m_FindResults[i+1] = AOT_NOFORM;
			strncpy ( (char*)m_sOrigToken, (char*)pToken, sizeof(m_sOrigToken) );
		}

		// in any event, prepare the first lemma for return
		const CFlexiaModel & M = g_pLemmatizers[AOT_RU]->m_FlexiaModels [ AOT_MODEL_NO ( m_FindResults[0] ) ];
		const CMorphForm & F = M [ AOT_ITEM_NO ( m_FindResults[0] ) ];
		CreateLemma<EMIT_UTF8RU> ( pToken, m_sForm, m_iFormLen, m_bFound, M, F );

		// schedule lemmas 2+ for return
		if ( m_FindResults[1]!=AOT_NOFORM )
			m_iCurrent = 0;

		// suddenly, post-morphology wordforms
		if ( m_pWordforms && m_pWordforms->m_bHavePostMorphNF )
			m_pWordforms->ToNormalForm ( pToken, false );

		m_eTokenMorph = SPH_TOKEN_MORPH_GUESS;
		return pToken;
	}
};
1596
/// generic (non-Russian) AOT token filter; German goes through windows-1252,
/// other single-byte languages are copied as-is
class CSphAotTokenizer : public CSphAotTokenizerTmpl
{
	AOT_LANGS m_iLang; ///< target language, fixed at construction
public:
	CSphAotTokenizer ( ISphTokenizer * pTok, CSphDict * pDict, bool bIndexExact, int iLang )
		: CSphAotTokenizerTmpl ( pTok, pDict, bIndexExact, iLang )
		, m_iLang ( AOT_LANGS(iLang) )
	{}

	ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const
	{
		// this token filter must NOT be created as escaped
		// it must only be used during indexing time, NEVER in searching time
		assert ( eMode==SPH_CLONE_INDEX );
		// pass NULL dict so the clone does not disable wordforms again;
		// share the wordforms pointer explicitly instead
		CSphAotTokenizer * pClone = new CSphAotTokenizer ( m_pTokenizer->Clone ( eMode ), NULL, m_bIndexExact, m_iLang );
		if ( m_pWordforms )
			pClone->m_pWordforms = m_pWordforms;
		return pClone;
	}

	/// return the next token; a word with N lemmatizer hypotheses is emitted
	/// as N blended tokens (plus the original form when index_exact is on)
	BYTE * GetToken()
	{
		m_eTokenMorph = SPH_TOKEN_MORPH_RAW;

		// any pending lemmas left?
		// (m_iCurrent>=0 means a previous call scheduled extra hypotheses)
		if ( m_iCurrent>=0 )
		{
			++m_iCurrent;
			assert ( m_FindResults[m_iCurrent]!=AOT_NOFORM );

			// return original token
			// (AOT_ORIGFORM is the sentinel appended when m_bIndexExact is set)
			if ( m_FindResults[m_iCurrent]==AOT_ORIGFORM )
			{
				assert ( m_FindResults[m_iCurrent+1]==AOT_NOFORM );
				// NOTE(review): strncpy does not NUL-terminate when the source
				// fills the whole buffer; relies on tokens being shorter than
				// the buffer (MAX_KEYWORD_BYTES) -- confirm
				strncpy ( (char*)m_sToken, (char*)m_sOrigToken, sizeof(m_sToken) );
				m_iCurrent = -1;
				m_eTokenMorph = SPH_TOKEN_MORPH_ORIGINAL;
				return m_sToken;
			}

			// generate that lemma
			const CFlexiaModel & M = g_pLemmatizers[m_iLang]->m_FlexiaModels [ AOT_MODEL_NO ( m_FindResults [ m_iCurrent ] ) ];
			const CMorphForm & F = M [ AOT_ITEM_NO ( m_FindResults [ m_iCurrent ] ) ];
			CreateLemma<EMIT_UTF8> ( m_sToken, m_sForm, m_iFormLen, m_bFound, M, F );

			// is this the last one? gotta tag it non-blended
			if ( m_FindResults [ m_iCurrent+1 ]==AOT_NOFORM )
				m_iCurrent = -1;

			// apply post-morphology wordforms, if any
			if ( m_pWordforms && m_pWordforms->m_bHavePostMorphNF )
				m_pWordforms->ToNormalForm ( m_sToken, false );

			m_eTokenMorph = SPH_TOKEN_MORPH_GUESS;
			return m_sToken;
		}

		// ok, time to work on a next word
		assert ( m_iCurrent<0 );
		BYTE * pToken = m_pTokenizer->GetToken();
		if ( !pToken )
			return NULL;

		// pass-through blended parts
		if ( m_pTokenizer->TokenIsBlended() )
			return pToken;

		// pass-through matched wordforms
		if ( m_pWordforms && m_pWordforms->ToNormalForm ( pToken, true ) )
			return pToken;

		// pass-through 1-char "words"
		if ( pToken[1]=='\0' )
			return pToken;

		// pass-through words that do not start with a letter of this language
		// (the 1252 check doubles as an ASCII-letter check for non-German)
		if ( m_iLang==AOT_DE )
		{
			if ( !IsGermanAlphaUtf8 ( pToken ) )
				return pToken;
		} else
		{
			if ( !IsGermanAlpha1252 ( pToken[0] ) )
				return pToken;
		}

		// convert or copy regular tokens
		if ( m_iLang==AOT_DE )
			m_iFormLen = Utf8ToWin1252 ( m_sForm, pToken );
		else
		{
			// manual strlen and memcpy; faster this way
			BYTE * p = pToken;
			m_iFormLen = 0;
			while ( *p )
				m_sForm [ m_iFormLen++ ] = *p++;
			m_sForm [ m_iFormLen ] = '\0';
		}

		// do nothing with one-char words
		if ( m_iFormLen<=1 )
			return pToken;

		// handle a few most frequent 2-char, 3-char pass-through words
		// OPTIMIZE? move up?
		if ( ( m_iLang==AOT_DE && ( IsDeFreq2 ( m_sForm ) || IsDeFreq3 ( m_sForm ) ) )
			|| ( m_iLang==AOT_EN && ( IsEnFreq2 ( m_sForm ) || IsEnFreq3 ( m_sForm ) ) ) )
			return pToken;

		// lemmatize
		m_bFound = g_pLemmatizers[m_iLang]->LemmatizeWord ( m_sForm, m_FindResults );
		if ( m_FindResults[0]==AOT_NOFORM )
		{
			assert ( m_iCurrent<0 );
			return pToken;
		}

		// schedule original form for return, if needed
		// append the AOT_ORIGFORM sentinel after the last hypothesis
		if ( m_bIndexExact )
		{
			int i = 1;
			while ( m_FindResults[i]!=AOT_NOFORM )
				i++;
			m_FindResults[i] = AOT_ORIGFORM;
			m_FindResults[i+1] = AOT_NOFORM;
			strncpy ( (char*)m_sOrigToken, (char*)pToken, sizeof(m_sOrigToken) );
		}

		// in any event, prepare the first lemma for return
		const CFlexiaModel & M = g_pLemmatizers[m_iLang]->m_FlexiaModels [ AOT_MODEL_NO ( m_FindResults[0] ) ];
		const CMorphForm & F = M [ AOT_ITEM_NO ( m_FindResults[0] ) ];
		CreateLemma<EMIT_UTF8> ( pToken, m_sForm, m_iFormLen, m_bFound, M, F );

		// schedule lemmas 2+ for return
		if ( m_FindResults[1]!=AOT_NOFORM )
			m_iCurrent = 0;

		// suddenly, post-morphology wordforms
		if ( m_pWordforms && m_pWordforms->m_bHavePostMorphNF )
			m_pWordforms->ToNormalForm ( pToken, false );

		m_eTokenMorph = SPH_TOKEN_MORPH_GUESS;
		return pToken;
	}
};
1741
1742
sphAotCreateFilter(ISphTokenizer * pTokenizer,CSphDict * pDict,bool bIndexExact,DWORD uLangMask)1743 CSphTokenFilter * sphAotCreateFilter ( ISphTokenizer * pTokenizer, CSphDict * pDict, bool bIndexExact, DWORD uLangMask )
1744 {
1745 assert ( uLangMask!=0 );
1746 CSphTokenFilter * pDerivedTokenizer = NULL;
1747 for ( int i=AOT_BEGIN; i<AOT_LENGTH; ++i )
1748 {
1749 if ( uLangMask & (1UL<<i) )
1750 {
1751 if ( i==AOT_RU )
1752 pDerivedTokenizer = new CSphAotTokenizerRu ( pTokenizer, pDict, bIndexExact );
1753 else
1754 pDerivedTokenizer = new CSphAotTokenizer ( pTokenizer, pDict, bIndexExact, i );
1755 pTokenizer = pDerivedTokenizer;
1756 }
1757 }
1758 return pDerivedTokenizer;
1759 }
1760
1761
sphAotShutdown()1762 void sphAotShutdown ()
1763 {
1764 for ( int i=0; i<AOT_LENGTH; i++ )
1765 SafeDelete ( g_pLemmatizers[i] );
1766 }
1767
1768 //
1769 // $Id$
1770 //
1771