1 //
2 // $Id$
3 //
4 
5 //
6 // Copyright (c) 2011-2016, Andrew Aksyonoff
7 // Copyright (c) 2011-2016, Sphinx Technologies Inc
8 // All rights reserved
9 //
10 // This program is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU General Public License. You should have
12 // received a copy of the GPL license along with this program; if you
13 // did not, you can find it at http://www.gnu.org/
14 //
15 // Based on AOT lemmatizer, http://aot.ru/
16 // Copyright (c) 2004-2014, Alexey Sokirko and others
17 //
18 
19 #include "sphinx.h"
20 #include "sphinxint.h"
21 #include "sphinxutils.h"
22 
23 //////////////////////////////////////////////////////////////////////////
24 // LEMMATIZER
25 //////////////////////////////////////////////////////////////////////////
26 
/// "unknown" part-of-speech code (predictions by such models get cancelled)
const BYTE	AOT_POS_UNKNOWN				= 0xff;
/// shortest suffix the suffix automaton is allowed to predict by
const int	AOT_MIN_PREDICTION_SUFFIX	= 3;
/// separator between the word proper and its annotations inside the automatons
const BYTE	AOT_MORPH_ANNOT_CHAR		= '+';
/// max codes per alphabet; also dictates the children cache row size
const int	AOT_MAX_ALPHABET_SIZE		= 54;
/// terminator value for morph info result lists
const DWORD	AOT_NOFORM					= 0xffffffffUL;
/// "emit the original form" marker (not referenced in this chunk)
const DWORD	AOT_ORIGFORM				= 0xfffffffeUL;

static int	g_iCacheSize				= 262144; // in bytes, so 256K


// packed morph info accessors; layout is [model_no:14][item_no:9][prefix_no:9]
#define		AOT_MODEL_NO(_a)	((_a)>>18)
#define		AOT_ITEM_NO(_a)		(((_a)&0x3FFFF)>>9)
#define		AOT_PREFIX_NO(_a)	((_a)&0x1FF)
40 
41 
/// morphological form info
struct CMorphForm
{
	BYTE		m_FlexiaLen;	///< flexia (ending) length, bytes
	BYTE		m_PrefixLen;	///< prefix length, bytes
	BYTE		m_POS;			///< part of speech code (0 appears to mean noun, see sphAotLemmatizeRu1251)
	BYTE		m_Dummy;		///< presumably padding/alignment; not read anywhere in this file
	char		m_Prefix[4];	///< form prefix, zero-terminated in CLemmatizer::LoadPak()
	char		m_Flexia[24];	///< form ending (flexia), zero-terminated in CLemmatizer::LoadPak()
};
52 
53 
/// alphabet descriptor
struct AlphabetDesc_t
{
	int			m_iSize;									///< alphabet size, in codes
	BYTE		m_dCode2Alpha [ AOT_MAX_ALPHABET_SIZE ];	///< code to character, full alphabet
	BYTE		m_dCode2AlphaWA [ AOT_MAX_ALPHABET_SIZE ];	///< code to character, without the annotation char ("WA" = without annotator)
};
61 
62 
/// alphabet codec
/// maps raw characters to dense alphabet codes and back
class CABCEncoder : public ISphNoncopyable
{
public:
	int			m_AlphabetSize;
	int			m_Alphabet2Code[256];					///< char to code, -1 if not in alphabet
	int			m_Alphabet2CodeWithoutAnnotator[256];	///< char to code sans annotation char, -1 if not in alphabet

	void		InitAlphabet ( const AlphabetDesc_t & tDesc );

	/// check that every char of a zero-terminated word is encodable (annotator-less table)
	bool		CheckABCWithoutAnnotator ( const BYTE * pWord ) const;

	/// decode an annotation path back into a packed number (little-endian, base alphabet-size-1)
	DWORD		DecodeFromAlphabet ( const BYTE * sPath, int iPath ) const;
};
75 
76 /// morphology automaton node, 1:31
77 /// 1 bit for "final or not" flag
78 /// 31 bits for index to relations (pointer to the first child)
79 struct CMorphAutomNode
80 {
81 	DWORD		m_Data;
GetChildrenStartCMorphAutomNode82 	DWORD		GetChildrenStart() const	{ return m_Data&(0x80000000-1); }
IsFinalCMorphAutomNode83 	bool		IsFinal() const				{ return (m_Data&0x80000000) > 0; }
84 };
85 
86 
87 /// morphology automaton relation, 8:24
88 /// 8 bits for relational char (aka next char in current form)
89 /// 24 bites for index to nodes (pointer to the next level node)
90 struct CMorphAutomRelation
91 {
92 	DWORD		m_Data;
GetChildNoCMorphAutomRelation93 	DWORD		GetChildNo() const			{ return m_Data & 0xffffff; }
GetRelationalCharCMorphAutomRelation94 	BYTE		GetRelationalChar() const	{ return (BYTE)(m_Data>>24); }
95 };
96 
97 
98 /// morphology automaton
99 class CMorphAutomat : public CABCEncoder
100 {
101 protected:
102 	CMorphAutomNode *		m_pNodes;
103 	int						m_NodesCount;
104 	CMorphAutomRelation *	m_pRelations;
105 	int						m_RelationsCount;
106 
107 	int						m_iCacheSize;
108 	CSphTightVector<int>	m_ChildrenCache;
109 
110 	void	BuildChildrenCache ( int iCacheSize );
111 	int		FindStringAndPassAnnotChar ( const BYTE * pText ) const;
112 
113 public:
CMorphAutomat()114 	CMorphAutomat ()
115 		: m_pNodes ( NULL )
116 		, m_NodesCount ( 0 )
117 		, m_pRelations ( NULL )
118 		, m_RelationsCount ( 0 )
119 		, m_iCacheSize ( 0 )
120 	{}
121 
~CMorphAutomat()122 	~CMorphAutomat ()
123 	{
124 		SafeDelete ( m_pNodes );
125 		SafeDelete ( m_pRelations );
126 	}
127 
GetChildrenCount(int i) const128 	int								GetChildrenCount ( int i ) const	{ return m_pNodes[i+1].GetChildrenStart() - m_pNodes[i].GetChildrenStart(); }
GetChildren(int i) const129 	const CMorphAutomRelation *		GetChildren ( int i ) const			{ return m_pRelations + m_pNodes[i].GetChildrenStart(); }
GetNode(int i) const130 	const CMorphAutomNode			GetNode ( int i ) const				{ return m_pNodes[i]; }
131 
132 public:
133 	bool	LoadPak ( CSphReader & rd, int iCacheSize );
134 	void	GetInnerMorphInfos ( const BYTE * pText, DWORD * Infos ) const;
135 	int		NextNode ( int NodeNo, BYTE Child ) const;
136 };
137 
138 
/// prediction data tuple
/// (one candidate variant emitted by the suffix automaton, see PredictFindRecursive)
struct CPredictTuple
{
	WORD				m_ItemNo;			///< form index within its flexia model
	DWORD				m_LemmaInfoNo;		///< lemma info index (maps to a flexia model via m_LemmaFlexiaModel)
	BYTE				m_PartOfSpeechNo;	///< part-of-speech code
};
146 
147 
/// flexia model is basically a vector of morphology forms
/// (there is other meta stuff like per-model comments but that is now stripped)
typedef CSphVector<CMorphForm> CFlexiaModel;
151 
152 
/// lemmatizer
/// wraps the forms automaton (dictionary lookup) and the suffix automaton
/// (prediction for out-of-dictionary words)
class CLemmatizer
{
protected:
	static const int			MAX_PREFIX_LEN = 12;
	static const bool			m_bMaximalPrediction = false;	///< when false, keep only the most frequent model per POS
	bool						m_bIsGerman;

	BYTE						m_UC[256];				///< character folding (uppercasing) table

	CMorphAutomat				m_FormAutomat;			///< known word forms automaton
	CSphVector<WORD>			m_LemmaFlexiaModel;		///< lemma id to flexia model id mapping
	CSphVector<BYTE>			m_NPSs;					///< per-model part-of-speech codes
	int							m_PrefixLen [ MAX_PREFIX_LEN ];	///< per-length offsets into m_PrefixBlob (negative if none, see IsPrefix)
	CSphVector<BYTE>			m_PrefixBlob;			///< known prefixes as [len][bytes] records

	CMorphAutomat				m_SuffixAutomat;		///< suffix automaton, used for predictions
	CSphVector<DWORD>			m_ModelFreq;			///< flexia model frequencies, used to rank predictions

	bool						IsPrefix ( const BYTE * sPrefix, int iLen ) const;

	/// pack a prediction tuple into the standard morph info DWORD format (prefix_no is always 0)
	DWORD						PredictPack ( const CPredictTuple & t ) const		{ return ( m_LemmaFlexiaModel [ t.m_LemmaInfoNo ]<<18 ) + ( t.m_ItemNo<<9 ); }
	bool						PredictFind ( const BYTE * pWord, int iLen, CSphVector<CPredictTuple> & res ) const;
	void						PredictFindRecursive ( int r, BYTE * sPath, int iPath, CSphVector<CPredictTuple> & Infos ) const;
	void						PredictByDataBase ( const BYTE * pWord, int iLen, DWORD * results, bool is_cap ) const;

public:
	explicit CLemmatizer ( bool IsGerman = false )
		: m_bIsGerman ( IsGerman )
	{}
	CSphVector<CFlexiaModel>	m_FlexiaModels;			///< flexia models
	int							m_iLang;					///< my language

	bool						LemmatizeWord ( BYTE * pWord, DWORD * results ) const;
	bool						LoadPak ( CSphReader & rd );
};
189 
190 //////////////////////////////////////////////////////////////////////////
191 
DecodeFromAlphabet(const BYTE * sPath,int iPath) const192 DWORD CABCEncoder::DecodeFromAlphabet ( const BYTE * sPath, int iPath ) const
193 {
194 	DWORD c = 1;
195 	DWORD Result = 0;
196 	for ( const BYTE * sMax = sPath+iPath; sPath<sMax; sPath++ )
197 	{
198 		Result += m_Alphabet2CodeWithoutAnnotator[*sPath] * c;
199 		c *= m_AlphabetSize - 1;
200 	}
201 	return Result;
202 }
203 
204 
CheckABCWithoutAnnotator(const BYTE * pWord) const205 bool CABCEncoder::CheckABCWithoutAnnotator ( const BYTE * pWord ) const
206 {
207 	while ( *pWord )
208 		if ( m_Alphabet2CodeWithoutAnnotator [ *pWord++ ]==-1 )
209 			return false;
210 	return true;
211 }
212 
213 
InitAlphabet(const AlphabetDesc_t & tDesc)214 void CABCEncoder::InitAlphabet ( const AlphabetDesc_t & tDesc )
215 {
216 	m_AlphabetSize = tDesc.m_iSize;
217 	for ( int i=0; i<256; i++ )
218 	{
219 		m_Alphabet2Code[i] = -1;
220 		m_Alphabet2CodeWithoutAnnotator[i] = -1;
221 	}
222 	for ( int i=0; i<m_AlphabetSize; i++ )
223 		m_Alphabet2Code [ tDesc.m_dCode2Alpha[i] ] = i;
224 	for ( int i=0; i<m_AlphabetSize-1; i++ )
225 		m_Alphabet2CodeWithoutAnnotator [ tDesc.m_dCode2AlphaWA[i] ] = i;
226 }
227 
228 //////////////////////////////////////////////////////////////////////////
229 
BuildChildrenCache(int iCacheSize)230 void CMorphAutomat::BuildChildrenCache ( int iCacheSize )
231 {
232 	iCacheSize /= AOT_MAX_ALPHABET_SIZE*4;
233 	iCacheSize = Max ( iCacheSize, 0 );
234 	m_iCacheSize = Min ( m_NodesCount, iCacheSize );
235 
236 	m_ChildrenCache.Resize ( m_iCacheSize*AOT_MAX_ALPHABET_SIZE );
237 	m_ChildrenCache.Fill ( -1 );
238 	for ( int NodeNo=0; NodeNo<m_iCacheSize; NodeNo++ )
239 	{
240 		const CMorphAutomRelation * pStart = m_pRelations + m_pNodes [ NodeNo ].GetChildrenStart();
241 		const CMorphAutomRelation * pEnd = pStart + GetChildrenCount ( NodeNo );
242 		for ( ; pStart!=pEnd; pStart++ )
243 		{
244 			const CMorphAutomRelation & p = *pStart;
245 			m_ChildrenCache [ NodeNo*AOT_MAX_ALPHABET_SIZE + m_Alphabet2Code [ p.GetRelationalChar() ] ] = p.GetChildNo();
246 		}
247 	}
248 }
249 
250 
/// load nodes and relations from a pak stream, then (re)build the children cache
/// returns false on read error
bool CMorphAutomat::LoadPak ( CSphReader & rd, int iCacheSize )
{
	rd.Tag ( "automaton-nodes" );
	m_NodesCount = rd.UnzipInt();
	// one extra sentinel node is allocated; its m_Data is set below so that
	// GetChildrenCount(m_NodesCount-1) works without a special case
	m_pNodes = new CMorphAutomNode [ m_NodesCount+1 ];
	rd.GetBytes ( m_pNodes, m_NodesCount*sizeof(CMorphAutomNode) );

	rd.Tag ( "automaton-relations" );
	m_RelationsCount = rd.UnzipInt();
	m_pRelations = new CMorphAutomRelation [ m_RelationsCount ];
	rd.GetBytes ( m_pRelations, m_RelationsCount*sizeof(CMorphAutomRelation) );

	if ( rd.GetErrorFlag() )
		return false;

	// sentinel: "children start" of the extra node is the total relations count
	m_pNodes [ m_NodesCount ].m_Data = m_RelationsCount;

#if !USE_LITTLE_ENDIAN
	// pak files store raw little-endian DWORDs; flip them on big-endian hosts
	for ( int i=0; i< m_NodesCount; ++i )
		FlipEndianess ( &m_pNodes[i].m_Data );
	for ( int i=0; i< m_RelationsCount; ++i )
		FlipEndianess ( &m_pRelations[i].m_Data );
#endif

	BuildChildrenCache ( iCacheSize );
	return true;
}
278 
279 
NextNode(int NodeNo,BYTE RelationChar) const280 int CMorphAutomat::NextNode ( int NodeNo, BYTE RelationChar ) const
281 {
282 	if ( NodeNo<m_iCacheSize )
283 	{
284 		int z = m_Alphabet2Code [ RelationChar ];
285 		if ( z==-1 )
286 			return -1;
287 		return m_ChildrenCache [ NodeNo*AOT_MAX_ALPHABET_SIZE + z ];
288 	} else
289 	{
290 		const CMorphAutomRelation * pStart = m_pRelations + m_pNodes [ NodeNo ].GetChildrenStart();
291 		const CMorphAutomRelation * pEnd = pStart + GetChildrenCount ( NodeNo );
292 		for ( ; pStart!=pEnd; pStart++ )
293 		{
294 			const CMorphAutomRelation & p = *pStart;
295 			if ( RelationChar==p.GetRelationalChar() )
296 				return p.GetChildNo();
297 		}
298 		return -1;
299 	}
300 }
301 
302 
FindStringAndPassAnnotChar(const BYTE * pText) const303 int	CMorphAutomat::FindStringAndPassAnnotChar ( const BYTE * pText ) const
304 {
305 	int r = 0;
306 	while ( *pText )
307 	{
308 		int nd = NextNode ( r, *pText++ );
309 		if ( nd==-1 )
310 			return -1;
311 		r = nd;
312 	}
313 	return NextNode ( r, AOT_MORPH_ANNOT_CHAR ); // passing annotation char
314 }
315 
316 
/// dictionary lookup; emits all morph infos for pText into the
/// AOT_NOFORM-terminated Infos list (caller provides enough room)
void CMorphAutomat::GetInnerMorphInfos ( const BYTE * pText, DWORD * Infos ) const
{
	// start with an empty (terminated) result list
	*Infos = AOT_NOFORM;

	int r = FindStringAndPassAnnotChar ( pText );
	if ( r==-1 )
		return;

	// recursively get all interpretations
	// iterative DFS over the annotation subtree; every path down to a final
	// node encodes one packed morph info (decoded via the alphabet codec)
	const int MAX_DEPTH = 32;
	int iLevel = 0;
	BYTE sPath[MAX_DEPTH];		// relational chars along the current path
	int iChild[MAX_DEPTH];		// current relation index, per level
	int iChildMax[MAX_DEPTH];	// end relation index, per level

	iChild[0] = m_pNodes[r].GetChildrenStart();
	iChildMax[0] = m_pNodes[r+1].GetChildrenStart();

	while ( iLevel>=0 )
	{
		while ( iChild[iLevel]<iChildMax[iLevel] )
		{
			CMorphAutomRelation Rel = m_pRelations[iChild[iLevel]];
			int NodeNo = Rel.GetChildNo();
			sPath[iLevel] = Rel.GetRelationalChar();
			iChild[iLevel]++;
			if ( m_pNodes[NodeNo].IsFinal() )
			{
				// complete annotation; decode and emit
				*Infos++ = DecodeFromAlphabet ( sPath, iLevel+1 );
			} else
			{
				// descend into the child node
				iLevel++;
				assert ( iLevel<MAX_DEPTH );
				iChild[iLevel] = m_pNodes[NodeNo].GetChildrenStart();
				iChildMax[iLevel] = m_pNodes[NodeNo+1].GetChildrenStart();
			}
		}
		iLevel--;
	}
	// terminate the results list
	*Infos = AOT_NOFORM;
}
358 
359 //////////////////////////////////////////////////////////////////////////
360 
/// DFS over the suffix automaton, collecting prediction tuples at final nodes
/// a final path looks like "suffix + pos + lemmainfo + item", with
/// AOT_MORPH_ANNOT_CHAR ('+') separating the four sections
void CLemmatizer::PredictFindRecursive ( int NodeNo, BYTE * sPath, int iPath, CSphVector<CPredictTuple> & Infos ) const
{
	const CMorphAutomNode & N = m_SuffixAutomat.GetNode ( NodeNo );
	if ( N.IsFinal() )
	{
		// locate the three annotation separators
		int i = 0;
		while ( i<iPath && sPath[i]!=AOT_MORPH_ANNOT_CHAR )
			i++;

		int j = i+1;
		while ( j<iPath && sPath[j]!=AOT_MORPH_ANNOT_CHAR )
			j++;

		int k = j+1;
		while ( k<iPath && sPath[k]!=AOT_MORPH_ANNOT_CHAR )
			k++;

		// decode the three sections between the separators
		CPredictTuple & A = Infos.Add();
		A.m_PartOfSpeechNo = (BYTE) m_SuffixAutomat.DecodeFromAlphabet ( sPath+i+1, j-i-1 );
		A.m_LemmaInfoNo = m_SuffixAutomat.DecodeFromAlphabet ( sPath+j+1, k-j-1 );
		A.m_ItemNo = (WORD) m_SuffixAutomat.DecodeFromAlphabet ( sPath+k+1, iPath-k-1 );
	}

	// recurse into all children, extending the path by one char
	int Count = m_SuffixAutomat.GetChildrenCount ( NodeNo );
	for ( int i=0; i<Count; i++ )
	{
		const CMorphAutomRelation & p = m_SuffixAutomat.GetChildren ( NodeNo )[i];
		sPath[iPath] = p.GetRelationalChar();
		PredictFindRecursive ( p.GetChildNo(), sPath, iPath+1, Infos );
	}
}
392 
393 
/// predict morphology by the longest matching word suffix
/// appends candidate tuples to res; returns false if the matched suffix is too short
bool CLemmatizer::PredictFind ( const BYTE * pWord, int iLen, CSphVector<CPredictTuple> & res ) const
{
	// FIXME? we might not want to predict words with annot char inside
	// was: if (ReversedWordForm.find(AnnotChar) != string::npos) return false;

	// walk the suffix automaton over the reversed word, as deep as it goes
	int r = 0;
	int i = 0;
	const BYTE * p = pWord + iLen;
	for ( ; i<iLen; i++ )
	{
		int nd = m_SuffixAutomat.NextNode ( r, *--p );
		if ( nd==-1 )
			break;
		r = nd;
	}

	// no prediction by suffix which is less than 3
	if ( i<AOT_MIN_PREDICTION_SUFFIX )
		return false;

	// collect every annotation reachable from the deepest matched node
	assert ( r!=-1 );
	BYTE sPath[128];
	PredictFindRecursive ( r, sPath, 0, res );
	return true;
}
419 
420 
/// check whether the given byte run is a known word prefix
bool CLemmatizer::IsPrefix ( const BYTE * sPrefix, int iLen ) const
{
	// empty prefix is a prefix
	if ( !iLen )
		return true;
	if ( iLen>=MAX_PREFIX_LEN || m_PrefixLen[iLen]<0 )
		return false;

	// m_PrefixBlob holds [len byte][len bytes] records grouped by length;
	// m_PrefixLen[iLen] is the offset of the first record of that length,
	// so we scan while the length byte still matches
	const BYTE * p = &m_PrefixBlob [ m_PrefixLen[iLen] ];
	while ( *p==iLen )
	{
		if ( !memcmp ( p+1, sPrefix, iLen ) )
			return true;
		p += 1+iLen;
	}
	return false;
}
438 
439 
/// returns true if matched in dictionary, false if predicted
/// case-folds pWord in place, then fills the AOT_NOFORM-terminated results list
/// with packed morph infos, either from the dictionary or from prediction
bool CLemmatizer::LemmatizeWord ( BYTE * pWord, DWORD * results ) const
{
	const bool bCap = false; // maybe when we manage to drag this all the way from tokenizer
	const bool bPredict = true;

	// uppercase (and maybe other translations), check, and compute length
	BYTE * p;
	if ( m_iLang==AOT_RU )
	{
		for ( p = pWord; *p; p++ )
		{
			BYTE b = m_UC[*p];
			// russian chars are in 0xC0..0xDF range
			// avoid lemmatizing words with other chars in them
			if ( ( b>>5 )!=6 )
			{
				*results = AOT_NOFORM;
				return false;
			}
			// uppercase
			*p = b;
		}
	} else ///< use the alphabet to reduce another letters
	{
		for ( p = pWord; *p; p++ )
		{
			BYTE b = m_UC[*p];
			// english chars are in 0x61..0x7A range
			// avoid lemmatizing words with other chars in them
			if ( m_FormAutomat.m_Alphabet2CodeWithoutAnnotator[b]<0 )
			{
				*results = AOT_NOFORM;
				return false;
			}
			// uppercase
			*p = b;
		}
	}

	int iLen = (int)( p-pWord );

	// do dictionary lookup
	m_FormAutomat.GetInnerMorphInfos ( pWord, results );
	if ( *results!=AOT_NOFORM )
		return true;
	if_const ( !bPredict )
		return false;

	// attempt prediction by keyword suffix
	// find the longest suffix that finds dictionary results
	// require that suffix to be 4+ chars too
	int iSuffix;
	for ( iSuffix=1; iSuffix<=iLen-4; iSuffix++ )
	{
		m_FormAutomat.GetInnerMorphInfos ( pWord+iSuffix, results );
		if ( *results!=AOT_NOFORM )
			break;
	}

	// cancel suffix predictions with no hyphens, short enough
	// known postfixes, and unknown prefixes
	if ( pWord [ iSuffix-1 ]!='-'
		&& ( iLen-iSuffix )<6
		&& !IsPrefix ( pWord, iSuffix ) )
	{
		*results = AOT_NOFORM;
	}

	// cancel predictions by pronouns, eg [Sem'ykin'ym]
	// (a single AOT_POS_UNKNOWN model cancels the entire list)
	for ( DWORD * pRes=results; *pRes!=AOT_NOFORM; pRes++ )
		if ( m_NPSs[ AOT_MODEL_NO ( *pRes ) ]==AOT_POS_UNKNOWN )
	{
		*results = AOT_NOFORM;
		break;
	}

	// what, still no results?
	if ( *results==AOT_NOFORM )
	{
		// attempt prediction by database
		PredictByDataBase ( pWord, iLen, results, bCap );

		// filter out too short flexias
		// (a flexia as long as the word itself is a degenerate prediction)
		DWORD * s = results;
		DWORD * d = s;
		while ( *s!=AOT_NOFORM )
		{
			const CMorphForm & F = m_FlexiaModels [ AOT_MODEL_NO(*s) ][ AOT_ITEM_NO(*s) ];
			if ( F.m_FlexiaLen<iLen )
				*d++ = *s;
			s++;
		}
		*d = AOT_NOFORM;
	}

	return false;
}
538 
539 
/// predict morph infos for an out-of-dictionary word using the suffix automaton
/// fills the AOT_NOFORM-terminated FindResults list
void CLemmatizer::PredictByDataBase ( const BYTE * pWord, int iLen, DWORD * FindResults, bool is_cap ) const
{
	// FIXME? handle all-consonant abbreviations anyway?
	// was: if ( CheckAbbreviation ( InputWordStr, FindResults, is_cap ) ) return;

	assert ( *FindResults==AOT_NOFORM );
	DWORD * pOut = FindResults;
	CSphVector<CPredictTuple> res;

	// if the ABC is wrong this prediction yields too many variants
	if ( m_FormAutomat.CheckABCWithoutAnnotator ( pWord ) )
		PredictFind ( pWord, iLen, res );

	// assume not more than 32 different pos
	// has_nps[pos] is the output slot index already taken by that pos, or -1
	int has_nps[32];
	for ( int i=0; i<32; i++ )
		has_nps[i] = -1;

	ARRAY_FOREACH ( j, res )
	{
		BYTE PartOfSpeechNo = res[j].m_PartOfSpeechNo;
		// in non-maximal mode, keep just one (the most frequent) model per pos
		if_const ( !m_bMaximalPrediction && has_nps[PartOfSpeechNo]!=-1 )
		{
			int iOldFreq = m_ModelFreq [ AOT_MODEL_NO ( FindResults[has_nps[PartOfSpeechNo]] ) ];
			int iNewFreq = m_ModelFreq [ m_LemmaFlexiaModel [ res[j].m_LemmaInfoNo ] ];
			if ( iOldFreq < iNewFreq )
				FindResults [ has_nps [ PartOfSpeechNo ] ] = PredictPack ( res[j] );
			continue;
		}

		has_nps [ PartOfSpeechNo ] = (int)( pOut-FindResults );
		*pOut++ = PredictPack ( res[j] );
		*pOut = AOT_NOFORM;
	}

	if	( has_nps[0]==-1 // no noun
		|| ( is_cap && !m_bIsGerman ) ) // or can be a proper noun (except German, where all nouns are written uppercase)
	{
		// force-add a noun prediction via the special "+++" annotation lookup
		static BYTE CriticalNounLetterPack[4] = "+++";
		PredictFind ( CriticalNounLetterPack, AOT_MIN_PREDICTION_SUFFIX, res );
		*pOut++ = PredictPack ( res.Last() );
		*pOut = AOT_NOFORM;
	}
}
584 
585 
/// load the full lemmatizer pak: alphabets, folding table, both automatons,
/// flexia models, prefixes, and model metadata; returns false on error
bool CLemmatizer::LoadPak ( CSphReader & rd )
{
	rd.Tag ( "sphinx-aot" );
	int iVer = rd.UnzipInt();
	// only format version 1 is supported
	if ( iVer!=1 )
		return false;

	rd.Tag ( "alphabet-desc" );
	AlphabetDesc_t tDesc;
	tDesc.m_iSize = rd.UnzipInt();
	rd.GetBytes ( tDesc.m_dCode2Alpha, tDesc.m_iSize );
	rd.GetBytes ( tDesc.m_dCode2AlphaWA, tDesc.m_iSize );

	// both automatons share one alphabet
	m_FormAutomat.InitAlphabet ( tDesc );
	m_SuffixAutomat.InitAlphabet ( tDesc );

	rd.Tag ( "uc-table" );
	rd.GetBytes ( m_UC, 256 );

	// caching forms can help a lot (from 4% with 256K cache to 13% with 110M cache)
	// NOTE(review): the LoadPak() return value is ignored here; read errors are
	// presumably caught by the final rd.GetErrorFlag() check — confirm
	rd.Tag ( "forms-automaton" );
	m_FormAutomat.LoadPak ( rd, g_iCacheSize );

	rd.Tag ( "flexia-models" );
	m_FlexiaModels.Resize ( rd.UnzipInt() );
	ARRAY_FOREACH ( i, m_FlexiaModels )
	{
		m_FlexiaModels[i].Resize ( rd.UnzipInt() );
		ARRAY_FOREACH ( j, m_FlexiaModels[i] )
		{
			CMorphForm & F = m_FlexiaModels[i][j];
			F.m_FlexiaLen = (BYTE) rd.GetByte();
			rd.GetBytes ( F.m_Flexia, F.m_FlexiaLen );
			F.m_PrefixLen = (BYTE) rd.GetByte();
			rd.GetBytes ( F.m_Prefix, F.m_PrefixLen );
			F.m_POS = (BYTE) rd.GetByte();

			// zero-terminate both in-struct strings
			assert ( F.m_FlexiaLen<sizeof(F.m_Flexia) );
			assert ( F.m_PrefixLen<sizeof(F.m_Prefix) );
			F.m_Flexia[F.m_FlexiaLen] = 0;
			F.m_Prefix[F.m_PrefixLen] = 0;
		}
	}

	rd.Tag ( "prefixes" );
	for ( int i=0; i<MAX_PREFIX_LEN; i++ )
		m_PrefixLen[i] = rd.UnzipInt();
	m_PrefixBlob.Resize ( rd.UnzipInt() );
	rd.GetBytes ( m_PrefixBlob.Begin(), m_PrefixBlob.GetLength() );

	rd.Tag ( "lemma-flexia-models" );
	m_LemmaFlexiaModel.Resize ( rd.UnzipInt() );
	ARRAY_FOREACH ( i, m_LemmaFlexiaModel )
		m_LemmaFlexiaModel[i] = (WORD) rd.UnzipInt();

	// build model freqs
	// (how many lemmas use each flexia model; used to rank predictions)
	m_ModelFreq.Resize ( m_FlexiaModels.GetLength() );
	m_ModelFreq.Fill ( 0 );
	ARRAY_FOREACH ( i, m_LemmaFlexiaModel )
		m_ModelFreq [ m_LemmaFlexiaModel[i] ]++;

	rd.Tag ( "nps-vector" );
	m_NPSs.Resize ( rd.UnzipInt() );
	rd.GetBytes ( m_NPSs.Begin(), m_NPSs.GetLength() );

	// caching predictions does not measurably affect performance though
	rd.Tag ( "prediction-automaton" );
	m_SuffixAutomat.LoadPak ( rd, 0 );

	rd.Tag ( "eof" );
	return !rd.GetErrorFlag();
}
658 
659 //////////////////////////////////////////////////////////////////////////
660 // SPHINX MORPHOLOGY INTERFACE
661 //////////////////////////////////////////////////////////////////////////
662 
/// supported language codes, indexed by the AOT_xx language enum
const char* AOT_LANGUAGES[AOT_LENGTH] = {"ru", "en", "de" };

static CLemmatizer *	g_pLemmatizers[AOT_LENGTH] = {0};	///< per-language lemmatizer singletons, created by AotInit()
static CSphNamedInt		g_tDictinfos[AOT_LENGTH];			///< per-language dictionary basename and crc32
667 
sphAotSetCacheSize(int iCacheSize)668 void sphAotSetCacheSize ( int iCacheSize )
669 {
670 	g_iCacheSize = Max ( iCacheSize, 0 );
671 }
672 
/// create and load the per-language lemmatizer singleton from a dictionary file
/// no-op if that language is already initialized; returns false (with sError set) on failure
bool AotInit ( const CSphString & sDictFile, CSphString & sError, int iLang )
{
	if ( g_pLemmatizers[iLang] )
		return true;

	CSphAutofile rdFile;
	if ( rdFile.Open ( sDictFile, SPH_O_READ, sError )<0 )
		return false;

	g_pLemmatizers[iLang] = new CLemmatizer ( iLang==AOT_DE );
	g_pLemmatizers[iLang]->m_iLang = iLang;

	CSphReader rd;
	rd.SetFile ( rdFile );
	if ( !g_pLemmatizers[iLang]->LoadPak(rd) )
	{
		sError.SetSprintf ( "failed to load lemmatizer dictionary: %s", rd.GetErrorMessage().cstr() );
		SafeDelete ( g_pLemmatizers[iLang] );
		return false;
	}

	// track dictionary crc
	DWORD uCrc;
	if ( !sphCalcFileCRC32 ( sDictFile.cstr(), uCrc ) )
	{
		sError.SetSprintf ( "failed to crc32 lemmatizer dictionary %s", sDictFile.cstr() );
		SafeDelete ( g_pLemmatizers[iLang] );
		return false;
	}

	// extract basename
	// (strip everything up to the last forward or backward slash)
	const char * a = sDictFile.cstr();
	const char * b = a + strlen(a) - 1;
	while ( b>a && b[-1]!='/' && b[-1]!='\\' )
		b--;

	g_tDictinfos[iLang].m_sName = b;
	g_tDictinfos[iLang].m_iValue = (int)uCrc;
	return true;
}
713 
/// public wrapper over AotInit(); see AotInit() for details
bool sphAotInit ( const CSphString & sDictFile, CSphString & sError, int iLang )
{
	return AotInit ( sDictFile, sError, iLang );
}
718 
IsAlpha1251(BYTE c)719 static inline bool IsAlpha1251 ( BYTE c )
720 {
721 	return ( c>=0xC0 || c==0xA8 || c==0xB8 );
722 }
723 
IsGermanAlpha1252(BYTE c)724 static inline bool IsGermanAlpha1252 ( BYTE c )
725 {
726 	if ( c==0xb5 || c==0xdf )
727 		return true;
728 
729 	BYTE lc = c | 0x20;
730 	switch ( lc )
731 	{
732 	case 0xe2:
733 	case 0xe4:
734 	case 0xe7:
735 	case 0xe8:
736 	case 0xe9:
737 	case 0xea:
738 	case 0xf1:
739 	case 0xf4:
740 	case 0xf6:
741 	case 0xfb:
742 	case 0xfc:
743 		return true;
744 	default:
745 		return ( lc>0x60 && lc<0x7b );
746 	}
747 }
748 
IsAlphaAscii(BYTE c)749 static inline bool IsAlphaAscii ( BYTE c )
750 {
751 	BYTE lc = c | 0x20;
752 	return ( lc>0x60 && lc<0x7b );
753 }
754 
755 enum EMMITERS {EMIT_1BYTE, EMIT_UTF8RU, EMIT_UTF8};
756 template < EMMITERS >
Emit(BYTE * sOut,BYTE uChar)757 inline BYTE * Emit ( BYTE * sOut, BYTE uChar )
758 {
759 	if ( uChar=='-' )
760 		return sOut;
761 	*sOut++ = uChar | 0x20;
762 	return sOut;
763 }
764 
/// emit one lowercased char, converting windows-1251 Russian to UTF-8
/// hyphens are dropped
template<>
inline BYTE * Emit<EMIT_UTF8RU> ( BYTE * sOut, BYTE uChar )
{
	if ( uChar=='-' )
		return sOut;
	assert ( uChar!=0xA8 && uChar!=0xB8 ); // no country for yo
	uChar |= 0x20; // lowercase, E0..FF range now
	if ( uChar & 0x10 )
	{
		// F0..FF -> D1 80..D1 8F
		*sOut++ = 0xD1;
		*sOut++ = uChar - 0x70;
	} else
	{
		// E0..EF -> D0 B0..D0 BF
		*sOut++ = 0xD0;
		*sOut++ = uChar - 0x30;
	}
	return sOut;
}
785 
/// emit one lowercased char, converting single-byte (latin-1 range) to UTF-8
/// hyphens are dropped
template<>
inline BYTE * Emit<EMIT_UTF8> ( BYTE * sOut, BYTE uChar )
{
	if ( uChar=='-' )
		return sOut;

	if ( uChar!=0xDF ) // don't touch 'ss' umlaut
		uChar |= 0x20;

	if ( uChar & 0x80 )
	{
		// two-byte UTF-8 sequence for 0x80..0xFF codepoints
		*sOut++ = 0xC0 | (uChar>>6);
		*sOut++ = 0x80 | (uChar&0x3F); // NOLINT
	} else
		*sOut++ = uChar;
	return sOut;
}
803 
/// build the lemma for sBase into sOut, given its flexia model M and form F
/// strips the form's prefix and flexia, then appends the lemma (first form) flexia;
/// bFound tells whether the form came from the dictionary (vs prediction)
template < EMMITERS IS_UTF8 >
inline void CreateLemma ( BYTE * sOut, const BYTE * sBase, int iBaseLen, bool bFound, const CFlexiaModel & M, const CMorphForm & F )
{
	// cut the form prefix
	int PrefixLen = F.m_PrefixLen;
	if	( bFound || strncmp ( (const char*)sBase, F.m_Prefix, PrefixLen )==0 )
	{
		sBase += PrefixLen;
		iBaseLen -= PrefixLen;
	}

	// FIXME! maybe handle these lemma wide prefixes too?
#if 0
	const string & LemmPrefix = m_pParent->m_Prefixes[m_InnerAnnot.m_PrefixNo];
	if ( m_bFound
		|| (
		( m_InputWordBase.substr ( 0, LemmPrefix.length() )==LemmPrefix ) &&
		( m_InputWordBase.substr ( LemmPrefix.length(), F.m_PrefixStr.length() )==F.m_PrefixStr ) ) )
	{
		m_InputWordBase.erase ( 0, LemmPrefix.length()+ M.m_PrefixStr.length() );
		m_bPrefixesWereCut = true;
	}
#endif

	// cut the form suffix and append the lemma suffix
	// UNLESS this was a predicted form, and form suffix does not fully match!
	// eg. word=GUBARIEVICHA, flexion=IEIVICHA, so this is not really a matching lemma
	int iSuff = F.m_FlexiaLen;
	if ( bFound || ( iBaseLen>=iSuff && strncmp ( (const char*)sBase+iBaseLen-iSuff, F.m_Flexia, iSuff )==0 ) )
	{
		// ok, found and/or suffix matches, the usual route
		// emit the stem, then the lemma (M[0]) flexia, capped at SPH_MAX_WORD_LEN
		int iCodePoints = 0;
		iBaseLen -= iSuff;
		while ( iBaseLen-- && iCodePoints<SPH_MAX_WORD_LEN )
		{
			sOut = Emit<IS_UTF8> ( sOut, *sBase++ );
			iCodePoints++;
		}

		int iLemmaSuff = M[0].m_FlexiaLen;
		const char * sFlexia = M[0].m_Flexia;
		while ( iLemmaSuff-- && iCodePoints<SPH_MAX_WORD_LEN ) // OPTIMIZE? can remove len here
		{
			sOut = Emit<IS_UTF8> ( sOut, *sFlexia++ );
			iCodePoints++;
		}
	} else
	{
		// whoops, no suffix match, just copy and lowercase the current base
		while ( iBaseLen-- )
			sOut = Emit<IS_UTF8> ( sOut, *sBase++ );
	}
	*sOut = '\0';
}
858 
/// check for the most frequent Russian 2-char words (cp1251, case-folded)
/// these are passed through without lemmatization
static inline bool IsRuFreq2 ( BYTE * pWord )
{
	// must be exactly 2 chars long
	if ( pWord[2]!=0 )
		return false;

	// pack both (lowercased) bytes into one code for a single switch
	int iCode = ( ( pWord[0]<<8 ) + pWord[1] ) | 0x2020;
	switch ( iCode )
	{
		case 0xEDE0: // na
		case 0xEFEE: // po
		case 0xEDE5: // ne
		case 0xEEF2: // ot
		case 0xE7E0: // za
		case 0xEEE1: // ob
		case 0xE4EE: // do
		case 0xF1EE: // so
		case 0xE8E7: // iz
		case 0xE8F5: // ih
		case 0xF8F2: // sht
		case 0xF3EB: // ul
			return true;
	}
	return false;
}
883 
IsEnFreq2(BYTE *)884 static inline bool IsEnFreq2 ( BYTE * )
885 {
886 	// stub
887 	return false;
888 }
889 
IsDeFreq2(BYTE *)890 static inline bool IsDeFreq2 ( BYTE * )
891 {
892 	// stub
893 	return false;
894 }
895 
/// check for the most frequent Russian 3-char words (cp1251, case-folded)
/// these are passed through without lemmatization
static inline bool IsRuFreq3 ( BYTE * pWord )
{
	// must be exactly 3 chars long
	if ( pWord[3]!=0 )
		return false;
	// pack all three (lowercased) bytes into one code for the comparisons
	int iCode = ( ( pWord[0]<<16 ) + ( pWord[1]<<8 ) + pWord[2] ) | 0x202020;
	return ( iCode==0xE8EBE8 || iCode==0xE4EBFF || iCode==0xEFF0E8 // ili, dlya, pri
		|| iCode==0xE3EEE4 || iCode==0xF7F2EE || iCode==0xE1E5E7 ); // god, chto, bez
}
904 
IsEnFreq3(BYTE *)905 static inline bool IsEnFreq3 ( BYTE * )
906 {
907 	// stub
908 	return false;
909 }
910 
IsDeFreq3(BYTE *)911 static inline bool IsDeFreq3 ( BYTE * )
912 {
913 	// stub
914 	return false;
915 }
916 
/// lemmatize a windows-1251 Russian keyword in place, picking a single best lemma
/// prefers nouns, then the lexicographically smallest lemma
void sphAotLemmatizeRu1251 ( BYTE * pWord )
{
	// i must be initialized
	assert ( g_pLemmatizers[AOT_RU] );

	// pass-through 1-char words, and non-Russian words
	if ( !IsAlpha1251(*pWord) || !pWord[1] )
		return;

	// handle a few most frequent 2-char, 3-char pass-through words
	if ( IsRuFreq2(pWord) || IsRuFreq3(pWord) )
		return;

	// do lemmatizing
	// input keyword moves into sForm; LemmatizeWord() will also case fold sForm
	// we will generate results using sForm into pWord; so we need this extra copy
	BYTE sForm [ SPH_MAX_WORD_LEN*3+4 ]; // aka MAX_KEYWORD_BYTES
	int iFormLen = 0;

	// faster than strlen and strcpy..
	for ( BYTE * p=pWord; *p; )
		sForm[iFormLen++] = *p++;
	sForm[iFormLen] = '\0';

	DWORD FindResults[12]; // max results is like 6
	bool bFound = g_pLemmatizers[AOT_RU]->LemmatizeWord ( (BYTE*)sForm, FindResults );
	if ( FindResults[0]==AOT_NOFORM )
		return;

	// pick a single form
	// picks a noun, if possible, and otherwise prefers shorter forms
	bool bNoun = false;
	for ( int i=0; FindResults[i]!=AOT_NOFORM; i++ )
	{
		const CFlexiaModel & M = g_pLemmatizers[AOT_RU]->m_FlexiaModels [ AOT_MODEL_NO ( FindResults[i] ) ];
		const CMorphForm & F = M [ AOT_ITEM_NO ( FindResults[i] ) ];

		bool bNewNoun = ( F.m_POS==0 );
		if ( i==0 || ( !bNoun && bNewNoun ) )
		{
			// first candidate, or first noun; take it
			CreateLemma<EMIT_1BYTE> ( pWord, sForm, iFormLen, bFound, M, F );
			bNoun = bNewNoun;
		} else if ( bNoun==bNewNoun )
		{
			// same POS class; keep the lexicographically smaller lemma
			BYTE sBuf[256];
			CreateLemma<EMIT_1BYTE> ( sBuf, sForm, iFormLen, bFound, M, F );
			if ( strcmp ( (char*)sBuf, (char*)pWord )<0 )
				strcpy ( (char*)pWord, (char*)sBuf ); // NOLINT
		}
	}
}
968 
/// lemmatize an ascii (en/de) keyword in place, picking a single best lemma
/// near-duplicate of sphAotLemmatizeRu1251, parameterized by language
void sphAotLemmatize ( BYTE * pWord, int iLang )
{
	// i must be initialized
	assert ( g_pLemmatizers[iLang] );

	// pass-through 1-char words, and non-Russian words
	if ( !IsAlphaAscii(*pWord) || !pWord[1] )
		return;

	// handle a few most frequent 2-char, 3-char pass-through words
	if ( iLang==AOT_EN && ( IsEnFreq2(pWord) || IsEnFreq3(pWord) ) )
		return;

	if ( iLang==AOT_DE && ( IsDeFreq2(pWord) || IsDeFreq3(pWord) ) )
		return;

	// do lemmatizing
	// input keyword moves into sForm; LemmatizeWord() will also case fold sForm
	// we will generate results using sForm into pWord; so we need this extra copy
	BYTE sForm [ SPH_MAX_WORD_LEN*3+4 ]; // aka MAX_KEYWORD_BYTES
	int iFormLen = 0;

	// faster than strlen and strcpy..
	for ( BYTE * p=pWord; *p; )
		sForm[iFormLen++] = *p++;
	sForm[iFormLen] = '\0';

	// do nothing with one-char words
	if ( iFormLen<=1 )
		return;

	DWORD FindResults[12]; // max results is like 6
	bool bFound = g_pLemmatizers[iLang]->LemmatizeWord ( (BYTE*)sForm, FindResults );
	if ( FindResults[0]==AOT_NOFORM )
		return;

	// pick a single form
	// picks a noun, if possible, and otherwise prefers shorter forms
	bool bNoun = false;
	for ( int i=0; FindResults[i]!=AOT_NOFORM; i++ )
	{
		const CFlexiaModel & M = g_pLemmatizers[iLang]->m_FlexiaModels [ AOT_MODEL_NO ( FindResults[i] ) ];
		const CMorphForm & F = M [ AOT_ITEM_NO ( FindResults[i] ) ];

		bool bNewNoun = ( F.m_POS==0 );
		if ( i==0 || ( !bNoun && bNewNoun ) )
		{
			// first candidate, or first noun; take it
			CreateLemma<EMIT_1BYTE> ( pWord, sForm, iFormLen, bFound, M, F );
			bNoun = bNewNoun;
		} else if ( bNoun==bNewNoun )
		{
			// same POS class; keep the lexicographically smaller lemma
			BYTE sBuf[256];
			CreateLemma<EMIT_1BYTE> ( sBuf, sForm, iFormLen, bFound, M, F );
			if ( strcmp ( (char*)sBuf, (char*)pWord )<0 )
				strcpy ( (char*)pWord, (char*)sBuf ); // NOLINT
		}
	}
}
1027 
IsRussianAlphaUtf8(const BYTE * pWord)1028 static inline bool IsRussianAlphaUtf8 ( const BYTE * pWord )
1029 {
1030 	// letters, windows-1251, utf-8
1031 	// A..YA, C0..DF, D0 90..D0 AF
1032 	// a..p, E0..EF, D0 B0..D0 BF
1033 	// r..ya, F0..FF, D1 80..D1 8F
1034 	// YO, A8, D0 81
1035 	// yo, B8, D1 91
1036 	if ( pWord[0]==0xD0 )
1037 		if ( pWord[1]==0x81 || ( pWord[1]>=0x90 && pWord[1]<0xC0 ) )
1038 			return true;
1039 	if ( pWord[0]==0xD1 )
1040 		if ( pWord[1]>=0x80 && pWord[1]<=0x91 && pWord[1]!=0x90 )
1041 			return true;
1042 	return false;
1043 }
1044 
/// lemmatize a windows-1252 German word in place, replacing it with one
/// "best" normal form (noun preferred, then lexicographically smallest)
void sphAotLemmatizeDe1252 ( BYTE * pWord )
{
	// the German lemmatizer must be initialized
	assert ( g_pLemmatizers[AOT_DE] );

	// pass-through 1-char words, and non-German words
	if ( !IsGermanAlpha1252(*pWord) || !pWord[1] )
		return;

	// handle a few most frequent 2-char, 3-char pass-through words
	if ( IsDeFreq2(pWord) || IsDeFreq3(pWord) )
		return;

	// do lemmatizing
	// input keyword moves into sForm; LemmatizeWord() will also case fold sForm
	// we will generate results using sForm into pWord; so we need this extra copy
	BYTE sForm [ SPH_MAX_WORD_LEN*3+4 ]; // aka MAX_KEYWORD_BYTES
	int iFormLen = 0;

	// faster than strlen and strcpy..
	for ( BYTE * p=pWord; *p; )
		sForm[iFormLen++] = *p++;
	sForm[iFormLen] = '\0';

	DWORD FindResults[12]; // max results is like 6
	bool bFound = g_pLemmatizers[AOT_DE]->LemmatizeWord ( (BYTE*)sForm, FindResults );
	if ( FindResults[0]==AOT_NOFORM )
		return;

	// pick a single form
	// picks a noun, if possible, and otherwise the lexicographically smallest lemma
	bool bNoun = false;
	for ( int i=0; FindResults[i]!=AOT_NOFORM; i++ )
	{
		// decode the packed result into a flexia model and an item within it
		const CFlexiaModel & M = g_pLemmatizers[AOT_DE]->m_FlexiaModels [ AOT_MODEL_NO ( FindResults[i] ) ];
		const CMorphForm & F = M [ AOT_ITEM_NO ( FindResults[i] ) ];

		// m_POS==0 is treated as the noun class here
		bool bNewNoun = ( F.m_POS==0 );
		if ( i==0 || ( !bNoun && bNewNoun ) )
		{
			// very first candidate, or the first noun seen; emit straight into pWord
			CreateLemma<EMIT_1BYTE> ( pWord, sForm, iFormLen, bFound, M, F );
			bNoun = bNewNoun;
		} else if ( bNoun==bNewNoun )
		{
			// same class as the current pick; keep the smaller lemma
			BYTE sBuf[256];
			CreateLemma<EMIT_1BYTE> ( sBuf, sForm, iFormLen, bFound, M, F );
			if ( strcmp ( (char*)sBuf, (char*)pWord )<0 )
				strcpy ( (char*)pWord, (char*)sBuf ); // NOLINT
		}
	}
}
1096 
1097 /// returns length in bytes (aka chars) if all letters were russian and converted
1098 /// returns 0 and aborts early if non-russian letters are encountered
/// returns length in bytes (aka chars) if all letters were russian and converted
/// returns 0 and aborts early if non-russian letters are encountered
static inline int Utf8ToWin1251 ( BYTE * pOut, const BYTE * pWord )
{
	// mapping of UTF-8 second bytes to windows-1251 codes:
	// YO, win A8, utf D0 81
	// A..YA, win C0..DF, utf D0 90..D0 AF
	// a..p, win E0..EF, utf D0 B0..D0 BF
	// r..ya, win F0..FF, utf D1 80..D1 8F
	// yo, win B8, utf D1 91
	// zero entries mark second bytes that are not Russian letters
	static const BYTE dTable[128] =
	{
		0, 0xa8, 0, 0, 0, 0, 0, 0, // 00
		0, 0, 0, 0, 0, 0, 0, 0, // 08
		0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, // 10
		0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, // 18
		0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, // 20
		0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, // 28
		0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, // 30
		0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, // 38
		0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, // 40
		0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, // 48
		0, 0xb8, 0, 0, 0, 0, 0, 0, // 50
		0, 0, 0, 0, 0, 0, 0, 0, // 58
		0, 0, 0, 0, 0, 0, 0, 0, // 60
		0, 0, 0, 0, 0, 0, 0, 0, // 68
		0, 0, 0, 0, 0, 0, 0, 0, // 70
		0, 0, 0, 0, 0, 0, 0, 0 // 78
	};

	BYTE * pStart = pOut;
	while ( *pWord )
	{
		// russian utf-8 letters begin with either D0 or D1
		// and any valid 2nd utf-8 byte must be in 80..BF range
		// ( *pWord & 0xFE )==0xD0 matches both D0 and D1 at once
		if ( ( *pWord & 0xFE )!=0xD0 )
			return 0;
		// NOTE(review): well-formedness of the 2nd byte is only checked via
		// assert; in release builds a malformed sequence indexes dTable anyway
		// (still within bounds thanks to the & 0x7F below) -- confirm callers
		// feed valid UTF-8
		assert ( pWord[1]>=0x80 && pWord[1]<0xC0 );

		// table index D0 80..BF to 0..3F, and D1 80..BF to 40..7F
		BYTE uWin = dTable [ ( pWord[1] & 0x7F ) + ( ( pWord[0] & 1 )<<6 ) ];
		pWord += 2;

		// a zero table entry means "not a Russian letter"; abort
		if ( !uWin )
			return 0;
		*pOut++ = uWin;
	}

	*pOut = '\0';
	return (int)( pOut-pStart );
}
1147 
1148 /// returns length in bytes (aka chars) if all letters were converted
1149 /// returns 0 and aborts early if non-western letters are encountered
Utf8ToWin1252(BYTE * pOut,const BYTE * pWord)1150 static inline int Utf8ToWin1252 ( BYTE * pOut, const BYTE * pWord )
1151 {
1152 	BYTE * pStart = pOut;
1153 	while ( *pWord )
1154 	{
1155 		if ( (*pWord)&0x80 )
1156 		{
1157 			if ( ((*pWord)&0xFC)==0xC0 )
1158 			{
1159 				*pOut++ = ( pWord[1] & 0x7F ) + ( ( pWord[0] & 3 )<<6 );
1160 				pWord += 2;
1161 			} else
1162 				return 0;
1163 		} else
1164 			*pOut++ = *pWord++;
1165 	}
1166 
1167 	*pOut = '\0';
1168 	return (int)( pOut-pStart );
1169 }
1170 
IsGermanAlphaUtf8(const BYTE * pWord)1171 static inline bool IsGermanAlphaUtf8 ( const BYTE * pWord )
1172 {
1173 	// letters, windows-1252, utf-8
1174 	// A..Z, trivial
1175 	if ( pWord[0]>0x40 && pWord[0]<0x5b )
1176 		return true;
1177 
1178 	// a..z, also trivial
1179 	if ( pWord[0]>0x60 && pWord[0]<0x7b )
1180 		return true;
1181 
1182 	// mu, 0xb5
1183 	if ( pWord[0]==0xC2 && pWord[1]==0xB5 )
1184 		return true;
1185 
1186 	// some upper
1187 	if ( pWord[0]==0xC3 )
1188 	{
1189 		if ( pWord[1]==0X9F ) // ss umlaut
1190 			return true;
1191 		switch ( pWord[1] | 0x20 )
1192 		{
1193 			case 0xA2: // umlauts
1194 			case 0xA4:
1195 			case 0xA7:
1196 			case 0xA8:
1197 			case 0xA9:
1198 			case 0xAa:
1199 			case 0xB1:
1200 			case 0xB4:
1201 			case 0xB6:
1202 			case 0xBb:
1203 			case 0xBc:
1204 				return true;
1205 		}
1206 	}
1207 	return false;
1208 }
1209 
Win1251ToLowercaseUtf8(BYTE * pOut,const BYTE * pWord)1210 static inline void Win1251ToLowercaseUtf8 ( BYTE * pOut, const BYTE * pWord )
1211 {
1212 	while ( *pWord )
1213 	{
1214 		// a..p, E0..EF maps to D0 B0..D0 BF
1215 		// r..ya, F0..FF maps to D1 80..D1 8F
1216 		// yo maps to D1 91
1217 		if ( *pWord>=0xC0 )
1218 		{
1219 			BYTE iCh = ( *pWord | 0x20 ); // lowercase
1220 			BYTE iF = ( iCh>>4 ) & 1; // 0xE? or 0xF? value
1221 			*pOut++ = 0xD0 + iF;
1222 			*pOut++ = iCh - 0x30 - ( iF<<6 );
1223 		} else if ( *pWord==0xA8 || *pWord==0xB8 )
1224 		{
1225 			*pOut++ = 0xD1;
1226 			*pOut++ = 0x91;
1227 		} else
1228 			assert ( false );
1229 		pWord++;
1230 	}
1231 	*pOut++ = '\0';
1232 }
1233 
Win1252ToLowercaseUtf8(BYTE * pOut,const BYTE * pWord)1234 static inline void Win1252ToLowercaseUtf8 ( BYTE * pOut, const BYTE * pWord )
1235 {
1236 	while ( *pWord )
1237 	{
1238 		if ( !((*pWord)&0x80) )
1239 			*pOut++ = *pWord | 0x20;
1240 		else
1241 		{
1242 			*pOut++ = 0xC0 | ((*pWord)>>6);
1243 			*pOut++ = 0x80 | ((*pWord)&0x3F);
1244 		}
1245 		++pWord;
1246 	}
1247 	*pOut++ = '\0';
1248 }
1249 
sphAotLemmatizeRuUTF8(BYTE * pWord)1250 void sphAotLemmatizeRuUTF8 ( BYTE * pWord )
1251 {
1252 	// i must be initialized
1253 	assert ( g_pLemmatizers[AOT_RU] );
1254 
1255 	// only if the word is russian
1256 	if ( !IsRussianAlphaUtf8(pWord) )
1257 		return;
1258 
1259 	// convert to Windows-1251
1260 	// failure means we should not lemmatize this
1261 	BYTE sBuf [ SPH_MAX_WORD_LEN+4 ];
1262 	if ( !Utf8ToWin1251 ( sBuf, pWord ) )
1263 		return;
1264 
1265 	// lemmatize, convert back, done!
1266 	sphAotLemmatizeRu1251 ( sBuf );
1267 	Win1251ToLowercaseUtf8 ( pWord, sBuf );
1268 }
1269 
sphAotLemmatizeDeUTF8(BYTE * pWord)1270 void sphAotLemmatizeDeUTF8 ( BYTE * pWord )
1271 {
1272 	// i must be initialized
1273 	assert ( g_pLemmatizers[AOT_DE] );
1274 
1275 	// only if the word is german
1276 	if ( !IsGermanAlphaUtf8(pWord) )
1277 		return;
1278 
1279 	// convert to Windows-1252
1280 	// failure means we should not lemmatize this
1281 	BYTE sBuf [ SPH_MAX_WORD_LEN+4 ];
1282 	if ( !Utf8ToWin1252 ( sBuf, pWord ) )
1283 		return;
1284 
1285 	// lemmatize, convert back, done!
1286 	sphAotLemmatizeDe1252 ( sBuf );
1287 	Win1252ToLowercaseUtf8 ( pWord, sBuf );
1288 }
1289 
/// lemmatize a Russian UTF-8 word and append every candidate normal form
/// (back in UTF-8) to dLemmas; duplicates are removed at the end
void sphAotLemmatizeRu ( CSphVector<CSphString> & dLemmas, const BYTE * pWord )
{
	assert ( g_pLemmatizers[AOT_RU] );
	if ( !IsRussianAlphaUtf8(pWord) )
		return;

	// transcode to windows-1251; a zero length means non-Russian letters inside
	BYTE sForm [ SPH_MAX_WORD_LEN+4 ];
	int iFormLen = 0;
	iFormLen = Utf8ToWin1251 ( sForm, pWord );

	// NOTE(review): the iFormLen<3 test below bails out on ALL 1..2-char
	// words, not just the frequent ones, unlike sphAotLemmatize() which only
	// skips 1-char words; looks intentional but worth confirming
	if ( iFormLen<2 || IsRuFreq2(sForm) )
		return;
	if ( iFormLen<3 || IsRuFreq3(sForm) )
		return;

	DWORD FindResults[12]; // max results is like 6
	bool bFound = g_pLemmatizers[AOT_RU]->LemmatizeWord ( (BYTE*)sForm, FindResults );
	if ( FindResults[0]==AOT_NOFORM )
		return;

	// emit each morphological hypothesis as a separate lemma
	for ( int i=0; FindResults[i]!=AOT_NOFORM; i++ )
	{
		// decode the packed result into a flexia model and an item within it
		const CFlexiaModel & M = g_pLemmatizers[AOT_RU]->m_FlexiaModels [ AOT_MODEL_NO ( FindResults[i] ) ];
		const CMorphForm & F = M [ AOT_ITEM_NO ( FindResults[i] ) ];

		BYTE sRes [ 3*SPH_MAX_WORD_LEN+4 ];

		CreateLemma<EMIT_UTF8RU> ( sRes, sForm, iFormLen, bFound, M, F );
		dLemmas.Add ( (const char*)sRes );
	}

	// OPTIMIZE?
	dLemmas.Uniq();
}
1324 
sphAotLemmatizeDe(CSphVector<CSphString> & dLemmas,const BYTE * pWord)1325 void sphAotLemmatizeDe ( CSphVector<CSphString> & dLemmas, const BYTE * pWord )
1326 {
1327 	assert ( g_pLemmatizers[AOT_DE] );
1328 	if ( !IsGermanAlphaUtf8(pWord) )
1329 		return;
1330 
1331 	BYTE sForm [ SPH_MAX_WORD_LEN+4 ];
1332 	int iFormLen = 0;
1333 	iFormLen = Utf8ToWin1252 ( sForm, pWord );
1334 
1335 	if ( iFormLen<=1 )
1336 		return;
1337 
1338 	if ( IsDeFreq2(sForm) || IsDeFreq3(sForm) )
1339 		return;
1340 
1341 	DWORD FindResults[12]; // max results is like 6
1342 	bool bFound = g_pLemmatizers[AOT_DE]->LemmatizeWord ( (BYTE*)sForm, FindResults );
1343 	if ( FindResults[0]==AOT_NOFORM )
1344 		return;
1345 
1346 	for ( int i=0; FindResults[i]!=AOT_NOFORM; i++ )
1347 	{
1348 		const CFlexiaModel & M = g_pLemmatizers[AOT_DE]->m_FlexiaModels [ AOT_MODEL_NO ( FindResults[i] ) ];
1349 		const CMorphForm & F = M [ AOT_ITEM_NO ( FindResults[i] ) ];
1350 
1351 		BYTE sRes [ 3*SPH_MAX_WORD_LEN+4 ];
1352 
1353 		CreateLemma<EMIT_UTF8> ( sRes, sForm, iFormLen, bFound, M, F );
1354 		dLemmas.Add ( (const char*)sRes );
1355 	}
1356 
1357 	// OPTIMIZE?
1358 	dLemmas.Uniq();
1359 }
1360 
1361 // generic lemmatize for other languages
sphAotLemmatize(CSphVector<CSphString> & dLemmas,const BYTE * pWord,int iLang)1362 void sphAotLemmatize ( CSphVector<CSphString> & dLemmas, const BYTE * pWord, int iLang )
1363 {
1364 	assert ( iLang!=AOT_RU ); // must be processed by the specialized function
1365 	assert ( g_pLemmatizers[iLang] );
1366 
1367 	if ( !IsAlphaAscii(*pWord) )
1368 		return;
1369 
1370 	BYTE sForm [ SPH_MAX_WORD_LEN+4 ];
1371 	int iFormLen = 0;
1372 
1373 	while ( *pWord )
1374 		sForm [ iFormLen++ ] = *pWord++;
1375 	sForm [ iFormLen ] = '\0';
1376 
1377 	if ( iFormLen<=1 )
1378 		return;
1379 
1380 	if ( iLang==AOT_EN && ( IsEnFreq2(sForm) || IsEnFreq3(sForm) ) )
1381 		return;
1382 
1383 	if ( iLang==AOT_DE && ( IsDeFreq2(sForm) || IsDeFreq3(sForm) ) )
1384 		return;
1385 
1386 	DWORD FindResults[12]; // max results is like 6
1387 	bool bFound = g_pLemmatizers[iLang]->LemmatizeWord ( (BYTE*)sForm, FindResults );
1388 	if ( FindResults[0]==AOT_NOFORM )
1389 		return;
1390 
1391 	for ( int i=0; FindResults[i]!=AOT_NOFORM; i++ )
1392 	{
1393 		const CFlexiaModel & M = g_pLemmatizers[iLang]->m_FlexiaModels [ AOT_MODEL_NO ( FindResults[i] ) ];
1394 		const CMorphForm & F = M [ AOT_ITEM_NO ( FindResults[i] ) ];
1395 
1396 		BYTE sRes [ 3*SPH_MAX_WORD_LEN+4 ];
1397 		CreateLemma<EMIT_1BYTE> ( sRes, sForm, iFormLen, bFound, M, F );
1398 
1399 		dLemmas.Add ( (const char*)sRes );
1400 	}
1401 
1402 	// OPTIMIZE?
1403 	dLemmas.Uniq();
1404 }
1405 
1406 
/// returns the per-language dictionary descriptor loaded for iLang
const CSphNamedInt & sphAotDictinfo ( int iLang )
{
	return g_tDictinfos[iLang];
}
1411 
1412 //////////////////////////////////////////////////////////////////////////
1413 
1414 /// token filter for AOT morphology indexing
1415 /// AOT may return multiple (!) morphological hypotheses for a single token
1416 /// we return such additional hypotheses as blended tokens
/// token filter for AOT morphology indexing
/// AOT may return multiple (!) morphological hypotheses for a single token
/// we return such additional hypotheses as blended tokens
class CSphAotTokenizerTmpl : public CSphTokenFilter
{
protected:
	BYTE		m_sForm [ SPH_MAX_WORD_LEN*3+4 ];	///< aka MAX_KEYWORD_BYTES
	int			m_iFormLen;							///< in bytes, but in windows-1251 that is characters, too
	bool		m_bFound;							///< found or predicted?
	DWORD		m_FindResults[12];					///< max results is like 6
	int			m_iCurrent;							///< index in m_FindResults that was just returned, -1 means no blending
	BYTE		m_sToken [ SPH_MAX_WORD_LEN*3+4 ];	///< to hold generated lemmas
	BYTE		m_sOrigToken [ SPH_MAX_WORD_LEN*3+4 ];	///< to hold original token
	bool		m_bIndexExact;						///< also emit the original (exact) form of lemmatized tokens?

	const CSphWordforms *	m_pWordforms;			///< taken over from the dict (see ctor); not deleted here

public:
	/// wraps pTok; takes over wordforms handling from pDict, if one is given
	/// (iLang is only consumed by the debug-build assert)
	CSphAotTokenizerTmpl ( ISphTokenizer * pTok, CSphDict * pDict, bool bIndexExact, int DEBUGARG(iLang) )
		: CSphTokenFilter ( pTok )
	{
		assert ( pTok );
		assert ( g_pLemmatizers[iLang] );
		m_iCurrent = -1;
		m_FindResults[0] = AOT_NOFORM;
		m_pWordforms = NULL;
		if ( pDict )
		{
			// tricky bit
			// one does not simply take over the wordforms from the dict
			// that would break saving of the (embedded) wordforms data
			// but as this filter replaces wordforms
			m_pWordforms = pDict->GetWordforms();
			pDict->DisableWordforms();
		}
		m_bIndexExact = bIndexExact;
	}

	/// plain pass-through to the wrapped tokenizer
	void SetBuffer ( const BYTE * sBuffer, int iLength )
	{
		m_pTokenizer->SetBuffer ( sBuffer, iLength );
	}

	/// a token stays blended while more lemmas (or the original form) are pending
	bool TokenIsBlended() const
	{
		return m_iCurrent>=0 || m_pTokenizer->TokenIsBlended();
	}

	/// mix our own settings (wordforms pointer, index_exact flag) into the hash
	uint64_t GetSettingsFNV () const
	{
		uint64_t uHash = CSphTokenFilter::GetSettingsFNV();
		uHash ^= (uint64_t)m_pWordforms;
		DWORD uFlags = m_bIndexExact ? 1 : 0;
		uHash = sphFNV64 ( &uFlags, sizeof(uFlags), uHash );
		return uHash;
	}
};
1471 
/// Russian (windows-1251 based) AOT morphology token filter
class CSphAotTokenizerRu : public CSphAotTokenizerTmpl
{
public:
	CSphAotTokenizerRu ( ISphTokenizer * pTok, CSphDict * pDict, bool bIndexExact )
		: CSphAotTokenizerTmpl ( pTok, pDict, bIndexExact, AOT_RU )
	{}

	ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const
	{
		// this token filter must NOT be created as escaped
		// it must only be used during indexing time, NEVER in searching time
		assert ( eMode==SPH_CLONE_INDEX );
		CSphAotTokenizerRu * pClone = new CSphAotTokenizerRu ( m_pTokenizer->Clone ( eMode ), NULL, m_bIndexExact );
		if ( m_pWordforms )
			pClone->m_pWordforms = m_pWordforms;
		return pClone;
	}

	/// returns the first lemma of the next word; additional lemmas (and,
	/// with index_exact_words, the original form) follow as blended tokens
	BYTE * GetToken()
	{
		m_eTokenMorph = SPH_TOKEN_MORPH_RAW;

		// any pending lemmas left?
		if ( m_iCurrent>=0 )
		{
			++m_iCurrent;
			assert ( m_FindResults[m_iCurrent]!=AOT_NOFORM );

			// return original token
			if ( m_FindResults[m_iCurrent]==AOT_ORIGFORM )
			{
				// the original form is always scheduled last
				assert ( m_FindResults[m_iCurrent+1]==AOT_NOFORM );
				strncpy ( (char*)m_sToken, (char*)m_sOrigToken, sizeof(m_sToken) );
				m_iCurrent = -1;
				m_eTokenMorph = SPH_TOKEN_MORPH_ORIGINAL;
				return m_sToken;
			}

			// generate that lemma
			const CFlexiaModel & M = g_pLemmatizers[AOT_RU]->m_FlexiaModels [ AOT_MODEL_NO ( m_FindResults [ m_iCurrent ] ) ];
			const CMorphForm & F = M [ AOT_ITEM_NO ( m_FindResults [ m_iCurrent ] ) ];
			CreateLemma<EMIT_UTF8RU> ( m_sToken, m_sForm, m_iFormLen, m_bFound, M, F );

			// is this the last one? gotta tag it non-blended
			if ( m_FindResults [ m_iCurrent+1 ]==AOT_NOFORM )
				m_iCurrent = -1;

			// post-morphology wordform replacement, if any
			if ( m_pWordforms && m_pWordforms->m_bHavePostMorphNF )
				m_pWordforms->ToNormalForm ( m_sToken, false );

			m_eTokenMorph = SPH_TOKEN_MORPH_GUESS;
			return m_sToken;
		}

		// ok, time to work on a next word
		assert ( m_iCurrent<0 );
		BYTE * pToken = m_pTokenizer->GetToken();
		if ( !pToken )
			return NULL;

		// pass-through blended parts
		if ( m_pTokenizer->TokenIsBlended() )
			return pToken;

		// pass-through matched wordforms
		if ( m_pWordforms && m_pWordforms->ToNormalForm ( pToken, true ) )
			return pToken;

		// pass-through 1-char "words"
		if ( pToken[1]=='\0' )
			return pToken;

		// pass-through non-Russian words
		if ( !IsRussianAlphaUtf8 ( pToken ) )
			return pToken;

		// convert or copy regular tokens
		m_iFormLen = Utf8ToWin1251 ( m_sForm, pToken );

		// do nothing with one-char words
		// (a zero m_iFormLen, ie. failed transcode, also lands here)
		if ( m_iFormLen<=1 )
			return pToken;

		// handle a few most frequent 2-char, 3-char pass-through words
		// OPTIMIZE? move up?
		if ( IsRuFreq2 ( m_sForm ) || IsRuFreq3 ( m_sForm ) )
			return pToken;

		// lemmatize
		m_bFound = g_pLemmatizers[AOT_RU]->LemmatizeWord ( m_sForm, m_FindResults );
		if ( m_FindResults[0]==AOT_NOFORM )
		{
			assert ( m_iCurrent<0 );
			return pToken;
		}

		// schedule original form for return, if needed
		if ( m_bIndexExact )
		{
			// append AOT_ORIGFORM after the last real result
			int i = 1;
			while ( m_FindResults[i]!=AOT_NOFORM )
				i++;
			m_FindResults[i] = AOT_ORIGFORM;
			m_FindResults[i+1] = AOT_NOFORM;
			strncpy ( (char*)m_sOrigToken, (char*)pToken, sizeof(m_sOrigToken) );
		}

		// in any event, prepare the first lemma for return
		const CFlexiaModel & M = g_pLemmatizers[AOT_RU]->m_FlexiaModels [ AOT_MODEL_NO ( m_FindResults[0] ) ];
		const CMorphForm & F = M [ AOT_ITEM_NO ( m_FindResults[0] ) ];
		CreateLemma<EMIT_UTF8RU> ( pToken, m_sForm, m_iFormLen, m_bFound, M, F );

		// schedule lemmas 2+ for return
		if ( m_FindResults[1]!=AOT_NOFORM )
			m_iCurrent = 0;

		// suddenly, post-morphology wordforms
		if ( m_pWordforms && m_pWordforms->m_bHavePostMorphNF )
			m_pWordforms->ToNormalForm ( pToken, false );

		m_eTokenMorph = SPH_TOKEN_MORPH_GUESS;
		return pToken;
	}
};
1596 
/// generic (German/English) AOT morphology token filter
class CSphAotTokenizer : public CSphAotTokenizerTmpl
{
	AOT_LANGS		m_iLang;	///< the target language (not AOT_RU; see CSphAotTokenizerRu)
public:
	CSphAotTokenizer ( ISphTokenizer * pTok, CSphDict * pDict, bool bIndexExact, int iLang )
		: CSphAotTokenizerTmpl ( pTok, pDict, bIndexExact, iLang )
		, m_iLang ( AOT_LANGS(iLang) )
	{}

	ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const
	{
		// this token filter must NOT be created as escaped
		// it must only be used during indexing time, NEVER in searching time
		assert ( eMode==SPH_CLONE_INDEX );
		CSphAotTokenizer * pClone = new CSphAotTokenizer ( m_pTokenizer->Clone ( eMode ), NULL, m_bIndexExact, m_iLang );
		if ( m_pWordforms )
			pClone->m_pWordforms = m_pWordforms;
		return pClone;
	}

	/// returns the first lemma of the next word; additional lemmas (and,
	/// with index_exact_words, the original form) follow as blended tokens
	BYTE * GetToken()
	{
		m_eTokenMorph = SPH_TOKEN_MORPH_RAW;

		// any pending lemmas left?
		if ( m_iCurrent>=0 )
		{
			++m_iCurrent;
			assert ( m_FindResults[m_iCurrent]!=AOT_NOFORM );

			// return original token
			if ( m_FindResults[m_iCurrent]==AOT_ORIGFORM )
			{
				// the original form is always scheduled last
				assert ( m_FindResults[m_iCurrent+1]==AOT_NOFORM );
				strncpy ( (char*)m_sToken, (char*)m_sOrigToken, sizeof(m_sToken) );
				m_iCurrent = -1;
				m_eTokenMorph = SPH_TOKEN_MORPH_ORIGINAL;
				return m_sToken;
			}

			// generate that lemma
			const CFlexiaModel & M = g_pLemmatizers[m_iLang]->m_FlexiaModels [ AOT_MODEL_NO ( m_FindResults [ m_iCurrent ] ) ];
			const CMorphForm & F = M [ AOT_ITEM_NO ( m_FindResults [ m_iCurrent ] ) ];
			CreateLemma<EMIT_UTF8> ( m_sToken, m_sForm, m_iFormLen, m_bFound, M, F );

			// is this the last one? gotta tag it non-blended
			if ( m_FindResults [ m_iCurrent+1 ]==AOT_NOFORM )
				m_iCurrent = -1;

			// post-morphology wordform replacement, if any
			if ( m_pWordforms && m_pWordforms->m_bHavePostMorphNF )
				m_pWordforms->ToNormalForm ( m_sToken, false );

			m_eTokenMorph = SPH_TOKEN_MORPH_GUESS;
			return m_sToken;
		}

		// ok, time to work on a next word
		assert ( m_iCurrent<0 );
		BYTE * pToken = m_pTokenizer->GetToken();
		if ( !pToken )
			return NULL;

		// pass-through blended parts
		if ( m_pTokenizer->TokenIsBlended() )
			return pToken;

		// pass-through matched wordforms
		if ( m_pWordforms && m_pWordforms->ToNormalForm ( pToken, true ) )
			return pToken;

		// pass-through 1-char "words"
		if ( pToken[1]=='\0' )
			return pToken;

		// pass-through words outside the target language's alphabet
		// NOTE(review): the non-DE branch uses IsGermanAlpha1252() too,
		// presumably because it also accepts plain ASCII latin letters
		// (enough for English) -- confirm
		if ( m_iLang==AOT_DE )
		{
			if ( !IsGermanAlphaUtf8 ( pToken ) )
				return pToken;
		} else
		{
			if ( !IsGermanAlpha1252 ( pToken[0] ) )
				return pToken;
		}

		// convert or copy regular tokens
		if ( m_iLang==AOT_DE )
			m_iFormLen = Utf8ToWin1252 ( m_sForm, pToken );
		else
		{
			// manual strlen and memcpy; faster this way
			BYTE * p = pToken;
			m_iFormLen = 0;
			while ( *p )
				m_sForm [ m_iFormLen++ ] = *p++;
			m_sForm [ m_iFormLen ] = '\0';
		}

		// do nothing with one-char words
		// (a zero m_iFormLen, ie. failed transcode, also lands here)
		if ( m_iFormLen<=1 )
			return pToken;

		// handle a few most frequent 2-char, 3-char pass-through words
		// OPTIMIZE? move up?
		if ( ( m_iLang==AOT_DE && ( IsDeFreq2 ( m_sForm ) || IsDeFreq3 ( m_sForm ) ) )
			|| ( m_iLang==AOT_EN && ( IsEnFreq2 ( m_sForm ) || IsEnFreq3 ( m_sForm ) ) ) )
			return pToken;

		// lemmatize
		m_bFound = g_pLemmatizers[m_iLang]->LemmatizeWord ( m_sForm, m_FindResults );
		if ( m_FindResults[0]==AOT_NOFORM )
		{
			assert ( m_iCurrent<0 );
			return pToken;
		}

		// schedule original form for return, if needed
		if ( m_bIndexExact )
		{
			// append AOT_ORIGFORM after the last real result
			int i = 1;
			while ( m_FindResults[i]!=AOT_NOFORM )
				i++;
			m_FindResults[i] = AOT_ORIGFORM;
			m_FindResults[i+1] = AOT_NOFORM;
			strncpy ( (char*)m_sOrigToken, (char*)pToken, sizeof(m_sOrigToken) );
		}

		// in any event, prepare the first lemma for return
		const CFlexiaModel & M = g_pLemmatizers[m_iLang]->m_FlexiaModels [ AOT_MODEL_NO ( m_FindResults[0] ) ];
		const CMorphForm & F = M [ AOT_ITEM_NO ( m_FindResults[0] ) ];
		CreateLemma<EMIT_UTF8> ( pToken, m_sForm, m_iFormLen, m_bFound, M, F );

		// schedule lemmas 2+ for return
		if ( m_FindResults[1]!=AOT_NOFORM )
			m_iCurrent = 0;

		// suddenly, post-morphology wordforms
		if ( m_pWordforms && m_pWordforms->m_bHavePostMorphNF )
			m_pWordforms->ToNormalForm ( pToken, false );

		m_eTokenMorph = SPH_TOKEN_MORPH_GUESS;
		return pToken;
	}
};
1741 
1742 
sphAotCreateFilter(ISphTokenizer * pTokenizer,CSphDict * pDict,bool bIndexExact,DWORD uLangMask)1743 CSphTokenFilter * sphAotCreateFilter ( ISphTokenizer * pTokenizer, CSphDict * pDict, bool bIndexExact, DWORD uLangMask )
1744 {
1745 	assert ( uLangMask!=0 );
1746 	CSphTokenFilter * pDerivedTokenizer = NULL;
1747 	for ( int i=AOT_BEGIN; i<AOT_LENGTH; ++i )
1748 	{
1749 		if ( uLangMask & (1UL<<i) )
1750 		{
1751 			if ( i==AOT_RU )
1752 				pDerivedTokenizer = new CSphAotTokenizerRu ( pTokenizer, pDict, bIndexExact );
1753 			else
1754 				pDerivedTokenizer = new CSphAotTokenizer ( pTokenizer, pDict, bIndexExact, i );
1755 			pTokenizer = pDerivedTokenizer;
1756 		}
1757 	}
1758 	return pDerivedTokenizer;
1759 }
1760 
1761 
sphAotShutdown()1762 void sphAotShutdown ()
1763 {
1764 	for ( int i=0; i<AOT_LENGTH; i++ )
1765 		SafeDelete ( g_pLemmatizers[i] );
1766 }
1767 
1768 //
1769 // $Id$
1770 //
1771