1 #   include	"indConfig.h"
2 
3 #   include	<stdlib.h>
4 
5 #   include	<uniUtf8.h>
6 #   include	"indlocal.h"
7 #   include	<appDebugon.h>
8 
9 /************************************************************************/
10 /*  Allocate a new node. In Theory, it might be advisable to look for	*/
11 /*  a hole in the array of nodes. As in the current version of the	*/
12 /*  software, no nodes are really freed. this does not really make	*/
13 /*  sense.								*/
14 /************************************************************************/
15 
indTNmake(IND * ind)16 int indTNmake( IND *	ind )
17     {
18     TrieNode *	node;
19 
20     if  ( ind->indNodeCount >= ind->indAllocatedNodes )
21 	{
22 	int		newm= ( ind->indAllocatedNodes+ TNsBLOCK )/ TNsBLOCK;
23 	unsigned	sz= newm* sizeof(TrieNode *);
24 	TrieNode **	fresh;
25 	TrieNode *	nwpg;
26 
27 	if  ( ind->indNodePages )
28 	    { fresh= (TrieNode **) realloc( (char *)ind->indNodePages, sz );}
29 	else{ fresh= (TrieNode **) malloc( sz );				}
30 	if  ( ! fresh )
31 	    { return -1;	}
32 	ind->indNodePages= fresh;
33 	nwpg= (TrieNode *)malloc( TNsBLOCK* sizeof( TrieNode ) );
34 	if  ( ! nwpg )
35 	    { return -1;	}
36 	ind->indNodePages[newm-1]= nwpg;
37 	ind->indAllocatedNodes= newm* TNsBLOCK;
38 	}
39 
40     node= NODE(ind,ind->indNodeCount);
41     node->tn_transitions= -1;
42     node->tn_ntrans= 0;
43     node->tn_flags= TNfUSED;
44     node->tn_unused= 0;
45 
46     /*appDebug( "NODE %7d of %7d\n", ind->indNodeCount, ind->indAllocatedNodes );*/
47     return ind->indNodeCount++;
48     }
49 
indTNfree(IND * ind,int tn)50 void indTNfree(	IND *	ind,
51 		int	tn )
52     {
53     TrieNode *	node= NODE(ind,tn);
54 
55     if  ( node->tn_transitions >= 0 )
56 	{ indTLfree( ind, node->tn_transitions ); node->tn_transitions= -1; }
57     node->tn_flags= 0;
58     node->tn_ntrans= 0;
59 
60     while( ind->indNodeCount > 0)
61 	{
62 	node= NODE(ind,ind->indNodeCount-1);
63 
64 	if  ( node->tn_flags == 0	)
65 	    { ind->indNodeCount--;	}
66 	else{ break;		}
67 	}
68     }
69 
70 /************************************************************************/
71 /*									*/
72 /*  Follow one transition in the automaton.				*/
73 /*									*/
74 /*  Return:	The index of the transition from this node in *pTrans.	*/
75 /*		If there is no transition for the input symbol, *pTrans	*/
76 /*		is set to the candidate insertion point.		*/
77 /*									*/
78 /*  Return:	The number in the automaton of the next state (If any)	*/
79 /*  Return:	-1 if there is no state for this value.			*/
80 /*									*/
81 /************************************************************************/
82 
indINDstep(int * pTrans,IND * ind,int tn,int sym)83 int indINDstep(	int *				pTrans,
84 		IND *				ind,
85 		int				tn,
86 		int				sym )
87     {
88     TrieNode *	node;
89     int		l, r;
90     int		transitions;
91 
92     node= NODE(ind,tn);
93 
94     l= 0;
95     r= node->tn_ntrans;
96     *pTrans= ( l+ r )/2;
97     transitions= node->tn_transitions;
98 
99     if  ( r <= 0 )
100 	{ return -1;	}
101 
102     while( l < *pTrans )
103 	{
104 	if  ( sym < LINK(ind,transitions+*pTrans)->tl_key )
105 	    { r= *pTrans;	}
106 	else{ l= *pTrans;	}
107 	*pTrans= ( l+ r )/2;
108 	}
109 
110     if  ( sym != LINK(ind,transitions+*pTrans)->tl_key )
111 	{ return -1;	}
112 
113     return LINK(ind,transitions+*pTrans)->tl_to;
114     }
115 
116 /************************************************************************/
117 /*									*/
118 /*  Retrieve a word in an automaton.					*/
119 /*									*/
120 /************************************************************************/
121 
indINDgetUtf8(int * paccept,IND * ind,int tn,const char * key)122 int indINDgetUtf8(	int *		paccept,
123 			IND *		ind,
124 			int		tn,
125 			const char *	key )
126     {
127     int		m;
128 
129     if  ( tn < 0 || tn >= ind->indNodeCount )
130 	{ LLDEB(tn,ind->indNodeCount); return -1;	}
131 
132     for (;;)
133 	{
134 	int		step;
135 	unsigned short	symbol;
136 
137 	if  ( ! * key )
138 	    {
139 	    if  ( NODE(ind,tn)->tn_flags & TNfACCEPTS )
140 		{ *paccept= 1;	}
141 	    else{ *paccept= 0;	}
142 
143 	    return tn;
144 	    }
145 
146 	step= uniGetUtf8( &symbol, key );
147 	if  ( step < 1 )
148 	    { LDEB(step); return -1;	}
149 	key += step;
150 
151 	tn= indINDstep( &m, ind, tn, symbol );
152 
153 	if  ( tn < 0 )
154 	    { return tn;	}
155 	}
156     }
157 
indINDgetUtf16(int * paccept,IND * ind,int tn,const unsigned short * key)158 int indINDgetUtf16(	int *			paccept,
159 			IND *			ind,
160 			int			tn,
161 			const unsigned short *	key )
162     {
163     int		m;
164 
165     if  ( tn < 0 || tn >= ind->indNodeCount )
166 	{ LLDEB(tn,ind->indNodeCount); return -1;	}
167 
168     for (;;)
169 	{
170 	unsigned short	symbol;
171 
172 	if  ( ! * key )
173 	    {
174 	    if  ( NODE(ind,tn)->tn_flags & TNfACCEPTS )
175 		{ *paccept= 1;	}
176 	    else{ *paccept= 0;	}
177 
178 	    return tn;
179 	    }
180 
181 	symbol= *key;
182 	key += 1;
183 
184 	tn= indINDstep( &m, ind, tn, symbol );
185 
186 	if  ( tn < 0 )
187 	    { return tn;	}
188 	}
189     }
190 
191 /************************************************************************/
192 /*									*/
193 /*  Remove a word from the automaton.					*/
194 /*  Simply remove the 'accepts' flag from its final node.		*/
195 /*									*/
196 /*  This is not correct in a minimized automaton.			*/
197 /*  This is not correct if the automaton is used to find out whether	*/
198 /*  strings are a prefix of accepted strings.				*/
199 /*									*/
200 /************************************************************************/
201 
indINDforget(IND * ind,const char * key)202 int indINDforget(	IND *		ind,
203 			const char *	key )
204     {
205     int		accepted;
206     int		tn= indINDgetUtf8( &accepted, ind, ind->ind_start, key );
207     TrieNode *	node;
208 
209     if  ( tn < 0 || ! accepted )
210 	{ return -1;	}
211 
212     node= NODE(ind,tn);
213 
214     node->tn_flags &= ~TNfACCEPTS;
215 
216     return 0;
217     }
218 
219 /************************************************************************/
220 
indAddLinkToNode(IND * ind,TrieNode * node,int m,int symbol,int to)221 static int indAddLinkToNode(		IND *		ind,
222 					TrieNode *	node,
223 					int		m,
224 					int		symbol,
225 					int		to )
226     {
227     int		transitions= node->tn_transitions;
228 
229     if  ( node->tn_flags & TNfREAD_ONLY )
230 	{ XXDEB(node->tn_flags,TNfREAD_ONLY); return -1;	}
231 
232     if  ( node->tn_ntrans > 0 )
233 	{
234 	int	r;
235 
236 	transitions= indTLalloc( ind, transitions, node->tn_ntrans+ 1 );
237 	if  ( transitions < 0 )
238 	    { LLDEB(node->tn_ntrans,transitions); return -1;	}
239 	else{ node->tn_transitions= transitions;		}
240 
241 	if  ( symbol > LINK(ind,transitions+m)->tl_key )
242 	    { m++; }
243 	r= node->tn_ntrans++;
244 	while( r > m )
245 	    {
246 	    *LINK(ind,transitions+r)= *LINK(ind,transitions+r-1);
247 	    r--;
248 	    }
249 	LINK(ind,transitions+m)->tl_to= to;
250 	LINK(ind,transitions+m)->tl_key= symbol;
251 	}
252     else{
253 	node->tn_transitions= transitions= indTLalloc( ind, -1, 1 );
254 	if  ( transitions < 0 )
255 	    { LDEB(transitions); return -1;	}
256 
257 	node->tn_ntrans= 1;
258 	LINK(ind,transitions)->tl_to= to;
259 	LINK(ind,transitions)->tl_key= symbol;
260 	m= 0;
261 	}
262 
263     return transitions+ m;
264     }
265 
indAddNodeToNode(IND * ind,TrieNode * node,int m,int symbol)266 static int indAddNodeToNode(		IND *		ind,
267 					TrieNode *	node,
268 					int		m,
269 					int		symbol )
270     {
271     int		to;
272 
273     if  ( node->tn_flags & TNfREAD_ONLY )
274 	{ XXDEB(node->tn_flags,TNfREAD_ONLY); return -1;	}
275 
276     to= indTNmake( ind );
277     if  ( to < 0 )
278 	{ LDEB(to); return -1;	}
279 
280     if  ( indAddLinkToNode( ind, node, m, symbol, to ) < 0 )
281 	{ LLDEB(to,symbol); return -1;	}
282 
283     return to;
284     }
285 
286 /************************************************************************/
287 /*									*/
288 /*  Store a word in the automaton.					*/
289 /*									*/
290 /************************************************************************/
291 
indINDputUtf8(IND * ind,int tn,const char * key)292 int indINDputUtf8(	IND *		ind,
293 			int		tn,
294 			const char *	key )
295     {
296     for (;;)
297 	{
298 	TrieNode *	node;
299 	int		m;
300 
301 	int		step;
302 	unsigned short	symbol;
303 
304 	if  ( tn < 0 || tn >= ind->indNodeCount )
305 	    { LLDEB(tn,ind->indNodeCount); return -1; }
306 	node= NODE(ind,tn);
307 
308 	/*indTLwalk(ind);*/
309 
310 	if  ( ! * key )
311 	    {
312 	    if  ( node->tn_flags & TNfREAD_ONLY )
313 		{ XXDEB(node->tn_flags,TNfREAD_ONLY); return -1;	}
314 
315 	    node->tn_flags |= TNfACCEPTS;
316 	    return tn;
317 	    }
318 
319 	step= uniGetUtf8( &symbol, key );
320 	if  ( step < 1 )
321 	    { LDEB(step); return -1;	}
322 	key += step;
323 
324 	tn= indINDstep( &m, ind, tn, symbol );
325 	if  ( tn < 0 )
326 	    {
327 	    tn= indAddNodeToNode( ind, node, m, symbol );
328 	    if  ( tn < 0 )
329 		{ LDEB(tn); return -1;	}
330 	    }
331 	}
332     }
333 
indINDputUtf16(IND * ind,int tn,const unsigned short * key)334 int indINDputUtf16(	IND *			ind,
335 			int			tn,
336 			const unsigned short *	key )
337     {
338     for (;;)
339 	{
340 	TrieNode *	node;
341 	int		m;
342 
343 	unsigned short	symbol;
344 
345 	if  ( tn < 0 || tn >= ind->indNodeCount )
346 	    { LLDEB(tn,ind->indNodeCount); return -1; }
347 	node= NODE(ind,tn);
348 
349 	/*indTLwalk(ind);*/
350 
351 	if  ( ! * key )
352 	    {
353 	    if  ( node->tn_flags & TNfREAD_ONLY )
354 		{ XXDEB(node->tn_flags,TNfREAD_ONLY); return -1;	}
355 
356 	    node->tn_flags |= TNfACCEPTS;
357 	    return tn;
358 	    }
359 
360 	symbol= *key;
361 	key += 1;
362 
363 	tn= indINDstep( &m, ind, tn, symbol );
364 	if  ( tn < 0 )
365 	    {
366 	    tn= indAddNodeToNode( ind, node, m, symbol );
367 	    if  ( tn < 0 )
368 		{ LDEB(tn); return -1;	}
369 	    }
370 	}
371     }
372 
373 
374 /************************************************************************/
375 /*									*/
376 /*  Do something for all nodes in a trie.				*/
377 /*  NOTE: The approach is naive and loops on cyclic structures.		*/
378 /*									*/
379 /************************************************************************/
380 
indINDforall(IND * ind,int tn,void * through,IndForAllFun fun)381 int indINDforall(	IND *		ind,
382 			int		tn,
383 			void *		through,
384 			IndForAllFun	fun )
385     {
386     const TrieNode *		frNode;
387     int				i;
388     int				n;
389 
390     if  ( tn < 0 )
391 	{ return 0;	}
392     frNode= NODE(ind,tn);
393     n= frNode->tn_ntrans;
394 
395     for ( i= 0; i < n; i++ )
396 	{
397 	const TrieLink *	toLink= LINK(ind,frNode->tn_transitions+i);
398 	const TrieNode *	toNode= NODE(ind,toLink->tl_to);
399 
400 	if  ( (*fun)( through, +1, tn, toLink->tl_to, toLink->tl_key,
401 				(toNode->tn_flags & TNfACCEPTS) != 0 ) < 0 )
402 	    { LLLDEB(tn,toLink->tl_to,toLink->tl_key); return -1;	}
403 
404 	if  ( indINDforall( ind, toLink->tl_to, through, fun ) < 0 )
405 	    { LDEB(1); return -1;	}
406 
407 	if  ( (*fun)( through, -1, tn, toLink->tl_to, toLink->tl_key,
408 				(toNode->tn_flags & TNfACCEPTS) != 0 ) < 0 )
409 	    { LLLDEB(tn,toLink->tl_to,toLink->tl_key); return -1;	}
410 	}
411 
412     return 0;
413     }
414 
415 /************************************************************************/
416 /*									*/
417 /*  Add a (static) suffix to a node.					*/
418 /*									*/
419 /************************************************************************/
420 
indINDaddSuffix(IND * ind,int tnTo,int tnSuf)421 int indINDaddSuffix(		IND *		ind,
422 				int		tnTo,
423 				int		tnSuf )
424     {
425     TrieNode *		nodeTo= NODE(ind,tnTo);
426     const TrieNode *	nodeSuf= NODE(ind,tnSuf);
427 
428     int			m;
429 
430     int			i;
431 
432     if  ( nodeTo->tn_flags & TNfREAD_ONLY )
433 	{ XXDEB(nodeTo->tn_flags,TNfREAD_ONLY); return -1;	}
434     if  ( nodeSuf->tn_flags & TNfACCEPTS )
435 	{ nodeTo->tn_flags |= TNfACCEPTS;	}
436 
437     i= nodeSuf->tn_ntrans;
438     while( --i >= 0 )
439 	{
440 	const TrieLink *	linkSuf= LINK(ind,nodeSuf->tn_transitions+i);
441 
442 	tnSuf= linkSuf->tl_to;
443 
444 	tnTo= indINDstep( &m, ind, tnTo, linkSuf->tl_key );
445 	if  ( tnTo < 0 )
446 	    {
447 	    /*  Connect suffix here */
448 	    int	tlTo;
449 
450 	    tlTo= indAddLinkToNode( ind, nodeTo, m, linkSuf->tl_key, tnSuf );
451 	    if  ( tlTo < 0 )
452 		{ LDEB(tlTo); return -1;	}
453 
454 	    continue; /* Ready with this branch */
455 	    }
456 
457 	if  ( i > 0 )
458 	    {
459 	    if  ( indINDaddSuffix( ind, tnTo, tnSuf ) < 0 )
460 		{ LLDEB(tnTo,tnSuf);	}
461 	    }
462 	else{
463 	    /* Avoid chain recursion stay in same stack frame */
464 	    nodeTo= NODE(ind,tnTo);
465 	    nodeSuf= NODE(ind,tnSuf);
466 	    i= nodeSuf->tn_ntrans;
467 
468 	    if  ( nodeTo->tn_flags & TNfREAD_ONLY )
469 		{ XXDEB(nodeTo->tn_flags,TNfREAD_ONLY); return -1;	}
470 	    if  ( nodeSuf->tn_flags & TNfACCEPTS )
471 		{ nodeTo->tn_flags |= TNfACCEPTS;	}
472 	    }
473 	}
474 
475     return 0;
476     }
477