1 /* analyze.c
2 
3 
4 This file contains the routines for finding the rules that
5 best fit the input address and assigns each element of the
6 input to the appropriate output field. The process is
7 essentially one of pattern-matching. The Aho-Corasick algorithm
8 is used to match rules that map input symbols found by the tokenizer
9 to output symbols. In the general case a clause tree is built left to
10 right, matching rules of a particular class, depending on the state.
11 
12 Prototype 7H08 (This file was written by Walter Sinclair).
13 
14 Copyright (c) 2009 Walter Bruce Sinclair
15 
16 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
17 
18 The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
19 
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 
22 */
23 
24 /* For pagc-0.3.0 : last revised 2010-11-18 */
25 
26 //#define OCCUPANCY_DEBUG
27 #define USE_FORCE_MACRO
28 
29 #include <stdlib.h>
30 #include <stdio.h>
31 #include <stddef.h>
32 #include <string.h>
33 #include "pagc_api.h"
34 
35 
36 /* ------------------------------------------------------------
37 A lookup string with a particular standardization is prevented
38    from becoming associated with a particular output symbol
39 ------------------------------------------------------------- */
40 typedef struct def_blocker
41 {
42    char *lookup ;
43    char *standard ;
44    SYMB output_symbol ;
45    DEF *definition ;
46 } DEF_BLOCKER ;
47 
48 #define NUM_DEF_BLOCKERS 2
49 
50 /* ---------------------------------------------------------------
51 When adding to this list, increment NUM_DEF_BLOCKERS for each new
52    entry. This list blocks the use of the lookup string (first entry)
53    as the standardization (second entry) as an output symbol (third)
54    binding to the definition (fourth entry). The fourth entry is
55    added at initialization after the lexicon is read into memory.
56    Thus ST is blocked as STREET as a pretype. This occurs if the
57    rule attempts to move ST (as SAINT) left from STREET into PRETYP.
58 ---------------------------------------------------------------- */
59 static DEF_BLOCKER __def_block_table__[NUM_DEF_BLOCKERS] =
60 {
61    {"ST", "STREET", PRETYP, NULL } ,
62    {"ST", "STREET", CITY, NULL }
63 } ;
64 
65 /* -- local prototypes -- */
66 
67 static int check_def_block( STAND_PARAM * , int ) ;
68 static void delete_stz( STZ_PARAM * , int ) ;
69 static int delete_duplicate_stz( STZ_PARAM * , int ) ;
70 static void first_composition( STAND_PARAM * ) ;
71 static int prepare_target_pattern( STAND_PARAM * ) ;
72 static int no_break( STAND_PARAM *__stand_param__ , int ) ;
73 static int do_left_combine( STAND_PARAM * , int , int ) ;
74 static int need_compression( STAND_PARAM *, SYMB , int , int  ) ;
75 static int select_next_composition( STAND_PARAM * ) ;
76 static int copy_best( STAND_PARAM * , int * , SYMB , int , SYMB * ) ;
77 static void save_current_composition( STAND_PARAM * , SEG * , int , SYMB * , DEF ** ) ;
78 static void scan_clause_tree(  STAND_PARAM * , int, int ) ;
79 static void shallow_clause_scan( STAND_PARAM * , int , int ) ;
80 static void deposit_stz( STAND_PARAM *, double , int ) ;
81 static STZ *copy_stz( STAND_PARAM * , double ) ;
82 static void make_singleton( SEG * , SYMB , int , int , double ) ;
83 static int lex_has_def( STAND_PARAM * , int , SYMB ) ;
84 static void _force_deposit_( STAND_PARAM * , int ) ;
85 static int have_schema_symbol( int * , SYMB ) ;
86 static void default_seg_val( int * , int , SEG * , int , SYMB , double ) ;
87 static int _modify_position_( STAND_PARAM *, SEG * , int , int , SYMB , SYMB ) ;
88 static int schema_modify_position( STAND_PARAM * , SEG * , int , int , SYMB , SYMB ) ;
89 static void force_arc_clause( STAND_PARAM * ) ;
90 #ifdef USE_FORCE_MACRO
91 static void _force_macro_clause_( STAND_PARAM * ) ;
92 #endif
93 static int non_geocode_address( STAND_PARAM * ) ;
94 static int evaluate_micro_l(STAND_PARAM *) ;
95 
/* -- Guide to the transition table:
         MACRO_C MICRO_C ARC_C   CIVIC_C EXTRA_C
MICRO_B  FAIL    FAIL    EXIT    FAIL    FAIL
MICRO_M  FAIL    EXIT    PREFIX  FAIL    MICRO_M
MACRO    EXIT    FAIL    FAIL    FAIL    FAIL
PREFIX   FAIL    FAIL    FAIL    EXIT    FAIL
EXIT     FAIL    FAIL    FAIL    FAIL    EXIT

-- */
105 
106 static int __tran_table__[MAX_CL][MAX_CL] = {
107    { FAIL, FAIL, EXIT, FAIL, FAIL } ,
108    { FAIL, EXIT, PREFIX, FAIL, MICRO_M } ,
109    { EXIT, FAIL, FAIL, FAIL, FAIL } ,
110    { FAIL, FAIL, FAIL, EXIT, FAIL } ,
111    { FAIL, FAIL, FAIL, FAIL, EXIT }
112 } ;
113 
114 /* -- skew weights for each rule class -- */
115 static double __weight_table__[MAX_CL] =
116 {
117   1.0, 0.95, 0.95, 0.8 , 0.85
118 } ;
119 
120 
/* First position in the target symbol array. */
#define TARG_START 0
/* Index of the first (best) standardization on the stz list. */
#define FIRST_STZ 0
/* Initial value given to stz_list_cutoff by evaluator. */
#define INITIAL_STZ_CUTOFF 0.05
#define VERY_LOW_WEIGHT 0.15
#define START_DEPTH 0
126 
127 static double __load_value__[ NUMBER_OF_WEIGHTS ] =
128 {
129    0.00, 0.325, 0.35 , 0.375 , 0.4 ,
130    0.475 , 0.55, 0.6 , 0.65 , 0.675 ,
131    0.7 , 0.75 , 0.8 , 0.825 , 0.85 ,
132    0.9 , 0.95 , 1.00
133 } ;
134 
#ifdef OCCUPANCY_DEBUG
/* Printable rule class names; compiled only for occupancy debugging. */
static const char *__rule_type_names__[] =
{
   "MACRO" ,
   "MICRO" ,
   "ARC" ,
   "CIVIC" ,
   "EXTRA"
} ;
#endif
141 
142 
143 /* ====================================================================
144 analyze.c (install_def_block_table)
145 process level initialization - called by standard.l (init_stand_process)
146 calls lexicon.c (find_entry)
147 returns FALSE if error encountered.
148 string.h (strcmp)
149 uses macro RET_ERR1, LOG_MESS, CLIENT_ERR
150 =======================================================================*/
/*
 * Resolve the definition pointer of every row of __def_block_table__.
 *
 * For each row, the row's lookup string is searched with find_entry and the
 * resulting entry's definition list is walked until a definition whose
 * Standard text equals the row's standard string is found. That definition
 * is stored in the row.
 *
 * __hash_table__ : lexicon hash table handed to find_entry
 * __err_param__  : error context handed to RET_ERR1
 *
 * Returns TRUE when every row was resolved. A missing lexicon entry or a
 * missing matching definition is reported through RET_ERR1 with FALSE as
 * the error value.
 */
int install_def_block_table( ENTRY **__hash_table__, ERR_PARAM *__err_param__ )
{
	int i ;
	for ( i = 0 ; i < NUM_DEF_BLOCKERS ; i++ )
	{
		DEF *__standard_def__ ;
		ENTRY *__lookup_entry__ = find_entry( __hash_table__ , __def_block_table__[i].lookup ) ;
		if ( __lookup_entry__ == NULL )
		{
			RET_ERR1( "install_def_block_table: Could not find def_block for %s\n", __def_block_table__[i].lookup , __err_param__ , FALSE ) ;
		}
		for ( __standard_def__ = __lookup_entry__->DefList ; __standard_def__ != NULL ; __standard_def__ = __standard_def__->Next )
		{
			if ( strcmp( __standard_def__->Standard , __def_block_table__[i].standard ) == 0 )
			{
				__def_block_table__[i].definition = __standard_def__ ;
				/* Stop only once a match is found. Leaving the loop
				   unconditionally would examine just the first definition
				   in the list and miss a match further along. */
				break ;
			}
		}
		if ( __def_block_table__[i].definition == NULL )
		{
			RET_ERR1( "install_def_block_table: Could not find def_block definition for %s\n" , __def_block_table__[i].standard , __err_param__ , FALSE ) ;
		}
	}
	return TRUE ;
}
177 
178 /* ====================================================================
179 analyze.c (create_segments)
180 context level initialization -- must come after the lexicon
181       is read - called by init_stand_context
182 Null on error.
183 =======================================================================*/
create_segments(ERR_PARAM * __err_param__)184 STZ_PARAM *create_segments( ERR_PARAM *__err_param__ )
185 {
186 	STZ_PARAM *__stz_info__ ;
187 	int i ;
188 	/* -- we're going to be re-sorting these pointers -- */
189     PAGC_ALLOC_STRUC(__stz_info__,STZ_PARAM,__err_param__,NULL) ;
190 	PAGC_CALLOC_STRUC(__stz_info__->stz_array,STZ *,MAX_STZ,__err_param__,NULL) ;
191 	for ( i = FIRST_STZ ; i < MAX_STZ ; i++ )
192 	{
193 		PAGC_ALLOC_STRUC(__stz_info__->stz_array[i],STZ,__err_param__,NULL) ;
194 	}
195 	PAGC_CALLOC_STRUC(__stz_info__->segs,SEG,MAXLEX,__err_param__,NULL) ;
196 	return __stz_info__ ;
197 }
198 
199 /* ====================================================================
200 analyze.c (destroy_segments)
201 context level cleanup
202 - called by (standard.l) close_stand_context
203 uses macros PAGC_DESTROY_2D_ARRAY, FREE_AND_NULL
204 =======================================================================*/
/*
 * Release a STZ_PARAM built by create_segments: the stz array (through
 * PAGC_DESTROY_2D_ARRAY), the segs block and the structure itself.
 * A NULL argument is accepted and ignored.
 */
void destroy_segments( STZ_PARAM *__stz_info__ )
{
	if ( __stz_info__ != NULL )
	{
		PAGC_DESTROY_2D_ARRAY( __stz_info__->stz_array , STZ,MAX_STZ )
		FREE_AND_NULL( __stz_info__->segs ) ;
		FREE_AND_NULL( __stz_info__ ) ;
	}
}
215 /* ====================================================================
216 analyze.c (get_stz_downgrade)
217 2008-03-13 : lower grade standardizations should not produce the same
218 matching score as higher grade. This can be critical when a lower grade
219 standardization produces a perfect match on the wrong reference record
220 =======================================================================*/
/*
 * Ratio of the score of standardization request_stz to the score of the
 * first standardization on the list.
 *
 * Returns 0 when request_stz lies beyond the end of the list, 1.0 when
 * request_stz is 0, the (zero) first score when that score is zero, and
 * score[request_stz] / score[0] otherwise.
 */
double get_stz_downgrade( STAND_PARAM *__stand_param__ , int request_stz )
{
	STZ_PARAM *stz = __stand_param__->stz_info ;
	double best_score ;

	/* nothing on the list at that position */
	if ( request_stz > ( stz->stz_list_size - 1 ) )
	{
		return 0. ;
	}
	/* the first entry is never downgraded */
	if ( request_stz == 0 )
	{
		return 1.0 ;
	}
	best_score = stz->stz_array[0]->score ;
	/* avoid dividing by zero */
	if ( best_score == 0. )
	{
		return best_score ;
	}
	return ( stz->stz_array[request_stz]->score / best_score ) ;
}
240 
241 /* ====================================================================
242 analyze.c (get_next_stz)
243 called by analyze.c (evaluator) , build.c (Build)
244   build.c (transform_rows) match.c (match_records),
245 calls analyze.c (check_def_block, delete_duplicate_stz)
246 export.c (init_output_fields, stuff_fields)
247 <remarks>
248       return FALSE if the requested stz is not there - this allows
249       termination to a request loop when there are fewer than the maximum
250       number on the list and also reports, on request of 0, that none were
251       found. If the request_stz is the same as the last one done (since
252       evaluator last initialized the last_stz_output variable) we just
253       return rather than redo the same work. When matching we need
254       to know the correct standardization for positioning the point along
255       the arc
256       2008-04-06 : This function needs to return to the 0 stz when selecting
257       the best standardization for the build. To indicate that an override
258       is required, we'll take FAIL as a proxy for 0.
259 </remarks>
260 =======================================================================*/
/*
 * Load standardization request_stz_in into best_defs / best_output.
 *
 * request_stz_in : index of the wanted standardization, or FAIL to force a
 *                  reload of the first one
 *
 * Returns FALSE when the request lies beyond the list, repeats the last
 * one output, or when removing blocked / duplicate entries leaves nothing
 * at that index. Otherwise copies the definitions and output symbols of
 * the selected standardization, terminates both arrays, records the index
 * in last_stz_output and returns TRUE. The output fields are rebuilt for
 * every request other than a plain request for the first entry.
 */
int get_next_stz( STAND_PARAM *__stand_param__ , int request_stz_in )
{
	STZ_PARAM *stz_info = __stand_param__->stz_info ;
	DEF **out_defs = __stand_param__->best_defs ;
	SYMB *out_symbols = __stand_param__->best_output ;
	int lex_count = __stand_param__->LexNum ;
	int request_stz ;
	int pos ;
	STZ *chosen ;

	if ( request_stz_in == FAIL )
	{
		/*-- FAIL stands in for an unconditional request for the first --*/
		request_stz = FIRST_STZ ;
	}
	else
	{
		request_stz = request_stz_in ;
		/*-- Past the end of the list : nothing to deliver --*/
		if ( ( stz_info->stz_list_size - 1 ) < request_stz )
		{
			return FALSE ;
		}
		/*-- Same as the last one output : don't redo the work --*/
		if ( stz_info->last_stz_output == request_stz )
		{
			return FALSE ;
		}
		/*-- Drop entries holding a blocked definition ; each deletion moves
			the next candidate into the requested slot --*/
		for ( ; ; )
		{
			if ( !check_def_block( __stand_param__ , request_stz ) )
			{
				break ;
			}
			if ( stz_info->stz_list_size <= request_stz )
			{
				break ;
			}
		}
		if ( stz_info->stz_list_size == request_stz )
		{
			return FALSE ;
		}
		/*-- For lower scoring candidates, keep only the first of any set
			of identical standardizations --*/
		if ( request_stz > FIRST_STZ )
		{
			for ( ; ; )
			{
				if ( !delete_duplicate_stz( stz_info , request_stz ) )
				{
					break ;
				}
				if ( stz_info->stz_list_size <= request_stz )
				{
					break ;
				}
			}
			if ( stz_info->stz_list_size == request_stz )
			{
				return FALSE ;
			}
		}
	}

	/*-- Copy definitions and output symbols from the selected stz --*/
	chosen = stz_info->stz_array[request_stz] ;
	pos = FIRST_LEX_POS ;
	while ( pos < lex_count )
	{
		out_defs[pos] = chosen->definitions[pos] ;
		out_symbols[pos] = chosen->output[pos] ;
		pos++ ;
	}
	/*-- Terminate both arrays just past the last copied position --*/
	out_defs[pos] = NULL ;
	out_symbols[pos] = FAIL ;

	if ( request_stz_in == FAIL || request_stz > FIRST_STZ )
	{
		/*-- LEFT : just MICRO here --*/
		init_output_fields( __stand_param__ , LEFT ) ;
		stuff_fields( __stand_param__ ) ;
	}
	stz_info->last_stz_output = request_stz ;
	return TRUE ;
}
326 
327 /* ====================================================================
328 analyze.c (check_def_block)
329 called by analyze.c (get_next_stz)
330 calls analyze.c (delete_stz)
331 =======================================================================*/
/*
 * Delete standardization request_stz if, at any lexeme position, it binds
 * a blocked definition to the output symbol it is blocked for.
 *
 * Returns TRUE when the standardization was deleted (via delete_stz),
 * FALSE when no row of __def_block_table__ applies.
 */
static int check_def_block( STAND_PARAM *__stand_param__ , int request_stz )
{
	STZ_PARAM *stz_info = __stand_param__->stz_info ;
	STZ *candidate = stz_info->stz_array[ request_stz ] ;
	int lex_count = __stand_param__->LexNum ;
	int pos ;

	for ( pos = FIRST_LEX_POS ; pos < lex_count ; pos++ )
	{
		int row ;
		for ( row = 0 ; row < NUM_DEF_BLOCKERS ; row++ )
		{
			DEF_BLOCKER *blocker = &__def_block_table__[row] ;
			/* both the output symbol and the definition must match */
			if (( candidate->output[pos] == blocker->output_symbol ) &&
				( candidate->definitions[pos] == blocker->definition ))
			{
				delete_stz( stz_info , request_stz ) ;
				return TRUE ;
			}
		}
	}
	return FALSE ;
}
355 
356 /* ====================================================================
357 analyze.c (delete_stz)
358 called by analyze.c (check_def_block), analyze.c (delete_duplicate_stz)
359 =======================================================================*/
/*
 * Remove the standardization at index request_stz from the active list.
 *
 * The list count is decremented first. If the removed entry was the last
 * active one it simply becomes inactive. Otherwise the following active
 * pointers are each moved down one slot and the removed pointer is parked
 * in the slot vacated at the end, so that it can be reused.
 *
 *   0       req       last        inactive      MAX_STZ - 1
 *  [ ] [ ] [ ] ...   [ ]   [ ] [ ] ...         [ ]
 */
static void delete_stz( STZ_PARAM *__stz_info__ , int request_stz )
{
	STZ **slots = __stz_info__->stz_array ;
	STZ *removed ;
	int last ;
	int k ;

	/* after the decrement, the count is the index of the slot that held
	   the last active pointer */
	__stz_info__->stz_list_size -- ;
	last = __stz_info__->stz_list_size ;

	if ( request_stz == last )
	{
		return ;
	}

	/* hold on to the pointer being removed */
	removed = slots[request_stz] ;
	k = request_stz ;
	while ( k < last )
	{
		slots[k] = slots[k + 1] ;
		k++ ;
	}
	/* park it, now inactive, in the vacated slot */
	slots[last] = removed ;
}
397 
398 /* ====================================================================
399 analyze.c (delete_duplicate_stz)
400 calls analyze.c (delete_stz)
401 called by analyze.c (get_next_stz)
402 =======================================================================*/
/*
 * Delete standardization request_stz if it is identical, in both output
 * symbols and definitions, to ANY earlier entry on the list.
 *
 * Returns TRUE when a duplicate was found and deleted (the rest of the
 * list moves down one via delete_stz), FALSE otherwise.
 *
 * Two points of the comparison are deliberate:
 *  - a definition mismatch only rules out the earlier entry currently
 *    being compared; the remaining earlier entries are still checked.
 *    Returning at the first mismatch would let a duplicate of a later
 *    entry survive.
 *  - the FAIL terminator is tested before the definitions are compared,
 *    so the definition slot at the terminator position, which holds no
 *    definition for an output symbol, never takes part in the comparison.
 */
static int delete_duplicate_stz(STZ_PARAM *__stz_info__, int request_stz)
{
	int i ;
	STZ **__stz_list__ = __stz_info__->stz_array ;
	for (i = FIRST_STZ; i < request_stz; i ++)
	{
		SYMB a ;
		SYMB *__cur_sym_ptr__ = __stz_list__[request_stz]->output ;
		DEF **__stz_definitions__ = __stz_list__[request_stz]->definitions ;
		SYMB *__prev_sym_ptr__ = __stz_list__[i]->output ;
		DEF **__prev_stz_definitions__ = __stz_list__[i]->definitions ;

		while (( a = *__prev_sym_ptr__++ ) == *__cur_sym_ptr__++ )
		{
			/*-- FAIL terminates output , so they're identical --*/
			if (a == FAIL)
			{
				delete_stz(__stz_info__, request_stz) ;
				return TRUE ;
			}
			/* -------------------------------------------------------------
				A differing definition, even if the output token is the same
				could lead to a different result : not a duplicate of this
				entry, go on to the next earlier one
			-------------------------------------------------------------- */
			if (*__prev_stz_definitions__++ != *__stz_definitions__++)
			{
				break ;
			}
		}
	}
	return FALSE ;
}
442 
443 /* ====================================================================
444 analyze.c (evaluate_micro_l)
445 called by evaluator
446 2009-08-09 : special routine for MICRO_L state : landmark words
447 <revision date='2012-07-22'> Keep track of start_state </revision>
448 =======================================================================*/
449 
/*
 * Evaluation for the FEAT_L , FEAT_T and FEAT_A start states.
 *
 * The start state selects a desired definition type and an output field.
 * Every definition of every lexeme is loaded into comp_lex_sym / def_array
 * and flagged when it has the desired type or is protected. Each
 * composition is then deposited with a segment value of EXCELLENT when all
 * of its selected definitions are flagged, LOW otherwise.
 *
 * Returns FALSE for any other start state, otherwise the result of
 * get_next_stz for the first standardization.
 */
static int evaluate_micro_l( STAND_PARAM *__stand_param__ )
{
	int eligible[MAXLEX][MAXDEF] ;
	int *target_of_lex = __stand_param__->orig_str_pos ;
	int *selection = __stand_param__->cur_sym_sel ;
	int *def_count = __stand_param__->def_cnt ;
	LEXEME *lexemes = __stand_param__->lex_vector ;
	int lex_count = __stand_param__->LexNum ;
	int start = __stand_param__->start_state ;
	int wanted_type , output_field , pos ;

	/* 2009-08-15 : use lexicon types */
	if ( start == FEAT_L )
	{
		wanted_type = 1 ;
		output_field = FEATNAME ;
	}
	else if ( start == FEAT_T )
	{
		wanted_type = 2 ;
		output_field = FEATTYPE ;
	}
	else if ( start == FEAT_A )
	{
		wanted_type = 1 ;
		output_field = FEATAREA ;
	}
	else
	{
		return FALSE ;
	}

	/* -- load the definitions of each lexeme and flag the usable ones -- */
	for ( pos = FIRST_LEX_POS ; pos < lex_count ; pos++ )
	{
		DEF *def = lexemes[pos].DefList ;
		int slot = 0 ;
		target_of_lex[pos] = pos ; /* no compression here */
		selection[pos] = 0 ; /* -- every lexeme starts at its first def -- */
		while ( def != NULL )
		{
			__stand_param__->comp_lex_sym[pos][slot] = def->Type ;
			__stand_param__->def_array[pos][slot] = def ;
			/* 2009-08-30 : filter out non-default non-desired */
			eligible[pos][slot] = (( def->Type == wanted_type ) || ( def->Protect )) ? TRUE : FALSE ;
			def = def->Next ;
			slot++ ;
		}
		def_count[pos] = slot ;
	}

	/*-- Go through all the compositions, grading each one --*/
	do
	{
		int all_eligible = TRUE ;
		double seg_score ;
		/* one unflagged def downgrades this composition */
		for ( pos = lex_count - 1 ; pos >= FIRST_LEX_POS ; pos-- )
		{
			if ( !eligible[pos][selection[pos]] )
			{
				all_eligible = FALSE ;
				break ;
			}
		}
		/* 2009-10-16 : accept other types */
		if ( all_eligible )
		{
			seg_score = EXCELLENT ;
		}
		else
		{
			seg_score = LOW ;
		}
		default_seg_val( selection , lex_count , __stand_param__->stz_info->segs , FALSE , output_field , seg_score ) ;
		_force_deposit_( __stand_param__ , ( lex_count - 1 )) ;
	} while ( select_next_composition( __stand_param__ )) ;

	return ( get_next_stz( __stand_param__ , FIRST_STZ )) ; /* -- in case nothing was found -- */
}
521 
522 
523 /* ====================================================================
524 analyze.c (evaluator)
525 called by standard.l (close_stand_field)
526 calls analyze.c (first_composition) , analyze.c (shallow_clause_scan) ,
527   analyze.c (scan_clause_tree) , analyze.c (select_next_composition) ,
528   analyze.c(force_arc_clause) , analyze.c (_force_macro_clause_) ,
529   analyze.c(non_geocode_address) , analyze.c (get_next_stz)
530 analyze.c (prepare_target_pattern)
531 <revision date='2006-11-02'> add STAND_PARAM arg and change calls </revision>
532 <revision date='2012-07-22'> Keep track of start_state </revision>
533 =======================================================================*/
/*
 * Build the list of standardizations for the tokenized input held in
 * __stand_param__ and load the first one.
 *
 * The stz list is reset, then every composition of lexeme definitions is
 * turned into a target pattern and scanned : a shallow scan for the MACRO ,
 * MICRO_B and EXTRA_STATE states, a clause tree scan otherwise. If the
 * resulting first score is too low, a clause is forced (MICRO_B , MACRO)
 * or, for MICRO_M, the whole cycle is repeated in the EXIT state when
 * non_geocode_address allows it. Start states above EXTRA_STATE are
 * handed to evaluate_micro_l.
 *
 * Returns the result of get_next_stz for the first standardization (or
 * of evaluate_micro_l).
 */
int evaluator(STAND_PARAM *__stand_param__)
{
	STZ_PARAM *stz = __stand_param__->stz_info ;
	int state = __stand_param__->start_state ;

	stz->stz_list_cutoff = INITIAL_STZ_CUTOFF ;
#ifdef OCCUPANCY_DEBUG
	if (state == EXTRA_STATE)
	{
		stz->stz_list_cutoff = 0.00 ;
	}
#endif
	stz->stz_list_size = FIRST_STZ ;
	stz->last_stz_output = FAIL ;

	/*-- <revision date='2009-08-09'> Special evaluation for landmarks </revision> --*/
	if (state > EXTRA_STATE)
	{
		return (evaluate_micro_l(__stand_param__)) ;
	}

	for ( ; ; )
	{
		first_composition(__stand_param__) ; /* 2007-08-09 */
		/* -- visit every possible composition -- */
		do
		{
			int target_len = prepare_target_pattern(__stand_param__) ;
			if (target_len == TARG_START)
			{
				/* empty target : go on to the next composition */
				continue ;
			}
			/* --------------------------------------------------------------
				MACRO , MICRO_B and EXTRA_STATE want a single segment only,
				so no clause tree is built for them.
			----------------------------------------------------------------*/
			if (state == MACRO)
			{
				shallow_clause_scan(__stand_param__, MACRO_C, target_len) ;
			}
			else if (state == MICRO_B)
			{
				shallow_clause_scan(__stand_param__, ARC_C, target_len) ;
			}
			else if (state == EXTRA_STATE)
			{
				/* -- 2008-04-19 : scan for occupancy only -- */
				shallow_clause_scan(__stand_param__, EXTRA_C, target_len) ;
			}
			else
			{
				scan_clause_tree(__stand_param__, state, target_len) ;
			}
			/* ----------------------------------------------------------------
				The list size is checked first : with an empty list the score
				in the first slot would belong to some previous result.
			----------------------------------------------------------------- */
			if ((stz->stz_list_size > FIRST_STZ) && (!__stand_param__->analyze_complete) && (stz->stz_array[FIRST_STZ]->score >= __load_value__[EXCELLENT]))
			{
				break ;
			}
		} while (select_next_composition(__stand_param__)) ;

		/* -- a good enough first result ends the evaluation -- */
		if ((stz->stz_list_size > FIRST_STZ) && (stz->stz_array[FIRST_STZ]->score >= __load_value__[1]))
		{
			break ;
		}
		/* -- otherwise force a segment -- */
		if (state == MICRO_B)
		{
			force_arc_clause(__stand_param__) ;
			break ;
		}
#ifdef USE_FORCE_MACRO
		if (state == MACRO)
		{
			_force_macro_clause_(__stand_param__) ;
			break ;
		}
#endif
		/* -- only MICRO_M gets a second pass, and only when
			non_geocode_address agrees -- */
		if ((state != MICRO_M) || (!non_geocode_address(__stand_param__)))
		{
			break ;
		}
		state = EXIT ;
	}
	return (get_next_stz(__stand_param__, FIRST_STZ)) ; /* -- in case nothing was found -- */
}
626 
627 
/* ====================================================================
<summary>
<function name='analyze.c (first_composition)'/>
<called-by> <functionref='analyze.c (evaluator)'/> </called-by>
<remarks> Called by evaluator to initialize __best_output__ and __sym_sel__ -
	also sets up lex_sym, save_defs and __num_defs__ from the
    definitions in the LexVector  </remarks>
</summary>
=======================================================================*/
/*
 * Set up the first composition : for every lexeme, select its first
 * definition, copy the type and pointer of each definition on its chain
 * into comp_lex_sym / def_array, and record the chain length in def_cnt.
 */
static void first_composition( STAND_PARAM *__stand_param__ )
{
	LEXEME *lexemes = __stand_param__->lex_vector ;
	int *selection = __stand_param__->cur_sym_sel ;
	int *def_count = __stand_param__->def_cnt ;
	int lex_count = __stand_param__->LexNum ;
	int pos ;

	for ( pos = FIRST_LEX_POS ; pos < lex_count ; pos++ )
	{
		DEF *def = lexemes[pos].DefList ;
		int slot = 0 ;

		selection[pos] = 0 ; /* -- every lexeme starts at its first def -- */
		/*-- walk the def chain, filling one slot per definition --*/
		while ( def != NULL )
		{
			__stand_param__->comp_lex_sym[pos][slot] = def->Type ;
			__stand_param__->def_array[pos][slot] = def ;
			def = def->Next ;
			slot++ ;
		}
		def_count[pos] = slot ;
	}
}
661 
662 /* ============================================================
663 analyze.c (prepare_target_pattern)
664 called by analyze.c (evaluator)
665 calls analyze.c (need_compression) gamma.c (refresh_transducer)
666 2006-10-31 : add STAND_PARAM parameter and change calls
667 ==============================================================*/
/*
 * Build the target symbol pattern for the current composition.
 *
 * The selected symbol of each lexeme is either compressed away (see
 * need_compression) or appended to the target, in which case the lexeme
 * position is associated with the target position it lands on. The target
 * is terminated with FAIL and, when it is not empty, the transducer
 * registry is refreshed for it.
 *
 * Returns the number of symbols placed in the target.
 */
static int prepare_target_pattern(STAND_PARAM *__stand_param__)
{
	int *selection = __stand_param__->cur_sym_sel ;
	SYMB *target = __stand_param__->target ;
	int *target_of_lex = __stand_param__->orig_str_pos ;
	int lex_count = __stand_param__->LexNum ;
	NODE **gamma = __stand_param__->rules->gamma_matrix ;
	int target_len = TARG_START ;
	int lex_pos ;

	for ( lex_pos = FIRST_LEX_POS ; lex_pos < lex_count ; lex_pos++ )
	{
		SYMB in_symb = __stand_param__->comp_lex_sym[lex_pos][selection[lex_pos]] ;
		/* ------------------------------------------------------------
			A compressed symbol is discarded and the target position
			does not advance : any combination of LEFT and RIGHT
			compression tokens (words and stopwords) counts as one word
		------------------------------------------------------------- */
		if ( need_compression( __stand_param__ , in_symb , lex_pos , target_len ))
		{
			continue ;
		}
		/*-- Not compressed : tie this lexeme to the target position and
			store the symbol there --*/
		target_of_lex[lex_pos] = target_len ;
		target[target_len] = in_symb ;
		target_len++ ;
	}
	/*-- Terminate the symbol list --*/
	target[target_len] = FAIL ;
	/*-- The target can be empty, e.g. a single symbol that is a stopword --*/
	if ( target_len > TARG_START )
	{
		/*-- Set up the Aho-Corasick registry of output links --*/
		refresh_transducer( __stand_param__->registry , target , gamma ) ;
	}
	return target_len ;
}
706 
707 /* ============================================================
708 analyze.c (no_break)
709 called by analyze.c (do_left_combine)
710 -- moved from tokenize.c to analyze.c
711 ==============================================================*/
/*
 * Tell whether lexeme n is followed by something other than a hard break.
 *
 * The Term value of the lexeme's last morpheme is examined :
 *   0 is no break, 1 is set for semicolons, tabs and commas, 2 for spaces.
 *
 * Returns FALSE when that value is 1, TRUE otherwise.
 */
static int no_break( STAND_PARAM *__stand_param__ , int n )
{
	int last_morph = __stand_param__->lex_vector[n].EndMorph ;
	if ( __stand_param__->morph_array[last_morph].Term == 1 )
	{
		return FALSE ;
	}
	return TRUE ;
}
720 
721 
722 /* ============================================================
723 analyze.c (do_left_combine)
724 calls analyze.c (no_break) called by analyze.c (need_compression)
725 ==============================================================*/
/*
 * Try to compress the symbol at lex_pos into the previous target symbol.
 *
 * This succeeds only when there is a previous target symbol, that symbol
 * is a LEFT_COMPRESS, and no hard break separates the previous lexeme
 * from this one. On success the lexeme is given the target position of
 * the previous symbol (needed later for decompression).
 *
 * Returns TRUE when the compression was done, FALSE otherwise ; the FALSE
 * cases are left to need_compression to deal with.
 */
static int do_left_combine( STAND_PARAM *__stand_param__ , int lex_pos , int target_pos )
{
	/*-- Nothing to the left to combine with --*/
	if ( target_pos == TARG_START )
	{
		return FALSE ;
	}
	/*-- Only a LEFT_COMPRESS can be combined with --*/
	if ( __stand_param__->target[target_pos - 1] != LEFT_COMPRESS )
	{
		return FALSE ;
	}
	/*-- A break in the lex sequence suggests the two words don't belong
		together --*/
	if ( !no_break( __stand_param__ , lex_pos - 1 ))
	{
		return FALSE ;
	}
	/*-- Share the target position of the previous symbol --*/
	__stand_param__->orig_str_pos[lex_pos] = target_pos - 1 ;
	return TRUE ;
}
749 
750 
751 /* ============================================================
752 analyze.c (need_compression)
753 called by analyze.c (prepare_target_pattern)
754 calls analyze.c (do_left_combine)
755 ==============================================================*/
/*
 * Decide whether symbol a at lex_pos is compressed out of the target.
 *
 * RIGHT_COMPRESS : always compressed. It joins the previous target symbol
 *    when do_left_combine allows ; otherwise it is tied to target_pos, the
 *    position the next accepted symbol will take.
 * LEFT_COMPRESS  : compressed only when do_left_combine succeeds.
 * anything else  : never compressed.
 *
 * Returns TRUE when the symbol is compressed (the caller then does not
 * advance target_pos), FALSE when it must be placed in the target.
 */
static int need_compression( STAND_PARAM *__stand_param__ , SYMB a , int lex_pos , int target_pos )
{
	if ( a == RIGHT_COMPRESS )
	{
		/*-- No stopwords are accepted, no matter what. Combine left if
			possible --*/
		if ( !do_left_combine( __stand_param__ , lex_pos , target_pos ))
		{
			/* ---------------------------------------------------------------
				Otherwise combine right by taking the next position. This
				lets a STOPWORD combine with TYPE or DIR tokens, which is
				wanted in cases like EL CAMINO RD -- a RIGHT_COMPRESS may
				stray into the wrong field -- deal with this when
				decompressing
			---------------------------------------------------------------- */
			__stand_param__->orig_str_pos[lex_pos] = target_pos ;
		}
		return TRUE ;
	}
	if ( a == LEFT_COMPRESS )
	{
		/*-- compress the WORD when it has something to combine with --*/
		return ( do_left_combine( __stand_param__ , lex_pos , target_pos )) ;
	}
	/* -----------------------------------------------------------------------
	everything that isn't a WORD must be accepted - words used in parsing
	are not combined : of two direction words, for instance, one may be
	part of a street name and the other a suffix direction.
	-------------------------------------------------------------------------*/
	return FALSE ;
}
789 
790 /*========================================================================
791 analyze.c (scan_clause_tree)
792 Called by analyze.c (Evaluator)
793 Calls analyze.c (deposit_stz)
794 2006-11-02 : add KW *** arg, change call to GetOutputLink to direct access
795 =========================================================================*/
scan_clause_tree(STAND_PARAM * __stand_param__,int start_state,int start_pos)796 static void scan_clause_tree(STAND_PARAM *__stand_param__,int start_state,int start_pos)
797 {
798 	int next_state = FAIL ;
799 
800 	RULE_PARAM *__rules__ = __stand_param__->rules ;
801 	KW ***__output_link__ = __rules__->output_link ;
802 	SEG *__segments__ = __stand_param__->stz_info->segs ;
803 	double sum = 0.00 ; /* -- running total for score calculation --*/
804 	int pos = start_pos ; /* -- one beyond the last symbol -- */
805 	int state = start_state ; /* --for the __tran_table__ -- */
806 	int depth = START_DEPTH ; /* --how deep in the clause tree -- */
807 	int cl = 0 ;
808 	KW *__keyw__ = NULL ;
809 
810 	while (TRUE)
811 	{
812         SEG *__outer_seg__ ;
813 		while (TRUE)
814 		{
815             SEG *__inner_seg__ ;
816 			if (__keyw__ == NULL)
817 			{
818 				/*-- when we're out of keys for this class, get next class --*/
819 				if (++cl == MAX_CL)
820 				{
821 					/* -- no more states to transition to, so go up clause tree
822 					- unless there's nowhere to go -- */
823 					if (depth == START_DEPTH) return ; /* -- the exit -- */
824 					depth -- ;
825 					break ;
826 				}
827 				if ((next_state = __tran_table__[state][cl]) == FAIL)
828 				{
829 					/*-- no transition, try next clause --*/
830 					continue ;
831 				}
832 				/*-- recall that the registry is shifted right one node to
833 				account for the node that corresponds to total failure --*/
834 				/*-- <revision date='2006-11-02'> Substitute for GetOutputLink </revision> --*/
835 				if ((__keyw__ = __output_link__[__stand_param__->registry[pos]][cl]) == NULL)
836 				{
837 					continue ;
838 				}
839 			} /* end of if keyword is NULL */
840 
841 			/* -- skip pointless rules -- */
842 			if ((__keyw__->Length == pos) && (next_state != EXIT))
843 			{
844 				__keyw__ = __keyw__->OutputNext ; /* -- the next key to check -- */
845 				continue ;
846 			}
847 			/* -- fill in this definition for output if it forms part of a
848             completed stz -- */
849 			__inner_seg__ = __segments__ + depth ;
850 			__inner_seg__->End = pos - 1 ; /* -- ordinal numb of last sym in target -- */
851 			__inner_seg__->Key = __keyw__ ;
852 			__inner_seg__->State = state ;
853 			__inner_seg__->Output = __keyw__->Output ;
854 			if (__rules__->collect_statistics)
855 			{
856 				__keyw__->hits ++ ;
857 				__rules__->total_key_hits ++ ;
858 			}
859 			/* -- running total in sum, segment total in Segment -- */
860 			sum += (__inner_seg__->Value = __load_value__[__keyw__->Weight] * __weight_table__[__keyw__->Type]) ;
861 			if ((__inner_seg__->Start = pos - (__keyw__->Length)) == 0)
862 			{
863 				/* -- all definitions have been matched: if this is a valid
864 				state, save the standardization , then head back up
865 				the tree -- */
866 				if (next_state == EXIT)
867 				{
868 					deposit_stz(__stand_param__,sum,depth) ;
869 				}
870 				/* -- keep the same cl,  state , depth and pos -- */
871 				sum -= __inner_seg__->Value ; /* -- restore the previous sum -- */
872 				__keyw__ = __keyw__->OutputNext ; /* -- and get the next rule on the
873                                               linked list -- */
874 				continue ;
875 			}
876 			/* -- begin a subtree at the new depth -- */
877 			pos = __inner_seg__->Start ;
878 			state = __tran_table__[state][cl] ;
879 			depth ++ ;
880 			cl = 0 ;
881 			__keyw__ = NULL ; /* -- new start -- */
882 		} /* -- end of inner loop -- */
883 		/* -- restore the previous state from the seg before overwrite -- */
884 		__outer_seg__ = __segments__ + depth ;
885 		state = __outer_seg__->State ;
886 		if (depth != START_DEPTH)
887 		{
888 			sum -= __outer_seg__->Value ;
889 			pos = __outer_seg__->End + 1 ;
890 		}
891 		else
892 		{
893 			sum = 0.00 ;
894 			pos = start_pos ;
895 		}
896 		__keyw__ = __outer_seg__->Key ;
897 		cl = __keyw__->Type ; /* -- the clause we were working on -- */
898 		__keyw__ = __keyw__->OutputNext ; /* -- the next key to check -- */
899 	} /* -- end of outer loop -- */
900 }
901 
902 /*========================================================================
903 analyze.c (shallow_clause_scan)
904 Called by analyze.c (evaluator)
905 Calls analyze.c (deposit_stz)
906 <remarks>Called by Evaluator to get a complete rule for this class. If we
907 		can't get a complete rule we don't want one at all. If no composition
908 		can up with one, force_standardization will activate</remarks>
909 2006-11-02 : add KW *** arg, change call to GetOutputLink to direct access
910 =========================================================================*/
shallow_clause_scan(STAND_PARAM * __stand_param__,int cl,int pos)911 static void shallow_clause_scan(STAND_PARAM *__stand_param__ , int cl, int pos)
912 {
913 	KW *__kw__ ;
914 
915 	RULE_PARAM *__rules__ = __stand_param__->rules ;
916 	KW ***__output_link__ = __rules__->output_link ;
917 	SEG * __seg__ = __stand_param__->stz_info->segs ;
918 	__seg__->End = pos - 1 ;
919 	__seg__->Start = 0 ;
920 	/*-- <revision date='2006-11-02'> Substitute for GetOutputLink </revision> --*/
921 	for (__kw__ = __output_link__[__stand_param__->registry[pos]][cl] ; __kw__ != NULL; __kw__ = __kw__->OutputNext)
922 	{
923 		/*-- once we get a short keyword, depart --*/
924 		if (__kw__->Length < pos) return ;
925 		/*-- fill in the rest of this definition for output if it forms part
926          of a completed stz --*/
927 		__seg__->Output = __kw__->Output ;
928 		if (__rules__->collect_statistics)
929 		{
930 			__seg__->Key = __kw__ ;
931 			__kw__->hits ++ ;
932 			__rules__->total_key_hits ++ ;
933 		}
934 #ifdef OCCUPANCY_DEBUG
935 		if (cl == EXTRA_C)
936 		{
937 			SYMB *__ol__ ;
938 			printf( "\nRule is type %d (%s)\n: " , __kw__->Type , __rule_type_names__[__kw__->Type] ) ;
939 			printf( "Input : " ) ;
940 			for ( __ol__ = __kw__->Input ; *__ol__ != FAIL ; __ol__++ )
941 			{
942 				printf( "|%d (%s)|", *__ol__ , in_symb_name( *__ol__ )) ;
943 			}
944 			printf("\nOutput: ") ;
945 			/*-- output the output symbols --*/
946 			for (__ol__ = __kw__->Output;*__ol__ != FAIL;__ol__++)
947 			{
948 				printf("|%d (%s)|",*__ol__,out_symb_name(*__ol__)) ;
949 			}
950 			printf ("\nrank %d ( %f)\n",__kw__->Weight,__load_value__[__kw__->Weight]) ;
951 		}
952 #endif
953 		/* -- don't skew weights with these start states - so the cutoff is
954          easier -- */
955 		deposit_stz(__stand_param__,__load_value__[__kw__->Weight],START_DEPTH) ;
956 	}
957 }
958 
959 /* ====================================================================
960 analyze.c (select_next_composition)
961 called by analyze.c (evaluator)
962 =======================================================================*/
select_next_composition(STAND_PARAM * __stand_param__)963 static int select_next_composition( STAND_PARAM *__stand_param__ )
964 {
965 	int pos ;
966 	int *__sym_sel__ = __stand_param__->cur_sym_sel ;
967 	int *__num_defs__ = __stand_param__->def_cnt ;
968 
969 	for ( pos = __stand_param__->LexNum - 1 ; pos >= FIRST_LEX_POS ; pos-- )
970 	{
971 		__sym_sel__[pos]++ ; /*-- Increase selector --*/
972 		if ( __sym_sel__[pos] < __num_defs__[pos] )
973 		{
974 			/*-- Not ready yet for turnover --*/
975 			return TRUE ;
976 		}
977 		__sym_sel__[pos] = 0 ; /*-- Reset selector --*/
978 	}
979 	return FALSE ;
980 }
981 
982 /* ====================================================================
983 <summary>
984 	<function name='analyze.c (make_singleton)'>
985 	<remarks> Called to make a segment with a putative single position output.
986 		Don't really need a KW. as long as copy_best knows how to handle
987 		it. </remarks>
988 	<called-by><functionref='analyze.c (default_seg_val)'/></called-by>
989 	<revision date='2009-08-09'> Eliminate cl arg to make_singleton. </revision>
990 </summary>
991 =======================================================================*/
make_singleton(SEG * __segments__,SYMB sym,int pos,int depth,double score)992 static void make_singleton( SEG *__segments__, SYMB sym , int pos, int depth, double score )
993 {
994 
995 	/*-- <remarks> Since the __segments__ go left to right and the positions go right to
996 		left, the depth and position will usually be different. </remarks> --*/
997 	SEG *__seg__ = __segments__ + depth ;
998 	__seg__->Start = pos ;
999 	__seg__->End = pos ;
1000 	__seg__->Value = score ;
1001 	__seg__->Output = NULL ;
1002 	__seg__->sub_sym = sym ;
1003 }
1004 
1005 /* ====================================================================
1006 analyze.c (deposit_stz)
1007 calls analyze.c (copy_stz, save_current_composition)
1008 called by analyze.c (_force_deposit_, shallow_clause_scan,scan_clause_tree)
1009 =======================================================================*/
deposit_stz(STAND_PARAM * __stand_param__,double sum,int depth)1010 static void deposit_stz( STAND_PARAM *__stand_param__ , double sum , int depth )
1011 {
1012 	STZ_PARAM * __stz_info__ = __stand_param__->stz_info ;
1013     STZ *__cur_stz__ ;
1014 
1015 	/*-- calculate the score here --*/
1016 	double cur_score = (sum / (double) (depth + 1)) ;
1017 
1018 	/*-- and apply the cutoff before doing all the work of putting it into
1019 		the list --*/
1020 	if ( cur_score < __stz_info__->stz_list_cutoff ) return ;
1021 
1022 	/*-- need the score to get the pointer, need the pointer to copy the
1023 		content --*/
1024 	__cur_stz__ = copy_stz( __stand_param__ , cur_score ) ;
1025 
1026 	/*-- Then add the content, once we have a pointer -- */
1027 	if (( __stand_param__->rules->collect_statistics ) && ( depth == START_DEPTH ))
1028 	{
1029 		SEG *__seg__ = __stz_info__->segs + START_DEPTH ;
1030 		if (__seg__->Key != NULL)
1031 		{
1032 			__cur_stz__->build_key = __seg__->Key ;
1033 		}
1034 	}
1035 	save_current_composition( __stand_param__ , __stz_info__->segs,depth , __cur_stz__->output , __cur_stz__-> definitions ) ;
1036 }
1037 
1038 #define DUP_DECREMENT .0025
1039 
1040 /* ====================================================================
1041 analyze.c (copy_stz)
1042 called by analyze.c (deposit_stz)
1043 =======================================================================*/
copy_stz(STAND_PARAM * __stand_param__,double current_score)1044 static STZ * copy_stz(STAND_PARAM *__stand_param__ ,double current_score)
1045 {
1046 	/* -- sort it into the list and knock the last one off the list
1047       if it is MAX_STZ -- */
1048 	/* -- Take the Score of the last remaining item as the new cutoff,
1049       if it is greater than the current cutoff -- */
1050 	int i ;
1051     int last_on_list ;
1052     STZ *__cur_stz__ ;
1053 
1054 
1055 	STZ_PARAM *__stz_info__ = __stand_param__->stz_info ;
1056 	STZ **__stz_list__ = __stz_info__->stz_array ;
1057 
1058 	/* -- Increase the list size only if it isn't full. If it is full, take
1059 		the score of the last on the list (which we're going to knock off the
1060 		list) as the new cutoff -- */
1061 
1062 	if (__stz_info__->stz_list_size != MAX_STZ)
1063 	{
1064 		__stz_info__->stz_list_size++ ;
1065 	}
1066 
1067 	/* -- Get the pointer of the last on the list if the list is full (to be
1068       knocked off, or one beyond the previous last item (with undefined
1069       content) if the list isn't full. -- */
1070 	last_on_list = __stz_info__->stz_list_size - 1 ;
1071 	__cur_stz__ = __stz_list__[last_on_list] ; /* -- implicitly discard contents -- */
1072 	__cur_stz__->score = current_score ;
1073 	__cur_stz__->raw_score = current_score ;
1074 
1075 	/*-- Initialize the output vector - but is this necessary ? --*/
1076 	for (i = FIRST_LEX_POS;i <= __stand_param__->LexNum;i++)
1077 	{
1078 		__cur_stz__->output[i] = FAIL ;
1079 	}
1080 	/* -- boundary condition : last-1   last
1081                                [ ]     [ ]
1082       suppose the last - 1 has a score less than the current score - then
1083         it isn't copied into last, so __cur_stz__ goes back into the slot
1084         from which it was just removed - nothing moves  -- */
1085 	for (i = last_on_list;i > FIRST_STZ;i --)
1086 	{
1087 		/* -- Get the next pointer on the list and move it back if it has a
1088          lesser score. Otherwise we put the pointer to the new stz in the
1089          present position -- */
1090 		STZ *__next_stz__ = __stz_list__[i-1] ;
1091 		if (current_score > __next_stz__->raw_score)
1092 		{
1093 			__stz_list__[i] = __next_stz__ ;
1094 		}
1095 		else
1096 		{
1097 			if (current_score == __next_stz__->raw_score)
1098 			{
1099 				/* -- 2008-03-14: first come, first served -- */
1100 				__cur_stz__->score = __next_stz__->score - DUP_DECREMENT ;
1101 			}
1102 			break ;
1103 		}
1104 	}
1105 	__stz_list__[i] = __cur_stz__ ;
1106 	if (__stz_info__->stz_list_size == MAX_STZ)
1107 	{
1108 		__stz_info__->stz_list_cutoff = __stz_list__[last_on_list]->score ;
1109 	}
1110 	return __cur_stz__ ; /* -- tell the caller where we put it -- */
1111 }
1112 
1113 /* ====================================================================
1114 analyze.c (save_current_composition)
1115 called by analyze.c (deposit_stz)
1116 calls analyze.c (copy_best)
1117 <remarks>called by deposit_stz to align the current standardization output
1118       symbols to the LEXEME input symbols - it depends on the correct
1119       LEXEMES being present and the __sym_sel__ reflecting the last composition.
1120       Consequently it must be done at the time of deposit </remarks>
1121 =======================================================================*/
save_current_composition(STAND_PARAM * __stand_param__,SEG * __segments__,int depth,SYMB * __best_output__,DEF ** __best_defs__)1122 static void save_current_composition(STAND_PARAM *__stand_param__,SEG *__segments__, int depth, SYMB *__best_output__ , DEF **__best_defs__)
1123 {
1124 
1125 	int lex_pos ;
1126 	SEG *__seg__ ;
1127 	int *__sym_sel__ = __stand_param__->cur_sym_sel ;
1128 
1129 	/*-- <remarks> Get the definitions selected from save_defs - needed for outputing
1130 		the lexemes. Different definitions may give a different
1131 		standardization for the same input - the letter W may be standardized
1132 		as W if a SINGLE or WEST if a DIRECT </remarks> --*/
1133 
1134 	/* -- use the whole target -- */
1135 	for ( lex_pos = FIRST_LEX_POS ; lex_pos < __stand_param__->LexNum ; lex_pos++ )
1136 	{
1137 		__best_defs__[lex_pos] = __stand_param__->def_array[lex_pos][__sym_sel__[lex_pos]] ;
1138 	}
1139 	__best_defs__[lex_pos] = NULL ;
1140 
1141 	/*-- <remarks> Segments go backwards (right to left) , but the content for
1142       each segment goes left to right </remarks> --*/
1143 
1144 	for ( __seg__ = __segments__ + depth, lex_pos = FIRST_LEX_POS ; __seg__ >= __segments__ ; __seg__-- )
1145 	{
1146 		SYMB *__sym_ptr__ ;
1147 		if (( __sym_ptr__ = __seg__->Output ) == NULL)
1148 		{
1149 			lex_pos = copy_best( __stand_param__ , __sym_sel__ , __seg__->sub_sym , lex_pos , __best_output__ ) ;
1150 			continue ;
1151 		}
1152 		for ( ; *__sym_ptr__ != FAIL ; __sym_ptr__ ++ )
1153 		{
1154 			lex_pos = copy_best( __stand_param__ , __sym_sel__ , *__sym_ptr__ , lex_pos , __best_output__ ) ;
1155 		}
1156    }
1157 }
1158 
1159 /* ====================================================================
1160 analyze.c (copy_best)
1161 called by analyze.c (save_current_composition)
1162 <remarks> Called by save_current_composition to decompress stopword and word
1163       sequences </remarks>
1164 =======================================================================*/
copy_best(STAND_PARAM * __stand_param__,int * __sym_sel__,SYMB output_symb,int beg,SYMB * __best_output__)1165 static int copy_best( STAND_PARAM *__stand_param__ , int *__sym_sel__ , SYMB output_symb , int beg , SYMB *__best_output__ )
1166 {
1167 	int lex_pos ;
1168 	int *__orig_pos__ = __stand_param__->orig_str_pos ;
1169 
1170 	/*-- <remarks> <code>orig_pos</code> has the (multiple) LEXEME positions to which the
1171       (single) output symbol corresponds - so we add that symbol to each of
1172       the positions </remarks> --*/
1173 
1174 	int next_target_pos = __orig_pos__[beg] + 1 ;
1175 	for ( lex_pos = beg ; __orig_pos__[lex_pos] < next_target_pos ; lex_pos ++ )
1176 	{
1177 		if ( lex_pos == __stand_param__->LexNum ) break ;
1178 
1179 		/*-- <remarks> Check for errant RIGHT_COMPRESS - put it back into STREET
1180 			if possible </remarks> --*/
1181 
1182 		if (( lex_pos > FIRST_LEX_POS ) && ( output_symb != STREET ) && ( __stand_param__->comp_lex_sym[lex_pos][__sym_sel__[lex_pos]] == RIGHT_COMPRESS ) && ( __best_output__[lex_pos - 1] == STREET ))
1183 		{
1184 			__best_output__[lex_pos] = STREET ;
1185 		}
1186 		else
1187 		{
1188 			__best_output__[lex_pos] = output_symb ;
1189 		}
1190 	}
1191 	return lex_pos ;
1192 }
1193 
1194 /* ====================================================================
1195 analyze.c (lex_has_def)
1196 called by analyze.c (non_geocode_address, _modify_position_)
1197 scan the ith row of comp_lex_sym for the symbol sym
1198 returns the matching cell j
1199 =======================================================================*/
lex_has_def(STAND_PARAM * __stand_param__,int i,SYMB sym)1200 static int lex_has_def(STAND_PARAM *__stand_param__, int i, SYMB sym)
1201 {
1202 	int j ;
1203 	int *__num_defs__ = __stand_param__->def_cnt ;
1204 	for (j = 0; j < __num_defs__[i]; j ++)
1205 	{
1206 		if (__stand_param__->comp_lex_sym[i][j] == sym)
1207 		{
1208 			return j ;
1209 		}
1210 	}
1211 	return FAIL ;
1212 }
1213 
1214 /* ====================================================================
1215 analyze.c (have_schema_symbol)
1216 called by analyze.c (schema_modify_position)
1217 =======================================================================*/
have_schema_symbol(int * __check_dir__,SYMB sym)1218 static int have_schema_symbol(int *__check_dir__,SYMB sym)
1219 {
1220 	if (__check_dir__ != NULL)
1221 	{
1222 		if (__check_dir__[sym])
1223 		{
1224 			return TRUE ;
1225 		}
1226 	}
1227 	return FALSE ;
1228 }
1229 
1230 /* ====================================================================
1231 <summary>
1232 	<function name='analyze.c (default_seg_val)'/>
1233 	<calls> <functionref='analyze.c (make_singleton)'/> </calls>
1234 	<called-by> <functionref='analyze.c (force_arc_clause,
1235 		_force_macro_clause_)'/> </called-by>
1236 	<revision date='2009-08-09'> Fourth arg now used to determine if
1237 		the __sym_sel__ should be initialized to the first definition :
1238 		save_composition uses the value. We will do that when we
1239 		have no idea at all which the right one is -- and there is
1240 		always at least one. </revision>
1241 </summary>
1242 =======================================================================*/
1243 #define DEPTH_POS ( num_lexes - 1 ) - depth
1244 
default_seg_val(int * __sym_sel__,int num_lexes,SEG * __segments__,int use_default_sym,SYMB sym,double score)1245 static void default_seg_val( int *__sym_sel__, int num_lexes, SEG *__segments__, int use_default_sym, SYMB sym, double score )
1246 {
1247 	int depth ;
1248 	for (depth = FIRST_LEX_POS ;depth < num_lexes;depth ++)
1249 	{
1250 		if (use_default_sym)
1251 		{
1252 			/*-- <revision date='2009-08-09'> Set default only if told to do so </revision> --*/
1253 			__sym_sel__[DEPTH_POS] = 0 ; /* -- default value -- */
1254 		}
1255 		/*-- <revision date='2009-08-09'> Eliminate cl arg to make_singleton. </revision> --*/
1256 		make_singleton(__segments__,sym,DEPTH_POS,depth,score) ;
1257 	}
1258 }
1259 
1260 /* ====================================================================
1261 analyze.c (_modify_position_)
1262 called by analyze.c (schema_modify_position,_force_macro_clause_)
1263 calls analyze.c (lex_has_def)
1264 <remarks>If the input symbol is found at pos, then we put the out_sym as the sub_sym
1265 at depth in __seg__
1266 =======================================================================*/
_modify_position_(STAND_PARAM * __stand_param__,SEG * __seg__,int depth,int pos,SYMB in_sym,SYMB out_sym)1267 static int _modify_position_(STAND_PARAM *__stand_param__, SEG *__seg__, int depth, int pos, SYMB in_sym, SYMB out_sym)
1268 {
1269 	int sel ;
1270 	if ((sel = lex_has_def(__stand_param__, pos, in_sym)) != FAIL)
1271 	{
1272 		__seg__[depth].sub_sym = out_sym ;
1273 		__stand_param__->cur_sym_sel[pos] = sel ;
1274 		return TRUE ;
1275 	}
1276 	return FALSE ;
1277 }
1278 
1279 /* ====================================================================
1280 analyze.c (schema_modify_position)
1281 - called by analyze.c (force_arc_clause)
1282 calls analyze.c (have_schema_symbol, _modify_position_)
1283 =======================================================================*/
schema_modify_position(STAND_PARAM * __stand_param__,SEG * __segments__,int depth,int lex_pos,SYMB in_sym,SYMB out_sym)1284 static int schema_modify_position( STAND_PARAM  *__stand_param__ , SEG *__segments__ , int depth , int lex_pos , SYMB in_sym , SYMB out_sym )
1285 {
1286 	/* -- note: this requires that attributes are present. It
1287 	only works if we're working within a particular
1288 	reference dataset. -- */
1289 	if (have_schema_symbol(__stand_param__->have_ref_att, out_sym))
1290 	{
1291 		return (_modify_position_(__stand_param__,__segments__, depth , lex_pos , in_sym , out_sym)) ;
1292 	}
1293 	return FALSE ;
1294 }
1295 
1296 
1297 
1298 /* ====================================================================
1299 analyze.c (force_arc_clause)
1300 called by analyze.c (evaluator)
1301 calls analyze.c (default_seg_val, schema_modify_position and _force_deposit_)
1302 <remarks>We're going to force standardization on an Arc clause without
1303       much computation. first_composition has already done its work,
1304       so we go through the lex_sym looking for likely constructions , using
1305       the schema read as a guide </remarks>
1306 =======================================================================*/
force_arc_clause(STAND_PARAM * __stand_param__)1307 static void force_arc_clause( STAND_PARAM *__stand_param__ )
1308 {
1309 	int lex_start, lex_end, depth ;
1310 	STZ_PARAM * __stz_info__ = __stand_param__->stz_info ;
1311 	int num_lexes = __stand_param__->LexNum ;
1312 	default_seg_val( __stand_param__->cur_sym_sel , num_lexes , __stz_info__->segs , ARC_C , STREET , VERY_LOW_WEIGHT ) ;
1313 	depth = lex_start = 0 ;
1314 	lex_end = num_lexes -1 ;
1315 	/*-- look for a SUFDIR in the last position --*/
1316 	if (lex_start < lex_end -1)
1317 	{
1318 		if (schema_modify_position( __stand_param__ , __stz_info__->segs , depth , lex_end , DIRECT , SUFDIR ))
1319 		{
1320 			lex_end-- ;
1321 			depth ++ ;
1322 		}
1323 	}
1324 	/*-- look for a SUFTYP --*/
1325 	if (lex_start < (lex_end -1))
1326 	{
1327 		if (schema_modify_position( __stand_param__ , __stz_info__->segs , depth , lex_end , TYPE , SUFTYP ))
1328 		{
1329 			lex_end-- ;
1330 		}
1331 	}
1332 	depth = num_lexes - 1 ;
1333 	if (lex_start < (lex_end -1))
1334 	{
1335 		if (schema_modify_position(__stand_param__, __stz_info__->segs, depth, lex_start, DIRECT, PREDIR))
1336 		{
1337 			lex_start++ ;
1338 			depth -- ;
1339 		}
1340 	}
1341 	if (lex_start < (lex_end-1))
1342 	{
1343 		if (schema_modify_position(__stand_param__, __stz_info__->segs, depth, lex_start, TYPE, PRETYP))
1344 		{
1345 			lex_start++ ;
1346 		}
1347 	}
1348 	_force_deposit_(__stand_param__, (__stand_param__->LexNum-1)) ;
1349 }
1350 
/*-- Expects __stand_param__, __segments__, depth and lex_sym_pos in scope
	and must be expanded directly inside a loop body: when the position
	is modified it moves on to the next iteration. --*/
#define MODIFY_SEG_POS(_IN_SYM_VAL_,_OUT_SYM_VAL_)\
if ( _modify_position_( __stand_param__ , __segments__ , depth , lex_sym_pos , _IN_SYM_VAL_ , _OUT_SYM_VAL_ ) ) { continue ; }


/* ====================================================================
<summary>
	<function name='analyze.c (_force_macro_clause_)'/>
	<remarks> Forces a standardization on a Macro clause. Every lexeme
		starts out as a POSTAL singleton of VERY_LOW_WEIGHT. Then, for
		each lexeme from left to right, a fixed list of input symbols
		is tried in order and the first one the lexeme has a definition
		for decides its output symbol. The result is deposited with
		one segment per lexeme. </remarks>
	<called-by> <functionref name='analyze.c (evaluator)'/> </called-by>
	<calls> <functionref name='analyze.c (default_seg_val,_modify_position_,_force_deposit_)'/> </calls>
</summary>
=======================================================================*/
#ifdef USE_FORCE_MACRO
static void _force_macro_clause_( STAND_PARAM *__stand_param__ )
{
	/*-- { input symbol looked for , output symbol assigned } in the
		order they are tried --*/
	const SYMB rewrite[][2] =
	{
		{ PCH , POSTAL } ,
		{ PCT , POSTAL } ,
		{ QUINT , POSTAL } ,
		{ QUAD , POSTAL } ,
		{ NUMBER , POSTAL } ,
		{ MIXED , POSTAL } ,
		{ NATION , NATION } ,
		{ PROV , PROV } ,
		{ CITY , CITY } ,
		{ WORD , CITY }
	} ;
	const int num_rewrites = (int) ( sizeof rewrite / sizeof rewrite[0] ) ;

	SEG *segs = __stand_param__->stz_info->segs ;
	int num_lexes = __stand_param__->LexNum ;
	int last = num_lexes - 1 ;
	int lex_sym_pos ;
	int depth ;

	default_seg_val( __stand_param__->cur_sym_sel , num_lexes , segs , MACRO_C , POSTAL , VERY_LOW_WEIGHT ) ;

	/*-- lexeme positions run up while the paired segment depths run down --*/
	for ( lex_sym_pos = 0 , depth = last ; lex_sym_pos <= last ; lex_sym_pos ++ , depth -- )
	{
		int r ;
		for ( r = 0 ; r < num_rewrites ; r ++ )
		{
			/*-- the first symbol found settles this position --*/
			if ( _modify_position_( __stand_param__ , segs , depth , lex_sym_pos , rewrite[r][0] , rewrite[r][1] ))
			{
				break ;
			}
		}
	}
	_force_deposit_( __stand_param__ , num_lexes - 1 ) ;
}
#endif
1387 /* ====================================================================
1388 <summary>
1389 	<function name='analyze.c (_force_deposit_)'/>
1390 	<called-by> <function ref='analyze.c (force_arc_clause,_force_macro_clause_)'/> </called-by>
1391 	<calls> <function ref='analyze.c (deposit_stz)'/> </calls>
1392 </summary>
1393 =======================================================================*/
_force_deposit_(STAND_PARAM * __stand_param__,int depth)1394 static void _force_deposit_( STAND_PARAM *__stand_param__ , int depth )
1395 {
1396 	/*-- <remarks> Worst case scenario: we have a string of unknowns. It'll score
1397 		really low, but not zero. </remarks> --*/
1398 	double sum = 0.00 ;
1399 	SEG *__seg__ ;
1400 	SEG *__segments__ = __stand_param__->stz_info->segs ;
1401 	for (__seg__ = __segments__ + depth; __seg__ >= __segments__; __seg__--)
1402 	{
1403 		sum += __seg__->Value ;
1404 	}
1405 	deposit_stz( __stand_param__ , sum , depth ) ;
1406 }
1407 
1408 /* ====================================================================
1409 analyze.c (non_geocode_address)
1410 called by analyze.c (evaluator)
1411 calls analyze.c (lex_has_def)
1412 =======================================================================*/
non_geocode_address(STAND_PARAM * __stand_param__)1413 static int non_geocode_address( STAND_PARAM *__stand_param__ )
1414 {
1415 	/* -- scan through each position looking for an RR or BOXH token. -- */
1416 	int lex_sym_pos ;
1417 	int n = __stand_param__->LexNum ;
1418 	for ( lex_sym_pos = FIRST_LEX_POS ; lex_sym_pos < n ; lex_sym_pos ++ )
1419 	{
1420 		int result = lex_has_def( __stand_param__ , lex_sym_pos , RR ) ;
1421 		if ( result != FAIL )
1422 		{
1423 			return TRUE ;
1424 		}
1425 		if ((result = lex_has_def( __stand_param__ , lex_sym_pos , BOXH )) != FAIL)
1426 		{
1427 			return TRUE ;
1428 		}
1429 	}
1430 	return FALSE ;
1431 }
1432 
1433 /* ====================================================================
1434 analyze.c (output_raw_elements)
1435 print out the raw elements of the tokens
1436 =======================================================================*/
output_raw_elements(STAND_PARAM * __stand_param__,ERR_PARAM * __err_param__)1437 void output_raw_elements( STAND_PARAM * __stand_param__ , ERR_PARAM *__err_param__ )
1438 {
1439 	int stz_no , n ;
1440 	int lex_pos ;
1441 	DEF *__def__ ;
1442     STZ **__stz_list__;
1443 
1444 	STZ_PARAM *__stz_info__ = __stand_param__->stz_info ;
1445 	if (__err_param__ == NULL)
1446 	{
1447 		printf("Input tokenization candidates:\n") ;
1448 	}
1449 	else
1450 	{
1451 		LOG_MESS("Input tokenization candidates:",__err_param__) ;
1452 	}
1453 	for (lex_pos = FIRST_LEX_POS;lex_pos < __stand_param__->LexNum;lex_pos ++)
1454 	{
1455 		for ( __def__ = __stand_param__->lex_vector[lex_pos].DefList; __def__ != NULL; __def__ = __def__->Next)
1456 		{
1457 			if (__err_param__ == NULL)
1458 			{
1459 				printf("\t(%d) std: %s, tok: %d (%s)\n",lex_pos,((__def__->Protect )? __stand_param__->lex_vector[lex_pos].Text : __def__->Standard),__def__->Type,in_symb_name(__def__->Type));
1460 			}
1461 			else
1462 			{
1463 				sprintf( __err_param__->error_buf , "\t(%d) std: %s, tok: %d (%s)\n" , lex_pos , (( __def__->Protect )? __stand_param__->lex_vector[lex_pos].Text : __def__->Standard) , __def__->Type , in_symb_name( __def__->Type ));
1464 				register_error( __err_param__ ) ;
1465 			}
1466 		}
1467 	}
1468 	n = __stz_info__->stz_list_size ;
1469 	__stz_list__ = __stz_info__->stz_array ;
1470 	for ( stz_no = FIRST_STZ ; stz_no < n ; stz_no ++ )
1471 	{
1472 		STZ *__cur_stz__ = __stz_list__[stz_no] ;
1473 		if ( __err_param__ == NULL )
1474 		{
1475 			printf( "Raw standardization %d with score %f:\n" , ( stz_no  ) , __cur_stz__->score ) ;
1476 		}
1477 		else
1478 		{
1479 			LOG_MESS2( "Raw standardization %d with score %f:\n" , ( stz_no  ) , __cur_stz__->score , __err_param__ ) ;
1480 		}
1481 		for ( lex_pos = FIRST_LEX_POS ; lex_pos < __stand_param__->LexNum ; lex_pos ++ )
1482 		{
1483             SYMB k;
1484 			__def__ = __cur_stz__->definitions[lex_pos] ;
1485 			/*-- 2010-11-18 : handle end STOPWORD --*/
1486 			k = __cur_stz__->output[lex_pos] ;
1487 			if ( __err_param__ == NULL )
1488 			{
1489 				printf( "\t(%d) Input %d (%s) text %s mapped to output %d (%s)\n" , lex_pos , __def__->Type , in_symb_name( __def__->Type ) , (( __def__->Protect )? __stand_param__->lex_vector[lex_pos].Text : __def__->Standard ) , k , (( k == FAIL )? "NONE" : out_symb_name( k ))) ;
1490 			}
1491 			else
1492 			{
1493 				sprintf( __err_param__->error_buf , "\t(%d) Input %d (%s) text %s mapped to output %d (%s)\n" , lex_pos , __def__->Type , in_symb_name( __def__->Type ) , (( __def__->Protect )? __stand_param__->lex_vector[lex_pos].Text : __def__->Standard ) , k , (( k == FAIL )? "NONE" : out_symb_name( k ))) ;
1494 				register_error( __err_param__ ) ;
1495 			}
1496 			if ( k == FAIL ) break ;
1497 		}
1498 	}
1499 	fflush( stdout ) ;
1500 }
1501 
1502