1 /* analyze.c
2
3
This file contains the routines that find the rules that
best fit the input address and assign each element of the
input to the appropriate output field. The process is
7 essentially one of pattern-matching. The Aho-Corasick algorithm
8 is used to match rules that map input symbols found by the tokenizer
9 to output symbols. In the general case a clause tree is built left to
10 right, matching rules of a particular class, depending on the state.
11
12 Prototype 7H08 (This file was written by Walter Sinclair).
13
14 Copyright (c) 2009 Walter Bruce Sinclair
15
16 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
17
18 The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
19
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21
22 */
23
24 /* For pagc-0.3.0 : last revised 2010-11-18 */
25
26 //#define OCCUPANCY_DEBUG
27 #define USE_FORCE_MACRO
28
29 #include <stdlib.h>
30 #include <stdio.h>
31 #include <stddef.h>
32 #include <string.h>
33 #include "pagc_api.h"
34
35
36 /* ------------------------------------------------------------
37 A lookup string with a particular standardization is prevented
38 from becoming associated with a particular output symbol
39 ------------------------------------------------------------- */
40 typedef struct def_blocker
41 {
42 char *lookup ;
43 char *standard ;
44 SYMB output_symbol ;
45 DEF *definition ;
46 } DEF_BLOCKER ;
47
48 #define NUM_DEF_BLOCKERS 2
49
50 /* ---------------------------------------------------------------
51 When adding to this list, increment NUM_DEF_BLOCKERS for each new
52 entry. This list blocks the use of the lookup string (first entry)
53 as the standardization (second entry) as an output symbol (third)
54 binding to the definition (fourth entry). The fourth entry is
55 added at initialization after the lexicon is read into memory.
56 Thus ST is blocked as STREET as a pretype. This occurs if the
57 rule attempts to move ST (as SAINT) left from STREET into PRETYP.
58 ---------------------------------------------------------------- */
59 static DEF_BLOCKER __def_block_table__[NUM_DEF_BLOCKERS] =
60 {
61 {"ST", "STREET", PRETYP, NULL } ,
62 {"ST", "STREET", CITY, NULL }
63 } ;
64
65 /* -- local prototypes -- */
66
67 static int check_def_block( STAND_PARAM * , int ) ;
68 static void delete_stz( STZ_PARAM * , int ) ;
69 static int delete_duplicate_stz( STZ_PARAM * , int ) ;
70 static void first_composition( STAND_PARAM * ) ;
71 static int prepare_target_pattern( STAND_PARAM * ) ;
72 static int no_break( STAND_PARAM *__stand_param__ , int ) ;
73 static int do_left_combine( STAND_PARAM * , int , int ) ;
74 static int need_compression( STAND_PARAM *, SYMB , int , int ) ;
75 static int select_next_composition( STAND_PARAM * ) ;
76 static int copy_best( STAND_PARAM * , int * , SYMB , int , SYMB * ) ;
77 static void save_current_composition( STAND_PARAM * , SEG * , int , SYMB * , DEF ** ) ;
78 static void scan_clause_tree( STAND_PARAM * , int, int ) ;
79 static void shallow_clause_scan( STAND_PARAM * , int , int ) ;
80 static void deposit_stz( STAND_PARAM *, double , int ) ;
81 static STZ *copy_stz( STAND_PARAM * , double ) ;
82 static void make_singleton( SEG * , SYMB , int , int , double ) ;
83 static int lex_has_def( STAND_PARAM * , int , SYMB ) ;
84 static void _force_deposit_( STAND_PARAM * , int ) ;
85 static int have_schema_symbol( int * , SYMB ) ;
86 static void default_seg_val( int * , int , SEG * , int , SYMB , double ) ;
87 static int _modify_position_( STAND_PARAM *, SEG * , int , int , SYMB , SYMB ) ;
88 static int schema_modify_position( STAND_PARAM * , SEG * , int , int , SYMB , SYMB ) ;
89 static void force_arc_clause( STAND_PARAM * ) ;
90 #ifdef USE_FORCE_MACRO
91 static void _force_macro_clause_( STAND_PARAM * ) ;
92 #endif
93 static int non_geocode_address( STAND_PARAM * ) ;
94 static int evaluate_micro_l(STAND_PARAM *) ;
95
/* -- Guide to the transition table:
             MACRO_C   MICRO_C   ARC_C     CIVIC_C   EXTRA_C
   MICRO_B   FAIL      FAIL      EXIT      FAIL      FAIL
   MICRO_M   FAIL      EXIT      PREFIX    FAIL      MICRO_M
   MACRO     EXIT      FAIL      FAIL      FAIL      FAIL
   PREFIX    FAIL      FAIL      FAIL      EXIT      FAIL
   EXIT      FAIL      FAIL      FAIL      FAIL      EXIT

-- */
105
106 static int __tran_table__[MAX_CL][MAX_CL] = {
107 { FAIL, FAIL, EXIT, FAIL, FAIL } ,
108 { FAIL, EXIT, PREFIX, FAIL, MICRO_M } ,
109 { EXIT, FAIL, FAIL, FAIL, FAIL } ,
110 { FAIL, FAIL, FAIL, EXIT, FAIL } ,
111 { FAIL, FAIL, FAIL, FAIL, EXIT }
112 } ;
113
114 /* -- skew weights for each rule class -- */
115 static double __weight_table__[MAX_CL] =
116 {
117 1.0, 0.95, 0.95, 0.8 , 0.85
118 } ;
119
120
121 #define TARG_START 0
122 #define FIRST_STZ 0
123 #define INITIAL_STZ_CUTOFF .05
124 #define VERY_LOW_WEIGHT .15
125 #define START_DEPTH 0
126
127 static double __load_value__[ NUMBER_OF_WEIGHTS ] =
128 {
129 0.00, 0.325, 0.35 , 0.375 , 0.4 ,
130 0.475 , 0.55, 0.6 , 0.65 , 0.675 ,
131 0.7 , 0.75 , 0.8 , 0.825 , 0.85 ,
132 0.9 , 0.95 , 1.00
133 } ;
134
#ifdef OCCUPANCY_DEBUG
/* Names of the rule classes; compiled only for occupancy debugging. */
static const char *__rule_type_names__[] =
{
    "MACRO" ,
    "MICRO" ,
    "ARC" ,
    "CIVIC" ,
    "EXTRA"
} ;
#endif
141
142
143 /* ====================================================================
144 analyze.c (install_def_block_table)
145 process level initialization - called by standard.l (init_stand_process)
146 calls lexicon.c (find_entry)
147 returns FALSE if error encountered.
148 string.h (strcmp)
149 uses macro RET_ERR1, LOG_MESS, CLIENT_ERR
150 =======================================================================*/
/* Resolve the `definition` pointer of every __def_block_table__ entry.
   For each entry the lexicon hash table is searched for the entry's
   lookup string, and the definition chain of the entry found is walked
   until a definition whose Standard text equals the blocked
   standardization turns up.
     __hash_table__ : lexicon hash table handed to find_entry
     __err_param__  : error context handed to RET_ERR1
   Returns TRUE when every table entry was resolved. A lookup string that
   is not in the lexicon, or one that has no definition with the blocked
   standardization, is reported through RET_ERR1 with FALSE as exit value. */
int install_def_block_table( ENTRY **__hash_table__, ERR_PARAM *__err_param__ )
{
    int i ;
    for ( i = 0 ; i < NUM_DEF_BLOCKERS ; i++ )
    {
        DEF *__standard_def__ ;
        ENTRY *__lookup_entry__ = find_entry( __hash_table__ , __def_block_table__[i].lookup ) ;
        if ( __lookup_entry__ == NULL )
        {
            RET_ERR1( "install_def_block_table: Could not find def_block for %s\n", __def_block_table__[i].lookup , __err_param__ , FALSE ) ;
        }
        /* -- walk the whole chain: the blocked standardization need not
              be the first definition of the entry -- */
        for ( __standard_def__ = __lookup_entry__->DefList ; __standard_def__ != NULL ; __standard_def__ = __standard_def__->Next )
        {
            if ( strcmp( __standard_def__->Standard , __def_block_table__[i].standard ) == 0 )
            {
                __def_block_table__[i].definition = __standard_def__ ;
                /* -- stop only once the match has been recorded -- */
                break ;
            }
        }
        if ( __def_block_table__[i].definition == NULL )
        {
            RET_ERR1( "install_def_block_table: Could not find def_block definition for %s\n" , __def_block_table__[i].standard , __err_param__ , FALSE ) ;
        }
    }
    return TRUE ;
}
177
178 /* ====================================================================
179 analyze.c (create_segments)
180 context level initialization -- must come after the lexicon
181 is read - called by init_stand_context
182 Null on error.
183 =======================================================================*/
create_segments(ERR_PARAM * __err_param__)184 STZ_PARAM *create_segments( ERR_PARAM *__err_param__ )
185 {
186 STZ_PARAM *__stz_info__ ;
187 int i ;
188 /* -- we're going to be re-sorting these pointers -- */
189 PAGC_ALLOC_STRUC(__stz_info__,STZ_PARAM,__err_param__,NULL) ;
190 PAGC_CALLOC_STRUC(__stz_info__->stz_array,STZ *,MAX_STZ,__err_param__,NULL) ;
191 for ( i = FIRST_STZ ; i < MAX_STZ ; i++ )
192 {
193 PAGC_ALLOC_STRUC(__stz_info__->stz_array[i],STZ,__err_param__,NULL) ;
194 }
195 PAGC_CALLOC_STRUC(__stz_info__->segs,SEG,MAXLEX,__err_param__,NULL) ;
196 return __stz_info__ ;
197 }
198
199 /* ====================================================================
200 analyze.c (destroy_segments)
201 context level cleanup
202 - called by (standard.l) close_stand_context
203 uses macros PAGC_DESTROY_2D_ARRAY, FREE_AND_NULL
204 =======================================================================*/
/* Release what create_segments built: the STZ array, the segment array
   and finally the STZ_PARAM itself. A NULL argument is ignored. */
void destroy_segments( STZ_PARAM *__stz_info__ )
{
    if ( __stz_info__ != NULL )
    {
        PAGC_DESTROY_2D_ARRAY( __stz_info__->stz_array , STZ,MAX_STZ )
        FREE_AND_NULL( __stz_info__->segs ) ;
        FREE_AND_NULL( __stz_info__ ) ;
    }
}
215 /* ====================================================================
216 analyze.c (get_stz_downgrade)
217 2008-03-13 : lower grade standardizations should not produce the same
218 matching score as higher grade. This can be critical when a lower grade
219 standardization produces a perfect match on the wrong reference record
220 =======================================================================*/
/* Ratio of the score of standardization `request_stz` to the score of
   the first standardization on the list.
   Returns 0 when request_stz lies beyond the active list, 1.0 for
   request 0, and the (zero) top score itself when the top score is zero,
   which avoids the division. */
double get_stz_downgrade( STAND_PARAM *__stand_param__ , int request_stz )
{
    STZ_PARAM *__info__ = __stand_param__->stz_info ;
    double top_score ;
    double this_score ;

    /* -- nothing at that position -- */
    if ( request_stz > ( __info__->stz_list_size - 1 ))
    {
        return 0. ;
    }
    /* -- the best standardization is never downgraded -- */
    if ( request_stz == 0 )
    {
        return 1.0 ;
    }
    top_score = __info__->stz_array[0]->score ;
    if ( top_score == 0. )
    {
        return top_score ;
    }
    this_score = __info__->stz_array[request_stz]->score ;
    return ( this_score / top_score ) ;
}
240
241 /* ====================================================================
242 analyze.c (get_next_stz)
243 called by analyze.c (evaluator) , build.c (Build)
244 build.c (transform_rows) match.c (match_records),
245 calls analyze.c (check_def_block, delete_duplicate_stz)
246 export.c (init_output_fields, stuff_fields)
247 <remarks>
248 return FALSE if the requested stz is not there - this allows
249 termination to a request loop when there are fewer than the maximum
250 number on the list and also reports, on request of 0, that none were
251 found. If the request_stz is the same as the last one done (since
252 evaluator last initialized the last_stz_output variable) we just
253 return rather than redo the same work. When matching we need
254 to know the correct standardization for positioning the point along
255 the arc
256 2008-04-06 : This function needs to return to the 0 stz when selecting
257 the best standardization for the build. To indicate that an override
258 is required, we'll take FAIL as a proxy for 0.
259 </remarks>
260 =======================================================================*/
/* Make standardization `request_stz_in` the current one by copying its
   definitions and output symbols into best_defs / best_output.
   request_stz_in == FAIL is an override meaning "reload standardization
   0": every list check is skipped and the output fields are rebuilt.
   Returns FALSE when the request lies beyond the active list, repeats
   the last request served, or the list runs out while blocked or
   duplicate candidates are being removed; TRUE otherwise. */
int get_next_stz( STAND_PARAM *__stand_param__ , int request_stz_in )
{
    STZ_PARAM *__info__ = __stand_param__->stz_info ;
    DEF **__defs_out__ = __stand_param__->best_defs ;
    SYMB *__symbols_out__ = __stand_param__->best_output ;
    int lex_count = __stand_param__->LexNum ;
    int wanted = ( request_stz_in == FAIL ) ? FIRST_STZ : request_stz_in ;
    int removed ;
    int i ;
    STZ *__chosen__ ;

    if ( request_stz_in != FAIL )
    {
        /*-- Past the end, or the same one as last time: don't call
             get_next_stz with 0 unless you want a FALSE --*/
        if (( wanted > ( __info__->stz_list_size - 1 )) || ( __info__->last_stz_output == wanted ))
        {
            return FALSE ;
        }
        /*-- Drop candidates holding a blocked definition. Each removal
             moves the next candidate into this position, so keep going
             while something was removed and the position is still
             inside the list --*/
        do
        {
            removed = check_def_block( __stand_param__ , wanted ) ;
        } while ( removed && ( __info__->stz_list_size > wanted )) ;
        if ( __info__->stz_list_size == wanted )
        {
            return FALSE ;
        }
        /*-- A clause tree analysis may produce identical output to a
             MICRO_C by combining an ARC_C and CIVIC_C pair; only the
             first such candidate is kept when lower scoring ones are
             requested --*/
        if ( wanted > FIRST_STZ )
        {
            do
            {
                removed = delete_duplicate_stz( __info__ , wanted ) ;
            } while ( removed && ( __info__->stz_list_size > wanted )) ;
            if ( __info__->stz_list_size == wanted )
            {
                return FALSE ;
            }
        }
    }

    /*-- Reload the best defs and output from the chosen stz, then
         terminate both lists --*/
    __chosen__ = __info__->stz_array[wanted] ;
    for ( i = FIRST_LEX_POS ; i < lex_count ; i++ )
    {
        __defs_out__[i] = __chosen__->definitions[i] ;
        __symbols_out__[i] = __chosen__->output[i] ;
    }
    __defs_out__[i] = NULL ;
    __symbols_out__[i] = FAIL ;

    /*-- Values above 0 are requested only to redo a MICRO, and FAIL is
         the override for 0; both rebuild the output fields (LEFT : just
         MICRO here) --*/
    if (( wanted > FIRST_STZ ) || ( request_stz_in == FAIL ))
    {
        init_output_fields( __stand_param__ , LEFT ) ;
        stuff_fields( __stand_param__ ) ;
    }
    __info__->last_stz_output = wanted ;
    return TRUE ;
}
326
327 /* ====================================================================
328 analyze.c (check_def_block)
329 called by analyze.c (get_next_stz)
330 calls analyze.c (delete_stz)
331 =======================================================================*/
/* Remove standardization `request_stz` from the list when one of its
   positions pairs a blocked output symbol with the blocked definition
   recorded for that symbol in __def_block_table__.
   Returns TRUE when the standardization was deleted, FALSE otherwise. */
static int check_def_block( STAND_PARAM *__stand_param__ , int request_stz )
{
    STZ_PARAM *__info__ = __stand_param__->stz_info ;
    STZ *__candidate__ = __info__->stz_array[ request_stz ] ;
    SYMB *__symbols__ = __candidate__->output ;
    DEF **__defs__ = __candidate__->definitions ;
    int lex_count = __stand_param__->LexNum ;
    int lex_pos ;

    for ( lex_pos = FIRST_LEX_POS ; lex_pos < lex_count ; lex_pos++ )
    {
        int rule ;
        for ( rule = 0 ; rule < NUM_DEF_BLOCKERS ; rule++ )
        {
            /* -- both the output symbol and the definition must match -- */
            if (( __symbols__[lex_pos] == __def_block_table__[rule].output_symbol )
                && ( __defs__[lex_pos] == __def_block_table__[rule].definition ))
            {
                delete_stz( __info__ , request_stz ) ;
                return TRUE ;
            }
        }
    }
    return FALSE ;
}
355
356 /* ====================================================================
357 analyze.c (delete_stz)
358 called by analyze.c (check_def_block), analyze.c (delete_duplicate_stz)
359 =======================================================================*/
/* Take standardization `request_stz` off the active list. The list
   count drops by one, the entries behind the deleted one each move down
   a position, and the deleted STZ pointer is parked in the slot just
   vacated at the end so that it is not lost and can be reused. */
static void delete_stz( STZ_PARAM *__stz_info__ , int request_stz )
{
    STZ **__list__ = __stz_info__->stz_array ;
    STZ *__parked__ ;
    int last ;
    int pos ;

    /* -- after the decrement `last` is the index of what was the final
          active entry, now the first inactive slot -- */
    __stz_info__->stz_list_size -= 1 ;
    last = __stz_info__->stz_list_size ;

    /* -- deleting the final entry: it simply becomes inactive -- */
    if ( request_stz == last )
    {
        return ;
    }

    __parked__ = __list__[request_stz] ;
    /* -- close the gap: the entry that followed the deleted one becomes
          the new request_stz -- */
    for ( pos = request_stz + 1 ; pos <= last ; pos++ )
    {
        __list__[pos - 1] = __list__[pos] ;
    }
    /* -- keep the pointer, now inactive, for reuse -- */
    __list__[last] = __parked__ ;
}
397
398 /* ====================================================================
399 analyze.c (delete_duplicate_stz)
400 calls analyze.c (delete_stz)
401 called by analyze.c (get_next_stz)
402 =======================================================================*/
/* Delete standardization `request_stz` when it is identical, in both
   output symbols and definitions, to any earlier standardization on the
   list. Deletion moves the rest of the list down one position.
     __stz_info__ : standardization list
     request_stz  : index of the candidate; every entry from FIRST_STZ up
                    to, but not including, this index is compared
   Returns TRUE when the candidate was deleted, otherwise FALSE. */
static int delete_duplicate_stz(STZ_PARAM *__stz_info__, int request_stz)
{
    int i ;
    STZ **__stz_list__ = __stz_info__->stz_array ;

    for (i = FIRST_STZ; i < request_stz; i++)
    {
        SYMB *__cur_sym_ptr__ = __stz_list__[request_stz]->output ;
        DEF **__stz_definitions__ = __stz_list__[request_stz]->definitions ;
        SYMB *__prev_sym_ptr__ = __stz_list__[i]->output ;
        DEF **__prev_stz_definitions__ = __stz_list__[i]->definitions ;

        /* -- walk both candidates in step for as long as the output
              symbols agree -- */
        for ( ; *__prev_sym_ptr__ == *__cur_sym_ptr__ ;
              __prev_sym_ptr__++ , __cur_sym_ptr__++ ,
              __prev_stz_definitions__++ , __stz_definitions__++ )
        {
            /* -- FAIL terminates the output. Test it before looking at
                  the definitions: the definition slot at the terminator
                  position is not part of the standardization and must
                  not influence the result -- */
            if (*__prev_sym_ptr__ == FAIL)
            {
                delete_stz(__stz_info__, request_stz) ;
                return TRUE ;
            }
            /* -------------------------------------------------------------
            A differing definition, even if the output token is the same
            could lead to a different result. That only rules out this
            earlier entry, so go on to the next one rather than giving up
            on the remaining entries.
            -------------------------------------------------------------- */
            if (*__prev_stz_definitions__ != *__stz_definitions__)
            {
                break ;
            }
        }
    }
    return FALSE ;
}
442
443 /* ====================================================================
444 analyze.c (evaluate_micro_l)
445 called by evaluator
446 2009-08-09 : special routine for MICRO_L state : landmark words
447 <revision date='2012-07-22'> Keep track of start_state </revision>
448 =======================================================================*/
449
/* Evaluation for the FEAT_L, FEAT_T and FEAT_A start states. Every
   composition of definitions is deposited as a single forced
   standardization on the output field that belongs to the start state.
   A composition scores EXCELLENT when each selected definition either
   has the desired lexicon type or is flagged Protect, and LOW otherwise.
   Returns FALSE for any other start state, otherwise the result of
   get_next_stz for the first standardization. */
static int evaluate_micro_l( STAND_PARAM *__stand_param__ )
{
    int __def_wanted__[MAXLEX][MAXDEF] ;
    int *__orig_pos__ = __stand_param__->orig_str_pos ;
    int *__sym_sel__ = __stand_param__->cur_sym_sel ;
    int *__num_defs__ = __stand_param__->def_cnt ;
    LEXEME *__lexeme__ = __stand_param__->lex_vector ;
    int lex_count = __stand_param__->LexNum ;
    int start = __stand_param__->start_state ;
    int lex_pos , desired_type , output_field ;

    /* 2009-08-15 : use lexicon types */
    if ( start == FEAT_L )
    {
        desired_type = 1 ;
        output_field = FEATNAME ;
    }
    else if ( start == FEAT_T )
    {
        desired_type = 2 ;
        output_field = FEATTYPE ;
    }
    else if ( start == FEAT_A )
    {
        desired_type = 1 ;
        output_field = FEATAREA ;
    }
    else
    {
        return FALSE ;
    }

    /* -- load the definition chain of every lexeme into the composition
          arrays and note which definitions are wanted -- */
    for ( lex_pos = FIRST_LEX_POS ; lex_pos < lex_count ; lex_pos++ )
    {
        int def_num ;
        DEF *__def__ ;

        __orig_pos__[lex_pos] = lex_pos ; /* no compression is used here */
        __sym_sel__[lex_pos] = 0 ;        /* -- start at 0 for each Lexeme -- */

        def_num = 0 ;
        __def__ = __lexeme__[lex_pos].DefList ;
        while ( __def__ != NULL )
        {
            __stand_param__->comp_lex_sym[lex_pos][def_num] = __def__->Type ;
            __stand_param__->def_array[lex_pos][def_num] = __def__ ;
            /* 2009-08-30 : filter out non-default non-desired */
            if (( __def__->Type == desired_type ) || ( __def__->Protect ))
            {
                __def_wanted__[lex_pos][def_num] = TRUE ;
            }
            else
            {
                __def_wanted__[lex_pos][def_num] = FALSE ;
            }
            __def__ = __def__->Next ;
            def_num++ ;
        }
        __num_defs__[lex_pos] = def_num ;
    }

    /*-- Visit every composition and deposit it with a score reflecting
         whether all of its selected definitions are wanted --*/
    do
    {
        int all_wanted = TRUE ;
        double seg_score ;

        /* -- a single unwanted definition downgrades the composition -- */
        for ( lex_pos = lex_count - 1 ; lex_pos >= FIRST_LEX_POS ; lex_pos-- )
        {
            if ( !__def_wanted__[lex_pos][__sym_sel__[lex_pos]] )
            {
                all_wanted = FALSE ;
                break ;
            }
        }
        /* 2009-10-16 : accept other types */
        if ( all_wanted )
        {
            seg_score = EXCELLENT ;
        }
        else
        {
            seg_score = LOW ;
        }
        default_seg_val( __sym_sel__ , lex_count , __stand_param__->stz_info->segs , FALSE , output_field , seg_score ) ;
        _force_deposit_( __stand_param__ , ( lex_count - 1 )) ;
    } while ( select_next_composition( __stand_param__ )) ;

    /* -- in case nothing was found -- */
    return ( get_next_stz( __stand_param__ , FIRST_STZ )) ;
}
521
522
523 /* ====================================================================
524 analyze.c (evaluator)
525 called by standard.l (close_stand_field)
526 calls analyze.c (first_composition) , analyze.c (shallow_clause_scan) ,
527 analyze.c (scan_clause_tree) , analyze.c (select_next_composition) ,
528 analyze.c(force_arc_clause) , analyze.c (_force_macro_clause_) ,
529 analyze.c(non_geocode_address) , analyze.c (get_next_stz)
530 analyze.c (prepare_target_pattern)
531 <revision date='2006-11-02'> add STAND_PARAM arg and change calls </revision>
532 <revision date='2012-07-22'> Keep track of start_state </revision>
533 =======================================================================*/
/* Build the list of standardizations for the current input.
   The list is reset first, then every composition of definitions is
   turned into a target pattern and scanned for rules: a shallow scan of
   a single rule class for the MACRO, MICRO_B and EXTRA_STATE states, a
   clause tree scan for any other state. Start states above EXTRA_STATE
   are handed to evaluate_micro_l instead.
   Returns the result of get_next_stz for the first standardization,
   which is FALSE when nothing was found. */
int evaluator(STAND_PARAM *__stand_param__)
{
    STZ_PARAM *__info__ = __stand_param__->stz_info ;
    int state = __stand_param__->start_state ;

    __info__->stz_list_cutoff = INITIAL_STZ_CUTOFF ;
#ifdef OCCUPANCY_DEBUG
    if (state == EXTRA_STATE)
    {
        __info__->stz_list_cutoff = 0.00 ;
    }
#endif
    __info__->stz_list_size = FIRST_STZ ;
    __info__->last_stz_output = FAIL ;

    /*-- <revision date='2009-08-09'> Special evaluation for landmarks </revision> --*/
    if (state > EXTRA_STATE)
    {
        return (evaluate_micro_l(__stand_param__)) ;
    }

    for ( ; ; )
    {
        first_composition(__stand_param__) ; /* 2007-08-09 */
        /* -- cycle through all the possible compositions -- */
        do
        {
            int target_len = prepare_target_pattern(__stand_param__) ;
            if (target_len == TARG_START)
            {
                /* -- empty target: on to the next composition -- */
                continue ;
            }
            /* --------------------------------------------------------------
            No clause tree is needed for the MACRO, MICRO_B and EXTRA_STATE
            states since only one segment is wanted (2008-04-19 : the
            EXTRA_STATE scan is for occupancy only).
            ----------------------------------------------------------------*/
            if (state == MACRO)
            {
                shallow_clause_scan(__stand_param__, MACRO_C, target_len) ;
            }
            else if (state == MICRO_B)
            {
                shallow_clause_scan(__stand_param__, ARC_C, target_len) ;
            }
            else if (state == EXTRA_STATE)
            {
                shallow_clause_scan(__stand_param__, EXTRA_C, target_len) ;
            }
            else
            {
                scan_clause_tree(__stand_param__, state, target_len) ;
            }
            /* ----------------------------------------------------------------
            The list size is tested first; otherwise the score examined
            could belong to some previous result when no standardization
            was found. An excellent top score ends the cycle early unless
            analyze_complete is set.
            ----------------------------------------------------------------- */
            if ((__info__->stz_list_size > FIRST_STZ)
                && (!__stand_param__->analyze_complete)
                && (__info__->stz_array[FIRST_STZ]->score >= __load_value__[EXCELLENT]))
            {
                break ;
            }
        } while (select_next_composition(__stand_param__)) ;

        /* -- a top score of at least __load_value__[1] is accepted -- */
        if ((__info__->stz_list_size > FIRST_STZ)
            && (__info__->stz_array[FIRST_STZ]->score >= __load_value__[1]))
        {
            break ;
        }
        /* -- otherwise force a segment -- */
        if (state == MICRO_B)
        {
            force_arc_clause(__stand_param__) ;
            break ;
        }
#ifdef USE_FORCE_MACRO
        if (state == MACRO)
        {
            _force_macro_clause_(__stand_param__) ;
            break ;
        }
#endif
        /* -- only MICRO_M gets another pass, in the EXIT state, and only
              when non_geocode_address returns non-zero -- */
        if ((state != MICRO_M) || (!non_geocode_address(__stand_param__)))
        {
            break ;
        }
        state = EXIT ;
    }
    /* -- in case nothing was found -- */
    return (get_next_stz(__stand_param__, FIRST_STZ)) ;
}
626
627
628 /* ====================================================================
629 <summary>
630 <function name='analyze.c (first_composition)'/>
631 <called-by> <functionref='analyze.c (evaluator)'/>
<remarks> Called by Evaluator to initialize __best_output__ and __sym_sel__ -
633 also sets up lex_sym, save_defs and __num_defs__ from the
634 definitions in the LexVector </remarks>
635 </summary>
636 =======================================================================*/
/* Set up the first composition: every lexeme selects its definition 0.
   The definition chain of each lexeme is also copied into comp_lex_sym
   (the Type of each definition) and def_array (the definition itself),
   and the chain length is recorded in def_cnt. */
static void first_composition( STAND_PARAM *__stand_param__ )
{
    int *__selection__ = __stand_param__->cur_sym_sel ;
    int *__def_count__ = __stand_param__->def_cnt ;
    LEXEME *__lexemes__ = __stand_param__->lex_vector ;
    int lex_count = __stand_param__->LexNum ;
    int lex_pos ;

    for (lex_pos = FIRST_LEX_POS; lex_pos < lex_count; lex_pos++)
    {
        int def_num ;
        DEF *__def__ ;

        /* -- start at 0 for each Lexeme -- */
        __selection__[lex_pos] = 0 ;

        /* -- walk the def chain, counting as we copy -- */
        def_num = 0 ;
        __def__ = __lexemes__[lex_pos].DefList ;
        while (__def__ != NULL)
        {
            __stand_param__->comp_lex_sym[lex_pos][def_num] = __def__->Type ;
            __stand_param__->def_array[lex_pos][def_num] = __def__ ;
            __def__ = __def__->Next ;
            def_num++ ;
        }
        __def_count__[lex_pos] = def_num ;
    }
}
661
662 /* ============================================================
663 analyze.c (prepare_target_pattern)
664 called by analyze.c (evaluator)
665 calls analyze.c (need_compression) gamma.c (refresh_transducer)
666 2006-10-31 : add STAND_PARAM parameter and change calls
667 ==============================================================*/
/* Build the target pattern for the current composition. The symbol
   selected for each lexeme is appended to the target unless
   need_compression folds it into a neighbouring position. The target is
   terminated with FAIL and, when it is not empty, the transducer
   registry is refreshed for it.
   Returns the number of symbols placed in the target. */
static int prepare_target_pattern(STAND_PARAM *__stand_param__)
{
    int *__selection__ = __stand_param__->cur_sym_sel ;
    SYMB *__target__ = __stand_param__->target ;
    int *__orig_pos__ = __stand_param__->orig_str_pos ;
    int lex_count = __stand_param__->LexNum ;
    NODE **__g_function__ = __stand_param__->rules->gamma_matrix ;
    int target_len = TARG_START ;
    int lex_pos ;

    for ( lex_pos = FIRST_LEX_POS ; lex_pos < lex_count ; lex_pos++ )
    {
        SYMB in_symb = __stand_param__->comp_lex_sym[lex_pos][__selection__[lex_pos]] ;
        /* ------------------------------------------------------------
        Words and stopwords are compressed: any combination of LEFT
        and RIGHT compression tokens compresses as a single word. A
        compressed symbol is discarded and the target position stays
        where it is.
        ------------------------------------------------------------- */
        if ( need_compression( __stand_param__ , in_symb , lex_pos , target_len ))
        {
            continue ;
        }
        /* -- no compression: tie this lexeme to the target position,
              store the symbol and move on to the next position -- */
        __orig_pos__[lex_pos] = target_len ;
        __target__[target_len] = in_symb ;
        target_len++ ;
    }
    /*-- Terminate the symbol list --*/
    __target__[target_len] = FAIL ;

    /*-- The target may be empty, e.g. a lone stopword; the Aho-Corasick
         registry of output links is set up only for a non-empty one --*/
    if ( target_len > TARG_START )
    {
        refresh_transducer( __stand_param__->registry , __target__ , __g_function__ ) ;
    }
    return target_len ;
}
706
707 /* ============================================================
708 analyze.c (no_break)
709 called by analyze.c (do_left_combine)
710 -- moved from tokenize.c to analyze.c
711 ==============================================================*/
/* Returns FALSE when the morpheme that ends lexeme `n` has Term equal to
   1, otherwise TRUE. Term values:
     0 is no break
     1 is set for semicolons, tabs and commas
     2 for spaces */
static int no_break( STAND_PARAM *__stand_param__ , int n )
{
    int end_morph = __stand_param__->lex_vector[n].EndMorph ;

    if ( __stand_param__->morph_array[end_morph].Term == 1 )
    {
        return FALSE ;
    }
    return TRUE ;
}
720
721
722 /* ============================================================
723 analyze.c (do_left_combine)
724 calls analyze.c (no_break) called by analyze.c (need_compression)
725 ==============================================================*/
/* Try to fold the symbol at `lex_pos` into the previous target symbol.
   This succeeds only when a previous target symbol exists, it is a
   LEFT_COMPRESS, and no_break reports no break after the preceding
   lexeme. On success the lexeme is mapped to the previous target
   position and TRUE is returned; otherwise nothing is changed and FALSE
   is returned. */
static int do_left_combine( STAND_PARAM *__stand_param__ , int lex_pos , int target_pos )
{
    /*-- Nothing to the left to combine with. A RIGHT_COMPRESS also ends
         up here when it is at the start; need_compression deals with
         that --*/
    if ( target_pos == TARG_START )
    {
        return FALSE ;
    }
    /*-- Left compression needs a LEFT_COMPRESS to combine with --*/
    if ( __stand_param__->target[target_pos - 1] != LEFT_COMPRESS )
    {
        return FALSE ;
    }
    /*-- A break in the lex sequence suggests these two words don't
         belong together --*/
    if ( !no_break( __stand_param__ , lex_pos - 1 ))
    {
        return FALSE ;
    }
    /*-- Left compress by sharing the target position of the previous
         symbol; the association of lex_pos and target position is needed
         for later decompression --*/
    __stand_param__->orig_str_pos[lex_pos] = target_pos - 1 ;
    return TRUE ;
}
749
750
751 /* ============================================================
752 analyze.c (need_compression)
753 called by analyze.c (prepare_target_pattern)
754 calls analyze.c (do_left_combine)
755 ==============================================================*/
/* Decide whether symbol `a` at `lex_pos` is compressed away rather than
   stored in the target.
   Returns TRUE when the symbol is compressed (the caller must not
   advance the target position) and FALSE when it has to be stored. */
static int need_compression( STAND_PARAM *__stand_param__ , SYMB a , int lex_pos , int target_pos )
{
    if ( a == RIGHT_COMPRESS )
    {
        /*-- A stopword never enters the target. It combines with the
             previous target symbol when it can; otherwise it does a right
             combine by taking the next position. Note that this allows a
             STOPWORD to combine with TYPE or DIR tokens, which is what
             we want in cases like EL CAMINO RD -- a RIGHT_COMPRESS may
             stray into the wrong field -- deal with this when
             decompressing --*/
        if ( !do_left_combine( __stand_param__ , lex_pos , target_pos ))
        {
            /* -- target_pos does not advance when TRUE is returned -- */
            __stand_param__->orig_str_pos[lex_pos] = target_pos ;
        }
        return TRUE ;
    }
    if ( a == LEFT_COMPRESS )
    {
        /*-- a WORD compresses only when it combines to the left --*/
        return ( do_left_combine( __stand_param__ , lex_pos , target_pos )) ;
    }
    /* -----------------------------------------------------------------------
    Everything that isn't a WORD must be accepted - we don't want to
    combine words that are used in parsing, - two direction words, for
    instance, one of which may be used as part of a street name, the other
    perhaps as a suffix direction.
    -------------------------------------------------------------------------*/
    return FALSE ;
}
789
790 /*========================================================================
791 analyze.c (scan_clause_tree)
792 Called by analyze.c (Evaluator)
793 Calls analyze.c (deposit_stz)
794 2006-11-02 : add KW *** arg, change call to GetOutputLink to direct access
795 =========================================================================*/
/*-- Walk the clause tree iteratively.  Starting with start_state and the
     target position start_pos, rules ending at the current position are
     tried class by class; each accepted rule is recorded in the segment
     for the current depth, and either completes a standardization (its
     start reaches 0) or opens a subtree one level deeper.  When a level
     runs out of classes the walk backs up one level and resumes with the
     rule after the one stored there.  Returns when the top level is
     exhausted. --*/
static void scan_clause_tree(STAND_PARAM *__stand_param__,int start_state,int start_pos)
{
   RULE_PARAM *rules = __stand_param__->rules ;
   KW ***links = rules->output_link ;
   SEG *segs = __stand_param__->stz_info->segs ;
   KW *rule = NULL ;               /*-- rule under consideration, NULL = fetch a new list --*/
   double total = 0.00 ;           /*-- running score of the segments on the current path --*/
   int target_state = FAIL ;       /*-- transition for (cur_state, clause) as last looked up --*/
   int cur_pos = start_pos ;       /*-- one beyond the last unmatched symbol --*/
   int cur_state = start_state ;   /*-- row of __tran_table__ --*/
   int level = START_DEPTH ;       /*-- depth in the clause tree --*/
   int clause = 0 ;

   for ( ;; )
   {
      SEG *resume ;

      for ( ;; )
      {
         SEG *seg ;

         if ( rule == NULL )
         {
            /*-- Out of rules for this class: advance to the next class. --*/
            clause++ ;
            if ( clause == MAX_CL )
            {
               /*-- No classes left at this level.  Leave altogether when
                    already at the top, otherwise climb one level. --*/
               if ( level == START_DEPTH )
               {
                  return ;
               }
               level-- ;
               break ;
            }
            target_state = __tran_table__[cur_state][clause] ;
            if ( target_state == FAIL )
            {
               continue ;   /*-- no transition for this class --*/
            }
            /*-- direct access to the output link (replaces GetOutputLink,
                 2006-11-02) --*/
            rule = links[__stand_param__->registry[cur_pos]][clause] ;
            if ( rule == NULL )
            {
               continue ;
            }
         }

         /*-- A rule that consumes everything up to position 0 is only
              useful when the transition leads to EXIT. --*/
         if (( rule->Length == cur_pos ) && ( target_state != EXIT ))
         {
            rule = rule->OutputNext ;
            continue ;
         }

         /*-- Record this rule in the segment for the current level. --*/
         seg = segs + level ;
         seg->End = cur_pos - 1 ;
         seg->Key = rule ;
         seg->State = cur_state ;
         seg->Output = rule->Output ;
         if ( rules->collect_statistics )
         {
            rule->hits++ ;
            rules->total_key_hits++ ;
         }
         seg->Value = __load_value__[rule->Weight] * __weight_table__[rule->Type] ;
         total += seg->Value ;

         seg->Start = cur_pos - rule->Length ;
         if ( seg->Start == 0 )
         {
            /*-- Whole target matched: save it if this is a valid end
                 state, then undo this segment's contribution and try the
                 next rule on the same list (same class, state, level and
                 position). --*/
            if ( target_state == EXIT )
            {
               deposit_stz( __stand_param__ , total , level ) ;
            }
            total -= seg->Value ;
            rule = rule->OutputNext ;
            continue ;
         }

         /*-- Symbols remain to the left: open a subtree one level down. --*/
         cur_pos = seg->Start ;
         cur_state = __tran_table__[cur_state][clause] ;
         level++ ;
         clause = 0 ;
         rule = NULL ;
      }

      /*-- Back up: restore state, score and position from the segment at
           the level we returned to, before it gets overwritten. --*/
      resume = segs + level ;
      cur_state = resume->State ;
      if ( level == START_DEPTH )
      {
         total = 0.00 ;
         cur_pos = start_pos ;
      }
      else
      {
         total -= resume->Value ;
         cur_pos = resume->End + 1 ;
      }
      /*-- Continue with the rule following the one stored there, in the
           class that rule belongs to.  Note target_state is not looked up
           again here. --*/
      rule = resume->Key ;
      clause = rule->Type ;
      rule = rule->OutputNext ;
   }
}
901
902 /*========================================================================
903 analyze.c (shallow_clause_scan)
904 Called by analyze.c (evaluator)
905 Calls analyze.c (deposit_stz)
906 <remarks>Called by Evaluator to get a complete rule for this class. If we
907 can't get a complete rule we don't want one at all. If no composition
908 can come up with one, force_standardization will activate</remarks>
909 2006-11-02 : add KW *** arg, change call to GetOutputLink to direct access
910 =========================================================================*/
/*-- Deposit a standardization for every rule of class cl that ends at pos
     and is at least pos symbols long.  Only the first segment is used;
     scanning stops at the first rule shorter than pos or at the end of the
     rule list. --*/
static void shallow_clause_scan(STAND_PARAM *__stand_param__ , int cl, int pos)
{
   RULE_PARAM *rules = __stand_param__->rules ;
   KW ***links = rules->output_link ;
   SEG *root = __stand_param__->stz_info->segs ;
   KW *rule ;

   /*-- the single segment always spans the whole target --*/
   root->End = pos - 1 ;
   root->Start = 0 ;

   /*-- direct access to the output link (replaces GetOutputLink, 2006-11-02) --*/
   rule = links[__stand_param__->registry[pos]][cl] ;
   while (( rule != NULL ) && !( rule->Length < pos ))
   {
      root->Output = rule->Output ;
      if ( rules->collect_statistics )
      {
         /*-- the key is only remembered when statistics are collected --*/
         root->Key = rule ;
         rule->hits++ ;
         rules->total_key_hits++ ;
      }
#ifdef OCCUPANCY_DEBUG
      if ( cl == EXTRA_C )
      {
         SYMB *symp ;
         printf( "\nRule is type %d (%s)\n: " , rule->Type , __rule_type_names__[rule->Type] ) ;
         printf( "Input : " ) ;
         for ( symp = rule->Input ; *symp != FAIL ; symp++ )
         {
            printf( "|%d (%s)|", *symp , in_symb_name( *symp )) ;
         }
         printf("\nOutput: ") ;
         for ( symp = rule->Output ; *symp != FAIL ; symp++ )
         {
            printf("|%d (%s)|",*symp,out_symb_name(*symp)) ;
         }
         printf ("\nrank %d ( %f)\n",rule->Weight,__load_value__[rule->Weight]) ;
      }
#endif
      /*-- the score is the bare load value, not scaled by the class
           weight, so the cutoff is easier to apply --*/
      deposit_stz( __stand_param__ , __load_value__[rule->Weight] , START_DEPTH ) ;
      rule = rule->OutputNext ;
   }
}
958
959 /* ====================================================================
960 analyze.c (select_next_composition)
961 called by analyze.c (evaluator)
962 =======================================================================*/
/*-- Advance cur_sym_sel to the next combination of definitions, treating
     it like an odometer whose rightmost position turns fastest.  Returns
     TRUE when a new combination was produced, FALSE when every position
     has rolled over back to 0. --*/
static int select_next_composition( STAND_PARAM *__stand_param__ )
{
   int *selector = __stand_param__->cur_sym_sel ;
   int *limit = __stand_param__->def_cnt ;
   int idx = __stand_param__->LexNum - 1 ;

   while ( idx >= FIRST_LEX_POS )
   {
      if ( ++selector[idx] < limit[idx] )
      {
         return TRUE ;      /*-- no turnover needed at this position --*/
      }
      selector[idx] = 0 ;   /*-- roll over and carry into the position to the left --*/
      idx-- ;
   }
   return FALSE ;
}
981
982 /* ====================================================================
983 <summary>
984 <function name='analyze.c (make_singleton)'>
985 <remarks> Called to make a segment with a putative single position output.
986 Don't really need a KW. as long as copy_best knows how to handle
987 it. </remarks>
988 <called-by><functionref='analyze.c (default_seg_val)'/></called-by>
989 <revision date='2009-08-09'> Eliminate cl arg to make_singleton. </revision>
990 </summary>
991 =======================================================================*/
/*-- Turn the segment at index depth into a one-position segment covering
     pos, valued at score.  Output is set to NULL and the output symbol is
     kept in sub_sym instead; save_current_composition uses sub_sym when it
     finds a NULL Output.  Segments run left to right while positions run
     right to left, so depth and pos usually differ. --*/
static void make_singleton( SEG *__segments__, SYMB sym , int pos, int depth, double score )
{
   __segments__[depth].Start = pos ;
   __segments__[depth].End = pos ;
   __segments__[depth].Value = score ;
   __segments__[depth].Output = NULL ;
   __segments__[depth].sub_sym = sym ;
}
1004
1005 /* ====================================================================
1006 analyze.c (deposit_stz)
1007 calls analyze.c (copy_stz, save_current_composition)
1008 called by analyze.c (_force_deposit_, shallow_clause_scan,scan_clause_tree)
1009 =======================================================================*/
/*-- Score the current set of segments (sum averaged over depth + 1
     segments) and, unless the score is below the list cutoff, insert it
     into the standardization list and store the aligned output symbols
     and definitions in the new entry. --*/
static void deposit_stz( STAND_PARAM *__stand_param__ , double sum , int depth )
{
   STZ_PARAM *info = __stand_param__->stz_info ;
   double score = sum / (double) (depth + 1) ;
   STZ *entry ;

   /*-- apply the cutoff before doing any of the list work --*/
   if ( score < info->stz_list_cutoff )
   {
      return ;
   }

   /*-- the score determines the slot; the slot is needed to copy content --*/
   entry = copy_stz( __stand_param__ , score ) ;

   /*-- when collecting statistics, remember the rule behind a
        single-segment standardization --*/
   if (( __stand_param__->rules->collect_statistics ) && ( depth == START_DEPTH ) && ( info->segs[START_DEPTH].Key != NULL ))
   {
      entry->build_key = info->segs[START_DEPTH].Key ;
   }

   save_current_composition( __stand_param__ , info->segs , depth , entry->output , entry->definitions ) ;
}
1037
/*-- amount by which copy_stz lowers the score of a standardization whose
     raw score ties with an entry already on the list --*/
#define DUP_DECREMENT 0.0025
1039
1040 /* ====================================================================
1041 analyze.c (copy_stz)
1042 called by analyze.c (deposit_stz)
1043 =======================================================================*/
copy_stz(STAND_PARAM * __stand_param__,double current_score)1044 static STZ * copy_stz(STAND_PARAM *__stand_param__ ,double current_score)
1045 {
1046 /* -- sort it into the list and knock the last one off the list
1047 if it is MAX_STZ -- */
1048 /* -- Take the Score of the last remaining item as the new cutoff,
1049 if it is greater than the current cutoff -- */
1050 int i ;
1051 int last_on_list ;
1052 STZ *__cur_stz__ ;
1053
1054
1055 STZ_PARAM *__stz_info__ = __stand_param__->stz_info ;
1056 STZ **__stz_list__ = __stz_info__->stz_array ;
1057
1058 /* -- Increase the list size only if it isn't full. If it is full, take
1059 the score of the last on the list (which we're going to knock off the
1060 list) as the new cutoff -- */
1061
1062 if (__stz_info__->stz_list_size != MAX_STZ)
1063 {
1064 __stz_info__->stz_list_size++ ;
1065 }
1066
1067 /* -- Get the pointer of the last on the list if the list is full (to be
1068 knocked off, or one beyond the previous last item (with undefined
1069 content) if the list isn't full. -- */
1070 last_on_list = __stz_info__->stz_list_size - 1 ;
1071 __cur_stz__ = __stz_list__[last_on_list] ; /* -- implicitly discard contents -- */
1072 __cur_stz__->score = current_score ;
1073 __cur_stz__->raw_score = current_score ;
1074
1075 /*-- Initialize the output vector - but is this necessary ? --*/
1076 for (i = FIRST_LEX_POS;i <= __stand_param__->LexNum;i++)
1077 {
1078 __cur_stz__->output[i] = FAIL ;
1079 }
1080 /* -- boundary condition : last-1 last
1081 [ ] [ ]
1082 suppose the last - 1 has a score less than the current score - then
1083 it isn't copied into last, so __cur_stz__ goes back into the slot
1084 from which it was just removed - nothing moves -- */
1085 for (i = last_on_list;i > FIRST_STZ;i --)
1086 {
1087 /* -- Get the next pointer on the list and move it back if it has a
1088 lesser score. Otherwise we put the pointer to the new stz in the
1089 present position -- */
1090 STZ *__next_stz__ = __stz_list__[i-1] ;
1091 if (current_score > __next_stz__->raw_score)
1092 {
1093 __stz_list__[i] = __next_stz__ ;
1094 }
1095 else
1096 {
1097 if (current_score == __next_stz__->raw_score)
1098 {
1099 /* -- 2008-03-14: first come, first served -- */
1100 __cur_stz__->score = __next_stz__->score - DUP_DECREMENT ;
1101 }
1102 break ;
1103 }
1104 }
1105 __stz_list__[i] = __cur_stz__ ;
1106 if (__stz_info__->stz_list_size == MAX_STZ)
1107 {
1108 __stz_info__->stz_list_cutoff = __stz_list__[last_on_list]->score ;
1109 }
1110 return __cur_stz__ ; /* -- tell the caller where we put it -- */
1111 }
1112
1113 /* ====================================================================
1114 analyze.c (save_current_composition)
1115 called by analyze.c (deposit_stz)
1116 calls analyze.c (copy_best)
1117 <remarks>called by deposit_stz to align the current standardization output
1118 symbols to the LEXEME input symbols - it depends on the correct
1119 LEXEMES being present and the __sym_sel__ reflecting the last composition.
1120 Consequently it must be done at the time of deposit </remarks>
1121 =======================================================================*/
/*-- Align the output symbols of the segments 0..depth with the input
     lexemes.  __best_defs__ receives the currently selected definition of
     every lexeme followed by a NULL terminator; __best_output__ is filled
     through copy_best.  Must be called at deposit time because it relies
     on cur_sym_sel still describing the composition being deposited. --*/
static void save_current_composition(STAND_PARAM *__stand_param__,SEG *__segments__, int depth, SYMB *__best_output__ , DEF **__best_defs__)
{
   int lex_pos ;
   int seg_idx ;
   int *__sym_sel__ = __stand_param__->cur_sym_sel ;

   /*-- Keep the definitions that were selected: they are needed for
        outputting the lexemes, since different definitions may give a
        different standardization for the same input (the letter W may be
        standardized as W if a SINGLE or WEST if a DIRECT). --*/
   for ( lex_pos = FIRST_LEX_POS ; lex_pos < __stand_param__->LexNum ; lex_pos++ )
   {
      __best_defs__[lex_pos] = __stand_param__->def_array[lex_pos][__sym_sel__[lex_pos]] ;
   }
   __best_defs__[lex_pos] = NULL ;

   /*-- Segments are visited from index depth down to 0 (right to left),
        while the content of each segment is emitted left to right.  An
        index is used rather than a SEG pointer so that the loop never
        forms a pointer in front of __segments__. --*/
   lex_pos = FIRST_LEX_POS ;
   for ( seg_idx = depth ; seg_idx >= 0 ; seg_idx-- )
   {
      SEG *__seg__ = __segments__ + seg_idx ;
      SYMB *__sym_ptr__ = __seg__->Output ;

      if ( __sym_ptr__ == NULL )
      {
         /*-- singleton segment: its symbol is carried in sub_sym --*/
         lex_pos = copy_best( __stand_param__ , __sym_sel__ , __seg__->sub_sym , lex_pos , __best_output__ ) ;
         continue ;
      }
      for ( ; *__sym_ptr__ != FAIL ; __sym_ptr__++ )
      {
         lex_pos = copy_best( __stand_param__ , __sym_sel__ , *__sym_ptr__ , lex_pos , __best_output__ ) ;
      }
   }
}
1158
1159 /* ====================================================================
1160 analyze.c (copy_best)
1161 called by analyze.c (save_current_composition)
1162 <remarks> Called by save_current_composition to decompress stopword and word
1163 sequences </remarks>
1164 =======================================================================*/
/*-- Decompress one output symbol: write it to __best_output__ for every
     lexeme position, starting at beg, whose orig_str_pos entry does not
     exceed that of beg (several lexemes may share one target position).
     A lexeme selected as RIGHT_COMPRESS that directly follows a STREET
     output is put back into STREET instead.  Returns the first lexeme
     position that was not written, never going past LexNum. --*/
static int copy_best( STAND_PARAM *__stand_param__ , int *__sym_sel__ , SYMB output_symb , int beg , SYMB *__best_output__ )
{
   int lex_pos ;
   int next_target_pos ;
   int *__orig_pos__ = __stand_param__->orig_str_pos ;

   /*-- nothing left to fill: do not consult orig_str_pos at LexNum --*/
   if ( beg == __stand_param__->LexNum )
   {
      return beg ;
   }

   next_target_pos = __orig_pos__[beg] + 1 ;

   /*-- the end-of-input test comes first so that orig_str_pos is only read
        for positions below LexNum --*/
   for ( lex_pos = beg ; ( lex_pos != __stand_param__->LexNum ) && ( __orig_pos__[lex_pos] < next_target_pos ) ; lex_pos++ )
   {
      /*-- check for an errant RIGHT_COMPRESS and put it back into STREET
           if possible --*/
      if (( lex_pos > FIRST_LEX_POS ) && ( output_symb != STREET ) && ( __stand_param__->comp_lex_sym[lex_pos][__sym_sel__[lex_pos]] == RIGHT_COMPRESS ) && ( __best_output__[lex_pos - 1] == STREET ))
      {
         __best_output__[lex_pos] = STREET ;
      }
      else
      {
         __best_output__[lex_pos] = output_symb ;
      }
   }
   return lex_pos ;
}
1193
1194 /* ====================================================================
1195 analyze.c (lex_has_def)
1196 called by analyze.c (non_geocode_address, _modify_position_)
1197 scan the ith row of comp_lex_sym for the symbol sym
1198 returns the matching cell j
1199 =======================================================================*/
/*-- Look for sym among the def_cnt[i] entries in row i of comp_lex_sym.
     Returns the index of the first matching cell, or FAIL when the row
     does not contain sym. --*/
static int lex_has_def(STAND_PARAM *__stand_param__, int i, SYMB sym)
{
   int col = 0 ;

   while ( col < __stand_param__->def_cnt[i] )
   {
      if ( __stand_param__->comp_lex_sym[i][col] == sym )
      {
         return col ;
      }
      col++ ;
   }
   return FAIL ;
}
1213
1214 /* ====================================================================
1215 analyze.c (have_schema_symbol)
1216 called by analyze.c (schema_modify_position)
1217 =======================================================================*/
/*-- Returns TRUE when a table is supplied and its entry for sym is
     non-zero; FALSE when the table is NULL or the entry is zero. --*/
static int have_schema_symbol(int *__check_dir__,SYMB sym)
{
   if ( __check_dir__ == NULL )
   {
      return FALSE ;
   }
   return ( __check_dir__[sym] ) ? TRUE : FALSE ;
}
1229
1230 /* ====================================================================
1231 <summary>
1232 <function name='analyze.c (default_seg_val)'/>
1233 <calls> <functionref='analyze.c (make_singleton)'/> </calls>
1234 <called-by> <functionref='analyze.c (force_arc_clause,
1235 _force_macro_clause_)'/> </called-by>
1236 <revision date='2009-08-09'> Fourth arg now used to determine if
1237 the __sym_sel__ should be initialized to the first definition :
1238 save_composition uses the value. We will do that when we
1239 have no idea at all which the right one is -- and there is
1240 always at least one. </revision>
1241 </summary>
1242 =======================================================================*/
1243 #define DEPTH_POS ( num_lexes - 1 ) - depth
1244
default_seg_val(int * __sym_sel__,int num_lexes,SEG * __segments__,int use_default_sym,SYMB sym,double score)1245 static void default_seg_val( int *__sym_sel__, int num_lexes, SEG *__segments__, int use_default_sym, SYMB sym, double score )
1246 {
1247 int depth ;
1248 for (depth = FIRST_LEX_POS ;depth < num_lexes;depth ++)
1249 {
1250 if (use_default_sym)
1251 {
1252 /*-- <revision date='2009-08-09'> Set default only if told to do so </revision> --*/
1253 __sym_sel__[DEPTH_POS] = 0 ; /* -- default value -- */
1254 }
1255 /*-- <revision date='2009-08-09'> Eliminate cl arg to make_singleton. </revision> --*/
1256 make_singleton(__segments__,sym,DEPTH_POS,depth,score) ;
1257 }
1258 }
1259
1260 /* ====================================================================
1261 analyze.c (_modify_position_)
1262 called by analyze.c (schema_modify_position,_force_macro_clause_)
1263 calls analyze.c (lex_has_def)
1264 <remarks>If the input symbol is found at pos, then we put the out_sym as the sub_sym
1265 at depth in __seg__ and record the matching definition in cur_sym_sel</remarks>
1266 =======================================================================*/
/*-- If the lexeme at pos has a definition with input symbol in_sym, make
     out_sym the sub_sym of the segment at index depth, select that
     definition in cur_sym_sel and return TRUE.  Otherwise change nothing
     and return FALSE. --*/
static int _modify_position_(STAND_PARAM *__stand_param__, SEG *__seg__, int depth, int pos, SYMB in_sym, SYMB out_sym)
{
   int which = lex_has_def( __stand_param__ , pos , in_sym ) ;

   if ( which == FAIL )
   {
      return FALSE ;
   }
   __seg__[depth].sub_sym = out_sym ;
   __stand_param__->cur_sym_sel[pos] = which ;
   return TRUE ;
}
1278
1279 /* ====================================================================
1280 analyze.c (schema_modify_position)
1281 - called by analyze.c (force_arc_clause)
1282 calls analyze.c (have_schema_symbol, _modify_position_)
1283 =======================================================================*/
/*-- Same as _modify_position_, but only attempted when have_ref_att marks
     out_sym as present.  This requires that attributes are available, so
     it only works within a particular reference dataset.  Returns FALSE
     when out_sym is not marked or the position was not modified. --*/
static int schema_modify_position( STAND_PARAM *__stand_param__ , SEG *__segments__ , int depth , int lex_pos , SYMB in_sym , SYMB out_sym )
{
   if ( !have_schema_symbol( __stand_param__->have_ref_att , out_sym ))
   {
      return FALSE ;
   }
   return _modify_position_( __stand_param__ , __segments__ , depth , lex_pos , in_sym , out_sym ) ;
}
1295
1296
1297
1298 /* ====================================================================
1299 analyze.c (force_arc_clause)
1300 called by analyze.c (evaluator)
1301 calls analyze.c (default_seg_val, schema_modify_position and _force_deposit_)
1302 <remarks>We're going to force standardization on an Arc clause without
1303 much computation. first_composition has already done its work,
1304 so we go through the lex_sym looking for likely constructions , using
1305 the schema read as a guide </remarks>
1306 =======================================================================*/
/*-- Force a standardization without running the rules.  Every lexeme
     starts out as a STREET singleton of VERY_LOW_WEIGHT; then, guided by
     the schema, the right end is probed for a SUFDIR followed by a SUFTYP
     and the left end for a PREDIR followed by a PRETYP.  Each probe is
     only made while more than two lexemes remain between the two ends.
     The result is deposited with one segment per lexeme. --*/
static void force_arc_clause( STAND_PARAM *__stand_param__ )
{
   STZ_PARAM *info = __stand_param__->stz_info ;
   int count = __stand_param__->LexNum ;
   int head ;    /*-- leftmost lexeme not yet claimed --*/
   int tail ;    /*-- rightmost lexeme not yet claimed --*/
   int level ;   /*-- segment index belonging to the lexeme being probed --*/

   default_seg_val( __stand_param__->cur_sym_sel , count , info->segs , ARC_C , STREET , VERY_LOW_WEIGHT ) ;

   /*-- right end: segment 0 belongs to the last lexeme --*/
   head = 0 ;
   level = 0 ;
   tail = count - 1 ;
   if (( head < tail - 1 ) && schema_modify_position( __stand_param__ , info->segs , level , tail , DIRECT , SUFDIR ))
   {
      tail-- ;
      level++ ;
   }
   if (( head < tail - 1 ) && schema_modify_position( __stand_param__ , info->segs , level , tail , TYPE , SUFTYP ))
   {
      tail-- ;
   }

   /*-- left end: the last segment belongs to the first lexeme --*/
   level = count - 1 ;
   if (( head < tail - 1 ) && schema_modify_position( __stand_param__ , info->segs , level , head , DIRECT , PREDIR ))
   {
      head++ ;
      level-- ;
   }
   if (( head < tail - 1 ) && schema_modify_position( __stand_param__ , info->segs , level , head , TYPE , PRETYP ))
   {
      head++ ;
   }

   _force_deposit_( __stand_param__ , ( __stand_param__->LexNum - 1 )) ;
}
1350
/*-- For use inside a loop that has __stand_param__, __segments__, depth
     and lex_sym_pos in scope: try to map IN_SYM to OUT_SYM at the current
     position and, on success, go straight to the next loop iteration.
     The expansion contains a bare continue, so it must not be wrapped in
     a loop of its own. --*/
#define MODIFY_SEG_POS(IN_SYM,OUT_SYM) \
   if ( _modify_position_( __stand_param__ , __segments__ , depth , lex_sym_pos , IN_SYM , OUT_SYM ) ) \
   { \
      continue ; \
   }
1353
1354
1355 /* ====================================================================
1356 <summary>
1357 <function name='analyze.c (_force_macro_clause_)'/>
1358 <called-by> <functionref='analyze.c (evaluator)'/> </called-by>
1359 <calls> <functionref='analyze.c (default_seg_val,_modify_position_,_force_deposit_)'/> </calls>
1360 </summary>
1361 =======================================================================*/
1362 #ifdef USE_FORCE_MACRO
/*-- Force a standardization of a macro clause without running the rules.
     Every lexeme starts out as a POSTAL singleton of VERY_LOW_WEIGHT; then
     each position takes the output symbol of the first input symbol in
     the table below for which the lexeme has a definition.  The result is
     deposited with one segment per lexeme. --*/
static void _force_macro_clause_( STAND_PARAM *__stand_param__ )
{
   /*-- { input symbol , output symbol } in order of preference --*/
   const SYMB remap[][2] = {
      { PCH , POSTAL } ,
      { PCT , POSTAL } ,
      { QUINT , POSTAL } ,
      { QUAD , POSTAL } ,
      { NUMBER , POSTAL } ,
      { MIXED , POSTAL } ,
      { NATION , NATION } ,
      { PROV , PROV } ,
      { CITY , CITY } ,
      { WORD , CITY }
   } ;
   const int num_remaps = (int) ( sizeof remap / sizeof remap[0] ) ;
   SEG *segs = __stand_param__->stz_info->segs ;
   int count = __stand_param__->LexNum ;
   int last = count - 1 ;
   int lex ;
   int level ;

   default_seg_val( __stand_param__->cur_sym_sel , count , segs , MACRO_C , POSTAL , VERY_LOW_WEIGHT ) ;

   /*-- lexemes run left to right while their segments run right to left --*/
   for ( lex = 0 , level = last ; lex <= last ; lex++ , level-- )
   {
      int r ;
      for ( r = 0 ; r < num_remaps ; r++ )
      {
         if ( _modify_position_( __stand_param__ , segs , level , lex , remap[r][0] , remap[r][1] ))
         {
            break ;   /*-- first match wins --*/
         }
      }
   }
   _force_deposit_( __stand_param__ , count - 1 ) ;
}
1386 #endif
1387 /* ====================================================================
1388 <summary>
1389 <function name='analyze.c (_force_deposit_)'/>
1390 <called-by> <function ref='analyze.c (force_arc_clause,_force_macro_clause_)'/> </called-by>
1391 <calls> <function ref='analyze.c (deposit_stz)'/> </calls>
1392 </summary>
1393 =======================================================================*/
/*-- Add up the Value of segments 0..depth and deposit the result as a
     standardization.  Worst case is a string of unknowns: it will score
     really low, but not zero. --*/
static void _force_deposit_( STAND_PARAM *__stand_param__ , int depth )
{
   double sum = 0.00 ;
   int seg_idx ;
   SEG *__segments__ = __stand_param__->stz_info->segs ;

   /*-- Summed from index depth down to 0, as before.  An index is used
        rather than a SEG pointer so that the loop never forms a pointer
        in front of __segments__. --*/
   for ( seg_idx = depth ; seg_idx >= 0 ; seg_idx-- )
   {
      sum += __segments__[seg_idx].Value ;
   }
   deposit_stz( __stand_param__ , sum , depth ) ;
}
1407
1408 /* ====================================================================
1409 analyze.c (non_geocode_address)
1410 called by analyze.c (evaluator)
1411 calls analyze.c (lex_has_def)
1412 =======================================================================*/
/*-- Returns TRUE when any lexeme has an RR or a BOXH definition, FALSE
     otherwise.  At each position RR is tested before BOXH. --*/
static int non_geocode_address( STAND_PARAM *__stand_param__ )
{
   int count = __stand_param__->LexNum ;
   int pos ;

   for ( pos = FIRST_LEX_POS ; pos < count ; pos++ )
   {
      if (( lex_has_def( __stand_param__ , pos , RR ) != FAIL ) || ( lex_has_def( __stand_param__ , pos , BOXH ) != FAIL ))
      {
         return TRUE ;
      }
   }
   return FALSE ;
}
1432
1433 /* ====================================================================
1434 analyze.c (output_raw_elements)
1435 print out the raw elements of the tokens
1436 =======================================================================*/
/*-- Report the raw elements of the tokens: first every candidate
     definition of every lexeme, then every standardization on the list
     with the output symbol assigned to each lexeme.  With a NULL
     __err_param__ the report goes to stdout; otherwise each line is
     formatted into __err_param__->error_buf and handed to register_error.
     NOTE(review): the sprintf calls apply no length bound and the capacity
     of error_buf is not visible here - confirm the lines always fit. --*/
void output_raw_elements( STAND_PARAM * __stand_param__ , ERR_PARAM *__err_param__ )
{
   STZ_PARAM *__stz_info__ = __stand_param__->stz_info ;
   STZ **ranked ;
   DEF *cand ;
   int idx ;
   int rank ;
   int total ;

   if ( __err_param__ != NULL )
   {
      LOG_MESS("Input tokenization candidates:",__err_param__) ;
   }
   else
   {
      printf("Input tokenization candidates:\n") ;
   }

   /*-- every definition of every lexeme; protected definitions show the
        lexeme text in place of the standard form --*/
   idx = FIRST_LEX_POS ;
   while ( idx < __stand_param__->LexNum )
   {
      cand = __stand_param__->lex_vector[idx].DefList ;
      while ( cand != NULL )
      {
         if ( __err_param__ != NULL )
         {
            sprintf( __err_param__->error_buf , "\t(%d) std: %s, tok: %d (%s)\n" , idx , (( cand->Protect )? __stand_param__->lex_vector[idx].Text : cand->Standard) , cand->Type , in_symb_name( cand->Type ));
            register_error( __err_param__ ) ;
         }
         else
         {
            printf("\t(%d) std: %s, tok: %d (%s)\n",idx,((cand->Protect )? __stand_param__->lex_vector[idx].Text : cand->Standard),cand->Type,in_symb_name(cand->Type));
         }
         cand = cand->Next ;
      }
      idx++ ;
   }

   /*-- every standardization currently on the list, in list order --*/
   total = __stz_info__->stz_list_size ;
   ranked = __stz_info__->stz_array ;
   for ( rank = FIRST_STZ ; rank < total ; rank++ )
   {
      STZ *entry = ranked[rank] ;

      if ( __err_param__ != NULL )
      {
         LOG_MESS2( "Raw standardization %d with score %f:\n" , ( rank ) , entry->score , __err_param__ ) ;
      }
      else
      {
         printf( "Raw standardization %d with score %f:\n" , ( rank ) , entry->score ) ;
      }

      for ( idx = FIRST_LEX_POS ; idx < __stand_param__->LexNum ; idx++ )
      {
         SYMB out ;

         cand = entry->definitions[idx] ;
         /*-- 2010-11-18 : handle end STOPWORD --*/
         out = entry->output[idx] ;
         if ( __err_param__ != NULL )
         {
            sprintf( __err_param__->error_buf , "\t(%d) Input %d (%s) text %s mapped to output %d (%s)\n" , idx , cand->Type , in_symb_name( cand->Type ) , (( cand->Protect )? __stand_param__->lex_vector[idx].Text : cand->Standard ) , out , (( out == FAIL )? "NONE" : out_symb_name( out ))) ;
            register_error( __err_param__ ) ;
         }
         else
         {
            printf( "\t(%d) Input %d (%s) text %s mapped to output %d (%s)\n" , idx , cand->Type , in_symb_name( cand->Type ) , (( cand->Protect )? __stand_param__->lex_vector[idx].Text : cand->Standard ) , out , (( out == FAIL )? "NONE" : out_symb_name( out ))) ;
         }
         /*-- an unassigned output ends this standardization's listing,
              after the line for it has been reported --*/
         if ( out == FAIL )
         {
            break ;
         }
      }
   }
   fflush( stdout ) ;
}
1501
1502