1 /* This file is part of the Zebra server.
2 Copyright (C) 2004-2013 Index Data
3
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17
18 */
19
20 /** \file
21 \brief indexes records and extract tokens for indexing and sorting
22 */
23
24 #if HAVE_CONFIG_H
25 #include <config.h>
26 #endif
27 #include <stdio.h>
28 #include <assert.h>
29 #include <ctype.h>
30 #ifdef WIN32
31 #include <io.h>
32 #endif
33 #if HAVE_UNISTD_H
34 #include <unistd.h>
35 #endif
36 #include <fcntl.h>
37
38
39 #include "index.h"
40 #include "orddict.h"
41 #include <direntz.h>
42 #include <charmap.h>
43 #include <yaz/snprintf.h>
44
/* Log levels for the "extract" and "indexdetails" log modules. They are
   resolved once by zebra_init_log_level() and stay 0 until then. */
static int log_level_extract = 0;
static int log_level_details = 0;
/* set to 1 as soon as the two levels above have been resolved */
static int log_level_initialized = 0;

/* 1 if we eliminate identical delete/insert keys */
/* eventually the 0-case code will be removed */
#define FLUSH2 1

#if FLUSH2
/* FLUSH2 variant: takes the keys to insert and the keys to delete
   (each with its static rank) in a single call */
static void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
                                       zebra_rec_keys_t ins_keys,
                                       zint ins_rank,
                                       zebra_rec_keys_t del_keys,
                                       zint del_rank);
#else
/* non-FLUSH2 variant: one call per key set; callers in this file pass
   cmd 1 for the new keys and cmd 0 for the previously stored keys */
static void extract_flush_record_keys(ZebraHandle zh, zint sysno,
                                      int cmd,
                                      zebra_rec_keys_t reckeys,
                                      zint staticrank);
#endif
65
zebra_init_log_level(void)66 static void zebra_init_log_level(void)
67 {
68 if (!log_level_initialized)
69 {
70 log_level_initialized = 1;
71
72 log_level_extract = yaz_log_module_level("extract");
73 log_level_details = yaz_log_module_level("indexdetails");
74 }
75 }
76
wrbuf_hex_str(const char * cstr)77 static WRBUF wrbuf_hex_str(const char *cstr)
78 {
79 size_t i;
80 WRBUF w = wrbuf_alloc();
81 for (i = 0; cstr[i]; i++)
82 {
83 if (cstr[i] < ' ' || cstr[i] > 126)
84 wrbuf_printf(w, "\\%02X", cstr[i] & 0xff);
85 else
86 wrbuf_putc(w, cstr[i]);
87 }
88 return w;
89 }
90
91
/* Forward declarations of file-local helpers that are used before
   their definitions. extract_schema_add and extract_token_add are
   installed as the schemaAdd/tokenAdd callbacks of the extract
   control in zebra_extract_record_stream. */
static void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
                                    int cmd, zebra_rec_keys_t skp);
static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid);
static void extract_token_add(RecWord *p);
96
/** \brief logs a notice when the per-file log limit is reached
    \param zh Zebra handle

    The notice is written exactly when the number of processed plus
    skipped records equals zh->m_file_verbose_limit.
*/
static void check_log_limit(ZebraHandle zh)
{
    if (zh->records_processed + zh->records_skipped
        != zh->m_file_verbose_limit)
        return;

    yaz_log(YLOG_LOG, "More than %d file log entries. Omitting rest",
            zh->m_file_verbose_limit);
}
105
/** \brief counts one processed record and logs progress
    \param zh Zebra handle

    Checks the log limit first, then increments the processed counter.
    A summary line is logged for every 1000th processed record.
*/
static void logRecord(ZebraHandle zh)
{
    check_log_limit(zh);
    zh->records_processed++;

    if (zh->records_processed % 1000)
        return;

    yaz_log(YLOG_LOG, "Records: "ZINT_FORMAT" i/u/d "
            ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT,
            zh->records_processed, zh->records_inserted,
            zh->records_updated, zh->records_deleted);
}
118
/** \brief sets the handle-dependent part of an extract control
    \param zh Zebra handle
    \param ctrl extract control to update

    flagShowRecords becomes 1 when zh->m_flag_rw is 0, and 0 otherwise.
*/
static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl)
{
    ctrl->flagShowRecords = (zh->m_flag_rw == 0);
}
123
124
/* Forward declarations of file-local helpers: all_matches_add calls
   extract_add_index_string, and zebra_extract_record_stream calls
   extract_set_store_data_prepare before running the record filter. */
static void extract_add_index_string(RecWord *p,
                                     zinfo_index_category_t cat,
                                     const char *str, int length);

static void extract_set_store_data_prepare(struct recExtractCtrl *p);
130
/** \brief resets a RecWord to its defaults
    \param p extract control that the word belongs to
    \param w word to initialize

    Defaults: index "any" of type "w", sequence number 1 and
    record/section/segment all 0.
*/
static void extract_init(struct recExtractCtrl *p, RecWord *w)
{
    w->extractCtrl = p;

    w->index_name = "any";
    w->index_type = "w";

    w->seqno = 1;
    w->record_id = 0;
    w->section_id = 0;
    w->segment = 0;
}
141
/* Context for the snippet callbacks. extract_snippet stores a pointer
   to one of these in recExtractCtrl.handle and the snippet_* functions
   read it back from p->extractCtrl->handle. */
struct snip_rec_info {
    ZebraHandle zh;            /* handle used to reach zh->reg (maps, explain info) */
    zebra_snippets *snippets;  /* list that the snippet_* functions append to */
};
146
/* Recomputes how much of the term is left after *pos and, if anything
   remains, fetches the next mapping. Returns 0 when the term is used up. */
static const char **complete_field_next(RecWord *p, zebra_map_t zm,
                                        const char **pos, int *left,
                                        int first)
{
    *left = p->term_len - (*pos - p->term_buf);
    if (*left <= 0)
        return 0;
    return zebra_maps_input(zm, pos, *left, first);
}

/** \brief normalizes a whole field into a single key
    \param p word holding the term (term_buf, term_len)
    \param zm character map used for normalization
    \param buf result buffer; at most IT_MAX_WORD bytes are written
    \returns number of bytes stored in buf (buf is not NUL-terminated)

    Leading and trailing blanks are dropped and each run of blanks
    between words becomes one *CHR_SPACE. A *CHR_CUT mapping discards
    everything collected so far.
*/
static int parse_complete_field(RecWord *p, zebra_map_t zm,
                                char *buf)
{
    const char *pos = p->term_buf;
    const char **map = 0;
    int left = p->term_len;
    int len = 0;

    if (left > 0)
        map = zebra_maps_input(zm, &pos, left, 1);

    while (left > 0 && len < IT_MAX_WORD)
    {
        /* skip blanks; still "first position" while nothing is stored */
        while (map && *map && **map == *CHR_SPACE)
            map = complete_field_next(p, zm, &pos, &left, len == 0);
        if (!map)
            break;

        /* separate this word from the previous one */
        if (len && len < IT_MAX_WORD)
            buf[len++] = *CHR_SPACE;

        while (map && *map && **map != *CHR_SPACE)
        {
            if (**map == *CHR_CUT)
                len = 0;
            else
            {
                const char *cp;

                if (len >= IT_MAX_WORD)
                    break;
                for (cp = *map; len < IT_MAX_WORD && *cp; cp++)
                    buf[len++] = *cp;
            }
            map = complete_field_next(p, zm, &pos, &left, 0);
        }
    }
    return len;
}
201
/** \brief adds a complete field as one snippet entry
    \param p word holding the term
    \param ord index ordinal passed on to the snippet list
    \param zm character map for the index type

    Nothing happens if the normalized field is empty. Otherwise the
    original (not normalized) term is appended when the map is an
    indexing map, and the sequence number advances by one.
*/
static void snippet_add_complete_field(RecWord *p, int ord,
                                       zebra_map_t zm)
{
    struct snip_rec_info *h = p->extractCtrl->handle;
    char buf[IT_MAX_WORD+1];

    /* buf is only needed to find out whether the field is empty */
    if (parse_complete_field(p, zm, buf) == 0)
        return;

    if (p->term_len && p->term_buf && zebra_maps_is_index(zm))
    {
        zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
                               p->term_buf, p->term_len);
    }
    p->seqno++;
}
217
/* Remembers the current position in *last, then fetches the mapping
   for the next character. Returns 0 when the term is used up. */
static const char **snippet_next_map(RecWord *p, zebra_map_t zm,
                                     const char **pos, const char **last)
{
    int left = p->term_len - (*pos - p->term_buf);

    *last = *pos;
    if (left > 0)
        return zebra_maps_input(zm, pos, left, 0);
    return 0;
}

/** \brief splits a field into words and adds them as snippet entries
    \param p word holding the term
    \param ord index ordinal passed on to the snippet list
    \param zm character map for the index type

    Blank runs between words are appended with flag 1, words with
    flag 0. The sequence number advances once per word, plus once
    before the first word if the map uses first-in-field markers.
*/
static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm)
{
    struct snip_rec_info *h = p->extractCtrl->handle;
    const char *pos = p->term_buf;
    const char *start = pos;
    const char *last = pos;
    const char **map = 0;
    int term_len = p->term_len;
    int first = 1;

    if (term_len > 0)
        map = zebra_maps_input(zm, &pos, term_len, 0);

    while (map)
    {
        /* Skip spaces */
        while (map && *map && **map == *CHR_SPACE)
            map = snippet_next_map(p, zm, &pos, &last);
        if (!map)
            break;

        /* the blanks just skipped, if any */
        if (start != last && zebra_maps_is_index(zm))
        {
            zebra_snippets_appendn(h->snippets, p->seqno, 1, ord,
                                   start, last - start);
        }
        start = last;

        /* collect one word */
        while (map && *map && **map != *CHR_SPACE)
            map = snippet_next_map(p, zm, &pos, &last);
        if (start == last)
            return ;

        if (first)
        {
            first = 0;
            if (zebra_maps_is_first_in_field(zm))
            {
                /* first in field marker */
                p->seqno++;
            }
        }
        /* start != last is guaranteed here */
        if (zebra_maps_is_index(zm))
        {
            zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
                                   start, last - start);
        }
        start = last;
        p->seqno++;
    }
}
282
/** \brief tokenizes a term with the map's tokenizer and adds snippets
    \param p word holding the term
    \param ord index ordinal passed on to the snippet list
    \param zm character map for the index type

    The display form of every token is appended (indexing maps only)
    and the sequence number advances once per token.
*/
static void snippet_add_icu(RecWord *p, int ord, zebra_map_t zm)
{
    struct snip_rec_info *h = p->extractCtrl->handle;
    const char *res_buf = 0, *display_buf = 0;
    size_t res_len = 0, display_len = 0;

    zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
    for (;;)
    {
        if (!zebra_map_tokenize_next(zm, &res_buf, &res_len,
                                     &display_buf, &display_len))
            break;

        if (zebra_maps_is_index(zm))
        {
            zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
                                   display_buf, display_len);
        }
        p->seqno++;
    }
}
303
/** \brief tokenAdd callback used while extracting snippets
    \param p word delivered by the record filter

    Looks up the map for the word's index type and hands the word to
    the ICU, complete-field or incomplete-field handler. Words whose
    index type yields no map are ignored.
*/
static void snippet_token_add(RecWord *p)
{
    struct snip_rec_info *h = p->extractCtrl->handle;
    ZebraHandle zh = h->zh;
    zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
    int ord;

    if (!zm)
        return;

    ord = zebraExplain_lookup_attr_str(
        zh->reg->zei, zinfo_index_category_index,
        p->index_type, p->index_name);

    if (zebra_maps_is_icu(zm))
        snippet_add_icu(p, ord, zm);
    else if (zebra_maps_is_complete(zm))
        snippet_add_complete_field(p, ord, zm);
    else
        snippet_add_incomplete_field(p, ord, zm);
}
327
/** \brief schemaAdd callback used while extracting snippets

    Schema information is not needed for snippets; the call is ignored.
*/
static void snippet_schema_add(struct recExtractCtrl *p, Odr_oid *oid)
{
    (void) p;
    (void) oid;
}
333
/** \brief runs a record filter and collects its tokens as snippets
    \param zh Zebra handle (zh->reg and zh->reg->dh must be set)
    \param sn snippet list that receives the tokens
    \param stream stream to read the record from
    \param rt record type (filter) to run
    \param recTypeClientData client data for the filter
*/
void extract_snippet(ZebraHandle zh, zebra_snippets *sn,
                     struct ZebraRecStream *stream,
                     RecType rt, void *recTypeClientData)
{
    struct snip_rec_info info;
    struct recExtractCtrl ctrl;

    assert(zh->reg);
    assert(zh->reg->dh);

    /* context for the snippet_* callbacks */
    info.zh = zh;
    info.snippets = sn;

    ctrl.stream = stream;
    ctrl.first_record = 1;
    ctrl.dh = zh->reg->dh;
    ctrl.handle = &info;

    ctrl.init = extract_init;
    ctrl.tokenAdd = snippet_token_add;
    ctrl.schemaAdd = snippet_schema_add;

    ctrl.match_criteria[0] = '\0';
    ctrl.staticrank = 0;
    ctrl.action = action_insert;

    init_extractCtrl(zh, &ctrl);

    /* no store-data callback when only snippets are wanted */
    ctrl.setStoreData = 0;

    rt->extract(recTypeClientData, &ctrl);
}
364
/** \brief collects the terms of one index from a record's keys
    \param zh Zebra handle
    \param reckeys keys of the record
    \param index_name name of the index to look for
    \param ws result array; all entries are set to NULL first
    \param ws_length number of entries in ws

    The index is looked up with type "0", then "p", then "w"; the
    first hit is used. ws[n] is set to the term whose sequence number
    is n above that of the first matching key. The stored pointers are
    the ones delivered by zebra_rec_keys_read.
*/
static void searchRecordKey(ZebraHandle zh,
                            zebra_rec_keys_t reckeys,
                            const char *index_name,
                            const char **ws, int ws_length)
{
    static const char *const index_types[] = { "0", "p", "w" };
    const zinfo_index_category_t cat = zinfo_index_category_index;
    zint first_seq = -1;
    const char *str;
    size_t slen;
    struct it_key key;
    size_t t;
    int ord = -1;
    int i;

    for (i = 0; i < ws_length; i++)
        ws[i] = NULL;

    for (t = 0; ord < 0 && t < sizeof(index_types)/sizeof(*index_types); t++)
    {
        ord = zebraExplain_lookup_attr_str(zh->reg->zei, cat,
                                           index_types[t], index_name);
    }
    if (ord < 0)
        return ;

    if (!zebra_rec_keys_rewind(reckeys))
        return ;

    while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
    {
        zint seqno, woff;

        assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);

        if (key.mem[0] != ord)
            continue;

        /* the sequence number is the last component of the key */
        seqno = key.mem[key.len-1];
        if (first_seq == -1)
            first_seq = seqno;
        woff = seqno - first_seq;
        if (woff >= 0 && woff < ws_length)
            ws[woff] = str;
    }
}
413
414 #define FILE_MATCH_BLANK "\t "
415
get_match_from_spec(ZebraHandle zh,zebra_rec_keys_t reckeys,const char * fname,const char * spec)416 static char *get_match_from_spec(ZebraHandle zh,
417 zebra_rec_keys_t reckeys,
418 const char *fname, const char *spec)
419 {
420 static char dstBuf[2048]; /* static here ??? */
421 char *dst = dstBuf;
422 const char *s = spec;
423
424 while (1)
425 {
426 for (; *s && strchr(FILE_MATCH_BLANK, *s); s++)
427 ;
428 if (!*s)
429 break;
430 if (*s == '(')
431 {
432 const char *ws[32];
433 char attset_str[64], attname_str[64];
434 int i;
435 int first = 1;
436
437 for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
438 ;
439 for (i = 0; *s && *s != ',' && *s != ')' &&
440 !strchr(FILE_MATCH_BLANK, *s); s++)
441 if (i+1 < sizeof(attset_str))
442 attset_str[i++] = *s;
443 attset_str[i] = '\0';
444
445 for (; strchr(FILE_MATCH_BLANK, *s); s++)
446 ;
447 if (*s != ',')
448 strcpy(attname_str, attset_str);
449 else
450 {
451 for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
452 ;
453 for (i = 0; *s && *s != ')' &&
454 !strchr(FILE_MATCH_BLANK, *s); s++)
455 if (i+1 < sizeof(attname_str))
456 attname_str[i++] = *s;
457 attname_str[i] = '\0';
458 }
459 if (*s != ')')
460 {
461 yaz_log(YLOG_WARN, "Missing ) in match criteria %s in group %s",
462 spec, zh->m_group ? zh->m_group : "none");
463 return NULL;
464 }
465 s++;
466
467 searchRecordKey(zh, reckeys, attname_str, ws, 32);
468 if (0) /* for debugging */
469 {
470 for (i = 0; i<32; i++)
471 {
472 if (ws[i])
473 {
474 WRBUF w = wrbuf_hex_str(ws[i]);
475 yaz_log(YLOG_LOG, "ws[%d] = %s", i, wrbuf_cstr(w));
476 wrbuf_destroy(w);
477 }
478 }
479 }
480
481 for (i = 0; i<32; i++)
482 if (ws[i])
483 {
484 if (first)
485 {
486 *dst++ = ' ';
487 first = 0;
488 }
489 strcpy(dst, ws[i]);
490 dst += strlen(ws[i]);
491 }
492 if (first)
493 {
494 yaz_log(YLOG_WARN, "Record didn't contain match"
495 " fields in (%s,%s)", attset_str, attname_str);
496 return NULL;
497 }
498 }
499 else if (*s == '$')
500 {
501 int spec_len;
502 char special[64];
503 const char *spec_src = NULL;
504 const char *s1 = ++s;
505 while (*s1 && !strchr(FILE_MATCH_BLANK, *s1))
506 s1++;
507
508 spec_len = s1 - s;
509 if (spec_len > sizeof(special)-1)
510 spec_len = sizeof(special)-1;
511 memcpy(special, s, spec_len);
512 special[spec_len] = '\0';
513 s = s1;
514
515 if (!strcmp(special, "group"))
516 spec_src = zh->m_group;
517 else if (!strcmp(special, "database"))
518 spec_src = zh->basenames[0];
519 else if (!strcmp(special, "filename")) {
520 spec_src = fname;
521 }
522 else if (!strcmp(special, "type"))
523 spec_src = zh->m_record_type;
524 else
525 spec_src = NULL;
526 if (spec_src)
527 {
528 strcpy(dst, spec_src);
529 dst += strlen(spec_src);
530 }
531 }
532 else if (*s == '\"' || *s == '\'')
533 {
534 int stopMarker = *s++;
535 char tmpString[64];
536 int i = 0;
537
538 while (*s && *s != stopMarker)
539 {
540 if (i+1 < sizeof(tmpString))
541 tmpString[i++] = *s++;
542 }
543 if (*s)
544 s++;
545 tmpString[i] = '\0';
546 strcpy(dst, tmpString);
547 dst += strlen(tmpString);
548 }
549 else
550 {
551 yaz_log(YLOG_WARN, "Syntax error in match criteria %s in group %s",
552 spec, zh->m_group ? zh->m_group : "none");
553 return NULL;
554 }
555 *dst++ = 1;
556 }
557 if (dst == dstBuf)
558 {
559 yaz_log(YLOG_WARN, "No match criteria for record %s in group %s",
560 fname, zh->m_group ? zh->m_group : "none");
561 return NULL;
562 }
563 *dst = '\0';
564
565 if (0) /* for debugging */
566 {
567 WRBUF w = wrbuf_hex_str(dstBuf);
568 yaz_log(YLOG_LOG, "get_match_from_spec %s", wrbuf_cstr(w));
569 wrbuf_destroy(w);
570 }
571
572 return dstBuf;
573 }
574
/* NOTE(review): this struct is not referenced in the visible part of
   the file, so the meaning of its fields cannot be confirmed here. */
struct recordLogInfo {
    const char *fname;
    int recordOffset;
    struct recordGroup *rGroup;
};
580
581 /** \brief add the always-matches index entry and map to real record ID
582 \param ctrl record control
583 \param record_id custom record ID
584 \param sysno system record ID
585
586 This function serves two purposes.. It adds the always matches
587 entry and makes a pointer from the custom record ID (if defined)
588 back to the system record ID (sysno)
589 See zebra_recid_to_sysno .
590 */
all_matches_add(struct recExtractCtrl * ctrl,zint record_id,zint sysno)591 static void all_matches_add(struct recExtractCtrl *ctrl, zint record_id,
592 zint sysno)
593 {
594 RecWord word;
595 extract_init(ctrl, &word);
596 word.record_id = record_id;
597 /* we use the seqno as placeholder for a way to get back to
598 record database from _ALLRECORDS.. This is used if a custom
599 RECORD was defined */
600 word.seqno = sysno;
601 word.index_name = "_ALLRECORDS";
602 word.index_type = "w";
603
604 extract_add_index_string(&word, zinfo_index_category_alwaysmatches,
605 "", 0);
606 }
607
/* forward declaration: zebra_extract_file and
   zebra_buffer_extract_record call this function before it is
   defined further down in this file */
ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh,
                                       struct ZebraRecStream *stream,
                                       enum zebra_recctrl_action_t action,
                                       const char *recordType,
                                       zint *sysno,
                                       const char *match_criteria,
                                       const char *fname,
                                       RecType recType,
                                       void *recTypeClientData);
618
619
zebra_extract_file(ZebraHandle zh,zint * sysno,const char * fname,enum zebra_recctrl_action_t action)620 ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname,
621 enum zebra_recctrl_action_t action)
622 {
623 ZEBRA_RES r = ZEBRA_OK;
624 int i, fd;
625 char gprefix[128];
626 char ext[128];
627 char ext_res[128];
628 const char *original_record_type = 0;
629 RecType recType;
630 void *recTypeClientData;
631 struct ZebraRecStream stream, *streamp;
632
633 zebra_init_log_level();
634
635 if (!zh->m_group || !*zh->m_group)
636 *gprefix = '\0';
637 else
638 sprintf(gprefix, "%s.", zh->m_group);
639
640 yaz_log(log_level_extract, "zebra_extract_file %s", fname);
641
642 /* determine file extension */
643 *ext = '\0';
644 for (i = strlen(fname); --i >= 0; )
645 if (fname[i] == '/')
646 break;
647 else if (fname[i] == '.')
648 {
649 strcpy(ext, fname+i+1);
650 break;
651 }
652 /* determine file type - depending on extension */
653 original_record_type = zh->m_record_type;
654 if (!zh->m_record_type)
655 {
656 sprintf(ext_res, "%srecordType.%s", gprefix, ext);
657 zh->m_record_type = res_get(zh->res, ext_res);
658 }
659 if (!zh->m_record_type)
660 {
661 check_log_limit(zh);
662 if (zh->records_processed + zh->records_skipped
663 < zh->m_file_verbose_limit)
664 yaz_log(YLOG_LOG, "? %s", fname);
665 zh->records_skipped++;
666 return 0;
667 }
668 /* determine match criteria */
669 if (!zh->m_record_id)
670 {
671 sprintf(ext_res, "%srecordId.%s", gprefix, ext);
672 zh->m_record_id = res_get(zh->res, ext_res);
673 }
674
675 if (!(recType =
676 recType_byName(zh->reg->recTypes, zh->res, zh->m_record_type,
677 &recTypeClientData)))
678 {
679 yaz_log(YLOG_WARN, "No such record type: %s", zh->m_record_type);
680 return ZEBRA_FAIL;
681 }
682
683 switch(recType->version)
684 {
685 case 0:
686 break;
687 default:
688 yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type);
689 }
690 if (sysno && (action == action_delete || action == action_a_delete))
691 {
692 streamp = 0;
693 }
694 else
695 {
696 char full_rep[1024];
697
698 if (zh->path_reg && !yaz_is_abspath(fname))
699 {
700 strcpy(full_rep, zh->path_reg);
701 strcat(full_rep, "/");
702 strcat(full_rep, fname);
703 }
704 else
705 strcpy(full_rep, fname);
706
707 if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1)
708 {
709 yaz_log(YLOG_WARN|YLOG_ERRNO, "open %s", full_rep);
710 zh->m_record_type = original_record_type;
711 return ZEBRA_FAIL;
712 }
713 streamp = &stream;
714 zebra_create_stream_fd(streamp, fd, 0);
715 }
716 r = zebra_extract_records_stream(zh, streamp,
717 action,
718 zh->m_record_type,
719 sysno,
720 0, /*match_criteria */
721 fname,
722 recType, recTypeClientData);
723 if (streamp)
724 stream.destroy(streamp);
725 zh->m_record_type = original_record_type;
726 return r;
727 }
728
/*
  If sysno is provided, then it's used to identify the record.
  If not, and match_criteria is provided, then sysno is guessed.
  If not, and a record is provided, then sysno is got from there.

*/
735
zebra_buffer_extract_record(ZebraHandle zh,const char * buf,size_t buf_size,enum zebra_recctrl_action_t action,const char * recordType,zint * sysno,const char * match_criteria,const char * fname)736 ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh,
737 const char *buf, size_t buf_size,
738 enum zebra_recctrl_action_t action,
739 const char *recordType,
740 zint *sysno,
741 const char *match_criteria,
742 const char *fname)
743 {
744 struct ZebraRecStream stream;
745 ZEBRA_RES res;
746 void *clientData;
747 RecType recType = 0;
748
749 if (recordType && *recordType)
750 {
751 yaz_log(log_level_extract,
752 "Record type explicitly specified: %s", recordType);
753 recType = recType_byName(zh->reg->recTypes, zh->res, recordType,
754 &clientData);
755 }
756 else
757 {
758 if (!(zh->m_record_type))
759 {
760 yaz_log(YLOG_WARN, "No such record type defined");
761 return ZEBRA_FAIL;
762 }
763 yaz_log(log_level_extract, "Get record type from rgroup: %s",
764 zh->m_record_type);
765 recType = recType_byName(zh->reg->recTypes, zh->res,
766 zh->m_record_type, &clientData);
767 recordType = zh->m_record_type;
768 }
769
770 if (!recType)
771 {
772 yaz_log(YLOG_WARN, "No such record type: %s", recordType);
773 return ZEBRA_FAIL;
774 }
775
776 zebra_create_stream_mem(&stream, buf, buf_size);
777
778 res = zebra_extract_records_stream(zh, &stream,
779 action,
780 recordType,
781 sysno,
782 match_criteria,
783 fname,
784 recType, clientData);
785 stream.destroy(&stream);
786 return res;
787 }
788
zebra_extract_record_stream(ZebraHandle zh,struct ZebraRecStream * stream,enum zebra_recctrl_action_t action,const char * recordType,zint * sysno,const char * match_criteria,const char * fname,RecType recType,void * recTypeClientData,int * more)789 static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
790 struct ZebraRecStream *stream,
791 enum zebra_recctrl_action_t action,
792 const char *recordType,
793 zint *sysno,
794 const char *match_criteria,
795 const char *fname,
796 RecType recType,
797 void *recTypeClientData,
798 int *more)
799
800 {
801 zint sysno0 = 0;
802 RecordAttr *recordAttr;
803 struct recExtractCtrl extractCtrl;
804 int r;
805 const char *matchStr = 0;
806 Record rec;
807 off_t start_offset = 0, end_offset = 0;
808 const char *pr_fname = fname; /* filename to print .. */
809 int show_progress = zh->records_processed + zh->records_skipped
810 < zh->m_file_verbose_limit ? 1:0;
811
812 zebra_init_log_level();
813
814 if (!pr_fname)
815 pr_fname = "<no file>"; /* make it printable if file is omitted */
816
817 zebra_rec_keys_reset(zh->reg->keys);
818 zebra_rec_keys_reset(zh->reg->sortKeys);
819
820 if (zebraExplain_curDatabase(zh->reg->zei, zh->basenames[0]))
821 {
822 if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0],
823 zh->m_explain_database))
824 return ZEBRA_FAIL;
825 }
826
827 if (stream)
828 {
829 off_t null_offset = 0;
830 extractCtrl.stream = stream;
831
832 start_offset = stream->tellf(stream);
833
834 extractCtrl.first_record = start_offset ? 0 : 1;
835
836 stream->endf(stream, &null_offset);;
837
838 extractCtrl.init = extract_init;
839 extractCtrl.tokenAdd = extract_token_add;
840 extractCtrl.schemaAdd = extract_schema_add;
841 extractCtrl.dh = zh->reg->dh;
842 extractCtrl.handle = zh;
843 extractCtrl.match_criteria[0] = '\0';
844 extractCtrl.staticrank = 0;
845 extractCtrl.action = action;
846
847 init_extractCtrl(zh, &extractCtrl);
848
849 extract_set_store_data_prepare(&extractCtrl);
850
851 r = (*recType->extract)(recTypeClientData, &extractCtrl);
852
853 if (action == action_update)
854 {
855 action = extractCtrl.action;
856 }
857
858 switch (r)
859 {
860 case RECCTRL_EXTRACT_EOF:
861 return ZEBRA_FAIL;
862 case RECCTRL_EXTRACT_ERROR_GENERIC:
863 /* error occured during extraction ... */
864 yaz_log(YLOG_WARN, "extract error: generic");
865 return ZEBRA_FAIL;
866 case RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER:
867 /* error occured during extraction ... */
868 yaz_log(YLOG_WARN, "extract error: no such filter");
869 return ZEBRA_FAIL;
870 case RECCTRL_EXTRACT_SKIP:
871 if (show_progress)
872 yaz_log(YLOG_LOG, "skip %s %s " ZINT_FORMAT,
873 recordType, pr_fname, (zint) start_offset);
874 *more = 1;
875
876 end_offset = stream->endf(stream, 0);
877 if (end_offset)
878 stream->seekf(stream, end_offset);
879
880 return ZEBRA_OK;
881 case RECCTRL_EXTRACT_OK:
882 break;
883 default:
884 yaz_log(YLOG_WARN, "extract error: unknown error: %d", r);
885 return ZEBRA_FAIL;
886 }
887 end_offset = stream->endf(stream, 0);
888 if (end_offset)
889 stream->seekf(stream, end_offset);
890 else
891 end_offset = stream->tellf(stream);
892
893 if (extractCtrl.match_criteria[0])
894 match_criteria = extractCtrl.match_criteria;
895 }
896
897 *more = 1;
898
899 if (zh->m_flag_rw == 0)
900 {
901 yaz_log(YLOG_LOG, "test %s %s " ZINT_FORMAT, recordType,
902 pr_fname, (zint) start_offset);
903 /* test mode .. Do not perform match */
904 return ZEBRA_OK;
905 }
906
907 if (!sysno)
908 {
909 sysno = &sysno0;
910
911 if (match_criteria && *match_criteria)
912 matchStr = match_criteria;
913 else
914 {
915 if (zh->m_record_id && *zh->m_record_id)
916 {
917 matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname,
918 zh->m_record_id);
919 if (!matchStr)
920 {
921 yaz_log(YLOG_LOG, "error %s %s " ZINT_FORMAT, recordType,
922 pr_fname, (zint) start_offset);
923 return ZEBRA_FAIL;
924 }
925 if (0 && matchStr)
926 {
927 WRBUF w = wrbuf_alloc();
928 size_t i;
929 for (i = 0; i < strlen(matchStr); i++)
930 {
931 wrbuf_printf(w, "%02X", matchStr[i] & 0xff);
932 }
933 yaz_log(YLOG_LOG, "Got match %s", wrbuf_cstr(w));
934 wrbuf_destroy(w);
935 }
936 }
937 }
938 if (matchStr)
939 {
940 int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
941 char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord,
942 matchStr);
943
944
945 if (log_level_extract)
946 {
947 WRBUF w = wrbuf_hex_str(matchStr);
948 yaz_log(log_level_extract, "matchStr: %s", wrbuf_cstr(w));
949 wrbuf_destroy(w);
950 }
951 if (rinfo)
952 {
953 assert(*rinfo == sizeof(*sysno));
954 memcpy(sysno, rinfo+1, sizeof(*sysno));
955 }
956 }
957 }
958
959 if (! *sysno)
960 {
961 /* new record AKA does not exist already */
962 if (action == action_delete)
963 {
964 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
965 pr_fname, (zint) start_offset);
966 yaz_log(YLOG_WARN, "cannot delete record above (seems new)");
967 return ZEBRA_FAIL;
968 }
969 else if (action == action_a_delete)
970 {
971 if (show_progress)
972 yaz_log(YLOG_LOG, "adelete %s %s " ZINT_FORMAT, recordType,
973 pr_fname, (zint) start_offset);
974 return ZEBRA_OK;
975 }
976 else if (action == action_replace)
977 {
978 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
979 pr_fname, (zint) start_offset);
980 yaz_log(YLOG_WARN, "cannot update record above (seems new)");
981 return ZEBRA_FAIL;
982 }
983 if (show_progress)
984 yaz_log(YLOG_LOG, "add %s %s " ZINT_FORMAT, recordType, pr_fname,
985 (zint) start_offset);
986 rec = rec_new(zh->reg->records);
987
988 *sysno = rec->sysno;
989
990
991 if (stream)
992 {
993 all_matches_add(&extractCtrl,
994 zebra_rec_keys_get_custom_record_id(zh->reg->keys),
995 *sysno);
996 }
997
998
999 recordAttr = rec_init_attr(zh->reg->zei, rec);
1000 if (extractCtrl.staticrank < 0)
1001 {
1002 yaz_log(YLOG_WARN, "Negative staticrank for record. Set to 0");
1003 extractCtrl.staticrank = 0;
1004 }
1005
1006 if (matchStr)
1007 {
1008 int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1009 dict_insert_ord(zh->reg->matchDict, db_ord, matchStr,
1010 sizeof(*sysno), sysno);
1011 }
1012
1013 extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1014 #if FLUSH2
1015 extract_flush_record_keys2(zh, *sysno,
1016 zh->reg->keys, extractCtrl.staticrank,
1017 0, recordAttr->staticrank);
1018 #else
1019 extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys,
1020 extractCtrl.staticrank);
1021 #endif
1022 recordAttr->staticrank = extractCtrl.staticrank;
1023 zh->records_inserted++;
1024 }
1025 else
1026 {
1027 /* record already exists */
1028 zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1029 zebra_rec_keys_t sortKeys = zebra_rec_keys_open();
1030 if (action == action_insert)
1031 {
1032 yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT,
1033 recordType, pr_fname, (zint) start_offset);
1034 logRecord(zh);
1035 return ZEBRA_FAIL;
1036 }
1037
1038 rec = rec_get(zh->reg->records, *sysno);
1039 assert(rec);
1040
1041 if (stream)
1042 {
1043 all_matches_add(&extractCtrl,
1044 zebra_rec_keys_get_custom_record_id(zh->reg->keys),
1045 *sysno);
1046 }
1047
1048 recordAttr = rec_init_attr(zh->reg->zei, rec);
1049
1050 /* decrease total size */
1051 zebraExplain_recordBytesIncrement(zh->reg->zei,
1052 - recordAttr->recordSize);
1053
1054 zebra_rec_keys_set_buf(delkeys,
1055 rec->info[recInfo_delKeys],
1056 rec->size[recInfo_delKeys],
1057 0);
1058 zebra_rec_keys_set_buf(sortKeys,
1059 rec->info[recInfo_sortKeys],
1060 rec->size[recInfo_sortKeys],
1061 0);
1062
1063 extract_flush_sort_keys(zh, *sysno, 0, sortKeys);
1064 #if !FLUSH2
1065 extract_flush_record_keys(zh, *sysno, 0, delkeys,
1066 recordAttr->staticrank);
1067 #endif
1068 if (action == action_delete || action == action_a_delete)
1069 {
1070 /* record going to be deleted */
1071 #if FLUSH2
1072 extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank,
1073 delkeys, recordAttr->staticrank);
1074 #endif
1075 if (zebra_rec_keys_empty(delkeys))
1076 {
1077 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1078 pr_fname, (zint) start_offset);
1079 yaz_log(YLOG_WARN, "cannot delete file above, "
1080 "storeKeys false (3)");
1081 }
1082 else
1083 {
1084 if (show_progress)
1085 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1086 pr_fname, (zint) start_offset);
1087 zh->records_deleted++;
1088 if (matchStr)
1089 {
1090 int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1091 dict_delete_ord(zh->reg->matchDict, db_ord, matchStr);
1092 }
1093 rec_del(zh->reg->records, &rec);
1094 }
1095 zebra_rec_keys_close(delkeys);
1096 zebra_rec_keys_close(sortKeys);
1097 rec_free(&rec);
1098 logRecord(zh);
1099 return ZEBRA_OK;
1100 }
1101 else
1102 { /* update or special_update */
1103 if (show_progress)
1104 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
1105 pr_fname, (zint) start_offset);
1106 extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1107
1108 #if FLUSH2
1109 extract_flush_record_keys2(zh, *sysno,
1110 zh->reg->keys, extractCtrl.staticrank,
1111 delkeys, recordAttr->staticrank);
1112 #else
1113 extract_flush_record_keys(zh, *sysno, 1,
1114 zh->reg->keys, extractCtrl.staticrank);
1115 #endif
1116 recordAttr->staticrank = extractCtrl.staticrank;
1117 zh->records_updated++;
1118 }
1119 zebra_rec_keys_close(delkeys);
1120 zebra_rec_keys_close(sortKeys);
1121 }
1122 /* update file type */
1123 xfree(rec->info[recInfo_fileType]);
1124 rec->info[recInfo_fileType] =
1125 rec_strdup(recordType, &rec->size[recInfo_fileType]);
1126
1127 /* update filename */
1128 xfree(rec->info[recInfo_filename]);
1129 rec->info[recInfo_filename] =
1130 rec_strdup(fname, &rec->size[recInfo_filename]);
1131
1132 /* update delete keys */
1133 xfree(rec->info[recInfo_delKeys]);
1134 if (!zebra_rec_keys_empty(zh->reg->keys) && zh->m_store_keys == 1)
1135 {
1136 zebra_rec_keys_get_buf(zh->reg->keys,
1137 &rec->info[recInfo_delKeys],
1138 &rec->size[recInfo_delKeys]);
1139 }
1140 else
1141 {
1142 rec->info[recInfo_delKeys] = NULL;
1143 rec->size[recInfo_delKeys] = 0;
1144 }
1145 /* update sort keys */
1146 xfree(rec->info[recInfo_sortKeys]);
1147
1148 zebra_rec_keys_get_buf(zh->reg->sortKeys,
1149 &rec->info[recInfo_sortKeys],
1150 &rec->size[recInfo_sortKeys]);
1151
1152 if (stream)
1153 {
1154 recordAttr->recordSize = end_offset - start_offset;
1155 zebraExplain_recordBytesIncrement(zh->reg->zei,
1156 recordAttr->recordSize);
1157 }
1158
1159 /* set run-number for this record */
1160 recordAttr->runNumber =
1161 zebraExplain_runNumberIncrement(zh->reg->zei, 0);
1162
1163 /* update store data */
1164 xfree(rec->info[recInfo_storeData]);
1165
1166 /* update store data */
1167 if (zh->store_data_buf)
1168 {
1169 rec->size[recInfo_storeData] = zh->store_data_size;
1170 rec->info[recInfo_storeData] = zh->store_data_buf;
1171 zh->store_data_buf = 0;
1172 recordAttr->recordSize = zh->store_data_size;
1173 }
1174 else if (zh->m_store_data)
1175 {
1176 off_t cur_offset = stream->tellf(stream);
1177
1178 rec->size[recInfo_storeData] = recordAttr->recordSize;
1179 rec->info[recInfo_storeData] = (char *)
1180 xmalloc(recordAttr->recordSize);
1181 stream->seekf(stream, start_offset);
1182 stream->readf(stream, rec->info[recInfo_storeData],
1183 recordAttr->recordSize);
1184 stream->seekf(stream, cur_offset);
1185 }
1186 else
1187 {
1188 rec->info[recInfo_storeData] = NULL;
1189 rec->size[recInfo_storeData] = 0;
1190 }
1191 /* update database name */
1192 xfree(rec->info[recInfo_databaseName]);
1193 rec->info[recInfo_databaseName] =
1194 rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]);
1195
1196 /* update offset */
1197 recordAttr->recordOffset = start_offset;
1198
1199 /* commit this record */
1200 rec_put(zh->reg->records, &rec);
1201 logRecord(zh);
1202 return ZEBRA_OK;
1203 }
1204
1205 /** \brief extracts records from stream
1206 \param zh Zebra Handle
1207 \param stream stream that we read from
1208 \param action (action_insert, action_replace, action_delete, ..)
1209 \param recordType Record filter type "grs.xml", etc.
1210 \param sysno pointer to sysno if already known; NULL otherwise
1211 \param match_criteria (NULL if not already given)
1212 \param fname filename that we read from (for logging purposes only)
1213 \param recType record type
1214 \param recTypeClientData client data for record type
1215 \returns ZEBRA_OK for success; ZEBRA_FAIL for failure
1216 */
zebra_extract_records_stream(ZebraHandle zh,struct ZebraRecStream * stream,enum zebra_recctrl_action_t action,const char * recordType,zint * sysno,const char * match_criteria,const char * fname,RecType recType,void * recTypeClientData)1217 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh,
1218 struct ZebraRecStream *stream,
1219 enum zebra_recctrl_action_t action,
1220 const char *recordType,
1221 zint *sysno,
1222 const char *match_criteria,
1223 const char *fname,
1224 RecType recType,
1225 void *recTypeClientData)
1226 {
1227 ZEBRA_RES res = ZEBRA_OK;
1228 while (1)
1229 {
1230 int more = 0;
1231 res = zebra_extract_record_stream(zh, stream,
1232 action,
1233 recordType,
1234 sysno,
1235 match_criteria,
1236 fname,
1237 recType, recTypeClientData, &more);
1238 if (!more)
1239 {
1240 res = ZEBRA_OK;
1241 break;
1242 }
1243 if (res != ZEBRA_OK)
1244 break;
1245 if (sysno)
1246 break;
1247 }
1248 return res;
1249 }
1250
zebra_extract_explain(void * handle,Record rec,data1_node * n)1251 ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
1252 {
1253 ZebraHandle zh = (ZebraHandle) handle;
1254 struct recExtractCtrl extractCtrl;
1255
1256 if (zebraExplain_curDatabase(zh->reg->zei,
1257 rec->info[recInfo_databaseName]))
1258 {
1259 abort();
1260 if (zebraExplain_newDatabase(zh->reg->zei,
1261 rec->info[recInfo_databaseName], 0))
1262 abort();
1263 }
1264
1265 zebra_rec_keys_reset(zh->reg->keys);
1266 zebra_rec_keys_reset(zh->reg->sortKeys);
1267
1268 extractCtrl.init = extract_init;
1269 extractCtrl.tokenAdd = extract_token_add;
1270 extractCtrl.schemaAdd = extract_schema_add;
1271 extractCtrl.dh = zh->reg->dh;
1272
1273 init_extractCtrl(zh, &extractCtrl);
1274
1275 extractCtrl.flagShowRecords = 0;
1276 extractCtrl.match_criteria[0] = '\0';
1277 extractCtrl.staticrank = 0;
1278 extractCtrl.action = action_update;
1279
1280 extractCtrl.handle = handle;
1281 extractCtrl.first_record = 1;
1282
1283 extract_set_store_data_prepare(&extractCtrl);
1284
1285 if (n)
1286 grs_extract_tree(&extractCtrl, n);
1287
1288 if (rec->size[recInfo_delKeys])
1289 {
1290 zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1291
1292 zebra_rec_keys_t sortkeys = zebra_rec_keys_open();
1293
1294 zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys],
1295 rec->size[recInfo_delKeys],
1296 0);
1297 #if FLUSH2
1298 extract_flush_record_keys2(zh, rec->sysno,
1299 zh->reg->keys, 0, delkeys, 0);
1300 #else
1301 extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0);
1302 extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1303 #endif
1304 zebra_rec_keys_close(delkeys);
1305
1306 zebra_rec_keys_set_buf(sortkeys, rec->info[recInfo_sortKeys],
1307 rec->size[recInfo_sortKeys],
1308 0);
1309
1310 extract_flush_sort_keys(zh, rec->sysno, 0, sortkeys);
1311 zebra_rec_keys_close(sortkeys);
1312 }
1313 else
1314 {
1315 #if FLUSH2
1316 extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0);
1317 #else
1318 extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1319 #endif
1320 }
1321 extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys);
1322
1323 xfree(rec->info[recInfo_delKeys]);
1324 zebra_rec_keys_get_buf(zh->reg->keys,
1325 &rec->info[recInfo_delKeys],
1326 &rec->size[recInfo_delKeys]);
1327
1328 xfree(rec->info[recInfo_sortKeys]);
1329 zebra_rec_keys_get_buf(zh->reg->sortKeys,
1330 &rec->info[recInfo_sortKeys],
1331 &rec->size[recInfo_sortKeys]);
1332 return ZEBRA_OK;
1333 }
1334
/** \brief logs one index key together with its term
    \param zh Zebra Handle
    \param key key to be logged (key->mem[0] is the index ordinal)
    \param str term belonging to the key (0-terminated)
    \param slen length of term in bytes
    \param nmem memory handle passed to zebra_term_untrans_iconv
    \param level YAZ log level used for the output

    Terms whose first char is below CHR_BASE_CHAR are logged as a symbolic
    name (or "?") followed by the decimal value of each of the slen bytes.
    Other terms are logged in untranslated form, or escaped if the
    untranslation yields no result.

    All text is assembled in WRBUFs, so neither the key list nor the byte
    dump of a long term can overrun a fixed-size stack buffer.
*/
void zebra_it_key_str_dump(ZebraHandle zh, struct it_key *key,
                           const char *str, size_t slen, NMEM nmem, int level)
{
    int ord = CAST_ZINT_TO_INT(key->mem[0]);
    const char *index_type = 0;
    const char *string_index = 0;
    WRBUF keys;
    int i;

    zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
                            0/* db */, &string_index);
    assert(index_type);

    /* all key components, each followed by a blank */
    keys = wrbuf_alloc();
    for (i = 0; i < key->len; i++)
        wrbuf_printf(keys, ZINT_FORMAT " ", key->mem[i]);

    if (*str < CHR_BASE_CHAR)
    {
        const char *label = "?";
        WRBUF special = wrbuf_alloc();
        size_t j;

        if (!strcmp(str, ""))
            label = "alwaysmatches";
        if (!strcmp(str, FIRST_IN_FIELD_STR))
            label = "firstinfield";
        else if (!strcmp(str, CHR_UNKNOWN))
            label = "unknown";
        else if (!strcmp(str, CHR_SPACE))
            label = "space";

        wrbuf_puts(special, label);
        /* raw byte values; WRBUF grows as needed for any slen */
        for (j = 0; j < slen; j++)
            wrbuf_printf(special, " %d", str[j] & 0xff);

        yaz_log(level, "%s%s %s %s", wrbuf_cstr(keys), index_type,
                string_index, wrbuf_cstr(special));
        wrbuf_destroy(special);
    }
    else
    {
        char *dst_term = 0;
        zebra_term_untrans_iconv(zh, nmem, index_type, &dst_term, str);
        if (dst_term)
            yaz_log(level, "%s%s %s \"%s\"", wrbuf_cstr(keys), index_type,
                    string_index, dst_term);
        else
        {
            /* no untranslated form available: log the escaped raw term */
            WRBUF w = wrbuf_alloc();
            wrbuf_write_escaped(w, str, strlen(str));
            yaz_log(level, "%s%s %s %s", wrbuf_cstr(keys), index_type,
                    string_index, wrbuf_cstr(w));
            wrbuf_destroy(w);
        }
    }
    wrbuf_destroy(keys);
}
1393
/** \brief logs all keys of a key set
    \param zh Zebra Handle
    \param is_insert not used by this function
    \param reckeys key set to be logged
    \param level YAZ log level passed on to zebra_it_key_str_dump

    Nothing is logged if the key set cannot be rewound.
*/
void extract_rec_keys_log(ZebraHandle zh, int is_insert,
                          zebra_rec_keys_t reckeys,
                          int level)
{
    const char *term;
    size_t term_len;
    struct it_key key;
    NMEM scratch;

    if (!zebra_rec_keys_rewind(reckeys))
        return;

    scratch = nmem_create();
    while (zebra_rec_keys_read(reckeys, &term, &term_len, &key))
    {
        zebra_it_key_str_dump(zh, &key, term, term_len, scratch, level);
        /* release per-key allocations before the next key */
        nmem_reset(scratch);
    }
    nmem_destroy(scratch);
}
1413
/** \brief adjusts per-index occurrence counts for a key set
    \param zh Zebra Handle
    \param is_insert non-zero: counts are added; zero: counts are subtracted
    \param reckeys key set to be counted

    Keys are counted per index ordinal (key.mem[0]). For each distinct
    ordinal zebraExplain_ord_adjust_occurrences is called once with the
    (signed) number of keys and +1 / -1 as last argument.
*/
void extract_rec_keys_adjust(ZebraHandle zh, int is_insert,
                             zebra_rec_keys_t reckeys)
{
    struct ord_count {
        int ord;
        int hits;
        struct ord_count *next;
    };
    ZebraExplainInfo zei = zh->reg->zei;
    struct ord_count *head = 0;
    const int sign = is_insert ? 1 : -1;
    const char *term;
    size_t term_len;
    struct it_key key;

    if (!zebra_rec_keys_rewind(reckeys))
        return;

    /* pass 1: count keys per ordinal */
    while (zebra_rec_keys_read(reckeys, &term, &term_len, &key))
    {
        const int ord = CAST_ZINT_TO_INT(key.mem[0]);
        struct ord_count *hit = head;

        while (hit && hit->ord != ord)
            hit = hit->next;
        if (hit)
            hit->hits++;
        else
        {
            /* first key for this ordinal: new entry at front of list */
            hit = xmalloc(sizeof(*hit));
            hit->hits = 1;
            hit->ord = ord;
            hit->next = head;
            head = hit;
        }
    }

    /* pass 2: report the counts and release the list */
    while (head)
    {
        struct ord_count *done = head;

        zebraExplain_ord_adjust_occurrences(zei, done->ord,
                                            sign * done->hits, sign);
        head = done->next;
        xfree(done);
    }
}
1465
1466 #if FLUSH2
/** \brief writes insert and delete keys of a record to the key block
    \param zh Zebra Handle
    \param sysno record ID the keys belong to
    \param ins_keys keys to be inserted (may be NULL)
    \param ins_rank static rank written with inserted keys
    \param del_keys keys to be deleted (may be NULL)
    \param del_rank static rank written with deleted keys

    Both key sets are read in parallel, one key from each per iteration.
    When the two keys read in the same iteration are identical (same rank,
    same key, same term) neither is written. The key block is created on
    first use from the "memmax", "keyTmpDir" and "threads" resources.
*/
static void extract_flush_record_keys2(
    ZebraHandle zh, zint sysno,
    zebra_rec_keys_t ins_keys, zint ins_rank,
    zebra_rec_keys_t del_keys, zint del_rank)
{
    ZebraExplainInfo zei = zh->reg->zei;
    int written = 0;   /* iterations where at least one key was written */
    int skipped = 0;   /* iterations where an identical pair was dropped */

    if (!zh->reg->key_block)
    {
        int mem = 1024*1024 * atoi(res_get_def(zh->res, "memmax", "8"));
        const char *tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
        int threads = atoi(res_get_def(zh->res, "threads", "1"));

        zh->reg->key_block = key_block_create(mem, tmp_dir, threads);
    }

    if (ins_keys)
    {
        extract_rec_keys_adjust(zh, 1, ins_keys);
        /* record count only changes for a pure insert .. */
        if (!del_keys)
            zebraExplain_recordCountIncrement(zei, 1);
        zebra_rec_keys_rewind(ins_keys);
    }
    if (del_keys)
    {
        extract_rec_keys_adjust(zh, 0, del_keys);
        /* .. or a pure delete */
        if (!ins_keys)
            zebraExplain_recordCountIncrement(zei, -1);
        zebra_rec_keys_rewind(del_keys);
    }

    for (;;)
    {
        size_t del_slen;
        const char *del_str;
        struct it_key del_key;
        int have_del = 0;

        size_t ins_slen;
        const char *ins_str;
        struct it_key ins_key;
        int have_ins = 0;

        if (del_keys)
            have_del = zebra_rec_keys_read(del_keys, &del_str, &del_slen,
                                           &del_key);
        if (ins_keys)
            have_ins = zebra_rec_keys_read(ins_keys, &ins_str, &ins_slen,
                                           &ins_key);

        if (!have_del && !have_ins)
            break;

        if (have_del && have_ins && ins_rank == del_rank
            && key_compare(&del_key, &ins_key) == 0
            && ins_slen == del_slen
            && memcmp(del_str, ins_str, del_slen) == 0)
        {
            /* delete + insert of the same key cancel out */
            skipped++;
            continue;
        }

        written++;
        if (have_del)
            key_block_write(zh->reg->key_block, sysno,
                            &del_key, 0, del_str, del_slen,
                            del_rank, zh->m_staticrank);
        if (have_ins)
            key_block_write(zh->reg->key_block, sysno,
                            &ins_key, 1, ins_str, ins_slen,
                            ins_rank, zh->m_staticrank);
    }
    yaz_log(log_level_extract, "normal=%d optimized=%d", written, skipped);
}
1540 #else
/** \brief writes all keys of one key set to the key block
    \param zh Zebra Handle
    \param sysno record ID the keys belong to
    \param cmd 1 for insert; 0 for delete
    \param reckeys keys to be written
    \param staticrank static rank written with each key

    Only compiled when FLUSH2 is 0. The key block is created on first use
    from the "memmax", "keyTmpDir" and "threads" resources.
*/
static void extract_flush_record_keys(
    ZebraHandle zh, zint sysno, int cmd,
    zebra_rec_keys_t reckeys,
    zint staticrank)
{
    ZebraExplainInfo zei = zh->reg->zei;
    const char *term;
    size_t term_len;
    struct it_key key;

    extract_rec_keys_adjust(zh, cmd, reckeys);

    if (log_level_details)
    {
        yaz_log(log_level_details, "Keys for record " ZINT_FORMAT " %s",
                sysno, cmd ? "insert" : "delete");
        extract_rec_keys_log(zh, cmd, reckeys, log_level_details);
    }

    if (!zh->reg->key_block)
    {
        int mem = 1024*1024 * atoi(res_get_def(zh->res, "memmax", "8"));
        const char *tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
        int threads = atoi(res_get_def(zh->res, "threads", "1"));

        zh->reg->key_block = key_block_create(mem, tmp_dir, threads);
    }
    zebraExplain_recordCountIncrement(zei, cmd ? 1 : -1);

#if 0
    yaz_log(YLOG_LOG, "sysno=" ZINT_FORMAT " cmd=%d", sysno, cmd);
    print_rec_keys(zh, reckeys);
#endif
    if (!zebra_rec_keys_rewind(reckeys))
        return;

    while (zebra_rec_keys_read(reckeys, &term, &term_len, &key))
        key_block_write(zh->reg->key_block, sysno,
                        &key, cmd, term, term_len,
                        staticrank, zh->m_staticrank);
}
1583 #endif
1584
zebra_rec_keys_to_snippets1(ZebraHandle zh,zebra_rec_keys_t reckeys,zebra_snippets * snippets)1585 ZEBRA_RES zebra_rec_keys_to_snippets1(ZebraHandle zh,
1586 zebra_rec_keys_t reckeys,
1587 zebra_snippets *snippets)
1588 {
1589 NMEM nmem = nmem_create();
1590 if (zebra_rec_keys_rewind(reckeys))
1591 {
1592 const char *str;
1593 size_t slen;
1594 struct it_key key;
1595 while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1596 {
1597 char *dst_term = 0;
1598 int ord;
1599 zint seqno;
1600 const char *index_type;
1601
1602 assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1603 seqno = key.mem[key.len-1];
1604 ord = CAST_ZINT_TO_INT(key.mem[0]);
1605
1606 zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1607 0/* db */, 0 /* string_index */);
1608 assert(index_type);
1609 zebra_term_untrans_iconv(zh, nmem, index_type,
1610 &dst_term, str);
1611 zebra_snippets_append(snippets, seqno, 0, ord, dst_term);
1612 nmem_reset(nmem);
1613 }
1614 }
1615 nmem_destroy(nmem);
1616 return ZEBRA_OK;
1617 }
1618
/** \brief logs ordinal, sequence number and term of every key (debugging)
    \param zh Zebra Handle
    \param reckeys keys to be logged

    Output goes to YLOG_LOG; a "print_rec_keys" header line is always
    logged, also when the key set cannot be rewound.
*/
void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys)
{
    const char *term;
    size_t term_len;
    struct it_key key;

    yaz_log(YLOG_LOG, "print_rec_keys");
    if (!zebra_rec_keys_rewind(reckeys))
        return;

    while (zebra_rec_keys_read(reckeys, &term, &term_len, &key))
    {
        char display_term[IT_MAX_WORD];
        const char *index_type;
        const char *db = 0;
        int ord = CAST_ZINT_TO_INT(key.mem[0]);
        zint seqno;

        assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);

        zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0);

        /* last key component is the sequence number */
        seqno = key.mem[key.len-1];

        zebra_term_untrans(zh, index_type, display_term, term);

        yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT
                " term=%s", ord, seqno, display_term);
    }
}
1647
/** \brief adds one term to the index keys of the current record
    \param p token data (index type/name, record/section IDs, seqno ..)
    \param cat index category the term is filed under
    \param str term
    \param length length of term in bytes

    The index ordinal is looked up by category, type and name and created
    if the lookup gives a negative result. The key holds ordinal,
    record_id, section_id, optionally the segment (when segment indexing
    is enabled) and finally the sequence number.
*/
static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat,
                                     const char *str, int length)
{
    ZebraHandle zh = p->extractCtrl->handle;
    ZebraExplainInfo zei = zh->reg->zei;
    struct it_key key;
    int ord;

    ord = zebraExplain_lookup_attr_str(zei, cat, p->index_type,
                                       p->index_name);
    if (ord < 0)
        ord = zebraExplain_add_attr_str(zei, cat, p->index_type,
                                        p->index_name);

    /* key.len doubles as write position while the key is assembled */
    key.len = 0;
    key.mem[key.len++] = ord;
    key.mem[key.len++] = p->record_id;
    key.mem[key.len++] = p->section_id;
    if (zh->m_segment_indexing)
        key.mem[key.len++] = p->segment;
    key.mem[key.len++] = p->seqno;

    zebra_rec_keys_write(zh->reg->keys, str, length, &key);
}
1672
/** \brief adds one term to the sort keys of the current record
    \param p token data (index type/name, record and section IDs)
    \param str term
    \param length length of term in bytes

    The ordinal is looked up in the sort category and created if the
    lookup gives a negative result. The key holds ordinal, record_id and
    section_id.
*/
static void extract_add_sort_string(RecWord *p, const char *str, int length)
{
    const zinfo_index_category_t cat = zinfo_index_category_sort;
    ZebraHandle zh = p->extractCtrl->handle;
    ZebraExplainInfo zei = zh->reg->zei;
    struct it_key key;
    int ord = zebraExplain_lookup_attr_str(zei, cat, p->index_type,
                                           p->index_name);

    if (ord < 0)
        ord = zebraExplain_add_attr_str(zei, cat, p->index_type,
                                        p->index_name);

    key.mem[0] = ord;
    key.mem[1] = p->record_id;
    key.mem[2] = p->section_id;
    key.len = 3;

    zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key);
}
1691
/** \brief sets the static rank of the current record from a term
    \param p token data; the rank is stored in p->extractCtrl->staticrank
    \param str term holding the rank (need not be 0-terminated)
    \param length length of term in bytes

    At most sizeof(valz)-1 bytes of the term are converted with atozint.
*/
static void extract_add_staticrank_string(RecWord *p,
                                          const char *str, int length)
{
    char valz[40];
    const size_t limit = sizeof(valz) - 1;
    /* compared as unsigned, exactly like the int/size_t comparison */
    size_t n = (size_t) length;

    if (n > limit)
        n = limit;

    memcpy(valz, str, n);
    valz[n] = '\0';
    p->extractCtrl->staticrank = atozint(valz);
}
1705
/** \brief files one term as index key, sort key or static rank
    \param p token data; nothing is done when p->index_name is NULL
    \param zm zebra map deciding how the term is used
    \param string term
    \param length length of term in bytes; must be > 0

    Index maps: the term is added to the index, and if the map has
    "alwaysmatches" an additional empty term with seqno 1 is added in the
    alwaysmatches category. Sort maps: added as sort key. Staticrank maps:
    sets the static rank. Other maps: ignored.
*/
static void extract_add_string(RecWord *p, zebra_map_t zm,
                               const char *string, int length)
{
    assert(length > 0);

    if (!p->index_name)
        return;

    if (log_level_details)
    {
        WRBUF escaped = wrbuf_alloc();

        wrbuf_write_escaped(escaped, string, length);
        yaz_log(log_level_details, "extract_add_string: %s",
                wrbuf_cstr(escaped));
        wrbuf_destroy(escaped);
    }

    if (!zebra_maps_is_index(zm))
    {
        if (zebra_maps_is_sort(zm))
            extract_add_sort_string(p, string, length);
        else if (zebra_maps_is_staticrank(zm))
            extract_add_staticrank_string(p, string, length);
        return;
    }

    extract_add_index_string(p, zinfo_index_category_index, string, length);
    if (zebra_maps_is_alwaysmatches(zm))
    {
        /* work on a copy so the caller's seqno is left untouched */
        RecWord word = *p;

        word.seqno = 1;
        extract_add_index_string(
            &word, zinfo_index_category_alwaysmatches, "", 0);
    }
}
1745
/** \brief maps the next input character(s) of a term
    \param p token data holding the term (term_buf / term_len)
    \param zm zebra map used for the mapping
    \param cursor current read position in the term; passed to
    zebra_maps_input
    \returns result of zebra_maps_input, or NULL if no input remains
*/
static const char **extract_incomplete_next_map(RecWord *p, zebra_map_t zm,
                                                const char **cursor)
{
    int remain = p->term_len - (*cursor - p->term_buf);

    if (remain > 0)
        return zebra_maps_input(zm, cursor, remain, 0);
    return 0;
}

/** \brief splits a term into words and adds each word
    \param p token data; p->seqno is incremented for every string added
    \param zm zebra map used for character mapping

    Words are runs of mapped characters separated by CHR_SPACE. Each word
    is truncated to IT_MAX_WORD bytes. If the map has "first in field", a
    FIRST_IN_FIELD_STR marker is added before the first word.
*/
static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm)
{
    const char *cursor = p->term_buf;
    const char **map = extract_incomplete_next_map(p, zm, &cursor);
    int seen_word = 0;

    while (map)
    {
        char word[IT_MAX_WORD+1];
        int len = 0;

        /* Skip spaces */
        while (map && *map && **map == *CHR_SPACE)
            map = extract_incomplete_next_map(p, zm, &cursor);
        if (!map)
            break;

        /* collect mapped characters up to the next space */
        for (; map && *map && **map != *CHR_SPACE;
             map = extract_incomplete_next_map(p, zm, &cursor))
        {
            const char *src;

            for (src = *map; len < IT_MAX_WORD && *src; src++)
                word[len++] = *src;
        }
        if (len == 0)
            return;

        if (!seen_word)
        {
            seen_word = 1;
            if (zebra_maps_is_first_in_field(zm))
            {
                /* first in field marker */
                extract_add_string(p, zm, FIRST_IN_FIELD_STR,
                                   FIRST_IN_FIELD_LEN);
                p->seqno++;
            }
        }
        extract_add_string(p, zm, word, len);
        p->seqno++;
    }
}
1802
/** \brief adds a term as one single string (complete field)
    \param p token data; p->seqno is incremented if a string is added
    \param zm zebra map used by parse_complete_field

    Nothing is added when parse_complete_field returns 0.
*/
static void extract_add_complete_field(RecWord *p, zebra_map_t zm)
{
    char field[IT_MAX_WORD+1];
    int field_len = parse_complete_field(p, zm, field);

    if (field_len)
    {
        extract_add_string(p, zm, field, field_len);
        p->seqno++;
    }
}
1812
/** \brief tokenizes a term with the map's tokenizer and adds each token
    \param p token data; p->seqno is incremented for every token
    \param zm zebra map providing the tokenizer

    Tokens longer than IT_MAX_WORD bytes are truncated (and logged).
*/
static void extract_add_icu(RecWord *p, zebra_map_t zm)
{
    const char *token = 0;
    size_t token_len = 0;

    zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
    for (;;)
    {
        if (!zebra_map_tokenize_next(zm, &token, &token_len, 0, 0))
            break;
        if (token_len > IT_MAX_WORD)
        {
            yaz_log(YLOG_LOG, "Truncating long term %ld", (long) token_len);
            token_len = IT_MAX_WORD;
        }
        extract_add_string(p, zm, token, token_len);
        p->seqno++;
    }
}
1830
1831
1832 /** \brief top-level indexing handler for recctrl system
1833 \param p token data to be indexed
1834
1835 Call sequence:
1836 extract_token_add
1837 extract_add_{in}_complete / extract_add_icu
1838 extract_add_string
1839
1840 extract_add_index_string
1841 or
1842 extract_add_sort_string
1843 or
1844 extract_add_staticrank_string
1845
1846 */
extract_token_add(RecWord * p)1847 static void extract_token_add(RecWord *p)
1848 {
1849 ZebraHandle zh = p->extractCtrl->handle;
1850 zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
1851
1852 if (log_level_details)
1853 {
1854 yaz_log(log_level_details, "extract_token_add "
1855 "type=%s index=%s seqno=" ZINT_FORMAT " s=%.*s",
1856 p->index_type, p->index_name,
1857 p->seqno, p->term_len, p->term_buf);
1858 }
1859 if (zebra_maps_is_icu(zm))
1860 {
1861 extract_add_icu(p, zm);
1862 }
1863 else
1864 {
1865 if (zebra_maps_is_complete(zm))
1866 extract_add_complete_field(p, zm);
1867 else
1868 extract_add_incomplete_field(p, zm);
1869 }
1870 }
1871
extract_set_store_data_cb(struct recExtractCtrl * p,void * buf,size_t sz)1872 static void extract_set_store_data_cb(struct recExtractCtrl *p,
1873 void *buf, size_t sz)
1874 {
1875 ZebraHandle zh = (ZebraHandle) p->handle;
1876
1877 xfree(zh->store_data_buf);
1878 zh->store_data_buf = 0;
1879 zh->store_data_size = 0;
1880 if (buf && sz)
1881 {
1882 zh->store_data_buf = xmalloc(sz);
1883 zh->store_data_size = sz;
1884 memcpy(zh->store_data_buf, buf, sz);
1885 }
1886 }
1887
extract_set_store_data_prepare(struct recExtractCtrl * p)1888 static void extract_set_store_data_prepare(struct recExtractCtrl *p)
1889 {
1890 ZebraHandle zh = (ZebraHandle) p->handle;
1891 xfree(zh->store_data_buf);
1892 zh->store_data_buf = 0;
1893 zh->store_data_size = 0;
1894 p->setStoreData = extract_set_store_data_cb;
1895 }
1896
/** \brief callback: passes a schema OID to zebraExplain_addSchema
    \param p extract control; p->handle is the Zebra handle
    \param oid schema OID
*/
static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid)
{
    zebraExplain_addSchema(((ZebraHandle) p->handle)->reg->zei, oid);
}
1902
/** \brief adds or deletes the sort keys of a record in the sort index
    \param zh Zebra Handle
    \param sysno record ID used for keys whose own record ID (mem[1]) is 0
    \param cmd 1: add sort entries; otherwise: delete them
    \param reckeys sort keys to be flushed

    Keys are grouped by (ordinal, section_id) in order of first
    appearance. The terms of a group are concatenated, each followed by a
    0 byte, and the group is then added to / deleted from the sort index.
*/
void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
                             int cmd, zebra_rec_keys_t reckeys)
{
    struct sort_add_ent {
        int ord;
        struct sort_add_ent *next;
        WRBUF wrbuf;
        zint sysno;
        zint section_id;
    };
    struct sort_add_ent *groups = 0;
    struct sort_add_ent **tail = &groups;
    struct sort_add_ent *ent;
    zebra_sort_index_t si;
    zint last_sysno = 0;
    const char *term;
    size_t term_len;
    struct it_key key;
    NMEM nmem;

#if 0
    yaz_log(YLOG_LOG, "extract_flush_sort_keys cmd=%d sysno=" ZINT_FORMAT,
            cmd, sysno);
    extract_rec_keys_log(zh, cmd, reckeys, YLOG_LOG);
#endif

    if (!zebra_rec_keys_rewind(reckeys))
        return;

    si = zh->reg->sort_index;
    nmem = nmem_create();

    /* collect terms per (ordinal, section) group */
    while (zebra_rec_keys_read(reckeys, &term, &term_len, &key))
    {
        const int ord = CAST_ZINT_TO_INT(key.mem[0]);
        const zint filter_sysno = key.mem[1];
        const zint section_id = key.mem[2];

        ent = groups;
        while (ent && (ent->ord != ord || ent->section_id != section_id))
            ent = ent->next;
        if (!ent)
        {
            /* new group goes to the end of the list */
            ent = nmem_malloc(nmem, sizeof(*ent));
            ent->next = 0;
            ent->wrbuf = wrbuf_alloc();
            ent->ord = ord;
            ent->sysno = filter_sysno ? filter_sysno : sysno;
            ent->section_id = section_id;
            *tail = ent;
            tail = &ent->next;
        }

        wrbuf_write(ent->wrbuf, term, term_len);
        wrbuf_putc(ent->wrbuf, '\0');
    }

    /* apply the groups to the sort index */
    for (ent = groups; ent; ent = ent->next)
    {
        /* select the record only when it differs from the previous one */
        if (last_sysno != ent->sysno)
        {
            zebra_sort_sysno(si, ent->sysno);
            last_sysno = ent->sysno;
        }
        zebra_sort_type(si, ent->ord);
        if (cmd == 1)
            zebra_sort_add(si, ent->section_id, ent->wrbuf);
        else
            zebra_sort_delete(si, ent->section_id);
        wrbuf_destroy(ent->wrbuf);
    }
    nmem_destroy(nmem);
}
1976
1977 /*
1978 * Local variables:
1979 * c-basic-offset: 4
1980 * c-file-style: "Stroustrup"
1981 * indent-tabs-mode: nil
1982 * End:
1983 * vim: shiftwidth=4 tabstop=8 expandtab
1984 */
1985
1986