1 /* This file is part of the Zebra server.
2    Copyright (C) 2004-2013 Index Data
3 
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8 
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 
18 */
19 
20 /** \file
21     \brief indexes records and extracts tokens for indexing and sorting
22 */
23 
24 #if HAVE_CONFIG_H
25 #include <config.h>
26 #endif
27 #include <stdio.h>
28 #include <assert.h>
29 #include <ctype.h>
30 #ifdef WIN32
31 #include <io.h>
32 #endif
33 #if HAVE_UNISTD_H
34 #include <unistd.h>
35 #endif
36 #include <fcntl.h>
37 
38 
39 #include "index.h"
40 #include "orddict.h"
41 #include <direntz.h>
42 #include <charmap.h>
43 #include <yaz/snprintf.h>
44 
/* Log levels used by this file. They stay 0 until zebra_init_log_level()
   has run once; log_level_initialized records that it has. */
45 static int log_level_extract = 0;      /* level of the "extract" log module */
46 static int log_level_details = 0;      /* level of the "indexdetails" log module */
47 static int log_level_initialized = 0;  /* non-zero once the two levels are looked up */
48 
49 /* 1 if we eliminate identical delete/insert keys */
50 /* eventually the 0-case code will be removed */
51 #define FLUSH2 1
52 
53 #if FLUSH2
54 static void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
55                                        zebra_rec_keys_t ins_keys,
56                                        zint ins_rank,
57                                        zebra_rec_keys_t del_keys,
58                                        zint del_rank);
59 #else
60 static void extract_flush_record_keys(ZebraHandle zh, zint sysno,
61                                       int cmd,
62                                       zebra_rec_keys_t reckeys,
63                                       zint staticrank);
64 #endif
65 
zebra_init_log_level(void)66 static void zebra_init_log_level(void)
67 {
68     if (!log_level_initialized)
69     {
70         log_level_initialized = 1;
71 
72         log_level_extract = yaz_log_module_level("extract");
73         log_level_details = yaz_log_module_level("indexdetails");
74     }
75 }
76 
wrbuf_hex_str(const char * cstr)77 static WRBUF wrbuf_hex_str(const char *cstr)
78 {
79     size_t i;
80     WRBUF w = wrbuf_alloc();
81     for (i = 0; cstr[i]; i++)
82     {
83         if (cstr[i] < ' ' || cstr[i] > 126)
84             wrbuf_printf(w, "\\%02X", cstr[i] & 0xff);
85         else
86             wrbuf_putc(w, cstr[i]);
87     }
88     return w;
89 }
90 
91 
92 static void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
93                                     int cmd, zebra_rec_keys_t skp);
94 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid);
95 static void extract_token_add(RecWord *p);
96 
/* Logs a single notice at the moment the number of handled records
   (processed + skipped) equals the per-file verbose limit. */
static void check_log_limit(ZebraHandle zh)
{
    if (zh->records_processed + zh->records_skipped != zh->m_file_verbose_limit)
        return;

    yaz_log(YLOG_LOG, "More than %d file log entries. Omitting rest",
            zh->m_file_verbose_limit);
}
105 
/* Counts one more processed record and, for every 1000th record, logs the
   running processed/inserted/updated/deleted totals. The verbose-limit
   notice is checked before the counter is bumped. */
static void logRecord(ZebraHandle zh)
{
    check_log_limit(zh);

    zh->records_processed++;
    if (zh->records_processed % 1000 != 0)
        return;

    yaz_log(YLOG_LOG, "Records: " ZINT_FORMAT " i/u/d "
            ZINT_FORMAT "/" ZINT_FORMAT "/" ZINT_FORMAT,
            zh->records_processed, zh->records_inserted,
            zh->records_updated, zh->records_deleted);
}
118 
/* Sets ctrl->flagShowRecords to 1 when zh->m_flag_rw is zero and to 0
   otherwise. No other member of ctrl is touched. */
static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl)
{
    if (zh->m_flag_rw)
        ctrl->flagShowRecords = 0;
    else
        ctrl->flagShowRecords = 1;
}
123 
124 
125 static void extract_add_index_string(RecWord *p,
126                                       zinfo_index_category_t cat,
127                                       const char *str, int length);
128 
129 static void extract_set_store_data_prepare(struct recExtractCtrl *p);
130 
/* Resets a RecWord to its defaults: index "any" of type "w", sequence
   number 1, all ids zero, and bound to the extract control p. */
static void extract_init(struct recExtractCtrl *p, RecWord *w)
{
    /* owner of the word */
    w->extractCtrl = p;

    /* default index */
    w->index_name = "any";
    w->index_type = "w";

    /* positions and ids */
    w->seqno = 1;
    w->record_id = 0;
    w->section_id = 0;
    w->segment = 0;
}
141 
/* Context passed to the snippet callbacks through recExtractCtrl.handle;
   extract_snippet fills it in before running the record filter. */
142 struct snip_rec_info {
143     ZebraHandle zh;            /* handle whose register supplies maps and explain info */
144     zebra_snippets *snippets;  /* snippet collection the callbacks append to */
145 };
146 
/* Normalizes the term of p through the character map zm into buf.
   Runs of mapped blanks collapse into a single blank between words,
   leading blanks are dropped, and a mapped CHR_CUT discards everything
   written so far. At most IT_MAX_WORD bytes are written; buf is NOT
   NUL-terminated. Returns the number of bytes stored in buf. */
static int parse_complete_field(RecWord *p, zebra_map_t zm,
                                char *buf)
{
    const char *src = p->term_buf;
    const char **map = 0;
    int len = 0;
    int remain = p->term_len;

    if (remain > 0)
        map = zebra_maps_input(zm, &src, remain, 1);

    while (remain > 0 && len < IT_MAX_WORD)
    {
        /* swallow a run of blanks */
        while (map && *map && **map == *CHR_SPACE)
        {
            remain = p->term_len - (src - p->term_buf);

            if (remain <= 0)
                map = 0;
            else
            {
                /* still at first position as long as nothing was written */
                int first = (len == 0);
                map = zebra_maps_input(zm, &src, remain, first);
            }
        }
        if (!map)
            break;

        /* one separating blank between words, never a leading one */
        if (len != 0 && len < IT_MAX_WORD)
            buf[len++] = *CHR_SPACE;

        /* copy the mapped form of each non-blank input unit */
        while (map && *map && **map != *CHR_SPACE)
        {
            const char *cp = *map;

            if (*cp == *CHR_CUT)
                len = 0;        /* cut marker: forget what came before */
            else
            {
                if (len >= IT_MAX_WORD)
                    break;
                for (; len < IT_MAX_WORD && *cp; cp++)
                    buf[len++] = *cp;
            }

            remain = p->term_len - (src - p->term_buf);
            if (remain <= 0)
                map = 0;
            else
                map = zebra_maps_input(zm, &src, remain, 0);
        }
    }
    return len;
}
201 
/* Adds the whole term of p as one snippet entry for index ord.
   The term is first normalized with parse_complete_field; when that
   yields nothing, no snippet is added and seqno is left untouched.
   The snippet itself holds the original (unnormalized) term bytes. */
static void snippet_add_complete_field(RecWord *p, int ord,
                                       zebra_map_t zm)
{
    struct snip_rec_info *h = p->extractCtrl->handle;
    char normalized[IT_MAX_WORD+1];

    /* only the length of the normalized form matters here */
    if (parse_complete_field(p, zm, normalized) == 0)
        return;

    if (p->term_len && p->term_buf && zebra_maps_is_index(zm))
    {
        zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
                               p->term_buf, p->term_len);
    }
    p->seqno++;
}
217 
/* Splits the term of p into blank-separated tokens using the character
   map zm and appends them to the snippet list for index ord.
   The text between two tokens is appended with the fourth argument of
   zebra_snippets_appendn set to 1, each token with it set to 0; nothing
   is appended unless zebra_maps_is_index(zm) holds. seqno advances by
   one per token, plus one extra step before the first token when
   zebra_maps_is_first_in_field(zm) holds. */
static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm)
{
    struct snip_rec_info *h = p->extractCtrl->handle;
    const char *cursor = p->term_buf;   /* input position fed to the map */
    const char *seg_start = cursor;     /* start of the current segment */
    const char *seg_end = cursor;       /* end of the current segment */
    const char **map = 0;
    int first_token = 1;
    int left = p->term_len;

    if (left > 0)
        map = zebra_maps_input(zm, &cursor, left, 0);

    while (map)
    {
        /* Skip spaces */
        while (map && *map && **map == *CHR_SPACE)
        {
            left = p->term_len - (cursor - p->term_buf);
            seg_end = cursor;
            map = left > 0 ? zebra_maps_input(zm, &cursor, left, 0) : 0;
        }
        if (!map)
            break;

        /* the blank run just skipped, if any */
        if (seg_start != seg_end && zebra_maps_is_index(zm))
        {
            zebra_snippets_appendn(h->snippets, p->seqno, 1, ord,
                                   seg_start, seg_end - seg_start);
        }
        seg_start = seg_end;

        /* collect one token */
        while (map && *map && **map != *CHR_SPACE)
        {
            left = p->term_len - (cursor - p->term_buf);
            seg_end = cursor;
            map = left > 0 ? zebra_maps_input(zm, &cursor, left, 0) : 0;
        }
        if (seg_start == seg_end)
            return;

        if (first_token)
        {
            first_token = 0;
            if (zebra_maps_is_first_in_field(zm))
            {
                /* first in field marker */
                p->seqno++;
            }
        }

        /* the token is known to be non-empty at this point */
        if (zebra_maps_is_index(zm))
        {
            zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
                                   seg_start, seg_end - seg_start);
        }
        seg_start = seg_end;
        p->seqno++;
    }
}
282 
/* Tokenizes the term of p with the tokenizer of zm and appends the
   display form of every token to the snippet list for index ord
   (only when zebra_maps_is_index(zm) holds). seqno advances by one
   per token either way. */
static void snippet_add_icu(RecWord *p, int ord, zebra_map_t zm)
{
    struct snip_rec_info *h = p->extractCtrl->handle;
    const char *res_buf = 0, *display_buf = 0;
    size_t res_len = 0, display_len = 0;

    zebra_map_tokenize_start(zm, p->term_buf, p->term_len);

    for (; zebra_map_tokenize_next(zm, &res_buf, &res_len,
                                   &display_buf, &display_len);
         p->seqno++)
    {
        if (!zebra_maps_is_index(zm))
            continue;
        zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
                               display_buf, display_len);
    }
}
303 
/* tokenAdd callback used while building snippets: picks the snippet
   routine that matches the map of the word's index type (ICU, complete
   field or incomplete field). Does nothing when no map is available. */
static void snippet_token_add(RecWord *p)
{
    struct snip_rec_info *h = p->extractCtrl->handle;
    ZebraHandle zh = h->zh;
    zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
    int ord;

    if (!zm)
        return;

    ord = zebraExplain_lookup_attr_str(zh->reg->zei,
                                       zinfo_index_category_index,
                                       p->index_type, p->index_name);

    if (zebra_maps_is_icu(zm))
        snippet_add_icu(p, ord, zm);
    else if (zebra_maps_is_complete(zm))
        snippet_add_complete_field(p, ord, zm);
    else
        snippet_add_incomplete_field(p, ord, zm);
}
327 
/* schemaAdd callback used while building snippets: schema information
   is not needed there, so this intentionally does nothing. */
static void snippet_schema_add(struct recExtractCtrl *p, Odr_oid *oid)
{
    (void) p;
    (void) oid;
}
333 
/* Runs the extract function of record type rt over stream and collects
   the resulting tokens as snippets in sn instead of indexing them.
   The control block is set up for a first record with action_insert,
   the snippet callbacks, and no setStoreData handler. Requires zh->reg
   and zh->reg->dh to be set (asserted). The return value of the filter
   is ignored. */
void extract_snippet(ZebraHandle zh, zebra_snippets *sn,
                     struct ZebraRecStream *stream,
                     RecType rt, void *recTypeClientData)
{
    struct snip_rec_info info;
    struct recExtractCtrl ctrl;

    assert(zh->reg);
    assert(zh->reg->dh);

    /* context seen by the snippet callbacks */
    info.zh = zh;
    info.snippets = sn;

    /* input and environment */
    ctrl.stream = stream;
    ctrl.dh = zh->reg->dh;
    ctrl.handle = &info;

    /* record state */
    ctrl.first_record = 1;
    ctrl.match_criteria[0] = '\0';
    ctrl.staticrank = 0;
    ctrl.action = action_insert;

    /* callbacks */
    ctrl.init = extract_init;
    ctrl.tokenAdd = snippet_token_add;
    ctrl.schemaAdd = snippet_schema_add;
    ctrl.setStoreData = 0;

    init_extractCtrl(zh, &ctrl);

    (*rt->extract)(recTypeClientData, &ctrl);
}
364 
/* Collects the key strings of index index_name from reckeys.
   The index is looked up under the types "0", "p" and "w", in that
   order, and the first hit is used. ws[0 .. ws_length-1] is cleared
   first; each matching key is then stored at the slot given by its
   sequence number relative to the first matching key. Keys whose slot
   falls outside ws are ignored. When the index is unknown, ws is left
   all NULL. The stored pointers are the ones handed out by
   zebra_rec_keys_read. */
static void searchRecordKey(ZebraHandle zh,
                            zebra_rec_keys_t reckeys,
                            const char *index_name,
                            const char **ws, int ws_length)
{
    static const char *const index_types[] = { "0", "p", "w" };
    const zinfo_index_category_t cat = zinfo_index_category_index;
    int ord = -1;
    size_t t;
    int slot;
    zint first_seq = -1;
    const char *str;
    size_t slen;
    struct it_key key;

    for (slot = 0; slot < ws_length; slot++)
        ws[slot] = NULL;

    for (t = 0; ord < 0 && t < sizeof(index_types) / sizeof(*index_types); t++)
        ord = zebraExplain_lookup_attr_str(zh->reg->zei, cat,
                                           index_types[t], index_name);
    if (ord < 0)
        return;

    if (!zebra_rec_keys_rewind(reckeys))
        return;

    while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
    {
        zint seqno;
        zint woff;

        assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);

        /* the sequence number is the last component of the key */
        seqno = key.mem[key.len-1];

        if (key.mem[0] != ord)
            continue;

        if (first_seq == -1)
            first_seq = seqno;
        woff = seqno - first_seq;
        if (woff >= 0 && woff < ws_length)
            ws[woff] = str;
    }
}
413 
414 #define FILE_MATCH_BLANK "\t "
415 
get_match_from_spec(ZebraHandle zh,zebra_rec_keys_t reckeys,const char * fname,const char * spec)416 static char *get_match_from_spec(ZebraHandle zh,
417                           zebra_rec_keys_t reckeys,
418                           const char *fname, const char *spec)
419 {
420     static char dstBuf[2048];      /* static here ??? */
421     char *dst = dstBuf;
422     const char *s = spec;
423 
424     while (1)
425     {
426 	for (; *s && strchr(FILE_MATCH_BLANK, *s); s++)
427 	    ;
428         if (!*s)
429             break;
430         if (*s == '(')
431         {
432 	    const char *ws[32];
433 	    char attset_str[64], attname_str[64];
434 	    int i;
435             int first = 1;
436 
437 	    for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
438 		;
439 	    for (i = 0; *s && *s != ',' && *s != ')' &&
440 		     !strchr(FILE_MATCH_BLANK, *s); s++)
441 		if (i+1 < sizeof(attset_str))
442 		    attset_str[i++] = *s;
443 	    attset_str[i] = '\0';
444 
445 	    for (; strchr(FILE_MATCH_BLANK, *s); s++)
446 		;
447 	    if (*s != ',')
448                 strcpy(attname_str, attset_str);
449             else
450 	    {
451 		for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
452 		    ;
453 		for (i = 0; *s && *s != ')' &&
454 			 !strchr(FILE_MATCH_BLANK, *s); s++)
455 		    if (i+1 < sizeof(attname_str))
456 			attname_str[i++] = *s;
457 		attname_str[i] = '\0';
458 	    }
459             if (*s != ')')
460             {
461                 yaz_log(YLOG_WARN, "Missing ) in match criteria %s in group %s",
462                       spec, zh->m_group ? zh->m_group : "none");
463                 return NULL;
464             }
465             s++;
466 
467             searchRecordKey(zh, reckeys, attname_str, ws, 32);
468             if (0) /* for debugging */
469             {
470                 for (i = 0; i<32; i++)
471                 {
472                     if (ws[i])
473                     {
474                         WRBUF w = wrbuf_hex_str(ws[i]);
475                         yaz_log(YLOG_LOG, "ws[%d] = %s", i, wrbuf_cstr(w));
476                         wrbuf_destroy(w);
477                     }
478                 }
479             }
480 
481             for (i = 0; i<32; i++)
482                 if (ws[i])
483                 {
484                     if (first)
485                     {
486                         *dst++ = ' ';
487                         first = 0;
488                     }
489                     strcpy(dst, ws[i]);
490                     dst += strlen(ws[i]);
491                 }
492             if (first)
493             {
494                 yaz_log(YLOG_WARN, "Record didn't contain match"
495                       " fields in (%s,%s)", attset_str, attname_str);
496                 return NULL;
497             }
498         }
499         else if (*s == '$')
500         {
501             int spec_len;
502             char special[64];
503             const char *spec_src = NULL;
504             const char *s1 = ++s;
505             while (*s1 && !strchr(FILE_MATCH_BLANK, *s1))
506                 s1++;
507 
508             spec_len = s1 - s;
509             if (spec_len > sizeof(special)-1)
510                 spec_len = sizeof(special)-1;
511             memcpy(special, s, spec_len);
512             special[spec_len] = '\0';
513             s = s1;
514 
515             if (!strcmp(special, "group"))
516                 spec_src = zh->m_group;
517             else if (!strcmp(special, "database"))
518                 spec_src = zh->basenames[0];
519             else if (!strcmp(special, "filename")) {
520                 spec_src = fname;
521 	    }
522             else if (!strcmp(special, "type"))
523                 spec_src = zh->m_record_type;
524             else
525                 spec_src = NULL;
526             if (spec_src)
527             {
528                 strcpy(dst, spec_src);
529                 dst += strlen(spec_src);
530             }
531         }
532         else if (*s == '\"' || *s == '\'')
533         {
534             int stopMarker = *s++;
535             char tmpString[64];
536             int i = 0;
537 
538             while (*s && *s != stopMarker)
539             {
540                 if (i+1 < sizeof(tmpString))
541                     tmpString[i++] = *s++;
542             }
543             if (*s)
544                 s++;
545             tmpString[i] = '\0';
546             strcpy(dst, tmpString);
547             dst += strlen(tmpString);
548         }
549         else
550         {
551             yaz_log(YLOG_WARN, "Syntax error in match criteria %s in group %s",
552                   spec, zh->m_group ? zh->m_group : "none");
553             return NULL;
554         }
555         *dst++ = 1;
556     }
557     if (dst == dstBuf)
558     {
559         yaz_log(YLOG_WARN, "No match criteria for record %s in group %s",
560               fname, zh->m_group ? zh->m_group : "none");
561         return NULL;
562     }
563     *dst = '\0';
564 
565     if (0) /* for debugging */
566     {
567         WRBUF w = wrbuf_hex_str(dstBuf);
568         yaz_log(YLOG_LOG, "get_match_from_spec %s", wrbuf_cstr(w));
569         wrbuf_destroy(w);
570     }
571 
572     return dstBuf;
573 }
574 
/* NOTE(review): not referenced in the visible part of this file; the
   field descriptions below are inferred from the names only - confirm
   before relying on them. */
575 struct recordLogInfo {
576     const char *fname;          /* presumably the name of the record's file */
577     int recordOffset;           /* presumably the record's offset in that file */
578     struct recordGroup *rGroup; /* presumably the record group in effect */
579 };
580 
581 /** \brief add the always-matches index entry and map to real record ID
582     \param ctrl record control
583     \param record_id custom record ID
584     \param sysno system record ID
585 
586     This function serves two purposes.. It adds the always matches
587     entry and makes a pointer from the custom record ID (if defined)
588     back to the system record ID (sysno)
589     See zebra_recid_to_sysno .
590   */
all_matches_add(struct recExtractCtrl * ctrl,zint record_id,zint sysno)591 static void all_matches_add(struct recExtractCtrl *ctrl, zint record_id,
592                             zint sysno)
593 {
594     RecWord word;
595     extract_init(ctrl, &word);
596     word.record_id = record_id;
597     /* we use the seqno as placeholder for a way to get back to
598        record database from _ALLRECORDS.. This is used if a custom
599        RECORD was defined */
600     word.seqno = sysno;
601     word.index_name = "_ALLRECORDS";
602     word.index_type = "w";
603 
604     extract_add_index_string(&word, zinfo_index_category_alwaysmatches,
605                               "", 0);
606 }
607 
608 /* forward declaration */
609 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh,
610                                        struct ZebraRecStream *stream,
611                                        enum zebra_recctrl_action_t action,
612                                        const char *recordType,
613                                        zint *sysno,
614                                        const char *match_criteria,
615                                        const char *fname,
616                                        RecType recType,
617                                        void *recTypeClientData);
618 
619 
zebra_extract_file(ZebraHandle zh,zint * sysno,const char * fname,enum zebra_recctrl_action_t action)620 ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname,
621                              enum zebra_recctrl_action_t action)
622 {
623     ZEBRA_RES r = ZEBRA_OK;
624     int i, fd;
625     char gprefix[128];
626     char ext[128];
627     char ext_res[128];
628     const char *original_record_type = 0;
629     RecType recType;
630     void *recTypeClientData;
631     struct ZebraRecStream stream, *streamp;
632 
633     zebra_init_log_level();
634 
635     if (!zh->m_group || !*zh->m_group)
636         *gprefix = '\0';
637     else
638         sprintf(gprefix, "%s.", zh->m_group);
639 
640     yaz_log(log_level_extract, "zebra_extract_file %s", fname);
641 
642     /* determine file extension */
643     *ext = '\0';
644     for (i = strlen(fname); --i >= 0; )
645         if (fname[i] == '/')
646             break;
647         else if (fname[i] == '.')
648         {
649             strcpy(ext, fname+i+1);
650             break;
651         }
652     /* determine file type - depending on extension */
653     original_record_type = zh->m_record_type;
654     if (!zh->m_record_type)
655     {
656         sprintf(ext_res, "%srecordType.%s", gprefix, ext);
657         zh->m_record_type = res_get(zh->res, ext_res);
658     }
659     if (!zh->m_record_type)
660     {
661         check_log_limit(zh);
662 	if (zh->records_processed + zh->records_skipped
663             < zh->m_file_verbose_limit)
664             yaz_log(YLOG_LOG, "? %s", fname);
665         zh->records_skipped++;
666         return 0;
667     }
668     /* determine match criteria */
669     if (!zh->m_record_id)
670     {
671         sprintf(ext_res, "%srecordId.%s", gprefix, ext);
672         zh->m_record_id = res_get(zh->res, ext_res);
673     }
674 
675     if (!(recType =
676 	  recType_byName(zh->reg->recTypes, zh->res, zh->m_record_type,
677 			  &recTypeClientData)))
678     {
679         yaz_log(YLOG_WARN, "No such record type: %s", zh->m_record_type);
680         return ZEBRA_FAIL;
681     }
682 
683     switch(recType->version)
684     {
685     case 0:
686 	break;
687     default:
688 	yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type);
689     }
690     if (sysno && (action == action_delete || action == action_a_delete))
691     {
692         streamp = 0;
693     }
694     else
695     {
696         char full_rep[1024];
697 
698         if (zh->path_reg && !yaz_is_abspath(fname))
699         {
700             strcpy(full_rep, zh->path_reg);
701             strcat(full_rep, "/");
702             strcat(full_rep, fname);
703         }
704         else
705             strcpy(full_rep, fname);
706 
707         if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1)
708         {
709             yaz_log(YLOG_WARN|YLOG_ERRNO, "open %s", full_rep);
710 	    zh->m_record_type = original_record_type;
711             return ZEBRA_FAIL;
712         }
713         streamp = &stream;
714         zebra_create_stream_fd(streamp, fd, 0);
715     }
716     r = zebra_extract_records_stream(zh, streamp,
717                                      action,
718                                      zh->m_record_type,
719                                      sysno,
720                                      0, /*match_criteria */
721                                      fname,
722                                      recType, recTypeClientData);
723     if (streamp)
724         stream.destroy(streamp);
725     zh->m_record_type = original_record_type;
726     return r;
727 }
728 
729 /*
730   If sysno is provided, then it's used to identify the record.
731   If not, and match_criteria is provided, then sysno is guessed
732   If not, and a record is provided, then sysno is got from there
733 
734  */
735 
zebra_buffer_extract_record(ZebraHandle zh,const char * buf,size_t buf_size,enum zebra_recctrl_action_t action,const char * recordType,zint * sysno,const char * match_criteria,const char * fname)736 ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh,
737                                       const char *buf, size_t buf_size,
738                                       enum zebra_recctrl_action_t action,
739                                       const char *recordType,
740                                       zint *sysno,
741                                       const char *match_criteria,
742                                       const char *fname)
743 {
744     struct ZebraRecStream stream;
745     ZEBRA_RES res;
746     void *clientData;
747     RecType recType = 0;
748 
749     if (recordType && *recordType)
750     {
751         yaz_log(log_level_extract,
752                 "Record type explicitly specified: %s", recordType);
753         recType = recType_byName(zh->reg->recTypes, zh->res, recordType,
754                                   &clientData);
755     }
756     else
757     {
758         if (!(zh->m_record_type))
759 	{
760             yaz_log(YLOG_WARN, "No such record type defined");
761             return ZEBRA_FAIL;
762         }
763         yaz_log(log_level_extract, "Get record type from rgroup: %s",
764                 zh->m_record_type);
765         recType = recType_byName(zh->reg->recTypes, zh->res,
766 				  zh->m_record_type, &clientData);
767         recordType = zh->m_record_type;
768     }
769 
770     if (!recType)
771     {
772         yaz_log(YLOG_WARN, "No such record type: %s", recordType);
773         return ZEBRA_FAIL;
774     }
775 
776     zebra_create_stream_mem(&stream, buf, buf_size);
777 
778     res = zebra_extract_records_stream(zh, &stream,
779                                        action,
780                                        recordType,
781                                        sysno,
782                                        match_criteria,
783                                        fname,
784                                        recType, clientData);
785     stream.destroy(&stream);
786     return res;
787 }
788 
zebra_extract_record_stream(ZebraHandle zh,struct ZebraRecStream * stream,enum zebra_recctrl_action_t action,const char * recordType,zint * sysno,const char * match_criteria,const char * fname,RecType recType,void * recTypeClientData,int * more)789 static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
790                                              struct ZebraRecStream *stream,
791                                              enum zebra_recctrl_action_t action,
792                                              const char *recordType,
793                                              zint *sysno,
794                                              const char *match_criteria,
795                                              const char *fname,
796                                              RecType recType,
797                                              void *recTypeClientData,
798                                              int *more)
799 
800 {
801     zint sysno0 = 0;
802     RecordAttr *recordAttr;
803     struct recExtractCtrl extractCtrl;
804     int r;
805     const char *matchStr = 0;
806     Record rec;
807     off_t start_offset = 0, end_offset = 0;
808     const char *pr_fname = fname;  /* filename to print .. */
809     int show_progress = zh->records_processed + zh->records_skipped
810         < zh->m_file_verbose_limit ? 1:0;
811 
812     zebra_init_log_level();
813 
814     if (!pr_fname)
815 	pr_fname = "<no file>";  /* make it printable if file is omitted */
816 
817     zebra_rec_keys_reset(zh->reg->keys);
818     zebra_rec_keys_reset(zh->reg->sortKeys);
819 
820     if (zebraExplain_curDatabase(zh->reg->zei, zh->basenames[0]))
821     {
822         if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0],
823 				      zh->m_explain_database))
824             return ZEBRA_FAIL;
825     }
826 
827     if (stream)
828     {
829         off_t null_offset = 0;
830         extractCtrl.stream = stream;
831 
832         start_offset = stream->tellf(stream);
833 
834         extractCtrl.first_record = start_offset ? 0 : 1;
835 
836         stream->endf(stream, &null_offset);;
837 
838         extractCtrl.init = extract_init;
839         extractCtrl.tokenAdd = extract_token_add;
840         extractCtrl.schemaAdd = extract_schema_add;
841         extractCtrl.dh = zh->reg->dh;
842         extractCtrl.handle = zh;
843         extractCtrl.match_criteria[0] = '\0';
844         extractCtrl.staticrank = 0;
845         extractCtrl.action = action;
846 
847         init_extractCtrl(zh, &extractCtrl);
848 
849         extract_set_store_data_prepare(&extractCtrl);
850 
851         r = (*recType->extract)(recTypeClientData, &extractCtrl);
852 
853         if (action == action_update)
854         {
855             action = extractCtrl.action;
856         }
857 
858         switch (r)
859         {
860         case RECCTRL_EXTRACT_EOF:
861             return ZEBRA_FAIL;
862         case RECCTRL_EXTRACT_ERROR_GENERIC:
863             /* error occured during extraction ... */
864             yaz_log(YLOG_WARN, "extract error: generic");
865             return ZEBRA_FAIL;
866         case RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER:
867             /* error occured during extraction ... */
868             yaz_log(YLOG_WARN, "extract error: no such filter");
869             return ZEBRA_FAIL;
870         case RECCTRL_EXTRACT_SKIP:
871             if (show_progress)
872                 yaz_log(YLOG_LOG, "skip %s %s " ZINT_FORMAT,
873                          recordType, pr_fname, (zint) start_offset);
874             *more = 1;
875 
876             end_offset = stream->endf(stream, 0);
877             if (end_offset)
878                 stream->seekf(stream, end_offset);
879 
880             return ZEBRA_OK;
881         case RECCTRL_EXTRACT_OK:
882             break;
883         default:
884             yaz_log(YLOG_WARN, "extract error: unknown error: %d", r);
885             return ZEBRA_FAIL;
886         }
887         end_offset = stream->endf(stream, 0);
888         if (end_offset)
889             stream->seekf(stream, end_offset);
890         else
891             end_offset = stream->tellf(stream);
892 
893         if (extractCtrl.match_criteria[0])
894             match_criteria = extractCtrl.match_criteria;
895     }
896 
897     *more = 1;
898 
899     if (zh->m_flag_rw == 0)
900     {
901         yaz_log(YLOG_LOG, "test %s %s " ZINT_FORMAT, recordType,
902                 pr_fname, (zint) start_offset);
903         /* test mode .. Do not perform match */
904         return ZEBRA_OK;
905     }
906 
907     if (!sysno)
908     {
909 	sysno = &sysno0;
910 
911         if (match_criteria && *match_criteria)
912             matchStr = match_criteria;
913         else
914         {
915             if (zh->m_record_id && *zh->m_record_id)
916             {
917                 matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname,
918                                                zh->m_record_id);
919 		if (!matchStr)
920                 {
921                     yaz_log(YLOG_LOG, "error %s %s " ZINT_FORMAT, recordType,
922                              pr_fname, (zint) start_offset);
923 		    return ZEBRA_FAIL;
924                 }
925                 if (0 && matchStr)
926                 {
927                     WRBUF w = wrbuf_alloc();
928                     size_t i;
929                     for (i = 0; i < strlen(matchStr); i++)
930                     {
931                         wrbuf_printf(w, "%02X", matchStr[i] & 0xff);
932                     }
933                     yaz_log(YLOG_LOG, "Got match %s", wrbuf_cstr(w));
934                     wrbuf_destroy(w);
935                 }
936             }
937         }
938         if (matchStr)
939 	{
940 	    int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
941 	    char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord,
942 					  matchStr);
943 
944 
945             if (log_level_extract)
946             {
947                 WRBUF w = wrbuf_hex_str(matchStr);
948                 yaz_log(log_level_extract, "matchStr: %s", wrbuf_cstr(w));
949                 wrbuf_destroy(w);
950             }
951             if (rinfo)
952 	    {
953 		assert(*rinfo == sizeof(*sysno));
954                 memcpy(sysno, rinfo+1, sizeof(*sysno));
955 	    }
956        }
957     }
958 
959     if (! *sysno)
960     {
961         /* new record AKA does not exist already */
962         if (action == action_delete)
963         {
964             yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
965                     pr_fname, (zint) start_offset);
966             yaz_log(YLOG_WARN, "cannot delete record above (seems new)");
967             return ZEBRA_FAIL;
968         }
969         else if (action == action_a_delete)
970         {
971             if (show_progress)
972                 yaz_log(YLOG_LOG, "adelete %s %s " ZINT_FORMAT, recordType,
973                         pr_fname, (zint) start_offset);
974             return ZEBRA_OK;
975         }
976 	else if (action == action_replace)
977 	{
978 	    yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
979 			 pr_fname, (zint) start_offset);
980             yaz_log(YLOG_WARN, "cannot update record above (seems new)");
981             return ZEBRA_FAIL;
982 	}
983 	if (show_progress)
984 	    yaz_log(YLOG_LOG, "add %s %s " ZINT_FORMAT, recordType, pr_fname,
985 		     (zint) start_offset);
986         rec = rec_new(zh->reg->records);
987 
988         *sysno = rec->sysno;
989 
990 
991         if (stream)
992         {
993             all_matches_add(&extractCtrl,
994                             zebra_rec_keys_get_custom_record_id(zh->reg->keys),
995                             *sysno);
996         }
997 
998 
999 	recordAttr = rec_init_attr(zh->reg->zei, rec);
1000 	if (extractCtrl.staticrank < 0)
1001         {
1002             yaz_log(YLOG_WARN, "Negative staticrank for record. Set to 0");
1003 	    extractCtrl.staticrank = 0;
1004         }
1005 
1006         if (matchStr)
1007         {
1008 	    int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1009             dict_insert_ord(zh->reg->matchDict, db_ord, matchStr,
1010 			    sizeof(*sysno), sysno);
1011         }
1012 
1013 	extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1014 #if FLUSH2
1015         extract_flush_record_keys2(zh, *sysno,
1016                                    zh->reg->keys, extractCtrl.staticrank,
1017                                    0, recordAttr->staticrank);
1018 #else
1019         extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys,
1020                                   extractCtrl.staticrank);
1021 #endif
1022 	recordAttr->staticrank = extractCtrl.staticrank;
1023         zh->records_inserted++;
1024     }
1025     else
1026     {
1027         /* record already exists */
1028 	zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1029 	zebra_rec_keys_t sortKeys = zebra_rec_keys_open();
1030 	if (action == action_insert)
1031 	{
1032 	    yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT,
1033 			 recordType, pr_fname, (zint) start_offset);
1034 	    logRecord(zh);
1035 	    return ZEBRA_FAIL;
1036 	}
1037 
1038         rec = rec_get(zh->reg->records, *sysno);
1039         assert(rec);
1040 
1041         if (stream)
1042         {
1043             all_matches_add(&extractCtrl,
1044                             zebra_rec_keys_get_custom_record_id(zh->reg->keys),
1045                             *sysno);
1046         }
1047 
1048 	recordAttr = rec_init_attr(zh->reg->zei, rec);
1049 
1050         /* decrease total size */
1051         zebraExplain_recordBytesIncrement(zh->reg->zei,
1052                                            - recordAttr->recordSize);
1053 
1054 	zebra_rec_keys_set_buf(delkeys,
1055 			       rec->info[recInfo_delKeys],
1056 			       rec->size[recInfo_delKeys],
1057 			       0);
1058 	zebra_rec_keys_set_buf(sortKeys,
1059 			       rec->info[recInfo_sortKeys],
1060 			       rec->size[recInfo_sortKeys],
1061 			       0);
1062 
1063 	extract_flush_sort_keys(zh, *sysno, 0, sortKeys);
1064 #if !FLUSH2
1065         extract_flush_record_keys(zh, *sysno, 0, delkeys,
1066                                   recordAttr->staticrank);
1067 #endif
1068         if (action == action_delete || action == action_a_delete)
1069         {
1070             /* record going to be deleted */
1071 #if FLUSH2
1072             extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank,
1073                                        delkeys, recordAttr->staticrank);
1074 #endif
1075             if (zebra_rec_keys_empty(delkeys))
1076             {
1077 		yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1078                         pr_fname, (zint) start_offset);
1079 		yaz_log(YLOG_WARN, "cannot delete file above, "
1080                         "storeKeys false (3)");
1081 	    }
1082             else
1083             {
1084 		if (show_progress)
1085 		    yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1086                             pr_fname, (zint) start_offset);
1087                 zh->records_deleted++;
1088                 if (matchStr)
1089 		{
1090 		    int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1091                     dict_delete_ord(zh->reg->matchDict, db_ord, matchStr);
1092 		}
1093                 rec_del(zh->reg->records, &rec);
1094             }
1095             zebra_rec_keys_close(delkeys);
1096             zebra_rec_keys_close(sortKeys);
1097 	    rec_free(&rec);
1098             logRecord(zh);
1099             return ZEBRA_OK;
1100         }
1101         else
1102         {   /* update or special_update */
1103 	    if (show_progress)
1104                 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
1105                         pr_fname, (zint) start_offset);
1106             extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1107 
1108 #if FLUSH2
1109             extract_flush_record_keys2(zh, *sysno,
1110                                        zh->reg->keys, extractCtrl.staticrank,
1111                                        delkeys, recordAttr->staticrank);
1112 #else
1113             extract_flush_record_keys(zh, *sysno, 1,
1114                                       zh->reg->keys, extractCtrl.staticrank);
1115 #endif
1116 	    recordAttr->staticrank = extractCtrl.staticrank;
1117             zh->records_updated++;
1118         }
1119 	zebra_rec_keys_close(delkeys);
1120 	zebra_rec_keys_close(sortKeys);
1121     }
1122     /* update file type */
1123     xfree(rec->info[recInfo_fileType]);
1124     rec->info[recInfo_fileType] =
1125         rec_strdup(recordType, &rec->size[recInfo_fileType]);
1126 
1127     /* update filename */
1128     xfree(rec->info[recInfo_filename]);
1129     rec->info[recInfo_filename] =
1130         rec_strdup(fname, &rec->size[recInfo_filename]);
1131 
1132     /* update delete keys */
1133     xfree(rec->info[recInfo_delKeys]);
1134     if (!zebra_rec_keys_empty(zh->reg->keys) && zh->m_store_keys == 1)
1135     {
1136 	zebra_rec_keys_get_buf(zh->reg->keys,
1137 			       &rec->info[recInfo_delKeys],
1138 			       &rec->size[recInfo_delKeys]);
1139     }
1140     else
1141     {
1142         rec->info[recInfo_delKeys] = NULL;
1143         rec->size[recInfo_delKeys] = 0;
1144     }
1145     /* update sort keys */
1146     xfree(rec->info[recInfo_sortKeys]);
1147 
1148     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1149 			   &rec->info[recInfo_sortKeys],
1150 			   &rec->size[recInfo_sortKeys]);
1151 
1152     if (stream)
1153     {
1154         recordAttr->recordSize = end_offset - start_offset;
1155         zebraExplain_recordBytesIncrement(zh->reg->zei,
1156                                           recordAttr->recordSize);
1157     }
1158 
1159     /* set run-number for this record */
1160     recordAttr->runNumber =
1161 	zebraExplain_runNumberIncrement(zh->reg->zei, 0);
1162 
1163     /* update store data */
1164     xfree(rec->info[recInfo_storeData]);
1165 
1166     /* update store data */
1167     if (zh->store_data_buf)
1168     {
1169         rec->size[recInfo_storeData] = zh->store_data_size;
1170         rec->info[recInfo_storeData] = zh->store_data_buf;
1171 	zh->store_data_buf = 0;
1172         recordAttr->recordSize = zh->store_data_size;
1173     }
1174     else if (zh->m_store_data)
1175     {
1176         off_t cur_offset = stream->tellf(stream);
1177 
1178         rec->size[recInfo_storeData] = recordAttr->recordSize;
1179         rec->info[recInfo_storeData] = (char *)
1180 	    xmalloc(recordAttr->recordSize);
1181         stream->seekf(stream, start_offset);
1182         stream->readf(stream, rec->info[recInfo_storeData],
1183                       recordAttr->recordSize);
1184         stream->seekf(stream, cur_offset);
1185     }
1186     else
1187     {
1188         rec->info[recInfo_storeData] = NULL;
1189         rec->size[recInfo_storeData] = 0;
1190     }
1191     /* update database name */
1192     xfree(rec->info[recInfo_databaseName]);
1193     rec->info[recInfo_databaseName] =
1194         rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]);
1195 
1196     /* update offset */
1197     recordAttr->recordOffset = start_offset;
1198 
1199     /* commit this record */
1200     rec_put(zh->reg->records, &rec);
1201     logRecord(zh);
1202     return ZEBRA_OK;
1203 }
1204 
1205 /** \brief extracts records from stream
1206     \param zh Zebra Handle
1207     \param stream stream that we read from
1208     \param action (action_insert, action_replace, action_delete, ..)
1209     \param recordType Record filter type "grs.xml", etc.
1210     \param sysno pointer to sysno if already known; NULL otherwise
1211     \param match_criteria (NULL if not already given)
1212     \param fname filename that we read from (for logging purposes only)
1213     \param recType record type
1214     \param recTypeClientData client data for record type
1215     \returns ZEBRA_OK for success; ZEBRA_FAIL for failure
1216 */
zebra_extract_records_stream(ZebraHandle zh,struct ZebraRecStream * stream,enum zebra_recctrl_action_t action,const char * recordType,zint * sysno,const char * match_criteria,const char * fname,RecType recType,void * recTypeClientData)1217 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh,
1218                                        struct ZebraRecStream *stream,
1219                                        enum zebra_recctrl_action_t action,
1220                                        const char *recordType,
1221                                        zint *sysno,
1222                                        const char *match_criteria,
1223                                        const char *fname,
1224                                        RecType recType,
1225                                        void *recTypeClientData)
1226 {
1227     ZEBRA_RES res = ZEBRA_OK;
1228     while (1)
1229     {
1230         int more = 0;
1231         res = zebra_extract_record_stream(zh, stream,
1232                                           action,
1233                                           recordType,
1234                                           sysno,
1235                                           match_criteria,
1236                                           fname,
1237                                           recType, recTypeClientData, &more);
1238         if (!more)
1239         {
1240             res = ZEBRA_OK;
1241             break;
1242         }
1243         if (res != ZEBRA_OK)
1244             break;
1245         if (sysno)
1246             break;
1247     }
1248     return res;
1249 }
1250 
zebra_extract_explain(void * handle,Record rec,data1_node * n)1251 ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
1252 {
1253     ZebraHandle zh = (ZebraHandle) handle;
1254     struct recExtractCtrl extractCtrl;
1255 
1256     if (zebraExplain_curDatabase(zh->reg->zei,
1257 				  rec->info[recInfo_databaseName]))
1258     {
1259 	abort();
1260         if (zebraExplain_newDatabase(zh->reg->zei,
1261 				      rec->info[recInfo_databaseName], 0))
1262             abort();
1263     }
1264 
1265     zebra_rec_keys_reset(zh->reg->keys);
1266     zebra_rec_keys_reset(zh->reg->sortKeys);
1267 
1268     extractCtrl.init = extract_init;
1269     extractCtrl.tokenAdd = extract_token_add;
1270     extractCtrl.schemaAdd = extract_schema_add;
1271     extractCtrl.dh = zh->reg->dh;
1272 
1273     init_extractCtrl(zh, &extractCtrl);
1274 
1275     extractCtrl.flagShowRecords = 0;
1276     extractCtrl.match_criteria[0] = '\0';
1277     extractCtrl.staticrank = 0;
1278     extractCtrl.action = action_update;
1279 
1280     extractCtrl.handle = handle;
1281     extractCtrl.first_record = 1;
1282 
1283     extract_set_store_data_prepare(&extractCtrl);
1284 
1285     if (n)
1286 	grs_extract_tree(&extractCtrl, n);
1287 
1288     if (rec->size[recInfo_delKeys])
1289     {
1290 	zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1291 
1292 	zebra_rec_keys_t sortkeys = zebra_rec_keys_open();
1293 
1294 	zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys],
1295 			       rec->size[recInfo_delKeys],
1296 			       0);
1297 #if FLUSH2
1298 	extract_flush_record_keys2(zh, rec->sysno,
1299                                    zh->reg->keys, 0, delkeys, 0);
1300 #else
1301 	extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0);
1302         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1303 #endif
1304 	zebra_rec_keys_close(delkeys);
1305 
1306 	zebra_rec_keys_set_buf(sortkeys, rec->info[recInfo_sortKeys],
1307 			       rec->size[recInfo_sortKeys],
1308 			       0);
1309 
1310 	extract_flush_sort_keys(zh, rec->sysno, 0, sortkeys);
1311 	zebra_rec_keys_close(sortkeys);
1312     }
1313     else
1314     {
1315 #if FLUSH2
1316 	extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0);
1317 #else
1318         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1319 #endif
1320     }
1321     extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys);
1322 
1323     xfree(rec->info[recInfo_delKeys]);
1324     zebra_rec_keys_get_buf(zh->reg->keys,
1325 			   &rec->info[recInfo_delKeys],
1326 			   &rec->size[recInfo_delKeys]);
1327 
1328     xfree(rec->info[recInfo_sortKeys]);
1329     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1330 			   &rec->info[recInfo_sortKeys],
1331 			   &rec->size[recInfo_sortKeys]);
1332     return ZEBRA_OK;
1333 }
1334 
/** \brief logs one index key together with its term
    \param zh Zebra Handle
    \param key index key; all key->len components are printed
    \param str term belonging to the key (used as a NUL-terminated string)
    \param slen number of bytes of str that are dumped for special terms
    \param nmem memory handle passed to zebra_term_untrans_iconv
    \param level YAZ log level used for the log line

    The log line starts with the key components, the index type and the
    index name. What follows depends on the first byte of str:
    - below CHR_BASE_CHAR: a label ("alwaysmatches", "firstinfield",
      "unknown", "space" or "?") followed by the decimal value of each
      of the slen bytes;
    - otherwise: the term as converted by zebra_term_untrans_iconv in
      double quotes, or, if no converted term is produced, the raw term
      written with wrbuf_write_escaped.

    All text is assembled in WRBUFs, so neither the number of key
    components nor the length of the term can overrun a fixed buffer.
*/
void zebra_it_key_str_dump(ZebraHandle zh, struct it_key *key,
                           const char *str, size_t slen, NMEM nmem, int level)
{
    WRBUF keystr;
    int ord = CAST_ZINT_TO_INT(key->mem[0]);
    const char *index_type;
    const char *string_index;
    int i;

    zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
                            0/* db */, &string_index);
    assert(index_type);

    /* every key component followed by a single blank */
    keystr = wrbuf_alloc();
    for (i = 0; i < key->len; i++)
        wrbuf_printf(keystr, ZINT_FORMAT " ", key->mem[i]);

    if (*str < CHR_BASE_CHAR)
    {
        WRBUF dst = wrbuf_alloc();
        const char *label = "?";
        size_t j;

        if (!strcmp(str, ""))
            label = "alwaysmatches";
        else if (!strcmp(str, FIRST_IN_FIELD_STR))
            label = "firstinfield";
        else if (!strcmp(str, CHR_UNKNOWN))
            label = "unknown";
        else if (!strcmp(str, CHR_SPACE))
            label = "space";
        wrbuf_puts(dst, label);

        /* raw byte values; up to four characters are added per byte */
        for (j = 0; j < slen; j++)
            wrbuf_printf(dst, " %d", str[j] & 0xff);

        yaz_log(level, "%s%s %s %s", wrbuf_cstr(keystr), index_type,
                string_index, wrbuf_cstr(dst));
        wrbuf_destroy(dst);
    }
    else
    {
        char *dst_term = 0;
        zebra_term_untrans_iconv(zh, nmem, index_type, &dst_term, str);
        if (dst_term)
            yaz_log(level, "%s%s %s \"%s\"", wrbuf_cstr(keystr), index_type,
                    string_index, dst_term);
        else
        {
            WRBUF w = wrbuf_alloc();
            wrbuf_write_escaped(w, str, strlen(str));
            yaz_log(level, "%s%s %s %s", wrbuf_cstr(keystr), index_type,
                    string_index, wrbuf_cstr(w));
            wrbuf_destroy(w);
        }
    }
    wrbuf_destroy(keystr);
}
1393 
/** \brief logs every key of a record key set
    \param zh Zebra Handle
    \param is_insert not used by this function
    \param reckeys key set to be logged
    \param level YAZ log level handed to zebra_it_key_str_dump

    Nothing is logged when zebra_rec_keys_rewind returns 0.
*/
void extract_rec_keys_log(ZebraHandle zh, int is_insert,
                          zebra_rec_keys_t reckeys,
                          int level)
{
    struct it_key it;
    const char *term;
    size_t term_len;
    NMEM scratch;

    if (!zebra_rec_keys_rewind(reckeys))
        return;

    scratch = nmem_create();
    for (;;)
    {
        if (!zebra_rec_keys_read(reckeys, &term, &term_len, &it))
            break;
        zebra_it_key_str_dump(zh, &it, term, term_len, scratch, level);
        /* release whatever the dump allocated for this key */
        nmem_reset(scratch);
    }
    nmem_destroy(scratch);
}
1413 
/** \brief adjusts per-ordinal occurrence counts for a record key set
    \param zh Zebra Handle
    \param is_insert non-zero: counts are increased; zero: decreased
    \param reckeys key set to be counted

    The keys are counted per ordinal (first component of each key). For
    every distinct ordinal zebraExplain_ord_adjust_occurrences is called
    once, with the number of keys and 1 when inserting, or with the
    negated number of keys and -1 otherwise. Nothing happens when
    zebra_rec_keys_rewind returns 0.
*/
void extract_rec_keys_adjust(ZebraHandle zh, int is_insert,
                             zebra_rec_keys_t reckeys)
{
    struct ord_count {
        int ord;
        int hits;
        struct ord_count *next;
    };
    ZebraExplainInfo zei = zh->reg->zei;
    struct ord_count *head = 0;
    struct ord_count *node;
    struct it_key key;
    const char *term;
    size_t term_len;
    int sign = is_insert ? 1 : -1;

    if (!zebra_rec_keys_rewind(reckeys))
        return;

    /* pass 1: count keys per ordinal; new ordinals go to the list head */
    while (zebra_rec_keys_read(reckeys, &term, &term_len, &key))
    {
        int ord = CAST_ZINT_TO_INT(key.mem[0]);

        node = head;
        while (node && node->ord != ord)
            node = node->next;

        if (node)
            node->hits++;
        else
        {
            node = xmalloc(sizeof(*node));
            node->ord = ord;
            node->hits = 1;
            node->next = head;
            head = node;
        }
    }

    /* pass 2: report the counts and release the list */
    while (head)
    {
        node = head;
        head = head->next;

        zebraExplain_ord_adjust_occurrences(zei, node->ord,
                                            sign * node->hits, sign);
        xfree(node);
    }
}
1465 
1466 #if FLUSH2
/** \brief writes insert and delete keys of one record to the key block
    \param zh Zebra Handle
    \param sysno system number of the record
    \param ins_keys keys to be inserted (0 if none)
    \param ins_rank static rank written with the insert keys
    \param del_keys keys to be deleted (0 if none)
    \param del_rank static rank written with the delete keys

    The key block is created on first use from the resources "memmax"
    (in megabytes, default 8), "keyTmpDir" (default ".") and "threads"
    (default 1).

    Occurrence counts are adjusted for both key sets. The record count
    is incremented when only insert keys are given and decremented when
    only delete keys are given.

    Both key sets are then read in lock step. A delete key and an insert
    key read in the same step that are identical (same key, same term,
    same rank) are skipped; everything else is written, delete first.
*/
static void extract_flush_record_keys2(
    ZebraHandle zh, zint sysno,
    zebra_rec_keys_t ins_keys, zint ins_rank,
    zebra_rec_keys_t del_keys, zint del_rank)
{
    ZebraExplainInfo zei = zh->reg->zei;
    int normal = 0;     /* steps where at least one key was written */
    int optimized = 0;  /* steps skipped because delete == insert */

    if (!zh->reg->key_block)
    {
        int mem = 1024*1024 * atoi(res_get_def(zh->res, "memmax", "8"));
        const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
        int use_threads = atoi(res_get_def(zh->res, "threads", "1"));

        zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
    }

    if (ins_keys)
    {
        extract_rec_keys_adjust(zh, 1, ins_keys);
        if (!del_keys)
            zebraExplain_recordCountIncrement(zei, 1);
        zebra_rec_keys_rewind(ins_keys);
    }
    if (del_keys)
    {
        extract_rec_keys_adjust(zh, 0, del_keys);
        if (!ins_keys)
            zebraExplain_recordCountIncrement(zei, -1);
        zebra_rec_keys_rewind(del_keys);
    }

    for (;;)
    {
        struct it_key del_key_in, ins_key_in;
        const char *del_str, *ins_str;
        size_t del_slen, ins_slen;
        int have_del = 0;
        int have_ins = 0;

        if (del_keys)
            have_del = zebra_rec_keys_read(del_keys, &del_str, &del_slen,
                                           &del_key_in);
        if (ins_keys)
            have_ins = zebra_rec_keys_read(ins_keys, &ins_str, &ins_slen,
                                           &ins_key_in);

        /* both key sets exhausted */
        if (!have_del && !have_ins)
            break;

        /* identical delete/insert pair: nothing to write */
        if (have_del && have_ins && ins_rank == del_rank
            && key_compare(&del_key_in, &ins_key_in) == 0
            && ins_slen == del_slen
            && memcmp(del_str, ins_str, del_slen) == 0)
        {
            optimized++;
            continue;
        }

        normal++;
        if (have_del)
            key_block_write(zh->reg->key_block, sysno,
                            &del_key_in, 0, del_str, del_slen,
                            del_rank, zh->m_staticrank);
        if (have_ins)
            key_block_write(zh->reg->key_block, sysno,
                            &ins_key_in, 1, ins_str, ins_slen,
                            ins_rank, zh->m_staticrank);
    }
    yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized);
}
1540 #else
/** \brief writes the keys of one record to the key block (non-FLUSH2)
    \param zh Zebra Handle
    \param sysno system number of the record
    \param cmd non-zero for insert; zero for delete
    \param reckeys keys to be written
    \param staticrank static rank written with every key

    Occurrence counts are adjusted, the keys are logged when
    log_level_details is set, the key block is created on first use
    (resources "memmax", "keyTmpDir", "threads"), the record count is
    incremented or decremented according to cmd, and finally every key
    is written with cmd as its insert/delete flag.
*/
static void extract_flush_record_keys(
    ZebraHandle zh, zint sysno, int cmd,
    zebra_rec_keys_t reckeys,
    zint staticrank)
{
    ZebraExplainInfo zei = zh->reg->zei;
    struct it_key key_in;
    const char *term;
    size_t term_len;

    extract_rec_keys_adjust(zh, cmd, reckeys);

    if (log_level_details)
    {
        const char *what = cmd ? "insert" : "delete";

        yaz_log(log_level_details, "Keys for record " ZINT_FORMAT " %s",
                sysno, what);
        extract_rec_keys_log(zh, cmd, reckeys, log_level_details);
    }

    if (!zh->reg->key_block)
    {
        int mem = 1024*1024 * atoi(res_get_def(zh->res, "memmax", "8"));
        const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
        int use_threads = atoi(res_get_def(zh->res, "threads", "1"));

        zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
    }

    if (cmd)
        zebraExplain_recordCountIncrement(zei, 1);
    else
        zebraExplain_recordCountIncrement(zei, -1);

#if 0
    yaz_log(YLOG_LOG, "sysno=" ZINT_FORMAT " cmd=%d", sysno, cmd);
    print_rec_keys(zh, reckeys);
#endif
    if (!zebra_rec_keys_rewind(reckeys))
        return;

    while (zebra_rec_keys_read(reckeys, &term, &term_len, &key_in))
        key_block_write(zh->reg->key_block, sysno,
                        &key_in, cmd, term, term_len,
                        staticrank, zh->m_staticrank);
}
1583 #endif
1584 
zebra_rec_keys_to_snippets1(ZebraHandle zh,zebra_rec_keys_t reckeys,zebra_snippets * snippets)1585 ZEBRA_RES zebra_rec_keys_to_snippets1(ZebraHandle zh,
1586                                      zebra_rec_keys_t reckeys,
1587                                      zebra_snippets *snippets)
1588 {
1589     NMEM nmem = nmem_create();
1590     if (zebra_rec_keys_rewind(reckeys))
1591     {
1592 	const char *str;
1593 	size_t slen;
1594 	struct it_key key;
1595 	while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1596 	{
1597 	    char *dst_term = 0;
1598 	    int ord;
1599             zint seqno;
1600 	    const char *index_type;
1601 
1602 	    assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1603 	    seqno = key.mem[key.len-1];
1604 	    ord = CAST_ZINT_TO_INT(key.mem[0]);
1605 
1606 	    zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1607 				    0/* db */, 0 /* string_index */);
1608 	    assert(index_type);
1609 	    zebra_term_untrans_iconv(zh, nmem, index_type,
1610 				     &dst_term, str);
1611 	    zebra_snippets_append(snippets, seqno, 0, ord, dst_term);
1612 	    nmem_reset(nmem);
1613 	}
1614     }
1615     nmem_destroy(nmem);
1616     return ZEBRA_OK;
1617 }
1618 
/** \brief logs ordinal, sequence number and term of every record key
    \param zh Zebra Handle
    \param reckeys keys to be logged

    Debugging aid; everything is logged with YLOG_LOG. The sequence
    number is taken from the last component of each key.
*/
void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys)
{
    struct it_key key;
    const char *term;
    size_t term_len;

    yaz_log(YLOG_LOG, "print_rec_keys");

    if (!zebra_rec_keys_rewind(reckeys))
        return;

    while (zebra_rec_keys_read(reckeys, &term, &term_len, &key))
    {
        char dst_buf[IT_MAX_WORD];
        const char *index_type;
        const char *db = 0;
        int ord = CAST_ZINT_TO_INT(key.mem[0]);
        zint seqno;

        assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);

        zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0);
        seqno = key.mem[key.len-1];

        zebra_term_untrans(zh, index_type, dst_buf, term);

        yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT
                " term=%s", ord, seqno, dst_buf);
    }
}
1647 
/** \brief adds one term to the index keys of the current record
    \param p word being indexed (index type/name, ids, seqno)
    \param cat index category that the term is filed under
    \param str term
    \param length length of term in bytes

    The ordinal is looked up for (cat, index type, index name) and
    created if the lookup returns a negative value. The key consists of
    ordinal, record id, section id, the segment (only when segment
    indexing is enabled for the handle) and the sequence number.
*/
static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat,
                                     const char *str, int length)
{
    ZebraHandle zh = p->extractCtrl->handle;
    ZebraExplainInfo zei = zh->reg->zei;
    struct it_key key;
    int ord;
    int n;

    ord = zebraExplain_lookup_attr_str(zei, cat, p->index_type,
                                       p->index_name);
    if (ord < 0)
        ord = zebraExplain_add_attr_str(zei, cat, p->index_type,
                                        p->index_name);

    key.mem[0] = ord;
    key.mem[1] = p->record_id;
    key.mem[2] = p->section_id;
    n = 3;
    if (zh->m_segment_indexing)
    {
        key.mem[n] = p->segment;
        n++;
    }
    key.mem[n] = p->seqno;
    key.len = n + 1;

    zebra_rec_keys_write(zh->reg->keys, str, length, &key);
}
1672 
/** \brief adds one term to the sort keys of the current record
    \param p word being indexed (index type/name, record and section id)
    \param str term
    \param length length of term in bytes

    The ordinal is looked up in category zinfo_index_category_sort and
    created if the lookup returns a negative value. The key has three
    components: ordinal, record id and section id.
*/
static void extract_add_sort_string(RecWord *p, const char *str, int length)
{
    const zinfo_index_category_t cat = zinfo_index_category_sort;
    ZebraHandle zh = p->extractCtrl->handle;
    ZebraExplainInfo zei = zh->reg->zei;
    struct it_key key;
    int ord = zebraExplain_lookup_attr_str(zei, cat, p->index_type,
                                           p->index_name);

    if (ord < 0)
        ord = zebraExplain_add_attr_str(zei, cat, p->index_type,
                                        p->index_name);

    key.mem[0] = ord;
    key.mem[1] = p->record_id;
    key.mem[2] = p->section_id;
    key.len = 3;

    zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key);
}
1691 
/** \brief sets the static rank of the current record from a term
    \param p word being indexed; its extract control receives the rank
    \param str term holding the rank as text (need not be NUL-terminated)
    \param length length of term in bytes

    At most sizeof(valz)-1 bytes of the term are copied into a local
    buffer, terminated, and converted with atozint. A negative length is
    treated as 0. (Comparing the int directly against sizeof() would
    convert a negative value to a huge unsigned one and copy 39 bytes.)
*/
static void extract_add_staticrank_string(RecWord *p,
                                          const char *str, int length)
{
    char valz[40];
    struct recExtractCtrl *ctrl = p->extractCtrl;
    size_t n = 0;

    if (length > 0)
        n = (size_t) length;
    if (n > sizeof(valz)-1)
        n = sizeof(valz)-1;

    memcpy(valz, str, n);
    valz[n] = '\0';
    ctrl->staticrank = atozint(valz);
}
1705 
/** \brief files one normalized term according to the kind of its map
    \param p word being indexed
    \param zm map for the index type of the word
    \param string term
    \param length length of term in bytes; must be positive (asserted)

    Nothing is done when the word has no index name. Otherwise:
    - index map: the term is added as an index key; if the map also has
      "alwaysmatches" set, an empty term with seqno 1 is added in the
      alwaysmatches category;
    - sort map: the term is added as a sort key;
    - staticrank map: the term sets the static rank of the record.
*/
static void extract_add_string(RecWord *p, zebra_map_t zm,
                               const char *string, int length)
{
    assert(length > 0);

    if (!p->index_name)
        return;

    if (log_level_details)
    {
        WRBUF escaped = wrbuf_alloc();

        wrbuf_write_escaped(escaped, string, length);
        yaz_log(log_level_details, "extract_add_string: %s",
                wrbuf_cstr(escaped));
        wrbuf_destroy(escaped);
    }

    if (zebra_maps_is_index(zm))
    {
        extract_add_index_string(p, zinfo_index_category_index,
                                 string, length);
        if (zebra_maps_is_alwaysmatches(zm))
        {
            /* work on a copy so that the caller's seqno is untouched */
            RecWord word = *p;

            word.seqno = 1;
            extract_add_index_string(
                &word, zinfo_index_category_alwaysmatches, "", 0);
        }
        return;
    }

    if (zebra_maps_is_sort(zm))
        extract_add_sort_string(p, string, length);
    else if (zebra_maps_is_staticrank(zm))
        extract_add_staticrank_string(p, string, length);
}
1745 
/** \brief splits a field into words and adds each word
    \param p word holding the field in term_buf/term_len; its seqno is
           incremented for every item added
    \param zm map used to normalize the input

    The field is fed through zebra_maps_input piece by piece. Pieces
    mapping to CHR_SPACE separate words; the mapped strings of all other
    pieces are concatenated into a word of at most IT_MAX_WORD bytes
    (excess bytes are dropped). Before the first word a
    FIRST_IN_FIELD_STR marker is added if the map asks for it.
    Processing stops when the input is used up or a word turns out
    empty.
*/
static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm)
{
    const char *cursor = p->term_buf;
    const char **map = 0;
    int at_first_word = 1;

    if (p->term_len > 0)
        map = zebra_maps_input(zm, &cursor, p->term_len, 0);

    while (map)
    {
        char word[IT_MAX_WORD+1];
        int word_len = 0;
        int left;

        /* Skip spaces */
        while (map && *map && **map == *CHR_SPACE)
        {
            left = p->term_len - (cursor - p->term_buf);
            map = left > 0 ? zebra_maps_input(zm, &cursor, left, 0) : 0;
        }
        if (!map)
            break;

        /* Collect everything up to the next space or end of input */
        while (map && *map && **map != *CHR_SPACE)
        {
            const char *src;

            for (src = *map; word_len < IT_MAX_WORD && *src; src++)
                word[word_len++] = *src;

            left = p->term_len - (cursor - p->term_buf);
            map = left > 0 ? zebra_maps_input(zm, &cursor, left, 0) : 0;
        }
        if (word_len == 0)
            return;

        if (at_first_word)
        {
            at_first_word = 0;
            if (zebra_maps_is_first_in_field(zm))
            {
                /* first in field marker */
                extract_add_string(p, zm, FIRST_IN_FIELD_STR,
                                   FIRST_IN_FIELD_LEN);
                p->seqno++;
            }
        }
        extract_add_string(p, zm, word, word_len);
        p->seqno++;
    }
}
1802 
/** \brief adds a whole field as a single term
    \param p word holding the field; seqno is incremented if a term is
           added
    \param zm map used by parse_complete_field

    Nothing is added when parse_complete_field returns 0.
*/
static void extract_add_complete_field(RecWord *p, zebra_map_t zm)
{
    char term[IT_MAX_WORD+1];
    int term_len = parse_complete_field(p, zm, term);

    if (term_len)
    {
        extract_add_string(p, zm, term, term_len);
        p->seqno++;
    }
}
1812 
/** \brief tokenizes a field with the map's tokenizer and adds each token
    \param p word holding the field in term_buf/term_len; seqno is
           incremented for every token
    \param zm map providing zebra_map_tokenize_start/next

    Tokens longer than IT_MAX_WORD bytes are cut to IT_MAX_WORD bytes
    and the original length is logged.
*/
static void extract_add_icu(RecWord *p, zebra_map_t zm)
{
    const char *token = 0;
    size_t token_len = 0;

    zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
    for (;;)
    {
        if (!zebra_map_tokenize_next(zm, &token, &token_len, 0, 0))
            break;

        if (token_len > IT_MAX_WORD)
        {
            yaz_log(YLOG_LOG, "Truncating long term %ld", (long) token_len);
            token_len = IT_MAX_WORD;
        }
        extract_add_string(p, zm, token, token_len);
        p->seqno++;
    }
}
1830 
1831 
1832 /** \brief top-level indexing handler for recctrl system
1833     \param p token data to be indexed
1834 
1835     Call sequence:
1836     extract_token_add
1837     extract_add_{in}_complete / extract_add_icu
1838     extract_add_string
1839 
1840     extract_add_index_string
1841     or
1842     extract_add_sort_string
1843     or
1844     extract_add_staticrank_string
1845 
1846 */
extract_token_add(RecWord * p)1847 static void extract_token_add(RecWord *p)
1848 {
1849     ZebraHandle zh = p->extractCtrl->handle;
1850     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
1851 
1852     if (log_level_details)
1853     {
1854         yaz_log(log_level_details, "extract_token_add "
1855                 "type=%s index=%s seqno=" ZINT_FORMAT " s=%.*s",
1856                 p->index_type, p->index_name,
1857                 p->seqno, p->term_len, p->term_buf);
1858     }
1859     if (zebra_maps_is_icu(zm))
1860     {
1861         extract_add_icu(p, zm);
1862     }
1863     else
1864     {
1865         if (zebra_maps_is_complete(zm))
1866             extract_add_complete_field(p, zm);
1867         else
1868             extract_add_incomplete_field(p, zm);
1869     }
1870 }
1871 
extract_set_store_data_cb(struct recExtractCtrl * p,void * buf,size_t sz)1872 static void extract_set_store_data_cb(struct recExtractCtrl *p,
1873 				      void *buf, size_t sz)
1874 {
1875     ZebraHandle zh = (ZebraHandle) p->handle;
1876 
1877     xfree(zh->store_data_buf);
1878     zh->store_data_buf = 0;
1879     zh->store_data_size = 0;
1880     if (buf && sz)
1881     {
1882 	zh->store_data_buf = xmalloc(sz);
1883 	zh->store_data_size = sz;
1884 	memcpy(zh->store_data_buf, buf, sz);
1885     }
1886 }
1887 
extract_set_store_data_prepare(struct recExtractCtrl * p)1888 static void extract_set_store_data_prepare(struct recExtractCtrl *p)
1889 {
1890     ZebraHandle zh = (ZebraHandle) p->handle;
1891     xfree(zh->store_data_buf);
1892     zh->store_data_buf = 0;
1893     zh->store_data_size = 0;
1894     p->setStoreData = extract_set_store_data_cb;
1895 }
1896 
/** \brief passes a schema OID on to the explain information of the register
    \param p extract control; p->handle is the Zebra handle
    \param oid schema OID handed to zebraExplain_addSchema
*/
static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid)
{
    zebraExplain_addSchema(((ZebraHandle) p->handle)->reg->zei, oid);
}
1902 
/** \brief flushes the sort keys of a record to the sort index
    \param zh Zebra handle; zh->reg->sort_index is updated
    \param sysno system number used for keys that carry no sysno themselves
    \param cmd 1 adds the collected values; any other value deletes them
    \param reckeys record keys to be read

    Nothing happens when zebra_rec_keys_rewind reports no keys.

    The keys are first grouped by (ordinal, section id): mem[0] of a key
    is the ordinal, mem[1] an optional sysno and mem[2] the section id.
    The strings of one group are concatenated, each followed by a NUL
    byte. The sysno of a group is decided by the first key seen for it.
    The groups are then written to the sort index in the order they were
    first encountered.
*/
void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
                             int cmd, zebra_rec_keys_t reckeys)
{
    struct sort_add_ent {
        int ord;
        int cmd;
        struct sort_add_ent *next;
        WRBUF wrbuf;
        zint sysno;
        zint section_id;
    };
    struct sort_add_ent *head = 0;
    struct sort_add_ent *ent;
    zebra_sort_index_t sort_index;
    NMEM mem;
    const char *term;
    size_t term_len;
    struct it_key key;
    zint cur_sysno = 0;

#if 0
    yaz_log(YLOG_LOG, "extract_flush_sort_keys cmd=%d sysno=" ZINT_FORMAT,
            cmd, sysno);
    extract_rec_keys_log(zh, cmd, reckeys, YLOG_LOG);
#endif

    if (!zebra_rec_keys_rewind(reckeys))
        return;

    sort_index = zh->reg->sort_index;
    mem = nmem_create();

    /* pass 1: collect the strings of every (ordinal, section id) pair */
    while (zebra_rec_keys_read(reckeys, &term, &term_len, &key))
    {
        int ord = CAST_ZINT_TO_INT(key.mem[0]);
        zint key_sysno = key.mem[1];
        zint section_id = key.mem[2];
        struct sort_add_ent **link = &head;

        /* find the matching group, or stop at the end of the list */
        while (*link &&
               !((*link)->ord == ord && (*link)->section_id == section_id))
            link = &(*link)->next;

        if (!*link)
        {
            /* new group: append it so first-seen order is preserved */
            ent = nmem_malloc(mem, sizeof(*ent));
            *link = ent;
            ent->next = 0;
            ent->wrbuf = wrbuf_alloc();
            ent->ord = ord;
            ent->cmd = cmd;
            if (key_sysno)
                ent->sysno = key_sysno;
            else
                ent->sysno = sysno;
            ent->section_id = section_id;
        }

        wrbuf_write((*link)->wrbuf, term, term_len);
        wrbuf_putc((*link)->wrbuf, '\0');
    }

    /* pass 2: hand every group to the sort index and release its buffer */
    for (ent = head; ent; ent = ent->next)
    {
        /* only switch sysno when it differs from the previous group */
        if (ent->sysno != cur_sysno)
        {
            zebra_sort_sysno(sort_index, ent->sysno);
            cur_sysno = ent->sysno;
        }
        zebra_sort_type(sort_index, ent->ord);
        if (ent->cmd == 1)
            zebra_sort_add(sort_index, ent->section_id, ent->wrbuf);
        else
            zebra_sort_delete(sort_index, ent->section_id);
        wrbuf_destroy(ent->wrbuf);
    }
    nmem_destroy(mem);
}
1976 
1977 /*
1978  * Local variables:
1979  * c-basic-offset: 4
1980  * c-file-style: "Stroustrup"
1981  * indent-tabs-mode: nil
1982  * End:
1983  * vim: shiftwidth=4 tabstop=8 expandtab
1984  */
1985 
1986