1 /* This file is part of the Zebra server.
2 Copyright (C) 2004-2013 Index Data
3
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17
18 */
19
20
21 /*
22 * This module reads "loose" SGML and converts it to data1 tree
23 */
24
25 #if HAVE_CONFIG_H
26 #include <config.h>
27 #endif
28 #include <assert.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31
32 #include <yaz/yaz-util.h>
33 #include <d1_absyn.h>
34
/*
 * Return the node that acts as the top-level tag of the tree at n.
 *
 * In XML mode (data1_is_xmlmode(dh) is true) that is the first child of n
 * whose type is DATA1N_tag, or 0 when n has no such child. Outside XML mode
 * n itself is returned. A null n always yields 0.
 */
data1_node *data1_get_root_tag (data1_handle dh, data1_node *n)
{
    data1_node *cur;

    if (!n || !data1_is_xmlmode(dh))
        return n;

    for (cur = n->child; cur; cur = cur->next)
        if (cur->which == DATA1N_tag)
            return cur;
    return 0;
}
47
48 /*
49 * get the tag which is the immediate parent of this node (this may mean
50 * traversing intermediate things like variants and stuff.
51 */
get_parent_tag(data1_handle dh,data1_node * n)52 data1_node *get_parent_tag (data1_handle dh, data1_node *n)
53 {
54 if (data1_is_xmlmode(dh))
55 {
56 for (; n && n->which != DATA1N_root; n = n->parent)
57 if (n->which == DATA1N_tag && n->parent &&
58 n->parent->which != DATA1N_root)
59 return n;
60 }
61 else
62 {
63 for (; n && n->which != DATA1N_root; n = n->parent)
64 if (n->which == DATA1N_tag)
65 return n;
66 }
67 return 0;
68 }
69
data1_mk_node(data1_handle dh,NMEM m)70 data1_node *data1_mk_node (data1_handle dh, NMEM m)
71 {
72 return data1_mk_node2 (dh, m, DATA1N_root, 0);
73 }
74
data1_mk_node_type(data1_handle dh,NMEM m,int type)75 data1_node *data1_mk_node_type (data1_handle dh, NMEM m, int type)
76 {
77 return data1_mk_node2 (dh, m, type, 0);
78 }
79
/*
 * Store type in r->which and reset the member of the r->u union that
 * belongs to that type. For a type that is not handled here only r->which
 * is stored and a warning is logged.
 */
static void data1_init_node (data1_handle dh, data1_node *r, int type)
{
    r->which = type;

    if (type == DATA1N_tag)
    {
        r->u.tag.tag = 0;
        r->u.tag.element = 0;
        r->u.tag.no_data_requested = 0;
        r->u.tag.node_selected = 0;
        r->u.tag.make_variantlist = 0;
        r->u.tag.get_bytes = -1;
        r->u.tag.attributes = 0;
    }
    else if (type == DATA1N_root)
    {
        r->u.root.type = 0;
        r->u.root.absyn = 0;
    }
    else if (type == DATA1N_data || type == DATA1N_comment)
    {
        r->u.data.data = 0;
        r->u.data.len = 0;
        r->u.data.what = 0;
        /* comments start out flagged as formatted text, plain data not */
        r->u.data.formatted_text = (type == DATA1N_comment) ? 1 : 0;
    }
    else if (type == DATA1N_variant)
    {
        r->u.variant.type = 0;
        r->u.variant.value = 0;
    }
    else if (type == DATA1N_preprocess)
    {
        r->u.preprocess.target = 0;
        r->u.preprocess.attributes = 0;
    }
    else
    {
        yaz_log (YLOG_WARN, "data_mk_node_type. bad type = %d\n", type);
    }
}
122
/*
 * Allocate a node of the given type in m and link it in as the LAST child
 * of parent. With a null parent the node becomes its own root.
 * Returns the new node, initialised by data1_init_node().
 */
data1_node *data1_append_node (data1_handle dh, NMEM m, int type,
                               data1_node *parent)
{
    data1_node *node = (data1_node *) nmem_malloc(m, sizeof(*node));

    node->next = 0;
    node->child = 0;
    node->last_child = 0;
    node->parent = parent;

    if (parent)
    {
        node->root = parent->root;
        if (parent->child)
            parent->last_child->next = node;
        else
            parent->child = node;
        parent->last_child = node;
    }
    else
    {
        node->root = node;
    }

    data1_init_node(dh, node, type);
    return node;
}
144
/*
 * Create a node of the given type as the last child of parent (or as a new
 * root when parent is null). Same operation as data1_append_node(), to
 * which this forwards.
 */
data1_node *data1_mk_node2 (data1_handle dh, NMEM m, int type,
                            data1_node *parent)
{
    data1_node *fresh = data1_append_node (dh, m, type, parent);

    return fresh;
}
150
/*
 * Allocate a node of the given type in m and link it in as the FIRST child
 * of parent (data1_append_node() adds at the end instead). With a null
 * parent the node becomes its own root.
 * Returns the new node, initialised by data1_init_node().
 */
data1_node *data1_insert_node (data1_handle dh, NMEM m, int type,
                               data1_node *parent)
{
    data1_node *r = (data1_node *)nmem_malloc(m, sizeof(*r));
    r->next = r->child = r->last_child = 0;

    /*
     * Set the parent link unconditionally, as data1_append_node() does.
     * It used to be assigned only when a parent was given, which left a
     * parentless node with an uninitialised parent pointer.
     */
    r->parent = parent;
    if (!parent)
        r->root = r;
    else
    {
        r->root = parent->root;
        if (!parent->child)
            parent->last_child = r;   /* only child: it is also the last */
        else
            r->next = parent->child;  /* old first child follows the new one */
        parent->child = r;
    }
    data1_init_node(dh, r, type);
    return r;
}
172
data1_mk_root(data1_handle dh,NMEM nmem,const char * name)173 data1_node *data1_mk_root (data1_handle dh, NMEM nmem, const char *name)
174 {
175 data1_absyn *absyn = data1_get_absyn(dh, name, 1);
176 data1_node *res;
177
178 if (!absyn)
179 {
180 yaz_log(YLOG_WARN, "Unable to acquire abstract syntax " "for '%s'",
181 name);
182 /* It's now OK for a record not to have an absyn */
183 }
184 res = data1_mk_node2 (dh, nmem, DATA1N_root, 0);
185 res->u.root.type = data1_insert_string (dh, res, nmem, name);
186 res->u.root.absyn = absyn;
187 return res;
188 }
189
/*
 * Re-label an existing root node: look up the abstract syntax for name with
 * data1_get_absyn() (passing DATA1_XPATH_INDEXING_ENABLE) and store both
 * the syntax, which may be null, and a copy of name in res.
 */
void data1_set_root(data1_handle dh, data1_node *res,
                    NMEM nmem, const char *name)
{
    data1_absyn *syntax;

    syntax = data1_get_absyn(dh, name, DATA1_XPATH_INDEXING_ENABLE);
    res->u.root.absyn = syntax;
    res->u.root.type = data1_insert_string (dh, res, nmem, name);
}
199
/*
 * Append attributes to the list *p.
 *
 * attr is a null-terminated array laid out as name, value, name, value...
 * Each pair is copied into nmem as a new data1_xattr whose what field is
 * DATA1I_text, and is linked after the last entry already in the list.
 * A null attr adds nothing.
 */
void data1_add_attrs(data1_handle dh, NMEM nmem, const char **attr,
                     data1_xattr **p)
{
    data1_xattr **tail = p;

    /* locate the link that terminates the existing list */
    for (; *tail; tail = &(*tail)->next)
        ;

    if (attr)
    {
        for (; *attr; attr += 2)
        {
            data1_xattr *item =
                (data1_xattr *) nmem_malloc (nmem, sizeof(*item));

            item->name = nmem_strdup (nmem, attr[0]);
            item->value = nmem_strdup (nmem, attr[1]);
            item->what = DATA1I_text;

            *tail = item;
            tail = &item->next;
        }
    }
    *tail = 0;
}
217
/*
 * Append a preprocess node with the given target and attributes below at.
 * target must be a null-terminated string; its length is measured and the
 * work is handed to data1_mk_preprocess_n().
 */
data1_node *data1_mk_preprocess (data1_handle dh, NMEM nmem,
                                 const char *target,
                                 const char **attr, data1_node *at)
{
    size_t target_len = strlen(target);

    return data1_mk_preprocess_n (dh, nmem, target, target_len, attr, at);
}
225
/*
 * Append a preprocess node as the last child of at. The first len bytes of
 * target become the node's target string and the name/value pairs in attr
 * (may be null) become its attribute list. Returns the new node.
 */
data1_node *data1_mk_preprocess_n (data1_handle dh, NMEM nmem,
                                   const char *target, size_t len,
                                   const char **attr, data1_node *at)
{
    data1_node *pi = data1_mk_node2 (dh, nmem, DATA1N_preprocess, at);
    char *name = data1_insert_string_n (dh, pi, nmem, target, len);

    pi->u.preprocess.target = name;
    data1_add_attrs(dh, nmem, attr, &pi->u.preprocess.attributes);
    return pi;
}
237
/*
 * Insert a preprocess node with the given target and attributes as the
 * first child of at. target must be a null-terminated string; its length
 * is measured and the work is handed to data1_insert_preprocess_n().
 */
data1_node *data1_insert_preprocess (data1_handle dh, NMEM nmem,
                                     const char *target,
                                     const char **attr, data1_node *at)
{
    size_t target_len = strlen(target);

    return data1_insert_preprocess_n (dh, nmem, target, target_len, attr, at);
}
245
/*
 * Insert a preprocess node as the FIRST child of at. The first len bytes
 * of target become the node's target string and the name/value pairs in
 * attr (may be null) become its attribute list. Returns the new node.
 */
data1_node *data1_insert_preprocess_n (data1_handle dh, NMEM nmem,
                                       const char *target, size_t len,
                                       const char **attr, data1_node *at)
{
    data1_node *pi = data1_insert_node (dh, nmem, DATA1N_preprocess, at);
    char *name = data1_insert_string_n (dh, pi, nmem, target, len);

    pi->u.preprocess.target = name;
    data1_add_attrs(dh, nmem, attr, &pi->u.preprocess.attributes);
    return pi;
}
257
/*
 * Append a tag node below at. The tag name is the first len bytes of tag
 * and attr (may be null) holds name/value pairs for its attribute list.
 *
 * The tag's element is resolved with data1_getelementbytagname() against
 * the abstract syntax of at's root: from the top level when at has no
 * enclosing tag, otherwise relative to the enclosing tag's element. When
 * the enclosing tag has no element, the new tag gets none either.
 */
data1_node *data1_mk_tag_n (data1_handle dh, NMEM nmem,
                            const char *tag, size_t len, const char **attr,
                            data1_node *at)
{
    data1_node *enclosing = get_parent_tag(dh, at);
    data1_node *node = data1_mk_node2 (dh, nmem, DATA1N_tag, at);
    data1_element *elem;

    node->u.tag.tag = data1_insert_string_n (dh, node, nmem, tag, len);

    if (enclosing)
    {
        /* only tags below a known element can be known themselves */
        elem = enclosing->u.tag.element;
        if (elem)
            elem = data1_getelementbytagname (dh, at->root->u.root.absyn,
                                              elem, node->u.tag.tag);
    }
    else
    {
        /* top tag */
        elem = data1_getelementbytagname (dh, at->root->u.root.absyn,
                                          0 /* index as local */,
                                          node->u.tag.tag);
    }
    node->u.tag.element = elem;
    data1_add_attrs(dh, nmem, attr, &node->u.tag.attributes);
    return node;
}
284
/*
 * Append the name/value pairs in attr to the attribute list of the tag
 * node res. Nodes of any other type are left untouched.
 */
void data1_tag_add_attr (data1_handle dh, NMEM nmem,
                         data1_node *res, const char **attr)
{
    if (res->which == DATA1N_tag)
        data1_add_attrs(dh, nmem, attr, &res->u.tag.attributes);
}
293
/*
 * Append a tag node named by the null-terminated string tag below at.
 * The name length is measured and the work is handed to data1_mk_tag_n().
 */
data1_node *data1_mk_tag (data1_handle dh, NMEM nmem,
                          const char *tag, const char **attr, data1_node *at)
{
    size_t tag_len = strlen(tag);

    return data1_mk_tag_n (dh, nmem, tag, tag_len, attr, at);
}
299
/*
 * Search n and its following siblings for a tag node whose name matches
 * tag according to yaz_matchstr().
 *
 * A leading '/' in tag makes the search start among the children of the
 * node returned by data1_get_root_tag() for n; the '/' itself is not part
 * of the name. Returns the first match or 0.
 */
data1_node *data1_search_tag (data1_handle dh, data1_node *n,
                              const char *tag)
{
    data1_node *cur = n;
    const char *wanted = tag;

    if (wanted[0] == '/')
    {
        data1_node *top = data1_get_root_tag (dh, cur);

        cur = top ? top->child : 0;
        wanted++;
    }
    while (cur)
    {
        if (cur->which == DATA1N_tag && cur->u.tag.tag
            && yaz_matchstr (cur->u.tag.tag, wanted) == 0)
            return cur;
        cur = cur->next;
    }
    return 0;
}
318
/*
 * Return a tag named tag below at, making sure it has no children: an
 * existing tag found by data1_search_tag() among at's children is reused
 * after its child list is detached; otherwise a new tag without attributes
 * is appended.
 */
data1_node *data1_mk_tag_uni (data1_handle dh, NMEM nmem,
                              const char *tag, data1_node *at)
{
    data1_node *existing = data1_search_tag (dh, at->child, tag);

    if (existing)
    {
        /* reuse the tag, but drop whatever it contained */
        existing->last_child = 0;
        existing->child = 0;
        return existing;
    }
    return data1_mk_tag (dh, nmem, tag, 0 /* attr */, at);
}
329
/*
 * Append a data node below parent holding a copy of the first len bytes of
 * buf, marked as DATA1I_text. Returns the new node.
 */
data1_node *data1_mk_text_n (data1_handle dh, NMEM mem,
                             const char *buf, size_t len, data1_node *parent)
{
    data1_node *text = data1_mk_node2 (dh, mem, DATA1N_data, parent);

    text->u.data.data = data1_insert_string_n (dh, text, mem, buf, len);
    text->u.data.len = len;
    text->u.data.what = DATA1I_text;
    return text;
}
340
/*
 * Same as data1_mk_text_n(), but the resulting data node additionally has
 * its formatted_text flag set.
 */
data1_node *data1_mk_text_nf (data1_handle dh, NMEM mem,
                              const char *buf, size_t len, data1_node *parent)
{
    data1_node *text = data1_mk_text_n (dh, mem, buf, len, parent);

    text->u.data.formatted_text = 1;
    return text;
}
348
/*
 * Append a text data node below parent holding a copy of the
 * null-terminated string buf. Forwards to data1_mk_text_n().
 */
data1_node *data1_mk_text (data1_handle dh, NMEM mem,
                           const char *buf, data1_node *parent)
{
    size_t buf_len = strlen(buf);

    return data1_mk_text_n (dh, mem, buf, buf_len, parent);
}
354
/*
 * Append a comment node below parent holding a copy of the first len bytes
 * of buf, marked as DATA1I_text. Returns the new node.
 */
data1_node *data1_mk_comment_n (data1_handle dh, NMEM mem,
                                const char *buf, size_t len,
                                data1_node *parent)
{
    data1_node *comment = data1_mk_node2 (dh, mem, DATA1N_comment, parent);

    comment->u.data.data = data1_insert_string_n (dh, comment, mem, buf, len);
    comment->u.data.len = len;
    comment->u.data.what = DATA1I_text;
    return comment;
}
366
/*
 * Append a comment node below parent holding a copy of the
 * null-terminated string buf. Forwards to data1_mk_comment_n().
 */
data1_node *data1_mk_comment (data1_handle dh, NMEM mem,
                              const char *buf, data1_node *parent)
{
    size_t buf_len = strlen(buf);

    return data1_mk_comment_n (dh, mem, buf, buf_len, parent);
}
372
/*
 * Copy the first len bytes of str into storage associated with node res
 * and null-terminate the copy.
 *
 * Strings shorter than DATA1_LOCALDATA are placed in the node's own lbuf,
 * overwriting what was there; longer ones are allocated from m.
 * Returns a pointer to the copy.
 */
char *data1_insert_string_n (data1_handle dh, data1_node *res,
                             NMEM m, const char *str, size_t len)
{
    char *dst = (len < DATA1_LOCALDATA)
        ? res->lbuf
        : (char *) nmem_malloc (m, len + 1);

    memcpy (dst, str, len);
    dst[len] = '\0';
    return dst;
}
385
/*
 * Copy the null-terminated string str into storage associated with node
 * res. Forwards to data1_insert_string_n() with the measured length.
 */
char *data1_insert_string (data1_handle dh, data1_node *res,
                           NMEM m, const char *str)
{
    size_t str_len = strlen(str);

    return data1_insert_string_n (dh, res, m, str, str_len);
}
391
/*
 * Create a tag named tagname below at together with an empty data node
 * below that tag, and return the data node.
 *
 * The element for the tag is looked up in the abstract syntax of at's
 * root, from the top level when at has no enclosing tag and otherwise
 * relative to the enclosing tag's element. When no element is found the
 * tag is only created if local_allowed is non-zero; otherwise nothing is
 * created and 0 is returned.
 *
 * insert_mode non-zero places the tag first among at's children, zero
 * places it last.
 */
static data1_node *data1_add_insert_taggeddata(data1_handle dh,
                                               data1_node *at,
                                               const char *tagname, NMEM m,
                                               int local_allowed,
                                               int insert_mode)
{
    data1_node *root = at->root;
    data1_node *enclosing = get_parent_tag (dh, at);
    data1_element *elem = 0;
    data1_node *tag_node;

    if (enclosing)
    {
        elem = enclosing->u.tag.element;
        if (elem)
            elem = data1_getelementbytagname (dh, root->u.root.absyn,
                                              elem, tagname);
    }
    else
    {
        elem = data1_getelementbytagname (dh, root->u.root.absyn, 0, tagname);
    }

    /* unknown element and local tags not permitted: create nothing */
    if (!local_allowed && !elem)
        return 0;

    tag_node = insert_mode
        ? data1_insert_node (dh, m, DATA1N_tag, at)
        : data1_append_node (dh, m, DATA1N_tag, at);
    tag_node->u.tag.tag = data1_insert_string (dh, tag_node, m, tagname);
    tag_node->u.tag.element = elem;
    return data1_mk_node2 (dh, m, DATA1N_data, tag_node);
}
424
/*
 * Append a tag named tagname as the last child of at and return the empty
 * data node created below it. Tags unknown to the abstract syntax are
 * accepted (local_allowed = 1, insert_mode = 0).
 */
data1_node *data1_mk_tag_data(data1_handle dh, data1_node *at,
                              const char *tagname, NMEM m)
{
    const int local_allowed = 1;
    const int insert_mode = 0;

    return data1_add_insert_taggeddata (dh, at, tagname, m,
                                        local_allowed, insert_mode);
}
430
431
432 /*
433 * Insert a tagged node into the record root as first child of the node at
434 * which should be root or tag itself). Returns pointer to the data node,
435 * which can then be modified.
436 */
data1_mk_tag_data_wd(data1_handle dh,data1_node * at,const char * tagname,NMEM m)437 data1_node *data1_mk_tag_data_wd(data1_handle dh, data1_node *at,
438 const char *tagname, NMEM m)
439 {
440 return data1_add_insert_taggeddata (dh, at, tagname, m, 0, 1);
441 }
442
/*
 * Insert a tag named tagname as the first child of at and return the empty
 * data node below it, or 0 when the tag is unknown to the abstract syntax
 * (local_allowed = 0, insert_mode = 1). The root argument is not used.
 */
data1_node *data1_insert_taggeddata (data1_handle dh, data1_node *root,
                                     data1_node *at, const char *tagname,
                                     NMEM m)
{
    const int local_allowed = 0;
    const int insert_mode = 1;

    return data1_add_insert_taggeddata (dh, at, tagname, m,
                                        local_allowed, insert_mode);
}
449
/*
 * Append a tag named tagname as the last child of at and return the empty
 * data node below it. Tags unknown to the abstract syntax are accepted
 * (local_allowed = 1, insert_mode = 0). The root argument is not used.
 */
data1_node *data1_add_taggeddata (data1_handle dh, data1_node *root,
                                  data1_node *at, const char *tagname,
                                  NMEM m)
{
    const int local_allowed = 1;
    const int insert_mode = 0;

    return data1_add_insert_taggeddata (dh, at, tagname, m,
                                        local_allowed, insert_mode);
}
456
/*
 * Append a tag named tag below at whose data node holds num printed with
 * ZINT_FORMAT and marked as DATA1I_num. The text is written into the data
 * node's own lbuf. Returns the data node, or 0 when data1_mk_tag_data()
 * returns none.
 */
data1_node *data1_mk_tag_data_zint (data1_handle dh, data1_node *at,
                                    const char *tag, zint num,
                                    NMEM nmem)
{
    data1_node *num_node = data1_mk_tag_data (dh, at, tag, nmem);

    if (num_node)
    {
        char *text = num_node->lbuf;

        sprintf (text, ZINT_FORMAT, num);
        num_node->u.data.data = text;
        num_node->u.data.len = strlen (text);
        num_node->u.data.what = DATA1I_num;
    }
    return num_node;
}
472
/*
 * int flavour of data1_mk_tag_data_zint(): num is widened to zint and the
 * call is forwarded.
 */
data1_node *data1_mk_tag_data_int (data1_handle dh, data1_node *at,
                                   const char *tag, int num,
                                   NMEM nmem)
{
    zint wide = num;

    return data1_mk_tag_data_zint(dh, at, tag, wide, nmem);
}
479
/*
 * Append a tag named tag below at whose data node holds oid in dotted
 * decimal form ("1.2.840..."), marked as DATA1I_oid.
 *
 * oid is an array of arcs terminated by a negative value. An empty (or
 * null) oid yields an empty string. Returns the data node, or 0 when
 * data1_mk_tag_data() returns none.
 */
data1_node *data1_mk_tag_data_oid (data1_handle dh, data1_node *at,
                                   const char *tag, Odr_oid *oid,
                                   NMEM nmem)
{
    data1_node *node_data;
    char str[128];
    size_t used = 0;
    Odr_oid *ii;

    node_data = data1_mk_tag_data (dh, at, tag, nmem);
    if (!node_data)
        return 0;

    /*
     * Start from an empty string: with no arcs the buffer used to be
     * passed to strlen() uninitialised.
     */
    str[0] = '\0';
    for (ii = oid; ii && *ii >= 0; ii++)
    {
        size_t room = sizeof(str) - used;
        int n = snprintf (str + used, room, "%s%d",
                          ii == oid ? "" : ".", (int) *ii);

        /* never write past str; drop the arc that does not fit and stop */
        if (n < 0 || (size_t) n >= room)
        {
            str[used] = '\0';
            yaz_log (YLOG_WARN,
                     "data1_mk_tag_data_oid: OID too long, truncated");
            break;
        }
        used += (size_t) n;
    }
    node_data->u.data.what = DATA1I_oid;
    node_data->u.data.len = used;
    node_data->u.data.data = data1_insert_string_n (dh, node_data, nmem,
                                                    str, used);
    return node_data;
}
504
505
/*
 * Append a tag named tag below at whose data node holds a copy of the
 * null-terminated string str, marked as DATA1I_text. Returns the data
 * node, or 0 when data1_mk_tag_data() returns none.
 */
data1_node *data1_mk_tag_data_text (data1_handle dh, data1_node *at,
                                    const char *tag, const char *str,
                                    NMEM nmem)
{
    data1_node *text_node = data1_mk_tag_data (dh, at, tag, nmem);

    if (text_node)
    {
        text_node->u.data.what = DATA1I_text;
        text_node->u.data.len = strlen (str);
        text_node->u.data.data =
            data1_insert_string (dh, text_node, nmem, str);
    }
    return text_node;
}
520
521
/*
 * Make sure at has a child tag named tag whose data node holds a copy of
 * the null-terminated string str (DATA1I_text), and return that data node.
 *
 * When no such tag exists it is created with data1_mk_tag_data_text().
 * When it exists, its first child is reused as the data node and that
 * node's own children are detached. If the existing tag has no children
 * (data1_mk_tag_uni() leaves tags in that state) or its first child is not
 * a data node, the tag's children are replaced by a fresh data node.
 */
data1_node *data1_mk_tag_data_text_uni (data1_handle dh, data1_node *at,
                                        const char *tag, const char *str,
                                        NMEM nmem)
{
    data1_node *node = data1_search_tag (dh, at->child, tag);
    data1_node *node_data;

    if (!node)
        return data1_mk_tag_data_text (dh, at, tag, str, nmem);

    node_data = node->child;
    if (!node_data || node_data->which != DATA1N_data)
    {
        /*
         * Nothing usable to overwrite: the first child used to be
         * dereferenced (and its union written as data) unconditionally.
         */
        node->child = node->last_child = 0;
        node_data = data1_mk_node2 (dh, nmem, DATA1N_data, node);
    }
    node_data->u.data.what = DATA1I_text;
    node_data->u.data.len = strlen (str);
    node_data->u.data.data = data1_insert_string (dh, node_data,
                                                  nmem, str);
    node_data->child = node_data->last_child = 0;
    return node_data;
}
540
/*
 * Fetch the next byte from the input through get_byte(fh).
 *
 * *amp reports whether the returned character is the result of decoding
 * an entity. Entity decoding is compiled out (see the disabled section
 * below), so *amp is always set to 0 and the byte is returned unchanged.
 */
static int ampr (int (*get_byte)(void *fh), void *fh, int *amp)
{
    int c = (*get_byte)(fh);

    *amp = 0;
#if 0
    /* disabled: decode &quot; &apos; &gt; &lt; &amp; into one character */
    if (c == '&')
    {
        char ent[20];
        int i = 0;

        while (1)
        {
            c = (*get_byte)(fh);
            if (c == ';')
            {
                ent[i] = 0;

                c = ' ';
                if (!strcmp (ent, "quot"))
                    c = '"';
                if (!strcmp (ent, "apos"))
                    c = '\'';
                if (!strcmp (ent, "gt"))
                    c = '>';
                if (!strcmp (ent, "lt"))
                    c = '<';
                if (!strcmp (ent, "amp"))
                    c = '&';
                *amp = 1;
                break;
            }
            else if (c == 0 || d1_isspace(c))
                break;
            if (i < 19)
                ent[i++] = c;
        }
    }
#endif
    return c;
}
585
data1_read_xattr(data1_handle dh,NMEM m,int (* get_byte)(void * fh),void * fh,WRBUF wrbuf,int * ch,int * amp)586 data1_xattr *data1_read_xattr (data1_handle dh, NMEM m,
587 int (*get_byte)(void *fh), void *fh,
588 WRBUF wrbuf, int *ch, int *amp)
589 {
590 data1_xattr *p_first = 0;
591 data1_xattr **pp = &p_first;
592 int c = *ch;
593 for (;;)
594 {
595 data1_xattr *p;
596 while (*amp || (c && d1_isspace(c)))
597 c = ampr (get_byte, fh, amp);
598 if (*amp == 0 && (c == 0 || c == '>' || c == '/'))
599 break;
600 *pp = p = (data1_xattr *) nmem_malloc (m, sizeof(*p));
601 p->next = 0;
602 pp = &p->next;
603 p->value = 0;
604 p->what = DATA1I_xmltext;
605
606 wrbuf_rewind(wrbuf);
607 while (c && c != '=' && c != '>' && c != '/' && !d1_isspace(c))
608 {
609 wrbuf_putc (wrbuf, c);
610 c = ampr (get_byte, fh, amp);
611 }
612 p->name = nmem_strdup (m, wrbuf_cstr(wrbuf));
613 if (c == '=')
614 {
615 c = ampr (get_byte, fh, amp);
616 if (*amp == 0 && c == '"')
617 {
618 c = ampr (get_byte, fh, amp);
619 wrbuf_rewind(wrbuf);
620 while (*amp || (c && c != '"'))
621 {
622 wrbuf_putc (wrbuf, c);
623 c = ampr (get_byte, fh, amp);
624 }
625 if (c)
626 c = ampr (get_byte, fh, amp);
627 }
628 else if (*amp == 0 && c == '\'')
629 {
630 c = ampr (get_byte, fh, amp);
631 wrbuf_rewind(wrbuf);
632 while (*amp || (c && c != '\''))
633 {
634 wrbuf_putc (wrbuf, c);
635 c = ampr (get_byte, fh, amp);
636 }
637 if (c)
638 c = ampr (get_byte, fh, amp);
639 }
640 else
641 {
642 wrbuf_rewind(wrbuf);
643 while (*amp || (c && c != '>' && c != '/'))
644 {
645 wrbuf_putc (wrbuf, c);
646 c = ampr (get_byte, fh, amp);
647 }
648 }
649 p->value = nmem_strdup(m, wrbuf_cstr(wrbuf));
650 }
651 }
652 *ch = c;
653 return p_first;
654 }
655
656 /*
657 * Ugh. Sometimes functions just grow and grow on you. This one reads a
658 * 'node' and its children.
659 */
data1_read_nodex(data1_handle dh,NMEM m,int (* get_byte)(void * fh),void * fh,WRBUF wrbuf)660 data1_node *data1_read_nodex (data1_handle dh, NMEM m,
661 int (*get_byte)(void *fh), void *fh, WRBUF wrbuf)
662 {
663 data1_node *d1_stack[256];
664 data1_node *res;
665 int c, amp;
666 int level = 0;
667 int line = 1;
668
669 d1_stack[level] = 0;
670 c = ampr (get_byte, fh, &);
671 while (c != '\0')
672 {
673 data1_node *parent = level ? d1_stack[level-1] : 0;
674
675 if (amp == 0 && c == '<') /* beginning of tag */
676 {
677 data1_xattr *xattr;
678
679 char tag[256];
680 int null_tag = 0;
681 int end_tag = 0;
682 size_t i = 0;
683
684 c = ampr (get_byte, fh, &);
685 if (amp == 0 && c == '/')
686 {
687 end_tag = 1;
688 c = ampr (get_byte, fh, &);
689 }
690 else if (amp == 0 && c == '?')
691 {
692 int quote_mode = 0;
693 while ((c = ampr(get_byte, fh, &)))
694 {
695 if (amp)
696 continue;
697 if (quote_mode == 0)
698 {
699 if (c == '"')
700 quote_mode = c;
701 else if (c == '\'')
702 quote_mode = c;
703 else if (c == '>')
704 {
705 c = ampr(get_byte, fh, &);
706 break;
707 }
708 }
709 else
710 {
711 if (amp == 0 && c == quote_mode)
712 quote_mode = 0;
713 }
714 }
715 continue;
716 }
717 else if (amp == 0 && c == '!')
718 {
719 int c0, amp0;
720
721 wrbuf_rewind(wrbuf);
722
723 c0 = ampr (get_byte, fh, &0);
724 if (amp0 == 0 && c0 == '\0')
725 break;
726 c = ampr (get_byte, fh, &);
727
728 if (amp0 == 0 && c0 == '-' && amp == 0 && c == '-')
729 {
730 /* COMMENT: <!-- ... --> */
731 int no_dash = 0;
732
733 c = ampr (get_byte, fh, &);
734 while (amp || c)
735 {
736 if (amp == 0 && c == '-')
737 no_dash++;
738 else if (amp == 0 && c == '>' && no_dash >= 2)
739 {
740 if (level > 0)
741 d1_stack[level] =
742 data1_mk_comment_n (
743 dh, m,
744 wrbuf_buf(wrbuf), wrbuf_len(wrbuf)-2,
745 d1_stack[level-1]);
746 c = ampr (get_byte, fh, &); /* skip > */
747 break;
748 }
749 else
750 no_dash = 0;
751 wrbuf_putc (wrbuf, c);
752 c = ampr (get_byte, fh, &);
753 }
754 continue;
755 }
756 else
757 { /* DIRECTIVE: <! .. > */
758
759 int blevel = 0;
760 while (amp || c)
761 {
762 if (amp == 0 && c == '>' && blevel == 0)
763 {
764 c = ampr (get_byte, fh, &);
765 break;
766 }
767 if (amp == 0 && c == '[')
768 blevel++;
769 if (amp == 0 && c == ']' && blevel > 0)
770 blevel--;
771 c = ampr (get_byte, fh, &);
772 }
773 continue;
774 }
775 }
776 while (amp || (c && c != '>' && c != '/' && !d1_isspace(c)))
777 {
778 if (i < (sizeof(tag)-1))
779 tag[i++] = c;
780 c = ampr (get_byte, fh, &);
781 }
782 tag[i] = '\0';
783 xattr = data1_read_xattr (dh, m, get_byte, fh, wrbuf, &c, &);
784 if (amp == 0 && c == '/')
785 { /* <tag attrs/> or <tag/> */
786 null_tag = 1;
787 c = ampr (get_byte, fh, &);
788 }
789 if (amp || c != '>')
790 {
791 yaz_log(YLOG_WARN, "d1: %d: Malformed tag", line);
792 return 0;
793 }
794 else
795 c = ampr (get_byte, fh, &);
796
797 /* End tag? */
798 if (end_tag)
799 {
800 if (*tag == '\0')
801 --level; /* </> */
802 else
803 { /* </tag> */
804 int i = level;
805 while (i > 0)
806 {
807 parent = d1_stack[--i];
808 if ((parent->which == DATA1N_root &&
809 !strcmp(tag, parent->u.root.type)) ||
810 (parent->which == DATA1N_tag &&
811 !strcmp(tag, parent->u.tag.tag)))
812 {
813 level = i;
814 break;
815 }
816 }
817 if (i != level)
818 {
819 yaz_log (YLOG_WARN, "%d: no begin tag for %s",
820 line, tag);
821 break;
822 }
823 }
824 if (data1_is_xmlmode(dh))
825 {
826 if (level <= 1)
827 return d1_stack[0];
828 }
829 else
830 {
831 if (level <= 0)
832 return d1_stack[0];
833 }
834 continue;
835 }
836 else if (!strcmp(tag, "var")
837 && xattr && xattr->next && xattr->next->next
838 && xattr->value == 0
839 && xattr->next->value == 0
840 && xattr->next->next->value == 0)
841 {
842 /* <var class type value> */
843 const char *tclass = xattr->name;
844 const char *type = xattr->next->name;
845 const char *value = xattr->next->name;
846 data1_vartype *tp;
847
848 yaz_log(YLOG_LOG, "Variant class=%s type=%s value=%s",
849 tclass, type, value);
850 if (!(tp =
851 data1_getvartypebyct(dh,
852 parent->root->u.root.absyn->varset,
853 tclass, type)))
854 continue;
855 /*
856 * If we're the first variant in this group, create a parent
857 * variant, and insert it before the current variant.
858 */
859 if (parent->which != DATA1N_variant)
860 {
861 res = data1_mk_node2 (dh, m, DATA1N_variant, parent);
862 }
863 else
864 {
865 /*
866 * now determine if one of our ancestor triples is of
867 * same type. If so, we break here.
868 */
869 int i;
870 for (i = level-1; d1_stack[i]->which==DATA1N_variant; --i)
871 if (d1_stack[i]->u.variant.type == tp)
872 {
873 level = i;
874 break;
875 }
876 res = data1_mk_node2 (dh, m, DATA1N_variant, parent);
877 res->u.variant.type = tp;
878 res->u.variant.value =
879 data1_insert_string (dh, res, m, value);
880 }
881 }
882 else
883 {
884
885 /* tag .. acquire our element in the abstract syntax */
886 if (level == 0)
887 {
888 parent = data1_mk_root (dh, m, tag);
889 res = d1_stack[level] = parent;
890
891 if (data1_is_xmlmode(dh))
892 {
893 level++;
894 res = data1_mk_tag (dh, m, tag, 0 /* attr */, parent);
895 res->u.tag.attributes = xattr;
896 }
897 }
898 else
899 {
900 res = data1_mk_tag (dh, m, tag, 0 /* attr */, parent);
901 res->u.tag.attributes = xattr;
902 }
903 }
904 d1_stack[level] = res;
905 d1_stack[level+1] = 0;
906 if (level < 250 && !null_tag)
907 ++level;
908 }
909 else /* != '<'... this is a body of text */
910 {
911 int len;
912
913 if (level == 0)
914 {
915 c = ampr (get_byte, fh, &);
916 continue;
917 }
918 res = data1_mk_node2 (dh, m, DATA1N_data, parent);
919 res->u.data.what = DATA1I_xmltext;
920 res->u.data.formatted_text = 0;
921 d1_stack[level] = res;
922
923 wrbuf_rewind(wrbuf);
924
925 while (amp || (c && c != '<'))
926 {
927 wrbuf_putc (wrbuf, c);
928 c = ampr (get_byte, fh, &);
929 }
930 len = wrbuf_len(wrbuf);
931
932 /* use local buffer of nmem if too large */
933 if (len >= DATA1_LOCALDATA)
934 res->u.data.data = (char*) nmem_malloc (m, len);
935 else
936 res->u.data.data = res->lbuf;
937
938 if (len)
939 memcpy (res->u.data.data, wrbuf_buf(wrbuf), len);
940 else
941 res->u.data.data = 0;
942 res->u.data.len = len;
943 }
944 }
945 return 0;
946 }
947
/*
 * Byte source over a null-terminated string, for use as a get_byte
 * callback. fh points to a "const char *" cursor: the character at the
 * cursor is returned and the cursor advanced. At the terminating null 0 is
 * returned and the cursor stays where it is.
 */
int getc_mem (void *fh)
{
    const char **cursor = (const char **) fh;
    int ch = **cursor;

    if (ch == 0)
        return 0;
    ++*cursor;
    return ch;
}
955
data1_read_node(data1_handle dh,const char ** buf,NMEM m)956 data1_node *data1_read_node (data1_handle dh, const char **buf, NMEM m)
957 {
958 WRBUF wrbuf = wrbuf_alloc();
959 data1_node *node;
960
961 node = data1_read_nodex(dh, m, getc_mem, (void *) (buf), wrbuf);
962 wrbuf_destroy(wrbuf);
963 return node;
964 }
965
966 /*
967 * Read a record in the native syntax.
968 */
data1_read_record(data1_handle dh,int (* rf)(void *,char *,size_t),void * fh,NMEM m)969 data1_node *data1_read_record(data1_handle dh,
970 int (*rf)(void *, char *, size_t), void *fh,
971 NMEM m)
972 {
973 int *size;
974 char **buf = data1_get_read_buf (dh, &size);
975 const char *bp;
976 int rd = 0, res;
977
978 if (!*buf)
979 *buf = (char *)xmalloc(*size = 4096);
980
981 for (;;)
982 {
983 if (rd + 2048 >= *size && !(*buf =(char *)xrealloc(*buf, *size *= 2)))
984 abort();
985 if ((res = (*rf)(fh, *buf + rd, 2048)) <= 0)
986 {
987 if (!res)
988 {
989 bp = *buf;
990 (*buf)[rd] = '\0';
991 return data1_read_node(dh, &bp, m);
992 }
993 else
994 return 0;
995 }
996 rd += res;
997 }
998 }
999
data1_read_sgml(data1_handle dh,NMEM m,const char * buf)1000 data1_node *data1_read_sgml (data1_handle dh, NMEM m, const char *buf)
1001 {
1002 const char *bp = buf;
1003 return data1_read_node (dh, &bp, m);
1004 }
1005
1006
/*
 * Convert inlen bytes at inbuf through the converter t, leaving the result
 * in wrbuf (whose previous content is discarded). The converter is reset
 * afterwards via wrbuf_iconv_reset(). m is not used. Always returns 0.
 */
static int conv_item(NMEM m, yaz_iconv_t t,
                     WRBUF wrbuf, char *inbuf, size_t inlen)
{
    const int ok = 0;

    wrbuf_rewind(wrbuf);
    wrbuf_iconv_write(wrbuf, t, inbuf, inlen);
    wrbuf_iconv_reset(wrbuf, t);
    return ok;
}
1015
/*
 * Run the character set converter t over n, its following siblings and,
 * recursively, all their descendants:
 *
 *  - data and comment nodes get their text converted,
 *  - tag nodes get their name and their attribute values converted
 *    (attribute names are left alone),
 *  - in an "xml" preprocess node every "encoding" attribute is set to
 *    tocode.
 *
 * Converted strings are stored through data1_insert_string_n() or
 * nmem_strdup() in m; wrbuf is scratch space.
 */
static void data1_iconv_s (data1_handle dh, NMEM m, data1_node *n,
                           yaz_iconv_t t, WRBUF wrbuf, const char *tocode)
{
    data1_node *node;

    for (node = n; node; node = node->next)
    {
        data1_xattr *attr;

        if (node->which == DATA1N_data || node->which == DATA1N_comment)
        {
            if (!conv_item (m, t, wrbuf, node->u.data.data,
                            node->u.data.len))
            {
                node->u.data.data =
                    data1_insert_string_n (dh, node, m, wrbuf->buf,
                                           wrbuf->pos);
                node->u.data.len = wrbuf->pos;
            }
        }
        else if (node->which == DATA1N_tag)
        {
            if (!conv_item (m, t, wrbuf, node->u.tag.tag,
                            strlen(node->u.tag.tag)))
            {
                node->u.tag.tag =
                    data1_insert_string_n (dh, node, m,
                                           wrbuf->buf, wrbuf->pos);
            }
            for (attr = node->u.tag.attributes; attr; attr = attr->next)
            {
                if (attr->value &&
                    !conv_item(m, t, wrbuf, attr->value, strlen(attr->value)))
                {
                    attr->value = nmem_strdup(m, wrbuf_cstr(wrbuf));
                }
            }
        }
        else if (node->which == DATA1N_preprocess)
        {
            /* keep the XML declaration in step with the new encoding */
            if (!strcmp(node->u.preprocess.target, "xml"))
            {
                for (attr = node->u.preprocess.attributes; attr;
                     attr = attr->next)
                    if (!strcmp (attr->name, "encoding"))
                        attr->value = nmem_strdup (m, tocode);
            }
        }
        data1_iconv_s (dh, m, node->child, t, wrbuf, tocode);
    }
}
1068
/*
 * Determine the character encoding of the record rooted at n.
 *
 * The sources are tried in this order:
 *  1. the "encoding" attribute of an <?xml ...?> preprocess node that is
 *     the first child of n,
 *  2. the encoding of the abstract syntax attached to the root node n,
 *  3. the hard coded default "ISO-8859-1".
 *
 * Never returns null.
 */
const char *data1_get_encoding (data1_handle dh, data1_node *n)
{
    /* see if we have an xml header that specifies encoding */
    if (n && n->child && n->child->which == DATA1N_preprocess &&
        strcmp (n->child->u.preprocess.target, "xml") == 0)
    {
        data1_xattr *xp = n->child->u.preprocess.attributes;
        for (; xp; xp = xp->next)
        {
            /*
             * The test used to read "!strcmp(...) == 0", which is true for
             * every attribute EXCEPT "encoding", so e.g. the value of
             * "version" was returned as the encoding. An attribute without
             * a value is skipped so that null is never returned.
             */
            if (strcmp (xp->name, "encoding") == 0 && xp->value)
                return xp->value;
        }
    }
    /* no encoding in header, so see if "encoding" was specified for abs */
    if (n && n->which == DATA1N_root &&
        n->u.root.absyn && n->u.root.absyn->encoding)
        return n->u.root.absyn->encoding;
    /* none of above, return a hard coded default */
    return "ISO-8859-1";
}
1087
/*
 * Convert the tree at n (n, its siblings and all descendants) from
 * character set fromcode to tocode.
 *
 * Nothing is done when yaz_matchstr() reports the two names as matching.
 * Returns 0 on success and -1 when no converter for the pair can be
 * opened.
 */
int data1_iconv (data1_handle dh, NMEM m, data1_node *n,
                 const char *tocode,
                 const char *fromcode)
{
    WRBUF scratch;
    yaz_iconv_t cd;
    int rc = 0;

    if (!yaz_matchstr (tocode, fromcode))
        return 0;

    scratch = wrbuf_alloc();
    cd = yaz_iconv_open(tocode, fromcode);
    if (cd)
    {
        data1_iconv_s(dh, m, n, cd, scratch, tocode);
        yaz_iconv_close(cd);
    }
    else
    {
        rc = -1;
    }
    wrbuf_destroy(scratch);
    return rc;
}
1107
/*
 * Strip leading and trailing white space (as defined by d1_isspace) from
 * the text of every data node among n, its following siblings and all
 * their descendants. The trimmed text is copied into a new buffer
 * allocated from m, and the node's data pointer and length are updated.
 */
void data1_chop_text(data1_handle dh, NMEM m, data1_node *n)
{
    data1_node *node;

    for (node = n; node; node = node->next)
    {
        if (node->which == DATA1N_data)
        {
            const char *start = node->u.data.data;
            int remain = node->u.data.len;

            /* leading blanks */
            while (remain > 0 && d1_isspace(*start))
            {
                start++;
                remain--;
            }
            /* trailing blanks */
            while (remain && d1_isspace(start[remain - 1]))
                remain--;

            node->u.data.data = nmem_malloc(m, remain);
            node->u.data.len = remain;
            memcpy(node->u.data.data, start, remain);
        }
        data1_chop_text(dh, m, node->child);
    }
}
1136
/*
 * Merge every run of adjacent data nodes into its first node, for n, its
 * following siblings and all their descendants.
 *
 * The texts of the run are concatenated into a buffer allocated from m,
 * the first node takes over that buffer and the total length, and the
 * remaining nodes of the run are unlinked from the sibling list. When the
 * run reached the end of the list, the parent's last_child is updated.
 */
void data1_concat_text(data1_handle dh, NMEM m, data1_node *n)
{
    for (; n; n = n->next)
    {
        const int starts_run = n->which == DATA1N_data && n->next
            && n->next->which == DATA1N_data;

        if (starts_run)
        {
            data1_node *stop = n;   /* first node after the run, or 0 */
            data1_node *src;
            char *merged;
            char *dst;
            int total = 0;

            /* first pass: measure the run */
            while (stop && stop->which == DATA1N_data)
            {
                total += stop->u.data.len;
                stop = stop->next;
            }
            merged = nmem_malloc(m, total);

            /* second pass: copy the pieces one after another */
            dst = merged;
            for (src = n; src != stop; src = src->next)
            {
                memcpy(dst, src->u.data.data, src->u.data.len);
                dst += src->u.data.len;
            }

            n->u.data.data = merged;
            n->u.data.len = total;
            n->next = stop;
            if (!stop && n->parent)
                n->parent->last_child = n;
        }
        data1_concat_text(dh, m, n->child);
    }
}
1166
1167 /*
1168 * Local variables:
1169 * c-basic-offset: 4
1170 * c-file-style: "Stroustrup"
1171 * indent-tabs-mode: nil
1172 * End:
1173 * vim: shiftwidth=4 tabstop=8 expandtab
1174 */
1175
1176