1 /* This file is part of the Zebra server.
2    Copyright (C) 2004-2013 Index Data
3 
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8 
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 
18 */
19 
20 
21 /*
22  * This module reads "loose" SGML and converts it to data1 tree
23  */
24 
25 #if HAVE_CONFIG_H
26 #include <config.h>
27 #endif
28 #include <assert.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 
32 #include <yaz/yaz-util.h>
33 #include <d1_absyn.h>
34 
/*
 * Locate the top tag of the tree rooted at n.
 *
 * Returns 0 when n is null. In XML mode the result is the first child
 * of n whose type is DATA1N_tag (0 if no such child exists); in any
 * other mode n itself is returned.
 */
data1_node *data1_get_root_tag (data1_handle dh, data1_node *n)
{
    data1_node *cur;

    if (!n)
        return 0;
    if (!data1_is_xmlmode(dh))
        return n;

    /* XML mode: scan the children for the first tag node */
    for (cur = n->child; cur; cur = cur->next)
        if (cur->which == DATA1N_tag)
            break;
    return cur;
}
47 
48 /*
49  * get the tag which is the immediate parent of this node (this may mean
50  * traversing intermediate things like variants and stuff.
51  */
get_parent_tag(data1_handle dh,data1_node * n)52 data1_node *get_parent_tag (data1_handle dh, data1_node *n)
53 {
54     if (data1_is_xmlmode(dh))
55     {
56         for (; n && n->which != DATA1N_root; n = n->parent)
57             if (n->which == DATA1N_tag && n->parent &&
58                 n->parent->which != DATA1N_root)
59                 return n;
60     }
61     else
62     {
63         for (; n && n->which != DATA1N_root; n = n->parent)
64             if (n->which == DATA1N_tag)
65                 return n;
66     }
67     return 0;
68 }
69 
data1_mk_node(data1_handle dh,NMEM m)70 data1_node *data1_mk_node (data1_handle dh, NMEM m)
71 {
72     return data1_mk_node2 (dh, m, DATA1N_root, 0);
73 }
74 
data1_mk_node_type(data1_handle dh,NMEM m,int type)75 data1_node *data1_mk_node_type (data1_handle dh, NMEM m, int type)
76 {
77     return data1_mk_node2 (dh, m, type, 0);
78 }
79 
/*
 * Set the type of node r and reset the type-specific part of its union
 * to the defaults for that type. An unrecognised type is logged as a
 * warning; r->which is still set to it.
 */
static void data1_init_node (data1_handle dh, data1_node *r, int type)
{
    r->which = type;
    switch (type)
    {
    case DATA1N_root:
        r->u.root.type = 0;
        r->u.root.absyn = 0;
        break;
    case DATA1N_tag:
        r->u.tag.tag = 0;
        r->u.tag.element = 0;
        r->u.tag.no_data_requested = 0;
        r->u.tag.node_selected = 0;
        r->u.tag.make_variantlist = 0;
        r->u.tag.get_bytes = -1;
        r->u.tag.attributes = 0;
        break;
    case DATA1N_data:
    case DATA1N_comment:
        /* both use the data member; only comments start out formatted */
        r->u.data.data = 0;
        r->u.data.len = 0;
        r->u.data.what = 0;
        if (type == DATA1N_comment)
            r->u.data.formatted_text = 1;
        else
            r->u.data.formatted_text = 0;
        break;
    case DATA1N_variant:
        r->u.variant.type = 0;
        r->u.variant.value = 0;
        break;
    case DATA1N_preprocess:
        r->u.preprocess.target = 0;
        r->u.preprocess.attributes = 0;
        break;
    default:
        yaz_log (YLOG_WARN, "data_mk_node_type. bad type = %d\n", type);
    }
}
122 
/*
 * Allocate a node of the given type from m and link it in as the LAST
 * child of parent. With a null parent the new node becomes its own
 * root. Returns the new node.
 */
data1_node *data1_append_node (data1_handle dh, NMEM m, int type,
                               data1_node *parent)
{
    data1_node *node = (data1_node *) nmem_malloc(m, sizeof(*node));

    node->next = 0;
    node->child = 0;
    node->last_child = 0;
    node->parent = parent;

    if (parent)
    {
        node->root = parent->root;
        if (parent->child)
            parent->last_child->next = node;   /* chain after current tail */
        else
            parent->child = node;              /* first child */
        parent->last_child = node;
    }
    else
        node->root = node;

    data1_init_node(dh, node, type);
    return node;
}
144 
/*
 * Create a node of the given type as last child of parent.
 * This is simply another name for data1_append_node.
 */
data1_node *data1_mk_node2 (data1_handle dh, NMEM m, int type,
                            data1_node *parent)
{
    data1_node *node = data1_append_node (dh, m, type, parent);

    return node;
}
150 
/*
 * Allocate a node of the given type from m and link it in as the FIRST
 * child of parent. With a null parent the new node becomes its own
 * root and has a null parent pointer. Returns the new node.
 */
data1_node *data1_insert_node (data1_handle dh, NMEM m, int type,
                               data1_node *parent)
{
    data1_node *r = (data1_node *)nmem_malloc(m, sizeof(*r));
    r->next = r->child = r->last_child = 0;

    /*
     * Set the parent link unconditionally (as data1_append_node does);
     * previously it was left uninitialized for a parentless node.
     */
    r->parent = parent;
    if (!parent)
        r->root = r;
    else
    {
        r->root = parent->root;
        if (!parent->child)
            parent->last_child = r;      /* only child is also the tail */
        else
            r->next = parent->child;     /* push in front of old head */
        parent->child = r;
    }
    data1_init_node(dh, r, type);
    return r;
}
172 
data1_mk_root(data1_handle dh,NMEM nmem,const char * name)173 data1_node *data1_mk_root (data1_handle dh, NMEM nmem, const char *name)
174 {
175     data1_absyn *absyn = data1_get_absyn(dh, name, 1);
176     data1_node *res;
177 
178     if (!absyn)
179     {
180         yaz_log(YLOG_WARN, "Unable to acquire abstract syntax " "for '%s'",
181                 name);
182         /* It's now OK for a record not to have an absyn */
183     }
184     res = data1_mk_node2 (dh, nmem, DATA1N_root, 0);
185     res->u.root.type = data1_insert_string (dh, res, nmem, name);
186     res->u.root.absyn = absyn;
187     return res;
188 }
189 
/*
 * Re-label an existing root node: store `name` as its type and attach
 * the abstract syntax looked up for that name (which may be null).
 */
void data1_set_root(data1_handle dh, data1_node *res,
                    NMEM nmem, const char *name)
{
    data1_absyn *syntax;

    syntax = data1_get_absyn(dh, name, DATA1_XPATH_INDEXING_ENABLE);
    res->u.root.type = data1_insert_string (dh, res, nmem, name);
    res->u.root.absyn = syntax;
}
199 
/*
 * Append attributes to the list whose head pointer is *p.
 *
 * attr is a null-terminated array read two entries at a time as
 * name, value, name, value, ...; it may itself be null, in which case
 * nothing is added. Names and values are copied into nmem and each new
 * attribute is marked DATA1I_text. The list is null-terminated on
 * return.
 */
void data1_add_attrs(data1_handle dh, NMEM nmem, const char **attr,
                     data1_xattr **p)
{
    data1_xattr **tail = p;

    /* advance to the link that terminates the existing list */
    while (*tail)
        tail = &(*tail)->next;

    if (attr)
    {
        for (; *attr; attr += 2)
        {
            data1_xattr *item =
                (data1_xattr*) nmem_malloc (nmem, sizeof(*item));

            item->name = nmem_strdup (nmem, attr[0]);
            item->value = nmem_strdup (nmem, attr[1]);
            item->what = DATA1I_text;

            *tail = item;
            tail = &item->next;
        }
    }
    *tail = 0;
}
217 
/*
 * Append a preprocess node with a null-terminated target string.
 * Convenience wrapper around data1_mk_preprocess_n.
 */
data1_node *data1_mk_preprocess (data1_handle dh, NMEM nmem,
                                 const char *target,
                                 const char **attr, data1_node *at)
{
    size_t target_len = strlen(target);

    return data1_mk_preprocess_n (dh, nmem, target, target_len, attr, at);
}
225 
/*
 * Append a preprocess node as last child of `at`. The first len bytes
 * of target become the node's target string and the name/value pairs
 * in attr (may be null) become its attributes.
 */
data1_node *data1_mk_preprocess_n (data1_handle dh, NMEM nmem,
                                   const char *target, size_t len,
                                   const char **attr, data1_node *at)
{
    data1_node *node = data1_mk_node2 (dh, nmem, DATA1N_preprocess, at);
    char *name = data1_insert_string_n (dh, node, nmem, target, len);

    node->u.preprocess.target = name;
    data1_add_attrs(dh, nmem, attr, &node->u.preprocess.attributes);
    return node;
}
237 
/*
 * Insert a preprocess node with a null-terminated target string.
 * Convenience wrapper around data1_insert_preprocess_n.
 */
data1_node *data1_insert_preprocess (data1_handle dh, NMEM nmem,
                                 const char *target,
                                 const char **attr, data1_node *at)
{
    size_t target_len = strlen(target);

    return data1_insert_preprocess_n (dh, nmem, target, target_len,
                                      attr, at);
}
245 
/*
 * Insert a preprocess node as first child of `at`. The first len bytes
 * of target become the node's target string and the name/value pairs
 * in attr (may be null) become its attributes.
 */
data1_node *data1_insert_preprocess_n (data1_handle dh, NMEM nmem,
                                   const char *target, size_t len,
                                   const char **attr, data1_node *at)
{
    data1_node *node = data1_insert_node (dh, nmem, DATA1N_preprocess, at);
    char *name = data1_insert_string_n (dh, node, nmem, target, len);

    node->u.preprocess.target = name;
    data1_add_attrs(dh, nmem, attr, &node->u.preprocess.attributes);
    return node;
}
257 
/*
 * Append a tag node named by the first len bytes of tag as last child
 * of `at`, attach the attributes in attr (name/value pairs, may be
 * null) and try to bind the tag to an element of the abstract syntax.
 *
 * If `at` has no enclosing tag, the element is looked up at the top
 * level of the root's abstract syntax. Otherwise it is looked up
 * beneath the enclosing tag's element, and stays null when the
 * enclosing tag has no element itself.
 */
data1_node *data1_mk_tag_n (data1_handle dh, NMEM nmem,
                            const char *tag, size_t len, const char **attr,
                            data1_node *at)
{
    data1_node *enclosing = get_parent_tag(dh, at);
    data1_node *node = data1_mk_node2 (dh, nmem, DATA1N_tag, at);
    data1_element *elem;

    node->u.tag.tag = data1_insert_string_n (dh, node, nmem, tag, len);

    if (enclosing)
    {
        /* only set element for known tags */
        elem = enclosing->u.tag.element;
        if (elem)
            elem = data1_getelementbytagname (dh, at->root->u.root.absyn,
                                              elem, node->u.tag.tag);
    }
    else
    {
        /* top tag */
        elem = data1_getelementbytagname (dh, at->root->u.root.absyn,
                                          0 /* index as local */,
                                          node->u.tag.tag);
    }
    node->u.tag.element = elem;
    data1_add_attrs(dh, nmem, attr, &node->u.tag.attributes);
    return node;
}
284 
/*
 * Append the name/value pairs in attr to the attribute list of a tag
 * node. Nodes of any other type are left untouched.
 */
void data1_tag_add_attr (data1_handle dh, NMEM nmem,
                         data1_node *res, const char **attr)
{
    if (res->which == DATA1N_tag)
        data1_add_attrs(dh, nmem, attr, &res->u.tag.attributes);
}
293 
/*
 * Append a tag node with a null-terminated tag name.
 * Convenience wrapper around data1_mk_tag_n.
 */
data1_node *data1_mk_tag (data1_handle dh, NMEM nmem,
                          const char *tag, const char **attr, data1_node *at)
{
    size_t tag_len = strlen(tag);

    return data1_mk_tag_n (dh, nmem, tag, tag_len, attr, at);
}
299 
/*
 * Search the sibling list starting at n for a tag node whose name
 * matches `tag` according to yaz_matchstr. Returns the first match or
 * 0 if there is none.
 *
 * A leading '/' in tag makes the search start at the children of the
 * root tag (see data1_get_root_tag) instead of at n.
 */
data1_node *data1_search_tag (data1_handle dh, data1_node *n,
                              const char *tag)
{
    data1_node *cur = n;

    if (*tag == '/')
    {
        cur = data1_get_root_tag (dh, cur);
        if (cur)
            cur = cur->child;
        tag++;
    }
    while (cur)
    {
        if (cur->which == DATA1N_tag && cur->u.tag.tag
            && yaz_matchstr (cur->u.tag.tag, tag) == 0)
            return cur;
        cur = cur->next;
    }
    return 0;
}
318 
/*
 * Return the child tag of `at` named `tag`, creating it if needed.
 * When a matching child already exists its child list is detached
 * (reset to empty) and the existing node is returned.
 */
data1_node *data1_mk_tag_uni (data1_handle dh, NMEM nmem,
                              const char *tag, data1_node *at)
{
    data1_node *found = data1_search_tag (dh, at->child, tag);

    if (found)
    {
        /* reuse the tag but drop whatever it contained */
        found->child = found->last_child = 0;
        return found;
    }
    return data1_mk_tag (dh, nmem, tag, 0 /* attr */, at);
}
329 
/*
 * Append a text data node holding the first len bytes of buf as last
 * child of parent. Returns the new node.
 */
data1_node *data1_mk_text_n (data1_handle dh, NMEM mem,
                             const char *buf, size_t len, data1_node *parent)
{
    data1_node *node = data1_mk_node2 (dh, mem, DATA1N_data, parent);

    node->u.data.data = data1_insert_string_n (dh, node, mem, buf, len);
    node->u.data.len = len;
    node->u.data.what = DATA1I_text;
    return node;
}
340 
/*
 * Same as data1_mk_text_n, but the new node is flagged as formatted
 * text.
 */
data1_node *data1_mk_text_nf (data1_handle dh, NMEM mem,
                              const char *buf, size_t len, data1_node *parent)
{
    data1_node *node = data1_mk_text_n (dh, mem, buf, len, parent);

    node->u.data.formatted_text = 1;
    return node;
}
348 
/*
 * Append a text data node holding the null-terminated string buf.
 * Convenience wrapper around data1_mk_text_n.
 */
data1_node *data1_mk_text (data1_handle dh, NMEM mem,
                           const char *buf, data1_node *parent)
{
    size_t buf_len = strlen(buf);

    return data1_mk_text_n (dh, mem, buf, buf_len, parent);
}
354 
/*
 * Append a comment node holding the first len bytes of buf as last
 * child of parent. Returns the new node.
 */
data1_node *data1_mk_comment_n (data1_handle dh, NMEM mem,
                                const char *buf, size_t len,
                                data1_node *parent)
{
    data1_node *node = data1_mk_node2 (dh, mem, DATA1N_comment, parent);

    node->u.data.data = data1_insert_string_n (dh, node, mem, buf, len);
    node->u.data.len = len;
    node->u.data.what = DATA1I_text;
    return node;
}
366 
/*
 * Append a comment node holding the null-terminated string buf.
 * Convenience wrapper around data1_mk_comment_n.
 */
data1_node *data1_mk_comment (data1_handle dh, NMEM mem,
                              const char *buf, data1_node *parent)
{
    size_t buf_len = strlen(buf);

    return data1_mk_comment_n (dh, mem, buf, buf_len, parent);
}
372 
/*
 * Copy the first len bytes of str into storage belonging to node res
 * and null-terminate the copy. Strings shorter than DATA1_LOCALDATA go
 * into the node's own lbuf; longer ones are allocated from m.
 * Returns a pointer to the copy.
 */
char *data1_insert_string_n (data1_handle dh, data1_node *res,
                             NMEM m, const char *str, size_t len)
{
    char *dst = (len < DATA1_LOCALDATA)
        ? res->lbuf
        : (char *) nmem_malloc (m, len+1);

    memcpy (dst, str, len);
    dst[len] = 0;
    return dst;
}
385 
/*
 * Copy the null-terminated string str into storage belonging to node
 * res. Convenience wrapper around data1_insert_string_n.
 */
char *data1_insert_string (data1_handle dh, data1_node *res,
                           NMEM m, const char *str)
{
    size_t str_len = strlen(str);

    return data1_insert_string_n (dh, res, m, str, str_len);
}
391 
/*
 * Create a tag node named tagname under `at`, with an empty data node
 * as its only child, and return that data node.
 *
 * The tag is bound to an element of the abstract syntax when one can
 * be found (at top level if `at` has no enclosing tag, otherwise
 * beneath the enclosing tag's element). If no element is found the
 * tag is only created when local_allowed is non-zero; otherwise
 * nothing is created and 0 is returned.
 *
 * insert_mode non-zero puts the tag first among at's children; zero
 * puts it last.
 */
static data1_node *data1_add_insert_taggeddata(data1_handle dh,
                                               data1_node *at,
                                               const char *tagname, NMEM m,
                                               int local_allowed,
                                               int insert_mode)
{
    data1_node *root = at->root;
    data1_node *enclosing = get_parent_tag (dh, at);
    data1_element *elem = NULL;
    data1_node *tag_node;

    if (enclosing)
    {
        elem = enclosing->u.tag.element;
        if (elem)
            elem = data1_getelementbytagname (dh, root->u.root.absyn,
                                              elem, tagname);
    }
    else
        elem = data1_getelementbytagname (dh, root->u.root.absyn, 0,
                                          tagname);

    /* unknown tag and local tags not permitted: create nothing */
    if (!local_allowed && !elem)
        return 0;

    tag_node = insert_mode
        ? data1_insert_node (dh, m, DATA1N_tag, at)
        : data1_append_node (dh, m, DATA1N_tag, at);
    tag_node->u.tag.tag = data1_insert_string (dh, tag_node, m, tagname);
    tag_node->u.tag.element = elem;
    return data1_mk_node2 (dh, m, DATA1N_data, tag_node);
}
424 
/*
 * Append a tag named tagname as last child of `at` and return its
 * (empty) data node. Tags unknown to the abstract syntax are allowed.
 */
data1_node *data1_mk_tag_data(data1_handle dh, data1_node *at,
                              const char *tagname, NMEM m)
{
    const int local_allowed = 1;
    const int insert_mode = 0;

    return data1_add_insert_taggeddata (dh, at, tagname, m,
                                        local_allowed, insert_mode);
}
430 
431 
432 /*
433  * Insert a tagged node into the record root as first child of the node at
434  * which should be root or tag itself). Returns pointer to the data node,
435  * which can then be modified.
436  */
data1_mk_tag_data_wd(data1_handle dh,data1_node * at,const char * tagname,NMEM m)437 data1_node *data1_mk_tag_data_wd(data1_handle dh, data1_node *at,
438                                  const char *tagname, NMEM m)
439 {
440     return data1_add_insert_taggeddata (dh, at, tagname, m, 0, 1);
441 }
442 
/*
 * Insert a tag named tagname as first child of `at` and return its
 * data node, or 0 if the tag is not known to the abstract syntax.
 * The root argument is not used.
 */
data1_node *data1_insert_taggeddata (data1_handle dh, data1_node *root,
                                     data1_node *at, const char *tagname,
                                     NMEM m)
{
    const int local_allowed = 0;
    const int insert_mode = 1;

    return data1_add_insert_taggeddata (dh, at, tagname, m,
                                        local_allowed, insert_mode);
}
449 
/*
 * Append a tag named tagname as last child of `at` and return its data
 * node. Tags unknown to the abstract syntax are allowed.
 * The root argument is not used.
 */
data1_node *data1_add_taggeddata (data1_handle dh, data1_node *root,
                                  data1_node *at, const char *tagname,
                                  NMEM m)
{
    const int local_allowed = 1;
    const int insert_mode = 0;

    return data1_add_insert_taggeddata (dh, at, tagname, m,
                                        local_allowed, insert_mode);
}
456 
/*
 * Append a tag named `tag` under `at` whose data node holds the
 * decimal text of num, marked DATA1I_num. Returns the data node, or 0
 * if the tag could not be created.
 */
data1_node *data1_mk_tag_data_zint (data1_handle dh, data1_node *at,
                                   const char *tag, zint num,
                                   NMEM nmem)
{
    /*
     * Format into a scratch buffer first. 64 bytes is ample for the
     * decimal form of any integer up to 128 bits wide (39 digits, sign
     * and terminator). Assumes ZINT_FORMAT is a plain integer
     * conversion -- TODO confirm.
     */
    char num_str[64];
    data1_node *node_data;

    node_data = data1_mk_tag_data (dh, at, tag, nmem);
    if (!node_data)
        return 0;

    sprintf (num_str, ZINT_FORMAT, num);

    /*
     * Store through data1_insert_string (as the oid and text variants
     * do) so that the node-local buffer is only used when the text
     * fits; writing straight into lbuf could overrun it for long
     * numbers.
     */
    node_data->u.data.what = DATA1I_num;
    node_data->u.data.len = strlen (num_str);
    node_data->u.data.data = data1_insert_string (dh, node_data, nmem,
                                                  num_str);
    return node_data;
}
472 
/*
 * int flavour of data1_mk_tag_data_zint: the number is widened to zint
 * and handed on.
 */
data1_node *data1_mk_tag_data_int (data1_handle dh, data1_node *at,
                                   const char *tag, int num,
                                   NMEM nmem)
{
    zint wide = num;

    return data1_mk_tag_data_zint(dh, at, tag, wide, nmem);
}
479 
/*
 * Append a tag named `tag` under `at` whose data node holds the OID in
 * dotted decimal form ("1.2.840..."), marked DATA1I_oid.
 *
 * oid is an array of arcs terminated by a negative value. An empty OID
 * yields an empty string. If the dotted form does not fit the internal
 * 128-byte buffer it is truncated at an arc boundary and a warning is
 * logged. Returns the data node, or 0 if the tag could not be created.
 */
data1_node *data1_mk_tag_data_oid (data1_handle dh, data1_node *at,
                                   const char *tag, Odr_oid *oid,
                                   NMEM nmem)
{
    data1_node *node_data;
    char str[128];
    size_t used = 0;
    Odr_oid *ii;

    node_data = data1_mk_tag_data (dh, at, tag, nmem);
    if (!node_data)
        return 0;

    /* make sure str is a valid (empty) string even for an empty OID */
    str[0] = '\0';
    for (ii = oid; *ii >= 0; ii++)
    {
        char arc[32];                     /* one arc as decimal text */
        size_t arc_len;
        size_t sep = (ii != oid) ? 1 : 0; /* '.' between arcs */

        sprintf (arc, "%d", *ii);
        arc_len = strlen (arc);

        /* need room for separator, arc and the terminating NUL */
        if (used + sep + arc_len >= sizeof(str))
        {
            yaz_log (YLOG_WARN, "data1_mk_tag_data_oid: OID too long, "
                     "truncated");
            break;
        }
        if (sep)
            str[used++] = '.';
        memcpy (str + used, arc, arc_len + 1);
        used += arc_len;
    }
    node_data->u.data.what = DATA1I_oid;
    node_data->u.data.len = used;
    node_data->u.data.data = data1_insert_string (dh, node_data, nmem, str);
    return node_data;
}
504 
505 
/*
 * Append a tag named `tag` under `at` whose data node holds a copy of
 * the null-terminated string str, marked DATA1I_text. Returns the data
 * node, or 0 if the tag could not be created.
 */
data1_node *data1_mk_tag_data_text (data1_handle dh, data1_node *at,
                                    const char *tag, const char *str,
                                    NMEM nmem)
{
    data1_node *text_node = data1_mk_tag_data (dh, at, tag, nmem);

    if (text_node)
    {
        text_node->u.data.what = DATA1I_text;
        text_node->u.data.len = strlen (str);
        text_node->u.data.data =
            data1_insert_string (dh, text_node, nmem, str);
    }
    return text_node;
}
520 
521 
/*
 * Make sure `at` has a child tag named `tag` whose data holds str.
 *
 * If no such tag exists it is created (see data1_mk_tag_data_text).
 * Otherwise the first child of the existing tag is overwritten with
 * the new text and its own child list is cleared. If the existing tag
 * has no child at all -- the state data1_mk_tag_uni leaves a reused tag
 * in -- a fresh data node is created beneath it instead of
 * dereferencing a null pointer.
 *
 * Returns the data node that now holds the text.
 */
data1_node *data1_mk_tag_data_text_uni (data1_handle dh, data1_node *at,
                                        const char *tag, const char *str,
                                        NMEM nmem)
{
    data1_node *node = data1_search_tag (dh, at->child, tag);
    if (!node)
        return data1_mk_tag_data_text (dh, at, tag, str, nmem);
    else
    {
        data1_node *node_data = node->child;

        /* existing but empty tag: give it a data node to fill in */
        if (!node_data)
            node_data = data1_mk_node2 (dh, nmem, DATA1N_data, node);

        node_data->u.data.what = DATA1I_text;
        node_data->u.data.len = strlen (str);
        node_data->u.data.data = data1_insert_string (dh, node_data,
                                                      nmem, str);
        node_data->child = node_data->last_child = 0;
        return node_data;
    }
}
540 
/*
 * Fetch the next character from the input via get_byte(fh).
 *
 * *amp reports whether the returned character came from a decoded
 * entity reference. In the variant that is compiled in (#if 1) no
 * decoding takes place: the byte is passed through unchanged and *amp
 * is always set to 0.
 *
 * The #else branch is a disabled alternative that decodes "&name;"
 * references (quot, apos, gt, lt, amp; any other name yields a space)
 * and sets *amp to 1 for a decoded character.
 */
static int ampr (int (*get_byte)(void *fh), void *fh, int *amp)
{
#if 1
    int c = (*get_byte)(fh);
    *amp = 0;
    return c;
#else
    int c = (*get_byte)(fh);
    *amp = 0;
    if (c == '&')
    {
        char ent[20];
        int i = 0;

        while (1)
        {
            c = (*get_byte)(fh);
            if (c == ';')
            {
                ent[i] = 0;

                /* unknown entity names decode to a blank */
                c = ' ';
                if (!strcmp (ent, "quot"))
                    c = '"';
                if (!strcmp (ent, "apos"))
                    c = '\'';
                if (!strcmp (ent, "gt"))
                    c = '>';
                if (!strcmp (ent, "lt"))
                    c = '<';
                if (!strcmp (ent, "amp"))
                    c = '&';
                *amp = 1;
                break;
            }
            else if (c == 0 || d1_isspace(c))
                break;
            /* names longer than the buffer are silently truncated */
            if (i < 19)
                ent[i++] = c;
        }
    }
    return c;
#endif
}
585 
/*
 * Read the attribute list of a tag from the input.
 *
 * On entry *ch / *amp hold the current (already read) character and
 * its entity flag. Reading continues until end of input, '>' or '/';
 * on return *ch holds that terminating character.
 *
 * Each attribute becomes a data1_xattr allocated from m with
 * what == DATA1I_xmltext. The value may be double-quoted,
 * single-quoted or unquoted; an attribute without '=' gets a null
 * value. wrbuf is used as scratch space and is overwritten.
 *
 * Returns the head of the attribute list (0 if there were none).
 */
data1_xattr *data1_read_xattr (data1_handle dh, NMEM m,
			       int (*get_byte)(void *fh), void *fh,
			       WRBUF wrbuf, int *ch, int *amp)
{
    data1_xattr *p_first = 0;
    data1_xattr **pp = &p_first;
    int c = *ch;
    for (;;)
    {
	data1_xattr *p;
	/* skip white space (and any entity-decoded characters) */
	while (*amp || (c && d1_isspace(c)))
	    c = ampr (get_byte, fh, amp);
	/* end of input or end of tag: no more attributes */
	if (*amp == 0 && (c == 0 || c == '>' || c == '/'))
	    break;
	*pp = p = (data1_xattr *) nmem_malloc (m, sizeof(*p));
	p->next = 0;
	pp = &p->next;
	p->value = 0;
        p->what = DATA1I_xmltext;

	/* attribute name: everything up to '=', '>', '/' or white space */
	wrbuf_rewind(wrbuf);
	while (c && c != '=' && c != '>' && c != '/' && !d1_isspace(c))
	{
	    wrbuf_putc (wrbuf, c);
	    c = ampr (get_byte, fh, amp);
	}
	p->name = nmem_strdup (m, wrbuf_cstr(wrbuf));
	if (c == '=')
	{
	    c = ampr (get_byte, fh, amp);
	    if (*amp == 0 && c == '"')
	    {
		/* "value": read up to the closing double quote */
		c = ampr (get_byte, fh, amp);
		wrbuf_rewind(wrbuf);
		while (*amp || (c && c != '"'))
		{
		    wrbuf_putc (wrbuf, c);
		    c = ampr (get_byte, fh, amp);
	        }
	        /* step past the closing quote unless input ended */
	        if (c)
		    c = ampr (get_byte, fh, amp);
	    }
	    else if (*amp == 0 && c == '\'')
	    {
		/* 'value': read up to the closing single quote */
		c = ampr (get_byte, fh, amp);
		wrbuf_rewind(wrbuf);
		while (*amp || (c && c != '\''))
		{
		    wrbuf_putc (wrbuf, c);
		    c = ampr (get_byte, fh, amp);
	        }
	        /* step past the closing quote unless input ended */
	        if (c)
		    c = ampr (get_byte, fh, amp);
	    }
	    else
	    {
	        /* unquoted value: runs until '>' or '/' (white space
	           does not end it) */
	        wrbuf_rewind(wrbuf);
	        while (*amp || (c && c != '>' && c != '/'))
	        {
		    wrbuf_putc (wrbuf, c);
		    c = ampr (get_byte, fh, amp);
	        }
            }
	    p->value = nmem_strdup(m, wrbuf_cstr(wrbuf));
	}
    }
    *ch = c;
    return p_first;
}
655 
656 /*
657  * Ugh. Sometimes functions just grow and grow on you. This one reads a
658  * 'node' and its children.
659  */
data1_read_nodex(data1_handle dh,NMEM m,int (* get_byte)(void * fh),void * fh,WRBUF wrbuf)660 data1_node *data1_read_nodex (data1_handle dh, NMEM m,
661 			      int (*get_byte)(void *fh), void *fh, WRBUF wrbuf)
662 {
663     data1_node *d1_stack[256];
664     data1_node *res;
665     int c, amp;
666     int level = 0;
667     int line = 1;
668 
669     d1_stack[level] = 0;
670     c = ampr (get_byte, fh, &amp);
671     while (c != '\0')
672     {
673 	data1_node *parent = level ? d1_stack[level-1] : 0;
674 
675 	if (amp == 0 && c == '<') /* beginning of tag */
676 	{
677 	    data1_xattr *xattr;
678 
679 	    char tag[256];
680 	    int null_tag = 0;
681 	    int end_tag = 0;
682 	    size_t i = 0;
683 
684 	    c = ampr (get_byte, fh, &amp);
685 	    if (amp == 0 && c == '/')
686 	    {
687 		end_tag = 1;
688 		c = ampr (get_byte, fh, &amp);
689 	    }
690 	    else if (amp == 0 && c == '?')
691 	    {
692 		int quote_mode = 0;
693 		while ((c = ampr(get_byte, fh, &amp)))
694 		{
695 		    if (amp)
696 			continue;
697 		    if (quote_mode == 0)
698 		    {
699 			if (c == '"')
700 			    quote_mode = c;
701 			else if (c == '\'')
702 			    quote_mode = c;
703 			else if (c == '>')
704 			{
705 			    c = ampr(get_byte, fh, &amp);
706 			    break;
707 			}
708 		    }
709 		    else
710 		    {
711 			if (amp == 0 && c == quote_mode)
712 			    quote_mode = 0;
713 		    }
714 		}
715 		continue;
716 	    }
717 	    else if (amp == 0 && c == '!')
718 	    {
719                 int c0, amp0;
720 
721                 wrbuf_rewind(wrbuf);
722 
723                 c0 = ampr (get_byte, fh, &amp0);
724                 if (amp0 == 0 && c0 == '\0')
725                     break;
726                 c = ampr (get_byte, fh, &amp);
727 
728                 if (amp0 == 0 && c0 == '-' && amp == 0 && c == '-')
729                 {
730                     /* COMMENT: <!-- ... --> */
731                     int no_dash = 0;
732 
733                     c = ampr (get_byte, fh, &amp);
734                     while (amp || c)
735                     {
736                         if (amp == 0 && c == '-')
737                             no_dash++;
738                         else if (amp == 0 && c == '>' && no_dash >= 2)
739                         {
740                             if (level > 0)
741                                 d1_stack[level] =
742                                     data1_mk_comment_n (
743                                         dh, m,
744                                         wrbuf_buf(wrbuf), wrbuf_len(wrbuf)-2,
745                                         d1_stack[level-1]);
746                             c = ampr (get_byte, fh, &amp); /* skip > */
747                             break;
748                         }
749                         else
750                             no_dash = 0;
751                         wrbuf_putc (wrbuf, c);
752                         c = ampr (get_byte, fh, &amp);
753                     }
754                     continue;
755                 }
756                 else
757                 {   /* DIRECTIVE: <! .. > */
758 
759 		    int blevel = 0;
760                     while (amp || c)
761                     {
762                         if (amp == 0 && c == '>' && blevel == 0)
763                         {
764                             c = ampr (get_byte, fh, &amp);
765 			    break;
766                         }
767 			if (amp == 0 && c == '[')
768 			    blevel++;
769 			if (amp == 0 && c == ']' && blevel > 0)
770 			    blevel--;
771                         c = ampr (get_byte, fh, &amp);
772                     }
773                     continue;
774                 }
775 	    }
776 	    while (amp || (c && c != '>' && c != '/' && !d1_isspace(c)))
777 	    {
778 		if (i < (sizeof(tag)-1))
779 		    tag[i++] = c;
780 		c = ampr (get_byte, fh, &amp);
781 	    }
782 	    tag[i] = '\0';
783 	    xattr = data1_read_xattr (dh, m, get_byte, fh, wrbuf, &c, &amp);
784 	    if (amp == 0 && c == '/')
785 	    {    /* <tag attrs/> or <tag/> */
786 		null_tag = 1;
787 		c = ampr (get_byte, fh, &amp);
788 	    }
789 	    if (amp || c != '>')
790 	    {
791 		yaz_log(YLOG_WARN, "d1: %d: Malformed tag", line);
792 		return 0;
793 	    }
794 	    else
795 		c = ampr (get_byte, fh, &amp);
796 
797 	    /* End tag? */
798 	    if (end_tag)
799 	    {
800 		if (*tag == '\0')
801 		    --level;        /* </> */
802 		else
803 		{                   /* </tag> */
804 		    int i = level;
805 		    while (i > 0)
806 		    {
807 			parent = d1_stack[--i];
808 			if ((parent->which == DATA1N_root &&
809 			     !strcmp(tag, parent->u.root.type)) ||
810 			    (parent->which == DATA1N_tag &&
811 			     !strcmp(tag, parent->u.tag.tag)))
812 			{
813 			    level = i;
814 			    break;
815 			}
816 		    }
817 		    if (i != level)
818 		    {
819 			yaz_log (YLOG_WARN, "%d: no begin tag for %s",
820 				 line, tag);
821 			break;
822 		    }
823 		}
824                 if (data1_is_xmlmode(dh))
825                 {
826                     if (level <= 1)
827                         return d1_stack[0];
828                 }
829                 else
830                 {
831                     if (level <= 0)
832                         return d1_stack[0];
833                 }
834 		continue;
835 	    }
836 	    else if (!strcmp(tag, "var")
837 		     && xattr && xattr->next && xattr->next->next
838 		     && xattr->value == 0
839 		     && xattr->next->value == 0
840 		     && xattr->next->next->value == 0)
841 	    {
842 		/* <var class type value> */
843 		const char *tclass = xattr->name;
844 		const char *type = xattr->next->name;
845 		const char *value = xattr->next->name;
846 		data1_vartype *tp;
847 
848 		yaz_log(YLOG_LOG, "Variant class=%s type=%s value=%s",
849 			tclass, type, value);
850 		if (!(tp =
851 		      data1_getvartypebyct(dh,
852 					   parent->root->u.root.absyn->varset,
853 					   tclass, type)))
854 		    continue;
855 		/*
856 		 * If we're the first variant in this group, create a parent
857 		 * variant, and insert it before the current variant.
858 		 */
859 		if (parent->which != DATA1N_variant)
860 		{
861 		    res = data1_mk_node2 (dh, m, DATA1N_variant, parent);
862 		}
863 		else
864 		{
865 		    /*
866 		     * now determine if one of our ancestor triples is of
867 		     * same type. If so, we break here.
868 		     */
869 		    int i;
870 		    for (i = level-1; d1_stack[i]->which==DATA1N_variant; --i)
871 			if (d1_stack[i]->u.variant.type == tp)
872 			{
873 			    level = i;
874 			    break;
875 			}
876 		    res = data1_mk_node2 (dh, m, DATA1N_variant, parent);
877 		    res->u.variant.type = tp;
878 		    res->u.variant.value =
879 			data1_insert_string (dh, res, m, value);
880 		}
881 	    }
882 	    else
883             {
884 
885                 /* tag .. acquire our element in the abstract syntax */
886                 if (level == 0)
887                 {
888                     parent = data1_mk_root (dh, m, tag);
889                     res = d1_stack[level] = parent;
890 
891                     if (data1_is_xmlmode(dh))
892                     {
893                         level++;
894                         res = data1_mk_tag (dh, m, tag, 0 /* attr */, parent);
895                         res->u.tag.attributes = xattr;
896                     }
897                 }
898                 else
899                 {
900                     res = data1_mk_tag (dh, m, tag, 0 /* attr */, parent);
901                     res->u.tag.attributes = xattr;
902                 }
903             }
904 	    d1_stack[level] = res;
905 	    d1_stack[level+1] = 0;
906 	    if (level < 250 && !null_tag)
907 		++level;
908 	}
909 	else /* != '<'... this is a body of text */
910 	{
911 	    int len;
912 
913 	    if (level == 0)
914 	    {
915 		c = ampr (get_byte, fh, &amp);
916 		continue;
917 	    }
918 	    res = data1_mk_node2 (dh, m, DATA1N_data, parent);
919 	    res->u.data.what = DATA1I_xmltext;
920 	    res->u.data.formatted_text = 0;
921 	    d1_stack[level] = res;
922 
923 	    wrbuf_rewind(wrbuf);
924 
925 	    while (amp || (c && c != '<'))
926 	    {
927 		wrbuf_putc (wrbuf, c);
928 		c = ampr (get_byte, fh, &amp);
929 	    }
930 	    len = wrbuf_len(wrbuf);
931 
932 	    /* use local buffer of nmem if too large */
933 	    if (len >= DATA1_LOCALDATA)
934 		res->u.data.data = (char*) nmem_malloc (m, len);
935 	    else
936 		res->u.data.data = res->lbuf;
937 
938             if (len)
939                 memcpy (res->u.data.data, wrbuf_buf(wrbuf), len);
940             else
941                 res->u.data.data = 0;
942             res->u.data.len = len;
943 	}
944     }
945     return 0;
946 }
947 
/*
 * Byte reader over an in-memory, NUL-terminated string.
 *
 * fh points to a `const char *` cursor.  Each call returns the character
 * under the cursor and moves the cursor one position forward.  When the
 * cursor sits on the terminating NUL, 0 is returned and the cursor is
 * left untouched, so repeated calls keep returning 0.
 */
int getc_mem (void *fh)
{
    const char **cursor = (const char **) fh;
    char ch = **cursor;

    if (ch == '\0')
        return 0;
    ++*cursor;
    return ch;
}
955 
data1_read_node(data1_handle dh,const char ** buf,NMEM m)956 data1_node *data1_read_node (data1_handle dh, const char **buf, NMEM m)
957 {
958     WRBUF wrbuf = wrbuf_alloc();
959     data1_node *node;
960 
961     node = data1_read_nodex(dh, m, getc_mem, (void *) (buf), wrbuf);
962     wrbuf_destroy(wrbuf);
963     return node;
964 }
965 
966 /*
967  * Read a record in the native syntax.
968  */
data1_read_record(data1_handle dh,int (* rf)(void *,char *,size_t),void * fh,NMEM m)969 data1_node *data1_read_record(data1_handle dh,
970 			      int (*rf)(void *, char *, size_t), void *fh,
971                               NMEM m)
972 {
973     int *size;
974     char **buf = data1_get_read_buf (dh, &size);
975     const char *bp;
976     int rd = 0, res;
977 
978     if (!*buf)
979 	*buf = (char *)xmalloc(*size = 4096);
980 
981     for (;;)
982     {
983 	if (rd + 2048 >= *size && !(*buf =(char *)xrealloc(*buf, *size *= 2)))
984 	    abort();
985 	if ((res = (*rf)(fh, *buf + rd, 2048)) <= 0)
986 	{
987 	    if (!res)
988 	    {
989 		bp = *buf;
990 		(*buf)[rd] = '\0';
991 		return data1_read_node(dh, &bp, m);
992 	    }
993 	    else
994 		return 0;
995 	}
996 	rd += res;
997     }
998 }
999 
data1_read_sgml(data1_handle dh,NMEM m,const char * buf)1000 data1_node *data1_read_sgml (data1_handle dh, NMEM m, const char *buf)
1001 {
1002     const char *bp = buf;
1003     return data1_read_node (dh, &bp, m);
1004 }
1005 
1006 
/*
 * Convert inlen bytes starting at inbuf with the iconv handle t, leaving
 * the converted text in wrbuf.
 *
 * wrbuf is rewound first, so any previous content is discarded.  After
 * writing, wrbuf_iconv_reset() is called on the same buffer and handle.
 * The m argument is not used.
 *
 * Always returns 0.
 */
static int conv_item(NMEM m, yaz_iconv_t t,
                     WRBUF wrbuf, char *inbuf, size_t inlen)
{
    (void) m;

    wrbuf_rewind(wrbuf);
    wrbuf_iconv_write(wrbuf, t, inbuf, inlen);
    wrbuf_iconv_reset(wrbuf, t);

    return 0;
}
1015 
/*
 * Run character set conversion over node n, all of its following siblings
 * and, recursively, all of their children.
 *
 * t is the conversion handle and wrbuf is scratch space that conv_item()
 * fills for each item.  What is rewritten depends on the node kind:
 *
 *  - data and comment nodes: u.data.data / u.data.len are replaced by the
 *    converted text (stored through data1_insert_string_n()).
 *  - tag nodes: the tag name is replaced by its converted form, and every
 *    attribute with a non-NULL value gets an nmem copy of the converted
 *    value.
 *  - preprocess nodes whose target is "xml": every attribute named
 *    "encoding" has its value set to an nmem copy of tocode.
 *
 * Other node kinds are left untouched (their children are still visited).
 */
static void data1_iconv_s (data1_handle dh, NMEM m, data1_node *n,
                           yaz_iconv_t t, WRBUF wrbuf, const char *tocode)
{
    data1_node *cur;

    for (cur = n; cur; cur = cur->next)
    {
        if (cur->which == DATA1N_data || cur->which == DATA1N_comment)
        {
            int rc = conv_item (m, t, wrbuf,
                                cur->u.data.data, cur->u.data.len);
            if (rc == 0)
            {
                cur->u.data.data =
                    data1_insert_string_n (dh, cur, m,
                                           wrbuf->buf, wrbuf->pos);
                cur->u.data.len = wrbuf->pos;
            }
        }
        else if (cur->which == DATA1N_tag)
        {
            data1_xattr *attr;
            size_t tag_len = strlen(cur->u.tag.tag);

            if (conv_item (m, t, wrbuf, cur->u.tag.tag, tag_len) == 0)
            {
                cur->u.tag.tag =
                    data1_insert_string_n (dh, cur, m,
                                           wrbuf->buf, wrbuf->pos);
            }

            for (attr = cur->u.tag.attributes; attr; attr = attr->next)
            {
                /* attributes without a value have nothing to convert */
                if (!attr->value)
                    continue;
                if (conv_item(m, t, wrbuf, attr->value,
                              strlen(attr->value)) != 0)
                    continue;
                attr->value = nmem_strdup(m, wrbuf_cstr(wrbuf));
            }
        }
        else if (cur->which == DATA1N_preprocess)
        {
            /* keep the <?xml ... encoding="..."?> header in sync */
            if (strcmp(cur->u.preprocess.target, "xml") == 0)
            {
                data1_xattr *attr;

                for (attr = cur->u.preprocess.attributes; attr;
                     attr = attr->next)
                {
                    if (strcmp (attr->name, "encoding") == 0)
                        attr->value = nmem_strdup (m, tocode);
                }
            }
        }

        data1_iconv_s (dh, m, cur->child, t, wrbuf, tocode);
    }
}
1068 
/*
 * Determine the character encoding that applies to the tree rooted at n.
 *
 * The sources are consulted in this order:
 *  1. If the first child of n is a preprocess node with target "xml",
 *     the value of its "encoding" attribute.
 *  2. If n is a root node with an abstract syntax that names an encoding,
 *     that encoding.
 *  3. The hard coded default "ISO-8859-1".
 *
 * dh is not used.  n may be NULL, in which case the default is returned.
 * The returned string is never NULL; it is owned by the tree / abstract
 * syntax (or is a string literal) and must not be freed by the caller.
 */
const char *data1_get_encoding (data1_handle dh, data1_node *n)
{
    /* see if we have an xml header that specifies encoding */
    if (n && n->child && n->child->which == DATA1N_preprocess &&
        strcmp (n->child->u.preprocess.target, "xml") == 0)
    {
        data1_xattr *xp = n->child->u.preprocess.attributes;
        for (; xp; xp = xp->next)
        {
            /*
             * Match the attribute that IS named "encoding".  The former
             * test, `!strcmp(...) == 0`, parsed as `(!strcmp(...)) == 0`
             * and therefore picked the first attribute with any other
             * name (typically "version").
             * An "encoding" attribute without a value is ignored so that
             * the fallbacks below still apply.
             */
            if (strcmp (xp->name, "encoding") == 0 && xp->value)
                return xp->value;
        }
    }
    /* no encoding in header, so see if "encoding" was specified for abs */
    if (n && n->which == DATA1N_root &&
        n->u.root.absyn && n->u.root.absyn->encoding)
        return n->u.root.absyn->encoding;
    /* none of above, return a hard coded default */
    return "ISO-8859-1";
}
1087 
/*
 * Convert the text of the tree at n (n, its siblings and all descendants)
 * from character set fromcode to character set tocode.
 *
 * Nothing is done when yaz_matchstr(tocode, fromcode) yields 0
 * (presumably meaning the two names denote the same encoding - confirm
 * against the YAZ documentation).  Otherwise a conversion handle and a
 * scratch WRBUF are created, data1_iconv_s() walks the tree, and both are
 * released again.
 *
 * Returns 0 on success (including the "nothing to do" case) and -1 when
 * yaz_iconv_open() could not provide a conversion handle.
 */
int data1_iconv (data1_handle dh, NMEM m, data1_node *n,
                 const char *tocode,
                 const char *fromcode)
{
    WRBUF scratch;
    yaz_iconv_t cd;

    if (!yaz_matchstr (tocode, fromcode))
        return 0;

    scratch = wrbuf_alloc();
    cd = yaz_iconv_open(tocode, fromcode);
    if (!cd)
    {
        wrbuf_destroy(scratch);
        return -1;
    }

    data1_iconv_s(dh, m, n, cd, scratch, tocode);

    yaz_iconv_close(cd);
    wrbuf_destroy(scratch);
    return 0;
}
1107 
/*
 * Strip leading and trailing white space (as judged by d1_isspace) from
 * every data node among n, its following siblings and all of their
 * descendants.
 *
 * For each data node the trimmed text is copied into a block freshly
 * allocated from m, and u.data.data / u.data.len are pointed at that
 * copy.  A node consisting solely of white space ends up with length 0.
 * Nodes of other kinds are left as they are; their children are still
 * visited.
 */
void data1_chop_text(data1_handle dh, NMEM m, data1_node *n)
{
    for (; n; n = n->next)
    {
        if (n->which == DATA1N_data)
        {
            const char *head = n->u.data.data;
            int remaining = n->u.data.len;
            char *trimmed;

            /* drop leading white space */
            while (remaining > 0 && d1_isspace(*head))
            {
                head++;
                remaining--;
            }

            /* drop trailing white space */
            while (remaining != 0 && d1_isspace(head[remaining - 1]))
                remaining--;

            trimmed = nmem_malloc(m, remaining);
            memcpy(trimmed, head, remaining);
            n->u.data.data = trimmed;
            n->u.data.len = remaining;
        }
        data1_chop_text(dh, m, n->child);
    }
}
1136 
/*
 * Merge runs of adjacent data nodes into a single data node, for n, its
 * following siblings and all of their descendants.
 *
 * When a data node is directly followed by another data node, the text of
 * the whole run of consecutive data siblings is concatenated into a block
 * allocated from m.  The first node of the run receives the combined text
 * and length, and its next pointer is set to the node following the run,
 * which unlinks the other nodes of the run from the sibling chain.  If
 * the run reached the end of the chain, the parent's last_child is
 * updated to the surviving node.
 */
void data1_concat_text(data1_handle dh, NMEM m, data1_node *n)
{
    for (; n; n = n->next)
    {
        int starts_run = n->which == DATA1N_data && n->next &&
            n->next->which == DATA1N_data;

        if (starts_run)
        {
            data1_node *run_end = n;   /* first node past the run */
            data1_node *piece;
            char *merged;
            int total = 0;
            int written = 0;

            while (run_end && run_end->which == DATA1N_data)
            {
                total += run_end->u.data.len;
                run_end = run_end->next;
            }

            merged = nmem_malloc(m, total);
            for (piece = n; piece != run_end; piece = piece->next)
            {
                memcpy(merged + written, piece->u.data.data,
                       piece->u.data.len);
                written += piece->u.data.len;
            }

            n->u.data.data = merged;
            n->u.data.len = total;
            n->next = run_end;
            if (!run_end && n->parent)
                n->parent->last_child = n;
        }
        data1_concat_text(dh, m, n->child);
    }
}
1166 
1167 /*
1168  * Local variables:
1169  * c-basic-offset: 4
1170  * c-file-style: "Stroustrup"
1171  * indent-tabs-mode: nil
1172  * End:
1173  * vim: shiftwidth=4 tabstop=8 expandtab
1174  */
1175 
1176