1 /**
2 * Copyright 2008 Digital Bazaar, Inc.
3 *
4 * This file is part of librdfa.
5 *
6 * librdfa is Free Software, and can be licensed under any of the
7 * following three licenses:
8 *
9 * 1. GNU Lesser General Public License (LGPL) V2.1 or any
10 * newer version
11 * 2. GNU General Public License (GPL) V2 or any newer version
12 * 3. Apache License, V2.0 or any newer version
13 *
14 * You may not use this file except in compliance with at least one of
15 * the above three licenses.
16 *
17 * See LICENSE-* at the top of this software distribution for more
18 * information regarding the details of each license.
19 *
20 * The librdfa library is the Fastest RDFa Parser in the Universe. It is
21 * a stream parser, meaning that it takes XML data as input and spits
22 * out RDF triples as it comes across them in the stream. Due to this
23 * processing approach, librdfa has a very, very small memory footprint.
24 * It is also very fast and can operate on hundreds of gigabytes of XML
25 * data without breaking a sweat.
26 *
27 * Usage:
28 *
29 * rdfacontext* context = rdfa_create_context(BASE_URI);
30 * context->callback_data = your_user_data;
31 * rdfa_set_triple_handler(context, &process_triple);
32 * rdfa_set_buffer_filler(context, &fill_buffer);
33 * rdfa_parse(context);
34 * rdfa_free_context(context);
35 *
36 * @author Manu Sporny
37 */
38 #include <stdlib.h>
39 #include <stdio.h>
40 #include <string.h>
41 #include "rdfa_utils.h"
42 #include "rdfa.h"
43
44 #define READ_BUFFER_SIZE 4096
45
/**
 * Puts the given context into its initial processing state. The
 * [parent subject] is derived from context->base when a base is set;
 * fresh lists are created for the (local) incomplete triples and, outside
 * of raptor builds, for the URI mappings; every other field that is
 * touched here is cleared to NULL or zero, except the [recurse] flag which
 * is set to true.
 *
 * Fields are overwritten without being released first.
 *
 * @param context the context to initialize.
 */
void rdfa_init_context(rdfacontext* context)
{
   // ---- evaluation context ----

   // the [parent subject] is set to the [base] value (null without a base)
   context->parent_subject = NULL;
   if(context->base != NULL)
   {
      char* base_iri = rdfa_iri_get_base(context->base);
      context->parent_subject = rdfa_replace_string(NULL, base_iri);
      free(base_iri);
   }

   // the [parent object] is set to null
   context->parent_object = NULL;

#ifndef LIBRDFA_IN_RAPTOR
   // the [list of URI mappings] is cleared
   context->uri_mappings = (char**)rdfa_create_mapping(MAX_URI_MAPPINGS);
#endif

   // the [list of incomplete triples] is cleared
   context->incomplete_triples = rdfa_create_list(3);

   // the [language] is set to null
   context->language = NULL;

   // ---- 1. local values ----

   // the [recurse] flag is 'true', the [skip element] flag is 'false'
   context->recurse = 1;
   context->skip_element = 0;

   // [new subject] and [current object resource] are null
   context->new_subject = NULL;
   context->current_object_resource = NULL;

   // NOTE: the [local list of URI mappings] and the [current language]
   // are taken over from the evaluation context in
   // rdfa_create_new_element_context()

   // the [local list of incomplete triples] starts out empty
   context->local_incomplete_triples = rdfa_create_list(3);

   // ---- bookkeeping that is not part of the RDFa spec ----
   // (initialized to keep the C compiler and valgrind happy)
   context->bnode_count = 0;
   context->underscore_colon_bnode_name = NULL;

   context->xml_literal_namespaces_defined = 0;
   context->xml_literal_xml_lang_defined = 0;

   context->content = NULL;
   context->datatype = NULL;
   context->property = NULL;

   context->plain_literal = NULL;
   context->plain_literal_size = 0;
   context->xml_literal = NULL;
   context->xml_literal_size = 0;

   // FIXME: completing incomplete triples always happens now, change
   //        all of the code to reflect that.
   // NOTE: context->callback_data is intentionally left untouched here.
}
117
118 /**
119 * Read the head of the XHTML document and determines the base IRI for
120 * the document.
121 *
122 * @param context the current working context.
123 * @param working_buffer the current working buffer.
124 * @param wb_allocated the number of bytes that have been allocated to
125 * the working buffer.
126 *
127 * @return the size of the data available in the working buffer.
128 */
rdfa_init_base(rdfacontext * context,char ** working_buffer,size_t * working_buffer_size,char * temp_buffer,size_t bytes_read)129 static size_t rdfa_init_base(
130 rdfacontext* context, char** working_buffer, size_t* working_buffer_size,
131 char* temp_buffer, size_t bytes_read)
132 {
133 char* head_end = NULL;
134 size_t offset = context->wb_offset;
135 int needed_size = (offset + bytes_read) - *working_buffer_size;
136
137 // search for the end of <head>, stop if <head> was found
138
139 // extend the working buffer size
140 if(needed_size > 0)
141 {
142 size_t temp_buffer_size = sizeof(char) * READ_BUFFER_SIZE;
143 if((size_t)needed_size > temp_buffer_size)
144 temp_buffer_size += needed_size;
145
146 *working_buffer_size += temp_buffer_size;
147 // +1 for NUL at end, to allow strstr() etc. to work
148 *working_buffer = (char*)realloc(*working_buffer, *working_buffer_size + 1);
149 }
150
151 // append to the working buffer
152 memmove(*working_buffer + offset, temp_buffer, bytes_read);
153 // ensure the buffer is a NUL-terminated string
154 *(*working_buffer + offset + bytes_read) = '\0';
155
156 // search for the end of </head> in
157 head_end = strstr(*working_buffer, "</head>");
158 if(head_end == NULL)
159 head_end = strstr(*working_buffer, "</HEAD>");
160
161 context->wb_offset += bytes_read;
162
163 if(head_end == NULL)
164 return bytes_read;
165
166 // if </head> was found, search for <base and extract the base URI
167 if(head_end != NULL)
168 {
169 char* base_start = strstr(*working_buffer, "<base ");
170 if(base_start == NULL)
171 base_start = strstr(*working_buffer, "<BASE ");
172
173 if(base_start != NULL)
174 {
175 char* href_start = strstr(base_start, "href=");
176 char* uri_start = href_start + 6;
177 char* uri_end = strchr(uri_start, '"');
178
179 if((uri_start != NULL) && (uri_end != NULL))
180 {
181 if(*uri_start != '"')
182 {
183 size_t uri_size = uri_end - uri_start;
184 char* temp_uri = (char*)malloc(sizeof(char) * uri_size + 1);
185 char* cleaned_base;
186 strncpy(temp_uri, uri_start, uri_size);
187 temp_uri[uri_size] = '\0';
188
189 // TODO: This isn't in the processing rules, should it
190 // be? Setting current_object_resource will make
191 // sure that the BASE element is inherited by all
192 // subcontexts.
193 cleaned_base = rdfa_iri_get_base(temp_uri);
194 context->current_object_resource =
195 rdfa_replace_string(
196 context->current_object_resource, cleaned_base);
197
198 // clean up the base context
199 context->base =
200 rdfa_replace_string(context->base, cleaned_base);
201 free(cleaned_base);
202 free(temp_uri);
203 }
204 }
205 }
206 }
207
208 return bytes_read;
209 }
210
211 /**
212 * Creates a new context for the current element by cloning certain
213 * parts of the old context on the top of the given stack.
214 *
215 * @param context_stack the context stack that is associated with this
216 * processing run.
217 */
rdfa_create_new_element_context(rdfalist * context_stack)218 static rdfacontext* rdfa_create_new_element_context(rdfalist* context_stack)
219 {
220 rdfacontext* parent_context = (rdfacontext*)
221 context_stack->items[context_stack->num_items - 1]->data;
222 rdfacontext* rval = rdfa_create_context(parent_context->base);
223
224 // * Otherwise, the values are:
225
226 // * the [ base ] is set to the [ base ] value of the current
227 // [ evaluation context ];
228 rval->base = rdfa_replace_string(rval->base, parent_context->base);
229 rdfa_init_context(rval);
230
231 // copy the URI mappings
232 #ifndef LIBRDFA_IN_RAPTOR
233 if(rval->uri_mappings != NULL)
234 {
235 rdfa_free_mapping(rval->uri_mappings);
236 }
237 rval->uri_mappings = rdfa_copy_mapping(parent_context->uri_mappings);
238 #endif
239
240 // inherit the parent context's language
241 if(parent_context->language != NULL)
242 {
243 rval->language =
244 rdfa_replace_string(rval->language, parent_context->language);
245 }
246
247 // set the triple callback
248 rval->triple_callback = parent_context->triple_callback;
249 rval->buffer_filler_callback = parent_context->buffer_filler_callback;
250
251 // inherit the bnode count, _: bnode name, recurse flag, and state
252 // of the xml_literal_namespace_insertion
253 rval->bnode_count = parent_context->bnode_count;
254 rval->underscore_colon_bnode_name =
255 rdfa_replace_string(rval->underscore_colon_bnode_name,
256 parent_context->underscore_colon_bnode_name);
257 rval->recurse = parent_context->recurse;
258 rval->skip_element = 0;
259 rval->callback_data = parent_context->callback_data;
260 rval->xml_literal_namespaces_defined =
261 parent_context->xml_literal_namespaces_defined;
262 rval->xml_literal_xml_lang_defined =
263 parent_context->xml_literal_xml_lang_defined;
264
265 // inherit the parent context's new_subject
266 // TODO: This is not anywhere in the syntax processing document
267 //if(parent_context->new_subject != NULL)
268 //{
269 // rval->new_subject = rdfa_replace_string(
270 // rval->new_subject, parent_context->new_subject);
271 //}
272
273 if(parent_context->skip_element == 0)
274 {
275 // o the [ parent subject ] is set to the value of [ new subject ],
276 // if non-null, or the value of the [ parent subject ] of the
277 // current [ evaluation context ];
278 if(parent_context->new_subject != NULL)
279 {
280 rval->parent_subject = rdfa_replace_string(
281 rval->parent_subject, parent_context->new_subject);
282 }
283 else
284 {
285 rval->parent_subject = rdfa_replace_string(
286 rval->parent_subject, parent_context->parent_subject);
287 }
288
289 // o the [ parent object ] is set to value of [ current object
290 // resource ], if non-null, or the value of [ new subject ], if
291 // non-null, or the value of the [ parent subject ] of the
292 // current [ evaluation context ];
293 if(parent_context->current_object_resource != NULL)
294 {
295 rval->parent_object =
296 rdfa_replace_string(
297 rval->parent_object, parent_context->current_object_resource);
298 }
299 else if(parent_context->new_subject != NULL)
300 {
301 rval->parent_object =
302 rdfa_replace_string(
303 rval->parent_object, parent_context->new_subject);
304 }
305 else
306 {
307 rval->parent_object =
308 rdfa_replace_string(
309 rval->parent_object, parent_context->parent_subject);
310 }
311
312 // copy the incomplete triples
313 if(rval->incomplete_triples != NULL)
314 {
315 rdfa_free_list(rval->incomplete_triples);
316 }
317
318 // o the [ list of incomplete triples ] is set to the [ local list
319 // of incomplete triples ];
320 rval->incomplete_triples =
321 rdfa_copy_list(parent_context->local_incomplete_triples);
322 }
323 else
324 {
325 rval->parent_subject = rdfa_replace_string(
326 rval->parent_subject, parent_context->parent_subject);
327 rval->parent_object = rdfa_replace_string(
328 rval->parent_object, parent_context->parent_object);
329
330 // copy the incomplete triples
331 if(rval->incomplete_triples != NULL)
332 {
333 rdfa_free_list(rval->incomplete_triples);
334 }
335
336 rval->incomplete_triples =
337 rdfa_copy_list(parent_context->incomplete_triples);
338
339 // copy the local list of incomplete triples
340 if(rval->local_incomplete_triples != NULL)
341 {
342 rdfa_free_list(rval->local_incomplete_triples);
343 }
344
345 rval->local_incomplete_triples =
346 rdfa_copy_list(parent_context->local_incomplete_triples);
347 }
348
349 #ifdef LIBRDFA_IN_RAPTOR
350 rval->base_uri = parent_context->base_uri;
351 rval->sax2 = parent_context->sax2;
352 rval->namespace_handler = parent_context->namespace_handler;
353 rval->namespace_handler_user_data = parent_context->namespace_handler_user_data;
354 rval->error_handlers = parent_context->error_handlers;
355 #endif
356
357 return rval;
358 }
359
360
361 #ifdef LIBRDFA_IN_RAPTOR
362 static int
raptor_nspace_compare(const void * a,const void * b)363 raptor_nspace_compare(const void *a, const void *b)
364 {
365 raptor_namespace* ns_a=*(raptor_namespace**)a;
366 raptor_namespace* ns_b=*(raptor_namespace**)b;
367 if(!ns_a->prefix)
368 return 1;
369 else if(!ns_b->prefix)
370 return -1;
371 else
372 return strcmp((const char*)ns_b->prefix, (const char*)ns_a->prefix);
373 }
374 #endif
375
376 /**
377 * Handles the start_element call
378 */
379 static void XMLCALL
start_element(void * user_data,const char * name,const char ** attributes)380 start_element(void* user_data, const char* name, const char** attributes)
381 {
382 rdfalist* context_stack = (rdfalist*) user_data;
383 rdfacontext* context = rdfa_create_new_element_context(context_stack);
384 const char** aptr = attributes;
385 const char* xml_lang = NULL;
386 const char* about_curie = NULL;
387 char* about = NULL;
388 const char* src_curie = NULL;
389 char* src = NULL;
390 const char* type_of_curie = NULL;
391 rdfalist* type_of = NULL;
392 const char* rel_curie = NULL;
393 rdfalist* rel = NULL;
394 const char* rev_curie = NULL;
395 rdfalist* rev = NULL;
396 const char* property_curie = NULL;
397 rdfalist* property = NULL;
398 const char* resource_curie = NULL;
399 char* resource = NULL;
400 const char* href_curie = NULL;
401 char* href = NULL;
402 const char* content = NULL;
403 const char* datatype_curie = NULL;
404 char* datatype = NULL;
405
406 rdfa_push_item(context_stack, context, RDFALIST_FLAG_CONTEXT);
407
408 if(DEBUG)
409 {
410 printf("DEBUG: ------- START - %s -------\n", name);
411 }
412
413 // start the XML Literal text
414 if(context->xml_literal == NULL)
415 {
416 context->xml_literal = rdfa_replace_string(context->xml_literal, "<");
417 context->xml_literal_size = 1;
418 }
419 else
420 {
421 context->xml_literal = rdfa_n_append_string(
422 context->xml_literal, &context->xml_literal_size, "<", 1);
423 }
424 context->xml_literal = rdfa_n_append_string(
425 context->xml_literal, &context->xml_literal_size,
426 name, strlen(name));
427
428 if(!context->xml_literal_namespaces_defined)
429 {
430 // append namespaces to XML Literal
431 #ifdef LIBRDFA_IN_RAPTOR
432 raptor_namespace_stack* nstack = &context->sax2->namespaces;
433 raptor_namespace* ns;
434 raptor_namespace** ns_list = NULL;
435 size_t ns_size;
436 #else
437 char** umap = context->uri_mappings;
438 #endif
439 char* umap_key = NULL;
440 char* umap_value = NULL;
441
442 // if the namespaces are not defined, then neither is the xml:lang
443 context->xml_literal_xml_lang_defined = 0;
444
445 #ifdef LIBRDFA_IN_RAPTOR
446 ns_size = 0;
447 ns_list = raptor_namespace_stack_to_array(nstack, &ns_size);
448 qsort((void*)ns_list, ns_size, sizeof(raptor_namespace*),
449 raptor_nspace_compare);
450
451 while(ns_size > 0)
452 #else
453 while(*umap != NULL)
454 #endif
455 {
456 unsigned char insert_xmlns_definition = 1;
457 const char* attr = NULL;
458 const char* value = NULL;
459
460 // get the next mapping to process
461 #ifdef LIBRDFA_IN_RAPTOR
462 ns=ns_list[--ns_size];
463
464 umap_key = (char*)raptor_namespace_get_prefix(ns);
465 if(!umap_key)
466 umap_key=(char*)XMLNS_DEFAULT_MAPPING;
467 umap_value = (char*)raptor_uri_as_string_v2(context->sax2->world, raptor_namespace_get_uri(ns));
468 #else
469 rdfa_next_mapping(umap++, &umap_key, &umap_value);
470 umap++;
471 #endif
472
473 // check to make sure that the namespace isn't already
474 // defined in the current element.
475 if(attributes != NULL)
476 {
477 const char** attrs = attributes;
478 while((*attrs != NULL) && insert_xmlns_definition)
479 {
480 attr = *attrs++;
481 value = *attrs++;
482
483 // if the attribute is a umap_key, skip the definition
484 // of the attribute.
485 if((strcmp(attr, umap_key) == 0) ||
486 (strcmp(umap_key, XMLNS_DEFAULT_MAPPING) == 0))
487 {
488 insert_xmlns_definition = 0;
489 }
490 }
491 }
492
493 // if the namespace isn't already defined on the element,
494 // copy it to the XML Literal string.
495 if(insert_xmlns_definition)
496 {
497 // append the namespace attribute to the XML Literal
498 context->xml_literal = rdfa_n_append_string(
499 context->xml_literal, &context->xml_literal_size,
500 " xmlns", strlen(" xmlns"));
501
502 // check to see if we're dumping the standard XHTML namespace or
503 // a user-defined XML namespace
504 if(strcmp(umap_key, XMLNS_DEFAULT_MAPPING) != 0)
505 {
506 context->xml_literal = rdfa_n_append_string(
507 context->xml_literal, &context->xml_literal_size, ":", 1);
508 context->xml_literal = rdfa_n_append_string(
509 context->xml_literal, &context->xml_literal_size,
510 umap_key, strlen(umap_key));
511 }
512
513 // append the namespace value
514 context->xml_literal = rdfa_n_append_string(
515 context->xml_literal, &context->xml_literal_size, "=\"", 2);
516 context->xml_literal = rdfa_n_append_string(
517 context->xml_literal, &context->xml_literal_size,
518 umap_value, strlen(umap_value));
519 context->xml_literal = rdfa_n_append_string(
520 context->xml_literal, &context->xml_literal_size, "\"", 1);
521 }
522
523 insert_xmlns_definition = 1;
524 } /* end while umap not NULL */
525 context->xml_literal_namespaces_defined = 1;
526
527 #ifdef LIBRDFA_IN_RAPTOR
528 if(ns_list)
529 raptor_free_memory(ns_list);
530 #endif
531 } /* end if namespaces inserted */
532
533
534 // prepare all of the RDFa-specific attributes we are looking for.
535 // scan all of the attributes for the RDFa-specific attributes
536 if(aptr != NULL)
537 {
538 while(*aptr != NULL)
539 {
540 const char* attr;
541 const char* value;
542 char* literal_text;
543
544 attr = *aptr++;
545 value = *aptr++;
546
547 // append the attribute-value pair to the XML literal
548 literal_text = (char*)malloc(strlen(attr) + strlen(value) + 5);
549 sprintf(literal_text, " %s=\"%s\"", attr, value);
550 context->xml_literal = rdfa_n_append_string(
551 context->xml_literal, &context->xml_literal_size,
552 literal_text, strlen(literal_text));
553 free(literal_text);
554
555 // if xml:lang is defined, ensure that it is not overwritten
556 if(strcmp(attr, "xml:lang") == 0)
557 {
558 context->xml_literal_xml_lang_defined = 1;
559 }
560
561 // process all of the RDFa attributes
562 if(strcmp(attr, "about") == 0)
563 {
564 about_curie = value;
565 about = rdfa_resolve_curie(
566 context, about_curie, CURIE_PARSE_ABOUT_RESOURCE);
567 }
568 else if(strcmp(attr, "src") == 0)
569 {
570 src_curie = value;
571 src = rdfa_resolve_curie(context, src_curie, CURIE_PARSE_HREF_SRC);
572 }
573 else if(strcmp(attr, "typeof") == 0)
574 {
575 type_of_curie = value;
576 type_of = rdfa_resolve_curie_list(
577 context, type_of_curie,
578 CURIE_PARSE_INSTANCEOF_DATATYPE);
579 }
580 else if(strcmp(attr, "rel") == 0)
581 {
582 rel_curie = value;
583 rel = rdfa_resolve_curie_list(
584 context, rel_curie, CURIE_PARSE_RELREV);
585 }
586 else if(strcmp(attr, "rev") == 0)
587 {
588 rev_curie = value;
589 rev = rdfa_resolve_curie_list(
590 context, rev_curie, CURIE_PARSE_RELREV);
591 }
592 else if(strcmp(attr, "property") == 0)
593 {
594 property_curie = value;
595 property =
596 rdfa_resolve_curie_list(
597 context, property_curie, CURIE_PARSE_PROPERTY);
598 }
599 else if(strcmp(attr, "resource") == 0)
600 {
601 resource_curie = value;
602 resource = rdfa_resolve_curie(
603 context, resource_curie, CURIE_PARSE_ABOUT_RESOURCE);
604 }
605 else if(strcmp(attr, "href") == 0)
606 {
607 href_curie = value;
608 href =
609 rdfa_resolve_curie(context, href_curie, CURIE_PARSE_HREF_SRC);
610 }
611 else if(strcmp(attr, "content") == 0)
612 {
613 content = value;
614 }
615 else if(strcmp(attr, "datatype") == 0)
616 {
617 datatype_curie = value;
618
619 if(strlen(datatype_curie) == 0)
620 {
621 datatype = rdfa_replace_string(datatype, "");
622 }
623 else
624 {
625 datatype = rdfa_resolve_curie(context, datatype_curie,
626 CURIE_PARSE_INSTANCEOF_DATATYPE);
627 }
628 }
629 #ifndef LIBRDFA_IN_RAPTOR
630 else if(strcmp(attr, "xml:lang") == 0)
631 {
632 xml_lang = value;
633 }
634 else if(strstr(attr, "xmlns") != NULL)
635 {
636 // 2. Next the [current element] is parsed for
637 // [URI mapping]s and these are added to the
638 // [local list of URI mappings]. Note that a
639 // [URI mapping] will simply overwrite any current
640 // mapping in the list that has the same name;
641 rdfa_update_uri_mappings(context, attr, value);
642 }
643 #endif
644 }
645 }
646
647 #ifdef LIBRDFA_IN_RAPTOR
648 if(context->sax2) {
649 xml_lang = (const char*)raptor_sax2_inscope_xml_language(context->sax2);
650 if(!xml_lang)
651 xml_lang = "";
652 }
653 #endif
654 // check to see if we should append an xml:lang to the XML Literal
655 // if one is defined in the context and does not exist on the
656 // element.
657 if((xml_lang == NULL) && (context->language != NULL) &&
658 !context->xml_literal_xml_lang_defined)
659 {
660 context->xml_literal = rdfa_n_append_string(
661 context->xml_literal, &context->xml_literal_size,
662 " xml:lang=\"", strlen(" xml:lang=\""));
663 context->xml_literal = rdfa_n_append_string(
664 context->xml_literal, &context->xml_literal_size,
665 context->language, strlen(context->language));
666 context->xml_literal = rdfa_n_append_string(
667 context->xml_literal, &context->xml_literal_size, "\"", 1);
668
669 // ensure that the lang isn't set in a subtree (unless it's overwritten)
670 context->xml_literal_xml_lang_defined = 1;
671 }
672
673 // close the XML Literal value
674 context->xml_literal = rdfa_n_append_string(
675 context->xml_literal, &context->xml_literal_size, ">", 1);
676
677 // 3. The [current element] is also parsed for any language
678 // information, and [language] is set in the [current
679 // evaluation context];
680 rdfa_update_language(context, xml_lang);
681
682 /***************** FOR DEBUGGING PURPOSES ONLY ******************/
683 if(DEBUG)
684 {
685 if(about != NULL)
686 {
687 printf("DEBUG: @about = %s\n", about);
688 }
689 if(src != NULL)
690 {
691 printf("DEBUG: @src = %s\n", src);
692 }
693 if(type_of != NULL)
694 {
695 printf("DEBUG: @type_of = ");
696 rdfa_print_list(type_of);
697 }
698 if(rel != NULL)
699 {
700 printf("DEBUG: @rel = ");
701 rdfa_print_list(rel);
702 }
703 if(rev != NULL)
704 {
705 printf("DEBUG: @rev = ");
706 rdfa_print_list(rev);
707 }
708 if(property != NULL)
709 {
710 printf("DEBUG: @property = ");
711 rdfa_print_list(property);
712 }
713 if(resource != NULL)
714 {
715 printf("DEBUG: @resource = %s\n", resource);
716 }
717 if(href != NULL)
718 {
719 printf("DEBUG: @href = %s\n", href);
720 }
721 if(content != NULL)
722 {
723 printf("DEBUG: @content = %s\n", content);
724 }
725 if(datatype != NULL)
726 {
727 printf("DEBUG: @datatype = %s\n", datatype);
728 }
729 }
730
731 // TODO: This isn't part of the processing model, it needs to be
732 // included and is a correction for the last item in step #4.
733 if((about == NULL) && (src == NULL) && (type_of == NULL) &&
734 (rel == NULL) && (rev == NULL) && (property == NULL) &&
735 (resource == NULL) && (href == NULL))
736 {
737 context->skip_element = 1;
738 }
739
740 if((rel == NULL) && (rev == NULL))
741 {
742 // 4. If the [current element] contains no valid @rel or @rev
743 // URI, obtained according to the section on CURIE and URI
744 // Processing, then the next step is to establish a value for
745 // [new subject]. Any of the attributes that can carry a
746 // resource can set [new subject];
747 rdfa_establish_new_subject(
748 context, name, about, src, resource, href, type_of);
749 }
750 else
751 {
752 // 5. If the [current element] does contain a valid @rel or @rev
753 // URI, obtained according to the section on CURIE and URI
754 // Processing, then the next step is to establish both a value
755 // for [new subject] and a value for [current object resource]:
756 rdfa_establish_new_subject_with_relrev(
757 context, name, about, src, resource, href, type_of);
758 }
759
760 if(context->new_subject != NULL)
761 {
762 if(DEBUG)
763 {
764 printf("DEBUG: new_subject = %s\n", context->new_subject);
765 }
766
767 // 6. If in any of the previous steps a [new subject] was set to
768 // a non-null value,
769
770 // it is now used to provide a subject for type values;
771 if(type_of != NULL)
772 {
773 rdfa_complete_type_triples(context, type_of);
774 }
775
776 // Note that none of this block is executed if there is no
777 // [new subject] value, i.e., [new subject] remains null.
778 }
779
780 if(context->current_object_resource != NULL)
781 {
782 // 7. If in any of the previous steps a [current object resource]
783 // was set to a non-null value, it is now used to generate triples
784 rdfa_complete_relrev_triples(context, rel, rev);
785 }
786 else if((rel != NULL) || (rev != NULL))
787 {
788 // 8. If however [current object resource] was set to null, but
789 // there are predicates present, then they must be stored as
790 // [incomplete triple]s, pending the discovery of a subject that
791 // can be used as the object. Also, [current object resource]
792 // should be set to a newly created [bnode]
793 rdfa_save_incomplete_triples(context, rel, rev);
794 }
795
796 // Ensure to re-insert XML Literal namespace information from this
797 // point on...
798 if(property != NULL)
799 {
800 context->xml_literal_namespaces_defined = 0;
801 }
802
803 // save these for processing steps #9 and #10
804 context->property = property;
805 context->content = rdfa_replace_string(context->datatype, content);
806 context->datatype = rdfa_replace_string(context->datatype, datatype);
807
808 // free the resolved CURIEs
809 free(about);
810 free(src);
811 rdfa_free_list(type_of);
812 rdfa_free_list(rel);
813 rdfa_free_list(rev);
814 free(resource);
815 free(href);
816 free(datatype);
817 }
818
character_data(void * user_data,const char * s,int len)819 static void XMLCALL character_data(void *user_data, const char *s, int len)
820 {
821 rdfalist* context_stack = (rdfalist*)user_data;
822 rdfacontext* context = (rdfacontext*)
823 context_stack->items[context_stack->num_items - 1]->data;
824
825 char *buffer = (char*)malloc(len + 1);
826 memset(buffer, 0, len + 1);
827 memcpy(buffer, s, len);
828
829 // append the text to the current context's plain literal
830 if(context->plain_literal == NULL)
831 {
832 context->plain_literal =
833 rdfa_replace_string(context->plain_literal, buffer);
834 context->plain_literal_size = len;
835 }
836 else
837 {
838 context->plain_literal = rdfa_n_append_string(
839 context->plain_literal, &context->plain_literal_size, buffer, len);
840 }
841
842 // append the text to the current context's XML literal
843 if(context->xml_literal == NULL)
844 {
845 context->xml_literal =
846 rdfa_replace_string(context->xml_literal, buffer);
847 context->xml_literal_size = len;
848 }
849 else
850 {
851 context->xml_literal = rdfa_n_append_string(
852 context->xml_literal, &context->xml_literal_size, buffer, len);
853 }
854
855 //printf("plain_literal: %s\n", context->plain_literal);
856 //printf("xml_literal: %s\n", context->xml_literal);
857
858 free(buffer);
859 }
860
861 static void XMLCALL
end_element(void * user_data,const char * name)862 end_element(void *user_data, const char *name)
863 {
864 rdfalist* context_stack = (rdfalist*)user_data;
865 rdfacontext* context = (rdfacontext*)rdfa_pop_item(context_stack);
866 rdfacontext* parent_context = (rdfacontext*)
867 context_stack->items[context_stack->num_items - 1]->data;
868
869 // append the text to the current context's XML literal
870 char* buffer = (char*)malloc(strlen(name) + 4);
871
872 if(DEBUG)
873 {
874 printf("DEBUG: </%s>\n", name);
875 }
876
877 sprintf(buffer, "</%s>", name);
878 if(context->xml_literal == NULL)
879 {
880 context->xml_literal =
881 rdfa_replace_string(context->xml_literal, buffer);
882 context->xml_literal_size = strlen(buffer);
883 }
884 else
885 {
886 context->xml_literal = rdfa_n_append_string(
887 context->xml_literal, &context->xml_literal_size,
888 buffer, strlen(buffer));
889 }
890 free(buffer);
891
892 // 9. The next step of the iteration is to establish any
893 // [current object literal];
894
895 // generate the complete object literal triples
896 if(context->property != NULL)
897 {
898 // save the current xml literal
899 char* saved_xml_literal = context->xml_literal;
900 char* content_start = NULL;
901 char* content_end = NULL;
902
903 // ensure to mark only the inner-content of the XML node for
904 // processing the object literal.
905 buffer = NULL;
906
907
908 if(context->xml_literal != NULL)
909 {
910 // get the data between the first tag and the last tag
911 content_start = strchr(context->xml_literal, '>');
912 content_end = strrchr(context->xml_literal, '<');
913
914 if((content_start != NULL) && (content_end != NULL))
915 {
916 // set content end to null terminator
917 context->xml_literal = ++content_start;
918 *content_end = '\0';
919 }
920 }
921
922 // update the plain literal if the XML Literal is an empty string
923 if(strlen(context->xml_literal) == 0)
924 {
925 context->plain_literal =
926 rdfa_replace_string(context->plain_literal, "");
927 }
928
929 // process data between first tag and last tag
930 // this needs the xml literal to be null terminated
931 rdfa_complete_object_literal_triples(context);
932
933 if(content_end != NULL)
934 {
935 // set content end back
936 *content_end = '<';
937 }
938
939 if(saved_xml_literal != NULL)
940 {
941 // restore xml literal
942 context->xml_literal = saved_xml_literal;
943 }
944 }
945
946 //printf(context->plain_literal);
947
948 // append the XML literal and plain text literals to the parent
949 // literals
950 if(context->xml_literal != NULL)
951 {
952 if(parent_context->xml_literal == NULL)
953 {
954 parent_context->xml_literal =
955 rdfa_replace_string(
956 parent_context->xml_literal, context->xml_literal);
957 parent_context->xml_literal_size = context->xml_literal_size;
958 }
959 else
960 {
961 parent_context->xml_literal =
962 rdfa_n_append_string(
963 parent_context->xml_literal, &parent_context->xml_literal_size,
964 context->xml_literal, context->xml_literal_size);
965 }
966
967 // if there is an XML literal, there is probably a plain literal
968 if(context->plain_literal != NULL)
969 {
970 if(parent_context->plain_literal == NULL)
971 {
972 parent_context->plain_literal =
973 rdfa_replace_string(
974 parent_context->plain_literal, context->plain_literal);
975 parent_context->plain_literal_size = context->plain_literal_size;
976 }
977 else
978 {
979 parent_context->plain_literal =
980 rdfa_n_append_string(
981 parent_context->plain_literal,
982 &parent_context->plain_literal_size,
983 context->plain_literal,
984 context->plain_literal_size);
985 }
986 }
987 }
988
989 // preserve the bnode count by copying it to the parent_context
990 parent_context->bnode_count = context->bnode_count;
991 parent_context->underscore_colon_bnode_name = \
992 rdfa_replace_string(parent_context->underscore_colon_bnode_name,
993 context->underscore_colon_bnode_name);
994
995 // 10. If the [ skip element ] flag is 'false', and [ new subject ]
996 // was set to a non-null value, then any [ incomplete triple ]s
997 // within the current context should be completed:
998 if((context->skip_element == 0) && (context->new_subject != NULL))
999 {
1000 rdfa_complete_incomplete_triples(context);
1001 }
1002
1003 // free the context
1004 rdfa_free_context(context);
1005 }
1006
1007
1008 #ifdef LIBRDFA_IN_RAPTOR
/**
 * Adapts a raptor start-element event to start_element(). The element
 * name and the attributes are converted into the NULL-terminated array of
 * alternating names and values that start_element() walks; the
 * temporaries created for that are released afterwards.
 *
 * @param user_data the context stack, passed through to start_element().
 * @param xml_element the raptor element that is being opened.
 */
static void raptor_rdfa_start_element(void *user_data,
                                      raptor_xml_element *xml_element)
{
   raptor_qname* element_qname = raptor_xml_element_get_name(xml_element);
   int attr_count = raptor_xml_element_get_attributes_count(xml_element);
   raptor_qname** attr_qnames =
      raptor_xml_element_get_attributes(xml_element);
   unsigned char* element_name =
      raptor_qname_to_counted_name(element_qname, NULL);
   char** pairs = NULL;
   int i;

   if(attr_count > 0)
   {
      char** slot;

      // two slots per attribute plus one for the terminating NULL
      pairs = (char**)malloc(sizeof(char*) * (1 + (attr_count * 2)));
      slot = pairs;
      for(i = 0; i < attr_count; i++)
      {
         *slot++ = (char*)raptor_qname_to_counted_name(attr_qnames[i], NULL);
         *slot++ = (char*)raptor_qname_get_value(attr_qnames[i]);
      }
      *slot = NULL;
   }

   start_element(user_data, (char*)element_name, (const char**)pairs);

   raptor_free_memory(element_name);
   if(pairs)
   {
      // only the names (even slots) are released here
      for(i = 0; i < attr_count; i++)
      {
         raptor_free_memory(pairs[i * 2]);
      }
      free(pairs);
   }
}
1035
/**
 * Raptor SAX2 end-element adapter.
 *
 * Builds the element's name string and forwards it to end_element(),
 * releasing the temporary string afterwards.
 *
 * @param user_data opaque pointer handed through unchanged to
 *                  end_element().
 * @param xml_element the element reported by the raptor SAX2 layer.
 */
static void raptor_rdfa_end_element(void *user_data,
                                    raptor_xml_element* xml_element)
{
   unsigned char* element_name;

   // the name string is a temporary copy owned by this function
   element_name = raptor_qname_to_counted_name(
      raptor_xml_element_get_name(xml_element), NULL);

   end_element(user_data, (const char*)element_name);

   raptor_free_memory(element_name);
}
1045
/**
 * Raptor SAX2 character-data adapter.
 *
 * Forwards the reported text to character_data().
 *
 * @param user_data opaque pointer handed through unchanged to
 *                  character_data().
 * @param xml_element the element the text belongs to (not used).
 * @param s the character data.
 * @param len the length value reported for s, forwarded as-is.
 */
static void raptor_rdfa_character_data(void *user_data,
                                       raptor_xml_element* xml_element,
                                       const unsigned char *s, int len)
{
   const char* text = (const char*)s;

   // required by the handler signature, but not needed here
   (void)xml_element;

   character_data(user_data, text, len);
}
1052
/**
 * Raptor SAX2 namespace adapter.
 *
 * Looks up the context stored in the last item of the context stack
 * and, if that context has a namespace handler installed, invokes it
 * with the handler's user data and the reported namespace.
 *
 * @param user_data the context stack (an rdfalist*).
 * @param nspace the namespace reported by the raptor SAX2 layer.
 */
static void raptor_rdfa_namespace_handler(void *user_data,
                                          raptor_namespace* nspace)
{
   rdfalist* stack = (rdfalist*)user_data;
   rdfacontext* top =
      (rdfacontext*)stack->items[stack->num_items - 1]->data;

   // nothing to do when no handler has been registered
   if(!top->namespace_handler)
   {
      return;
   }

   (*top->namespace_handler)(top->namespace_handler_user_data, nspace);
}
1064
1065
1066
1067 #endif
1068
1069
rdfa_create_context(const char * base)1070 rdfacontext* rdfa_create_context(const char* base)
1071 {
1072 rdfacontext* rval = NULL;
1073 size_t base_length = strlen(base);
1074
1075 // if the base isn't specified, don't create a context
1076 if(base_length > 0)
1077 {
1078 char* cleaned_base;
1079 rval = (rdfacontext*)malloc(sizeof(rdfacontext));
1080 rval->base = NULL;
1081 cleaned_base = rdfa_iri_get_base(base);
1082 rval->base = rdfa_replace_string(rval->base, cleaned_base);
1083 free(cleaned_base);
1084
1085 /* parse state */
1086 rval->wb_allocated = 0;
1087 rval->working_buffer = NULL;
1088 rval->wb_offset = 0;
1089 #ifdef LIBRDFA_IN_RAPTOR
1090 rval->base_uri = NULL;
1091 rval->sax2 = NULL;
1092 rval->namespace_handler = NULL;
1093 rval->namespace_handler_user_data = NULL;
1094 #else
1095 rval->parser = NULL;
1096 #endif
1097 rval->done = 0;
1098 rval->context_stack = NULL;
1099 rval->wb_preread = 0;
1100 rval->preread = 0;
1101 }
1102 else
1103 {
1104 printf("OMG!\n");
1105 }
1106
1107 return rval;
1108 }
1109
/**
 * Releases a context and everything it owns.
 *
 * Frees the strings, lists and buffers held by the context. If the
 * context carries a context stack, every other context still on that
 * stack is freed as well, followed by the stack itself.
 *
 * @param context the context to free. Passing NULL is a no-op, so the
 *                result of a failed rdfa_create_context() can be handed
 *                in safely.
 */
void rdfa_free_context(rdfacontext* context)
{
   if(context == NULL)
   {
      return;
   }

   // free(NULL) is a no-op, so plain strings need no guard
   free(context->base);
   free(context->parent_subject);
   free(context->parent_object);

#ifndef LIBRDFA_IN_RAPTOR
   if(context->uri_mappings != NULL)
   {
      rdfa_free_mapping(context->uri_mappings);
   }
#endif

   if(context->incomplete_triples != NULL)
   {
      rdfa_free_list(context->incomplete_triples);
   }

   free(context->language);
   free(context->underscore_colon_bnode_name);
   free(context->new_subject);
   free(context->current_object_resource);
   free(context->content);
   free(context->datatype);

   if(context->property != NULL)
   {
      rdfa_free_list(context->property);
   }

   free(context->plain_literal);
   free(context->xml_literal);

   // TODO: These should be moved into their own data structure
   if(context->local_incomplete_triples != NULL)
   {
      rdfa_free_list(context->local_incomplete_triples);
   }

   // this field is not NULL only on the rdfacontext* at the top of the stack
   if(context->context_stack != NULL)
   {
      void* item;

      // free the stack ensuring that we do not delete this context if
      // it is in the list (which it may be, if parsing ended on error)
      while((item = rdfa_pop_item(context->context_stack)) != NULL)
      {
         if(item != context)
         {
            rdfa_free_context((rdfacontext*)item);
         }
      }
      free(context->context_stack->items);
      free(context->context_stack);
   }

   free(context->working_buffer);

   free(context);
}
1212
/**
 * Installs the triple handler on a context.
 *
 * @param context the context to modify.
 * @param th the function to store as the context's triple callback.
 */
void rdfa_set_triple_handler(rdfacontext* context, triple_handler_fp th)
{
   // replaces whatever handler was stored before
   context->triple_callback = th;
}
1217
/**
 * Installs the buffer filler on a context.
 *
 * @param context the context to modify.
 * @param bf the function to store as the context's buffer filler
 *           callback.
 */
void rdfa_set_buffer_filler(rdfacontext* context, buffer_filler_fp bf)
{
   // replaces whatever filler was stored before
   context->buffer_filler_callback = bf;
}
1222
/**
 * Prepares a context for parsing.
 *
 * Allocates the working buffer, creates the XML parser (expat or the
 * raptor SAX2 layer, depending on the build), creates the context stack
 * with this context as its first item, registers the element and
 * character-data handlers and initializes the per-element state.
 *
 * If the working buffer cannot be allocated the remaining setup is
 * still carried out, so that the context is left in a consistent state
 * for rdfa_parse_end() and rdfa_free_context(). The context is then
 * marked as done, which makes rdfa_parse_chunk() reject further input.
 *
 * @param context the context to prepare.
 *
 * @return RDFA_PARSE_SUCCESS on success, RDFA_PARSE_FAILED if the
 *         working buffer could not be allocated.
 */
int rdfa_parse_start(rdfacontext* context)
{
   int rval = RDFA_PARSE_SUCCESS;

   // create the read buffer; +1 for NUL at end, to allow strstr() etc.
   // to work. Only the first char needs to be NUL.
   context->wb_allocated = sizeof(char) * READ_BUFFER_SIZE;
   context->working_buffer = (char*)malloc(context->wb_allocated + 1);
   if(context->working_buffer != NULL)
   {
      *context->working_buffer = '\0';
   }
   else
   {
      context->wb_allocated = 0;
      rval = RDFA_PARSE_FAILED;
   }

#ifndef LIBRDFA_IN_RAPTOR
   context->parser = XML_ParserCreate(NULL);
#endif
   context->done = 0;
   context->context_stack = rdfa_create_list(32);

   // initialize the context stack
   rdfa_push_item(context->context_stack, context, RDFALIST_FLAG_CONTEXT);

   // create the SAX2 layer (raptor builds only) and hook up the handlers
#ifdef LIBRDFA_IN_RAPTOR
   context->sax2 = raptor_new_sax2(context->context_stack,
                                   context->error_handlers);

   raptor_sax2_set_start_element_handler(context->sax2,
                                         raptor_rdfa_start_element);
   raptor_sax2_set_end_element_handler(context->sax2,
                                       raptor_rdfa_end_element);
   raptor_sax2_set_characters_handler(context->sax2,
                                      raptor_rdfa_character_data);
   raptor_sax2_set_namespace_handler(context->sax2,
                                     raptor_rdfa_namespace_handler);
#else
   XML_SetUserData(context->parser, context->context_stack);
   XML_SetElementHandler(context->parser, start_element, end_element);
   XML_SetCharacterDataHandler(context->parser, character_data);
#endif

   rdfa_init_context(context);

#ifdef LIBRDFA_IN_RAPTOR
   context->base_uri = raptor_new_uri_v2(
      context->sax2->world, (const unsigned char*)context->base);
   raptor_sax2_parse_start(context->sax2, context->base_uri);
#endif

   if(rval != RDFA_PARSE_SUCCESS)
   {
      // no usable working buffer: refuse any subsequent chunks
      context->done = 1;
   }

   return rval;
}
1274
/**
 * Feeds one chunk of document data to the parser.
 *
 * Until the pre-read phase is finished, incoming data is passed to
 * rdfa_init_base(), which is given the working buffer; the contents of
 * the working buffer are then handed to the XML parser in one go. After
 * that, each chunk is passed to the XML parser directly.
 *
 * @param context the parsing context.
 * @param data the chunk of data to process.
 * @param wblen the number of bytes available in data.
 * @param done non-zero if this is the last chunk of the document.
 *
 * @return RDFA_PARSE_SUCCESS if the chunk was accepted,
 *         RDFA_PARSE_FAILED if the context is already marked as done or
 *         the XML parser reported an error.
 */
int rdfa_parse_chunk(rdfacontext* context, char* data, size_t wblen, int done)
{
   // a context that has been marked as done accepts no further data
   if(context->done)
   {
      return RDFA_PARSE_FAILED;
   }

   if(context->preread)
   {
      // the pre-read phase is over, just parse the block passed in
#ifdef LIBRDFA_IN_RAPTOR
      if(raptor_sax2_parse_chunk(
            context->sax2, (const unsigned char*)data, wblen, done))
      {
         return RDFA_PARSE_FAILED;
      }
#else
      if(XML_Parse(context->parser, data, wblen, done) == XML_STATUS_ERROR)
      {
#ifdef WIN32
         printf("%s at line %d, column %d.\n",
            XML_ErrorString(XML_GetErrorCode(context->parser)),
            (int)XML_GetCurrentLineNumber(context->parser),
            (int)XML_GetCurrentColumnNumber(context->parser));
#else
         fprintf(stderr, "%s at line %d, column %d.\n",
            XML_ErrorString(XML_GetErrorCode(context->parser)),
            (int)XML_GetCurrentLineNumber(context->parser),
            (int)XML_GetCurrentColumnNumber(context->parser));
#endif
         return RDFA_PARSE_FAILED;
      }
#endif

      return RDFA_PARSE_SUCCESS;
   }

   // search for the <base> tag and use the href contained therein to
   // set the parsing context.
   context->wb_preread = rdfa_init_base(context,
      &context->working_buffer, &context->wb_allocated, data, wblen);

   // continue looking if in first 131072 bytes of data
   if(!context->base && context->wb_preread < (1<<17))
   {
      return RDFA_PARSE_SUCCESS;
   }

   // hand the working buffer to the XML parser
#ifdef LIBRDFA_IN_RAPTOR
   if(raptor_sax2_parse_chunk(context->sax2,
         (const unsigned char*)context->working_buffer,
         context->wb_offset, done))
   {
      return RDFA_PARSE_FAILED;
   }
#else
   // NOTE(review): this branch always passes 0 as the final-chunk flag,
   // whereas the raptor branch above forwards done - confirm intent
   if(XML_Parse(context->parser, context->working_buffer,
         context->wb_offset, 0) == XML_STATUS_ERROR)
   {
#ifdef WIN32
      printf("%s at line %d, column %d\n",
         XML_ErrorString(XML_GetErrorCode(context->parser)),
         (int)XML_GetCurrentLineNumber(context->parser),
         (int)XML_GetCurrentColumnNumber(context->parser));
#else
      fprintf(stderr, "%s at line %d, column %d\n",
         XML_ErrorString(XML_GetErrorCode(context->parser)),
         (int)XML_GetCurrentLineNumber(context->parser),
         (int)XML_GetCurrentColumnNumber(context->parser));
#endif
      return RDFA_PARSE_FAILED;
   }
#endif

   context->preread = 1;

   return RDFA_PARSE_SUCCESS;
}
1348
/**
 * Finishes a parse that was begun with rdfa_parse_start().
 *
 * Pops one item off the context stack and destroys the XML parser
 * (expat, or the raptor SAX2 layer and base URI in raptor builds). The
 * pointers to the destroyed objects are reset to NULL so the context
 * does not keep dangling references to them.
 *
 * @param context the context whose parse is being ended.
 */
void rdfa_parse_end(rdfacontext* context)
{
   // deinitialize context stack
   rdfa_pop_item(context->context_stack);

   // Free the expat parser and the like
#ifdef LIBRDFA_IN_RAPTOR
   if(context->base_uri)
   {
      // must happen before the sax2 object (and its world) goes away
      raptor_free_uri_v2(context->sax2->world, context->base_uri);
      context->base_uri=NULL;
   }
   raptor_free_sax2(context->sax2);
   context->sax2=NULL;
#else
   // free parser
   XML_ParserFree(context->parser);
   context->parser=NULL;
#endif
}
1365
/**
 * Parses a whole document using the context's buffer filler.
 *
 * Starts the parse, then repeatedly asks the buffer filler callback to
 * fill the working buffer and passes the result to rdfa_parse_chunk().
 * The loop stops when the callback reports zero bytes or when a chunk
 * fails to parse; rdfa_parse_end() is called afterwards.
 *
 * @param context the context to parse with; its buffer filler callback
 *                is called on every iteration.
 *
 * @return the result of rdfa_parse_start() if starting failed,
 *         otherwise the result of the last rdfa_parse_chunk() call.
 */
int rdfa_parse(rdfacontext* context)
{
   int status = rdfa_parse_start(context);

   if(status != RDFA_PARSE_SUCCESS)
   {
      context->done = 1;
      return status;
   }

   for(;;)
   {
      // a read of zero bytes marks the end of the input
      size_t bytes_read = context->buffer_filler_callback(
         context->working_buffer, context->wb_allocated,
         context->callback_data);
      int at_end = (bytes_read == 0);

      status = rdfa_parse_chunk(
         context, context->working_buffer, bytes_read, at_end);
      context->done = at_end;

      if(at_end || (status != RDFA_PARSE_SUCCESS))
      {
         break;
      }
   }

   rdfa_parse_end(context);

   return status;
}
1397