1 /**
2  * Copyright 2008 Digital Bazaar, Inc.
3  *
4  * This file is part of librdfa.
5  *
6  * librdfa is Free Software, and can be licensed under any of the
7  * following three licenses:
8  *
9  *   1. GNU Lesser General Public License (LGPL) V2.1 or any
10  *      newer version
11  *   2. GNU General Public License (GPL) V2 or any newer version
12  *   3. Apache License, V2.0 or any newer version
13  *
14  * You may not use this file except in compliance with at least one of
15  * the above three licenses.
16  *
17  * See LICENSE-* at the top of this software distribution for more
18  * information regarding the details of each license.
19  *
20  * The librdfa library is the Fastest RDFa Parser in the Universe. It is
 * a stream parser, meaning that it takes XML data as input and spits
22  * out RDF triples as it comes across them in the stream. Due to this
23  * processing approach, librdfa has a very, very small memory footprint.
24  * It is also very fast and can operate on hundreds of gigabytes of XML
25  * data without breaking a sweat.
26  *
27  * Usage:
28  *
29  *    rdfacontext* context = rdfa_create_context(BASE_URI);
30  *    context->callback_data = your_user_data;
31  *    rdfa_set_triple_handler(context, &process_triple);
32  *    rdfa_set_buffer_filler(context, &fill_buffer);
33  *    rdfa_parse(context);
34  *    rdfa_free_context(context);
35  *
36  * @author Manu Sporny
37  */
38 #include <stdlib.h>
39 #include <stdio.h>
40 #include <string.h>
41 #include "rdfa_utils.h"
42 #include "rdfa.h"
43 
44 #define READ_BUFFER_SIZE 4096
45 
/**
 * Puts the given context into its initial processing state.
 *
 * The evaluation-context values and the per-element local values are
 * reset as laid out in the RDFa processing rules, and the bookkeeping
 * members that are not part of the specification are zeroed.
 *
 * The [base] member is only read by this function, and callback_data
 * is deliberately left untouched.
 *
 * @param context the context to initialize.
 */
void rdfa_init_context(rdfacontext* context)
{
   // [parent subject]: starts out as the cleaned-up [base] when a base
   // is known, and as null otherwise
   context->parent_subject = NULL;
   if(context->base != NULL)
   {
      char* base_iri = rdfa_iri_get_base(context->base);
      context->parent_subject =
         rdfa_replace_string(context->parent_subject, base_iri);
      free(base_iri);
   }

#ifndef LIBRDFA_IN_RAPTOR
   // [list of URI mappings]: start with a fresh, empty mapping
   context->uri_mappings = (char**)rdfa_create_mapping(MAX_URI_MAPPINGS);
#endif

   // [list of incomplete triples]: start with a fresh, empty list
   context->incomplete_triples = rdfa_create_list(3);

   // [local list of incomplete triples]: also a fresh, empty list
   context->local_incomplete_triples = rdfa_create_list(3);

   // remaining evaluation-context values: [parent object] and
   // [language] are null
   context->parent_object = NULL;
   context->language = NULL;

   // local values (processing step 1): [recurse] is true,
   // [skip element] is false, [new subject] and
   // [current object resource] are null
   context->recurse = 1;
   context->skip_element = 0;
   context->new_subject = NULL;
   context->current_object_resource = NULL;

   // NOTE: the [local list of URI mappings] and the [current language]
   //       are inherited in rdfa_create_new_element_context()

   // Everything below is not part of the RDFa spec; it is cleared so
   // that the C compiler and valgrind see fully initialized memory.
   context->bnode_count = 0;
   context->underscore_colon_bnode_name = NULL;
   context->xml_literal_namespaces_defined = 0;
   context->xml_literal_xml_lang_defined = 0;
   context->property = NULL;
   context->content = NULL;
   context->datatype = NULL;
   context->plain_literal = NULL;
   context->plain_literal_size = 0;
   context->xml_literal = NULL;
   context->xml_literal_size = 0;

   // FIXME: completing incomplete triples always happens now, change
   //        all of the code to reflect that.
}
117 
118 /**
119  * Read the head of the XHTML document and determines the base IRI for
120  * the document.
121  *
122  * @param context the current working context.
123  * @param working_buffer the current working buffer.
124  * @param wb_allocated the number of bytes that have been allocated to
125  *                     the working buffer.
126  *
127  * @return the size of the data available in the working buffer.
128  */
rdfa_init_base(rdfacontext * context,char ** working_buffer,size_t * working_buffer_size,char * temp_buffer,size_t bytes_read)129 static size_t rdfa_init_base(
130    rdfacontext* context, char** working_buffer, size_t* working_buffer_size,
131    char* temp_buffer, size_t bytes_read)
132 {
133    char* head_end = NULL;
134    size_t offset = context->wb_offset;
135    int needed_size = (offset + bytes_read) - *working_buffer_size;
136 
137    // search for the end of <head>, stop if <head> was found
138 
139    // extend the working buffer size
140    if(needed_size > 0)
141    {
142       size_t temp_buffer_size = sizeof(char) * READ_BUFFER_SIZE;
143       if((size_t)needed_size > temp_buffer_size)
144          temp_buffer_size += needed_size;
145 
146       *working_buffer_size += temp_buffer_size;
147       // +1 for NUL at end, to allow strstr() etc. to work
148       *working_buffer = (char*)realloc(*working_buffer, *working_buffer_size + 1);
149    }
150 
151    // append to the working buffer
152    memmove(*working_buffer + offset, temp_buffer, bytes_read);
153    // ensure the buffer is a NUL-terminated string
154    *(*working_buffer + offset + bytes_read) = '\0';
155 
156    // search for the end of </head> in
157    head_end = strstr(*working_buffer, "</head>");
158    if(head_end == NULL)
159       head_end = strstr(*working_buffer, "</HEAD>");
160 
161    context->wb_offset += bytes_read;
162 
163    if(head_end == NULL)
164       return bytes_read;
165 
166    // if </head> was found, search for <base and extract the base URI
167    if(head_end != NULL)
168    {
169       char* base_start = strstr(*working_buffer, "<base ");
170       if(base_start == NULL)
171          base_start = strstr(*working_buffer, "<BASE ");
172 
173       if(base_start != NULL)
174       {
175          char* href_start = strstr(base_start, "href=");
176          char* uri_start = href_start + 6;
177          char* uri_end = strchr(uri_start, '"');
178 
179          if((uri_start != NULL) && (uri_end != NULL))
180          {
181             if(*uri_start != '"')
182             {
183                size_t uri_size = uri_end - uri_start;
184                char* temp_uri = (char*)malloc(sizeof(char) * uri_size + 1);
185 	       char* cleaned_base;
186                strncpy(temp_uri, uri_start, uri_size);
187                temp_uri[uri_size] = '\0';
188 
189                // TODO: This isn't in the processing rules, should it
190                //       be? Setting current_object_resource will make
191                //       sure that the BASE element is inherited by all
192                //       subcontexts.
193 	       cleaned_base = rdfa_iri_get_base(temp_uri);
194                context->current_object_resource =
195                   rdfa_replace_string(
196                      context->current_object_resource, cleaned_base);
197 
198 	       // clean up the base context
199                context->base =
200                   rdfa_replace_string(context->base, cleaned_base);
201                free(cleaned_base);
202                free(temp_uri);
203             }
204          }
205       }
206    }
207 
208    return bytes_read;
209 }
210 
211 /**
212  * Creates a new context for the current element by cloning certain
213  * parts of the old context on the top of the given stack.
214  *
215  * @param context_stack the context stack that is associated with this
216  *                      processing run.
217  */
rdfa_create_new_element_context(rdfalist * context_stack)218 static rdfacontext* rdfa_create_new_element_context(rdfalist* context_stack)
219 {
220    rdfacontext* parent_context = (rdfacontext*)
221       context_stack->items[context_stack->num_items - 1]->data;
222    rdfacontext* rval = rdfa_create_context(parent_context->base);
223 
224    // * Otherwise, the values are:
225 
226    // * the [ base ] is set to the [ base ] value of the current
227    //   [ evaluation context ];
228    rval->base = rdfa_replace_string(rval->base, parent_context->base);
229    rdfa_init_context(rval);
230 
231    // copy the URI mappings
232 #ifndef LIBRDFA_IN_RAPTOR
233    if(rval->uri_mappings != NULL)
234    {
235       rdfa_free_mapping(rval->uri_mappings);
236    }
237    rval->uri_mappings = rdfa_copy_mapping(parent_context->uri_mappings);
238 #endif
239 
240    // inherit the parent context's language
241    if(parent_context->language != NULL)
242    {
243       rval->language =
244          rdfa_replace_string(rval->language, parent_context->language);
245    }
246 
247    // set the triple callback
248    rval->triple_callback = parent_context->triple_callback;
249    rval->buffer_filler_callback = parent_context->buffer_filler_callback;
250 
251    // inherit the bnode count, _: bnode name, recurse flag, and state
252    // of the xml_literal_namespace_insertion
253    rval->bnode_count = parent_context->bnode_count;
254    rval->underscore_colon_bnode_name =
255       rdfa_replace_string(rval->underscore_colon_bnode_name,
256                           parent_context->underscore_colon_bnode_name);
257    rval->recurse = parent_context->recurse;
258    rval->skip_element = 0;
259    rval->callback_data = parent_context->callback_data;
260    rval->xml_literal_namespaces_defined =
261       parent_context->xml_literal_namespaces_defined;
262    rval->xml_literal_xml_lang_defined =
263       parent_context->xml_literal_xml_lang_defined;
264 
265    // inherit the parent context's new_subject
266    // TODO: This is not anywhere in the syntax processing document
267    //if(parent_context->new_subject != NULL)
268    //{
269    //   rval->new_subject = rdfa_replace_string(
270    //      rval->new_subject, parent_context->new_subject);
271    //}
272 
273    if(parent_context->skip_element == 0)
274    {
275       // o the [ parent subject ] is set to the value of [ new subject ],
276       //   if non-null, or the value of the [ parent subject ] of the
277       //   current [ evaluation context ];
278       if(parent_context->new_subject != NULL)
279       {
280          rval->parent_subject = rdfa_replace_string(
281             rval->parent_subject, parent_context->new_subject);
282       }
283       else
284       {
285          rval->parent_subject = rdfa_replace_string(
286             rval->parent_subject, parent_context->parent_subject);
287       }
288 
289       // o the [ parent object ] is set to value of [ current object
290       //   resource ], if non-null, or the value of [ new subject ], if
291       //   non-null, or the value of the [ parent subject ] of the
292       //   current [ evaluation context ];
293       if(parent_context->current_object_resource != NULL)
294       {
295          rval->parent_object =
296             rdfa_replace_string(
297                rval->parent_object, parent_context->current_object_resource);
298       }
299       else if(parent_context->new_subject != NULL)
300       {
301          rval->parent_object =
302             rdfa_replace_string(
303                rval->parent_object, parent_context->new_subject);
304       }
305       else
306       {
307          rval->parent_object =
308             rdfa_replace_string(
309                rval->parent_object, parent_context->parent_subject);
310       }
311 
312       // copy the incomplete triples
313       if(rval->incomplete_triples != NULL)
314       {
315          rdfa_free_list(rval->incomplete_triples);
316       }
317 
318       // o the [ list of incomplete triples ] is set to the [ local list
319       //   of incomplete triples ];
320       rval->incomplete_triples =
321          rdfa_copy_list(parent_context->local_incomplete_triples);
322    }
323    else
324    {
325       rval->parent_subject = rdfa_replace_string(
326          rval->parent_subject, parent_context->parent_subject);
327       rval->parent_object = rdfa_replace_string(
328          rval->parent_object, parent_context->parent_object);
329 
330       // copy the incomplete triples
331       if(rval->incomplete_triples != NULL)
332       {
333          rdfa_free_list(rval->incomplete_triples);
334       }
335 
336       rval->incomplete_triples =
337          rdfa_copy_list(parent_context->incomplete_triples);
338 
339       // copy the local list of incomplete triples
340       if(rval->local_incomplete_triples != NULL)
341       {
342          rdfa_free_list(rval->local_incomplete_triples);
343       }
344 
345       rval->local_incomplete_triples =
346          rdfa_copy_list(parent_context->local_incomplete_triples);
347    }
348 
349 #ifdef LIBRDFA_IN_RAPTOR
350    rval->base_uri = parent_context->base_uri;
351    rval->sax2     = parent_context->sax2;
352    rval->namespace_handler = parent_context->namespace_handler;
353    rval->namespace_handler_user_data = parent_context->namespace_handler_user_data;
354    rval->error_handlers = parent_context->error_handlers;
355 #endif
356 
357    return rval;
358 }
359 
360 
361 #ifdef LIBRDFA_IN_RAPTOR
362 static int
raptor_nspace_compare(const void * a,const void * b)363 raptor_nspace_compare(const void *a, const void *b)
364 {
365   raptor_namespace* ns_a=*(raptor_namespace**)a;
366   raptor_namespace* ns_b=*(raptor_namespace**)b;
367   if(!ns_a->prefix)
368     return 1;
369   else if(!ns_b->prefix)
370     return -1;
371   else
372     return strcmp((const char*)ns_b->prefix, (const char*)ns_a->prefix);
373 }
374 #endif
375 
376 /**
377  * Handles the start_element call
378  */
379 static void XMLCALL
start_element(void * user_data,const char * name,const char ** attributes)380    start_element(void* user_data, const char* name, const char** attributes)
381 {
382    rdfalist* context_stack = (rdfalist*) user_data;
383    rdfacontext* context = rdfa_create_new_element_context(context_stack);
384    const char** aptr = attributes;
385    const char* xml_lang = NULL;
386    const char* about_curie = NULL;
387    char* about = NULL;
388    const char* src_curie = NULL;
389    char* src = NULL;
390    const char* type_of_curie = NULL;
391    rdfalist* type_of = NULL;
392    const char* rel_curie = NULL;
393    rdfalist* rel = NULL;
394    const char* rev_curie = NULL;
395    rdfalist* rev = NULL;
396    const char* property_curie = NULL;
397    rdfalist* property = NULL;
398    const char* resource_curie = NULL;
399    char* resource = NULL;
400    const char* href_curie = NULL;
401    char* href = NULL;
402    const char* content = NULL;
403    const char* datatype_curie = NULL;
404    char* datatype = NULL;
405 
406    rdfa_push_item(context_stack, context, RDFALIST_FLAG_CONTEXT);
407 
408    if(DEBUG)
409    {
410       printf("DEBUG: ------- START - %s -------\n", name);
411    }
412 
413    // start the XML Literal text
414    if(context->xml_literal == NULL)
415    {
416       context->xml_literal = rdfa_replace_string(context->xml_literal, "<");
417       context->xml_literal_size = 1;
418    }
419    else
420    {
421       context->xml_literal = rdfa_n_append_string(
422          context->xml_literal, &context->xml_literal_size, "<", 1);
423    }
424    context->xml_literal = rdfa_n_append_string(
425       context->xml_literal, &context->xml_literal_size,
426       name, strlen(name));
427 
428    if(!context->xml_literal_namespaces_defined)
429    {
430       // append namespaces to XML Literal
431 #ifdef LIBRDFA_IN_RAPTOR
432       raptor_namespace_stack* nstack = &context->sax2->namespaces;
433       raptor_namespace* ns;
434       raptor_namespace** ns_list = NULL;
435       size_t ns_size;
436 #else
437       char** umap = context->uri_mappings;
438 #endif
439       char* umap_key = NULL;
440       char* umap_value = NULL;
441 
442       // if the namespaces are not defined, then neither is the xml:lang
443       context->xml_literal_xml_lang_defined = 0;
444 
445 #ifdef LIBRDFA_IN_RAPTOR
446       ns_size = 0;
447       ns_list = raptor_namespace_stack_to_array(nstack, &ns_size);
448       qsort((void*)ns_list, ns_size, sizeof(raptor_namespace*),
449             raptor_nspace_compare);
450 
451       while(ns_size > 0)
452 #else
453       while(*umap != NULL)
454 #endif
455       {
456          unsigned char insert_xmlns_definition = 1;
457          const char* attr = NULL;
458          const char* value = NULL;
459 
460          // get the next mapping to process
461 #ifdef LIBRDFA_IN_RAPTOR
462          ns=ns_list[--ns_size];
463 
464          umap_key = (char*)raptor_namespace_get_prefix(ns);
465          if(!umap_key)
466            umap_key=(char*)XMLNS_DEFAULT_MAPPING;
467          umap_value = (char*)raptor_uri_as_string_v2(context->sax2->world, raptor_namespace_get_uri(ns));
468 #else
469          rdfa_next_mapping(umap++, &umap_key, &umap_value);
470          umap++;
471 #endif
472 
473          // check to make sure that the namespace isn't already
474          // defined in the current element.
475          if(attributes != NULL)
476          {
477             const char** attrs = attributes;
478             while((*attrs != NULL) && insert_xmlns_definition)
479             {
480                attr = *attrs++;
481                value = *attrs++;
482 
483                // if the attribute is a umap_key, skip the definition
484                // of the attribute.
485                if((strcmp(attr, umap_key) == 0) ||
486                   (strcmp(umap_key, XMLNS_DEFAULT_MAPPING) == 0))
487                {
488                   insert_xmlns_definition = 0;
489                }
490             }
491          }
492 
493          // if the namespace isn't already defined on the element,
494          // copy it to the XML Literal string.
495          if(insert_xmlns_definition)
496          {
497             // append the namespace attribute to the XML Literal
498             context->xml_literal = rdfa_n_append_string(
499                context->xml_literal, &context->xml_literal_size,
500                " xmlns", strlen(" xmlns"));
501 
502             // check to see if we're dumping the standard XHTML namespace or
503             // a user-defined XML namespace
504             if(strcmp(umap_key, XMLNS_DEFAULT_MAPPING) != 0)
505             {
506                context->xml_literal = rdfa_n_append_string(
507                   context->xml_literal, &context->xml_literal_size, ":", 1);
508                context->xml_literal = rdfa_n_append_string(
509                   context->xml_literal, &context->xml_literal_size,
510                   umap_key, strlen(umap_key));
511             }
512 
513             // append the namespace value
514             context->xml_literal = rdfa_n_append_string(
515                context->xml_literal, &context->xml_literal_size, "=\"", 2);
516             context->xml_literal = rdfa_n_append_string(
517                context->xml_literal, &context->xml_literal_size,
518                umap_value, strlen(umap_value));
519             context->xml_literal = rdfa_n_append_string(
520                context->xml_literal, &context->xml_literal_size, "\"", 1);
521          }
522 
523          insert_xmlns_definition = 1;
524       } /* end while umap not NULL */
525       context->xml_literal_namespaces_defined = 1;
526 
527 #ifdef LIBRDFA_IN_RAPTOR
528       if(ns_list)
529         raptor_free_memory(ns_list);
530 #endif
531    } /* end if namespaces inserted */
532 
533 
534    // prepare all of the RDFa-specific attributes we are looking for.
535    // scan all of the attributes for the RDFa-specific attributes
536    if(aptr != NULL)
537    {
538       while(*aptr != NULL)
539       {
540          const char* attr;
541          const char* value;
542          char* literal_text;
543 
544          attr = *aptr++;
545          value = *aptr++;
546 
547          // append the attribute-value pair to the XML literal
548          literal_text = (char*)malloc(strlen(attr) + strlen(value) + 5);
549          sprintf(literal_text, " %s=\"%s\"", attr, value);
550          context->xml_literal = rdfa_n_append_string(
551             context->xml_literal, &context->xml_literal_size,
552             literal_text, strlen(literal_text));
553          free(literal_text);
554 
555          // if xml:lang is defined, ensure that it is not overwritten
556          if(strcmp(attr, "xml:lang") == 0)
557          {
558             context->xml_literal_xml_lang_defined = 1;
559          }
560 
561          // process all of the RDFa attributes
562          if(strcmp(attr, "about") == 0)
563          {
564             about_curie = value;
565             about = rdfa_resolve_curie(
566                context, about_curie, CURIE_PARSE_ABOUT_RESOURCE);
567          }
568          else if(strcmp(attr, "src") == 0)
569          {
570             src_curie = value;
571             src = rdfa_resolve_curie(context, src_curie, CURIE_PARSE_HREF_SRC);
572          }
573          else if(strcmp(attr, "typeof") == 0)
574          {
575             type_of_curie = value;
576             type_of = rdfa_resolve_curie_list(
577                context, type_of_curie,
578                CURIE_PARSE_INSTANCEOF_DATATYPE);
579          }
580          else if(strcmp(attr, "rel") == 0)
581          {
582             rel_curie = value;
583             rel = rdfa_resolve_curie_list(
584                context, rel_curie, CURIE_PARSE_RELREV);
585          }
586          else if(strcmp(attr, "rev") == 0)
587          {
588             rev_curie = value;
589             rev = rdfa_resolve_curie_list(
590                context, rev_curie, CURIE_PARSE_RELREV);
591          }
592          else if(strcmp(attr, "property") == 0)
593          {
594             property_curie = value;
595             property =
596                rdfa_resolve_curie_list(
597                   context, property_curie, CURIE_PARSE_PROPERTY);
598          }
599          else if(strcmp(attr, "resource") == 0)
600          {
601             resource_curie = value;
602             resource = rdfa_resolve_curie(
603                context, resource_curie, CURIE_PARSE_ABOUT_RESOURCE);
604          }
605          else if(strcmp(attr, "href") == 0)
606          {
607             href_curie = value;
608             href =
609                rdfa_resolve_curie(context, href_curie, CURIE_PARSE_HREF_SRC);
610          }
611          else if(strcmp(attr, "content") == 0)
612          {
613             content = value;
614          }
615          else if(strcmp(attr, "datatype") == 0)
616          {
617             datatype_curie = value;
618 
619             if(strlen(datatype_curie) == 0)
620             {
621                datatype = rdfa_replace_string(datatype, "");
622             }
623             else
624             {
625                datatype = rdfa_resolve_curie(context, datatype_curie,
626                   CURIE_PARSE_INSTANCEOF_DATATYPE);
627             }
628          }
629 #ifndef LIBRDFA_IN_RAPTOR
630          else if(strcmp(attr, "xml:lang") == 0)
631          {
632             xml_lang = value;
633          }
634          else if(strstr(attr, "xmlns") != NULL)
635          {
636             // 2. Next the [current element] is parsed for
637             //    [URI mapping]s and these are added to the
638             //    [local list of URI mappings]. Note that a
639             //    [URI mapping] will simply overwrite any current
640             //    mapping in the list that has the same name;
641             rdfa_update_uri_mappings(context, attr, value);
642          }
643 #endif
644       }
645    }
646 
647 #ifdef LIBRDFA_IN_RAPTOR
648    if(context->sax2) {
649       xml_lang = (const char*)raptor_sax2_inscope_xml_language(context->sax2);
650       if(!xml_lang)
651         xml_lang = "";
652    }
653 #endif
654    // check to see if we should append an xml:lang to the XML Literal
655    // if one is defined in the context and does not exist on the
656    // element.
657    if((xml_lang == NULL) && (context->language != NULL) &&
658       !context->xml_literal_xml_lang_defined)
659    {
660       context->xml_literal = rdfa_n_append_string(
661          context->xml_literal, &context->xml_literal_size,
662          " xml:lang=\"", strlen(" xml:lang=\""));
663       context->xml_literal = rdfa_n_append_string(
664          context->xml_literal, &context->xml_literal_size,
665          context->language, strlen(context->language));
666       context->xml_literal = rdfa_n_append_string(
667          context->xml_literal, &context->xml_literal_size, "\"", 1);
668 
669       // ensure that the lang isn't set in a subtree (unless it's overwritten)
670       context->xml_literal_xml_lang_defined = 1;
671    }
672 
673    // close the XML Literal value
674    context->xml_literal = rdfa_n_append_string(
675       context->xml_literal, &context->xml_literal_size, ">", 1);
676 
677    // 3. The [current element] is also parsed for any language
678    //    information, and [language] is set in the [current
679    //    evaluation context];
680    rdfa_update_language(context, xml_lang);
681 
682    /***************** FOR DEBUGGING PURPOSES ONLY ******************/
683    if(DEBUG)
684    {
685       if(about != NULL)
686       {
687          printf("DEBUG: @about = %s\n", about);
688       }
689       if(src != NULL)
690       {
691          printf("DEBUG: @src = %s\n", src);
692       }
693       if(type_of != NULL)
694       {
695          printf("DEBUG: @type_of = ");
696          rdfa_print_list(type_of);
697       }
698       if(rel != NULL)
699       {
700          printf("DEBUG: @rel = ");
701          rdfa_print_list(rel);
702       }
703       if(rev != NULL)
704       {
705          printf("DEBUG: @rev = ");
706          rdfa_print_list(rev);
707       }
708       if(property != NULL)
709       {
710          printf("DEBUG: @property = ");
711          rdfa_print_list(property);
712       }
713       if(resource != NULL)
714       {
715          printf("DEBUG: @resource = %s\n", resource);
716       }
717       if(href != NULL)
718       {
719          printf("DEBUG: @href = %s\n", href);
720       }
721       if(content != NULL)
722       {
723          printf("DEBUG: @content = %s\n", content);
724       }
725       if(datatype != NULL)
726       {
727          printf("DEBUG: @datatype = %s\n", datatype);
728       }
729    }
730 
731    // TODO: This isn't part of the processing model, it needs to be
732    // included and is a correction for the last item in step #4.
733    if((about == NULL) && (src == NULL) && (type_of == NULL) &&
734       (rel == NULL) && (rev == NULL) && (property == NULL) &&
735       (resource == NULL) && (href == NULL))
736    {
737       context->skip_element = 1;
738    }
739 
740    if((rel == NULL) && (rev == NULL))
741    {
742       // 4. If the [current element] contains no valid @rel or @rev
743       // URI, obtained according to the section on CURIE and URI
744       // Processing, then the next step is to establish a value for
745       // [new subject]. Any of the attributes that can carry a
746       // resource can set [new subject];
747       rdfa_establish_new_subject(
748          context, name, about, src, resource, href, type_of);
749    }
750    else
751    {
752       // 5. If the [current element] does contain a valid @rel or @rev
753       // URI, obtained according to the section on CURIE and URI
754       // Processing, then the next step is to establish both a value
755       // for [new subject] and a value for [current object resource]:
756       rdfa_establish_new_subject_with_relrev(
757          context, name, about, src, resource, href, type_of);
758    }
759 
760    if(context->new_subject != NULL)
761    {
762       if(DEBUG)
763       {
764          printf("DEBUG: new_subject = %s\n", context->new_subject);
765       }
766 
767       // 6. If in any of the previous steps a [new subject] was set to
768       // a non-null value,
769 
770       // it is now used to provide a subject for type values;
771       if(type_of != NULL)
772       {
773          rdfa_complete_type_triples(context, type_of);
774       }
775 
776       // Note that none of this block is executed if there is no
777       // [new subject] value, i.e., [new subject] remains null.
778    }
779 
780    if(context->current_object_resource != NULL)
781    {
782       // 7. If in any of the previous steps a [current object  resource]
783       // was set to a non-null value, it is now used to generate triples
784       rdfa_complete_relrev_triples(context, rel, rev);
785    }
786    else if((rel != NULL) || (rev != NULL))
787    {
788       // 8. If however [current object resource] was set to null, but
789       // there are predicates present, then they must be stored as
790       // [incomplete triple]s, pending the discovery of a subject that
791       // can be used as the object. Also, [current object resource]
792       // should be set to a newly created [bnode]
793       rdfa_save_incomplete_triples(context, rel, rev);
794    }
795 
796    // Ensure to re-insert XML Literal namespace information from this
797    // point on...
798    if(property != NULL)
799    {
800       context->xml_literal_namespaces_defined = 0;
801    }
802 
803    // save these for processing steps #9 and #10
804    context->property = property;
805    context->content = rdfa_replace_string(context->datatype, content);
806    context->datatype = rdfa_replace_string(context->datatype, datatype);
807 
808    // free the resolved CURIEs
809    free(about);
810    free(src);
811    rdfa_free_list(type_of);
812    rdfa_free_list(rel);
813    rdfa_free_list(rev);
814    free(resource);
815    free(href);
816    free(datatype);
817 }
818 
character_data(void * user_data,const char * s,int len)819 static void XMLCALL character_data(void *user_data, const char *s, int len)
820 {
821    rdfalist* context_stack = (rdfalist*)user_data;
822    rdfacontext* context = (rdfacontext*)
823       context_stack->items[context_stack->num_items - 1]->data;
824 
825    char *buffer = (char*)malloc(len + 1);
826    memset(buffer, 0, len + 1);
827    memcpy(buffer, s, len);
828 
829    // append the text to the current context's plain literal
830    if(context->plain_literal == NULL)
831    {
832       context->plain_literal =
833          rdfa_replace_string(context->plain_literal, buffer);
834       context->plain_literal_size = len;
835    }
836    else
837    {
838       context->plain_literal = rdfa_n_append_string(
839          context->plain_literal, &context->plain_literal_size, buffer, len);
840    }
841 
842    // append the text to the current context's XML literal
843    if(context->xml_literal == NULL)
844    {
845       context->xml_literal =
846          rdfa_replace_string(context->xml_literal, buffer);
847       context->xml_literal_size = len;
848    }
849    else
850    {
851       context->xml_literal = rdfa_n_append_string(
852          context->xml_literal, &context->xml_literal_size, buffer, len);
853   }
854 
855    //printf("plain_literal: %s\n", context->plain_literal);
856    //printf("xml_literal: %s\n", context->xml_literal);
857 
858    free(buffer);
859 }
860 
861 static void XMLCALL
end_element(void * user_data,const char * name)862    end_element(void *user_data, const char *name)
863 {
864    rdfalist* context_stack = (rdfalist*)user_data;
865    rdfacontext* context = (rdfacontext*)rdfa_pop_item(context_stack);
866    rdfacontext* parent_context = (rdfacontext*)
867       context_stack->items[context_stack->num_items - 1]->data;
868 
869    // append the text to the current context's XML literal
870    char* buffer = (char*)malloc(strlen(name) + 4);
871 
872    if(DEBUG)
873    {
874       printf("DEBUG: </%s>\n", name);
875    }
876 
877    sprintf(buffer, "</%s>", name);
878    if(context->xml_literal == NULL)
879    {
880       context->xml_literal =
881          rdfa_replace_string(context->xml_literal, buffer);
882       context->xml_literal_size = strlen(buffer);
883    }
884    else
885    {
886       context->xml_literal = rdfa_n_append_string(
887          context->xml_literal, &context->xml_literal_size,
888          buffer, strlen(buffer));
889    }
890    free(buffer);
891 
892    // 9. The next step of the iteration is to establish any
893    // [current object literal];
894 
895    // generate the complete object literal triples
896    if(context->property != NULL)
897    {
898       // save the current xml literal
899       char* saved_xml_literal = context->xml_literal;
900       char* content_start = NULL;
901       char* content_end = NULL;
902 
903       // ensure to mark only the inner-content of the XML node for
904       // processing the object literal.
905       buffer = NULL;
906 
907 
908       if(context->xml_literal != NULL)
909       {
910          // get the data between the first tag and the last tag
911          content_start = strchr(context->xml_literal, '>');
912          content_end = strrchr(context->xml_literal, '<');
913 
914          if((content_start != NULL) && (content_end != NULL))
915          {
916             // set content end to null terminator
917             context->xml_literal = ++content_start;
918             *content_end = '\0';
919          }
920       }
921 
922       // update the plain literal if the XML Literal is an empty string
923       if(strlen(context->xml_literal) == 0)
924       {
925          context->plain_literal =
926             rdfa_replace_string(context->plain_literal, "");
927       }
928 
929       // process data between first tag and last tag
930       // this needs the xml literal to be null terminated
931       rdfa_complete_object_literal_triples(context);
932 
933       if(content_end != NULL)
934       {
935          // set content end back
936          *content_end = '<';
937       }
938 
939       if(saved_xml_literal != NULL)
940       {
941          // restore xml literal
942          context->xml_literal = saved_xml_literal;
943       }
944    }
945 
946    //printf(context->plain_literal);
947 
948    // append the XML literal and plain text literals to the parent
949    // literals
950    if(context->xml_literal != NULL)
951    {
952       if(parent_context->xml_literal == NULL)
953       {
954          parent_context->xml_literal =
955             rdfa_replace_string(
956                parent_context->xml_literal, context->xml_literal);
957          parent_context->xml_literal_size = context->xml_literal_size;
958       }
959       else
960       {
961          parent_context->xml_literal =
962             rdfa_n_append_string(
963                parent_context->xml_literal, &parent_context->xml_literal_size,
964                context->xml_literal, context->xml_literal_size);
965       }
966 
967       // if there is an XML literal, there is probably a plain literal
968       if(context->plain_literal != NULL)
969       {
970          if(parent_context->plain_literal == NULL)
971          {
972             parent_context->plain_literal =
973                rdfa_replace_string(
974                   parent_context->plain_literal, context->plain_literal);
975             parent_context->plain_literal_size = context->plain_literal_size;
976          }
977          else
978          {
979             parent_context->plain_literal =
980                rdfa_n_append_string(
981                   parent_context->plain_literal,
982                   &parent_context->plain_literal_size,
983                   context->plain_literal,
984                   context->plain_literal_size);
985          }
986       }
987    }
988 
989    // preserve the bnode count by copying it to the parent_context
990    parent_context->bnode_count = context->bnode_count;
991    parent_context->underscore_colon_bnode_name = \
992       rdfa_replace_string(parent_context->underscore_colon_bnode_name,
993                           context->underscore_colon_bnode_name);
994 
995    // 10. If the [ skip element ] flag is 'false', and [ new subject ]
996    // was set to a non-null value, then any [ incomplete triple ]s
997    // within the current context should be completed:
998    if((context->skip_element == 0) && (context->new_subject != NULL))
999    {
1000       rdfa_complete_incomplete_triples(context);
1001    }
1002 
1003    // free the context
1004    rdfa_free_context(context);
1005 }
1006 
1007 
1008 #ifdef LIBRDFA_IN_RAPTOR
/**
 * Raptor SAX2 start-element adapter. Flattens the element's attributes
 * into a NULL-terminated array of alternating name/value strings and
 * forwards the element to start_element().
 *
 * If the attribute array cannot be allocated, start_element() is still
 * called, with a NULL attribute array (the same call made for an element
 * without attributes), so that every end-element call keeps its matching
 * start-element call.
 *
 * @param user_data passed through unchanged to start_element().
 * @param xml_element the raptor element that was opened.
 */
static void raptor_rdfa_start_element(void *user_data,
                                      raptor_xml_element *xml_element)
{
  raptor_qname* qname=raptor_xml_element_get_name(xml_element);
  int attr_count=raptor_xml_element_get_attributes_count(xml_element);
  raptor_qname** attrs=raptor_xml_element_get_attributes(xml_element);
  unsigned char* qname_string=raptor_qname_to_counted_name(qname, NULL);
  char** attr=NULL;
  int i;

  if(attr_count > 0) {
    /* two slots per attribute plus the terminating NULL */
    attr=(char**)malloc(sizeof(char*) * (1+((size_t)attr_count*2)));
    if(attr) {
      for(i=0; i<attr_count; i++) {
        /* the name is a fresh copy released below; the value pointer is
         * used as returned by raptor_qname_get_value() and never freed here */
        attr[2*i]=(char*)raptor_qname_to_counted_name(attrs[i], NULL);
        attr[1+(2*i)]=(char*)raptor_qname_get_value(attrs[i]);
      }
      attr[2*attr_count]=NULL;
    }
  }
  start_element(user_data, (char*)qname_string, (const char**)attr);
  raptor_free_memory(qname_string);
  if(attr) {
    for(i=0; i<attr_count; i++)
      raptor_free_memory(attr[2*i]);
    free(attr);
  }
}
1035 
/**
 * Raptor SAX2 end-element adapter: builds the element's name string and
 * forwards it to end_element(), releasing the string afterwards.
 *
 * @param user_data passed through unchanged to end_element().
 * @param xml_element the raptor element that was closed.
 */
static void raptor_rdfa_end_element(void *user_data,
                                    raptor_xml_element* xml_element)
{
  unsigned char* element_name;

  element_name=raptor_qname_to_counted_name(
    raptor_xml_element_get_name(xml_element), NULL);

  end_element(user_data, (const char*)element_name);

  /* the counted name was allocated for this call only */
  raptor_free_memory(element_name);
}
1045 
/**
 * Raptor SAX2 character-data adapter: forwards the text to
 * character_data(). The xml_element argument is not used.
 *
 * @param user_data passed through unchanged to character_data().
 * @param xml_element the element the text belongs to (ignored here).
 * @param s the text, reinterpreted as const char*.
 * @param len the number of bytes at s.
 */
static void raptor_rdfa_character_data(void *user_data,
                                       raptor_xml_element* xml_element,
                                       const unsigned char *s, int len)
{
  character_data(user_data, (const char *)s, len);
}
1052 
/**
 * Raptor SAX2 namespace adapter: passes a namespace declaration to the
 * namespace handler of the context at the top of the context stack, if
 * that context has one.
 *
 * @param user_data the context stack (an rdfalist*).
 * @param nspace the namespace handed on to the handler.
 */
static void raptor_rdfa_namespace_handler(void *user_data,
                                          raptor_namespace* nspace)
{
  rdfalist* stack = (rdfalist*)user_data;
  rdfacontext* top;

  top = (rdfacontext*)stack->items[stack->num_items - 1]->data;

  /* nothing to do when no handler is set */
  if(!top->namespace_handler)
    return;

  top->namespace_handler(top->namespace_handler_user_data, nspace);
}
1064 
1065 
1066 
1067 #endif
1068 
1069 
rdfa_create_context(const char * base)1070 rdfacontext* rdfa_create_context(const char* base)
1071 {
1072    rdfacontext* rval = NULL;
1073    size_t base_length = strlen(base);
1074 
1075    // if the base isn't specified, don't create a context
1076    if(base_length > 0)
1077    {
1078       char* cleaned_base;
1079       rval = (rdfacontext*)malloc(sizeof(rdfacontext));
1080       rval->base = NULL;
1081       cleaned_base = rdfa_iri_get_base(base);
1082       rval->base = rdfa_replace_string(rval->base, cleaned_base);
1083       free(cleaned_base);
1084 
1085       /* parse state */
1086       rval->wb_allocated = 0;
1087       rval->working_buffer = NULL;
1088       rval->wb_offset = 0;
1089 #ifdef LIBRDFA_IN_RAPTOR
1090       rval->base_uri = NULL;
1091       rval->sax2 = NULL;
1092       rval->namespace_handler = NULL;
1093       rval->namespace_handler_user_data = NULL;
1094 #else
1095       rval->parser = NULL;
1096 #endif
1097       rval->done = 0;
1098       rval->context_stack = NULL;
1099       rval->wb_preread = 0;
1100       rval->preread = 0;
1101    }
1102    else
1103    {
1104       printf("OMG!\n");
1105    }
1106 
1107    return rval;
1108 }
1109 
/**
 * Releases a context and everything it owns: its strings, lists and
 * mappings, the working buffer and, when present, the context stack
 * together with every other context still on that stack.
 *
 * @param context the context to free. NULL is accepted and ignored, so
 *                the NULL result of a failed rdfa_create_context() can be
 *                passed in directly.
 */
void rdfa_free_context(rdfacontext* context)
{
   if(context == NULL)
   {
      return;
   }

   // free(NULL) is a no-op, so plain strings need no NULL guard
   free(context->base);
   free(context->parent_subject);
   free(context->parent_object);

#ifndef LIBRDFA_IN_RAPTOR
   if(context->uri_mappings != NULL)
   {
      rdfa_free_mapping(context->uri_mappings);
   }
#endif

   if(context->incomplete_triples != NULL)
   {
      rdfa_free_list(context->incomplete_triples);
   }

   free(context->language);
   free(context->underscore_colon_bnode_name);
   free(context->new_subject);
   free(context->current_object_resource);
   free(context->content);
   free(context->datatype);

   if(context->property != NULL)
   {
      rdfa_free_list(context->property);
   }

   free(context->plain_literal);
   free(context->xml_literal);

   // TODO: These should be moved into their own data structure
   if(context->local_incomplete_triples != NULL)
   {
      rdfa_free_list(context->local_incomplete_triples);
   }

   // this field is not NULL only on the rdfacontext* at the top of the stack
   if(context->context_stack != NULL)
   {
      void* item;

      // free the stack ensuring that we do not delete this context if
      // it is in the list (which it may be, if parsing ended on error)
      while((item = rdfa_pop_item(context->context_stack)) != NULL)
      {
         if(item != context)
         {
            rdfa_free_context((rdfacontext*)item);
         }
      }
      free(context->context_stack->items);
      free(context->context_stack);
   }

   free(context->working_buffer);

   free(context);
}
1212 
/**
 * Stores the triple handler in the context's triple_callback field.
 *
 * @param context the context to configure.
 * @param th the triple handler function to store.
 */
void rdfa_set_triple_handler(rdfacontext* context, triple_handler_fp th)
{
   context->triple_callback = th;
}
1217 
/**
 * Stores the buffer filler in the context's buffer_filler_callback field.
 * rdfa_parse() calls this function to obtain each block of input.
 *
 * @param context the context to configure.
 * @param bf the buffer filler function to store.
 */
void rdfa_set_buffer_filler(rdfacontext* context, buffer_filler_fp bf)
{
   context->buffer_filler_callback = bf;
}
1222 
/**
 * Prepares a context for parsing: allocates the working buffer, creates
 * the XML parser (expat build) or SAX2 object (raptor build), creates the
 * context stack with this context as its first item, registers the
 * element and character-data handlers and initializes the context state.
 *
 * @param context the context to prepare.
 *
 * @return RDFA_PARSE_SUCCESS on success, or RDFA_PARSE_FAILED if the
 *         working buffer or the expat parser could not be allocated. In
 *         the failure case no working buffer or parser is left attached
 *         to the context.
 */
int rdfa_parse_start(rdfacontext* context)
{
   // create the buffers and expat parser
   int rval = RDFA_PARSE_SUCCESS;

   context->wb_allocated = sizeof(char) * READ_BUFFER_SIZE;
   // +1 for NUL at end, to allow strstr() etc. to work
   // malloc - only the first char needs to be NUL
   context->working_buffer = (char*)malloc(context->wb_allocated + 1);
   if(context->working_buffer == NULL)
   {
      context->wb_allocated = 0;
      return RDFA_PARSE_FAILED;
   }
   *context->working_buffer = '\0';

#ifndef LIBRDFA_IN_RAPTOR
   context->parser = XML_ParserCreate(NULL);
   if(context->parser == NULL)
   {
      // undo the buffer allocation so the context is left as it was
      free(context->working_buffer);
      context->working_buffer = NULL;
      context->wb_allocated = 0;
      return RDFA_PARSE_FAILED;
   }
#endif
   context->done = 0;
   context->context_stack = rdfa_create_list(32);

   // initialize the context stack
   rdfa_push_item(context->context_stack, context, RDFALIST_FLAG_CONTEXT);

   // hand the context stack to the parser and register the handlers
#ifdef LIBRDFA_IN_RAPTOR
   context->sax2 = raptor_new_sax2(context->context_stack,
                                   context->error_handlers);
   raptor_sax2_set_start_element_handler(context->sax2,
                                         raptor_rdfa_start_element);
   raptor_sax2_set_end_element_handler(context->sax2,
                                       raptor_rdfa_end_element);
   raptor_sax2_set_characters_handler(context->sax2,
                                      raptor_rdfa_character_data);
   raptor_sax2_set_namespace_handler(context->sax2,
                                     raptor_rdfa_namespace_handler);
#else
   XML_SetUserData(context->parser, context->context_stack);
   XML_SetElementHandler(context->parser, start_element, end_element);
   XML_SetCharacterDataHandler(context->parser, character_data);
#endif

   rdfa_init_context(context);

#ifdef LIBRDFA_IN_RAPTOR
   context->base_uri=raptor_new_uri_v2(context->sax2->world, (const unsigned char*)context->base);
   raptor_sax2_parse_start(context->sax2, context->base_uri);
#endif

   return rval;
}
1274 
/**
 * Feeds one block of input to the parser.
 *
 * Until the pre-read phase has finished (context->preread is 0) the data
 * is handed to rdfa_init_base(), and the parser is then fed from
 * context->working_buffer instead of from data. Once the pre-read phase
 * is over, data is passed straight to the parser.
 *
 * @param context the context; rdfa_parse_start() sets up the parser and
 *                working buffer that are used here.
 * @param data the block of input.
 * @param wblen the number of bytes at data.
 * @param done passed to the parser as its "final block" flag, except in
 *             the expat pre-read call (see the note in the body).
 *
 * @return RDFA_PARSE_SUCCESS, or RDFA_PARSE_FAILED if the context is
 *         already marked done or the parser reports an error. In the
 *         expat build the error is also printed (stdout on WIN32, stderr
 *         otherwise).
 */
int rdfa_parse_chunk(rdfacontext* context, char* data, size_t wblen, int done)
{
   // refuse further input once the context has been marked done;
   // rdfa_parse() sets the flag when rdfa_parse_start() fails or when
   // the buffer filler returns no more data
   if(context->done)
   {
      return RDFA_PARSE_FAILED;
   }

   if(!context->preread)
   {
      // search for the <base> tag and use the href contained therein to
      // set the parsing context.
      context->wb_preread = rdfa_init_base(context,
         &context->working_buffer, &context->wb_allocated, data, wblen);

      // no base yet and wb_preread is still below 131072 (1<<17):
      // return and keep looking in the next block
      if(!context->base && context->wb_preread < (1<<17))
         return RDFA_PARSE_SUCCESS;

      // parse the working buffer; wb_offset is not modified in this
      // function - presumably rdfa_init_base() maintains it (TODO confirm)
#ifdef LIBRDFA_IN_RAPTOR

      if(raptor_sax2_parse_chunk(context->sax2,
                                 (const unsigned char*)context->working_buffer,
                                 context->wb_offset, done))
      {
         return RDFA_PARSE_FAILED;
      }
#else
      // NOTE(review): the final-block flag is hard-coded to 0 here while
      // the raptor branch above forwards done - confirm this is intended
      if(XML_Parse(context->parser, context->working_buffer,
         context->wb_offset, 0) == XML_STATUS_ERROR)
      {
#ifdef WIN32
         printf(
#else
         fprintf(stderr,
#endif
                 "%s at line %d, column %d\n",
                 XML_ErrorString(XML_GetErrorCode(context->parser)),
                 (int)XML_GetCurrentLineNumber(context->parser),
                 (int)XML_GetCurrentColumnNumber(context->parser));
         return RDFA_PARSE_FAILED;
      }
#endif

      // the pre-read phase is over; later calls take the direct path below
      context->preread = 1;

      return RDFA_PARSE_SUCCESS;
   }

   // otherwise just parse the block passed in
#ifdef LIBRDFA_IN_RAPTOR
   if(raptor_sax2_parse_chunk(context->sax2, (const unsigned char*)data, wblen, done))
   {
      return RDFA_PARSE_FAILED;
   }
#else
   if(XML_Parse(context->parser, data, wblen, done) == XML_STATUS_ERROR)
   {
#ifdef WIN32
         printf(
#else
         fprintf(stderr,
#endif
              "%s at line %d, column %d.\n",
              XML_ErrorString(XML_GetErrorCode(context->parser)),
              (int)XML_GetCurrentLineNumber(context->parser),
              (int)XML_GetCurrentColumnNumber(context->parser));
      return RDFA_PARSE_FAILED;
   }
#endif

   return RDFA_PARSE_SUCCESS;
}
1348 
/**
 * Finishes a parse: pops one item off the context stack and frees the
 * XML parser (expat build) or the base URI and SAX2 object (raptor
 * build). The freed handles are reset to NULL so that the context does
 * not keep pointers to released objects.
 *
 * The context itself is not freed here; use rdfa_free_context() for that.
 *
 * @param context the context that was passed to rdfa_parse_start().
 */
void rdfa_parse_end(rdfacontext* context)
{
   // deinitialize context stack
   rdfa_pop_item(context->context_stack);

   // Free the expat parser and the like
#ifdef LIBRDFA_IN_RAPTOR
   if(context->base_uri)
   {
      raptor_free_uri_v2(context->sax2->world, context->base_uri);
      context->base_uri=NULL;
   }
   raptor_free_sax2(context->sax2);
   context->sax2=NULL;
#else
   // free parser
   XML_ParserFree(context->parser);
   context->parser = NULL;
#endif
}
1365 
/**
 * Runs a complete parse: starts the parser, repeatedly asks the buffer
 * filler callback for input and feeds it to rdfa_parse_chunk() until the
 * callback returns 0 bytes or a chunk fails, then ends the parse.
 *
 * @param context the context, with its buffer filler callback set.
 *
 * @return the result of rdfa_parse_start() if it failed (the context is
 *         then marked done and rdfa_parse_end() is not called), otherwise
 *         the result of the last rdfa_parse_chunk() call.
 */
int rdfa_parse(rdfacontext* context)
{
   int status = rdfa_parse_start(context);

   if(status != RDFA_PARSE_SUCCESS)
   {
      context->done = 1;
      return status;
   }

   for(;;)
   {
      // refill the working buffer; 0 bytes means the input is exhausted
      size_t bytes_read = context->buffer_filler_callback(
         context->working_buffer, context->wb_allocated,
         context->callback_data);
      int at_end = (bytes_read == 0);

      // the final (empty) block is still passed on, flagged as done
      status = rdfa_parse_chunk(
         context, context->working_buffer, bytes_read, at_end);
      context->done = at_end;

      if(at_end || status != RDFA_PARSE_SUCCESS)
      {
         break;
      }
   }

   rdfa_parse_end(context);

   return status;
}
1397