1 /**
2  * Copyright 2008-2010 Digital Bazaar, Inc.
3  *
4  * This file is part of librdfa.
5  *
6  * librdfa is Free Software, and can be licensed under any of the
7  * following three licenses:
8  *
9  *   1. GNU Lesser General Public License (LGPL) V2.1 or any
10  *      newer version
11  *   2. GNU General Public License (GPL) V2 or any newer version
12  *   3. Apache License, V2.0 or any newer version
13  *
14  * You may not use this file except in compliance with at least one of
15  * the above three licenses.
16  *
17  * See LICENSE-* at the top of this software distribution for more
18  * information regarding the details of each license.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with librdfa. If not, see <http://www.gnu.org/licenses/>.
22  *
23  * The librdfa library is the Fastest RDFa Parser in the Universe. It is
24  * a stream parser, meaning that it takes XML data as input and spits
25  * out RDF triples as it comes across them in the stream. Due to this
26  * processing approach, librdfa has a very, very small memory footprint.
27  * It is also very fast and can operate on hundreds of gigabytes of XML
28  * data without breaking a sweat.
29  *
30  * Usage:
31  *
32  * rdfacontext* context = rdfa_create_context(base_uri);
33  * context->callback_data = your_user_data;
34  * rdfa_set_default_graph_triple_handler(context, triple_function);
35  * rdfa_set_buffer_filler(context, buffer_filler_function);
36  * rdfa_parse(context);
37  * rdfa_free_context(context);
38  *
39  * If you would like to get warnings/error triples from the processor graph:
40  *
41  * rdfa_set_processor_graph_triple_handler(context, triple_function);
42  *
43  * Usage if you need more control over when to fill rdfa's buffer:
44  *
45  * rdfacontext* context = rdfa_create_context(base_uri);
46  * context->callback_data = your_user_data;
47  * rdfa_set_default_graph_triple_handler(context, triple_function);
48  * int rval = rdfa_parse_start(context);
49  * if(rval == RDFA_PARSE_SUCCESS)
50  * {
51  *    FILE* myfile = fopen("myfilename", "r");
52  *    size_t buf_len = 0;
53  *    size_t read = 0;
54  *    do
55  *    {
56  *       char* buf = rdfa_get_buffer(context, &buf_len);
57  *       if(buf_len > 0)
58  *       {
59  *          // fill buffer here up to buf_len bytes from your input stream
60  *          read = fread(buf, sizeof(char), buf_len, myfile);
61  *       }
62  *
63  *       // parse the read data
64  *       rdfa_parse_buffer(context, read);
65  *    }
66  *    while(read > 0);
67  *    fclose(myfile);
68  *
69  *    rdfa_parse_end(context);
70  * }
71  * rdfa_free_context(context);
72  *
73  */
74 #ifndef _LIBRDFA_RDFA_H_
75 #define _LIBRDFA_RDFA_H_
76 #include <stdlib.h>
77 #include <libxml/SAX2.h>
78 
79 /* DLLEXPORT prefixes the public declarations below; it becomes __declspec(dllexport) when WIN32 is defined and nothing otherwise. */
80 #ifdef WIN32 /* NOTE(review): keys on WIN32 rather than the compiler-predefined _WIN32 - confirm the Windows build defines WIN32 */
81 #define DLLEXPORT __declspec(dllexport)
82 #else
83 #define DLLEXPORT /* expands to nothing when WIN32 is not defined */
84 #endif
85 
86 #ifdef LIBRDFA_IN_RAPTOR
87 #include "raptor2.h"
88 #include "raptor_internal.h"
89 #endif /* LIBRDFA_IN_RAPTOR */
90 
91 #ifdef __cplusplus
92 extern "C"
93 {
94 #endif
95 
96 #define DEBUG 0 /* NOTE(review): nothing in this header reads DEBUG; presumably a debug-output switch for the implementation - confirm */
97
98 /* RDFa version numbers (presumably the values held in rdfacontext.rdfa_version - TODO confirm) */
99 #define RDFA_VERSION_1_0 1
100 #define RDFA_VERSION_1_1 2
101
102 /* parse process return codes, as described in the @return notes of rdfa_parse() and rdfa_parse_buffer() */
103 #define RDFA_PARSE_WARNING -2 /* there was a non-fatal error */
104 #define RDFA_PARSE_FAILED -1 /* there was a fatal error */
105 #define RDFA_PARSE_UNKNOWN 0 /* NOTE(review): meaning is not documented in this header */
106 #define RDFA_PARSE_SUCCESS 1 /* everything went well */
107
108 /* maximum list lengths (NOTE(review): where these limits are enforced is not visible in this header) */
109 #define MAX_LOCAL_LIST_MAPPINGS 32
110 #define MAX_LIST_MAPPINGS 48
111 #define MAX_LIST_ITEMS 16
112 #define MAX_TERM_MAPPINGS 64
113 #define MAX_URI_MAPPINGS 128
114 #define MAX_INCOMPLETE_TRIPLES 128
115
116 /* host language definitions (presumably the values held in rdfacontext.host_language - TODO confirm) */
117 #define HOST_LANGUAGE_NONE 0
118 #define HOST_LANGUAGE_XML1 1
119 #define HOST_LANGUAGE_XHTML1 2
120 #define HOST_LANGUAGE_HTML 3
121
122 /* default mapping key for xmlns (presumably the key under which a prefix-less xmlns declaration is stored - TODO confirm) */
123 #define XMLNS_DEFAULT_MAPPING "XMLNS_DEFAULT"
124
125 /* whitespace characters for RDFa Core 1.1: space, tab, newline, vertical tab, form feed, carriage return */
126 #define RDFA_WHITESPACE " \t\n\v\f\r"
127 
128 /**
129  * An RDF resource type denotes the content of a triple's object value
130  * (see rdftriple.object_type). No explicit values: 0..5 in declared order.
131  */
132 typedef enum
133 {
134    RDF_TYPE_NAMESPACE_PREFIX, /* 0 - per its name, a namespace prefix; when it is emitted is not visible here */
135    RDF_TYPE_IRI, /* 1 - per its name, the object is an IRI */
136    RDF_TYPE_PLAIN_LITERAL, /* 2 - per its name, a plain literal */
137    RDF_TYPE_XML_LITERAL, /* 3 - per its name, an XML literal */
138    RDF_TYPE_TYPED_LITERAL, /* 4 - per its name, a typed literal (presumably paired with rdftriple.datatype - TODO confirm) */
139    RDF_TYPE_UNKNOWN /* 5 - last enumerator */
140 } rdfresource_t;
141 
142 /**
143  * An RDF triple is the result of an RDFa statement that contains, at
144  * the very least, a subject, a predicate and an object. It is the
145  * smallest, complete statement one can make in RDF.
146  */
147 typedef struct rdftriple
148 {
149    char* subject; /* subject of the statement */
150    char* predicate; /* predicate of the statement */
151    char* object; /* object of the statement; object_type tells how to read it */
152    rdfresource_t object_type; /* denotes the content of the object value */
153    char* datatype; /* presumably the datatype of a typed literal object, unset otherwise - TODO confirm */
154    char* language; /* presumably the language of a literal object, unset otherwise - TODO confirm */
155 } rdftriple;
156 
157 /**
158  * The specification for a callback that is capable of handling triples.
159  * The callback receives a triple that must be freed once the application
160  * is done with the object.
161  */
162 typedef void (*triple_handler_fp)(rdftriple*, void*); /* second argument: presumably rdfacontext.callback_data - TODO confirm */
163 
164 /**
165  * The specification for a callback that is used to fill the input buffer
166  * with data to parse. NOTE(review): argument/return meanings not stated here.
167  */
168 typedef size_t (*buffer_filler_fp)(char*, size_t, void*); /* presumably (buffer, capacity, callback data) -> bytes written - TODO confirm */
169 
170 /**
171  * An RDFa list item is used to hold each datum in an rdfa list. It
172  * contains a set of flags as well as the data for the list member.
173  */
174 typedef struct rdfalistitem
175 {
176    unsigned char flags; /* flags for this item; NOTE(review): the flag values are not defined in this header */
177    void* data; /* the data for the list member; its concrete type is not fixed by this header */
178 } rdfalistitem;
179 
180 /**
181  * An RDFa list is used to store multiple text strings that have a set
182  * of attributes associated with them. These can be lists of CURIEs,
183  * or lists of incomplete triples. The structure grows with use, but
184  * cannot be shrunk.
185  */
186 typedef struct rdfalist
187 {
188    rdfalistitem** items; /* array of pointers to the list's items */
189    size_t num_items; /* presumably the number of items currently stored - TODO confirm */
190    size_t max_items; /* presumably the number of item slots currently allocated - TODO confirm */
191    unsigned int user_data; /* NOTE(review): purpose is not documented in this header */
192 } rdfalist;
193 
194 /**
195  * The RDFa context keeps track of the state of the current RDFa parser:
196  * the base, the default vocabulary, the mappings, the application
197  * callbacks and other context-specific values (see rdfa_create_context()).
198  */
199 typedef struct rdfacontext
200 {
201    unsigned char rdfa_version; /* presumably RDFA_VERSION_1_0 or RDFA_VERSION_1_1 - TODO confirm */
202    char* base; /* presumably the base URI given to rdfa_create_context() - TODO confirm */
203    char* parent_subject;
204    char* parent_object;
205    char* default_vocabulary;
206 #ifndef LIBRDFA_IN_RAPTOR
207    void** uri_mappings; /* only present when not built with LIBRDFA_IN_RAPTOR */
208 #endif
209    void** term_mappings;
210    void** list_mappings;
211    void** local_list_mappings;
212    rdfalist* incomplete_triples;
213    rdfalist* local_incomplete_triples;
214    char* language;
215    unsigned char host_language; /* presumably one of the HOST_LANGUAGE_* values - TODO confirm */
216    /* application callbacks; presumably installed through the rdfa_set_*() functions - TODO confirm */
217    triple_handler_fp default_graph_triple_callback;
218    buffer_filler_fp buffer_filler_callback;
219    triple_handler_fp processor_graph_triple_callback;
220
221    unsigned char recurse;
222    unsigned char skip_element;
223    char* new_subject;
224    char* current_object_resource;
225    /* the names below match RDFa attributes; presumably the values seen on the element being processed - TODO confirm */
226    char* about;
227    char* typed_resource;
228    char* resource;
229    char* href;
230    char* src;
231    char* content;
232    char* datatype;
233    rdfalist* property;
234    unsigned char inlist_present;
235    unsigned char rel_present;
236    unsigned char rev_present;
237    char* plain_literal;
238    size_t plain_literal_size;
239    char* xml_literal;
240    size_t xml_literal_size;
241
242    void* callback_data; /* set directly by the application (see usage notes); presumably handed to the callbacks as their void* argument - TODO confirm */
243
244    /* parse state */
245    size_t bnode_count;
246    char* underscore_colon_bnode_name;
247    unsigned char xml_literal_namespaces_defined;
248    unsigned char xml_literal_xml_lang_defined;
249    size_t wb_allocated;
250    char* working_buffer; /* presumably the input buffer handed out by rdfa_get_buffer() - TODO confirm */
251    size_t wb_position;
252 #ifdef LIBRDFA_IN_RAPTOR
253    raptor_world *world;
254    raptor_locator *locator;
255    /* NOTE(review): the comment here used to describe a pointer to the error_handlers
256     * structure held in the raptor_parser object; no such member follows - looks stale */
257    raptor_uri* base_uri;
258    raptor_sax2* sax2;
259    raptor_namespace_handler namespace_handler;
260    void* namespace_handler_user_data;
261    int raptor_rdfa_version; /* 10 or 11 or otherwise default */
262 #else
263    xmlParserCtxtPtr parser; /* libxml2 parser context; only present when not built with LIBRDFA_IN_RAPTOR */
264 #endif /* LIBRDFA_IN_RAPTOR */
265    int done;
266    rdfalist* context_stack;
267    size_t wb_preread;
268    int preread;
269    int depth;
270 } rdfacontext;
271 
272 /**
273  * Creates an initial context for RDFa; release it with rdfa_free_context().
274  *
275  * @param base The base URI that should be used for the parser.
276  *
277  * @return a pointer to the base RDFa context, or NULL if memory
278  *         allocation failed.
279  */
280 DLLEXPORT rdfacontext* rdfa_create_context(const char* base);
281 
282 /**
283  * Sets the default graph triple handler for the application.
284  * (Presumably stored in context->default_graph_triple_callback - TODO confirm.)
285  * @param context the base rdfa context for the application.
286  * @param th the triple handler function (see triple_handler_fp).
287  */
288 DLLEXPORT void rdfa_set_default_graph_triple_handler(
289    rdfacontext* context, triple_handler_fp th);
290 
291 /**
292  * Sets the processor graph triple handler for the application.
293  * (Presumably stored in context->processor_graph_triple_callback - TODO confirm.)
294  * @param context the base rdfa context for the application.
295  * @param th the triple handler function (see triple_handler_fp).
296  */
297 DLLEXPORT void rdfa_set_processor_graph_triple_handler(
298    rdfacontext* context, triple_handler_fp th);
299 
300 /**
301  * Sets the buffer filler for the application.
302  * (Presumably stored in context->buffer_filler_callback - TODO confirm.)
303  * @param context the base rdfa context for the application.
304  * @param bf the buffer filler function (see buffer_filler_fp).
305  */
306 DLLEXPORT void rdfa_set_buffer_filler(
307    rdfacontext* context, buffer_filler_fp bf);
308 
309 /**
310  * Starts processing given the base rdfa context. Per the usage notes, the
311  * triple handler and buffer filler are set on the context before this call.
312  * @param context the base rdfa context.
313  *
314  * @return RDFA_PARSE_SUCCESS if everything went well. RDFA_PARSE_FAILED
315  *         if there was a fatal error and RDFA_PARSE_WARNING if there
316  *         was a non-fatal error.
317  */
318 DLLEXPORT int rdfa_parse(rdfacontext* context);
319 /** Begins an application-driven parse; the usage notes continue only when this returns RDFA_PARSE_SUCCESS. */
320 DLLEXPORT int rdfa_parse_start(rdfacontext* context);
321 /** NOTE(review): undocumented; presumably parses wblen bytes of data, with done non-zero for the final chunk - confirm against the implementation. */
322 DLLEXPORT int rdfa_parse_chunk(
323    rdfacontext* context, char* data, size_t wblen, int done);
324 
325 /**
326  * Gets the input buffer for the given context so it can be filled with data.
327  * A pointer to the buffer will be returned and the maximum number of bytes
328  * that can be written to that buffer will be set to the blen parameter. Once
329  * data has been written to the buffer, rdfa_parse_buffer() should be called.
330  *
331  * @param context the base rdfa context.
332  * @param blen set to the maximum number of bytes that may be written.
333  *
334  * @return a pointer to the context's input buffer.
335  */
336 DLLEXPORT char* rdfa_get_buffer(rdfacontext* context, size_t* blen);
337 
338 /**
339  * Informs the parser to attempt to parse more of the given context's input
340  * buffer. To fill the input buffer with data, call rdfa_get_buffer().
341  *
342  * If any of the input buffer can be parsed, it will be. It is possible
343  * that none of the data will be parsed, in which case this function will
344  * still return RDFA_PARSE_SUCCESS. More data should be written to the input
345  * buffer using rdfa_get_buffer() as it is made available to the application.
346  * Once there is no more data to write, rdfa_parse_end() should be called.
347  *
348  * @param context the base rdfa context.
349  * @param bytes the number of bytes written to the input buffer obtained
350  *           from the last call to rdfa_get_buffer(); a value of 0 indicates
351  *           that there is no more data to parse.
352  *
353  * @return RDFA_PARSE_SUCCESS if everything went well. RDFA_PARSE_FAILED
354  *         if there was a fatal error and RDFA_PARSE_WARNING if there
355  *         was a non-fatal error.
356  */
357 DLLEXPORT int rdfa_parse_buffer(rdfacontext* context, size_t bytes);
358 /** Ends an application-driven parse; call it once there is no more data to write to the input buffer. */
359 DLLEXPORT void rdfa_parse_end(rdfacontext* context);
360 /** NOTE(review): undocumented; presumably (re)initializes the fields of an existing context - confirm against the implementation. */
361 DLLEXPORT void rdfa_init_context(rdfacontext* context);
362 /** NOTE(review): undocumented; presumably derives a base IRI string from iri; who frees the result is not stated here - confirm. */
363 DLLEXPORT char* rdfa_iri_get_base(const char* iri);
364 
365 /**
366  * Destroys the given rdfa context by freeing all memory associated
367  * with the context. The usage notes call this last, after parsing.
368  *
369  * @param context the rdfa context, as returned by rdfa_create_context().
370  */
371 DLLEXPORT void rdfa_free_context(rdfacontext* context);
372 
373 #ifdef __cplusplus
374 }
375 #endif
376 
377 #endif
378