1 /** 2 * Copyright 2008-2010 Digital Bazaar, Inc. 3 * 4 * This file is part of librdfa. 5 * 6 * librdfa is Free Software, and can be licensed under any of the 7 * following three licenses: 8 * 9 * 1. GNU Lesser General Public License (LGPL) V2.1 or any 10 * newer version 11 * 2. GNU General Public License (GPL) V2 or any newer version 12 * 3. Apache License, V2.0 or any newer version 13 * 14 * You may not use this file except in compliance with at least one of 15 * the above three licenses. 16 * 17 * See LICENSE-* at the top of this software distribution for more 18 * information regarding the details of each license. 19 * 20 * You should have received a copy of the GNU Lesser General Public 21 * License along with librdfa. If not, see <http://www.gnu.org/licenses/>. 22 * 23 * The librdfa library is the Fastest RDFa Parser in the Universe. It is 24 * a stream parser, meaning that it takes an XML data as input and spits 25 * out RDF triples as it comes across them in the stream. Due to this 26 * processing approach, librdfa has a very, very small memory footprint. 27 * It is also very fast and can operate on hundreds of gigabytes of XML 28 * data without breaking a sweat. 29 * 30 * Usage: 31 * 32 * rdfacontext* context = rdfa_create_context(base_uri); 33 * context->callback_data = your_user_data; 34 * rdfa_set_triple_handler(context, triple_function); 35 * rdfa_set_buffer_filler(context, buffer_filler_function); 36 * rdfa_parse(context); 37 * rdfa_free_context(context); 38 * 39 * If you would like to get warnings/error triples from the processor graph: 40 * 41 * rdfa_set_issue_handler(context, triple_function); 42 * 43 * Usage if you need more control over when to fill rdfa's buffer: 44 * 45 * rdfacontext* context = rdfa_create_context(base_uri); 46 * context->callback_data = your_user_data; 47 * rdfa_set_triple_handler(context, triple_function); 48 * int rval = rdfa_parse_start(context); 49 * if(rval == RDFA_PARSE_SUCCESS) 50 * { 51 * FILE* myfile = fopen("myfilename"); 52 * size_t buf_len = 0; 53 * size_t read = 0; 54 * do 55 * { 56 * char* buf = rdfa_get_buffer(context, &buf_len); 57 * if(buf_len > 0) 58 * { 59 * // fill buffer here up to buf_len bytes from your input stream 60 * read = fread(buf, sizeof(char), buf_len, myfile); 61 * } 62 * 63 * // parse the read data 64 * rdfa_parse_buffer(context, read); 65 * } 66 * while(read > 0); 67 * fclose(myfile); 68 * 69 * rdfa_parse_end(context); 70 * } 71 * rdfa_free_context(context); 72 * 73 */ 74 #ifndef _LIBRDFA_RDFA_H_ 75 #define _LIBRDFA_RDFA_H_ 76 #include <stdlib.h> 77 #include <libxml/SAX2.h> 78 79 /* Activate the stupid Windows DLL exporting mechanism if we're building for Windows */ 80 #ifdef WIN32 81 #define DLLEXPORT __declspec(dllexport) 82 #else 83 #define DLLEXPORT 84 #endif 85 86 #ifdef LIBRDFA_IN_RAPTOR 87 #include "raptor2.h" 88 #include "raptor_internal.h" 89 #endif /* LIBRDFA_IN_RAPTOR */ 90 91 #ifdef __cplusplus 92 extern "C" 93 { 94 #endif 95 96 #define DEBUG 0 97 98 /* RDFa version numbers */ 99 #define RDFA_VERSION_1_0 1 100 #define RDFA_VERSION_1_1 2 101 102 /* parse process return types */ 103 #define RDFA_PARSE_WARNING -2 104 #define RDFA_PARSE_FAILED -1 105 #define RDFA_PARSE_UNKNOWN 0 106 #define RDFA_PARSE_SUCCESS 1 107 108 /* maximum list lengths */ 109 #define MAX_LOCAL_LIST_MAPPINGS 32 110 #define MAX_LIST_MAPPINGS 48 111 #define MAX_LIST_ITEMS 16 112 #define MAX_TERM_MAPPINGS 64 113 #define MAX_URI_MAPPINGS 128 114 #define MAX_INCOMPLETE_TRIPLES 128 115 116 /* host language definitions */ 117 #define HOST_LANGUAGE_NONE 0 118 #define HOST_LANGUAGE_XML1 1 119 #define HOST_LANGUAGE_XHTML1 2 120 #define HOST_LANGUAGE_HTML 3 121 122 /* default mapping key for xmlns */ 123 #define XMLNS_DEFAULT_MAPPING "XMLNS_DEFAULT" 124 125 /* whitespace characters for RDFa Core 1.1 */ 126 #define RDFA_WHITESPACE " \t\n\v\f\r" 127 128 /** 129 * An RDF resource type is used to denote the content of a triple's 130 * object value. 131 */ 132 typedef enum 133 { 134 RDF_TYPE_NAMESPACE_PREFIX, 135 RDF_TYPE_IRI, 136 RDF_TYPE_PLAIN_LITERAL, 137 RDF_TYPE_XML_LITERAL, 138 RDF_TYPE_TYPED_LITERAL, 139 RDF_TYPE_UNKNOWN 140 } rdfresource_t; 141 142 /** 143 * An RDF triple is the result of an RDFa statement that contains, at 144 * the very least, a subject, a predicate and an object. It is the 145 * smallest, complete statement one can make in RDF. 146 */ 147 typedef struct rdftriple 148 { 149 char* subject; 150 char* predicate; 151 char* object; 152 rdfresource_t object_type; 153 char* datatype; 154 char* language; 155 } rdftriple; 156 157 /** 158 * The specification for a callback that is capable of handling 159 * triples. Produces a triple that must be freed once the application 160 * is done with the object. 161 */ 162 typedef void (*triple_handler_fp)(rdftriple*, void*); 163 164 /** 165 * The specification for a callback that is used to fill the input buffer 166 * with data to parse. 167 */ 168 typedef size_t (*buffer_filler_fp)(char*, size_t, void*); 169 170 /** 171 * An RDFA list item is used to hold each datum in an rdfa list. It 172 * contains a list of flags as well as the data for the list member. 173 */ 174 typedef struct rdfalistitem 175 { 176 unsigned char flags; 177 void* data; 178 } rdfalistitem; 179 180 /** 181 * An RDFa list is used to store multiple text strings that have a set 182 * of attributes associated with them. These can be lists of CURIEs, 183 * or lists of incomplete triples. The structure grows with use, but 184 * cannot be shrunk. 185 */ 186 typedef struct rdfalist 187 { 188 rdfalistitem** items; 189 size_t num_items; 190 size_t max_items; 191 unsigned int user_data; 192 } rdfalist; 193 194 /** 195 * The RDFa Parser structure is responsible for keeping track of the state of 196 * the current RDFa parser. Things such as the default namespace, 197 * CURIE mappings, and other context-specific 198 */ 199 typedef struct rdfacontext 200 { 201 unsigned char rdfa_version; 202 char* base; 203 char* parent_subject; 204 char* parent_object; 205 char* default_vocabulary; 206 #ifndef LIBRDFA_IN_RAPTOR 207 void** uri_mappings; 208 #endif 209 void** term_mappings; 210 void** list_mappings; 211 void** local_list_mappings; 212 rdfalist* incomplete_triples; 213 rdfalist* local_incomplete_triples; 214 char* language; 215 unsigned char host_language; 216 217 triple_handler_fp default_graph_triple_callback; 218 buffer_filler_fp buffer_filler_callback; 219 triple_handler_fp processor_graph_triple_callback; 220 221 unsigned char recurse; 222 unsigned char skip_element; 223 char* new_subject; 224 char* current_object_resource; 225 226 char* about; 227 char* typed_resource; 228 char* resource; 229 char* href; 230 char* src; 231 char* content; 232 char* datatype; 233 rdfalist* property; 234 unsigned char inlist_present; 235 unsigned char rel_present; 236 unsigned char rev_present; 237 char* plain_literal; 238 size_t plain_literal_size; 239 char* xml_literal; 240 size_t xml_literal_size; 241 242 void* callback_data; 243 244 /* parse state */ 245 size_t bnode_count; 246 char* underscore_colon_bnode_name; 247 unsigned char xml_literal_namespaces_defined; 248 unsigned char xml_literal_xml_lang_defined; 249 size_t wb_allocated; 250 char* working_buffer; 251 size_t wb_position; 252 #ifdef LIBRDFA_IN_RAPTOR 253 raptor_world *world; 254 raptor_locator *locator; 255 /* a pointer (in every context) to the error_handlers structure 256 * held in the raptor_parser object */ 257 raptor_uri* base_uri; 258 raptor_sax2* sax2; 259 raptor_namespace_handler namespace_handler; 260 void* namespace_handler_user_data; 261 int raptor_rdfa_version; /* 10 or 11 or otherwise default */ 262 #else 263 xmlParserCtxtPtr parser; 264 #endif 265 int done; 266 rdfalist* context_stack; 267 size_t wb_preread; 268 int preread; 269 int depth; 270 } rdfacontext; 271 272 /** 273 * Creates an initial context for RDFa. 274 * 275 * @param base The base URI that should be used for the parser. 276 * 277 * @return a pointer to the base RDFa context, or NULL if memory 278 * allocation failed. 279 */ 280 DLLEXPORT rdfacontext* rdfa_create_context(const char* base); 281 282 /** 283 * Sets the default graph triple handler for the application. 284 * 285 * @param context the base rdfa context for the application. 286 * @param th the triple handler function. 287 */ 288 DLLEXPORT void rdfa_set_default_graph_triple_handler( 289 rdfacontext* context, triple_handler_fp th); 290 291 /** 292 * Sets the processor graph triple handler for the application. 293 * 294 * @param context the base rdfa context for the application. 295 * @param th the triple handler function. 296 */ 297 DLLEXPORT void rdfa_set_processor_graph_triple_handler( 298 rdfacontext* context, triple_handler_fp th); 299 300 /** 301 * Sets the buffer filler for the application. 302 * 303 * @param context the base rdfa context for the application. 304 * @param bf the buffer filler function. 305 */ 306 DLLEXPORT void rdfa_set_buffer_filler( 307 rdfacontext* context, buffer_filler_fp bf); 308 309 /** 310 * Starts processing given the base rdfa context. 311 * 312 * @param context the base rdfa context. 313 * 314 * @return RDFA_PARSE_SUCCESS if everything went well. RDFA_PARSE_FAILED 315 * if there was a fatal error and RDFA_PARSE_WARNING if there 316 * was a non-fatal error. 317 */ 318 DLLEXPORT int rdfa_parse(rdfacontext* context); 319 320 DLLEXPORT int rdfa_parse_start(rdfacontext* context); 321 322 DLLEXPORT int rdfa_parse_chunk( 323 rdfacontext* context, char* data, size_t wblen, int done); 324 325 /** 326 * Gets the input buffer for the given context so it can be filled with data. 327 * A pointer to the buffer will be returned and the maximum number of bytes 328 * that can be written to that buffer will be set to the blen parameter. Once 329 * data has been written to the buffer, rdfa_parse_buffer() should be called. 330 * 331 * @param context the base rdfa context. 332 * @param blen the variable to set to the buffer length. 333 * 334 * @return a pointer to the context's input buffer. 335 */ 336 DLLEXPORT char* rdfa_get_buffer(rdfacontext* context, size_t* blen); 337 338 /** 339 * Informs the parser to attempt to parse more of the given context's input 340 * buffer. To fill the input buffer with data, call rdfa_get_buffer(). 341 * 342 * If any of the input buffer can be parsed, it will be. It is possible 343 * that none of the data will be parsed, in which case this function will 344 * still return RDFA_PARSE_SUCCESS. More data should be written to the input 345 * buffer using rdfa_get_buffer() as it is made available to the application. 346 * Once there is no more data to write, rdfa_parse_end() should be called. 347 * 348 * @param context the base rdfa context. 349 * @param bytes the number of bytes written to the input buffer via the last 350 * call to rdfa_get_buffer(), a value of 0 will indicate that there 351 * is no more data to parse. 352 * 353 * @return RDFA_PARSE_SUCCESS if everything went well. RDFA_PARSE_FAILED 354 * if there was a fatal error and RDFA_PARSE_WARNING if there 355 * was a non-fatal error. 356 */ 357 DLLEXPORT int rdfa_parse_buffer(rdfacontext* context, size_t bytes); 358 359 DLLEXPORT void rdfa_parse_end(rdfacontext* context); 360 361 DLLEXPORT void rdfa_init_context(rdfacontext* context); 362 363 DLLEXPORT char* rdfa_iri_get_base(const char* iri); 364 365 /** 366 * Destroys the given rdfa context by freeing all memory associated 367 * with the context. 368 * 369 * @param context the rdfa context. 370 */ 371 DLLEXPORT void rdfa_free_context(rdfacontext* context); 372 373 #ifdef __cplusplus 374 } 375 #endif 376 377 #endif 378