1 /* -*- Mode: c; c-basic-offset: 2 -*-
2 *
3 * ntriples_parse.c - Raptor N-Triples Parser implementation
4 *
5 * N-Triples
6 * http://www.w3.org/TR/rdf-testcases/#ntriples
7 *
8 * Copyright (C) 2001-2008, David Beckett http://www.dajobe.org/
9 * Copyright (C) 2001-2005, University of Bristol, UK http://www.bristol.ac.uk/
10 *
11 * This package is Free Software and part of Redland http://librdf.org/
12 *
13 * It is licensed under the following three licenses as alternatives:
14 * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
15 * 2. GNU General Public License (GPL) V2 or any newer version
16 * 3. Apache License, V2.0 or any newer version
17 *
18 * You may not use this file except in compliance with at least one of
19 * the above three licenses.
20 *
21 * See LICENSE.html or LICENSE.txt at the top of this package for the
22 * complete terms and further detail along with the license texts for
23 * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
24 *
25 *
26 */
27
28
29 #ifdef HAVE_CONFIG_H
30 #include <raptor_config.h>
31 #endif
32
33 #ifdef WIN32
34 #include <win32_raptor_config.h>
35 #endif
36
37 #include <stdio.h>
38 #include <string.h>
39 #include <ctype.h>
40 #include <stdarg.h>
41 #ifdef HAVE_ERRNO_H
42 #include <errno.h>
43 #endif
44 #ifdef HAVE_STDLIB_H
45 #include <stdlib.h>
46 #endif
47
48 /* Raptor includes */
49 #include "raptor.h"
50 #include "raptor_internal.h"
51
52 /* Set RAPTOR_DEBUG to > 1 to get lots of buffer related debugging */
53 /*
54 #undef RAPTOR_DEBUG
55 #define RAPTOR_DEBUG 2
56 */
57
58 /* Prototypes for local functions */
59 static void raptor_ntriples_generate_statement(raptor_parser* parser, const unsigned char *subject, const raptor_ntriples_term_type subject_type, const unsigned char *predicate, const raptor_ntriples_term_type predicate_type, const void *object, const raptor_ntriples_term_type object_type, const unsigned char *object_literal_language, const unsigned char *object_literal_datatype);
60
61 /*
62 * NTriples parser object
63 */
64 struct raptor_ntriples_parser_context_s {
65 /* current line */
66 unsigned char *line;
67 /* current line length */
68 int line_length;
69 /* current char in line buffer */
70 int offset;
71
72 char last_char;
73
74 /* static statement for use in passing to user code */
75 raptor_statement statement;
76 };
77
78
79 typedef struct raptor_ntriples_parser_context_s raptor_ntriples_parser_context;
80
81
82
83 /**
84 * raptor_ntriples_parse_init:
85 *
86 * Initialise the Raptor NTriples parser.
87 *
88 * Return value: non 0 on failure
89 **/
90
91 static int
raptor_ntriples_parse_init(raptor_parser * rdf_parser,const char * name)92 raptor_ntriples_parse_init(raptor_parser* rdf_parser, const char *name)
93 {
94 /*raptor_ntriples_parser_context *ntriples_parser=(raptor_ntriples_parser_context*)rdf_parser->context; */
95 return 0;
96 }
97
98
99 /* PUBLIC FUNCTIONS */
100
101
102 /*
103 * raptor_ntriples_parse_terminate - Free the Raptor NTriples parser
104 * @rdf_parser: parser object
105 *
106 **/
107 static void
raptor_ntriples_parse_terminate(raptor_parser * rdf_parser)108 raptor_ntriples_parse_terminate(raptor_parser* rdf_parser)
109 {
110 raptor_ntriples_parser_context *ntriples_parser=(raptor_ntriples_parser_context*)rdf_parser->context;
111 if(ntriples_parser->line_length)
112 RAPTOR_FREE(cdata, ntriples_parser->line);
113 }
114
115
/* Labels for N-Triples term types, indexed by the integer value of the type */
static const char * const term_type_strings[]={
  "URIref", "bnodeID", "Literal"
};
121
122
123 #ifndef RAPTOR_DISABLE_DEPRECATED
124 /**
125 * raptor_ntriples_term_as_string:
126 * @term: N-Triples term.
127 *
128 * Get a label for a #raptor_ntriples_term_type.
129 *
130 * @deprecated: an internal debug function, do not use.
131 *
132 * Return value: a pointer to a constant string.
133 **/
134 const char *
raptor_ntriples_term_as_string(raptor_ntriples_term_type term)135 raptor_ntriples_term_as_string(raptor_ntriples_term_type term)
136 {
137 return term_type_strings[(int)term];
138 }
139 #endif
140
141
142 static void
raptor_ntriples_generate_statement(raptor_parser * parser,const unsigned char * subject,const raptor_ntriples_term_type subject_type,const unsigned char * predicate,const raptor_ntriples_term_type predicate_type,const void * object,const raptor_ntriples_term_type object_type,const unsigned char * object_literal_language,const unsigned char * object_literal_datatype)143 raptor_ntriples_generate_statement(raptor_parser* parser,
144 const unsigned char *subject,
145 const raptor_ntriples_term_type subject_type,
146 const unsigned char *predicate,
147 const raptor_ntriples_term_type predicate_type,
148 const void *object,
149 const raptor_ntriples_term_type object_type,
150 const unsigned char *object_literal_language,
151 const unsigned char *object_literal_datatype)
152 {
153 /* raptor_ntriples_parser_context *ntriples_parser=(raptor_ntriples_parser_context*)parser->context; */
154 raptor_statement *statement=&parser->statement;
155 raptor_uri *subject_uri=NULL;
156 raptor_uri *predicate_uri=NULL;
157 raptor_uri *object_uri=NULL;
158 raptor_uri *datatype_uri=NULL;
159
160 /* Two choices for subject from N-Triples */
161 if(subject_type == RAPTOR_NTRIPLES_TERM_TYPE_BLANK_NODE) {
162 statement->subject=subject;
163 statement->subject_type=RAPTOR_IDENTIFIER_TYPE_ANONYMOUS;
164 } else {
165 subject_uri=raptor_new_uri_v2(parser->world, subject);
166 if(!subject_uri) {
167 raptor_parser_error(parser, "Could not create subject uri '%s', skipping", subject);
168 goto cleanup;
169 }
170 statement->subject=subject_uri;
171 statement->subject_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
172 }
173
174 if(object_literal_datatype) {
175 datatype_uri=raptor_new_uri_v2(parser->world, object_literal_datatype);
176 if(!datatype_uri) {
177 raptor_parser_error(parser, "Could not create object literal datatype uri '%s', skipping", object_literal_datatype);
178 goto cleanup;
179 }
180 object_literal_language=NULL;
181 }
182
183 /* Predicates in N-Triples are URIs but check for bad ordinals */
184 if(!strncmp((const char*)predicate, "http://www.w3.org/1999/02/22-rdf-syntax-ns#_", 44)) {
185 int predicate_ordinal=raptor_check_ordinal(predicate+44);
186 if(predicate_ordinal <= 0)
187 raptor_parser_error(parser, "Illegal ordinal value %d in property '%s'.", predicate_ordinal, predicate);
188 }
189
190 predicate_uri=raptor_new_uri_v2(parser->world, predicate);
191 if(!predicate_uri) {
192 raptor_parser_error(parser, "Could not create predicate uri '%s', skipping", predicate);
193 goto cleanup;
194 }
195 statement->predicate_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
196 statement->predicate=predicate_uri;
197
198 /* Three choices for object from N-Triples */
199 statement->object_literal_language=NULL;
200 statement->object_literal_datatype=NULL;
201 if(object_type == RAPTOR_NTRIPLES_TERM_TYPE_URI_REF) {
202 object_uri=raptor_new_uri_v2(parser->world, (const unsigned char*)object);
203 if(!object_uri) {
204 raptor_parser_error(parser, "Could not create object uri '%s', skipping", (const char *)object);
205 goto cleanup;
206 }
207 statement->object=object_uri;
208 statement->object_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
209 } else if(object_type == RAPTOR_NTRIPLES_TERM_TYPE_BLANK_NODE) {
210 statement->object=object;
211 statement->object_type=RAPTOR_IDENTIFIER_TYPE_ANONYMOUS;
212 } else {
213 statement->object_type=RAPTOR_IDENTIFIER_TYPE_LITERAL;
214 statement->object=object;
215 statement->object_literal_language=object_literal_language;
216 statement->object_literal_datatype=datatype_uri;
217 }
218
219 if(!parser->statement_handler)
220 goto cleanup;
221
222 /* Generate the statement; or is it fact? */
223 (*parser->statement_handler)(parser->user_data, statement);
224
225 cleanup:
226 if(subject_uri)
227 raptor_free_uri_v2(parser->world, subject_uri);
228 if(predicate_uri)
229 raptor_free_uri_v2(parser->world, predicate_uri);
230 if(object_uri)
231 raptor_free_uri_v2(parser->world, object_uri);
232 if(datatype_uri)
233 raptor_free_uri_v2(parser->world, datatype_uri);
234 }
235
236
/*
 * Character class tests on 7-bit ASCII code values; these do not consult
 * the locale.  Each macro may evaluate its argument more than once.
 */
#define IS_ASCII_UPPER(c) ((c)>=0x41 && (c)<=0x5A)
#define IS_ASCII_ALPHA(c) (IS_ASCII_UPPER(c) || ((c)>=0x61 && (c)<=0x7A))
#define IS_ASCII_DIGIT(c) ((c)>=0x30 && (c)<=0x39)
#define IS_ASCII_PRINT(c) ((c)>=0x20 && (c)<=0x7E)
/* only meaningful for values accepted by IS_ASCII_UPPER() */
#define TO_ASCII_LOWER(c) (0x20+(c))
243
/* Lexical classes of term, each with its own rule for where the term ends */
typedef enum {
  /* ends on > */
  RAPTOR_TERM_CLASS_URI,
  /* ends on first non [A-Za-z][A-Za-z0-9]* */
  RAPTOR_TERM_CLASS_BNODEID,
  /* ends on non-escaped " */
  RAPTOR_TERM_CLASS_STRING,
  /* ends on first non [a-z0-9]+ ('-' [a-z0-9]+ )? */
  RAPTOR_TERM_CLASS_LANGUAGE,
  /* the entire string is used */
  RAPTOR_TERM_CLASS_FULL
} raptor_ntriples_term_class;
251
252
253 static int
raptor_ntriples_term_valid(unsigned char c,int position,raptor_ntriples_term_class term_class)254 raptor_ntriples_term_valid(unsigned char c, int position,
255 raptor_ntriples_term_class term_class)
256 {
257 int result=0;
258
259 switch(term_class) {
260 case RAPTOR_TERM_CLASS_URI:
261 /* ends on > */
262 result=(c!= '>');
263 break;
264
265 case RAPTOR_TERM_CLASS_BNODEID:
266 /* ends on first non [A-Za-z][A-Za-z0-9]* */
267 result=IS_ASCII_ALPHA(c);
268 if(position)
269 result = (result || IS_ASCII_DIGIT(c));
270 break;
271
272 case RAPTOR_TERM_CLASS_STRING:
273 /* ends on " */
274 result=(c!= '"');
275 break;
276
277 case RAPTOR_TERM_CLASS_LANGUAGE:
278 /* ends on first non [a-z0-9]+ ('-' [a-z0-9]+ )? */
279 result=(IS_ASCII_ALPHA(c) || IS_ASCII_DIGIT(c));
280 if(position)
281 result = (result || c=='-');
282 break;
283
284 case RAPTOR_TERM_CLASS_FULL:
285 result=1;
286 break;
287
288 default:
289 RAPTOR_FATAL2("Unknown ntriples term %d", term_class);
290 }
291
292 return result;
293 }
294
295
296 /*
297 * raptor_ntriples_term - Parse an N-Triples term with escapes
298 * @parser: NTriples parser
299 * @start: pointer to starting character of string (in)
300 * @dest: destination of string (in)
301 * @lenp: pointer to length of string (in/out)
302 * @dest_lenp: pointer to length of destination string (out)
303 * @end_char: string ending character
304 * @class: string class
305 * @allow_utf8: Non-0 if UTF-8 chars are allowed in the term
306 *
307 * N-Triples strings/URIs are written in ASCII at present; characters
308 * outside the printable ASCII range are discarded with a warning.
309 * See the grammar for full details of the allowed ranges.
310 *
311 * If the class is RAPTOR_TERM_CLASS_FULL, the end_char is ignored.
312 *
313 * UTF-8 is only allowed if allow_utf8 is non-0, otherwise the
314 * string is US-ASCII and only the \u and \U esapes are allowed.
315 * If enabled, both are allowed.
316 *
317 * Return value: Non 0 on failure
318 **/
319 static int
raptor_ntriples_term(raptor_parser * rdf_parser,const unsigned char ** start,unsigned char * dest,size_t * lenp,size_t * dest_lenp,char end_char,raptor_ntriples_term_class term_class,int allow_utf8)320 raptor_ntriples_term(raptor_parser* rdf_parser,
321 const unsigned char **start, unsigned char *dest,
322 size_t *lenp, size_t *dest_lenp,
323 char end_char,
324 raptor_ntriples_term_class term_class,
325 int allow_utf8)
326 {
327 const unsigned char *p=*start;
328 unsigned char c='\0';
329 size_t ulen=0;
330 unsigned long unichar=0;
331 unsigned int position=0;
332 int end_char_seen=0;
333
334 if(term_class == RAPTOR_TERM_CLASS_FULL)
335 end_char='\0';
336
337 /* find end of string, fixing backslashed characters on the way */
338 while(*lenp > 0) {
339 c = *p;
340
341 p++;
342 (*lenp)--;
343 rdf_parser->locator.column++;
344 rdf_parser->locator.byte++;
345
346 if(allow_utf8) {
347 if(c > 0x7f) {
348 /* just copy the UTF-8 bytes through */
349 size_t unichar_len=raptor_utf8_to_unicode_char(NULL, (const unsigned char*)p-1, 1+*lenp);
350 if(unichar_len > *lenp) {
351 raptor_parser_error(rdf_parser, "UTF-8 encoding error at character %d (0x%02X) found.", c, c);
352 /* UTF-8 encoding had an error or ended in the middle of a string */
353 return 1;
354 }
355 memcpy(dest, p-1, unichar_len);
356 dest+= unichar_len;
357
358 unichar_len--; /* p, *lenp were moved on by 1 earlier */
359
360 p += unichar_len;
361 (*lenp) -= unichar_len;
362 rdf_parser->locator.column+= unichar_len;
363 rdf_parser->locator.byte+= unichar_len;
364 continue;
365 }
366 } else if(!IS_ASCII_PRINT(c)) {
367 /* This is an ASCII check, not a printable character check
368 * so isprint() is not appropriate, since that is a locale check.
369 */
370 raptor_parser_error(rdf_parser, "Non-printable ASCII character %d (0x%02X) found.", c, c);
371 continue;
372 }
373
374 if(c != '\\') {
375 /* finish at non-backslashed end_char */
376 if(end_char && c == end_char) {
377 end_char_seen=1;
378 break;
379 }
380
381 if(!raptor_ntriples_term_valid(c, position, term_class)) {
382 if(end_char) {
383 /* end char was expected, so finding an invalid thing is an error */
384 raptor_parser_error(rdf_parser, "Missing terminating '%c' (found '%c')", end_char, c);
385 return 0;
386 } else {
387 /* it's the end - so rewind 1 to save next char */
388 p--;
389 (*lenp)++;
390 rdf_parser->locator.column--;
391 rdf_parser->locator.byte--;
392 break;
393 }
394 }
395
396 /* otherwise store and move on */
397 *dest++=c;
398 position++;
399 continue;
400 }
401
402 if(!*lenp) {
403 if(term_class != RAPTOR_TERM_CLASS_FULL)
404 raptor_parser_error(rdf_parser, "\\ at end of line");
405 return 0;
406 }
407
408 c = *p;
409
410 p++;
411 (*lenp)--;
412 rdf_parser->locator.column++;
413 rdf_parser->locator.byte++;
414
415 switch(c) {
416 case '"':
417 case '\\':
418 *dest++=c;
419 break;
420 case 'n':
421 *dest++='\n';
422 break;
423 case 'r':
424 *dest++='\r';
425 break;
426 case 't':
427 *dest++='\t';
428 break;
429 case 'u':
430 case 'U':
431 ulen=(c == 'u') ? 4 : 8;
432
433 if(*lenp < ulen) {
434 raptor_parser_error(rdf_parser, "%c over end of line", c);
435 return 0;
436 }
437
438 if(1) {
439 int n;
440
441 n=sscanf((const char*)p, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar);
442 if(n != 1) {
443 raptor_parser_error(rdf_parser, "Illegal Uncode escape '%c%s...'", c, p);
444 break;
445 }
446 }
447
448 p+=ulen;
449 (*lenp)-=ulen;
450 rdf_parser->locator.column+=ulen;
451 rdf_parser->locator.byte+=ulen;
452
453 if(unichar > 0x10ffff) {
454 raptor_parser_error(rdf_parser, "Illegal Unicode character with code point #x%lX.", unichar);
455 break;
456 }
457
458 dest+=raptor_unicode_char_to_utf8(unichar, dest);
459 break;
460
461 default:
462 raptor_parser_error(rdf_parser,
463 "Illegal string escape \\%c in \"%s\"", c,
464 (char*)start);
465 return 0;
466 }
467
468 position++;
469 } /* end while */
470
471
472 if(end_char && !end_char_seen) {
473 raptor_parser_error(rdf_parser, "Missing terminating '%c' before end of line.", end_char);
474 return 1;
475 }
476
477 /* terminate dest, can be shorter than source */
478 *dest='\0';
479
480 if(dest_lenp)
481 *dest_lenp=p-*start;
482
483 *start=p;
484
485 return 0;
486 }
487
488
489 #ifndef RAPTOR_DISABLE_DEPRECATED
490 /**
491 * raptor_ntriples_string_as_utf8_string:
492 * @rdf_parser: parser object
493 * @src: data to read from
494 * @len: size of data
495 * @dest_lenp: pointer to length of destination (out) or NULL
496 *
497 * Turn an N-Triples string with escapes into a UTF-8 string.
498 *
499 * @deprecated: This requires use of parser internals and was never in the public API header.
500 *
501 * Return value: a new UTF-8 string
502 **/
503 unsigned char*
raptor_ntriples_string_as_utf8_string(raptor_parser * rdf_parser,const unsigned char * src,int len,size_t * dest_lenp)504 raptor_ntriples_string_as_utf8_string(raptor_parser* rdf_parser,
505 const unsigned char *src, int len,
506 size_t *dest_lenp)
507 {
508 const unsigned char *start=src;
509 size_t length=len;
510 unsigned char *dest;
511 int rc;
512
513 dest=(unsigned char*)RAPTOR_MALLOC(cstring, len+1);
514 if(!dest)
515 return NULL;
516
517 rc=raptor_ntriples_term(rdf_parser, &start, dest, &length, dest_lenp,
518 '\0', RAPTOR_TERM_CLASS_FULL, 1);
519 if(rc) {
520 RAPTOR_FREE(cstring, dest);
521 dest=NULL;
522 }
523 return dest;
524 }
525 #endif
526
527
528 static int
raptor_ntriples_parse_line(raptor_parser * rdf_parser,unsigned char * buffer,size_t len)529 raptor_ntriples_parse_line(raptor_parser* rdf_parser,
530 unsigned char *buffer, size_t len)
531 {
532 int i;
533 unsigned char *p;
534 unsigned char *dest;
535 unsigned char *terms[3];
536 int terms_allocated[3];
537 size_t term_lengths[3];
538 raptor_ntriples_term_type term_types[3];
539 size_t term_length= 0;
540 unsigned char *object_literal_language=NULL;
541 unsigned char *object_literal_datatype=NULL;
542 int rc=0;
543
544 for(i=0; i<3; i++)
545 terms_allocated[i]=0;
546
547 /* ASSERTION:
548 * p always points to first char we are considering
549 * p[len-1] always points to last char
550 */
551
552 /* Handle empty lines */
553 if(!len)
554 return 0;
555
556 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
557 RAPTOR_DEBUG3("handling line '%s' (%d bytes)\n", buffer, (unsigned int)len);
558 #endif
559
560 p=buffer;
561
562 while(len>0 && isspace((int)*p)) {
563 p++;
564 rdf_parser->locator.column++;
565 rdf_parser->locator.byte++;
566 len--;
567 }
568
569 /* Handle empty - all whitespace lines */
570 if(!len)
571 return 0;
572
573 /* Handle comment lines */
574 if(*p == '#')
575 return 0;
576
577 /* Remove trailing spaces */
578 while(len>0 && isspace((int)p[len-1])) {
579 p[len-1]='\0';
580 len--;
581 }
582
583 /* can't be empty now - that would have been caught above */
584
585 /* Check for terminating '.' */
586 if(p[len-1] != '.') {
587 /* Move current location to point to problem */
588 rdf_parser->locator.column += len-2;
589 rdf_parser->locator.byte += len-2;
590 raptor_parser_error(rdf_parser, "Missing . at end of line");
591 return 0;
592 }
593
594 p[len-1]='\0';
595 len--;
596
597
598 /* Must be triple */
599
600 for(i=0; i<3; i++) {
601 if(!len) {
602 raptor_parser_error(rdf_parser, "Unexpected end of line");
603 goto cleanup;
604 }
605
606 /* Expect either <URI> or _:name */
607 if(i == 2) {
608 if(*p != '<' && *p != '_' && *p != '"' && *p != 'x') {
609 raptor_parser_error(rdf_parser, "Saw '%c', expected <URIref>, _:bnodeID or \"literal\"", *p);
610 goto cleanup;
611 }
612 if(*p == 'x') {
613 if(len < 4 || strncmp((const char*)p, "xml\"", 4)) {
614 raptor_parser_error(rdf_parser, "Saw '%c', expected xml\"...\")", *p);
615 goto cleanup;
616 }
617 }
618 } else if(i == 1) {
619 if(*p != '<') {
620 raptor_parser_error(rdf_parser, "Saw '%c', expected <URIref>", *p);
621 goto cleanup;
622 }
623 } else /* i==0 */ {
624 if(*p != '<' && *p != '_') {
625 raptor_parser_error(rdf_parser, "Saw '%c', expected <URIref> or _:bnodeID", *p);
626 goto cleanup;
627 }
628 }
629
630 switch(*p) {
631 case '<':
632 term_types[i]= RAPTOR_NTRIPLES_TERM_TYPE_URI_REF;
633
634 dest=p;
635
636 p++;
637 len--;
638 rdf_parser->locator.column++;
639 rdf_parser->locator.byte++;
640
641 if(raptor_ntriples_term(rdf_parser,
642 (const unsigned char**)&p,
643 dest, &len, &term_length,
644 '>', RAPTOR_TERM_CLASS_URI, 0)) {
645 rc=1;
646 goto cleanup;
647 }
648 break;
649
650 case '"':
651 term_types[i]= RAPTOR_NTRIPLES_TERM_TYPE_LITERAL;
652
653 dest=p;
654
655 p++;
656 len--;
657 rdf_parser->locator.column++;
658 rdf_parser->locator.byte++;
659
660 if(raptor_ntriples_term(rdf_parser,
661 (const unsigned char**)&p,
662 dest, &len, &term_length,
663 '"', RAPTOR_TERM_CLASS_STRING, 0)) {
664 rc=1;
665 goto cleanup;
666 }
667
668 if(len && (*p == '-' || *p == '@')) {
669 if(*p == '-')
670 raptor_parser_error(rdf_parser, "Old N-Triples language syntax using \"string\"-lang rather than \"string\"@lang.");
671
672 object_literal_language=p;
673
674 /* Skip - */
675 p++;
676 len--;
677 rdf_parser->locator.column++;
678 rdf_parser->locator.byte++;
679
680 if(!len) {
681 raptor_parser_error(rdf_parser, "Missing language after \"string\"-");
682 goto cleanup;
683 }
684
685
686 if(raptor_ntriples_term(rdf_parser,
687 (const unsigned char**)&p,
688 object_literal_language, &len, NULL,
689 '\0', RAPTOR_TERM_CLASS_LANGUAGE, 0)) {
690 rc=1;
691 goto cleanup;
692 }
693 }
694
695 if(len >1 && *p == '^' && p[1] == '^') {
696
697 object_literal_datatype=p;
698
699 /* Skip ^^ */
700 p+= 2;
701 len-= 2;
702 rdf_parser->locator.column+= 2;
703 rdf_parser->locator.byte+= 2;
704
705 if(!len || (len && *p != '<')) {
706 raptor_parser_error(rdf_parser, "Missing datatype URI-ref in\"string\"^^<URI-ref> after ^^");
707 goto cleanup;
708 }
709
710 p++;
711 len--;
712 rdf_parser->locator.column++;
713 rdf_parser->locator.byte++;
714
715 if(raptor_ntriples_term(rdf_parser,
716 (const unsigned char**)&p,
717 object_literal_datatype, &len, NULL,
718 '>', RAPTOR_TERM_CLASS_URI, 0)) {
719 rc=1;
720 goto cleanup;
721 }
722
723 }
724
725 if(object_literal_datatype && object_literal_language) {
726 raptor_parser_warning(rdf_parser, "Typed literal used with a language - ignoring the language");
727 object_literal_language=NULL;
728 }
729
730
731 break;
732
733
734 case '_':
735 term_types[i]= RAPTOR_NTRIPLES_TERM_TYPE_BLANK_NODE;
736
737 /* store where _ was */
738 dest=p;
739
740 p++;
741 len--;
742 rdf_parser->locator.column++;
743 rdf_parser->locator.byte++;
744
745 if(!len || (len > 0 && *p != ':')) {
746 raptor_parser_error(rdf_parser, "Illegal bNodeID - _ not followed by :");
747 goto cleanup;
748 }
749
750 /* Found ':' - move on */
751
752 p++;
753 len--;
754 rdf_parser->locator.column++;
755 rdf_parser->locator.byte++;
756
757 if(raptor_ntriples_term(rdf_parser,
758 (const unsigned char**)&p,
759 dest, &len, &term_length,
760 '\0', RAPTOR_TERM_CLASS_BNODEID, 0)) {
761 rc=1;
762 goto cleanup;
763 }
764
765 if(!term_length) {
766 raptor_parser_error(rdf_parser, "Bad or missing bNodeID after _:");
767 goto cleanup;
768 } else {
769 unsigned char *blank=(unsigned char*)RAPTOR_MALLOC(cstring, term_length+1);
770 if(!blank) {
771 raptor_parser_fatal_error(rdf_parser, "Out of memory");
772 rc=1;
773 goto cleanup;
774 }
775 strcpy((char*)blank, (const char*)dest);
776 dest=raptor_parser_internal_generate_id(rdf_parser,
777 RAPTOR_GENID_TYPE_BNODEID,
778 blank);
779 terms_allocated[i]=1;
780 }
781
782 break;
783
784 case 'x':
785
786 raptor_parser_error(rdf_parser, "Old N-Triples XML using xml\"string\"-lang rather than \"string\"@lang^^<%s>.", raptor_xml_literal_datatype_uri_string);
787
788 /* already know we have 'xml"' coming up */
789 term_types[i]= RAPTOR_NTRIPLES_TERM_TYPE_LITERAL;
790
791 /* 3=strlen("xml") */
792 p+=3;
793 len-=3;
794
795 dest=p;
796
797 p++;
798 len--;
799 rdf_parser->locator.column++;
800 rdf_parser->locator.byte++;
801
802 if(raptor_ntriples_term(rdf_parser,
803 (const unsigned char**)&p,
804 dest, &len, &term_length,
805 '"', RAPTOR_TERM_CLASS_STRING, 0)) {
806 rc=1;
807 goto cleanup;
808 }
809
810 /* got XML literal string */
811 object_literal_datatype=(unsigned char*)raptor_xml_literal_datatype_uri_string;
812
813 if(len && (*p == '-' || *p == '@')) {
814 if(*p == '-')
815 raptor_parser_error(rdf_parser, "Old N-Triples language syntax using xml\"string\"-lang rather than xml\"string\"@lang.");
816
817 object_literal_language=p;
818
819 /* Skip - */
820 p++;
821 len--;
822 rdf_parser->locator.column++;
823 rdf_parser->locator.byte++;
824
825 if(!len) {
826 raptor_parser_error(rdf_parser, "Missing language in xml\"string\"-language after -");
827 goto cleanup;
828 }
829
830 if(raptor_ntriples_term(rdf_parser,
831 (const unsigned char**)&p,
832 object_literal_language, &len, NULL,
833 '"', RAPTOR_TERM_CLASS_STRING, 0)) {
834 rc=1;
835 goto cleanup;
836 }
837
838 }
839
840 if(len >1 && *p == '^' && p[1] == '^') {
841
842 object_literal_datatype=p;
843
844 /* Skip ^^ */
845 p+= 2;
846 len-= 2;
847 rdf_parser->locator.column+= 2;
848 rdf_parser->locator.byte+= 2;
849
850 if(!len || (len && *p != '<')) {
851 raptor_parser_error(rdf_parser, "Missing datatype URI-ref in xml\"string\"^^<URI-ref> after ^^");
852 goto cleanup;
853 }
854
855 p++;
856 len--;
857 rdf_parser->locator.column++;
858 rdf_parser->locator.byte++;
859
860 if(raptor_ntriples_term(rdf_parser,
861 (const unsigned char**)&p,
862 object_literal_datatype, &len, NULL,
863 '>', RAPTOR_TERM_CLASS_URI, 0)) {
864 rc=1;
865 goto cleanup;
866 }
867
868 }
869
870 if(len) {
871 if(*p != ' ') {
872 raptor_parser_error(rdf_parser, "Missing terminating ' '");
873 return 0;
874 }
875
876 p++;
877 len--;
878 rdf_parser->locator.column++;
879 rdf_parser->locator.byte++;
880 }
881
882 break;
883
884
885 default:
886 raptor_parser_fatal_error(rdf_parser, "Unknown term type");
887 rc=1;
888 goto cleanup;
889 }
890
891
892 /* Store term */
893 terms[i]=dest; term_lengths[i]=term_length;
894
895 /* Whitespace must separate the terms */
896 if(i<2 && !isspace((int)*p)) {
897 raptor_parser_error(rdf_parser, "Missing whitespace after term '%s'", terms[i]);
898 rc=1;
899 goto cleanup;
900 }
901
902 /* Skip whitespace after terms */
903 while(len>0 && isspace((int)*p)) {
904 p++;
905 len--;
906 rdf_parser->locator.column++;
907 rdf_parser->locator.byte++;
908 }
909
910 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
911 fprintf(stderr, "item %d: term '%s' len %d type %s\n",
912 i, terms[i], (unsigned int)term_lengths[i],
913 raptor_ntriples_term_as_string(term_types[i]));
914 #endif
915 }
916
917 if(len) {
918 raptor_parser_error(rdf_parser, "Junk before terminating \".\"");
919 return 0;
920 }
921
922
923 if(object_literal_language) {
924 unsigned char *q;
925 /* Normalize language to lowercase
926 * http://www.w3.org/TR/rdf-concepts/#dfn-language-identifier
927 */
928 for(q=object_literal_language; *q; q++) {
929 if(IS_ASCII_UPPER(*q))
930 *q=TO_ASCII_LOWER(*q);
931 }
932 }
933
934 raptor_ntriples_generate_statement(rdf_parser,
935 terms[0], term_types[0],
936 terms[1], term_types[1],
937 terms[2], term_types[2],
938 object_literal_language,
939 object_literal_datatype);
940
941 rdf_parser->locator.byte += len;
942
943 cleanup:
944 for(i=0; i<3; i++)
945 if(terms_allocated[i])
946 RAPTOR_FREE(cstring, terms[i]);
947
948 return rc;
949 }
950
951
952 static int
raptor_ntriples_parse_chunk(raptor_parser * rdf_parser,const unsigned char * s,size_t len,int is_end)953 raptor_ntriples_parse_chunk(raptor_parser* rdf_parser,
954 const unsigned char *s, size_t len,
955 int is_end)
956 {
957 unsigned char *buffer;
958 unsigned char *ptr;
959 unsigned char *start;
960 raptor_ntriples_parser_context *ntriples_parser=(raptor_ntriples_parser_context*)rdf_parser->context;
961
962 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
963 RAPTOR_DEBUG2("adding %d bytes to buffer\n", (unsigned int)len);
964 #endif
965
966 /* No data? It's the end */
967 if(!len)
968 return 0;
969
970 buffer=(unsigned char*)RAPTOR_MALLOC(cstring, ntriples_parser->line_length + len + 1);
971 if(!buffer) {
972 raptor_parser_fatal_error(rdf_parser, "Out of memory");
973 return 1;
974 }
975
976 if(ntriples_parser->line_length) {
977 strncpy((char*)buffer, (const char*)ntriples_parser->line, ntriples_parser->line_length);
978 RAPTOR_FREE(cstring, ntriples_parser->line);
979 }
980
981 ntriples_parser->line=buffer;
982
983 /* move pointer to end of cdata buffer */
984 ptr=buffer+ntriples_parser->line_length;
985
986 /* adjust stored length */
987 ntriples_parser->line_length += len;
988
989 /* now write new stuff at end of cdata buffer */
990 strncpy((char*)ptr, (const char*)s, len);
991 ptr += len;
992 *ptr = '\0';
993
994 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
995 RAPTOR_DEBUG2("buffer now %d bytes\n", ntriples_parser->line_length);
996 #endif
997
998 ptr=buffer+ntriples_parser->offset;
999 while(*(start=ptr)) {
1000 unsigned char *line_start=ptr;
1001
1002 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
1003 RAPTOR_DEBUG3("line buffer now '%s' (offset %d)\n", ptr, ptr-(buffer+ntriples_parser->offset));
1004 #endif
1005
1006 /* skip \n when just seen \r - i.e. \r\n or CR LF */
1007 if(ntriples_parser->last_char == '\r' && *ptr == '\n') {
1008 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
1009 RAPTOR_DEBUG1("skipping a \\n\n");
1010 #endif
1011 ptr++;
1012 rdf_parser->locator.byte++;
1013 rdf_parser->locator.column=0;
1014 start=line_start=ptr;
1015 }
1016
1017 while(*ptr && *ptr != '\n' && *ptr != '\r')
1018 ptr++;
1019
1020 if(!*ptr)
1021 break;
1022
1023 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
1024 RAPTOR_DEBUG3("found newline \\x%02x at offset %d\n", *ptr,
1025 ptr-line_start);
1026 #endif
1027 ntriples_parser->last_char=*ptr;
1028
1029 len=ptr-line_start;
1030 rdf_parser->locator.column=0;
1031
1032 *ptr='\0';
1033 if(raptor_ntriples_parse_line(rdf_parser,line_start,len))
1034 return 1;
1035
1036 rdf_parser->locator.line++;
1037
1038 /* go past newline */
1039 ptr++;
1040 rdf_parser->locator.byte++;
1041
1042 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
1043 /* Do not peek if too far */
1044 if(ptr-buffer < ntriples_parser->line_length)
1045 RAPTOR_DEBUG2("next char is \\x%02x\n", *ptr);
1046 else
1047 RAPTOR_DEBUG1("next char unknown - end of buffer\n");
1048 #endif
1049 }
1050
1051 ntriples_parser->offset=start-buffer;
1052
1053 len=ntriples_parser->line_length - ntriples_parser->offset;
1054
1055 if(len) {
1056 /* collapse buffer */
1057
1058 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
1059 RAPTOR_DEBUG3("collapsing buffer from %d to %d bytes\n", ntriples_parser->line_length, (unsigned int)len);
1060 #endif
1061 buffer=(unsigned char*)RAPTOR_MALLOC(cstring, len + 1);
1062 if(!buffer) {
1063 raptor_parser_fatal_error(rdf_parser, "Out of memory");
1064 return 1;
1065 }
1066
1067 strncpy((char*)buffer,
1068 (const char*)ntriples_parser->line+ntriples_parser->line_length-len,
1069 len);
1070 buffer[len]='\0';
1071
1072 RAPTOR_FREE(cstring, ntriples_parser->line);
1073
1074 ntriples_parser->line=buffer;
1075 ntriples_parser->line_length -= ntriples_parser->offset;
1076 ntriples_parser->offset=0;
1077
1078 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
1079 RAPTOR_DEBUG3("buffer now '%s' (%d bytes)\n", ntriples_parser->line, ntriples_parser->line_length);
1080 #endif
1081 }
1082
1083 /* exit now, no more input */
1084 if(is_end) {
1085 if(ntriples_parser->offset != ntriples_parser->line_length) {
1086 raptor_parser_error(rdf_parser, "Junk at end of input.\"");
1087 return 1;
1088 }
1089
1090 return 0;
1091 }
1092
1093 return 0;
1094 }
1095
1096
1097 static int
raptor_ntriples_parse_start(raptor_parser * rdf_parser)1098 raptor_ntriples_parse_start(raptor_parser* rdf_parser)
1099 {
1100 raptor_locator *locator=&rdf_parser->locator;
1101 raptor_ntriples_parser_context *ntriples_parser=(raptor_ntriples_parser_context*)rdf_parser->context;
1102
1103 locator->line=1;
1104 locator->column=0;
1105 locator->byte=0;
1106
1107 ntriples_parser->last_char='\0';
1108
1109 return 0;
1110 }
1111
1112
1113 static int
raptor_ntriples_parse_recognise_syntax(raptor_parser_factory * factory,const unsigned char * buffer,size_t len,const unsigned char * identifier,const unsigned char * suffix,const char * mime_type)1114 raptor_ntriples_parse_recognise_syntax(raptor_parser_factory* factory,
1115 const unsigned char *buffer, size_t len,
1116 const unsigned char *identifier,
1117 const unsigned char *suffix,
1118 const char *mime_type)
1119 {
1120 int score= 0;
1121
1122 if(suffix) {
1123 if(!strcmp((const char*)suffix, "nt"))
1124 score=8;
1125 if(!strcmp((const char*)suffix, "ttl"))
1126 score=3;
1127 if(!strcmp((const char*)suffix, "n3"))
1128 score=1;
1129 }
1130
1131 if(mime_type) {
1132 if(strstr((const char*)mime_type, "ntriples"))
1133 score+=6;
1134 }
1135
1136 return score;
1137 }
1138
1139
1140 static int
raptor_ntriples_parser_register_factory(raptor_parser_factory * factory)1141 raptor_ntriples_parser_register_factory(raptor_parser_factory *factory)
1142 {
1143 int rc=0;
1144
1145 factory->context_length = sizeof(raptor_ntriples_parser_context);
1146
1147 factory->need_base_uri = 0;
1148
1149 factory->init = raptor_ntriples_parse_init;
1150 factory->terminate = raptor_ntriples_parse_terminate;
1151 factory->start = raptor_ntriples_parse_start;
1152 factory->chunk = raptor_ntriples_parse_chunk;
1153 factory->recognise_syntax = raptor_ntriples_parse_recognise_syntax;
1154
1155 rc=raptor_parser_factory_add_uri(factory,
1156 (const unsigned char*)"http://www.w3.org/TR/rdf-testcases/#ntriples");
1157
1158 if(!rc)
1159 rc = raptor_parser_factory_add_mime_type(factory, "text/plain", 1);
1160
1161 return rc;
1162 }
1163
1164
1165 int
raptor_init_parser_ntriples(raptor_world * world)1166 raptor_init_parser_ntriples(raptor_world* world)
1167 {
1168 return !raptor_parser_register_factory(world, "ntriples", "N-Triples",
1169 &raptor_ntriples_parser_register_factory);
1170 }
1171