1 /* -*- Mode: c; c-basic-offset: 2 -*-
2  *
3  * ntriples_parse.c - Raptor N-Triples Parser implementation
4  *
5  * N-Triples
6  * http://www.w3.org/TR/rdf-testcases/#ntriples
7  *
8  * Copyright (C) 2001-2008, David Beckett http://www.dajobe.org/
9  * Copyright (C) 2001-2005, University of Bristol, UK http://www.bristol.ac.uk/
10  *
11  * This package is Free Software and part of Redland http://librdf.org/
12  *
13  * It is licensed under the following three licenses as alternatives:
14  *   1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
15  *   2. GNU General Public License (GPL) V2 or any newer version
16  *   3. Apache License, V2.0 or any newer version
17  *
18  * You may not use this file except in compliance with at least one of
19  * the above three licenses.
20  *
21  * See LICENSE.html or LICENSE.txt at the top of this package for the
22  * complete terms and further detail along with the license texts for
23  * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
24  *
25  *
26  */
27 
28 
29 #ifdef HAVE_CONFIG_H
30 #include <raptor_config.h>
31 #endif
32 
33 #ifdef WIN32
34 #include <win32_raptor_config.h>
35 #endif
36 
37 #include <stdio.h>
38 #include <string.h>
39 #include <ctype.h>
40 #include <stdarg.h>
41 #ifdef HAVE_ERRNO_H
42 #include <errno.h>
43 #endif
44 #ifdef HAVE_STDLIB_H
45 #include <stdlib.h>
46 #endif
47 
48 /* Raptor includes */
49 #include "raptor.h"
50 #include "raptor_internal.h"
51 
52 /* Set RAPTOR_DEBUG to > 1 to get lots of buffer related debugging */
53 /*
54 #undef RAPTOR_DEBUG
55 #define RAPTOR_DEBUG 2
56 */
57 
58 /* Prototypes for local functions */
59 static void raptor_ntriples_generate_statement(raptor_parser* parser, const unsigned char *subject, const raptor_ntriples_term_type subject_type, const unsigned char *predicate, const raptor_ntriples_term_type predicate_type, const void *object, const raptor_ntriples_term_type object_type, const unsigned char *object_literal_language, const unsigned char *object_literal_datatype);
60 
61 /*
62  * NTriples parser object
63  */
64 struct raptor_ntriples_parser_context_s {
65   /* current line */
66   unsigned char *line;
67   /* current line length */
68   int line_length;
69   /* current char in line buffer */
70   int offset;
71 
72   char last_char;
73 
74   /* static statement for use in passing to user code */
75   raptor_statement statement;
76 };
77 
78 
79 typedef struct raptor_ntriples_parser_context_s raptor_ntriples_parser_context;
80 
81 
82 
83 /**
84  * raptor_ntriples_parse_init:
85  *
86  * Initialise the Raptor NTriples parser.
87  *
88  * Return value: non 0 on failure
89  **/
90 
91 static int
raptor_ntriples_parse_init(raptor_parser * rdf_parser,const char * name)92 raptor_ntriples_parse_init(raptor_parser* rdf_parser, const char *name)
93 {
94   /*raptor_ntriples_parser_context *ntriples_parser=(raptor_ntriples_parser_context*)rdf_parser->context; */
95   return 0;
96 }
97 
98 
99 /* PUBLIC FUNCTIONS */
100 
101 
102 /*
103  * raptor_ntriples_parse_terminate - Free the Raptor NTriples parser
104  * @rdf_parser: parser object
105  *
106  **/
107 static void
raptor_ntriples_parse_terminate(raptor_parser * rdf_parser)108 raptor_ntriples_parse_terminate(raptor_parser* rdf_parser)
109 {
110   raptor_ntriples_parser_context *ntriples_parser=(raptor_ntriples_parser_context*)rdf_parser->context;
111   if(ntriples_parser->line_length)
112     RAPTOR_FREE(cdata, ntriples_parser->line);
113 }
114 
115 
/* Debug labels for N-Triples term types, indexed by the term type value */
static const char * const term_type_strings[3]={
  "URIref",   /* index 0 */
  "bnodeID",  /* index 1 */
  "Literal"   /* index 2 */
};
121 
122 
123 #ifndef RAPTOR_DISABLE_DEPRECATED
124 /**
125  * raptor_ntriples_term_as_string:
126  * @term: N-Triples term.
127  *
128  * Get a label for a #raptor_ntriples_term_type.
129  *
130  * @deprecated: an internal debug function, do not use.
131  *
132  * Return value: a pointer to a constant string.
133  **/
134 const char *
raptor_ntriples_term_as_string(raptor_ntriples_term_type term)135 raptor_ntriples_term_as_string(raptor_ntriples_term_type term)
136 {
137   return term_type_strings[(int)term];
138 }
139 #endif
140 
141 
142 static void
raptor_ntriples_generate_statement(raptor_parser * parser,const unsigned char * subject,const raptor_ntriples_term_type subject_type,const unsigned char * predicate,const raptor_ntriples_term_type predicate_type,const void * object,const raptor_ntriples_term_type object_type,const unsigned char * object_literal_language,const unsigned char * object_literal_datatype)143 raptor_ntriples_generate_statement(raptor_parser* parser,
144                                    const unsigned char *subject,
145                                    const raptor_ntriples_term_type subject_type,
146                                    const unsigned char *predicate,
147                                    const raptor_ntriples_term_type predicate_type,
148                                    const void *object,
149                                    const raptor_ntriples_term_type object_type,
150                                    const unsigned char *object_literal_language,
151                                    const unsigned char *object_literal_datatype)
152 {
153   /* raptor_ntriples_parser_context *ntriples_parser=(raptor_ntriples_parser_context*)parser->context; */
154   raptor_statement *statement=&parser->statement;
155   raptor_uri *subject_uri=NULL;
156   raptor_uri *predicate_uri=NULL;
157   raptor_uri *object_uri=NULL;
158   raptor_uri *datatype_uri=NULL;
159 
160   /* Two choices for subject from N-Triples */
161   if(subject_type == RAPTOR_NTRIPLES_TERM_TYPE_BLANK_NODE) {
162     statement->subject=subject;
163     statement->subject_type=RAPTOR_IDENTIFIER_TYPE_ANONYMOUS;
164   } else {
165     subject_uri=raptor_new_uri_v2(parser->world, subject);
166     if(!subject_uri) {
167       raptor_parser_error(parser, "Could not create subject uri '%s', skipping", subject);
168       goto cleanup;
169     }
170     statement->subject=subject_uri;
171     statement->subject_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
172   }
173 
174   if(object_literal_datatype) {
175     datatype_uri=raptor_new_uri_v2(parser->world, object_literal_datatype);
176     if(!datatype_uri) {
177       raptor_parser_error(parser, "Could not create object literal datatype uri '%s', skipping", object_literal_datatype);
178       goto cleanup;
179     }
180     object_literal_language=NULL;
181   }
182 
183   /* Predicates in N-Triples are URIs but check for bad ordinals */
184   if(!strncmp((const char*)predicate, "http://www.w3.org/1999/02/22-rdf-syntax-ns#_", 44)) {
185     int predicate_ordinal=raptor_check_ordinal(predicate+44);
186     if(predicate_ordinal <= 0)
187       raptor_parser_error(parser, "Illegal ordinal value %d in property '%s'.", predicate_ordinal, predicate);
188   }
189 
190   predicate_uri=raptor_new_uri_v2(parser->world, predicate);
191   if(!predicate_uri) {
192     raptor_parser_error(parser, "Could not create predicate uri '%s', skipping", predicate);
193     goto cleanup;
194   }
195   statement->predicate_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
196   statement->predicate=predicate_uri;
197 
198   /* Three choices for object from N-Triples */
199   statement->object_literal_language=NULL;
200   statement->object_literal_datatype=NULL;
201   if(object_type == RAPTOR_NTRIPLES_TERM_TYPE_URI_REF) {
202     object_uri=raptor_new_uri_v2(parser->world, (const unsigned char*)object);
203     if(!object_uri) {
204       raptor_parser_error(parser, "Could not create object uri '%s', skipping", (const char *)object);
205       goto cleanup;
206     }
207     statement->object=object_uri;
208     statement->object_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
209   } else if(object_type == RAPTOR_NTRIPLES_TERM_TYPE_BLANK_NODE) {
210     statement->object=object;
211     statement->object_type=RAPTOR_IDENTIFIER_TYPE_ANONYMOUS;
212   } else {
213     statement->object_type=RAPTOR_IDENTIFIER_TYPE_LITERAL;
214     statement->object=object;
215     statement->object_literal_language=object_literal_language;
216     statement->object_literal_datatype=datatype_uri;
217   }
218 
219   if(!parser->statement_handler)
220     goto cleanup;
221 
222   /* Generate the statement; or is it fact? */
223   (*parser->statement_handler)(parser->user_data, statement);
224 
225   cleanup:
226   if(subject_uri)
227     raptor_free_uri_v2(parser->world, subject_uri);
228   if(predicate_uri)
229     raptor_free_uri_v2(parser->world, predicate_uri);
230   if(object_uri)
231     raptor_free_uri_v2(parser->world, object_uri);
232   if(datatype_uri)
233     raptor_free_uri_v2(parser->world, datatype_uri);
234 }
235 
236 
/* 7-bit ASCII character tests written with explicit code points so
 * that they never depend on the current locale
 */
#define IS_ASCII_UPPER(c) ((c)>=0x41 && (c)<=0x5A)
#define IS_ASCII_ALPHA(c) (((c)>=0x41 && (c)<=0x5A) || ((c)>=0x61 && (c)<=0x7A))
#define IS_ASCII_DIGIT(c) ((c)>=0x30 && (c)<=0x39)
#define IS_ASCII_PRINT(c) ((c)>=0x20 && (c)<=0x7E)
/* only meaningful when c is an ASCII upper case letter */
#define TO_ASCII_LOWER(c) ((c)+0x20)

/* Lexical classes of term; each decides where scanning of a term stops */
typedef enum {
  /* ends on > */
  RAPTOR_TERM_CLASS_URI,
  /* ends on first non [A-Za-z][A-Za-z0-9]* */
  RAPTOR_TERM_CLASS_BNODEID,
  /* ends on non-escaped " */
  RAPTOR_TERM_CLASS_STRING,
  /* ends on first non [a-z0-9]+ ('-' [a-z0-9]+ )? */
  RAPTOR_TERM_CLASS_LANGUAGE,
  /* the entire string is used */
  RAPTOR_TERM_CLASS_FULL
} raptor_ntriples_term_class;
251 
252 
253 static int
raptor_ntriples_term_valid(unsigned char c,int position,raptor_ntriples_term_class term_class)254 raptor_ntriples_term_valid(unsigned char c, int position,
255                            raptor_ntriples_term_class term_class)
256 {
257   int result=0;
258 
259   switch(term_class) {
260     case RAPTOR_TERM_CLASS_URI:
261       /* ends on > */
262       result=(c!= '>');
263       break;
264 
265     case RAPTOR_TERM_CLASS_BNODEID:
266       /* ends on first non [A-Za-z][A-Za-z0-9]* */
267       result=IS_ASCII_ALPHA(c);
268       if(position)
269         result = (result || IS_ASCII_DIGIT(c));
270       break;
271 
272     case RAPTOR_TERM_CLASS_STRING:
273       /* ends on " */
274       result=(c!= '"');
275       break;
276 
277     case RAPTOR_TERM_CLASS_LANGUAGE:
278       /* ends on first non [a-z0-9]+ ('-' [a-z0-9]+ )? */
279       result=(IS_ASCII_ALPHA(c) || IS_ASCII_DIGIT(c));
280       if(position)
281         result = (result || c=='-');
282       break;
283 
284     case RAPTOR_TERM_CLASS_FULL:
285       result=1;
286       break;
287 
288     default:
289       RAPTOR_FATAL2("Unknown ntriples term %d", term_class);
290   }
291 
292   return result;
293 }
294 
295 
296 /*
297  * raptor_ntriples_term - Parse an N-Triples term with escapes
298  * @parser: NTriples parser
299  * @start: pointer to starting character of string (in)
300  * @dest: destination of string (in)
301  * @lenp: pointer to length of string (in/out)
302  * @dest_lenp: pointer to length of destination string (out)
303  * @end_char: string ending character
304  * @class: string class
305  * @allow_utf8: Non-0 if UTF-8 chars are allowed in the term
306  *
307  * N-Triples strings/URIs are written in ASCII at present; characters
308  * outside the printable ASCII range are discarded with a warning.
309  * See the grammar for full details of the allowed ranges.
310  *
311  * If the class is RAPTOR_TERM_CLASS_FULL, the end_char is ignored.
312  *
313  * UTF-8 is only allowed if allow_utf8 is non-0, otherwise the
314  * string is US-ASCII and only the \u and \U esapes are allowed.
315  * If enabled, both are allowed.
316  *
317  * Return value: Non 0 on failure
318  **/
319 static int
raptor_ntriples_term(raptor_parser * rdf_parser,const unsigned char ** start,unsigned char * dest,size_t * lenp,size_t * dest_lenp,char end_char,raptor_ntriples_term_class term_class,int allow_utf8)320 raptor_ntriples_term(raptor_parser* rdf_parser,
321                      const unsigned char **start, unsigned char *dest,
322                      size_t *lenp, size_t *dest_lenp,
323                      char end_char,
324                      raptor_ntriples_term_class term_class,
325                      int allow_utf8)
326 {
327   const unsigned char *p=*start;
328   unsigned char c='\0';
329   size_t ulen=0;
330   unsigned long unichar=0;
331   unsigned int position=0;
332   int end_char_seen=0;
333 
334   if(term_class == RAPTOR_TERM_CLASS_FULL)
335     end_char='\0';
336 
337   /* find end of string, fixing backslashed characters on the way */
338   while(*lenp > 0) {
339     c = *p;
340 
341     p++;
342     (*lenp)--;
343     rdf_parser->locator.column++;
344     rdf_parser->locator.byte++;
345 
346     if(allow_utf8) {
347       if(c > 0x7f) {
348         /* just copy the UTF-8 bytes through */
349         size_t unichar_len=raptor_utf8_to_unicode_char(NULL, (const unsigned char*)p-1, 1+*lenp);
350         if(unichar_len > *lenp) {
351           raptor_parser_error(rdf_parser, "UTF-8 encoding error at character %d (0x%02X) found.", c, c);
352           /* UTF-8 encoding had an error or ended in the middle of a string */
353           return 1;
354         }
355         memcpy(dest, p-1, unichar_len);
356         dest+= unichar_len;
357 
358         unichar_len--; /* p, *lenp were moved on by 1 earlier */
359 
360         p += unichar_len;
361         (*lenp) -= unichar_len;
362         rdf_parser->locator.column+= unichar_len;
363         rdf_parser->locator.byte+= unichar_len;
364         continue;
365       }
366     } else if(!IS_ASCII_PRINT(c)) {
367       /* This is an ASCII check, not a printable character check
368        * so isprint() is not appropriate, since that is a locale check.
369        */
370       raptor_parser_error(rdf_parser, "Non-printable ASCII character %d (0x%02X) found.", c, c);
371       continue;
372     }
373 
374     if(c != '\\') {
375       /* finish at non-backslashed end_char */
376       if(end_char && c == end_char) {
377         end_char_seen=1;
378         break;
379       }
380 
381       if(!raptor_ntriples_term_valid(c, position, term_class)) {
382         if(end_char) {
383           /* end char was expected, so finding an invalid thing is an error */
384           raptor_parser_error(rdf_parser, "Missing terminating '%c' (found '%c')", end_char, c);
385           return 0;
386         } else {
387           /* it's the end - so rewind 1 to save next char */
388           p--;
389           (*lenp)++;
390           rdf_parser->locator.column--;
391           rdf_parser->locator.byte--;
392           break;
393         }
394       }
395 
396       /* otherwise store and move on */
397       *dest++=c;
398       position++;
399       continue;
400     }
401 
402     if(!*lenp) {
403       if(term_class != RAPTOR_TERM_CLASS_FULL)
404         raptor_parser_error(rdf_parser, "\\ at end of line");
405       return 0;
406     }
407 
408     c = *p;
409 
410     p++;
411     (*lenp)--;
412     rdf_parser->locator.column++;
413     rdf_parser->locator.byte++;
414 
415     switch(c) {
416       case '"':
417       case '\\':
418         *dest++=c;
419         break;
420       case 'n':
421         *dest++='\n';
422         break;
423       case 'r':
424         *dest++='\r';
425         break;
426       case 't':
427         *dest++='\t';
428         break;
429       case 'u':
430       case 'U':
431         ulen=(c == 'u') ? 4 : 8;
432 
433         if(*lenp < ulen) {
434           raptor_parser_error(rdf_parser, "%c over end of line", c);
435           return 0;
436         }
437 
438         if(1) {
439           int n;
440 
441           n=sscanf((const char*)p, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar);
442           if(n != 1) {
443             raptor_parser_error(rdf_parser, "Illegal Uncode escape '%c%s...'", c, p);
444             break;
445           }
446         }
447 
448         p+=ulen;
449         (*lenp)-=ulen;
450         rdf_parser->locator.column+=ulen;
451         rdf_parser->locator.byte+=ulen;
452 
453         if(unichar > 0x10ffff) {
454           raptor_parser_error(rdf_parser, "Illegal Unicode character with code point #x%lX.", unichar);
455           break;
456         }
457 
458         dest+=raptor_unicode_char_to_utf8(unichar, dest);
459         break;
460 
461       default:
462         raptor_parser_error(rdf_parser,
463                             "Illegal string escape \\%c in \"%s\"", c,
464                             (char*)start);
465         return 0;
466     }
467 
468     position++;
469   } /* end while */
470 
471 
472   if(end_char && !end_char_seen) {
473     raptor_parser_error(rdf_parser, "Missing terminating '%c' before end of line.", end_char);
474     return 1;
475   }
476 
477   /* terminate dest, can be shorter than source */
478   *dest='\0';
479 
480   if(dest_lenp)
481     *dest_lenp=p-*start;
482 
483   *start=p;
484 
485   return 0;
486 }
487 
488 
489 #ifndef RAPTOR_DISABLE_DEPRECATED
490 /**
491  * raptor_ntriples_string_as_utf8_string:
492  * @rdf_parser: parser object
493  * @src: data to read from
494  * @len: size of data
495  * @dest_lenp: pointer to length of destination (out) or NULL
496  *
497  * Turn an N-Triples string with escapes into a UTF-8 string.
498  *
499  * @deprecated: This requires use of parser internals and was never in the public API header.
500  *
501  * Return value: a new UTF-8 string
502  **/
503 unsigned char*
raptor_ntriples_string_as_utf8_string(raptor_parser * rdf_parser,const unsigned char * src,int len,size_t * dest_lenp)504 raptor_ntriples_string_as_utf8_string(raptor_parser* rdf_parser,
505                                       const unsigned char *src, int len,
506                                       size_t *dest_lenp)
507 {
508   const unsigned char *start=src;
509   size_t length=len;
510   unsigned char *dest;
511   int rc;
512 
513   dest=(unsigned char*)RAPTOR_MALLOC(cstring, len+1);
514   if(!dest)
515     return NULL;
516 
517   rc=raptor_ntriples_term(rdf_parser, &start, dest, &length, dest_lenp,
518                               '\0', RAPTOR_TERM_CLASS_FULL, 1);
519   if(rc) {
520     RAPTOR_FREE(cstring, dest);
521     dest=NULL;
522   }
523   return dest;
524 }
525 #endif
526 
527 
528 static int
raptor_ntriples_parse_line(raptor_parser * rdf_parser,unsigned char * buffer,size_t len)529 raptor_ntriples_parse_line(raptor_parser* rdf_parser,
530                            unsigned char *buffer, size_t len)
531 {
532   int i;
533   unsigned char *p;
534   unsigned char *dest;
535   unsigned char *terms[3];
536   int terms_allocated[3];
537   size_t term_lengths[3];
538   raptor_ntriples_term_type term_types[3];
539   size_t term_length= 0;
540   unsigned char *object_literal_language=NULL;
541   unsigned char *object_literal_datatype=NULL;
542   int rc=0;
543 
544   for(i=0; i<3; i++)
545     terms_allocated[i]=0;
546 
547   /* ASSERTION:
548    * p always points to first char we are considering
549    * p[len-1] always points to last char
550    */
551 
552   /* Handle empty  lines */
553   if(!len)
554     return 0;
555 
556 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
557   RAPTOR_DEBUG3("handling line '%s' (%d bytes)\n", buffer, (unsigned int)len);
558 #endif
559 
560   p=buffer;
561 
562   while(len>0 && isspace((int)*p)) {
563     p++;
564     rdf_parser->locator.column++;
565     rdf_parser->locator.byte++;
566     len--;
567   }
568 
569   /* Handle empty - all whitespace lines */
570   if(!len)
571     return 0;
572 
573   /* Handle comment lines */
574   if(*p == '#')
575     return 0;
576 
577   /* Remove trailing spaces */
578   while(len>0 && isspace((int)p[len-1])) {
579     p[len-1]='\0';
580     len--;
581   }
582 
583   /* can't be empty now - that would have been caught above */
584 
585   /* Check for terminating '.' */
586   if(p[len-1] != '.') {
587     /* Move current location to point to problem */
588     rdf_parser->locator.column += len-2;
589     rdf_parser->locator.byte += len-2;
590     raptor_parser_error(rdf_parser, "Missing . at end of line");
591     return 0;
592   }
593 
594   p[len-1]='\0';
595   len--;
596 
597 
598   /* Must be triple */
599 
600   for(i=0; i<3; i++) {
601     if(!len) {
602       raptor_parser_error(rdf_parser, "Unexpected end of line");
603       goto cleanup;
604     }
605 
606     /* Expect either <URI> or _:name */
607     if(i == 2) {
608       if(*p != '<' && *p != '_' && *p != '"' && *p != 'x') {
609         raptor_parser_error(rdf_parser, "Saw '%c', expected <URIref>, _:bnodeID or \"literal\"", *p);
610         goto cleanup;
611       }
612       if(*p == 'x') {
613         if(len < 4 || strncmp((const char*)p, "xml\"", 4)) {
614           raptor_parser_error(rdf_parser, "Saw '%c', expected xml\"...\")", *p);
615           goto cleanup;
616         }
617       }
618     } else if(i == 1) {
619       if(*p != '<') {
620         raptor_parser_error(rdf_parser, "Saw '%c', expected <URIref>", *p);
621         goto cleanup;
622       }
623     } else /* i==0 */ {
624       if(*p != '<' && *p != '_') {
625         raptor_parser_error(rdf_parser, "Saw '%c', expected <URIref> or _:bnodeID", *p);
626         goto cleanup;
627       }
628     }
629 
630     switch(*p) {
631       case '<':
632         term_types[i]= RAPTOR_NTRIPLES_TERM_TYPE_URI_REF;
633 
634         dest=p;
635 
636         p++;
637         len--;
638         rdf_parser->locator.column++;
639         rdf_parser->locator.byte++;
640 
641         if(raptor_ntriples_term(rdf_parser,
642                                 (const unsigned char**)&p,
643                                 dest, &len, &term_length,
644                                 '>', RAPTOR_TERM_CLASS_URI, 0)) {
645           rc=1;
646           goto cleanup;
647         }
648         break;
649 
650       case '"':
651         term_types[i]= RAPTOR_NTRIPLES_TERM_TYPE_LITERAL;
652 
653         dest=p;
654 
655         p++;
656         len--;
657         rdf_parser->locator.column++;
658         rdf_parser->locator.byte++;
659 
660         if(raptor_ntriples_term(rdf_parser,
661                                 (const unsigned char**)&p,
662                                 dest, &len, &term_length,
663                                 '"', RAPTOR_TERM_CLASS_STRING, 0)) {
664           rc=1;
665           goto cleanup;
666         }
667 
668         if(len && (*p == '-' || *p == '@')) {
669           if(*p == '-')
670             raptor_parser_error(rdf_parser, "Old N-Triples language syntax using \"string\"-lang rather than \"string\"@lang.");
671 
672           object_literal_language=p;
673 
674           /* Skip - */
675           p++;
676           len--;
677           rdf_parser->locator.column++;
678           rdf_parser->locator.byte++;
679 
680           if(!len) {
681             raptor_parser_error(rdf_parser, "Missing language after \"string\"-");
682             goto cleanup;
683           }
684 
685 
686           if(raptor_ntriples_term(rdf_parser,
687                                   (const unsigned char**)&p,
688                                   object_literal_language, &len, NULL,
689                                   '\0', RAPTOR_TERM_CLASS_LANGUAGE, 0)) {
690             rc=1;
691             goto cleanup;
692           }
693         }
694 
695         if(len >1 && *p == '^' && p[1] == '^') {
696 
697           object_literal_datatype=p;
698 
699           /* Skip ^^ */
700           p+= 2;
701           len-= 2;
702           rdf_parser->locator.column+= 2;
703           rdf_parser->locator.byte+= 2;
704 
705           if(!len || (len && *p != '<')) {
706             raptor_parser_error(rdf_parser, "Missing datatype URI-ref in\"string\"^^<URI-ref> after ^^");
707             goto cleanup;
708           }
709 
710           p++;
711           len--;
712           rdf_parser->locator.column++;
713           rdf_parser->locator.byte++;
714 
715           if(raptor_ntriples_term(rdf_parser,
716                                   (const unsigned char**)&p,
717                                   object_literal_datatype, &len, NULL,
718                                   '>', RAPTOR_TERM_CLASS_URI, 0)) {
719             rc=1;
720             goto cleanup;
721           }
722 
723         }
724 
725         if(object_literal_datatype && object_literal_language) {
726           raptor_parser_warning(rdf_parser, "Typed literal used with a language - ignoring the language");
727           object_literal_language=NULL;
728         }
729 
730 
731         break;
732 
733 
734       case '_':
735         term_types[i]= RAPTOR_NTRIPLES_TERM_TYPE_BLANK_NODE;
736 
737         /* store where _ was */
738         dest=p;
739 
740         p++;
741         len--;
742         rdf_parser->locator.column++;
743         rdf_parser->locator.byte++;
744 
745         if(!len || (len > 0 && *p != ':')) {
746           raptor_parser_error(rdf_parser, "Illegal bNodeID - _ not followed by :");
747           goto cleanup;
748         }
749 
750         /* Found ':' - move on */
751 
752         p++;
753         len--;
754         rdf_parser->locator.column++;
755         rdf_parser->locator.byte++;
756 
757         if(raptor_ntriples_term(rdf_parser,
758                                 (const unsigned char**)&p,
759                                 dest, &len, &term_length,
760                                 '\0', RAPTOR_TERM_CLASS_BNODEID, 0)) {
761           rc=1;
762           goto cleanup;
763         }
764 
765         if(!term_length) {
766           raptor_parser_error(rdf_parser, "Bad or missing bNodeID after _:");
767           goto cleanup;
768         } else {
769           unsigned char *blank=(unsigned char*)RAPTOR_MALLOC(cstring, term_length+1);
770           if(!blank) {
771             raptor_parser_fatal_error(rdf_parser, "Out of memory");
772             rc=1;
773             goto cleanup;
774           }
775           strcpy((char*)blank, (const char*)dest);
776           dest=raptor_parser_internal_generate_id(rdf_parser,
777                                                   RAPTOR_GENID_TYPE_BNODEID,
778                                                   blank);
779           terms_allocated[i]=1;
780         }
781 
782         break;
783 
784       case 'x':
785 
786         raptor_parser_error(rdf_parser, "Old N-Triples XML using xml\"string\"-lang rather than \"string\"@lang^^<%s>.", raptor_xml_literal_datatype_uri_string);
787 
788         /* already know we have 'xml"' coming up */
789         term_types[i]= RAPTOR_NTRIPLES_TERM_TYPE_LITERAL;
790 
791         /* 3=strlen("xml") */
792         p+=3;
793         len-=3;
794 
795         dest=p;
796 
797         p++;
798         len--;
799         rdf_parser->locator.column++;
800         rdf_parser->locator.byte++;
801 
802         if(raptor_ntriples_term(rdf_parser,
803                                 (const unsigned char**)&p,
804                                 dest, &len, &term_length,
805                                 '"', RAPTOR_TERM_CLASS_STRING, 0)) {
806           rc=1;
807           goto cleanup;
808         }
809 
810         /* got XML literal string */
811         object_literal_datatype=(unsigned char*)raptor_xml_literal_datatype_uri_string;
812 
813         if(len && (*p == '-' || *p == '@')) {
814           if(*p == '-')
815             raptor_parser_error(rdf_parser, "Old N-Triples language syntax using xml\"string\"-lang rather than xml\"string\"@lang.");
816 
817           object_literal_language=p;
818 
819           /* Skip - */
820           p++;
821           len--;
822           rdf_parser->locator.column++;
823           rdf_parser->locator.byte++;
824 
825           if(!len) {
826             raptor_parser_error(rdf_parser, "Missing language in xml\"string\"-language after -");
827             goto cleanup;
828           }
829 
830           if(raptor_ntriples_term(rdf_parser,
831                                   (const unsigned char**)&p,
832                                   object_literal_language, &len, NULL,
833                                   '"', RAPTOR_TERM_CLASS_STRING, 0)) {
834             rc=1;
835             goto cleanup;
836           }
837 
838         }
839 
840         if(len >1 && *p == '^' && p[1] == '^') {
841 
842           object_literal_datatype=p;
843 
844           /* Skip ^^ */
845           p+= 2;
846           len-= 2;
847           rdf_parser->locator.column+= 2;
848           rdf_parser->locator.byte+= 2;
849 
850           if(!len || (len && *p != '<')) {
851             raptor_parser_error(rdf_parser, "Missing datatype URI-ref in xml\"string\"^^<URI-ref> after ^^");
852             goto cleanup;
853           }
854 
855           p++;
856           len--;
857           rdf_parser->locator.column++;
858           rdf_parser->locator.byte++;
859 
860           if(raptor_ntriples_term(rdf_parser,
861                                   (const unsigned char**)&p,
862                                   object_literal_datatype, &len, NULL,
863                                   '>', RAPTOR_TERM_CLASS_URI, 0)) {
864             rc=1;
865             goto cleanup;
866           }
867 
868         }
869 
870         if(len) {
871           if(*p != ' ') {
872             raptor_parser_error(rdf_parser, "Missing terminating ' '");
873             return 0;
874           }
875 
876           p++;
877           len--;
878           rdf_parser->locator.column++;
879           rdf_parser->locator.byte++;
880         }
881 
882         break;
883 
884 
885       default:
886         raptor_parser_fatal_error(rdf_parser, "Unknown term type");
887         rc=1;
888         goto cleanup;
889     }
890 
891 
892     /* Store term */
893     terms[i]=dest; term_lengths[i]=term_length;
894 
895     /* Whitespace must separate the terms */
896     if(i<2 && !isspace((int)*p)) {
897       raptor_parser_error(rdf_parser, "Missing whitespace after term '%s'", terms[i]);
898       rc=1;
899       goto cleanup;
900     }
901 
902     /* Skip whitespace after terms */
903     while(len>0 && isspace((int)*p)) {
904       p++;
905       len--;
906       rdf_parser->locator.column++;
907       rdf_parser->locator.byte++;
908     }
909 
910 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
911     fprintf(stderr, "item %d: term '%s' len %d type %s\n",
912             i, terms[i], (unsigned int)term_lengths[i],
913             raptor_ntriples_term_as_string(term_types[i]));
914 #endif
915   }
916 
917   if(len) {
918     raptor_parser_error(rdf_parser, "Junk before terminating \".\"");
919     return 0;
920   }
921 
922 
923   if(object_literal_language) {
924     unsigned char *q;
925     /* Normalize language to lowercase
926      * http://www.w3.org/TR/rdf-concepts/#dfn-language-identifier
927      */
928     for(q=object_literal_language; *q; q++) {
929       if(IS_ASCII_UPPER(*q))
930         *q=TO_ASCII_LOWER(*q);
931     }
932   }
933 
934   raptor_ntriples_generate_statement(rdf_parser,
935                                      terms[0], term_types[0],
936                                      terms[1], term_types[1],
937                                      terms[2], term_types[2],
938                                      object_literal_language,
939                                      object_literal_datatype);
940 
941   rdf_parser->locator.byte += len;
942 
943  cleanup:
944   for(i=0; i<3; i++)
945     if(terms_allocated[i])
946       RAPTOR_FREE(cstring, terms[i]);
947 
948   return rc;
949 }
950 
951 
952 static int
raptor_ntriples_parse_chunk(raptor_parser * rdf_parser,const unsigned char * s,size_t len,int is_end)953 raptor_ntriples_parse_chunk(raptor_parser* rdf_parser,
954                             const unsigned char *s, size_t len,
955                             int is_end)
956 {
957   unsigned char *buffer;
958   unsigned char *ptr;
959   unsigned char *start;
960   raptor_ntriples_parser_context *ntriples_parser=(raptor_ntriples_parser_context*)rdf_parser->context;
961 
962 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
963   RAPTOR_DEBUG2("adding %d bytes to buffer\n", (unsigned int)len);
964 #endif
965 
966   /* No data?  It's the end */
967   if(!len)
968     return 0;
969 
970   buffer=(unsigned char*)RAPTOR_MALLOC(cstring, ntriples_parser->line_length + len + 1);
971   if(!buffer) {
972     raptor_parser_fatal_error(rdf_parser, "Out of memory");
973     return 1;
974   }
975 
976   if(ntriples_parser->line_length) {
977     strncpy((char*)buffer, (const char*)ntriples_parser->line, ntriples_parser->line_length);
978     RAPTOR_FREE(cstring, ntriples_parser->line);
979   }
980 
981   ntriples_parser->line=buffer;
982 
983   /* move pointer to end of cdata buffer */
984   ptr=buffer+ntriples_parser->line_length;
985 
986   /* adjust stored length */
987   ntriples_parser->line_length += len;
988 
989   /* now write new stuff at end of cdata buffer */
990   strncpy((char*)ptr, (const char*)s, len);
991   ptr += len;
992   *ptr = '\0';
993 
994 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
995   RAPTOR_DEBUG2("buffer now %d bytes\n", ntriples_parser->line_length);
996 #endif
997 
998   ptr=buffer+ntriples_parser->offset;
999   while(*(start=ptr)) {
1000     unsigned char *line_start=ptr;
1001 
1002 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
1003   RAPTOR_DEBUG3("line buffer now '%s' (offset %d)\n", ptr, ptr-(buffer+ntriples_parser->offset));
1004 #endif
1005 
1006     /* skip \n when just seen \r - i.e. \r\n or CR LF */
1007     if(ntriples_parser->last_char == '\r' && *ptr == '\n') {
1008 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
1009       RAPTOR_DEBUG1("skipping a \\n\n");
1010 #endif
1011       ptr++;
1012       rdf_parser->locator.byte++;
1013       rdf_parser->locator.column=0;
1014       start=line_start=ptr;
1015     }
1016 
1017     while(*ptr && *ptr != '\n' && *ptr != '\r')
1018       ptr++;
1019 
1020     if(!*ptr)
1021       break;
1022 
1023 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
1024     RAPTOR_DEBUG3("found newline \\x%02x at offset %d\n", *ptr,
1025                   ptr-line_start);
1026 #endif
1027     ntriples_parser->last_char=*ptr;
1028 
1029     len=ptr-line_start;
1030     rdf_parser->locator.column=0;
1031 
1032     *ptr='\0';
1033     if(raptor_ntriples_parse_line(rdf_parser,line_start,len))
1034       return 1;
1035 
1036     rdf_parser->locator.line++;
1037 
1038     /* go past newline */
1039     ptr++;
1040     rdf_parser->locator.byte++;
1041 
1042 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
1043     /* Do not peek if too far */
1044     if(ptr-buffer < ntriples_parser->line_length)
1045       RAPTOR_DEBUG2("next char is \\x%02x\n", *ptr);
1046     else
1047       RAPTOR_DEBUG1("next char unknown - end of buffer\n");
1048 #endif
1049   }
1050 
1051   ntriples_parser->offset=start-buffer;
1052 
1053   len=ntriples_parser->line_length - ntriples_parser->offset;
1054 
1055   if(len) {
1056     /* collapse buffer */
1057 
1058 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
1059     RAPTOR_DEBUG3("collapsing buffer from %d to %d bytes\n", ntriples_parser->line_length, (unsigned int)len);
1060 #endif
1061     buffer=(unsigned char*)RAPTOR_MALLOC(cstring, len + 1);
1062     if(!buffer) {
1063       raptor_parser_fatal_error(rdf_parser, "Out of memory");
1064       return 1;
1065     }
1066 
1067     strncpy((char*)buffer,
1068             (const char*)ntriples_parser->line+ntriples_parser->line_length-len,
1069             len);
1070     buffer[len]='\0';
1071 
1072     RAPTOR_FREE(cstring, ntriples_parser->line);
1073 
1074     ntriples_parser->line=buffer;
1075     ntriples_parser->line_length -= ntriples_parser->offset;
1076     ntriples_parser->offset=0;
1077 
1078 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
1079     RAPTOR_DEBUG3("buffer now '%s' (%d bytes)\n", ntriples_parser->line, ntriples_parser->line_length);
1080 #endif
1081   }
1082 
1083   /* exit now, no more input */
1084   if(is_end) {
1085     if(ntriples_parser->offset != ntriples_parser->line_length) {
1086        raptor_parser_error(rdf_parser, "Junk at end of input.\"");
1087        return 1;
1088     }
1089 
1090     return 0;
1091   }
1092 
1093   return 0;
1094 }
1095 
1096 
1097 static int
raptor_ntriples_parse_start(raptor_parser * rdf_parser)1098 raptor_ntriples_parse_start(raptor_parser* rdf_parser)
1099 {
1100   raptor_locator *locator=&rdf_parser->locator;
1101   raptor_ntriples_parser_context *ntriples_parser=(raptor_ntriples_parser_context*)rdf_parser->context;
1102 
1103   locator->line=1;
1104   locator->column=0;
1105   locator->byte=0;
1106 
1107   ntriples_parser->last_char='\0';
1108 
1109   return 0;
1110 }
1111 
1112 
1113 static int
raptor_ntriples_parse_recognise_syntax(raptor_parser_factory * factory,const unsigned char * buffer,size_t len,const unsigned char * identifier,const unsigned char * suffix,const char * mime_type)1114 raptor_ntriples_parse_recognise_syntax(raptor_parser_factory* factory,
1115                                        const unsigned char *buffer, size_t len,
1116                                        const unsigned char *identifier,
1117                                        const unsigned char *suffix,
1118                                        const char *mime_type)
1119 {
1120   int score= 0;
1121 
1122   if(suffix) {
1123     if(!strcmp((const char*)suffix, "nt"))
1124       score=8;
1125     if(!strcmp((const char*)suffix, "ttl"))
1126       score=3;
1127     if(!strcmp((const char*)suffix, "n3"))
1128       score=1;
1129   }
1130 
1131   if(mime_type) {
1132     if(strstr((const char*)mime_type, "ntriples"))
1133       score+=6;
1134   }
1135 
1136   return score;
1137 }
1138 
1139 
1140 static int
raptor_ntriples_parser_register_factory(raptor_parser_factory * factory)1141 raptor_ntriples_parser_register_factory(raptor_parser_factory *factory)
1142 {
1143   int rc=0;
1144 
1145   factory->context_length     = sizeof(raptor_ntriples_parser_context);
1146 
1147   factory->need_base_uri = 0;
1148 
1149   factory->init      = raptor_ntriples_parse_init;
1150   factory->terminate = raptor_ntriples_parse_terminate;
1151   factory->start     = raptor_ntriples_parse_start;
1152   factory->chunk     = raptor_ntriples_parse_chunk;
1153   factory->recognise_syntax = raptor_ntriples_parse_recognise_syntax;
1154 
1155   rc=raptor_parser_factory_add_uri(factory,
1156                                 (const unsigned char*)"http://www.w3.org/TR/rdf-testcases/#ntriples");
1157 
1158   if(!rc)
1159     rc = raptor_parser_factory_add_mime_type(factory, "text/plain", 1);
1160 
1161   return rc;
1162 }
1163 
1164 
1165 int
raptor_init_parser_ntriples(raptor_world * world)1166 raptor_init_parser_ntriples(raptor_world* world)
1167 {
1168   return !raptor_parser_register_factory(world, "ntriples",  "N-Triples",
1169                                          &raptor_ntriples_parser_register_factory);
1170 }
1171