1 /*
2  * The contents of this file are subject to the Mozilla Public
3  * License Version 1.1 (the "License"); you may not use this file
4  * except in compliance with the License. You may obtain a copy of
5  * the License at http://www.mozilla.org/MPL/
6  *
7  * Software distributed under the License is distributed on an "AS
8  * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
9  * implied. See the License for the specific language governing
10  * rights and limitations under the License.
11  *
12  * The Original Code is the Sablotron XSLT Processor.
13  *
14  * The Initial Developer of the Original Code is Ginger Alliance Ltd.
15  * Portions created by Ginger Alliance are Copyright (C) 2000-2002
16  * Ginger Alliance Ltd. All Rights Reserved.
17  *
18  * Contributor(s):
19  *
20  * Alternatively, the contents of this file may be used under the
21  * terms of the GNU General Public License Version 2 or later (the
22  * "GPL"), in which case the provisions of the GPL are applicable
23  * instead of those above.  If you wish to allow use of your
24  * version of this file only under the terms of the GPL and not to
25  * allow others to use your version of this file under the MPL,
26  * indicate your decision by deleting the provisions above and
27  * replace them with the notice and other provisions required by
28  * the GPL.  If you do not delete the provisions above, a recipient
29  * may use your version of this file under either the MPL or the
30  * GPL.
31  */
32 
33 /*****************************************************************
34     uri.cpp
35 *****************************************************************/
36 
37 #include "uri.h"
38 #include <string.h>
39 #include "proc.h"
40 #include "platform.h"
41 
42 // GP: clean
43 
44 /*****************************************************************
45 
46     global functions
47 
48 *****************************************************************/
49 
// leaves the enclosing (void) function when CONDITION does not hold
#define RF(CONDITION) {if (!(CONDITION)) return;}

// definition of names for various URI-reference parts
// (indices into a FiveStr array)
#define U_SCHEME    0
#define U_AUTH      1
#define U_PATH      2
#define U_QUERY     3
#define U_FRAG      4

// definition of slashes in path names
#define slashes "/\\"
// nonzero iff c is a forward slash or a backslash; the argument is
// parenthesized so that compound expressions are tested as a unit
#define isSlash(c) ((c) == '/' || (c) == '\\')

// special name reporting under windows: a name of the form "/X:..."
// is reported without its leading slash. name[1] is tested first so that
// name[2] is never read past the terminator of a one-character name.
#ifdef __WIN_TOOLS
#define winName(name) (((name)[0] == '/' && (name)[1] && (name)[2] == ':') ? (name) + 1 : (name))
#else
#define winName(name) (name)
#endif
69 
/*****************************************************************
uriHasAuthority

  decides whether the given scheme should contain the authority
*****************************************************************/
75 
uriHasAuthority(Str & scheme)76 Bool uriHasAuthority(Str & scheme)
77 {
78   return (scheme == (const char*)"file");
79 }
80 
/*****************************************************************
splitBy

  splits a given string into two parts divided by the first occurrence
  of a delimiter from a given set. If no delimiter is found, returns 0
  and leaves 'string' as is; otherwise shifts 'string' to the character
  following the delimiter. In both cases 'part1' receives the text
  preceding the delimiter (the whole string if none was found).
ARGS:
  string        the asciiz string to be split
  delims        the asciiz set of delimiters (all of them ASCII chars)
  part1         first of the two parts
RETURN:
  string        shifted to the other part (past the delimiter)
  .             the delimiter found (or 0)
*****************************************************************/
96 
splitBy(const char * & string,const char * delims,Str & part1)97 char splitBy(const char *&string, const char *delims, Str &part1)
98 {
99     char c;
100     int firstLen = strcspn(string, delims);
101     part1.nset(string, firstLen);
102     if (!!(c = string[firstLen]))
103         string += firstLen + 1;
104     return c;
105 }
106 
107 typedef Str FiveStr[5];
108 
splitURI(const char * uri,FiveStr & parts)109 void splitURI(const char *uri, FiveStr &parts)
110 {
111     const char *rest;
112     char c;
113     for (int i = 0; i < 5; i++)
114         parts[i].empty();
115     RF( uri && *uri );
116     // extract the scheme part of the URI
117     if (!splitBy(rest = uri, ":", parts[U_SCHEME]))
118         parts[U_SCHEME].empty();
119     // if "//" follows, extract the authority part
120     c = 'A';    // marks the absence of auth
121     if (isSlash(*rest) && isSlash(rest[1]))
122         RF( c = splitBy(rest += 2, slashes"?#", parts[U_AUTH]) );
123     if (isSlash(c) || c == 'A')
124       // extract the path
125       RF( c = splitBy(rest -= (isSlash(c)), "?#", parts[U_PATH]) );
126     //query and fragment
127     if (c == '?')
128         // extract the query
129         RF( c = splitBy(rest, "#", parts[U_QUERY]) );
130     // copy the fragment
131     parts[U_FRAG] = (char *) rest;
132 };
133 
joinURI(DStr & joined,FiveStr & parts,Bool schemeToo)134 void joinURI(DStr &joined, FiveStr &parts, Bool schemeToo)
135 {
136   joined.empty();
137   if (schemeToo && !parts[U_SCHEME].isEmpty())
138     joined = parts[U_SCHEME] + ":";
139   //if (!parts[U_AUTH].isEmpty())
140   if (uriHasAuthority(parts[U_SCHEME]))
141     joined += Str("//") + parts[U_AUTH];  // add authority
142   joined += parts[U_PATH];                // add path
143   if (!parts[U_QUERY].isEmpty())          // add query
144     joined += Str("?") + parts[U_QUERY];
145   if (!parts[U_FRAG].isEmpty())           // add fragment
146     joined += Str("#") + parts[U_FRAG];
147 }
148 
/*****************************************************************
schemeToURI_()

  converts the scheme given as Str to one of the URI_... constants.
  If the scheme is neither "file" nor "arg" (compared without regard
  to case), or if it is "file" while the SAB_FILES_TO_HANDLER flag
  is set, then URI_EXTENSION is simply returned.
*****************************************************************/
156 
schemeToURI_(Sit S,Str & scheme)157 URIScheme schemeToURI_(Sit S, Str& scheme)
158 {
159     if (scheme.eqNoCase("file") && !S.hasFlag(SAB_FILES_TO_HANDLER))
160         return URI_FILE;
161     else
162     {
163         if (scheme.eqNoCase("arg"))
164             return URI_ARG;
165         else
166             return URI_EXTENSION;
167     }
168 }
169 
170 
171 /*****************************************************************
172 cutLast()
173 
174   truncates a path after 'howmany'-th slash from the right (1-based).
175   If there are fewer slashes, sets path to empty string and returns
176   FALSE, otherwise returns TRUE.
177 ARGS
178   path      the path to be truncated
179   howmany   # of slashes that disappear in truncation, MINUS 1
180 RETURNS
181   .         TRUE iff that many slashes were found
182   path      the truncated path
183 *****************************************************************/
184 
cutLast(Str & path,int howmany)185 Bool cutLast(Str& path, int howmany)
186 {
187     Str temp = path;
188     char *p = (char*) temp;
189     int slashCount = 0,
190         i;
191     for (i = temp.length() - 1; i >= 0; i--)
192     {
193         if (isSlash(p[i]))
194             slashCount++;
195         if (slashCount == howmany)
196             break;
197     };
198     if (i >= 0)
199         path.nset(p, i+1);
200     else
201         path.empty();
202     return (Bool)(i >= 0);
203 };
204 
205 /*****************************************************************
206 joinPaths()
207 
208   merges a relative path with a base path
209 ARGS
210   relPath       the relative path. The result is returned here.
211   basePath      the base path (always absolute)
212 RETURNS
213   relPath       the newly constructed absolute path
214 *****************************************************************/
215 
segP(Str & s,int oneOrTwo)216 Bool segP(Str &s, int oneOrTwo)
217 {
218     return (Bool) !strcmp((char *) s, (oneOrTwo == 1 ? "." : ".."));
219 }
220 
joinPaths(Str & relPath,const Str & basePath)221 void joinPaths(Str& relPath, const Str& basePath)
222 {
223     Str segment;
224     DStr absPath;
225     // append the relPath to all-but-the-last-segment-of-basePath
226 
227     Bool endSlash = cutLast(absPath = basePath, 1),
228         lastSeg;
229     DStr result = absPath + (endSlash? "" : "/") + relPath;
230 
231     // throw out all '.' from the path
232     const char *p = (const char*) result;
233     absPath.empty();
234     while(splitBy(p, slashes, segment))
235     {
236         if (!segP(segment, 1))
237             absPath += segment + "/";
238     }
239     if (!segP(segment, 1))
240         absPath += segment;
241 
242     // throw out all "something/.." from the path
243     p = (char*) absPath;
244     int depth = 0;
245     result.empty();
246     do
247     {
248         lastSeg = (Bool) !splitBy(p, slashes, segment);
249         if (!segP(segment, 2))
250         {
251             result += segment + (lastSeg ? "" : "/");
252             depth++;
253         }
254         else
255         {
256             if (depth > 1)
257             {
258                 cutLast(result, 2);
259                 depth--;
260             }
261             else
262                 result += segment + (lastSeg ? "" : "/");
263         };
264     }
265     while(!lastSeg);
266     relPath = result;
267 }
268 
269 
makeAbsoluteURI(Sit S,const char * uri,const char * base,Str & absolute)270 URIScheme makeAbsoluteURI(Sit S, const char* uri,
271 			   const char* base, Str& absolute)
272 {
273     FiveStr
274         u_parts,
275         b_parts;
276     Bool
277         u_defined[5],
278         u_any = FALSE;
279     Str scheme;
280 
281     // first, break up the URIs into their 5 components
282     splitURI(uri, u_parts);
283     splitURI(base, b_parts);
284 
285     // set u_defined[i] to TRUE if the i-th uri component is nonvoid
286     for (int i = 0; i < 5; i++)
287         u_any = (Bool) ((u_defined[i] = (Bool) !u_parts[i].isEmpty()) || u_any);
288 
289     if (!u_any) // all components empty: the reference is to the current document
290     {
291         splitURI(base,u_parts);
292         u_parts[U_QUERY].empty();       // query and fragment are NOT inherited from base
293         u_parts[U_FRAG].empty();
294     }
295     else    // not all components are empty
296     {
297         if (!u_defined[U_SCHEME])                       // undefined scheme
298         {
299             u_parts[U_SCHEME] = b_parts[U_SCHEME];      // inherit scheme from base
300             if (!u_defined[U_AUTH])                     // undefined authority
301             {
302                 u_parts[U_AUTH] = b_parts[U_AUTH];      // inherit authority from base
303                 if (!isSlash(u_parts[U_PATH][0]))       // path is relative
304                     joinPaths(u_parts[U_PATH], b_parts[U_PATH]);    // append path to base path
305                 // query and fragment stay as they are in 'uri'
306             }
307         }
308         else
309         {
310 	    scheme = u_parts[U_SCHEME];
311 
312 	    URIScheme uri_scheme = schemeToURI_(S, scheme);
313 	    if (uri_scheme == URI_EXTENSION)
314 	    {
315 		absolute = uri;
316 		return URI_EXTENSION;
317 	    }
318 	    // scheme defined, check for paths not starting with '/'
319             if (!u_defined[U_AUTH] && !isSlash(u_parts[U_PATH][0]))
320                 u_parts[U_PATH] = Str("/") + u_parts[U_PATH];
321         }
322     }
323     DStr joined = absolute;
324     joinURI(joined, u_parts, FALSE);         // join all components into a URI for return (no scheme)
325 
326     scheme = u_parts[U_SCHEME];
327     absolute = (scheme + ":") + joined;
328     return schemeToURI_(S, scheme);
329 }
330 
331 
332 //    URIScheme makeAbsoluteURI(uri, base, absolute)
333 //
334 //    Merges a (possibly relative) URI reference with a base URI, setting
335 //    'absolute' to the result.
336 //
337 
338 // URIScheme makeAbsoluteURI(Sit S, const char* uri,
339 // 			  const char* base, Str& absolute)
340 // {
341 //   return makeAbsoluteURI2(S, uri, base, absolute);
342 // }
343 
344 
uri2SchemePath(Sit S,const char * absolute,Str & scheme,Str & rest)345 URIScheme uri2SchemePath(Sit S, const char *absolute, Str& scheme, Str& rest)
346 {
347     Bool found = (Bool) !!splitBy(absolute, ":", scheme);
348     sabassert(found);
349     rest = (char*) absolute;
350 /*
351  *    if (isSlash(*absolute) && isSlash(absolute[1]))
352  *       rest = (char*) absolute + 2;
353  *   else
354  *       rest = (char*) absolute;
355  */
356     return schemeToURI_(S, scheme);
357 }
358 
359 
360 /*****************************************************************
361 DataLine
362 
363   is a class that holds the machinery needed to retrieve data from
364   a given URI. There are two internally supported URI schemes:
365   file  (the plain "file://...")
366   arg   (for access to named memory blocks passed to Sablotron)
367 
368   Other schemes are passed to the extending scheme handler (if
369   one has been registered). This way, requests such as http:...
370   can be processed.
371 
372   The life cycle of a DataLine:
373     Upon construction, no URI is attached yet.
374     Call open() to associate a URI.
375     Repeatedly call save() or get() to retrieve data.
376     Call close() to close the resource.
377     Call the destructor.
378 
379   The 'write' data line with the scheme of 'arg' will need to be
380   accessible to the user even after the Processor object is destroyed;
381   it is then freed by 'SablotFreeBuffer'.
382 *****************************************************************/
383 
384 /*****************************************************************
385 DataLine::DataLine()
386 
387   This constructor just sets everything to zeroes and such.
388 *****************************************************************/
389 
DataLine()390 DataLine::DataLine()
391 {
392     mode = DLMODE_NONE;
393     scheme = URI_NONE;
394     f = NULL;
395     buffer = NULL;
396     outBuf = NULL;
397     bufCurr = 0;
398     fileIsStd = FALSE;
399     utf16Encoded = FALSE;
400     handler = NULL;
401     handlerUD = NULL;
402     handle = 0;
403     gotWholeDocument = FALSE;
404 }
405 
/*****************************************************************
DataLine::~DataLine()

  The destructor frees the output buffer, if there is one. It no
  longer asserts that the data line has been closed.
*****************************************************************/
411 
~DataLine()412 DataLine::~DataLine()
413 {
414     // removing the asserts (can be killed anytime due to error)
415     // assert(mode == DLMODE_CLOSED || mode == DLMODE_NONE);
416     // assert(!f);
417     // if there is an outBuf, delete it now
418     if (outBuf)
419         delete outBuf;
420 }
421 
/*****************************************************************
DataLine::open()

  Opens the data line for a given URI and access mode. Actual
  data transfer is only done on subsequent get() or save() calls.
  open() tries to call the extending scheme handler if it cannot
  handle a request itself.

ARGS
_uri        the URI identifier for the resource, including the
            scheme (e.g. "file:///x.xml")
_mode       the access mode (DLMODE_READ, DLMODE_WRITE)
argList_    the list of named arguments that is searched when an
            "arg:" URI is opened for reading (may be NULL)
ignoreErr   if TRUE, a failure is reported as a warning instead of
            an error; NOT_OK is returned in that case
*****************************************************************/
437 
438 #define specErr1(S, code, arg) \
439 {if (ignoreErr) {Warn1(S,code,arg); return NOT_OK;} else Err1(S,code,arg);}
440 
open(Sit S,const char * _uri,DLAccessMode _mode,StrStrList * argList_,Bool ignoreErr)441 eFlag DataLine::open(Sit S, const char *_uri, DLAccessMode _mode,
442 		     StrStrList* argList_, Bool ignoreErr /* = FALSE */)
443 {
444     sabassert(mode == DLMODE_NONE);  // the buffer must not be open yet
445     // combine _uri and _baseUri into one
446     Str strScheme, strPath;
447     scheme = uri2SchemePath(S, _uri, strScheme, strPath);
448     char *name = (char*) strPath;
449 
450     // mode set in the end
451     fullUri = (char*)_uri;
452 
453     switch(scheme)
454     {
455     case URI_FILE:
456         {
457             if (name[0] == '/' && name[1] == '/')
458                 name += 2;          // skipping the "//" in front
459             // try to open the file
460 #ifdef _MSC_VER
461             if (!(f = stdopen(name,_mode == DLMODE_WRITE ? "wb" : "rb")))
462 #else
463             if (!(f = stdopen(name,_mode == DLMODE_WRITE ? "w" : "r")))
464 #endif
465                 specErr1(S, E_FILE_OPEN, winName(name));
466             // set fileIsStd if filename is "stdin", "stdout" or "stderr"
467             fileIsStd = isstd(name);
468         }; break;
469     case URI_ARG:
470         {
471             // if opening for read access, get the pointer to the argument contents
472             // plus some extra information
473             if (_mode == DLMODE_READ)
474             {
475 	      Str *value = NULL;
476 	      if (argList_)
477 		value = argList_ -> find(name);
478 	      if (!value)
479 		specErr1(S, E1_ARG_NOT_FOUND, name);
480 	      buffer = (char*)*value;
481             }
482             // if opening for write access, just allocate a new dynamic block
483             else
484                 outBuf = new DynBlock;
485         }; break;
486     default:
487         {
488             // try the extending scheme handler
489             // ask the handler address from the Processor
490 	  Processor *proc = S.getProcessor();
491 	  if (proc)
492 	    handler = proc->getSchemeHandler(&handlerUD);
493 	  else
494 	    handler = NULL;
495 	  // if there is no handler, report unsupported scheme
496 	  if (!handler)
497 	    specErr1(S, E1_UNSUPPORTED_SCHEME, strScheme);
498 	  // try the fast way
499 	  int count = 0;
500 	  buffer = NULL;
501 	  if (_mode == DLMODE_READ && handler -> getAll)
502 	    handler -> getAll(handlerUD, proc,
503 			      strScheme, name, &buffer, &count);
504 	  if (buffer && (count != -1))
505             {
506 	      gotWholeDocument = TRUE;
507 	      bufCurr = 0;
508             }
509 	  else
510             {
511 	      // call the handler's open() function, obtaining a handle
512 	      switch(handler -> open(handlerUD, proc,
513 				     strScheme, name, &handle))
514                 {
515                 case SH_ERR_UNSUPPORTED_SCHEME:     // scheme not supported
516 		  specErr1(S, E1_UNSUPPORTED_SCHEME, strScheme);
517                 case SH_ERR_NOT_OK:
518 		  specErr1(S, E1_URI_OPEN, strScheme + ":" + strPath);
519                 };
520             }
521         };
522     };
523     // open successfully completed. Set the new mode.
524     mode = _mode;
525     return OK;
526 }
527 
528 /*****************************************************************
529 DataLine::close()
530 
531   closes the resource attached to this data line.
532 *****************************************************************/
close(Sit S)533 eFlag DataLine::close(Sit S)
534 {
535     sabassert(mode != DLMODE_NONE);
536     switch(scheme)
537     {
538     case URI_FILE:
539         {
540             sabassert(f);
541             if (!fileIsStd)
542             {
543 	      if (fclose(f))
544 		Err1(S, E1_URI_CLOSE, fullUri);
545             };
546             f = NULL;
547         }; break;
548     case URI_ARG:
549         break;
550     case URI_EXTENSION:
551         {
552             if (gotWholeDocument)
553             {
554                 NZ(handler) -> freeMemory(handlerUD, S.getProcessor(), buffer);
555             }
556             else
557             {
558                 if(NZ(handler) -> close(handlerUD, S.getProcessor(), handle))
559                     Err1(S, E1_URI_CLOSE, fullUri);
560             }
561         }; break;
562     };
563     mode = DLMODE_CLOSED;
564     return OK;
565 }
566 
567 /*****************************************************************
568 save()
569 
570   saves an UTF-8 string pointed to by data to the data line.
571   This is the place to perform any recoding, escaping and other operations
572   that require char-by-char scanning of the string.
573 *****************************************************************/
574 
// Returns the length IN BYTES of a string made of 2-byte units that ends
// with an all-zero unit; the terminating unit is included in the count
// (an empty string gives 2).
// The units are examined byte by byte instead of through a 'short*', so
// 'p' needs no particular alignment and its constness is respected.
int my_wcslen(const char *p)
{
    int len = 2;                // room for the terminating unit
    while (p[0] || p[1])        // a unit is zero iff both its bytes are
    {
        p += 2;
        len += 2;
    }
    return len;
}
581 
save(Sit S,const char * data,int length)582 eFlag DataLine::save(Sit S, const char *data, int length)
583 {
584     sabassert(mode == DLMODE_WRITE); // assume the file open for writing
585     // int length = utf16Encoded ? my_wcslen(data) : strlen(data);
586     switch (scheme)             // choose the output procedure
587     {
588     case URI_FILE:              // file: scheme
589         {
590             sabassert(f);          // the file must be open
591             // fputs(data, f);
592             fwrite(data, 1, length, f);
593         }; break;
594     case URI_ARG:               // arg: scheme
595         {
596             sabassert(outBuf);     // the output buffer must exist
597             outBuf -> nadd(data, length);
598         }; break;
599     case URI_EXTENSION:         // external handler
600         {
601             int actual = length;
602             if( NZ(handler) -> put(handlerUD, S.getProcessor(), handle, data, &actual) )
603                 Err1(S, E1_URI_WRITE, fullUri);
604         };
605     }
606     return OK;
607 }
608 
609 /*................................................................
610 pointsAtEnd()
611 
612   DESCRIPTION
613 a macro that returns nonzero if the given char* points at a
614 string terminator
615 
616   ARGS
617 p       the pointer
618 is16    TRUE iff the string is UTF-16
619 ................................................................*/
620 
// the 2-byte terminator is tested byte by byte (a unit is zero iff both of
// its bytes are), so p needs no particular alignment
#define pointsAtEnd(p, is16) ((is16) ? (!((const char*)(p))[0] && !((const char*)(p))[1]) : (!*(p)))
622 
623 /*****************************************************************
624     get()
625 
626 - retrieves at most 'maxcount' bytes into buffer 'dest'.
627 - input should be NUL-terminated
628 - if a terminating 0 is reached, copying stops
629 *****************************************************************/
630 
get(Sit S,char * dest,int maxcount)631 int DataLine::get(Sit S, char *dest,int maxcount)
632 {
633     int result = 0;
634     sabassert(mode == DLMODE_READ);  // assume the file open for reading
635     switch(scheme)
636     {
637     case URI_FILE:
638         {
639             sabassert(f);          // the file must be open
640             result = fread(dest,1,maxcount,f);
641             // return the number of bytes read
642         }; break;
643     case URI_ARG:
644         {
645             sabassert(buffer);     // the buffer must exist
646             // do a 'strncpy' that shifts dest and bufCurr;
647             // i counts the number of bytes transferred
648 			char * copyChar = dest;
649             int i;
650             for (i = 0;
651                 (!pointsAtEnd(buffer + bufCurr, utf16Encoded)) && (i < maxcount);
652                 i++)
653                 {
654                     *(copyChar++) = buffer[bufCurr++];
655                 };
656                 result = i;
657         }; break;
658     case URI_EXTENSION:         // external handler
659         {
660             if (gotWholeDocument)
661             {
662                 // ugly hack: copied the following from above
663                 sabassert(buffer);     // the buffer must exist
664 				char * copyChar = dest;
665                 int i;
666                 for (i = 0;
667                 (!pointsAtEnd(buffer + bufCurr, utf16Encoded)) && (i < maxcount);
668                 i++)
669                 {
670                     *(copyChar++) = buffer[bufCurr++];
671                 };
672                 result = i;
673             }
674             else
675             {
676                 int actual = maxcount;
677                 if( NZ(handler) -> get(handlerUD, S.getProcessor(), handle, dest, &actual) )
678                 {
679                     S.message( MT_ERROR, E1_URI_READ, fullUri, "" );
680                     return -1;
681                 }
682                 result = actual;
683             }
684         }; break;
685     }
686 	// need to NUL terminate in order to prevent C string
687 	// functions running off the end of the buffer
688 
689 	// assignment assumes that the passed in dest is allocated
690 	// one bigger than maxcount
691 	dest[result] = '\0';
692     return result;              // return the number of bytes read
693 }
694 
695 /*****************************************************************
696 getOutBuffer()
697 
698   returns the pointer to the output buffer which may be used after
699   all processing is finished (remains allocated along with the
700   whole DataLine)
701 *****************************************************************/
702 
getOutBuffer()703 DynBlock* DataLine::getOutBuffer()
704 {
705     // check that the output buffer exists and that we're open for write
706     sabassert(mode == DLMODE_WRITE && scheme == URI_ARG);
707     return NZ(outBuf); // -> getPointer();
708 }
709 
setURIAndClose(Sit S,const char * _uri)710 eFlag DataLine::setURIAndClose(Sit S, const char *_uri)
711 {
712     sabassert( mode == DLMODE_NONE );
713     mode = DLMODE_CLOSED;
714     scheme = URI_ARG;
715     fullUri = _uri;
716     return OK;
717 }
718 
report(Sit S,MsgType type,MsgCode code,const Str & arg1,const Str & arg2)719 void DataLine::report(Sit S, MsgType type, MsgCode code, const Str& arg1, const Str& arg2)
720 {
721     S.message(type, code, arg1, arg2);
722 }
723 
724