1 /*
2  * Copyright (C) 2008 Emweb bv, Herent, Belgium.
3  *
4  * See the LICENSE file for terms of use.
5  * In addition to these terms, permission is also granted to use and
6  * modify these two files (CgiParser.C and CgiParser.h) so long as the
7  * copyright above is maintained, modifications are documented, and
8  * credit is given for any use of the library.
9  *
10  * CGI parser modelled after the PERL implementation cgi-lib.pl 2.18 by
11  * Steven E. Brenner with the following original copyright:
12 
13 # Perl Routines to Manipulate CGI input
14 # cgi-lib@pobox.com
15 #
16 # Copyright (c) 1993-1999 Steven E. Brenner
17 # Unpublished work.
18 # Permission granted to use and modify this library so long as the
19 # copyright above is maintained, modifications are documented, and
20 # credit is given for any use of the library.
21 #
22 # Thanks are due to many people for reporting bugs and suggestions
23 
24 # For more information, see:
25 #     http://cgi-lib.stanford.edu/cgi-lib/
26 
27  */
28 
29 #include <fstream>
30 #include <stdlib.h>
31 
32 #ifdef WT_HAVE_GNU_REGEX
33 #include <regex.h>
34 #else
35 #include <regex>
36 #endif // WT_HAVE_GNU_REGEX
37 
38 #include "CgiParser.h"
39 #include "WebRequest.h"
40 #include "WebUtils.h"
41 #include "FileUtils.h"
42 
43 #include "Wt/WException.h"
44 #include "Wt/WLogger.h"
45 #include "Wt/Http/Request.h"
46 
47 using std::memmove;
48 using std::strcpy;
49 using std::strtol;
50 
51 namespace {
52 #ifndef WT_HAVE_GNU_REGEX
53   const std::regex boundary_e("\\bboundary=(?:(?:\"([^\"]+)\")|(\\S+))",
54 			      std::regex::icase);
55   const std::regex name_e("\\bname=(?:(?:\"([^\"]+)\")|([^\\s:;]+))",
56 			  std::regex::icase);
57   const std::regex filename_e("\\bfilename=(?:(?:\"([^\"]*)\")|([^\\s:;]+))",
58 			      std::regex::icase);
59   const std::regex content_e("^\\s*Content-type:"
60 			     "\\s*(?:(?:\"([^\"]+)\")|([^\\s:;]+))",
61 			     std::regex::icase);
62   const std::regex content_disposition_e("^\\s*Content-Disposition:",
63 					 std::regex::icase);
64   const std::regex content_type_e("^\\s*Content-Type:",
65 				  std::regex::icase);
66 
fishValue(const std::string & text,const std::regex & e,std::string & result)67   bool fishValue(const std::string& text,
68 		 const std::regex& e, std::string& result)
69   {
70     std::smatch what;
71 
72     if (std::regex_search(text, what, e)) {
73       result = std::string(what[1]) + std::string(what[2]);
74       return true;
75     } else
76       return false;
77   }
78 
regexMatch(const std::string & text,const std::regex & e)79   bool regexMatch(const std::string& text, const std::regex& e)
80   {
81     return std::regex_search(text, e);
82   }
83 
84 #else
85   regex_t boundary_e, name_e, filename_e, content_e,
86     content_disposition_e, content_type_e;
87 
88   const char *boundary_ep = "\\bboundary=((\"([^\"]*)\")|([^ \t]*))";
89   const char *name_ep = "\\bname=((\"([^\"]*)\")|([^ \t:;]*))";
90   const char *filename_ep = "\\bfilename=((\"([^\"]*)\")|([^ \t:;]*))";
91   const char *content_ep = "^[ \t]*Content-type:"
92     "[ \t]*((\"([^\"]*)\")|([^ \t:;]*))";
93   const char *content_disposition_ep = "^[ \t]*Content-Disposition:";
94   const char *content_type_ep = "^[ \t]*Content-Type:";
95 
96   bool fishValue(const std::string& text,
97 		 regex_t& e1, std::string& result)
98   {
99     regmatch_t pmatch[5];
100     int res = regexec(&e1, text.c_str(), 5, pmatch, 0);
101 
102     if (res == 0) {
103       if (pmatch[3].rm_so != -1)
104 	result = text.substr(pmatch[3].rm_so,
105 			     pmatch[3].rm_eo - pmatch[3].rm_so);
106       if (pmatch[4].rm_so != -1)
107 	result = text.substr(pmatch[4].rm_so,
108 			     pmatch[4].rm_eo - pmatch[4].rm_so);
109 
110       return true;
111     } else
112       return false;
113   }
114 
115   bool regexMatch(const std::string& text, regex_t& e)
116   {
117     regmatch_t pmatch[1];
118 
119     return regexec(&e, text.c_str(), 1, pmatch, 0) == 0;
120   }
121 
122   class RegInitializer {
123   protected:
124     static bool regInitialized_;
125 
126   public:
127     RegInitializer()
128     {}
129 
130     ~RegInitializer() {
131       cleanup();
132     }
133 
134     static void setup() {
135       if (!regInitialized_) {
136 	regcomp(&boundary_e, boundary_ep, REG_ICASE | REG_EXTENDED);
137 	regcomp(&name_e, name_ep, REG_ICASE | REG_EXTENDED);
138 	regcomp(&filename_e, filename_ep, REG_ICASE | REG_EXTENDED);
139 	regcomp(&content_e, content_ep, REG_ICASE | REG_EXTENDED);
140 	regcomp(&content_disposition_e, content_disposition_ep,
141 		REG_ICASE | REG_EXTENDED);
142 	regcomp(&content_type_e, content_type_ep, REG_ICASE | REG_EXTENDED);
143 	regInitialized_ = true;
144       }
145     }
146 
147     static void cleanup() {
148       if (regInitialized_) {
149 	regfree(&boundary_e);
150 	regfree(&name_e);
151 	regfree(&filename_e);
152 	regfree(&content_e);
153 	regfree(&content_disposition_e);
154 	regfree(&content_type_e);
155 	regInitialized_ = false;
156       }
157     }
158   };
159 
160   bool RegInitializer::regInitialized_ = false;
161 
162   static RegInitializer regInitializer;
163 #endif
164 }
165 
166 namespace Wt {
167 
168 LOGGER("CgiParser");
169 
init()170 void CgiParser::init()
171 {
172 #ifdef WT_HAVE_GNU_REGEX
173   RegInitializer::setup();
174 #endif
175 }
176 
CgiParser(::int64_t maxRequestSize,::int64_t maxFormData)177 CgiParser::CgiParser(::int64_t maxRequestSize, ::int64_t maxFormData)
178   : maxFormData_(maxFormData),
179     maxRequestSize_(maxRequestSize)
180 { }
181 
parse(WebRequest & request,ReadOption readOption)182 void CgiParser::parse(WebRequest& request, ReadOption readOption)
183 {
184   /*
185    * TODO: optimize this ...
186    */
187   request_ = &request;
188 
189   ::int64_t len = request.contentLength();
190   const char *type = request.contentType();
191   const char *meth = request.requestMethod();
192 
193   request.postDataExceeded_ = (len > maxRequestSize_ ? len : 0);
194 
195   std::string queryString = request.queryString();
196 
197   LOG_DEBUG("queryString (len=" << len << "): " << queryString);
198 
199   if (!queryString.empty() && request_->parameters_.empty()) {
200     Http::Request::parseFormUrlEncoded(queryString, request_->parameters_);
201   }
202 
203   // XDomainRequest cannot set a contentType header, we therefore pass it
204   // as a request parameter
205   if (readOption != ReadHeadersOnly &&
206       strcmp(meth, "POST") == 0 &&
207       ((type && strstr(type, "application/x-www-form-urlencoded") == type) ||
208        (queryString.find("&contentType=x-www-form-urlencoded") !=
209 	std::string::npos))) {
210     /*
211      * TODO: parse this stream-based to avoid the malloc here. For now
212      * we protect the maximum that can be POST'ed as form data.
213      */
214     if (len > maxFormData_)
215       throw WException("Oversized application/x-www-form-urlencoded ("
216 		       + std::to_string(len) + ")");
217 
218     auto buf = std::unique_ptr<char[]>(new char[len + 1]);
219 
220     request.in().read(buf.get(), len);
221 
222     if (request.in().gcount() != (int)len) {
223       throw WException("Unexpected short read.");
224     }
225 
226     buf[len] = 0;
227 
228     // This is a special Wt feature, I do not think it standard.
229     // For POST, parameters in url-encoded URL are still parsed.
230 
231     std::string formQueryString = buf.get();
232 
233     LOG_DEBUG("formQueryString (len=" << len << "): " << formQueryString);
234     if (!formQueryString.empty()) {
235       Http::Request::parseFormUrlEncoded(formQueryString, request_->parameters_);
236     }
237     Http::ParameterMap::const_iterator it = request_->parameters_.find("Wt-params");
238     if (it != request_->parameters_.end() && it->second.size() == 1) {
239       Http::Request::parseFormUrlEncoded(it->second[0], request_->parameters_);
240     }
241   }
242 
243   if (readOption != ReadHeadersOnly &&
244       type && strstr(type, "multipart/form-data") == type) {
245     if (strcmp(meth, "POST") != 0) {
246       throw WException("Invalid method for multipart/form-data: "
247 		       + std::string(meth));
248     }
249 
250     if (!request.postDataExceeded_)
251       readMultipartData(request, type, len);
252     else if (readOption == ReadBodyAnyway) {
253       for (;len > 0;) {
254 	::int64_t toRead = std::min(::int64_t(BUFSIZE), len);
255 	request.in().read(buf_, toRead);
256 	if (request.in().gcount() != (::int64_t)toRead)
257 	  throw WException("CgiParser: short read");
258 	len -= toRead;
259       }
260     }
261   }
262 }
263 
readMultipartData(WebRequest & request,const std::string type,::int64_t len)264 void CgiParser::readMultipartData(WebRequest& request,
265 				  const std::string type, ::int64_t len)
266 {
267   std::string boundary;
268 
269   if (!fishValue(type, boundary_e, boundary))
270     throw WException("Could not find a boundary for multipart data.");
271 
272   boundary = "--" + boundary;
273 
274   buflen_ = 0;
275   left_ = len;
276   spoolStream_ = 0;
277   currentKey_.clear();
278 
279   if (!parseBody(request, boundary))
280     return;
281 
282   for (;;) {
283     if (!parseHead(request))
284       break;
285     if (!parseBody(request,boundary))
286       break;
287   }
288 }
289 
290 /*
291  * Read until finding the boundary, saving to resultString or
292  * resultFile. The boundary itself is not consumed.
293  *
294  * tossAtBoundary controls how many characters extra (<0)
295  * or few (>0) are saved at the start of the boundary in the result.
296  */
readUntilBoundary(WebRequest & request,const std::string boundary,int tossAtBoundary,std::string * resultString,std::ostream * resultFile)297 void CgiParser::readUntilBoundary(WebRequest& request,
298 				  const std::string boundary,
299 				  int tossAtBoundary,
300 				  std::string *resultString,
301 				  std::ostream *resultFile)
302 {
303   int bpos;
304 
305   while ((bpos = index(boundary)) == -1) {
306     /*
307      * If we couldn't find it. We need to wind the buffer, but only save
308      * not including the boundary length.
309      */
310     if (left_ == 0)
311       throw WException("CgiParser: reached end of input while seeking end of "
312 		       "headers or content. Format of CGI input is wrong");
313 
314     /* save (up to) BUFSIZE from buffer to file or value string, but
315      * mind the boundary length */
316     int save = std::min((buflen_ - (int)boundary.length()), (int)BUFSIZE);
317 
318     if (save > 0) {
319       if (resultString)
320 	*resultString += std::string(buf_, save);
321       if (resultFile)
322 	resultFile->write(buf_, save);
323 
324       /* wind buffer */
325       windBuffer(save);
326     }
327 
328     unsigned amt = static_cast<unsigned>
329       (std::min(left_,
330 		static_cast< ::int64_t >(BUFSIZE + MAXBOUND - buflen_)));
331 
332     request.in().read(buf_ + buflen_, amt);
333     if (request.in().gcount() != (int)amt)
334       throw WException("CgiParser: short read");
335 
336     left_ -= amt;
337     buflen_ += amt;
338   }
339 
340   if (resultString)
341     *resultString += std::string(buf_, bpos - tossAtBoundary);
342   if (resultFile)
343     resultFile->write(buf_, bpos - tossAtBoundary);
344 
345   /* wind buffer */
346   windBuffer(bpos);
347 }
348 
windBuffer(int offset)349 void CgiParser::windBuffer(int offset)
350 {
351   if (offset < buflen_) {
352     memmove(buf_, buf_ + offset, buflen_ - offset);
353     buflen_ -= offset;
354   } else
355     buflen_ = 0;
356 }
357 
index(const std::string search)358 int CgiParser::index(const std::string search)
359 {
360   std::string bufS = std::string(buf_, buflen_);
361 
362   std::string::size_type i = bufS.find(search);
363 
364   if (i == std::string::npos)
365     return -1;
366   else
367     return i;
368 }
369 
parseHead(WebRequest & request)370 bool CgiParser::parseHead(WebRequest& request)
371 {
372   std::string head;
373   readUntilBoundary(request, "\r\n\r\n", -2, &head, 0);
374 
375   std::string name;
376   std::string fn;
377   std::string ctype;
378 
379   for (unsigned current = 0; current < head.length();) {
380     /* read line by line */
381     std::string::size_type i = head.find("\r\n", current);
382     const std::string text = head.substr(current, (i == std::string::npos
383 						   ? std::string::npos
384 						   : i - current));
385 
386     if (regexMatch(text, content_disposition_e)) {
387       fishValue(text, name_e, name);
388       fishValue(text, filename_e, fn);
389     }
390 
391     if (regexMatch(text, content_type_e)) {
392       fishValue(text, content_e, ctype);
393     }
394 
395     current = i + 2;
396   }
397 
398   LOG_DEBUG("name: " << name << " ct: " << ctype  << " fn: " << fn);
399 
400   currentKey_ = name;
401 
402   if (!fn.empty()) {
403     if (!request.postDataExceeded_) {
404       /*
405        * It is not easy to create a std::ostream pointing to a
406        * temporary file name.
407        */
408       std::string spool = FileUtils::createTempFileName();
409 
410       spoolStream_ = new std::ofstream(spool.c_str(),
411         std::ios::out | std::ios::binary);
412 
413       request_->files_.insert
414 	(std::make_pair(name, Http::UploadedFile(spool, fn, ctype)));
415 
416       LOG_DEBUG("spooling file to " << spool.c_str());
417 
418     } else {
419       spoolStream_ = 0;
420       // Clear currentKey so that file we don't do harm by reading this
421       // giant blob in memory
422       currentKey_ = "";
423     }
424   }
425 
426   windBuffer(4);
427 
428   return true;
429 }
430 
parseBody(WebRequest & request,const std::string boundary)431 bool CgiParser::parseBody(WebRequest& request, const std::string boundary)
432 {
433   std::string value;
434 
435   readUntilBoundary(request, boundary, 2,
436 		    spoolStream_ ? 0 : (!currentKey_.empty() ? &value : 0),
437 		    spoolStream_);
438 
439   if (spoolStream_) {
440     LOG_DEBUG("completed spooling");
441     delete spoolStream_;
442     spoolStream_ = 0;
443   } else {
444     if (!currentKey_.empty()) {
445       LOG_DEBUG("value: \"" << value << "\"");
446       request_->parameters_[currentKey_].push_back(value);
447     }
448   }
449 
450   currentKey_.clear();
451 
452   if (std::string(buf_ + boundary.length(), 2) == "--")
453     return false;
454 
455   windBuffer(boundary.length() + 2);
456 
457   return true;
458 }
459 
460 } // namespace Wt
461