1 /*
2    ratproxy - MIME detection
3    -------------------------
4 
5    MIME content sniffing routines. This code tries to figure out
6    what is actually being served, regardless of what HTTP headers
7    say.
8 
9    Author: Michal Zalewski <lcamtuf@google.com>
10 
11    Copyright 2007, 2008 by Google Inc. All Rights Reserved.
12 
13    Licensed under the Apache License, Version 2.0 (the "License");
14    you may not use this file except in compliance with the License.
15    You may obtain a copy of the License at
16 
17      http://www.apache.org/licenses/LICENSE-2.0
18 
19    Unless required by applicable law or agreed to in writing, software
20    distributed under the License is distributed on an "AS IS" BASIS,
21    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22    See the License for the specific language governing permissions and
23    limitations under the License.
24 
25  */
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <unistd.h>
30 #include <sys/socket.h>
31 #include <netinet/in.h>
32 #include <sys/types.h>
33 #include <sys/stat.h>
34 #include <fcntl.h>
35 #include <string.h>
36 #include <sys/wait.h>
37 #include <ctype.h>
38 #include <netdb.h>
39 #include <openssl/md5.h>
40 
41 #include "config.h"
42 #include "types.h"
43 #include "debug.h"
44 #include "nlist.h"
45 #include "http.h"
46 #include "mime.h"
47 #include "string-inl.h"
48 
49 
50 /* Check for JSON prologues... */
is_json_safe_mime(_u8 * str)51 static _u8 is_json_safe_mime(_u8* str) {
52   _u32 i = 0;
53 
54   /* JSON prologues of more than 1 characters are "authoritative" and override
55      further content sniffing. */
56 
57   while (json_safe[i]) {
58     if (json_safe[i][1] && !strncmp(str,json_safe[i],strlen(json_safe[i]))) return 1;
59     i++;
60   }
61 
62   return 0;
63 
64 }
65 
66 
67 /* Attempt MIME type detection for formats that are likely to be served by a
68    modern web application based on payload signature matching. */
detect_mime(struct http_response * r)69 void detect_mime(struct http_response* r) {
70   _u32 i, max;
71   _u8  text = 1;
72   _u8  sniffbuf[SNIFFBUF + 1];
73   _u8* xxx;
74 
75   /* TODO: Add more popular formats. This is oriented toward common web 2.0
76      technologies at the moment. */
77 
78   if (!r->payload_len) return;
79 
80   if (r->payload_len > SNIFFBUF) max = SNIFFBUF; else max = r->payload_len;
81   memcpy(sniffbuf,r->payload,max);
82   sniffbuf[max] = 0;
83 
84   /* Is this a plain-text file? */
85 
86   for (i=0;i<max;i++)
87     if (sniffbuf[i] < 0x20 && !isspace(sniffbuf[i])) { text = 0; break; }
88 
89   if (text) {
90     _u8 got_alpha = 0, got_bracket = 0, got_try = 0, got_alpha_before = 0;
91 
92     r->is_text = 1;
93 
94     /* First, some files with known, fixed signatures. */
95 
96     if (!strncmp(sniffbuf,"%!PS",4)) {
97       r->sniffed_mime = "application/postscript";
98       return;
99     }
100 
101     if (!strncmp(sniffbuf,"{\\rtf",5)) {
102       r->sniffed_mime = "text/rtf";
103       return;
104     }
105 
106     /* Try to detect Javascript - this is a bit tricky, because
107        JSON snippets can have minimal syntax, and CSS uses a notation
108        vaguely resembling Javascript. */
109 
110     /* JSON breaker prefixes automatically qualify content as JS */
111     if (is_json_safe_mime(sniffbuf)) goto got_javascript;
112 
113     for (i=0;i<max;i++) {
114 
115       /* First, skip comment blocks */
116 
117       if (!strncmp(sniffbuf+i,"//",2)) {
118         _u8* x = strchr(sniffbuf + i + 2, '\n');
119         if (!x) i = max; else i = x - sniffbuf;
120         continue;
121       }
122 
123       if (!strncmp(sniffbuf+i,"/*",2)) {
124         _u8* x = strstr(sniffbuf + i + 2, "*/");
125         if (!x) i = max; else i = x - sniffbuf + 1;
126         continue;
127       }
128 
129       /* If what follows look HTML-esque, bail out */
130 
131       if ((sniffbuf[i] == '<' || !strcmp(sniffbuf+i,"&lt;"))) break;
132 
133       if (!strncmp(sniffbuf+i,"try",3) && (isspace(sniffbuf[i+3]) || sniffbuf[i+3] == '{'))
134         got_try = 1;
135 
136       /* try { ... is a special JSON-like response that looks like CSS, but should
137          be handled as Javascript. */
138 
139       if (sniffbuf[i] == '{' && got_try) goto got_javascript;
140 
141       /* Otherwise, if { is encountered before any =, (, or HTML, but after alnums,
142          it's likely a stylesheet... well, unless followed by '"', in which case it
143          might be a serialized object with a non-standard anti-XSSI prologue. */
144 
145       if (got_alpha && sniffbuf[i] == '{') {
146         _u32 j = i + 1;
147 
148         while (j < max && sniffbuf[j] && !isalpha(sniffbuf[j])) {
149           if (sniffbuf[j] == '{'  || sniffbuf[j] == '"' ||
150               sniffbuf[j] == '\'' || sniffbuf[j] == '(') goto got_javascript;
151           j++;
152         }
153 
154         r->sniffed_mime = "text/css";
155         return;
156       }
157 
158       if (isalpha(sniffbuf[i])) got_alpha = 1;
159       if (sniffbuf[i] == '{') { got_bracket = 1; got_alpha_before = got_alpha; }
160 
161       /* { "foo" is very JSONish. */
162       if (!got_alpha && got_bracket && sniffbuf[i] == '"') goto got_javascript;
163 
164       /* { foo: 1 is JSONish too. */
165       if (!got_alpha_before && got_alpha && got_bracket && sniffbuf[i] == ':')
166         goto got_javascript;
167 
168       /* And finally, if =, ( or JS keyword is encountered before <, assume JS. */
169 
170       if (sniffbuf[i] == '=' || sniffbuf[i] == '(' || sniffbuf[i] == '[' ||
171           !strncmp(sniffbuf + i, "function ",9) ||
172           !strncmp(sniffbuf + i, "throw ",6) ||
173           !strncmp(sniffbuf + i, "var ",4)) {
174 
175 got_javascript:
176 
177         r->sniffed_mime = "application/x-javascript";
178 
179         /* RFC 4329 lists no fewer than 16 variations of Javascript MIME type.
180            Not all of them are common in the wild, but all are roughly equivalent
181            security-wise, so let's be lenient. */
182 
183         if (r->mime_type) {
184 
185           if (!strcasecmp(r->mime_type,"text/javascript"))
186             r->sniffed_mime = "text/javascript";
187           else if (!strcasecmp(r->mime_type,"application/javascript"))
188             r->sniffed_mime = "application_javascript";
189           else if (!strcasecmp(r->mime_type,"application/json"))
190             r->sniffed_mime = "application/json";
191 
192         }
193 
194         return;
195       }
196 
197     }
198 
199     /* OpenSearch */
200 
201     if (strstr(sniffbuf,"<OpenSearch")) {
202       r->sniffed_mime = "application/opensearchdescription+xml";
203       return;
204     }
205 
206     /* Try to detect RSS */
207 
208     if (strstr(sniffbuf,"<channel") || strstr(sniffbuf,"<description") ||
209         strstr(sniffbuf,"<item")    || strstr(sniffbuf,"<rdf:RDF") ||
210         strstr(sniffbuf,"<rss")) {
211       r->sniffed_mime = "application/rss+xml";
212       return;
213     }
214 
215     /* Try to detect Atom */
216 
217     if (strstr(sniffbuf,"<feed ") || strstr(sniffbuf,"<updated>")) {
218       r->sniffed_mime = "application/atom+xml";
219       return;
220     }
221 
222     /* Try to detect WML */
223 
224     if (rp_strcasestr(sniffbuf,"<wml") || rp_strcasestr(sniffbuf,"<!DOCTYPE wml ")) {
225       r->sniffed_mime = "text/vnd.wap.wml";
226       return;
227     }
228 
229     /* Try to detect <cross-domain-policy> - just promote the new, fancy MIME type for
230        security reasons. */
231 
232     if (rp_strcasestr(sniffbuf,"<cross-domain-policy>")) {
233       r->sniffed_mime = "text/x-cross-domain-policy";
234       return;
235     }
236 
237     /* Try to detect XHTML, SVG, or generic XML of some other type. */
238 
239     if (rp_strcasestr(sniffbuf,"<?xml")) {
240 
241       if (rp_strcasestr(sniffbuf,"<svg"))
242         r->sniffed_mime = "image/svg+xml";
243       else if (rp_strcasestr(sniffbuf,"<!doctype") && !rp_strcasestr(sniffbuf,"cross-domain-policy"))
244         r->sniffed_mime = "application/xhtml+xml";
245       else {
246 
247         if (r->mime_type && !strcasecmp(r->mime_type,"text/xml"))
248           r->sniffed_mime = "text/xml";
249         else r->sniffed_mime = "application/xml";
250       }
251 
252       return;
253     }
254 
255     /* Try to detect generic HTML */
256 
257     if (rp_strcasestr(sniffbuf,"<html")     || rp_strcasestr(sniffbuf,"<meta")     ||
258         rp_strcasestr(sniffbuf,"<head")     || rp_strcasestr(sniffbuf,"<title")    ||
259         rp_strcasestr(sniffbuf,"<!--")      ||
260         rp_strcasestr(sniffbuf,"<!doctype") || rp_strcasestr(sniffbuf,"<body")     ||
261         rp_strcasestr(sniffbuf,"<font")     || rp_strcasestr(sniffbuf,"<br")       ||
262         rp_strcasestr(sniffbuf,"<td")       || rp_strcasestr(sniffbuf,"<div")      ||
263         rp_strcasestr(sniffbuf,"<span")     || rp_strcasestr(sniffbuf,"<img")      ||
264         rp_strcasestr(sniffbuf,"<li")       || rp_strcasestr(sniffbuf,"href=")     ||
265         rp_strcasestr(sniffbuf,"<ol")       || rp_strcasestr(sniffbuf,"<ul")       ||
266         rp_strcasestr(sniffbuf,"<style")    || rp_strcasestr(sniffbuf,"<script")) {
267 
268       r->sniffed_mime = "text/html";
269       return;
270     }
271 
272     /* Last resort for XML */
273 
274     xxx = sniffbuf;
275     while (isspace(*xxx)) xxx++;
276 
277     if (rp_strcasestr(xxx,"<![CDATA[") || (xxx[0] == '<' && (strstr(xxx,"</") || strstr(xxx,"/>") || strstr(xxx,"/ >")))) {
278 
279         if (r->mime_type && !strcasecmp(r->mime_type,"text/xml"))
280           r->sniffed_mime = "text/xml";
281         else r->sniffed_mime = "application/xml";
282 
283     }
284 
285     /* Oh well, at least it seems to be text. */
286 
287     r->sniffed_mime = "text/plain";
288 
289   } else {
290 
291     /* This is considerably less messy - binary signatures for some non-text files. */
292 
293     if (sniffbuf[0] == 0xFF && sniffbuf[1] == 0xD8 &&
294         sniffbuf[2] == 0xFF) {
295       r->sniffed_mime = "image/jpeg";
296 
297       /* Progressive JPEG; recognized by MSIE. */
298 
299       if (r->mime_type && !strcasecmp(r->mime_type,"image/pjpeg"))
300         r->sniffed_mime = "image/pjpeg";
301 
302       return;
303     }
304 
305     if (sniffbuf[0] == 'G' && sniffbuf[1] == 'I' &&
306         sniffbuf[2] == 'F' && sniffbuf[3] == '8') {
307       r->sniffed_mime = "image/gif";
308       return;
309     }
310 
311     if (sniffbuf[0] == 0x89 && sniffbuf[1] == 'P' &&
312         sniffbuf[2] == 'N' && sniffbuf[3] == 'G') {
313       r->sniffed_mime = "image/png";
314       return;
315     }
316 
317     if (sniffbuf[0] == 'B' && sniffbuf[1] == 'M') {
318       r->sniffed_mime = "image/x-ms-bmp";
319       return;
320     }
321 
322     if (sniffbuf[0] == 'I' && sniffbuf[1] == 'I' && sniffbuf[2] == 42) {
323       r->sniffed_mime = "image/tiff";
324       return;
325     }
326 
327     if (sniffbuf[0] == 0xFF && sniffbuf[1] == 0xFB) {
328       r->sniffed_mime = "audio/mpeg";
329       return;
330     }
331 
332     if (sniffbuf[0] == 0x00 && sniffbuf[1] == 0x00 &&
333         sniffbuf[2] == 0x01 && (sniffbuf[3] & 0xF0) == 0xB0) {
334       r->sniffed_mime = "video/mpeg";
335       return;
336     }
337 
338     if (sniffbuf[0] == 'O' && sniffbuf[1] == 'g' &&
339         sniffbuf[2] == 'g' && sniffbuf[3] == 'S') {
340       r->sniffed_mime = "application/ogg";
341       return;
342     }
343 
344     if (sniffbuf[0] == 'R' && sniffbuf[1] == 'I' &&
345         sniffbuf[2] == 'F' && sniffbuf[3] == 'F') {
346 
347        if (sniffbuf[8] == 'A') {
348          if (sniffbuf[9] == 'C') {
349            r->sniffed_mime = "application/x-navi-animation";
350          } else {
351            r->sniffed_mime = "video/avi";
352          }
353        } else r->sniffed_mime = "audio/wav";
354 
355       return;
356 
357     }
358 
359     if (sniffbuf[0] == 0x28 && sniffbuf[1] == 'R' &&
360         sniffbuf[2] == 'M' && sniffbuf[3] == 'F') {
361 
362       r->sniffed_mime = "audio/x-realaudio";
363       return;
364 
365     }
366 
367     if (sniffbuf[0] == 0x30 && sniffbuf[1] == 0x26 &&
368         sniffbuf[2] == 0xB2) {
369 
370       r->sniffed_mime = "video/x-ms-asf";
371       return;
372 
373     }
374 
375     if (!strncmp(sniffbuf+4,"free",4) || !strncmp(sniffbuf+4,"mdat",4) ||
376         !strncmp(sniffbuf+4,"wide",4) || !strncmp(sniffbuf+4,"pnot",4) ||
377         !strncmp(sniffbuf+4,"skip",4) || !strncmp(sniffbuf+4,"moov",4)) {
378 
379       r->sniffed_mime = "video/quicktime";
380       return;
381 
382     }
383 
384 
385     if ((sniffbuf[0] == 0x46 || sniffbuf[0] == 0x43) &&
386          sniffbuf[1] == 0x57 && sniffbuf[2] == 0x53) {
387 
388       r->sniffed_mime = "application/x-shockwave-flash";
389       return;
390 
391     }
392 
393     if (sniffbuf[0] == 0x46 && sniffbuf[1] == 0x4C && sniffbuf[2] == 0x56) {
394 
395       /* Again, multiple valid options in use; be polite. */
396 
397       if (r->mime_type && !strcasecmp(r->mime_type,"video/flv"))
398         r->sniffed_mime = "video/flv";
399       else
400         r->sniffed_mime = "video/x-flv";
401 
402       return;
403 
404     }
405 
406     if (r->payload_len > 3 && sniffbuf[0] == 0 && sniffbuf[1] == 0 && sniffbuf[2] < 3 && sniffbuf[3] == 0) {
407 
408       /* Be polite again. */
409 
410       if (r->mime_type && !strcasecmp(r->mime_type,"image/x-icon"))
411         r->sniffed_mime = "image/x-icon";
412         else
413       if (r->mime_type && !strcasecmp(r->mime_type,"image/bmp"))
414         r->sniffed_mime = "image/bmp";
415         else r->sniffed_mime = "image/vnd.microsoft.icon";
416 
417       return;
418     }
419 
420     if (sniffbuf[0] == '%' && sniffbuf[1] == 'P' && sniffbuf[2] == 'D' && sniffbuf[3] == 'F') {
421       r->sniffed_mime = "application/pdf";
422       return;
423     }
424 
425     if (sniffbuf[0] == 'P' && sniffbuf[1] == 'K' && sniffbuf[2] < 6 && sniffbuf[3] < 7) {
426 
427       if (rp_memmem(r->payload,r->payload_len,"META-INF/",9))
428         r->sniffed_mime = "application/java-archive";
429       else
430         r->sniffed_mime = "application/zip";
431 
432       return;
433     }
434 
435     if (sniffbuf[0] == 0xCA && sniffbuf[1] == 0xFE && sniffbuf[2] == 0xBA && sniffbuf[3] == 0xBE) {
436       r->sniffed_mime = "application/java-vm";
437       return;
438     }
439 
440     /* Microsoft office is kind-of fuzzy. */
441 
442     if (sniffbuf[0] == 0xD0 && sniffbuf[1] == 0xCF &&
443         sniffbuf[2] == 0x11 && sniffbuf[3] == 0xE0 && r->payload_len > 512) {
444 
445       _u8 c = r->payload[512];
446 
447       switch (c) {
448         case 0xEC: r->sniffed_mime = "application/msword"; break;
449         case 0xFD:
450         case 0x09: r->sniffed_mime = "application/vnd.ms-excel"; break;
451         case 0x00:
452         case 0x0F:
453         case 0xA0: r->sniffed_mime = "application/vnd.ms-powerpoint"; break;
454       }
455 
456       return;
457 
458     }
459 
460     /* If we have no idea what it is, just leave it NULL. */
461 
462   }
463 
464 }
465