1 /*
2 ratproxy - MIME detection
3 -------------------------
4
5 MIME content sniffing routines. This code tries to figure out
6 what is actually being served, regardless of what HTTP headers
7 say.
8
9 Author: Michal Zalewski <lcamtuf@google.com>
10
11 Copyright 2007, 2008 by Google Inc. All Rights Reserved.
12
13 Licensed under the Apache License, Version 2.0 (the "License");
14 you may not use this file except in compliance with the License.
15 You may obtain a copy of the License at
16
17 http://www.apache.org/licenses/LICENSE-2.0
18
19 Unless required by applicable law or agreed to in writing, software
20 distributed under the License is distributed on an "AS IS" BASIS,
21 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22 See the License for the specific language governing permissions and
23 limitations under the License.
24
25 */
26
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <unistd.h>
30 #include <sys/socket.h>
31 #include <netinet/in.h>
32 #include <sys/types.h>
33 #include <sys/stat.h>
34 #include <fcntl.h>
35 #include <string.h>
36 #include <sys/wait.h>
37 #include <ctype.h>
38 #include <netdb.h>
39 #include <openssl/md5.h>
40
41 #include "config.h"
42 #include "types.h"
43 #include "debug.h"
44 #include "nlist.h"
45 #include "http.h"
46 #include "mime.h"
47 #include "string-inl.h"
48
49
50 /* Check for JSON prologues... */
static _u8 is_json_safe_mime(_u8* str) {
  _u32 idx;

  /* Walk the NULL-terminated json_safe table and report whether str begins
     with one of its entries. Returns 1 on the first match, 0 if none match.

     Only prologues longer than one character are considered here: those are
     treated as "authoritative" and override any further content sniffing.
     One-character entries are skipped. */

  for (idx = 0; json_safe[idx]; idx++) {

    if (!json_safe[idx][1]) continue;

    if (strncmp(str, json_safe[idx], strlen(json_safe[idx])) == 0)
      return 1;

  }

  return 0;

}
65
66
67 /* Attempt MIME type detection for formats that are likely to be served by a
68 modern web application based on payload signature matching. */
detect_mime(struct http_response * r)69 void detect_mime(struct http_response* r) {
70 _u32 i, max;
71 _u8 text = 1;
72 _u8 sniffbuf[SNIFFBUF + 1];
73 _u8* xxx;
74
75 /* TODO: Add more popular formats. This is oriented toward common web 2.0
76 technologies at the moment. */
77
78 if (!r->payload_len) return;
79
80 if (r->payload_len > SNIFFBUF) max = SNIFFBUF; else max = r->payload_len;
81 memcpy(sniffbuf,r->payload,max);
82 sniffbuf[max] = 0;
83
84 /* Is this a plain-text file? */
85
86 for (i=0;i<max;i++)
87 if (sniffbuf[i] < 0x20 && !isspace(sniffbuf[i])) { text = 0; break; }
88
89 if (text) {
90 _u8 got_alpha = 0, got_bracket = 0, got_try = 0, got_alpha_before = 0;
91
92 r->is_text = 1;
93
94 /* First, some files with known, fixed signatures. */
95
96 if (!strncmp(sniffbuf,"%!PS",4)) {
97 r->sniffed_mime = "application/postscript";
98 return;
99 }
100
101 if (!strncmp(sniffbuf,"{\\rtf",5)) {
102 r->sniffed_mime = "text/rtf";
103 return;
104 }
105
106 /* Try to detect Javascript - this is a bit tricky, because
107 JSON snippets can have minimal syntax, and CSS uses a notation
108 vaguely resembling Javascript. */
109
110 /* JSON breaker prefixes automatically qualify content as JS */
111 if (is_json_safe_mime(sniffbuf)) goto got_javascript;
112
113 for (i=0;i<max;i++) {
114
115 /* First, skip comment blocks */
116
117 if (!strncmp(sniffbuf+i,"//",2)) {
118 _u8* x = strchr(sniffbuf + i + 2, '\n');
119 if (!x) i = max; else i = x - sniffbuf;
120 continue;
121 }
122
123 if (!strncmp(sniffbuf+i,"/*",2)) {
124 _u8* x = strstr(sniffbuf + i + 2, "*/");
125 if (!x) i = max; else i = x - sniffbuf + 1;
126 continue;
127 }
128
129 /* If what follows look HTML-esque, bail out */
130
131 if ((sniffbuf[i] == '<' || !strcmp(sniffbuf+i,"<"))) break;
132
133 if (!strncmp(sniffbuf+i,"try",3) && (isspace(sniffbuf[i+3]) || sniffbuf[i+3] == '{'))
134 got_try = 1;
135
136 /* try { ... is a special JSON-like response that looks like CSS, but should
137 be handled as Javascript. */
138
139 if (sniffbuf[i] == '{' && got_try) goto got_javascript;
140
141 /* Otherwise, if { is encountered before any =, (, or HTML, but after alnums,
142 it's likely a stylesheet... well, unless followed by '"', in which case it
143 might be a serialized object with a non-standard anti-XSSI prologue. */
144
145 if (got_alpha && sniffbuf[i] == '{') {
146 _u32 j = i + 1;
147
148 while (j < max && sniffbuf[j] && !isalpha(sniffbuf[j])) {
149 if (sniffbuf[j] == '{' || sniffbuf[j] == '"' ||
150 sniffbuf[j] == '\'' || sniffbuf[j] == '(') goto got_javascript;
151 j++;
152 }
153
154 r->sniffed_mime = "text/css";
155 return;
156 }
157
158 if (isalpha(sniffbuf[i])) got_alpha = 1;
159 if (sniffbuf[i] == '{') { got_bracket = 1; got_alpha_before = got_alpha; }
160
161 /* { "foo" is very JSONish. */
162 if (!got_alpha && got_bracket && sniffbuf[i] == '"') goto got_javascript;
163
164 /* { foo: 1 is JSONish too. */
165 if (!got_alpha_before && got_alpha && got_bracket && sniffbuf[i] == ':')
166 goto got_javascript;
167
168 /* And finally, if =, ( or JS keyword is encountered before <, assume JS. */
169
170 if (sniffbuf[i] == '=' || sniffbuf[i] == '(' || sniffbuf[i] == '[' ||
171 !strncmp(sniffbuf + i, "function ",9) ||
172 !strncmp(sniffbuf + i, "throw ",6) ||
173 !strncmp(sniffbuf + i, "var ",4)) {
174
175 got_javascript:
176
177 r->sniffed_mime = "application/x-javascript";
178
179 /* RFC 4329 lists no fewer than 16 variations of Javascript MIME type.
180 Not all of them are common in the wild, but all are roughly equivalent
181 security-wise, so let's be lenient. */
182
183 if (r->mime_type) {
184
185 if (!strcasecmp(r->mime_type,"text/javascript"))
186 r->sniffed_mime = "text/javascript";
187 else if (!strcasecmp(r->mime_type,"application/javascript"))
188 r->sniffed_mime = "application_javascript";
189 else if (!strcasecmp(r->mime_type,"application/json"))
190 r->sniffed_mime = "application/json";
191
192 }
193
194 return;
195 }
196
197 }
198
199 /* OpenSearch */
200
201 if (strstr(sniffbuf,"<OpenSearch")) {
202 r->sniffed_mime = "application/opensearchdescription+xml";
203 return;
204 }
205
206 /* Try to detect RSS */
207
208 if (strstr(sniffbuf,"<channel") || strstr(sniffbuf,"<description") ||
209 strstr(sniffbuf,"<item") || strstr(sniffbuf,"<rdf:RDF") ||
210 strstr(sniffbuf,"<rss")) {
211 r->sniffed_mime = "application/rss+xml";
212 return;
213 }
214
215 /* Try to detect Atom */
216
217 if (strstr(sniffbuf,"<feed ") || strstr(sniffbuf,"<updated>")) {
218 r->sniffed_mime = "application/atom+xml";
219 return;
220 }
221
222 /* Try to detect WML */
223
224 if (rp_strcasestr(sniffbuf,"<wml") || rp_strcasestr(sniffbuf,"<!DOCTYPE wml ")) {
225 r->sniffed_mime = "text/vnd.wap.wml";
226 return;
227 }
228
229 /* Try to detect <cross-domain-policy> - just promote the new, fancy MIME type for
230 security reasons. */
231
232 if (rp_strcasestr(sniffbuf,"<cross-domain-policy>")) {
233 r->sniffed_mime = "text/x-cross-domain-policy";
234 return;
235 }
236
237 /* Try to detect XHTML, SVG, or generic XML of some other type. */
238
239 if (rp_strcasestr(sniffbuf,"<?xml")) {
240
241 if (rp_strcasestr(sniffbuf,"<svg"))
242 r->sniffed_mime = "image/svg+xml";
243 else if (rp_strcasestr(sniffbuf,"<!doctype") && !rp_strcasestr(sniffbuf,"cross-domain-policy"))
244 r->sniffed_mime = "application/xhtml+xml";
245 else {
246
247 if (r->mime_type && !strcasecmp(r->mime_type,"text/xml"))
248 r->sniffed_mime = "text/xml";
249 else r->sniffed_mime = "application/xml";
250 }
251
252 return;
253 }
254
255 /* Try to detect generic HTML */
256
257 if (rp_strcasestr(sniffbuf,"<html") || rp_strcasestr(sniffbuf,"<meta") ||
258 rp_strcasestr(sniffbuf,"<head") || rp_strcasestr(sniffbuf,"<title") ||
259 rp_strcasestr(sniffbuf,"<!--") ||
260 rp_strcasestr(sniffbuf,"<!doctype") || rp_strcasestr(sniffbuf,"<body") ||
261 rp_strcasestr(sniffbuf,"<font") || rp_strcasestr(sniffbuf,"<br") ||
262 rp_strcasestr(sniffbuf,"<td") || rp_strcasestr(sniffbuf,"<div") ||
263 rp_strcasestr(sniffbuf,"<span") || rp_strcasestr(sniffbuf,"<img") ||
264 rp_strcasestr(sniffbuf,"<li") || rp_strcasestr(sniffbuf,"href=") ||
265 rp_strcasestr(sniffbuf,"<ol") || rp_strcasestr(sniffbuf,"<ul") ||
266 rp_strcasestr(sniffbuf,"<style") || rp_strcasestr(sniffbuf,"<script")) {
267
268 r->sniffed_mime = "text/html";
269 return;
270 }
271
272 /* Last resort for XML */
273
274 xxx = sniffbuf;
275 while (isspace(*xxx)) xxx++;
276
277 if (rp_strcasestr(xxx,"<![CDATA[") || (xxx[0] == '<' && (strstr(xxx,"</") || strstr(xxx,"/>") || strstr(xxx,"/ >")))) {
278
279 if (r->mime_type && !strcasecmp(r->mime_type,"text/xml"))
280 r->sniffed_mime = "text/xml";
281 else r->sniffed_mime = "application/xml";
282
283 }
284
285 /* Oh well, at least it seems to be text. */
286
287 r->sniffed_mime = "text/plain";
288
289 } else {
290
291 /* This is considerably less messy - binary signatures for some non-text files. */
292
293 if (sniffbuf[0] == 0xFF && sniffbuf[1] == 0xD8 &&
294 sniffbuf[2] == 0xFF) {
295 r->sniffed_mime = "image/jpeg";
296
297 /* Progressive JPEG; recognized by MSIE. */
298
299 if (r->mime_type && !strcasecmp(r->mime_type,"image/pjpeg"))
300 r->sniffed_mime = "image/pjpeg";
301
302 return;
303 }
304
305 if (sniffbuf[0] == 'G' && sniffbuf[1] == 'I' &&
306 sniffbuf[2] == 'F' && sniffbuf[3] == '8') {
307 r->sniffed_mime = "image/gif";
308 return;
309 }
310
311 if (sniffbuf[0] == 0x89 && sniffbuf[1] == 'P' &&
312 sniffbuf[2] == 'N' && sniffbuf[3] == 'G') {
313 r->sniffed_mime = "image/png";
314 return;
315 }
316
317 if (sniffbuf[0] == 'B' && sniffbuf[1] == 'M') {
318 r->sniffed_mime = "image/x-ms-bmp";
319 return;
320 }
321
322 if (sniffbuf[0] == 'I' && sniffbuf[1] == 'I' && sniffbuf[2] == 42) {
323 r->sniffed_mime = "image/tiff";
324 return;
325 }
326
327 if (sniffbuf[0] == 0xFF && sniffbuf[1] == 0xFB) {
328 r->sniffed_mime = "audio/mpeg";
329 return;
330 }
331
332 if (sniffbuf[0] == 0x00 && sniffbuf[1] == 0x00 &&
333 sniffbuf[2] == 0x01 && (sniffbuf[3] & 0xF0) == 0xB0) {
334 r->sniffed_mime = "video/mpeg";
335 return;
336 }
337
338 if (sniffbuf[0] == 'O' && sniffbuf[1] == 'g' &&
339 sniffbuf[2] == 'g' && sniffbuf[3] == 'S') {
340 r->sniffed_mime = "application/ogg";
341 return;
342 }
343
344 if (sniffbuf[0] == 'R' && sniffbuf[1] == 'I' &&
345 sniffbuf[2] == 'F' && sniffbuf[3] == 'F') {
346
347 if (sniffbuf[8] == 'A') {
348 if (sniffbuf[9] == 'C') {
349 r->sniffed_mime = "application/x-navi-animation";
350 } else {
351 r->sniffed_mime = "video/avi";
352 }
353 } else r->sniffed_mime = "audio/wav";
354
355 return;
356
357 }
358
359 if (sniffbuf[0] == 0x28 && sniffbuf[1] == 'R' &&
360 sniffbuf[2] == 'M' && sniffbuf[3] == 'F') {
361
362 r->sniffed_mime = "audio/x-realaudio";
363 return;
364
365 }
366
367 if (sniffbuf[0] == 0x30 && sniffbuf[1] == 0x26 &&
368 sniffbuf[2] == 0xB2) {
369
370 r->sniffed_mime = "video/x-ms-asf";
371 return;
372
373 }
374
375 if (!strncmp(sniffbuf+4,"free",4) || !strncmp(sniffbuf+4,"mdat",4) ||
376 !strncmp(sniffbuf+4,"wide",4) || !strncmp(sniffbuf+4,"pnot",4) ||
377 !strncmp(sniffbuf+4,"skip",4) || !strncmp(sniffbuf+4,"moov",4)) {
378
379 r->sniffed_mime = "video/quicktime";
380 return;
381
382 }
383
384
385 if ((sniffbuf[0] == 0x46 || sniffbuf[0] == 0x43) &&
386 sniffbuf[1] == 0x57 && sniffbuf[2] == 0x53) {
387
388 r->sniffed_mime = "application/x-shockwave-flash";
389 return;
390
391 }
392
393 if (sniffbuf[0] == 0x46 && sniffbuf[1] == 0x4C && sniffbuf[2] == 0x56) {
394
395 /* Again, multiple valid options in use; be polite. */
396
397 if (r->mime_type && !strcasecmp(r->mime_type,"video/flv"))
398 r->sniffed_mime = "video/flv";
399 else
400 r->sniffed_mime = "video/x-flv";
401
402 return;
403
404 }
405
406 if (r->payload_len > 3 && sniffbuf[0] == 0 && sniffbuf[1] == 0 && sniffbuf[2] < 3 && sniffbuf[3] == 0) {
407
408 /* Be polite again. */
409
410 if (r->mime_type && !strcasecmp(r->mime_type,"image/x-icon"))
411 r->sniffed_mime = "image/x-icon";
412 else
413 if (r->mime_type && !strcasecmp(r->mime_type,"image/bmp"))
414 r->sniffed_mime = "image/bmp";
415 else r->sniffed_mime = "image/vnd.microsoft.icon";
416
417 return;
418 }
419
420 if (sniffbuf[0] == '%' && sniffbuf[1] == 'P' && sniffbuf[2] == 'D' && sniffbuf[3] == 'F') {
421 r->sniffed_mime = "application/pdf";
422 return;
423 }
424
425 if (sniffbuf[0] == 'P' && sniffbuf[1] == 'K' && sniffbuf[2] < 6 && sniffbuf[3] < 7) {
426
427 if (rp_memmem(r->payload,r->payload_len,"META-INF/",9))
428 r->sniffed_mime = "application/java-archive";
429 else
430 r->sniffed_mime = "application/zip";
431
432 return;
433 }
434
435 if (sniffbuf[0] == 0xCA && sniffbuf[1] == 0xFE && sniffbuf[2] == 0xBA && sniffbuf[3] == 0xBE) {
436 r->sniffed_mime = "application/java-vm";
437 return;
438 }
439
440 /* Microsoft office is kind-of fuzzy. */
441
442 if (sniffbuf[0] == 0xD0 && sniffbuf[1] == 0xCF &&
443 sniffbuf[2] == 0x11 && sniffbuf[3] == 0xE0 && r->payload_len > 512) {
444
445 _u8 c = r->payload[512];
446
447 switch (c) {
448 case 0xEC: r->sniffed_mime = "application/msword"; break;
449 case 0xFD:
450 case 0x09: r->sniffed_mime = "application/vnd.ms-excel"; break;
451 case 0x00:
452 case 0x0F:
453 case 0xA0: r->sniffed_mime = "application/vnd.ms-powerpoint"; break;
454 }
455
456 return;
457
458 }
459
460 /* If we have no idea what it is, just leave it NULL. */
461
462 }
463
464 }
465