1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5
6 #include "nsFeedSniffer.h"
7
8
9 #include "nsNetCID.h"
10 #include "nsXPCOM.h"
11 #include "nsCOMPtr.h"
12 #include "nsStringStream.h"
13
14 #include "nsBrowserCompsCID.h"
15
16 #include "nsICategoryManager.h"
17 #include "nsIServiceManager.h"
18 #include "nsComponentManagerUtils.h"
19 #include "nsServiceManagerUtils.h"
20
21 #include "nsIStreamConverterService.h"
22 #include "nsIStreamConverter.h"
23
24 #include "nsIStreamListener.h"
25
26 #include "nsIHttpChannel.h"
27 #include "nsIMIMEHeaderParam.h"
28
29 #include "nsMimeTypes.h"
30 #include "nsIURI.h"
31 #include <algorithm>
32
// Content types that are trusted as feeds without inspecting the body.
#define TYPE_ATOM "application/atom+xml"
#define TYPE_RSS "application/rss+xml"
// Internal content type reported by the sniffer when the data looks like a
// feed.
#define TYPE_MAYBE_FEED "application/vnd.mozilla.maybe.feed"

// Namespace URIs that must both be present for an <rdf:RDF document to be
// treated as RSS 1.0.
#define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
#define NS_RSS "http://purl.org/rss/1.0/"

// Upper bound on the number of leading bytes that are scanned for feed
// markers.
#define MAX_BYTES 512u
41
// nsFeedSniffer is exposed through the content-sniffer interface and through
// the stream-listener/request-observer pair that receives decoded data.
NS_IMPL_ISUPPORTS(nsFeedSniffer, nsIContentSniffer, nsIStreamListener,
                  nsIRequestObserver)
46
47 nsresult
48 nsFeedSniffer::ConvertEncodedData(nsIRequest* request,
49 const uint8_t* data,
50 uint32_t length)
51 {
52 nsresult rv = NS_OK;
53
54 mDecodedData = "";
55 nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(request));
56 if (!httpChannel)
57 return NS_ERROR_NO_INTERFACE;
58
59 nsAutoCString contentEncoding;
60 httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("Content-Encoding"),
61 contentEncoding);
62 if (!contentEncoding.IsEmpty()) {
63 nsCOMPtr<nsIStreamConverterService> converterService(do_GetService(NS_STREAMCONVERTERSERVICE_CONTRACTID));
64 if (converterService) {
65 ToLowerCase(contentEncoding);
66
67 nsCOMPtr<nsIStreamListener> converter;
68 rv = converterService->AsyncConvertData(contentEncoding.get(),
69 "uncompressed", this, nullptr,
70 getter_AddRefs(converter));
71 NS_ENSURE_SUCCESS(rv, rv);
72
73 converter->OnStartRequest(request, nullptr);
74
75 nsCOMPtr<nsIStringInputStream> rawStream =
76 do_CreateInstance(NS_STRINGINPUTSTREAM_CONTRACTID);
77 if (!rawStream)
78 return NS_ERROR_FAILURE;
79
80 rv = rawStream->SetData((const char*)data, length);
81 NS_ENSURE_SUCCESS(rv, rv);
82
83 rv = converter->OnDataAvailable(request, nullptr, rawStream, 0, length);
84 NS_ENSURE_SUCCESS(rv, rv);
85
86 converter->OnStopRequest(request, nullptr, NS_OK);
87 }
88 }
89 return rv;
90 }
91
92 template<int N>
93 static bool
StringBeginsWithLowercaseLiteral(nsAString & aString,const char (& aSubstring)[N])94 StringBeginsWithLowercaseLiteral(nsAString& aString,
95 const char (&aSubstring)[N])
96 {
97 return StringHead(aString, N).LowerCaseEqualsLiteral(aSubstring);
98 }
99
100 bool
HasAttachmentDisposition(nsIHttpChannel * httpChannel)101 HasAttachmentDisposition(nsIHttpChannel* httpChannel)
102 {
103 if (!httpChannel)
104 return false;
105
106 uint32_t disp;
107 nsresult rv = httpChannel->GetContentDisposition(&disp);
108
109 if (NS_SUCCEEDED(rv) && disp == nsIChannel::DISPOSITION_ATTACHMENT)
110 return true;
111
112 return false;
113 }
114
/**
 * Locates a character inside the half-open buffer [begin, end).
 *
 * @return a pointer to the first occurrence of c, or nullptr when the range
 *         is empty (or inverted) or does not contain c
 */
static const char*
FindChar(char c, const char *begin, const char *end)
{
  // Nothing to scan; this also keeps std::find away from inverted ranges.
  if (!(begin < end))
    return nullptr;

  const char* hit = std::find(begin, end, c);
  return hit != end ? hit : nullptr;
}
128
129 /**
130 *
131 * Determine if a substring is the "documentElement" in the document.
132 *
133 * All of our sniffed substrings: <rss, <feed, <rdf:RDF must be the "document"
134 * element within the XML DOM, i.e. the root container element. Otherwise,
135 * it's possible that someone embedded one of these tags inside a document of
136 * another type, e.g. a HTML document, and we don't want to show the preview
137 * page if the document isn't actually a feed.
138 *
139 * @param start
140 * The beginning of the data being sniffed
141 * @param end
142 * The end of the data being sniffed, right before the substring that
143 * was found.
144 * @returns true if the found substring is the documentElement, false
145 * otherwise.
146 */
147 static bool
IsDocumentElement(const char * start,const char * end)148 IsDocumentElement(const char *start, const char* end)
149 {
150 // For every tag in the buffer, check to see if it's a PI, Doctype or
151 // comment, our desired substring or something invalid.
152 while ( (start = FindChar('<', start, end)) ) {
153 ++start;
154 if (start >= end)
155 return false;
156
157 // Check to see if the character following the '<' is either '?' or '!'
158 // (processing instruction or doctype or comment)... these are valid nodes
159 // to have in the prologue.
160 if (*start != '?' && *start != '!')
161 return false;
162
163 // Now advance the iterator until the '>' (We do this because we don't want
164 // to sniff indicator substrings that are embedded within other nodes, e.g.
165 // comments: <!-- <rdf:RDF .. > -->
166 start = FindChar('>', start, end);
167 if (!start)
168 return false;
169
170 ++start;
171 }
172 return true;
173 }
174
175 /**
176 * Determines whether or not a string exists as the root element in an XML data
177 * string buffer.
178 * @param dataString
179 * The data being sniffed
180 * @param substring
181 * The substring being tested for existence and root-ness.
182 * @returns true if the substring exists and is the documentElement, false
183 * otherwise.
184 */
185 static bool
ContainsTopLevelSubstring(nsACString & dataString,const char * substring)186 ContainsTopLevelSubstring(nsACString& dataString, const char *substring)
187 {
188 nsACString::const_iterator start, end;
189 dataString.BeginReading(start);
190 dataString.EndReading(end);
191
192 if (!FindInReadable(nsCString(substring), start, end)){
193 return false;
194 }
195
196 auto offset = start.get() - dataString.Data();
197
198 const char *begin = dataString.BeginReading();
199
200 // Only do the validation when we find the substring.
201 return IsDocumentElement(begin, begin + offset);
202 }
203
204 NS_IMETHODIMP
GetMIMETypeFromContent(nsIRequest * request,const uint8_t * data,uint32_t length,nsACString & sniffedType)205 nsFeedSniffer::GetMIMETypeFromContent(nsIRequest* request,
206 const uint8_t* data,
207 uint32_t length,
208 nsACString& sniffedType)
209 {
210 nsCOMPtr<nsIHttpChannel> channel(do_QueryInterface(request));
211 if (!channel)
212 return NS_ERROR_NO_INTERFACE;
213
214 // Check that this is a GET request, since you can't subscribe to a POST...
215 nsAutoCString method;
216 channel->GetRequestMethod(method);
217 if (!method.EqualsLiteral("GET")) {
218 sniffedType.Truncate();
219 return NS_OK;
220 }
221
222 // We need to find out if this is a load of a view-source document. In this
223 // case we do not want to override the content type, since the source display
224 // does not need to be converted from feed format to XUL. More importantly,
225 // we don't want to change the content type from something
226 // nsContentDLF::CreateInstance knows about (e.g. application/xml, text/html
227 // etc) to something that only the application fe knows about (maybe.feed)
228 // thus deactivating syntax highlighting.
229 nsCOMPtr<nsIURI> originalURI;
230 channel->GetOriginalURI(getter_AddRefs(originalURI));
231
232 nsAutoCString scheme;
233 originalURI->GetScheme(scheme);
234 if (scheme.EqualsLiteral("view-source")) {
235 sniffedType.Truncate();
236 return NS_OK;
237 }
238
239 // Check the Content-Type to see if it is set correctly. If it is set to
240 // something specific that we think is a reliable indication of a feed, don't
241 // bother sniffing since we assume the site maintainer knows what they're
242 // doing.
243 nsAutoCString contentType;
244 channel->GetContentType(contentType);
245 bool noSniff = contentType.EqualsLiteral(TYPE_RSS) ||
246 contentType.EqualsLiteral(TYPE_ATOM);
247
248 // Check to see if this was a feed request from the location bar or from
249 // the feed: protocol. This is also a reliable indication.
250 // The value of the header doesn't matter.
251 if (!noSniff) {
252 nsAutoCString sniffHeader;
253 nsresult foundHeader =
254 channel->GetRequestHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
255 sniffHeader);
256 noSniff = NS_SUCCEEDED(foundHeader);
257 }
258
259 if (noSniff) {
260 // check for an attachment after we have a likely feed.
261 if(HasAttachmentDisposition(channel)) {
262 sniffedType.Truncate();
263 return NS_OK;
264 }
265
266 // set the feed header as a response header, since we have good metadata
267 // telling us that the feed is supposed to be RSS or Atom
268 channel->SetResponseHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
269 NS_LITERAL_CSTRING("1"), false);
270 sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
271 return NS_OK;
272 }
273
274 // Don't sniff arbitrary types. Limit sniffing to situations that
275 // we think can reasonably arise.
276 if (!contentType.EqualsLiteral(TEXT_HTML) &&
277 !contentType.EqualsLiteral(APPLICATION_OCTET_STREAM) &&
278 // Same criterion as XMLHttpRequest. Should we be checking for "+xml"
279 // and check for text/xml and application/xml by hand instead?
280 contentType.Find("xml") == -1) {
281 sniffedType.Truncate();
282 return NS_OK;
283 }
284
285 // Now we need to potentially decompress data served with
286 // Content-Encoding: gzip
287 nsresult rv = ConvertEncodedData(request, data, length);
288 if (NS_FAILED(rv))
289 return rv;
290
291 // We cap the number of bytes to scan at MAX_BYTES to prevent picking up
292 // false positives by accidentally reading document content, e.g. a "how to
293 // make a feed" page.
294 const char* testData;
295 if (mDecodedData.IsEmpty()) {
296 testData = (const char*)data;
297 length = std::min(length, MAX_BYTES);
298 } else {
299 testData = mDecodedData.get();
300 length = std::min(mDecodedData.Length(), MAX_BYTES);
301 }
302
303 // The strategy here is based on that described in:
304 // http://blogs.msdn.com/rssteam/articles/PublishersGuide.aspx
305 // for interoperarbility purposes.
306
307 // Thus begins the actual sniffing.
308 nsDependentCSubstring dataString((const char*)testData, length);
309
310 bool isFeed = false;
311
312 // RSS 0.91/0.92/2.0
313 isFeed = ContainsTopLevelSubstring(dataString, "<rss");
314
315 // Atom 1.0
316 if (!isFeed)
317 isFeed = ContainsTopLevelSubstring(dataString, "<feed");
318
319 // RSS 1.0
320 if (!isFeed) {
321 bool foundNS_RDF = FindInReadable(NS_LITERAL_CSTRING(NS_RDF), dataString);
322 bool foundNS_RSS = FindInReadable(NS_LITERAL_CSTRING(NS_RSS), dataString);
323 isFeed = ContainsTopLevelSubstring(dataString, "<rdf:RDF") &&
324 foundNS_RDF && foundNS_RSS;
325 }
326
327 // If we sniffed a feed, coerce our internal type
328 if (isFeed && !HasAttachmentDisposition(channel))
329 sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
330 else
331 sniffedType.Truncate();
332 return NS_OK;
333 }
334
335 NS_IMETHODIMP
OnStartRequest(nsIRequest * request,nsISupports * context)336 nsFeedSniffer::OnStartRequest(nsIRequest* request, nsISupports* context)
337 {
338 return NS_OK;
339 }
340
341 nsresult
AppendSegmentToString(nsIInputStream * inputStream,void * closure,const char * rawSegment,uint32_t toOffset,uint32_t count,uint32_t * writeCount)342 nsFeedSniffer::AppendSegmentToString(nsIInputStream* inputStream,
343 void* closure,
344 const char* rawSegment,
345 uint32_t toOffset,
346 uint32_t count,
347 uint32_t* writeCount)
348 {
349 nsCString* decodedData = static_cast<nsCString*>(closure);
350 decodedData->Append(rawSegment, count);
351 *writeCount = count;
352 return NS_OK;
353 }
354
355 NS_IMETHODIMP
OnDataAvailable(nsIRequest * request,nsISupports * context,nsIInputStream * stream,uint64_t offset,uint32_t count)356 nsFeedSniffer::OnDataAvailable(nsIRequest* request, nsISupports* context,
357 nsIInputStream* stream, uint64_t offset,
358 uint32_t count)
359 {
360 uint32_t read;
361 return stream->ReadSegments(AppendSegmentToString, &mDecodedData, count,
362 &read);
363 }
364
365 NS_IMETHODIMP
OnStopRequest(nsIRequest * request,nsISupports * context,nsresult status)366 nsFeedSniffer::OnStopRequest(nsIRequest* request, nsISupports* context,
367 nsresult status)
368 {
369 return NS_OK;
370 }
371