1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3  * License, v. 2.0. If a copy of the MPL was not distributed with this
4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 
6 #include "nsFeedSniffer.h"
7 
8 
9 #include "nsNetCID.h"
10 #include "nsXPCOM.h"
11 #include "nsCOMPtr.h"
12 #include "nsStringStream.h"
13 
14 #include "nsBrowserCompsCID.h"
15 
16 #include "nsICategoryManager.h"
17 #include "nsIServiceManager.h"
18 #include "nsComponentManagerUtils.h"
19 #include "nsServiceManagerUtils.h"
20 
21 #include "nsIStreamConverterService.h"
22 #include "nsIStreamConverter.h"
23 
24 #include "nsIStreamListener.h"
25 
26 #include "nsIHttpChannel.h"
27 #include "nsIMIMEHeaderParam.h"
28 
29 #include "nsMimeTypes.h"
30 #include "nsIURI.h"
31 #include <algorithm>
32 
33 #define TYPE_ATOM "application/atom+xml"
34 #define TYPE_RSS "application/rss+xml"
35 #define TYPE_MAYBE_FEED "application/vnd.mozilla.maybe.feed"
36 
37 #define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
38 #define NS_RSS "http://purl.org/rss/1.0/"
39 
40 #define MAX_BYTES 512u
41 
// Interface list handed to the NS_IMPL_ISUPPORTS macro for nsFeedSniffer:
// the content-sniffer interface plus the stream-listener/request-observer
// pair whose methods are implemented at the bottom of this file.
NS_IMPL_ISUPPORTS(nsFeedSniffer, nsIContentSniffer, nsIStreamListener,
                  nsIRequestObserver)
46 
47 nsresult
48 nsFeedSniffer::ConvertEncodedData(nsIRequest* request,
49                                   const uint8_t* data,
50                                   uint32_t length)
51 {
52   nsresult rv = NS_OK;
53 
54  mDecodedData = "";
55  nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(request));
56   if (!httpChannel)
57     return NS_ERROR_NO_INTERFACE;
58 
59   nsAutoCString contentEncoding;
60   httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("Content-Encoding"),
61                                  contentEncoding);
62   if (!contentEncoding.IsEmpty()) {
63     nsCOMPtr<nsIStreamConverterService> converterService(do_GetService(NS_STREAMCONVERTERSERVICE_CONTRACTID));
64     if (converterService) {
65       ToLowerCase(contentEncoding);
66 
67       nsCOMPtr<nsIStreamListener> converter;
68       rv = converterService->AsyncConvertData(contentEncoding.get(),
69                                               "uncompressed", this, nullptr,
70                                               getter_AddRefs(converter));
71       NS_ENSURE_SUCCESS(rv, rv);
72 
73       converter->OnStartRequest(request, nullptr);
74 
75       nsCOMPtr<nsIStringInputStream> rawStream =
76         do_CreateInstance(NS_STRINGINPUTSTREAM_CONTRACTID);
77       if (!rawStream)
78         return NS_ERROR_FAILURE;
79 
80       rv = rawStream->SetData((const char*)data, length);
81       NS_ENSURE_SUCCESS(rv, rv);
82 
83       rv = converter->OnDataAvailable(request, nullptr, rawStream, 0, length);
84       NS_ENSURE_SUCCESS(rv, rv);
85 
86       converter->OnStopRequest(request, nullptr, NS_OK);
87     }
88   }
89   return rv;
90 }
91 
92 template<int N>
93 static bool
StringBeginsWithLowercaseLiteral(nsAString & aString,const char (& aSubstring)[N])94 StringBeginsWithLowercaseLiteral(nsAString& aString,
95                                  const char (&aSubstring)[N])
96 {
97   return StringHead(aString, N).LowerCaseEqualsLiteral(aSubstring);
98 }
99 
100 bool
HasAttachmentDisposition(nsIHttpChannel * httpChannel)101 HasAttachmentDisposition(nsIHttpChannel* httpChannel)
102 {
103   if (!httpChannel)
104     return false;
105 
106   uint32_t disp;
107   nsresult rv = httpChannel->GetContentDisposition(&disp);
108 
109   if (NS_SUCCEEDED(rv) && disp == nsIChannel::DISPOSITION_ATTACHMENT)
110     return true;
111 
112   return false;
113 }
114 
/**
 * Searches the half-open buffer range [begin, end) for a character.
 *
 * @return a pointer to the first position holding c, or nullptr if c does
 *         not occur (which includes an empty range)
 */
static const char*
FindChar(char c, const char *begin, const char *end)
{
  // std::find needs a valid range; treat begin >= end as "nothing to scan".
  if (begin >= end) {
    return nullptr;
  }

  const char* hit = std::find(begin, end, c);
  return hit == end ? nullptr : hit;
}
128 
/**
 * Decides whether a sniffed substring is the "documentElement" of the
 * document by inspecting everything that precedes it.
 *
 * The substrings we sniff for (<rss, <feed, <rdf:RDF) only count when they
 * are the root container element of the XML document. Otherwise someone may
 * simply have embedded one of these tags in a document of another type, e.g.
 * an HTML page, and we do not want to show the preview page for something
 * that is not actually a feed.
 *
 * The prefix is accepted when every '<' in it opens a node that starts with
 * '?' or '!' (processing instruction, doctype or comment) and that node is
 * closed by a '>' inside the prefix.
 *
 * @param   start
 *          The beginning of the data being sniffed
 * @param   end
 *          The end of the data being sniffed, right before the substring
 *          that was found.
 * @returns true if the found substring is the documentElement, false
 *          otherwise.
 */
static bool
IsDocumentElement(const char *start, const char* end)
{
  const char* cursor = start;

  for (;;) {
    // Skip ahead to the next tag opener in the prologue.
    while (cursor < end && *cursor != '<') {
      ++cursor;
    }
    if (cursor >= end) {
      // No further tags precede the substring: nothing disqualifies it.
      return true;
    }

    // Step onto the character right after '<'.
    ++cursor;
    if (cursor >= end) {
      return false;
    }

    // Only '?' (processing instruction) and '!' (doctype or comment) nodes
    // are valid in the prologue; anything else is a real element.
    const bool prologueNode = (*cursor == '?' || *cursor == '!');
    if (!prologueNode) {
      return false;
    }

    // Jump to the closing '>' so that indicator substrings embedded in other
    // nodes are not sniffed, e.g. comments: <!-- <rdf:RDF .. > -->
    while (cursor < end && *cursor != '>') {
      ++cursor;
    }
    if (cursor >= end) {
      return false;
    }

    ++cursor;
  }
}
174 
175 /**
176  * Determines whether or not a string exists as the root element in an XML data
177  * string buffer.
178  * @param   dataString
179  *          The data being sniffed
180  * @param   substring
181  *          The substring being tested for existence and root-ness.
182  * @returns true if the substring exists and is the documentElement, false
183  *          otherwise.
184  */
185 static bool
ContainsTopLevelSubstring(nsACString & dataString,const char * substring)186 ContainsTopLevelSubstring(nsACString& dataString, const char *substring)
187 {
188   nsACString::const_iterator start, end;
189   dataString.BeginReading(start);
190   dataString.EndReading(end);
191 
192   if (!FindInReadable(nsCString(substring), start, end)){
193     return false;
194   }
195 
196   auto offset = start.get() - dataString.Data();
197 
198   const char *begin = dataString.BeginReading();
199 
200   // Only do the validation when we find the substring.
201   return IsDocumentElement(begin, begin + offset);
202 }
203 
204 NS_IMETHODIMP
GetMIMETypeFromContent(nsIRequest * request,const uint8_t * data,uint32_t length,nsACString & sniffedType)205 nsFeedSniffer::GetMIMETypeFromContent(nsIRequest* request,
206                                       const uint8_t* data,
207                                       uint32_t length,
208                                       nsACString& sniffedType)
209 {
210   nsCOMPtr<nsIHttpChannel> channel(do_QueryInterface(request));
211   if (!channel)
212     return NS_ERROR_NO_INTERFACE;
213 
214   // Check that this is a GET request, since you can't subscribe to a POST...
215   nsAutoCString method;
216   channel->GetRequestMethod(method);
217   if (!method.EqualsLiteral("GET")) {
218     sniffedType.Truncate();
219     return NS_OK;
220   }
221 
222   // We need to find out if this is a load of a view-source document. In this
223   // case we do not want to override the content type, since the source display
224   // does not need to be converted from feed format to XUL. More importantly,
225   // we don't want to change the content type from something
226   // nsContentDLF::CreateInstance knows about (e.g. application/xml, text/html
227   // etc) to something that only the application fe knows about (maybe.feed)
228   // thus deactivating syntax highlighting.
229   nsCOMPtr<nsIURI> originalURI;
230   channel->GetOriginalURI(getter_AddRefs(originalURI));
231 
232   nsAutoCString scheme;
233   originalURI->GetScheme(scheme);
234   if (scheme.EqualsLiteral("view-source")) {
235     sniffedType.Truncate();
236     return NS_OK;
237   }
238 
239   // Check the Content-Type to see if it is set correctly. If it is set to
240   // something specific that we think is a reliable indication of a feed, don't
241   // bother sniffing since we assume the site maintainer knows what they're
242   // doing.
243   nsAutoCString contentType;
244   channel->GetContentType(contentType);
245   bool noSniff = contentType.EqualsLiteral(TYPE_RSS) ||
246                    contentType.EqualsLiteral(TYPE_ATOM);
247 
248   // Check to see if this was a feed request from the location bar or from
249   // the feed: protocol. This is also a reliable indication.
250   // The value of the header doesn't matter.
251   if (!noSniff) {
252     nsAutoCString sniffHeader;
253     nsresult foundHeader =
254       channel->GetRequestHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
255                                 sniffHeader);
256     noSniff = NS_SUCCEEDED(foundHeader);
257   }
258 
259   if (noSniff) {
260     // check for an attachment after we have a likely feed.
261     if(HasAttachmentDisposition(channel)) {
262       sniffedType.Truncate();
263       return NS_OK;
264     }
265 
266     // set the feed header as a response header, since we have good metadata
267     // telling us that the feed is supposed to be RSS or Atom
268     channel->SetResponseHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
269                                NS_LITERAL_CSTRING("1"), false);
270     sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
271     return NS_OK;
272   }
273 
274   // Don't sniff arbitrary types.  Limit sniffing to situations that
275   // we think can reasonably arise.
276   if (!contentType.EqualsLiteral(TEXT_HTML) &&
277       !contentType.EqualsLiteral(APPLICATION_OCTET_STREAM) &&
278       // Same criterion as XMLHttpRequest.  Should we be checking for "+xml"
279       // and check for text/xml and application/xml by hand instead?
280       contentType.Find("xml") == -1) {
281     sniffedType.Truncate();
282     return NS_OK;
283   }
284 
285   // Now we need to potentially decompress data served with
286   // Content-Encoding: gzip
287   nsresult rv = ConvertEncodedData(request, data, length);
288   if (NS_FAILED(rv))
289     return rv;
290 
291   // We cap the number of bytes to scan at MAX_BYTES to prevent picking up
292   // false positives by accidentally reading document content, e.g. a "how to
293   // make a feed" page.
294   const char* testData;
295   if (mDecodedData.IsEmpty()) {
296     testData = (const char*)data;
297     length = std::min(length, MAX_BYTES);
298   } else {
299     testData = mDecodedData.get();
300     length = std::min(mDecodedData.Length(), MAX_BYTES);
301   }
302 
303   // The strategy here is based on that described in:
304   // http://blogs.msdn.com/rssteam/articles/PublishersGuide.aspx
305   // for interoperarbility purposes.
306 
307   // Thus begins the actual sniffing.
308   nsDependentCSubstring dataString((const char*)testData, length);
309 
310   bool isFeed = false;
311 
312   // RSS 0.91/0.92/2.0
313   isFeed = ContainsTopLevelSubstring(dataString, "<rss");
314 
315   // Atom 1.0
316   if (!isFeed)
317     isFeed = ContainsTopLevelSubstring(dataString, "<feed");
318 
319   // RSS 1.0
320   if (!isFeed) {
321     bool foundNS_RDF = FindInReadable(NS_LITERAL_CSTRING(NS_RDF), dataString);
322     bool foundNS_RSS = FindInReadable(NS_LITERAL_CSTRING(NS_RSS), dataString);
323     isFeed = ContainsTopLevelSubstring(dataString, "<rdf:RDF") &&
324       foundNS_RDF && foundNS_RSS;
325   }
326 
327   // If we sniffed a feed, coerce our internal type
328   if (isFeed && !HasAttachmentDisposition(channel))
329     sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
330   else
331     sniffedType.Truncate();
332   return NS_OK;
333 }
334 
335 NS_IMETHODIMP
OnStartRequest(nsIRequest * request,nsISupports * context)336 nsFeedSniffer::OnStartRequest(nsIRequest* request, nsISupports* context)
337 {
338   return NS_OK;
339 }
340 
341 nsresult
AppendSegmentToString(nsIInputStream * inputStream,void * closure,const char * rawSegment,uint32_t toOffset,uint32_t count,uint32_t * writeCount)342 nsFeedSniffer::AppendSegmentToString(nsIInputStream* inputStream,
343                                      void* closure,
344                                      const char* rawSegment,
345                                      uint32_t toOffset,
346                                      uint32_t count,
347                                      uint32_t* writeCount)
348 {
349   nsCString* decodedData = static_cast<nsCString*>(closure);
350   decodedData->Append(rawSegment, count);
351   *writeCount = count;
352   return NS_OK;
353 }
354 
355 NS_IMETHODIMP
OnDataAvailable(nsIRequest * request,nsISupports * context,nsIInputStream * stream,uint64_t offset,uint32_t count)356 nsFeedSniffer::OnDataAvailable(nsIRequest* request, nsISupports* context,
357                                nsIInputStream* stream, uint64_t offset,
358                                uint32_t count)
359 {
360   uint32_t read;
361   return stream->ReadSegments(AppendSegmentToString, &mDecodedData, count,
362                               &read);
363 }
364 
365 NS_IMETHODIMP
OnStopRequest(nsIRequest * request,nsISupports * context,nsresult status)366 nsFeedSniffer::OnStopRequest(nsIRequest* request, nsISupports* context,
367                              nsresult status)
368 {
369   return NS_OK;
370 }
371