nsFeedSniffer.cpp
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* ***** BEGIN LICENSE BLOCK *****
3  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4  *
5  * The contents of this file are subject to the Mozilla Public License Version
6  * 1.1 (the "License"); you may not use this file except in compliance with
7  * the License. You may obtain a copy of the License at
8  * http://www.mozilla.org/MPL/
9  *
10  * Software distributed under the License is distributed on an "AS IS" basis,
11  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12  * for the specific language governing rights and limitations under the
13  * License.
14  *
15  * The Original Code is the Feed Content Sniffer.
16  *
17  * The Initial Developer of the Original Code is Google Inc.
18  * Portions created by the Initial Developer are Copyright (C) 2006
19  * the Initial Developer. All Rights Reserved.
20  *
21  * Contributor(s):
22  * Ben Goodger <beng@google.com>
23  * Robert Sayre <sayrer@gmail.com>
24  *
25  * Alternatively, the contents of this file may be used under the terms of
26  * either the GNU General Public License Version 2 or later (the "GPL"), or
27  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28  * in which case the provisions of the GPL or the LGPL are applicable instead
29  * of those above. If you wish to allow use of your version of this file only
30  * under the terms of either the GPL or the LGPL, and not to allow others to
31  * use your version of this file under the terms of the MPL, indicate your
32  * decision by deleting the provisions above and replace them with the notice
33  * and other provisions required by the GPL or the LGPL. If you do not delete
34  * the provisions above, a recipient may use your version of this file under
35  * the terms of any one of the MPL, the GPL or the LGPL.
36  *
37  * ***** END LICENSE BLOCK ***** */
38 
39 #include "nsFeedSniffer.h"
40 
41 #include "prmem.h"
42 
43 #include "nsNetCID.h"
44 #include "nsXPCOM.h"
45 #include "nsCOMPtr.h"
46 #include "nsStringStream.h"
47 
48 #include "nsBrowserCompsCID.h"
49 
50 #include "nsICategoryManager.h"
51 #include "nsIServiceManager.h"
52 #include "nsComponentManagerUtils.h"
53 #include "nsServiceManagerUtils.h"
54 
55 #include "nsIStreamConverterService.h"
56 #include "nsIStreamConverter.h"
57 
58 #include "nsIStreamListener.h"
59 
60 #include "nsIHttpChannel.h"
61 #include "nsIMIMEHeaderParam.h"
62 
63 #include "nsMimeTypes.h"
64 
// Content types with which a server explicitly labels a response as a feed.
#define TYPE_ATOM "application/atom+xml"
#define TYPE_RSS "application/rss+xml"
// Type reported by GetMIMETypeFromContent when the response looks like a feed.
#define TYPE_MAYBE_FEED "application/vnd.mozilla.maybe.feed"

// Namespace URIs that must both appear for an <rdf:RDF> document to be
// treated as an RSS 1.0 feed.
#define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
#define NS_RSS "http://purl.org/rss/1.0/"

// Upper bound on the number of leading bytes inspected while sniffing.
#define MAX_BYTES 512
73 
75  nsIContentSniffer,
76  nsIStreamListener,
77  nsIRequestObserver)
78 
79 nsresult
80 nsFeedSniffer::ConvertEncodedData(nsIRequest* request,
81  const PRUint8* data,
82  PRUint32 length)
83 {
84  nsresult rv = NS_OK;
85 
86  mDecodedData = "";
87  nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(request));
88  if (!httpChannel)
89  return NS_ERROR_NO_INTERFACE;
90 
91  nsCAutoString contentEncoding;
92  httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("Content-Encoding"),
93  contentEncoding);
94  if (!contentEncoding.IsEmpty()) {
95  nsCOMPtr<nsIStreamConverterService> converterService(do_GetService(NS_STREAMCONVERTERSERVICE_CONTRACTID));
96  if (converterService) {
97  ToLowerCase(contentEncoding);
98 
99  nsCOMPtr<nsIStreamListener> converter;
100  rv = converterService->AsyncConvertData(contentEncoding.get(),
101  "uncompressed", this, nsnull,
102  getter_AddRefs(converter));
103  NS_ENSURE_SUCCESS(rv, rv);
104 
105  converter->OnStartRequest(request, nsnull);
106 
107  nsCOMPtr<nsIStringInputStream> rawStream =
108  do_CreateInstance(NS_STRINGINPUTSTREAM_CONTRACTID);
109  if (!rawStream)
110  return NS_ERROR_FAILURE;
111 
112  rv = rawStream->SetData((const char*)data, length);
113  NS_ENSURE_SUCCESS(rv, rv);
114 
115  rv = converter->OnDataAvailable(request, nsnull, rawStream, 0, length);
116  NS_ENSURE_SUCCESS(rv, rv);
117 
118  converter->OnStopRequest(request, nsnull, NS_OK);
119  }
120  }
121  return rv;
122 }
123 
124 template<int N>
125 static PRBool
127  const char (&aSubstring)[N])
128 {
129  return StringHead(aString, N).LowerCaseEqualsLiteral(aSubstring);
130 }
131 
132 // XXXsayrer put this in here to get on the branch with minimal delay.
133 // Trunk really needs to factor this out. This is the third usage.
134 PRBool
135 HasAttachmentDisposition(nsIHttpChannel* httpChannel)
136 {
137  if (!httpChannel)
138  return PR_FALSE;
139 
140  nsCAutoString contentDisposition;
141  nsresult rv =
142  httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("content-disposition"),
143  contentDisposition);
144 
145  if (NS_SUCCEEDED(rv) && !contentDisposition.IsEmpty()) {
146  nsCOMPtr<nsIURI> uri;
147  httpChannel->GetURI(getter_AddRefs(uri));
148  nsCOMPtr<nsIMIMEHeaderParam> mimehdrpar =
149  do_GetService(NS_MIMEHEADERPARAM_CONTRACTID, &rv);
150  if (NS_SUCCEEDED(rv))
151  {
152  nsCAutoString fallbackCharset;
153  if (uri)
154  uri->GetOriginCharset(fallbackCharset);
155  nsAutoString dispToken;
156  // Get the disposition type
157  rv = mimehdrpar->GetParameter(contentDisposition, "", fallbackCharset,
158  PR_TRUE, nsnull, dispToken);
159  // RFC 2183, section 2.8 says that an unknown disposition
160  // value should be treated as "attachment"
161  // XXXbz this code is duplicated in GetFilenameAndExtensionFromChannel in
162  // nsExternalHelperAppService. Factor it out!
163  if (NS_FAILED(rv) ||
164  (!dispToken.IsEmpty() &&
165  !StringBeginsWithLowercaseLiteral(dispToken, "inline") &&
166  // Broken sites just send
167  // Content-Disposition: filename="file"
168  // without a disposition token... screen those out.
169  !StringBeginsWithLowercaseLiteral(dispToken, "filename") &&
170  // Also in use is Content-Disposition: name="file"
171  !StringBeginsWithLowercaseLiteral(dispToken, "name")))
172  // We have a content-disposition of "attachment" or unknown
173  return PR_TRUE;
174  }
175  }
176 
177  return PR_FALSE;
178 }
179 
184 static const char*
185 FindChar(char c, const char *begin, const char *end)
186 {
187  for (; begin < end; ++begin) {
188  if (*begin == c)
189  return begin;
190  }
191  return nsnull;
192 }
193 
212 static PRBool
213 IsDocumentElement(const char *start, const char* end)
214 {
215  // For every tag in the buffer, check to see if it's a PI, Doctype or
216  // comment, our desired substring or something invalid.
217  while ( (start = FindChar('<', start, end)) ) {
218  ++start;
219  if (start >= end)
220  return PR_FALSE;
221 
222  // Check to see if the character following the '<' is either '?' or '!'
223  // (processing instruction or doctype or comment)... these are valid nodes
224  // to have in the prologue.
225  if (*start != '?' && *start != '!')
226  return PR_FALSE;
227 
228  // Now advance the iterator until the '>' (We do this because we don't want
229  // to sniff indicator substrings that are embedded within other nodes, e.g.
230  // comments: <!-- <rdf:RDF .. > -->
231  start = FindChar('>', start, end);
232  if (!start)
233  return PR_FALSE;
234 
235  ++start;
236  }
237  return PR_TRUE;
238 }
239 
250 static PRBool
251 ContainsTopLevelSubstring(nsACString& dataString, const char *substring)
252 {
253  PRInt32 offset = dataString.Find(substring);
254  if (offset == -1)
255  return PR_FALSE;
256 
257  const char *begin = dataString.BeginReading();
258 
259  // Only do the validation when we find the substring.
260  return IsDocumentElement(begin, begin + offset);
261 }
262 
263 NS_IMETHODIMP
264 nsFeedSniffer::GetMIMETypeFromContent(nsIRequest* request,
265  const PRUint8* data,
266  PRUint32 length,
267  nsACString& sniffedType)
268 {
269  nsCOMPtr<nsIHttpChannel> channel(do_QueryInterface(request));
270  if (!channel)
271  return NS_ERROR_NO_INTERFACE;
272 
273  // Check that this is a GET request, since you can't subscribe to a POST...
274  nsCAutoString method;
275  channel->GetRequestMethod(method);
276  if (!method.Equals("GET")) {
277  sniffedType.Truncate();
278  return NS_OK;
279  }
280 
281  // We need to find out if this is a load of a view-source document. In this
282  // case we do not want to override the content type, since the source display
283  // does not need to be converted from feed format to XUL. More importantly,
284  // we don't want to change the content type from something
285  // nsContentDLF::CreateInstance knows about (e.g. application/xml, text/html
286  // etc) to something that only the application fe knows about (maybe.feed)
287  // thus deactivating syntax highlighting.
288  nsCOMPtr<nsIURI> originalURI;
289  channel->GetOriginalURI(getter_AddRefs(originalURI));
290 
291  nsCAutoString scheme;
292  originalURI->GetScheme(scheme);
293  if (scheme.EqualsLiteral("view-source")) {
294  sniffedType.Truncate();
295  return NS_OK;
296  }
297 
298  // Check the Content-Type to see if it is set correctly. If it is set to
299  // something specific that we think is a reliable indication of a feed, don't
300  // bother sniffing since we assume the site maintainer knows what they're
301  // doing.
302  nsCAutoString contentType;
303  channel->GetContentType(contentType);
304  PRBool noSniff = contentType.EqualsLiteral(TYPE_RSS) ||
305  contentType.EqualsLiteral(TYPE_ATOM);
306 
307  // Check to see if this was a feed request from the location bar or from
308  // the feed: protocol. This is also a reliable indication.
309  // The value of the header doesn't matter.
310  if (!noSniff) {
311  nsCAutoString sniffHeader;
312  nsresult foundHeader =
313  channel->GetRequestHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
314  sniffHeader);
315  noSniff = NS_SUCCEEDED(foundHeader);
316  }
317 
318  if (noSniff) {
319  // check for an attachment after we have a likely feed.
320  if(HasAttachmentDisposition(channel)) {
321  sniffedType.Truncate();
322  return NS_OK;
323  }
324 
325  // set the feed header as a response header, since we have good metadata
326  // telling us that the feed is supposed to be RSS or Atom
327  channel->SetResponseHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
328  NS_LITERAL_CSTRING("1"), PR_FALSE);
329  sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
330  return NS_OK;
331  }
332 
333  // Don't sniff arbitrary types. Limit sniffing to situations that
334  // we think can reasonably arise.
335  if (!contentType.EqualsLiteral(TEXT_HTML) &&
336  !contentType.EqualsLiteral(APPLICATION_OCTET_STREAM) &&
337  // Same criterion as XMLHttpRequest. Should we be checking for "+xml"
338  // and check for text/xml and application/xml by hand instead?
339  contentType.Find("xml") == -1) {
340  sniffedType.Truncate();
341  return NS_OK;
342  }
343 
344  // Now we need to potentially decompress data served with
345  // Content-Encoding: gzip
346  nsresult rv = ConvertEncodedData(request, data, length);
347  if (NS_FAILED(rv))
348  return rv;
349 
350  const char* testData =
351  mDecodedData.IsEmpty() ? (const char*)data : mDecodedData.get();
352 
353  // The strategy here is based on that described in:
354  // http://blogs.msdn.com/rssteam/articles/PublishersGuide.aspx
355  // for interoperarbility purposes.
356 
357  // We cap the number of bytes to scan at MAX_BYTES to prevent picking up
358  // false positives by accidentally reading document content, e.g. a "how to
359  // make a feed" page.
360  if (length > MAX_BYTES)
361  length = MAX_BYTES;
362 
363  // Thus begins the actual sniffing.
364  nsDependentCSubstring dataString((const char*)testData, length);
365 
366  PRBool isFeed = PR_FALSE;
367 
368  // RSS 0.91/0.92/2.0
369  isFeed = ContainsTopLevelSubstring(dataString, "<rss");
370 
371  // Atom 1.0
372  if (!isFeed)
373  isFeed = ContainsTopLevelSubstring(dataString, "<feed");
374 
375  // RSS 1.0
376  if (!isFeed) {
377  isFeed = ContainsTopLevelSubstring(dataString, "<rdf:RDF") &&
378  dataString.Find(NS_RDF) != -1 &&
379  dataString.Find(NS_RSS) != -1;
380  }
381 
382  // If we sniffed a feed, coerce our internal type
383  if (isFeed && !HasAttachmentDisposition(channel))
384  sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
385  else
386  sniffedType.Truncate();
387  return NS_OK;
388 }
389 
390 NS_IMETHODIMP
391 nsFeedSniffer::OnStartRequest(nsIRequest* request, nsISupports* context)
392 {
393  return NS_OK;
394 }
395 
396 NS_METHOD
397 nsFeedSniffer::AppendSegmentToString(nsIInputStream* inputStream,
398  void* closure,
399  const char* rawSegment,
400  PRUint32 toOffset,
401  PRUint32 count,
402  PRUint32* writeCount)
403 {
404  nsCString* decodedData = static_cast<nsCString*>(closure);
405  decodedData->Append(rawSegment, count);
406  *writeCount = count;
407  return NS_OK;
408 }
409 
410 NS_IMETHODIMP
411 nsFeedSniffer::OnDataAvailable(nsIRequest* request, nsISupports* context,
412  nsIInputStream* stream, PRUint32 offset,
413  PRUint32 count)
414 {
415  PRUint32 read;
416  return stream->ReadSegments(AppendSegmentToString, &mDecodedData, count,
417  &read);
418 }
419 
420 NS_IMETHODIMP
421 nsFeedSniffer::OnStopRequest(nsIRequest* request, nsISupports* context,
422  nsresult status)
423 {
424  return NS_OK;
425 }
426 
427 NS_METHOD
428 nsFeedSniffer::Register(nsIComponentManager *compMgr, nsIFile *path,
429  const char *registryLocation,
430  const char *componentType,
431  const nsModuleComponentInfo *info)
432 {
433  nsresult rv;
434  nsCOMPtr<nsICategoryManager> catman = do_GetService(NS_CATEGORYMANAGER_CONTRACTID, &rv);
435  if (NS_FAILED(rv))
436  return rv;
437 
438  return catman->AddCategoryEntry(NS_CONTENT_SNIFFER_CATEGORY, "Feed Sniffer",
439  NS_FEEDSNIFFER_CONTRACTID, PR_TRUE, PR_TRUE,
440  nsnull);
441 }
function start(ch)
static PRBool StringBeginsWithLowercaseLiteral(nsAString &aString, const char(&aSubstring)[N])
#define TYPE_RSS
NS_DECL_ISUPPORTS NS_DECL_NSICONTENTSNIFFER NS_DECL_NSIREQUESTOBSERVER static NS_DECL_NSISTREAMLISTENER NS_METHOD AppendSegmentToString(nsIInputStream *inputStream, void *closure, const char *rawSegment, PRUint32 toOffset, PRUint32 count, PRUint32 *writeCount)
#define NS_RDF
return NS_OK
static NS_METHOD Register(nsIComponentManager *compMgr, nsIFile *path, const char *registryLocation, const char *componentType, const nsModuleComponentInfo *info)
PRBool HasAttachmentDisposition(nsIHttpChannel *httpChannel)
var converter
PRUint32 const nsAString & aSubstring
#define TYPE_MAYBE_FEED
#define NS_RSS
#define MAX_BYTES
PRUint32 & offset
var count
Definition: test_bug7406.js:32
static const char * FindChar(char c, const char *begin, const char *end)
static PRBool ContainsTopLevelSubstring(nsACString &dataString, const char *substring)
#define TYPE_ATOM
var uri
Definition: FeedWriter.js:1135
NS_IMPL_ISUPPORTS3(nsFeedSniffer, nsIContentSniffer, nsIStreamListener, nsIRequestObserver) nsresult nsFeedSniffer
static PRBool IsDocumentElement(const char *start, const char *end)
nsresult ConvertEncodedData(nsIRequest *request, const PRUint8 *data, PRUint32 length)
observe data
Definition: FeedWriter.js:1329
#define NS_FEEDSNIFFER_CONTRACTID