scraperSteps.js
Go to the documentation of this file.
1 // vim: set sw=2 :miv
2 //
3 // BEGIN SONGBIRD GPL
4 //
5 // This file is part of the Songbird web player.
6 //
7 // Copyright(c) 2005-2008 POTI, Inc.
8 // http://songbirdnest.com
9 //
10 // This file may be licensed under the terms of the
11 // GNU General Public License Version 2 (the "GPL").
12 //
13 // Software distributed under the License is distributed
14 // on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either
15 // express or implied. See the GPL for the specific language
16 // governing rights and limitations.
17 //
18 // You should have received a copy of the GPL along with this
19 // program. If not, go to http://www.gnu.org/licenses/gpl.html
20 // or write to the Free Software Foundation, Inc.,
21 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 //
23 // END SONGBIRD GPL
24 //
25 
26 /* Web Scraper Steps */
27 
28 /*
29  This file is part of the web scraper, a system for creating playlists
30  out of web content. The WebScraper instantiates these steps.
31 
32  The web scraper is based around an extensible architecture.
33 
34  The basic idea is that the data flows through a graph (which is strictly
35  linear at this point, though not by necessity.) Data enters via "Sources"
36  and is discarded at a "Sink". Intermediate steps examine incoming items
37  and annotate, modify, add, or discard items. Below is a sample "Identity"
38  step, which can be added with no effect on the pipeline.
39 
40  Each step is implemented as a javascript generator function, and the steps
41  below make good examples of ways in which these tools can be pieced together.
42 
43  (For more documentation on generators see:
44  http://developer.mozilla.org/en/New_in_JavaScript_1.7
45  )
46 
47  The default execution flow is such that each call of pipeline.next() should
48  introduce a single new link to the pipeline and run it through to the finish.
49 
50  Yielding something false from your step will close down the pipeline.
51 
52  \sa webScraper.js
53 */
54 
56  /*
57  * Identity
58  *
59  * A do-nothing template step for a pipeline.
60  * The "properties" starts as an anonymous object containing properties for the
61  * eventual media properties and a contextNode containing the location the properties was
62  * found.
63  * for example:
64  * { contextNode: <someNode>, SBProperties.contentURL: "http://some/file" }
65  */
66  Identity: function ScraperStep_Identity(scraper, node, pipeline) {
67  var properties;
68  while((properties = yield properties)) {
69  pipeline.send(properties);
70  }
71  },
72 
73  /* DocumentURLSource
74  * Find all URL-alike items in a document and pass them
75  * into the pipeline for further processing.
76  * Must be the first step in a pipeline, as it does not
77  * check for incoming items.
78  */
79  DocumentURLSource: function ScraperStep_DocumentURLSource(scraper, node, pipeline) {
80  if(!node){
81  yield false;
82  }
83 
84  // Use xpath instead of getElementsByTagName to preserve
85  // document order between tag types.
86  // XXX: note the '.' in front of the expressions to select a set of nodes
87  // relative to the context node.
88  var xpath = [
89  ".//@href",
90  // TODO: ultimately, some of these things really shouldn't be treated as urls
91  // particularly the param/@values which are used for random crap all the time
92  ".//embed/@src", ".//*[local-name()='embed']/@src",
93  ".//object/@data", ".//*[local-name()='object']/@data",
94  ".//param/@value", ".//*[local-name()='param']/@value",
95  ".//enclosure/@url", ".//*[local-name()='enclosure']/@url"
96  ].join('|');
97 
98  var nodeDocument;
99  if (node.ownerDocument) {
100  nodeDocument = node.ownerDocument;
101  }
102  else {
103  nodeDocument = node;
104  }
105 
106  // Yield here to defer beginning the scan while the generators start up.
107  yield true;
108 
109  var results = nodeDocument.evaluate(
110  xpath, node, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null );
111 
112  // for progress reporting.
113  scraper._total = results.snapshotLength;
114  for (var i = 0; i < results.snapshotLength; i++) {
115  let contextNode = results.snapshotItem(i);
116  let url = contextNode.value;
117 
118  // Resolve relative urls.
119  try {
120  url = nodeDocument.baseURIObject.resolve(url);
121  }
122  catch (e) {
123  url = null;
124  }
125 
126  if (url) {
127  let properties = {};
128 
129  properties.contextNode = contextNode;
130  properties[SBProperties.originURL] = properties[SBProperties.contentURL] = url;
131 
132  // Update progress. :)
133  scraper.job._progress++;
134  scraper.job.notifyJobProgressListeners();
135 
136  pipeline.send(properties);
137  }
138 
139  if (i % 10 == 0) {
140  yield true;
141  }
142  }
143  },
144 
145  CancelScrape: function ScraperStep_CancelScrape(scraper, node, pipeline) {
146  var properties;
147  while((properties = yield properties)) {
148  if (scraper._terminate) {
149  // This will throw a StopIteration exception and roll everything up.
150  return;
151  }
152  pipeline.send(properties);
153  }
154  },
155 
156  /* MediaURL
157  * a media url filter. throws out anything that doesn't look like a media URL
158  * not really a great policy, but at least it's easy to replace.
159  */
160  MediaURL: function ScraperStep_MediaURL(scraper, node, pipeline) {
161  Components.utils.import("resource://app/jsmodules/ArrayConverter.jsm");
162  var typeSniffer = Cc["@songbirdnest.com/Songbird/Mediacore/TypeSniffer;1"]
163  .createInstance(Ci.sbIMediacoreTypeSniffer);
164  var extensionsEnum = typeSniffer.mediaFileExtensions;
165  if (!Application.prefs.getValue("songbird.mediascan.enableVideo", false)) {
166  // disable video, so scan only audio - see bug 13173
167  extensionsEnum = typeSniffer.audioFileExtensions;
168  }
169  var mediaURLExtensions = [i for each (i in ArrayConverter.JSEnum(extensionsEnum))];
170  var mediaURLSchemes = ["mms", "rstp"];
171 
172  var properties;
173  while ((properties = yield properties)) {
174  var url = newURI(properties[SBProperties.contentURL])
175  if (!(url instanceof Ci.nsIURL)) {
176  continue;
177  }
178 
179  if (mediaURLExtensions.indexOf(url.fileExtension) != -1 ||
180  mediaURLSchemes.indexOf(url.scheme) != -1) {
181  pipeline.send(properties);
182  }
183  }
184  },
185 
186  /*
187  * Hacks_DropBadUrls
188  * A step used to store hacks which work around common false positives.
189  */
190  Hacks_DropBadUrls: function ScraperStep_Hacks_DropBadUrls(scraper, node, pipeline) {
191  var properties;
192  while((properties = yield properties)) {
193  var url = newURI(properties[SBProperties.contentURL])
194  if (!(url instanceof Ci.nsIURL)) {
195  continue;
196  }
197 
198  if (url.fileName.indexOf("playerID") == 0) {
199  // TODO:
200  // This hack is here because of a common Flash Player
201  // which uses a <param> value that looks like a Media URL after we resolve() it.
202  // like this example found at skr**mr:
203  // <param name="FlashVars"
204  // value='playerID=1&amp;bg=0xCDDFF3&amp;leftbg=0x357DCE&amp;
205  // lefticon=0xF2F2F2&amp; rightbg=0xF06A51&amp;rightbghover
206  // =0xAF2910&amp;righticon=0xF2F2F2&amp;righticonhover=
207  // 0xFFFFFF&amp;text=0x357DCE&amp;slider=0x357DCE&amp;track
208  // =0xFFFFFF&amp;border=0xFFFFFF&amp;loader=0xAF2910&amp;
209  // soundFile=http%3A%2F%2Fbzhrock.free.fr%2Fytrwynau1.mp3'>
210  // it would be nice if we didn't do that, but if we don't resolve()
211  // we don't pick up relative urls, and if we ignore the field altogether
212  // we risk missing out on other valid media.
213  continue;
214  }
215  pipeline.send(properties);
216  }
217  },
218 
219  /* DupeCheck
220  * Drop URLs we've already seen.
221  * Drop 'em like they're hot.
222  * Don't use this one for a web scraping session.
223  * Do use this one for a playlist parsing session.
224  */
225  DupeCheck: function ScraperStep_DupeCheck(scraper, node, pipeline) {
226  var properties;
227  while ((properties = yield properties)) {
228  if (!scraper._seenURLs[properties[SBProperties.originURL]]) {
229  pipeline.send(properties);
230  }
231  scraper._seenURLs[properties[SBProperties.originURL]] = true;
232  }
233  },
234 
235  /*
236  * AddOriginPage
237  *
238  * Adds origin page information to the properties as it passes by.
239  */
240  AddOriginPage: function ScraperStep_AddOriginPage(scraper, node, pipeline) {
241  // allow for sub-document nodes to get picked up
242  if (node.ownerDocument) {
243  node = node.ownerDocument;
244  }
245 
246  var properties;
247  while((properties = yield properties)) {
248  properties[SBProperties.originPage] = node.URL;
249  properties[SBProperties.originPageImage] = node.URL;
250  properties[SBProperties.originPageTitle] = node.title;
251  pipeline.send(properties);
252  }
253  },
254 
255 
256  CreateMediaItem: function ScraperStep_CreateMediaItem(scraper, node, pipeline, mediaList) {
257  var properties;
258  while ((properties = yield properties)) {
259  var url = newURI(properties[SBProperties.contentURL]);
260  if (!(url instanceof Ci.nsIURL)) {
261  continue;
262  }
263 
264  // Next we convert the JS object into a property-array
265  // which is used to create a media properties (which then replaces
266  // the JS object in future pipeline steps.)
267 
268  // Because we are converting the JS object into a real media properties
269  // we have to throw away the context node, as it isn't a valid media
270  // properties property.
271  delete properties.contextNode;
272 
273  var uri = newURI(properties[SBProperties.contentURL]);
274  delete properties[SBProperties.contentURL];
275  if (!uri) {
276  continue;
277  }
278 
279  if (!properties[SBProperties.trackName]) {
280  properties[SBProperties.trackName] = url.fileName;
281  }
282 
283  // set up the download button
284  properties[SBProperties.enableAutoDownload] = "1";
285  properties[SBProperties.downloadButton] = "1|0|0";
286 
287  // create a new media item, if necessary, or get an existing one for
288  // this URI
289  var mediaItem = mediaList.library.createMediaItem(
290  uri,
291  SBProperties.createArray(properties));
292  // add it to the media list, ignoring duplicates
293  if (!mediaList.contains(mediaItem)) {
294  mediaList.add(mediaItem);
295  pipeline.send(mediaItem);
296  }
297  }
298  },
299 
300  ScanForMetadata: function ScraperStep_ScanForMetadata(scraper, node, pipeline) {
301  var metadataService =
302  Components.classes["@songbirdnest.com/Songbird/FileMetadataService;1"]
303  .getService(Components.interfaces.sbIFileMetadataService);
304 
305  var mediaItem;
306  while(( mediaItem = yield mediaItem )) {
307  try {
308  // TODO: this is stupidly expensive. and verbose.
309  var mediaItemsToScan = Cc["@songbirdnest.com/moz/xpcom/threadsafe-array;1"]
310  .createInstance(Ci.nsIMutableArray);
311  mediaItemsToScan.appendElement(mediaItem, false);
312  metadataService.read(mediaItemsToScan)
313  } catch(e) {
314  Cu.reportError(e);
315  }
316  pipeline.send(mediaItem);
317  }
318  },
319 
320 
321  /* Sink
322  * A cap for a pipeline.
323  * Just absorbs any input and makes it so that your other steps
324  * don't need to do any fancy checking to see if they're the last
325  * one.
326  */
327  Sink: function ScraperStep_Sink(scraper, node, pipeline) {
328  var properties;
329  while((properties = yield properties)) {
330  // do nothing.
331  }
332  },
333 
334  /* Squawk
335  * Speak your input. A debug step.
336  * Particularly handy since Venkman doesn't seem
337  * to like setting breakpoints inside generator
338  * functions.
339  */
340  Squawk: function ScraperStep_Squawk(scraper, node, pipeline) {
341  var properties;
342  while((properties = yield properties)) {
343  var str = "";
344  for (var i in properties) {
345  str += i + ": " + properties[i] + "\n";
346  }
347  Components.utils.reportError(str);
348  pipeline.send(properties);
349  }
350  }
351 
352 };
const Cu
const Cc
var Application
Definition: sbAboutDRM.js:37
var WebScraperSteps
Definition: scraperSteps.js:55
return null
Definition: FeedWriter.js:1143
function newURI(aURLString)
let node
var uri
Definition: FeedWriter.js:1135
function url(spec)
const Ci
_getSelectedPageStyle s i