webScraper.js
Go to the documentation of this file.
1 // vim: set sw=2 :miv
2 //
3 // BEGIN SONGBIRD GPL
4 //
5 // This file is part of the Songbird web player.
6 //
7 // Copyright(c) 2005-2008 POTI, Inc.
8 // http://songbirdnest.com
9 //
10 // This file may be licensed under the terms of the
11 // GNU General Public License Version 2 (the "GPL").
12 //
13 // Software distributed under the License is distributed
14 // on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either
15 // express or implied. See the GPL for the specific language
16 // governing rights and limitations.
17 //
18 // You should have received a copy of the GPL along with this
19 // program. If not, go to http://www.gnu.org/licenses/gpl.html
20 // or write to the Free Software Foundation, Inc.,
21 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 //
23 // END SONGBIRD GPL
24 //
25 
26 /*
27  The WebScraper is a system for extracting media from web pages.
28  It is composed of a number of steps which run in the background
29  after a page's DOM content is loaded, and fills in a playlist
30  provided by the sbTabBrowserTab with any located media.
31 
32  The system is designed to be extensible, but at the moment,
33  no extensibility API is provided.
34 
35  \sa scraperSteps.js
36 */
37 
38 Components.utils.import("resource://app/jsmodules/DOMUtils.jsm");
39 
/**
 * Constructs a WebScraper that works against the given media list.
 *
 * @param medialist  Stored as-is on the instance; it is later handed to
 *                   every scraper step when a scrape is started.
 */
function WebScraper(medialist) {
  // Job object exposed as .job so the scraper can implement sbIJobProgress;
  // both counters start at zero.
  let progressJob = new SBJobUtils.JobBase();
  progressJob._progress = 0;
  progressJob._total = 0;

  this.medialist = medialist;
  this.job = progressJob;

  // Set of DOM event listeners owned by this scraper, so that they can all
  // be removed together later.
  this._domEventListenerSet = new DOMEventListenerSet();

  // Cancellation flag (initially clear) and an initially empty map of URLs.
  // NOTE(review): _seenURLs is presumably consulted by the duplicate-check
  // step in scraperSteps.js -- confirm there.
  this._terminate = false;
  this._seenURLs = {};
}
WebScraper.prototype = {
  /**
   * Start scraping the given node for media.
   *
   * Notifies the job progress listeners, builds a pipeline out of the
   * scraper steps, and hands that pipeline to pseudoThread(). When aNode is
   * a Document, a "DOMNodeInserted" listener is also registered so that
   * nodes inserted later are scraped through start() as well.
   *
   * @param aNode  The document or node to scrape. When it is missing, an
   *               error is reported and nothing else happens.
   */
  start: function WebScraper_start(aNode) {
    if (!aNode) {
      Components.utils.reportError(
        "WebScraper::start(aNode) called with nothing to scrape!\n");
      return;
    }

    let webScraper = this;
    webScraper.job.notifyJobProgressListeners();

    // TODO: This is where extensibility will come in, somehow.
    let steps = [
      WebScraperSteps.DocumentURLSource,
      WebScraperSteps.CancelScrape,
      WebScraperSteps.MediaURL,
      WebScraperSteps.Hacks_DropBadUrls,
      WebScraperSteps.DupeCheck,
      //WebScraperSteps.Squawk,
      WebScraperSteps.AddOriginPage,
      WebScraperSteps.CreateMediaItem,
      WebScraperSteps.ScanForMetadata,
      WebScraperSteps.Sink
    ];

    // Build the pipeline back to front: the last step is created first, and
    // every earlier step receives the pipeline built so far as its third
    // argument. An explicit index loop is used instead of for...in so that
    // the order is guaranteed and enumerable additions to Array.prototype
    // can never be mistaken for steps.
    let pipeline = null;
    for (let i = steps.length - 1; i >= 0; i--) {
      pipeline = steps[i](webScraper, aNode, pipeline, webScraper.medialist);
      // NOTE(review): presumably this advances the freshly created generator
      // to its first yield -- confirm against scraperSteps.js.
      pipeline.next();
    }

    // Watch for DOM nodes getting added to documents we scrape.
    if (aNode instanceof Document) {
      webScraper._domEventListenerSet.add(
        aNode,
        "DOMNodeInserted",
        function(event) { webScraper.start(event.originalTarget); },
        true);
    }

    // Begin the pipeline.
    pseudoThread(pipeline);
  },

  /**
   * Cancel scraping: remove every DOM listener registered by start() and
   * raise the _terminate flag.
   *
   * NOTE(review): the flag is not read in this file; presumably the scraper
   * steps check it to stop early -- confirm in scraperSteps.js.
   */
  cancel: function WebScraper_cancel() {
    this._domEventListenerSet.removeAll();
    this._terminate = true;
  }
};
104 
// Handy bit of reusable code:
// do cooperative multitasking using the window event handler and generators
/**
 * Drive a generator one step at a time from the window's timer queue.
 *
 * Each call advances gen once. If gen.next() returns a truthy value, another
 * call is scheduled through window.setTimeout(..., 0); a falsy value closes
 * the generator. A StopIteration thrown by the generator ends the run
 * silently; any other exception is reported through
 * Components.utils.reportError and the generator is closed.
 *
 * @param gen  Object with next() and close() methods (a legacy generator).
 */
function pseudoThread(gen) {
  try {
    if (gen.next()) {
      window.setTimeout(function() {
        try {
          pseudoThread(gen);
        }
        catch (e) {
          // Only reachable if the error handling below itself throws.
          Components.utils.reportError(e);
        }
      }, 0);
    } else {
      gen.close();
    }
  } catch (e) {
    if (e instanceof StopIteration) {
      // no worries -- the generator simply ran to completion
      return;
    }
    // Report first so the original error is not lost should close() throw.
    // Components.utils is used (not Cu) to match the rest of this file, which
    // never defines Cu itself.
    Components.utils.reportError(e);
    gen.close();
  }
}
function start(ch)
const Cu
function pseudoThread(gen)
Definition: webScraper.js:107
var event
function DOMEventListenerSet()
Definition: DOMUtils.jsm:766
let window
var WebScraperSteps
Definition: scraperSteps.js:55
function WebScraper(medialist)
Definition: webScraper.js:40
return null
Definition: FeedWriter.js:1143
var Document
_getSelectedPageStyle s i