diff --git a/phantom-scrape.js b/phantom-scrape.js index d1c1ce5..a3c580b 100644 --- a/phantom-scrape.js +++ b/phantom-scrape.js @@ -4,10 +4,14 @@ var url = system.args[1]; var readabilityPath = system.args[2]; var userAgent = system.args[3]; var consoleLogs = []; +var injectingReadabilityJS = false; // Prevent page js errors to break JSON output // XXX: should we log these instead? -phantom.onError = page.onError = function(){}; +phantom.onError = page.onError = function(err) { + if (injectingReadabilityJS) + consoleLogs.push("While injecting Readability.js - " + err) +}; function exitWithError(message) { outputJSON({error: {message: message}}); @@ -22,6 +26,14 @@ function outputJSON(object) { * Note: This function runs within page environment. */ function runReadability(url, userAgent, pageContent) { + // PhantomJS's onConsoleMessage converts all the `console.log()` parameters + // from Readability's debug output to a single string, which is not very useful + // since you get strings like "Reader: (Readability) [Object Arguments]". + // Luckily Readability will use `dump()` if we define it here. + window.dump = function(msg) { + console.log(msg.trim()); // this triggers page.onConsoleMessage below + }; + var location = document.location; var uri = { spec: location.href, @@ -31,7 +43,7 @@ function runReadability(url, userAgent, pageContent) { pathBase: location.protocol + "//" + location.host + location.pathname.substr(0, location.pathname.lastIndexOf("/") + 1) }; try { - var readabilityObj = new Readability(uri, document); + var readabilityObj = new Readability(uri, document, {debug: false}); var isProbablyReaderable = readabilityObj.isProbablyReaderable(); var result = readabilityObj.parse(); if (result) { @@ -49,7 +61,7 @@ function runReadability(url, userAgent, pageContent) { } catch (err) { return { error: { - message: err.message, + message: "runReadability - " + err.message, line: err.line, stack: err.stack, sourceHTML: pageContent || "Empty page content." @@ -82,13 +94,15 @@ page.open(url, function(status) { if (status !== "success") { return exitWithError("Unable to access " + url); } + injectingReadabilityJS = true; if (!page.injectJs(readabilityPath)) { exitWithError("Couldn't inject " + readabilityPath); } + injectingReadabilityJS = false; var result = page.evaluate(runReadability, url, page.settings.userAgent, page.content); - if (result && result.error) { + if (result.error) { result.error.consoleLogs = consoleLogs; - } else if (result && result.content) { + } else { result.consoleLogs = consoleLogs; } outputJSON(result); diff --git a/scrape.js b/scrape.js index 0e89a9e..e4d78a4 100644 --- a/scrape.js +++ b/scrape.js @@ -1,3 +1,5 @@ +// This module runs in Node, in response to the requests to the readable-proxy. + var childProcess = require("child_process"); var phantomjs = require("phantomjs-prebuilt"); var binPath = phantomjs.path; @@ -8,36 +10,52 @@ var objectAssign = require("object-assign"); var readabilityPath = process.env.READABILITY_LIB_PATH || path.normalize(path.join(__dirname, "vendor", "Readability.js")); +/** + * Runs the PhantomJS executable to process the given URL in a (headless) web + * browser context via phantom-scrape.js. + * @returns a promise with the results of running readability on the given URL. + */ module.exports = function scrape(url, options) { options = options || {}; if (!url) throw new Error("Missing url."); return new Promise(function(fulfill, reject) { var childArgs = [path.join(__dirname, "phantom-scrape.js"), url, readabilityPath]; + var execOpts = {}; if (options.userAgent) { childArgs.push(options.userAgent); } - childProcess.execFile(binPath, childArgs, function(err, stdout, stderr) { - if (err) { - return reject(err); - } + if (options.phantomJSDebug) { + childArgs.unshift("--debug=true"); + // Since the debug output may be large, use a 1MB buffer by default. + // Increase this if you get 'stderr maxBuffer exceeded'. + execOpts.maxBuffer = 1024*1024*1; + } + childProcess.execFile(binPath, childArgs, execOpts, function(err, stdout, stderr) { var response, error; - try { - response = JSON.parse(stdout); - } catch (e) { - error = { - message: "Unable to parse JSON proxy response.", - line: e.line, - stack: e.stack - }; - } - if (response && response.error) { - error = response.error; + if (err) { + error = err; + } else { + try { + response = JSON.parse(stdout); + } catch (e) { + error = { + message: "Unable to parse JSON proxy response.", + line: e.line, + stack: e.stack + }; + } + if (response && response.error) { + error = response.error; + } else if (!response && !error) { + error = new Error("Empty scraped response."); + } } + if (error) { + error.stderr = stderr; reject(objectAssign(new Error(error.message), error)); - } else if (!response) { - reject(new Error("Empty scraped response.")); } else { + response.stderr = stderr; fulfill(response); } }); diff --git a/server.js b/server.js index 0206dbd..cb28264 100644 --- a/server.js +++ b/server.js @@ -38,15 +38,20 @@ app.get("/api", function(req, res) { app.get("/api/get", function(req, res) { var url = req.query.url, sanitize = boolArg(req.query.sanitize), - userAgent = req.query.userAgent; + scrapeOptions = { + userAgent: req.query.userAgent, + phantomJSDebug: boolArg(req.query.phantomJSDebug) + }; if (!url) { return res.status(400).json({error: "Missing url parameter"}); } - scrape(url, {userAgent: userAgent}).then(function(result) { + scrape(url, scrapeOptions).then(function(result) { res.json(sanitize ? sanitizeResult(result) : result); }).catch(function(err) { console.log(err); - res.status(500).json({error: {message: err.message}}); + res.status(500).json({error: {message: err.message}, + consoleLogs: err.consoleLogs, + stderr: err.stderr}); }); }); diff --git a/static/index.html b/static/index.html index 2f93751..83a32e1 100644 --- a/static/index.html +++ b/static/index.html @@ -18,7 +18,10 @@ -
+
+
+ Jump to debug output... +
@@ -43,7 +46,7 @@

Readability.js test page

- +
@@ -73,9 +76,16 @@

Readability.js test page

-
+

Console logs

+

PhantomJS stderr

+
+ +
+

     
diff --git a/static/main.js b/static/main.js index b50db28..dac1329 100644 --- a/static/main.js +++ b/static/main.js @@ -4,19 +4,18 @@ var q = document.querySelector.bind(document); function injectReadableContents(params, target) { - q("#error").classList.add("hide"); - var req = new XMLHttpRequest(); + q("#error-container").classList.add("hide"); + var apiUrl = [ "/api/get?sanitize=" + (params.sanitize ? "yes" : "no"), "url=" + encodeURIComponent(params.url), - "userAgent=" + encodeURIComponent(params.userAgent) + "userAgent=" + encodeURIComponent(params.userAgent), + "phantomJSDebug=" + params.phantomJSDebug ].join("&"); - req.open("GET", apiUrl, false); - req.send(null); - var jsonResponse = JSON.parse(req.responseText); - if (jsonResponse.error) { - q("#error").textContent = jsonResponse.error.message; - q("#error").classList.remove("hide"); + + return new Promise(function(resolve, reject) { + q("#submit-btn").disabled = true; + q("#error").textContent = ""; q("#readerable").textContent = ""; q("#title").textContent = ""; q("#byline").textContent = ""; @@ -24,30 +23,53 @@ q("#dir").textContent = ""; q("#excerpt").textContent = ""; q("#logs").value = ""; + q("#stderr").textContent = ""; target.contentDocument.body.innerHTML = ""; - } else { - q("#error").textContent = ""; - q("#readerable").textContent = jsonResponse.isProbablyReaderable; - q("#title").textContent = jsonResponse.title; - q("#byline").textContent = jsonResponse.byline; - q("#length").textContent = jsonResponse.length; - q("#dir").textContent = jsonResponse.dir; - q("#excerpt").textContent = jsonResponse.excerpt; - q("#logs").value = (jsonResponse.consoleLogs || []).join("\n"); - target.contentDocument.body.innerHTML = jsonResponse.content; - } + + fetch(apiUrl) + .then(function(response) { + return response.json(); + }) + .then(function(jsonResponse) { + q("#stderr").textContent = jsonResponse.stderr || ""; + q("#logs").value = (jsonResponse.consoleLogs || []).join("\n"); + if (jsonResponse.error) { + throw jsonResponse.error; + } else { + q("#readerable").textContent = jsonResponse.isProbablyReaderable; + q("#title").textContent = jsonResponse.title; + q("#byline").textContent = jsonResponse.byline; + q("#length").textContent = jsonResponse.length; + q("#dir").textContent = jsonResponse.dir; + q("#excerpt").textContent = jsonResponse.excerpt; + target.contentDocument.body.innerHTML = jsonResponse.content; + } + q("#submit-btn").disabled = false; + resolve(jsonResponse); + }) + .catch(function (reason) { + q("#submit-btn").disabled = false; + q("#error").textContent = reason.message; + q("#error-container").classList.remove("hide"); + reject(reason); + }); + }); } function init() { q("form").addEventListener("submit", function(event) { event.preventDefault(); var url = q("#url").value; - q("#source").src = url; + q("#source").src = ""; injectReadableContents({ - url: url, - sanitize: q("#sanitize").checked, - userAgent: q("#userAgent").value - }, q("#target")); + url: url, + sanitize: q("#sanitize").checked, + phantomJSDebug: q("#phantomJSDebug").checked, + userAgent: q("#userAgent").value + }, q("#target")) + .then(function() { + q("#source").src = url; + }); }); } diff --git a/test/index.js b/test/index.js index bf3ea7d..8ae85bc 100644 --- a/test/index.js +++ b/test/index.js @@ -41,7 +41,7 @@ describe("Tests", function() { it("should handle rejection on process call error", function(done) { var fakeErr = new Error("Boom"); - sandbox.stub(childProcess, "execFile", function(exec, args, cb) { + sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) { cb(fakeErr); }); @@ -52,7 +52,7 @@ describe("Tests", function() { }); it("should reject on stdout json parsing failure", function(done) { - sandbox.stub(childProcess, "execFile", function(exec, args, cb) { + sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) { cb(null, "invalid.json.string"); }); @@ -63,7 +63,7 @@ describe("Tests", function() { }); it("should reject on data extraction error", function(done) { - sandbox.stub(childProcess, "execFile", function(exec, args, cb) { + sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) { cb(null, JSON.stringify({error: {message: "Foo"}})); }); @@ -75,7 +75,7 @@ describe("Tests", function() { }); it("should fulfill with a valid json result", function(done) { - sandbox.stub(childProcess, "execFile", function(exec, args, cb) { + sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) { cb(null, JSON.stringify({title: "plop", content: "plip"})); }); @@ -131,7 +131,7 @@ describe("Tests", function() { }); it("should return scraped response", function(done) { - sandbox.stub(childProcess, "execFile", function(exec, args, cb) { + sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) { cb(null, JSON.stringify({title: "plop"})); }); @@ -145,7 +145,7 @@ describe("Tests", function() { }); it("should return a server error on call error", function(done) { - sandbox.stub(childProcess, "execFile", function(exec, args, cb) { + sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) { cb(null, JSON.stringify({error: {message: "fail"}})); }); @@ -159,7 +159,7 @@ describe("Tests", function() { }); it("should apply custom user agent when provided", function(done) { - sandbox.stub(childProcess, "execFile", function(exec, args, cb) { + sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) { cb(null, "{}"); }); @@ -173,7 +173,7 @@ describe("Tests", function() { }); it("should return sanitized response when sanitize arg is passed", function(done) { - sandbox.stub(childProcess, "execFile", function(exec, args, cb) { + sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) { cb(null, JSON.stringify({content: "

plop

"})); });