diff --git a/phantom-scrape.js b/phantom-scrape.js
index d1c1ce5..a3c580b 100644
--- a/phantom-scrape.js
+++ b/phantom-scrape.js
@@ -4,10 +4,14 @@ var url = system.args[1];
var readabilityPath = system.args[2];
var userAgent = system.args[3];
var consoleLogs = [];
+var injectingReadabilityJS = false;
// Prevent page js errors to break JSON output
// XXX: should we log these instead?
-phantom.onError = page.onError = function(){};
+phantom.onError = page.onError = function(err) {
+ if (injectingReadabilityJS)
+ consoleLogs.push("While injecting Readability.js - " + err); // only errors raised while the flag is set are recorded
+};
function exitWithError(message) {
outputJSON({error: {message: message}});
@@ -22,6 +26,14 @@ function outputJSON(object) {
* Note: This function runs within page environment.
*/
function runReadability(url, userAgent, pageContent) {
+ // PhantomJS's onConsoleMessage converts all the `console.log()` parameters
+ // from Readability's debug output to a single string, which is not very useful
+ // since you get strings like "Reader: (Readability) [Object Arguments]".
+ // Luckily Readability will use `dump()` if we define it here.
+ window.dump = function(msg) {
+ console.log(String(msg).trim()); // coerce first so a non-string argument cannot throw; this triggers page.onConsoleMessage below
+ };
+
var location = document.location;
var uri = {
spec: location.href,
@@ -31,7 +43,7 @@ function runReadability(url, userAgent, pageContent) {
pathBase: location.protocol + "//" + location.host + location.pathname.substr(0, location.pathname.lastIndexOf("/") + 1)
};
try {
- var readabilityObj = new Readability(uri, document);
+ var readabilityObj = new Readability(uri, document, {debug: false});
var isProbablyReaderable = readabilityObj.isProbablyReaderable();
var result = readabilityObj.parse();
if (result) {
@@ -49,7 +61,7 @@ function runReadability(url, userAgent, pageContent) {
} catch (err) {
return {
error: {
- message: err.message,
+ message: "runReadability - " + err.message,
line: err.line,
stack: err.stack,
sourceHTML: pageContent || "Empty page content."
@@ -82,13 +94,15 @@ page.open(url, function(status) {
if (status !== "success") {
return exitWithError("Unable to access " + url);
}
+ injectingReadabilityJS = true;
if (!page.injectJs(readabilityPath)) {
exitWithError("Couldn't inject " + readabilityPath);
}
+ injectingReadabilityJS = false;
var result = page.evaluate(runReadability, url, page.settings.userAgent, page.content);
if (result && result.error) {
result.error.consoleLogs = consoleLogs;
- } else if (result && result.content) {
+ } else if (result) { // result may be empty; attach logs to any non-error result, but never dereference null
result.consoleLogs = consoleLogs;
}
outputJSON(result);
diff --git a/scrape.js b/scrape.js
index 0e89a9e..e4d78a4 100644
--- a/scrape.js
+++ b/scrape.js
@@ -1,3 +1,5 @@
+// This module runs in Node, in response to the requests to the readable-proxy.
+
var childProcess = require("child_process");
var phantomjs = require("phantomjs-prebuilt");
var binPath = phantomjs.path;
@@ -8,36 +10,52 @@ var objectAssign = require("object-assign");
var readabilityPath = process.env.READABILITY_LIB_PATH ||
path.normalize(path.join(__dirname, "vendor", "Readability.js"));
+/**
+ * Runs the PhantomJS executable to process the given URL in a (headless) web
+ * browser context via phantom-scrape.js.
+ * @returns a promise with the results of running readability on the given URL.
+ */
module.exports = function scrape(url, options) {
options = options || {};
if (!url) throw new Error("Missing url.");
return new Promise(function(fulfill, reject) {
var childArgs = [path.join(__dirname, "phantom-scrape.js"), url, readabilityPath];
+ var execOpts = {};
if (options.userAgent) {
childArgs.push(options.userAgent);
}
- childProcess.execFile(binPath, childArgs, function(err, stdout, stderr) {
- if (err) {
- return reject(err);
- }
+ if (options.phantomJSDebug) {
+ childArgs.unshift("--debug=true");
+ // Since the debug output may be large, use a 1MB buffer by default.
+ // Increase this if you get 'stderr maxBuffer exceeded'.
+ execOpts.maxBuffer = 1024*1024*1;
+ }
+ childProcess.execFile(binPath, childArgs, execOpts, function(err, stdout, stderr) {
var response, error;
- try {
- response = JSON.parse(stdout);
- } catch (e) {
- error = {
- message: "Unable to parse JSON proxy response.",
- line: e.line,
- stack: e.stack
- };
- }
- if (response && response.error) {
- error = response.error;
+ if (err) {
+ error = err;
+ } else {
+ try {
+ response = JSON.parse(stdout);
+ } catch (e) {
+ error = {
+ message: "Unable to parse JSON proxy response.",
+ line: e.line,
+ stack: e.stack
+ };
+ }
+ if (response && response.error) {
+ error = response.error;
+ } else if (!response && !error) {
+ error = new Error("Empty scraped response.");
+ }
}
+
if (error) {
+ error.stderr = stderr;
reject(objectAssign(new Error(error.message), error));
- } else if (!response) {
- reject(new Error("Empty scraped response."));
} else {
+ response.stderr = stderr;
fulfill(response);
}
});
diff --git a/server.js b/server.js
index 0206dbd..cb28264 100644
--- a/server.js
+++ b/server.js
@@ -38,15 +38,20 @@ app.get("/api", function(req, res) {
app.get("/api/get", function(req, res) {
var url = req.query.url,
sanitize = boolArg(req.query.sanitize),
- userAgent = req.query.userAgent;
+ scrapeOptions = {
+ userAgent: req.query.userAgent,
+ phantomJSDebug: boolArg(req.query.phantomJSDebug)
+ };
if (!url) {
return res.status(400).json({error: "Missing url parameter"});
}
- scrape(url, {userAgent: userAgent}).then(function(result) {
+ scrape(url, scrapeOptions).then(function(result) {
res.json(sanitize ? sanitizeResult(result) : result);
}).catch(function(err) {
console.log(err);
- res.status(500).json({error: {message: err.message}});
+ res.status(500).json({error: {message: err.message},
+ consoleLogs: err.consoleLogs,
+ stderr: err.stderr});
});
});
diff --git a/static/index.html b/static/index.html
index 2f93751..83a32e1 100644
--- a/static/index.html
+++ b/static/index.html
@@ -18,7 +18,10 @@
-
+
@@ -73,9 +76,16 @@
Readability.js test page
-
+
Console logs
+
PhantomJS stderr
+
+
+
+
diff --git a/static/main.js b/static/main.js
index b50db28..dac1329 100644
--- a/static/main.js
+++ b/static/main.js
@@ -4,19 +4,18 @@
var q = document.querySelector.bind(document);
function injectReadableContents(params, target) {
- q("#error").classList.add("hide");
- var req = new XMLHttpRequest();
+ q("#error-container").classList.add("hide");
+
var apiUrl = [
"/api/get?sanitize=" + (params.sanitize ? "yes" : "no"),
"url=" + encodeURIComponent(params.url),
- "userAgent=" + encodeURIComponent(params.userAgent)
+ "userAgent=" + encodeURIComponent(params.userAgent),
+ "phantomJSDebug=" + (params.phantomJSDebug ? "yes" : "no")
].join("&");
- req.open("GET", apiUrl, false);
- req.send(null);
- var jsonResponse = JSON.parse(req.responseText);
- if (jsonResponse.error) {
- q("#error").textContent = jsonResponse.error.message;
- q("#error").classList.remove("hide");
+
+ return new Promise(function(resolve, reject) {
+ q("#submit-btn").disabled = true;
+ q("#error").textContent = "";
q("#readerable").textContent = "";
q("#title").textContent = "";
q("#byline").textContent = "";
@@ -24,30 +23,53 @@
q("#dir").textContent = "";
q("#excerpt").textContent = "";
q("#logs").value = "";
+ q("#stderr").textContent = "";
target.contentDocument.body.innerHTML = "";
- } else {
- q("#error").textContent = "";
- q("#readerable").textContent = jsonResponse.isProbablyReaderable;
- q("#title").textContent = jsonResponse.title;
- q("#byline").textContent = jsonResponse.byline;
- q("#length").textContent = jsonResponse.length;
- q("#dir").textContent = jsonResponse.dir;
- q("#excerpt").textContent = jsonResponse.excerpt;
- q("#logs").value = (jsonResponse.consoleLogs || []).join("\n");
- target.contentDocument.body.innerHTML = jsonResponse.content;
- }
+
+ fetch(apiUrl)
+ .then(function(response) {
+ return response.json();
+ })
+ .then(function(jsonResponse) {
+ q("#stderr").textContent = jsonResponse.stderr || "";
+ q("#logs").value = (jsonResponse.consoleLogs || []).join("\n");
+ if (jsonResponse.error) {
+ throw jsonResponse.error;
+ } else {
+ q("#readerable").textContent = jsonResponse.isProbablyReaderable;
+ q("#title").textContent = jsonResponse.title;
+ q("#byline").textContent = jsonResponse.byline;
+ q("#length").textContent = jsonResponse.length;
+ q("#dir").textContent = jsonResponse.dir;
+ q("#excerpt").textContent = jsonResponse.excerpt;
+ target.contentDocument.body.innerHTML = jsonResponse.content;
+ }
+ q("#submit-btn").disabled = false;
+ resolve(jsonResponse);
+ })
+ .catch(function (reason) {
+ q("#submit-btn").disabled = false;
+ q("#error").textContent = reason.message || String(reason); // the API may send `error` as a bare string
+ q("#error-container").classList.remove("hide");
+ reject(reason);
+ });
+ });
}
function init() {
q("form").addEventListener("submit", function(event) {
event.preventDefault();
var url = q("#url").value;
- q("#source").src = url;
+ q("#source").src = "";
injectReadableContents({
- url: url,
- sanitize: q("#sanitize").checked,
- userAgent: q("#userAgent").value
- }, q("#target"));
+ url: url,
+ sanitize: q("#sanitize").checked,
+ phantomJSDebug: q("#phantomJSDebug").checked,
+ userAgent: q("#userAgent").value
+ }, q("#target"))
+ .then(function() {
+ q("#source").src = url;
+ });
});
}
diff --git a/test/index.js b/test/index.js
index bf3ea7d..8ae85bc 100644
--- a/test/index.js
+++ b/test/index.js
@@ -41,7 +41,7 @@ describe("Tests", function() {
it("should handle rejection on process call error", function(done) {
var fakeErr = new Error("Boom");
- sandbox.stub(childProcess, "execFile", function(exec, args, cb) {
+ sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) {
cb(fakeErr);
});
@@ -52,7 +52,7 @@ describe("Tests", function() {
});
it("should reject on stdout json parsing failure", function(done) {
- sandbox.stub(childProcess, "execFile", function(exec, args, cb) {
+ sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) {
cb(null, "invalid.json.string");
});
@@ -63,7 +63,7 @@ describe("Tests", function() {
});
it("should reject on data extraction error", function(done) {
- sandbox.stub(childProcess, "execFile", function(exec, args, cb) {
+ sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) {
cb(null, JSON.stringify({error: {message: "Foo"}}));
});
@@ -75,7 +75,7 @@ describe("Tests", function() {
});
it("should fulfill with a valid json result", function(done) {
- sandbox.stub(childProcess, "execFile", function(exec, args, cb) {
+ sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) {
cb(null, JSON.stringify({title: "plop", content: "plip"}));
});
@@ -131,7 +131,7 @@ describe("Tests", function() {
});
it("should return scraped response", function(done) {
- sandbox.stub(childProcess, "execFile", function(exec, args, cb) {
+ sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) {
cb(null, JSON.stringify({title: "plop"}));
});
@@ -145,7 +145,7 @@ describe("Tests", function() {
});
it("should return a server error on call error", function(done) {
- sandbox.stub(childProcess, "execFile", function(exec, args, cb) {
+ sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) {
cb(null, JSON.stringify({error: {message: "fail"}}));
});
@@ -159,7 +159,7 @@ describe("Tests", function() {
});
it("should apply custom user agent when provided", function(done) {
- sandbox.stub(childProcess, "execFile", function(exec, args, cb) {
+ sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) {
cb(null, "{}");
});
@@ -173,7 +173,7 @@ describe("Tests", function() {
});
it("should return sanitized response when sanitize arg is passed", function(done) {
- sandbox.stub(childProcess, "execFile", function(exec, args, cb) {
+ sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) {
cb(null, JSON.stringify({content: "plop
"}));
});