Skip to content
This repository was archived by the owner on Nov 10, 2017. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 19 additions & 5 deletions phantom-scrape.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,14 @@ var url = system.args[1];
var readabilityPath = system.args[2];
var userAgent = system.args[3];
var consoleLogs = [];
var injectingReadabilityJS = false;

// Prevent page JS errors from breaking the JSON output
// XXX: should we log these instead?
phantom.onError = page.onError = function(){};
phantom.onError = page.onError = function(err) {
if (injectingReadabilityJS)
consoleLogs.push("While injecting Readability.js - " + err)
};

function exitWithError(message) {
outputJSON({error: {message: message}});
Expand All @@ -22,6 +26,14 @@ function outputJSON(object) {
* Note: This function runs within page environment.
*/
function runReadability(url, userAgent, pageContent) {
// PhantomJS's onConsoleMessage converts all the `console.log()` parameters
// from Readability's debug output to a single string, which is not very useful
// since you get strings like "Reader: (Readability) [Object Arguments]".
// Luckily Readability will use `dump()` if we define it here.
window.dump = function(msg) {
console.log(msg.trim()); // this triggers page.onConsoleMessage below
};

var location = document.location;
var uri = {
spec: location.href,
Expand All @@ -31,7 +43,7 @@ function runReadability(url, userAgent, pageContent) {
pathBase: location.protocol + "//" + location.host + location.pathname.substr(0, location.pathname.lastIndexOf("/") + 1)
};
try {
var readabilityObj = new Readability(uri, document);
var readabilityObj = new Readability(uri, document, {debug: false});
var isProbablyReaderable = readabilityObj.isProbablyReaderable();
var result = readabilityObj.parse();
if (result) {
Expand All @@ -49,7 +61,7 @@ function runReadability(url, userAgent, pageContent) {
} catch (err) {
return {
error: {
message: err.message,
message: "runReadability - " + err.message,
line: err.line,
stack: err.stack,
sourceHTML: pageContent || "Empty page content."
Expand Down Expand Up @@ -82,13 +94,15 @@ page.open(url, function(status) {
if (status !== "success") {
return exitWithError("Unable to access " + url);
}
injectingReadabilityJS = true;
if (!page.injectJs(readabilityPath)) {
exitWithError("Couldn't inject " + readabilityPath);
}
injectingReadabilityJS = false;
var result = page.evaluate(runReadability, url, page.settings.userAgent, page.content);
if (result && result.error) {
if (result.error) {
result.error.consoleLogs = consoleLogs;
} else if (result && result.content) {
} else {
result.consoleLogs = consoleLogs;
}
outputJSON(result);
Expand Down
52 changes: 35 additions & 17 deletions scrape.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
// This module runs in Node, in response to the requests to the readable-proxy.

var childProcess = require("child_process");
var phantomjs = require("phantomjs-prebuilt");
var binPath = phantomjs.path;
Expand All @@ -8,36 +10,52 @@ var objectAssign = require("object-assign");
var readabilityPath = process.env.READABILITY_LIB_PATH ||
path.normalize(path.join(__dirname, "vendor", "Readability.js"));

/**
* Runs the PhantomJS executable to process the given URL in a (headless) web
* browser context via phantom-scrape.js.
* @returns a promise with the results of running readability on the given URL.
*/
module.exports = function scrape(url, options) {
options = options || {};
if (!url) throw new Error("Missing url.");
return new Promise(function(fulfill, reject) {
var childArgs = [path.join(__dirname, "phantom-scrape.js"), url, readabilityPath];
var execOpts = {};
if (options.userAgent) {
childArgs.push(options.userAgent);
}
childProcess.execFile(binPath, childArgs, function(err, stdout, stderr) {
if (err) {
return reject(err);
}
if (options.phantomJSDebug) {
childArgs.unshift("--debug=true");
// Since the debug output may be large, use a 1MB buffer by default.
// Increase this if you get 'stderr maxBuffer exceeded'.
execOpts.maxBuffer = 1024*1024*1;
}
childProcess.execFile(binPath, childArgs, execOpts, function(err, stdout, stderr) {
var response, error;
try {
response = JSON.parse(stdout);
} catch (e) {
error = {
message: "Unable to parse JSON proxy response.",
line: e.line,
stack: e.stack
};
}
if (response && response.error) {
error = response.error;
if (err) {
error = err;
} else {
try {
response = JSON.parse(stdout);
} catch (e) {
error = {
message: "Unable to parse JSON proxy response.",
line: e.line,
stack: e.stack
};
}
if (response && response.error) {
error = response.error;
} else if (!response && !error) {
error = new Error("Empty scraped response.");
}
}

if (error) {
error.stderr = stderr;
reject(objectAssign(new Error(error.message), error));
} else if (!response) {
reject(new Error("Empty scraped response."));
} else {
response.stderr = stderr;
fulfill(response);
}
});
Expand Down
11 changes: 8 additions & 3 deletions server.js
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,20 @@ app.get("/api", function(req, res) {
app.get("/api/get", function(req, res) {
var url = req.query.url,
sanitize = boolArg(req.query.sanitize),
userAgent = req.query.userAgent;
scrapeOptions = {
userAgent: req.query.userAgent,
phantomJSDebug: boolArg(req.query.phantomJSDebug)
};
if (!url) {
return res.status(400).json({error: "Missing url parameter"});
}
scrape(url, {userAgent: userAgent}).then(function(result) {
scrape(url, scrapeOptions).then(function(result) {
res.json(sanitize ? sanitizeResult(result) : result);
}).catch(function(err) {
console.log(err);
res.status(500).json({error: {message: err.message}});
res.status(500).json({error: {message: err.message},
consoleLogs: err.consoleLogs,
stderr: err.stderr});
});
});

Expand Down
16 changes: 13 additions & 3 deletions static/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@
<div class="page-header">
<h1>Readability.js <small>test page</small></h1>
</div>
<div class="alert alert-danger hide" id="error"></div>
<div class="alert alert-danger hide" id="error-container">
<div id="error"></div>
<a href="#debug">Jump to debug output...</a>
</div>
<div class="row">
<div class="col-md-6">
<form class="form-horizontal" id="form">
Expand All @@ -43,7 +46,7 @@ <h1>Readability.js <small>test page</small></h1>
</div>
<div class="form-group">
<div class="col-sm-offset-1 col-sm-10">
<input class="btn btn-info" type="submit">
<input class="btn btn-info" id="submit-btn" type="submit">
</div>
</div>
</form>
Expand Down Expand Up @@ -73,9 +76,16 @@ <h1>Readability.js <small>test page</small></h1>
</div>
</div>
</div>
<div>
<div id="debug">
<h3>Console logs</h3>
<textarea id="logs"></textarea>
<h3>PhantomJS stderr</h3>
<div class="checkbox">
<label><input type="checkbox" id="phantomJSDebug">
Enable additional debugging output from PhantomJS (useful for troubleshooting the "Unable to access..." error)
</label>
</div>
<pre id="stderr"></pre>
</div>
</div>
<script src="main.js"></script>
Expand Down
72 changes: 47 additions & 25 deletions static/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,50 +4,72 @@
var q = document.querySelector.bind(document);

function injectReadableContents(params, target) {
q("#error").classList.add("hide");
var req = new XMLHttpRequest();
q("#error-container").classList.add("hide");

var apiUrl = [
"/api/get?sanitize=" + (params.sanitize ? "yes" : "no"),
"url=" + encodeURIComponent(params.url),
"userAgent=" + encodeURIComponent(params.userAgent)
"userAgent=" + encodeURIComponent(params.userAgent),
"phantomJSDebug=" + params.phantomJSDebug
].join("&");
req.open("GET", apiUrl, false);
req.send(null);
var jsonResponse = JSON.parse(req.responseText);
if (jsonResponse.error) {
q("#error").textContent = jsonResponse.error.message;
q("#error").classList.remove("hide");

return new Promise(function(resolve, reject) {
q("#submit-btn").disabled = true;
q("#error").textContent = "";
q("#readerable").textContent = "";
q("#title").textContent = "";
q("#byline").textContent = "";
q("#length").textContent = "";
q("#dir").textContent = "";
q("#excerpt").textContent = "";
q("#logs").value = "";
q("#stderr").textContent = "";
target.contentDocument.body.innerHTML = "";
} else {
q("#error").textContent = "";
q("#readerable").textContent = jsonResponse.isProbablyReaderable;
q("#title").textContent = jsonResponse.title;
q("#byline").textContent = jsonResponse.byline;
q("#length").textContent = jsonResponse.length;
q("#dir").textContent = jsonResponse.dir;
q("#excerpt").textContent = jsonResponse.excerpt;
q("#logs").value = (jsonResponse.consoleLogs || []).join("\n");
target.contentDocument.body.innerHTML = jsonResponse.content;
}

fetch(apiUrl)
.then(function(response) {
return response.json();
})
.then(function(jsonResponse) {
q("#stderr").textContent = jsonResponse.stderr || "<stderr is empty>";
q("#logs").value = (jsonResponse.consoleLogs || []).join("\n");
if (jsonResponse.error) {
throw jsonResponse.error;
} else {
q("#readerable").textContent = jsonResponse.isProbablyReaderable;
q("#title").textContent = jsonResponse.title;
q("#byline").textContent = jsonResponse.byline;
q("#length").textContent = jsonResponse.length;
q("#dir").textContent = jsonResponse.dir;
q("#excerpt").textContent = jsonResponse.excerpt;
target.contentDocument.body.innerHTML = jsonResponse.content;
}
q("#submit-btn").disabled = false;
resolve(jsonResponse);
})
.catch(function (reason) {
q("#submit-btn").disabled = false;
q("#error").textContent = reason.message;
q("#error-container").classList.remove("hide");
reject(reason);
});
});
}

function init() {
q("form").addEventListener("submit", function(event) {
event.preventDefault();
var url = q("#url").value;
q("#source").src = url;
q("#source").src = "";
injectReadableContents({
url: url,
sanitize: q("#sanitize").checked,
userAgent: q("#userAgent").value
}, q("#target"));
url: url,
sanitize: q("#sanitize").checked,
phantomJSDebug: q("#phantomJSDebug").checked,
userAgent: q("#userAgent").value
}, q("#target"))
.then(function() {
q("#source").src = url;
});
});
}

Expand Down
16 changes: 8 additions & 8 deletions test/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ describe("Tests", function() {

it("should handle rejection on process call error", function(done) {
var fakeErr = new Error("Boom");
sandbox.stub(childProcess, "execFile", function(exec, args, cb) {
sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) {
cb(fakeErr);
});

Expand All @@ -52,7 +52,7 @@ describe("Tests", function() {
});

it("should reject on stdout json parsing failure", function(done) {
sandbox.stub(childProcess, "execFile", function(exec, args, cb) {
sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) {
cb(null, "invalid.json.string");
});

Expand All @@ -63,7 +63,7 @@ describe("Tests", function() {
});

it("should reject on data extraction error", function(done) {
sandbox.stub(childProcess, "execFile", function(exec, args, cb) {
sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) {
cb(null, JSON.stringify({error: {message: "Foo"}}));
});

Expand All @@ -75,7 +75,7 @@ describe("Tests", function() {
});

it("should fulfill with a valid json result", function(done) {
sandbox.stub(childProcess, "execFile", function(exec, args, cb) {
sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) {
cb(null, JSON.stringify({title: "plop", content: "plip"}));
});

Expand Down Expand Up @@ -131,7 +131,7 @@ describe("Tests", function() {
});

it("should return scraped response", function(done) {
sandbox.stub(childProcess, "execFile", function(exec, args, cb) {
sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) {
cb(null, JSON.stringify({title: "plop"}));
});

Expand All @@ -145,7 +145,7 @@ describe("Tests", function() {
});

it("should return a server error on call error", function(done) {
sandbox.stub(childProcess, "execFile", function(exec, args, cb) {
sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) {
cb(null, JSON.stringify({error: {message: "fail"}}));
});

Expand All @@ -159,7 +159,7 @@ describe("Tests", function() {
});

it("should apply custom user agent when provided", function(done) {
sandbox.stub(childProcess, "execFile", function(exec, args, cb) {
sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) {
cb(null, "{}");
});

Expand All @@ -173,7 +173,7 @@ describe("Tests", function() {
});

it("should return sanitized response when sanitize arg is passed", function(done) {
sandbox.stub(childProcess, "execFile", function(exec, args, cb) {
sandbox.stub(childProcess, "execFile", function(exec, args, opts, cb) {
cb(null, JSON.stringify({content: "<p><script>alert('xss')</script>plop</p>"}));
});

Expand Down