From d12aa3d042f90260ba0652ddaf2f488a50a83816 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Thu, 6 Dec 2018 11:27:03 -0600 Subject: [PATCH 001/245] Reorg profile-data.sql.js --- dbs/schemas/profile-data.sql.js | 83 +++++++++++++++++---------------- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/dbs/schemas/profile-data.sql.js b/dbs/schemas/profile-data.sql.js index 39b9090a..2c94e90c 100644 --- a/dbs/schemas/profile-data.sql.js +++ b/dbs/schemas/profile-data.sql.js @@ -46,18 +46,6 @@ CREATE TABLE archives_meta_type ( type TEXT ); --- a list of the draft-dats for a master-dat -CREATE TABLE archive_drafts ( - profileId INTEGER, - masterKey TEXT, -- key of the master dat - draftKey TEXT, -- key of the draft dat - createdAt INTEGER DEFAULT (strftime('%s', 'now')), - - isActive INTEGER, -- is this the active draft? (deprecated) - - FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE -); - CREATE TABLE bookmarks ( profileId INTEGER, url TEXT NOT NULL, @@ -72,17 +60,6 @@ CREATE TABLE bookmarks ( FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE ); -CREATE TABLE templates ( - profileId INTEGER, - url TEXT NOT NULL, - title TEXT, - screenshot, - createdAt INTEGER DEFAULT (strftime('%s', 'now')), - - PRIMARY KEY (profileId, url), - FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE -); - CREATE TABLE visits ( profileId INTEGER, url TEXT NOT NULL, @@ -102,8 +79,48 @@ CREATE TABLE visit_stats ( CREATE VIRTUAL TABLE visit_fts USING fts4 (url, title); CREATE UNIQUE INDEX visits_stats_url ON visit_stats (url); --- list of the user's installed apps +-- list of dats being looked for +CREATE TABLE watchlist ( + profileId INTEGER NOT NULL, + url TEXT NOT NULL, + description TEXT NOT NULL, + seedWhenResolved BOOLEAN NOT NULL, + resolved BOOLEAN NOT NULL DEFAULT (0), + updatedAt INTEGER DEFAULT (strftime('%s', 'now')), + createdAt INTEGER DEFAULT (strftime('%s', 'now')), + + PRIMARY KEY (profileId, url), + FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE +); + +-- list of the users current templates +-- deprecated (may return) +CREATE TABLE templates ( + profileId INTEGER, + url TEXT NOT NULL, + title TEXT, + screenshot, + createdAt INTEGER DEFAULT (strftime('%s', 'now')), + + PRIMARY KEY (profileId, url), + FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE +); + +-- a list of the draft-dats for a master-dat -- deprecated +CREATE TABLE archive_drafts ( + profileId INTEGER, + masterKey TEXT, -- key of the master dat + draftKey TEXT, -- key of the draft dat + createdAt INTEGER DEFAULT (strftime('%s', 'now')), + + isActive INTEGER, -- is this the active draft? 
(deprecated) + + FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE +); + +-- list of the users installed apps +-- deprecated (may return) CREATE TABLE apps ( profileId INTEGER NOT NULL, name TEXT NOT NULL, @@ -115,8 +132,8 @@ CREATE TABLE apps ( FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE ); --- log of the user's app installations --- deprecated +-- log of the users app installations +-- deprecated (may return) CREATE TABLE apps_log ( profileId INTEGER NOT NULL, name TEXT NOT NULL, @@ -126,20 +143,6 @@ CREATE TABLE apps_log ( FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE ); --- add a database for watchlist feature -CREATE TABLE watchlist ( - profileId INTEGER NOT NULL, - url TEXT NOT NULL, - description TEXT NOT NULL, - seedWhenResolved BOOLEAN NOT NULL, - resolved BOOLEAN NOT NULL DEFAULT (0), - updatedAt INTEGER DEFAULT (strftime('%s', 'now')), - createdAt INTEGER DEFAULT (strftime('%s', 'now')), - - PRIMARY KEY (profileId, url), - FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE -); - -- deprecated CREATE TABLE workspaces ( profileId INTEGER NOT NULL, From 0a4e01f6fcb988f029831a4f70ec0b440b43174f Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Thu, 6 Dec 2018 14:44:42 -0600 Subject: [PATCH 002/245] Add the beaker.posts and beaker.followgraph API skeletons --- crawler/followgraph.js | 16 ++++++++++++++++ crawler/posts.js | 20 ++++++++++++++++++++ web-apis/bg.js | 6 ++++++ web-apis/fg/beaker.js | 17 +++++++++++++++++ web-apis/manifests/internal/followgraph.js | 6 ++++++ web-apis/manifests/internal/posts.js | 7 +++++++ 6 files changed, 72 insertions(+) create mode 100644 crawler/followgraph.js create mode 100644 crawler/posts.js create mode 100644 web-apis/manifests/internal/followgraph.js create mode 100644 web-apis/manifests/internal/posts.js diff --git a/crawler/followgraph.js b/crawler/followgraph.js new file mode 100644 index 00000000..172d75ea --- /dev/null +++ b/crawler/followgraph.js @@ -0,0 +1,16 @@ + +exports.queryAll = function () { + throw new Error('Not yet implemented') +} + +exports.query = function () { + throw new Error('Not yet implemented') +} + +exports.follow = function () { + throw new Error('Not yet implemented') +} + +exports.unfollow = function () { + throw new Error('Not yet implemented') +} diff --git a/crawler/posts.js b/crawler/posts.js new file mode 100644 index 00000000..58341be9 --- /dev/null +++ b/crawler/posts.js @@ -0,0 +1,20 @@ + +exports.list = async function () { + throw new Error('Not yet implemented') +} + +exports.get = async function () { + throw new Error('Not yet implemented') +} + +exports.create = async function () { + throw new Error('Not yet implemented') +} + +exports.edit = async function () { + throw new Error('Not yet implemented') +} + +exports.delete = async function () { + throw new Error('Not yet implemented') +} \ No newline at end of file diff --git a/web-apis/bg.js b/web-apis/bg.js index 7b431c04..6702b4cb 100644 --- a/web-apis/bg.js +++ b/web-apis/bg.js @@ -10,6 +10,8 @@ const downloadsManifest = require('./manifests/internal/downloads') const historyManifest = require('./manifests/internal/history') const sitedataManifest = require('./manifests/internal/sitedata') const watchlistManifest = require('./manifests/internal/watchlist') +const postsManifest = require('./manifests/internal/posts') +const followgraphManifest = require('./manifests/internal/followgraph') // internal apis const archivesAPI = require('./bg/archives') @@ -17,6 +19,8 @@ const 
bookmarksAPI = require('./bg/bookmarks') const historyAPI = require('./bg/history') const sitedataAPI = require('../dbs/sitedata').WEBAPI const watchlistAPI = require('./bg/watchlist') +const postsAPI = require('../crawler/posts') +const followgraphAPI = require('../crawler/followgraph') // external manifests const datArchiveManifest = require('./manifests/external/dat-archive') @@ -50,6 +54,8 @@ exports.setup = function () { globals.rpcAPI.exportAPI('history', historyManifest, historyAPI, internalOnly) globals.rpcAPI.exportAPI('sitedata', sitedataManifest, sitedataAPI, internalOnly) globals.rpcAPI.exportAPI('watchlist', watchlistManifest, watchlistAPI, internalOnly) + globals.rpcAPI.exportAPI('posts', postsManifest, postsAPI, internalOnly) + globals.rpcAPI.exportAPI('followgraph', followgraphManifest, followgraphAPI, internalOnly) // external apis globals.rpcAPI.exportAPI('dat-archive', datArchiveManifest, datArchiveAPI, secureOnly) diff --git a/web-apis/fg/beaker.js b/web-apis/fg/beaker.js index 0d463cee..f356c463 100644 --- a/web-apis/fg/beaker.js +++ b/web-apis/fg/beaker.js @@ -8,6 +8,8 @@ const downloadsManifest = require('../manifests/internal/downloads') const historyManifest = require('../manifests/internal/history') const sitedataManifest = require('../manifests/internal/sitedata') const watchlistManifest = require('../manifests/internal/watchlist') +const postsManifest = require('../manifests/internal/posts') +const followgraphManifest = require('../manifests/internal/followgraph') exports.setup = function (rpc) { const beaker = {} @@ -22,6 +24,8 @@ exports.setup = function (rpc) { const historyRPC = rpc.importAPI('history', historyManifest, opts) const sitedataRPC = rpc.importAPI('sitedata', sitedataManifest, opts) const watchlistRPC = rpc.importAPI('watchlist', watchlistManifest, opts) + const postsRPC = rpc.importAPI('posts', postsManifest, opts) + const followgraphRPC = rpc.importAPI('followgraph', followgraphManifest, opts) // beaker.archives beaker.archives = new EventTarget() @@ -148,6 +152,19 @@ exports.setup = function (rpc) { beaker.watchlist.update = watchlistRPC.update beaker.watchlist.remove = watchlistRPC.remove beaker.watchlist.createEventsStream = () => fromEventStream(watchlistRPC.createEventsStream()) + + // beaker.posts + beaker.posts.list = postsRPC.list + beaker.posts.get = postsRPC.get + beaker.posts.create = postsRPC.create + beaker.posts.edit = postsRPC.edit + beaker.posts.delete = postsRPC.delete + + // beaker.followgraph + beaker.followgraph.queryall = followgraphRPC.queryall + beaker.followgraph.query = followgraphRPC.query + beaker.followgraph.follow = followgraphRPC.follow + beaker.followgraph.unfollow = followgraphRPC.unfollow } return beaker diff --git a/web-apis/manifests/internal/followgraph.js b/web-apis/manifests/internal/followgraph.js new file mode 100644 index 00000000..8c8549c8 --- /dev/null +++ b/web-apis/manifests/internal/followgraph.js @@ -0,0 +1,6 @@ +module.exports = { + queryAll: 'promise', + query: 'promise', + follow: 'promise', + unfollow: 'promise' +} \ No newline at end of file diff --git a/web-apis/manifests/internal/posts.js b/web-apis/manifests/internal/posts.js new file mode 100644 index 00000000..b4e21d9b --- /dev/null +++ b/web-apis/manifests/internal/posts.js @@ -0,0 +1,7 @@ +module.exports = { + list: 'promise', + get: 'promise', + create: 'promise', + edit: 'promise', + delete: 'promise' +} \ No newline at end of file From 428568f6a357d49e190307fdbbfaf4249cd755c5 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Thu, 6 
Dec 2018 19:25:08 -0600 Subject: [PATCH 003/245] Implement the majority of the crawler (WIP) --- crawler/followgraph.js | 87 ++++++++++++++- crawler/index.js | 63 +++++++++++ crawler/posts.js | 107 ++++++++++++++++++- crawler/util.js | 45 ++++++++ dat/library.js | 5 +- dbs/schemas/profile-data.sql.js | 51 +++++++++ index.js | 13 ++- users/index.js | 180 ++++++++++++++++++++++++++++++++ 8 files changed, 533 insertions(+), 18 deletions(-) create mode 100644 crawler/index.js create mode 100644 crawler/util.js create mode 100644 users/index.js diff --git a/crawler/followgraph.js b/crawler/followgraph.js index 172d75ea..6ccfab45 100644 --- a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -1,16 +1,95 @@ +const Events = require('events') +const db = require('../dbs/profile-data-db') +const {doCrawl} = require('./util') -exports.queryAll = function () { - throw new Error('Not yet implemented') +// constants +// = + +const TABLE_VERSION = 1 + +// globals +// = + +var events = new Events() + +// exported api +// = + +exports.on = events.on.bind(events) +exports.addListener = events.addListener.bind(events) +exports.removeListener = events.removeListener.bind(events) + +exports.crawlSite = async function (archive) { + return doCrawl(archive, 'crawl_followgraph', TABLE_VERSION, async ({changes, resetRequired}) => { + if (resetRequired) { + // reset all data + // TODO + } + + // find files that need to be processed + // TODO + + // process the files + // TODO + // events.emit('follow-added', sourceUrl, subjectUrl) + // events.emit('follow-removed', sourceUrl, subjectUrl) + }) } -exports.query = function () { - throw new Error('Not yet implemented') +// List urls of sites that follow subject +// - subject. String (URL). +// - returns Array +exports.listFollowers = async function (subject) { + var rows = await db.all(` + SELECT crawl_sources.url + FROM crawl_sources + INNER JOIN crawl_followgraph + ON crawl_followgraph.crawlSourceId = crawl_sources.id + AND crawl_followgraph.destUrl = ? + `, [subject]) + return rows.map(row => row.url) +} + +// List urls of sites that subject follows +// - subject. String (URL). +// - returns Array +exports.listFollows = async function (subject) { + var rows = await db.all(` + SELECT crawl_followgraph.destUrl + FROM crawl_followgraph + INNER JOIN crawl_sources + ON crawl_followgraph.crawlSourceId = crawl_sources.id + AND crawl_sources.url = ? + `, [subject]) + return rows.map(row => row.destUrl) +} + +// Check for the existence of an individual follow +// - a. String (URL), the site being queried. +// - b. String (URL), does a follow this site? +// - returns bool +exports.isAFollowingB = async function (a, b) { + var res = await db.get(` + SELECT crawl_sources.id + FROM crawl_sources + INNER JOIN crawl_followgraph + ON crawl_followgraph.crawlSourceId = crawl_sources.id + AND crawl_followgraph.destUrl = ? + WHERE crawl_sources.url = ? 
+ `, [b, a]) + return !!res } exports.follow = function () { throw new Error('Not yet implemented') + + // update the user dat + // TODO } exports.unfollow = function () { throw new Error('Not yet implemented') + + // update the user dat + // TODO } diff --git a/crawler/index.js b/crawler/index.js new file mode 100644 index 00000000..f87825f3 --- /dev/null +++ b/crawler/index.js @@ -0,0 +1,63 @@ +const _throttle = require('lodash.throttle') +const lock = require('../lib/lock') +const users = require('../users') +const dat = require('../dat') + +const posts = require('./posts') +const followgraph = require('./followgraph') + +// globals +// = + +const watches = {} + +// exported api +// = + +exports.posts = posts +exports.followgraph = followgraph + +exports.setup = async function () { +} + +exports.watchSite = async function (archive) { + if (typeof archive === 'string') { + archive = await dat.library.getOrLoadArchive() + } + + if (!(archive.url in watches)) { + const queueCrawl = _throttle(() => crawlSite(archive), 5e3) + + // watch for file changes + watches[archive.url] = archive.pda.watch() + watches[archive.url].on('data', ([event, args]) => { + if (event === 'invalidated') { + queueCrawl() + } + }) + + // run the first crawl + crawlSite(archive) + } +} + +exports.unwatchSite = async function (url) { + // stop watching for file changes + if (url in watches) { + watches[url].close() + watches[url] = null + } +} + +async function crawlSite (archive) { + var release = await lock('crawl:' + archive.url) + try { + await Promise.all([ + posts.crawlSite(archive), + followgraph.crawlSite(archive) + ]) + } finally { + release() + } +} +exports.crawlSite = crawlSite \ No newline at end of file diff --git a/crawler/posts.js b/crawler/posts.js index 58341be9..07479d75 100644 --- a/crawler/posts.js +++ b/crawler/posts.js @@ -1,20 +1,117 @@ +const assert = require('assert') +const {URL} = require('url') +const Events = require('events') +const db = require('../dbs/profile-data-db') +const {doCrawl} = require('./util') -exports.list = async function () { - throw new Error('Not yet implemented') +// constants +// = + +const TABLE_VERSION = 1 + +// globals +// = + +var events = new Events() + +// exported api +// = + +exports.on = events.on.bind(events) +exports.addListener = events.addListener.bind(events) +exports.removeListener = events.removeListener.bind(events) + +exports.crawlSite = async function (archive) { + return doCrawl(archive, 'crawl_posts', TABLE_VERSION, async ({changes, resetRequired}) => { + if (resetRequired) { + // reset all data + // TODO + } + + // find files that need to be processed + // TODO + + // process the files + // TODO + // events.emit('post-added', sourceUrl) + // events.emit('post-updated', sourceUrl) + // events.emit('post-removed', sourceUrl) + }) } -exports.get = async function () { - throw new Error('Not yet implemented') +exports.list = async function ({offset, limit, reverse, author} = {}) { + // validate & parse params + assert(!offset || typeof offset === 'number', 'Offset must be a number') + assert(!limit || typeof limit === 'number', 'Limit must be a number') + assert(!reverse || typeof reverse === 'boolean', 'Reverse must be a boolean') + assert(!author || typeof author === 'string', 'Author must be a string') + if (author) { + try { author = new URL(author) } + catch (e) { throw new Error('Failed to parse author URL: ' + author) } + } + + // build query + var query = `SELECT crawl_posts.*, src.url AS crawlSourceUrl FROM crawl_posts` + var values = [] + 
if (author) { + query += ` INNER JOIN crawl_sources src ON src.url = ?` + values.push(author.origin) + } + if (offset) { + query += ` OFFSET ?` + values.push(offset) + } + if (limit) { + query += ` LIMIT ?` + values.push(limit) + } + query += ` ORDER BY createdAt` + if (reverse) { + query += ` DESC` + } + + // execute query + return db.all(query, values) +} + +exports.get = async function (url, pathname = undefined) { + // validate & parse params + if (url) { + try { url = new URL(url) } + catch (e) { throw new Error('Failed to parse post URL: ' + url) } + } + pathname = pathname || url.pathname + + // execute query + return db.get(` + SELECT + crawl_posts.*, src.url AS crawlSourceUrl + FROM crawl_posts + INNER JOIN crawl_sources src + ON src.id = crawl_posts.crawlSourceId + AND src.url = ? + WHERE + crawl_posts.pathname = ? + `, [url.origin, pathname]) } exports.create = async function () { throw new Error('Not yet implemented') + + // update the user dat + // TODO } exports.edit = async function () { throw new Error('Not yet implemented') + + // update the user dat + // TODO } exports.delete = async function () { throw new Error('Not yet implemented') -} \ No newline at end of file + + // update the user dat + // TODO +} diff --git a/crawler/util.js b/crawler/util.js new file mode 100644 index 00000000..303f25c0 --- /dev/null +++ b/crawler/util.js @@ -0,0 +1,45 @@ +const db = require('../dbs/profile-data-db') + +exports.doCrawl = async function (archive, crawlDataset, crawlDatasetVersion, handlerFn) { + const url = archive.url + + // fetch current crawl state + var resetRequired = false + var state = await db.get(` + SELECT crawl_sources_meta.* FROM crawl_sources_meta + INNER JOIN crawl_sources ON crawl_sources.url = ? + WHERE crawl_sources_meta.crawlDataset = ? 
+ `, [url, crawlDataset]) + if (state.crawlDatasetVersion !== crawlDatasetVersion) { + resetRequired = true + state = null + } + if (!state) { + // new state + state = { + crawlSourceId: null, + url, + crawlDataset, + crawlDatasetVersion, + updatedAt: 0 + } + } + + // fetch current archive version + // TODO + + // fetch change log + var changes = [] // TODO + + // handle changes + await handlerFn({changes, resetRequired}) + + if (!state.crawlSourceId) { + // upsert crawl source + // TODO + } + + // upsert crawl state + state.updatedAt = Date.now() + // TODO +} \ No newline at end of file diff --git a/dat/library.js b/dat/library.js index e17dfb38..1cb63df5 100644 --- a/dat/library.js +++ b/dat/library.js @@ -5,7 +5,7 @@ const pify = require('pify') const pda = require('pauls-dat-api') const signatures = require('sodium-signatures') const parseDatURL = require('parse-dat-url') -const debounce = require('lodash.debounce') +const _debounce = require('lodash.debounce') const mkdirp = require('mkdirp') // dbs @@ -312,7 +312,7 @@ async function loadArchiveInner (key, secretKey, userSettings = null) { await pullLatestArchiveMeta(archive) // wire up events - archive.pullLatestArchiveMeta = debounce(opts => pullLatestArchiveMeta(archive, opts), 1e3) + archive.pullLatestArchiveMeta = _debounce(opts => pullLatestArchiveMeta(archive, opts), 1e3) archive.fileActStream = archive.pda.watch() archive.fileActStream.on('data', ([event, {path}]) => { if (event === 'changed') { @@ -535,6 +535,7 @@ function createArchiveProxy (key, version, archiveInfo) { const pdaStat = makeArchiveProxyPDAPromiseFn(key, version, 'stat') return { key: datEncoding.toBuf(key), + url: `dat://${key}`, discoveryKey: datEncoding.toBuf(archiveInfo.discoveryKey), writable: archiveInfo.writable, diff --git a/dbs/schemas/profile-data.sql.js b/dbs/schemas/profile-data.sql.js index 2c94e90c..46cbd6a7 100644 --- a/dbs/schemas/profile-data.sql.js +++ b/dbs/schemas/profile-data.sql.js @@ -5,6 +5,13 @@ CREATE TABLE profiles ( createdAt INTEGER DEFAULT (strftime('%s', 'now')) ); +CREATE TABLE users ( + id INTEGER PRIMARY KEY NOT NULL, + url TEXT, + isDefault INTEGER DEFAULT 0, + createdAt INTEGER +); + CREATE TABLE archives ( profileId INTEGER NOT NULL, key TEXT NOT NULL, -- dat key @@ -93,6 +100,50 @@ CREATE TABLE watchlist ( FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE ); +-- list of sites being crawled +CREATE TABLE crawl_sources ( + id INTEGER PRIMARY KEY NOT NULL, + url TEXT NOT NULL +); + +-- tracking information on the crawl-state of the sources +CREATE TABLE crawl_sources_meta ( + crawlSourceId INTEGER NOT NULL, + crawlSourceVersion INTEGER NOT NULL, + crawlDataset TEXT NOT NULL, + crawlDatasetVersion INTEGER NOT NULL, + updatedAt INTEGER DEFAULT, + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); + +-- crawled posts +CREATE TABLE crawl_posts ( + crawlSourceId INTEGER NOT NULL, + pathname TEXT NOT NULL, + + type TEXT NOT NULL, + content TEXT, + + createdAt INTEGER DEFAULT (strftime('%s', 'now')), + updatedAt INTEGER DEFAULT (strftime('%s', 'now')), + crawledAt INTEGER DEFAULT (strftime('%s', 'now')), + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); + +-- crawled follows +CREATE TABLE crawl_followgraph ( + crawlSourceId INTEGER NOT NULL, + + destUrl TEXT NOT NULL, + + updatedAt INTEGER DEFAULT (strftime('%s', 'now')), + crawledAt INTEGER DEFAULT (strftime('%s', 'now')), + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE 
+); + -- list of the users current templates -- deprecated (may return) CREATE TABLE templates ( diff --git a/index.js b/index.js index decbfc1e..7f46ea90 100644 --- a/index.js +++ b/index.js @@ -5,6 +5,8 @@ const globals = require('./globals') const {getEnvVar} = require('./lib/env') const dat = require('./dat') const dbs = require('./dbs') +const users = require('./users') +const crawler = require('./crawler') const webapis = require('./web-apis/bg') const spellChecker = require('./web-apis/bg/spell-checker') const spellCheckerLib = require('./lib/spell-checker') @@ -45,16 +47,13 @@ module.exports = { } } - // setup dat + // start subsystems + // (order is important) await dat.library.setup(opts) - - // setup watchlist await dat.watchlist.setup() - - // setup web apis + await crawler.setup(opts) + await users.setup(opts) webapis.setup(opts) - - // setup spellchecker spellCheckerLib.setup() } } diff --git a/users/index.js b/users/index.js new file mode 100644 index 00000000..da8b72b4 --- /dev/null +++ b/users/index.js @@ -0,0 +1,180 @@ +const Events = require('events') +const dat = require('../dat') +const crawler = require('../crawler') +const db = require('../dbs/profile-data-db') +const archivesDb = require('../dbs/archives') +const debug = require('../lib/debug-logger').debugLogger('users') + +// globals +// = + +var events = new Events() +var users + +// exported api +// = + +exports.on = events.on.bind(events) +exports.addListener = events.addListener.bind(events) +exports.removeListener = events.removeListener.bind(events) + +exports.setup = async function () { + // wire up events + crawler.followgraph.on('follow-added', onFollowAdded) + crawler.followgraph.on('follow-removed', onFollowRemoved) + + // load the current users + users = await db.all(`SELECT * FROM users`) + users.forEach(async (user) => { + // massage data + user.archive = null + user.isDefault = Boolean(user.isDefault) + user.createdAt = new Date(user.createdAt) + + // fetch the user archive + try { + await validateUserUrl(user.url) + user.archive = await dat.library.getOrLoadArchive(user.url) + watchUser(user) + events.emit('load-user', user) + } catch (err) { + debug('Failed to load user', {user, err}) + } + }) +} + +exports.getAll = async function () { + return Promise.all(users.map(fetchUserInfo)) +} + +const get = +exports.get = async function (url) { + var user = users.find(user => user.url === url) + if (!user) return null + return await fetchUserInfo(user) +} + +const getDefault = +exports.getDefault = async function (url = undefined) { + var user = users.find(user => user.isDefault === true) + if (!user) return null + return await fetchUserInfo(user) +} + +exports.add = async function (url) { + // make sure the user doesnt already exist + var existingUser = await get(url) + if (existingUser) return + + // validate + await validateUserUrl(url) + + // create the new user + var user = { + url, + archive: null, + isDefault: users.length === 0, + createdAt: Date.now() + } + await db.run( + `INSERT INTO users (url, isDefault, createdAt) VALUES (?, ?, ?)`, + [user.url, Number(user.isDefault), user.createdAt] + ) + + // fetch the user archive + user.archive = await dat.library.getOrLoadArchive(user.url) + watchUser(user) + events.emit('load-user', user) +} + +exports.remove = async function (url) { + // get the user + var user = await get(url) + if (!user) return + + // remove the user + users.splice(users.indexOf(user), 1) + await db.run(`DELETE FROM users WHERE url = ?`, [user.url]) + unwatchUser(user) + 
events.emit('unload-user', user) +} + +// internal methods +// = + +async function isUser (url) { + return !!(await get(url)) +} + +async function watchUser (user) { + // watch the user + await crawler.watchSite(user.archive) + + // watch anybody the user follows + var followUrls = await crawler.followgraph.listFollows(user.url) + followUrls.forEach(async (followUrl) => { + try { + await crawler.watchSite(followUrl) + } catch (err) { + debug('Failed to sync followed user', {url: followUrl, err}) + } + }) +} + +async function unwatchUser (user) { + // unwatch anybody the user follows + + // BUG This will cause glitches if there are any shared follows between 2 local users (which is likely) + // sites will be unwatched when they shouldn't be + // this is temporary and will fix itself when beaker restarts + // -prf + + var followUrls = await crawler.followgraph.listFollows(user.url) + followUrls.forEach(crawler.unwatchSite) + + // unwatch the user + await crawler.unwatchSite(user.url) +} + +async function onFollowAdded (sourceUrl, subjectUrl) { + if (isUser(sourceUrl)) { + try { + await crawler.watchSite(subjectUrl) + } catch (err) { + debug('Failed to sync followed user', {url: subjectUrl, err}) + } + } +} + +async function onFollowRemoved (sourceUrl, subjectUrl) { + if (isUser(sourceUrl)) { + await crawler.unwatchSite(subjectUrl) + } +} + +async function fetchUserInfo (user) { + var urlp = new URL(user.url) + var meta = await archivesDb.getMeta(urlp.hostname) + return { + url: user.url, + isDefault: user.isDefault, + title: meta.title, + description: meta.description, + createdAt: user.createdAt + } +} + +async function validateUserUrl (url) { + // make sure the archive is saved and that we own the archive + var urlp = new URL(url) + var [meta, userSettings] = await Promise.all([ + archivesDb.getMeta(urlp.hostname), + archivesDb.getUserSettings(urlp.hostname) + ]) + if (!meta.isOwner) { + throw new Error('User dat is not owned by this device') + } + if (!userSettings.isSaved) { + throw new Error('User dat has been deleted') + } +} From 36e1f9e7397cebb902fa3d6202c00f85bc97185f Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Thu, 6 Dec 2018 21:55:49 -0600 Subject: [PATCH 004/245] Implement post and follows crawlers --- crawler/followgraph.js | 65 +++++++++++++++++++++---- crawler/index.js | 9 +++- crawler/posts.js | 85 ++++++++++++++++++++++++++++----- crawler/util.js | 4 ++ dbs/schemas/profile-data.sql.js | 13 ++--- package.json | 1 + users/index.js | 8 ++++ 7 files changed, 154 insertions(+), 31 deletions(-) diff --git a/crawler/followgraph.js b/crawler/followgraph.js index 6ccfab45..f75d7a44 100644 --- a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -1,11 +1,15 @@ +const assert = require('assert') +const _difference = require('lodash.difference') const Events = require('events') const db = require('../dbs/profile-data-db') -const {doCrawl} = require('./util') +const {doCrawl, doCheckpoint} = require('./util') // constants // = const TABLE_VERSION = 1 +const JSON_TYPE = 'unwalled.garden/follows' +const JSON_PATH = '/data/follows.json' // globals // = @@ -19,20 +23,61 @@ exports.on = events.on.bind(events) exports.addListener = events.addListener.bind(events) exports.removeListener = events.removeListener.bind(events) -exports.crawlSite = async function (archive) { +exports.crawlSite = async function (archive, crawlSourceId) { return doCrawl(archive, 'crawl_followgraph', TABLE_VERSION, async ({changes, resetRequired}) => { + const supressEvents = resetRequired === true // dont emit 
when replaying old info if (resetRequired) { // reset all data - // TODO + await db.run(` + DELETE FROM crawl_followgraph WHERE crawlSourceId = ? + `, [crawlSourceId]) + await doCheckpoint('crawl_followgraph', TABLE_VERSION, crawlSourceId, 0) } - // find files that need to be processed - // TODO + // did follows.json change? + var change = changes.find(c => c.path === JSON_PATH) + if (!change) { + return + } + + // read and validate + try { + var followsJson = JSON.parse(await archive.pda.readFile(JSON_PATH, 'utf8')) + assert(typeof followsJson === 'object', 'File be an object') + assert(followsJson.type === 'unwalled.garden/follows', 'JSON type must be unwalled.garden/follows') + assert(Array.isArray(followsJson.follows), 'JSON follows must be an array of strings') + followsJson.follows = followsJson.follows.filter(v => typeof v === 'string') + } catch (err) { + debug('Failed to read follows file', {url: archive.url, err}) + return + } + + // diff against the current follows + var currentFollows = await listFollows(archive) + var newFollows = followsJson.urls + var adds = _difference(newFollows, currentFollows) + var removes = _difference(currentFollows, newFollows) + + // write updates + for (let add of adds) { + await db.run(` + INSERT INTO crawl_followgraph (crawlSourceId, destUrl, crawledAt) VALUES (?, ?, ?) + `, [crawlSourceId, add, Date.now()]) + if (!supressEvents) { + events.emit('follow-added', archive.url, add) + } + } + for (let remove of removes) { + await db.run(` + DELETE FROM crawl_followgraph WHERE crawlSourceId = ? AND destUrl = ? + `, [crawlSourceId, remove]) + if (supressEvents) { + events.emit('follow-removed', archive.url, add) + } + } - // process the files - // TODO - // events.emit('follow-added', sourceUrl, subjectUrl) - // events.emit('follow-removed', sourceUrl, subjectUrl) + // write checkpoint as success + await doCheckpoint('crawl_followgraph', TABLE_VERSION, crawlSourceId, changes[changes.length - 1].version) }) } @@ -53,7 +98,7 @@ exports.listFollowers = async function (subject) { // List urls of sites that subject follows // - subject. String (URL). 
// - returns Array -exports.listFollows = async function (subject) { +const listFollows = exports.listFollows = async function (subject) { var rows = await db.all(` SELECT crawl_followgraph.destUrl FROM crawl_followgraph diff --git a/crawler/index.js b/crawler/index.js index f87825f3..01ce5de9 100644 --- a/crawler/index.js +++ b/crawler/index.js @@ -52,9 +52,14 @@ exports.unwatchSite = async function (url) { async function crawlSite (archive) { var release = await lock('crawl:' + archive.url) try { + // insert crawl source + // TODO + var crawlSourceId = // TODO + + // crawl individual sources await Promise.all([ - posts.crawlSite(archive), - followgraph.crawlSite(archive) + posts.crawlSite(archive, crawlSourceId), + followgraph.crawlSite(archive, crawlSourceId) ]) } finally { release() diff --git a/crawler/posts.js b/crawler/posts.js index 07479d75..a1eed7da 100644 --- a/crawler/posts.js +++ b/crawler/posts.js @@ -8,6 +8,8 @@ const {doCrawl} = require('./util') // = const TABLE_VERSION = 1 +const JSON_TYPE = 'unwalled.garden/post' +const JSON_PATH_REGEX = /^\/data\/posts\/([^\/]+)\.json$/i // globals // = @@ -21,21 +23,82 @@ exports.on = events.on.bind(events) exports.addListener = events.addListener.bind(events) exports.removeListener = events.removeListener.bind(events) -exports.crawlSite = async function (archive) { +exports.crawlSite = async function (archive, crawlSourceId) { return doCrawl(archive, 'crawl_posts', TABLE_VERSION, async ({changes, resetRequired}) => { + const supressEvents = resetRequired === true // dont emit when replaying old info if (resetRequired) { // reset all data - // TODO + await db.run(` + DELETE FROM crawl_posts WHERE crawlSourceId = ? + `, [crawlSourceId]) + await doCheckpoint('crawl_posts', TABLE_VERSION, crawlSourceId, 0) } - // find files that need to be processed - // TODO - - // process the files - // TODO - // events.emit('post-added', sourceUrl) - // events.emit('post-updated', sourceUrl) - // events.emit('post-removed', sourceUrl) + // collect changed posts + var changedPosts = [] // order matters, must be oldest to newest + changes.forEach(c => { + if (JSON_PATH_REGEX.test(c.path)) { + let i = changedPosts.findIndex(c2 => c2.path === c.path) + if (i) { + changedPosts.splice(i, 1) // remove from old position + } + changedPosts.push(c) + } + }) + + // read and apply each post in order + for (let changedPost of changedPosts) { + // TODO Currently the crawler will abort reading the feed if any post fails to load + // this means that a single bad or unreachable file can stop the forward progress of post indexing + // to solve this, we need to find a way to tolerate bad post-files without losing our ability to efficiently detect new posts + // -prf + if (changedPost.type === 'del') { + // delete + await db.run(` + DELETE FROM crawl_posts WHERE crawlSourceId = ? AND pathname = ? 
+ `, [crawlSourceId, changedPost.path]) + events.emit('post-removed', archive.url) + } else { + // read and validate + let post + try { + post = JSON.parse(await archive.pda.readFile(changedPost.path, 'utf8')) + assert(typeof post === 'object', 'File be an object') + assert(post.type === 'unwalled.garden/post', 'JSON type must be unwalled.garden/post') + assert(typeof post.content === 'string', 'JSON content must be a string') + assert(typeof post.createdAt === 'string', 'JSON createdAt must be a date-time') + assert(!isNaN(Number(new Date(post.createdAt))), 'JSON createdAt must be a date-time') + } catch (err) { + debug('Failed to read post file', {url: archive.url, path: c.path, err}) + return // abort indexing + } + + // massage the post + post.createdAt = Number(new Date(post.createdAt)) + post.updatedAt = Number(new Date(post.updatedAt)) + if (isNaN(post.updatedAt)) post.updatedAt = 0 // value is optional + + // upsert + let existingPost = await get(archive.url, c.path) + if (existingPost) { + await db.run(` + UPDATE crawl_posts + SET crawledAt = ?, content = ?, createdAt = ?, updatedAt = ? + WHERE crawlSourceId = ? AND pathname = ? + `, [Date.now(), post.content, post.createdAt, post.updatedAt, crawlSourceId, changedPost.path]) + events.emit('post-updated', archive.url) + } else { + await db.run(` + INSERT INTO crawl_posts (crawlSourceId, pathname, crawledAt, content, createdAt, updatedAt) + VALUES (?, ?, ?, ?, ?, ?) + `, [crawlSourceId, changedPost.path, Date.now(), post.content, post.createdAt, post.updatedAt]) + events.emit('post-added', archive.url) + } + + // checkpoint our progress + await doCheckpoint('crawl_posts', TABLE_VERSION, crawlSourceId, changedPost.version) + } + } }) } @@ -74,7 +137,7 @@ exports.list = async function ({offset, limit, reverse, author} = {}) { return db.all(query, values) } -exports.get = async function (url, pathname = undefined) { +const get = exports.get = async function (url, pathname = undefined) { // validate & parse params if (url) { try { url = new URL(url) } diff --git a/crawler/util.js b/crawler/util.js index 303f25c0..59875905 100644 --- a/crawler/util.js +++ b/crawler/util.js @@ -42,4 +42,8 @@ exports.doCrawl = async function (archive, crawlDataset, crawlDatasetVersion, ha // upsert crawl state state.updatedAt = Date.now() // TODO +} + +exports.doCheckpoint = async function (crawlDataset, crawlDatasetVersion, crawlSourceId, crawlSourceVersion) { + // TODO } \ No newline at end of file diff --git a/dbs/schemas/profile-data.sql.js b/dbs/schemas/profile-data.sql.js index 46cbd6a7..e323dfa6 100644 --- a/dbs/schemas/profile-data.sql.js +++ b/dbs/schemas/profile-data.sql.js @@ -121,13 +121,11 @@ CREATE TABLE crawl_sources_meta ( CREATE TABLE crawl_posts ( crawlSourceId INTEGER NOT NULL, pathname TEXT NOT NULL, + crawledAt INTEGER, - type TEXT NOT NULL, content TEXT, - - createdAt INTEGER DEFAULT (strftime('%s', 'now')), - updatedAt INTEGER DEFAULT (strftime('%s', 'now')), - crawledAt INTEGER DEFAULT (strftime('%s', 'now')), + createdAt INTEGER, + updatedAt INTEGER, FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE ); @@ -135,12 +133,11 @@ CREATE TABLE crawl_posts ( -- crawled follows CREATE TABLE crawl_followgraph ( crawlSourceId INTEGER NOT NULL, + crawledAt INTEGER, destUrl TEXT NOT NULL, - updatedAt INTEGER DEFAULT (strftime('%s', 'now')), - crawledAt INTEGER DEFAULT (strftime('%s', 'now')), - + PRIMARY KEY (crawlSourceId, destUrl), FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE ); diff 
--git a/package.json b/package.json index 9889a3bd..502fd75f 100644 --- a/package.json +++ b/package.json @@ -51,6 +51,7 @@ "identify-filetype": "^1.0.0", "into-stream": "^3.1.0", "lodash.debounce": "^4.0.8", + "lodash.difference": "^4.5.0", "lodash.get": "^4.4.2", "lodash.isequal": "^4.5.0", "lodash.pick": "^4.4.0", diff --git a/users/index.js b/users/index.js index da8b72b4..2fae2f61 100644 --- a/users/index.js +++ b/users/index.js @@ -5,6 +5,11 @@ const db = require('../dbs/profile-data-db') const archivesDb = require('../dbs/archives') const debug = require('../lib/debug-logger').debugLogger('users') +// constants +// = + +const SITE_TYPE = 'unwalled.garden/user' + // globals // = @@ -174,6 +179,9 @@ async function validateUserUrl (url) { if (!meta.isOwner) { throw new Error('User dat is not owned by this device') } + if (!meta.type.includes(SITE_TYPE)) { + throw new Error('User dat is not the correct type') + } if (!userSettings.isSaved) { throw new Error('User dat has been deleted') } From f592ca268630b201d7350922ee251d0885950f38 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Fri, 7 Dec 2018 20:32:35 -0600 Subject: [PATCH 005/245] Add write methods to crawler posts and followgraph --- crawler/followgraph.js | 76 ++++++++++++++++++++++++++++++++++-------- crawler/posts.js | 43 ++++++++++++++---------- crawler/util.js | 10 ++++++ 3 files changed, 98 insertions(+), 31 deletions(-) diff --git a/crawler/followgraph.js b/crawler/followgraph.js index f75d7a44..742ebaef 100644 --- a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -1,8 +1,10 @@ const assert = require('assert') const _difference = require('lodash.difference') const Events = require('events') +const lock = require('../lib/lock') const db = require('../dbs/profile-data-db') const {doCrawl, doCheckpoint} = require('./util') +const debug = require('../lib/debug-logger').debugLogger('crawler') // constants // = @@ -42,11 +44,7 @@ exports.crawlSite = async function (archive, crawlSourceId) { // read and validate try { - var followsJson = JSON.parse(await archive.pda.readFile(JSON_PATH, 'utf8')) - assert(typeof followsJson === 'object', 'File be an object') - assert(followsJson.type === 'unwalled.garden/follows', 'JSON type must be unwalled.garden/follows') - assert(Array.isArray(followsJson.follows), 'JSON follows must be an array of strings') - followsJson.follows = followsJson.follows.filter(v => typeof v === 'string') + var followsJson = await readFollowsFile(archive) } catch (err) { debug('Failed to read follows file', {url: archive.url, err}) return @@ -72,7 +70,7 @@ exports.crawlSite = async function (archive, crawlSourceId) { DELETE FROM crawl_followgraph WHERE crawlSourceId = ? AND destUrl = ? 
`, [crawlSourceId, remove]) if (supressEvents) { - events.emit('follow-removed', archive.url, add) + events.emit('follow-removed', archive.url, remove) } } @@ -125,16 +123,68 @@ exports.isAFollowingB = async function (a, b) { return !!res } -exports.follow = function () { - throw new Error('Not yet implemented') +exports.follow = function (archive, followUrl) { + // normalize followUrl + // TODO + assert(typeof followUrl === 'string', 'Follow() must be given a valid URL') + + return updateFollowsFile(archive, followsJson => { + if (!followsJson.urls.find(v => v === followUrl)) { + followsJson.urls.push(followUrl) + } + }) +} - // update the user dat +exports.unfollow = function (archive, followUrl) { + // normalize followUrl // TODO + assert(typeof followUrl === 'string', 'Unollow() must be given a valid URL') + + return updateFollowsFile(archive, followsJson => { + var i = followsJson.urls.findIndex(v => v === followUrl) + if (i !== -1) { + followsJson.urls.splice(i, 1) + } + }) +} + +// internal methods +// = + +async function readFollowsFile (archive) { + var followsJson = JSON.parse(await archive.pda.readFile(JSON_PATH, 'utf8')) + assert(typeof followsJson === 'object', 'File be an object') + assert(followsJson.type === JSON_TYPE, 'JSON type must be unwalled.garden/follows') + assert(Array.isArray(followsJson.follows), 'JSON follows must be an array of strings') + followsJson.follows = followsJson.follows.filter(v => typeof v === 'string') + return followsJson } -exports.unfollow = function () { - throw new Error('Not yet implemented') +async function updateFollowsFile (archive, updateFn) { + var release = await lock('crawler:followgraph:' + archive.url) + try { + // read the follows file + try { + var followsJson = await readFollowsFile(archive) + } catch (err) { + if (err.notFound) { + // create new + followsJson = { + type: JSON_TYPE, + urls: [] + } + } else { + debug('Failed to read follows file', {url: archive.url, err}) + throw err + } + } - // update the user dat - // TODO + // apply update + updateFn(followsJson) + + // write the follows file + await archive.pda.readFile(JSON_PATH, JSON.stringify(followsJson), 'utf8') + } finally { + release() + } } diff --git a/crawler/posts.js b/crawler/posts.js index a1eed7da..9c2818d1 100644 --- a/crawler/posts.js +++ b/crawler/posts.js @@ -2,7 +2,8 @@ const assert = require('assert') const {URL} = require('url') const Events = require('events') const db = require('../dbs/profile-data-db') -const {doCrawl} = require('./util') +const {doCrawl, doCheckpoint, generateTimeFilename} = require('./util') +const debug = require('../lib/debug-logger').debugLogger('crawler') // constants // = @@ -69,7 +70,7 @@ exports.crawlSite = async function (archive, crawlSourceId) { assert(typeof post.createdAt === 'string', 'JSON createdAt must be a date-time') assert(!isNaN(Number(new Date(post.createdAt))), 'JSON createdAt must be a date-time') } catch (err) { - debug('Failed to read post file', {url: archive.url, path: c.path, err}) + debug('Failed to read post file', {url: archive.url, path: changedPost.path, err}) return // abort indexing } @@ -79,7 +80,7 @@ exports.crawlSite = async function (archive, crawlSourceId) { if (isNaN(post.updatedAt)) post.updatedAt = 0 // value is optional // upsert - let existingPost = await get(archive.url, c.path) + let existingPost = await get(archive.url, changedPost.path) if (existingPost) { await db.run(` UPDATE crawl_posts @@ -158,23 +159,29 @@ const get = exports.get = async function (url, pathname = undefined) { `, 
[url.origin, pathname]) } -exports.create = async function () { - throw new Error('Not yet implemented') - - // update the user dat - // TODO +exports.create = async function (archive, {content} = {}) { + assert(typeof content === 'string', 'Create() must be provided a `content` string') + var filename = generateTimeFilename() + await archive.writeFile(`/posts/${filename}.json`, JSON.stringify({ + type: JSON_TYPE, + content, + createdAt: (new Date()).toISOString() + })) } -exports.edit = async function () { - throw new Error('Not yet implemented') - - // update the user dat - // TODO +exports.edit = async function (archive, pathname, {content} = {}) { + assert(typeof pathname === 'string', 'Edit() must be provided a valid URL string') + assert(typeof content === 'string', 'Edit() must be provided a `content` string') + var oldJson = JSON.parse(await archive.readFile(pathname)) + await archive.writeFile(pathname, JSON.stringify({ + type: JSON_TYPE, + content, + createdAt: oldJson.createdAt, + updatedAt: (new Date()).toISOString() + })) } -exports.delete = async function () { - throw new Error('Not yet implemented') - - // update the user dat - // TODO +exports.delete = async function (archive, pathname) { + assert(typeof pathname === 'string', 'Delete() must be provided a valid URL string') + await archive.unlink(pathname) } diff --git a/crawler/util.js b/crawler/util.js index 59875905..f201aeb2 100644 --- a/crawler/util.js +++ b/crawler/util.js @@ -46,4 +46,14 @@ exports.doCrawl = async function (archive, crawlDataset, crawlDatasetVersion, ha exports.doCheckpoint = async function (crawlDataset, crawlDatasetVersion, crawlSourceId, crawlSourceVersion) { // TODO +} + +var _lastGeneratedTimeFilename +exports.generateTimeFilename = function () { + var d = Date.now() + if (d === _lastGeneratedTimeFilename) { + d++ + } + _lastGeneratedTimeFilename = d + return (new Date(d)).toISOString() } \ No newline at end of file From bd112319b1839f0c356c26fdb256f6db94f9fa0e Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sat, 8 Dec 2018 15:21:35 -0600 Subject: [PATCH 006/245] Implement remaining crawler logic --- crawler/followgraph.js | 18 ++++++++++++++---- crawler/index.js | 10 +++++++--- crawler/posts.js | 4 ++-- crawler/util.js | 43 +++++++++++++++++++++--------------------- dbs/profile-data-db.js | 2 ++ 5 files changed, 46 insertions(+), 31 deletions(-) diff --git a/crawler/followgraph.js b/crawler/followgraph.js index 742ebaef..3d1b9421 100644 --- a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -1,6 +1,7 @@ const assert = require('assert') const _difference = require('lodash.difference') const Events = require('events') +const {Url} = require('url') const lock = require('../lib/lock') const db = require('../dbs/profile-data-db') const {doCrawl, doCheckpoint} = require('./util') @@ -26,7 +27,7 @@ exports.addListener = events.addListener.bind(events) exports.removeListener = events.removeListener.bind(events) exports.crawlSite = async function (archive, crawlSourceId) { - return doCrawl(archive, 'crawl_followgraph', TABLE_VERSION, async ({changes, resetRequired}) => { + return doCrawl(archive, crawlSourceId, 'crawl_followgraph', TABLE_VERSION, async ({changes, resetRequired}) => { const supressEvents = resetRequired === true // dont emit when replaying old info if (resetRequired) { // reset all data @@ -125,7 +126,7 @@ exports.isAFollowingB = async function (a, b) { exports.follow = function (archive, followUrl) { // normalize followUrl - // TODO + followUrl = normalizeFollowUrl(followUrl) 
assert(typeof followUrl === 'string', 'Follow() must be given a valid URL') return updateFollowsFile(archive, followsJson => { @@ -137,8 +138,8 @@ exports.follow = function (archive, followUrl) { exports.unfollow = function (archive, followUrl) { // normalize followUrl - // TODO - assert(typeof followUrl === 'string', 'Unollow() must be given a valid URL') + followUrl = normalizeFollowUrl(followUrl) + assert(typeof followUrl === 'string', 'Unfollow() must be given a valid URL') return updateFollowsFile(archive, followsJson => { var i = followsJson.urls.findIndex(v => v === followUrl) @@ -151,6 +152,15 @@ exports.unfollow = function (archive, followUrl) { // internal methods // = +function normalizeFollowUrl (url) { + try { + url = new URL(url) + return url.origin + } catch (e) { + return null + } +} + async function readFollowsFile (archive) { var followsJson = JSON.parse(await archive.pda.readFile(JSON_PATH, 'utf8')) assert(typeof followsJson === 'object', 'File be an object') diff --git a/crawler/index.js b/crawler/index.js index 01ce5de9..2e20e7bb 100644 --- a/crawler/index.js +++ b/crawler/index.js @@ -1,5 +1,6 @@ const _throttle = require('lodash.throttle') const lock = require('../lib/lock') +const db = require('../dbs/profile-data-db') const users = require('../users') const dat = require('../dat') @@ -52,9 +53,12 @@ exports.unwatchSite = async function (url) { async function crawlSite (archive) { var release = await lock('crawl:' + archive.url) try { - // insert crawl source - // TODO - var crawlSourceId = // TODO + // get/create crawl source + var crawlSourceId = await db.run(`SELECT id FROM crawl_sources WHERE url = ?`, [archive.url]) + if (!crawlSourceId) { + await db.run(`INSERT INTO crawl_sources (url) VALUES (?)`, [archive.url]) + crawlSourceId = db.getSqliteInstance().lastID + } // crawl individual sources await Promise.all([ diff --git a/crawler/posts.js b/crawler/posts.js index 9c2818d1..dc797c9a 100644 --- a/crawler/posts.js +++ b/crawler/posts.js @@ -10,7 +10,7 @@ const debug = require('../lib/debug-logger').debugLogger('crawler') const TABLE_VERSION = 1 const JSON_TYPE = 'unwalled.garden/post' -const JSON_PATH_REGEX = /^\/data\/posts\/([^\/]+)\.json$/i +const JSON_PATH_REGEX = /^\/data\/posts\/([^/]+)\.json$/i // globals // = @@ -25,7 +25,7 @@ exports.addListener = events.addListener.bind(events) exports.removeListener = events.removeListener.bind(events) exports.crawlSite = async function (archive, crawlSourceId) { - return doCrawl(archive, 'crawl_posts', TABLE_VERSION, async ({changes, resetRequired}) => { + return doCrawl(archive, crawlSourceId, 'crawl_posts', TABLE_VERSION, async ({changes, resetRequired}) => { const supressEvents = resetRequired === true // dont emit when replaying old info if (resetRequired) { // reset all data diff --git a/crawler/util.js b/crawler/util.js index f201aeb2..8a7973fe 100644 --- a/crawler/util.js +++ b/crawler/util.js @@ -1,12 +1,15 @@ const db = require('../dbs/profile-data-db') +const dat = require('../dat') -exports.doCrawl = async function (archive, crawlDataset, crawlDatasetVersion, handlerFn) { +const READ_TIMEOUT = 30e3 + +exports.doCrawl = async function (archive, crawlSourceId, crawlDataset, crawlDatasetVersion, handlerFn) { const url = archive.url // fetch current crawl state var resetRequired = false var state = await db.get(` - SELECT crawl_sources_meta.* FROM crawl_sources_meta + SELECT crawl_sources_meta.crawlSourceVersion FROM crawl_sources_meta INNER JOIN crawl_sources ON crawl_sources.url = ? 
WHERE crawl_sources_meta.crawlDataset = ? `, [url, crawlDataset]) @@ -15,37 +18,33 @@ exports.doCrawl = async function (archive, crawlDataset, crawlDatasetVersion, ha state = null } if (!state) { - // new state - state = { - crawlSourceId: null, - url, - crawlDataset, - crawlDatasetVersion, - updatedAt: 0 - } + state = {crawlSourceVersion: 0} } // fetch current archive version - // TODO + var archiveInfo = await dat.library.getDaemon().getArchiveInfo(archive.key) + var version = archiveInfo ? archiveInfo.version : 0 // fetch change log - var changes = [] // TODO + var start = state.crawlSourceVersion + var end = version + var changes = await new Promise((resolve, reject) => { + archive.history({start, end, timeout: READ_TIMEOUT}, (err, c) => { + if (err) reject(err) + else resolve(c) + }) + }) // handle changes await handlerFn({changes, resetRequired}) - - if (!state.crawlSourceId) { - // upsert crawl source - // TODO - } - - // upsert crawl state - state.updatedAt = Date.now() - // TODO } exports.doCheckpoint = async function (crawlDataset, crawlDatasetVersion, crawlSourceId, crawlSourceVersion) { - // TODO + await db.run(` + INSERT OR REPLACE + INTO crawl_sources_meta (crawlDataset, crawlDatasetVersion, crawlSourceId, crawlSourceVersion, updatedAt) + VALUES (?, ?, ?, ?, ?) + `, [crawlDataset, crawlDatasetVersion, crawlSourceId, crawlSourceVersion, Date.now()]) } var _lastGeneratedTimeFilename diff --git a/dbs/profile-data-db.js b/dbs/profile-data-db.js index 59734685..677af760 100644 --- a/dbs/profile-data-db.js +++ b/dbs/profile-data-db.js @@ -44,6 +44,8 @@ exports.parallelize = function () { return db.parallelize() } +exports.getSqliteInstance = () => db + // internal methods // = From 8af6f4f0c76ca4392ddae5b05bf6fbb04ec6771c Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sat, 8 Dec 2018 15:46:59 -0600 Subject: [PATCH 007/245] Implement followgraph web api --- web-apis/bg.js | 4 +- web-apis/bg/followgraph.js | 66 ++++++++++++++++++++++ web-apis/fg/beaker.js | 5 +- web-apis/manifests/internal/followgraph.js | 5 +- 4 files changed, 74 insertions(+), 6 deletions(-) create mode 100644 web-apis/bg/followgraph.js diff --git a/web-apis/bg.js b/web-apis/bg.js index 6702b4cb..e3d16b38 100644 --- a/web-apis/bg.js +++ b/web-apis/bg.js @@ -19,8 +19,8 @@ const bookmarksAPI = require('./bg/bookmarks') const historyAPI = require('./bg/history') const sitedataAPI = require('../dbs/sitedata').WEBAPI const watchlistAPI = require('./bg/watchlist') -const postsAPI = require('../crawler/posts') -const followgraphAPI = require('../crawler/followgraph') +const postsAPI = require('./bg/posts') +const followgraphAPI = require('./bg/followgraph') // external manifests const datArchiveManifest = require('./manifests/external/dat-archive') diff --git a/web-apis/bg/followgraph.js b/web-apis/bg/followgraph.js new file mode 100644 index 00000000..e0d8b9de --- /dev/null +++ b/web-apis/bg/followgraph.js @@ -0,0 +1,66 @@ +const globals = require('../../globals') +const assert = require('assert') +const {Url} = require('url') +const {PermissionsError} = require('beaker-error-constants') +const dat = require('../../dat') +const followgraphCrawler = require('../../crawler/followgraph') + +// exported api +// = + +module.exports = { + + async listFollowers (url) { + url = normalizeFollowUrl(url) + assertString(url, 'Parameter one must be a URL') + return followgraphCrawler.listFollowers(url) + }, + + async listFollows (url) { + url = normalizeFollowUrl(url) + assertString(url, 'Parameter one must be a URL') + return 
followgraphCrawler.listFollows(url) + }, + + async isAFollowingB (a, b) { + a = normalizeFollowUrl(a) + b = normalizeFollowUrl(b) + assertString(a, 'Parameter one must be a URL') + assertString(b, 'Parameter two must be a URL') + return followgraphCrawler.isAFollowingB(a, b) + }, + + async follow (url) { + url = normalizeFollowUrl(url) + assertString(url, 'Parameter one must be a URL') + var userSession = globals.getUserSessionFor(this.sender) + if (!userSession) throw new Error('No active user session') + var userArchive = dat.library.getArchive(userSession.url) + return followgraphCrawler.follow(userArchive, url) + }, + + async unfollow (url) { + url = normalizeFollowUrl(url) + assertString(url, 'Parameter one must be a URL') + var userSession = globals.getUserSessionFor(this.sender) + if (!userSession) throw new Error('No active user session') + var userArchive = dat.library.getArchive(userSession.url) + return followgraphCrawler.follow(userArchive, url) + } +} + +// internal methods +// = + +function normalizeFollowUrl (url) { + try { + url = new URL(url) + return url.origin + } catch (e) { + return null + } +} + +function assertString (v, msg) { + assert(!!v && typeof v === 'string', msg) +} diff --git a/web-apis/fg/beaker.js b/web-apis/fg/beaker.js index f356c463..a2af512d 100644 --- a/web-apis/fg/beaker.js +++ b/web-apis/fg/beaker.js @@ -161,8 +161,9 @@ exports.setup = function (rpc) { beaker.posts.delete = postsRPC.delete // beaker.followgraph - beaker.followgraph.queryall = followgraphRPC.queryall - beaker.followgraph.query = followgraphRPC.query + beaker.followgraph.listFollowers = followgraphRPC.listFollowers + beaker.followgraph.listFollows = followgraphRPC.listFollows + beaker.followgraph.isAFollowingB = followgraphRPC.isAFollowingB beaker.followgraph.follow = followgraphRPC.follow beaker.followgraph.unfollow = followgraphRPC.unfollow } diff --git a/web-apis/manifests/internal/followgraph.js b/web-apis/manifests/internal/followgraph.js index 8c8549c8..2cb0fd6d 100644 --- a/web-apis/manifests/internal/followgraph.js +++ b/web-apis/manifests/internal/followgraph.js @@ -1,6 +1,7 @@ module.exports = { - queryAll: 'promise', - query: 'promise', + listFollowers: 'promise', + listFollows: 'promise', + isAFollowingB: 'promise', follow: 'promise', unfollow: 'promise' } \ No newline at end of file From cfd8a6ad9f3639212c045da7cac5af9f38ea8a39 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sat, 8 Dec 2018 16:09:46 -0600 Subject: [PATCH 008/245] Add posts web api --- web-apis/bg/posts.js | 54 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 web-apis/bg/posts.js diff --git a/web-apis/bg/posts.js b/web-apis/bg/posts.js new file mode 100644 index 00000000..8be750c1 --- /dev/null +++ b/web-apis/bg/posts.js @@ -0,0 +1,54 @@ +const globals = require('../../globals') +const assert = require('assert') +const {Url} = require('url') +const {PermissionsError} = require('beaker-error-constants') +const dat = require('../../dat') +const postsCrawler = require('../../crawler/posts') + +// exported api +// = + +module.exports = { + + async list ({offset, limit, reverse, author} = {}) { + // validate & parse params + assert(!offset || typeof offset === 'number', 'Offset must be a number') + assert(!limit || typeof limit === 'number', 'Limit must be a number') + assert(!reverse || typeof reverse === 'boolean', 'Reverse must be a boolean') + assert(!author || typeof author === 'string', 'Author must be a string') + if (author) { + try { author = new 
URL(author) } + catch (e) { throw new Error('Failed to parse author URL: ' + author) } + } + return postsCrawler.list({offset, limit, reverse, author}) + }, + + async get (origin, pathname = undefined) { + return postsCrawler.get(origin, pathname) + }, + + async create ({content} = {}) { + assert(typeof content === 'string', 'Create() must be provided a `content` string') + var userSession = globals.getUserSessionFor(this.sender) + if (!userSession) throw new Error('No active user session') + var userArchive = dat.library.getArchive(userSession.url) + return postsCrawler.create(userArchive, {content}) + }, + + async edit (pathname, {content} = {}) { + assert(typeof pathname === 'string', 'Edit() must be provided a valid URL string') + assert(typeof content === 'string', 'Edit() must be provided a `content` string') + var userSession = globals.getUserSessionFor(this.sender) + if (!userSession) throw new Error('No active user session') + var userArchive = dat.library.getArchive(userSession.url) + return postsCrawler.edit(userArchive, pathname, {content}) + }, + + async delete (pathname) { + assert(typeof pathname === 'string', 'Edit() must be provided a valid URL string') + var userSession = globals.getUserSessionFor(this.sender) + if (!userSession) throw new Error('No active user session') + var userArchive = dat.library.getArchive(userSession.url) + return postsCrawler.delete(userArchive, pathname) + } +} \ No newline at end of file From 1f22b4dfabfd3befc8d0104e4c6aab5c99695f71 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sat, 8 Dec 2018 16:15:37 -0600 Subject: [PATCH 009/245] Add crawler sqlite delta --- dbs/profile-data-db.js | 1 + dbs/schemas/profile-data.sql.js | 2 +- dbs/schemas/profile-data.v24.sql.js | 52 +++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 dbs/schemas/profile-data.v24.sql.js diff --git a/dbs/profile-data-db.js b/dbs/profile-data-db.js index 677af760..ada82a01 100644 --- a/dbs/profile-data-db.js +++ b/dbs/profile-data-db.js @@ -76,6 +76,7 @@ migrations = [ migration('profile-data.v21.sql'), migration('profile-data.v22.sql', {canFail: true}), // canFail for the same reason as v16, ffs migration('profile-data.v23.sql'), + migration('profile-data.v24.sql') ] function migration (file, opts = {}) { return cb => { diff --git a/dbs/schemas/profile-data.sql.js b/dbs/schemas/profile-data.sql.js index e323dfa6..8568a698 100644 --- a/dbs/schemas/profile-data.sql.js +++ b/dbs/schemas/profile-data.sql.js @@ -217,5 +217,5 @@ INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Report an issu INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Explore the p2p Web', 'dat://taravancil.com/explore-the-p2p-web.md', 1); INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Support Beaker', 'https://opencollective.com/beaker', 1); -PRAGMA user_version = 23; +PRAGMA user_version = 24; ` diff --git a/dbs/schemas/profile-data.v24.sql.js b/dbs/schemas/profile-data.v24.sql.js new file mode 100644 index 00000000..cc3653b8 --- /dev/null +++ b/dbs/schemas/profile-data.v24.sql.js @@ -0,0 +1,52 @@ +module.exports = ` + +CREATE TABLE users ( + id INTEGER PRIMARY KEY NOT NULL, + url TEXT, + isDefault INTEGER DEFAULT 0, + createdAt INTEGER +); + +-- list of sites being crawled +CREATE TABLE crawl_sources ( + id INTEGER PRIMARY KEY NOT NULL, + url TEXT NOT NULL +); + +-- tracking information on the crawl-state of the sources +CREATE TABLE crawl_sources_meta ( + crawlSourceId INTEGER NOT NULL, + crawlSourceVersion 
INTEGER NOT NULL, + crawlDataset TEXT NOT NULL, + crawlDatasetVersion INTEGER NOT NULL, + updatedAt INTEGER DEFAULT, + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); + +-- crawled posts +CREATE TABLE crawl_posts ( + crawlSourceId INTEGER NOT NULL, + pathname TEXT NOT NULL, + crawledAt INTEGER, + + content TEXT, + createdAt INTEGER, + updatedAt INTEGER, + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); + +-- crawled follows +CREATE TABLE crawl_followgraph ( + crawlSourceId INTEGER NOT NULL, + crawledAt INTEGER, + + destUrl TEXT NOT NULL, + + PRIMARY KEY (crawlSourceId, destUrl), + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); + +PRAGMA user_version = 24; +` \ No newline at end of file From d7b3fa2a84eedabafe0f0824d589ee32968e3091 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sat, 8 Dec 2018 16:16:22 -0600 Subject: [PATCH 010/245] Add userSessionAPI interface --- README.md | 9 ++++++++- globals.js | 3 ++- index.js | 3 +++ web-apis/bg/followgraph.js | 4 ++-- web-apis/bg/posts.js | 6 +++--- 5 files changed, 18 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index df6f1065..65436571 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,10 @@ await beakerCore.setup({ exportAPI(apiName, apiManifest, apiImpl, [guardFn]) }, downloadsWebAPI: {...}, - browserWebAPI: {...} + browserWebAPI: {...}, + userSessionAPI: { + getFor(webContents) {/*...*/} + } }) // setup the protocol handler @@ -115,6 +118,10 @@ debug('dat-related stuff') ### `dat.debug` +### `crawler` + +### `users` + ## API (@beaker/core/webview) ### `setup()` \ No newline at end of file diff --git a/globals.js b/globals.js index 31516b70..a76cda47 100644 --- a/globals.js +++ b/globals.js @@ -10,5 +10,6 @@ module.exports = { uiAPI: null, rpcAPI: null, downloadsWebAPI: null, - browserWebAPI: null + browserWebAPI: null, + userSessionAPI: null } diff --git a/index.js b/index.js index 7f46ea90..28c92013 100644 --- a/index.js +++ b/index.js @@ -16,6 +16,8 @@ module.exports = { globals, dat, dbs, + crawler, + users, spellChecker, debugLogger: debugLogger.debugLogger, @@ -32,6 +34,7 @@ module.exports = { assert(!!opts.rpcAPI, 'must provide rpcAPI') assert(!!opts.downloadsWebAPI, 'must provide downloadsWebAPI') assert(!!opts.browserWebAPI, 'must provide browserWebAPI') + assert(!!opts.userSessionAPI, 'must provide userSessionAPI') for (let k in opts) { globals[k] = opts[k] diff --git a/web-apis/bg/followgraph.js b/web-apis/bg/followgraph.js index e0d8b9de..3ac7c14b 100644 --- a/web-apis/bg/followgraph.js +++ b/web-apis/bg/followgraph.js @@ -33,7 +33,7 @@ module.exports = { async follow (url) { url = normalizeFollowUrl(url) assertString(url, 'Parameter one must be a URL') - var userSession = globals.getUserSessionFor(this.sender) + var userSession = globals.userSessionAPI.getFor(this.sender) if (!userSession) throw new Error('No active user session') var userArchive = dat.library.getArchive(userSession.url) return followgraphCrawler.follow(userArchive, url) @@ -42,7 +42,7 @@ module.exports = { async unfollow (url) { url = normalizeFollowUrl(url) assertString(url, 'Parameter one must be a URL') - var userSession = globals.getUserSessionFor(this.sender) + var userSession = globals.userSessionAPI.getFor(this.sender) if (!userSession) throw new Error('No active user session') var userArchive = dat.library.getArchive(userSession.url) return followgraphCrawler.follow(userArchive, url) diff --git a/web-apis/bg/posts.js 
b/web-apis/bg/posts.js index 8be750c1..d7c5e623 100644 --- a/web-apis/bg/posts.js +++ b/web-apis/bg/posts.js @@ -29,7 +29,7 @@ module.exports = { async create ({content} = {}) { assert(typeof content === 'string', 'Create() must be provided a `content` string') - var userSession = globals.getUserSessionFor(this.sender) + var userSession = globals.userSessionAPI.getFor(this.sender) if (!userSession) throw new Error('No active user session') var userArchive = dat.library.getArchive(userSession.url) return postsCrawler.create(userArchive, {content}) @@ -38,7 +38,7 @@ module.exports = { async edit (pathname, {content} = {}) { assert(typeof pathname === 'string', 'Edit() must be provided a valid URL string') assert(typeof content === 'string', 'Edit() must be provided a `content` string') - var userSession = globals.getUserSessionFor(this.sender) + var userSession = globals.userSessionAPI.getFor(this.sender) if (!userSession) throw new Error('No active user session') var userArchive = dat.library.getArchive(userSession.url) return postsCrawler.edit(userArchive, pathname, {content}) @@ -46,7 +46,7 @@ module.exports = { async delete (pathname) { assert(typeof pathname === 'string', 'Edit() must be provided a valid URL string') - var userSession = globals.getUserSessionFor(this.sender) + var userSession = globals.userSessionAPI.getFor(this.sender) if (!userSession) throw new Error('No active user session') var userArchive = dat.library.getArchive(userSession.url) return postsCrawler.delete(userArchive, pathname) From 3f3400bb357dd5d1be43239bea7419c409c43216 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sat, 8 Dec 2018 17:10:06 -0600 Subject: [PATCH 011/245] Add getUserSession/setUserSession to beaker.browser api --- web-apis/fg/beaker.js | 2 ++ web-apis/manifests/internal/browser.js | 3 +++ 2 files changed, 5 insertions(+) diff --git a/web-apis/fg/beaker.js b/web-apis/fg/beaker.js index a2af512d..213b4a2b 100644 --- a/web-apis/fg/beaker.js +++ b/web-apis/fg/beaker.js @@ -71,6 +71,8 @@ exports.setup = function (rpc) { beaker.browser.getInfo = beakerBrowserRPC.getInfo beaker.browser.checkForUpdates = beakerBrowserRPC.checkForUpdates beaker.browser.restartBrowser = beakerBrowserRPC.restartBrowser + beaker.browser.getUserSession = beakerBrowserRPC.getUserSession + beaker.browser.setUserSession = beakerBrowserRPC.setUserSession beaker.browser.getSetting = beakerBrowserRPC.getSetting beaker.browser.getSettings = beakerBrowserRPC.getSettings beaker.browser.setSetting = beakerBrowserRPC.setSetting diff --git a/web-apis/manifests/internal/browser.js b/web-apis/manifests/internal/browser.js index b14a1b0c..cd1c7743 100644 --- a/web-apis/manifests/internal/browser.js +++ b/web-apis/manifests/internal/browser.js @@ -4,6 +4,9 @@ module.exports = { checkForUpdates: 'promise', restartBrowser: 'sync', + getUserSession: 'promise', + setUserSession: 'promise', + getSettings: 'promise', getSetting: 'promise', setSetting: 'promise', From 1eb42a6a90e10ec76557db832154123a7434fdbb Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sat, 8 Dec 2018 18:53:52 -0600 Subject: [PATCH 012/245] Fixes --- dat/library.js | 1 + dat/watchlist.js | 3 +++ dbs/archives.js | 4 ++-- dbs/schemas/profile-data.sql.js | 2 +- dbs/schemas/profile-data.v24.sql.js | 2 +- package-lock.json | 15 ++++++--------- users/index.js | 4 ++-- web-apis/fg/beaker.js | 2 ++ 8 files changed, 18 insertions(+), 15 deletions(-) diff --git a/dat/library.js b/dat/library.js index 1cb63df5..3c0af69c 100644 --- a/dat/library.js +++ b/dat/library.js @@ -358,6 +358,7 
@@ exports.getActiveArchives = function getActiveArchives () { } const getOrLoadArchive = exports.getOrLoadArchive = async function getOrLoadArchive (key, opts) { + key = fromURLToKey(key) var archive = getArchive(key) if (archive) { return archive diff --git a/dat/watchlist.js b/dat/watchlist.js index f8814556..3c4bdc93 100644 --- a/dat/watchlist.js +++ b/dat/watchlist.js @@ -1,5 +1,6 @@ const EventEmitter = require('events') const emitStream = require('emit-stream') +const debug = require('../lib/debug-logger').debugLogger('datwatchlist') // dat modules const datLibrary = require('../dat/library') @@ -21,6 +22,8 @@ exports.setup = async function setup () { watch(site) } } catch (err) { + console.error(err) + debug('Error while loading watchlist', err) throw new Error('Failed to load the watchlist') } } diff --git a/dbs/archives.js b/dbs/archives.js index a14783cc..99977b5e 100644 --- a/dbs/archives.js +++ b/dbs/archives.js @@ -228,7 +228,7 @@ exports.touch = async function (key, timeVar = 'lastAccessTime', value = -1) { // - supresses a not-found with an empty object const getUserSettings = exports.getUserSettings = async function (profileId, key) { // massage inputs - key = datEncoding.toStr(key) + key = typeof key !== 'string' ? datEncoding.toStr(key) : key // validate inputs if (!DAT_HASH_REGEX.test(key)) { @@ -362,7 +362,7 @@ exports.setUserSettings = async function (profileId, key, newValues = {}) { // - supresses a not-found with an empty object const getMeta = exports.getMeta = async function (key) { // massage inputs - key = datEncoding.toStr(key) + key = typeof key !== 'string' ? datEncoding.toStr(key) : key // validate inputs if (!DAT_HASH_REGEX.test(key)) { diff --git a/dbs/schemas/profile-data.sql.js b/dbs/schemas/profile-data.sql.js index 8568a698..6cb2e383 100644 --- a/dbs/schemas/profile-data.sql.js +++ b/dbs/schemas/profile-data.sql.js @@ -112,7 +112,7 @@ CREATE TABLE crawl_sources_meta ( crawlSourceVersion INTEGER NOT NULL, crawlDataset TEXT NOT NULL, crawlDatasetVersion INTEGER NOT NULL, - updatedAt INTEGER DEFAULT, + updatedAt INTEGER, FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE ); diff --git a/dbs/schemas/profile-data.v24.sql.js b/dbs/schemas/profile-data.v24.sql.js index cc3653b8..2f769b70 100644 --- a/dbs/schemas/profile-data.v24.sql.js +++ b/dbs/schemas/profile-data.v24.sql.js @@ -19,7 +19,7 @@ CREATE TABLE crawl_sources_meta ( crawlSourceVersion INTEGER NOT NULL, crawlDataset TEXT NOT NULL, crawlDatasetVersion INTEGER NOT NULL, - updatedAt INTEGER DEFAULT, + updatedAt INTEGER, FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE ); diff --git a/package-lock.json b/package-lock.json index 130e07fa..bd6543ca 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2326,6 +2326,11 @@ "resolved": "https://registry.npmjs.org/lodash.debounce/-/lodash.debounce-4.0.8.tgz", "integrity": "sha1-gteb/zCmfEAF/9XiUVMArZyk168=" }, + "lodash.difference": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/lodash.difference/-/lodash.difference-4.5.0.tgz", + "integrity": "sha1-nMtOUF1Ia5FlE0V3KIWi3yf9AXw=" + }, "lodash.get": { "version": "4.4.2", "resolved": "https://registry.npmjs.org/lodash.get/-/lodash.get-4.4.2.tgz", @@ -2361,15 +2366,7 @@ "dev": true, "requires": { "pseudomap": "^1.0.2", - "yallist": "^2.1.2" - }, - "dependencies": { - "yallist": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/yallist/-/yallist-2.1.2.tgz", - "integrity": "sha1-HBH5IY8HYImkfdUS+TxmmaaoHVI=", - "dev": true - 
} + "yallist": "^3.0.2" } }, "map-age-cleaner": { diff --git a/users/index.js b/users/index.js index 2fae2f61..8e96a481 100644 --- a/users/index.js +++ b/users/index.js @@ -48,7 +48,7 @@ exports.setup = async function () { }) } -exports.getAll = async function () { +exports.list = async function () { return Promise.all(users.map(fetchUserInfo)) } @@ -174,7 +174,7 @@ async function validateUserUrl (url) { var urlp = new URL(url) var [meta, userSettings] = await Promise.all([ archivesDb.getMeta(urlp.hostname), - archivesDb.getUserSettings(urlp.hostname) + archivesDb.getUserSettings(0, urlp.hostname) ]) if (!meta.isOwner) { throw new Error('User dat is not owned by this device') diff --git a/web-apis/fg/beaker.js b/web-apis/fg/beaker.js index 213b4a2b..b3d4de13 100644 --- a/web-apis/fg/beaker.js +++ b/web-apis/fg/beaker.js @@ -156,6 +156,7 @@ exports.setup = function (rpc) { beaker.watchlist.createEventsStream = () => fromEventStream(watchlistRPC.createEventsStream()) // beaker.posts + beaker.posts = {} beaker.posts.list = postsRPC.list beaker.posts.get = postsRPC.get beaker.posts.create = postsRPC.create @@ -163,6 +164,7 @@ exports.setup = function (rpc) { beaker.posts.delete = postsRPC.delete // beaker.followgraph + beaker.followgraph = {} beaker.followgraph.listFollowers = followgraphRPC.listFollowers beaker.followgraph.listFollows = followgraphRPC.listFollows beaker.followgraph.isAFollowingB = followgraphRPC.isAFollowingB From 99d71ec45d0761a5d834108c329f8396d13d0d78 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sat, 8 Dec 2018 20:32:30 -0600 Subject: [PATCH 013/245] Fixes --- crawler/util.js | 2 +- dbs/archives.js | 2 +- users/index.js | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/crawler/util.js b/crawler/util.js index 8a7973fe..60daffaa 100644 --- a/crawler/util.js +++ b/crawler/util.js @@ -13,7 +13,7 @@ exports.doCrawl = async function (archive, crawlSourceId, crawlDataset, crawlDat INNER JOIN crawl_sources ON crawl_sources.url = ? WHERE crawl_sources_meta.crawlDataset = ? `, [url, crawlDataset]) - if (state.crawlDatasetVersion !== crawlDatasetVersion) { + if (state && state.crawlDatasetVersion !== crawlDatasetVersion) { resetRequired = true state = null } diff --git a/dbs/archives.js b/dbs/archives.js index 99977b5e..85433822 100644 --- a/dbs/archives.js +++ b/dbs/archives.js @@ -278,7 +278,7 @@ exports.setUserSettings = async function (profileId, key, newValues = {}) { autoDownload: ('autoDownload' in newValues) ? newValues.autoDownload : newValues.isSaved, autoUpload: ('autoUpload' in newValues) ? newValues.autoUpload : newValues.isSaved, expiresAt: newValues.expiresAt, - localSyncPath: ('localSyncPath' in newValues) ? newValues.localSyncPath : '', + localSyncPath: (newValues.localSyncPath) ? newValues.localSyncPath : '', previewMode: ('previewMode' in newValues) ? 
newValues.previewMode : '' } let valueArray = [ diff --git a/users/index.js b/users/index.js index 8e96a481..e31b3024 100644 --- a/users/index.js +++ b/users/index.js @@ -85,6 +85,7 @@ exports.add = async function (url) { `INSERT INTO users (url, isDefault, createdAt) VALUES (?, ?, ?)`, [user.url, Number(user.isDefault), user.createdAt] ) + users.push(user) // fetch the user archive user.archive = await dat.library.getOrLoadArchive(user.url) From 462427dac182cf03986e7d782016d1e3ea74a01c Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sun, 9 Dec 2018 18:29:47 -0600 Subject: [PATCH 014/245] Fixes --- crawler/followgraph.js | 14 +++++++------- crawler/index.js | 19 ++++++++++++------- crawler/posts.js | 43 +++++++++++++++++++++++++----------------- crawler/util.js | 31 +++++++++++++++++------------- users/index.js | 2 ++ 5 files changed, 65 insertions(+), 44 deletions(-) diff --git a/crawler/followgraph.js b/crawler/followgraph.js index 3d1b9421..8d22d4b1 100644 --- a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -26,15 +26,15 @@ exports.on = events.on.bind(events) exports.addListener = events.addListener.bind(events) exports.removeListener = events.removeListener.bind(events) -exports.crawlSite = async function (archive, crawlSourceId) { - return doCrawl(archive, crawlSourceId, 'crawl_followgraph', TABLE_VERSION, async ({changes, resetRequired}) => { +exports.crawlSite = async function (archive, crawlSource) { + return doCrawl(archive, crawlSource, 'crawl_followgraph', TABLE_VERSION, async ({changes, resetRequired}) => { const supressEvents = resetRequired === true // dont emit when replaying old info if (resetRequired) { // reset all data await db.run(` DELETE FROM crawl_followgraph WHERE crawlSourceId = ? - `, [crawlSourceId]) - await doCheckpoint('crawl_followgraph', TABLE_VERSION, crawlSourceId, 0) + `, [crawlSource.id]) + await doCheckpoint('crawl_followgraph', TABLE_VERSION, crawlSource, 0) } // did follows.json change? @@ -61,7 +61,7 @@ exports.crawlSite = async function (archive, crawlSourceId) { for (let add of adds) { await db.run(` INSERT INTO crawl_followgraph (crawlSourceId, destUrl, crawledAt) VALUES (?, ?, ?) - `, [crawlSourceId, add, Date.now()]) + `, [crawlSource.id, add, Date.now()]) if (!supressEvents) { events.emit('follow-added', archive.url, add) } @@ -69,14 +69,14 @@ exports.crawlSite = async function (archive, crawlSourceId) { for (let remove of removes) { await db.run(` DELETE FROM crawl_followgraph WHERE crawlSourceId = ? AND destUrl = ? 
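The adds and removes applied in this loop come from diffing the site's follows file against what is already indexed in crawl_followgraph. For reference, the sketch below shows the shape that readFollowsFile() accepts and that follow()/unfollow() rewrite (the file's location is the JSON_PATH constant, which sits outside this excerpt); the two dat URLs and the exampleFollows name are placeholders, and entries get normalized to origins (protocol + hostname) by toOrigin().

// Hypothetical follows file content, matching the assertions in readFollowsFile():
const exampleFollows = {
  type: 'unwalled.garden/follows',
  urls: [
    'dat://alice.example', // placeholder origins; any path component is stripped
    'dat://bob.example'
  ]
}
console.log(JSON.stringify(exampleFollows, null, 2))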
- `, [crawlSourceId, remove]) + `, [crawlSource.id, remove]) if (supressEvents) { events.emit('follow-removed', archive.url, remove) } } // write checkpoint as success - await doCheckpoint('crawl_followgraph', TABLE_VERSION, crawlSourceId, changes[changes.length - 1].version) + await doCheckpoint('crawl_followgraph', TABLE_VERSION, crawlSource, changes[changes.length - 1].version) }) } diff --git a/crawler/index.js b/crawler/index.js index 2e20e7bb..004607a3 100644 --- a/crawler/index.js +++ b/crawler/index.js @@ -23,15 +23,19 @@ exports.setup = async function () { exports.watchSite = async function (archive) { if (typeof archive === 'string') { - archive = await dat.library.getOrLoadArchive() + archive = await dat.library.getOrLoadArchive(archive) } + console.log('watchSite', archive.url) if (!(archive.url in watches)) { const queueCrawl = _throttle(() => crawlSite(archive), 5e3) // watch for file changes - watches[archive.url] = archive.pda.watch() + watches[archive.url] = await archive.pda.watch() + watches[archive.url].on('error', console.error) + watches[archive.url].on('close', console.log.bind(console, 'close')) watches[archive.url].on('data', ([event, args]) => { + console.log('change event', archive.url, event, args) if (event === 'invalidated') { queueCrawl() } @@ -51,19 +55,20 @@ exports.unwatchSite = async function (url) { } async function crawlSite (archive) { + console.log('crawling', archive.url) var release = await lock('crawl:' + archive.url) try { // get/create crawl source - var crawlSourceId = await db.run(`SELECT id FROM crawl_sources WHERE url = ?`, [archive.url]) - if (!crawlSourceId) { + var crawlSource = await db.get(`SELECT id FROM crawl_sources WHERE url = ?`, [archive.url]) + if (!crawlSource) { await db.run(`INSERT INTO crawl_sources (url) VALUES (?)`, [archive.url]) - crawlSourceId = db.getSqliteInstance().lastID + crawlSource = {id: db.getSqliteInstance().lastID, url: archive.url} } // crawl individual sources await Promise.all([ - posts.crawlSite(archive, crawlSourceId), - followgraph.crawlSite(archive, crawlSourceId) + posts.crawlSite(archive, crawlSource), + followgraph.crawlSite(archive, crawlSource) ]) } finally { release() diff --git a/crawler/posts.js b/crawler/posts.js index dc797c9a..eda18370 100644 --- a/crawler/posts.js +++ b/crawler/posts.js @@ -24,28 +24,31 @@ exports.on = events.on.bind(events) exports.addListener = events.addListener.bind(events) exports.removeListener = events.removeListener.bind(events) -exports.crawlSite = async function (archive, crawlSourceId) { - return doCrawl(archive, crawlSourceId, 'crawl_posts', TABLE_VERSION, async ({changes, resetRequired}) => { +exports.crawlSite = async function (archive, crawlSource) { + return doCrawl(archive, crawlSource, 'crawl_posts', TABLE_VERSION, async ({changes, resetRequired}) => { const supressEvents = resetRequired === true // dont emit when replaying old info + console.log('Crawling posts for', archive.url, {changes, resetRequired}) if (resetRequired) { // reset all data + console.log('resetting data') await db.run(` DELETE FROM crawl_posts WHERE crawlSourceId = ? 
- `, [crawlSourceId]) - await doCheckpoint('crawl_posts', TABLE_VERSION, crawlSourceId, 0) + `, [crawlSource.id]) + await doCheckpoint('crawl_posts', TABLE_VERSION, crawlSource, 0) } // collect changed posts var changedPosts = [] // order matters, must be oldest to newest changes.forEach(c => { - if (JSON_PATH_REGEX.test(c.path)) { - let i = changedPosts.findIndex(c2 => c2.path === c.path) - if (i) { + if (JSON_PATH_REGEX.test(c.name)) { + let i = changedPosts.findIndex(c2 => c2.name === c.name) + if (i !== -1) { changedPosts.splice(i, 1) // remove from old position } changedPosts.push(c) } }) + console.log('collected changed posts', changedPosts) // read and apply each post in order for (let changedPost of changedPosts) { @@ -55,22 +58,24 @@ exports.crawlSite = async function (archive, crawlSourceId) { // -prf if (changedPost.type === 'del') { // delete + console.log('deleting', changedPost) await db.run(` DELETE FROM crawl_posts WHERE crawlSourceId = ? AND pathname = ? - `, [crawlSourceId, changedPost.path]) + `, [crawlSource.id, changedPost.name]) events.emit('post-removed', archive.url) } else { // read and validate + console.log('adding', changedPost) let post try { - post = JSON.parse(await archive.pda.readFile(changedPost.path, 'utf8')) + post = JSON.parse(await archive.pda.readFile(changedPost.name, 'utf8')) assert(typeof post === 'object', 'File be an object') assert(post.type === 'unwalled.garden/post', 'JSON type must be unwalled.garden/post') assert(typeof post.content === 'string', 'JSON content must be a string') assert(typeof post.createdAt === 'string', 'JSON createdAt must be a date-time') assert(!isNaN(Number(new Date(post.createdAt))), 'JSON createdAt must be a date-time') } catch (err) { - debug('Failed to read post file', {url: archive.url, path: changedPost.path, err}) + debug('Failed to read post file', {url: archive.url, name: changedPost.name, err}) return // abort indexing } @@ -80,25 +85,26 @@ exports.crawlSite = async function (archive, crawlSourceId) { if (isNaN(post.updatedAt)) post.updatedAt = 0 // value is optional // upsert - let existingPost = await get(archive.url, changedPost.path) + let existingPost = await get(archive.url, changedPost.name) if (existingPost) { await db.run(` UPDATE crawl_posts SET crawledAt = ?, content = ?, createdAt = ?, updatedAt = ? WHERE crawlSourceId = ? AND pathname = ? - `, [Date.now(), post.content, post.createdAt, post.updatedAt, crawlSourceId, changedPost.path]) + `, [Date.now(), post.content, post.createdAt, post.updatedAt, crawlSource.id, changedPost.name]) events.emit('post-updated', archive.url) } else { await db.run(` INSERT INTO crawl_posts (crawlSourceId, pathname, crawledAt, content, createdAt, updatedAt) VALUES (?, ?, ?, ?, ?, ?) 
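The assertions above pin down the on-disk shape of a post record, and create() further down in this patch writes exactly this shape under /data/posts/. A minimal sketch of a record that would pass the validation; the content string, the timestamp, and the examplePost name are placeholders, and an updatedAt date-time may additionally appear after an edit().

// Hypothetical contents of a /data/posts/<timestamp>.json file:
const examplePost = {
  type: 'unwalled.garden/post',
  content: 'hello, world',              // placeholder text
  createdAt: '2018-12-09T20:00:00.000Z' // placeholder ISO date-time
}
console.log(JSON.stringify(examplePost, null, 2))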
- `, [crawlSourceId, changedPost.path, Date.now(), post.content, post.createdAt, post.updatedAt]) + `, [crawlSource.id, changedPost.name, Date.now(), post.content, post.createdAt, post.updatedAt]) events.emit('post-added', archive.url) } // checkpoint our progress - await doCheckpoint('crawl_posts', TABLE_VERSION, crawlSourceId, changedPost.version) + await doCheckpoint('crawl_posts', TABLE_VERSION, crawlSource, changedPost.version) } + console.log('success', changedPost) } }) } @@ -115,10 +121,13 @@ exports.list = async function ({offset, limit, reverse, author} = {}) { } // build query - var query = `SELECT crawl_posts.*, src.url AS crawlSourceUrl FROM crawl_posts` + var query = ` + SELECT crawl_posts.*, src.url AS crawlSourceUrl FROM crawl_posts + INNER JOIN crawl_sources src ON src.id = crawl_posts.crawlSourceId + ` var values = [] if (author) { - query += ` INNER JOIN crawl_sources src ON src.url = ?` + query += ` WHERE src.url = ?` values.push(author.origin) } if (offset) { @@ -162,7 +171,7 @@ const get = exports.get = async function (url, pathname = undefined) { exports.create = async function (archive, {content} = {}) { assert(typeof content === 'string', 'Create() must be provided a `content` string') var filename = generateTimeFilename() - await archive.writeFile(`/posts/${filename}.json`, JSON.stringify({ + await archive.writeFile(`/data/posts/${filename}.json`, JSON.stringify({ type: JSON_TYPE, content, createdAt: (new Date()).toISOString() diff --git a/crawler/util.js b/crawler/util.js index 60daffaa..cdb0e001 100644 --- a/crawler/util.js +++ b/crawler/util.js @@ -1,24 +1,26 @@ +const pump = require('pump') +const concat = require('concat-stream') const db = require('../dbs/profile-data-db') const dat = require('../dat') const READ_TIMEOUT = 30e3 -exports.doCrawl = async function (archive, crawlSourceId, crawlDataset, crawlDatasetVersion, handlerFn) { +exports.doCrawl = async function (archive, crawlSource, crawlDataset, crawlDatasetVersion, handlerFn) { const url = archive.url // fetch current crawl state var resetRequired = false var state = await db.get(` - SELECT crawl_sources_meta.crawlSourceVersion FROM crawl_sources_meta + SELECT meta.crawlSourceVersion, meta.crawlDatasetVersion FROM crawl_sources_meta meta INNER JOIN crawl_sources ON crawl_sources.url = ? - WHERE crawl_sources_meta.crawlDataset = ? + WHERE meta.crawlDataset = ? `, [url, crawlDataset]) if (state && state.crawlDatasetVersion !== crawlDatasetVersion) { resetRequired = true state = null } if (!state) { - state = {crawlSourceVersion: 0} + state = {crawlSourceVersion: 0, crawlDatasetVersion} } // fetch current archive version @@ -26,25 +28,28 @@ exports.doCrawl = async function (archive, crawlSourceId, crawlDataset, crawlDat var version = archiveInfo ? 
archiveInfo.version : 0 // fetch change log - var start = state.crawlSourceVersion - var end = version + var start = state.crawlSourceVersion + 1 + var end = version + 1 + console.log('fetching changes', start, end, state) var changes = await new Promise((resolve, reject) => { - archive.history({start, end, timeout: READ_TIMEOUT}, (err, c) => { - if (err) reject(err) - else resolve(c) - }) + pump( + archive.history({start, end, timeout: READ_TIMEOUT}), + concat({encoding: 'object'}, resolve), + reject + ) }) // handle changes await handlerFn({changes, resetRequired}) } -exports.doCheckpoint = async function (crawlDataset, crawlDatasetVersion, crawlSourceId, crawlSourceVersion) { +exports.doCheckpoint = async function (crawlDataset, crawlDatasetVersion, crawlSource, crawlSourceVersion) { + await db.run(`DELETE FROM crawl_sources_meta WHERE crawlDataset = ? AND crawlSourceId = ?`, [crawlDataset, crawlSource.id]) await db.run(` - INSERT OR REPLACE + INSERT INTO crawl_sources_meta (crawlDataset, crawlDatasetVersion, crawlSourceId, crawlSourceVersion, updatedAt) VALUES (?, ?, ?, ?, ?) - `, [crawlDataset, crawlDatasetVersion, crawlSourceId, crawlSourceVersion, Date.now()]) + `, [crawlDataset, crawlDatasetVersion, crawlSource.id, crawlSourceVersion, Date.now()]) } var _lastGeneratedTimeFilename diff --git a/users/index.js b/users/index.js index e31b3024..2d05298b 100644 --- a/users/index.js +++ b/users/index.js @@ -30,6 +30,7 @@ exports.setup = async function () { // load the current users users = await db.all(`SELECT * FROM users`) + console.log('users loaded', users) users.forEach(async (user) => { // massage data user.archive = null @@ -81,6 +82,7 @@ exports.add = async function (url) { isDefault: users.length === 0, createdAt: Date.now() } + console.log('adding new user', user) await db.run( `INSERT INTO users (url, isDefault, createdAt) VALUES (?, ?, ?)`, [user.url, Number(user.isDefault), user.createdAt] From a79a5f600e7b0bfc76a3037b0a315835e43f25a1 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sun, 9 Dec 2018 20:05:32 -0600 Subject: [PATCH 015/245] Crawler fixes, particularly around the changes watcher --- crawler/followgraph.js | 6 +++++- crawler/index.js | 17 ++++++++++++----- crawler/posts.js | 18 ++++++++++-------- crawler/util.js | 2 +- 4 files changed, 28 insertions(+), 15 deletions(-) diff --git a/crawler/followgraph.js b/crawler/followgraph.js index 8d22d4b1..3ec2a7f5 100644 --- a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -4,6 +4,7 @@ const Events = require('events') const {Url} = require('url') const lock = require('../lib/lock') const db = require('../dbs/profile-data-db') +const crawler = require('./index') const {doCrawl, doCheckpoint} = require('./util') const debug = require('../lib/debug-logger').debugLogger('crawler') @@ -193,7 +194,10 @@ async function updateFollowsFile (archive, updateFn) { updateFn(followsJson) // write the follows file - await archive.pda.readFile(JSON_PATH, JSON.stringify(followsJson), 'utf8') + await archive.pda.writeFile(JSON_PATH, JSON.stringify(followsJson), 'utf8') + + // trigger crawl now + await crawler.crawlSite(archive) } finally { release() } diff --git a/crawler/index.js b/crawler/index.js index 004607a3..586c1c0a 100644 --- a/crawler/index.js +++ b/crawler/index.js @@ -7,6 +7,8 @@ const dat = require('../dat') const posts = require('./posts') const followgraph = require('./followgraph') +const CRAWL_POLL_INTERVAL = 30e3 + // globals // = @@ -31,16 +33,20 @@ exports.watchSite = async function (archive) { const queueCrawl = 
_throttle(() => crawlSite(archive), 5e3) // watch for file changes - watches[archive.url] = await archive.pda.watch() - watches[archive.url].on('error', console.error) - watches[archive.url].on('close', console.log.bind(console, 'close')) + watches[archive.url] = archive.pda.watch() watches[archive.url].on('data', ([event, args]) => { - console.log('change event', archive.url, event, args) + console.log('MIRACLE ALERT! The crawler watch stream emitted a change event', archive.url, event, args) if (event === 'invalidated') { queueCrawl() } }) + // HACK + // for reasons that currently surpass me + // the `archive.pda.watch()` call is not currently working all the time + // so we need to poll sites for now + setInterval(queueCrawl, CRAWL_POLL_INTERVAL) + // run the first crawl crawlSite(archive) } @@ -54,7 +60,8 @@ exports.unwatchSite = async function (url) { } } -async function crawlSite (archive) { +const crawlSite = +exports.crawlSite = async function (archive) { console.log('crawling', archive.url) var release = await lock('crawl:' + archive.url) try { diff --git a/crawler/posts.js b/crawler/posts.js index eda18370..4401c2ee 100644 --- a/crawler/posts.js +++ b/crawler/posts.js @@ -2,6 +2,7 @@ const assert = require('assert') const {URL} = require('url') const Events = require('events') const db = require('../dbs/profile-data-db') +const crawler = require('./index') const {doCrawl, doCheckpoint, generateTimeFilename} = require('./util') const debug = require('../lib/debug-logger').debugLogger('crawler') @@ -30,7 +31,6 @@ exports.crawlSite = async function (archive, crawlSource) { console.log('Crawling posts for', archive.url, {changes, resetRequired}) if (resetRequired) { // reset all data - console.log('resetting data') await db.run(` DELETE FROM crawl_posts WHERE crawlSourceId = ? `, [crawlSource.id]) @@ -58,14 +58,12 @@ exports.crawlSite = async function (archive, crawlSource) { // -prf if (changedPost.type === 'del') { // delete - console.log('deleting', changedPost) await db.run(` DELETE FROM crawl_posts WHERE crawlSourceId = ? AND pathname = ? 
`, [crawlSource.id, changedPost.name]) events.emit('post-removed', archive.url) } else { // read and validate - console.log('adding', changedPost) let post try { post = JSON.parse(await archive.pda.readFile(changedPost.name, 'utf8')) @@ -104,7 +102,6 @@ exports.crawlSite = async function (archive, crawlSource) { // checkpoint our progress await doCheckpoint('crawl_posts', TABLE_VERSION, crawlSource, changedPost.version) } - console.log('success', changedPost) } }) } @@ -171,26 +168,31 @@ const get = exports.get = async function (url, pathname = undefined) { exports.create = async function (archive, {content} = {}) { assert(typeof content === 'string', 'Create() must be provided a `content` string') var filename = generateTimeFilename() - await archive.writeFile(`/data/posts/${filename}.json`, JSON.stringify({ + console.log('writing file') + await archive.pda.writeFile(`/data/posts/${filename}.json`, JSON.stringify({ type: JSON_TYPE, content, createdAt: (new Date()).toISOString() })) + console.log('file written') + await crawler.crawlSite(archive) } exports.edit = async function (archive, pathname, {content} = {}) { assert(typeof pathname === 'string', 'Edit() must be provided a valid URL string') assert(typeof content === 'string', 'Edit() must be provided a `content` string') - var oldJson = JSON.parse(await archive.readFile(pathname)) - await archive.writeFile(pathname, JSON.stringify({ + var oldJson = JSON.parse(await archive.pda.readFile(pathname)) + await archive.pda.writeFile(pathname, JSON.stringify({ type: JSON_TYPE, content, createdAt: oldJson.createdAt, updatedAt: (new Date()).toISOString() })) + await crawler.crawlSite(archive) } exports.delete = async function (archive, pathname) { assert(typeof pathname === 'string', 'Delete() must be provided a valid URL string') - await archive.unlink(pathname) + await archive.pda.unlink(pathname) + await crawler.crawlSite(archive) } diff --git a/crawler/util.js b/crawler/util.js index cdb0e001..f16c018f 100644 --- a/crawler/util.js +++ b/crawler/util.js @@ -30,7 +30,7 @@ exports.doCrawl = async function (archive, crawlSource, crawlDataset, crawlDatas // fetch change log var start = state.crawlSourceVersion + 1 var end = version + 1 - console.log('fetching changes', start, end, state) + console.log('fetching changes', archive.url, start, end, state) var changes = await new Promise((resolve, reject) => { pump( archive.history({start, end, timeout: READ_TIMEOUT}), From c08bcc36f4b951d05fdfd2ffdf6dbd18e6804e6e Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sun, 9 Dec 2018 20:21:43 -0600 Subject: [PATCH 016/245] Give better author info in posts responses --- crawler/posts.js | 18 +++++++++++++----- web-apis/bg/posts.js | 19 +++++++++++++++++-- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/crawler/posts.js b/crawler/posts.js index 4401c2ee..b7235412 100644 --- a/crawler/posts.js +++ b/crawler/posts.js @@ -141,7 +141,7 @@ exports.list = async function ({offset, limit, reverse, author} = {}) { } // execute query - return db.all(query, values) + return (await db.all(query, values)).map(massagePostRow) } const get = exports.get = async function (url, pathname = undefined) { @@ -153,7 +153,7 @@ const get = exports.get = async function (url, pathname = undefined) { pathname = pathname || url.pathname // execute query - return db.get(` + return massagePostRow(await db.get(` SELECT crawl_posts.*, src.url AS crawlSourceUrl FROM crawl_posts @@ -162,19 +162,17 @@ const get = exports.get = async function (url, pathname = undefined) { AND 
src.url = ? WHERE crawl_posts.pathname = ? - `, [url.origin, pathname]) + `, [url.origin, pathname])) } exports.create = async function (archive, {content} = {}) { assert(typeof content === 'string', 'Create() must be provided a `content` string') var filename = generateTimeFilename() - console.log('writing file') await archive.pda.writeFile(`/data/posts/${filename}.json`, JSON.stringify({ type: JSON_TYPE, content, createdAt: (new Date()).toISOString() })) - console.log('file written') await crawler.crawlSite(archive) } @@ -196,3 +194,13 @@ exports.delete = async function (archive, pathname) { await archive.pda.unlink(pathname) await crawler.crawlSite(archive) } + +// internal methods +// = + +function massagePostRow (row) { + row.author = {url: row.crawlSourceUrl} + delete row.crawlSourceUrl + delete row.crawlSourceId + return row +} diff --git a/web-apis/bg/posts.js b/web-apis/bg/posts.js index d7c5e623..02213262 100644 --- a/web-apis/bg/posts.js +++ b/web-apis/bg/posts.js @@ -3,6 +3,7 @@ const assert = require('assert') const {Url} = require('url') const {PermissionsError} = require('beaker-error-constants') const dat = require('../../dat') +const archivesDb = require('../../dbs/archives') const postsCrawler = require('../../crawler/posts') // exported api @@ -20,11 +21,17 @@ module.exports = { try { author = new URL(author) } catch (e) { throw new Error('Failed to parse author URL: ' + author) } } - return postsCrawler.list({offset, limit, reverse, author}) + var posts = await postsCrawler.list({offset, limit, reverse, author}) + await Promise.all(posts.map(async (post) => { + post.author.title = await getUserTitle(post.author) + })) + return posts }, async get (origin, pathname = undefined) { - return postsCrawler.get(origin, pathname) + var post = await postsCrawler.get(origin, pathname) + post.author.title = await getUserTitle(post.author) + return post }, async create ({content} = {}) { @@ -51,4 +58,12 @@ module.exports = { var userArchive = dat.library.getArchive(userSession.url) return postsCrawler.delete(userArchive, pathname) } +} + +// internal methods +// = + +async function getUserTitle (author) { + var meta = await archivesDb.getMeta(author.url.slice('dat://'.length)) + return meta ? 
meta.title : false } \ No newline at end of file From fc8a52a4e40a7b54b51d2903c5b7adadfaab5950 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sun, 9 Dec 2018 20:30:22 -0600 Subject: [PATCH 017/245] Correctly handle get() that doesnt have content --- crawler/posts.js | 1 + 1 file changed, 1 insertion(+) diff --git a/crawler/posts.js b/crawler/posts.js index b7235412..40f63e4c 100644 --- a/crawler/posts.js +++ b/crawler/posts.js @@ -199,6 +199,7 @@ exports.delete = async function (archive, pathname) { // = function massagePostRow (row) { + if (!row) return null row.author = {url: row.crawlSourceUrl} delete row.crawlSourceUrl delete row.crawlSourceId From 6252f9bced15e1fd2fe8e45b5291f30d8bdc1ff6 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sun, 9 Dec 2018 20:40:48 -0600 Subject: [PATCH 018/245] Normalize the URL provided by the users api --- users/index.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/users/index.js b/users/index.js index 2d05298b..3d672a84 100644 --- a/users/index.js +++ b/users/index.js @@ -164,7 +164,7 @@ async function fetchUserInfo (user) { var urlp = new URL(user.url) var meta = await archivesDb.getMeta(urlp.hostname) return { - url: user.url, + url: user.url.replace(/(\/)$/, ''), isDefault: user.isDefault, title: meta.title, description: meta.description, From 85c9f3c61997ce707449d8638e20cca5d5512955 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sun, 9 Dec 2018 20:45:38 -0600 Subject: [PATCH 019/245] More consistent URL normalization --- users/index.js | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/users/index.js b/users/index.js index 3d672a84..9f66b19c 100644 --- a/users/index.js +++ b/users/index.js @@ -33,6 +33,7 @@ exports.setup = async function () { console.log('users loaded', users) users.forEach(async (user) => { // massage data + user.url = normalizeUrl(user.url) user.archive = null user.isDefault = Boolean(user.isDefault) user.createdAt = new Date(user.createdAt) @@ -55,13 +56,15 @@ exports.list = async function () { const get = exports.get = async function (url) { + url = normalizeUrl(url) + console.log('getting user', url, users) var user = users.find(user => user.url === url) if (!user) return null return await fetchUserInfo(user) } const getDefault = -exports.getDefault = async function (url = undefined) { +exports.getDefault = async function () { var user = users.find(user => user.isDefault === true) if (!user) return null return await fetchUserInfo(user) @@ -69,6 +72,7 @@ exports.getDefault = async function (url = undefined) { exports.add = async function (url) { // make sure the user doesnt already exist + url = normalizeUrl(url) var existingUser = await get(url) if (existingUser) return @@ -96,6 +100,7 @@ exports.add = async function (url) { } exports.remove = async function (url) { + url = normalizeUrl(url) // get the user var user = await get(url) if (!user) return @@ -164,7 +169,7 @@ async function fetchUserInfo (user) { var urlp = new URL(user.url) var meta = await archivesDb.getMeta(urlp.hostname) return { - url: user.url.replace(/(\/)$/, ''), + url: normalizeUrl(user.url), isDefault: user.isDefault, title: meta.title, description: meta.description, @@ -172,6 +177,10 @@ async function fetchUserInfo (user) { } } +function normalizeUrl (url) { + return url ? 
url.replace(/(\/)$/, '') : url +} + async function validateUserUrl (url) { // make sure the archive is saved and that we own the archive var urlp = new URL(url) From b23ef93235dc6d0953425e5cfc1404042b46937c Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sun, 9 Dec 2018 20:53:05 -0600 Subject: [PATCH 020/245] Create folders as needed --- crawler/posts.js | 7 +++++++ users/index.js | 1 - 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/crawler/posts.js b/crawler/posts.js index 40f63e4c..b516d149 100644 --- a/crawler/posts.js +++ b/crawler/posts.js @@ -168,6 +168,8 @@ const get = exports.get = async function (url, pathname = undefined) { exports.create = async function (archive, {content} = {}) { assert(typeof content === 'string', 'Create() must be provided a `content` string') var filename = generateTimeFilename() + await ensureDirectory(archive, '/data') + await ensureDirectory(archive, '/data/posts') await archive.pda.writeFile(`/data/posts/${filename}.json`, JSON.stringify({ type: JSON_TYPE, content, @@ -198,6 +200,11 @@ exports.delete = async function (archive, pathname) { // internal methods // = +async function ensureDirectory (archive, pathname) { + try { await archive.pda.mkdir(pathname) } + catch (e) { /* ignore */ } +} + function massagePostRow (row) { if (!row) return null row.author = {url: row.crawlSourceUrl} diff --git a/users/index.js b/users/index.js index 9f66b19c..b8e00e7f 100644 --- a/users/index.js +++ b/users/index.js @@ -57,7 +57,6 @@ exports.list = async function () { const get = exports.get = async function (url) { url = normalizeUrl(url) - console.log('getting user', url, users) var user = users.find(user => user.url === url) if (!user) return null return await fetchUserInfo(user) From e9b33ba46a8dc3f7c3c0e7d8fca98d0c5656cae1 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Mon, 10 Dec 2018 11:46:46 -0600 Subject: [PATCH 021/245] Fixes --- crawler/followgraph.js | 2 +- web-apis/bg/followgraph.js | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/crawler/followgraph.js b/crawler/followgraph.js index 3ec2a7f5..12b1dfa6 100644 --- a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -156,7 +156,7 @@ exports.unfollow = function (archive, followUrl) { function normalizeFollowUrl (url) { try { url = new URL(url) - return url.origin + return url.protocol + '//' + url.hostname } catch (e) { return null } diff --git a/web-apis/bg/followgraph.js b/web-apis/bg/followgraph.js index 3ac7c14b..4fee199d 100644 --- a/web-apis/bg/followgraph.js +++ b/web-apis/bg/followgraph.js @@ -45,7 +45,7 @@ module.exports = { var userSession = globals.userSessionAPI.getFor(this.sender) if (!userSession) throw new Error('No active user session') var userArchive = dat.library.getArchive(userSession.url) - return followgraphCrawler.follow(userArchive, url) + return followgraphCrawler.unfollow(userArchive, url) } } @@ -55,10 +55,9 @@ module.exports = { function normalizeFollowUrl (url) { try { url = new URL(url) - return url.origin - } catch (e) { - return null - } + return url.protocol + '//' + url.hostname + } catch (e) {} + return null } function assertString (v, msg) { From bc4bbbbe33128a1e5575a44f695e2140efe7feb9 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Mon, 10 Dec 2018 12:10:35 -0600 Subject: [PATCH 022/245] Fixes to followgraph crawler --- crawler/followgraph.js | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/crawler/followgraph.js b/crawler/followgraph.js index 12b1dfa6..87bf7d59 100644 --- 
a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -30,6 +30,7 @@ exports.removeListener = events.removeListener.bind(events) exports.crawlSite = async function (archive, crawlSource) { return doCrawl(archive, crawlSource, 'crawl_followgraph', TABLE_VERSION, async ({changes, resetRequired}) => { const supressEvents = resetRequired === true // dont emit when replaying old info + console.log('Crawling follows for', archive.url, {changes, resetRequired}) if (resetRequired) { // reset all data await db.run(` @@ -39,7 +40,7 @@ exports.crawlSite = async function (archive, crawlSource) { } // did follows.json change? - var change = changes.find(c => c.path === JSON_PATH) + var change = changes.find(c => c.name === JSON_PATH) if (!change) { return } @@ -48,12 +49,13 @@ exports.crawlSite = async function (archive, crawlSource) { try { var followsJson = await readFollowsFile(archive) } catch (err) { + console.error('Failed to read follows file', {url: archive.url, err}) debug('Failed to read follows file', {url: archive.url, err}) return } // diff against the current follows - var currentFollows = await listFollows(archive) + var currentFollows = await listFollows(archive.url) var newFollows = followsJson.urls var adds = _difference(newFollows, currentFollows) var removes = _difference(currentFollows, newFollows) @@ -163,11 +165,17 @@ function normalizeFollowUrl (url) { } async function readFollowsFile (archive) { - var followsJson = JSON.parse(await archive.pda.readFile(JSON_PATH, 'utf8')) + try { + var followsJson = await archive.pda.readFile(JSON_PATH, 'utf8') + } catch (e) { + if (e.notFound) return {urls: []} // empty default when not found + throw e + } + followsJson = JSON.parse(followsJson) assert(typeof followsJson === 'object', 'File be an object') assert(followsJson.type === JSON_TYPE, 'JSON type must be unwalled.garden/follows') - assert(Array.isArray(followsJson.follows), 'JSON follows must be an array of strings') - followsJson.follows = followsJson.follows.filter(v => typeof v === 'string') + assert(Array.isArray(followsJson.urls), 'JSON .urls must be an array of strings') + followsJson.urls = followsJson.urls.filter(v => typeof v === 'string') return followsJson } From 3495ba1e552be6434a084996a93a6b4b6f7acb18 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Mon, 10 Dec 2018 12:11:11 -0600 Subject: [PATCH 023/245] Fixes to crawler --- crawler/index.js | 2 +- crawler/util.js | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/crawler/index.js b/crawler/index.js index 586c1c0a..1112ac64 100644 --- a/crawler/index.js +++ b/crawler/index.js @@ -66,7 +66,7 @@ exports.crawlSite = async function (archive) { var release = await lock('crawl:' + archive.url) try { // get/create crawl source - var crawlSource = await db.get(`SELECT id FROM crawl_sources WHERE url = ?`, [archive.url]) + var crawlSource = await db.get(`SELECT id, url FROM crawl_sources WHERE url = ?`, [archive.url]) if (!crawlSource) { await db.run(`INSERT INTO crawl_sources (url) VALUES (?)`, [archive.url]) crawlSource = {id: db.getSqliteInstance().lastID, url: archive.url} diff --git a/crawler/util.js b/crawler/util.js index f16c018f..02c008dc 100644 --- a/crawler/util.js +++ b/crawler/util.js @@ -11,10 +11,9 @@ exports.doCrawl = async function (archive, crawlSource, crawlDataset, crawlDatas // fetch current crawl state var resetRequired = false var state = await db.get(` - SELECT meta.crawlSourceVersion, meta.crawlDatasetVersion FROM crawl_sources_meta meta - INNER JOIN crawl_sources ON 
crawl_sources.url = ? - WHERE meta.crawlDataset = ? - `, [url, crawlDataset]) + SELECT crawlSourceVersion, crawlDatasetVersion FROM crawl_sources_meta + WHERE crawlSourceId = ? AND crawlDataset = ? + `, [crawlSource.id, crawlDataset]) if (state && state.crawlDatasetVersion !== crawlDatasetVersion) { resetRequired = true state = null From 6b6dab7a8e416da28da2ede5d4db316815cfe227 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Mon, 10 Dec 2018 12:33:32 -0600 Subject: [PATCH 024/245] Add authors param to beaker.posts.list --- crawler/posts.js | 26 ++++++++++++++++++++++---- web-apis/bg/posts.js | 9 +++------ web-apis/manifests/internal/users.js | 7 +++++++ 3 files changed, 32 insertions(+), 10 deletions(-) create mode 100644 web-apis/manifests/internal/users.js diff --git a/crawler/posts.js b/crawler/posts.js index b516d149..dc4c8f1f 100644 --- a/crawler/posts.js +++ b/crawler/posts.js @@ -106,15 +106,21 @@ exports.crawlSite = async function (archive, crawlSource) { }) } -exports.list = async function ({offset, limit, reverse, author} = {}) { +exports.list = async function ({offset, limit, reverse, author, authors} = {}) { // validate & parse params assert(!offset || typeof offset === 'number', 'Offset must be a number') assert(!limit || typeof limit === 'number', 'Limit must be a number') assert(!reverse || typeof reverse === 'boolean', 'Reverse must be a boolean') assert(!author || typeof author === 'string', 'Author must be a string') + assert(!authors || !Array.isArray(author), 'Authors must be an array of strings') + if (author) { - try { author = new URL(author) } - catch (e) { throw new Error('Failed to parse author URL: ' + author) } + try { author = toOrigin(author) } + catch (e) { throw new Error('Author must be a valid URL') } + } + if (authors) { + try { authors = authors.map(toOrigin) } + catch (e) { throw new Error('Authors array must contain valid URLs') } } // build query @@ -125,7 +131,14 @@ exports.list = async function ({offset, limit, reverse, author} = {}) { var values = [] if (author) { query += ` WHERE src.url = ?` - values.push(author.origin) + values.push(author) + } else if (authors) { + let op = 'WHERE' + for (let author of authors) { + query += ` ${op} src.url = ?` + op = 'OR' + values.push(author) + } } if (offset) { query += ` OFFSET ?` @@ -200,6 +213,11 @@ exports.delete = async function (archive, pathname) { // internal methods // = +function toOrigin (url) { + url = new URL(url) + return url.protocol + '//' + url.hostname +} + async function ensureDirectory (archive, pathname) { try { await archive.pda.mkdir(pathname) } catch (e) { /* ignore */ } diff --git a/web-apis/bg/posts.js b/web-apis/bg/posts.js index 02213262..ee3eeca4 100644 --- a/web-apis/bg/posts.js +++ b/web-apis/bg/posts.js @@ -11,17 +11,14 @@ const postsCrawler = require('../../crawler/posts') module.exports = { - async list ({offset, limit, reverse, author} = {}) { + async list ({offset, limit, reverse, author, authors} = {}) { // validate & parse params assert(!offset || typeof offset === 'number', 'Offset must be a number') assert(!limit || typeof limit === 'number', 'Limit must be a number') assert(!reverse || typeof reverse === 'boolean', 'Reverse must be a boolean') assert(!author || typeof author === 'string', 'Author must be a string') - if (author) { - try { author = new URL(author) } - catch (e) { throw new Error('Failed to parse author URL: ' + author) } - } - var posts = await postsCrawler.list({offset, limit, reverse, author}) + assert(!authors || !Array.isArray(author), 'Authors 
must be an array of strings') + var posts = await postsCrawler.list({offset, limit, reverse, author, authors}) await Promise.all(posts.map(async (post) => { post.author.title = await getUserTitle(post.author) })) diff --git a/web-apis/manifests/internal/users.js b/web-apis/manifests/internal/users.js new file mode 100644 index 00000000..bdc4add0 --- /dev/null +++ b/web-apis/manifests/internal/users.js @@ -0,0 +1,7 @@ +module.exports = { + list: 'promise', + get: 'promise', + getDefault: 'promise', + add: 'promise', + remove: 'promise' +} From 1461a5f447a0ab2ef77583f22114adb3b1b3d7c2 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Mon, 10 Dec 2018 12:55:26 -0600 Subject: [PATCH 025/245] More consistent URL normalization in followgraph --- crawler/followgraph.js | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/crawler/followgraph.js b/crawler/followgraph.js index 87bf7d59..b85c3808 100644 --- a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -94,7 +94,7 @@ exports.listFollowers = async function (subject) { ON crawl_followgraph.crawlSourceId = crawl_sources.id AND crawl_followgraph.destUrl = ? `, [subject]) - return rows.map(row => row.url) + return rows.map(row => toOrigin(row.url)) } // List urls of sites that subject follows @@ -108,7 +108,7 @@ const listFollows = exports.listFollows = async function (subject) { ON crawl_followgraph.crawlSourceId = crawl_sources.id AND crawl_sources.url = ? `, [subject]) - return rows.map(row => row.destUrl) + return rows.map(row => toOrigin(row.destUrl)) } // Check for the existence of an individual follow @@ -116,6 +116,8 @@ const listFollows = exports.listFollows = async function (subject) { // - b. String (URL), does a follow this site? // - returns bool exports.isAFollowingB = async function (a, b) { + a = toOrigin(a) + b = toOrigin(b) var res = await db.get(` SELECT crawl_sources.id FROM crawl_sources @@ -129,7 +131,7 @@ exports.isAFollowingB = async function (a, b) { exports.follow = function (archive, followUrl) { // normalize followUrl - followUrl = normalizeFollowUrl(followUrl) + followUrl = toOrigin(followUrl) assert(typeof followUrl === 'string', 'Follow() must be given a valid URL') return updateFollowsFile(archive, followsJson => { @@ -141,7 +143,7 @@ exports.follow = function (archive, followUrl) { exports.unfollow = function (archive, followUrl) { // normalize followUrl - followUrl = normalizeFollowUrl(followUrl) + followUrl = toOrigin(followUrl) assert(typeof followUrl === 'string', 'Unfollow() must be given a valid URL') return updateFollowsFile(archive, followsJson => { @@ -155,7 +157,7 @@ exports.unfollow = function (archive, followUrl) { // internal methods // = -function normalizeFollowUrl (url) { +function toOrigin (url) { try { url = new URL(url) return url.protocol + '//' + url.hostname @@ -175,7 +177,7 @@ async function readFollowsFile (archive) { assert(typeof followsJson === 'object', 'File be an object') assert(followsJson.type === JSON_TYPE, 'JSON type must be unwalled.garden/follows') assert(Array.isArray(followsJson.urls), 'JSON .urls must be an array of strings') - followsJson.urls = followsJson.urls.filter(v => typeof v === 'string') + followsJson.urls = followsJson.urls.filter(v => typeof v === 'string').map(toOrigin) return followsJson } From 2848a5241a2897e40425f0b86fa97463556d671d Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Mon, 10 Dec 2018 12:55:52 -0600 Subject: [PATCH 026/245] Correctly get last ID of inserts --- crawler/index.js | 4 ++-- dbs/profile-data-db.js | 5 ++++- 
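The profile-data-db.js change in this patch leans on a node-sqlite3 detail: the run() callback exposes the insert id as this.lastID, so it has to be a regular function rather than an arrow, and crawler/index.js can then read res.lastID from the wrapped result. A standalone sketch of that behaviour, assuming only the sqlite3 npm package; the in-memory database and the example row are illustrative and not part of the patch:

const sqlite3 = require('sqlite3')
const demo = new sqlite3.Database(':memory:')
demo.serialize(() => {
  demo.run('CREATE TABLE crawl_sources (id INTEGER PRIMARY KEY NOT NULL, url TEXT NOT NULL)')
  demo.run('INSERT INTO crawl_sources (url) VALUES (?)', ['dat://example'], function (err) {
    if (err) throw err
    // `this` is the sqlite3 Statement; an arrow function would not see lastID here
    console.log(this.lastID) // -> 1
  })
})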
2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/crawler/index.js b/crawler/index.js index 1112ac64..3a7e5c08 100644 --- a/crawler/index.js +++ b/crawler/index.js @@ -68,8 +68,8 @@ exports.crawlSite = async function (archive) { // get/create crawl source var crawlSource = await db.get(`SELECT id, url FROM crawl_sources WHERE url = ?`, [archive.url]) if (!crawlSource) { - await db.run(`INSERT INTO crawl_sources (url) VALUES (?)`, [archive.url]) - crawlSource = {id: db.getSqliteInstance().lastID, url: archive.url} + let res = await db.run(`INSERT INTO crawl_sources (url) VALUES (?)`, [archive.url]) + crawlSource = {id: res.lastID, url: archive.url} } // crawl individual sources diff --git a/dbs/profile-data-db.js b/dbs/profile-data-db.js index ada82a01..a4c919b4 100644 --- a/dbs/profile-data-db.js +++ b/dbs/profile-data-db.js @@ -33,7 +33,10 @@ exports.all = async function (...args) { exports.run = async function (...args) { await setupPromise - return cbPromise(cb => db.run(...args, cb)) + return cbPromise(cb => db.run(...args, function (err) { + if (err) cb(err) + else cb(null, {lastID: this.lastID}) + })) } exports.serialize = function () { From b155908cba327e4caf1b78c12c591e47ccec40db Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Mon, 10 Dec 2018 16:09:06 -0600 Subject: [PATCH 027/245] Add readFile and showEditProfileModal to beaker.browser --- web-apis/fg/beaker.js | 2 ++ web-apis/manifests/internal/browser.js | 2 ++ 2 files changed, 4 insertions(+) diff --git a/web-apis/fg/beaker.js b/web-apis/fg/beaker.js index b3d4de13..4af45a67 100644 --- a/web-apis/fg/beaker.js +++ b/web-apis/fg/beaker.js @@ -73,6 +73,7 @@ exports.setup = function (rpc) { beaker.browser.restartBrowser = beakerBrowserRPC.restartBrowser beaker.browser.getUserSession = beakerBrowserRPC.getUserSession beaker.browser.setUserSession = beakerBrowserRPC.setUserSession + beaker.browser.showEditProfileModal = beakerBrowserRPC.showEditProfileModal beaker.browser.getSetting = beakerBrowserRPC.getSetting beaker.browser.getSettings = beakerBrowserRPC.getSettings beaker.browser.setSetting = beakerBrowserRPC.setSetting @@ -85,6 +86,7 @@ exports.setup = function (rpc) { beaker.browser.removeAsDefaultProtocolClient = beakerBrowserRPC.removeAsDefaultProtocolClient beaker.browser.fetchBody = beakerBrowserRPC.fetchBody beaker.browser.downloadURL = beakerBrowserRPC.downloadURL + beaker.browser.readFile = beakerBrowserRPC.readFile beaker.browser.getResourceContentType = beakerBrowserRPC.getResourceContentType beaker.browser.listBuiltinFavicons = beakerBrowserRPC.listBuiltinFavicons beaker.browser.getBuiltinFavicon = beakerBrowserRPC.getBuiltinFavicon diff --git a/web-apis/manifests/internal/browser.js b/web-apis/manifests/internal/browser.js index cd1c7743..6ffc0d7f 100644 --- a/web-apis/manifests/internal/browser.js +++ b/web-apis/manifests/internal/browser.js @@ -6,6 +6,7 @@ module.exports = { getUserSession: 'promise', setUserSession: 'promise', + showEditProfileModal: 'promise', getSettings: 'promise', getSetting: 'promise', @@ -25,6 +26,7 @@ module.exports = { fetchBody: 'promise', downloadURL: 'promise', + readFile: 'promise', getResourceContentType: 'sync', From 1ab95a6027bec1d86538198113880661a31dcdbe Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Thu, 13 Dec 2018 14:56:30 -0600 Subject: [PATCH 028/245] Fix first follow() --- crawler/followgraph.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler/followgraph.js b/crawler/followgraph.js index b85c3808..f8459d32 100644 --- 
a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -170,7 +170,7 @@ async function readFollowsFile (archive) { try { var followsJson = await archive.pda.readFile(JSON_PATH, 'utf8') } catch (e) { - if (e.notFound) return {urls: []} // empty default when not found + if (e.notFound) return {type: JSON_TYPE, urls: []} // empty default when not found throw e } followsJson = JSON.parse(followsJson) From f6c30ecd03256c8703c1c7078032ce2c924584a4 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Thu, 13 Dec 2018 14:57:59 -0600 Subject: [PATCH 029/245] Add site-descriptions to crawler --- crawler/followgraph.js | 62 ++++- crawler/index.js | 8 +- crawler/posts.js | 38 +-- crawler/site-descriptions.js | 303 +++++++++++++++++++++ crawler/util.js | 17 +- dbs/schemas/profile-data.sql.js | 16 ++ dbs/schemas/profile-data.v24.sql.js | 16 ++ web-apis/bg/followgraph.js | 14 +- web-apis/fg/beaker.js | 1 + web-apis/manifests/internal/followgraph.js | 1 + 10 files changed, 431 insertions(+), 45 deletions(-) create mode 100644 crawler/site-descriptions.js diff --git a/crawler/followgraph.js b/crawler/followgraph.js index f8459d32..f3403161 100644 --- a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -5,6 +5,7 @@ const {Url} = require('url') const lock = require('../lib/lock') const db = require('../dbs/profile-data-db') const crawler = require('./index') +const siteDescriptions = require('./site-descriptions') const {doCrawl, doCheckpoint} = require('./util') const debug = require('../lib/debug-logger').debugLogger('crawler') @@ -83,10 +84,11 @@ exports.crawlSite = async function (archive, crawlSource) { }) } -// List urls of sites that follow subject +// List sites that follow subject // - subject. String (URL). -// - returns Array -exports.listFollowers = async function (subject) { +// - opts.includeDesc. Boolean. +// - returns Array +exports.listFollowers = async function (subject, {includeDesc} = {}) { var rows = await db.all(` SELECT crawl_sources.url FROM crawl_sources @@ -94,13 +96,22 @@ exports.listFollowers = async function (subject) { ON crawl_followgraph.crawlSourceId = crawl_sources.id AND crawl_followgraph.destUrl = ? `, [subject]) - return rows.map(row => toOrigin(row.url)) + if (!includeDesc) { + return rows.map(row => toOrigin(row.url)) + } + return Promise.all(rows.map(async (row) => { + var url = toOrigin(row.url) + var desc = await siteDescriptions.getBest({subject: url}) + desc.url = url + return desc + })) } -// List urls of sites that subject follows +// List sites that subject follows // - subject. String (URL). -// - returns Array -const listFollows = exports.listFollows = async function (subject) { +// - opts.includeDesc. Boolean. +// - returns Array +const listFollows = exports.listFollows = async function (subject, {includeDesc} = {}) { var rows = await db.all(` SELECT crawl_followgraph.destUrl FROM crawl_followgraph @@ -108,7 +119,29 @@ const listFollows = exports.listFollows = async function (subject) { ON crawl_followgraph.crawlSourceId = crawl_sources.id AND crawl_sources.url = ? `, [subject]) - return rows.map(row => toOrigin(row.destUrl)) + if (!includeDesc) { + return rows.map(row => toOrigin(row.destUrl)) + } + return Promise.all(rows.map(async (row) => { + var url = toOrigin(row.destUrl) + var desc = await siteDescriptions.getBest({subject: url, author: subject}) + desc.url = url + return desc + })) +} + +// List sites that are followed by sites that the subject follows +// - subject. String (URL). +// - opts.includeDesc. Boolean. 
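// A brief usage sketch (illustrative only — the dat URLs are placeholders, and the
// description fields are the ones produced by siteDescriptions.getBest()):
//   await listFollowers('dat://bob.com')                       // -> ['dat://alice.com', ...]
//   await listFollowers('dat://bob.com', {includeDesc: true})  // -> [{url, title, description, type, author}, ...]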
+// - returns Array +const listFoaFs = exports.listFoaFs = async function (subject, {includeDesc} = {}) { + var foafs = [] + var follows = await listFollows(subject) + for (let url of follows) { + foafs = foafs.concat(await listFollows(url, {includeDesc})) + } + // TODO remove duplicates + return foafs } // Check for the existence of an individual follow @@ -129,24 +162,29 @@ exports.isAFollowingB = async function (a, b) { return !!res } -exports.follow = function (archive, followUrl) { +exports.follow = async function (archive, followUrl) { // normalize followUrl followUrl = toOrigin(followUrl) assert(typeof followUrl === 'string', 'Follow() must be given a valid URL') - return updateFollowsFile(archive, followsJson => { + // write new follows.json + await updateFollowsFile(archive, followsJson => { if (!followsJson.urls.find(v => v === followUrl)) { followsJson.urls.push(followUrl) } }) + + // capture site description + /* dont await */siteDescriptions.capture(archive, followUrl) } -exports.unfollow = function (archive, followUrl) { +exports.unfollow = async function (archive, followUrl) { // normalize followUrl followUrl = toOrigin(followUrl) assert(typeof followUrl === 'string', 'Unfollow() must be given a valid URL') - return updateFollowsFile(archive, followsJson => { + // write new follows.json + await updateFollowsFile(archive, followsJson => { var i = followsJson.urls.findIndex(v => v === followUrl) if (i !== -1) { followsJson.urls.splice(i, 1) diff --git a/crawler/index.js b/crawler/index.js index 3a7e5c08..dd1a2ec4 100644 --- a/crawler/index.js +++ b/crawler/index.js @@ -6,6 +6,7 @@ const dat = require('../dat') const posts = require('./posts') const followgraph = require('./followgraph') +const siteDescriptions = require('./site-descriptions') const CRAWL_POLL_INTERVAL = 30e3 @@ -19,6 +20,7 @@ const watches = {} exports.posts = posts exports.followgraph = followgraph +exports.siteDescriptions = siteDescriptions exports.setup = async function () { } @@ -75,10 +77,10 @@ exports.crawlSite = async function (archive) { // crawl individual sources await Promise.all([ posts.crawlSite(archive, crawlSource), - followgraph.crawlSite(archive, crawlSource) + followgraph.crawlSite(archive, crawlSource), + siteDescriptions.crawlSite(archive, crawlSource) ]) } finally { release() } -} -exports.crawlSite = crawlSite \ No newline at end of file +} \ No newline at end of file diff --git a/crawler/posts.js b/crawler/posts.js index dc4c8f1f..8acefe9e 100644 --- a/crawler/posts.js +++ b/crawler/posts.js @@ -3,7 +3,7 @@ const {URL} = require('url') const Events = require('events') const db = require('../dbs/profile-data-db') const crawler = require('./index') -const {doCrawl, doCheckpoint, generateTimeFilename} = require('./util') +const {doCrawl, doCheckpoint, getMatchingChangesInOrder, generateTimeFilename} = require('./util') const debug = require('../lib/debug-logger').debugLogger('crawler') // constants @@ -38,16 +38,7 @@ exports.crawlSite = async function (archive, crawlSource) { } // collect changed posts - var changedPosts = [] // order matters, must be oldest to newest - changes.forEach(c => { - if (JSON_PATH_REGEX.test(c.name)) { - let i = changedPosts.findIndex(c2 => c2.name === c.name) - if (i !== -1) { - changedPosts.splice(i, 1) // remove from old position - } - changedPosts.push(c) - } - }) + var changedPosts = getMatchingChangesInOrder(changes, JSON_PATH_REGEX) console.log('collected changed posts', changedPosts) // read and apply each post in order @@ -106,21 +97,17 @@ 
exports.crawlSite = async function (archive, crawlSource) { }) } -exports.list = async function ({offset, limit, reverse, author, authors} = {}) { +exports.list = async function ({offset, limit, reverse, author} = {}) { // validate & parse params assert(!offset || typeof offset === 'number', 'Offset must be a number') assert(!limit || typeof limit === 'number', 'Limit must be a number') assert(!reverse || typeof reverse === 'boolean', 'Reverse must be a boolean') - assert(!author || typeof author === 'string', 'Author must be a string') - assert(!authors || !Array.isArray(author), 'Authors must be an array of strings') + assert(!author || typeof author === 'string' || (Array.isArray(author) && author.every(isString)), 'Author must be a string or an array of strings') if (author) { - try { author = toOrigin(author) } - catch (e) { throw new Error('Author must be a valid URL') } - } - if (authors) { - try { authors = authors.map(toOrigin) } - catch (e) { throw new Error('Authors array must contain valid URLs') } + author = Array.isArray(author) ? author : [author] + try { author = author.map(toOrigin) } + catch (e) { throw new Error('Author must contain valid URLs') } } // build query @@ -130,14 +117,11 @@ exports.list = async function ({offset, limit, reverse, author, authors} = {}) { ` var values = [] if (author) { - query += ` WHERE src.url = ?` - values.push(author) - } else if (authors) { let op = 'WHERE' - for (let author of authors) { + for (let a of author) { query += ` ${op} src.url = ?` op = 'OR' - values.push(author) + values.push(a) } } if (offset) { @@ -213,6 +197,10 @@ exports.delete = async function (archive, pathname) { // internal methods // = +function isString (v) { + return typeof v === 'string' +} + function toOrigin (url) { url = new URL(url) return url.protocol + '//' + url.hostname diff --git a/crawler/site-descriptions.js b/crawler/site-descriptions.js new file mode 100644 index 00000000..90d5e925 --- /dev/null +++ b/crawler/site-descriptions.js @@ -0,0 +1,303 @@ +const assert = require('assert') +const {URL} = require('url') +const Events = require('events') +const _pick = require('lodash.pick') +const db = require('../dbs/profile-data-db') +const archivesDb = require('../dbs/archives') +const dat = require('../dat') +const crawler = require('./index') +const {doCrawl, doCheckpoint, getMatchingChangesInOrder, generateTimeFilename} = require('./util') +const debug = require('../lib/debug-logger').debugLogger('crawler') + +// constants +// = + +const TABLE_VERSION = 1 +const JSON_TYPE = 'unwalled.garden/site-description' +const JSON_PATH_REGEX = /^\/data\/known_sites\/([^/]+)\.json$/i + +// globals +// = + +var events = new Events() + +// exported api +// = + +exports.on = events.on.bind(events) +exports.addListener = events.addListener.bind(events) +exports.removeListener = events.removeListener.bind(events) + +exports.crawlSite = async function (archive, crawlSource) { + return doCrawl(archive, crawlSource, 'crawl_site_descriptions', TABLE_VERSION, async ({changes, resetRequired}) => { + const supressEvents = resetRequired === true // dont emit when replaying old info + console.log('Crawling site descriptions for', archive.url, {changes, resetRequired}) + if (resetRequired) { + // reset all data + await db.run(` + DELETE FROM crawl_site_descriptions WHERE crawlSourceId = ? 
+ `, [crawlSource.id]) + await doCheckpoint('crawl_site_descriptions', TABLE_VERSION, crawlSource, 0) + } + + // collect changed site descriptions + var changedSiteDescriptions = getMatchingChangesInOrder(changes, JSON_PATH_REGEX) + console.log('collected changed site descriptions', changedSiteDescriptions) + + // read and apply each post in order + for (let changedSiteDescription of changedSiteDescriptions) { + // TODO Currently the crawler will abort reading the feed if any description fails to load + // this means that a single bad or unreachable file can stop the forward progress of description indexing + // to solve this, we need to find a way to tolerate bad description-files without losing our ability to efficiently detect new posts + // -prf + if (changedSiteDescription.type === 'del') { + // delete + await db.run(` + DELETE FROM crawl_site_descriptions WHERE crawlSourceId = ? AND pathname = ? + `, [crawlSource.id, changedSiteDescription.name]) + events.emit('description-removed', archive.url) + } else { + // read and validate + let desc + try { + desc = JSON.parse(await archive.pda.readFile(changedSiteDescription.name, 'utf8')) + assert(typeof desc === 'object', 'File be an object') + assert(desc.type === 'unwalled.garden/site-description', 'JSON .type must be unwalled.garden/site-description') + assert(typeof desc.subject === 'string', 'JSON .subject must be a URL string') + try { let subject = new URL(desc.subject) } + catch (e) { throw new Error('JSON .subject must be a URL string') } + assert(desc.metadata && typeof desc.metadata === 'object', 'JSON .metadata must be object') + assert(typeof desc.createdAt === 'string', 'JSON .createdAt must be a date-time') + assert(!isNaN(Number(new Date(desc.createdAt))), 'JSON .createdAt must be a date-time') + } catch (err) { + debug('Failed to read site-description file', {url: archive.url, name: changedSiteDescription.name, err}) + return // abort indexing + } + + // massage the description + desc.subject = toOrigin(desc.subject) + desc.metadata.title = typeof desc.metadata.title === 'string' ? desc.metadata.title : '' + desc.metadata.description = typeof desc.metadata.description === 'string' ? desc.metadata.description : '' + if (typeof desc.metadata.type === 'string') desc.metadata.type = desc.metadata.type.split(',') + if (Array.isArray(desc.metadata.type)) { + desc.metadata.type = desc.metadata.type.filter(isString) + } else { + desc.metadata.type = [] + } + desc.createdAt = Number(new Date(desc.createdAt)) + + // replace + await db.run(` + DELETE FROM crawl_site_descriptions WHERE crawlSourceId = ? AND pathname = ? + `, [crawlSource.id, changedSiteDescription.name]) + await db.run(` + INSERT OR REPLACE INTO crawl_site_descriptions (crawlSourceId, pathname, crawledAt, subject, title, description, type, createdAt) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
+ `, [crawlSource.id, changedSiteDescription.name, Date.now(), desc.subject, desc.metadata.title, desc.metadata.description, desc.metadata.type.join(','), desc.createdAt]) + events.emit('description-added', archive.url) + + // checkpoint our progress + await doCheckpoint('crawl_site_descriptions', TABLE_VERSION, crawlSource, changedSiteDescription.version) + } + } + }) +} + +const list = exports.list = async function ({offset, limit, reverse, author, subject} = {}) { + // validate & parse params + assert(!offset || typeof offset === 'number', 'Offset must be a number') + assert(!limit || typeof limit === 'number', 'Limit must be a number') + assert(!reverse || typeof reverse === 'boolean', 'Reverse must be a boolean') + assert(!author || typeof author === 'string' || (Array.isArray(author) && author.every(isString)), 'Author must be a string or an array of strings') + assert(!subject || typeof subject === 'string' || (Array.isArray(subject) && subject.every(isString)), 'Subject must be a string or an array of strings') + + if (author) { + author = Array.isArray(author) ? author : [author] + try { author = author.map(toOrigin) } + catch (e) { throw new Error('Author must contain valid URLs') } + } + if (subject) { + subject = Array.isArray(subject) ? subject : [subject] + try { subject = subject.map(toOrigin) } + catch (e) { throw new Error('Subject must contain valid URLs') } + } + + // build query + var query = ` + SELECT crawl_site_descriptions.*, src.url AS crawlSourceUrl FROM crawl_site_descriptions + INNER JOIN crawl_sources src ON src.id = crawl_site_descriptions.crawlSourceId + ` + var values = [] + if (author) { + let op = 'WHERE' + for (let a of author) { + query += ` ${op} src.url = ?` + op = 'OR' + values.push(a) + } + } + if (subject) { + let op = 'WHERE' + for (let s of subject) { + query += ` ${op} subject = ?` + op = 'OR' + values.push(s) + } + } + if (offset) { + query += ` OFFSET ?` + values.push(offset) + } + if (limit) { + query += ` LIMIT ?` + values.push(limit) + } + query += ` ORDER BY createdAt` + if (reverse) { + query += ` DESC` + } + + // execute query + return (await db.all(query, values)).map(massageSiteDescriptionRow) +} + +exports.getBest = async function ({subject, author} = {}) { + // TODO + // while the archivesdb is more recent, it won't have the thumbnail + // -prf + // check archivesDb meta + // var meta = await archivesDb.getMeta(subject) + // if (meta) { + // return _pick(meta, ['title', 'description', 'type']) + // } + + // check for descriptions + console.log('getting best', subject, author) + var descriptions = await list({subject, author}) + return _pick(descriptions[0] || {}, ['title', 'description', 'type', 'author']) +} + +const get = exports.get = async function (url, pathname = undefined) { + // validate & parse params + if (url) { + try { url = new URL(url) } + catch (e) { throw new Error('Failed to parse post URL: ' + url) } + } + pathname = pathname || url.pathname + + // execute query + return massageSiteDescriptionRow(await db.get(` + SELECT + crawl_site_descriptions.*, src.url AS crawlSourceUrl + FROM crawl_site_descriptions + INNER JOIN crawl_sources src + ON src.id = crawl_site_descriptions.crawlSourceId + AND src.url = ? + WHERE + crawl_site_descriptions.pathname = ? 
+ `, [url.origin, pathname])) +} + +exports.capture = async function (archive, subjectArchive) { + if (typeof subjectArchive === 'string') { + subjectArchive = await dat.library.getOrLoadArchive(subjectArchive) + } + + // capture metadata + try { + var info = JSON.parse(await subjectArchive.pda.readFile('/dat.json')) + } catch (e) { + console.error('Failed to read dat.json of subject archive', e) + debug('Failed to read dat.json of subject archive', e) + throw new Error('Unabled to read subject dat.json') + } + await put(archive, { + subject: subjectArchive.url, + title: typeof info.title === 'string' ? info.title : undefined, + description: typeof info.description === 'string' ? info.description : undefined, + type: typeof info.type === 'string' || (Array.isArray(info.type) && info.type.every(isString)) ? info.type : undefined + }) + + // capture thumb + for (let ext of ['jpg', 'jpeg', 'png']) { + let thumbPath = `/thumb.${ext}` + if (await fileExists(subjectArchive, thumbPath)) { + let targetPath = `/data/known_sites/${toHostname(subjectArchive.url)}.${ext}` + await archive.pda.writeFile(targetPath, await subjectArchive.pda.readFile(thumbPath, 'binary'), 'binary') + break + } + } +} + +const put = +exports.put = async function (archive, {subject, title, description, type} = {}) { + assert(typeof subject === 'string', 'Put() must be provided a `subject` string') + try { + var subjectUrl = new URL(subject) + } catch (e) { + throw new Error('Put() `subject` must be a valid URL') + } + assert(!title || typeof title === 'string', 'Put() `title` must be a string') + assert(!description || typeof description === 'string', 'Put() `description` must be a string') + if (type) { + if (typeof type === 'string') type = type.split(',') + assert(Array.isArray(type), 'Put() `type` must be a string or an array of strings') + assert(type.every(isString), 'Put() `type` must be a string or an array of strings') + } + await ensureDirectory(archive, '/data') + await ensureDirectory(archive, '/data/known_sites') + await archive.pda.writeFile(`/data/known_sites/${subjectUrl.hostname}.json`, JSON.stringify({ + type: JSON_TYPE, + subject: subjectUrl.toString(), + metadata: { + title, + description, + type + }, + createdAt: (new Date()).toISOString() + })) + await crawler.crawlSite(archive) +} + +exports.delete = async function (archive, pathname) { + assert(typeof pathname === 'string', 'Delete() must be provided a valid URL string') + await archive.pda.unlink(pathname) + await crawler.crawlSite(archive) +} + +// internal methods +// = + +function isString (v) { + return typeof v === 'string' +} + +function toOrigin (url) { + url = new URL(url) + return url.protocol + '//' + url.hostname +} + +function toHostname (url) { + url = new URL(url) + return url.hostname +} + +async function ensureDirectory (archive, pathname) { + try { await archive.pda.mkdir(pathname) } + catch (e) { /* ignore */ } +} + +async function fileExists (archive, pathname) { + try { await archive.pda.stat(pathname) } + catch (e) { return false } + return true +} + +function massageSiteDescriptionRow (row) { + if (!row) return null + row.author = {url: row.crawlSourceUrl} + row.type = row.type && typeof row.type === 'string' ? 
row.type.split(',') : undefined + delete row.crawlSourceUrl + delete row.crawlSourceId + return row +} diff --git a/crawler/util.js b/crawler/util.js index 02c008dc..aa22556f 100644 --- a/crawler/util.js +++ b/crawler/util.js @@ -40,9 +40,12 @@ exports.doCrawl = async function (archive, crawlSource, crawlDataset, crawlDatas // handle changes await handlerFn({changes, resetRequired}) + + // final checkpoint + await doCheckpoint(crawlDataset, crawlDatasetVersion, crawlSource, version) } -exports.doCheckpoint = async function (crawlDataset, crawlDatasetVersion, crawlSource, crawlSourceVersion) { +const doCheckpoint = exports.doCheckpoint = async function (crawlDataset, crawlDatasetVersion, crawlSource, crawlSourceVersion) { await db.run(`DELETE FROM crawl_sources_meta WHERE crawlDataset = ? AND crawlSourceId = ?`, [crawlDataset, crawlSource.id]) await db.run(` INSERT @@ -51,6 +54,18 @@ exports.doCheckpoint = async function (crawlDataset, crawlDatasetVersion, crawlS `, [crawlDataset, crawlDatasetVersion, crawlSource.id, crawlSourceVersion, Date.now()]) } +exports.getMatchingChangesInOrder = function (changes, regex) { + var list = [] // order matters, must be oldest to newest + changes.forEach(c => { + if (regex.test(c.name)) { + let i = list.findIndex(c2 => c2.name === c.name) + if (i !== -1) list.splice(i, 1) // remove from old position + list.push(c) + } + }) + return list +} + var _lastGeneratedTimeFilename exports.generateTimeFilename = function () { var d = Date.now() diff --git a/dbs/schemas/profile-data.sql.js b/dbs/schemas/profile-data.sql.js index 6cb2e383..a3305441 100644 --- a/dbs/schemas/profile-data.sql.js +++ b/dbs/schemas/profile-data.sql.js @@ -117,6 +117,22 @@ CREATE TABLE crawl_sources_meta ( FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE ); +-- crawled descriptions of other sites +CREATE TABLE crawl_site_descriptions ( + crawlSourceId INTEGER NOT NULL, + pathname TEXT NOT NULL, + crawledAt INTEGER, + + subject TEXT, + title TEXT, + description TEXT, + type TEXT, -- comma separated strings + createdAt INTEGER, + + PRIMARY KEY (crawlSourceId, pathname), + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); + -- crawled posts CREATE TABLE crawl_posts ( crawlSourceId INTEGER NOT NULL, diff --git a/dbs/schemas/profile-data.v24.sql.js b/dbs/schemas/profile-data.v24.sql.js index 2f769b70..e9b7bb71 100644 --- a/dbs/schemas/profile-data.v24.sql.js +++ b/dbs/schemas/profile-data.v24.sql.js @@ -24,6 +24,22 @@ CREATE TABLE crawl_sources_meta ( FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE ); +-- crawled descriptions of other sites +CREATE TABLE crawl_site_descriptions ( + crawlSourceId INTEGER NOT NULL, + pathname TEXT NOT NULL, + crawledAt INTEGER, + + subject TEXT, + title TEXT, + description TEXT, + type TEXT, -- comma separated strings + createdAt INTEGER, + + PRIMARY KEY (crawlSourceId, pathname), + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); + -- crawled posts CREATE TABLE crawl_posts ( crawlSourceId INTEGER NOT NULL, diff --git a/web-apis/bg/followgraph.js b/web-apis/bg/followgraph.js index 4fee199d..ba07c3d6 100644 --- a/web-apis/bg/followgraph.js +++ b/web-apis/bg/followgraph.js @@ -10,16 +10,22 @@ const followgraphCrawler = require('../../crawler/followgraph') module.exports = { - async listFollowers (url) { + async listFollowers (url, opts) { url = normalizeFollowUrl(url) assertString(url, 'Parameter one must be a URL') - return 
followgraphCrawler.listFollowers(url) + return followgraphCrawler.listFollowers(url, opts) }, - async listFollows (url) { + async listFollows (url, opts) { url = normalizeFollowUrl(url) assertString(url, 'Parameter one must be a URL') - return followgraphCrawler.listFollows(url) + return followgraphCrawler.listFollows(url, opts) + }, + + async listFoaFs (url, opts) { + url = normalizeFollowUrl(url) + assertString(url, 'Parameter one must be a URL') + return followgraphCrawler.listFoaFs(url, opts) }, async isAFollowingB (a, b) { diff --git a/web-apis/fg/beaker.js b/web-apis/fg/beaker.js index 4af45a67..44511452 100644 --- a/web-apis/fg/beaker.js +++ b/web-apis/fg/beaker.js @@ -169,6 +169,7 @@ exports.setup = function (rpc) { beaker.followgraph = {} beaker.followgraph.listFollowers = followgraphRPC.listFollowers beaker.followgraph.listFollows = followgraphRPC.listFollows + beaker.followgraph.listFoaFs = followgraphRPC.listFoaFs beaker.followgraph.isAFollowingB = followgraphRPC.isAFollowingB beaker.followgraph.follow = followgraphRPC.follow beaker.followgraph.unfollow = followgraphRPC.unfollow diff --git a/web-apis/manifests/internal/followgraph.js b/web-apis/manifests/internal/followgraph.js index 2cb0fd6d..d4927e25 100644 --- a/web-apis/manifests/internal/followgraph.js +++ b/web-apis/manifests/internal/followgraph.js @@ -1,6 +1,7 @@ module.exports = { listFollowers: 'promise', listFollows: 'promise', + listFoaFs: 'promise', isAFollowingB: 'promise', follow: 'promise', unfollow: 'promise' From 041b7bc7eeb7526ab38d25ff9d56d32c69c01b04 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Thu, 13 Dec 2018 14:58:11 -0600 Subject: [PATCH 030/245] Add more export functions to daemon --- dat/daemon/index.js | 10 ++++++++++ dat/daemon/manifest.js | 2 ++ 2 files changed, 12 insertions(+) diff --git a/dat/daemon/index.js b/dat/daemon/index.js index 0aad5c42..4b25b8eb 100644 --- a/dat/daemon/index.js +++ b/dat/daemon/index.js @@ -331,6 +331,16 @@ const RPC_API = { configureAutoDownload(archive, userSettings) }, + async exportFilesystemToArchive (opts) { + opts.dstArchive = getArchive(opts.dstArchive) + return pda.exportFilesystemToArchive(opts) + }, + + async exportArchiveToFilesystem (opts) { + opts.srcArchive = getArchive(opts.srcArchive) + return pda.exportFilesystemToArchive(opts) + }, + async exportArchiveToArchive (opts) { opts.srcArchive = getArchive(opts.srcArchive) opts.dstArchive = getArchive(opts.dstArchive) diff --git a/dat/daemon/manifest.js b/dat/daemon/manifest.js index 327d4514..c926f07a 100644 --- a/dat/daemon/manifest.js +++ b/dat/daemon/manifest.js @@ -26,6 +26,8 @@ module.exports = { callArchivePDAPromiseMethod: 'promise', callArchivePDAReadStreamMethod: 'readable', clearFileCache: 'promise', + exportFilesystemToArchive: 'async', + exportArchiveToFilesystem: 'async', exportArchiveToArchive: 'async', // folder sync From d9e86aaf00d1393df2caa27b173397691adae641 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Thu, 13 Dec 2018 15:10:21 -0600 Subject: [PATCH 031/245] Fixes to the site-descriptions and FoaF algorithms --- crawler/followgraph.js | 16 ++++++++++++++-- crawler/site-descriptions.js | 25 ++++++++++++++++++------- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/crawler/followgraph.js b/crawler/followgraph.js index f3403161..13808de3 100644 --- a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -136,11 +136,23 @@ const listFollows = exports.listFollows = async function (subject, {includeDesc} // - returns Array const listFoaFs = exports.listFoaFs = async 
function (subject, {includeDesc} = {}) { var foafs = [] + // list URLs followed by subject var follows = await listFollows(subject) for (let url of follows) { - foafs = foafs.concat(await listFollows(url, {includeDesc})) + // list follows of this follow + for (let foaf of await listFollows(url, {includeDesc})) { + // ignore if followed by subject + if (follows.indexOf(foaf.url) !== -1) continue + // merge into list + let existingFoaF = foafs.find(v => v.url === foaf.url) + if (existingFoaF) { + existingFoaF.followedBy.push(url) + } else { + foaf.followedBy = [url] + foafs.push(foaf) + } + } } - // TODO remove duplicates return foafs } diff --git a/crawler/site-descriptions.js b/crawler/site-descriptions.js index 90d5e925..7e9bb80d 100644 --- a/crawler/site-descriptions.js +++ b/crawler/site-descriptions.js @@ -128,21 +128,33 @@ const list = exports.list = async function ({offset, limit, reverse, author, sub INNER JOIN crawl_sources src ON src.id = crawl_site_descriptions.crawlSourceId ` var values = [] + + if (author || subject) { + query += ` WHERE ` + } + if (author) { - let op = 'WHERE' + query += `(` + let op = `` for (let a of author) { - query += ` ${op} src.url = ?` - op = 'OR' + query += `${op} src.url = ?` + op = ` OR` values.push(a) } + query += `) ` } if (subject) { - let op = 'WHERE' + if (author) { + query += ` AND ` + } + query += `(` + let op = `` for (let s of subject) { - query += ` ${op} subject = ?` - op = 'OR' + query += `${op} subject = ?` + op = ` OR` values.push(s) } + query += `) ` } if (offset) { query += ` OFFSET ?` @@ -172,7 +184,6 @@ exports.getBest = async function ({subject, author} = {}) { // } // check for descriptions - console.log('getting best', subject, author) var descriptions = await list({subject, author}) return _pick(descriptions[0] || {}, ['title', 'description', 'type', 'author']) } From bb4b16ae192788e1dbe02db34e4b06fa72199f87 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Thu, 13 Dec 2018 17:19:52 -0600 Subject: [PATCH 032/245] Add more options and data to followgraph list functions --- crawler/followgraph.js | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/crawler/followgraph.js b/crawler/followgraph.js index 13808de3..cec6fecc 100644 --- a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -88,7 +88,7 @@ exports.crawlSite = async function (archive, crawlSource) { // - subject. String (URL). // - opts.includeDesc. Boolean. // - returns Array -exports.listFollowers = async function (subject, {includeDesc} = {}) { +const listFollowers = exports.listFollowers = async function (subject, {includeDesc} = {}) { var rows = await db.all(` SELECT crawl_sources.url FROM crawl_sources @@ -110,8 +110,9 @@ exports.listFollowers = async function (subject, {includeDesc} = {}) { // List sites that subject follows // - subject. String (URL). // - opts.includeDesc. Boolean. +// - opts.includeFollowers. Boolean. Requires includeDesc to be true. 
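// A brief usage sketch (illustrative only — the URL is a placeholder):
//   await listFollows('dat://alice.com', {includeDesc: true, includeFollowers: true})
//   // -> [{url, title, description, type, author, followedBy: [/* follower site descriptions */]}, ...]
// Note that includeFollowers only has an effect together with includeDesc, since the
// follower list is attached to the returned description objects.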
// - returns Array -const listFollows = exports.listFollows = async function (subject, {includeDesc} = {}) { +const listFollows = exports.listFollows = async function (subject, {includeDesc, includeFollowers} = {}) { var rows = await db.all(` SELECT crawl_followgraph.destUrl FROM crawl_followgraph @@ -126,29 +127,31 @@ const listFollows = exports.listFollows = async function (subject, {includeDesc} var url = toOrigin(row.destUrl) var desc = await siteDescriptions.getBest({subject: url, author: subject}) desc.url = url + if (includeFollowers) { + desc.followedBy = await listFollowers(url, {includeDesc: true}) + } return desc })) } // List sites that are followed by sites that the subject follows // - subject. String (URL). -// - opts.includeDesc. Boolean. -// - returns Array -const listFoaFs = exports.listFoaFs = async function (subject, {includeDesc} = {}) { +// - returns Array +const listFoaFs = exports.listFoaFs = async function (subject) { var foafs = [] // list URLs followed by subject - var follows = await listFollows(subject) - for (let url of follows) { + var follows = await listFollows(subject, {includeDesc: true}) + for (let follow of follows) { // list follows of this follow - for (let foaf of await listFollows(url, {includeDesc})) { + for (let foaf of await listFollows(follow.url, {includeDesc: true})) { // ignore if followed by subject - if (follows.indexOf(foaf.url) !== -1) continue + if (follows.find(v => v.url === foaf.url)) continue // merge into list let existingFoaF = foafs.find(v => v.url === foaf.url) if (existingFoaF) { - existingFoaF.followedBy.push(url) + existingFoaF.followedBy.push(follow) } else { - foaf.followedBy = [url] + foaf.followedBy = [follow] foafs.push(foaf) } } From 4ef0203b6f1828cd939209c818389289bdf043fc Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Fri, 14 Dec 2018 11:23:09 -0600 Subject: [PATCH 033/245] Bump deps --- package-lock.json | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/package-lock.json b/package-lock.json index bd6543ca..2133c6cc 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2366,7 +2366,15 @@ "dev": true, "requires": { "pseudomap": "^1.0.2", - "yallist": "^3.0.2" + "yallist": "^2.1.2" + }, + "dependencies": { + "yallist": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-2.1.2.tgz", + "integrity": "sha1-HBH5IY8HYImkfdUS+TxmmaaoHVI=", + "dev": true + } } }, "map-age-cleaner": { From e0caefe3b7a785c9b41364ed69101b4a26335577 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Fri, 14 Dec 2018 13:24:32 -0600 Subject: [PATCH 034/245] Add {followedBy} filter to followgraph --- crawler/followgraph.js | 59 +++++++++++++++++++++++++++----------- web-apis/bg/followgraph.js | 12 ++++++++ 2 files changed, 55 insertions(+), 16 deletions(-) diff --git a/crawler/followgraph.js b/crawler/followgraph.js index cec6fecc..7fdaa958 100644 --- a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -63,9 +63,19 @@ exports.crawlSite = async function (archive, crawlSource) { // write updates for (let add of adds) { - await db.run(` - INSERT INTO crawl_followgraph (crawlSourceId, destUrl, crawledAt) VALUES (?, ?, ?) - `, [crawlSource.id, add, Date.now()]) + try { + await db.run(` + INSERT INTO crawl_followgraph (crawlSourceId, destUrl, crawledAt) VALUES (?, ?, ?) 
+ `, [crawlSource.id, add, Date.now()]) + } catch (e) { + if (e.code === 'SQLITE_CONSTRAINT') { + // uniqueness constraint probably failed, which means we got a duplicate somehow + // dont worry about it + debug('Attempted to insert duplicate followgraph record', {crawlSource, url: add}) + } else { + throw e + } + } if (!supressEvents) { events.emit('follow-added', archive.url, add) } @@ -86,16 +96,31 @@ exports.crawlSite = async function (archive, crawlSource) { // List sites that follow subject // - subject. String (URL). +// - opts.followedBy. String (URL). // - opts.includeDesc. Boolean. // - returns Array -const listFollowers = exports.listFollowers = async function (subject, {includeDesc} = {}) { - var rows = await db.all(` - SELECT crawl_sources.url - FROM crawl_sources - INNER JOIN crawl_followgraph - ON crawl_followgraph.crawlSourceId = crawl_sources.id - AND crawl_followgraph.destUrl = ? - `, [subject]) +const listFollowers = exports.listFollowers = async function (subject, {followedBy, includeDesc} = {}) { + var rows + if (followedBy) { + rows = await db.all(` + SELECT cs.url FROM crawl_followgraph fg + INNER JOIN crawl_sources cs ON cs.id = fg.crawlSourceId + WHERE fg.destUrl = ? + AND (cs.url = ? OR cs.url IN ( + SELECT destUrl as url FROM crawl_followgraph + INNER JOIN crawl_sources ON crawl_sources.id = crawl_followgraph.crawlSourceId + WHERE crawl_sources.url = ? + )) + `, [subject, followedBy, followedBy]) + } else { + rows = await db.all(` + SELECT f.url + FROM crawl_sources f + INNER JOIN crawl_followgraph + ON crawl_followgraph.crawlSourceId = f.id + AND crawl_followgraph.destUrl = ? + `, [subject]) + } if (!includeDesc) { return rows.map(row => toOrigin(row.url)) } @@ -109,10 +134,11 @@ const listFollowers = exports.listFollowers = async function (subject, {includeD // List sites that subject follows // - subject. String (URL). +// - opts.followedBy. String (URL). // - opts.includeDesc. Boolean. // - opts.includeFollowers. Boolean. Requires includeDesc to be true. // - returns Array -const listFollows = exports.listFollows = async function (subject, {includeDesc, includeFollowers} = {}) { +const listFollows = exports.listFollows = async function (subject, {followedBy, includeDesc, includeFollowers} = {}) { var rows = await db.all(` SELECT crawl_followgraph.destUrl FROM crawl_followgraph @@ -128,7 +154,7 @@ const listFollows = exports.listFollows = async function (subject, {includeDesc, var desc = await siteDescriptions.getBest({subject: url, author: subject}) desc.url = url if (includeFollowers) { - desc.followedBy = await listFollowers(url, {includeDesc: true}) + desc.followedBy = await listFollowers(url, {followedBy, includeDesc: true}) } return desc })) @@ -136,14 +162,15 @@ const listFollows = exports.listFollows = async function (subject, {includeDesc, // List sites that are followed by sites that the subject follows // - subject. String (URL). +// - opts.followedBy. String (URL). 
// - returns Array -const listFoaFs = exports.listFoaFs = async function (subject) { +const listFoaFs = exports.listFoaFs = async function (subject, {followedBy} = {}) { var foafs = [] // list URLs followed by subject - var follows = await listFollows(subject, {includeDesc: true}) + var follows = await listFollows(subject, {followedBy, includeDesc: true}) for (let follow of follows) { // list follows of this follow - for (let foaf of await listFollows(follow.url, {includeDesc: true})) { + for (let foaf of await listFollows(follow.url, {followedBy, includeDesc: true})) { // ignore if followed by subject if (follows.find(v => v.url === foaf.url)) continue // merge into list diff --git a/web-apis/bg/followgraph.js b/web-apis/bg/followgraph.js index ba07c3d6..ad903df3 100644 --- a/web-apis/bg/followgraph.js +++ b/web-apis/bg/followgraph.js @@ -11,20 +11,32 @@ const followgraphCrawler = require('../../crawler/followgraph') module.exports = { async listFollowers (url, opts) { + opts = opts || {} url = normalizeFollowUrl(url) assertString(url, 'Parameter one must be a URL') + var userSession = globals.userSessionAPI.getFor(this.sender) + if (!userSession) throw new Error('No active user session') + opts.followedBy = userSession.url return followgraphCrawler.listFollowers(url, opts) }, async listFollows (url, opts) { + opts = opts || {} url = normalizeFollowUrl(url) assertString(url, 'Parameter one must be a URL') + var userSession = globals.userSessionAPI.getFor(this.sender) + if (!userSession) throw new Error('No active user session') + opts.followedBy = userSession.url return followgraphCrawler.listFollows(url, opts) }, async listFoaFs (url, opts) { + opts = opts || {} url = normalizeFollowUrl(url) assertString(url, 'Parameter one must be a URL') + var userSession = globals.userSessionAPI.getFor(this.sender) + if (!userSession) throw new Error('No active user session') + opts.followedBy = userSession.url return followgraphCrawler.listFoaFs(url, opts) }, From d9ed01292dff60c2ae2072ed48838428940df84e Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Fri, 14 Dec 2018 14:19:25 -0600 Subject: [PATCH 035/245] Add .followsUser output to followgraph queries --- crawler/followgraph.js | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/crawler/followgraph.js b/crawler/followgraph.js index 7fdaa958..9b8dbbc1 100644 --- a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -128,13 +128,16 @@ const listFollowers = exports.listFollowers = async function (subject, {followed var url = toOrigin(row.url) var desc = await siteDescriptions.getBest({subject: url}) desc.url = url + if (followedBy) { + desc.followsUser = await isAFollowingB(url, followedBy) + } return desc })) } // List sites that subject follows // - subject. String (URL). -// - opts.followedBy. String (URL). +// - opts.followedBy. String (URL). Filters to users who are followed by the URL specified. Causes .followsUser boolean to be set. // - opts.includeDesc. Boolean. // - opts.includeFollowers. Boolean. Requires includeDesc to be true. 
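// A brief consumption sketch (assumes the beaker.followgraph frontend bindings and the
// web-apis/bg/followgraph.js handler from the previous patch, which fills in followedBy
// from the active user session; the URL is a placeholder):
//   var follows = await beaker.followgraph.listFollows('dat://alice.com', {includeDesc: true})
//   follows.forEach(site => {
//     if (site.followsUser) console.log(site.title, 'follows the current user back')
//   })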
// - returns Array @@ -153,6 +156,9 @@ const listFollows = exports.listFollows = async function (subject, {followedBy, var url = toOrigin(row.destUrl) var desc = await siteDescriptions.getBest({subject: url, author: subject}) desc.url = url + if (followedBy) { + desc.followsUser = await isAFollowingB(url, followedBy) + } if (includeFollowers) { desc.followedBy = await listFollowers(url, {followedBy, includeDesc: true}) } @@ -190,7 +196,7 @@ const listFoaFs = exports.listFoaFs = async function (subject, {followedBy} = {} // - a. String (URL), the site being queried. // - b. String (URL), does a follow this site? // - returns bool -exports.isAFollowingB = async function (a, b) { +const isAFollowingB = exports.isAFollowingB = async function (a, b) { a = toOrigin(a) b = toOrigin(b) var res = await db.get(` From 9f534806a554e0050ac620903f247b26e9b9d176 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sat, 15 Dec 2018 14:40:07 -0600 Subject: [PATCH 036/245] Filter out the subject from listFoaFs --- crawler/followgraph.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crawler/followgraph.js b/crawler/followgraph.js index 9b8dbbc1..af32237a 100644 --- a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -168,7 +168,7 @@ const listFollows = exports.listFollows = async function (subject, {followedBy, // List sites that are followed by sites that the subject follows // - subject. String (URL). -// - opts.followedBy. String (URL). +// - opts.followedBy. String (URL). Filters to users who are followed by the URL specified. Causes .followsUser boolean to be set. // - returns Array const listFoaFs = exports.listFoaFs = async function (subject, {followedBy} = {}) { var foafs = [] @@ -177,7 +177,8 @@ const listFoaFs = exports.listFoaFs = async function (subject, {followedBy} = {} for (let follow of follows) { // list follows of this follow for (let foaf of await listFollows(follow.url, {followedBy, includeDesc: true})) { - // ignore if followed by subject + // ignore if followed by subject or is subject + if (foaf.url === subject) continue if (follows.find(v => v.url === foaf.url)) continue // merge into list let existingFoaF = foafs.find(v => v.url === foaf.url) From 8581969097bc57906fbf55fad0c6ffb4712f9eac Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sat, 15 Dec 2018 18:04:42 -0600 Subject: [PATCH 037/245] Add beaker.browser.showShellModal --- web-apis/fg/beaker.js | 1 + web-apis/manifests/internal/browser.js | 1 + 2 files changed, 2 insertions(+) diff --git a/web-apis/fg/beaker.js b/web-apis/fg/beaker.js index 44511452..30a2fbe3 100644 --- a/web-apis/fg/beaker.js +++ b/web-apis/fg/beaker.js @@ -95,6 +95,7 @@ exports.setup = function (rpc) { beaker.browser.setWindowDimensions = beakerBrowserRPC.setWindowDimensions beaker.browser.showOpenDialog = beakerBrowserRPC.showOpenDialog beaker.browser.showContextMenu = beakerBrowserRPC.showContextMenu + beaker.browser.showShellModal = beakerBrowserRPC.showShellModal beaker.browser.openUrl = beakerBrowserRPC.openUrl beaker.browser.openFolder = beakerBrowserRPC.openFolder beaker.browser.doWebcontentsCmd = beakerBrowserRPC.doWebcontentsCmd diff --git a/web-apis/manifests/internal/browser.js b/web-apis/manifests/internal/browser.js index 6ffc0d7f..7eb661cb 100644 --- a/web-apis/manifests/internal/browser.js +++ b/web-apis/manifests/internal/browser.js @@ -33,6 +33,7 @@ module.exports = { setWindowDimensions: 'promise', showOpenDialog: 'promise', showContextMenu: 'promise', + showShellModal: 'promise', openUrl: 'promise', openFolder: 
'promise', doWebcontentsCmd: 'promise', From 78a3c7fb42289276de4113be2a92c911d58502a2 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Mon, 17 Dec 2018 16:21:22 -0600 Subject: [PATCH 038/245] Add beaker.crawler API --- crawler/index.js | 52 +++++++++++++++++++++++++- crawler/util.js | 9 +++++ web-apis/bg.js | 3 ++ web-apis/fg/beaker.js | 8 ++++ web-apis/manifests/internal/crawler.js | 5 +++ 5 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 web-apis/manifests/internal/crawler.js diff --git a/crawler/index.js b/crawler/index.js index dd1a2ec4..ea9aa953 100644 --- a/crawler/index.js +++ b/crawler/index.js @@ -1,9 +1,13 @@ +const emitStream = require('emit-stream') +const {URL} = require('url') const _throttle = require('lodash.throttle') const lock = require('../lib/lock') const db = require('../dbs/profile-data-db') +const archivesDb = require('../dbs/archives') const users = require('../users') const dat = require('../dat') +const {crawlerEvents} = require('./util') const posts = require('./posts') const followgraph = require('./followgraph') const siteDescriptions = require('./site-descriptions') @@ -21,6 +25,7 @@ const watches = {} exports.posts = posts exports.followgraph = followgraph exports.siteDescriptions = siteDescriptions +const createEventsStream = exports.createEventsStream = () => emitStream(crawlerEvents) exports.setup = async function () { } @@ -32,6 +37,7 @@ exports.watchSite = async function (archive) { console.log('watchSite', archive.url) if (!(archive.url in watches)) { + crawlerEvents.emit('watch', {sourceUrl: archive.url}) const queueCrawl = _throttle(() => crawlSite(archive), 5e3) // watch for file changes @@ -57,6 +63,7 @@ exports.watchSite = async function (archive) { exports.unwatchSite = async function (url) { // stop watching for file changes if (url in watches) { + crawlerEvents.emit('unwatch', {sourceUrl: url}) watches[url].close() watches[url] = null } @@ -65,6 +72,7 @@ exports.unwatchSite = async function (url) { const crawlSite = exports.crawlSite = async function (archive) { console.log('crawling', archive.url) + crawlerEvents.emit('crawl-start', {sourceUrl: archive.url}) var release = await lock('crawl:' + archive.url) try { // get/create crawl source @@ -80,7 +88,49 @@ exports.crawlSite = async function (archive) { followgraph.crawlSite(archive, crawlSource), siteDescriptions.crawlSite(archive, crawlSource) ]) + } catch (err) { + crawlerEvents.emit('crawl-error', {sourceUrl: archive.url, err: err.toString()}) } finally { + crawlerEvents.emit('crawl-finish', {sourceUrl: archive.url}) release() } -} \ No newline at end of file +} + +const getCrawlStates = +exports.getCrawlStates = async function () { + var rows = await db.all(` + SELECT + crawl_sources.url AS url, + GROUP_CONCAT(crawl_sources_meta.crawlSourceVersion) AS versions, + GROUP_CONCAT(crawl_sources_meta.crawlDataset) AS datasets, + MAX(crawl_sources_meta.updatedAt) AS updatedAt + FROM crawl_sources + INNER JOIN crawl_sources_meta ON crawl_sources_meta.crawlSourceId = crawl_sources.id + GROUP BY crawl_sources.id + `) + return Promise.all(rows.map(async ({url, versions, datasets, updatedAt}) => { + var datasetVersions = {} + versions = versions.split(',') + datasets = datasets.split(',') + for (let i = 0; i < datasets.length; i++) { + datasetVersions[datasets[i]] = Number(versions[i]) + } + var meta = await archivesDb.getMeta(toHostname(url)) + return {url, title: meta.title, datasetVersions, updatedAt} + })) +} + +const resetSite = +exports.resetSite = async function (url) { + 
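  // Note: deleting the crawl_sources row is enough to clear the derived crawl data,
  // assuming SQLite foreign-key enforcement is enabled (see the PRAGMA foreign_keys
  // change later in this series) — the crawl_* tables reference crawl_sources with
  // ON DELETE CASCADE in the profile-data schema.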
await db.run(`DELETE FROM crawl_sources WHERE url = ?`, [url]) +} + +exports.WEBAPI = {createEventsStream, getCrawlStates, resetSite} + +// internal methods +// = + +function toHostname (url) { + url = new URL(url) + return url.hostname +} diff --git a/crawler/util.js b/crawler/util.js index aa22556f..98c984a4 100644 --- a/crawler/util.js +++ b/crawler/util.js @@ -1,3 +1,4 @@ +const EventEmitter = require('events') const pump = require('pump') const concat = require('concat-stream') const db = require('../dbs/profile-data-db') @@ -5,6 +6,9 @@ const dat = require('../dat') const READ_TIMEOUT = 30e3 +const crawlerEvents = new EventEmitter() +exports.crawlerEvents = crawlerEvents + exports.doCrawl = async function (archive, crawlSource, crawlDataset, crawlDatasetVersion, handlerFn) { const url = archive.url @@ -38,14 +42,19 @@ exports.doCrawl = async function (archive, crawlSource, crawlDataset, crawlDatas ) }) + crawlerEvents.emit('crawl-dataset-start', {sourceUrl: archive.url, crawlDataset, crawlRange: {start, end}}) + // handle changes await handlerFn({changes, resetRequired}) // final checkpoint await doCheckpoint(crawlDataset, crawlDatasetVersion, crawlSource, version) + + crawlerEvents.emit('crawl-dataset-finish', {sourceUrl: archive.url, crawlDataset, crawlRange: {start, end}}) } const doCheckpoint = exports.doCheckpoint = async function (crawlDataset, crawlDatasetVersion, crawlSource, crawlSourceVersion) { + crawlerEvents.emit('crawl-dataset-progress', {sourceUrl: crawlSource.url, crawlDataset, crawledVersion: crawlSourceVersion}) await db.run(`DELETE FROM crawl_sources_meta WHERE crawlDataset = ? AND crawlSourceId = ?`, [crawlDataset, crawlSource.id]) await db.run(` INSERT diff --git a/web-apis/bg.js b/web-apis/bg.js index e3d16b38..ac74937f 100644 --- a/web-apis/bg.js +++ b/web-apis/bg.js @@ -10,6 +10,7 @@ const downloadsManifest = require('./manifests/internal/downloads') const historyManifest = require('./manifests/internal/history') const sitedataManifest = require('./manifests/internal/sitedata') const watchlistManifest = require('./manifests/internal/watchlist') +const crawlerManifest = require('./manifests/internal/crawler') const postsManifest = require('./manifests/internal/posts') const followgraphManifest = require('./manifests/internal/followgraph') @@ -19,6 +20,7 @@ const bookmarksAPI = require('./bg/bookmarks') const historyAPI = require('./bg/history') const sitedataAPI = require('../dbs/sitedata').WEBAPI const watchlistAPI = require('./bg/watchlist') +const crawlerAPI = require('../crawler').WEBAPI const postsAPI = require('./bg/posts') const followgraphAPI = require('./bg/followgraph') @@ -54,6 +56,7 @@ exports.setup = function () { globals.rpcAPI.exportAPI('history', historyManifest, historyAPI, internalOnly) globals.rpcAPI.exportAPI('sitedata', sitedataManifest, sitedataAPI, internalOnly) globals.rpcAPI.exportAPI('watchlist', watchlistManifest, watchlistAPI, internalOnly) + globals.rpcAPI.exportAPI('crawler', crawlerManifest, crawlerAPI, internalOnly) globals.rpcAPI.exportAPI('posts', postsManifest, postsAPI, internalOnly) globals.rpcAPI.exportAPI('followgraph', followgraphManifest, followgraphAPI, internalOnly) diff --git a/web-apis/fg/beaker.js b/web-apis/fg/beaker.js index 30a2fbe3..54ee141a 100644 --- a/web-apis/fg/beaker.js +++ b/web-apis/fg/beaker.js @@ -8,6 +8,7 @@ const downloadsManifest = require('../manifests/internal/downloads') const historyManifest = require('../manifests/internal/history') const sitedataManifest = 
require('../manifests/internal/sitedata') const watchlistManifest = require('../manifests/internal/watchlist') +const crawlerManifest = require('../manifests/internal/crawler') const postsManifest = require('../manifests/internal/posts') const followgraphManifest = require('../manifests/internal/followgraph') @@ -24,6 +25,7 @@ exports.setup = function (rpc) { const historyRPC = rpc.importAPI('history', historyManifest, opts) const sitedataRPC = rpc.importAPI('sitedata', sitedataManifest, opts) const watchlistRPC = rpc.importAPI('watchlist', watchlistManifest, opts) + const crawlerRPC = rpc.importAPI('crawler', crawlerManifest, opts) const postsRPC = rpc.importAPI('posts', postsManifest, opts) const followgraphRPC = rpc.importAPI('followgraph', followgraphManifest, opts) @@ -158,6 +160,12 @@ exports.setup = function (rpc) { beaker.watchlist.remove = watchlistRPC.remove beaker.watchlist.createEventsStream = () => fromEventStream(watchlistRPC.createEventsStream()) + // beaker.crawler + beaker.crawler = {} + beaker.crawler.getCrawlStates = crawlerRPC.getCrawlStates + beaker.crawler.resetSite = crawlerRPC.resetSite + beaker.crawler.createEventsStream = () => fromEventStream(crawlerRPC.createEventsStream()) + // beaker.posts beaker.posts = {} beaker.posts.list = postsRPC.list diff --git a/web-apis/manifests/internal/crawler.js b/web-apis/manifests/internal/crawler.js new file mode 100644 index 00000000..686ac47a --- /dev/null +++ b/web-apis/manifests/internal/crawler.js @@ -0,0 +1,5 @@ +module.exports = { + getCrawlStates: 'promise', + resetSite: 'promise', + createEventsStream: 'readable' +} From 2cdcd3d65d081dbb2294b1d047c8744d76bc691b Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Mon, 17 Dec 2018 18:59:15 -0600 Subject: [PATCH 039/245] Add beaker.crawler.crawlSite --- crawler/index.js | 10 +++++++++- web-apis/fg/beaker.js | 1 + web-apis/manifests/internal/crawler.js | 1 + 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/crawler/index.js b/crawler/index.js index ea9aa953..8e997cb2 100644 --- a/crawler/index.js +++ b/crawler/index.js @@ -125,7 +125,15 @@ exports.resetSite = async function (url) { await db.run(`DELETE FROM crawl_sources WHERE url = ?`, [url]) } -exports.WEBAPI = {createEventsStream, getCrawlStates, resetSite} +exports.WEBAPI = { + createEventsStream, + getCrawlStates, + crawlSite: async (url) => { + var archive = await dat.library.getOrLoadArchive(url) + return crawlSite(archive) + }, + resetSite +} // internal methods // = diff --git a/web-apis/fg/beaker.js b/web-apis/fg/beaker.js index 54ee141a..81c420ad 100644 --- a/web-apis/fg/beaker.js +++ b/web-apis/fg/beaker.js @@ -163,6 +163,7 @@ exports.setup = function (rpc) { // beaker.crawler beaker.crawler = {} beaker.crawler.getCrawlStates = crawlerRPC.getCrawlStates + beaker.crawler.crawlSite = crawlerRPC.crawlSite beaker.crawler.resetSite = crawlerRPC.resetSite beaker.crawler.createEventsStream = () => fromEventStream(crawlerRPC.createEventsStream()) diff --git a/web-apis/manifests/internal/crawler.js b/web-apis/manifests/internal/crawler.js index 686ac47a..47365431 100644 --- a/web-apis/manifests/internal/crawler.js +++ b/web-apis/manifests/internal/crawler.js @@ -1,5 +1,6 @@ module.exports = { getCrawlStates: 'promise', + crawlSite: 'promise', resetSite: 'promise', createEventsStream: 'readable' } From 2d808973f006d81504f37d76d4f133cac1e270f1 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Tue, 18 Dec 2018 11:38:29 -0600 Subject: [PATCH 040/245] Enable foreign keys in sqlite --- lib/db.js | 7 +++++++ 1 file 
changed, 7 insertions(+) diff --git a/lib/db.js b/lib/db.js index 780e4956..94a4bf29 100644 --- a/lib/db.js +++ b/lib/db.js @@ -49,6 +49,13 @@ exports.makeSqliteTransactor = function (setupPromise) { // runs needed migrations, returns a promise exports.setupSqliteDB = function (db, {setup, migrations}, logTag) { return new Promise((resolve, reject) => { + // configure connection + db.run('PRAGMA foreign_keys = ON;', (err) => { + if (err) { + console.error('Failed to enable FK support in SQLite', err) + } + }) + // run migrations db.get('PRAGMA user_version;', (err, res) => { if (err) return reject(err) From c13cf914a6efea9cdf0d3fe5fffef7be48416a48 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Tue, 18 Dec 2018 11:38:41 -0600 Subject: [PATCH 041/245] Rework crawler progress events --- crawler/followgraph.js | 5 ++++- crawler/posts.js | 11 +++++++---- crawler/site-descriptions.js | 11 +++++++---- crawler/util.js | 6 +++++- 4 files changed, 23 insertions(+), 10 deletions(-) diff --git a/crawler/followgraph.js b/crawler/followgraph.js index af32237a..0340117d 100644 --- a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -6,7 +6,7 @@ const lock = require('../lib/lock') const db = require('../dbs/profile-data-db') const crawler = require('./index') const siteDescriptions = require('./site-descriptions') -const {doCrawl, doCheckpoint} = require('./util') +const {doCrawl, doCheckpoint, emitProgressEvent} = require('./util') const debug = require('../lib/debug-logger').debugLogger('crawler') // constants @@ -46,6 +46,8 @@ exports.crawlSite = async function (archive, crawlSource) { return } + emitProgressEvent(archive.url, 'crawl_followgraph', 0, 1) + // read and validate try { var followsJson = await readFollowsFile(archive) @@ -91,6 +93,7 @@ exports.crawlSite = async function (archive, crawlSource) { // write checkpoint as success await doCheckpoint('crawl_followgraph', TABLE_VERSION, crawlSource, changes[changes.length - 1].version) + emitProgressEvent(archive.url, 'crawl_followgraph', 1, 1) }) } diff --git a/crawler/posts.js b/crawler/posts.js index 8acefe9e..32db063a 100644 --- a/crawler/posts.js +++ b/crawler/posts.js @@ -3,7 +3,7 @@ const {URL} = require('url') const Events = require('events') const db = require('../dbs/profile-data-db') const crawler = require('./index') -const {doCrawl, doCheckpoint, getMatchingChangesInOrder, generateTimeFilename} = require('./util') +const {doCrawl, doCheckpoint, emitProgressEvent, getMatchingChangesInOrder, generateTimeFilename} = require('./util') const debug = require('../lib/debug-logger').debugLogger('crawler') // constants @@ -40,8 +40,10 @@ exports.crawlSite = async function (archive, crawlSource) { // collect changed posts var changedPosts = getMatchingChangesInOrder(changes, JSON_PATH_REGEX) console.log('collected changed posts', changedPosts) + emitProgressEvent(archive.url, 'crawl_posts', 0, changedPosts.length) // read and apply each post in order + var progress = 0 for (let changedPost of changedPosts) { // TODO Currently the crawler will abort reading the feed if any post fails to load // this means that a single bad or unreachable file can stop the forward progress of post indexing @@ -89,10 +91,11 @@ exports.crawlSite = async function (archive, crawlSource) { `, [crawlSource.id, changedPost.name, Date.now(), post.content, post.createdAt, post.updatedAt]) events.emit('post-added', archive.url) } - - // checkpoint our progress - await doCheckpoint('crawl_posts', TABLE_VERSION, crawlSource, changedPost.version) } + + // checkpoint our 
progress + await doCheckpoint('crawl_posts', TABLE_VERSION, crawlSource, changedPost.version) + emitProgressEvent(archive.url, 'crawl_posts', ++progress, changedPosts.length) } }) } diff --git a/crawler/site-descriptions.js b/crawler/site-descriptions.js index 7e9bb80d..d92ac1e4 100644 --- a/crawler/site-descriptions.js +++ b/crawler/site-descriptions.js @@ -6,7 +6,7 @@ const db = require('../dbs/profile-data-db') const archivesDb = require('../dbs/archives') const dat = require('../dat') const crawler = require('./index') -const {doCrawl, doCheckpoint, getMatchingChangesInOrder, generateTimeFilename} = require('./util') +const {doCrawl, doCheckpoint, emitProgressEvent, getMatchingChangesInOrder, generateTimeFilename} = require('./util') const debug = require('../lib/debug-logger').debugLogger('crawler') // constants @@ -43,8 +43,10 @@ exports.crawlSite = async function (archive, crawlSource) { // collect changed site descriptions var changedSiteDescriptions = getMatchingChangesInOrder(changes, JSON_PATH_REGEX) console.log('collected changed site descriptions', changedSiteDescriptions) + emitProgressEvent(archive.url, 'crawl_site_descriptions', 0, changedSiteDescriptions.length) // read and apply each post in order + var progress = 0 for (let changedSiteDescription of changedSiteDescriptions) { // TODO Currently the crawler will abort reading the feed if any description fails to load // this means that a single bad or unreachable file can stop the forward progress of description indexing @@ -95,10 +97,11 @@ exports.crawlSite = async function (archive, crawlSource) { VALUES (?, ?, ?, ?, ?, ?, ?, ?) `, [crawlSource.id, changedSiteDescription.name, Date.now(), desc.subject, desc.metadata.title, desc.metadata.description, desc.metadata.type.join(','), desc.createdAt]) events.emit('description-added', archive.url) - - // checkpoint our progress - await doCheckpoint('crawl_site_descriptions', TABLE_VERSION, crawlSource, changedSiteDescription.version) } + + // checkpoint our progress + await doCheckpoint('crawl_site_descriptions', TABLE_VERSION, crawlSource, changedSiteDescription.version) + emitProgressEvent(archive.url, 'crawl_site_descriptions', ++progress, changedSiteDescription.length) } }) } diff --git a/crawler/util.js b/crawler/util.js index 98c984a4..cbd3c549 100644 --- a/crawler/util.js +++ b/crawler/util.js @@ -54,7 +54,7 @@ exports.doCrawl = async function (archive, crawlSource, crawlDataset, crawlDatas } const doCheckpoint = exports.doCheckpoint = async function (crawlDataset, crawlDatasetVersion, crawlSource, crawlSourceVersion) { - crawlerEvents.emit('crawl-dataset-progress', {sourceUrl: crawlSource.url, crawlDataset, crawledVersion: crawlSourceVersion}) + await db.run(`DELETE FROM crawl_sources_meta WHERE crawlDataset = ? 
AND crawlSourceId = ?`, [crawlDataset, crawlSource.id]) await db.run(` INSERT @@ -63,6 +63,10 @@ const doCheckpoint = exports.doCheckpoint = async function (crawlDataset, crawlD `, [crawlDataset, crawlDatasetVersion, crawlSource.id, crawlSourceVersion, Date.now()]) } +exports.emitProgressEvent = function (sourceUrl, crawlDataset, progress, numUpdates) { + crawlerEvents.emit('crawl-dataset-progress', {sourceUrl, crawlDataset, progress, numUpdates}) +} + exports.getMatchingChangesInOrder = function (changes, regex) { var list = [] // order matters, must be oldest to newest changes.forEach(c => { From 2b562f6c0092b00654cab1e2d02ea8efae3edba5 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sun, 23 Dec 2018 17:29:08 -0600 Subject: [PATCH 042/245] Add beaker.templates internal web api --- crawler/util.js | 1 - dbs/schemas/profile-data.sql.js | 25 ++++++++++++------------ web-apis/bg.js | 3 +++ web-apis/bg/archives.js | 20 ------------------- web-apis/bg/templates.js | 22 +++++++++++++++++++++ web-apis/fg/beaker.js | 13 ++++++++---- web-apis/manifests/internal/archives.js | 6 ------ web-apis/manifests/internal/templates.js | 6 ++++++ 8 files changed, 52 insertions(+), 44 deletions(-) create mode 100644 web-apis/bg/templates.js create mode 100644 web-apis/manifests/internal/templates.js diff --git a/crawler/util.js b/crawler/util.js index cbd3c549..04a9f7b2 100644 --- a/crawler/util.js +++ b/crawler/util.js @@ -54,7 +54,6 @@ exports.doCrawl = async function (archive, crawlSource, crawlDataset, crawlDatas } const doCheckpoint = exports.doCheckpoint = async function (crawlDataset, crawlDatasetVersion, crawlSource, crawlSourceVersion) { - await db.run(`DELETE FROM crawl_sources_meta WHERE crawlDataset = ? AND crawlSourceId = ?`, [crawlDataset, crawlSource.id]) await db.run(` INSERT diff --git a/dbs/schemas/profile-data.sql.js b/dbs/schemas/profile-data.sql.js index a3305441..3879146e 100644 --- a/dbs/schemas/profile-data.sql.js +++ b/dbs/schemas/profile-data.sql.js @@ -100,6 +100,18 @@ CREATE TABLE watchlist ( FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE ); +-- list of the users current templates +CREATE TABLE templates ( + profileId INTEGER, + url TEXT NOT NULL, + title TEXT, + screenshot, + createdAt INTEGER DEFAULT (strftime('%s', 'now')), + + PRIMARY KEY (profileId, url), + FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE +); + -- list of sites being crawled CREATE TABLE crawl_sources ( id INTEGER PRIMARY KEY NOT NULL, @@ -157,19 +169,6 @@ CREATE TABLE crawl_followgraph ( FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE ); --- list of the users current templates --- deprecated (may return) -CREATE TABLE templates ( - profileId INTEGER, - url TEXT NOT NULL, - title TEXT, - screenshot, - createdAt INTEGER DEFAULT (strftime('%s', 'now')), - - PRIMARY KEY (profileId, url), - FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE -); - -- a list of the draft-dats for a master-dat -- deprecated CREATE TABLE archive_drafts ( diff --git a/web-apis/bg.js b/web-apis/bg.js index ac74937f..df7bfd95 100644 --- a/web-apis/bg.js +++ b/web-apis/bg.js @@ -10,6 +10,7 @@ const downloadsManifest = require('./manifests/internal/downloads') const historyManifest = require('./manifests/internal/history') const sitedataManifest = require('./manifests/internal/sitedata') const watchlistManifest = require('./manifests/internal/watchlist') +const templatesManifest = require('./manifests/internal/templates') const crawlerManifest = 
require('./manifests/internal/crawler') const postsManifest = require('./manifests/internal/posts') const followgraphManifest = require('./manifests/internal/followgraph') @@ -20,6 +21,7 @@ const bookmarksAPI = require('./bg/bookmarks') const historyAPI = require('./bg/history') const sitedataAPI = require('../dbs/sitedata').WEBAPI const watchlistAPI = require('./bg/watchlist') +const templatesAPI = require('./bg/templates') const crawlerAPI = require('../crawler').WEBAPI const postsAPI = require('./bg/posts') const followgraphAPI = require('./bg/followgraph') @@ -56,6 +58,7 @@ exports.setup = function () { globals.rpcAPI.exportAPI('history', historyManifest, historyAPI, internalOnly) globals.rpcAPI.exportAPI('sitedata', sitedataManifest, sitedataAPI, internalOnly) globals.rpcAPI.exportAPI('watchlist', watchlistManifest, watchlistAPI, internalOnly) + globals.rpcAPI.exportAPI('templates', templatesManifest, templatesAPI, internalOnly) globals.rpcAPI.exportAPI('crawler', crawlerManifest, crawlerAPI, internalOnly) globals.rpcAPI.exportAPI('posts', postsManifest, postsAPI, internalOnly) globals.rpcAPI.exportAPI('followgraph', followgraphManifest, followgraphAPI, internalOnly) diff --git a/web-apis/bg/archives.js b/web-apis/bg/archives.js index 60385262..6432e88c 100644 --- a/web-apis/bg/archives.js +++ b/web-apis/bg/archives.js @@ -1,7 +1,6 @@ const path = require('path') const mkdirp = require('mkdirp') const jetpack = require('fs-jetpack') -const templatesDb = require('../../dbs/templates') const datDns = require('../../dat/dns') const datLibrary = require('../../dat/library') const datGC = require('../../dat/garbage-collector') @@ -266,25 +265,6 @@ module.exports = { return archiveDraftsDb.remove(0, masterKey, draftKey) }, - // templates - // = - - async getTemplate (url) { - return templatesDb.get(0, url) - }, - - async listTemplates () { - return templatesDb.list(0) - }, - - async putTemplate (url, {title, screenshot}) { - return templatesDb.put(0, url, {title, screenshot}) - }, - - async removeTemplate (url) { - return templatesDb.remove(0, url) - }, - // internal management // = diff --git a/web-apis/bg/templates.js b/web-apis/bg/templates.js new file mode 100644 index 00000000..3d6c3eca --- /dev/null +++ b/web-apis/bg/templates.js @@ -0,0 +1,22 @@ +const templatesDb = require('../../dbs/templates') + +// exported api +// = + +module.exports = { + async get (url) { + return templatesDb.get(0, url) + }, + + async list () { + return templatesDb.list(0) + }, + + async put (url, {title, screenshot}) { + return templatesDb.put(0, url, {title, screenshot}) + }, + + async remove (url) { + return templatesDb.remove(0, url) + } +} \ No newline at end of file diff --git a/web-apis/fg/beaker.js b/web-apis/fg/beaker.js index 81c420ad..532b72cf 100644 --- a/web-apis/fg/beaker.js +++ b/web-apis/fg/beaker.js @@ -8,6 +8,7 @@ const downloadsManifest = require('../manifests/internal/downloads') const historyManifest = require('../manifests/internal/history') const sitedataManifest = require('../manifests/internal/sitedata') const watchlistManifest = require('../manifests/internal/watchlist') +const templatesManifest = require('../manifests/internal/templates') const crawlerManifest = require('../manifests/internal/crawler') const postsManifest = require('../manifests/internal/posts') const followgraphManifest = require('../manifests/internal/followgraph') @@ -25,6 +26,7 @@ exports.setup = function (rpc) { const historyRPC = rpc.importAPI('history', historyManifest, opts) const sitedataRPC = 
rpc.importAPI('sitedata', sitedataManifest, opts) const watchlistRPC = rpc.importAPI('watchlist', watchlistManifest, opts) + const templatesRPC = rpc.importAPI('templates', templatesManifest, opts) const crawlerRPC = rpc.importAPI('crawler', crawlerManifest, opts) const postsRPC = rpc.importAPI('posts', postsManifest, opts) const followgraphRPC = rpc.importAPI('followgraph', followgraphManifest, opts) @@ -49,10 +51,6 @@ exports.setup = function (rpc) { beaker.archives.listDrafts = archivesRPC.listDrafts beaker.archives.addDraft = archivesRPC.addDraft beaker.archives.removeDraft = archivesRPC.removeDraft - beaker.archives.getTemplate = archivesRPC.getTemplate - beaker.archives.listTemplates = archivesRPC.listTemplates - beaker.archives.putTemplate = archivesRPC.putTemplate - beaker.archives.removeTemplate = archivesRPC.removeTemplate beaker.archives.touch = archivesRPC.touch beaker.archives.clearFileCache = archivesRPC.clearFileCache beaker.archives.clearGarbage = archivesRPC.clearGarbage @@ -160,6 +158,13 @@ exports.setup = function (rpc) { beaker.watchlist.remove = watchlistRPC.remove beaker.watchlist.createEventsStream = () => fromEventStream(watchlistRPC.createEventsStream()) + // beaker.templates + beaker.templates = {} + beaker.templates.get = templatesRPC.get + beaker.templates.list = templatesRPC.list + beaker.templates.put = templatesRPC.put + beaker.templates.remove = templatesRPC.remove + // beaker.crawler beaker.crawler = {} beaker.crawler.getCrawlStates = crawlerRPC.getCrawlStates diff --git a/web-apis/manifests/internal/archives.js b/web-apis/manifests/internal/archives.js index 3eb9a6f9..d6eecebc 100644 --- a/web-apis/manifests/internal/archives.js +++ b/web-apis/manifests/internal/archives.js @@ -27,12 +27,6 @@ module.exports = { addDraft: 'promise', removeDraft: 'promise', - // templates - getTemplate: 'promise', - listTemplates: 'promise', - putTemplate: 'promise', - removeTemplate: 'promise', - // internal management touch: 'promise', clearFileCache: 'promise', diff --git a/web-apis/manifests/internal/templates.js b/web-apis/manifests/internal/templates.js new file mode 100644 index 00000000..78500b87 --- /dev/null +++ b/web-apis/manifests/internal/templates.js @@ -0,0 +1,6 @@ +module.exports = { + get: 'promise', + list: 'promise', + put: 'promise', + remove: 'promise' +} \ No newline at end of file From 76b31898acb62642ef0e6a78362081b254cc3b26 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sun, 23 Dec 2018 18:09:18 -0600 Subject: [PATCH 043/245] Fix limit/offset part of queries --- crawler/posts.js | 12 ++++++------ crawler/site-descriptions.js | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/crawler/posts.js b/crawler/posts.js index 32db063a..298d2b84 100644 --- a/crawler/posts.js +++ b/crawler/posts.js @@ -127,17 +127,17 @@ exports.list = async function ({offset, limit, reverse, author} = {}) { values.push(a) } } - if (offset) { - query += ` OFFSET ?` - values.push(offset) + query += ` ORDER BY createdAt` + if (reverse) { + query += ` DESC` } if (limit) { query += ` LIMIT ?` values.push(limit) } - query += ` ORDER BY createdAt` - if (reverse) { - query += ` DESC` + if (offset) { + query += ` OFFSET ?` + values.push(offset) } // execute query diff --git a/crawler/site-descriptions.js b/crawler/site-descriptions.js index d92ac1e4..7e9c18e2 100644 --- a/crawler/site-descriptions.js +++ b/crawler/site-descriptions.js @@ -159,17 +159,17 @@ const list = exports.list = async function ({offset, limit, reverse, author, sub } query += `) ` } - 
if (offset) { - query += ` OFFSET ?` - values.push(offset) + query += ` ORDER BY createdAt` + if (reverse) { + query += ` DESC` } if (limit) { query += ` LIMIT ?` values.push(limit) } - query += ` ORDER BY createdAt` - if (reverse) { - query += ` DESC` + if (offset) { + query += ` OFFSET ?` + values.push(offset) } // execute query From 1cd71bfe953d4ace8e672abf822555dcf1e498b4 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sun, 23 Dec 2018 18:17:03 -0600 Subject: [PATCH 044/245] Fix {authors} handling of posts.list --- crawler/posts.js | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/crawler/posts.js b/crawler/posts.js index 298d2b84..517c65eb 100644 --- a/crawler/posts.js +++ b/crawler/posts.js @@ -100,17 +100,21 @@ exports.crawlSite = async function (archive, crawlSource) { }) } -exports.list = async function ({offset, limit, reverse, author} = {}) { +exports.list = async function ({offset, limit, reverse, author, authors} = {}) { // validate & parse params assert(!offset || typeof offset === 'number', 'Offset must be a number') assert(!limit || typeof limit === 'number', 'Limit must be a number') assert(!reverse || typeof reverse === 'boolean', 'Reverse must be a boolean') - assert(!author || typeof author === 'string' || (Array.isArray(author) && author.every(isString)), 'Author must be a string or an array of strings') + assert(!author || typeof author === 'string', 'Author must be a string') + assert(!authors || (Array.isArray(authors) && authors.every(isString)), 'Authors must be an array of strings') if (author) { - author = Array.isArray(author) ? author : [author] - try { author = author.map(toOrigin) } - catch (e) { throw new Error('Author must contain valid URLs') } + authors = authors || [] + authors.push(author) + } + if (authors) { + try { authors = authors.map(toOrigin) } + catch (e) { throw new Error('Author/authors must contain valid URLs') } } // build query @@ -119,9 +123,9 @@ exports.list = async function ({offset, limit, reverse, author} = {}) { INNER JOIN crawl_sources src ON src.id = crawl_posts.crawlSourceId ` var values = [] - if (author) { + if (authors) { let op = 'WHERE' - for (let a of author) { + for (let a of authors) { query += ` ${op} src.url = ?` op = 'OR' values.push(a) From d25d940025d581af99774237051fd6ed6ba5aa1b Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sun, 23 Dec 2018 19:15:47 -0600 Subject: [PATCH 045/245] Add ?disable_fallback_page QP to dat:// --- dat/protocol.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dat/protocol.js b/dat/protocol.js index e018d7eb..39fe8847 100644 --- a/dat/protocol.js +++ b/dat/protocol.js @@ -262,7 +262,7 @@ exports.electronHandler = async function (request, respond) { debug('Entry not found:', urlp.path) // check for a fallback page - if (manifest && manifest.fallback_page) { + if (manifest && manifest.fallback_page && !urlp.query.disable_fallback_page) { await tryStat(manifest.fallback_page) } From 80d945fb66c9c23a19a2ab214c9baaa9988ee984 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Wed, 26 Dec 2018 22:41:53 -0600 Subject: [PATCH 046/245] dat:// - look for file extensions based on Accept header --- dat/protocol.js | 5 ++++- lib/mime.js | 10 ++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/dat/protocol.js b/dat/protocol.js index 39fe8847..6d4c55a3 100644 --- a/dat/protocol.js +++ b/dat/protocol.js @@ -213,7 +213,10 @@ exports.electronHandler = async function (request, respond) { await tryStat(filepath) } else { await 
tryStat(filepath) - await tryStat(filepath + '.html') // fallback to .html + for (let ext of mime.acceptHeaderExtensions(request.headers.Accept)) { + // fallback to different requested headers + await tryStat(filepath + ext) + } if (entry && entry.isDirectory()) { // unexpected directory, give the .html fallback a chance let dirEntry = entry diff --git a/lib/mime.js b/lib/mime.js index 57bb74b3..6b0b6d79 100644 --- a/lib/mime.js +++ b/lib/mime.js @@ -63,6 +63,16 @@ exports.isFileContentBinary = async function (fsInstance, filepath) { }) } +// for a given HTTP accept header, provide a list of file-extensions to try +exports.acceptHeaderExtensions = function (accept) { + var exts = [] + accept = accept.split(',') + if (accept.includes('text/html') || (accept.length === 1 && accept[0] === '*/*')) exts.push('.html') + if (accept.includes('text/css')) exts.push('.css') + if (accept.includes('image/*') || accept.includes('image/apng')) exts = exts.concat(['.png', '.jpg', '.jpeg', '.gif']) + return exts +} + // pulled from https://github.com/gjtorikian/isBinaryFile function isBinaryCheck (bytes) { var size = bytes.length From 4e45caa73d9fe913c8f99a88aa6240f4e535dc34 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Tue, 1 Jan 2019 12:00:41 -0600 Subject: [PATCH 047/245] Fix exception when no Accept header is set --- lib/mime.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/mime.js b/lib/mime.js index 6b0b6d79..7d21e2a9 100644 --- a/lib/mime.js +++ b/lib/mime.js @@ -66,7 +66,7 @@ exports.isFileContentBinary = async function (fsInstance, filepath) { // for a given HTTP accept header, provide a list of file-extensions to try exports.acceptHeaderExtensions = function (accept) { var exts = [] - accept = accept.split(',') + accept = (accept || '').split(',') if (accept.includes('text/html') || (accept.length === 1 && accept[0] === '*/*')) exts.push('.html') if (accept.includes('text/css')) exts.push('.css') if (accept.includes('image/*') || accept.includes('image/apng')) exts = exts.concat(['.png', '.jpg', '.jpeg', '.gif']) From 2bca05f7d7f7bd492fe72ced7a0d90649f7460cb Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Tue, 1 Jan 2019 13:11:54 -0600 Subject: [PATCH 048/245] Fix dat download-as-zip --- dat/protocol.js | 4 ++-- lib/zip.js | 42 ++++++++++++++++++++++++++++++++++++++++++ package-lock.json | 19 ------------------- package.json | 4 ++-- 4 files changed, 46 insertions(+), 23 deletions(-) create mode 100644 lib/zip.js diff --git a/dat/protocol.js b/dat/protocol.js index 6d4c55a3..d146a752 100644 --- a/dat/protocol.js +++ b/dat/protocol.js @@ -4,7 +4,7 @@ const parseRange = require('range-parser') const once = require('once') const debug = require('../lib/debug-logger').debugLogger('dat-serve') const intoStream = require('into-stream') -const toZipStream = require('hyperdrive-to-zip-stream') +const {toZipStream} = require('../lib/zip') const slugify = require('slugify') const datDns = require('./dns') @@ -173,7 +173,7 @@ exports.electronHandler = async function (request, respond) { }) } else { // serve the zip - var zs = toZipStream(archive, filepath) + var zs = toZipStream(checkoutFS, filepath) zs.on('error', err => console.log('Error while producing .zip file', err)) return respond({ statusCode: 200, diff --git a/lib/zip.js b/lib/zip.js new file mode 100644 index 00000000..ee7cb310 --- /dev/null +++ b/lib/zip.js @@ -0,0 +1,42 @@ +const {join} = require('path') +const yazl = require('yazl') + +exports.toZipStream = function (archive, dirpath) { + var zipfile = new 
yazl.ZipFile() + + // create listing stream + dirpath = dirpath || '/' + archive.pda.readdir(dirpath, {recursive: true}).then(async (paths) => { + for (let path of paths) { + let readPath = join(dirpath, path) + + // files only + try { + let entry = await archive.pda.stat(readPath) + if (!entry.isFile()) { + continue + } + } catch (e) { + // ignore, file must have been removed + continue + } + + // pipe each entry into the zip + console.log('go go go', readPath, path) + zipfile.addBuffer(await archive.pda.readFile(readPath, 'binary'), path) + // NOTE + // for some reason using archive.createReadStream() to feed into the zipfile addReadStream() was not working with multiple files + // no idea why, maybe a sign of a bug in the dat-daemon's zip rpc + // -prf + } + zipfile.end() + }).catch(onerror) + + // on error, push to the output stream + function onerror (e) { + console.error('Error while producing zip stream', e) + zipfile.outputStream.emit('error', e) + } + + return zipfile.outputStream +} \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index 2133c6cc..c8f5688b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1831,17 +1831,6 @@ } } }, - "hyperdrive-to-zip-stream": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/hyperdrive-to-zip-stream/-/hyperdrive-to-zip-stream-2.1.1.tgz", - "integrity": "sha512-qHZKedMzgbLLGtY33SRm8J3n2mPCJ3v/Uhyr78W20h9MN99cwhaoKMiH9WpXy9eFCtj/EWFJRoHXqBNOvzNSXw==", - "requires": { - "from2": "^2.3.0", - "pauls-dat-api": "^8.0.1", - "through2-concurrent": "^1.1.1", - "yazl": "^2.4.2" - } - }, "iconv-lite": { "version": "0.4.24", "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", @@ -3975,14 +3964,6 @@ "xtend": "~4.0.1" } }, - "through2-concurrent": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/through2-concurrent/-/through2-concurrent-1.1.1.tgz", - "integrity": "sha1-EctOpMnjG8puTB5tukjRxyjDUks=", - "requires": { - "through2": "^2.0.0" - } - }, "thunky": { "version": "0.1.0", "resolved": "http://registry.npmjs.org/thunky/-/thunky-0.1.0.tgz", diff --git a/package.json b/package.json index 502fd75f..7f72977f 100644 --- a/package.json +++ b/package.json @@ -47,7 +47,6 @@ "hypercore-protocol": "^6.9.0", "hyperdrive": "^9.14.0", "hyperdrive-network-speed": "^2.1.0", - "hyperdrive-to-zip-stream": "^2.1.1", "identify-filetype": "^1.0.0", "into-stream": "^3.1.0", "lodash.debounce": "^4.0.8", @@ -81,7 +80,8 @@ "supports-sparse-files": "^1.0.2", "textextensions": "^2.4.0", "through2": "^2.0.5", - "utp-native": "^2.1.3" + "utp-native": "^2.1.3", + "yazl": "^2.5.1" }, "devDependencies": { "eslint": "^4.19.1", From a8453d089d81f7d44174404ce08bd5d8386f3dcf Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Thu, 3 Jan 2019 17:16:38 -0600 Subject: [PATCH 049/245] Add beaker.crawler.listSuggestions --- crawler/index.js | 1 + crawler/search.js | 49 ++++++++++++++++++++++++++ web-apis/fg/beaker.js | 1 + web-apis/manifests/internal/crawler.js | 1 + 4 files changed, 52 insertions(+) create mode 100644 crawler/search.js diff --git a/crawler/index.js b/crawler/index.js index 8e997cb2..799af9a0 100644 --- a/crawler/index.js +++ b/crawler/index.js @@ -126,6 +126,7 @@ exports.resetSite = async function (url) { } exports.WEBAPI = { + listSuggestions: require('./search').listSuggestions, createEventsStream, getCrawlStates, crawlSite: async (url) => { diff --git a/crawler/search.js b/crawler/search.js new file mode 100644 index 00000000..a3bd85f4 --- /dev/null +++ b/crawler/search.js @@ -0,0 
+1,49 @@ +const bookmarksDb = require('../dbs/bookmarks') +const historyDb = require('../dbs/history') +const datLibrary = require('../dat/library') + +const BUILTIN_PAGES = [ + {title: 'Feed', url: 'beaker://feed'}, + {title: 'Library', url: 'beaker://library'}, + {title: 'Search', url: 'beaker://search'}, + {title: 'Bookmarks', url: 'beaker://bookmarks'}, + {title: 'History', url: 'beaker://history'}, + {title: 'Watchlist', url: 'beaker://watchlist'}, + {title: 'Downloads', url: 'beaker://downloads'}, + {title: 'Settings', url: 'beaker://settings'}, +] + +// exported api +// = + +exports.listSuggestions = async function (query = '', opts = {}) { + var suggestions = {} + const filterFn = a => ((a.url || a.href).includes(query) || a.title.toLowerCase().includes(query)) + + // builtin pages + suggestions.apps = BUILTIN_PAGES.filter(filterFn) + + // bookmarks + var bookmarkResults = await bookmarksDb.listBookmarks(0) + if (opts.filterPins) { + bookmarkResults = bookmarkResults.filter(b => !b.pinned && filterFn(b)) + } else { + bookmarkResults = bookmarkResults.filter(filterFn) + } + bookmarkResults = bookmarkResults.slice(0, 12) + suggestions.bookmarks = bookmarkResults.map(b => ({title: b.title, url: b.href})) + + // library + var libraryResults = await datLibrary.queryArchives({isSaved: true}) + libraryResults = libraryResults.filter(filterFn) + suggestions.library = libraryResults.slice(0, 12) + + // fetch history + if (query) { + var historyResults = await historyDb.search(query) + suggestions.history = historyResults.slice(0, 12) + suggestions.history.sort((a, b) => a.url.length - b.url.length) // shorter urls at top + } + + return suggestions +} diff --git a/web-apis/fg/beaker.js b/web-apis/fg/beaker.js index 532b72cf..80a5f7ea 100644 --- a/web-apis/fg/beaker.js +++ b/web-apis/fg/beaker.js @@ -167,6 +167,7 @@ exports.setup = function (rpc) { // beaker.crawler beaker.crawler = {} + beaker.crawler.listSuggestions = crawlerRPC.listSuggestions beaker.crawler.getCrawlStates = crawlerRPC.getCrawlStates beaker.crawler.crawlSite = crawlerRPC.crawlSite beaker.crawler.resetSite = crawlerRPC.resetSite diff --git a/web-apis/manifests/internal/crawler.js b/web-apis/manifests/internal/crawler.js index 47365431..2cd65b21 100644 --- a/web-apis/manifests/internal/crawler.js +++ b/web-apis/manifests/internal/crawler.js @@ -1,4 +1,5 @@ module.exports = { + listSuggestions: 'promise', getCrawlStates: 'promise', crawlSite: 'promise', resetSite: 'promise', From 07d703f8d64b1d837c3f58de2af7ff14d451cc95 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Fri, 4 Jan 2019 12:21:33 -0600 Subject: [PATCH 050/245] Rename feed -> timeline --- crawler/search.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler/search.js b/crawler/search.js index a3bd85f4..a8b86136 100644 --- a/crawler/search.js +++ b/crawler/search.js @@ -3,7 +3,7 @@ const historyDb = require('../dbs/history') const datLibrary = require('../dat/library') const BUILTIN_PAGES = [ - {title: 'Feed', url: 'beaker://feed'}, + {title: 'Timeline', url: 'beaker://timeline'}, {title: 'Library', url: 'beaker://library'}, {title: 'Search', url: 'beaker://search'}, {title: 'Bookmarks', url: 'beaker://bookmarks'}, From d42556b24e7560867a09b4601c25aa5b6f4c2114 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Fri, 4 Jan 2019 13:51:55 -0600 Subject: [PATCH 051/245] Split search results into types --- crawler/search.js | 26 ++++++++++++++------------ lib/dat.js | 9 +++++++++ package-lock.json | 5 +++++ package.json | 1 + 4 files changed, 29 
insertions(+), 12 deletions(-) create mode 100644 lib/dat.js diff --git a/crawler/search.js b/crawler/search.js index a8b86136..9b6aa307 100644 --- a/crawler/search.js +++ b/crawler/search.js @@ -1,6 +1,8 @@ +const _groupBy = require('lodash.groupby') const bookmarksDb = require('../dbs/bookmarks') const historyDb = require('../dbs/history') const datLibrary = require('../dat/library') +const {getBasicType} = require('../lib/dat') const BUILTIN_PAGES = [ {title: 'Timeline', url: 'beaker://timeline'}, @@ -23,23 +25,23 @@ exports.listSuggestions = async function (query = '', opts = {}) { // builtin pages suggestions.apps = BUILTIN_PAGES.filter(filterFn) - // bookmarks - var bookmarkResults = await bookmarksDb.listBookmarks(0) - if (opts.filterPins) { - bookmarkResults = bookmarkResults.filter(b => !b.pinned && filterFn(b)) - } else { - bookmarkResults = bookmarkResults.filter(filterFn) - } - bookmarkResults = bookmarkResults.slice(0, 12) - suggestions.bookmarks = bookmarkResults.map(b => ({title: b.title, url: b.href})) - // library var libraryResults = await datLibrary.queryArchives({isSaved: true}) libraryResults = libraryResults.filter(filterFn) - suggestions.library = libraryResults.slice(0, 12) + Object.assign(suggestions, _groupBy(libraryResults, a => getBasicType(a.type))) - // fetch history if (query) { + // bookmarks + var bookmarkResults = await bookmarksDb.listBookmarks(0) + if (opts.filterPins) { + bookmarkResults = bookmarkResults.filter(b => !b.pinned && filterFn(b)) + } else { + bookmarkResults = bookmarkResults.filter(filterFn) + } + bookmarkResults = bookmarkResults.slice(0, 12) + suggestions.bookmarks = bookmarkResults.map(b => ({title: b.title, url: b.href})) + + // history var historyResults = await historyDb.search(query) suggestions.history = historyResults.slice(0, 12) suggestions.history.sort((a, b) => a.url.length - b.url.length) // shorter urls at top diff --git a/lib/dat.js b/lib/dat.js new file mode 100644 index 00000000..0aaef3cc --- /dev/null +++ b/lib/dat.js @@ -0,0 +1,9 @@ +exports.getBasicType = function (type) { + if (type && Array.isArray(type)) { + if (type.includes('user')) return 'user' + if (type.includes('web-page')) return 'web-page' + if (type.includes('file-share')) return 'file-share' + if (type.includes('image-collection')) return 'image-collection' + } + return 'other' +} diff --git a/package-lock.json b/package-lock.json index c8f5688b..f6b7944f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2325,6 +2325,11 @@ "resolved": "https://registry.npmjs.org/lodash.get/-/lodash.get-4.4.2.tgz", "integrity": "sha1-LRd/ZS+jHpObRDjVNBSZ36OCXpk=" }, + "lodash.groupby": { + "version": "4.6.0", + "resolved": "https://registry.npmjs.org/lodash.groupby/-/lodash.groupby-4.6.0.tgz", + "integrity": "sha1-Cwih3PaDl8OXhVwyOXg4Mt90A9E=" + }, "lodash.isequal": { "version": "4.5.0", "resolved": "https://registry.npmjs.org/lodash.isequal/-/lodash.isequal-4.5.0.tgz", diff --git a/package.json b/package.json index 7f72977f..18ef8ddd 100644 --- a/package.json +++ b/package.json @@ -52,6 +52,7 @@ "lodash.debounce": "^4.0.8", "lodash.difference": "^4.5.0", "lodash.get": "^4.4.2", + "lodash.groupby": "^4.6.0", "lodash.isequal": "^4.5.0", "lodash.pick": "^4.4.0", "lodash.throttle": "^4.1.1", From 5763d0c3ae19c774fb76acf809d1b70be46b197a Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Thu, 10 Jan 2019 16:27:14 -0600 Subject: [PATCH 052/245] Add beaker.crawler.listSearchResults and FTS indexes --- crawler/index.js | 1 + crawler/search.js | 235 
++++++++++++++++++++++++- dbs/profile-data-db.js | 3 +- dbs/schemas/profile-data.sql.js | 28 ++- dbs/schemas/profile-data.v25.sql.js | 32 ++++ package-lock.json | 5 + package.json | 1 + web-apis/fg/beaker.js | 1 + web-apis/manifests/internal/crawler.js | 1 + 9 files changed, 303 insertions(+), 4 deletions(-) create mode 100644 dbs/schemas/profile-data.v25.sql.js diff --git a/crawler/index.js b/crawler/index.js index 799af9a0..ef27b57f 100644 --- a/crawler/index.js +++ b/crawler/index.js @@ -127,6 +127,7 @@ exports.resetSite = async function (url) { exports.WEBAPI = { listSuggestions: require('./search').listSuggestions, + listSearchResults: require('./search').listSearchResults, createEventsStream, getCrawlStates, crawlSite: async (url) => { diff --git a/crawler/search.js b/crawler/search.js index 9b6aa307..6c92b7db 100644 --- a/crawler/search.js +++ b/crawler/search.js @@ -1,12 +1,17 @@ const _groupBy = require('lodash.groupby') +const _uniqWith = require('lodash.uniqwith') +const db = require('../dbs/profile-data-db') const bookmarksDb = require('../dbs/bookmarks') const historyDb = require('../dbs/history') const datLibrary = require('../dat/library') +const followgraph = require('./followgraph') +const siteDescriptions = require('./site-descriptions') const {getBasicType} = require('../lib/dat') +/** @type {Array} */ const BUILTIN_PAGES = [ {title: 'Timeline', url: 'beaker://timeline'}, - {title: 'Library', url: 'beaker://library'}, + {title: 'Your Library', url: 'beaker://library'}, {title: 'Search', url: 'beaker://search'}, {title: 'Bookmarks', url: 'beaker://bookmarks'}, {title: 'History', url: 'beaker://history'}, @@ -15,9 +20,33 @@ const BUILTIN_PAGES = [ {title: 'Settings', url: 'beaker://settings'}, ] +// typedefs +// = + // exported api // = +/** + * @description + * Get suggested content of various types. + * + * @param {string} [query=''] - The search query. + * @param {Object} [opts={}] + * @param {boolean} [opts.filterPins] - If true, will filter out pinned bookmarks. + * @returns {Promise} + * + * @typedef {Object} SuggestionResults + * @prop {Array} apps + * @prop {Array} people + * @prop {Array} webPages + * @prop {Array} fileShares + * @prop {Array} imageCollections + * @prop {Array} others + * @prop {(undefined|Array)} bookmarks + * @prop {(undefined|Array)} history + * + * TODO: make the return values much more concrete + */ exports.listSuggestions = async function (query = '', opts = {}) { var suggestions = {} const filterFn = a => ((a.url || a.href).includes(query) || a.title.toLowerCase().includes(query)) @@ -28,7 +57,12 @@ exports.listSuggestions = async function (query = '', opts = {}) { // library var libraryResults = await datLibrary.queryArchives({isSaved: true}) libraryResults = libraryResults.filter(filterFn) - Object.assign(suggestions, _groupBy(libraryResults, a => getBasicType(a.type))) + libraryResults = _groupBy(libraryResults, a => getBasicType(a.type)) + suggestions.people = libraryResults.user + suggestions.webPages = libraryResults['web-page'] + suggestions.fileShares = libraryResults['file-share'] + suggestions.imageCollections = libraryResults['image-collection'] + suggestions.others = libraryResults.other if (query) { // bookmarks @@ -49,3 +83,200 @@ exports.listSuggestions = async function (query = '', opts = {}) { return suggestions } + +/** + * @description + * Run a search query against crawled data. + * + * @param {Object} opts + * @param {string} opts.user - The current user's URL. + * @param {string} [opts.query] - The search query. 
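Together with the beaker.js and manifest wiring later in this patch, these options translate into a call from a builtin page roughly like the sketch below; userUrl is a placeholder for the current user's dat URL and is not defined by the patch.

// minimal usage sketch (from an async context) of the new search API
const results = await beaker.crawler.listSearchResults({
  user: userUrl,                         // placeholder: the local user's dat:// URL
  query: 'p2p web',
  hops: 1,
  types: {people: true, posts: true},
  limit: 20
})
// matched terms come back wrapped in {nonce}...{/nonce} perimeters (see highlightNonce)
console.log(results.people, results.posts)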
+ * @param {number} [opts.hops=1] - How many hops out in the user's follow graph should be included? + * @param {Object} [opts.types] - Content types to query. Defaults to all. + * @param {boolean} [opts.types.people] + * @param {boolean} [opts.types.posts] + * @param {number} [opts.since] - Filter results to items created since the given timestamp. + * @param {number} [opts.offset] + * @param {number} [opts.limit = 20] + * @returns {Promise} + * + * Search results: + * @typedef {Object} SearchResults + * @prop {number} highlightNonce - A number used to create perimeters around text that should be highlighted. + * @prop {(null|Array)} people + * @prop {(null|Array)} posts + * + * People search results: + * @typedef {Object} PeopleSearchResult + * @prop {string} url + * @prop {string} title + * @prop {string} description + * @prop {Array} followedBy + * @prop {bool} followsUser + * @prop {Object} author + * @prop {string} author.url + * + * Post search results: + * @typedef {Object} PostSearchResult + * @prop {string} url + * @prop {SiteDescription} author + * @prop {string} content + * @prop {string} createdAt + * @prop {string} [updatedAt] + * + * Site description objects: + * @typedef {Object} SiteDescription + * @prop {string} url + * @prop {string} [title] + * @prop {string} [description] + * @prop {Array} [type] + * @prop {Object} [author] + * @prop {string} [author.url] + */ +exports.listSearchResults = async function (opts) { + const highlightNonce = (Math.random() * 1e3)|0 + const startHighlight = `{${highlightNonce}}` + const endHighlight = `{/${highlightNonce}}` + + var searchResults = { + highlightNonce, + people: null, + posts: null + } + var {user, query, hops, types, since, offset, limit} = opts + if (!types || typeof types !== 'object') { + types = {people: true, posts: true} + } + since = since || 0 + offset = offset || 0 + limit = limit || 20 + hops = Math.min(Math.max(Math.floor(hops), 1), 2) // clamp to [1, 2] for now + + // prep search terms + if (query && typeof query === 'string') { + query = query + .toLowerCase() // all lowercase. (uppercase is interpretted as a directive by sqlite.) + .replace(/[:^*\.]/g, ' ') // strip symbols that sqlite interprets. + query += '*' // match prefixes + } + + // get user's crawl_source id + var userCrawlSourceId + { + let res = await db.get(`SELECT id FROM crawl_sources WHERE url = ?`, [user]) + userCrawlSourceId = res.id + } + + // construct set of crawl sources to query + var crawlSourceIds + if (hops === 2) { + // the user and all followed sources + let res = await db.all(` + SELECT id FROM crawl_sources src + INNER JOIN crawl_followgraph fgraph ON fgraph.destUrl = src.url AND fgraph.crawlSourceId = ? 
+ `, [userCrawlSourceId]) + crawlSourceIds = [userCrawlSourceId].concat(res.map(({id}) => id)) + } else if (hops === 1) { + // just the user + crawlSourceIds = [userCrawlSourceId] + } + + // run queries + if (types.people) { + if (query) { + searchResults.people = await db.all(` + SELECT + desc.subject AS url, + descSrc.url AS authorUrl, + SNIPPET(crawl_site_descriptions_fts_index, 0, '${startHighlight}', '${endHighlight}', '...', 25) AS title, + SNIPPET(crawl_site_descriptions_fts_index, 1, '${startHighlight}', '${endHighlight}', '...', 25) AS description + FROM crawl_site_descriptions_fts_index desc_fts + INNER JOIN crawl_site_descriptions desc ON desc.rowid = desc_fts.rowid + INNER JOIN crawl_followgraph fgraph ON fgraph.destUrl = desc.subject AND fgraph.crawlSourceId IN (${crawlSourceIds.join(',')}) + INNER JOIN crawl_sources descSrc ON desc.crawlSourceId = descSrc.id + WHERE crawl_site_descriptions_fts_index MATCH ? + ORDER BY rank + LIMIT ? + OFFSET ?; + `, [query, limit, offset]) + } else { + searchResults.people = await db.all(` + SELECT desc.subject AS url, desc.title, desc.description, descSrc.url AS authorUrl + FROM crawl_site_descriptions desc + INNER JOIN crawl_followgraph fgraph ON fgraph.destUrl = desc.subject AND fgraph.crawlSourceId IN (${crawlSourceIds.join(',')}) + INNER JOIN crawl_sources descSrc ON desc.crawlSourceId = descSrc.id + ORDER BY desc.createdAt + LIMIT ? + OFFSET ?; + `, [limit, offset]) + } + searchResults.people = _uniqWith(searchResults.people, (a, b) => a.url === b.url) + await Promise.all(searchResults.people.map(async (p) => { + // fetch additional info + p.followedBy = await followgraph.listFollowers(p.url, {includeDesc: true}) + p.followsUser = await followgraph.isAFollowingB(p.url, user) + + // massage attrs + p.author = {url: p.authorUrl} + delete p.authorUrl + })) + } + if (types.posts) { + if (query) { + searchResults.posts = await db.all(` + SELECT + SNIPPET(crawl_posts_fts_index, 0, '${startHighlight}', '${endHighlight}', '...', 25) AS content, + post.pathname, + post.createdAt, + post.updatedAt, + postSrc.url AS authorUrl + FROM crawl_posts_fts_index post_fts + INNER JOIN crawl_posts post ON post.rowid = post_fts.rowid + INNER JOIN crawl_sources postSrc ON post.crawlSourceId = postSrc.id + LEFT JOIN crawl_followgraph fgraph ON fgraph.destUrl = postSrc.url + WHERE + crawl_posts_fts_index MATCH ? + AND (fgraph.crawlSourceId IN (${crawlSourceIds.join(',')}) OR post.crawlSourceId = ?) + AND post.createdAt >= ? + ORDER BY rank + LIMIT ? + OFFSET ?; + `, [query, userCrawlSourceId, since, limit, offset]) + } else { + searchResults.posts = await db.all(` + SELECT post.content, post.pathname, post.createdAt, post.updatedAt, postSrc.url AS authorUrl + FROM crawl_posts post + INNER JOIN crawl_sources postSrc ON post.crawlSourceId = postSrc.id + LEFT JOIN crawl_followgraph fgraph ON fgraph.destUrl = postSrc.url + WHERE + (fgraph.crawlSourceId IN (${crawlSourceIds.join(',')}) OR post.crawlSourceId = ?) + AND post.createdAt >= ? + ORDER BY post.createdAt DESC + LIMIT ? 
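Because SNIPPET() wraps matches in the {nonce}...{/nonce} perimeters defined above, a renderer has to HTML-escape the text before swapping the perimeters for markup; a minimal sketch of that step (the helper name is illustrative, not part of the patch):

// hedged sketch: convert highlight perimeters to markup after escaping
function toHighlightedHTML (text, highlightNonce) {
  const esc = s => s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;')
  return esc(text)
    .split(`{${highlightNonce}}`).join('<strong>')
    .split(`{/${highlightNonce}}`).join('</strong>')
}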
+ OFFSET ?; + `, [userCrawlSourceId, since, limit, offset]) + } + await Promise.all(searchResults.posts.map(async (p) => { + // fetch additional info + p.author = await siteDescriptions.getBest({subject: p.authorUrl}) + + // massage attrs + p.url = p.authorUrl + p.pathname + delete p.authorUrl + delete p.pathname + })) + // TODO hops == 2 + /*searchResults.posts = await db.all(` + SELECT post.content, post.pathname, postSrc.url + FROM crawl_posts post + INNER JOIN crawl_sources postSrc ON post.crawlSourceId = postSrc.id + INNER JOIN crawl_followgraph fgraph ON fgraph.destUrl = postSrc.url AND fgraph.crawlSourceId = ? + WHERE (post.content MATCH ?) AND (post.createdAt >= ?) + ORDER BY rank + LIMIT ? + OFFSET ?; + `, [userCrawlSourceId, query, since, offset, limit])*/ + } + + return searchResults +} diff --git a/dbs/profile-data-db.js b/dbs/profile-data-db.js index a4c919b4..542ea376 100644 --- a/dbs/profile-data-db.js +++ b/dbs/profile-data-db.js @@ -79,7 +79,8 @@ migrations = [ migration('profile-data.v21.sql'), migration('profile-data.v22.sql', {canFail: true}), // canFail for the same reason as v16, ffs migration('profile-data.v23.sql'), - migration('profile-data.v24.sql') + migration('profile-data.v24.sql'), + migration('profile-data.v25.sql') ] function migration (file, opts = {}) { return cb => { diff --git a/dbs/schemas/profile-data.sql.js b/dbs/schemas/profile-data.sql.js index 3879146e..f59bada2 100644 --- a/dbs/schemas/profile-data.sql.js +++ b/dbs/schemas/profile-data.sql.js @@ -144,6 +144,19 @@ CREATE TABLE crawl_site_descriptions ( PRIMARY KEY (crawlSourceId, pathname), FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE ); +CREATE VIRTUAL TABLE crawl_site_descriptions_fts_index USING fts5(title, description, content='crawl_site_descriptions'); + +-- triggers to keep crawl_site_descriptions_fts_index updated +CREATE TRIGGER crawl_site_descriptions_ai AFTER INSERT ON crawl_site_descriptions BEGIN + INSERT INTO crawl_site_descriptions_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); +END; +CREATE TRIGGER crawl_site_descriptions_ad AFTER DELETE ON crawl_site_descriptions BEGIN + INSERT INTO crawl_site_descriptions_fts_index(crawl_site_descriptions_fts_index, rowid, title, description) VALUES('delete', old.rowid, old.title, old.description); +END; +CREATE TRIGGER crawl_site_descriptions_au AFTER UPDATE ON crawl_site_descriptions BEGIN + INSERT INTO crawl_site_descriptions_fts_index(crawl_site_descriptions_fts_index, rowid, title, description) VALUES('delete', old.a, old.title, old.description); + INSERT INTO crawl_site_descriptions_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); +END; -- crawled posts CREATE TABLE crawl_posts ( @@ -157,6 +170,19 @@ CREATE TABLE crawl_posts ( FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE ); +CREATE VIRTUAL TABLE crawl_posts_fts_index USING fts5(content, content='crawl_posts'); + +-- triggers to keep crawl_posts_fts_index updated +CREATE TRIGGER crawl_posts_ai AFTER INSERT ON crawl_posts BEGIN + INSERT INTO crawl_posts_fts_index(rowid, content) VALUES (new.rowid, new.content); +END; +CREATE TRIGGER crawl_posts_ad AFTER DELETE ON crawl_posts BEGIN + INSERT INTO crawl_posts_fts_index(crawl_posts_fts_index, rowid, content) VALUES('delete', old.rowid, old.content); +END; +CREATE TRIGGER crawl_posts_au AFTER UPDATE ON crawl_posts BEGIN + INSERT INTO crawl_posts_fts_index(crawl_posts_fts_index, rowid, content) VALUES('delete', 
old.rowid, old.content); + INSERT INTO crawl_posts_fts_index(rowid, content) VALUES (new.rowid, new.content); +END; -- crawled follows CREATE TABLE crawl_followgraph ( @@ -232,5 +258,5 @@ INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Report an issu INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Explore the p2p Web', 'dat://taravancil.com/explore-the-p2p-web.md', 1); INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Support Beaker', 'https://opencollective.com/beaker', 1); -PRAGMA user_version = 24; +PRAGMA user_version = 25; ` diff --git a/dbs/schemas/profile-data.v25.sql.js b/dbs/schemas/profile-data.v25.sql.js new file mode 100644 index 00000000..b84e73a1 --- /dev/null +++ b/dbs/schemas/profile-data.v25.sql.js @@ -0,0 +1,32 @@ +module.exports = ` + +-- add full-text search indexes +CREATE VIRTUAL TABLE crawl_site_descriptions_fts_index USING fts5(title, description, content='crawl_site_descriptions'); +CREATE VIRTUAL TABLE crawl_posts_fts_index USING fts5(content, content='crawl_posts'); + +-- triggers to keep crawl_site_descriptions_fts_index updated +CREATE TRIGGER crawl_site_descriptions_ai AFTER INSERT ON crawl_site_descriptions BEGIN + INSERT INTO crawl_site_descriptions_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); +END; +CREATE TRIGGER crawl_site_descriptions_ad AFTER DELETE ON crawl_site_descriptions BEGIN + INSERT INTO crawl_site_descriptions_fts_index(crawl_site_descriptions_fts_index, rowid, title, description) VALUES('delete', old.rowid, old.title, old.description); +END; +CREATE TRIGGER crawl_site_descriptions_au AFTER UPDATE ON crawl_site_descriptions BEGIN + INSERT INTO crawl_site_descriptions_fts_index(crawl_site_descriptions_fts_index, rowid, title, description) VALUES('delete', old.a, old.title, old.description); + INSERT INTO crawl_site_descriptions_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); +END; + +-- triggers to keep crawl_posts_fts_index updated +CREATE TRIGGER crawl_posts_ai AFTER INSERT ON crawl_posts BEGIN + INSERT INTO crawl_posts_fts_index(rowid, content) VALUES (new.rowid, new.content); +END; +CREATE TRIGGER crawl_posts_ad AFTER DELETE ON crawl_posts BEGIN + INSERT INTO crawl_posts_fts_index(crawl_posts_fts_index, rowid, content) VALUES('delete', old.rowid, old.content); +END; +CREATE TRIGGER crawl_posts_au AFTER UPDATE ON crawl_posts BEGIN + INSERT INTO crawl_posts_fts_index(crawl_posts_fts_index, rowid, content) VALUES('delete', old.rowid, old.content); + INSERT INTO crawl_posts_fts_index(rowid, content) VALUES (new.rowid, new.content); +END; + +PRAGMA user_version = 25; +` \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index f6b7944f..abce505b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2345,6 +2345,11 @@ "resolved": "https://registry.npmjs.org/lodash.throttle/-/lodash.throttle-4.1.1.tgz", "integrity": "sha1-wj6RtxAkKscMN/HhzaknTMOb8vQ=" }, + "lodash.uniqwith": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/lodash.uniqwith/-/lodash.uniqwith-4.5.0.tgz", + "integrity": "sha1-egy/ZfQ7WShiWp1NDcVLGMrcfvM=" + }, "lru": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/lru/-/lru-3.1.0.tgz", diff --git a/package.json b/package.json index 18ef8ddd..f933fb4a 100644 --- a/package.json +++ b/package.json @@ -56,6 +56,7 @@ "lodash.isequal": "^4.5.0", "lodash.pick": "^4.4.0", "lodash.throttle": "^4.1.1", + "lodash.uniqwith": "^4.5.0", "mime": "^1.4.0", 
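The triggers above keep the external-content fts5 tables in sync with their base tables, so a crawled post can be found with MATCH as soon as it is inserted; a query sketch in the same style as the search.js code in this patch (illustrative only):

// hedged sketch: full-text lookup against the new crawl_posts index
var rows = await db.all(`
  SELECT post.pathname, snippet(crawl_posts_fts_index, 0, '[', ']', '...', 25) AS excerpt
    FROM crawl_posts_fts_index post_fts
    INNER JOIN crawl_posts post ON post.rowid = post_fts.rowid
    WHERE crawl_posts_fts_index MATCH ?
`, ['beaker*'])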
"mkdirp": "^0.5.1", "moment": "^2.23.0", diff --git a/web-apis/fg/beaker.js b/web-apis/fg/beaker.js index 80a5f7ea..819a0ec3 100644 --- a/web-apis/fg/beaker.js +++ b/web-apis/fg/beaker.js @@ -168,6 +168,7 @@ exports.setup = function (rpc) { // beaker.crawler beaker.crawler = {} beaker.crawler.listSuggestions = crawlerRPC.listSuggestions + beaker.crawler.listSearchResults = crawlerRPC.listSearchResults beaker.crawler.getCrawlStates = crawlerRPC.getCrawlStates beaker.crawler.crawlSite = crawlerRPC.crawlSite beaker.crawler.resetSite = crawlerRPC.resetSite diff --git a/web-apis/manifests/internal/crawler.js b/web-apis/manifests/internal/crawler.js index 2cd65b21..af7554a2 100644 --- a/web-apis/manifests/internal/crawler.js +++ b/web-apis/manifests/internal/crawler.js @@ -1,5 +1,6 @@ module.exports = { listSuggestions: 'promise', + listSearchResults: 'promise', getCrawlStates: 'promise', crawlSite: 'promise', resetSite: 'promise', From 8bab9d1bf29d716ddbe3c7cb45de552906a27bb2 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Fri, 11 Jan 2019 17:33:17 -0600 Subject: [PATCH 053/245] Update the known_sites implementation to use folder-based captures and to index self-descriptions. Also added a lot of jsdocs to the crawler. --- crawler/followgraph.js | 107 +++++++++--- crawler/index.js | 1 + crawler/posts.js | 100 +++++++++++- crawler/search.js | 119 +++++++------- crawler/site-descriptions.js | 244 ++++++++++++++++------------ crawler/util.js | 65 +++++++- dbs/profile-data-db.js | 3 +- dbs/schemas/profile-data.sql.js | 7 +- dbs/schemas/profile-data.v24.sql.js | 30 +++- dbs/schemas/profile-data.v25.sql.js | 32 ---- 10 files changed, 475 insertions(+), 233 deletions(-) delete mode 100644 dbs/schemas/profile-data.v25.sql.js diff --git a/crawler/followgraph.js b/crawler/followgraph.js index 0340117d..7f967e2e 100644 --- a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -16,6 +16,19 @@ const TABLE_VERSION = 1 const JSON_TYPE = 'unwalled.garden/follows' const JSON_PATH = '/data/follows.json' +// typedefs +// = + +/** + * @typedef CrawlSourceRecord {import('./util').CrawlSourceRecord} + * @typedef SiteDescription {import('./site-descriptions').SiteDescription} + * + * @typedef {Object} SiteDescriptionWithFollowData + * @extends {SiteDescription} + * @prop {boolean} [followsUser] - does this site follow the specified user site? + * @prop {Array} [followedBy] - list of sites following this site. + */ + // globals // = @@ -28,6 +41,14 @@ exports.on = events.on.bind(events) exports.addListener = events.addListener.bind(events) exports.removeListener = events.removeListener.bind(events) +/** + * @description + * Crawl the given site for follows. + * + * @param {InternalDatArchive} archive - site to crawl. + * @param {CrawlSourceRecord} crawlSource - internal metadata about the crawl target. + * @returns {Promise} + */ exports.crawlSite = async function (archive, crawlSource) { return doCrawl(archive, crawlSource, 'crawl_followgraph', TABLE_VERSION, async ({changes, resetRequired}) => { const supressEvents = resetRequired === true // dont emit when replaying old info @@ -97,11 +118,16 @@ exports.crawlSite = async function (archive, crawlSource) { }) } -// List sites that follow subject -// - subject. String (URL). -// - opts.followedBy. String (URL). -// - opts.includeDesc. Boolean. -// - returns Array +/** + * @description + * List sites that follow subject. 
+ * + * @param {string} subject - (URL) + * @param {Object} [opts] + * @param {string} [opts.followedBy] - (URL) filter results to those followed by the site specified with this param. Causes .followsUser boolean to be set. + * @param {boolean} [opts.includeDesc] - output a site description instead of a simple URL. + * @returns {(Promise>|Promise>)} + */ const listFollowers = exports.listFollowers = async function (subject, {followedBy, includeDesc} = {}) { var rows if (followedBy) { @@ -138,12 +164,17 @@ const listFollowers = exports.listFollowers = async function (subject, {followed })) } -// List sites that subject follows -// - subject. String (URL). -// - opts.followedBy. String (URL). Filters to users who are followed by the URL specified. Causes .followsUser boolean to be set. -// - opts.includeDesc. Boolean. -// - opts.includeFollowers. Boolean. Requires includeDesc to be true. -// - returns Array +/** + * @description + * List sites that subject follows. + * + * @param {string} subject - (URL) + * @param {Object} [opts] + * @param {string} [opts.followedBy] - (URL) filter results to those followed by the site specified with this param. Causes .followsUser boolean to be set. + * @param {boolean} [opts.includeDesc] - output a site description instead of a simple URL. + * @param {boolean} [opts.includeFollowers] - include .followedBy in the result. Requires includeDesc to be true. + * @returns {(Promise>|Promise>)} + */ const listFollows = exports.listFollows = async function (subject, {followedBy, includeDesc, includeFollowers} = {}) { var rows = await db.all(` SELECT crawl_followgraph.destUrl @@ -169,10 +200,15 @@ const listFollows = exports.listFollows = async function (subject, {followedBy, })) } -// List sites that are followed by sites that the subject follows -// - subject. String (URL). -// - opts.followedBy. String (URL). Filters to users who are followed by the URL specified. Causes .followsUser boolean to be set. -// - returns Array +/** + * @description + * List sites that are followed by sites that the subject follows. + * + * @param {string} subject - (URL) + * @param {Object} [opts] + * @param {string} [opts.followedBy] - (URL) filter results to those followed by the site specified with this param. Causes .followsUser boolean to be set. + * @returns {Promise>} + */ const listFoaFs = exports.listFoaFs = async function (subject, {followedBy} = {}) { var foafs = [] // list URLs followed by subject @@ -196,10 +232,14 @@ const listFoaFs = exports.listFoaFs = async function (subject, {followedBy} = {} return foafs } -// Check for the existence of an individual follow -// - a. String (URL), the site being queried. -// - b. String (URL), does a follow this site? -// - returns bool +/** + * @description + * Check for the existence of an individual follow. + * + * @param {string} a - (URL) the site being queried. + * @param {string} b - (URL) does a follow this site? + * @returns {Promise} + */ const isAFollowingB = exports.isAFollowingB = async function (a, b) { a = toOrigin(a) b = toOrigin(b) @@ -214,6 +254,14 @@ const isAFollowingB = exports.isAFollowingB = async function (a, b) { return !!res } +/** + * @description + * Add a follow to the given archive. 
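A minimal sketch of the write path these two methods expose, again assuming the module is required as followgraph; userArchive and the dat URL are placeholders:

// hedged sketch: follow() edits the archive's follows.json and captures the
// followed site's description in the background; unfollow() reverses the edit
await followgraph.follow(userArchive, 'dat://example.com')
console.log(await followgraph.isAFollowingB(userArchive.url, 'dat://example.com')) // true once re-crawled
await followgraph.unfollow(userArchive, 'dat://example.com')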
+ * + * @param {InternalDatArchive} archive + * @param {string} followUrl + * @returns {Promise} + */ exports.follow = async function (archive, followUrl) { // normalize followUrl followUrl = toOrigin(followUrl) @@ -230,6 +278,14 @@ exports.follow = async function (archive, followUrl) { /* dont await */siteDescriptions.capture(archive, followUrl) } +/** + * @description + * Remove a follow from the given archive. + * + * @param {InternalDatArchive} archive + * @param {string} followUrl + * @returns {Promise} + */ exports.unfollow = async function (archive, followUrl) { // normalize followUrl followUrl = toOrigin(followUrl) @@ -247,6 +303,10 @@ exports.unfollow = async function (archive, followUrl) { // internal methods // = +/** + * @param {string} url + * @returns {string} + */ function toOrigin (url) { try { url = new URL(url) @@ -256,6 +316,10 @@ function toOrigin (url) { } } +/** + * @param {InternalDatArchive} archive + * @returns {Promise} + */ async function readFollowsFile (archive) { try { var followsJson = await archive.pda.readFile(JSON_PATH, 'utf8') @@ -271,6 +335,11 @@ async function readFollowsFile (archive) { return followsJson } +/** + * @param {InternalDatArchive} archive + * @param {(Object) => undefined} updateFn + * @returns {Promise} + */ async function updateFollowsFile (archive, updateFn) { var release = await lock('crawler:followgraph:' + archive.url) try { diff --git a/crawler/index.js b/crawler/index.js index ef27b57f..bcb1c692 100644 --- a/crawler/index.js +++ b/crawler/index.js @@ -89,6 +89,7 @@ exports.crawlSite = async function (archive) { siteDescriptions.crawlSite(archive, crawlSource) ]) } catch (err) { + console.error('Crawler error', {sourceUrl: archive.url, err: err.toString()}) crawlerEvents.emit('crawl-error', {sourceUrl: archive.url, err: err.toString()}) } finally { crawlerEvents.emit('crawl-finish', {sourceUrl: archive.url}) diff --git a/crawler/posts.js b/crawler/posts.js index 517c65eb..c5e33378 100644 --- a/crawler/posts.js +++ b/crawler/posts.js @@ -3,6 +3,7 @@ const {URL} = require('url') const Events = require('events') const db = require('../dbs/profile-data-db') const crawler = require('./index') +const siteDescriptions = require('./site-descriptions') const {doCrawl, doCheckpoint, emitProgressEvent, getMatchingChangesInOrder, generateTimeFilename} = require('./util') const debug = require('../lib/debug-logger').debugLogger('crawler') @@ -13,6 +14,22 @@ const TABLE_VERSION = 1 const JSON_TYPE = 'unwalled.garden/post' const JSON_PATH_REGEX = /^\/data\/posts\/([^/]+)\.json$/i +// typedefs +// = + +/** + * @typedef CrawlSourceRecord {import('./util').CrawlSourceRecord} + * @typedef SiteDescription { import("./site-descriptions").SiteDescription } + * + * @typedef {Object} Post + * @prop {string} pathname + * @prop {string} content + * @prop {number} crawledAt + * @prop {number} createdAt + * @prop {number} updatedAt + * @prop {SiteDescription} author + */ + // globals // = @@ -25,6 +42,14 @@ exports.on = events.on.bind(events) exports.addListener = events.addListener.bind(events) exports.removeListener = events.removeListener.bind(events) +/** + * @description + * Crawl the given site for posts. + * + * @param {InternalDatArchive} archive - site to crawl. + * @param {CrawlSourceRecord} crawlSource - internal metadata about the crawl target. 
+ * @returns {Promise} + */ exports.crawlSite = async function (archive, crawlSource) { return doCrawl(archive, crawlSource, 'crawl_posts', TABLE_VERSION, async ({changes, resetRequired}) => { const supressEvents = resetRequired === true // dont emit when replaying old info @@ -100,6 +125,18 @@ exports.crawlSite = async function (archive, crawlSource) { }) } +/** + * @description + * List crawled posts. + * + * @param {Object} [opts] + * @param {string} [opts.author] - (URL) filter descriptions to those created by this author. + * @param {Array} [opts.authors] - (URL) filter descriptions to those created by these authors. + * @param {number} [opts.offset] + * @param {number} [opts.limit] + * @param {boolean} [opts.reverse] + * @returns {Promise>} + */ exports.list = async function ({offset, limit, reverse, author, authors} = {}) { // validate & parse params assert(!offset || typeof offset === 'number', 'Offset must be a number') @@ -145,9 +182,18 @@ exports.list = async function ({offset, limit, reverse, author, authors} = {}) { } // execute query - return (await db.all(query, values)).map(massagePostRow) + var rows = await db.all(query, values) + return Promise.all(rows.map(massagePostRow)) } +/** + * @description + * Get crawled post. + * + * @param {string} url - The URL of the post or of the author (if pathname is provided). + * @param {string} [pathname] - The pathname of the post. + * @returns {Promise} + */ const get = exports.get = async function (url, pathname = undefined) { // validate & parse params if (url) { @@ -157,7 +203,7 @@ const get = exports.get = async function (url, pathname = undefined) { pathname = pathname || url.pathname // execute query - return massagePostRow(await db.get(` + return await massagePostRow(await db.get(` SELECT crawl_posts.*, src.url AS crawlSourceUrl FROM crawl_posts @@ -169,6 +215,15 @@ const get = exports.get = async function (url, pathname = undefined) { `, [url.origin, pathname])) } +/** + * @description + * Create a new post. + * + * @param {InternalDatArchive} archive - where to write the post to. + * @param {Object} post + * @param {string} post.content + * @returns {Promise} + */ exports.create = async function (archive, {content} = {}) { assert(typeof content === 'string', 'Create() must be provided a `content` string') var filename = generateTimeFilename() @@ -182,6 +237,16 @@ exports.create = async function (archive, {content} = {}) { await crawler.crawlSite(archive) } +/** + * @description + * Update the content of an existing post. + * + * @param {InternalDatArchive} archive - where to write the post to. + * @param {string} pathname - the pathname of the post. + * @param {Object} post + * @param {string} post.content + * @returns {Promise} + */ exports.edit = async function (archive, pathname, {content} = {}) { assert(typeof pathname === 'string', 'Edit() must be provided a valid URL string') assert(typeof content === 'string', 'Edit() must be provided a `content` string') @@ -195,6 +260,14 @@ exports.edit = async function (archive, pathname, {content} = {}) { await crawler.crawlSite(archive) } +/** + * @description + * Delete an existing post + * + * @param {InternalDatArchive} archive - where to write the post to. + * @param {string} pathname - the pathname of the post. 
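A sketch of a full round trip through this posts API, using only the signatures documented in this file; illustrative only, not part of this commit (require paths assume the repo root):

const dat = require('./dat')
const posts = require('./crawler/posts')

async function examplePostLifecycle (userUrl) {
  const archive = await dat.library.getOrLoadArchive(userUrl)

  await posts.create(archive, {content: 'hello, distributed web'})

  // author, reverse, and limit are the list() filters documented above
  const [latest] = await posts.list({author: userUrl, reverse: true, limit: 1})

  await posts.edit(archive, latest.pathname, {content: 'hello (edited)'})
  await posts.delete(archive, latest.pathname)
}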
+ * @returns {Promise} + */ exports.delete = async function (archive, pathname) { assert(typeof pathname === 'string', 'Delete() must be provided a valid URL string') await archive.pda.unlink(pathname) @@ -204,23 +277,42 @@ exports.delete = async function (archive, pathname) { // internal methods // = +/** + * @param {string} v + * @returns {boolean} + */ function isString (v) { return typeof v === 'string' } +/** + * @param {string} url + * @returns {string} + */ function toOrigin (url) { url = new URL(url) return url.protocol + '//' + url.hostname } +/** + * @param {InternalDatArchive} archive + * @param {string} pathname + * @returns {Promise} + */ async function ensureDirectory (archive, pathname) { try { await archive.pda.mkdir(pathname) } catch (e) { /* ignore */ } } -function massagePostRow (row) { +/** + * @param {Object} row + * @returns {Post} + */ +async function massagePostRow (row) { if (!row) return null - row.author = {url: row.crawlSourceUrl} + row.author = await siteDescriptions.getBest({subject: row.crawlSourceUrl}) +console.log('author for', row.author, row) + if (!row.author) row.author = {url: row.crawlSourceUrl} delete row.crawlSourceUrl delete row.crawlSourceId return row diff --git a/crawler/search.js b/crawler/search.js index 6c92b7db..abe5278f 100644 --- a/crawler/search.js +++ b/crawler/search.js @@ -7,6 +7,7 @@ const datLibrary = require('../dat/library') const followgraph = require('./followgraph') const siteDescriptions = require('./site-descriptions') const {getBasicType} = require('../lib/dat') +const {getSiteDescriptionThumbnailUrl} = require('./util') /** @type {Array} */ const BUILTIN_PAGES = [ @@ -23,17 +24,8 @@ const BUILTIN_PAGES = [ // typedefs // = -// exported api -// = - /** - * @description - * Get suggested content of various types. - * - * @param {string} [query=''] - The search query. - * @param {Object} [opts={}] - * @param {boolean} [opts.filterPins] - If true, will filter out pinned bookmarks. - * @returns {Promise} + * @typedef SiteDescription { import("./site-descriptions").SiteDescription } * * @typedef {Object} SuggestionResults * @prop {Array} apps @@ -45,7 +37,42 @@ const BUILTIN_PAGES = [ * @prop {(undefined|Array)} bookmarks * @prop {(undefined|Array)} history * - * TODO: make the return values much more concrete + * TODO: define the SuggestionResults values + * + * @typedef {Object} SearchResults + * @prop {number} highlightNonce - A number used to create perimeters around text that should be highlighted. + * @prop {(null|Array)} people + * @prop {(null|Array)} posts + * + * @typedef {Object} PeopleSearchResult + * @prop {string} url + * @prop {string} title + * @prop {string} description + * @prop {Array} followedBy + * @prop {bool} followsUser + * @prop {string} thumbUrl + * @prop {Object} author + * @prop {string} author.url + * + * @typedef {Object} PostSearchResult + * @prop {string} url + * @prop {SiteDescription} author + * @prop {string} content + * @prop {number} createdAt + * @prop {number} updatedAt + */ + +// exported api +// = + +/** + * @description + * Get suggested content of various types. + * + * @param {string} [query=''] - The search query. + * @param {Object} [opts={}] + * @param {boolean} [opts.filterPins] - If true, will filter out pinned bookmarks. 
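A sketch of how a builtin page would reach these functions over the beaker.crawler RPC wiring added earlier in this series; the global beaker object is assumed, the option names are taken from the surrounding JSDoc and function body, and the backing implementation may require further options (e.g. the querying user). Illustrative only:

async function exampleSearchUi () {
  // suggestions for the URL bar; filterPins drops already-pinned bookmarks
  const suggestions = await beaker.crawler.listSuggestions('beak', {filterPins: true})
  console.log(suggestions.bookmarks, suggestions.history)

  // full search across crawled people and posts
  const results = await beaker.crawler.listSearchResults({query: 'p2p', limit: 20})
  console.log(results.people, results.posts)
}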
+ * @returns {Promise} */ exports.listSuggestions = async function (query = '', opts = {}) { var suggestions = {} @@ -99,39 +126,6 @@ exports.listSuggestions = async function (query = '', opts = {}) { * @param {number} [opts.offset] * @param {number} [opts.limit = 20] * @returns {Promise} - * - * Search results: - * @typedef {Object} SearchResults - * @prop {number} highlightNonce - A number used to create perimeters around text that should be highlighted. - * @prop {(null|Array)} people - * @prop {(null|Array)} posts - * - * People search results: - * @typedef {Object} PeopleSearchResult - * @prop {string} url - * @prop {string} title - * @prop {string} description - * @prop {Array} followedBy - * @prop {bool} followsUser - * @prop {Object} author - * @prop {string} author.url - * - * Post search results: - * @typedef {Object} PostSearchResult - * @prop {string} url - * @prop {SiteDescription} author - * @prop {string} content - * @prop {string} createdAt - * @prop {string} [updatedAt] - * - * Site description objects: - * @typedef {Object} SiteDescription - * @prop {string} url - * @prop {string} [title] - * @prop {string} [description] - * @prop {Array} [type] - * @prop {Object} [author] - * @prop {string} [author.url] */ exports.listSearchResults = async function (opts) { const highlightNonce = (Math.random() * 1e3)|0 @@ -186,29 +180,38 @@ exports.listSearchResults = async function (opts) { if (query) { searchResults.people = await db.all(` SELECT - desc.subject AS url, + desc.url AS url, descSrc.url AS authorUrl, SNIPPET(crawl_site_descriptions_fts_index, 0, '${startHighlight}', '${endHighlight}', '...', 25) AS title, SNIPPET(crawl_site_descriptions_fts_index, 1, '${startHighlight}', '${endHighlight}', '...', 25) AS description FROM crawl_site_descriptions_fts_index desc_fts INNER JOIN crawl_site_descriptions desc ON desc.rowid = desc_fts.rowid - INNER JOIN crawl_followgraph fgraph ON fgraph.destUrl = desc.subject AND fgraph.crawlSourceId IN (${crawlSourceIds.join(',')}) + LEFT JOIN crawl_followgraph fgraph ON fgraph.destUrl = desc.url INNER JOIN crawl_sources descSrc ON desc.crawlSourceId = descSrc.id - WHERE crawl_site_descriptions_fts_index MATCH ? + WHERE + crawl_site_descriptions_fts_index MATCH ? + AND ( + fgraph.crawlSourceId IN (${crawlSourceIds.join(',')}) -- description by a followed user + OR (desc.url = ? AND desc.crawlSourceId = ?) -- description by me about me + ) ORDER BY rank LIMIT ? OFFSET ?; - `, [query, limit, offset]) + `, [query, user, userCrawlSourceId, limit, offset]) } else { searchResults.people = await db.all(` - SELECT desc.subject AS url, desc.title, desc.description, descSrc.url AS authorUrl + SELECT desc.url AS url, desc.title, desc.description, descSrc.url AS authorUrl FROM crawl_site_descriptions desc - INNER JOIN crawl_followgraph fgraph ON fgraph.destUrl = desc.subject AND fgraph.crawlSourceId IN (${crawlSourceIds.join(',')}) + LEFT JOIN crawl_followgraph fgraph ON fgraph.destUrl = desc.url INNER JOIN crawl_sources descSrc ON desc.crawlSourceId = descSrc.id - ORDER BY desc.createdAt + WHERE ( + fgraph.crawlSourceId IN (${crawlSourceIds.join(',')}) -- description by a followed user + OR (desc.url = ? AND desc.crawlSourceId = ?) -- description by me about me + ) + ORDER BY desc.title LIMIT ? 
OFFSET ?; - `, [limit, offset]) + `, [user, userCrawlSourceId, limit, offset]) } searchResults.people = _uniqWith(searchResults.people, (a, b) => a.url === b.url) await Promise.all(searchResults.people.map(async (p) => { @@ -217,6 +220,7 @@ exports.listSearchResults = async function (opts) { p.followsUser = await followgraph.isAFollowingB(p.url, user) // massage attrs + p.thumbUrl = getSiteDescriptionThumbnailUrl(p.authorUrl, p.url) p.author = {url: p.authorUrl} delete p.authorUrl })) @@ -265,17 +269,6 @@ exports.listSearchResults = async function (opts) { delete p.authorUrl delete p.pathname })) - // TODO hops == 2 - /*searchResults.posts = await db.all(` - SELECT post.content, post.pathname, postSrc.url - FROM crawl_posts post - INNER JOIN crawl_sources postSrc ON post.crawlSourceId = postSrc.id - INNER JOIN crawl_followgraph fgraph ON fgraph.destUrl = postSrc.url AND fgraph.crawlSourceId = ? - WHERE (post.content MATCH ?) AND (post.createdAt >= ?) - ORDER BY rank - LIMIT ? - OFFSET ?; - `, [userCrawlSourceId, query, since, offset, limit])*/ } return searchResults diff --git a/crawler/site-descriptions.js b/crawler/site-descriptions.js index 7e9c18e2..91dafd26 100644 --- a/crawler/site-descriptions.js +++ b/crawler/site-descriptions.js @@ -1,20 +1,42 @@ const assert = require('assert') const {URL} = require('url') const Events = require('events') -const _pick = require('lodash.pick') const db = require('../dbs/profile-data-db') const archivesDb = require('../dbs/archives') const dat = require('../dat') const crawler = require('./index') -const {doCrawl, doCheckpoint, emitProgressEvent, getMatchingChangesInOrder, generateTimeFilename} = require('./util') +const { + doCrawl, + doCheckpoint, + emitProgressEvent, + getMatchingChangesInOrder, + generateTimeFilename, + getSiteDescriptionThumbnailUrl, + toHostname +} = require('./util') const debug = require('../lib/debug-logger').debugLogger('crawler') // constants // = const TABLE_VERSION = 1 -const JSON_TYPE = 'unwalled.garden/site-description' -const JSON_PATH_REGEX = /^\/data\/known_sites\/([^/]+)\.json$/i +const JSON_PATH_REGEX = /^\/(dat\.json|data\/known_sites\/([^/]+)\/dat\.json)$/i + +// typedefs +// = + +/** + * @typedef CrawlSourceRecord {import('./util').CrawlSourceRecord} + * + * @typedef {Object} SiteDescription + * @prop {string} url + * @prop {string} title + * @prop {string} description + * @prop {Array} type + * @prop {string} thumbUrl + * @prop {Object} descAuthor + * @prop {string} descAuthor.url + */ // globals // = @@ -28,6 +50,14 @@ exports.on = events.on.bind(events) exports.addListener = events.addListener.bind(events) exports.removeListener = events.removeListener.bind(events) +/** + * @description + * Crawl the given site for site descriptions. + * + * @param {InternalDatArchive} archive - site to crawl. + * @param {CrawlSourceRecord} crawlSource - internal metadata about the crawl target. 
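To make the new indexing surface concrete, a quick check of which paths the JSON_PATH_REGEX above accepts (illustrative only; the hostnames are made up):

const re = /^\/(dat\.json|data\/known_sites\/([^/]+)\/dat\.json)$/i

console.log(re.test('/dat.json'))                             // true: the site's own manifest (self-description)
console.log(re.test('/data/known_sites/alice.com/dat.json'))  // true: a folder-based capture of alice.com
console.log(re.test('/data/known_sites/alice.com/thumb.png')) // false: thumbs are captured but not indexed here
console.log(re.test('/data/known_sites/alice.com.json'))      // false: the old single-file capture format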
+ * @returns {Promise} + */ exports.crawlSite = async function (archive, crawlSource) { return doCrawl(archive, crawlSource, 'crawl_site_descriptions', TABLE_VERSION, async ({changes, resetRequired}) => { const supressEvents = resetRequired === true // dont emit when replaying old info @@ -52,11 +82,15 @@ exports.crawlSite = async function (archive, crawlSource) { // this means that a single bad or unreachable file can stop the forward progress of description indexing // to solve this, we need to find a way to tolerate bad description-files without losing our ability to efficiently detect new posts // -prf + + // determine the url + let url = getUrlFromDescriptionPath(archive, changedSiteDescription.name) + if (changedSiteDescription.type === 'del') { // delete await db.run(` - DELETE FROM crawl_site_descriptions WHERE crawlSourceId = ? AND pathname = ? - `, [crawlSource.id, changedSiteDescription.name]) + DELETE FROM crawl_site_descriptions WHERE crawlSourceId = ? AND url = ? + `, [crawlSource.id, url]) events.emit('description-removed', archive.url) } else { // read and validate @@ -64,38 +98,30 @@ exports.crawlSite = async function (archive, crawlSource) { try { desc = JSON.parse(await archive.pda.readFile(changedSiteDescription.name, 'utf8')) assert(typeof desc === 'object', 'File be an object') - assert(desc.type === 'unwalled.garden/site-description', 'JSON .type must be unwalled.garden/site-description') - assert(typeof desc.subject === 'string', 'JSON .subject must be a URL string') - try { let subject = new URL(desc.subject) } - catch (e) { throw new Error('JSON .subject must be a URL string') } - assert(desc.metadata && typeof desc.metadata === 'object', 'JSON .metadata must be object') - assert(typeof desc.createdAt === 'string', 'JSON .createdAt must be a date-time') - assert(!isNaN(Number(new Date(desc.createdAt))), 'JSON .createdAt must be a date-time') } catch (err) { + console.error('Failed to read site-description file', {url: archive.url, name: changedSiteDescription.name, err}) debug('Failed to read site-description file', {url: archive.url, name: changedSiteDescription.name, err}) return // abort indexing } // massage the description - desc.subject = toOrigin(desc.subject) - desc.metadata.title = typeof desc.metadata.title === 'string' ? desc.metadata.title : '' - desc.metadata.description = typeof desc.metadata.description === 'string' ? desc.metadata.description : '' - if (typeof desc.metadata.type === 'string') desc.metadata.type = desc.metadata.type.split(',') - if (Array.isArray(desc.metadata.type)) { - desc.metadata.type = desc.metadata.type.filter(isString) + desc.title = typeof desc.title === 'string' ? desc.title : '' + desc.description = typeof desc.description === 'string' ? desc.description : '' + if (typeof desc.type === 'string') desc.type = desc.type.split(',') + if (Array.isArray(desc.type)) { + desc.type = desc.type.filter(isString) } else { - desc.metadata.type = [] + desc.type = [] } - desc.createdAt = Number(new Date(desc.createdAt)) // replace await db.run(` - DELETE FROM crawl_site_descriptions WHERE crawlSourceId = ? AND pathname = ? - `, [crawlSource.id, changedSiteDescription.name]) + DELETE FROM crawl_site_descriptions WHERE crawlSourceId = ? AND url = ? + `, [crawlSource.id, url]) await db.run(` - INSERT OR REPLACE INTO crawl_site_descriptions (crawlSourceId, pathname, crawledAt, subject, title, description, type, createdAt) - VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
- `, [crawlSource.id, changedSiteDescription.name, Date.now(), desc.subject, desc.metadata.title, desc.metadata.description, desc.metadata.type.join(','), desc.createdAt]) + INSERT OR REPLACE INTO crawl_site_descriptions (crawlSourceId, crawledAt, url, title, description, type) + VALUES (?, ?, ?, ?, ?, ?) + `, [crawlSource.id, Date.now(), url, desc.title, desc.description, desc.type.join(',')]) events.emit('description-added', archive.url) } @@ -106,6 +132,18 @@ exports.crawlSite = async function (archive, crawlSource) { }) } +/** + * @description + * List crawled site descriptions. + * + * @param {Object} [opts] + * @param {string} [opts.subject] - (URL) filter descriptions to those which describe this subject. + * @param {string} [opts.author] - (URL) filter descriptions to those created by this author. + * @param {number} [opts.offset] + * @param {number} [opts.limit] + * @param {boolean} [opts.reverse] + * @returns {Promise>} + */ const list = exports.list = async function ({offset, limit, reverse, author, subject} = {}) { // validate & parse params assert(!offset || typeof offset === 'number', 'Offset must be a number') @@ -153,13 +191,12 @@ const list = exports.list = async function ({offset, limit, reverse, author, sub query += `(` let op = `` for (let s of subject) { - query += `${op} subject = ?` + query += `${op} crawl_site_descriptions.url = ?` op = ` OR` values.push(s) } query += `) ` } - query += ` ORDER BY createdAt` if (reverse) { query += ` DESC` } @@ -176,141 +213,140 @@ const list = exports.list = async function ({offset, limit, reverse, author, sub return (await db.all(query, values)).map(massageSiteDescriptionRow) } +/** + * @description + * Get the most trustworthy site description available. + * + * @param {Object} [opts] + * @param {string} [opts.subject] - (URL) filter descriptions to those which describe this subject. + * @param {string} [opts.author] - (URL) filter descriptions to those created by this author. + * @returns {Promise} + */ exports.getBest = async function ({subject, author} = {}) { - // TODO - // while the archivesdb is more recent, it won't have the thumbnail - // -prf - // check archivesDb meta - // var meta = await archivesDb.getMeta(subject) - // if (meta) { - // return _pick(meta, ['title', 'description', 'type']) - // } - - // check for descriptions + // TODO choose based on trust var descriptions = await list({subject, author}) - return _pick(descriptions[0] || {}, ['title', 'description', 'type', 'author']) -} - -const get = exports.get = async function (url, pathname = undefined) { - // validate & parse params - if (url) { - try { url = new URL(url) } - catch (e) { throw new Error('Failed to parse post URL: ' + url) } - } - pathname = pathname || url.pathname - - // execute query - return massageSiteDescriptionRow(await db.get(` - SELECT - crawl_site_descriptions.*, src.url AS crawlSourceUrl - FROM crawl_site_descriptions - INNER JOIN crawl_sources src - ON src.id = crawl_site_descriptions.crawlSourceId - AND src.url = ? - WHERE - crawl_site_descriptions.pathname = ? - `, [url.origin, pathname])) + return descriptions[0] } +/** + * @description + * Capture a site description into the archive's known_sites cache. + * + * @param {InternalDatArchive} archive - where to write the capture to. + * @param {(InternalDatArchive|string)} subjectArchive - which archive to capture. 
+ * @returns Promise + */ exports.capture = async function (archive, subjectArchive) { if (typeof subjectArchive === 'string') { subjectArchive = await dat.library.getOrLoadArchive(subjectArchive) } - // capture metadata + // create directory + var hostname = toHostname(subjectArchive.url) + await ensureDirectory(archive, '/data') + await ensureDirectory(archive, '/data/known_sites') + await ensureDirectory(archive, `/data/known_sites/${hostname}`) + + // capture dat.json try { - var info = JSON.parse(await subjectArchive.pda.readFile('/dat.json')) + var datJson = JSON.parse(await subjectArchive.pda.readFile('/dat.json')) } catch (e) { console.error('Failed to read dat.json of subject archive', e) debug('Failed to read dat.json of subject archive', e) throw new Error('Unabled to read subject dat.json') } - await put(archive, { - subject: subjectArchive.url, - title: typeof info.title === 'string' ? info.title : undefined, - description: typeof info.description === 'string' ? info.description : undefined, - type: typeof info.type === 'string' || (Array.isArray(info.type) && info.type.every(isString)) ? info.type : undefined - }) + await archive.pda.writeFile(`/data/known_sites/${hostname}/dat.json`, JSON.stringify(datJson)) // capture thumb for (let ext of ['jpg', 'jpeg', 'png']) { let thumbPath = `/thumb.${ext}` if (await fileExists(subjectArchive, thumbPath)) { - let targetPath = `/data/known_sites/${toHostname(subjectArchive.url)}.${ext}` + let targetPath = `/data/known_sites/${hostname}/thumb.${ext}` await archive.pda.writeFile(targetPath, await subjectArchive.pda.readFile(thumbPath, 'binary'), 'binary') break } } } -const put = -exports.put = async function (archive, {subject, title, description, type} = {}) { - assert(typeof subject === 'string', 'Put() must be provided a `subject` string') - try { - var subjectUrl = new URL(subject) - } catch (e) { - throw new Error('Put() `subject` must be a valid URL') - } - assert(!title || typeof title === 'string', 'Put() `title` must be a string') - assert(!description || typeof description === 'string', 'Put() `description` must be a string') - if (type) { - if (typeof type === 'string') type = type.split(',') - assert(Array.isArray(type), 'Put() `type` must be a string or an array of strings') - assert(type.every(isString), 'Put() `type` must be a string or an array of strings') +/** + * @description + * Delete a captured site description in the given archive's known_sites cache. + * + * @param {InternalDatArchive} archive - where to remove the capture from. + * @param {(InternalDatArchive|string)} subjectUrl - which archive's capture to remove. 
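A sketch of what capture() and deleteCapture() leave behind in the calling archive, using a made-up subject URL; illustrative only, not part of this commit:

const siteDescriptions = require('./crawler/site-descriptions')
const {getSiteDescriptionThumbnailUrl} = require('./crawler/util')

async function exampleCapture (archive) {
  // snapshots dat://bob.com into this archive's cache:
  //   /data/known_sites/bob.com/dat.json
  //   /data/known_sites/bob.com/thumb.png  (only when the subject publishes a thumb)
  await siteDescriptions.capture(archive, 'dat://bob.com')

  // a description authored by this archive about bob resolves its thumb to
  // <archive.url>/data/known_sites/bob.com/thumb
  console.log(getSiteDescriptionThumbnailUrl(archive.url, 'dat://bob.com'))

  // drop the capture again (removes the folder recursively)
  await siteDescriptions.deleteCapture(archive, 'dat://bob.com')
}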
+ * @returns Promise + */ +exports.deleteCapture = async function (archive, subjectUrl) { + if (subjectUrl && subjectUrl.url) { + subjectUrl = subjectUrl.url } - await ensureDirectory(archive, '/data') - await ensureDirectory(archive, '/data/known_sites') - await archive.pda.writeFile(`/data/known_sites/${subjectUrl.hostname}.json`, JSON.stringify({ - type: JSON_TYPE, - subject: subjectUrl.toString(), - metadata: { - title, - description, - type - }, - createdAt: (new Date()).toISOString() - })) - await crawler.crawlSite(archive) -} - -exports.delete = async function (archive, pathname) { - assert(typeof pathname === 'string', 'Delete() must be provided a valid URL string') - await archive.pda.unlink(pathname) + assert(typeof subjectUrl === 'string', 'Delete() must be provided a valid URL string') + var hostname = toHostname(subjectUrl) + await archive.pda.rmdir(`/data/known_sites/${hostname}`, {recursive: true}) await crawler.crawlSite(archive) } // internal methods // = +/** + * @param {any} v + * returns {boolean} + */ function isString (v) { return typeof v === 'string' } +/** + * @param {string} url + * @returns {string} + */ function toOrigin (url) { url = new URL(url) return url.protocol + '//' + url.hostname } -function toHostname (url) { - url = new URL(url) - return url.hostname +/** + * @param {InternalDatArchive} archive + * @param {string} name + * @returns {string} + */ +function getUrlFromDescriptionPath (archive, name) { + if (name === '/dat.json') return archive.url + name = name.split('/') // '/data/known_sites/{hostname}/dat.json' -> ['', 'data', 'known_sites', hostname, 'dat.json'] + return 'dat://' + name[3] } +/** + * @param {InternalDatArchive} archive + * @param {string} pathname + * @returns {Promise} + */ async function ensureDirectory (archive, pathname) { try { await archive.pda.mkdir(pathname) } catch (e) { /* ignore */ } } +/** + * @param {InternalDatArchive} archive + * @param {string} pathname + * @returns {Promise} + */ async function fileExists (archive, pathname) { try { await archive.pda.stat(pathname) } catch (e) { return false } return true } +/** + * @param {Object} row + * @returns {SiteDescription} + */ function massageSiteDescriptionRow (row) { if (!row) return null row.author = {url: row.crawlSourceUrl} row.type = row.type && typeof row.type === 'string' ? 
row.type.split(',') : undefined + row.thumbUrl = getSiteDescriptionThumbnailUrl(row.author.url, row.url) delete row.crawlSourceUrl delete row.crawlSourceId return row diff --git a/crawler/util.js b/crawler/util.js index 04a9f7b2..657e0d6f 100644 --- a/crawler/util.js +++ b/crawler/util.js @@ -6,9 +6,29 @@ const dat = require('../dat') const READ_TIMEOUT = 30e3 +// typedefs +// = + +/** + * @typedef {Object} CrawlSourceRecord + * @prop {string} id + * @prop {string} url + */ + +// exported api +// = + const crawlerEvents = new EventEmitter() exports.crawlerEvents = crawlerEvents +/** + * @param {InternalDatArchive} archive + * @param {CrawlSourceRecord} crawlSource + * @param {string} crawlDataset + * @param {number} crawlDatasetVersion + * @param {(Object) => undefined} handlerFn + * @returns {Promise} + */ exports.doCrawl = async function (archive, crawlSource, crawlDataset, crawlDatasetVersion, handlerFn) { const url = archive.url @@ -53,6 +73,13 @@ exports.doCrawl = async function (archive, crawlSource, crawlDataset, crawlDatas crawlerEvents.emit('crawl-dataset-finish', {sourceUrl: archive.url, crawlDataset, crawlRange: {start, end}}) } +/** + * @param {string} crawlDataset + * @param {number} crawlDatasetVersion + * @param {CrawlSourceRecord} crawlSource + * @param {number} crawlSourceVersion + * @returns {Promise} + */ const doCheckpoint = exports.doCheckpoint = async function (crawlDataset, crawlDatasetVersion, crawlSource, crawlSourceVersion) { await db.run(`DELETE FROM crawl_sources_meta WHERE crawlDataset = ? AND crawlSourceId = ?`, [crawlDataset, crawlSource.id]) await db.run(` @@ -62,10 +89,21 @@ const doCheckpoint = exports.doCheckpoint = async function (crawlDataset, crawlD `, [crawlDataset, crawlDatasetVersion, crawlSource.id, crawlSourceVersion, Date.now()]) } +/** + * @param {string} sourceUrl + * @param {string} crawlDataset + * @param {number} progress + * @param {number} numUpdates + */ exports.emitProgressEvent = function (sourceUrl, crawlDataset, progress, numUpdates) { crawlerEvents.emit('crawl-dataset-progress', {sourceUrl, crawlDataset, progress, numUpdates}) } +/** + * @param {Array} changes + * @param {RegExp} regex + * @returns {Array} + */ exports.getMatchingChangesInOrder = function (changes, regex) { var list = [] // order matters, must be oldest to newest changes.forEach(c => { @@ -78,6 +116,9 @@ exports.getMatchingChangesInOrder = function (changes, regex) { return list } +/** + * @returns {string} + */ var _lastGeneratedTimeFilename exports.generateTimeFilename = function () { var d = Date.now() @@ -86,4 +127,26 @@ exports.generateTimeFilename = function () { } _lastGeneratedTimeFilename = d return (new Date(d)).toISOString() -} \ No newline at end of file +} + +/** + * @param {string} url + * @returns {string} + */ +const toHostname = +exports.toHostname = function (url) { + url = new URL(url) + return url.hostname +} + +/** + * @description Helper to determine the thumbUrl for a site description. + * @param {string} author - (URL) the author of the site description. + * @param {string} subject - (URL) the site being described. + * @returns {string} - the URL of the thumbnail. + */ +exports.getSiteDescriptionThumbnailUrl = function (author, subject) { + return author === subject + ? 
`${subject}/thumb` // self-description, use their own thumb + : `${author}/data/known_sites/${toHostname(subject)}/thumb` // use captured thumb +} diff --git a/dbs/profile-data-db.js b/dbs/profile-data-db.js index 542ea376..a4c919b4 100644 --- a/dbs/profile-data-db.js +++ b/dbs/profile-data-db.js @@ -79,8 +79,7 @@ migrations = [ migration('profile-data.v21.sql'), migration('profile-data.v22.sql', {canFail: true}), // canFail for the same reason as v16, ffs migration('profile-data.v23.sql'), - migration('profile-data.v24.sql'), - migration('profile-data.v25.sql') + migration('profile-data.v24.sql') ] function migration (file, opts = {}) { return cb => { diff --git a/dbs/schemas/profile-data.sql.js b/dbs/schemas/profile-data.sql.js index f59bada2..a2310263 100644 --- a/dbs/schemas/profile-data.sql.js +++ b/dbs/schemas/profile-data.sql.js @@ -132,16 +132,13 @@ CREATE TABLE crawl_sources_meta ( -- crawled descriptions of other sites CREATE TABLE crawl_site_descriptions ( crawlSourceId INTEGER NOT NULL, - pathname TEXT NOT NULL, crawledAt INTEGER, - subject TEXT, + url TEXT, title TEXT, description TEXT, type TEXT, -- comma separated strings - createdAt INTEGER, - PRIMARY KEY (crawlSourceId, pathname), FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE ); CREATE VIRTUAL TABLE crawl_site_descriptions_fts_index USING fts5(title, description, content='crawl_site_descriptions'); @@ -258,5 +255,5 @@ INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Report an issu INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Explore the p2p Web', 'dat://taravancil.com/explore-the-p2p-web.md', 1); INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Support Beaker', 'https://opencollective.com/beaker', 1); -PRAGMA user_version = 25; +PRAGMA user_version = 24; ` diff --git a/dbs/schemas/profile-data.v24.sql.js b/dbs/schemas/profile-data.v24.sql.js index e9b7bb71..eae3dcd7 100644 --- a/dbs/schemas/profile-data.v24.sql.js +++ b/dbs/schemas/profile-data.v24.sql.js @@ -27,18 +27,29 @@ CREATE TABLE crawl_sources_meta ( -- crawled descriptions of other sites CREATE TABLE crawl_site_descriptions ( crawlSourceId INTEGER NOT NULL, - pathname TEXT NOT NULL, crawledAt INTEGER, - subject TEXT, + url TEXT, title TEXT, description TEXT, type TEXT, -- comma separated strings - createdAt INTEGER, PRIMARY KEY (crawlSourceId, pathname), FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE ); +CREATE VIRTUAL TABLE crawl_site_descriptions_fts_index USING fts5(title, description, content='crawl_site_descriptions'); + +-- triggers to keep crawl_site_descriptions_fts_index updated +CREATE TRIGGER crawl_site_descriptions_ai AFTER INSERT ON crawl_site_descriptions BEGIN + INSERT INTO crawl_site_descriptions_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); +END; +CREATE TRIGGER crawl_site_descriptions_ad AFTER DELETE ON crawl_site_descriptions BEGIN + INSERT INTO crawl_site_descriptions_fts_index(crawl_site_descriptions_fts_index, rowid, title, description) VALUES('delete', old.rowid, old.title, old.description); +END; +CREATE TRIGGER crawl_site_descriptions_au AFTER UPDATE ON crawl_site_descriptions BEGIN + INSERT INTO crawl_site_descriptions_fts_index(crawl_site_descriptions_fts_index, rowid, title, description) VALUES('delete', old.a, old.title, old.description); + INSERT INTO crawl_site_descriptions_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); +END; -- crawled posts 
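For context, the external-content FTS index and triggers defined above get queried roughly like this elsewhere in the series (compare crawler/search.js); a simplified sketch, not part of this commit:

const db = require('./dbs/profile-data-db')

async function exampleDescriptionSearch (q) {
  // lowercase, strip FTS operators, and match prefixes (same massaging as crawler/search.js)
  const match = q.toLowerCase().replace(/[:^*.]/g, ' ') + '*'
  return db.all(`
    SELECT
      desc.url AS url,
      SNIPPET(crawl_site_descriptions_fts_index, 0, '<b>', '</b>', '...', 25) AS title
    FROM crawl_site_descriptions_fts_index
    INNER JOIN crawl_site_descriptions desc ON desc.rowid = crawl_site_descriptions_fts_index.rowid
    WHERE crawl_site_descriptions_fts_index MATCH ?
    ORDER BY rank
    LIMIT 20
  `, [match])
}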
CREATE TABLE crawl_posts ( @@ -52,6 +63,19 @@ CREATE TABLE crawl_posts ( FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE ); +CREATE VIRTUAL TABLE crawl_posts_fts_index USING fts5(content, content='crawl_posts'); + +-- triggers to keep crawl_posts_fts_index updated +CREATE TRIGGER crawl_posts_ai AFTER INSERT ON crawl_posts BEGIN + INSERT INTO crawl_posts_fts_index(rowid, content) VALUES (new.rowid, new.content); +END; +CREATE TRIGGER crawl_posts_ad AFTER DELETE ON crawl_posts BEGIN + INSERT INTO crawl_posts_fts_index(crawl_posts_fts_index, rowid, content) VALUES('delete', old.rowid, old.content); +END; +CREATE TRIGGER crawl_posts_au AFTER UPDATE ON crawl_posts BEGIN + INSERT INTO crawl_posts_fts_index(crawl_posts_fts_index, rowid, content) VALUES('delete', old.rowid, old.content); + INSERT INTO crawl_posts_fts_index(rowid, content) VALUES (new.rowid, new.content); +END; -- crawled follows CREATE TABLE crawl_followgraph ( diff --git a/dbs/schemas/profile-data.v25.sql.js b/dbs/schemas/profile-data.v25.sql.js deleted file mode 100644 index b84e73a1..00000000 --- a/dbs/schemas/profile-data.v25.sql.js +++ /dev/null @@ -1,32 +0,0 @@ -module.exports = ` - --- add full-text search indexes -CREATE VIRTUAL TABLE crawl_site_descriptions_fts_index USING fts5(title, description, content='crawl_site_descriptions'); -CREATE VIRTUAL TABLE crawl_posts_fts_index USING fts5(content, content='crawl_posts'); - --- triggers to keep crawl_site_descriptions_fts_index updated -CREATE TRIGGER crawl_site_descriptions_ai AFTER INSERT ON crawl_site_descriptions BEGIN - INSERT INTO crawl_site_descriptions_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); -END; -CREATE TRIGGER crawl_site_descriptions_ad AFTER DELETE ON crawl_site_descriptions BEGIN - INSERT INTO crawl_site_descriptions_fts_index(crawl_site_descriptions_fts_index, rowid, title, description) VALUES('delete', old.rowid, old.title, old.description); -END; -CREATE TRIGGER crawl_site_descriptions_au AFTER UPDATE ON crawl_site_descriptions BEGIN - INSERT INTO crawl_site_descriptions_fts_index(crawl_site_descriptions_fts_index, rowid, title, description) VALUES('delete', old.a, old.title, old.description); - INSERT INTO crawl_site_descriptions_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); -END; - --- triggers to keep crawl_posts_fts_index updated -CREATE TRIGGER crawl_posts_ai AFTER INSERT ON crawl_posts BEGIN - INSERT INTO crawl_posts_fts_index(rowid, content) VALUES (new.rowid, new.content); -END; -CREATE TRIGGER crawl_posts_ad AFTER DELETE ON crawl_posts BEGIN - INSERT INTO crawl_posts_fts_index(crawl_posts_fts_index, rowid, content) VALUES('delete', old.rowid, old.content); -END; -CREATE TRIGGER crawl_posts_au AFTER UPDATE ON crawl_posts BEGIN - INSERT INTO crawl_posts_fts_index(crawl_posts_fts_index, rowid, content) VALUES('delete', old.rowid, old.content); - INSERT INTO crawl_posts_fts_index(rowid, content) VALUES (new.rowid, new.content); -END; - -PRAGMA user_version = 25; -` \ No newline at end of file From c0419be3bf3b12150e0c9b6259415b305f71e44a Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Fri, 11 Jan 2019 19:05:33 -0600 Subject: [PATCH 054/245] Move crawler to a priority queueing system --- crawler/followgraph.js | 2 +- crawler/index.js | 23 ++----- crawler/search.js | 8 +-- users/index.js | 145 +++++++++++++++++++++++++++-------------- 4 files changed, 106 insertions(+), 72 deletions(-) diff --git a/crawler/followgraph.js b/crawler/followgraph.js 
index 7f967e2e..21cee3fd 100644 --- a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -188,7 +188,7 @@ const listFollows = exports.listFollows = async function (subject, {followedBy, } return Promise.all(rows.map(async (row) => { var url = toOrigin(row.destUrl) - var desc = await siteDescriptions.getBest({subject: url, author: subject}) + var desc = (await siteDescriptions.getBest({subject: url, author: subject})) || {} desc.url = url if (followedBy) { desc.followsUser = await isAFollowingB(url, followedBy) diff --git a/crawler/index.js b/crawler/index.js index bcb1c692..52bbd18d 100644 --- a/crawler/index.js +++ b/crawler/index.js @@ -7,17 +7,15 @@ const archivesDb = require('../dbs/archives') const users = require('../users') const dat = require('../dat') -const {crawlerEvents} = require('./util') +const {crawlerEvents, toHostname} = require('./util') const posts = require('./posts') const followgraph = require('./followgraph') const siteDescriptions = require('./site-descriptions') -const CRAWL_POLL_INTERVAL = 30e3 - // globals // = -const watches = {} +var watches = {} // exported api // = @@ -43,18 +41,13 @@ exports.watchSite = async function (archive) { // watch for file changes watches[archive.url] = archive.pda.watch() watches[archive.url].on('data', ([event, args]) => { + // BUG watch is really inconsistent -prf console.log('MIRACLE ALERT! The crawler watch stream emitted a change event', archive.url, event, args) if (event === 'invalidated') { queueCrawl() } }) - // HACK - // for reasons that currently surpass me - // the `archive.pda.watch()` call is not currently working all the time - // so we need to poll sites for now - setInterval(queueCrawl, CRAWL_POLL_INTERVAL) - // run the first crawl crawlSite(archive) } @@ -136,12 +129,4 @@ exports.WEBAPI = { return crawlSite(archive) }, resetSite -} - -// internal methods -// = - -function toHostname (url) { - url = new URL(url) - return url.hostname -} +} \ No newline at end of file diff --git a/crawler/search.js b/crawler/search.js index abe5278f..ffeeb2f5 100644 --- a/crawler/search.js +++ b/crawler/search.js @@ -150,7 +150,7 @@ exports.listSearchResults = async function (opts) { if (query && typeof query === 'string') { query = query .toLowerCase() // all lowercase. (uppercase is interpretted as a directive by sqlite.) - .replace(/[:^*\.]/g, ' ') // strip symbols that sqlite interprets. + .replace(/[:^*.]/g, ' ') // strip symbols that sqlite interprets. query += '*' // match prefixes } @@ -211,7 +211,7 @@ exports.listSearchResults = async function (opts) { ORDER BY desc.title LIMIT ? OFFSET ?; - `, [user, userCrawlSourceId, limit, offset]) + `, [user, userCrawlSourceId, limit, offset]) } searchResults.people = _uniqWith(searchResults.people, (a, b) => a.url === b.url) await Promise.all(searchResults.people.map(async (p) => { @@ -258,7 +258,7 @@ exports.listSearchResults = async function (opts) { ORDER BY post.createdAt DESC LIMIT ? 
OFFSET ?; - `, [userCrawlSourceId, since, limit, offset]) + `, [userCrawlSourceId, since, limit, offset]) } await Promise.all(searchResults.posts.map(async (p) => { // fetch additional info @@ -270,6 +270,6 @@ exports.listSearchResults = async function (opts) { delete p.pathname })) } - + return searchResults } diff --git a/users/index.js b/users/index.js index b8e00e7f..e90b615f 100644 --- a/users/index.js +++ b/users/index.js @@ -1,6 +1,7 @@ const Events = require('events') const dat = require('../dat') const crawler = require('../crawler') +const followgraph = require('../crawler/followgraph') const db = require('../dbs/profile-data-db') const archivesDb = require('../dbs/archives') const debug = require('../lib/debug-logger').debugLogger('users') @@ -9,6 +10,8 @@ const debug = require('../lib/debug-logger').debugLogger('users') // = const SITE_TYPE = 'unwalled.garden/user' +const CRAWL_TICK_INTERVAL = 5e3 +const NUM_SIMULTANEOUS_CRAWLS = 10 // globals // = @@ -24,9 +27,8 @@ exports.addListener = events.addListener.bind(events) exports.removeListener = events.removeListener.bind(events) exports.setup = async function () { - // wire up events - crawler.followgraph.on('follow-added', onFollowAdded) - crawler.followgraph.on('follow-removed', onFollowRemoved) + // initiate ticker + queueTick() // load the current users users = await db.all(`SELECT * FROM users`) @@ -42,7 +44,7 @@ exports.setup = async function () { try { await validateUserUrl(user.url) user.archive = await dat.library.getOrLoadArchive(user.url) - watchUser(user) + /* dont await */crawler.watchSite(user.archive) events.emit('load-user', user) } catch (err) { debug('Failed to load user', {user, err}) @@ -50,6 +52,49 @@ exports.setup = async function () { }) } +function queueTick () { + setTimeout(tick, CRAWL_TICK_INTERVAL) +} + +async function tick () { + try { + // TODO handle multiple users + var user = users[0] + if (!user) return queueTick() + + // assemble the next set of crawl targets + var crawlTargets = await selectNextCrawlTargets(user) + + // trigger the crawls on each + var activeCrawls = crawlTargets.map(async (crawlTarget) => { + try { + // load archive + var wasLoaded = true // TODO + var archive = await dat.library.getOrLoadArchive(crawlTarget) // TODO timeout on load + + // run crawl + await crawler.crawlSite(archive) + + if (!wasLoaded) { + // unload archive + // TODO + } + } catch (e) { + console.error('Failed to crawl site', crawlTarget, e) + // TODO more handling? 
+ } + }) + + // await all crawls + await Promise.all(activeCrawls) + } catch (e) { + console.error('Crawler tick failed', e) + } + + // queue next tick + queueTick() +} + exports.list = async function () { return Promise.all(users.map(fetchUserInfo)) } @@ -94,7 +139,7 @@ exports.add = async function (url) { // fetch the user archive user.archive = await dat.library.getOrLoadArchive(user.url) - watchUser(user) + /* dont await */crawler.watchSite(user.archive) events.emit('load-user', user) } @@ -107,7 +152,7 @@ exports.remove = async function (url) { // remove the user users.splice(users.indexOf(user), 1) await db.run(`DELETE FROM users WHERE url = ?`, [user.url]) - unwatchUser(user) + /* dont await */crawler.unwatchSite(user.archive) events.emit('unload-user', user) } @@ -118,50 +163,54 @@ async function isUser (url) { return !!(await get(url)) } -async function watchUser (user) { - // watch the user - await crawler.watchSite(user.archive) - - // watch anybody the user follows - var followUrls = await crawler.followgraph.listFollows(user.url) - followUrls.forEach(async (followUrl) => { - try { - await crawler.watchSite(followUrl) - } catch (err) { - debug('Failed to sync followed user', {url: followUrl, err}) - } - }) -} - -async function unwatchUser (user) { - // unwatch anybody the user follows - - // BUG This will cause glitches if there are any shared follows between 2 local users (which is likely) - // sites will be unwatched when they shouldn't be - // this is temporary and will fix itself when beaker restarts - // -prf - - var followUrls = await crawler.followgraph.listFollows(user.url) - followUrls.forEach(crawler.unwatchSite) - - // unwatch the user - await crawler.unwatchSite(user.url) -} - -async function onFollowAdded (sourceUrl, subjectUrl) { - if (isUser(sourceUrl)) { - try { - await crawler.watchSite(subjectUrl) - } catch (err) { - debug('Failed to sync followed user', {url: subjectUrl, err}) - } +/** + * @description + * Assembles a list of crawl targets based on the current database state. + * + * @param {Object} user - the user to select crawl-targets for. + * @returns {Promise>} + * + * Depends on NUM_SIMULTANEOUS_CRAWLS. + * + * This function will assemble the list using simple priority heuristics. The priorities are currently: + * + * 1. Followed sites + * 2. Sites published by followed sites + * 3. Sites followed by followed sites + * + * The sites will be ordered by these priorities and then iterated linearly. The ordering within + * the priority groupings will be according to URL for a deterministic but effectively random ordering. + * + * NOTE. The current database state must be queried every time this function is run because the user + * will follow and unfollow during runtime, which changes the list. 
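To make the wrap-around concrete: a self-contained walk through the cursor arithmetic used below, with NUM_SIMULTANEOUS_CRAWLS = 10 and 25 made-up candidates (illustrative only):

function exampleCursorWalk () {
  const NUM = 10
  const rows = Array.from({length: 25}, (_, i) => `dat://site${i}.com`)
  const user = {crawlSelectorCursor: 0}

  for (let tick = 1; tick <= 3; tick++) {
    let start = user.crawlSelectorCursor || 0
    if (start > rows.length) start = 0
    const end = start + NUM
    let targets = rows.slice(start, end)
    const remaining = NUM - targets.length
    if (remaining && rows.length > NUM) {
      targets = targets.concat(rows.slice(0, remaining)) // wrap around to the front of the list
      user.crawlSelectorCursor = remaining
    } else {
      user.crawlSelectorCursor = end
    }
    console.log(`tick ${tick}:`, targets.length, 'targets, cursor ->', user.crawlSelectorCursor)
    // tick 1: sites 0-9,   cursor -> 10
    // tick 2: sites 10-19, cursor -> 20
    // tick 3: sites 20-24 then wraps to 0-4, cursor -> 5
  }
}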
+ */ +async function selectNextCrawlTargets (user) { + var rows = [] + + // get followed sites + rows = rows.concat(await followgraph.listFollows(user.url)) + + // get sites published by followed sites + // TODO + + // get sites followed by followed sites + rows = rows.concat(await followgraph.listFoaFs(user.url)) + + // assemble into list + var start = user.crawlSelectorCursor || 0 + if (start > rows.length) start = 0 + var end = start + NUM_SIMULTANEOUS_CRAWLS + var nextCrawlTargets = rows.slice(start, end) + var numRemaining = NUM_SIMULTANEOUS_CRAWLS - nextCrawlTargets.length + if (numRemaining && rows.length > NUM_SIMULTANEOUS_CRAWLS) { + // wrap around + nextCrawlTargets = nextCrawlTargets.concat(rows.slice(0, numRemaining)) + user.crawlSelectorCursor = numRemaining + } else { + user.crawlSelectorCursor = end } -} -async function onFollowRemoved (sourceUrl, subjectUrl) { - if (isUser(sourceUrl)) { - await crawler.unwatchSite(subjectUrl) - } + return nextCrawlTargets.map(row => typeof row === 'string' ? row : row.url) } async function fetchUserInfo (user) { From e6b8a81b0f2fbf365aa78ea453ef587dadd72d58 Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sat, 12 Jan 2019 17:18:16 -0600 Subject: [PATCH 055/245] More JSDoc type annotations --- crawler/followgraph.js | 40 +++++++++++++++----------------- crawler/posts.js | 23 +++++++++--------- crawler/search.js | 2 +- crawler/site-descriptions.js | 45 +++++++++++++++++++++--------------- crawler/util.js | 8 ++++--- dat/library.js | 43 ++++++++++++++++++++++++++++++++++ 6 files changed, 106 insertions(+), 55 deletions(-) diff --git a/crawler/followgraph.js b/crawler/followgraph.js index 21cee3fd..e1124bfd 100644 --- a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -1,7 +1,7 @@ const assert = require('assert') const _difference = require('lodash.difference') const Events = require('events') -const {Url} = require('url') +const {URL} = require('url') const lock = require('../lib/lock') const db = require('../dbs/profile-data-db') const crawler = require('./index') @@ -20,13 +20,9 @@ const JSON_PATH = '/data/follows.json' // = /** - * @typedef CrawlSourceRecord {import('./util').CrawlSourceRecord} - * @typedef SiteDescription {import('./site-descriptions').SiteDescription} - * - * @typedef {Object} SiteDescriptionWithFollowData - * @extends {SiteDescription} - * @prop {boolean} [followsUser] - does this site follow the specified user site? - * @prop {Array} [followedBy] - list of sites following this site. + * @typedef {import('../dat/library').InternalDatArchive} InternalDatArchive + * @typedef {import('./util').CrawlSourceRecord} CrawlSourceRecord + * @typedef {import('./site-descriptions').SiteDescription} SiteDescription */ // globals @@ -47,7 +43,7 @@ exports.removeListener = events.removeListener.bind(events) * * @param {InternalDatArchive} archive - site to crawl. * @param {CrawlSourceRecord} crawlSource - internal metadata about the crawl target. - * @returns {Promise} + * @returns {Promise} */ exports.crawlSite = async function (archive, crawlSource) { return doCrawl(archive, crawlSource, 'crawl_followgraph', TABLE_VERSION, async ({changes, resetRequired}) => { @@ -126,7 +122,7 @@ exports.crawlSite = async function (archive, crawlSource) { * @param {Object} [opts] * @param {string} [opts.followedBy] - (URL) filter results to those followed by the site specified with this param. Causes .followsUser boolean to be set. * @param {boolean} [opts.includeDesc] - output a site description instead of a simple URL. 
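The typedef-import pattern these commits rely on pays off once JS type checking is enabled (for example tsc with checkJs, or a // @ts-check pragma); a small sketch, not part of this commit:

// @ts-check
/** @typedef {import('./crawler/site-descriptions').SiteDescription} SiteDescription */

/**
 * @param {SiteDescription[]} descs
 * @returns {string[]}
 */
function exampleTitles (descs) {
  return descs.map(d => d.title) // d is fully typed; a typo like d.titel is flagged by the checker
}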
- * @returns {(Promise>|Promise>)} + * @returns {Promise>} */ const listFollowers = exports.listFollowers = async function (subject, {followedBy, includeDesc} = {}) { var rows @@ -173,7 +169,7 @@ const listFollowers = exports.listFollowers = async function (subject, {followed * @param {string} [opts.followedBy] - (URL) filter results to those followed by the site specified with this param. Causes .followsUser boolean to be set. * @param {boolean} [opts.includeDesc] - output a site description instead of a simple URL. * @param {boolean} [opts.includeFollowers] - include .followedBy in the result. Requires includeDesc to be true. - * @returns {(Promise>|Promise>)} + * @returns {Promise>} */ const listFollows = exports.listFollows = async function (subject, {followedBy, includeDesc, includeFollowers} = {}) { var rows = await db.all(` @@ -188,13 +184,13 @@ const listFollows = exports.listFollows = async function (subject, {followedBy, } return Promise.all(rows.map(async (row) => { var url = toOrigin(row.destUrl) - var desc = (await siteDescriptions.getBest({subject: url, author: subject})) || {} + var desc = /** @type SiteDescription */ ((await siteDescriptions.getBest({subject: url, author: subject})) || {}) desc.url = url if (followedBy) { desc.followsUser = await isAFollowingB(url, followedBy) } if (includeFollowers) { - desc.followedBy = await listFollowers(url, {followedBy, includeDesc: true}) + desc.followedBy = /** @type Array */ (await listFollowers(url, {followedBy, includeDesc: true})) } return desc })) @@ -207,15 +203,15 @@ const listFollows = exports.listFollows = async function (subject, {followedBy, * @param {string} subject - (URL) * @param {Object} [opts] * @param {string} [opts.followedBy] - (URL) filter results to those followed by the site specified with this param. Causes .followsUser boolean to be set. 
- * @returns {Promise>} + * @returns {Promise>} */ const listFoaFs = exports.listFoaFs = async function (subject, {followedBy} = {}) { var foafs = [] // list URLs followed by subject - var follows = await listFollows(subject, {followedBy, includeDesc: true}) + var follows = /** @type Array */ (await listFollows(subject, {followedBy, includeDesc: true})) for (let follow of follows) { // list follows of this follow - for (let foaf of await listFollows(follow.url, {followedBy, includeDesc: true})) { + for (let foaf of /** @type Array */ (await listFollows(follow.url, {followedBy, includeDesc: true}))) { // ignore if followed by subject or is subject if (foaf.url === subject) continue if (follows.find(v => v.url === foaf.url)) continue @@ -260,7 +256,7 @@ const isAFollowingB = exports.isAFollowingB = async function (a, b) { * * @param {InternalDatArchive} archive * @param {string} followUrl - * @returns {Promise} + * @returns {Promise} */ exports.follow = async function (archive, followUrl) { // normalize followUrl @@ -284,7 +280,7 @@ exports.follow = async function (archive, followUrl) { * * @param {InternalDatArchive} archive * @param {string} followUrl - * @returns {Promise} + * @returns {Promise} */ exports.unfollow = async function (archive, followUrl) { // normalize followUrl @@ -309,8 +305,8 @@ exports.unfollow = async function (archive, followUrl) { */ function toOrigin (url) { try { - url = new URL(url) - return url.protocol + '//' + url.hostname + var urlParsed = new URL(url) + return urlParsed.protocol + '//' + urlParsed.hostname } catch (e) { return null } @@ -337,8 +333,8 @@ async function readFollowsFile (archive) { /** * @param {InternalDatArchive} archive - * @param {(Object) => undefined} updateFn - * @returns {Promise} + * @param {function(Object): void} updateFn + * @returns {Promise} */ async function updateFollowsFile (archive, updateFn) { var release = await lock('crawler:followgraph:' + archive.url) diff --git a/crawler/posts.js b/crawler/posts.js index c5e33378..eaa291a2 100644 --- a/crawler/posts.js +++ b/crawler/posts.js @@ -18,8 +18,9 @@ const JSON_PATH_REGEX = /^\/data\/posts\/([^/]+)\.json$/i // = /** - * @typedef CrawlSourceRecord {import('./util').CrawlSourceRecord} - * @typedef SiteDescription { import("./site-descriptions").SiteDescription } + * @typedef {import('../dat/library').InternalDatArchive} InternalDatArchive + * @typedef {import('./util').CrawlSourceRecord} CrawlSourceRecord + * @typedef { import("./site-descriptions").SiteDescription } SiteDescription * * @typedef {Object} Post * @prop {string} pathname @@ -196,11 +197,12 @@ exports.list = async function ({offset, limit, reverse, author, authors} = {}) { */ const get = exports.get = async function (url, pathname = undefined) { // validate & parse params + var urlParsed if (url) { - try { url = new URL(url) } + try { urlParsed = new URL(url) } catch (e) { throw new Error('Failed to parse post URL: ' + url) } } - pathname = pathname || url.pathname + pathname = pathname || urlParsed.pathname // execute query return await massagePostRow(await db.get(` @@ -212,7 +214,7 @@ const get = exports.get = async function (url, pathname = undefined) { AND src.url = ? WHERE crawl_posts.pathname = ? 
- `, [url.origin, pathname])) + `, [urlParsed.origin, pathname])) } /** @@ -224,7 +226,7 @@ const get = exports.get = async function (url, pathname = undefined) { * @param {string} post.content * @returns {Promise} */ -exports.create = async function (archive, {content} = {}) { +exports.create = async function (archive, {content}) { assert(typeof content === 'string', 'Create() must be provided a `content` string') var filename = generateTimeFilename() await ensureDirectory(archive, '/data') @@ -247,7 +249,7 @@ exports.create = async function (archive, {content} = {}) { * @param {string} post.content * @returns {Promise} */ -exports.edit = async function (archive, pathname, {content} = {}) { +exports.edit = async function (archive, pathname, {content}) { assert(typeof pathname === 'string', 'Edit() must be provided a valid URL string') assert(typeof content === 'string', 'Edit() must be provided a `content` string') var oldJson = JSON.parse(await archive.pda.readFile(pathname)) @@ -290,8 +292,8 @@ function isString (v) { * @returns {string} */ function toOrigin (url) { - url = new URL(url) - return url.protocol + '//' + url.hostname + var urlParsed = new URL(url) + return urlParsed.protocol + '//' + urlParsed.hostname } /** @@ -306,12 +308,11 @@ async function ensureDirectory (archive, pathname) { /** * @param {Object} row - * @returns {Post} + * @returns {Promise} */ async function massagePostRow (row) { if (!row) return null row.author = await siteDescriptions.getBest({subject: row.crawlSourceUrl}) -console.log('author for', row.author, row) if (!row.author) row.author = {url: row.crawlSourceUrl} delete row.crawlSourceUrl delete row.crawlSourceId diff --git a/crawler/search.js b/crawler/search.js index ffeeb2f5..f07f0eb7 100644 --- a/crawler/search.js +++ b/crawler/search.js @@ -25,7 +25,7 @@ const BUILTIN_PAGES = [ // = /** - * @typedef SiteDescription { import("./site-descriptions").SiteDescription } + * @typedef {import("./site-descriptions").SiteDescription} SiteDescription * * @typedef {Object} SuggestionResults * @prop {Array} apps diff --git a/crawler/site-descriptions.js b/crawler/site-descriptions.js index 91dafd26..0b865875 100644 --- a/crawler/site-descriptions.js +++ b/crawler/site-descriptions.js @@ -26,7 +26,8 @@ const JSON_PATH_REGEX = /^\/(dat\.json|data\/known_sites\/([^/]+)\/dat\.json)$/i // = /** - * @typedef CrawlSourceRecord {import('./util').CrawlSourceRecord} + * @typedef {import('../dat/library').InternalDatArchive} InternalDatArchive + * @typedef {import('./util').CrawlSourceRecord} CrawlSourceRecord * * @typedef {Object} SiteDescription * @prop {string} url @@ -36,6 +37,8 @@ const JSON_PATH_REGEX = /^\/(dat\.json|data\/known_sites\/([^/]+)\/dat\.json)$/i * @prop {string} thumbUrl * @prop {Object} descAuthor * @prop {string} descAuthor.url + * @prop {boolean} [followsUser] - does this site follow the specified user site? + * @prop {Array} [followedBy] - list of sites following this site. */ // globals @@ -56,7 +59,7 @@ exports.removeListener = events.removeListener.bind(events) * * @param {InternalDatArchive} archive - site to crawl. * @param {CrawlSourceRecord} crawlSource - internal metadata about the crawl target. - * @returns {Promise} + * @returns {Promise} */ exports.crawlSite = async function (archive, crawlSource) { return doCrawl(archive, crawlSource, 'crawl_site_descriptions', TABLE_VERSION, async ({changes, resetRequired}) => { @@ -137,8 +140,8 @@ exports.crawlSite = async function (archive, crawlSource) { * List crawled site descriptions. 
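A usage sketch against the list/getBest signatures documented in this file (illustrative only; the subject URL is made up):

const siteDescriptions = require('./crawler/site-descriptions')

async function exampleDescriptionLookups (userUrl) {
  // every crawled description of bob.com
  const all = await siteDescriptions.list({subject: 'dat://bob.com'})

  // the single "best" description (currently just the first match; trust-ranking is still a TODO)
  const best = await siteDescriptions.getBest({subject: 'dat://bob.com', author: userUrl})

  console.log(all.length, best && best.title)
}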
* * @param {Object} [opts] - * @param {string} [opts.subject] - (URL) filter descriptions to those which describe this subject. - * @param {string} [opts.author] - (URL) filter descriptions to those created by this author. + * @param {string | Array} [opts.subject] - (URL) filter descriptions to those which describe this subject. + * @param {string | Array} [opts.author] - (URL) filter descriptions to those created by this author. * @param {number} [opts.offset] * @param {number} [opts.limit] * @param {boolean} [opts.reverse] @@ -233,12 +236,15 @@ exports.getBest = async function ({subject, author} = {}) { * Capture a site description into the archive's known_sites cache. * * @param {InternalDatArchive} archive - where to write the capture to. - * @param {(InternalDatArchive|string)} subjectArchive - which archive to capture. + * @param {(InternalDatArchive|string)} subject - which archive to capture. * @returns Promise */ -exports.capture = async function (archive, subjectArchive) { - if (typeof subjectArchive === 'string') { - subjectArchive = await dat.library.getOrLoadArchive(subjectArchive) +exports.capture = async function (archive, subject) { + var subjectArchive + if (typeof subject === 'string') { + subjectArchive = await dat.library.getOrLoadArchive(subject) + } else { + subjectArchive = subject } // create directory @@ -273,12 +279,15 @@ exports.capture = async function (archive, subjectArchive) { * Delete a captured site description in the given archive's known_sites cache. * * @param {InternalDatArchive} archive - where to remove the capture from. - * @param {(InternalDatArchive|string)} subjectUrl - which archive's capture to remove. + * @param {(InternalDatArchive|string)} subject - which archive's capture to remove. * @returns Promise */ -exports.deleteCapture = async function (archive, subjectUrl) { - if (subjectUrl && subjectUrl.url) { - subjectUrl = subjectUrl.url +exports.deleteCapture = async function (archive, subject) { + var subjectUrl + if (typeof subject === 'string') { + subjectUrl = subject + } else { + subjectUrl = subject.url } assert(typeof subjectUrl === 'string', 'Delete() must be provided a valid URL string') var hostname = toHostname(subjectUrl) @@ -302,8 +311,8 @@ function isString (v) { * @returns {string} */ function toOrigin (url) { - url = new URL(url) - return url.protocol + '//' + url.hostname + var urlParsed = new URL(url) + return urlParsed.protocol + '//' + urlParsed.hostname } /** @@ -313,14 +322,14 @@ function toOrigin (url) { */ function getUrlFromDescriptionPath (archive, name) { if (name === '/dat.json') return archive.url - name = name.split('/') // '/data/known_sites/{hostname}/dat.json' -> ['', 'data', 'known_sites', hostname, 'dat.json'] - return 'dat://' + name[3] + var parts = name.split('/') // '/data/known_sites/{hostname}/dat.json' -> ['', 'data', 'known_sites', hostname, 'dat.json'] + return 'dat://' + parts[3] } /** * @param {InternalDatArchive} archive * @param {string} pathname - * @returns {Promise} + * @returns {Promise} */ async function ensureDirectory (archive, pathname) { try { await archive.pda.mkdir(pathname) } @@ -330,7 +339,7 @@ async function ensureDirectory (archive, pathname) { /** * @param {InternalDatArchive} archive * @param {string} pathname - * @returns {Promise} + * @returns {Promise} */ async function fileExists (archive, pathname) { try { await archive.pda.stat(pathname) } diff --git a/crawler/util.js b/crawler/util.js index 657e0d6f..8e292f1e 100644 --- a/crawler/util.js +++ b/crawler/util.js @@ -10,6 
+10,8 @@ const READ_TIMEOUT = 30e3 // = /** + * @typedef {import('../dat/library').InternalDatArchive} InternalDatArchive + * * @typedef {Object} CrawlSourceRecord * @prop {string} id * @prop {string} url @@ -26,7 +28,7 @@ exports.crawlerEvents = crawlerEvents * @param {CrawlSourceRecord} crawlSource * @param {string} crawlDataset * @param {number} crawlDatasetVersion - * @param {(Object) => undefined} handlerFn + * @param {function(Object): Promise} handlerFn * @returns {Promise} */ exports.doCrawl = async function (archive, crawlSource, crawlDataset, crawlDatasetVersion, handlerFn) { @@ -135,8 +137,8 @@ exports.generateTimeFilename = function () { */ const toHostname = exports.toHostname = function (url) { - url = new URL(url) - return url.hostname + var urlParsed = new URL(url) + return urlParsed.hostname } /** diff --git a/dat/library.js b/dat/library.js index 3c0af69c..d7b8fd46 100644 --- a/dat/library.js +++ b/dat/library.js @@ -26,6 +26,49 @@ const { const {InvalidURLError} = require('beaker-error-constants') const DAT_DAEMON_MANIFEST = require('./daemon/manifest') +// typedefs +// = + +/** + * @typedef {Object} InternalDatArchive + * @prop {Buffer} key + * @prop {string} url + * @prop {Buffer} discoveryKey + * @prop {boolean} writable + * @prop {function(Function): void} ready + * @prop {function(Object, Function=): void} download + * @prop {function(Object=): ReadableStream} history + * @prop {function(Object=): ReadableStream} createReadStream + * @prop {function(string, Object=, Function=): any} readFile + * @prop {function(number, Object=): ReadableStream} createDiffStream + * @prop {function(string, Object=): WritableStream} createWriteStream + * @prop {function(string, any, Object=, Function=): void} writeFile + * @prop {function(string, Function=): void} unlink + * @prop {function(string, Object=, Function=): void} mkdir + * @prop {function(string, Function=): void} rmdir + * @prop {function(string, Object=, Function=): void} readdir + * @prop {function(string, Object=, Function=): void} stat + * @prop {function(string, Object=, Function=): void} lstat + * @prop {function(string, Object=, Function=): void} access + * @prop {Object} pda + * @prop {function(string): Promise} pda.stat + * @prop {function(string, Object=): Promise} pda.readFile + * @prop {function(string, Object=): Promise>} pda.readdir + * @prop {function(string): Promise} pda.readSize + * @prop {function(string, any, Object=): Promise} pda.writeFile + * @prop {function(string): Promise} pda.mkdir + * @prop {function(string, string): Promise} pda.copy + * @prop {function(string, string): Promise} pda.rename + * @prop {function(string): Promise} pda.unlink + * @prop {function(string, Object=): Promise} pda.rmdir + * @prop {function(string=): Promise} pda.download + * @prop {function(string=): ReadableStream} pda.watch + * @prop {function(): ReadableStream} pda.createNetworkActivityStream + * @prop {function(): Promise} pda.readManifest + * @prop {function(Object): Promise} pda.writeManifest + * @prop {function(Object): Promise} pda.updateManifest + */ + // globals // = From 53ae755938c4a45786813e4cc68d76917e1598fc Mon Sep 17 00:00:00 2001 From: Paul Frazee Date: Sun, 13 Jan 2019 15:04:32 -0600 Subject: [PATCH 056/245] Add jsdocs to the database scripts --- crawler/followgraph.js | 2 +- crawler/search.js | 15 +- crawler/util.js | 2 +- dbs/archives.js | 310 +++++++++++++++++++++++++++++---------- dbs/bookmarks.js | 91 ++++++++++-- dbs/history.js | 63 +++++++- dbs/profile-data-db.js | 22 +++ dbs/settings.js | 25 
+++- dbs/sitedata.js | 88 ++++++++--- dbs/templates.js | 41 ++++++ dbs/watchlist.js | 43 +++++- lib/db.js | 8 +- lib/env.js | 4 + web-apis/bg/watchlist.js | 4 +- 14 files changed, 590 insertions(+), 128 deletions(-) diff --git a/crawler/followgraph.js b/crawler/followgraph.js index e1124bfd..6b27c6b8 100644 --- a/crawler/followgraph.js +++ b/crawler/followgraph.js @@ -75,7 +75,7 @@ exports.crawlSite = async function (archive, crawlSource) { } // diff against the current follows - var currentFollows = await listFollows(archive.url) + var currentFollows = /** @type string[] */(await listFollows(archive.url)) var newFollows = followsJson.urls var adds = _difference(newFollows, currentFollows) var removes = _difference(currentFollows, newFollows) diff --git a/crawler/search.js b/crawler/search.js index f07f0eb7..e788c34c 100644 --- a/crawler/search.js +++ b/crawler/search.js @@ -26,6 +26,7 @@ const BUILTIN_PAGES = [ /** * @typedef {import("./site-descriptions").SiteDescription} SiteDescription + * @typedef {import("../dbs/archives").LibraryArchiveRecord} LibraryArchiveRecord * * @typedef {Object} SuggestionResults * @prop {Array} apps @@ -82,14 +83,14 @@ exports.listSuggestions = async function (query = '', opts = {}) { suggestions.apps = BUILTIN_PAGES.filter(filterFn) // library - var libraryResults = await datLibrary.queryArchives({isSaved: true}) + var libraryResults = /** @type LibraryArchiveRecord[] */(await datLibrary.queryArchives({isSaved: true})) libraryResults = libraryResults.filter(filterFn) - libraryResults = _groupBy(libraryResults, a => getBasicType(a.type)) - suggestions.people = libraryResults.user - suggestions.webPages = libraryResults['web-page'] - suggestions.fileShares = libraryResults['file-share'] - suggestions.imageCollections = libraryResults['image-collection'] - suggestions.others = libraryResults.other + var libraryResultsGrouped = _groupBy(libraryResults, a => getBasicType(a.type)) + suggestions.people = libraryResultsGrouped.user + suggestions.webPages = libraryResultsGrouped['web-page'] + suggestions.fileShares = libraryResultsGrouped['file-share'] + suggestions.imageCollections = libraryResultsGrouped['image-collection'] + suggestions.others = libraryResultsGrouped.other if (query) { // bookmarks diff --git a/crawler/util.js b/crawler/util.js index 8e292f1e..6e0dae76 100644 --- a/crawler/util.js +++ b/crawler/util.js @@ -11,7 +11,7 @@ const READ_TIMEOUT = 30e3 /** * @typedef {import('../dat/library').InternalDatArchive} InternalDatArchive - * + * * @typedef {Object} CrawlSourceRecord * @prop {string} id * @prop {string} url diff --git a/dbs/archives.js b/dbs/archives.js index 85433822..67df8bc5 100644 --- a/dbs/archives.js +++ b/dbs/archives.js @@ -12,38 +12,128 @@ const { DAT_GC_EXPIRATION_AGE } = require('../lib/const') +// typedefs +// = + +/** + * @typedef {import('../dat/library').InternalDatArchive} InternalDatArchive + * + * @typedef {Object} LibraryArchiveRecord + * @prop {string} key + * @prop {string} url + * @prop {string} title + * @prop {string} description + * @prop {Array} type + * @prop {number} mtime + * @prop {number} size + * @prop {boolean} isOwner + * @prop {number} lastAccessTime + * @prop {number} lastLibraryAccessTime + * @prop {Object} userSettings + * @prop {boolean} userSettings.isSaved + * @prop {boolean} userSettings.hidden + * @prop {boolean} userSettings.networked + * @prop {boolean} userSettings.autoDownload + * @prop {boolean} userSettings.autoUpload + * @prop {number} userSettings.expiresAt + * @prop {string} 
userSettings.localSyncPath + * @prop {boolean} userSettings.previewMode + * + * @typedef {Object} LibraryArchiveMeta + * @prop {string} key + * @prop {string} title + * @prop {string} description + * @prop {string | Array} type + * @prop {Array} installedNames + * @prop {number} mtime + * @prop {number} size + * @prop {boolean} isOwner + * @prop {number} lastAccessTime + * @prop {number} lastLibraryAccessTime + * + * @typedef {Object} LibraryArchiveUserSettings + * @prop {number} profileId + * @prop {string} key + * @prop {boolean} isSaved + * @prop {boolean} hidden + * @prop {boolean} networked + * @prop {boolean} autoDownload + * @prop {boolean} autoUpload + * @prop {number} expiresAt + * @prop {string} localSyncPath + * @prop {boolean} previewMode + * @prop {number} createdAt + * + * @typedef {Object} MinimalLibraryArchiveRecord + * @prop {string} key + */ + // globals // = -var datPath // path to the dat folder +var datPath /** @type string - path to the dat folder */ var events = new Events() // exported methods // = +/** + * @param {Object} opts + * @param {string} opts.userDataPath + */ exports.setup = function (opts) { // make sure the folders exist datPath = path.join(opts.userDataPath, 'Dat') mkdirp.sync(path.join(datPath, 'Archives')) } +/** + * @returns {string} + */ exports.getDatPath = function () { return datPath } -// get the path to an archive's files +/** + * @description Get the path to an archive's files. + * @param {string | Buffer | InternalDatArchive} archiveOrKey + * @returns {string} + */ +// const getArchiveMetaPath = exports.getArchiveMetaPath = function (archiveOrKey) { - var key = datEncoding.toStr(archiveOrKey.key || archiveOrKey) + var key /** @type string */ + if (typeof archiveOrKey === 'string') { + key = archiveOrKey + } else if (Buffer.isBuffer(archiveOrKey)) { + key = datEncoding.toStr(archiveOrKey) + } else { + key = datEncoding.toStr(archiveOrKey.key) + } return path.join(datPath, 'Archives', 'Meta', key.slice(0, 2), key.slice(2)) } -// get the path to an archive's temporary local sync path +/** + * @description Get the path to an archive's temporary local sync path. + * @param {string | Buffer | InternalDatArchive} archiveOrKey + * @returns {string} + */ const getInternalLocalSyncPath = exports.getInternalLocalSyncPath = function (archiveOrKey) { - var key = datEncoding.toStr(archiveOrKey.key || archiveOrKey) + var key /** @type string */ + if (typeof archiveOrKey === 'string') { + key = archiveOrKey + } else if (Buffer.isBuffer(archiveOrKey)) { + key = datEncoding.toStr(archiveOrKey) + } else { + key = datEncoding.toStr(archiveOrKey.key) + } return path.join(datPath, 'Archives', 'LocalCopy', key.slice(0, 2), key.slice(2)) } -// delete all db entries and files for an archive +/** + * @description Delete all db entries and files for an archive. + * @param {string} key + * @returns {Promise} + */ exports.deleteArchive = async function (key) { const path = getArchiveMetaPath(key) const info = await jetpack.inspectTreeAsync(path) @@ -64,40 +154,42 @@ exports.removeListener = events.removeListener.bind(events) // exported methods: archive user settings // = -// get an array of saved archives -// - optional `query` keys: -// - `isSaved`: bool -// - `isNetworked`: bool -// - `isOwner`: bool, does beaker have the secret key? 
-// - `type`: string, a type filter -// - `showHidden`: bool, show hidden dats -// - `key`: string, the key of the archive you want (return single result) -exports.query = async function (profileId, query) { - query = query || {} - +/** + * @description Get an array of saved archives. + * @param {number} profileId + * @param {Object} [query] + * @param {string} [query.key] + * @param {boolean} [query.isSaved] + * @param {boolean} [query.isNetworked] + * @param {boolean} [query.isOwner] + * @param {boolean} [query.showHidden] + * @param {string} [query.type] + * @param {string} [query.string] + * @returns {Promise>} + */ +exports.query = async function (profileId, query = {}) { // fetch archive meta var values = [] - var WHERE = [] - if (query.isOwner === true) WHERE.push('archives_meta.isOwner = 1') - if (query.isOwner === false) WHERE.push('archives_meta.isOwner = 0') - if (query.isNetworked === true) WHERE.push('archives.networked = 1') - if (query.isNetworked === false) WHERE.push('archives.networked = 0') + var whereList = [] + if (query.isOwner === true) whereList.push('archives_meta.isOwner = 1') + if (query.isOwner === false) whereList.push('archives_meta.isOwner = 0') + if (query.isNetworked === true) whereList.push('archives.networked = 1') + if (query.isNetworked === false) whereList.push('archives.networked = 0') if ('isSaved' in query) { if (query.isSaved) { - WHERE.push('archives.profileId = ?') + whereList.push('archives.profileId = ?') values.push(profileId) - WHERE.push('archives.isSaved = 1') + whereList.push('archives.isSaved = 1') } else { - WHERE.push('(archives.isSaved = 0 OR archives.isSaved IS NULL)') + whereList.push('(archives.isSaved = 0 OR archives.isSaved IS NULL)') } } - if ('key' in query) { - WHERE.push('archives_meta.key = ?') + if (typeof query.key !== 'undefined') { + whereList.push('archives_meta.key = ?') values.push(query.key) } - if (!query.showHidden) WHERE.push('(archives.hidden = 0 OR archives.hidden IS NULL)') - if (WHERE.length) WHERE = `WHERE ${WHERE.join(' AND ')}` - else WHERE = '' + if (!query.showHidden) whereList.push('(archives.hidden = 0 OR archives.hidden IS NULL)') + var WHERE = whereList.length ? `WHERE ${whereList.join(' AND ')}` : '' var archives = await db.all(` SELECT @@ -156,7 +248,7 @@ exports.query = async function (profileId, query) { // apply manual filters if ('type' in query) { let types = Array.isArray(query.type) ? query.type : [query.type] - archives = archives.filter(a => { + archives = archives.filter((/** @type LibraryArchiveRecord */ a) => { for (let type of types) { if (a.type.indexOf(type) === -1) { return false @@ -169,7 +261,10 @@ exports.query = async function (profileId, query) { return ('key' in query) ? archives[0] : archives } -// get all archives that should be unsaved +/** + * @description Get all archives that should be unsaved. + * @returns {Promise>} + */ exports.listExpiredArchives = async function () { return db.all(` SELECT archives.key @@ -182,10 +277,16 @@ exports.listExpiredArchives = async function () { `, [Date.now()]) } -// get all archives that are ready for garbage collection +/** + * @description Get all archives that are ready for garbage collection. + * @param {Object} [opts] + * @param {number} [opts.olderThan] + * @param {boolean} [opts.isOwner] + * @returns {Promise>} + */ exports.listGarbageCollectableArchives = async function ({olderThan, isOwner} = {}) { olderThan = typeof olderThan === 'number' ? olderThan : DAT_GC_EXPIRATION_AGE - isOwner = typeof isOwner === 'boolean' ? 
`AND archives_meta.isOwner = ${isOwner ? '1' : '0'}` : '' + var isOwnerClause = typeof isOwner === 'boolean' ? `AND archives_meta.isOwner = ${isOwner ? '1' : '0'}` : '' // fetch archives var records = await db.all(` @@ -195,7 +296,7 @@ exports.listGarbageCollectableArchives = async function ({olderThan, isOwner} = WHERE (archives.isSaved != 1 OR archives.isSaved IS NULL) AND archives_meta.lastAccessTime < ? - ${isOwner} + ${isOwnerClause} `, [Date.now() - olderThan]) var records2 = records.slice() @@ -208,7 +309,13 @@ exports.listGarbageCollectableArchives = async function ({olderThan, isOwner} = return records } -// upsert the last-access time +/** + * @description Upsert the last-access time. + * @param {string | Buffer} key + * @param {string} [timeVar] + * @param {number} [value] + * @returns {Promise} + */ exports.touch = async function (key, timeVar = 'lastAccessTime', value = -1) { var release = await lock('archives-db:meta') try { @@ -216,22 +323,28 @@ exports.touch = async function (key, timeVar = 'lastAccessTime', value = -1) { timeVar = 'lastAccessTime' } if (value === -1) value = Date.now() - key = datEncoding.toStr(key) - await db.run(`UPDATE archives_meta SET ${timeVar}=? WHERE key=?`, [value, key]) - await db.run(`INSERT OR IGNORE INTO archives_meta (key, ${timeVar}) VALUES (?, ?)`, [key, value]) + var keyStr = datEncoding.toStr(key) + await db.run(`UPDATE archives_meta SET ${timeVar}=? WHERE key=?`, [value, keyStr]) + await db.run(`INSERT OR IGNORE INTO archives_meta (key, ${timeVar}) VALUES (?, ?)`, [keyStr, value]) } finally { release() } } -// get a single archive's user settings -// - supresses a not-found with an empty object +/** + * @description + * Get a single archive's user settings. + * (Returns an empty object on not found.) + * @param {number} profileId + * @param {string | Buffer} key + * @returns {Promise} + */ const getUserSettings = exports.getUserSettings = async function (profileId, key) { // massage inputs - key = typeof key !== 'string' ? datEncoding.toStr(key) : key + var keyStr = typeof key !== 'string' ? datEncoding.toStr(key) : key // validate inputs - if (!DAT_HASH_REGEX.test(key)) { + if (!DAT_HASH_REGEX.test(keyStr)) { throw new InvalidArchiveKeyError() } @@ -239,39 +352,53 @@ const getUserSettings = exports.getUserSettings = async function (profileId, key try { var settings = await db.get(` SELECT * FROM archives WHERE profileId = ? AND key = ? - `, [profileId, key]) + `, [profileId, keyStr]) settings.isSaved = !!settings.isSaved settings.hidden = !!settings.hidden settings.networked = !!settings.networked settings.autoDownload = !!settings.autoDownload settings.autoUpload = !!settings.autoUpload - settings.previewMode = settings.previewMode == 1 - return settings + settings.previewMode = Number(settings.previewMode) === 1 + return /** @type LibraryArchiveUserSettings */(settings) } catch (e) { - return {} + return /** @type LibraryArchiveUserSettings */({}) } } -// write an archive's user setting +/** + * @description Write an archive's user setting. 
+ * @param {number} profileId + * @param {string | Buffer} key + * @param {Object} [newValues] + * @param {boolean} [newValues.isSaved] + * @param {boolean} [newValues.hidden] + * @param {boolean} [newValues.networked] + * @param {boolean} [newValues.autoDownload] + * @param {boolean} [newValues.autoUpload] + * @param {number} [newValues.expiresAt] + * @param {string} [newValues.localSyncPath] + * @param {boolean} [newValues.previewMode] + * @returns {Promise} + */ exports.setUserSettings = async function (profileId, key, newValues = {}) { // massage inputs - key = datEncoding.toStr(key) + var keyStr = datEncoding.toStr(key) // validate inputs - if (!DAT_HASH_REGEX.test(key)) { + if (!DAT_HASH_REGEX.test(keyStr)) { throw new InvalidArchiveKeyError() } var release = await lock('archives-db') try { // fetch current - var value = await getUserSettings(profileId, key) + var value = await getUserSettings(profileId, keyStr) if (!value || typeof value.key === 'undefined') { // create - value = { + value = /** @type LibraryArchiveUserSettings */ ({ profileId, - key, + key: keyStr, isSaved: newValues.isSaved, hidden: newValues.hidden, networked: ('networked' in newValues) ? newValues.networked : true, @@ -280,10 +407,10 @@ exports.setUserSettings = async function (profileId, key, newValues = {}) { expiresAt: newValues.expiresAt, localSyncPath: (newValues.localSyncPath) ? newValues.localSyncPath : '', previewMode: ('previewMode' in newValues) ? newValues.previewMode : '' - } + }) let valueArray = [ profileId, - key, + keyStr, flag(value.isSaved), flag(value.hidden), flag(value.networked), @@ -330,7 +457,7 @@ exports.setUserSettings = async function (profileId, key, newValues = {}) { value.localSyncPath, flag(value.previewMode), profileId, - key + keyStr ] await db.run(` UPDATE archives @@ -348,7 +475,7 @@ exports.setUserSettings = async function (profileId, key, newValues = {}) { `, valueArray) } - events.emit('update:archive-user-settings', key, value, newValues) + events.emit('update:archive-user-settings', keyStr, value, newValues) return value } finally { release() @@ -358,14 +485,19 @@ exports.setUserSettings = async function (profileId, key, newValues = {}) { // exported methods: archive meta // = -// get a single archive's metadata -// - supresses a not-found with an empty object +/** + * @description + * Get a single archive's metadata. + * Returns an empty object on not-found. + * @param {string | Buffer} key + * @returns {Promise} + */ const getMeta = exports.getMeta = async function (key) { // massage inputs - key = typeof key !== 'string' ? datEncoding.toStr(key) : key + var keyStr = typeof key !== 'string' ? datEncoding.toStr(key) : key // validate inputs - if (!DAT_HASH_REGEX.test(key)) { + if (!DAT_HASH_REGEX.test(keyStr)) { throw new InvalidArchiveKeyError() } @@ -380,9 +512,9 @@ const getMeta = exports.getMeta = async function (key) { LEFT JOIN apps ON apps.url = ('dat://' || archives_meta.key) WHERE archives_meta.key = ? GROUP BY archives_meta.key - `, [key]) + `, [keyStr]) if (!meta) { - return defaultMeta(key) + return defaultMeta(keyStr) } // massage some values @@ -401,15 +533,23 @@ const getMeta = exports.getMeta = async function (key) { return meta } -// write an archive's metadata -exports.setMeta = async function (key, value = {}) { +/** + * @description Write an archive's metadata. 
+ * @param {string | Buffer} key + * @param {LibraryArchiveMeta} [value] + * @returns {Promise} + */ +exports.setMeta = async function (key, value) { // massage inputs - key = datEncoding.toStr(key) + var keyStr = datEncoding.toStr(key) // validate inputs - if (!DAT_HASH_REGEX.test(key)) { + if (!DAT_HASH_REGEX.test(keyStr)) { throw new InvalidArchiveKeyError() } + if (!value || typeof value !== 'object') { + return // dont bother + } // extract the desired values var {title, description, type, size, mtime, isOwner} = value @@ -417,30 +557,35 @@ exports.setMeta = async function (key, value = {}) { description = typeof description === 'string' ? description : '' if (typeof type === 'string') type = type.split(' ') else if (Array.isArray(type)) type = type.filter(v => v && typeof v === 'string') - isOwner = flag(isOwner) + var isOwnerFlag = flag(isOwner) // write var release = await lock('archives-db:meta') - var {lastAccessTime, lastLibraryAccessTime} = await getMeta(key) + var {lastAccessTime, lastLibraryAccessTime} = await getMeta(keyStr) try { await db.run(` INSERT OR REPLACE INTO archives_meta (key, title, description, mtime, size, isOwner, lastAccessTime, lastLibraryAccessTime) VALUES (?, ?, ?, ?, ?, ?, ?, ?) - `, [key, title, description, mtime, size, isOwner, lastAccessTime, lastLibraryAccessTime]) - await db.run(`DELETE FROM archives_meta_type WHERE key=?`, key) + `, [keyStr, title, description, mtime, size, isOwnerFlag, lastAccessTime, lastLibraryAccessTime]) + await db.run(`DELETE FROM archives_meta_type WHERE key=?`, keyStr) if (type) { await Promise.all(type.map(t => ( - db.run(`INSERT INTO archives_meta_type (key, type) VALUES (?, ?)`, [key, t]) + db.run(`INSERT INTO archives_meta_type (key, type) VALUES (?, ?)`, [keyStr, t]) ))) } } finally { release() } - events.emit('update:archive-meta', key, value) + events.emit('update:archive-meta', keyStr, value) } -// find the archive currently using a given localSyncPath +/** + * @description Find the archive currently using a given localSyncPath. + * @param {number} profileId + * @param {string} localSyncPath + * @returns {Promise} + */ exports.getByLocalSyncPath = async function (profileId, localSyncPath) { try { return await db.get(` @@ -454,24 +599,37 @@ exports.getByLocalSyncPath = async function (profileId, localSyncPath) { // internal methods // = +/** + * @param {string} key + * @returns {LibraryArchiveMeta} + */ function defaultMeta (key) { return { key, title: null, description: null, type: [], - author: null, mtime: 0, isOwner: false, lastAccessTime: 0, - installedNames: [] + lastLibraryAccessTime: 0, + installedNames: [], + size: 0 } } +/** + * @param {boolean} b + * @returns {number} + */ function flag (b) { return b ? 
1 : 0 } +/** + * @param {string} originURL + * @returns {string} + */ exports.extractOrigin = function (originURL) { var urlp = url.parse(originURL) if (!urlp || !urlp.host || !urlp.protocol) return diff --git a/dbs/bookmarks.js b/dbs/bookmarks.js index da53524d..62081f2b 100644 --- a/dbs/bookmarks.js +++ b/dbs/bookmarks.js @@ -5,15 +5,41 @@ const lock = require('../lib/lock') const NORMALIZE_OPTS = { stripFragment: false, stripWWW: false, - removeQueryParameters: false, removeTrailingSlash: false } +// typedefs +// = + +/** + * @typedef {Object} Bookmark + * @prop {boolean} _origin + * @prop {boolean} _url + * @prop {boolean} private + * @prop {number} createdAt + * @prop {string} href + * @prop {string} title + * @prop {string[]} tags + * @prop {string} notes + * @prop {boolean} pinned + * @prop {number} pinOrder + */ + // exported methods // = +/** + * @param {number} profileId + * @param {string} url + * @param {Object} values + * @param {string} values.title + * @param {string | string[]} values.tags + * @param {string} values.notes + * @param {number} values.pinOrder + * @returns {Promise} + */ exports.bookmark = async function (profileId, url, {title, tags, notes, pinOrder}) { - tags = tagsToString(tags) + var tagsStr = tagsToString(tags) var release = await lock(`bookmark:${url}`) try { // read old bookmark and fallback to old values as needed @@ -21,7 +47,7 @@ exports.bookmark = async function (profileId, url, {title, tags, notes, pinOrder oldBookmark = oldBookmark || {} const pinned = oldBookmark.pinned ? 1 : 0 title = typeof title === 'undefined' ? oldBookmark.title : title - tags = typeof tags === 'undefined' ? oldBookmark.tags : tags + tagsStr = typeof tagsStr === 'undefined' ? oldBookmark.tags : tagsStr notes = typeof notes === 'undefined' ? oldBookmark.notes : notes pinOrder = typeof pinOrder === 'undefined' ? oldBookmark.pinOrder : pinOrder @@ -30,20 +56,36 @@ exports.bookmark = async function (profileId, url, {title, tags, notes, pinOrder INSERT OR REPLACE INTO bookmarks (profileId, url, title, tags, notes, pinned, pinOrder) VALUES (?, ?, ?, ?, ?, ?, ?) - `, [profileId, url, title, tags, notes, pinned, pinOrder]) + `, [profileId, url, title, tagsStr, notes, pinned, pinOrder]) } finally { release() } } +/** + * @param {number} profileId + * @param {string} url + * @returns {Promise} + */ exports.unbookmark = function (profileId, url) { return db.run(`DELETE FROM bookmarks WHERE profileId = ? AND url = ?`, [profileId, url]) } +/** + * @param {number} profileId + * @param {string} url + * @param {boolean} pinned + * @returns {Promise} + */ exports.setBookmarkPinned = function (profileId, url, pinned) { return db.run(`UPDATE bookmarks SET pinned = ? WHERE profileId = ? AND url = ?`, [pinned ? 1 : 0, profileId, url]) } +/** + * @param {number} profileId + * @param {string[]} urls + * @returns {Promise} + */ exports.setBookmarkPinOrder = async function (profileId, urls) { var len = urls.length await Promise.all(urls.map((url, i) => ( @@ -51,10 +93,21 @@ exports.setBookmarkPinOrder = async function (profileId, urls) { ))) } +/** + * @param {number} profileId + * @param {string} url + * @returns {Promise} + */ exports.getBookmark = async function (profileId, url) { return toNewFormat(await db.get(`SELECT url, title, tags, notes, pinned, pinOrder, createdAt FROM bookmarks WHERE profileId = ? 
AND url = ?`, [profileId, url])) } +/** + * @param {number} profileId + * @param {Object} [opts] + * @param {string} [opts.tag] + * @returns {Promise>} + */ exports.listBookmarks = async function (profileId, {tag} = {}) { var bookmarks = await db.all(`SELECT url, title, tags, notes, pinned, pinOrder, createdAt FROM bookmarks WHERE profileId = ? ORDER BY createdAt DESC`, [profileId]) bookmarks = bookmarks.map(toNewFormat) @@ -73,11 +126,19 @@ exports.listBookmarks = async function (profileId, {tag} = {}) { return bookmarks } +/** + * @param {number} profileId + * @returns {Promise>} + */ exports.listPinnedBookmarks = async function (profileId) { var bookmarks = await db.all(`SELECT url, title, tags, notes, pinned, pinOrder, createdAt FROM bookmarks WHERE profileId = ? AND pinned = 1 ORDER BY pinOrder DESC`, [profileId]) return bookmarks.map(toNewFormat) } +/** + * @param {number} profileId + * @returns {Promise>} + */ exports.listBookmarkTags = async function (profileId) { var tagSet = new Set() var bookmarks = await db.all(`SELECT tags FROM bookmarks WHERE profileId = ?`, [profileId]) @@ -89,10 +150,14 @@ exports.listBookmarkTags = async function (profileId) { return Array.from(tagSet) } -// TEMP -// apply normalization to old bookmarks -// (can probably remove this in 2018 or so) -// -prf +/** + * @description + * TEMP + * apply normalization to old bookmarks + * (can probably remove this in 2018 or so) + * -prf + * @returns {Promise} + */ exports.fixOldBookmarks = async function () { var bookmarks = await db.all(`SELECT url FROM bookmarks`) bookmarks.forEach(b => { @@ -101,6 +166,10 @@ exports.fixOldBookmarks = async function () { }) } +/** + * @param {string | string[]} v + * @returns {string} + */ function tagsToString (v) { if (Array.isArray(v)) { v = v.join(' ') @@ -108,8 +177,12 @@ function tagsToString (v) { return v } +/** + * @param {Object} b + * @returns {Bookmark | null} + */ function toNewFormat (b) { - if (!b) return b + if (!b) return null return { _origin: false, _url: false, diff --git a/dbs/history.js b/dbs/history.js index 7c17d457..aa2ed62a 100644 --- a/dbs/history.js +++ b/dbs/history.js @@ -1,7 +1,13 @@ const lock = require('../lib/lock') const db = require('./profile-data-db') +// typedefs +// = + class BadParamError extends Error { + /** + * @param {string} msg + */ constructor (msg) { super() this.name = 'BadParamError' @@ -9,9 +15,30 @@ class BadParamError extends Error { } } +/** + * @typedef {Object} Visit + * @prop {number} profileId + * @prop {string} url + * @prop {string} title + * @prop {number} ts + * + * @typedef {Object} VisitSearchResult + * @prop {string} offsets + * @prop {string} url + * @prop {string} title + * @prop {number} num_visits + */ + // exported methods // = +/** + * @param {number} profileId + * @param {Object} values + * @param {string} values.url + * @param {string} values.title + * @returns {Promise} + */ exports.addVisit = async function (profileId, {url, title}) { // validate parameters if (!url || typeof url !== 'string') { @@ -54,14 +81,24 @@ exports.addVisit = async function (profileId, {url, title}) { } } +/** + * @param {number} profileId + * @param {Object} opts + * @param {string} [opts.search] + * @param {number} [opts.offset] + * @param {number} [opts.limit] + * @param {number} [opts.before] + * @param {number} [opts.after] + * @returns {Promise>} + */ exports.getVisitHistory = async function (profileId, {search, offset, limit, before, after}) { var release = await lock('history-db') try { - const params = [ + const params 
= /** @type Array */([ profileId, limit || 50, offset || 0 - ] + ]) if (search) { // prep search terms params.push( @@ -102,6 +139,13 @@ exports.getVisitHistory = async function (profileId, {search, offset, limit, bef } } +/** + * @param {number} profileId + * @param {Object} opts + * @param {number} [opts.offset] + * @param {number} [opts.limit] + * @returns {Promise>} + */ exports.getMostVisited = async function (profileId, { offset, limit }) { var release = await lock('history-db') try { @@ -121,6 +165,10 @@ exports.getMostVisited = async function (profileId, { offset, limit }) { } } +/** + * @param {string} q + * @returns {Promise>} + */ exports.search = async function (q) { if (!q || typeof q !== 'string') { throw new BadParamError('q must be a string') @@ -148,6 +196,10 @@ exports.search = async function (q) { } } +/** + * @param {string} url + * @returns {Promise} + */ exports.removeVisit = async function (url) { // validate parameters if (!url || typeof url !== 'string') { @@ -168,6 +220,10 @@ exports.removeVisit = async function (url) { } } +/** + * @param {number} timestamp + * @returns {Promise} + */ exports.removeVisitsAfter = async function (timestamp) { var release = await lock('history-db') try { @@ -182,6 +238,9 @@ exports.removeVisitsAfter = async function (timestamp) { } } +/** + * @returns {Promise} + */ exports.removeAllVisits = async function () { var release = await lock('history-db') db.run('DELETE FROM visits;') diff --git a/dbs/profile-data-db.js b/dbs/profile-data-db.js index a4c919b4..b310a93f 100644 --- a/dbs/profile-data-db.js +++ b/dbs/profile-data-db.js @@ -14,6 +14,10 @@ var setupPromise // exported methods // = +/** + * @param {Object} opts + * @param {string} opts.userDataPath + */ exports.setup = function (opts) { // open database var dbPath = path.join(opts.userDataPath, 'Profiles') @@ -21,16 +25,28 @@ exports.setup = function (opts) { setupPromise = setupSqliteDB(db, {setup: setupDb, migrations}, '[PROFILES]') } +/** + * @param {...(string | number | boolean | Array)} args + * @return {Promise} + */ exports.get = async function (...args) { await setupPromise return cbPromise(cb => db.get(...args, cb)) } +/** + * @param {...(string | number | boolean | Array)} args + * @return {Promise>} + */ exports.all = async function (...args) { await setupPromise return cbPromise(cb => db.all(...args, cb)) } +/** + * @param {...(string | number | boolean | Array)} args + * @return {Promise} + */ exports.run = async function (...args) { await setupPromise return cbPromise(cb => db.run(...args, function (err) { @@ -39,10 +55,16 @@ exports.run = async function (...args) { })) } +/** + * @returns {Promise} + */ exports.serialize = function () { return db.serialize() } +/** + * @returns {Promise} + */ exports.parallelize = function () { return db.parallelize() } diff --git a/dbs/settings.js b/dbs/settings.js index e2fafe4b..0a9ce06d 100644 --- a/dbs/settings.js +++ b/dbs/settings.js @@ -16,6 +16,11 @@ var events = new EventEmitter() // exported methods // = +/** + * @param {Object} opts + * @param {string} opts.userDataPath + * @param {string} opts.homePath + */ exports.setup = function (opts) { // open database var dbPath = path.join(opts.userDataPath, 'Settings') @@ -38,10 +43,15 @@ exports.setup = function (opts) { exports.on = events.on.bind(events) exports.once = events.once.bind(events) +/** + * @param {string} key + * @param {string | number} value + * @returns {Promise} + */ exports.set = function (key, value) { events.emit('set', key, value) events.emit('set:' + 
key, value) - return setupPromise.then(v => cbPromise(cb => { + return setupPromise.then(() => cbPromise(cb => { db.run(` INSERT OR REPLACE INTO settings (key, value, ts) @@ -50,13 +60,17 @@ exports.set = function (key, value) { })) } +/** + * @param {string} key + * @returns {boolean | Promise} + */ exports.get = function (key) { // env variables if (key === 'no_welcome_tab') { - return (getEnvVar('BEAKER_NO_WELCOME_TAB') == 1) + return (Number(getEnvVar('BEAKER_NO_WELCOME_TAB')) === 1) } // stored values - return setupPromise.then(v => cbPromise(cb => { + return setupPromise.then(() => cbPromise(cb => { db.get(`SELECT value FROM settings WHERE key = ?`, [key], (err, row) => { if (row) { row = row.value } if (typeof row === 'undefined') { row = defaultSettings[key] } @@ -65,6 +79,9 @@ exports.get = function (key) { })) } +/** + * @returns {Promise} + */ exports.getAll = function () { return setupPromise.then(v => cbPromise(cb => { db.all(`SELECT key, value FROM settings`, (err, rows) => { @@ -73,7 +90,7 @@ exports.getAll = function () { var obj = {} rows.forEach(row => { obj[row.key] = row.value }) obj = Object.assign({}, defaultSettings, obj) - obj.no_welcome_tab = (getEnvVar('BEAKER_NO_WELCOME_TAB') == 1) + obj.no_welcome_tab = (Number(getEnvVar('BEAKER_NO_WELCOME_TAB')) === 1) cb(null, obj) }) })) diff --git a/dbs/sitedata.js b/dbs/sitedata.js index 9fcfe436..b2c181e1 100644 --- a/dbs/sitedata.js +++ b/dbs/sitedata.js @@ -8,6 +8,7 @@ const datLibrary = require('../dat/library') // globals // = + var db var migrations var setupPromise @@ -15,6 +16,10 @@ var setupPromise // exported methods // = +/** + * @param {Object} opts + * @param {string} opts.userDataPath + */ exports.setup = function (opts) { // open database var dbPath = path.join(opts.userDataPath, 'SiteData') @@ -22,6 +27,14 @@ exports.setup = function (opts) { setupPromise = setupSqliteDB(db, {migrations}, '[SITEDATA]') } +/** + * @param {string} url + * @param {string} key + * @param {number | string} value + * @param {Object} [opts] + * @param {boolean} [opts.dontExtractOrigin] + * @returns {Promise} + */ const set = exports.set = async function (url, key, value, opts) { await setupPromise var origin = opts && opts.dontExtractOrigin ? url : await extractOrigin(url) @@ -35,6 +48,11 @@ const set = exports.set = async function (url, key, value, opts) { }) } +/** + * @param {string} url + * @param {string} key + * @returns {Promise} + */ const clear = exports.clear = async function (url, key) { await setupPromise var origin = await extractOrigin(url) @@ -46,6 +64,13 @@ const clear = exports.clear = async function (url, key) { }) } +/** + * @param {string} url + * @param {string} key + * @param {Object} [opts] + * @param {boolean} [opts.dontExtractOrigin] + * @returns {Promise} + */ const get = exports.get = async function (url, key, opts) { await setupPromise var origin = opts && opts.dontExtractOrigin ? 
url : await extractOrigin(url) @@ -58,6 +83,10 @@ const get = exports.get = async function (url, key, opts) { }) } +/** + * @param {string} url + * @returns {Promise} + */ const getPermissions = exports.getPermissions = async function (url) { await setupPromise var origin = await extractOrigin(url) @@ -75,6 +104,10 @@ const getPermissions = exports.getPermissions = async function (url) { }) } +/** + * @param {string} url + * @returns {Promise>} + */ exports.getNetworkPermissions = async function (url) { await setupPromise var origin = await extractOrigin(url) @@ -84,7 +117,7 @@ exports.getNetworkPermissions = async function (url) { if (err) return cb(err) // convert to array - var origins = [] + var origins = /** @type string[] */([]) if (rows) { rows.forEach(row => { if (row.value) origins.push(row.key.split(':').pop()) @@ -95,6 +128,10 @@ exports.getNetworkPermissions = async function (url) { }) } +/** + * @param {string} url + * @returns {Promise} + */ const getAppPermissions = exports.getAppPermissions = async function (url) { await setupPromise var origin = await extractOrigin(url) @@ -117,15 +154,31 @@ const getAppPermissions = exports.getAppPermissions = async function (url) { }) } +/** + * @param {string} url + * @param {string} key + * @returns {Promise} + */ const getPermission = exports.getPermission = function (url, key) { return get(url, 'perm:' + key) } +/** + * @param {string} url + * @param {string} key + * @param {string | number} value + * @returns {Promise} + */ const setPermission = exports.setPermission = function (url, key, value) { value = value ? 1 : 0 return set(url, 'perm:' + key, value) } +/** + * @param {string} url + * @param {Object} appPerms + * @returns {Promise} + */ const setAppPermissions = exports.setAppPermissions = async function (url, appPerms) { await setupPromise var origin = await extractOrigin(url) @@ -150,10 +203,19 @@ const setAppPermissions = exports.setAppPermissions = async function (url, appPe } } +/** + * @param {string} url + * @param {string} key + * @returns {Promise} + */ const clearPermission = exports.clearPermission = function (url, key) { return clear(url, 'perm:' + key) } +/** + * @param {string} key + * @returns {Promise} + */ const clearPermissionAllOrigins = exports.clearPermissionAllOrigins = async function (key) { await setupPromise key = 'perm:' + key @@ -164,26 +226,6 @@ const clearPermissionAllOrigins = exports.clearPermissionAllOrigins = async func }) } -exports.query = async function (values) { - await setupPromise - - // massage query - if ('origin' in values) { - values.origin = await extractOrigin(values.origin) - } - - return cbPromise(cb => { - // run query - const keys = Object.keys(values) - const where = keys.map(k => `${k} = ?`).join(' AND ') - values = keys.map(k => values[k]) - db.all(`SELECT * FROM sitedata WHERE ${where}`, values, (err, res) => { - if (err) return cb(err) - cb(null, res && res.value) - }) - }) -} - exports.WEBAPI = { get, set, @@ -199,6 +241,10 @@ exports.WEBAPI = { // internal methods // = +/** + * @param {string} originURL + * @returns {Promise} + */ async function extractOrigin (originURL) { var urlp = url.parse(originURL) if (!urlp || !urlp.host || !urlp.protocol) return diff --git a/dbs/templates.js b/dbs/templates.js index 4be48c6c..61a9e387 100644 --- a/dbs/templates.js +++ b/dbs/templates.js @@ -1,20 +1,56 @@ const db = require('./profile-data-db') +// typedefs +// = + +/** + * @typedef {Object} Template + * @prop {string} url + * @prop {string} title + * @prop {number} createdAt + * 
+ * @typedef {Object} TemplateScreenshot + * @prop {string} url + * @prop {string} screenshot + */ + // exported api // = +/** + * @param {number} profileId + * @param {string} url + * @returns {Promise