diff --git a/LICENSE b/LICENSE index dcdc31b4..d1a38ee6 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2018 Blue Link Labs +Copyright (c) 2019 Blue Link Labs Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index df6f1065..65436571 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,10 @@ await beakerCore.setup({ exportAPI(apiName, apiManifest, apiImpl, [guardFn]) }, downloadsWebAPI: {...}, - browserWebAPI: {...} + browserWebAPI: {...}, + userSessionAPI: { + getFor(webContents) {/*...*/} + } }) // setup the protocol handler @@ -115,6 +118,10 @@ debug('dat-related stuff') ### `dat.debug` +### `crawler` + +### `users` + ## API (@beaker/core/webview) ### `setup()` \ No newline at end of file diff --git a/applications.js b/applications.js new file mode 100644 index 00000000..654a0651 --- /dev/null +++ b/applications.js @@ -0,0 +1,69 @@ +const sessionPerms = require('./lib/session-perms') +const knex = require('./lib/knex') +const db = require('./dbs/profile-data-db') +const sitedataDb = require('./dbs/sitedata') +const dat = require('./dat') + +// typedefs +// = + +/** + * @typedef {Object} ApplicationPermission + * @prop {string} id + * @prop {string[]} caps + * @prop {string} description + * + * @typedef {Object} ApplicationState + * @prop {string} url + * @prop {ApplicationPermission[]} permissions + * @prop {boolean} installed + * @prop {boolean} enabled + * @prop {string} installedAt + */ + +// exported api +// = + +/** + * @param {Object} opts + * @param {number} opts.userId + * @param {string} opts.url + * @returns {Promise} + */ +exports.getApplicationState = async function ({userId, url}) { + url = await dat.library.getPrimaryUrl(url) + var record = await db.get(knex('installed_applications').where({userId, url})) + if (record) { + record.installed = true + } else { + record = { + url, + installed: false, + enabled: false, + installedAt: null + } + } + record.permissions = await sitedataDb.getAppPermissions(record.url) + return massageAppRecord(record) +} + +// internal methods +// = + +/** + * @param {Object} record + * @returns {ApplicationState} + */ +function massageAppRecord (record) { + return { + url: record.url, + permissions: Object.entries(record.permissions).map(([id, caps]) => ({ + id, + caps, + description: sessionPerms.describePerm(id, caps) + })), + installed: record.installed, + enabled: Boolean(record.enabled), + installedAt: record.createdAt ? 
(new Date(record.createdAt)).toISOString() : null + } +} \ No newline at end of file diff --git a/crawler/bookmarks.js b/crawler/bookmarks.js new file mode 100644 index 00000000..3c28f72b --- /dev/null +++ b/crawler/bookmarks.js @@ -0,0 +1,387 @@ +const assert = require('assert') +const {URL} = require('url') +const Events = require('events') +const Ajv = require('ajv') +const logger = require('../logger').child({category: 'crawler', dataset: 'bookmarks'}) +const db = require('../dbs/profile-data-db') +const datLibrary = require('../dat/library') +const knex = require('../lib/knex') +const crawler = require('./index') +const siteDescriptions = require('./site-descriptions') +const {doCrawl, doCheckpoint, emitProgressEvent, getMatchingChangesInOrder, generateTimeFilename, normalizeTopicUrl, ensureDirectory} = require('./util') +const bookmarkSchema = require('./json-schemas/bookmark') + +// constants +// = + +const TABLE_VERSION = 3 +const JSON_TYPE = 'unwalled.garden/bookmark' +const JSON_PATH_REGEX = /^\/data\/bookmarks\/([^/]+)\.json$/i + +// typedefs +// = + +/** + * @typedef {import('../dat/library').InternalDatArchive} InternalDatArchive + * @typedef {import('./util').CrawlSourceRecord} CrawlSourceRecord + * @typedef { import("./site-descriptions").SiteDescription } SiteDescription + * + * @typedef {Object} Bookmark + * @prop {string} pathname + * @prop {string} href + * @prop {string} title + * @prop {string?} description + * @prop {string[]?} tags + * @prop {number} crawledAt + * @prop {number} createdAt + * @prop {number} updatedAt + * @prop {SiteDescription} author + */ + +// globals +// = + +const events = new Events() +const ajv = (new Ajv()) +const validateBookmark = ajv.compile(bookmarkSchema) + +// exported api +// = + +exports.on = events.on.bind(events) +exports.addListener = events.addListener.bind(events) +exports.removeListener = events.removeListener.bind(events) + +/** + * @description + * Crawl the given site for bookmarks. + * + * @param {InternalDatArchive} archive - site to crawl. + * @param {CrawlSourceRecord} crawlSource - internal metadata about the crawl target. + * @returns {Promise} + */ +exports.crawlSite = async function (archive, crawlSource) { + return doCrawl(archive, crawlSource, 'crawl_bookmarks', TABLE_VERSION, async ({changes, resetRequired}) => { + const supressEvents = resetRequired === true // dont emit when replaying old info + logger.silly('Crawling bookmarks', {details: {url: archive.url, numChanges: changes.length, resetRequired}}) + if (resetRequired) { + // reset all data + logger.debug('Resetting dataset', {details: {url: archive.url}}) + await db.run(` + DELETE FROM crawl_bookmarks WHERE crawlSourceId = ? 
+ `, [crawlSource.id]) + await doCheckpoint('crawl_bookmarks', TABLE_VERSION, crawlSource, 0) + } + + // collect changed bookmarks + var changedBookmarks = getMatchingChangesInOrder(changes, JSON_PATH_REGEX) + if (changedBookmarks.length) { + logger.verbose('Collected new/changed bookmark files', {details: {url: archive.url, changedBookmarks: changedBookmarks.map(p => p.name)}}) + } else { + logger.debug('No new bookmark-files found', {details: {url: archive.url}}) + } + emitProgressEvent(archive.url, 'crawl_bookmarks', 0, changedBookmarks.length) + + // read and apply each bookmark in order + var progress = 0 + for (let changedBookmark of changedBookmarks) { + // TODO Currently the crawler will abort reading the bookmarks if any bookmark fails to load + // this means that a single unreachable file can stop the forward progress of bookmark indexing + // to solve this, we need to find a way to tolerate unreachable bookmark-files without losing our ability to efficiently detect new bookmarks + // -prf + if (changedBookmark.type === 'del') { + // delete + await db.run(` + DELETE FROM crawl_bookmarks WHERE crawlSourceId = ? AND pathname = ? + `, [crawlSource.id, changedBookmark.name]) + events.emit('bookmark-removed', archive.url) + } else { + // read + let bookmarkString + try { + bookmarkString = await archive.pda.readFile(changedBookmark.name, 'utf8') + } catch (err) { + logger.warn('Failed to read bookmark file, aborting', {details: {url: archive.url, name: changedBookmark.name, err}}) + return // abort indexing + } + + // parse and validate + let bookmark + try { + bookmark = JSON.parse(bookmarkString) + let valid = validateBookmark(bookmark) + if (!valid) throw ajv.errorsText(validateBookmark.errors) + } catch (err) { + logger.warn('Failed to parse bookmark file, skipping', {details: {url: archive.url, name: changedBookmark.name, err}}) + continue // skip + } + + // massage the bookmark + bookmark.href = normalizeTopicUrl(bookmark.href) + bookmark.createdAt = Number(new Date(bookmark.createdAt)) + bookmark.updatedAt = Number(new Date(bookmark.updatedAt)) + if (isNaN(bookmark.updatedAt)) bookmark.updatedAt = 0 // optional + if (!bookmark.description) bookmark.description = '' // optional + if (!bookmark.tags) bookmark.tags = [] // optional + + // upsert + let existingBookmark = await getBookmark(joinPath(archive.url, changedBookmark.name)) + if (existingBookmark) { + await db.run(`DELETE FROM crawl_bookmarks WHERE crawlSourceId = ? and pathname = ?`, [crawlSource.id, changedBookmark.name]) + } + let res = await db.run(` + INSERT INTO crawl_bookmarks (crawlSourceId, pathname, crawledAt, href, title, description, createdAt, updatedAt) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
+        `, [crawlSource.id, changedBookmark.name, Date.now(), bookmark.href, bookmark.title, bookmark.description, bookmark.createdAt, bookmark.updatedAt])
+        var bookmarkId = res.lastID
+        for (let tag of bookmark.tags) {
+          await db.run(`INSERT OR IGNORE INTO crawl_tags (tag) VALUES (?)`, [tag])
+          let tagRow = await db.get(`SELECT id FROM crawl_tags WHERE tag = ?`, [tag])
+          await db.run(`INSERT INTO crawl_bookmarks_tags (crawlBookmarkId, crawlTagId) VALUES (?, ?)`, [bookmarkId, tagRow.id])
+        }
+        events.emit('bookmark-added', archive.url)
+      }
+
+      // checkpoint our progress
+      await doCheckpoint('crawl_bookmarks', TABLE_VERSION, crawlSource, changedBookmark.version)
+      emitProgressEvent(archive.url, 'crawl_bookmarks', ++progress, changedBookmarks.length)
+    }
+    logger.silly(`Finished crawling bookmarks`, {details: {url: archive.url}})
+  })
+}
+
+/**
+ * @description
+ * List crawled bookmarks.
+ *
+ * @param {Object} [opts]
+ * @param {Object} [opts.filters]
+ * @param {string|string[]} [opts.filters.authors]
+ * @param {string|string[]} [opts.filters.tags]
+ * @param {string} [opts.sortBy]
+ * @param {number} [opts.offset=0]
+ * @param {number} [opts.limit]
+ * @param {boolean} [opts.reverse]
+ * @returns {Promise<Array<Bookmark>>}
+ */
+exports.query = async function (opts) {
+  // TODO tags filter
+
+  // validate & parse params
+  if (opts && 'sortBy' in opts) assert(typeof opts.sortBy === 'string', 'SortBy must be a string')
+  if (opts && 'offset' in opts) assert(typeof opts.offset === 'number', 'Offset must be a number')
+  if (opts && 'limit' in opts) assert(typeof opts.limit === 'number', 'Limit must be a number')
+  if (opts && 'reverse' in opts) assert(typeof opts.reverse === 'boolean', 'Reverse must be a boolean')
+  if (opts && opts.filters) {
+    if ('authors' in opts.filters) {
+      if (Array.isArray(opts.filters.authors)) {
+        assert(opts.filters.authors.every(v => typeof v === 'string'), 'Authors filter must be a string or array of strings')
+      } else {
+        assert(typeof opts.filters.authors === 'string', 'Authors filter must be a string or array of strings')
+        opts.filters.authors = [opts.filters.authors]
+      }
+      opts.filters.authors = await Promise.all(opts.filters.authors.map(datLibrary.getPrimaryUrl))
+    }
+    if ('tags' in opts.filters) {
+      if (Array.isArray(opts.filters.tags)) {
+        assert(opts.filters.tags.every(v => typeof v === 'string'), 'Tags filter must be a string or array of strings')
+      } else {
+        assert(typeof opts.filters.tags === 'string', 'Tags filter must be a string or array of strings')
+        opts.filters.tags = [opts.filters.tags]
+      }
+    }
+  }
+
+  // build query
+  var sql = knex('crawl_bookmarks')
+    .select('crawl_bookmarks.*')
+    .select('crawl_sources.url as crawlSourceUrl')
+    .select(knex.raw('group_concat(crawl_tags.tag, ",") as tags'))
+    .innerJoin('crawl_sources', 'crawl_sources.id', '=', 'crawl_bookmarks.crawlSourceId')
+    .leftJoin('crawl_bookmarks_tags', 'crawl_bookmarks_tags.crawlBookmarkId', '=', 'crawl_bookmarks.id')
+    .leftJoin('crawl_tags', 'crawl_bookmarks_tags.crawlTagId', '=', 'crawl_tags.id')
+    .groupBy('crawl_bookmarks.id')
+    .orderBy('crawl_bookmarks.createdAt', opts.reverse ?
'DESC' : 'ASC') + if (opts && opts.filters && opts.filters.authors) { + sql = sql.whereIn('crawl_sources.url', opts.filters.authors) + } + if (opts && opts.limit) sql = sql.limit(opts.limit) + if (opts && opts.offset) sql = sql.offset(opts.offset) + + // execute query + var rows = await db.all(sql) + var bookmarks = await Promise.all(rows.map(massageBookmarkRow)) + + // apply tags filter + if (opts && opts.filters && opts.filters.tags) { + const someFn = t => opts.filters.tags.includes(t) + bookmarks = bookmarks.filter(bookmark => bookmark.tags.some(someFn)) + } + + return bookmarks +} + +/** + * @description + * Get crawled bookmark. + * + * @param {string} url - The URL of the bookmark + * @returns {Promise} + */ +const getBookmark = exports.getBookmark = async function (url) { + // validate & parse params + var urlParsed + if (url) { + try { urlParsed = new URL(url) } + catch (e) { throw new Error('Invalid URL: ' + url) } + } + + // build query + var sql = knex('crawl_bookmarks') + .select('crawl_bookmarks.*') + .select('crawl_sources.url as crawlSourceUrl') + .select(knex.raw('group_concat(crawl_tags.tag, ",") as tags')) + .innerJoin('crawl_sources', function () { + this.on('crawl_sources.id', '=', 'crawl_bookmarks.crawlSourceId') + .andOn('crawl_sources.url', '=', knex.raw('?', `${urlParsed.protocol}//${urlParsed.hostname}`)) + }) + .leftJoin('crawl_bookmarks_tags', 'crawl_bookmarks_tags.crawlBookmarkId', '=', 'crawl_bookmarks.id') + .leftJoin('crawl_tags', 'crawl_tags.id', '=', 'crawl_bookmarks_tags.crawlTagId') + .where('crawl_bookmarks.pathname', urlParsed.pathname) + .groupBy('crawl_bookmarks.id') + + // execute query + return await massageBookmarkRow(await db.get(sql)) +} + +/** + * @description + * Create a new bookmark. + * + * @param {InternalDatArchive} archive - where to write the bookmark to. + * @param {Object} bookmark + * @param {string} bookmark.href + * @param {string} bookmark.title + * @param {string?} bookmark.description + * @param {string?|string[]?} bookmark.tags + * @returns {Promise} url + */ +exports.addBookmark = async function (archive, bookmark) { + if (bookmark && typeof bookmark.tags === 'string') bookmark.tags = bookmark.tags.split(' ') + + var bookmarkObject = { + type: JSON_TYPE, + href: normalizeTopicUrl(bookmark.href), + title: bookmark.title, + description: bookmark.description, + tags: bookmark.tags, + createdAt: (new Date()).toISOString() + } + + var valid = validateBookmark(bookmarkObject) + if (!valid) throw ajv.errorsText(validateBookmark.errors) + + var filename = generateTimeFilename() + var filepath = `/data/bookmarks/${filename}.json` + await ensureDirectory(archive, '/data') + await ensureDirectory(archive, '/data/bookmarks') + await archive.pda.writeFile(filepath, JSON.stringify(bookmarkObject, null, 2)) + await crawler.crawlSite(archive) + return archive.url + filepath +} + +/** + * @description + * Update the content of an existing bookmark. + * + * @param {InternalDatArchive} archive - where to write the bookmark to. + * @param {string} pathname - the pathname of the bookmark. 
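+ *   (illustrative value only: a path of the form '/data/bookmarks/<filename>.json', as written by addBookmark)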
+ * @param {Object} bookmark
+ * @param {string} bookmark.href
+ * @param {string} bookmark.title
+ * @param {string?} bookmark.description
+ * @param {string?|string[]?} bookmark.tags
+ * @returns {Promise}
+ */
+exports.editBookmark = async function (archive, pathname, bookmark) {
+  if (bookmark && typeof bookmark.tags === 'string') bookmark.tags = bookmark.tags.split(' ')
+  var existingBookmark = JSON.parse(await archive.pda.readFile(pathname))
+
+  var bookmarkObject = {
+    type: JSON_TYPE,
+    href: bookmark.href ? normalizeTopicUrl(bookmark.href) : existingBookmark.href,
+    title: ('title' in bookmark) ? bookmark.title : existingBookmark.title,
+    description: ('description' in bookmark) ? bookmark.description : existingBookmark.description,
+    tags: ('tags' in bookmark) ? bookmark.tags : existingBookmark.tags,
+    createdAt: existingBookmark.createdAt,
+    updatedAt: (new Date()).toISOString()
+  }
+
+  var valid = validateBookmark(bookmarkObject)
+  if (!valid) throw ajv.errorsText(validateBookmark.errors)
+
+  await archive.pda.writeFile(pathname, JSON.stringify(bookmarkObject, null, 2))
+  await crawler.crawlSite(archive)
+}
+
+/**
+ * @description
+ * Delete an existing bookmark
+ *
+ * @param {InternalDatArchive} archive - where to write the bookmark to.
+ * @param {string} pathname - the pathname of the bookmark.
+ * @returns {Promise}
+ */
+exports.deleteBookmark = async function (archive, pathname) {
+  assert(typeof pathname === 'string', 'Delete() must be provided a valid URL string')
+  await archive.pda.unlink(pathname)
+  await crawler.crawlSite(archive)
+}
+
+// internal methods
+// =
+
+/**
+ * @param {string} origin
+ * @param {string} pathname
+ * @returns {string}
+ */
+function joinPath (origin, pathname) {
+  if (origin.endsWith('/') && pathname.startsWith('/')) {
+    return origin + pathname.slice(1)
+  }
+  if (!origin.endsWith('/') && !pathname.startsWith('/')) {
+    return origin + '/' + pathname
+  }
+  return origin + pathname
+}
+
+/**
+ * @param {Object} row
+ * @returns {Promise}
+ */
+async function massageBookmarkRow (row) {
+  if (!row) return null
+  var author = await siteDescriptions.getBest({subject: row.crawlSourceUrl})
+  if (!author) {
+    author = {
+      url: row.crawlSourceUrl,
+      title: '',
+      description: '',
+      type: [],
+      thumbUrl: `${row.crawlSourceUrl}/thumb`,
+      descAuthor: {url: null}
+    }
+  }
+  return {
+    pathname: row.pathname,
+    author,
+    href: row.href,
+    title: row.title,
+    description: row.description,
+    tags: row.tags ?
row.tags.split(',').filter(Boolean) : [], + crawledAt: row.crawledAt, + createdAt: row.createdAt, + updatedAt: row.updatedAt + } +} diff --git a/crawler/comments.js b/crawler/comments.js new file mode 100644 index 00000000..ddae39ea --- /dev/null +++ b/crawler/comments.js @@ -0,0 +1,538 @@ +const assert = require('assert') +const {URL} = require('url') +const Events = require('events') +const Ajv = require('ajv') +const logger = require('../logger').child({category: 'crawler', dataset: 'comments'}) +const db = require('../dbs/profile-data-db') +const crawler = require('./index') +const datLibrary = require('../dat/library') +const lock = require('../lib/lock') +const knex = require('../lib/knex') +const siteDescriptions = require('./site-descriptions') +const {doCrawl, doCheckpoint, emitProgressEvent, getMatchingChangesInOrder, generateTimeFilename, ensureDirectory, normalizeTopicUrl} = require('./util') +const commentSchema = require('./json-schemas/comment') + +// constants +// = + +const TABLE_VERSION = 1 +const JSON_TYPE = 'unwalled.garden/comment' +const JSON_PATH_REGEX = /^\/data\/comments\/([^/]+)\.json$/i + +// typedefs +// = + +/** + * @typedef {import('../dat/library').InternalDatArchive} InternalDatArchive + * @typedef {import('./util').CrawlSourceRecord} CrawlSourceRecord + * @typedef { import("./site-descriptions").SiteDescription } SiteDescription + * + * @typedef {Object} Comment + * @prop {string} pathname + * @prop {string} topic + * @prop {string} replyTo + * @prop {string} body + * @prop {string} createdAt + * @prop {string} updatedAt + * @prop {SiteDescription} author + * @prop {string} visibility + * + * @typedef {Object} ThreadedComment + * @prop {string} pathname + * @prop {string} topic + * @prop {string} replyTo + * @prop {ThreadedComment[]} replies + * @prop {number} replyCount + * @prop {string} body + * @prop {string} createdAt + * @prop {string} updatedAt + * @prop {SiteDescription} author + * @prop {string} visibility + */ + +// globals +// = + +const events = new Events() +const ajv = (new Ajv()) +const validateComment = ajv.compile(commentSchema) + +// exported api +// = + +exports.on = events.on.bind(events) +exports.addListener = events.addListener.bind(events) +exports.removeListener = events.removeListener.bind(events) + +/** + * @description + * Crawl the given site for comments. + * + * @param {InternalDatArchive} archive - site to crawl. + * @param {CrawlSourceRecord} crawlSource - internal metadata about the crawl target. + * @returns {Promise} + */ +exports.crawlSite = async function (archive, crawlSource) { + return doCrawl(archive, crawlSource, 'crawl_comments', TABLE_VERSION, async ({changes, resetRequired}) => { + const supressEvents = resetRequired === true // dont emit when replaying old info + logger.silly('Crawling comments', {details: {url: archive.url, numChanges: changes.length, resetRequired}}) + if (resetRequired) { + // reset all data + logger.debug('Resetting dataset', {details: {url: archive.url}}) + await db.run(` + DELETE FROM crawl_comments WHERE crawlSourceId = ? 
+ `, [crawlSource.id]) + await doCheckpoint('crawl_comments', TABLE_VERSION, crawlSource, 0) + } + + // collect changed comments + var changedComments = getMatchingChangesInOrder(changes, JSON_PATH_REGEX) + if (changedComments.length) { + logger.verbose('Collected new/changed comment files', {details: {url: archive.url, changedComments: changedComments.map(p => p.name)}}) + } else { + logger.debug('No new comment-files found', {details: {url: archive.url}}) + } + emitProgressEvent(archive.url, 'crawl_comments', 0, changedComments.length) + + // read and apply each comment in order + var progress = 0 + for (let changedComment of changedComments) { + // TODO Currently the crawler will abort reading the feed if any comment fails to load + // this means that a single unreachable file can stop the forward progress of comment indexing + // to solve this, we need to find a way to tolerate unreachable comment-files without losing our ability to efficiently detect new comments + // -prf + if (changedComment.type === 'del') { + // delete + await db.run(` + DELETE FROM crawl_comments WHERE crawlSourceId = ? AND pathname = ? + `, [crawlSource.id, changedComment.name]) + events.emit('comment-removed', archive.url) + } else { + // read + let commentString + try { + commentString = await archive.pda.readFile(changedComment.name, 'utf8') + } catch (err) { + logger.warn('Failed to read comment file, aborting', {details: {url: archive.url, name: changedComment.name, err}}) + return // abort indexing + } + + // parse and validate + let comment + try { + comment = JSON.parse(commentString) + let valid = validateComment(comment) + if (!valid) throw ajv.errorsText(validateComment.errors) + } catch (err) { + logger.warn('Failed to parse comment file, skipping', {details: {url: archive.url, name: changedComment.name, err}}) + continue // skip + } + + // massage the comment + comment.topic = normalizeTopicUrl(comment.topic) + comment.repliesTo = comment.repliesTo ? normalizeTopicUrl(comment.repliesTo) : '' + comment.createdAt = Number(new Date(comment.createdAt)) + comment.updatedAt = Number(new Date(comment.updatedAt)) + if (isNaN(comment.updatedAt)) comment.updatedAt = 0 // optional + + // upsert + let existingComment = await get(joinPath(archive.url, changedComment.name)) + if (existingComment) { + await db.run(` + UPDATE crawl_comments + SET crawledAt = ?, topic = ?, replyTo = ?, body = ?, createdAt = ?, updatedAt = ? + WHERE crawlSourceId = ? AND pathname = ? + `, [Date.now(), comment.topic, comment.replyTo, comment.body, comment.createdAt, comment.updatedAt, crawlSource.id, changedComment.name]) + events.emit('comment-updated', archive.url) + } else { + await db.run(` + INSERT INTO crawl_comments (crawlSourceId, pathname, crawledAt, topic, replyTo, body, createdAt, updatedAt) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + `, [crawlSource.id, changedComment.name, Date.now(), comment.topic, comment.replyTo, comment.body, comment.createdAt, comment.updatedAt]) + events.emit('comment-added', archive.url) + } + } + + // checkpoint our progress + await doCheckpoint('crawl_comments', TABLE_VERSION, crawlSource, changedComment.version) + emitProgressEvent(archive.url, 'crawl_comments', ++progress, changedComments.length) + } + logger.silly(`Finished crawling comments`, {details: {url: archive.url}}) + }) +} + +/** + * @description + * List crawled comments. 
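+ * Illustrative call (example values only):
+ *   list({filters: {topics: 'dat://example.com/posts/1.json'}, limit: 20, reverse: true})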
+ * + * @param {Object} [opts] + * @param {Object} [opts.filters] + * @param {string|string[]} [opts.filters.authors] + * @param {string|string[]} [opts.filters.topics] + * @param {string} [opts.filters.visibility] + * @param {string} [opts.sortBy] + * @param {number} [opts.offset=0] + * @param {number} [opts.limit] + * @param {boolean} [opts.reverse] + * @returns {Promise>} + */ +exports.list = async function (opts) { + // TODO: handle visibility + // TODO: sortBy options + + // validate & parse params + if (opts && 'sortBy' in opts) assert(typeof opts.sortBy === 'string', 'SortBy must be a string') + if (opts && 'offset' in opts) assert(typeof opts.offset === 'number', 'Offset must be a number') + if (opts && 'limit' in opts) assert(typeof opts.limit === 'number', 'Limit must be a number') + if (opts && 'reverse' in opts) assert(typeof opts.reverse === 'boolean', 'Reverse must be a boolean') + if (opts && opts.filters) { + if ('authors' in opts.filters) { + if (Array.isArray(opts.filters.authors)) { + assert(opts.filters.authors.every(v => typeof v === 'string'), 'Authors filter must be a string or array of strings') + } else { + assert(typeof opts.filters.authors === 'string', 'Authors filter must be a string or array of strings') + opts.filters.authors = [opts.filters.authors] + } + opts.filters.authors = await Promise.all(opts.filters.authors.map(datLibrary.getPrimaryUrl)) + } + if ('topics' in opts.filters) { + if (Array.isArray(opts.filters.topics)) { + assert(opts.filters.topics.every(v => typeof v === 'string'), 'Topics filter must be a string or array of strings') + } else { + assert(typeof opts.filters.topics === 'string', 'Topics filter must be a string or array of strings') + opts.filters.topics = [opts.filters.topics] + } + opts.filters.topics = opts.filters.topics.map(normalizeTopicUrl) + } + if ('visibility' in opts.filters) { + assert(typeof opts.filters.visibility === 'string', 'Visibility filter must be a string') + } + } + + // build query + var sql = knex('crawl_comments') + .select('crawl_comments.*') + .select('crawl_sources.url AS crawlSourceUrl') + .innerJoin('crawl_sources', 'crawl_sources.id', '=', 'crawl_comments.crawlSourceId') + .orderBy('crawl_comments.createdAt', opts.reverse ? 'DESC' : 'ASC') + if (opts && opts.filters && opts.filters.authors) { + sql = sql.whereIn('crawl_sources.url', opts.filters.authors) + } + if (opts && opts.filters && opts.filters.topics) { + sql = sql.whereIn('crawl_comments.topic', opts.filters.topics) + } + if (opts && opts.limit) sql = sql.limit(opts.limit) + if (opts && opts.offset) sql = sql.offset(opts.offset) + + // execute query + var rows = await db.all(sql) + return Promise.all(rows.map(massageCommentRow)) +} + +/** + * @description + * List crawled comments. 
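+ * More precisely: assemble the crawled comments on a topic into a reply tree,
+ * e.g. thread('dat://example.com/posts/1.json', {depth: 2}) (illustrative values only).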
+ * @param {string} topic + * @param {Object} [opts] + * @param {Object} [opts.filters] + * @param {string|string[]} [opts.filters.authors] + * @param {string} [opts.filters.visibility] + * @param {string} [opts.parent] + * @param {number} [opts.depth] + * @param {string} [opts.sortBy] + * @param {boolean} [opts.reverse] + * @returns {Promise>} + */ +exports.thread = async function (topic, opts) { + // TODO: handle visibility + // TODO: sortBy options + + // validate & parse params + assert(typeof topic === 'string', 'Topic must be a URL string') + topic = normalizeTopicUrl(topic) + if (opts && 'parent' in opts) { + assert(typeof opts.parent === 'string', 'Parent must be a string') + opts.parent = normalizeTopicUrl(opts.parent) + } + if (opts && 'depth' in opts) assert(typeof opts.depth === 'number', 'Depth must be a number') + if (opts && 'sortBy' in opts) assert(typeof opts.sortBy === 'string', 'SortBy must be a string') + if (opts && 'reverse' in opts) assert(typeof opts.reverse === 'boolean', 'Reverse must be a boolean') + if (opts && opts.filters) { + if ('authors' in opts.filters) { + if (Array.isArray(opts.filters.authors)) { + assert(opts.filters.authors.every(v => typeof v === 'string'), 'Authors filter must be a string or array of strings') + } else { + assert(typeof opts.filters.authors === 'string', 'Authors filter must be a string or array of strings') + opts.filters.authors = [opts.filters.authors] + } + opts.filters.authors = await Promise.all(opts.filters.authors.map(datLibrary.getPrimaryUrl)) + } + if ('visibility' in opts.filters) { + assert(typeof opts.filters.visibility === 'string', 'Visibility filter must be a string') + } + } + + // build query + var sql = knex('crawl_comments') + .select('crawl_comments.*') + .select('crawl_sources.url AS crawlSourceUrl') + .where('crawl_comments.topic', topic) + .innerJoin('crawl_sources', 'crawl_sources.id', '=', 'crawl_comments.crawlSourceId') + .orderBy('crawl_comments.createdAt', opts.reverse ? 'DESC' : 'ASC') + if (opts && opts.filters && opts.filters.authors) { + sql = sql.whereIn('crawl_sources.url', opts.filters.authors) + } + + // execute query + var rows = await db.all(sql) + + // create a map of comments by their URL + var commentsByUrl = {} + rows.forEach(row => { commentsByUrl[joinPath(row.crawlSourceUrl, row.pathname)] = row }) + + // attach each comment to its parent, forming a tree + var rootComments = [] + rows.forEach(row => { + if (row.replyTo) { + let parent = commentsByUrl[row.replyTo] + if (!parent) { + // TODO insert a placeholder parent when not found + // something that means "this post was by somebody you dont follow" + // -prf + return + } + if (!parent.replies) { + parent.replies = [] + parent.replyCount = 0 + } + parent.replies.push(row) + parent.replyCount++ + } else { + rootComments.push(row) + } + }) + + // apply the parent filter + if (opts && opts.parent) { + rootComments = [] + rows.forEach(row => { + if (row.replyTo === opts.parent) { + rootComments.push(row) + } + }) + } + + // apply the depth limit + if (opts && opts.depth) { + let recursiveApplyDepth = (currentDepth, comment) => { + if (!comment.replies) return + if (currentDepth === opts.depth) { + comment.replies = null + } else { + comment.replies.forEach(reply => recursiveApplyDepth(currentDepth + 1, reply)) + } + } + rootComments.forEach(comment => recursiveApplyDepth(1, comment)) + } + + return Promise.all(rootComments.map(massageThreadedCommentRow)) +} + +/** + * @description + * Get crawled comment. 
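+ * The URL addresses the comment record itself, e.g. (illustrative only)
+ * 'dat://author.example/data/comments/<filename>.json'.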
+ * + * @param {string} url - The URL of the comment + * @returns {Promise} + */ +const get = exports.get = async function (url) { + // validate & parse params + var urlParsed + if (url) { + try { urlParsed = new URL(url) } + catch (e) { throw new Error('Invalid URL: ' + url) } + } + + // execute query + var sql = knex('crawl_comments') + .select('crawl_comments.*') + .select('crawl_sources.url AS crawlSourceUrl') + .innerJoin('crawl_sources', function () { + this.on('crawl_sources.id', '=', 'crawl_comments.crawlSourceId') + .andOn('crawl_sources.url', '=', knex.raw('?', `${urlParsed.protocol}//${urlParsed.hostname}`)) + }) + .where('crawl_comments.pathname', urlParsed.pathname) + return await massageCommentRow(await db.get(sql)) +} + +/** + * @description + * Create a new comment. + * + * @param {InternalDatArchive} archive - where to write the comment to. + * @param {string} topic + * @param {Object} comment + * @param {string} comment.replyTo + * @param {string} comment.body + * @param {string} comment.visibility + * @returns {Promise} url + */ +exports.add = async function (archive, topic, comment) { + // TODO visibility + + var commentObject = { + type: JSON_TYPE, + topic: normalizeTopicUrl(topic), + replyTo: comment.replyTo ? normalizeTopicUrl(comment.replyTo) : undefined, + body: comment.body, + createdAt: (new Date()).toISOString() + } + var valid = validateComment(commentObject) + if (!valid) throw ajv.errorsText(validateComment.errors) + + var filename = generateTimeFilename() + var filepath = `/data/comments/${filename}.json` + await ensureDirectory(archive, '/data') + await ensureDirectory(archive, '/data/comments') + await archive.pda.writeFile(filepath, JSON.stringify(commentObject, null, 2)) + await crawler.crawlSite(archive) + return archive.url + filepath +} + +/** + * @description + * Update the content of an existing comment. + * + * @param {InternalDatArchive} archive - where to write the comment to. + * @param {string} pathname - the pathname of the comment. + * @param {Object} comment + * @param {string} [comment.replyTo] + * @param {string} [comment.body] + * @param {string} [comment.visibility] + * @returns {Promise} + */ +exports.edit = async function (archive, pathname, comment) { + // TODO visibility + + var release = await lock('crawler:comments:' + archive.url) + try { + // fetch comment + var existingComment = await get(archive.url + pathname) + if (!existingComment) throw new Error('Comment not found') + + // update comment content + var commentObject = { + type: JSON_TYPE, + topic: normalizeTopicUrl(existingComment.topic), + replyTo: ('replyTo' in comment) ? normalizeTopicUrl(comment.replyTo) : existingComment.replyTo, + body: ('body' in comment) ? comment.body : existingComment.body, + createdAt: existingComment.createdAt, + updatedAt: (new Date()).toISOString() + } + + // validate + var valid = validateComment(commentObject) + if (!valid) throw ajv.errorsText(validateComment.errors) + + // write + await archive.pda.writeFile(pathname, JSON.stringify(commentObject, null, 2)) + await crawler.crawlSite(archive) + } finally { + release() + } +} + +/** + * @description + * Delete an existing comment + * + * @param {InternalDatArchive} archive - where to write the comment to. + * @param {string} pathname - the pathname of the comment. 
+ * @returns {Promise} + */ +exports.remove = async function (archive, pathname) { + assert(typeof pathname === 'string', 'Remove() must be provided a valid URL string') + await archive.pda.unlink(pathname) + await crawler.crawlSite(archive) +} + +// internal methods +// = + +/** + * @param {string} origin + * @param {string} pathname + * @returns {string} + */ +function joinPath (origin, pathname) { + if (origin.endsWith('/') && pathname.startsWith('/')) { + return origin + pathname.slice(1) + } + if (!origin.endsWith('/') && !pathname.startsWith('/')) { + return origin + '/' + pathname + } + return origin + pathname +} + +/** + * @param {Object} row + * @returns {Promise} + */ +async function massageCommentRow (row) { + if (!row) return null + var author = await siteDescriptions.getBest({subject: row.crawlSourceUrl}) + if (!author) { + author = { + url: row.crawlSourceUrl, + title: '', + description: '', + type: [], + thumbUrl: `${row.crawlSourceUrl}/thumb`, + descAuthor: {url: null} + } + } + return { + pathname: row.pathname, + author, + topic: row.topic, + replyTo: row.replyTo, + body: row.body, + createdAt: new Date(row.createdAt).toISOString(), + updatedAt: row.updatedAt ? new Date(row.updatedAt).toISOString() : null, + visibility: 'public' // TODO visibility + } +} + +/** + * @param {Object} row + * @returns {Promise} + */ +async function massageThreadedCommentRow (row) { + if (!row) return null + if (row.replies) { + row.replies = await Promise.all(row.replies.map(massageThreadedCommentRow)) + } + var author = await siteDescriptions.getBest({subject: row.crawlSourceUrl}) + if (!author) { + author = { + url: row.crawlSourceUrl, + title: '', + description: '', + type: [], + thumbUrl: `${row.crawlSourceUrl}/thumb`, + descAuthor: {url: null} + } + } + return { + pathname: row.pathname, + author, + topic: row.topic, + replyTo: row.replyTo, + body: row.body, + replies: row.replies || null, + replyCount: row.replyCount || 0, + createdAt: new Date(row.createdAt).toISOString(), + updatedAt: row.updatedAt ? 
new Date(row.updatedAt).toISOString() : null, + visibility: 'public' // TODO visibility + } +} diff --git a/crawler/discussions.js b/crawler/discussions.js new file mode 100644 index 00000000..f6b7e2e0 --- /dev/null +++ b/crawler/discussions.js @@ -0,0 +1,418 @@ +const assert = require('assert') +const {URL} = require('url') +const Events = require('events') +const Ajv = require('ajv') +const logger = require('../logger').child({category: 'crawler', dataset: 'discussions'}) +const db = require('../dbs/profile-data-db') +const crawler = require('./index') +const datLibrary = require('../dat/library') +const lock = require('../lib/lock') +const knex = require('../lib/knex') +const siteDescriptions = require('./site-descriptions') +const {doCrawl, doCheckpoint, emitProgressEvent, getMatchingChangesInOrder, generateTimeFilename, ensureDirectory} = require('./util') +const discussionSchema = require('./json-schemas/discussion') + +// constants +// = + +const TABLE_VERSION = 1 +const JSON_TYPE = 'unwalled.garden/discussion' +const JSON_PATH_REGEX = /^\/data\/discussions\/([^/]+)\.json$/i + +// typedefs +// = + +/** + * @typedef {import('../dat/library').InternalDatArchive} InternalDatArchive + * @typedef {import('./util').CrawlSourceRecord} CrawlSourceRecord + * @typedef { import("./site-descriptions").SiteDescription } SiteDescription + * + * @typedef {Object} Discussion + * @prop {string} pathname + * @prop {string} title + * @prop {string} body + * @prop {string} href + * @prop {string[]} tags + * @prop {string} createdAt + * @prop {string} updatedAt + * @prop {SiteDescription} author + * @prop {string} visibility + */ + +// globals +// = + +const events = new Events() +const ajv = (new Ajv()) +const validateDiscussion = ajv.compile(discussionSchema) + +// exported api +// = + +exports.on = events.on.bind(events) +exports.addListener = events.addListener.bind(events) +exports.removeListener = events.removeListener.bind(events) + +/** + * @description + * Crawl the given site for discussions. + * + * @param {InternalDatArchive} archive - site to crawl. + * @param {CrawlSourceRecord} crawlSource - internal metadata about the crawl target. + * @returns {Promise} + */ +exports.crawlSite = async function (archive, crawlSource) { + return doCrawl(archive, crawlSource, 'crawl_discussions', TABLE_VERSION, async ({changes, resetRequired}) => { + const supressEvents = resetRequired === true // dont emit when replaying old info + logger.silly('Crawling discussions', {details: {url: archive.url, numChanges: changes.length, resetRequired}}) + if (resetRequired) { + // reset all data + logger.debug('Resetting dataset', {details: {url: archive.url}}) + await db.run(` + DELETE FROM crawl_discussions WHERE crawlSourceId = ? 
+ `, [crawlSource.id]) + await doCheckpoint('crawl_discussions', TABLE_VERSION, crawlSource, 0) + } + + // collect changed discussions + var changedDiscussions = getMatchingChangesInOrder(changes, JSON_PATH_REGEX) + if (changedDiscussions.length) { + logger.verbose('Collected new/changed discussion files', {details: {url: archive.url, changedDiscussions: changedDiscussions.map(p => p.name)}}) + } else { + logger.debug('No new discussion-files found', {details: {url: archive.url}}) + } + emitProgressEvent(archive.url, 'crawl_discussions', 0, changedDiscussions.length) + + // read and apply each discussion in order + var progress = 0 + for (let changedDiscussion of changedDiscussions) { + // TODO Currently the crawler will abort reading the feed if any discussion fails to load + // this means that a single unreachable file can stop the forward progress of discussion indexing + // to solve this, we need to find a way to tolerate unreachable discussion-files without losing our ability to efficiently detect new discussions + // -prf + if (changedDiscussion.type === 'del') { + // delete + await db.run(` + DELETE FROM crawl_discussions WHERE crawlSourceId = ? AND pathname = ? + `, [crawlSource.id, changedDiscussion.name]) + events.emit('discussion-removed', archive.url) + } else { + // read + let discussionString + try { + discussionString = await archive.pda.readFile(changedDiscussion.name, 'utf8') + } catch (err) { + logger.warn('Failed to read discussion file, aborting', {details: {url: archive.url, name: changedDiscussion.name, err}}) + return // abort indexing + } + + // parse and validate + let discussion + try { + discussion = JSON.parse(discussionString) + let valid = validateDiscussion(discussion) + if (!valid) throw ajv.errorsText(validateDiscussion.errors) + } catch (err) { + logger.warn('Failed to parse discussion file, skipping', {details: {url: archive.url, name: changedDiscussion.name, err}}) + continue // skip + } + + // massage the discussion + discussion.createdAt = Number(new Date(discussion.createdAt)) + discussion.updatedAt = Number(new Date(discussion.updatedAt)) + if (!discussion.title) discussion.title = '' // optional + if (!discussion.href) discussion.href = '' // optional + if (!discussion.tags) discussion.tags = [] // optional + if (isNaN(discussion.updatedAt)) discussion.updatedAt = 0 // optional + + // upsert + let discussionId = 0 + let existingDiscussion = await db.get(knex('crawl_discussions') + .select('id') + .where({ + crawlSourceId: crawlSource.id, + pathname: changedDiscussion.name + }) + ) + if (existingDiscussion) { + let res = await db.run(knex('crawl_discussions') + .where({ + crawlSourceId: crawlSource.id, + pathname: changedDiscussion.name + }).update({ + crawledAt: Date.now(), + title: discussion.title, + body: discussion.body, + href: discussion.href, + createdAt: discussion.createdAt, + updatedAt: discussion.updatedAt, + }) + ) + discussionId = existingDiscussion.id + events.emit('discussion-updated', archive.url) + } else { + let res = await db.run(knex('crawl_discussions') + .insert({ + crawlSourceId: crawlSource.id, + pathname: changedDiscussion.name, + crawledAt: Date.now(), + title: discussion.title, + body: discussion.body, + href: discussion.href, + createdAt: discussion.createdAt, + updatedAt: discussion.updatedAt, + }) + ) + discussionId = +res.lastID + events.emit('discussion-added', archive.url) + } + await db.run(`DELETE FROM crawl_discussions_tags WHERE crawlDiscussionId = ?`, [discussionId]) + for (let tag of discussion.tags) { + await 
db.run(`INSERT OR IGNORE INTO crawl_tags (tag) VALUES (?)`, [tag]) + let tagRow = await db.get(`SELECT id FROM crawl_tags WHERE tag = ?`, [tag]) + await db.run(`INSERT INTO crawl_discussions_tags (crawlDiscussionId, crawlTagId) VALUES (?, ?)`, [discussionId, tagRow.id]) + } + } + + // checkpoint our progress + await doCheckpoint('crawl_discussions', TABLE_VERSION, crawlSource, changedDiscussion.version) + emitProgressEvent(archive.url, 'crawl_discussions', ++progress, changedDiscussions.length) + } + logger.silly(`Finished crawling discussions`, {details: {url: archive.url}}) + }) +} + +/** + * @description + * List crawled discussions. + * + * @param {Object} [opts] + * @param {Object} [opts.filters] + * @param {string|string[]} [opts.filters.authors] + * @param {string|string[]} [opts.filters.tags] + * @param {string} [opts.filters.visibility] + * @param {string} [opts.sortBy] + * @param {number} [opts.offset=0] + * @param {number} [opts.limit] + * @param {boolean} [opts.reverse] + * @returns {Promise>} + */ +exports.list = async function (opts) { + // TODO: handle visibility + // TODO: sortBy options + + // validate & parse params + if (opts && 'sortBy' in opts) assert(typeof opts.sortBy === 'number', 'SortBy must be a string') + if (opts && 'offset' in opts) assert(typeof opts.offset === 'number', 'Offset must be a number') + if (opts && 'limit' in opts) assert(typeof opts.limit === 'number', 'Limit must be a number') + if (opts && 'reverse' in opts) assert(typeof opts.reverse === 'boolean', 'Reverse must be a boolean') + if (opts && opts.filters) { + if ('authors' in opts.filters) { + if (Array.isArray(opts.filters.authors)) { + assert(opts.filters.authors.every(v => typeof v === 'string'), 'Authors filter must be a string or array of strings') + } else { + assert(typeof opts.filters.authors === 'string', 'Authors filter must be a string or array of strings') + opts.filters.authors = [opts.filters.authors] + } + opts.filters.authors = await Promise.all(opts.filters.authors.map(datLibrary.getPrimaryUrl)) + } + if ('tags' in opts.filters) { + if (Array.isArray(opts.filters.tags)) { + assert(opts.filters.tags.every(v => typeof v === 'string'), 'Tags filter must be a string or array of strings') + } else { + assert(typeof opts.filters.tags === 'string', 'Tags filter must be a string or array of strings') + opts.filters.tags = [opts.filters.tags] + } + } + } + + // build query + var sql = knex('crawl_discussions') + .select('crawl_discussions.*') + .select('crawl_sources.url as crawlSourceUrl') + .select(knex.raw('group_concat(crawl_tags.tag, ",") as tags')) + .innerJoin('crawl_sources', 'crawl_sources.id', '=', 'crawl_discussions.crawlSourceId') + .leftJoin('crawl_discussions_tags', 'crawl_discussions_tags.crawlDiscussionId', '=', 'crawl_discussions.id') + .leftJoin('crawl_tags', 'crawl_discussions_tags.crawlTagId', '=', 'crawl_tags.id') + .groupBy('crawl_discussions.id') + .orderBy('crawl_discussions.createdAt', opts.reverse ? 
'DESC' : 'ASC') + if (opts && opts.filters && opts.filters.authors) { + sql = sql.whereIn('crawl_sources.url', opts.filters.authors) + } + if (opts && opts.limit) sql = sql.limit(opts.limit) + if (opts && opts.offset) sql = sql.offset(opts.offset) + + // execute query + var rows = await db.all(sql) + var discussions = await Promise.all(rows.map(massageDiscussionRow)) + + // apply tags filter + if (opts && opts.filters && opts.filters.tags) { + const someFn = t => opts.filters.tags.includes(t) + discussions = discussions.filter(discussion => discussion.tags.some(someFn)) + } + + return discussions +} + +/** + * @description + * Get crawled discussion. + * + * @param {string} url - The URL of the discussion + * @returns {Promise} + */ +const get = exports.get = async function (url) { + // validate & parse params + var urlParsed + if (url) { + try { urlParsed = new URL(url) } + catch (e) { throw new Error('Invalid URL: ' + url) } + } + + // build query + var sql = knex('crawl_discussions') + .select('crawl_discussions.*') + .select('crawl_sources.url as crawlSourceUrl') + .select(knex.raw('group_concat(crawl_tags.tag, ",") as tags')) + .innerJoin('crawl_sources', function () { + this.on('crawl_sources.id', '=', 'crawl_discussions.crawlSourceId') + .andOn('crawl_sources.url', '=', knex.raw('?', `${urlParsed.protocol}//${urlParsed.hostname}`)) + }) + .leftJoin('crawl_discussions_tags', 'crawl_discussions_tags.crawlDiscussionId', '=', 'crawl_discussions.id') + .leftJoin('crawl_tags', 'crawl_tags.id', '=', 'crawl_discussions_tags.crawlTagId') + .where('crawl_discussions.pathname', urlParsed.pathname) + .groupBy('crawl_discussions.id') + + // execute query + return await massageDiscussionRow(await db.get(sql)) +} + +/** + * @description + * Create a new discussion. + * + * @param {InternalDatArchive} archive - where to write the discussion to. + * @param {Object} discussion + * @param {string} discussion.title + * @param {string} discussion.body + * @param {string} discussion.href + * @param {string[]} discussion.tags + * @param {string} discussion.visibility + * @returns {Promise} url + */ +exports.add = async function (archive, discussion) { + // TODO visibility + + var discussionObject = { + type: JSON_TYPE, + title: discussion.title, + body: discussion.body, + href: discussion.href, + tags: discussion.tags, + createdAt: (new Date()).toISOString() + } + var valid = validateDiscussion(discussionObject) + if (!valid) throw ajv.errorsText(validateDiscussion.errors) + + var filename = generateTimeFilename() + var filepath = `/data/discussions/${filename}.json` + await ensureDirectory(archive, '/data') + await ensureDirectory(archive, '/data/discussions') + await archive.pda.writeFile(filepath, JSON.stringify(discussionObject, null, 2)) + await crawler.crawlSite(archive) + return archive.url + filepath +} + +/** + * @description + * Update the content of an existing discussion. + * + * @param {InternalDatArchive} archive - where to write the discussion to. + * @param {string} pathname - the pathname of the discussion. 
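+ *   (illustrative value only: a path of the form '/data/discussions/<filename>.json', as written by add())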
+ * @param {Object} discussion + * @param {string} [discussion.title] + * @param {string} [discussion.body] + * @param {string} [discussion.href] + * @param {string[]} [discussion.tags] + * @param {string} [discussion.visibility] + * @returns {Promise} + */ +exports.edit = async function (archive, pathname, discussion) { + // TODO visibility + + var release = await lock('crawler:discussions:' + archive.url) + try { + // fetch discussion + var existingDiscussion = await get(archive.url + pathname) + if (!existingDiscussion) throw new Error('Discussion not found') + + // update discussion content + var discussionObject = { + type: JSON_TYPE, + title: ('title' in discussion) ? discussion.title : existingDiscussion.title, + body: ('body' in discussion) ? discussion.body : existingDiscussion.body, + href: ('href' in discussion) ? discussion.href : existingDiscussion.href, + tags: ('tags' in discussion) ? discussion.tags : existingDiscussion.tags, + createdAt: existingDiscussion.createdAt, + updatedAt: (new Date()).toISOString() + } + + // validate + var valid = validateDiscussion(discussionObject) + if (!valid) throw ajv.errorsText(validateDiscussion.errors) + + // write + await archive.pda.writeFile(pathname, JSON.stringify(discussionObject, null, 2)) + await crawler.crawlSite(archive) + } finally { + release() + } +} + +/** + * @description + * Delete an existing discussion + * + * @param {InternalDatArchive} archive - where to write the discussion to. + * @param {string} pathname - the pathname of the discussion. + * @returns {Promise} + */ +exports.remove = async function (archive, pathname) { + assert(typeof pathname === 'string', 'Remove() must be provided a valid URL string') + await archive.pda.unlink(pathname) + await crawler.crawlSite(archive) +} + +// internal methods +// = + +/** + * @param {Object} row + * @returns {Promise} + */ +async function massageDiscussionRow (row) { + if (!row) return null + var author = await siteDescriptions.getBest({subject: row.crawlSourceUrl}) + if (!author) { + author = { + url: row.crawlSourceUrl, + title: '', + description: '', + type: [], + thumbUrl: `${row.crawlSourceUrl}/thumb`, + descAuthor: {url: null} + } + } + return { + pathname: row.pathname, + author, + title: row.title, + body: row.body, + href: row.href, + tags: row.tags ? row.tags.split(',').filter(Boolean) : [], + createdAt: new Date(row.createdAt).toISOString(), + updatedAt: row.updatedAt ? 
new Date(row.updatedAt).toISOString() : null, + visibility: 'public' // TODO visibility + } +} diff --git a/crawler/follows.js b/crawler/follows.js new file mode 100644 index 00000000..1ba5bdd7 --- /dev/null +++ b/crawler/follows.js @@ -0,0 +1,390 @@ +const assert = require('assert') +const _difference = require('lodash.difference') +const Events = require('events') +const {URL} = require('url') +const Ajv = require('ajv') +const logger = require('../logger').child({category: 'crawler', dataset: 'follows'}) +const lock = require('../lib/lock') +const knex = require('../lib/knex') +const db = require('../dbs/profile-data-db') +const crawler = require('./index') +const datLibrary = require('../dat/library') +const siteDescriptions = require('./site-descriptions') +const {doCrawl, doCheckpoint, emitProgressEvent} = require('./util') +const followsSchema = require('./json-schemas/follows') + +// constants +// = + +const TABLE_VERSION = 1 +const JSON_TYPE = 'unwalled.garden/follows' +const JSON_PATH = '/data/follows.json' + +// typedefs +// = + +/** + * @typedef {import('../dat/library').InternalDatArchive} InternalDatArchive + * @typedef {import('./util').CrawlSourceRecord} CrawlSourceRecord + * @typedef {import('./site-descriptions').SiteDescription} SiteDescription + * + * @typedef {Object} Follow + * @prop {SiteDescription} author + * @prop {SiteDescription} topic + * @prop {string} visibility + */ + +// globals +// = + +const events = new Events() +const ajv = (new Ajv()) +const validateFollows = ajv.compile(followsSchema) + +// exported api +// = + +exports.on = events.on.bind(events) +exports.addListener = events.addListener.bind(events) +exports.removeListener = events.removeListener.bind(events) + +/** + * @description + * Crawl the given site for follows. + * + * @param {InternalDatArchive} archive - site to crawl. + * @param {CrawlSourceRecord} crawlSource - internal metadata about the crawl target. + * @returns {Promise} + */ +exports.crawlSite = async function (archive, crawlSource) { + return doCrawl(archive, crawlSource, 'crawl_follows', TABLE_VERSION, async ({changes, resetRequired}) => { + const supressEvents = resetRequired === true // dont emit when replaying old info + logger.silly('Crawling follows', {details: {url: archive.url, numChanges: changes.length, resetRequired}}) + if (resetRequired) { + // reset all data + logger.debug('Resetting dataset', {details: {url: archive.url}}) + await db.run(` + DELETE FROM crawl_follows WHERE crawlSourceId = ? + `, [crawlSource.id]) + await doCheckpoint('crawl_follows', TABLE_VERSION, crawlSource, 0) + } + + // did follows.json change? 
+    var change = changes.find(c => c.name === JSON_PATH)
+    if (!change) {
+      logger.debug('No change detected to follows record', {details: {url: archive.url}})
+      if (changes.length) {
+        await doCheckpoint('crawl_follows', TABLE_VERSION, crawlSource, changes[changes.length - 1].version)
+      }
+      return
+    }
+
+    logger.verbose('Change detected to follows record', {details: {url: archive.url}})
+    emitProgressEvent(archive.url, 'crawl_follows', 0, 1)
+
+    // read and validate
+    try {
+      var followsJson = await readFollowsFile(archive)
+    } catch (err) {
+      logger.warn('Failed to read follows file', {details: {url: archive.url, err}})
+      return
+    }
+
+    // diff against the current follows
+    var currentFollowRows = await db.all(
+      knex('crawl_follows')
+        .select('crawl_follows.*')
+        .innerJoin('crawl_sources', 'crawl_sources.id', '=', 'crawl_follows.crawlSourceId')
+        .where('crawl_sources.url', archive.url)
+    )
+    var currentFollows = currentFollowRows.map(({destUrl}) => destUrl)
+    var newFollows = followsJson.urls
+    var adds = _difference(newFollows, currentFollows)
+    var removes = _difference(currentFollows, newFollows)
+    logger.silly(`Adding ${adds.length} follows and removing ${removes.length} follows`, {details: {url: archive.url}})
+
+    // write updates
+    for (let add of adds) {
+      try {
+        await db.run(`
+          INSERT INTO crawl_follows (crawlSourceId, destUrl, crawledAt) VALUES (?, ?, ?)
+        `, [crawlSource.id, add, Date.now()])
+      } catch (e) {
+        if (e.code === 'SQLITE_CONSTRAINT') {
+          // uniqueness constraint probably failed, which means we got a duplicate somehow
+          // dont worry about it
+          logger.warn('Attempted to insert duplicate follow record', {details: {url: archive.url, add}})
+        } else {
+          throw e
+        }
+      }
+      if (!supressEvents) {
+        events.emit('follow-added', archive.url, add)
+      }
+    }
+    for (let remove of removes) {
+      await db.run(`
+        DELETE FROM crawl_follows WHERE crawlSourceId = ? AND destUrl = ?
+      `, [crawlSource.id, remove])
+      if (!supressEvents) {
+        events.emit('follow-removed', archive.url, remove)
+      }
+    }
+
+    // write checkpoint as success
+    logger.silly(`Finished crawling follows`, {details: {url: archive.url}})
+    await doCheckpoint('crawl_follows', TABLE_VERSION, crawlSource, changes[changes.length - 1].version)
+    emitProgressEvent(archive.url, 'crawl_follows', 1, 1)
+  })
+}
+
+/**
+ * @description
+ * List crawled follows.
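+ * The authors filter selects who is doing the following; the topics filter
+ * selects who is being followed, e.g. list({filters: {topics: 'dat://bob.example'}})
+ * (illustrative URL) lists the crawled followers of that site.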
+ * + * @param {Object} [opts] + * @param {Object} [opts.filters] + * @param {string|string[]} [opts.filters.authors] + * @param {string|string[]} [opts.filters.topics] + * @param {string} [opts.filters.visibility] + * @param {string} [opts.sortBy] + * @param {number} [opts.offset=0] + * @param {number} [opts.limit] + * @param {boolean} [opts.reverse] + * @returns {Promise>} + */ +const list = exports.list = async function (opts) { + // TODO: handle visibility + // TODO: sortBy options + + // validate & parse params + if (opts && 'sortBy' in opts) assert(typeof opts.sortBy === 'string', 'SortBy must be a string') + if (opts && 'offset' in opts) assert(typeof opts.offset === 'number', 'Offset must be a number') + if (opts && 'limit' in opts) assert(typeof opts.limit === 'number', 'Limit must be a number') + if (opts && 'reverse' in opts) assert(typeof opts.reverse === 'boolean', 'Reverse must be a boolean') + if (opts && opts.filters) { + if ('authors' in opts.filters) { + if (Array.isArray(opts.filters.authors)) { + assert(opts.filters.authors.every(v => typeof v === 'string'), 'Authors filter must be a string or array of strings') + } else { + assert(typeof opts.filters.authors === 'string', 'Authors filter must be a string or array of strings') + opts.filters.authors = [opts.filters.authors] + } + opts.filters.authors = await Promise.all(opts.filters.authors.map(datLibrary.getPrimaryUrl)) + } + if ('topics' in opts.filters) { + if (Array.isArray(opts.filters.topics)) { + assert(opts.filters.topics.every(v => typeof v === 'string'), 'Topics filter must be a string or array of strings') + } else { + assert(typeof opts.filters.topics === 'string', 'Topics filter must be a string or array of strings') + opts.filters.topics = [opts.filters.topics] + } + opts.filters.topics = await Promise.all(opts.filters.topics.map(datLibrary.getPrimaryUrl)) + } + if ('visibility' in opts.filters) { + assert(typeof opts.filters.visibility === 'string', 'Visibility filter must be a string') + } + } + + // execute query + let sql = knex('crawl_follows') + .select('crawl_follows.*') + .select('crawl_sources.url AS authorUrl') + .innerJoin('crawl_sources', 'crawl_sources.id', '=', 'crawl_follows.crawlSourceId') + .orderBy('crawl_follows.destUrl', opts.reverse ? 'DESC' : 'ASC') + if (opts.limit) sql = sql.limit(opts.limit) + if (opts.offset) sql = sql.offset(opts.offset) + if (opts && opts.filters && opts.filters.authors) { + sql = sql.whereIn('crawl_sources.url', opts.filters.authors) + } + if (opts && opts.filters && opts.filters.topics) { + sql = sql.whereIn('crawl_follows.destUrl', opts.filters.topics) + } + var rows = await db.all(sql) + + // massage results + return (await Promise.all(rows.map(async (row) => { + var author = toOrigin(row.authorUrl) + var topic = toOrigin(row.destUrl) + return { + author: await siteDescriptions.getBest({subject: author}), + topic: await siteDescriptions.getBest({subject: topic}), + visibility: 'public' + } + }))).filter(record => !!record.author && !!record.topic) +} + +/** + * @description + * Get an individual follow. + * + * @param {string} author - (URL) the site being queried. + * @param {string} topic - (URL) does a follow this site? 
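+ *   (illustrative call: get('dat://alice.example', 'dat://bob.example') resolves
+ *   to a record if alice follows bob, otherwise null)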
+ * @returns {Promise} + */ +const get = exports.get = async function (author, topic) { + author = await datLibrary.getPrimaryUrl(author) + topic = await datLibrary.getPrimaryUrl(topic) + var res = await db.get(knex('crawl_follows') + .select('crawl_follows.*') + .select('crawl_sources.url AS authorUrl') + .innerJoin('crawl_sources', 'crawl_sources.id', '=', 'crawl_follows.crawlSourceId') + .where('crawl_sources.url', author) + .where('crawl_follows.destUrl', topic)) + if (!res) return null + var record = { + author: await siteDescriptions.getBest({subject: toOrigin(res.authorUrl)}), + topic: await siteDescriptions.getBest({subject: toOrigin(res.destUrl)}), + visibility: 'public' + } + if (!record.author || !record.topic) return null + return record +} + +/** + * @description + * Add a follow to the given archive. + * + * @param {InternalDatArchive} archive + * @param {string} topic + * @param {Object} [opts] + * @param {string} [opts.visibility] + * @returns {Promise} + */ +exports.add = async function (archive, topic, opts) { + // TODO visibility + + // normalize topic + topic = await datLibrary.getPrimaryUrl(topic) + assert(typeof topic === 'string', 'Follow() must be given a valid URL') + + // write new follows.json + await updateFollowsFile(archive, followsJson => { + if (!followsJson.urls.find(v => v === topic)) { + followsJson.urls.push(topic) + } + }) + + // capture site description + /* dont await */siteDescriptions.capture(archive, topic) +} + +/** + * @description + * Edit a follow for the given archive. + * + * @param {InternalDatArchive} archive + * @param {string} topic + * @param {Object} [opts] + * @param {string} [opts.visibility] + * @returns {Promise} + */ +exports.edit = async function (archive, topic, opts) { + // TODO visibility + + // normalize topic + topic = await datLibrary.getPrimaryUrl(topic) + assert(typeof topic === 'string', 'Follow() must be given a valid URL') + + // write new follows.json + await updateFollowsFile(archive, followsJson => { + if (!followsJson.urls.find(v => v === topic)) { + followsJson.urls.push(topic) + } + }) +} + +/** + * @description + * Remove a follow from the given archive. 
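The file that `add()`, `edit()` and `remove()` maintain is small; based on the `unwalled.garden/follows` schema added later in this diff, it looks like the sketch below. Each mutation goes through `updateFollowsFile()`, which also re-triggers `crawlSite()` so the index stays in sync.

```js
// Example /data/follows.json conforming to the unwalled.garden/follows schema.
// add() pushes a (primary-URL-normalized) topic into `urls` if missing;
// remove() splices it out; both rewrite the file with JSON.stringify(..., null, 2).
const followsJson = {
  type: 'unwalled.garden/follows',
  urls: [
    'dat://beakerbrowser.com',
    'dat://alice.com' // hypothetical follow
  ]
}
```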
+ * + * @param {InternalDatArchive} archive + * @param {string} topic + * @returns {Promise} + */ +exports.remove = async function (archive, topic) { + // TODO private follows + + // normalize topic + topic = await datLibrary.getPrimaryUrl(topic) + assert(typeof topic === 'string', 'Unfollow() must be given a valid URL') + + // write new follows.json + await updateFollowsFile(archive, followsJson => { + var i = followsJson.urls.findIndex(v => v === topic) + if (i !== -1) { + followsJson.urls.splice(i, 1) + } + }) +} + +// internal methods +// = + +/** + * @param {string} url + * @returns {string} + */ +function toOrigin (url) { + try { + var urlParsed = new URL(url) + return urlParsed.protocol + '//' + urlParsed.hostname + } catch (e) { + return null + } +} + +/** + * @param {InternalDatArchive} archive + * @returns {Promise} + */ +async function readFollowsFile (archive) { + try { + var followsJson = await archive.pda.readFile(JSON_PATH, 'utf8') + } catch (e) { + if (e.notFound) return {type: JSON_TYPE, urls: []} // empty default when not found + throw e + } + followsJson = JSON.parse(followsJson) + var valid = validateFollows(followsJson) + if (!valid) throw ajv.errorsText(validateFollows.errors) + return followsJson +} + +/** + * @param {InternalDatArchive} archive + * @param {function(Object): void} updateFn + * @returns {Promise} + */ +async function updateFollowsFile (archive, updateFn) { + var release = await lock('crawler:follows:' + archive.url) + try { + // read the follows file + try { + var followsJson = await readFollowsFile(archive) + } catch (err) { + if (err.notFound) { + // create new + followsJson = { + type: JSON_TYPE, + urls: [] + } + } else { + logger.warn('Failed to read follows file', {details: {url: archive.url, err}}) + throw err + } + } + + // apply update + updateFn(followsJson) + + // write the follows file + await archive.pda.mkdir('/data').catch(err => undefined) + await archive.pda.writeFile(JSON_PATH, JSON.stringify(followsJson, null, 2), 'utf8') + + // trigger crawl now + await crawler.crawlSite(archive) + } finally { + release() + } +} diff --git a/crawler/index.js b/crawler/index.js new file mode 100644 index 00000000..dc697119 --- /dev/null +++ b/crawler/index.js @@ -0,0 +1,194 @@ +const emitStream = require('emit-stream') +const _throttle = require('lodash.throttle') +const logger = require('../logger').category('crawler') +const lock = require('../lib/lock') +const knex = require('../lib/knex') +const db = require('../dbs/profile-data-db') +const archivesDb = require('../dbs/archives') +const dat = require('../dat') +const users = require('../users') + +const {crawlerEvents, toHostname} = require('./util') +const bookmarks = require('./bookmarks') +const comments = require('./comments') +const discussions = require('./discussions') +const follows = require('./follows') +const media = require('./media') +const posts = require('./posts') +const reactions = require('./reactions') +const siteDescriptions = require('./site-descriptions') +const votes = require('./votes') + +// globals +// = + +var watches = {} + +// exported api +// = + +exports.bookmarks = bookmarks +exports.comments = comments +exports.discussions = discussions +exports.follows = follows +exports.media = media +exports.posts = posts +exports.reactions = reactions +exports.siteDescriptions = siteDescriptions +exports.votes = votes +const createEventsStream = exports.createEventsStream = () => emitStream(crawlerEvents) + +exports.setup = async function () { + logger.info('Initialized 
crawler') +} + +exports.watchSite = async function (archive) { + if (typeof archive === 'string') { + archive = await dat.library.getOrLoadArchive(archive) + } + logger.silly('Watching site', {url: archive.url}) + + if (!(archive.url in watches)) { + crawlerEvents.emit('watch', {sourceUrl: archive.url}) + const queueCrawl = _throttle(() => crawlSite(archive), 5e3) + + // watch for file changes + watches[archive.url] = archive.pda.watch() + watches[archive.url].on('data', ([event, args]) => { + // BUG watch is really inconsistent -prf + logger.debug('MIRACLE ALERT! The crawler watch stream emitted a change event', {url: archive.url, event, args}) + if (event === 'invalidated') { + queueCrawl() + } + }) + + // run the first crawl + crawlSite(archive) + } +} + +exports.unwatchSite = async function (url) { + // stop watching for file changes + url = await dat.library.getPrimaryUrl(url) + if (url in watches) { + logger.silly('Unwatching site', {url}) + crawlerEvents.emit('unwatch', {sourceUrl: url}) + watches[url].close() + watches[url] = null + } +} + +const crawlSite = +exports.crawlSite = async function (archive) { + if (typeof archive === 'string') { + archive = await dat.library.getOrLoadArchive(archive) + } + logger.silly('Crawling site', {details: {url: archive.url}}) + crawlerEvents.emit('crawl-start', {sourceUrl: archive.url}) + var release = await lock('crawl:' + archive.url) + try { + var url = archive.url + + // fetch current dns record + var datDnsRecord = null + if (archive.domain) { + datDnsRecord = await db.get(knex('dat_dns').where({name: archive.domain, isCurrent: 1})) + } + + // get/create crawl source + var crawlSource = await db.get(`SELECT id, url, datDnsId FROM crawl_sources WHERE url = ?`, [url]) + if (!crawlSource) { + let res = await db.run(knex('crawl_sources').insert({ + url, + datDnsId: datDnsRecord ? datDnsRecord.id : undefined + })) + crawlSource = {id: res.lastID, url, datDnsId: datDnsRecord ? 
datDnsRecord.id : undefined} + } + crawlSource.globalResetRequired = false + + // check for dns changes + var didDnsChange = datDnsRecord && crawlSource.datDnsId !== datDnsRecord.id + if (didDnsChange) { + crawlSource.globalResetRequired = true + logger.verbose('Site DNS change detected, recrawling site', {details: {url: archive.url}}) + crawlerEvents.emit('crawl-dns-change', {sourceUrl: archive.url}) + } + + // crawl individual sources + await Promise.all([ + bookmarks.crawlSite(archive, crawlSource), + comments.crawlSite(archive, crawlSource), + discussions.crawlSite(archive, crawlSource), + follows.crawlSite(archive, crawlSource), + media.crawlSite(archive, crawlSource), + posts.crawlSite(archive, crawlSource), + reactions.crawlSite(archive, crawlSource), + siteDescriptions.crawlSite(archive, crawlSource), + votes.crawlSite(archive, crawlSource) + ]) + + // update dns tracking + if (didDnsChange) { + await db.run( + knex('crawl_sources') + .update({datDnsId: datDnsRecord.id}) + .where({id: crawlSource.id}) + ) + } + } catch (err) { + logger.error('Failed to crawl site', {details: {url: archive.url, err: err.toString()}}) + crawlerEvents.emit('crawl-error', {sourceUrl: archive.url, err: err.toString()}) + } finally { + crawlerEvents.emit('crawl-finish', {sourceUrl: archive.url}) + release() + } +} + +const getCrawlStates = +exports.getCrawlStates = async function () { + var rows = await db.all(` + SELECT + crawl_sources.url AS url, + GROUP_CONCAT(crawl_sources_meta.crawlSourceVersion) AS versions, + GROUP_CONCAT(crawl_sources_meta.crawlDataset) AS datasets, + MAX(crawl_sources_meta.updatedAt) AS updatedAt + FROM crawl_sources + INNER JOIN crawl_sources_meta ON crawl_sources_meta.crawlSourceId = crawl_sources.id + GROUP BY crawl_sources.id + `) + return Promise.all(rows.map(async ({url, versions, datasets, updatedAt}) => { + var datasetVersions = {} + versions = versions.split(',') + datasets = datasets.split(',') + for (let i = 0; i < datasets.length; i++) { + datasetVersions[datasets[i]] = Number(versions[i]) + } + try { + var meta = await archivesDb.getMeta(toHostname(url)) + return {url, title: meta.title, datasetVersions, updatedAt} + } catch (e) { + console.error('Error loading archive meta', url, e) + return {url, title: '', datasetVersions: {}, updatedAt: null} + } + })) +} + +const resetSite = +exports.resetSite = async function (url) { + url = await dat.library.getPrimaryUrl(url) + var release = await lock('crawl:' + url) + try { + logger.debug('Resetting site', {details: {url}}) + await db.run(`DELETE FROM crawl_sources WHERE url = ?`, [url]) + } finally { + release() + } +} + +exports.WEBAPI = { + listSuggestions: require('./search').listSuggestions, + createEventsStream, + getCrawlStates, + crawlSite, + resetSite +} \ No newline at end of file diff --git a/crawler/json-schemas/bookmark.js b/crawler/json-schemas/bookmark.js new file mode 100644 index 00000000..80ae5660 --- /dev/null +++ b/crawler/json-schemas/bookmark.js @@ -0,0 +1,47 @@ +module.exports = { + '$schema': 'http://json-schema.org/draft-07/schema#', + '$id': 'dat://unwalled.garden/bookmark.json', + 'type': 'object', + 'title': 'Bookmark', + 'description': 'A saved/shared link to some URL.', + 'required': ['type', 'href', 'title', 'createdAt'], + 'properties': { + 'type': { + 'type': 'string', + 'description': "The object's type", + 'const': 'unwalled.garden/bookmark' + }, + 'href': { + 'type': 'string', + 'format': 'uri', + 'maxLength': 10000 + }, + 'title': { + 'type': 'string', + 'maxLength': 280 + }, + 
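Stepping back to `crawler/index.js` above: a rough usage sketch of the exported surface (`setup`, `watchSite`, `createEventsStream`), assuming emit-stream's usual `[event, payload]` tuple framing for the event stream:

```js
// Rough usage sketch; the event names match the crawlerEvents.emit() calls above.
const crawler = require('./crawler')

await crawler.setup()
await crawler.watchSite('dat://alice.com') // strings are resolved via dat.library.getOrLoadArchive()

crawler.createEventsStream().on('data', ([event, payload]) => {
  if (event === 'crawl-error') console.error('crawl failed:', payload.sourceUrl, payload.err)
  if (event === 'crawl-finish') console.log('crawled:', payload.sourceUrl)
})
```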
'description': { + 'type': 'string', + 'maxLength': 560 + }, + 'tags': { + 'type': 'array', + 'items': { + 'type': 'string', + 'maxLength': 100, + 'pattern': '^[A-Za-z][A-Za-z0-9-_?]*$' + } + }, + 'ext': { + 'type': 'object' + }, + 'createdAt': { + 'type': 'string', + 'format': 'date-time' + }, + 'updatedAt': { + 'type': 'string', + 'format': 'date-time' + } + } +} \ No newline at end of file diff --git a/crawler/json-schemas/comment.js b/crawler/json-schemas/comment.js new file mode 100644 index 00000000..595a15ad --- /dev/null +++ b/crawler/json-schemas/comment.js @@ -0,0 +1,44 @@ +module.exports = { + '$schema': 'http://json-schema.org/draft-07/schema#', + '$id': 'dat://unwalled.garden/comment.json', + 'type': 'object', + 'title': 'Comment', + 'description': 'A text post about some resource.', + 'required': [ + 'type', + 'topic', + 'body', + 'createdAt' + ], + 'properties': { + 'type': { + 'type': 'string', + 'description': "The object's type", + 'const': 'unwalled.garden/comment' + }, + 'topic': { + 'type': 'string', + 'description': 'What this comment is about', + 'format': 'uri' + }, + 'replyTo': { + 'type': 'string', + 'description': 'What this comment is replying to', + 'format': 'uri' + }, + 'body': { + 'type': 'string', + 'description': "The post's text content" + }, + 'createdAt': { + 'type': 'string', + 'format': 'date-time', + 'description': "The time of this post's creation" + }, + 'updatedAt': { + 'type': 'string', + 'format': 'date-time', + 'description': "The time of this post's last edit" + } + } +} \ No newline at end of file diff --git a/crawler/json-schemas/discussion.js b/crawler/json-schemas/discussion.js new file mode 100644 index 00000000..ba561eea --- /dev/null +++ b/crawler/json-schemas/discussion.js @@ -0,0 +1,43 @@ +module.exports = { + '$schema': 'http://json-schema.org/draft-07/schema#', + '$id': 'dat://unwalled.garden/discussion.json', + 'type': 'object', + 'title': 'Discussion', + 'description': 'A forum discussion.', + 'required': ['type', 'title', 'createdAt'], + 'properties': { + 'type': { + 'type': 'string', + 'description': "The object's type", + 'const': 'unwalled.garden/discussion' + }, + 'title': { + 'type': 'string', + 'maxLength': 280 + }, + 'body': { + 'type': 'string', + 'maxLength': 1000000 + }, + 'href': { + 'type': 'string', + 'format': 'uri' + }, + 'tags': { + 'type': 'array', + 'items': { + 'type': 'string', + 'maxLength': 100, + 'pattern': '^[A-Za-z][A-Za-z0-9-_?]*$' + } + }, + 'createdAt': { + 'type': 'string', + 'format': 'date-time' + }, + 'updatedAt': { + 'type': 'string', + 'format': 'date-time' + } + } +} \ No newline at end of file diff --git a/crawler/json-schemas/follows.js b/crawler/json-schemas/follows.js new file mode 100644 index 00000000..28e1910d --- /dev/null +++ b/crawler/json-schemas/follows.js @@ -0,0 +1,30 @@ +module.exports = { + '$schema': 'http://json-schema.org/draft-07/schema#', + '$id': 'dat://unwalled.garden/follows.json', + 'type': 'object', + 'title': 'Follows', + 'description': ' A list of data subscriptions.', + 'required': [ + 'type', + 'urls' + ], + 'properties': { + 'type': { + 'type': 'string', + 'description': "The object's type", + 'const': 'unwalled.garden/follows' + }, + 'urls': { + 'type': 'array', + 'description': 'The followed URLs', + 'items': { + 'type': 'string', + 'format': 'uri', + 'examples': [ + 'dat://beakerbrowser.com' + ] + } + } + }, + 'additionalProperties': false +} \ No newline at end of file diff --git a/crawler/json-schemas/media.js b/crawler/json-schemas/media.js new file mode 
100644 index 00000000..c904f02c --- /dev/null +++ b/crawler/json-schemas/media.js @@ -0,0 +1,52 @@ +module.exports = { + '$schema': 'http://json-schema.org/draft-07/schema#', + '$id': 'dat://unwalled.garden/media.json', + 'type': 'object', + 'title': 'media', + 'description': 'A published item of content.', + 'required': [ + 'type', + 'subtype', + 'href', + 'title', + 'createdAt' + ], + 'properties': { + 'type': { + 'type': 'string', + 'const': 'unwalled.garden/media' + }, + 'subtype': { + 'type': 'string' + }, + 'href': { + 'type': 'string', + 'format': 'uri' + }, + 'title': { + 'type': 'string' + }, + 'description': { + 'type': 'string' + }, + 'tags': { + 'type': 'array', + 'items': { + 'type': 'string', + 'maxLength': 100, + 'pattern': '^[A-Za-z][A-Za-z0-9-_?]*$' + } + }, + 'createdAt': { + 'type': 'string', + 'format': 'date-time' + }, + 'updatedAt': { + 'type': 'string', + 'format': 'date-time' + }, + 'ext': { + 'type': 'object' + } + } +} \ No newline at end of file diff --git a/crawler/json-schemas/post.js b/crawler/json-schemas/post.js new file mode 100644 index 00000000..2d30cc10 --- /dev/null +++ b/crawler/json-schemas/post.js @@ -0,0 +1,30 @@ +module.exports = { + '$schema': 'http://json-schema.org/draft-07/schema#', + '$id': 'dat://unwalled.garden/post.json', + 'type': 'object', + 'title': 'Post', + 'description': 'A broadcasted piece of content.', + 'required': ['type', 'body', 'createdAt'], + 'properties': { + 'type': { + 'type': 'string', + 'description': "The object's type", + 'const': 'unwalled.garden/post' + }, + 'body': { + 'type': 'string', + 'description': "The post's text body", + 'maxLength': 1000000 + }, + 'createdAt': { + 'type': 'string', + 'format': 'date-time', + 'description': "The time of this post's creation" + }, + 'updatedAt': { + 'type': 'string', + 'format': 'date-time', + 'description': "The time of this post's last edit" + } + } +} \ No newline at end of file diff --git a/crawler/json-schemas/reaction.js b/crawler/json-schemas/reaction.js new file mode 100644 index 00000000..b27b7cf6 --- /dev/null +++ b/crawler/json-schemas/reaction.js @@ -0,0 +1,3873 @@ +module.exports = { + '$schema': 'http://json-schema.org/draft-07/schema#', + '$id': 'dat://unwalled.garden/reaction.json', + 'type': 'object', + 'title': 'Reaction', + 'description': 'An emoji annotation on some resource.', + 'required': [ + 'type', + 'topic', + 'emojis' + ], + 'properties': { + 'type': { + 'type': 'string', + 'description': "The object's type", + 'const': 'unwalled.garden/reaction' + }, + 'topic': { + 'type': 'string', + 'description': 'What this reaction is about', + 'format': 'uri', + 'examples': [ + 'dat://beakerbrowser.com' + ] + }, + 'emojis': { + 'type': 'array', + 'description': 'The reaction emojis. 
Must contain supported emojis.', + 'items': { + 'type': 'string', + 'enum': [ + '😀', + '😃', + '😄', + '😁', + '😆', + '😅', + '🤣', + '😂', + '🙂', + '🙃', + '😉', + '😊', + '😇', + '🥰', + '😍', + '🤩', + '😘', + '😗', + '☺️', + '☺', + '😚', + '😙', + '😋', + '😛', + '😜', + '🤪', + '😝', + '🤑', + '🤗', + '🤭', + '🤫', + '🤔', + '🤐', + '🤨', + '😐', + '😑', + '😶', + '😏', + '😒', + '🙄', + '😬', + '🤥', + '😌', + '😔', + '😪', + '🤤', + '😴', + '😷', + '🤒', + '🤕', + '🤢', + '🤮', + '🤧', + '🥵', + '🥶', + '🥴', + '😵', + '🤯', + '🤠', + '🥳', + '😎', + '🤓', + '🧐', + '😕', + '😟', + '🙁', + '☹️', + '☹', + '😮', + '😯', + '😲', + '😳', + '🥺', + '😦', + '😧', + '😨', + '😰', + '😥', + '😢', + '😭', + '😱', + '😖', + '😣', + '😞', + '😓', + '😩', + '😫', + '🥱', + '😤', + '😡', + '😠', + '🤬', + '😈', + '👿', + '💀', + '☠️', + '☠', + '💩', + '🤡', + '👹', + '👺', + '👻', + '👽', + '👾', + '🤖', + '😺', + '😸', + '😹', + '😻', + '😼', + '😽', + '🙀', + '😿', + '😾', + '🙈', + '🙉', + '🙊', + '💋', + '💌', + '💘', + '💝', + '💖', + '💗', + '💓', + '💞', + '💕', + '💟', + '❣️', + '❣', + '💔', + '❤️', + '❤', + '🧡', + '💛', + '💚', + '💙', + '💜', + '🤎', + '🖤', + '🤍', + '💯', + '💢', + '💥', + '💫', + '💦', + '💨', + '🕳️', + '🕳', + '💣', + '💬', + '👁️‍🗨️', + '👁‍🗨️', + '👁️‍🗨', + '👁‍🗨', + '🗨️', + '🗨', + '🗯️', + '🗯', + '💭', + '💤', + '👋', + '👋🏻', + '👋🏼', + '👋🏽', + '👋🏾', + '👋🏿', + '🤚', + '🤚🏻', + '🤚🏼', + '🤚🏽', + '🤚🏾', + '🤚🏿', + '🖐️', + '🖐', + '🖐🏻', + '🖐🏼', + '🖐🏽', + '🖐🏾', + '🖐🏿', + '✋', + '✋🏻', + '✋🏼', + '✋🏽', + '✋🏾', + '✋🏿', + '🖖', + '🖖🏻', + '🖖🏼', + '🖖🏽', + '🖖🏾', + '🖖🏿', + '👌', + '👌🏻', + '👌🏼', + '👌🏽', + '👌🏾', + '👌🏿', + '🤏', + '🤏🏻', + '🤏🏼', + '🤏🏽', + '🤏🏾', + '🤏🏿', + '✌️', + '✌', + '✌🏻', + '✌🏼', + '✌🏽', + '✌🏾', + '✌🏿', + '🤞', + '🤞🏻', + '🤞🏼', + '🤞🏽', + '🤞🏾', + '🤞🏿', + '🤟', + '🤟🏻', + '🤟🏼', + '🤟🏽', + '🤟🏾', + '🤟🏿', + '🤘', + '🤘🏻', + '🤘🏼', + '🤘🏽', + '🤘🏾', + '🤘🏿', + '🤙', + '🤙🏻', + '🤙🏼', + '🤙🏽', + '🤙🏾', + '🤙🏿', + '👈', + '👈🏻', + '👈🏼', + '👈🏽', + '👈🏾', + '👈🏿', + '👉', + '👉🏻', + '👉🏼', + '👉🏽', + '👉🏾', + '👉🏿', + '👆', + '👆🏻', + '👆🏼', + '👆🏽', + '👆🏾', + '👆🏿', + '🖕', + '🖕🏻', + '🖕🏼', + '🖕🏽', + '🖕🏾', + '🖕🏿', + '👇', + '👇🏻', + '👇🏼', + '👇🏽', + '👇🏾', + '👇🏿', + '☝️', + '☝', + '☝🏻', + '☝🏼', + '☝🏽', + '☝🏾', + '☝🏿', + '👍', + '👍🏻', + '👍🏼', + '👍🏽', + '👍🏾', + '👍🏿', + '👎', + '👎🏻', + '👎🏼', + '👎🏽', + '👎🏾', + '👎🏿', + '✊', + '✊🏻', + '✊🏼', + '✊🏽', + '✊🏾', + '✊🏿', + '👊', + '👊🏻', + '👊🏼', + '👊🏽', + '👊🏾', + '👊🏿', + '🤛', + '🤛🏻', + '🤛🏼', + '🤛🏽', + '🤛🏾', + '🤛🏿', + '🤜', + '🤜🏻', + '🤜🏼', + '🤜🏽', + '🤜🏾', + '🤜🏿', + '👏', + '👏🏻', + '👏🏼', + '👏🏽', + '👏🏾', + '👏🏿', + '🙌', + '🙌🏻', + '🙌🏼', + '🙌🏽', + '🙌🏾', + '🙌🏿', + '👐', + '👐🏻', + '👐🏼', + '👐🏽', + '👐🏾', + '👐🏿', + '🤲', + '🤲🏻', + '🤲🏼', + '🤲🏽', + '🤲🏾', + '🤲🏿', + '🤝', + '🙏', + '🙏🏻', + '🙏🏼', + '🙏🏽', + '🙏🏾', + '🙏🏿', + '✍️', + '✍', + '✍🏻', + '✍🏼', + '✍🏽', + '✍🏾', + '✍🏿', + '💅', + '💅🏻', + '💅🏼', + '💅🏽', + '💅🏾', + '💅🏿', + '🤳', + '🤳🏻', + '🤳🏼', + '🤳🏽', + '🤳🏾', + '🤳🏿', + '💪', + '💪🏻', + '💪🏼', + '💪🏽', + '💪🏾', + '💪🏿', + '🦾', + '🦿', + '🦵', + '🦵🏻', + '🦵🏼', + '🦵🏽', + '🦵🏾', + '🦵🏿', + '🦶', + '🦶🏻', + '🦶🏼', + '🦶🏽', + '🦶🏾', + '🦶🏿', + '👂', + '👂🏻', + '👂🏼', + '👂🏽', + '👂🏾', + '👂🏿', + '🦻', + '🦻🏻', + '🦻🏼', + '🦻🏽', + '🦻🏾', + '🦻🏿', + '👃', + '👃🏻', + '👃🏼', + '👃🏽', + '👃🏾', + '👃🏿', + '🧠', + '🦷', + '🦴', + '👀', + '👁️', + '👁', + '👅', + '👄', + '👶', + '👶🏻', + '👶🏼', + '👶🏽', + '👶🏾', + '👶🏿', + '🧒', + '🧒🏻', + '🧒🏼', + '🧒🏽', + '🧒🏾', + '🧒🏿', + '👦', + '👦🏻', + '👦🏼', + '👦🏽', + '👦🏾', + '👦🏿', + '👧', + '👧🏻', + '👧🏼', + '👧🏽', + '👧🏾', + '👧🏿', + '🧑', + '🧑🏻', + '🧑🏼', + '🧑🏽', + '🧑🏾', + '🧑🏿', + '👱', + '👱🏻', + '👱🏼', + '👱🏽', + '👱🏾', + '👱🏿', + '👨', + '👨🏻', + '👨🏼', + '👨🏽', + '👨🏾', + '👨🏿', + '🧔', + '🧔🏻', + '🧔🏼', + '🧔🏽', + '🧔🏾', + '🧔🏿', + '👱‍♂️', + '👱‍♂', + '👱🏻‍♂️', + '👱🏻‍♂', + '👱🏼‍♂️', + '👱🏼‍♂', + 
'👱🏽‍♂️', + '👱🏽‍♂', + '👱🏾‍♂️', + '👱🏾‍♂', + '👱🏿‍♂️', + '👱🏿‍♂', + '👨‍🦰', + '👨🏻‍🦰', + '👨🏼‍🦰', + '👨🏽‍🦰', + '👨🏾‍🦰', + '👨🏿‍🦰', + '👨‍🦱', + '👨🏻‍🦱', + '👨🏼‍🦱', + '👨🏽‍🦱', + '👨🏾‍🦱', + '👨🏿‍🦱', + '👨‍🦳', + '👨🏻‍🦳', + '👨🏼‍🦳', + '👨🏽‍🦳', + '👨🏾‍🦳', + '👨🏿‍🦳', + '👨‍🦲', + '👨🏻‍🦲', + '👨🏼‍🦲', + '👨🏽‍🦲', + '👨🏾‍🦲', + '👨🏿‍🦲', + '👩', + '👩🏻', + '👩🏼', + '👩🏽', + '👩🏾', + '👩🏿', + '👱‍♀️', + '👱‍♀', + '👱🏻‍♀️', + '👱🏻‍♀', + '👱🏼‍♀️', + '👱🏼‍♀', + '👱🏽‍♀️', + '👱🏽‍♀', + '👱🏾‍♀️', + '👱🏾‍♀', + '👱🏿‍♀️', + '👱🏿‍♀', + '👩‍🦰', + '👩🏻‍🦰', + '👩🏼‍🦰', + '👩🏽‍🦰', + '👩🏾‍🦰', + '👩🏿‍🦰', + '👩‍🦱', + '👩🏻‍🦱', + '👩🏼‍🦱', + '👩🏽‍🦱', + '👩🏾‍🦱', + '👩🏿‍🦱', + '👩‍🦳', + '👩🏻‍🦳', + '👩🏼‍🦳', + '👩🏽‍🦳', + '👩🏾‍🦳', + '👩🏿‍🦳', + '👩‍🦲', + '👩🏻‍🦲', + '👩🏼‍🦲', + '👩🏽‍🦲', + '👩🏾‍🦲', + '👩🏿‍🦲', + '🧓', + '🧓🏻', + '🧓🏼', + '🧓🏽', + '🧓🏾', + '🧓🏿', + '👴', + '👴🏻', + '👴🏼', + '👴🏽', + '👴🏾', + '👴🏿', + '👵', + '👵🏻', + '👵🏼', + '👵🏽', + '👵🏾', + '👵🏿', + '🙍', + '🙍🏻', + '🙍🏼', + '🙍🏽', + '🙍🏾', + '🙍🏿', + '🙍‍♂️', + '🙍‍♂', + '🙍🏻‍♂️', + '🙍🏻‍♂', + '🙍🏼‍♂️', + '🙍🏼‍♂', + '🙍🏽‍♂️', + '🙍🏽‍♂', + '🙍🏾‍♂️', + '🙍🏾‍♂', + '🙍🏿‍♂️', + '🙍🏿‍♂', + '🙍‍♀️', + '🙍‍♀', + '🙍🏻‍♀️', + '🙍🏻‍♀', + '🙍🏼‍♀️', + '🙍🏼‍♀', + '🙍🏽‍♀️', + '🙍🏽‍♀', + '🙍🏾‍♀️', + '🙍🏾‍♀', + '🙍🏿‍♀️', + '🙍🏿‍♀', + '🙎', + '🙎🏻', + '🙎🏼', + '🙎🏽', + '🙎🏾', + '🙎🏿', + '🙎‍♂️', + '🙎‍♂', + '🙎🏻‍♂️', + '🙎🏻‍♂', + '🙎🏼‍♂️', + '🙎🏼‍♂', + '🙎🏽‍♂️', + '🙎🏽‍♂', + '🙎🏾‍♂️', + '🙎🏾‍♂', + '🙎🏿‍♂️', + '🙎🏿‍♂', + '🙎‍♀️', + '🙎‍♀', + '🙎🏻‍♀️', + '🙎🏻‍♀', + '🙎🏼‍♀️', + '🙎🏼‍♀', + '🙎🏽‍♀️', + '🙎🏽‍♀', + '🙎🏾‍♀️', + '🙎🏾‍♀', + '🙎🏿‍♀️', + '🙎🏿‍♀', + '🙅', + '🙅🏻', + '🙅🏼', + '🙅🏽', + '🙅🏾', + '🙅🏿', + '🙅‍♂️', + '🙅‍♂', + '🙅🏻‍♂️', + '🙅🏻‍♂', + '🙅🏼‍♂️', + '🙅🏼‍♂', + '🙅🏽‍♂️', + '🙅🏽‍♂', + '🙅🏾‍♂️', + '🙅🏾‍♂', + '🙅🏿‍♂️', + '🙅🏿‍♂', + '🙅‍♀️', + '🙅‍♀', + '🙅🏻‍♀️', + '🙅🏻‍♀', + '🙅🏼‍♀️', + '🙅🏼‍♀', + '🙅🏽‍♀️', + '🙅🏽‍♀', + '🙅🏾‍♀️', + '🙅🏾‍♀', + '🙅🏿‍♀️', + '🙅🏿‍♀', + '🙆', + '🙆🏻', + '🙆🏼', + '🙆🏽', + '🙆🏾', + '🙆🏿', + '🙆‍♂️', + '🙆‍♂', + '🙆🏻‍♂️', + '🙆🏻‍♂', + '🙆🏼‍♂️', + '🙆🏼‍♂', + '🙆🏽‍♂️', + '🙆🏽‍♂', + '🙆🏾‍♂️', + '🙆🏾‍♂', + '🙆🏿‍♂️', + '🙆🏿‍♂', + '🙆‍♀️', + '🙆‍♀', + '🙆🏻‍♀️', + '🙆🏻‍♀', + '🙆🏼‍♀️', + '🙆🏼‍♀', + '🙆🏽‍♀️', + '🙆🏽‍♀', + '🙆🏾‍♀️', + '🙆🏾‍♀', + '🙆🏿‍♀️', + '🙆🏿‍♀', + '💁', + '💁🏻', + '💁🏼', + '💁🏽', + '💁🏾', + '💁🏿', + '💁‍♂️', + '💁‍♂', + '💁🏻‍♂️', + '💁🏻‍♂', + '💁🏼‍♂️', + '💁🏼‍♂', + '💁🏽‍♂️', + '💁🏽‍♂', + '💁🏾‍♂️', + '💁🏾‍♂', + '💁🏿‍♂️', + '💁🏿‍♂', + '💁‍♀️', + '💁‍♀', + '💁🏻‍♀️', + '💁🏻‍♀', + '💁🏼‍♀️', + '💁🏼‍♀', + '💁🏽‍♀️', + '💁🏽‍♀', + '💁🏾‍♀️', + '💁🏾‍♀', + '💁🏿‍♀️', + '💁🏿‍♀', + '🙋', + '🙋🏻', + '🙋🏼', + '🙋🏽', + '🙋🏾', + '🙋🏿', + '🙋‍♂️', + '🙋‍♂', + '🙋🏻‍♂️', + '🙋🏻‍♂', + '🙋🏼‍♂️', + '🙋🏼‍♂', + '🙋🏽‍♂️', + '🙋🏽‍♂', + '🙋🏾‍♂️', + '🙋🏾‍♂', + '🙋🏿‍♂️', + '🙋🏿‍♂', + '🙋‍♀️', + '🙋‍♀', + '🙋🏻‍♀️', + '🙋🏻‍♀', + '🙋🏼‍♀️', + '🙋🏼‍♀', + '🙋🏽‍♀️', + '🙋🏽‍♀', + '🙋🏾‍♀️', + '🙋🏾‍♀', + '🙋🏿‍♀️', + '🙋🏿‍♀', + '🧏', + '🧏🏻', + '🧏🏼', + '🧏🏽', + '🧏🏾', + '🧏🏿', + '🧏‍♂️', + '🧏‍♂', + '🧏🏻‍♂️', + '🧏🏻‍♂', + '🧏🏼‍♂️', + '🧏🏼‍♂', + '🧏🏽‍♂️', + '🧏🏽‍♂', + '🧏🏾‍♂️', + '🧏🏾‍♂', + '🧏🏿‍♂️', + '🧏🏿‍♂', + '🧏‍♀️', + '🧏‍♀', + '🧏🏻‍♀️', + '🧏🏻‍♀', + '🧏🏼‍♀️', + '🧏🏼‍♀', + '🧏🏽‍♀️', + '🧏🏽‍♀', + '🧏🏾‍♀️', + '🧏🏾‍♀', + '🧏🏿‍♀️', + '🧏🏿‍♀', + '🙇', + '🙇🏻', + '🙇🏼', + '🙇🏽', + '🙇🏾', + '🙇🏿', + '🙇‍♂️', + '🙇‍♂', + '🙇🏻‍♂️', + '🙇🏻‍♂', + '🙇🏼‍♂️', + '🙇🏼‍♂', + '🙇🏽‍♂️', + '🙇🏽‍♂', + '🙇🏾‍♂️', + '🙇🏾‍♂', + '🙇🏿‍♂️', + '🙇🏿‍♂', + '🙇‍♀️', + '🙇‍♀', + '🙇🏻‍♀️', + '🙇🏻‍♀', + '🙇🏼‍♀️', + '🙇🏼‍♀', + '🙇🏽‍♀️', + '🙇🏽‍♀', + '🙇🏾‍♀️', + '🙇🏾‍♀', + '🙇🏿‍♀️', + '🙇🏿‍♀', + '🤦', + '🤦🏻', + '🤦🏼', + '🤦🏽', + '🤦🏾', + '🤦🏿', + '🤦‍♂️', + '🤦‍♂', + '🤦🏻‍♂️', + '🤦🏻‍♂', + '🤦🏼‍♂️', + '🤦🏼‍♂', + '🤦🏽‍♂️', + '🤦🏽‍♂', + '🤦🏾‍♂️', + '🤦🏾‍♂', + '🤦🏿‍♂️', + '🤦🏿‍♂', + '🤦‍♀️', + '🤦‍♀', + '🤦🏻‍♀️', + '🤦🏻‍♀', + '🤦🏼‍♀️', + '🤦🏼‍♀', + '🤦🏽‍♀️', + '🤦🏽‍♀', + '🤦🏾‍♀️', + '🤦🏾‍♀', + '🤦🏿‍♀️', + '🤦🏿‍♀', + '🤷', + '🤷🏻', + '🤷🏼', + '🤷🏽', + '🤷🏾', + '🤷🏿', + 
'🤷‍♂️', + '🤷‍♂', + '🤷🏻‍♂️', + '🤷🏻‍♂', + '🤷🏼‍♂️', + '🤷🏼‍♂', + '🤷🏽‍♂️', + '🤷🏽‍♂', + '🤷🏾‍♂️', + '🤷🏾‍♂', + '🤷🏿‍♂️', + '🤷🏿‍♂', + '🤷‍♀️', + '🤷‍♀', + '🤷🏻‍♀️', + '🤷🏻‍♀', + '🤷🏼‍♀️', + '🤷🏼‍♀', + '🤷🏽‍♀️', + '🤷🏽‍♀', + '🤷🏾‍♀️', + '🤷🏾‍♀', + '🤷🏿‍♀️', + '🤷🏿‍♀', + '👨‍⚕️', + '👨‍⚕', + '👨🏻‍⚕️', + '👨🏻‍⚕', + '👨🏼‍⚕️', + '👨🏼‍⚕', + '👨🏽‍⚕️', + '👨🏽‍⚕', + '👨🏾‍⚕️', + '👨🏾‍⚕', + '👨🏿‍⚕️', + '👨🏿‍⚕', + '👩‍⚕️', + '👩‍⚕', + '👩🏻‍⚕️', + '👩🏻‍⚕', + '👩🏼‍⚕️', + '👩🏼‍⚕', + '👩🏽‍⚕️', + '👩🏽‍⚕', + '👩🏾‍⚕️', + '👩🏾‍⚕', + '👩🏿‍⚕️', + '👩🏿‍⚕', + '👨‍🎓', + '👨🏻‍🎓', + '👨🏼‍🎓', + '👨🏽‍🎓', + '👨🏾‍🎓', + '👨🏿‍🎓', + '👩‍🎓', + '👩🏻‍🎓', + '👩🏼‍🎓', + '👩🏽‍🎓', + '👩🏾‍🎓', + '👩🏿‍🎓', + '👨‍🏫', + '👨🏻‍🏫', + '👨🏼‍🏫', + '👨🏽‍🏫', + '👨🏾‍🏫', + '👨🏿‍🏫', + '👩‍🏫', + '👩🏻‍🏫', + '👩🏼‍🏫', + '👩🏽‍🏫', + '👩🏾‍🏫', + '👩🏿‍🏫', + '👨‍⚖️', + '👨‍⚖', + '👨🏻‍⚖️', + '👨🏻‍⚖', + '👨🏼‍⚖️', + '👨🏼‍⚖', + '👨🏽‍⚖️', + '👨🏽‍⚖', + '👨🏾‍⚖️', + '👨🏾‍⚖', + '👨🏿‍⚖️', + '👨🏿‍⚖', + '👩‍⚖️', + '👩‍⚖', + '👩🏻‍⚖️', + '👩🏻‍⚖', + '👩🏼‍⚖️', + '👩🏼‍⚖', + '👩🏽‍⚖️', + '👩🏽‍⚖', + '👩🏾‍⚖️', + '👩🏾‍⚖', + '👩🏿‍⚖️', + '👩🏿‍⚖', + '👨‍🌾', + '👨🏻‍🌾', + '👨🏼‍🌾', + '👨🏽‍🌾', + '👨🏾‍🌾', + '👨🏿‍🌾', + '👩‍🌾', + '👩🏻‍🌾', + '👩🏼‍🌾', + '👩🏽‍🌾', + '👩🏾‍🌾', + '👩🏿‍🌾', + '👨‍🍳', + '👨🏻‍🍳', + '👨🏼‍🍳', + '👨🏽‍🍳', + '👨🏾‍🍳', + '👨🏿‍🍳', + '👩‍🍳', + '👩🏻‍🍳', + '👩🏼‍🍳', + '👩🏽‍🍳', + '👩🏾‍🍳', + '👩🏿‍🍳', + '👨‍🔧', + '👨🏻‍🔧', + '👨🏼‍🔧', + '👨🏽‍🔧', + '👨🏾‍🔧', + '👨🏿‍🔧', + '👩‍🔧', + '👩🏻‍🔧', + '👩🏼‍🔧', + '👩🏽‍🔧', + '👩🏾‍🔧', + '👩🏿‍🔧', + '👨‍🏭', + '👨🏻‍🏭', + '👨🏼‍🏭', + '👨🏽‍🏭', + '👨🏾‍🏭', + '👨🏿‍🏭', + '👩‍🏭', + '👩🏻‍🏭', + '👩🏼‍🏭', + '👩🏽‍🏭', + '👩🏾‍🏭', + '👩🏿‍🏭', + '👨‍💼', + '👨🏻‍💼', + '👨🏼‍💼', + '👨🏽‍💼', + '👨🏾‍💼', + '👨🏿‍💼', + '👩‍💼', + '👩🏻‍💼', + '👩🏼‍💼', + '👩🏽‍💼', + '👩🏾‍💼', + '👩🏿‍💼', + '👨‍🔬', + '👨🏻‍🔬', + '👨🏼‍🔬', + '👨🏽‍🔬', + '👨🏾‍🔬', + '👨🏿‍🔬', + '👩‍🔬', + '👩🏻‍🔬', + '👩🏼‍🔬', + '👩🏽‍🔬', + '👩🏾‍🔬', + '👩🏿‍🔬', + '👨‍💻', + '👨🏻‍💻', + '👨🏼‍💻', + '👨🏽‍💻', + '👨🏾‍💻', + '👨🏿‍💻', + '👩‍💻', + '👩🏻‍💻', + '👩🏼‍💻', + '👩🏽‍💻', + '👩🏾‍💻', + '👩🏿‍💻', + '👨‍🎤', + '👨🏻‍🎤', + '👨🏼‍🎤', + '👨🏽‍🎤', + '👨🏾‍🎤', + '👨🏿‍🎤', + '👩‍🎤', + '👩🏻‍🎤', + '👩🏼‍🎤', + '👩🏽‍🎤', + '👩🏾‍🎤', + '👩🏿‍🎤', + '👨‍🎨', + '👨🏻‍🎨', + '👨🏼‍🎨', + '👨🏽‍🎨', + '👨🏾‍🎨', + '👨🏿‍🎨', + '👩‍🎨', + '👩🏻‍🎨', + '👩🏼‍🎨', + '👩🏽‍🎨', + '👩🏾‍🎨', + '👩🏿‍🎨', + '👨‍✈️', + '👨‍✈', + '👨🏻‍✈️', + '👨🏻‍✈', + '👨🏼‍✈️', + '👨🏼‍✈', + '👨🏽‍✈️', + '👨🏽‍✈', + '👨🏾‍✈️', + '👨🏾‍✈', + '👨🏿‍✈️', + '👨🏿‍✈', + '👩‍✈️', + '👩‍✈', + '👩🏻‍✈️', + '👩🏻‍✈', + '👩🏼‍✈️', + '👩🏼‍✈', + '👩🏽‍✈️', + '👩🏽‍✈', + '👩🏾‍✈️', + '👩🏾‍✈', + '👩🏿‍✈️', + '👩🏿‍✈', + '👨‍🚀', + '👨🏻‍🚀', + '👨🏼‍🚀', + '👨🏽‍🚀', + '👨🏾‍🚀', + '👨🏿‍🚀', + '👩‍🚀', + '👩🏻‍🚀', + '👩🏼‍🚀', + '👩🏽‍🚀', + '👩🏾‍🚀', + '👩🏿‍🚀', + '👨‍🚒', + '👨🏻‍🚒', + '👨🏼‍🚒', + '👨🏽‍🚒', + '👨🏾‍🚒', + '👨🏿‍🚒', + '👩‍🚒', + '👩🏻‍🚒', + '👩🏼‍🚒', + '👩🏽‍🚒', + '👩🏾‍🚒', + '👩🏿‍🚒', + '👮', + '👮🏻', + '👮🏼', + '👮🏽', + '👮🏾', + '👮🏿', + '👮‍♂️', + '👮‍♂', + '👮🏻‍♂️', + '👮🏻‍♂', + '👮🏼‍♂️', + '👮🏼‍♂', + '👮🏽‍♂️', + '👮🏽‍♂', + '👮🏾‍♂️', + '👮🏾‍♂', + '👮🏿‍♂️', + '👮🏿‍♂', + '👮‍♀️', + '👮‍♀', + '👮🏻‍♀️', + '👮🏻‍♀', + '👮🏼‍♀️', + '👮🏼‍♀', + '👮🏽‍♀️', + '👮🏽‍♀', + '👮🏾‍♀️', + '👮🏾‍♀', + '👮🏿‍♀️', + '👮🏿‍♀', + '🕵️', + '🕵', + '🕵🏻', + '🕵🏼', + '🕵🏽', + '🕵🏾', + '🕵🏿', + '🕵️‍♂️', + '🕵‍♂️', + '🕵️‍♂', + '🕵‍♂', + '🕵🏻‍♂️', + '🕵🏻‍♂', + '🕵🏼‍♂️', + '🕵🏼‍♂', + '🕵🏽‍♂️', + '🕵🏽‍♂', + '🕵🏾‍♂️', + '🕵🏾‍♂', + '🕵🏿‍♂️', + '🕵🏿‍♂', + '🕵️‍♀️', + '🕵‍♀️', + '🕵️‍♀', + '🕵‍♀', + '🕵🏻‍♀️', + '🕵🏻‍♀', + '🕵🏼‍♀️', + '🕵🏼‍♀', + '🕵🏽‍♀️', + '🕵🏽‍♀', + '🕵🏾‍♀️', + '🕵🏾‍♀', + '🕵🏿‍♀️', + '🕵🏿‍♀', + '💂', + '💂🏻', + '💂🏼', + '💂🏽', + '💂🏾', + '💂🏿', + '💂‍♂️', + '💂‍♂', + '💂🏻‍♂️', + '💂🏻‍♂', + '💂🏼‍♂️', + '💂🏼‍♂', + '💂🏽‍♂️', + '💂🏽‍♂', + '💂🏾‍♂️', + '💂🏾‍♂', + '💂🏿‍♂️', + '💂🏿‍♂', + '💂‍♀️', + '💂‍♀', + '💂🏻‍♀️', + '💂🏻‍♀', + '💂🏼‍♀️', + '💂🏼‍♀', + '💂🏽‍♀️', + '💂🏽‍♀', + '💂🏾‍♀️', + '💂🏾‍♀', + '💂🏿‍♀️', + '💂🏿‍♀', + '👷', + '👷🏻', + '👷🏼', + '👷🏽', + '👷🏾', + '👷🏿', + '👷‍♂️', + '👷‍♂', + '👷🏻‍♂️', + '👷🏻‍♂', + 
'👷🏼‍♂️', + '👷🏼‍♂', + '👷🏽‍♂️', + '👷🏽‍♂', + '👷🏾‍♂️', + '👷🏾‍♂', + '👷🏿‍♂️', + '👷🏿‍♂', + '👷‍♀️', + '👷‍♀', + '👷🏻‍♀️', + '👷🏻‍♀', + '👷🏼‍♀️', + '👷🏼‍♀', + '👷🏽‍♀️', + '👷🏽‍♀', + '👷🏾‍♀️', + '👷🏾‍♀', + '👷🏿‍♀️', + '👷🏿‍♀', + '🤴', + '🤴🏻', + '🤴🏼', + '🤴🏽', + '🤴🏾', + '🤴🏿', + '👸', + '👸🏻', + '👸🏼', + '👸🏽', + '👸🏾', + '👸🏿', + '👳', + '👳🏻', + '👳🏼', + '👳🏽', + '👳🏾', + '👳🏿', + '👳‍♂️', + '👳‍♂', + '👳🏻‍♂️', + '👳🏻‍♂', + '👳🏼‍♂️', + '👳🏼‍♂', + '👳🏽‍♂️', + '👳🏽‍♂', + '👳🏾‍♂️', + '👳🏾‍♂', + '👳🏿‍♂️', + '👳🏿‍♂', + '👳‍♀️', + '👳‍♀', + '👳🏻‍♀️', + '👳🏻‍♀', + '👳🏼‍♀️', + '👳🏼‍♀', + '👳🏽‍♀️', + '👳🏽‍♀', + '👳🏾‍♀️', + '👳🏾‍♀', + '👳🏿‍♀️', + '👳🏿‍♀', + '👲', + '👲🏻', + '👲🏼', + '👲🏽', + '👲🏾', + '👲🏿', + '🧕', + '🧕🏻', + '🧕🏼', + '🧕🏽', + '🧕🏾', + '🧕🏿', + '🤵', + '🤵🏻', + '🤵🏼', + '🤵🏽', + '🤵🏾', + '🤵🏿', + '👰', + '👰🏻', + '👰🏼', + '👰🏽', + '👰🏾', + '👰🏿', + '🤰', + '🤰🏻', + '🤰🏼', + '🤰🏽', + '🤰🏾', + '🤰🏿', + '🤱', + '🤱🏻', + '🤱🏼', + '🤱🏽', + '🤱🏾', + '🤱🏿', + '👼', + '👼🏻', + '👼🏼', + '👼🏽', + '👼🏾', + '👼🏿', + '🎅', + '🎅🏻', + '🎅🏼', + '🎅🏽', + '🎅🏾', + '🎅🏿', + '🤶', + '🤶🏻', + '🤶🏼', + '🤶🏽', + '🤶🏾', + '🤶🏿', + '🦸', + '🦸🏻', + '🦸🏼', + '🦸🏽', + '🦸🏾', + '🦸🏿', + '🦸‍♂️', + '🦸‍♂', + '🦸🏻‍♂️', + '🦸🏻‍♂', + '🦸🏼‍♂️', + '🦸🏼‍♂', + '🦸🏽‍♂️', + '🦸🏽‍♂', + '🦸🏾‍♂️', + '🦸🏾‍♂', + '🦸🏿‍♂️', + '🦸🏿‍♂', + '🦸‍♀️', + '🦸‍♀', + '🦸🏻‍♀️', + '🦸🏻‍♀', + '🦸🏼‍♀️', + '🦸🏼‍♀', + '🦸🏽‍♀️', + '🦸🏽‍♀', + '🦸🏾‍♀️', + '🦸🏾‍♀', + '🦸🏿‍♀️', + '🦸🏿‍♀', + '🦹', + '🦹🏻', + '🦹🏼', + '🦹🏽', + '🦹🏾', + '🦹🏿', + '🦹‍♂️', + '🦹‍♂', + '🦹🏻‍♂️', + '🦹🏻‍♂', + '🦹🏼‍♂️', + '🦹🏼‍♂', + '🦹🏽‍♂️', + '🦹🏽‍♂', + '🦹🏾‍♂️', + '🦹🏾‍♂', + '🦹🏿‍♂️', + '🦹🏿‍♂', + '🦹‍♀️', + '🦹‍♀', + '🦹🏻‍♀️', + '🦹🏻‍♀', + '🦹🏼‍♀️', + '🦹🏼‍♀', + '🦹🏽‍♀️', + '🦹🏽‍♀', + '🦹🏾‍♀️', + '🦹🏾‍♀', + '🦹🏿‍♀️', + '🦹🏿‍♀', + '🧙', + '🧙🏻', + '🧙🏼', + '🧙🏽', + '🧙🏾', + '🧙🏿', + '🧙‍♂️', + '🧙‍♂', + '🧙🏻‍♂️', + '🧙🏻‍♂', + '🧙🏼‍♂️', + '🧙🏼‍♂', + '🧙🏽‍♂️', + '🧙🏽‍♂', + '🧙🏾‍♂️', + '🧙🏾‍♂', + '🧙🏿‍♂️', + '🧙🏿‍♂', + '🧙‍♀️', + '🧙‍♀', + '🧙🏻‍♀️', + '🧙🏻‍♀', + '🧙🏼‍♀️', + '🧙🏼‍♀', + '🧙🏽‍♀️', + '🧙🏽‍♀', + '🧙🏾‍♀️', + '🧙🏾‍♀', + '🧙🏿‍♀️', + '🧙🏿‍♀', + '🧚', + '🧚🏻', + '🧚🏼', + '🧚🏽', + '🧚🏾', + '🧚🏿', + '🧚‍♂️', + '🧚‍♂', + '🧚🏻‍♂️', + '🧚🏻‍♂', + '🧚🏼‍♂️', + '🧚🏼‍♂', + '🧚🏽‍♂️', + '🧚🏽‍♂', + '🧚🏾‍♂️', + '🧚🏾‍♂', + '🧚🏿‍♂️', + '🧚🏿‍♂', + '🧚‍♀️', + '🧚‍♀', + '🧚🏻‍♀️', + '🧚🏻‍♀', + '🧚🏼‍♀️', + '🧚🏼‍♀', + '🧚🏽‍♀️', + '🧚🏽‍♀', + '🧚🏾‍♀️', + '🧚🏾‍♀', + '🧚🏿‍♀️', + '🧚🏿‍♀', + '🧛', + '🧛🏻', + '🧛🏼', + '🧛🏽', + '🧛🏾', + '🧛🏿', + '🧛‍♂️', + '🧛‍♂', + '🧛🏻‍♂️', + '🧛🏻‍♂', + '🧛🏼‍♂️', + '🧛🏼‍♂', + '🧛🏽‍♂️', + '🧛🏽‍♂', + '🧛🏾‍♂️', + '🧛🏾‍♂', + '🧛🏿‍♂️', + '🧛🏿‍♂', + '🧛‍♀️', + '🧛‍♀', + '🧛🏻‍♀️', + '🧛🏻‍♀', + '🧛🏼‍♀️', + '🧛🏼‍♀', + '🧛🏽‍♀️', + '🧛🏽‍♀', + '🧛🏾‍♀️', + '🧛🏾‍♀', + '🧛🏿‍♀️', + '🧛🏿‍♀', + '🧜', + '🧜🏻', + '🧜🏼', + '🧜🏽', + '🧜🏾', + '🧜🏿', + '🧜‍♂️', + '🧜‍♂', + '🧜🏻‍♂️', + '🧜🏻‍♂', + '🧜🏼‍♂️', + '🧜🏼‍♂', + '🧜🏽‍♂️', + '🧜🏽‍♂', + '🧜🏾‍♂️', + '🧜🏾‍♂', + '🧜🏿‍♂️', + '🧜🏿‍♂', + '🧜‍♀️', + '🧜‍♀', + '🧜🏻‍♀️', + '🧜🏻‍♀', + '🧜🏼‍♀️', + '🧜🏼‍♀', + '🧜🏽‍♀️', + '🧜🏽‍♀', + '🧜🏾‍♀️', + '🧜🏾‍♀', + '🧜🏿‍♀️', + '🧜🏿‍♀', + '🧝', + '🧝🏻', + '🧝🏼', + '🧝🏽', + '🧝🏾', + '🧝🏿', + '🧝‍♂️', + '🧝‍♂', + '🧝🏻‍♂️', + '🧝🏻‍♂', + '🧝🏼‍♂️', + '🧝🏼‍♂', + '🧝🏽‍♂️', + '🧝🏽‍♂', + '🧝🏾‍♂️', + '🧝🏾‍♂', + '🧝🏿‍♂️', + '🧝🏿‍♂', + '🧝‍♀️', + '🧝‍♀', + '🧝🏻‍♀️', + '🧝🏻‍♀', + '🧝🏼‍♀️', + '🧝🏼‍♀', + '🧝🏽‍♀️', + '🧝🏽‍♀', + '🧝🏾‍♀️', + '🧝🏾‍♀', + '🧝🏿‍♀️', + '🧝🏿‍♀', + '🧞', + '🧞‍♂️', + '🧞‍♂', + '🧞‍♀️', + '🧞‍♀', + '🧟', + '🧟‍♂️', + '🧟‍♂', + '🧟‍♀️', + '🧟‍♀', + '💆', + '💆🏻', + '💆🏼', + '💆🏽', + '💆🏾', + '💆🏿', + '💆‍♂️', + '💆‍♂', + '💆🏻‍♂️', + '💆🏻‍♂', + '💆🏼‍♂️', + '💆🏼‍♂', + '💆🏽‍♂️', + '💆🏽‍♂', + '💆🏾‍♂️', + '💆🏾‍♂', + '💆🏿‍♂️', + '💆🏿‍♂', + '💆‍♀️', + '💆‍♀', + '💆🏻‍♀️', + '💆🏻‍♀', + '💆🏼‍♀️', + '💆🏼‍♀', + '💆🏽‍♀️', + '💆🏽‍♀', + '💆🏾‍♀️', + '💆🏾‍♀', + '💆🏿‍♀️', + '💆🏿‍♀', + '💇', + '💇🏻', + '💇🏼', + '💇🏽', + '💇🏾', + '💇🏿', + '💇‍♂️', + '💇‍♂', + '💇🏻‍♂️', + '💇🏻‍♂', 
+ '💇🏼‍♂️', + '💇🏼‍♂', + '💇🏽‍♂️', + '💇🏽‍♂', + '💇🏾‍♂️', + '💇🏾‍♂', + '💇🏿‍♂️', + '💇🏿‍♂', + '💇‍♀️', + '💇‍♀', + '💇🏻‍♀️', + '💇🏻‍♀', + '💇🏼‍♀️', + '💇🏼‍♀', + '💇🏽‍♀️', + '💇🏽‍♀', + '💇🏾‍♀️', + '💇🏾‍♀', + '💇🏿‍♀️', + '💇🏿‍♀', + '🚶', + '🚶🏻', + '🚶🏼', + '🚶🏽', + '🚶🏾', + '🚶🏿', + '🚶‍♂️', + '🚶‍♂', + '🚶🏻‍♂️', + '🚶🏻‍♂', + '🚶🏼‍♂️', + '🚶🏼‍♂', + '🚶🏽‍♂️', + '🚶🏽‍♂', + '🚶🏾‍♂️', + '🚶🏾‍♂', + '🚶🏿‍♂️', + '🚶🏿‍♂', + '🚶‍♀️', + '🚶‍♀', + '🚶🏻‍♀️', + '🚶🏻‍♀', + '🚶🏼‍♀️', + '🚶🏼‍♀', + '🚶🏽‍♀️', + '🚶🏽‍♀', + '🚶🏾‍♀️', + '🚶🏾‍♀', + '🚶🏿‍♀️', + '🚶🏿‍♀', + '🧍', + '🧍🏻', + '🧍🏼', + '🧍🏽', + '🧍🏾', + '🧍🏿', + '🧍‍♂️', + '🧍‍♂', + '🧍🏻‍♂️', + '🧍🏻‍♂', + '🧍🏼‍♂️', + '🧍🏼‍♂', + '🧍🏽‍♂️', + '🧍🏽‍♂', + '🧍🏾‍♂️', + '🧍🏾‍♂', + '🧍🏿‍♂️', + '🧍🏿‍♂', + '🧍‍♀️', + '🧍‍♀', + '🧍🏻‍♀️', + '🧍🏻‍♀', + '🧍🏼‍♀️', + '🧍🏼‍♀', + '🧍🏽‍♀️', + '🧍🏽‍♀', + '🧍🏾‍♀️', + '🧍🏾‍♀', + '🧍🏿‍♀️', + '🧍🏿‍♀', + '🧎', + '🧎🏻', + '🧎🏼', + '🧎🏽', + '🧎🏾', + '🧎🏿', + '🧎‍♂️', + '🧎‍♂', + '🧎🏻‍♂️', + '🧎🏻‍♂', + '🧎🏼‍♂️', + '🧎🏼‍♂', + '🧎🏽‍♂️', + '🧎🏽‍♂', + '🧎🏾‍♂️', + '🧎🏾‍♂', + '🧎🏿‍♂️', + '🧎🏿‍♂', + '🧎‍♀️', + '🧎‍♀', + '🧎🏻‍♀️', + '🧎🏻‍♀', + '🧎🏼‍♀️', + '🧎🏼‍♀', + '🧎🏽‍♀️', + '🧎🏽‍♀', + '🧎🏾‍♀️', + '🧎🏾‍♀', + '🧎🏿‍♀️', + '🧎🏿‍♀', + '👨‍🦯', + '👨🏻‍🦯', + '👨🏼‍🦯', + '👨🏽‍🦯', + '👨🏾‍🦯', + '👨🏿‍🦯', + '👩‍🦯', + '👩🏻‍🦯', + '👩🏼‍🦯', + '👩🏽‍🦯', + '👩🏾‍🦯', + '👩🏿‍🦯', + '👨‍🦼', + '👨🏻‍🦼', + '👨🏼‍🦼', + '👨🏽‍🦼', + '👨🏾‍🦼', + '👨🏿‍🦼', + '👩‍🦼', + '👩🏻‍🦼', + '👩🏼‍🦼', + '👩🏽‍🦼', + '👩🏾‍🦼', + '👩🏿‍🦼', + '👨‍🦽', + '👨🏻‍🦽', + '👨🏼‍🦽', + '👨🏽‍🦽', + '👨🏾‍🦽', + '👨🏿‍🦽', + '👩‍🦽', + '👩🏻‍🦽', + '👩🏼‍🦽', + '👩🏽‍🦽', + '👩🏾‍🦽', + '👩🏿‍🦽', + '🏃', + '🏃🏻', + '🏃🏼', + '🏃🏽', + '🏃🏾', + '🏃🏿', + '🏃‍♂️', + '🏃‍♂', + '🏃🏻‍♂️', + '🏃🏻‍♂', + '🏃🏼‍♂️', + '🏃🏼‍♂', + '🏃🏽‍♂️', + '🏃🏽‍♂', + '🏃🏾‍♂️', + '🏃🏾‍♂', + '🏃🏿‍♂️', + '🏃🏿‍♂', + '🏃‍♀️', + '🏃‍♀', + '🏃🏻‍♀️', + '🏃🏻‍♀', + '🏃🏼‍♀️', + '🏃🏼‍♀', + '🏃🏽‍♀️', + '🏃🏽‍♀', + '🏃🏾‍♀️', + '🏃🏾‍♀', + '🏃🏿‍♀️', + '🏃🏿‍♀', + '💃', + '💃🏻', + '💃🏼', + '💃🏽', + '💃🏾', + '💃🏿', + '🕺', + '🕺🏻', + '🕺🏼', + '🕺🏽', + '🕺🏾', + '🕺🏿', + '🕴️', + '🕴', + '🕴🏻', + '🕴🏼', + '🕴🏽', + '🕴🏾', + '🕴🏿', + '👯', + '👯‍♂️', + '👯‍♂', + '👯‍♀️', + '👯‍♀', + '🧖', + '🧖🏻', + '🧖🏼', + '🧖🏽', + '🧖🏾', + '🧖🏿', + '🧖‍♂️', + '🧖‍♂', + '🧖🏻‍♂️', + '🧖🏻‍♂', + '🧖🏼‍♂️', + '🧖🏼‍♂', + '🧖🏽‍♂️', + '🧖🏽‍♂', + '🧖🏾‍♂️', + '🧖🏾‍♂', + '🧖🏿‍♂️', + '🧖🏿‍♂', + '🧖‍♀️', + '🧖‍♀', + '🧖🏻‍♀️', + '🧖🏻‍♀', + '🧖🏼‍♀️', + '🧖🏼‍♀', + '🧖🏽‍♀️', + '🧖🏽‍♀', + '🧖🏾‍♀️', + '🧖🏾‍♀', + '🧖🏿‍♀️', + '🧖🏿‍♀', + '🧗', + '🧗🏻', + '🧗🏼', + '🧗🏽', + '🧗🏾', + '🧗🏿', + '🧗‍♂️', + '🧗‍♂', + '🧗🏻‍♂️', + '🧗🏻‍♂', + '🧗🏼‍♂️', + '🧗🏼‍♂', + '🧗🏽‍♂️', + '🧗🏽‍♂', + '🧗🏾‍♂️', + '🧗🏾‍♂', + '🧗🏿‍♂️', + '🧗🏿‍♂', + '🧗‍♀️', + '🧗‍♀', + '🧗🏻‍♀️', + '🧗🏻‍♀', + '🧗🏼‍♀️', + '🧗🏼‍♀', + '🧗🏽‍♀️', + '🧗🏽‍♀', + '🧗🏾‍♀️', + '🧗🏾‍♀', + '🧗🏿‍♀️', + '🧗🏿‍♀', + '🤺', + '🏇', + '🏇🏻', + '🏇🏼', + '🏇🏽', + '🏇🏾', + '🏇🏿', + '⛷️', + '⛷', + '🏂', + '🏂🏻', + '🏂🏼', + '🏂🏽', + '🏂🏾', + '🏂🏿', + '🏌️', + '🏌', + '🏌🏻', + '🏌🏼', + '🏌🏽', + '🏌🏾', + '🏌🏿', + '🏌️‍♂️', + '🏌‍♂️', + '🏌️‍♂', + '🏌‍♂', + '🏌🏻‍♂️', + '🏌🏻‍♂', + '🏌🏼‍♂️', + '🏌🏼‍♂', + '🏌🏽‍♂️', + '🏌🏽‍♂', + '🏌🏾‍♂️', + '🏌🏾‍♂', + '🏌🏿‍♂️', + '🏌🏿‍♂', + '🏌️‍♀️', + '🏌‍♀️', + '🏌️‍♀', + '🏌‍♀', + '🏌🏻‍♀️', + '🏌🏻‍♀', + '🏌🏼‍♀️', + '🏌🏼‍♀', + '🏌🏽‍♀️', + '🏌🏽‍♀', + '🏌🏾‍♀️', + '🏌🏾‍♀', + '🏌🏿‍♀️', + '🏌🏿‍♀', + '🏄', + '🏄🏻', + '🏄🏼', + '🏄🏽', + '🏄🏾', + '🏄🏿', + '🏄‍♂️', + '🏄‍♂', + '🏄🏻‍♂️', + '🏄🏻‍♂', + '🏄🏼‍♂️', + '🏄🏼‍♂', + '🏄🏽‍♂️', + '🏄🏽‍♂', + '🏄🏾‍♂️', + '🏄🏾‍♂', + '🏄🏿‍♂️', + '🏄🏿‍♂', + '🏄‍♀️', + '🏄‍♀', + '🏄🏻‍♀️', + '🏄🏻‍♀', + '🏄🏼‍♀️', + '🏄🏼‍♀', + '🏄🏽‍♀️', + '🏄🏽‍♀', + '🏄🏾‍♀️', + '🏄🏾‍♀', + '🏄🏿‍♀️', + '🏄🏿‍♀', + '🚣', + '🚣🏻', + '🚣🏼', + '🚣🏽', + '🚣🏾', + '🚣🏿', + '🚣‍♂️', + '🚣‍♂', + '🚣🏻‍♂️', + '🚣🏻‍♂', + '🚣🏼‍♂️', + '🚣🏼‍♂', + '🚣🏽‍♂️', + '🚣🏽‍♂', + '🚣🏾‍♂️', + '🚣🏾‍♂', + '🚣🏿‍♂️', + '🚣🏿‍♂', + '🚣‍♀️', + '🚣‍♀', + '🚣🏻‍♀️', + '🚣🏻‍♀', + '🚣🏼‍♀️', + '🚣🏼‍♀', + '🚣🏽‍♀️', + '🚣🏽‍♀', + '🚣🏾‍♀️', + 
'🚣🏾‍♀', + '🚣🏿‍♀️', + '🚣🏿‍♀', + '🏊', + '🏊🏻', + '🏊🏼', + '🏊🏽', + '🏊🏾', + '🏊🏿', + '🏊‍♂️', + '🏊‍♂', + '🏊🏻‍♂️', + '🏊🏻‍♂', + '🏊🏼‍♂️', + '🏊🏼‍♂', + '🏊🏽‍♂️', + '🏊🏽‍♂', + '🏊🏾‍♂️', + '🏊🏾‍♂', + '🏊🏿‍♂️', + '🏊🏿‍♂', + '🏊‍♀️', + '🏊‍♀', + '🏊🏻‍♀️', + '🏊🏻‍♀', + '🏊🏼‍♀️', + '🏊🏼‍♀', + '🏊🏽‍♀️', + '🏊🏽‍♀', + '🏊🏾‍♀️', + '🏊🏾‍♀', + '🏊🏿‍♀️', + '🏊🏿‍♀', + '⛹️', + '⛹', + '⛹🏻', + '⛹🏼', + '⛹🏽', + '⛹🏾', + '⛹🏿', + '⛹️‍♂️', + '⛹‍♂️', + '⛹️‍♂', + '⛹‍♂', + '⛹🏻‍♂️', + '⛹🏻‍♂', + '⛹🏼‍♂️', + '⛹🏼‍♂', + '⛹🏽‍♂️', + '⛹🏽‍♂', + '⛹🏾‍♂️', + '⛹🏾‍♂', + '⛹🏿‍♂️', + '⛹🏿‍♂', + '⛹️‍♀️', + '⛹‍♀️', + '⛹️‍♀', + '⛹‍♀', + '⛹🏻‍♀️', + '⛹🏻‍♀', + '⛹🏼‍♀️', + '⛹🏼‍♀', + '⛹🏽‍♀️', + '⛹🏽‍♀', + '⛹🏾‍♀️', + '⛹🏾‍♀', + '⛹🏿‍♀️', + '⛹🏿‍♀', + '🏋️', + '🏋', + '🏋🏻', + '🏋🏼', + '🏋🏽', + '🏋🏾', + '🏋🏿', + '🏋️‍♂️', + '🏋‍♂️', + '🏋️‍♂', + '🏋‍♂', + '🏋🏻‍♂️', + '🏋🏻‍♂', + '🏋🏼‍♂️', + '🏋🏼‍♂', + '🏋🏽‍♂️', + '🏋🏽‍♂', + '🏋🏾‍♂️', + '🏋🏾‍♂', + '🏋🏿‍♂️', + '🏋🏿‍♂', + '🏋️‍♀️', + '🏋‍♀️', + '🏋️‍♀', + '🏋‍♀', + '🏋🏻‍♀️', + '🏋🏻‍♀', + '🏋🏼‍♀️', + '🏋🏼‍♀', + '🏋🏽‍♀️', + '🏋🏽‍♀', + '🏋🏾‍♀️', + '🏋🏾‍♀', + '🏋🏿‍♀️', + '🏋🏿‍♀', + '🚴', + '🚴🏻', + '🚴🏼', + '🚴🏽', + '🚴🏾', + '🚴🏿', + '🚴‍♂️', + '🚴‍♂', + '🚴🏻‍♂️', + '🚴🏻‍♂', + '🚴🏼‍♂️', + '🚴🏼‍♂', + '🚴🏽‍♂️', + '🚴🏽‍♂', + '🚴🏾‍♂️', + '🚴🏾‍♂', + '🚴🏿‍♂️', + '🚴🏿‍♂', + '🚴‍♀️', + '🚴‍♀', + '🚴🏻‍♀️', + '🚴🏻‍♀', + '🚴🏼‍♀️', + '🚴🏼‍♀', + '🚴🏽‍♀️', + '🚴🏽‍♀', + '🚴🏾‍♀️', + '🚴🏾‍♀', + '🚴🏿‍♀️', + '🚴🏿‍♀', + '🚵', + '🚵🏻', + '🚵🏼', + '🚵🏽', + '🚵🏾', + '🚵🏿', + '🚵‍♂️', + '🚵‍♂', + '🚵🏻‍♂️', + '🚵🏻‍♂', + '🚵🏼‍♂️', + '🚵🏼‍♂', + '🚵🏽‍♂️', + '🚵🏽‍♂', + '🚵🏾‍♂️', + '🚵🏾‍♂', + '🚵🏿‍♂️', + '🚵🏿‍♂', + '🚵‍♀️', + '🚵‍♀', + '🚵🏻‍♀️', + '🚵🏻‍♀', + '🚵🏼‍♀️', + '🚵🏼‍♀', + '🚵🏽‍♀️', + '🚵🏽‍♀', + '🚵🏾‍♀️', + '🚵🏾‍♀', + '🚵🏿‍♀️', + '🚵🏿‍♀', + '🤸', + '🤸🏻', + '🤸🏼', + '🤸🏽', + '🤸🏾', + '🤸🏿', + '🤸‍♂️', + '🤸‍♂', + '🤸🏻‍♂️', + '🤸🏻‍♂', + '🤸🏼‍♂️', + '🤸🏼‍♂', + '🤸🏽‍♂️', + '🤸🏽‍♂', + '🤸🏾‍♂️', + '🤸🏾‍♂', + '🤸🏿‍♂️', + '🤸🏿‍♂', + '🤸‍♀️', + '🤸‍♀', + '🤸🏻‍♀️', + '🤸🏻‍♀', + '🤸🏼‍♀️', + '🤸🏼‍♀', + '🤸🏽‍♀️', + '🤸🏽‍♀', + '🤸🏾‍♀️', + '🤸🏾‍♀', + '🤸🏿‍♀️', + '🤸🏿‍♀', + '🤼', + '🤼‍♂️', + '🤼‍♂', + '🤼‍♀️', + '🤼‍♀', + '🤽', + '🤽🏻', + '🤽🏼', + '🤽🏽', + '🤽🏾', + '🤽🏿', + '🤽‍♂️', + '🤽‍♂', + '🤽🏻‍♂️', + '🤽🏻‍♂', + '🤽🏼‍♂️', + '🤽🏼‍♂', + '🤽🏽‍♂️', + '🤽🏽‍♂', + '🤽🏾‍♂️', + '🤽🏾‍♂', + '🤽🏿‍♂️', + '🤽🏿‍♂', + '🤽‍♀️', + '🤽‍♀', + '🤽🏻‍♀️', + '🤽🏻‍♀', + '🤽🏼‍♀️', + '🤽🏼‍♀', + '🤽🏽‍♀️', + '🤽🏽‍♀', + '🤽🏾‍♀️', + '🤽🏾‍♀', + '🤽🏿‍♀️', + '🤽🏿‍♀', + '🤾', + '🤾🏻', + '🤾🏼', + '🤾🏽', + '🤾🏾', + '🤾🏿', + '🤾‍♂️', + '🤾‍♂', + '🤾🏻‍♂️', + '🤾🏻‍♂', + '🤾🏼‍♂️', + '🤾🏼‍♂', + '🤾🏽‍♂️', + '🤾🏽‍♂', + '🤾🏾‍♂️', + '🤾🏾‍♂', + '🤾🏿‍♂️', + '🤾🏿‍♂', + '🤾‍♀️', + '🤾‍♀', + '🤾🏻‍♀️', + '🤾🏻‍♀', + '🤾🏼‍♀️', + '🤾🏼‍♀', + '🤾🏽‍♀️', + '🤾🏽‍♀', + '🤾🏾‍♀️', + '🤾🏾‍♀', + '🤾🏿‍♀️', + '🤾🏿‍♀', + '🤹', + '🤹🏻', + '🤹🏼', + '🤹🏽', + '🤹🏾', + '🤹🏿', + '🤹‍♂️', + '🤹‍♂', + '🤹🏻‍♂️', + '🤹🏻‍♂', + '🤹🏼‍♂️', + '🤹🏼‍♂', + '🤹🏽‍♂️', + '🤹🏽‍♂', + '🤹🏾‍♂️', + '🤹🏾‍♂', + '🤹🏿‍♂️', + '🤹🏿‍♂', + '🤹‍♀️', + '🤹‍♀', + '🤹🏻‍♀️', + '🤹🏻‍♀', + '🤹🏼‍♀️', + '🤹🏼‍♀', + '🤹🏽‍♀️', + '🤹🏽‍♀', + '🤹🏾‍♀️', + '🤹🏾‍♀', + '🤹🏿‍♀️', + '🤹🏿‍♀', + '🧘', + '🧘🏻', + '🧘🏼', + '🧘🏽', + '🧘🏾', + '🧘🏿', + '🧘‍♂️', + '🧘‍♂', + '🧘🏻‍♂️', + '🧘🏻‍♂', + '🧘🏼‍♂️', + '🧘🏼‍♂', + '🧘🏽‍♂️', + '🧘🏽‍♂', + '🧘🏾‍♂️', + '🧘🏾‍♂', + '🧘🏿‍♂️', + '🧘🏿‍♂', + '🧘‍♀️', + '🧘‍♀', + '🧘🏻‍♀️', + '🧘🏻‍♀', + '🧘🏼‍♀️', + '🧘🏼‍♀', + '🧘🏽‍♀️', + '🧘🏽‍♀', + '🧘🏾‍♀️', + '🧘🏾‍♀', + '🧘🏿‍♀️', + '🧘🏿‍♀', + '🛀', + '🛀🏻', + '🛀🏼', + '🛀🏽', + '🛀🏾', + '🛀🏿', + '🛌', + '🛌🏻', + '🛌🏼', + '🛌🏽', + '🛌🏾', + '🛌🏿', + '🧑‍🤝‍🧑', + '🧑🏻‍🤝‍🧑🏻', + '🧑🏼‍🤝‍🧑🏻', + '🧑🏼‍🤝‍🧑🏼', + '🧑🏽‍🤝‍🧑🏻', + '🧑🏽‍🤝‍🧑🏼', + '🧑🏽‍🤝‍🧑🏽', + '🧑🏾‍🤝‍🧑🏻', + '🧑🏾‍🤝‍🧑🏼', + '🧑🏾‍🤝‍🧑🏽', + '🧑🏾‍🤝‍🧑🏾', + '🧑🏿‍🤝‍🧑🏻', + '🧑🏿‍🤝‍🧑🏼', + '🧑🏿‍🤝‍🧑🏽', + '🧑🏿‍🤝‍🧑🏾', + '🧑🏿‍🤝‍🧑🏿', + '👭', + '👭🏻', + '👩🏼‍🤝‍👩🏻', + '👭🏼', + '👩🏽‍🤝‍👩🏻', + '👩🏽‍🤝‍👩🏼', + '👭🏽', + '👩🏾‍🤝‍👩🏻', + '👩🏾‍🤝‍👩🏼', + '👩🏾‍🤝‍👩🏽', + '👭🏾', + '👩🏿‍🤝‍👩🏻', 
+ '👩🏿‍🤝‍👩🏼', + '👩🏿‍🤝‍👩🏽', + '👩🏿‍🤝‍👩🏾', + '👭🏿', + '👫', + '👫🏻', + '👩🏻‍🤝‍👨🏼', + '👩🏻‍🤝‍👨🏽', + '👩🏻‍🤝‍👨🏾', + '👩🏻‍🤝‍👨🏿', + '👩🏼‍🤝‍👨🏻', + '👫🏼', + '👩🏼‍🤝‍👨🏽', + '👩🏼‍🤝‍👨🏾', + '👩🏼‍🤝‍👨🏿', + '👩🏽‍🤝‍👨🏻', + '👩🏽‍🤝‍👨🏼', + '👫🏽', + '👩🏽‍🤝‍👨🏾', + '👩🏽‍🤝‍👨🏿', + '👩🏾‍🤝‍👨🏻', + '👩🏾‍🤝‍👨🏼', + '👩🏾‍🤝‍👨🏽', + '👫🏾', + '👩🏾‍🤝‍👨🏿', + '👩🏿‍🤝‍👨🏻', + '👩🏿‍🤝‍👨🏼', + '👩🏿‍🤝‍👨🏽', + '👩🏿‍🤝‍👨🏾', + '👫🏿', + '👬', + '👬🏻', + '👨🏼‍🤝‍👨🏻', + '👬🏼', + '👨🏽‍🤝‍👨🏻', + '👨🏽‍🤝‍👨🏼', + '👬🏽', + '👨🏾‍🤝‍👨🏻', + '👨🏾‍🤝‍👨🏼', + '👨🏾‍🤝‍👨🏽', + '👬🏾', + '👨🏿‍🤝‍👨🏻', + '👨🏿‍🤝‍👨🏼', + '👨🏿‍🤝‍👨🏽', + '👨🏿‍🤝‍👨🏾', + '👬🏿', + '💏', + '👩‍❤️‍💋‍👨', + '👩‍❤‍💋‍👨', + '👨‍❤️‍💋‍👨', + '👨‍❤‍💋‍👨', + '👩‍❤️‍💋‍👩', + '👩‍❤‍💋‍👩', + '💑', + '👩‍❤️‍👨', + '👩‍❤‍👨', + '👨‍❤️‍👨', + '👨‍❤‍👨', + '👩‍❤️‍👩', + '👩‍❤‍👩', + '👪', + '👨‍👩‍👦', + '👨‍👩‍👧', + '👨‍👩‍👧‍👦', + '👨‍👩‍👦‍👦', + '👨‍👩‍👧‍👧', + '👨‍👨‍👦', + '👨‍👨‍👧', + '👨‍👨‍👧‍👦', + '👨‍👨‍👦‍👦', + '👨‍👨‍👧‍👧', + '👩‍👩‍👦', + '👩‍👩‍👧', + '👩‍👩‍👧‍👦', + '👩‍👩‍👦‍👦', + '👩‍👩‍👧‍👧', + '👨‍👦', + '👨‍👦‍👦', + '👨‍👧', + '👨‍👧‍👦', + '👨‍👧‍👧', + '👩‍👦', + '👩‍👦‍👦', + '👩‍👧', + '👩‍👧‍👦', + '👩‍👧‍👧', + '🗣️', + '🗣', + '👤', + '👥', + '👣', + '🏻', + '🏼', + '🏽', + '🏾', + '🏿', + '🦰', + '🦱', + '🦳', + '🦲', + '🐵', + '🐒', + '🦍', + '🦧', + '🐶', + '🐕', + '🦮', + '🐕‍🦺', + '🐩', + '🐺', + '🦊', + '🦝', + '🐱', + '🐈', + '🦁', + '🐯', + '🐅', + '🐆', + '🐴', + '🐎', + '🦄', + '🦓', + '🦌', + '🐮', + '🐂', + '🐃', + '🐄', + '🐷', + '🐖', + '🐗', + '🐽', + '🐏', + '🐑', + '🐐', + '🐪', + '🐫', + '🦙', + '🦒', + '🐘', + '🦏', + '🦛', + '🐭', + '🐁', + '🐀', + '🐹', + '🐰', + '🐇', + '🐿️', + '🐿', + '🦔', + '🦇', + '🐻', + '🐨', + '🐼', + '🦥', + '🦦', + '🦨', + '🦘', + '🦡', + '🐾', + '🦃', + '🐔', + '🐓', + '🐣', + '🐤', + '🐥', + '🐦', + '🐧', + '🕊️', + '🕊', + '🦅', + '🦆', + '🦢', + '🦉', + '🦩', + '🦚', + '🦜', + '🐸', + '🐊', + '🐢', + '🦎', + '🐍', + '🐲', + '🐉', + '🦕', + '🦖', + '🐳', + '🐋', + '🐬', + '🐟', + '🐠', + '🐡', + '🦈', + '🐙', + '🐚', + '🐌', + '🦋', + '🐛', + '🐜', + '🐝', + '🐞', + '🦗', + '🕷️', + '🕷', + '🕸️', + '🕸', + '🦂', + '🦟', + '🦠', + '💐', + '🌸', + '💮', + '🏵️', + '🏵', + '🌹', + '🥀', + '🌺', + '🌻', + '🌼', + '🌷', + '🌱', + '🌲', + '🌳', + '🌴', + '🌵', + '🌾', + '🌿', + '☘️', + '☘', + '🍀', + '🍁', + '🍂', + '🍃', + '🍇', + '🍈', + '🍉', + '🍊', + '🍋', + '🍌', + '🍍', + '🥭', + '🍎', + '🍏', + '🍐', + '🍑', + '🍒', + '🍓', + '🥝', + '🍅', + '🥥', + '🥑', + '🍆', + '🥔', + '🥕', + '🌽', + '🌶️', + '🌶', + '🥒', + '🥬', + '🥦', + '🧄', + '🧅', + '🍄', + '🥜', + '🌰', + '🍞', + '🥐', + '🥖', + '🥨', + '🥯', + '🥞', + '🧇', + '🧀', + '🍖', + '🍗', + '🥩', + '🥓', + '🍔', + '🍟', + '🍕', + '🌭', + '🥪', + '🌮', + '🌯', + '🥙', + '🧆', + '🥚', + '🍳', + '🥘', + '🍲', + '🥣', + '🥗', + '🍿', + '🧈', + '🧂', + '🥫', + '🍱', + '🍘', + '🍙', + '🍚', + '🍛', + '🍜', + '🍝', + '🍠', + '🍢', + '🍣', + '🍤', + '🍥', + '🥮', + '🍡', + '🥟', + '🥠', + '🥡', + '🦀', + '🦞', + '🦐', + '🦑', + '🦪', + '🍦', + '🍧', + '🍨', + '🍩', + '🍪', + '🎂', + '🍰', + '🧁', + '🥧', + '🍫', + '🍬', + '🍭', + '🍮', + '🍯', + '🍼', + '🥛', + '☕', + '🍵', + '🍶', + '🍾', + '🍷', + '🍸', + '🍹', + '🍺', + '🍻', + '🥂', + '🥃', + '🥤', + '🧃', + '🧉', + '🧊', + '🥢', + '🍽️', + '🍽', + '🍴', + '🥄', + '🔪', + '🏺', + '🌍', + '🌎', + '🌏', + '🌐', + '🗺️', + '🗺', + '🗾', + '🧭', + '🏔️', + '🏔', + '⛰️', + '⛰', + '🌋', + '🗻', + '🏕️', + '🏕', + '🏖️', + '🏖', + '🏜️', + '🏜', + '🏝️', + '🏝', + '🏞️', + '🏞', + '🏟️', + '🏟', + '🏛️', + '🏛', + '🏗️', + '🏗', + '🧱', + '🏘️', + '🏘', + '🏚️', + '🏚', + '🏠', + '🏡', + '🏢', + '🏣', + '🏤', + '🏥', + '🏦', + '🏨', + '🏩', + '🏪', + '🏫', + '🏬', + '🏭', + '🏯', + '🏰', + '💒', + '🗼', + '🗽', + '⛪', + '🕌', + '🛕', + '🕍', + '⛩️', + '⛩', + '🕋', + '⛲', + '⛺', + '🌁', + '🌃', + '🏙️', + '🏙', + '🌄', + '🌅', + '🌆', + '🌇', + '🌉', + '♨️', + '♨', + '🎠', + '🎡', + '🎢', + '💈', + '🎪', + '🚂', + '🚃', + '🚄', + '🚅', + '🚆', + '🚇', + '🚈', + '🚉', + '🚊', + '🚝', + '🚞', + '🚋', + '🚌', + '🚍', + '🚎', 
+ '🚐', + '🚑', + '🚒', + '🚓', + '🚔', + '🚕', + '🚖', + '🚗', + '🚘', + '🚙', + '🚚', + '🚛', + '🚜', + '🏎️', + '🏎', + '🏍️', + '🏍', + '🛵', + '🦽', + '🦼', + '🛺', + '🚲', + '🛴', + '🛹', + '🚏', + '🛣️', + '🛣', + '🛤️', + '🛤', + '🛢️', + '🛢', + '⛽', + '🚨', + '🚥', + '🚦', + '🛑', + '🚧', + '⚓', + '⛵', + '🛶', + '🚤', + '🛳️', + '🛳', + '⛴️', + '⛴', + '🛥️', + '🛥', + '🚢', + '✈️', + '✈', + '🛩️', + '🛩', + '🛫', + '🛬', + '🪂', + '💺', + '🚁', + '🚟', + '🚠', + '🚡', + '🛰️', + '🛰', + '🚀', + '🛸', + '🛎️', + '🛎', + '🧳', + '⌛', + '⏳', + '⌚', + '⏰', + '⏱️', + '⏱', + '⏲️', + '⏲', + '🕰️', + '🕰', + '🕛', + '🕧', + '🕐', + '🕜', + '🕑', + '🕝', + '🕒', + '🕞', + '🕓', + '🕟', + '🕔', + '🕠', + '🕕', + '🕡', + '🕖', + '🕢', + '🕗', + '🕣', + '🕘', + '🕤', + '🕙', + '🕥', + '🕚', + '🕦', + '🌑', + '🌒', + '🌓', + '🌔', + '🌕', + '🌖', + '🌗', + '🌘', + '🌙', + '🌚', + '🌛', + '🌜', + '🌡️', + '🌡', + '☀️', + '☀', + '🌝', + '🌞', + '🪐', + '⭐', + '🌟', + '🌠', + '🌌', + '☁️', + '☁', + '⛅', + '⛈️', + '⛈', + '🌤️', + '🌤', + '🌥️', + '🌥', + '🌦️', + '🌦', + '🌧️', + '🌧', + '🌨️', + '🌨', + '🌩️', + '🌩', + '🌪️', + '🌪', + '🌫️', + '🌫', + '🌬️', + '🌬', + '🌀', + '🌈', + '🌂', + '☂️', + '☂', + '☔', + '⛱️', + '⛱', + '⚡', + '❄️', + '❄', + '☃️', + '☃', + '⛄', + '☄️', + '☄', + '🔥', + '💧', + '🌊', + '🎃', + '🎄', + '🎆', + '🎇', + '🧨', + '✨', + '🎈', + '🎉', + '🎊', + '🎋', + '🎍', + '🎎', + '🎏', + '🎐', + '🎑', + '🧧', + '🎀', + '🎁', + '🎗️', + '🎗', + '🎟️', + '🎟', + '🎫', + '🎖️', + '🎖', + '🏆', + '🏅', + '🥇', + '🥈', + '🥉', + '⚽', + '⚾', + '🥎', + '🏀', + '🏐', + '🏈', + '🏉', + '🎾', + '🥏', + '🎳', + '🏏', + '🏑', + '🏒', + '🥍', + '🏓', + '🏸', + '🥊', + '🥋', + '🥅', + '⛳', + '⛸️', + '⛸', + '🎣', + '🤿', + '🎽', + '🎿', + '🛷', + '🥌', + '🎯', + '🪀', + '🪁', + '🎱', + '🔮', + '🧿', + '🎮', + '🕹️', + '🕹', + '🎰', + '🎲', + '🧩', + '🧸', + '♠️', + '♠', + '♥️', + '♥', + '♦️', + '♦', + '♣️', + '♣', + '♟️', + '♟', + '🃏', + '🀄', + '🎴', + '🎭', + '🖼️', + '🖼', + '🎨', + '🧵', + '🧶', + '👓', + '🕶️', + '🕶', + '🥽', + '🥼', + '🦺', + '👔', + '👕', + '👖', + '🧣', + '🧤', + '🧥', + '🧦', + '👗', + '👘', + '🥻', + '🩱', + '🩲', + '🩳', + '👙', + '👚', + '👛', + '👜', + '👝', + '🛍️', + '🛍', + '🎒', + '👞', + '👟', + '🥾', + '🥿', + '👠', + '👡', + '🩰', + '👢', + '👑', + '👒', + '🎩', + '🎓', + '🧢', + '⛑️', + '⛑', + '📿', + '💄', + '💍', + '💎', + '🔇', + '🔈', + '🔉', + '🔊', + '📢', + '📣', + '📯', + '🔔', + '🔕', + '🎼', + '🎵', + '🎶', + '🎙️', + '🎙', + '🎚️', + '🎚', + '🎛️', + '🎛', + '🎤', + '🎧', + '📻', + '🎷', + '🎸', + '🎹', + '🎺', + '🎻', + '🪕', + '🥁', + '📱', + '📲', + '☎️', + '☎', + '📞', + '📟', + '📠', + '🔋', + '🔌', + '💻', + '🖥️', + '🖥', + '🖨️', + '🖨', + '⌨️', + '⌨', + '🖱️', + '🖱', + '🖲️', + '🖲', + '💽', + '💾', + '💿', + '📀', + '🧮', + '🎥', + '🎞️', + '🎞', + '📽️', + '📽', + '🎬', + '📺', + '📷', + '📸', + '📹', + '📼', + '🔍', + '🔎', + '🕯️', + '🕯', + '💡', + '🔦', + '🏮', + '🪔', + '📔', + '📕', + '📖', + '📗', + '📘', + '📙', + '📚', + '📓', + '📒', + '📃', + '📜', + '📄', + '📰', + '🗞️', + '🗞', + '📑', + '🔖', + '🏷️', + '🏷', + '💰', + '💴', + '💵', + '💶', + '💷', + '💸', + '💳', + '🧾', + '💹', + '💱', + '💲', + '✉️', + '✉', + '📧', + '📨', + '📩', + '📤', + '📥', + '📦', + '📫', + '📪', + '📬', + '📭', + '📮', + '🗳️', + '🗳', + '✏️', + '✏', + '✒️', + '✒', + '🖋️', + '🖋', + '🖊️', + '🖊', + '🖌️', + '🖌', + '🖍️', + '🖍', + '📝', + '💼', + '📁', + '📂', + '🗂️', + '🗂', + '📅', + '📆', + '🗒️', + '🗒', + '🗓️', + '🗓', + '📇', + '📈', + '📉', + '📊', + '📋', + '📌', + '📍', + '📎', + '🖇️', + '🖇', + '📏', + '📐', + '✂️', + '✂', + '🗃️', + '🗃', + '🗄️', + '🗄', + '🗑️', + '🗑', + '🔒', + '🔓', + '🔏', + '🔐', + '🔑', + '🗝️', + '🗝', + '🔨', + '🪓', + '⛏️', + '⛏', + '⚒️', + '⚒', + '🛠️', + '🛠', + '🗡️', + '🗡', + '⚔️', + '⚔', + '🔫', + '🏹', + '🛡️', + '🛡', + '🔧', + '🔩', + '⚙️', + '⚙', + '🗜️', + '🗜', + '⚖️', + '⚖', + '🦯', + 
'🔗', + '⛓️', + '⛓', + '🧰', + '🧲', + '⚗️', + '⚗', + '🧪', + '🧫', + '🧬', + '🔬', + '🔭', + '📡', + '💉', + '🩸', + '💊', + '🩹', + '🩺', + '🚪', + '🛏️', + '🛏', + '🛋️', + '🛋', + '🪑', + '🚽', + '🚿', + '🛁', + '🪒', + '🧴', + '🧷', + '🧹', + '🧺', + '🧻', + '🧼', + '🧽', + '🧯', + '🛒', + '🚬', + '⚰️', + '⚰', + '⚱️', + '⚱', + '🗿', + '🏧', + '🚮', + '🚰', + '♿', + '🚹', + '🚺', + '🚻', + '🚼', + '🚾', + '🛂', + '🛃', + '🛄', + '🛅', + '⚠️', + '⚠', + '🚸', + '⛔', + '🚫', + '🚳', + '🚭', + '🚯', + '🚱', + '🚷', + '📵', + '🔞', + '☢️', + '☢', + '☣️', + '☣', + '⬆️', + '⬆', + '↗️', + '↗', + '➡️', + '➡', + '↘️', + '↘', + '⬇️', + '⬇', + '↙️', + '↙', + '⬅️', + '⬅', + '↖️', + '↖', + '↕️', + '↕', + '↔️', + '↔', + '↩️', + '↩', + '↪️', + '↪', + '⤴️', + '⤴', + '⤵️', + '⤵', + '🔃', + '🔄', + '🔙', + '🔚', + '🔛', + '🔜', + '🔝', + '🛐', + '⚛️', + '⚛', + '🕉️', + '🕉', + '✡️', + '✡', + '☸️', + '☸', + '☯️', + '☯', + '✝️', + '✝', + '☦️', + '☦', + '☪️', + '☪', + '☮️', + '☮', + '🕎', + '🔯', + '♈', + '♉', + '♊', + '♋', + '♌', + '♍', + '♎', + '♏', + '♐', + '♑', + '♒', + '♓', + '⛎', + '🔀', + '🔁', + '🔂', + '▶️', + '▶', + '⏩', + '⏭️', + '⏭', + '⏯️', + '⏯', + '◀️', + '◀', + '⏪', + '⏮️', + '⏮', + '🔼', + '⏫', + '🔽', + '⏬', + '⏸️', + '⏸', + '⏹️', + '⏹', + '⏺️', + '⏺', + '⏏️', + '⏏', + '🎦', + '🔅', + '🔆', + '📶', + '📳', + '📴', + '♀️', + '♀', + '♂️', + '♂', + '⚕️', + '⚕', + '♾️', + '♾', + '♻️', + '♻', + '⚜️', + '⚜', + '🔱', + '📛', + '🔰', + '⭕', + '✅', + '☑️', + '☑', + '✔️', + '✔', + '✖️', + '✖', + '❌', + '❎', + '➕', + '➖', + '➗', + '➰', + '➿', + '〽️', + '〽', + '✳️', + '✳', + '✴️', + '✴', + '❇️', + '❇', + '‼️', + '‼', + '⁉️', + '⁉', + '❓', + '❔', + '❕', + '❗', + '〰️', + '〰', + '©️', + '©', + '®️', + '®', + '™️', + '™', + '#️⃣', + '#⃣', + '*️⃣', + '*⃣', + '0️⃣', + '0⃣', + '1️⃣', + '1⃣', + '2️⃣', + '2⃣', + '3️⃣', + '3⃣', + '4️⃣', + '4⃣', + '5️⃣', + '5⃣', + '6️⃣', + '6⃣', + '7️⃣', + '7⃣', + '8️⃣', + '8⃣', + '9️⃣', + '9⃣', + '🔟', + '🔠', + '🔡', + '🔢', + '🔣', + '🔤', + '🅰️', + '🅰', + '🆎', + '🅱️', + '🅱', + '🆑', + '🆒', + '🆓', + 'ℹ️', + 'ℹ', + '🆔', + 'Ⓜ️', + 'Ⓜ', + '🆕', + '🆖', + '🅾️', + '🅾', + '🆗', + '🅿️', + '🅿', + '🆘', + '🆙', + '🆚', + '🈁', + '🈂️', + '🈂', + '🈷️', + '🈷', + '🈶', + '🈯', + '🉐', + '🈹', + '🈚', + '🈲', + '🉑', + '🈸', + '🈴', + '🈳', + '㊗️', + '㊗', + '㊙️', + '㊙', + '🈺', + '🈵', + '🔴', + '🟠', + '🟡', + '🟢', + '🔵', + '🟣', + '🟤', + '⚫', + '⚪', + '🟥', + '🟧', + '🟨', + '🟩', + '🟦', + '🟪', + '🟫', + '⬛', + '⬜', + '◼️', + '◼', + '◻️', + '◻', + '◾', + '◽', + '▪️', + '▪', + '▫️', + '▫', + '🔶', + '🔷', + '🔸', + '🔹', + '🔺', + '🔻', + '💠', + '🔘', + '🔳', + '🔲', + '🏁', + '🚩', + '🎌', + '🏴', + '🏳️', + '🏳', + '🏳️‍🌈', + '🏳‍🌈', + '🏴‍☠️', + '🏴‍☠', + '🇦🇨', + '🇦🇩', + '🇦🇪', + '🇦🇫', + '🇦🇬', + '🇦🇮', + '🇦🇱', + '🇦🇲', + '🇦🇴', + '🇦🇶', + '🇦🇷', + '🇦🇸', + '🇦🇹', + '🇦🇺', + '🇦🇼', + '🇦🇽', + '🇦🇿', + '🇧🇦', + '🇧🇧', + '🇧🇩', + '🇧🇪', + '🇧🇫', + '🇧🇬', + '🇧🇭', + '🇧🇮', + '🇧🇯', + '🇧🇱', + '🇧🇲', + '🇧🇳', + '🇧🇴', + '🇧🇶', + '🇧🇷', + '🇧🇸', + '🇧🇹', + '🇧🇻', + '🇧🇼', + '🇧🇾', + '🇧🇿', + '🇨🇦', + '🇨🇨', + '🇨🇩', + '🇨🇫', + '🇨🇬', + '🇨🇭', + '🇨🇮', + '🇨🇰', + '🇨🇱', + '🇨🇲', + '🇨🇳', + '🇨🇴', + '🇨🇵', + '🇨🇷', + '🇨🇺', + '🇨🇻', + '🇨🇼', + '🇨🇽', + '🇨🇾', + '🇨🇿', + '🇩🇪', + '🇩🇬', + '🇩🇯', + '🇩🇰', + '🇩🇲', + '🇩🇴', + '🇩🇿', + '🇪🇦', + '🇪🇨', + '🇪🇪', + '🇪🇬', + '🇪🇭', + '🇪🇷', + '🇪🇸', + '🇪🇹', + '🇪🇺', + '🇫🇮', + '🇫🇯', + '🇫🇰', + '🇫🇲', + '🇫🇴', + '🇫🇷', + '🇬🇦', + '🇬🇧', + '🇬🇩', + '🇬🇪', + '🇬🇫', + '🇬🇬', + '🇬🇭', + '🇬🇮', + '🇬🇱', + '🇬🇲', + '🇬🇳', + '🇬🇵', + '🇬🇶', + '🇬🇷', + '🇬🇸', + '🇬🇹', + '🇬🇺', + '🇬🇼', + '🇬🇾', + '🇭🇰', + '🇭🇲', + '🇭🇳', + '🇭🇷', + '🇭🇹', + '🇭🇺', + '🇮🇨', + '🇮🇩', + '🇮🇪', + '🇮🇱', + '🇮🇲', + '🇮🇳', + '🇮🇴', + '🇮🇶', + '🇮🇷', + '🇮🇸', + '🇮🇹', + '🇯🇪', + '🇯🇲', + '🇯🇴', + '🇯🇵', + '🇰🇪', + '🇰🇬', + '🇰🇭', + 
'🇰🇮', + '🇰🇲', + '🇰🇳', + '🇰🇵', + '🇰🇷', + '🇰🇼', + '🇰🇾', + '🇰🇿', + '🇱🇦', + '🇱🇧', + '🇱🇨', + '🇱🇮', + '🇱🇰', + '🇱🇷', + '🇱🇸', + '🇱🇹', + '🇱🇺', + '🇱🇻', + '🇱🇾', + '🇲🇦', + '🇲🇨', + '🇲🇩', + '🇲🇪', + '🇲🇫', + '🇲🇬', + '🇲🇭', + '🇲🇰', + '🇲🇱', + '🇲🇲', + '🇲🇳', + '🇲🇴', + '🇲🇵', + '🇲🇶', + '🇲🇷', + '🇲🇸', + '🇲🇹', + '🇲🇺', + '🇲🇻', + '🇲🇼', + '🇲🇽', + '🇲🇾', + '🇲🇿', + '🇳🇦', + '🇳🇨', + '🇳🇪', + '🇳🇫', + '🇳🇬', + '🇳🇮', + '🇳🇱', + '🇳🇴', + '🇳🇵', + '🇳🇷', + '🇳🇺', + '🇳🇿', + '🇴🇲', + '🇵🇦', + '🇵🇪', + '🇵🇫', + '🇵🇬', + '🇵🇭', + '🇵🇰', + '🇵🇱', + '🇵🇲', + '🇵🇳', + '🇵🇷', + '🇵🇸', + '🇵🇹', + '🇵🇼', + '🇵🇾', + '🇶🇦', + '🇷🇪', + '🇷🇴', + '🇷🇸', + '🇷🇺', + '🇷🇼', + '🇸🇦', + '🇸🇧', + '🇸🇨', + '🇸🇩', + '🇸🇪', + '🇸🇬', + '🇸🇭', + '🇸🇮', + '🇸🇯', + '🇸🇰', + '🇸🇱', + '🇸🇲', + '🇸🇳', + '🇸🇴', + '🇸🇷', + '🇸🇸', + '🇸🇹', + '🇸🇻', + '🇸🇽', + '🇸🇾', + '🇸🇿', + '🇹🇦', + '🇹🇨', + '🇹🇩', + '🇹🇫', + '🇹🇬', + '🇹🇭', + '🇹🇯', + '🇹🇰', + '🇹🇱', + '🇹🇲', + '🇹🇳', + '🇹🇴', + '🇹🇷', + '🇹🇹', + '🇹🇻', + '🇹🇼', + '🇹🇿', + '🇺🇦', + '🇺🇬', + '🇺🇲', + '🇺🇳', + '🇺🇸', + '🇺🇾', + '🇺🇿', + '🇻🇦', + '🇻🇨', + '🇻🇪', + '🇻🇬', + '🇻🇮', + '🇻🇳', + '🇻🇺', + '🇼🇫', + '🇼🇸', + '🇽🇰', + '🇾🇪', + '🇾🇹', + '🇿🇦', + '🇿🇲', + '🇿🇼', + '🏴󠁧󠁢󠁥󠁮󠁧󠁿', + '🏴󠁧󠁢󠁳󠁣󠁴󠁿', + '🏴󠁧󠁢󠁷󠁬󠁳󠁿' + ] + } + } + }, + 'additionalProperties': false +} \ No newline at end of file diff --git a/crawler/json-schemas/vote.js b/crawler/json-schemas/vote.js new file mode 100644 index 00000000..74b17aeb --- /dev/null +++ b/crawler/json-schemas/vote.js @@ -0,0 +1,36 @@ +module.exports = { + '$schema': 'http://json-schema.org/draft-07/schema#', + '$id': 'dat://unwalled.garden/vote.json', + 'type': 'object', + 'title': 'Vote', + 'description': 'A vote up or down on some resource.', + 'required': [ + 'type', + 'topic', + 'vote', + 'createdAt' + ], + 'properties': { + 'type': { + 'type': 'string', + 'description': "The object's type", + 'const': 'unwalled.garden/vote' + }, + 'topic': { + 'type': 'string', + 'format': 'uri' + }, + 'vote': { + 'type': 'number', + 'enum': [-1, 1] + }, + 'createdAt': { + 'type': 'string', + 'format': 'date-time', + }, + 'updatedAt': { + 'type': 'string', + 'format': 'date-time' + } + } +} \ No newline at end of file diff --git a/crawler/media.js b/crawler/media.js new file mode 100644 index 00000000..7d018a7a --- /dev/null +++ b/crawler/media.js @@ -0,0 +1,450 @@ +const assert = require('assert') +const {URL} = require('url') +const Events = require('events') +const Ajv = require('ajv') +const logger = require('../logger').child({category: 'crawler', dataset: 'media'}) +const db = require('../dbs/profile-data-db') +const crawler = require('./index') +const lock = require('../lib/lock') +const knex = require('../lib/knex') +const siteDescriptions = require('./site-descriptions') +const {doCrawl, doCheckpoint, emitProgressEvent, getMatchingChangesInOrder, generateTimeFilename, ensureDirectory, normalizeSchemaUrl, toOrigin} = require('./util') +const mediaSchema = require('./json-schemas/media') + +// constants +// = + +const TABLE_VERSION = 1 +const JSON_TYPE = 'unwalled.garden/media' +const JSON_PATH_REGEX = /^\/data\/media\/([^/]+)\.json$/i + +// typedefs +// = + +/** + * @typedef {import('../dat/library').InternalDatArchive} InternalDatArchive + * @typedef {import('./util').CrawlSourceRecord} CrawlSourceRecord + * @typedef { import("./site-descriptions").SiteDescription } SiteDescription + * + * @typedef {Object} Media + * @prop {string} pathname + * @prop {string} subtype + * @prop {string} href + * @prop {string} title + * @prop {string} description + * @prop {string[]} tags + * @prop {string} createdAt + * @prop {string} updatedAt + * @prop {SiteDescription} author + * 
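For reference, a record that satisfies the vote schema above needs only four fields:

```js
// Example unwalled.garden/vote record passing the schema above;
// the topic URL is hypothetical.
const vote = {
  type: 'unwalled.garden/vote',
  topic: 'dat://alice.com/posts/0.json',
  vote: 1, // only -1 or 1 are allowed
  createdAt: (new Date()).toISOString()
}
```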
@prop {string} visibility + */ + +// globals +// = + +const events = new Events() +const ajv = (new Ajv()) +const validateMedia = ajv.compile(mediaSchema) + +// exported api +// = + +exports.on = events.on.bind(events) +exports.addListener = events.addListener.bind(events) +exports.removeListener = events.removeListener.bind(events) + +/** + * @description + * Crawl the given site for media. + * + * @param {InternalDatArchive} archive - site to crawl. + * @param {CrawlSourceRecord} crawlSource - internal metadata about the crawl target. + * @returns {Promise} + */ +exports.crawlSite = async function (archive, crawlSource) { + return doCrawl(archive, crawlSource, 'crawl_media', TABLE_VERSION, async ({changes, resetRequired}) => { + const supressEvents = resetRequired === true // dont emit when replaying old info + logger.silly('Crawling media', {details: {url: archive.url, numChanges: changes.length, resetRequired}}) + if (resetRequired) { + // reset all data + logger.debug('Resetting dataset', {details: {url: archive.url}}) + await db.run(` + DELETE FROM crawl_media WHERE crawlSourceId = ? + `, [crawlSource.id]) + await doCheckpoint('crawl_media', TABLE_VERSION, crawlSource, 0) + } + + // collect changed media + var changedMedia = getMatchingChangesInOrder(changes, JSON_PATH_REGEX) + if (changedMedia.length) { + logger.verbose('Collected new/changed media files', {details: {url: archive.url, changedMedia: changedMedia.map(p => p.name)}}) + } else { + logger.debug('No new media-files found', {details: {url: archive.url}}) + } + emitProgressEvent(archive.url, 'crawl_media', 0, changedMedia.length) + + // read and apply each media in order + var progress = 0 + for (let changedMediaItem of changedMedia) { + // TODO Currently the crawler will abort reading the feed if any media fails to load + // this means that a single unreachable file can stop the forward progress of media indexing + // to solve this, we need to find a way to tolerate unreachable media-files without losing our ability to efficiently detect new media + // -prf + if (changedMediaItem.type === 'del') { + // delete + await db.run(` + DELETE FROM crawl_media WHERE crawlSourceId = ? AND pathname = ? 
+ `, [crawlSource.id, changedMediaItem.name]) + events.emit('media-removed', archive.url) + } else { + // read + let mediaString + try { + mediaString = await archive.pda.readFile(changedMediaItem.name, 'utf8') + } catch (err) { + logger.warn('Failed to read media file, aborting', {details: {url: archive.url, name: changedMediaItem.name, err}}) + return // abort indexing + } + + // parse and validate + let media + try { + media = JSON.parse(mediaString) + let valid = validateMedia(media) + if (!valid) throw ajv.errorsText(validateMedia.errors) + } catch (err) { + logger.warn('Failed to parse media file, skipping', {details: {url: archive.url, name: changedMediaItem.name, err}}) + continue // skip + } + + // massage the media + media.subtype = normalizeSchemaUrl(media.subtype) + media.createdAt = Number(new Date(media.createdAt)) + media.updatedAt = Number(new Date(media.updatedAt)) + if (!media.description) media.description = '' // optional + if (!media.tags) media.tags = [] // optional + if (isNaN(media.updatedAt)) media.updatedAt = 0 // optional + + // upsert + let mediaId = 0 + let existingMedia = await db.get(knex('crawl_media') + .select('id') + .where({ + crawlSourceId: crawlSource.id, + pathname: changedMediaItem.name + }) + ) + if (existingMedia) { + await db.run(knex('crawl_media') + .where({ + crawlSourceId: crawlSource.id, + pathname: changedMediaItem.name + }).update({ + crawledAt: Date.now(), + subtype: media.subtype, + href: media.href, + title: media.title, + description: media.description, + createdAt: media.createdAt, + updatedAt: media.updatedAt, + }) + ) + mediaId = existingMedia.id + events.emit('media-updated', archive.url) + } else { + let res = await db.run(knex('crawl_media') + .insert({ + crawlSourceId: crawlSource.id, + pathname: changedMediaItem.name, + crawledAt: Date.now(), + subtype: media.subtype, + href: media.href, + title: media.title, + description: media.description, + createdAt: media.createdAt, + updatedAt: media.updatedAt, + }) + ) + mediaId = +res.lastID + events.emit('media-added', archive.url) + } + await db.run(`DELETE FROM crawl_media_tags WHERE crawlMediaId = ?`, [mediaId]) + for (let tag of media.tags) { + await db.run(`INSERT OR IGNORE INTO crawl_tags (tag) VALUES (?)`, [tag]) + let tagRow = await db.get(`SELECT id FROM crawl_tags WHERE tag = ?`, [tag]) + await db.run(`INSERT INTO crawl_media_tags (crawlMediaId, crawlTagId) VALUES (?, ?)`, [mediaId, tagRow.id]) + } + } + + // checkpoint our progress + await doCheckpoint('crawl_media', TABLE_VERSION, crawlSource, changedMediaItem.version) + emitProgressEvent(archive.url, 'crawl_media', ++progress, changedMedia.length) + } + logger.silly(`Finished crawling media`, {details: {url: archive.url}}) + }) +} + +/** + * @description + * List crawled media. 
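The tag handling in the upsert above stores each tag once in the shared `crawl_tags` table and joins it to media rows through `crawl_media_tags`; distilled into a helper for clarity (a sketch, not part of the diff):

```js
// Sketch of the tag-linking step in crawlSite() above.
async function linkTags (db, mediaId, tags) {
  // drop existing links, then re-create one link per tag
  await db.run(`DELETE FROM crawl_media_tags WHERE crawlMediaId = ?`, [mediaId])
  for (let tag of tags) {
    await db.run(`INSERT OR IGNORE INTO crawl_tags (tag) VALUES (?)`, [tag])
    let tagRow = await db.get(`SELECT id FROM crawl_tags WHERE tag = ?`, [tag])
    await db.run(`INSERT INTO crawl_media_tags (crawlMediaId, crawlTagId) VALUES (?, ?)`, [mediaId, tagRow.id])
  }
}
```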
+ *
+ * @param {Object} [opts]
+ * @param {Object} [opts.filters]
+ * @param {string|string[]} [opts.filters.authors]
+ * @param {string|string[]} [opts.filters.hrefs]
+ * @param {string|string[]} [opts.filters.subtypes]
+ * @param {string|string[]} [opts.filters.tags]
+ * @param {string} [opts.filters.visibility]
+ * @param {string} [opts.sortBy]
+ * @param {number} [opts.offset=0]
+ * @param {number} [opts.limit]
+ * @param {boolean} [opts.reverse]
+ * @returns {Promise<Array<Media>>}
+ */
+exports.list = async function (opts) {
+  // TODO: handle visibility
+  // TODO: sortBy options
+
+  // validate & parse params
+  if (opts && 'sortBy' in opts) assert(typeof opts.sortBy === 'string', 'SortBy must be a string')
+  if (opts && 'offset' in opts) assert(typeof opts.offset === 'number', 'Offset must be a number')
+  if (opts && 'limit' in opts) assert(typeof opts.limit === 'number', 'Limit must be a number')
+  if (opts && 'reverse' in opts) assert(typeof opts.reverse === 'boolean', 'Reverse must be a boolean')
+  if (opts && opts.filters) {
+    if ('authors' in opts.filters) {
+      if (Array.isArray(opts.filters.authors)) {
+        assert(opts.filters.authors.every(v => typeof v === 'string'), 'Authors filter must be a string or array of strings')
+      } else {
+        assert(typeof opts.filters.authors === 'string', 'Authors filter must be a string or array of strings')
+        opts.filters.authors = [opts.filters.authors]
+      }
+      opts.filters.authors = opts.filters.authors.map(url => toOrigin(url, true))
+    }
+    if ('hrefs' in opts.filters) {
+      if (Array.isArray(opts.filters.hrefs)) {
+        assert(opts.filters.hrefs.every(v => typeof v === 'string'), 'Hrefs filter must be a string or array of strings')
+      } else {
+        assert(typeof opts.filters.hrefs === 'string', 'Hrefs filter must be a string or array of strings')
+        opts.filters.hrefs = [opts.filters.hrefs]
+      }
+    }
+    if ('subtypes' in opts.filters) {
+      if (Array.isArray(opts.filters.subtypes)) {
+        assert(opts.filters.subtypes.every(v => typeof v === 'string'), 'Subtypes filter must be a string or array of strings')
+      } else {
+        assert(typeof opts.filters.subtypes === 'string', 'Subtypes filter must be a string or array of strings')
+        opts.filters.subtypes = [opts.filters.subtypes]
+      }
+      opts.filters.subtypes = opts.filters.subtypes.map(normalizeSchemaUrl)
+    }
+    if ('tags' in opts.filters) {
+      if (Array.isArray(opts.filters.tags)) {
+        assert(opts.filters.tags.every(v => typeof v === 'string'), 'Tags filter must be a string or array of strings')
+      } else {
+        assert(typeof opts.filters.tags === 'string', 'Tags filter must be a string or array of strings')
+        opts.filters.tags = [opts.filters.tags]
+      }
+    }
+  }
+
+  // build query
+  var sql = knex('crawl_media')
+    .select('crawl_media.*')
+    .select('crawl_sources.url as crawlSourceUrl')
+    .select(knex.raw('group_concat(crawl_tags.tag, ",") as tags'))
+    .innerJoin('crawl_sources', 'crawl_sources.id', '=', 'crawl_media.crawlSourceId')
+    .leftJoin('crawl_media_tags', 'crawl_media_tags.crawlMediaId', '=', 'crawl_media.id')
+    .leftJoin('crawl_tags', 'crawl_media_tags.crawlTagId', '=', 'crawl_tags.id')
+    .groupBy('crawl_media.id')
+    .orderBy('crawl_media.createdAt', opts.reverse ?
'DESC' : 'ASC') + if (opts && opts.filters && opts.filters.authors) { + sql = sql.whereIn('crawl_sources.url', opts.filters.authors) + } + if (opts && opts.filters && opts.filters.hrefs) { + sql = sql.whereIn('crawl_media.href', opts.filters.hrefs) + } + if (opts && opts.filters && opts.filters.subtypes) { + sql = sql.whereIn('crawl_media.subtype', opts.filters.subtypes) + } + if (opts && opts.limit) sql = sql.limit(opts.limit) + if (opts && opts.offset) sql = sql.offset(opts.offset) + + // execute query + var rows = await db.all(sql) + var media = await Promise.all(rows.map(massageMediaRow)) + + // apply tags filter + if (opts && opts.filters && opts.filters.tags) { + const someFn = t => opts.filters.tags.includes(t) + media = media.filter(m => m.tags.some(someFn)) + } + + return media +} + +/** + * @description + * Get crawled media. + * + * @param {string} url - The URL of the media + * @returns {Promise} + */ +const get = exports.get = async function (url) { + // validate & parse params + var urlParsed + if (url) { + try { urlParsed = new URL(url) } + catch (e) { throw new Error('Invalid URL: ' + url) } + } + + // build query + var sql = knex('crawl_media') + .select('crawl_media.*') + .select('crawl_sources.url as crawlSourceUrl') + .select(knex.raw('group_concat(crawl_tags.tag, ",") as tags')) + .innerJoin('crawl_sources', function () { + this.on('crawl_sources.id', '=', 'crawl_media.crawlSourceId') + .andOn('crawl_sources.url', '=', knex.raw('?', `${urlParsed.protocol}//${urlParsed.hostname}`)) + }) + .leftJoin('crawl_media_tags', 'crawl_media_tags.crawlMediaId', '=', 'crawl_media.id') + .leftJoin('crawl_tags', 'crawl_tags.id', '=', 'crawl_media_tags.crawlTagId') + .where('crawl_media.pathname', urlParsed.pathname) + .groupBy('crawl_media.id') + + // execute query + return await massageMediaRow(await db.get(sql)) +} + +/** + * @description + * Create a new media. + * + * @param {InternalDatArchive} archive - where to write the media to. + * @param {Object} media + * @param {string} media.subtype + * @param {string} media.href + * @param {string} media.title + * @param {string} media.description + * @param {string[]} media.tags + * @param {string} media.visibility + * @returns {Promise} url + */ +exports.add = async function (archive, media) { + // TODO visibility + + var mediaObject = { + type: JSON_TYPE, + subtype: normalizeSchemaUrl(media.subtype), + href: media.href, + title: media.title, + description: media.description, + tags: media.tags, + createdAt: (new Date()).toISOString() + } + var valid = validateMedia(mediaObject) + if (!valid) throw ajv.errorsText(validateMedia.errors) + + var filename = generateTimeFilename() + var filepath = `/data/media/${filename}.json` + await ensureDirectory(archive, '/data') + await ensureDirectory(archive, '/data/media') + await archive.pda.writeFile(filepath, JSON.stringify(mediaObject, null, 2)) + await crawler.crawlSite(archive) + return archive.url + filepath +} + +/** + * @description + * Update the content of an existing media. + * + * @param {InternalDatArchive} archive - where to write the media to. + * @param {string} pathname - the pathname of the media. 
+ * @param {Object} media + * @param {string} [media.subtype] + * @param {string} [media.href] + * @param {string} [media.title] + * @param {string} [media.description] + * @param {string[]} [media.tags] + * @param {string} [media.visibility] + * @returns {Promise} + */ +exports.edit = async function (archive, pathname, media) { + // TODO visibility + + var release = await lock('crawler:media:' + archive.url) + try { + // fetch media + var existingMedia = await get(archive.url + pathname) + if (!existingMedia) throw new Error('Media not found') + + // update media content + var mediaObject = { + type: JSON_TYPE, + subtype: normalizeSchemaUrl(('subtype' in media) ? media.subtype : existingMedia.subtype), + href: ('href' in media) ? media.href : existingMedia.href, + title: ('title' in media) ? media.title : existingMedia.title, + description: ('description' in media) ? media.description : existingMedia.description, + tags: ('tags' in media) ? media.tags : existingMedia.tags, + createdAt: existingMedia.createdAt, + updatedAt: (new Date()).toISOString() + } + + // validate + var valid = validateMedia(mediaObject) + if (!valid) throw ajv.errorsText(validateMedia.errors) + + // write + await archive.pda.writeFile(pathname, JSON.stringify(mediaObject, null, 2)) + await crawler.crawlSite(archive) + } finally { + release() + } +} + +/** + * @description + * Delete an existing media + * + * @param {InternalDatArchive} archive - where to write the media to. + * @param {string} pathname - the pathname of the media. + * @returns {Promise} + */ +exports.remove = async function (archive, pathname) { + assert(typeof pathname === 'string', 'Remove() must be provided a valid URL string') + await archive.pda.unlink(pathname) + await crawler.crawlSite(archive) +} + +// internal methods +// = + +/** + * @param {Object} row + * @returns {Promise} + */ +async function massageMediaRow (row) { + if (!row) return null + var author = await siteDescriptions.getBest({subject: row.crawlSourceUrl}) + if (!author) { + author = { + url: row.crawlSourceUrl, + title: '', + description: '', + type: [], + thumbUrl: `${row.crawlSourceUrl}/thumb`, + descAuthor: {url: null} + } + } + return { + pathname: row.pathname, + author, + subtype: row.subtype, + href: row.href, + title: row.title, + description: row.description, + tags: row.tags ? row.tags.split(',').filter(Boolean) : [], + createdAt: new Date(row.createdAt).toISOString(), + updatedAt: row.updatedAt ? 
new Date(row.updatedAt).toISOString() : null, + visibility: 'public' // TODO visibility + } +} diff --git a/crawler/posts.js b/crawler/posts.js new file mode 100644 index 00000000..d7a93f59 --- /dev/null +++ b/crawler/posts.js @@ -0,0 +1,357 @@ +const assert = require('assert') +const {URL} = require('url') +const Events = require('events') +const Ajv = require('ajv') +const logger = require('../logger').child({category: 'crawler', dataset: 'posts'}) +const db = require('../dbs/profile-data-db') +const crawler = require('./index') +const datLibrary = require('../dat/library') +const lock = require('../lib/lock') +const knex = require('../lib/knex') +const siteDescriptions = require('./site-descriptions') +const {doCrawl, doCheckpoint, emitProgressEvent, getMatchingChangesInOrder, generateTimeFilename, ensureDirectory} = require('./util') +const postSchema = require('./json-schemas/post') + +// constants +// = + +const TABLE_VERSION = 2 +const JSON_TYPE = 'unwalled.garden/post' +const JSON_PATH_REGEX = /^\/data\/posts\/([^/]+)\.json$/i + +// typedefs +// = + +/** + * @typedef {import('../dat/library').InternalDatArchive} InternalDatArchive + * @typedef {import('./util').CrawlSourceRecord} CrawlSourceRecord + * @typedef { import("./site-descriptions").SiteDescription } SiteDescription + * + * @typedef {Object} Post + * @prop {string} pathname + * @prop {string} body + * @prop {string} createdAt + * @prop {string} updatedAt + * @prop {SiteDescription} author + * @prop {string} visibility + */ + +// globals +// = + +const events = new Events() +const ajv = (new Ajv()) +const validatePost = ajv.compile(postSchema) + +// exported api +// = + +exports.on = events.on.bind(events) +exports.addListener = events.addListener.bind(events) +exports.removeListener = events.removeListener.bind(events) + +/** + * @description + * Crawl the given site for posts. + * + * @param {InternalDatArchive} archive - site to crawl. + * @param {CrawlSourceRecord} crawlSource - internal metadata about the crawl target. + * @returns {Promise} + */ +exports.crawlSite = async function (archive, crawlSource) { + return doCrawl(archive, crawlSource, 'crawl_posts', TABLE_VERSION, async ({changes, resetRequired}) => { + const supressEvents = resetRequired === true // dont emit when replaying old info + logger.silly('Crawling posts', {details: {url: archive.url, numChanges: changes.length, resetRequired}}) + if (resetRequired) { + // reset all data + logger.debug('Resetting dataset', {details: {url: archive.url}}) + await db.run(` + DELETE FROM crawl_posts WHERE crawlSourceId = ? 
+ `, [crawlSource.id]) + await doCheckpoint('crawl_posts', TABLE_VERSION, crawlSource, 0) + } + + // collect changed posts + var changedPosts = getMatchingChangesInOrder(changes, JSON_PATH_REGEX) + if (changedPosts.length) { + logger.verbose('Collected new/changed post files', {details: {url: archive.url, changedPosts: changedPosts.map(p => p.name)}}) + } else { + logger.debug('No new post-files found', {details: {url: archive.url}}) + } + emitProgressEvent(archive.url, 'crawl_posts', 0, changedPosts.length) + + // read and apply each post in order + var progress = 0 + for (let changedPost of changedPosts) { + // TODO Currently the crawler will abort reading the feed if any post fails to load + // this means that a single unreachable file can stop the forward progress of post indexing + // to solve this, we need to find a way to tolerate unreachable post-files without losing our ability to efficiently detect new posts + // -prf + if (changedPost.type === 'del') { + // delete + await db.run(` + DELETE FROM crawl_posts WHERE crawlSourceId = ? AND pathname = ? + `, [crawlSource.id, changedPost.name]) + events.emit('post-removed', archive.url) + } else { + // read + let postString + try { + postString = await archive.pda.readFile(changedPost.name, 'utf8') + } catch (err) { + logger.warn('Failed to read post file, aborting', {details: {url: archive.url, name: changedPost.name, err}}) + return // abort indexing + } + + // parse and validate + let post + try { + post = JSON.parse(postString) + let valid = validatePost(post) + if (!valid) throw ajv.errorsText(validatePost.errors) + } catch (err) { + logger.warn('Failed to parse post file, skipping', {details: {url: archive.url, name: changedPost.name, err}}) + continue // skip + } + + // massage the post + post.createdAt = Number(new Date(post.createdAt)) + post.updatedAt = Number(new Date(post.updatedAt)) + if (isNaN(post.updatedAt)) post.updatedAt = 0 // optional + + // upsert + let existingPost = await get(joinPath(archive.url, changedPost.name)) + if (existingPost) { + await db.run(` + UPDATE crawl_posts + SET crawledAt = ?, body = ?, createdAt = ?, updatedAt = ? + WHERE crawlSourceId = ? AND pathname = ? + `, [Date.now(), post.body, post.createdAt, post.updatedAt, crawlSource.id, changedPost.name]) + events.emit('post-updated', archive.url) + } else { + await db.run(` + INSERT INTO crawl_posts (crawlSourceId, pathname, crawledAt, body, createdAt, updatedAt) + VALUES (?, ?, ?, ?, ?, ?) + `, [crawlSource.id, changedPost.name, Date.now(), post.body, post.createdAt, post.updatedAt]) + events.emit('post-added', archive.url) + } + } + + // checkpoint our progress + await doCheckpoint('crawl_posts', TABLE_VERSION, crawlSource, changedPost.version) + emitProgressEvent(archive.url, 'crawl_posts', ++progress, changedPosts.length) + } + logger.silly(`Finished crawling posts`, {details: {url: archive.url}}) + }) +} + +/** + * @description + * List crawled posts. 
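+ *
+ * @example
+ * // Usage sketch: assumes this module is required as `posts`; the author URL
+ * // is a placeholder.
+ * const latest = await posts.list({
+ *   filters: {authors: 'dat://alice.example'},
+ *   limit: 10,
+ *   reverse: true
+ * })
+ * for (let post of latest) console.log(post.author.url, post.body)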
+ * + * @param {Object} [opts] + * @param {Object} [opts.filters] + * @param {string|string[]} [opts.filters.authors] + * @param {string} [opts.filters.visibility] + * @param {string} [opts.sortBy] + * @param {number} [opts.offset=0] + * @param {number} [opts.limit] + * @param {boolean} [opts.reverse] + * @returns {Promise>} + */ +exports.list = async function (opts) { + // TODO: handle visibility + // TODO: sortBy options + + // validate & parse params + if (opts && 'sortBy' in opts) assert(typeof opts.sortBy === 'string', 'SortBy must be a string') + if (opts && 'offset' in opts) assert(typeof opts.offset === 'number', 'Offset must be a number') + if (opts && 'limit' in opts) assert(typeof opts.limit === 'number', 'Limit must be a number') + if (opts && 'reverse' in opts) assert(typeof opts.reverse === 'boolean', 'Reverse must be a boolean') + if (opts && opts.filters) { + if ('authors' in opts.filters) { + if (Array.isArray(opts.filters.authors)) { + assert(opts.filters.authors.every(v => typeof v === 'string'), 'Authors filter must be a string or array of strings') + } else { + assert(typeof opts.filters.authors === 'string', 'Authors filter must be a string or array of strings') + opts.filters.authors = [opts.filters.authors] + } + opts.filters.authors = await Promise.all(opts.filters.authors.map(datLibrary.getPrimaryUrl)) + } + if ('visibility' in opts.filters) { + assert(typeof opts.filters.visibility === 'string', 'Visibility filter must be a string') + } + } + + // build query + var sql = knex('crawl_posts') + .select('crawl_posts.*') + .select('crawl_sources.url AS crawlSourceUrl') + .innerJoin('crawl_sources', 'crawl_sources.id', '=', 'crawl_posts.crawlSourceId') + .orderBy('crawl_posts.createdAt', opts.reverse ? 'DESC' : 'ASC') + if (opts && opts.filters && opts.filters.authors) { + sql = sql.whereIn('crawl_sources.url', opts.filters.authors) + } + if (opts && opts.limit) sql = sql.limit(opts.limit) + if (opts && opts.offset) sql = sql.offset(opts.offset) + + // execute query + var rows = await db.all(sql) + return Promise.all(rows.map(massagePostRow)) +} + +/** + * @description + * Get crawled post. + * + * @param {string} url - The URL of the post + * @returns {Promise} + */ +const get = exports.get = async function (url) { + // validate & parse params + var urlParsed + if (url) { + try { urlParsed = new URL(url) } + catch (e) { throw new Error('Invalid URL: ' + url) } + } + + // execute query + var sql = knex('crawl_posts') + .select('crawl_posts.*') + .select('crawl_sources.url AS crawlSourceUrl') + .innerJoin('crawl_sources', function () { + this.on('crawl_sources.id', '=', 'crawl_posts.crawlSourceId') + .andOn('crawl_sources.url', '=', knex.raw('?', `${urlParsed.protocol}//${urlParsed.hostname}`)) + }) + .where('crawl_posts.pathname', urlParsed.pathname) + return await massagePostRow(await db.get(sql)) +} + +/** + * @description + * Create a new post. + * + * @param {InternalDatArchive} archive - where to write the post to. 
+ * @param {Object} post + * @param {string} post.body + * @param {string} post.visibility + * @returns {Promise} url + */ +exports.add = async function (archive, post) { + // TODO visibility + + var postObject = { + type: JSON_TYPE, + body: post.body, + createdAt: (new Date()).toISOString() + } + var valid = validatePost(postObject) + if (!valid) throw ajv.errorsText(validatePost.errors) + + var filename = generateTimeFilename() + var filepath = `/data/posts/${filename}.json` + await ensureDirectory(archive, '/data') + await ensureDirectory(archive, '/data/posts') + await archive.pda.writeFile(filepath, JSON.stringify(postObject, null, 2)) + await crawler.crawlSite(archive) + return archive.url + filepath +} + +/** + * @description + * Update the content of an existing post. + * + * @param {InternalDatArchive} archive - where to write the post to. + * @param {string} pathname - the pathname of the post. + * @param {Object} post + * @param {string} [post.body] + * @param {string} [post.visibility] + * @returns {Promise} + */ +exports.edit = async function (archive, pathname, post) { + // TODO visibility + + var release = await lock('crawler:posts:' + archive.url) + try { + // fetch post + var existingPost = await get(archive.url + pathname) + if (!existingPost) throw new Error('Post not found') + + // update post content + var postObject = { + type: JSON_TYPE, + body: ('body' in post) ? post.body : existingPost.body, + createdAt: existingPost.createdAt, + updatedAt: (new Date()).toISOString() + } + + // validate + var valid = validatePost(postObject) + if (!valid) throw ajv.errorsText(validatePost.errors) + + // write + await archive.pda.writeFile(pathname, JSON.stringify(postObject, null, 2)) + await crawler.crawlSite(archive) + } finally { + release() + } +} + +/** + * @description + * Delete an existing post + * + * @param {InternalDatArchive} archive - where to write the post to. + * @param {string} pathname - the pathname of the post. + * @returns {Promise} + */ +exports.remove = async function (archive, pathname) { + assert(typeof pathname === 'string', 'Remove() must be provided a valid URL string') + await archive.pda.unlink(pathname) + await crawler.crawlSite(archive) +} + +// internal methods +// = + +/** + * @param {string} origin + * @param {string} pathname + * @returns {string} + */ +function joinPath (origin, pathname) { + if (origin.endsWith('/') && pathname.startsWith('/')) { + return origin + pathname.slice(1) + } + if (!origin.endsWith('/') && !pathname.startsWith('/')) { + return origin + '/' + pathname + } + return origin + pathname +} + +/** + * @param {Object} row + * @returns {Promise} + */ +async function massagePostRow (row) { + if (!row) return null + var author = await siteDescriptions.getBest({subject: row.crawlSourceUrl}) + if (!author) { + author = { + url: row.crawlSourceUrl, + title: '', + description: '', + type: [], + thumbUrl: `${row.crawlSourceUrl}/thumb`, + descAuthor: {url: null} + } + } + return { + pathname: row.pathname, + author, + body: row.body, + createdAt: new Date(row.createdAt).toISOString(), + updatedAt: row.updatedAt ? 
new Date(row.updatedAt).toISOString() : null, + visibility: 'public' // TODO visibility + } +} diff --git a/crawler/reactions.js b/crawler/reactions.js new file mode 100644 index 00000000..96920183 --- /dev/null +++ b/crawler/reactions.js @@ -0,0 +1,367 @@ +const assert = require('assert') +const {URL} = require('url') +const Events = require('events') +const Ajv = require('ajv') +const logger = require('../logger').child({category: 'crawler', dataset: 'reactions'}) +const db = require('../dbs/profile-data-db') +const crawler = require('./index') +const datLibrary = require('../dat/library') +const lock = require('../lib/lock') +const knex = require('../lib/knex') +const {doCrawl, doCheckpoint, emitProgressEvent, getMatchingChangesInOrder, ensureDirectory, normalizeTopicUrl, slugifyUrl} = require('./util') +const reactionSchema = require('./json-schemas/reaction') + +// constants +// = + +const TABLE_VERSION = 1 +const JSON_TYPE = 'unwalled.garden/reaction' +const JSON_PATH_REGEX = /^\/data\/reactions\/([^/]+)\.json$/i + +// typedefs +// = + +/** + * @typedef {import('../dat/library').InternalDatArchive} InternalDatArchive + * @typedef {import('./util').CrawlSourceRecord} CrawlSourceRecord + * + * @typedef {Object} Reaction + * @prop {string} topic + * @prop {string[]} emojis + * @prop {string} author + * @prop {string} recordUrl + * @prop {number} crawledAt + * + * @typedef {Object} TopicReaction + * @prop {string} emoji + * @prop {string[]} authors + */ + +// globals +// = + +const events = new Events() +const ajv = (new Ajv()) +const validateReaction = ajv.compile(reactionSchema) + +// exported api +// = + +exports.on = events.on.bind(events) +exports.addListener = events.addListener.bind(events) +exports.removeListener = events.removeListener.bind(events) + +/** + * @description + * Crawl the given site for reactions. + * + * @param {InternalDatArchive} archive - site to crawl. + * @param {CrawlSourceRecord} crawlSource - internal metadata about the crawl target. + * @returns {Promise} + */ +exports.crawlSite = async function (archive, crawlSource) { + return doCrawl(archive, crawlSource, 'crawl_reactions', TABLE_VERSION, async ({changes, resetRequired}) => { + const supressEvents = resetRequired === true // dont emit when replaying old info + logger.silly('Crawling reactions', {details: {url: archive.url, numChanges: changes.length, resetRequired}}) + if (resetRequired) { + // reset all data + logger.debug('Resetting dataset', {details: {url: archive.url}}) + await db.run(` + DELETE FROM crawl_reactions WHERE crawlSourceId = ? 
+ `, [crawlSource.id]) + await doCheckpoint('crawl_reactions', TABLE_VERSION, crawlSource, 0) + } + + // collect changed reactions + var changedReactions = getMatchingChangesInOrder(changes, JSON_PATH_REGEX) + if (changedReactions.length) { + logger.verbose('Collected new/changed reaction files', {details: {url: archive.url, changedReactions: changedReactions.map(p => p.name)}}) + } else { + logger.debug('No new reaction-files found', {details: {url: archive.url}}) + } + emitProgressEvent(archive.url, 'crawl_reactions', 0, changedReactions.length) + + // read and apply each reaction in order + var progress = 0 + for (let changedReaction of changedReactions) { + // TODO Currently the crawler will abort reading the feed if any reaction fails to load + // this means that a single unreachable file can stop the forward progress of reaction indexing + // to solve this, we need to find a way to tolerate unreachable reaction-files without losing our ability to efficiently detect new reactions + // -prf + if (changedReaction.type === 'del') { + // delete + await db.run(` + DELETE FROM crawl_reactions WHERE crawlSourceId = ? AND pathname = ? + `, [crawlSource.id, changedReaction.name]) + events.emit('reaction-updated', archive.url) + } else { + // read + let fileString + try { + fileString = await archive.pda.readFile(changedReaction.name, 'utf8') + } catch (err) { + logger.warn('Failed to read reaction file, aborting', {details: {url: archive.url, name: changedReaction.name, err}}) + return // abort indexing + } + + // parse and validate + let reaction + try { + reaction = JSON.parse(fileString) + let valid = validateReaction(reaction) + if (!valid) throw ajv.errorsText(validateReaction.errors) + } catch (err) { + logger.warn('Failed to parse reaction file, skipping', {details: {url: archive.url, name: changedReaction.name, err}}) + continue // skip + } + + // massage record + reaction.topic = normalizeTopicUrl(reaction.topic) + + // upsert + await db.run(` + INSERT OR REPLACE INTO crawl_reactions (crawlSourceId, pathname, crawledAt, topic, emojis) + VALUES (?, ?, ?, ?, ?) + `, [crawlSource.id, changedReaction.name, Date.now(), reaction.topic, reaction.emojis.join(',')]) + events.emit('reaction-updated', archive.url) + } + + // checkpoint our progress + logger.silly(`Finished crawling reactions`, {details: {url: archive.url}}) + await doCheckpoint('crawl_reactions', TABLE_VERSION, crawlSource, changedReaction.version) + emitProgressEvent(archive.url, 'crawl_reactions', ++progress, changedReactions.length) + } + }) +} + +/** + * @description + * List crawled reactions. 
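+ *
+ * @example
+ * // Usage sketch: assumes this module is required as `reactions`; the topic
+ * // URL is a placeholder.
+ * const rows = await reactions.list({
+ *   filters: {topics: 'dat://bob.example/posts/hello.json'},
+ *   limit: 50
+ * })
+ * // each row includes {topic, emojis, author, recordUrl}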
+ * + * @param {Object} [opts] + * @param {Object} [opts.filters] + * @param {string|string[]} [opts.filters.authors] + * @param {string|string[]} [opts.filters.topics] + * @param {string} [opts.filters.visibility] + * @param {string} [opts.sortBy] + * @param {number} [opts.offset=0] + * @param {number} [opts.limit] + * @param {boolean} [opts.reverse] + * @returns {Promise>} + */ +exports.list = async function (opts) { + // TODO: handle visibility + // TODO: sortBy options + + // validate & parse params + if (opts && 'sortBy' in opts) assert(typeof opts.sortBy === 'string', 'SortBy must be a string') + if (opts && 'offset' in opts) assert(typeof opts.offset === 'number', 'Offset must be a number') + if (opts && 'limit' in opts) assert(typeof opts.limit === 'number', 'Limit must be a number') + if (opts && 'reverse' in opts) assert(typeof opts.reverse === 'boolean', 'Reverse must be a boolean') + if (opts && opts.filters) { + if ('authors' in opts.filters) { + if (Array.isArray(opts.filters.authors)) { + assert(opts.filters.authors.every(v => typeof v === 'string'), 'Authors filter must be a string or array of strings') + } else { + assert(typeof opts.filters.authors === 'string', 'Authors filter must be a string or array of strings') + opts.filters.authors = [opts.filters.authors] + } + opts.filters.authors = await Promise.all(opts.filters.authors.map(datLibrary.getPrimaryUrl)) + } + if ('topics' in opts.filters) { + if (Array.isArray(opts.filters.topics)) { + assert(opts.filters.topics.every(v => typeof v === 'string'), 'Topics filter must be a string or array of strings') + } else { + assert(typeof opts.filters.topics === 'string', 'Topics filter must be a string or array of strings') + opts.filters.topics = [opts.filters.topics] + } + opts.filters.topics = opts.filters.topics.map(normalizeTopicUrl) + } + if ('visibility' in opts.filters) { + assert(typeof opts.filters.visibility === 'string', 'Visibility filter must be a string') + } + } + + // execute query + let sql = knex('crawl_reactions') + .select('crawl_reactions.*') + .select('crawl_sources.url AS author') + .innerJoin('crawl_sources', 'crawl_sources.id', '=', 'crawl_reactions.crawlSourceId') + .orderBy('crawl_reactions.topic', opts.reverse ? 'DESC' : 'ASC') + if (opts.limit) sql = sql.limit(opts.limit) + if (opts.offset) sql = sql.offset(opts.offset) + if (opts && opts.filters && opts.filters.authors) { + sql = sql.whereIn('crawl_sources.url', opts.filters.authors) + } + if (opts && opts.filters && opts.filters.topics) { + sql = sql.whereIn('crawl_reactions.topic', opts.filters.topics) + } + var rows = await db.all(sql) + + // massage results + rows.forEach(row => { + row.emojis = row.emojis.split(',') + row.recordUrl = row.author + row.pathname + }) + return rows +} + +/** + * @description + * List crawled reactions on a topic. 
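+ *
+ * @example
+ * // Usage sketch: tallies the reactions on a topic URL (placeholder below),
+ * // optionally restricted to a set of author URLs via opts.filters.authors.
+ * const tally = await reactions.tabulate('dat://bob.example/posts/hello.json')
+ * // => e.g. [{emoji: '👍', authors: ['dat://alice.example']}]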
+ * + * @param {string} topic - The URL of the topic + * @param {Object} [opts] + * @param {Object} [opts.filters] + * @param {string|string[]} [opts.filters.authors] + * @param {string} [opts.filters.visibility] + * @returns {Promise} + */ +exports.tabulate = async function (topic, opts) { + // TODO handle visibility + + // validate params + try { new URL(topic) } + catch (e) { throw new Error('Invalid URL: ' + topic) } + topic = normalizeTopicUrl(topic) + if (opts && opts.filters) { + if ('authors' in opts.filters) { + if (Array.isArray(opts.filters.authors)) { + assert(opts.filters.authors.every(v => typeof v === 'string'), 'Authors filter must be a string or array of strings') + } else { + assert(typeof opts.filters.authors === 'string', 'Authors filter must be a string or array of strings') + opts.filters.authors = [opts.filters.authors] + } + opts.filters.authors = await Promise.all(opts.filters.authors.map(datLibrary.getPrimaryUrl)) + } + if ('visibility' in opts.filters) { + assert(typeof opts.filters.visibility === 'string', 'Visibility filter must be a string') + } + } + + // execute query + var sql = knex('crawl_reactions') + .select('crawl_reactions.*') + .select('crawl_sources.url AS crawlSourceUrl') + .innerJoin('crawl_sources', 'crawl_sources.id', '=', 'crawl_reactions.crawlSourceId') + .where('crawl_reactions.topic', topic) + if (opts && opts.filters && opts.filters.authors) { + sql = sql.whereIn('crawl_sources.url', opts.filters.authors) + } + var rows = await db.all(sql) + + // construct reactions list + var reactions = {} + rows.forEach(row => { + row.emojis.split(',').forEach(emoji => { + if (!reactions[emoji]) { + reactions[emoji] = {emoji, authors: [row.crawlSourceUrl]} + } else { + reactions[emoji].authors.push(row.crawlSourceUrl) + } + }) + }) + + return Object.values(reactions) +} + +/** + * @description + * Create a new reaction. + * + * @param {InternalDatArchive} archive - where to write the reaction to. + * @param {string} topic + * @param {string} emoji + * @returns {Promise} + */ +exports.add = async function (archive, topic, emoji) { + // TODO handle visibility + + topic = normalizeTopicUrl(topic) + emoji = emoji.replace('\uFE0F', '').replace('\uFE0E', '') // strip the emoji-enforcement token + var valid = validateReaction({type: JSON_TYPE, topic, emojis: [emoji]}) + if (!valid) throw ajv.errorsText(validateReaction.errors) + + var filepath = `/data/reactions/${slugifyUrl(topic)}.json` + await ensureDirectory(archive, '/data') + await ensureDirectory(archive, '/data/reactions') + await updateReactionFile(archive, filepath, topic, emoji, false) + await crawler.crawlSite(archive) +} + +/** + * @description + * Delete an existing reaction + * + * @param {InternalDatArchive} archive - where to write the reaction to. 
+ * @param {string} topic + * @param {string} emoji + * @returns {Promise} + */ +exports.remove = async function (archive, topic, emoji) { + // TODO handle visibility + + topic = normalizeTopicUrl(topic) + emoji = emoji.replace('\uFE0F', '').replace('\uFE0E', '') // strip the emoji-enforcement token + var valid = validateReaction({type: JSON_TYPE, topic, emojis: [emoji]}) + if (!valid) throw ajv.errorsText(validateReaction.errors) + + var filepath = `/data/reactions/${slugifyUrl(topic)}.json` + await updateReactionFile(archive, filepath, topic, false, emoji) + await crawler.crawlSite(archive) +} + +// internal methods +// = + +/** + * @param {InternalDatArchive} archive + * @param {string} pathname + * @returns {Promise} + */ +async function readReactionFile (archive, pathname) { + try { + var json = await archive.pda.readFile(pathname, 'utf8') + json = JSON.parse(json) + var valid = validateReaction(json) + if (!valid) throw ajv.errorsText(validateReaction.errors) + return json + } catch (e) { + // fallback to an empty on error + return { + type: JSON_TYPE, + topic: '', + emojis: [] + } + } +} + +/** + * @param {InternalDatArchive} archive + * @param {string} pathname + * @param {string} topic + * @param {string|boolean} addEmoji + * @param {string|boolean} removeEmoji + * @returns {Promise} + */ +async function updateReactionFile (archive, pathname, topic, addEmoji = false, removeEmoji = false) { + var release = await lock('crawler:reactions:' + archive.url) + try { + // read the reaction file + var reactionJson = await readReactionFile(archive, pathname) + + // apply update + reactionJson.topic = topic + if (addEmoji) reactionJson.emojis = Array.from(new Set(reactionJson.emojis.concat([addEmoji]))) + if (removeEmoji) reactionJson.emojis = reactionJson.emojis.filter(v => v !== removeEmoji) + + // write or delete the reaction file + if (reactionJson.emojis.length) { + await archive.pda.writeFile(pathname, JSON.stringify(reactionJson, null, 2), 'utf8') + } else { + await archive.pda.unlink(pathname) + } + } finally { + release() + } +} diff --git a/crawler/search.js b/crawler/search.js new file mode 100644 index 00000000..a8d7e0c7 --- /dev/null +++ b/crawler/search.js @@ -0,0 +1,468 @@ +const _groupBy = require('lodash.groupby') +const _uniqWith = require('lodash.uniqwith') +const db = require('../dbs/profile-data-db') +const bookmarksDb = require('../dbs/bookmarks') +const historyDb = require('../dbs/history') +const datLibrary = require('../dat/library') +const follows = require('./follows') +const siteDescriptions = require('./site-descriptions') +const {getSiteDescriptionThumbnailUrl} = require('./util') +const knex = require('../lib/knex') +const users = require('../users') + +const KNOWN_SITE_TYPES = [ + 'unwalled.garden/person', + 'unwalled.garden/theme' +] + +// typedefs +// = + +/** + * @typedef {import("./site-descriptions").SiteDescription} SiteDescription + * @typedef {import("../dbs/archives").LibraryArchiveRecord} LibraryArchiveRecord + * + * @typedef {Object} SuggestionResults + * @prop {Array} bookmarks + * @prop {Array} websites + * @prop {Array} people + * @prop {Array} themes + * @prop {(undefined|Array)} history + * + * TODO: define the SuggestionResults values + * + * @typedef {Object} SearchResults + * @prop {number} highlightNonce - A number used to create perimeters around text that should be highlighted. 
+ * @prop {Array} results + * + * @typedef {Object} SearchResultAuthor + * @prop {string} url + * @prop {string} title + * @prop {string} description + * @prop {Array} type + * + * @typedef {Object} SearchResultRecord + * @prop {string} type + * @prop {string} url + * @prop {number} crawledAt + * @prop {SearchResultAuthor} author + * + * @typedef {Object} SiteSearchResult + * @prop {SearchResultRecord} record + * @prop {string} url + * @prop {string} title + * @prop {string} description + * @prop {Array} type + * + * @typedef {Object} PostSearchResult + * @prop {SearchResultRecord} record + * @prop {string} url + * @prop {Object} content + * @prop {string} content.body + * @prop {number} createdAt + * @prop {number} updatedAt + * + * @typedef {Object} BookmarkSearchResult + * @prop {SearchResultRecord} record + * @prop {string} url + * @prop {Object} content + * @prop {string} content.href + * @prop {string} content.title + * @prop {string} content.description + * @prop {number} createdAt + * @prop {number} updatedAt + */ + +// exported api +// = + +/** + * @description + * Get suggested content of various types. + * + * @param {string} user - The current user's URL. + * @param {string} [query=''] - The search query. + * @param {Object} [opts={}] + * @param {boolean} [opts.filterPins] - If true, will filter out pinned bookmarks. + * @returns {Promise} + */ +exports.listSuggestions = async function (user, query = '', opts = {}) { + var suggestions = { + bookmarks: [], + websites: [], + people: [], + themes: [], + history: undefined + } + const filterFn = a => query ? ((a.url || a.href).includes(query) || a.title.toLowerCase().includes(query)) : true + const sortFn = (a, b) => (a.title||'').localeCompare(b.title||'') + function dedup (arr) { + var hits = new Set() + return arr.filter(item => { + if (hits.has(item.url)) return false + hits.add(item.url) + return true + }) + } + + var userId = (await users.get(user)).id + + // bookmarks + var bookmarkResults = await bookmarksDb.listBookmarks(0) + if (opts.filterPins) { + bookmarkResults = bookmarkResults.filter(b => !b.pinned && filterFn(b)) + } else { + bookmarkResults = bookmarkResults.filter(filterFn) + } + bookmarkResults.sort(sortFn) + bookmarkResults = bookmarkResults.slice(0, 12) + suggestions.bookmarks = bookmarkResults.map(b => ({title: b.title, url: b.href})) + + // websites + suggestions.websites = /** @type LibraryArchiveRecord[] */(await datLibrary.queryArchives({isSaved: true})) + suggestions.websites = suggestions.websites.filter(w => ( + w.url !== user // filter out the user's site + && (!w.type || !w.type.find(t => KNOWN_SITE_TYPES.includes(t))) // filter out other site types + )) + suggestions.websites = suggestions.websites.filter(filterFn) + suggestions.websites.sort(sortFn) + + // people + suggestions.people = (await follows.list({filters: {authors: user}})).map(({topic}) => topic) + suggestions.people = (await datLibrary.queryArchives({isSaved: true, type: 'unwalled.garden/person'})).concat(suggestions.people) + suggestions.people = dedup(suggestions.people) + suggestions.people = suggestions.people.filter(filterFn) + suggestions.people.sort(sortFn) + + // themes + suggestions.themes = /** @type LibraryArchiveRecord[] */(await datLibrary.queryArchives({isSaved: true, type: 'unwalled.garden/theme'})) + suggestions.themes = suggestions.themes.filter(filterFn) + suggestions.themes.sort(sortFn) + + if (query) { + // history + var historyResults = await historyDb.search(query) + suggestions.history = historyResults.slice(0, 
12) + suggestions.history.sort((a, b) => a.url.length - b.url.length) // shorter urls at top + } + + return suggestions +} + +/** + * @description + * Run a search query against crawled data. + * + * @param {string} user - The current user's URL. + * @param {Object} opts + * @param {string} [opts.query] - The search query. + * @param {Object} [opts.filters] + * @param {string|string[]} [opts.filters.datasets] - Filter results to the given datasets. Defaults to 'all'. Valid values: 'all', 'sites', 'unwalled.garden/post', 'unwalled.garden/bookmark'. + * @param {number} [opts.filters.since] - Filter results to items created since the given timestamp. + * @param {number} [opts.hops=1] - How many hops out in the user's follow graph should be included? Valid values: 1, 2. + * @param {number} [opts.offset] + * @param {number} [opts.limit = 20] + * @returns {Promise} + */ +exports.query = async function (user, opts) { + const highlightNonce = (Math.random() * 1e3)|0 + const startHighlight = `{${highlightNonce}}` + const endHighlight = `{/${highlightNonce}}` + + var searchResults = { + highlightNonce, + results: [] + } + var {query, hops, filters, offset, limit} = Object.assign({}, { + query: undefined, + hops: 1, + filters: {}, + offset: 0, + limit: 20 + }, opts) + var {datasets, since} = Object.assign({}, { + datasets: 'all', + since: 0 + }, filters) + hops = Math.min(Math.max(Math.floor(hops), 1), 2) // clamp to [1, 2] for now + var datasetValues = (typeof datasets === 'undefined') + ? ['all'] + : Array.isArray(datasets) ? datasets : [datasets] + + // prep search terms + if (query && typeof query === 'string') { + query = query + .replace(/[^a-z0-9]/ig, ' ') // strip symbols that sqlite interprets. + .toLowerCase() // all lowercase. (uppercase is interpretted as a directive by sqlite.) + query += '*' // match prefixes + } + + // get user's crawl_source id + var userCrawlSourceId + { + let res = await db.get(`SELECT id FROM crawl_sources WHERE url = ?`, [user]) + userCrawlSourceId = res.id + } + + // construct set of crawl sources to query + var crawlSourceIds + if (hops === 2) { + // the user and all followed sources + let res = await db.all(` + SELECT id FROM crawl_sources src + INNER JOIN crawl_follows follows ON follows.destUrl = src.url AND follows.crawlSourceId = ? 
+ `, [userCrawlSourceId]) + crawlSourceIds = [userCrawlSourceId].concat(res.map(({id}) => id)) + } else if (hops === 1) { + // just the user + crawlSourceIds = [userCrawlSourceId] + } + + // run queries + if (datasetValues.includes('all') || datasetValues.includes('sites')) { + // SITES + let rows = await db.all(buildSitesSearchQuery({ + query, + crawlSourceIds, + user, + userCrawlSourceId, + since, + limit, + offset, + startHighlight, + endHighlight + })) + rows = _uniqWith(rows, (a, b) => a.url === b.url) // remove duplicates + rows = await Promise.all(rows.map(massageSiteSearchResult)) + searchResults.results = searchResults.results.concat(rows) + } + if (datasetValues.includes('all') || datasets.includes('unwalled.garden/post')) { + // POSTS + let rows = await db.all(buildPostsSearchQuery({ + query, + crawlSourceIds, + userCrawlSourceId, + since, + limit, + offset, + startHighlight, + endHighlight + })) + rows = await Promise.all(rows.map(massagePostSearchResult)) + searchResults.results = searchResults.results.concat(rows) + } + if (datasetValues.includes('all') || datasets.includes('unwalled.garden/bookmark')) { + // BOOKMARKS + let rows = await db.all(buildBookmarksSearchQuery({ + query, + crawlSourceIds, + userCrawlSourceId, + since, + limit, + offset, + startHighlight, + endHighlight + })) + rows = await Promise.all(rows.map(massageBookmarkSearchResult)) + searchResults.results = searchResults.results.concat(rows) + } + + // sort and apply limit again + searchResults.results.sort((a, b) => b.record.crawledAt - a.record.crawledAt) + searchResults.results = searchResults.results.slice(0, limit) + + return searchResults +} + +// internal methods +// = + +function buildSitesSearchQuery ({query, crawlSourceIds, user, userCrawlSourceId, since, limit, offset, startHighlight, endHighlight}) { + let sql = knex(query ? 'crawl_site_descriptions_fts_index' : 'crawl_site_descriptions') + .select('crawl_site_descriptions.url AS url') + .select('crawl_sources.url AS authorUrl') + .select('crawl_site_descriptions.crawledAt') + .where(builder => builder + .whereIn('crawl_follows.crawlSourceId', crawlSourceIds) // description by a followed user + .orWhere(builder => builder + .where('crawl_site_descriptions.url', user) // about me and... 
+ .andWhere('crawl_site_descriptions.crawlSourceId', userCrawlSourceId) // by me + ) + ) + .where('crawl_site_descriptions.crawledAt', '>=', since) + .orderBy('crawl_site_descriptions.crawledAt') + .limit(limit) + .offset(offset) + if (query) { + sql = sql + .select(knex.raw(`SNIPPET(crawl_site_descriptions_fts_index, 0, '${startHighlight}', '${endHighlight}', '...', 25) AS title`)) + .select(knex.raw(`SNIPPET(crawl_site_descriptions_fts_index, 1, '${startHighlight}', '${endHighlight}', '...', 25) AS description`)) + .innerJoin('crawl_site_descriptions', 'crawl_site_descriptions.rowid', '=', 'crawl_site_descriptions_fts_index.rowid') + .leftJoin('crawl_follows', 'crawl_follows.destUrl', '=', 'crawl_site_descriptions.url') + .innerJoin('crawl_sources', 'crawl_sources.id', '=', 'crawl_site_descriptions.crawlSourceId') + .whereRaw('crawl_site_descriptions_fts_index MATCH ?', [query]) + } else { + sql = sql + .select('crawl_site_descriptions.title') + .select('crawl_site_descriptions.description') + .leftJoin('crawl_follows', 'crawl_follows.destUrl', '=', 'crawl_site_descriptions.url') + .innerJoin('crawl_sources', 'crawl_sources.id', '=', 'crawl_site_descriptions.crawlSourceId') + } + return sql +} + +function buildPostsSearchQuery ({query, crawlSourceIds, userCrawlSourceId, since, limit, offset, startHighlight, endHighlight}) { + let sql = knex(query ? 'crawl_posts_fts_index' : 'crawl_posts') + .select('crawl_posts.pathname') + .select('crawl_posts.crawledAt') + .select('crawl_posts.createdAt') + .select('crawl_posts.updatedAt') + .select('crawl_sources.url AS authorUrl') + .where(builder => builder + .whereIn('crawl_follows.crawlSourceId', crawlSourceIds) // published by someone I follow + .orWhere('crawl_posts.crawlSourceId', userCrawlSourceId) // or by me + ) + .andWhere('crawl_posts.crawledAt', '>=', since) + .orderBy('crawl_posts.crawledAt') + .limit(limit) + .offset(offset) + if (query) { + sql = sql + .select(knex.raw(`SNIPPET(crawl_posts_fts_index, 0, '${startHighlight}', '${endHighlight}', '...', 25) AS body`)) + .innerJoin('crawl_posts', 'crawl_posts.rowid', '=', 'crawl_posts_fts_index.rowid') + .innerJoin('crawl_sources', 'crawl_sources.id', '=', 'crawl_posts.crawlSourceId') + .leftJoin('crawl_follows', 'crawl_follows.destUrl', '=', 'crawl_sources.url') + .whereRaw('crawl_posts_fts_index MATCH ?', [query]) + } else { + sql = sql + .select('crawl_posts.body') + .innerJoin('crawl_sources', 'crawl_sources.id', '=', 'crawl_posts.crawlSourceId') + .leftJoin('crawl_follows', 'crawl_follows.destUrl', '=', 'crawl_sources.url') + } + return sql +} + +function buildBookmarksSearchQuery ({query, crawlSourceIds, userCrawlSourceId, since, limit, offset, startHighlight, endHighlight}) { + let sql = knex(query ? 
'crawl_bookmarks_fts_index' : 'crawl_bookmarks') + .select('crawl_bookmarks.pathname') + .select('crawl_bookmarks.crawledAt') + .select('crawl_bookmarks.createdAt') + .select('crawl_bookmarks.updatedAt') + .select('crawl_sources.url AS authorUrl') + .where(builder => builder + .whereIn('crawl_follows.crawlSourceId', crawlSourceIds) // published by someone I follow + .orWhere('crawl_bookmarks.crawlSourceId', userCrawlSourceId) // or by me + ) + .andWhere('crawl_bookmarks.crawledAt', '>=', since) + .orderBy('crawl_bookmarks.crawledAt') + .limit(limit) + .offset(offset) + if (query) { + sql = sql + .select('crawl_bookmarks.href') + .select(knex.raw(`SNIPPET(crawl_bookmarks_fts_index, 0, '${startHighlight}', '${endHighlight}', '...', 25) AS title`)) + .select(knex.raw(`SNIPPET(crawl_bookmarks_fts_index, 1, '${startHighlight}', '${endHighlight}', '...', 25) AS description`)) + .innerJoin('crawl_bookmarks', 'crawl_bookmarks.rowid', '=', 'crawl_bookmarks_fts_index.rowid') + .innerJoin('crawl_sources', 'crawl_sources.id', '=', 'crawl_bookmarks.crawlSourceId') + .leftJoin('crawl_follows', 'crawl_follows.destUrl', '=', 'crawl_sources.url') + .whereRaw('crawl_bookmarks_fts_index MATCH ?', [query]) + } else { + sql = sql + .select('crawl_bookmarks.href') + .select('crawl_bookmarks.title') + .select('crawl_bookmarks.description') + .innerJoin('crawl_sources', 'crawl_sources.id', '=', 'crawl_bookmarks.crawlSourceId') + .leftJoin('crawl_follows', 'crawl_follows.destUrl', '=', 'crawl_sources.url') + } + return sql +} + +/** + * @param {Object} row + * @returns {Promise} + */ +async function massageSiteSearchResult (row) { + // fetch additional info + var author = await siteDescriptions.getBest({subject: row.authorUrl}) + + // massage attrs + return { + record: { + type: 'site', + url: row.url, + author: { + url: author.url, + title: author.title, + description: author.description, + type: author.type + }, + crawledAt: row.crawledAt, + }, + url: row.url, + title: row.title, + description: row.description, + type: row.type + } +} + +/** + * @param {Object} row + * @returns {Promise} + */ +async function massagePostSearchResult (row) { + // fetch additional info + var author = await siteDescriptions.getBest({subject: row.authorUrl}) + + // massage attrs + var url = row.authorUrl + row.pathname + return { + record: { + type: 'unwalled.garden/post', + url, + author: { + url: author.url, + title: author.title, + description: author.description, + type: author.type + }, + crawledAt: row.crawledAt, + }, + url, + content: {body: row.body}, + createdAt: row.createdAt, + updatedAt: row.updatedAt + } +} + +/** + * @param {Object} row + * @returns {Promise} + */ +async function massageBookmarkSearchResult (row) { + // fetch additional info + var author = await siteDescriptions.getBest({subject: row.authorUrl}) + + // massage attrs + var url = row.authorUrl + row.pathname + return { + record: { + type: 'unwalled.garden/bookmark', + url, + author: { + url: author.url, + title: author.title, + description: author.description, + type: author.type + }, + crawledAt: row.crawledAt, + }, + url, + content: { + href: row.href, + title: row.title, + description: row.description + }, + createdAt: row.createdAt, + updatedAt: row.updatedAt + } +} \ No newline at end of file diff --git a/crawler/site-descriptions.js b/crawler/site-descriptions.js new file mode 100644 index 00000000..6011815f --- /dev/null +++ b/crawler/site-descriptions.js @@ -0,0 +1,364 @@ +const assert = require('assert') +const {URL} = require('url') +const 
Events = require('events') +const logger = require('../logger').child({category: 'crawler', dataset: 'site-descriptions'}) +const db = require('../dbs/profile-data-db') +const dat = require('../dat') +const crawler = require('./index') +const { + doCrawl, + doCheckpoint, + emitProgressEvent, + getMatchingChangesInOrder, + getSiteDescriptionThumbnailUrl, + toHostname +} = require('./util') + +// constants +// = + +const TABLE_VERSION = 1 +const JSON_PATH_REGEX = /^\/(dat\.json|data\/known-sites\/([^/]+)\/dat\.json)$/i + +// typedefs +// = + +/** + * @typedef {import('../dat/library').InternalDatArchive} InternalDatArchive + * @typedef {import('./util').CrawlSourceRecord} CrawlSourceRecord + * + * @typedef {Object} SiteDescription + * @prop {string} url + * @prop {string} title + * @prop {string} description + * @prop {Array} type + * @prop {string} thumbUrl + * @prop {Object} descAuthor + * @prop {string} descAuthor.url + * @prop {boolean} [followsUser] - does this site follow the specified user site? + * @prop {Array} [followedBy] - list of sites following this site. + */ + +// globals +// = + +var events = new Events() + +// exported api +// = + +exports.on = events.on.bind(events) +exports.addListener = events.addListener.bind(events) +exports.removeListener = events.removeListener.bind(events) + +/** + * @description + * Crawl the given site for site descriptions. + * + * @param {InternalDatArchive} archive - site to crawl. + * @param {CrawlSourceRecord} crawlSource - internal metadata about the crawl target. + * @returns {Promise} + */ +exports.crawlSite = async function (archive, crawlSource) { + return doCrawl(archive, crawlSource, 'crawl_site_descriptions', TABLE_VERSION, async ({changes, resetRequired}) => { + const supressEvents = resetRequired === true // dont emit when replaying old info + logger.silly('Crawling site descriptions', {details: {url: archive.url, numChanges: changes.length, resetRequired}}) + if (resetRequired) { + // reset all data + logger.debug('Resetting dataset', {details: {url: archive.url}}) + await db.run(` + DELETE FROM crawl_site_descriptions WHERE crawlSourceId = ? + `, [crawlSource.id]) + await doCheckpoint('crawl_site_descriptions', TABLE_VERSION, crawlSource, 0) + } + + // collect changed site descriptions + var changedSiteDescriptions = getMatchingChangesInOrder(changes, JSON_PATH_REGEX) + if (changedSiteDescriptions.length > 0) { + logger.verbose('Collected new/changed site-description files', {details: {url: archive.url, changedFiles: changedSiteDescriptions.map(p => p.name)}}) + } else { + logger.debug('No new site-description files found', {details: {url: archive.url}}) + } + emitProgressEvent(archive.url, 'crawl_site_descriptions', 0, changedSiteDescriptions.length) + + // read and apply each post in order + var progress = 0 + for (let changedSiteDescription of changedSiteDescriptions) { + // TODO Currently the crawler will abort reading the feed if any description fails to load + // this means that a single unreachable file can stop the forward progress of description indexing + // to solve this, we need to find a way to tolerate bad description-files without losing our ability to efficiently detect new posts + // -prf + + // determine the url + let url = getUrlFromDescriptionPath(archive, changedSiteDescription.name) + + if (changedSiteDescription.type === 'del') { + // delete + await db.run(` + DELETE FROM crawl_site_descriptions WHERE crawlSourceId = ? AND url = ? 
+        `, [crawlSource.id, url])
+        events.emit('description-removed', archive.url)
+      } else {
+        // read
+        let descString
+        try {
+          descString = await archive.pda.readFile(changedSiteDescription.name, 'utf8')
+        } catch (err) {
+          logger.warn('Failed to read dat.json file, aborting', {details: {url: archive.url, name: changedSiteDescription.name, err}})
+          return // abort indexing
+        }
+
+        // parse and validate
+        let desc
+        try {
+          desc = JSON.parse(descString)
+          assert(typeof desc === 'object', 'File must be an object')
+        } catch (err) {
+          logger.warn('Failed to parse dat.json file, skipping', {details: {url: archive.url, name: changedSiteDescription.name, err}})
+          continue // skip
+        }
+
+        // massage the description
+        desc.title = typeof desc.title === 'string' ? desc.title : ''
+        desc.description = typeof desc.description === 'string' ? desc.description : ''
+        if (typeof desc.type === 'string') desc.type = desc.type.split(',')
+        if (Array.isArray(desc.type)) {
+          desc.type = desc.type.filter(isString)
+        } else {
+          desc.type = []
+        }
+
+        // replace
+        await db.run(`
+          DELETE FROM crawl_site_descriptions WHERE crawlSourceId = ? AND url = ?
+        `, [crawlSource.id, url])
+        await db.run(`
+          INSERT INTO crawl_site_descriptions (crawlSourceId, crawledAt, url, title, description, type)
+          VALUES (?, ?, ?, ?, ?, ?)
+        `, [crawlSource.id, Date.now(), url, desc.title, desc.description, desc.type.join(',')])
+        events.emit('description-added', archive.url)
+      }
+
+      // checkpoint our progress
+      await doCheckpoint('crawl_site_descriptions', TABLE_VERSION, crawlSource, changedSiteDescription.version)
+      emitProgressEvent(archive.url, 'crawl_site_descriptions', ++progress, changedSiteDescriptions.length)
+    }
+    logger.silly(`Finished crawling site descriptions`, {details: {url: archive.url}})
+  })
+}
+
+/**
+ * @description
+ * List crawled site descriptions.
+ *
+ * @param {Object} [opts]
+ * @param {string | Array} [opts.subject] - (URL) filter descriptions to those which describe this subject.
+ * @param {string | Array} [opts.author] - (URL) filter descriptions to those created by this author.
+ * @param {number} [opts.offset]
+ * @param {number} [opts.limit]
+ * @param {boolean} [opts.reverse]
+ * @returns {Promise<Array<SiteDescription>>}
+ */
+const list = exports.list = async function ({offset, limit, reverse, author, subject} = {}) {
+  // validate & parse params
+  assert(!offset || typeof offset === 'number', 'Offset must be a number')
+  assert(!limit || typeof limit === 'number', 'Limit must be a number')
+  assert(!reverse || typeof reverse === 'boolean', 'Reverse must be a boolean')
+  assert(!author || typeof author === 'string' || (Array.isArray(author) && author.every(isString)), 'Author must be a string or an array of strings')
+  assert(!subject || typeof subject === 'string' || (Array.isArray(subject) && subject.every(isString)), 'Subject must be a string or an array of strings')
+
+  if (author) {
+    author = Array.isArray(author) ? author : [author]
+    try { author = await Promise.all(author.map(dat.library.getPrimaryUrl)) }
+    catch (e) { throw new Error('Author must contain valid URLs') }
+  }
+  if (subject) {
+    subject = Array.isArray(subject) ?
subject : [subject] + try { subject = await Promise.all(subject.map(dat.library.getPrimaryUrl)) } + catch (e) { throw new Error('Subject must contain valid URLs') } + } + + // build query + var query = ` + SELECT crawl_site_descriptions.*, src.url AS crawlSourceUrl FROM crawl_site_descriptions + INNER JOIN crawl_sources src ON src.id = crawl_site_descriptions.crawlSourceId + ` + var values = [] + + if (author || subject) { + query += ` WHERE ` + } + + if (author) { + query += `(` + let op = `` + for (let a of author) { + query += `${op} src.url = ?` + op = ` OR` + values.push(a) + } + query += `) ` + } + if (subject) { + if (author) { + query += ` AND ` + } + query += `(` + let op = `` + for (let s of subject) { + query += `${op} crawl_site_descriptions.url = ?` + op = ` OR` + values.push(s) + } + query += `) ` + } + if (reverse) { + query += ` DESC` + } + if (limit) { + query += ` LIMIT ?` + values.push(limit) + } + if (offset) { + query += ` OFFSET ?` + values.push(offset) + } + + // execute query + return (await db.all(query, values)).map(massageSiteDescriptionRow) +} + +/** + * @description + * Get the most trustworthy site description available. + * + * @param {Object} [opts] + * @param {string} [opts.subject] - (URL) filter descriptions to those which describe this subject. + * @param {string} [opts.author] - (URL) filter descriptions to those created by this author. + * @returns {Promise} + */ +exports.getBest = async function ({subject, author} = {}) { + // TODO choose based on trust + var descriptions = await list({subject, author}) + return descriptions[0] +} + +/** + * @description + * Capture a site description into the archive's known-sites cache. + * + * @param {InternalDatArchive} archive - where to write the capture to. + * @param {(InternalDatArchive|string)} subject - which archive to capture. + * @returns Promise + */ +exports.capture = async function (archive, subject) { + var subjectArchive + if (typeof subject === 'string') { + subjectArchive = await dat.library.getOrLoadArchive(subject) + } else { + subjectArchive = subject + } + + // create directory + var hostname = toHostname(subjectArchive.url) + await ensureDirectory(archive, '/data') + await ensureDirectory(archive, '/data/known-sites') + await ensureDirectory(archive, `/data/known-sites/${hostname}`) + + // capture dat.json + try { + var datJson = JSON.parse(await subjectArchive.pda.readFile('/dat.json')) + } catch (err) { + logger.warn('Failed to read dat.json of subject archive', {details: {err}}) + throw new Error('Unabled to read subject dat.json') + } + await archive.pda.writeFile(`/data/known-sites/${hostname}/dat.json`, JSON.stringify(datJson, null, 2)) + + // capture thumb + for (let ext of ['jpg', 'jpeg', 'png']) { + let thumbPath = `/thumb.${ext}` + if (await fileExists(subjectArchive, thumbPath)) { + let targetPath = `/data/known-sites/${hostname}/thumb.${ext}` + await archive.pda.writeFile(targetPath, await subjectArchive.pda.readFile(thumbPath, 'binary'), 'binary') + break + } + } +} + +/** + * @description + * Delete a captured site description in the given archive's known-sites cache. + * + * @param {InternalDatArchive} archive - where to remove the capture from. + * @param {(InternalDatArchive|string)} subject - which archive's capture to remove. 
+ * @returns Promise + */ +exports.deleteCapture = async function (archive, subject) { + var subjectUrl + if (typeof subject === 'string') { + subjectUrl = subject + } else { + subjectUrl = subject.url + } + assert(typeof subjectUrl === 'string', 'Delete() must be provided a valid URL string') + var hostname = toHostname(subjectUrl) + await archive.pda.rmdir(`/data/known-sites/${hostname}`, {recursive: true}) + await crawler.crawlSite(archive) +} + +// internal methods +// = + +/** + * @param {any} v + * returns {boolean} + */ +function isString (v) { + return typeof v === 'string' +} + +/** + * @param {InternalDatArchive} archive + * @param {string} name + * @returns {string} + */ +function getUrlFromDescriptionPath (archive, name) { + if (name === '/dat.json') return archive.url + var parts = name.split('/') // '/data/known-sites/{hostname}/dat.json' -> ['', 'data', 'known-sites', hostname, 'dat.json'] + return 'dat://' + parts[3] +} + +/** + * @param {InternalDatArchive} archive + * @param {string} pathname + * @returns {Promise} + */ +async function ensureDirectory (archive, pathname) { + try { await archive.pda.mkdir(pathname) } + catch (e) { /* ignore */ } +} + +/** + * @param {InternalDatArchive} archive + * @param {string} pathname + * @returns {Promise} + */ +async function fileExists (archive, pathname) { + try { await archive.pda.stat(pathname) } + catch (e) { return false } + return true +} + +/** + * @param {Object} row + * @returns {SiteDescription} + */ +function massageSiteDescriptionRow (row) { + if (!row) return null + row.author = {url: row.crawlSourceUrl} + row.type = row.type && typeof row.type === 'string' ? row.type.split(',') : undefined + row.thumbUrl = getSiteDescriptionThumbnailUrl(row.author.url, row.url) + delete row.crawlSourceUrl + delete row.crawlSourceId + return row +} diff --git a/crawler/tags.js b/crawler/tags.js new file mode 100644 index 00000000..0aadb130 --- /dev/null +++ b/crawler/tags.js @@ -0,0 +1,218 @@ +const assert = require('assert') +const {URL} = require('url') +const db = require('../dbs/profile-data-db') +const knex = require('../lib/knex') +const datLibrary = require('../dat/library') +const {normalizeSchemaUrl} = require('./util') + +// typedefs +// = + +/** + * @typedef {import('../dat/library').InternalDatArchive} InternalDatArchive + * @typedef {import('./util').CrawlSourceRecord} CrawlSourceRecord + * @typedef { import("./site-descriptions").SiteDescription } SiteDescription + * + * @typedef {Object} Tag + * @prop {string} tag + * @prop {number} count + */ + +// exported api +// = + +/** + * @description + * List bookmark tags. 
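+ *
+ * Illustrative call (URLs and result values are hypothetical):
+ * @example
+ *   var tags = await listBookmarkTags({
+ *     filters: {authors: 'dat://alice.com'},
+ *     limit: 5
+ *   })
+ *   // e.g. [{tag: 'p2p', count: 3}, {tag: 'reading', count: 1}]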
+ * + * @param {Object} [opts] + * @param {Object} [opts.filters] + * @param {string|string[]} [opts.filters.authors] + * @param {string} [opts.filters.visibility] + * @param {string} [opts.sortBy] + * @param {number} [opts.offset=0] + * @param {number} [opts.limit] + * @param {boolean} [opts.reverse] + * @returns {Promise>} + */ +exports.listBookmarkTags = async function (opts) { + // TODO: handle visibility + // TODO: sortBy options + + // validate & parse params + if (opts && 'sortBy' in opts) assert(typeof opts.sortBy === 'string', 'SortBy must be a string') + if (opts && 'offset' in opts) assert(typeof opts.offset === 'number', 'Offset must be a number') + if (opts && 'limit' in opts) assert(typeof opts.limit === 'number', 'Limit must be a number') + if (opts && 'reverse' in opts) assert(typeof opts.reverse === 'boolean', 'Reverse must be a boolean') + if (opts && opts.filters) { + if ('authors' in opts.filters) { + if (Array.isArray(opts.filters.authors)) { + assert(opts.filters.authors.every(v => typeof v === 'string'), 'Authors filter must be a string or array of strings') + } else { + assert(typeof opts.filters.authors === 'string', 'Authors filter must be a string or array of strings') + opts.filters.authors = [opts.filters.authors] + } + opts.filters.authors = await Promise.all(opts.filters.authors.map(datLibrary.getPrimaryUrl)) + } + if ('visibility' in opts.filters) { + assert(typeof opts.filters.visibility === 'string', 'Visibility filter must be a string') + } + } + + // build query + var sql = knex('crawl_tags') + .select('crawl_tags.tag') + .select(knex.raw('count(crawl_tags.id) as count')) + .innerJoin('crawl_bookmarks_tags', 'crawl_bookmarks_tags.crawlTagId', '=', 'crawl_tags.id') + .innerJoin('crawl_bookmarks', 'crawl_bookmarks_tags.crawlBookmarkId', '=', 'crawl_bookmarks.id') + .leftJoin('crawl_sources', 'crawl_bookmarks.crawlSourceId', '=', 'crawl_sources.id') + .orderBy('crawl_tags.tag', opts.reverse ? 'DESC' : 'ASC') + .groupBy('crawl_tags.tag') + if (opts && opts.filters && opts.filters.authors) { + sql = sql.whereIn('crawl_sources.url', opts.filters.authors) + } + if (opts && opts.limit) sql = sql.limit(opts.limit) + if (opts && opts.offset) sql = sql.offset(opts.offset) + + // execute query + var rows = await db.all(sql) + return rows.map(row => ({ + tag: row.tag, + count: +row.count + })) +} + +/** + * @description + * List discussion tags. 
+ * + * @param {Object} [opts] + * @param {Object} [opts.filters] + * @param {string|string[]} [opts.filters.authors] + * @param {string} [opts.filters.visibility] + * @param {string} [opts.sortBy] + * @param {number} [opts.offset=0] + * @param {number} [opts.limit] + * @param {boolean} [opts.reverse] + * @returns {Promise>} + */ +exports.listDiscussionTags = async function (opts) { + // TODO: handle visibility + // TODO: sortBy options + + // validate & parse params + if (opts && 'sortBy' in opts) assert(typeof opts.sortBy === 'string', 'SortBy must be a string') + if (opts && 'offset' in opts) assert(typeof opts.offset === 'number', 'Offset must be a number') + if (opts && 'limit' in opts) assert(typeof opts.limit === 'number', 'Limit must be a number') + if (opts && 'reverse' in opts) assert(typeof opts.reverse === 'boolean', 'Reverse must be a boolean') + if (opts && opts.filters) { + if ('authors' in opts.filters) { + if (Array.isArray(opts.filters.authors)) { + assert(opts.filters.authors.every(v => typeof v === 'string'), 'Authors filter must be a string or array of strings') + } else { + assert(typeof opts.filters.authors === 'string', 'Authors filter must be a string or array of strings') + opts.filters.authors = [opts.filters.authors] + } + opts.filters.authors = await Promise.all(opts.filters.authors.map(datLibrary.getPrimaryUrl)) + } + if ('visibility' in opts.filters) { + assert(typeof opts.filters.visibility === 'string', 'Visibility filter must be a string') + } + } + + // build query + var sql = knex('crawl_tags') + .select('crawl_tags.tag') + .select(knex.raw('count(crawl_tags.id) as count')) + .innerJoin('crawl_discussions_tags', 'crawl_discussions_tags.crawlTagId', '=', 'crawl_tags.id') + .innerJoin('crawl_discussions', 'crawl_discussions_tags.crawlDiscussionId', '=', 'crawl_discussions.id') + .leftJoin('crawl_sources', 'crawl_discussions.crawlSourceId', '=', 'crawl_sources.id') + .orderBy('crawl_tags.tag', opts.reverse ? 'DESC' : 'ASC') + .groupBy('crawl_tags.tag') + if (opts && opts.filters && opts.filters.authors) { + sql = sql.whereIn('crawl_sources.url', opts.filters.authors) + } + if (opts && opts.limit) sql = sql.limit(opts.limit) + if (opts && opts.offset) sql = sql.offset(opts.offset) + + // execute query + var rows = await db.all(sql) + return rows.map(row => ({ + tag: row.tag, + count: +row.count + })) +} + +/** + * @description + * List media tags. 
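+ *
+ * Illustrative call; the author URLs and subtype value are example inputs only:
+ * @example
+ *   var tags = await listMediaTags({
+ *     filters: {
+ *       authors: ['dat://alice.com', 'dat://bob.com'],
+ *       subtypes: 'unwalled.garden/media#blog-post'
+ *     },
+ *     reverse: true
+ *   })
+ *   // -> [{tag, count}, ...] restricted to media published by those authors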
+ * + * @param {Object} [opts] + * @param {Object} [opts.filters] + * @param {string|string[]} [opts.filters.authors] + * @param {string|string[]} [opts.filters.subtypes] + * @param {string} [opts.filters.visibility] + * @param {string} [opts.sortBy] + * @param {number} [opts.offset=0] + * @param {number} [opts.limit] + * @param {boolean} [opts.reverse] + * @returns {Promise>} + */ +exports.listMediaTags = async function (opts) { + // TODO: handle visibility + // TODO: sortBy options + + // validate & parse params + if (opts && 'sortBy' in opts) assert(typeof opts.sortBy === 'string', 'SortBy must be a string') + if (opts && 'offset' in opts) assert(typeof opts.offset === 'number', 'Offset must be a number') + if (opts && 'limit' in opts) assert(typeof opts.limit === 'number', 'Limit must be a number') + if (opts && 'reverse' in opts) assert(typeof opts.reverse === 'boolean', 'Reverse must be a boolean') + if (opts && opts.filters) { + if ('authors' in opts.filters) { + if (Array.isArray(opts.filters.authors)) { + assert(opts.filters.authors.every(v => typeof v === 'string'), 'Authors filter must be a string or array of strings') + } else { + assert(typeof opts.filters.authors === 'string', 'Authors filter must be a string or array of strings') + opts.filters.authors = [opts.filters.authors] + } + opts.filters.authors = await Promise.all(opts.filters.authors.map(datLibrary.getPrimaryUrl)) + } + if ('subtypes' in opts.filters) { + if (Array.isArray(opts.filters.subtypes)) { + assert(opts.filters.subtypes.every(v => typeof v === 'string'), 'Subtypes filter must be a string or array of strings') + } else { + assert(typeof opts.filters.subtypes === 'string', 'Subtypes filter must be a string or array of strings') + opts.filters.subtypes = [opts.filters.subtypes] + } + opts.filters.subtypes = opts.filters.subtypes.map(normalizeSchemaUrl) + } + if ('visibility' in opts.filters) { + assert(typeof opts.filters.visibility === 'string', 'Visibility filter must be a string') + } + } + + // build query + var sql = knex('crawl_tags') + .select('crawl_tags.tag') + .select(knex.raw('count(crawl_tags.id) as count')) + .innerJoin('crawl_media_tags', 'crawl_media_tags.crawlTagId', '=', 'crawl_tags.id') + .innerJoin('crawl_media', 'crawl_media_tags.crawlMediaId', '=', 'crawl_media.id') + .leftJoin('crawl_sources', 'crawl_media.crawlSourceId', '=', 'crawl_sources.id') + .orderBy('crawl_tags.tag', opts.reverse ? 
'DESC' : 'ASC') + .groupBy('crawl_tags.tag') + if (opts && opts.filters && opts.filters.authors) { + sql = sql.whereIn('crawl_sources.url', opts.filters.authors) + } + if (opts && opts.filters && opts.filters.subtypes) { + sql = sql.whereIn('crawl_media.subtype', opts.filters.subtypes) + } + if (opts && opts.limit) sql = sql.limit(opts.limit) + if (opts && opts.offset) sql = sql.offset(opts.offset) + + // execute query + var rows = await db.all(sql) + return rows.map(row => ({ + tag: row.tag, + count: +row.count + })) +} \ No newline at end of file diff --git a/crawler/util.js b/crawler/util.js new file mode 100644 index 00000000..484086f4 --- /dev/null +++ b/crawler/util.js @@ -0,0 +1,228 @@ +const EventEmitter = require('events') +const pump = require('pump') +const concat = require('concat-stream') +const db = require('../dbs/profile-data-db') +const knex = require('../lib/knex') +const dat = require('../dat') + +const READ_TIMEOUT = 30e3 + +// typedefs +// = + +/** + * @typedef {import('../dat/library').InternalDatArchive} InternalDatArchive + * + * @typedef {Object} CrawlSourceRecord + * @prop {string} id + * @prop {string} url + * @prop {number} datDnsId + * @prop {boolean} globalResetRequired + */ + +// exported api +// = + +const crawlerEvents = new EventEmitter() +exports.crawlerEvents = crawlerEvents + +/** + * @param {InternalDatArchive} archive + * @param {CrawlSourceRecord} crawlSource + * @param {string} crawlDataset + * @param {number} crawlDatasetVersion + * @param {function(Object): Promise} handlerFn + * @returns {Promise} + */ +exports.doCrawl = async function (archive, crawlSource, crawlDataset, crawlDatasetVersion, handlerFn) { + const url = archive.url + + // fetch current crawl state + var resetRequired = false + var state = await db.get( + knex('crawl_sources_meta') + .select('crawl_sources_meta.*') + .where({crawlSourceId: crawlSource.id, crawlDataset}) + ) + if (crawlSource.globalResetRequired || (state && state.crawlDatasetVersion !== crawlDatasetVersion)) { + resetRequired = true + state = null + } + if (!state) { + state = {crawlSourceVersion: 0, crawlDatasetVersion} + } + + // fetch current archive version + var archiveInfo = await dat.library.getDaemon().getArchiveInfo(archive.key) + var version = archiveInfo ? archiveInfo.version : 0 + + // fetch change log + var start = state.crawlSourceVersion + 1 + var end = version + 1 + var changes = await new Promise((resolve, reject) => { + pump( + archive.history({start, end, timeout: READ_TIMEOUT}), + concat({encoding: 'object'}, resolve), + reject + ) + }) + + crawlerEvents.emit('crawl-dataset-start', {sourceUrl: archive.url, crawlDataset, crawlRange: {start, end}}) + + // handle changes + await handlerFn({changes, resetRequired}) + + // final checkpoint + await doCheckpoint(crawlDataset, crawlDatasetVersion, crawlSource, version) + + crawlerEvents.emit('crawl-dataset-finish', {sourceUrl: archive.url, crawlDataset, crawlRange: {start, end}}) +} + +/** + * @param {string} crawlDataset + * @param {number} crawlDatasetVersion + * @param {CrawlSourceRecord} crawlSource + * @param {number} crawlSourceVersion + * @returns {Promise} + */ +const doCheckpoint = exports.doCheckpoint = async function (crawlDataset, crawlDatasetVersion, crawlSource, crawlSourceVersion) { + // TODO chould this be an INSERT OR REPLACE? 
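+  // One possible answer to the TODO above, sketched as a comment only: if the
+  // crawl_sources_meta table had a UNIQUE index on (crawlDataset, crawlSourceId)
+  // (an assumption, not something guaranteed here), the delete+insert pair below
+  // could collapse into a single statement:
+  //
+  //   await db.run(`
+  //     INSERT OR REPLACE INTO crawl_sources_meta
+  //       (crawlDataset, crawlDatasetVersion, crawlSourceId, crawlSourceVersion, updatedAt)
+  //     VALUES (?, ?, ?, ?, ?)
+  //   `, [crawlDataset, crawlDatasetVersion, crawlSource.id, crawlSourceVersion, Date.now()])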
+ await db.run(knex('crawl_sources_meta').delete().where({crawlDataset, crawlSourceId: crawlSource.id})) + await db.run(knex('crawl_sources_meta').insert({ + crawlDataset, + crawlDatasetVersion, + crawlSourceId: crawlSource.id, + crawlSourceVersion, + updatedAt: Date.now() + })) +} + +/** + * @param {string} sourceUrl + * @param {string} crawlDataset + * @param {number} progress + * @param {number} numUpdates + */ +exports.emitProgressEvent = function (sourceUrl, crawlDataset, progress, numUpdates) { + crawlerEvents.emit('crawl-dataset-progress', {sourceUrl, crawlDataset, progress, numUpdates}) +} + +/** + * @param {Array} changes + * @param {RegExp} regex + * @returns {Array} + */ +exports.getMatchingChangesInOrder = function (changes, regex) { + var list = [] // order matters, must be oldest to newest + changes.forEach(c => { + if (regex.test(c.name)) { + let i = list.findIndex(c2 => c2.name === c.name) + if (i !== -1) list.splice(i, 1) // remove from old position + list.push(c) + } + }) + return list +} + +/** + * @returns {string} + */ +var _lastGeneratedTimeFilename +exports.generateTimeFilename = function () { + var d = Date.now() + if (d === _lastGeneratedTimeFilename) { + d++ + } + _lastGeneratedTimeFilename = d + return (new Date(d)).toISOString() +} + +/** + * @param {string} url + * @returns {string} + */ +const toHostname = +exports.toHostname = function (url) { + var urlParsed = new URL(url) + return urlParsed.hostname +} + +/** + * @param {string} url + * @param {boolean?} shouldThrow + * @returns {string} + */ +const toOrigin = +exports.toOrigin = function (url, shouldThrow = false) { + try { + var urlParsed = new URL(url) + return urlParsed.protocol + '//' + urlParsed.hostname + } catch (e) { + if (shouldThrow) { + throw new Error('Invalid URL: ' + url) + } + return null + } +} + +/** + * @param {string} url + * @returns {string} + */ +exports.normalizeTopicUrl = function (url) { + try { + var urlp = new URL(url) + return (urlp.protocol + '//' + urlp.hostname + urlp.pathname + urlp.search + urlp.hash).replace(/([/]$)/g, '') + } catch (e) {} + return null +} + +/** + * @param {string} url + * @returns {string} + */ +exports.normalizeSchemaUrl = function (url) { + try { + var urlp = new URL(url) + return (urlp.hostname + urlp.pathname + urlp.search + urlp.hash).replace(/([/]$)/g, '') + } catch (e) {} + return url +} + +/** + * @param {InternalDatArchive} archive + * @param {string} pathname + * @returns {Promise} + */ +exports.ensureDirectory = async function (archive, pathname) { + try { await archive.pda.mkdir(pathname) } + catch (e) { /* ignore */ } +} + +/** + * @description Helper to determine the thumbUrl for a site description. + * @param {string} author - (URL) the author of the site description. + * @param {string} subject - (URL) the site being described. + * @returns {string} - the URL of the thumbnail. + */ +exports.getSiteDescriptionThumbnailUrl = function (author, subject) { + return author === subject + ? 
`${subject}/thumb` // self-description, use their own thumb + : `${author}/data/known-sites/${toHostname(subject)}/thumb` // use captured thumb +} + +/** + * @param {string} url + * @returns {string} + */ +var reservedChars = /[<>:"/\\|?*\x00-\x1F]/g +var endingDashes = /([-]+$)/g +exports.slugifyUrl = function (str) { + try { + let url = new URL(str) + str = url.protocol + url.hostname + url.pathname + url.search + url.hash + } catch (e) { + // ignore + } + return str.replace(reservedChars, '-').replace(endingDashes, '') +} \ No newline at end of file diff --git a/crawler/votes.js b/crawler/votes.js new file mode 100644 index 00000000..d5d46b7f --- /dev/null +++ b/crawler/votes.js @@ -0,0 +1,382 @@ +const assert = require('assert') +const {URL} = require('url') +const Events = require('events') +const Ajv = require('ajv') +const logger = require('../logger').child({category: 'crawler', dataset: 'votes'}) +const db = require('../dbs/profile-data-db') +const crawler = require('./index') +const datLibrary = require('../dat/library') +const knex = require('../lib/knex') +const siteDescriptions = require('./site-descriptions') +const {doCrawl, doCheckpoint, emitProgressEvent, getMatchingChangesInOrder, ensureDirectory, normalizeTopicUrl, generateTimeFilename} = require('./util') +const voteSchema = require('./json-schemas/vote') + +// constants +// = + +const TABLE_VERSION = 1 +const JSON_TYPE = 'unwalled.garden/vote' +const JSON_PATH_REGEX = /^\/data\/votes\/([^/]+)\.json$/i + +// typedefs +// = + +/** + * @typedef {import('../dat/library').InternalDatArchive} InternalDatArchive + * @typedef {import('./util').CrawlSourceRecord} CrawlSourceRecord + * @typedef { import("./site-descriptions").SiteDescription } SiteDescription + * + * @typedef {Object} Vote + * @prop {string} pathname + * @prop {string} topic + * @prop {number} vote + * @prop {string} createdAt + * @prop {string} updatedAt + * @prop {SiteDescription} author + * @prop {string} visibility + * + * @typedef {Object} TabulatedVotes + * @prop {string} topic + * @prop {number} upvotes + * @prop {SiteDescription[]} upvoters + * @prop {number} downvotes + * @prop {SiteDescription[]} downvoters + */ + +// globals +// = + +const events = new Events() +const ajv = (new Ajv()) +const validateVote = ajv.compile(voteSchema) + +// exported api +// = + +exports.on = events.on.bind(events) +exports.addListener = events.addListener.bind(events) +exports.removeListener = events.removeListener.bind(events) + +/** + * @description + * Crawl the given site for votes. + * + * @param {InternalDatArchive} archive - site to crawl. + * @param {CrawlSourceRecord} crawlSource - internal metadata about the crawl target. + * @returns {Promise} + */ +exports.crawlSite = async function (archive, crawlSource) { + return doCrawl(archive, crawlSource, 'crawl_votes', TABLE_VERSION, async ({changes, resetRequired}) => { + const supressEvents = resetRequired === true // dont emit when replaying old info + logger.silly('Crawling votes', {details: {url: archive.url, numChanges: changes.length, resetRequired}}) + if (resetRequired) { + // reset all data + logger.debug('Resetting dataset', {details: {url: archive.url}}) + await db.run(` + DELETE FROM crawl_votes WHERE crawlSourceId = ? 
+ `, [crawlSource.id]) + await doCheckpoint('crawl_votes', TABLE_VERSION, crawlSource, 0) + } + + // collect changed votes + var changedVotes = getMatchingChangesInOrder(changes, JSON_PATH_REGEX) + if (changedVotes.length) { + logger.verbose('Collected new/changed vote files', {details: {url: archive.url, changedVotes: changedVotes.map(p => p.name)}}) + } else { + logger.debug('No new vote-files found', {details: {url: archive.url}}) + } + emitProgressEvent(archive.url, 'crawl_votes', 0, changedVotes.length) + + // read and apply each vote in order + var progress = 0 + for (let changedVote of changedVotes) { + // TODO Currently the crawler will abort reading the feed if any vote fails to load + // this means that a single unreachable file can stop the forward progress of vote indexing + // to solve this, we need to find a way to tolerate unreachable vote-files without losing our ability to efficiently detect new votes + // -prf + if (changedVote.type === 'del') { + // delete + await db.run(` + DELETE FROM crawl_votes WHERE crawlSourceId = ? AND pathname = ? + `, [crawlSource.id, changedVote.name]) + events.emit('vote-updated', archive.url) + } else { + // read + let fileString + try { + fileString = await archive.pda.readFile(changedVote.name, 'utf8') + } catch (err) { + logger.warn('Failed to read vote file, aborting', {details: {url: archive.url, name: changedVote.name, err}}) + return // abort indexing + } + + // parse and validate + let vote + try { + vote = JSON.parse(fileString) + let valid = validateVote(vote) + if (!valid) throw ajv.errorsText(validateVote.errors) + } catch (err) { + logger.warn('Failed to parse vote file, skipping', {details: {url: archive.url, name: changedVote.name, err}}) + continue // skip + } + + // massage record + vote.topic = normalizeTopicUrl(vote.topic) + vote.createdAt = Number(new Date(vote.createdAt)) + vote.updatedAt = Number(new Date(vote.updatedAt)) + if (isNaN(vote.updatedAt)) vote.updatedAt = 0 // optional + + // delete existing + await db.run(knex('crawl_votes').where({crawlSourceId: crawlSource.id, topic: vote.topic}).del()) + + // insert new + await db.run(knex('crawl_votes').insert({ + crawlSourceId: crawlSource.id, + pathname: changedVote.name, + crawledAt: Date.now(), + topic: vote.topic, + vote: vote.vote, + createdAt: vote.createdAt, + updatedAt: vote.updatedAt + })) + events.emit('vote-updated', archive.url) + } + + // checkpoint our progress + logger.silly(`Finished crawling votes`, {details: {url: archive.url}}) + await doCheckpoint('crawl_votes', TABLE_VERSION, crawlSource, changedVote.version) + emitProgressEvent(archive.url, 'crawl_votes', ++progress, changedVotes.length) + } + }) +} + +/** + * @description + * List crawled votes. 
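+ *
+ * Illustrative call (the author and topic URLs are example values):
+ * @example
+ *   var votes = await list({
+ *     filters: {
+ *       authors: 'dat://alice.com',
+ *       topics: 'dat://beakerbrowser.com/docs'
+ *     }
+ *   })
+ *   // -> array of Vote records by alice.com on that topic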
+ * + * @param {Object} [opts] + * @param {Object} [opts.filters] + * @param {string|string[]} [opts.filters.authors] + * @param {string|string[]} [opts.filters.topics] + * @param {string} [opts.filters.visibility] + * @param {string} [opts.sortBy] + * @param {number} [opts.offset=0] + * @param {number} [opts.limit] + * @param {boolean} [opts.reverse] + * @returns {Promise>} + */ +exports.list = async function (opts) { + // TODO: handle visibility + // TODO: sortBy options + + // validate & parse params + if (opts && 'sortBy' in opts) assert(typeof opts.sortBy === 'string', 'SortBy must be a string') + if (opts && 'offset' in opts) assert(typeof opts.offset === 'number', 'Offset must be a number') + if (opts && 'limit' in opts) assert(typeof opts.limit === 'number', 'Limit must be a number') + if (opts && 'reverse' in opts) assert(typeof opts.reverse === 'boolean', 'Reverse must be a boolean') + if (opts && opts.filters) { + if ('authors' in opts.filters) { + if (Array.isArray(opts.filters.authors)) { + assert(opts.filters.authors.every(v => typeof v === 'string'), 'Authors filter must be a string or array of strings') + } else { + assert(typeof opts.filters.authors === 'string', 'Authors filter must be a string or array of strings') + opts.filters.authors = [opts.filters.authors] + } + opts.filters.authors = await Promise.all(opts.filters.authors.map(datLibrary.getPrimaryUrl)) + } + if ('topics' in opts.filters) { + if (Array.isArray(opts.filters.topics)) { + assert(opts.filters.topics.every(v => typeof v === 'string'), 'Topics filter must be a string or array of strings') + } else { + assert(typeof opts.filters.topics === 'string', 'Topics filter must be a string or array of strings') + opts.filters.topics = [opts.filters.topics] + } + opts.filters.topics = opts.filters.topics.map(normalizeTopicUrl) + } + if ('visibility' in opts.filters) { + assert(typeof opts.filters.visibility === 'string', 'Visibility filter must be a string') + } + } + + // execute query + let sql = knex('crawl_votes') + .select('crawl_votes.*') + .select('crawl_sources.url AS author') + .innerJoin('crawl_sources', 'crawl_sources.id', '=', 'crawl_votes.crawlSourceId') + .orderBy('crawl_votes.topic', opts.reverse ? 'DESC' : 'ASC') + if (opts.limit) sql = sql.limit(opts.limit) + if (opts.offset) sql = sql.offset(opts.offset) + if (opts && opts.filters && opts.filters.authors) { + sql = sql.whereIn('crawl_sources.url', opts.filters.authors) + } + if (opts && opts.filters && opts.filters.topics) { + sql = sql.whereIn('crawl_votes.topic', opts.filters.topics) + } + var rows = await db.all(sql) + + // massage results + return Promise.all(rows.map(massageVoteRow)) +} + +/** + * @description + * List crawled votes on a topic. 
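+ *
+ * Illustrative call (the topic URL is an example value):
+ * @example
+ *   var tally = await tabulate('dat://beakerbrowser.com/docs')
+ *   // tally -> {topic, upvotes, upvoters, downvotes, downvoters}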
+ * + * @param {string} topic - The URL of the topic + * @param {Object} [opts] + * @param {Object} [opts.filters] + * @param {string|string[]} [opts.filters.authors] + * @param {string} [opts.filters.visibility] + * @returns {Promise} + */ +exports.tabulate = async function (topic, opts) { + // TODO handle visibility + + // validate params + try { new URL(topic) } + catch (e) { throw new Error('Invalid URL: ' + topic) } + if (opts && opts.filters) { + if ('authors' in opts.filters) { + if (Array.isArray(opts.filters.authors)) { + assert(opts.filters.authors.every(v => typeof v === 'string'), 'Authors filter must be a string or array of strings') + } else { + assert(typeof opts.filters.authors === 'string', 'Authors filter must be a string or array of strings') + opts.filters.authors = [opts.filters.authors] + } + opts.filters.authors = await Promise.all(opts.filters.authors.map(datLibrary.getPrimaryUrl)) + } + if ('visibility' in opts.filters) { + assert(typeof opts.filters.visibility === 'string', 'Visibility filter must be a string') + } + } + + // execute query + var sql = knex('crawl_votes') + .select('crawl_votes.*') + .select('crawl_sources.url AS crawlSourceUrl') + .innerJoin('crawl_sources', 'crawl_sources.id', '=', 'crawl_votes.crawlSourceId') + .where('crawl_votes.topic', topic) + if (opts && opts.filters && opts.filters.authors) { + sql = sql.whereIn('crawl_sources.url', opts.filters.authors) + } + var rows = await db.all(sql) + + // construct votes tally + var tally = { + topic, + upvotes: 0, + upvoters: [], + downvotes: 0, + downvoters: [] + } + await Promise.all(rows.map(async (row) => { + let author = await siteDescriptions.getBest({subject: row.crawlSourceUrl}) + if ((+row.vote) === 1) { + tally.upvotes++ + tally.upvoters.push(author) + } else { + tally.downvotes++ + tally.downvoters.push(author) + } + })) + return tally +} + +/** + * @description + * Get crawled vote. + * + * @param {string} author - The URL of the author + * @param {string} topic - The URL of the topic + * @returns {Promise} + */ +const get = exports.get = async function (author, topic) { + author = await datLibrary.getPrimaryUrl(author) + topic = normalizeTopicUrl(topic) + + // execute query + var sql = knex('crawl_votes') + .select('crawl_votes.*') + .select('crawl_sources.url AS crawlSourceUrl') + .innerJoin('crawl_sources', function () { + this.on('crawl_sources.id', '=', 'crawl_votes.crawlSourceId') + .andOn('crawl_sources.url', '=', knex.raw('?', author)) + }) + .where('crawl_votes.topic', topic) + return await massageVoteRow(await db.get(sql)) +} + +/** + * @description + * Set a vote. + * + * @param {InternalDatArchive} archive - where to write the vote to. + * @param {string} topic + * @param {number} vote + * @returns {Promise} + */ +exports.set = async function (archive, topic, vote) { + // TODO handle visibility + + // get the existing vote if it exists + let existingVote = await get(archive.url, topic) + var filename = existingVote ? existingVote.createdAt : generateTimeFilename() + var filepath = `/data/votes/${filename}.json` + + if (vote === 0) { + // delete vote + if (!existingVote) return + await archive.pda.unlink(filepath) + } else { + // set new vote + var voteObject = { + type: JSON_TYPE, + topic: normalizeTopicUrl(topic), + vote, + createdAt: existingVote ? 
existingVote.createdAt : (new Date()).toISOString() + } + if (existingVote) { + voteObject.updatedAt = (new Date()).toISOString() + } + + var valid = validateVote(voteObject) + if (!valid) throw ajv.errorsText(validateVote.errors) + + await ensureDirectory(archive, '/data') + await ensureDirectory(archive, '/data/votes') + await archive.pda.writeFile(filepath, JSON.stringify(voteObject, null, 2)) + } + await crawler.crawlSite(archive) +} + +// internal methods +// = + +/** + * @param {Object} row + * @returns {Promise} + */ +async function massageVoteRow (row) { + if (!row) return null + var author = await siteDescriptions.getBest({subject: row.crawlSourceUrl}) + if (!author) { + author = { + url: row.crawlSourceUrl, + title: '', + description: '', + type: [], + thumbUrl: `${row.crawlSourceUrl}/thumb`, + descAuthor: {url: null} + } + } + return { + pathname: row.pathname, + author, + topic: row.topic, + vote: row.vote, + createdAt: new Date(row.createdAt).toISOString(), + updatedAt: row.updatedAt ? new Date(row.updatedAt).toISOString() : null, + visibility: 'public' // TODO visibility + } +} \ No newline at end of file diff --git a/dat/assets.js b/dat/assets.js new file mode 100644 index 00000000..13a12ad9 --- /dev/null +++ b/dat/assets.js @@ -0,0 +1,97 @@ +const Events = require('events') +const ICO = require('icojs') +const mime = require('mime') +const sitedata = require('../dbs/sitedata') + +// constants +// = + +const ASSET_PATH_REGEX = /^\/?(favicon|thumb|cover).(jpg|jpeg|png|ico)$/i +const IDEAL_FAVICON_SIZE = 64 + +// typedefs +// = + +/** + * @typedef {import('./library').InternalDatArchive} InternalDatArchive + */ + +// globals +// = + +var events = new Events() + +// exported api +// = + +exports.on = events.on.bind(events) +exports.addListener = events.addListener.bind(events) +exports.removeListener = events.removeListener.bind(events) + +/** + * @description + * Crawl the given site for assets. + * + * @param {InternalDatArchive} archive - site to crawl. + * @param {string[]?} filenames - which files to check. 
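+ *
+ * Illustrative call; `pngBuffer` stands in for real image data. After writing a
+ * new thumbnail into a site, re-cache just that asset:
+ * @example
+ *   await archive.pda.writeFile('/thumb.png', pngBuffer, 'binary')
+ *   await update(archive, ['thumb.png'])
+ *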
+ * @returns {Promise} + */ +exports.update = async function (archive, filenames = null) { + // list target assets + if (!filenames) { + filenames = await archive.pda.readdir('/') + } + filenames = filenames.filter(v => ASSET_PATH_REGEX.test(v)) + + // read and cache each asset + for (let filename of filenames) { + try { + let assetType = extractAssetType(filename) + var dataUrl = await readAsset(archive, filename) + await sitedata.set(archive.url, assetType, dataUrl) + events.emit(`update:${assetType}:${archive.url}`) + } catch (e) { + console.log('Failed to update asset', filename, e) + } + } +} + +// internal +// = + +/** + * Extract the asset type from the pathname + * @param {string} pathname + * @returns string + */ +function extractAssetType (pathname) { + if (/cover/.test(pathname)) return 'cover' + if (/thumb/.test(pathname)) return 'thumb' + return 'favicon' +} + +/** + * Reads the asset file as a dataurl + * - Converts any .ico to .png + * @param {InternalDatArchive} archive + * @param {string} pathname + * @returns string The asset as a data URL + */ +async function readAsset (archive, pathname) { + if (pathname.endsWith('.ico')) { + let data = await archive.pda.readFile(pathname, 'binary') + // select the best-fitting size + let images = await ICO.parse(data, 'image/png') + let image = images[0] + for (let i = 1; i < images.length; i++) { + if (Math.abs(images[i].width - IDEAL_FAVICON_SIZE) < Math.abs(image.width - IDEAL_FAVICON_SIZE)) { + image = images[i] + } + } + let buf = Buffer.from(image.buffer) + return `data:image/png;base64,${buf.toString('base64')}` + } else { + let data = await archive.pda.readFile(pathname, 'base64') + return `data:${mime.lookup(pathname)};base64,${data}` + } +} \ No newline at end of file diff --git a/dat/daemon/folder-sync.js b/dat/daemon/folder-sync.js index ea4f1cff..b79e5eee 100644 --- a/dat/daemon/folder-sync.js +++ b/dat/daemon/folder-sync.js @@ -8,6 +8,7 @@ const EventEmitter = require('events') const pda = require('pauls-dat-api') const mkdirp = require('mkdirp') const {toAnymatchRules} = require('@beaker/datignore') +const logger = require('./logger').child({category: 'dat', subcategory: 'folder-sync'}) const {isFileNameBinary, isFileContentBinary} = require('../../lib/mime') const lock = require('../../lib/lock') const scopedFSes = require('../../lib/scoped-fses') @@ -85,7 +86,7 @@ const queueSyncEvent = exports.queueSyncEvent = function (archive, {toFolder, to } // ignore if currently syncing - if (archive.syncEventQueue.isSyncing) return console.log('already syncing, ignored') + if (archive.syncEventQueue.isSyncing) return logger.silly('Already syncing, ignored') // debounce the handler if (archive.syncEventQueue.timeout) { @@ -101,7 +102,7 @@ const queueSyncEvent = exports.queueSyncEvent = function (archive, {toFolder, to // lock archive.syncEventQueue.isSyncing = true - console.log('ok timed out, beginning sync', {toArchive, toFolder}) + logger.silly('Ok timed out, beginning sync', {details: {toArchive, toFolder}}) try { let st = await stat(fs, localSyncPath) @@ -109,14 +110,14 @@ const queueSyncEvent = exports.queueSyncEvent = function (archive, {toFolder, to // folder has been removed archive.stopWatchingLocalFolder() archive.stopWatchingLocalFolder = null - console.error('Local sync folder not found, aborting watch', localSyncPath) + logger.warn('Local sync folder not found, aborting watch', {details: {path: localSyncPath}}) return } // sync with priority given to the local folder if (toArchive) await syncFolderToArchive(archive, 
{localSyncPath, shallow: false}) else if (toFolder) await syncArchiveToFolder(archive, {localSyncPath, shallow: false}) } catch (e) { - console.error('Error syncing folder', localSyncPath, e) + logger.error('Error syncing folder', {details: {path: localSyncPath, error: e.toString()}}) if (e.name === 'CycleError') { events.emit('error', archive.key, e) } @@ -132,8 +133,6 @@ function newQueueObj () { // attach/detach a watcher on the local folder and sync it to the dat exports.configureFolderToArchiveWatcher = async function (archive) { - console.log('configureFolderToArchiveWatcher()', archive.localSyncSettings, !!archive.stopWatchingLocalFolder) - // HACKish // it's possible that configureFolderToArchiveWatcher() could be called multiple times in sequence // (for instance because of multiple settings changes) @@ -168,6 +167,8 @@ exports.configureFolderToArchiveWatcher = async function (archive) { // = if (archive.localSyncSettings) { + logger.silly('Configuring archive sync', {details: {key: archive.key.toString('hex'), settings: archive.localSyncSettings}}) + // create diff cache archive._compareContentCache = {} @@ -180,7 +181,7 @@ exports.configureFolderToArchiveWatcher = async function (archive) { let st = await stat(fs, archive.localSyncSettings.path) if (shouldAbort()) return if (!st) { - console.error('Local sync folder not found, aborting watch', archive.localSyncSettings.path) + logger.warn('Local sync folder not found, aborting watch', {details: {path: archive.localSyncSettings.path}}) } var scopedFS = scopedFSes.get(archive.localSyncSettings.path) @@ -199,8 +200,8 @@ exports.configureFolderToArchiveWatcher = async function (archive) { // sync up try { await mergeArchiveAndFolder(archive, archive.localSyncSettings.path) - } catch (e) { - console.error('Failed to merge local sync folder', e) + } catch (err) { + logger.error('Failed to merge local sync folder', {details: {err}}) } if (shouldAbort()) return @@ -214,7 +215,7 @@ exports.configureFolderToArchiveWatcher = async function (archive) { // B. 
maintain an in-memory copy of the datignore and keep it up-to-date, and then check at time of the event // -prf - console.log('changed detected', path) + logger.silly('Change detected', {details: {path}}) queueSyncEvent(archive, {toArchive: true}) }) } @@ -233,7 +234,7 @@ exports.configureFolderToArchiveWatcher = async function (archive) { exports.diffListing = async function (archive, opts = {}) { opts = opts || {} var localSyncPath = opts.localSyncPath || (archive.localSyncSettings && archive.localSyncSettings.path) - if (!localSyncPath) return console.log(new Error('diffListing() aborting, no localSyncPath')) // sanity check + if (!localSyncPath) return logger.warn('Sanity check failed - diffListing() aborting, no localSyncPath') var scopedFS = scopedFSes.get(localSyncPath) opts = massageDiffOpts(opts) @@ -253,7 +254,7 @@ exports.diffListing = async function (archive, opts = {}) { // diff an individual file // - filepath: string, the path of the file in the archive/folder exports.diffFile = async function (archive, filepath) { - if (!archive.localSyncSettings.path) return console.log(new Error('diffFile() aborting, no localSyncPath')) // sanity check + if (!archive.localSyncSettings.path) return logger.warn('Sanity check failed - diffFile() aborting, no localSyncPath') var scopedFS = scopedFSes.get(archive.localSyncSettings.path) filepath = path.normalize(filepath) @@ -326,7 +327,7 @@ exports.applyDatIgnoreFilter = function (archive, filepath) { // merge the dat.json in the folder and then merge files, with preference to folder files const mergeArchiveAndFolder = exports.mergeArchiveAndFolder = async function (archive, localSyncPath) { - console.log('merging archive with', localSyncPath) + logger.silly('Merging archive and folder', {details: {path: localSyncPath, key: archive.key.toString('hex')}}) const readManifest = async (fs) => { try { return await pda.readManifest(fs) } catch (e) { return {} } } @@ -338,7 +339,7 @@ const mergeArchiveAndFolder = exports.mergeArchiveAndFolder = async function (ar await sync(archive, false, {localSyncPath, shallow: false, addOnly: true}) // archive -> folder (add-only) await sync(archive, true, {localSyncPath, shallow: false}) // folder -> archive events.emit('merge:' + archive.key.toString('hex'), archive.key) - console.log('done merging archive with', localSyncPath) + logger.silly('Done merging archive and folder', {details: {path: localSyncPath, key: archive.key.toString('hex')}}) } // internal methods @@ -355,7 +356,7 @@ const mergeArchiveAndFolder = exports.mergeArchiveAndFolder = async function (ar async function sync (archive, toArchive, opts = {}) { opts = opts || {} var localSyncPath = opts.localSyncPath || (archive.localSyncSettings && archive.localSyncSettings.path) - if (!localSyncPath) return console.log(new Error('sync() aborting, no localSyncPath')) // sanity check + if (!localSyncPath) return logger.warn('Sanity check failed - sync() aborting, no localSyncPath') archive._activeSyncs = (archive._activeSyncs || 0) + 1 var release = await getArchiveSyncLock(archive) @@ -381,7 +382,7 @@ async function sync (archive, toArchive, opts = {}) { if (opts.addOnly) { diff = diff.filter(d => d.change === 'add') } - console.log('syncing to', toArchive ? 'archive' : 'folder', diff) // DEBUG + logger.silly(`Syncing to ${toArchive ? 
'archive' : 'folder'}`, {details: {key: archive.key.toString('hex'), path: localSyncPath}}) // sync data await dft.applyRight(left, right, diff) @@ -391,10 +392,7 @@ async function sync (archive, toArchive, opts = {}) { // decrement active syncs archive._activeSyncs-- } catch (err) { - console.error('Failed to sync archive to local path') - console.error('- Archive:', archive.key.toString('hex')) - console.error('- Path:', localSyncPath) - console.error('- Error:', err) + logger.error('Failed to sync archive to local path', {details: {key: archive.key.toString('hex'), path: localSyncPath, err: err.toString()}}) } finally { release() } diff --git a/dat/daemon/index.js b/dat/daemon/index.js index 0aad5c42..c7ef8f9f 100644 --- a/dat/daemon/index.js +++ b/dat/daemon/index.js @@ -23,6 +23,8 @@ const discoverySwarm = require('discovery-swarm') const networkSpeed = require('hyperdrive-network-speed') const {ThrottleGroup} = require('stream-throttle') +const baseLogger = require('./logger') +const logger = baseLogger.child({category: 'dat', subcategory: 'daemon'}) const datStorage = require('./storage') const folderSync = require('./folder-sync') const {addArchiveSwarmLogging} = require('./logging-utils') @@ -38,7 +40,6 @@ var datPath var networkId = crypto.randomBytes(32) var archives = {} // in-memory cache of archive objects. key -> archive var archivesByDKey = {} // same, but discoveryKey -> archive -var archiveLoadPromises = {} // key -> promise var daemonEvents = new EventEmitter() var debugEvents = new EventEmitter() var debugLogFile @@ -93,7 +94,9 @@ exports.setup = async function ({rpcAPI, logfilePath}) { addArchiveSwarmLogging({archivesByDKey, log, archiveSwarm}) archiveSwarm.once('error', () => archiveSwarm.listen(0)) archiveSwarm.listen(DAT_SWARM_PORT) - archiveSwarm.on('error', error => log(null, {event: 'swarm-error', message: error.toString()})) + archiveSwarm.on('error', error => log(null, {event: 'swarm-error', message: error.toString()}, 'warn')) + + logger.info('Initialized dat daemon') } // rpc api @@ -103,6 +106,10 @@ const RPC_API = { // setup & config // = + /** + * @method + * @param {*} opts + */ async setup (opts) { datPath = opts.datPath folderSync.setup(opts) @@ -110,6 +117,7 @@ const RPC_API = { // up/down are in MB/s async setBandwidthThrottle ({up, down}) { + logger.info('Setting bandwidth throttle', {details: {up, down}}) if (typeof up !== 'undefined') { upThrottleGroup = up ? 
new ThrottleGroup({rate: up * 1e6}) : null } @@ -121,6 +129,10 @@ const RPC_API = { // event streams & debug // = + createLogStream () { + return emitStream(baseLogger.events) + }, + createEventStream () { return emitStream(daemonEvents) }, @@ -170,6 +182,12 @@ const RPC_API = { } }, + async getArchiveNetworkStats (key) { + var archive = getArchive(key) + if (!archive) return {} + return archive.networkStats + }, + updateSizeTracking, async loadArchive (opts) { @@ -179,8 +197,10 @@ const RPC_API = { metaPath, userSettings } = opts + var logDetails = {key: key.toString('hex')} // create the archive instance + logger.verbose('Loading archive', {details: logDetails}) var archive = hyperdrive(datStorage.create(metaPath), key, { sparse: true, secretKey @@ -190,7 +210,7 @@ const RPC_API = { }) archive.on('error', err => { let k = key.toString('hex') - log(k, {event: 'archive-error', message: err.toString()}) + log(k, {event: 'archive-error', message: err.toString()}, 'warn') console.error('Error in archive', k, err) }) archive.metadata.on('peer-add', () => onNetworkChanged(archive)) @@ -206,6 +226,7 @@ const RPC_API = { else resolve() }) }) + logger.silly('Archive ready', {details: {key: logDetails}}) await updateSizeTracking(archive) // attach extensions @@ -264,6 +285,7 @@ const RPC_API = { if (!archive) { return } + logger.verbose('Unloading archive', {details: {key}}) // shutdown archive leaveSwarm(key) @@ -287,7 +309,6 @@ const RPC_API = { // = callArchiveAsyncMethod (key, version, method, ...args) { - var cb = args.slice(-1)[0] var checkout = getArchiveCheckout(key, version) checkout[method](...args) }, @@ -317,6 +338,7 @@ const RPC_API = { if (!archive || archive.writable) { return // abort, only clear the content cache of downloaded archives } + logger.info('Clearing archive file cache', {details: {key: key.toString('hex')}}) // clear the cache await new Promise((resolve, reject) => { @@ -331,6 +353,16 @@ const RPC_API = { configureAutoDownload(archive, userSettings) }, + async exportFilesystemToArchive (opts) { + opts.dstArchive = getArchive(opts.dstArchive) + return pda.exportFilesystemToArchive(opts) + }, + + async exportArchiveToFilesystem (opts) { + opts.srcArchive = getArchive(opts.srcArchive) + return pda.exportFilesystemToArchive(opts) + }, + async exportArchiveToArchive (opts) { opts.srcArchive = getArchive(opts.srcArchive) opts.dstArchive = getArchive(opts.dstArchive) @@ -444,7 +476,11 @@ function getArchiveCheckout (key, version) { async function updateSizeTracking (archive) { archive = getArchive(archive) - archive.size = await pda.readSize(archive, '/') + try { + archive.size = await pda.readSize(archive, '/') + } catch (e) { + archive.size = 0 + } return archive.size } @@ -582,7 +618,7 @@ function createReplicationStream (info) { peer: `${info.host}:${info.port}`, connectionType: info.type, message: err.toString() - }) + }, 'warn') }) return stream @@ -638,7 +674,7 @@ function getInternalLocalSyncPath (archiveOrKey) { // helpers // = -function log (key, data) { +function log (key, data, logLevel = false) { var keys = Array.isArray(key) ? key : [key] keys.forEach(k => { let data2 = Object.assign(data, {archiveKey: k}) @@ -648,4 +684,8 @@ function log (key, data) { if (keys[0]) { debugLogFile.append(keys[0] + JSON.stringify(data) + '\n') } -} + if (logLevel) { + let message = data.event + (data.message ? 
`: ${data.message}` : '') + logger.log(logLevel, message, {details: {key, peer: data.peer}}) + } +} \ No newline at end of file diff --git a/dat/daemon/logger.js b/dat/daemon/logger.js new file mode 100644 index 00000000..e5b07c5e --- /dev/null +++ b/dat/daemon/logger.js @@ -0,0 +1,32 @@ +/** + * This logger is just an event-emitter wrapper which streams to the main process. + * The main process then folds the events into the main logger. + */ + +const Emitter = require('events') + +// globals +// = + +const events = new Emitter() + +// exported api +// = + +exports.events = events + +exports.child = (meta = {}) => { + const log = (level, message, etc = {}) => { + Object.assign(etc, meta) + events.emit('log', {level, message, etc}) + } + return { + log, + error: (...args) => log('error', ...args), + warn: (...args) => log('warn', ...args), + info: (...args) => log('info', ...args), + verbose: (...args) => log('verbose', ...args), + debug: (...args) => log('debug', ...args), + silly: (...args) => log('silly', ...args) + } +} \ No newline at end of file diff --git a/dat/daemon/logging-utils.js b/dat/daemon/logging-utils.js index 17b8b984..df028362 100644 --- a/dat/daemon/logging-utils.js +++ b/dat/daemon/logging-utils.js @@ -54,7 +54,7 @@ const addArchiveSwarmLogging = exports.addArchiveSwarmLogging = function ({archi log(datEncoding.toStr(archive.key), { event: 'peer-found', peer: `${peer.address || peer.host}:${peer.port}` - }) + }, 'silly') }) archiveSwarm.on('peer-banned', (peer, details) => { let archive = archivesByDKey[findFullDiscoveryKey(archivesByDKey, peer.channel)] @@ -63,7 +63,7 @@ const addArchiveSwarmLogging = exports.addArchiveSwarmLogging = function ({archi event: 'peer-banned', peer: `${peer.address || peer.host}:${peer.port}`, message: peerBannedReason(details.reason) - }) + }, 'info') }) archiveSwarm.on('peer-rejected', (peer, details) => { let archive = archivesByDKey[findFullDiscoveryKey(archivesByDKey, peer.channel)] @@ -72,7 +72,7 @@ const addArchiveSwarmLogging = exports.addArchiveSwarmLogging = function ({archi event: 'peer-rejected', peer: `${peer.address || peer.host}:${peer.port}`, message: peerRejectedReason(details.reason) - }) + }, 'silly') }) archiveSwarm.on('drop', (peer) => { let archive = archivesByDKey[findFullDiscoveryKey(archivesByDKey, peer.channel)] @@ -81,7 +81,7 @@ const addArchiveSwarmLogging = exports.addArchiveSwarmLogging = function ({archi event: 'peer-dropped', peer: `${peer.address || peer.host}:${peer.port}`, message: 'Too many failed connection attempts' - }) + }, 'silly') }) archiveSwarm.on('connecting', (peer) => { let archive = archivesByDKey[findFullDiscoveryKey(archivesByDKey, peer.channel)] @@ -89,7 +89,7 @@ const addArchiveSwarmLogging = exports.addArchiveSwarmLogging = function ({archi log(datEncoding.toStr(archive.key), { event: 'connecting', peer: `${peer.address || peer.host}:${peer.port}` - }) + }, 'debug') }) archiveSwarm.on('connect-failed', (peer, details) => { let archive = archivesByDKey[findFullDiscoveryKey(archivesByDKey, peer.channel)] @@ -98,7 +98,7 @@ const addArchiveSwarmLogging = exports.addArchiveSwarmLogging = function ({archi event: 'connect-failed', peer: `${peer.address || peer.host}:${peer.port}`, message: connectFailedMessage(details) - }) + }, 'debug') }) archiveSwarm.on('handshaking', (conn, peer) => { let archive = archivesByDKey[findFullDiscoveryKey(archivesByDKey, peer.channel)] @@ -109,7 +109,7 @@ const addArchiveSwarmLogging = exports.addArchiveSwarmLogging = function ({archi connectionId: 
conn._debugId, connectionType: peer.type, ts: 0 - }) + }, 'silly') }) archiveSwarm.on('handshake-timeout', (conn, peer) => { let archive = archivesByDKey[findFullDiscoveryKey(archivesByDKey, peer.channel)] @@ -120,7 +120,7 @@ const addArchiveSwarmLogging = exports.addArchiveSwarmLogging = function ({archi connectionId: conn._debugId, connectionType: peer.type, ts: Date.now() - conn._debugStartTime - }) + }, 'silly') }) archiveSwarm.on('connection', (conn, peer) => { let archive = archivesByDKey[findFullDiscoveryKey(archivesByDKey, peer.channel)] @@ -132,7 +132,7 @@ const addArchiveSwarmLogging = exports.addArchiveSwarmLogging = function ({archi connectionType: peer.type, ts: Date.now() - conn._debugStartTime, message: 'Starting replication' - }) + }, 'debug') }) archiveSwarm.on('redundant-connection', (conn, peer) => { let archive = archivesByDKey[findFullDiscoveryKey(archivesByDKey, peer.channel)] @@ -143,7 +143,7 @@ const addArchiveSwarmLogging = exports.addArchiveSwarmLogging = function ({archi connectionId: conn._debugId, connectionType: peer.type, ts: Date.now() - conn._debugStartTime - }) + }, 'silly') }) archiveSwarm.on('connection-closed', (conn, peer) => { let archive = archivesByDKey[findFullDiscoveryKey(archivesByDKey, peer.channel)] @@ -154,7 +154,7 @@ const addArchiveSwarmLogging = exports.addArchiveSwarmLogging = function ({archi connectionId: conn._debugId, connectionType: peer.type, ts: Date.now() - conn._debugStartTime - }) + }, 'debug') }) } diff --git a/dat/daemon/manifest.js b/dat/daemon/manifest.js index 327d4514..09a78f27 100644 --- a/dat/daemon/manifest.js +++ b/dat/daemon/manifest.js @@ -1,3 +1,101 @@ +/** + * @typedef {import('../../dbs/archives').LibraryArchiveUserSettings} LibraryArchiveUserSettings + * + * @typedef {Object} DatDaemon + * @prop {function(DatDaemonSetupOpts): Promise} setup + * @prop {function(DatDaemonThrottleOpts): Promise} setBandwidthThrottle + * @prop {function(): NodeJS.ReadableStream} createLogStream + * @prop {function(): NodeJS.ReadableStream} createEventStream + * @prop {function(): NodeJS.ReadableStream} createDebugStream + * @prop {function(string): Promise} getDebugLog + * @prop {function(string | Buffer, LibraryArchiveUserSettings): Promise} configureArchive + * @prop {function(string | Buffer): Promise} getArchiveInfo + * @prop {function(string | Buffer): Promise} getArchiveNetworkStats + * @prop {function(string | Buffer): Promise} updateSizeTracking + * @prop {function(DatDaemonLoadArchiveOpts): Promise} loadArchive + * @prop {function(string): Promise} unloadArchive + * @prop {function(any=, ...any=): void} callArchiveAsyncMethod + * @prop {function(any=, ...any=): NodeJS.ReadableStream} callArchiveReadStreamMethod + * @prop {function(any=, ...any=): NodeJS.WritableStream} callArchiveWriteStreamMethod + * @prop {function(any=, ...any=): Promise} callArchivePDAPromiseMethod + * @prop {function(any=, ...any=): NodeJS.ReadableStream} callArchivePDAReadStreamMethod + * @prop {function(string | Buffer, LibraryArchiveUserSettings): Promise} clearFileCache + * @prop {function(Object): Promise} exportFilesystemToArchive + * @prop {function(Object): Promise} exportArchiveToFilesystem + * @prop {function(Object): Promise} exportArchiveToArchive + * @prop {function(string): Promise} fs_assertSafePath + * @prop {function(string | Buffer): Promise} fs_ensureSyncFinished + * @prop {function(string | Buffer, [DatDaemonFSDiffListingOpts]): Promise} fs_diffListing + * @prop {function(string | Buffer, string): Promise} fs_diffFile + * @prop 
{function(string | Buffer, DatDaemonFSQueueSyncEventOpts): Promise} fe_queueSyncEvent + * @prop {function(string | Buffer, [DatDaemonFSDiffListingOpts]): Promise} fs_syncFolderToArchive + * @prop {function(string | Buffer, [DatDaemonFSDiffListingOpts]): Promise} fs_syncArchiveToFolder + * @prop {function(any=, ...any=): Promise} ext_listPeers + * @prop {function(any=, ...any=): Promise} ext_getPeer + * @prop {function(any=, ...any=): Promise} ext_broadcastEphemeralMessage + * @prop {function(any=, ...any=): Promise} ext_sendEphemeralMessage + * @prop {function(any=, ...any=): Promise} ext_getSessionData + * @prop {function(any=, ...any=): Promise} ext_setSessionData + * @prop {function(any=, ...any=): NodeJS.ReadableStream} ext_createDatPeersStream + * NOTE: the ext_* methods are temporary so Im not going to bother documenting their types + * + * @typedef {Object} DatDaemonSetupOpts + * @prop {string} datPath + * @prop {string[]} disallowedSavePaths + * + * @typedef {Object} DatDaemonThrottleOpts + * @prop {number} [up] + * @prop {number} [down] + * + * @typedef {Object} DatDaemonLoadArchiveOpts + * @prop {string | Buffer} key + * @prop {Buffer} [secretKey] + * @prop {string} metaPath + * @prop {LibraryArchiveUserSettings} userSettings + * + * @typedef {Object} DatDaemonFSDiffListingOpts + * @prop {boolean} [shallow] - Dont descend into changed folders (default true) + * @prop {boolean} [compareContent] - Compare the actual content (default true) + * @prop {string[]} [paths] - A whitelist of files to compare + * @prop {string} [localSyncPath] - Override the archive localSyncPath + * @prop {boolean} [addOnly] - Dont modify or remove any files (default false) + * + * @typedef {Object} DatDaemonFSQueueSyncEventOpts + * @prop {boolean} toFolder + * @prop {boolean} toArchive + * + * @typedef {Object} DatDaemonLoadedArchiveInfo + * @prop {Buffer} discoveryKey + * @prop {boolean} writable + * + * @typedef {never} DatDaemonPeerInfo + * TODO- what's in here? + * + * @typedef {Object} DatDaemonPeerHistory + * @prop {number} ts + * @prop {number} peers + * + * @typedef {Object} DatDaemonNetworkStats + * @prop {number} downloadSpeed + * @prop {number} uploadSpeed + * @prop {number} downloadTotal + * @prop {number} uploadTotal + * + * @typedef {Object} DatDaemonArchiveInfo + * @prop {number} version + * @prop {number} size + * @prop {number} peers + * @prop {DatDaemonPeerInfo[]} peerInfo + * @prop {DatDaemonPeerHistory[]} peerHistory + * @prop {DatDaemonNetworkStats} networkStats + * + * @typedef {never} DatDaemonFSListingDiff + * TODO - what's in here? + * + * @typedef {never} DatDaemonFSFileDiff + * TODO - what's in here? 
+ */ + module.exports = { // setup & config @@ -6,6 +104,7 @@ module.exports = { // event streams & debug + createLogStream: 'readable', createEventStream: 'readable', createDebugStream: 'readable', getDebugLog: 'promise', @@ -14,6 +113,7 @@ module.exports = { configureArchive: 'promise', getArchiveInfo: 'promise', + getArchiveNetworkStats: 'promise', updateSizeTracking: 'promise', loadArchive: 'promise', unloadArchive: 'promise', @@ -26,6 +126,8 @@ module.exports = { callArchivePDAPromiseMethod: 'promise', callArchivePDAReadStreamMethod: 'readable', clearFileCache: 'promise', + exportFilesystemToArchive: 'async', + exportArchiveToFilesystem: 'async', exportArchiveToArchive: 'async', // folder sync diff --git a/dat/daemon/storage.js b/dat/daemon/storage.js index 6bf1ec2f..a2215846 100644 --- a/dat/daemon/storage.js +++ b/dat/daemon/storage.js @@ -3,6 +3,7 @@ const fs = require('fs') const detectSparseFiles = require('supports-sparse-files') const raf = require('random-access-file') const raif = require('random-access-indexed-file') +const logger = require('./logger').child({category: 'dat', subcategory: 'storage'}) // globals // = @@ -22,7 +23,7 @@ exports.setup = async function () { detectSparseFiles(function (err, yes) { supportsSparseFiles = yes if (!yes) { - console.log('Sparse-file support not detected. Falling back to indexed data files.') + logger.info('Sparse-file support not detected. Falling back to indexed data files.') } resolve() }) diff --git a/dat/debugging.js b/dat/debugging.js index 1daf1634..b7dc0351 100644 --- a/dat/debugging.js +++ b/dat/debugging.js @@ -1,6 +1,9 @@ const {getActiveArchives} = require('./library') const datDns = require('./dns') +/** + * @returns {string} + */ exports.archivesDebugPage = function () { var archives = getActiveArchives() return ` @@ -24,6 +27,9 @@ exports.archivesDebugPage = function () { ` } +/** + * @returns {string} + */ exports.datDnsCachePage = function () { var cache = datDns.listCache() return ` @@ -41,6 +47,9 @@ exports.datDnsCachePage = function () { ` } +/** + * @returns {string} + */ exports.datDnsCacheJS = function () { return ` document.querySelector('button').addEventListener('click', clear) diff --git a/dat/directory-listing-page.js b/dat/directory-listing-page.js index 89ccb3de..18a75881 100644 --- a/dat/directory-listing-page.js +++ b/dat/directory-listing-page.js @@ -2,6 +2,8 @@ const {pluralize, makeSafe} = require('../lib/strings') const {stat, readdir} = require('pauls-dat-api') const {join, relative} = require('path') +/** @typedef {import('./library').InternalDatArchive} InternalDatArchive */ + const styles = `` +/** + * @prop {InternalDatArchive} archive + * @prop {string} dirPath + * @prop {string} webRoot + * @returns {Promise} + */ module.exports = async function renderDirectoryListingPage (archive, dirPath, webRoot) { // handle the webroot webRoot = webRoot || '/' @@ -31,14 +39,14 @@ module.exports = async function renderDirectoryListingPage (archive, dirPath, we try { names = await readdir(archive, realPath(dirPath)) } catch (e) {} // stat each file - var entries = await Promise.all(names.map(async (name) => { + var entries = /** @type any[] */(await Promise.all(names.map(async (name) => { var entry var entryPath = join(dirPath, name) try { entry = await stat(archive, realPath(entryPath)) } catch (e) { return false } entry.path = webrootPath(entryPath) entry.name = name return entry - })) + }))) entries = entries.filter(Boolean) // sort the listing @@ -58,7 +66,7 @@ module.exports = async function 
renderDirectoryListingPage (archive, dirPath, we // render entries var totalFiles = 0 - entries = entries.map(entry => { + var entriesStr = entries.map(entry => { totalFiles++ var url = makeSafe(entry.path) if (!url.startsWith('/')) url = '/' + url // all urls should have a leading slash @@ -71,5 +79,5 @@ module.exports = async function renderDirectoryListingPage (archive, dirPath, we var summary = `
${totalFiles} ${pluralize(totalFiles, 'file')}
` // render final - return '' + styles + updog + entries + summary + return '' + styles + updog + entriesStr + summary } diff --git a/dat/dns.js b/dat/dns.js index eb911c59..b75e6f57 100644 --- a/dat/dns.js +++ b/dat/dns.js @@ -1,16 +1,30 @@ +const parseDatURL = require('parse-dat-url') const {InvalidDomainName} = require('beaker-error-constants') -const sitedataDb = require('../dbs/sitedata') +const datDnsDb = require('../dbs/dat-dns') +const library = require('./library') const {DAT_HASH_REGEX} = require('../lib/const') +const logger = require('../logger').child({category: 'dat', subcategory: 'dns'}) + +const DNS_PROVIDERS = [['cloudflare-dns.com', '/dns-query'], ['dns.google.com', '/resolve']] +const DNS_PROVIDER = DNS_PROVIDERS[Math.random() > 0.5 ? 1 : 0] +logger.info(`Using ${DNS_PROVIDER[0]} to resolve DNS lookups`) // instantate a dns cache and export it const datDns = require('dat-dns')({ - persistentCache: {read, write} + persistentCache: {read, write}, + dnsHost: DNS_PROVIDER[0], + dnsPath: DNS_PROVIDER[1] }) module.exports = datDns +// hook up log events +datDns.on('resolved', details => logger.debug('Resolved', {details})) +datDns.on('failed', details => logger.debug('Failed lookup', {details})) +datDns.on('cache-flushed', details => logger.debug('Cache flushed')) + // wrap resolveName() with a better error const resolveName = datDns.resolveName -datDns.resolveName = function () { +datDns.resolveName = async function (name, opts, cb) { return resolveName.apply(datDns, arguments) .catch(_ => { throw new InvalidDomainName() @@ -18,13 +32,13 @@ datDns.resolveName = function () { } // persistent cache methods -const sitedataDbOpts = {dontExtractOrigin: true} async function read (name, err) { - var key = await sitedataDb.get('dat:' + name, 'dat-key', sitedataDbOpts) - if (!key) throw err - return key + // check the cache + var record = await datDnsDb.getCurrentByName(name) + if (!record) throw err + return record.key } async function write (name, key) { if (DAT_HASH_REGEX.test(name)) return // dont write for raw urls - await sitedataDb.set('dat:' + name, 'dat-key', key, sitedataDbOpts) + await library.confirmDomain(key) } diff --git a/dat/garbage-collector.js b/dat/garbage-collector.js index 21002150..1c28db8e 100644 --- a/dat/garbage-collector.js +++ b/dat/garbage-collector.js @@ -1,10 +1,21 @@ +const ms = require('ms') const archivesDb = require('../dbs/archives') const datLibrary = require('./library') const { DAT_GC_FIRST_COLLECT_WAIT, DAT_GC_REGULAR_COLLECT_WAIT } = require('../lib/const') -const debug = require('../lib/debug-logger').debugLogger('datgc') +const logger = require('../logger').child({category: 'dat', subcategory: 'garbage-collector'}) + +// typedefs +// = + +/** + * @typedef {Object} CollectResult + * @prop {number} totalBytes + * @prop {number} totalArchives + * @prop {number} skippedArchives + */ // globals // = @@ -18,7 +29,15 @@ exports.setup = function () { schedule(DAT_GC_FIRST_COLLECT_WAIT) } +/** + * @param {Object} [opts] + * @param {number} [opts.olderThan] + * @param {boolean} [opts.isOwner] + * @returns {Promise} + */ const collect = exports.collect = async function ({olderThan, isOwner} = {}) { + logger.info('Running GC') + // clear any scheduled GC if (nextGCTimeout) { clearTimeout(nextGCTimeout) @@ -32,7 +51,9 @@ const collect = exports.collect = async function ({olderThan, isOwner} = {}) { // first unsave expired archives var expiredArchives = await archivesDb.listExpiredArchives() - debug('GC unsaving %d expired archives', 
expiredArchives.length) + if (expiredArchives.length) { + logger.info(`Unsaving ${expiredArchives.length} expired archives`) + } var promises = [] for (let i = 0; i < expiredArchives.length; i++) { promises.push(archivesDb.setUserSettings(0, expiredArchives[i].key, {isSaved: false})) @@ -41,17 +62,20 @@ const collect = exports.collect = async function ({olderThan, isOwner} = {}) { // now GC old archives var unusedArchives = await archivesDb.listGarbageCollectableArchives({olderThan, isOwner}) - debug('GC cleaning out %d unused archives', unusedArchives.length) - debug(unusedArchives) + if (unusedArchives.length) { + logger.info(`Cleaning out ${unusedArchives.length} unused archives`) + logger.silly('Archives:', {urls: unusedArchives.map(a => a.key)}) + } for (let i = 0; i < unusedArchives.length; i++) { await datLibrary.unloadArchive(unusedArchives[i].key) totalBytes += await archivesDb.deleteArchive(unusedArchives[i].key) } - debug('GC completed in %d ms', Date.now() - startTime) + logger.debug(`GC completed in ${Date.now() - startTime} ms`) // schedule the next GC schedule(DAT_GC_REGULAR_COLLECT_WAIT) + logger.debug(`Scheduling next run to happen in ${ms(DAT_GC_REGULAR_COLLECT_WAIT)}`) // return stats return {totalBytes, totalArchives: unusedArchives.length - skippedArchives, skippedArchives} @@ -60,6 +84,9 @@ const collect = exports.collect = async function ({olderThan, isOwner} = {}) { // helpers // = +/** + * @param {number} time + */ function schedule (time) { nextGCTimeout = setTimeout(collect, time) nextGCTimeout.unref() diff --git a/dat/index.js b/dat/index.js index 7b7ac20a..512518b3 100644 --- a/dat/index.js +++ b/dat/index.js @@ -1,4 +1,5 @@ module.exports = { + assets: require('./assets'), debug: require('./debugging'), dns: require('./dns'), garbageCollector: require('./garbage-collector'), diff --git a/dat/library.js b/dat/library.js index e17dfb38..6a656315 100644 --- a/dat/library.js +++ b/dat/library.js @@ -2,19 +2,22 @@ const emitStream = require('emit-stream') const EventEmitter = require('events') const datEncoding = require('dat-encoding') const pify = require('pify') -const pda = require('pauls-dat-api') const signatures = require('sodium-signatures') const parseDatURL = require('parse-dat-url') -const debounce = require('lodash.debounce') +const _debounce = require('lodash.debounce') const mkdirp = require('mkdirp') +const baseLogger = require('../logger').get() +const logger = baseLogger.child({category: 'dat', subcategory: 'library'}) // dbs const siteData = require('../dbs/sitedata') const settingsDb = require('../dbs/settings') const archivesDb = require('../dbs/archives') +const datDnsDb = require('../dbs/dat-dns') // dat modules const datGC = require('./garbage-collector') +const datAssets = require('./assets') // constants // = @@ -23,9 +26,56 @@ const { DAT_HASH_REGEX, DAT_PRESERVED_FIELDS_ON_FORK } = require('../lib/const') -const {InvalidURLError} = require('beaker-error-constants') +const {InvalidURLError, TimeoutError} = require('beaker-error-constants') const DAT_DAEMON_MANIFEST = require('./daemon/manifest') +// typedefs +// = + +/** + * @typedef {import('./daemon/manifest').DatDaemon} DatDaemon + * @typedef {import('../dbs/archives').LibraryArchiveRecord} LibraryArchiveRecord + * + * @typedef {Object} InternalDatArchive + * @prop {Buffer} key + * @prop {string} url + * @prop {string?} domain + * @prop {Buffer} discoveryKey + * @prop {boolean} writable + * @prop {function(Function): void} ready + * @prop {function(Object, Function=): void} download + * 
@prop {function(Object=): NodeJS.ReadableStream} history + * @prop {function(Object=): NodeJS.ReadableStream} createReadStream + * @prop {function(string, Object=, Function=): any} readFile + * @prop {function(number, Object=): NodeJS.ReadableStream} createDiffStream + * @prop {function(string, Object=): NodeJS.WritableStream} createWriteStream + * @prop {function(string, any, Object=, Function=): void} writeFile + * @prop {function(string, Function=): void} unlink + * @prop {function(string, Object=, Function=): void} mkdir + * @prop {function(string, Function=): void} rmdir + * @prop {function(string, Object=, Function=): void} readdir + * @prop {function(string, Object=, Function=): void} stat + * @prop {function(string, Object=, Function=): void} lstat + * @prop {function(string, Object=, Function=): void} access + * @prop {Object} pda + * @prop {function(string): Promise} pda.stat + * @prop {function(string, Object=): Promise} pda.readFile + * @prop {function(string, Object=): Promise>} pda.readdir + * @prop {function(string): Promise} pda.readSize + * @prop {function(string, any, Object=): Promise} pda.writeFile + * @prop {function(string): Promise} pda.mkdir + * @prop {function(string, string): Promise} pda.copy + * @prop {function(string, string): Promise} pda.rename + * @prop {function(string): Promise} pda.unlink + * @prop {function(string, Object=): Promise} pda.rmdir + * @prop {function(string=): Promise} pda.download + * @prop {function(string=): NodeJS.ReadableStream} pda.watch + * @prop {function(): NodeJS.ReadableStream} pda.createNetworkActivityStream + * @prop {function(): Promise} pda.readManifest + * @prop {function(Object): Promise} pda.writeManifest + * @prop {function(Object): Promise} pda.updateManifest + */ + // globals // = @@ -33,17 +83,30 @@ var archives = {} // in-memory cache of archive objects. 
key -> archive var archiveLoadPromises = {} // key -> promise var archivesEvents = new EventEmitter() var daemonEvents -var daemon +var daemon = /** @type DatDaemon */({}) // exported API // = +/** + * @param {Object} opts + * @param {Object} opts.rpcAPI + * @param {Object} opts.datDaemonProcess + * @param {string[]} opts.disallowedSavePaths + * @return {Promise} + */ exports.setup = async function setup ({rpcAPI, datDaemonProcess, disallowedSavePaths}) { // connect to the daemon daemon = rpcAPI.importAPI('dat-daemon', DAT_DAEMON_MANIFEST, {proc: datDaemonProcess, timeout: false}) daemon.setup({disallowedSavePaths, datPath: archivesDb.getDatPath()}) daemonEvents = emitStream(daemon.createEventStream()) + // pipe the log + var daemonLogEvents = emitStream(daemon.createLogStream()) + daemonLogEvents.on('log', ({level, message, etc}) => { + baseLogger.log(level, message, etc) + }) + // wire up event handlers archivesDb.on('update:archive-user-settings', async (key, userSettings, newUserSettings) => { // emit event @@ -70,6 +133,12 @@ exports.setup = async function setup ({rpcAPI, datDaemonProcess, disallowedSaveP // update the download based on these settings daemon.configureArchive(key, userSettings) }) + datDnsDb.on('update', ({key, name}) => { + var archive = getArchive(key) + if (archive) { + archive.domain = name + } + }) // re-export events daemonEvents.on('network-changed', evt => archivesEvents.emit('network-changed', evt)) @@ -88,14 +157,21 @@ exports.setup = async function setup ({rpcAPI, datDaemonProcess, disallowedSaveP // start the GC manager datGC.setup() + logger.info('Initialized dat library') } +/** + * @returns {DatDaemon} + */ exports.getDaemon = () => daemon +/** + * @returns {Promise} + */ exports.loadSavedArchives = function () { // load and configure all saved archives return archivesDb.query(0, {isSaved: true}).then( - async (archives) => { + async (/** @type LibraryArchiveRecord[] */archives) => { // HACK // load the archives one at a time and give 5 seconds between each // why: the purpose of loading saved archives is to seed them @@ -113,14 +189,24 @@ exports.loadSavedArchives = function () { ) } +/** + * @returns {NodeJS.ReadableStream} + */ exports.createEventStream = function createEventStream () { - return emitStream(archivesEvents) + return emitStream.toStream(archivesEvents) } +/** + * @param {string} key + * @returns {Promise} + */ exports.getDebugLog = function getDebugLog (key) { return daemon.getDebugLog(key) } +/** + * @returns {NodeJS.ReadableStream} + */ exports.createDebugStream = function createDebugStream () { return daemon.createDebugStream() } @@ -133,6 +219,9 @@ const pullLatestArchiveMeta = exports.pullLatestArchiveMeta = async function pul // ready() just in case (we need .blocks) await pify(archive.ready.bind(archive))() + // trigger DNS update + confirmDomain(key) + // read the archive meta and size on disk var [manifest, oldMeta, size] = await Promise.all([ archive.pda.readManifest().catch(_ => {}), @@ -187,16 +276,26 @@ const createNewArchive = exports.createNewArchive = async function createNewArch return `dat://${key}/` } -exports.forkArchive = async function forkArchive (srcArchiveUrl, manifest = {}, settings = false) { +exports.forkArchive = async function forkArchive (srcArchiveUrl, manifest = {}, settings = undefined) { srcArchiveUrl = fromKeyToURL(srcArchiveUrl) - // get the old archive - var srcArchive = getArchive(srcArchiveUrl) - if (!srcArchive) { - throw new Error('Invalid archive key') + // get the source archive + var 
srcArchive + var downloadRes = await Promise.race([ + (async function () { + srcArchive = await getOrLoadArchive(srcArchiveUrl) + if (!srcArchive) { + throw new Error('Invalid archive key') + } + return srcArchive.pda.download('/') + })(), + new Promise(r => setTimeout(() => r('timeout'), 60e3)) + ]) + if (downloadRes === 'timeout') { + throw new TimeoutError('Timed out while downloading source archive') } - // fetch old archive meta + // fetch source archive meta var srcManifest = await srcArchive.pda.readManifest().catch(_ => {}) srcManifest = srcManifest || {} @@ -245,7 +344,7 @@ const loadArchive = exports.loadArchive = async function loadArchive (key, userS if (key) { if (!Buffer.isBuffer(key)) { // existing dat - key = fromURLToKey(key) + key = await fromURLToKey(key, true) if (!DAT_HASH_REGEX.test(key)) { throw new InvalidURLError() } @@ -307,16 +406,22 @@ async function loadArchiveInner (key, secretKey, userSettings = null) { // create the archive proxy instance var archive = createArchiveProxy(key, undefined, archiveInfo) + // fetch dns name if known + let dnsRecord = await datDnsDb.getCurrentByKey(datEncoding.toStr(key)) + archive.domain = dnsRecord ? dnsRecord.name : undefined + // update db archivesDb.touch(key).catch(err => console.error('Failed to update lastAccessTime for archive', key, err)) await pullLatestArchiveMeta(archive) + datAssets.update(archive) // wire up events - archive.pullLatestArchiveMeta = debounce(opts => pullLatestArchiveMeta(archive, opts), 1e3) + archive.pullLatestArchiveMeta = _debounce(opts => pullLatestArchiveMeta(archive, opts), 1e3) archive.fileActStream = archive.pda.watch() archive.fileActStream.on('data', ([event, {path}]) => { if (event === 'changed') { archive.pullLatestArchiveMeta({updateMTime: true}) + datAssets.update(archive, [path]) } }) @@ -342,11 +447,13 @@ exports.getArchiveCheckout = function getArchiveCheckout (archive, version) { } else if (version === 'preview') { isPreview = true checkoutFS = createArchiveProxy(archive.key, 'preview', archive) + checkoutFS.domain = archive.domain } else { throw new Error('Invalid version identifier:' + version) } } else { checkoutFS = createArchiveProxy(archive.key, version, archive) + checkoutFS.domain = archive.domain isHistoric = true } } @@ -358,6 +465,7 @@ exports.getActiveArchives = function getActiveArchives () { } const getOrLoadArchive = exports.getOrLoadArchive = async function getOrLoadArchive (key, opts) { + key = await fromURLToKey(key, true) var archive = getArchive(key) if (archive) { return archive @@ -366,7 +474,7 @@ const getOrLoadArchive = exports.getOrLoadArchive = async function getOrLoadArch } exports.unloadArchive = async function unloadArchive (key) { - key = fromURLToKey(key) + key = await fromURLToKey(key, true) var archive = archives[key] if (!archive) return if (archive.fileActStream) { @@ -392,6 +500,9 @@ exports.updateSizeTracking = function updateSizeTracking (archive) { exports.queryArchives = async function queryArchives (query) { // run the query var archiveInfos = await archivesDb.query(0, query) + if (!archiveInfos) return undefined + var isArray = Array.isArray(archiveInfos) + if (!isArray) archiveInfos = [archiveInfos] if (query && ('inMemory' in query)) { archiveInfos = archiveInfos.filter(archiveInfo => isArchiveLoaded(archiveInfo.key) === query.inMemory) @@ -412,12 +523,12 @@ exports.queryArchives = async function queryArchives (query) { archiveInfo.peerHistory = [] } })) - return archiveInfos + return isArray ? 
archiveInfos : archiveInfos[0] } exports.getArchiveInfo = async function getArchiveInfo (key) { // get the archive - key = fromURLToKey(key) + key = await fromURLToKey(key, true) var archive = await getOrLoadArchive(key) // fetch archive data @@ -429,7 +540,8 @@ exports.getArchiveInfo = async function getArchiveInfo (key) { ]) manifest = manifest || {} meta.key = key - meta.url = `dat://${key}` + meta.url = archive.url + meta.domain = archive.domain meta.links = manifest.links || {} meta.manifest = manifest meta.version = archiveInfo.version @@ -452,15 +564,67 @@ exports.getArchiveInfo = async function getArchiveInfo (key) { return meta } +exports.getArchiveNetworkStats = async function getArchiveNetworkStats (key) { + key = await fromURLToKey(key, true) + return daemon.getArchiveNetworkStats(key) +} + exports.clearFileCache = async function clearFileCache (key) { var userSettings = await archivesDb.getUserSettings(0, key) return daemon.clearFileCache(key, userSettings) } +/** + * @desc + * Get the primary URL for a given dat URL + * + * @param {string} url + * @returns {Promise} + */ +const getPrimaryUrl = exports.getPrimaryUrl = async function (url) { + var key = await fromURLToKey(url, true) + var datDnsRecord = await datDnsDb.getCurrentByKey(key) + if (!datDnsRecord) return `dat://${key}` + return `dat://${datDnsRecord.name}` +} + +/** + * @desc + * Check that the archive's dat.json `domain` matches the current DNS + * If yes, write the confirmed entry to the dat_dns table + * + * @param {string} key + * @returns {Promise} + */ +const confirmDomain = exports.confirmDomain = async function (key) { + // fetch the current domain from the manifest + try { + var archive = await getOrLoadArchive(key) + var datJson = await archive.pda.readManifest() + } catch (e) { + return false + } + if (!datJson.domain) { + await datDnsDb.unset(key) + return false + } + + // confirm match with current DNS + var dnsKey = await require('./dns').resolveName(datJson.domain) + if (key !== dnsKey) { + await datDnsDb.unset(key) + return false + } + + // update mapping + await datDnsDb.update({name: datJson.domain, key}) + return true +} + // helpers // = -const fromURLToKey = exports.fromURLToKey = function fromURLToKey (url) { +const fromURLToKey = exports.fromURLToKey = function fromURLToKey (url, lookupDns = false) { if (Buffer.isBuffer(url)) { return url } @@ -476,8 +640,10 @@ const fromURLToKey = exports.fromURLToKey = function fromURLToKey (url) { throw new InvalidURLError('URL must be a dat: scheme') } if (!DAT_HASH_REGEX.test(urlp.host)) { - // TODO- support dns lookup? - throw new InvalidURLError('Hostname is not a valid hash') + if (!lookupDns) { + throw new InvalidURLError('Hostname is not a valid hash') + } + return require('./dns').resolveName(urlp.host) } return urlp.host @@ -529,12 +695,23 @@ function fixStatObject (st) { st.isFIFO = () => false } +/** + * + * @param {string|Buffer} key + * @param {number} version + * @param {Object} archiveInfo + * @returns {InternalDatArchive} + */ function createArchiveProxy (key, version, archiveInfo) { key = datEncoding.toStr(key) const stat = makeArchiveProxyCbFn(key, version, 'stat') const pdaStat = makeArchiveProxyPDAPromiseFn(key, version, 'stat') return { key: datEncoding.toBuf(key), + get url () { + return `dat://${this.domain || key}${version ? 
'+' + version : ''}` + }, + domain: undefined, discoveryKey: datEncoding.toBuf(archiveInfo.discoveryKey), writable: archiveInfo.writable, diff --git a/dat/protocol.js b/dat/protocol.js index ef00292a..6de06db0 100644 --- a/dat/protocol.js +++ b/dat/protocol.js @@ -1,14 +1,16 @@ -const {join} = require('path') +const {extname} = require('path') const parseDatUrl = require('parse-dat-url') const parseRange = require('range-parser') const once = require('once') -const debug = require('../lib/debug-logger').debugLogger('dat-serve') +const logger = require('../logger').child({category: 'dat', subcategory: 'dat-serve'}) const intoStream = require('into-stream') -const toZipStream = require('hyperdrive-to-zip-stream') +const {toZipStream} = require('../lib/zip') const slugify = require('slugify') +const markdown = require('../lib/markdown') const datDns = require('./dns') const datLibrary = require('./library') +const datServeResolvePath = require('@beaker/dat-serve-resolve-path') const directoryListingPage = require('./directory-listing-page') const errorPage = require('../lib/error-page') @@ -17,17 +19,15 @@ const {makeSafe} = require('../lib/strings') // HACK detect whether the native builds of some key deps are working -prf // -prf -try { - require('utp-native') -} catch (err) { - debug('Failed to load utp-native. Peer-to-peer connectivity may be degraded.', err.toString()) - console.error('Failed to load utp-native. Peer-to-peer connectivity may be degraded.', err) +var utpLoadError = false +try { require('utp-native') } +catch (err) { + utpLoadError = err } -try { - require('sodium-native') -} catch (err) { - debug('Failed to load sodium-native. Performance may be degraded.', err.toString()) - console.error('Failed to load sodium-native. Performance may be degraded.', err) +var sodiumLoadError = false +try { require('sodium-native') } +catch (err) { + sodiumLoadError = err } // constants @@ -40,6 +40,14 @@ const REQUEST_TIMEOUT_MS = 30e3 // 30 seconds // = exports.electronHandler = async function (request, respond) { + // log warnings now, after the logger has setup its transports + if (utpLoadError) { + logger.warn('Failed to load utp-native. Peer-to-peer connectivity may be degraded.', {err: utpLoadError.toString()}) + } + if (sodiumLoadError) { + logger.warn('Failed to load sodium-native. 
Performance may be degraded.', {err: sodiumLoadError.toString()}) + } + respond = once(respond) var respondError = (code, status, errorPageInfo) => { if (errorPageInfo) { @@ -95,7 +103,7 @@ exports.electronHandler = async function (request, respond) { const cleanup = () => clearTimeout(timeout) timeout = setTimeout(() => { // cleanup - debug('Timed out searching for', archiveKey) + logger.debug('Timed out searching for', {url: archiveKey}) if (fileReadStream) { fileReadStream.destroy() fileReadStream = null @@ -113,7 +121,7 @@ exports.electronHandler = async function (request, respond) { // start searching the network archive = await datLibrary.getOrLoadArchive(archiveKey) } catch (err) { - debug('Failed to open archive', archiveKey, err) + logger.warn('Failed to open archive', {url: archiveKey, err}) cleanup() return respondError(500, 'Failed') } @@ -127,16 +135,21 @@ exports.electronHandler = async function (request, respond) { // checkout version if needed try { var {checkoutFS} = datLibrary.getArchiveCheckout(archive, urlp.version) + if (urlp.version === 'preview') { + await checkoutFS.pda.stat('/') // run a stat to ensure preview mode exists + } } catch (err) { if (err.noPreviewMode) { - let latestUrl = makeSafe(request.url.replace('+preview', '')) - respondError(404, 'Cannot open preview', { - title: 'Cannot open preview', - errorInfo: `You are trying to open the "preview" version of this site, but no preview exists.`, - errorDescription: `You can open the latest published version instead.` + // redirect to non-preview version + return respond({ + statusCode: 303, + headers: { + Location: `dat://${urlp.host}${urlp.pathname || '/'}${urlp.search || ''}` + }, + data: intoStream('') }) } else { - debug('Failed to open archive', archiveKey, err) + logger.warn('Failed to open archive checkout', {url: archiveKey, err}) cleanup() return respondError(500, 'Failed') } @@ -178,8 +191,8 @@ exports.electronHandler = async function (request, respond) { }) } else { // serve the zip - var zs = toZipStream(archive, filepath) - zs.on('error', err => console.log('Error while producing .zip file', err)) + var zs = toZipStream(checkoutFS, filepath) + zs.on('error', err => logger.error('Error while producing .zip file', err)) return respond({ statusCode: 200, headers, @@ -189,44 +202,24 @@ exports.electronHandler = async function (request, respond) { } // lookup entry - debug('Attempting to lookup', archiveKey, filepath) var statusCode = 200 var headers = {} - var entry - const tryStat = async (path) => { - // abort if we've already found it - if (entry) return - // apply the web_root config - if (manifest && manifest.web_root && !urlp.query.disable_web_root) { - if (path) { - path = join(manifest.web_root, path) - } else { - path = manifest.web_root - } - } - // attempt lookup - try { - entry = await checkoutFS.pda.stat(path) - entry.path = path - } catch (e) {} - } + var entry = await datServeResolvePath(checkoutFS.pda, manifest, urlp, request.headers.Accept) - // do lookup - if (hasTrailingSlash) { - await tryStat(filepath + 'index.html') - await tryStat(filepath + 'index.md') - await tryStat(filepath) - } else { - await tryStat(filepath) - await tryStat(filepath + '.html') // fallback to .html - if (entry && entry.isDirectory()) { - // unexpected directory, give the .html fallback a chance - let dirEntry = entry - entry = null - await tryStat(filepath + '.html') // fallback to .html - if (dirEntry && !entry) { - // no .html fallback found, stick with directory that we found - entry = dirEntry + // use 
theme template if it exists + var themeSettings = { + active: false, + js: false, + css: false + } + if (!urlp.query.disable_theme) { + if (entry && mime.acceptHeaderWantsHTML(request.headers.Accept) && ['.html', '.htm', '.md'].includes(extname(entry.path))) { + let exists = async (path) => await checkoutFS.pda.stat(path).then(() => true, () => false) + let [js, css] = await Promise.all([exists('/theme/index.js'), exists('/theme/index.css')]) + if (js || css) { + themeSettings.active = true + themeSettings.css = css + themeSettings.js = js } } } @@ -246,39 +239,18 @@ exports.electronHandler = async function (request, respond) { }) } - let headers = { - 'Content-Type': 'text/html', - 'Content-Security-Policy': cspHeader, - 'Access-Control-Allow-Origin': '*' - } - if (request.method === 'HEAD') { - return respond({statusCode: 204, headers, data: intoStream('')}) - } else { - return respond({ - statusCode: 200, - headers, - data: intoStream(await directoryListingPage(checkoutFS, filepath, manifest && manifest.web_root)) - }) - } + // 404 + entry = null } // handle not found if (!entry) { - debug('Entry not found:', urlp.path) - - // check for a fallback page - if (manifest && manifest.fallback_page) { - await tryStat(manifest.fallback_page) - } - - if (!entry) { - cleanup() - return respondError(404, 'File Not Found', { - errorDescription: 'File Not Found', - errorInfo: `Beaker could not find the file ${urlp.path}`, - title: 'File Not Found' - }) - } + cleanup() + return respondError(404, 'File Not Found', { + errorDescription: 'File Not Found', + errorInfo: `Beaker could not find the file ${urlp.path}`, + title: 'File Not Found' + }) } // TODO @@ -309,15 +281,47 @@ exports.electronHandler = async function (request, respond) { statusCode = 206 headers['Content-Range'] = 'bytes ' + range.start + '-' + range.end + '/' + entry.size headers['Content-Length'] = range.end - range.start + 1 - debug('Serving range:', range) } else { if (entry.size) { headers['Content-Length'] = entry.size } } + Object.assign(headers, { + 'Content-Security-Policy': cspHeader, + 'Access-Control-Allow-Origin': '*', + 'Cache-Control': 'no-cache' + }) + + // markdown rendering + if (!range && entry.path.endsWith('.md') && mime.acceptHeaderWantsHTML(request.headers.Accept)) { + let content = await checkoutFS.pda.readFile(entry.path, 'utf8') + return respond({ + statusCode: 200, + headers: Object.assign(headers, { + 'Content-Type': 'text/html' + }), + data: intoStream(markdown.render(content, themeSettings)) + }) + } + + // theme wrapping + if (themeSettings.active) { + let html = await checkoutFS.pda.readFile(entry.path, 'utf8') + html = ` +${themeSettings.js ? `` : ''} +${themeSettings.css ? 
`` : ''} +${html}` + return respond({ + statusCode: 200, + headers: Object.assign(headers, { + 'Content-Type': 'text/html' + }), + data: intoStream(html) + }) + } + // fetch the entry and stream the response - debug('Entry found:', entry.path) fileReadStream = checkoutFS.createReadStream(entry.path, range) var dataStream = fileReadStream .pipe(mime.identifyStream(entry.path, mimeType => { @@ -327,10 +331,7 @@ exports.electronHandler = async function (request, respond) { // send headers, now that we can identify the data headersSent = true Object.assign(headers, { - 'Content-Type': mimeType, - 'Content-Security-Policy': cspHeader, - 'Access-Control-Allow-Origin': '*', - 'Cache-Control': 'no-cache' + 'Content-Type': mimeType }) // TODO // Electron is being really aggressive about caching and not following the headers correctly @@ -354,7 +355,6 @@ exports.electronHandler = async function (request, respond) { fileReadStream.once('end', () => { if (!headersSent) { cleanup() - debug('Served empty file') respond({ statusCode: 200, headers: { @@ -368,7 +368,7 @@ exports.electronHandler = async function (request, respond) { // handle read-stream errors fileReadStream.once('error', err => { - debug('Error reading file', err) + logger.warn('Error reading file', {url: archive.url, path: entry.path, err}) if (!headersSent) respondError(500, 'Failed to read file') }) } diff --git a/dat/watchlist.js b/dat/watchlist.js index f8814556..1be83b1d 100644 --- a/dat/watchlist.js +++ b/dat/watchlist.js @@ -1,5 +1,6 @@ const EventEmitter = require('events') const emitStream = require('emit-stream') +const logger = require('../logger').child({category: 'dat', subcategory: 'watchlist'}) // dat modules const datLibrary = require('../dat/library') @@ -21,6 +22,7 @@ exports.setup = async function setup () { watch(site) } } catch (err) { + logger.error('Error while loading watchlist', {err}) throw new Error('Failed to load the watchlist') } } diff --git a/dbs/archives.js b/dbs/archives.js index a14783cc..0c320592 100644 --- a/dbs/archives.js +++ b/dbs/archives.js @@ -12,38 +12,129 @@ const { DAT_GC_EXPIRATION_AGE } = require('../lib/const') +// typedefs +// = + +/** + * @typedef {import('../dat/library').InternalDatArchive} InternalDatArchive + * + * @typedef {Object} LibraryArchiveRecord + * @prop {string} key + * @prop {string} url + * @prop {string?} domain + * @prop {string} title + * @prop {string} description + * @prop {Array} type + * @prop {number} mtime + * @prop {number} size + * @prop {boolean} isOwner + * @prop {number} lastAccessTime + * @prop {number} lastLibraryAccessTime + * @prop {Object} userSettings + * @prop {boolean} userSettings.isSaved + * @prop {boolean} userSettings.hidden + * @prop {boolean} userSettings.networked + * @prop {boolean} userSettings.autoDownload + * @prop {boolean} userSettings.autoUpload + * @prop {number} userSettings.expiresAt + * @prop {string} userSettings.localSyncPath + * @prop {boolean} userSettings.previewMode + * + * @typedef {Object} LibraryArchiveMeta + * @prop {string} key + * @prop {string} title + * @prop {string} description + * @prop {string | Array} type + * @prop {Array} installedNames + * @prop {number} mtime + * @prop {number} size + * @prop {boolean} isOwner + * @prop {number} lastAccessTime + * @prop {number} lastLibraryAccessTime + * + * @typedef {Object} LibraryArchiveUserSettings + * @prop {number} profileId + * @prop {string} key + * @prop {boolean} isSaved + * @prop {boolean} hidden + * @prop {boolean} networked + * @prop {boolean} autoDownload + * 
@prop {boolean} autoUpload + * @prop {number} expiresAt + * @prop {string} localSyncPath + * @prop {boolean} previewMode + * @prop {number} createdAt + * + * @typedef {Object} MinimalLibraryArchiveRecord + * @prop {string} key + */ + // globals // = -var datPath // path to the dat folder +var datPath /** @type string - path to the dat folder */ var events = new Events() // exported methods // = +/** + * @param {Object} opts + * @param {string} opts.userDataPath + */ exports.setup = function (opts) { // make sure the folders exist datPath = path.join(opts.userDataPath, 'Dat') mkdirp.sync(path.join(datPath, 'Archives')) } +/** + * @returns {string} + */ exports.getDatPath = function () { return datPath } -// get the path to an archive's files +/** + * @description Get the path to an archive's files. + * @param {string | Buffer | InternalDatArchive} archiveOrKey + * @returns {string} + */ +// const getArchiveMetaPath = exports.getArchiveMetaPath = function (archiveOrKey) { - var key = datEncoding.toStr(archiveOrKey.key || archiveOrKey) + var key /** @type string */ + if (typeof archiveOrKey === 'string') { + key = archiveOrKey + } else if (Buffer.isBuffer(archiveOrKey)) { + key = datEncoding.toStr(archiveOrKey) + } else { + key = datEncoding.toStr(archiveOrKey.key) + } return path.join(datPath, 'Archives', 'Meta', key.slice(0, 2), key.slice(2)) } -// get the path to an archive's temporary local sync path +/** + * @description Get the path to an archive's temporary local sync path. + * @param {string | Buffer | InternalDatArchive} archiveOrKey + * @returns {string} + */ const getInternalLocalSyncPath = exports.getInternalLocalSyncPath = function (archiveOrKey) { - var key = datEncoding.toStr(archiveOrKey.key || archiveOrKey) + var key /** @type string */ + if (typeof archiveOrKey === 'string') { + key = archiveOrKey + } else if (Buffer.isBuffer(archiveOrKey)) { + key = datEncoding.toStr(archiveOrKey) + } else { + key = datEncoding.toStr(archiveOrKey.key) + } return path.join(datPath, 'Archives', 'LocalCopy', key.slice(0, 2), key.slice(2)) } -// delete all db entries and files for an archive +/** + * @description Delete all db entries and files for an archive. + * @param {string} key + * @returns {Promise} + */ exports.deleteArchive = async function (key) { const path = getArchiveMetaPath(key) const info = await jetpack.inspectTreeAsync(path) @@ -54,7 +145,7 @@ exports.deleteArchive = async function (key) { jetpack.removeAsync(path), jetpack.removeAsync(getInternalLocalSyncPath(key)) ]) - return info.size + return info ? info.size : 0 } exports.on = events.on.bind(events) @@ -64,40 +155,42 @@ exports.removeListener = events.removeListener.bind(events) // exported methods: archive user settings // = -// get an array of saved archives -// - optional `query` keys: -// - `isSaved`: bool -// - `isNetworked`: bool -// - `isOwner`: bool, does beaker have the secret key? -// - `type`: string, a type filter -// - `showHidden`: bool, show hidden dats -// - `key`: string, the key of the archive you want (return single result) -exports.query = async function (profileId, query) { - query = query || {} - +/** + * @description Get an array of saved archives. 
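 * A rough usage sketch (profileId 0 mirrors the callers elsewhere in this patch, e.g.
 * archivesDb.query(0, {isSaved: true}) in dat/library.js; `archiveKey` is hypothetical):
 *
 *   var saved = await archivesDb.query(0, {isSaved: true, isOwner: true}) // array of records
 *   var one = await archivesDb.query(0, {key: archiveKey}) // a `key` filter yields a single record
 *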
+ * @param {number} profileId + * @param {Object} [query] + * @param {string} [query.key] + * @param {boolean} [query.isSaved] + * @param {boolean} [query.isNetworked] + * @param {boolean} [query.isOwner] + * @param {boolean} [query.showHidden] + * @param {string} [query.type] + * @param {string} [query.string] + * @returns {Promise>} + */ +exports.query = async function (profileId, query = {}) { // fetch archive meta var values = [] - var WHERE = [] - if (query.isOwner === true) WHERE.push('archives_meta.isOwner = 1') - if (query.isOwner === false) WHERE.push('archives_meta.isOwner = 0') - if (query.isNetworked === true) WHERE.push('archives.networked = 1') - if (query.isNetworked === false) WHERE.push('archives.networked = 0') + var whereList = [] + if (query.isOwner === true) whereList.push('archives_meta.isOwner = 1') + if (query.isOwner === false) whereList.push('archives_meta.isOwner = 0') + if (query.isNetworked === true) whereList.push('archives.networked = 1') + if (query.isNetworked === false) whereList.push('archives.networked = 0') if ('isSaved' in query) { if (query.isSaved) { - WHERE.push('archives.profileId = ?') + whereList.push('archives.profileId = ?') values.push(profileId) - WHERE.push('archives.isSaved = 1') + whereList.push('archives.isSaved = 1') } else { - WHERE.push('(archives.isSaved = 0 OR archives.isSaved IS NULL)') + whereList.push('(archives.isSaved = 0 OR archives.isSaved IS NULL)') } } - if ('key' in query) { - WHERE.push('archives_meta.key = ?') + if (typeof query.key !== 'undefined') { + whereList.push('archives_meta.key = ?') values.push(query.key) } - if (!query.showHidden) WHERE.push('(archives.hidden = 0 OR archives.hidden IS NULL)') - if (WHERE.length) WHERE = `WHERE ${WHERE.join(' AND ')}` - else WHERE = '' + if (!query.showHidden) whereList.push('(archives.hidden = 0 OR archives.hidden IS NULL)') + var WHERE = whereList.length ? `WHERE ${whereList.join(' AND ')}` : '' var archives = await db.all(` SELECT @@ -110,25 +203,27 @@ exports.query = async function (profileId, query) { archives.autoUpload, archives.expiresAt, archives.localSyncPath, - archives.previewMode + archives.previewMode, + dat_dns.name as domain FROM archives_meta LEFT JOIN archives ON archives.key = archives_meta.key LEFT JOIN archives_meta_type ON archives_meta_type.key = archives_meta.key + LEFT JOIN dat_dns ON dat_dns.key = archives_meta.key AND dat_dns.isCurrent = 1 ${WHERE} GROUP BY archives_meta.key `, values) // massage the output archives.forEach(archive => { - archive.url = `dat://${archive.key}` + archive.url = `dat://${archive.domain || archive.key}` archive.isOwner = archive.isOwner != 0 archive.type = archive.type ? archive.type.split(',') : [] archive.userSettings = { - isSaved: archive.isSaved != 0, - hidden: archive.hidden != 0, - networked: archive.networked != 0, - autoDownload: archive.autoDownload != 0, - autoUpload: archive.autoUpload != 0, + isSaved: archive.isSaved == 1, + hidden: archive.hidden == 0, + networked: archive.networked == 1, + autoDownload: archive.autoDownload == 1, + autoUpload: archive.autoUpload == 1, expiresAt: archive.expiresAt, localSyncPath: archive.localSyncPath, previewMode: archive.previewMode == 1 @@ -156,7 +251,7 @@ exports.query = async function (profileId, query) { // apply manual filters if ('type' in query) { let types = Array.isArray(query.type) ? 
query.type : [query.type] - archives = archives.filter(a => { + archives = archives.filter((/** @type LibraryArchiveRecord */ a) => { for (let type of types) { if (a.type.indexOf(type) === -1) { return false @@ -169,7 +264,10 @@ exports.query = async function (profileId, query) { return ('key' in query) ? archives[0] : archives } -// get all archives that should be unsaved +/** + * @description Get all archives that should be unsaved. + * @returns {Promise>} + */ exports.listExpiredArchives = async function () { return db.all(` SELECT archives.key @@ -182,10 +280,16 @@ exports.listExpiredArchives = async function () { `, [Date.now()]) } -// get all archives that are ready for garbage collection +/** + * @description Get all archives that are ready for garbage collection. + * @param {Object} [opts] + * @param {number} [opts.olderThan] + * @param {boolean} [opts.isOwner] + * @returns {Promise>} + */ exports.listGarbageCollectableArchives = async function ({olderThan, isOwner} = {}) { olderThan = typeof olderThan === 'number' ? olderThan : DAT_GC_EXPIRATION_AGE - isOwner = typeof isOwner === 'boolean' ? `AND archives_meta.isOwner = ${isOwner ? '1' : '0'}` : '' + var isOwnerClause = typeof isOwner === 'boolean' ? `AND archives_meta.isOwner = ${isOwner ? '1' : '0'}` : '' // fetch archives var records = await db.all(` @@ -195,7 +299,7 @@ exports.listGarbageCollectableArchives = async function ({olderThan, isOwner} = WHERE (archives.isSaved != 1 OR archives.isSaved IS NULL) AND archives_meta.lastAccessTime < ? - ${isOwner} + ${isOwnerClause} `, [Date.now() - olderThan]) var records2 = records.slice() @@ -208,7 +312,13 @@ exports.listGarbageCollectableArchives = async function ({olderThan, isOwner} = return records } -// upsert the last-access time +/** + * @description Upsert the last-access time. + * @param {string | Buffer} key + * @param {string} [timeVar] + * @param {number} [value] + * @returns {Promise} + */ exports.touch = async function (key, timeVar = 'lastAccessTime', value = -1) { var release = await lock('archives-db:meta') try { @@ -216,22 +326,28 @@ exports.touch = async function (key, timeVar = 'lastAccessTime', value = -1) { timeVar = 'lastAccessTime' } if (value === -1) value = Date.now() - key = datEncoding.toStr(key) - await db.run(`UPDATE archives_meta SET ${timeVar}=? WHERE key=?`, [value, key]) - await db.run(`INSERT OR IGNORE INTO archives_meta (key, ${timeVar}) VALUES (?, ?)`, [key, value]) + var keyStr = datEncoding.toStr(key) + await db.run(`UPDATE archives_meta SET ${timeVar}=? WHERE key=?`, [value, keyStr]) + await db.run(`INSERT OR IGNORE INTO archives_meta (key, ${timeVar}) VALUES (?, ?)`, [keyStr, value]) } finally { release() } } -// get a single archive's user settings -// - supresses a not-found with an empty object +/** + * @description + * Get a single archive's user settings. + * (Returns an empty object on not found.) + * @param {number} profileId + * @param {string | Buffer} key + * @returns {Promise} + */ const getUserSettings = exports.getUserSettings = async function (profileId, key) { // massage inputs - key = datEncoding.toStr(key) + var keyStr = typeof key !== 'string' ? datEncoding.toStr(key) : key // validate inputs - if (!DAT_HASH_REGEX.test(key)) { + if (!DAT_HASH_REGEX.test(keyStr)) { throw new InvalidArchiveKeyError() } @@ -239,51 +355,65 @@ const getUserSettings = exports.getUserSettings = async function (profileId, key try { var settings = await db.get(` SELECT * FROM archives WHERE profileId = ? AND key = ? 
- `, [profileId, key]) + `, [profileId, keyStr]) settings.isSaved = !!settings.isSaved settings.hidden = !!settings.hidden settings.networked = !!settings.networked settings.autoDownload = !!settings.autoDownload settings.autoUpload = !!settings.autoUpload - settings.previewMode = settings.previewMode == 1 - return settings + settings.previewMode = Number(settings.previewMode) === 1 + return /** @type LibraryArchiveUserSettings */(settings) } catch (e) { - return {} + return /** @type LibraryArchiveUserSettings */({}) } } -// write an archive's user setting +/** + * @description Write an archive's user setting. + * @param {number} profileId + * @param {string | Buffer} key + * @param {Object} [newValues] + * @param {boolean} [newValues.isSaved] + * @param {boolean} [newValues.hidden] + * @param {boolean} [newValues.networked] + * @param {boolean} [newValues.autoDownload] + * @param {boolean} [newValues.autoUpload] + * @param {number} [newValues.expiresAt] + * @param {string} [newValues.localSyncPath] + * @param {boolean} [newValues.previewMode] + * @returns {Promise} + */ exports.setUserSettings = async function (profileId, key, newValues = {}) { // massage inputs - key = datEncoding.toStr(key) + var keyStr = datEncoding.toStr(key) // validate inputs - if (!DAT_HASH_REGEX.test(key)) { + if (!DAT_HASH_REGEX.test(keyStr)) { throw new InvalidArchiveKeyError() } var release = await lock('archives-db') try { // fetch current - var value = await getUserSettings(profileId, key) + var value = await getUserSettings(profileId, keyStr) if (!value || typeof value.key === 'undefined') { // create - value = { + value = /** @type LibraryArchiveUserSettings */ ({ profileId, - key, + key: keyStr, isSaved: newValues.isSaved, hidden: newValues.hidden, networked: ('networked' in newValues) ? newValues.networked : true, autoDownload: ('autoDownload' in newValues) ? newValues.autoDownload : newValues.isSaved, autoUpload: ('autoUpload' in newValues) ? newValues.autoUpload : newValues.isSaved, expiresAt: newValues.expiresAt, - localSyncPath: ('localSyncPath' in newValues) ? newValues.localSyncPath : '', + localSyncPath: (newValues.localSyncPath) ? newValues.localSyncPath : '', previewMode: ('previewMode' in newValues) ? newValues.previewMode : '' - } + }) let valueArray = [ profileId, - key, + keyStr, flag(value.isSaved), flag(value.hidden), flag(value.networked), @@ -330,7 +460,7 @@ exports.setUserSettings = async function (profileId, key, newValues = {}) { value.localSyncPath, flag(value.previewMode), profileId, - key + keyStr ] await db.run(` UPDATE archives @@ -348,7 +478,7 @@ exports.setUserSettings = async function (profileId, key, newValues = {}) { `, valueArray) } - events.emit('update:archive-user-settings', key, value, newValues) + events.emit('update:archive-user-settings', keyStr, value, newValues) return value } finally { release() @@ -358,15 +488,20 @@ exports.setUserSettings = async function (profileId, key, newValues = {}) { // exported methods: archive meta // = -// get a single archive's metadata -// - supresses a not-found with an empty object +/** + * @description + * Get a single archive's metadata. + * Returns an empty object on not-found. + * @param {string | Buffer} key + * @returns {Promise} + */ const getMeta = exports.getMeta = async function (key) { // massage inputs - key = datEncoding.toStr(key) + var keyStr = typeof key !== 'string' ? 
datEncoding.toStr(key) : key // validate inputs - if (!DAT_HASH_REGEX.test(key)) { - throw new InvalidArchiveKeyError() + if (!DAT_HASH_REGEX.test(keyStr)) { + keyStr = await require('../dat/dns').resolveName(keyStr) } // fetch @@ -380,9 +515,9 @@ const getMeta = exports.getMeta = async function (key) { LEFT JOIN apps ON apps.url = ('dat://' || archives_meta.key) WHERE archives_meta.key = ? GROUP BY archives_meta.key - `, [key]) + `, [keyStr]) if (!meta) { - return defaultMeta(key) + return defaultMeta(keyStr) } // massage some values @@ -401,15 +536,23 @@ const getMeta = exports.getMeta = async function (key) { return meta } -// write an archive's metadata -exports.setMeta = async function (key, value = {}) { +/** + * @description Write an archive's metadata. + * @param {string | Buffer} key + * @param {LibraryArchiveMeta} [value] + * @returns {Promise} + */ +exports.setMeta = async function (key, value) { // massage inputs - key = datEncoding.toStr(key) + var keyStr = datEncoding.toStr(key) // validate inputs - if (!DAT_HASH_REGEX.test(key)) { + if (!DAT_HASH_REGEX.test(keyStr)) { throw new InvalidArchiveKeyError() } + if (!value || typeof value !== 'object') { + return // dont bother + } // extract the desired values var {title, description, type, size, mtime, isOwner} = value @@ -417,30 +560,35 @@ exports.setMeta = async function (key, value = {}) { description = typeof description === 'string' ? description : '' if (typeof type === 'string') type = type.split(' ') else if (Array.isArray(type)) type = type.filter(v => v && typeof v === 'string') - isOwner = flag(isOwner) + var isOwnerFlag = flag(isOwner) // write var release = await lock('archives-db:meta') - var {lastAccessTime, lastLibraryAccessTime} = await getMeta(key) + var {lastAccessTime, lastLibraryAccessTime} = await getMeta(keyStr) try { await db.run(` INSERT OR REPLACE INTO archives_meta (key, title, description, mtime, size, isOwner, lastAccessTime, lastLibraryAccessTime) VALUES (?, ?, ?, ?, ?, ?, ?, ?) - `, [key, title, description, mtime, size, isOwner, lastAccessTime, lastLibraryAccessTime]) - await db.run(`DELETE FROM archives_meta_type WHERE key=?`, key) + `, [keyStr, title, description, mtime, size, isOwnerFlag, lastAccessTime, lastLibraryAccessTime]) + await db.run(`DELETE FROM archives_meta_type WHERE key=?`, keyStr) if (type) { await Promise.all(type.map(t => ( - db.run(`INSERT INTO archives_meta_type (key, type) VALUES (?, ?)`, [key, t]) + db.run(`INSERT INTO archives_meta_type (key, type) VALUES (?, ?)`, [keyStr, t]) ))) } } finally { release() } - events.emit('update:archive-meta', key, value) + events.emit('update:archive-meta', keyStr, value) } -// find the archive currently using a given localSyncPath +/** + * @description Find the archive currently using a given localSyncPath. + * @param {number} profileId + * @param {string} localSyncPath + * @returns {Promise} + */ exports.getByLocalSyncPath = async function (profileId, localSyncPath) { try { return await db.get(` @@ -454,24 +602,37 @@ exports.getByLocalSyncPath = async function (profileId, localSyncPath) { // internal methods // = +/** + * @param {string} key + * @returns {LibraryArchiveMeta} + */ function defaultMeta (key) { return { key, title: null, description: null, type: [], - author: null, mtime: 0, isOwner: false, lastAccessTime: 0, - installedNames: [] + lastLibraryAccessTime: 0, + installedNames: [], + size: 0 } } +/** + * @param {boolean} b + * @returns {number} + */ function flag (b) { return b ? 
1 : 0 } +/** + * @param {string} originURL + * @returns {string} + */ exports.extractOrigin = function (originURL) { var urlp = url.parse(originURL) if (!urlp || !urlp.host || !urlp.protocol) return diff --git a/dbs/bookmarks.js b/dbs/bookmarks.js index da53524d..3ef10f4b 100644 --- a/dbs/bookmarks.js +++ b/dbs/bookmarks.js @@ -1,6 +1,9 @@ +const assert = require('assert') +const EventEmitter = require('events') const db = require('./profile-data-db') const normalizeUrl = require('normalize-url') const lock = require('../lib/lock') +const knex = require('../lib/knex') const NORMALIZE_OPTS = { stripFragment: false, @@ -9,75 +12,204 @@ const NORMALIZE_OPTS = { removeTrailingSlash: false } +// typedefs +// = + +/** + * @typedef {Object} Bookmark + * @prop {number} createdAt + * @prop {string} href + * @prop {string} title + * @prop {string} description + * @prop {string[]} tags + * @prop {boolean} pinned + * @prop {boolean} isPublic + * @prop {number} pinOrder + */ + +// globals +// = + +const events = new EventEmitter() + // exported methods // = -exports.bookmark = async function (profileId, url, {title, tags, notes, pinOrder}) { - tags = tagsToString(tags) - var release = await lock(`bookmark:${url}`) +exports.on = events.on.bind(events) +exports.once = events.once.bind(events) +exports.removeListener = events.removeListener.bind(events) + +/** + * @param {number} profileId + * @param {Object} values + * @param {string} [values.href] + * @param {string} [values.title] + * @param {string} [values.description] + * @param {string | string[]} [values.tags] + * @param {boolean} [values.pinned] + * @param {boolean} [values.isPublic] + * @returns {Promise} + */ +exports.addBookmark = async function (profileId, {href, title, description, tags, pinned, isPublic} = {}) { + // validate + assertValidHref(href) + assertValidTitle(title) + assertValidDescription(description) + assertValidTags(tags) + + // massage values + href = normalizeUrl(href, NORMALIZE_OPTS) + var tagsStr = tagsToString(tags) + description = description || '' + isPublic = isPublic || false + + // update record + var release = await lock(`bookmarksdb`) try { - // read old bookmark and fallback to old values as needed - var oldBookmark = await db.get(`SELECT url, title, pinned, pinOrder FROM bookmarks WHERE profileId = ? AND url = ?`, [profileId, url]) - oldBookmark = oldBookmark || {} - const pinned = oldBookmark.pinned ? 1 : 0 - title = typeof title === 'undefined' ? oldBookmark.title : title - tags = typeof tags === 'undefined' ? oldBookmark.tags : tags - notes = typeof notes === 'undefined' ? oldBookmark.notes : notes - pinOrder = typeof pinOrder === 'undefined' ? oldBookmark.pinOrder : pinOrder - - // update record - return db.run(` + await db.run(` INSERT OR REPLACE - INTO bookmarks (profileId, url, title, tags, notes, pinned, pinOrder) + INTO bookmarks (profileId, url, title, description, tags, pinned, isPublic) VALUES (?, ?, ?, ?, ?, ?, ?) - `, [profileId, url, title, tags, notes, pinned, pinOrder]) + `, [profileId, href, title, description, tagsStr, Number(pinned), Number(isPublic)]) + events.emit('changed') } finally { release() } } -exports.unbookmark = function (profileId, url) { - return db.run(`DELETE FROM bookmarks WHERE profileId = ? 
AND url = ?`, [profileId, url]) +/** + * @param {number} profileId + * @param {string} bookmarkHref + * @param {Object} values + * @param {string} [values.href] + * @param {string} [values.title] + * @param {string} [values.description] + * @param {string | string[]} [values.tags] + * @param {boolean} [values.pinned] + * @param {boolean} [values.isPublic] + * @returns {Promise} + */ +exports.editBookmark = async function (profileId, bookmarkHref, {href, title, description, tags, pinned, isPublic} = {}) { + // validate + assertValidHref(bookmarkHref) + if (href) assertValidHref(href) + if (title) assertValidTitle(title) + if (description) assertValidDescription(description) + if (tags) assertValidTags(tags) + + // massage values + bookmarkHref = normalizeUrl(bookmarkHref, NORMALIZE_OPTS) + href = href ? normalizeUrl(href, NORMALIZE_OPTS) : undefined + var tagsStr = tags ? tagsToString(tags) : undefined + + // read, update, store + var release = await lock(`bookmarksdb`) + try { + var oldBookmark = await db.get(`SELECT url, title, pinned, pinOrder FROM bookmarks WHERE profileId = ? AND url = ?`, [profileId, bookmarkHref]) + + if (oldBookmark) { + // update record + let sql = knex('bookmarks') + .where({profileId, url: bookmarkHref}) + if (typeof href !== 'undefined') sql = sql.update('url', href) + if (typeof title !== 'undefined') sql = sql.update('title', title) + if (typeof description !== 'undefined') sql = sql.update('description', description) + if (typeof tagsStr !== 'undefined') sql = sql.update('tags', tagsStr) + if (typeof pinned !== 'undefined') sql = sql.update('pinned', Number(pinned)) + if (typeof isPublic !== 'undefined') sql = sql.update('isPublic', Number(isPublic)) + await db.run(sql) + } else { + // insert record + await db.run(` + INSERT OR REPLACE + INTO bookmarks (profileId, url, title, description, tags, pinned, isPublic) + VALUES (?, ?, ?, ?, ?, ?, ?) + `, [profileId, href, title, description || '', tagsStr, Number(pinned), Number(isPublic)]) + } + events.emit('changed') + } finally { + release() + } } -exports.setBookmarkPinned = function (profileId, url, pinned) { - return db.run(`UPDATE bookmarks SET pinned = ? WHERE profileId = ? AND url = ?`, [pinned ? 1 : 0, profileId, url]) +/** + * @param {number} profileId + * @param {string} href + * @returns {Promise} + */ +exports.removeBookmark = async function (profileId, href) { + href = normalizeUrl(href, NORMALIZE_OPTS) + var release = await lock(`bookmarksdb`) + try { + await db.run(`DELETE FROM bookmarks WHERE profileId = ? AND url = ?`, [profileId, href]) + events.emit('changed') + } finally { + release() + } } +/** + * @param {number} profileId + * @param {string[]} urls + * @returns {Promise} + */ exports.setBookmarkPinOrder = async function (profileId, urls) { var len = urls.length - await Promise.all(urls.map((url, i) => ( - db.run(`UPDATE bookmarks SET pinOrder = ? WHERE profileId = ? AND url = ?`, [len - i, profileId, url]) - ))) + var release = await lock(`bookmarksdb`) + try { + await Promise.all(urls.map((url, i) => ( + db.run(`UPDATE bookmarks SET pinOrder = ? WHERE profileId = ? AND url = ?`, [len - i, profileId, url]) + ))) + } finally { + release() + } } -exports.getBookmark = async function (profileId, url) { - return toNewFormat(await db.get(`SELECT url, title, tags, notes, pinned, pinOrder, createdAt FROM bookmarks WHERE profileId = ? 
AND url = ?`, [profileId, url])) +/** + * @param {number} profileId + * @param {string} url + * @returns {Promise} + */ +exports.getBookmark = async function (profileId, href) { + href = normalizeUrl(href, NORMALIZE_OPTS) + return toNewFormat(await db.get(`SELECT * FROM bookmarks WHERE profileId = ? AND url = ?`, [profileId, href])) } -exports.listBookmarks = async function (profileId, {tag} = {}) { - var bookmarks = await db.all(`SELECT url, title, tags, notes, pinned, pinOrder, createdAt FROM bookmarks WHERE profileId = ? ORDER BY createdAt DESC`, [profileId]) - bookmarks = bookmarks.map(toNewFormat) - - // apply tag filter - if (tag) { - if (Array.isArray(tag)) { - bookmarks = bookmarks.filter(b => { - return tag.reduce((agg, t) => agg & b.tags.includes(t), true) - }) - } else { - bookmarks = bookmarks.filter(b => b.tags.includes(tag)) - } +/** + * @param {number} profileId + * @param {Object} [opts] + * @param {Object} [opts.filters] + * @param {boolean} [opts.filters.pinned] + * @param {boolean} [opts.filters.isPublic] + * @returns {Promise>} + */ +exports.listBookmarks = async function (profileId, {filters} = {}) { + let sql = knex('bookmarks') + .select('url') + .select('title') + .select('description') + .select('tags') + .select('pinned') + .select('isPublic') + .select('pinOrder') + .select('createdAt') + .where('profileId', '=', profileId) + .orderBy('createdAt', 'DESC') + if (filters && filters.pinned) { + sql = sql.where('pinned', '=', '1') + } + if (filters && 'isPublic' in filters) { + sql = sql.where('isPublic', '=', filters.isPublic ? '1' : '0') } - return bookmarks -} - -exports.listPinnedBookmarks = async function (profileId) { - var bookmarks = await db.all(`SELECT url, title, tags, notes, pinned, pinOrder, createdAt FROM bookmarks WHERE profileId = ? AND pinned = 1 ORDER BY pinOrder DESC`, [profileId]) + var bookmarks = await db.all(sql) return bookmarks.map(toNewFormat) } +/** + * @param {number} profileId + * @returns {Promise>} + */ exports.listBookmarkTags = async function (profileId) { var tagSet = new Set() var bookmarks = await db.all(`SELECT tags FROM bookmarks WHERE profileId = ?`, [profileId]) @@ -89,37 +221,72 @@ exports.listBookmarkTags = async function (profileId) { return Array.from(tagSet) } -// TEMP -// apply normalization to old bookmarks -// (can probably remove this in 2018 or so) -// -prf -exports.fixOldBookmarks = async function () { - var bookmarks = await db.all(`SELECT url FROM bookmarks`) - bookmarks.forEach(b => { - let newUrl = normalizeUrl(b.url, NORMALIZE_OPTS) - db.run(`UPDATE bookmarks SET url = ? WHERE url = ?`, [newUrl, b.url]) - }) -} - +/** + * @param {string | string[]} v + * @returns {string} + */ function tagsToString (v) { if (Array.isArray(v)) { v = v.join(' ') } + if (typeof v === 'string') { + v = v.replace(/,/g, ' ') // convert any commas to spaces + } return v } +/** + * @param {Object} b + * @returns {Bookmark | null} + */ function toNewFormat (b) { - if (!b) return b + if (!b) return null return { - _origin: false, - _url: false, - private: true, createdAt: b.createdAt * 1e3, // convert to ms href: b.url, title: b.title, + description: b.description, tags: b.tags ? 
b.tags.split(' ').filter(Boolean) : [],
-    notes: b.notes,
     pinned: !!b.pinned,
+    isPublic: !!b.isPublic,
     pinOrder: b.pinOrder
   }
 }
+
+/**
+ * @param {string} v
+ * @returns {void}
+ */
+function assertValidHref (v) {
+  assert(v && typeof v === 'string', 'href must be a valid URL')
+}
+
+/**
+ * @param {string} v
+ * @returns {void}
+ */
+function assertValidTitle (v) {
+  assert(v && typeof v === 'string', 'title must be a non-empty string')
+}
+
+/**
+ * @param {string} v
+ * @returns {void}
+ */
+function assertValidDescription (v) {
+  if (!v) return // optional
+  assert(typeof v === 'string', 'description must be a string')
+}
+
+/**
+ * @param {string|string[]} v
+ * @returns {void}
+ */
+function assertValidTags (v) {
+  if (!v) return // optional
+  if (Array.isArray(v)) {
+    assert(v.every(item => typeof item === 'string'), 'tags must be a string or array of strings')
+  } else {
+    assert(typeof v === 'string', 'tags must be a string or array of strings')
+  }
+}
diff --git a/dbs/dat-dns.js b/dbs/dat-dns.js
new file mode 100644
index 00000000..19a303e3
--- /dev/null
+++ b/dbs/dat-dns.js
@@ -0,0 +1,106 @@
+const EventEmitter = require('events')
+const db = require('./profile-data-db')
+const knex = require('../lib/knex')
+const lock = require('../lib/lock')
+
+// typedefs
+// =
+
+/**
+ * @typedef {Object} DatDnsRecord
+ * @prop {string} name
+ * @prop {string} key
+ * @prop {boolean} isCurrent
+ * @prop {number} lastConfirmedAt
+ * @prop {number} firstConfirmedAt
+ */
+
+// globals
+// =
+
+const events = new EventEmitter()
+
+// exported api
+// =
+
+exports.on = events.on.bind(events)
+exports.once = events.once.bind(events)
+exports.removeListener = events.removeListener.bind(events)
+
+/**
+ * @param {string} name
+ * @returns {Promise<DatDnsRecord>}
+ */
+exports.getCurrentByName = async function (name) {
+  return massageDNSRecord(await db.get(knex('dat_dns').where({name, isCurrent: 1})))
+}
+
+/**
+ * @param {string} key
+ * @returns {Promise<DatDnsRecord>}
+ */
+exports.getCurrentByKey = async function (key) {
+  return massageDNSRecord(await db.get(knex('dat_dns').where({key, isCurrent: 1})))
+}
+
+/**
+ * @param {Object} opts
+ * @param {string} opts.key
+ * @param {string} opts.name
+ * @returns {Promise<void>}
+ */
+exports.update = async function ({key, name}) {
+  var release = await lock('dat-dns-update:' + name)
+  try {
+    var old = await db.get(knex('dat_dns').where({name, isCurrent: 1}))
+    if (old && old.key !== key) {
+      // unset old
+      await db.run(knex('dat_dns').update({isCurrent: 0}).where({name}))
+      events.emit('update', {key: old.key, name: undefined})
+    }
+
+    let curr = await db.get(knex('dat_dns').where({name, key}))
+    if (!curr) {
+      // insert new
+      await db.run(knex('dat_dns').insert({
+        name,
+        key,
+        isCurrent: 1,
+        lastConfirmedAt: Date.now(),
+        firstConfirmedAt: Date.now()
+      }))
+    } else {
+      // update current
+      await db.run(knex('dat_dns').update({lastConfirmedAt: Date.now(), isCurrent: 1}).where({name, key}))
+    }
+    events.emit('update', {key, name})
+  } finally {
+    release()
+  }
+}
+
+/**
+ * @param {string} key
+ * @returns {Promise<void>}
+ */
+exports.unset = async function (key) {
+  var curr = await db.get(knex('dat_dns').where({key, isCurrent: 1}))
+  if (curr) {
+    await db.run(knex('dat_dns').update({isCurrent: 0}).where({key}))
+    events.emit('update', {key, name: undefined})
+  }
+}
+
+// internal methods
+// =
+
+function massageDNSRecord (record) {
+  if (!record) return null
+  return {
+    name: record.name,
+    key: record.key,
+    isCurrent: Boolean(record.isCurrent),
+    lastConfirmedAt: 
record.lastConfirmedAt, + firstConfirmedAt: record.firstConfirmedAt + } +} \ No newline at end of file diff --git a/dbs/history.js b/dbs/history.js index 3d43c6c8..4b70ae57 100644 --- a/dbs/history.js +++ b/dbs/history.js @@ -1,7 +1,13 @@ const lock = require('../lib/lock') const db = require('./profile-data-db') +// typedefs +// = + class BadParamError extends Error { + /** + * @param {string} msg + */ constructor (msg) { super() this.name = 'BadParamError' @@ -9,9 +15,30 @@ class BadParamError extends Error { } } +/** + * @typedef {Object} Visit + * @prop {number} profileId + * @prop {string} url + * @prop {string} title + * @prop {number} ts + * + * @typedef {Object} VisitSearchResult + * @prop {string} offsets + * @prop {string} url + * @prop {string} title + * @prop {number} num_visits + */ + // exported methods // = +/** + * @param {number} profileId + * @param {Object} values + * @param {string} values.url + * @param {string} values.title + * @returns {Promise} + */ exports.addVisit = async function (profileId, {url, title}) { // validate parameters if (!url || typeof url !== 'string') { @@ -25,17 +52,19 @@ exports.addVisit = async function (profileId, {url, title}) { try { await db.run('BEGIN TRANSACTION;') - // get current stats - var stats = await db.get('SELECT * FROM visit_stats WHERE url = ?;', [url]) var ts = Date.now() - - // create or update stats - if (!stats) { - await db.run('INSERT INTO visit_stats (url, num_visits, last_visit_ts) VALUES (?, ?, ?);', [url, 1, ts]) - await db.run('INSERT INTO visit_fts (url, title) VALUES (?, ?);', [url, title]) - } else { - let num_visits = (+stats.num_visits || 1) + 1 - await db.run('UPDATE visit_stats SET num_visits = ?, last_visit_ts = ? WHERE url = ?;', [num_visits, ts, url]) + if (!url.startsWith('beaker://')) { // dont log stats on internal sites, keep them out of the search + // get current stats + var stats = await db.get('SELECT * FROM visit_stats WHERE url = ?;', [url]) + + // create or update stats + if (!stats) { + await db.run('INSERT INTO visit_stats (url, num_visits, last_visit_ts) VALUES (?, ?, ?);', [url, 1, ts]) + await db.run('INSERT INTO visit_fts (url, title) VALUES (?, ?);', [url, title]) + } else { + let num_visits = (+stats.num_visits || 1) + 1 + await db.run('UPDATE visit_stats SET num_visits = ?, last_visit_ts = ? WHERE url = ?;', [num_visits, ts, url]) + } } // visited within 1 hour? 
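// Usage sketch for the beaker:// exclusion above (the require path, profileId, and URLs are
// example values; assumes the rest of @beaker/core is already set up):
const history = require('./dbs/history')

async function exampleVisitLogging () {
  await history.addVisit(0, {url: 'dat://beakerbrowser.com/', title: 'Beaker'}) // counted in visit_stats and indexed in visit_fts
  await history.addVisit(0, {url: 'beaker://library/', title: 'Library'})       // recorded in visits only, kept out of search
  return history.search('library') // never matches beaker:// pages, since they are not in visit_fts
}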
@@ -54,14 +83,24 @@ exports.addVisit = async function (profileId, {url, title}) { } } +/** + * @param {number} profileId + * @param {Object} opts + * @param {string} [opts.search] + * @param {number} [opts.offset] + * @param {number} [opts.limit] + * @param {number} [opts.before] + * @param {number} [opts.after] + * @returns {Promise>} + */ exports.getVisitHistory = async function (profileId, {search, offset, limit, before, after}) { var release = await lock('history-db') try { - const params = [ + const params = /** @type Array */([ profileId, limit || 50, offset || 0 - ] + ]) if (search) { // prep search terms params.push( @@ -102,6 +141,13 @@ exports.getVisitHistory = async function (profileId, {search, offset, limit, bef } } +/** + * @param {number} profileId + * @param {Object} opts + * @param {number} [opts.offset] + * @param {number} [opts.limit] + * @returns {Promise>} + */ exports.getMostVisited = async function (profileId, { offset, limit }) { var release = await lock('history-db') try { @@ -121,6 +167,10 @@ exports.getMostVisited = async function (profileId, { offset, limit }) { } } +/** + * @param {string} q + * @returns {Promise>} + */ exports.search = async function (q) { if (!q || typeof q !== 'string') { throw new BadParamError('q must be a string') @@ -140,7 +190,7 @@ exports.search = async function (q) { SELECT offsets(visit_fts) as offsets, visit_fts.url, visit_fts.title, visit_stats.num_visits FROM visit_fts LEFT JOIN visit_stats ON visit_stats.url = visit_fts.url - WHERE visit_fts MATCH ? + WHERE visit_fts MATCH ? AND visit_stats.num_visits > 2 ORDER BY visit_stats.num_visits DESC LIMIT 10; `, [q]) @@ -149,6 +199,10 @@ exports.search = async function (q) { } } +/** + * @param {string} url + * @returns {Promise} + */ exports.removeVisit = async function (url) { // validate parameters if (!url || typeof url !== 'string') { @@ -169,6 +223,10 @@ exports.removeVisit = async function (url) { } } +/** + * @param {number} timestamp + * @returns {Promise} + */ exports.removeVisitsAfter = async function (timestamp) { var release = await lock('history-db') try { @@ -183,6 +241,9 @@ exports.removeVisitsAfter = async function (timestamp) { } } +/** + * @returns {Promise} + */ exports.removeAllVisits = async function () { var release = await lock('history-db') db.run('DELETE FROM visits;') diff --git a/dbs/profile-data-db.js b/dbs/profile-data-db.js index 59734685..9e3b6641 100644 --- a/dbs/profile-data-db.js +++ b/dbs/profile-data-db.js @@ -1,8 +1,15 @@ const sqlite3 = require('sqlite3') const path = require('path') -const fs = require('fs') const {cbPromise} = require('../lib/functions') -const {setupSqliteDB} = require('../lib/db') +const {setupSqliteDB, handleQueryBuilder} = require('../lib/db') + +// typedefs +// = + +/** + * @typedef {Object} SQLiteResult + * @prop {string} lastID + */ // globals // = @@ -14,6 +21,10 @@ var setupPromise // exported methods // = +/** + * @param {Object} opts + * @param {string} opts.userDataPath + */ exports.setup = function (opts) { // open database var dbPath = path.join(opts.userDataPath, 'Profiles') @@ -21,29 +32,55 @@ exports.setup = function (opts) { setupPromise = setupSqliteDB(db, {setup: setupDb, migrations}, '[PROFILES]') } +/** + * @param {...(any)} args + * @return {Promise} + */ exports.get = async function (...args) { await setupPromise + args = handleQueryBuilder(args) return cbPromise(cb => db.get(...args, cb)) } +/** + * @param {...(any)} args + * @return {Promise>} + */ exports.all = async function (...args) { await 
setupPromise + args = handleQueryBuilder(args) return cbPromise(cb => db.all(...args, cb)) } +/** + * @param {...(any)} args + * @return {Promise} + */ exports.run = async function (...args) { await setupPromise - return cbPromise(cb => db.run(...args, cb)) + args = handleQueryBuilder(args) + return cbPromise(cb => db.run(...args, function (err) { + if (err) cb(err) + else cb(null, {lastID: this.lastID}) + })) } +/** + * @returns {Promise} + */ exports.serialize = function () { return db.serialize() } +/** + * @returns {Promise} + */ exports.parallelize = function () { return db.parallelize() } +exports.getSqliteInstance = () => db + // internal methods // = @@ -74,6 +111,19 @@ migrations = [ migration('profile-data.v21.sql'), migration('profile-data.v22.sql', {canFail: true}), // canFail for the same reason as v16, ffs migration('profile-data.v23.sql'), + migration('profile-data.v24.sql'), + migration('profile-data.v25.sql'), + migration('profile-data.v26.sql'), + migration('profile-data.v27.sql'), + migration('profile-data.v28.sql'), + migration('profile-data.v29.sql'), + migration('profile-data.v30.sql'), + migration('profile-data.v31.sql'), + migration('profile-data.v32.sql'), + migration('profile-data.v33.sql'), + migration('profile-data.v34.sql'), + migration('profile-data.v35.sql'), + migration('profile-data.v36.sql') ] function migration (file, opts = {}) { return cb => { diff --git a/dbs/schemas/profile-data.sql.js b/dbs/schemas/profile-data.sql.js index 39b9090a..9adff13b 100644 --- a/dbs/schemas/profile-data.sql.js +++ b/dbs/schemas/profile-data.sql.js @@ -5,6 +5,25 @@ CREATE TABLE profiles ( createdAt INTEGER DEFAULT (strftime('%s', 'now')) ); +CREATE TABLE users ( + id INTEGER PRIMARY KEY NOT NULL, + label TEXT, + url TEXT, + isDefault INTEGER DEFAULT 0, + isTemporary INTEGER DEFAULT 0, + createdAt INTEGER +); + +CREATE TABLE user_site_sessions ( + id INTEGER PRIMARY KEY NOT NULL, + userId INTEGER NOT NULL, + url TEXT, + permissionsJson TEXT, + createdAt INTEGER, + + FOREIGN KEY (userId) REFERENCES users (id) ON DELETE CASCADE +); + CREATE TABLE archives ( profileId INTEGER NOT NULL, key TEXT NOT NULL, -- dat key @@ -46,22 +65,23 @@ CREATE TABLE archives_meta_type ( type TEXT ); --- a list of the draft-dats for a master-dat -CREATE TABLE archive_drafts ( - profileId INTEGER, - masterKey TEXT, -- key of the master dat - draftKey TEXT, -- key of the draft dat - createdAt INTEGER DEFAULT (strftime('%s', 'now')), - - isActive INTEGER, -- is this the active draft? 
(deprecated) - - FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE +CREATE TABLE dat_dns ( + id INTEGER PRIMARY KEY, + name TEXT, + key TEXT, + isCurrent INTEGER, + lastConfirmedAt INTEGER, + firstConfirmedAt INTEGER ); +CREATE INDEX dat_dns_name ON dat_dns (name); +CREATE INDEX dat_dns_key ON dat_dns (key); CREATE TABLE bookmarks ( profileId INTEGER, url TEXT NOT NULL, title TEXT, + description TEXT, + isPublic INTEGER, pinned INTEGER, pinOrder INTEGER DEFAULT 0, createdAt INTEGER DEFAULT (strftime('%s', 'now')), @@ -72,17 +92,6 @@ CREATE TABLE bookmarks ( FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE ); -CREATE TABLE templates ( - profileId INTEGER, - url TEXT NOT NULL, - title TEXT, - screenshot, - createdAt INTEGER DEFAULT (strftime('%s', 'now')), - - PRIMARY KEY (profileId, url), - FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE -); - CREATE TABLE visits ( profileId INTEGER, url TEXT NOT NULL, @@ -102,7 +111,322 @@ CREATE TABLE visit_stats ( CREATE VIRTUAL TABLE visit_fts USING fts4 (url, title); CREATE UNIQUE INDEX visits_stats_url ON visit_stats (url); --- list of the user's installed apps +-- list of the users installed apps +CREATE TABLE installed_applications ( + id INTEGER PRIMARY KEY NOT NULL, + userId INTEGER NOT NULL, + enabled INTEGER DEFAULT 1, + url TEXT, + createdAt INTEGER, + + FOREIGN KEY (userId) REFERENCES users (id) ON DELETE CASCADE +); + +-- list of dats being looked for +CREATE TABLE watchlist ( + profileId INTEGER NOT NULL, + url TEXT NOT NULL, + description TEXT NOT NULL, + seedWhenResolved BOOLEAN NOT NULL, + resolved BOOLEAN NOT NULL DEFAULT (0), + updatedAt INTEGER DEFAULT (strftime('%s', 'now')), + createdAt INTEGER DEFAULT (strftime('%s', 'now')), + + PRIMARY KEY (profileId, url), + FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE +); + +-- list of the users current templates +CREATE TABLE templates ( + profileId INTEGER, + url TEXT NOT NULL, + title TEXT, + screenshot, + createdAt INTEGER DEFAULT (strftime('%s', 'now')), + + PRIMARY KEY (profileId, url), + FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE +); + +-- list of sites being crawled +CREATE TABLE crawl_sources ( + id INTEGER PRIMARY KEY NOT NULL, + url TEXT NOT NULL, + datDnsId INTEGER +); + +-- tracking information on the crawl-state of the sources +CREATE TABLE crawl_sources_meta ( + crawlSourceId INTEGER NOT NULL, + crawlSourceVersion INTEGER NOT NULL, + crawlDataset TEXT NOT NULL, + crawlDatasetVersion INTEGER NOT NULL, + updatedAt INTEGER, + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); + +-- crawled descriptions of other sites +CREATE TABLE crawl_site_descriptions ( + crawlSourceId INTEGER NOT NULL, + crawledAt INTEGER, + + url TEXT, + title TEXT, + description TEXT, + type TEXT, -- comma separated strings + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); +CREATE VIRTUAL TABLE crawl_site_descriptions_fts_index USING fts5(title, description, content='crawl_site_descriptions'); + +-- triggers to keep crawl_site_descriptions_fts_index updated +CREATE TRIGGER crawl_site_descriptions_ai AFTER INSERT ON crawl_site_descriptions BEGIN + INSERT INTO crawl_site_descriptions_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); +END; +CREATE TRIGGER crawl_site_descriptions_ad AFTER DELETE ON crawl_site_descriptions BEGIN + INSERT INTO crawl_site_descriptions_fts_index(crawl_site_descriptions_fts_index, rowid, 
title, description) VALUES('delete', old.rowid, old.title, old.description); +END; +CREATE TRIGGER crawl_site_descriptions_au AFTER UPDATE ON crawl_site_descriptions BEGIN + INSERT INTO crawl_site_descriptions_fts_index(crawl_site_descriptions_fts_index, rowid, title, description) VALUES('delete', old.rowid, old.title, old.description); + INSERT INTO crawl_site_descriptions_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); +END; + +-- crawled tags +CREATE TABLE crawl_tags ( + id INTEGER PRIMARY KEY, + tag TEXT UNIQUE +); + +-- crawled posts +CREATE TABLE crawl_posts ( + crawlSourceId INTEGER NOT NULL, + pathname TEXT NOT NULL, + crawledAt INTEGER, + + body TEXT, + createdAt INTEGER, + updatedAt INTEGER, + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); +CREATE VIRTUAL TABLE crawl_posts_fts_index USING fts5(body, content='crawl_posts'); + +-- triggers to keep crawl_posts_fts_index updated +CREATE TRIGGER crawl_posts_ai AFTER INSERT ON crawl_posts BEGIN + INSERT INTO crawl_posts_fts_index(rowid, body) VALUES (new.rowid, new.body); +END; +CREATE TRIGGER crawl_posts_ad AFTER DELETE ON crawl_posts BEGIN + INSERT INTO crawl_posts_fts_index(crawl_posts_fts_index, rowid, body) VALUES('delete', old.rowid, old.body); +END; +CREATE TRIGGER crawl_posts_au AFTER UPDATE ON crawl_posts BEGIN + INSERT INTO crawl_posts_fts_index(crawl_posts_fts_index, rowid, body) VALUES('delete', old.rowid, old.body); + INSERT INTO crawl_posts_fts_index(rowid, body) VALUES (new.rowid, new.body); +END; + +-- crawled comments +CREATE TABLE crawl_comments ( + crawlSourceId INTEGER NOT NULL, + pathname TEXT NOT NULL, + crawledAt INTEGER, + + topic TEXT, + replyTo TEXT, + body TEXT, + createdAt INTEGER, + updatedAt INTEGER, + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); +CREATE INDEX crawl_comments_topic ON crawl_comments (topic); +CREATE VIRTUAL TABLE crawl_comments_fts_index USING fts5(body, content='crawl_comments'); + +-- triggers to keep crawl_comments_fts_index updated +CREATE TRIGGER crawl_comments_ai AFTER INSERT ON crawl_comments BEGIN + INSERT INTO crawl_comments_fts_index(rowid, body) VALUES (new.rowid, new.body); +END; +CREATE TRIGGER crawl_comments_ad AFTER DELETE ON crawl_comments BEGIN + INSERT INTO crawl_comments_fts_index(crawl_comments_fts_index, rowid, body) VALUES('delete', old.rowid, old.body); +END; +CREATE TRIGGER crawl_comments_au AFTER UPDATE ON crawl_comments BEGIN + INSERT INTO crawl_comments_fts_index(crawl_comments_fts_index, rowid, body) VALUES('delete', old.rowid, old.body); + INSERT INTO crawl_comments_fts_index(rowid, body) VALUES (new.rowid, new.body); +END; + +-- crawled reactions +CREATE TABLE crawl_reactions ( + crawlSourceId INTEGER NOT NULL, + pathname TEXT NOT NULL, + crawledAt INTEGER, + + topic TEXT NOT NULL, + emojis TEXT NOT NULL, + + PRIMARY KEY (crawlSourceId, pathname), + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); +CREATE INDEX crawl_reactions_topic ON crawl_reactions (topic); + +-- crawled votes +CREATE TABLE crawl_votes ( + crawlSourceId INTEGER NOT NULL, + pathname TEXT NOT NULL, + crawledAt INTEGER, + + topic TEXT NOT NULL, + vote INTEGER NOT NULL, + createdAt INTEGER, + updatedAt INTEGER, + + PRIMARY KEY (crawlSourceId, pathname), + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); +CREATE INDEX crawl_votes_topic ON crawl_votes (topic); + +-- crawled bookmarks +CREATE TABLE crawl_bookmarks ( + id INTEGER 
PRIMARY KEY, + crawlSourceId INTEGER NOT NULL, + pathname TEXT NOT NULL, + crawledAt INTEGER, + + href TEXT, + title TEXT, + description TEXT, + createdAt INTEGER, + updatedAt INTEGER, + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); +CREATE VIRTUAL TABLE crawl_bookmarks_fts_index USING fts5(title, description, content='crawl_bookmarks'); + +-- triggers to keep crawl_bookmarks_fts_index updated +CREATE TRIGGER crawl_bookmarks_ai AFTER INSERT ON crawl_bookmarks BEGIN + INSERT INTO crawl_bookmarks_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); +END; +CREATE TRIGGER crawl_bookmarks_ad AFTER DELETE ON crawl_bookmarks BEGIN + INSERT INTO crawl_bookmarks_fts_index(crawl_bookmarks_fts_index, rowid, title, description) VALUES('delete', old.rowid, old.title, old.description); +END; +CREATE TRIGGER crawl_bookmarks_au AFTER UPDATE ON crawl_bookmarks BEGIN + INSERT INTO crawl_bookmarks_fts_index(crawl_bookmarks_fts_index, rowid, title, description) VALUES('delete', old.rowid, old.title, old.description); + INSERT INTO crawl_bookmarks_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); +END; + +-- crawled bookmark tags +CREATE TABLE crawl_bookmarks_tags ( + crawlBookmarkId INTEGER, + crawlTagId INTEGER, + + FOREIGN KEY (crawlBookmarkId) REFERENCES crawl_bookmarks (id) ON DELETE CASCADE, + FOREIGN KEY (crawlTagId) REFERENCES crawl_tags (id) ON DELETE CASCADE +); + +-- crawled follows +CREATE TABLE crawl_follows ( + crawlSourceId INTEGER NOT NULL, + crawledAt INTEGER, + + destUrl TEXT NOT NULL, + + PRIMARY KEY (crawlSourceId, destUrl), + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); + +-- crawled discussions +CREATE TABLE crawl_discussions ( + id INTEGER PRIMARY KEY, + crawlSourceId INTEGER NOT NULL, + pathname TEXT NOT NULL, + crawledAt INTEGER, + + title TEXT NOT NULL, + body TEXT, + href TEXT, + createdAt INTEGER, + updatedAt INTEGER, + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); +CREATE INDEX crawl_discussions_url ON crawl_discussions (crawlSourceId, pathname); +CREATE VIRTUAL TABLE crawl_discussions_fts_index USING fts5(title, body, content='crawl_discussions'); + +-- triggers to keep crawl_discussions_fts_index updated +CREATE TRIGGER crawl_discussions_ai AFTER INSERT ON crawl_discussions BEGIN + INSERT INTO crawl_discussions_fts_index(rowid, title, body) VALUES (new.rowid, new.title, new.body); +END; +CREATE TRIGGER crawl_discussions_ad AFTER DELETE ON crawl_discussions BEGIN + INSERT INTO crawl_discussions_fts_index(crawl_discussions_fts_index, rowid, title, body) VALUES('delete', old.rowid, old.title, old.body); +END; +CREATE TRIGGER crawl_discussions_au AFTER UPDATE ON crawl_discussions BEGIN + INSERT INTO crawl_discussions_fts_index(crawl_discussions_fts_index, rowid, title, body) VALUES('delete', old.rowid, old.title, old.body); + INSERT INTO crawl_discussions_fts_index(rowid, title, body) VALUES (new.rowid, new.title, new.body); +END; + +-- crawled discussion tags +CREATE TABLE crawl_discussions_tags ( + crawlDiscussionId INTEGER, + crawlTagId INTEGER, + + FOREIGN KEY (crawlDiscussionId) REFERENCES crawl_discussions (id) ON DELETE CASCADE, + FOREIGN KEY (crawlTagId) REFERENCES crawl_tags (id) ON DELETE CASCADE +); + +-- crawled media +CREATE TABLE crawl_media ( + id INTEGER PRIMARY KEY, + crawlSourceId INTEGER NOT NULL, + pathname TEXT NOT NULL, + crawledAt INTEGER, + + subtype TEXT NOT NULL, + href TEXT NOT NULL, + 
title TEXT NOT NULL, + description TEXT, + createdAt INTEGER, + updatedAt INTEGER, + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); +CREATE INDEX crawl_media_url ON crawl_media (crawlSourceId, pathname); +CREATE INDEX crawl_media_subtype ON crawl_media (subtype); +CREATE INDEX crawl_media_href ON crawl_media (href); +CREATE VIRTUAL TABLE crawl_media_fts_index USING fts5(title, description, content='crawl_media'); + +-- triggers to keep crawl_media_fts_index updated +CREATE TRIGGER crawl_media_ai AFTER INSERT ON crawl_media BEGIN + INSERT INTO crawl_media_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); +END; +CREATE TRIGGER crawl_media_ad AFTER DELETE ON crawl_media BEGIN + INSERT INTO crawl_media_fts_index(crawl_media_fts_index, rowid, title, description) VALUES('delete', old.rowid, old.title, old.description); +END; +CREATE TRIGGER crawl_media_au AFTER UPDATE ON crawl_media BEGIN + INSERT INTO crawl_media_fts_index(crawl_media_fts_index, rowid, title, description) VALUES('delete', old.rowid, old.title, old.description); + INSERT INTO crawl_media_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); +END; + +-- crawled media tags +CREATE TABLE crawl_media_tags ( + crawlMediaId INTEGER, + crawlTagId INTEGER, + + FOREIGN KEY (crawlMediaId) REFERENCES crawl_media (id) ON DELETE CASCADE, + FOREIGN KEY (crawlTagId) REFERENCES crawl_tags (id) ON DELETE CASCADE +); + +-- a list of the draft-dats for a master-dat +-- deprecated +CREATE TABLE archive_drafts ( + profileId INTEGER, + masterKey TEXT, -- key of the master dat + draftKey TEXT, -- key of the draft dat + createdAt INTEGER DEFAULT (strftime('%s', 'now')), + + isActive INTEGER, -- is this the active draft? 
(deprecated) + + FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE +); + +-- list of the users installed apps -- deprecated CREATE TABLE apps ( profileId INTEGER NOT NULL, @@ -115,7 +439,7 @@ CREATE TABLE apps ( FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE ); --- log of the user's app installations +-- log of the users app installations -- deprecated CREATE TABLE apps_log ( profileId INTEGER NOT NULL, @@ -126,20 +450,6 @@ CREATE TABLE apps_log ( FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE ); --- add a database for watchlist feature -CREATE TABLE watchlist ( - profileId INTEGER NOT NULL, - url TEXT NOT NULL, - description TEXT NOT NULL, - seedWhenResolved BOOLEAN NOT NULL, - resolved BOOLEAN NOT NULL DEFAULT (0), - updatedAt INTEGER DEFAULT (strftime('%s', 'now')), - createdAt INTEGER DEFAULT (strftime('%s', 'now')), - - PRIMARY KEY (profileId, url), - FOREIGN KEY (profileId) REFERENCES profiles (id) ON DELETE CASCADE -); - -- deprecated CREATE TABLE workspaces ( profileId INTEGER NOT NULL, @@ -157,14 +467,14 @@ CREATE TABLE workspaces ( INSERT INTO profiles (id) VALUES (0); -- default bookmarks -INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Beaker Home', 'dat://beakerbrowser.com', 1); -INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Dat Project', 'dat://datproject.org', 1); -INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, '@BeakerBrowser', 'https://twitter.com/beakerbrowser', 1); -INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Hashbase', 'https://hashbase.io', 1); +INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Beaker Browser', 'dat://beakerbrowser.com', 1); +INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Dat Project', 'dat://datproject.org', 0); +INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Hashbase', 'https://hashbase.io', 0); INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Documentation', 'dat://beakerbrowser.com/docs', 1); -INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Report an issue', 'https://github.com/beakerbrowser/beaker/issues', 1); -INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Explore the p2p Web', 'dat://taravancil.com/explore-the-p2p-web.md', 1); +INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Report an issue', 'https://github.com/beakerbrowser/beaker/issues', 0); INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Support Beaker', 'https://opencollective.com/beaker', 1); +INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Library', 'beaker://library/', 1); +INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Beaker.Social', 'dat://beaker.social', 1); -PRAGMA user_version = 23; +PRAGMA user_version = 36; ` diff --git a/dbs/schemas/profile-data.v1.sql.js b/dbs/schemas/profile-data.v1.sql.js index fc6bce9b..f22c43dc 100644 --- a/dbs/schemas/profile-data.v1.sql.js +++ b/dbs/schemas/profile-data.v1.sql.js @@ -59,13 +59,11 @@ CREATE UNIQUE INDEX visits_stats_url ON visit_stats (url); INSERT INTO profiles (id) VALUES (0); -- default bookmarks -INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Beaker Home', 'dat://beakerbrowser.com', 1); -INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Dat Project', 'dat://datproject.org', 1); -INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, '@BeakerBrowser', 
'https://twitter.com/beakerbrowser', 1); -INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Hashbase', 'https://hashbase.io', 1); +INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Beaker Browser', 'dat://beakerbrowser.com', 1); +INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Dat Project', 'dat://datproject.org', 0); +INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Hashbase', 'https://hashbase.io', 0); INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Documentation', 'dat://beakerbrowser.com/docs', 1); -INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Report an issue', 'https://github.com/beakerbrowser/beaker/issues', 1); -INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Explore the P2P Web', 'dat://explore.beakerbrowser.com/', 1); +INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Report an issue', 'https://github.com/beakerbrowser/beaker/issues', 0); INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Support Beaker', 'https://opencollective.com/beaker', 1); PRAGMA user_version = 1; diff --git a/dbs/schemas/profile-data.v24.sql.js b/dbs/schemas/profile-data.v24.sql.js new file mode 100644 index 00000000..bacd4af3 --- /dev/null +++ b/dbs/schemas/profile-data.v24.sql.js @@ -0,0 +1,128 @@ +module.exports = ` +-- new default bookmarks +INSERT INTO bookmarks (profileId, title, url, pinned) VALUES (0, 'Beaker.Social', 'dat://beaker.social', 1); + +-- description of the bookmark's content, often pulled from the bookmarked page +ALTER TABLE bookmarks ADD COLUMN description TEXT; + +-- sync the bookmark to the user's public profile +ALTER TABLE bookmarks ADD COLUMN isPublic INTEGER; + +CREATE TABLE users ( + id INTEGER PRIMARY KEY NOT NULL, + url TEXT, + isDefault INTEGER DEFAULT 0, + createdAt INTEGER +); + +-- list of sites being crawled +CREATE TABLE crawl_sources ( + id INTEGER PRIMARY KEY NOT NULL, + url TEXT NOT NULL +); + +-- tracking information on the crawl-state of the sources +CREATE TABLE crawl_sources_meta ( + crawlSourceId INTEGER NOT NULL, + crawlSourceVersion INTEGER NOT NULL, + crawlDataset TEXT NOT NULL, + crawlDatasetVersion INTEGER NOT NULL, + updatedAt INTEGER, + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); + +-- crawled descriptions of other sites +CREATE TABLE crawl_site_descriptions ( + crawlSourceId INTEGER NOT NULL, + crawledAt INTEGER, + + url TEXT, + title TEXT, + description TEXT, + type TEXT, -- comma separated strings + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); +CREATE VIRTUAL TABLE crawl_site_descriptions_fts_index USING fts5(title, description, content='crawl_site_descriptions'); + +-- triggers to keep crawl_site_descriptions_fts_index updated +CREATE TRIGGER crawl_site_descriptions_ai AFTER INSERT ON crawl_site_descriptions BEGIN + INSERT INTO crawl_site_descriptions_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); +END; +CREATE TRIGGER crawl_site_descriptions_ad AFTER DELETE ON crawl_site_descriptions BEGIN + INSERT INTO crawl_site_descriptions_fts_index(crawl_site_descriptions_fts_index, rowid, title, description) VALUES('delete', old.rowid, old.title, old.description); +END; +CREATE TRIGGER crawl_site_descriptions_au AFTER UPDATE ON crawl_site_descriptions BEGIN + INSERT INTO crawl_site_descriptions_fts_index(crawl_site_descriptions_fts_index, rowid, title, description) VALUES('delete', old.a, 
old.title, old.description); + INSERT INTO crawl_site_descriptions_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); +END; + +-- crawled posts +CREATE TABLE crawl_posts ( + crawlSourceId INTEGER NOT NULL, + pathname TEXT NOT NULL, + crawledAt INTEGER, + + body TEXT, + createdAt INTEGER, + updatedAt INTEGER, + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); +CREATE VIRTUAL TABLE crawl_posts_fts_index USING fts5(body, content='crawl_posts'); + +-- triggers to keep crawl_posts_fts_index updated +CREATE TRIGGER crawl_posts_ai AFTER INSERT ON crawl_posts BEGIN + INSERT INTO crawl_posts_fts_index(rowid, body) VALUES (new.rowid, new.body); +END; +CREATE TRIGGER crawl_posts_ad AFTER DELETE ON crawl_posts BEGIN + INSERT INTO crawl_posts_fts_index(crawl_posts_fts_index, rowid, body) VALUES('delete', old.rowid, old.body); +END; +CREATE TRIGGER crawl_posts_au AFTER UPDATE ON crawl_posts BEGIN + INSERT INTO crawl_posts_fts_index(crawl_posts_fts_index, rowid, body) VALUES('delete', old.rowid, old.body); + INSERT INTO crawl_posts_fts_index(rowid, body) VALUES (new.rowid, new.body); +END; + +-- crawled bookmarks +CREATE TABLE crawl_bookmarks ( + crawlSourceId INTEGER NOT NULL, + pathname TEXT NOT NULL, + crawledAt INTEGER, + + href TEXT, + title TEXT, + description TEXT, + tags TEXT, + createdAt INTEGER, + updatedAt INTEGER, + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); +CREATE VIRTUAL TABLE crawl_bookmarks_fts_index USING fts5(title, description, tags, content='crawl_bookmarks'); + +-- triggers to keep crawl_bookmarks_fts_index updated +CREATE TRIGGER crawl_bookmarks_ai AFTER INSERT ON crawl_bookmarks BEGIN + INSERT INTO crawl_bookmarks_fts_index(rowid, title, description, tags) VALUES (new.rowid, new.title, new.description, new.tags); +END; +CREATE TRIGGER crawl_bookmarks_ad AFTER DELETE ON crawl_bookmarks BEGIN + INSERT INTO crawl_bookmarks_fts_index(crawl_bookmarks_fts_index, rowid, title, description, tags) VALUES('delete', old.rowid, old.title, old.description, old.tags); +END; +CREATE TRIGGER crawl_bookmarks_au AFTER UPDATE ON crawl_bookmarks BEGIN + INSERT INTO crawl_bookmarks_fts_index(crawl_bookmarks_fts_index, rowid, title, description, tags) VALUES('delete', old.rowid, old.title, old.description, old.tags); + INSERT INTO crawl_bookmarks_fts_index(rowid, title, description, tags) VALUES (new.rowid, new.title, new.description, new.tags); +END; + +-- crawled follows +CREATE TABLE crawl_graph ( + crawlSourceId INTEGER NOT NULL, + crawledAt INTEGER, + + destUrl TEXT NOT NULL, + + PRIMARY KEY (crawlSourceId, destUrl), + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); + +PRAGMA user_version = 24; +` \ No newline at end of file diff --git a/dbs/schemas/profile-data.v25.sql.js b/dbs/schemas/profile-data.v25.sql.js new file mode 100644 index 00000000..622ebc64 --- /dev/null +++ b/dbs/schemas/profile-data.v25.sql.js @@ -0,0 +1,18 @@ +module.exports = ` + +-- crawled reactions +CREATE TABLE crawl_reactions ( + crawlSourceId INTEGER NOT NULL, + pathname TEXT NOT NULL, + crawledAt INTEGER, + + topic TEXT NOT NULL, + emojis TEXT NOT NULL, + + PRIMARY KEY (crawlSourceId, pathname), + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); +CREATE INDEX crawl_reactions_topic ON crawl_reactions (topic); + +PRAGMA user_version = 25; +` diff --git a/dbs/schemas/profile-data.v26.sql.js b/dbs/schemas/profile-data.v26.sql.js new file mode 100644 index 
00000000..4a49cf39 --- /dev/null +++ b/dbs/schemas/profile-data.v26.sql.js @@ -0,0 +1,14 @@ +module.exports = ` + +-- fix an incorrect trigger definition +DROP TRIGGER crawl_site_descriptions_au; +CREATE TRIGGER crawl_site_descriptions_au AFTER UPDATE ON crawl_site_descriptions BEGIN + INSERT INTO crawl_site_descriptions_fts_index(crawl_site_descriptions_fts_index, rowid, title, description) VALUES('delete', old.rowid, old.title, old.description); + INSERT INTO crawl_site_descriptions_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); +END; + +-- rename 'graph' to 'follows' +ALTER TABLE crawl_graph RENAME TO crawl_follows; + +PRAGMA user_version = 26; +` diff --git a/dbs/schemas/profile-data.v27.sql.js b/dbs/schemas/profile-data.v27.sql.js new file mode 100644 index 00000000..fd957488 --- /dev/null +++ b/dbs/schemas/profile-data.v27.sql.js @@ -0,0 +1,33 @@ +module.exports = ` + +-- add crawled comments +CREATE TABLE crawl_comments ( + crawlSourceId INTEGER NOT NULL, + pathname TEXT NOT NULL, + crawledAt INTEGER, + + topic TEXT, + replyTo TEXT, + body TEXT, + createdAt INTEGER, + updatedAt INTEGER, + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); +CREATE INDEX crawl_comments_topic ON crawl_comments (topic); +CREATE VIRTUAL TABLE crawl_comments_fts_index USING fts5(body, content='crawl_comments'); + +-- triggers to keep crawl_comments_fts_index updated +CREATE TRIGGER crawl_comments_ai AFTER INSERT ON crawl_comments BEGIN + INSERT INTO crawl_comments_fts_index(rowid, body) VALUES (new.rowid, new.body); +END; +CREATE TRIGGER crawl_comments_ad AFTER DELETE ON crawl_comments BEGIN + INSERT INTO crawl_comments_fts_index(crawl_comments_fts_index, rowid, body) VALUES('delete', old.rowid, old.body); +END; +CREATE TRIGGER crawl_comments_au AFTER UPDATE ON crawl_comments BEGIN + INSERT INTO crawl_comments_fts_index(crawl_comments_fts_index, rowid, body) VALUES('delete', old.rowid, old.body); + INSERT INTO crawl_comments_fts_index(rowid, body) VALUES (new.rowid, new.body); +END; + +PRAGMA user_version = 27; +` diff --git a/dbs/schemas/profile-data.v28.sql.js b/dbs/schemas/profile-data.v28.sql.js new file mode 100644 index 00000000..0bc4e110 --- /dev/null +++ b/dbs/schemas/profile-data.v28.sql.js @@ -0,0 +1,60 @@ +module.exports = ` + +-- we're replacing the bookmark 'tags' field with a new normalized tags table +-- this requires replacing the entire bookmarks table because we need to add an id pkey + + +-- remove the old bookmarks tabes +DROP TRIGGER crawl_bookmarks_ai; +DROP TRIGGER crawl_bookmarks_ad; +DROP TRIGGER crawl_bookmarks_au; +DROP TABLE crawl_bookmarks_fts_index; +DROP TABLE crawl_bookmarks; + + +-- add crawled tags +CREATE TABLE crawl_tags ( + id INTEGER PRIMARY KEY, + tag TEXT UNIQUE +); + +-- add crawled bookmarks +CREATE TABLE crawl_bookmarks ( + id INTEGER PRIMARY KEY, + crawlSourceId INTEGER NOT NULL, + pathname TEXT NOT NULL, + crawledAt INTEGER, + + href TEXT, + title TEXT, + description TEXT, + createdAt INTEGER, + updatedAt INTEGER, + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); +CREATE VIRTUAL TABLE crawl_bookmarks_fts_index USING fts5(title, description, content='crawl_bookmarks'); + +-- triggers to keep crawl_bookmarks_fts_index updated +CREATE TRIGGER crawl_bookmarks_ai AFTER INSERT ON crawl_bookmarks BEGIN + INSERT INTO crawl_bookmarks_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); +END; +CREATE TRIGGER crawl_bookmarks_ad AFTER 
DELETE ON crawl_bookmarks BEGIN + INSERT INTO crawl_bookmarks_fts_index(crawl_bookmarks_fts_index, rowid, title, description) VALUES('delete', old.rowid, old.title, old.description); +END; +CREATE TRIGGER crawl_bookmarks_au AFTER UPDATE ON crawl_bookmarks BEGIN + INSERT INTO crawl_bookmarks_fts_index(crawl_bookmarks_fts_index, rowid, title, description) VALUES('delete', old.rowid, old.title, old.description); + INSERT INTO crawl_bookmarks_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); +END; + +-- add bookmark <-> tag join table +CREATE TABLE crawl_bookmarks_tags ( + crawlBookmarkId INTEGER, + crawlTagId INTEGER, + + FOREIGN KEY (crawlBookmarkId) REFERENCES crawl_bookmarks (id) ON DELETE CASCADE, + FOREIGN KEY (crawlTagId) REFERENCES crawl_tags (id) ON DELETE CASCADE +); + +PRAGMA user_version = 28; +` diff --git a/dbs/schemas/profile-data.v29.sql.js b/dbs/schemas/profile-data.v29.sql.js new file mode 100644 index 00000000..9c592154 --- /dev/null +++ b/dbs/schemas/profile-data.v29.sql.js @@ -0,0 +1,20 @@ +module.exports = ` + +-- add crawled votes +CREATE TABLE crawl_votes ( + crawlSourceId INTEGER NOT NULL, + pathname TEXT NOT NULL, + crawledAt INTEGER, + + topic TEXT NOT NULL, + vote INTEGER NOT NULL, + createdAt INTEGER, + updatedAt INTEGER, + + PRIMARY KEY (crawlSourceId, pathname), + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); +CREATE INDEX crawl_votes_topic ON crawl_votes (topic); + +PRAGMA user_version = 29; +` \ No newline at end of file diff --git a/dbs/schemas/profile-data.v30.sql.js b/dbs/schemas/profile-data.v30.sql.js new file mode 100644 index 00000000..14c8f107 --- /dev/null +++ b/dbs/schemas/profile-data.v30.sql.js @@ -0,0 +1,30 @@ +module.exports = ` + +-- add crawled discussions +CREATE TABLE crawl_discussions ( + id INTEGER PRIMARY KEY, + crawlSourceId INTEGER NOT NULL, + pathname TEXT NOT NULL, + crawledAt INTEGER, + + title TEXT NOT NULL, + body TEXT, + href TEXT, + createdAt INTEGER, + updatedAt INTEGER, + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); +CREATE INDEX crawl_discussions_url ON crawl_discussions (crawlSourceId, pathname); + +-- add discussion <-> tag join table +CREATE TABLE crawl_discussions_tags ( + crawlDiscussionId INTEGER, + crawlTagId INTEGER, + + FOREIGN KEY (crawlDiscussionId) REFERENCES crawl_discussions (id) ON DELETE CASCADE, + FOREIGN KEY (crawlTagId) REFERENCES crawl_tags (id) ON DELETE CASCADE +); + +PRAGMA user_version = 30; +` \ No newline at end of file diff --git a/dbs/schemas/profile-data.v31.sql.js b/dbs/schemas/profile-data.v31.sql.js new file mode 100644 index 00000000..f7eaacaa --- /dev/null +++ b/dbs/schemas/profile-data.v31.sql.js @@ -0,0 +1,46 @@ +module.exports = ` + +-- add crawled media +CREATE TABLE crawl_media ( + id INTEGER PRIMARY KEY, + crawlSourceId INTEGER NOT NULL, + pathname TEXT NOT NULL, + crawledAt INTEGER, + + subtype TEXT NOT NULL, + href TEXT NOT NULL, + title TEXT NOT NULL, + description TEXT, + createdAt INTEGER, + updatedAt INTEGER, + + FOREIGN KEY (crawlSourceId) REFERENCES crawl_sources (id) ON DELETE CASCADE +); +CREATE INDEX crawl_media_url ON crawl_media (crawlSourceId, pathname); +CREATE INDEX crawl_media_subtype ON crawl_media (subtype); +CREATE INDEX crawl_media_href ON crawl_media (href); +CREATE VIRTUAL TABLE crawl_media_fts_index USING fts5(title, description, content='crawl_media'); + +-- triggers to keep crawl_media_fts_index updated +CREATE TRIGGER crawl_media_ai AFTER INSERT ON 
crawl_media BEGIN + INSERT INTO crawl_media_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); +END; +CREATE TRIGGER crawl_media_ad AFTER DELETE ON crawl_media BEGIN + INSERT INTO crawl_media_fts_index(crawl_media_fts_index, rowid, title, description) VALUES('delete', old.rowid, old.title, old.description); +END; +CREATE TRIGGER crawl_media_au AFTER UPDATE ON crawl_media BEGIN + INSERT INTO crawl_media_fts_index(crawl_media_fts_index, rowid, title, description) VALUES('delete', old.rowid, old.title, old.description); + INSERT INTO crawl_media_fts_index(rowid, title, description) VALUES (new.rowid, new.title, new.description); +END; + +-- add crawled media tags +CREATE TABLE crawl_media_tags ( + crawlMediaId INTEGER, + crawlTagId INTEGER, + + FOREIGN KEY (crawlMediaId) REFERENCES crawl_media (id) ON DELETE CASCADE, + FOREIGN KEY (crawlTagId) REFERENCES crawl_tags (id) ON DELETE CASCADE +); + +PRAGMA user_version = 31; +` \ No newline at end of file diff --git a/dbs/schemas/profile-data.v32.sql.js b/dbs/schemas/profile-data.v32.sql.js new file mode 100644 index 00000000..022ddd39 --- /dev/null +++ b/dbs/schemas/profile-data.v32.sql.js @@ -0,0 +1,19 @@ +module.exports = ` + +CREATE VIRTUAL TABLE crawl_discussions_fts_index USING fts5(title, body, content='crawl_discussions'); + +-- triggers to keep crawl_discussions_fts_index updated +CREATE TRIGGER crawl_discussions_ai AFTER INSERT ON crawl_discussions BEGIN + INSERT INTO crawl_discussions_fts_index(rowid, title, body) VALUES (new.rowid, new.title, new.body); +END; +CREATE TRIGGER crawl_discussions_ad AFTER DELETE ON crawl_discussions BEGIN + INSERT INTO crawl_discussions_fts_index(crawl_discussions_fts_index, rowid, title, body) VALUES('delete', old.rowid, old.title, old.body); +END; +CREATE TRIGGER crawl_discussions_au AFTER UPDATE ON crawl_discussions BEGIN + INSERT INTO crawl_discussions_fts_index(crawl_discussions_fts_index, rowid, title, body) VALUES('delete', old.rowid, old.title, old.body); + INSERT INTO crawl_discussions_fts_index(rowid, title, body) VALUES (new.rowid, new.title, new.body); +END; + +PRAGMA user_version = 32; + +` \ No newline at end of file diff --git a/dbs/schemas/profile-data.v33.sql.js b/dbs/schemas/profile-data.v33.sql.js new file mode 100644 index 00000000..756ed185 --- /dev/null +++ b/dbs/schemas/profile-data.v33.sql.js @@ -0,0 +1,9 @@ +module.exports = ` + +-- add label +ALTER TABLE users ADD COLUMN label TEXT; +-- add isTemporary +ALTER TABLE users ADD COLUMN isTemporary INTEGER DEFAULT 0; + +PRAGMA user_version = 33; +` \ No newline at end of file diff --git a/dbs/schemas/profile-data.v34.sql.js b/dbs/schemas/profile-data.v34.sql.js new file mode 100644 index 00000000..a3a6741a --- /dev/null +++ b/dbs/schemas/profile-data.v34.sql.js @@ -0,0 +1,15 @@ +module.exports = ` + +-- list of the users installed apps +CREATE TABLE installed_applications ( + id INTEGER PRIMARY KEY NOT NULL, + userId INTEGER NOT NULL, + enabled INTEGER DEFAULT 1, + url TEXT, + createdAt INTEGER, + + FOREIGN KEY (userId) REFERENCES users (id) ON DELETE CASCADE +); + +PRAGMA user_version = 34; +` \ No newline at end of file diff --git a/dbs/schemas/profile-data.v35.sql.js b/dbs/schemas/profile-data.v35.sql.js new file mode 100644 index 00000000..ef5270a2 --- /dev/null +++ b/dbs/schemas/profile-data.v35.sql.js @@ -0,0 +1,17 @@ +module.exports = ` + +CREATE TABLE dat_dns ( + id INTEGER PRIMARY KEY, + name TEXT, + key TEXT, + isCurrent INTEGER, + lastConfirmedAt INTEGER, + firstConfirmedAt INTEGER +); 
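+-- example of the row shape this table is meant to hold (the name, key, and timestamps
+-- below are illustrative values only; dbs/dat-dns.js writes rows like this via knex,
+-- using Date.now() millisecond timestamps):
+--   INSERT INTO dat_dns (name, key, isCurrent, lastConfirmedAt, firstConfirmedAt)
+--   VALUES ('beakerbrowser.com', '87ed2e3b…', 1, 1556668800000, 1556668800000);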
+CREATE INDEX dat_dns_name ON dat_dns (name); +CREATE INDEX dat_dns_key ON dat_dns (key); + +ALTER TABLE crawl_sources ADD COLUMN datDnsId INTEGER; + +PRAGMA user_version = 35; +` \ No newline at end of file diff --git a/dbs/schemas/profile-data.v36.sql.js b/dbs/schemas/profile-data.v36.sql.js new file mode 100644 index 00000000..c79e2063 --- /dev/null +++ b/dbs/schemas/profile-data.v36.sql.js @@ -0,0 +1,12 @@ +module.exports = ` +CREATE TABLE user_site_sessions ( + id INTEGER PRIMARY KEY NOT NULL, + userId INTEGER NOT NULL, + url TEXT, + permissionsJson TEXT, + createdAt INTEGER, + + FOREIGN KEY (userId) REFERENCES users (id) ON DELETE CASCADE +); +PRAGMA user_version = 36; +` \ No newline at end of file diff --git a/dbs/settings.js b/dbs/settings.js index e2fafe4b..0a9ce06d 100644 --- a/dbs/settings.js +++ b/dbs/settings.js @@ -16,6 +16,11 @@ var events = new EventEmitter() // exported methods // = +/** + * @param {Object} opts + * @param {string} opts.userDataPath + * @param {string} opts.homePath + */ exports.setup = function (opts) { // open database var dbPath = path.join(opts.userDataPath, 'Settings') @@ -38,10 +43,15 @@ exports.setup = function (opts) { exports.on = events.on.bind(events) exports.once = events.once.bind(events) +/** + * @param {string} key + * @param {string | number} value + * @returns {Promise} + */ exports.set = function (key, value) { events.emit('set', key, value) events.emit('set:' + key, value) - return setupPromise.then(v => cbPromise(cb => { + return setupPromise.then(() => cbPromise(cb => { db.run(` INSERT OR REPLACE INTO settings (key, value, ts) @@ -50,13 +60,17 @@ exports.set = function (key, value) { })) } +/** + * @param {string} key + * @returns {boolean | Promise} + */ exports.get = function (key) { // env variables if (key === 'no_welcome_tab') { - return (getEnvVar('BEAKER_NO_WELCOME_TAB') == 1) + return (Number(getEnvVar('BEAKER_NO_WELCOME_TAB')) === 1) } // stored values - return setupPromise.then(v => cbPromise(cb => { + return setupPromise.then(() => cbPromise(cb => { db.get(`SELECT value FROM settings WHERE key = ?`, [key], (err, row) => { if (row) { row = row.value } if (typeof row === 'undefined') { row = defaultSettings[key] } @@ -65,6 +79,9 @@ exports.get = function (key) { })) } +/** + * @returns {Promise} + */ exports.getAll = function () { return setupPromise.then(v => cbPromise(cb => { db.all(`SELECT key, value FROM settings`, (err, rows) => { @@ -73,7 +90,7 @@ exports.getAll = function () { var obj = {} rows.forEach(row => { obj[row.key] = row.value }) obj = Object.assign({}, defaultSettings, obj) - obj.no_welcome_tab = (getEnvVar('BEAKER_NO_WELCOME_TAB') == 1) + obj.no_welcome_tab = (Number(getEnvVar('BEAKER_NO_WELCOME_TAB')) === 1) cb(null, obj) }) })) diff --git a/dbs/sitedata.js b/dbs/sitedata.js index 9fcfe436..becfb0e4 100644 --- a/dbs/sitedata.js +++ b/dbs/sitedata.js @@ -1,13 +1,13 @@ const sqlite3 = require('sqlite3') const path = require('path') -const url = require('url') +const parseDatUrl = require('parse-dat-url') const { cbPromise } = require('../lib/functions') const { setupSqliteDB } = require('../lib/db') const datDns = require('../dat/dns') -const datLibrary = require('../dat/library') // globals // = + var db var migrations var setupPromise @@ -15,6 +15,10 @@ var setupPromise // exported methods // = +/** + * @param {Object} opts + * @param {string} opts.userDataPath + */ exports.setup = function (opts) { // open database var dbPath = path.join(opts.userDataPath, 'SiteData') @@ -22,6 +26,14 @@ exports.setup = 
function (opts) { setupPromise = setupSqliteDB(db, {migrations}, '[SITEDATA]') } +/** + * @param {string} url + * @param {string} key + * @param {number | string} value + * @param {Object} [opts] + * @param {boolean} [opts.dontExtractOrigin] + * @returns {Promise} + */ const set = exports.set = async function (url, key, value, opts) { await setupPromise var origin = opts && opts.dontExtractOrigin ? url : await extractOrigin(url) @@ -35,6 +47,11 @@ const set = exports.set = async function (url, key, value, opts) { }) } +/** + * @param {string} url + * @param {string} key + * @returns {Promise} + */ const clear = exports.clear = async function (url, key) { await setupPromise var origin = await extractOrigin(url) @@ -46,6 +63,13 @@ const clear = exports.clear = async function (url, key) { }) } +/** + * @param {string} url + * @param {string} key + * @param {Object} [opts] + * @param {boolean} [opts.dontExtractOrigin] + * @returns {Promise} + */ const get = exports.get = async function (url, key, opts) { await setupPromise var origin = opts && opts.dontExtractOrigin ? url : await extractOrigin(url) @@ -58,6 +82,19 @@ const get = exports.get = async function (url, key, opts) { }) } +/** + * @param {string} url + * @param {string} key + * @returns {Promise} + */ +const getPermission = exports.getPermission = function (url, key) { + return get(url, 'perm:' + key) +} + +/** + * @param {string} url + * @returns {Promise} + */ const getPermissions = exports.getPermissions = async function (url) { await setupPromise var origin = await extractOrigin(url) @@ -75,6 +112,10 @@ const getPermissions = exports.getPermissions = async function (url) { }) } +/** + * @param {string} url + * @returns {Promise>} + */ exports.getNetworkPermissions = async function (url) { await setupPromise var origin = await extractOrigin(url) @@ -84,7 +125,7 @@ exports.getNetworkPermissions = async function (url) { if (err) return cb(err) // convert to array - var origins = [] + var origins = /** @type string[] */([]) if (rows) { rows.forEach(row => { if (row.value) origins.push(row.key.split(':').pop()) @@ -95,65 +136,30 @@ exports.getNetworkPermissions = async function (url) { }) } -const getAppPermissions = exports.getAppPermissions = async function (url) { - await setupPromise - var origin = await extractOrigin(url) - if (!origin) return null - return cbPromise(cb => { - db.all(`SELECT key, value FROM sitedata WHERE origin = ? AND key LIKE 'perm:app:%'`, [origin], (err, rows) => { - if (err) return cb(err) - - // convert to app perms object - var appPerms = {} - if (rows) { - rows.forEach(row => { - let [api, perm] = row.key.split(':').slice(2) - if (!appPerms[api]) appPerms[api] = [] - appPerms[api].push(perm) - }) - } - cb(null, appPerms) - }) - }) -} - -const getPermission = exports.getPermission = function (url, key) { - return get(url, 'perm:' + key) -} - +/** + * @param {string} url + * @param {string} key + * @param {string | number} value + * @returns {Promise} + */ const setPermission = exports.setPermission = function (url, key, value) { value = value ? 1 : 0 return set(url, 'perm:' + key, value) } -const setAppPermissions = exports.setAppPermissions = async function (url, appPerms) { - await setupPromise - var origin = await extractOrigin(url) - if (!origin) return null - appPerms = appPerms || {} - - // clear all existing app perms - await cbPromise(cb => { - db.run(` - DELETE FROM sitedata WHERE origin = ? 
AND key LIKE 'perm:app:%' - `, [origin], cb) - }) - - // set perms given - for (let api in appPerms) { - if (!Array.isArray(appPerms[api])) { - continue - } - for (let perm of appPerms[api]) { - await set(url, `perm:app:${api}:${perm}`, 1) - } - } -} - +/** + * @param {string} url + * @param {string} key + * @returns {Promise} + */ const clearPermission = exports.clearPermission = function (url, key) { return clear(url, 'perm:' + key) } +/** + * @param {string} key + * @returns {Promise} + */ const clearPermissionAllOrigins = exports.clearPermissionAllOrigins = async function (key) { await setupPromise key = 'perm:' + key @@ -164,34 +170,12 @@ const clearPermissionAllOrigins = exports.clearPermissionAllOrigins = async func }) } -exports.query = async function (values) { - await setupPromise - - // massage query - if ('origin' in values) { - values.origin = await extractOrigin(values.origin) - } - - return cbPromise(cb => { - // run query - const keys = Object.keys(values) - const where = keys.map(k => `${k} = ?`).join(' AND ') - values = keys.map(k => values[k]) - db.all(`SELECT * FROM sitedata WHERE ${where}`, values, (err, res) => { - if (err) return cb(err) - cb(null, res && res.value) - }) - }) -} - exports.WEBAPI = { get, set, getPermissions, getPermission, - getAppPermissions, setPermission, - setAppPermissions, clearPermission, clearPermissionAllOrigins } @@ -199,12 +183,13 @@ exports.WEBAPI = { // internal methods // = +/** + * @param {string} originURL + * @returns {Promise} + */ async function extractOrigin (originURL) { - var urlp = url.parse(originURL) + var urlp = parseDatUrl(originURL) if (!urlp || !urlp.host || !urlp.protocol) return - if (urlp.protocol === 'dat:') { - urlp.host = await datDns.resolveName(urlp.host) - } return (urlp.protocol + urlp.host) } @@ -221,7 +206,7 @@ migrations = [ CREATE UNIQUE INDEX sitedata_origin_key ON sitedata (origin, key); INSERT OR REPLACE INTO "sitedata" 
VALUES('https:duckduckgo.com','favicon','data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAYAAACqaXHeAAAQ3klEQVR4Xr1bC3BU13n+zr2rlZaHIskRFGzwLtJKQjwsBfMIxkHiEQkBZkWhCS0v0ThpQlqkjt1xGnssOVN7OvUE0TymcZIiXKeljSdakHnIULN262ZIGyNjENKupF3eIAxaSQhJu3vP6Zx79+7efWnvIpEzo9HOPf/5z/9/5///85//nkvwB2hXt+SXiRQlFORJgaFEnZIRlPHfhMGhPqMEbQLYZUlA26x3u0LPH5WY5FEwvmkrMjMhYAMjm1QlH3YeGRzCjhBqsM+wd3gelk+icRMKwDWbdbckSvtEJoZWeSIFlojUJkrigSfsrqaJ4jtuANw2c5ZBMNYSxvYByEogmIMR8iGhzMPAPAEE2ix2j1dLK/OBoYSAmJlAzISxlYDiInGalxFyIEB9jdF8UgVmXADwFQehrwKCOWpiLwi1C1Q8MtPutKt9qpKy3wsoYRBkwAiol1G08d/R4NywFdioIG0CE2yxAFMPmNAwHot4KADctiKzSKSDJGqFCBSB/PDb+cpwujQhYGPASsIYVzgaqLgLxvkwQtoI8KGfGuwWe4eHg5eGNBsHPJoPAxwSE2s43SO3gCu2Ahsh7KB2NbjAlAkNs4O+ecVm3c2ItE/AxMQCCqmNMPGAlr8QC4SXMVIzW2NxesBIyQKu2grqAfZqBGOBNHBf5M8MMNYCY8YCPTKNReMFyIEAgvMJxlrQKHlAGmbZnfV6J9INwBVb3kFA2B3awyG1iRBrnrC72rhVANL+OLFArxwp0lEPINbx1b5ms5ZI4O6otTbaNNveXaOHqS4AopWnYHaGgDwBgeGgAMID1B+8jS2HPhCSAhCtPKAw5shT4IwaCySjCZMKFiJj/pIQEIHe6+B/oxfOPkpwvAJQrlhipJWqso41+ZgAXLZZ9xOgNsxAUZ4HOQA8EIZaX8ESsK9shuXZNcjMzIyZc/TC7zB05jd4cPY02NDAowCkhgfJWFdF45N2V12iCRMCIEdyplWSNj15RFE+8rnCmltAVsWfgK3cjJz8uQkVpEMD6D/8Iwy2HJpwEAiBDMLlTZGWoD6PN2FcAPj+LQSkcxDVzG5s5Tnjwe+8iRlPrwjNYTKZwP8SNZ/7Enpf3gEOyES2uCBI8FKDWBovT4gLwJWN1jNMCKahTGqjAi0H0swCw7lEwnIXMN6/F+oempEv/55S+gz+aNEKZM14PGYojw+36jZNOAiUoBTwewQqnAFRdgdC4Zjd4iqPFiIGALfNuptoTJ8FmZFAmjtsEcnXbMqqzTAtXSMHRWFybEzQcuDWMPTBb3D/g+aJAUOClxn8Fr5oRLNojKDGEnWQigCAp5vEbwwpyoAGy1FnvWej9QwISXQwiUAjbdFKTPuLV2GYFrviyWDj7nD7+zvgc3ckI03ez5jD3OIqdz9XUE8AJXnjwKT5LNoDVCQAEcSSx3ys2+LeaN1NCImI+Akj6vYXMXvrN5ILNwaFAsJOcKsYb2OM1VhaXE2e9XluiKJ8DlEXVeUdAoCvvuQ3ukU18DFUQ/Q5Ip6NIdGDyp0o/vb3xyuzPJ7Hhhu1tnG7gyTBK6b5LJCMZSBolo0g+Ey1gjAAGwtrQdh+TkSBtryjzlJuPlDNZyzlZ+bjsfp/xvTp0ycEAM5koOUQ7v3i9YngJ7tx93MF5wQEy3GM1FlaOuXzSwiArvV5bjFoJmCsBqLfrnf1b63/FpZ986/HLeyhdy/gkvNzCGCo+fpTML2xRbaG8bSwFfCjtOLKkiR58o91W0IAuKusJUwk8hbHB1iPO7PdGwtrGRSLGKtJ6SbcfeFnWLp0qUx2+foAfnn4PC5f8SJzchqm507Gy3Xh/CARr08u9mLwvg85menw9g/D2XMPX5vuxp0DLyUTIWk/gbLirqqCPtXFicRKLcddvOYAdG/kKS+RU14G1pjX4qrjJkM0FdxEswxaFsD03TdgtVplkgMHP4H1ySxUrZqTVDA9BFe/sWrcVsCI4tLx9FQA0CjLkZGBCFpEMiE/f7oSWdu+GwLgv//vBlY8PTPZMN39d3/xOgaOjj9tjtZLBYXw6E8lY59q/gXHndndPCDqMH8+hgOQrQFAt2YJCAeHRnHkg3YUWnKxeP4T8nZ4bd9EnLZJXV5LZ6NT4waC6MsmXRsLbATKFgFQe15LV3UPT4WhL/HhAEzZ8i0UFxfHqMQVudE7gE2rijFz2tjZoDp4a9076HDfwYHvPQcOBh/r2bZ43FsiAXPMaXGVd2/MbwbkAivPCapJ94aIra4h7z1nffeGAqZ3JT9fXAlW8aehIKiO+/tfOvBOyzmsWpqHxfNnyf/1gLDApsTd7RtLkTk5A9/++jLc/NsdGP7sd3pFSkiX954zRl/SVZXfTAQFEQqh3GCAhwaoW+9sHIChZ20oL488Zyz/s5/KK8jNmCteZJkmK5WsqRYgB9TvPScD1/dvP0bfv/4o2dCk/YJBsAQCMAugvJADRqmddG2wnkHQ3CllpUQQs0iQIClHAA9m5uPqpr2oqqqKIN/3xlF8cLY79OzX+7ejyJKrhyX+98I1TJ2cHqLnANxLAIAhOwBDthTiO9KTnnAOBqGcUckrCMqWDzAH4QkQiJIn53MTWV9Yy4IZoR5paboJrj2vY9myZcjJyQkN4av/8j+2Bv14nuzLD9uiAeBKTyoeRoZlVP6vbXREwK2fT4PvZlrMdISRurxjnY1dqoszyUO61of9Pf+Yk7g2FNQTFjw96ZTY9eevo6h0EcxmXe89dHINk3nffQ2jn70lK5wxZxQcgLEaByCeJTCCBut7znqtzsSlAcAaBAApAnDNthc5S8qwcOHClJULDZD6wUbOA0MfhR6x4fPKM99lXXwDfQbceTcnrvIygyAAWp0nBIC7iyvhX70VK1YkT3ljNJH6QW++CNb3ji4l4xHx1e7/eCoetCcuwT1SAO5bFuDGuj0xgTCZRmygBfTaNwGpPxlpTD9f7aF2EwY+ngL+W1eLZwFOHgShBMEC7gI8COrMAtVJ/VNz4NnxSkwgHEsoNvRfoD0VuuRWiXiAG7pokleaK59q44ci67HORmfI7SUPca4Ll7skKlgMBpgZVfbJVJpn5yswf2lp6EyQbCztqQTT+LtK33f6C3IEN87wR7DgZj48xhaXbD45BAhCeSAAjygE8xzGHMRVld/MgqkhJ+D7JKDuk3rYKjQ3q/Zg8rI1WLRoka5B9PLXwF1A2wZ/Pxl3fh3eSnUxSolIyXPUBSagdtJZFU6FGSF1Rcc6Gzur9KfC6vz3llRi+CuxGWEi+XjQk/1f0/gK33hrWkoqceL0mX6M3ojd96MZFR53ko71hfw2i1rnaCCd6wpsar2MH4YKj3dVc7dgOqvA6iTDj+fjum0v1q5di7S05MLwcRyA6OjPQeBukMjcubLGGT5Z6Yw5I0gPugqPCbfe/mJC8AhjjoITrvLOqvBhCAzV5FJlkVkgau4v
eQpPdFtkq0gxF+AZYc/zsRlhsuUc/u12GEQ7BBONIdWCIGRQWelErfc/csBdKDECaCg87qzvXBcO+pQJFrkg0qF5KBClIEJZ6nHg8q5XYEkhEPJ5rv9gL+7/9jQmzxvGlHnDmFw8HBeMRIpxkO6dSmwx6rhYvSRP0YluFQDrfgRLYgBrLDrhqtOCkmwV1f7eNdtgfGZdzNF4rPFXX9qBB+cjj7qmOaMw5Y3CFDTxaOvgSg/3ZMhboh7fBxRlO9bF6ilbwKXKojJC1K1PIb5UlfqZoP+plehftVWOA3rbrR++hP7TwXqM3kEp0vEzwNzjznrtojImlM892eEIlcW1nRSoNqX7HKOjSqlMb+OB8Eb1Xrk2MNabYS2/z3/1Y9z91fjP+mPJmJ7uyx4eNZYJocqXssh8TAiAS5Wa7RDMUXzSVX6xsuCgAITuBekBoucv98u5gN6XJNz/r722Vw/rh6KhQNO8k86a9krrGRIu8zXMPalcpAoBcM5mzjING92MqHcChPKRjJG2yGfJZbi27QXMfPqZuDXCeKP5u0DnlsXJGT8EBWHwDpt8loyRjBIEizzqs9LgTdWIl6MdFdb9jATfDzC0Fbc6S9vXRSQOScW4+6wNhtWbUwqE7r02jPaM/2VotHA8sSs+0dnYXlFwjhDltRhhrLGoNXxlJgIAbgXpw0Y3VCvQMECQQTIE7s9dgjtrtqV0MrxnP4TbP5uQ94Bh8TQLCDXzY/COmnwWdfUjXEAdebGyqJ6w0OVDLyFEeVHCGK+jJboMHZrYlzsT17e9INcG4l2Wigeg//Z1dO1aHdM1miHgky9NxcX5UzDvwn18+X90H5vjys0IaZh3siPiEmXcKzLtFYVcWfXKe1txa2dp+1d5ykx07Vfuv/qhHAPUEtlPPm3AcOA+SnKXo3TacjyWEfsW+fLf7IzIB67OykDruscwkKmc9XN7fdjx9s1kBqj0M1Zd/L7THk+PaAZxAbi41loCQdDcB2JN81qdNRcri3aDyfeEx2y3/ngvvrh0ZahE9vzpyHM/B2LN7GoUZi/Eg8B9OPvOo+PjwzIAmQMB9Oam4dyi2BcpdW/qKI0RUjPvZEfTxYqCgwAJ72CUls475ZJvpGtbwmty7RW8MILw22GVsQ4QvEsrQNeGS2TRAKgCZItzMEp6ZRD0tKQAJJCRAHXFrcp9AN0AcMIYFHWC8GDOfPRuCJfIEgFw8y7DjMeSXlYNybz97ZuyK8RtCWVTrDcRwElnv7CWV4yE0AUpwkjNvFMdTTwmUMgXDmICYyAzB9drXg6VyCYKgC2Hb+OJqyPRungFsBru8xfXFu1mylX+YCygjvmnYq/G6XIBlehcmTlLMKafEcNBkW+mTfNbnTX8KO2jrDmiLziQA2BZpJTIXjv7HVwdDL8l4iQjPqBvMDUL2PPWdWT2h98JSECbUSDVc092eC5wn2dhn+d91DdaXuqI/DQnJRfQgpCWltastQTt5J99taieRH0ncGfDHkxdvlpOi9/8/Yvo7DsfMbfPD9y+R/F4rgBBULooBUqmLUfvlU9wa1LMSqP2H0JB0MtADix4v6M+7iIw6vD7/dXJlOdzJnUBrdQcZaZBGYCXEDTMb+1s5JaSlm7cr/b3L6uAr0wpkcUDYJJhCnxDuXDdVSwjLycPz8x6Bjuf2gHvqWa0//wVdOdPQo/VhGuzMuSVr3nrOghhTf5RXx1X7gIP1ErhJuSGvJ9bp56gmjIAfIA8KdXsDvJ7duogTGxYcLrDwYEwGDNqh2cV7bqz+XkzPxqfvPrvONrzL7JMXHG+Ba6ebZN/J2rOnavhu6VckCI04GFi2qGAb6SRK/7ZmqIyRqRXCcKxSaYTUMcXQ6/yDwUAH8TzhAAVDgpC+CtQ/pwDAUIOLHzfaW9ubs4yGo22FStW7PMbh0sOd/6TnASV5H55TMVV4fs/avVe+bt9TSKjh9T9+zxPxhjbF604pWgzCLQm3j6fDIyUXCCaGfd9Fu97QRrwQDTYufDF7zv5SxddGaSWP2PMIQhCOQdbIsIuSAEbBEPM53mEKLEgmaKJ+scFAGfKTV4UjfshaLIu7WwcDIiOzLW2LNOMWZ9mr9v6hbTc6XJSz5SPI0ONDfZlDX561jvq6TH3f3TMM+J2muMordBT1iRJSix4WOUf2gXiTahslxm1RPLvSij0eCSVlVZiAQ3GgvGym1AAtMKcX82TJGwikMpAYsw2NblZwMMgOgTgyML/DH+FmhqTxNTjdoFkgpyrLDKLEsoYg5lAkk2eQeAnzegM0ktA5cMKg/ghIfBIIhylJ1P/GjSZTNr+/wca6dPApxwOmgAAAABJRU5ErkJggg=='); --- beakerbrowser.com - INSERT OR REPLACE INTO "sitedata" 
VALUES('dat:87ed2e3b160f261a032af03921a3bd09227d0a4cde73466c17114816cae43336','favicon','data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAYAAACqaXHeAAAPQUlEQVR4XuWb+3OU13nHP+e97UXoCghZEkI3JIxBNleBkEF1bCfxpOmk00k6burJJNj1OPZMJtPmp/4BqZufWqfTum1aTzqJ3YzHTe/jjB1jxN3mjgEhgQQCBEgghLS7et/d9+08593l4kpoJe14QnwYRovYffec73me7/k+l6O4e7zQVwpFdZBZDUYHPo8A1QQsxGABELnn/b+5/5jEZxzFCHAJgxPg7wbzGEyc5/Wmm7mpK/3ilTMRvOJSfL8WZbTi+yuBVUA9gVoIqgSCOAr7N3fNd80swAOVgGAMFQgI/cBxAuMTMv5pIsYg9q2b/PXyyRCAF4cqyag2gmA9KtgMNENQSkARSjkE2ChMwHggAACfgAwKjyBwUUwQGKNAL8rYA8HHmMFR/rbqqkLM3os1YVlPEfidEKxF8VB2oSFAD/oICEAFwGXgEMroJp3+FXayT/HC0Gp8be7fAMQCKoDobx8AsiKVBG4AHwFvYXBc8cLFZwnUwwTG14CHIVDIn/kOA0xTEbHBMsMHKsXtB+de3/M7+WIFsmFBAG4aEikfLz3fyWQ/f8cSTqL8d1DBScULl18jCGqADQSqWr91vgAIU9iKeNxgUYnBgqjCUmAaYCiFIa/lp4Ck5HchueT+T+bp+XDjls/AlTQ3x31BZf5DAyDrCy4BB1DqomL75V+jgjIC6lGqbP7fAvGYYnGFRfVCk/rFJuVFIQB64cZdAOQWf/un0oAEQQjA+ZEM+/pcBq5mSKV8MplCzA75glEU/QRqVAA4DcRQLATihfiKZYtNtq6Msq7eZkWlweIi47b53zb90NJC17j9Omd6AX4An1zJ8PaxSQ6c87gynGYiUQgz0CtMEGiNkFR85/IwSlmFPOdX1lh8vT1OV6vDykqDsqgi6cFkJiDjoxcnSxE/l5F7bRsQsxURCxwTjl3J8MahFB+cdukfTDN6yy/E/sgXhjohCNKK5y+nCDAKec6vrrV4tiNOV4vD8kUGtqEYHPMZSfhMeEJu4Q5/+m+xo6gpNahcoCiPKs6MZPiXw5Ps6HHpPe9xfaxAANzRCb4AUDC7ym1P21KbPxYAWh0aKwwmvIC9g2n6hjOMJXxS7h0AxAoECLGMymKDthqLlsUmdWUGF0Yz/OzoJDtOu/Sc9xi5WTAAblvSZwLAwFiGfz4yyf5+j/HrGSZToQ/kkBcQ5HX9IpOulRHaG2zaqkyujfv8/KjLBz0uPQPugwvAyZE0P9ob+vLEcBpvGjJrrrJ4Zk2Uba0OG2stbiSzAIgFPHAAbAk5QFxAAPjLPSm9k4kZAPjymqh2nRwAb4oFnHY5/UACkOWAk8OzA2Bbi0O7WEDKRwAQDjj1eQHgS4+FFiAAjOYA6HE51e8y/ECRoLjALC2gqcoiB8CmLABvZUnw8wVAi8OmpaEF/OuxkANO/lZYwOksCSanlh5NSyy+mHUBAeBmFgARQp8IAKMPkA54rjOuj7PGcgMhwVd33zkF0vcB4GkBoMVhc53FmABwPCRBAeDagwLAY3U233o8BKC+zOD4tTR/sTulJW1yOM10ADQusXjq0RCAjjqLCdfnnU9CFzh61uXKjQfEAtYus3m+K9QB1SUGR66m+aFYwGmXyZE0mWksoKHS4sm2EIDOeovJdMB/97gauP2nXS6OFCoevhNTFVQKKwMsW7Gx0ealbXEeb3J0JHjgUpof7kzyYY+LdzODPzk1B0j+YGNrRJ8ez7TYxCzF/sG0FlD/czhF71C6MImRu2LKggJgRxTxMoPO5Q7f64jRvtTWgU73OY9X35+gu9fFlzhgmo0sLjaoqbE0ANvXRmmpMLl8y+eDMy6v70zycb8HmbuCiAIExwUFoKzUoKHO1gt4ri1CY7nJpbFAL+Dvd05wUBYgbjxN/JkDcMtyh1c2x2ivtXUi5cB5jx/vTNJ9xuXWuI/rFi6ALSgAEsx8KavlRcn5Puw+n9Y+/O7RFGev3N+Ecy60rsHm21vEhWyWFBkM3MjwC5HEogjPu4wULi9AYQCQZKetEPLbnp34Q8V3Jv7hmXDi+UrZ5Q9ZPLM2GxVWWzpXIEAKF/wqDyBn4xkFAcCIKOwSgy3NDt/viNFRZ+sEqDbd7tB0x27lb7rlpQbNyxx9jH6zzaGhzOTiWMgF/7AzMaMrfeYAFJcY1NSGvr/9sYgmr6HxcMJ/lyOvdP7k5UQVC8pMDejLm6NsqA1LkiEXJNjV6xWMCwpiASJgnhQB0+qwpc7SOf69F0Lf/98jsz++pF5gOUpnlb+TdSnJLPdfz/DW0Ul9nPZc8ArCBfMDQFZqKdYss/n25hjbmh1qShWDoz5vi4TNavhrcwxjhQu+si4Edn21hZcJ6B4IueC9Y5Ocm4FU83GF+QHgKCg26Gh2+MGWGI8vs3QG+KMLHq91J9kpvj/mMzmN8JlpghVlBi31jgbg2VUOy0pNnV3WXNCd4NAMx+pMzw/LEfPICseLDSqrLU1WL62NsnKRydVxnx29Ln+zM8FH5zyQut4cJbxwQUm5qYXVS5uibKixtYTYP+Dp53f3eoyLLvDmrgvmBUB9pUlXWxRJYXXVZ3f/Ymii/3U4xZnL6TkvXnYnxwUbGkIu6GxyWBxXnBUuyNYLzgx6XJ9HwWRuAGjfBx31bQp9X/L4l8Z83jkR+v6xcy5XJXxV6OJodalJSTSs/k5XepZ9FOl8KxVw6WaG8Wz6vOUhi6+uj+kgaW21qQsrH/aHQL9/fJL+q3OPEeYGgAMsMOhocvizzhhbl9k4puLjwTSv7UoiwufmaIZJkaymoqXK4qttER5eYmYrxFN7pyxeiqInhzL857FJeiT4yQQsLDVY0eBoAP5wlcPSUpMLN30NwE92JTg0cH+JfT8umBMA8QUGi6otPaHvro/yyCKT4YmAD3pdrdkPnHPBC3AsRckCg42NDt9qj7J2qYWlS+TTAOCHAAiQb+xLsf9sqP1FIpdVhFzwYnuU9TVhkJXjAtEF4xNz44I5AVBXabJ1VRi3P9FgETUVBy+JSXr8+6EkPVnfl96AFXWOnvjvr3K0BeQaJaaCQFeIAjhxNawKi4KU8/7GuI/tKIQLtnfG2dJksyhm0DeS4c3Dk9rieufIBbMDQGZvQVudw3PtMbYtd2goN7hyK8MvJXPT43Ikl7lRIALp6SxJSoZHeCKfMTCamfK8b622+L31oS54rCpMmHx4LuSCX5+YGxfMDgBRpEUGm5sc/rQzztZ6m6gNB8X3dye0QhsdzZ77pmJtvc3znXG6lktmSFESya/zZiwVTHneLy41WNno6FPn66scaktMzo/67Dgzdy6YFQCxIoNFVWHCQnx/VaXJ9WTo+6/tTLD/rNS+Q98vXmAgcf13O+N0CFCW9Arls/9oxZdKw+5z9573hgUVFRadzTZ/sinK
2mpb50fucIGrucDz8vueWQuhpYtNtqwMTfDJRosiW3F4KKOPvX87mOTUxfDcX1hi0LLUZmuLwzcejbB6iaXP9OnI79PT9YOwkeLY0L3n/WjCx3EUG3Nc0GhTETPoHcnw80MpbYF9l9K6tyjfkZ8FZH1/9VKbP9oY+n5zhcG1iYD/OBWe+wd7XYauZ/Qh37DE4gurI2Fyc5nFsrI8t/5Ts5ZEyFTn/Ypqi69tCDeircoi6QV8cDbkgh0nUrqnKN+RHwDi+3HxfZvvS76/wSZuKw5dEt9Pah8cvS6NTIFu91pTH7K1AFVbYlAazc/3Pz3pm6lgyvO+ssxglXBBq8MfrHSoLjYZEC7ocfmnXQkOn8/qgjxQyAuAWJGiotLS5PNKe5S2SqncBuzo8/irnRPsE9+XgEeknKN0HP+DL8TpanaIWWBL69cchpsJGHfRwc+P3ptgT6+rY4tYVLFwocXjzTbPt8dYU22R9mFfv8ePP0ywu88lkcivvzAvAGoWmbSvCNn3y8sdzebHh8Ko7O2Pk5y8mEVcLD1uaN//8yfidDXYt3sD57B+TYYJD02yr743wW4BwAt7C0MucHi+M6YVqfQU9VxL89ODYb5gYCjbXzjDF+cFwHLp3FgbCp8NtRKXo31TSlbvH0+FWlyGJQ13ovxsXu6IsWmprclvbvuPPgnEDfYOuLyxN8kRkbx3RZdyzL6w7U4BRvSDFFNnU0vMCwAJRn53XRj1rau2SHgB754JSWfPqRSDw1nSkZWaIGXup1dFaa2ydMublS/9f2q3NAeMZTh9Oc3hvizJ5gheoTNGL3YV6Y2pWqDQAMyylpgXAMsk7BXp2xqGvUopdko0JhZwLMW5nAVkF6CPwVqbJWUmEXPuANxyAy6OZbgy6jNyPU0yW1KzLSiKGbQ3hmS7qd5mgaM4dS3Nz8QFzrj0Xy6gC+gsbbbg8c22CLWlJmevh9HYT7sTHL1wr/JwLCiOG0TssPVVQuC5DCE2OeJSXqCLIblW2bIFBvUPWTy+3OHZNRFWVFok0gH7znm83p1kj5BgsoAkGI0qHY0Jub28Ocaj1RbjbsDecx7/uDPJRwMenh/2/s1lzPZjtRUmG5sdnSB5otGiOKI4NZzRG/LmvhQnZEPyfGheLmCYIetK7u97vxOno9HBNuHUlQy/ODLJwYtpxiZ93Qr7/0Yw8/brVtk8kZOnSSPlV1ZGWF8bBlhSP/zlyTAYO5wTZHk/L99WWUVY+dkasm5NiaFbWHYNpOkd8RlzpwBAS4OZAZC5zgaAhjKDrkab+vJQYR6+mOYnB1Ls6nW5OpwmMU35/S5Mcldq/Fk1S+caGQUA6eMrjymGE74WK1O6QL6rmiUARY5icZHS4F6d8NnV5/HGbqkeuzoLNWNb/T3N0rNolxcJ2tbgsKnJ5qnlDk0LTR3gzLjHM74h3Js836aTJoLtlXGfY1cz7OnzePdwij5JoeU37mqXn8WFiUhEUVpm8EiNxRcfjtCy2CJiKq32phu5uwD5zCtfAIRs037A2Rs+7511OT6YZmgozYTcLMln3HNhYjZXZvRVGKgRFq53qCs3dTL0flI/3yMw38XL+nR3eRDoAGj3BY9BaZ1J+qFKvN+Y8srMbC9NyaUGR1FRZOiIULvAbGafzw7l8R5xA2nDlzsISck+ixidiXOmvjR197W5YAOKcoLstbn5Xp7KYyGfyVtyOz/1tTm5LxxrwjeeRgWdBKwBVVWw63OfyQrzMf3pLk7KZ7efXQKRNjDXAZv01Vnly02yz8HVWQFALk8nzTKMaA2+0Yr6vF2ezlnQ5/D6/P8B2ux6/VAGgRsAAAAASUVORK5CYII='); + INSERT OR REPLACE INTO "sitedata" 
        VALUES('dat:1919b7b61d581c7877d20842eea7a1a033f251e9c1bb0050b4209294d2c3a1ee','favicon','data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAFlklEQVRYR8VXaUxUVxT+3nuzgDN1wQ1RoyW1RaVWWxeQWnCARCFVREag1SjQNG6o0Da2tEFpYtXE2lpjjSaaVKVVJDYqYKs4aCKL1hWrFUOjVTBSZKDMMAuzvOZyHea9mXnDSH/0/oGce5bvnfOdc+4w+J8P09/42uh/QljVwJFgwTG8SX/0nOpJf3wFDEA7+Y6CDY1YxYBJ58FE+w7GN4Dnf7Y5rXtOXBjQFAigPgFotSUco1+8iwG7JhCHbh2+kuHas4+eHfrYn51fANq55liWDaoEIHux4G5tJ2//7LhOvk3KXhKAVmNZyzLK3f0NLLZznD52XrbAly+fAJZoLCsZRrlXKjjPA7ZuwOGgGiwLyBX0r7SNvaJEJ0/2vPcCkBrXFiXnQmp9ObLbgRGhwPxFwJQZwKjRNGjbM+BePaA7A1yvo7KgYMDpBDhOUA6n6cvjVapNQt9eANLjnQaAUfsCkFsAxGj8F4VkBjwgVwL6Z0BOCqAMctuYLQ8iTlWHN7gkIgCL5xqKZKy60DNE8ABg12Fg4CABx3mg8R7QbQVGjQFChnkDM5uAjASA2LsOz9t+K9EpZvoEkB7vdACMqJKkzvtLgSFDqUmXEfh+O1Cte55eBnDYgdAwIGsdMGuOO5ipC8hMFAPo8WG9P7Xs0mu3yP+9GVgU+7dWIRteIvwOUsPsXGDeIipt/gvIzwYYid7p7gaSU4GcDW4vKTGUD8LjsHcdLL2ozhEBSIs1HONk6iVCRZkcOHKGSkhuliW5mS/FBJsNWLMR0CRRjW0FwM0rYm2et7WX6BQhIgBajaWJZZSjhaqJ7wI566mkeD9w+pgwZ1IQaBf8eJbeX6sBthYAMsEoIxmsub865NGjve29yUyPd/CAu/yk9vmbgOg46ujDNKCzQzqo8MZuAzZ/C0ROAwgRMxOAIAERia7RdDOmvHZajQuAIiOBt5IB4zqknbbvA16ZSCULZ3uTSQoO8bN8DZCUSjV88cDY1bCwvC7ilAtAUEYCb/YEsG0fMKGfALLWusnrC4DBeDul4vKUk+4SaBy8sANJCTZ8AcTE069YnQnoWwMrAcnelj1AxOuA0QAsnefdCS0dlXMuXEu81AtgcZyhScapRSSMmwes/JgGLT0EHP9BugWF0BRK4FA5ldReBHZuFo9kQsKyugkjjMbG1l4AC95+fCJYOeZ5x1NjovjTObfrrAWUVP4OmYyffgXMfD6QivKAP26LLRx2k770oqpntPUC0Ey/sGzEoNhDAh6CdwLp2UDqUuqAlGD9croJfR2rBVi2Ckh9n94+bATyVoh3AZF3mR8Ul9WE93gVzjSlVmM1sIxCLnROBsvuI0DYWCol3Dj4HXDuNAVCskRG8cQpQPY64NXJbusPUgFjpxgq0b/zcOvs240FPRtXNFQTZ13ZEaKe8ZHn15GV+vVBIFTEEKC1BSBfTZaReO0C+VnA02bvPFksT26crB79puvGc6qr02KNzZxMNdDTlAyXrFwgOc0/B+qvAjuLAIvZW48Eu3pvdXRj8946KQCIDC9Mjny5qEzIBZcyWU5ksZAR/cZ0mhGWo9wgRDtfBjQ/oq8jz0NS36LXfVN1PT5feOdzr0VFFheMD31vi3Aw+cqI8ElGFpfUliRBOk13KytqJ88HYO8TAOFG1KTDhePDlm72B8J/MegtAdVhuHX2l8tTUwB4Fcbfs5yJGLcxMzK88ADHDhA8qgIJ69Zpai3ZUV2f/jkAn83b5w8TAGPnvnV++/BB72SyjIw89/o89Kvrq6/cXfGJ3nCDEE7SLBAArnYNj5pUnD3kpalJquBxE+VylbLXLZkFDhvM1uY/O7t+r7rRkHeg09x4DYCtL7SBAhD6IYNqMICRwwbHDue4IK6l7Vc9gKdkWAKw9BU0EBK+iI//pPsvk2faMApC4F8AAAAASUVORK5CYII=');
        PRAGMA user_version = 1;
      `, cb)
    },
@@ -260,5 +245,14 @@ migrations = [
       INSERT OR REPLACE INTO "sitedata" VALUES('dat:4fa30df06cbeda4ae87be8fd4334a61289be6648fb0bf7f44f6b91d2385c9328','favicon','data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAHklEQVQ4T2MsOrXzPwMFgHHUAIbRMGAYDQOGYREGAKNTL1G9PdjfAAAAAElFTkSuQmCC');
       PRAGMA user_version = 5;
     `, cb)
+  },
+  // version 6
+  // - more favicons
+  function (cb) {
+    db.exec(`
+      -- beaker.social
+      INSERT OR REPLACE INTO "sitedata" VALUES('dat:b3c82a26487167c276dc8539dcec97f52a95c8231bc2d41d28886ed36184d3b1','favicon','data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAjUlEQVR4Ac3BoQ0CQRCG0e/fG0UDJEs3WBx14KEVPDXgsBhquQs0gLsMKyasHYKA9/g5EXaX5whUcqbTZrGiKXSVvEoodDN5M8EIjq+FlmS43wnGm7ZAJUOagBuNEQR7PnOgKXzJ6I5AJcN9JBjB8bPQkgQXD4IRhK7AQILQDBhNoRvIGwiFbiJv5G+8ABIEG1xwcZ4JAAAAAElFTkSuQmCC');
+      PRAGMA user_version = 6;
+    `, cb)
   }
 ]
diff --git a/dbs/templates.js b/dbs/templates.js
index 4be48c6c..61a9e387 100644
--- a/dbs/templates.js
+++ b/dbs/templates.js
@@ -1,20 +1,56 @@
 const db = require('./profile-data-db')
+// typedefs
+// =
+
+/**
+ * @typedef {Object} Template
+ * @prop {string} url
+ * @prop {string} title
+ * @prop {number} createdAt
+ *
+ * @typedef {Object} TemplateScreenshot
+ * @prop {string} url
+ * @prop {string} screenshot
+ */
+
 // exported api
 // =
+/**
+ * @param {number} profileId
+ * @param {string} url
+ * @returns {Promise