From ae995a64451c66dfe330fc6d3564a0e1d9a07e76 Mon Sep 17 00:00:00 2001 From: Jonas Date: Tue, 15 Aug 2023 10:39:24 -0400 Subject: [PATCH] add onboarding to bitcoin core to scrapybot --- scrapybot/scrapybot/pipelines.py | 6 +-- .../spiders/onboardingtobitcoincore.py | 53 +++++++++++++++++++ 2 files changed, 56 insertions(+), 3 deletions(-) create mode 100644 scrapybot/scrapybot/spiders/onboardingtobitcoincore.py diff --git a/scrapybot/scrapybot/pipelines.py b/scrapybot/scrapybot/pipelines.py index 671c746..b97f410 100644 --- a/scrapybot/scrapybot/pipelines.py +++ b/scrapybot/scrapybot/pipelines.py @@ -81,7 +81,7 @@ class ElasticsearchPipeline: def process_item(self, item, spider): def parse_title(chunk: str) -> str: if spider.name in ['andreasbooks', 'btcphilosophy', 'grokkingbtc', - 'programmingbtc']: + 'programmingbtc', "onboardingtobitcoincore"]: delim_end = item['title'].find(']') return item['title'][1:delim_end] + ':' + \ item['title'][delim_end + 1:] + ' - ' + \ @@ -104,7 +104,7 @@ def parse_title(chunk: str) -> str: # Split documents for books if spider.name in ["bolts", "btcphilosophy", "grokkingbtc", "lndocs", "programmingbtc", "bips", "blips", "andreasbooks", - "bitmex"]: + "bitmex", "onboardingtobitcoincore"]: # Split this first for the body_formatted splitter = return_splitter(item['body_type'], 2000) # 2000 character limit @@ -124,7 +124,7 @@ def parse_title(chunk: str) -> str: else: if spider.name in ['andreasbooks', 'btcphilosophy', 'grokkingbtc', - 'programmingbtc']: + 'programmingbtc', 'onboardingtobitcoincore']: delim_end = item['title'].find(']') title = item['title'] item = {**item, 'title': title[1:delim_end] + ':' + title[ diff --git a/scrapybot/scrapybot/spiders/onboardingtobitcoincore.py b/scrapybot/scrapybot/spiders/onboardingtobitcoincore.py new file mode 100644 index 0000000..c793e5b --- /dev/null +++ b/scrapybot/scrapybot/spiders/onboardingtobitcoincore.py @@ -0,0 +1,53 @@ +from datetime import datetime +from bs4 import BeautifulSoup 
import json
import uuid
from datetime import datetime

from bs4 import BeautifulSoup
from scrapy.exceptions import DropItem
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from .utils import strip_attributes, strip_tags


class OnboardingToBitcoinCoreSpider(CrawlSpider):
    """Scrape the chapters of the "Onboarding to Bitcoin Core" book.

    Crawls the GitHub repository, follows every link to an ``.adoc``
    chapter file, and yields one item per chapter with the rendered HTML
    body and a ``[Onboarding to Bitcoin Core]``-prefixed title that the
    Elasticsearch pipeline splits back apart.
    """

    # Scrapy locates spiders by this `name` attribute (pipelines.py matches
    # on `spider.name` too), so the class rename from the copy-pasted
    # `GrokkingbtcSpider` does not affect `scrapy crawl` or the pipeline.
    name = "onboardingtobitcoincore"
    allowed_domains = ["github.com"]
    start_urls = ["https://github.com/chaincodelabs/onboarding-to-bitcoin-core"]

    # Follow only links to AsciiDoc chapter files in the repo file listing.
    rules = (
        Rule(
            LinkExtractor(restrict_xpaths="//span/a[contains(@href, 'adoc')]"),
            callback="parse_item",
        ),
    )

    def parse_item(self, response):
        """Build one index item from a GitHub-rendered ``.adoc`` chapter page.

        Returns a dict with the fields the Elasticsearch pipeline expects
        (id, title, body, body_formatted, body_type, authors, domain, url,
        created_at, indexed_at).
        """
        soup = BeautifulSoup(response.text, "html.parser")
        # GitHub embeds the rendered file as JSON in the page's last <script>
        # tag. NOTE(review): this scrape is layout-dependent — an IndexError/
        # KeyError here means GitHub changed its page structure.
        payload = json.loads(soup.find_all("script")[-1].contents[0])
        article = payload["payload"]["blob"]["richText"]

        # Use the chapter's first heading (any level) for the title; the
        # bracketed book prefix is always present, so the title is never
        # empty (the original's `if not item["title"]` guard was dead code).
        chapter = BeautifulSoup(article, "html.parser")
        header = chapter.find(["h1", "h2", "h3", "h4", "h5", "h6"])
        title = "[Onboarding to Bitcoin Core]"
        if header:
            title = title + " " + header.text

        # One timestamp for both fields so created_at == indexed_at (the
        # original called utcnow() twice and could differ by microseconds).
        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
        # datetime.now(timezone.utc) would append "+00:00" to the ISO string,
        # so confirm downstream consumers before changing the format.
        now = datetime.utcnow().isoformat()

        return {
            "id": "onboardingtobitcoincore-" + str(uuid.uuid4()),
            "title": title,
            "body_formatted": strip_attributes(article),
            "body": strip_tags(article),
            "body_type": "html",
            "authors": ["Will Clark"],
            "domain": self.start_urls[0],
            "url": response.url,
            "created_at": now,
            "indexed_at": now,
        }