Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions scrapybot/scrapybot/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ class ElasticsearchPipeline:
def process_item(self, item, spider):
def parse_title(chunk: str) -> str:
if spider.name in ['andreasbooks', 'btcphilosophy', 'grokkingbtc',
'programmingbtc']:
'programmingbtc', "onboardingtobitcoincore"]:
delim_end = item['title'].find(']')
return item['title'][1:delim_end] + ':' + \
item['title'][delim_end + 1:] + ' - ' + \
Expand All @@ -104,7 +104,7 @@ def parse_title(chunk: str) -> str:
# Split documents for books
if spider.name in ["bolts", "btcphilosophy", "grokkingbtc", "lndocs",
"programmingbtc", "bips", "blips", "andreasbooks",
"bitmex"]:
"bitmex", "onboardingtobitcoincore"]:
# Split this first for the body_formatted
splitter = return_splitter(item['body_type'],
2000) # 2000 character limit
Expand All @@ -124,7 +124,7 @@ def parse_title(chunk: str) -> str:
else:
if spider.name in ['andreasbooks', 'btcphilosophy',
'grokkingbtc',
'programmingbtc']:
'programmingbtc', 'onboardingtobitcoincore']:
delim_end = item['title'].find(']')
title = item['title']
item = {**item, 'title': title[1:delim_end] + ':' + title[
Expand Down
53 changes: 53 additions & 0 deletions scrapybot/scrapybot/spiders/onboardingtobitcoincore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from datetime import datetime
from bs4 import BeautifulSoup
import json
from .utils import strip_tags, strip_attributes
from scrapy.exceptions import DropItem
import uuid
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class GrokkingbtcSpider(CrawlSpider):
    """Crawl the 'Onboarding to Bitcoin Core' book from its GitHub repo.

    Follows links to the .adoc chapter files and extracts the HTML that
    GitHub embeds as JSON inside the last <script> tag of each page
    (under payload.blob.richText).
    """
    # NOTE(review): the class name looks copy-pasted from the grokkingbtc
    # spider. Scrapy locates spiders via the `name` attribute below, so this
    # still works, but the class should be renamed (e.g.
    # OnboardingToBitcoinCoreSpider) — confirm nothing imports it by name.

    name = "onboardingtobitcoincore"
    allowed_domains = ["github.com"]
    start_urls = ["https://github.com/chaincodelabs/onboarding-to-bitcoin-core"]

    rules = (
        Rule(
            # Chapter files are .adoc links in the repo file listing.
            LinkExtractor(restrict_xpaths="//span/a[contains(@href, 'adoc')]"),
            callback="parse_item",
        ),
    )

    def parse_item(self, response):
        """Build one index item from a GitHub-rendered .adoc chapter page.

        Returns a dict item, or None when the page does not have the
        expected embedded-JSON structure (logged and skipped instead of
        erroring the whole request).
        """
        soup = BeautifulSoup(response.text, "html.parser")
        script_tags = soup.find_all("script")
        if not script_tags:
            self.logger.warning("No <script> tags found on %s", response.url)
            return None

        # GitHub serves the rendered file as JSON in the page's last
        # <script> tag; guard against layout changes rather than crashing.
        try:
            json_object = json.loads(script_tags[-1].contents[0])
            article = json_object["payload"]["blob"]["richText"]
        except (ValueError, KeyError, IndexError, TypeError):
            self.logger.warning("Unexpected page structure at %s", response.url)
            return None
        if not article:
            self.logger.warning("Empty richText payload at %s", response.url)
            return None

        item = {}
        item["id"] = "onboardingtobitcoincore-" + str(uuid.uuid4())

        # Title comes from the first heading in the rendered chapter; the
        # "[...]" prefix is the convention the Elasticsearch pipeline parses.
        header = BeautifulSoup(article, "html.parser").find(
            ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        if header:
            item["title"] = "[Onboarding to Bitcoin Core] " + header.text
        else:
            item["title"] = "[Onboarding to Bitcoin Core]"

        item["body_formatted"] = strip_attributes(article)
        item["body"] = strip_tags(article)
        item["body_type"] = "html"
        item["authors"] = ["Will Clark"]
        item["domain"] = self.start_urls[0]
        item["url"] = response.url
        # Single timestamp so created_at and indexed_at match exactly.
        now = datetime.utcnow().isoformat()
        item["created_at"] = now
        item["indexed_at"] = now

        return item