diff --git a/.gitignore b/.gitignore index 98a49d6..38140d8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,10 @@ .idea/ -delvingbitcoin_2_elasticsearch\archive +delvingbitcoin_2_elasticsearch/archive/posts +delvingbitcoin_2_elasticsearch/archive/rendered-topics venv/ -archive + +archive/posts +archive/rendered-topics .venv @@ -96,7 +99,6 @@ web_modules/ .yarn-integrity # dotenv environment variable files -.env .env.development.local .env.test.local .env.production.local diff --git a/delvingbitcoin_2_elasticsearch/achieve.py b/delvingbitcoin_2_elasticsearch/achieve.py index ba9520f..bd3e692 100644 --- a/delvingbitcoin_2_elasticsearch/achieve.py +++ b/delvingbitcoin_2_elasticsearch/achieve.py @@ -165,7 +165,7 @@ def download_dumps() -> None: # Resync over the last day to catch any post edits. last_sync_date -= datetime.timedelta(days=1) - log.info("detected latest synced post date:{last_sync_date}") + log.info(f"detected latest synced post date: {last_sync_date}") topics_to_get = {} max_created_at = None diff --git a/delvingbitcoin_2_elasticsearch/archive/.metadata.json b/delvingbitcoin_2_elasticsearch/archive/.metadata.json new file mode 100644 index 0000000..0e17005 --- /dev/null +++ b/delvingbitcoin_2_elasticsearch/archive/.metadata.json @@ -0,0 +1,3 @@ +{ + "last_sync_date": "2025-01-24T00:42:29.140000+00:00" +} \ No newline at end of file diff --git a/delvingbitcoin_2_elasticsearch/delvingbitcoin_2_elasticsearch.py b/delvingbitcoin_2_elasticsearch/delvingbitcoin_2_elasticsearch.py index c5afe37..11424e5 100644 --- a/delvingbitcoin_2_elasticsearch/delvingbitcoin_2_elasticsearch.py +++ b/delvingbitcoin_2_elasticsearch/delvingbitcoin_2_elasticsearch.py @@ -10,7 +10,7 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from common.elasticsearch_utils import document_add, document_view, create_index +from common.elasticsearch_utils import document_add, document_view, create_index, document_update from achieve import download_dumps @@ 
-86,10 +86,11 @@ def index_documents(files_path): 'body_type': 'html', 'body': body, 'body_formatted': strip_attributes_but_urls(document['cooked']), - 'created_at': document['updated_at'], + 'created_at': document['created_at'], 'domain': "https://delvingbitcoin.org/", 'url': f"https://delvingbitcoin.org/t/{document['topic_slug']}/{document['topic_id']}", - "indexed_at": datetime.utcnow().isoformat() + "indexed_at": datetime.utcnow().isoformat(), + "updated_at": document['updated_at'] } if document['post_number'] != 1: @@ -98,13 +99,16 @@ def index_documents(files_path): else: doc['type'] = 'original_post' - # Check if a document already exists + # check if the document exists and update it if any changes are found resp = document_view(index_name=INDEX, doc_id=doc['id']) - if not resp: + if resp and doc['updated_at'] != resp['_source']['updated_at']: + _ = document_update(index_name=INDEX, doc=doc, doc_id=doc['id']) + log.success(f'Updated doc with ID: {doc["id"]}, Type:{doc["type"]}') + elif not resp: _ = document_add(index_name=INDEX, doc=doc, doc_id=doc['id']) - log.success(f'Successfully added! ID: {doc["id"]}, Type:{doc["type"]}') + log.success(f'Added doc with ID: {doc["id"]}, Type:{doc["type"]}') else: - log.info(f"Document already exist! ID: {doc['id']}") + log.info(f"No changes - ID: {doc['id']}") if __name__ == "__main__":