From 6975488599ca96d927d3ac34bf30ad037b06ecff Mon Sep 17 00:00:00 2001 From: mweber-inventa Date: Mon, 1 May 2023 03:01:58 +0000 Subject: [PATCH] week1 project --- week1/index.py | 4 ++ week1/query.py | 59 ++++++++++++++++++++++++++ week1/results.txt | 104 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 167 insertions(+) create mode 100644 week1/results.txt diff --git a/week1/index.py b/week1/index.py index 4071623..fabf971 100644 --- a/week1/index.py +++ b/week1/index.py @@ -215,6 +215,8 @@ def main(source_dir: str, file_glob: str, index_name: str, workers: int, host: s client = get_opensearch(host) #TODO: set the refresh interval + client.indices.put_settings(index=index_name, body={'index': {'refresh_interval': refresh_interval}}) + logger.debug(client.indices.get_settings(index=index_name)) start = perf_counter() time_indexing = 0 @@ -228,6 +230,8 @@ def main(source_dir: str, file_glob: str, index_name: str, workers: int, host: s finish = perf_counter() logger.info(f'Done. {docs_indexed} were indexed in {(finish - start)/60} minutes. 
Total accumulated time spent in `bulk` indexing: {time_indexing/60} minutes') # TODO set refresh interval back to 5s + client.indices.put_settings(index=index_name, body={'index': {'refresh_interval': '5s'}}) + logger.debug(client.indices.get_settings(index=index_name)) if __name__ == "__main__": diff --git a/week1/query.py b/week1/query.py index 29af49b..45aa19e 100644 --- a/week1/query.py +++ b/week1/query.py @@ -168,6 +168,65 @@ def create_query(user_query, filters=None, sort="_score", sortDir="desc", size=1 query_obj["_source"] = source return query_obj +def create_query_week1(user_query, filters=None, sort="_score", sortDir="desc", size=10, source=None): + query_obj = { + 'size': size, + "sort": [ + {sort: {"order": sortDir}} + ], + "query": { + "function_score": { + "query": { + "bool": { + "must": [ + + ], + "should": [ # + { + "match": { + "name": { + "query": user_query, + "fuzziness": "0", + "fuzzy_transpositions" : False, + "prefix_length": 2, + # short words are often acronyms or usually not misspelled, so don't edit + "boost": 0.01 + } + } + }, + { + "multi_match": { + "query": user_query, + "type": "phrase", + "slop": "6", + "minimum_should_match": "2<75%", + "fields": ["name^10", "shortDescription^5"] + } + } + + ], + "minimum_should_match": 1, + "filter": filters # + } + }, + "boost_mode": "multiply", # how _score and functions are combined + "score_mode": "sum", # how functions are combined + "functions": [ + ] + + } + } + } + if user_query == "*" or user_query == "#": + # replace the bool + try: + query_obj["query"] = {"match_all": {}} + except: + print("Couldn't replace query for *") + if source is not None: # otherwise use the default and retrieve all source + query_obj["_source"] = source + return query_obj + def search(client, user_query, index="bbuy_products"): query_obj = create_query(user_query) diff --git a/week1/results.txt b/week1/results.txt new file mode 100644 index 0000000..2e47d27 --- /dev/null +++ b/week1/results.txt @@ -0,0 +1,104 
@@ +---- week1, running in gitpod. + +export BBUY_DATA=/workspace/datasets/product_data/products + +curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products +curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products.json + +python index.py -s /workspace/datasets/product_data/products +INFO:Indexing /workspace/datasets/product_data/products to bbuy_products with 8 workers, refresh_interval of -1 to host localhost with a maximum number of docs sent per file per worker of 200000 and 200 per batch. +INFO:Done. 1275077 were indexed in 9.922109846950237 minutes. Total accumulated time spent in `bulk` indexing: 30.307174058161035 minutes + + +curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products + +curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json +python index.py -s /workspace/datasets/product_data/products +INFO:Indexing /workspace/datasets/product_data/products to bbuy_products with 8 workers, refresh_interval of -1 to host localhost with a maximum number of docs sent per file per worker of 200000 and 200 per batch. +INFO:Done. 1275077 were indexed in 8.406999807966834 minutes. Total accumulated time spent in `bulk` indexing: 23.900626916195325 minutes + + +-- refresh-intervals: + +-1: +curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products +curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json +python index.py -s /workspace/datasets/product_data/products --refresh_interval -1 +INFO:Indexing /workspace/datasets/product_data/products to bbuy_products with 8 workers, refresh_interval of -1 to host localhost with a maximum number of docs sent per file per worker of 200000 and 200 per batch. +INFO:Done. 1275077 were indexed in 8.115769755166548 minutes. 
Total accumulated time spent in `bulk` indexing: 23.873669005803823 minutes + +1: +curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products +curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json +python index.py -s /workspace/datasets/product_data/products --refresh_interval 1s +INFO:Indexing /workspace/datasets/product_data/products to bbuy_products with 8 workers, refresh_interval of 1s to host localhost with a maximum number of docs sent per file per worker of 200000 and 200 per batch. +INFO:Done. 1275077 were indexed in 8.852590531449824 minutes. Total accumulated time spent in `bulk` indexing: 24.022950780299045 minutes + +60: +curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products +curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json +python index.py -s /workspace/datasets/product_data/products --refresh_interval 60s +INFO:Indexing /workspace/datasets/product_data/products to bbuy_products with 8 workers, refresh_interval of 60s to host localhost with a maximum number of docs sent per file per worker of 200000 and 200 per batch. +INFO:Done. 1275077 were indexed in 7.781203374333563 minutes. Total accumulated time spent in `bulk` indexing: 22.726982587378007 minutes + +-- Batch sizes: + +400: +curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products +curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json +python index.py -s /workspace/datasets/product_data/products --batch_size 400 +INFO:Indexing /workspace/datasets/product_data/products to bbuy_products with 8 workers, refresh_interval of -1 to host localhost with a maximum number of docs sent per file per worker of 200000 and 400 per batch. +INFO:Done. 1275077 were indexed in 8.35581113601705 minutes. 
Total accumulated time spent in `bulk` indexing: 22.256467518976812 minutes + + +800: +curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products +curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json +python index.py -s /workspace/datasets/product_data/products --batch_size 800 +INFO:Indexing /workspace/datasets/product_data/products to bbuy_products with 8 workers, refresh_interval of -1 to host localhost with a maximum number of docs sent per file per worker of 200000 and 800 per batch. +INFO:Done. 1275077 were indexed in 7.519638787550018 minutes. Total accumulated time spent in `bulk` indexing: 20.513053392077563 minutes + + +1600: +curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products +curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json +python index.py -s /workspace/datasets/product_data/products --batch_size 1600 +INFO:Indexing /workspace/datasets/product_data/products to bbuy_products with 8 workers, refresh_interval of -1 to host localhost with a maximum number of docs sent per file per worker of 200000 and 1600 per batch. +INFO:Done. 1275077 were indexed in 9.076533603150164 minutes. Total accumulated time spent in `bulk` indexing: 24.444320515682435 minutes + +-- Workers: + +16: +curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products +curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json +python index.py -s /workspace/datasets/product_data/products --workers 16 +INFO:Indexing /workspace/datasets/product_data/products to bbuy_products with 16 workers, refresh_interval of -1 to host localhost with a maximum number of docs sent per file per worker of 200000 and 200 per batch. +INFO:Done. 1275077 were indexed in 11.619162508367056 minutes. 
Total accumulated time spent in `bulk` indexing: 72.77314870621825 minutes

32:
curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products
curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json
python index.py -s /workspace/datasets/product_data/products --workers 32

64:
curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products
curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json
python index.py -s /workspace/datasets/product_data/products --workers 64



-- Query performance
export QUERY_FILE=/workspace/datasets/train.csv

python query.py --query_file /workspace/datasets/train.csv --max_queries 1000

INFO:Loading query file from /workspace/datasets/train.csv
INFO:Running queries, checking in every 1000 queries:
INFO:Query: Bad teacher has 10 hits.
INFO:Finished running 1000 queries in 0.422766538283516 minutes


INFO:Loading query file from /workspace/datasets/train.csv
INFO:Running queries, checking in every 1000 queries:
INFO:Query: Bad teacher has 10 hits.
INFO:Finished running 1000 queries in 0.16969594265004465 minutes