From d8b0a0ea63089c38c58c06191e6dd774a7794c06 Mon Sep 17 00:00:00 2001 From: Matt Dean Date: Mon, 22 Sep 2025 15:31:37 +0100 Subject: [PATCH 1/5] [NRL-1606] Add report to get masterids for pointers --- .../get_pointer_masterids_for_custodian.py | 80 +++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 reports/get_pointer_masterids_for_custodian.py diff --git a/reports/get_pointer_masterids_for_custodian.py b/reports/get_pointer_masterids_for_custodian.py new file mode 100644 index 000000000..0c59aea66 --- /dev/null +++ b/reports/get_pointer_masterids_for_custodian.py @@ -0,0 +1,80 @@ +import json +from datetime import datetime, timedelta, timezone +from typing import Any + +import boto3 +import fire + +dynamodb = boto3.client("dynamodb") +paginator = dynamodb.get_paginator("scan") + + +def _get_masterids_for_custodians(table_name: str, custodians: str) -> Any: + """ + Get masterids for pointers in the given table for a list of custodians. + Parameters: + - table_name: The name of the pointers table to use. + """ + + print( # noqa + f"Getting masterids for custodians {custodians} in table {table_name}...." 
+ ) + + expression_names_str = ",".join([f":param_{custodian}" for custodian in custodians]) + expression_values_list = { + f":param_{custodian}": {"S": custodian} for custodian in custodians + } + + params: dict[str, Any] = { + "TableName": table_name, + "PaginationConfig": {"PageSize": 50}, + "FilterExpression": f"custodian IN ({expression_names_str})", + "ExpressionAttributeValues": expression_values_list, + "ProjectionExpression": "id, type_id, master_identifier", + } + + pointers_info: list[dict[str, str]] = [] + total_scanned_count = 0 + + start_time = datetime.now(tz=timezone.utc) + + for page in paginator.paginate(**params): + for item in page["Items"]: + pointer_id = item.get("id", {}).get("S", "no-id") + pointer_type = item.get("type_id", {}).get("S", "no-type") + master_id = item.get("master_identifier", {}).get("S", "no-master-id") + + pointers_info.append( + { + "nrl-id": pointer_id, + "pointer-type": pointer_type, + "master_identifier": master_id, + } + ) + + total_scanned_count += page["ScannedCount"] + + if total_scanned_count % 1000 == 0: + print(".", end="", flush=True) # noqa + + if total_scanned_count % 100000 == 0: + print(f"scanned={total_scanned_count} found={len(pointers_info)} ") # noqa + + end_time = datetime.now(tz=timezone.utc) + + print(" Done") # noqa + + print(f"Writing pointers to file ./pointer-masterids.txt ...") # noqa + with open(f"pointer-masterids.txt", "w") as f: + f.write(json.dumps(pointers_info, indent=2)) + + return { + "output-file": "pointer-masterids.txt", + "pointers-found": len(pointers_info), + "scanned-count": total_scanned_count, + "took-secs": timedelta.total_seconds(end_time - start_time), + } + + +if __name__ == "__main__": + fire.Fire(_get_masterids_for_custodians) From 07ed67897bda9209103f42f1d93a2567986db426 Mon Sep 17 00:00:00 2001 From: Matt Dean Date: Mon, 22 Sep 2025 16:18:29 +0100 Subject: [PATCH 2/5] [NRL-1606] Fix case where custodians param is not a tuple --- 
reports/get_pointer_masterids_for_custodian.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/reports/get_pointer_masterids_for_custodian.py b/reports/get_pointer_masterids_for_custodian.py index 0c59aea66..15c178567 100644 --- a/reports/get_pointer_masterids_for_custodian.py +++ b/reports/get_pointer_masterids_for_custodian.py @@ -9,20 +9,25 @@ paginator = dynamodb.get_paginator("scan") -def _get_masterids_for_custodians(table_name: str, custodians: str) -> Any: +def _get_masterids_for_custodians(table_name: str, custodians: str | tuple[str]) -> Any: """ Get masterids for pointers in the given table for a list of custodians. Parameters: - table_name: The name of the pointers table to use. """ + custodian_list = ( + custodians.split(",") if isinstance(custodians, str) else list(custodians) + ) print( # noqa - f"Getting masterids for custodians {custodians} in table {table_name}...." + f"Getting masterids for custodians {custodian_list} in table {table_name}...." 
) - expression_names_str = ",".join([f":param_{custodian}" for custodian in custodians]) + expression_names_str = ",".join( + [f":param{custodian}" for custodian in custodian_list] + ) expression_values_list = { - f":param_{custodian}": {"S": custodian} for custodian in custodians + f":param{custodian}": {"S": custodian} for custodian in custodian_list } params: dict[str, Any] = { From 8910f690ee528d5a3d2409741785463ec87d5f8e Mon Sep 17 00:00:00 2001 From: Matt Dean Date: Mon, 22 Sep 2025 16:45:18 +0100 Subject: [PATCH 3/5] [NRL-1606] Fix sonarcloud warning in new report --- reports/get_pointer_masterids_for_custodian.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/reports/get_pointer_masterids_for_custodian.py b/reports/get_pointer_masterids_for_custodian.py index 15c178567..8582ac4a9 100644 --- a/reports/get_pointer_masterids_for_custodian.py +++ b/reports/get_pointer_masterids_for_custodian.py @@ -1,10 +1,13 @@ import json +import os from datetime import datetime, timedelta, timezone from typing import Any import boto3 import fire +INCLUDE_PATIENT_IDS = os.environ.get("INCLUDE_PATIENT_IDS", "false").lower() == "true" + dynamodb = boto3.client("dynamodb") paginator = dynamodb.get_paginator("scan") @@ -23,6 +26,12 @@ def _get_masterids_for_custodians(table_name: str, custodians: str | tuple[str]) f"Getting masterids for custodians {custodian_list} in table {table_name}...." 
) + + required_attributes = ( + ["id", "type_id", "master_identifier", "custodian"] + if not INCLUDE_PATIENT_IDS + else ["id", "type_id", "master_identifier", "custodian", "nhs_number"] + ) + expression_names_str = ",".join( [f":param{custodian}" for custodian in custodian_list] ) @@ -35,7 +44,7 @@ def _get_masterids_for_custodians(table_name: str, custodians: str | tuple[str]) "PaginationConfig": {"PageSize": 50}, "FilterExpression": f"custodian IN ({expression_names_str})", "ExpressionAttributeValues": expression_values_list, - "ProjectionExpression": "id, type_id, master_identifier", + "ProjectionExpression": ",".join(required_attributes), } pointers_info: list[dict[str, str]] = [] @@ -48,12 +57,15 @@ def _get_masterids_for_custodians(table_name: str, custodians: str | tuple[str]) pointer_id = item.get("id", {}).get("S", "no-id") pointer_type = item.get("type_id", {}).get("S", "no-type") master_id = item.get("master_identifier", {}).get("S", "no-master-id") + custodian = item.get("custodian", {}).get("S", "no-custodian") pointers_info.append( { "nrl-id": pointer_id, "pointer-type": pointer_type, "master_identifier": master_id, + "custodian": custodian, + "patient_id": item.get("nhs_number", {}).get("S", "no-patient-id"), } ) @@ -70,7 +82,7 @@ def _get_masterids_for_custodians(table_name: str, custodians: str | tuple[str]) print(" Done") # noqa print(f"Writing pointers to file ./pointer-masterids.txt ...") # noqa - with open(f"pointer-masterids.txt", "w") as f: + with open("pointer-masterids.txt", "w") as f: f.write(json.dumps(pointers_info, indent=2)) return { From 857d7beb91fc1f99acec8d6c094712d17d4cb900 Mon Sep 17 00:00:00 2001 From: Matt Dean Date: Tue, 23 Sep 2025 08:56:37 +0100 Subject: [PATCH 4/5] [NRL-1606] Switch report attributes list from ternary to if --- reports/get_pointer_masterids_for_custodian.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/reports/get_pointer_masterids_for_custodian.py 
b/reports/get_pointer_masterids_for_custodian.py index 8582ac4a9..87e4bcbe4 100644 --- a/reports/get_pointer_masterids_for_custodian.py +++ b/reports/get_pointer_masterids_for_custodian.py @@ -26,11 +26,9 @@ def _get_masterids_for_custodians(table_name: str, custodians: str | tuple[str]) f"Getting masterids for custodians {custodian_list} in table {table_name}...." ) - required_attributes = ( - ["id", "type_id", "master_identifier", "custodian"] - if not INCLUDE_PATIENT_IDS - else ["id", "type_id", "master_identifier", "custodian", "nhs_number"] - ) + required_attributes = ["id", "type_id", "master_identifier", "custodian"] + if INCLUDE_PATIENT_IDS: + required_attributes.append("nhs_number") expression_names_str = ",".join( [f":param{custodian}" for custodian in custodian_list] From 63e5d45306654384250bceaf4b7bf38f849c0078 Mon Sep 17 00:00:00 2001 From: Matt Dean Date: Tue, 23 Sep 2025 08:58:59 +0100 Subject: [PATCH 5/5] [NRL-1606] Set patient_id in report to non-included if flag is not set --- reports/get_pointer_masterids_for_custodian.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/reports/get_pointer_masterids_for_custodian.py b/reports/get_pointer_masterids_for_custodian.py index 87e4bcbe4..c437ccf5c 100644 --- a/reports/get_pointer_masterids_for_custodian.py +++ b/reports/get_pointer_masterids_for_custodian.py @@ -56,6 +56,7 @@ def _get_masterids_for_custodians(table_name: str, custodians: str | tuple[str]) pointer_type = item.get("type_id", {}).get("S", "no-type") master_id = item.get("master_identifier", {}).get("S", "no-master-id") custodian = item.get("custodian", {}).get("S", "no-custodian") + patient_id = item.get("nhs_number", {}).get("S", "no-patient-id") pointers_info.append( { @@ -63,7 +64,7 @@ def _get_masterids_for_custodians(table_name: str, custodians: str | tuple[str]) "pointer-type": pointer_type, "master_identifier": master_id, "custodian": custodian, - "patient_id": item.get("nhs_number", {}).get("S", "no-patient-id"), 
+ "patient_id": patient_id if INCLUDE_PATIENT_IDS else "not-included", } )