From 75abeb378b7a8d80cec3db321d8d55eda467274b Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 22 Jan 2026 09:46:18 +0000 Subject: [PATCH 01/15] Added converse api call to bedrock --- .../gen_user_names_file.py | 120 ++++++++++++------ generator/utilities/Bedrock.py | 77 ++++++----- 2 files changed, 131 insertions(+), 66 deletions(-) rename generator/{utilities => batch}/gen_user_names_file.py (72%) diff --git a/generator/utilities/gen_user_names_file.py b/generator/batch/gen_user_names_file.py similarity index 72% rename from generator/utilities/gen_user_names_file.py rename to generator/batch/gen_user_names_file.py index c8765fe..753a1b5 100644 --- a/generator/utilities/gen_user_names_file.py +++ b/generator/batch/gen_user_names_file.py @@ -1,6 +1,9 @@ +# python generator/batch/gen_user_names_file.py + import os import json import boto3 +from botocore.config import Config import sys import time import logging @@ -11,10 +14,47 @@ sys.path.append("E:\\GitHub\\RandomTelecomPayments\\generator") import cons -from utilities.Bedrock import Bedrock, prompt, system_prompt +from utilities.Bedrock import Bedrock + +system_prompt = """# Task + +You are a name generator for people from different countries in Europe. Your task is to generate an arbitrary N number of distinct and varied first names and last names for people from a given European country of origin. + +# Requirements + +- Generate typical names for both male and female people. +- The names do not need to be traditional to the target European country. +- Do not repeat any first names or last names more than once. Each individual first name must be unique and each individual last name must be unique. +- You should return the first names and last names using a valid JSON object tagged as . 
+- The valid JSON object should be of the following structure; {"firstnames":["first name 1","first name 2",...,"first name N"], "lastnames":["last name 1","last name 2",...,"last name N"]} + +# Examples + +- Generate 2 first names and 2 last names for people from the country "Germany" -> {"firstnames":["Max","Hannah"], "lastnames":["Müller","Schmidt"]} +- Generate 4 first names and 4 last names for people from the country "United Kingdom" -> {"firstnames":["George","Richard","Katie","Mary"], "lastnames":["Smith","Taylor","Jones","Brown"]} +- Generate 3 first names and 3 last names for people from the country "France" -> {"firstnames":["Lola","Mathieu","Léa"], "lastnames":["Benoît","Pierre","Lefort"]} +- Generate 5 first names and 5 last names for people from the country "Spain" -> {"firstnames":["Juan","Cristina","Javier","Julia","Isabel"], "lastnames":["Garcia","Martinez","Rodriguez","Lopez","Gomez"]} +- Generate 6 first names and 6 last names for people from the country "Sweden" -> {"firstnames":["Tova","Alva","Casper","Märta","Axel","Elsa"], "lastnames":["Andersson","Johansson","Lundberg","Svensson","Pettersson","Nilsson"]} +""" + +prompt = 'Generate {n_user_names} first names and {n_user_names} last names for people from the country "{country}"' + +bedrock_config = { + "inferenceConfig":{ + "maxTokens":8192, + "temperature":0.5, + "topP":0.5, + }, + "system":[ + { + "text":system_prompt + } + ] +} def invoke_bedrock( model:Bedrock, + model_id:str, n_user_names:int, country:str, countrieseurope:pd.DataFrame, @@ -62,8 +102,10 @@ def invoke_bedrock( logging.info("Calling Bedrock ...") # call bedrock model formatted_prompt = prompt.format(n_user_names=n_user_names, country=country) - logging.info(formatted_prompt) - model_response = model.prompt(user_prompt=formatted_prompt, system_prompt=system_prompt, max_gen_len=2048) + messages = [{"role":"user", "content":[{"text":formatted_prompt}]}] + logging.info(messages) + model_response = model.prompt(model_id=model_id, 
user_prompt=formatted_prompt, system_prompt=system_prompt, max_gen_len=2048) + #model_response = model.converse(modelId=model_id, messages=messages, system=bedrock_config['system'], inference_config=bedrock_config['inferenceConfig']) # split out answer text = model_response.split("")[1].split("")[0] # parse json @@ -117,54 +159,32 @@ def invoke_bedrock( logging.info(f"Wrote {fpath_temp_llama_lastnames} ...") return (tmp_firstname_country_data, tmp_lastname_country_data) -if __name__ == "__main__": - - # set up logging - lgr = logging.getLogger() - lgr.setLevel(logging.INFO) - - # load aws config - with open(cons.fpath_aws_session_token, "r") as j: - aws_config = json.loads(j.read()) - - # connect to aws boto3 - session = boto3.Session( - aws_access_key_id=aws_config['Credentials']["AccessKeyId"], - aws_secret_access_key=aws_config['Credentials']["SecretAccessKey"], - aws_session_token=aws_config['Credentials']["SessionToken"], - region_name="us-east-1" - ) - - # create bedrock instance - bedrock = Bedrock(session=session, model_region="us-east-1", model_id="meta.llama3-70b-instruct-v1:0") +def main(bedrock, model_id, run_bedrock=False): + """ + Docstring for main + """ # load countries, firstnames and surnames files countrieseurope = pd.read_csv(cons.fpath_countries_europe, usecols=['name', 'ISO numeric']) orig_firstnames = pd.read_csv(cons.fpath_firstnames) orig_surnames = pd.read_csv(cons.fpath_lastnames) - # determine file size orig_filesize = int((orig_firstnames.shape[0] + orig_surnames.shape[0])/2) n_countries = countrieseurope.shape[0] - n_user_names = min(50, int(orig_filesize / n_countries)) - + n_user_names = min(2, int(orig_filesize / n_countries)) # generate user names - firstname_country_data = [] - lastname_country_data = [] - error_countries = [] - # switch to toggle bedrock calls - run_bedrock = False - + firstname_country_data, lastname_country_data, error_countries = [], [], [] # set countries list - countries_list = 
countrieseurope['name'].to_list() - #countries_list = ['Cyprus'] + #countries_list = countrieseurope['name'].to_list() + countries_list = ['Cyprus'] + # iterate over countries list for country in countries_list: logging.info(f"{country} ...") try: if run_bedrock: # call bedrock model and generate user names data - tmp_firstname_country_data, tmp_lastname_country_data = invoke_bedrock(model=bedrock, n_user_names=n_user_names, country=country) + tmp_firstname_country_data, tmp_lastname_country_data = invoke_bedrock(model=bedrock, model_id=model_id, n_user_names=n_user_names, country=country, countrieseurope=countrieseurope) logging.info("Waiting ...") # wait 20 seconds before retrying time.sleep(20) @@ -205,4 +225,32 @@ def invoke_bedrock( logging.info(f"output_lastname_country_df.shape: {output_lastname_country_df.shape}") output_lastname_country_df.to_csv(cons.fpath_llama_lastnames, index=False, encoding="latin1") else: - logging.info("WARNING Insufficient last name data generated.") \ No newline at end of file + logging.info("WARNING Insufficient last name data generated.") + +lgr = logging.getLogger() +lgr.setLevel(logging.INFO) + +if __name__ == "__main__": + # set aws region + aws_region = "us-east-1" + model_id="us.meta.llama3-1-70b-instruct-v1:0" + # load aws config + with open(cons.fpath_aws_session_token, "r") as j: + aws_config = json.loads(j.read()) + # connect to aws boto3 + session = boto3.Session( + aws_access_key_id=aws_config['Credentials']["AccessKeyId"], + aws_secret_access_key=aws_config['Credentials']["SecretAccessKey"], + aws_session_token=aws_config['Credentials']["SessionToken"], + region_name=aws_region + ) + bedrock_runtime = session.client( + service_name="bedrock-runtime", + region_name=aws_region, + config=Config(retries={"max_attempts":1, "mode": "adaptive"}) + ) + # create bedrock instance + bedrock = Bedrock(bedrock_runtime=bedrock_runtime) + # execute main programme + main(bedrock=bedrock, run_bedrock=True, model_id=model_id) + diff 
--git a/generator/utilities/Bedrock.py b/generator/utilities/Bedrock.py index 9ea42d6..1e2ea0a 100644 --- a/generator/utilities/Bedrock.py +++ b/generator/utilities/Bedrock.py @@ -1,5 +1,5 @@ import json -import boto3 +from typing import Dict, List from beartype import beartype class Bedrock(): @@ -10,7 +10,7 @@ class Bedrock(): Parameters ---------- - session : boto3.Session + bedrock_runtime : boto3.Session A Boto3 session object configured with appropriate AWS credentials. model_region: str The AWS region where the Bedrock model is hosted. @@ -31,16 +31,14 @@ class Bedrock(): @beartype def __init__( self, - session:boto3.Session, - model_region="us-east-1", - model_id:str="meta.llama3-8b-instruct-v1:0", + bedrock_runtime, ): - self.client = session.client("bedrock-runtime", region_name=model_region) - self.model_id = model_id, - + self.bedrock_runtime = bedrock_runtime + @beartype def prompt( self, + model_id:str, user_prompt:str, system_prompt:str="", top_p:float=0.5, @@ -89,32 +87,51 @@ def prompt( # call bedrock model try: # Invoke the model with the request. - response = self.client.invoke_model(modelId=self.model_id, body=request) + response = self.bedrock_runtime.invoke_model(modelId=model_id, body=request) except Exception as e: - raise Exception(f"ERROR: Can't invoke '{self.model_id}'. Reason: {e}") + raise Exception(f"ERROR: Can't invoke '{model_id}'. Reason: {e}") # Decode and extract the response model_response = json.loads(response["body"].read()) response_text = model_response["generation"] return response_text + + @beartype + def converse( + self, + modelId:str, + messages:List, + system:List, + inference_config:Dict={"maxTokens":512, "temperature":0.5, "topP":0.5,}, + tools_config:Dict=None + ): + """ + Invoke the Bedrock model with the provided messages and configurations. -system_prompt = """# Task - -You are a name generator for people from different countries in Europe. 
Your task is to generate an arbitrary N number of distinct and varied first names and last names for people from a given European country of origin. - -# Requirements - -- Generate typical names for both male and female people. -- The names do not need to be traditional to the target European country. -- Do not repeat any first names or last names more than once. Each individual first name must be unique and each individual last name must be unique. -- You should return the first names and last names using a valid JSON object tagged as . -- The valid JSON object should be of the following structure; {"firstnames":["first name 1","first name 2",...,"first name N"], "lastnames":["last name 1","last name 2",...,"last name N"]} - -# Examples - -- Generate 2 first names and 2 last names for people from the country "Germany" -> {"firstnames":["Max","Hannah"], "lastnames":["Müller","Schmidt"]} -- Generate 4 first names and 4 last names for people from the country "United Kingdom" -> {"firstnames":["George","Richard","Katie","Mary"], "lastnames":["Smith","Taylor","Jones","Brown"]} -- Generate 3 first names and 3 last names for people from the country "France" -> {"firstnames":["Lola","Mathieu","Léa"], "lastnames":["Benoît","Pierre","Lefort"]} -- Generate 5 first names and 5 last names for people from the country "Spain" -> {"firstnames":["Juan","Cristina","Javier","Julia","Isabel"], "lastnames":["Garcia","Martinez","Rodriguez","Lopez","Gomez"]} -- Generate 6 first names and 6 last names for people from the country "Sweden" -> {"firstnames":["Tova","Alva","Casper","Märta","Axel","Elsa"], "lastnames":["Andersson","Johansson","Lundberg","Svensson","Pettersson","Nilsson"]}""" + Parameters + ---------- + messages : Dict + A list of message objects representing the conversation history. + system : Dict + A system message object providing context or instructions for the model. + inference_config : Dict + Configuration settings for inference parameters. 
+ tools_config : Dict + Configuration settings for any tools to be used during inference. -prompt = 'Generate {n_user_names} first names and {n_user_names} last names for people from the country "{country}"' \ No newline at end of file + Returns + ------- + Dict: + The response from the Bedrock Claude model. + + References + ---------- + https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-runtime/client/converse.html + """ + payload = {"modelId": modelId, "messages": messages, "system": system} + if inference_config: + payload["inferenceConfig"] = inference_config + if tools_config: + payload["toolsConfig"] = tools_config + # call converse api + response = self.bedrock_runtime.converse(**payload) + return response From 3239e618b50137e27d50ab655d35ca08e32dd132 Mon Sep 17 00:00:00 2001 From: Oisin Date: Sun, 25 Jan 2026 12:10:34 +0000 Subject: [PATCH 02/15] Generalised gen_user_names_file.py batch script to work for other data points such as email domains --- generator/batch/gen_user_names_file.py | 186 ++++++++++++------------- generator/cons.py | 8 ++ 2 files changed, 97 insertions(+), 97 deletions(-) diff --git a/generator/batch/gen_user_names_file.py b/generator/batch/gen_user_names_file.py index 753a1b5..cfce17f 100644 --- a/generator/batch/gen_user_names_file.py +++ b/generator/batch/gen_user_names_file.py @@ -10,34 +10,52 @@ import unidecode import pandas as pd import numpy as np +from typing import Dict sys.path.append("E:\\GitHub\\RandomTelecomPayments\\generator") import cons from utilities.Bedrock import Bedrock -system_prompt = """# Task +system_name_prompt = """# Task -You are a name generator for people from different countries in Europe. Your task is to generate an arbitrary N number of distinct and varied first names and last names for people from a given European country of origin. +You are a name generator for people from different countries in Europe. 
+Your task is to generate an arbitrary N number of distinct and varied first names, or last names, for people from a given European country of origin. # Requirements - Generate typical names for both male and female people. - The names do not need to be traditional to the target European country. -- Do not repeat any first names or last names more than once. Each individual first name must be unique and each individual last name must be unique. +- Do not repeat any first names or last names more than once. +- Each individual first name must be unique and each individual last name must be unique. - You should return the first names and last names using a valid JSON object tagged as . -- The valid JSON object should be of the following structure; {"firstnames":["first name 1","first name 2",...,"first name N"], "lastnames":["last name 1","last name 2",...,"last name N"]} +- The valid JSON object should be of the following structures; `["name 1","name 2",...,"name N"]`. # Examples -- Generate 2 first names and 2 last names for people from the country "Germany" -> {"firstnames":["Max","Hannah"], "lastnames":["Müller","Schmidt"]} -- Generate 4 first names and 4 last names for people from the country "United Kingdom" -> {"firstnames":["George","Richard","Katie","Mary"], "lastnames":["Smith","Taylor","Jones","Brown"]} -- Generate 3 first names and 3 last names for people from the country "France" -> {"firstnames":["Lola","Mathieu","Léa"], "lastnames":["Benoît","Pierre","Lefort"]} -- Generate 5 first names and 5 last names for people from the country "Spain" -> {"firstnames":["Juan","Cristina","Javier","Julia","Isabel"], "lastnames":["Garcia","Martinez","Rodriguez","Lopez","Gomez"]} -- Generate 6 first names and 6 last names for people from the country "Sweden" -> {"firstnames":["Tova","Alva","Casper","Märta","Axel","Elsa"], "lastnames":["Andersson","Johansson","Lundberg","Svensson","Pettersson","Nilsson"]} +## First Names + +- Generate 2 first names for people from the 
country "Germany" -> ["Max","Hannah"] +- Generate 4 first names for people from the country "United Kingdom" -> ["George","Richard","Katie","Mary"] +- Generate 3 first names for people from the country "France" -> ["Lola","Mathieu","Léa"] +- Generate 5 first names for people from the country "Spain" -> ["Juan","Cristina","Javier","Julia","Isabel"] +- Generate 6 first names for people from the country "Sweden" -> ["Tova","Alva","Casper","Märta","Axel","Elsa"] + +## Last Names + +- Generate 2 last names for people from the country "Germany" -> ["Müller","Schmidt"] +- Generate 4 last names for people from the country "United Kingdom" -> ["Smith","Taylor","Jones","Brown"] +- Generate 3 last names for people from the country "France" -> ["Benoît","Pierre","Lefort"] +- Generate 5 last names for people from the country "Spain" -> ["Garcia","Martinez","Rodriguez","Lopez","Gomez"] +- Generate 6 last names for people from the country "Sweden" -> ["Andersson","Johansson","Lundberg","Svensson","Pettersson","Nilsson"] """ -prompt = 'Generate {n_user_names} first names and {n_user_names} last names for people from the country "{country}"' +system_email_prompt = """ +""" + +firstname_prompt = 'Generate {n_data_points} first names for people from the country "{country}"' +surname_prompt = 'Generate {n_data_points} last names for people from the country "{country}"' +email_domain_prompt = 'Generate {n_data_points} popular email domains names for people from the country "{country}"' bedrock_config = { "inferenceConfig":{ @@ -47,7 +65,7 @@ }, "system":[ { - "text":system_prompt + "text":system_name_prompt } ] } @@ -55,9 +73,13 @@ def invoke_bedrock( model:Bedrock, model_id:str, - n_user_names:int, + data_point:str, + n_data_points:int, country:str, countrieseurope:pd.DataFrame, + prompt:str, + system_prompt:str, + country_fpath:str, ) -> tuple[pd.DataFrame, pd.DataFrame]: """ Invokes the Bedrock model to generate user names for a specified country. 
@@ -71,8 +93,8 @@ def invoke_bedrock( ---------- model : Bedrock The Bedrock model instance used to generate names. - n_user_names : int - The number of user names to generate. + n_data_points : int + The number of data points to generate country : str The country for which to generate names. countrieseurope : pd.DataFrame @@ -101,131 +123,100 @@ def invoke_bedrock( """ logging.info("Calling Bedrock ...") # call bedrock model - formatted_prompt = prompt.format(n_user_names=n_user_names, country=country) + formatted_prompt = prompt.format(n_data_points=n_data_points, country=country) messages = [{"role":"user", "content":[{"text":formatted_prompt}]}] logging.info(messages) - model_response = model.prompt(model_id=model_id, user_prompt=formatted_prompt, system_prompt=system_prompt, max_gen_len=2048) - #model_response = model.converse(modelId=model_id, messages=messages, system=bedrock_config['system'], inference_config=bedrock_config['inferenceConfig']) + #model_response = model.prompt(model_id=model_id, user_prompt=formatted_prompt, system_prompt=system_prompt, max_gen_len=2048) + model_response = model.converse(modelId=model_id, messages=messages, system=bedrock_config['system'], inference_config=bedrock_config['inferenceConfig']) # split out answer text = model_response.split("")[1].split("")[0] # parse json try: - record_set = json.loads(text) + gen_data_list = json.loads(text) except json.JSONDecodeError as e: raise Exception(f"Error parsing JSON: {e}") logging.info("Processing results ...") # generate pandas dataframe - user_firstname_data = pd.Series(record_set["firstnames"], name="firstnames").to_frame().drop_duplicates(subset=["firstnames"]) - user_lastname_data = pd.Series(record_set["lastnames"], name="lastnames").to_frame().drop_duplicates(subset=["lastnames"]) - # add country - user_firstname_data['country'] = country - user_lastname_data['country'] = country - # join on country codes - llama_firstname_country_data = 
user_firstname_data.merge(right=countrieseurope, left_on='country', right_on='name', how='inner').drop(columns=['name']) - llama_lastname_country_data = user_lastname_data.merge(right=countrieseurope, left_on='country', right_on='name', how='inner').drop(columns=['name']) - # print shapes - logging.info(f"llama_firstname_country_data.shape: {llama_firstname_country_data.shape}") - logging.info(f"llama_lastname_country_data.shape: {llama_lastname_country_data.shape}") - # format output file paths - fpath_temp_llama_firstnames = cons.fpath_temp_llama_firstnames.format(country=country.lower()) - fpath_temp_llama_lastnames = cons.fpath_temp_llama_lastnames.format(country=country.lower()) - # check against previous iterations - tmp_firstname_country_data = pd.DataFrame() - tmp_lastname_country_data = pd.DataFrame() - if os.path.exists(fpath_temp_llama_firstnames): - tmp_firstname_country_data = pd.read_csv(fpath_temp_llama_firstnames, encoding="latin1") - if os.path.exists(fpath_temp_llama_lastnames): - tmp_lastname_country_data = pd.read_csv(fpath_temp_llama_lastnames, encoding="latin1") - # concatenate results - tmp_firstname_country_data = pd.concat(objs=[tmp_firstname_country_data, llama_firstname_country_data], axis=0, ignore_index=True) - tmp_lastname_country_data = pd.concat(objs=[tmp_lastname_country_data, llama_lastname_country_data], axis=0, ignore_index=True) + gen_dataframe = pd.Series(gen_data_list, name=data_point).drop_duplicates().to_frame() + gen_dataframe['country'] = country + gen_country_dataframe = pd.merge( + left=gen_dataframe, + right=countrieseurope.rename(columns={'name':'country'}), + left_on='country', + right_on='name', + how='inner' + ) # standardise names formatting standardise_text_lambda = lambda x: unidecode.unidecode(" ".join(x.lower().strip().split())) if pd.isna(x) else x - tmp_firstname_country_data["firstnames"] = tmp_firstname_country_data["firstnames"].apply(lambda x: standardise_text_lambda(x)) - 
tmp_lastname_country_data["lastnames"] = tmp_lastname_country_data["lastnames"].apply(lambda x: standardise_text_lambda(x)) - # deduplicate data - tmp_firstname_country_data = tmp_firstname_country_data.drop_duplicates(subset=["firstnames"]) - tmp_lastname_country_data = tmp_lastname_country_data.drop_duplicates(subset=["lastnames"]) - # print shapes - logging.info(f"tmp_firstname_country_data.shape: {tmp_firstname_country_data.shape}") - logging.info(f"tmp_lastname_country_data.shape: {tmp_lastname_country_data.shape}") - # save firstnames names data to temp directory (if pairwise firstnames have been created) - if tmp_firstname_country_data.shape[0] >= llama_firstname_country_data.shape[0]: - tmp_firstname_country_data.to_csv(fpath_temp_llama_firstnames, index=False, encoding="latin1") - logging.info(f"Wrote {fpath_temp_llama_firstnames} ...") - # save lastnames data to temp directory (if pairwise lastnames have been created) - if tmp_lastname_country_data.shape[0] >= llama_lastname_country_data.shape[0]: - tmp_lastname_country_data.to_csv(fpath_temp_llama_lastnames, index=False, encoding="latin1") - logging.info(f"Wrote {fpath_temp_llama_lastnames} ...") - return (tmp_firstname_country_data, tmp_lastname_country_data) - -def main(bedrock, model_id, run_bedrock=False): + gen_country_dataframe[data_point] = gen_country_dataframe[data_point].apply(lambda x: standardise_text_lambda(x)) + logging.info(f"gen_country_dataframe.shape: {gen_country_dataframe.shape}") + # save generated data + gen_country_dataframe.to_csv(country_fpath, index=False, encoding="latin1") + logging.info(f"Wrote {country_fpath} ...") + return gen_country_dataframe + +def main(bedrock, model_id, data_point, fpath_dict, run_bedrock=False): """ Docstring for main """ - # load countries, firstnames and surnames files countrieseurope = pd.read_csv(cons.fpath_countries_europe, usecols=['name', 'ISO numeric']) - orig_firstnames = pd.read_csv(cons.fpath_firstnames) - orig_surnames = 
pd.read_csv(cons.fpath_lastnames) - # determine file size - orig_filesize = int((orig_firstnames.shape[0] + orig_surnames.shape[0])/2) n_countries = countrieseurope.shape[0] - n_user_names = min(2, int(orig_filesize / n_countries)) - # generate user names - firstname_country_data, lastname_country_data, error_countries = [], [], [] + # set lists to collect generated data with + gen_country_dataframe_list, error_countries = [], [] # set countries list #countries_list = countrieseurope['name'].to_list() countries_list = ['Cyprus'] - # iterate over countries list for country in countries_list: logging.info(f"{country} ...") + country_fpath=fpath_dict['country_fpath'].format(country) try: if run_bedrock: # call bedrock model and generate user names data - tmp_firstname_country_data, tmp_lastname_country_data = invoke_bedrock(model=bedrock, model_id=model_id, n_user_names=n_user_names, country=country, countrieseurope=countrieseurope) + country_filter = (countrieseurope["name"] == country) + country_population = countrieseurope.loc[country_filter, "population"].iloc[0] + # set n data points for ai generator depending on type + if data_point in ("firstnames", "lastnames"): + n_data_points = int(np.log(country_population)**1.5) + elif data_point == "email_domains": + n_data_points = 5 + else: + raise ValueError(f"Invalid parameter data_point value {data_point}") + # invoke bedrock and generate data points + tmp_gen_country_data = invoke_bedrock( + model=bedrock, + model_id=model_id, + data_point=data_point, + n_data_points=n_data_points, + country=country, + countrieseurope=countrieseurope, + country_fpath=country_fpath + ) logging.info("Waiting ...") # wait 20 seconds before retrying time.sleep(20) else: - tmp_firstname_country_data = pd.read_csv(cons.fpath_temp_llama_firstnames.format(country=country.lower()), encoding="latin1") - tmp_lastname_country_data = pd.read_csv(cons.fpath_temp_llama_lastnames.format(country=country.lower()), encoding="latin1") + 
tmp_gen_country_data = pd.read_csv(country_fpath, encoding="latin1") # append to user country data - firstname_country_data.append(tmp_firstname_country_data) - lastname_country_data.append(tmp_lastname_country_data) + gen_country_dataframe_list.append(tmp_gen_country_data) except Exception as e: logging.info(e) error_countries.append(country) - # log if any countries failed to generate data if len(error_countries) > 0: logging.info(f"Failed to generated data for countries: {error_countries}") - - # load existing reference data - firstname_country_df = pd.read_csv(cons.fpath_llama_firstnames, encoding="latin1") - lastname_country_df = pd.read_csv(cons.fpath_llama_lastnames, encoding="latin1") - # append to country data lists - firstname_country_data.append(firstname_country_df) - lastname_country_data.append(lastname_country_df) # concatenate user country data together and deduplicate across firstnames and countries - output_firstname_country_df = pd.concat(firstname_country_data, axis=0, ignore_index=True) - output_lastname_country_df = pd.concat(lastname_country_data, axis=0, ignore_index=True) + output_gen_country_dataframe = pd.concat(gen_country_dataframe_list, axis=0, ignore_index=True) # sort and deduplicate output data - output_firstname_country_df = output_firstname_country_df.drop_duplicates(subset=["country","firstnames"]).sort_values(by=["country","firstnames"]) - output_lastname_country_df = output_lastname_country_df.drop_duplicates(subset=["country","lastnames"]).sort_values(by=["country","lastnames"]) - + sort_dedup_cols = ["country",data_point] + output_gen_country_dataframe = output_gen_country_dataframe.drop_duplicates(subset=sort_dedup_cols).sort_values(by=sort_dedup_cols) # write data to disk - if output_firstname_country_df['country'].nunique() == n_countries: - logging.info(f"output_firstname_country_df.shape: {output_firstname_country_df.shape}") - output_firstname_country_df.to_csv(cons.fpath_llama_firstnames, index=False, 
encoding="latin1") + if output_gen_country_dataframe['country'].nunique() == n_countries: + logging.info(f"output_gen_country_dataframe.shape: {output_gen_country_dataframe.shape}") + output_gen_country_dataframe.to_csv(fpath_dict["fpath"], index=False, encoding="latin1") else: logging.info("WARNING Insufficient first name data generated.") - if output_lastname_country_df['country'].nunique() == n_countries: - logging.info(f"output_lastname_country_df.shape: {output_lastname_country_df.shape}") - output_lastname_country_df.to_csv(cons.fpath_llama_lastnames, index=False, encoding="latin1") - else: - logging.info("WARNING Insufficient last name data generated.") lgr = logging.getLogger() lgr.setLevel(logging.INFO) @@ -252,5 +243,6 @@ def main(bedrock, model_id, run_bedrock=False): # create bedrock instance bedrock = Bedrock(bedrock_runtime=bedrock_runtime) # execute main programme - main(bedrock=bedrock, run_bedrock=True, model_id=model_id) + for data_point, fpath_dict in cons.llama_data_point_fpaths.items(): + main(bedrock=bedrock, model_id=model_id, data_point=data_point, fpath_dict=fpath_dict, run_bedrock=True) diff --git a/generator/cons.py b/generator/cons.py index 6fdb599..77a459e 100644 --- a/generator/cons.py +++ b/generator/cons.py @@ -17,6 +17,7 @@ fpath_arch_randomtelecomdata = os.path.join(subdir_data, 'arch', 'RandomTelecomPayments.csv') fpath_temp_llama_firstnames = os.path.join(subdir_data, 'temp', 'llama_firstnames_{country}.csv') fpath_temp_llama_lastnames = os.path.join(subdir_data, 'temp', 'llama_lastnames_{country}.csv') +fpath_temp_llama_email_domains = os.path.join(subdir_data, 'temp', 'llama_email_domains_{country}.csv') fpath_email_domain = os.path.join(subdir_data, 'ref', 'email-domains.csv') fpath_countrycrimeindex = os.path.join(subdir_data, 'ref', 'country_crime_index.csv') fpath_countries_europe = os.path.join(subdir_data, 'ref', 'Countries-Europe.csv') @@ -24,10 +25,17 @@ fpath_lastnames = os.path.join(subdir_data, 'ref', 
'last-names.txt') fpath_llama_firstnames = os.path.join(subdir_data, 'ref', 'llama_firstnames.csv') fpath_llama_lastnames = os.path.join(subdir_data, 'ref', 'llama_lastnames.csv') +fpath_llama_email_domains = os.path.join(subdir_data, 'ref', 'llama_email_domains.csv') fpath_smartphones = os.path.join(subdir_data, 'ref', 'smartphones.csv') fpath_unittest_user_data = os.path.join(subdir_unittest, 'user_data.parquet') fpath_unittest_transaction_data = os.path.join(subdir_unittest, 'transaction_data.parquet') fpath_aws_session_token = os.path.join(subdir_creds,'sessionToken.json') +# set data points generated by llama +llama_data_point_fpaths = { + "firstnames":{"fpath":fpath_llama_firstnames, "country_fpath":fpath_temp_llama_firstnames}, + "lastnames":{"fpath":fpath_llama_lastnames, "country_fpath":fpath_temp_llama_lastnames}, + "email_domain":{"fpath":fpath_llama_email_domains, "country_fpath":fpath_temp_llama_email_domains} + } # set url links to files available online url_european_populations = 'https://raw.githubusercontent.com/ajturner/acetate/master/places/Countries-Europe.csv' From 87cbf96319ff6c9e204f9b8363264ae8735898ce Mon Sep 17 00:00:00 2001 From: Oisin Date: Sun, 25 Jan 2026 12:12:21 +0000 Subject: [PATCH 03/15] Renamed generate bedrock data batch script --- .../batch/{gen_user_names_file.py => gen_bedrock_data.py} | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) rename generator/batch/{gen_user_names_file.py => gen_bedrock_data.py} (99%) diff --git a/generator/batch/gen_user_names_file.py b/generator/batch/gen_bedrock_data.py similarity index 99% rename from generator/batch/gen_user_names_file.py rename to generator/batch/gen_bedrock_data.py index cfce17f..f2af97c 100644 --- a/generator/batch/gen_user_names_file.py +++ b/generator/batch/gen_bedrock_data.py @@ -1,6 +1,5 @@ -# python generator/batch/gen_user_names_file.py +# python generator/batch/gen_bedrock_data.py -import os import json import boto3 from botocore.config import Config @@ -10,7 
+9,6 @@ import unidecode import pandas as pd import numpy as np -from typing import Dict sys.path.append("E:\\GitHub\\RandomTelecomPayments\\generator") From 1baf3eda009632c8a9d07955d68eb713dddb7ccf Mon Sep 17 00:00:00 2001 From: Oisin Date: Tue, 27 Jan 2026 09:36:09 +0000 Subject: [PATCH 04/15] Passing new llama email domain reference file through to User object creation. --- generator/app/gen_random_telecom_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generator/app/gen_random_telecom_data.py b/generator/app/gen_random_telecom_data.py index 28b0283..1c4c103 100644 --- a/generator/app/gen_random_telecom_data.py +++ b/generator/app/gen_random_telecom_data.py @@ -75,7 +75,7 @@ def gen_random_telecom_data( fpath_firstnames=cons.fpath_llama_firstnames, fpath_lastnames=cons.fpath_llama_lastnames, fpath_countries_europe=cons.fpath_countries_europe, - fpath_email_domain =cons.fpath_email_domain + fpath_email_domain =cons.fpath_llama_email_domains ) # generate random entity counts for each user From c827af09ba80843b18685e8915cd83de39b8dcb6 Mon Sep 17 00:00:00 2001 From: Oisin Date: Tue, 27 Jan 2026 09:36:51 +0000 Subject: [PATCH 05/15] Generalised function for generating user firstnames and lastnames using llama bedrock reference data. 
Applying generalised function to handle new llama email domains file --- generator/objects/User.py | 114 +++++++++----------------------------- 1 file changed, 25 insertions(+), 89 deletions(-) diff --git a/generator/objects/User.py b/generator/objects/User.py index 475d79a..2551966 100644 --- a/generator/objects/User.py +++ b/generator/objects/User.py @@ -17,10 +17,10 @@ def __init__( n_user_ids:int, start_date:str, end_date:str, - fpath_firstnames:str=cons.fpath_firstnames, - fpath_lastnames:str=cons.fpath_lastnames, + fpath_firstnames:str=cons.fpath_llama_firstnames, + fpath_lastnames:str=cons.fpath_llama_lastnames, fpath_countries_europe:str=cons.fpath_countries_europe, - fpath_email_domain :str=cons.fpath_email_domain , + fpath_email_domain :str=cons.fpath_llama_email_domains , ): """ The randomly generated user data model object @@ -34,13 +34,13 @@ def __init__( end_date : str The end date to generate users till fpath_firstnames : str - The full file path to the first names reference data, default is cons.fpath_firstnames. + The full file path to the first names reference data, default is cons.fpath_llama_firstnames. fpath_lastnames : str - The full file path to the last names reference data, default is cons.fpath_lastnames. + The full file path to the last names reference data, default is cons.fpath_llama_lastnames. fpath_countries_europe : str The full file path to the europe countries reference data, default is cons.fpath_countries_europe. fpath_email_domain : str - The full file path to the email domain reference data, default is cons.fpath_email_domain . + The full file path to the email domain reference data, default is cons.fpath_llama_email_domains . 
Attributes ---------- @@ -75,111 +75,47 @@ def __init__( self.fpath_firstnames = fpath_firstnames self.fpath_lastnames = fpath_lastnames self.fpath_countries_europe = fpath_countries_europe - self.fpath_email_domain = fpath_email_domain + self.fpath_email_domain = fpath_email_domain self.lam = cons.data_model_poisson_params["user"]["lambda"] self.power = cons.data_model_poisson_params["user"]["power"] self.user_ids_cnts_dict = gen_idhash_cnt_dict(idhash_type="id", n=self.n_user_ids, lam=self.lam, power=self.power) self.user_ids = list(self.user_ids_cnts_dict.keys()) self.user_ids_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.user_ids_cnts_dict) self.user_ids_country_code_dict = gen_country_codes_dict(idhashes=self.user_ids, fpath_countries_europe=self.fpath_countries_europe) - self.user_ids_firstname_dict = self.gen_user_firstname(fpath_firstnames=self.fpath_firstnames) - self.user_ids_lastname_dict = self.gen_user_lastname(fpath_lastnames=self.fpath_lastnames) - self.user_ids_email_domain_dict = self.gen_user_email_domain(fpath_email_domain=self.fpath_email_domain) + self.user_ids_firstname_dict = self.gen_user_bedrock_data(fpath_firstnames=self.fpath_firstnames, sample_column_name="firstnames") + self.user_ids_lastname_dict = self.gen_user_bedrock_data(fpath_lastnames=self.fpath_lastnames, sample_column_name="lastnames") + self.user_ids_email_domain_dict = self.gen_user_bedrock_data(fpath_lastnames=self.fpath_email_domain, sample_column_name="email_domains") self.user_ids_dates_dict = gen_dates_dict(idhashes=self.user_ids, start_date=self.start_date, end_date=self.end_date) @beartype - def gen_user_firstname( + def gen_user_bedrock_data( self, - fpath_firstnames:str, + fpath_bedrock_data:str, + sample_column_name:str, ) -> Dict[str, str]: """ - Generates a dictionary of random user id first names + Generates a dictionary of random user bedrock data, e.g. 
firstnames or lastnames Parameters ---------- - fpath_firstnames : str - The file path to the first names reference file + fpath_bedrock_data : str + The file path to the bedrock data reference file + sample_column_name : str + The column name to sample from in the bedrock data reference file Returns ------- Dict[str, str] - A dictionary of user id first names + A dictionary of user id bedrock data """ # load in list of first names - first_name_data = pd.read_csv(fpath_firstnames) + bedrock_data = pd.read_csv(fpath_bedrock_data) # randomly sample names firstnames according to country code and counts country_code_dataframe = pd.Series(self.user_ids_country_code_dict, name="country_code").to_frame().reset_index().rename(columns={"index":"user_ids"}).assign(count=1) country_codes_cnt = country_code_dataframe.groupby(by="country_code").agg({"user_ids":list,"count":"sum"}).reset_index() - country_codes_cnt["names"] = country_codes_cnt.apply(lambda series: first_name_data.loc[(first_name_data["ISO numeric"] == series["country_code"]), "firstnames"].sample(n=series["count"], replace=True).to_list(), axis=1) - # create the key value pairs mapping user id to firstname - user_ids_names_pairs = country_codes_cnt.apply(lambda series: dict(zip(series["user_ids"], series["names"])), axis=1).to_list() + country_codes_cnt["sample"] = country_codes_cnt.apply(lambda series: bedrock_data.loc[(bedrock_data["ISO numeric"] == series["country_code"]), sample_column_name].sample(n=series["count"], replace=True).to_list(), axis=1) + # create the key value pairs mapping user id to bedrock data points + user_ids_bedrock_pairs = country_codes_cnt.apply(lambda series: dict(zip(series["user_ids"], series["sample"])), axis=1).to_list() # convert key value pairs to dict - user_ids_firstname_dict = pd.concat([pd.Series(d) for d in user_ids_names_pairs])[country_code_dataframe["user_ids"]].to_dict() - return user_ids_firstname_dict - - @beartype - def gen_user_lastname( - self, - 
fpath_lastnames:str, - ) -> Dict[str, str]: - """ - Generates a dictionary of random user id last names. - - Parameters - ---------- - fpath_lastnames : str - The file path to the last names reference file. - - Returns - ------- - Dict[str, str] - A dictionary of user id last names. - """ - # load in list of last names - last_name_data = pd.read_csv(fpath_lastnames) - # randomly sample names firstnames according to country code and counts - country_code_dataframe = pd.Series(self.user_ids_country_code_dict, name="country_code").to_frame().reset_index().rename(columns={"index":"user_ids"}).assign(count=1) - country_codes_cnt = country_code_dataframe.groupby(by="country_code").agg({"user_ids":list,"count":"sum"}).reset_index() - country_codes_cnt["names"] = country_codes_cnt.apply(lambda series: last_name_data.loc[(last_name_data["ISO numeric"] == series["country_code"]), "lastnames"].sample(n=series["count"], replace=True).to_list(), axis=1) - # create the key value pairs mapping user id to firstname - user_ids_names_pairs = country_codes_cnt.apply(lambda series: dict(zip(series["user_ids"], series["names"])), axis=1).to_list() - # convert key value pairs to dict - user_ids_lastname_dict = pd.concat([pd.Series(d) for d in user_ids_names_pairs])[country_code_dataframe["user_ids"]].to_dict() - return user_ids_lastname_dict - - @beartype - def gen_user_email_domain( - self, - fpath_email_domain:str, - ) -> Dict[str, str]: - """ - Generates a dictionary of random user id email domains - - Parameters - ---------- - fpath_email_domain : str - The file path to the email domains reference file - - Returns - ------- - Dict[str, str] - A dictionary of user id email domains - """ - # load domain names data - email_domain_data = pd.read_csv(fpath_email_domain, index_col=0) - # calculate the proportion of email domains - email_domain_data["proportion"] = email_domain_data["proportion"].divide(email_domain_data["proportion"].sum()) - # convert email domain proportions to a 
dictionary - email_domain_dict = email_domain_data.set_index("domain").to_dict()["proportion"] - # randomly choose the email domains based on proportions - user_email_domain_list = list( - np.random.choice( - a=list(email_domain_dict.keys()), - p=list(email_domain_dict.values()), - replace=True, - size=len(self.user_ids), - ) - ) - # return the user ids email domains - user_ids_email_domain_dict = dict(zip(self.user_ids, user_email_domain_list)) - return user_ids_email_domain_dict + user_ids_bedrock_dict = pd.concat([pd.Series(d) for d in user_ids_bedrock_pairs])[country_code_dataframe["user_ids"]].to_dict() + return user_ids_bedrock_dict \ No newline at end of file From b3a3c60b2720799a38b561d7a9b1b75621feedde Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 09:08:32 +0000 Subject: [PATCH 06/15] Readded gen domain email function to user object. Replace firstname and lastname with first_name and last_name --- generator/app/gen_random_telecom_data.py | 7 +- generator/app/gen_trans_data.py | 9 +-- generator/app/gen_user_data.py | 4 +- generator/batch/gen_bedrock_data.py | 14 ++-- generator/cons.py | 33 ++++---- generator/objects/User.py | 78 ++++++++++++++----- .../unittests/app/test_gen_user_trans_data.py | 8 +- generator/unittests/objects/test_User.py | 42 +++++----- .../utilities/test_gen_obj_idhash_series.py | 6 +- .../test_gen_random_entity_counts.py | 6 +- 10 files changed, 124 insertions(+), 83 deletions(-) diff --git a/generator/app/gen_random_telecom_data.py b/generator/app/gen_random_telecom_data.py index 1c4c103..363dc19 100644 --- a/generator/app/gen_random_telecom_data.py +++ b/generator/app/gen_random_telecom_data.py @@ -72,10 +72,11 @@ def gen_random_telecom_data( n_user_ids=programmeparams.n_users, start_date=programmeparams.registration_start_date, end_date=programmeparams.registration_end_date, - fpath_firstnames=cons.fpath_llama_firstnames, - fpath_lastnames=cons.fpath_llama_lastnames, + fpath_first_names=cons.fpath_llama_first_names, + 
fpath_last_names=cons.fpath_llama_last_names, fpath_countries_europe=cons.fpath_countries_europe, - fpath_email_domain =cons.fpath_llama_email_domains + fpath_email_domain=cons.fpath_email_domain, + fpath_bedrock_email_domain=cons.fpath_llama_email_domains ) # generate random entity counts for each user diff --git a/generator/app/gen_trans_data.py b/generator/app/gen_trans_data.py index 363941d..e9f8bcc 100644 --- a/generator/app/gen_trans_data.py +++ b/generator/app/gen_trans_data.py @@ -118,14 +118,7 @@ def gen_trans_data( trans_data[['transaction_status', 'transaction_error_code']] = trans_data.apply(lambda series: gen_trans_status(series = series, rejection_rates_dict = rejection_rates_dict), result_type = 'expand', axis = 1) # order columns and sort rows by transaction date - user_cols = ['userid', 'firstname', 'lastname', 'registration_date', 'registration_country_code', 'uid', 'email_domain'] - device_cols = ['device_hash', 'device_type'] - card_cols = ['card_hash', 'card_type', 'card_country_code'] - ip_cols = ['ip_hash', 'ip_country_code'] - app_cols = ['application_hash'] - trans_cols = ['transaction_hash', 'transaction_date', 'transaction_amount', 'transaction_payment_method', 'card_payment_channel', 'transaction_status', 'transaction_error_code'] - itr_cols = ['itr_hash'] - col_order = user_cols + device_cols + card_cols + ip_cols + app_cols + trans_cols + itr_cols + col_order = cons.user_cols + cons. 
device_cols + cons.card_cols + cons.ip_cols + cons.app_cols + cons.trans_cols + cons.itr_cols trans_data = trans_data[col_order].sort_values(by = 'transaction_date').reset_index(drop = True) return trans_data \ No newline at end of file diff --git a/generator/app/gen_user_data.py b/generator/app/gen_user_data.py index 9c5fa94..96cc651 100644 --- a/generator/app/gen_user_data.py +++ b/generator/app/gen_user_data.py @@ -50,8 +50,8 @@ def gen_user_data( # take a deep copy of the data user_data = random_entity_counts.copy() # add user data - user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_firstname_dict, idhash_key_name='uid', idhash_val_name='firstname') - user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_lastname_dict, idhash_key_name='uid', idhash_val_name='lastname') + user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_first_name_dict, idhash_key_name='uid', idhash_val_name='first_name') + user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_last_name_dict, idhash_key_name='uid', idhash_val_name='last_name') user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_dates_dict, idhash_key_name='uid', idhash_val_name='registration_date') user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_country_code_dict, idhash_key_name='uid', idhash_val_name='registration_country_code_alpha') user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_email_domain_dict, idhash_key_name='uid', idhash_val_name='email_domain') diff --git a/generator/batch/gen_bedrock_data.py b/generator/batch/gen_bedrock_data.py index f2af97c..241828b 100644 --- a/generator/batch/gen_bedrock_data.py +++ b/generator/batch/gen_bedrock_data.py @@ -51,7 +51,7 @@ system_email_prompt = """ """ -firstname_prompt = 'Generate {n_data_points} first names for people from the country "{country}"' +first_name_prompt = 'Generate 
{n_data_points} first names for people from the country "{country}"' surname_prompt = 'Generate {n_data_points} last names for people from the country "{country}"' email_domain_prompt = 'Generate {n_data_points} popular email domains names for people from the country "{country}"' @@ -102,13 +102,13 @@ def invoke_bedrock( ------- tuple: A tuple containing two pandas DataFrames: - - tmp_firstname_country_data (pd.DataFrame): DataFrame with deduplicated and standardized first names along with country information. - - tmp_lastname_country_data (pd.DataFrame): DataFrame with deduplicated and standardized last names along with country information. + - tmp_first_name_country_data (pd.DataFrame): DataFrame with deduplicated and standardized first names along with country information. + - tmp_last_name_country_data (pd.DataFrame): DataFrame with deduplicated and standardized last names along with country information. Raises ------ json.JSONDecodeError: If the model response cannot be parsed as JSON. - KeyError: If the expected keys ("firstnames", "lastnames") are missing from the JSON response. + KeyError: If the expected keys ("first_names", "last_names") are missing from the JSON response. Exception: If the merge with country data fails or file I/O operations encounter errors. 
Notes @@ -157,7 +157,7 @@ def main(bedrock, model_id, data_point, fpath_dict, run_bedrock=False): """ Docstring for main """ - # load countries, firstnames and surnames files + # load countries, first_names and surnames files countrieseurope = pd.read_csv(cons.fpath_countries_europe, usecols=['name', 'ISO numeric']) n_countries = countrieseurope.shape[0] # set lists to collect generated data with @@ -175,7 +175,7 @@ def main(bedrock, model_id, data_point, fpath_dict, run_bedrock=False): country_filter = (countrieseurope["name"] == country) country_population = countrieseurope.loc[country_filter, "population"].iloc[0] # set n data points for ai generator depending on type - if data_point in ("firstnames", "lastnames"): + if data_point in ("first_names", "last_names"): n_data_points = int(np.log(country_population)**1.5) elif data_point == "email_domains": n_data_points = 5 @@ -204,7 +204,7 @@ def main(bedrock, model_id, data_point, fpath_dict, run_bedrock=False): # log if any countries failed to generate data if len(error_countries) > 0: logging.info(f"Failed to generated data for countries: {error_countries}") - # concatenate user country data together and deduplicate across firstnames and countries + # concatenate user country data together and deduplicate across first_names and countries output_gen_country_dataframe = pd.concat(gen_country_dataframe_list, axis=0, ignore_index=True) # sort and deduplicate output data sort_dedup_cols = ["country",data_point] diff --git a/generator/cons.py b/generator/cons.py index 77a459e..de5225f 100644 --- a/generator/cons.py +++ b/generator/cons.py @@ -15,16 +15,16 @@ fpath_randomtelecomtransdata = os.path.join(subdir_data,'RandomTelecomPayments.csv') fpath_randomtelecomusersdata = os.path.join(subdir_data,'RandomTelecomUsers.parquet') fpath_arch_randomtelecomdata = os.path.join(subdir_data, 'arch', 'RandomTelecomPayments.csv') -fpath_temp_llama_firstnames = os.path.join(subdir_data, 'temp', 'llama_firstnames_{country}.csv') 
-fpath_temp_llama_lastnames = os.path.join(subdir_data, 'temp', 'llama_lastnames_{country}.csv') +fpath_temp_llama_first_names = os.path.join(subdir_data, 'temp', 'llama_first_names_{country}.csv') +fpath_temp_llama_last_names = os.path.join(subdir_data, 'temp', 'llama_last_names_{country}.csv') fpath_temp_llama_email_domains = os.path.join(subdir_data, 'temp', 'llama_email_domains_{country}.csv') fpath_email_domain = os.path.join(subdir_data, 'ref', 'email-domains.csv') fpath_countrycrimeindex = os.path.join(subdir_data, 'ref', 'country_crime_index.csv') fpath_countries_europe = os.path.join(subdir_data, 'ref', 'Countries-Europe.csv') -fpath_firstnames = os.path.join(subdir_data, 'ref', 'first-names.txt') -fpath_lastnames = os.path.join(subdir_data, 'ref', 'last-names.txt') -fpath_llama_firstnames = os.path.join(subdir_data, 'ref', 'llama_firstnames.csv') -fpath_llama_lastnames = os.path.join(subdir_data, 'ref', 'llama_lastnames.csv') +fpath_first_names = os.path.join(subdir_data, 'ref', 'first-names.txt') +fpath_last_names = os.path.join(subdir_data, 'ref', 'last-names.txt') +fpath_llama_first_names = os.path.join(subdir_data, 'ref', 'llama_first_names.csv') +fpath_llama_last_names = os.path.join(subdir_data, 'ref', 'llama_last_names.csv') fpath_llama_email_domains = os.path.join(subdir_data, 'ref', 'llama_email_domains.csv') fpath_smartphones = os.path.join(subdir_data, 'ref', 'smartphones.csv') fpath_unittest_user_data = os.path.join(subdir_unittest, 'user_data.parquet') @@ -32,8 +32,8 @@ fpath_aws_session_token = os.path.join(subdir_creds,'sessionToken.json') # set data points generated by llama llama_data_point_fpaths = { - "firstnames":{"fpath":fpath_llama_firstnames, "country_fpath":fpath_temp_llama_firstnames}, - "lastnames":{"fpath":fpath_llama_lastnames, "country_fpath":fpath_temp_llama_lastnames}, + "first_names":{"fpath":fpath_llama_first_names, "country_fpath":fpath_temp_llama_first_names}, + "last_names":{"fpath":fpath_llama_last_names, 
"country_fpath":fpath_temp_llama_last_names}, "email_domain":{"fpath":fpath_llama_email_domains, "country_fpath":fpath_temp_llama_email_domains} } @@ -84,11 +84,11 @@ data_model_poisson_params = {'user':{'lambda':20, 'power':1}, 'device':{'lambda':0.2, 'power':2}, 'card':{'lambda':0.1, 'power':2}, 'ip':{'lambda':1.3, 'power':2}, 'application':{'lambda':1, 'power':2}, 'transaction':{'lambda':5, 'power':2}} data_model_shared_entities_dict = {'ip':0.05, 'card':0.005, 'device':0.01} data_model_null_rates = {'card':0.05} -data_model_card_types_dict = {'visa':0.5, 'mastercard':0.5} -data_model_payment_channels = {'paypal':0.4, 'adyen':0.15, 'appstore':0.25, 'worldpay':0.15, 'docomo':0.05} -data_model_transaction_status = {'successful':0.94, 'pending':0.03, 'rejected':0.03} +data_model_card_types_dict = {'Visa':0.5, 'Mastercard':0.5} +data_model_payment_channels = {'PayPal':0.4, 'Adyen':0.15, 'AppStore':0.25, 'WorldPay':0.15, 'Docomo':0.05} +data_model_transaction_status = {'Successful':0.94, 'Pending':0.03, 'Rejected':0.03} data_model_inconsistent_country_codes_rejection_rate = {1:0.001, 2:0.005, 3:0.01} -data_model_non_card_trans_methods = {'wallet':0.95, 'points':0.05} +data_model_non_card_trans_methods = {'Wallet':0.95, 'Points':0.05} data_model_rejection_codes_fraud = {'E900:ConnectionTimeout':0.1, 'E901:SuspectedFraud':0.55, 'E902:AuthenicationFailure':0.2, 'E903:UserCancelled':0.05, 'E904:InsufficientFunds':0.1} data_model_rejection_codes_connection = {'E900:ConnectionTimeout':0.45, 'E901:SuspectedFraud':0.1, 'E902:AuthenicationFailure':0.2, 'E903:UserCancelled':0.15, 'E904:InsufficientFunds':0.1} data_model_rejection_codes_user = {'E900:ConnectionTimeout':0.05, 'E901:SuspectedFraud':0.1, 'E902:AuthenicationFailure':0.1, 'E903:UserCancelled':0.45, 'E904:InsufficientFunds':0.3} @@ -96,4 +96,11 @@ data_model_rejection_codes_authentication = {'E900:ConnectionTimeout':0.25, 'E901:SuspectedFraud':0.05, 'E902:AuthenicationFailure':0.45, 'E903:UserCancelled':0.15, 
'E904:InsufficientFunds':0.1} # set lists of generator object types -object_types = ["device","card","ip","transaction","application"] \ No newline at end of file +object_types = ["device","card","ip","transaction","application"] +user_cols = ['userid', 'first_name', 'last_name', 'registration_date', 'registration_country_code', 'uid', 'email_domain'] +device_cols = ['device_hash', 'device_type'] +card_cols = ['card_hash', 'card_type', 'card_country_code'] +ip_cols = ['ip_hash', 'ip_country_code'] +app_cols = ['application_hash'] +trans_cols = ['transaction_hash', 'transaction_date', 'transaction_amount', 'transaction_payment_method', 'card_payment_channel', 'transaction_status', 'transaction_error_code'] +itr_cols = ['itr_hash'] \ No newline at end of file diff --git a/generator/objects/User.py b/generator/objects/User.py index 2551966..40139b8 100644 --- a/generator/objects/User.py +++ b/generator/objects/User.py @@ -17,10 +17,11 @@ def __init__( n_user_ids:int, start_date:str, end_date:str, - fpath_firstnames:str=cons.fpath_llama_firstnames, - fpath_lastnames:str=cons.fpath_llama_lastnames, + fpath_first_names:str=cons.fpath_llama_first_names, + fpath_last_names:str=cons.fpath_llama_last_names, fpath_countries_europe:str=cons.fpath_countries_europe, - fpath_email_domain :str=cons.fpath_llama_email_domains , + fpath_email_domain:str=cons.fpath_email_domain, + fpath_bedrock_email_domain:str=cons.fpath_llama_email_domains, ): """ The randomly generated user data model object @@ -33,10 +34,10 @@ def __init__( The start date to generate users from end_date : str The end date to generate users till - fpath_firstnames : str - The full file path to the first names reference data, default is cons.fpath_llama_firstnames. - fpath_lastnames : str - The full file path to the last names reference data, default is cons.fpath_llama_lastnames. + fpath_first_names : str + The full file path to the first names reference data, default is cons.fpath_llama_first_names. 
+ fpath_last_names : str + The full file path to the last names reference data, default is cons.fpath_llama_last_names. fpath_countries_europe : str The full file path to the europe countries reference data, default is cons.fpath_countries_europe. fpath_email_domain : str @@ -58,9 +59,9 @@ def __init__( The user id counts dictionary user_ids_props_dict : Dict[str, float] The user id proportions dictionary - user_ids_firstname_dict : Dict[str, str] + user_ids_first_name_dict : Dict[str, str] The user id first names dictionary - user_ids_lastname_dict : Dict[str, str] + user_ids_last_name_dict : Dict[str, str] The user id last names dictionary user_ids_country_code_dict : Dict[str, str] The user id country codes dictionary @@ -72,29 +73,30 @@ def __init__( self.n_user_ids = n_user_ids self.start_date = start_date self.end_date = end_date - self.fpath_firstnames = fpath_firstnames - self.fpath_lastnames = fpath_lastnames + self.fpath_first_names = fpath_first_names + self.fpath_last_names = fpath_last_names self.fpath_countries_europe = fpath_countries_europe self.fpath_email_domain = fpath_email_domain + self.fpath_bedrock_email_domain = fpath_bedrock_email_domain self.lam = cons.data_model_poisson_params["user"]["lambda"] self.power = cons.data_model_poisson_params["user"]["power"] self.user_ids_cnts_dict = gen_idhash_cnt_dict(idhash_type="id", n=self.n_user_ids, lam=self.lam, power=self.power) self.user_ids = list(self.user_ids_cnts_dict.keys()) self.user_ids_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.user_ids_cnts_dict) self.user_ids_country_code_dict = gen_country_codes_dict(idhashes=self.user_ids, fpath_countries_europe=self.fpath_countries_europe) - self.user_ids_firstname_dict = self.gen_user_bedrock_data(fpath_firstnames=self.fpath_firstnames, sample_column_name="firstnames") - self.user_ids_lastname_dict = self.gen_user_bedrock_data(fpath_lastnames=self.fpath_lastnames, sample_column_name="lastnames") - self.user_ids_email_domain_dict = 
self.gen_user_bedrock_data(fpath_lastnames=self.fpath_email_domain, sample_column_name="email_domains") + self.user_ids_first_name_dict = self.gen_user_bedrock_name_data(fpath_bedrock_data=self.fpath_first_names, sample_column_name="first_names") + self.user_ids_last_name_dict = self.gen_user_bedrock_name_data(fpath_bedrock_data=self.fpath_last_names, sample_column_name="last_names") + self.user_ids_email_domain_dict = self.gen_user_bedrock_email_domain(fpath_email_domain=self.fpath_email_domain, fpath_bedrock_email_domain=self.fpath_bedrock_email_domain) self.user_ids_dates_dict = gen_dates_dict(idhashes=self.user_ids, start_date=self.start_date, end_date=self.end_date) @beartype - def gen_user_bedrock_data( + def gen_user_bedrock_name_data( self, fpath_bedrock_data:str, sample_column_name:str, ) -> Dict[str, str]: """ - Generates a dictionary of random user bedrock data, e.g. firstnames or lastnames + Generates a dictionary of random user bedrock data, e.g. first_names or last_names Parameters ---------- @@ -110,12 +112,50 @@ def gen_user_bedrock_data( """ # load in list of first names bedrock_data = pd.read_csv(fpath_bedrock_data) - # randomly sample names firstnames according to country code and counts + # randomly sample names first_names according to country code and counts country_code_dataframe = pd.Series(self.user_ids_country_code_dict, name="country_code").to_frame().reset_index().rename(columns={"index":"user_ids"}).assign(count=1) country_codes_cnt = country_code_dataframe.groupby(by="country_code").agg({"user_ids":list,"count":"sum"}).reset_index() - country_codes_cnt["sample"] = country_codes_cnt.apply(lambda series: bedrock_data.loc[(bedrock_data["ISO numeric"] == series["country_code"]), sample_column_name].sample(n=series["count"],
replace=True, weights=None).to_list(), axis=1) # create the key value pairs mapping user id to bedrock data points user_ids_bedrock_pairs = country_codes_cnt.apply(lambda series: dict(zip(series["user_ids"], series["sample"])), axis=1).to_list() # convert key value pairs to dict user_ids_bedrock_dict = pd.concat([pd.Series(d) for d in user_ids_bedrock_pairs])[country_code_dataframe["user_ids"]].to_dict() - return user_ids_bedrock_dict \ No newline at end of file + return user_ids_bedrock_dict + + @beartype + def gen_user_bedrock_email_domain( + self, + fpath_email_domain:str, + fpath_bedrock_email_domain:str, + ) -> Dict[str, str]: + """ + Generates a dictionary of random user id email domains + + Parameters + ---------- + fpath_email_domain : str + The file path to the email domains reference file + + Returns + ------- + Dict[str, str] + A dictionary of user id email domains + """ + # load domain names data + email_domain_data = pd.read_csv(fpath_email_domain, index_col=0) + # calculate the proportion of email domains + email_domain_data["proportion"] = email_domain_data["proportion"].divide(email_domain_data["proportion"].sum()) + # convert email domain proportions to a dictionary + email_domain_dict = email_domain_data.set_index("domain").to_dict()["proportion"] + # randomly choose the email domains based on proportions + user_email_domain_list = list( + np.random.choice( + a=list(email_domain_dict.keys()), + p=list(email_domain_dict.values()), + replace=True, + size=len(self.user_ids), + ) + ) + # return the user ids email domains + user_ids_email_domain_dict = dict(zip(self.user_ids, user_email_domain_list)) + return user_ids_email_domain_dict \ No newline at end of file diff --git a/generator/unittests/app/test_gen_user_trans_data.py b/generator/unittests/app/test_gen_user_trans_data.py index 540bb1c..6bef4c7 100644 --- a/generator/unittests/app/test_gen_user_trans_data.py +++ b/generator/unittests/app/test_gen_user_trans_data.py @@ -34,8 +34,8 @@ 
np.random.seed(seed=programmeparams.random_seed) # create relative file paths -fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1] -fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1] +fpath_first_names = '.' + cons.fpath_llama_first_names.split(cons.fpath_repo_dir)[1] +fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1] fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] fpath_smartphones = '.' + cons.fpath_smartphones.split(cons.fpath_repo_dir)[1] @@ -48,8 +48,8 @@ n_user_ids=programmeparams.n_users, start_date=programmeparams.registration_start_date, end_date=programmeparams.registration_end_date, - fpath_firstnames=fpath_firstnames, - fpath_lastnames=fpath_lastnames, + fpath_first_names=fpath_first_names, + fpath_last_names=fpath_last_names, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain ) diff --git a/generator/unittests/objects/test_User.py b/generator/unittests/objects/test_User.py index 92a30bd..4471a71 100644 --- a/generator/unittests/objects/test_User.py +++ b/generator/unittests/objects/test_User.py @@ -21,13 +21,13 @@ "4264861381989413": 0.20212765957446807, "6720317315593519": 0.2765957446808511, } -exp_user_ids_firstname_dict = { +exp_user_ids_first_name_dict = { "6374692674377254": "simone", "1751409580926382": "francesca", "4264861381989413": "igor", "6720317315593519": "beckett", } -exp_user_ids_lastname_dict = { +exp_user_ids_last_name_dict = { "6374692674377254": "de filippo", "1751409580926382": "gagliardi", "4264861381989413": "lupu", @@ -59,16 +59,16 @@ random.seed(cons.unittest_seed) np.random.seed(cons.unittest_seed) -fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1] -fpath_lastnames = '.' 
+ cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1] +fpath_first_names = '.' + cons.fpath_llama_first_names.split(cons.fpath_repo_dir)[1] +fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1] fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] -user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) +user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_first_names=fpath_first_names, fpath_last_names=fpath_last_names, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) obs_user_ids_cnts_dict = user_object.user_ids_cnts_dict obs_user_ids_props_dict = user_object.user_ids_props_dict -obs_user_ids_firstname_dict = user_object.user_ids_firstname_dict -obs_user_ids_lastname_dict = user_object.user_ids_lastname_dict +obs_user_ids_first_name_dict = user_object.user_ids_first_name_dict +obs_user_ids_last_name_dict = user_object.user_ids_last_name_dict obs_user_ids_country_code_dict = user_object.user_ids_country_code_dict obs_user_ids_email_domain_dict = user_object.user_ids_email_domain_dict obs_user_ids_dates_dict = user_object.user_ids_dates_dict @@ -86,10 +86,10 @@ def setUp(self): self.obs_user_ids_cnts_dict = obs_user_ids_cnts_dict self.exp_user_ids_props_dict = exp_user_ids_props_dict self.obs_user_ids_props_dict = obs_user_ids_props_dict - self.exp_user_ids_firstname_dict = exp_user_ids_firstname_dict - self.obs_user_ids_firstname_dict = obs_user_ids_firstname_dict - self.exp_user_ids_lastname_dict = exp_user_ids_lastname_dict - self.obs_user_ids_lastname_dict = obs_user_ids_lastname_dict + self.exp_user_ids_first_name_dict = 
exp_user_ids_first_name_dict + self.obs_user_ids_first_name_dict = obs_user_ids_first_name_dict + self.exp_user_ids_last_name_dict = exp_user_ids_last_name_dict + self.obs_user_ids_last_name_dict = obs_user_ids_last_name_dict self.exp_user_ids_country_code_dict = exp_user_ids_country_code_dict self.obs_user_ids_country_code_dict = obs_user_ids_country_code_dict self.exp_user_ids_email_domain_dict = exp_user_ids_email_domain_dict @@ -108,8 +108,8 @@ def setUp(self): def test_type(self): self.assertEqual(type(self.obs_user_ids_cnts_dict), type(self.exp_user_ids_cnts_dict)) self.assertEqual(type(self.obs_user_ids_props_dict), type(self.exp_user_ids_props_dict)) - self.assertEqual(type(self.obs_user_ids_firstname_dict),type(self.exp_user_ids_firstname_dict),) - self.assertEqual(type(self.obs_user_ids_lastname_dict), type(self.exp_user_ids_lastname_dict)) + self.assertEqual(type(self.obs_user_ids_first_name_dict),type(self.exp_user_ids_first_name_dict),) + self.assertEqual(type(self.obs_user_ids_last_name_dict), type(self.exp_user_ids_last_name_dict)) self.assertEqual(type(self.obs_user_ids_country_code_dict),type(self.exp_user_ids_country_code_dict),) self.assertEqual(type(self.obs_user_ids_email_domain_dict),type(self.exp_user_ids_email_domain_dict),) self.assertEqual(type(self.obs_user_ids_dates_dict), type(self.exp_user_ids_dates_dict)) @@ -121,8 +121,8 @@ def test_type(self): def test_len(self): self.assertEqual(len(self.obs_user_ids_cnts_dict), len(self.exp_user_ids_cnts_dict)) self.assertEqual(len(self.obs_user_ids_props_dict), len(self.exp_user_ids_props_dict)) - self.assertEqual(len(self.obs_user_ids_firstname_dict), len(self.exp_user_ids_firstname_dict)) - self.assertEqual(len(self.obs_user_ids_lastname_dict), len(self.exp_user_ids_lastname_dict)) + self.assertEqual(len(self.obs_user_ids_first_name_dict), len(self.exp_user_ids_first_name_dict)) + self.assertEqual(len(self.obs_user_ids_last_name_dict), len(self.exp_user_ids_last_name_dict)) 
self.assertEqual(len(self.obs_user_ids_country_code_dict),len(self.exp_user_ids_country_code_dict),) self.assertEqual(len(self.obs_user_ids_email_domain_dict),len(self.exp_user_ids_email_domain_dict),) self.assertEqual(len(self.obs_user_ids_dates_dict), len(self.exp_user_ids_dates_dict)) @@ -130,8 +130,8 @@ def test_len(self): def test_keys(self): self.assertEqual(list(self.obs_user_ids_cnts_dict.keys()),list(self.exp_user_ids_cnts_dict.keys()),) self.assertEqual(list(self.obs_user_ids_props_dict.keys()),list(self.exp_user_ids_props_dict.keys()),) - self.assertEqual(list(self.obs_user_ids_firstname_dict.keys()),list(self.exp_user_ids_firstname_dict.keys()),) - self.assertEqual(list(self.obs_user_ids_lastname_dict.keys()),list(self.exp_user_ids_lastname_dict.keys()),) + self.assertEqual(list(self.obs_user_ids_first_name_dict.keys()),list(self.exp_user_ids_first_name_dict.keys()),) + self.assertEqual(list(self.obs_user_ids_last_name_dict.keys()),list(self.exp_user_ids_last_name_dict.keys()),) self.assertEqual(list(self.obs_user_ids_country_code_dict.keys()),list(self.exp_user_ids_country_code_dict.keys()),) self.assertEqual(list(self.obs_user_ids_email_domain_dict.keys()),list(self.exp_user_ids_email_domain_dict.keys()),) self.assertEqual(list(self.obs_user_ids_dates_dict.keys()),list(self.exp_user_ids_dates_dict.keys()),) @@ -139,8 +139,8 @@ def test_keys(self): def test_values(self): self.assertEqual(list(self.obs_user_ids_cnts_dict.values()),list(self.exp_user_ids_cnts_dict.values()),) self.assertEqual(list(self.obs_user_ids_props_dict.values()),list(self.exp_user_ids_props_dict.values()),) - self.assertEqual(list(self.obs_user_ids_firstname_dict.values()),list(self.exp_user_ids_firstname_dict.values()),) - self.assertEqual(list(self.obs_user_ids_lastname_dict.values()),list(self.exp_user_ids_lastname_dict.values()),) + self.assertEqual(list(self.obs_user_ids_first_name_dict.values()),list(self.exp_user_ids_first_name_dict.values()),) + 
self.assertEqual(list(self.obs_user_ids_last_name_dict.values()),list(self.exp_user_ids_last_name_dict.values()),) self.assertEqual(list(self.obs_user_ids_country_code_dict.values()),list(self.exp_user_ids_country_code_dict.values()),) self.assertEqual(list(self.obs_user_ids_email_domain_dict.values()),list(self.exp_user_ids_email_domain_dict.values()),) self.assertEqual(list(self.obs_user_ids_dates_dict.values()),list(self.exp_user_ids_dates_dict.values()),) @@ -148,8 +148,8 @@ def test_values(self): def test_object(self): self.assertEqual(self.obs_user_ids_cnts_dict, self.exp_user_ids_cnts_dict) self.assertEqual(self.obs_user_ids_props_dict, self.exp_user_ids_props_dict) - self.assertEqual(self.obs_user_ids_firstname_dict, self.exp_user_ids_firstname_dict) - self.assertEqual(self.obs_user_ids_lastname_dict, self.exp_user_ids_lastname_dict) + self.assertEqual(self.obs_user_ids_first_name_dict, self.exp_user_ids_first_name_dict) + self.assertEqual(self.obs_user_ids_last_name_dict, self.exp_user_ids_last_name_dict) self.assertEqual(self.obs_user_ids_country_code_dict, self.exp_user_ids_country_code_dict) self.assertEqual(self.obs_user_ids_email_domain_dict, self.exp_user_ids_email_domain_dict) self.assertEqual(self.obs_user_ids_dates_dict, self.exp_user_ids_dates_dict) diff --git a/generator/unittests/utilities/test_gen_obj_idhash_series.py b/generator/unittests/utilities/test_gen_obj_idhash_series.py index 18faa86..7e3eabd 100644 --- a/generator/unittests/utilities/test_gen_obj_idhash_series.py +++ b/generator/unittests/utilities/test_gen_obj_idhash_series.py @@ -20,8 +20,8 @@ start_date = cons.unittest_registration_start_date end_date = cons.unittest_registration_end_date n_user_ids = cons.unittest_n_entities -fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1] -fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1] +fpath_first_names = '.' 
+ cons.fpath_llama_first_names.split(cons.fpath_repo_dir)[1] +fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1] fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] fpath_smartphones = '.' + cons.fpath_smartphones.split(cons.fpath_repo_dir)[1] @@ -30,7 +30,7 @@ np.random.seed(cons.unittest_seed) # create user object -user_object = User(n_user_ids=n_user_ids, start_date=start_date, end_date=end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) +user_object = User(n_user_ids=n_user_ids, start_date=start_date, end_date=end_date, fpath_first_names=fpath_first_names, fpath_last_names=fpath_last_names, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) # generate random entity counts random_entity_counts = gen_random_entity_counts(user_obj=user_object) # generate random entity values diff --git a/generator/unittests/utilities/test_gen_random_entity_counts.py b/generator/unittests/utilities/test_gen_random_entity_counts.py index 58a5522..45c8d27 100644 --- a/generator/unittests/utilities/test_gen_random_entity_counts.py +++ b/generator/unittests/utilities/test_gen_random_entity_counts.py @@ -19,11 +19,11 @@ random.seed(cons.unittest_seed) np.random.seed(cons.unittest_seed) -fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1] -fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1] +fpath_first_names = '.' + cons.fpath_llama_first_names.split(cons.fpath_repo_dir)[1] +fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1] fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] fpath_email_domain = '.' 
+ cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] -user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) +user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_first_names=fpath_first_names, fpath_last_names=fpath_last_names, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) exp_randomentity_counts_dict = { 'uid': ['6374692674377254', '6720317315593519', '4264861381989413', '1751409580926382'], From 31593e227503f76d5f38982ed95bc8b8fed1b8c2 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 09:36:59 +0000 Subject: [PATCH 07/15] Updated names reference files and unittest data --- ...a_firstnames.csv => llama_first_names.csv} | 2 +- ...ama_lastnames.csv => llama_last_names.csv} | 2 +- data/unittest/transaction_data.parquet | Bin 30585 -> 30616 bytes data/unittest/user_data.parquet | Bin 18980 -> 18988 bytes 4 files changed, 2 insertions(+), 2 deletions(-) rename data/ref/{llama_firstnames.csv => llama_first_names.csv} (99%) rename data/ref/{llama_lastnames.csv => llama_last_names.csv} (99%) diff --git a/data/ref/llama_firstnames.csv b/data/ref/llama_first_names.csv similarity index 99% rename from data/ref/llama_firstnames.csv rename to data/ref/llama_first_names.csv index f7b0e44..4668401 100644 --- a/data/ref/llama_firstnames.csv +++ b/data/ref/llama_first_names.csv @@ -1,4 +1,4 @@ -firstnames,country,ISO numeric +first_names,country,ISO numeric agrina,Albania,8 agron,Albania,8 albana,Albania,8 diff --git a/data/ref/llama_lastnames.csv b/data/ref/llama_last_names.csv similarity index 99% rename from data/ref/llama_lastnames.csv rename to data/ref/llama_last_names.csv index 9808434..a3d215e 100644 --- a/data/ref/llama_lastnames.csv +++ b/data/ref/llama_last_names.csv @@ -1,4 
+1,4 @@ -lastnames,country,ISO numeric +last_names,country,ISO numeric bajramaj,Albania,8 bajrami,Albania,8 bardhi,Albania,8 diff --git a/data/unittest/transaction_data.parquet b/data/unittest/transaction_data.parquet index 47b78f12a50db976630997bc7713347cfe14e7f6..d10238f8cde94df5a6c700ecd6373481426bf5f4 100644 GIT binary patch delta 2654 zcmaJ@TWlj|6}D${F=3Z=cjLs#Zj!xZv(YBi*v@8ISBu!+c6>|h`jU8N%tJDs8QWve zjJ+OTu&D@r;-M8Ck2yUF(QGOWNo>eNHk&P{nxf0*`kmeN2m7+? zgJGA`HR2jMeT82fqAK@TVz1&hdN zLhAKwv(`|ktCZMHiQ2Gz;+ou5465a74Nb;lbpI;jdY!pH1IInJdzerQG;QT~HUj= zTd}#l^~5ij*;~xS>&(K3PcaKGJjI;8@<2%`4ic%er{4Sqv+%R05x_4#n6!m?Ej2fF z^EqbW_Opn2FnqX^iVpwkTg<{Q;n8$u`0W>&g}=ki^cPOuxy?*`FudTHc;_W%+-93v zc8}CWLl#ZvoSLkjO!woGxK48OW?d(tWqn*^n6`1k{;d% z{}vmgg-Ez3hy8F}oH;spx1jK}Zou*O1f-=)!9!Nd7hdwdp-i*?_lNXW?#1ACft(3V#&c{gN_>w*g|5AF+WLxG|J(8gn z1)sk0!PhEI$3RZ^FHQe)Tj0~aWDq{taXa#6!58)UnPP#E2h-jy7&)^L^~xJTq6#Z<4NKFO)Yz3+)Ksxl zs-%azN*3i{yrXd8uB7`{Vrr)pGx8cXG?wwQ`nXEe#Ad0L$X@Qm3=bWE4n~b_nk>qC zt}FSpHue^B#JU@mSS!(u`*Cn0`0NrX5^iCC_fX*RAtp%^5+!SNx~%!4a)RKv1qIR} zB##0KrRn&sQQ1pkgsi87h&M@zAR1-TK{>QVO$|yFv8bw4UZqaf5l8}P?Zo+|2UJ6X zSC?4$QFG=RWgsG->GHmES@KaZ29|2G>{8|PEPEBDwJ_46iKVk%sYn6>yw@0qw~b?; z#Q9{C;JQBj zIqHoBw%8QQR?x%E*6h|pJsfT|j545WLZ241k3Qv8?I%`8XK!zFp_0qr9WHN&ayN?Zh#MwPzA1=^YDJdiL*A`&mpww zXIa0=7UAnRW)>|A;bC)X1r?}Ku3BmZmRqwq8hqp-qn{266y`|{LQZd4v=NKKdhL_# z+xKs|L-8yPac+@bX#f{Ryn1M{P=XW<`l`vcq0>G$Dfq%&-1c zzk>n^78|Gxz2=!ymSf!Ex>m{O26oX3cFR%+e%O3;cVJYfJpLY@A8a}5fj0HMKS~zR ztOXwSR|`IGS>!HTN-wqV;4L4t4_E~_V2|@e0$!0zlO>Aph1m1=83d_=0}caKY5*ST zENudk#8zT``%cy^=0kH~)e~+NCMMH#@Jl36vK9bk2$A&)3#578ZQ{znrMHTUbPE+h z38`6ciiIbdv#YcnBlmtr2arUCCYL6d6iz%6OLS9Uc=rN^-RATp`oaOr_40hYp5ND| QpX=OkeAk2jD3Of!KTUN#cK`qY delta 2501 zcma)7U2I$R6}NBVd{H26v@wmBy#wQ6(llw~oNK4J6+$@j z%{^bg|2gOXJI8N)VgJwP_UjK$yG^;#)ar^{S4V9&Tf5Ydoi^uR?at5aXPuuqoYT%R z=h*!D?3qzAxu(qh^0ek%op#QwJ2|KAh|M-JPk{u0fzR8WS8fd?lWUZo=*PZ%3?4oF z@QD3+2W$nM5xG`tG^=%O3~e>JTayikeRfstYLyApbgFejMKvP-f^+_gyE&za1GCE^ 
z!$-mE$Lm^IQ<|DOg`tg>qG*jqp=FE{G+YTS-2RQuHD25QFm#zaK3OQ$8_kMrYWLaJ z=h4%7=swOhVPIGG?-zc_jrASiAJK@TuU(E!jf@{OV>SK-=lZvO_Bn3pk1umeZ@kP| zz3P_?EPfB@$&+{e_!Tt$#4_EK^R;AP?yFb1r7N$Z&30z$ru^W)$%x~f*SV#)A+lK> zfBh|P=}&KRa6NnmzKS22pZ(2;-0Y{0rHR>}{gj)r+2+o=#%gj!l^d?V?BAKH+efe{ zIGjG}uquSNFXj$3qf{L#GW+FAA5Ynf@aW}x4;4dgKqBD!C_{G}cKFE1R*nMPMx zNwp-=W5B9faqQ|SqUn@F@acIG-d8H{`}{n-U>?1#n-kRudHtSzCDqk3{&L=%GN`T*G{A_y=L~lgVHa}< zyJgYe$_|pmLg5dkWAKuu!({dFjoe(YQJIE4t!`;kx+qRI zP3Fwqwv^rUCg5MnS@^^_O7MR6%d&FuVePE{?&^&0`QO&dn%7 z*q0YGEj8>fV{ck{Gt_c+q>|TcGrQ#b)&#;3Hb26 zn_p2%>p@mh%}iJE8tr^3dOjMA?rsFn?XH(usc^AFzIqe7Py(5Xf?L8?Q$9Dx4!4C0XVVjJt;UBr+x5LmLqwxrAqreg*vrUvkX;x&c z#-fY~ouK87a){)_CWZzrD2bqgB!{*MbB7)gdN9glvOjd|y$f)yx%e0Yus}_X#QIz$ zFU)J1=lS!=HqYbvY%`4sGWIwh8iEx_vu`LQJkvGzEe zX)L>}S+g)>`GFTNB7$ zWTfG*O4&r!@)F93yIdS(aBvB9O~W-ON&dDS!(tTAZ{8A5)E$bS>vQSG_Co$qz;eG3 z2k}F|JvVzBh50G(ZV#?De=xn0@+m@^v_&47CBomDcOSRh`*&&WLNc%QkbcW+Fw^?s z9fKe(iS)VHt_6qk%R&PM~(s2v-Q?FMav0975zlw3!SWcx^p6lIB)<;WtLAs{Kslt{@+ zWPL715ujU_Ugl0k(4j+H?^!Z-@Sji=1u}FAx^(H%_fC$ZI4RJeltA3w`@P@$K7RKk z_`{Rnk8Wu0Qn~V|dGMfa)pl$7hbBkHP9XH#1?Bw7`5z09 z&jo)zs`Fkj_<(BBtMp}L^c=F#Kcf@$FuFF%2tYJ*dVF*k{cAM1N82l(UXh5?>UitnQkPSV@*TX!a7gC zj?bO0e~Le%zow?>H}PruY;~L-Coa>S#Kc9<(DQyt@ekvmV=54#KmFWF= zh(1dIbagKN4wFu8r*45ErzVH3Vyva}TDw$S3OjYbZ16G9+=;n%O4v7OE-^FKuI)D6Z1)g=_S0@uPLjcFW=ADwM2k`$K-e z7vU7O#2H5DIw_%P6*c#N5>ESJXV{(7hM~WKC67$d?T8AkD=cH? 
zjC$I2>KTIO#Fs=L@IDOdPV)@(9-igop)V_M`MKabA68MNHLDP1G{8a#zT?GfD0bf z1_LK6V(3E0R{SaqBgC*&wjd^piAjgYJVab55E7QLIOZodYgb>3x>c4HU5yvjs_R z(TAzY0=u>$g{TkBz!Cv~EO~gUKj;)zk|BK&S(UDf9}BxCADFn`kiaNcpqJH;mzcp< zpOaUJKC*V1kKna|pC_0Z)&R^)dYrsAjsrzXMc+ezp-M^mi@J5;_xV5|NAipR0DsC- A$p8QV delta 1809 zcma)7&2Jk;6t}CS4WUI;5J;OqOG^%wo36dKo46!tXV>2KI`+nPV($lSrQLP>m0jD) z##x+0eL!5ePX7TUj;KP4_$#;|aRaFmmvZ9|;DC5}1quoE2znluc zINs;?!t`Z)f%XzrT1-sA`hzKE)=ONu7%tIg4{l!(SA~EOfxy^I9KU$*Dm*PGo3=)O zTfcV6v-I3iO{T{B{;Vxau$&yd?HjrpwLR&;+@_zbdGycq#XFv*Ree)R^$jiND(!AV z>Ev9KcaLh>T(c&-hqjyAGxV}ce-y6M!p3YwX$1|%OU7(nNb2!&(vd5ztrlsSvJ^G- za?jQ(yeb{^omiAU6(V=e!L{vpwGZ$~omUS{xmq{0F9J)>(9Xko`t`;=`jaq6mmgl9 zKYxd%C8KJadXMg|Ru2r#^>w~{Xlqi>ZYjPZ=FWTLoz+9nXJQUt9avfkuRIl=u6r}U zhqpA=}HuZb2X|dUNgiRo{XbYTy7UA^92=*EXlyhMwC)lkw_C`X>GT z(c=#-T^QMluhF9~$u$~DzmGb%G0=8LZ5nZi9iRMH{b$0`SpUf_T?lO1g&Y4{{k}L+ ze@qwB@6yl3_hx%=&xs#ThOt^+BrPMJa{=hkX({)Wtlt9Qmg7l--EEM?T(GN21IJ6n zSvUA$m|*!X>@@m??L?q-q8R$X(2`InYQ5%F2M!N~gP5W+$hVb#VUSBfwaQqs zjx6=fAp{iGD%&6hLelWV0fik91ylnOqy%Z;5y*M0ccDT`gm9>rg9AdMII%`hcUC!W z6{L)mVVeL10#2iW`oc*NOpEQN3tW-oNHNZFs9y>yxV_2F;HEJ)Tt8k3O(!e(kork$ zbiv~;9uu;Y0s;xHPJfW^?hv1=bDZ(!z)=W$-{Gb9xaUrRWdx2G40u^AAZ~C5H_Hf| z5|)_kz%29}DK)~%w*iL(UPw?$HxNiFuDs?s^Auw|_Ln7$?4ig?T<4ojhi9QLafcjN zF0RmDGRv2?5Qjgae@UM%=Fxi6MiT=I6_Q7EL%ui6r6WKjgZ2n4l+l0~II&dXVq7aV zU3DGL1p;CM^;b^#Rv5{D@MT72 From 624823f22da5eb867ff0c558c69fd2b52260324b Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 09:37:26 +0000 Subject: [PATCH 08/15] Revised unittests given naming convention changes --- generator/unittests/app/test_gen_user_trans_data.py | 4 +++- generator/unittests/objects/test_Application.py | 8 ++++---- generator/unittests/objects/test_Card.py | 8 ++++---- generator/unittests/objects/test_Transaction.py | 8 ++++---- generator/unittests/objects/test_User.py | 13 ++++++++++++- 5 files changed, 27 insertions(+), 14 deletions(-) diff --git a/generator/unittests/app/test_gen_user_trans_data.py 
b/generator/unittests/app/test_gen_user_trans_data.py index 6bef4c7..4c4be48 100644 --- a/generator/unittests/app/test_gen_user_trans_data.py +++ b/generator/unittests/app/test_gen_user_trans_data.py @@ -38,6 +38,7 @@ fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1] fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] +fpath_bedrock_email_domain = '.' + cons.fpath_llama_email_domains.split(cons.fpath_repo_dir)[1] fpath_smartphones = '.' + cons.fpath_smartphones.split(cons.fpath_repo_dir)[1] fpath_countrycrimeindex = '.' + cons.fpath_countrycrimeindex.split(cons.fpath_repo_dir)[1] fpath_unittest_user_data = '.' + cons.fpath_unittest_user_data.split(cons.fpath_repo_dir)[1] @@ -51,7 +52,8 @@ fpath_first_names=fpath_first_names, fpath_last_names=fpath_last_names, fpath_countries_europe=fpath_countries_europe, - fpath_email_domain=fpath_email_domain + fpath_email_domain=fpath_email_domain, + fpath_bedrock_email_domain=fpath_bedrock_email_domain, ) # generate random entity counts for each user diff --git a/generator/unittests/objects/test_Application.py b/generator/unittests/objects/test_Application.py index 58435b5..9403daa 100644 --- a/generator/unittests/objects/test_Application.py +++ b/generator/unittests/objects/test_Application.py @@ -28,10 +28,10 @@ "dded2b63f8242648": 0.2727272727272727, } exp_application_hashes_payment_channel_dict = { - "63cea7c46926aa74": "adyen", - "37725417bd51fb40": "adyen", - "b95cb80aae9fbbfe": "paypal", - "dded2b63f8242648": "docomo", + "63cea7c46926aa74": "Adyen", + "37725417bd51fb40": "Adyen", + "b95cb80aae9fbbfe": "PayPal", + "dded2b63f8242648": "Docomo", } exp_n_application_hashes = cons.unittest_n_entities exp_lam = cons.data_model_poisson_params["application"]["lambda"] diff --git a/generator/unittests/objects/test_Card.py b/generator/unittests/objects/test_Card.py index 
688455f..32f0358 100644 --- a/generator/unittests/objects/test_Card.py +++ b/generator/unittests/objects/test_Card.py @@ -16,10 +16,10 @@ "dded2b63f8242648": 1, } exp_card_hashes_type_dict = { - "63cea7c46926aa74": "visa", - "37725417bd51fb40": "mastercard", - "b95cb80aae9fbbfe": "visa", - "dded2b63f8242648": "mastercard", + "63cea7c46926aa74": "Visa", + "37725417bd51fb40": "Mastercard", + "b95cb80aae9fbbfe": "Visa", + "dded2b63f8242648": "Mastercard", } exp_card_hashes_props_dict = { "63cea7c46926aa74": 0.16666666666666666, diff --git a/generator/unittests/objects/test_Transaction.py b/generator/unittests/objects/test_Transaction.py index 05531ba..0c338da 100644 --- a/generator/unittests/objects/test_Transaction.py +++ b/generator/unittests/objects/test_Transaction.py @@ -22,10 +22,10 @@ "dded2b63f8242648": 0.3793103448275862, } exp_transaction_hashes_status_dict = { - "63cea7c46926aa74": "successful", - "37725417bd51fb40": "successful", - "b95cb80aae9fbbfe": "successful", - "dded2b63f8242648": "successful", + "63cea7c46926aa74": "Successful", + "37725417bd51fb40": "Successful", + "b95cb80aae9fbbfe": "Successful", + "dded2b63f8242648": "Successful", } exp_transaction_hashes_amounts_dict = { "63cea7c46926aa74": 2.99, diff --git a/generator/unittests/objects/test_User.py b/generator/unittests/objects/test_User.py index 4471a71..16280c8 100644 --- a/generator/unittests/objects/test_User.py +++ b/generator/unittests/objects/test_User.py @@ -63,7 +63,18 @@ fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1] fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] fpath_email_domain = '.' 
+ cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] -user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_first_names=fpath_first_names, fpath_last_names=fpath_last_names, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) +fpath_bedrock_email_domain = '.' + cons.fpath_llama_email_domains.split(cons.fpath_repo_dir)[1] + +user_object = User( + n_user_ids=exp_n_user_ids, + start_date=exp_start_date, + end_date=exp_end_date, + fpath_first_names=fpath_first_names, + fpath_last_names=fpath_last_names, + fpath_countries_europe=fpath_countries_europe, + fpath_email_domain=fpath_email_domain, + fpath_bedrock_email_domain=fpath_bedrock_email_domain + ) obs_user_ids_cnts_dict = user_object.user_ids_cnts_dict obs_user_ids_props_dict = user_object.user_ids_props_dict From 1222f0924192af7aa9775fccedd9a2388bc7e73a Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 09:37:59 +0000 Subject: [PATCH 09/15] Fixed non payment transaction generation --- generator/app/gen_trans_data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/generator/app/gen_trans_data.py b/generator/app/gen_trans_data.py index e9f8bcc..60d6814 100644 --- a/generator/app/gen_trans_data.py +++ b/generator/app/gen_trans_data.py @@ -89,11 +89,11 @@ def gen_trans_data( trans_data.loc[zero_transaction_amount_filter | missing_card_hash_filter, ['card_payment_channel']] = np.nan trans_data.loc[zero_transaction_amount_filter, ['card_hash', 'card_type', 'card_country_code_alpha']] = np.nan # add payment method as either card, store_wallet or store_points - trans_data['transaction_payment_method'] = 'card' + trans_data['transaction_payment_method'] = 'Card' zero_transaction_amount_filter = (trans_data['transaction_amount'] == 0.0) missing_card_hash_filter = (trans_data['card_hash'].isnull()) - # trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = 
missing_card_hash_filter.apply(lambda x: np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = 1, p = list(cons.data_model_non_card_trans_methods.values()))[0]) - trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = pd.Series(np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = missing_card_hash_filter.sum(), p = list(cons.data_model_non_card_trans_methods.values()))[0]) + # trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = missing_card_hash_filter.apply(lambda x: np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = 1, p = list(cons.data_model_non_card_trans_methods.values()))) + trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = pd.Series(np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = missing_card_hash_filter.sum(), p = list(cons.data_model_non_card_trans_methods.values()))) trans_data.loc[zero_transaction_amount_filter, 'transaction_payment_method'] = np.nan # align country codes for user, ip and card country_code_columns = ['registration_country_code_alpha', 'ip_country_code_alpha', 'card_country_code_alpha'] From dd619d46d9f702b731c5ab7fec708ceb175723d8 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 09:38:30 +0000 Subject: [PATCH 10/15] Removed lowercase and split operations from standardisation lambda --- generator/batch/gen_bedrock_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generator/batch/gen_bedrock_data.py b/generator/batch/gen_bedrock_data.py index 241828b..9238a56 100644 --- a/generator/batch/gen_bedrock_data.py +++ b/generator/batch/gen_bedrock_data.py @@ -145,7 +145,7 @@ def invoke_bedrock( how='inner' ) # standardise names formatting - standardise_text_lambda = lambda x: unidecode.unidecode(" ".join(x.lower().strip().split())) if pd.isna(x) else x + standardise_text_lambda = lambda x: unidecode.unidecode(" ".join(x.strip())) if 
pd.isna(x) else x gen_country_dataframe[data_point] = gen_country_dataframe[data_point].apply(lambda x: standardise_text_lambda(x)) logging.info(f"gen_country_dataframe.shape: {gen_country_dataframe.shape}") # save generated data From 63b0ae741296b5c6ff41428d70ab884a2c95301a Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 09:39:37 +0000 Subject: [PATCH 11/15] Updated status codes naming convention --- generator/utilities/gen_trans_status.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/generator/utilities/gen_trans_status.py b/generator/utilities/gen_trans_status.py index 35fbbd3..f41ac62 100644 --- a/generator/utilities/gen_trans_status.py +++ b/generator/utilities/gen_trans_status.py @@ -33,7 +33,7 @@ def gen_trans_status( country_code_columns = ["registration_country_code","ip_country_code","card_country_code"] # if card hash if pd.notna(series['card_hash']): - status = "rejected" + status = "Rejected" # add rejections based on crime rates within country codes if rejection_rates_dict["country_code_trans_reject_rate_dict"][np.random.choice(a=series[country_code_columns].dropna().to_list(), size=1)[0]] >= random.uniform(0, 1)/rejection_scaling_factor: error_code = np.random.choice(a=list(cons.data_model_rejection_codes_fraud.keys()),p=list(cons.data_model_rejection_codes_fraud.values()),size=1)[0] @@ -59,11 +59,11 @@ def gen_trans_status( error_code = np.random.choice(a=list(cons.data_model_rejection_codes_funds.keys()),p=list(cons.data_model_rejection_codes_funds.values()),size=1)[0] # otherwise return successful status else: - successful_status = {key:cons.data_model_transaction_status[key] for key in ['successful', 'pending']} + successful_status = {key:cons.data_model_transaction_status[key] for key in ['Successful', 'Pending']} successful_probs = [value/sum(successful_status.values()) for value in successful_status.values()] status = np.random.choice(a=list(successful_status.keys()), size=1, p=successful_probs)[0] 
error_code = np.nan else: - status = np.random.choice(a=['successful', 'pending'], size=1, p=[0.98, 0.02])[0] + status = np.random.choice(a=['Successful', 'Pending'], size=1, p=[0.98, 0.02])[0] error_code = np.nan return [status, error_code] From 75affdebb1d38c25bd15f12d38480329f81de834 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 09:40:07 +0000 Subject: [PATCH 12/15] Fixed bedrock fpath for gen user name function --- generator/objects/User.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/generator/objects/User.py b/generator/objects/User.py index 40139b8..039775e 100644 --- a/generator/objects/User.py +++ b/generator/objects/User.py @@ -84,9 +84,9 @@ def __init__( self.user_ids = list(self.user_ids_cnts_dict.keys()) self.user_ids_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.user_ids_cnts_dict) self.user_ids_country_code_dict = gen_country_codes_dict(idhashes=self.user_ids, fpath_countries_europe=self.fpath_countries_europe) - self.user_ids_first_name_dict = self.gen_user_bedrock_name_data(fpath_first_names=self.fpath_first_names, sample_column_name="first_names") - self.user_ids_last_name_dict = self.gen_user_bedrock_name_data(fpath_last_names=self.fpath_last_names, sample_column_name="last_names") - self.user_ids_email_domain_dict = self.gen_user_email_domain(fpath_email_domain=self.fpath_email_domain, fpath_bedrock_email_domain=self.fpath_bedrock_email_domain) + self.user_ids_first_name_dict = self.gen_user_bedrock_name_data(fpath_bedrock_data=self.fpath_first_names, sample_column_name="first_names") + self.user_ids_last_name_dict = self.gen_user_bedrock_name_data(fpath_bedrock_data=self.fpath_last_names, sample_column_name="last_names") + self.user_ids_email_domain_dict = self.gen_user_bedrock_email_domain(fpath_email_domain=self.fpath_email_domain, fpath_bedrock_email_domain=self.fpath_bedrock_email_domain) self.user_ids_dates_dict = gen_dates_dict(idhashes=self.user_ids, start_date=self.start_date, 
end_date=self.end_date) @beartype From 783d1b1d7a1d9d2504d6003a8f18a3ee0eeb7cd2 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 10:52:55 +0000 Subject: [PATCH 13/15] Tweeked null rates for cards, adjusted split between non card payment methods. Fixed index assigment logic for non card payment methods --- generator/app/gen_trans_data.py | 3 +-- generator/cons.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/generator/app/gen_trans_data.py b/generator/app/gen_trans_data.py index 60d6814..0c9326c 100644 --- a/generator/app/gen_trans_data.py +++ b/generator/app/gen_trans_data.py @@ -92,8 +92,7 @@ def gen_trans_data( trans_data['transaction_payment_method'] = 'Card' zero_transaction_amount_filter = (trans_data['transaction_amount'] == 0.0) missing_card_hash_filter = (trans_data['card_hash'].isnull()) - # trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = missing_card_hash_filter.apply(lambda x: np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = 1, p = list(cons.data_model_non_card_trans_methods.values()))) - trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = pd.Series(np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = missing_card_hash_filter.sum(), p = list(cons.data_model_non_card_trans_methods.values()))) + trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = pd.Series(np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = missing_card_hash_filter.sum(), p = list(cons.data_model_non_card_trans_methods.values()), replace=True), index=trans_data[missing_card_hash_filter].index) trans_data.loc[zero_transaction_amount_filter, 'transaction_payment_method'] = np.nan # align country codes for user, ip and card country_code_columns = ['registration_country_code_alpha', 'ip_country_code_alpha', 'card_country_code_alpha'] diff --git a/generator/cons.py b/generator/cons.py index 
de5225f..f846ed8 100644 --- a/generator/cons.py +++ b/generator/cons.py @@ -72,7 +72,7 @@ unittest_seed = 42 unittest_n_entities = 4 unittest_n_device_types = 10 -unittest_gen_test_dfs = False +unittest_gen_test_dfs = True unittest_n_users = 10 unittest_registration_start_date = '2020-01-01' unittest_registration_end_date = '2020-12-31' @@ -83,12 +83,12 @@ data_model_entity_user_ratios = {'card':1.3, 'device':2.5, 'transaction':5.3, 'ip':4.3} data_model_poisson_params = {'user':{'lambda':20, 'power':1}, 'device':{'lambda':0.2, 'power':2}, 'card':{'lambda':0.1, 'power':2}, 'ip':{'lambda':1.3, 'power':2}, 'application':{'lambda':1, 'power':2}, 'transaction':{'lambda':5, 'power':2}} data_model_shared_entities_dict = {'ip':0.05, 'card':0.005, 'device':0.01} -data_model_null_rates = {'card':0.05} +data_model_null_rates = {'card':0.1} data_model_card_types_dict = {'Visa':0.5, 'Mastercard':0.5} data_model_payment_channels = {'PayPal':0.4, 'Adyen':0.15, 'AppStore':0.25, 'WorldPay':0.15, 'Docomo':0.05} data_model_transaction_status = {'Successful':0.94, 'Pending':0.03, 'Rejected':0.03} data_model_inconsistent_country_codes_rejection_rate = {1:0.001, 2:0.005, 3:0.01} -data_model_non_card_trans_methods = {'Wallet':0.95, 'Points':0.05} +data_model_non_card_trans_methods = {'Wallet':0.85, 'Points':0.15} data_model_rejection_codes_fraud = {'E900:ConnectionTimeout':0.1, 'E901:SuspectedFraud':0.55, 'E902:AuthenicationFailure':0.2, 'E903:UserCancelled':0.05, 'E904:InsufficientFunds':0.1} data_model_rejection_codes_connection = {'E900:ConnectionTimeout':0.45, 'E901:SuspectedFraud':0.1, 'E902:AuthenicationFailure':0.2, 'E903:UserCancelled':0.15, 'E904:InsufficientFunds':0.1} data_model_rejection_codes_user = {'E900:ConnectionTimeout':0.05, 'E901:SuspectedFraud':0.1, 'E902:AuthenicationFailure':0.1, 'E903:UserCancelled':0.45, 'E904:InsufficientFunds':0.3} From 8222462b3c5522b1a875f7d74695e4c1938fd21e Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 10:53:37 +0000 Subject: 
[PATCH 14/15] Deactivated unittest data generation --- generator/cons.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generator/cons.py b/generator/cons.py index f846ed8..ad51b95 100644 --- a/generator/cons.py +++ b/generator/cons.py @@ -72,7 +72,7 @@ unittest_seed = 42 unittest_n_entities = 4 unittest_n_device_types = 10 -unittest_gen_test_dfs = True +unittest_gen_test_dfs = False unittest_n_users = 10 unittest_registration_start_date = '2020-01-01' unittest_registration_end_date = '2020-12-31' From 8ff15bd2c4619c5de01c167312dd9c825a42381b Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 10:53:57 +0000 Subject: [PATCH 15/15] Updated unittest transaction data --- data/unittest/transaction_data.parquet | Bin 30616 -> 30635 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/data/unittest/transaction_data.parquet b/data/unittest/transaction_data.parquet index d10238f8cde94df5a6c700ecd6373481426bf5f4..b871daa5d9c73f25c888ded44b494897f9b8383b 100644 GIT binary patch delta 1736 zcmbVMZ%h+s9Dd)s_bP?9d$07mf{mf2+zJ})L5Bs@w6w=srO-mFL${35mdHd<{(Klq zDF1X7>jGY9keP*$YMfKX#4T=1rrFryoH5a`C39}X&1}mugc&!dV6&@!BKzitdw;)s z-rwCl_dHMDk$L#uJe(Yq!>Djj-lDXa&4qzbNvOyYEU|Fw*Ha#0mEwrF`Jp1$NEtUg zUlMAtn1V%?qLM;OkP9<_xkoZf@_h8f9;&yt^}ZWbwbPSi%^=W6z% z%{<^@+powRPK`rWv^|U0ln-zd8a-MWJ4$R$GYoi@t0nTxw7s`RJWje%MaEvU#%m%m zF2K}|6%ZWZ$oP4f@Ma&6D`jQV8oE!vPu_q6Y*Yv8_8pyc>JnPJtO-c$Y4D06ILaMI zfpXn1c&iK((up(DCa01oKas?LWD=$IjDqJp205g%$Lh$L8fzSBP51PnQ3kDl?_2WH+K7U)!ft@px z{x%fdPdQ(*&!lmi?8=-T1~ERyQ^_b8pah^mMTl%xz*BVIxtyMMF4vabf~4oOycQq&s?K;2>gDW_WEh)OgLjC$4mz(1#DCv95}N zKjPE9pL?+o;acn&QQW+FSnh-+{J{e zPSxLLss#&=!)2 zbz7jhIoy%zJgrSF9qpn5QGyv-rhwTfc`~L!smH|G0l3h=PORCGOje248c3afq=)4B zVz}aA^7}#&Qi~b>I7e(8VzVu25WBUIpnw(hr~wJVs?=fxnGgVk5axr( zOqJ7=!-pWP!a)2i1+Yg7(h#6k(28x4_S!Ht?89E01K1!Cv%UnG+pv;Jf9!MjPPOO| zUt%(sx?0*h8ykfk-VBr_&3;kX$Y)h;;7V&d+xLWn9pO+#Tc9%}uhN=I?X4{>;UMI+ zzNX#b*3J&4l(Q6-`rE^8_CQN8EM_GnCggvLpBLQTLg5ad9?2|vxjXO26Zflsz|eJi zIOnf9G`D)@2>p7XTl)>oY?P- 
zf5jg&^0cvXT-8Ei)0o7X3;UG)*!$6;!)!RFua_}R*eDw*cH^^YBL?gBk^H`Cc`&ha z<-6VhX+-2ySH)(D;#|7=SUNjd#zu>uoX2^>{mQk9+l2ZK!7f^^l zdI+&eJA2rkf(PAY=w=N&E`03{qJcVgL@-r3sb~{>cZblOZ4{1Il}RpNhwOBVFl#mn zis~|I>2Ju63RTs+Bqs+5_TMGpLUl7fNvhA0?ASqqy);N@i_Jor$BAc1^-msDr{ T2d4=24T?=u|2dWPj^_ObqUa^u delta 1749 zcmbu9e@qi+7{~9u-j%ijyZ*RR7%VN&vmi002(-wytF*O6T3UKl+>flKty+X4E&Smh zTcPO`1x(?vLBxbHGLdOQ#JJ5r9CifJA!BMb;uf+H=gbxdS&X_(*sk+0wk$F6pZDH- z-fy1g$@lZT@5jrae;JG)O#w&g&J@4AyjbI@Q!Cv?>LQIw?Jo2Jk$TG(mCK{?6sg@B zbzx~K>Y+EJ#y}T6oSGvp#UZ>O!H1Co{0@p^I2&hYmekS@Qa6UW;D;N427;grz*;K@ z=-TRLkW{d`CIGBYRUm2X&%GF6T^56v7C(9X)oSRmh>^S_21#p|*O=|Ep;rKEmv(63 zygiYeC}c_pQ|iv*pdmXqIeR*NE;%Pvwlugml|S75%`reAg1B<*1RYnmN7LrHha}B& zA!X7at6e!A4O*nLnf90}yExExKugwDmeot#RqB+UppMhoUoyiVIVC*=g_XA~&MB;J zUUpb*jmsjO`D0~QPr0kGjhPkf>>HU|2i05|uZHU3y-k|@mbb75q(F;_lXITXm|@N3?v z{m!7xl^Z9FM~1n)4AfVn;7uxyVA}9)!_fiR8R_nh4Cl>}0-mk#cy@kRkkixE(s=j% zx)EdLNkdQe8ScnXJ(33?2uJY#WPD&l6>hf?EYcgw1?+ zxf&Y0EsP<303TR~kW#Rp9ZHadn9)|Qn4ZU#wg1k(fVl_E@PkJ%!~(N`S#s%jD_#o? z!}!?0C~$T)1sXh7*Fg!Y^&Is2nYuk-CHg#t|M$|tc5AvrC#3LVtDf&b^&%__M}&Mw zxt?PFiraLQ7INn5K|7_tg6O4`$c8x(MBPWB#UfIKqinWSm+5fUCXtRPb(~5=wIqP( zMqOh_PBQtBog^Ou4ot5@^frW#;Ub$=PvLxs)Zz%l$HY*nUPl6vLko$v0R#df64N07 zMV*+Hq(}(U=|~Q#MfkdQ6vYvn6+tloD3Ad86yWzj70&wy-Fq$Rj2(K7QdOpHZQ1Ab zYkY23i?7LVaQPZq170CFi!3a&w+0V*-7Q{^A>eBD2)JqGMP-hlH&Eg7yS)r;kC-bq z(l)(tQ%Z(WSys{H_cL-P3*~F{Hnk>(AqwOD>$qH?oik_WT3a(d7LkBcVdboNJJZJ7JY~=O+|#F zZ7G0Xe=T7)*TM$rSB`GfODiVl`s- zSmG5Haq$>T#JibbE&Y+T8XfF~iSzUy*4-)NC@~jf()2NB34^`t$pM&na)SQSR?hq3 RJS@J05*+dL1ixlL{x=>>Bo+Vw