From 75abeb378b7a8d80cec3db321d8d55eda467274b Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 22 Jan 2026 09:46:18 +0000 Subject: [PATCH 01/15] Added converse api call to bedrock --- .../gen_user_names_file.py | 120 ++++++++++++------ generator/utilities/Bedrock.py | 77 ++++++----- 2 files changed, 131 insertions(+), 66 deletions(-) rename generator/{utilities => batch}/gen_user_names_file.py (72%) diff --git a/generator/utilities/gen_user_names_file.py b/generator/batch/gen_user_names_file.py similarity index 72% rename from generator/utilities/gen_user_names_file.py rename to generator/batch/gen_user_names_file.py index c8765fe..753a1b5 100644 --- a/generator/utilities/gen_user_names_file.py +++ b/generator/batch/gen_user_names_file.py @@ -1,6 +1,9 @@ +# python generator/batch/gen_user_names_file.py + import os import json import boto3 +from botocore.config import Config import sys import time import logging @@ -11,10 +14,47 @@ sys.path.append("E:\\GitHub\\RandomTelecomPayments\\generator") import cons -from utilities.Bedrock import Bedrock, prompt, system_prompt +from utilities.Bedrock import Bedrock + +system_prompt = """# Task + +You are a name generator for people from different countries in Europe. Your task is to generate an arbitrary N number of distinct and varied first names and last names for people from a given European country of origin. + +# Requirements + +- Generate typical names for both male and female people. +- The names do not need to be traditional to the target European country. +- Do not repeat any first names or last names more than once. Each individual first name must be unique and each individual last name must be unique. +- You should return the first names and last names using a valid JSON object tagged as . 
+- The valid JSON object should be of the following structure; {"firstnames":["first name 1","first name 2",...,"first name N"], "lastnames":["last name 1","last name 2",...,"last name N"]} + +# Examples + +- Generate 2 first names and 2 last names for people from the country "Germany" -> {"firstnames":["Max","Hannah"], "lastnames":["Müller","Schmidt"]} +- Generate 4 first names and 4 last names for people from the country "United Kingdom" -> {"firstnames":["George","Richard","Katie","Mary"], "lastnames":["Smith","Taylor","Jones","Brown"]} +- Generate 3 first names and 3 last names for people from the country "France" -> {"firstnames":["Lola","Mathieu","Léa"], "lastnames":["Benoît","Pierre","Lefort"]} +- Generate 5 first names and 5 last names for people from the country "Spain" -> {"firstnames":["Juan","Cristina","Javier","Julia","Isabel"], "lastnames":["Garcia","Martinez","Rodriguez","Lopez","Gomez"]} +- Generate 6 first names and 6 last names for people from the country "Sweden" -> {"firstnames":["Tova","Alva","Casper","Märta","Axel","Elsa"], "lastnames":["Andersson","Johansson","Lundberg","Svensson","Pettersson","Nilsson"]} +""" + +prompt = 'Generate {n_user_names} first names and {n_user_names} last names for people from the country "{country}"' + +bedrock_config = { + "inferenceConfig":{ + "maxTokens":8192, + "temperature":0.5, + "topP":0.5, + }, + "system":[ + { + "text":system_prompt + } + ] +} def invoke_bedrock( model:Bedrock, + model_id:str, n_user_names:int, country:str, countrieseurope:pd.DataFrame, @@ -62,8 +102,10 @@ def invoke_bedrock( logging.info("Calling Bedrock ...") # call bedrock model formatted_prompt = prompt.format(n_user_names=n_user_names, country=country) - logging.info(formatted_prompt) - model_response = model.prompt(user_prompt=formatted_prompt, system_prompt=system_prompt, max_gen_len=2048) + messages = [{"role":"user", "content":[{"text":formatted_prompt}]}] + logging.info(messages) + model_response = model.prompt(model_id=model_id, 
user_prompt=formatted_prompt, system_prompt=system_prompt, max_gen_len=2048) + #model_response = model.converse(modelId=model_id, messages=messages, system=bedrock_config['system'], inference_config=bedrock_config['inferenceConfig']) # split out answer text = model_response.split("")[1].split("")[0] # parse json @@ -117,54 +159,32 @@ def invoke_bedrock( logging.info(f"Wrote {fpath_temp_llama_lastnames} ...") return (tmp_firstname_country_data, tmp_lastname_country_data) -if __name__ == "__main__": - - # set up logging - lgr = logging.getLogger() - lgr.setLevel(logging.INFO) - - # load aws config - with open(cons.fpath_aws_session_token, "r") as j: - aws_config = json.loads(j.read()) - - # connect to aws boto3 - session = boto3.Session( - aws_access_key_id=aws_config['Credentials']["AccessKeyId"], - aws_secret_access_key=aws_config['Credentials']["SecretAccessKey"], - aws_session_token=aws_config['Credentials']["SessionToken"], - region_name="us-east-1" - ) - - # create bedrock instance - bedrock = Bedrock(session=session, model_region="us-east-1", model_id="meta.llama3-70b-instruct-v1:0") +def main(bedrock, model_id, run_bedrock=False): + """ + Docstring for main + """ # load countries, firstnames and surnames files countrieseurope = pd.read_csv(cons.fpath_countries_europe, usecols=['name', 'ISO numeric']) orig_firstnames = pd.read_csv(cons.fpath_firstnames) orig_surnames = pd.read_csv(cons.fpath_lastnames) - # determine file size orig_filesize = int((orig_firstnames.shape[0] + orig_surnames.shape[0])/2) n_countries = countrieseurope.shape[0] - n_user_names = min(50, int(orig_filesize / n_countries)) - + n_user_names = min(2, int(orig_filesize / n_countries)) # generate user names - firstname_country_data = [] - lastname_country_data = [] - error_countries = [] - # switch to toggle bedrock calls - run_bedrock = False - + firstname_country_data, lastname_country_data, error_countries = [], [], [] # set countries list - countries_list = 
countrieseurope['name'].to_list() - #countries_list = ['Cyprus'] + #countries_list = countrieseurope['name'].to_list() + countries_list = ['Cyprus'] + # iterate over countries list for country in countries_list: logging.info(f"{country} ...") try: if run_bedrock: # call bedrock model and generate user names data - tmp_firstname_country_data, tmp_lastname_country_data = invoke_bedrock(model=bedrock, n_user_names=n_user_names, country=country) + tmp_firstname_country_data, tmp_lastname_country_data = invoke_bedrock(model=bedrock, model_id=model_id, n_user_names=n_user_names, country=country, countrieseurope=countrieseurope) logging.info("Waiting ...") # wait 20 seconds before retrying time.sleep(20) @@ -205,4 +225,32 @@ def invoke_bedrock( logging.info(f"output_lastname_country_df.shape: {output_lastname_country_df.shape}") output_lastname_country_df.to_csv(cons.fpath_llama_lastnames, index=False, encoding="latin1") else: - logging.info("WARNING Insufficient last name data generated.") \ No newline at end of file + logging.info("WARNING Insufficient last name data generated.") + +lgr = logging.getLogger() +lgr.setLevel(logging.INFO) + +if __name__ == "__main__": + # set aws region + aws_region = "us-east-1" + model_id="us.meta.llama3-1-70b-instruct-v1:0" + # load aws config + with open(cons.fpath_aws_session_token, "r") as j: + aws_config = json.loads(j.read()) + # connect to aws boto3 + session = boto3.Session( + aws_access_key_id=aws_config['Credentials']["AccessKeyId"], + aws_secret_access_key=aws_config['Credentials']["SecretAccessKey"], + aws_session_token=aws_config['Credentials']["SessionToken"], + region_name=aws_region + ) + bedrock_runtime = session.client( + service_name="bedrock-runtime", + region_name=aws_region, + config=Config(retries={"max_attempts":1, "mode": "adaptive"}) + ) + # create bedrock instance + bedrock = Bedrock(bedrock_runtime=bedrock_runtime) + # execute main programme + main(bedrock=bedrock, run_bedrock=True, model_id=model_id) + diff 
--git a/generator/utilities/Bedrock.py b/generator/utilities/Bedrock.py index 9ea42d6..1e2ea0a 100644 --- a/generator/utilities/Bedrock.py +++ b/generator/utilities/Bedrock.py @@ -1,5 +1,5 @@ import json -import boto3 +from typing import Dict, List from beartype import beartype class Bedrock(): @@ -10,7 +10,7 @@ class Bedrock(): Parameters ---------- - session : boto3.Session + bedrock_runtime : boto3.Session A Boto3 session object configured with appropriate AWS credentials. model_region: str The AWS region where the Bedrock model is hosted. @@ -31,16 +31,14 @@ class Bedrock(): @beartype def __init__( self, - session:boto3.Session, - model_region="us-east-1", - model_id:str="meta.llama3-8b-instruct-v1:0", + bedrock_runtime, ): - self.client = session.client("bedrock-runtime", region_name=model_region) - self.model_id = model_id, - + self.bedrock_runtime = bedrock_runtime + @beartype def prompt( self, + model_id:str, user_prompt:str, system_prompt:str="", top_p:float=0.5, @@ -89,32 +87,51 @@ def prompt( # call bedrock model try: # Invoke the model with the request. - response = self.client.invoke_model(modelId=self.model_id, body=request) + response = self.bedrock_runtime.invoke_model(modelId=model_id, body=request) except Exception as e: - raise Exception(f"ERROR: Can't invoke '{self.model_id}'. Reason: {e}") + raise Exception(f"ERROR: Can't invoke '{model_id}'. Reason: {e}") # Decode and extract the response model_response = json.loads(response["body"].read()) response_text = model_response["generation"] return response_text + + @beartype + def converse( + self, + modelId:str, + messages:List, + system:List, + inference_config:Dict={"maxTokens":512, "temperature":0.5, "topP":0.5,}, + tools_config:Dict=None + ): + """ + Invoke the Bedrock model with the provided messages and configurations. -system_prompt = """# Task - -You are a name generator for people from different countries in Europe. 
Your task is to generate an arbitrary N number of distinct and varied first names and last names for people from a given European country of origin. - -# Requirements - -- Generate typical names for both male and female people. -- The names do not need to be traditional to the target European country. -- Do not repeat any first names or last names more than once. Each individual first name must be unique and each individual last name must be unique. -- You should return the first names and last names using a valid JSON object tagged as . -- The valid JSON object should be of the following structure; {"firstnames":["first name 1","first name 2",...,"first name N"], "lastnames":["last name 1","last name 2",...,"last name N"]} - -# Examples - -- Generate 2 first names and 2 last names for people from the country "Germany" -> {"firstnames":["Max","Hannah"], "lastnames":["Müller","Schmidt"]} -- Generate 4 first names and 4 last names for people from the country "United Kingdom" -> {"firstnames":["George","Richard","Katie","Mary"], "lastnames":["Smith","Taylor","Jones","Brown"]} -- Generate 3 first names and 3 last names for people from the country "France" -> {"firstnames":["Lola","Mathieu","Léa"], "lastnames":["Benoît","Pierre","Lefort"]} -- Generate 5 first names and 5 last names for people from the country "Spain" -> {"firstnames":["Juan","Cristina","Javier","Julia","Isabel"], "lastnames":["Garcia","Martinez","Rodriguez","Lopez","Gomez"]} -- Generate 6 first names and 6 last names for people from the country "Sweden" -> {"firstnames":["Tova","Alva","Casper","Märta","Axel","Elsa"], "lastnames":["Andersson","Johansson","Lundberg","Svensson","Pettersson","Nilsson"]}""" + Parameters + ---------- + messages : Dict + A list of message objects representing the conversation history. + system : Dict + A system message object providing context or instructions for the model. + inference_config : Dict + Configuration settings for inference parameters. 
+ tools_config : Dict + Configuration settings for any tools to be used during inference. -prompt = 'Generate {n_user_names} first names and {n_user_names} last names for people from the country "{country}"' \ No newline at end of file + Returns + ------- + Dict: + The response from the Bedrock Claude model. + + References + ---------- + https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-runtime/client/converse.html + """ + payload = {"modelId": modelId, "messages": messages, "system": system} + if inference_config: + payload["inferenceConfig"] = inference_config + if tools_config: + payload["toolsConfig"] = tools_config + # call converse api + response = self.bedrock_runtime.converse(**payload) + return response From 3239e618b50137e27d50ab655d35ca08e32dd132 Mon Sep 17 00:00:00 2001 From: Oisin Date: Sun, 25 Jan 2026 12:10:34 +0000 Subject: [PATCH 02/15] Generalised gen_user_names_file.py batch script to work for other data points such as email domains --- generator/batch/gen_user_names_file.py | 186 ++++++++++++------------- generator/cons.py | 8 ++ 2 files changed, 97 insertions(+), 97 deletions(-) diff --git a/generator/batch/gen_user_names_file.py b/generator/batch/gen_user_names_file.py index 753a1b5..cfce17f 100644 --- a/generator/batch/gen_user_names_file.py +++ b/generator/batch/gen_user_names_file.py @@ -10,34 +10,52 @@ import unidecode import pandas as pd import numpy as np +from typing import Dict sys.path.append("E:\\GitHub\\RandomTelecomPayments\\generator") import cons from utilities.Bedrock import Bedrock -system_prompt = """# Task +system_name_prompt = """# Task -You are a name generator for people from different countries in Europe. Your task is to generate an arbitrary N number of distinct and varied first names and last names for people from a given European country of origin. +You are a name generator for people from different countries in Europe. 
+Your task is to generate an arbitrary N number of distinct and varied first names, or last names, for people from a given European country of origin. # Requirements - Generate typical names for both male and female people. - The names do not need to be traditional to the target European country. -- Do not repeat any first names or last names more than once. Each individual first name must be unique and each individual last name must be unique. +- Do not repeat any first names or last names more than once. +- Each individual first name must be unique and each individual last name must be unique. - You should return the first names and last names using a valid JSON object tagged as . -- The valid JSON object should be of the following structure; {"firstnames":["first name 1","first name 2",...,"first name N"], "lastnames":["last name 1","last name 2",...,"last name N"]} +- The valid JSON object should be of the following structures; `["name 1","name 2",...,"name N"]`. # Examples -- Generate 2 first names and 2 last names for people from the country "Germany" -> {"firstnames":["Max","Hannah"], "lastnames":["Müller","Schmidt"]} -- Generate 4 first names and 4 last names for people from the country "United Kingdom" -> {"firstnames":["George","Richard","Katie","Mary"], "lastnames":["Smith","Taylor","Jones","Brown"]} -- Generate 3 first names and 3 last names for people from the country "France" -> {"firstnames":["Lola","Mathieu","Léa"], "lastnames":["Benoît","Pierre","Lefort"]} -- Generate 5 first names and 5 last names for people from the country "Spain" -> {"firstnames":["Juan","Cristina","Javier","Julia","Isabel"], "lastnames":["Garcia","Martinez","Rodriguez","Lopez","Gomez"]} -- Generate 6 first names and 6 last names for people from the country "Sweden" -> {"firstnames":["Tova","Alva","Casper","Märta","Axel","Elsa"], "lastnames":["Andersson","Johansson","Lundberg","Svensson","Pettersson","Nilsson"]} +## First Names + +- Generate 2 first names for people from the 
country "Germany" -> ["Max","Hannah"] +- Generate 4 first names for people from the country "United Kingdom" -> ["George","Richard","Katie","Mary"] +- Generate 3 first names for people from the country "France" -> ["Lola","Mathieu","Léa"] +- Generate 5 first names for people from the country "Spain" -> ["Juan","Cristina","Javier","Julia","Isabel"] +- Generate 6 first names for people from the country "Sweden" -> ["Tova","Alva","Casper","Märta","Axel","Elsa"] + +## Last Names + +- Generate 2 last names for people from the country "Germany" -> ["Müller","Schmidt"] +- Generate 4 last names for people from the country "United Kingdom" -> ["Smith","Taylor","Jones","Brown"] +- Generate 3 last names for people from the country "France" -> ["Benoît","Pierre","Lefort"] +- Generate 5 last names for people from the country "Spain" -> ["Garcia","Martinez","Rodriguez","Lopez","Gomez"] +- Generate 6 last names for people from the country "Sweden" -> ["Andersson","Johansson","Lundberg","Svensson","Pettersson","Nilsson"] """ -prompt = 'Generate {n_user_names} first names and {n_user_names} last names for people from the country "{country}"' +system_email_prompt = """ +""" + +firstname_prompt = 'Generate {n_data_points} first names for people from the country "{country}"' +surname_prompt = 'Generate {n_data_points} last names for people from the country "{country}"' +email_domain_prompt = 'Generate {n_data_points} popular email domains names for people from the country "{country}"' bedrock_config = { "inferenceConfig":{ @@ -47,7 +65,7 @@ }, "system":[ { - "text":system_prompt + "text":system_name_prompt } ] } @@ -55,9 +73,13 @@ def invoke_bedrock( model:Bedrock, model_id:str, - n_user_names:int, + data_point:str, + n_data_points:int, country:str, countrieseurope:pd.DataFrame, + prompt:str, + system_prompt:str, + country_fpath:str, ) -> tuple[pd.DataFrame, pd.DataFrame]: """ Invokes the Bedrock model to generate user names for a specified country. 
@@ -71,8 +93,8 @@ def invoke_bedrock( ---------- model : Bedrock The Bedrock model instance used to generate names. - n_user_names : int - The number of user names to generate. + n_data_points : int + The number of data points to generate country : str The country for which to generate names. countrieseurope : pd.DataFrame @@ -101,131 +123,100 @@ def invoke_bedrock( """ logging.info("Calling Bedrock ...") # call bedrock model - formatted_prompt = prompt.format(n_user_names=n_user_names, country=country) + formatted_prompt = prompt.format(n_data_points=n_data_points, country=country) messages = [{"role":"user", "content":[{"text":formatted_prompt}]}] logging.info(messages) - model_response = model.prompt(model_id=model_id, user_prompt=formatted_prompt, system_prompt=system_prompt, max_gen_len=2048) - #model_response = model.converse(modelId=model_id, messages=messages, system=bedrock_config['system'], inference_config=bedrock_config['inferenceConfig']) + #model_response = model.prompt(model_id=model_id, user_prompt=formatted_prompt, system_prompt=system_prompt, max_gen_len=2048) + model_response = model.converse(modelId=model_id, messages=messages, system=bedrock_config['system'], inference_config=bedrock_config['inferenceConfig']) # split out answer text = model_response.split("")[1].split("")[0] # parse json try: - record_set = json.loads(text) + gen_data_list = json.loads(text) except json.JSONDecodeError as e: raise Exception(f"Error parsing JSON: {e}") logging.info("Processing results ...") # generate pandas dataframe - user_firstname_data = pd.Series(record_set["firstnames"], name="firstnames").to_frame().drop_duplicates(subset=["firstnames"]) - user_lastname_data = pd.Series(record_set["lastnames"], name="lastnames").to_frame().drop_duplicates(subset=["lastnames"]) - # add country - user_firstname_data['country'] = country - user_lastname_data['country'] = country - # join on country codes - llama_firstname_country_data = 
user_firstname_data.merge(right=countrieseurope, left_on='country', right_on='name', how='inner').drop(columns=['name']) - llama_lastname_country_data = user_lastname_data.merge(right=countrieseurope, left_on='country', right_on='name', how='inner').drop(columns=['name']) - # print shapes - logging.info(f"llama_firstname_country_data.shape: {llama_firstname_country_data.shape}") - logging.info(f"llama_lastname_country_data.shape: {llama_lastname_country_data.shape}") - # format output file paths - fpath_temp_llama_firstnames = cons.fpath_temp_llama_firstnames.format(country=country.lower()) - fpath_temp_llama_lastnames = cons.fpath_temp_llama_lastnames.format(country=country.lower()) - # check against previous iterations - tmp_firstname_country_data = pd.DataFrame() - tmp_lastname_country_data = pd.DataFrame() - if os.path.exists(fpath_temp_llama_firstnames): - tmp_firstname_country_data = pd.read_csv(fpath_temp_llama_firstnames, encoding="latin1") - if os.path.exists(fpath_temp_llama_lastnames): - tmp_lastname_country_data = pd.read_csv(fpath_temp_llama_lastnames, encoding="latin1") - # concatenate results - tmp_firstname_country_data = pd.concat(objs=[tmp_firstname_country_data, llama_firstname_country_data], axis=0, ignore_index=True) - tmp_lastname_country_data = pd.concat(objs=[tmp_lastname_country_data, llama_lastname_country_data], axis=0, ignore_index=True) + gen_dataframe = pd.Series(gen_data_list, name=data_point).drop_duplicates().to_frame() + gen_dataframe['country'] = country + gen_country_dataframe = pd.merge( + left=gen_dataframe, + right=countrieseurope.rename(columns={'name':'country'}), + left_on='country', + right_on='name', + how='inner' + ) # standardise names formatting standardise_text_lambda = lambda x: unidecode.unidecode(" ".join(x.lower().strip().split())) if pd.isna(x) else x - tmp_firstname_country_data["firstnames"] = tmp_firstname_country_data["firstnames"].apply(lambda x: standardise_text_lambda(x)) - 
tmp_lastname_country_data["lastnames"] = tmp_lastname_country_data["lastnames"].apply(lambda x: standardise_text_lambda(x)) - # deduplicate data - tmp_firstname_country_data = tmp_firstname_country_data.drop_duplicates(subset=["firstnames"]) - tmp_lastname_country_data = tmp_lastname_country_data.drop_duplicates(subset=["lastnames"]) - # print shapes - logging.info(f"tmp_firstname_country_data.shape: {tmp_firstname_country_data.shape}") - logging.info(f"tmp_lastname_country_data.shape: {tmp_lastname_country_data.shape}") - # save firstnames names data to temp directory (if pairwise firstnames have been created) - if tmp_firstname_country_data.shape[0] >= llama_firstname_country_data.shape[0]: - tmp_firstname_country_data.to_csv(fpath_temp_llama_firstnames, index=False, encoding="latin1") - logging.info(f"Wrote {fpath_temp_llama_firstnames} ...") - # save lastnames data to temp directory (if pairwise lastnames have been created) - if tmp_lastname_country_data.shape[0] >= llama_lastname_country_data.shape[0]: - tmp_lastname_country_data.to_csv(fpath_temp_llama_lastnames, index=False, encoding="latin1") - logging.info(f"Wrote {fpath_temp_llama_lastnames} ...") - return (tmp_firstname_country_data, tmp_lastname_country_data) - -def main(bedrock, model_id, run_bedrock=False): + gen_country_dataframe[data_point] = gen_country_dataframe[data_point].apply(lambda x: standardise_text_lambda(x)) + logging.info(f"gen_country_dataframe.shape: {gen_country_dataframe.shape}") + # save generated data + gen_country_dataframe.to_csv(country_fpath, index=False, encoding="latin1") + logging.info(f"Wrote {country_fpath} ...") + return gen_country_dataframe + +def main(bedrock, model_id, data_point, fpath_dict, run_bedrock=False): """ Docstring for main """ - # load countries, firstnames and surnames files countrieseurope = pd.read_csv(cons.fpath_countries_europe, usecols=['name', 'ISO numeric']) - orig_firstnames = pd.read_csv(cons.fpath_firstnames) - orig_surnames = 
pd.read_csv(cons.fpath_lastnames) - # determine file size - orig_filesize = int((orig_firstnames.shape[0] + orig_surnames.shape[0])/2) n_countries = countrieseurope.shape[0] - n_user_names = min(2, int(orig_filesize / n_countries)) - # generate user names - firstname_country_data, lastname_country_data, error_countries = [], [], [] + # set lists to collect generated data with + gen_country_dataframe_list, error_countries = [], [] # set countries list #countries_list = countrieseurope['name'].to_list() countries_list = ['Cyprus'] - # iterate over countries list for country in countries_list: logging.info(f"{country} ...") + country_fpath=fpath_dict['country_fpath'].format(country) try: if run_bedrock: # call bedrock model and generate user names data - tmp_firstname_country_data, tmp_lastname_country_data = invoke_bedrock(model=bedrock, model_id=model_id, n_user_names=n_user_names, country=country, countrieseurope=countrieseurope) + country_filter = (countrieseurope["name"] == country) + country_population = countrieseurope.loc[country_filter, "population"].iloc[0] + # set n data points for ai generator depending on type + if data_point in ("firstnames", "lastnames"): + n_data_points = int(np.log(country_population)**1.5) + elif data_point == "email_domains": + n_data_points = 5 + else: + raise ValueError(f"Invalid parameter data_point value {data_point}") + # invoke bedrock and generate data points + tmp_gen_country_data = invoke_bedrock( + model=bedrock, + model_id=model_id, + data_point=data_point, + n_data_points=n_data_points, + country=country, + countrieseurope=countrieseurope, + country_fpath=country_fpath + ) logging.info("Waiting ...") # wait 20 seconds before retrying time.sleep(20) else: - tmp_firstname_country_data = pd.read_csv(cons.fpath_temp_llama_firstnames.format(country=country.lower()), encoding="latin1") - tmp_lastname_country_data = pd.read_csv(cons.fpath_temp_llama_lastnames.format(country=country.lower()), encoding="latin1") + 
tmp_gen_country_data = pd.read_csv(country_fpath, encoding="latin1") # append to user country data - firstname_country_data.append(tmp_firstname_country_data) - lastname_country_data.append(tmp_lastname_country_data) + gen_country_dataframe_list.append(tmp_gen_country_data) except Exception as e: logging.info(e) error_countries.append(country) - # log if any countries failed to generate data if len(error_countries) > 0: logging.info(f"Failed to generated data for countries: {error_countries}") - - # load existing reference data - firstname_country_df = pd.read_csv(cons.fpath_llama_firstnames, encoding="latin1") - lastname_country_df = pd.read_csv(cons.fpath_llama_lastnames, encoding="latin1") - # append to country data lists - firstname_country_data.append(firstname_country_df) - lastname_country_data.append(lastname_country_df) # concatenate user country data together and deduplicate across firstnames and countries - output_firstname_country_df = pd.concat(firstname_country_data, axis=0, ignore_index=True) - output_lastname_country_df = pd.concat(lastname_country_data, axis=0, ignore_index=True) + output_gen_country_dataframe = pd.concat(gen_country_dataframe_list, axis=0, ignore_index=True) # sort and deduplicate output data - output_firstname_country_df = output_firstname_country_df.drop_duplicates(subset=["country","firstnames"]).sort_values(by=["country","firstnames"]) - output_lastname_country_df = output_lastname_country_df.drop_duplicates(subset=["country","lastnames"]).sort_values(by=["country","lastnames"]) - + sort_dedup_cols = ["country",data_point] + output_gen_country_dataframe = output_gen_country_dataframe.drop_duplicates(subset=sort_dedup_cols).sort_values(by=sort_dedup_cols) # write data to disk - if output_firstname_country_df['country'].nunique() == n_countries: - logging.info(f"output_firstname_country_df.shape: {output_firstname_country_df.shape}") - output_firstname_country_df.to_csv(cons.fpath_llama_firstnames, index=False, 
encoding="latin1") + if output_gen_country_dataframe['country'].nunique() == n_countries: + logging.info(f"output_gen_country_dataframe.shape: {output_gen_country_dataframe.shape}") + output_gen_country_dataframe.to_csv(fpath_dict["fpath"], index=False, encoding="latin1") else: logging.info("WARNING Insufficient first name data generated.") - if output_lastname_country_df['country'].nunique() == n_countries: - logging.info(f"output_lastname_country_df.shape: {output_lastname_country_df.shape}") - output_lastname_country_df.to_csv(cons.fpath_llama_lastnames, index=False, encoding="latin1") - else: - logging.info("WARNING Insufficient last name data generated.") lgr = logging.getLogger() lgr.setLevel(logging.INFO) @@ -252,5 +243,6 @@ def main(bedrock, model_id, run_bedrock=False): # create bedrock instance bedrock = Bedrock(bedrock_runtime=bedrock_runtime) # execute main programme - main(bedrock=bedrock, run_bedrock=True, model_id=model_id) + for data_point, fpath_dict in cons.llama_data_point_fpaths.items(): + main(bedrock=bedrock, model_id=model_id, data_point=data_point, fpath_dict=fpath_dict, run_bedrock=True) diff --git a/generator/cons.py b/generator/cons.py index 6fdb599..77a459e 100644 --- a/generator/cons.py +++ b/generator/cons.py @@ -17,6 +17,7 @@ fpath_arch_randomtelecomdata = os.path.join(subdir_data, 'arch', 'RandomTelecomPayments.csv') fpath_temp_llama_firstnames = os.path.join(subdir_data, 'temp', 'llama_firstnames_{country}.csv') fpath_temp_llama_lastnames = os.path.join(subdir_data, 'temp', 'llama_lastnames_{country}.csv') +fpath_temp_llama_email_domains = os.path.join(subdir_data, 'temp', 'llama_email_domains_{country}.csv') fpath_email_domain = os.path.join(subdir_data, 'ref', 'email-domains.csv') fpath_countrycrimeindex = os.path.join(subdir_data, 'ref', 'country_crime_index.csv') fpath_countries_europe = os.path.join(subdir_data, 'ref', 'Countries-Europe.csv') @@ -24,10 +25,17 @@ fpath_lastnames = os.path.join(subdir_data, 'ref', 
'last-names.txt') fpath_llama_firstnames = os.path.join(subdir_data, 'ref', 'llama_firstnames.csv') fpath_llama_lastnames = os.path.join(subdir_data, 'ref', 'llama_lastnames.csv') +fpath_llama_email_domains = os.path.join(subdir_data, 'ref', 'llama_email_domains.csv') fpath_smartphones = os.path.join(subdir_data, 'ref', 'smartphones.csv') fpath_unittest_user_data = os.path.join(subdir_unittest, 'user_data.parquet') fpath_unittest_transaction_data = os.path.join(subdir_unittest, 'transaction_data.parquet') fpath_aws_session_token = os.path.join(subdir_creds,'sessionToken.json') +# set data points generated by llama +llama_data_point_fpaths = { + "firstnames":{"fpath":fpath_llama_firstnames, "country_fpath":fpath_temp_llama_firstnames}, + "lastnames":{"fpath":fpath_llama_lastnames, "country_fpath":fpath_temp_llama_lastnames}, + "email_domain":{"fpath":fpath_llama_email_domains, "country_fpath":fpath_temp_llama_email_domains} + } # set url links to files available online url_european_populations = 'https://raw.githubusercontent.com/ajturner/acetate/master/places/Countries-Europe.csv' From 87cbf96319ff6c9e204f9b8363264ae8735898ce Mon Sep 17 00:00:00 2001 From: Oisin Date: Sun, 25 Jan 2026 12:12:21 +0000 Subject: [PATCH 03/15] Renamed generate bedrock data batch script --- .../batch/{gen_user_names_file.py => gen_bedrock_data.py} | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) rename generator/batch/{gen_user_names_file.py => gen_bedrock_data.py} (99%) diff --git a/generator/batch/gen_user_names_file.py b/generator/batch/gen_bedrock_data.py similarity index 99% rename from generator/batch/gen_user_names_file.py rename to generator/batch/gen_bedrock_data.py index cfce17f..f2af97c 100644 --- a/generator/batch/gen_user_names_file.py +++ b/generator/batch/gen_bedrock_data.py @@ -1,6 +1,5 @@ -# python generator/batch/gen_user_names_file.py +# python generator/batch/gen_bedrock_data.py -import os import json import boto3 from botocore.config import Config @@ -10,7 
+9,6 @@ import unidecode import pandas as pd import numpy as np -from typing import Dict sys.path.append("E:\\GitHub\\RandomTelecomPayments\\generator") From 1baf3eda009632c8a9d07955d68eb713dddb7ccf Mon Sep 17 00:00:00 2001 From: Oisin Date: Tue, 27 Jan 2026 09:36:09 +0000 Subject: [PATCH 04/15] Passing new llama email domain reference file through to User object creation. --- generator/app/gen_random_telecom_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generator/app/gen_random_telecom_data.py b/generator/app/gen_random_telecom_data.py index 28b0283..1c4c103 100644 --- a/generator/app/gen_random_telecom_data.py +++ b/generator/app/gen_random_telecom_data.py @@ -75,7 +75,7 @@ def gen_random_telecom_data( fpath_firstnames=cons.fpath_llama_firstnames, fpath_lastnames=cons.fpath_llama_lastnames, fpath_countries_europe=cons.fpath_countries_europe, - fpath_email_domain =cons.fpath_email_domain + fpath_email_domain =cons.fpath_llama_email_domains ) # generate random entity counts for each user From c827af09ba80843b18685e8915cd83de39b8dcb6 Mon Sep 17 00:00:00 2001 From: Oisin Date: Tue, 27 Jan 2026 09:36:51 +0000 Subject: [PATCH 05/15] Generalised function for generating user firstnames and lastnames using llama bedrock reference data. 
Applying generalised function to handle new llama email domains file --- generator/objects/User.py | 114 +++++++++----------------------------- 1 file changed, 25 insertions(+), 89 deletions(-) diff --git a/generator/objects/User.py b/generator/objects/User.py index 475d79a..2551966 100644 --- a/generator/objects/User.py +++ b/generator/objects/User.py @@ -17,10 +17,10 @@ def __init__( n_user_ids:int, start_date:str, end_date:str, - fpath_firstnames:str=cons.fpath_firstnames, - fpath_lastnames:str=cons.fpath_lastnames, + fpath_firstnames:str=cons.fpath_llama_firstnames, + fpath_lastnames:str=cons.fpath_llama_lastnames, fpath_countries_europe:str=cons.fpath_countries_europe, - fpath_email_domain :str=cons.fpath_email_domain , + fpath_email_domain :str=cons.fpath_llama_email_domains , ): """ The randomly generated user data model object @@ -34,13 +34,13 @@ def __init__( end_date : str The end date to generate users till fpath_firstnames : str - The full file path to the first names reference data, default is cons.fpath_firstnames. + The full file path to the first names reference data, default is cons.fpath_llama_firstnames. fpath_lastnames : str - The full file path to the last names reference data, default is cons.fpath_lastnames. + The full file path to the last names reference data, default is cons.fpath_llama_lastnames. fpath_countries_europe : str The full file path to the europe countries reference data, default is cons.fpath_countries_europe. fpath_email_domain : str - The full file path to the email domain reference data, default is cons.fpath_email_domain . + The full file path to the email domain reference data, default is cons.fpath_llama_email_domains . 
Attributes ---------- @@ -75,111 +75,47 @@ def __init__( self.fpath_firstnames = fpath_firstnames self.fpath_lastnames = fpath_lastnames self.fpath_countries_europe = fpath_countries_europe - self.fpath_email_domain = fpath_email_domain + self.fpath_email_domain = fpath_email_domain self.lam = cons.data_model_poisson_params["user"]["lambda"] self.power = cons.data_model_poisson_params["user"]["power"] self.user_ids_cnts_dict = gen_idhash_cnt_dict(idhash_type="id", n=self.n_user_ids, lam=self.lam, power=self.power) self.user_ids = list(self.user_ids_cnts_dict.keys()) self.user_ids_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.user_ids_cnts_dict) self.user_ids_country_code_dict = gen_country_codes_dict(idhashes=self.user_ids, fpath_countries_europe=self.fpath_countries_europe) - self.user_ids_firstname_dict = self.gen_user_firstname(fpath_firstnames=self.fpath_firstnames) - self.user_ids_lastname_dict = self.gen_user_lastname(fpath_lastnames=self.fpath_lastnames) - self.user_ids_email_domain_dict = self.gen_user_email_domain(fpath_email_domain=self.fpath_email_domain) + self.user_ids_firstname_dict = self.gen_user_bedrock_data(fpath_firstnames=self.fpath_firstnames, sample_column_name="firstnames") + self.user_ids_lastname_dict = self.gen_user_bedrock_data(fpath_lastnames=self.fpath_lastnames, sample_column_name="lastnames") + self.user_ids_email_domain_dict = self.gen_user_bedrock_data(fpath_lastnames=self.fpath_email_domain, sample_column_name="email_domains") self.user_ids_dates_dict = gen_dates_dict(idhashes=self.user_ids, start_date=self.start_date, end_date=self.end_date) @beartype - def gen_user_firstname( + def gen_user_bedrock_data( self, - fpath_firstnames:str, + fpath_bedrock_data:str, + sample_column_name:str, ) -> Dict[str, str]: """ - Generates a dictionary of random user id first names + Generates a dictionary of random user bedrock data, e.g. 
firstnames or lastnames Parameters ---------- - fpath_firstnames : str - The file path to the first names reference file + fpath_bedrock_data : str + The file path to the bedrock data reference file + sample_column_name : str + The column name to sample from in the bedrock data reference file Returns ------- Dict[str, str] - A dictionary of user id first names + A dictionary of user id bedrock data """ # load in list of first names - first_name_data = pd.read_csv(fpath_firstnames) + bedrock_data = pd.read_csv(fpath_bedrock_data) # randomly sample names firstnames according to country code and counts country_code_dataframe = pd.Series(self.user_ids_country_code_dict, name="country_code").to_frame().reset_index().rename(columns={"index":"user_ids"}).assign(count=1) country_codes_cnt = country_code_dataframe.groupby(by="country_code").agg({"user_ids":list,"count":"sum"}).reset_index() - country_codes_cnt["names"] = country_codes_cnt.apply(lambda series: first_name_data.loc[(first_name_data["ISO numeric"] == series["country_code"]), "firstnames"].sample(n=series["count"], replace=True).to_list(), axis=1) - # create the key value pairs mapping user id to firstname - user_ids_names_pairs = country_codes_cnt.apply(lambda series: dict(zip(series["user_ids"], series["names"])), axis=1).to_list() + country_codes_cnt["sample"] = country_codes_cnt.apply(lambda series: bedrock_data.loc[(bedrock_data["ISO numeric"] == series["country_code"]), sample_column_name].sample(n=series["count"], replace=True).to_list(), axis=1) + # create the key value pairs mapping user id to bedrock data points + user_ids_bedrock_pairs = country_codes_cnt.apply(lambda series: dict(zip(series["user_ids"], series["sample"])), axis=1).to_list() # convert key value pairs to dict - user_ids_firstname_dict = pd.concat([pd.Series(d) for d in user_ids_names_pairs])[country_code_dataframe["user_ids"]].to_dict() - return user_ids_firstname_dict - - @beartype - def gen_user_lastname( - self, - 
fpath_lastnames:str, - ) -> Dict[str, str]: - """ - Generates a dictionary of random user id last names. - - Parameters - ---------- - fpath_lastnames : str - The file path to the last names reference file. - - Returns - ------- - Dict[str, str] - A dictionary of user id last names. - """ - # load in list of last names - last_name_data = pd.read_csv(fpath_lastnames) - # randomly sample names firstnames according to country code and counts - country_code_dataframe = pd.Series(self.user_ids_country_code_dict, name="country_code").to_frame().reset_index().rename(columns={"index":"user_ids"}).assign(count=1) - country_codes_cnt = country_code_dataframe.groupby(by="country_code").agg({"user_ids":list,"count":"sum"}).reset_index() - country_codes_cnt["names"] = country_codes_cnt.apply(lambda series: last_name_data.loc[(last_name_data["ISO numeric"] == series["country_code"]), "lastnames"].sample(n=series["count"], replace=True).to_list(), axis=1) - # create the key value pairs mapping user id to firstname - user_ids_names_pairs = country_codes_cnt.apply(lambda series: dict(zip(series["user_ids"], series["names"])), axis=1).to_list() - # convert key value pairs to dict - user_ids_lastname_dict = pd.concat([pd.Series(d) for d in user_ids_names_pairs])[country_code_dataframe["user_ids"]].to_dict() - return user_ids_lastname_dict - - @beartype - def gen_user_email_domain( - self, - fpath_email_domain:str, - ) -> Dict[str, str]: - """ - Generates a dictionary of random user id email domains - - Parameters - ---------- - fpath_email_domain : str - The file path to the email domains reference file - - Returns - ------- - Dict[str, str] - A dictionary of user id email domains - """ - # load domain names data - email_domain_data = pd.read_csv(fpath_email_domain, index_col=0) - # calculate the proportion of email domains - email_domain_data["proportion"] = email_domain_data["proportion"].divide(email_domain_data["proportion"].sum()) - # convert email domain proportions to a 
dictionary - email_domain_dict = email_domain_data.set_index("domain").to_dict()["proportion"] - # randomly choose the email domains based on proportions - user_email_domain_list = list( - np.random.choice( - a=list(email_domain_dict.keys()), - p=list(email_domain_dict.values()), - replace=True, - size=len(self.user_ids), - ) - ) - # return the user ids email domains - user_ids_email_domain_dict = dict(zip(self.user_ids, user_email_domain_list)) - return user_ids_email_domain_dict + user_ids_bedrock_dict = pd.concat([pd.Series(d) for d in user_ids_bedrock_pairs])[country_code_dataframe["user_ids"]].to_dict() + return user_ids_bedrock_dict \ No newline at end of file From b3a3c60b2720799a38b561d7a9b1b75621feedde Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 09:08:32 +0000 Subject: [PATCH 06/15] Readded gen domain email function to user object. Replace firstname and lastname with first_name and last_name --- generator/app/gen_random_telecom_data.py | 7 +- generator/app/gen_trans_data.py | 9 +-- generator/app/gen_user_data.py | 4 +- generator/batch/gen_bedrock_data.py | 14 ++-- generator/cons.py | 33 ++++---- generator/objects/User.py | 78 ++++++++++++++----- .../unittests/app/test_gen_user_trans_data.py | 8 +- generator/unittests/objects/test_User.py | 42 +++++----- .../utilities/test_gen_obj_idhash_series.py | 6 +- .../test_gen_random_entity_counts.py | 6 +- 10 files changed, 124 insertions(+), 83 deletions(-) diff --git a/generator/app/gen_random_telecom_data.py b/generator/app/gen_random_telecom_data.py index 1c4c103..363dc19 100644 --- a/generator/app/gen_random_telecom_data.py +++ b/generator/app/gen_random_telecom_data.py @@ -72,10 +72,11 @@ def gen_random_telecom_data( n_user_ids=programmeparams.n_users, start_date=programmeparams.registration_start_date, end_date=programmeparams.registration_end_date, - fpath_firstnames=cons.fpath_llama_firstnames, - fpath_lastnames=cons.fpath_llama_lastnames, + fpath_first_names=cons.fpath_llama_first_names, + 
fpath_last_names=cons.fpath_llama_last_names, fpath_countries_europe=cons.fpath_countries_europe, - fpath_email_domain =cons.fpath_llama_email_domains + fpath_email_domain=cons.fpath_email_domain, + fpath_bedrock_email_domain=cons.fpath_llama_email_domains ) # generate random entity counts for each user diff --git a/generator/app/gen_trans_data.py b/generator/app/gen_trans_data.py index 363941d..e9f8bcc 100644 --- a/generator/app/gen_trans_data.py +++ b/generator/app/gen_trans_data.py @@ -118,14 +118,7 @@ def gen_trans_data( trans_data[['transaction_status', 'transaction_error_code']] = trans_data.apply(lambda series: gen_trans_status(series = series, rejection_rates_dict = rejection_rates_dict), result_type = 'expand', axis = 1) # order columns and sort rows by transaction date - user_cols = ['userid', 'firstname', 'lastname', 'registration_date', 'registration_country_code', 'uid', 'email_domain'] - device_cols = ['device_hash', 'device_type'] - card_cols = ['card_hash', 'card_type', 'card_country_code'] - ip_cols = ['ip_hash', 'ip_country_code'] - app_cols = ['application_hash'] - trans_cols = ['transaction_hash', 'transaction_date', 'transaction_amount', 'transaction_payment_method', 'card_payment_channel', 'transaction_status', 'transaction_error_code'] - itr_cols = ['itr_hash'] - col_order = user_cols + device_cols + card_cols + ip_cols + app_cols + trans_cols + itr_cols + col_order = cons.user_cols + cons. 
device_cols + cons.card_cols + cons.ip_cols + cons.app_cols + cons.trans_cols + cons.itr_cols trans_data = trans_data[col_order].sort_values(by = 'transaction_date').reset_index(drop = True) return trans_data \ No newline at end of file diff --git a/generator/app/gen_user_data.py b/generator/app/gen_user_data.py index 9c5fa94..96cc651 100644 --- a/generator/app/gen_user_data.py +++ b/generator/app/gen_user_data.py @@ -50,8 +50,8 @@ def gen_user_data( # take a deep copy of the data user_data = random_entity_counts.copy() # add user data - user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_firstname_dict, idhash_key_name='uid', idhash_val_name='firstname') - user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_lastname_dict, idhash_key_name='uid', idhash_val_name='lastname') + user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_first_name_dict, idhash_key_name='uid', idhash_val_name='first_name') + user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_last_name_dict, idhash_key_name='uid', idhash_val_name='last_name') user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_dates_dict, idhash_key_name='uid', idhash_val_name='registration_date') user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_country_code_dict, idhash_key_name='uid', idhash_val_name='registration_country_code_alpha') user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_email_domain_dict, idhash_key_name='uid', idhash_val_name='email_domain') diff --git a/generator/batch/gen_bedrock_data.py b/generator/batch/gen_bedrock_data.py index f2af97c..241828b 100644 --- a/generator/batch/gen_bedrock_data.py +++ b/generator/batch/gen_bedrock_data.py @@ -51,7 +51,7 @@ system_email_prompt = """ """ -firstname_prompt = 'Generate {n_data_points} first names for people from the country "{country}"' +first_name_prompt = 'Generate 
{n_data_points} first names for people from the country "{country}"' surname_prompt = 'Generate {n_data_points} last names for people from the country "{country}"' email_domain_prompt = 'Generate {n_data_points} popular email domains names for people from the country "{country}"' @@ -102,13 +102,13 @@ def invoke_bedrock( ------- tuple: A tuple containing two pandas DataFrames: - - tmp_firstname_country_data (pd.DataFrame): DataFrame with deduplicated and standardized first names along with country information. - - tmp_lastname_country_data (pd.DataFrame): DataFrame with deduplicated and standardized last names along with country information. + - tmp_first_name_country_data (pd.DataFrame): DataFrame with deduplicated and standardized first names along with country information. + - tmp_last_name_country_data (pd.DataFrame): DataFrame with deduplicated and standardized last names along with country information. Raises ------ json.JSONDecodeError: If the model response cannot be parsed as JSON. - KeyError: If the expected keys ("firstnames", "lastnames") are missing from the JSON response. + KeyError: If the expected keys ("first_names", "last_names") are missing from the JSON response. Exception: If the merge with country data fails or file I/O operations encounter errors. 
Notes @@ -157,7 +157,7 @@ def main(bedrock, model_id, data_point, fpath_dict, run_bedrock=False): """ Docstring for main """ - # load countries, firstnames and surnames files + # load countries, first_names and surnames files countrieseurope = pd.read_csv(cons.fpath_countries_europe, usecols=['name', 'ISO numeric']) n_countries = countrieseurope.shape[0] # set lists to collect generated data with @@ -175,7 +175,7 @@ def main(bedrock, model_id, data_point, fpath_dict, run_bedrock=False): country_filter = (countrieseurope["name"] == country) country_population = countrieseurope.loc[country_filter, "population"].iloc[0] # set n data points for ai generator depending on type - if data_point in ("firstnames", "lastnames"): + if data_point in ("first_names", "last_names"): n_data_points = int(np.log(country_population)**1.5) elif data_point == "email_domains": n_data_points = 5 @@ -204,7 +204,7 @@ def main(bedrock, model_id, data_point, fpath_dict, run_bedrock=False): # log if any countries failed to generate data if len(error_countries) > 0: logging.info(f"Failed to generated data for countries: {error_countries}") - # concatenate user country data together and deduplicate across firstnames and countries + # concatenate user country data together and deduplicate across first_names and countries output_gen_country_dataframe = pd.concat(gen_country_dataframe_list, axis=0, ignore_index=True) # sort and deduplicate output data sort_dedup_cols = ["country",data_point] diff --git a/generator/cons.py b/generator/cons.py index 77a459e..de5225f 100644 --- a/generator/cons.py +++ b/generator/cons.py @@ -15,16 +15,16 @@ fpath_randomtelecomtransdata = os.path.join(subdir_data,'RandomTelecomPayments.csv') fpath_randomtelecomusersdata = os.path.join(subdir_data,'RandomTelecomUsers.parquet') fpath_arch_randomtelecomdata = os.path.join(subdir_data, 'arch', 'RandomTelecomPayments.csv') -fpath_temp_llama_firstnames = os.path.join(subdir_data, 'temp', 'llama_firstnames_{country}.csv') 
-fpath_temp_llama_lastnames = os.path.join(subdir_data, 'temp', 'llama_lastnames_{country}.csv') +fpath_temp_llama_first_names = os.path.join(subdir_data, 'temp', 'llama_first_names_{country}.csv') +fpath_temp_llama_last_names = os.path.join(subdir_data, 'temp', 'llama_last_names_{country}.csv') fpath_temp_llama_email_domains = os.path.join(subdir_data, 'temp', 'llama_email_domains_{country}.csv') fpath_email_domain = os.path.join(subdir_data, 'ref', 'email-domains.csv') fpath_countrycrimeindex = os.path.join(subdir_data, 'ref', 'country_crime_index.csv') fpath_countries_europe = os.path.join(subdir_data, 'ref', 'Countries-Europe.csv') -fpath_firstnames = os.path.join(subdir_data, 'ref', 'first-names.txt') -fpath_lastnames = os.path.join(subdir_data, 'ref', 'last-names.txt') -fpath_llama_firstnames = os.path.join(subdir_data, 'ref', 'llama_firstnames.csv') -fpath_llama_lastnames = os.path.join(subdir_data, 'ref', 'llama_lastnames.csv') +fpath_first_names = os.path.join(subdir_data, 'ref', 'first-names.txt') +fpath_last_names = os.path.join(subdir_data, 'ref', 'last-names.txt') +fpath_llama_first_names = os.path.join(subdir_data, 'ref', 'llama_first_names.csv') +fpath_llama_last_names = os.path.join(subdir_data, 'ref', 'llama_last_names.csv') fpath_llama_email_domains = os.path.join(subdir_data, 'ref', 'llama_email_domains.csv') fpath_smartphones = os.path.join(subdir_data, 'ref', 'smartphones.csv') fpath_unittest_user_data = os.path.join(subdir_unittest, 'user_data.parquet') @@ -32,8 +32,8 @@ fpath_aws_session_token = os.path.join(subdir_creds,'sessionToken.json') # set data points generated by llama llama_data_point_fpaths = { - "firstnames":{"fpath":fpath_llama_firstnames, "country_fpath":fpath_temp_llama_firstnames}, - "lastnames":{"fpath":fpath_llama_lastnames, "country_fpath":fpath_temp_llama_lastnames}, + "first_names":{"fpath":fpath_llama_first_names, "country_fpath":fpath_temp_llama_first_names}, + "last_names":{"fpath":fpath_llama_last_names, 
"country_fpath":fpath_temp_llama_last_names}, "email_domain":{"fpath":fpath_llama_email_domains, "country_fpath":fpath_temp_llama_email_domains} } @@ -84,11 +84,11 @@ data_model_poisson_params = {'user':{'lambda':20, 'power':1}, 'device':{'lambda':0.2, 'power':2}, 'card':{'lambda':0.1, 'power':2}, 'ip':{'lambda':1.3, 'power':2}, 'application':{'lambda':1, 'power':2}, 'transaction':{'lambda':5, 'power':2}} data_model_shared_entities_dict = {'ip':0.05, 'card':0.005, 'device':0.01} data_model_null_rates = {'card':0.05} -data_model_card_types_dict = {'visa':0.5, 'mastercard':0.5} -data_model_payment_channels = {'paypal':0.4, 'adyen':0.15, 'appstore':0.25, 'worldpay':0.15, 'docomo':0.05} -data_model_transaction_status = {'successful':0.94, 'pending':0.03, 'rejected':0.03} +data_model_card_types_dict = {'Visa':0.5, 'Mastercard':0.5} +data_model_payment_channels = {'PayPal':0.4, 'Adyen':0.15, 'AppStore':0.25, 'WorldPay':0.15, 'Docomo':0.05} +data_model_transaction_status = {'Successful':0.94, 'Pending':0.03, 'Rejected':0.03} data_model_inconsistent_country_codes_rejection_rate = {1:0.001, 2:0.005, 3:0.01} -data_model_non_card_trans_methods = {'wallet':0.95, 'points':0.05} +data_model_non_card_trans_methods = {'Wallet':0.95, 'Points':0.05} data_model_rejection_codes_fraud = {'E900:ConnectionTimeout':0.1, 'E901:SuspectedFraud':0.55, 'E902:AuthenicationFailure':0.2, 'E903:UserCancelled':0.05, 'E904:InsufficientFunds':0.1} data_model_rejection_codes_connection = {'E900:ConnectionTimeout':0.45, 'E901:SuspectedFraud':0.1, 'E902:AuthenicationFailure':0.2, 'E903:UserCancelled':0.15, 'E904:InsufficientFunds':0.1} data_model_rejection_codes_user = {'E900:ConnectionTimeout':0.05, 'E901:SuspectedFraud':0.1, 'E902:AuthenicationFailure':0.1, 'E903:UserCancelled':0.45, 'E904:InsufficientFunds':0.3} @@ -96,4 +96,11 @@ data_model_rejection_codes_authentication = {'E900:ConnectionTimeout':0.25, 'E901:SuspectedFraud':0.05, 'E902:AuthenicationFailure':0.45, 'E903:UserCancelled':0.15, 
'E904:InsufficientFunds':0.1} # set lists of generator object types -object_types = ["device","card","ip","transaction","application"] \ No newline at end of file +object_types = ["device","card","ip","transaction","application"] +user_cols = ['userid', 'first_name', 'last_name', 'registration_date', 'registration_country_code', 'uid', 'email_domain'] +device_cols = ['device_hash', 'device_type'] +card_cols = ['card_hash', 'card_type', 'card_country_code'] +ip_cols = ['ip_hash', 'ip_country_code'] +app_cols = ['application_hash'] +trans_cols = ['transaction_hash', 'transaction_date', 'transaction_amount', 'transaction_payment_method', 'card_payment_channel', 'transaction_status', 'transaction_error_code'] +itr_cols = ['itr_hash'] \ No newline at end of file diff --git a/generator/objects/User.py b/generator/objects/User.py index 2551966..40139b8 100644 --- a/generator/objects/User.py +++ b/generator/objects/User.py @@ -17,10 +17,11 @@ def __init__( n_user_ids:int, start_date:str, end_date:str, - fpath_firstnames:str=cons.fpath_llama_firstnames, - fpath_lastnames:str=cons.fpath_llama_lastnames, + fpath_first_names:str=cons.fpath_llama_first_names, + fpath_last_names:str=cons.fpath_llama_last_names, fpath_countries_europe:str=cons.fpath_countries_europe, - fpath_email_domain :str=cons.fpath_llama_email_domains , + fpath_email_domain:str=cons.fpath_email_domain, + fpath_bedrock_email_domain:str=cons.fpath_llama_email_domains, ): """ The randomly generated user data model object @@ -33,10 +34,10 @@ def __init__( The start date to generate users from end_date : str The end date to generate users till - fpath_firstnames : str - The full file path to the first names reference data, default is cons.fpath_llama_firstnames. - fpath_lastnames : str - The full file path to the last names reference data, default is cons.fpath_llama_lastnames. + fpath_first_names : str + The full file path to the first names reference data, default is cons.fpath_llama_first_names. 
+ fpath_last_names : str + The full file path to the last names reference data, default is cons.fpath_llama_last_names. fpath_countries_europe : str The full file path to the europe countries reference data, default is cons.fpath_countries_europe. fpath_email_domain : str @@ -58,9 +59,9 @@ def __init__( The user id counts dictionary user_ids_props_dict : Dict[str, float] The user id proportions dictionary - user_ids_firstname_dict : Dict[str, str] + user_ids_first_name_dict : Dict[str, str] The user id first names dictionary - user_ids_lastname_dict : Dict[str, str] + user_ids_last_name_dict : Dict[str, str] The user id last names dictionary user_ids_country_code_dict : Dict[str, str] The user id country codes dictionary @@ -72,29 +73,30 @@ def __init__( self.n_user_ids = n_user_ids self.start_date = start_date self.end_date = end_date - self.fpath_firstnames = fpath_firstnames - self.fpath_lastnames = fpath_lastnames + self.fpath_first_names = fpath_first_names + self.fpath_last_names = fpath_last_names self.fpath_countries_europe = fpath_countries_europe self.fpath_email_domain = fpath_email_domain + self.fpath_bedrock_email_domain = fpath_bedrock_email_domain self.lam = cons.data_model_poisson_params["user"]["lambda"] self.power = cons.data_model_poisson_params["user"]["power"] self.user_ids_cnts_dict = gen_idhash_cnt_dict(idhash_type="id", n=self.n_user_ids, lam=self.lam, power=self.power) self.user_ids = list(self.user_ids_cnts_dict.keys()) self.user_ids_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.user_ids_cnts_dict) self.user_ids_country_code_dict = gen_country_codes_dict(idhashes=self.user_ids, fpath_countries_europe=self.fpath_countries_europe) - self.user_ids_firstname_dict = self.gen_user_bedrock_data(fpath_firstnames=self.fpath_firstnames, sample_column_name="firstnames") - self.user_ids_lastname_dict = self.gen_user_bedrock_data(fpath_lastnames=self.fpath_lastnames, sample_column_name="lastnames") - self.user_ids_email_domain_dict = 
self.gen_user_bedrock_data(fpath_lastnames=self.fpath_email_domain, sample_column_name="email_domains") + self.user_ids_first_name_dict = self.gen_user_bedrock_name_data(fpath_bedrock_data=self.fpath_first_names, sample_column_name="first_names") + self.user_ids_last_name_dict = self.gen_user_bedrock_name_data(fpath_bedrock_data=self.fpath_last_names, sample_column_name="last_names") + self.user_ids_email_domain_dict = self.gen_user_bedrock_email_domain(fpath_email_domain=self.fpath_email_domain, fpath_bedrock_email_domain=self.fpath_bedrock_email_domain) self.user_ids_dates_dict = gen_dates_dict(idhashes=self.user_ids, start_date=self.start_date, end_date=self.end_date) @beartype - def gen_user_bedrock_data( + def gen_user_bedrock_name_data( self, fpath_bedrock_data:str, sample_column_name:str, ) -> Dict[str, str]: """ - Generates a dictionary of random user bedrock data, e.g. firstnames or lastnames + Generates a dictionary of random user bedrock data, e.g. first_names or last_names Parameters ---------- @@ -110,12 +112,50 @@ def gen_user_bedrock_data( """ # load in list of first names bedrock_data = pd.read_csv(fpath_bedrock_data) - # randomly sample names firstnames according to country code and counts + # randomly sample names first_names according to country code and counts country_code_dataframe = pd.Series(self.user_ids_country_code_dict, name="country_code").to_frame().reset_index().rename(columns={"index":"user_ids"}).assign(count=1) country_codes_cnt = country_code_dataframe.groupby(by="country_code").agg({"user_ids":list,"count":"sum"}).reset_index() - country_codes_cnt["sample"] = country_codes_cnt.apply(lambda series: bedrock_data.loc[(bedrock_data["ISO numeric"] == series["country_code"]), sample_column_name].sample(n=series["count"],
replace=True, weights=None).to_list(), axis=1) # create the key value pairs mapping user id to bedrock data points user_ids_bedrock_pairs = country_codes_cnt.apply(lambda series: dict(zip(series["user_ids"], series["sample"])), axis=1).to_list() # convert key value pairs to dict user_ids_bedrock_dict = pd.concat([pd.Series(d) for d in user_ids_bedrock_pairs])[country_code_dataframe["user_ids"]].to_dict() - return user_ids_bedrock_dict \ No newline at end of file + return user_ids_bedrock_dict + + @beartype + def gen_user_bedrock_email_domain( + self, + fpath_email_domain:str, + fpath_bedrock_email_domain:str, + ) -> Dict[str, str]: + """ + Generates a dictionary of random user id email domains + + Parameters + ---------- + fpath_email_domain : str + The file path to the email domains reference file + + Returns + ------- + Dict[str, str] + A dictionary of user id email domains + """ + # load domain names data + email_domain_data = pd.read_csv(fpath_email_domain, index_col=0) + # calculate the proportion of email domains + email_domain_data["proportion"] = email_domain_data["proportion"].divide(email_domain_data["proportion"].sum()) + # convert email domain proportions to a dictionary + email_domain_dict = email_domain_data.set_index("domain").to_dict()["proportion"] + # randomly choose the email domains based on proportions + user_email_domain_list = list( + np.random.choice( + a=list(email_domain_dict.keys()), + p=list(email_domain_dict.values()), + replace=True, + size=len(self.user_ids), + ) + ) + # return the user ids email domains + user_ids_email_domain_dict = dict(zip(self.user_ids, user_email_domain_list)) + return user_ids_email_domain_dict \ No newline at end of file diff --git a/generator/unittests/app/test_gen_user_trans_data.py b/generator/unittests/app/test_gen_user_trans_data.py index 540bb1c..6bef4c7 100644 --- a/generator/unittests/app/test_gen_user_trans_data.py +++ b/generator/unittests/app/test_gen_user_trans_data.py @@ -34,8 +34,8 @@ 
np.random.seed(seed=programmeparams.random_seed) # create relative file paths -fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1] -fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1] +fpath_first_names = '.' + cons.fpath_llama_first_names.split(cons.fpath_repo_dir)[1] +fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1] fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] fpath_smartphones = '.' + cons.fpath_smartphones.split(cons.fpath_repo_dir)[1] @@ -48,8 +48,8 @@ n_user_ids=programmeparams.n_users, start_date=programmeparams.registration_start_date, end_date=programmeparams.registration_end_date, - fpath_firstnames=fpath_firstnames, - fpath_lastnames=fpath_lastnames, + fpath_first_names=fpath_first_names, + fpath_last_names=fpath_last_names, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain ) diff --git a/generator/unittests/objects/test_User.py b/generator/unittests/objects/test_User.py index 92a30bd..4471a71 100644 --- a/generator/unittests/objects/test_User.py +++ b/generator/unittests/objects/test_User.py @@ -21,13 +21,13 @@ "4264861381989413": 0.20212765957446807, "6720317315593519": 0.2765957446808511, } -exp_user_ids_firstname_dict = { +exp_user_ids_first_name_dict = { "6374692674377254": "simone", "1751409580926382": "francesca", "4264861381989413": "igor", "6720317315593519": "beckett", } -exp_user_ids_lastname_dict = { +exp_user_ids_last_name_dict = { "6374692674377254": "de filippo", "1751409580926382": "gagliardi", "4264861381989413": "lupu", @@ -59,16 +59,16 @@ random.seed(cons.unittest_seed) np.random.seed(cons.unittest_seed) -fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1] -fpath_lastnames = '.' 
+ cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1] +fpath_first_names = '.' + cons.fpath_llama_first_names.split(cons.fpath_repo_dir)[1] +fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1] fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] -user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) +user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_first_names=fpath_first_names, fpath_last_names=fpath_last_names, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) obs_user_ids_cnts_dict = user_object.user_ids_cnts_dict obs_user_ids_props_dict = user_object.user_ids_props_dict -obs_user_ids_firstname_dict = user_object.user_ids_firstname_dict -obs_user_ids_lastname_dict = user_object.user_ids_lastname_dict +obs_user_ids_first_name_dict = user_object.user_ids_first_name_dict +obs_user_ids_last_name_dict = user_object.user_ids_last_name_dict obs_user_ids_country_code_dict = user_object.user_ids_country_code_dict obs_user_ids_email_domain_dict = user_object.user_ids_email_domain_dict obs_user_ids_dates_dict = user_object.user_ids_dates_dict @@ -86,10 +86,10 @@ def setUp(self): self.obs_user_ids_cnts_dict = obs_user_ids_cnts_dict self.exp_user_ids_props_dict = exp_user_ids_props_dict self.obs_user_ids_props_dict = obs_user_ids_props_dict - self.exp_user_ids_firstname_dict = exp_user_ids_firstname_dict - self.obs_user_ids_firstname_dict = obs_user_ids_firstname_dict - self.exp_user_ids_lastname_dict = exp_user_ids_lastname_dict - self.obs_user_ids_lastname_dict = obs_user_ids_lastname_dict + self.exp_user_ids_first_name_dict = 
exp_user_ids_first_name_dict + self.obs_user_ids_first_name_dict = obs_user_ids_first_name_dict + self.exp_user_ids_last_name_dict = exp_user_ids_last_name_dict + self.obs_user_ids_last_name_dict = obs_user_ids_last_name_dict self.exp_user_ids_country_code_dict = exp_user_ids_country_code_dict self.obs_user_ids_country_code_dict = obs_user_ids_country_code_dict self.exp_user_ids_email_domain_dict = exp_user_ids_email_domain_dict @@ -108,8 +108,8 @@ def setUp(self): def test_type(self): self.assertEqual(type(self.obs_user_ids_cnts_dict), type(self.exp_user_ids_cnts_dict)) self.assertEqual(type(self.obs_user_ids_props_dict), type(self.exp_user_ids_props_dict)) - self.assertEqual(type(self.obs_user_ids_firstname_dict),type(self.exp_user_ids_firstname_dict),) - self.assertEqual(type(self.obs_user_ids_lastname_dict), type(self.exp_user_ids_lastname_dict)) + self.assertEqual(type(self.obs_user_ids_first_name_dict),type(self.exp_user_ids_first_name_dict),) + self.assertEqual(type(self.obs_user_ids_last_name_dict), type(self.exp_user_ids_last_name_dict)) self.assertEqual(type(self.obs_user_ids_country_code_dict),type(self.exp_user_ids_country_code_dict),) self.assertEqual(type(self.obs_user_ids_email_domain_dict),type(self.exp_user_ids_email_domain_dict),) self.assertEqual(type(self.obs_user_ids_dates_dict), type(self.exp_user_ids_dates_dict)) @@ -121,8 +121,8 @@ def test_type(self): def test_len(self): self.assertEqual(len(self.obs_user_ids_cnts_dict), len(self.exp_user_ids_cnts_dict)) self.assertEqual(len(self.obs_user_ids_props_dict), len(self.exp_user_ids_props_dict)) - self.assertEqual(len(self.obs_user_ids_firstname_dict), len(self.exp_user_ids_firstname_dict)) - self.assertEqual(len(self.obs_user_ids_lastname_dict), len(self.exp_user_ids_lastname_dict)) + self.assertEqual(len(self.obs_user_ids_first_name_dict), len(self.exp_user_ids_first_name_dict)) + self.assertEqual(len(self.obs_user_ids_last_name_dict), len(self.exp_user_ids_last_name_dict)) 
self.assertEqual(len(self.obs_user_ids_country_code_dict),len(self.exp_user_ids_country_code_dict),) self.assertEqual(len(self.obs_user_ids_email_domain_dict),len(self.exp_user_ids_email_domain_dict),) self.assertEqual(len(self.obs_user_ids_dates_dict), len(self.exp_user_ids_dates_dict)) @@ -130,8 +130,8 @@ def test_len(self): def test_keys(self): self.assertEqual(list(self.obs_user_ids_cnts_dict.keys()),list(self.exp_user_ids_cnts_dict.keys()),) self.assertEqual(list(self.obs_user_ids_props_dict.keys()),list(self.exp_user_ids_props_dict.keys()),) - self.assertEqual(list(self.obs_user_ids_firstname_dict.keys()),list(self.exp_user_ids_firstname_dict.keys()),) - self.assertEqual(list(self.obs_user_ids_lastname_dict.keys()),list(self.exp_user_ids_lastname_dict.keys()),) + self.assertEqual(list(self.obs_user_ids_first_name_dict.keys()),list(self.exp_user_ids_first_name_dict.keys()),) + self.assertEqual(list(self.obs_user_ids_last_name_dict.keys()),list(self.exp_user_ids_last_name_dict.keys()),) self.assertEqual(list(self.obs_user_ids_country_code_dict.keys()),list(self.exp_user_ids_country_code_dict.keys()),) self.assertEqual(list(self.obs_user_ids_email_domain_dict.keys()),list(self.exp_user_ids_email_domain_dict.keys()),) self.assertEqual(list(self.obs_user_ids_dates_dict.keys()),list(self.exp_user_ids_dates_dict.keys()),) @@ -139,8 +139,8 @@ def test_keys(self): def test_values(self): self.assertEqual(list(self.obs_user_ids_cnts_dict.values()),list(self.exp_user_ids_cnts_dict.values()),) self.assertEqual(list(self.obs_user_ids_props_dict.values()),list(self.exp_user_ids_props_dict.values()),) - self.assertEqual(list(self.obs_user_ids_firstname_dict.values()),list(self.exp_user_ids_firstname_dict.values()),) - self.assertEqual(list(self.obs_user_ids_lastname_dict.values()),list(self.exp_user_ids_lastname_dict.values()),) + self.assertEqual(list(self.obs_user_ids_first_name_dict.values()),list(self.exp_user_ids_first_name_dict.values()),) + 
self.assertEqual(list(self.obs_user_ids_last_name_dict.values()),list(self.exp_user_ids_last_name_dict.values()),) self.assertEqual(list(self.obs_user_ids_country_code_dict.values()),list(self.exp_user_ids_country_code_dict.values()),) self.assertEqual(list(self.obs_user_ids_email_domain_dict.values()),list(self.exp_user_ids_email_domain_dict.values()),) self.assertEqual(list(self.obs_user_ids_dates_dict.values()),list(self.exp_user_ids_dates_dict.values()),) @@ -148,8 +148,8 @@ def test_values(self): def test_object(self): self.assertEqual(self.obs_user_ids_cnts_dict, self.exp_user_ids_cnts_dict) self.assertEqual(self.obs_user_ids_props_dict, self.exp_user_ids_props_dict) - self.assertEqual(self.obs_user_ids_firstname_dict, self.exp_user_ids_firstname_dict) - self.assertEqual(self.obs_user_ids_lastname_dict, self.exp_user_ids_lastname_dict) + self.assertEqual(self.obs_user_ids_first_name_dict, self.exp_user_ids_first_name_dict) + self.assertEqual(self.obs_user_ids_last_name_dict, self.exp_user_ids_last_name_dict) self.assertEqual(self.obs_user_ids_country_code_dict, self.exp_user_ids_country_code_dict) self.assertEqual(self.obs_user_ids_email_domain_dict, self.exp_user_ids_email_domain_dict) self.assertEqual(self.obs_user_ids_dates_dict, self.exp_user_ids_dates_dict) diff --git a/generator/unittests/utilities/test_gen_obj_idhash_series.py b/generator/unittests/utilities/test_gen_obj_idhash_series.py index 18faa86..7e3eabd 100644 --- a/generator/unittests/utilities/test_gen_obj_idhash_series.py +++ b/generator/unittests/utilities/test_gen_obj_idhash_series.py @@ -20,8 +20,8 @@ start_date = cons.unittest_registration_start_date end_date = cons.unittest_registration_end_date n_user_ids = cons.unittest_n_entities -fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1] -fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1] +fpath_first_names = '.' 
+ cons.fpath_llama_first_names.split(cons.fpath_repo_dir)[1] +fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1] fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] fpath_smartphones = '.' + cons.fpath_smartphones.split(cons.fpath_repo_dir)[1] @@ -30,7 +30,7 @@ np.random.seed(cons.unittest_seed) # create user object -user_object = User(n_user_ids=n_user_ids, start_date=start_date, end_date=end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) +user_object = User(n_user_ids=n_user_ids, start_date=start_date, end_date=end_date, fpath_first_names=fpath_first_names, fpath_last_names=fpath_last_names, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) # generate random entity counts random_entity_counts = gen_random_entity_counts(user_obj=user_object) # generate random entity values diff --git a/generator/unittests/utilities/test_gen_random_entity_counts.py b/generator/unittests/utilities/test_gen_random_entity_counts.py index 58a5522..45c8d27 100644 --- a/generator/unittests/utilities/test_gen_random_entity_counts.py +++ b/generator/unittests/utilities/test_gen_random_entity_counts.py @@ -19,11 +19,11 @@ random.seed(cons.unittest_seed) np.random.seed(cons.unittest_seed) -fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1] -fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1] +fpath_first_names = '.' + cons.fpath_llama_first_names.split(cons.fpath_repo_dir)[1] +fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1] fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] fpath_email_domain = '.' 
+ cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] -user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) +user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_first_names=fpath_first_names, fpath_last_names=fpath_last_names, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) exp_randomentity_counts_dict = { 'uid': ['6374692674377254', '6720317315593519', '4264861381989413', '1751409580926382'], From 31593e227503f76d5f38982ed95bc8b8fed1b8c2 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 09:36:59 +0000 Subject: [PATCH 07/15] Updated names reference files and unittest data --- ...a_firstnames.csv => llama_first_names.csv} | 2 +- ...ama_lastnames.csv => llama_last_names.csv} | 2 +- data/unittest/transaction_data.parquet | Bin 30585 -> 30616 bytes data/unittest/user_data.parquet | Bin 18980 -> 18988 bytes 4 files changed, 2 insertions(+), 2 deletions(-) rename data/ref/{llama_firstnames.csv => llama_first_names.csv} (99%) rename data/ref/{llama_lastnames.csv => llama_last_names.csv} (99%) diff --git a/data/ref/llama_firstnames.csv b/data/ref/llama_first_names.csv similarity index 99% rename from data/ref/llama_firstnames.csv rename to data/ref/llama_first_names.csv index f7b0e44..4668401 100644 --- a/data/ref/llama_firstnames.csv +++ b/data/ref/llama_first_names.csv @@ -1,4 +1,4 @@ -firstnames,country,ISO numeric +first_names,country,ISO numeric agrina,Albania,8 agron,Albania,8 albana,Albania,8 diff --git a/data/ref/llama_lastnames.csv b/data/ref/llama_last_names.csv similarity index 99% rename from data/ref/llama_lastnames.csv rename to data/ref/llama_last_names.csv index 9808434..a3d215e 100644 --- a/data/ref/llama_lastnames.csv +++ b/data/ref/llama_last_names.csv @@ -1,4 
+1,4 @@ -lastnames,country,ISO numeric +last_names,country,ISO numeric bajramaj,Albania,8 bajrami,Albania,8 bardhi,Albania,8 diff --git a/data/unittest/transaction_data.parquet b/data/unittest/transaction_data.parquet index 47b78f12a50db976630997bc7713347cfe14e7f6..d10238f8cde94df5a6c700ecd6373481426bf5f4 100644 GIT binary patch delta 2654 zcmaJ@TWlj|6}D${F=3Z=cjLs#Zj!xZv(YBi*v@8ISBu!+c6>|h`jU8N%tJDs8QWve zjJ+OTu&D@r;-M8Ck2yUF(QGOWNo>eNHk&P{nxf0*`kmeN2m7+? zgJGA`HR2jMeT82fqAK@TVz1&hdN zLhAKwv(`|ktCZMHiQ2Gz;+ou5465a74Nb;lbpI;jdY!pH1IInJdzerQG;QT~HUj= zTd}#l^~5ij*;~xS>&(K3PcaKGJjI;8@<2%`4ic%er{4Sqv+%R05x_4#n6!m?Ej2fF z^EqbW_Opn2FnqX^iVpwkTg<{Q;n8$u`0W>&g}=ki^cPOuxy?*`FudTHc;_W%+-93v zc8}CWLl#ZvoSLkjO!woGxK48OW?d(tWqn*^n6`1k{;d% z{}vmgg-Ez3hy8F}oH;spx1jK}Zou*O1f-=)!9!Nd7hdwdp-i*?_lNXW?#1ACft(3V#&c{gN_>w*g|5AF+WLxG|J(8gn z1)sk0!PhEI$3RZ^FHQe)Tj0~aWDq{taXa#6!58)UnPP#E2h-jy7&)^L^~xJTq6#Z<4NKFO)Yz3+)Ksxl zs-%azN*3i{yrXd8uB7`{Vrr)pGx8cXG?wwQ`nXEe#Ad0L$X@Qm3=bWE4n~b_nk>qC zt}FSpHue^B#JU@mSS!(u`*Cn0`0NrX5^iCC_fX*RAtp%^5+!SNx~%!4a)RKv1qIR} zB##0KrRn&sQQ1pkgsi87h&M@zAR1-TK{>QVO$|yFv8bw4UZqaf5l8}P?Zo+|2UJ6X zSC?4$QFG=RWgsG->GHmES@KaZ29|2G>{8|PEPEBDwJ_46iKVk%sYn6>yw@0qw~b?; z#Q9{C;JQBj zIqHoBw%8QQR?x%E*6h|pJsfT|j545WLZ241k3Qv8?I%`8XK!zFp_0qr9WHN&ayN?Zh#MwPzA1=^YDJdiL*A`&mpww zXIa0=7UAnRW)>|A;bC)X1r?}Ku3BmZmRqwq8hqp-qn{266y`|{LQZd4v=NKKdhL_# z+xKs|L-8yPac+@bX#f{Ryn1M{P=XW<`l`vcq0>G$Dfq%&-1c zzk>n^78|Gxz2=!ymSf!Ex>m{O26oX3cFR%+e%O3;cVJYfJpLY@A8a}5fj0HMKS~zR ztOXwSR|`IGS>!HTN-wqV;4L4t4_E~_V2|@e0$!0zlO>Aph1m1=83d_=0}caKY5*ST zENudk#8zT``%cy^=0kH~)e~+NCMMH#@Jl36vK9bk2$A&)3#578ZQ{znrMHTUbPE+h z38`6ciiIbdv#YcnBlmtr2arUCCYL6d6iz%6OLS9Uc=rN^-RATp`oaOr_40hYp5ND| QpX=OkeAk2jD3Of!KTUN#cK`qY delta 2501 zcma)7U2I$R6}NBVd{H26v@wmBy#wQ6(llw~oNK4J6+$@j z%{^bg|2gOXJI8N)VgJwP_UjK$yG^;#)ar^{S4V9&Tf5Ydoi^uR?at5aXPuuqoYT%R z=h*!D?3qzAxu(qh^0ek%op#QwJ2|KAh|M-JPk{u0fzR8WS8fd?lWUZo=*PZ%3?4oF z@QD3+2W$nM5xG`tG^=%O3~e>JTayikeRfstYLyApbgFejMKvP-f^+_gyE&za1GCE^ 
z!$-mE$Lm^IQ<|DOg`tg>qG*jqp=FE{G+YTS-2RQuHD25QFm#zaK3OQ$8_kMrYWLaJ z=h4%7=swOhVPIGG?-zc_jrASiAJK@TuU(E!jf@{OV>SK-=lZvO_Bn3pk1umeZ@kP| zz3P_?EPfB@$&+{e_!Tt$#4_EK^R;AP?yFb1r7N$Z&30z$ru^W)$%x~f*SV#)A+lK> zfBh|P=}&KRa6NnmzKS22pZ(2;-0Y{0rHR>}{gj)r+2+o=#%gj!l^d?V?BAKH+efe{ zIGjG}uquSNFXj$3qf{L#GW+FAA5Ynf@aW}x4;4dgKqBD!C_{G}cKFE1R*nMPMx zNwp-=W5B9faqQ|SqUn@F@acIG-d8H{`}{n-U>?1#n-kRudHtSzCDqk3{&L=%GN`T*G{A_y=L~lgVHa}< zyJgYe$_|pmLg5dkWAKuu!({dFjoe(YQJIE4t!`;kx+qRI zP3Fwqwv^rUCg5MnS@^^_O7MR6%d&FuVePE{?&^&0`QO&dn%7 z*q0YGEj8>fV{ck{Gt_c+q>|TcGrQ#b)&#;3Hb26 zn_p2%>p@mh%}iJE8tr^3dOjMA?rsFn?XH(usc^AFzIqe7Py(5Xf?L8?Q$9Dx4!4C0XVVjJt;UBr+x5LmLqwxrAqreg*vrUvkX;x&c z#-fY~ouK87a){)_CWZzrD2bqgB!{*MbB7)gdN9glvOjd|y$f)yx%e0Yus}_X#QIz$ zFU)J1=lS!=HqYbvY%`4sGWIwh8iEx_vu`LQJkvGzEe zX)L>}S+g)>`GFTNB7$ zWTfG*O4&r!@)F93yIdS(aBvB9O~W-ON&dDS!(tTAZ{8A5)E$bS>vQSG_Co$qz;eG3 z2k}F|JvVzBh50G(ZV#?De=xn0@+m@^v_&47CBomDcOSRh`*&&WLNc%QkbcW+Fw^?s z9fKe(iS)VHt_6qk%R&PM~(s2v-Q?FMav0975zlw3!SWcx^p6lIB)<;WtLAs{Kslt{@+ zWPL715ujU_Ugl0k(4j+H?^!Z-@Sji=1u}FAx^(H%_fC$ZI4RJeltA3w`@P@$K7RKk z_`{Rnk8Wu0Qn~V|dGMfa)pl$7hbBkHP9XH#1?Bw7`5z09 z&jo)zs`Fkj_<(BBtMp}L^c=F#Kcf@$FuFF%2tYJ*dVF*k{cAM1N82l(UXh5?>UitnQkPSV@*TX!a7gC zj?bO0e~Le%zow?>H}PruY;~L-Coa>S#Kc9<(DQyt@ekvmV=54#KmFWF= zh(1dIbagKN4wFu8r*45ErzVH3Vyva}TDw$S3OjYbZ16G9+=;n%O4v7OE-^FKuI)D6Z1)g=_S0@uPLjcFW=ADwM2k`$K-e z7vU7O#2H5DIw_%P6*c#N5>ESJXV{(7hM~WKC67$d?T8AkD=cH? 
zjC$I2>KTIO#Fs=L@IDOdPV)@(9-igop)V_M`MKabA68MNHLDP1G{8a#zT?GfD0bf z1_LK6V(3E0R{SaqBgC*&wjd^piAjgYJVab55E7QLIOZodYgb>3x>c4HU5yvjs_R z(TAzY0=u>$g{TkBz!Cv~EO~gUKj;)zk|BK&S(UDf9}BxCADFn`kiaNcpqJH;mzcp< zpOaUJKC*V1kKna|pC_0Z)&R^)dYrsAjsrzXMc+ezp-M^mi@J5;_xV5|NAipR0DsC- A$p8QV delta 1809 zcma)7&2Jk;6t}CS4WUI;5J;OqOG^%wo36dKo46!tXV>2KI`+nPV($lSrQLP>m0jD) z##x+0eL!5ePX7TUj;KP4_$#;|aRaFmmvZ9|;DC5}1quoE2znluc zINs;?!t`Z)f%XzrT1-sA`hzKE)=ONu7%tIg4{l!(SA~EOfxy^I9KU$*Dm*PGo3=)O zTfcV6v-I3iO{T{B{;Vxau$&yd?HjrpwLR&;+@_zbdGycq#XFv*Ree)R^$jiND(!AV z>Ev9KcaLh>T(c&-hqjyAGxV}ce-y6M!p3YwX$1|%OU7(nNb2!&(vd5ztrlsSvJ^G- za?jQ(yeb{^omiAU6(V=e!L{vpwGZ$~omUS{xmq{0F9J)>(9Xko`t`;=`jaq6mmgl9 zKYxd%C8KJadXMg|Ru2r#^>w~{Xlqi>ZYjPZ=FWTLoz+9nXJQUt9avfkuRIl=u6r}U zhqpA=}HuZb2X|dUNgiRo{XbYTy7UA^92=*EXlyhMwC)lkw_C`X>GT z(c=#-T^QMluhF9~$u$~DzmGb%G0=8LZ5nZi9iRMH{b$0`SpUf_T?lO1g&Y4{{k}L+ ze@qwB@6yl3_hx%=&xs#ThOt^+BrPMJa{=hkX({)Wtlt9Qmg7l--EEM?T(GN21IJ6n zSvUA$m|*!X>@@m??L?q-q8R$X(2`InYQ5%F2M!N~gP5W+$hVb#VUSBfwaQqs zjx6=fAp{iGD%&6hLelWV0fik91ylnOqy%Z;5y*M0ccDT`gm9>rg9AdMII%`hcUC!W z6{L)mVVeL10#2iW`oc*NOpEQN3tW-oNHNZFs9y>yxV_2F;HEJ)Tt8k3O(!e(kork$ zbiv~;9uu;Y0s;xHPJfW^?hv1=bDZ(!z)=W$-{Gb9xaUrRWdx2G40u^AAZ~C5H_Hf| z5|)_kz%29}DK)~%w*iL(UPw?$HxNiFuDs?s^Auw|_Ln7$?4ig?T<4ojhi9QLafcjN zF0RmDGRv2?5Qjgae@UM%=Fxi6MiT=I6_Q7EL%ui6r6WKjgZ2n4l+l0~II&dXVq7aV zU3DGL1p;CM^;b^#Rv5{D@MT72 From 624823f22da5eb867ff0c558c69fd2b52260324b Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 09:37:26 +0000 Subject: [PATCH 08/15] Revised unittests given naming convention changes --- generator/unittests/app/test_gen_user_trans_data.py | 4 +++- generator/unittests/objects/test_Application.py | 8 ++++---- generator/unittests/objects/test_Card.py | 8 ++++---- generator/unittests/objects/test_Transaction.py | 8 ++++---- generator/unittests/objects/test_User.py | 13 ++++++++++++- 5 files changed, 27 insertions(+), 14 deletions(-) diff --git a/generator/unittests/app/test_gen_user_trans_data.py 
b/generator/unittests/app/test_gen_user_trans_data.py index 6bef4c7..4c4be48 100644 --- a/generator/unittests/app/test_gen_user_trans_data.py +++ b/generator/unittests/app/test_gen_user_trans_data.py @@ -38,6 +38,7 @@ fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1] fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] +fpath_bedrock_email_domain = '.' + cons.fpath_llama_email_domains.split(cons.fpath_repo_dir)[1] fpath_smartphones = '.' + cons.fpath_smartphones.split(cons.fpath_repo_dir)[1] fpath_countrycrimeindex = '.' + cons.fpath_countrycrimeindex.split(cons.fpath_repo_dir)[1] fpath_unittest_user_data = '.' + cons.fpath_unittest_user_data.split(cons.fpath_repo_dir)[1] @@ -51,7 +52,8 @@ fpath_first_names=fpath_first_names, fpath_last_names=fpath_last_names, fpath_countries_europe=fpath_countries_europe, - fpath_email_domain=fpath_email_domain + fpath_email_domain=fpath_email_domain, + fpath_bedrock_email_domain=fpath_bedrock_email_domain, ) # generate random entity counts for each user diff --git a/generator/unittests/objects/test_Application.py b/generator/unittests/objects/test_Application.py index 58435b5..9403daa 100644 --- a/generator/unittests/objects/test_Application.py +++ b/generator/unittests/objects/test_Application.py @@ -28,10 +28,10 @@ "dded2b63f8242648": 0.2727272727272727, } exp_application_hashes_payment_channel_dict = { - "63cea7c46926aa74": "adyen", - "37725417bd51fb40": "adyen", - "b95cb80aae9fbbfe": "paypal", - "dded2b63f8242648": "docomo", + "63cea7c46926aa74": "Adyen", + "37725417bd51fb40": "Adyen", + "b95cb80aae9fbbfe": "PayPal", + "dded2b63f8242648": "Docomo", } exp_n_application_hashes = cons.unittest_n_entities exp_lam = cons.data_model_poisson_params["application"]["lambda"] diff --git a/generator/unittests/objects/test_Card.py b/generator/unittests/objects/test_Card.py index 
688455f..32f0358 100644 --- a/generator/unittests/objects/test_Card.py +++ b/generator/unittests/objects/test_Card.py @@ -16,10 +16,10 @@ "dded2b63f8242648": 1, } exp_card_hashes_type_dict = { - "63cea7c46926aa74": "visa", - "37725417bd51fb40": "mastercard", - "b95cb80aae9fbbfe": "visa", - "dded2b63f8242648": "mastercard", + "63cea7c46926aa74": "Visa", + "37725417bd51fb40": "Mastercard", + "b95cb80aae9fbbfe": "Visa", + "dded2b63f8242648": "Mastercard", } exp_card_hashes_props_dict = { "63cea7c46926aa74": 0.16666666666666666, diff --git a/generator/unittests/objects/test_Transaction.py b/generator/unittests/objects/test_Transaction.py index 05531ba..0c338da 100644 --- a/generator/unittests/objects/test_Transaction.py +++ b/generator/unittests/objects/test_Transaction.py @@ -22,10 +22,10 @@ "dded2b63f8242648": 0.3793103448275862, } exp_transaction_hashes_status_dict = { - "63cea7c46926aa74": "successful", - "37725417bd51fb40": "successful", - "b95cb80aae9fbbfe": "successful", - "dded2b63f8242648": "successful", + "63cea7c46926aa74": "Successful", + "37725417bd51fb40": "Successful", + "b95cb80aae9fbbfe": "Successful", + "dded2b63f8242648": "Successful", } exp_transaction_hashes_amounts_dict = { "63cea7c46926aa74": 2.99, diff --git a/generator/unittests/objects/test_User.py b/generator/unittests/objects/test_User.py index 4471a71..16280c8 100644 --- a/generator/unittests/objects/test_User.py +++ b/generator/unittests/objects/test_User.py @@ -63,7 +63,18 @@ fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1] fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] fpath_email_domain = '.' 
+ cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] -user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_first_names=fpath_first_names, fpath_last_names=fpath_last_names, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) +fpath_bedrock_email_domain = '.' + cons.fpath_llama_email_domains.split(cons.fpath_repo_dir)[1] + +user_object = User( + n_user_ids=exp_n_user_ids, + start_date=exp_start_date, + end_date=exp_end_date, + fpath_first_names=fpath_first_names, + fpath_last_names=fpath_last_names, + fpath_countries_europe=fpath_countries_europe, + fpath_email_domain=fpath_email_domain, + fpath_bedrock_email_domain=fpath_bedrock_email_domain + ) obs_user_ids_cnts_dict = user_object.user_ids_cnts_dict obs_user_ids_props_dict = user_object.user_ids_props_dict From 1222f0924192af7aa9775fccedd9a2388bc7e73a Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 09:37:59 +0000 Subject: [PATCH 09/15] Fixed non payment transaction generation --- generator/app/gen_trans_data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/generator/app/gen_trans_data.py b/generator/app/gen_trans_data.py index e9f8bcc..60d6814 100644 --- a/generator/app/gen_trans_data.py +++ b/generator/app/gen_trans_data.py @@ -89,11 +89,11 @@ def gen_trans_data( trans_data.loc[zero_transaction_amount_filter | missing_card_hash_filter, ['card_payment_channel']] = np.nan trans_data.loc[zero_transaction_amount_filter, ['card_hash', 'card_type', 'card_country_code_alpha']] = np.nan # add payment method as either card, store_wallet or store_points - trans_data['transaction_payment_method'] = 'card' + trans_data['transaction_payment_method'] = 'Card' zero_transaction_amount_filter = (trans_data['transaction_amount'] == 0.0) missing_card_hash_filter = (trans_data['card_hash'].isnull()) - # trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = 
missing_card_hash_filter.apply(lambda x: np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = 1, p = list(cons.data_model_non_card_trans_methods.values()))[0]) - trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = pd.Series(np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = missing_card_hash_filter.sum(), p = list(cons.data_model_non_card_trans_methods.values()))[0]) + # trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = missing_card_hash_filter.apply(lambda x: np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = 1, p = list(cons.data_model_non_card_trans_methods.values()))) + trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = pd.Series(np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = missing_card_hash_filter.sum(), p = list(cons.data_model_non_card_trans_methods.values()))) trans_data.loc[zero_transaction_amount_filter, 'transaction_payment_method'] = np.nan # align country codes for user, ip and card country_code_columns = ['registration_country_code_alpha', 'ip_country_code_alpha', 'card_country_code_alpha'] From dd619d46d9f702b731c5ab7fec708ceb175723d8 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 09:38:30 +0000 Subject: [PATCH 10/15] Removed lowercase and split operations from standardisation lambda --- generator/batch/gen_bedrock_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generator/batch/gen_bedrock_data.py b/generator/batch/gen_bedrock_data.py index 241828b..9238a56 100644 --- a/generator/batch/gen_bedrock_data.py +++ b/generator/batch/gen_bedrock_data.py @@ -145,7 +145,7 @@ def invoke_bedrock( how='inner' ) # standardise names formatting - standardise_text_lambda = lambda x: unidecode.unidecode(" ".join(x.lower().strip().split())) if pd.isna(x) else x + standardise_text_lambda = lambda x: unidecode.unidecode(" ".join(x.strip())) if 
pd.isna(x) else x gen_country_dataframe[data_point] = gen_country_dataframe[data_point].apply(lambda x: standardise_text_lambda(x)) logging.info(f"gen_country_dataframe.shape: {gen_country_dataframe.shape}") # save generated data From 63b0ae741296b5c6ff41428d70ab884a2c95301a Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 09:39:37 +0000 Subject: [PATCH 11/15] Updated status codes naming convention --- generator/utilities/gen_trans_status.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/generator/utilities/gen_trans_status.py b/generator/utilities/gen_trans_status.py index 35fbbd3..f41ac62 100644 --- a/generator/utilities/gen_trans_status.py +++ b/generator/utilities/gen_trans_status.py @@ -33,7 +33,7 @@ def gen_trans_status( country_code_columns = ["registration_country_code","ip_country_code","card_country_code"] # if card hash if pd.notna(series['card_hash']): - status = "rejected" + status = "Rejected" # add rejections based on crime rates within country codes if rejection_rates_dict["country_code_trans_reject_rate_dict"][np.random.choice(a=series[country_code_columns].dropna().to_list(), size=1)[0]] >= random.uniform(0, 1)/rejection_scaling_factor: error_code = np.random.choice(a=list(cons.data_model_rejection_codes_fraud.keys()),p=list(cons.data_model_rejection_codes_fraud.values()),size=1)[0] @@ -59,11 +59,11 @@ def gen_trans_status( error_code = np.random.choice(a=list(cons.data_model_rejection_codes_funds.keys()),p=list(cons.data_model_rejection_codes_funds.values()),size=1)[0] # otherwise return successful status else: - successful_status = {key:cons.data_model_transaction_status[key] for key in ['successful', 'pending']} + successful_status = {key:cons.data_model_transaction_status[key] for key in ['Successful', 'Pending']} successful_probs = [value/sum(successful_status.values()) for value in successful_status.values()] status = np.random.choice(a=list(successful_status.keys()), size=1, p=successful_probs)[0] 
error_code = np.nan else: - status = np.random.choice(a=['successful', 'pending'], size=1, p=[0.98, 0.02])[0] + status = np.random.choice(a=['Successful', 'Pending'], size=1, p=[0.98, 0.02])[0] error_code = np.nan return [status, error_code] From 75affdebb1d38c25bd15f12d38480329f81de834 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 09:40:07 +0000 Subject: [PATCH 12/15] Fixed bedrock fpath for gen user name function --- generator/objects/User.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/generator/objects/User.py b/generator/objects/User.py index 40139b8..039775e 100644 --- a/generator/objects/User.py +++ b/generator/objects/User.py @@ -84,9 +84,9 @@ def __init__( self.user_ids = list(self.user_ids_cnts_dict.keys()) self.user_ids_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.user_ids_cnts_dict) self.user_ids_country_code_dict = gen_country_codes_dict(idhashes=self.user_ids, fpath_countries_europe=self.fpath_countries_europe) - self.user_ids_first_name_dict = self.gen_user_bedrock_name_data(fpath_first_names=self.fpath_first_names, sample_column_name="first_names") - self.user_ids_last_name_dict = self.gen_user_bedrock_name_data(fpath_last_names=self.fpath_last_names, sample_column_name="last_names") - self.user_ids_email_domain_dict = self.gen_user_email_domain(fpath_email_domain=self.fpath_email_domain, fpath_bedrock_email_domain=self.fpath_bedrock_email_domain) + self.user_ids_first_name_dict = self.gen_user_bedrock_name_data(fpath_bedrock_data=self.fpath_first_names, sample_column_name="first_names") + self.user_ids_last_name_dict = self.gen_user_bedrock_name_data(fpath_bedrock_data=self.fpath_last_names, sample_column_name="last_names") + self.user_ids_email_domain_dict = self.gen_user_bedrock_email_domain(fpath_email_domain=self.fpath_email_domain, fpath_bedrock_email_domain=self.fpath_bedrock_email_domain) self.user_ids_dates_dict = gen_dates_dict(idhashes=self.user_ids, start_date=self.start_date, 
end_date=self.end_date) @beartype From 783d1b1d7a1d9d2504d6003a8f18a3ee0eeb7cd2 Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 10:52:55 +0000 Subject: [PATCH 13/15] Tweeked null rates for cards, adjusted split between non card payment methods. Fixed index assigment logic for non card payment methods --- generator/app/gen_trans_data.py | 3 +-- generator/cons.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/generator/app/gen_trans_data.py b/generator/app/gen_trans_data.py index 60d6814..0c9326c 100644 --- a/generator/app/gen_trans_data.py +++ b/generator/app/gen_trans_data.py @@ -92,8 +92,7 @@ def gen_trans_data( trans_data['transaction_payment_method'] = 'Card' zero_transaction_amount_filter = (trans_data['transaction_amount'] == 0.0) missing_card_hash_filter = (trans_data['card_hash'].isnull()) - # trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = missing_card_hash_filter.apply(lambda x: np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = 1, p = list(cons.data_model_non_card_trans_methods.values()))) - trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = pd.Series(np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = missing_card_hash_filter.sum(), p = list(cons.data_model_non_card_trans_methods.values()))) + trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = pd.Series(np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = missing_card_hash_filter.sum(), p = list(cons.data_model_non_card_trans_methods.values()), replace=True), index=trans_data[missing_card_hash_filter].index) trans_data.loc[zero_transaction_amount_filter, 'transaction_payment_method'] = np.nan # align country codes for user, ip and card country_code_columns = ['registration_country_code_alpha', 'ip_country_code_alpha', 'card_country_code_alpha'] diff --git a/generator/cons.py b/generator/cons.py index 
de5225f..f846ed8 100644 --- a/generator/cons.py +++ b/generator/cons.py @@ -72,7 +72,7 @@ unittest_seed = 42 unittest_n_entities = 4 unittest_n_device_types = 10 -unittest_gen_test_dfs = False +unittest_gen_test_dfs = True unittest_n_users = 10 unittest_registration_start_date = '2020-01-01' unittest_registration_end_date = '2020-12-31' @@ -83,12 +83,12 @@ data_model_entity_user_ratios = {'card':1.3, 'device':2.5, 'transaction':5.3, 'ip':4.3} data_model_poisson_params = {'user':{'lambda':20, 'power':1}, 'device':{'lambda':0.2, 'power':2}, 'card':{'lambda':0.1, 'power':2}, 'ip':{'lambda':1.3, 'power':2}, 'application':{'lambda':1, 'power':2}, 'transaction':{'lambda':5, 'power':2}} data_model_shared_entities_dict = {'ip':0.05, 'card':0.005, 'device':0.01} -data_model_null_rates = {'card':0.05} +data_model_null_rates = {'card':0.1} data_model_card_types_dict = {'Visa':0.5, 'Mastercard':0.5} data_model_payment_channels = {'PayPal':0.4, 'Adyen':0.15, 'AppStore':0.25, 'WorldPay':0.15, 'Docomo':0.05} data_model_transaction_status = {'Successful':0.94, 'Pending':0.03, 'Rejected':0.03} data_model_inconsistent_country_codes_rejection_rate = {1:0.001, 2:0.005, 3:0.01} -data_model_non_card_trans_methods = {'Wallet':0.95, 'Points':0.05} +data_model_non_card_trans_methods = {'Wallet':0.85, 'Points':0.15} data_model_rejection_codes_fraud = {'E900:ConnectionTimeout':0.1, 'E901:SuspectedFraud':0.55, 'E902:AuthenicationFailure':0.2, 'E903:UserCancelled':0.05, 'E904:InsufficientFunds':0.1} data_model_rejection_codes_connection = {'E900:ConnectionTimeout':0.45, 'E901:SuspectedFraud':0.1, 'E902:AuthenicationFailure':0.2, 'E903:UserCancelled':0.15, 'E904:InsufficientFunds':0.1} data_model_rejection_codes_user = {'E900:ConnectionTimeout':0.05, 'E901:SuspectedFraud':0.1, 'E902:AuthenicationFailure':0.1, 'E903:UserCancelled':0.45, 'E904:InsufficientFunds':0.3} From 8222462b3c5522b1a875f7d74695e4c1938fd21e Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 10:53:37 +0000 Subject: 
[PATCH 14/15] Deactivated unittest data generation --- generator/cons.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generator/cons.py b/generator/cons.py index f846ed8..ad51b95 100644 --- a/generator/cons.py +++ b/generator/cons.py @@ -72,7 +72,7 @@ unittest_seed = 42 unittest_n_entities = 4 unittest_n_device_types = 10 -unittest_gen_test_dfs = True +unittest_gen_test_dfs = False unittest_n_users = 10 unittest_registration_start_date = '2020-01-01' unittest_registration_end_date = '2020-12-31' From 8ff15bd2c4619c5de01c167312dd9c825a42381b Mon Sep 17 00:00:00 2001 From: Oisin Date: Thu, 29 Jan 2026 10:53:57 +0000 Subject: [PATCH 15/15] Updated unittest transaction data --- data/unittest/transaction_data.parquet | Bin 30616 -> 30635 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/data/unittest/transaction_data.parquet b/data/unittest/transaction_data.parquet index d10238f8cde94df5a6c700ecd6373481426bf5f4..b871daa5d9c73f25c888ded44b494897f9b8383b 100644 GIT binary patch delta 1736 zcmbVMZ%h+s9Dd)s_bP?9d$07mf{mf2+zJ})L5Bs@w6w=srO-mFL${35mdHd<{(Klq zDF1X7>jGY9keP*$YMfKX#4T=1rrFryoH5a`C39}X&1}mugc&!dV6&@!BKzitdw;)s z-rwCl_dHMDk$L#uJe(Yq!>Djj-lDXa&4qzbNvOyYEU|Fw*Ha#0mEwrF`Jp1$NEtUg zUlMAtn1V%?qLM;OkP9<_xkoZf@_h8f9;&yt^}ZWbwbPSi%^=W6z% z%{<^@+powRPK`rWv^|U0ln-zd8a-MWJ4$R$GYoi@t0nTxw7s`RJWje%MaEvU#%m%m zF2K}|6%ZWZ$oP4f@Ma&6D`jQV8oE!vPu_q6Y*Yv8_8pyc>JnPJtO-c$Y4D06ILaMI zfpXn1c&iK((up(DCa01oKas?LWD=$IjDqJp205g%$Lh$L8fzSBP51PnQ3kDl?_2WH+K7U)!ft@px z{x%fdPdQ(*&!lmi?8=-T1~ERyQ^_b8pah^mMTl%xz*BVIxtyMMF4vabf~4oOycQq&s?K;2>gDW_WEh)OgLjC$4mz(1#DCv95}N zKjPE9pL?+o;acn&QQW+FSnh-+{J{e zPSxLLss#&=!)2 zbz7jhIoy%zJgrSF9qpn5QGyv-rhwTfc`~L!smH|G0l3h=PORCGOje248c3afq=)4B zVz}aA^7}#&Qi~b>I7e(8VzVu25WBUIpnw(hr~wJVs?=fxnGgVk5axr( zOqJ7=!-pWP!a)2i1+Yg7(h#6k(28x4_S!Ht?89E01K1!Cv%UnG+pv;Jf9!MjPPOO| zUt%(sx?0*h8ykfk-VBr_&3;kX$Y)h;;7V&d+xLWn9pO+#Tc9%}uhN=I?X4{>;UMI+ zzNX#b*3J&4l(Q6-`rE^8_CQN8EM_GnCggvLpBLQTLg5ad9?2|vxjXO26Zflsz|eJi zIOnf9G`D)@2>p7XTl)>oY?P- 
zf5jg&^0cvXT-8Ei)0o7X3;UG)*!$6;!)!RFua_}R*eDw*cH^^YBL?gBk^H`Cc`&ha z<-6VhX+-2ySH)(D;#|7=SUNjd#zu>uoX2^>{mQk9+l2ZK!7f^^l zdI+&eJA2rkf(PAY=w=N&E`03{qJcVgL@-r3sb~{>cZblOZ4{1Il}RpNhwOBVFl#mn zis~|I>2Ju63RTs+Bqs+5_TMGpLUl7fNvhA0?ASqqy);N@i_Jor$BAc1^-msDr{ T2d4=24T?=u|2dWPj^_ObqUa^u delta 1749 zcmbu9e@qi+7{~9u-j%ijyZ*RR7%VN&vmi002(-wytF*O6T3UKl+>flKty+X4E&Smh zTcPO`1x(?vLBxbHGLdOQ#JJ5r9CifJA!BMb;uf+H=gbxdS&X_(*sk+0wk$F6pZDH- z-fy1g$@lZT@5jrae;JG)O#w&g&J@4AyjbI@Q!Cv?>LQIw?Jo2Jk$TG(mCK{?6sg@B zbzx~K>Y+EJ#y}T6oSGvp#UZ>O!H1Co{0@p^I2&hYmekS@Qa6UW;D;N427;grz*;K@ z=-TRLkW{d`CIGBYRUm2X&%GF6T^56v7C(9X)oSRmh>^S_21#p|*O=|Ep;rKEmv(63 zygiYeC}c_pQ|iv*pdmXqIeR*NE;%Pvwlugml|S75%`reAg1B<*1RYnmN7LrHha}B& zA!X7at6e!A4O*nLnf90}yExExKugwDmeot#RqB+UppMhoUoyiVIVC*=g_XA~&MB;J zUUpb*jmsjO`D0~QPr0kGjhPkf>>HU|2i05|uZHU3y-k|@mbb75q(F;_lXITXm|@N3?v z{m!7xl^Z9FM~1n)4AfVn;7uxyVA}9)!_fiR8R_nh4Cl>}0-mk#cy@kRkkixE(s=j% zx)EdLNkdQe8ScnXJ(33?2uJY#WPD&l6>hf?EYcgw1?+ zxf&Y0EsP<303TR~kW#Rp9ZHadn9)|Qn4ZU#wg1k(fVl_E@PkJ%!~(N`S#s%jD_#o? z!}!?0C~$T)1sXh7*Fg!Y^&Is2nYuk-CHg#t|M$|tc5AvrC#3LVtDf&b^&%__M}&Mw zxt?PFiraLQ7INn5K|7_tg6O4`$c8x(MBPWB#UfIKqinWSm+5fUCXtRPb(~5=wIqP( zMqOh_PBQtBog^Ou4ot5@^frW#;Ub$=PvLxs)Zz%l$HY*nUPl6vLko$v0R#df64N07 zMV*+Hq(}(U=|~Q#MfkdQ6vYvn6+tloD3Ad86yWzj70&wy-Fq$Rj2(K7QdOpHZQ1Ab zYkY23i?7LVaQPZq170CFi!3a&w+0V*-7Q{^A>eBD2)JqGMP-hlH&Eg7yS)r;kC-bq z(l)(tQ%Z(WSys{H_cL-P3*~F{Hnk>(AqwOD>$qH?oik_WT3a(d7LkBcVdboNJJZJ7JY~=O+|#F zZ7G0Xe=T7)*TM$rSB`GfODiVl`s- zSmG5Haq$>T#JibbE&Y+T8XfF~iSzUy*4-)NC@~jf()2NB34^`t$pM&na)SQSR?hq3 RJS@J05*+dL1ixlL{x=>>Bo+Vw