diff --git a/data/ref/llama_firstnames.csv b/data/ref/llama_first_names.csv similarity index 99% rename from data/ref/llama_firstnames.csv rename to data/ref/llama_first_names.csv index f7b0e44..4668401 100644 --- a/data/ref/llama_firstnames.csv +++ b/data/ref/llama_first_names.csv @@ -1,4 +1,4 @@ -firstnames,country,ISO numeric +first_names,country,ISO numeric agrina,Albania,8 agron,Albania,8 albana,Albania,8 diff --git a/data/ref/llama_lastnames.csv b/data/ref/llama_last_names.csv similarity index 99% rename from data/ref/llama_lastnames.csv rename to data/ref/llama_last_names.csv index 9808434..a3d215e 100644 --- a/data/ref/llama_lastnames.csv +++ b/data/ref/llama_last_names.csv @@ -1,4 +1,4 @@ -lastnames,country,ISO numeric +last_names,country,ISO numeric bajramaj,Albania,8 bajrami,Albania,8 bardhi,Albania,8 diff --git a/data/unittest/transaction_data.parquet b/data/unittest/transaction_data.parquet index 47b78f1..b871daa 100644 Binary files a/data/unittest/transaction_data.parquet and b/data/unittest/transaction_data.parquet differ diff --git a/data/unittest/user_data.parquet b/data/unittest/user_data.parquet index bb932c9..91a5b48 100644 Binary files a/data/unittest/user_data.parquet and b/data/unittest/user_data.parquet differ diff --git a/generator/app/gen_random_telecom_data.py b/generator/app/gen_random_telecom_data.py index 28b0283..363dc19 100644 --- a/generator/app/gen_random_telecom_data.py +++ b/generator/app/gen_random_telecom_data.py @@ -72,10 +72,11 @@ def gen_random_telecom_data( n_user_ids=programmeparams.n_users, start_date=programmeparams.registration_start_date, end_date=programmeparams.registration_end_date, - fpath_firstnames=cons.fpath_llama_firstnames, - fpath_lastnames=cons.fpath_llama_lastnames, + fpath_first_names=cons.fpath_llama_first_names, + fpath_last_names=cons.fpath_llama_last_names, fpath_countries_europe=cons.fpath_countries_europe, - fpath_email_domain =cons.fpath_email_domain + fpath_email_domain=cons.fpath_email_domain, + fpath_bedrock_email_domain=cons.fpath_llama_email_domains ) # generate random entity counts for each user diff --git a/generator/app/gen_trans_data.py b/generator/app/gen_trans_data.py index 363941d..0c9326c 100644 --- a/generator/app/gen_trans_data.py +++ b/generator/app/gen_trans_data.py @@ -89,11 +89,10 @@ def gen_trans_data( trans_data.loc[zero_transaction_amount_filter | missing_card_hash_filter, ['card_payment_channel']] = np.nan trans_data.loc[zero_transaction_amount_filter, ['card_hash', 'card_type', 'card_country_code_alpha']] = np.nan # add payment method as either card, store_wallet or store_points - trans_data['transaction_payment_method'] = 'card' + trans_data['transaction_payment_method'] = 'Card' zero_transaction_amount_filter = (trans_data['transaction_amount'] == 0.0) missing_card_hash_filter = (trans_data['card_hash'].isnull()) - # trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = missing_card_hash_filter.apply(lambda x: np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = 1, p = list(cons.data_model_non_card_trans_methods.values()))[0]) - trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = pd.Series(np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = missing_card_hash_filter.sum(), p = list(cons.data_model_non_card_trans_methods.values()))[0]) + trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = pd.Series(np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = 
missing_card_hash_filter.sum(), p = list(cons.data_model_non_card_trans_methods.values()), replace=True), index=trans_data[missing_card_hash_filter].index) trans_data.loc[zero_transaction_amount_filter, 'transaction_payment_method'] = np.nan # align country codes for user, ip and card country_code_columns = ['registration_country_code_alpha', 'ip_country_code_alpha', 'card_country_code_alpha'] @@ -118,14 +117,7 @@ def gen_trans_data( trans_data[['transaction_status', 'transaction_error_code']] = trans_data.apply(lambda series: gen_trans_status(series = series, rejection_rates_dict = rejection_rates_dict), result_type = 'expand', axis = 1) # order columns and sort rows by transaction date - user_cols = ['userid', 'firstname', 'lastname', 'registration_date', 'registration_country_code', 'uid', 'email_domain'] - device_cols = ['device_hash', 'device_type'] - card_cols = ['card_hash', 'card_type', 'card_country_code'] - ip_cols = ['ip_hash', 'ip_country_code'] - app_cols = ['application_hash'] - trans_cols = ['transaction_hash', 'transaction_date', 'transaction_amount', 'transaction_payment_method', 'card_payment_channel', 'transaction_status', 'transaction_error_code'] - itr_cols = ['itr_hash'] - col_order = user_cols + device_cols + card_cols + ip_cols + app_cols + trans_cols + itr_cols + col_order = cons.user_cols + cons.device_cols + cons.card_cols + cons.ip_cols + cons.app_cols + cons.trans_cols + cons.itr_cols trans_data = trans_data[col_order].sort_values(by = 'transaction_date').reset_index(drop = True) return trans_data \ No newline at end of file diff --git a/generator/app/gen_user_data.py b/generator/app/gen_user_data.py index 9c5fa94..96cc651 100644 --- a/generator/app/gen_user_data.py +++ b/generator/app/gen_user_data.py @@ -50,8 +50,8 @@ def gen_user_data( # take a deep copy of the data user_data = random_entity_counts.copy() # add user data - user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_firstname_dict, idhash_key_name='uid', idhash_val_name='firstname') - user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_lastname_dict, idhash_key_name='uid', idhash_val_name='lastname') + user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_first_name_dict, idhash_key_name='uid', idhash_val_name='first_name') + user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_last_name_dict, idhash_key_name='uid', idhash_val_name='last_name') user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_dates_dict, idhash_key_name='uid', idhash_val_name='registration_date') user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_country_code_dict, idhash_key_name='uid', idhash_val_name='registration_country_code_alpha') user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_email_domain_dict, idhash_key_name='uid', idhash_val_name='email_domain') diff --git a/generator/batch/gen_bedrock_data.py b/generator/batch/gen_bedrock_data.py new file mode 100644 index 0000000..9238a56 --- /dev/null +++ b/generator/batch/gen_bedrock_data.py @@ -0,0 +1,246 @@ +# python generator/batch/gen_bedrock_data.py + +import json +import boto3 +from botocore.config import Config +import sys +import time +import logging +import unidecode +import pandas as pd +import numpy as np + +sys.path.append("E:\\GitHub\\RandomTelecomPayments\\generator") + +import cons +from utilities.Bedrock import Bedrock + +system_name_prompt = """# Task + +You are a name
generator for people from different countries in Europe. +Your task is to generate an arbitrary N number of distinct and varied first names, or last names, for people from a given European country of origin. + +# Requirements + +- Generate typical names for both male and female people. +- The names do not need to be traditional to the target European country. +- Do not repeat any first names or last names more than once. +- Each individual first name must be unique and each individual last name must be unique. +- You should return the first names or last names using a valid JSON object tagged as <answer></answer>. +- The valid JSON object should be of the following structure; `["name 1","name 2",...,"name N"]`. + +# Examples + +## First Names + +- Generate 2 first names for people from the country "Germany" -> ["Max","Hannah"] +- Generate 4 first names for people from the country "United Kingdom" -> ["George","Richard","Katie","Mary"] +- Generate 3 first names for people from the country "France" -> ["Lola","Mathieu","Léa"] +- Generate 5 first names for people from the country "Spain" -> ["Juan","Cristina","Javier","Julia","Isabel"] +- Generate 6 first names for people from the country "Sweden" -> ["Tova","Alva","Casper","Märta","Axel","Elsa"] + +## Last Names + +- Generate 2 last names for people from the country "Germany" -> ["Müller","Schmidt"] +- Generate 4 last names for people from the country "United Kingdom" -> ["Smith","Taylor","Jones","Brown"] +- Generate 3 last names for people from the country "France" -> ["Benoît","Pierre","Lefort"] +- Generate 5 last names for people from the country "Spain" -> ["Garcia","Martinez","Rodriguez","Lopez","Gomez"] +- Generate 6 last names for people from the country "Sweden" -> ["Andersson","Johansson","Lundberg","Svensson","Pettersson","Nilsson"] +""" + +system_email_prompt = """ +""" + +first_name_prompt = 'Generate {n_data_points} first names for people from the country "{country}"' +surname_prompt = 'Generate {n_data_points} last names for people from the country "{country}"' +email_domain_prompt = 'Generate {n_data_points} popular email domain names for people from the country "{country}"' + +bedrock_config = { + "inferenceConfig":{ + "maxTokens":8192, + "temperature":0.5, + "topP":0.5, + }, + "system":[ + { + "text":system_name_prompt + } + ] +} + +def invoke_bedrock( + model:Bedrock, + model_id:str, + data_point:str, + n_data_points:int, + country:str, + countrieseurope:pd.DataFrame, + prompt:str, + system_prompt:str, + country_fpath:str, + ) -> pd.DataFrame: + """ + Invokes the Bedrock model to generate data points (first names, last names or email domains) for a specified country. + + This function calls the Bedrock model with a formatted prompt to generate data points + for a given country. It processes the model's response, parses the JSON + output, and merges the results with country data. The function deduplicates and standardizes + the formatting, then persists the data to a temporary CSV file. + + Parameters + ---------- + model : Bedrock + The Bedrock model instance used to generate data points. + model_id : str + The id of the Bedrock model to invoke. + data_point : str + The type of data point to generate; one of first_names, last_names or email_domains. + n_data_points : int + The number of data points to generate. + country : str + The country for which to generate data points. + countrieseurope : pd.DataFrame + A DataFrame containing country information for merging. + prompt : str + The user prompt template, formatted with n_data_points and country. + system_prompt : str + The system prompt passed to the model. + country_fpath : str + The temporary CSV file path the generated country data is written to. + + Returns + ------- + pd.DataFrame + A DataFrame of deduplicated and standardized data points along with country information. + + Raises + ------ + Exception: If the model response cannot be parsed as JSON, or if the merge with country data fails or file I/O operations encounter errors. + IndexError: If the expected <answer></answer> tags are missing from the model response. + + Notes + ----- + - Data points are standardized by converting to lowercase, removing extra whitespace, and applying Unicode normalization using unidecode. + - Duplicate data points are removed after each processing step. + - Results are persisted per country to temporary CSV files encoded in latin1 format. + + """ + logging.info("Calling Bedrock ...") + # call bedrock model + formatted_prompt = prompt.format(n_data_points=n_data_points, country=country) + messages = [{"role":"user", "content":[{"text":formatted_prompt}]}] + logging.info(messages) + #model_response = model.prompt(model_id=model_id, user_prompt=formatted_prompt, system_prompt=system_prompt, max_gen_len=2048) + model_response = model.converse(modelId=model_id, messages=messages, system=[{"text":system_prompt}], inference_config=bedrock_config['inferenceConfig']) + # extract the generated text from the converse api response + response_text = model_response["output"]["message"]["content"][0]["text"] + # split out answer + text = response_text.split("<answer>")[1].split("</answer>")[0] + # parse json + try: + gen_data_list = json.loads(text) + except json.JSONDecodeError as e: + raise Exception(f"Error parsing JSON: {e}") + logging.info("Processing results ...") + # generate pandas dataframe + gen_dataframe = pd.Series(gen_data_list, name=data_point).drop_duplicates().to_frame() + gen_dataframe['country'] = country + gen_country_dataframe = pd.merge( + left=gen_dataframe, + right=countrieseurope.rename(columns={'name':'country'}), + on='country', + how='inner' + ) + # standardise data point formatting + standardise_text_lambda = lambda x: unidecode.unidecode(" ".join(x.lower().strip().split())) if pd.notna(x) else x + gen_country_dataframe[data_point] = gen_country_dataframe[data_point].apply(lambda x: standardise_text_lambda(x)) + logging.info(f"gen_country_dataframe.shape: {gen_country_dataframe.shape}") + # save generated data + gen_country_dataframe.to_csv(country_fpath, index=False, encoding="latin1") + logging.info(f"Wrote {country_fpath} ...") + return gen_country_dataframe + +def main(bedrock, model_id, data_point, fpath_dict, run_bedrock=False): + """ + Generates, consolidates and persists llama reference data of a given data point type for each European country. + """ + # load countries reference file + countrieseurope = pd.read_csv(cons.fpath_countries_europe, usecols=['name', 'ISO numeric', 'population']) + n_countries = countrieseurope.shape[0] + # set lists to collect generated data with + gen_country_dataframe_list, error_countries = [], [] + # set countries list + #countries_list = countrieseurope['name'].to_list() + countries_list = ['Cyprus'] + # iterate over countries list + for country in countries_list: + logging.info(f"{country} ...") + country_fpath=fpath_dict['country_fpath'].format(country=country.lower()) + try: + if run_bedrock: + # call bedrock model and generate data points + country_filter = (countrieseurope["name"] == country) + country_population = countrieseurope.loc[country_filter, "population"].iloc[0] + # set n data points and prompts for ai generator depending on type + if data_point in ("first_names", "last_names"): + n_data_points = int(np.log(country_population)**1.5) + prompt = first_name_prompt if data_point == "first_names" else surname_prompt + system_prompt = system_name_prompt + elif data_point == "email_domains": + n_data_points = 5 + prompt = email_domain_prompt + system_prompt = system_email_prompt + else: + raise ValueError(f"Invalid parameter data_point value {data_point}") + # invoke bedrock and generate data points + tmp_gen_country_data = invoke_bedrock( + model=bedrock, + model_id=model_id, + data_point=data_point, + n_data_points=n_data_points, + country=country, + countrieseurope=countrieseurope, + prompt=prompt, + system_prompt=system_prompt, + country_fpath=country_fpath + ) + logging.info("Waiting ...") + # wait 20 seconds before the next request + time.sleep(20) + else: + tmp_gen_country_data = pd.read_csv(country_fpath, encoding="latin1") + # append to generated country data + gen_country_dataframe_list.append(tmp_gen_country_data) + except Exception as e: + logging.info(e) + error_countries.append(country) + # log if any countries failed to generate data + if len(error_countries) > 0: + logging.info(f"Failed to generate data for countries: {error_countries}") + # concatenate generated country data together and deduplicate across data points and countries + output_gen_country_dataframe = pd.concat(gen_country_dataframe_list, axis=0, ignore_index=True) + # sort and deduplicate output data + sort_dedup_cols = ["country",data_point] + output_gen_country_dataframe = output_gen_country_dataframe.drop_duplicates(subset=sort_dedup_cols).sort_values(by=sort_dedup_cols) + # write data to disk + if output_gen_country_dataframe['country'].nunique() == n_countries: + logging.info(f"output_gen_country_dataframe.shape: {output_gen_country_dataframe.shape}") + output_gen_country_dataframe.to_csv(fpath_dict["fpath"], index=False, encoding="latin1") + else: + logging.info(f"WARNING Insufficient {data_point} data generated.") + +lgr = logging.getLogger() +lgr.setLevel(logging.INFO) + +if __name__ == "__main__": + # set aws region + aws_region = "us-east-1" + model_id="us.meta.llama3-1-70b-instruct-v1:0" + # load aws config + with open(cons.fpath_aws_session_token, "r") as j: + aws_config = json.loads(j.read()) + # connect to aws boto3 + session = boto3.Session( + aws_access_key_id=aws_config['Credentials']["AccessKeyId"], + aws_secret_access_key=aws_config['Credentials']["SecretAccessKey"], + aws_session_token=aws_config['Credentials']["SessionToken"], + region_name=aws_region + ) + bedrock_runtime = session.client( + service_name="bedrock-runtime", + region_name=aws_region, + config=Config(retries={"max_attempts":1, "mode": "adaptive"}) + ) + # create bedrock instance + bedrock = Bedrock(bedrock_runtime=bedrock_runtime) + # execute main programme + for data_point, fpath_dict in cons.llama_data_point_fpaths.items(): + main(bedrock=bedrock, model_id=model_id, data_point=data_point, fpath_dict=fpath_dict, run_bedrock=True) + diff --git a/generator/cons.py b/generator/cons.py index 6fdb599..ad51b95 100644 --- a/generator/cons.py +++ b/generator/cons.py @@ -15,19 +15,27 @@ fpath_randomtelecomtransdata = os.path.join(subdir_data,'RandomTelecomPayments.csv') fpath_randomtelecomusersdata = os.path.join(subdir_data,'RandomTelecomUsers.parquet') fpath_arch_randomtelecomdata = os.path.join(subdir_data, 'arch', 'RandomTelecomPayments.csv') -fpath_temp_llama_firstnames = os.path.join(subdir_data, 'temp', 'llama_firstnames_{country}.csv') -fpath_temp_llama_lastnames = os.path.join(subdir_data, 'temp', 'llama_lastnames_{country}.csv') +fpath_temp_llama_first_names = os.path.join(subdir_data, 'temp', 'llama_first_names_{country}.csv') +fpath_temp_llama_last_names = os.path.join(subdir_data, 'temp', 'llama_last_names_{country}.csv') +fpath_temp_llama_email_domains = os.path.join(subdir_data, 'temp', 'llama_email_domains_{country}.csv') fpath_email_domain =
os.path.join(subdir_data, 'ref', 'email-domains.csv') fpath_countrycrimeindex = os.path.join(subdir_data, 'ref', 'country_crime_index.csv') fpath_countries_europe = os.path.join(subdir_data, 'ref', 'Countries-Europe.csv') -fpath_firstnames = os.path.join(subdir_data, 'ref', 'first-names.txt') -fpath_lastnames = os.path.join(subdir_data, 'ref', 'last-names.txt') -fpath_llama_firstnames = os.path.join(subdir_data, 'ref', 'llama_firstnames.csv') -fpath_llama_lastnames = os.path.join(subdir_data, 'ref', 'llama_lastnames.csv') +fpath_first_names = os.path.join(subdir_data, 'ref', 'first-names.txt') +fpath_last_names = os.path.join(subdir_data, 'ref', 'last-names.txt') +fpath_llama_first_names = os.path.join(subdir_data, 'ref', 'llama_first_names.csv') +fpath_llama_last_names = os.path.join(subdir_data, 'ref', 'llama_last_names.csv') +fpath_llama_email_domains = os.path.join(subdir_data, 'ref', 'llama_email_domains.csv') fpath_smartphones = os.path.join(subdir_data, 'ref', 'smartphones.csv') fpath_unittest_user_data = os.path.join(subdir_unittest, 'user_data.parquet') fpath_unittest_transaction_data = os.path.join(subdir_unittest, 'transaction_data.parquet') fpath_aws_session_token = os.path.join(subdir_creds,'sessionToken.json') +# set data points generated by llama +llama_data_point_fpaths = { + "first_names":{"fpath":fpath_llama_first_names, "country_fpath":fpath_temp_llama_first_names}, + "last_names":{"fpath":fpath_llama_last_names, "country_fpath":fpath_temp_llama_last_names}, + "email_domains":{"fpath":fpath_llama_email_domains, "country_fpath":fpath_temp_llama_email_domains} + } # set url links to files available online url_european_populations = 'https://raw.githubusercontent.com/ajturner/acetate/master/places/Countries-Europe.csv' @@ -75,12 +83,12 @@ data_model_entity_user_ratios = {'card':1.3, 'device':2.5, 'transaction':5.3, 'ip':4.3} data_model_poisson_params = {'user':{'lambda':20, 'power':1}, 'device':{'lambda':0.2, 'power':2}, 'card':{'lambda':0.1, 'power':2}, 'ip':{'lambda':1.3, 'power':2}, 'application':{'lambda':1, 'power':2}, 'transaction':{'lambda':5, 'power':2}} data_model_shared_entities_dict = {'ip':0.05, 'card':0.005, 'device':0.01} -data_model_null_rates = {'card':0.05} -data_model_card_types_dict = {'visa':0.5, 'mastercard':0.5} -data_model_payment_channels = {'paypal':0.4, 'adyen':0.15, 'appstore':0.25, 'worldpay':0.15, 'docomo':0.05} -data_model_transaction_status = {'successful':0.94, 'pending':0.03, 'rejected':0.03} +data_model_null_rates = {'card':0.1} +data_model_card_types_dict = {'Visa':0.5, 'Mastercard':0.5} +data_model_payment_channels = {'PayPal':0.4, 'Adyen':0.15, 'AppStore':0.25, 'WorldPay':0.15, 'Docomo':0.05} +data_model_transaction_status = {'Successful':0.94, 'Pending':0.03, 'Rejected':0.03} data_model_inconsistent_country_codes_rejection_rate = {1:0.001, 2:0.005, 3:0.01} -data_model_non_card_trans_methods = {'wallet':0.95, 'points':0.05} +data_model_non_card_trans_methods = {'Wallet':0.85, 'Points':0.15} data_model_rejection_codes_fraud = {'E900:ConnectionTimeout':0.1, 'E901:SuspectedFraud':0.55, 'E902:AuthenicationFailure':0.2, 'E903:UserCancelled':0.05, 'E904:InsufficientFunds':0.1} data_model_rejection_codes_connection = {'E900:ConnectionTimeout':0.45, 'E901:SuspectedFraud':0.1, 'E902:AuthenicationFailure':0.2, 'E903:UserCancelled':0.15, 'E904:InsufficientFunds':0.1} data_model_rejection_codes_user = {'E900:ConnectionTimeout':0.05, 'E901:SuspectedFraud':0.1, 'E902:AuthenicationFailure':0.1, 'E903:UserCancelled':0.45, 'E904:InsufficientFunds':0.3} @@
-88,4 +96,11 @@ data_model_rejection_codes_authentication = {'E900:ConnectionTimeout':0.25, 'E901:SuspectedFraud':0.05, 'E902:AuthenicationFailure':0.45, 'E903:UserCancelled':0.15, 'E904:InsufficientFunds':0.1} # set lists of generator object types -object_types = ["device","card","ip","transaction","application"] \ No newline at end of file +object_types = ["device","card","ip","transaction","application"] +user_cols = ['userid', 'first_name', 'last_name', 'registration_date', 'registration_country_code', 'uid', 'email_domain'] +device_cols = ['device_hash', 'device_type'] +card_cols = ['card_hash', 'card_type', 'card_country_code'] +ip_cols = ['ip_hash', 'ip_country_code'] +app_cols = ['application_hash'] +trans_cols = ['transaction_hash', 'transaction_date', 'transaction_amount', 'transaction_payment_method', 'card_payment_channel', 'transaction_status', 'transaction_error_code'] +itr_cols = ['itr_hash'] \ No newline at end of file diff --git a/generator/objects/User.py b/generator/objects/User.py index 475d79a..039775e 100644 --- a/generator/objects/User.py +++ b/generator/objects/User.py @@ -17,10 +17,11 @@ def __init__( n_user_ids:int, start_date:str, end_date:str, - fpath_firstnames:str=cons.fpath_firstnames, - fpath_lastnames:str=cons.fpath_lastnames, + fpath_first_names:str=cons.fpath_llama_first_names, + fpath_last_names:str=cons.fpath_llama_last_names, fpath_countries_europe:str=cons.fpath_countries_europe, - fpath_email_domain :str=cons.fpath_email_domain , + fpath_email_domain:str=cons.fpath_email_domain, + fpath_bedrock_email_domain:str=cons.fpath_llama_email_domains, ): """ The randomly generated user data model object @@ -33,14 +34,16 @@ The start date to generate users from end_date : str The end date to generate users till - fpath_firstnames : str - The full file path to the first names reference data, default is cons.fpath_firstnames. - fpath_lastnames : str - The full file path to the last names reference data, default is cons.fpath_lastnames. + fpath_first_names : str + The full file path to the first names reference data, default is cons.fpath_llama_first_names. + fpath_last_names : str + The full file path to the last names reference data, default is cons.fpath_llama_last_names. fpath_countries_europe : str The full file path to the europe countries reference data, default is cons.fpath_countries_europe. fpath_email_domain : str - The full file path to the email domain reference data, default is cons.fpath_email_domain . + The full file path to the email domain reference data, default is cons.fpath_email_domain. + fpath_bedrock_email_domain : str + The full file path to the bedrock email domain reference data, default is cons.fpath_llama_email_domains.
Attributes ---------- @@ -58,9 +59,9 @@ def __init__( The user id counts dictionary user_ids_props_dict : Dict[str, float] The user id proportions dictionary - user_ids_firstname_dict : Dict[str, str] + user_ids_first_name_dict : Dict[str, str] The user id first names dictionary - user_ids_lastname_dict : Dict[str, str] + user_ids_last_name_dict : Dict[str, str] The user id last names dictionary user_ids_country_code_dict : Dict[str, str] The user id country codes dictionary @@ -72,85 +73,60 @@ def __init__( self.n_user_ids = n_user_ids self.start_date = start_date self.end_date = end_date - self.fpath_firstnames = fpath_firstnames - self.fpath_lastnames = fpath_lastnames + self.fpath_first_names = fpath_first_names + self.fpath_last_names = fpath_last_names self.fpath_countries_europe = fpath_countries_europe - self.fpath_email_domain = fpath_email_domain + self.fpath_email_domain = fpath_email_domain + self.fpath_bedrock_email_domain = fpath_bedrock_email_domain self.lam = cons.data_model_poisson_params["user"]["lambda"] self.power = cons.data_model_poisson_params["user"]["power"] self.user_ids_cnts_dict = gen_idhash_cnt_dict(idhash_type="id", n=self.n_user_ids, lam=self.lam, power=self.power) self.user_ids = list(self.user_ids_cnts_dict.keys()) self.user_ids_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.user_ids_cnts_dict) self.user_ids_country_code_dict = gen_country_codes_dict(idhashes=self.user_ids, fpath_countries_europe=self.fpath_countries_europe) - self.user_ids_firstname_dict = self.gen_user_firstname(fpath_firstnames=self.fpath_firstnames) - self.user_ids_lastname_dict = self.gen_user_lastname(fpath_lastnames=self.fpath_lastnames) - self.user_ids_email_domain_dict = self.gen_user_email_domain(fpath_email_domain=self.fpath_email_domain) + self.user_ids_first_name_dict = self.gen_user_bedrock_name_data(fpath_bedrock_data=self.fpath_first_names, sample_column_name="first_names") + self.user_ids_last_name_dict = self.gen_user_bedrock_name_data(fpath_bedrock_data=self.fpath_last_names, sample_column_name="last_names") + self.user_ids_email_domain_dict = self.gen_user_bedrock_email_domain(fpath_email_domain=self.fpath_email_domain, fpath_bedrock_email_domain=self.fpath_bedrock_email_domain) self.user_ids_dates_dict = gen_dates_dict(idhashes=self.user_ids, start_date=self.start_date, end_date=self.end_date) @beartype - def gen_user_firstname( + def gen_user_bedrock_name_data( self, - fpath_firstnames:str, + fpath_bedrock_data:str, + sample_column_name:str, ) -> Dict[str, str]: """ - Generates a dictionary of random user id first names + Generates a dictionary of random user bedrock data, e.g. 
first_names or last_names Parameters ---------- - fpath_firstnames : str - The file path to the first names reference file + fpath_bedrock_data : str + The file path to the bedrock data reference file + sample_column_name : str + The column name to sample from in the bedrock data reference file Returns ------- Dict[str, str] - A dictionary of user id first names + A dictionary of user id bedrock data """ # load in list of first names - first_name_data = pd.read_csv(fpath_firstnames) - # randomly sample names firstnames according to country code and counts + bedrock_data = pd.read_csv(fpath_bedrock_data) + # randomly sample data points according to country code and counts country_code_dataframe = pd.Series(self.user_ids_country_code_dict, name="country_code").to_frame().reset_index().rename(columns={"index":"user_ids"}).assign(count=1) country_codes_cnt = country_code_dataframe.groupby(by="country_code").agg({"user_ids":list,"count":"sum"}).reset_index() - country_codes_cnt["names"] = country_codes_cnt.apply(lambda series: first_name_data.loc[(first_name_data["ISO numeric"] == series["country_code"]), "firstnames"].sample(n=series["count"], replace=True).to_list(), axis=1) - # create the key value pairs mapping user id to firstname - user_ids_names_pairs = country_codes_cnt.apply(lambda series: dict(zip(series["user_ids"], series["names"])), axis=1).to_list() + country_codes_cnt["sample"] = country_codes_cnt.apply(lambda series: bedrock_data.loc[(bedrock_data["ISO numeric"] == series["country_code"]), sample_column_name].sample(n=series["count"], replace=True, weights=None).to_list(), axis=1) + # create the key value pairs mapping user id to bedrock data points + user_ids_bedrock_pairs = country_codes_cnt.apply(lambda series: dict(zip(series["user_ids"], series["sample"])), axis=1).to_list() # convert key value pairs to dict - user_ids_firstname_dict = pd.concat([pd.Series(d) for d in user_ids_names_pairs])[country_code_dataframe["user_ids"]].to_dict() - return user_ids_firstname_dict + user_ids_bedrock_dict = pd.concat([pd.Series(d) for d in user_ids_bedrock_pairs])[country_code_dataframe["user_ids"]].to_dict() + return user_ids_bedrock_dict @beartype - def gen_user_lastname( - self, - fpath_lastnames:str, - ) -> Dict[str, str]: - """ - Generates a dictionary of random user id last names. - - Parameters - ---------- - fpath_lastnames : str - The file path to the last names reference file. - - Returns - ------- - Dict[str, str] - A dictionary of user id last names.
- """ - # load in list of last names - last_name_data = pd.read_csv(fpath_lastnames) - # randomly sample names firstnames according to country code and counts - country_code_dataframe = pd.Series(self.user_ids_country_code_dict, name="country_code").to_frame().reset_index().rename(columns={"index":"user_ids"}).assign(count=1) - country_codes_cnt = country_code_dataframe.groupby(by="country_code").agg({"user_ids":list,"count":"sum"}).reset_index() - country_codes_cnt["names"] = country_codes_cnt.apply(lambda series: last_name_data.loc[(last_name_data["ISO numeric"] == series["country_code"]), "lastnames"].sample(n=series["count"], replace=True).to_list(), axis=1) - # create the key value pairs mapping user id to firstname - user_ids_names_pairs = country_codes_cnt.apply(lambda series: dict(zip(series["user_ids"], series["names"])), axis=1).to_list() - # convert key value pairs to dict - user_ids_lastname_dict = pd.concat([pd.Series(d) for d in user_ids_names_pairs])[country_code_dataframe["user_ids"]].to_dict() - return user_ids_lastname_dict - - @beartype - def gen_user_email_domain( + def gen_user_bedrock_email_domain( self, fpath_email_domain:str, + fpath_bedrock_email_domain:str, ) -> Dict[str, str]: """ Generates a dictionary of random user id email domains @@ -182,4 +158,4 @@ def gen_user_email_domain( ) # return the user ids email domains user_ids_email_domain_dict = dict(zip(self.user_ids, user_email_domain_list)) - return user_ids_email_domain_dict + return user_ids_email_domain_dict \ No newline at end of file diff --git a/generator/unittests/app/test_gen_user_trans_data.py b/generator/unittests/app/test_gen_user_trans_data.py index 540bb1c..4c4be48 100644 --- a/generator/unittests/app/test_gen_user_trans_data.py +++ b/generator/unittests/app/test_gen_user_trans_data.py @@ -34,10 +34,11 @@ np.random.seed(seed=programmeparams.random_seed) # create relative file paths -fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1] -fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1] +fpath_first_names = '.' + cons.fpath_llama_first_names.split(cons.fpath_repo_dir)[1] +fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1] fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] +fpath_bedrock_email_domain = '.' + cons.fpath_llama_email_domains.split(cons.fpath_repo_dir)[1] fpath_smartphones = '.' + cons.fpath_smartphones.split(cons.fpath_repo_dir)[1] fpath_countrycrimeindex = '.' + cons.fpath_countrycrimeindex.split(cons.fpath_repo_dir)[1] fpath_unittest_user_data = '.' 
+ cons.fpath_unittest_user_data.split(cons.fpath_repo_dir)[1] @@ -48,10 +49,11 @@ n_user_ids=programmeparams.n_users, start_date=programmeparams.registration_start_date, end_date=programmeparams.registration_end_date, - fpath_firstnames=fpath_firstnames, - fpath_lastnames=fpath_lastnames, + fpath_first_names=fpath_first_names, + fpath_last_names=fpath_last_names, fpath_countries_europe=fpath_countries_europe, - fpath_email_domain=fpath_email_domain + fpath_email_domain=fpath_email_domain, + fpath_bedrock_email_domain=fpath_bedrock_email_domain, ) # generate random entity counts for each user diff --git a/generator/unittests/objects/test_Application.py b/generator/unittests/objects/test_Application.py index 58435b5..9403daa 100644 --- a/generator/unittests/objects/test_Application.py +++ b/generator/unittests/objects/test_Application.py @@ -28,10 +28,10 @@ "dded2b63f8242648": 0.2727272727272727, } exp_application_hashes_payment_channel_dict = { - "63cea7c46926aa74": "adyen", - "37725417bd51fb40": "adyen", - "b95cb80aae9fbbfe": "paypal", - "dded2b63f8242648": "docomo", + "63cea7c46926aa74": "Adyen", + "37725417bd51fb40": "Adyen", + "b95cb80aae9fbbfe": "PayPal", + "dded2b63f8242648": "Docomo", } exp_n_application_hashes = cons.unittest_n_entities exp_lam = cons.data_model_poisson_params["application"]["lambda"] diff --git a/generator/unittests/objects/test_Card.py b/generator/unittests/objects/test_Card.py index 688455f..32f0358 100644 --- a/generator/unittests/objects/test_Card.py +++ b/generator/unittests/objects/test_Card.py @@ -16,10 +16,10 @@ "dded2b63f8242648": 1, } exp_card_hashes_type_dict = { - "63cea7c46926aa74": "visa", - "37725417bd51fb40": "mastercard", - "b95cb80aae9fbbfe": "visa", - "dded2b63f8242648": "mastercard", + "63cea7c46926aa74": "Visa", + "37725417bd51fb40": "Mastercard", + "b95cb80aae9fbbfe": "Visa", + "dded2b63f8242648": "Mastercard", } exp_card_hashes_props_dict = { "63cea7c46926aa74": 0.16666666666666666, diff --git a/generator/unittests/objects/test_Transaction.py b/generator/unittests/objects/test_Transaction.py index 05531ba..0c338da 100644 --- a/generator/unittests/objects/test_Transaction.py +++ b/generator/unittests/objects/test_Transaction.py @@ -22,10 +22,10 @@ "dded2b63f8242648": 0.3793103448275862, } exp_transaction_hashes_status_dict = { - "63cea7c46926aa74": "successful", - "37725417bd51fb40": "successful", - "b95cb80aae9fbbfe": "successful", - "dded2b63f8242648": "successful", + "63cea7c46926aa74": "Successful", + "37725417bd51fb40": "Successful", + "b95cb80aae9fbbfe": "Successful", + "dded2b63f8242648": "Successful", } exp_transaction_hashes_amounts_dict = { "63cea7c46926aa74": 2.99, diff --git a/generator/unittests/objects/test_User.py b/generator/unittests/objects/test_User.py index 92a30bd..16280c8 100644 --- a/generator/unittests/objects/test_User.py +++ b/generator/unittests/objects/test_User.py @@ -21,13 +21,13 @@ "4264861381989413": 0.20212765957446807, "6720317315593519": 0.2765957446808511, } -exp_user_ids_firstname_dict = { +exp_user_ids_first_name_dict = { "6374692674377254": "simone", "1751409580926382": "francesca", "4264861381989413": "igor", "6720317315593519": "beckett", } -exp_user_ids_lastname_dict = { +exp_user_ids_last_name_dict = { "6374692674377254": "de filippo", "1751409580926382": "gagliardi", "4264861381989413": "lupu", @@ -59,16 +59,27 @@ random.seed(cons.unittest_seed) np.random.seed(cons.unittest_seed) -fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1] -fpath_lastnames = '.' 
+ cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1] +fpath_first_names = '.' + cons.fpath_llama_first_names.split(cons.fpath_repo_dir)[1] +fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1] fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] -user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) +fpath_bedrock_email_domain = '.' + cons.fpath_llama_email_domains.split(cons.fpath_repo_dir)[1] + +user_object = User( + n_user_ids=exp_n_user_ids, + start_date=exp_start_date, + end_date=exp_end_date, + fpath_first_names=fpath_first_names, + fpath_last_names=fpath_last_names, + fpath_countries_europe=fpath_countries_europe, + fpath_email_domain=fpath_email_domain, + fpath_bedrock_email_domain=fpath_bedrock_email_domain + ) obs_user_ids_cnts_dict = user_object.user_ids_cnts_dict obs_user_ids_props_dict = user_object.user_ids_props_dict -obs_user_ids_firstname_dict = user_object.user_ids_firstname_dict -obs_user_ids_lastname_dict = user_object.user_ids_lastname_dict +obs_user_ids_first_name_dict = user_object.user_ids_first_name_dict +obs_user_ids_last_name_dict = user_object.user_ids_last_name_dict obs_user_ids_country_code_dict = user_object.user_ids_country_code_dict obs_user_ids_email_domain_dict = user_object.user_ids_email_domain_dict obs_user_ids_dates_dict = user_object.user_ids_dates_dict @@ -86,10 +97,10 @@ def setUp(self): self.obs_user_ids_cnts_dict = obs_user_ids_cnts_dict self.exp_user_ids_props_dict = exp_user_ids_props_dict self.obs_user_ids_props_dict = obs_user_ids_props_dict - self.exp_user_ids_firstname_dict = exp_user_ids_firstname_dict - self.obs_user_ids_firstname_dict = obs_user_ids_firstname_dict - self.exp_user_ids_lastname_dict = exp_user_ids_lastname_dict - self.obs_user_ids_lastname_dict = obs_user_ids_lastname_dict + self.exp_user_ids_first_name_dict = exp_user_ids_first_name_dict + self.obs_user_ids_first_name_dict = obs_user_ids_first_name_dict + self.exp_user_ids_last_name_dict = exp_user_ids_last_name_dict + self.obs_user_ids_last_name_dict = obs_user_ids_last_name_dict self.exp_user_ids_country_code_dict = exp_user_ids_country_code_dict self.obs_user_ids_country_code_dict = obs_user_ids_country_code_dict self.exp_user_ids_email_domain_dict = exp_user_ids_email_domain_dict @@ -108,8 +119,8 @@ def setUp(self): def test_type(self): self.assertEqual(type(self.obs_user_ids_cnts_dict), type(self.exp_user_ids_cnts_dict)) self.assertEqual(type(self.obs_user_ids_props_dict), type(self.exp_user_ids_props_dict)) - self.assertEqual(type(self.obs_user_ids_firstname_dict),type(self.exp_user_ids_firstname_dict),) - self.assertEqual(type(self.obs_user_ids_lastname_dict), type(self.exp_user_ids_lastname_dict)) + self.assertEqual(type(self.obs_user_ids_first_name_dict),type(self.exp_user_ids_first_name_dict),) + self.assertEqual(type(self.obs_user_ids_last_name_dict), type(self.exp_user_ids_last_name_dict)) self.assertEqual(type(self.obs_user_ids_country_code_dict),type(self.exp_user_ids_country_code_dict),) self.assertEqual(type(self.obs_user_ids_email_domain_dict),type(self.exp_user_ids_email_domain_dict),) self.assertEqual(type(self.obs_user_ids_dates_dict), type(self.exp_user_ids_dates_dict)) @@ -121,8 +132,8 @@ def 
test_type(self): def test_len(self): self.assertEqual(len(self.obs_user_ids_cnts_dict), len(self.exp_user_ids_cnts_dict)) self.assertEqual(len(self.obs_user_ids_props_dict), len(self.exp_user_ids_props_dict)) - self.assertEqual(len(self.obs_user_ids_firstname_dict), len(self.exp_user_ids_firstname_dict)) - self.assertEqual(len(self.obs_user_ids_lastname_dict), len(self.exp_user_ids_lastname_dict)) + self.assertEqual(len(self.obs_user_ids_first_name_dict), len(self.exp_user_ids_first_name_dict)) + self.assertEqual(len(self.obs_user_ids_last_name_dict), len(self.exp_user_ids_last_name_dict)) self.assertEqual(len(self.obs_user_ids_country_code_dict),len(self.exp_user_ids_country_code_dict),) self.assertEqual(len(self.obs_user_ids_email_domain_dict),len(self.exp_user_ids_email_domain_dict),) self.assertEqual(len(self.obs_user_ids_dates_dict), len(self.exp_user_ids_dates_dict)) @@ -130,8 +141,8 @@ def test_len(self): def test_keys(self): self.assertEqual(list(self.obs_user_ids_cnts_dict.keys()),list(self.exp_user_ids_cnts_dict.keys()),) self.assertEqual(list(self.obs_user_ids_props_dict.keys()),list(self.exp_user_ids_props_dict.keys()),) - self.assertEqual(list(self.obs_user_ids_firstname_dict.keys()),list(self.exp_user_ids_firstname_dict.keys()),) - self.assertEqual(list(self.obs_user_ids_lastname_dict.keys()),list(self.exp_user_ids_lastname_dict.keys()),) + self.assertEqual(list(self.obs_user_ids_first_name_dict.keys()),list(self.exp_user_ids_first_name_dict.keys()),) + self.assertEqual(list(self.obs_user_ids_last_name_dict.keys()),list(self.exp_user_ids_last_name_dict.keys()),) self.assertEqual(list(self.obs_user_ids_country_code_dict.keys()),list(self.exp_user_ids_country_code_dict.keys()),) self.assertEqual(list(self.obs_user_ids_email_domain_dict.keys()),list(self.exp_user_ids_email_domain_dict.keys()),) self.assertEqual(list(self.obs_user_ids_dates_dict.keys()),list(self.exp_user_ids_dates_dict.keys()),) @@ -139,8 +150,8 @@ def test_keys(self): def test_values(self): self.assertEqual(list(self.obs_user_ids_cnts_dict.values()),list(self.exp_user_ids_cnts_dict.values()),) self.assertEqual(list(self.obs_user_ids_props_dict.values()),list(self.exp_user_ids_props_dict.values()),) - self.assertEqual(list(self.obs_user_ids_firstname_dict.values()),list(self.exp_user_ids_firstname_dict.values()),) - self.assertEqual(list(self.obs_user_ids_lastname_dict.values()),list(self.exp_user_ids_lastname_dict.values()),) + self.assertEqual(list(self.obs_user_ids_first_name_dict.values()),list(self.exp_user_ids_first_name_dict.values()),) + self.assertEqual(list(self.obs_user_ids_last_name_dict.values()),list(self.exp_user_ids_last_name_dict.values()),) self.assertEqual(list(self.obs_user_ids_country_code_dict.values()),list(self.exp_user_ids_country_code_dict.values()),) self.assertEqual(list(self.obs_user_ids_email_domain_dict.values()),list(self.exp_user_ids_email_domain_dict.values()),) self.assertEqual(list(self.obs_user_ids_dates_dict.values()),list(self.exp_user_ids_dates_dict.values()),) @@ -148,8 +159,8 @@ def test_values(self): def test_object(self): self.assertEqual(self.obs_user_ids_cnts_dict, self.exp_user_ids_cnts_dict) self.assertEqual(self.obs_user_ids_props_dict, self.exp_user_ids_props_dict) - self.assertEqual(self.obs_user_ids_firstname_dict, self.exp_user_ids_firstname_dict) - self.assertEqual(self.obs_user_ids_lastname_dict, self.exp_user_ids_lastname_dict) + self.assertEqual(self.obs_user_ids_first_name_dict, self.exp_user_ids_first_name_dict) + 
self.assertEqual(self.obs_user_ids_last_name_dict, self.exp_user_ids_last_name_dict) self.assertEqual(self.obs_user_ids_country_code_dict, self.exp_user_ids_country_code_dict) self.assertEqual(self.obs_user_ids_email_domain_dict, self.exp_user_ids_email_domain_dict) self.assertEqual(self.obs_user_ids_dates_dict, self.exp_user_ids_dates_dict) diff --git a/generator/unittests/utilities/test_gen_obj_idhash_series.py b/generator/unittests/utilities/test_gen_obj_idhash_series.py index 18faa86..7e3eabd 100644 --- a/generator/unittests/utilities/test_gen_obj_idhash_series.py +++ b/generator/unittests/utilities/test_gen_obj_idhash_series.py @@ -20,8 +20,8 @@ start_date = cons.unittest_registration_start_date end_date = cons.unittest_registration_end_date n_user_ids = cons.unittest_n_entities -fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1] -fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1] +fpath_first_names = '.' + cons.fpath_llama_first_names.split(cons.fpath_repo_dir)[1] +fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1] fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] fpath_smartphones = '.' + cons.fpath_smartphones.split(cons.fpath_repo_dir)[1] @@ -30,7 +30,7 @@ np.random.seed(cons.unittest_seed) # create user object -user_object = User(n_user_ids=n_user_ids, start_date=start_date, end_date=end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) +user_object = User(n_user_ids=n_user_ids, start_date=start_date, end_date=end_date, fpath_first_names=fpath_first_names, fpath_last_names=fpath_last_names, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) # generate random entity counts random_entity_counts = gen_random_entity_counts(user_obj=user_object) # generate random entity values diff --git a/generator/unittests/utilities/test_gen_random_entity_counts.py b/generator/unittests/utilities/test_gen_random_entity_counts.py index 58a5522..45c8d27 100644 --- a/generator/unittests/utilities/test_gen_random_entity_counts.py +++ b/generator/unittests/utilities/test_gen_random_entity_counts.py @@ -19,11 +19,11 @@ random.seed(cons.unittest_seed) np.random.seed(cons.unittest_seed) -fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1] -fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1] +fpath_first_names = '.' + cons.fpath_llama_first_names.split(cons.fpath_repo_dir)[1] +fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1] fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] fpath_email_domain = '.' 
+ cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] -user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) +user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_first_names=fpath_first_names, fpath_last_names=fpath_last_names, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) exp_randomentity_counts_dict = { 'uid': ['6374692674377254', '6720317315593519', '4264861381989413', '1751409580926382'], diff --git a/generator/utilities/Bedrock.py b/generator/utilities/Bedrock.py index 9ea42d6..1e2ea0a 100644 --- a/generator/utilities/Bedrock.py +++ b/generator/utilities/Bedrock.py @@ -1,5 +1,5 @@ import json -import boto3 +from typing import Dict, List, Optional from beartype import beartype class Bedrock(): @@ -10,7 +10,7 @@ class Bedrock(): Parameters ---------- - session : boto3.Session + bedrock_runtime : botocore.client.BaseClient A Boto3 session object configured with appropriate AWS credentials. model_region: str The AWS region where the Bedrock model is hosted. @@ -31,16 +31,14 @@ class Bedrock(): @beartype def __init__( self, - session:boto3.Session, - model_region="us-east-1", - model_id:str="meta.llama3-8b-instruct-v1:0", + bedrock_runtime, ): - self.client = session.client("bedrock-runtime", region_name=model_region) - self.model_id = model_id, - + self.bedrock_runtime = bedrock_runtime + @beartype def prompt( self, + model_id:str, user_prompt:str, system_prompt:str="", top_p:float=0.5, @@ -89,32 +87,52 @@ def prompt( # call bedrock model try: # Invoke the model with the request. - response = self.client.invoke_model(modelId=self.model_id, body=request) + response = self.bedrock_runtime.invoke_model(modelId=model_id, body=request) except Exception as e: - raise Exception(f"ERROR: Can't invoke '{self.model_id}'. Reason: {e}") + raise Exception(f"ERROR: Can't invoke '{model_id}'. Reason: {e}") # Decode and extract the response model_response = json.loads(response["body"].read()) response_text = model_response["generation"] return response_text + + @beartype + def converse( + self, + modelId:str, + messages:List, + system:List, + inference_config:Dict={"maxTokens":512, "temperature":0.5, "topP":0.5,}, + tools_config:Optional[Dict]=None + ): + """ + Invoke the Bedrock model with the provided messages and configurations. -system_prompt = """# Task - -You are a name generator for people from different countries in Europe. Your task is to generate an arbitrary N number of distinct and varied first names and last names for people from a given European country of origin. - -# Requirements - -- Generate typical names for both male and female people. -- The names do not need to be traditional to the target European country. -- Do not repeat any first names or last names more than once. Each individual first name must be unique and each individual last name must be unique. -- You should return the first names and last names using a valid JSON object tagged as <answer></answer>.
-- The valid JSON object should be of the following structure; {"firstnames":["first name 1","first name 2",...,"first name N"], "lastnames":["last name 1","last name 2",...,"last name N"]} - -# Examples - -- Generate 2 first names and 2 last names for people from the country "Germany" -> {"firstnames":["Max","Hannah"], "lastnames":["Müller","Schmidt"]} -- Generate 4 first names and 4 last names for people from the country "United Kingdom" -> {"firstnames":["George","Richard","Katie","Mary"], "lastnames":["Smith","Taylor","Jones","Brown"]} -- Generate 3 first names and 3 last names for people from the country "France" -> {"firstnames":["Lola","Mathieu","Léa"], "lastnames":["Benoît","Pierre","Lefort"]} -- Generate 5 first names and 5 last names for people from the country "Spain" -> {"firstnames":["Juan","Cristina","Javier","Julia","Isabel"], "lastnames":["Garcia","Martinez","Rodriguez","Lopez","Gomez"]} -- Generate 6 first names and 6 last names for people from the country "Sweden" -> {"firstnames":["Tova","Alva","Casper","Märta","Axel","Elsa"], "lastnames":["Andersson","Johansson","Lundberg","Svensson","Pettersson","Nilsson"]}""" + Parameters + ---------- + messages : List + A list of message objects representing the conversation history. + system : List + A list of system message objects providing context or instructions for the model. + inference_config : Dict + Configuration settings for inference parameters. + tools_config : Optional[Dict] + Configuration settings for any tools to be used during inference. -prompt = 'Generate {n_user_names} first names and {n_user_names} last names for people from the country "{country}"' \ No newline at end of file + Returns + ------- + Dict: + The response from the Bedrock model. + + References + ---------- + https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-runtime/client/converse.html + """ + payload = {"modelId": modelId, "messages": messages, "system": system} + if inference_config: + payload["inferenceConfig"] = inference_config + if tools_config: + payload["toolConfig"] = tools_config + # call converse api + response = self.bedrock_runtime.converse(**payload) + return response diff --git a/generator/utilities/gen_trans_status.py b/generator/utilities/gen_trans_status.py index 35fbbd3..f41ac62 100644 --- a/generator/utilities/gen_trans_status.py +++ b/generator/utilities/gen_trans_status.py @@ -33,7 +33,7 @@ def gen_trans_status( country_code_columns = ["registration_country_code","ip_country_code","card_country_code"] # if card hash if pd.notna(series['card_hash']): - status = "rejected" + status = "Rejected" # add rejections based on crime rates within country codes if rejection_rates_dict["country_code_trans_reject_rate_dict"][np.random.choice(a=series[country_code_columns].dropna().to_list(), size=1)[0]] >= random.uniform(0, 1)/rejection_scaling_factor: error_code = np.random.choice(a=list(cons.data_model_rejection_codes_fraud.keys()),p=list(cons.data_model_rejection_codes_fraud.values()),size=1)[0] @@ -59,11 +59,11 @@ def gen_trans_status( error_code = np.random.choice(a=list(cons.data_model_rejection_codes_funds.keys()),p=list(cons.data_model_rejection_codes_funds.values()),size=1)[0] # otherwise return successful status else: - successful_status = {key:cons.data_model_transaction_status[key] for key in ['successful', 'pending']} + successful_status = {key:cons.data_model_transaction_status[key] for key in ['Successful', 'Pending']} successful_probs = [value/sum(successful_status.values()) for value in
successful_status.values()] status = np.random.choice(a=list(successful_status.keys()), size=1, p=successful_probs)[0] error_code = np.nan else: - status = np.random.choice(a=['successful', 'pending'], size=1, p=[0.98, 0.02])[0] + status = np.random.choice(a=['Successful', 'Pending'], size=1, p=[0.98, 0.02])[0] error_code = np.nan return [status, error_code] diff --git a/generator/utilities/gen_user_names_file.py b/generator/utilities/gen_user_names_file.py deleted file mode 100644 index c8765fe..0000000 --- a/generator/utilities/gen_user_names_file.py +++ /dev/null @@ -1,208 +0,0 @@ -import os -import json -import boto3 -import sys -import time -import logging -import unidecode -import pandas as pd -import numpy as np - -sys.path.append("E:\\GitHub\\RandomTelecomPayments\\generator") - -import cons -from utilities.Bedrock import Bedrock, prompt, system_prompt - -def invoke_bedrock( - model:Bedrock, - n_user_names:int, - country:str, - countrieseurope:pd.DataFrame, - ) -> tuple[pd.DataFrame, pd.DataFrame]: - """ - Invokes the Bedrock model to generate user names for a specified country. - - This function calls the Bedrock model with a formatted prompt to generate first names - and last names for a given country. It processes the model's response, parses the JSON - output, and merges the results with country data. The function deduplicates and standardizes - the name formatting, then persists the data to temporary CSV files. - - Parameters - ---------- - model : Bedrock - The Bedrock model instance used to generate names. - n_user_names : int - The number of user names to generate. - country : str - The country for which to generate names. - countrieseurope : pd.DataFrame - A DataFrame containing country information for merging. - - Returns - ------- - tuple: - A tuple containing two pandas DataFrames: - - tmp_firstname_country_data (pd.DataFrame): DataFrame with deduplicated and standardized first names along with country information. - - tmp_lastname_country_data (pd.DataFrame): DataFrame with deduplicated and standardized last names along with country information. - - Raises - ------ - json.JSONDecodeError: If the model response cannot be parsed as JSON. - KeyError: If the expected keys ("firstnames", "lastnames") are missing from the JSON response. - Exception: If the merge with country data fails or file I/O operations encounter errors. - - Notes - ----- - - Names are standardized by converting to lowercase, removing extra whitespace, and applying Unicode normalization using unidecode. - - Duplicate names are removed after each processing step. - - Results are concatenated with any previously generated data for the same country and saved to temporary CSV files if the new data increases the dataset size. - - CSV files are encoded in latin1 format. 
- - """ - logging.info("Calling Bedrock ...") - # call bedrock model - formatted_prompt = prompt.format(n_user_names=n_user_names, country=country) - logging.info(formatted_prompt) - model_response = model.prompt(user_prompt=formatted_prompt, system_prompt=system_prompt, max_gen_len=2048) - # split out answer - text = model_response.split("")[1].split("")[0] - # parse json - try: - record_set = json.loads(text) - except json.JSONDecodeError as e: - raise Exception(f"Error parsing JSON: {e}") - logging.info("Processing results ...") - # generate pandas dataframe - user_firstname_data = pd.Series(record_set["firstnames"], name="firstnames").to_frame().drop_duplicates(subset=["firstnames"]) - user_lastname_data = pd.Series(record_set["lastnames"], name="lastnames").to_frame().drop_duplicates(subset=["lastnames"]) - # add country - user_firstname_data['country'] = country - user_lastname_data['country'] = country - # join on country codes - llama_firstname_country_data = user_firstname_data.merge(right=countrieseurope, left_on='country', right_on='name', how='inner').drop(columns=['name']) - llama_lastname_country_data = user_lastname_data.merge(right=countrieseurope, left_on='country', right_on='name', how='inner').drop(columns=['name']) - # print shapes - logging.info(f"llama_firstname_country_data.shape: {llama_firstname_country_data.shape}") - logging.info(f"llama_lastname_country_data.shape: {llama_lastname_country_data.shape}") - # format output file paths - fpath_temp_llama_firstnames = cons.fpath_temp_llama_firstnames.format(country=country.lower()) - fpath_temp_llama_lastnames = cons.fpath_temp_llama_lastnames.format(country=country.lower()) - # check against previous iterations - tmp_firstname_country_data = pd.DataFrame() - tmp_lastname_country_data = pd.DataFrame() - if os.path.exists(fpath_temp_llama_firstnames): - tmp_firstname_country_data = pd.read_csv(fpath_temp_llama_firstnames, encoding="latin1") - if os.path.exists(fpath_temp_llama_lastnames): - tmp_lastname_country_data = pd.read_csv(fpath_temp_llama_lastnames, encoding="latin1") - # concatenate results - tmp_firstname_country_data = pd.concat(objs=[tmp_firstname_country_data, llama_firstname_country_data], axis=0, ignore_index=True) - tmp_lastname_country_data = pd.concat(objs=[tmp_lastname_country_data, llama_lastname_country_data], axis=0, ignore_index=True) - # standardise names formatting - standardise_text_lambda = lambda x: unidecode.unidecode(" ".join(x.lower().strip().split())) if pd.isna(x) else x - tmp_firstname_country_data["firstnames"] = tmp_firstname_country_data["firstnames"].apply(lambda x: standardise_text_lambda(x)) - tmp_lastname_country_data["lastnames"] = tmp_lastname_country_data["lastnames"].apply(lambda x: standardise_text_lambda(x)) - # deduplicate data - tmp_firstname_country_data = tmp_firstname_country_data.drop_duplicates(subset=["firstnames"]) - tmp_lastname_country_data = tmp_lastname_country_data.drop_duplicates(subset=["lastnames"]) - # print shapes - logging.info(f"tmp_firstname_country_data.shape: {tmp_firstname_country_data.shape}") - logging.info(f"tmp_lastname_country_data.shape: {tmp_lastname_country_data.shape}") - # save firstnames names data to temp directory (if pairwise firstnames have been created) - if tmp_firstname_country_data.shape[0] >= llama_firstname_country_data.shape[0]: - tmp_firstname_country_data.to_csv(fpath_temp_llama_firstnames, index=False, encoding="latin1") - logging.info(f"Wrote {fpath_temp_llama_firstnames} ...") - # save lastnames data to temp directory (if 
pairwise lastnames have been created) - if tmp_lastname_country_data.shape[0] >= llama_lastname_country_data.shape[0]: - tmp_lastname_country_data.to_csv(fpath_temp_llama_lastnames, index=False, encoding="latin1") - logging.info(f"Wrote {fpath_temp_llama_lastnames} ...") - return (tmp_firstname_country_data, tmp_lastname_country_data) - -if __name__ == "__main__": - - # set up logging - lgr = logging.getLogger() - lgr.setLevel(logging.INFO) - - # load aws config - with open(cons.fpath_aws_session_token, "r") as j: - aws_config = json.loads(j.read()) - - # connect to aws boto3 - session = boto3.Session( - aws_access_key_id=aws_config['Credentials']["AccessKeyId"], - aws_secret_access_key=aws_config['Credentials']["SecretAccessKey"], - aws_session_token=aws_config['Credentials']["SessionToken"], - region_name="us-east-1" - ) - - # create bedrock instance - bedrock = Bedrock(session=session, model_region="us-east-1", model_id="meta.llama3-70b-instruct-v1:0") - - # load countries, firstnames and surnames files - countrieseurope = pd.read_csv(cons.fpath_countries_europe, usecols=['name', 'ISO numeric']) - orig_firstnames = pd.read_csv(cons.fpath_firstnames) - orig_surnames = pd.read_csv(cons.fpath_lastnames) - - # determine file size - orig_filesize = int((orig_firstnames.shape[0] + orig_surnames.shape[0])/2) - n_countries = countrieseurope.shape[0] - n_user_names = min(50, int(orig_filesize / n_countries)) - - # generate user names - firstname_country_data = [] - lastname_country_data = [] - error_countries = [] - # switch to toggle bedrock calls - run_bedrock = False - - # set countries list - countries_list = countrieseurope['name'].to_list() - #countries_list = ['Cyprus'] - - for country in countries_list: - logging.info(f"{country} ...") - try: - if run_bedrock: - # call bedrock model and generate user names data - tmp_firstname_country_data, tmp_lastname_country_data = invoke_bedrock(model=bedrock, n_user_names=n_user_names, country=country) - logging.info("Waiting ...") - # wait 20 seconds before retrying - time.sleep(20) - else: - tmp_firstname_country_data = pd.read_csv(cons.fpath_temp_llama_firstnames.format(country=country.lower()), encoding="latin1") - tmp_lastname_country_data = pd.read_csv(cons.fpath_temp_llama_lastnames.format(country=country.lower()), encoding="latin1") - # append to user country data - firstname_country_data.append(tmp_firstname_country_data) - lastname_country_data.append(tmp_lastname_country_data) - except Exception as e: - logging.info(e) - error_countries.append(country) - - # log if any countries failed to generate data - if len(error_countries) > 0: - logging.info(f"Failed to generated data for countries: {error_countries}") - - # load existing reference data - firstname_country_df = pd.read_csv(cons.fpath_llama_firstnames, encoding="latin1") - lastname_country_df = pd.read_csv(cons.fpath_llama_lastnames, encoding="latin1") - # append to country data lists - firstname_country_data.append(firstname_country_df) - lastname_country_data.append(lastname_country_df) - # concatenate user country data together and deduplicate across firstnames and countries - output_firstname_country_df = pd.concat(firstname_country_data, axis=0, ignore_index=True) - output_lastname_country_df = pd.concat(lastname_country_data, axis=0, ignore_index=True) - # sort and deduplicate output data - output_firstname_country_df = output_firstname_country_df.drop_duplicates(subset=["country","firstnames"]).sort_values(by=["country","firstnames"]) - output_lastname_country_df = 
output_lastname_country_df.drop_duplicates(subset=["country","lastnames"]).sort_values(by=["country","lastnames"]) - - # write data to disk - if output_firstname_country_df['country'].nunique() == n_countries: - logging.info(f"output_firstname_country_df.shape: {output_firstname_country_df.shape}") - output_firstname_country_df.to_csv(cons.fpath_llama_firstnames, index=False, encoding="latin1") - else: - logging.info("WARNING Insufficient first name data generated.") - if output_lastname_country_df['country'].nunique() == n_countries: - logging.info(f"output_lastname_country_df.shape: {output_lastname_country_df.shape}") - output_lastname_country_df.to_csv(cons.fpath_llama_lastnames, index=False, encoding="latin1") - else: - logging.info("WARNING Insufficient last name data generated.") \ No newline at end of file
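
Note on the gen_trans_data.py change: the replaced line's trailing `[0]` collapsed the sampled array to its first element, so every non-card transaction received the same payment method. The fix wraps the draws in a Series carrying the masked rows' index, which is what makes the `.loc` assignment land one draw per row. A minimal sketch of the behaviour, using a hypothetical five-row frame:

```python
import numpy as np
import pandas as pd

# hypothetical toy frame: three of five transactions have no card hash
trans = pd.DataFrame({"card_hash": ["a1", None, None, "b2", None]})
mask = trans["card_hash"].isnull()

methods = {"Wallet": 0.85, "Points": 0.15}
# one weighted draw per masked row; passing index=trans[mask].index makes the
# .loc assignment align label-by-label (a Series built with the default
# RangeIndex would align on the wrong labels and leave NaNs behind)
sampled = pd.Series(
    np.random.choice(a=list(methods.keys()), size=mask.sum(), p=list(methods.values())),
    index=trans[mask].index,
)
trans.loc[mask, "transaction_payment_method"] = sampled
print(trans)
```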
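Note on the reworked utilities/Bedrock.py: the class is now a thin wrapper over a pre-built bedrock-runtime client, and converse() passes its arguments straight through to the boto3 Converse API. A sketch of how the batch script drives it, assuming AWS credentials are already configured in the environment; the model id and inference settings mirror gen_bedrock_data.py:

```python
import boto3
from botocore.config import Config

from utilities.Bedrock import Bedrock

# build the bedrock-runtime client outside the wrapper, as the batch script does
session = boto3.Session(region_name="us-east-1")
runtime = session.client(
    service_name="bedrock-runtime",
    config=Config(retries={"max_attempts": 1, "mode": "adaptive"}),
)
bedrock = Bedrock(bedrock_runtime=runtime)

# a single user turn asking for three first names
messages = [{"role": "user", "content": [{"text": 'Generate 3 first names for people from the country "France"'}]}]
response = bedrock.converse(
    modelId="us.meta.llama3-1-70b-instruct-v1:0",
    messages=messages,
    system=[{"text": "You are a name generator ..."}],  # abbreviated system prompt
    inference_config={"maxTokens": 8192, "temperature": 0.5, "topP": 0.5},
)
# the Converse API nests the generation under output -> message -> content
text = response["output"]["message"]["content"][0]["text"]
print(text)
```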
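Note on the request sizing in gen_bedrock_data.py: the per-country name count is damped logarithmically via `int(np.log(country_population)**1.5)`, which keeps request sizes roughly flat across population scales (email domains are fixed at 5 per country). Approximate magnitudes under this formula:

```python
import numpy as np

def n_names_for(population: int) -> int:
    # natural log keeps the count nearly constant across population scales
    return int(np.log(population) ** 1.5)

print(n_names_for(1_000_000))   # 51, roughly a Cyprus-sized country
print(n_names_for(83_000_000))  # 77, roughly a Germany-sized country
```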
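Note on the consolidated User.gen_user_bedrock_name_data: it replaces the near-duplicate gen_user_firstname/gen_user_lastname pair with one country-grouped sampling pass: group user ids by country code, draw `count` values with replacement from that country's slice of the reference file, then zip the ids back to the draws. A condensed sketch of the pattern with hypothetical reference rows in the llama_first_names.csv layout:

```python
import pandas as pd

# hypothetical reference rows shaped like llama_first_names.csv
ref = pd.DataFrame({
    "first_names": ["max", "hannah", "lola", "lea"],
    "ISO numeric": [276, 276, 250, 250],
})
# user id -> ISO numeric country code
user_country = pd.Series({"u1": 276, "u2": 250, "u3": 276}, name="country_code")

frame = user_country.to_frame().reset_index().rename(columns={"index": "user_ids"}).assign(count=1)
grouped = frame.groupby(by="country_code").agg({"user_ids": list, "count": "sum"}).reset_index()
# per country, draw as many names as there are users registered to it
grouped["sample"] = grouped.apply(
    lambda s: ref.loc[ref["ISO numeric"] == s["country_code"], "first_names"]
    .sample(n=s["count"], replace=True)
    .to_list(),
    axis=1,
)
# zip each country's user ids back to its draws and flatten to one dict
pairs = grouped.apply(lambda s: dict(zip(s["user_ids"], s["sample"])), axis=1).to_list()
user_names = pd.concat([pd.Series(d) for d in pairs])[frame["user_ids"]].to_dict()
print(user_names)  # e.g. {'u1': 'hannah', 'u2': 'lola', 'u3': 'max'}
```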