diff --git a/data/ref/llama_firstnames.csv b/data/ref/llama_first_names.csv
similarity index 99%
rename from data/ref/llama_firstnames.csv
rename to data/ref/llama_first_names.csv
index f7b0e44..4668401 100644
--- a/data/ref/llama_firstnames.csv
+++ b/data/ref/llama_first_names.csv
@@ -1,4 +1,4 @@
-firstnames,country,ISO numeric
+first_names,country,ISO numeric
agrina,Albania,8
agron,Albania,8
albana,Albania,8
diff --git a/data/ref/llama_lastnames.csv b/data/ref/llama_last_names.csv
similarity index 99%
rename from data/ref/llama_lastnames.csv
rename to data/ref/llama_last_names.csv
index 9808434..a3d215e 100644
--- a/data/ref/llama_lastnames.csv
+++ b/data/ref/llama_last_names.csv
@@ -1,4 +1,4 @@
-lastnames,country,ISO numeric
+last_names,country,ISO numeric
bajramaj,Albania,8
bajrami,Albania,8
bardhi,Albania,8
diff --git a/data/unittest/transaction_data.parquet b/data/unittest/transaction_data.parquet
index 47b78f1..b871daa 100644
Binary files a/data/unittest/transaction_data.parquet and b/data/unittest/transaction_data.parquet differ
diff --git a/data/unittest/user_data.parquet b/data/unittest/user_data.parquet
index bb932c9..91a5b48 100644
Binary files a/data/unittest/user_data.parquet and b/data/unittest/user_data.parquet differ
diff --git a/generator/app/gen_random_telecom_data.py b/generator/app/gen_random_telecom_data.py
index 28b0283..363dc19 100644
--- a/generator/app/gen_random_telecom_data.py
+++ b/generator/app/gen_random_telecom_data.py
@@ -72,10 +72,11 @@ def gen_random_telecom_data(
n_user_ids=programmeparams.n_users,
start_date=programmeparams.registration_start_date,
end_date=programmeparams.registration_end_date,
- fpath_firstnames=cons.fpath_llama_firstnames,
- fpath_lastnames=cons.fpath_llama_lastnames,
+ fpath_first_names=cons.fpath_llama_first_names,
+ fpath_last_names=cons.fpath_llama_last_names,
fpath_countries_europe=cons.fpath_countries_europe,
- fpath_email_domain =cons.fpath_email_domain
+ fpath_email_domain=cons.fpath_email_domain,
+ fpath_bedrock_email_domain=cons.fpath_llama_email_domains
)
# generate random entity counts for each user
diff --git a/generator/app/gen_trans_data.py b/generator/app/gen_trans_data.py
index 363941d..0c9326c 100644
--- a/generator/app/gen_trans_data.py
+++ b/generator/app/gen_trans_data.py
@@ -89,11 +89,10 @@ def gen_trans_data(
trans_data.loc[zero_transaction_amount_filter | missing_card_hash_filter, ['card_payment_channel']] = np.nan
trans_data.loc[zero_transaction_amount_filter, ['card_hash', 'card_type', 'card_country_code_alpha']] = np.nan
    # add payment method as either Card, Wallet or Points
- trans_data['transaction_payment_method'] = 'card'
+ trans_data['transaction_payment_method'] = 'Card'
zero_transaction_amount_filter = (trans_data['transaction_amount'] == 0.0)
missing_card_hash_filter = (trans_data['card_hash'].isnull())
- # trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = missing_card_hash_filter.apply(lambda x: np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = 1, p = list(cons.data_model_non_card_trans_methods.values()))[0])
- trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = pd.Series(np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = missing_card_hash_filter.sum(), p = list(cons.data_model_non_card_trans_methods.values()))[0])
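+    # sample one non-card method per missing-card row; building the Series on the filtered index keeps the .loc assignment row-aligned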
+ trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = pd.Series(np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = missing_card_hash_filter.sum(), p = list(cons.data_model_non_card_trans_methods.values()), replace=True), index=trans_data[missing_card_hash_filter].index)
trans_data.loc[zero_transaction_amount_filter, 'transaction_payment_method'] = np.nan
# align country codes for user, ip and card
country_code_columns = ['registration_country_code_alpha', 'ip_country_code_alpha', 'card_country_code_alpha']
@@ -118,14 +117,7 @@ def gen_trans_data(
trans_data[['transaction_status', 'transaction_error_code']] = trans_data.apply(lambda series: gen_trans_status(series = series, rejection_rates_dict = rejection_rates_dict), result_type = 'expand', axis = 1)
# order columns and sort rows by transaction date
- user_cols = ['userid', 'firstname', 'lastname', 'registration_date', 'registration_country_code', 'uid', 'email_domain']
- device_cols = ['device_hash', 'device_type']
- card_cols = ['card_hash', 'card_type', 'card_country_code']
- ip_cols = ['ip_hash', 'ip_country_code']
- app_cols = ['application_hash']
- trans_cols = ['transaction_hash', 'transaction_date', 'transaction_amount', 'transaction_payment_method', 'card_payment_channel', 'transaction_status', 'transaction_error_code']
- itr_cols = ['itr_hash']
- col_order = user_cols + device_cols + card_cols + ip_cols + app_cols + trans_cols + itr_cols
+    col_order = cons.user_cols + cons.device_cols + cons.card_cols + cons.ip_cols + cons.app_cols + cons.trans_cols + cons.itr_cols
trans_data = trans_data[col_order].sort_values(by = 'transaction_date').reset_index(drop = True)
return trans_data
\ No newline at end of file
diff --git a/generator/app/gen_user_data.py b/generator/app/gen_user_data.py
index 9c5fa94..96cc651 100644
--- a/generator/app/gen_user_data.py
+++ b/generator/app/gen_user_data.py
@@ -50,8 +50,8 @@ def gen_user_data(
# take a deep copy of the data
user_data = random_entity_counts.copy()
# add user data
- user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_firstname_dict, idhash_key_name='uid', idhash_val_name='firstname')
- user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_lastname_dict, idhash_key_name='uid', idhash_val_name='lastname')
+ user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_first_name_dict, idhash_key_name='uid', idhash_val_name='first_name')
+ user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_last_name_dict, idhash_key_name='uid', idhash_val_name='last_name')
user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_dates_dict, idhash_key_name='uid', idhash_val_name='registration_date')
user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_country_code_dict, idhash_key_name='uid', idhash_val_name='registration_country_code_alpha')
user_data = join_idhashes_dict(data=user_data, idhashes_dict=user_obj.user_ids_email_domain_dict, idhash_key_name='uid', idhash_val_name='email_domain')
diff --git a/generator/batch/gen_bedrock_data.py b/generator/batch/gen_bedrock_data.py
new file mode 100644
index 0000000..9238a56
--- /dev/null
+++ b/generator/batch/gen_bedrock_data.py
@@ -0,0 +1,246 @@
+# python generator/batch/gen_bedrock_data.py
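+# running this script (assuming valid AWS credentials at cons.fpath_aws_session_token) regenerates
+# llama_first_names.csv, llama_last_names.csv and llama_email_domains.csv under data/ref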
+
+import json
+import boto3
+from botocore.config import Config
+import sys
+import time
+import logging
+import unidecode
+import pandas as pd
+import numpy as np
+
+sys.path.append("E:\\GitHub\\RandomTelecomPayments\\generator")
+
+import cons
+from utilities.Bedrock import Bedrock
+
+system_name_prompt = """# Task
+
+You are a name generator for people from different countries in Europe.
+Your task is to generate an arbitrary N number of distinct and varied first names, or last names, for people from a given European country of origin.
+
+# Requirements
+
+- Generate typical names for both male and female people.
+- The names do not need to be traditional to the target European country.
+- Do not repeat any first names or last names more than once.
+- Each individual first name must be unique and each individual last name must be unique.
+- You should return the first names and last names as a valid JSON array wrapped in <answer></answer> tags.
+- The valid JSON array should have the following structure: `["name 1","name 2",...,"name N"]`.
+
+# Examples
+
+## First Names
+
+- Generate 2 first names for people from the country "Germany" -> ["Max","Hannah"]
+- Generate 4 first names for people from the country "United Kingdom" -> ["George","Richard","Katie","Mary"]
+- Generate 3 first names for people from the country "France" -> ["Lola","Mathieu","Léa"]
+- Generate 5 first names for people from the country "Spain" -> ["Juan","Cristina","Javier","Julia","Isabel"]
+- Generate 6 first names for people from the country "Sweden" -> ["Tova","Alva","Casper","Märta","Axel","Elsa"]
+
+## Last Names
+
+- Generate 2 last names for people from the country "Germany" -> ["Müller","Schmidt"]
+- Generate 4 last names for people from the country "United Kingdom" -> ["Smith","Taylor","Jones","Brown"]
+- Generate 3 last names for people from the country "France" -> ["Benoît","Pierre","Lefort"]
+- Generate 5 last names for people from the country "Spain" -> ["Garcia","Martinez","Rodriguez","Lopez","Gomez"]
+- Generate 6 last names for people from the country "Sweden" -> ["Andersson","Johansson","Lundberg","Svensson","Pettersson","Nilsson"]
+"""
+
+system_email_prompt = """
+"""
+
+first_name_prompt = 'Generate {n_data_points} first names for people from the country "{country}"'
+surname_prompt = 'Generate {n_data_points} last names for people from the country "{country}"'
+email_domain_prompt = 'Generate {n_data_points} popular email domain names for people from the country "{country}"'
+
+# map each data point type to its user prompt template
+prompt_dict = {
+    "first_names": first_name_prompt,
+    "last_names": surname_prompt,
+    "email_domains": email_domain_prompt,
+}
+
+bedrock_config = {
+ "inferenceConfig":{
+ "maxTokens":8192,
+ "temperature":0.5,
+ "topP":0.5,
+ },
+ "system":[
+ {
+ "text":system_name_prompt
+ }
+ ]
+}
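+# note: maxTokens, temperature and topP follow the Bedrock Converse API inferenceConfig field names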
+
+def invoke_bedrock(
+ model:Bedrock,
+ model_id:str,
+ data_point:str,
+ n_data_points:int,
+ country:str,
+ countrieseurope:pd.DataFrame,
+ prompt:str,
+ system_prompt:str,
+ country_fpath:str,
+    ) -> pd.DataFrame:
+ """
+ Invokes the Bedrock model to generate user names for a specified country.
+
+ This function calls the Bedrock model with a formatted prompt to generate first names
+ and last names for a given country. It processes the model's response, parses the JSON
+ output, and merges the results with country data. The function deduplicates and standardizes
+ the name formatting, then persists the data to temporary CSV files.
+
+ Parameters
+ ----------
+ model : Bedrock
+ The Bedrock model instance used to generate names.
+ n_data_points : int
+ The number of data points to generate
+ country : str
+ The country for which to generate names.
+ countrieseurope : pd.DataFrame
+ A DataFrame containing country information for merging.
+
+ Returns
+ -------
+ tuple:
+ A tuple containing two pandas DataFrames:
+ - tmp_first_name_country_data (pd.DataFrame): DataFrame with deduplicated and standardized first names along with country information.
+ - tmp_last_name_country_data (pd.DataFrame): DataFrame with deduplicated and standardized last names along with country information.
+
+ Raises
+ ------
+ json.JSONDecodeError: If the model response cannot be parsed as JSON.
+ KeyError: If the expected keys ("first_names", "last_names") are missing from the JSON response.
+ Exception: If the merge with country data fails or file I/O operations encounter errors.
+
+ Notes
+ -----
+ - Names are standardized by converting to lowercase, removing extra whitespace, and applying Unicode normalization using unidecode.
+ - Duplicate names are removed after each processing step.
+ - Results are concatenated with any previously generated data for the same country and saved to temporary CSV files if the new data increases the dataset size.
+ - CSV files are encoded in latin1 format.
+
+ """
+ logging.info("Calling Bedrock ...")
+ # call bedrock model
+ formatted_prompt = prompt.format(n_data_points=n_data_points, country=country)
+ messages = [{"role":"user", "content":[{"text":formatted_prompt}]}]
+ logging.info(messages)
+    model_response = model.converse(modelId=model_id, messages=messages, system=[{"text":system_prompt}], inference_config=bedrock_config['inferenceConfig'])
+    # extract the generated text from the converse response payload
+    response_text = model_response["output"]["message"]["content"][0]["text"]
+    # split out the answer between the response tags (assumed here to be <answer></answer>)
+    text = response_text.split("<answer>")[1].split("</answer>")[0]
+ # parse json
+ try:
+ gen_data_list = json.loads(text)
+ except json.JSONDecodeError as e:
+ raise Exception(f"Error parsing JSON: {e}")
+ logging.info("Processing results ...")
+ # generate pandas dataframe
+ gen_dataframe = pd.Series(gen_data_list, name=data_point).drop_duplicates().to_frame()
+ gen_dataframe['country'] = country
+ gen_country_dataframe = pd.merge(
+ left=gen_dataframe,
+ right=countrieseurope.rename(columns={'name':'country'}),
+        on='country',
+ how='inner'
+ )
+ # standardise names formatting
+    standardise_text_lambda = lambda x: unidecode.unidecode(" ".join(x.strip().split())) if pd.notna(x) else x
+ gen_country_dataframe[data_point] = gen_country_dataframe[data_point].apply(lambda x: standardise_text_lambda(x))
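+    # e.g. "  Léa " -> "Lea" after whitespace collapsing and unidecode normalisation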
+ logging.info(f"gen_country_dataframe.shape: {gen_country_dataframe.shape}")
+ # save generated data
+ gen_country_dataframe.to_csv(country_fpath, index=False, encoding="latin1")
+ logging.info(f"Wrote {country_fpath} ...")
+ return gen_country_dataframe
+
+def main(bedrock, model_id, data_point, fpath_dict, run_bedrock=False):
+ """
+ Docstring for main
+ """
+    # load countries reference file; population is needed for the data point count heuristic
+    countrieseurope = pd.read_csv(cons.fpath_countries_europe, usecols=['name', 'ISO numeric', 'population'])
+ n_countries = countrieseurope.shape[0]
+ # set lists to collect generated data with
+ gen_country_dataframe_list, error_countries = [], []
+ # set countries list
+ #countries_list = countrieseurope['name'].to_list()
+ countries_list = ['Cyprus']
+ # iterate over countries list
+ for country in countries_list:
+ logging.info(f"{country} ...")
+        country_fpath = fpath_dict['country_fpath'].format(country=country.lower())
+ try:
+ if run_bedrock:
+ # call bedrock model and generate user names data
+ country_filter = (countrieseurope["name"] == country)
+ country_population = countrieseurope.loc[country_filter, "population"].iloc[0]
+ # set n data points for ai generator depending on type
+ if data_point in ("first_names", "last_names"):
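+                    # scale name counts sub-linearly with population, e.g. log(1.2e6) ~= 14.0 so ~52 names are requested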
+ n_data_points = int(np.log(country_population)**1.5)
+ elif data_point == "email_domains":
+ n_data_points = 5
+ else:
+ raise ValueError(f"Invalid parameter data_point value {data_point}")
+ # invoke bedrock and generate data points
+ tmp_gen_country_data = invoke_bedrock(
+ model=bedrock,
+ model_id=model_id,
+ data_point=data_point,
+ n_data_points=n_data_points,
+ country=country,
+                    countrieseurope=countrieseurope,
+                    prompt=prompt_dict[data_point],
+                    system_prompt=system_name_prompt if data_point in ("first_names", "last_names") else system_email_prompt,
+                    country_fpath=country_fpath
+ )
+ logging.info("Waiting ...")
+                # wait 20 seconds between requests to avoid throttling
+ time.sleep(20)
+ else:
+ tmp_gen_country_data = pd.read_csv(country_fpath, encoding="latin1")
+ # append to user country data
+ gen_country_dataframe_list.append(tmp_gen_country_data)
+ except Exception as e:
+ logging.info(e)
+ error_countries.append(country)
+ # log if any countries failed to generate data
+ if len(error_countries) > 0:
+ logging.info(f"Failed to generated data for countries: {error_countries}")
+    # concatenate country data together and deduplicate across data points and countries
+ output_gen_country_dataframe = pd.concat(gen_country_dataframe_list, axis=0, ignore_index=True)
+ # sort and deduplicate output data
+ sort_dedup_cols = ["country",data_point]
+ output_gen_country_dataframe = output_gen_country_dataframe.drop_duplicates(subset=sort_dedup_cols).sort_values(by=sort_dedup_cols)
+ # write data to disk
+ if output_gen_country_dataframe['country'].nunique() == n_countries:
+ logging.info(f"output_gen_country_dataframe.shape: {output_gen_country_dataframe.shape}")
+ output_gen_country_dataframe.to_csv(fpath_dict["fpath"], index=False, encoding="latin1")
+ else:
+ logging.info("WARNING Insufficient first name data generated.")
+
+lgr = logging.getLogger()
+lgr.setLevel(logging.INFO)
+
+if __name__ == "__main__":
+ # set aws region
+ aws_region = "us-east-1"
+ model_id="us.meta.llama3-1-70b-instruct-v1:0"
+ # load aws config
+ with open(cons.fpath_aws_session_token, "r") as j:
+ aws_config = json.loads(j.read())
+ # connect to aws boto3
+ session = boto3.Session(
+ aws_access_key_id=aws_config['Credentials']["AccessKeyId"],
+ aws_secret_access_key=aws_config['Credentials']["SecretAccessKey"],
+ aws_session_token=aws_config['Credentials']["SessionToken"],
+ region_name=aws_region
+ )
+ bedrock_runtime = session.client(
+ service_name="bedrock-runtime",
+ region_name=aws_region,
+ config=Config(retries={"max_attempts":1, "mode": "adaptive"})
+ )
+ # create bedrock instance
+ bedrock = Bedrock(bedrock_runtime=bedrock_runtime)
+ # execute main programme
+ for data_point, fpath_dict in cons.llama_data_point_fpaths.items():
+ main(bedrock=bedrock, model_id=model_id, data_point=data_point, fpath_dict=fpath_dict, run_bedrock=True)
+
diff --git a/generator/cons.py b/generator/cons.py
index 6fdb599..ad51b95 100644
--- a/generator/cons.py
+++ b/generator/cons.py
@@ -15,19 +15,27 @@
fpath_randomtelecomtransdata = os.path.join(subdir_data,'RandomTelecomPayments.csv')
fpath_randomtelecomusersdata = os.path.join(subdir_data,'RandomTelecomUsers.parquet')
fpath_arch_randomtelecomdata = os.path.join(subdir_data, 'arch', 'RandomTelecomPayments.csv')
-fpath_temp_llama_firstnames = os.path.join(subdir_data, 'temp', 'llama_firstnames_{country}.csv')
-fpath_temp_llama_lastnames = os.path.join(subdir_data, 'temp', 'llama_lastnames_{country}.csv')
+fpath_temp_llama_first_names = os.path.join(subdir_data, 'temp', 'llama_first_names_{country}.csv')
+fpath_temp_llama_last_names = os.path.join(subdir_data, 'temp', 'llama_last_names_{country}.csv')
+fpath_temp_llama_email_domains = os.path.join(subdir_data, 'temp', 'llama_email_domains_{country}.csv')
fpath_email_domain = os.path.join(subdir_data, 'ref', 'email-domains.csv')
fpath_countrycrimeindex = os.path.join(subdir_data, 'ref', 'country_crime_index.csv')
fpath_countries_europe = os.path.join(subdir_data, 'ref', 'Countries-Europe.csv')
-fpath_firstnames = os.path.join(subdir_data, 'ref', 'first-names.txt')
-fpath_lastnames = os.path.join(subdir_data, 'ref', 'last-names.txt')
-fpath_llama_firstnames = os.path.join(subdir_data, 'ref', 'llama_firstnames.csv')
-fpath_llama_lastnames = os.path.join(subdir_data, 'ref', 'llama_lastnames.csv')
+fpath_first_names = os.path.join(subdir_data, 'ref', 'first-names.txt')
+fpath_last_names = os.path.join(subdir_data, 'ref', 'last-names.txt')
+fpath_llama_first_names = os.path.join(subdir_data, 'ref', 'llama_first_names.csv')
+fpath_llama_last_names = os.path.join(subdir_data, 'ref', 'llama_last_names.csv')
+fpath_llama_email_domains = os.path.join(subdir_data, 'ref', 'llama_email_domains.csv')
fpath_smartphones = os.path.join(subdir_data, 'ref', 'smartphones.csv')
fpath_unittest_user_data = os.path.join(subdir_unittest, 'user_data.parquet')
fpath_unittest_transaction_data = os.path.join(subdir_unittest, 'transaction_data.parquet')
fpath_aws_session_token = os.path.join(subdir_creds,'sessionToken.json')
+# set data points generated by llama
+llama_data_point_fpaths = {
+ "first_names":{"fpath":fpath_llama_first_names, "country_fpath":fpath_temp_llama_first_names},
+ "last_names":{"fpath":fpath_llama_last_names, "country_fpath":fpath_temp_llama_last_names},
+ "email_domain":{"fpath":fpath_llama_email_domains, "country_fpath":fpath_temp_llama_email_domains}
+ }
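+# e.g. llama_data_point_fpaths["first_names"]["country_fpath"].format(country="cyprus") resolves to the temp csv for Cyprus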
# set url links to files available online
url_european_populations = 'https://raw.githubusercontent.com/ajturner/acetate/master/places/Countries-Europe.csv'
@@ -75,12 +83,12 @@
data_model_entity_user_ratios = {'card':1.3, 'device':2.5, 'transaction':5.3, 'ip':4.3}
data_model_poisson_params = {'user':{'lambda':20, 'power':1}, 'device':{'lambda':0.2, 'power':2}, 'card':{'lambda':0.1, 'power':2}, 'ip':{'lambda':1.3, 'power':2}, 'application':{'lambda':1, 'power':2}, 'transaction':{'lambda':5, 'power':2}}
data_model_shared_entities_dict = {'ip':0.05, 'card':0.005, 'device':0.01}
-data_model_null_rates = {'card':0.05}
-data_model_card_types_dict = {'visa':0.5, 'mastercard':0.5}
-data_model_payment_channels = {'paypal':0.4, 'adyen':0.15, 'appstore':0.25, 'worldpay':0.15, 'docomo':0.05}
-data_model_transaction_status = {'successful':0.94, 'pending':0.03, 'rejected':0.03}
+data_model_null_rates = {'card':0.1}
+data_model_card_types_dict = {'Visa':0.5, 'Mastercard':0.5}
+data_model_payment_channels = {'PayPal':0.4, 'Adyen':0.15, 'AppStore':0.25, 'WorldPay':0.15, 'Docomo':0.05}
+data_model_transaction_status = {'Successful':0.94, 'Pending':0.03, 'Rejected':0.03}
data_model_inconsistent_country_codes_rejection_rate = {1:0.001, 2:0.005, 3:0.01}
-data_model_non_card_trans_methods = {'wallet':0.95, 'points':0.05}
+data_model_non_card_trans_methods = {'Wallet':0.85, 'Points':0.15}
data_model_rejection_codes_fraud = {'E900:ConnectionTimeout':0.1, 'E901:SuspectedFraud':0.55, 'E902:AuthenicationFailure':0.2, 'E903:UserCancelled':0.05, 'E904:InsufficientFunds':0.1}
data_model_rejection_codes_connection = {'E900:ConnectionTimeout':0.45, 'E901:SuspectedFraud':0.1, 'E902:AuthenicationFailure':0.2, 'E903:UserCancelled':0.15, 'E904:InsufficientFunds':0.1}
data_model_rejection_codes_user = {'E900:ConnectionTimeout':0.05, 'E901:SuspectedFraud':0.1, 'E902:AuthenicationFailure':0.1, 'E903:UserCancelled':0.45, 'E904:InsufficientFunds':0.3}
@@ -88,4 +96,11 @@
data_model_rejection_codes_authentication = {'E900:ConnectionTimeout':0.25, 'E901:SuspectedFraud':0.05, 'E902:AuthenicationFailure':0.45, 'E903:UserCancelled':0.15, 'E904:InsufficientFunds':0.1}
# set lists of generator object types
-object_types = ["device","card","ip","transaction","application"]
\ No newline at end of file
+object_types = ["device","card","ip","transaction","application"]
+user_cols = ['userid', 'first_name', 'last_name', 'registration_date', 'registration_country_code', 'uid', 'email_domain']
+device_cols = ['device_hash', 'device_type']
+card_cols = ['card_hash', 'card_type', 'card_country_code']
+ip_cols = ['ip_hash', 'ip_country_code']
+app_cols = ['application_hash']
+trans_cols = ['transaction_hash', 'transaction_date', 'transaction_amount', 'transaction_payment_method', 'card_payment_channel', 'transaction_status', 'transaction_error_code']
+itr_cols = ['itr_hash']
\ No newline at end of file
diff --git a/generator/objects/User.py b/generator/objects/User.py
index 475d79a..039775e 100644
--- a/generator/objects/User.py
+++ b/generator/objects/User.py
@@ -17,10 +17,11 @@ def __init__(
n_user_ids:int,
start_date:str,
end_date:str,
- fpath_firstnames:str=cons.fpath_firstnames,
- fpath_lastnames:str=cons.fpath_lastnames,
+ fpath_first_names:str=cons.fpath_llama_first_names,
+ fpath_last_names:str=cons.fpath_llama_last_names,
fpath_countries_europe:str=cons.fpath_countries_europe,
- fpath_email_domain :str=cons.fpath_email_domain ,
+ fpath_email_domain:str=cons.fpath_email_domain,
+ fpath_bedrock_email_domain:str=cons.fpath_llama_email_domains,
):
"""
The randomly generated user data model object
@@ -33,14 +34,14 @@ def __init__(
The start date to generate users from
end_date : str
The end date to generate users till
- fpath_firstnames : str
- The full file path to the first names reference data, default is cons.fpath_firstnames.
- fpath_lastnames : str
- The full file path to the last names reference data, default is cons.fpath_lastnames.
+ fpath_first_names : str
+ The full file path to the first names reference data, default is cons.fpath_llama_first_names.
+ fpath_last_names : str
+ The full file path to the last names reference data, default is cons.fpath_llama_last_names.
fpath_countries_europe : str
The full file path to the europe countries reference data, default is cons.fpath_countries_europe.
fpath_email_domain : str
- The full file path to the email domain reference data, default is cons.fpath_email_domain .
+            The full file path to the email domain reference data, default is cons.fpath_email_domain.
+        fpath_bedrock_email_domain : str
+            The full file path to the bedrock generated email domain reference data, default is cons.fpath_llama_email_domains.
Attributes
----------
@@ -58,9 +59,9 @@ def __init__(
The user id counts dictionary
user_ids_props_dict : Dict[str, float]
The user id proportions dictionary
- user_ids_firstname_dict : Dict[str, str]
+ user_ids_first_name_dict : Dict[str, str]
The user id first names dictionary
- user_ids_lastname_dict : Dict[str, str]
+ user_ids_last_name_dict : Dict[str, str]
The user id last names dictionary
user_ids_country_code_dict : Dict[str, str]
The user id country codes dictionary
@@ -72,85 +73,60 @@ def __init__(
self.n_user_ids = n_user_ids
self.start_date = start_date
self.end_date = end_date
- self.fpath_firstnames = fpath_firstnames
- self.fpath_lastnames = fpath_lastnames
+ self.fpath_first_names = fpath_first_names
+ self.fpath_last_names = fpath_last_names
self.fpath_countries_europe = fpath_countries_europe
- self.fpath_email_domain = fpath_email_domain
+ self.fpath_email_domain = fpath_email_domain
+ self.fpath_bedrock_email_domain = fpath_bedrock_email_domain
self.lam = cons.data_model_poisson_params["user"]["lambda"]
self.power = cons.data_model_poisson_params["user"]["power"]
self.user_ids_cnts_dict = gen_idhash_cnt_dict(idhash_type="id", n=self.n_user_ids, lam=self.lam, power=self.power)
self.user_ids = list(self.user_ids_cnts_dict.keys())
self.user_ids_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.user_ids_cnts_dict)
self.user_ids_country_code_dict = gen_country_codes_dict(idhashes=self.user_ids, fpath_countries_europe=self.fpath_countries_europe)
- self.user_ids_firstname_dict = self.gen_user_firstname(fpath_firstnames=self.fpath_firstnames)
- self.user_ids_lastname_dict = self.gen_user_lastname(fpath_lastnames=self.fpath_lastnames)
- self.user_ids_email_domain_dict = self.gen_user_email_domain(fpath_email_domain=self.fpath_email_domain)
+ self.user_ids_first_name_dict = self.gen_user_bedrock_name_data(fpath_bedrock_data=self.fpath_first_names, sample_column_name="first_names")
+ self.user_ids_last_name_dict = self.gen_user_bedrock_name_data(fpath_bedrock_data=self.fpath_last_names, sample_column_name="last_names")
+ self.user_ids_email_domain_dict = self.gen_user_bedrock_email_domain(fpath_email_domain=self.fpath_email_domain, fpath_bedrock_email_domain=self.fpath_bedrock_email_domain)
self.user_ids_dates_dict = gen_dates_dict(idhashes=self.user_ids, start_date=self.start_date, end_date=self.end_date)
@beartype
- def gen_user_firstname(
+ def gen_user_bedrock_name_data(
self,
- fpath_firstnames:str,
+ fpath_bedrock_data:str,
+ sample_column_name:str,
) -> Dict[str, str]:
"""
- Generates a dictionary of random user id first names
+ Generates a dictionary of random user bedrock data, e.g. first_names or last_names
Parameters
----------
- fpath_firstnames : str
- The file path to the first names reference file
+ fpath_bedrock_data : str
+ The file path to the bedrock data reference file
+ sample_column_name : str
+ The column name to sample from in the bedrock data reference file
Returns
-------
Dict[str, str]
- A dictionary of user id first names
+ A dictionary of user id bedrock data
"""
-        # load in list of first names
-        first_name_data = pd.read_csv(fpath_firstnames)
-        # randomly sample names firstnames according to country code and counts
+        # load in the bedrock generated reference data
+        bedrock_data = pd.read_csv(fpath_bedrock_data)
+        # randomly sample data points according to country code and counts
country_code_dataframe = pd.Series(self.user_ids_country_code_dict, name="country_code").to_frame().reset_index().rename(columns={"index":"user_ids"}).assign(count=1)
country_codes_cnt = country_code_dataframe.groupby(by="country_code").agg({"user_ids":list,"count":"sum"}).reset_index()
- country_codes_cnt["names"] = country_codes_cnt.apply(lambda series: first_name_data.loc[(first_name_data["ISO numeric"] == series["country_code"]), "firstnames"].sample(n=series["count"], replace=True).to_list(), axis=1)
- # create the key value pairs mapping user id to firstname
- user_ids_names_pairs = country_codes_cnt.apply(lambda series: dict(zip(series["user_ids"], series["names"])), axis=1).to_list()
+ country_codes_cnt["sample"] = country_codes_cnt.apply(lambda series: bedrock_data.loc[(bedrock_data["ISO numeric"] == series["country_code"]), sample_column_name].sample(n=series["count"], replace=True, weights=None).to_list(), axis=1)
+ # create the key value pairs mapping user id to bedrock data points
+ user_ids_bedrock_pairs = country_codes_cnt.apply(lambda series: dict(zip(series["user_ids"], series["sample"])), axis=1).to_list()
# convert key value pairs to dict
- user_ids_firstname_dict = pd.concat([pd.Series(d) for d in user_ids_names_pairs])[country_code_dataframe["user_ids"]].to_dict()
- return user_ids_firstname_dict
+ user_ids_bedrock_dict = pd.concat([pd.Series(d) for d in user_ids_bedrock_pairs])[country_code_dataframe["user_ids"]].to_dict()
+ return user_ids_bedrock_dict
@beartype
- def gen_user_lastname(
- self,
- fpath_lastnames:str,
- ) -> Dict[str, str]:
- """
- Generates a dictionary of random user id last names.
-
- Parameters
- ----------
- fpath_lastnames : str
- The file path to the last names reference file.
-
- Returns
- -------
- Dict[str, str]
- A dictionary of user id last names.
- """
- # load in list of last names
- last_name_data = pd.read_csv(fpath_lastnames)
- # randomly sample names firstnames according to country code and counts
- country_code_dataframe = pd.Series(self.user_ids_country_code_dict, name="country_code").to_frame().reset_index().rename(columns={"index":"user_ids"}).assign(count=1)
- country_codes_cnt = country_code_dataframe.groupby(by="country_code").agg({"user_ids":list,"count":"sum"}).reset_index()
- country_codes_cnt["names"] = country_codes_cnt.apply(lambda series: last_name_data.loc[(last_name_data["ISO numeric"] == series["country_code"]), "lastnames"].sample(n=series["count"], replace=True).to_list(), axis=1)
- # create the key value pairs mapping user id to firstname
- user_ids_names_pairs = country_codes_cnt.apply(lambda series: dict(zip(series["user_ids"], series["names"])), axis=1).to_list()
- # convert key value pairs to dict
- user_ids_lastname_dict = pd.concat([pd.Series(d) for d in user_ids_names_pairs])[country_code_dataframe["user_ids"]].to_dict()
- return user_ids_lastname_dict
-
- @beartype
- def gen_user_email_domain(
+ def gen_user_bedrock_email_domain(
self,
fpath_email_domain:str,
+ fpath_bedrock_email_domain:str,
) -> Dict[str, str]:
"""
Generates a dictionary of random user id email domains
@@ -182,4 +158,4 @@ def gen_user_email_domain(
)
# return the user ids email domains
user_ids_email_domain_dict = dict(zip(self.user_ids, user_email_domain_list))
- return user_ids_email_domain_dict
+ return user_ids_email_domain_dict
\ No newline at end of file
diff --git a/generator/unittests/app/test_gen_user_trans_data.py b/generator/unittests/app/test_gen_user_trans_data.py
index 540bb1c..4c4be48 100644
--- a/generator/unittests/app/test_gen_user_trans_data.py
+++ b/generator/unittests/app/test_gen_user_trans_data.py
@@ -34,10 +34,11 @@
np.random.seed(seed=programmeparams.random_seed)
# create relative file paths
-fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1]
-fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1]
+fpath_first_names = '.' + cons.fpath_llama_first_names.split(cons.fpath_repo_dir)[1]
+fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1]
fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1]
fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1]
+fpath_bedrock_email_domain = '.' + cons.fpath_llama_email_domains.split(cons.fpath_repo_dir)[1]
fpath_smartphones = '.' + cons.fpath_smartphones.split(cons.fpath_repo_dir)[1]
fpath_countrycrimeindex = '.' + cons.fpath_countrycrimeindex.split(cons.fpath_repo_dir)[1]
fpath_unittest_user_data = '.' + cons.fpath_unittest_user_data.split(cons.fpath_repo_dir)[1]
@@ -48,10 +49,11 @@
n_user_ids=programmeparams.n_users,
start_date=programmeparams.registration_start_date,
end_date=programmeparams.registration_end_date,
- fpath_firstnames=fpath_firstnames,
- fpath_lastnames=fpath_lastnames,
+ fpath_first_names=fpath_first_names,
+ fpath_last_names=fpath_last_names,
fpath_countries_europe=fpath_countries_europe,
- fpath_email_domain=fpath_email_domain
+ fpath_email_domain=fpath_email_domain,
+ fpath_bedrock_email_domain=fpath_bedrock_email_domain,
)
# generate random entity counts for each user
diff --git a/generator/unittests/objects/test_Application.py b/generator/unittests/objects/test_Application.py
index 58435b5..9403daa 100644
--- a/generator/unittests/objects/test_Application.py
+++ b/generator/unittests/objects/test_Application.py
@@ -28,10 +28,10 @@
"dded2b63f8242648": 0.2727272727272727,
}
exp_application_hashes_payment_channel_dict = {
- "63cea7c46926aa74": "adyen",
- "37725417bd51fb40": "adyen",
- "b95cb80aae9fbbfe": "paypal",
- "dded2b63f8242648": "docomo",
+ "63cea7c46926aa74": "Adyen",
+ "37725417bd51fb40": "Adyen",
+ "b95cb80aae9fbbfe": "PayPal",
+ "dded2b63f8242648": "Docomo",
}
exp_n_application_hashes = cons.unittest_n_entities
exp_lam = cons.data_model_poisson_params["application"]["lambda"]
diff --git a/generator/unittests/objects/test_Card.py b/generator/unittests/objects/test_Card.py
index 688455f..32f0358 100644
--- a/generator/unittests/objects/test_Card.py
+++ b/generator/unittests/objects/test_Card.py
@@ -16,10 +16,10 @@
"dded2b63f8242648": 1,
}
exp_card_hashes_type_dict = {
- "63cea7c46926aa74": "visa",
- "37725417bd51fb40": "mastercard",
- "b95cb80aae9fbbfe": "visa",
- "dded2b63f8242648": "mastercard",
+ "63cea7c46926aa74": "Visa",
+ "37725417bd51fb40": "Mastercard",
+ "b95cb80aae9fbbfe": "Visa",
+ "dded2b63f8242648": "Mastercard",
}
exp_card_hashes_props_dict = {
"63cea7c46926aa74": 0.16666666666666666,
diff --git a/generator/unittests/objects/test_Transaction.py b/generator/unittests/objects/test_Transaction.py
index 05531ba..0c338da 100644
--- a/generator/unittests/objects/test_Transaction.py
+++ b/generator/unittests/objects/test_Transaction.py
@@ -22,10 +22,10 @@
"dded2b63f8242648": 0.3793103448275862,
}
exp_transaction_hashes_status_dict = {
- "63cea7c46926aa74": "successful",
- "37725417bd51fb40": "successful",
- "b95cb80aae9fbbfe": "successful",
- "dded2b63f8242648": "successful",
+ "63cea7c46926aa74": "Successful",
+ "37725417bd51fb40": "Successful",
+ "b95cb80aae9fbbfe": "Successful",
+ "dded2b63f8242648": "Successful",
}
exp_transaction_hashes_amounts_dict = {
"63cea7c46926aa74": 2.99,
diff --git a/generator/unittests/objects/test_User.py b/generator/unittests/objects/test_User.py
index 92a30bd..16280c8 100644
--- a/generator/unittests/objects/test_User.py
+++ b/generator/unittests/objects/test_User.py
@@ -21,13 +21,13 @@
"4264861381989413": 0.20212765957446807,
"6720317315593519": 0.2765957446808511,
}
-exp_user_ids_firstname_dict = {
+exp_user_ids_first_name_dict = {
"6374692674377254": "simone",
"1751409580926382": "francesca",
"4264861381989413": "igor",
"6720317315593519": "beckett",
}
-exp_user_ids_lastname_dict = {
+exp_user_ids_last_name_dict = {
"6374692674377254": "de filippo",
"1751409580926382": "gagliardi",
"4264861381989413": "lupu",
@@ -59,16 +59,27 @@
random.seed(cons.unittest_seed)
np.random.seed(cons.unittest_seed)
-fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1]
-fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1]
+fpath_first_names = '.' + cons.fpath_llama_first_names.split(cons.fpath_repo_dir)[1]
+fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1]
fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1]
fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1]
-user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain)
+fpath_bedrock_email_domain = '.' + cons.fpath_llama_email_domains.split(cons.fpath_repo_dir)[1]
+
+user_object = User(
+ n_user_ids=exp_n_user_ids,
+ start_date=exp_start_date,
+ end_date=exp_end_date,
+ fpath_first_names=fpath_first_names,
+ fpath_last_names=fpath_last_names,
+ fpath_countries_europe=fpath_countries_europe,
+ fpath_email_domain=fpath_email_domain,
+ fpath_bedrock_email_domain=fpath_bedrock_email_domain
+ )
obs_user_ids_cnts_dict = user_object.user_ids_cnts_dict
obs_user_ids_props_dict = user_object.user_ids_props_dict
-obs_user_ids_firstname_dict = user_object.user_ids_firstname_dict
-obs_user_ids_lastname_dict = user_object.user_ids_lastname_dict
+obs_user_ids_first_name_dict = user_object.user_ids_first_name_dict
+obs_user_ids_last_name_dict = user_object.user_ids_last_name_dict
obs_user_ids_country_code_dict = user_object.user_ids_country_code_dict
obs_user_ids_email_domain_dict = user_object.user_ids_email_domain_dict
obs_user_ids_dates_dict = user_object.user_ids_dates_dict
@@ -86,10 +97,10 @@ def setUp(self):
self.obs_user_ids_cnts_dict = obs_user_ids_cnts_dict
self.exp_user_ids_props_dict = exp_user_ids_props_dict
self.obs_user_ids_props_dict = obs_user_ids_props_dict
- self.exp_user_ids_firstname_dict = exp_user_ids_firstname_dict
- self.obs_user_ids_firstname_dict = obs_user_ids_firstname_dict
- self.exp_user_ids_lastname_dict = exp_user_ids_lastname_dict
- self.obs_user_ids_lastname_dict = obs_user_ids_lastname_dict
+ self.exp_user_ids_first_name_dict = exp_user_ids_first_name_dict
+ self.obs_user_ids_first_name_dict = obs_user_ids_first_name_dict
+ self.exp_user_ids_last_name_dict = exp_user_ids_last_name_dict
+ self.obs_user_ids_last_name_dict = obs_user_ids_last_name_dict
self.exp_user_ids_country_code_dict = exp_user_ids_country_code_dict
self.obs_user_ids_country_code_dict = obs_user_ids_country_code_dict
self.exp_user_ids_email_domain_dict = exp_user_ids_email_domain_dict
@@ -108,8 +119,8 @@ def setUp(self):
def test_type(self):
self.assertEqual(type(self.obs_user_ids_cnts_dict), type(self.exp_user_ids_cnts_dict))
self.assertEqual(type(self.obs_user_ids_props_dict), type(self.exp_user_ids_props_dict))
- self.assertEqual(type(self.obs_user_ids_firstname_dict),type(self.exp_user_ids_firstname_dict),)
- self.assertEqual(type(self.obs_user_ids_lastname_dict), type(self.exp_user_ids_lastname_dict))
+ self.assertEqual(type(self.obs_user_ids_first_name_dict),type(self.exp_user_ids_first_name_dict),)
+ self.assertEqual(type(self.obs_user_ids_last_name_dict), type(self.exp_user_ids_last_name_dict))
self.assertEqual(type(self.obs_user_ids_country_code_dict),type(self.exp_user_ids_country_code_dict),)
self.assertEqual(type(self.obs_user_ids_email_domain_dict),type(self.exp_user_ids_email_domain_dict),)
self.assertEqual(type(self.obs_user_ids_dates_dict), type(self.exp_user_ids_dates_dict))
@@ -121,8 +132,8 @@ def test_type(self):
def test_len(self):
self.assertEqual(len(self.obs_user_ids_cnts_dict), len(self.exp_user_ids_cnts_dict))
self.assertEqual(len(self.obs_user_ids_props_dict), len(self.exp_user_ids_props_dict))
- self.assertEqual(len(self.obs_user_ids_firstname_dict), len(self.exp_user_ids_firstname_dict))
- self.assertEqual(len(self.obs_user_ids_lastname_dict), len(self.exp_user_ids_lastname_dict))
+ self.assertEqual(len(self.obs_user_ids_first_name_dict), len(self.exp_user_ids_first_name_dict))
+ self.assertEqual(len(self.obs_user_ids_last_name_dict), len(self.exp_user_ids_last_name_dict))
self.assertEqual(len(self.obs_user_ids_country_code_dict),len(self.exp_user_ids_country_code_dict),)
self.assertEqual(len(self.obs_user_ids_email_domain_dict),len(self.exp_user_ids_email_domain_dict),)
self.assertEqual(len(self.obs_user_ids_dates_dict), len(self.exp_user_ids_dates_dict))
@@ -130,8 +141,8 @@ def test_len(self):
def test_keys(self):
self.assertEqual(list(self.obs_user_ids_cnts_dict.keys()),list(self.exp_user_ids_cnts_dict.keys()),)
self.assertEqual(list(self.obs_user_ids_props_dict.keys()),list(self.exp_user_ids_props_dict.keys()),)
- self.assertEqual(list(self.obs_user_ids_firstname_dict.keys()),list(self.exp_user_ids_firstname_dict.keys()),)
- self.assertEqual(list(self.obs_user_ids_lastname_dict.keys()),list(self.exp_user_ids_lastname_dict.keys()),)
+ self.assertEqual(list(self.obs_user_ids_first_name_dict.keys()),list(self.exp_user_ids_first_name_dict.keys()),)
+ self.assertEqual(list(self.obs_user_ids_last_name_dict.keys()),list(self.exp_user_ids_last_name_dict.keys()),)
self.assertEqual(list(self.obs_user_ids_country_code_dict.keys()),list(self.exp_user_ids_country_code_dict.keys()),)
self.assertEqual(list(self.obs_user_ids_email_domain_dict.keys()),list(self.exp_user_ids_email_domain_dict.keys()),)
self.assertEqual(list(self.obs_user_ids_dates_dict.keys()),list(self.exp_user_ids_dates_dict.keys()),)
@@ -139,8 +150,8 @@ def test_keys(self):
def test_values(self):
self.assertEqual(list(self.obs_user_ids_cnts_dict.values()),list(self.exp_user_ids_cnts_dict.values()),)
self.assertEqual(list(self.obs_user_ids_props_dict.values()),list(self.exp_user_ids_props_dict.values()),)
- self.assertEqual(list(self.obs_user_ids_firstname_dict.values()),list(self.exp_user_ids_firstname_dict.values()),)
- self.assertEqual(list(self.obs_user_ids_lastname_dict.values()),list(self.exp_user_ids_lastname_dict.values()),)
+ self.assertEqual(list(self.obs_user_ids_first_name_dict.values()),list(self.exp_user_ids_first_name_dict.values()),)
+ self.assertEqual(list(self.obs_user_ids_last_name_dict.values()),list(self.exp_user_ids_last_name_dict.values()),)
self.assertEqual(list(self.obs_user_ids_country_code_dict.values()),list(self.exp_user_ids_country_code_dict.values()),)
self.assertEqual(list(self.obs_user_ids_email_domain_dict.values()),list(self.exp_user_ids_email_domain_dict.values()),)
self.assertEqual(list(self.obs_user_ids_dates_dict.values()),list(self.exp_user_ids_dates_dict.values()),)
@@ -148,8 +159,8 @@ def test_values(self):
def test_object(self):
self.assertEqual(self.obs_user_ids_cnts_dict, self.exp_user_ids_cnts_dict)
self.assertEqual(self.obs_user_ids_props_dict, self.exp_user_ids_props_dict)
- self.assertEqual(self.obs_user_ids_firstname_dict, self.exp_user_ids_firstname_dict)
- self.assertEqual(self.obs_user_ids_lastname_dict, self.exp_user_ids_lastname_dict)
+ self.assertEqual(self.obs_user_ids_first_name_dict, self.exp_user_ids_first_name_dict)
+ self.assertEqual(self.obs_user_ids_last_name_dict, self.exp_user_ids_last_name_dict)
self.assertEqual(self.obs_user_ids_country_code_dict, self.exp_user_ids_country_code_dict)
self.assertEqual(self.obs_user_ids_email_domain_dict, self.exp_user_ids_email_domain_dict)
self.assertEqual(self.obs_user_ids_dates_dict, self.exp_user_ids_dates_dict)
diff --git a/generator/unittests/utilities/test_gen_obj_idhash_series.py b/generator/unittests/utilities/test_gen_obj_idhash_series.py
index 18faa86..7e3eabd 100644
--- a/generator/unittests/utilities/test_gen_obj_idhash_series.py
+++ b/generator/unittests/utilities/test_gen_obj_idhash_series.py
@@ -20,8 +20,8 @@
start_date = cons.unittest_registration_start_date
end_date = cons.unittest_registration_end_date
n_user_ids = cons.unittest_n_entities
-fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1]
-fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1]
+fpath_first_names = '.' + cons.fpath_llama_first_names.split(cons.fpath_repo_dir)[1]
+fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1]
fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1]
fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1]
fpath_smartphones = '.' + cons.fpath_smartphones.split(cons.fpath_repo_dir)[1]
@@ -30,7 +30,7 @@
np.random.seed(cons.unittest_seed)
# create user object
-user_object = User(n_user_ids=n_user_ids, start_date=start_date, end_date=end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain)
+user_object = User(n_user_ids=n_user_ids, start_date=start_date, end_date=end_date, fpath_first_names=fpath_first_names, fpath_last_names=fpath_last_names, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain)
# generate random entity counts
random_entity_counts = gen_random_entity_counts(user_obj=user_object)
# generate random entity values
diff --git a/generator/unittests/utilities/test_gen_random_entity_counts.py b/generator/unittests/utilities/test_gen_random_entity_counts.py
index 58a5522..45c8d27 100644
--- a/generator/unittests/utilities/test_gen_random_entity_counts.py
+++ b/generator/unittests/utilities/test_gen_random_entity_counts.py
@@ -19,11 +19,11 @@
random.seed(cons.unittest_seed)
np.random.seed(cons.unittest_seed)
-fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1]
-fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1]
+fpath_first_names = '.' + cons.fpath_llama_first_names.split(cons.fpath_repo_dir)[1]
+fpath_last_names = '.' + cons.fpath_llama_last_names.split(cons.fpath_repo_dir)[1]
fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1]
fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1]
-user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain)
+user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_first_names=fpath_first_names, fpath_last_names=fpath_last_names, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain)
exp_randomentity_counts_dict = {
'uid': ['6374692674377254', '6720317315593519', '4264861381989413', '1751409580926382'],
diff --git a/generator/utilities/Bedrock.py b/generator/utilities/Bedrock.py
index 9ea42d6..1e2ea0a 100644
--- a/generator/utilities/Bedrock.py
+++ b/generator/utilities/Bedrock.py
@@ -1,5 +1,5 @@
import json
-import boto3
+from typing import Dict, List, Optional
from beartype import beartype
class Bedrock():
@@ -10,7 +10,7 @@ class Bedrock():
Parameters
----------
-    session : boto3.Session
-        A Boto3 session object configured with appropriate AWS credentials.
-    model_region: str
-        The AWS region where the Bedrock model is hosted.
+    bedrock_runtime : botocore.client.BaseClient
+        A boto3 bedrock-runtime client configured with appropriate AWS credentials.
@@ -31,16 +31,14 @@ class Bedrock():
@beartype
def __init__(
self,
- session:boto3.Session,
- model_region="us-east-1",
- model_id:str="meta.llama3-8b-instruct-v1:0",
+ bedrock_runtime,
):
- self.client = session.client("bedrock-runtime", region_name=model_region)
- self.model_id = model_id,
-
+ self.bedrock_runtime = bedrock_runtime
+
@beartype
def prompt(
self,
+ model_id:str,
user_prompt:str,
system_prompt:str="",
top_p:float=0.5,
@@ -89,32 +87,51 @@ def prompt(
# call bedrock model
try:
# Invoke the model with the request.
- response = self.client.invoke_model(modelId=self.model_id, body=request)
+ response = self.bedrock_runtime.invoke_model(modelId=model_id, body=request)
except Exception as e:
- raise Exception(f"ERROR: Can't invoke '{self.model_id}'. Reason: {e}")
+ raise Exception(f"ERROR: Can't invoke '{model_id}'. Reason: {e}")
# Decode and extract the response
model_response = json.loads(response["body"].read())
response_text = model_response["generation"]
return response_text
+
+ @beartype
+ def converse(
+ self,
+ modelId:str,
+ messages:List,
+ system:List,
+ inference_config:Dict={"maxTokens":512, "temperature":0.5, "topP":0.5,},
+        tools_config:Optional[Dict]=None
+ ):
+ """
+ Invoke the Bedrock model with the provided messages and configurations.
-system_prompt = """# Task
-
-You are a name generator for people from different countries in Europe. Your task is to generate an arbitrary N number of distinct and varied first names and last names for people from a given European country of origin.
-
-# Requirements
-
-- Generate typical names for both male and female people.
-- The names do not need to be traditional to the target European country.
-- Do not repeat any first names or last names more than once. Each individual first name must be unique and each individual last name must be unique.
-- You should return the first names and last names using a valid JSON object tagged as .
-- The valid JSON object should be of the following structure; {"firstnames":["first name 1","first name 2",...,"first name N"], "lastnames":["last name 1","last name 2",...,"last name N"]}
-
-# Examples
-
-- Generate 2 first names and 2 last names for people from the country "Germany" -> {"firstnames":["Max","Hannah"], "lastnames":["Müller","Schmidt"]}
-- Generate 4 first names and 4 last names for people from the country "United Kingdom" -> {"firstnames":["George","Richard","Katie","Mary"], "lastnames":["Smith","Taylor","Jones","Brown"]}
-- Generate 3 first names and 3 last names for people from the country "France" -> {"firstnames":["Lola","Mathieu","Léa"], "lastnames":["Benoît","Pierre","Lefort"]}
-- Generate 5 first names and 5 last names for people from the country "Spain" -> {"firstnames":["Juan","Cristina","Javier","Julia","Isabel"], "lastnames":["Garcia","Martinez","Rodriguez","Lopez","Gomez"]}
-- Generate 6 first names and 6 last names for people from the country "Sweden" -> {"firstnames":["Tova","Alva","Casper","Märta","Axel","Elsa"], "lastnames":["Andersson","Johansson","Lundberg","Svensson","Pettersson","Nilsson"]}"""
+ Parameters
+ ----------
+        modelId : str
+            The id of the Bedrock model to invoke.
+        messages : List
+            A list of message objects representing the conversation history.
+        system : List
+            A list of system message objects providing context or instructions for the model.
+        inference_config : Dict
+            Configuration settings for inference parameters.
+        tools_config : Optional[Dict]
+            Configuration settings for any tools to be used during inference.
-prompt = 'Generate {n_user_names} first names and {n_user_names} last names for people from the country "{country}"'
\ No newline at end of file
+ Returns
+ -------
+ Dict:
+        The response from the Bedrock model.
+
+ References
+ ----------
+ https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-runtime/client/converse.html
+ """
+ payload = {"modelId": modelId, "messages": messages, "system": system}
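+        # only attach the optional configs when supplied; boto3 rejects explicit None parameter values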
+ if inference_config:
+ payload["inferenceConfig"] = inference_config
+ if tools_config:
+ payload["toolsConfig"] = tools_config
+ # call converse api
+ response = self.bedrock_runtime.converse(**payload)
+ return response
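+
+# example usage (a sketch; assumes an existing boto3 bedrock-runtime client):
+#   bedrock = Bedrock(bedrock_runtime=session.client("bedrock-runtime"))
+#   response = bedrock.converse(modelId="us.meta.llama3-1-70b-instruct-v1:0", messages=[{"role":"user","content":[{"text":"Hello"}]}], system=[{"text":"Reply briefly."}])
+#   text = response["output"]["message"]["content"][0]["text"]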
diff --git a/generator/utilities/gen_trans_status.py b/generator/utilities/gen_trans_status.py
index 35fbbd3..f41ac62 100644
--- a/generator/utilities/gen_trans_status.py
+++ b/generator/utilities/gen_trans_status.py
@@ -33,7 +33,7 @@ def gen_trans_status(
country_code_columns = ["registration_country_code","ip_country_code","card_country_code"]
# if card hash
if pd.notna(series['card_hash']):
- status = "rejected"
+ status = "Rejected"
# add rejections based on crime rates within country codes
if rejection_rates_dict["country_code_trans_reject_rate_dict"][np.random.choice(a=series[country_code_columns].dropna().to_list(), size=1)[0]] >= random.uniform(0, 1)/rejection_scaling_factor:
error_code = np.random.choice(a=list(cons.data_model_rejection_codes_fraud.keys()),p=list(cons.data_model_rejection_codes_fraud.values()),size=1)[0]
@@ -59,11 +59,11 @@ def gen_trans_status(
error_code = np.random.choice(a=list(cons.data_model_rejection_codes_funds.keys()),p=list(cons.data_model_rejection_codes_funds.values()),size=1)[0]
# otherwise return successful status
else:
- successful_status = {key:cons.data_model_transaction_status[key] for key in ['successful', 'pending']}
+ successful_status = {key:cons.data_model_transaction_status[key] for key in ['Successful', 'Pending']}
successful_probs = [value/sum(successful_status.values()) for value in successful_status.values()]
status = np.random.choice(a=list(successful_status.keys()), size=1, p=successful_probs)[0]
error_code = np.nan
else:
- status = np.random.choice(a=['successful', 'pending'], size=1, p=[0.98, 0.02])[0]
+ status = np.random.choice(a=['Successful', 'Pending'], size=1, p=[0.98, 0.02])[0]
error_code = np.nan
return [status, error_code]
diff --git a/generator/utilities/gen_user_names_file.py b/generator/utilities/gen_user_names_file.py
deleted file mode 100644
index c8765fe..0000000
--- a/generator/utilities/gen_user_names_file.py
+++ /dev/null
@@ -1,208 +0,0 @@
-import os
-import json
-import boto3
-import sys
-import time
-import logging
-import unidecode
-import pandas as pd
-import numpy as np
-
-sys.path.append("E:\\GitHub\\RandomTelecomPayments\\generator")
-
-import cons
-from utilities.Bedrock import Bedrock, prompt, system_prompt
-
-def invoke_bedrock(
- model:Bedrock,
- n_user_names:int,
- country:str,
- countrieseurope:pd.DataFrame,
- ) -> tuple[pd.DataFrame, pd.DataFrame]:
- """
- Invokes the Bedrock model to generate user names for a specified country.
-
- This function calls the Bedrock model with a formatted prompt to generate first names
- and last names for a given country. It processes the model's response, parses the JSON
- output, and merges the results with country data. The function deduplicates and standardizes
- the name formatting, then persists the data to temporary CSV files.
-
- Parameters
- ----------
- model : Bedrock
- The Bedrock model instance used to generate names.
- n_user_names : int
- The number of user names to generate.
- country : str
- The country for which to generate names.
- countrieseurope : pd.DataFrame
- A DataFrame containing country information for merging.
-
- Returns
- -------
- tuple:
- A tuple containing two pandas DataFrames:
- - tmp_firstname_country_data (pd.DataFrame): DataFrame with deduplicated and standardized first names along with country information.
- - tmp_lastname_country_data (pd.DataFrame): DataFrame with deduplicated and standardized last names along with country information.
-
- Raises
- ------
- json.JSONDecodeError: If the model response cannot be parsed as JSON.
- KeyError: If the expected keys ("firstnames", "lastnames") are missing from the JSON response.
- Exception: If the merge with country data fails or file I/O operations encounter errors.
-
- Notes
- -----
- - Names are standardized by converting to lowercase, removing extra whitespace, and applying Unicode normalization using unidecode.
- - Duplicate names are removed after each processing step.
- - Results are concatenated with any previously generated data for the same country and saved to temporary CSV files if the new data increases the dataset size.
- - CSV files are encoded in latin1 format.
-
- """
- logging.info("Calling Bedrock ...")
- # call bedrock model
- formatted_prompt = prompt.format(n_user_names=n_user_names, country=country)
- logging.info(formatted_prompt)
- model_response = model.prompt(user_prompt=formatted_prompt, system_prompt=system_prompt, max_gen_len=2048)
- # split out answer
- text = model_response.split("")[1].split("")[0]
- # parse json
- try:
- record_set = json.loads(text)
- except json.JSONDecodeError as e:
- raise Exception(f"Error parsing JSON: {e}")
- logging.info("Processing results ...")
- # generate pandas dataframe
- user_firstname_data = pd.Series(record_set["firstnames"], name="firstnames").to_frame().drop_duplicates(subset=["firstnames"])
- user_lastname_data = pd.Series(record_set["lastnames"], name="lastnames").to_frame().drop_duplicates(subset=["lastnames"])
- # add country
- user_firstname_data['country'] = country
- user_lastname_data['country'] = country
- # join on country codes
- llama_firstname_country_data = user_firstname_data.merge(right=countrieseurope, left_on='country', right_on='name', how='inner').drop(columns=['name'])
- llama_lastname_country_data = user_lastname_data.merge(right=countrieseurope, left_on='country', right_on='name', how='inner').drop(columns=['name'])
- # print shapes
- logging.info(f"llama_firstname_country_data.shape: {llama_firstname_country_data.shape}")
- logging.info(f"llama_lastname_country_data.shape: {llama_lastname_country_data.shape}")
- # format output file paths
- fpath_temp_llama_firstnames = cons.fpath_temp_llama_firstnames.format(country=country.lower())
- fpath_temp_llama_lastnames = cons.fpath_temp_llama_lastnames.format(country=country.lower())
- # check against previous iterations
- tmp_firstname_country_data = pd.DataFrame()
- tmp_lastname_country_data = pd.DataFrame()
- if os.path.exists(fpath_temp_llama_firstnames):
- tmp_firstname_country_data = pd.read_csv(fpath_temp_llama_firstnames, encoding="latin1")
- if os.path.exists(fpath_temp_llama_lastnames):
- tmp_lastname_country_data = pd.read_csv(fpath_temp_llama_lastnames, encoding="latin1")
- # concatenate results
- tmp_firstname_country_data = pd.concat(objs=[tmp_firstname_country_data, llama_firstname_country_data], axis=0, ignore_index=True)
- tmp_lastname_country_data = pd.concat(objs=[tmp_lastname_country_data, llama_lastname_country_data], axis=0, ignore_index=True)
- # standardise names formatting
- standardise_text_lambda = lambda x: unidecode.unidecode(" ".join(x.lower().strip().split())) if pd.isna(x) else x
- tmp_firstname_country_data["firstnames"] = tmp_firstname_country_data["firstnames"].apply(lambda x: standardise_text_lambda(x))
- tmp_lastname_country_data["lastnames"] = tmp_lastname_country_data["lastnames"].apply(lambda x: standardise_text_lambda(x))
- # deduplicate data
- tmp_firstname_country_data = tmp_firstname_country_data.drop_duplicates(subset=["firstnames"])
- tmp_lastname_country_data = tmp_lastname_country_data.drop_duplicates(subset=["lastnames"])
- # print shapes
- logging.info(f"tmp_firstname_country_data.shape: {tmp_firstname_country_data.shape}")
- logging.info(f"tmp_lastname_country_data.shape: {tmp_lastname_country_data.shape}")
- # save firstnames names data to temp directory (if pairwise firstnames have been created)
- if tmp_firstname_country_data.shape[0] >= llama_firstname_country_data.shape[0]:
- tmp_firstname_country_data.to_csv(fpath_temp_llama_firstnames, index=False, encoding="latin1")
- logging.info(f"Wrote {fpath_temp_llama_firstnames} ...")
- # save lastnames data to temp directory (if pairwise lastnames have been created)
- if tmp_lastname_country_data.shape[0] >= llama_lastname_country_data.shape[0]:
- tmp_lastname_country_data.to_csv(fpath_temp_llama_lastnames, index=False, encoding="latin1")
- logging.info(f"Wrote {fpath_temp_llama_lastnames} ...")
- return (tmp_firstname_country_data, tmp_lastname_country_data)
-
-if __name__ == "__main__":
-
- # set up logging
- lgr = logging.getLogger()
- lgr.setLevel(logging.INFO)
-
- # load aws config
- with open(cons.fpath_aws_session_token, "r") as j:
- aws_config = json.loads(j.read())
-
- # connect to aws boto3
- session = boto3.Session(
- aws_access_key_id=aws_config['Credentials']["AccessKeyId"],
- aws_secret_access_key=aws_config['Credentials']["SecretAccessKey"],
- aws_session_token=aws_config['Credentials']["SessionToken"],
- region_name="us-east-1"
- )
-
- # create bedrock instance
- bedrock = Bedrock(session=session, model_region="us-east-1", model_id="meta.llama3-70b-instruct-v1:0")
-
- # load countries, firstnames and surnames files
- countrieseurope = pd.read_csv(cons.fpath_countries_europe, usecols=['name', 'ISO numeric'])
- orig_firstnames = pd.read_csv(cons.fpath_firstnames)
- orig_surnames = pd.read_csv(cons.fpath_lastnames)
-
- # determine file size
- orig_filesize = int((orig_firstnames.shape[0] + orig_surnames.shape[0])/2)
- n_countries = countrieseurope.shape[0]
- n_user_names = min(50, int(orig_filesize / n_countries))
-
- # generate user names
- firstname_country_data = []
- lastname_country_data = []
- error_countries = []
- # switch to toggle bedrock calls
- run_bedrock = False
-
- # set countries list
- countries_list = countrieseurope['name'].to_list()
- #countries_list = ['Cyprus']
-
- for country in countries_list:
- logging.info(f"{country} ...")
- try:
- if run_bedrock:
- # call bedrock model and generate user names data
- tmp_firstname_country_data, tmp_lastname_country_data = invoke_bedrock(model=bedrock, n_user_names=n_user_names, country=country)
- logging.info("Waiting ...")
- # wait 20 seconds before retrying
- time.sleep(20)
- else:
- tmp_firstname_country_data = pd.read_csv(cons.fpath_temp_llama_firstnames.format(country=country.lower()), encoding="latin1")
- tmp_lastname_country_data = pd.read_csv(cons.fpath_temp_llama_lastnames.format(country=country.lower()), encoding="latin1")
- # append to user country data
- firstname_country_data.append(tmp_firstname_country_data)
- lastname_country_data.append(tmp_lastname_country_data)
- except Exception as e:
- logging.info(e)
- error_countries.append(country)
-
- # log if any countries failed to generate data
- if len(error_countries) > 0:
- logging.info(f"Failed to generated data for countries: {error_countries}")
-
- # load existing reference data
- firstname_country_df = pd.read_csv(cons.fpath_llama_firstnames, encoding="latin1")
- lastname_country_df = pd.read_csv(cons.fpath_llama_lastnames, encoding="latin1")
- # append to country data lists
- firstname_country_data.append(firstname_country_df)
- lastname_country_data.append(lastname_country_df)
- # concatenate user country data together and deduplicate across firstnames and countries
- output_firstname_country_df = pd.concat(firstname_country_data, axis=0, ignore_index=True)
- output_lastname_country_df = pd.concat(lastname_country_data, axis=0, ignore_index=True)
- # sort and deduplicate output data
- output_firstname_country_df = output_firstname_country_df.drop_duplicates(subset=["country","firstnames"]).sort_values(by=["country","firstnames"])
- output_lastname_country_df = output_lastname_country_df.drop_duplicates(subset=["country","lastnames"]).sort_values(by=["country","lastnames"])
-
- # write data to disk
- if output_firstname_country_df['country'].nunique() == n_countries:
- logging.info(f"output_firstname_country_df.shape: {output_firstname_country_df.shape}")
- output_firstname_country_df.to_csv(cons.fpath_llama_firstnames, index=False, encoding="latin1")
- else:
- logging.info("WARNING Insufficient first name data generated.")
- if output_lastname_country_df['country'].nunique() == n_countries:
- logging.info(f"output_lastname_country_df.shape: {output_lastname_country_df.shape}")
- output_lastname_country_df.to_csv(cons.fpath_llama_lastnames, index=False, encoding="latin1")
- else:
- logging.info("WARNING Insufficient last name data generated.")
\ No newline at end of file