diff --git a/data/unittest/transaction_data.parquet b/data/unittest/transaction_data.parquet index 1067bf5..47b78f1 100644 Binary files a/data/unittest/transaction_data.parquet and b/data/unittest/transaction_data.parquet differ diff --git a/data/unittest/user_data.parquet b/data/unittest/user_data.parquet index af44975..bb932c9 100644 Binary files a/data/unittest/user_data.parquet and b/data/unittest/user_data.parquet differ diff --git a/generator/app/ProgrammeParams.py b/generator/app/ProgrammeParams.py index 963db5b..01de6d5 100644 --- a/generator/app/ProgrammeParams.py +++ b/generator/app/ProgrammeParams.py @@ -1,8 +1,51 @@ -import cons from datetime import datetime from beartype import beartype +import cons + class ProgrammeParams(): + """ + Class to manage and store programme parameters for the telecom payment generator. + This class validates and initializes all configuration parameters needed for the + payment generation process, including user counts, application volumes, and date ranges + for registration and transaction periods. + + Parameters + ---------- + n_users : int, optional + Number of users. Defaults to 100. + random_seed : int, optional + Seed for reproducible randomization. Defaults to None. + n_applications : int, optional + Number of applications. Defaults to 20000. + registration_start_date : str, optional + Registration period start date. Defaults to cons.default_registration_start_date. + registration_end_date : str, optional + Registration period end date. Defaults to cons.default_registration_end_date. + transaction_start_date : str, optional + Transaction period start date. Defaults to cons.default_transaction_start_date. + transaction_end_date : str, optional + Transaction period end date. Defaults to cons.default_transaction_end_date. + + Attributes + ---------- + random_seed : int, optional + Seed for random number generation for reproducibility. + n_users : int + Number of users to generate. Defaults to 100. + n_applications : int + Number of applications to generate. Defaults to 20000. + registration_start_date : str + Start date for user registration (format: YYYY-MM-DD). + registration_end_date : str + End date for user registration (format: YYYY-MM-DD). + transaction_start_date : str + Start date for transactions (format: YYYY-MM-DD). + transaction_end_date : str + End date for transactions (format: YYYY-MM-DD). + transaction_timescale : float + The transaction period duration in years. 
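+
+    Examples
+    --------
+    A minimal, hypothetical usage sketch; the argument values below are
+    illustrative only:
+
+    >>> params = ProgrammeParams(n_users=50, random_seed=1, n_applications=1000)
+    >>> isinstance(params.transaction_timescale, float)
+    True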
+ """ @beartype def __init__( @@ -13,7 +56,7 @@ def __init__( registration_start_date:str=cons.default_registration_start_date, registration_end_date:str=cons.default_registration_end_date, transaction_start_date:str=cons.default_transaction_start_date, - transaction_end_date:str=cons.default_transaction_end_date + transaction_end_date:str=cons.default_transaction_end_date, ): # take programme parameters from class parameters self.random_seed = random_seed diff --git a/generator/app/gen_random_telecom_data.py b/generator/app/gen_random_telecom_data.py index 9a0be63..1b400cd 100644 --- a/generator/app/gen_random_telecom_data.py +++ b/generator/app/gen_random_telecom_data.py @@ -1,4 +1,6 @@ import numpy as np +import pandas as pd +from typing import Dict import random from beartype import beartype @@ -16,22 +18,25 @@ @beartype def gen_random_telecom_data( - n_users=1, - random_seed=None, - registration_start_date=cons.default_registration_start_date, - registration_end_date=cons.default_registration_end_date, - transaction_start_date=cons.default_transaction_start_date, - transaction_end_date=cons.default_transaction_end_date - ): + n_users:int=1, + random_seed:int=None, + n_applications:int=20000, + registration_start_date:str=cons.default_registration_start_date, + registration_end_date:str=cons.default_registration_end_date, + transaction_start_date:str=cons.default_transaction_start_date, + transaction_end_date:str=cons.default_transaction_end_date, + ) -> Dict[str, pd.DataFrame]: """ Generates random telecommunications data. - + Parameters ---------- - n_users : float + n_users : int The number of users to generate random telecom payments data for, default is 1. random_seed : int A set random seed for reproducible results, default is None. + n_applications : int + The number of applications to generate, default is 20000. registration_start_date : str The user registration start date, default is cons.default_registration_start_date. registration_end_date : str @@ -40,28 +45,28 @@ def gen_random_telecom_data( The user transaction start date, default is cons.default_transaction_start_date. transaction_end_date : str The user transaction end date, default is cons.default_transaction_end_date. - + Returns ------- - pandas.DataFrame + Dict[str, pandas.DataFrame] A random telecommunication payments dataset. 
""" - + # initalise programme parameters programmeparams = ProgrammeParams( - n_users=n_users, + n_users=n_users, random_seed=random_seed, - n_applications=20000, - registration_start_date=registration_start_date, + n_applications=n_applications, + registration_start_date=registration_start_date, registration_end_date=registration_end_date, transaction_start_date=transaction_start_date, transaction_end_date=transaction_end_date ) - + # set random seed random.seed(programmeparams.random_seed) np.random.seed(seed=programmeparams.random_seed) - + # generate random users user_obj = User( n_user_ids=programmeparams.n_users, @@ -69,23 +74,23 @@ def gen_random_telecom_data( end_date=programmeparams.registration_end_date, fpath_firstnames=cons.fpath_llama_firstnames, fpath_lastnames=cons.fpath_llama_lastnames, - fpath_countrieseurope=cons.fpath_countrieseurope, - fpath_domain_email=cons.fpath_domain_email + fpath_countries_europe=cons.fpath_countries_europe, + fpath_email_domain =cons.fpath_email_domain ) - + # generate random entity counts for each user random_entity_counts = gen_random_entity_counts( user_obj=user_obj, transaction_timescale=programmeparams.transaction_timescale ) - + # generate random entity values device_obj = Device(n_device_hashes=random_entity_counts['n_devices'].sum()) card_obj = Card(n_card_hashes=random_entity_counts['n_cards'].sum()) ip_obj = Ip(n_ip_hashes=random_entity_counts['n_ips'].sum()) transaction_obj = Transaction(n_transaction_hashes=random_entity_counts['n_transactions'].sum(), start_date=programmeparams.transaction_start_date, end_date=programmeparams.transaction_end_date) application_obj = Application(n_application_hashes=programmeparams.n_applications) - + # generate user level data user_data = gen_user_data( random_entity_counts=random_entity_counts, @@ -96,7 +101,7 @@ def gen_random_telecom_data( transaction_obj=transaction_obj, application_obj=application_obj, ) - + # generate transaction level data trans_data = gen_trans_data( user_data=user_data, @@ -108,5 +113,5 @@ def gen_random_telecom_data( application_obj=application_obj, fpath_countrycrimeindex=cons.fpath_countrycrimeindex ) - + return {"user_data":user_data, "trans_data":trans_data} diff --git a/generator/app/gen_trans_data.py b/generator/app/gen_trans_data.py index 69cfc8a..363941d 100644 --- a/generator/app/gen_trans_data.py +++ b/generator/app/gen_trans_data.py @@ -1,8 +1,9 @@ import random import pandas as pd import numpy as np -import cons from datetime import datetime +from beartype import beartype + from objects.User import User from objects.Device import Device from objects.Card import Card @@ -14,7 +15,7 @@ from utilities.gen_trans_rejection_rates import gen_trans_rejection_rates from utilities.gen_trans_status import gen_trans_status from utilities.join_idhashes_dict import join_idhashes_dict -from beartype import beartype +import cons @beartype def gen_trans_data( @@ -25,11 +26,11 @@ def gen_trans_data( ip_obj:Ip, transaction_obj:Transaction, application_obj:Application, - fpath_countrycrimeindex:str=cons.fpath_countrycrimeindex + fpath_countrycrimeindex:str=cons.fpath_countrycrimeindex, ): """ Generates random transaction level telecom payments data. - + Parameters ---------- user_data : pandas.DataFrame @@ -48,22 +49,23 @@ def gen_trans_data( The random application data model object. fpath_countrycrimeindex : str The full file path to the country crime index reference data, default is cons.fpath_countrycrimeindex. 
- + Returns ------- pandas.DataFrame The random transaction level telecom payments data. """ - + # explode user data to transaction level trans_data = user_data.explode('transaction_hash').dropna(subset = ['transaction_hash']).reset_index(drop = True) # select uid entity hashes for each transaction - trans_data['device_hash'] = trans_data['device_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if x != [] else np.nan) - trans_data['card_hash'] = trans_data['card_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if x != [] else np.nan) - trans_data['ip_hash'] = trans_data['ip_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if x != [] else np.nan) - trans_data['application_hash'] = trans_data['application_hash'].apply(lambda x: np.random.choice(x, size = 1)[0]) + trans_data['device_hash'] = trans_data['device_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if isinstance(x, list) and x != [] else np.nan) + trans_data['card_hash'] = trans_data['card_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if isinstance(x, list) and x != [] else np.nan) + trans_data['ip_hash'] = trans_data['ip_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if isinstance(x, list) and x != [] else np.nan) + trans_data['application_hash'] = trans_data['application_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if isinstance(x, list) and x != [] else np.nan) # add null values card hashes - trans_data['card_hash'] = trans_data['card_hash'].apply(lambda x: np.nan if random.uniform(0, 1) <= cons.data_model_null_rates['card'] else x) + trans_null_mask = np.random.uniform(size=trans_data.shape[0]) <= cons.data_model_null_rates['card'] + trans_data.loc[trans_null_mask, 'card_hash'] = np.nan # add shared hashed entities between users trans_data['ip_hash'] = trans_data['ip_hash'].apply(lambda x: ip_obj.ip_shared_idhash_map_dict[x] if x in ip_obj.ip_shared_idhash_map_dict.keys() else x) trans_data['card_hash'] = trans_data['card_hash'].apply(lambda x: card_obj.card_shared_idhash_map_dict[x] if x in card_obj.card_shared_idhash_map_dict.keys() else x) @@ -79,7 +81,7 @@ def gen_trans_data( trans_data = join_idhashes_dict(data=trans_data, idhashes_dict=transaction_obj.transaction_hashes_dates_dict, idhash_key_name='transaction_hash', idhash_val_name='transaction_date') # add application data trans_data = join_idhashes_dict(data=trans_data, idhashes_dict=application_obj.application_hashes_payment_channel_dict, idhash_key_name='application_hash', idhash_val_name='card_payment_channel') - + # TODO: wrap this logic up into a separate function # align payment channel with missing card hashes and 0 transaction amounts zero_transaction_amount_filter = (trans_data['transaction_amount'] == 0.0) @@ -90,7 +92,8 @@ def gen_trans_data( trans_data['transaction_payment_method'] = 'card' zero_transaction_amount_filter = (trans_data['transaction_amount'] == 0.0) missing_card_hash_filter = (trans_data['card_hash'].isnull()) - trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = missing_card_hash_filter.apply(lambda x: np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = 1, p = list(cons.data_model_non_card_trans_methods.values()))[0]) + # trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = missing_card_hash_filter.apply(lambda x: np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = 1, p = list(cons.data_model_non_card_trans_methods.values()))[0]) + trans_data.loc[missing_card_hash_filter, 
'transaction_payment_method'] = np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = missing_card_hash_filter.sum(), p = list(cons.data_model_non_card_trans_methods.values())) trans_data.loc[zero_transaction_amount_filter, 'transaction_payment_method'] = np.nan # align country codes for user, ip and card country_code_columns = ['registration_country_code_alpha', 'ip_country_code_alpha', 'card_country_code_alpha'] @@ -105,15 +108,15 @@ def gen_trans_data( dates_series = pd.date_range(start=datetime.strptime(transaction_obj.start_date, "%Y-%m-%d"), end=datetime.strptime(transaction_obj.end_date, "%Y-%m-%d") - pd.Timedelta(days=1), freq="d") trans_data[date_columns] = trans_data[date_columns].apply(lambda s: [s['registration_date'], np.random.choice(a=dates_series[dates_series >= max(s['registration_date'], s['transaction_date'])], size=1)[0]], result_type = 'expand', axis = 1).copy() # map iso numeric country codes to iso alpha country codes - country_codes_map = gen_country_codes_map(fpath_countrieseurope=user_obj.fpath_countrieseurope) + country_codes_map = gen_country_codes_map(fpath_countries_europe=user_obj.fpath_countries_europe) trans_data = join_idhashes_dict(data=trans_data, idhashes_dict=country_codes_map, idhash_key_name='registration_country_code_alpha', idhash_val_name='registration_country_code') trans_data = join_idhashes_dict(data=trans_data, idhashes_dict=country_codes_map, idhash_key_name='card_country_code_alpha', idhash_val_name='card_country_code') trans_data = join_idhashes_dict(data=trans_data, idhashes_dict=country_codes_map, idhash_key_name='ip_country_code_alpha', idhash_val_name='ip_country_code') - + # generate transaction status and error code - rejection_rates_dict = gen_trans_rejection_rates(trans_data=trans_data, fpath_countrieseurope=user_obj.fpath_countrieseurope, fpath_countrycrimeindex=fpath_countrycrimeindex, fpath_domain_email=user_obj.fpath_domain_email) + rejection_rates_dict = gen_trans_rejection_rates(trans_data=trans_data, fpath_countries_europe=user_obj.fpath_countries_europe, fpath_countrycrimeindex=fpath_countrycrimeindex, fpath_email_domain=user_obj.fpath_email_domain) trans_data[['transaction_status', 'transaction_error_code']] = trans_data.apply(lambda series: gen_trans_status(series = series, rejection_rates_dict = rejection_rates_dict), result_type = 'expand', axis = 1) - + # order columns and sort rows by transaction date user_cols = ['userid', 'firstname', 'lastname', 'registration_date', 'registration_country_code', 'uid', 'email_domain'] device_cols = ['device_hash', 'device_type'] @@ -124,5 +127,5 @@ def gen_trans_data( itr_cols = ['itr_hash'] col_order = user_cols + device_cols + card_cols + ip_cols + app_cols + trans_cols + itr_cols trans_data = trans_data[col_order].sort_values(by = 'transaction_date').reset_index(drop = True) - + return trans_data \ No newline at end of file diff --git a/generator/app/gen_user_data.py b/generator/app/gen_user_data.py index 7d52dd5..9c5fa94 100644 --- a/generator/app/gen_user_data.py +++ b/generator/app/gen_user_data.py @@ -1,5 +1,7 @@ import pandas as pd import numpy as np +from beartype import beartype + from objects.User import User from objects.Device import Device from objects.Card import Card @@ -9,7 +11,6 @@ from utilities.gen_obj_idhash_series import gen_obj_idhash_series from utilities.join_idhashes_dict import join_idhashes_dict from utilities.gen_random_hash import gen_random_hash -from beartype import beartype @beartype def gen_user_data( @@ 
-28,17 +29,17 @@ def gen_user_data( ---------- random_entity_counts : pd.DataFrame The randomly generated entities count data - user_obj : class + user_obj : User The random user data model object - device_obj : class + device_obj : Device The random device data model object - card_obj : class + card_obj : Card The random card data model object - ip_obj : class + ip_obj : Ip The random ip data model object - transaction_obj : class + transaction_obj : Transaction The random transaction data model object - application_obj : class + application_obj : Application The random application data model object Returns @@ -58,14 +59,18 @@ def gen_user_data( zero_pad = (userid_date_country_code.str.len() - 11).abs().apply(lambda x: '0'*x) user_data['userid'] = userid_date_country_code + zero_pad + user_data['uid'].astype(str).str[-5:] # add hash data lists - user_data['device_hash'] = gen_obj_idhash_series(idhashes_props_dict=device_obj.device_hashes_props_dict, n_counts_series=user_data['n_devices']) - user_data['card_hash'] = gen_obj_idhash_series(idhashes_props_dict=card_obj.card_hashes_props_dict, n_counts_series=user_data['n_cards']) - user_data['ip_hash'] = gen_obj_idhash_series(idhashes_props_dict=ip_obj.ip_hashes_props_dict, n_counts_series=user_data['n_ips']) - user_data['transaction_hash'] = gen_obj_idhash_series(idhashes_props_dict=transaction_obj.transaction_hashes_props_dict, n_counts_series=user_data['n_transactions']) - user_data['application_hash'] = user_data['n_applications'].apply(lambda x: list(np.random.choice(a = list(application_obj.application_hashes_props_dict.keys()), p = list(application_obj.application_hashes_props_dict.values()), replace = True, size = x))) + user_data['device_hash'] = gen_obj_idhash_series(idhashes=device_obj.device_hashes, n_counts_series=user_data['n_devices']) + user_data['card_hash'] = gen_obj_idhash_series(idhashes=card_obj.card_hashes, n_counts_series=user_data['n_cards']) + user_data['ip_hash'] = gen_obj_idhash_series(idhashes=ip_obj.ip_hashes, n_counts_series=user_data['n_ips']) + user_data['transaction_hash'] = gen_obj_idhash_series(idhashes=transaction_obj.transaction_hashes, n_counts_series=user_data['n_transactions']) + # generate application hashes per user + #user_data['application_hash'] = user_data['n_applications'].apply(lambda x: list(np.random.choice(a = list(application_obj.application_hashes_props_dict.keys()), p = list(application_obj.application_hashes_props_dict.values()), replace = True, size = x))) + total_application_hashes = user_data['n_applications'].sum() + split_indices = user_data['n_applications'].cumsum()[:-1].values + application_hashes = np.random.choice(a = list(application_obj.application_hashes_props_dict.keys()), p=list(application_obj.application_hashes_props_dict.values()), replace=True, size=total_application_hashes) + user_data['application_hash'] = pd.Series(np.split(application_hashes, split_indices)).apply(lambda x: x.tolist()) # drop excess columns - drop_columns = ['n_devices', 'n_cards', 'n_ips', 'n_applications', 'n_transactions'] - user_data = user_data.drop(columns = drop_columns) + user_data = user_data.drop(columns = ['n_devices', 'n_cards', 'n_ips', 'n_applications', 'n_transactions']) # create a hash value for the dataset (to distinguish between different iterations) user_data['itr_hash'] = gen_random_hash(size=1)[0] return user_data \ No newline at end of file diff --git a/generator/cons.py b/generator/cons.py index 783ac56..25fb9e1 100644 --- a/generator/cons.py +++ b/generator/cons.py @@ -17,9 
+17,9 @@ fpath_arch_randomtelecomdata = os.path.join(subdir_data, 'arch', 'RandomTelecomPayments.csv') fpath_temp_llama_firstnames = os.path.join(subdir_data, 'temp', 'llama_firstnames_{country}.csv') fpath_temp_llama_lastnames = os.path.join(subdir_data, 'temp', 'llama_lastnames_{country}.csv') -fpath_domain_email = os.path.join(subdir_data, 'ref', 'email-domains.csv') +fpath_email_domain = os.path.join(subdir_data, 'ref', 'email-domains.csv') fpath_countrycrimeindex = os.path.join(subdir_data, 'ref', 'country_crime_index.csv') -fpath_countrieseurope = os.path.join(subdir_data, 'ref', 'Countries-Europe.csv') +fpath_countries_europe = os.path.join(subdir_data, 'ref', 'Countries-Europe.csv') fpath_firstnames = os.path.join(subdir_data, 'ref', 'first-names.txt') fpath_lastnames = os.path.join(subdir_data, 'ref', 'last-names.txt') fpath_llama_firstnames = os.path.join(subdir_data, 'ref', 'llama_firstnames.csv') @@ -73,4 +73,7 @@ data_model_rejection_codes_connection = {'E900:ConnectionTimeout':0.45, 'E901:SuspectedFraud':0.1, 'E902:AuthenicationFailure':0.2, 'E903:UserCancelled':0.15, 'E904:InsufficientFunds':0.1} data_model_rejection_codes_user = {'E900:ConnectionTimeout':0.05, 'E901:SuspectedFraud':0.1, 'E902:AuthenicationFailure':0.1, 'E903:UserCancelled':0.45, 'E904:InsufficientFunds':0.3} data_model_rejection_codes_funds = {'E900:ConnectionTimeout':0.1, 'E901:SuspectedFraud':0.1, 'E902:AuthenicationFailure':0.1, 'E903:UserCancelled':0.25, 'E904:InsufficientFunds':0.45} -data_model_rejection_codes_authentication = {'E900:ConnectionTimeout':0.25, 'E901:SuspectedFraud':0.05, 'E902:AuthenicationFailure':0.45, 'E903:UserCancelled':0.15, 'E904:InsufficientFunds':0.1} \ No newline at end of file +data_model_rejection_codes_authentication = {'E900:ConnectionTimeout':0.25, 'E901:SuspectedFraud':0.05, 'E902:AuthenicationFailure':0.45, 'E903:UserCancelled':0.15, 'E904:InsufficientFunds':0.1} + +# set list of generator object types +object_types = ["device","card","ip","transaction","application"] \ No newline at end of file diff --git a/generator/main.py b/generator/main.py index 032a19a..6c1a101 100644 --- a/generator/main.py +++ b/generator/main.py @@ -22,7 +22,7 @@ input_params_dict = commandline_interface() # run input error handling - res = input_error_handling(input_params_dict) + input_error_handling(input_params_dict) logging.info(f'Input Parameters: {input_params_dict}') @@ -35,6 +35,7 @@ ( input_params_dict['n_users'], None if input_params_dict['use_random_seed'] == 0 else itr, + 20000, input_params_dict['registration_start_date'], input_params_dict['registration_end_date'], input_params_dict['transaction_start_date'], diff --git a/generator/objects/Application.py b/generator/objects/Application.py index 013bf2e..8d5d5cb 100644 --- a/generator/objects/Application.py +++ b/generator/objects/Application.py @@ -1,66 +1,75 @@ -import numpy as np import cons from utilities.gen_idhash_cnt_dict import gen_idhash_cnt_dict from utilities.cnt2prop_dict import cnt2prop_dict + +import numpy as np from beartype import beartype +from typing import List, Dict class Application: - + @beartype def __init__( - self, - n_application_hashes:int + self, + n_application_hashes:int, ): """ - The randomly generated application data model object. - + Initialize the Application object with a randomly generated data model. + Parameters ---------- n_application_hashes : int The number of application hashes to generate. - + Attributes ---------- n_application_hashes : int The number of application hashes generated.
lam : float - The lambda parameter of the squared poisson distribution used to generate the application hash counts. - application_hashes_cnts_dict : dict - The application hash counts dictionary. - application_hashes_props_dict : dict - The application hash proportions dictionary. + The lambda parameter for the Poisson distribution used to generate application hash counts. + power : float + The power parameter for the Poisson distribution. + payment_channels : Dict[str, float] + The population proportions of available payment channels. + application_hashes_cnts_dict : Dict[str, int] + Mapping of application hashes to their occurrence counts. + application_hashes_props_dict : Dict[str, float] + Mapping of application hashes to their proportions. + application_hashes_payment_channel_dict : Dict[str, str] + Mapping of application hashes to randomly assigned payment channels. """ self.n_application_hashes = n_application_hashes self.lam = cons.data_model_poisson_params["application"]["lambda"] self.power = cons.data_model_poisson_params["application"]["power"] self.payment_channels = cons.data_model_payment_channels self.application_hashes_cnts_dict = gen_idhash_cnt_dict(idhash_type="hash", n=self.n_application_hashes, lam=self.lam) - self.application_hashes_props_dict = cnt2prop_dict(self.application_hashes_cnts_dict) - self.application_hashes_payment_channel_dict = self.gen_transaction_payment_channel(list(self.application_hashes_cnts_dict.keys()), self.payment_channels) - + self.application_hashes = list(self.application_hashes_cnts_dict.keys()) + self.application_hashes_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.application_hashes_cnts_dict) + self.application_hashes_payment_channel_dict = self.gen_transaction_payment_channel(application_hashes=self.application_hashes, payment_channels=self.payment_channels) + @beartype def gen_transaction_payment_channel( self, - application_hashes:list, - payment_channels:dict - ) -> dict: + application_hashes:List[str], + payment_channels:Dict[str, float], + ) -> Dict[str, str]: """ Generates a dictionary of random application payment channels. - + Parameters ---------- - application_hashes : list + application_hashes : List[str] The application hashes. - payment_channels : dict + payment_channels : Dict[str, float] The population proportion of payment channels. - + Returns ------- - dict + Dict[str, str] A dictionary of transaction payment channels. 
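+
+        Examples
+        --------
+        A hypothetical sketch; the hashes and channel proportions below are
+        made up for illustration:
+
+        >>> app = Application(n_application_hashes=100)
+        >>> channels = app.gen_transaction_payment_channel(
+        ...     application_hashes=['hash_a', 'hash_b'],
+        ...     payment_channels={'channel_x': 0.5, 'channel_y': 0.5},
+        ... )
+        >>> sorted(channels.keys())
+        ['hash_a', 'hash_b']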
""" # randomly sample payment channels based on population proportions - transactoin_payment_channels = list( + transaction_payment_channels = list( np.random.choice( a=list(payment_channels.keys()), p=list(payment_channels.values()), @@ -69,5 +78,5 @@ def gen_transaction_payment_channel( ) ) # return payment channels and application hashes - application_hashes_payment_channels_dict = dict(zip(application_hashes, transactoin_payment_channels)) + application_hashes_payment_channels_dict = dict(zip(application_hashes, transaction_payment_channels)) return application_hashes_payment_channels_dict diff --git a/generator/objects/Card.py b/generator/objects/Card.py index c76f7ac..371423f 100644 --- a/generator/objects/Card.py +++ b/generator/objects/Card.py @@ -1,83 +1,87 @@ -import numpy as np import cons from utilities.gen_idhash_cnt_dict import gen_idhash_cnt_dict from utilities.cnt2prop_dict import cnt2prop_dict from utilities.gen_country_codes_dict import gen_country_codes_dict from utilities.gen_shared_idhashes import gen_shared_idhashes + +import numpy as np from beartype import beartype -from typing import Union +from typing import List, Dict, Union class Card: - + @beartype def __init__( self, n_card_hashes:Union[int,np.int64], - fpath_countrieseurope:str=cons.fpath_countrieseurope + fpath_countries_europe:str=cons.fpath_countries_europe, ): """ The randomly generated card data model object. - + Parameters ---------- n_card_hashes : int The number of card hashes to generate. - fpath_countrieseurope : str - The file path to the european countries reference file, default is cons.fpath_countrieseurope. - + fpath_countries_europe : str + The file path to the european countries reference file, default is cons.fpath_countries_europe. + Attributes ---------- n_card_hashes : int The number of card hashes generated. - card_types_dict : dict + card_types_dict : Dict[str, float] The population proportions of card types. lam : float The lambda parameter of the squared poisson distribution used to generate the card hash counts. + power : float + The power parameter of the squared poisson distribution used to generate the card hash counts. prop_shared_card_hashes : float The population proportion of shared card hashes. - card_hashes_cnts_dict : dict + card_hashes_cnts_dict : Dict[str, int] The card hash counts dictionary. - card_hashes_props_dict : dict + card_hashes_props_dict : Dict[str, float] The card hash proportions dictionary. - card_hashes_type_dict : dict + card_hashes_type_dict : Dict[str, str] The card hash types dictionary. - card_hashes_country_code_dict : dict + card_hashes_country_code_dict : Dict[str, str] The card hash country codes dictionary. - card_hashes_shared_props_dict : dict - The shared card hash proportions dictionary. + card_shared_idhash_map_dict : Dict[str, str] + The card shared idhash mapping dictionary. 
""" self.n_card_hashes = n_card_hashes - self.fpath_countrieseurope = fpath_countrieseurope + self.fpath_countries_europe = fpath_countries_europe self.card_types_dict = cons.data_model_card_types_dict self.lam = cons.data_model_poisson_params["card"]["lambda"] self.power = cons.data_model_poisson_params["card"]["power"] self.prop_shared_card_hashes = cons.data_model_shared_entities_dict["card"] - self.card_hashes_cnts_dict = gen_idhash_cnt_dict(idhash_type="hash", n=self.n_card_hashes, lam=self.lam) - self.card_hashes_props_dict = cnt2prop_dict(self.card_hashes_cnts_dict) - self.card_hashes_type_dict = self.gen_card_type(list(self.card_hashes_cnts_dict.keys()), self.card_types_dict) - self.card_hashes_country_code_dict = gen_country_codes_dict(self.card_hashes_cnts_dict, self.fpath_countrieseurope) - self.card_shared_idhash_map_dict = gen_shared_idhashes(self.card_hashes_cnts_dict, self.prop_shared_card_hashes) - + self.card_hashes_cnts_dict = gen_idhash_cnt_dict(idhash_type="hash", n=self.n_card_hashes, lam=self.lam, power=self.power) + self.card_hashes = list(self.card_hashes_cnts_dict.keys()) + self.card_hashes_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.card_hashes_cnts_dict) + self.card_hashes_type_dict = self.gen_card_type(card_hashes=self.card_hashes, card_types_dict=self.card_types_dict) + self.card_hashes_country_code_dict = gen_country_codes_dict(idhashes=self.card_hashes, fpath_countries_europe=self.fpath_countries_europe) + self.card_shared_idhash_map_dict = gen_shared_idhashes(idhashes=self.card_hashes, prop_shared_idhashes=self.prop_shared_card_hashes) + @beartype def gen_card_type( self, - card_hashes:list, - card_types_dict:dict - ) -> dict: + card_hashes:List[str], + card_types_dict:Dict[str, float], + ) -> Dict[str, str]: """ Generates a dictionary of random card types. - + Parameters ---------- - card_hashes : list + card_hashes : List[str] The card hashes. - card_types_dict : dict + card_types_dict : Dict[str, float] The population proportions of card types. - + Returns ------- - dict - A dictionary of card hash prices. + Dict[str, str] + A dictionary of card types. """ # randomly choose card types based on the population proportions of card types card_types = np.random.choice( diff --git a/generator/objects/Device.py b/generator/objects/Device.py index 4f6ad4d..90d4a30 100644 --- a/generator/objects/Device.py +++ b/generator/objects/Device.py @@ -1,77 +1,80 @@ -import string -import numpy as np -import pandas as pd import cons from utilities.gen_idhash_cnt_dict import gen_idhash_cnt_dict from utilities.cnt2prop_dict import cnt2prop_dict from utilities.gen_shared_idhashes import gen_shared_idhashes + +import numpy as np +import pandas as pd from beartype import beartype -from typing import Union +from typing import List, Dict, Union class Device: - + @beartype def __init__( self, n_device_hashes:Union[int,np.int64], - fpath_smartphones:str=cons.fpath_smartphones + fpath_smartphones:str=cons.fpath_smartphones, ): """ The randomly generated device data model object. - + Parameters ---------- n_device_hashes : int The number of device hashes to generate. fpath_smartphones : str The file path to the smart phones reference file, default is cons.fpath_smartphones. - + Attributes ---------- n_device_hashes : int The number of device hashes generated. lam : float The lambda parameter of the squared poisson distribution used to generate the device hash counts. 
+ power : float + The power parameter of the squared poisson distribution used to generate the device hash counts. prop_shared_device_hashes : float The population proportion of shared device hashes. - device_hashes_cnts_dict : dict + device_hashes_cnts_dict : Dict[str, int] The device hash counts dictionary. - device_hashes_props_dict : dict + device_hashes_props_dict : Dict[str, float] The device hash proportions dictionary. - device_hashes_type_dict : dict + device_hashes_type_dict : Dict[str, str] The device hash types dictionary. - device_hashes_shared_props_dict : dict - The shared device hash proportions dictionary. + device_shared_idhash_map_dict : Dict[str, str] + The device shared idhash mapping dictionary. """ self.n_device_hashes = n_device_hashes self.fpath_smartphones = fpath_smartphones self.lam = cons.data_model_poisson_params["device"]["lambda"] self.power = cons.data_model_poisson_params["device"]["power"] self.prop_shared_device_hashes = cons.data_model_shared_entities_dict["device"] - self.device_hashes_cnts_dict = gen_idhash_cnt_dict(idhash_type="hash", n=self.n_device_hashes, lam=self.lam) - self.device_hashes_props_dict = cnt2prop_dict(self.device_hashes_cnts_dict) - self.device_hashes_type_dict = self.gen_device_type(list(self.device_hashes_cnts_dict.keys()), self.fpath_smartphones) - self.device_shared_idhash_map_dict = gen_shared_idhashes(self.device_hashes_cnts_dict, self.prop_shared_device_hashes) + self.device_hashes_cnts_dict = gen_idhash_cnt_dict(idhash_type="hash", n=self.n_device_hashes, lam=self.lam, power=self.power) + self.device_hashes = list(self.device_hashes_cnts_dict.keys()) + self.device_hashes_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.device_hashes_cnts_dict) + self.device_hashes_type_dict = self.gen_device_types(device_hashes=self.device_hashes, fpath_smartphones=self.fpath_smartphones) + self.device_shared_idhash_map_dict = gen_shared_idhashes(idhashes=self.device_hashes, prop_shared_idhashes=self.prop_shared_device_hashes) @beartype - def gen_device_type( + def gen_device_types( self, - device_hashes:list, - fpath_smartphones:str - ) -> dict: + device_hashes:List[str], + fpath_smartphones:str, + ) -> Dict[str, str]: """ Generates a dictionary of random device types - + Parameters ---------- - device_hashes : list + device_hashes : List[str] The device hashes. fpath_smartphones : str The file path to the smart phones reference file. - + Returns ------- - dict + Dict[str, str] A dictionary of device hash types. """ # load in smartphone data diff --git a/generator/objects/Ip.py b/generator/objects/Ip.py index ad28150..48644b6 100644 --- a/generator/objects/Ip.py +++ b/generator/objects/Ip.py @@ -1,53 +1,57 @@ import cons -import numpy as np from utilities.gen_idhash_cnt_dict import gen_idhash_cnt_dict from utilities.cnt2prop_dict import cnt2prop_dict from utilities.gen_country_codes_dict import gen_country_codes_dict from utilities.gen_shared_idhashes import gen_shared_idhashes + +import numpy as np from beartype import beartype from typing import Union class Ip: - + @beartype def __init__( self, n_ip_hashes:Union[int,np.int64], - fpath_countrieseurope:str=cons.fpath_countrieseurope + fpath_countries_europe:str=cons.fpath_countries_europe, ): """ The randomly generated ip data model object. - + Parameters ---------- n_ip_hashes : int The number of ip hashes to generate. - fpath_countrieseurope : str - The file path to the european countries reference file, default is cons.fpath_countrieseurope. 
- + fpath_countries_europe : str + The file path to the european countries reference file, default is cons.fpath_countries_europe. + Attributes ---------- n_ip_hashes : int The number of ip hashes generated. lam : float The lambda parameter of the squared poisson distribution used to generate the ip hash counts. + power : float + The power parameter of the squared poisson distribution used to generate the ip hash counts. prop_shared_ip_hashes : float The population proportion of shared ip hashes. - ip_hashes_cnts_dict : dict + ip_hashes_cnts_dict : Dict[str, int] The ip hash counts dictionary. - ip_hashes_props_dict : dict + ip_hashes_props_dict : Dict[str, float] The ip hash proportions dictionary. - ip_hashes_country_code_dict : dict + ip_hashes_country_code_dict : Dict[str, str] The ip hash country codes dictionary. - ip_hashes_shared_props_dict : dict - The shared ip hash proportions dictionary. + ip_shared_idhash_map_dict : Dict[str, str] + The shared ip hash mapping dictionary. """ self.n_ip_hashes = n_ip_hashes - self.fpath_countrieseurope = fpath_countrieseurope + self.fpath_countries_europe = fpath_countries_europe self.lam = cons.data_model_poisson_params["ip"]["lambda"] self.power = cons.data_model_poisson_params["ip"]["power"] self.prop_shared_ip_hashes = cons.data_model_shared_entities_dict["ip"] - self.ip_hashes_cnts_dict = gen_idhash_cnt_dict(idhash_type="hash", n=self.n_ip_hashes, lam=self.lam) - self.ip_hashes_props_dict = cnt2prop_dict(self.ip_hashes_cnts_dict) - self.ip_hashes_country_code_dict = gen_country_codes_dict(self.ip_hashes_cnts_dict, self.fpath_countrieseurope) - self.ip_shared_idhash_map_dict = gen_shared_idhashes(self.ip_hashes_cnts_dict, self.prop_shared_ip_hashes) + self.ip_hashes_cnts_dict = gen_idhash_cnt_dict(idhash_type="hash", n=self.n_ip_hashes, lam=self.lam, power=self.power) + self.ip_hashes = list(self.ip_hashes_cnts_dict.keys()) + self.ip_hashes_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.ip_hashes_cnts_dict) + self.ip_hashes_country_code_dict = gen_country_codes_dict(idhashes=self.ip_hashes, fpath_countries_europe=self.fpath_countries_europe) + self.ip_shared_idhash_map_dict = gen_shared_idhashes(idhashes=self.ip_hashes, prop_shared_idhashes=self.prop_shared_ip_hashes) \ No newline at end of file diff --git a/generator/objects/Transaction.py b/generator/objects/Transaction.py index 1ef0911..3265174 100644 --- a/generator/objects/Transaction.py +++ b/generator/objects/Transaction.py @@ -1,25 +1,25 @@ -import numpy as np -import pandas as pd -from datetime import datetime import cons from utilities.gen_idhash_cnt_dict import gen_idhash_cnt_dict from utilities.cnt2prop_dict import cnt2prop_dict from utilities.gen_dates_dict import gen_dates_dict from utilities.round_trans_amount import round_trans_amount + +import numpy as np from beartype import beartype +from typing import List, Dict, Union class Transaction: - + @beartype def __init__( self, - n_transaction_hashes, - start_date, - end_date + n_transaction_hashes:Union[int,np.int64], + start_date:str, + end_date:str, ): """ The randomly generated transaction data model object. - + Parameters ---------- n_transaction_hashes : int @@ -28,7 +28,7 @@ def __init__( The start date to generate transactions from. end_date : str The end date to generate transaction till. - + Attributes ---------- n_transaction_hashes : int @@ -39,23 +39,19 @@ def __init__( The date transactions are generated till, must be of the form '%Y-%m-%d'. 
lam : float The lambda parameter of the squared poisson distribution used to generate the transaction hash counts. - payment_channels : float - The population proportion of payment channels. - transaction_status : float + power : float + The power parameter of the squared poisson distribution used to generate the transaction hash counts. + transaction_status : Dict[str, float] The population proportion of transaction statuses. - rejection_codes : float - The population proportion of rejection codes. - transaction_hashes_cnts_dict : dict + transaction_hashes_cnts_dict : Dict[str, int] The transaction hash counts dictionary. - transaction_hashes_props_dict : dict + transaction_hashes_props_dict : Dict[str, float] The transaction hash proportions dictionary. - transaction_hashes_dates_dict : dict + transaction_hashes_dates_dict : Dict[str, str] The transaction hash dates dictionary. - transaction_hashes_payment_channel_dict : dict - The transaction hash payment channels dictionary. - transaction_hashes_status_dict : dict + transaction_hashes_status_dict : Dict[str, str] The transaction hash status dictionary. - transaction_hashes_amounts_dict : dict + transaction_hashes_amounts_dict : Dict[str, float] The transaction hash amount dictionary. """ self.n_transaction_hashes = n_transaction_hashes @@ -64,31 +60,32 @@ def __init__( self.lam = cons.data_model_poisson_params["transaction"]["lambda"] self.power = cons.data_model_poisson_params["transaction"]["power"] self.transaction_status = cons.data_model_transaction_status - self.transaction_hashes_cnts_dict = gen_idhash_cnt_dict(idhash_type="hash", n=self.n_transaction_hashes, lam=self.lam) - self.transaction_hashes_props_dict = cnt2prop_dict(self.transaction_hashes_cnts_dict) - self.transaction_hashes_dates_dict = gen_dates_dict(self.transaction_hashes_cnts_dict,start_date=self.start_date,end_date=self.end_date,) - self.transaction_hashes_status_dict = self.gen_transaction_status(list(self.transaction_hashes_cnts_dict.keys()), self.transaction_status) - self.transaction_hashes_amounts_dict = self.gen_transaction_amounts(list(self.transaction_hashes_cnts_dict.keys())) - + self.transaction_hashes_cnts_dict = gen_idhash_cnt_dict(idhash_type="hash", n=self.n_transaction_hashes, lam=self.lam, power=self.power) + self.transaction_hashes = list(self.transaction_hashes_cnts_dict.keys()) + self.transaction_hashes_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.transaction_hashes_cnts_dict) + self.transaction_hashes_dates_dict = gen_dates_dict(idhashes=self.transaction_hashes,start_date=self.start_date,end_date=self.end_date,) + self.transaction_hashes_status_dict = self.gen_transaction_status(transaction_hashes=self.transaction_hashes, transaction_status=self.transaction_status) + self.transaction_hashes_amounts_dict = self.gen_transaction_amounts(transaction_hashes=self.transaction_hashes, loc=0, scale=2) + @beartype def gen_transaction_status( self, - transaction_hashes:list, - transaction_status:dict + transaction_hashes:List[str], + transaction_status:Dict[str, float], ): """ Generates a dictionary of random transaction statuses - + Parameters ---------- - transaction_hashes : list + transaction_hashes : List[str] The transaction hashes - transaction_status : dict + transaction_status : Dict[str, float] The population proportion of transaction statuses - + Returns ------- - dict + Dict[str, str] A dictionary of transaction statuses """ # randomly sample transaction status based on population proportions @@ -103,29 +100,29 @@ def 
gen_transaction_status( # return transaction hashes and statuses transaction_hashes_status_dict = dict(zip(transaction_hashes, transaction_status)) return transaction_hashes_status_dict - + @beartype def gen_transaction_amounts( self, - transaction_hashes:list, - loc:float=0, - scale:float=2 - ): + transaction_hashes:List[str], + loc:Union[int, float]=0, + scale:Union[int, float]=2, + ) -> Dict[str, float]: """ Generates a dictionary of random transaction hash amounts. - + Parameters ---------- - transaction_hashes : list + transaction_hashes : List[str] The transaction hashes. loc : float The mean of the transaction amount distribution to generate, default is 0. scale : float The scale of the transaction amount distribution to generate, default is 2. - + Returns ------- - dict + Dict[str, float] A dictionary of transaction hash amounts """ # randomly sample transaction amounts from an absolute normal distribution with mean 0 and standard deviation 2 diff --git a/generator/objects/User.py b/generator/objects/User.py index f89aeef..475d79a 100644 --- a/generator/objects/User.py +++ b/generator/objects/User.py @@ -1,14 +1,16 @@ import cons -import numpy as np -import pandas as pd from utilities.gen_idhash_cnt_dict import gen_idhash_cnt_dict from utilities.cnt2prop_dict import cnt2prop_dict from utilities.gen_country_codes_dict import gen_country_codes_dict from utilities.gen_dates_dict import gen_dates_dict + +import numpy as np +import pandas as pd from beartype import beartype +from typing import Dict class User: - + @beartype def __init__( self, @@ -17,12 +19,12 @@ def __init__( end_date:str, fpath_firstnames:str=cons.fpath_firstnames, fpath_lastnames:str=cons.fpath_lastnames, - fpath_countrieseurope:str=cons.fpath_countrieseurope, - fpath_domain_email:str=cons.fpath_domain_email + fpath_countries_europe:str=cons.fpath_countries_europe, + fpath_email_domain:str=cons.fpath_email_domain, ): """ The randomly generated user data model object - + Parameters ---------- n_user_ids : int The number of user ids to generate. start_date : str The date users are generated from. end_date : str The date users are generated till. fpath_firstnames : str @@ -35,11 +37,11 @@ The full file path to the first names reference data, default is cons.fpath_firstnames. fpath_lastnames : str The full file path to the last names reference data, default is cons.fpath_lastnames. - fpath_countrieseurope : str - The full file path to the europe countries reference data, default is cons.fpath_countrieseurope. - fpath_domain_email : str - The full file path to the email domain reference daa, default is cons.fpath_domain_email. - + fpath_countries_europe : str + The full file path to the European countries reference data, default is cons.fpath_countries_europe. + fpath_email_domain : str + The full file path to the email domain reference data, default is cons.fpath_email_domain.
+ Attributes ---------- n_user_ids : int @@ -50,19 +52,21 @@ def __init__( The date user ids are generated till, must be of the form '%Y-%m-%d' lam : float The lambda parameter of the squared poisson distribution used to generate the user ids counts - user_ids_cnts_dict : dict + power : float + The power parameter of the squared poisson distribution used to generate the user ids counts + user_ids_cnts_dict : Dict[str, int] The user id counts dictionary - user_ids_props_dict : dict + user_ids_props_dict : Dict[str, float] The user id proportions dictionary - user_ids_firstname_dict : dict + user_ids_firstname_dict : Dict[str, str] The user id first names dictionary - user_ids_lastname_dict : dict + user_ids_lastname_dict : Dict[str, str] The user id last names dictionary - user_ids_country_code_dict : dict + user_ids_country_code_dict : Dict[str, str] The user id country codes dictionary - user_ids_email_domain_dict : dict + user_ids_email_domain_dict : Dict[str, str] The user id email domains dictionary - user_ids_dates_dict : dict + user_ids_dates_dict : Dict[str, str] The user id dates dictionary """ self.n_user_ids = n_user_ids @@ -70,34 +74,35 @@ def __init__( self.end_date = end_date self.fpath_firstnames = fpath_firstnames self.fpath_lastnames = fpath_lastnames - self.fpath_countrieseurope = fpath_countrieseurope - self.fpath_domain_email = fpath_domain_email + self.fpath_countries_europe = fpath_countries_europe + self.fpath_email_domain = fpath_email_domain self.lam = cons.data_model_poisson_params["user"]["lambda"] self.power = cons.data_model_poisson_params["user"]["power"] - self.user_ids_cnts_dict = gen_idhash_cnt_dict(idhash_type="id", n=self.n_user_ids, lam=self.lam) - self.user_ids_props_dict = cnt2prop_dict(self.user_ids_cnts_dict) - self.user_ids_country_code_dict = gen_country_codes_dict(self.user_ids_cnts_dict, self.fpath_countrieseurope) - self.user_ids_firstname_dict = self.gen_user_firstname(self.fpath_firstnames) - self.user_ids_lastname_dict = self.gen_user_lastname(self.fpath_lastnames) - self.user_ids_email_domain_dict = self.gen_user_email_domain(self.fpath_domain_email) - self.user_ids_dates_dict = gen_dates_dict(self.user_ids_cnts_dict, start_date=self.start_date, end_date=self.end_date) - + self.user_ids_cnts_dict = gen_idhash_cnt_dict(idhash_type="id", n=self.n_user_ids, lam=self.lam, power=self.power) + self.user_ids = list(self.user_ids_cnts_dict.keys()) + self.user_ids_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.user_ids_cnts_dict) + self.user_ids_country_code_dict = gen_country_codes_dict(idhashes=self.user_ids, fpath_countries_europe=self.fpath_countries_europe) + self.user_ids_firstname_dict = self.gen_user_firstname(fpath_firstnames=self.fpath_firstnames) + self.user_ids_lastname_dict = self.gen_user_lastname(fpath_lastnames=self.fpath_lastnames) + self.user_ids_email_domain_dict = self.gen_user_email_domain(fpath_email_domain=self.fpath_email_domain) + self.user_ids_dates_dict = gen_dates_dict(idhashes=self.user_ids, start_date=self.start_date, end_date=self.end_date) + @beartype def gen_user_firstname( self, - fpath_firstnames:str - ) -> dict: + fpath_firstnames:str, + ) -> Dict[str, str]: """ Generates a dictionary of random user id first names - + Parameters ---------- fpath_firstnames : str The file path to the first names reference file - + Returns ------- - dict + Dict[str, str] A dictionary of user id first names """ # load in list of first names @@ -111,23 +116,23 @@ def gen_user_firstname( # convert key value pairs to dict 
user_ids_firstname_dict = pd.concat([pd.Series(d) for d in user_ids_names_pairs])[country_code_dataframe["user_ids"]].to_dict() return user_ids_firstname_dict - + @beartype def gen_user_lastname( self, - fpath_lastnames:str - ) -> dict: + fpath_lastnames:str, + ) -> Dict[str, str]: """ Generates a dictionary of random user id last names. - + Parameters ---------- fpath_lastnames : str The file path to the last names reference file. - + Returns ------- - dict + Dict[str, str] A dictionary of user id last names. """ # load in list of last names @@ -141,42 +146,40 @@ def gen_user_lastname( # convert key value pairs to dict user_ids_lastname_dict = pd.concat([pd.Series(d) for d in user_ids_names_pairs])[country_code_dataframe["user_ids"]].to_dict() return user_ids_lastname_dict - + @beartype def gen_user_email_domain( self, - fpath_domain_email:str - ) -> dict: + fpath_email_domain:str, + ) -> Dict[str, str]: """ Generates a dictionary of random user id email domains - + Parameters ---------- - fpath_domain_email : str + fpath_email_domain : str The file path to the email domains reference file - + Returns ------- - dict + Dict[str, str] A dictionary of user id email domains """ # load domain names data - email_domain_data = pd.read_csv(fpath_domain_email, index_col=0) + email_domain_data = pd.read_csv(fpath_email_domain, index_col=0) # calculate the proportion of email domains email_domain_data["proportion"] = email_domain_data["proportion"].divide(email_domain_data["proportion"].sum()) # convert email domain proportions to a dictionary email_domain_dict = email_domain_data.set_index("domain").to_dict()["proportion"] - # extract the user ids - user_ids_list = list(self.user_ids_cnts_dict.keys()) # randomly choose the email domains based on proportions user_email_domain_list = list( np.random.choice( a=list(email_domain_dict.keys()), p=list(email_domain_dict.values()), replace=True, - size=len(user_ids_list), + size=len(self.user_ids), ) ) # return the user ids email domains - user_ids_email_domain_dict = dict(zip(user_ids_list, user_email_domain_list)) + user_ids_email_domain_dict = dict(zip(self.user_ids, user_email_domain_list)) return user_ids_email_domain_dict diff --git a/generator/unittests/app/test_gen_user_trans_data.py b/generator/unittests/app/test_gen_user_trans_data.py index 07e5df8..540bb1c 100644 --- a/generator/unittests/app/test_gen_user_trans_data.py +++ b/generator/unittests/app/test_gen_user_trans_data.py @@ -36,8 +36,8 @@ # create relative file paths fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1] fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1] -fpath_countrieseurope = '.' + cons.fpath_countrieseurope.split(cons.fpath_repo_dir)[1] -fpath_domain_email = '.' + cons.fpath_domain_email.split(cons.fpath_repo_dir)[1] +fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] +fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] fpath_smartphones = '.' + cons.fpath_smartphones.split(cons.fpath_repo_dir)[1] fpath_countrycrimeindex = '.' + cons.fpath_countrycrimeindex.split(cons.fpath_repo_dir)[1] fpath_unittest_user_data = '.' 
+ cons.fpath_unittest_user_data.split(cons.fpath_repo_dir)[1] @@ -45,13 +45,13 @@ # generate random users user_obj = User( - n_user_ids=programmeparams.n_users, - start_date=programmeparams.registration_start_date, - end_date=programmeparams.registration_end_date, - fpath_firstnames=fpath_firstnames, - fpath_lastnames=fpath_lastnames, - fpath_countrieseurope=fpath_countrieseurope, - fpath_domain_email=fpath_domain_email + n_user_ids=programmeparams.n_users, + start_date=programmeparams.registration_start_date, + end_date=programmeparams.registration_end_date, + fpath_firstnames=fpath_firstnames, + fpath_lastnames=fpath_lastnames, + fpath_countries_europe=fpath_countries_europe, + fpath_email_domain=fpath_email_domain ) # generate random entity counts for each user @@ -59,8 +59,8 @@ # generate random entity values device_obj = Device(n_device_hashes=random_entity_counts['n_devices'].sum(), fpath_smartphones=fpath_smartphones) -card_obj = Card(n_card_hashes=random_entity_counts['n_cards'].sum(), fpath_countrieseurope=fpath_countrieseurope) -ip_obj = Ip(n_ip_hashes=random_entity_counts['n_ips'].sum(), fpath_countrieseurope=fpath_countrieseurope) +card_obj = Card(n_card_hashes=random_entity_counts['n_cards'].sum(), fpath_countries_europe=fpath_countries_europe) +ip_obj = Ip(n_ip_hashes=random_entity_counts['n_ips'].sum(), fpath_countries_europe=fpath_countries_europe) transaction_obj = Transaction(n_transaction_hashes=random_entity_counts['n_transactions'].sum(), start_date=programmeparams.transaction_start_date, end_date=programmeparams.transaction_end_date) application_obj = Application(n_application_hashes=programmeparams.n_applications) diff --git a/generator/unittests/objects/test_Card.py b/generator/unittests/objects/test_Card.py index 4b9ef35..688455f 100644 --- a/generator/unittests/objects/test_Card.py +++ b/generator/unittests/objects/test_Card.py @@ -42,8 +42,8 @@ random.seed(cons.unittest_seed) np.random.seed(cons.unittest_seed) -fpath_countrieseurope = '.' + cons.fpath_countrieseurope.split(cons.fpath_repo_dir)[1] -card_object = Card(n_card_hashes=exp_n_card_hashes, fpath_countrieseurope=fpath_countrieseurope) +fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] +card_object = Card(n_card_hashes=exp_n_card_hashes, fpath_countries_europe=fpath_countries_europe) obs_card_hashes_cnts_dict = card_object.card_hashes_cnts_dict obs_card_types_dict = card_object.card_types_dict diff --git a/generator/unittests/objects/test_Ip.py b/generator/unittests/objects/test_Ip.py index b8207b8..d7490fa 100644 --- a/generator/unittests/objects/test_Ip.py +++ b/generator/unittests/objects/test_Ip.py @@ -35,8 +35,8 @@ random.seed(cons.unittest_seed) np.random.seed(cons.unittest_seed) -fpath_countrieseurope = '.' + cons.fpath_countrieseurope.split(cons.fpath_repo_dir)[1] -ip_object = Ip(n_ip_hashes=exp_n_ip_hashes, fpath_countrieseurope=fpath_countrieseurope) +fpath_countries_europe = '.' 
+ cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] +ip_object = Ip(n_ip_hashes=exp_n_ip_hashes, fpath_countries_europe=fpath_countries_europe) obs_ip_hashes_cnts_dict = ip_object.ip_hashes_cnts_dict obs_ip_hashes_props_dict = ip_object.ip_hashes_props_dict diff --git a/generator/unittests/objects/test_User.py b/generator/unittests/objects/test_User.py index 80d911c..92a30bd 100644 --- a/generator/unittests/objects/test_User.py +++ b/generator/unittests/objects/test_User.py @@ -10,46 +10,46 @@ from objects.User import User exp_user_ids_cnts_dict = { - "6374692674377254": 420, - "1751409580926382": 318, - "4264861381989413": 244, - "6720317315593519": 387, + "6374692674377254": 20, + "1751409580926382": 29, + "4264861381989413": 19, + "6720317315593519": 26, } exp_user_ids_props_dict = { - "6374692674377254": 0.30679327976625276, - "1751409580926382": 0.2322863403944485, - "4264861381989413": 0.17823228634039445, - "6720317315593519": 0.28268809349890434, + "6374692674377254": 0.2127659574468085, + "1751409580926382": 0.30851063829787234, + "4264861381989413": 0.20212765957446807, + "6720317315593519": 0.2765957446808511, } exp_user_ids_firstname_dict = { - "6374692674377254": "ernst", - "1751409580926382": "mykhaylo", - "4264861381989413": "hugo", - "6720317315593519": "alexandra", + "6374692674377254": "simone", + "1751409580926382": "francesca", + "4264861381989413": "igor", + "6720317315593519": "beckett", } exp_user_ids_lastname_dict = { - "6374692674377254": "buchmann", - "1751409580926382": "lyashenko", - "4264861381989413": "diaz", - "6720317315593519": "mariana", + "6374692674377254": "de filippo", + "1751409580926382": "gagliardi", + "4264861381989413": "lupu", + "6720317315593519": "leslie", } exp_user_ids_country_code_dict = { - "6374692674377254": 276, - "1751409580926382": 804, - "4264861381989413": 724, - "6720317315593519": 642, + "6374692674377254": 380, + "1751409580926382": 380, + "4264861381989413": 498, + "6720317315593519": 826, } exp_user_ids_email_domain_dict = { - "6374692674377254": "gmail.com", + "6374692674377254": "yahoo.com", "1751409580926382": "yahoo.com", - "4264861381989413": "aol.com", - "6720317315593519": "hotmail.com", + "4264861381989413": "yahoo.com", + "6720317315593519": "gmail.com", } exp_user_ids_dates_dict = { - "6374692674377254": np.datetime64("2020-06-20T00:00:00.000000000"), - "1751409580926382": np.datetime64("2020-12-25T00:00:00.000000000"), - "4264861381989413": np.datetime64("2020-08-01T00:00:00.000000000"), - "6720317315593519": np.datetime64("2020-02-04T00:00:00.000000000"), + "6374692674377254": np.datetime64("2020-03-21T00:00:00.000000000"), + "1751409580926382": np.datetime64("2020-06-11T00:00:00.000000000"), + "4264861381989413": np.datetime64("2020-10-15T00:00:00.000000000"), + "6720317315593519": np.datetime64("2020-09-17T00:00:00.000000000"), } exp_start_date = cons.unittest_registration_start_date exp_end_date = cons.unittest_registration_end_date @@ -61,9 +61,9 @@ fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1] fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1] -fpath_countrieseurope = '.' + cons.fpath_countrieseurope.split(cons.fpath_repo_dir)[1] -fpath_domain_email = '.' 
+ cons.fpath_domain_email.split(cons.fpath_repo_dir)[1] -user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countrieseurope=fpath_countrieseurope, fpath_domain_email=fpath_domain_email) +fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] +fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] +user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) obs_user_ids_cnts_dict = user_object.user_ids_cnts_dict obs_user_ids_props_dict = user_object.user_ids_props_dict diff --git a/generator/unittests/utilities/test_align_country_codes.py b/generator/unittests/utilities/test_align_country_codes.py index 787d9f9..d0626b1 100644 --- a/generator/unittests/utilities/test_align_country_codes.py +++ b/generator/unittests/utilities/test_align_country_codes.py @@ -45,7 +45,7 @@ }, { "registration_country_code_alpha": 353, - "ip_country_code_alpha": 353.0, + "ip_country_code_alpha": 42.0, "card_country_code_alpha": np.nan, }, { @@ -62,7 +62,7 @@ ) obs_data_df = input_data_df.apply( lambda series: align_country_codes( - series, proba_comm_ip=0.95, proba_comm_card=0.99 + series, proba_comm_ip=0.05, proba_comm_card=0.01 ), axis=1, ) diff --git a/generator/unittests/utilities/test_gen_country_codes_dict.py b/generator/unittests/utilities/test_gen_country_codes_dict.py index fbad890..f13fd23 100644 --- a/generator/unittests/utilities/test_gen_country_codes_dict.py +++ b/generator/unittests/utilities/test_gen_country_codes_dict.py @@ -10,16 +10,16 @@ np.random.seed(cons.unittest_seed) -cnt_data = {"a": 1, "b": 2, "c": 3, "d": 4} +idhashes = ["a", "b", "c", "d"] exp_prop_dict = {"a": 276, "b": 756, "c": 642, "d": 826} -fpath_countrieseurope = '.' + cons.fpath_countrieseurope.split(cons.fpath_repo_dir)[1] -obs_prop_dict = gen_country_codes_dict(cnt_data, fpath_countrieseurope=fpath_countrieseurope) +fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] +obs_prop_dict = gen_country_codes_dict(idhashes=idhashes, fpath_countries_europe=fpath_countries_europe) class Test_gen_country_codes_dict(unittest.TestCase): """""" def setUp(self): - self.cnt_data = cnt_data + self.idhashes = idhashes self.obs_prop_dict = obs_prop_dict self.exp_prop_dict = exp_prop_dict diff --git a/generator/unittests/utilities/test_gen_country_codes_map.py b/generator/unittests/utilities/test_gen_country_codes_map.py index 6ab2d89..3e84fa5 100644 --- a/generator/unittests/utilities/test_gen_country_codes_map.py +++ b/generator/unittests/utilities/test_gen_country_codes_map.py @@ -18,10 +18,10 @@ 292: 'GI', 492: 'MC', 336: 'VA' } -fpath_countrieseurope = '.' + cons.fpath_countrieseurope.split(cons.fpath_repo_dir)[1] -obs_country_codes_map = gen_country_codes_map(fpath_countrieseurope=fpath_countrieseurope) +fpath_countries_europe = '.' 
+ cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] +obs_country_codes_map = gen_country_codes_map(fpath_countries_europe=fpath_countries_europe) -class Test_gen_country_codes_dict(unittest.TestCase): +class Test_gen_country_codes_map(unittest.TestCase): """""" def setUp(self): diff --git a/generator/unittests/utilities/test_gen_dates_dict.py b/generator/unittests/utilities/test_gen_dates_dict.py index 09e8dc8..fb3d99a 100644 --- a/generator/unittests/utilities/test_gen_dates_dict.py +++ b/generator/unittests/utilities/test_gen_dates_dict.py @@ -10,21 +10,21 @@ np.random.seed(cons.unittest_seed) -cnt_data = {"a": 1, "b": 2, "c": 3, "d": 4} +idhashes = ["a", "b", "c", "d"] exp_prop_dict = { "a": np.datetime64("2020-04-12T00:00:00.000000000"), "b": np.datetime64("2021-03-11T00:00:00.000000000"), "c": np.datetime64("2020-09-27T00:00:00.000000000"), "d": np.datetime64("2020-04-16T00:00:00.000000000"), } -obs_prop_dict = gen_dates_dict(cnt_data, start_date="2020-01-01", end_date="2021-12-31") +obs_prop_dict = gen_dates_dict(idhashes, start_date="2020-01-01", end_date="2021-12-31") class Test_gen_dates_dict(unittest.TestCase): """""" def setUp(self): - self.cnt_data = cnt_data + self.idhashes = idhashes self.obs_prop_dict = obs_prop_dict self.exp_prop_dict = exp_prop_dict diff --git a/generator/unittests/utilities/test_gen_obj_idhash_series.py b/generator/unittests/utilities/test_gen_obj_idhash_series.py index e94bf44..18faa86 100644 --- a/generator/unittests/utilities/test_gen_obj_idhash_series.py +++ b/generator/unittests/utilities/test_gen_obj_idhash_series.py @@ -22,23 +22,23 @@ n_user_ids = cons.unittest_n_entities fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1] fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1] -fpath_countrieseurope = '.' + cons.fpath_countrieseurope.split(cons.fpath_repo_dir)[1] -fpath_domain_email = '.' + cons.fpath_domain_email.split(cons.fpath_repo_dir)[1] +fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] +fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] fpath_smartphones = '.' 
+ cons.fpath_smartphones.split(cons.fpath_repo_dir)[1] random.seed(cons.unittest_seed) np.random.seed(cons.unittest_seed) # create user object -user_object = User(n_user_ids=n_user_ids, start_date=start_date, end_date=end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countrieseurope=fpath_countrieseurope, fpath_domain_email=fpath_domain_email) +user_object = User(n_user_ids=n_user_ids, start_date=start_date, end_date=end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) # generate random entity counts random_entity_counts = gen_random_entity_counts(user_obj=user_object) # generate random entity values device_obj = Device(n_device_hashes=random_entity_counts['n_devices'].sum(), fpath_smartphones=fpath_smartphones) # generate user data and device hashes user_data = random_entity_counts.copy() -obs_obj_idhash_series = gen_obj_idhash_series(idhashes_props_dict=device_obj.device_hashes_props_dict, n_counts_series=user_data['n_devices']) -exp_obj_idhash_series = pd.Series([['8c1fd1152fc83030', 'd4f37f7620f0fba2', '565dd55c257aa14d'], ['0bef04bcf232f0f0'], ['bbdcd452b847c0d4'], ['e2b03ec4f60f2f18']]) +obs_obj_idhash_series = gen_obj_idhash_series(idhashes=device_obj.device_hashes, n_counts_series=user_data['n_devices']) +exp_obj_idhash_series = pd.Series([['2e23f63807f6170a'], ['b8816ed926bf9f83', 'b010fdb44fa68822'], ['ff23757073a07357'], ['3d2fd828c1fd1152']]) class Test_gen_idhash_cnt_dict(unittest.TestCase): """""" diff --git a/generator/unittests/utilities/test_gen_random_entity_counts.py b/generator/unittests/utilities/test_gen_random_entity_counts.py index dffdb3d..58a5522 100644 --- a/generator/unittests/utilities/test_gen_random_entity_counts.py +++ b/generator/unittests/utilities/test_gen_random_entity_counts.py @@ -21,17 +21,17 @@ fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1] fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1] -fpath_countrieseurope = '.' + cons.fpath_countrieseurope.split(cons.fpath_repo_dir)[1] -fpath_domain_email = '.' + cons.fpath_domain_email.split(cons.fpath_repo_dir)[1] -user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countrieseurope=fpath_countrieseurope, fpath_domain_email=fpath_domain_email) +fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1] +fpath_email_domain = '.' 
+ cons.fpath_email_domain.split(cons.fpath_repo_dir)[1] +user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain) exp_randomentity_counts_dict = { - 'uid': ['1751409580926382', '6720317315593519', '4264861381989413', '6374692674377254'], - 'n_devices': [3, 1, 1, 1], - 'n_cards': [1, 1, 1, 1], - 'n_ips': [5, 6, 3, 4], - 'n_transactions': [55, 69, 54, 54], - 'n_applications': [3, 10, 28, 6] + 'uid': ['6374692674377254', '6720317315593519', '4264861381989413', '1751409580926382'], + 'n_devices': [1, 2, 1, 1], + 'n_cards': [1, 1, 1, 1], + 'n_ips': [3, 5, 5, 1], + 'n_transactions': [72, 16, 13, 29], + 'n_applications': [4, 2, 3, 5] } exp_randomentity_counts_df = pd.DataFrame.from_dict(exp_randomentity_counts_dict) diff --git a/generator/unittests/utilities/test_gen_shared_idhashes.py b/generator/unittests/utilities/test_gen_shared_idhashes.py index e1e24f2..6afc519 100644 --- a/generator/unittests/utilities/test_gen_shared_idhashes.py +++ b/generator/unittests/utilities/test_gen_shared_idhashes.py @@ -14,8 +14,8 @@ np.random.seed(cons.unittest_seed) obs_prop_shared_idhashes=cons.data_model_shared_entities_dict["ip"] -obs_hash_cnt_dict = gen_idhash_cnt_dict(idhash_type="hash", n=4, lam=1, nbytes=16) -obs_shared_idhashes = gen_shared_idhashes(idhash_cnt_dict=obs_hash_cnt_dict, prop_shared_idhashes=obs_prop_shared_idhashes) +idhashes = list(gen_idhash_cnt_dict(idhash_type="hash", n=4, lam=1, nbytes=16).keys()) +obs_shared_idhashes = gen_shared_idhashes(idhashes=idhashes, prop_shared_idhashes=obs_prop_shared_idhashes) exp_shared_idhashes = {} class Test_gen_shared_idhashes(unittest.TestCase): diff --git a/generator/utilities/Bedrock.py b/generator/utilities/Bedrock.py index ab2fa79..9ea42d6 100644 --- a/generator/utilities/Bedrock.py +++ b/generator/utilities/Bedrock.py @@ -4,29 +4,86 @@ class Bedrock(): """ + Bedrock AWS API client wrapper for invoking language models. + This class provides a simplified interface to interact with AWS Bedrock runtime, + enabling prompt-based interactions with language models like Llama 3. + + Parameters + ---------- + session : boto3.Session + A Boto3 session object configured with appropriate AWS credentials. + model_region: str + The AWS region where the Bedrock model is hosted. + model_id: str + The identifier of the Bedrock model to use. + + Attributes + ---------- + client: boto3.Session.client + Boto3 Bedrock runtime client for model invocation. + model_id: str + The identifier of the Bedrock model to use. 
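+ + Examples + -------- + A minimal construction sketch; the session credentials shown are placeholders. + ``` + import boto3 + session = boto3.Session(aws_access_key_id="...", aws_secret_access_key="...") + bedrock = Bedrock(session=session, model_region="us-east-1", model_id="meta.llama3-8b-instruct-v1:0") + ```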
+ + References ---------- https://docs.aws.amazon.com/general/latest/gr/bedrock.html """ @beartype def __init__( - self, + self, session:boto3.Session, model_region="us-east-1", - model_id:str="meta.llama3-8b-instruct-v1:0" + model_id:str="meta.llama3-8b-instruct-v1:0", ): self.client = session.client("bedrock-runtime", region_name=model_region) self.model_id = model_id @beartype def prompt( self, - prompt:str, - system:str="", + user_prompt:str, + system_prompt:str="", top_p:float=0.5, temperature:float=0.5, - max_gen_len:int=512 + max_gen_len:int=512, ) -> str: - # generate bedrock request - formatted_prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system}<|eot_id|><|start_header_id|>user<|end_header_id|>{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>""" + """ + Invoke the Bedrock model with the provided prompts and generation parameters. + + Formats the user and system prompts according to the Llama 3 chat template, + sends a request to the configured Bedrock model, and returns the generated response. + + Parameters + ---------- + user_prompt : str + The main prompt or query to send to the model. + system_prompt : str, optional + System-level instructions for the model behavior. Defaults to "". + top_p : float, optional + Nucleus sampling parameter controlling diversity. Defaults to 0.5. + temperature : float, optional + Temperature parameter controlling randomness. Defaults to 0.5. + max_gen_len : int, optional + Maximum length of the generated response. Defaults to 512. + + Returns + ------- + str + The generated text response from the Bedrock model. + + Raises + ------ + Exception: If the model invocation fails. + + Examples + -------- + ``` + bedrockModel = Bedrock(session=boto3.Session(...), model_region="us-east-1") + bedrockModel.prompt(user_prompt="Who was the first president of the United States?", system_prompt="You are a helpful assistant.", max_gen_len=100) + ``` + """ + # generate bedrock request payload + formatted_prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>{user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>""" native_request = {"prompt": formatted_prompt, "max_gen_len": max_gen_len, "temperature": temperature, "top_p":top_p} request = json.dumps(native_request) # call bedrock model @@ -34,14 +91,13 @@ def prompt( # Invoke the model with the request. response = self.client.invoke_model(modelId=self.model_id, body=request) except Exception as e: - print(f"ERROR: Can't invoke '{self.model_id}'. Reason: {e}") - exit(1) + raise Exception(f"ERROR: Can't invoke '{self.model_id}'. Reason: {e}") # Decode and extract the response model_response = json.loads(response["body"].read()) response_text = model_response["generation"] - return(response_text) + return response_text -system = """# Task +system_prompt = """# Task You are a name generator for people from different countries in Europe. Your task is to generate an arbitrary N number of distinct and varied first names and last names for people from a given European country of origin.
diff --git a/generator/utilities/align_country_codes.py b/generator/utilities/align_country_codes.py index 3c1646e..e27173b 100644 --- a/generator/utilities/align_country_codes.py +++ b/generator/utilities/align_country_codes.py @@ -6,25 +6,32 @@ @beartype def align_country_codes( series:pd.Series, - proba_comm_ip:float=0.95, - proba_comm_card:float=0.99 + proba_comm_ip:float=0.05, + proba_comm_card:float=0.01, ) -> pd.Series: """ Aligns inconsistent registration, ip and card country codes to have mostly common values, with a random chance of inconsistencies. - + Parameters ---------- series : pandas.Series A series from the random transaction dataframe with inconsistent country codes to align. proba_comm_ip : float - The probability of a common / shared registration country code and ip country code. + The probability of retaining a distinct ip country code rather than aligning it to the registration country code, default is 0.05. proba_comm_card : float - The probability of a common / shared registration country code and card country code. - + The probability of retaining a distinct card country code rather than aligning it to the registration country code, default is 0.01. + Returns ------- pandas.Series A pandas series containing only the aligned country codes; registration, ip and card. + + Examples + -------- + ``` + series = pd.Series({'registration_country_code_alpha': 353.0, 'ip_country_code_alpha': 42.0, 'card_country_code_alpha': 42.0}) + align_country_codes(series=series, proba_comm_ip=0.05, proba_comm_card=0.01,) + ``` """ # generate random value between 0 and 1 random_unif = random.uniform(0, 1) @@ -33,16 +40,16 @@ def align_country_codes( ip_country_code = series["ip_country_code_alpha"] card_country_code = series["card_country_code_alpha"] # determine shared or new ip country code - if ip_country_code == ip_country_code: - if random_unif >= proba_comm_ip: + if pd.notna(ip_country_code): + if random_unif <= proba_comm_ip: new_ip_country_code = ip_country_code else: new_ip_country_code = registration_country_code else: new_ip_country_code = np.nan # determine shared or new card country code - if card_country_code == card_country_code: - if random_unif >= proba_comm_card: + if pd.notna(card_country_code): + if random_unif <= proba_comm_card: new_card_country_code = card_country_code else: new_card_country_code = registration_country_code diff --git a/generator/utilities/cnt2prop_dict.py b/generator/utilities/cnt2prop_dict.py index 45ef11f..937c32a 100644 --- a/generator/utilities/cnt2prop_dict.py +++ b/generator/utilities/cnt2prop_dict.py @@ -1,27 +1,36 @@ from beartype import beartype +import numpy as np +from typing import Dict, Union @beartype def cnt2prop_dict( - idhash_cnt_dict:dict - ) -> dict: + idhashes_cnts_dict:Dict[Union[str, int], Union[int,np.int64]], + ) -> Dict[Union[str, int], float]: """ Converts a dictionary of counts to a dictionary of proportions. - + Parameters ---------- - idhash_cnt_dict : dict + idhashes_cnts_dict : Dict[Union[str, int], Union[int,np.int64]] A dictionary of key, value pairs where the value indicates a count. - + Returns ------- - dict + Dict[Union[str, int], float] A dictionary of key, value pairs where the value indicates a proportion.
+ + Examples + -------- + ``` + idhashes_cnts_dict = {'7125135c8882b0f6': 2, '049dd291d9506532': 3, 'd6708d344cb6f498': 5} + prop_dict = cnt2prop_dict(idhashes_cnts_dict=idhashes_cnts_dict) + ``` """ # empty dictionary for proportions prop_dict = {} - # sum of dictionary counts - cnt_total = sum(idhash_cnt_dict.values()) - # iterate over input dictionary and convert counts to proportions - for idhash, cnt in idhash_cnt_dict.items(): - prop_dict[idhash] = cnt / cnt_total + if idhashes_cnts_dict != {}: + # sum of dictionary counts + cnt_total = sum(idhashes_cnts_dict.values()) + # iterate over input dictionary and convert counts to proportions + prop_dict = {idhash: cnt / cnt_total for idhash, cnt in idhashes_cnts_dict.items()} return prop_dict diff --git a/generator/utilities/commandline_interface.py b/generator/utilities/commandline_interface.py index a8de020..0b72757 100644 --- a/generator/utilities/commandline_interface.py +++ b/generator/utilities/commandline_interface.py @@ -1,31 +1,45 @@ -import argparse -from beartype import beartype import cons -@beartype -def commandline_interface() -> dict: +import argparse +from typing import Dict + +def commandline_interface() -> Dict[str, object]: """ A commandline interface for parsing input parameters with - + Windows python RandomTeleComData\\generator\\main.py --n_users 100 --random_seed 1 --n_itr 2 - + Linux python3 RandomTeleComData/generator/main.py --n_users 100 --random_seed 1 --n_itr 2 - + Parameters ---------- - + n_users : int + The number of users to generate random telecom payments data for. + use_random_seed : int + Use a set random seed for reproducible results; must be either 0 or 1. + n_itr : int + Number of iterations to run. + registration_start_date : str + The start date for registrations. + registration_end_date : str + The end date for registrations. + transaction_start_date : str + The start date for transactions. + transaction_end_date : str + The end date for transactions. + Returns ------- - dict + Dict[str, object] A dictionary of key, value pairs where the values are parsed input parameters. 
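+ + Examples + -------- + A minimal usage sketch; when run with no command line arguments the defaults from cons apply. + ``` + input_params_dict = commandline_interface() + n_users = input_params_dict["n_users"] + ```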
""" # define argument parser object parser = argparse.ArgumentParser(description="Execute Random TeleCom Data Programme.") # add input arguments parser.add_argument("--n_users", action="store", dest="n_users", type=int, default=cons.default_n_users, help="Integer, the number of users to generate random telecom payments data for",) - parser.add_argument("--use_random_seed", action="store", dest="use_random_seed", type=int, default=cons.default_use_random_seed, help="Integer, use a set random seed for reproducible results; must be either 0 or 1",) + parser.add_argument("--use_random_seed", action="store", dest="use_random_seed", type=int, default=cons.default_use_random_seed, choices=[0, 1], help="Integer, use a set random seed for reproducible results; must be either 0 or 1",) parser.add_argument("--n_itr", action="store", dest="n_itr", type=int, default=cons.default_n_itr, help="Integer, number of iterations to run",) parser.add_argument("--registration_start_date", action="store", dest="registration_start_date", type=str, default=cons.default_registration_start_date, help="String, the start date for registrations",) parser.add_argument("--registration_end_date", action="store", dest="registration_end_date", type=str, default=cons.default_registration_end_date, help="String, the end date for registrations",) diff --git a/generator/utilities/gen_country_codes_dict.py b/generator/utilities/gen_country_codes_dict.py index c36d9fa..4aeac49 100644 --- a/generator/utilities/gen_country_codes_dict.py +++ b/generator/utilities/gen_country_codes_dict.py @@ -1,47 +1,63 @@ import cons +from utilities.cnt2prop_dict import cnt2prop_dict + +import os import numpy as np import pandas as pd -from utilities.cnt2prop_dict import cnt2prop_dict from beartype import beartype +from typing import Dict, Union, List @beartype def gen_country_codes_dict( - idhashes_cnts_dict:dict, - fpath_countrieseurope:str=cons.fpath_countrieseurope - ) -> dict: + idhashes:List[str], + fpath_countries_europe:str=cons.fpath_countries_europe, + ) -> Dict[str, Union[int, np.int64]]: """ - Generates a dictionary of random country codes for an input dictionary of idhashes counts. - + Generates a dictionary of randomLy sampled country codes for an input list of idhashes. + Parameters ---------- - idhashes_cnts_dict : dict - A dictionary of idhashes counts. - fpath_countrieseurope : str - The file path to the european countries reference file, default is cons.fpath_countrieseurope. - + idhashes : List[str] + A list of idhashes. + fpath_countries_europe : str + The file path to the european countries reference file, default is cons.fpath_countries_europe. + Returns ------- - dict + Dict[str, Union[int, np.int64]] A dictionary of idhashes country codes. 
+ + Examples + -------- + ``` + import cons + idhashes = ['abcd1234', 'defg4567', 'ghij7891'] + gen_country_codes_dict(idhashes=idhashes, + fpath_countries_europe=cons.fpath_countries_europe, + ) + ``` """ - + # check file path exists + if os.path.exists(fpath_countries_europe) == False: + raise FileNotFoundError(f"File not found: {fpath_countries_europe}") # load population data of european countries - european_populations_cnt_data = pd.read_csv(filepath_or_buffer=fpath_countrieseurope, usecols=["ISO numeric", "population"],) + european_populations_cnt_data = pd.read_csv(filepath_or_buffer=fpath_countries_europe, usecols=["ISO numeric", "population"],) # convert to a dictionary of ISO country codes with population counts - european_populations_cnt_dict = european_populations_cnt_data.set_index("ISO numeric").to_dict()["population"] + european_populations_cnt_dict = european_populations_cnt_data.set_index("ISO numeric")["population"].to_dict() # convert dictionary of population counts to dictionary of population proportions european_populations_props_dict = cnt2prop_dict(european_populations_cnt_dict) - # extract out idhashes from idhashes counts dictionary - idhashes_list = list(idhashes_cnts_dict.keys()) + # check population proportions sum to 1.0 + if np.isclose(sum(european_populations_props_dict.values()), 1.0) == False: + raise ValueError("Population proportions do not sum to 1.0") # randomly generate country codes for all idhashes based on population proportions country_codes_list = list( np.random.choice( a=list(european_populations_props_dict.keys()), p=list(european_populations_props_dict.values()), replace=True, - size=len(idhashes_list), + size=len(idhashes), ) ) # return a dictionary of idhashes and country codes - idhashes_country_codes = dict(zip(idhashes_list, country_codes_list)) + idhashes_country_codes = dict(zip(idhashes, country_codes_list)) return idhashes_country_codes diff --git a/generator/utilities/gen_country_codes_map.py b/generator/utilities/gen_country_codes_map.py index d6254ff..160a5a7 100644 --- a/generator/utilities/gen_country_codes_map.py +++ b/generator/utilities/gen_country_codes_map.py @@ -1,26 +1,36 @@ import cons + +import numpy as np import pandas as pd from beartype import beartype +from typing import Dict, Union @beartype def gen_country_codes_map( - fpath_countrieseurope:str=cons.fpath_countrieseurope - ) -> dict: + fpath_countries_europe:str=cons.fpath_countries_europe, + ) -> Dict[int, Union[str, np.int64]]: """ Generates a dictionary of ISO numeric codes mapping to ISO alpha codes. - + Parameters ---------- - fpath_countrieseurope : str - The full file path to the european countries reference file, default is cons.fpath_countrieseurope. - + fpath_countries_europe : str + The full file path to the european countries reference file, default is cons.fpath_countries_europe. + Returns ------- - dict + Dict[int, Union[str, np.int64]] A dictionary of ISO numeric codes mapping to ISO alpha codes.
+ + Examples + -------- + ``` + import cons + gen_country_codes_map(fpath_countries_europe=cons.fpath_countries_europe) + ``` """ # load european country codes data - country_codes_data = pd.read_csv(filepath_or_buffer=fpath_countrieseurope, usecols=["ISO numeric", "ISO alpha 2"],) + country_codes_data = pd.read_csv(filepath_or_buffer=fpath_countries_europe, usecols=["ISO numeric", "ISO alpha 2"],) # convert data to a dictionary of ISO numeric codes mapping to ISO alpha codes - country_codes_map = country_codes_data.set_index("ISO numeric").to_dict()["ISO alpha 2"] + country_codes_map = country_codes_data.set_index("ISO numeric")["ISO alpha 2"].to_dict() return country_codes_map diff --git a/generator/utilities/gen_dates_dict.py b/generator/utilities/gen_dates_dict.py index 05f29f9..b02bca0 100644 --- a/generator/utilities/gen_dates_dict.py +++ b/generator/utilities/gen_dates_dict.py @@ -2,20 +2,21 @@ import numpy as np from datetime import datetime from beartype import beartype +from typing import Dict, Union, List @beartype def gen_dates_dict( - idhashes_cnts_dict:dict, + idhashes:List[str], start_date:str, - end_date:str - ) -> dict: + end_date:str, + ) -> Dict[str, Union[pd.Timestamp, np.datetime64]]: """ - Generates a dictionary of random dates for an input dictionary of idhashes counts. + Generates a dictionary of random dates for an input list of idhashes. Parameters ---------- - idhashes_cnts_dict : dict - A dictionary of idhashes counts. + idhashes : List[str] + A list of idhashes. start_date : str The start date ("%Y-%m-%d") to generate random dates from. end_date : str @@ -23,15 +24,20 @@ def gen_dates_dict( Returns ------- - dict + Dict[str, Union[pd.Timestamp, np.datetime64]] A dictionary of idhashes dates. + + Examples + -------- + ``` + idhashes = ['2e23f63807f6170a', 'b8816ed926bf9f83', 'b010fdb44fa68822'] + gen_dates_dict(idhashes=idhashes, start_date='2020-01-01', end_date='2023-01-01') + ``` """ # generate a range of dates between the given input start and end dates - dates = pd.date_range(start=datetime.strptime(start_date, "%Y-%m-%d"), end=datetime.strptime(end_date, "%Y-%m-%d") - pd.Timedelta(days=1), freq="d",) - # extract out the idhashes from idhashes counts dictionary - idhashes_list = list(idhashes_cnts_dict.keys()) + dates = pd.date_range(start=datetime.strptime(start_date, "%Y-%m-%d"), end=datetime.strptime(end_date, "%Y-%m-%d"), freq="d", inclusive="both",) # randomly sample dates for each of the idhashes - dates_list = list(np.random.choice(a=dates, replace=True, size=len(idhashes_list))) + dates_list = list(np.random.choice(a=dates, replace=True, size=len(idhashes))) # return a dictionary of idhashes and dates - idhashes_dates_dict = dict(zip(idhashes_list, dates_list)) + idhashes_dates_dict = dict(zip(idhashes, dates_list)) return idhashes_dates_dict diff --git a/generator/utilities/gen_idhash_cnt_dict.py b/generator/utilities/gen_idhash_cnt_dict.py index 58e5bda..381897b 100644 --- a/generator/utilities/gen_idhash_cnt_dict.py +++ b/generator/utilities/gen_idhash_cnt_dict.py @@ -1,9 +1,10 @@ -import numpy as np from utilities.gen_random_hash import gen_random_hash from utilities.gen_random_id import gen_random_id from utilities.gen_random_poisson_power import gen_random_poisson_power + +import numpy as np from beartype import beartype -from typing import Union +from typing import Union, Dict @beartype def gen_idhash_cnt_dict( @@ -11,15 +12,15 @@ def gen_idhash_cnt_dict( n:Union[int,np.int64], lam:Union[int,float], nbytes:int=16, - power:int=2 - ) -> 
dict: + power:int=2, + ) -> Dict[str, Union[int, np.int64]]: """ Generates a dictionary of n random idhashes and associated counts. - + Parameters ---------- idhash_type : str - Whether to generate a "id2 or "hash" value. + Whether to generate a "id" or "hash" value. n : int The total number of idhash values to generate. lam : float The lambda of the poisson distribution used to generate the counts. nbytes : int The number bytes to include in the idhash value, default is 16. power : int The power of the polynomial random poisson variable, default is 2. - + Returns ------- - dict + Dict[str, Union[int, np.int64]] A dictionary of idhashes counts. + + Examples + -------- + ``` + gen_idhash_cnt_dict( + idhash_type="hash", + n=10, + lam=5.0, + nbytes=16, + power=2, + ) + ``` """ # if generating a random hash value if idhash_type == "hash": @@ -40,8 +54,12 @@ # else if generating a random id value elif idhash_type == "id": idhash_list = gen_random_id(size=n, nbytes=nbytes) + else: + raise ValueError("idhash_type must be either 'id' or 'hash'") # randomly sample n counts from a squared poisson distribution with given lam value - cnts_list = list(gen_random_poisson_power(lam=lam, size=n, power=power)) + cnts_list = gen_random_poisson_power(lam=lam, size=n, power=power).tolist() # return a dictionary of idhashes and counts + if len(idhash_list) != len(set(idhash_list)): + raise ValueError("Generated idhash values are not unique, please increase nbytes value") idhash_dict = dict(zip(idhash_list, cnts_list)) return idhash_dict diff --git a/generator/utilities/gen_obj_idhash_series.py b/generator/utilities/gen_obj_idhash_series.py index cb15463..8900b04 100644 --- a/generator/utilities/gen_obj_idhash_series.py +++ b/generator/utilities/gen_obj_idhash_series.py @@ -1,28 +1,37 @@ import pandas as pd from beartype import beartype +from typing import List @beartype def gen_obj_idhash_series( - idhashes_props_dict:dict, - n_counts_series:pd.Series + idhashes:List[str], + n_counts_series:pd.Series, ) -> pd.Series: """ - Generates a series of entity idhash lists using the entity counts per user Series and idhashes proportions dictionary. - + Generates a series of entity idhash lists using the entity counts per user Series and idhashes list. + Parameters ---------- - idhashes_props_dict : dict - The idhash proportions dictionary. + idhashes : List[str] + The idhashes list. n_counts_series : pd.Series The entity counts for each uid as Series. - + Returns ------- pd.Series A Series of lists containing entity idhashes for each user.
+ + Examples + -------- + ``` + idhashes = ['2e23f63807f6170a', 'b8816ed926bf9f83', 'b010fdb44fa68822'] + n_counts_series = pd.Series(data=[2, 1], index=range(2), name='n_entities') + gen_obj_idhash_series(idhashes=idhashes, n_counts_series=n_counts_series) + ``` """ # create an exploded series for idhashes within the entity object - obj_idhash_series = pd.Series(data=idhashes_props_dict.keys(), index=n_counts_series.apply(lambda x: range(x)).explode().index) + obj_idhash_series = pd.Series(data=idhashes, index=n_counts_series.index.repeat(n_counts_series.values).to_list()) # group by uid index and collate idhashes as lists - obj_idhash_agg = obj_idhash_series.groupby(level=0).apply(lambda series: series.to_list()) + obj_idhash_agg = obj_idhash_series.groupby(level=0).apply(list) return obj_idhash_agg \ No newline at end of file diff --git a/generator/utilities/gen_random_entity_counts.py b/generator/utilities/gen_random_entity_counts.py index 4490b0e..7b18b49 100644 --- a/generator/utilities/gen_random_entity_counts.py +++ b/generator/utilities/gen_random_entity_counts.py @@ -1,40 +1,46 @@ -import numpy as np -import pandas as pd import cons from objects.User import User from utilities.gen_random_poisson_power import gen_random_poisson_power + +import numpy as np +import pandas as pd from beartype import beartype @beartype def gen_random_entity_counts( user_obj:User, - transaction_timescale:float=1.0 + transaction_timescale:float=1.0, ) -> pd.DataFrame: """ Generates a dataframe of entity counts for all users from a given user object. - + Parameters ---------- user_obj : User The User class object. transaction_timescale : float The transaction timescale where 1.0 is a single year of transactions, default is 1.0. - + Returns ------- pd.DataFrame A dataframe of entity counts for all users from the specified user object.
+ + Examples + -------- + ``` + import cons + from objects.User import User + user_obj = User(n_user_ids=1000, start_date='2020-01-01', end_date='2023-01-01', fpath_firstnames=cons.fpath_llama_firstnames, fpath_lastnames=cons.fpath_llama_lastnames, fpath_countries_europe=cons.fpath_countries_europe, fpath_email_domain=cons.fpath_email_domain) + gen_random_entity_counts(user_obj=user_obj, transaction_timescale=1.0) + ``` """ # create an empty pandas dataframe to hold the random aggregated data random_entity_counts = pd.DataFrame() # randomly sample from the random user uids - random_entity_counts['uid'] = np.random.choice(a = list(user_obj.user_ids_props_dict.keys()), size = user_obj.n_user_ids, replace = False) + random_entity_counts["uid"] = np.random.choice(a=user_obj.user_ids, size=user_obj.n_user_ids, replace=False) # randomly simulate the number of entities per user - random_entity_counts['n_devices'] = gen_random_poisson_power(lam = cons.data_model_poisson_params["device"]["lambda"], size = user_obj.n_user_ids, power = cons.data_model_poisson_params["device"]["power"]) - random_entity_counts['n_cards'] = gen_random_poisson_power(lam = cons.data_model_poisson_params["card"]["lambda"], size = user_obj.n_user_ids, power = cons.data_model_poisson_params["card"]["power"]) - random_entity_counts['n_ips'] = gen_random_poisson_power(lam = cons.data_model_poisson_params["ip"]["lambda"], size = user_obj.n_user_ids, power = cons.data_model_poisson_params["ip"]["power"]) - random_entity_counts['n_transactions'] = gen_random_poisson_power(lam = cons.data_model_poisson_params["transaction"]["lambda"], size = user_obj.n_user_ids, power = cons.data_model_poisson_params["transaction"]["power"]) - random_entity_counts['n_applications'] = gen_random_poisson_power(lam = cons.data_model_poisson_params["application"]["lambda"], size = user_obj.n_user_ids, power = cons.data_model_poisson_params["application"]["power"]) - # scale n transactions by - random_entity_counts['n_transactions'] = (random_entity_counts['n_transactions'] * transaction_timescale).round().astype(int) + for object_type in cons.object_types: + random_entity_counts[f"n_{object_type}s"] = gen_random_poisson_power(lam = cons.data_model_poisson_params[object_type]["lambda"], size = user_obj.n_user_ids, power = cons.data_model_poisson_params[object_type]["power"]) + # scale n transactions by the transaction timescale + random_entity_counts["n_transactions"] = (random_entity_counts["n_transactions"] * transaction_timescale).round().astype(int) return random_entity_counts diff --git a/generator/utilities/gen_random_hash.py b/generator/utilities/gen_random_hash.py index e7c6f98..2cec880 100644 --- a/generator/utilities/gen_random_hash.py +++ b/generator/utilities/gen_random_hash.py @@ -1,30 +1,36 @@ import string import numpy as np from beartype import beartype -from typing import Union +from typing import Union, List @beartype def gen_random_hash( size:Union[int,np.int64], - nbytes:int=16 - ) -> list: + nbytes:int=16, + ) -> List[str]: """ Generates a list of random hashes. - + Parameters ---------- size : int The total number of hashes to generate. nbytes : int The number of alphanumeric values in each hash, default is 16. - + Returns ------- list A list of random hashes.
+ + Examples + -------- + ``` + gen_random_hash(size=5, nbytes=16) + ``` """ # generate the list of hexadecimal characters (digits and letters a-f) from the string library alphanumeric = list(string.digits) + list(string.ascii_lowercase)[:6] # randomly sample nbytes characters per hash and string concatenate - random_hashes = ["".join(np.random.choice(a=alphanumeric, size=nbytes, replace=True)) for i in range(size)] + random_hashes = [''.join(row) for row in np.random.choice(a=alphanumeric, size=(size, nbytes), replace=True).tolist()] return random_hashes diff --git a/generator/utilities/gen_random_id.py b/generator/utilities/gen_random_id.py index a6e8c8f..43d1f5a 100644 --- a/generator/utilities/gen_random_id.py +++ b/generator/utilities/gen_random_id.py @@ -1,29 +1,36 @@ import string import numpy as np from beartype import beartype +from typing import Union, List @beartype def gen_random_id( - size:int, - nbytes:int=16 - ) -> list: + size:Union[int,np.int64], + nbytes:int=16, + ) -> List[str]: """ Generates a list of random ids. - + Parameters ---------- size : int The total number of ids to generate. nbytes : int The number of numeric values in each id, default is 16. - + Returns ------- list A list of random ids. + + Examples + -------- + ``` + gen_random_id(size=5, nbytes=16) + ``` """ # generate a list of digits from string library digits = list(string.digits) - # randomly sample nbytes digits, string concatenate and convert to integers - random_ids = ["".join(np.random.choice(a=digits, size=nbytes, replace=True))for i in range(size)] + # randomly sample nbytes digits, string concatenate + random_ids = ["".join(row) for row in np.random.choice(a=digits, size=(size, nbytes), replace=True).tolist()] return random_ids diff --git a/generator/utilities/gen_random_poisson_power.py b/generator/utilities/gen_random_poisson_power.py index e3d64ca..383b9b6 100644 --- a/generator/utilities/gen_random_poisson_power.py +++ b/generator/utilities/gen_random_poisson_power.py @@ -6,24 +6,30 @@ def gen_random_poisson_power( lam:Union[int,float], size:Union[int,np.int64], - power:int + power:int, ) -> np.ndarray: """ Generates data from a polynomial random poisson variable to a given power. - + Parameters ---------- - lam : int + lam : int or float The lambda of the underlying poisson random variable. size : int The number of values to generate. power : int The power of the polynomial sum. - + Returns ------- numpy.ndarray - The random squared poisson values. + The random sum of powered poisson values. + + Examples + -------- + ``` + gen_random_poisson_power(lam=3.0, size=10, power=2) + ``` """ # randomly generate a polynomial sum of poisson variables to the given power a = np.array([np.random.poisson(lam, size) ** p for p in range(1, power+1)]).sum(axis = 0) + 1 diff --git a/generator/utilities/gen_shared_idhashes.py b/generator/utilities/gen_shared_idhashes.py index aa72f7b..e2901ed 100644 --- a/generator/utilities/gen_shared_idhashes.py +++ b/generator/utilities/gen_shared_idhashes.py @@ -1,41 +1,47 @@ import numpy as np import pandas as pd from beartype import beartype +from typing import Dict, Union, List @beartype def gen_shared_idhashes( - idhash_cnt_dict:dict, - prop_shared_idhashes:float - ) -> dict: + idhashes:List[str], + prop_shared_idhashes:float, + ) -> Dict[str, str]: """ Generates a dictionary mapping idhashes to shared idhashes. - + Parameters ---------- - idhashes_cnts_dict : dict - A dictionary of idhashes counts. + idhashes : list of str + A list of idhashes. prop_shared_idhashes : float The total proportion of shared idhashes.
- + Returns ------- - dict - A dictionary of shared idhashes proportion. + Dict[str, str] + A dictionary mapping idhashes to their shared idhashes. + + Examples + -------- + ``` + idhashes=['2e23f63807f6170a', 'b8816ed926bf9f83', 'b010fdb44fa68822'] + gen_shared_idhashes(idhashes=idhashes, prop_shared_idhashes=0.01) + ``` """ # calculate the total number of idhashes - n_idhashes = len(idhash_cnt_dict) + n_idhashes = len(idhashes) # randomly sample the idhashes based on the total proportion of shared idhashes - shared_idhashes_list = list( - np.random.choice( - a=list(idhash_cnt_dict.keys()), - size=int(np.round(n_idhashes * prop_shared_idhashes)), - replace=False - ) - ) + shared_idhashes_list = np.random.choice( + a=idhashes, + size=int(np.round(n_idhashes * prop_shared_idhashes)), + replace=False + ).tolist() shared_idhash_map_dict = {} - if shared_idhashes_list != []: + if (shared_idhashes_list != []): # determine how many networks - n_groups = int(np.floor(np.sqrt(len(shared_idhashes_list)))) + n_groups = int(np.ceil(np.sqrt(len(shared_idhashes_list)))) group_uniform_dict = {g:np.random.uniform() for g in range(n_groups)} group_prop_dict = {key:value/sum(group_uniform_dict.values()) for key, value in group_uniform_dict.items()} # generate groups for all shared id hashes @@ -43,7 +49,7 @@ shared_idhashes_groups_dict = dict(zip(shared_idhashes_list, shared_idhashes_groups_list)) shared_idhashes_groups_df = pd.Series(shared_idhashes_groups_dict, name="shared_idhashes_group").to_frame().reset_index().rename(columns={'index':'idhash'}) shared_entity_groups_dict = shared_idhashes_groups_df.groupby('shared_idhashes_group').agg({'idhash':list}).to_dict()['idhash'] - shared_idhashes_groups_df['shared_idhash'] = shared_idhashes_groups_df.apply(lambda series: np.random.choice(a=shared_entity_groups_dict[series['shared_idhashes_group']]), axis=1) + shared_idhashes_groups_df['shared_idhash'] = [np.random.choice(shared_entity_groups_dict[group]) for group in shared_idhashes_groups_df['shared_idhashes_group']] # create the shared idhash map dictionary shared_idhash_map_dict = shared_idhashes_groups_df.set_index('idhash')['shared_idhash'].to_dict() return shared_idhash_map_dict diff --git a/generator/utilities/gen_trans_rejection_rates.py b/generator/utilities/gen_trans_rejection_rates.py index af8ab6e..f24ce63 100644 --- a/generator/utilities/gen_trans_rejection_rates.py +++ b/generator/utilities/gen_trans_rejection_rates.py @@ -1,70 +1,72 @@ -import pandas as pd import cons + +import pandas as pd from beartype import beartype +from typing import Dict @beartype def gen_trans_rejection_rates( trans_data:pd.DataFrame, - fpath_countrieseurope=cons.fpath_countrieseurope, - fpath_countrycrimeindex=cons.fpath_countrycrimeindex, - fpath_domain_email=cons.fpath_domain_email - ) -> dict: + fpath_countries_europe:str=cons.fpath_countries_europe, + fpath_countrycrimeindex:str=cons.fpath_countrycrimeindex, + fpath_email_domain:str=cons.fpath_email_domain, + ) -> Dict[str, Dict[str, float]]: """ Generates the transaction rejection rates based on features within the transaction level telecom payments data. - + Parameters ---------- trans_data : pandas.DataFrame The transaction level telecom payments data. - fpath_countrieseurope : str - The file path to the europe countries reference data, default is cons.fpath_countrieseurope. + fpath_countries_europe : str + The file path to the European countries reference data, default is cons.fpath_countries_europe.
fpath_countrycrimeindex : str The file path to the country crime index reference data, default is cons.fpath_countrycrimeindex. - fpath_domain_email :str - The file path to the email domains reference data, default is cons.fpath_domain_email. - + fpath_email_domain :str + The file path to the email domains reference data, default is cons.fpath_email_domain. + Returns ------- dict The rejection rates based on features within the transaction level telecom payments data. """ - # create empty dictionary to hold rejection rates + # initialize dictionary to store all computed rejection rates rejection_rates_dict = {} - + # generate country code rejection based rates - countrieseurope = pd.read_csv(fpath_countrieseurope, usecols=["ISO numeric", "ISO alpha 2"]) + countrieseurope = pd.read_csv(fpath_countries_europe, usecols=["ISO alpha 2"]) countrycrimeindex = pd.read_csv(fpath_countrycrimeindex, usecols=["country_code", "crime_index"]) europecountrycrimeindex = pd.merge(left=countrieseurope, right=countrycrimeindex, left_on="ISO alpha 2", right_on="country_code", how="left",) europecountrycrimeindex["trans_reject_rate"] = europecountrycrimeindex["crime_index"].divide(europecountrycrimeindex["crime_index"].sum()) country_code_trans_reject_rate_dict = europecountrycrimeindex.set_index("ISO alpha 2")["trans_reject_rate"].to_dict() rejection_rates_dict["country_code_trans_reject_rate_dict"] = country_code_trans_reject_rate_dict - + # generate domain email rejection based rates - domain_email = pd.read_csv(fpath_domain_email, usecols=["domain", "proportion"]) + domain_email = pd.read_csv(fpath_email_domain, usecols=["domain", "proportion"]) domain_email["trans_reject_rate"] = (1 - domain_email["proportion"]) / (1 - domain_email["proportion"]).sum() domain_email_trans_reject_rate_dict = domain_email.set_index("domain")["trans_reject_rate"].to_dict() rejection_rates_dict["domain_email_trans_reject_rate_dict"] = domain_email_trans_reject_rate_dict - + # generate shared entities with rejection rates dictionary shared_devices = (trans_data.groupby(by="device_hash").agg({"userid": "nunique"}).sort_values(by="userid")) shared_ips = (trans_data.groupby(by="ip_hash").agg({"userid": "nunique"}).sort_values(by="userid")) shared_cards = (trans_data.groupby(by="card_hash").agg({"userid": "nunique"}).sort_values(by="userid")) - shared_devices_reject_rate_dict = shared_devices.divide(shared_devices["userid"].sum()).to_dict()["userid"] + shared_devices_reject_rate_dict = shared_devices.divide(shared_devices["userid"].sum())["userid"].to_dict() shared_ips_reject_rate_dict = shared_ips.divide(shared_ips["userid"].sum()).to_dict()["userid"] shared_cards_reject_rate_dict = shared_cards.divide(shared_cards["userid"].sum()).to_dict()["userid"] rejection_rates_dict["shared_devices_reject_rate_dict"] = shared_devices_reject_rate_dict rejection_rates_dict["shared_ips_reject_rate_dict"] = shared_ips_reject_rate_dict rejection_rates_dict["shared_cards_reject_rate_dict"] = shared_cards_reject_rate_dict - + # generate occurrence based rejection rates count_devices = (trans_data.groupby(by="userid").agg({"device_hash": "nunique"}).sort_values(by="device_hash")) count_ips = (trans_data.groupby(by="userid").agg({"ip_hash": "nunique"}).sort_values(by="ip_hash")) count_cards = (trans_data.groupby(by="userid").agg({"card_hash": "nunique"}).sort_values(by="card_hash")) - count_devices_reject_rate_dict = count_devices.divide(count_devices["device_hash"].sum()).to_dict()["device_hash"] + count_devices_reject_rate_dict = 
count_devices.divide(count_devices["device_hash"].sum())["device_hash"].to_dict() count_ips_reject_rate_dict = count_ips.divide(count_ips["ip_hash"].sum()).to_dict()["ip_hash"] count_cards_reject_rate_dict = count_cards.divide(count_cards["card_hash"].sum()).to_dict()["card_hash"] rejection_rates_dict["count_devices_reject_rate_dict"] = count_devices_reject_rate_dict rejection_rates_dict["count_ips_reject_rate_dict"] = count_ips_reject_rate_dict rejection_rates_dict["count_cards_reject_rate_dict"] = count_cards_reject_rate_dict - + return rejection_rates_dict diff --git a/generator/utilities/gen_trans_status.py b/generator/utilities/gen_trans_status.py index 492d88d..35fbbd3 100644 --- a/generator/utilities/gen_trans_status.py +++ b/generator/utilities/gen_trans_status.py @@ -1,36 +1,38 @@ +import cons + import random import numpy as np import pandas as pd -import cons from beartype import beartype +from typing import List, Dict, Union @beartype def gen_trans_status( series:pd.Series, - rejection_rates_dict:dict, - rejection_scaling_factor:int=2 - ) -> list: + rejection_rates_dict:Dict[str, Dict[str, float]], + rejection_scaling_factor:int=2, + ) -> List[Union[str, float]]: """ Generates the transaction status for a pandas series from the transaction level telecom payments data given the rejection rates dictionary from the same data. - + Parameters ---------- series : pandas.Series A pandas series from the transaction level telecom payments data. - rejection_rates_dict : dict + rejection_rates_dict : Dict[str, Dict[str, float]] Rejection rates generated from the transaction level telecom payments data. rejection_scaling_factor : int A multiplicative scaling factor for rejection rates, default is 2. - + Returns ------- - list - The transaction status for the pandas series. + List[Union[str, float]] + The transaction status and error code.
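+ + Examples + -------- + A minimal sketch; trans_data is assumed to be the transaction level dataframe, and the returned list is assumed to be [status, error_code]. + ``` + rejection_rates_dict = gen_trans_rejection_rates(trans_data=trans_data) + status, error_code = gen_trans_status(series=trans_data.iloc[0], rejection_rates_dict=rejection_rates_dict, rejection_scaling_factor=2) + ```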
""" # set country code columns country_code_columns = ["registration_country_code","ip_country_code","card_country_code"] - - if series['card_hash'] == series['card_hash']: + # if card hash + if pd.notna(series['card_hash']): status = "rejected" # add rejections based on crime rates within country codes if rejection_rates_dict["country_code_trans_reject_rate_dict"][np.random.choice(a=series[country_code_columns].dropna().to_list(), size=1)[0]] >= random.uniform(0, 1)/rejection_scaling_factor: @@ -42,7 +44,7 @@ def gen_trans_status( elif cons.data_model_inconsistent_country_codes_rejection_rate[series[country_code_columns].dropna().nunique()] >= random.uniform(0, 1)/rejection_scaling_factor: error_code = np.random.choice(a=list(cons.data_model_rejection_codes_connection.keys()),p=list(cons.data_model_rejection_codes_connection.values()),size=1)[0] # add rejections based on shared ips, cards and devices - elif series["device_hash"] == series["device_hash"] and rejection_rates_dict["shared_devices_reject_rate_dict"][series["device_hash"]] >= random.uniform(0, 1)/rejection_scaling_factor: + elif pd.notna(series["device_hash"]) and rejection_rates_dict["shared_devices_reject_rate_dict"][series["device_hash"]] >= random.uniform(0, 1)/rejection_scaling_factor: error_code = np.random.choice(a=list(cons.data_model_rejection_codes_fraud.keys()),p=list(cons.data_model_rejection_codes_fraud.values()),size=1)[0] elif series["ip_hash"] == series["ip_hash"] and rejection_rates_dict["shared_ips_reject_rate_dict"][series["ip_hash"]] >= random.uniform(0, 1)/rejection_scaling_factor: error_code = np.random.choice(a=list(cons.data_model_rejection_codes_fraud.keys()),p=list(cons.data_model_rejection_codes_fraud.values()),size=1)[0] @@ -57,7 +59,9 @@ def gen_trans_status( error_code = np.random.choice(a=list(cons.data_model_rejection_codes_funds.keys()),p=list(cons.data_model_rejection_codes_funds.values()),size=1)[0] # otherwise return successful status else: - status = np.random.choice(a=['successful', 'pending'], size=1, p=[0.98, 0.02])[0] + successful_status = {key:cons.data_model_transaction_status[key] for key in ['successful', 'pending']} + successful_probs = [value/sum(successful_status.values()) for value in successful_status.values()] + status = np.random.choice(a=list(successful_status.keys()), size=1, p=successful_probs)[0] error_code = np.nan else: status = np.random.choice(a=['successful', 'pending'], size=1, p=[0.98, 0.02])[0] diff --git a/generator/utilities/gen_user_names_file.py b/generator/utilities/gen_user_names_file.py index 8f3dfe6..c8765fe 100644 --- a/generator/utilities/gen_user_names_file.py +++ b/generator/utilities/gen_user_names_file.py @@ -11,20 +11,66 @@ sys.path.append("E:\\GitHub\\RandomTelecomPayments\\generator") import cons -from utilities.Bedrock import Bedrock, prompt, system +from utilities.Bedrock import Bedrock, prompt, system_prompt -def invoke_bedrock(model, n_user_names, country): +def invoke_bedrock( + model:Bedrock, + n_user_names:int, + country:str, + countrieseurope:pd.DataFrame, + ) -> tuple[pd.DataFrame, pd.DataFrame]: """ + Invokes the Bedrock model to generate user names for a specified country. + + This function calls the Bedrock model with a formatted prompt to generate first names + and last names for a given country. It processes the model's response, parses the JSON + output, and merges the results with country data. The function deduplicates and standardizes + the name formatting, then persists the data to temporary CSV files. 
+ + Parameters + ---------- + model : Bedrock + The Bedrock model instance used to generate names. + n_user_names : int + The number of user names to generate. + country : str + The country for which to generate names. + countrieseurope : pd.DataFrame + A DataFrame containing country information for merging. + + Returns + ------- + tuple: + A tuple containing two pandas DataFrames: + - tmp_firstname_country_data (pd.DataFrame): DataFrame with deduplicated and standardized first names along with country information. + - tmp_lastname_country_data (pd.DataFrame): DataFrame with deduplicated and standardized last names along with country information. + + Raises + ------ + Exception: If the model response cannot be parsed as JSON (wrapping json.JSONDecodeError). + KeyError: If the expected keys ("firstnames", "lastnames") are missing from the JSON response. + Exception: If the merge with country data fails or file I/O operations encounter errors. + + Notes + ----- + - Names are standardized by converting to lowercase, removing extra whitespace, and applying Unicode normalization using unidecode. + - Duplicate names are removed after each processing step. + - Results are concatenated with any previously generated data for the same country and saved to temporary CSV files if the new data increases the dataset size. + - CSV files are encoded in latin1 format. + """ logging.info("Calling Bedrock ...") # call bedrock model formatted_prompt = prompt.format(n_user_names=n_user_names, country=country) logging.info(formatted_prompt) - model_response = model.prompt(prompt=formatted_prompt, system=system, max_gen_len=2048) + model_response = model.prompt(user_prompt=formatted_prompt, system_prompt=system_prompt, max_gen_len=2048) # split out answer text = model_response.split("")[1].split("")[0] # parse json - record_set = json.loads(text) + try: + record_set = json.loads(text) + except json.JSONDecodeError as e: + raise Exception(f"Error parsing JSON: {e}") logging.info("Processing results ...") # generate pandas dataframe user_firstname_data = pd.Series(record_set["firstnames"], name="firstnames").to_frame().drop_duplicates(subset=["firstnames"]) @@ -52,7 +98,7 @@ def invoke_bedrock(model, n_user_names, country): tmp_firstname_country_data = pd.concat(objs=[tmp_firstname_country_data, llama_firstname_country_data], axis=0, ignore_index=True) tmp_lastname_country_data = pd.concat(objs=[tmp_lastname_country_data, llama_lastname_country_data], axis=0, ignore_index=True) # standardise names formatting - standardise_text_lambda = lambda x: unidecode.unidecode(" ".join(x.lower().strip().split())) if x not in [None, "", np.nan] else x + standardise_text_lambda = lambda x: unidecode.unidecode(" ".join(x.lower().strip().split())) if pd.notna(x) else x tmp_firstname_country_data["firstnames"] = tmp_firstname_country_data["firstnames"].apply(lambda x: standardise_text_lambda(x)) tmp_lastname_country_data["lastnames"] = tmp_lastname_country_data["lastnames"].apply(lambda x: standardise_text_lambda(x)) # deduplicate data @@ -61,22 +107,22 @@ def invoke_bedrock(model, n_user_names, country): # print shapes logging.info(f"tmp_firstname_country_data.shape: {tmp_firstname_country_data.shape}") logging.info(f"tmp_lastname_country_data.shape: {tmp_lastname_country_data.shape}") - # save firstnames names data to temp directory + # save firstnames names data to temp directory (if pairwise firstnames have been created) if tmp_firstname_country_data.shape[0] >= llama_firstname_country_data.shape[0]: 
tmp_firstname_country_data.to_csv(fpath_temp_llama_firstnames, index=False, encoding="latin1") logging.info(f"Wrote {fpath_temp_llama_firstnames} ...") - # save lastnames data to temp directory + # save lastnames data to temp directory (if pairwise lastnames have been created) if tmp_lastname_country_data.shape[0] >= llama_lastname_country_data.shape[0]: tmp_lastname_country_data.to_csv(fpath_temp_llama_lastnames, index=False, encoding="latin1") logging.info(f"Wrote {fpath_temp_llama_lastnames} ...") return (tmp_firstname_country_data, tmp_lastname_country_data) if __name__ == "__main__": - + # set up logging lgr = logging.getLogger() lgr.setLevel(logging.INFO) - + # load aws config with open(cons.fpath_aws_session_token, "r") as j: aws_config = json.loads(j.read()) @@ -93,7 +139,7 @@ def invoke_bedrock(model, n_user_names, country): bedrock = Bedrock(session=session, model_region="us-east-1", model_id="meta.llama3-70b-instruct-v1:0") # load countries, firstnames and surnames files - countrieseurope = pd.read_csv(cons.fpath_countrieseurope, usecols=['name', 'ISO numeric']) + countrieseurope = pd.read_csv(cons.fpath_countries_europe, usecols=['name', 'ISO numeric']) orig_firstnames = pd.read_csv(cons.fpath_firstnames) orig_surnames = pd.read_csv(cons.fpath_lastnames) @@ -106,12 +152,13 @@ def invoke_bedrock(model, n_user_names, country): firstname_country_data = [] lastname_country_data = [] error_countries = [] + # switch to toggle bedrock calls run_bedrock = False - + # set countries list countries_list = countrieseurope['name'].to_list() #countries_list = ['Cyprus'] - + for country in countries_list: logging.info(f"{country} ...") try: @@ -119,7 +166,7 @@ def invoke_bedrock(model, n_user_names, country): # call bedrock model and generate user names data - tmp_firstname_country_data, tmp_lastname_country_data = invoke_bedrock(model=bedrock, n_user_names=n_user_names, country=country) + tmp_firstname_country_data, tmp_lastname_country_data = invoke_bedrock(model=bedrock, n_user_names=n_user_names, country=country, countrieseurope=countrieseurope) logging.info("Waiting ...") - # wait 30 seconds before retrying + # wait 20 seconds before retrying time.sleep(20) else: tmp_firstname_country_data = pd.read_csv(cons.fpath_temp_llama_firstnames.format(country=country.lower()), encoding="latin1") @@ -134,7 +181,7 @@ def invoke_bedrock(model, n_user_names, country): # log if any countries failed to generate data if len(error_countries) > 0: logging.info(f"Failed to generate data for countries: {error_countries}") - + # load existing reference data firstname_country_df = pd.read_csv(cons.fpath_llama_firstnames, encoding="latin1") lastname_country_df = pd.read_csv(cons.fpath_llama_lastnames, encoding="latin1") @@ -147,7 +194,7 @@ def invoke_bedrock(model, n_user_names, country): # sort and deduplicate output data output_firstname_country_df = output_firstname_country_df.drop_duplicates(subset=["country","firstnames"]).sort_values(by=["country","firstnames"]) output_lastname_country_df = output_lastname_country_df.drop_duplicates(subset=["country","lastnames"]).sort_values(by=["country","lastnames"]) - + # write data to disk if output_firstname_country_df['country'].nunique() == n_countries: logging.info(f"output_firstname_country_df.shape: {output_firstname_country_df.shape}") diff --git a/generator/utilities/input_error_handling.py b/generator/utilities/input_error_handling.py index ffce36a..20c93cc 100644 --- a/generator/utilities/input_error_handling.py +++ b/generator/utilities/input_error_handling.py @@ -1,29 +1,31 @@ from beartype import beartype +from typing import Dict @beartype def input_error_handling( 
diff --git a/generator/utilities/input_error_handling.py b/generator/utilities/input_error_handling.py
index ffce36a..20c93cc 100644
--- a/generator/utilities/input_error_handling.py
+++ b/generator/utilities/input_error_handling.py
@@ -1,29 +1,31 @@
 from beartype import beartype
+from typing import Dict

 @beartype
 def input_error_handling(
-    input_params_dict:dict
-    ) -> int:
+    input_params_dict:Dict[str, object],
+    ):
     """
     Runs error handling on the input params dictionary.
-
+
     Parameters
     ----------
-    input_params_dict : dict
+    input_params_dict : Dict[str, object]
         A dictionary of input parameters.
-
-    Returns
-    -------
-    int
-        Returns 0 for successful completion, otherwise returns value errors depending on failed input parameter check.
+
+    Examples
+    --------
+    ```
+    input_params_dict = {'n_users': 1000, 'use_random_seed': 1, 'n_itr': 10}
+    input_error_handling(input_params_dict=input_params_dict)
+    ```
     """
     # check if the n users parameter is positive
-    if not input_params_dict["n_users"] >= 1:
+    if not (isinstance(input_params_dict["n_users"], int) and (input_params_dict["n_users"] >= 1)):
-        raise ValueError(f"Invalid n_users parameter value {input_params_dict['n_users']}; must be a integer >= 1.")
+        raise ValueError(f"Invalid n_users parameter value {input_params_dict['n_users']}; must be an integer >= 1.")
     # check if the random seed is either 0 or 1
-    if not input_params_dict["use_random_seed"] in (0, 1):
-        raise ValueError(f"Invalid random_seed use_random_seed value {input_params_dict['use_random_seed']}; must be either 0 or 1.")
+    if not (isinstance(input_params_dict["use_random_seed"], int) and (input_params_dict["use_random_seed"] in (0, 1))):
+        raise ValueError(f"Invalid use_random_seed value {input_params_dict['use_random_seed']}; must be either 0 or 1.")
     # check if the number of iterations is greater than or equal to 1
-    if not input_params_dict["n_itr"] >= 1:
+    if not (isinstance(input_params_dict["n_itr"], int) and (input_params_dict["n_itr"] >= 1)):
         raise ValueError(f"Invalid n_itr parameter value {input_params_dict['n_itr']}; must be an integer >= 1.")
-    return 0
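Note that the `isinstance` checks should run before the comparisons, as reflected above, so a non-numeric value raises a clean `ValueError` rather than a `TypeError` from `>=`. Because the function now raises instead of returning a status code, callers that previously checked for `0` should catch the exception. A usage sketch, assuming the module is importable from the repository root as laid out above:

```
from generator.utilities.input_error_handling import input_error_handling

# valid parameters pass silently
input_error_handling(input_params_dict={"n_users": 1000, "use_random_seed": 1, "n_itr": 10})

# invalid parameters raise ValueError
try:
    input_error_handling(input_params_dict={"n_users": 0, "use_random_seed": 1, "n_itr": 10})
except ValueError as err:
    print(err)
```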
""" - idhashes_df = pd.Series(idhashes_dict, name=idhash_val_name).to_frame().reset_index().rename(columns={'index':idhash_key_name}) + idhashes_df = pd.DataFrame(list(idhashes_dict.items()), columns=[idhash_key_name, idhash_val_name]) idhashes_join = pd.merge(left=data, right=idhashes_df, on=idhash_key_name, how='left') return idhashes_join \ No newline at end of file diff --git a/generator/utilities/multiprocess.py b/generator/utilities/multiprocess.py index f8595bb..6a02bd2 100644 --- a/generator/utilities/multiprocess.py +++ b/generator/utilities/multiprocess.py @@ -1,34 +1,37 @@ import os from multiprocessing import Pool from beartype import beartype +from typing import List, Any @beartype def multiprocess( func, - args:list, - ncpu:int=os.cpu_count() - ) -> list: + args:List[tuple], + ncpu:int=None, + ) -> List[Any]: """ - Generates a dictionary of random dates for an input dictionary of idhashes counts + Generates a dictionary of random dates for an input dictionary of idhashes counts by utilizing multiprocessing. Parameters ---------- - func : + func : Callable[..., Any] The function to be executed in parallel - args : list - The input parameters as a list of tuples to be passed with the function in parallel + args : List[tuple] + The input parameters as a list of tuples to be passed with the function in parallel via starmap. ncpu : int - The number of cpus to execute across, default is os.cpu_count(). + The number of cpus to execute across, default is None. Returns ------- - list + List[Any] A list of output returned from the func calls ran in parallel """ + # set number of cpus + if ncpu is None: + ncpu = os.cpu_count() # initialize a pool of ncpus - pool = Pool(ncpu) - # execution given function and arguments across pool of ncpus - results = pool.starmap(func, args) - # close pool of ncpus - pool.close() + results = [] + with Pool(ncpu) as pool: + # execution given function and arguments across pool of ncpus + results = pool.starmap(func, args) return results diff --git a/generator/utilities/remove_duplicate_idhashes.py b/generator/utilities/remove_duplicate_idhashes.py index 820d77b..115f00b 100644 --- a/generator/utilities/remove_duplicate_idhashes.py +++ b/generator/utilities/remove_duplicate_idhashes.py @@ -5,17 +5,18 @@ @beartype def remove_duplicate_idhashes( user_data:pd.DataFrame, - idhash_col:str + idhash_col:str, ): - """Removes duplicate idhashes from a given idhash column. - + """ + Removes duplicate idhashes from a given idhash column. + Parameters ---------- user_data : pandas.DataFrame The user level telecom payments data. idhash_col : str The column with duplicate idhashes to be removed. - + Returns ------- pandas.DataFrame @@ -30,5 +31,5 @@ def remove_duplicate_idhashes( # overwrite series with empty lists tmp_data[idhash_col] = np.nan tmp_data[idhash_col] = tmp_deduplicate_series - tmp_data[idhash_col] = tmp_data[idhash_col].apply(lambda x: x if x == x else []) + tmp_data[idhash_col] = tmp_data[idhash_col].apply(lambda x: x if pd.notnull(x) else []) return tmp_data diff --git a/generator/utilities/round_trans_amount.py b/generator/utilities/round_trans_amount.py index d52f018..8b1002e 100644 --- a/generator/utilities/round_trans_amount.py +++ b/generator/utilities/round_trans_amount.py @@ -1,24 +1,31 @@ import numpy as np -import pandas as pd from beartype import beartype @beartype def round_trans_amount(amounts:np.ndarray) -> np.ndarray: """ Rounds transaction amounts to have store price like remainders such as 1.99, 3.45, and 2.5. 
-
+
     Parameters
     ----------
-    amounts : np.array
+    amounts : np.ndarray
         The transaction amounts to round.

     Returns
     -------
-    np.array
-        The rounded transaction amounts with store rice like remainders.
+    np.ndarray
+        The rounded transaction amounts with store price like remainders.
+
+    Examples
+    --------
+    ```
+    import numpy as np
+    amounts = np.array([2.34, 5.67, 3.21])
+    round_trans_amount(amounts=amounts)
+    ```
     """
+    # a probability distribution over store price like remainders
     round_dict = {0.01:0.4, 0.5:0.1, 0.45:0.1, 0.51:0.1, 0.41:0.1, 0.71:0.1, 1:0.1}
     remainder = np.random.choice(a=list(round_dict.keys()), size=amounts.shape[0], replace=True, p=list(round_dict.values()))
-    rounded_amounts = np.round(np.ceil(amounts) - remainder, 2)
-    rounded_amounts = pd.Series(rounded_amounts).apply(lambda x: max(0, x)).values
+    rounded_amounts = np.maximum(0, np.round(np.ceil(amounts) - remainder, 2))
     return rounded_amounts
\ No newline at end of file
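To make the rounding behaviour concrete, the sketch below replays the remainder-sampling logic outside the function: each amount is rounded up to the next whole unit and a price-like remainder is subtracted, clipped at zero. The seed is illustrative and only added for reproducibility:

```
import numpy as np

np.random.seed(42)  # illustrative seed for reproducibility
amounts = np.array([2.34, 5.67, 3.21])
round_dict = {0.01: 0.4, 0.5: 0.1, 0.45: 0.1, 0.51: 0.1, 0.41: 0.1, 0.71: 0.1, 1: 0.1}
# sample one price-like remainder per amount and subtract it from the ceiling
remainder = np.random.choice(a=list(round_dict.keys()), size=amounts.shape[0], replace=True, p=list(round_dict.values()))
rounded = np.maximum(0, np.round(np.ceil(amounts) - remainder, 2))
print(rounded)  # price-like amounts with endings such as .99, .5 or .55
```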