diff --git a/data/unittest/transaction_data.parquet b/data/unittest/transaction_data.parquet
index 1067bf5..47b78f1 100644
Binary files a/data/unittest/transaction_data.parquet and b/data/unittest/transaction_data.parquet differ
diff --git a/data/unittest/user_data.parquet b/data/unittest/user_data.parquet
index af44975..bb932c9 100644
Binary files a/data/unittest/user_data.parquet and b/data/unittest/user_data.parquet differ
diff --git a/generator/app/ProgrammeParams.py b/generator/app/ProgrammeParams.py
index 963db5b..01de6d5 100644
--- a/generator/app/ProgrammeParams.py
+++ b/generator/app/ProgrammeParams.py
@@ -1,8 +1,51 @@
-import cons
from datetime import datetime
from beartype import beartype
+import cons
+
class ProgrammeParams():
+ """
+ Class to manage and store programme parameters for the telecom payment generator.
+ This class validates and initializes all configuration parameters needed for the
+ payment generation process, including user counts, application volumes, and date ranges
+ for registration and transaction periods.
+
+ Parameters
+ ----------
+ n_users : int, optional
+ Number of users. Defaults to 100.
+ random_seed : int, optional
+ Seed for reproducible randomization. Defaults to None.
+ n_applications : int, optional
+ Number of applications. Defaults to 20000.
+ registration_start_date : str, optional
+ Registration period start date. Defaults to cons.default_registration_start_date.
+ registration_end_date : str, optional
+ Registration period end date. Defaults to cons.default_registration_end_date.
+ transaction_start_date : str, optional
+ Transaction period start date. Defaults to cons.default_transaction_start_date.
+ transaction_end_date : str, optional
+ Transaction period end date. Defaults to cons.default_transaction_end_date.
+
+ Attributes
+ ----------
+ random_seed : int, optional
+ Seed for random number generation for reproducibility.
+ n_users : int
+ Number of users to generate. Defaults to 100.
+ n_applications : int
+ Number of applications to generate. Defaults to 20000.
+ registration_start_date : str
+ Start date for user registration (format: YYYY-MM-DD).
+ registration_end_date : str
+ End date for user registration (format: YYYY-MM-DD).
+ transaction_start_date : str
+ Start date for transactions (format: YYYY-MM-DD).
+ transaction_end_date : str
+ End date for transactions (format: YYYY-MM-DD).
+ transaction_timescale : float
+ The transaction period duration in years.
+ """
@beartype
def __init__(
@@ -13,7 +56,7 @@ def __init__(
registration_start_date:str=cons.default_registration_start_date,
registration_end_date:str=cons.default_registration_end_date,
transaction_start_date:str=cons.default_transaction_start_date,
- transaction_end_date:str=cons.default_transaction_end_date
+ transaction_end_date:str=cons.default_transaction_end_date,
):
# take programme parameters from class parameters
self.random_seed = random_seed
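
The transaction_timescale attribute documented above expresses the transaction window in years. A minimal sketch of how such a value can be derived from the two date strings; the computation itself is not shown in this hunk, so the 365.25-day divisor is an assumption:

from datetime import datetime

def derive_transaction_timescale(start_date: str, end_date: str) -> float:
    # parse the '%Y-%m-%d' bounds described in the docstring above
    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")
    # express the window in years (assumed divisor of 365.25 days per year)
    return (end - start).days / 365.25

derive_transaction_timescale("2021-01-01", "2022-01-01")  # ~1.0
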
diff --git a/generator/app/gen_random_telecom_data.py b/generator/app/gen_random_telecom_data.py
index 9a0be63..1b400cd 100644
--- a/generator/app/gen_random_telecom_data.py
+++ b/generator/app/gen_random_telecom_data.py
@@ -1,4 +1,6 @@
import numpy as np
+import pandas as pd
+from typing import Dict, Optional
import random
from beartype import beartype
@@ -16,22 +18,25 @@
@beartype
def gen_random_telecom_data(
- n_users=1,
- random_seed=None,
- registration_start_date=cons.default_registration_start_date,
- registration_end_date=cons.default_registration_end_date,
- transaction_start_date=cons.default_transaction_start_date,
- transaction_end_date=cons.default_transaction_end_date
- ):
+ n_users:int=1,
+    random_seed:Optional[int]=None,
+ n_applications:int=20000,
+ registration_start_date:str=cons.default_registration_start_date,
+ registration_end_date:str=cons.default_registration_end_date,
+ transaction_start_date:str=cons.default_transaction_start_date,
+ transaction_end_date:str=cons.default_transaction_end_date,
+ ) -> Dict[str, pd.DataFrame]:
"""
Generates random telecommunications data.
-
+
Parameters
----------
- n_users : float
+ n_users : int
The number of users to generate random telecom payments data for, default is 1.
random_seed : int
A set random seed for reproducible results, default is None.
+ n_applications : int
+ The number of applications to generate, default is 20000.
registration_start_date : str
The user registration start date, default is cons.default_registration_start_date.
registration_end_date : str
@@ -40,28 +45,28 @@ def gen_random_telecom_data(
The user transaction start date, default is cons.default_transaction_start_date.
transaction_end_date : str
The user transaction end date, default is cons.default_transaction_end_date.
-
+
Returns
-------
- pandas.DataFrame
+ Dict[str, pandas.DataFrame]
A random telecommunication payments dataset.
"""
-
+
-    # initalise programme parameters
+    # initialise programme parameters
programmeparams = ProgrammeParams(
- n_users=n_users,
+ n_users=n_users,
random_seed=random_seed,
- n_applications=20000,
- registration_start_date=registration_start_date,
+ n_applications=n_applications,
+ registration_start_date=registration_start_date,
registration_end_date=registration_end_date,
transaction_start_date=transaction_start_date,
transaction_end_date=transaction_end_date
)
-
+
# set random seed
random.seed(programmeparams.random_seed)
np.random.seed(seed=programmeparams.random_seed)
-
+
# generate random users
user_obj = User(
n_user_ids=programmeparams.n_users,
@@ -69,23 +74,23 @@ def gen_random_telecom_data(
end_date=programmeparams.registration_end_date,
fpath_firstnames=cons.fpath_llama_firstnames,
fpath_lastnames=cons.fpath_llama_lastnames,
- fpath_countrieseurope=cons.fpath_countrieseurope,
- fpath_domain_email=cons.fpath_domain_email
+ fpath_countries_europe=cons.fpath_countries_europe,
+        fpath_email_domain=cons.fpath_email_domain
)
-
+
# generate random entity counts for each user
random_entity_counts = gen_random_entity_counts(
user_obj=user_obj,
transaction_timescale=programmeparams.transaction_timescale
)
-
+
# generate random entity values
device_obj = Device(n_device_hashes=random_entity_counts['n_devices'].sum())
card_obj = Card(n_card_hashes=random_entity_counts['n_cards'].sum())
ip_obj = Ip(n_ip_hashes=random_entity_counts['n_ips'].sum())
transaction_obj = Transaction(n_transaction_hashes=random_entity_counts['n_transactions'].sum(), start_date=programmeparams.transaction_start_date, end_date=programmeparams.transaction_end_date)
application_obj = Application(n_application_hashes=programmeparams.n_applications)
-
+
# generate user level data
user_data = gen_user_data(
random_entity_counts=random_entity_counts,
@@ -96,7 +101,7 @@ def gen_random_telecom_data(
transaction_obj=transaction_obj,
application_obj=application_obj,
)
-
+
# generate transaction level data
trans_data = gen_trans_data(
user_data=user_data,
@@ -108,5 +113,5 @@ def gen_random_telecom_data(
application_obj=application_obj,
fpath_countrycrimeindex=cons.fpath_countrycrimeindex
)
-
+
return {"user_data":user_data, "trans_data":trans_data}
diff --git a/generator/app/gen_trans_data.py b/generator/app/gen_trans_data.py
index 69cfc8a..363941d 100644
--- a/generator/app/gen_trans_data.py
+++ b/generator/app/gen_trans_data.py
@@ -1,8 +1,9 @@
import random
import pandas as pd
import numpy as np
-import cons
from datetime import datetime
+from beartype import beartype
+
from objects.User import User
from objects.Device import Device
from objects.Card import Card
@@ -14,7 +15,7 @@
from utilities.gen_trans_rejection_rates import gen_trans_rejection_rates
from utilities.gen_trans_status import gen_trans_status
from utilities.join_idhashes_dict import join_idhashes_dict
-from beartype import beartype
+import cons
@beartype
def gen_trans_data(
@@ -25,11 +26,11 @@ def gen_trans_data(
ip_obj:Ip,
transaction_obj:Transaction,
application_obj:Application,
- fpath_countrycrimeindex:str=cons.fpath_countrycrimeindex
+ fpath_countrycrimeindex:str=cons.fpath_countrycrimeindex,
):
"""
Generates random transaction level telecom payments data.
-
+
Parameters
----------
user_data : pandas.DataFrame
@@ -48,22 +49,23 @@ def gen_trans_data(
The random application data model object.
fpath_countrycrimeindex : str
The full file path to the country crime index reference data, default is cons.fpath_countrycrimeindex.
-
+
Returns
-------
pandas.DataFrame
The random transaction level telecom payments data.
"""
-
+
# explode user data to transaction level
trans_data = user_data.explode('transaction_hash').dropna(subset = ['transaction_hash']).reset_index(drop = True)
# select uid entity hashes for each transaction
- trans_data['device_hash'] = trans_data['device_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if x != [] else np.nan)
- trans_data['card_hash'] = trans_data['card_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if x != [] else np.nan)
- trans_data['ip_hash'] = trans_data['ip_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if x != [] else np.nan)
- trans_data['application_hash'] = trans_data['application_hash'].apply(lambda x: np.random.choice(x, size = 1)[0])
+ trans_data['device_hash'] = trans_data['device_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if isinstance(x, list) and x != [] else np.nan)
+ trans_data['card_hash'] = trans_data['card_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if isinstance(x, list) and x != [] else np.nan)
+ trans_data['ip_hash'] = trans_data['ip_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if isinstance(x, list) and x != [] else np.nan)
+ trans_data['application_hash'] = trans_data['application_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if isinstance(x, list) and x != [] else np.nan)
# add null values card hashes
- trans_data['card_hash'] = trans_data['card_hash'].apply(lambda x: np.nan if random.uniform(0, 1) <= cons.data_model_null_rates['card'] else x)
+ trans_null_mask = np.random.uniform(size=trans_data.shape[0]) <= cons.data_model_null_rates['card']
+ trans_data.loc[trans_null_mask, 'card_hash'] = np.nan
# add shared hashed entities between users
trans_data['ip_hash'] = trans_data['ip_hash'].apply(lambda x: ip_obj.ip_shared_idhash_map_dict[x] if x in ip_obj.ip_shared_idhash_map_dict.keys() else x)
trans_data['card_hash'] = trans_data['card_hash'].apply(lambda x: card_obj.card_shared_idhash_map_dict[x] if x in card_obj.card_shared_idhash_map_dict.keys() else x)
@@ -79,7 +81,7 @@ def gen_trans_data(
trans_data = join_idhashes_dict(data=trans_data, idhashes_dict=transaction_obj.transaction_hashes_dates_dict, idhash_key_name='transaction_hash', idhash_val_name='transaction_date')
# add application data
trans_data = join_idhashes_dict(data=trans_data, idhashes_dict=application_obj.application_hashes_payment_channel_dict, idhash_key_name='application_hash', idhash_val_name='card_payment_channel')
-
+
# TODO: wrap this logic up into a separate function
# align payment channel with missing card hashes and 0 transaction amounts
zero_transaction_amount_filter = (trans_data['transaction_amount'] == 0.0)
@@ -90,7 +92,8 @@ def gen_trans_data(
trans_data['transaction_payment_method'] = 'card'
zero_transaction_amount_filter = (trans_data['transaction_amount'] == 0.0)
missing_card_hash_filter = (trans_data['card_hash'].isnull())
- trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = missing_card_hash_filter.apply(lambda x: np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = 1, p = list(cons.data_model_non_card_trans_methods.values()))[0])
+ # trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = missing_card_hash_filter.apply(lambda x: np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = 1, p = list(cons.data_model_non_card_trans_methods.values()))[0])
+    trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = missing_card_hash_filter.sum(), p = list(cons.data_model_non_card_trans_methods.values()))
trans_data.loc[zero_transaction_amount_filter, 'transaction_payment_method'] = np.nan
# align country codes for user, ip and card
country_code_columns = ['registration_country_code_alpha', 'ip_country_code_alpha', 'card_country_code_alpha']
@@ -105,15 +108,15 @@ def gen_trans_data(
dates_series = pd.date_range(start=datetime.strptime(transaction_obj.start_date, "%Y-%m-%d"), end=datetime.strptime(transaction_obj.end_date, "%Y-%m-%d") - pd.Timedelta(days=1), freq="d")
trans_data[date_columns] = trans_data[date_columns].apply(lambda s: [s['registration_date'], np.random.choice(a=dates_series[dates_series >= max(s['registration_date'], s['transaction_date'])], size=1)[0]], result_type = 'expand', axis = 1).copy()
# map iso numeric country codes to iso alpha country codes
- country_codes_map = gen_country_codes_map(fpath_countrieseurope=user_obj.fpath_countrieseurope)
+ country_codes_map = gen_country_codes_map(fpath_countries_europe=user_obj.fpath_countries_europe)
trans_data = join_idhashes_dict(data=trans_data, idhashes_dict=country_codes_map, idhash_key_name='registration_country_code_alpha', idhash_val_name='registration_country_code')
trans_data = join_idhashes_dict(data=trans_data, idhashes_dict=country_codes_map, idhash_key_name='card_country_code_alpha', idhash_val_name='card_country_code')
trans_data = join_idhashes_dict(data=trans_data, idhashes_dict=country_codes_map, idhash_key_name='ip_country_code_alpha', idhash_val_name='ip_country_code')
-
+
# generate transaction status and error code
- rejection_rates_dict = gen_trans_rejection_rates(trans_data=trans_data, fpath_countrieseurope=user_obj.fpath_countrieseurope, fpath_countrycrimeindex=fpath_countrycrimeindex, fpath_domain_email=user_obj.fpath_domain_email)
+    rejection_rates_dict = gen_trans_rejection_rates(trans_data=trans_data, fpath_countries_europe=user_obj.fpath_countries_europe, fpath_countrycrimeindex=fpath_countrycrimeindex, fpath_email_domain=user_obj.fpath_email_domain)
trans_data[['transaction_status', 'transaction_error_code']] = trans_data.apply(lambda series: gen_trans_status(series = series, rejection_rates_dict = rejection_rates_dict), result_type = 'expand', axis = 1)
-
+
# order columns and sort rows by transaction date
user_cols = ['userid', 'firstname', 'lastname', 'registration_date', 'registration_country_code', 'uid', 'email_domain']
device_cols = ['device_hash', 'device_type']
@@ -124,5 +127,5 @@ def gen_trans_data(
itr_cols = ['itr_hash']
col_order = user_cols + device_cols + card_cols + ip_cols + app_cols + trans_cols + itr_cols
trans_data = trans_data[col_order].sort_values(by = 'transaction_date').reset_index(drop = True)
-
+
return trans_data
\ No newline at end of file
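
The card-hash nulling above replaces a per-row random.uniform lambda with a single vectorised draw and a boolean mask. The same pattern in isolation, on a toy frame with an illustrative null rate:

import numpy as np
import pandas as pd

null_rate = 0.25  # illustrative stand-in for cons.data_model_null_rates['card']
df = pd.DataFrame({"card_hash": [f"hash{i}" for i in range(8)]})
# one uniform draw per row; rows whose draw falls at or below the rate are nulled
null_mask = np.random.uniform(size=df.shape[0]) <= null_rate
df.loc[null_mask, "card_hash"] = np.nan
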
diff --git a/generator/app/gen_user_data.py b/generator/app/gen_user_data.py
index 7d52dd5..9c5fa94 100644
--- a/generator/app/gen_user_data.py
+++ b/generator/app/gen_user_data.py
@@ -1,5 +1,7 @@
import pandas as pd
import numpy as np
+from beartype import beartype
+
from objects.User import User
from objects.Device import Device
from objects.Card import Card
@@ -9,7 +11,6 @@
from utilities.gen_obj_idhash_series import gen_obj_idhash_series
from utilities.join_idhashes_dict import join_idhashes_dict
from utilities.gen_random_hash import gen_random_hash
-from beartype import beartype
@beartype
def gen_user_data(
@@ -28,17 +29,17 @@ def gen_user_data(
----------
random_entity_counts : pd.DataFrame
The randomly generated entities count data
- user_obj : class
+ user_obj : User
The random user data model object
- device_obj : class
+ device_obj : Device
The random device data model object
- card_obj : class
+ card_obj : Card
The random card data model object
- ip_obj : class
+ ip_obj : Ip
The random ip data model object
- transaction_obj : class
+ transaction_obj : Transaction
The random transaction data model object
- application_obj : class
+ application_obj : Application
The random application data model object
Returns
@@ -58,14 +59,18 @@ def gen_user_data(
zero_pad = (userid_date_country_code.str.len() - 11).abs().apply(lambda x: '0'*x)
user_data['userid'] = userid_date_country_code + zero_pad + user_data['uid'].astype(str).str[-5:]
# add hash data lists
- user_data['device_hash'] = gen_obj_idhash_series(idhashes_props_dict=device_obj.device_hashes_props_dict, n_counts_series=user_data['n_devices'])
- user_data['card_hash'] = gen_obj_idhash_series(idhashes_props_dict=card_obj.card_hashes_props_dict, n_counts_series=user_data['n_cards'])
- user_data['ip_hash'] = gen_obj_idhash_series(idhashes_props_dict=ip_obj.ip_hashes_props_dict, n_counts_series=user_data['n_ips'])
- user_data['transaction_hash'] = gen_obj_idhash_series(idhashes_props_dict=transaction_obj.transaction_hashes_props_dict, n_counts_series=user_data['n_transactions'])
- user_data['application_hash'] = user_data['n_applications'].apply(lambda x: list(np.random.choice(a = list(application_obj.application_hashes_props_dict.keys()), p = list(application_obj.application_hashes_props_dict.values()), replace = True, size = x)))
+ user_data['device_hash'] = gen_obj_idhash_series(idhashes=device_obj.device_hashes, n_counts_series=user_data['n_devices'])
+ user_data['card_hash'] = gen_obj_idhash_series(idhashes=card_obj.card_hashes, n_counts_series=user_data['n_cards'])
+ user_data['ip_hash'] = gen_obj_idhash_series(idhashes=ip_obj.ip_hashes, n_counts_series=user_data['n_ips'])
+ user_data['transaction_hash'] = gen_obj_idhash_series(idhashes=transaction_obj.transaction_hashes, n_counts_series=user_data['n_transactions'])
+ # generate application hashes per user
+ #user_data['application_hash'] = user_data['n_applications'].apply(lambda x: list(np.random.choice(a = list(application_obj.application_hashes_props_dict.keys()), p = list(application_obj.application_hashes_props_dict.values()), replace = True, size = x)))
+ total_application_hashes = user_data['n_applications'].sum()
+ split_indices = user_data['n_applications'].cumsum()[:-1].values
+ application_hashes = np.random.choice(a = list(application_obj.application_hashes_props_dict.keys()), p=list(application_obj.application_hashes_props_dict.values()), replace=True, size=total_application_hashes)
+ user_data['application_hash'] = pd.Series(np.split(application_hashes, split_indices)).apply(lambda x: x.tolist())
# drop excess columns
- drop_columns = ['n_devices', 'n_cards', 'n_ips', 'n_applications', 'n_transactions']
- user_data = user_data.drop(columns = drop_columns)
+ user_data = user_data.drop(columns = ['n_devices', 'n_cards', 'n_ips', 'n_applications', 'n_transactions'])
# create a hash value for the dataset (to distinguish between different iterations)
user_data['itr_hash'] = gen_random_hash(size=1)[0]
return user_data
\ No newline at end of file
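
The application-hash change above draws all hashes in one np.random.choice call and then partitions them back into per-user lists via cumulative counts and np.split. A toy illustration of the split arithmetic (values are illustrative):

import numpy as np
import pandas as pd

n_applications = pd.Series([2, 0, 3])                # applications per user
pool = np.array(["a", "b", "c", "d", "e"])           # one batch of sampled hashes, len == n_applications.sum()
split_indices = n_applications.cumsum()[:-1].values  # boundaries between users -> [2, 2]
per_user = [chunk.tolist() for chunk in np.split(pool, split_indices)]
# per_user == [['a', 'b'], [], ['c', 'd', 'e']]
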
diff --git a/generator/cons.py b/generator/cons.py
index 783ac56..25fb9e1 100644
--- a/generator/cons.py
+++ b/generator/cons.py
@@ -17,9 +17,9 @@
fpath_arch_randomtelecomdata = os.path.join(subdir_data, 'arch', 'RandomTelecomPayments.csv')
fpath_temp_llama_firstnames = os.path.join(subdir_data, 'temp', 'llama_firstnames_{country}.csv')
fpath_temp_llama_lastnames = os.path.join(subdir_data, 'temp', 'llama_lastnames_{country}.csv')
-fpath_domain_email = os.path.join(subdir_data, 'ref', 'email-domains.csv')
+fpath_email_domain = os.path.join(subdir_data, 'ref', 'email-domains.csv')
fpath_countrycrimeindex = os.path.join(subdir_data, 'ref', 'country_crime_index.csv')
-fpath_countrieseurope = os.path.join(subdir_data, 'ref', 'Countries-Europe.csv')
+fpath_countries_europe = os.path.join(subdir_data, 'ref', 'Countries-Europe.csv')
fpath_firstnames = os.path.join(subdir_data, 'ref', 'first-names.txt')
fpath_lastnames = os.path.join(subdir_data, 'ref', 'last-names.txt')
fpath_llama_firstnames = os.path.join(subdir_data, 'ref', 'llama_firstnames.csv')
@@ -73,4 +73,7 @@
data_model_rejection_codes_connection = {'E900:ConnectionTimeout':0.45, 'E901:SuspectedFraud':0.1, 'E902:AuthenicationFailure':0.2, 'E903:UserCancelled':0.15, 'E904:InsufficientFunds':0.1}
data_model_rejection_codes_user = {'E900:ConnectionTimeout':0.05, 'E901:SuspectedFraud':0.1, 'E902:AuthenicationFailure':0.1, 'E903:UserCancelled':0.45, 'E904:InsufficientFunds':0.3}
data_model_rejection_codes_funds = {'E900:ConnectionTimeout':0.1, 'E901:SuspectedFraud':0.1, 'E902:AuthenicationFailure':0.1, 'E903:UserCancelled':0.25, 'E904:InsufficientFunds':0.45}
-data_model_rejection_codes_authentication = {'E900:ConnectionTimeout':0.25, 'E901:SuspectedFraud':0.05, 'E902:AuthenicationFailure':0.45, 'E903:UserCancelled':0.15, 'E904:InsufficientFunds':0.1}
\ No newline at end of file
+data_model_rejection_codes_authentication = {'E900:ConnectionTimeout':0.25, 'E901:SuspectedFraud':0.05, 'E902:AuthenicationFailure':0.45, 'E903:UserCancelled':0.15, 'E904:InsufficientFunds':0.1}
+
+# set list of generator object types
+object_types = ["device", "card", "ip", "transaction", "application"]
\ No newline at end of file
diff --git a/generator/main.py b/generator/main.py
index 032a19a..6c1a101 100644
--- a/generator/main.py
+++ b/generator/main.py
@@ -22,7 +22,7 @@
input_params_dict = commandline_interface()
# run input error handling
- res = input_error_handling(input_params_dict)
+ input_error_handling(input_params_dict)
logging.info(f'Input Parameters: {input_params_dict}')
@@ -35,6 +35,7 @@
(
input_params_dict['n_users'],
None if input_params_dict['use_random_seed'] == 0 else itr,
+            20000, # n_applications
input_params_dict['registration_start_date'],
input_params_dict['registration_end_date'],
input_params_dict['transaction_start_date'],
diff --git a/generator/objects/Application.py b/generator/objects/Application.py
index 013bf2e..8d5d5cb 100644
--- a/generator/objects/Application.py
+++ b/generator/objects/Application.py
@@ -1,66 +1,75 @@
-import numpy as np
import cons
from utilities.gen_idhash_cnt_dict import gen_idhash_cnt_dict
from utilities.cnt2prop_dict import cnt2prop_dict
+
+import numpy as np
from beartype import beartype
+from typing import List, Dict
class Application:
-
+
@beartype
def __init__(
- self,
- n_application_hashes:int
+ self,
+ n_application_hashes:int,
):
"""
- The randomly generated application data model object.
-
+        Initialises the randomly generated application data model object.
+
Parameters
----------
n_application_hashes : int
The number of application hashes to generate.
-
+
Attributes
----------
n_application_hashes : int
The number of application hashes generated.
lam : float
- The lambda parameter of the squared poisson distribution used to generate the application hash counts.
- application_hashes_cnts_dict : dict
- The application hash counts dictionary.
- application_hashes_props_dict : dict
- The application hash proportions dictionary.
+            The lambda parameter of the squared poisson distribution used to generate the application hash counts.
+        power : float
+            The power parameter of the squared poisson distribution used to generate the application hash counts.
+ payment_channels : Dict[str, float]
+ The population proportions of available payment channels.
+ application_hashes_cnts_dict : Dict[str, int]
+ Mapping of application hashes to their occurrence counts.
+ application_hashes_props_dict : Dict[str, float]
+ Mapping of application hashes to their proportions.
+ application_hashes_payment_channel_dict : Dict[str, str]
+ Mapping of application hashes to randomly assigned payment channels.
"""
self.n_application_hashes = n_application_hashes
self.lam = cons.data_model_poisson_params["application"]["lambda"]
self.power = cons.data_model_poisson_params["application"]["power"]
self.payment_channels = cons.data_model_payment_channels
-        self.application_hashes_cnts_dict = gen_idhash_cnt_dict(idhash_type="hash", n=self.n_application_hashes, lam=self.lam)
+        self.application_hashes_cnts_dict = gen_idhash_cnt_dict(idhash_type="hash", n=self.n_application_hashes, lam=self.lam, power=self.power)
- self.application_hashes_props_dict = cnt2prop_dict(self.application_hashes_cnts_dict)
- self.application_hashes_payment_channel_dict = self.gen_transaction_payment_channel(list(self.application_hashes_cnts_dict.keys()), self.payment_channels)
-
+ self.application_hashes = list(self.application_hashes_cnts_dict.keys())
+ self.application_hashes_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.application_hashes_cnts_dict)
+ self.application_hashes_payment_channel_dict = self.gen_transaction_payment_channel(application_hashes=self.application_hashes, payment_channels=self.payment_channels)
+
@beartype
def gen_transaction_payment_channel(
self,
- application_hashes:list,
- payment_channels:dict
- ) -> dict:
+ application_hashes:List[str],
+ payment_channels:Dict[str, float],
+ ) -> Dict[str, str]:
"""
Generates a dictionary of random application payment channels.
-
+
Parameters
----------
- application_hashes : list
+ application_hashes : List[str]
The application hashes.
- payment_channels : dict
+ payment_channels : Dict[str, float]
The population proportion of payment channels.
-
+
Returns
-------
- dict
+ Dict[str, str]
A dictionary of transaction payment channels.
"""
# randomly sample payment channels based on population proportions
- transactoin_payment_channels = list(
+ transaction_payment_channels = list(
np.random.choice(
a=list(payment_channels.keys()),
p=list(payment_channels.values()),
@@ -69,5 +78,5 @@ def gen_transaction_payment_channel(
)
)
# return payment channels and application hashes
- application_hashes_payment_channels_dict = dict(zip(application_hashes, transactoin_payment_channels))
+ application_hashes_payment_channels_dict = dict(zip(application_hashes, transaction_payment_channels))
return application_hashes_payment_channels_dict
diff --git a/generator/objects/Card.py b/generator/objects/Card.py
index c76f7ac..371423f 100644
--- a/generator/objects/Card.py
+++ b/generator/objects/Card.py
@@ -1,83 +1,87 @@
-import numpy as np
import cons
from utilities.gen_idhash_cnt_dict import gen_idhash_cnt_dict
from utilities.cnt2prop_dict import cnt2prop_dict
from utilities.gen_country_codes_dict import gen_country_codes_dict
from utilities.gen_shared_idhashes import gen_shared_idhashes
+
+import numpy as np
from beartype import beartype
-from typing import Union
+from typing import List, Dict, Union
class Card:
-
+
@beartype
def __init__(
self,
n_card_hashes:Union[int,np.int64],
- fpath_countrieseurope:str=cons.fpath_countrieseurope
+ fpath_countries_europe:str=cons.fpath_countries_europe,
):
"""
The randomly generated card data model object.
-
+
Parameters
----------
n_card_hashes : int
The number of card hashes to generate.
- fpath_countrieseurope : str
- The file path to the european countries reference file, default is cons.fpath_countrieseurope.
-
+ fpath_countries_europe : str
+ The file path to the european countries reference file, default is cons.fpath_countries_europe.
+
Attributes
----------
n_card_hashes : int
The number of card hashes generated.
- card_types_dict : dict
+ card_types_dict : Dict[str, float]
The population proportions of card types.
lam : float
The lambda parameter of the squared poisson distribution used to generate the card hash counts.
+ power : float
+ The power parameter of the squared poisson distribution used to generate the card hash counts.
prop_shared_card_hashes : float
The population proportion of shared card hashes.
- card_hashes_cnts_dict : dict
+ card_hashes_cnts_dict : Dict[str, int]
The card hash counts dictionary.
- card_hashes_props_dict : dict
+ card_hashes_props_dict : Dict[str, float]
The card hash proportions dictionary.
- card_hashes_type_dict : dict
+ card_hashes_type_dict : Dict[str, str]
The card hash types dictionary.
- card_hashes_country_code_dict : dict
+ card_hashes_country_code_dict : Dict[str, str]
The card hash country codes dictionary.
- card_hashes_shared_props_dict : dict
- The shared card hash proportions dictionary.
+ card_shared_idhash_map_dict : Dict[str, str]
+ The card shared idhash mapping dictionary.
"""
self.n_card_hashes = n_card_hashes
- self.fpath_countrieseurope = fpath_countrieseurope
+ self.fpath_countries_europe = fpath_countries_europe
self.card_types_dict = cons.data_model_card_types_dict
self.lam = cons.data_model_poisson_params["card"]["lambda"]
self.power = cons.data_model_poisson_params["card"]["power"]
self.prop_shared_card_hashes = cons.data_model_shared_entities_dict["card"]
- self.card_hashes_cnts_dict = gen_idhash_cnt_dict(idhash_type="hash", n=self.n_card_hashes, lam=self.lam)
- self.card_hashes_props_dict = cnt2prop_dict(self.card_hashes_cnts_dict)
- self.card_hashes_type_dict = self.gen_card_type(list(self.card_hashes_cnts_dict.keys()), self.card_types_dict)
- self.card_hashes_country_code_dict = gen_country_codes_dict(self.card_hashes_cnts_dict, self.fpath_countrieseurope)
- self.card_shared_idhash_map_dict = gen_shared_idhashes(self.card_hashes_cnts_dict, self.prop_shared_card_hashes)
-
+ self.card_hashes_cnts_dict = gen_idhash_cnt_dict(idhash_type="hash", n=self.n_card_hashes, lam=self.lam, power=self.power)
+ self.card_hashes = list(self.card_hashes_cnts_dict.keys())
+ self.card_hashes_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.card_hashes_cnts_dict)
+ self.card_hashes_type_dict = self.gen_card_type(card_hashes=self.card_hashes, card_types_dict=self.card_types_dict)
+ self.card_hashes_country_code_dict = gen_country_codes_dict(idhashes=self.card_hashes, fpath_countries_europe=self.fpath_countries_europe)
+ self.card_shared_idhash_map_dict = gen_shared_idhashes(idhashes=self.card_hashes, prop_shared_idhashes=self.prop_shared_card_hashes)
+
@beartype
def gen_card_type(
self,
- card_hashes:list,
- card_types_dict:dict
- ) -> dict:
+ card_hashes:List[str],
+ card_types_dict:Dict[str, float],
+ ) -> Dict[str, str]:
"""
Generates a dictionary of random card types.
-
+
Parameters
----------
- card_hashes : list
+ card_hashes : List[str]
The card hashes.
- card_types_dict : dict
+ card_types_dict : Dict[str, float]
The population proportions of card types.
-
+
Returns
-------
- dict
- A dictionary of card hash prices.
+ Dict[str, str]
+ A dictionary of card types.
"""
# randomly choose card types based on the population proportions of card types
card_types = np.random.choice(
diff --git a/generator/objects/Device.py b/generator/objects/Device.py
index 4f6ad4d..90d4a30 100644
--- a/generator/objects/Device.py
+++ b/generator/objects/Device.py
@@ -1,77 +1,80 @@
-import string
-import numpy as np
-import pandas as pd
import cons
from utilities.gen_idhash_cnt_dict import gen_idhash_cnt_dict
from utilities.cnt2prop_dict import cnt2prop_dict
from utilities.gen_shared_idhashes import gen_shared_idhashes
+
+import numpy as np
+import pandas as pd
from beartype import beartype
-from typing import Union
+from typing import List, Dict, Union
class Device:
-
+
@beartype
def __init__(
self,
n_device_hashes:Union[int,np.int64],
- fpath_smartphones:str=cons.fpath_smartphones
+ fpath_smartphones:str=cons.fpath_smartphones,
):
"""
The randomly generated device data model object.
-
+
Parameters
----------
n_device_hashes : int
The number of device hashes to generate.
fpath_smartphones : str
The file path to the smart phones reference file, default is cons.fpath_smartphones.
-
+
Attributes
----------
n_device_hashes : int
The number of device hashes generated.
lam : float
The lambda parameter of the squared poisson distribution used to generate the device hash counts.
+ power : float
+ The power parameter of the squared poisson distribution used to generate the device hash counts.
prop_shared_device_hashes : float
The population proportion of shared device hashes.
- device_hashes_cnts_dict : dict
+ device_hashes_cnts_dict : Dict[str, int]
The device hash counts dictionary.
- device_hashes_props_dict : dict
+ device_hashes_props_dict : Dict[str, float]
The device hash proportions dictionary.
- device_hashes_type_dict : dict
+ device_hashes_type_dict : Dict[str, str]
The device hash types dictionary.
- device_hashes_shared_props_dict : dict
- The shared device hash proportions dictionary.
+ device_shared_idhash_map_dict : Dict[str, str]
+ The device shared idhash mapping dictionary.
"""
self.n_device_hashes = n_device_hashes
self.fpath_smartphones = fpath_smartphones
self.lam = cons.data_model_poisson_params["device"]["lambda"]
self.power = cons.data_model_poisson_params["device"]["power"]
self.prop_shared_device_hashes = cons.data_model_shared_entities_dict["device"]
- self.device_hashes_cnts_dict = gen_idhash_cnt_dict(idhash_type="hash", n=self.n_device_hashes, lam=self.lam)
- self.device_hashes_props_dict = cnt2prop_dict(self.device_hashes_cnts_dict)
- self.device_hashes_type_dict = self.gen_device_type(list(self.device_hashes_cnts_dict.keys()), self.fpath_smartphones)
- self.device_shared_idhash_map_dict = gen_shared_idhashes(self.device_hashes_cnts_dict, self.prop_shared_device_hashes)
+ self.device_hashes_cnts_dict = gen_idhash_cnt_dict(idhash_type="hash", n=self.n_device_hashes, lam=self.lam, power=self.power)
+ self.device_hashes = list(self.device_hashes_cnts_dict.keys())
+ self.device_hashes_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.device_hashes_cnts_dict)
+ self.device_hashes_type_dict = self.gen_device_types(device_hashes=self.device_hashes, fpath_smartphones=self.fpath_smartphones)
+ self.device_shared_idhash_map_dict = gen_shared_idhashes(idhashes=self.device_hashes, prop_shared_idhashes=self.prop_shared_device_hashes)
@beartype
- def gen_device_type(
+ def gen_device_types(
self,
- device_hashes:list,
- fpath_smartphones:str
- ) -> dict:
+ device_hashes:List[str],
+ fpath_smartphones:str,
+ ) -> Dict[str, str]:
"""
Generates a dictionary of random device types
-
+
Parameters
----------
- device_hashes : list
+ device_hashes : List[str]
The device hashes.
fpath_smartphones : str
The file path to the smart phones reference file.
-
+
Returns
-------
- dict
+ Dict[str, str]
A dictionary of device hash types.
"""
# load in smartphone data
diff --git a/generator/objects/Ip.py b/generator/objects/Ip.py
index ad28150..48644b6 100644
--- a/generator/objects/Ip.py
+++ b/generator/objects/Ip.py
@@ -1,53 +1,57 @@
import cons
-import numpy as np
from utilities.gen_idhash_cnt_dict import gen_idhash_cnt_dict
from utilities.cnt2prop_dict import cnt2prop_dict
from utilities.gen_country_codes_dict import gen_country_codes_dict
from utilities.gen_shared_idhashes import gen_shared_idhashes
+
+import numpy as np
from beartype import beartype
from typing import Union
class Ip:
-
+
@beartype
def __init__(
self,
n_ip_hashes:Union[int,np.int64],
- fpath_countrieseurope:str=cons.fpath_countrieseurope
+ fpath_countries_europe:str=cons.fpath_countries_europe,
):
"""
The randomly generated ip data model object.
-
+
Parameters
----------
n_ip_hashes : int
The number of ip hashes to generate.
- fpath_countrieseurope : str
- The file path to the european countries reference file, default is cons.fpath_countrieseurope.
-
+ fpath_countries_europe : str
+ The file path to the european countries reference file, default is cons.fpath_countries_europe.
+
Attributes
----------
n_ip_hashes : int
The number of ip hashes generated.
lam : float
The lambda parameter of the squared poisson distribution used to generate the ip hash counts.
+ power : float
+ The power parameter of the squared poisson distribution used to generate the ip hash counts.
prop_shared_ip_hashes : float
The population proportion of shared ip hashes.
- ip_hashes_cnts_dict : dict
+ ip_hashes_cnts_dict : Dict[str, int]
The ip hash counts dictionary.
- ip_hashes_props_dict : dict
+ ip_hashes_props_dict : Dict[str, float]
The ip hash proportions dictionary.
- ip_hashes_country_code_dict : dict
+ ip_hashes_country_code_dict : Dict[str, str]
The ip hash country codes dictionary.
- ip_hashes_shared_props_dict : dict
- The shared ip hash proportions dictionary.
+ ip_shared_idhash_map_dict : Dict[str, str]
+ The shared ip hash mapping dictionary.
"""
self.n_ip_hashes = n_ip_hashes
- self.fpath_countrieseurope = fpath_countrieseurope
+ self.fpath_countries_europe = fpath_countries_europe
self.lam = cons.data_model_poisson_params["ip"]["lambda"]
self.power = cons.data_model_poisson_params["ip"]["power"]
self.prop_shared_ip_hashes = cons.data_model_shared_entities_dict["ip"]
- self.ip_hashes_cnts_dict = gen_idhash_cnt_dict(idhash_type="hash", n=self.n_ip_hashes, lam=self.lam)
- self.ip_hashes_props_dict = cnt2prop_dict(self.ip_hashes_cnts_dict)
- self.ip_hashes_country_code_dict = gen_country_codes_dict(self.ip_hashes_cnts_dict, self.fpath_countrieseurope)
- self.ip_shared_idhash_map_dict = gen_shared_idhashes(self.ip_hashes_cnts_dict, self.prop_shared_ip_hashes)
+ self.ip_hashes_cnts_dict = gen_idhash_cnt_dict(idhash_type="hash", n=self.n_ip_hashes, lam=self.lam, power=self.power)
+ self.ip_hashes = list(self.ip_hashes_cnts_dict.keys())
+ self.ip_hashes_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.ip_hashes_cnts_dict)
+ self.ip_hashes_country_code_dict = gen_country_codes_dict(idhashes=self.ip_hashes, fpath_countries_europe=self.fpath_countries_europe)
+ self.ip_shared_idhash_map_dict = gen_shared_idhashes(idhashes=self.ip_hashes, prop_shared_idhashes=self.prop_shared_ip_hashes)
\ No newline at end of file
diff --git a/generator/objects/Transaction.py b/generator/objects/Transaction.py
index 1ef0911..3265174 100644
--- a/generator/objects/Transaction.py
+++ b/generator/objects/Transaction.py
@@ -1,25 +1,25 @@
-import numpy as np
-import pandas as pd
-from datetime import datetime
import cons
from utilities.gen_idhash_cnt_dict import gen_idhash_cnt_dict
from utilities.cnt2prop_dict import cnt2prop_dict
from utilities.gen_dates_dict import gen_dates_dict
from utilities.round_trans_amount import round_trans_amount
+
+import numpy as np
from beartype import beartype
+from typing import List, Dict, Union
class Transaction:
-
+
@beartype
def __init__(
self,
- n_transaction_hashes,
- start_date,
- end_date
+ n_transaction_hashes:Union[int,np.int64],
+ start_date:str,
+ end_date:str,
):
"""
The randomly generated transaction data model object.
-
+
Parameters
----------
n_transaction_hashes : int
@@ -28,7 +28,7 @@ def __init__(
The start date to generate transactions from.
end_date : str
The end date to generate transaction till.
-
+
Attributes
----------
n_transaction_hashes : int
@@ -39,23 +39,19 @@ def __init__(
The date transactions are generated till, must be of the form '%Y-%m-%d'.
lam : float
The lambda parameter of the squared poisson distribution used to generate the transaction hash counts.
- payment_channels : float
- The population proportion of payment channels.
- transaction_status : float
+ power : float
+ The power parameter of the squared poisson distribution used to generate the transaction hash counts.
+ transaction_status : Dict[str, float]
The population proportion of transaction statuses.
- rejection_codes : float
- The population proportion of rejection codes.
- transaction_hashes_cnts_dict : dict
+ transaction_hashes_cnts_dict : Dict[str, int]
The transaction hash counts dictionary.
- transaction_hashes_props_dict : dict
+ transaction_hashes_props_dict : Dict[str, float]
The transaction hash proportions dictionary.
- transaction_hashes_dates_dict : dict
+ transaction_hashes_dates_dict : Dict[str, str]
The transaction hash dates dictionary.
- transaction_hashes_payment_channel_dict : dict
- The transaction hash payment channels dictionary.
- transaction_hashes_status_dict : dict
+ transaction_hashes_status_dict : Dict[str, str]
The transaction hash status dictionary.
- transaction_hashes_amounts_dict : dict
+ transaction_hashes_amounts_dict : Dict[str, float]
The transaction hash amount dictionary.
"""
self.n_transaction_hashes = n_transaction_hashes
@@ -64,31 +60,32 @@ def __init__(
self.lam = cons.data_model_poisson_params["transaction"]["lambda"]
self.power = cons.data_model_poisson_params["transaction"]["power"]
self.transaction_status = cons.data_model_transaction_status
- self.transaction_hashes_cnts_dict = gen_idhash_cnt_dict(idhash_type="hash", n=self.n_transaction_hashes, lam=self.lam)
- self.transaction_hashes_props_dict = cnt2prop_dict(self.transaction_hashes_cnts_dict)
- self.transaction_hashes_dates_dict = gen_dates_dict(self.transaction_hashes_cnts_dict,start_date=self.start_date,end_date=self.end_date,)
- self.transaction_hashes_status_dict = self.gen_transaction_status(list(self.transaction_hashes_cnts_dict.keys()), self.transaction_status)
- self.transaction_hashes_amounts_dict = self.gen_transaction_amounts(list(self.transaction_hashes_cnts_dict.keys()))
-
+ self.transaction_hashes_cnts_dict = gen_idhash_cnt_dict(idhash_type="hash", n=self.n_transaction_hashes, lam=self.lam, power=self.power)
+ self.transaction_hashes = list(self.transaction_hashes_cnts_dict.keys())
+ self.transaction_hashes_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.transaction_hashes_cnts_dict)
+ self.transaction_hashes_dates_dict = gen_dates_dict(idhashes=self.transaction_hashes,start_date=self.start_date,end_date=self.end_date,)
+ self.transaction_hashes_status_dict = self.gen_transaction_status(transaction_hashes=self.transaction_hashes, transaction_status=self.transaction_status)
+ self.transaction_hashes_amounts_dict = self.gen_transaction_amounts(transaction_hashes=self.transaction_hashes, loc=0, scale=2)
+
@beartype
def gen_transaction_status(
self,
- transaction_hashes:list,
- transaction_status:dict
+ transaction_hashes:List[str],
+ transaction_status:Dict[str, float],
-        ):
+        ) -> Dict[str, str]:
"""
Generates a dictionary of random transaction statuses
-
+
Parameters
----------
- transaction_hashes : list
+ transaction_hashes : List[str]
The transaction hashes
- transaction_status : dict
+ transaction_status : Dict[str, float]
The population proportion of transaction statuses
-
+
Returns
-------
- dict
+ Dict[str, str]
A dictionary of transaction statuses
"""
# randomly sample transaction status based on population proportions
@@ -103,29 +100,29 @@ def gen_transaction_status(
# return transaction hashes and statuses
transaction_hashes_status_dict = dict(zip(transaction_hashes, transaction_status))
return transaction_hashes_status_dict
-
+
@beartype
def gen_transaction_amounts(
self,
- transaction_hashes:list,
- loc:float=0,
- scale:float=2
- ):
+ transaction_hashes:List[str],
+ loc:Union[int, float]=0,
+ scale:Union[int, float]=2,
+ ) -> Dict[str, float]:
"""
Generates a dictionary of random transaction hash amounts.
-
+
Parameters
----------
- transaction_hashes : list
+ transaction_hashes : List[str]
The transaction hashes.
loc : float
The mean of the transaction amount distribution to generate, default is 0.
scale : float
The scale of the transaction amount distribution to generate, default is 2.
-
+
Returns
-------
- dict
+ Dict[str, float]
-            A dictionary of transaction hash prices
+            A dictionary of transaction hash amounts
"""
-        # randomly sample transaction prices from an absolute normal distribution with mean 0 and standard deviation 2
+        # randomly sample transaction amounts from an absolute normal distribution with mean 0 and standard deviation 2
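
The amount sampling named in the comment above draws from an absolute normal distribution with the loc and scale defaults typed in the new signature. A minimal sketch of that step; the repository's round_trans_amount helper is not shown in this diff, so plain rounding stands in for it:

import numpy as np

transaction_hashes = ["h1", "h2", "h3"]
# |N(loc=0, scale=2)| gives non-negative, right-skewed transaction amounts
amounts = np.abs(np.random.normal(loc=0, scale=2, size=len(transaction_hashes)))
transaction_hashes_amounts_dict = dict(zip(transaction_hashes, np.round(amounts, 2)))
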
diff --git a/generator/objects/User.py b/generator/objects/User.py
index f89aeef..475d79a 100644
--- a/generator/objects/User.py
+++ b/generator/objects/User.py
@@ -1,14 +1,16 @@
import cons
-import numpy as np
-import pandas as pd
from utilities.gen_idhash_cnt_dict import gen_idhash_cnt_dict
from utilities.cnt2prop_dict import cnt2prop_dict
from utilities.gen_country_codes_dict import gen_country_codes_dict
from utilities.gen_dates_dict import gen_dates_dict
+
+import numpy as np
+import pandas as pd
from beartype import beartype
+from typing import Dict
class User:
-
+
@beartype
def __init__(
self,
@@ -17,12 +19,12 @@ def __init__(
end_date:str,
fpath_firstnames:str=cons.fpath_firstnames,
fpath_lastnames:str=cons.fpath_lastnames,
- fpath_countrieseurope:str=cons.fpath_countrieseurope,
- fpath_domain_email:str=cons.fpath_domain_email
+ fpath_countries_europe:str=cons.fpath_countries_europe,
+        fpath_email_domain:str=cons.fpath_email_domain,
):
"""
The randomly generated user data model object
-
+
Parameters
----------
n_user_ids : int
@@ -35,11 +37,11 @@ def __init__(
The full file path to the first names reference data, default is cons.fpath_firstnames.
fpath_lastnames : str
The full file path to the last names reference data, default is cons.fpath_lastnames.
- fpath_countrieseurope : str
- The full file path to the europe countries reference data, default is cons.fpath_countrieseurope.
- fpath_domain_email : str
- The full file path to the email domain reference daa, default is cons.fpath_domain_email.
-
+        fpath_countries_europe : str
+            The full file path to the european countries reference data, default is cons.fpath_countries_europe.
+        fpath_email_domain : str
+            The full file path to the email domain reference data, default is cons.fpath_email_domain.
+
Attributes
----------
n_user_ids : int
@@ -50,19 +52,21 @@ def __init__(
The date user ids are generated till, must be of the form '%Y-%m-%d'
lam : float
The lambda parameter of the squared poisson distribution used to generate the user ids counts
- user_ids_cnts_dict : dict
+ power : float
+ The power parameter of the squared poisson distribution used to generate the user ids counts
+ user_ids_cnts_dict : Dict[str, int]
The user id counts dictionary
- user_ids_props_dict : dict
+ user_ids_props_dict : Dict[str, float]
The user id proportions dictionary
- user_ids_firstname_dict : dict
+ user_ids_firstname_dict : Dict[str, str]
The user id first names dictionary
- user_ids_lastname_dict : dict
+ user_ids_lastname_dict : Dict[str, str]
The user id last names dictionary
- user_ids_country_code_dict : dict
+ user_ids_country_code_dict : Dict[str, str]
The user id country codes dictionary
- user_ids_email_domain_dict : dict
+ user_ids_email_domain_dict : Dict[str, str]
The user id email domains dictionary
- user_ids_dates_dict : dict
+ user_ids_dates_dict : Dict[str, str]
The user id dates dictionary
"""
self.n_user_ids = n_user_ids
@@ -70,34 +74,35 @@ def __init__(
self.end_date = end_date
self.fpath_firstnames = fpath_firstnames
self.fpath_lastnames = fpath_lastnames
- self.fpath_countrieseurope = fpath_countrieseurope
- self.fpath_domain_email = fpath_domain_email
+ self.fpath_countries_europe = fpath_countries_europe
+ self.fpath_email_domain = fpath_email_domain
self.lam = cons.data_model_poisson_params["user"]["lambda"]
self.power = cons.data_model_poisson_params["user"]["power"]
- self.user_ids_cnts_dict = gen_idhash_cnt_dict(idhash_type="id", n=self.n_user_ids, lam=self.lam)
- self.user_ids_props_dict = cnt2prop_dict(self.user_ids_cnts_dict)
- self.user_ids_country_code_dict = gen_country_codes_dict(self.user_ids_cnts_dict, self.fpath_countrieseurope)
- self.user_ids_firstname_dict = self.gen_user_firstname(self.fpath_firstnames)
- self.user_ids_lastname_dict = self.gen_user_lastname(self.fpath_lastnames)
- self.user_ids_email_domain_dict = self.gen_user_email_domain(self.fpath_domain_email)
- self.user_ids_dates_dict = gen_dates_dict(self.user_ids_cnts_dict, start_date=self.start_date, end_date=self.end_date)
-
+ self.user_ids_cnts_dict = gen_idhash_cnt_dict(idhash_type="id", n=self.n_user_ids, lam=self.lam, power=self.power)
+ self.user_ids = list(self.user_ids_cnts_dict.keys())
+ self.user_ids_props_dict = cnt2prop_dict(idhashes_cnts_dict=self.user_ids_cnts_dict)
+ self.user_ids_country_code_dict = gen_country_codes_dict(idhashes=self.user_ids, fpath_countries_europe=self.fpath_countries_europe)
+ self.user_ids_firstname_dict = self.gen_user_firstname(fpath_firstnames=self.fpath_firstnames)
+ self.user_ids_lastname_dict = self.gen_user_lastname(fpath_lastnames=self.fpath_lastnames)
+ self.user_ids_email_domain_dict = self.gen_user_email_domain(fpath_email_domain=self.fpath_email_domain)
+ self.user_ids_dates_dict = gen_dates_dict(idhashes=self.user_ids, start_date=self.start_date, end_date=self.end_date)
+
@beartype
def gen_user_firstname(
self,
- fpath_firstnames:str
- ) -> dict:
+ fpath_firstnames:str,
+ ) -> Dict[str, str]:
"""
Generates a dictionary of random user id first names
-
+
Parameters
----------
fpath_firstnames : str
The file path to the first names reference file
-
+
Returns
-------
- dict
+ Dict[str, str]
A dictionary of user id first names
"""
# load in list of first names
@@ -111,23 +116,23 @@ def gen_user_firstname(
# convert key value pairs to dict
user_ids_firstname_dict = pd.concat([pd.Series(d) for d in user_ids_names_pairs])[country_code_dataframe["user_ids"]].to_dict()
return user_ids_firstname_dict
-
+
@beartype
def gen_user_lastname(
self,
- fpath_lastnames:str
- ) -> dict:
+ fpath_lastnames:str,
+ ) -> Dict[str, str]:
"""
Generates a dictionary of random user id last names.
-
+
Parameters
----------
fpath_lastnames : str
The file path to the last names reference file.
-
+
Returns
-------
- dict
+ Dict[str, str]
A dictionary of user id last names.
"""
# load in list of last names
@@ -141,42 +146,40 @@ def gen_user_lastname(
# convert key value pairs to dict
user_ids_lastname_dict = pd.concat([pd.Series(d) for d in user_ids_names_pairs])[country_code_dataframe["user_ids"]].to_dict()
return user_ids_lastname_dict
-
+
@beartype
def gen_user_email_domain(
self,
- fpath_domain_email:str
- ) -> dict:
+ fpath_email_domain:str,
+ ) -> Dict[str, str]:
"""
Generates a dictionary of random user id email domains
-
+
Parameters
----------
- fpath_domain_email : str
+ fpath_email_domain : str
The file path to the email domains reference file
-
+
Returns
-------
- dict
+ Dict[str, str]
A dictionary of user id email domains
"""
# load domain names data
- email_domain_data = pd.read_csv(fpath_domain_email, index_col=0)
+ email_domain_data = pd.read_csv(fpath_email_domain, index_col=0)
# calculate the proportion of email domains
email_domain_data["proportion"] = email_domain_data["proportion"].divide(email_domain_data["proportion"].sum())
# convert email domain proportions to a dictionary
email_domain_dict = email_domain_data.set_index("domain").to_dict()["proportion"]
- # extract the user ids
- user_ids_list = list(self.user_ids_cnts_dict.keys())
# randomly choose the email domains based on proportions
user_email_domain_list = list(
np.random.choice(
a=list(email_domain_dict.keys()),
p=list(email_domain_dict.values()),
replace=True,
- size=len(user_ids_list),
+ size=len(self.user_ids),
)
)
# return the user ids email domains
- user_ids_email_domain_dict = dict(zip(user_ids_list, user_email_domain_list))
+ user_ids_email_domain_dict = dict(zip(self.user_ids, user_email_domain_list))
return user_ids_email_domain_dict
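
gen_user_email_domain normalises the reference proportions before sampling so that np.random.choice receives a valid probability vector. The same guard in isolation, with toy numbers:

import numpy as np
import pandas as pd

email_domain_data = pd.DataFrame({"domain": ["gmail.com", "yahoo.com"], "proportion": [3, 1]})
# divide by the column total so the weights sum to exactly 1.0
email_domain_data["proportion"] = email_domain_data["proportion"].divide(email_domain_data["proportion"].sum())
domains = np.random.choice(a=email_domain_data["domain"], p=email_domain_data["proportion"], size=5)
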
diff --git a/generator/unittests/app/test_gen_user_trans_data.py b/generator/unittests/app/test_gen_user_trans_data.py
index 07e5df8..540bb1c 100644
--- a/generator/unittests/app/test_gen_user_trans_data.py
+++ b/generator/unittests/app/test_gen_user_trans_data.py
@@ -36,8 +36,8 @@
# create relative file paths
fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1]
fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1]
-fpath_countrieseurope = '.' + cons.fpath_countrieseurope.split(cons.fpath_repo_dir)[1]
-fpath_domain_email = '.' + cons.fpath_domain_email.split(cons.fpath_repo_dir)[1]
+fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1]
+fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1]
fpath_smartphones = '.' + cons.fpath_smartphones.split(cons.fpath_repo_dir)[1]
fpath_countrycrimeindex = '.' + cons.fpath_countrycrimeindex.split(cons.fpath_repo_dir)[1]
fpath_unittest_user_data = '.' + cons.fpath_unittest_user_data.split(cons.fpath_repo_dir)[1]
@@ -45,13 +45,13 @@
# generate random users
user_obj = User(
- n_user_ids=programmeparams.n_users,
- start_date=programmeparams.registration_start_date,
- end_date=programmeparams.registration_end_date,
- fpath_firstnames=fpath_firstnames,
- fpath_lastnames=fpath_lastnames,
- fpath_countrieseurope=fpath_countrieseurope,
- fpath_domain_email=fpath_domain_email
+ n_user_ids=programmeparams.n_users,
+ start_date=programmeparams.registration_start_date,
+ end_date=programmeparams.registration_end_date,
+ fpath_firstnames=fpath_firstnames,
+ fpath_lastnames=fpath_lastnames,
+ fpath_countries_europe=fpath_countries_europe,
+ fpath_email_domain=fpath_email_domain
)
# generate random entity counts for each user
@@ -59,8 +59,8 @@
# generate random entity values
device_obj = Device(n_device_hashes=random_entity_counts['n_devices'].sum(), fpath_smartphones=fpath_smartphones)
-card_obj = Card(n_card_hashes=random_entity_counts['n_cards'].sum(), fpath_countrieseurope=fpath_countrieseurope)
-ip_obj = Ip(n_ip_hashes=random_entity_counts['n_ips'].sum(), fpath_countrieseurope=fpath_countrieseurope)
+card_obj = Card(n_card_hashes=random_entity_counts['n_cards'].sum(), fpath_countries_europe=fpath_countries_europe)
+ip_obj = Ip(n_ip_hashes=random_entity_counts['n_ips'].sum(), fpath_countries_europe=fpath_countries_europe)
transaction_obj = Transaction(n_transaction_hashes=random_entity_counts['n_transactions'].sum(), start_date=programmeparams.transaction_start_date, end_date=programmeparams.transaction_end_date)
application_obj = Application(n_application_hashes=programmeparams.n_applications)
diff --git a/generator/unittests/objects/test_Card.py b/generator/unittests/objects/test_Card.py
index 4b9ef35..688455f 100644
--- a/generator/unittests/objects/test_Card.py
+++ b/generator/unittests/objects/test_Card.py
@@ -42,8 +42,8 @@
random.seed(cons.unittest_seed)
np.random.seed(cons.unittest_seed)
-fpath_countrieseurope = '.' + cons.fpath_countrieseurope.split(cons.fpath_repo_dir)[1]
-card_object = Card(n_card_hashes=exp_n_card_hashes, fpath_countrieseurope=fpath_countrieseurope)
+fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1]
+card_object = Card(n_card_hashes=exp_n_card_hashes, fpath_countries_europe=fpath_countries_europe)
obs_card_hashes_cnts_dict = card_object.card_hashes_cnts_dict
obs_card_types_dict = card_object.card_types_dict
diff --git a/generator/unittests/objects/test_Ip.py b/generator/unittests/objects/test_Ip.py
index b8207b8..d7490fa 100644
--- a/generator/unittests/objects/test_Ip.py
+++ b/generator/unittests/objects/test_Ip.py
@@ -35,8 +35,8 @@
random.seed(cons.unittest_seed)
np.random.seed(cons.unittest_seed)
-fpath_countrieseurope = '.' + cons.fpath_countrieseurope.split(cons.fpath_repo_dir)[1]
-ip_object = Ip(n_ip_hashes=exp_n_ip_hashes, fpath_countrieseurope=fpath_countrieseurope)
+fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1]
+ip_object = Ip(n_ip_hashes=exp_n_ip_hashes, fpath_countries_europe=fpath_countries_europe)
obs_ip_hashes_cnts_dict = ip_object.ip_hashes_cnts_dict
obs_ip_hashes_props_dict = ip_object.ip_hashes_props_dict
diff --git a/generator/unittests/objects/test_User.py b/generator/unittests/objects/test_User.py
index 80d911c..92a30bd 100644
--- a/generator/unittests/objects/test_User.py
+++ b/generator/unittests/objects/test_User.py
@@ -10,46 +10,46 @@
from objects.User import User
exp_user_ids_cnts_dict = {
- "6374692674377254": 420,
- "1751409580926382": 318,
- "4264861381989413": 244,
- "6720317315593519": 387,
+ "6374692674377254": 20,
+ "1751409580926382": 29,
+ "4264861381989413": 19,
+ "6720317315593519": 26,
}
exp_user_ids_props_dict = {
- "6374692674377254": 0.30679327976625276,
- "1751409580926382": 0.2322863403944485,
- "4264861381989413": 0.17823228634039445,
- "6720317315593519": 0.28268809349890434,
+ "6374692674377254": 0.2127659574468085,
+ "1751409580926382": 0.30851063829787234,
+ "4264861381989413": 0.20212765957446807,
+ "6720317315593519": 0.2765957446808511,
}
exp_user_ids_firstname_dict = {
- "6374692674377254": "ernst",
- "1751409580926382": "mykhaylo",
- "4264861381989413": "hugo",
- "6720317315593519": "alexandra",
+ "6374692674377254": "simone",
+ "1751409580926382": "francesca",
+ "4264861381989413": "igor",
+ "6720317315593519": "beckett",
}
exp_user_ids_lastname_dict = {
- "6374692674377254": "buchmann",
- "1751409580926382": "lyashenko",
- "4264861381989413": "diaz",
- "6720317315593519": "mariana",
+ "6374692674377254": "de filippo",
+ "1751409580926382": "gagliardi",
+ "4264861381989413": "lupu",
+ "6720317315593519": "leslie",
}
exp_user_ids_country_code_dict = {
- "6374692674377254": 276,
- "1751409580926382": 804,
- "4264861381989413": 724,
- "6720317315593519": 642,
+ "6374692674377254": 380,
+ "1751409580926382": 380,
+ "4264861381989413": 498,
+ "6720317315593519": 826,
}
exp_user_ids_email_domain_dict = {
- "6374692674377254": "gmail.com",
+ "6374692674377254": "yahoo.com",
"1751409580926382": "yahoo.com",
- "4264861381989413": "aol.com",
- "6720317315593519": "hotmail.com",
+ "4264861381989413": "yahoo.com",
+ "6720317315593519": "gmail.com",
}
exp_user_ids_dates_dict = {
- "6374692674377254": np.datetime64("2020-06-20T00:00:00.000000000"),
- "1751409580926382": np.datetime64("2020-12-25T00:00:00.000000000"),
- "4264861381989413": np.datetime64("2020-08-01T00:00:00.000000000"),
- "6720317315593519": np.datetime64("2020-02-04T00:00:00.000000000"),
+ "6374692674377254": np.datetime64("2020-03-21T00:00:00.000000000"),
+ "1751409580926382": np.datetime64("2020-06-11T00:00:00.000000000"),
+ "4264861381989413": np.datetime64("2020-10-15T00:00:00.000000000"),
+ "6720317315593519": np.datetime64("2020-09-17T00:00:00.000000000"),
}
exp_start_date = cons.unittest_registration_start_date
exp_end_date = cons.unittest_registration_end_date
@@ -61,9 +61,9 @@
fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1]
fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1]
-fpath_countrieseurope = '.' + cons.fpath_countrieseurope.split(cons.fpath_repo_dir)[1]
-fpath_domain_email = '.' + cons.fpath_domain_email.split(cons.fpath_repo_dir)[1]
-user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countrieseurope=fpath_countrieseurope, fpath_domain_email=fpath_domain_email)
+fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1]
+fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1]
+user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain)
obs_user_ids_cnts_dict = user_object.user_ids_cnts_dict
obs_user_ids_props_dict = user_object.user_ids_props_dict
diff --git a/generator/unittests/utilities/test_align_country_codes.py b/generator/unittests/utilities/test_align_country_codes.py
index 787d9f9..d0626b1 100644
--- a/generator/unittests/utilities/test_align_country_codes.py
+++ b/generator/unittests/utilities/test_align_country_codes.py
@@ -45,7 +45,7 @@
},
{
"registration_country_code_alpha": 353,
- "ip_country_code_alpha": 353.0,
+ "ip_country_code_alpha": 42.0,
"card_country_code_alpha": np.nan,
},
{
@@ -62,7 +62,7 @@
)
obs_data_df = input_data_df.apply(
lambda series: align_country_codes(
- series, proba_comm_ip=0.95, proba_comm_card=0.99
+ series, proba_comm_ip=0.05, proba_comm_card=0.01
),
axis=1,
)
diff --git a/generator/unittests/utilities/test_gen_country_codes_dict.py b/generator/unittests/utilities/test_gen_country_codes_dict.py
index fbad890..f13fd23 100644
--- a/generator/unittests/utilities/test_gen_country_codes_dict.py
+++ b/generator/unittests/utilities/test_gen_country_codes_dict.py
@@ -10,16 +10,16 @@
np.random.seed(cons.unittest_seed)
-cnt_data = {"a": 1, "b": 2, "c": 3, "d": 4}
+idhashes = ["a", "b", "c", "d"]
exp_prop_dict = {"a": 276, "b": 756, "c": 642, "d": 826}
-fpath_countrieseurope = '.' + cons.fpath_countrieseurope.split(cons.fpath_repo_dir)[1]
-obs_prop_dict = gen_country_codes_dict(cnt_data, fpath_countrieseurope=fpath_countrieseurope)
+fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1]
+obs_prop_dict = gen_country_codes_dict(idhashes=idhashes, fpath_countries_europe=fpath_countries_europe)
class Test_gen_country_codes_dict(unittest.TestCase):
""""""
def setUp(self):
- self.cnt_data = cnt_data
+ self.idhashes = idhashes
self.obs_prop_dict = obs_prop_dict
self.exp_prop_dict = exp_prop_dict
diff --git a/generator/unittests/utilities/test_gen_country_codes_map.py b/generator/unittests/utilities/test_gen_country_codes_map.py
index 6ab2d89..3e84fa5 100644
--- a/generator/unittests/utilities/test_gen_country_codes_map.py
+++ b/generator/unittests/utilities/test_gen_country_codes_map.py
@@ -18,10 +18,10 @@
292: 'GI', 492: 'MC', 336: 'VA'
}
-fpath_countrieseurope = '.' + cons.fpath_countrieseurope.split(cons.fpath_repo_dir)[1]
-obs_country_codes_map = gen_country_codes_map(fpath_countrieseurope=fpath_countrieseurope)
+fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1]
+obs_country_codes_map = gen_country_codes_map(fpath_countries_europe=fpath_countries_europe)
-class Test_gen_country_codes_dict(unittest.TestCase):
+class Test_gen_country_codes_map(unittest.TestCase):
""""""
def setUp(self):
diff --git a/generator/unittests/utilities/test_gen_dates_dict.py b/generator/unittests/utilities/test_gen_dates_dict.py
index 09e8dc8..fb3d99a 100644
--- a/generator/unittests/utilities/test_gen_dates_dict.py
+++ b/generator/unittests/utilities/test_gen_dates_dict.py
@@ -10,21 +10,21 @@
np.random.seed(cons.unittest_seed)
-cnt_data = {"a": 1, "b": 2, "c": 3, "d": 4}
+idhashes = ["a", "b", "c", "d"]
exp_prop_dict = {
"a": np.datetime64("2020-04-12T00:00:00.000000000"),
"b": np.datetime64("2021-03-11T00:00:00.000000000"),
"c": np.datetime64("2020-09-27T00:00:00.000000000"),
"d": np.datetime64("2020-04-16T00:00:00.000000000"),
}
-obs_prop_dict = gen_dates_dict(cnt_data, start_date="2020-01-01", end_date="2021-12-31")
+obs_prop_dict = gen_dates_dict(idhashes, start_date="2020-01-01", end_date="2021-12-31")
class Test_gen_dates_dict(unittest.TestCase):
""""""
def setUp(self):
- self.cnt_data = cnt_data
+ self.idhashes = idhashes
self.obs_prop_dict = obs_prop_dict
self.exp_prop_dict = exp_prop_dict
diff --git a/generator/unittests/utilities/test_gen_obj_idhash_series.py b/generator/unittests/utilities/test_gen_obj_idhash_series.py
index e94bf44..18faa86 100644
--- a/generator/unittests/utilities/test_gen_obj_idhash_series.py
+++ b/generator/unittests/utilities/test_gen_obj_idhash_series.py
@@ -22,23 +22,23 @@
n_user_ids = cons.unittest_n_entities
fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1]
fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1]
-fpath_countrieseurope = '.' + cons.fpath_countrieseurope.split(cons.fpath_repo_dir)[1]
-fpath_domain_email = '.' + cons.fpath_domain_email.split(cons.fpath_repo_dir)[1]
+fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1]
+fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1]
fpath_smartphones = '.' + cons.fpath_smartphones.split(cons.fpath_repo_dir)[1]
random.seed(cons.unittest_seed)
np.random.seed(cons.unittest_seed)
# create user object
-user_object = User(n_user_ids=n_user_ids, start_date=start_date, end_date=end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countrieseurope=fpath_countrieseurope, fpath_domain_email=fpath_domain_email)
+user_object = User(n_user_ids=n_user_ids, start_date=start_date, end_date=end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain)
# generate random entity counts
random_entity_counts = gen_random_entity_counts(user_obj=user_object)
# generate random entity values
device_obj = Device(n_device_hashes=random_entity_counts['n_devices'].sum(), fpath_smartphones=fpath_smartphones)
# generate user data and device hashes
user_data = random_entity_counts.copy()
-obs_obj_idhash_series = gen_obj_idhash_series(idhashes_props_dict=device_obj.device_hashes_props_dict, n_counts_series=user_data['n_devices'])
-exp_obj_idhash_series = pd.Series([['8c1fd1152fc83030', 'd4f37f7620f0fba2', '565dd55c257aa14d'], ['0bef04bcf232f0f0'], ['bbdcd452b847c0d4'], ['e2b03ec4f60f2f18']])
+obs_obj_idhash_series = gen_obj_idhash_series(idhashes=device_obj.device_hashes, n_counts_series=user_data['n_devices'])
+exp_obj_idhash_series = pd.Series([['2e23f63807f6170a'], ['b8816ed926bf9f83', 'b010fdb44fa68822'], ['ff23757073a07357'], ['3d2fd828c1fd1152']])
-class Test_gen_idhash_cnt_dict(unittest.TestCase):
+class Test_gen_obj_idhash_series(unittest.TestCase):
""""""
diff --git a/generator/unittests/utilities/test_gen_random_entity_counts.py b/generator/unittests/utilities/test_gen_random_entity_counts.py
index dffdb3d..58a5522 100644
--- a/generator/unittests/utilities/test_gen_random_entity_counts.py
+++ b/generator/unittests/utilities/test_gen_random_entity_counts.py
@@ -21,17 +21,17 @@
fpath_firstnames = '.' + cons.fpath_llama_firstnames.split(cons.fpath_repo_dir)[1]
fpath_lastnames = '.' + cons.fpath_llama_lastnames.split(cons.fpath_repo_dir)[1]
-fpath_countrieseurope = '.' + cons.fpath_countrieseurope.split(cons.fpath_repo_dir)[1]
-fpath_domain_email = '.' + cons.fpath_domain_email.split(cons.fpath_repo_dir)[1]
-user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countrieseurope=fpath_countrieseurope, fpath_domain_email=fpath_domain_email)
+fpath_countries_europe = '.' + cons.fpath_countries_europe.split(cons.fpath_repo_dir)[1]
+fpath_email_domain = '.' + cons.fpath_email_domain.split(cons.fpath_repo_dir)[1]
+user_object = User(n_user_ids=exp_n_user_ids, start_date=exp_start_date, end_date=exp_end_date, fpath_firstnames=fpath_firstnames, fpath_lastnames=fpath_lastnames, fpath_countries_europe=fpath_countries_europe, fpath_email_domain=fpath_email_domain)
exp_randomentity_counts_dict = {
- 'uid': ['1751409580926382', '6720317315593519', '4264861381989413', '6374692674377254'],
- 'n_devices': [3, 1, 1, 1],
- 'n_cards': [1, 1, 1, 1],
- 'n_ips': [5, 6, 3, 4],
- 'n_transactions': [55, 69, 54, 54],
- 'n_applications': [3, 10, 28, 6]
+ 'uid': ['6374692674377254', '6720317315593519', '4264861381989413', '1751409580926382'],
+ 'n_devices': [1, 2, 1, 1],
+ 'n_cards': [1, 1, 1, 1],
+ 'n_ips': [3, 5, 5, 1],
+ 'n_transactions': [72, 16, 13, 29],
+ 'n_applications': [4, 2, 3, 5]
}
exp_randomentity_counts_df = pd.DataFrame.from_dict(exp_randomentity_counts_dict)
diff --git a/generator/unittests/utilities/test_gen_shared_idhashes.py b/generator/unittests/utilities/test_gen_shared_idhashes.py
index e1e24f2..6afc519 100644
--- a/generator/unittests/utilities/test_gen_shared_idhashes.py
+++ b/generator/unittests/utilities/test_gen_shared_idhashes.py
@@ -14,8 +14,8 @@
np.random.seed(cons.unittest_seed)
obs_prop_shared_idhashes=cons.data_model_shared_entities_dict["ip"]
-obs_hash_cnt_dict = gen_idhash_cnt_dict(idhash_type="hash", n=4, lam=1, nbytes=16)
-obs_shared_idhashes = gen_shared_idhashes(idhash_cnt_dict=obs_hash_cnt_dict, prop_shared_idhashes=obs_prop_shared_idhashes)
+idhashes = list(gen_idhash_cnt_dict(idhash_type="hash", n=4, lam=1, nbytes=16).keys())
+obs_shared_idhashes = gen_shared_idhashes(idhashes=idhashes, prop_shared_idhashes=obs_prop_shared_idhashes)
exp_shared_idhashes = {}
class Test_gen_shared_idhashes(unittest.TestCase):
diff --git a/generator/utilities/Bedrock.py b/generator/utilities/Bedrock.py
index ab2fa79..9ea42d6 100644
--- a/generator/utilities/Bedrock.py
+++ b/generator/utilities/Bedrock.py
@@ -4,29 +4,86 @@
class Bedrock():
"""
+ Bedrock AWS API client wrapper for invoking language models.
+ This class provides a simplified interface to interact with AWS Bedrock runtime,
+ enabling prompt-based interactions with language models like Llama 3.
+
+ Parameters
+ ----------
+ session : boto3.Session
+ A Boto3 session object configured with appropriate AWS credentials.
+ model_region: str
+ The AWS region where the Bedrock model is hosted.
+ model_id: str
+ The identifier of the Bedrock model to use.
+
+ Attributes
+ ----------
+ client: boto3.Session.client
+ Boto3 Bedrock runtime client for model invocation.
+ model_id: str
+ The identifier of the Bedrock model to use.
+
+ References
+ ----------
https://docs.aws.amazon.com/general/latest/gr/bedrock.html
"""
@beartype
def __init__(
- self,
+ self,
session:boto3.Session,
model_region="us-east-1",
- model_id:str="meta.llama3-8b-instruct-v1:0"
+ model_id:str="meta.llama3-8b-instruct-v1:0",
):
self.client = session.client("bedrock-runtime", region_name=model_region)
- self.model_id = model_id
+        self.model_id = model_id
@beartype
def prompt(
self,
- prompt:str,
- system:str="",
+ user_prompt:str,
+ system_prompt:str="",
top_p:float=0.5,
temperature:float=0.5,
- max_gen_len:int=512
+ max_gen_len:int=512,
) -> str:
- # generate bedrock request
- formatted_prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system}<|eot_id|><|start_header_id|>user<|end_header_id|>{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
+ """
+ Invoke the Bedrock model with the provided prompts and generation parameters.
+
+        Formats the user and system prompts according to the Llama 3 chat template,
+ sends a request to the configured Bedrock model, and returns the generated response.
+
+ Parameters
+ ----------
+ user_prompt : str
+ The main prompt or query to send to the model.
+ system_prompt : str, optional
+ System-level instructions for the model behavior. Defaults to "".
+ top_p : float, optional
+ Nucleus sampling parameter controlling diversity. Defaults to 0.5.
+ temperature : float, optional
+ Temperature parameter controlling randomness. Defaults to 0.5.
+ max_gen_len : int, optional
+ Maximum length of the generated response. Defaults to 512.
+
+ Returns
+ -------
+        str
+ The generated text response from the Bedrock model.
+
+ Raises
+ ------
+ Exception: If the model invocation fails.
+
+ Examples
+ --------
+ ```
+        bedrock_model = Bedrock(session=boto3.Session(...), model_region="us-east-1")
+        bedrock_model.prompt(user_prompt="Who was the first president of the United States?", system_prompt="You are a helpful assistant.", max_gen_len=100)
+ ```
+ """
+ # generate bedrock request payload
+ formatted_prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>{user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
native_request = {"prompt": formatted_prompt, "max_gen_len": max_gen_len, "temperature": temperature, "top_p":top_p}
request = json.dumps(native_request)
# call bedrock model
@@ -34,14 +91,13 @@ def prompt(
# Invoke the model with the request.
response = self.client.invoke_model(modelId=self.model_id, body=request)
except Exception as e:
- print(f"ERROR: Can't invoke '{self.model_id}'. Reason: {e}")
- exit(1)
+ raise Exception(f"ERROR: Can't invoke '{self.model_id}'. Reason: {e}")
# Decode and extract the response
model_response = json.loads(response["body"].read())
response_text = model_response["generation"]
- return(response_text)
+ return response_text
-system = """# Task
+system_prompt = """# Task
You are a name generator for people from different countries in Europe. Your task is to generate an arbitrary N number of distinct and varied first names and last names for people from a given European country of origin.
diff --git a/generator/utilities/align_country_codes.py b/generator/utilities/align_country_codes.py
index 3c1646e..e27173b 100644
--- a/generator/utilities/align_country_codes.py
+++ b/generator/utilities/align_country_codes.py
@@ -6,25 +6,32 @@
@beartype
def align_country_codes(
series:pd.Series,
- proba_comm_ip:float=0.95,
- proba_comm_card:float=0.99
+ proba_comm_ip:float=0.05,
+ proba_comm_card:float=0.01,
) -> pd.Series:
"""
Aligns inconsistent registration, ip and card country codes to have mostly common values; with a random chance of inconsistencies.
-
+
Parameters
----------
series : pandas.Series
A series from the random transaction dataframe with inconsistent country codes to align.
proba_comm_ip : float
- The probability of a common / shared registration country code and ip country code.
+        The probability of keeping an inconsistent ip country code rather than aligning it to the registration country code, default is 0.05.
proba_comm_card : float
- The probability of a common / shared registration country code and card country code.
-
+        The probability of keeping an inconsistent card country code rather than aligning it to the registration country code, default is 0.01.
+
Returns
-------
pandas.Series
A pandas series containing only the aligned country codes; registration, ip and card.
+
+ Examples
+ --------
+ ```
+ series = pd.Series({'registration_country_code_alpha': 353.0, 'ip_country_code_alpha': 42.0, 'card_country_code_alpha': 42.0})
+ align_country_codes(series=series, proba_comm_ip=0.05, proba_comm_card=0.01,)
+ ```
"""
# generate random value between 0 and 1
random_unif = random.uniform(0, 1)
@@ -33,16 +40,16 @@ def align_country_codes(
ip_country_code = series["ip_country_code_alpha"]
card_country_code = series["card_country_code_alpha"]
# determine shared or new ip country code
- if ip_country_code == ip_country_code:
- if random_unif >= proba_comm_ip:
+ if pd.notna(ip_country_code):
+ if random_unif <= proba_comm_ip:
new_ip_country_code = ip_country_code
else:
new_ip_country_code = registration_country_code
else:
new_ip_country_code = np.nan
# determine shared or new card country code
- if card_country_code == card_country_code:
- if random_unif >= proba_comm_card:
+ if pd.notna(card_country_code):
+ if random_unif <= proba_comm_card:
new_card_country_code = card_country_code
else:
new_card_country_code = registration_country_code
diff --git a/generator/utilities/cnt2prop_dict.py b/generator/utilities/cnt2prop_dict.py
index 45ef11f..937c32a 100644
--- a/generator/utilities/cnt2prop_dict.py
+++ b/generator/utilities/cnt2prop_dict.py
@@ -1,27 +1,36 @@
from beartype import beartype
+import numpy as np
+from typing import Dict, Union
@beartype
def cnt2prop_dict(
- idhash_cnt_dict:dict
- ) -> dict:
+ idhashes_cnts_dict:Dict[Union[str, int], Union[int,np.int64]],
+ ) -> Dict[Union[str, int], float]:
"""
Converts a dictionary of counts to a dictionary of proportions.
-
+
Parameters
----------
- idhash_cnt_dict : dict
+        idhashes_cnts_dict : Dict[Union[str, int], Union[int, np.int64]]
A dictionary of key, value pairs where the value indicates a count.
-
+
Returns
-------
- dict
+ Dict[Union[str, int], float]
A dictionary of key, value pairs where the value indicates a proportion.
+
+ Examples
+ --------
+ ```
+ idhashes_cnts_dict = {'7125135c8882b0f6': 2, '049dd291d9506532': 3, 'd6708d344cb6f498': 5}
+ prop_dict = cnt2prop_dict(idhashes_cnts_dict=idhashes_cnts_dict)
+ ```
"""
# empty dictionary for proportions
prop_dict = {}
- # sum of dictionary counts
- cnt_total = sum(idhash_cnt_dict.values())
- # iterate over input dictionary and convert counts to proportions
- for idhash, cnt in idhash_cnt_dict.items():
- prop_dict[idhash] = cnt / cnt_total
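+    # guard against an empty input dictionary to avoid dividing by a zero count total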
+    if idhashes_cnts_dict:
+ # sum of dictionary counts
+ cnt_total = sum(idhashes_cnts_dict.values())
+ # iterate over input dictionary and convert counts to proportions
+ prop_dict = {idhash: cnt / cnt_total for idhash, cnt in idhashes_cnts_dict.items()}
return prop_dict
diff --git a/generator/utilities/commandline_interface.py b/generator/utilities/commandline_interface.py
index a8de020..0b72757 100644
--- a/generator/utilities/commandline_interface.py
+++ b/generator/utilities/commandline_interface.py
@@ -1,31 +1,45 @@
-import argparse
-from beartype import beartype
import cons
-@beartype
-def commandline_interface() -> dict:
+import argparse
+from typing import Dict
+
+def commandline_interface() -> Dict[str, object]:
"""
A commandline interface for parsing input parameters with
-
+
Windows
python RandomTeleComData\\generator\\main.py --n_users 100 --random_seed 1 --n_itr 2
-
+
Linux
python3 RandomTeleComData/generator/main.py --n_users 100 --random_seed 1 --n_itr 2
-
+
Parameters
----------
-
+ n_users : int
+ The number of users to generate random telecom payments data for.
+ use_random_seed : int
+ Use a set random seed for reproducible results; must be either 0 or 1.
+ n_itr : int
+ Number of iterations to run.
+ registration_start_date : str
+ The start date for registrations.
+ registration_end_date : str
+ The end date for registrations.
+ transaction_start_date : str
+ The start date for transactions.
+ transaction_end_date : str
+ The end date for transactions.
+
Returns
-------
- dict
+ Dict[str, object]
A dictionary of key, value pairs where the values are parsed input parameters.
"""
# define argument parser object
parser = argparse.ArgumentParser(description="Execute Random TeleCom Data Programme.")
# add input arguments
parser.add_argument("--n_users", action="store", dest="n_users", type=int, default=cons.default_n_users, help="Integer, the number of users to generate random telecom payments data for",)
- parser.add_argument("--use_random_seed", action="store", dest="use_random_seed", type=int, default=cons.default_use_random_seed, help="Integer, use a set random seed for reproducible results; must be either 0 or 1",)
+ parser.add_argument("--use_random_seed", action="store", dest="use_random_seed", type=int, default=cons.default_use_random_seed, choices=[0, 1], help="Integer, use a set random seed for reproducible results; must be either 0 or 1",)
parser.add_argument("--n_itr", action="store", dest="n_itr", type=int, default=cons.default_n_itr, help="Integer, number of iterations to run",)
parser.add_argument("--registration_start_date", action="store", dest="registration_start_date", type=str, default=cons.default_registration_start_date, help="String, the start date for registrations",)
parser.add_argument("--registration_end_date", action="store", dest="registration_end_date", type=str, default=cons.default_registration_end_date, help="String, the end date for registrations",)
diff --git a/generator/utilities/gen_country_codes_dict.py b/generator/utilities/gen_country_codes_dict.py
index c36d9fa..4aeac49 100644
--- a/generator/utilities/gen_country_codes_dict.py
+++ b/generator/utilities/gen_country_codes_dict.py
@@ -1,47 +1,63 @@
import cons
+from utilities.cnt2prop_dict import cnt2prop_dict
+
+import os
import numpy as np
import pandas as pd
-from utilities.cnt2prop_dict import cnt2prop_dict
from beartype import beartype
+from typing import Dict, Union, List
@beartype
def gen_country_codes_dict(
- idhashes_cnts_dict:dict,
- fpath_countrieseurope:str=cons.fpath_countrieseurope
- ) -> dict:
+ idhashes:List[str],
+ fpath_countries_europe:str=cons.fpath_countries_europe,
+ ) -> Dict[str, Union[int, np.int64]]:
"""
- Generates a dictionary of random country codes for an input dictionary of idhashes counts.
-
+    Generates a dictionary of randomly sampled country codes for an input list of idhashes.
+
Parameters
----------
- idhashes_cnts_dict : dict
- A dictionary of idhashes counts.
- fpath_countrieseurope : str
- The file path to the european countries reference file, default is cons.fpath_countrieseurope.
-
+ idhashes : List[str]
+ A list of idhashes.
+ fpath_countries_europe : str
+ The file path to the european countries reference file, default is cons.fpath_countries_europe.
+
Returns
-------
- dict
+ Dict[str, Union[int, np.int64]]
A dictionary of idhashes country codes.
+
+ Examples
+ --------
+ ```
+ import cons
+    idhashes = ['abcd1234', 'defg4567', 'ghij7891']
+    gen_country_codes_dict(idhashes=idhashes,
+ fpath_countries_europe=cons.fpath_countries_europe,
+ )
+ ```
"""
-
+ # check file path exists
+    if not os.path.exists(fpath_countries_europe):
+ raise FileNotFoundError(f"File not found: {fpath_countries_europe}")
# load population data of european countries
- european_populations_cnt_data = pd.read_csv(filepath_or_buffer=fpath_countrieseurope, usecols=["ISO numeric", "population"],)
+ european_populations_cnt_data = pd.read_csv(filepath_or_buffer=fpath_countries_europe, usecols=["ISO numeric", "population"],)
# convert to a dictionary of ISO country codes with population counts
- european_populations_cnt_dict = european_populations_cnt_data.set_index("ISO numeric").to_dict()["population"]
+ european_populations_cnt_dict = european_populations_cnt_data.set_index("ISO numeric")["population"].to_dict()
# convert dictionary of population counts to dictionary of population proportions
european_populations_props_dict = cnt2prop_dict(european_populations_cnt_dict)
- # extract out idhashes from idhashes counts dictionary
- idhashes_list = list(idhashes_cnts_dict.keys())
+ # check population proportions sum to 1.0
+    if not np.isclose(sum(european_populations_props_dict.values()), 1.0):
+ raise ValueError("Population proportions do not sum to 1.0")
# randomly generate country codes for all idhashes based on population proportions
country_codes_list = list(
np.random.choice(
a=list(european_populations_props_dict.keys()),
p=list(european_populations_props_dict.values()),
replace=True,
- size=len(idhashes_list),
+ size=len(idhashes),
)
)
# return a dictionary of idhashes and country codes
- idhashes_country_codes = dict(zip(idhashes_list, country_codes_list))
+ idhashes_country_codes = dict(zip(idhashes, country_codes_list))
return idhashes_country_codes
diff --git a/generator/utilities/gen_country_codes_map.py b/generator/utilities/gen_country_codes_map.py
index d6254ff..160a5a7 100644
--- a/generator/utilities/gen_country_codes_map.py
+++ b/generator/utilities/gen_country_codes_map.py
@@ -1,26 +1,36 @@
import cons
+
+import numpy as np
import pandas as pd
from beartype import beartype
+from typing import Dict, Union
@beartype
def gen_country_codes_map(
- fpath_countrieseurope:str=cons.fpath_countrieseurope
- ) -> dict:
+ fpath_countries_europe:str=cons.fpath_countries_europe,
+ ) -> Dict[int, Union[str, np.int64]]:
"""
Generates a dictionary of ISO numeric codes mapping to ISO alpha codes.
-
+
Parameters
----------
- fpath_countrieseurope : str
- The full file path to the european countries reference file, default is cons.fpath_countrieseurope.
-
+ fpath_countries_europe : str
+ The full file path to the european countries reference file, default is cons.fpath_countries_europe.
+
Returns
-------
- dict
+ Dict[int, Union[str, np.int64]]
A dictionary of ISO numeric codes mapping to ISO alpha codes.
+
+ Examples
+ --------
+ ```
+ import cons
+ gen_country_codes_map(fpath_countries_europe=cons.fpath_countries_europe)
+ ```
"""
# load european county codes data
- country_codes_data = pd.read_csv(filepath_or_buffer=fpath_countrieseurope, usecols=["ISO numeric", "ISO alpha 2"],)
+ country_codes_data = pd.read_csv(filepath_or_buffer=fpath_countries_europe, usecols=["ISO numeric", "ISO alpha 2"],)
# convert data to a dictionary of ISO numeric codes mapping to ISO alpha codes
- country_codes_map = country_codes_data.set_index("ISO numeric").to_dict()["ISO alpha 2"]
+ country_codes_map = country_codes_data.set_index("ISO numeric")["ISO alpha 2"].to_dict()
return country_codes_map
diff --git a/generator/utilities/gen_dates_dict.py b/generator/utilities/gen_dates_dict.py
index 05f29f9..b02bca0 100644
--- a/generator/utilities/gen_dates_dict.py
+++ b/generator/utilities/gen_dates_dict.py
@@ -2,20 +2,21 @@
import numpy as np
from datetime import datetime
from beartype import beartype
+from typing import Dict, Union, List
@beartype
def gen_dates_dict(
- idhashes_cnts_dict:dict,
+ idhashes:List[str],
start_date:str,
- end_date:str
- ) -> dict:
+ end_date:str,
+ ) -> Dict[str, Union[pd.Timestamp, np.datetime64]]:
"""
- Generates a dictionary of random dates for an input dictionary of idhashes counts.
+ Generates a dictionary of random dates for an input list of idhashes.
Parameters
----------
- idhashes_cnts_dict : dict
- A dictionary of idhashes counts.
+ idhashes : List[str]
+ A list of idhashes.
start_date : str
The start date ("%Y-%m-%d") to generate random dates from.
end_date : str
@@ -23,15 +24,20 @@ def gen_dates_dict(
Returns
-------
- dict
+        Dict[str, Union[pd.Timestamp, np.datetime64]]
A dictionary of idhashes dates.
+
+ Examples
+ --------
+ ```
+ idhashes = ['2e23f63807f6170a', 'b8816ed926bf9f83', 'b010fdb44fa68822']
+ gen_dates_dict(idhashes=idhashes, start_date='2020-01-01', end_date='2023-01-01')
+ ```
"""
# generate a range of dates between the given input start and end dates
- dates = pd.date_range(start=datetime.strptime(start_date, "%Y-%m-%d"), end=datetime.strptime(end_date, "%Y-%m-%d") - pd.Timedelta(days=1), freq="d",)
- # extract out the idhashes from idhashes counts dictionary
- idhashes_list = list(idhashes_cnts_dict.keys())
+ dates = pd.date_range(start=datetime.strptime(start_date, "%Y-%m-%d"), end=datetime.strptime(end_date, "%Y-%m-%d"), freq="d", inclusive="both",)
# randomly sample dates for each of the idhashes
- dates_list = list(np.random.choice(a=dates, replace=True, size=len(idhashes_list)))
+ dates_list = list(np.random.choice(a=dates, replace=True, size=len(idhashes)))
# return a dictionary of idhashes and dates
- idhashes_dates_dict = dict(zip(idhashes_list, dates_list))
+ idhashes_dates_dict = dict(zip(idhashes, dates_list))
return idhashes_dates_dict
diff --git a/generator/utilities/gen_idhash_cnt_dict.py b/generator/utilities/gen_idhash_cnt_dict.py
index 58e5bda..381897b 100644
--- a/generator/utilities/gen_idhash_cnt_dict.py
+++ b/generator/utilities/gen_idhash_cnt_dict.py
@@ -1,9 +1,10 @@
-import numpy as np
from utilities.gen_random_hash import gen_random_hash
from utilities.gen_random_id import gen_random_id
from utilities.gen_random_poisson_power import gen_random_poisson_power
+
+import numpy as np
from beartype import beartype
-from typing import Union
+from typing import Union, Dict
@beartype
def gen_idhash_cnt_dict(
@@ -11,15 +12,15 @@ def gen_idhash_cnt_dict(
n:Union[int,np.int64],
lam:Union[int,float],
nbytes:int=16,
- power:int=2
- ) -> dict:
+ power:int=2,
+    ) -> Dict[str, int]:
"""
Generates a dictionary of n random idhashes and associated counts.
-
+
Parameters
----------
idhash_type : str
- Whether to generate a "id2 or "hash" value.
+ Whether to generate a "id" or "hash" value.
n : int
The total number of idhash values to generate.
lam : float
@@ -28,11 +29,24 @@ def gen_idhash_cnt_dict(
The number bytes to include in the idhash value, default is 16.
power : int
The power of the polynomial random poisson variable, default is 2.
-
+
Returns
-------
- dict
+        Dict[str, int]
A dictionary of idhashes counts.
+
+ Examples
+ --------
+ ```
+ gen_idhash_cnt_dict(
+ idhash_type="hash",
+ n=10,
+ lam=5.0,
+ nbytes=16,
+ power=2,
+ )
+ ```
"""
# if generating a random hash value
if idhash_type == "hash":
@@ -40,8 +54,12 @@ def gen_idhash_cnt_dict(
# else if generating a random id value
elif idhash_type == "id":
idhash_list = gen_random_id(size=n, nbytes=nbytes)
+ else:
+ raise ValueError("idhash_type must be either 'id' or 'hash'")
# randomly sample n counts from a squared poisson distribution with given lam value
- cnts_list = list(gen_random_poisson_power(lam=lam, size=n, power=power))
+ cnts_list = gen_random_poisson_power(lam=lam, size=n, power=power).tolist()
# return a dictionary of idhashes and counts
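+    # guard against idhash collisions, which would silently drop entries when zipped into a dictionary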
+ if len(idhash_list) != len(set(idhash_list)):
+ raise ValueError("Generated idhash values are not unique, please increase nbytes value")
idhash_dict = dict(zip(idhash_list, cnts_list))
return idhash_dict
diff --git a/generator/utilities/gen_obj_idhash_series.py b/generator/utilities/gen_obj_idhash_series.py
index cb15463..8900b04 100644
--- a/generator/utilities/gen_obj_idhash_series.py
+++ b/generator/utilities/gen_obj_idhash_series.py
@@ -1,28 +1,37 @@
import pandas as pd
from beartype import beartype
+from typing import List
@beartype
def gen_obj_idhash_series(
- idhashes_props_dict:dict,
- n_counts_series:pd.Series
+ idhashes:List[str],
+ n_counts_series:pd.Series,
) -> pd.Series:
"""
- Generates a series of entity idhash lists using the entity counts per user Series and idhashes proportions dictionary.
-
+ Generates a series of entity idhash lists using the entity counts per user Series and idhashes list.
+
Parameters
----------
- idhashes_props_dict : dict
- The idhash proportions dictionary.
+ idhashes : List[str]
+ The idhashes list.
n_counts_series : pd.Series
The entity counts for each uid as Series.
-
+
Returns
-------
pd.Series
A Series of lists containing entity idhashes for each user.
+
+ Examples
+ --------
+ ```
+ idhashes = ['2e23f63807f6170a', 'b8816ed926bf9f83', 'b010fdb44fa68822']
+    n_counts_series = pd.Series(data=[2, 1], index=range(2), name='n_entities')
+ gen_obj_idhash_series(idhashes=idhashes, n_counts_series=n_counts_series)
+ ```
"""
# create an exploded series for idhashes within the entity object
- obj_idhash_series = pd.Series(data=idhashes_props_dict.keys(), index=n_counts_series.apply(lambda x: range(x)).explode().index)
+ obj_idhash_series = pd.Series(data=idhashes, index=n_counts_series.index.repeat(n_counts_series.values).to_list())
# group by uid index and collate idhashes as lists
- obj_idhash_agg = obj_idhash_series.groupby(level=0).apply(lambda series: series.to_list())
+ obj_idhash_agg = obj_idhash_series.groupby(level=0).apply(list)
return obj_idhash_agg
\ No newline at end of file
diff --git a/generator/utilities/gen_random_entity_counts.py b/generator/utilities/gen_random_entity_counts.py
index 4490b0e..7b18b49 100644
--- a/generator/utilities/gen_random_entity_counts.py
+++ b/generator/utilities/gen_random_entity_counts.py
@@ -1,40 +1,46 @@
-import numpy as np
-import pandas as pd
import cons
from objects.User import User
from utilities.gen_random_poisson_power import gen_random_poisson_power
+
+import numpy as np
+import pandas as pd
from beartype import beartype
@beartype
def gen_random_entity_counts(
user_obj:User,
- transaction_timescale:float=1.0
+ transaction_timescale:float=1.0,
) -> pd.DataFrame:
"""
Generates a dataframe of entity counts for all users from a given user object.
-
+
Parameters
----------
user_obj : User
The User class object.
transaction_timescale : float
The transaction timescale where 1.0 is a single year of transactions, default is 1.0.
-
+
Returns
-------
pd.DataFrame
A dataframe of entity counts for all users from the specified user object.
+
+ Examples
+ --------
+ ```
+ from objects.User import User
+ user_obj=User(n_user_ids=1000, start_date='2020-01-01', end_date='2023-01-01')
+ gen_random_entity_counts(user_obj=user_obj, transaction_timescale=1.0)
+ ```
"""
# create an empty pandas dataframe to hold the random aggregated data
random_entity_counts = pd.DataFrame()
# randomly sample from the random user uids
- random_entity_counts['uid'] = np.random.choice(a = list(user_obj.user_ids_props_dict.keys()), size = user_obj.n_user_ids, replace = False)
+ random_entity_counts["uid"] = np.random.choice(a=user_obj.user_ids, size=user_obj.n_user_ids, replace=False)
# randomly simulate the number of entities per user
- random_entity_counts['n_devices'] = gen_random_poisson_power(lam = cons.data_model_poisson_params["device"]["lambda"], size = user_obj.n_user_ids, power = cons.data_model_poisson_params["device"]["power"])
- random_entity_counts['n_cards'] = gen_random_poisson_power(lam = cons.data_model_poisson_params["card"]["lambda"], size = user_obj.n_user_ids, power = cons.data_model_poisson_params["card"]["power"])
- random_entity_counts['n_ips'] = gen_random_poisson_power(lam = cons.data_model_poisson_params["ip"]["lambda"], size = user_obj.n_user_ids, power = cons.data_model_poisson_params["ip"]["power"])
- random_entity_counts['n_transactions'] = gen_random_poisson_power(lam = cons.data_model_poisson_params["transaction"]["lambda"], size = user_obj.n_user_ids, power = cons.data_model_poisson_params["transaction"]["power"])
- random_entity_counts['n_applications'] = gen_random_poisson_power(lam = cons.data_model_poisson_params["application"]["lambda"], size = user_obj.n_user_ids, power = cons.data_model_poisson_params["application"]["power"])
- # scale n transactions by
- random_entity_counts['n_transactions'] = (random_entity_counts['n_transactions'] * transaction_timescale).round().astype(int)
+ for object_type in cons.object_types:
+ random_entity_counts[f"n_{object_type}s"] = gen_random_poisson_power(lam = cons.data_model_poisson_params[object_type]["lambda"], size = user_obj.n_user_ids, power = cons.data_model_poisson_params[object_type]["power"])
+    # scale n transactions by the transaction timescale
+    random_entity_counts["n_transactions"] = (random_entity_counts["n_transactions"] * transaction_timescale).round().astype(int)
return random_entity_counts
diff --git a/generator/utilities/gen_random_hash.py b/generator/utilities/gen_random_hash.py
index e7c6f98..2cec880 100644
--- a/generator/utilities/gen_random_hash.py
+++ b/generator/utilities/gen_random_hash.py
@@ -1,30 +1,36 @@
import string
import numpy as np
from beartype import beartype
-from typing import Union
+from typing import Union, List
@beartype
def gen_random_hash(
size:Union[int,np.int64],
- nbytes:int=16
- ) -> list:
+ nbytes:int=16,
+ ) -> List[str]:
"""
Generates a list of random hashes.
-
+
Parameters
----------
size : int
The total number of hashes to generate.
nbytes : int
The number of alphanumeric values in each hash, default is 16.
-
+
Returns
-------
list
A list of random hashes.
+
+ Examples
+ --------
+ ```
+ gen_random_hash(size=5, nbytes=16)
+ ```
"""
# generate a list of digits and lower case letters from string library
alphanumeric = list(string.digits) + list(string.ascii_lowercase)[:6]
-    # randomly sample nbytes digits, string concatenate and convert to integers
+    # randomly sample nbytes alphanumeric characters and string concatenate
- random_hashes = ["".join(np.random.choice(a=alphanumeric, size=nbytes, replace=True)) for i in range(size)]
+ random_hashes = [''.join(row) for row in np.random.choice(a=alphanumeric, size=(size, nbytes), replace=True).tolist()]
return random_hashes
diff --git a/generator/utilities/gen_random_id.py b/generator/utilities/gen_random_id.py
index a6e8c8f..43d1f5a 100644
--- a/generator/utilities/gen_random_id.py
+++ b/generator/utilities/gen_random_id.py
@@ -1,29 +1,36 @@
import string
import numpy as np
from beartype import beartype
+from typing import Union, List
@beartype
def gen_random_id(
- size:int,
- nbytes:int=16
- ) -> list:
+ size:Union[int,np.int64],
+ nbytes:int=16,
+ ) -> List[str]:
"""
Generates a list of random ids.
-
+
Parameters
----------
size : int
The total number of ids to generate.
nbytes : int
The number of numeric values in each id, default is 16.
-
+
Returns
-------
list
A list of random ids.
+
+ Examples
+ --------
+ ```
+ gen_random_id(size=5, nbytes=16)
+ ```
"""
# generate a list of digits from string library
digits = list(string.digits)
- # randomly sample nbytes digits, string concatenate and convert to integers
- random_ids = ["".join(np.random.choice(a=digits, size=nbytes, replace=True))for i in range(size)]
+ # randomly sample nbytes digits, string concatenate
+ random_ids = ["".join(row) for row in np.random.choice(a=digits, size=(size, nbytes), replace=True).tolist()]
return random_ids
diff --git a/generator/utilities/gen_random_poisson_power.py b/generator/utilities/gen_random_poisson_power.py
index e3d64ca..383b9b6 100644
--- a/generator/utilities/gen_random_poisson_power.py
+++ b/generator/utilities/gen_random_poisson_power.py
@@ -6,24 +6,30 @@
def gen_random_poisson_power(
lam:Union[int,float],
size:Union[int,np.int64],
- power:int
+ power:int,
) -> np.ndarray:
"""
Generates data from a polynomial random poisson variable to a given power.
-
+
Parameters
----------
- lam : int
+    lam : int or float
The lambda of the underlying poisson random variable.
size : int
The number of values to generate.
power : int
The power of the polynomial sum.
-
+
Returns
-------
numpy.ndarray
- The random squared poisson values.
+ The random sum of powered poisson values.
+
+ Examples
+ --------
+ ```
+ gen_random_poisson_power(lam=3.0, size=10, power=2)
+ ```
"""
-    # randomly generate a square poisson distribution
+    # randomly generate a summed powered poisson distribution
a = np.array([np.random.poisson(lam, size) ** p for p in range(1, power+1)]).sum(axis = 0) + 1
diff --git a/generator/utilities/gen_shared_idhashes.py b/generator/utilities/gen_shared_idhashes.py
index aa72f7b..e2901ed 100644
--- a/generator/utilities/gen_shared_idhashes.py
+++ b/generator/utilities/gen_shared_idhashes.py
@@ -1,41 +1,47 @@
import numpy as np
import pandas as pd
from beartype import beartype
+from typing import Dict, Union, List
@beartype
def gen_shared_idhashes(
- idhash_cnt_dict:dict,
- prop_shared_idhashes:float
- ) -> dict:
+ idhashes:List[str],
+ prop_shared_idhashes:float,
+ ) -> Dict[str, str]:
"""
-    Generates a dictionary of shared idhashes proportions
+    Generates a dictionary mapping a sampled proportion of idhashes to their shared idhashes.
-
+
Parameters
----------
- idhashes_cnts_dict : dict
- A dictionary of idhashes counts.
+ idhashes : list of str
+ A list of idhashes.
prop_shared_idhashes : float
The total proportion of shared idhashes.
-
+
Returns
-------
- dict
- A dictionary of shared idhashes proportion.
+ Dict[str, str]
+        A dictionary mapping idhashes to their shared idhashes.
+
+ Examples
+ --------
+ ```
+ idhashes=['2e23f63807f6170a', 'b8816ed926bf9f83', 'b010fdb44fa68822']
+ gen_shared_idhashes(idhashes=idhashes, prop_shared_idhashes=0.01)
+ ```
"""
# calculate the total number of idhashes
- n_idhashes = len(idhash_cnt_dict)
+ n_idhashes = len(idhashes)
# randomly sample the idhashes based on the total proportion of shared idhashes
- shared_idhashes_list = list(
- np.random.choice(
- a=list(idhash_cnt_dict.keys()),
- size=int(np.round(n_idhashes * prop_shared_idhashes)),
- replace=False
- )
- )
+ shared_idhashes_list = np.random.choice(
+ a=idhashes,
+ size=int(np.round(n_idhashes * prop_shared_idhashes)),
+ replace=False
+ ).tolist()
shared_idhash_map_dict = {}
- if shared_idhashes_list != []:
+    if shared_idhashes_list:
# determine how many networks
- n_groups = int(np.floor(np.sqrt(len(shared_idhashes_list))))
+ n_groups = int(np.ceil(np.sqrt(len(shared_idhashes_list))))
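+        # draw a random weight per shared group; normalised below into group sampling proportions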
group_uniform_dict = {g:np.random.uniform() for g in range(n_groups)}
group_prop_dict = {key:value/sum(group_uniform_dict.values()) for key, value in group_uniform_dict.items()}
# generate groups for all shared id hashes
@@ -43,7 +49,7 @@ def gen_shared_idhashes(
shared_idhashes_groups_dict = dict(zip(shared_idhashes_list, shared_idhashes_groups_list))
shared_idhashes_groups_df = pd.Series(shared_idhashes_groups_dict, name="shared_idhashes_group").to_frame().reset_index().rename(columns={'index':'idhash'})
shared_entity_groups_dict = shared_idhashes_groups_df.groupby('shared_idhashes_group').agg({'idhash':list}).to_dict()['idhash']
- shared_idhashes_groups_df['shared_idhash'] = shared_idhashes_groups_df.apply(lambda series: np.random.choice(a=shared_entity_groups_dict[series['shared_idhashes_group']]), axis=1)
+ shared_idhashes_groups_df['shared_idhash'] = [np.random.choice(shared_entity_groups_dict[group]) for group in shared_idhashes_groups_df['shared_idhashes_group']]
# create the shared idhash map dictionary
shared_idhash_map_dict = shared_idhashes_groups_df.set_index('idhash')['shared_idhash'].to_dict()
return shared_idhash_map_dict
diff --git a/generator/utilities/gen_trans_rejection_rates.py b/generator/utilities/gen_trans_rejection_rates.py
index af8ab6e..f24ce63 100644
--- a/generator/utilities/gen_trans_rejection_rates.py
+++ b/generator/utilities/gen_trans_rejection_rates.py
@@ -1,70 +1,72 @@
-import pandas as pd
import cons
+
+import pandas as pd
from beartype import beartype
+from typing import Dict
@beartype
def gen_trans_rejection_rates(
trans_data:pd.DataFrame,
- fpath_countrieseurope=cons.fpath_countrieseurope,
- fpath_countrycrimeindex=cons.fpath_countrycrimeindex,
- fpath_domain_email=cons.fpath_domain_email
- ) -> dict:
+ fpath_countries_europe:str=cons.fpath_countries_europe,
+ fpath_countrycrimeindex:str=cons.fpath_countrycrimeindex,
+ fpath_email_domain:str=cons.fpath_email_domain,
+ ) -> Dict[str, Dict[str, float]]:
"""
Generates the transaction rejection rates based on features within the transaction level telecom payments data.
-
+
Parameters
----------
trans_data : pandas.DataFrame
The transaction level telecom payments data.
- fpath_countrieseurope : str
- The file path to the europe countries reference data, default is cons.fpath_countrieseurope.
+ fpath_countries_europe : str
+ The file path to the europe countries reference data, default is cons.fpath_countries_europe.
fpath_countrycrimeindex : str
The file path to the country crime index reference data, default is cons.fpath_countrycrimeindex.
- fpath_domain_email :str
- The file path to the email domains reference data, default is cons.fpath_domain_email.
-
+ fpath_email_domain :str
+ The file path to the email domains reference data, default is cons.fpath_email_domain.
+
Returns
-------
dict
The rejection rates based on features within the transaction level telecom payments data.
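+
+    Examples
+    --------
+    A minimal sketch using a toy transactions frame; the userid and hash values are illustrative, and the reference files at the default cons paths are assumed to be available:
+    ```
+    trans_data = pd.DataFrame({
+        "userid": ["1111", "2222", "2222"],
+        "device_hash": ["a1f2", "a1f2", "b3c4"],
+        "ip_hash": ["d5e6", "f7a8", "f7a8"],
+        "card_hash": ["b9c0", "d1e2", "f3a4"],
+    })
+    rejection_rates_dict = gen_trans_rejection_rates(trans_data=trans_data)
+    ```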
"""
- # create empty dictionary to hold rejection rates
+ # initialize dictionary to store all computed rejection rates
rejection_rates_dict = {}
-
+
# generate country code rejection based rates
- countrieseurope = pd.read_csv(fpath_countrieseurope, usecols=["ISO numeric", "ISO alpha 2"])
+ countrieseurope = pd.read_csv(fpath_countries_europe, usecols=["ISO alpha 2"])
countrycrimeindex = pd.read_csv(fpath_countrycrimeindex, usecols=["country_code", "crime_index"])
europecountrycrimeindex = pd.merge(left=countrieseurope, right=countrycrimeindex, left_on="ISO alpha 2", right_on="country_code", how="left",)
europecountrycrimeindex["trans_reject_rate"] = europecountrycrimeindex["crime_index"].divide(europecountrycrimeindex["crime_index"].sum())
country_code_trans_reject_rate_dict = europecountrycrimeindex.set_index("ISO alpha 2")["trans_reject_rate"].to_dict()
rejection_rates_dict["country_code_trans_reject_rate_dict"] = country_code_trans_reject_rate_dict
-
+
# generate domain email rejection based rates
- domain_email = pd.read_csv(fpath_domain_email, usecols=["domain", "proportion"])
+ domain_email = pd.read_csv(fpath_email_domain, usecols=["domain", "proportion"])
domain_email["trans_reject_rate"] = (1 - domain_email["proportion"]) / (1 - domain_email["proportion"]).sum()
domain_email_trans_reject_rate_dict = domain_email.set_index("domain")["trans_reject_rate"].to_dict()
rejection_rates_dict["domain_email_trans_reject_rate_dict"] = domain_email_trans_reject_rate_dict
-
+
# generate shared entities with rejection rates dictionary
shared_devices = (trans_data.groupby(by="device_hash").agg({"userid": "nunique"}).sort_values(by="userid"))
shared_ips = (trans_data.groupby(by="ip_hash").agg({"userid": "nunique"}).sort_values(by="userid"))
shared_cards = (trans_data.groupby(by="card_hash").agg({"userid": "nunique"}).sort_values(by="userid"))
- shared_devices_reject_rate_dict = shared_devices.divide(shared_devices["userid"].sum()).to_dict()["userid"]
+ shared_devices_reject_rate_dict = shared_devices.divide(shared_devices["userid"].sum())["userid"].to_dict()
shared_ips_reject_rate_dict = shared_ips.divide(shared_ips["userid"].sum()).to_dict()["userid"]
shared_cards_reject_rate_dict = shared_cards.divide(shared_cards["userid"].sum()).to_dict()["userid"]
rejection_rates_dict["shared_devices_reject_rate_dict"] = shared_devices_reject_rate_dict
rejection_rates_dict["shared_ips_reject_rate_dict"] = shared_ips_reject_rate_dict
rejection_rates_dict["shared_cards_reject_rate_dict"] = shared_cards_reject_rate_dict
-
+
# generate occurrence based rejection rates
count_devices = (trans_data.groupby(by="userid").agg({"device_hash": "nunique"}).sort_values(by="device_hash"))
count_ips = (trans_data.groupby(by="userid").agg({"ip_hash": "nunique"}).sort_values(by="ip_hash"))
count_cards = (trans_data.groupby(by="userid").agg({"card_hash": "nunique"}).sort_values(by="card_hash"))
- count_devices_reject_rate_dict = count_devices.divide(count_devices["device_hash"].sum()).to_dict()["device_hash"]
+ count_devices_reject_rate_dict = count_devices.divide(count_devices["device_hash"].sum())["device_hash"].to_dict()
count_ips_reject_rate_dict = count_ips.divide(count_ips["ip_hash"].sum()).to_dict()["ip_hash"]
count_cards_reject_rate_dict = count_cards.divide(count_cards["card_hash"].sum()).to_dict()["card_hash"]
rejection_rates_dict["count_devices_reject_rate_dict"] = count_devices_reject_rate_dict
rejection_rates_dict["count_ips_reject_rate_dict"] = count_ips_reject_rate_dict
rejection_rates_dict["count_cards_reject_rate_dict"] = count_cards_reject_rate_dict
-
+
return rejection_rates_dict
diff --git a/generator/utilities/gen_trans_status.py b/generator/utilities/gen_trans_status.py
index 492d88d..35fbbd3 100644
--- a/generator/utilities/gen_trans_status.py
+++ b/generator/utilities/gen_trans_status.py
@@ -1,36 +1,38 @@
+import cons
+
import random
import numpy as np
import pandas as pd
-import cons
from beartype import beartype
+from typing import List, Dict, Union
@beartype
def gen_trans_status(
series:pd.Series,
- rejection_rates_dict:dict,
- rejection_scaling_factor:int=2
- ) -> list:
+ rejection_rates_dict:Dict[str, Dict[str, float]],
+ rejection_scaling_factor:int=2,
+ ) -> List[Union[str, float]]:
"""
Generates the transaction status for a pandas series from the transaction level telecom payments data given the rejection rates dictionary from the same data.
-
+
Parameters
----------
series : pandas.Series
A pandas series from the transaction level telecom payments data.
- rejection_rates_dict : dict
+ rejection_rates_dict : Dict[str, Dict[str, float]]
Rejection rates generated the transaction level telecom payments data.
rejection_scaling_factor : int
A multiplicative scaling factor for rejection rates, default is 2.
-
+
Returns
-------
- list
- The transaction status for the pandas series.
+    List[Union[str, float]]
+ The transaction status and error code.
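+
+    Examples
+    --------
+    A hypothetical usage sketch, assuming trans_data is the transaction level telecom payments dataframe; the output column names are illustrative:
+    ```
+    rejection_rates_dict = gen_trans_rejection_rates(trans_data=trans_data)
+    trans_data[["transaction_status", "transaction_error_code"]] = trans_data.apply(lambda series: gen_trans_status(series=series, rejection_rates_dict=rejection_rates_dict), axis=1, result_type="expand")
+    ```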
"""
# set country code columns
country_code_columns = ["registration_country_code","ip_country_code","card_country_code"]
-
- if series['card_hash'] == series['card_hash']:
+    # if a card hash is present, apply the rejection rules
+ if pd.notna(series['card_hash']):
status = "rejected"
# add rejections based on crime rates within country codes
if rejection_rates_dict["country_code_trans_reject_rate_dict"][np.random.choice(a=series[country_code_columns].dropna().to_list(), size=1)[0]] >= random.uniform(0, 1)/rejection_scaling_factor:
@@ -42,7 +44,7 @@ def gen_trans_status(
elif cons.data_model_inconsistent_country_codes_rejection_rate[series[country_code_columns].dropna().nunique()] >= random.uniform(0, 1)/rejection_scaling_factor:
error_code = np.random.choice(a=list(cons.data_model_rejection_codes_connection.keys()),p=list(cons.data_model_rejection_codes_connection.values()),size=1)[0]
# add rejections based on shared ips, cards and devices
- elif series["device_hash"] == series["device_hash"] and rejection_rates_dict["shared_devices_reject_rate_dict"][series["device_hash"]] >= random.uniform(0, 1)/rejection_scaling_factor:
+ elif pd.notna(series["device_hash"]) and rejection_rates_dict["shared_devices_reject_rate_dict"][series["device_hash"]] >= random.uniform(0, 1)/rejection_scaling_factor:
error_code = np.random.choice(a=list(cons.data_model_rejection_codes_fraud.keys()),p=list(cons.data_model_rejection_codes_fraud.values()),size=1)[0]
elif series["ip_hash"] == series["ip_hash"] and rejection_rates_dict["shared_ips_reject_rate_dict"][series["ip_hash"]] >= random.uniform(0, 1)/rejection_scaling_factor:
error_code = np.random.choice(a=list(cons.data_model_rejection_codes_fraud.keys()),p=list(cons.data_model_rejection_codes_fraud.values()),size=1)[0]
@@ -57,7 +59,9 @@ def gen_trans_status(
error_code = np.random.choice(a=list(cons.data_model_rejection_codes_funds.keys()),p=list(cons.data_model_rejection_codes_funds.values()),size=1)[0]
# otherwise return successful status
else:
- status = np.random.choice(a=['successful', 'pending'], size=1, p=[0.98, 0.02])[0]
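+            # derive success/pending probabilities from the configured transaction status weights in cons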
+ successful_status = {key:cons.data_model_transaction_status[key] for key in ['successful', 'pending']}
+ successful_probs = [value/sum(successful_status.values()) for value in successful_status.values()]
+ status = np.random.choice(a=list(successful_status.keys()), size=1, p=successful_probs)[0]
error_code = np.nan
else:
status = np.random.choice(a=['successful', 'pending'], size=1, p=[0.98, 0.02])[0]
diff --git a/generator/utilities/gen_user_names_file.py b/generator/utilities/gen_user_names_file.py
index 8f3dfe6..c8765fe 100644
--- a/generator/utilities/gen_user_names_file.py
+++ b/generator/utilities/gen_user_names_file.py
@@ -11,20 +11,66 @@
sys.path.append("E:\\GitHub\\RandomTelecomPayments\\generator")
import cons
-from utilities.Bedrock import Bedrock, prompt, system
+from utilities.Bedrock import Bedrock, prompt, system_prompt
-def invoke_bedrock(model, n_user_names, country):
+def invoke_bedrock(
+ model:Bedrock,
+ n_user_names:int,
+ country:str,
+ countrieseurope:pd.DataFrame,
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
"""
+ Invokes the Bedrock model to generate user names for a specified country.
+
+ This function calls the Bedrock model with a formatted prompt to generate first names
+ and last names for a given country. It processes the model's response, parses the JSON
+ output, and merges the results with country data. The function deduplicates and standardizes
+ the name formatting, then persists the data to temporary CSV files.
+
+ Parameters
+ ----------
+ model : Bedrock
+ The Bedrock model instance used to generate names.
+ n_user_names : int
+ The number of user names to generate.
+ country : str
+ The country for which to generate names.
+ countrieseurope : pd.DataFrame
+ A DataFrame containing country information for merging.
+
+ Returns
+ -------
+ tuple:
+ A tuple containing two pandas DataFrames:
+ - tmp_firstname_country_data (pd.DataFrame): DataFrame with deduplicated and standardized first names along with country information.
+ - tmp_lastname_country_data (pd.DataFrame): DataFrame with deduplicated and standardized last names along with country information.
+
+ Raises
+ ------
+    Exception: If the model response cannot be parsed as JSON, if the merge with country data fails, or if file I/O operations encounter errors.
+    KeyError: If the expected keys ("firstnames", "lastnames") are missing from the JSON response.
+
+ Notes
+ -----
+ - Names are standardized by converting to lowercase, removing extra whitespace, and applying Unicode normalization using unidecode.
+ - Duplicate names are removed after each processing step.
+ - Results are concatenated with any previously generated data for the same country and saved to temporary CSV files if the new data increases the dataset size.
+ - CSV files are encoded in latin1 format.
+
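+    Examples
+    --------
+    A hypothetical call, assuming an authenticated boto3 session and the europe countries reference frame:
+    ```
+    bedrock = Bedrock(session=session, model_region="us-east-1", model_id="meta.llama3-70b-instruct-v1:0")
+    countrieseurope = pd.read_csv(cons.fpath_countries_europe, usecols=['name', 'ISO numeric'])
+    firstnames, lastnames = invoke_bedrock(model=bedrock, n_user_names=100, country="Ireland", countrieseurope=countrieseurope)
+    ```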
"""
logging.info("Calling Bedrock ...")
# call bedrock model
formatted_prompt = prompt.format(n_user_names=n_user_names, country=country)
logging.info(formatted_prompt)
- model_response = model.prompt(prompt=formatted_prompt, system=system, max_gen_len=2048)
+ model_response = model.prompt(user_prompt=formatted_prompt, system_prompt=system_prompt, max_gen_len=2048)
# split out answer
text = model_response.split("")[1].split("")[0]
# parse json
- record_set = json.loads(text)
+ try:
+ record_set = json.loads(text)
+ except json.JSONDecodeError as e:
+ raise Exception(f"Error parsing JSON: {e}")
logging.info("Processing results ...")
# generate pandas dataframe
user_firstname_data = pd.Series(record_set["firstnames"], name="firstnames").to_frame().drop_duplicates(subset=["firstnames"])
@@ -52,7 +98,7 @@ def invoke_bedrock(model, n_user_names, country):
tmp_firstname_country_data = pd.concat(objs=[tmp_firstname_country_data, llama_firstname_country_data], axis=0, ignore_index=True)
tmp_lastname_country_data = pd.concat(objs=[tmp_lastname_country_data, llama_lastname_country_data], axis=0, ignore_index=True)
# standardise names formatting
- standardise_text_lambda = lambda x: unidecode.unidecode(" ".join(x.lower().strip().split())) if x not in [None, "", np.nan] else x
+    standardise_text_lambda = lambda x: unidecode.unidecode(" ".join(x.lower().strip().split())) if pd.notna(x) else x
tmp_firstname_country_data["firstnames"] = tmp_firstname_country_data["firstnames"].apply(lambda x: standardise_text_lambda(x))
tmp_lastname_country_data["lastnames"] = tmp_lastname_country_data["lastnames"].apply(lambda x: standardise_text_lambda(x))
# deduplicate data
@@ -61,22 +107,22 @@ def invoke_bedrock(model, n_user_names, country):
# print shapes
logging.info(f"tmp_firstname_country_data.shape: {tmp_firstname_country_data.shape}")
logging.info(f"tmp_lastname_country_data.shape: {tmp_lastname_country_data.shape}")
- # save firstnames names data to temp directory
+ # save firstnames names data to temp directory (if pairwise firstnames have been created)
if tmp_firstname_country_data.shape[0] >= llama_firstname_country_data.shape[0]:
tmp_firstname_country_data.to_csv(fpath_temp_llama_firstnames, index=False, encoding="latin1")
logging.info(f"Wrote {fpath_temp_llama_firstnames} ...")
- # save lastnames data to temp directory
+ # save lastnames data to temp directory (if pairwise lastnames have been created)
if tmp_lastname_country_data.shape[0] >= llama_lastname_country_data.shape[0]:
tmp_lastname_country_data.to_csv(fpath_temp_llama_lastnames, index=False, encoding="latin1")
logging.info(f"Wrote {fpath_temp_llama_lastnames} ...")
return (tmp_firstname_country_data, tmp_lastname_country_data)
if __name__ == "__main__":
-
+
# set up logging
lgr = logging.getLogger()
lgr.setLevel(logging.INFO)
-
+
# load aws config
with open(cons.fpath_aws_session_token, "r") as j:
aws_config = json.loads(j.read())
@@ -93,7 +139,7 @@ def invoke_bedrock(model, n_user_names, country):
bedrock = Bedrock(session=session, model_region="us-east-1", model_id="meta.llama3-70b-instruct-v1:0")
# load countries, firstnames and surnames files
- countrieseurope = pd.read_csv(cons.fpath_countrieseurope, usecols=['name', 'ISO numeric'])
+ countrieseurope = pd.read_csv(cons.fpath_countries_europe, usecols=['name', 'ISO numeric'])
orig_firstnames = pd.read_csv(cons.fpath_firstnames)
orig_surnames = pd.read_csv(cons.fpath_lastnames)
@@ -106,12 +152,13 @@ def invoke_bedrock(model, n_user_names, country):
firstname_country_data = []
lastname_country_data = []
error_countries = []
+ # switch to toggle bedrock calls
run_bedrock = False
-
+
# set countries list
countries_list = countrieseurope['name'].to_list()
#countries_list = ['Cyprus']
-
+
for country in countries_list:
logging.info(f"{country} ...")
try:
@@ -119,7 +166,7 @@ def invoke_bedrock(model, n_user_names, country):
# call bedrock model and generate user names data
tmp_firstname_country_data, tmp_lastname_country_data = invoke_bedrock(model=bedrock, n_user_names=n_user_names, country=country)
logging.info("Waiting ...")
- # wait 30 seconds before retrying
+ # wait 20 seconds before the next bedrock call
time.sleep(20)
else:
tmp_firstname_country_data = pd.read_csv(cons.fpath_temp_llama_firstnames.format(country=country.lower()), encoding="latin1")
@@ -134,7 +181,7 @@ def invoke_bedrock(model, n_user_names, country):
# log if any countries failed to generate data
if len(error_countries) > 0:
logging.info(f"Failed to generated data for countries: {error_countries}")
-
+
# load existing reference data
firstname_country_df = pd.read_csv(cons.fpath_llama_firstnames, encoding="latin1")
lastname_country_df = pd.read_csv(cons.fpath_llama_lastnames, encoding="latin1")
@@ -147,7 +194,7 @@ def invoke_bedrock(model, n_user_names, country):
# sort and deduplicate output data
output_firstname_country_df = output_firstname_country_df.drop_duplicates(subset=["country","firstnames"]).sort_values(by=["country","firstnames"])
output_lastname_country_df = output_lastname_country_df.drop_duplicates(subset=["country","lastnames"]).sort_values(by=["country","lastnames"])
-
+
# write data to disk
if output_firstname_country_df['country'].nunique() == n_countries:
logging.info(f"output_firstname_country_df.shape: {output_firstname_country_df.shape}")
diff --git a/generator/utilities/input_error_handling.py b/generator/utilities/input_error_handling.py
index ffce36a..20c93cc 100644
--- a/generator/utilities/input_error_handling.py
+++ b/generator/utilities/input_error_handling.py
@@ -1,29 +1,31 @@
from beartype import beartype
+from typing import Dict
@beartype
def input_error_handling(
- input_params_dict:dict
- ) -> int:
+ input_params_dict:Dict[str, object],
+ ):
"""
Runs error handling on the input params dictionary.
-
+
Parameters
----------
- input_params_dict : dict
+ input_params_dict : Dict[str, object]
A dictionary of input parameters.
-
- Returns
- -------
- int
- Returns 0 for successful completion, otherwise returns value errors depending on failed input parameter check.
+
+ Examples
+ --------
+ ```
+ input_params_dict = {'n_users': 1000, 'use_random_seed': 1, 'n_itr': 10}
+ input_error_handling(input_params_dict=input_params_dict)
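+ # a hypothetical failing call: n_users=0 raises ValueError
+ # input_error_handling(input_params_dict={'n_users': 0, 'use_random_seed': 1, 'n_itr': 10})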
+ ```
"""
# check if the n users parameter is positive
- if not input_params_dict["n_users"] >= 1:
+ if not ((input_params_dict["n_users"] >= 1) and (isinstance(input_params_dict["n_users"], int))):
raise ValueError(f"Invalid n_users parameter value {input_params_dict['n_users']}; must be a integer >= 1.")
# check if the random seed is either 0 or 1
- if not input_params_dict["use_random_seed"] in (0, 1):
- raise ValueError(f"Invalid random_seed use_random_seed value {input_params_dict['use_random_seed']}; must be either 0 or 1.")
+ if not ((input_params_dict["use_random_seed"] in (0, 1)) and (isinstance(input_params_dict["use_random_seed"], int))):
+ raise ValueError(f"Invalid use_random_seed value {input_params_dict['use_random_seed']}; must be either 0 or 1.")
# check if the number of iterations is greater than or equal to 1
- if not input_params_dict["n_itr"] >= 1:
+ if not ((input_params_dict["n_itr"] >= 1) and (isinstance(input_params_dict["n_itr"], int))):
raise ValueError(f"Invalid n_itr parameter value {input_params_dict['n_itr']}; must be an integer >= 1.")
- return 0
diff --git a/generator/utilities/join_idhashes_dict.py b/generator/utilities/join_idhashes_dict.py
index 04663a8..c5c6e1b 100644
--- a/generator/utilities/join_idhashes_dict.py
+++ b/generator/utilities/join_idhashes_dict.py
@@ -1,32 +1,34 @@
+import numpy as np
import pandas as pd
from beartype import beartype
+from typing import Dict, Union
@beartype
def join_idhashes_dict(
data:pd.DataFrame,
- idhashes_dict:dict,
+ idhashes_dict:Dict[Union[str, int], object],
idhash_key_name:str,
- idhash_val_name:str
+ idhash_val_name:str,
):
"""
Joins an entity attribute dictionary to either the user or transaction data.
-
+
Parameters
----------
data : pd.DataFrame
The user or transaction data.
- idhashes_dict : dict
+ idhashes_dict : Dict[Union[str, int], object]
The entity attribute dictionary with an idhash as the key for joining to the user or transaction data.
idhash_key_name : str
The name of the idhash key for joining to the user or transaction data.
idhash_val_name : str
The name to set for the idhash attribute when joining to the user or transaction data.
-
+
Returns
-------
pd.DataFrame
The user or transaction data returned with the joined idhash attribute dictionary values.
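+
+ Examples
+ --------
+ ```
+ import pandas as pd
+ # a minimal sketch with illustrative idhash keys and attribute values
+ data = pd.DataFrame({"uid_hash": ["h1", "h2"]})
+ idhashes_dict = {"h1": "2020-01-01", "h2": "2020-02-01"}
+ join_idhashes_dict(data=data, idhashes_dict=idhashes_dict, idhash_key_name="uid_hash", idhash_val_name="registration_date")
+ ```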
"""
- idhashes_df = pd.Series(idhashes_dict, name=idhash_val_name).to_frame().reset_index().rename(columns={'index':idhash_key_name})
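+ # build a two-column lookup frame from the idhash -> attribute mapping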
+ idhashes_df = pd.DataFrame(list(idhashes_dict.items()), columns=[idhash_key_name, idhash_val_name])
idhashes_join = pd.merge(left=data, right=idhashes_df, on=idhash_key_name, how='left')
return idhashes_join
\ No newline at end of file
diff --git a/generator/utilities/multiprocess.py b/generator/utilities/multiprocess.py
index f8595bb..6a02bd2 100644
--- a/generator/utilities/multiprocess.py
+++ b/generator/utilities/multiprocess.py
@@ -1,34 +1,37 @@
import os
from multiprocessing import Pool
from beartype import beartype
+from typing import List, Any, Optional
@beartype
def multiprocess(
func,
- args:list,
- ncpu:int=os.cpu_count()
- ) -> list:
+ args:List[tuple],
+ ncpu:Optional[int]=None,
+ ) -> List[Any]:
"""
- Generates a dictionary of random dates for an input dictionary of idhashes counts
+ Executes a given function in parallel across a pool of worker processes via starmap.
Parameters
----------
- func :
+ func : Callable[..., Any]
The function to be executed in parallel
- args : list
- The input parameters as a list of tuples to be passed with the function in parallel
+ args : List[tuple]
+ The input parameters as a list of tuples to be passed with the function in parallel via starmap.
ncpu : int, optional
- The number of cpus to execute across, default is os.cpu_count().
+ The number of cpus to execute across; defaults to None, in which case os.cpu_count() is used.
Returns
-------
- list
+ List[Any]
A list of outputs returned from the func calls run in parallel.
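+
+ Examples
+ --------
+ ```
+ # a minimal sketch; square is an illustrative module-level function (lambdas cannot be pickled)
+ def square(x):
+     return x * x
+ multiprocess(func=square, args=[(1,), (2,), (3,)], ncpu=2)
+ ```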
"""
+ # set number of cpus
+ if ncpu is None:
+ ncpu = os.cpu_count()
# initialize a pool of ncpus
- pool = Pool(ncpu)
- # execution given function and arguments across pool of ncpus
- results = pool.starmap(func, args)
- # close pool of ncpus
- pool.close()
+ with Pool(ncpu) as pool:
+ # execute the given function and arguments across the pool of ncpu processes
+ results = pool.starmap(func, args)
return results
diff --git a/generator/utilities/remove_duplicate_idhashes.py b/generator/utilities/remove_duplicate_idhashes.py
index 820d77b..115f00b 100644
--- a/generator/utilities/remove_duplicate_idhashes.py
+++ b/generator/utilities/remove_duplicate_idhashes.py
@@ -5,17 +5,18 @@
@beartype
def remove_duplicate_idhashes(
user_data:pd.DataFrame,
- idhash_col:str
+ idhash_col:str,
):
- """Removes duplicate idhashes from a given idhash column.
-
+ """
+ Removes duplicate idhashes from a given idhash column.
+
Parameters
----------
user_data : pandas.DataFrame
The user level telecom payments data.
idhash_col : str
The column with duplicate idhashes to be removed.
-
+
Returns
-------
pandas.DataFrame
@@ -30,5 +31,5 @@ def remove_duplicate_idhashes(
# overwrite series with empty lists
tmp_data[idhash_col] = np.nan
tmp_data[idhash_col] = tmp_deduplicate_series
- tmp_data[idhash_col] = tmp_data[idhash_col].apply(lambda x: x if x == x else [])
+ tmp_data[idhash_col] = tmp_data[idhash_col].apply(lambda x: x if pd.notnull(x) else [])
return tmp_data
diff --git a/generator/utilities/round_trans_amount.py b/generator/utilities/round_trans_amount.py
index d52f018..8b1002e 100644
--- a/generator/utilities/round_trans_amount.py
+++ b/generator/utilities/round_trans_amount.py
@@ -1,24 +1,31 @@
import numpy as np
-import pandas as pd
from beartype import beartype
@beartype
def round_trans_amount(amounts:np.ndarray) -> np.ndarray:
"""
Rounds transaction amounts to have store price like remainders such as 1.99, 3.45, and 2.5.
-
+
Parameters
----------
- amounts : np.array
+ amounts : np.ndarray
The transaction amounts to round.
Returns
-------
np.ndarray
- The rounded transaction amounts with store rice like remainders.
+ The rounded transaction amounts with store price like remainders.
+
+ Examples
+ --------
+ ```
+ import numpy as np
+ amounts = np.array([2.34, 5.67, 3.21])
+ round_trans_amount(amounts=amounts)
+ ```
"""
+ # a probability distribution for remainders
round_dict = {0.01:0.4, 0.5:0.1, 0.45:0.1, 0.51:0.1, 0.41:0.1, 0.71:0.1, 1:0.1}
remainder = np.random.choice(a=list(round_dict.keys()), size=amounts.shape[0], replace=True, p=list(round_dict.values()))
- rounded_amounts = np.round(np.ceil(amounts) - remainder, 2)
- rounded_amounts = pd.Series(rounded_amounts).apply(lambda x: max(0, x)).values
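+ # round up to the next whole unit, subtract the sampled remainder (e.g. 3.00 - 0.01 = 2.99), and clip at zero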
+ rounded_amounts = np.maximum(0, np.round(np.ceil(amounts) - remainder, 2))
return rounded_amounts
\ No newline at end of file