Binary file modified data/unittest/transaction_data.parquet
Binary file modified data/unittest/user_data.parquet
47 changes: 45 additions & 2 deletions generator/app/ProgrammeParams.py
@@ -1,8 +1,51 @@
import cons
from datetime import datetime
from beartype import beartype

import cons

class ProgrammeParams():
"""
Class to manage and store programme parameters for the telecom payment generator.
This class validates and initializes all configuration parameters needed for the
payment generation process, including user counts, application volumes, and date ranges
for registration and transaction periods.

Parameters
----------
n_users : int, optional
Number of users. Defaults to 100.
random_seed : int, optional
Seed for reproducible randomization. Defaults to None.
n_applications : int, optional
Number of applications. Defaults to 20000.
registration_start_date : str, optional
Registration period start date. Defaults to cons.default_registration_start_date.
registration_end_date : str, optional
Registration period end date. Defaults to cons.default_registration_end_date.
transaction_start_date : str, optional
Transaction period start date. Defaults to cons.default_transaction_start_date.
transaction_end_date : str, optional
Transaction period end date. Defaults to cons.default_transaction_end_date.

Attributes
----------
random_seed : int, optional
Seed for random number generation for reproducibility.
n_users : int
Number of users to generate. Defaults to 100.
n_applications : int
Number of applications to generate. Defaults to 20000.
registration_start_date : str
Start date for user registration (format: YYYY-MM-DD).
registration_end_date : str
End date for user registration (format: YYYY-MM-DD).
transaction_start_date : str
Start date for transactions (format: YYYY-MM-DD).
transaction_end_date : str
End date for transactions (format: YYYY-MM-DD).
transaction_timescale : float
The transaction period duration in years.
"""

@beartype
def __init__(
@@ -13,7 +56,7 @@ def __init__(
registration_start_date:str=cons.default_registration_start_date,
registration_end_date:str=cons.default_registration_end_date,
transaction_start_date:str=cons.default_transaction_start_date,
transaction_end_date:str=cons.default_transaction_end_date
transaction_end_date:str=cons.default_transaction_end_date,
):
# take programme parameters from class parameters
self.random_seed = random_seed
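For reference, a minimal usage sketch of the class documented above might look as follows. This is only an illustration: the date strings are hypothetical stand-ins for the defaults in cons, and the import path assumes the generator/app layout shown in this diff.

# Minimal usage sketch of ProgrammeParams (assumed import path within generator/app).
from ProgrammeParams import ProgrammeParams

params = ProgrammeParams(
    n_users=100,                             # number of users to generate
    random_seed=42,                          # fixed seed for reproducibility
    n_applications=20000,                    # number of applications to generate
    registration_start_date="2020-01-01",    # stand-in dates; real defaults come from cons
    registration_end_date="2020-12-31",
    transaction_start_date="2021-01-01",
    transaction_end_date="2021-12-31",
)
print(params.transaction_timescale)          # transaction period duration in years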
53 changes: 29 additions & 24 deletions generator/app/gen_random_telecom_data.py
@@ -1,4 +1,6 @@
import numpy as np
import pandas as pd
from typing import Dict
import random
from beartype import beartype

@@ -16,22 +18,25 @@

@beartype
def gen_random_telecom_data(
n_users=1,
random_seed=None,
registration_start_date=cons.default_registration_start_date,
registration_end_date=cons.default_registration_end_date,
transaction_start_date=cons.default_transaction_start_date,
transaction_end_date=cons.default_transaction_end_date
):
n_users:int=1,
random_seed:int=None,
n_applications:int=20000,
registration_start_date:str=cons.default_registration_start_date,
registration_end_date:str=cons.default_registration_end_date,
transaction_start_date:str=cons.default_transaction_start_date,
transaction_end_date:str=cons.default_transaction_end_date,
) -> Dict[str, pd.DataFrame]:
"""
Generates random telecommunications data.

Parameters
----------
n_users : float
n_users : int
The number of users to generate random telecom payments data for, default is 1.
random_seed : int
A set random seed for reproducible results, default is None.
n_applications : int
The number of applications to generate, default is 20000.
registration_start_date : str
The user registration start date, default is cons.default_registration_start_date.
registration_end_date : str
@@ -40,52 +45,52 @@ def gen_random_telecom_data(
The user transaction start date, default is cons.default_transaction_start_date.
transaction_end_date : str
The user transaction end date, default is cons.default_transaction_end_date.

Returns
-------
pandas.DataFrame
Dict[str, pandas.DataFrame]
A random telecommunication payments dataset.
"""

# initialise programme parameters
programmeparams = ProgrammeParams(
n_users=n_users,
n_users=n_users,
random_seed=random_seed,
n_applications=20000,
registration_start_date=registration_start_date,
n_applications=n_applications,
registration_start_date=registration_start_date,
registration_end_date=registration_end_date,
transaction_start_date=transaction_start_date,
transaction_end_date=transaction_end_date
)

# set random seed
random.seed(programmeparams.random_seed)
np.random.seed(seed=programmeparams.random_seed)

# generate random users
user_obj = User(
n_user_ids=programmeparams.n_users,
start_date=programmeparams.registration_start_date,
end_date=programmeparams.registration_end_date,
fpath_firstnames=cons.fpath_llama_firstnames,
fpath_lastnames=cons.fpath_llama_lastnames,
fpath_countrieseurope=cons.fpath_countrieseurope,
fpath_domain_email=cons.fpath_domain_email
fpath_countries_europe=cons.fpath_countries_europe,
fpath_email_domain=cons.fpath_email_domain
)

# generate random entity counts for each user
random_entity_counts = gen_random_entity_counts(
user_obj=user_obj,
transaction_timescale=programmeparams.transaction_timescale
)

# generate random entity values
device_obj = Device(n_device_hashes=random_entity_counts['n_devices'].sum())
card_obj = Card(n_card_hashes=random_entity_counts['n_cards'].sum())
ip_obj = Ip(n_ip_hashes=random_entity_counts['n_ips'].sum())
transaction_obj = Transaction(n_transaction_hashes=random_entity_counts['n_transactions'].sum(), start_date=programmeparams.transaction_start_date, end_date=programmeparams.transaction_end_date)
application_obj = Application(n_application_hashes=programmeparams.n_applications)

# generate user level data
user_data = gen_user_data(
random_entity_counts=random_entity_counts,
@@ -96,7 +101,7 @@ def gen_random_telecom_data(
transaction_obj=transaction_obj,
application_obj=application_obj,
)

# generate transaction level data
trans_data = gen_trans_data(
user_data=user_data,
@@ -108,5 +113,5 @@ def gen_random_telecom_data(
application_obj=application_obj,
fpath_countrycrimeindex=cons.fpath_countrycrimeindex
)

return {"user_data":user_data, "trans_data":trans_data}
39 changes: 21 additions & 18 deletions generator/app/gen_trans_data.py
@@ -1,8 +1,9 @@
import random
import pandas as pd
import numpy as np
import cons
from datetime import datetime
from beartype import beartype

from objects.User import User
from objects.Device import Device
from objects.Card import Card
@@ -14,7 +15,7 @@
from utilities.gen_trans_rejection_rates import gen_trans_rejection_rates
from utilities.gen_trans_status import gen_trans_status
from utilities.join_idhashes_dict import join_idhashes_dict
from beartype import beartype
import cons

@beartype
def gen_trans_data(
@@ -25,11 +26,11 @@ def gen_trans_data(
ip_obj:Ip,
transaction_obj:Transaction,
application_obj:Application,
fpath_countrycrimeindex:str=cons.fpath_countrycrimeindex
fpath_countrycrimeindex:str=cons.fpath_countrycrimeindex,
):
"""
Generates random transaction level telecom payments data.

Parameters
----------
user_data : pandas.DataFrame
@@ -48,22 +49,23 @@ def gen_trans_data(
The random application data model object.
fpath_countrycrimeindex : str
The full file path to the country crime index reference data, default is cons.fpath_countrycrimeindex.

Returns
-------
pandas.DataFrame
The random transaction level telecom payments data.
"""

# explode user data to transaction level
trans_data = user_data.explode('transaction_hash').dropna(subset = ['transaction_hash']).reset_index(drop = True)
# select uid entity hashes for each transaction
trans_data['device_hash'] = trans_data['device_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if x != [] else np.nan)
trans_data['card_hash'] = trans_data['card_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if x != [] else np.nan)
trans_data['ip_hash'] = trans_data['ip_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if x != [] else np.nan)
trans_data['application_hash'] = trans_data['application_hash'].apply(lambda x: np.random.choice(x, size = 1)[0])
trans_data['device_hash'] = trans_data['device_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if isinstance(x, list) and x != [] else np.nan)
trans_data['card_hash'] = trans_data['card_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if isinstance(x, list) and x != [] else np.nan)
trans_data['ip_hash'] = trans_data['ip_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if isinstance(x, list) and x != [] else np.nan)
trans_data['application_hash'] = trans_data['application_hash'].apply(lambda x: np.random.choice(x, size = 1)[0] if isinstance(x, list) and x != [] else np.nan)
# add null values card hashes
trans_data['card_hash'] = trans_data['card_hash'].apply(lambda x: np.nan if random.uniform(0, 1) <= cons.data_model_null_rates['card'] else x)
trans_null_mask = np.random.uniform(size=trans_data.shape[0]) <= cons.data_model_null_rates['card']
trans_data.loc[trans_null_mask, 'card_hash'] = np.nan
# add shared hashed entities between users
trans_data['ip_hash'] = trans_data['ip_hash'].apply(lambda x: ip_obj.ip_shared_idhash_map_dict[x] if x in ip_obj.ip_shared_idhash_map_dict.keys() else x)
trans_data['card_hash'] = trans_data['card_hash'].apply(lambda x: card_obj.card_shared_idhash_map_dict[x] if x in card_obj.card_shared_idhash_map_dict.keys() else x)
@@ -79,7 +81,7 @@ def gen_trans_data(
trans_data = join_idhashes_dict(data=trans_data, idhashes_dict=transaction_obj.transaction_hashes_dates_dict, idhash_key_name='transaction_hash', idhash_val_name='transaction_date')
# add application data
trans_data = join_idhashes_dict(data=trans_data, idhashes_dict=application_obj.application_hashes_payment_channel_dict, idhash_key_name='application_hash', idhash_val_name='card_payment_channel')

# TODO: wrap this logic up into a separate function
# align payment channel with missing card hashes and 0 transaction amounts
zero_transaction_amount_filter = (trans_data['transaction_amount'] == 0.0)
@@ -90,7 +92,8 @@ def gen_trans_data(
trans_data['transaction_payment_method'] = 'card'
zero_transaction_amount_filter = (trans_data['transaction_amount'] == 0.0)
missing_card_hash_filter = (trans_data['card_hash'].isnull())
trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = missing_card_hash_filter.apply(lambda x: np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = 1, p = list(cons.data_model_non_card_trans_methods.values()))[0])
# trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = missing_card_hash_filter.apply(lambda x: np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = 1, p = list(cons.data_model_non_card_trans_methods.values()))[0])
trans_data.loc[missing_card_hash_filter, 'transaction_payment_method'] = pd.Series(np.random.choice(a = list(cons.data_model_non_card_trans_methods.keys()), size = missing_card_hash_filter.sum(), p = list(cons.data_model_non_card_trans_methods.values()))[0])
trans_data.loc[zero_transaction_amount_filter, 'transaction_payment_method'] = np.nan
# align country codes for user, ip and card
country_code_columns = ['registration_country_code_alpha', 'ip_country_code_alpha', 'card_country_code_alpha']
@@ -105,15 +108,15 @@ def gen_trans_data(
dates_series = pd.date_range(start=datetime.strptime(transaction_obj.start_date, "%Y-%m-%d"), end=datetime.strptime(transaction_obj.end_date, "%Y-%m-%d") - pd.Timedelta(days=1), freq="d")
trans_data[date_columns] = trans_data[date_columns].apply(lambda s: [s['registration_date'], np.random.choice(a=dates_series[dates_series >= max(s['registration_date'], s['transaction_date'])], size=1)[0]], result_type = 'expand', axis = 1).copy()
# map iso numeric country codes to iso alpha country codes
country_codes_map = gen_country_codes_map(fpath_countrieseurope=user_obj.fpath_countrieseurope)
country_codes_map = gen_country_codes_map(fpath_countries_europe=user_obj.fpath_countries_europe)
trans_data = join_idhashes_dict(data=trans_data, idhashes_dict=country_codes_map, idhash_key_name='registration_country_code_alpha', idhash_val_name='registration_country_code')
trans_data = join_idhashes_dict(data=trans_data, idhashes_dict=country_codes_map, idhash_key_name='card_country_code_alpha', idhash_val_name='card_country_code')
trans_data = join_idhashes_dict(data=trans_data, idhashes_dict=country_codes_map, idhash_key_name='ip_country_code_alpha', idhash_val_name='ip_country_code')

# generate transaction status and error code
rejection_rates_dict = gen_trans_rejection_rates(trans_data=trans_data, fpath_countrieseurope=user_obj.fpath_countrieseurope, fpath_countrycrimeindex=fpath_countrycrimeindex, fpath_domain_email=user_obj.fpath_domain_email)
rejection_rates_dict = gen_trans_rejection_rates(trans_data=trans_data, fpath_countries_europe=user_obj.fpath_countries_europe, fpath_countrycrimeindex=fpath_countrycrimeindex, fpath_email_domain=user_obj.fpath_email_domain)
trans_data[['transaction_status', 'transaction_error_code']] = trans_data.apply(lambda series: gen_trans_status(series = series, rejection_rates_dict = rejection_rates_dict), result_type = 'expand', axis = 1)

# order columns and sort rows by transaction date
user_cols = ['userid', 'firstname', 'lastname', 'registration_date', 'registration_country_code', 'uid', 'email_domain']
device_cols = ['device_hash', 'device_type']
@@ -124,5 +127,5 @@ def gen_trans_data(
itr_cols = ['itr_hash']
col_order = user_cols + device_cols + card_cols + ip_cols + app_cols + trans_cols + itr_cols
trans_data = trans_data[col_order].sort_values(by = 'transaction_date').reset_index(drop = True)

return trans_data
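The switch above from per-row lambdas to a boolean mask plus a single np.random.choice draw can be shown in isolation. The sketch below illustrates the pattern only, not the project's exact code; card_null_rate and non_card_methods are hypothetical stand-ins for cons.data_model_null_rates['card'] and cons.data_model_non_card_trans_methods.

import numpy as np
import pandas as pd

# Stand-in values for the cons constants used in gen_trans_data.
card_null_rate = 0.2
non_card_methods = {"wallet": 0.6, "voucher": 0.4}

trans_data = pd.DataFrame({"card_hash": [f"hash_{i}" for i in range(10)]})

# Vectorised null injection: one uniform draw per row instead of a per-row lambda.
null_mask = np.random.uniform(size=trans_data.shape[0]) <= card_null_rate
trans_data.loc[null_mask, "card_hash"] = np.nan

# Vectorised payment method sampling: draw one value per missing card hash in a
# single call and assign the resulting array against the boolean mask.
missing_card_hash_filter = trans_data["card_hash"].isnull()
trans_data["transaction_payment_method"] = "card"
trans_data.loc[missing_card_hash_filter, "transaction_payment_method"] = np.random.choice(
    a=list(non_card_methods.keys()),
    p=list(non_card_methods.values()),
    size=missing_card_hash_filter.sum(),
)
print(trans_data)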
33 changes: 19 additions & 14 deletions generator/app/gen_user_data.py
@@ -1,5 +1,7 @@
import pandas as pd
import numpy as np
from beartype import beartype

from objects.User import User
from objects.Device import Device
from objects.Card import Card
@@ -9,7 +11,6 @@
from utilities.gen_obj_idhash_series import gen_obj_idhash_series
from utilities.join_idhashes_dict import join_idhashes_dict
from utilities.gen_random_hash import gen_random_hash
from beartype import beartype

@beartype
def gen_user_data(
@@ -28,17 +29,17 @@ def gen_user_data(
----------
random_entity_counts : pd.DataFrame
The randomly generated entities count data
user_obj : class
user_obj : User
The random user data model object
device_obj : class
device_obj : Device
The random device data model object
card_obj : class
card_obj : Card
The random card data model object
ip_obj : class
ip_obj : Ip
The random ip data model object
transaction_obj : class
transaction_obj : Transaction
The random transaction data model object
application_obj : class
application_obj : Application
The random application data model object

Returns
@@ -58,14 +59,18 @@ def gen_user_data(
zero_pad = (userid_date_country_code.str.len() - 11).abs().apply(lambda x: '0'*x)
user_data['userid'] = userid_date_country_code + zero_pad + user_data['uid'].astype(str).str[-5:]
# add hash data lists
user_data['device_hash'] = gen_obj_idhash_series(idhashes_props_dict=device_obj.device_hashes_props_dict, n_counts_series=user_data['n_devices'])
user_data['card_hash'] = gen_obj_idhash_series(idhashes_props_dict=card_obj.card_hashes_props_dict, n_counts_series=user_data['n_cards'])
user_data['ip_hash'] = gen_obj_idhash_series(idhashes_props_dict=ip_obj.ip_hashes_props_dict, n_counts_series=user_data['n_ips'])
user_data['transaction_hash'] = gen_obj_idhash_series(idhashes_props_dict=transaction_obj.transaction_hashes_props_dict, n_counts_series=user_data['n_transactions'])
user_data['application_hash'] = user_data['n_applications'].apply(lambda x: list(np.random.choice(a = list(application_obj.application_hashes_props_dict.keys()), p = list(application_obj.application_hashes_props_dict.values()), replace = True, size = x)))
user_data['device_hash'] = gen_obj_idhash_series(idhashes=device_obj.device_hashes, n_counts_series=user_data['n_devices'])
user_data['card_hash'] = gen_obj_idhash_series(idhashes=card_obj.card_hashes, n_counts_series=user_data['n_cards'])
user_data['ip_hash'] = gen_obj_idhash_series(idhashes=ip_obj.ip_hashes, n_counts_series=user_data['n_ips'])
user_data['transaction_hash'] = gen_obj_idhash_series(idhashes=transaction_obj.transaction_hashes, n_counts_series=user_data['n_transactions'])
# generate application hashes per user
#user_data['application_hash'] = user_data['n_applications'].apply(lambda x: list(np.random.choice(a = list(application_obj.application_hashes_props_dict.keys()), p = list(application_obj.application_hashes_props_dict.values()), replace = True, size = x)))
total_application_hashes = user_data['n_applications'].sum()
split_indices = user_data['n_applications'].cumsum()[:-1].values
application_hashes = np.random.choice(a = list(application_obj.application_hashes_props_dict.keys()), p=list(application_obj.application_hashes_props_dict.values()), replace=True, size=total_application_hashes)
user_data['application_hash'] = pd.Series(np.split(application_hashes, split_indices)).apply(lambda x: x.tolist())
# drop excess columns
drop_columns = ['n_devices', 'n_cards', 'n_ips', 'n_applications', 'n_transactions']
user_data = user_data.drop(columns = drop_columns)
user_data = user_data.drop(columns = ['n_devices', 'n_cards', 'n_ips', 'n_applications', 'n_transactions'])
# create a hash value for the dataset (to distinguish between different iterations)
user_data['itr_hash'] = gen_random_hash(size=1)[0]
return user_data
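The batched application-hash draw introduced above, one np.random.choice call split back into per-user lists, can also be illustrated on its own. In this sketch the hash pool and proportions are hypothetical stand-ins for application_obj.application_hashes_props_dict.

import numpy as np
import pandas as pd

# Stand-in hash pool and sampling proportions.
application_hashes_props_dict = {"app_a": 0.5, "app_b": 0.3, "app_c": 0.2}

user_data = pd.DataFrame({"n_applications": [3, 1, 4, 2]})

# Draw all hashes in one call, then split the flat array back into per-user
# lists using the cumulative counts as split points.
total_application_hashes = user_data["n_applications"].sum()
split_indices = user_data["n_applications"].cumsum()[:-1].values
application_hashes = np.random.choice(
    a=list(application_hashes_props_dict.keys()),
    p=list(application_hashes_props_dict.values()),
    replace=True,
    size=total_application_hashes,
)
user_data["application_hash"] = pd.Series(np.split(application_hashes, split_indices)).apply(list)
print(user_data)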