diff --git a/cape_privacy/coordinator/client.py b/cape_privacy/coordinator/client.py
index f5a2175..cf671fd 100644
--- a/cape_privacy/coordinator/client.py
+++ b/cape_privacy/coordinator/client.py
@@ -7,7 +7,7 @@
 from cape_privacy.coordinator.auth.api_token import APIToken
 from cape_privacy.policy import parse_policy
-from cape_privacy.policy.data import Policy
+from cape_privacy.policy import Policy
 from cape_privacy.utils import base64
diff --git a/cape_privacy/policy/__init__.py b/cape_privacy/policy/__init__.py
index 5cf13b9..be37fdf 100644
--- a/cape_privacy/policy/__init__.py
+++ b/cape_privacy/policy/__init__.py
@@ -1,4 +1,4 @@
-from cape_privacy.policy.data import Policy
+from cape_privacy.policy.policy import Policy
 from cape_privacy.policy.exceptions import NamedTransformNotFound
 from cape_privacy.policy.exceptions import TransformNotFound
 from cape_privacy.policy.policy import parse_policy
diff --git a/cape_privacy/policy/data.py b/cape_privacy/policy/data.py
index 17f1693..e29db52 100644
--- a/cape_privacy/policy/data.py
+++ b/cape_privacy/policy/data.py
@@ -14,11 +14,6 @@
 policy = Policy(**d)
 """
-from typing import List
-
-import yaml
-
-from cape_privacy.audit import AuditLogger
 from cape_privacy.utils import base64
@@ -124,54 +119,3 @@ def __init__(self, name, type, **kwargs):
             # then set the arg value to the inner value
             self.args[key] = bytes(base64.from_string(arg["value"]))
-
-
-class Policy:
-    """Top level policy object.
-
-    The top level policy object holds the all of the relevant information
-    for applying policy to data.
-
-    Attributes:
-        label: The label of the policy.
-        version: The version of the policy.
-        rules: List of rules that will be applied to a data frame.
-        transformations: The named transformations for this policy.
- """ - - def __init__( - self, - logger: AuditLogger = AuditLogger(), - id: str = "", - label: str = "", - version: int = 1, - rules: List[Rule] = [], - transformations: List[NamedTransform] = [], - ): - self.id = id - self.logger = logger - self.label = label - self.version = version - - self._raw_transforms = transformations - self.transformations = [ - NamedTransform(**transform) for transform in transformations - ] - - if len(rules) == 0: - raise ValueError( - f"At least one rule must be specified for policy specification {label}" - ) - - self._raw_rules = rules - self.rules = [Rule(**rule) for rule in rules] - - def __repr__(self): - d = { - "label": self.label, - "version": self.version, - "transformations": self._raw_transforms, - "rules": self._raw_rules, - } - - return "Policy:\n\n" + yaml.dump(d, sort_keys=False) diff --git a/cape_privacy/policy/data_test.py b/cape_privacy/policy/data_test.py index 9a0574f..755fbd8 100644 --- a/cape_privacy/policy/data_test.py +++ b/cape_privacy/policy/data_test.py @@ -2,8 +2,8 @@ from cape_privacy.utils import base64 -from .data import Policy -from .policy_test_fixtures import named_with_secret_y +from cape_privacy.policy.policy import Policy +from cape_privacy.policy.policy_test_fixtures import named_with_secret_y y = """label: test_policy version: 1 diff --git a/cape_privacy/policy/policy.py b/cape_privacy/policy/policy.py index 9cec23a..6218a8d 100644 --- a/cape_privacy/policy/policy.py +++ b/cape_privacy/policy/policy.py @@ -35,6 +35,7 @@ from typing import Any from typing import Callable from typing import Dict +from typing import List from typing import Union import pandas as pd @@ -50,9 +51,118 @@ from cape_privacy.pandas import transformations from cape_privacy.policy import data from cape_privacy.policy import exceptions +from cape_privacy.policy.data import NamedTransform +from cape_privacy.policy.data import Rule -def apply_policy(policy: data.Policy, df, inplace=False): +class Policy: + """Top level policy object. + + The top level policy object holds the all of the relevant information + for applying policy to data. + + Attributes: + label: The label of the policy. + version: The version of the policy. + rules: List of rules that will be applied to a data frame. + transformations: The named transformations for this policy. + """ + + def __init__( + self, + logger: AuditLogger = AuditLogger(), + id: str = "", + label: str = "", + version: int = 1, + rules: List[Rule] = [], + transformations: List[NamedTransform] = [], + ): + self.id = id + self.logger = logger + self.label = label + self.version = version + + self._raw_transforms = transformations + self.transformations = [ + NamedTransform(**transform) for transform in transformations + ] + + if len(rules) == 0: + raise ValueError( + f"At least one rule must be specified for policy specification {label}" + ) + + self._raw_rules = rules + self.rules = [Rule(**rule) for rule in rules] + + def apply(self, df, inplace=False): + """Applies a Policy to some DataFrame. + + This function is responsible for inferring the type of the DataFrame, + preparing the relevant Spark or Pandas Transformations, and applying + them to produce a transformed DataFrame that conforms to the Policy. + + Args: + policy: The `Policy` object that the transformed DataFrame will + conform to, e.g. as returned by `cape_privacy.parse_policy`. + df: The DataFrame object to transform according to `policies`. + Must be of type pandas.DataFrame or pyspark.sql.DataFrame. 
+            inplace: Whether to mutate the `df` or produce a new one.
+                This argument is only relevant for Pandas DataFrames, as Spark
+                DataFrames do not support mutation.
+
+        Raises:
+            ValueError: If df is a Spark DataFrame and inplace=True, or if df
+                is something other than a Pandas or Spark DataFrame.
+            DependencyError: If Spark is not configured correctly in the
+                Python environment.
+            TransformNotFound, NamedTransformNotFound: If the Policy contains
+                a reference to a Transformation or NamedTransformation that
+                is unrecognized in the Transformation registry.
+        """
+        if isinstance(df, pd.DataFrame):
+            registry = pandas_lib.registry
+            transformer = pandas_lib.transformer
+            dtypes = pandas_lib.dtypes
+            if not inplace:
+                result_df = df.copy()
+            else:
+                result_df = df
+        elif not spark_lib.is_available():
+            raise exceptions.DependencyError
+        elif isinstance(df, spark_lib.DataFrame):
+            if inplace:
+                raise ValueError(
+                    "Spark does not support DataFrame mutation, "
+                    + "so inplace=True is invalid."
+                )
+            registry = spark_lib.registry
+            transformer = spark_lib.transformer
+            dtypes = spark_lib.dtypes
+            result_df = df
+        else:
+            raise ValueError(f"Expected df to be a DataFrame, found {type(df)}.")
+        for rule in self.rules:
+            result_df = _do_transformations(
+                self, rule, result_df, registry, transformer, dtypes
+            )
+
+        self.logger.audit_log(APPLY_POLICY_EVENT, self.id, "policy", self.label)
+
+        return result_df
+
+    def __repr__(self):
+        d = {
+            "label": self.label,
+            "version": self.version,
+            "transformations": self._raw_transforms,
+            "rules": self._raw_rules,
+        }
+
+        return "Policy:\n\n" + yaml.dump(d, sort_keys=False)
+
+
+def apply_policy(policy: Policy, df, inplace=False):
     """Applies a Policy to some DataFrame.
 
     This function is responsible for inferring the type of the DataFrame, preparing the
@@ -75,40 +185,12 @@ def apply_policy(policy: data.Policy, df, inplace=False):
             a Transformation or NamedTransformation that is unrecognized in the
             Transformation registry.
     """
-    if isinstance(df, pd.DataFrame):
-        registry = pandas_lib.registry
-        transformer = pandas_lib.transformer
-        dtypes = pandas_lib.dtypes
-        if not inplace:
-            result_df = df.copy()
-        else:
-            result_df = df
-    elif not spark_lib.is_available():
-        raise exceptions.DependencyError
-    elif isinstance(df, spark_lib.DataFrame):
-        if inplace:
-            raise ValueError(
-                "Spark does not support DataFrame mutation, so inplace=True is invalid."
-            )
-        registry = spark_lib.registry
-        transformer = spark_lib.transformer
-        dtypes = spark_lib.dtypes
-        result_df = df
-    else:
-        raise ValueError(f"Expected df to be a DataFrame, found {type(df)}.")
-    for rule in policy.rules:
-        result_df = _do_transformations(
-            policy, rule, result_df, registry, transformer, dtypes
-        )
-
-    policy.logger.audit_log(APPLY_POLICY_EVENT, policy.id, "policy", policy.label)
-
-    return result_df
+    return policy.apply(df, inplace)
 
 
 def parse_policy(
     p: Union[str, Dict[Any, Any]], logger: AuditLogger = AuditLogger()
-) -> data.Policy:
+) -> Policy:
     """Parses a policy YAML file.
     The passed in string can either be a path to a local file,
@@ -133,7 +215,7 @@
     else:
         policy = p
 
-    return data.Policy(logger=logger, **policy)
+    return Policy(logger=logger, **policy)
 
 
 def _maybe_replace_dtype_arg(args, dtypes):
@@ -143,7 +225,7 @@
 def _get_transformation(
-    policy: data.Policy, transform: data.Transform, registry: types.ModuleType, dtypes,
+    policy: Policy, transform: data.Transform, registry: types.ModuleType, dtypes,
 ):
     """Looks up the correct transform class.
@@ -186,7 +268,7 @@
 def _do_transformations(
-    policy: data.Policy,
+    policy: Policy,
     rule: data.Rule,
     df,
     registry: types.ModuleType,
@@ -229,7 +311,7 @@
 def _load_named_transform(
-    policy: data.Policy, transformLabel: str, registry: types.ModuleType, dtypes,
+    policy: Policy, transformLabel: str, registry: types.ModuleType, dtypes,
 ):
     """Attempts to load a named transform from the top level policy.
@@ -275,7 +357,7 @@
     return initTransform
 
 
-def reverse(policy: data.Policy) -> data.Policy:
+def reverse(policy: Policy) -> Policy:
     """Turns reversible tokenizations into token reversers
 
     If any named transformations contain a reversible tokenization transformation
diff --git a/cape_privacy/policy/policy_test.py b/cape_privacy/policy/policy_test.py
index 8feb747..7aabbd2 100644
--- a/cape_privacy/policy/policy_test.py
+++ b/cape_privacy/policy/policy_test.py
@@ -10,7 +10,6 @@
 from cape_privacy import pandas as pandas_lib
 from cape_privacy import spark as spark_lib
 from cape_privacy.pandas.transformations import test_utils
-from cape_privacy.policy import data
 from cape_privacy.policy import exceptions
 from cape_privacy.policy import policy as policy_lib
 from cape_privacy.policy import policy_test_fixtures as fixtures
@@ -46,7 +45,7 @@ def test_named_transform_not_found():
     df = pd.DataFrame(np.ones(5,), columns=["test"])
 
-    p = data.Policy(**d)
+    p = policy_lib.Policy(**d)
 
     tfm = p.rules[0].transformations[0]
     with pytest.raises(exceptions.NamedTransformNotFound) as e:
@@ -62,7 +61,7 @@ def test_named_transform_type_not_found():
         fixtures.named_not_found_y("plusOne", "plusOne", "plusM"),
         Loader=yaml.FullLoader,
     )
-    p = data.Policy(**d)
+    p = policy_lib.Policy(**d)
 
     tfm = p.rules[0].transformations[0]
     with pytest.raises(exceptions.NamedTransformNotFound) as e:
@@ -95,7 +94,7 @@ def test_apply_policy_pandas():
     expected_df = df + 3
 
-    p = data.Policy(**d)
+    p = policy_lib.Policy(**d)
 
     new_df = policy_lib.apply_policy(p, df)
@@ -110,7 +109,7 @@ def test_missing_column():
     expected_df = df
 
-    p = data.Policy(**d)
+    p = policy_lib.Policy(**d)
 
     new_df = policy_lib.apply_policy(p, df)
@@ -140,7 +139,7 @@
         }
     )
 
-    p = data.Policy(**d)
+    p = policy_lib.Policy(**d)
 
     new_df = policy_lib.apply_policy(p, df)
@@ -155,7 +154,7 @@
     expected_df = df + 3
 
-    p = data.Policy(**d)
+    p = policy_lib.Policy(**d)
 
     new_df = policy_lib.apply_policy(p, df)
@@ -168,7 +167,7 @@
     df = pd.DataFrame(np.ones((5, 2)), columns=["test", "apple"])
 
-    p = data.Policy(**d)
+    p = policy_lib.Policy(**d)
 
     new_df = policy_lib.apply_policy(p, df)
@@ -187,7 +186,7 @@
     spark_lib.registry.register(test_utils.PlusN.identifier, test_utils.PlusN)
     d = yaml.load(fixtures.y, Loader=yaml.FullLoader)
 
-    p = data.Policy(**d)
+    p = policy_lib.Policy(**d)
 
     new_df = policy_lib.apply_policy(p, df).toPandas()
     pdt.assert_frame_equal(new_df, expected_df)
@@ -220,7 +219,7 @@
     df = sess.createDataFrame(pd_df)
     d = yaml.load(fixtures.complex_y, Loader=yaml.FullLoader)
 
-    p = data.Policy(**d)
+    p = policy_lib.Policy(**d)
 
     new_df = policy_lib.apply_policy(p, df).toPandas()
     pdt.assert_frame_equal(new_df, expected_df, check_dtype=True)
@@ -233,7 +232,7 @@
     spark_lib.registry.register(test_utils.PlusN.identifier, test_utils.PlusN)
     d = yaml.load(fixtures.named_y, Loader=yaml.FullLoader)
 
-    p = data.Policy(**d)
+    p = policy_lib.Policy(**d)
 
     new_df = policy_lib.apply_policy(p, df).toPandas()
     pdt.assert_frame_equal(new_df, expected_df)
@@ -249,7 +248,7 @@
     spark_lib.registry.register(test_utils.PlusN.identifier, test_utils.PlusN)
     d = yaml.load(fixtures.redact_y, Loader=yaml.FullLoader)
 
-    p = data.Policy(**d)
+    p = policy_lib.Policy(**d)
 
     new_df = policy_lib.apply_policy(p, df).toPandas()
     pdt.assert_frame_equal(new_df, expected_df)
@@ -261,7 +260,7 @@ def test_secret_in_named_transform():
     df = pd.DataFrame({"name": ["bob", "alice"]})
 
-    p = data.Policy(**d)
+    p = policy_lib.Policy(**d)
 
     new_df = policy_lib.apply_policy(p, df)
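
Usage sketch for reviewers (illustrative only, not part of the patch). It assumes the
top-level `cape_privacy.parse_policy` re-export referenced in the docstrings above; the
policy file name and the salary column are hypothetical placeholders.

    import pandas as pd

    from cape_privacy import parse_policy
    from cape_privacy.policy import policy as policy_lib

    df = pd.DataFrame({"salary": [52000, 61000, 58000]})

    # "salary-policy.yaml" is a hypothetical policy file with at least one rule.
    policy = parse_policy("salary-policy.yaml")

    # New in this change: Policy.apply is the instance-method form of apply_policy.
    masked = policy.apply(df)

    # The existing module-level entry point now simply delegates to Policy.apply.
    also_masked = policy_lib.apply_policy(policy, df)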