From cc94af5551e86c7cc71b5a3bf50e2c3205e63d55 Mon Sep 17 00:00:00 2001 From: SDUgitrep Date: Tue, 24 Sep 2024 21:40:01 +0400 Subject: [PATCH 1/4] Update the PredictionRejectionArea to allow for PRR curve generation --- requirements.txt | 3 + src/lm_polygraph/ue_metrics/pred_rej_area.py | 146 +++++++++++++------ src/lm_polygraph/utils/manager.py | 8 +- 3 files changed, 115 insertions(+), 42 deletions(-) diff --git a/requirements.txt b/requirements.txt index 949524b00..cbd1b7ba1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,3 +35,6 @@ nltk>=3.7,<4 evaluate spacy>=3.4.0,<4 fastchat +seaborn +uuid +os \ No newline at end of file diff --git a/src/lm_polygraph/ue_metrics/pred_rej_area.py b/src/lm_polygraph/ue_metrics/pred_rej_area.py index bcbfbc802..27343342d 100644 --- a/src/lm_polygraph/ue_metrics/pred_rej_area.py +++ b/src/lm_polygraph/ue_metrics/pred_rej_area.py @@ -3,51 +3,115 @@ from typing import List from .ue_metric import UEMetric, normalize - +import seaborn as sns +import matplotlib.pyplot as plt +import os +import uuid class PredictionRejectionArea(UEMetric): """ Calculates area under Prediction-Rejection curve. """ - def __init__(self, max_rejection: float = 1.0): - """ - Parameters: - max_rejection (float): a maximum proportion of instances that will be rejected. - 1.0 indicates entire set, 0.5 - half of the set - """ - super().__init__() - self.max_rejection = max_rejection - def __str__(self): - if self.max_rejection == 1: - return "prr" - return f"prr_{self.max_rejection}" - - def __call__(self, estimator: List[float], target: List[float]) -> float: - """ - Measures the area under the Prediction-Rejection curve between `estimator` and `target`. - - Parameters: - estimator (List[int]): a batch of uncertainty estimations. - Higher values indicate more uncertainty. - target (List[int]): a batch of ground-truth uncertainty estimations. - Higher values indicate less uncertainty. - Returns: - float: area under the Prediction-Rejection curve. - Higher values indicate better uncertainty estimations. 
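# Illustrative sketch (not from this patch set): the prediction-rejection idea on a
# toy batch, assuming a quality metric already scaled to [0, 1] and uncertainty
# values where higher means more uncertain.
import numpy as np

quality = np.array([0.9, 0.2, 0.6])
uncertainty = np.array([0.1, 0.8, 0.4])

# Mean quality when keeping only the k most-certain outputs, k = 1, 2, 3:
kept_means = np.cumsum(quality[np.argsort(uncertainty)]) / np.arange(1, 4)
# kept_means -> [0.9, 0.75, 0.567] (approximately); reversing gives the curve from
# "reject nothing" to "reject all but one", and its mean is the PRR score:
prr_score = kept_means[::-1].sum() / 3  # ~= 0.739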
- """ - target = normalize(target) - # ue: greater is more uncertain - ue = np.array(estimator) - num_obs = len(ue) - num_rej = int(self.max_rejection * num_obs) - # Sort in ascending order: the least uncertain come first - ue_argsort = np.argsort(ue) - # want sorted_metrics to be increasing => smaller scores is better - sorted_metrics = np.array(target)[ue_argsort] - # Since we want all plots to coincide when all the data is discarded - cumsum = np.cumsum(sorted_metrics)[-num_rej:] - scores = (cumsum / np.arange((num_obs - num_rej) + 1, num_obs + 1))[::-1] - prr_score = np.sum(scores) / num_rej - return prr_score + return "prr" + + def get_ue_rejection(self, estimator, target, num_remaining_points): + return np.flip(np.cumsum(target[np.argsort(estimator)]) / num_remaining_points) + + def get_oracle_rejection(self, estimator, target, num_remaining_points): + return np.flip(np.cumsum(np.flip(np.sort(target))) / num_remaining_points) + + def get_random_rejection(self, estimator, target, num_remaining_points, N_EXAMPLES): + random_rejection_accuracies = [] + for _ in range(1000): + order = np.arange(0, N_EXAMPLES) + np.random.shuffle(order) + random_rejection_accuracies.append(np.flip(np.cumsum(target[order]) / num_remaining_points)) + + return np.mean(random_rejection_accuracies, axis=0) + +def __call__(self, estimator: List[float], target: List[float], generate_curve:bool = False, e_level:str = '', e_name:str ='', gen_name:str ='', ue_metric:str ='') -> float: + """ + Measures the area under the Prediction-Rejection curve between `estimator` and `target`. + + Parameters: + estimator (List[float]): A batch of uncertainty estimations. + Higher values indicate more uncertainty. + target (List[float]): A batch of ground-truth uncertainty estimations. + Higher values indicate less uncertainty. + generate_curve (bool): A flag to generate and save the PRR curve if set to True. + e_level (str): Name of method level. + e_name (str): Name of estimattor method. + gen_name (str): Name of generation metric. + ue_metric (str): The uncertainty estimation metric used (for labeling purposes). + + Returns: + float: Area under the Prediction-Rejection curve (PRR score). + Higher values indicate better uncertainty estimations. 
+ """ + # Normalize the target values to a common scale + target = normalize(target) + + # Convert the estimator list to a NumPy array (UE stands for uncertainty estimation) + ue = np.array(estimator) + num_obs = len(ue) + + # Sort the indices of `ue` in ascending order, so least uncertain examples come first + ue_argsort = np.argsort(ue) + + # Sort the target metrics based on the sorted indices from the estimator + sorted_metrics = np.array(target)[ue_argsort] + + # Compute the cumulative sum of the sorted metrics for calculating the PRR score + cumsum = np.cumsum(sorted_metrics) + + # Calculate the scores as cumulative sums divided by the index (from the sorted order) + # and reverse the scores to get the final rejection curve + scores = (cumsum / np.arange(1, num_obs + 1))[::-1] + + # The PRR score is the average of the reversed cumulative sums, divided by the number of observations + prr_score = np.sum(scores) / num_obs + + # If `generate_curve` is set to True, generate and save the PRR curve plot + if generate_curve: + plots_dir = './plots' + if not os.path.exists(plots_dir): + os.makedirs(plots_dir) + + # Get the number of examples and remaining points to calculate rejection accuracies + N_EXAMPLES = len(estimator) + num_remaining_points = np.arange(1, N_EXAMPLES + 1) + + # Calculate rejection accuracies for the UE (uncertainty estimator), Oracle, and Random baselines + ue_rejected_accuracy = self.get_ue_rejection(estimator, target, num_remaining_points) + oracle_rejected_accuracy = self.get_oracle_rejection(estimator, target, num_remaining_points) + random_rejection_accuracy = self.get_random_rejection(estimator, target, num_remaining_points, N_EXAMPLES) + + # Define the rejection rates, ranging from 0 (keeping all data) to 1 (discarding all data) + rejection_rates = np.linspace(0, 1, N_EXAMPLES) + + # Plot the rejection curves for UE, Oracle, and Random using Seaborn for better visualization + sns.lineplot(x=rejection_rates, y=ue_rejected_accuracy, label='UE') + sns.lineplot(x=rejection_rates, y=oracle_rejected_accuracy, label='Oracle') + g = sns.lineplot(x=rejection_rates, y=random_rejection_accuracy, label='Random') + + # Customize the plot's labels, title, and grid for better readability + g.set_xlabel('Rejection Rate') + g.set_ylabel(f'{gen_name}') + g.set_title(f'PRR curve: {e_level}, {e_name}') + g.grid() + + # Generate a unique filename for the plot using UUID and save it as a PNG file + base_filename = 'prr_curve' + extension = 'png' + unique_id = uuid.uuid4() + new_filename = f"{base_filename}_{e_name}_{gen_name}_{unique_id}.{extension}" + save_path = os.path.join(plots_dir, new_filename) + + # Save the generated plot and close the figure to free up memory + plt.savefig(save_path) + plt.close() + + # Return the computed PRR score + return prr_score diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index 263034002..c9956d102 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -529,7 +529,11 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: else: oracle_score = ue_metric(-metric, metric) random_score = get_random_scores(ue_metric, metric) - ue_metric_val = ue_metric(ue, metric) + # For prr metric only - option to generate PRR curve + if str(ue_metric) == 'prr': + ue_metric_val = ue_metric(ue, metric, True, e_level, e_name, gen_name, str(ue_metric)) + else: + ue_metric_val = ue_metric(ue, metric) self.metrics[e_level, e_name, gen_name, str(ue_metric)] = ( ue_metric_val ) @@ -537,11 +541,13 
@@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: e_level, e_name, gen_name, str(ue_metric) + "_normalized" ] = normalize_metric(ue_metric_val, oracle_score, random_score) + for processor in self.processors: processor.on_eval(self.metrics, self.total_bad_estimators) return self.metrics + def calculate(self, batch_stats: dict, calculators: list, inp_texts: list) -> dict: """ Runs stat calculators and handles errors if any occur. Returns updated batch stats From 35bf72d51947b2e06d71a5a7b7da8464d8cc3f69 Mon Sep 17 00:00:00 2001 From: silvimica Date: Thu, 26 Sep 2024 14:52:55 +0400 Subject: [PATCH 2/4] PRR curve generation optimized --- requirements.txt | 5 +- src/lm_polygraph/ue_metrics/pred_rej_area.py | 135 +++++-------------- src/lm_polygraph/ue_metrics/ue_metric.py | 95 ++++++++++++- src/lm_polygraph/utils/manager.py | 16 ++- 4 files changed, 132 insertions(+), 119 deletions(-) diff --git a/requirements.txt b/requirements.txt index cbd1b7ba1..078246218 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,7 +34,4 @@ unbabel-comet==2.2.1 nltk>=3.7,<4 evaluate spacy>=3.4.0,<4 -fastchat -seaborn -uuid -os \ No newline at end of file +fastchat \ No newline at end of file diff --git a/src/lm_polygraph/ue_metrics/pred_rej_area.py b/src/lm_polygraph/ue_metrics/pred_rej_area.py index 27343342d..ec3d5d8ad 100644 --- a/src/lm_polygraph/ue_metrics/pred_rej_area.py +++ b/src/lm_polygraph/ue_metrics/pred_rej_area.py @@ -3,10 +3,7 @@ from typing import List from .ue_metric import UEMetric, normalize -import seaborn as sns -import matplotlib.pyplot as plt -import os -import uuid + class PredictionRejectionArea(UEMetric): """ @@ -16,102 +13,34 @@ class PredictionRejectionArea(UEMetric): def __str__(self): return "prr" - def get_ue_rejection(self, estimator, target, num_remaining_points): - return np.flip(np.cumsum(target[np.argsort(estimator)]) / num_remaining_points) - - def get_oracle_rejection(self, estimator, target, num_remaining_points): - return np.flip(np.cumsum(np.flip(np.sort(target))) / num_remaining_points) - - def get_random_rejection(self, estimator, target, num_remaining_points, N_EXAMPLES): - random_rejection_accuracies = [] - for _ in range(1000): - order = np.arange(0, N_EXAMPLES) - np.random.shuffle(order) - random_rejection_accuracies.append(np.flip(np.cumsum(target[order]) / num_remaining_points)) - - return np.mean(random_rejection_accuracies, axis=0) - -def __call__(self, estimator: List[float], target: List[float], generate_curve:bool = False, e_level:str = '', e_name:str ='', gen_name:str ='', ue_metric:str ='') -> float: - """ - Measures the area under the Prediction-Rejection curve between `estimator` and `target`. - - Parameters: - estimator (List[float]): A batch of uncertainty estimations. - Higher values indicate more uncertainty. - target (List[float]): A batch of ground-truth uncertainty estimations. - Higher values indicate less uncertainty. - generate_curve (bool): A flag to generate and save the PRR curve if set to True. - e_level (str): Name of method level. - e_name (str): Name of estimattor method. - gen_name (str): Name of generation metric. - ue_metric (str): The uncertainty estimation metric used (for labeling purposes). - - Returns: - float: Area under the Prediction-Rejection curve (PRR score). - Higher values indicate better uncertainty estimations. 
- """ - # Normalize the target values to a common scale - target = normalize(target) - - # Convert the estimator list to a NumPy array (UE stands for uncertainty estimation) - ue = np.array(estimator) - num_obs = len(ue) - - # Sort the indices of `ue` in ascending order, so least uncertain examples come first - ue_argsort = np.argsort(ue) - - # Sort the target metrics based on the sorted indices from the estimator - sorted_metrics = np.array(target)[ue_argsort] - - # Compute the cumulative sum of the sorted metrics for calculating the PRR score - cumsum = np.cumsum(sorted_metrics) - - # Calculate the scores as cumulative sums divided by the index (from the sorted order) - # and reverse the scores to get the final rejection curve - scores = (cumsum / np.arange(1, num_obs + 1))[::-1] - - # The PRR score is the average of the reversed cumulative sums, divided by the number of observations - prr_score = np.sum(scores) / num_obs - - # If `generate_curve` is set to True, generate and save the PRR curve plot - if generate_curve: - plots_dir = './plots' - if not os.path.exists(plots_dir): - os.makedirs(plots_dir) - - # Get the number of examples and remaining points to calculate rejection accuracies - N_EXAMPLES = len(estimator) - num_remaining_points = np.arange(1, N_EXAMPLES + 1) - - # Calculate rejection accuracies for the UE (uncertainty estimator), Oracle, and Random baselines - ue_rejected_accuracy = self.get_ue_rejection(estimator, target, num_remaining_points) - oracle_rejected_accuracy = self.get_oracle_rejection(estimator, target, num_remaining_points) - random_rejection_accuracy = self.get_random_rejection(estimator, target, num_remaining_points, N_EXAMPLES) - - # Define the rejection rates, ranging from 0 (keeping all data) to 1 (discarding all data) - rejection_rates = np.linspace(0, 1, N_EXAMPLES) - - # Plot the rejection curves for UE, Oracle, and Random using Seaborn for better visualization - sns.lineplot(x=rejection_rates, y=ue_rejected_accuracy, label='UE') - sns.lineplot(x=rejection_rates, y=oracle_rejected_accuracy, label='Oracle') - g = sns.lineplot(x=rejection_rates, y=random_rejection_accuracy, label='Random') - - # Customize the plot's labels, title, and grid for better readability - g.set_xlabel('Rejection Rate') - g.set_ylabel(f'{gen_name}') - g.set_title(f'PRR curve: {e_level}, {e_name}') - g.grid() - - # Generate a unique filename for the plot using UUID and save it as a PNG file - base_filename = 'prr_curve' - extension = 'png' - unique_id = uuid.uuid4() - new_filename = f"{base_filename}_{e_name}_{gen_name}_{unique_id}.{extension}" - save_path = os.path.join(plots_dir, new_filename) - - # Save the generated plot and close the figure to free up memory - plt.savefig(save_path) - plt.close() - - # Return the computed PRR score - return prr_score + def __call__(self, estimator: List[float], target: List[float], return_scores:bool = False) -> float: + """ + Measures the area under the Prediction-Rejection curve between `estimator` and `target`. + + Parameters: + estimator (List[int]): a batch of uncertainty estimations. + Higher values indicate more uncertainty. + target (List[int]): a batch of ground-truth uncertainty estimations. + Higher values indicate less uncertainty. + return_scores(bool): a marker for returning PRR scores. + Returns: + float: area under the Prediction-Rejection curve. + Higher values indicate better uncertainty estimations. 
+ """ + target = normalize(target) + # ue: greater is more uncertain + ue = np.array(estimator) + num_obs = len(ue) + # Sort in ascending order: the least uncertain come first + ue_argsort = np.argsort(ue) + # want sorted_metrics to be increasing => smaller scores is better + sorted_metrics = np.array(target)[ue_argsort] + # Since we want all plots to coincide when all the data is discarded + cumsum = np.cumsum(sorted_metrics) + scores = (cumsum / np.arange(1, num_obs + 1))[::-1] + prr_score = np.sum(scores) / num_obs + + if return_scores: + return prr_score, scores + + return prr_score diff --git a/src/lm_polygraph/ue_metrics/ue_metric.py b/src/lm_polygraph/ue_metrics/ue_metric.py index 4e9f4b04c..ec9a60884 100644 --- a/src/lm_polygraph/ue_metrics/ue_metric.py +++ b/src/lm_polygraph/ue_metrics/ue_metric.py @@ -3,6 +3,9 @@ from typing import List from abc import ABC, abstractmethod +import matplotlib.pyplot as plt +import os +import uuid def normalize(target: List[float]): min_t, max_t = np.min(target), np.max(target) @@ -57,19 +60,99 @@ def __call__(self, estimator: List[float], target: List[float]) -> float: raise Exception("Not implemented") -def get_random_scores(function, metrics, num_iter=1000, seed=42): + +def get_random_scores(function, metrics, return_scores:bool = False, num_iter=1000, seed=42): np.random.seed(seed) + rand_scores = np.arange(len(metrics)) - value = [] - for i in range(num_iter): + prr_values = [] # To store PRR scores across iterations + score_values = [] # To store detailed scores or rejection accuracies across iterations + + for _ in range(num_iter): np.random.shuffle(rand_scores) - rand_val = function(rand_scores, metrics) - value.append(rand_val) - return np.mean(value) + + # Use the function like __call__ to compute PRR score and optionally return detailed scores + if return_scores: + # Call the function to get both PRR score and detailed scores + prr_score, detailed_scores = function(rand_scores, metrics, return_scores=True) + prr_values.append(prr_score) + score_values.append(detailed_scores) + else: + # Call the function to get only the PRR score + prr_score = function(rand_scores, metrics) + prr_values.append(prr_score) + + # Compute the mean PRR score across all iterations + mean_prr_score = np.mean(prr_values) + + # If return_scores is True, also compute the mean of the detailed scores (or rejection accuracies) + if return_scores: + mean_scores = np.mean(score_values, axis=0) + return mean_prr_score, mean_scores + + # Otherwise, just return the PRR score (no detailed scores) + return mean_prr_score + def normalize_metric(target_score, oracle_score, random_score): if not (oracle_score == random_score): target_score = (target_score - random_score) / (oracle_score - random_score) return target_score + + + + +def generate_prr_curve(ue_rejected_accuracy, oracle_rejected_accuracy, random_rejected_accuracy, e_level: str, e_name: str, gen_name: str): + """ + Generates and saves a PRR curve plot using only matplotlib. + + Parameters: + ue_rejected_accuracy (np.array): Rejection curve for uncertainty estimation (UE). + oracle_rejected_accuracy (np.array): Rejection curve for Oracle (ideal). + random_rejected_accuracy (np.array): Rejection curve for Random baseline. + e_level (str): Experiment level. + e_name (str): Experiment name. + gen_name (str): General name for the plot label. + + Returns: + str: The path where the plot is saved. 
+ """ + # Directory to save plots + plots_dir = './plots' + os.makedirs(plots_dir, exist_ok=True) + + # Number of examples + N_EXAMPLES = len(ue_rejected_accuracy) + + # Rejection rates (x-axis) + rejection_rates = np.linspace(0, 1, N_EXAMPLES) + + # Create plot + plt.figure(figsize=(8, 6)) + + # Plot each line (UE, Oracle, Random) + plt.plot(rejection_rates, ue_rejected_accuracy, label='UE', linestyle='-') + plt.plot(rejection_rates, oracle_rejected_accuracy, label='Oracle', linestyle='-') + plt.plot(rejection_rates, random_rejected_accuracy, label='Random', linestyle='-') + + # Add labels and title + plt.xlabel('Rejection Rate') + plt.ylabel(f'{gen_name}') + plt.title(f'PRR curve: {e_level}, {e_name}') + + # Add grid and legend + plt.grid(True) + plt.legend() + + # Generate a random UUID for the filename + base_filename = 'prr_curve' + extension = 'png' + unique_id = uuid.uuid4() + new_filename = f"{base_filename}_{e_name}_{gen_name}_{unique_id}.{extension}" + save_path = os.path.join(plots_dir, new_filename) + + # Save the plot + plt.savefig(save_path) + plt.close() \ No newline at end of file diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index c9956d102..f8f9053b9 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -19,6 +19,7 @@ UEMetric, get_random_scores, normalize_metric, + generate_prr_curve ) from lm_polygraph.estimators.estimator import Estimator from lm_polygraph.stat_calculators.stat_calculator import StatCalculator @@ -511,6 +512,7 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: torch.cuda.empty_cache() gc.collect() + for (e_level, e_name), estimator_values in self.estimations.items(): for (gen_level, gen_name), generation_metric in self.gen_metrics.items(): for ue_metric in self.ue_metrics: @@ -527,13 +529,17 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: if len(ue) == 0: self.metrics[e_level, e_name, gen_name, str(ue_metric)] = np.nan else: - oracle_score = ue_metric(-metric, metric) - random_score = get_random_scores(ue_metric, metric) - # For prr metric only - option to generate PRR curve + # For prr, generate plot if str(ue_metric) == 'prr': - ue_metric_val = ue_metric(ue, metric, True, e_level, e_name, gen_name, str(ue_metric)) + oracle_score, oracle_scores = ue_metric(-metric, metric, True) + random_score, random_scores = get_random_scores(ue_metric, metric, True) + ue_metric_val , ue_scores = ue_metric(ue, metric, True) + generate_prr_curve(ue_scores, oracle_scores, random_scores, e_level, e_name, gen_name) else: + oracle_score= ue_metric(-metric, metric) + random_score = get_random_scores(ue_metric, metric) ue_metric_val = ue_metric(ue, metric) + self.metrics[e_level, e_name, gen_name, str(ue_metric)] = ( ue_metric_val ) @@ -541,13 +547,11 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: e_level, e_name, gen_name, str(ue_metric) + "_normalized" ] = normalize_metric(ue_metric_val, oracle_score, random_score) - for processor in self.processors: processor.on_eval(self.metrics, self.total_bad_estimators) return self.metrics - def calculate(self, batch_stats: dict, calculators: list, inp_texts: list) -> dict: """ Runs stat calculators and handles errors if any occur. 
Returns updated batch stats From cbbfe0d90a42371264c1d673d0a3c0f01a78e341 Mon Sep 17 00:00:00 2001 From: silvimica Date: Tue, 8 Oct 2024 08:08:13 +0400 Subject: [PATCH 3/4] Added output_prr_curve to config and UEManager --- scripts/polygraph_eval | 4 ++ src/lm_polygraph/ue_metrics/pred_rej_area.py | 20 ++++++- src/lm_polygraph/ue_metrics/ue_metric.py | 9 ++- src/lm_polygraph/utils/manager.py | 59 ++++++++++++++++---- 4 files changed, 75 insertions(+), 17 deletions(-) diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index 23f6db69e..5fc9d68d4 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -40,6 +40,7 @@ def main(args): os.chdir(hydra.utils.get_original_cwd()) save_path = args.save_path if "save_path" in args else save_path + output_prr_curves = getattr(args, "output_prr_curves", False) if args.seed is None or len(args.seed) == 0: args.seed = [1] @@ -201,8 +202,11 @@ def main(args): ensemble_model=ensemble_model, cache_path=args.cache_path, language=getattr(args, 'language', 'en'), + output_prr_curves = output_prr_curves, + save_path= save_path ) + man() man.save(save_path + f"/ue_manager_seed{seed}") diff --git a/src/lm_polygraph/ue_metrics/pred_rej_area.py b/src/lm_polygraph/ue_metrics/pred_rej_area.py index ec3d5d8ad..a365d3e53 100644 --- a/src/lm_polygraph/ue_metrics/pred_rej_area.py +++ b/src/lm_polygraph/ue_metrics/pred_rej_area.py @@ -10,7 +10,19 @@ class PredictionRejectionArea(UEMetric): Calculates area under Prediction-Rejection curve. """ + def __init__(self, max_rejection: float = 1.0): + """ + Parameters: + max_rejection (float): a maximum proportion of instances that will be rejected. + 1.0 indicates entire set, 0.5 - half of the set + """ + super().__init__() + self.max_rejection = max_rejection + def __str__(self): + if self.max_rejection == 1: + return "prr" + return f"prr_{self.max_rejection}" return "prr" def __call__(self, estimator: List[float], target: List[float], return_scores:bool = False) -> float: @@ -31,14 +43,16 @@ def __call__(self, estimator: List[float], target: List[float], return_scores:bo # ue: greater is more uncertain ue = np.array(estimator) num_obs = len(ue) + num_rej = int(self.max_rejection * num_obs) # Sort in ascending order: the least uncertain come first ue_argsort = np.argsort(ue) # want sorted_metrics to be increasing => smaller scores is better sorted_metrics = np.array(target)[ue_argsort] # Since we want all plots to coincide when all the data is discarded - cumsum = np.cumsum(sorted_metrics) - scores = (cumsum / np.arange(1, num_obs + 1))[::-1] - prr_score = np.sum(scores) / num_obs + cumsum = np.cumsum(sorted_metrics)[-num_rej:] + scores = (cumsum / np.arange((num_obs - num_rej) + 1, num_obs + 1))[::-1] + prr_score = np.sum(scores) / num_rej + if return_scores: return prr_score, scores diff --git a/src/lm_polygraph/ue_metrics/ue_metric.py b/src/lm_polygraph/ue_metrics/ue_metric.py index ec9a60884..7a1c96d3d 100644 --- a/src/lm_polygraph/ue_metrics/ue_metric.py +++ b/src/lm_polygraph/ue_metrics/ue_metric.py @@ -104,7 +104,9 @@ def normalize_metric(target_score, oracle_score, random_score): -def generate_prr_curve(ue_rejected_accuracy, oracle_rejected_accuracy, random_rejected_accuracy, e_level: str, e_name: str, gen_name: str): + + +def generate_prr_curve(ue_rejected_accuracy, oracle_rejected_accuracy, random_rejected_accuracy, e_level: str, e_name: str, gen_name: str, path: str): """ Generates and saves a PRR curve plot using only matplotlib. 
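# Illustrative sketch (not from this patch set) of the restored max_rejection behaviour;
# `prr_truncated` is a hypothetical standalone name, and `target` is assumed to be
# normalized already, as in the method above.
import numpy as np

def prr_truncated(estimator, target, max_rejection=1.0):
    """PRR averaged only over rejection rates in [0, max_rejection]."""
    ue = np.asarray(estimator, dtype=float)
    target = np.asarray(target, dtype=float)
    num_obs = len(ue)
    num_rej = int(max_rejection * num_obs)  # how many points of the curve to keep
    sorted_metrics = target[np.argsort(ue)]  # most certain outputs first
    # Keep only the cumulative means that correspond to rejecting between 0 and
    # num_rej - 1 outputs, then reverse so index 0 rejects nothing.
    cumsum = np.cumsum(sorted_metrics)[-num_rej:]
    scores = (cumsum / np.arange(num_obs - num_rej + 1, num_obs + 1))[::-1]
    return scores.sum() / num_rej

# prr_truncated(ue, quality, max_rejection=0.5) averages the curve over rejection
# rates up to 50 %, matching the "prr_0.5" metric name produced by __str__ above.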
@@ -120,8 +122,9 @@ def generate_prr_curve(ue_rejected_accuracy, oracle_rejected_accuracy, random_re str: The path where the plot is saved. """ # Directory to save plots - plots_dir = './plots' - os.makedirs(plots_dir, exist_ok=True) + + plots_dir = path + # os.makedirs(plots_dir, exist_ok=True) # Number of examples N_EXAMPLES = len(ue_rejected_accuracy) diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index f8f9053b9..7e14fc3b7 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -257,6 +257,8 @@ def __init__( max_new_tokens: int = 100, background_train_dataset_max_new_tokens: int = 100, cache_path=os.path.expanduser("~") + "/.cache", + output_prr_curves=False, + save_path = '' ): """ Parameters: @@ -276,8 +278,42 @@ def __init__( deberta_device (Optional[str]): The device to run deberta on. If None, will use 'cuda:0' if available, 'cpu' otherwise. Default: None. language (str): Language to test in claim-level benchmark, one of 'en', 'zh', 'ar', 'ru'. Default: 'en'. - verbose (bool): If set, will print useful info during batch processing. Default: True. + verbose for (e_level, e_name), estimator_values in self.estimations.items(): + for (gen_level, gen_name), generation_metric in self.gen_metrics.items(): + for ue_metric in self.ue_metrics: + if gen_level != e_level: + continue + if len(estimator_values) != len(generation_metric): + raise Exception( + f"Got different number of metrics for {e_name} and {gen_name}: " + f"{len(estimator_values)} and {len(generation_metric)}" + ) + # TODO: Report how many nans! + # This is important to know for a user + ue, metric = _delete_nans(estimator_values, generation_metric) + if len(ue) == 0: + self.metrics[e_level, e_name, gen_name, str(ue_metric)] = np.nan + else: + # For prr, generate plot + if str(ue_metric) == 'prr': + oracle_score, oracle_scores = ue_metric(-metric, metric, True) + random_score, random_scores = get_random_scores(ue_metric, metric, True) + ue_metric_val , ue_scores = ue_metric(ue, metric, True) + generate_prr_curve(ue_scores, oracle_scores, random_scores, e_level, e_name, gen_name) + else: + oracle_score= ue_metric(-metric, metric) + random_score = get_random_scores(ue_metric, metric) + ue_metric_val = ue_metric(ue, metric) + + self.metrics[e_level, e_name, gen_name, str(ue_metric)] = ( + ue_metric_val + ) + self.metrics[ + e_level, e_name, gen_name, str(ue_metric) + "_normalized" + ] = normalize_metric(ue_metric_val, oracle_score, random_score) (bool): If set, will print useful info during batch processing. Default: True. max_new_tokens (int): Maximum new tokens to use in generation. Default: 100. 
+ output_prr_curves: Flag for generating PRR curves in the save_path directory + save_path: save_path from config """ stat_calculators_dict, stat_dependencies_dict = register_stat_calculators( @@ -285,12 +321,11 @@ def __init__( deberta_device=deberta_device, language=language, cache_path=cache_path, - model=model, ) self.stat_calculators_dict = stat_calculators_dict - - self.model: Model = model + self.save_path = save_path + self.model: WhiteboxModel = model self.train_data: Dataset = train_data self.background_train_data: Dataset = background_train_data self.ensemble_model = ensemble_model @@ -298,13 +333,15 @@ def __init__( self.estimators: List[Estimator] = estimators self.generation_metrics: List[GenerationMetric] = generation_metrics self.ue_metrics: List[UEMetric] = ue_metrics + self.output_prr_curves = output_prr_curves _check_unique_names(generation_metrics) _check_unique_names(estimators) _check_unique_names(ue_metrics) - greedy = ["greedy_texts"] - if not isinstance(self.model, BlackboxModel): - greedy += ["greedy_tokens"] + if isinstance(model, BlackboxModel): + greedy = ["blackbox_greedy_texts"] + else: + greedy = ["greedy_tokens", "greedy_texts"] stats = ( [s for e in self.estimators for s in e.stats_dependencies] @@ -384,7 +421,7 @@ def __init__( ensemble_stats = [ s for e in self.ensemble_estimators - for s in e.stats_dependencies + for s in e.stats_dependenciesgenerate_prr_curve if s.startswith("ensemble") ] ensemble_stats, _ = _order_calculators( @@ -530,13 +567,13 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: self.metrics[e_level, e_name, gen_name, str(ue_metric)] = np.nan else: # For prr, generate plot - if str(ue_metric) == 'prr': + if str(ue_metric) == 'prr' and self.output_prr_curves: oracle_score, oracle_scores = ue_metric(-metric, metric, True) random_score, random_scores = get_random_scores(ue_metric, metric, True) ue_metric_val , ue_scores = ue_metric(ue, metric, True) - generate_prr_curve(ue_scores, oracle_scores, random_scores, e_level, e_name, gen_name) + generate_prr_curve(ue_scores, oracle_scores, random_scores, e_level, e_name, gen_name, self.save_path) else: - oracle_score= ue_metric(-metric, metric) + oracle_score = ue_metric(-metric, metric) random_score = get_random_scores(ue_metric, metric) ue_metric_val = ue_metric(ue, metric) From 63a2baa26762f65b03f3492a81473409e9c1adac Mon Sep 17 00:00:00 2001 From: silvimica Date: Wed, 16 Oct 2024 15:46:15 +0400 Subject: [PATCH 4/4] Fixes to code --- scripts/polygraph_eval | 3 ++- src/lm_polygraph/ue_metrics/pred_rej_area.py | 1 - src/lm_polygraph/ue_metrics/ue_metric.py | 12 ++++++------ src/lm_polygraph/utils/manager.py | 2 -- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index 5fc9d68d4..d1df37cb6 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -40,7 +40,7 @@ def main(args): os.chdir(hydra.utils.get_original_cwd()) save_path = args.save_path if "save_path" in args else save_path - output_prr_curves = getattr(args, "output_prr_curves", False) + if args.seed is None or len(args.seed) == 0: args.seed = [1] @@ -184,6 +184,7 @@ def main(args): generation_metrics = get_generation_metrics(args) ue_metrics = get_ue_metrics(args) + output_prr_curves = getattr(args, "output_prr_curves", False) man = UEManager( dataset, diff --git a/src/lm_polygraph/ue_metrics/pred_rej_area.py b/src/lm_polygraph/ue_metrics/pred_rej_area.py index a365d3e53..9457faf20 100644 --- a/src/lm_polygraph/ue_metrics/pred_rej_area.py +++ 
b/src/lm_polygraph/ue_metrics/pred_rej_area.py @@ -23,7 +23,6 @@ def __str__(self): if self.max_rejection == 1: return "prr" return f"prr_{self.max_rejection}" - return "prr" def __call__(self, estimator: List[float], target: List[float], return_scores:bool = False) -> float: """ diff --git a/src/lm_polygraph/ue_metrics/ue_metric.py b/src/lm_polygraph/ue_metrics/ue_metric.py index 7a1c96d3d..c3a414420 100644 --- a/src/lm_polygraph/ue_metrics/ue_metric.py +++ b/src/lm_polygraph/ue_metrics/ue_metric.py @@ -66,7 +66,7 @@ def get_random_scores(function, metrics, return_scores:bool = False, num_iter=10 rand_scores = np.arange(len(metrics)) - prr_values = [] # To store PRR scores across iterations + ue_metric_scores = [] # To store PRR scores across iterations score_values = [] # To store detailed scores or rejection accuracies across iterations for _ in range(num_iter): @@ -76,23 +76,23 @@ def get_random_scores(function, metrics, return_scores:bool = False, num_iter=10 if return_scores: # Call the function to get both PRR score and detailed scores prr_score, detailed_scores = function(rand_scores, metrics, return_scores=True) - prr_values.append(prr_score) + ue_metric_scores.append(prr_score) score_values.append(detailed_scores) else: # Call the function to get only the PRR score prr_score = function(rand_scores, metrics) - prr_values.append(prr_score) + ue_metric_scores.append(prr_score) # Compute the mean PRR score across all iterations - mean_prr_score = np.mean(prr_values) + ue_metric_score = np.mean(ue_metric_scores) # If return_scores is True, also compute the mean of the detailed scores (or rejection accuracies) if return_scores: mean_scores = np.mean(score_values, axis=0) - return mean_prr_score, mean_scores + return ue_metric_score, mean_scores # Otherwise, just return the PRR score (no detailed scores) - return mean_prr_score + return ue_metric_score diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index 7e14fc3b7..01a6c112f 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -278,7 +278,6 @@ def __init__( deberta_device (Optional[str]): The device to run deberta on. If None, will use 'cuda:0' if available, 'cpu' otherwise. Default: None. language (str): Language to test in claim-level benchmark, one of 'en', 'zh', 'ar', 'ru'. Default: 'en'. - verbose for (e_level, e_name), estimator_values in self.estimations.items(): for (gen_level, gen_name), generation_metric in self.gen_metrics.items(): for ue_metric in self.ue_metrics: if gen_level != e_level: @@ -421,7 +420,6 @@ def __init__( ensemble_stats = [ s for e in self.ensemble_estimators - for s in e.stats_dependenciesgenerate_prr_curve if s.startswith("ensemble") ] ensemble_stats, _ = _order_calculators(
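# Illustrative sketch (not from this patch set) of the control flow left after the whole
# series: PRR curves are drawn only when the config sets output_prr_curves, and the plot
# is written under save_path. `compute_prr`, `random_baseline` and `plot_prr_curve` are
# hypothetical stand-ins for PredictionRejectionArea.__call__, get_random_scores and
# generate_prr_curve.
import numpy as np

def score_estimator(ue, metric, ue_metric_name, output_prr_curves, save_path,
                    compute_prr, random_baseline, plot_prr_curve):
    ue = np.asarray(ue, dtype=float)
    metric = np.asarray(metric, dtype=float)
    if ue_metric_name == "prr" and output_prr_curves:
        # Oracle: the negated quality metric acts as a perfect uncertainty estimate.
        oracle_score, oracle_curve = compute_prr(-metric, metric, True)
        random_score, random_curve = random_baseline(metric, True)
        ue_score, ue_curve = compute_prr(ue, metric, True)
        plot_prr_curve(ue_curve, oracle_curve, random_curve, save_path)
    else:
        oracle_score = compute_prr(-metric, metric)
        random_score = random_baseline(metric)
        ue_score = compute_prr(ue, metric)
    # Same guard as normalize_metric(): only rescale when oracle and random differ.
    if oracle_score != random_score:
        ue_norm = (ue_score - random_score) / (oracle_score - random_score)
    else:
        ue_norm = ue_score
    return ue_score, ue_norm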