From cc94af5551e86c7cc71b5a3bf50e2c3205e63d55 Mon Sep 17 00:00:00 2001 From: SDUgitrep Date: Tue, 24 Sep 2024 21:40:01 +0400 Subject: [PATCH 1/4] Update the PredictionRejectionArea to allow for PRR curve generation --- requirements.txt | 3 + src/lm_polygraph/ue_metrics/pred_rej_area.py | 146 +++++++++++++------ src/lm_polygraph/utils/manager.py | 8 +- 3 files changed, 115 insertions(+), 42 deletions(-) diff --git a/requirements.txt b/requirements.txt index 949524b00..cbd1b7ba1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,3 +35,6 @@ nltk>=3.7,<4 evaluate spacy>=3.4.0,<4 fastchat +seaborn +uuid +os \ No newline at end of file diff --git a/src/lm_polygraph/ue_metrics/pred_rej_area.py b/src/lm_polygraph/ue_metrics/pred_rej_area.py index bcbfbc802..27343342d 100644 --- a/src/lm_polygraph/ue_metrics/pred_rej_area.py +++ b/src/lm_polygraph/ue_metrics/pred_rej_area.py @@ -3,51 +3,115 @@ from typing import List from .ue_metric import UEMetric, normalize - +import seaborn as sns +import matplotlib.pyplot as plt +import os +import uuid class PredictionRejectionArea(UEMetric): """ Calculates area under Prediction-Rejection curve. """ - def __init__(self, max_rejection: float = 1.0): - """ - Parameters: - max_rejection (float): a maximum proportion of instances that will be rejected. - 1.0 indicates entire set, 0.5 - half of the set - """ - super().__init__() - self.max_rejection = max_rejection - def __str__(self): - if self.max_rejection == 1: - return "prr" - return f"prr_{self.max_rejection}" - - def __call__(self, estimator: List[float], target: List[float]) -> float: - """ - Measures the area under the Prediction-Rejection curve between `estimator` and `target`. - - Parameters: - estimator (List[int]): a batch of uncertainty estimations. - Higher values indicate more uncertainty. - target (List[int]): a batch of ground-truth uncertainty estimations. - Higher values indicate less uncertainty. - Returns: - float: area under the Prediction-Rejection curve. - Higher values indicate better uncertainty estimations. 
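# Illustrative sketch (not from this patch set): the prediction-rejection idea on a
# toy batch, assuming a quality metric already scaled to [0, 1] and uncertainty
# values where higher means more uncertain.
import numpy as np

quality = np.array([0.9, 0.2, 0.6])
uncertainty = np.array([0.1, 0.8, 0.4])

# Mean quality when keeping only the k most-certain outputs, k = 1, 2, 3:
kept_means = np.cumsum(quality[np.argsort(uncertainty)]) / np.arange(1, 4)
# kept_means -> [0.9, 0.75, 0.567] (approximately); reversing gives the curve from
# "reject nothing" to "reject all but one", and its mean is the PRR score:
prr_score = kept_means[::-1].sum() / 3  # ~= 0.739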
- """ - target = normalize(target) - # ue: greater is more uncertain - ue = np.array(estimator) - num_obs = len(ue) - num_rej = int(self.max_rejection * num_obs) - # Sort in ascending order: the least uncertain come first - ue_argsort = np.argsort(ue) - # want sorted_metrics to be increasing => smaller scores is better - sorted_metrics = np.array(target)[ue_argsort] - # Since we want all plots to coincide when all the data is discarded - cumsum = np.cumsum(sorted_metrics)[-num_rej:] - scores = (cumsum / np.arange((num_obs - num_rej) + 1, num_obs + 1))[::-1] - prr_score = np.sum(scores) / num_rej - return prr_score + return "prr" + + def get_ue_rejection(self, estimator, target, num_remaining_points): + return np.flip(np.cumsum(target[np.argsort(estimator)]) / num_remaining_points) + + def get_oracle_rejection(self, estimator, target, num_remaining_points): + return np.flip(np.cumsum(np.flip(np.sort(target))) / num_remaining_points) + + def get_random_rejection(self, estimator, target, num_remaining_points, N_EXAMPLES): + random_rejection_accuracies = [] + for _ in range(1000): + order = np.arange(0, N_EXAMPLES) + np.random.shuffle(order) + random_rejection_accuracies.append(np.flip(np.cumsum(target[order]) / num_remaining_points)) + + return np.mean(random_rejection_accuracies, axis=0) + +def __call__(self, estimator: List[float], target: List[float], generate_curve:bool = False, e_level:str = '', e_name:str ='', gen_name:str ='', ue_metric:str ='') -> float: + """ + Measures the area under the Prediction-Rejection curve between `estimator` and `target`. + + Parameters: + estimator (List[float]): A batch of uncertainty estimations. + Higher values indicate more uncertainty. + target (List[float]): A batch of ground-truth uncertainty estimations. + Higher values indicate less uncertainty. + generate_curve (bool): A flag to generate and save the PRR curve if set to True. + e_level (str): Name of method level. + e_name (str): Name of estimattor method. + gen_name (str): Name of generation metric. + ue_metric (str): The uncertainty estimation metric used (for labeling purposes). + + Returns: + float: Area under the Prediction-Rejection curve (PRR score). + Higher values indicate better uncertainty estimations. 
+ """ + # Normalize the target values to a common scale + target = normalize(target) + + # Convert the estimator list to a NumPy array (UE stands for uncertainty estimation) + ue = np.array(estimator) + num_obs = len(ue) + + # Sort the indices of `ue` in ascending order, so least uncertain examples come first + ue_argsort = np.argsort(ue) + + # Sort the target metrics based on the sorted indices from the estimator + sorted_metrics = np.array(target)[ue_argsort] + + # Compute the cumulative sum of the sorted metrics for calculating the PRR score + cumsum = np.cumsum(sorted_metrics) + + # Calculate the scores as cumulative sums divided by the index (from the sorted order) + # and reverse the scores to get the final rejection curve + scores = (cumsum / np.arange(1, num_obs + 1))[::-1] + + # The PRR score is the average of the reversed cumulative sums, divided by the number of observations + prr_score = np.sum(scores) / num_obs + + # If `generate_curve` is set to True, generate and save the PRR curve plot + if generate_curve: + plots_dir = './plots' + if not os.path.exists(plots_dir): + os.makedirs(plots_dir) + + # Get the number of examples and remaining points to calculate rejection accuracies + N_EXAMPLES = len(estimator) + num_remaining_points = np.arange(1, N_EXAMPLES + 1) + + # Calculate rejection accuracies for the UE (uncertainty estimator), Oracle, and Random baselines + ue_rejected_accuracy = self.get_ue_rejection(estimator, target, num_remaining_points) + oracle_rejected_accuracy = self.get_oracle_rejection(estimator, target, num_remaining_points) + random_rejection_accuracy = self.get_random_rejection(estimator, target, num_remaining_points, N_EXAMPLES) + + # Define the rejection rates, ranging from 0 (keeping all data) to 1 (discarding all data) + rejection_rates = np.linspace(0, 1, N_EXAMPLES) + + # Plot the rejection curves for UE, Oracle, and Random using Seaborn for better visualization + sns.lineplot(x=rejection_rates, y=ue_rejected_accuracy, label='UE') + sns.lineplot(x=rejection_rates, y=oracle_rejected_accuracy, label='Oracle') + g = sns.lineplot(x=rejection_rates, y=random_rejection_accuracy, label='Random') + + # Customize the plot's labels, title, and grid for better readability + g.set_xlabel('Rejection Rate') + g.set_ylabel(f'{gen_name}') + g.set_title(f'PRR curve: {e_level}, {e_name}') + g.grid() + + # Generate a unique filename for the plot using UUID and save it as a PNG file + base_filename = 'prr_curve' + extension = 'png' + unique_id = uuid.uuid4() + new_filename = f"{base_filename}_{e_name}_{gen_name}_{unique_id}.{extension}" + save_path = os.path.join(plots_dir, new_filename) + + # Save the generated plot and close the figure to free up memory + plt.savefig(save_path) + plt.close() + + # Return the computed PRR score + return prr_score diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index 263034002..c9956d102 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -529,7 +529,11 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: else: oracle_score = ue_metric(-metric, metric) random_score = get_random_scores(ue_metric, metric) - ue_metric_val = ue_metric(ue, metric) + # For prr metric only - option to generate PRR curve + if str(ue_metric) == 'prr': + ue_metric_val = ue_metric(ue, metric, True, e_level, e_name, gen_name, str(ue_metric)) + else: + ue_metric_val = ue_metric(ue, metric) self.metrics[e_level, e_name, gen_name, str(ue_metric)] = ( ue_metric_val ) @@ -537,11 +541,13 
@@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: e_level, e_name, gen_name, str(ue_metric) + "_normalized" ] = normalize_metric(ue_metric_val, oracle_score, random_score) + for processor in self.processors: processor.on_eval(self.metrics, self.total_bad_estimators) return self.metrics + def calculate(self, batch_stats: dict, calculators: list, inp_texts: list) -> dict: """ Runs stat calculators and handles errors if any occur. Returns updated batch stats From 35bf72d51947b2e06d71a5a7b7da8464d8cc3f69 Mon Sep 17 00:00:00 2001 From: silvimica Date: Thu, 26 Sep 2024 14:52:55 +0400 Subject: [PATCH 2/4] PRR curve generation optimized --- requirements.txt | 5 +- src/lm_polygraph/ue_metrics/pred_rej_area.py | 135 +++++-------------- src/lm_polygraph/ue_metrics/ue_metric.py | 95 ++++++++++++- src/lm_polygraph/utils/manager.py | 16 ++- 4 files changed, 132 insertions(+), 119 deletions(-) diff --git a/requirements.txt b/requirements.txt index cbd1b7ba1..078246218 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,7 +34,4 @@ unbabel-comet==2.2.1 nltk>=3.7,<4 evaluate spacy>=3.4.0,<4 -fastchat -seaborn -uuid -os \ No newline at end of file +fastchat \ No newline at end of file diff --git a/src/lm_polygraph/ue_metrics/pred_rej_area.py b/src/lm_polygraph/ue_metrics/pred_rej_area.py index 27343342d..ec3d5d8ad 100644 --- a/src/lm_polygraph/ue_metrics/pred_rej_area.py +++ b/src/lm_polygraph/ue_metrics/pred_rej_area.py @@ -3,10 +3,7 @@ from typing import List from .ue_metric import UEMetric, normalize -import seaborn as sns -import matplotlib.pyplot as plt -import os -import uuid + class PredictionRejectionArea(UEMetric): """ @@ -16,102 +13,34 @@ class PredictionRejectionArea(UEMetric): def __str__(self): return "prr" - def get_ue_rejection(self, estimator, target, num_remaining_points): - return np.flip(np.cumsum(target[np.argsort(estimator)]) / num_remaining_points) - - def get_oracle_rejection(self, estimator, target, num_remaining_points): - return np.flip(np.cumsum(np.flip(np.sort(target))) / num_remaining_points) - - def get_random_rejection(self, estimator, target, num_remaining_points, N_EXAMPLES): - random_rejection_accuracies = [] - for _ in range(1000): - order = np.arange(0, N_EXAMPLES) - np.random.shuffle(order) - random_rejection_accuracies.append(np.flip(np.cumsum(target[order]) / num_remaining_points)) - - return np.mean(random_rejection_accuracies, axis=0) - -def __call__(self, estimator: List[float], target: List[float], generate_curve:bool = False, e_level:str = '', e_name:str ='', gen_name:str ='', ue_metric:str ='') -> float: - """ - Measures the area under the Prediction-Rejection curve between `estimator` and `target`. - - Parameters: - estimator (List[float]): A batch of uncertainty estimations. - Higher values indicate more uncertainty. - target (List[float]): A batch of ground-truth uncertainty estimations. - Higher values indicate less uncertainty. - generate_curve (bool): A flag to generate and save the PRR curve if set to True. - e_level (str): Name of method level. - e_name (str): Name of estimattor method. - gen_name (str): Name of generation metric. - ue_metric (str): The uncertainty estimation metric used (for labeling purposes). - - Returns: - float: Area under the Prediction-Rejection curve (PRR score). - Higher values indicate better uncertainty estimations. 
- """ - # Normalize the target values to a common scale - target = normalize(target) - - # Convert the estimator list to a NumPy array (UE stands for uncertainty estimation) - ue = np.array(estimator) - num_obs = len(ue) - - # Sort the indices of `ue` in ascending order, so least uncertain examples come first - ue_argsort = np.argsort(ue) - - # Sort the target metrics based on the sorted indices from the estimator - sorted_metrics = np.array(target)[ue_argsort] - - # Compute the cumulative sum of the sorted metrics for calculating the PRR score - cumsum = np.cumsum(sorted_metrics) - - # Calculate the scores as cumulative sums divided by the index (from the sorted order) - # and reverse the scores to get the final rejection curve - scores = (cumsum / np.arange(1, num_obs + 1))[::-1] - - # The PRR score is the average of the reversed cumulative sums, divided by the number of observations - prr_score = np.sum(scores) / num_obs - - # If `generate_curve` is set to True, generate and save the PRR curve plot - if generate_curve: - plots_dir = './plots' - if not os.path.exists(plots_dir): - os.makedirs(plots_dir) - - # Get the number of examples and remaining points to calculate rejection accuracies - N_EXAMPLES = len(estimator) - num_remaining_points = np.arange(1, N_EXAMPLES + 1) - - # Calculate rejection accuracies for the UE (uncertainty estimator), Oracle, and Random baselines - ue_rejected_accuracy = self.get_ue_rejection(estimator, target, num_remaining_points) - oracle_rejected_accuracy = self.get_oracle_rejection(estimator, target, num_remaining_points) - random_rejection_accuracy = self.get_random_rejection(estimator, target, num_remaining_points, N_EXAMPLES) - - # Define the rejection rates, ranging from 0 (keeping all data) to 1 (discarding all data) - rejection_rates = np.linspace(0, 1, N_EXAMPLES) - - # Plot the rejection curves for UE, Oracle, and Random using Seaborn for better visualization - sns.lineplot(x=rejection_rates, y=ue_rejected_accuracy, label='UE') - sns.lineplot(x=rejection_rates, y=oracle_rejected_accuracy, label='Oracle') - g = sns.lineplot(x=rejection_rates, y=random_rejection_accuracy, label='Random') - - # Customize the plot's labels, title, and grid for better readability - g.set_xlabel('Rejection Rate') - g.set_ylabel(f'{gen_name}') - g.set_title(f'PRR curve: {e_level}, {e_name}') - g.grid() - - # Generate a unique filename for the plot using UUID and save it as a PNG file - base_filename = 'prr_curve' - extension = 'png' - unique_id = uuid.uuid4() - new_filename = f"{base_filename}_{e_name}_{gen_name}_{unique_id}.{extension}" - save_path = os.path.join(plots_dir, new_filename) - - # Save the generated plot and close the figure to free up memory - plt.savefig(save_path) - plt.close() - - # Return the computed PRR score - return prr_score + def __call__(self, estimator: List[float], target: List[float], return_scores:bool = False) -> float: + """ + Measures the area under the Prediction-Rejection curve between `estimator` and `target`. + + Parameters: + estimator (List[int]): a batch of uncertainty estimations. + Higher values indicate more uncertainty. + target (List[int]): a batch of ground-truth uncertainty estimations. + Higher values indicate less uncertainty. + return_scores(bool): a marker for returning PRR scores. + Returns: + float: area under the Prediction-Rejection curve. + Higher values indicate better uncertainty estimations. 
+ """ + target = normalize(target) + # ue: greater is more uncertain + ue = np.array(estimator) + num_obs = len(ue) + # Sort in ascending order: the least uncertain come first + ue_argsort = np.argsort(ue) + # want sorted_metrics to be increasing => smaller scores is better + sorted_metrics = np.array(target)[ue_argsort] + # Since we want all plots to coincide when all the data is discarded + cumsum = np.cumsum(sorted_metrics) + scores = (cumsum / np.arange(1, num_obs + 1))[::-1] + prr_score = np.sum(scores) / num_obs + + if return_scores: + return prr_score, scores + + return prr_score diff --git a/src/lm_polygraph/ue_metrics/ue_metric.py b/src/lm_polygraph/ue_metrics/ue_metric.py index 4e9f4b04c..ec9a60884 100644 --- a/src/lm_polygraph/ue_metrics/ue_metric.py +++ b/src/lm_polygraph/ue_metrics/ue_metric.py @@ -3,6 +3,9 @@ from typing import List from abc import ABC, abstractmethod +import matplotlib.pyplot as plt +import os +import uuid def normalize(target: List[float]): min_t, max_t = np.min(target), np.max(target) @@ -57,19 +60,99 @@ def __call__(self, estimator: List[float], target: List[float]) -> float: raise Exception("Not implemented") -def get_random_scores(function, metrics, num_iter=1000, seed=42): + +def get_random_scores(function, metrics, return_scores:bool = False, num_iter=1000, seed=42): np.random.seed(seed) + rand_scores = np.arange(len(metrics)) - value = [] - for i in range(num_iter): + prr_values = [] # To store PRR scores across iterations + score_values = [] # To store detailed scores or rejection accuracies across iterations + + for _ in range(num_iter): np.random.shuffle(rand_scores) - rand_val = function(rand_scores, metrics) - value.append(rand_val) - return np.mean(value) + + # Use the function like __call__ to compute PRR score and optionally return detailed scores + if return_scores: + # Call the function to get both PRR score and detailed scores + prr_score, detailed_scores = function(rand_scores, metrics, return_scores=True) + prr_values.append(prr_score) + score_values.append(detailed_scores) + else: + # Call the function to get only the PRR score + prr_score = function(rand_scores, metrics) + prr_values.append(prr_score) + + # Compute the mean PRR score across all iterations + mean_prr_score = np.mean(prr_values) + + # If return_scores is True, also compute the mean of the detailed scores (or rejection accuracies) + if return_scores: + mean_scores = np.mean(score_values, axis=0) + return mean_prr_score, mean_scores + + # Otherwise, just return the PRR score (no detailed scores) + return mean_prr_score + def normalize_metric(target_score, oracle_score, random_score): if not (oracle_score == random_score): target_score = (target_score - random_score) / (oracle_score - random_score) return target_score + + + + +def generate_prr_curve(ue_rejected_accuracy, oracle_rejected_accuracy, random_rejected_accuracy, e_level: str, e_name: str, gen_name: str): + """ + Generates and saves a PRR curve plot using only matplotlib. + + Parameters: + ue_rejected_accuracy (np.array): Rejection curve for uncertainty estimation (UE). + oracle_rejected_accuracy (np.array): Rejection curve for Oracle (ideal). + random_rejected_accuracy (np.array): Rejection curve for Random baseline. + e_level (str): Experiment level. + e_name (str): Experiment name. + gen_name (str): General name for the plot label. + + Returns: + str: The path where the plot is saved. 
+ """ + # Directory to save plots + plots_dir = './plots' + os.makedirs(plots_dir, exist_ok=True) + + # Number of examples + N_EXAMPLES = len(ue_rejected_accuracy) + + # Rejection rates (x-axis) + rejection_rates = np.linspace(0, 1, N_EXAMPLES) + + # Create plot + plt.figure(figsize=(8, 6)) + + # Plot each line (UE, Oracle, Random) + plt.plot(rejection_rates, ue_rejected_accuracy, label='UE', linestyle='-') + plt.plot(rejection_rates, oracle_rejected_accuracy, label='Oracle', linestyle='-') + plt.plot(rejection_rates, random_rejected_accuracy, label='Random', linestyle='-') + + # Add labels and title + plt.xlabel('Rejection Rate') + plt.ylabel(f'{gen_name}') + plt.title(f'PRR curve: {e_level}, {e_name}') + + # Add grid and legend + plt.grid(True) + plt.legend() + + # Generate a random UUID for the filename + base_filename = 'prr_curve' + extension = 'png' + unique_id = uuid.uuid4() + new_filename = f"{base_filename}_{e_name}_{gen_name}_{unique_id}.{extension}" + save_path = os.path.join(plots_dir, new_filename) + + # Save the plot + plt.savefig(save_path) + plt.close() \ No newline at end of file diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index c9956d102..f8f9053b9 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -19,6 +19,7 @@ UEMetric, get_random_scores, normalize_metric, + generate_prr_curve ) from lm_polygraph.estimators.estimator import Estimator from lm_polygraph.stat_calculators.stat_calculator import StatCalculator @@ -511,6 +512,7 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: torch.cuda.empty_cache() gc.collect() + for (e_level, e_name), estimator_values in self.estimations.items(): for (gen_level, gen_name), generation_metric in self.gen_metrics.items(): for ue_metric in self.ue_metrics: @@ -527,13 +529,17 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: if len(ue) == 0: self.metrics[e_level, e_name, gen_name, str(ue_metric)] = np.nan else: - oracle_score = ue_metric(-metric, metric) - random_score = get_random_scores(ue_metric, metric) - # For prr metric only - option to generate PRR curve + # For prr, generate plot if str(ue_metric) == 'prr': - ue_metric_val = ue_metric(ue, metric, True, e_level, e_name, gen_name, str(ue_metric)) + oracle_score, oracle_scores = ue_metric(-metric, metric, True) + random_score, random_scores = get_random_scores(ue_metric, metric, True) + ue_metric_val , ue_scores = ue_metric(ue, metric, True) + generate_prr_curve(ue_scores, oracle_scores, random_scores, e_level, e_name, gen_name) else: + oracle_score= ue_metric(-metric, metric) + random_score = get_random_scores(ue_metric, metric) ue_metric_val = ue_metric(ue, metric) + self.metrics[e_level, e_name, gen_name, str(ue_metric)] = ( ue_metric_val ) @@ -541,13 +547,11 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: e_level, e_name, gen_name, str(ue_metric) + "_normalized" ] = normalize_metric(ue_metric_val, oracle_score, random_score) - for processor in self.processors: processor.on_eval(self.metrics, self.total_bad_estimators) return self.metrics - def calculate(self, batch_stats: dict, calculators: list, inp_texts: list) -> dict: """ Runs stat calculators and handles errors if any occur. 
Returns updated batch stats From cbbfe0d90a42371264c1d673d0a3c0f01a78e341 Mon Sep 17 00:00:00 2001 From: silvimica Date: Tue, 8 Oct 2024 08:08:13 +0400 Subject: [PATCH 3/4] Added output_prr_curve to config and UEManager --- scripts/polygraph_eval | 4 ++ src/lm_polygraph/ue_metrics/pred_rej_area.py | 20 ++++++- src/lm_polygraph/ue_metrics/ue_metric.py | 9 ++- src/lm_polygraph/utils/manager.py | 59 ++++++++++++++++---- 4 files changed, 75 insertions(+), 17 deletions(-) diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index 23f6db69e..5fc9d68d4 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -40,6 +40,7 @@ def main(args): os.chdir(hydra.utils.get_original_cwd()) save_path = args.save_path if "save_path" in args else save_path + output_prr_curves = getattr(args, "output_prr_curves", False) if args.seed is None or len(args.seed) == 0: args.seed = [1] @@ -201,8 +202,11 @@ def main(args): ensemble_model=ensemble_model, cache_path=args.cache_path, language=getattr(args, 'language', 'en'), + output_prr_curves = output_prr_curves, + save_path= save_path ) + man() man.save(save_path + f"/ue_manager_seed{seed}") diff --git a/src/lm_polygraph/ue_metrics/pred_rej_area.py b/src/lm_polygraph/ue_metrics/pred_rej_area.py index ec3d5d8ad..a365d3e53 100644 --- a/src/lm_polygraph/ue_metrics/pred_rej_area.py +++ b/src/lm_polygraph/ue_metrics/pred_rej_area.py @@ -10,7 +10,19 @@ class PredictionRejectionArea(UEMetric): Calculates area under Prediction-Rejection curve. """ + def __init__(self, max_rejection: float = 1.0): + """ + Parameters: + max_rejection (float): a maximum proportion of instances that will be rejected. + 1.0 indicates entire set, 0.5 - half of the set + """ + super().__init__() + self.max_rejection = max_rejection + def __str__(self): + if self.max_rejection == 1: + return "prr" + return f"prr_{self.max_rejection}" return "prr" def __call__(self, estimator: List[float], target: List[float], return_scores:bool = False) -> float: @@ -31,14 +43,16 @@ def __call__(self, estimator: List[float], target: List[float], return_scores:bo # ue: greater is more uncertain ue = np.array(estimator) num_obs = len(ue) + num_rej = int(self.max_rejection * num_obs) # Sort in ascending order: the least uncertain come first ue_argsort = np.argsort(ue) # want sorted_metrics to be increasing => smaller scores is better sorted_metrics = np.array(target)[ue_argsort] # Since we want all plots to coincide when all the data is discarded - cumsum = np.cumsum(sorted_metrics) - scores = (cumsum / np.arange(1, num_obs + 1))[::-1] - prr_score = np.sum(scores) / num_obs + cumsum = np.cumsum(sorted_metrics)[-num_rej:] + scores = (cumsum / np.arange((num_obs - num_rej) + 1, num_obs + 1))[::-1] + prr_score = np.sum(scores) / num_rej + if return_scores: return prr_score, scores diff --git a/src/lm_polygraph/ue_metrics/ue_metric.py b/src/lm_polygraph/ue_metrics/ue_metric.py index ec9a60884..7a1c96d3d 100644 --- a/src/lm_polygraph/ue_metrics/ue_metric.py +++ b/src/lm_polygraph/ue_metrics/ue_metric.py @@ -104,7 +104,9 @@ def normalize_metric(target_score, oracle_score, random_score): -def generate_prr_curve(ue_rejected_accuracy, oracle_rejected_accuracy, random_rejected_accuracy, e_level: str, e_name: str, gen_name: str): + + +def generate_prr_curve(ue_rejected_accuracy, oracle_rejected_accuracy, random_rejected_accuracy, e_level: str, e_name: str, gen_name: str, path: str): """ Generates and saves a PRR curve plot using only matplotlib. 
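# Illustrative sketch (not from this patch set) of the restored max_rejection behaviour;
# `prr_truncated` is a hypothetical standalone name, and `target` is assumed to be
# normalized already, as in the method above.
import numpy as np

def prr_truncated(estimator, target, max_rejection=1.0):
    """PRR averaged only over rejection rates in [0, max_rejection]."""
    ue = np.asarray(estimator, dtype=float)
    target = np.asarray(target, dtype=float)
    num_obs = len(ue)
    num_rej = int(max_rejection * num_obs)  # how many points of the curve to keep
    sorted_metrics = target[np.argsort(ue)]  # most certain outputs first
    # Keep only the cumulative means that correspond to rejecting between 0 and
    # num_rej - 1 outputs, then reverse so index 0 rejects nothing.
    cumsum = np.cumsum(sorted_metrics)[-num_rej:]
    scores = (cumsum / np.arange(num_obs - num_rej + 1, num_obs + 1))[::-1]
    return scores.sum() / num_rej

# prr_truncated(ue, quality, max_rejection=0.5) averages the curve over rejection
# rates up to 50 %, matching the "prr_0.5" metric name produced by __str__ above.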
@@ -120,8 +122,9 @@ def generate_prr_curve(ue_rejected_accuracy, oracle_rejected_accuracy, random_re str: The path where the plot is saved. """ # Directory to save plots - plots_dir = './plots' - os.makedirs(plots_dir, exist_ok=True) + + plots_dir = path + # os.makedirs(plots_dir, exist_ok=True) # Number of examples N_EXAMPLES = len(ue_rejected_accuracy) diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index f8f9053b9..7e14fc3b7 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -257,6 +257,8 @@ def __init__( max_new_tokens: int = 100, background_train_dataset_max_new_tokens: int = 100, cache_path=os.path.expanduser("~") + "/.cache", + output_prr_curves=False, + save_path = '' ): """ Parameters: @@ -276,8 +278,42 @@ def __init__( deberta_device (Optional[str]): The device to run deberta on. If None, will use 'cuda:0' if available, 'cpu' otherwise. Default: None. language (str): Language to test in claim-level benchmark, one of 'en', 'zh', 'ar', 'ru'. Default: 'en'. - verbose (bool): If set, will print useful info during batch processing. Default: True. + verbose for (e_level, e_name), estimator_values in self.estimations.items(): + for (gen_level, gen_name), generation_metric in self.gen_metrics.items(): + for ue_metric in self.ue_metrics: + if gen_level != e_level: + continue + if len(estimator_values) != len(generation_metric): + raise Exception( + f"Got different number of metrics for {e_name} and {gen_name}: " + f"{len(estimator_values)} and {len(generation_metric)}" + ) + # TODO: Report how many nans! + # This is important to know for a user + ue, metric = _delete_nans(estimator_values, generation_metric) + if len(ue) == 0: + self.metrics[e_level, e_name, gen_name, str(ue_metric)] = np.nan + else: + # For prr, generate plot + if str(ue_metric) == 'prr': + oracle_score, oracle_scores = ue_metric(-metric, metric, True) + random_score, random_scores = get_random_scores(ue_metric, metric, True) + ue_metric_val , ue_scores = ue_metric(ue, metric, True) + generate_prr_curve(ue_scores, oracle_scores, random_scores, e_level, e_name, gen_name) + else: + oracle_score= ue_metric(-metric, metric) + random_score = get_random_scores(ue_metric, metric) + ue_metric_val = ue_metric(ue, metric) + + self.metrics[e_level, e_name, gen_name, str(ue_metric)] = ( + ue_metric_val + ) + self.metrics[ + e_level, e_name, gen_name, str(ue_metric) + "_normalized" + ] = normalize_metric(ue_metric_val, oracle_score, random_score) (bool): If set, will print useful info during batch processing. Default: True. max_new_tokens (int): Maximum new tokens to use in generation. Default: 100. 
+ output_prr_curves: Flag for generating PRR curves in the save_path directory + save_path: save_path from config """ stat_calculators_dict, stat_dependencies_dict = register_stat_calculators( @@ -285,12 +321,11 @@ def __init__( deberta_device=deberta_device, language=language, cache_path=cache_path, - model=model, ) self.stat_calculators_dict = stat_calculators_dict - - self.model: Model = model + self.save_path = save_path + self.model: WhiteboxModel = model self.train_data: Dataset = train_data self.background_train_data: Dataset = background_train_data self.ensemble_model = ensemble_model @@ -298,13 +333,15 @@ def __init__( self.estimators: List[Estimator] = estimators self.generation_metrics: List[GenerationMetric] = generation_metrics self.ue_metrics: List[UEMetric] = ue_metrics + self.output_prr_curves = output_prr_curves _check_unique_names(generation_metrics) _check_unique_names(estimators) _check_unique_names(ue_metrics) - greedy = ["greedy_texts"] - if not isinstance(self.model, BlackboxModel): - greedy += ["greedy_tokens"] + if isinstance(model, BlackboxModel): + greedy = ["blackbox_greedy_texts"] + else: + greedy = ["greedy_tokens", "greedy_texts"] stats = ( [s for e in self.estimators for s in e.stats_dependencies] @@ -384,7 +421,7 @@ def __init__( ensemble_stats = [ s for e in self.ensemble_estimators - for s in e.stats_dependencies + for s in e.stats_dependenciesgenerate_prr_curve if s.startswith("ensemble") ] ensemble_stats, _ = _order_calculators( @@ -530,13 +567,13 @@ def __call__(self) -> Dict[Tuple[str, str, str, str], float]: self.metrics[e_level, e_name, gen_name, str(ue_metric)] = np.nan else: # For prr, generate plot - if str(ue_metric) == 'prr': + if str(ue_metric) == 'prr' and self.output_prr_curves: oracle_score, oracle_scores = ue_metric(-metric, metric, True) random_score, random_scores = get_random_scores(ue_metric, metric, True) ue_metric_val , ue_scores = ue_metric(ue, metric, True) - generate_prr_curve(ue_scores, oracle_scores, random_scores, e_level, e_name, gen_name) + generate_prr_curve(ue_scores, oracle_scores, random_scores, e_level, e_name, gen_name, self.save_path) else: - oracle_score= ue_metric(-metric, metric) + oracle_score = ue_metric(-metric, metric) random_score = get_random_scores(ue_metric, metric) ue_metric_val = ue_metric(ue, metric) From 63a2baa26762f65b03f3492a81473409e9c1adac Mon Sep 17 00:00:00 2001 From: silvimica Date: Wed, 16 Oct 2024 15:46:15 +0400 Subject: [PATCH 4/4] Fixes to code --- scripts/polygraph_eval | 3 ++- src/lm_polygraph/ue_metrics/pred_rej_area.py | 1 - src/lm_polygraph/ue_metrics/ue_metric.py | 12 ++++++------ src/lm_polygraph/utils/manager.py | 2 -- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval index 5fc9d68d4..d1df37cb6 100755 --- a/scripts/polygraph_eval +++ b/scripts/polygraph_eval @@ -40,7 +40,7 @@ def main(args): os.chdir(hydra.utils.get_original_cwd()) save_path = args.save_path if "save_path" in args else save_path - output_prr_curves = getattr(args, "output_prr_curves", False) + if args.seed is None or len(args.seed) == 0: args.seed = [1] @@ -184,6 +184,7 @@ def main(args): generation_metrics = get_generation_metrics(args) ue_metrics = get_ue_metrics(args) + output_prr_curves = getattr(args, "output_prr_curves", False) man = UEManager( dataset, diff --git a/src/lm_polygraph/ue_metrics/pred_rej_area.py b/src/lm_polygraph/ue_metrics/pred_rej_area.py index a365d3e53..9457faf20 100644 --- a/src/lm_polygraph/ue_metrics/pred_rej_area.py +++ 
b/src/lm_polygraph/ue_metrics/pred_rej_area.py @@ -23,7 +23,6 @@ def __str__(self): if self.max_rejection == 1: return "prr" return f"prr_{self.max_rejection}" - return "prr" def __call__(self, estimator: List[float], target: List[float], return_scores:bool = False) -> float: """ diff --git a/src/lm_polygraph/ue_metrics/ue_metric.py b/src/lm_polygraph/ue_metrics/ue_metric.py index 7a1c96d3d..c3a414420 100644 --- a/src/lm_polygraph/ue_metrics/ue_metric.py +++ b/src/lm_polygraph/ue_metrics/ue_metric.py @@ -66,7 +66,7 @@ def get_random_scores(function, metrics, return_scores:bool = False, num_iter=10 rand_scores = np.arange(len(metrics)) - prr_values = [] # To store PRR scores across iterations + ue_metric_scores = [] # To store PRR scores across iterations score_values = [] # To store detailed scores or rejection accuracies across iterations for _ in range(num_iter): @@ -76,23 +76,23 @@ def get_random_scores(function, metrics, return_scores:bool = False, num_iter=10 if return_scores: # Call the function to get both PRR score and detailed scores prr_score, detailed_scores = function(rand_scores, metrics, return_scores=True) - prr_values.append(prr_score) + ue_metric_scores.append(prr_score) score_values.append(detailed_scores) else: # Call the function to get only the PRR score prr_score = function(rand_scores, metrics) - prr_values.append(prr_score) + ue_metric_scores.append(prr_score) # Compute the mean PRR score across all iterations - mean_prr_score = np.mean(prr_values) + ue_metric_score = np.mean(ue_metric_scores) # If return_scores is True, also compute the mean of the detailed scores (or rejection accuracies) if return_scores: mean_scores = np.mean(score_values, axis=0) - return mean_prr_score, mean_scores + return ue_metric_score, mean_scores # Otherwise, just return the PRR score (no detailed scores) - return mean_prr_score + return ue_metric_score diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py index 7e14fc3b7..01a6c112f 100644 --- a/src/lm_polygraph/utils/manager.py +++ b/src/lm_polygraph/utils/manager.py @@ -278,7 +278,6 @@ def __init__( deberta_device (Optional[str]): The device to run deberta on. If None, will use 'cuda:0' if available, 'cpu' otherwise. Default: None. language (str): Language to test in claim-level benchmark, one of 'en', 'zh', 'ar', 'ru'. Default: 'en'. - verbose for (e_level, e_name), estimator_values in self.estimations.items(): for (gen_level, gen_name), generation_metric in self.gen_metrics.items(): for ue_metric in self.ue_metrics: if gen_level != e_level: @@ -421,7 +420,6 @@ def __init__( ensemble_stats = [ s for e in self.ensemble_estimators - for s in e.stats_dependenciesgenerate_prr_curve if s.startswith("ensemble") ] ensemble_stats, _ = _order_calculators(
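# Illustrative sketch (not from this patch set) of the control flow left after the whole
# series: PRR curves are drawn only when the config sets output_prr_curves, and the plot
# is written under save_path. `compute_prr`, `random_baseline` and `plot_prr_curve` are
# hypothetical stand-ins for PredictionRejectionArea.__call__, get_random_scores and
# generate_prr_curve.
import numpy as np

def score_estimator(ue, metric, ue_metric_name, output_prr_curves, save_path,
                    compute_prr, random_baseline, plot_prr_curve):
    ue = np.asarray(ue, dtype=float)
    metric = np.asarray(metric, dtype=float)
    if ue_metric_name == "prr" and output_prr_curves:
        # Oracle: the negated quality metric acts as a perfect uncertainty estimate.
        oracle_score, oracle_curve = compute_prr(-metric, metric, True)
        random_score, random_curve = random_baseline(metric, True)
        ue_score, ue_curve = compute_prr(ue, metric, True)
        plot_prr_curve(ue_curve, oracle_curve, random_curve, save_path)
    else:
        oracle_score = compute_prr(-metric, metric)
        random_score = random_baseline(metric)
        ue_score = compute_prr(ue, metric)
    # Same guard as normalize_metric(): only rescale when oracle and random differ.
    if oracle_score != random_score:
        ue_norm = (ue_score - random_score) / (oracle_score - random_score)
    else:
        ue_norm = ue_score
    return ue_score, ue_norm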