diff --git a/measurements/label_distribution/README.md b/measurements/label_distribution/README.md index f8101d394..cd82cdff3 100644 --- a/measurements/label_distribution/README.md +++ b/measurements/label_distribution/README.md @@ -11,7 +11,7 @@ tags: - evaluate - measurement description: >- - Returns the label distribution and skew of the input data. + Returns the label distribution and entropy of the input data. --- # Measurement Card for Label Distribution @@ -41,13 +41,14 @@ The measurement takes a list of labels as input: ### Output Values By default, this metric outputs a dictionary that contains : -**label_distribution** (`dict`) : a dictionary containing two sets of keys and values: `labels`, which includes the list of labels contained in the dataset, and `fractions`, which includes the fraction of each label. --**label_skew** (`scalar`) : the asymmetry of the label distribution. +-**label_entropy** (`float`) : the Shannon entropy of the label distribution (in nats). Maximized at log(k) for k classes when labels are uniformly distributed, and 0 when all labels are the same. +-**label_entropy_normalized** (`float`) : the Shannon entropy normalized by log(k), giving a value between 0 and 1. A value of 1.0 means perfectly balanced; a value close to 0 means highly imbalanced. ```python -{'label_distribution': {'labels': [1, 0, 2], 'fractions': [0.1, 0.6, 0.3]}, 'label_skew': 0.7417688338666573} +{'label_distribution': {'labels': [1, 0, 2], 'fractions': [0.1, 0.6, 0.3]}, 'label_entropy': 0.8979457248567798, 'label_entropy_normalized': 0.8173454221465101} ``` -If skewness is 0, the dataset is perfectly balanced; if it is less than -1 or greater than 1, the distribution is highly skewed; anything in between can be considered moderately skewed. +If normalized entropy is 1.0, the dataset is perfectly balanced; values closer to 0 indicate increasing imbalance. 
Unlike skewness, entropy is permutation-invariant and correctly measures uniformity for categorical variables. #### Values from Popular Papers @@ -60,7 +61,7 @@ Calculating the label distribution of a dataset with binary labels: >>> distribution = evaluate.load("label_distribution") >>> results = distribution.compute(data=data) >>> print(results) -{'label_distribution': {'labels': [1, 0], 'fractions': [0.5714285714285714, 0.42857142857142855]}} +{'label_distribution': {'labels': [1, 0], 'fractions': [0.5714285714285714, 0.42857142857142855]}, 'label_entropy': 0.6829081047004717, 'label_entropy_normalized': 0.9852281360342515} ``` Calculating the label distribution of the test subset of the [IMDb dataset](https://huggingface.co/datasets/imdb): @@ -70,9 +71,9 @@ Calculating the label distribution of the test subset of the [IMDb dataset](http >>> distribution = evaluate.load("label_distribution") >>> results = distribution.compute(data=imdb['label']) >>> print(results) -{'label_distribution': {'labels': [0, 1], 'fractions': [0.5, 0.5]}, 'label_skew': 0.0} +{'label_distribution': {'labels': [0, 1], 'fractions': [0.5, 0.5]}, 'label_entropy': 0.6931471805599453, 'label_entropy_normalized': 1.0} ``` -N.B. The IMDb dataset is perfectly balanced. +N.B. The IMDb dataset is perfectly balanced (normalized entropy = 1.0). 
The output of the measurement can easily be passed to matplotlib to plot a histogram of each label: @@ -91,4 +92,4 @@ While label distribution can be a useful signal for analyzing datasets and choos ## Further References - [Facing Imbalanced Data Recommendations for the Use of Performance Metrics](https://sites.pitt.edu/~jeffcohn/skew/PID2829477.pdf) -- [Scipy Stats Skew Documentation](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.skew.html#scipy-stats-skew) +- [Scipy Stats Entropy Documentation](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html) diff --git a/measurements/label_distribution/label_distribution.py b/measurements/label_distribution/label_distribution.py index 81ea693fe..eaaef66c6 100644 --- a/measurements/label_distribution/label_distribution.py +++ b/measurements/label_distribution/label_distribution.py @@ -16,14 +16,14 @@ from collections import Counter import datasets -import pandas as pd -from scipy import stats +import numpy as np +from scipy.stats import entropy import evaluate _DESCRIPTION = """ -Returns the label ratios of the dataset labels, as well as a scalar for skewness. +Returns the label ratios of the dataset labels, as well as the Shannon entropy of the label distribution. """ _KWARGS_DESCRIPTION = """ @@ -32,13 +32,14 @@ Returns: `label_distribution` (`dict`) : a dictionary containing two sets of keys and values: `labels`, which includes the list of labels contained in the dataset, and `fractions`, which includes the fraction of each label. - `label_skew` (`scalar`) : the asymmetry of the label distribution. + `label_entropy` (`float`) : the Shannon entropy of the label distribution (in nats). Maximized at log(k) for k classes when labels are uniformly distributed, and 0 when all labels are the same. + `label_entropy_normalized` (`float`) : the Shannon entropy normalized by log(k), giving a value between 0 and 1. 
A value of 1.0 means perfectly balanced; a value close to 0 means highly imbalanced. Examples: >>> data = [1, 0, 1, 1, 0, 1, 0] >>> distribution = evaluate.load("label_distribution") >>> results = distribution.compute(data=data) >>> print(results) - {'label_distribution': {'labels': [1, 0], 'fractions': [0.5714285714285714, 0.42857142857142855]}, 'label_skew': -0.2886751345948127} + {'label_distribution': {'labels': [1, 0], 'fractions': [0.5714285714285714, 0.42857142857142855]}, 'label_entropy': 0.6829081047004717, 'label_entropy_normalized': 0.9852281360342515} """ _CITATION = """\ @@ -83,11 +84,16 @@ def _info(self): ) def _compute(self, data): - """Returns the fraction of each label present in the data""" + """Returns the fraction of each label present in the data and the entropy of the distribution.""" c = Counter(data) - label_distribution = {"labels": [k for k in c.keys()], "fractions": [f / len(data) for f in c.values()]} - if isinstance(data[0], str): - label2id = {label: id for id, label in enumerate(label_distribution["labels"])} - data = [label2id[d] for d in data] - skew = stats.skew(data) - return {"label_distribution": label_distribution, "label_skew": skew} + label_distribution = {"labels": list(c.keys()), "fractions": [f / len(data) for f in c.values()]} + label_entropy = float(entropy(label_distribution["fractions"])) + if len(c) > 1: + label_entropy_normalized = float(label_entropy / np.log(len(c))) + else: + label_entropy_normalized = 0.0 + return { + "label_distribution": label_distribution, + "label_entropy": label_entropy, + "label_entropy_normalized": label_entropy_normalized, + } diff --git a/measurements/label_distribution/test_label_distribution.py b/measurements/label_distribution/test_label_distribution.py new file mode 100644 index 000000000..955f34f63 --- /dev/null +++ b/measurements/label_distribution/test_label_distribution.py @@ -0,0 +1,97 @@ +# Copyright 2025 The HuggingFace Evaluate Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for the label_distribution measurement.""" + +import math +import unittest + +from label_distribution import LabelDistribution + + +measurement = LabelDistribution() + + +class TestLabelDistribution(unittest.TestCase): + def test_uniform_binary(self): + """Perfectly balanced binary labels should have normalized entropy of 1.0.""" + data = [0, 1, 0, 1, 0, 1] + result = measurement.compute(data=data) + self.assertAlmostEqual(result["label_entropy_normalized"], 1.0) + self.assertAlmostEqual(result["label_entropy"], math.log(2)) + self.assertEqual(result["label_distribution"]["fractions"], [0.5, 0.5]) + + def test_uniform_multiclass(self): + """Perfectly balanced 3-class labels should have normalized entropy of 1.0.""" + data = [0, 1, 2, 0, 1, 2, 0, 1, 2] + result = measurement.compute(data=data) + self.assertAlmostEqual(result["label_entropy_normalized"], 1.0) + self.assertAlmostEqual(result["label_entropy"], math.log(3)) + + def test_single_class(self): + """All labels the same should have entropy 0.""" + data = [1, 1, 1, 1, 1] + result = measurement.compute(data=data) + self.assertAlmostEqual(result["label_entropy"], 0.0) + self.assertAlmostEqual(result["label_entropy_normalized"], 0.0) + + def test_imbalanced(self): + """Imbalanced labels should have normalized entropy less than 1.""" + data = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1] + result = measurement.compute(data=data) + self.assertGreater(result["label_entropy"], 
0.0) +        self.assertLess(result["label_entropy_normalized"], 1.0) + +    def test_permutation_invariance(self): +        """Entropy should be the same regardless of which integer is assigned to which class. + +        This is the key property that skewness lacked: [0,0,1,1,1,1,1,2,2] and +        [0,0,1,1,2,2,2,2,2] have the same multiset of class counts {2, 2, 5} but +        different skewness. Entropy must be identical for both. +        """ +        data_a = [0, 0, 1, 1, 1, 1, 1, 2, 2] +        data_b = [0, 0, 1, 1, 2, 2, 2, 2, 2] +        result_a = measurement.compute(data=data_a) +        result_b = measurement.compute(data=data_b) +        self.assertAlmostEqual(result_a["label_entropy"], result_b["label_entropy"]) +        self.assertAlmostEqual(result_a["label_entropy_normalized"], result_b["label_entropy_normalized"]) + +    def test_string_labels(self): +        """String labels should work the same as integer labels.""" +        data = ["cat", "dog", "cat", "cat", "dog"] +        result = measurement.compute(data=data) +        self.assertGreater(result["label_entropy"], 0.0) +        self.assertLess(result["label_entropy_normalized"], 1.0) +        self.assertIn("cat", result["label_distribution"]["labels"]) +        self.assertIn("dog", result["label_distribution"]["labels"]) + +    def test_output_keys(self): +        """Output should contain label_distribution, label_entropy, and label_entropy_normalized.""" +        data = [0, 1, 2] +        result = measurement.compute(data=data) +        self.assertIn("label_distribution", result) +        self.assertIn("label_entropy", result) +        self.assertIn("label_entropy_normalized", result) +        self.assertIn("labels", result["label_distribution"]) +        self.assertIn("fractions", result["label_distribution"]) + +    def test_fractions_sum_to_one(self): +        """Label fractions should always sum to 1.""" +        data = [0, 0, 1, 2, 2, 2, 3] +        result = measurement.compute(data=data) +        self.assertAlmostEqual(sum(result["label_distribution"]["fractions"]), 1.0) + + +if __name__ == "__main__": +    unittest.main()