From bad3e64b7a1e695e6b72592c6197c5d4fd803700 Mon Sep 17 00:00:00 2001
From: Michael Ellis <michaelellis003@gmail.com>
Date: Thu, 12 Feb 2026 19:01:56 -0600
Subject: [PATCH 1/2] Replace skewness with entropy in label_distribution
 measurement

Skewness is statistically inappropriate for categorical label variables
because it depends on arbitrary integer encoding and measures symmetry
rather than uniformity. Replace it with Shannon entropy, which is
permutation-invariant and correctly quantifies how balanced a label
distribution is.

Changes:
- Replace scipy.stats.skew with scipy.stats.entropy
- Return label_entropy (nats) and label_entropy_normalized (0 to 1) in
  place of label_skew, which is no longer part of the output
- Remove unused pandas import and string-to-integer conversion
- Update docstrings, README examples, and references
- Add test suite covering uniformity, imbalance, permutation invariance,
  string labels, and edge cases

Fixes #659
---
 measurements/label_distribution/README.md     | 17 ++--
 .../label_distribution/label_distribution.py  | 30 +++---
 .../test_label_distribution.py                | 97 +++++++++++++++++++
 3 files changed, 124 insertions(+), 20 deletions(-)
 create mode 100644 measurements/label_distribution/test_label_distribution.py

diff --git a/measurements/label_distribution/README.md b/measurements/label_distribution/README.md
index f8101d394..cd82cdff3 100644
--- a/measurements/label_distribution/README.md
+++ b/measurements/label_distribution/README.md
@@ -11,7 +11,7 @@ tags:
 - evaluate
 - measurement
 description: >-
-  Returns the label distribution and skew of the input data.
+  Returns the label distribution and entropy of the input data.
 ---
 
 # Measurement Card for Label Distribution
@@ -41,13 +41,14 @@ The measurement takes a list of labels as input:
 ### Output Values
 By default, this metric outputs a dictionary that contains :
 -**label_distribution** (`dict`) : a dictionary containing two sets of keys and values: `labels`, which includes the list of labels contained in the dataset, and `fractions`, which includes the fraction of each label.
--**label_skew** (`scalar`) : the asymmetry of the label distribution.
+-**label_entropy** (`float`) : the Shannon entropy of the label distribution (in nats). Maximized at log(k) for k classes when labels are uniformly distributed, and 0 when all labels are the same.
+-**label_entropy_normalized** (`float`) : the Shannon entropy normalized by log(k), giving a value between 0 and 1. A value of 1.0 means perfectly balanced; a value close to 0 means highly imbalanced. When only one class is present, log(k) is 0, so the value is defined as 0.0.
 
 ```python
-{'label_distribution': {'labels': [1, 0, 2], 'fractions': [0.1, 0.6, 0.3]}, 'label_skew': 0.7417688338666573}
+{'label_distribution': {'labels': [1, 0, 2], 'fractions': [0.1, 0.6, 0.3]}, 'label_entropy': 0.8979457248567798, 'label_entropy_normalized': 0.8173454221465101}
 ```
 
-If skewness is 0, the dataset is perfectly balanced; if it is less than -1 or greater than 1, the distribution is highly skewed; anything in between can be considered moderately skewed.
+If normalized entropy is 1.0, the dataset is perfectly balanced; values closer to 0 indicate increasing imbalance. Unlike skewness, entropy is permutation-invariant and correctly measures uniformity for categorical variables.
 
 #### Values from Popular Papers
 
@@ -60,7 +61,7 @@ Calculating the label distribution of a dataset with binary labels:
 >>> distribution = evaluate.load("label_distribution")
 >>> results = distribution.compute(data=data)
 >>> print(results)
-{'label_distribution': {'labels': [1, 0], 'fractions': [0.5714285714285714, 0.42857142857142855]}}
+{'label_distribution': {'labels': [1, 0], 'fractions': [0.5714285714285714, 0.42857142857142855]}, 'label_entropy': 0.6829081047004717, 'label_entropy_normalized': 0.9852281360342515}
 ```
 
 Calculating the label distribution of the test subset of the [IMDb dataset](https://huggingface.co/datasets/imdb):
@@ -70,9 +71,9 @@ Calculating the label distribution of the test subset of the [IMDb dataset](http
 >>> distribution = evaluate.load("label_distribution")
 >>> results = distribution.compute(data=imdb['label'])
 >>> print(results)
-{'label_distribution': {'labels': [0, 1], 'fractions': [0.5, 0.5]}, 'label_skew': 0.0}
+{'label_distribution': {'labels': [0, 1], 'fractions': [0.5, 0.5]}, 'label_entropy': 0.6931471805599453, 'label_entropy_normalized': 1.0}
 ```
-N.B. The IMDb dataset is perfectly balanced.
+N.B. The IMDb dataset is perfectly balanced (normalized entropy = 1.0).
 
 The output of the measurement can easily be passed to matplotlib to plot a histogram of each label:
 
@@ -91,4 +92,4 @@ While label distribution can be a useful signal for analyzing datasets and choos
 
 ## Further References
 - [Facing Imbalanced Data Recommendations for the Use of Performance Metrics](https://sites.pitt.edu/~jeffcohn/skew/PID2829477.pdf)
-- [Scipy Stats Skew Documentation](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.skew.html#scipy-stats-skew)
+- [Scipy Stats Entropy Documentation](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html)
diff --git a/measurements/label_distribution/label_distribution.py b/measurements/label_distribution/label_distribution.py
index 81ea693fe..73293f782 100644
--- a/measurements/label_distribution/label_distribution.py
+++ b/measurements/label_distribution/label_distribution.py
@@ -16,14 +16,14 @@
 from collections import Counter
 
 import datasets
-import pandas as pd
-from scipy import stats
+import numpy as np
+from scipy.stats import entropy
 
 import evaluate
 
 
 _DESCRIPTION = """
-Returns the label ratios of the dataset labels, as well as a scalar for skewness.
+Returns the label ratios of the dataset labels, as well as the Shannon entropy of the label distribution.
 """
 
 _KWARGS_DESCRIPTION = """
@@ -32,13 +32,14 @@
 
 Returns:
     `label_distribution` (`dict`) :  a dictionary containing two sets of keys and values: `labels`, which includes the list of labels contained in the dataset, and `fractions`, which includes the fraction of each label.
-    `label_skew` (`scalar`) : the asymmetry of the label distribution.
+    `label_entropy` (`float`) : the Shannon entropy of the label distribution (in nats). Maximized at log(k) for k classes when labels are uniformly distributed, and 0 when all labels are the same.
+    `label_entropy_normalized` (`float`) : the Shannon entropy normalized by log(k), giving a value between 0 and 1. A value of 1.0 means perfectly balanced; a value close to 0 means highly imbalanced.
 Examples:
     >>> data = [1, 0, 1, 1, 0, 1, 0]
     >>> distribution = evaluate.load("label_distribution")
     >>> results = distribution.compute(data=data)
     >>> print(results)
-    {'label_distribution': {'labels': [1, 0], 'fractions': [0.5714285714285714, 0.42857142857142855]}, 'label_skew': -0.2886751345948127}
+    {'label_distribution': {'labels': [1, 0], 'fractions': [0.5714285714285714, 0.42857142857142855]}, 'label_entropy': 0.6829081047004717, 'label_entropy_normalized': 0.9852281360342515}
 """
 
 _CITATION = """\
@@ -83,11 +84,16 @@ def _info(self):
         )
 
     def _compute(self, data):
-        """Returns the fraction of each label present in the data"""
+        """Returns the fraction of each label present in the data and the entropy of the distribution."""
         c = Counter(data)
-        label_distribution = {"labels": [k for k in c.keys()], "fractions": [f / len(data) for f in c.values()]}
-        if isinstance(data[0], str):
-            label2id = {label: id for id, label in enumerate(label_distribution["labels"])}
-            data = [label2id[d] for d in data]
-        skew = stats.skew(data)
-        return {"label_distribution": label_distribution, "label_skew": skew}
+        label_distribution = {"labels": list(c.keys()), "fractions": [f / len(data) for f in c.values()]}
+        label_entropy = entropy(label_distribution["fractions"])
+        if len(c) > 1:
+            label_entropy_normalized = label_entropy / np.log(len(c))
+        else:
+            label_entropy_normalized = 0.0
+        return {
+            "label_distribution": label_distribution,
+            "label_entropy": label_entropy,
+            "label_entropy_normalized": label_entropy_normalized,
+        }
diff --git a/measurements/label_distribution/test_label_distribution.py b/measurements/label_distribution/test_label_distribution.py
new file mode 100644
index 000000000..955f34f63
--- /dev/null
+++ b/measurements/label_distribution/test_label_distribution.py
@@ -0,0 +1,97 @@
+# Copyright 2025 The HuggingFace Evaluate Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for the label_distribution measurement."""
+
+import math
+import unittest
+
+from label_distribution import LabelDistribution
+
+
+measurement = LabelDistribution()
+
+
+class TestLabelDistribution(unittest.TestCase):
+    def test_uniform_binary(self):
+        """Perfectly balanced binary labels should have normalized entropy of 1.0."""
+        data = [0, 1, 0, 1, 0, 1]
+        result = measurement.compute(data=data)
+        self.assertAlmostEqual(result["label_entropy_normalized"], 1.0)
+        self.assertAlmostEqual(result["label_entropy"], math.log(2))
+        self.assertEqual(result["label_distribution"]["fractions"], [0.5, 0.5])
+
+    def test_uniform_multiclass(self):
+        """Perfectly balanced 3-class labels should have normalized entropy of 1.0."""
+        data = [0, 1, 2, 0, 1, 2, 0, 1, 2]
+        result = measurement.compute(data=data)
+        self.assertAlmostEqual(result["label_entropy_normalized"], 1.0)
+        self.assertAlmostEqual(result["label_entropy"], math.log(3))
+
+    def test_single_class(self):
+        """All labels the same should have entropy 0."""
+        data = [1, 1, 1, 1, 1]
+        result = measurement.compute(data=data)
+        self.assertAlmostEqual(result["label_entropy"], 0.0)
+        self.assertAlmostEqual(result["label_entropy_normalized"], 0.0)
+
+    def test_imbalanced(self):
+        """Imbalanced labels should have normalized entropy less than 1."""
+        data = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
+        result = measurement.compute(data=data)
+        self.assertGreater(result["label_entropy"], 0.0)
+        self.assertLess(result["label_entropy_normalized"], 1.0)
+
+    def test_permutation_invariance(self):
+        """Entropy should be the same regardless of which integer is assigned to which class.
+
+        This is the key property that skewness lacked: [0,0,1,1,1,1,1,2,2] and
+        [0,0,1,1,2,2,2,2,2] have the same class counts up to relabeling
+        ({2, 2, 5}) but different skewness.  Entropy must be identical for both.
+        """
+        data_a = [0, 0, 1, 1, 1, 1, 1, 2, 2]
+        data_b = [0, 0, 1, 1, 2, 2, 2, 2, 2]
+        result_a = measurement.compute(data=data_a)
+        result_b = measurement.compute(data=data_b)
+        self.assertAlmostEqual(result_a["label_entropy"], result_b["label_entropy"])
+        self.assertAlmostEqual(result_a["label_entropy_normalized"], result_b["label_entropy_normalized"])
+
+    def test_string_labels(self):
+        """String labels should work the same as integer labels."""
+        data = ["cat", "dog", "cat", "cat", "dog"]
+        result = measurement.compute(data=data)
+        self.assertGreater(result["label_entropy"], 0.0)
+        self.assertLess(result["label_entropy_normalized"], 1.0)
+        self.assertIn("cat", result["label_distribution"]["labels"])
+        self.assertIn("dog", result["label_distribution"]["labels"])
+
+    def test_output_keys(self):
+        """Output should contain label_distribution, label_entropy, and label_entropy_normalized."""
+        data = [0, 1, 2]
+        result = measurement.compute(data=data)
+        self.assertIn("label_distribution", result)
+        self.assertIn("label_entropy", result)
+        self.assertIn("label_entropy_normalized", result)
+        self.assertIn("labels", result["label_distribution"])
+        self.assertIn("fractions", result["label_distribution"])
+
+    def test_fractions_sum_to_one(self):
+        """Label fractions should always sum to 1."""
+        data = [0, 0, 1, 2, 2, 2, 3]
+        result = measurement.compute(data=data)
+        self.assertAlmostEqual(sum(result["label_distribution"]["fractions"]), 1.0)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 6d8c0222c41dd5baa764b12086360af98c114859 Mon Sep 17 00:00:00 2001
From: Michael Ellis <michaelellis003@gmail.com>
Date: Fri, 13 Feb 2026 08:17:57 -0600
Subject: [PATCH 2/2] Cast entropy values to float for consistent output across
 numpy versions

---
 measurements/label_distribution/label_distribution.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/measurements/label_distribution/label_distribution.py b/measurements/label_distribution/label_distribution.py
index 73293f782..eaaef66c6 100644
--- a/measurements/label_distribution/label_distribution.py
+++ b/measurements/label_distribution/label_distribution.py
@@ -87,9 +87,9 @@ def _compute(self, data):
         """Returns the fraction of each label present in the data and the entropy of the distribution."""
         c = Counter(data)
         label_distribution = {"labels": list(c.keys()), "fractions": [f / len(data) for f in c.values()]}
-        label_entropy = entropy(label_distribution["fractions"])
+        label_entropy = float(entropy(label_distribution["fractions"]))
         if len(c) > 1:
-            label_entropy_normalized = label_entropy / np.log(len(c))
+            label_entropy_normalized = float(label_entropy / np.log(len(c)))
         else:
             label_entropy_normalized = 0.0
         return {