Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions examples/Advanced/huggingface_integration_tutorial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""
HuggingFace Hub Integration Tutorial
=====================================

This example demonstrates how to share OpenML flows with HuggingFace Hub,
enabling bidirectional model sharing between the two platforms.

Prerequisites:
- huggingface_hub installed: pip install huggingface_hub
- HuggingFace account with API token
"""

import openml
from openml.extensions.huggingface import (
download_flow_from_huggingface,
upload_flow_to_huggingface,
)

# %%
# Setup
# -----
# Configure OpenML (you need an API key from openml.org).
# NOTE: both values below are placeholders. Substitute your own credentials
# before running, and avoid committing real keys or tokens to version control.
openml.config.apikey = "YOUR_OPENML_API_KEY"

# Your HuggingFace token (get from huggingface.co/settings/tokens).
# It is passed explicitly to every upload/download call in the examples below.
HF_TOKEN = "YOUR_HUGGINGFACE_TOKEN"

# %%
# Example 1: Upload an OpenML Flow to HuggingFace
# ------------------------------------------------

# Get a flow from OpenML (this example uses a RandomForest classifier)
flow_id = 8365 # sklearn RandomForestClassifier
# NOTE(review): reinstantiate=True presumably asks OpenML to rebuild the model
# object from the flow description so there is a model to upload -- confirm
# against the openml.flows.get_flow documentation.
flow = openml.flows.get_flow(flow_id, reinstantiate=True)

# Show which flow was fetched before uploading it.
print(f"Flow Name: {flow.name}")
print(f"Flow ID: {flow.flow_id}")

# Upload to HuggingFace Hub; the returned value is printed below as the
# location of the uploaded model.
hf_url = upload_flow_to_huggingface(
flow=flow,
repo_id="your-username/openml-randomforest", # Change to your username
token=HF_TOKEN,
private=False, # Set to True for private repositories
)

print(f"Model uploaded to: {hf_url}")

# %%
# Example 2: Download a Model from HuggingFace
# ---------------------------------------------

# Fetch the repository uploaded in Example 1 (same repo_id).
result = download_flow_from_huggingface(
repo_id="your-username/openml-randomforest",
token=HF_TOKEN, # Only needed for private repos
)

# Access the model and its accompanying OpenML metadata; this example reads
# both from the result by the keys "model" and "metadata".
model = result["model"]
metadata = result["metadata"]

# The metadata is expected to carry the originating OpenML flow id and URL.
print(f"Downloaded model: {type(model)}")
print(f"Original OpenML Flow ID: {metadata['openml_flow_id']}")
print(f"OpenML URL: {metadata['openml_url']}")

# %%
# Example 3: Share Your Own Model
# --------------------------------
# Train a model, create a flow, publish to OpenML, then share on HuggingFace

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Get a dataset
dataset = openml.datasets.get_dataset(31)  # credit-g dataset
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)

# credit-g mixes numeric and nominal (string-valued) features. scikit-learn's
# RandomForestClassifier cannot fit string categories directly, so one-hot
# encode the nominal columns first; numeric columns pass through unchanged.
X = pd.get_dummies(X)

# Train a model
clf = RandomForestClassifier(n_estimators=10, random_state=42)
clf.fit(X, y)

# Create and publish flow.
# Conversion from an estimator to an OpenML flow goes through the extension
# registered for the model's library; fail loudly if none is available.
extension = openml.extensions.get_extension_by_model(
    clf,
    raise_if_no_extension=True,
)
flow = extension.model_to_flow(clf)
flow.publish()

print(f"Published flow with ID: {flow.flow_id}")

# Share on HuggingFace
hf_url = upload_flow_to_huggingface(
    flow=flow,
    repo_id="your-username/my-credit-model",
    token=HF_TOKEN,
    commit_message="Initial upload of credit scoring model",
)

print(f"Shared on HuggingFace: {hf_url}")

# %%
# Example 4: Using Configuration
# -------------------------------
from openml.extensions.huggingface.config import get_config, set_cache_directory

# Set custom cache directory.
# NOTE: "/path/to/custom/cache" is a placeholder. set_cache_directory creates
# the directory (including missing parents) immediately, so replace it with a
# location you are allowed to write to before running this cell.
set_cache_directory("/path/to/custom/cache")

# Check configuration: get_config returns the shared configuration object, so
# the cache directory set above is visible here.
config = get_config()
print(f"Cache directory: {config.cache_dir}")
print(f"Model filename: {config.model_filename}")
14 changes: 14 additions & 0 deletions openml/extensions/huggingface/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""
HuggingFace Hub integration for OpenML.
Enables bidirectional model sharing between OpenML and HuggingFace Hub.
"""

from .functions import (
download_flow_from_huggingface,
upload_flow_to_huggingface,
)

# Public API of the HuggingFace integration package: exactly the two
# functions re-exported from .functions above.
__all__ = [
"download_flow_from_huggingface",
"upload_flow_to_huggingface",
]
80 changes: 80 additions & 0 deletions openml/extensions/huggingface/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""Configuration for HuggingFace Hub integration."""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path

import openml


@dataclass
class HuggingFaceConfig:
    """Configuration for HuggingFace Hub integration.

    Attributes
    ----------
    cache_dir : Path or None
        Directory to cache downloaded models from HuggingFace. When left as
        ``None`` (the default), the ``huggingface`` subdirectory of the OpenML
        cache directory is used. A ``str`` (or any path-like) value is
        accepted and converted, so after initialization this attribute is
        always a ``Path`` pointing at an existing directory.
    default_commit_message : str
        Default commit message when uploading to HuggingFace.
    model_filename : str
        Filename for serialized model in HuggingFace repos.
    metadata_filename : str
        Filename for OpenML metadata in HuggingFace repos.
    readme_filename : str
        Filename for the README file in HuggingFace repos.
    """

    cache_dir: Path | None = None
    default_commit_message: str = "Upload from OpenML"
    model_filename: str = "model.pkl"
    metadata_filename: str = "openml_metadata.json"
    readme_filename: str = "README.md"

    def __post_init__(self) -> None:
        """Resolve the cache directory and make sure it exists on disk."""
        if self.cache_dir is None:
            # Use OpenML cache directory + huggingface subdirectory
            self.cache_dir = Path(openml.config.get_cache_directory()) / "huggingface"
        else:
            # Callers may pass a plain string; normalize so the attribute is
            # always a Path and the mkdir call below cannot fail on a str.
            self.cache_dir = Path(self.cache_dir)

        # Ensure cache directory exists
        self.cache_dir.mkdir(parents=True, exist_ok=True)


# Global configuration instance shared by get_config, set_cache_directory and
# reset_config. NOTE: constructing it runs HuggingFaceConfig.__post_init__, so
# the default cache directory is created as a side effect of importing this
# module.
_config = HuggingFaceConfig()


def get_config() -> HuggingFaceConfig:
    """Return the module-wide HuggingFace integration configuration.

    The object handed back is the shared instance itself, not a copy, so
    attribute changes made through it are seen by every other caller.

    Returns
    -------
    HuggingFaceConfig
        The configuration object currently in effect.
    """
    current = _config
    return current


def set_cache_directory(path: str | Path) -> None:
    """Set the cache directory for HuggingFace downloads.

    The directory (and any missing parents) is created first; the shared
    configuration is only updated once that succeeded, so a failure leaves
    the previous cache directory in effect.

    Parameters
    ----------
    path : str or Path
        Path to cache directory.

    Raises
    ------
    OSError
        If the directory cannot be created (for example due to missing
        permissions, or because ``path`` exists and is not a directory).
    """
    cache_dir = Path(path)
    # Create before assigning: never leave the config pointing at a
    # directory that could not be created.
    cache_dir.mkdir(parents=True, exist_ok=True)
    _config.cache_dir = cache_dir


def reset_config() -> None:
    """Reset configuration to defaults.

    The shared configuration object is updated in place (references obtained
    from ``get_config`` stay valid). Default values are read from the
    dataclass field definitions instead of being repeated here, so they
    cannot drift from the ``HuggingFaceConfig`` declaration. Afterwards
    ``__post_init__`` is re-run to resolve and create the default cache
    directory.
    """
    from dataclasses import MISSING, fields

    for spec in fields(_config):
        if spec.default is not MISSING:
            setattr(_config, spec.name, spec.default)
        elif spec.default_factory is not MISSING:
            setattr(_config, spec.name, spec.default_factory())
        # Fields without any default have nothing to reset to; leave them.

    # cache_dir is None again at this point; re-run the initializer so it is
    # resolved to the default location and created on disk.
    _config.__post_init__()
Loading