From 3cd5daedcf8de2f7a070baa3acdce203de647290 Mon Sep 17 00:00:00 2001
From: Matteo Interlandi
Date: Tue, 17 Jan 2023 14:28:23 -0800
Subject: [PATCH 1/3] add ops with 2 outputs

---
 hummingbird/ml/_parse.py        | 27 +++++++++----
 hummingbird/ml/supported.py     | 71 +++++++++++++++++++++++++++++++++
 tests/test_xgboost_converter.py | 37 ++++++++++++++---
 3 files changed, 123 insertions(+), 12 deletions(-)

diff --git a/hummingbird/ml/_parse.py b/hummingbird/ml/_parse.py
index 40ebd388f..514fb498d 100644
--- a/hummingbird/ml/_parse.py
+++ b/hummingbird/ml/_parse.py
@@ -11,7 +11,6 @@
 from collections import OrderedDict
 from copy import deepcopy
 import pprint
-from uuid import uuid4
 
 import numpy as np
 from onnxconverter_common.optimizer import LinkedNode, _topological_sort
@@ -27,7 +26,12 @@
 from ._topology import Topology
 from ._utils import sklearn_installed, sparkml_installed
 from .operator_converters import constants
-from .supported import get_sklearn_api_operator_name, get_onnxml_api_operator_name, get_sparkml_api_operator_name
+from .supported import (
+    get_sklearn_api_operator_name,
+    is_sklearn_models_with_two_outputs,
+    get_onnxml_api_operator_name,
+    get_sparkml_api_operator_name,
+)
 
 # Stacking is only supported starting from scikit-learn 0.22.
 try:
@@ -249,13 +253,22 @@ def _parse_sklearn_single_model(topology, model, inputs):
     if isinstance(model, str):
         raise RuntimeError("Parameter model must be an object not a " "string '{0}'.".format(model))
 
-    alias = get_sklearn_api_operator_name(type(model))
+    model_type = type(model)
+    alias = get_sklearn_api_operator_name(model_type)
     this_operator = topology.declare_logical_operator(alias, model)
     this_operator.inputs = inputs
 
-    # We assume that all scikit-learn operators produce a single output.
-    variable = topology.declare_logical_variable("variable")
-    this_operator.outputs.append(variable)
+    if is_sklearn_models_with_two_outputs(model_type):
+        # This operator produces two outputs (e.g., label and probability)
+        variable = topology.declare_logical_variable("variable1")
+        this_operator.outputs.append(variable)
+
+        variable = topology.declare_logical_variable("variable2")
+        this_operator.outputs.append(variable)
+    else:
+        # We assume that all scikit-learn operators produce a single output.
+        variable = topology.declare_logical_variable("variable")
+        this_operator.outputs.append(variable)
 
     return this_operator.outputs
 
@@ -602,7 +615,7 @@ def _parse_onnx_api(topology, model, inputs):
     node_list = LinkedNode.build_from_onnx(graph.node, [], inputs_names + [in_.name for in_ in initializers], output_names)
 
     # Make sure the entire node_list isn't only 'Identity'
-    if all([x.op_type == 'Identity' for x in node_list]):
+    if all([x.op_type == "Identity" for x in node_list]):
         raise RuntimeError("ONNX model contained only Identity nodes {}.".format(node_list))
 
     # This a new node list but with some node been removed plus eventual variable renaming.
diff --git a/hummingbird/ml/supported.py b/hummingbird/ml/supported.py
index 03264b1e8..dde7f5ae8 100644
--- a/hummingbird/ml/supported.py
+++ b/hummingbird/ml/supported.py
@@ -427,6 +427,60 @@ def _build_sklearn_api_operator_name_map():
     }
 
 
+def _build_sklearn_operator_with_two_outputs():
+    """
+    Associate Sklearn with the operator class names that have two outputs.
+    """
+    """
+    Put all supported Sklearn operators on a list.
+    """
+    ops = set()
+
+    if sklearn_installed():
+        # Tree-based models
+        from sklearn.ensemble import (
+            ExtraTreesClassifier,
+            GradientBoostingClassifier,
+            HistGradientBoostingClassifier,
+            RandomForestClassifier,
+        )
+
+        from sklearn.tree import DecisionTreeClassifier
+
+        # Linear-based models
+        from sklearn.linear_model import (
+            LogisticRegression,
+            LogisticRegressionCV,
+            SGDClassifier,
+        )
+
+        # SVM-based models
+        from sklearn.svm import LinearSVC, SVC, NuSVC
+
+        ops.add(
+            [
+                # Trees
+                DecisionTreeClassifier,
+                ExtraTreesClassifier,
+                GradientBoostingClassifier,
+                HistGradientBoostingClassifier,
+                RandomForestClassifier,
+                LinearSVC,
+                LogisticRegression,
+                LogisticRegressionCV,
+                SGDClassifier,
+                # SVM
+                NuSVC,
+                SVC,
+            ]
+        )
+
+    ops.add(xgb_operator_list[0] if len(xgb_operator_list) > 0 else None)
+    ops.add(lgbm_operator_list[0] if len(lgbm_operator_list) > 0 else None)
+
+    return ops
+
+
 def _build_onnxml_api_operator_name_map():
     """
     Associate ONNXML with the operator class names.
@@ -466,6 +520,22 @@ def get_sklearn_api_operator_name(model_type):
     return sklearn_api_operator_name_map[model_type]
 
 
+def is_sklearn_models_with_two_outputs(model_type):
+    """
+    Check whether the input model type produces two outputs (e.g., labels and probabilities).
+
+    Args:
+        model_type: A scikit-learn model object (e.g., RandomForestClassifier)
+            or an object with scikit-learn API (e.g., LightGBM)
+
+    Returns:
+        True if the input model type produces two outputs, False otherwise
+    """
+    assert model_type in sklearn_api_operator_name_map
+
+    return sklearn_operator_with_two_outputs[model_type]
+
+
 def get_onnxml_api_operator_name(model_type):
     """
     Get the operator name for the input model type in *ONNX-ML API* format.
@@ -507,6 +577,7 @@ def get_sparkml_api_operator_name(model_type):
 prophet_operator_list = _build_prophet_operator_list()
 
 sklearn_api_operator_name_map = _build_sklearn_api_operator_name_map()
+sklearn_operator_with_two_outputs = _build_sklearn_operator_with_two_outputs()
 onnxml_api_operator_name_map = _build_onnxml_api_operator_name_map()
 sparkml_api_operator_name_map = _build_sparkml_api_operator_name_map()
 
diff --git a/tests/test_xgboost_converter.py b/tests/test_xgboost_converter.py
index 42abaf31b..6759afca2 100644
--- a/tests/test_xgboost_converter.py
+++ b/tests/test_xgboost_converter.py
@@ -9,7 +9,7 @@
 from sklearn.model_selection import train_test_split
 
 import hummingbird.ml
-from hummingbird.ml._utils import xgboost_installed, tvm_installed, pandas_installed
+from hummingbird.ml._utils import xgboost_installed, tvm_installed, pandas_installed, onnx_runtime_installed
 from hummingbird.ml import constants
 
 from tree_utils import gbdt_implementation_map
@@ -253,7 +253,6 @@ def test_run_xgb_pandas(self):
     @unittest.skipIf(not xgboost_installed(), reason="XGBoost test requires XGBoost installed")
     def test_xgb_regressor_converter_torchscript(self):
         warnings.filterwarnings("ignore")
-        import torch
 
         for max_depth in [1, 3, 8, 10, 12]:
             model = xgb.XGBRegressor(n_estimators=10, max_depth=max_depth)
@@ -272,7 +271,6 @@ def test_xgb_regressor_converter_torchscript(self):
     @unittest.skipIf(not xgboost_installed(), reason="XGBoost test requires XGBoost installed")
     def test_xgb_classifier_converter_torchscript(self):
         warnings.filterwarnings("ignore")
-        import torch
 
         for max_depth in [1, 3, 8, 10, 12]:
             model = xgb.XGBClassifier(n_estimators=10, max_depth=max_depth)
@@ -293,7 +291,6 @@ def test_xgb_classifier_converter_torchscript(self):
reason="TVM test requires TVM installed") def test_xgb_regressor_converter_tvm(self): warnings.filterwarnings("ignore") - import torch for max_depth in [1, 3, 8, 10, 12]: model = xgb.XGBRegressor(n_estimators=10, max_depth=max_depth) @@ -313,7 +310,6 @@ def test_xgb_regressor_converter_tvm(self): @unittest.skipIf(not tvm_installed(), reason="TVM test requires TVM installed") def test_xgb_classifier_converter_tvm(self): warnings.filterwarnings("ignore") - import torch for max_depth in [1, 3, 8, 10, 12]: model = xgb.XGBClassifier(n_estimators=10, max_depth=max_depth) @@ -328,6 +324,37 @@ def test_xgb_classifier_converter_tvm(self): self.assertIsNotNone(tvm_model) np.testing.assert_allclose(model.predict_proba(X), tvm_model.predict_proba(X), rtol=1e-06, atol=1e-06) + # Check that we can export into ONNX. + @unittest.skipIf(not onnx_runtime_installed(), reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS") + @unittest.skipIf(not xgboost_installed(), reason="XGBoost test requires LightGBM installed") + def test_xgb_onnx(self): + warnings.filterwarnings("ignore") + + X = [[0, 1], [1, 1], [2, 0]] + X = np.array(X, dtype=np.float32) + y = np.array([100, -10, 50], dtype=np.float32) + model = xgb.XGBRegressor(n_estimators=3, min_child_samples=1) + model.fit(X, y) + + # Create ONNX model + onnx_model = hummingbird.ml.convert(model, "onnx", X) + + np.testing.assert_allclose(onnx_model.predict(X).flatten(), model.predict(X)) + + # Check output renaming with two outputs + @unittest.skipIf(not onnx_runtime_installed(), reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS") + @unittest.skipIf(not xgboost_installed(), reason="XGBoost test requires LightGBM installed") + def test_xgb_onnx_two_outputs(self): + model = xgb.XGBClassifier(n_estimators=3, max_depth=5) + X = [[0, 1], [1, 1], [2, 0]] + X = np.array(X, dtype=np.float32) + y = np.array([100, -10, 50], dtype=np.float32) + + model.fit(X, y) + + torch_model = hummingbird.ml.convert(model, "onnx", X) + self.assertIsNotNone(torch_model) + if __name__ == "__main__": unittest.main() From b136889aa8c1c45d26013dd63664021894ebf337 Mon Sep 17 00:00:00 2001 From: Matteo Interlandi Date: Tue, 17 Jan 2023 23:31:54 +0000 Subject: [PATCH 2/3] add test --- hummingbird/ml/supported.py | 8 ++++---- tests/test_xgboost_converter.py | 10 +++++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/hummingbird/ml/supported.py b/hummingbird/ml/supported.py index dde7f5ae8..47b63ca8b 100644 --- a/hummingbird/ml/supported.py +++ b/hummingbird/ml/supported.py @@ -457,7 +457,7 @@ def _build_sklearn_operator_with_two_outputs(): # SVM-based models from sklearn.svm import LinearSVC, SVC, NuSVC - ops.add( + ops.update( [ # Trees DecisionTreeClassifier, @@ -475,8 +475,8 @@ def _build_sklearn_operator_with_two_outputs(): ] ) - ops.add(xgb_operator_list[0] if len(xgb_operator_list) > 0 else None) - ops.add(lgbm_operator_list[0] if len(lgbm_operator_list) > 0 else None) + ops.update([xgb_operator_list[0]] if len(xgb_operator_list) > 0 else []) + ops.update([lgbm_operator_list[0]] if len(lgbm_operator_list) > 0 else []) return ops @@ -533,7 +533,7 @@ def is_sklearn_models_with_two_outputs(model_type): """ assert model_type in sklearn_api_operator_name_map - return sklearn_operator_with_two_outputs[model_type] + return model_type in sklearn_operator_with_two_outputs def get_onnxml_api_operator_name(model_type): diff --git a/tests/test_xgboost_converter.py b/tests/test_xgboost_converter.py index 6759afca2..e043d9057 100644 --- a/tests/test_xgboost_converter.py +++ 
@@ -346,15 +346,19 @@ def test_xgb_onnx(self):
     @unittest.skipIf(not xgboost_installed(), reason="XGBoost test requires XGBoost installed")
     def test_xgb_onnx_two_outputs(self):
         model = xgb.XGBClassifier(n_estimators=3, max_depth=5)
-        X = [[0, 1], [1, 1], [2, 0]]
+        np.random.seed(0)
+        X = np.random.rand(100, 200)
         X = np.array(X, dtype=np.float32)
-        y = np.array([100, -10, 50], dtype=np.float32)
+        y = np.random.randint(2, size=100)
 
         model.fit(X, y)
 
-        torch_model = hummingbird.ml.convert(model, "onnx", X)
+        torch_model = hummingbird.ml.convert(model, "onnx", X, extra_config={constants.OUTPUT_NAMES: ['labels', 'predictions']})
         self.assertIsNotNone(torch_model)
 
+        assert(torch_model.model.graph.output[0].name == 'labels')
+        assert(torch_model.model.graph.output[1].name == 'predictions')
+
 
 if __name__ == "__main__":
     unittest.main()

From 301b0436f348e42fe028f2069fc380dad9bac30b Mon Sep 17 00:00:00 2001
From: Matteo Interlandi
Date: Thu, 2 Feb 2023 15:18:50 -0800
Subject: [PATCH 3/3] lint

---
 tests/test_xgboost_converter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_xgboost_converter.py b/tests/test_xgboost_converter.py
index e043d9057..ea8e6a6c7 100644
--- a/tests/test_xgboost_converter.py
+++ b/tests/test_xgboost_converter.py
@@ -356,8 +356,8 @@ def test_xgb_onnx_two_outputs(self):
         torch_model = hummingbird.ml.convert(model, "onnx", X, extra_config={constants.OUTPUT_NAMES: ['labels', 'predictions']})
         self.assertIsNotNone(torch_model)
 
-        assert(torch_model.model.graph.output[0].name == 'labels')
-        assert(torch_model.model.graph.output[1].name == 'predictions')
+        self.assertTrue(torch_model.model.graph.output[0].name == 'labels')
+        self.assertTrue(torch_model.model.graph.output[1].name == 'predictions')
 
 
 if __name__ == "__main__":