diff --git a/CHANGELOG.md b/CHANGELOG.md index f8694b6..077b9cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0. ## [Unreleased] +### Changed +- Upgrade Azure ML SDK from V1 to V2 before [V1 deprecation](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-migrate-from-v1) ([#73](https://github.com/microsoft/molecule-generation/pull/73)) + ## [0.4.1] - 2024-01-03 ### Changed diff --git a/environment-py37.yml b/environment-py37.yml index 00161b4..4aefb9a 100644 --- a/environment-py37.yml +++ b/environment-py37.yml @@ -8,4 +8,6 @@ dependencies: - rdkit==2020.09.1.0 - tensorflow==2.1.0 - pip: - - numpy==1.19.2 + - numpy==1.19.2 + - mlflow==1.30.1 + - azureml-mlflow==1.55.0 diff --git a/environment-py38.yml b/environment-py38.yml index 4ffa715..dd6eb8e 100644 --- a/environment-py38.yml +++ b/environment-py38.yml @@ -9,3 +9,5 @@ dependencies: - tensorflow==2.6.2 - pip: - numpy==1.22.4 + - mlflow==2.17.2 + - azureml-mlflow==1.60.0 diff --git a/environment-py39.yml b/environment-py39.yml index 4a9979d..1370097 100644 --- a/environment-py39.yml +++ b/environment-py39.yml @@ -9,3 +9,5 @@ dependencies: - tensorflow==2.9.1 - pip: - numpy==1.24.3 + - mlflow==3.1.4 + - azureml-mlflow==1.60.0 diff --git a/environment.yml b/environment.yml index 6dd760e..73474aa 100644 --- a/environment.yml +++ b/environment.yml @@ -9,3 +9,5 @@ dependencies: - tensorflow<2.10 - pip: - numpy + - mlflow + - azureml-mlflow diff --git a/molecule_generation/cli/train.py b/molecule_generation/cli/train.py index 794fbcc..8f432c4 100644 --- a/molecule_generation/cli/train.py +++ b/molecule_generation/cli/train.py @@ -7,6 +7,7 @@ import time from typing import Dict, Any, Callable, Tuple, Union +import mlflow import numpy as np import tensorflow as tf import tf2_gnn.cli_utils as cli @@ -163,13 +164,6 @@ def log(msg) -> None: log(f"Dataset parameters: 
{json.dumps(training_utils.unwrap_tf_tracked_data(dataset._params))}") log(f"Model parameters: {json.dumps(training_utils.unwrap_tf_tracked_data(model._params))}") - if args.azureml_logging: - from azureml.core.run import Run - - aml_run = Run.get_context() - else: - aml_run = None - # Set up tensorboard logging. if args.tensorboard or args.profile: writer = tf.summary.create_file_writer(os.path.join(args.save_dir, "tensorboard")) @@ -185,7 +179,7 @@ def log(msg) -> None: patience=args.patience, save_dir=args.save_dir, quiet=args.quiet, - aml_run=aml_run, + should_log_aml_run=args.azureml_logging, # whether metrics should be logged to the AzureML run via MLflow profile=args.profile, ) @@ -208,12 +202,12 @@ def log(msg) -> None: try: with dataset.get_context_managed_tf_dataset(training_utils.DataFold.TEST) as test_data: _, _, test_results = model.run_on_data_iterator( - iter(test_data.tf_dataset), training=False, quiet=args.quiet, aml_run=aml_run + iter(test_data.tf_dataset), training=False, quiet=args.quiet, should_log_aml_run=args.azureml_logging ) test_metric, test_metric_string = model.compute_epoch_metrics(test_results) log(test_metric_string) - if aml_run is not None: - aml_run.log("task_test_metric", float(test_metric)) + if args.azureml_logging: + mlflow.log_metric("task_test_metric", float(test_metric)) finally: dataset._params["trace_element_keep_prob"] = orig_keep_prob dataset._params["trace_element_non_carbon_keep_prob"] = orig_non_carbon_keep_prob @@ -230,7 +224,7 @@ def train( patience: int, save_dir: str, quiet: bool = False, - aml_run=None, + should_log_aml_run=None, profile: bool = False, ): save_file = os.path.join(save_dir, f"{run_id}_best.pkl") @@ -254,7 +248,7 @@ def train( training=False, quiet=quiet, max_num_steps=num_valid_steps, - aml_run=aml_run, + should_log_aml_run=should_log_aml_run, ) best_valid_metric, best_val_str = model.compute_epoch_metrics(initial_valid_results) log_fun(f"Initial valid metric: {best_val_str}.") @@ -276,7 +270,7 
@@ def train( training=True, quiet=quiet, max_num_steps=num_train_steps_between_valid, - aml_run=aml_run, + should_log_aml_run=should_log_aml_run, ) if profile and epoch == 2: @@ -294,7 +288,7 @@ def train( training=False, quiet=quiet, max_num_steps=num_valid_steps, - aml_run=aml_run, + should_log_aml_run=should_log_aml_run, ) tf.summary.scalar("valid_loss", data=valid_loss, step=epoch) @@ -303,11 +297,11 @@ def train( f" Valid: {valid_loss:.4f} loss | {valid_metric_string} | {valid_speed:.2f} graphs/s", ) - if aml_run is not None: - aml_run.log("task_train_metric", float(train_metric)) - aml_run.log("train_speed", float(train_speed)) - aml_run.log("task_valid_metric", float(valid_metric)) - aml_run.log("valid_speed", float(valid_speed)) + if should_log_aml_run: + mlflow.log_metric("task_train_metric", float(train_metric)) + mlflow.log_metric("train_speed", float(train_speed)) + mlflow.log_metric("task_valid_metric", float(valid_metric)) + mlflow.log_metric("valid_speed", float(valid_speed)) # Save if good enough. 
if valid_metric < best_valid_metric: diff --git a/molecule_generation/models/cgvae.py b/molecule_generation/models/cgvae.py index e75b251..089c341 100644 --- a/molecule_generation/models/cgvae.py +++ b/molecule_generation/models/cgvae.py @@ -629,12 +629,12 @@ def run_on_data_iterator( quiet: bool = False, training: bool = True, max_num_steps: Optional[int] = None, # Run until dataset ends if None - aml_run: Optional = None, + should_log_aml_run: Optional[bool] = None, ) -> Tuple[float, float, List[Any]]: with EpochMetricsLogger( window_size=self._logged_loss_smoothing_window_size, quiet=quiet, - aml_run=aml_run, + should_log_aml_run=should_log_aml_run, training=training, ) as metrics_logger: for step, (batch_features, batch_labels) in enumerate(data_iterator): diff --git a/molecule_generation/models/moler_base_model.py b/molecule_generation/models/moler_base_model.py index 6f13954..a17b291 100644 --- a/molecule_generation/models/moler_base_model.py +++ b/molecule_generation/models/moler_base_model.py @@ -229,12 +229,12 @@ def run_on_data_iterator( quiet: bool = False, training: bool = True, max_num_steps: Optional[int] = None, # Run until dataset ends if None - aml_run: Optional[Any] = None, + should_log_aml_run: Optional[bool] = None, ) -> Tuple[float, float, List[Any]]: with EpochMetricsLogger( window_size=self._logged_loss_smoothing_window_size, quiet=quiet, - aml_run=aml_run, + should_log_aml_run=should_log_aml_run, training=training, ) as metrics_logger: for step, (batch_features, batch_labels) in enumerate(data_iterator): diff --git a/molecule_generation/utils/epoch_metrics_logger.py b/molecule_generation/utils/epoch_metrics_logger.py index 054ece4..a971ff5 100644 --- a/molecule_generation/utils/epoch_metrics_logger.py +++ b/molecule_generation/utils/epoch_metrics_logger.py @@ -2,6 +2,7 @@ from collections import defaultdict, deque import time +import mlflow import numpy as np @@ -10,11 +11,11 @@ class EpochMetricsLogger: """Logs metrics 
for an epoch of training""" def __init__( - self, *, window_size: int = 100, quiet: bool, aml_run: Optional, training: bool + self, *, window_size: int = 100, quiet: bool, should_log_aml_run: Optional[bool], training: bool ) -> None: self._window_size = window_size self._quiet = quiet - self._aml_run = aml_run + self._should_log_aml_run = should_log_aml_run self._training = training # Initialise everything in case you don't want to use this as a contextmanager @@ -53,9 +54,9 @@ def log_step_metrics(self, task_metrics, batch_features): ) if self._step >= self._window_size and self._step % self._window_size == 0: self._moving_average_metrics = self._get_moving_average_metrics() - if self._aml_run is not None: + if self._should_log_aml_run: for k, v in self._moving_average_metrics.items(): - self._aml_run.log("smoothed_" + k, float(v)) + mlflow.log_metric("smoothed_" + k, float(v)) # Tensorboard logging: batch_graph_average_loss = task_metrics["loss"] / float( diff --git a/molecule_generation/utils/property_models.py b/molecule_generation/utils/property_models.py index 92747d3..561fc92 100644 --- a/molecule_generation/utils/property_models.py +++ b/molecule_generation/utils/property_models.py @@ -3,6 +3,7 @@ from abc import abstractmethod from typing import Any, Dict, List, Tuple, Callable, Optional +import mlflow import numpy as np import tensorflow as tf import sklearn.metrics as metrics @@ -172,7 +173,7 @@ def print_evaluation_report( @staticmethod def log_evaluation_report( - prop_name: str, predictions, labels, aml_run=None, log_fun: Callable[[str], None] = print + prop_name: str, predictions, labels, should_log_aml_run=None, log_fun: Callable[[str], None] = print ) -> None: mae = metrics.mean_absolute_error(y_true=labels, y_pred=predictions) mse = metrics.mean_squared_error(y_true=labels, y_pred=predictions) @@ -187,15 +188,12 @@ def log_evaluation_report( log_fun(f" Explained Variance: {expl_var:.3f}") log_fun(f" R2 Score: {r2_score:.3f}") - if 
aml_run: - aml_run.log_row( - f"{prop_name}_test_metrics", - mean_abs_err=float(mae), - mse=float(mse), - max_err=float(max_err), - explained_variance=float(expl_var), - r2_score=float(r2_score), - ) + if should_log_aml_run: + mlflow.log_metric(f"{prop_name}_mean_abs_err", float(mae)) + mlflow.log_metric(f"{prop_name}_mse", float(mse)) + mlflow.log_metric(f"{prop_name}_max_err", float(max_err)) + mlflow.log_metric(f"{prop_name}_explained_variance", float(expl_var)) + mlflow.log_metric(f"{prop_name}_r2_score", float(r2_score)) class MLPBinaryClassifierLayer(MLPRegressionLayer): @@ -255,7 +253,7 @@ def print_evaluation_report( @staticmethod def log_evaluation_report( - prop_name: str, predictions, labels, aml_run=None, log_fun: Callable[[str], None] = print + prop_name: str, predictions, labels, should_log_aml_run=None, log_fun: Callable[[str], None] = print ) -> None: rounded_predictions = np.round(predictions) acc = metrics.accuracy_score(y_true=labels, y_pred=rounded_predictions) @@ -273,13 +271,10 @@ def log_evaluation_report( log_fun(f" F1 Score: {f1_score:.4f}") log_fun(f" ROC AUC: {roc_auc:.4f}") - if aml_run: - aml_run.log_row( - f"{prop_name}_test_metrics", - accuracy=float(acc), - balanced_accuracy=float(balanced_acc), - precision=float(precision), - recall=float(recall), - fl_score=float(f1_score), - roc_auc_score=float(roc_auc), - ) + if should_log_aml_run: + mlflow.log_metric(f"{prop_name}_accuracy", float(acc)) + mlflow.log_metric(f"{prop_name}_balanced_accuracy", float(balanced_acc)) + mlflow.log_metric(f"{prop_name}_precision", float(precision)) + mlflow.log_metric(f"{prop_name}_recall", float(recall)) + mlflow.log_metric(f"{prop_name}_f1_score", float(f1_score)) + mlflow.log_metric(f"{prop_name}_roc_auc_score", float(roc_auc))