36 changes: 32 additions & 4 deletions spacy_loggers/mlflow.py
@@ -11,6 +11,13 @@
from spacy.training.loggers import console_logger


class ModelDir:
    def __init__(self) -> None:
        self.path = None

    def update(self, path: str) -> None:
        self.path = path

Comment on lines +14 to +20 (Contributor):

I think we can ditch ModelDir - let's use a simple string for the latest model path (latest_model_dir or similar).
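
For illustration, a minimal sketch of that simplification: a plain string tracked via nonlocal inside the closure instead of a wrapper object. The function name setup_logger_sketch and the variable latest_model_dir are illustrative only, not part of this diff.

# Hedged sketch: replace ModelDir with a plain string updated via `nonlocal`.
from typing import Any, Dict, Optional

def setup_logger_sketch():
    latest_model_dir: Optional[str] = None  # takes the place of ModelDir().path

    def log_step(info: Optional[Dict[str, Any]]) -> None:
        nonlocal latest_model_dir
        if info is not None:
            # Remember the most recent checkpoint directory, if any.
            latest_model_dir = info.get("output_path", None)

    def finalize() -> None:
        # latest_model_dir now holds the last output path seen (or None).
        print("latest model dir:", latest_model_dir)

    return log_step, finalize

The only cost is the nonlocal declaration, which is what the ModelDir wrapper avoids; the information carried either way is a single path string.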

# entry point: spacy.MLflowLogger.v1
def mlflow_logger_v1(
    run_id: Optional[str] = None,
@@ -19,6 +26,7 @@ def mlflow_logger_v1(
    nested: bool = False,
    tags: Optional[Dict[str, Any]] = None,
    remove_config_values: List[str] = [],
    log_latest_dir: bool = True,
):
    try:
        import mlflow
@@ -33,7 +41,7 @@
        )

    console = console_logger(progress_bar=False)

    def setup_logger(
        nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]:
@@ -58,6 +66,15 @@ def setup_logger(
            mlflow.log_params({k.replace("@", ""): v for k, v in batch})

        console_log_step, console_finalize = console(nlp, stdout, stderr)

        if log_latest_dir:
            latest_model = ModelDir()

        def log_model(path, name):
            mlflow.log_artifacts(
                path,
                name
            )
Comment on lines +73 to +77 (Contributor):

Similar: since this is just a wrapper without added functionality, let's use mlflow.log_artifacts() directly.
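
For reference, a minimal self-contained sketch of the direct call. The run context and the ./training/model-best path are illustrative; mlflow.log_artifacts(local_dir, artifact_path=...) uploads the contents of a local directory to the active run.

import mlflow

# Hedged sketch: log a checkpoint directory directly, without a local wrapper.
with mlflow.start_run():
    # Uploads every file under the (illustrative) local directory as "model_best".
    mlflow.log_artifacts("./training/model-best", artifact_path="model_best")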


        def log_step(info: Optional[Dict[str, Any]]):
            console_log_step(info)
@@ -66,23 +83,34 @@ def log_step(info: Optional[Dict[str, Any]]):
                other_scores = info["other_scores"]
                losses = info["losses"]
                output_path = info.get("output_path", None)
                if log_latest_dir:
                    latest_model.update(output_path)

                if score is not None:
                    mlflow.log_metric("score", score)
                    mlflow.log_metric("score", score, info["step"])
                if losses:
                    mlflow.log_metrics({f"loss_{k}": v for k, v in losses.items()})
                    mlflow.log_metrics({f"loss_{k}": v for k, v in losses.items()}, info["step"])
                if isinstance(other_scores, dict):
                    mlflow.log_metrics(
                        {
                            k: v
                            for k, v in util.dict_to_dot(other_scores).items()
                            if isinstance(v, float) or isinstance(v, int)
                        }
                        },
                        info["step"]
                    )
                if output_path and score == max(info["checkpoints"])[0]:
                    nlp = load(output_path)
                    mlflow.spacy.log_model(nlp, "best")
                    log_model(output_path, 'model_best')
Comment on lines 104 to +105 (Contributor):

It irks me that we now have three artifacts - best, model_best and model_last. Ideally I'd like to have just best and last, both with the complete set of associated data (like best has, see screenshot), purely for the sake of consistency. If that doesn't work out due to the issues you described, I guess this is fine. I'll look into this.
[screenshot: the best artifact and its associated files in the MLflow UI]

Reply (Contributor):

> Note: I ended up using mlflow.log_artifact() rather than mlflow.spacy.log_model() because when I use the latter method outside of the log_step() function it looks like spacy just aborts the process abruptly, without letting the mlflow.spacy function do its thing.

Hm, I ran this with

        def finalize() -> None:
            if log_latest_dir:
                mlflow.spacy.log_model(nlp, "model_last")

now and this seemed to work (see image). What's your outcome?
[screenshot: the model_last artifact in the MLflow UI]
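
For completeness, a self-contained sketch of what logging the full spaCy model flavor amounts to. The blank pipeline stands in for the trained nlp object, and the artifact name model_last mirrors the snippet above; this is an assumption-laden illustration, not the final patch.

import mlflow
import spacy

nlp = spacy.blank("en")  # stand-in for the trained pipeline
with mlflow.start_run():
    # mlflow.spacy.log_model() stores the pipeline as an MLflow model flavor,
    # i.e. with the full set of associated metadata rather than raw files.
    mlflow.spacy.log_model(nlp, "model_last")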



        def finalize() -> None:

            if log_latest_dir:
                log_model(latest_model.path, 'model_last')

            print('End run')
Comment on lines +109 to +113 (Contributor):

Suggested change
-            if log_latest_dir:
-                log_model(latest_model.path, 'model_last')
-            print('End run')
+            if log_latest_dir:
+                log_model(latest_model.path, 'model_last')

            console_finalize()
            mlflow.end_run()
