Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions nextstrain-pathogen.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,26 @@
---
compatibility:
nextstrain run: true
# [Discussion] Realistically we won't implement buildpacks for a number of months
# and so that can't block our dependency checking. We can try to second guess the
# correct syntax or (easier) deliberately choose a top-level key name which won't
# be used by buildpacks and then migrate to the buildpacks syntax when we implement
# them for each repo.
dependencies:
# nextstrain-augur is the name of the python package, so this will check we have augur>=30 installed
nextstrain-augur: ">=30"
# augur is the name of the CLI (not a python package which I have), so this will check we have augur>=33
# installed as a CLI *as long as* we don't have a python package called 'augur'
# (We don't need to list 'augur' and 'nextstrain-augur' - this is for testing!)
# Note: Augur 33 doesn't exist, so this is reported as a "Version incompatibilities"
augur: ">=33"
snakemake: ">=9,<10"
nextclade: '>=3.15'
nextstrain: '>=10.2'
# The following program should be reported under "Not found dependencies"
this-program-doesnt-exist: '>=1.1'
# The following program should be reported under "Declaration errors" as '1' is not a valid specifier
invalid-specifier: '1'
# I have a executable program called 'program-which-exits-2' which exits 2 every time. This dependency
# is reported under "Unexpected errors" for me, but may be "Not found dependencies" for you
program-which-exits-2: '>3.0'
3 changes: 2 additions & 1 deletion phylogenetic/Snakefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

# Utility functions shared across all workflows.
include: "../shared/vendored/snakemake/config.smk"

include: "../shared/vendored/snakemake/versioning.smk"

# Use default configuration values. Extend with Snakemake's --configfile/--config options.Add commentMore actions
configfile: os.path.join(workflow.basedir, "defaults/config.yaml")
Expand All @@ -10,6 +10,7 @@ configfile: os.path.join(workflow.basedir, "defaults/config.yaml")
if os.path.exists("config.yaml"):
configfile: "config.yaml"

check_pathogen_required_versions()

rule all:
input:
Expand Down
1 change: 1 addition & 0 deletions shared/vendored/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ Potential Nextstrain CLI scripts
Snakemake workflow functions that are shared across many pathogen workflows that don’t really belong in any of our existing tools.

- [config.smk](snakemake/config.smk) - Shared functions for parsing workflow configs.
- [versioning.smk](snakemake/versioning.smk) - Shared functions for enforcing dependency versions.

## Software requirements

Expand Down
191 changes: 191 additions & 0 deletions shared/vendored/snakemake/versioning.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
"""
Shared functions to be used within a Snakemake workflow for enforcing
versions of dependencies the repo defines within its `nextstrain-pathogen.yaml`
"""

from os import path
from sys import stderr
from packaging.specifiers import SpecifierSet, InvalidSpecifier # snakemake dependency
from packaging.version import Version, InvalidVersion # snakemake dependency
from importlib.metadata import version as importlib_version, PackageNotFoundError
from snakemake.common import __version__ as snakemake_version
import subprocess
from shutil import which
import re

class ProgramNotFoundError(Exception):
pass

class DependencyChecker():
def __init__(self, registration):
super().__init__()
self.error_attrs = ["version_incompatibilities", "not_found_dependencies", "declaration_errors", "unexpected_errors"]
for attr in self.error_attrs:
setattr(self, attr, [])
self.declared_dependencies = self.parse_dependencies(registration)

def parse_dependencies(self, registration):
declared_dependencies = {}
dependencies = registration.get('dependencies', {})
if type(dependencies) is not dict:
raise WorkflowError(f"Within `nextstrain-pathogen.yaml` the dependencies must be a dict of <name>: <specifier>. You provided {type(dependencies).__name__}")
for name, spec in dependencies.items():
try:
declared_dependencies[name] = SpecifierSet(spec)
except InvalidSpecifier:
self.declaration_errors.append(f"This pathogen declared an invalid version specification for CLI program {name!r} of {spec}")
return declared_dependencies

def check(self):
for name, specifier in self.declared_dependencies.items():
try: # First assume it's a python package
self.check_python_package(name, specifier)
except PackageNotFoundError:
try: # if it's not a python package, maybe it's a CLI?
self.check_cli_version(name, specifier)
except ProgramNotFoundError:
self.not_found_dependencies.append(f"{name!r} is not installed as a python dependency nor a CLI program. This pathogen requires a version satisfying {str(specifier)!r}")

def report_errors(self) -> bool:
if sum([len(getattr(self, attr)) for attr in self.error_attrs])==0:
print("All dependencies declared by this pathogen satisfied", file=stderr)
return False

print(file=stderr)
print('_'*80, file=stderr)
print(f"This pathogen declares dependencies which were not met.", file=stderr)
for attr in self.error_attrs:
errors = getattr(self, attr)
if len(errors)==0:
continue
print(attr.replace('_', ' ').capitalize() + ":", file=stderr)
print("-"*(len(attr)+1), file=stderr)
for msg in errors:
print(f"\t{msg}", file=stderr)
print('_'*80, file=stderr)
print(file=stderr)
return True

def check_python_package(self, name: str, specifier: SpecifierSet):
"""
Check whether the installed python library *name* meets the specifier *specifier*.
This uses importlib.metadata to check the available version which avoids importing
the top-level import.

If the package is found but the version doesn't satisfy the provided *specifier*
we log an error. Raises `PackageNotFoundError` if the package is not found.
"""
try:
if name=='snakemake':
# in conda environments importlib reports a snakemake version of 0.0.0,
# so follow the approach of Snakemake's own min_version function
version = Version(snakemake_version)
else:
version = Version(importlib_version(name))
except InvalidVersion: # <https://packaging.pypa.io/en/stable/version.html#packaging.version.InvalidVersion>
self.unexpected_errors.append(f"Python dependency {name!r} reported a version of {output} which we were unable to parse")
return

ok = specifier.contains(version)
# print(f"[DEBUG] Checking python dependency: {name!r} installed: {version} requirements: {specifier} OK? {ok}", file=stderr)
if not ok:
self.version_incompatibilities.append(f"Python dependency {name!r} version incompatibility. You have {version} but this pathogen declares {specifier}")

def check_cli_version(self, name: str, specifier: SpecifierSet) -> None:
"""
Check whether the requested *name* is (a) installed and (b) reports a version
which satisfies the *specifier*. Both (a) and (b) are achieved by calling
`<name> --version`.

If *name* isn't found (or is not executable) we raise a ProgramNotFoundError.
If the package is found but the version doesn't satisfy the provided *specifier*
we log an error.
"""
if which(name) is None:
raise ProgramNotFoundError()

cmd = [name, "--version"]
try:
proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
output = ((proc.stdout or "") + " " + (proc.stderr or "")).strip()
except subprocess.CalledProcessError as e:
self.unexpected_errors.append(f"CLI program {name!r} exited code {e.returncode} when called using {' '.join(cmd)!r}")
return

m = re.search(r"\d+(\.\d+(\.\d+)?)?([.-][0-9A-Za-z]+)*", output)
# 1 . 2 . 3 alpha etc
if not m:
self.unexpected_errors.append(f"CLI program {name!r} didn't report a parseable version when called via {' '.join(cmd)!r}")
return

try:
version = Version(m.group(0))
except InvalidVersion: # <https://packaging.pypa.io/en/stable/version.html#packaging.version.InvalidVersion>
self.unexpected_errors.append(f"CLI program {name!r} reported a version of {m.group(0)} which we were unable to parse")
Comment on lines +121 to +124
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will fail for versions which don't conform to Python's idea of a version, of course. Non-Python CLI tools seem more likely to hit that. FWIW, Nextstrain CLI has code for parsing any version string in a way that leaves the version at least comparable to like versions.


ok = specifier.contains(version)
# print(f"[DEBUG] Checking CLI program: {name!r} installed: {version} requirements: {specifier} OK? {ok}", file=stderr)
if not ok:
self.version_incompatibilities.append(f"CLI program {name!r} version incompatibility. You have {version} but this pathogen declares {specifier}")


def _read_nextstrain_pathogen_yaml(path: str) -> dict:
"""
Reads a ``nextstrain-pathogen.yaml`` file at *path* and returns a dict of
its deserialized contents.

Taken from <https://github.com/nextstrain/cli/blob/4dbac262b22a3db9c48267e23f713ad56251ffd0/nextstrain/cli/pathogens.py#L843C1-L858C24>
with modifications. (Note: pathogen repos don't need the nextstrain CLI to be installed and thus we can't import the code.)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the note here 😄 I was initially confused why this was getting implemented as part of the Snakemake workflow instead of the Nextstrain CLI which has all of the code for parsing nextstrain-pathogen.yaml. Will have to think on whether we will eventually also duplicate schema validation for nextstrain-pathogen.yaml.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After implementing this, my takeaway is that using the nextstrain-pathogen.yaml comes with a lot of baggage. Whatever I choose now will almost certainly be replaced with some other syntax when we implement workflows-as-programs, and then we have a backwards-compatibility debt to pay. We also have the conceptual conflict that the CLI is the tool which uses this file, but I want a solution that doesn't depend on the CLI. The very existence of the file implies a specific directory structure which is independent of my desire to just check some dependencies!

Alternatively I can define the dependencies elsewhere (and still use the code implemented here but just read the definitions from elsewhere). The could be (i) in the config YAMLs, (ii) hardcoded in the snakemake files or (iii) in a new file nextstrain-dependencies.yaml. Pros and cons. (i) and (ii) allow for per-workflow dependencies, which could be very nice in cetain situations but is prone to lists getting out of sync. I'm thinking about letting the snakefile decide by supplying them to check_pathogen_required_versions(dependencies, *, fatal=True); an example usage might be check_pathogen_required_versions(config['dependencies'], fatal=True). Any opinions welcomed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After implementing this, my takeaway is that using the nextstrain-pathogen.yaml comes with a lot of baggage. Whatever I choose now will almost certainly be replaced with some other syntax when we implement workflows-as-programs, and then we have a backwards-compatibility debt to pay. We also have the conceptual conflict that the CLI is the tool which uses this file, but I want a solution that doesn't depend on the CLI.

To me, the takeaway from the above is "don't try to solve this problem right now".

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To me, the takeaway from the above is "don't try to solve this problem right now".

👆 yeah, same.

Why do you want a solution that doesn't depend on Nextstrain CLI?

I agree we definitely want to keep workflows run-able independent of Nextstrain CLI, but it seems fine for dependency checking to happen at the Nextstrain CLI level and not the workflow level. I don't think we need to support all the niceties we want for workflows at the lowest possible level (the workflow), esp. if they're easier or better implemented at a higher level (Nextstrain CLI).

"""
import yaml
with open(path, encoding = "utf-8") as f:
registration = yaml.safe_load(f)

if not isinstance(registration, dict):
raise ValueError(f"nextstrain-pathogen.yaml not a dict (got a {type(registration).__name__}): {str(path)!r}")

return registration

def pathogen_yaml(*, subdir_max=3):
_searched_paths = []
for i in range(0, subdir_max):
p = path.normpath(path.join(workflow.basedir, *['..']*i, "nextstrain-pathogen.yaml"))
_searched_paths.append(p)
if path.isfile(p):
Comment on lines +149 to +154
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW, this is a more limited search than what Nextstrain CLI does, so it's prone to cause different behaviour if the search depth (height) is exceeded.

try:
registration = _read_nextstrain_pathogen_yaml(p)
except Exception as e:
raise WorkflowError(f"Unable to parse {p} (as YAML). Error: {e}")
break
else:
print("Could not find a nextstrain-pathogen.yaml file to check version dependencies.\n"
"Searched paths:\n\t" + "\n\t".join(_searched_paths))
raise WorkflowError()
return registration


def check_pathogen_required_versions(*, fatal=True):
"""
Checks if dependencies declared via the pathogen's 'nextstrain-pathogen.yaml'
are satisfied. Dependencies should be defined within the YAML like so:

dependencies:
<name>: <specification>

The syntax of <specification> is detailed in <https://packaging.python.org/en/latest/specifications/version-specifiers/#id5>

We first check if the <name> is a python package. If it is not installed
as a python package we check if it's an installed CLI and attempt to
get the version by running `<name> --version`.

If *fatal* is True (default) we raise a WorkflowError if
all conditions are not satisfied.
"""
if config.get('skip_dependency_version_checking', False) is True:
print("Skipping dependency version checking as per config setting", file=stderr)
return
checker = DependencyChecker(pathogen_yaml())
checker.check()
errors = checker.report_errors()
if errors and fatal:
raise WorkflowError("Dependencies not satisfied")
Loading