From 2b9cdf1716876eb312f0d8d266bf6a113c69c933 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Tue, 15 Jul 2025 21:04:58 +1200 Subject: [PATCH] WIP - shared smk to enforce declared deps --- nextstrain-pathogen.yaml | 23 +++ phylogenetic/Snakefile | 3 +- shared/vendored/README.md | 1 + shared/vendored/snakemake/versioning.smk | 191 +++++++++++++++++++++++ 4 files changed, 217 insertions(+), 1 deletion(-) create mode 100644 shared/vendored/snakemake/versioning.smk diff --git a/nextstrain-pathogen.yaml b/nextstrain-pathogen.yaml index 9a723f0..5837ee8 100644 --- a/nextstrain-pathogen.yaml +++ b/nextstrain-pathogen.yaml @@ -8,3 +8,26 @@ --- compatibility: nextstrain run: true +# [Discussion] Realistically we won't implement buildpacks for a number of months +# and so that can't block our dependency checking. We can try to second guess the +# correct syntax or (easier) deliberately choose a top-level key name which won't +# be used by buildpacks and then migrate to the buildpacks syntax when we implement +# them for each repo. +dependencies: + # nextstrain-augur is the name of the python package, so this will check we have augur>=30 installed + nextstrain-augur: ">=30" + # augur is the name of the CLI (not a python package which I have), so this will check we have augur>=33 + # installed as a CLI *as long as* we don't have a python package called 'augur' + # (We don't need to list 'augur' and 'nextstrain-augur' - this is for testing!) + # Note: Augur 33 doesn't exist, so this is reported as a "Version incompatibilities" + augur: ">=33" + snakemake: ">=9,<10" + nextclade: '>=3.15' + nextstrain: '>=10.2' + # The following program should be reported under "Not found dependencies" + this-program-doesnt-exist: '>=1.1' + # The following program should be reported under "Declaration errors" as '1' is not a valid specifier + invalid-specifier: '1' + # I have an executable program called 'program-which-exits-2' which exits 2 every time.
This dependency + # is reported under "Unexpected errors" for me, but may be "Not found dependencies" for you + program-which-exits-2: '>3.0' \ No newline at end of file diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index d012646..b6e95a8 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -1,7 +1,7 @@ # Utility functions shared across all workflows. include: "../shared/vendored/snakemake/config.smk" - +include: "../shared/vendored/snakemake/versioning.smk" # Use default configuration values. Extend with Snakemake's --configfile/--config options. configfile: os.path.join(workflow.basedir, "defaults/config.yaml") @@ -10,6 +10,7 @@ configfile: os.path.join(workflow.basedir, "defaults/config.yaml") if os.path.exists("config.yaml"): configfile: "config.yaml" +check_pathogen_required_versions() rule all: input: diff --git a/shared/vendored/README.md b/shared/vendored/README.md index b6aeadf..ca67a80 100644 --- a/shared/vendored/README.md +++ b/shared/vendored/README.md @@ -122,6 +122,7 @@ Potential Nextstrain CLI scripts Snakemake workflow functions that are shared across many pathogen workflows that don’t really belong in any of our existing tools. - [config.smk](snakemake/config.smk) - Shared functions for parsing workflow configs. +- [versioning.smk](snakemake/versioning.smk) - Shared functions for enforcing dependency versions.
## Software requirements
diff --git a/shared/vendored/snakemake/versioning.smk b/shared/vendored/snakemake/versioning.smk
new file mode 100644
index 0000000..c8c07a7
--- /dev/null
+++ b/shared/vendored/snakemake/versioning.smk
@@ -0,0 +1,232 @@
+"""
+Shared functions to be used within a Snakemake workflow for enforcing
+versions of dependencies the repo defines within its `nextstrain-pathogen.yaml`
+"""
+
+from os import path
+from sys import stderr
+from packaging.specifiers import SpecifierSet, InvalidSpecifier # snakemake dependency
+from packaging.version import Version, InvalidVersion # snakemake dependency
+from importlib.metadata import version as importlib_version, PackageNotFoundError
+from snakemake.common import __version__ as snakemake_version
+import subprocess
+from shutil import which
+import re
+
+class ProgramNotFoundError(Exception):
+    """Raised when a declared dependency is not an executable found via `shutil.which`."""
+
+class DependencyChecker():
+    """
+    Compares the dependencies declared in a parsed `nextstrain-pathogen.yaml`
+    with what is installed. Problems are collected as messages in one list per
+    category (the attribute names in `self.error_attrs`) so that they can all
+    be reported together.
+    """
+    def __init__(self, registration):
+        """*registration* is the dict parsed from `nextstrain-pathogen.yaml`."""
+        super().__init__()
+        self.error_attrs = ["version_incompatibilities", "not_found_dependencies", "declaration_errors", "unexpected_errors"]
+        for attr in self.error_attrs:
+            setattr(self, attr, [])
+        self.declared_dependencies = self.parse_dependencies(registration)
+
+    def parse_dependencies(self, registration):
+        """
+        Returns a dict of dependency name -> `SpecifierSet` built from the
+        'dependencies' key of *registration* (no key means no dependencies).
+
+        A specifier which isn't a string, or which can't be parsed, is recorded
+        as a declaration error and left out of the result. Raises `WorkflowError`
+        if 'dependencies' is not a dict.
+        """
+        declared_dependencies = {}
+        dependencies = registration.get('dependencies', {})
+        if not isinstance(dependencies, dict):
+            raise WorkflowError(f"Within `nextstrain-pathogen.yaml` the dependencies must be a dict of <name>: <specifier>. You provided {type(dependencies).__name__}")
+        for name, spec in dependencies.items():
+            try:
+                # An unquoted YAML scalar (e.g. `augur: 33`) is loaded as a number rather
+                # than a string; report it like any other invalid specifier
+                if not isinstance(spec, str):
+                    raise InvalidSpecifier(repr(spec))
+                declared_dependencies[name] = SpecifierSet(spec)
+            except InvalidSpecifier:
+                self.declaration_errors.append(f"This pathogen declared an invalid version specification for CLI program {name!r} of {spec}")
+        return declared_dependencies
+
+    def check(self):
+        """
+        Checks every declared dependency, first as a python package and, if no
+        such package is installed, as a CLI program. Problems are recorded on
+        this instance; nothing is returned.
+        """
+        for name, specifier in self.declared_dependencies.items():
+            try: # First assume it's a python package
+                self.check_python_package(name, specifier)
+            except PackageNotFoundError:
+                try: # if it's not a python package, maybe it's a CLI?
+                    self.check_cli_version(name, specifier)
+                except ProgramNotFoundError:
+                    self.not_found_dependencies.append(f"{name!r} is not installed as a python dependency nor a CLI program. This pathogen requires a version satisfying {str(specifier)!r}")
+
+    def report_errors(self) -> bool:
+        """
+        Prints the recorded problems to stderr, grouped by category. Returns
+        True if any problem was recorded and False otherwise.
+        """
+        if not any(getattr(self, attr) for attr in self.error_attrs):
+            print("All dependencies declared by this pathogen satisfied", file=stderr)
+            return False
+
+        print(file=stderr)
+        print('_'*80, file=stderr)
+        print("This pathogen declares dependencies which were not met.", file=stderr)
+        for attr in self.error_attrs:
+            errors = getattr(self, attr)
+            if not errors:
+                continue
+            print(attr.replace('_', ' ').capitalize() + ":", file=stderr)
+            print("-"*(len(attr)+1), file=stderr)
+            for msg in errors:
+                print(f"\t{msg}", file=stderr)
+        print('_'*80, file=stderr)
+        print(file=stderr)
+        return True
+
+    def check_python_package(self, name: str, specifier: SpecifierSet):
+        """
+        Check whether the installed python library *name* meets the specifier *specifier*.
+        This uses importlib.metadata to check the available version which avoids importing
+        the top-level import.
+
+        If the package is found but the version doesn't satisfy the provided *specifier*
+        we log an error. Raises `PackageNotFoundError` if the package is not found.
+        """
+        if name=='snakemake':
+            # in conda environments importlib reports a snakemake version of 0.0.0,
+            # so follow the approach of Snakemake's own min_version function
+            reported = snakemake_version
+        else:
+            reported = importlib_version(name)
+
+        # The raw string is kept so that it can be quoted if it can't be parsed
+        try:
+            version = Version(reported)
+        except InvalidVersion:
+            self.unexpected_errors.append(f"Python dependency {name!r} reported a version of {reported} which we were unable to parse")
+            return
+
+        ok = specifier.contains(version)
+        if not ok:
+            self.version_incompatibilities.append(f"Python dependency {name!r} version incompatibility. You have {version} but this pathogen declares {specifier}")
+
+    def check_cli_version(self, name: str, specifier: SpecifierSet) -> None:
+        """
+        Check whether the requested *name* is (a) installed and (b) reports a version
+        which satisfies the *specifier*. (a) is checked using `shutil.which` and (b)
+        by calling `<name> --version`.
+
+        If *name* isn't found (or is not executable) we raise a ProgramNotFoundError.
+        If the program is found but a version can't be obtained, or the version doesn't
+        satisfy the provided *specifier*, we log an error.
+        """
+        if which(name) is None:
+            raise ProgramNotFoundError()
+
+        cmd = [name, "--version"]
+        try:
+            proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
+        except subprocess.CalledProcessError as e:
+            self.unexpected_errors.append(f"CLI program {name!r} exited code {e.returncode} when called using {' '.join(cmd)!r}")
+            return
+        except OSError as e:
+            self.unexpected_errors.append(f"CLI program {name!r} could not be run using {' '.join(cmd)!r}: {e}")
+            return
+        output = ((proc.stdout or "") + " " + (proc.stderr or "")).strip()
+
+        # Use the first version-like token found in stdout or stderr: digits, up to two
+        # more dotted numeric parts, then any '.' or '-' separated alphanumeric suffixes
+        m = re.search(r"\d+(\.\d+(\.\d+)?)?([.-][0-9A-Za-z]+)*", output)
+        if not m:
+            self.unexpected_errors.append(f"CLI program {name!r} didn't report a parseable version when called via {' '.join(cmd)!r}")
+            return
+
+        try:
+            version = Version(m.group(0))
+        except InvalidVersion:
+            self.unexpected_errors.append(f"CLI program {name!r} reported a version of {m.group(0)} which we were unable to parse")
+            # Without a parsed version there is nothing to compare against the specifier
+            return
+
+        ok = specifier.contains(version)
+        if not ok:
+            self.version_incompatibilities.append(f"CLI program {name!r} version incompatibility. You have {version} but this pathogen declares {specifier}")
+
+
+def _read_nextstrain_pathogen_yaml(path: str) -> dict:
+    """
+    Reads a ``nextstrain-pathogen.yaml`` file at *path* and returns a dict of
+    its deserialized contents.
+
+    Adapted from the Nextstrain CLI. (Note: pathogen repos don't need the nextstrain
+    CLI to be installed and thus we can't import the code.)
+    """
+    import yaml
+    with open(path, encoding = "utf-8") as f:
+        registration = yaml.safe_load(f)
+
+    if not isinstance(registration, dict):
+        raise ValueError(f"nextstrain-pathogen.yaml not a dict (got a {type(registration).__name__}): {str(path)!r}")
+
+    return registration
+
+def pathogen_yaml(*, subdir_max=3):
+    """
+    Finds and parses the pathogen's `nextstrain-pathogen.yaml`, returning its
+    contents as a dict.
+
+    The workflow's base directory is searched first, followed by its parent
+    directories, with *subdir_max* directories examined in total. Raises
+    `WorkflowError` if no file is found, or if the first file found can't be
+    read as a YAML dict.
+    """
+    _searched_paths = []
+    for i in range(0, subdir_max):
+        p = path.normpath(path.join(workflow.basedir, *['..']*i, "nextstrain-pathogen.yaml"))
+        _searched_paths.append(p)
+        if path.isfile(p):
+            try:
+                return _read_nextstrain_pathogen_yaml(p)
+            except Exception as e:
+                raise WorkflowError(f"Unable to parse {p} (as YAML). Error: {e}") from e
+    raise WorkflowError("Could not find a nextstrain-pathogen.yaml file to check version dependencies.\n"
+                        "Searched paths:\n\t" + "\n\t".join(_searched_paths))
+
+
+def check_pathogen_required_versions(*, fatal=True):
+    """
+    Checks if dependencies declared via the pathogen's 'nextstrain-pathogen.yaml'
+    are satisfied. Dependencies should be defined within the YAML like so:
+
+        dependencies:
+            <name>: <specifier>
+
+    <specifier> is a version specifier as parsed by `packaging.specifiers.SpecifierSet`,
+    for instance ">=9,<10".
+
+    We first check if the <name> is a python package. If it is not installed
+    as a python package we check if it's an installed CLI and attempt to
+    get the version by running `<name> --version`.
+
+    If *fatal* is True (default) we raise a WorkflowError if
+    all conditions are not satisfied.
+    """
+    if config.get('skip_dependency_version_checking', False) is True:
+        print("Skipping dependency version checking as per config setting", file=stderr)
+        return
+    checker = DependencyChecker(pathogen_yaml())
+    checker.check()
+    errors = checker.report_errors()
+    if errors and fatal:
+        raise WorkflowError("Dependencies not satisfied")