poldrack · poldrack · Dec 29, 2025 · Dec 27, 2025 · Dec 28, 2025 · Dec 28, 2025
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,41 @@
+# Git
+.git
+.gitignore
+
+# Python
+__pycache__
+*.py[cod]
+*$py.class
+*.so
+.Python
+.venv
+venv
+ENV
+
+# Build artifacts
+_build
+*.egg-info
+dist
+build
+
+# IDE
+.idea
+.vscode
+*.swp
+*.swo
+
+# Jupyter
+.ipynb_checkpoints
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Docker (keep Dockerfile accessible)
+docker/Makefile
+docker/README.md
+
+# Misc
+*.log
+.env
+.env.*
diff --git a/book/workflows.md b/book/workflows.md
@@ -505,6 +505,18 @@ total                              5
 
 Similarly, Snakemake will rerun the workflow if any of the scripts used to run the workflow are modified.  However, it's important to note that it will not identify changes in the modules that are imported.  In that case you would need to rerun the workflow in order to re-execute the relevant steps.
 
+#### Reproducible environments with Conda
+
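+After installing Miniconda, create a conda environment that includes Snakemake, activate it, and install the project into it in editable mode: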
+
+
+```bash
+conda create -c conda-forge -c bioconda -c nodefaults -n bettercode snakemake
+conda activate bettercode
+pip install -e .
+```
+
+
 ## Scaling to a complex workflow
 
 We now turn to a more realistic and complex scientific data analysis workflow. For this example I will use an analysis of single-cell RNA-sequencing data to determine how gene expression in immune system cells changes with age. This analysis will utilize a [large openly available dataset](https://cellxgene.cziscience.com/collections/dde06e0f-ab3b-46be-96a2-a8082383c4a1) that includes data from 982 people comprising about 1.3 million peripheral blood mononuclear cells (i.e. white blood cells) for about 35K transcripts.  I chose this particular example for several reasons:

diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -0,0 +1,73 @@
+# Dockerfile for BetterCodeBetterScience
+# Builds an image with all dependencies for running the code examples
+
+FROM python:3.12-slim-bookworm
+
+LABEL maintainer="Russell Poldrack"
+LABEL description="Container for BetterCodeBetterScience book code examples"
+
+# Build-time only (ARG, not ENV): prevent interactive prompts during apt installs
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    # Build essentials
+    build-essential \
+    gcc \
+    g++ \
+    gfortran \
+    # Git for datalad and version control
+    git \
+    git-annex \
+    # HDF5 for h5py
+    libhdf5-dev \
+    # For scientific packages
+    libopenblas-dev \
+    liblapack-dev \
+    # For igraph/leidenalg
+    libigraph-dev \
+    # For image processing
+    libjpeg-dev \
+    libpng-dev \
+    # For SSL/networking
+    libssl-dev \
+    libcurl4-openssl-dev \
+    # For XML parsing
+    libxml2-dev \
+    libxslt1-dev \
+    # R and dependencies for rpy2
+    r-base \
+    r-base-dev \
+    # Misc utilities
+    curl \
+    wget \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install uv (pinned, not :latest, so rebuilds are reproducible; bump deliberately)
+COPY --from=ghcr.io/astral-sh/uv:0.9.0 /uv /usr/local/bin/uv
+
+# Set up working directory
+WORKDIR /app
+
+# Copy only what the install needs first, so the dependency layer stays cached
+COPY pyproject.toml README.md ./
+COPY src/ ./src/
+
+# Create virtual environment and put it first on PATH for all later commands
+RUN uv venv /app/.venv
+ENV VIRTUAL_ENV=/app/.venv
+ENV PATH="/app/.venv/bin:$PATH"
+
+# Install the project (editable) and all dependencies
+RUN uv pip install -e .
+
+# Copy remaining project files
+COPY book/ ./book/
+COPY tests/ ./tests/
+COPY data/ ./data/
+COPY scripts/ ./scripts/
+COPY myst.yml ./
+
+# Set default command
+CMD ["python", "--version"]
diff --git a/docker/Makefile b/docker/Makefile
@@ -0,0 +1,11 @@
+# Neither target creates a file with its own name. Declaring them phony keeps
+# an existing `build/` directory from making `build` look up to date.
+.PHONY: build build-from-here
+
+# Run from project root: make -f docker/Makefile build
+build:
+	docker build -f docker/Dockerfile -t bettercode .
+
+# Or run from docker directory: make build-from-here
+build-from-here:
+	docker build -f Dockerfile -t bettercode ..
diff --git a/docker/README.md b/docker/README.md
@@ -0,0 +1,26 @@
+# Docker for BetterCodeBetterScience
+
+## Building the image
+
+From the repository root:
+
+```bash
+docker build -f docker/Dockerfile -t bettercode .
+```
+
+## Running the container
+
+Interactive shell:
+```bash
+docker run -it bettercode /bin/bash
+```
+
+Run tests:
+```bash
+docker run bettercode pytest
+```
+
+Mount a local data directory at `/data` inside the container:
+```bash
+docker run -v /path/to/local/data:/data bettercode python script.py
+```
diff --git a/Dockerfile → docker_example/Dockerfile b/Dockerfile → docker_example/Dockerfile
diff --git a/src/bettercode/rnaseq/snakemake_workflow/Makefile b/src/bettercode/rnaseq/snakemake_workflow/Makefile
@@ -7,7 +7,7 @@ rulegraph:
 
 # Run the full workflow
 run:
-	snakemake --cores 8 --config datadir=$(DATADIR)/immune_aging/wf_snakemake/
+	snakemake --cores 8 --sdm conda --config datadir=$(DATADIR)/immune_aging/wf_snakemake/
 # Generate HTML report (run after workflow completes)
 report:
 	snakemake --report $(DATADIR)/immune_aging/wf_snakemake/report.html --config datadir=$(DATADIR)/immune_aging/

diff --git a/src/bettercode/rnaseq/snakemake_workflow/Snakefile b/src/bettercode/rnaseq/snakemake_workflow/Snakefile
@@ -88,6 +88,8 @@ rule aggregate_results:
         RESULTS_DIR / "workflow_complete.txt",
     log:
         LOG_DIR / "aggregate_results.log",
+    conda:
+        "bettercode"
     script:
         "scripts/aggregate_results.py"
 

diff --git a/src/bettercode/rnaseq/snakemake_workflow/rules/per_cell_type.smk b/src/bettercode/rnaseq/snakemake_workflow/rules/per_cell_type.smk
@@ -29,6 +29,8 @@ rule differential_expression:
     threads: config["differential_expression"]["n_cpus"]
     log:
         LOG_DIR / "step08_de_{cell_type}.log",
+    conda:
+        "bettercode"
     script:
         "../scripts/differential_expression.py"
 
@@ -54,6 +56,8 @@ rule pathway_analysis:
         ),
     log:
         LOG_DIR / "step09_gsea_{cell_type}.log",
+    conda:
+        "bettercode"
     script:
         "../scripts/gsea.py"
 
@@ -81,6 +85,8 @@ rule overrepresentation:
         ),
     log:
         LOG_DIR / "step10_enrichr_{cell_type}.log",
+    conda:
+        "bettercode"
     script:
         "../scripts/enrichr.py"
 
@@ -110,5 +116,7 @@ rule predictive_modeling:
         ),
     log:
         LOG_DIR / "step11_prediction_{cell_type}.log",
+    conda:
+        "bettercode"
     script:
         "../scripts/prediction.py"
diff --git a/src/bettercode/rnaseq/snakemake_workflow/rules/preprocessing.smk b/src/bettercode/rnaseq/snakemake_workflow/rules/preprocessing.smk
@@ -18,6 +18,8 @@ rule download_data:
         url=config["url"],
     log:
         LOG_DIR / "step01_download.log",
+    conda:
+        "bettercode"
     script:
         "../scripts/download.py"
 
@@ -40,6 +42,8 @@ rule filter_data:
         figure_dir=str(FIGURE_DIR),
     log:
         LOG_DIR / "step02_filtering.log",
+    conda:
+        "bettercode"
     script:
         "../scripts/filter.py"
 
@@ -81,6 +85,8 @@ rule quality_control:
         figure_dir=str(FIGURE_DIR),
     log:
         LOG_DIR / "step03_qc.log",
+    conda:
+        "bettercode"
     script:
         "../scripts/qc.py"
 
@@ -98,6 +104,8 @@ rule preprocess:
         batch_key=config["preprocessing"]["batch_key"],
     log:
         LOG_DIR / "step04_preprocessing.log",
+    conda:
+        "bettercode"
     script:
         "../scripts/preprocess.py"
 
@@ -126,6 +134,8 @@ rule dimensionality_reduction:
         figure_dir=str(FIGURE_DIR),
     log:
         LOG_DIR / "step05_dimred.log",
+    conda:
+        "bettercode"
     script:
         "../scripts/dimred.py"
 
@@ -146,5 +156,7 @@ rule clustering:
         figure_dir=str(FIGURE_DIR),
     log:
         LOG_DIR / "step06_clustering.log",
+    conda:
+        "bettercode"
     script:
         "../scripts/cluster.py"
diff --git a/src/bettercode/rnaseq/snakemake_workflow/rules/pseudobulk.smk b/src/bettercode/rnaseq/snakemake_workflow/rules/pseudobulk.smk
@@ -41,5 +41,7 @@ checkpoint pseudobulk:
         figure_dir=str(FIGURE_DIR),
     log:
         LOG_DIR / "step07_pseudobulk.log",
+    conda:
+        "bettercode"
     script:
         "../scripts/pseudobulk.py"
diff --git a/src/bettercode/simple_workflow/snakemake_workflow/Makefile b/src/bettercode/simple_workflow/snakemake_workflow/Makefile
@@ -3,14 +3,14 @@ OUTPUT_DIR := /Users/poldrack/data_unsynced/BCBS/simple_workflow/wf_snakemake
 .PHONY: run report graph
 
 clean:
-	-rm .snakemake
+	-rm -rf .snakemake
 	-rm -rf $(OUTPUT_DIR)/*
 
 run:
-	snakemake --cores 1 --config output_dir=$(OUTPUT_DIR)
+	snakemake --cores 1 --sdm conda --config output_dir=$(OUTPUT_DIR)
 
 lint:
-	snakemake --lint --cores 1 --config output_dir=$(OUTPUT_DIR)
+	snakemake --sdm conda --lint --cores 1 --config output_dir=$(OUTPUT_DIR)
 
 dryrun:
 	snakemake --dry-run --cores 1 --config output_dir=$(OUTPUT_DIR)
@@ -23,4 +23,5 @@ graph:
 
+# Export the `bettercode` conda environment specification to envs/bettercode.yml
 export-env:
-	-mkdir envs
-	micromamba env export -n bettercode > envs/bettercode.yml
+	mkdir -p envs
+	conda env export -n bettercode > envs/bettercode.yml
diff --git a/src/bettercode/simple_workflow/snakemake_workflow/Snakefile b/src/bettercode/simple_workflow/snakemake_workflow/Snakefile
@@ -63,6 +63,8 @@ rule download_meaningful_variables:
         url=config["meaningful_variables_url"],
     log:
         OUTPUT_DIR / "logs" / "download_meaningful_variables.log",
+    conda:
+        "envs/simple.yml"
     script:
         "scripts/download_data.py"
 
@@ -75,6 +77,8 @@ rule download_demographics:
         url=config["demographics_url"],
     log:
         OUTPUT_DIR / "logs" / "download_demographics.log",
+    conda:
+        "envs/simple.yml"
     script:
         "scripts/download_data.py"
 
@@ -87,6 +91,8 @@ rule filter_meaningful_variables:
         DATA_DIR / "meaningful_variables_numerical.csv",
     log:
         OUTPUT_DIR / "logs" / "filter_meaningful_variables.log",
+    conda:
+        "envs/simple.yml"
     script:
         "scripts/filter_data.py"
 
@@ -99,6 +105,8 @@ rule filter_demographics:
         DATA_DIR / "demographics_numerical.csv",
     log:
         OUTPUT_DIR / "logs" / "filter_demographics.log",
+    conda:
+        "envs/simple.yml"
     script:
         "scripts/filter_data.py"
 
@@ -112,6 +120,8 @@ rule join_datasets:
         DATA_DIR / "joined_data.csv",
     log:
         OUTPUT_DIR / "logs" / "join_datasets.log",
+    conda:
+        "envs/simple.yml"
     script:
         "scripts/join_data.py"
 
@@ -126,6 +136,8 @@ rule compute_correlation:
         method=config["correlation_method"],
     log:
         OUTPUT_DIR / "logs" / "compute_correlation.log",
+    conda:
+        "envs/simple.yml"
     script:
         "scripts/compute_correlation.py"
 
@@ -147,5 +159,7 @@ rule generate_heatmap:
         vmax=config["heatmap"]["vmax"],
     log:
         OUTPUT_DIR / "logs" / "generate_heatmap.log",
+    conda:
+        "envs/simple.yml"
     script:
         "scripts/generate_heatmap.py"