Skip to content
This repository was archived by the owner on Dec 31, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Files excluded from the Docker build context.
#
# Unlike .gitignore, .dockerignore patterns are matched relative to the root
# of the build context only. A bare `__pycache__` or `*.log` therefore does
# NOT exclude matches inside src/, book/, tests/, data/ or scripts/. Anything
# that can appear at arbitrary depth is prefixed with `**/` so it is excluded
# everywhere (the `**/` form also matches at the root).

# Git
.git
.gitignore

# Python
**/__pycache__
**/*.py[cod]
**/*$py.class
**/*.so
.Python
.venv
venv
ENV

# Build artifacts
_build
**/*.egg-info
dist
build

# IDE
.idea
.vscode
**/*.swp
**/*.swo

# Jupyter
**/.ipynb_checkpoints

# OS
**/.DS_Store
**/Thumbs.db

# Docker (keep Dockerfile accessible)
docker/Makefile
docker/README.md

# Misc
**/*.log
.env
.env.*
12 changes: 12 additions & 0 deletions book/workflows.md
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,18 @@ total 5

Similarly, Snakemake will rerun the workflow if any of the scripts used to run the workflow are modified. However, it's important to note that it will not identify changes in the modules that are imported. In that case you would need to rerun the workflow in order to re-execute the relevant steps.

#### Reproducible environments with Conda

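After installing Miniconda, create and activate a conda environment that contains Snakemake, then install this project's package into it: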


```bash
conda create -c conda-forge -c bioconda -c nodefaults -n bettercode snakemake
conda activate bettercode
pip install -e .
```


## Scaling to a complex workflow

We now turn to a more realistic and complex scientific data analysis workflow. For this example I will use an analysis of single-cell RNA-sequencing data to determine how gene expression in immune system cells changes with age. This analysis will utilize a [large openly available dataset](https://cellxgene.cziscience.com/collections/dde06e0f-ab3b-46be-96a2-a8082383c4a1) that includes data from 982 people comprising about 1.3 million peripheral blood mononuclear cells (i.e. white blood cells) for about 35K transcripts. I chose this particular example for several reasons:
Expand Down
73 changes: 73 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Dockerfile for BetterCodeBetterScience
# Builds an image with all dependencies for running the code examples.
#
# The build context must be the repository root, because the COPY
# instructions below reference pyproject.toml, src/, book/, tests/, data/,
# scripts/ and myst.yml relative to it:
#   docker build -f docker/Dockerfile -t bettercode .

FROM python:3.12-slim-bookworm

LABEL maintainer="Russell Poldrack"
LABEL description="Container for BetterCodeBetterScience book code examples"

# Prevent interactive prompts during package installation.
# Declared as ARG rather than ENV so the setting applies to the RUN steps of
# this build only and does not persist into containers started from the image.
ARG DEBIAN_FRONTEND=noninteractive

# Install system dependencies.
# The apt package lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    # Build essentials
    build-essential \
    gcc \
    g++ \
    gfortran \
    # Git for datalad and version control
    git \
    git-annex \
    # HDF5 for h5py
    libhdf5-dev \
    # For scientific packages
    libopenblas-dev \
    liblapack-dev \
    # For igraph/leidenalg
    libigraph-dev \
    # For image processing
    libjpeg-dev \
    libpng-dev \
    # For SSL/networking
    libssl-dev \
    libcurl4-openssl-dev \
    # For XML parsing
    libxml2-dev \
    libxslt1-dev \
    # R and dependencies for rpy2
    r-base \
    r-base-dev \
    # Misc utilities
    curl \
    wget \
    ca-certificates \
    && rm -rf /var/lib/apt/lists/*

# Install uv for fast Python package management.
# NOTE(review): `latest` is a moving tag, so rebuilding later may pull a
# different uv release; consider pinning a specific version tag.
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv

# Set up working directory
WORKDIR /app

# Copy only what the package install needs first, so the dependency layer is
# not invalidated by changes to book/, tests/, data/ or scripts/.
COPY pyproject.toml README.md ./
COPY src/ ./src/

# Create virtual environment and put it first on PATH so `python` and any
# installed console scripts resolve to it without explicit activation.
RUN uv venv /app/.venv
ENV VIRTUAL_ENV=/app/.venv
ENV PATH="/app/.venv/bin:$PATH"

# Install the project (editable) and all dependencies.
# --no-cache keeps uv's download/build cache out of the image layer.
RUN uv pip install --no-cache -e .

# Copy remaining project files
COPY book/ ./book/
COPY tests/ ./tests/
COPY data/ ./data/
COPY scripts/ ./scripts/
COPY myst.yml ./

# Set default command (overridden by any command passed to `docker run`)
CMD ["python", "--version"]
7 changes: 7 additions & 0 deletions docker/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Build targets for the `bettercode` Docker image.
#
# Both targets are commands, not files. Declaring them phony prevents make
# from reporting "up to date" and skipping the build when a file or directory
# with the same name (e.g. a Python `build/` directory) exists in the
# directory make is run from.
.PHONY: build build-from-here

# Run from project root: make -f docker/Makefile build
# Uses the project root (.) as the build context.
build:
	docker build -f docker/Dockerfile -t bettercode .

# Or run from docker directory: make build-from-here
# Uses the parent directory (..) as the build context.
build-from-here:
	docker build -f Dockerfile -t bettercode ..
26 changes: 26 additions & 0 deletions docker/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Docker for BetterCodeBetterScience

## Building the image

From the repository root:

```bash
docker build -f docker/Dockerfile -t bettercode .
```

## Running the container

Interactive shell:
```bash
docker run -it bettercode /bin/bash
```

Run tests:
```bash
docker run bettercode pytest
```

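Mount a local data directory (it appears inside the container at `/data`; replace `script.py` with the script you want to run):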
```bash
docker run -v /path/to/local/data:/data bettercode python script.py
```
File renamed without changes.
2 changes: 1 addition & 1 deletion src/bettercode/rnaseq/snakemake_workflow/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ rulegraph:

# Run the full workflow
run:
snakemake --cores 8 --config datadir=$(DATADIR)/immune_aging/wf_snakemake/
snakemake --cores 8 --sdm conda --config datadir=$(DATADIR)/immune_aging/wf_snakemake/
# Generate HTML report (run after workflow completes)
report:
snakemake --report $(DATADIR)/immune_aging/wf_snakemake/report.html --config datadir=$(DATADIR)/immune_aging/
Expand Down
2 changes: 2 additions & 0 deletions src/bettercode/rnaseq/snakemake_workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ rule aggregate_results:
RESULTS_DIR / "workflow_complete.txt",
log:
LOG_DIR / "aggregate_results.log",
conda:
"bettercode"
script:
"scripts/aggregate_results.py"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ rule differential_expression:
threads: config["differential_expression"]["n_cpus"]
log:
LOG_DIR / "step08_de_{cell_type}.log",
conda:
"bettercode"
script:
"../scripts/differential_expression.py"

Expand All @@ -54,6 +56,8 @@ rule pathway_analysis:
),
log:
LOG_DIR / "step09_gsea_{cell_type}.log",
conda:
"bettercode"
script:
"../scripts/gsea.py"

Expand Down Expand Up @@ -81,6 +85,8 @@ rule overrepresentation:
),
log:
LOG_DIR / "step10_enrichr_{cell_type}.log",
conda:
"bettercode"
script:
"../scripts/enrichr.py"

Expand Down Expand Up @@ -110,5 +116,7 @@ rule predictive_modeling:
),
log:
LOG_DIR / "step11_prediction_{cell_type}.log",
conda:
"bettercode"
script:
"../scripts/prediction.py"
12 changes: 12 additions & 0 deletions src/bettercode/rnaseq/snakemake_workflow/rules/preprocessing.smk
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ rule download_data:
url=config["url"],
log:
LOG_DIR / "step01_download.log",
conda:
"bettercode"
script:
"../scripts/download.py"

Expand All @@ -40,6 +42,8 @@ rule filter_data:
figure_dir=str(FIGURE_DIR),
log:
LOG_DIR / "step02_filtering.log",
conda:
"bettercode"
script:
"../scripts/filter.py"

Expand Down Expand Up @@ -81,6 +85,8 @@ rule quality_control:
figure_dir=str(FIGURE_DIR),
log:
LOG_DIR / "step03_qc.log",
conda:
"bettercode"
script:
"../scripts/qc.py"

Expand All @@ -98,6 +104,8 @@ rule preprocess:
batch_key=config["preprocessing"]["batch_key"],
log:
LOG_DIR / "step04_preprocessing.log",
conda:
"bettercode"
script:
"../scripts/preprocess.py"

Expand Down Expand Up @@ -126,6 +134,8 @@ rule dimensionality_reduction:
figure_dir=str(FIGURE_DIR),
log:
LOG_DIR / "step05_dimred.log",
conda:
"bettercode"
script:
"../scripts/dimred.py"

Expand All @@ -146,5 +156,7 @@ rule clustering:
figure_dir=str(FIGURE_DIR),
log:
LOG_DIR / "step06_clustering.log",
conda:
"bettercode"
script:
"../scripts/cluster.py"
2 changes: 2 additions & 0 deletions src/bettercode/rnaseq/snakemake_workflow/rules/pseudobulk.smk
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,7 @@ checkpoint pseudobulk:
figure_dir=str(FIGURE_DIR),
log:
LOG_DIR / "step07_pseudobulk.log",
conda:
"bettercode"
script:
"../scripts/pseudobulk.py"
8 changes: 4 additions & 4 deletions src/bettercode/simple_workflow/snakemake_workflow/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@ OUTPUT_DIR := /Users/poldrack/data_unsynced/BCBS/simple_workflow/wf_snakemake
.PHONY: run report graph

clean:
-rm .snakemake
-rm -rf .snakemake
-rm -rf $(OUTPUT_DIR)/*

run:
snakemake --cores 1 --config output_dir=$(OUTPUT_DIR)
snakemake --cores 1 --sdm conda --config output_dir=$(OUTPUT_DIR)

lint:
snakemake --lint --cores 1 --config output_dir=$(OUTPUT_DIR)
snakemake --sdm conda --lint --cores 1 --config output_dir=$(OUTPUT_DIR)

dryrun:
snakemake --dry-run --cores 1 --config output_dir=$(OUTPUT_DIR)
Expand All @@ -23,4 +23,4 @@ graph:

export-env:
-mkdir envs
micromamba env export -n bettercode > envs/bettercode.yml
conda env export -n bettercode > envs/bettercode.yml
14 changes: 14 additions & 0 deletions src/bettercode/simple_workflow/snakemake_workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ rule download_meaningful_variables:
url=config["meaningful_variables_url"],
log:
OUTPUT_DIR / "logs" / "download_meaningful_variables.log",
conda:
"envs/simple.yml"
script:
"scripts/download_data.py"

Expand All @@ -75,6 +77,8 @@ rule download_demographics:
url=config["demographics_url"],
log:
OUTPUT_DIR / "logs" / "download_demographics.log",
conda:
"envs/simple.yml"
script:
"scripts/download_data.py"

Expand All @@ -87,6 +91,8 @@ rule filter_meaningful_variables:
DATA_DIR / "meaningful_variables_numerical.csv",
log:
OUTPUT_DIR / "logs" / "filter_meaningful_variables.log",
conda:
"envs/simple.yml"
script:
"scripts/filter_data.py"

Expand All @@ -99,6 +105,8 @@ rule filter_demographics:
DATA_DIR / "demographics_numerical.csv",
log:
OUTPUT_DIR / "logs" / "filter_demographics.log",
conda:
"envs/simple.yml"
script:
"scripts/filter_data.py"

Expand All @@ -112,6 +120,8 @@ rule join_datasets:
DATA_DIR / "joined_data.csv",
log:
OUTPUT_DIR / "logs" / "join_datasets.log",
conda:
"envs/simple.yml"
script:
"scripts/join_data.py"

Expand All @@ -126,6 +136,8 @@ rule compute_correlation:
method=config["correlation_method"],
log:
OUTPUT_DIR / "logs" / "compute_correlation.log",
conda:
"envs/simple.yml"
script:
"scripts/compute_correlation.py"

Expand All @@ -147,5 +159,7 @@ rule generate_heatmap:
vmax=config["heatmap"]["vmax"],
log:
OUTPUT_DIR / "logs" / "generate_heatmap.log",
conda:
"envs/simple.yml"
script:
"scripts/generate_heatmap.py"
Loading