From df4d8f9bde50e5604fb6b2b3c8cd3e0d23377299 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste SCHIRATTI Date: Tue, 17 Oct 2023 12:05:14 +0200 Subject: [PATCH 1/5] First commit with code to download raw counts. --- .gitignore | 3 ++ examples/plot_real_data_pydeseq2_example.py | 33 +++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 examples/plot_real_data_pydeseq2_example.py diff --git a/.gitignore b/.gitignore index fe2ff70e..b61d611b 100644 --- a/.gitignore +++ b/.gitignore @@ -130,3 +130,6 @@ dmypy.json # Pyre type checker .pyre/ + +# PyCharm +.idea diff --git a/examples/plot_real_data_pydeseq2_example.py b/examples/plot_real_data_pydeseq2_example.py new file mode 100644 index 00000000..a73a3c00 --- /dev/null +++ b/examples/plot_real_data_pydeseq2_example.py @@ -0,0 +1,33 @@ +import shutil +from pathlib import Path +from typing import Union +from urllib.request import urlopen + +import pandas as pd + +_URL = "http://genomedata.org/gen-viz-workshop/intro_to_deseq2/tutorial/E-GEOD-50760-raw-counts.tsv" # noqa +_RAW_COUNTS_FNAME = _URL.split("/")[-1] + + +def _download_raw_counts(output_dir: Union[str, Path]) -> None: + """Downloads sample RNASeq data (raw counts). + + Parameters + ---------- + output_dir : Union[str, Path] + Local directory where the sample data will be downloaded. + """ + + fname = str(Path(output_dir).joinpath(_RAW_COUNTS_FNAME)) + with urlopen(_URL) as response, open(fname, "wb") as out_file: + shutil.copyfileobj(response, out_file) + # Check that file was downloaded + if not Path(fname).is_file(): + raise FileNotFoundError(f"Failed to download sample data from {_URL}!") + + +_download_raw_counts(Path.cwd()) + +df = pd.read_csv(Path.cwd().joinpath(_RAW_COUNTS_FNAME), sep="\t") + +print(df.head()) From 7ae6a5ba58145ec34b95ea9bfb03c5c23f111e65 Mon Sep 17 00:00:00 2001 From: tobias-zehnder Date: Tue, 17 Oct 2023 12:34:25 +0000 Subject: [PATCH 2/5] Add real example dataset --- pydeseq2/utils.py | 45 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py index 5b18559e..13ff5259 100644 --- a/pydeseq2/utils.py +++ b/pydeseq2/utils.py @@ -27,15 +27,16 @@ def load_example_data( modality: Literal["raw_counts", "metadata"] = "raw_counts", - dataset: Literal["synthetic"] = "synthetic", + dataset: Literal["synthetic", "real"] = "synthetic", + url_to_data: Union[str, None] = None, debug: bool = False, debug_seed: int = 42, ) -> pd.DataFrame: - """Load synthetic example data. + """Load example data. - May load either metadata or rna-seq data. For now, this function may only return the - synthetic data provided as part of this repo, but new datasets might be added in the - future. + May load either metadata or rna-seq data. This function may either return the + synthetic data provided as part of this repo, download an example dataset + from BioStudies, or use a provided direct link to an processed read count dataset. Parameters ---------- @@ -45,8 +46,15 @@ def load_example_data( dataset : str The dataset for which to return gene expression data. If "synthetic", will return the synthetic data that is used for CI unit tests. + If "real", will download and return a real dataset from a given url. (default: ``"synthetic"``). + url_to_data : str + The url for the real dataset. + If not given, it will be set to the url of a public gene expression profiling + study by RNA-seq in colorectal cancer, depending on the desired modality. + (default: None). + debug : bool If true, subsample 10 samples and 100 genes at random. (Note that the "synthetic" dataset is already 10 x 100.) (default: ``False``). @@ -60,13 +68,15 @@ def load_example_data( Requested data modality. """ - assert modality in ["raw_counts", "metadata"], ( - "The modality argument must be one of the following: " "raw_counts, metadata" - ) + assert modality in [ + "raw_counts", + "metadata", + ], "The modality argument must be one of the following: raw_counts, metadata" assert dataset in [ - "synthetic" - ], "The dataset argument must be one of the following: synthetic." + "synthetic", + "real", + ], "The dataset argument must be one of the following: synthetic, real" # Load data datasets_path = Path(pydeseq2.__file__).parent.parent / "datasets" @@ -100,6 +110,21 @@ def load_example_data( index_col=0, ) + elif dataset == "real": + default_urls = { + "raw_counts": ( + "http://genomedata.org/gen-viz-workshop/intro_to_deseq2/" + "tutorial/E-GEOD-50760-raw-counts.tsv" + ), + "metadata": ( + "https://www.ebi.ac.uk/gxa/experiments-content/E-GEOD-50760/" + "resources/ExperimentDesignFile.RnaSeq/experiment-design" + ), + } + if not url_to_data: + url_to_data = default_urls[modality] + df = pd.read_csv(url_to_data, sep="\t") + if debug: # TODO: until we provide a larger dataset, this option is useless # subsample 10 samples and 100 genes From 94da22bcbd3642db8205c80d860b14461c4fbf32 Mon Sep 17 00:00:00 2001 From: Valerieducret Date: Tue, 17 Oct 2023 14:07:19 +0000 Subject: [PATCH 3/5] update with loading and deseq2 function --- examples/plot_real_data_pydeseq2_example.py | 110 ++++++++++++++++---- 1 file changed, 87 insertions(+), 23 deletions(-) diff --git a/examples/plot_real_data_pydeseq2_example.py b/examples/plot_real_data_pydeseq2_example.py index a73a3c00..39415b32 100644 --- a/examples/plot_real_data_pydeseq2_example.py +++ b/examples/plot_real_data_pydeseq2_example.py @@ -1,33 +1,97 @@ -import shutil -from pathlib import Path -from typing import Union -from urllib.request import urlopen +""" +Pydeseq2 with real data +====================================================== -import pandas as pd +This experiment aims to run pydeseq2 on real data. -_URL = "http://genomedata.org/gen-viz-workshop/intro_to_deseq2/tutorial/E-GEOD-50760-raw-counts.tsv" # noqa -_RAW_COUNTS_FNAME = _URL.split("/")[-1] +""" +from pydeseq2.dds import DeseqDataSet +from pydeseq2.utils import load_example_data -def _download_raw_counts(output_dir: Union[str, Path]) -> None: - """Downloads sample RNASeq data (raw counts). +# %% +# Data loading +# ------------ +# +# Let's first download and load the real data. - Parameters - ---------- - output_dir : Union[str, Path] - Local directory where the sample data will be downloaded. - """ +counts_ori = load_example_data( + modality="raw_counts", + dataset="real", + debug=False, +) - fname = str(Path(output_dir).joinpath(_RAW_COUNTS_FNAME)) - with urlopen(_URL) as response, open(fname, "wb") as out_file: - shutil.copyfileobj(response, out_file) - # Check that file was downloaded - if not Path(fname).is_file(): - raise FileNotFoundError(f"Failed to download sample data from {_URL}!") +metadata = load_example_data( + modality="metadata", + dataset="real", + debug=False, +) +print(counts_ori.tail()) -_download_raw_counts(Path.cwd()) +# %% +# In this example, the counts data contains both EnsemblIDs +# and gene symbols (gene names). +# We also see that some EnsemblID do not map to any gene symbol. +# We decide to stay with EnsemblID rather than gene symbol and +# continue with some processing step. -df = pd.read_csv(Path.cwd().joinpath(_RAW_COUNTS_FNAME), sep="\t") +# %% +# Data filtering read counts +# ^^^^^^^^^^^^^^^^^^^^^^^^^^ -print(df.head()) +counts_df = counts_ori.drop(columns="Gene Name") +# remove the gene name for now +counts_df = counts_df.set_index("Gene ID").T +# now we have a matrice of shape n_samples x n_genes + +# %% +# Further processing might be needed because some genes +# have very high counts on average. + +# %% +print(metadata) + +# %% +# Data filtering metadata +# ^^^^^^^^^^^^^^^^^^^^^^^^^^ + +metadata = metadata.set_index("Run")[["Sample Characteristic[biopsy site]"]].rename( + columns={"Sample Characteristic[biopsy site]": "condition"} +) + +# %% +# Now that we have loaded and filtered our data, we may proceed with the differential +# analysis. + + +# %% +# Single factor analysis +# -------------------------- +# +# In this analysis, we use the ``condition`` +# column as our design factor. That is, we compare gene expressions of samples that have +# ``primary tumor`` to those that have ``normal`` condition. +# + +# %% +# .. currentmodule:: pydeseq2.dds +# +# Read counts modeling with the :class:`DeseqDataSet` class +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# We start by creating a :class:`DeseqDataSet` +# object from the count and metadata data. +# A :class:`DeseqDataSet` fits dispersion and +# log-fold change (LFC) parameters from the data, and stores them. +# + +dds = DeseqDataSet( + counts=counts_df, + metadata=metadata, + design_factors="condition", + refit_cooks=True, + n_cpus=8, +) + +dds.deseq2() From bfee8d549d14de5f232ebcaaacb76ec19054d6af Mon Sep 17 00:00:00 2001 From: tobias-zehnder Date: Tue, 17 Oct 2023 14:25:26 +0000 Subject: [PATCH 4/5] Fix subsampling along wrong axis --- pydeseq2/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py index 13ff5259..5280f9ce 100644 --- a/pydeseq2/utils.py +++ b/pydeseq2/utils.py @@ -128,7 +128,7 @@ def load_example_data( if debug: # TODO: until we provide a larger dataset, this option is useless # subsample 10 samples and 100 genes - df = df.sample(n=10, axis=0, random_state=debug_seed) + df = df.sample(n=10, axis=1, random_state=debug_seed) if modality == "raw_counts": df = df.sample(n=100, axis="index", random_state=debug_seed) From 404fad368761cc12d46dab7dff0a8afc3489b75e Mon Sep 17 00:00:00 2001 From: Valerieducret Date: Tue, 17 Oct 2023 14:55:45 +0000 Subject: [PATCH 5/5] feat: add results and visualizations --- examples/plot_real_data_pydeseq2_example.py | 41 +++++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/examples/plot_real_data_pydeseq2_example.py b/examples/plot_real_data_pydeseq2_example.py index 39415b32..d34eba2d 100644 --- a/examples/plot_real_data_pydeseq2_example.py +++ b/examples/plot_real_data_pydeseq2_example.py @@ -6,7 +6,11 @@ """ +import matplotlib.pyplot as plt +import numpy as np + from pydeseq2.dds import DeseqDataSet +from pydeseq2.ds import DeseqStats from pydeseq2.utils import load_example_data # %% @@ -74,18 +78,6 @@ # ``primary tumor`` to those that have ``normal`` condition. # -# %% -# .. currentmodule:: pydeseq2.dds -# -# Read counts modeling with the :class:`DeseqDataSet` class -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# We start by creating a :class:`DeseqDataSet` -# object from the count and metadata data. -# A :class:`DeseqDataSet` fits dispersion and -# log-fold change (LFC) parameters from the data, and stores them. -# - dds = DeseqDataSet( counts=counts_df, metadata=metadata, @@ -95,3 +87,28 @@ ) dds.deseq2() + +# %% +# Generate summary statistics +# -------------------------- +# We are here interested in comparing "primary tumor vs "normal" samples + +stat_res_tumor_vs_normal = DeseqStats( + dds, contrast=["condition", "primary tumor", "normal"], n_cpus=8 +) + +# As we did not really performed a thorough exploration of the data +# and gene counts, we would like to test for a lfc of 1 at least + +stat_res_tumor_vs_normal.summary(lfc_null=1, alt_hypothesis="greaterAbs") + +# Some visualization + +stat_res_tumor_vs_normal.plot_MA(s=20) + +# Volcano plot + +plt.scatter( + stat_res_tumor_vs_normal.results_df["log2FoldChange"], + -np.log(stat_res_tumor_vs_normal.results_df["padj"]), +)