From df4d8f9bde50e5604fb6b2b3c8cd3e0d23377299 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste SCHIRATTI <jean.baptiste.schiratti@gmail.com>
Date: Tue, 17 Oct 2023 12:05:14 +0200
Subject: [PATCH 1/5] First commit with code to download raw counts.

---
 .gitignore                                  |  3 ++
 examples/plot_real_data_pydeseq2_example.py | 33 +++++++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 examples/plot_real_data_pydeseq2_example.py

diff --git a/.gitignore b/.gitignore
index fe2ff70e..b61d611b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -130,3 +130,6 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# PyCharm
+.idea
diff --git a/examples/plot_real_data_pydeseq2_example.py b/examples/plot_real_data_pydeseq2_example.py
new file mode 100644
index 00000000..a73a3c00
--- /dev/null
+++ b/examples/plot_real_data_pydeseq2_example.py
@@ -0,0 +1,33 @@
+import shutil
+from pathlib import Path
+from typing import Union
+from urllib.request import urlopen
+
+import pandas as pd
+
+_URL = "http://genomedata.org/gen-viz-workshop/intro_to_deseq2/tutorial/E-GEOD-50760-raw-counts.tsv"  # noqa
+_RAW_COUNTS_FNAME = _URL.split("/")[-1]
+
+
+def _download_raw_counts(output_dir: Union[str, Path]) -> None:
+    """Downloads sample RNASeq data (raw counts).
+
+    Parameters
+    ----------
+    output_dir : Union[str, Path]
+        Local directory where the sample data will be downloaded.
+    """
+
+    fname = str(Path(output_dir).joinpath(_RAW_COUNTS_FNAME))
+    with urlopen(_URL) as response, open(fname, "wb") as out_file:
+        shutil.copyfileobj(response, out_file)
+    # Check that file was downloaded
+    if not Path(fname).is_file():
+        raise FileNotFoundError(f"Failed to download sample data from {_URL}!")
+
+
+_download_raw_counts(Path.cwd())
+
+df = pd.read_csv(Path.cwd().joinpath(_RAW_COUNTS_FNAME), sep="\t")
+
+print(df.head())

From 7ae6a5ba58145ec34b95ea9bfb03c5c23f111e65 Mon Sep 17 00:00:00 2001
From: tobias-zehnder <tobias.zehnder@owkin.com>
Date: Tue, 17 Oct 2023 12:34:25 +0000
Subject: [PATCH 2/5] Add real example dataset

---
 pydeseq2/utils.py | 45 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 35 insertions(+), 10 deletions(-)

diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py
index 5b18559e..13ff5259 100644
--- a/pydeseq2/utils.py
+++ b/pydeseq2/utils.py
@@ -27,15 +27,16 @@
 
 def load_example_data(
     modality: Literal["raw_counts", "metadata"] = "raw_counts",
-    dataset: Literal["synthetic"] = "synthetic",
+    dataset: Literal["synthetic", "real"] = "synthetic",
+    url_to_data: Union[str, None] = None,
     debug: bool = False,
     debug_seed: int = 42,
 ) -> pd.DataFrame:
-    """Load synthetic example data.
+    """Load example data.
 
-    May load either metadata or rna-seq data. For now, this function may only return the
-    synthetic data provided as part of this repo, but new datasets might be added in the
-    future.
+    May load either metadata or rna-seq data. This function may either return the
+    synthetic data provided as part of this repo, download an example dataset
+    from BioStudies, or use a provided direct link to an processed read count dataset.
 
     Parameters
     ----------
@@ -45,8 +46,15 @@ def load_example_data(
     dataset : str
         The dataset for which to return gene expression data.
         If "synthetic", will return the synthetic data that is used for CI unit tests.
+        If "real", will download and return a real dataset from a given url.
         (default: ``"synthetic"``).
 
+    url_to_data : str
+        The url for the real dataset.
+        If not given, it will be set to the url of a public gene expression profiling
+        study by RNA-seq in colorectal cancer, depending on the desired modality.
+        (default: None).
+
     debug : bool
         If true, subsample 10 samples and 100 genes at random.
         (Note that the "synthetic" dataset is already 10 x 100.) (default: ``False``).
@@ -60,13 +68,15 @@ def load_example_data(
         Requested data modality.
     """
 
-    assert modality in ["raw_counts", "metadata"], (
-        "The modality argument must be one of the following: " "raw_counts, metadata"
-    )
+    assert modality in [
+        "raw_counts",
+        "metadata",
+    ], "The modality argument must be one of the following: raw_counts, metadata"
 
     assert dataset in [
-        "synthetic"
-    ], "The dataset argument must be one of the following: synthetic."
+        "synthetic",
+        "real",
+    ], "The dataset argument must be one of the following: synthetic, real"
 
     # Load data
     datasets_path = Path(pydeseq2.__file__).parent.parent / "datasets"
@@ -100,6 +110,21 @@ def load_example_data(
                 index_col=0,
             )
 
+    elif dataset == "real":
+        default_urls = {
+            "raw_counts": (
+                "http://genomedata.org/gen-viz-workshop/intro_to_deseq2/"
+                "tutorial/E-GEOD-50760-raw-counts.tsv"
+            ),
+            "metadata": (
+                "https://www.ebi.ac.uk/gxa/experiments-content/E-GEOD-50760/"
+                "resources/ExperimentDesignFile.RnaSeq/experiment-design"
+            ),
+        }
+        if not url_to_data:
+            url_to_data = default_urls[modality]
+        df = pd.read_csv(url_to_data, sep="\t")
+
     if debug:
         # TODO: until we provide a larger dataset, this option is useless
         # subsample 10 samples and 100 genes

From 94da22bcbd3642db8205c80d860b14461c4fbf32 Mon Sep 17 00:00:00 2001
From: Valerieducret <valerie.ducret@owkin.com>
Date: Tue, 17 Oct 2023 14:07:19 +0000
Subject: [PATCH 3/5] update with loading and deseq2 function

---
 examples/plot_real_data_pydeseq2_example.py | 110 ++++++++++++++++----
 1 file changed, 87 insertions(+), 23 deletions(-)

diff --git a/examples/plot_real_data_pydeseq2_example.py b/examples/plot_real_data_pydeseq2_example.py
index a73a3c00..39415b32 100644
--- a/examples/plot_real_data_pydeseq2_example.py
+++ b/examples/plot_real_data_pydeseq2_example.py
@@ -1,33 +1,97 @@
-import shutil
-from pathlib import Path
-from typing import Union
-from urllib.request import urlopen
+"""
+Pydeseq2 with real data
+======================================================
 
-import pandas as pd
+This experiment aims to run pydeseq2 on real data.
 
-_URL = "http://genomedata.org/gen-viz-workshop/intro_to_deseq2/tutorial/E-GEOD-50760-raw-counts.tsv"  # noqa
-_RAW_COUNTS_FNAME = _URL.split("/")[-1]
+"""
 
+from pydeseq2.dds import DeseqDataSet
+from pydeseq2.utils import load_example_data
 
-def _download_raw_counts(output_dir: Union[str, Path]) -> None:
-    """Downloads sample RNASeq data (raw counts).
+# %%
+# Data loading
+# ------------
+#
+# Let's first download and load the real data.
 
-    Parameters
-    ----------
-    output_dir : Union[str, Path]
-        Local directory where the sample data will be downloaded.
-    """
+counts_ori = load_example_data(
+    modality="raw_counts",
+    dataset="real",
+    debug=False,
+)
 
-    fname = str(Path(output_dir).joinpath(_RAW_COUNTS_FNAME))
-    with urlopen(_URL) as response, open(fname, "wb") as out_file:
-        shutil.copyfileobj(response, out_file)
-    # Check that file was downloaded
-    if not Path(fname).is_file():
-        raise FileNotFoundError(f"Failed to download sample data from {_URL}!")
+metadata = load_example_data(
+    modality="metadata",
+    dataset="real",
+    debug=False,
+)
 
+print(counts_ori.tail())
 
-_download_raw_counts(Path.cwd())
+# %%
+# In this example, the counts data contains both EnsemblIDs
+# and gene symbols (gene names).
+# We also see that some EnsemblID do not map to any gene symbol.
+# We decide to stay with EnsemblID rather than gene symbol and
+# continue with some processing step.
 
-df = pd.read_csv(Path.cwd().joinpath(_RAW_COUNTS_FNAME), sep="\t")
+# %%
+# Data filtering read counts
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-print(df.head())
+counts_df = counts_ori.drop(columns="Gene Name")
+# remove the gene name for now
+counts_df = counts_df.set_index("Gene ID").T
+# now we have a matrice of shape n_samples x n_genes
+
+# %%
+# Further processing might be needed because some genes
+# have very high counts on average.
+
+# %%
+print(metadata)
+
+# %%
+# Data filtering metadata
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+metadata = metadata.set_index("Run")[["Sample Characteristic[biopsy site]"]].rename(
+    columns={"Sample Characteristic[biopsy site]": "condition"}
+)
+
+# %%
+# Now that we have loaded and filtered our data, we may proceed with the differential
+# analysis.
+
+
+# %%
+# Single factor analysis
+# --------------------------
+#
+# In this analysis, we use the ``condition``
+# column as our design factor. That is, we compare gene expressions of samples that have
+# ``primary tumor`` to those that have ``normal`` condition.
+#
+
+# %%
+# .. currentmodule:: pydeseq2.dds
+#
+# Read counts modeling with the :class:`DeseqDataSet` class
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# We start by creating a :class:`DeseqDataSet`
+# object from the count and metadata data.
+# A :class:`DeseqDataSet` fits dispersion and
+# log-fold change (LFC) parameters from the data, and stores them.
+#
+
+dds = DeseqDataSet(
+    counts=counts_df,
+    metadata=metadata,
+    design_factors="condition",
+    refit_cooks=True,
+    n_cpus=8,
+)
+
+dds.deseq2()

From bfee8d549d14de5f232ebcaaacb76ec19054d6af Mon Sep 17 00:00:00 2001
From: tobias-zehnder <tobias.zehnder@owkin.com>
Date: Tue, 17 Oct 2023 14:25:26 +0000
Subject: [PATCH 4/5] Fix subsampling along wrong axis

---
 pydeseq2/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py
index 13ff5259..5280f9ce 100644
--- a/pydeseq2/utils.py
+++ b/pydeseq2/utils.py
@@ -128,7 +128,7 @@ def load_example_data(
     if debug:
         # TODO: until we provide a larger dataset, this option is useless
         # subsample 10 samples and 100 genes
-        df = df.sample(n=10, axis=0, random_state=debug_seed)
+        df = df.sample(n=10, axis=1, random_state=debug_seed)
         if modality == "raw_counts":
             df = df.sample(n=100, axis="index", random_state=debug_seed)
 

From 404fad368761cc12d46dab7dff0a8afc3489b75e Mon Sep 17 00:00:00 2001
From: Valerieducret <valerie.ducret@owkin.com>
Date: Tue, 17 Oct 2023 14:55:45 +0000
Subject: [PATCH 5/5] feat: add results and visualizations

---
 examples/plot_real_data_pydeseq2_example.py | 41 +++++++++++++++------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/examples/plot_real_data_pydeseq2_example.py b/examples/plot_real_data_pydeseq2_example.py
index 39415b32..d34eba2d 100644
--- a/examples/plot_real_data_pydeseq2_example.py
+++ b/examples/plot_real_data_pydeseq2_example.py
@@ -6,7 +6,11 @@
 
 """
 
+import matplotlib.pyplot as plt
+import numpy as np
+
 from pydeseq2.dds import DeseqDataSet
+from pydeseq2.ds import DeseqStats
 from pydeseq2.utils import load_example_data
 
 # %%
@@ -74,18 +78,6 @@
 # ``primary tumor`` to those that have ``normal`` condition.
 #
 
-# %%
-# .. currentmodule:: pydeseq2.dds
-#
-# Read counts modeling with the :class:`DeseqDataSet` class
-# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-#
-# We start by creating a :class:`DeseqDataSet`
-# object from the count and metadata data.
-# A :class:`DeseqDataSet` fits dispersion and
-# log-fold change (LFC) parameters from the data, and stores them.
-#
-
 dds = DeseqDataSet(
     counts=counts_df,
     metadata=metadata,
@@ -95,3 +87,28 @@
 )
 
 dds.deseq2()
+
+# %%
+# Generate summary statistics
+# --------------------------
+# We are here interested in comparing "primary tumor vs "normal" samples
+
+stat_res_tumor_vs_normal = DeseqStats(
+    dds, contrast=["condition", "primary tumor", "normal"], n_cpus=8
+)
+
+# As we did not really performed a thorough exploration of the data
+# and gene counts, we would like to test for a lfc of 1 at least
+
+stat_res_tumor_vs_normal.summary(lfc_null=1, alt_hypothesis="greaterAbs")
+
+# Some visualization
+
+stat_res_tumor_vs_normal.plot_MA(s=20)
+
+# Volcano plot
+
+plt.scatter(
+    stat_res_tumor_vs_normal.results_df["log2FoldChange"],
+    -np.log(stat_res_tumor_vs_normal.results_df["padj"]),
+)