diff --git a/.BBSoptions b/.BBSoptions new file mode 100644 index 0000000..c51a3fc --- /dev/null +++ b/.BBSoptions @@ -0,0 +1 @@ +UnsupportedPlatforms: win32 \ No newline at end of file diff --git a/.Rbuildignore b/.Rbuildignore index 989c4b9..d809f26 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -9,3 +9,9 @@ ^__autograph_generated_filezt06eymn\.py$ ^\.github$ ^LICENSE\.md$ +^\.devcontainer +^\.trigger_build$ +^Dockerfile$ +^dev +^\.BBSoptions$ +^\.vscode \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..37091c3 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,13 @@ +{ + "name": "Ibex devcontainer", + "shutdownAction": "none", + "build": { + "context": "..", + "dockerfile": "../Dockerfile" + }, + "runArgs": [ + // to use GPUs in container uncomment next line + // "--gpus=all", + "--name=ibex-devbox" + ] +} diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 32c3e56..ccce637 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -2,9 +2,9 @@ # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, master] + branches: [main, devel] pull_request: - branches: [main, master] + branches: [main, devel] name: R-CMD-check diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 9ee1f3d..d0a0796 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -2,9 +2,9 @@ # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, master] + branches: [main, devel] pull_request: - branches: [main, master] + branches: [main, devel] name: test-coverage diff --git a/.gitignore b/.gitignore index c67ac69..df54527 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ docs .DS_Store dev/ +.vscode/ diff --git a/.markdownlint.json b/.markdownlint.json new file mode 100644 index 0000000..cc51b65 --- /dev/null +++ b/.markdownlint.json @@ -0,0 +1,7 @@ +{ + "MD007" : { "indent": 4 }, + "MD013": false, + "MD024": false, + "MD025": false, + "MD033": false +} diff --git a/DESCRIPTION b/DESCRIPTION index 8db19a6..ae66fcd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,13 +1,14 @@ Package: Ibex Title: Methods for BCR single-cell embedding -Version: 0.99.33 +Version: 1.0.0.9000 Authors@R: c( - person(given = "Nick", family = "Borcherding", role = c("aut", "cre"), email = "ncborch@gmail.com")) + person(given = "Nick", family = "Borcherding", role = c("aut", "cre", "cph"), email = "ncborch@gmail.com"), + person(given = "Qile", family = "Yang", role = "ctb", email = "qile.yang@berkeley.edu", comment = c(ORCID = "0009-0005-0148-2499"))) Description: Implementation of the Ibex algorithm for single-cell embedding based on BCR sequences. The package includes a standalone function to encode BCR sequence information by amino acid properties or sequence order using tensorflow-based autoencoder. In addition, the package interacts with SingleCellExperiment or Seurat data objects. 
License: MIT + file LICENSE Encoding: UTF-8 LazyData: false -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 biocViews: Software, ImmunoOncology, SingleCell, Classification, Annotation, Sequencing Depends: R (>= 4.5.0) @@ -17,7 +18,6 @@ Imports: methods, Matrix, reticulate (>= 1.43.0), - rlang, SeuratObject, scRepertoire, SingleCellExperiment, diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..ff09f8c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,8 @@ +FROM rocker/verse:4.5.1 +RUN apt-get update && apt-get install -y texlive-fonts-extra +WORKDIR /Ibex +COPY . . +RUN apt install -y libgsl-dev +RUN Rscript -e "install.packages('Seurat')" +RUN Rscript -e "remotes::install_deps(dependencies = TRUE)" +RUN Rscript -e "devtools::test()" diff --git a/NAMESPACE b/NAMESPACE index 9bf1cfd..a7da0e9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,7 +4,6 @@ export(CoNGAfy) export(Ibex_matrix) export(combineExpandedBCR) export(runIbex) -import(basilisk) importFrom(Matrix,colSums) importFrom(Matrix,sparse.model.matrix) importFrom(SeuratObject,CreateAssayObject) @@ -30,7 +29,6 @@ importFrom(immApex,onehotEncoder) importFrom(immApex,propertyEncoder) importFrom(methods,is) importFrom(methods,slot) -importFrom(rlang,"%||%") importFrom(scRepertoire,combineBCR) importFrom(stats,as.formula) importFrom(stats,dist) diff --git a/NEWS b/NEWS index 82536fc..5a20b98 100644 --- a/NEWS +++ b/NEWS @@ -1,124 +1,173 @@ -CHANGES IN VERSION 0.99.31 ------------------------- -* ibex_ensure_basilisk_external_dir no longer importsFrom basilisk.utils directly - -CHANGES IN VERSION 0.99.30 ------------------------- -* Moved data processing script out of vignette to inst/scripts -* Added ibex_ensure_basilisk_external_dir with basilisk.utils - -CHANGES IN VERSION 0.99.29 ------------------------- -* Adding internal .OnLoad() function to handle basilisk lock dir issue - -CHANGES IN VERSION 0.99.28 ------------------------- -* Deprecated `quietBCRgenes()` -* Converted `Ibex.matrix()` to `Ibex_matrix()` 
-* Added Install Instructions for BioCondcutor on README and Vignette -* Removed references to Keras3 Installation -* Removed LazyData TRUE statement - -CHANGES IN VERSION 0.99.10 ------------------------- -* Added information to example data - -CHANGES IN VERSION 0.99.9 ------------------------- -* Examples now check if python is installed and running - -CHANGES IN VERSION 0.99.8 ------------------------- -* Updated example data to 2k HEL BEAM-Ab from 10x -* Converted ibex_example into SCE object for compliance -* Large revision of vignette to fit new data/format -* Added species argument to runIbex -* Updated CoNGA handling of assay for Seurat and Single-Cell Objects. - -CHANGES IN VERSION 0.99.7 ------------------------- -* Integration of Ibex with immApex -* Updated Seurat object to v5 -* Updated support for SCE format for ```runIbex()``` -* Update ```CoNGAfy()``` to function with all versions of Seurat -* Updated ```quietBCRgenes()``` to use VariableFeatures() call for SeuratV5 and backward compatibility. 
-* Add ```getHumanIgPseudoGenes()``` to return a list of human Immunoglobulin Pseudo genes that are kept by ```quietBCRgenes()``` +# Ibex Development Version + +- removed rlang from Imports +- Reformatted `NEWS` and `README.md` +- As per `basilisk` documentation: + - Add `.BBSoptions` with `UnsupportedPlatforms: win32` + - Add `configure` and `configure.win` scripts +- Add Docker infrastructure with `Dockerfile` and `.devcontainer/devcontainer.json` +- Add `.markdownlint.json` to sourcecode +- Improved testthat compatibility across platforms + +# Ibex 1.0.0 + +## Major Underlying Changes + +- Integration of Ibex with immApex +- Updated Seurat object to v5 +- Runs using basilisk instead of reticulate - no installation of python packages + +## Feature Changes + +- Renamed `Ibex.matrix()` to `Ibex_matrix()` +- Updated support for SCE format for `runIbex()` +- Update `CoNGAfy()` to function with all versions of Seurat +- Updated `quietBCRgenes()` to use `VariableFeatures()` call for SeuratV5 and backward compatibility. 
+- Add `getHumanIgPseudoGenes()` to return a list of human Immunoglobulin Pseudo genes that are kept by `quietBCRgenes()` + +## New Models + +- Added New Light and Heavy Chain Models +- Encoding methods now accepted: "OHE", "atchleyFactors", "crucianiProperties", "kideraFactors", "MSWHIM", "tScales", "zScales" +- Sequence input: + - Human Heavy: 10000000 + - Human Light: 5000000 + - Human Heavy-Expanded: 5000000 + - Human Light-Expanded: 2500000 + - Mouse Heavy: 5000000 + - Mouse Heavy-Expanded: 5000000 +- Trained convolutional and variational autoencoders for Heavy/Light chains + - Architecture: 512-256-128-256-512 + - Parameters: + - Batch Size = 128 + - Latent Dimensions = 128 + - Epochs = 100 + - Loss = Mean Squared Error (CNN) & KL Divergence (VAE) + - Activation = relu + - Learning rate = 1e-6 + - Optimizers: Adam + - Early stopping was set to a patience of 10 for minimal validation loss and restoration of best weights + - CNN autoencoders have batch normalization layers between the dense layers. 
+ +# Ibex 0.99.31 + +- ibex_ensure_basilisk_external_dir no longer importsFrom basilisk.utils directly + +# Ibex 0.99.30 + +- Moved data processing script out of vignette to inst/scripts +- Added ibex_ensure_basilisk_external_dir with basilisk.utils + +# Ibex 0.99.29 + +- Adding internal .OnLoad() function to handle basilisk lock dir issue + +# Ibex 0.99.28 + +- Deprecated `quietBCRgenes()` +- Converted `Ibex.matrix()` to `Ibex_matrix()` +- Added Install Instructions for Bioconductor on README and Vignette +- Removed references to Keras3 Installation +- Removed LazyData TRUE statement + +# Ibex 0.99.10 + +- Added information to example data + +# Ibex 0.99.9 + +- Examples now check if python is installed and running + +# Ibex 0.99.8 + +- Updated example data to 2k HEL BEAM-Ab from 10x +- Converted ibex_example into SCE object for compliance +- Large revision of vignette to fit new data/format +- Added species argument to runIbex +- Updated CoNGA handling of assay for Seurat and Single-Cell Objects. + +# Ibex 0.99.7 + +- Integration of Ibex with immApex +- Updated Seurat object to v5 +- Updated support for SCE format for ```runIbex()``` +- Update ```CoNGAfy()``` to function with all versions of Seurat +- Updated ```quietBCRgenes()``` to use VariableFeatures() call for SeuratV5 and backward compatibility. 
+- Add ```getHumanIgPseudoGenes()``` to return a list of human Immunoglobulin Pseudo genes that are kept by ```quietBCRgenes()``` ## New Models -* Added New Light and Heavy Chain Models -* Encoding methods now accepted: "OHE", "atchleyFactors", "crucianiProperties", "kideraFactors", "MSWHIM","tScales", "zScales" -* Sequence input: - - Human Heavy: 10000000 - - Human Light: 5000000 - - Human Heavy-Expanded: 5000000 - - Human Light-Expanded: 2500000 - - Mouse Heavy: 5000000 - - Mouse Heavy-Expanded: 5000000 -* Trained convolutional and variational autoencoders for Heavy/Light chains - - Architecture: 512-256-128-256-512 - - Parameters: - Batch Size = 128 - Latent Dimensions = 128 - Epochs = 100 - Loss = Mean Squared Error (CNN) & KL Divergence (VAE) - Activation = relu - Learning rate = 1e-6 - - Optimizers: Adam - - Early stopping was set to patients of 10 for minimal validation loss and restoration of best weights - - CNN autoencoders have batch normalization layers between the dense layers. - -CHANGES IN VERSION 0.99.6 ------------------------- -* Implementing GitHub action workflows -* Adding testthat framework -* Deprecating clonalCommunity - -CHANGES IN VERSION 0.99.5 ------------------------- -* Added geometric encoding using the BLOSUM62 matrix -* Trained classical and variational autoencoders for light/heavy chains with 1.5 million cdr sequences - - Architecture: 256-128-30-128-256 - - Parameters: - Batch Size = 64 - Latent Dimensions = 30 - Epochs = 100 - Loss = Mean Squared Error - - Optimizers: Adam - - Early stopping was set to patients of 10 for minimal validation loss and restoration of best weights - - learn rate varied by models - - classical auto encoders have batch normalization layers between the dense layers. 
- -CHANGES IN VERSION 0.99.4 ------------------------- -* Added chain.checker() function to allow for uncapitlized chain calls - -CHANGES IN VERSION 0.99.3 ------------------------- -* Updated models for manuscript revision - - Architecture: 256-128-30-128-256 - - Parameters: - Batch Size = 64 - Learning Rate = 0.001 - Latent Dimensions = 30 - Epochs = 50 - Loss = Mean Squared Error - - Optimizers: RAdam (for amino acid properties) and RMSprop (for OHE) - - Early stopping was set to patients of 10 for minimal validation loss and restoration of best weights - - -CHANGES IN VERSION 0.99.2 ------------------------- -* Updated models to include radam optimization, early stop for min 10 epochs, and all trained on 800,000 unique cdr3s -* quietBCRgenes() now does not remove human Ig pseudogenes - - -CHANGES IN VERSION 0.99.1 ------------------------- -* Added detection of chain length to function call -* Added support for direct output of combineBCR() -* Modified quietBCR() to include constant regions and J-chains - - -CHANGES IN VERSION 0.99.0 ------------------------- -* Initial commit \ No newline at end of file + +- Added New Light and Heavy Chain Models +- Encoding methods now accepted: "OHE", "atchleyFactors", "crucianiProperties", "kideraFactors", "MSWHIM","tScales", "zScales" +- Sequence input: + - Human Heavy: 10000000 + - Human Light: 5000000 + - Human Heavy-Expanded: 5000000 + - Human Light-Expanded: 2500000 + - Mouse Heavy: 5000000 + - Mouse Heavy-Expanded: 5000000 +- Trained convolutional and variational autoencoders for Heavy/Light chains + - Architecture: 512-256-128-256-512 + - Parameters: + - Batch Size = 128 + - Latent Dimensions = 128 + - Epochs = 100 + - Loss = Mean Squared Error (CNN) & KL Divergence (VAE) + - Activation = relu + - Learning rate = 1e-6 + - Optimizers: Adam + - Early stopping was set to patients of 10 for minimal validation loss and restoration of best weights + - CNN autoencoders have batch normalization layers between the dense 
layers. + +# Ibex 0.99.6 + +- Implementing GitHub action workflows +- Adding testthat framework +- Deprecating clonalCommunity + +# Ibex 0.99.5 + +- Added geometric encoding using the BLOSUM62 matrix +- Trained classical and variational autoencoders for light/heavy chains with 1.5 million cdr sequences + - Architecture: 256-128-30-128-256 + - Parameters: + - Batch Size = 64 + - Latent Dimensions = 30 + - Epochs = 100 + - Loss = Mean Squared Error + - Optimizers: Adam + - Early stopping was set to patients of 10 for minimal validation loss and restoration of best weights + - learn rate varied by models + - classical auto encoders have batch normalization layers between the dense layers. + +# Ibex 0.99.4 + +- Added chain.checker() function to allow for uncapitlized chain calls + +# Ibex 0.99.3 + +- Updated models for manuscript revision + - Architecture: 256-128-30-128-256 + - Parameters: + Batch Size = 64 + Learning Rate = 0.001 + Latent Dimensions = 30 + Epochs = 50 + Loss = Mean Squared Error + - Optimizers: RAdam (for amino acid properties) and RMSprop (for OHE) + - Early stopping was set to patients of 10 for minimal validation loss and restoration of best weights + +# Ibex 0.99.2 + +- Updated models to include radam optimization, early stop for min 10 epochs, and all trained on 800,000 unique cdr3s +- quietBCRgenes() now does not remove human Ig pseudogenes + +# Ibex 0.99.1 + +- Added detection of chain length to function call +- Added support for direct output of combineBCR() +- Modified quietBCR() to include constant regions and J-chains + +# Ibex 0.99.0 + +- Initial commit diff --git a/R/CoNGAfy.R b/R/CoNGAfy.R index 2bee708..7b737fb 100644 --- a/R/CoNGAfy.R +++ b/R/CoNGAfy.R @@ -192,7 +192,6 @@ grabAssay <- function(input.data, assay) { } # Calculate mean across individual clones -#' @importFrom rlang %||% #' @importFrom Matrix sparse.model.matrix colSums #' @importFrom SummarizedExperiment assay #' @importFrom SeuratObject GetAssayData diff --git 
a/R/Ibex-package.R b/R/Ibex-package.R index 6e1a392..6ad30a1 100644 --- a/R/Ibex-package.R +++ b/R/Ibex-package.R @@ -37,3 +37,8 @@ #' #' @keywords package "_PACKAGE" + + +## usethis namespace: start +## usethis namespace: end +NULL diff --git a/R/basiliskEnv.R b/R/basiliskEnv.R index 5d63f7c..dd5d892 100644 --- a/R/basiliskEnv.R +++ b/R/basiliskEnv.R @@ -1,5 +1,4 @@ -#' @import basilisk -IbexEnv <- BasiliskEnvironment( +IbexEnv <- basilisk::BasiliskEnvironment( envname = "IbexEnv", pkgname = "Ibex", packages = c( @@ -9,4 +8,4 @@ IbexEnv <- BasiliskEnvironment( "h5py=3.13", "numpy=1.26" ) -) \ No newline at end of file +) diff --git a/R/utils.R b/R/utils.R index f9d50e6..b1ddac7 100644 --- a/R/utils.R +++ b/R/utils.R @@ -3,7 +3,6 @@ amino.acids <- c("A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V") # Add to meta data some of the metrics calculated -#' @importFrom rlang %||% #' @importFrom SingleCellExperiment colData add.meta.data <- function(sc, meta, header) { if (inherits(x=sc, what ="Seurat")) { diff --git a/README.md b/README.md index a67bb8a..a65d47d 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,28 @@ - Ibex +# Ibex + Using BCR sequences for graph embedding + [![R-CMD-check](https://github.com/BorchLab/Ibex/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/BorchLab/Ibex/actions/workflows/R-CMD-check.yaml) [![Codecov test coverage](https://codecov.io/gh/BorchLab/Ibex/graph/badge.svg)](https://app.codecov.io/gh/BorchLab/Ibex?branch=master) [![Documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://www.borch.dev/uploads/screpertoire/articles/ibex) + ## Introduction + Single-cell sequencing is an integral tool in immunology and oncology, enabling researchers to measure gene expression and immune cell receptor profiling at the level of individual cells. 
We developed the [scRepertoire](https://github.com/BorchLab/scRepertoire) R package to facilitate the integration of immune receptor and gene expression data. However, leveraging clonal indices for more complex analyses—such as using clonality in cell embedding—remains challenging. **Ibex** addresses this need by using deep learning to vectorize BCR sequences based on amino acid properties or their underlying order. Ibex is the sister package to [Trex](https://github.com/BorchLab/Trex), which focuses on TCR sequence data. -# System Requirements +## System Requirements + Ibex has been tested on R versions >= 4.0. For details on required R packages, refer to the package’s DESCRIPTION file. It is designed to work with single-cell objects containing BCR data generated using [scRepertoire](https://github.com/BorchLab/scRepertoire). Ibex has been tested on macOS and Linux. -# Installation +## Installation -Ibex relies on the [immApex](https://github.com/BorchLab/immApex) API can be installed directly from GitHub: +Ibex relies on the [immApex](https://github.com/BorchLab/immApex) API, which can be installed directly from GitHub: ```r devtools::install_github("BorchLab/immApex") @@ -32,13 +37,13 @@ if (!require("BiocManager", quietly = TRUE)) BiocManager::install("immApex") ``` -After immApex installation, you can install Ibex with: +After immApex installation, you can install Ibex with: ```r devtools::install_github("BorchLab/Ibex") ``` -Or via Bioconductor +Or via Bioconductor: ```r BiocManager::install("Ibex") @@ -46,28 +51,28 @@ BiocManager::install("Ibex") The main version of Ibex is submitted to Bioconductor (installation instructions will be updated after review). By default, Ibex will automatically pull deep learning models from a [Zenodo repository](https://zenodo.org/records/14919286) and cache them locally. 
-# Usage/Demos +## Usage/Demos Ibex integrates smoothly into most popular R-based single-cell workflows, including **Seurat** and **Bioconductor/SingleCellExperiment.** -## Quick Start +### Quick Start -See the [vignette](https://www.borch.dev/uploads/screpertoire/articles/ibex) for a step-by-step tutorial. +See the [vignette](https://www.borch.dev/uploads/screpertoire/articles/ibex) for a step-by-step tutorial. -## Autoencoded Matrix +### Autoencoded Matrix -The Ibex algorithm allows users to select BCR-based metrics to return autoencoded values to be used in dimensional reduction. If single-cell objects are not filtered for B cells with BCR, `Ibex_matrix()` will still return values, however IBEX_1 will be based on the disparity of BCR-containing and BCR-non-containing cells based on the Ibex algorithm. +The Ibex algorithm allows users to select BCR-based metrics to return autoencoded values to be used in dimensional reduction. If single-cell objects are not filtered for B cells with BCR, `Ibex_matrix()` will still return values, however IBEX_1 will be based on the disparity of BCR-containing and BCR-non-containing cells based on the Ibex algorithm. ```r library(Ibex) my_ibex <- Ibex_matrix(singleObject) ``` -## Seurat or Single-Cell Experiment +### Seurat or Single-Cell Experiment -You can run Ibex within your Seurat or Single-Cell Experiemt workflow. **Importantly** `runIbex()` will automatically filter single-cells that do not contain BCR information in the meta data of the single-cell object. +You can run Ibex within your Seurat or Single-Cell Experiment workflow. **Importantly** `runIbex()` will automatically filter single-cells that do not contain BCR information in the meta data of the single-cell object. 
```r seuratObj_Bonly <- runIbex(seuratObj, #The single cell object @@ -78,11 +83,11 @@ seuratObj_Bonly <- runIbex(seuratObj, #The single cell object "kideraFactors", "MSWHIM", "tScales", "OHE"), # Method of Encoding geometric.theta = pi/3, # theta for Geometric Encoding species = "Human") # "Mouse" or "Human" - + seuratObj_Bonly <- runIbex(seuratObj, reduction.name = "Ibex") ``` -## After Running Ibex +### After Running Ibex Once the Ibex embeddings are part of your Seurat object, you can use these embeddings to generate a t-SNE or UMAP: @@ -93,13 +98,15 @@ seuratObj <- RunUMAP(seuratObj, reduction = "Ibex", reduction.key = "Ibex_") If using Seurat package, the Ibex embedding information and gene expression PCA can be used to find the [Weighted Nearest Neighbors](https://pubmed.ncbi.nlm.nih.gov/34062119/). Before applying the WNN approach, best practice would be to remove the BCR-related genes from the list of variable genes and rerunning the PCA analysis. -### Recalculate PCA without BCR genes with quietBCRgenes() function in Ibex. +#### Recalculate PCA without BCR genes with quietBCRgenes() function in Ibex. + ```r seuratObj <- quietBCRgenes(seuratObj) seuratObj <- RunPCA(seuratObj) ``` -### Running WNN approach +#### Running WNN approach + ```r seuratObj <- FindMultiModalNeighbors(seuratObj, reduction.list = list("pca", "Ibex"), @@ -111,17 +118,19 @@ seuratObj <- RunUMAP(seuratObj, reduction.name = "wnn.umap", reduction.key = "wnnUMAP_") ``` + ## Bug Reports/New Features -#### If you run into any issues or bugs please submit a [GitHub issue](https://github.com/BorchLab/Ibex/issues) with details of the issue. +### If you run into any issues or bugs please submit a [GitHub issue](https://github.com/BorchLab/Ibex/issues) with details of the issue. - If possible please include a [reproducible example](https://reprex.tidyverse.org/). Alternatively, an example with the internal **ibex_example** would be extremely helpful. 
-#### Any requests for new features or enhancements can also be submitted as [GitHub issues](https://github.com/BorchLab/Ibex/issues). +### Any requests for new features or enhancements can also be submitted as [GitHub issues](https://github.com/BorchLab/Ibex/issues). -#### [Pull Requests](https://github.com/BorchLab/Ibex/pulls) are welcome for bug fixes, new features, or enhancements. +### [Pull Requests](https://github.com/BorchLab/Ibex/pulls) are welcome for bug fixes, new features, or enhancements. ## Citation -More information on Ibex is available at our [Biorxiv preprint](https://www.biorxiv.org/content/10.1101/2022.11.09.515787v2). + +More information on Ibex is available at our [Biorxiv preprint](https://www.biorxiv.org/content/10.1101/2022.11.09.515787v2). diff --git a/configure b/configure new file mode 100755 index 0000000..c5ad505 --- /dev/null +++ b/configure @@ -0,0 +1,2 @@ +#!/bin/sh +${R_HOME}/bin/Rscript -e "basilisk::configureBasiliskEnv(src = 'R/basiliskEnv.R')" diff --git a/configure.win b/configure.win new file mode 100755 index 0000000..dd3e17d --- /dev/null +++ b/configure.win @@ -0,0 +1,2 @@ +#!/bin/sh +${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe -e "basilisk::configureBasiliskEnv(src = 'R/basiliskEnv.R')" diff --git a/data/ibex_vdj.rda b/data/ibex_vdj.rda index 521de08..7c05d7d 100644 Binary files a/data/ibex_vdj.rda and b/data/ibex_vdj.rda differ diff --git a/inst/WORDLIST b/inst/WORDLIST index e6ff5b4..e536e44 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -5,7 +5,6 @@ Autoencoder Autoencoders BCR BLOSUM -Bioconductor's Biorxiv CDR CMD @@ -23,11 +22,14 @@ HEL IGH IGK Ig +Interoperate Kidera Lysozyme MSWHIM MultiUMAP OHE +ORCID +Pretrained SNE Schattgen SingleCellExperiment @@ -55,21 +57,19 @@ combineBCR contig crucianiProperties csv -customizable embeddings gp hydrophobicity ident -identOriginal immApex +interoperates interpretability -keras kideraFactors nCount nFeature physicochemical preprint -pseudogenes +pretrained pv quietBCRgenes 
runIbex diff --git a/man/Ibex-package.Rd b/man/Ibex-package.Rd index 7759772..1e11fea 100644 --- a/man/Ibex-package.Rd +++ b/man/Ibex-package.Rd @@ -42,7 +42,12 @@ runtime; no manual setup is usually required. \url{https://github.com/BorchLab/Ibex/issues} } \author{ -\strong{Maintainer}: Nick Borcherding \email{ncborch@gmail.com} +\strong{Maintainer}: Nick Borcherding \email{ncborch@gmail.com} [copyright holder] + +Other contributors: +\itemize{ + \item Qile Yang \email{qile.yang@berkeley.edu} (\href{https://orcid.org/0009-0005-0148-2499}{ORCID}) [contributor] +} } \keyword{package} diff --git a/tests/testthat/helper-testingFunctions.R b/tests/testthat/helper-testingFunctions.R index 316f0da..714d981 100644 --- a/tests/testthat/helper-testingFunctions.R +++ b/tests/testthat/helper-testingFunctions.R @@ -1,3 +1,25 @@ getdata <- function(dir, name) { - readRDS(paste("testdata/", dir, "/", name, ".rds", sep = "")) # could move testdata 1 dir lvl up nstead -} \ No newline at end of file + readRDS(paste("testdata/", dir, "/", name, ".rds", sep = "")) +} + +skip_if_py_not_installed <- function(python_packages) { + + missing_packages <- basilisk::basiliskRun( + env = IbexEnv, + fun = function(packages) { + packages[sapply(packages, Negate(reticulate::py_module_available))] + }, + packages = python_packages + ) + + if (length(missing_packages) > 0) { + testthat::skip(paste0( + "Required Python Module", + if (length(missing_packages) > 1) "s" else "", + " `", + paste(missing_packages, collapse = "`, `"), + "` not available." 
+ )) + } + +} diff --git a/tests/testthat/test-CoNGAfy.R b/tests/testthat/test-CoNGAfy.R index 2ba7305..72b4a2f 100644 --- a/tests/testthat/test-CoNGAfy.R +++ b/tests/testthat/test-CoNGAfy.R @@ -25,6 +25,9 @@ test_that("CoNGAfy filters cells correctly", { }) test_that("CoNGAfy stops if amino acid sequences are missing", { + + local_reproducible_output(unicode = FALSE) + sc_example <- suppressWarnings(CreateSeuratObject(counts = matrix(rnorm(1000), nrow = 10, ncol = 100))) diff --git a/tests/testthat/test-Ibex_matrix.R b/tests/testthat/test-Ibex_matrix.R index e0a85c1..53eca53 100644 --- a/tests/testthat/test-Ibex_matrix.R +++ b/tests/testthat/test-Ibex_matrix.R @@ -1,8 +1,10 @@ # test script for Ibex_matrix.R - testcases are NOT comprehensive! -library(Ibex) ibex_example <- get(data("ibex_example")) test_that("Ibex_matrix handles incorrect inputs gracefully", { + + local_reproducible_output(unicode = FALSE) + expect_error(Ibex_matrix(input.data = ibex_example, chain = "Middle", method = "encoder"), "'arg' should be one of \"Heavy\", \"Light\"") expect_error(Ibex_matrix(input.data = ibex_example, chain = "Heavy", method = "xyz"), @@ -15,68 +17,61 @@ test_that("Ibex_matrix handles incorrect inputs gracefully", { "non-numeric argument to mathematical function") }) -keras_installed <- reticulate::py_module_available("keras") -numpy_installed <- reticulate::py_module_available("numpy") +test_that("Ibex_matrix returns expected output format", { + skip_if_py_not_installed(c("keras", "numpy")) + result <- Ibex_matrix(input.data = ibex_example, + chain = "Heavy", + method = "encoder", + encoder.model = "VAE", + encoder.input = "atchleyFactors", + verbose = FALSE) + expect_true(is.data.frame(result)) + expect_true(all(grepl("^Ibex_", colnames(result)))) + expect_gt(nrow(result), 0) + expect_gt(ncol(result), 0) +}) + +test_that("Ibex_matrix works with encoder method", { + skip_if_py_not_installed(c("keras", "numpy")) + result <- Ibex_matrix(input.data = ibex_example, + chain 
= "Light", + method = "encoder", + encoder.model = "CNN", + encoder.input = "OHE", + verbose = FALSE) + expect_true(is.data.frame(result)) + expect_true(all(grepl("^Ibex_", colnames(result)))) +}) + +test_that("Ibex_matrix works with geometric method", { + skip_if_py_not_installed(c("keras", "numpy")) + result <- Ibex_matrix(input.data = ibex_example, + chain = "Heavy", + method = "geometric", + geometric.theta = pi / 4, + verbose = FALSE) + expect_true(is.data.frame(result)) + expect_true(all(grepl("^Ibex_", colnames(result)))) +}) -# 2. If not installed, skip everything: -if (!keras_installed || !numpy_installed) { - test_that("Skipping Ibex_matrix tests", { - skip("Required Python modules (Keras, NumPy) are not available.") - }) -} else { - - test_that("Ibex_matrix returns expected output format", { - result <- Ibex_matrix(input.data = ibex_example, +test_that("Ibex_matrix handles different species options", { + skip_if_py_not_installed(c("keras", "numpy")) + result1 <- Ibex_matrix(input.data = ibex_example, chain = "Heavy", method = "encoder", encoder.model = "VAE", encoder.input = "atchleyFactors", + species = "Human", verbose = FALSE) - expect_true(is.data.frame(result)) - expect_true(all(grepl("^Ibex_", colnames(result)))) - expect_gt(nrow(result), 0) - expect_gt(ncol(result), 0) - }) - - test_that("Ibex_matrix works with encoder method", { - result <- Ibex_matrix(input.data = ibex_example, - chain = "Light", - method = "encoder", - encoder.model = "CNN", - encoder.input = "OHE", - verbose = FALSE) - expect_true(is.data.frame(result)) - expect_true(all(grepl("^Ibex_", colnames(result)))) - }) - - test_that("Ibex_matrix works with geometric method", { - result <- Ibex_matrix(input.data = ibex_example, + result2 <- Ibex_matrix(input.data = ibex_example, chain = "Heavy", - method = "geometric", - geometric.theta = pi / 4, + method = "encoder", + encoder.model = "VAE", + encoder.input = "atchleyFactors", + species = "Mouse", verbose = FALSE) - 
expect_true(is.data.frame(result)) - expect_true(all(grepl("^Ibex_", colnames(result)))) - }) - - test_that("Ibex_matrix handles different species options", { - result1 <- Ibex_matrix(input.data = ibex_example, - chain = "Heavy", - method = "encoder", - encoder.model = "VAE", - encoder.input = "atchleyFactors", - species = "Human", - verbose = FALSE) - result2 <- Ibex_matrix(input.data = ibex_example, - chain = "Heavy", - method = "encoder", - encoder.model = "VAE", - encoder.input = "atchleyFactors", - species = "Mouse", - verbose = FALSE) - expect_true(is.data.frame(result1)) - expect_true(is.data.frame(result2)) - expect_true(all(grepl("^Ibex_", colnames(result1)))) - expect_true(all(grepl("^Ibex_", colnames(result2)))) - }) -} + expect_true(is.data.frame(result1)) + expect_true(is.data.frame(result2)) + expect_true(all(grepl("^Ibex_", colnames(result1)))) + expect_true(all(grepl("^Ibex_", colnames(result2)))) +}) diff --git a/tests/testthat/test-runIbex.R b/tests/testthat/test-runIbex.R index 6b6d856..aa3dc68 100644 --- a/tests/testthat/test-runIbex.R +++ b/tests/testthat/test-runIbex.R @@ -1,8 +1,10 @@ # test script for runIbex.R - testcases are NOT comprehensive! 
-library(Ibex) ibex_example <- get(data("ibex_example")) test_that("runIbex handles incorrect inputs gracefully", { + + local_reproducible_output(unicode = FALSE) + expect_error(runIbex(sc.data = ibex_example, chain = "Middle", method = "encoder"), "'arg' should be one of \"Heavy\", \"Light\"") expect_error(runIbex(sc.data = ibex_example, chain = "Heavy", method = "xyz"), @@ -15,97 +17,94 @@ test_that("runIbex handles incorrect inputs gracefully", { "non-numeric argument to mathematical function") }) -keras_installed <- reticulate::py_module_available("keras") -numpy_installed <- reticulate::py_module_available("numpy") +test_that("runIbex works with Seurat object", { + skip_if_py_not_installed(c("keras", "numpy")) + suppressWarnings(sc_example <- CreateSeuratObject(counts = matrix(rnorm(1000), nrow = 10, ncol = 100))) + sc_example[["CTaa"]] <- sample(c("CASSL", "CASST", NA, "NA_IGHV1", "None_IGHV2"), 100, replace = TRUE) + sc_example[["CTgene"]] <- sample(c("NA_IGHV1.IGD1.IGJ1.IGM", "NA_IGHV1.IGD1.IGJ1.IGM", NA, "NA_IGHV1.IGD1.IGJ1.IGM", "None_IGHV1.IGD1.IGJ1.IGM"), 100, replace = TRUE) + + result <- runIbex(sc_example, + chain = "Heavy", + method = "encoder", + encoder.model = "VAE", + encoder.input = "atchleyFactors", + reduction.name = "IbexTest", + verbose = FALSE) + + expect_true("IbexTest" %in% names(result@reductions)) + expect_true(inherits(result, "Seurat")) +}) + +test_that("runIbex works with geometric method", { + skip_if_py_not_installed(c("keras", "numpy")) + sc_example <- suppressWarnings(SeuratObject::CreateSeuratObject(counts = matrix(rnorm(1000), nrow = 10, ncol = 100))) + sc_example[["CTaa"]] <- sample(c("CASSL", "CASST", NA, "NA_IGHV1", "None_IGHV2"), 100, replace = TRUE) + sc_example[["CTgene"]] <- sample(c("NA_IGHV1.IGD1.IGJ1.IGM", "NA_IGHV1.IGD1.IGJ1.IGM", NA, "NA_IGHV1.IGD1.IGJ1.IGM", "None_IGHV1.IGD1.IGJ1.IGM"), 100, replace = TRUE) + + result <- runIbex(sc_example, + chain = "Heavy", + method = "geometric", + geometric.theta = pi / 4, + 
reduction.name = "IbexGeo", + verbose = FALSE) + + expect_true("IbexGeo" %in% names(result@reductions)) + expect_true(inherits(result, "Seurat")) +}) + +test_that("runIbex filters cells correctly", { + skip_if_py_not_installed(c("keras", "numpy")) + sc_example <- suppressWarnings(CreateSeuratObject(counts = matrix(rnorm(1000), nrow = 10, ncol = 100))) + sc_example[["CTaa"]] <- c(rep("CASSL", 50), rep(NA, 50)) + sc_example[["CTgene"]] <- sample(c("NA_IGHV1.IGD1.IGJ1.IGM", "NA_IGHV1.IGD1.IGJ1.IGM", NA, "NA_IGHV1.IGD1.IGJ1.IGM", "None_IGHV1.IGD1.IGJ1.IGM"), 100, replace = TRUE) + result <- runIbex(sc_example, + chain = "Heavy", + method = "encoder", + encoder.model = "VAE", + encoder.input = "atchleyFactors", + reduction.name = "IbexFiltered", + verbose = FALSE) + + expect_true("IbexFiltered" %in% names(result@reductions)) + expect_lt(ncol(result), 100) # Ensures some cells were filtered out +}) -# 2. If not installed, skip everything: -if (!keras_installed || !numpy_installed) { - test_that("Skipping runIbex tests", { - skip("Required Python modules (Keras, NumPy) are not available.") - }) -} else { - - test_that("runIbex works with Seurat object", { - suppressWarnings(sc_example <- CreateSeuratObject(counts = matrix(rnorm(1000), nrow = 10, ncol = 100))) - sc_example[["CTaa"]] <- sample(c("CASSL", "CASST", NA, "NA_IGHV1", "None_IGHV2"), 100, replace = TRUE) - sc_example[["CTgene"]] <- sample(c("NA_IGHV1.IGD1.IGJ1.IGM", "NA_IGHV1.IGD1.IGJ1.IGM", NA, "NA_IGHV1.IGD1.IGJ1.IGM", "None_IGHV1.IGD1.IGJ1.IGM"), 100, replace = TRUE) - - result <- runIbex(sc_example, +test_that("runIbex stops if amino acid sequences are missing", { + + skip_if_py_not_installed(c("keras", "numpy")) + local_reproducible_output(unicode = FALSE) + + sc_example <- suppressWarnings(SeuratObject::CreateSeuratObject(counts = matrix(rnorm(1000), nrow = 10, ncol = 100))) + + expect_error(runIbex(sc_example, + chain = "Heavy", + method = "encoder", + encoder.model = "VAE", + encoder.input = 
"atchleyFactors", + verbose = FALSE), + "Amino acid sequences are not added to the single-cell object correctly.") +}) + +test_that("runIbex works with different reduction names", { + skip_if_py_not_installed(c("keras", "numpy")) + sc_example <- suppressWarnings(SeuratObject::CreateSeuratObject(counts = matrix(rnorm(1000), nrow = 10, ncol = 100))) + sc_example[["CTaa"]] <- sample(c("CASSL", "CASST", NA, "NA_IGHV1", "None_IGHV2"), 100, replace = TRUE) + sc_example[["CTgene"]] <- sample(c("NA_IGHV1.IGD1.IGJ1.IGM", "NA_IGHV1.IGD1.IGJ1.IGM", NA, "NA_IGHV1.IGD1.IGJ1.IGM", "None_IGHV1.IGD1.IGJ1.IGM"), 100, replace = TRUE) + result1 <- runIbex(sc_example, chain = "Heavy", method = "encoder", encoder.model = "VAE", encoder.input = "atchleyFactors", - reduction.name = "IbexTest", + reduction.name = "Ibex1", verbose = FALSE) - - expect_true("IbexTest" %in% names(result@reductions)) - expect_true(inherits(result, "Seurat")) - }) - test_that("runIbex works with geometric method", { - sc_example <- suppressWarnings(SeuratObject::CreateSeuratObject(counts = matrix(rnorm(1000), nrow = 10, ncol = 100))) - sc_example[["CTaa"]] <- sample(c("CASSL", "CASST", NA, "NA_IGHV1", "None_IGHV2"), 100, replace = TRUE) - sc_example[["CTgene"]] <- sample(c("NA_IGHV1.IGD1.IGJ1.IGM", "NA_IGHV1.IGD1.IGJ1.IGM", NA, "NA_IGHV1.IGD1.IGJ1.IGM", "None_IGHV1.IGD1.IGJ1.IGM"), 100, replace = TRUE) - - result <- runIbex(sc_example, - chain = "Heavy", - method = "geometric", - geometric.theta = pi / 4, - reduction.name = "IbexGeo", - verbose = FALSE) - - expect_true("IbexGeo" %in% names(result@reductions)) - expect_true(inherits(result, "Seurat")) - }) - - test_that("runIbex filters cells correctly", { - sc_example <- suppressWarnings(CreateSeuratObject(counts = matrix(rnorm(1000), nrow = 10, ncol = 100))) - sc_example[["CTaa"]] <- c(rep("CASSL", 50), rep(NA, 50)) - sc_example[["CTgene"]] <- sample(c("NA_IGHV1.IGD1.IGJ1.IGM", "NA_IGHV1.IGD1.IGJ1.IGM", NA, "NA_IGHV1.IGD1.IGJ1.IGM", 
"None_IGHV1.IGD1.IGJ1.IGM"), 100, replace = TRUE) - result <- runIbex(sc_example, - chain = "Heavy", + result2 <- runIbex(sc_example, chain = "Heavy", method = "encoder", encoder.model = "VAE", encoder.input = "atchleyFactors", - reduction.name = "IbexFiltered", + reduction.name = "Ibex2", verbose = FALSE) - - expect_true("IbexFiltered" %in% names(result@reductions)) - expect_lt(ncol(result), 100) # Ensures some cells were filtered out - }) - test_that("runIbex stops if amino acid sequences are missing", { - sc_example <- suppressWarnings(SeuratObject::CreateSeuratObject(counts = matrix(rnorm(1000), nrow = 10, ncol = 100))) - - expect_error(runIbex(sc_example, - chain = "Heavy", - method = "encoder", - encoder.model = "VAE", - encoder.input = "atchleyFactors", - verbose = FALSE), - "Amino acid sequences are not added to the single-cell object correctly.") - }) - - test_that("runIbex works with different reduction names", { - sc_example <- suppressWarnings(SeuratObject::CreateSeuratObject(counts = matrix(rnorm(1000), nrow = 10, ncol = 100))) - sc_example[["CTaa"]] <- sample(c("CASSL", "CASST", NA, "NA_IGHV1", "None_IGHV2"), 100, replace = TRUE) - sc_example[["CTgene"]] <- sample(c("NA_IGHV1.IGD1.IGJ1.IGM", "NA_IGHV1.IGD1.IGJ1.IGM", NA, "NA_IGHV1.IGD1.IGJ1.IGM", "None_IGHV1.IGD1.IGJ1.IGM"), 100, replace = TRUE) - result1 <- runIbex(sc_example, - chain = "Heavy", - method = "encoder", - encoder.model = "VAE", - encoder.input = "atchleyFactors", - reduction.name = "Ibex1", - verbose = FALSE) - - result2 <- runIbex(sc_example, chain = "Heavy", - method = "encoder", - encoder.model = "VAE", - encoder.input = "atchleyFactors", - reduction.name = "Ibex2", - verbose = FALSE) - - expect_true("Ibex1" %in% names(result1@reductions)) - expect_true("Ibex2" %in% names(result2@reductions)) - }) -} + expect_true("Ibex1" %in% names(result1@reductions)) + expect_true("Ibex2" %in% names(result2@reductions)) +})