Title: | Fast Functions for Differential Expression using Wilcox and AUC |
---|---|
Description: | Scalable implementation of the Wilcoxon rank sum test and auROC statistic. Interfaces to dense and sparse matrices, as well as genomics analysis frameworks Seurat and SingleCellExperiment. |
Authors: | Ilya Korsunsky [aut] , Aparna Nathan [aut], Nghia Millard [aut] , Soumya Raychaudhuri [aut] , Kamil Slowikowski [cre, ctb] , Austin Hartman [ctb] |
Maintainer: | Kamil Slowikowski <[email protected]> |
License: | GPL-3 |
Version: | 1.0.0 |
Built: | 2024-11-13 05:01:24 UTC |
Source: | https://github.com/immunogenomics/presto |
Collapse counts based on multiple categorical metadata columns
collapse_counts( counts_mat, meta_data, varnames, min_cells_per_group = 0, keep_n = FALSE, how = c("sum", "mean")[1] )
collapse_counts( counts_mat, meta_data, varnames, min_cells_per_group = 0, keep_n = FALSE, how = c("sum", "mean")[1] )
counts_mat |
counts matrix where columns represent cells and rows represent features |
meta_data |
data.frame containing cell metadata |
varnames |
subset of 'meta_data' column names |
min_cells_per_group |
minimum cells to keep collapsed group |
keep_n |
keep or drop the 'N' column containing the number of cells in each group. Default is 'FALSE' |
how |
method of collapsing counts from groups. 'sum' or 'mean' |
m <- matrix(sample.int(8, 100*500, replace=TRUE), nrow=100, ncol=500) rownames(m) <- paste0("G", 1:100) colnames(m) <- paste0("C", 1:500) md1 <- sample(c("a", "b"), 500, replace=TRUE) md2 <- sample(c("c", "d"), 500, replace=TRUE) df <- data.frame(md1, md2) data_collapsed <- collapse_counts(m, df, c("md1", "md2")) head(data_collapsed$counts_mat)
m <- matrix(sample.int(8, 100*500, replace=TRUE), nrow=100, ncol=500) rownames(m) <- paste0("G", 1:100) colnames(m) <- paste0("C", 1:500) md1 <- sample(c("a", "b"), 500, replace=TRUE) md2 <- sample(c("c", "d"), 500, replace=TRUE) df <- data.frame(md1, md2) data_collapsed <- collapse_counts(m, df, c("md1", "md2")) head(data_collapsed$counts_mat)
Compute unique hash for each row of data.frame
compute_hash(data_df, vars_use)
compute_hash(data_df, vars_use)
data_df |
data.frame |
vars_use |
vector of column names to use when computing the hash |
Small gene expression matrix
exprs
exprs
matrix of 25 features by 150 observations
Utility function to compute number of zeros-per-feature within group
nnzeroGroups(X, y, MARGIN = 2) ## S3 method for class 'dgCMatrix' nnzeroGroups(X, y, MARGIN = 2) ## S3 method for class 'matrix' nnzeroGroups(X, y, MARGIN = 2)
nnzeroGroups(X, y, MARGIN = 2) ## S3 method for class 'dgCMatrix' nnzeroGroups(X, y, MARGIN = 2) ## S3 method for class 'matrix' nnzeroGroups(X, y, MARGIN = 2)
X |
matrix |
y |
group labels |
MARGIN |
whether observations are rows (=2) or columns (=1) |
Matrix of groups by features
data(exprs) data(y) nnz_res <- nnzeroGroups(exprs, y, 1) nnz_res <- nnzeroGroups(t(exprs), y, 2)
data(exprs) data(y) nnz_res <- nnzeroGroups(exprs, y, 1) nnz_res <- nnzeroGroups(t(exprs), y, 2)
SingleCellExperiment object with fake data
object_sce
object_sce
A single cell experiment
Seurat V3 object with fake data
object_seurat
object_seurat
A Seurat object
Pseudobulk DESeq2
pseudobulk_deseq2( dge_formula, meta_data, counts_df, verbose = TRUE, min_counts_per_sample = 10, present_in_min_samples = 5, collapse_background = TRUE, vals_test = NULL, mode = c("one_vs_all", "pairwise", "within")[1] )
pseudobulk_deseq2( dge_formula, meta_data, counts_df, verbose = TRUE, min_counts_per_sample = 10, present_in_min_samples = 5, collapse_background = TRUE, vals_test = NULL, mode = c("one_vs_all", "pairwise", "within")[1] )
dge_formula |
differential gene expression formula for DESeq2 |
meta_data |
data.frame of cell metadata |
counts_df |
A feature-by-sample matrix |
verbose |
verbose |
min_counts_per_sample |
minimum counts per sample to include in differential gene expression |
present_in_min_samples |
minimum samples with gene counts to include in differential gene expression |
collapse_background |
collapse background. Default is 'TRUE' |
vals_test |
cell metadata columns |
mode |
kind of pseudobulk testing to perform. One of 'one_vs_all', 'pairwise', or 'within' |
## Not run: m <- matrix(sample.int(8, 100*500, replace=TRUE),nrow=100, ncol=500) rownames(m) <- paste0("G", 1:100) colnames(m) <- paste0("C", 1:500) md1 <- sample(c("a", "b"), 500, replace=TRUE) md2 <- sample(c("c", "d"), 500, replace=TRUE) df <- data.frame(md1, md2) data_collapsed <- collapse_counts(m, df, c("md1", "md2")) res_mat <- pseudobulk_deseq2( ~md1 + md1, data_collapsed$meta_data, data_collapsed$counts_mat, verbose = TRUE, present_in_min_samples = 1 ) head(res_mat) ## End(Not run)
## Not run: m <- matrix(sample.int(8, 100*500, replace=TRUE),nrow=100, ncol=500) rownames(m) <- paste0("G", 1:100) colnames(m) <- paste0("C", 1:500) md1 <- sample(c("a", "b"), 500, replace=TRUE) md2 <- sample(c("c", "d"), 500, replace=TRUE) df <- data.frame(md1, md2) data_collapsed <- collapse_counts(m, df, c("md1", "md2")) res_mat <- pseudobulk_deseq2( ~md1 + md1, data_collapsed$meta_data, data_collapsed$counts_mat, verbose = TRUE, present_in_min_samples = 1 ) head(res_mat) ## End(Not run)
Pseudobulk one versus all
pseudobulk_one_vs_all( dge_formula, counts_df, meta_data, contrast_var, vals_test, collapse_background, verbose )
pseudobulk_one_vs_all( dge_formula, counts_df, meta_data, contrast_var, vals_test, collapse_background, verbose )
dge_formula |
differential gene expression formula for DESeq2 |
counts_df |
counts matrix |
meta_data |
data.frame of cell metadata |
contrast_var |
cell metadata column to use for differential gene expression |
vals_test |
cell metadata columns |
collapse_background |
collapse background counts according to 'contrast_var' |
verbose |
verbose |
Pseudobulk pairwise
pseudobulk_pairwise( dge_formula, counts_df, meta_data, contrast_var, vals_test, verbose, min_counts_per_sample, present_in_min_samples )
pseudobulk_pairwise( dge_formula, counts_df, meta_data, contrast_var, vals_test, verbose, min_counts_per_sample, present_in_min_samples )
dge_formula |
differential gene expression formula for DESeq2 |
counts_df |
counts matrix |
meta_data |
data.frame of cell metadata |
contrast_var |
cell metadata column to use for differential gene expression |
vals_test |
cell metadata columns |
verbose |
verbose |
min_counts_per_sample |
minimum counts per sample to include in differential gene expression |
present_in_min_samples |
minimum samples with gene counts to include in differential gene expression |
Pseudobulk within
pseudobulk_within( dge_formula, counts_df, meta_data, split_var, vals_test, verbose, min_counts_per_sample, present_in_min_samples )
pseudobulk_within( dge_formula, counts_df, meta_data, split_var, vals_test, verbose, min_counts_per_sample, present_in_min_samples )
dge_formula |
differential gene expression formula for DESeq2 |
counts_df |
counts matrix |
meta_data |
data.frame of cell metadata |
split_var |
- |
vals_test |
cell metadata columns |
verbose |
verbose |
min_counts_per_sample |
minimum counts per sample to include in differential gene expression |
present_in_min_samples |
minimum samples with gene counts to include in differential gene expression |
Utility function to rank columns of matrix
rank_matrix(X) ## S3 method for class 'dgCMatrix' rank_matrix(X) ## S3 method for class 'matrix' rank_matrix(X)
rank_matrix(X) ## S3 method for class 'dgCMatrix' rank_matrix(X) ## S3 method for class 'matrix' rank_matrix(X)
X |
feature by observation matrix. |
List with 2 items
X_ranked - matrix of entry ranks
ties - list of tied group sizes
data(exprs) rank_res <- rank_matrix(exprs)
data(exprs) rank_res <- rank_matrix(exprs)
Utility function to sum over group labels
sumGroups(X, y, MARGIN = 2) ## S3 method for class 'dgCMatrix' sumGroups(X, y, MARGIN = 2) ## S3 method for class 'matrix' sumGroups(X, y, MARGIN = 2)
sumGroups(X, y, MARGIN = 2) ## S3 method for class 'dgCMatrix' sumGroups(X, y, MARGIN = 2) ## S3 method for class 'matrix' sumGroups(X, y, MARGIN = 2)
X |
matrix |
y |
group labels |
MARGIN |
whether observations are rows (=2) or columns (=1) |
Matrix of groups by features
data(exprs) data(y) sumGroups_res <- sumGroups(exprs, y, 1) sumGroups_res <- sumGroups(t(exprs), y, 2)
data(exprs) data(y) sumGroups_res <- sumGroups(exprs, y, 1) sumGroups_res <- sumGroups(t(exprs), y, 2)
Summarize differential gene expression pairs
summarize_dge_pairs(dge_res, mode = c("min", "max")[1])
summarize_dge_pairs(dge_res, mode = c("min", "max")[1])
dge_res |
table returned by pseudobulk_deseq2() function when 'mode' is 'pairwise' |
mode |
- |
Useful summary of the most distinguishing features in each group.
top_markers( res, n = 10, auc_min = 0, pval_max = 1, padj_max = 1, pct_in_min = 0, pct_out_max = 100 )
top_markers( res, n = 10, auc_min = 0, pval_max = 1, padj_max = 1, pct_in_min = 0, pct_out_max = 100 )
res |
table returned by wilcoxauc() function. |
n |
number of markers to find for each. |
auc_min |
filter features with auc < auc_min. |
pval_max |
filter features with pval > pval_max. |
padj_max |
filter features with padj > padj_max. |
pct_in_min |
Minimum percent (0-100) of observations with non-zero entries in group. |
pct_out_max |
Maximum percent (0-100) of observations with non-zero entries out of group. |
table with the top n markers for each cluster.
data(exprs) data(y) ## first, run wilcoxauc res <- wilcoxauc(exprs, y) ## top 10 markers for each group ## filter for nominally significant (p<0.05) and over-expressed (auc>0.5) top_markers(res, 10, auc_min = 0.5, pval_max = 0.05)
data(exprs) data(y) ## first, run wilcoxauc res <- wilcoxauc(exprs, y) ## top 10 markers for each group ## filter for nominally significant (p<0.05) and over-expressed (auc>0.5) top_markers(res, 10, auc_min = 0.5, pval_max = 0.05)
Useful summary of the most distinguishing features in each group.
top_markers_dds(res, n = 10, pval_max = 1, padj_max = 1, lfc_min = 1)
top_markers_dds(res, n = 10, pval_max = 1, padj_max = 1, lfc_min = 1)
res |
table returned by pseudobulk_deseq2() function. |
n |
number of markers to find for each. |
pval_max |
filter features with pval > pval_max. |
padj_max |
filter features with padj > padj_max. |
lfc_min |
filter features with log2FoldChange < lfc_min @return table with the top n markers for each cluster. |
Computes auROC and Wilcoxon p-value based on Gaussian approximation. Inputs can be
Dense matrix or data.frame
Sparse matrix, such as dgCMatrix
Seurat V3 object
SingleCellExperiment object
For detailed examples, consult the presto vignette.
wilcoxauc(X, ...) ## S3 method for class 'seurat' wilcoxauc(X, ...) ## S3 method for class 'Seurat' wilcoxauc( X, group_by = NULL, assay = "data", groups_use = NULL, seurat_assay = "RNA", ... ) ## S3 method for class 'SingleCellExperiment' wilcoxauc(X, group_by = NULL, assay = NULL, groups_use = NULL, ...) ## Default S3 method: wilcoxauc(X, y, groups_use = NULL, verbose = TRUE, ...)
wilcoxauc(X, ...) ## S3 method for class 'seurat' wilcoxauc(X, ...) ## S3 method for class 'Seurat' wilcoxauc( X, group_by = NULL, assay = "data", groups_use = NULL, seurat_assay = "RNA", ... ) ## S3 method for class 'SingleCellExperiment' wilcoxauc(X, group_by = NULL, assay = NULL, groups_use = NULL, ...) ## Default S3 method: wilcoxauc(X, y, groups_use = NULL, verbose = TRUE, ...)
X |
A feature-by-sample matrix, Seurat object, or SingleCellExperiment object |
... |
input specific parameters. |
group_by |
(Seurat & SCE) name of groups variable ('e.g. Cluster'). |
assay |
(Seurat & SCE) name of feature matrix slot (e.g. 'data' or 'logcounts'). |
groups_use |
(optional) which groups from y vector to test. |
seurat_assay |
(Seurat) name of Seurat Assay (e.g. 'RNA'). |
y |
vector of group labels. |
verbose |
boolean, TRUE for warnings and messages. |
table with the following columns:
feature - feature name (e.g. gene name).
group - group name.
avgExpr - mean value of feature in group.
logFC - log fold change between observations in group vs out.
statistic - Wilcoxon rank sum U statistic.
auc - area under the receiver operator curve.
pval - nominal p value.
padj - Benjamini-Hochberg adjusted p value.
pct_in - Percent of observations in the group with non-zero feature value.
pct_out - Percent of observations out of the group with non-zero feature value.
## Not run: data(exprs) data(y) ## on a dense matrix head(wilcoxauc(exprs, y)) ## with only some groups head(wilcoxauc(exprs, y, c('A', 'B'))) ## on a sparse matrix exprs_sparse <- as(exprs, 'dgCMatrix') head(wilcoxauc(exprs_sparse, y)) ## on a Seurat V3 object if (requireNamespace("Seurat", quietly = TRUE)) { pkg_version <- packageVersion('Seurat') if (pkg_version >= "3.0" & pkg_version < "4.0") { data(object_seurat) head(wilcoxauc(object_seurat, 'cell_type')) } } ## on a SingleCellExperiment object if (requireNamespace("SingleCellExperiment", quietly = TRUE)) { data(object_sce) head(wilcoxauc(object_sce, 'cell_type')) } ## End(Not run)
## Not run: data(exprs) data(y) ## on a dense matrix head(wilcoxauc(exprs, y)) ## with only some groups head(wilcoxauc(exprs, y, c('A', 'B'))) ## on a sparse matrix exprs_sparse <- as(exprs, 'dgCMatrix') head(wilcoxauc(exprs_sparse, y)) ## on a Seurat V3 object if (requireNamespace("Seurat", quietly = TRUE)) { pkg_version <- packageVersion('Seurat') if (pkg_version >= "3.0" & pkg_version < "4.0") { data(object_seurat) head(wilcoxauc(object_seurat, 'cell_type')) } } ## on a SingleCellExperiment object if (requireNamespace("SingleCellExperiment", quietly = TRUE)) { data(object_sce) head(wilcoxauc(object_sce, 'cell_type')) } ## End(Not run)
Group labels for observations in gene expression matrix
y
y
a factor with 3 levels