Skip to content

Commit 66910b0

Browse files
committed
Update paper related code
1 parent 6f26f5e commit 66910b0

19 files changed

Lines changed: 1517 additions & 1303 deletions

paper/methods/R_functions.R

Lines changed: 52 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,15 @@
33
# Load the Python package
44
#
55
########################################################################
6+
# require(reticulate)
7+
8+
# # create a new environment
9+
# virtualenv_create("py")
10+
# virtualenv_install("py", c("numba", "pandas", "numpy", "scipy", "statsmodels", "scikit-learn", "cvxpy", "tqdm"))
11+
# use_virtualenv("py")
12+
# See the following document for more details
13+
# https://rstudio.github.io/reticulate/articles/versions.html
14+
615

716
# import Python package
817
causarray <- import("causarray")
@@ -28,15 +37,12 @@ run_causarray <- function(Y, W, A, r, update_disp_glm=TRUE, W_A='full', ...){
2837
# confounder adjustment
2938
if(r>0){
3039
X <- cbind(W, A)
31-
res_gcate <- causarray$fit_gcate(Y, X, r, num_d=a, offset=offset, ...)
32-
# res is a list of 2 matrices:
33-
# A1 = [X, Z] and A2 = [B, Gamma] where X = [W, A]
34-
names(res_gcate) <- c('A1', 'A2', 'info', 'A01', 'A02')
35-
Wp <- res_gcate[[1]]
40+
res_gcate <- causarray$fit_gcate(Y, W, A, r, offset=offset, ...)[[2]]
41+
Wp <- res_gcate$X_U
3642
Wp <- Wp[,-c((d+1):(d+a))] # remove the column of A
37-
disp_glm <- res_gcate[[3]]$kwargs_glm$nuisance
43+
disp_glm <- res_gcate$kwargs_glm$nuisance
3844
Z <- Wp[,(d+1):ncol(Wp)]
39-
offset <- log(res_gcate[[3]]$kwargs_glm$size_factor)
45+
offset <- log(res_gcate$kwargs_glm$size_factor)
4046
}else{
4147
res_gcate <- NULL
4248
Wp <- W
@@ -58,7 +64,7 @@ run_causarray <- function(Y, W, A, r, update_disp_glm=TRUE, W_A='full', ...){
5864
}
5965
res <- do.call(
6066
causarray$LFC,
61-
modifyList(list(alpha=0.1, c=0.1, family='nb', offset=offset, disp_glm=disp_glm),
67+
modifyList(list(fdx_alpha=0.1, fdx_c=0.1, family='nb', offset=offset, disp_glm=disp_glm),
6268
list('Y'=Y, 'W'=W_new, 'A'=A, 'W_A'=W_A, ...))
6369
)
6470
causarray.df <- res[[1]]
@@ -76,7 +82,7 @@ run_causarray <- function(Y, W, A, r, update_disp_glm=TRUE, W_A='full', ...){
7682
estimate_r_causarray <- function(Y, W, A, r_max, ...){
7783
data <- prep_causarray(Y, W, A, ...)
7884
Y <- data[[1]]; W <- data[[2]]; A <- data[[3]]
79-
df_r <- causarray$estimate_r(Y, cbind(W, A), r_max, ...)
85+
df_r <- causarray$estimate_r(Y, W, A, r_max, ...)
8086
df_r
8187
}
8288

@@ -175,11 +181,10 @@ library(MASS)
175181
#' @param Y Pseudo-bulk gene expression matrix, individual x gene
176182
#' @param metadata Metadata dataframe with all covariates including column called trt with 1 = treatment/case and 0 = control (factor),
177183
#' numeric features should be centered and scaled, all covariates in this dataframe will be used
178-
#' @param raw Whether the input data is raw or preprocessed
179184
#' @param family What time of regression to fit? 'poisson' or 'nb' (negative binomial)
180185
#' @returns Dataframe containing Wilcoxon statistic, p-value, and adjusted p-value for each gene
181186
#'
182-
run_wilcoxon <- function(Y, metadata, raw=F, family='nb') {
187+
run_wilcoxon <- function(Y, metadata, family='nb') {
183188
.check_input(Y, metadata)
184189

185190
loglibsize <- log2(rowSums(Y))
@@ -197,25 +202,38 @@ run_wilcoxon <- function(Y, metadata, raw=F, family='nb') {
197202
fit_regs <- function(gene, Y, metadata, family='poisson') {
198203
gene_data <- cbind(Y[,gene], metadata)
199204
colnames(gene_data) <- c('gexp', colnames(metadata))
200-
if(family=='poisson'){
201-
reg <- glm(gexp ~ . - trt, family="poisson", data=gene_data)
202-
}else if(family=='nb'){
203-
reg <- glm.nb(gexp ~ . - trt, data=gene_data)
204-
}
205-
reg$residuals
206-
}
205+
tryCatch({
206+
if(family=='poisson'){
207+
reg <- glm(gexp ~ . - trt, family="poisson", data=gene_data)
208+
}else if(family=='nb'){
209+
reg <- glm.nb(gexp ~ . - trt, data=gene_data)
210+
}
211+
reg$residuals
212+
}, error = function(e) {
213+
rep(NA, nrow(metadata))
214+
})
215+
}
207216
Y_resid <- do.call("cbind", mclapply(cycle,fit_regs,Y=Y,metadata=metadata,family=family,mc.cores=20,mc.preschedule = T,mc.silent=T))
208217
colnames(Y_resid) <- cycle
209218

210219
cat('Running Wilcoxon Tests... \n')
211220
wilcox_test <- function(gene, Y, trt){
212-
wilc <- wilcox.test(Y[trt == 1, gene], Y[trt == 0, gene])
213-
list('stat' = wilc$statistic, 'pvalue' = wilc$p.value)
221+
tryCatch({
222+
wilc <- wilcox.test(Y[trt == 1, gene], Y[trt == 0, gene])
223+
list('stat' = wilc$statistic, 'pvalue' = wilc$p.value)
224+
}, error = function(e) {
225+
list('stat' = NaN, 'pvalue' = NaN)
226+
})
214227
}
215228
test <- mclapply(cycle,wilcox_test,Y=Y_resid,trt=metadata$trt,mc.cores=20,mc.preschedule = T,mc.silent=T)
216229
wilc_df <- rbindlist(test)
217230
wilc_df <- as.data.frame(wilc_df)
218-
wilc_df['padj'] <- qvalue(wilc_df['pvalue'])$qvalues
231+
# Remove NaN values before computing q-values, then put them back
232+
valid_idx <- !is.nan(wilc_df$pvalue)
233+
wilc_df$padj <- rep(NaN, nrow(wilc_df))
234+
if (sum(valid_idx) > 0) {
235+
wilc_df$padj[valid_idx] <- qvalue(wilc_df$pvalue[valid_idx])$qvalues
236+
}
219237
rownames(wilc_df) <- cycle
220238
t1 <- Sys.time()
221239
print(t1-t0)
@@ -359,9 +377,21 @@ run_cocoa <- function(sc, metadata, indvs = NULL, cocoAWriteName='simu/tmp'){
359377
A <- metadata$trt
360378
if(is.null(indvs)){
361379
indvs <- 1:dim(sc)[1]
380+
# indvs <- c()
381+
# indv <- 1
382+
# prev_trt <- A[1]
383+
# for (trt_i in A) {
384+
# if (trt_i != prev_trt) {
385+
# indv <- indv + 1
386+
# prev_trt = trt_i
387+
# }
388+
# # indvs <- c(indvs, paste0('indv', indv))
389+
# indvs <- c(indvs, indv)
390+
# }
362391
}
363392

364-
cell2indv <- data.frame(cell=1:dim(sc)[1], indv=indvs)
393+
cell2indv <- data.frame(cell=1:dim(sc)[1],#paste0('cell', 1:dim(sc)[1]),
394+
indv=indvs)
365395

366396
indv2trt <- data.frame(indv=indvs[!duplicated(indvs)],
367397
exp=A)

0 commit comments

Comments
 (0)