33# Load the Python package
44#
55# #######################################################################
6+ # require(reticulate)
7+
8+ # # create a new environment
9+ # virtualenv_create("py")
10+ # virtualenv_install("py", c("numba", "pandas", "numpy", "scipy", "statsmodels", "scikit-learn", "cvxpy", "tqdm"))
11+ # use_virtualenv("py")
12+ # See the following document for more details
13+ # https://rstudio.github.io/reticulate/articles/versions.html
14+
615
716# import Python package
817causarray <- import(" causarray" )
@@ -28,15 +37,12 @@ run_causarray <- function(Y, W, A, r, update_disp_glm=TRUE, W_A='full', ...){
2837 # confounder adjustment
2938 if (r > 0 ){
3039 X <- cbind(W , A )
31- res_gcate <- causarray $ fit_gcate(Y , X , r , num_d = a , offset = offset , ... )
32- # res is a list of 2 matrices:
33- # A1 = [X, Z] and A2 = [B, Gamma] where X = [W, A]
34- names(res_gcate ) <- c(' A1' , ' A2' , ' info' , ' A01' , ' A02' )
35- Wp <- res_gcate [[1 ]]
40+ res_gcate <- causarray $ fit_gcate(Y , W , A , r , offset = offset , ... )[[2 ]]
41+ Wp <- res_gcate $ X_U
3642 Wp <- Wp [,- c((d + 1 ): (d + a ))] # remove the column of A
37- disp_glm <- res_gcate [[ 3 ]] $ kwargs_glm $ nuisance
43+ disp_glm <- res_gcate $ kwargs_glm $ nuisance
3844 Z <- Wp [,(d + 1 ): ncol(Wp )]
39- offset <- log(res_gcate [[ 3 ]] $ kwargs_glm $ size_factor )
45+ offset <- log(res_gcate $ kwargs_glm $ size_factor )
4046 }else {
4147 res_gcate <- NULL
4248 Wp <- W
@@ -58,7 +64,7 @@ run_causarray <- function(Y, W, A, r, update_disp_glm=TRUE, W_A='full', ...){
5864 }
5965 res <- do.call(
6066 causarray $ LFC ,
61- modifyList(list (alpha = 0.1 , c = 0.1 , family = ' nb' , offset = offset , disp_glm = disp_glm ),
67+ modifyList(list (fdx_alpha = 0.1 , fdx_c = 0.1 , family = ' nb' , offset = offset , disp_glm = disp_glm ),
6268 list (' Y' = Y , ' W' = W_new , ' A' = A , ' W_A' = W_A , ... ))
6369 )
6470 causarray.df <- res [[1 ]]
@@ -76,7 +82,7 @@ run_causarray <- function(Y, W, A, r, update_disp_glm=TRUE, W_A='full', ...){
7682estimate_r_causarray <- function (Y , W , A , r_max , ... ){
7783 data <- prep_causarray(Y , W , A , ... )
7884 Y <- data [[1 ]]; W <- data [[2 ]]; A <- data [[3 ]]
79- df_r <- causarray $ estimate_r(Y , cbind( W , A ) , r_max , ... )
85+ df_r <- causarray $ estimate_r(Y , W , A , r_max , ... )
8086 df_r
8187}
8288
@@ -175,11 +181,10 @@ library(MASS)
175181# ' @param Y Pseudo-bulk gene expression matrix, individual x gene
176182# ' @param metadata Metadata dataframe with all covariates including column called trt with 1 = treatment/case and 0 = control (factor),
177183# ' numeric features should be centered and scaled, all covariates in this dataframe will be used
178- # ' @param raw Whether the input data is raw or preprocessed
179184# ' @param family What time of regression to fit? 'poisson' or 'nb' (negative binomial)
180185# ' @returns Dataframe containing Wilcoxon statistic, p-value, and adjusted p-value for each gene
181186# '
182- run_wilcoxon <- function (Y , metadata , raw = F , family = ' nb' ) {
187+ run_wilcoxon <- function (Y , metadata , family = ' nb' ) {
183188 .check_input(Y , metadata )
184189
185190 loglibsize <- log2(rowSums(Y ))
@@ -197,25 +202,38 @@ run_wilcoxon <- function(Y, metadata, raw=F, family='nb') {
197202 fit_regs <- function (gene , Y , metadata , family = ' poisson' ) {
198203 gene_data <- cbind(Y [,gene ], metadata )
199204 colnames(gene_data ) <- c(' gexp' , colnames(metadata ))
200- if (family == ' poisson' ){
201- reg <- glm(gexp ~ . - trt , family = " poisson" , data = gene_data )
202- }else if (family == ' nb' ){
203- reg <- glm.nb(gexp ~ . - trt , data = gene_data )
204- }
205- reg $ residuals
206- }
205+ tryCatch({
206+ if (family == ' poisson' ){
207+ reg <- glm(gexp ~ . - trt , family = " poisson" , data = gene_data )
208+ }else if (family == ' nb' ){
209+ reg <- glm.nb(gexp ~ . - trt , data = gene_data )
210+ }
211+ reg $ residuals
212+ }, error = function (e ) {
213+ rep(NA , nrow(metadata ))
214+ })
215+ }
207216 Y_resid <- do.call(" cbind" , mclapply(cycle ,fit_regs ,Y = Y ,metadata = metadata ,family = family ,mc.cores = 20 ,mc.preschedule = T ,mc.silent = T ))
208217 colnames(Y_resid ) <- cycle
209218
210219 cat(' Running Wilcoxon Tests... \n ' )
211220 wilcox_test <- function (gene , Y , trt ){
212- wilc <- wilcox.test(Y [trt == 1 , gene ], Y [trt == 0 , gene ])
213- list (' stat' = wilc $ statistic , ' pvalue' = wilc $ p.value )
221+ tryCatch({
222+ wilc <- wilcox.test(Y [trt == 1 , gene ], Y [trt == 0 , gene ])
223+ list (' stat' = wilc $ statistic , ' pvalue' = wilc $ p.value )
224+ }, error = function (e ) {
225+ list (' stat' = NaN , ' pvalue' = NaN )
226+ })
214227 }
215228 test <- mclapply(cycle ,wilcox_test ,Y = Y_resid ,trt = metadata $ trt ,mc.cores = 20 ,mc.preschedule = T ,mc.silent = T )
216229 wilc_df <- rbindlist(test )
217230 wilc_df <- as.data.frame(wilc_df )
218- wilc_df [' padj' ] <- qvalue(wilc_df [' pvalue' ])$ qvalues
231+ # Remove NaN values before computing q-values, then put them back
232+ valid_idx <- ! is.nan(wilc_df $ pvalue )
233+ wilc_df $ padj <- rep(NaN , nrow(wilc_df ))
234+ if (sum(valid_idx ) > 0 ) {
235+ wilc_df $ padj [valid_idx ] <- qvalue(wilc_df $ pvalue [valid_idx ])$ qvalues
236+ }
219237 rownames(wilc_df ) <- cycle
220238 t1 <- Sys.time()
221239 print(t1 - t0 )
@@ -359,9 +377,21 @@ run_cocoa <- function(sc, metadata, indvs = NULL, cocoAWriteName='simu/tmp'){
359377 A <- metadata $ trt
360378 if (is.null(indvs )){
361379 indvs <- 1 : dim(sc )[1 ]
380+ # indvs <- c()
381+ # indv <- 1
382+ # prev_trt <- A[1]
383+ # for (trt_i in A) {
384+ # if (trt_i != prev_trt) {
385+ # indv <- indv + 1
386+ # prev_trt = trt_i
387+ # }
388+ # # indvs <- c(indvs, paste0('indv', indv))
389+ # indvs <- c(indvs, indv)
390+ # }
362391 }
363392
364- cell2indv <- data.frame (cell = 1 : dim(sc )[1 ], indv = indvs )
393+ cell2indv <- data.frame (cell = 1 : dim(sc )[1 ],# paste0('cell', 1:dim(sc)[1]),
394+ indv = indvs )
365395
366396 indv2trt <- data.frame (indv = indvs [! duplicated(indvs )],
367397 exp = A )
0 commit comments