Update v0.0.4

jaydu1 · jaydu1 · commit 6f26f5e773b3 · 2025-06-17T09:28:48.000-04:00
Add an option for unequal variance inference.
Add an option to pass a boolean mask that selects the samples used for propensity estimation.
diff --git a/causarray/DR_estimation.py b/causarray/DR_estimation.py
@@ -33,7 +33,7 @@ def _get_func_ps(ps_model, **kwargs):
 def cross_fitting(
     Y, A, X, X_A, family='poisson', K=1, glm_alpha=1e-4,
     ps_model='logistic', 
-    pi_hat=None, Y_hat=None, verbose=False, **kwargs):
+    Y_hat=None, pi_hat=None, mask=None, verbose=False, **kwargs):
     '''
     Cross-fitting for causal estimands.
 
@@ -55,10 +55,16 @@ def cross_fitting(
         The regularization parameter for the generalized linear model. The default is 1e-4.
     ps_model : str, optional
         The propensity score model. The default is 'logistic'.
-    pi_hat : array, optional
-        Propensity score of shape (n, a). The default is None.
+    
     Y_hat : array, optional
         Estimated potential outcome of shape (n, p, a, 2). The default is None.
+    pi_hat : array, optional
+        Propensity score of shape (n, a). The default is None.
+    mask : array, optional
+        Boolean mask of shape (n, a) for the treatment. When given, column j selects the samples
+        used to estimate the propensity score of treatment j; when None, the control samples and
+        the samples receiving treatment j are used.
+
     **kwargs : dict
         Additional arguments to pass to the model.
 
@@ -104,7 +110,12 @@ def cross_fitting(
             pi = np.zeros_like(A_test, dtype=float)
             for j in range(A.shape[1]):
                 i_case = (A_train[:,j] == 1.)
-                i_cells = i_ctrl | i_case
+
+                if mask is not None:
+                    i_cells = mask[:, j]
+                else:
+                    i_ctrl = (np.sum(A_train, axis=1) == 0.)
+                    i_cells = i_ctrl | i_case
 
                 if ps_model=='logistic' and XA_train.shape[1]==1 and np.all(XA_train==1):
                     prob = np.sum(i_case)/np.sum(i_cells)
diff --git a/causarray/DR_learner.py b/causarray/DR_learner.py
@@ -65,19 +65,16 @@ def compute_causal_estimand(
     '''
     reset_random_seeds(random_state)
 
-    kwargs = {k:v for k,v in kwargs.items() if k not in 
-        ['kwargs_ls_1', 'kwargs_ls_2', 'kwargs_es_1', 'kwargs_es_2', 'c1', 'num_d']
-    }
-
+    # check the input data
     if isinstance(Y, pd.DataFrame):
         gene_names = Y.columns
         Y = Y.values
     else:
         gene_names = range(Y.shape[1])
+    Y = Y.astype('float')
     n, p = Y.shape
 
-    if len(A.shape) == 1:
-        A = A.reshape(-1,1)
+    if A.ndim == 1: A = A[:, None]
     if isinstance(A, pd.DataFrame):
         trt_names = A.columns
         A = A.values
@@ -97,7 +94,10 @@ def compute_causal_estimand(
         if len(mask.shape) == 1: mask = mask.reshape(-1,1)
         if mask.shape != A.shape:
             raise ValueError('Mask must have the same shape as the treatment matrix')
-    
+
+    kwargs = {k:v for k,v in kwargs.items() if k not in 
+        ['kwargs_ls_1', 'kwargs_ls_2', 'kwargs_es_1', 'kwargs_es_2', 'c1', 'num_d']
+    }
 
     if verbose:
         d_A = W_A.shape[1]
@@ -113,10 +113,9 @@ def compute_causal_estimand(
     else:
         offset = None
         size_factors = np.ones(n)
-
-    Y = Y.astype('float')
+    
     Y_hat, pi_hat = cross_fitting(Y, A, W, W_A, family=family, offset=offset, 
-        Y_hat=Y_hat, pi_hat=pi_hat, random_state=random_state, verbose=verbose, **kwargs)
+        Y_hat=Y_hat, pi_hat=pi_hat, mask=mask, random_state=random_state, verbose=verbose, **kwargs)
     pi_hat = pi_hat.reshape(*A.shape)
 
     if verbose: pprint.pprint('Estimating AIPW mean...')
@@ -201,6 +200,9 @@ def LFC(
         Boolean mask of shape (n, a) for the treatment, indicating which samples are used for 
         the estimation of the estimand. This does not affect the estimation of pseudo-outcomes
         and propensity scores.
+    usevar : str
+        The method to use for estimating the variance of treatment effects. 
+        Options are 'pooled' (default) or 'unequal'.
     
     thres_min : float
         The minimum threshold for the treatment effect.
diff --git a/causarray/__about__.py b/causarray/__about__.py
@@ -1 +1 @@
-__version__ = "0.0.3"
+__version__ = "0.0.4"
diff --git a/causarray/gcate.py b/causarray/gcate.py
@@ -82,7 +82,8 @@ def fit_gcate(Y, X, A, r, family='nb', disp_glm=None, disp_family=None, offset=T
     kwargs : dict
         Additional keyword arguments.
     '''
-
+    if X.ndim == 1: X = X[:, None]
+    if A.ndim == 1: A = A[:, None]
     X = np.hstack((X, A))
     a = A.shape[1]
     Y, kwargs_glm, lam1 = _check_input(Y, X, family, disp_glm, disp_family, offset, c1, **kwargs)    
@@ -195,6 +196,8 @@ def estimate_r(Y, X, A, r_max, c=1.,
     df_r : DataFrame
         Results of the number of latent factors.
     '''
+    if X.ndim == 1: X = X[:, None]
+    if A.ndim == 1: A = A[:, None]
     a, d = A.shape[1], X.shape[1]
     X = np.hstack((X, A))    
     n, p = Y.shape
diff --git a/causarray/gcate_glm.py b/causarray/gcate_glm.py
@@ -41,15 +41,29 @@ def fit_glm(Y, X, A=None, family='gaussian', disp_family='poisson',
     A : array
         n x 1 vector of treatments or None
     family : str
-        family of GLM to fit, can be one of: 'gaussian', 'poisson', 'nb'
+        Family of GLM to fit, can be one of: 'gaussian', 'poisson', 'nb'
     disp_glm : array or None
-        dispersion parameter for negative binomial GLM
-    return_df : bool
-        whether to return results as DataFrame
-    impute : bool
-        whether to impute potential outcomes and get predicted values
+        Dispersion parameter for negative binomial GLM.
+    impute : bool or None
+        Whether to impute missing values in Y.        
     offset : bool
-        whether to use log of sum of Y as offset
+        Whether to use log of sum of Y as offset.
+    shrinkage : bool
+        Whether to use regularized GLM.
+    alpha : float
+        Regularization parameter for regularized GLM.
+    maxiter : int
+        Maximum number of iterations for GLM fitting.
+    thres_disp : float
+        Threshold for dispersion parameter for negative binomial GLM.
+    n_jobs : int
+        Number of jobs to run in parallel.
+    random_state : int
+        Random seed for reproducibility.
+    verbose : bool
+        Whether to print progress messages.
+    kwargs : dict
+        Additional arguments to pass to GLM fitting.
 
     Returns
     -------
diff --git a/causarray/utils.py b/causarray/utils.py
@@ -49,6 +49,8 @@ def prep_causarray_data(Y, A, X=None, X_A=None, intercept=True):
     Y = np.minimum(Y, np.round(np.quantile(np.max(Y, 0), 0.999)))
     if not isinstance(A, pd.DataFrame):
         A = np.asarray(A)
+    if A.ndim == 1:
+        A = A[:, None]
 
     X = np.zeros((Y.shape[0], 0)) if X is None else np.asarray(X)        
     X_A = X if X_A is None else np.asarray(X_A)

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.0.3"`
	`1`	`+__version__ = "0.0.4"`