Skip to content

Commit 3a27de2

Browse files

Committed: Update estimate_r function

1 parent: ad7c608 · commit 3a27de2

5 files changed

Lines changed: 42 additions & 38 deletions

File tree

causarray/DR_estimation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def cross_fitting(
6969
pi_hat : array
7070
Estimated propensity score.
7171
'''
72-
func_ps, params_ps = _get_func_ps(ps_model, verbose=verbose, **kwargs)
72+
func_ps, params_ps = _get_func_ps(ps_model, verbose=False, **kwargs)
7373
params_glm = _filter_params(fit_glm, {**kwargs, 'verbose': verbose})
7474

7575
if verbose:

causarray/__about__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.0.2"
1+
__version__ = "0.0.3"

causarray/gcate.py

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -195,8 +195,8 @@ def estimate_r(Y, X, A, r_max, c=1.,
195195
df_r : DataFrame
196196
Results of the number of latent factors.
197197
'''
198-
X = np.hstack((X, A))
199198
a, d = A.shape[1], X.shape[1]
199+
X = np.hstack((X, A))
200200
n, p = Y.shape
201201

202202
Y, kwargs_glm, _ = _check_input(Y, X, family, disp_glm, disp_family, offset, None, **kwargs)
@@ -210,28 +210,37 @@ def estimate_r(Y, X, A, r_max, c=1.,
210210
r_list = np.arange(1, int(r_max)+1)
211211
else:
212212
r_list = np.array(r_max, dtype=int)
213-
214-
for r in r_list:
215-
res_1, res_2 = estimate(Y, X, r, a,
216-
0, kwargs_glm, kwargs_ls_1, kwargs_es_1, kwargs_ls_2, kwargs_es_2, **kwargs)
217-
A01, A02, A1, A2 = res_1['X_U'], res_1['B_Gamma'], res_2['X_U'], res_2['B_Gamma']
218-
219-
logh = log_h(Y, family, nuisance)
220-
221-
if r==1:
222-
ll = 2 * (
223-
nll(Y, A01, A02, family, nuisance, size_factor) / p
224-
- np.sum(logh) / (n*p) )
225-
nu = (d+a) * np.maximum(n,p) * np.log(n * p / np.maximum(n,p)) / (n*p)
226-
jic = ll + c * nu
227-
res.append([0, ll, nu, jic])
228-
213+
r_max = np.max(r_list)
214+
215+
# Estimate the residual deviance
216+
res_glm = fit_glm(Y, X, offset=np.log(size_factor[:,0]), family=family, disp_glm=nuisance[0], maxiter=100, verbose=False)
217+
u, s, vt = svds(res_glm[-1], k=r_max)
218+
if u.shape[1]<r_max:
219+
raise ValueError(f'The number of latent factors is larger than the rank of deviance residuals ({u.shape[1]}). Try to decrease the value of r.')
220+
Q, _ = sp.linalg.qr(X, mode='economic')
221+
P1 = np.identity(n) - Q @ Q.T
222+
P1 = P1.astype(type_f)
223+
A1 = np.c_[X, P1 @ u]
224+
225+
logh = log_h(Y, family, nuisance)
226+
ll = 2 * (
227+
nll(Y, X, res_glm[0], family, nuisance, size_factor) / p
228+
- np.sum(logh) / (n*p) )
229+
nu = (d+a) * np.maximum(n,p) * np.log(n * p / np.maximum(n,p)) / (n*p)
230+
jic = ll + c * nu
231+
res.append([0, ll, nu, jic])
232+
233+
for r in r_list[::-1]:
234+
_, res_2 = estimate(Y, X, r, a,
235+
0, kwargs_glm, kwargs_ls_1, kwargs_es_1, kwargs_ls_2, kwargs_es_2, A=A1[:,:d+a+r], **kwargs)
236+
A1, A2 = res_2['X_U'], res_2['B_Gamma']
237+
229238
ll = 2 * (
230239
nll(Y, A1, A2, family, nuisance, size_factor) / p
231240
- np.sum(logh) / (n*p) )
232241
nu = (d + a + r) * np.maximum(n,p) * np.log(n * p / np.maximum(n,p)) / (n*p)
233242
jic = ll + c * nu
234243
res.append([r, ll, nu, jic])
235244

236-
df_r = pd.DataFrame(res, columns=['r', 'deviance', 'nu', 'JIC'])
245+
df_r = pd.DataFrame(res, columns=['r', 'deviance', 'nu', 'JIC']).sort_values(by='r')
237246
return df_r

causarray/gcate_glm.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -156,8 +156,7 @@ def fit_model(j, Y, X, offsets, family, disp, impute, alpha):
156156

157157

158158
results = Parallel(n_jobs=n_jobs)(delayed(fit_model)(
159-
j, Y, X, offsets, family, disp_glm, impute, alpha) for j in tqdm(range(Y.shape[1])))
160-
pprint.pprint('Fitting GLM done.')
159+
j, Y, X, offsets, family, disp_glm, impute, alpha) for j in tqdm(range(Y.shape[1]), disable=not verbose))
161160
if verbose: pprint.pprint('Fitting GLM done.')
162161

163162
B, Yhat_0, Yhat_1, resid_deviance = zip(*results)

causarray/gcate_opt.py

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -208,39 +208,35 @@ def alter_min(
208208
a = d
209209

210210
# initialization for Theta = A @ B^T
211-
if A is None or B is None:
211+
if A is None:
212212
if verbose:
213213
pprint.pprint('Estimating initial latent variables with GLMs...')
214-
res_glm = fit_glm(Y, X, offset=np.log(size_factor[:,0]), family=family, disp_glm=nuisance[0], maxiter=100)
214+
res_glm = fit_glm(Y, X, offset=np.log(size_factor[:,0]), family=family, disp_glm=nuisance[0], maxiter=100, verbose=verbose)
215215
u, s, vt = svds(res_glm[-1], k=r)
216216

217217
if u.shape[1]<r:
218218
raise ValueError(f'The number of latent factors is larger than the rank of deviance residuals ({u.shape[1]}). Try to decrease the value of r.')
219+
220+
A = np.c_[X, P1 @ u]
221+
else:
222+
assert A.shape[1] == d+r
219223

224+
if B is None:
220225
if verbose:
221226
pprint.pprint('Estimating initial coefficients with GLMs...')
222-
A = np.c_[X, P1 @ u]
223-
B = fit_glm(Y, A, offset=np.log(size_factor[:,0]), family=family, disp_glm=nuisance[0], maxiter=100)[0]
227+
228+
B = fit_glm(Y, A, offset=np.log(size_factor[:,0]), family=family, disp_glm=nuisance[0], maxiter=100, verbose=verbose)[0]
224229

225230
E = A[:, -r:] @ B[:, -r:].T
226231
u, s, vh = sp.sparse.linalg.svds(E, k=r)
227232
A[:, d:] = u * s[None,:]**(1/2)
228233
B[:, d:] = vh.T * s[None,:]**(1/2)
229234
del E, u, s, vh
230235

231-
# if offset==1:
232-
# scale = np.sqrt(np.median(np.abs(X[:,0])))
233-
# B[:, :offset] = scale
234-
# A[:, :offset] /= scale
235236

236237
if P2 is not None:
237238
P2 = P2.astype(type_f)
238-
# E = A[:,d-a:] @ B[:,d-a:].T @ (np.identity(p) - P2)
239-
# u, s, vh = sp.sparse.linalg.svds(E, k=r)
240239
B[:, d-a:d] = P2 @ B[:, d-a:d]
241-
# A[:, d:] = u * s[None,:]**(1/2)
242-
# B[:, d:] = vh.T * s[None,:]**(1/2)
243-
# del E, u, s, vh
244240

245241

246242
Y = Y.astype(type_f)
@@ -265,10 +261,10 @@ def alter_min(
265261
kwargs_ls['alpha'] = kwargs_ls['alpha']
266262
if verbose:
267263
pprint.pprint({'kwargs_glm':kwargs_glm,'kwargs_ls':kwargs_ls,'kwargs_es':kwargs_es}, compact=True)
268-
264+
pprint.pprint(f'Fitting GCATE (step {1 if P1 is None else 2})...')
269265
hist = [func_val_pre]
270266
es = Early_Stopping(**kwargs_es)
271-
with tqdm(np.arange(kwargs_es['max_iters'])) as pbar:
267+
with tqdm(np.arange(kwargs_es['max_iters']), disable=not verbose) as pbar:
272268
for t in pbar:
273269
func_val, A, B = update(
274270
Y, A, B, d, weights, P1, P2,
@@ -281,7 +277,7 @@ def alter_min(
281277
pprint.pprint('Encountered large or infinity values. Try to decrease the value of C for the norm constraints.')
282278
break
283279
elif es(func_val):
284-
pbar.set_postfix_str('Early stopped.' + es.info)
280+
pbar.set_postfix_str('Early stopped. ' + es.info)
285281
pbar.close()
286282
break
287283
else:

0 commit comments

Comments (0)