Ignore covariates in binary WGR transform for linear response (#311)

* Ignore covars for linear response in binary glowgr
  Signed-off-by: Karen Feng <[email protected]>
* Clarify docs
  Signed-off-by: Karen Feng <[email protected]>
* Use warnings lib
  Signed-off-by: Karen Feng <[email protected]>
projectglow · Dec 2, 2020 · 8af47ee · 8af47ee
1 parent 1f3fecd
commit 8af47ee
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 10 deletions.
diff --git a/python/glow/wgr/linear_model/logistic_model.py b/python/glow/wgr/linear_model/logistic_model.py
@@ -19,6 +19,7 @@
 import pyspark.sql.functions as f
 from typeguard import typechecked
 from typing import Any, Dict, List
+import warnings
 from glow.logging import record_hls_event
 
 
@@ -153,7 +154,7 @@ def reduce_block_matrix(self, blockdf: DataFrame, labeldf: pd.DataFrame,
             validation routine.
             covdf : Pandas DataFrame containing covariates to be included in every model in the stacking
                 ensemble.  The covariates should not include an explicit intercept term, as one will be
-                added automatically.
+                added automatically. Covariates will be ignored for a linear response.
             response : String specifying what transformation to apply ("linear" or "sigmoid")
 
         Returns:
@@ -163,6 +164,9 @@ def reduce_block_matrix(self, blockdf: DataFrame, labeldf: pd.DataFrame,
         transform_key_pattern = ['sample_block', 'label']
 
         if response == 'linear':
+            if not covdf.empty:
+                warnings.warn('Ignoring covariates for linear response')
+                covdf = pd.DataFrame({})
             transform_udf = pandas_udf(
                 lambda key, pdf: apply_model(key, transform_key_pattern, pdf, labeldf,
                                              sample_blocks, self.alphas, covdf),
@@ -208,7 +212,7 @@ def transform(self,
             validation routine.
             covdf : Pandas DataFrame containing covariates to be included in every model in the stacking
                 ensemble (optional). The covariates should not include an explicit intercept term, as one will be
-                added automatically.
+                added automatically. Covariates will be ignored for a linear response.
             response : String specifying the desired output.  Can be 'linear' to specify the direct output of the linear
                 WGR model (default) or 'sigmoid' to specify predicted label probabilities.
 
@@ -247,7 +251,7 @@ def transform_loco(self,
             validation routine.
             covdf : Pandas DataFrame containing covariates to be included in every model in the stacking
                 ensemble (optional). The covariates should not include an explicit intercept term, as one will be
-                added automatically.
+                added automatically. Covariates will be ignored for a linear response.
             response : String specifying the desired output.  Can be 'linear' to specify the direct output of the linear
                 WGR model (default) or 'sigmoid' to specify predicted label probabilities.
             chromosomes : List of chromosomes for which to generate a prediction (optional). If not provided, the
@@ -286,7 +290,7 @@ def fit_transform(self,
             sample_blocks : Dict containing a mapping of sample_block ID to a list of corresponding sample IDs
             covdf : Pandas DataFrame containing covariates to be included in every model in the stacking
                 ensemble (optional). The covariates should not include an explicit intercept term, as one will be
-                added automatically.
+                added automatically. Covariates will be ignored during the transformation step for a linear response.
             response : String specifying the desired output.  Can be 'linear' to specify the direct output of the linear
                 WGR model (default) or 'sigmoid' to specify predicted label probabilities.
 
@@ -295,8 +299,4 @@ def fit_transform(self,
             rows are indexed by sample ID and the columns by label. The column types are float64.
         """
         modeldf, cvdf = self.fit(blockdf, labeldf, sample_blocks, covdf)
-        if response == 'linear':
-            return self.transform(blockdf, labeldf, sample_blocks, modeldf, cvdf, pd.DataFrame({}),
-                                  response)
-        else:
-            return self.transform(blockdf, labeldf, sample_blocks, modeldf, cvdf, covdf, response)
+        return self.transform(blockdf, labeldf, sample_blocks, modeldf, cvdf, covdf, response)
diff --git a/python/glow/wgr/linear_model/tests/test_logistic_regression.py b/python/glow/wgr/linear_model/tests/test_logistic_regression.py
@@ -269,7 +269,7 @@ def test_logistic_regression_transform(spark):
 
     logreg = LogisticRegression(alpha_values)
     modeldf, cvdf = logreg.fit(lvl1df, labeldf, sample_blocks, covdf)
-    wgr_cov_df = logreg.transform(lvl1df, labeldf, sample_blocks, modeldf, cvdf)
+    wgr_cov_df = logreg.transform(lvl1df, labeldf, sample_blocks, modeldf, cvdf, covdf)
     wgr_cov_glow = wgr_cov_df[test_label].to_numpy()
 
     assert (np.allclose(np.array(test_values['wgr_cov']), wgr_cov_glow))