From 4648c7def9b17a24a477b811e54cdb095e1a6a2c Mon Sep 17 00:00:00 2001 From: lstimpfl Date: Sun, 9 Jun 2024 18:48:31 +0200 Subject: [PATCH 01/15] type valid separation methods --- pyfixest/estimation/FixestMulti_.py | 2 +- pyfixest/estimation/fepois_.py | 111 +++++++++++++++++++- tests/data/ppmlhdfe_separation_example1.csv | 13 +++ tests/data/ppmlhdfe_separation_example2.csv | 13 +++ tests/test_poisson.py | 10 ++ 5 files changed, 144 insertions(+), 5 deletions(-) create mode 100644 tests/data/ppmlhdfe_separation_example1.csv create mode 100644 tests/data/ppmlhdfe_separation_example2.csv diff --git a/pyfixest/estimation/FixestMulti_.py b/pyfixest/estimation/FixestMulti_.py index f76186ef..58abcf4e 100644 --- a/pyfixest/estimation/FixestMulti_.py +++ b/pyfixest/estimation/FixestMulti_.py @@ -332,7 +332,7 @@ def _estimate_all_models( na_separation: list[int] = [] if fe is not None: - na_separation = _check_for_separation(Y=Y, fe=fe) + na_separation = _check_for_separation(Y=Y, X=X, fe=fe) if na_separation: warnings.warn( f"{str(len(na_separation))} observations removed because of separation." diff --git a/pyfixest/estimation/fepois_.py b/pyfixest/estimation/fepois_.py index 4b53fe14..bee8a081 100644 --- a/pyfixest/estimation/fepois_.py +++ b/pyfixest/estimation/fepois_.py @@ -1,5 +1,6 @@ import warnings -from typing import Optional, Union +from importlib import import_module +from typing import Callable, Optional, Union import numpy as np import pandas as pd @@ -312,25 +313,75 @@ def predict( return y_hat -def _check_for_separation(Y: pd.DataFrame, fe: pd.DataFrame) -> list[int]: +def _check_for_separation( + Y: pd.DataFrame, + X: pd.DataFrame, + fe: pd.DataFrame, + methods: list[str] | None = None, +) -> list[int]: """ Check for separation. Check for separation of Poisson Regression. For details, see the ppmlhdfe - documentation on separation checks. Currently, only the "fe" check is implemented. + documentation on separation checks. Parameters ---------- Y : pd.DataFrame Dependent variable. + X : pd.DataFrame + Independent variables. fe : pd.DataFrame Fixed effects. + method: list[str], optional + Method to check for separation. Executes ``fe`` and ``ir`` by default. Returns ------- list List of indices of observations that are removed due to separation. """ + valid_methods: dict[ + str, Callable[[pd.DataFrame, pd.DataFrame, pd.DataFrame], set[int]] + ] = { + "fe": _check_for_separation_fe, + "ir": _check_for_separation_ir, + } + if methods is None: + methods = list(valid_methods) + invalid_methods = [method for method in methods if method not in valid_methods] + if invalid_methods: + raise ValueError( + f"Invalid separation method. Expecting {valid_methods}. Got {invalid_methods}" + ) + + separation_na: set[int] = set() + for method in methods: + separation_na = separation_na.union(valid_methods[method](Y, X, fe)) + + return list(separation_na) + + +def _check_for_separation_fe( + Y: pd.DataFrame, X: pd.DataFrame, fe: pd.DataFrame +) -> set[int]: + """ + Check for separation using the "fe" check. + + Parameters + ---------- + Y : pd.DataFrame + Dependent variable. + X : pd.DataFrame + Independent variables. + fe : pd.DataFrame + Fixed effects. + + Returns + ------- + set + Set of indices of observations that are removed due to separation. + """ separation_na: set[int] = set() if not (Y > 0).all(axis=0).all(): Y_help = (Y > 0).astype(int).squeeze() @@ -353,7 +404,59 @@ def _check_for_separation(Y: pd.DataFrame, fe: pd.DataFrame) -> list[int]: dropset = set(fe[x][fe_in_droplist].index) separation_na = separation_na.union(dropset) - return list(separation_na) + return separation_na + + +def _check_for_separation_ir( + Y: pd.DataFrame, X: pd.DataFrame, fe: pd.DataFrame +) -> set[int]: + """ + Check for separation using the "iterative rectifier" check. + + For details see http://arxiv.org/abs/1903.01633 + + Parameters + ---------- + Y : pd.DataFrame + Dependent variable. + X : pd.DataFrame + Independent variables. + fe : pd.DataFrame + Fixed effects. + + Returns + ------- + set + Set of indices of observations that are removed due to separation. + """ + tol = 1e-5 + maxiter = 10_000 + # lazy loading to avoid circular import + fixest_module = import_module("pyfixest.estimation") + feols = getattr(fixest_module, "feols") + + U = (Y == 0).astype(int) + N0 = (Y > 0).sum() + K = N0 / tol**2 + omega = U.where(U > 0, K) + data = pd.concat([U, X, fe], axis=1) + fml = f"U ~ {'+'.join(X.columns)}" + + iter = 0 + while iter < maxiter: + iter += 1 + # regress U on X + # TODO: check acceleration in ppmlhdfe's implementation: https://github.com/sergiocorreia/ppmlhdfe/blob/master/src/ppmlhdfe_separation_relu.mata#L135 + Uhat = feols(fml, data=data, weights=omega).predict() + Uhat = np.where(Uhat.abs() < tol, 0, Uhat) + if (Uhat >= 0).all(): + # all separated observations have been identified + break + + data["U"] = np.fmax(Uhat, 0) # rectified linear unit (ReLU) + + separation_na = set(Y[Uhat > 0].index) + return separation_na def _fepois_input_checks( diff --git a/tests/data/ppmlhdfe_separation_example1.csv b/tests/data/ppmlhdfe_separation_example1.csv new file mode 100644 index 00000000..7414bc79 --- /dev/null +++ b/tests/data/ppmlhdfe_separation_example1.csv @@ -0,0 +1,13 @@ +y,x1,x2,x3,x4 +0,0,0,2,10 +0,0,0,0,-2 +0,0,0,0,6 +0,0,0,4,5 +0,1,0,0,3 +2,0,0,0,3 +2,0,0,0,4 +2,0,0,-2,15 +2,0,0,0,-7 +4,0,0,-3,15 +6,-3,-3,0,4 +6,0,0,0,4 diff --git a/tests/data/ppmlhdfe_separation_example2.csv b/tests/data/ppmlhdfe_separation_example2.csv new file mode 100644 index 00000000..d66ee15f --- /dev/null +++ b/tests/data/ppmlhdfe_separation_example2.csv @@ -0,0 +1,13 @@ +y,x1,x2,x3,x4 +0,14,4,0,-12 +0,0,35,34,12 +0,0,3,0,17 +0,0,0,0,1 +0,0,0,-2,7 +0,0,25,0,12 +0,0,13,0,60 +0,0,15,0,7 +0,1,0,7,-24 +9,0,0,0,18 +4,2,0,6,-1 +2,1,0,0,-7 diff --git a/tests/test_poisson.py b/tests/test_poisson.py index bcf9aa27..fdae91c4 100644 --- a/tests/test_poisson.py +++ b/tests/test_poisson.py @@ -18,3 +18,13 @@ def test_separation(): UserWarning, match="2 observations removed because of separation." ): mod = fepois("Y ~ x | fe1", data=df, vcov="hetero") # noqa: F841 + + +# def test_separation_ir(): +# """Test iterative rectifier separation detection.""" +# fns = [ +# 'ppmlhdfe_separation_example1.csv', +# 'ppmlhdfe_separation_example2.csv', +# ] +# dfs = [pd.read_csv('data', fn) for fn in fns] +# raise NotImplementedError From abe079b1441d319bca13dc4c8534657292594715 Mon Sep 17 00:00:00 2001 From: lstimpfl Date: Sun, 9 Jun 2024 19:00:44 +0200 Subject: [PATCH 02/15] type separation method --- pyfixest/estimation/fepois_.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/pyfixest/estimation/fepois_.py b/pyfixest/estimation/fepois_.py index bee8a081..e378f969 100644 --- a/pyfixest/estimation/fepois_.py +++ b/pyfixest/estimation/fepois_.py @@ -1,6 +1,6 @@ import warnings from importlib import import_module -from typing import Callable, Optional, Union +from typing import Optional, Protocol, Union import numpy as np import pandas as pd @@ -341,9 +341,7 @@ def _check_for_separation( list List of indices of observations that are removed due to separation. """ - valid_methods: dict[ - str, Callable[[pd.DataFrame, pd.DataFrame, pd.DataFrame], set[int]] - ] = { + valid_methods: dict[str, _SeparationMethod] = { "fe": _check_for_separation_fe, "ir": _check_for_separation_ir, } @@ -357,11 +355,33 @@ def _check_for_separation( separation_na: set[int] = set() for method in methods: - separation_na = separation_na.union(valid_methods[method](Y, X, fe)) + separation_na = separation_na.union(valid_methods[method](Y=Y, X=X, fe=fe)) return list(separation_na) +class _SeparationMethod(Protocol): + def __call__(self, Y: pd.DataFrame, X: pd.DataFrame, fe: pd.DataFrame) -> set[int]: + """ + Check for separation using the "fe" check. + + Parameters + ---------- + Y : pd.DataFrame + Dependent variable. + X : pd.DataFrame + Independent variables. + fe : pd.DataFrame + Fixed effects. + + Returns + ------- + set + Set of indices of observations that are removed due to separation. + """ + ... + + def _check_for_separation_fe( Y: pd.DataFrame, X: pd.DataFrame, fe: pd.DataFrame ) -> set[int]: From 1c8498d05669fe2c74f338216c09e9aa0fa3db44 Mon Sep 17 00:00:00 2001 From: lstimpfl Date: Mon, 10 Jun 2024 20:19:34 +0200 Subject: [PATCH 03/15] add separation_check arg --- pyfixest/estimation/FixestMulti_.py | 10 ++++++++-- pyfixest/estimation/estimation.py | 6 ++++++ pyfixest/estimation/fepois_.py | 5 +++-- tests/data/ppmlhdfe_separation_example1.csv | 13 ------------- tests/data/ppmlhdfe_separation_example2.csv | 13 ------------- 5 files changed, 17 insertions(+), 30 deletions(-) delete mode 100644 tests/data/ppmlhdfe_separation_example1.csv delete mode 100644 tests/data/ppmlhdfe_separation_example2.csv diff --git a/pyfixest/estimation/FixestMulti_.py b/pyfixest/estimation/FixestMulti_.py index 58abcf4e..7cd9bf4f 100644 --- a/pyfixest/estimation/FixestMulti_.py +++ b/pyfixest/estimation/FixestMulti_.py @@ -32,7 +32,7 @@ def __init__( Parameters ---------- - data : panda.DataFrame + data : pandas.DataFrame The input DataFrame for the object. copy_data : bool Whether to copy the data or not. @@ -154,6 +154,7 @@ def _estimate_all_models( collin_tol: float = 1e-6, iwls_maxiter: int = 25, iwls_tol: float = 1e-08, + separation_check: list[str] | None = None, ) -> None: """ Estimate multiple regression models. @@ -174,6 +175,9 @@ def _estimate_all_models( iwls_tol : float, optional The tolerance level for the IWLS algorithm. Default is 1e-8. Only relevant for non-linear estimation strategies. + separation_check: list[str], optional + Only used in "fepois". Methods to identify and drop separated observations. + Either "fe" or "ir". Executes both by default. Returns ------- @@ -332,7 +336,9 @@ def _estimate_all_models( na_separation: list[int] = [] if fe is not None: - na_separation = _check_for_separation(Y=Y, X=X, fe=fe) + na_separation = _check_for_separation( + Y=Y, X=X, fe=fe, methods=separation_check + ) if na_separation: warnings.warn( f"{str(len(na_separation))} observations removed because of separation." diff --git a/pyfixest/estimation/estimation.py b/pyfixest/estimation/estimation.py index 763114a5..996e4bd4 100644 --- a/pyfixest/estimation/estimation.py +++ b/pyfixest/estimation/estimation.py @@ -354,6 +354,7 @@ def fepois( iwls_tol: float = 1e-08, iwls_maxiter: int = 25, collin_tol: float = 1e-10, + separation_check: list[str] | None = None, drop_intercept: bool = False, i_ref1=None, copy_data: bool = True, @@ -402,6 +403,10 @@ def fepois( collin_tol : float, optional Tolerance for collinearity check, by default 1e-10. + separation_check: list[str], optional + Methods to identify and drop separated observations. + Either "fe" or "ir". Executes both by default. + drop_intercept : bool, optional Whether to drop the intercept from the model, by default False. @@ -498,6 +503,7 @@ def fepois( iwls_tol=iwls_tol, iwls_maxiter=iwls_maxiter, collin_tol=collin_tol, + separation_check=separation_check, ) if fixest._is_multiple_estimation: diff --git a/pyfixest/estimation/fepois_.py b/pyfixest/estimation/fepois_.py index e378f969..1ac1ad99 100644 --- a/pyfixest/estimation/fepois_.py +++ b/pyfixest/estimation/fepois_.py @@ -334,7 +334,7 @@ def _check_for_separation( fe : pd.DataFrame Fixed effects. method: list[str], optional - Method to check for separation. Executes ``fe`` and ``ir`` by default. + Methods to check for separation. Either "fe" or "ir". Executes both by default. Returns ------- @@ -347,6 +347,7 @@ def _check_for_separation( } if methods is None: methods = list(valid_methods) + invalid_methods = [method for method in methods if method not in valid_methods] if invalid_methods: raise ValueError( @@ -363,7 +364,7 @@ def _check_for_separation( class _SeparationMethod(Protocol): def __call__(self, Y: pd.DataFrame, X: pd.DataFrame, fe: pd.DataFrame) -> set[int]: """ - Check for separation using the "fe" check. + Check for separation. Parameters ---------- diff --git a/tests/data/ppmlhdfe_separation_example1.csv b/tests/data/ppmlhdfe_separation_example1.csv deleted file mode 100644 index 7414bc79..00000000 --- a/tests/data/ppmlhdfe_separation_example1.csv +++ /dev/null @@ -1,13 +0,0 @@ -y,x1,x2,x3,x4 -0,0,0,2,10 -0,0,0,0,-2 -0,0,0,0,6 -0,0,0,4,5 -0,1,0,0,3 -2,0,0,0,3 -2,0,0,0,4 -2,0,0,-2,15 -2,0,0,0,-7 -4,0,0,-3,15 -6,-3,-3,0,4 -6,0,0,0,4 diff --git a/tests/data/ppmlhdfe_separation_example2.csv b/tests/data/ppmlhdfe_separation_example2.csv deleted file mode 100644 index d66ee15f..00000000 --- a/tests/data/ppmlhdfe_separation_example2.csv +++ /dev/null @@ -1,13 +0,0 @@ -y,x1,x2,x3,x4 -0,14,4,0,-12 -0,0,35,34,12 -0,0,3,0,17 -0,0,0,0,1 -0,0,0,-2,7 -0,0,25,0,12 -0,0,13,0,60 -0,0,15,0,7 -0,1,0,7,-24 -9,0,0,0,18 -4,2,0,6,-1 -2,1,0,0,-7 From d67db9950a34b5d06467add8057979b044904597 Mon Sep 17 00:00:00 2001 From: lstimpfl Date: Mon, 10 Jun 2024 20:25:12 +0200 Subject: [PATCH 04/15] update test_poisson --- tests/test_poisson.py | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/tests/test_poisson.py b/tests/test_poisson.py index fdae91c4..e3a4cfa0 100644 --- a/tests/test_poisson.py +++ b/tests/test_poisson.py @@ -7,24 +7,28 @@ def test_separation(): """Test separation detection.""" - y = np.array([0, 0, 0, 1, 2, 3]) - df1 = np.array(["a", "a", "b", "b", "b", "c"]) - df2 = np.array(["c", "c", "d", "d", "d", "e"]) - x = np.random.normal(0, 1, 6) - - df = pd.DataFrame({"Y": y, "fe1": df1, "fe2": df2, "x": x}) + example1 = pd.DataFrame.from_dict( + { + "Y": [0, 0, 0, 1, 2, 3], + "fe1": ["a", "a", "b", "b", "b", "c"], + "fe2": ["c", "c", "d", "d", "d", "e"], + "X": np.random.normal(0, 1, 6), + } + ) with pytest.warns( UserWarning, match="2 observations removed because of separation." ): - mod = fepois("Y ~ x | fe1", data=df, vcov="hetero") # noqa: F841 - + mod = fepois("Y ~ X | fe1", data=example1, vcov="hetero", method=["fe"]) # noqa: F841 -# def test_separation_ir(): -# """Test iterative rectifier separation detection.""" -# fns = [ -# 'ppmlhdfe_separation_example1.csv', -# 'ppmlhdfe_separation_example2.csv', -# ] -# dfs = [pd.read_csv('data', fn) for fn in fns] -# raise NotImplementedError + example2 = pd.DataFrame.from_dict( + { + "Y": [0, 0, 0, 1, 2, 3], + "X1": [2, -1, 0, 0, 5, 6], + "X2": [-1, 2, 0, 0, -10, -12], + } + ) + with pytest.warns( + UserWarning, match="1 observations removed because of separation." + ): + mod = fepois("Y ~ X1 + X2", data=example2, vcov="hetero", method=["ir"]) # noqa: F841 From 09fc032f7de3d5dd91b9df8a48b3999e5d813222 Mon Sep 17 00:00:00 2001 From: lstimpfl Date: Mon, 10 Jun 2024 21:29:44 +0200 Subject: [PATCH 05/15] fix bugs --- pyfixest/estimation/FixestMulti_.py | 19 ++++++++++--------- pyfixest/estimation/fepois_.py | 19 ++++++++++--------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/pyfixest/estimation/FixestMulti_.py b/pyfixest/estimation/FixestMulti_.py index 7cd9bf4f..1bf2d61b 100644 --- a/pyfixest/estimation/FixestMulti_.py +++ b/pyfixest/estimation/FixestMulti_.py @@ -335,17 +335,18 @@ def _estimate_all_models( # check for separation and drop separated variables na_separation: list[int] = [] - if fe is not None: - na_separation = _check_for_separation( - Y=Y, X=X, fe=fe, methods=separation_check + # if fe is not None: + na_separation = _check_for_separation( + Y=Y, X=X, fe=fe, methods=separation_check + ) + if na_separation: + warnings.warn( + f"{str(len(na_separation))} observations removed because of separation." ) - if na_separation: - warnings.warn( - f"{str(len(na_separation))} observations removed because of separation." - ) - Y.drop(na_separation, axis=0, inplace=True) - X.drop(na_separation, axis=0, inplace=True) + Y.drop(na_separation, axis=0, inplace=True) + X.drop(na_separation, axis=0, inplace=True) + if fe is not None: fe.drop(na_separation, axis=0, inplace=True) Y_array, X_array = (x.to_numpy() for x in [Y, X]) diff --git a/pyfixest/estimation/fepois_.py b/pyfixest/estimation/fepois_.py index 1ac1ad99..565bc02b 100644 --- a/pyfixest/estimation/fepois_.py +++ b/pyfixest/estimation/fepois_.py @@ -341,6 +341,7 @@ def _check_for_separation( list List of indices of observations that are removed due to separation. """ + # import pdb; pdb.set_trace() valid_methods: dict[str, _SeparationMethod] = { "fe": _check_for_separation_fe, "ir": _check_for_separation_ir, @@ -404,7 +405,7 @@ def _check_for_separation_fe( Set of indices of observations that are removed due to separation. """ separation_na: set[int] = set() - if not (Y > 0).all(axis=0).all(): + if fe is not None and not (Y > 0).all(axis=0).all(): Y_help = (Y > 0).astype(int).squeeze() # loop over all elements of fe @@ -450,26 +451,26 @@ def _check_for_separation_ir( set Set of indices of observations that are removed due to separation. """ - tol = 1e-5 + tol = 1e-8 maxiter = 10_000 # lazy loading to avoid circular import fixest_module = import_module("pyfixest.estimation") feols = getattr(fixest_module, "feols") - U = (Y == 0).astype(int) - N0 = (Y > 0).sum() + U = (Y == 0).astype(int).squeeze().rename("U") + N0 = (Y > 0).squeeze().sum() K = N0 / tol**2 - omega = U.where(U > 0, K) - data = pd.concat([U, X, fe], axis=1) - fml = f"U ~ {'+'.join(X.columns)}" + omega = U.where(U > 0, K).rename("omega") + data = pd.concat([U, X, omega], axis=1) + fml = f"U ~ {'+'.join(X.columns[X.columns!='Intercept'])}" iter = 0 while iter < maxiter: iter += 1 # regress U on X # TODO: check acceleration in ppmlhdfe's implementation: https://github.com/sergiocorreia/ppmlhdfe/blob/master/src/ppmlhdfe_separation_relu.mata#L135 - Uhat = feols(fml, data=data, weights=omega).predict() - Uhat = np.where(Uhat.abs() < tol, 0, Uhat) + Uhat = feols(fml, data=data, weights="omega").predict() + Uhat = np.where(np.abs(Uhat) < tol, 0, Uhat) if (Uhat >= 0).all(): # all separated observations have been identified break From d14e6bb29b0f7de416440f59c334c61e62c50857 Mon Sep 17 00:00:00 2001 From: lstimpfl Date: Mon, 10 Jun 2024 21:37:58 +0200 Subject: [PATCH 06/15] remove unused variable --- tests/test_poisson.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_poisson.py b/tests/test_poisson.py index e3a4cfa0..a4deb964 100644 --- a/tests/test_poisson.py +++ b/tests/test_poisson.py @@ -19,7 +19,7 @@ def test_separation(): with pytest.warns( UserWarning, match="2 observations removed because of separation." ): - mod = fepois("Y ~ X | fe1", data=example1, vcov="hetero", method=["fe"]) # noqa: F841 + fepois("Y ~ X | fe1", data=example1, vcov="hetero", separation_check=["fe"]) # noqa: F841 example2 = pd.DataFrame.from_dict( { @@ -28,7 +28,8 @@ def test_separation(): "X2": [-1, 2, 0, 0, -10, -12], } ) + with pytest.warns( UserWarning, match="1 observations removed because of separation." ): - mod = fepois("Y ~ X1 + X2", data=example2, vcov="hetero", method=["ir"]) # noqa: F841 + fepois("Y ~ X1 + X2", data=example2, vcov="hetero", separation_check=["ir"]) # noqa: F841 From 7ce587c8399b365fd24b7cc955d0c6560fedabc2 Mon Sep 17 00:00:00 2001 From: lstimpfl Date: Mon, 1 Jul 2024 15:56:20 +0200 Subject: [PATCH 07/15] update test case to avoid perfect collinearity --- pyfixest/estimation/fepois_.py | 30 ++++++++++++++++++++++-------- tests/test_poisson.py | 4 ++-- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/pyfixest/estimation/fepois_.py b/pyfixest/estimation/fepois_.py index 565bc02b..4a02715a 100644 --- a/pyfixest/estimation/fepois_.py +++ b/pyfixest/estimation/fepois_.py @@ -341,7 +341,6 @@ def _check_for_separation( list List of indices of observations that are removed due to separation. """ - # import pdb; pdb.set_trace() valid_methods: dict[str, _SeparationMethod] = { "fe": _check_for_separation_fe, "ir": _check_for_separation_ir, @@ -430,7 +429,11 @@ def _check_for_separation_fe( def _check_for_separation_ir( - Y: pd.DataFrame, X: pd.DataFrame, fe: pd.DataFrame + Y: pd.DataFrame, + X: pd.DataFrame, + fe: pd.DataFrame, + tol: float = 1e-4, + maxiter: int = 100, ) -> set[int]: """ Check for separation using the "iterative rectifier" check. @@ -445,24 +448,36 @@ def _check_for_separation_ir( Independent variables. fe : pd.DataFrame Fixed effects. + tol : float + Tolerance to detect separated observation. Defaults to 1e-4. + maxiter : int + Maximum number of iterations. Defaults to 100. Returns ------- set Set of indices of observations that are removed due to separation. """ - tol = 1e-8 - maxiter = 10_000 - # lazy loading to avoid circular import + # lazy load and avoid circular import fixest_module = import_module("pyfixest.estimation") feols = getattr(fixest_module, "feols") U = (Y == 0).astype(int).squeeze().rename("U") N0 = (Y > 0).squeeze().sum() K = N0 / tol**2 - omega = U.where(U > 0, K).rename("omega") + omega = pd.Series(np.where(Y.squeeze() > 0, K, 1), name="omega") data = pd.concat([U, X, omega], axis=1) - fml = f"U ~ {'+'.join(X.columns[X.columns!='Intercept'])}" + fml = "U" + if X is not None and not X.empty: + fml += f" ~ {' + '.join(X.columns[X.columns!='Intercept'])}" + if fe is not None and not fe.empty: + fml += f" | {' + '.join(fe.columns)}" + + separation_na: set[int] = set() + if (Y.squeeze() > 0).all(): + # no boundary sample, can exit + # TODO: check if algorithm can be sped up based on boundary sample + return separation_na iter = 0 while iter < maxiter: @@ -474,7 +489,6 @@ def _check_for_separation_ir( if (Uhat >= 0).all(): # all separated observations have been identified break - data["U"] = np.fmax(Uhat, 0) # rectified linear unit (ReLU) separation_na = set(Y[Uhat > 0].index) diff --git a/tests/test_poisson.py b/tests/test_poisson.py index a4deb964..f2bd47fe 100644 --- a/tests/test_poisson.py +++ b/tests/test_poisson.py @@ -25,11 +25,11 @@ def test_separation(): { "Y": [0, 0, 0, 1, 2, 3], "X1": [2, -1, 0, 0, 5, 6], - "X2": [-1, 2, 0, 0, -10, -12], + "X2": [5, 10, 0, 0, -10, -12], } ) with pytest.warns( - UserWarning, match="1 observations removed because of separation." + UserWarning, match="2 observations removed because of separation." ): fepois("Y ~ X1 + X2", data=example2, vcov="hetero", separation_check=["ir"]) # noqa: F841 From b4ee73835bad5f304c795c77331da31443b02ca9 Mon Sep 17 00:00:00 2001 From: lstimpfl Date: Tue, 2 Jul 2024 15:41:33 +0200 Subject: [PATCH 08/15] improve rectifier --- pyfixest/estimation/fepois_.py | 51 ++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/pyfixest/estimation/fepois_.py b/pyfixest/estimation/fepois_.py index 4a02715a..c5957dda 100644 --- a/pyfixest/estimation/fepois_.py +++ b/pyfixest/estimation/fepois_.py @@ -334,7 +334,8 @@ def _check_for_separation( fe : pd.DataFrame Fixed effects. method: list[str], optional - Methods to check for separation. Either "fe" or "ir". Executes both by default. + Methods used to check for separation. One of fixed effects ("fe") or + iterative rectifier ("ir"). Executes all methods by default. Returns ------- @@ -351,7 +352,7 @@ def _check_for_separation( invalid_methods = [method for method in methods if method not in valid_methods] if invalid_methods: raise ValueError( - f"Invalid separation method. Expecting {valid_methods}. Got {invalid_methods}" + f"Invalid separation method. Expecting {list(valid_methods)}. Received {invalid_methods}" ) separation_na: set[int] = set() @@ -378,7 +379,7 @@ def __call__(self, Y: pd.DataFrame, X: pd.DataFrame, fe: pd.DataFrame) -> set[in Returns ------- set - Set of indices of observations that are removed due to separation. + Set of indices of separated observations. """ ... @@ -401,7 +402,7 @@ def _check_for_separation_fe( Returns ------- set - Set of indices of observations that are removed due to separation. + Set of indices of separated observations. """ separation_na: set[int] = set() if fe is not None and not (Y > 0).all(axis=0).all(): @@ -456,42 +457,58 @@ def _check_for_separation_ir( Returns ------- set - Set of indices of observations that are removed due to separation. + Set of indices of separated observations. """ # lazy load and avoid circular import fixest_module = import_module("pyfixest.estimation") feols = getattr(fixest_module, "feols") - - U = (Y == 0).astype(int).squeeze().rename("U") + # initialize variables + U = (Y == 0).astype(float).squeeze().rename("U") N0 = (Y > 0).squeeze().sum() K = N0 / tol**2 omega = pd.Series(np.where(Y.squeeze() > 0, K, 1), name="omega") - data = pd.concat([U, X, omega], axis=1) + # build formula fml = "U" if X is not None and not X.empty: fml += f" ~ {' + '.join(X.columns[X.columns!='Intercept'])}" if fe is not None and not fe.empty: + # TODO: can this rename be avoided? + fe.columns = fe.columns.str.replace("factorize", "").str.strip(r"\(|\)") fml += f" | {' + '.join(fe.columns)}" - + # construct data + data = pd.concat([U, X, fe, omega], axis=1) separation_na: set[int] = set() - if (Y.squeeze() > 0).all(): + is_interior = Y.squeeze() > 0 + if is_interior.all(): # no boundary sample, can exit - # TODO: check if algorithm can be sped up based on boundary sample return separation_na - iter = 0 - while iter < maxiter: - iter += 1 + iteration = 0 + has_converged = False + while iteration < maxiter: + iteration += 1 # regress U on X # TODO: check acceleration in ppmlhdfe's implementation: https://github.com/sergiocorreia/ppmlhdfe/blob/master/src/ppmlhdfe_separation_relu.mata#L135 Uhat = feols(fml, data=data, weights="omega").predict() - Uhat = np.where(np.abs(Uhat) < tol, 0, Uhat) + # update when within tolerance of zero + # need to be more strict below zero, to avoid false positives + within_zero = (Uhat > -0.1 * tol) & (Uhat < tol) + Uhat = np.where(is_interior | within_zero, 0, Uhat) if (Uhat >= 0).all(): # all separated observations have been identified + has_converged = True break - data["U"] = np.fmax(Uhat, 0) # rectified linear unit (ReLU) + data.loc[~is_interior, "U"] = np.fmax( + Uhat[~is_interior], 0 + ) # rectified linear unit (ReLU) + + if has_converged: + separation_na = set(Y[Uhat > 0].index) + else: + warnings.warn( + "iterative rectivier separation check: maximum number of iterations reached before convergence" + ) - separation_na = set(Y[Uhat > 0].index) return separation_na From d66f69fea65b6917a714db5323c71c27b67a14a6 Mon Sep 17 00:00:00 2001 From: lstimpfl Date: Wed, 3 Jul 2024 12:56:45 +0200 Subject: [PATCH 09/15] cosmetics --- pyfixest/estimation/FixestMulti_.py | 1 - pyfixest/estimation/fepois_.py | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pyfixest/estimation/FixestMulti_.py b/pyfixest/estimation/FixestMulti_.py index 1bf2d61b..dd6259c3 100644 --- a/pyfixest/estimation/FixestMulti_.py +++ b/pyfixest/estimation/FixestMulti_.py @@ -335,7 +335,6 @@ def _estimate_all_models( # check for separation and drop separated variables na_separation: list[int] = [] - # if fe is not None: na_separation = _check_for_separation( Y=Y, X=X, fe=fe, methods=separation_check ) diff --git a/pyfixest/estimation/fepois_.py b/pyfixest/estimation/fepois_.py index c5957dda..e7227f6a 100644 --- a/pyfixest/estimation/fepois_.py +++ b/pyfixest/estimation/fepois_.py @@ -437,9 +437,8 @@ def _check_for_separation_ir( maxiter: int = 100, ) -> set[int]: """ - Check for separation using the "iterative rectifier" check. - - For details see http://arxiv.org/abs/1903.01633 + Check for separation using the "iterative rectifier" algorithm + proposed by Correia et al. (2021). For details see http://arxiv.org/abs/1903.01633. Parameters ---------- From d8d48911fec61398d595428efb8e29cd72e4b57b Mon Sep 17 00:00:00 2001 From: lstimpfl Date: Thu, 4 Jul 2024 19:08:39 +0200 Subject: [PATCH 10/15] use Optional --- pyfixest/estimation/FixestMulti_.py | 2 +- pyfixest/estimation/estimation.py | 2 +- pyfixest/estimation/fepois_.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyfixest/estimation/FixestMulti_.py b/pyfixest/estimation/FixestMulti_.py index dd6259c3..9b3c52f3 100644 --- a/pyfixest/estimation/FixestMulti_.py +++ b/pyfixest/estimation/FixestMulti_.py @@ -154,7 +154,7 @@ def _estimate_all_models( collin_tol: float = 1e-6, iwls_maxiter: int = 25, iwls_tol: float = 1e-08, - separation_check: list[str] | None = None, + separation_check: Optional[list[str]] = None, ) -> None: """ Estimate multiple regression models. diff --git a/pyfixest/estimation/estimation.py b/pyfixest/estimation/estimation.py index 996e4bd4..8a690e07 100644 --- a/pyfixest/estimation/estimation.py +++ b/pyfixest/estimation/estimation.py @@ -354,7 +354,7 @@ def fepois( iwls_tol: float = 1e-08, iwls_maxiter: int = 25, collin_tol: float = 1e-10, - separation_check: list[str] | None = None, + separation_check: Optional[list[str]] = None, drop_intercept: bool = False, i_ref1=None, copy_data: bool = True, diff --git a/pyfixest/estimation/fepois_.py b/pyfixest/estimation/fepois_.py index e7227f6a..a616837a 100644 --- a/pyfixest/estimation/fepois_.py +++ b/pyfixest/estimation/fepois_.py @@ -317,7 +317,7 @@ def _check_for_separation( Y: pd.DataFrame, X: pd.DataFrame, fe: pd.DataFrame, - methods: list[str] | None = None, + methods: Optional[list[str]] = None, ) -> list[int]: """ Check for separation. From 6822451672f50955012b909dce0f0e9e7b6669ef Mon Sep 17 00:00:00 2001 From: lstimpfl Date: Sun, 14 Jul 2024 14:20:48 +0200 Subject: [PATCH 11/15] ensure consistent index --- pyfixest/estimation/fepois_.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pyfixest/estimation/fepois_.py b/pyfixest/estimation/fepois_.py index a616837a..40e80e29 100644 --- a/pyfixest/estimation/fepois_.py +++ b/pyfixest/estimation/fepois_.py @@ -463,15 +463,17 @@ def _check_for_separation_ir( feols = getattr(fixest_module, "feols") # initialize variables U = (Y == 0).astype(float).squeeze().rename("U") + # weights N0 = (Y > 0).squeeze().sum() K = N0 / tol**2 - omega = pd.Series(np.where(Y.squeeze() > 0, K, 1), name="omega") + omega = pd.Series(np.where(Y.squeeze() > 0, K, 1), name="omega", index=Y.index) # build formula fml = "U" if X is not None and not X.empty: + # TODO: ideally get regressor names from parent class fml += f" ~ {' + '.join(X.columns[X.columns!='Intercept'])}" if fe is not None and not fe.empty: - # TODO: can this rename be avoided? + # TODO: can this rename be avoided (get fixed effect names from parent class)? fe.columns = fe.columns.str.replace("factorize", "").str.strip(r"\(|\)") fml += f" | {' + '.join(fe.columns)}" # construct data @@ -490,7 +492,7 @@ def _check_for_separation_ir( # TODO: check acceleration in ppmlhdfe's implementation: https://github.com/sergiocorreia/ppmlhdfe/blob/master/src/ppmlhdfe_separation_relu.mata#L135 Uhat = feols(fml, data=data, weights="omega").predict() # update when within tolerance of zero - # need to be more strict below zero, to avoid false positives + # need to be more strict below zero to avoid false positives within_zero = (Uhat > -0.1 * tol) & (Uhat < tol) Uhat = np.where(is_interior | within_zero, 0, Uhat) if (Uhat >= 0).all(): From 662b152ab290a9233e0b27a33d0b71e7ac6dd632 Mon Sep 17 00:00:00 2001 From: lstimpfl Date: Sun, 14 Jul 2024 16:10:24 +0200 Subject: [PATCH 12/15] use input fml and data --- pyfixest/estimation/FixestMulti_.py | 8 ++- pyfixest/estimation/fepois_.py | 92 +++++++++++++++++++++-------- 2 files changed, 72 insertions(+), 28 deletions(-) diff --git a/pyfixest/estimation/FixestMulti_.py b/pyfixest/estimation/FixestMulti_.py index 9b3c52f3..0815cbc8 100644 --- a/pyfixest/estimation/FixestMulti_.py +++ b/pyfixest/estimation/FixestMulti_.py @@ -333,10 +333,14 @@ def _estimate_all_models( elif _method == "fepois": # check for separation and drop separated variables - na_separation: list[int] = [] na_separation = _check_for_separation( - Y=Y, X=X, fe=fe, methods=separation_check + fml=FixestFormula.fml, + data=_data, + Y=Y, + X=X, + fe=fe, + methods=separation_check, ) if na_separation: warnings.warn( diff --git a/pyfixest/estimation/fepois_.py b/pyfixest/estimation/fepois_.py index 40e80e29..b7086d69 100644 --- a/pyfixest/estimation/fepois_.py +++ b/pyfixest/estimation/fepois_.py @@ -314,6 +314,8 @@ def predict( def _check_for_separation( + fml: str, + data: pd.DataFrame, Y: pd.DataFrame, X: pd.DataFrame, fe: pd.DataFrame, @@ -327,6 +329,10 @@ def _check_for_separation( Parameters ---------- + fml : str + The formula used for estimation. + data : pd.DataFrame + The data used for estimation. Y : pd.DataFrame Dependent variable. X : pd.DataFrame @@ -357,18 +363,31 @@ def _check_for_separation( separation_na: set[int] = set() for method in methods: - separation_na = separation_na.union(valid_methods[method](Y=Y, X=X, fe=fe)) + separation_na = separation_na.union( + valid_methods[method](fml=fml, data=data, Y=Y, X=X, fe=fe) + ) return list(separation_na) class _SeparationMethod(Protocol): - def __call__(self, Y: pd.DataFrame, X: pd.DataFrame, fe: pd.DataFrame) -> set[int]: + def __call__( + self, + fml: str, + data: pd.DataFrame, + Y: pd.DataFrame, + X: pd.DataFrame, + fe: pd.DataFrame, + ) -> set[int]: """ Check for separation. Parameters ---------- + fml : str + The formula used for estimation. + data : pd.DataFrame + The data used for estimation. Y : pd.DataFrame Dependent variable. X : pd.DataFrame @@ -385,13 +404,17 @@ def __call__(self, Y: pd.DataFrame, X: pd.DataFrame, fe: pd.DataFrame) -> set[in def _check_for_separation_fe( - Y: pd.DataFrame, X: pd.DataFrame, fe: pd.DataFrame + fml: str, data: pd.DataFrame, Y: pd.DataFrame, X: pd.DataFrame, fe: pd.DataFrame ) -> set[int]: """ Check for separation using the "fe" check. Parameters ---------- + fml : str + The formula used for estimation. + data : pd.DataFrame + The data used for estimation. Y : pd.DataFrame Dependent variable. X : pd.DataFrame @@ -430,6 +453,8 @@ def _check_for_separation_fe( def _check_for_separation_ir( + fml: str, + data: pd.DataFrame, Y: pd.DataFrame, X: pd.DataFrame, fe: pd.DataFrame, @@ -442,6 +467,10 @@ def _check_for_separation_ir( Parameters ---------- + fml : str + The formula used for estimation. + data : pd.DataFrame + The data used for estimation. Y : pd.DataFrame Dependent variable. X : pd.DataFrame @@ -458,53 +487,64 @@ def _check_for_separation_ir( set Set of indices of separated observations. """ - # lazy load and avoid circular import + # lazy load to avoid circular import fixest_module = import_module("pyfixest.estimation") feols = getattr(fixest_module, "feols") - # initialize variables - U = (Y == 0).astype(float).squeeze().rename("U") - # weights - N0 = (Y > 0).squeeze().sum() - K = N0 / tol**2 - omega = pd.Series(np.where(Y.squeeze() > 0, K, 1), name="omega", index=Y.index) - # build formula - fml = "U" - if X is not None and not X.empty: - # TODO: ideally get regressor names from parent class - fml += f" ~ {' + '.join(X.columns[X.columns!='Intercept'])}" - if fe is not None and not fe.empty: - # TODO: can this rename be avoided (get fixed effect names from parent class)? - fe.columns = fe.columns.str.replace("factorize", "").str.strip(r"\(|\)") - fml += f" | {' + '.join(fe.columns)}" - # construct data - data = pd.concat([U, X, fe, omega], axis=1) + # initialize separation_na: set[int] = set() - is_interior = Y.squeeze() > 0 + tmp_suffix = "_separationTmp" + # build formula + name_dependent, rest = fml.split("~") + name_dependent_separation = "U" + if name_dependent_separation in data.columns: + name_dependent_separation += tmp_suffix + + fml_separation = f"{name_dependent_separation} ~ {rest}" + + dependent: pd.Series = data[name_dependent] + is_interior = dependent > 0 if is_interior.all(): # no boundary sample, can exit return separation_na + # initialize variables + tmp: pd.DataFrame = pd.DataFrame(index=data.index) + tmp["U"] = (dependent == 0).astype(float).rename("U") + # weights + N0 = (dependent > 0).sum() + K = N0 / tol**2 + tmp["omega"] = pd.Series( + np.where(dependent > 0, K, 1), name="omega", index=data.index + ) + # combine data + # TODO: avoid create new object? + tmp = data.join(tmp, how="left", validate="one_to_one", rsuffix=tmp_suffix) + # TODO: need to ensure that join doesn't create duplicated columns + # assert not tmp.columns.duplicated().any() + iteration = 0 has_converged = False while iteration < maxiter: iteration += 1 # regress U on X # TODO: check acceleration in ppmlhdfe's implementation: https://github.com/sergiocorreia/ppmlhdfe/blob/master/src/ppmlhdfe_separation_relu.mata#L135 - Uhat = feols(fml, data=data, weights="omega").predict() + fitted = feols(fml_separation, data=tmp, weights="omega") + tmp["Uhat"] = pd.Series(fitted.predict(), index=fitted._data.index, name="Uhat") + Uhat = tmp["Uhat"] # update when within tolerance of zero # need to be more strict below zero to avoid false positives within_zero = (Uhat > -0.1 * tol) & (Uhat < tol) - Uhat = np.where(is_interior | within_zero, 0, Uhat) + Uhat.where(~(is_interior | within_zero.fillna(True)), 0, inplace=True) if (Uhat >= 0).all(): # all separated observations have been identified has_converged = True break - data.loc[~is_interior, "U"] = np.fmax( + tmp.loc[~is_interior, "U"] = np.fmax( Uhat[~is_interior], 0 ) # rectified linear unit (ReLU) if has_converged: - separation_na = set(Y[Uhat > 0].index) + separation_na = set(dependent[Uhat > 0].index) else: warnings.warn( "iterative rectivier separation check: maximum number of iterations reached before convergence" From d3190be3c3e056c0a7206662b4741db59cca9ed8 Mon Sep 17 00:00:00 2001 From: Alexander Fischer Date: Sun, 29 Sep 2024 13:16:53 +0200 Subject: [PATCH 13/15] minor cleanups --- coverage.xml | 1932 ++++++++++++++------------- pyfixest/estimation/FixestMulti_.py | 1 + pyfixest/estimation/fepois_.py | 20 +- 3 files changed, 1016 insertions(+), 937 deletions(-) diff --git a/coverage.xml b/coverage.xml index c3551824..8fcec677 100644 --- a/coverage.xml +++ b/coverage.xml @@ -1,5 +1,5 @@ - + @@ -28,12 +28,12 @@ - + - - + + @@ -491,7 +491,7 @@ - + @@ -509,24 +509,20 @@ - - - - + + - - - + + + - - @@ -536,11 +532,11 @@ + - - - - + + + @@ -549,139 +545,143 @@ + + - - + + - - - - - - - - - + + + + + + + - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + - - + + - + + + - - - - - + + + + + + + - - - - - - - - + + + + + + + - - - - - - - - - + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - + + + + + + + + + + + + - + + + + + @@ -1146,68 +1146,68 @@ - - - - - - - - - + + + + + + + - - - + + - - - - - + + + - + - - + + - + - - + + + + - - - + + + + + - - - - - - - - + + + + + + - - + + + + + @@ -1324,7 +1324,7 @@ - + @@ -1466,98 +1466,98 @@ - - - + + + - - - - + + + + - + - - - + + + - + - - + + - + - + - - - - + + + + - + - - + + - + - - + + - - - - - + + + + + - - - - - + + + + + - - - - + + + + - - - + + + - - + + - - - - - - + + + + + + - + - - - + + + @@ -1566,16 +1566,16 @@ - + - - + + - - + + @@ -1583,134 +1583,134 @@ - + - - - + + + - + - - + + - - + + - + - - - + + + - - - - + + + + - - + + - - - + + + - - - + + + - - - - + + + + - - + + - - + + - + - + - - + + - + - + - - + + - + - + - + - - + + - + - + - + - + - + - - + + - + - - + + - - - - + + + + @@ -1718,10 +1718,10 @@ - - - - + + + + @@ -1729,130 +1729,130 @@ - + - + - - + + - - - - - - + + + + + + - - - - - - - - + + + + + + + + - + - + - - - - + + + + - + - - + + - + - + - + - - - - - + + + + + - + - + - - - - + + + + - - - - + + + + - + - + - + - + - - + + - - - - - - - + + + + + + + - + - + - - + + - + - + - - + + - + - + @@ -1860,59 +1860,59 @@ - - - - + + + + - - - + + + - - - - - + + + + + - + - + - + - + - - + + - + - + - + - - - - - + + + + + - - + + @@ -1921,176 +1921,176 @@ - - + + - + - + - - - + + + - + - + - - - - + + + + - + - + - + - + - - + + - + - + - - - - - - - - - - - - + + + + + + + + + + + + - + - - + + - - + + - + - + - + - - + + - - - - - - - - - + + + + + + + + + - + - + - + - + - + - - - - + + + + - + - - + + - - + + - - + + - - + + - + - - - - + + + + - - + + - - - - - + + + + + - + - - - - - + + + + + @@ -2102,100 +2102,100 @@ - - - + + + - + - - - + + + - + - - - - + + + + - + - + - + - + - - + + - - + + - + - + - + - + - - + + - + - + - - - + + + - - + + - - + + - + - - + + - - + + - - + + @@ -2204,23 +2204,24 @@ - + - + - - - + + + - - + + - + + @@ -2384,173 +2385,225 @@ - + - + - - + + - - - - - + + + + + - + - + - - - - - - + + + + + + - - - - - - - - - - - - + + + + + + + + + + + - - - - - - - - - + + + + + + + + - - - - - - - - - + + + + + + + + + - - - - - - - - + + + + + + + + - + - + - - - - + + + + - + + - - - - - + + - + + - + - - - - - - - - - - + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -3035,7 +3088,7 @@ - + @@ -3045,7 +3098,7 @@ - + @@ -3070,7 +3123,7 @@ - + @@ -3082,7 +3135,7 @@ - + @@ -3102,11 +3155,11 @@ - - - - - + + + + + @@ -3116,7 +3169,7 @@ - + @@ -3160,127 +3213,127 @@ - + - + - + - + - - - - - - + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - + + + - - - - - - - - - - - - - - + + + + + + + + + + + + + + - + - - - - - - - + + + + + + + - - - - - - - + + + + + + + - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + @@ -3291,16 +3344,16 @@ - - - - - - - - - - + + + + + + + + + + @@ -3322,7 +3375,7 @@ - + @@ -3330,210 +3383,227 @@ - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + - + + + - - - - - + + + + - + + - - - + + + - - - + + - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + - - + + - - - - - - - - - - + + + + + + - - - + + + - - + + + + + - - - + - - - - - - - + + + + + + + - - + + - - + - + + - + - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - - - + + + - + @@ -3558,7 +3628,7 @@ - + @@ -3585,26 +3655,26 @@ - - - - - - - - + + + + + + + + - - - - - - + + + + + + - - - - + + + + @@ -3720,7 +3790,7 @@ - + @@ -3749,7 +3819,7 @@ - + @@ -3772,13 +3842,13 @@ - + - + - + - + diff --git a/pyfixest/estimation/FixestMulti_.py b/pyfixest/estimation/FixestMulti_.py index d0f84f58..052976fe 100644 --- a/pyfixest/estimation/FixestMulti_.py +++ b/pyfixest/estimation/FixestMulti_.py @@ -73,6 +73,7 @@ def __init__( self._use_compression = use_compression self._reps = reps if use_compression else None self._seed = seed if use_compression else None + self._separation_check = separation_check data = _polars_to_pandas(data) diff --git a/pyfixest/estimation/fepois_.py b/pyfixest/estimation/fepois_.py index d012e4b4..0adf47f4 100644 --- a/pyfixest/estimation/fepois_.py +++ b/pyfixest/estimation/fepois_.py @@ -131,11 +131,14 @@ def prepare_model_matrix(self): # check for separation na_separation: list[int] = [] if self._fe is not None: - na_separation = _check_for_separation(Y=self._Y, fe=self._fe) - if na_separation: - warnings.warn( - f"{str(len(na_separation))} observations removed because of separation." - ) + na_separation = _check_for_separation( + Y=self._Y, + X=self._X, + fe=self._fe, + fml=self._fml, + data=self._data, + methods=self._separation_check, + ) if na_separation: self._Y.drop(na_separation, axis=0, inplace=True) @@ -428,7 +431,7 @@ def _check_for_separation( Independent variables. fe : pd.DataFrame Fixed effects. - method: list[str], optional + methods: list[str], optional Methods used to check for separation. One of fixed effects ("fe") or iterative rectifier ("ir"). Executes all methods by default. @@ -456,6 +459,11 @@ def _check_for_separation( valid_methods[method](fml=fml, data=data, Y=Y, X=X, fe=fe) ) + if separation_na: + warnings.warn( + f"{str(len(separation_na))} observations removed because of separation." + ) + return list(separation_na) From 5e2870ab432b6013c3861f625bfbddbc042aa340 Mon Sep 17 00:00:00 2001 From: Alexander Fischer Date: Sun, 29 Sep 2024 14:22:24 +0200 Subject: [PATCH 14/15] add test against pplmhdfe test data set 01 --- coverage.xml | 859 +++++++++--------- pyfixest/estimation/FixestMulti_.py | 1 + pyfixest/estimation/estimation.py | 20 +- pyfixest/estimation/fepois_.py | 8 +- .../data/pplmhdfe_separation_examples/01.csv | 101 ++ .../readme.md.txt | 4 + tests/test_poisson.py | 13 +- 7 files changed, 570 insertions(+), 436 deletions(-) create mode 100644 tests/data/pplmhdfe_separation_examples/01.csv create mode 100644 tests/data/pplmhdfe_separation_examples/readme.md.txt diff --git a/coverage.xml b/coverage.xml index 8fcec677..ee737b53 100644 --- a/coverage.xml +++ b/coverage.xml @@ -1,5 +1,5 @@ - + @@ -491,9 +491,9 @@ - + - + @@ -517,13 +517,13 @@ - - + + - - + + - + @@ -536,8 +536,8 @@ - - + + @@ -549,15 +549,15 @@ - - + + - + - - + + @@ -570,118 +570,119 @@ - + - + - - - + + + - + - + - - + + - - - + + + - + - - - + + + - - + - + + - + - - + + - - - + + + - - - + + + - + - - - - - - - - - - - - - + + + + + + + + + + + + + - - + + - + - - - - - - + + + + + - + - - + + + - - - + + + - - + + + @@ -1123,7 +1124,7 @@ - + @@ -1138,76 +1139,81 @@ - - - - - + + + + - - - + + + - + - - - - - - - - - - - - - + + + + + + + + + + + + - - + - + + - - + + - + + - - + - - - - - - - - - + + + + + + + + - - - + + + + + + + + + + + @@ -2385,7 +2391,7 @@ - + @@ -2400,41 +2406,40 @@ - - - + + + - + - + - + - - - + + - - + + - - - - - - - + + + + + + + - - + + @@ -2470,8 +2475,8 @@ - - + + @@ -2520,7 +2525,7 @@ - + @@ -2529,81 +2534,81 @@ - - - + + + - - - - - - + + + + + - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + - + - + + - - + + - - + + + - - - + + - + - + + - - + + - - - - - - - - - - - + + + + + + + + + + + + @@ -3088,7 +3093,7 @@ - + @@ -3098,7 +3103,7 @@ - + @@ -3123,7 +3128,7 @@ - + @@ -3135,7 +3140,7 @@ - + @@ -3155,11 +3160,11 @@ - - - - - + + + + + @@ -3169,7 +3174,7 @@ - + @@ -3201,7 +3206,7 @@ - + @@ -3213,127 +3218,127 @@ - + - + - + - + - - - - - - + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - + + + - - - - - - - - - - - - - - + + + + + + + + + + + + + + - + - - - - - - - + + + + + + + - - - - - - - + + + + + + + - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + @@ -3344,16 +3349,16 @@ - - - - - - - - - - + + + + + + + + + + @@ -3375,7 +3380,7 @@ - + @@ -3383,110 +3388,110 @@ - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + - - - - - + + + + + - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + - + - - + + - + - - - - - - - - - - - + + + + + + + + + + + - - - - - + + + + + - - + + - - - - - - - - - - - - - + + + + + + + + + + + + + - + - + - - - + + + - + @@ -3592,18 +3597,18 @@ - + - - - + + + - + @@ -3628,7 +3633,7 @@ - + @@ -3655,26 +3660,26 @@ - - - - - - - - + + + + + + + + - - - - - - + + + + + + - - - - + + + + @@ -3790,7 +3795,7 @@ - + @@ -3819,7 +3824,7 @@ - + @@ -3842,13 +3847,13 @@ - + - + - + - + diff --git a/pyfixest/estimation/FixestMulti_.py b/pyfixest/estimation/FixestMulti_.py index 052976fe..2fd7314f 100644 --- a/pyfixest/estimation/FixestMulti_.py +++ b/pyfixest/estimation/FixestMulti_.py @@ -293,6 +293,7 @@ def _estimate_all_models( store_data=_store_data, copy_data=_copy_data, lean=_lean, + separation_check=separation_check, # solver=_solver ) FIT.prepare_model_matrix() diff --git a/pyfixest/estimation/estimation.py b/pyfixest/estimation/estimation.py index 3590767d..6ae672a7 100644 --- a/pyfixest/estimation/estimation.py +++ b/pyfixest/estimation/estimation.py @@ -1,4 +1,4 @@ -from typing import Optional, Union +from typing import Optional, Union, List import pandas as pd @@ -358,6 +358,7 @@ class for multiple models specified via `fml`. use_compression=use_compression, reps=reps, seed=seed, + separation_check=None, ) fixest = FixestMulti( @@ -397,7 +398,7 @@ def fepois( iwls_tol: float = 1e-08, iwls_maxiter: int = 25, collin_tol: float = 1e-10, - separation_check: Optional[list[str]] = None, + separation_check: Optional[list[str]] = ["fe"], drop_intercept: bool = False, i_ref1=None, copy_data: bool = True, @@ -449,7 +450,7 @@ def fepois( separation_check: list[str], optional Methods to identify and drop separated observations. - Either "fe" or "ir". Executes both by default. + Either "fe" or "ir". Executes "fe" by default. drop_intercept : bool, optional Whether to drop the intercept from the model, by default False. @@ -535,6 +536,7 @@ def fepois( use_compression=False, reps=None, seed=None, + separation_check=separation_check, ) fixest = FixestMulti( @@ -587,6 +589,7 @@ def _estimation_input_checks( use_compression: bool, reps: Optional[int], seed: Optional[int], + separation_check: List[str]=None, ): if not isinstance(fml, str): raise TypeError("fml must be a string") @@ -670,3 +673,14 @@ def _estimation_input_checks( if seed is not None and not isinstance(seed, int): raise TypeError("The function argument `seed` must be of type int.") + + if separation_check is not None: + if not isinstance(separation_check, list): + raise TypeError( + "The function argument `separation_check` must be of type list." + ) + + if not all(x in ["fe", "ir"] for x in separation_check): + raise ValueError( + "The function argument `separation_check` must be a list of strings containing 'fe' and/or 'ir'." + ) diff --git a/pyfixest/estimation/fepois_.py b/pyfixest/estimation/fepois_.py index 0adf47f4..3b565c47 100644 --- a/pyfixest/estimation/fepois_.py +++ b/pyfixest/estimation/fepois_.py @@ -82,6 +82,7 @@ def __init__( store_data: bool = True, copy_data: bool = True, lean: bool = False, + separation_check: Optional[list[str]] = None, ): super().__init__( FixestFormula, @@ -107,6 +108,7 @@ def __init__( self.tol = tol self._method = "fepois" self.convergence = False + self.separation_check = separation_check self._support_crv3_inference = True self._support_iid_inference = True @@ -130,14 +132,14 @@ def prepare_model_matrix(self): # check for separation na_separation: list[int] = [] - if self._fe is not None: + if self._fe is not None and self.separation_check: na_separation = _check_for_separation( Y=self._Y, X=self._X, fe=self._fe, fml=self._fml, data=self._data, - methods=self._separation_check, + methods=self.separation_check, ) if na_separation: @@ -444,8 +446,6 @@ def _check_for_separation( "fe": _check_for_separation_fe, "ir": _check_for_separation_ir, } - if methods is None: - methods = list(valid_methods) invalid_methods = [method for method in methods if method not in valid_methods] if invalid_methods: diff --git a/tests/data/pplmhdfe_separation_examples/01.csv b/tests/data/pplmhdfe_separation_examples/01.csv new file mode 100644 index 00000000..0639065c --- /dev/null +++ b/tests/data/pplmhdfe_separation_examples/01.csv @@ -0,0 +1,101 @@ +y,x1,x2,id1,id2,separated +0.0000000000,-0.9303550124,1,1,4,1 +0.0000000000,0.1835959703,1,2,1,1 +0.0000000000,-0.6371972561,0,2,6,0 +0.0000000000,-0.4237562418,0,2,7,0 +0.1527670026,-1.1799178123,0,8,4,0 +0.1553160399,0.8860545158,0,1,7,0 +0.1734523475,1.0502026081,0,8,3,0 +0.2217264324,-0.2490162849,0,9,1,0 +0.2260344625,0.9635434151,0,7,6,0 +0.2283350676,0.5023207068,0,3,5,0 +0.2368061543,0.9141282439,0,10,1,0 +0.2410950512,-1.3616287708,0,4,5,0 +0.2541858852,0.5753656030,0,3,3,0 +0.2637400925,-0.6333113909,0,3,10,0 +0.2677916288,1.0411013365,0,6,9,0 +0.2768439949,-1.1694648266,0,8,10,0 +0.2934476137,-0.7940499187,0,4,6,0 +0.3290584087,0.5041465163,0,2,4,0 +0.3606268466,-3.0584282875,0,3,8,0 +0.4013363719,0.6099517941,0,7,5,0 +0.4354907870,0.9624704719,0,6,1,0 +0.4908127189,-0.7442333698,0,5,2,0 +0.4976674914,-0.5138924718,0,6,4,0 +0.5012444854,-1.3591595888,0,9,7,0 +0.5456602573,0.0567612983,0,5,5,0 +0.5634447336,1.2903038263,0,7,8,0 +0.5983847380,-0.6872945428,0,6,5,0 +0.6183075905,0.7253564000,0,1,5,0 +0.6413634419,1.6118478775,0,4,3,0 +0.6482065916,1.2488127947,0,7,1,0 +0.6522977948,-0.4748489261,0,6,6,0 +0.6631931663,0.4219789803,0,4,8,0 +0.6953295469,-1.0251801014,0,10,6,0 +0.6986964941,-0.3038678169,0,9,9,0 +0.8503285050,1.8723217249,0,8,2,0 +0.9026033878,-1.0245078802,0,10,10,0 +0.9204394221,0.4229967892,0,6,10,0 +0.9228412509,0.4940861166,0,1,1,0 +0.9359286427,1.3081433773,0,9,2,0 +0.9685080647,1.2934249640,0,2,10,0 +0.9945486188,-0.5332730412,0,5,1,0 +1.0105472803,-0.1284428090,0,9,3,0 +1.0721468925,-1.5399883986,0,6,8,0 +1.1205748320,0.6894677877,0,8,5,0 +1.1252909899,-1.2204582691,0,1,10,0 +1.1561176777,-0.9787744284,0,9,8,0 +1.1946246624,-0.0799055845,0,10,8,0 +1.2046658993,-0.8231971860,0,6,7,0 +1.2189750671,0.5437637568,0,3,4,0 +1.2277959585,1.3177309036,0,9,10,0 +1.2413842678,0.6673717499,0,8,9,0 +1.2569460869,-0.0167010967,0,6,3,0 +1.2587834597,-0.4196293950,0,1,8,0 +1.2782599926,-0.6420007348,0,8,1,0 +1.2911227942,1.1136496067,0,9,4,0 +1.2973045111,-0.3824758530,0,7,9,0 +1.3675237894,1.2361305952,0,5,9,0 +1.3778325319,-1.0304020643,0,5,4,0 +1.3857760429,0.3235974312,0,2,3,0 +1.3960508108,-0.4157371819,0,2,5,0 +1.4190907478,0.9920675159,0,1,2,0 +1.4420653582,-0.9114651084,0,4,1,0 +1.5038720369,-1.0453398228,0,3,2,0 +1.5394419432,-0.1935533732,0,4,4,0 +1.5747014284,0.0698969364,0,9,6,0 +1.6199581623,1.3169367313,0,4,2,0 +1.6392902136,-0.3978092670,0,7,10,0 +1.6421631575,-0.7466211319,0,5,8,0 +1.6952790022,-0.0158907417,0,5,6,0 +1.7640979290,1.0598815680,0,7,4,0 +1.9505974054,0.0092241317,0,10,7,0 +2.0685675144,0.1434842199,0,8,8,0 +2.1190843582,0.6173521280,0,3,1,0 +2.1889939308,-1.9780639410,0,3,7,0 +2.2176725864,-1.5379956961,0,7,3,0 +2.2831020355,0.5082080960,0,2,2,0 +2.3055832386,1.0296376944,0,7,2,0 +2.3692295551,2.1091823578,0,10,2,0 +2.7510018349,0.2632481158,0,2,9,0 +2.7675759792,-0.0022486539,0,8,6,0 +2.7777233124,-1.3771806955,0,10,4,0 +2.7846245766,0.1415781677,0,10,5,0 +2.7860391140,-2.2442505360,0,4,9,0 +2.9671635628,0.2927849889,0,5,3,0 +2.9819300175,-0.8325243592,0,4,10,0 +3.1186814308,-0.4090226293,0,2,8,0 +3.2802021503,-0.4062994719,0,4,7,0 +3.4179122448,0.0959109142,0,8,7,0 +3.6803083420,2.3073217869,0,1,9,0 +3.7194297314,0.3930145800,0,3,9,0 +3.7777581215,0.2952103019,0,6,2,0 +4.1290211678,-1.4121559858,0,5,7,0 +4.2730326653,-0.5140260458,0,10,9,0 +4.2883334160,-1.0160779953,0,7,7,0 +4.3528537750,0.7201576829,0,10,3,0 +4.9981565475,0.2207358032,0,3,6,0 +5.0979351997,0.7166025043,0,9,5,0 +7.3969793320,2.1998977661,0,5,10,0 +8.4651517868,0.1178035960,0,1,6,0 +9.8326959610,0.7707119584,0,1,3,0 diff --git a/tests/data/pplmhdfe_separation_examples/readme.md.txt b/tests/data/pplmhdfe_separation_examples/readme.md.txt new file mode 100644 index 00000000..6d8897a1 --- /dev/null +++ b/tests/data/pplmhdfe_separation_examples/readme.md.txt @@ -0,0 +1,4 @@ +## Separation Data Sets + +All files in this document stem from the [pplmhdfe test suite](https://github.com/sergiocorreia/ppmlhdfe/tree/master/test/separation_datasets), +published under MIT license. \ No newline at end of file diff --git a/tests/test_poisson.py b/tests/test_poisson.py index dfb35fa3..4455ca7c 100644 --- a/tests/test_poisson.py +++ b/tests/test_poisson.py @@ -46,8 +46,17 @@ def test_separation(): with pytest.warns( UserWarning, match="2 observations removed because of separation." ): - fepois("Y ~ X1 + X2", data=example2, vcov="hetero", separation_check=["ir"]) # noqa: F841 - fepois("Y ~ x | fe1", data=example2, vcov="hetero") # noqa: F841 + fepois("Y ~ X1 | X2", data=example2, vcov="hetero", separation_check=["ir"]) # noqa: F841 + + + data_01 = pd.read_csv("data/pplmhdfe_separations_examples/data_01.csv") + + # pplmhdfe test data sets: + with pytest.warns( + UserWarning, match=f"{str(data_01.sum())} observations removed because of separation." + ): + + pf.fepois("y ~ x1 + x2 | id1 + id2", data = data_01, separation_check = ["ir"]) @pytest.mark.parametrize("fml", ["Y ~ X1", "Y ~ X1 | f1"]) From ee95873171c97bdc61b3e44a1a790fa1fc35f9db Mon Sep 17 00:00:00 2001 From: Alexander Fischer Date: Sun, 29 Sep 2024 14:23:01 +0200 Subject: [PATCH 15/15] readme txt to md --- pyfixest/estimation/estimation.py | 4 ++-- .../{readme.md.txt => readme.md} | 2 +- tests/test_poisson.py | 7 +++---- 3 files changed, 6 insertions(+), 7 deletions(-) rename tests/data/pplmhdfe_separation_examples/{readme.md.txt => readme.md} (85%) diff --git a/pyfixest/estimation/estimation.py b/pyfixest/estimation/estimation.py index 6ae672a7..bab7d046 100644 --- a/pyfixest/estimation/estimation.py +++ b/pyfixest/estimation/estimation.py @@ -1,4 +1,4 @@ -from typing import Optional, Union, List +from typing import List, Optional, Union import pandas as pd @@ -589,7 +589,7 @@ def _estimation_input_checks( use_compression: bool, reps: Optional[int], seed: Optional[int], - separation_check: List[str]=None, + separation_check: List[str] = None, ): if not isinstance(fml, str): raise TypeError("fml must be a string") diff --git a/tests/data/pplmhdfe_separation_examples/readme.md.txt b/tests/data/pplmhdfe_separation_examples/readme.md similarity index 85% rename from tests/data/pplmhdfe_separation_examples/readme.md.txt rename to tests/data/pplmhdfe_separation_examples/readme.md index 6d8897a1..9989f224 100644 --- a/tests/data/pplmhdfe_separation_examples/readme.md.txt +++ b/tests/data/pplmhdfe_separation_examples/readme.md @@ -1,4 +1,4 @@ ## Separation Data Sets All files in this document stem from the [pplmhdfe test suite](https://github.com/sergiocorreia/ppmlhdfe/tree/master/test/separation_datasets), -published under MIT license. \ No newline at end of file +published under MIT license. diff --git a/tests/test_poisson.py b/tests/test_poisson.py index 4455ca7c..c23766cc 100644 --- a/tests/test_poisson.py +++ b/tests/test_poisson.py @@ -48,15 +48,14 @@ def test_separation(): ): fepois("Y ~ X1 | X2", data=example2, vcov="hetero", separation_check=["ir"]) # noqa: F841 - data_01 = pd.read_csv("data/pplmhdfe_separations_examples/data_01.csv") # pplmhdfe test data sets: with pytest.warns( - UserWarning, match=f"{str(data_01.sum())} observations removed because of separation." + UserWarning, + match=f"{str(data_01.sum())} observations removed because of separation.", ): - - pf.fepois("y ~ x1 + x2 | id1 + id2", data = data_01, separation_check = ["ir"]) + pf.fepois("y ~ x1 + x2 | id1 + id2", data=data_01, separation_check=["ir"]) @pytest.mark.parametrize("fml", ["Y ~ X1", "Y ~ X1 | f1"])