Skip to content

Commit

Permalink
formatting; karen's comment
Browse files Browse the repository at this point in the history
  • Loading branch information
henrydavidge committed Dec 23, 2020
1 parent 138559c commit e05a351
Show file tree
Hide file tree
Showing 4 changed files with 157 additions and 148 deletions.
11 changes: 11 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.


import numpy as np
from pyspark.sql import SparkSession
import pytest
import os
Expand Down Expand Up @@ -50,6 +51,16 @@ def spark(spark_builder):
sess = spark_builder.getOrCreate()
return sess.newSession()

def pytest_addoption(parser):
    """Register the ``--random-seed`` command-line option.

    The option stores a single value parsed as ``int``. The ``rg`` fixture
    in this file reads it back with ``pytestconfig.getoption('random_seed')``.

    Args:
        parser: The option parser object handed to this hook; only its
            ``addoption`` method is used here.
    """
    parser.addoption(
        '--random-seed',
        action='store',
        type=int,
        help='Seed to use for random number generator',
    )

@pytest.fixture(scope="function")
def rg(pytestconfig):
    """Function-scoped fixture that builds a NumPy random ``Generator``.

    The seed is taken from the ``random_seed`` option (the ``--random-seed``
    command-line flag), wrapped in a ``SeedSequence``, and the sequence's
    entropy is printed before the generator is created.

    Args:
        pytestconfig: Object whose ``getoption`` is queried for
            ``'random_seed'``.

    Returns:
        The result of ``np.random.default_rng`` seeded with that sequence.
    """
    seed = pytestconfig.getoption('random_seed')
    # NOTE(review): when the flag is omitted the option is presumably None;
    # NumPy documents SeedSequence(None) as pulling fresh OS entropy, so the
    # printed entropy should be the value to pass back via --random-seed to
    # replay a run -- confirm against the pytest and NumPy docs.
    seed_seq = np.random.SeedSequence(seed)
    print(f'Creating random number generator with seed {seed_seq.entropy}')
    return np.random.default_rng(seed_seq)

def pytest_runtest_setup(item):
min_spark_version = next((mark.args[0] for mark in item.iter_markers(name='min_spark')), None)
if min_spark_version:
Expand Down
4 changes: 2 additions & 2 deletions python/glow/gwas/log_reg.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ def logistic_regression(

def map_func(pdf_iterator):
for pdf in pdf_iterator:
yield gwas_fx._loco_dispatch(pdf, state, _logistic_regression_inner,
C, Y_mask, correction, phenotype_names)
yield gwas_fx._loco_dispatch(pdf, state, _logistic_regression_inner, C, Y_mask,
correction, phenotype_names)

return genotype_df.mapInPandas(map_func, result_struct)

Expand Down
179 changes: 89 additions & 90 deletions python/glow/gwas/tests/test_lin_reg.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,55 +169,55 @@ def test_r_glm_covariates():
assert_glow_equals_golden(genotype_df, phenotype_df, covariate_df, fit_intercept=False)


def test_multiple():
def test_multiple(rg):
num_samples = 100
genotype_df = pd.DataFrame(np.random.random((num_samples, 10)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, 25)))
covariate_df = pd.DataFrame(np.random.random((num_samples, 5)))
genotype_df = pd.DataFrame(rg.random((num_samples, 10)))
phenotype_df = pd.DataFrame(rg.random((num_samples, 25)))
covariate_df = pd.DataFrame(rg.random((num_samples, 5)))
assert_glow_equals_golden(genotype_df, phenotype_df, covariate_df)


def test_missing():
def test_missing(rg):
num_samples = 10
genotype_df = pd.DataFrame(np.random.random((num_samples, 1)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, 1)))
genotype_df = pd.DataFrame(rg.random((num_samples, 1)))
phenotype_df = pd.DataFrame(rg.random((num_samples, 1)))
phenotype_df.loc[0, 0] = np.nan
covariate_df = pd.DataFrame(np.random.random((num_samples, 3)))
covariate_df = pd.DataFrame(rg.random((num_samples, 3)))
glow = run_linear_regression(genotype_df, phenotype_df, covariate_df)
baseline = statsmodels_baseline(genotype_df, phenotype_df, covariate_df)
assert regression_results_equal(glow, baseline)


@pytest.mark.min_spark('3')
def test_missing_spark(spark):
def test_missing_spark(spark, rg):
num_samples = 10
genotype_df = pd.DataFrame(np.random.random((num_samples, 1)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, 3)))
genotype_df = pd.DataFrame(rg.random((num_samples, 1)))
phenotype_df = pd.DataFrame(rg.random((num_samples, 3)))
phenotype_df.loc[0, 0] = np.nan
phenotype_df.loc[[1, 3, 5], 1] = np.nan
covariate_df = pd.DataFrame(np.random.random((num_samples, 3)))
covariate_df = pd.DataFrame(rg.random((num_samples, 3)))
glow = run_linear_regression_spark(spark, genotype_df, phenotype_df, covariate_df)
baseline = statsmodels_baseline(genotype_df, phenotype_df, covariate_df)
assert regression_results_equal(glow, baseline)


@pytest.mark.min_spark('3')
def test_multiple_spark(spark):
def test_multiple_spark(spark, rg):
num_samples = 100
genotype_df = pd.DataFrame(np.random.random((num_samples, 10)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, 25)))
covariate_df = pd.DataFrame(np.random.random((num_samples, 5)))
genotype_df = pd.DataFrame(rg.random((num_samples, 10)))
phenotype_df = pd.DataFrame(rg.random((num_samples, 25)))
covariate_df = pd.DataFrame(rg.random((num_samples, 5)))
baseline = statsmodels_baseline(genotype_df, phenotype_df, covariate_df)
results = run_linear_regression_spark(spark, genotype_df, phenotype_df, covariate_df)
assert regression_results_equal(baseline, results)


@pytest.mark.min_spark('3')
def test_propagate_extra_cols(spark):
def test_propagate_extra_cols(spark, rg):
num_samples = 10
genotype_df = pd.DataFrame(np.random.random((num_samples, 3)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, 5)))
covariate_df = pd.DataFrame(np.random.random((num_samples, 2)))
genotype_df = pd.DataFrame(rg.random((num_samples, 3)))
phenotype_df = pd.DataFrame(rg.random((num_samples, 5)))
covariate_df = pd.DataFrame(rg.random((num_samples, 2)))
extra_cols = pd.DataFrame({'genotype_idx': range(3), 'animal': 'monkey'})
results = run_linear_regression_spark(spark, genotype_df, phenotype_df, covariate_df,
extra_cols)
Expand All @@ -229,11 +229,11 @@ def test_propagate_extra_cols(spark):


@pytest.mark.min_spark('3')
def test_different_values_column(spark):
def test_different_values_column(spark, rg):
num_samples = 10
genotype_df = pd.DataFrame(np.random.random((num_samples, 3)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, 5)))
covariate_df = pd.DataFrame(np.random.random((num_samples, 2)))
genotype_df = pd.DataFrame(rg.random((num_samples, 3)))
phenotype_df = pd.DataFrame(rg.random((num_samples, 5)))
covariate_df = pd.DataFrame(rg.random((num_samples, 2)))
results = run_linear_regression_spark(spark,
genotype_df,
phenotype_df,
Expand All @@ -243,52 +243,52 @@ def test_different_values_column(spark):


@pytest.mark.min_spark('3')
def test_intercept_no_covariates(spark):
def test_intercept_no_covariates(spark, rg):
num_samples = 10
genotype_df = pd.DataFrame(np.random.random((num_samples, 10)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, 25)))
genotype_df = pd.DataFrame(rg.random((num_samples, 10)))
phenotype_df = pd.DataFrame(rg.random((num_samples, 25)))
# No error
run_linear_regression_spark(spark, genotype_df, phenotype_df, pd.DataFrame({}))


@pytest.mark.min_spark('3')
def test_validates_missing_covariates(spark):
def test_validates_missing_covariates(spark, rg):
num_samples = 10
genotype_df = pd.DataFrame(np.random.random((num_samples, 3)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, 5)))
covariate_df = pd.DataFrame(np.random.random((num_samples, 2)))
genotype_df = pd.DataFrame(rg.random((num_samples, 3)))
phenotype_df = pd.DataFrame(rg.random((num_samples, 5)))
covariate_df = pd.DataFrame(rg.random((num_samples, 2)))
covariate_df.loc[0, 0] = np.nan
with pytest.raises(ValueError):
run_linear_regression_spark(spark, genotype_df, phenotype_df, covariate_df)


@pytest.mark.min_spark('3')
def test_validate_same_number_of_rows(spark):
genotype_df = pd.DataFrame(np.random.random((4, 3)))
phenotype_df = pd.DataFrame(np.random.random((4, 5)))
covariate_df = pd.DataFrame(np.random.random((5, 2)))
def test_validate_same_number_of_rows(spark, rg):
genotype_df = pd.DataFrame(rg.random((4, 3)))
phenotype_df = pd.DataFrame(rg.random((4, 5)))
covariate_df = pd.DataFrame(rg.random((5, 2)))
with pytest.raises(ValueError):
run_linear_regression_spark(spark, genotype_df, phenotype_df, covariate_df)


def test_error_for_old_spark(spark):
def test_error_for_old_spark(spark, rg):
if spark.version.startswith('2'):
num_samples = 10
genotype_df = pd.DataFrame(np.random.random((num_samples, 10)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, 25)))
genotype_df = pd.DataFrame(rg.random((num_samples, 10)))
phenotype_df = pd.DataFrame(rg.random((num_samples, 25)))
with pytest.raises(AttributeError):
run_linear_regression_spark(spark, genotype_df, phenotype_df, pd.DataFrame({}))


@pytest.mark.min_spark('3')
def test_simple_offset(spark):
def test_simple_offset(spark, rg):
num_samples = 25
num_pheno = 6
num_geno = 10
genotype_df = pd.DataFrame(np.random.random((num_samples, num_geno)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, num_pheno)))
covariate_df = pd.DataFrame(np.random.random((num_samples, 2)))
offset_df = pd.DataFrame(np.random.random((num_samples, num_pheno)))
genotype_df = pd.DataFrame(rg.random((num_samples, num_geno)))
phenotype_df = pd.DataFrame(rg.random((num_samples, num_pheno)))
covariate_df = pd.DataFrame(rg.random((num_samples, 2)))
offset_df = pd.DataFrame(rg.random((num_samples, num_pheno)))
results = run_linear_regression_spark(spark,
genotype_df,
phenotype_df,
Expand All @@ -299,15 +299,15 @@ def test_simple_offset(spark):


@pytest.mark.min_spark('3')
def test_multi_offset(spark):
def test_multi_offset(spark, rg):
num_samples = 25
num_pheno = 25
num_geno = 10
genotype_df = pd.DataFrame(np.random.random((num_samples, num_geno)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, num_pheno)))
covariate_df = pd.DataFrame(np.random.random((num_samples, 10)))
genotype_df = pd.DataFrame(rg.random((num_samples, num_geno)))
phenotype_df = pd.DataFrame(rg.random((num_samples, num_pheno)))
covariate_df = pd.DataFrame(rg.random((num_samples, 10)))
offset_index = pd.MultiIndex.from_product([phenotype_df.index, ['chr1', 'chr2']])
offset_df = pd.DataFrame(np.random.random((num_samples * 2, num_pheno)), index=offset_index)
offset_df = pd.DataFrame(rg.random((num_samples * 2, num_pheno)), index=offset_index)
extra_cols = pd.DataFrame({'contigName': ['chr1', 'chr2'] * 5})
results = run_linear_regression_spark(spark,
genotype_df,
Expand All @@ -322,20 +322,19 @@ def test_multi_offset(spark):


@pytest.mark.min_spark('3')
def test_multi_offset_with_missing(spark):
def test_multi_offset_with_missing(spark, rg):
num_samples = 25
num_pheno = 24
num_geno = 18
contigs = ['chr1', 'chr2', 'chr3']
genotype_df = pd.DataFrame(np.random.random((num_samples, num_geno)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, num_pheno)))
genotype_df = pd.DataFrame(rg.random((num_samples, num_geno)))
phenotype_df = pd.DataFrame(rg.random((num_samples, num_pheno)))
missing = np.triu(np.ones(phenotype_df.shape))
missing[:, -1] = 0
phenotype_df[missing.astype('bool')] = np.nan
covariate_df = pd.DataFrame(np.random.random((num_samples, 10)))
covariate_df = pd.DataFrame(rg.random((num_samples, 10)))
offset_index = pd.MultiIndex.from_product([phenotype_df.index, contigs])
offset_df = pd.DataFrame(np.random.random((num_samples * len(contigs), num_pheno)),
index=offset_index)
offset_df = pd.DataFrame(rg.random((num_samples * len(contigs), num_pheno)), index=offset_index)
extra_cols = pd.DataFrame({'contigName': contigs * 6})
results = run_linear_regression_spark(spark,
genotype_df,
Expand All @@ -349,50 +348,50 @@ def test_multi_offset_with_missing(spark):


@pytest.mark.min_spark('3')
def test_offset_wrong_columns(spark):
def test_offset_wrong_columns(spark, rg):
num_samples = 25
num_pheno = 10
genotype_df = pd.DataFrame(np.random.random((num_samples, 10)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, num_pheno)))
offset_df = pd.DataFrame(np.random.random((num_samples, num_pheno)), columns=range(10, 20))
genotype_df = pd.DataFrame(rg.random((num_samples, 10)))
phenotype_df = pd.DataFrame(rg.random((num_samples, num_pheno)))
offset_df = pd.DataFrame(rg.random((num_samples, num_pheno)), columns=range(10, 20))
with pytest.raises(ValueError):
run_linear_regression_spark(spark, genotype_df, phenotype_df, offset_df=offset_df)


@pytest.mark.min_spark('3')
def test_offset_wrong_index(spark):
def test_offset_wrong_index(spark, rg):
num_samples = 25
num_pheno = 10
genotype_df = pd.DataFrame(np.random.random((num_samples, 10)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, num_pheno)))
offset_df = pd.DataFrame(np.random.random((num_samples, num_pheno)), index=range(1, 26))
genotype_df = pd.DataFrame(rg.random((num_samples, 10)))
phenotype_df = pd.DataFrame(rg.random((num_samples, num_pheno)))
offset_df = pd.DataFrame(rg.random((num_samples, num_pheno)), index=range(1, 26))
with pytest.raises(ValueError):
run_linear_regression_spark(spark, genotype_df, phenotype_df, offset_df=offset_df)


@pytest.mark.min_spark('3')
def test_offset_wrong_multi_index(spark):
def test_offset_wrong_multi_index(spark, rg):
num_samples = 25
num_pheno = 10
contigs = ['chr1', 'chr2']
genotype_df = pd.DataFrame(np.random.random((num_samples, 10)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, num_pheno)))
offset_df = pd.DataFrame(np.random.random((num_samples * len(contigs), num_pheno)),
genotype_df = pd.DataFrame(rg.random((num_samples, 10)))
phenotype_df = pd.DataFrame(rg.random((num_samples, num_pheno)))
offset_df = pd.DataFrame(rg.random((num_samples * len(contigs), num_pheno)),
pd.MultiIndex.from_product([range(1, 26), contigs]))
with pytest.raises(ValueError):
run_linear_regression_spark(spark, genotype_df, phenotype_df, offset_df=offset_df)


@pytest.mark.min_spark('3')
def test_offset_different_index_order(spark):
def test_offset_different_index_order(spark, rg):
num_samples = 25
num_pheno = 6
num_geno = 10
genotype_df = pd.DataFrame(np.random.random((num_samples, num_geno)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, num_pheno)))
genotype_df = pd.DataFrame(rg.random((num_samples, num_geno)))
phenotype_df = pd.DataFrame(rg.random((num_samples, num_pheno)))
phenotype_df.columns = phenotype_df.columns.astype('str')
covariate_df = pd.DataFrame(np.random.random((num_samples, 2)))
offset_df = pd.DataFrame(np.random.random((num_samples, num_pheno)))
covariate_df = pd.DataFrame(rg.random((num_samples, 2)))
offset_df = pd.DataFrame(rg.random((num_samples, num_pheno)))
offset_df.columns = offset_df.columns.astype('str')
baseline = statsmodels_baseline(genotype_df, phenotype_df, covariate_df, [offset_df] * num_geno)

Expand All @@ -409,23 +408,23 @@ def test_offset_different_index_order(spark):


@pytest.mark.min_spark('3')
def test_cast_genotypes(spark):
def test_cast_genotypes(spark, rg):
num_samples = 10
genotype_df = pd.DataFrame(np.random.randint(0, 10, (num_samples, 10)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, 5)))
covariate_df = pd.DataFrame(np.random.random((num_samples, 5)))
genotype_df = pd.DataFrame(rg.integers(0, 10, (num_samples, 10)))
phenotype_df = pd.DataFrame(rg.random((num_samples, 5)))
covariate_df = pd.DataFrame(rg.random((num_samples, 5)))
baseline = statsmodels_baseline(genotype_df, phenotype_df, covariate_df)
results = run_linear_regression_spark(spark, genotype_df, phenotype_df, covariate_df)
assert results['effect'].dtype == np.float64
assert regression_results_equal(baseline, results)


@pytest.mark.min_spark('3')
def test_cast_genotypes_float32(spark):
def test_cast_genotypes_float32(spark, rg):
num_samples = 10
genotype_df = pd.DataFrame(np.random.randint(0, 10, (num_samples, 10)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, 5)))
covariate_df = pd.DataFrame(np.random.random((num_samples, 5)))
genotype_df = pd.DataFrame(rg.integers(0, 10, (num_samples, 10)))
phenotype_df = pd.DataFrame(rg.random((num_samples, 5)))
covariate_df = pd.DataFrame(rg.random((num_samples, 5)))
baseline = statsmodels_baseline(genotype_df, phenotype_df, covariate_df)
results = run_linear_regression_spark(spark,
genotype_df,
Expand All @@ -437,21 +436,21 @@ def test_cast_genotypes_float32(spark):


@pytest.mark.min_spark('3')
def test_bad_datatype(spark):
def test_bad_datatype(spark, rg):
num_samples = 10
genotype_df = pd.DataFrame(np.random.random((num_samples, 10)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, 5)))
covariate_df = pd.DataFrame(np.random.random((num_samples, 5)))
genotype_df = pd.DataFrame(rg.random((num_samples, 10)))
phenotype_df = pd.DataFrame(rg.random((num_samples, 5)))
covariate_df = pd.DataFrame(rg.random((num_samples, 5)))
with pytest.raises(ValueError):
run_linear_regression_spark(spark, genotype_df, phenotype_df, covariate_df, dt=np.int32)


@pytest.mark.min_spark('3')
def test_bad_column_name(spark):
def test_bad_column_name(spark, rg):
num_samples = 10
genotype_df = pd.DataFrame(np.random.random((num_samples, 10)))
phenotype_df = pd.DataFrame(np.random.random((num_samples, 5)))
covariate_df = pd.DataFrame(np.random.random((num_samples, 5)))
genotype_df = pd.DataFrame(rg.random((num_samples, 10)))
phenotype_df = pd.DataFrame(rg.random((num_samples, 5)))
covariate_df = pd.DataFrame(rg.random((num_samples, 5)))
with pytest.raises(ValueError):
run_linear_regression_spark(spark,
genotype_df,
Expand All @@ -461,12 +460,12 @@ def test_bad_column_name(spark):


@pytest.mark.min_spark('3')
def test_values_expr(spark):
def test_values_expr(spark, rg):
from pyspark.sql.functions import array, lit
num_samples = 5
genotype_df = spark.range(1).withColumn('genotypes', lit(42))
phenotype_df = pd.DataFrame(np.random.random((num_samples, 5)))
covariate_df = pd.DataFrame(np.random.random((num_samples, 2)))
phenotype_df = pd.DataFrame(rg.random((num_samples, 5)))
covariate_df = pd.DataFrame(rg.random((num_samples, 2)))
array_vals = [lit(i) for i in range(num_samples)]
results = lr.linear_regression(genotype_df,
phenotype_df,
Expand Down
Loading

0 comments on commit e05a351

Please sign in to comment.