Merge branch 'master' of github.com:projectglow/glow into update-email
Henry Davidge committed Feb 3, 2021
2 parents 1db6b01 + 54db807 commit 88063c4
Showing 96 changed files with 9,369 additions and 2,485 deletions.
48 changes: 24 additions & 24 deletions .circleci/config.yml
@@ -117,6 +117,14 @@ jobs:
export SPARK_VERSION="2.4.5"
export SCALA_VERSION="2.11.12"
sbt coverage core/test coverageReport exit
- run:
name: Run docs tests
environment:
command: |
export PATH=$HOME/conda/envs/glow/bin:$PATH
export SPARK_VERSION="2.4.5"
export SCALA_VERSION="2.11.12"
sbt docs/test exit
- run:
name: Run Python tests
no_output_timeout: 30m
@@ -137,14 +145,6 @@ jobs:
sudo apt-get update
sudo apt-get -y install rsync
sbt installHail hail/test uninstallHail exit
- run:
name: Run docs tests
environment:
command: |
export PATH=$HOME/conda/envs/glow/bin:$PATH
export SPARK_VERSION="2.4.5"
export SCALA_VERSION="2.11.12"
sbt docs/test exit
- *check_clean_repo
- store_artifacts:
path: ~/glow/unit-tests.log
@@ -176,6 +176,14 @@ jobs:
export SPARK_VERSION="2.4.5"
export SCALA_VERSION="2.12.8"
sbt core/test exit
- run:
name: Run docs tests
environment:
command: |
export PATH=$HOME/conda/envs/glow/bin:$PATH
export SPARK_VERSION="2.4.5"
export SCALA_VERSION="2.12.8"
sbt docs/test exit
- run:
name: Run Python tests
no_output_timeout: 30m
@@ -195,14 +203,6 @@ jobs:
sudo apt-get update
sudo apt-get -y install rsync
sbt installHail hail/test uninstallHail exit
- run:
name: Run docs tests
environment:
command: |
export PATH=$HOME/conda/envs/glow/bin:$PATH
export SPARK_VERSION="2.4.5"
export SCALA_VERSION="2.12.8"
sbt docs/test exit
spark-3-tests:
@@ -226,6 +226,14 @@ jobs:
export SPARK_VERSION="3.0.0"
export SCALA_VERSION="2.12.8"
sbt core/test exit
- run:
name: Run docs tests
environment:
command: |
export PATH=$HOME/conda/envs/glow/bin:$PATH
export SPARK_VERSION="3.0.0"
export SCALA_VERSION="2.12.8"
sbt docs/test exit
- run:
name: Run Python tests
no_output_timeout: 30m
@@ -245,14 +253,6 @@ jobs:
sudo apt-get update
sudo apt-get -y install rsync
sbt installHail hail/test uninstallHail exit
- run:
name: Run docs tests
environment:
command: |
export PATH=$HOME/conda/envs/glow/bin:$PATH
export SPARK_VERSION="3.0.0"
export SCALA_VERSION="2.12.8"
sbt docs/test exit
- *check_clean_repo
- store_artifacts:
path: ~/glow/unit-tests.log
21 changes: 21 additions & 0 deletions README.md
@@ -114,6 +114,27 @@ To run Scala tests against the staged Maven artifact with the current stable ver
stagedRelease/test
```

## Testing code on a Databricks cluster

To test your changes on a Databricks cluster, you'll need to build and install the Python and Scala artifacts.

To build an uber jar (Glow + dependencies) with your changes:

`sbt core/assembly`

The uber jar will be at a path like `glow/core/target/${scala_version}/${artifact-name}-assembly-${version}-SNAPSHOT.jar`.

To build a wheel with the Python code:

1. Activate the Glow dev conda environment (`conda activate glow`)
2. `cd` into the `python` directory
3. Run `python setup.py bdist_wheel`

The wheel file will be at a path like `python/dist/glow.py-${version}-py3-none-any.whl`.

You can then [install these libraries on a Databricks cluster](https://docs.databricks.com/libraries/index.html).
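
Once the jar and wheel are attached to the cluster, a quick way to confirm the installation is to register Glow from a notebook and call one of its SQL functions. The snippet below is a minimal sketch, assuming it runs in a Databricks notebook (where `spark` is predefined) with the uber jar and wheel built above installed as cluster libraries.

```python
# Minimal sanity check; assumes a Databricks notebook with the Glow uber jar and
# wheel installed on the cluster, so the `glow` package and SQL functions resolve.
import glow

spark = glow.register(spark)

# Exercise a Glow SQL function on a tiny in-memory DataFrame.
df = spark.createDataFrame([([1.0, None, 3.0],)], ["numbers"])
df.selectExpr("mean_substitute(numbers) AS imputed").show(truncate=False)
```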


## IntelliJ Tips

If you use IntelliJ, you'll want to:
21 changes: 17 additions & 4 deletions build.sbt
@@ -405,23 +405,33 @@ updateCondaEnv := {
"conda env update -f python/environment.yml" !
}

def crossReleaseStep(step: ReleaseStep, requiresPySpark: Boolean): Seq[ReleaseStep] = {
def crossReleaseStep(step: ReleaseStep, requiresPySpark: Boolean, requiresHail: Boolean): Seq[ReleaseStep] = {
val updateCondaEnvStep = releaseStepCommandAndRemaining(
if (requiresPySpark) "updateCondaEnv" else "")
val changePySparkVersionStep = releaseStepCommandAndRemaining(
if (requiresPySpark) "changePySparkVersion" else "")
val installHailStep = releaseStepCommandAndRemaining(
if (requiresHail) "installHail" else "")
val uninstallHailStep = releaseStepCommandAndRemaining(
if (requiresHail) "uninstallHail" else "")

Seq(
updateCondaEnvStep,
releaseStepCommandAndRemaining(s"""set ThisBuild / sparkVersion := "$spark3""""),
releaseStepCommandAndRemaining(s"""set ThisBuild / scalaVersion := "$scala212""""),
installHailStep,
step,
uninstallHailStep,
changePySparkVersionStep,
releaseStepCommandAndRemaining(s"""set ThisBuild / sparkVersion := "$spark2""""),
releaseStepCommandAndRemaining(s"""set ThisBuild / scalaVersion := "$scala211""""),
installHailStep,
step,
uninstallHailStep,
releaseStepCommandAndRemaining(s"""set ThisBuild / scalaVersion := "$scala212""""),
installHailStep,
step,
uninstallHailStep,
updateCondaEnvStep
)
}
@@ -431,16 +441,19 @@ releaseProcess := Seq[ReleaseStep](
inquireVersions,
runClean
) ++
crossReleaseStep(runTest, true) ++
crossReleaseStep(releaseStepCommandAndRemaining("core/test"), requiresPySpark = false, requiresHail = false) ++
crossReleaseStep(releaseStepCommandAndRemaining("python/test"), requiresPySpark = true, requiresHail = false) ++
crossReleaseStep(releaseStepCommandAndRemaining("docs/test"), requiresPySpark = true, requiresHail = false) ++
crossReleaseStep(releaseStepCommandAndRemaining("hail/test"), requiresPySpark = true, requiresHail = true) ++
Seq(
setReleaseVersion,
updateStableVersion,
commitReleaseVersion,
commitStableVersion,
tagRelease
) ++
crossReleaseStep(publishArtifacts, false) ++
crossReleaseStep(releaseStepCommandAndRemaining("stagedRelease/test"), false) ++
crossReleaseStep(publishArtifacts, requiresPySpark = false, requiresHail = false) ++
crossReleaseStep(releaseStepCommandAndRemaining("stagedRelease/test"), requiresPySpark = false, requiresHail = false) ++
Seq(
setNextVersion,
commitNextVersion
16 changes: 13 additions & 3 deletions core/src/main/scala/io/projectglow/sql/dsl/package.scala
@@ -24,15 +24,25 @@ package object dsl {

trait ImplicitOperators {
def expr: Expression

// Ensure that lambda variables have unique names for nested functions
private var lambdaCounter = 0
private def nextLambdaName(): String = {
lambdaCounter += 1
lambdaCounter.toString
}

private def makeLambdaFunction(f: Expression => Expression): LambdaFunction = {
val x = UnresolvedNamedLambdaVariable(Seq("x"))
val x = UnresolvedNamedLambdaVariable(Seq(nextLambdaName()))
LambdaFunction(f(x), Seq(x))
}

private def makeLambdaFunction(f: (Expression, Expression) => Expression): LambdaFunction = {
val x = UnresolvedNamedLambdaVariable(Seq("x"))
val y = UnresolvedNamedLambdaVariable(Seq("y"))
val x = UnresolvedNamedLambdaVariable(Seq(nextLambdaName()))
val y = UnresolvedNamedLambdaVariable(Seq(nextLambdaName()))
LambdaFunction(f(x, y), Seq(x, y))
}

def arrayTransform(fn: Expression => Expression): Expression = {
ArrayTransform(expr, makeLambdaFunction(fn))
}
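
For context on the `nextLambdaName()` change above: Spark resolves lambda variables in higher-order functions by name, so when the DSL nests one generated lambda inside another (as the `MeanSubstitute` rewrite below does by placing an `arrayTransform` inside an `aggregate`'s finish function), reusing a fixed name such as `x` would let the inner variable shadow the outer one. The PySpark snippet below only illustrates that kind of nesting with Spark's built-in higher-order functions; the hand-picked distinct names (`arr`, `acc`, `x`) stand in for the names the DSL now generates.

```python
# Illustration only: nested higher-order functions in Spark SQL, with a distinct
# variable name at each lambda level. The Glow DSL must generate equally distinct
# names when it builds such expressions programmatically.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("nested-lambdas").getOrCreate()

df = spark.createDataFrame([([[1.0, 2.0], [3.0, None]],)], ["matrix"])

df.selectExpr(
    "transform(matrix, arr -> "
    "aggregate(arr, cast(0.0 as double), (acc, x) -> acc + coalesce(x, 0.0))"
    ") AS row_sums"
).show(truncate=False)  # row_sums: [3.0, 3.0]
```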
@@ -83,17 +83,8 @@ case class MeanSubstitute(array: Expression, missingValue: Expression)
)
}

lazy val arrayMean: Expression = {
// Sum and count of non-missing values
array.aggregate(
createNamedStruct(Literal(0d), Literal(0L)),
updateSumAndCountConditionally,
calculateMean
)
}

def substituteWithMean(arrayElement: Expression): Expression = {
If(isMissing(arrayElement), arrayMean, arrayElement)
def substituteWithMean(arrayElement: Expression, meanExpr: Expression): Expression = {
If(isMissing(arrayElement), meanExpr, arrayElement)
}

override def rewrite: Expression = {
@@ -107,7 +98,12 @@ case class MeanSubstitute(array: Expression, missingValue: Expression)
s"Missing value must be of numeric type; provided type is ${missingValue.dataType}.")
}

// Replace missing values with the provided strategy
array.arrayTransform(substituteWithMean(_))
array.aggregate(
createNamedStruct(Literal(0d), Literal(0L)),
updateSumAndCountConditionally,
// Note: it's important that we perform the substitution in the finish function. Otherwise the mean is
// recomputed for each missing element.
acc => array.arrayTransform(el => substituteWithMean(el, calculateMean(acc)))
)
}
}
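
In practical terms, the rewrite above still replaces each missing entry with the mean of the non-missing entries, but the substitution now happens inside the aggregate's finish function, so the mean is evaluated once per array instead of once per missing element. The pure-Python sketch below mirrors that structure; the `-1` default sentinel for `missing_value` is an assumption here rather than something taken from this diff, and the example output matches the values asserted by the existing suite test below.

```python
# Reference sketch of the rewritten expression's semantics: one aggregation pass
# over the array, then a single transform that reuses the precomputed mean.
from typing import List, Optional

def mean_substitute(values: List[Optional[float]], missing_value: float = -1.0) -> List[float]:
    # Assumed convention: null, NaN, or the sentinel value counts as missing.
    # Assumes at least one non-missing value.
    def is_missing(v: Optional[float]) -> bool:
        return v is None or v != v or v == missing_value

    total, count = 0.0, 0
    for v in values:                     # aggregate: updateSumAndCountConditionally
        if not is_missing(v):
            total, count = total + v, count + 1
    mean = total / count                 # finish: calculateMean, evaluated once
    return [mean if is_missing(v) else v for v in values]  # arrayTransform: substituteWithMean

print(mean_substitute([1.5, None, 0.0, 1.0, 2.0, 3.0]))  # [1.5, 1.5, 0.0, 1.0, 2.0, 3.0]
```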
@@ -352,6 +352,18 @@ class VariantQcExprsSuite extends GlowBaseTest {
assert(test == Seq(1.5, 1.5, 0.0, 1.0, 2.0, 3.0))
}

test("mean substitute big array") {
val values = Array.fill(100000)(None: Option[Int])
values(0) = Some(1)
val test = spark
.createDataFrame(Seq(OptIntDatum(values)))
.selectExpr("mean_substitute(numbers)")
.collect()
.head
.getSeq[Double](0)
assert(test.forall(_ == 1))
}
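
A PySpark analogue of the new "mean substitute big array" test above, sketched under the assumption that `spark` is a session with Glow registered (for example via the setup shown in the README and notebook changes in this commit). With the previous rewrite, an array of 100,000 mostly-missing values would have re-evaluated the mean for every missing element; now it is computed once.

```python
# Assumes an active session with Glow registered, e.g. spark = glow.register(spark).
values = [None] * 100000
values[0] = 1
df = spark.createDataFrame([(values,)], "numbers: array<int>")
imputed = df.selectExpr("mean_substitute(numbers) AS imputed").head()["imputed"]
assert all(v == 1.0 for v in imputed)
```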

test("null array") {
val test = spark
.createDataFrame(Seq(Datum(null)))
18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/etl/gff.html

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/etl/lift-over.html

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/etl/merge-vcf.html

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/etl/normalizevariants.html

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/etl/sample-qc-demo.html

Large diffs are not rendered by default.

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/etl/variant-data.html

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/etl/variant-qc-demo.html

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/etl/vcf2delta.html

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/tertiary/binaryglowgr.html

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/tertiary/glowgr.html

Large diffs are not rendered by default.

42 changes: 42 additions & 0 deletions docs/source/_static/notebooks/tertiary/gwas-binary.html

Large diffs are not rendered by default.

42 changes: 42 additions & 0 deletions docs/source/_static/notebooks/tertiary/gwas-quantitative.html

Large diffs are not rendered by default.

17 changes: 9 additions & 8 deletions docs/source/_static/notebooks/tertiary/gwas.html

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/tertiary/pandas-lmm.html

Large diffs are not rendered by default.

20 changes: 10 additions & 10 deletions docs/source/_static/notebooks/tertiary/pipe-transformer.html

Large diffs are not rendered by default.

@@ -16,24 +16,22 @@
# MAGIC
# MAGIC To perform variant liftover, you must download a reference file to each node of the cluster. Here, we assume the reference genome is downloaded to
# MAGIC ```/mnt/dbnucleus/dbgenomics/grch38/data/GRCh38_full_analysis_set_plus_decoy_hla.fa```
# MAGIC
# MAGIC If you are using a Databricks cluster with [Databricks Runtime for Genomics](https://docs.databricks.com/applications/genomics/index.html), this can be achieved by setting [environment variable](https://docs.databricks.com/user-guide/clusters/spark-config.html#environment-variables) `refGenomeId=grch38`.

# COMMAND ----------

# DBTITLE 1,Import glow and define path variables
import glow
glow.register(spark)
spark = glow.register(spark)
chain_file = '/opt/liftover/b37ToHg38.over.chain'
reference_file = '/mnt/dbnucleus/dbgenomics/grch38/data/GRCh38_full_analysis_set_plus_decoy_hla.fa'
vcf_file = 'dbfs:/databricks-datasets/genomics/1kg-vcfs/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz'

# COMMAND ----------

# DBTITLE 1,First, read in a VCF from a flat file or Delta Lake table.
input_df = spark.read.format("vcf") \
.load(vcf_file) \
.cache()
input_df = (spark.read.format("vcf")
.load(vcf_file)
.cache())

# COMMAND ----------
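
The hunk above covers only the setup and the VCF load; the rest of the notebook, which is not shown in this diff, presumably applies the actual liftover. The sketch below uses Glow's `lift_over_variants` transformer with the `chain_file` and `reference_file` variables defined above; the transformer name and option keys follow Glow's liftover documentation and should be treated as assumptions if your Glow version differs.

```python
# Sketch of the liftover step (not part of this hunk), assuming the
# lift_over_variants transformer and its chain_file / reference_file options.
lifted_df = glow.transform(
    'lift_over_variants',
    input_df,
    chain_file=chain_file,
    reference_file=reference_file
)

display(lifted_df)
```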

@@ -1,7 +1,7 @@
# Databricks notebook source
from pyspark.sql.functions import *
import glow
glow.register(spark)
spark = glow.register(spark)

# COMMAND ----------

@@ -4,24 +4,22 @@
# MAGIC To use variant normalizer, a copy of the reference genome `.fa/.fasta` file (along with its `.fai` file) must be downloaded to each node of the cluster.
# MAGIC
# MAGIC Here, we assume the reference genome is downloaded to the following path: `/mnt/dbnucleus/dbgenomics/grch38/data/GRCh38_full_analysis_set_plus_decoy_hla.fa`
# MAGIC
# MAGIC If you are using a Databricks cluster with [Databricks Runtime for Genomics](https://docs.databricks.com/applications/genomics/index.html), this can be done by setting the [environment variable](https://docs.databricks.com/user-guide/clusters/spark-config.html#environment-variables) ``refGenomeId=grch38`` for your cluster.

# COMMAND ----------

# DBTITLE 1,Define path variables
import glow
glow.register(spark)
ref_genome_path = '/mnt/dbnucleus/dbgenomics/grch38/data/GRCh38_full_analysis_set_plus_decoy_hla.fa'
spark = glow.register(spark)
ref_genome_path = '/dbfs/databricks-datasets/genomics/grch38/data/GRCh38_full_analysis_set_plus_decoy_hla.fa'
vcf_path = '/databricks-datasets/genomics/variant-normalization/test_left_align_hg38.vcf'

# COMMAND ----------

# DBTITLE 1,Load a VCF into a DataFrame
original_variants_df = spark.read\
.format("vcf")\
.option("includeSampleIds", False)\
.load(vcf_path)
original_variants_df = (spark.read
.format("vcf")
.option("includeSampleIds", False)
.load(vcf_path))

# COMMAND ----------

@@ -31,9 +29,9 @@
# COMMAND ----------

# DBTITLE 1,Normalize variants using normalize_variants transformer with column replacement
normalized_variants_df = glow.transform(\
"normalize_variants",\
original_variants_df,\
normalized_variants_df = glow.transform(
"normalize_variants",
original_variants_df,
reference_genome_path=ref_genome_path
)

@@ -42,9 +40,9 @@
# COMMAND ----------

# DBTITLE 1,Normalize variants using normalize_variants transformer without column replacement
normalized_variants_df = glow.transform(\
"normalize_variants",\
original_variants_df,\
normalized_variants_df = glow.transform(
"normalize_variants",
original_variants_df,
reference_genome_path=ref_genome_path,
replace_columns="False"
)
@@ -54,8 +52,6 @@
# COMMAND ----------

# DBTITLE 1,Normalize variants using normalize_variant function
from glow.functions import *

normalized_variants_df = original_variants_df.select("*", normalize_variant("contigName", "start", "end", "referenceAllele", "alternateAlleles", ref_genome_path).alias("normalizationResult"))
normalized_variants_df = original_variants_df.select("*", glow.normalize_variant("contigName", "start", "end", "referenceAllele", "alternateAlleles", ref_genome_path).alias("normalizationResult"))

display(normalized_variants_df)