Merge branch 'master' of github.com:projectglow/glow into update-email
Henry Davidge committed Feb 3, 2021
2 parents 1db6b01 + 54db807 commit 88063c4
Showing 96 changed files with 9,369 additions and 2,485 deletions.
48 changes: 24 additions & 24 deletions .circleci/config.yml
@@ -117,6 +117,14 @@ jobs:
export SPARK_VERSION="2.4.5"
export SCALA_VERSION="2.11.12"
sbt coverage core/test coverageReport exit
- run:
name: Run docs tests
environment:
command: |
export PATH=$HOME/conda/envs/glow/bin:$PATH
export SPARK_VERSION="2.4.5"
export SCALA_VERSION="2.11.12"
sbt docs/test exit
- run:
name: Run Python tests
no_output_timeout: 30m
@@ -137,14 +145,6 @@ jobs:
sudo apt-get update
sudo apt-get -y install rsync
sbt installHail hail/test uninstallHail exit
- run:
name: Run docs tests
environment:
command: |
export PATH=$HOME/conda/envs/glow/bin:$PATH
export SPARK_VERSION="2.4.5"
export SCALA_VERSION="2.11.12"
sbt docs/test exit
- *check_clean_repo
- store_artifacts:
path: ~/glow/unit-tests.log
@@ -176,6 +176,14 @@ jobs:
export SPARK_VERSION="2.4.5"
export SCALA_VERSION="2.12.8"
sbt core/test exit
- run:
name: Run docs tests
environment:
command: |
export PATH=$HOME/conda/envs/glow/bin:$PATH
export SPARK_VERSION="2.4.5"
export SCALA_VERSION="2.12.8"
sbt docs/test exit
- run:
name: Run Python tests
no_output_timeout: 30m
@@ -195,14 +203,6 @@ jobs:
sudo apt-get update
sudo apt-get -y install rsync
sbt installHail hail/test uninstallHail exit
- run:
name: Run docs tests
environment:
command: |
export PATH=$HOME/conda/envs/glow/bin:$PATH
export SPARK_VERSION="2.4.5"
export SCALA_VERSION="2.12.8"
sbt docs/test exit
spark-3-tests:
@@ -226,6 +226,14 @@ jobs:
export SPARK_VERSION="3.0.0"
export SCALA_VERSION="2.12.8"
sbt core/test exit
- run:
name: Run docs tests
environment:
command: |
export PATH=$HOME/conda/envs/glow/bin:$PATH
export SPARK_VERSION="3.0.0"
export SCALA_VERSION="2.12.8"
sbt docs/test exit
- run:
name: Run Python tests
no_output_timeout: 30m
@@ -245,14 +253,6 @@ jobs:
sudo apt-get update
sudo apt-get -y install rsync
sbt installHail hail/test uninstallHail exit
- run:
name: Run docs tests
environment:
command: |
export PATH=$HOME/conda/envs/glow/bin:$PATH
export SPARK_VERSION="3.0.0"
export SCALA_VERSION="2.12.8"
sbt docs/test exit
- *check_clean_repo
- store_artifacts:
path: ~/glow/unit-tests.log
21 changes: 21 additions & 0 deletions README.md
@@ -114,6 +114,27 @@ To run Scala tests against the staged Maven artifact with the current stable ver
stagedRelease/test
```

## Testing code on a Databricks cluster

To test your changes on a Databricks cluster, you'll need to build and install the Python and Scala artifacts.

To build an uber jar (Glow + dependencies) with your changes:

`sbt core/assembly`

The uber jar will be at a path like `glow/core/target/${scala_version}/${artifact-name}-assembly-${version}-SNAPSHOT.jar`.

To build a wheel with the Python code:

1. Activate the Glow dev conda environment (`conda activate glow`)
2. `cd` into the `python` directory
3. Run `python setup.py bdist_wheel`

The wheel file will be at a path like `python/dist/glow.py-${version}-py3-none-any.whl`.

You can then [install these libraries on a Databricks cluster](https://docs.databricks.com/libraries/index.html).
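
Once the jar and wheel are attached to the cluster, a quick way to confirm the installation is to register Glow from a notebook and call one of its SQL functions. The snippet below is a minimal sketch, assuming it runs in a Databricks notebook (where `spark` is predefined) with the uber jar and wheel built above installed as cluster libraries.

```python
# Minimal sanity check; assumes a Databricks notebook with the Glow uber jar and
# wheel installed on the cluster, so the `glow` package and SQL functions resolve.
import glow

spark = glow.register(spark)

# Exercise a Glow SQL function on a tiny in-memory DataFrame.
df = spark.createDataFrame([([1.0, None, 3.0],)], ["numbers"])
df.selectExpr("mean_substitute(numbers) AS imputed").show(truncate=False)
```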


## IntelliJ Tips

If you use IntelliJ, you'll want to:
21 changes: 17 additions & 4 deletions build.sbt
@@ -405,23 +405,33 @@ updateCondaEnv := {
"conda env update -f python/environment.yml" !
}

def crossReleaseStep(step: ReleaseStep, requiresPySpark: Boolean): Seq[ReleaseStep] = {
def crossReleaseStep(step: ReleaseStep, requiresPySpark: Boolean, requiresHail: Boolean): Seq[ReleaseStep] = {
val updateCondaEnvStep = releaseStepCommandAndRemaining(
if (requiresPySpark) "updateCondaEnv" else "")
val changePySparkVersionStep = releaseStepCommandAndRemaining(
if (requiresPySpark) "changePySparkVersion" else "")
val installHailStep = releaseStepCommandAndRemaining(
if (requiresHail) "installHail" else "")
val uninstallHailStep = releaseStepCommandAndRemaining(
if (requiresHail) "uninstallHail" else "")

Seq(
updateCondaEnvStep,
releaseStepCommandAndRemaining(s"""set ThisBuild / sparkVersion := "$spark3""""),
releaseStepCommandAndRemaining(s"""set ThisBuild / scalaVersion := "$scala212""""),
installHailStep,
step,
uninstallHailStep,
changePySparkVersionStep,
releaseStepCommandAndRemaining(s"""set ThisBuild / sparkVersion := "$spark2""""),
releaseStepCommandAndRemaining(s"""set ThisBuild / scalaVersion := "$scala211""""),
installHailStep,
step,
uninstallHailStep,
releaseStepCommandAndRemaining(s"""set ThisBuild / scalaVersion := "$scala212""""),
installHailStep,
step,
uninstallHailStep,
updateCondaEnvStep
)
}
@@ -431,16 +441,19 @@ releaseProcess := Seq[ReleaseStep](
inquireVersions,
runClean
) ++
crossReleaseStep(runTest, true) ++
crossReleaseStep(releaseStepCommandAndRemaining("core/test"), requiresPySpark = false, requiresHail = false) ++
crossReleaseStep(releaseStepCommandAndRemaining("python/test"), requiresPySpark = true, requiresHail = false) ++
crossReleaseStep(releaseStepCommandAndRemaining("docs/test"), requiresPySpark = true, requiresHail = false) ++
crossReleaseStep(releaseStepCommandAndRemaining("hail/test"), requiresPySpark = true, requiresHail = true) ++
Seq(
setReleaseVersion,
updateStableVersion,
commitReleaseVersion,
commitStableVersion,
tagRelease
) ++
crossReleaseStep(publishArtifacts, false) ++
crossReleaseStep(releaseStepCommandAndRemaining("stagedRelease/test"), false) ++
crossReleaseStep(publishArtifacts, requiresPySpark = false, requiresHail = false) ++
crossReleaseStep(releaseStepCommandAndRemaining("stagedRelease/test"), requiresPySpark = false, requiresHail = false) ++
Seq(
setNextVersion,
commitNextVersion
16 changes: 13 additions & 3 deletions core/src/main/scala/io/projectglow/sql/dsl/package.scala
@@ -24,15 +24,25 @@ package object dsl {

trait ImplicitOperators {
def expr: Expression

// Ensure that lambda variables have unique names for nested functions
private var lambdaCounter = 0
private def nextLambdaName(): String = {
lambdaCounter += 1
lambdaCounter.toString
}

private def makeLambdaFunction(f: Expression => Expression): LambdaFunction = {
val x = UnresolvedNamedLambdaVariable(Seq("x"))
val x = UnresolvedNamedLambdaVariable(Seq(nextLambdaName()))
LambdaFunction(f(x), Seq(x))
}

private def makeLambdaFunction(f: (Expression, Expression) => Expression): LambdaFunction = {
val x = UnresolvedNamedLambdaVariable(Seq("x"))
val y = UnresolvedNamedLambdaVariable(Seq("y"))
val x = UnresolvedNamedLambdaVariable(Seq(nextLambdaName()))
val y = UnresolvedNamedLambdaVariable(Seq(nextLambdaName()))
LambdaFunction(f(x, y), Seq(x, y))
}

def arrayTransform(fn: Expression => Expression): Expression = {
ArrayTransform(expr, makeLambdaFunction(fn))
}
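
For context on the `nextLambdaName()` change above: Spark resolves lambda variables in higher-order functions by name, so when the DSL nests one generated lambda inside another (as the `MeanSubstitute` rewrite below does by placing an `arrayTransform` inside an `aggregate`'s finish function), reusing a fixed name such as `x` would let the inner variable shadow the outer one. The PySpark snippet below only illustrates that kind of nesting with Spark's built-in higher-order functions; the hand-picked distinct names (`arr`, `acc`, `x`) stand in for the names the DSL now generates.

```python
# Illustration only: nested higher-order functions in Spark SQL, with a distinct
# variable name at each lambda level. The Glow DSL must generate equally distinct
# names when it builds such expressions programmatically.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("nested-lambdas").getOrCreate()

df = spark.createDataFrame([([[1.0, 2.0], [3.0, None]],)], ["matrix"])

df.selectExpr(
    "transform(matrix, arr -> "
    "aggregate(arr, cast(0.0 as double), (acc, x) -> acc + coalesce(x, 0.0))"
    ") AS row_sums"
).show(truncate=False)  # row_sums: [3.0, 3.0]
```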
@@ -83,17 +83,8 @@ case class MeanSubstitute(array: Expression, missingValue: Expression)
)
}

lazy val arrayMean: Expression = {
// Sum and count of non-missing values
array.aggregate(
createNamedStruct(Literal(0d), Literal(0L)),
updateSumAndCountConditionally,
calculateMean
)
}

def substituteWithMean(arrayElement: Expression): Expression = {
If(isMissing(arrayElement), arrayMean, arrayElement)
def substituteWithMean(arrayElement: Expression, meanExpr: Expression): Expression = {
If(isMissing(arrayElement), meanExpr, arrayElement)
}

override def rewrite: Expression = {
@@ -107,7 +98,12 @@ case class MeanSubstitute(array: Expression, missingValue: Expression)
s"Missing value must be of numeric type; provided type is ${missingValue.dataType}.")
}

// Replace missing values with the provided strategy
array.arrayTransform(substituteWithMean(_))
array.aggregate(
createNamedStruct(Literal(0d), Literal(0L)),
updateSumAndCountConditionally,
// Note: it's important that we perform the substitution in the finish function. Otherwise the mean is
// recomputed for each missing element.
acc => array.arrayTransform(el => substituteWithMean(el, calculateMean(acc)))
)
}
}
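
In practical terms, the rewrite above still replaces each missing entry with the mean of the non-missing entries, but the substitution now happens inside the aggregate's finish function, so the mean is evaluated once per array instead of once per missing element. The pure-Python sketch below mirrors that structure; the `-1` default sentinel for `missing_value` is an assumption here rather than something taken from this diff, and the example output matches the values asserted by the existing suite test below.

```python
# Reference sketch of the rewritten expression's semantics: one aggregation pass
# over the array, then a single transform that reuses the precomputed mean.
from typing import List, Optional

def mean_substitute(values: List[Optional[float]], missing_value: float = -1.0) -> List[float]:
    # Assumed convention: null, NaN, or the sentinel value counts as missing.
    # Assumes at least one non-missing value.
    def is_missing(v: Optional[float]) -> bool:
        return v is None or v != v or v == missing_value

    total, count = 0.0, 0
    for v in values:                     # aggregate: updateSumAndCountConditionally
        if not is_missing(v):
            total, count = total + v, count + 1
    mean = total / count                 # finish: calculateMean, evaluated once
    return [mean if is_missing(v) else v for v in values]  # arrayTransform: substituteWithMean

print(mean_substitute([1.5, None, 0.0, 1.0, 2.0, 3.0]))  # [1.5, 1.5, 0.0, 1.0, 2.0, 3.0]
```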
@@ -352,6 +352,18 @@ class VariantQcExprsSuite extends GlowBaseTest {
assert(test == Seq(1.5, 1.5, 0.0, 1.0, 2.0, 3.0))
}

test("mean substitute big array") {
val values = Array.fill(100000)(None: Option[Int])
values(0) = Some(1)
val test = spark
.createDataFrame(Seq(OptIntDatum(values)))
.selectExpr("mean_substitute(numbers)")
.collect()
.head
.getSeq[Double](0)
assert(test.forall(_ == 1))
}
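
A PySpark analogue of the new "mean substitute big array" test above, sketched under the assumption that `spark` is a session with Glow registered (for example via the setup shown in the README and notebook changes in this commit). With the previous rewrite, an array of 100,000 mostly-missing values would have re-evaluated the mean for every missing element; now it is computed once.

```python
# Assumes an active session with Glow registered, e.g. spark = glow.register(spark).
values = [None] * 100000
values[0] = 1
df = spark.createDataFrame([(values,)], "numbers: array<int>")
imputed = df.selectExpr("mean_substitute(numbers) AS imputed").head()["imputed"]
assert all(v == 1.0 for v in imputed)
```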

test("null array") {
val test = spark
.createDataFrame(Seq(Datum(null)))
18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/etl/gff.html

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/etl/lift-over.html

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/etl/merge-vcf.html

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/etl/normalizevariants.html

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/etl/sample-qc-demo.html

Large diffs are not rendered by default.

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/etl/variant-data.html

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/etl/variant-qc-demo.html

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/etl/vcf2delta.html

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/tertiary/binaryglowgr.html

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/tertiary/glowgr.html

Large diffs are not rendered by default.

42 changes: 42 additions & 0 deletions docs/source/_static/notebooks/tertiary/gwas-binary.html

Large diffs are not rendered by default.

42 changes: 42 additions & 0 deletions docs/source/_static/notebooks/tertiary/gwas-quantitative.html

Large diffs are not rendered by default.

17 changes: 9 additions & 8 deletions docs/source/_static/notebooks/tertiary/gwas.html

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions docs/source/_static/notebooks/tertiary/pandas-lmm.html

Large diffs are not rendered by default.

20 changes: 10 additions & 10 deletions docs/source/_static/notebooks/tertiary/pipe-transformer.html

Large diffs are not rendered by default.

@@ -16,24 +16,22 @@
# MAGIC
# MAGIC To perform variant liftover, you must download a reference file to each node of the cluster. Here, we assume the reference genome is downloaded to
# MAGIC ```/mnt/dbnucleus/dbgenomics/grch38/data/GRCh38_full_analysis_set_plus_decoy_hla.fa```
# MAGIC
# MAGIC If you are using a Databricks cluster with [Databricks Runtime for Genomics](https://docs.databricks.com/applications/genomics/index.html), this can be achieved by setting [environment variable](https://docs.databricks.com/user-guide/clusters/spark-config.html#environment-variables) `refGenomeId=grch38`.

# COMMAND ----------

# DBTITLE 1,Import glow and define path variables
import glow
glow.register(spark)
spark = glow.register(spark)
chain_file = '/opt/liftover/b37ToHg38.over.chain'
reference_file = '/mnt/dbnucleus/dbgenomics/grch38/data/GRCh38_full_analysis_set_plus_decoy_hla.fa'
vcf_file = 'dbfs:/databricks-datasets/genomics/1kg-vcfs/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz'

# COMMAND ----------

# DBTITLE 1,First, read in a VCF from a flat file or Delta Lake table.
input_df = spark.read.format("vcf") \
.load(vcf_file) \
.cache()
input_df = (spark.read.format("vcf")
.load(vcf_file)
.cache())

# COMMAND ----------
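
The hunk above covers only the setup and the VCF load; the rest of the notebook, which is not shown in this diff, presumably applies the actual liftover. The sketch below uses Glow's `lift_over_variants` transformer with the `chain_file` and `reference_file` variables defined above; the transformer name and option keys follow Glow's liftover documentation and should be treated as assumptions if your Glow version differs.

```python
# Sketch of the liftover step (not part of this hunk), assuming the
# lift_over_variants transformer and its chain_file / reference_file options.
lifted_df = glow.transform(
    'lift_over_variants',
    input_df,
    chain_file=chain_file,
    reference_file=reference_file
)

display(lifted_df)
```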

@@ -1,7 +1,7 @@
# Databricks notebook source
from pyspark.sql.functions import *
import glow
glow.register(spark)
spark = glow.register(spark)

# COMMAND ----------

@@ -4,24 +4,22 @@
# MAGIC To use variant normalizer, a copy of the reference genome `.fa/.fasta` file (along with its `.fai` file) must be downloaded to each node of the cluster.
# MAGIC
# MAGIC Here, we assume the reference genome is downloaded to the following path: `/mnt/dbnucleus/dbgenomics/grch38/data/GRCh38_full_analysis_set_plus_decoy_hla.fa`
# MAGIC
# MAGIC If you are using a Databricks cluster with [Databricks Runtime for Genomics](https://docs.databricks.com/applications/genomics/index.html), this can be done by setting the [environment variable](https://docs.databricks.com/user-guide/clusters/spark-config.html#environment-variables) ``refGenomeId=grch38`` for your cluster.

# COMMAND ----------

# DBTITLE 1,Define path variables
import glow
glow.register(spark)
ref_genome_path = '/mnt/dbnucleus/dbgenomics/grch38/data/GRCh38_full_analysis_set_plus_decoy_hla.fa'
spark = glow.register(spark)
ref_genome_path = '/dbfs/databricks-datasets/genomics/grch38/data/GRCh38_full_analysis_set_plus_decoy_hla.fa'
vcf_path = '/databricks-datasets/genomics/variant-normalization/test_left_align_hg38.vcf'

# COMMAND ----------

# DBTITLE 1,Load a VCF into a DataFrame
original_variants_df = spark.read\
.format("vcf")\
.option("includeSampleIds", False)\
.load(vcf_path)
original_variants_df = (spark.read
.format("vcf")
.option("includeSampleIds", False)
.load(vcf_path))

# COMMAND ----------

@@ -31,9 +29,9 @@
# COMMAND ----------

# DBTITLE 1,Normalize variants using normalize_variants transformer with column replacement
normalized_variants_df = glow.transform(\
"normalize_variants",\
original_variants_df,\
normalized_variants_df = glow.transform(
"normalize_variants",
original_variants_df,
reference_genome_path=ref_genome_path
)

@@ -42,9 +40,9 @@
# COMMAND ----------

# DBTITLE 1,Normalize variants using normalize_variants transformer without column replacement
normalized_variants_df = glow.transform(\
"normalize_variants",\
original_variants_df,\
normalized_variants_df = glow.transform(
"normalize_variants",
original_variants_df,
reference_genome_path=ref_genome_path,
replace_columns="False"
)
@@ -54,8 +52,6 @@
# COMMAND ----------

# DBTITLE 1,Normalize variants using normalize_variant function
from glow.functions import *

normalized_variants_df = original_variants_df.select("*", normalize_variant("contigName", "start", "end", "referenceAllele", "alternateAlleles", ref_genome_path).alias("normalizationResult"))
normalized_variants_df = original_variants_df.select("*", glow.normalize_variant("contigName", "start", "end", "referenceAllele", "alternateAlleles", ref_genome_path).alias("normalizationResult"))

display(normalized_variants_df)