Add documentation for Hail interoperation #310

Merged: 8 commits, Nov 25, 2020
12 changes: 8 additions & 4 deletions build.sbt
@@ -223,7 +223,11 @@ ThisBuild / installHail := {

lazy val uninstallHail = taskKey[Unit]("Uninstall Hail")
ThisBuild / uninstallHail := {
"conda env remove --name hail" ### "rm -rf hail" !
Seq(
"/bin/bash",
"-c",
"conda env remove --name hail;" + "rm -rf hail"
) !
}

lazy val sparkClasspath = taskKey[String]("sparkClasspath")
@@ -301,11 +305,11 @@ lazy val python =
)
.dependsOn(core % "test->test")

-lazy val hail = (project in file("python/glow/hail"))
+lazy val hail = (project in file("hail"))
.settings(
pythonSettings,
test in Test := {
-hailtest.toTask(" --doctest-modules python/glow/hail/").value
+hailtest.toTask(" --doctest-modules python/glow/hail/ docs/source/etl/hail.rst").value
Review comment (Contributor): nit: make this into a variable hailPaths in case we have more hail related docs in the future

}
)
.dependsOn(core % "test->test", python)
@@ -314,7 +318,7 @@ lazy val docs = (project in file("docs"))
.settings(
pythonSettings,
test in Test := {
-pytest.toTask(" docs").value
+pytest.toTask(" --ignore=docs/source/etl/hail.rst docs").value
}
)
.dependsOn(core % "test->test", python)
7 changes: 7 additions & 0 deletions docs/source/api-docs/hail-functions.rst
@@ -0,0 +1,7 @@
Hail Interoperation Functions
-----------------------------

Glow includes functionality to enable interoperation with `Hail <https://hail.is/>`_.

.. automodule:: glow.hail.functions
:members:
1 change: 1 addition & 0 deletions docs/source/api-docs/index.rst
@@ -8,3 +8,4 @@ Glow's Python API is designed to work seamlessly with PySpark and other tools in
toplevel-functions
pyspark-functions
glowgr
+hail-functions
43 changes: 43 additions & 0 deletions docs/source/etl/hail.rst
@@ -0,0 +1,43 @@
===================
Hail Interoperation
===================

.. invisible-code-block: python

import glow
import hail as hl
hl.init(spark.sparkContext, idempotent=True, quiet=True)
glow.register(spark)

vcf = 'test-data/NA12878_21_10002403.vcf'
mt = hl.import_vcf(vcf)

Glow includes functionality to enable conversion between a
`Hail MatrixTable <https://hail.is/docs/0.2/overview/matrix_table.html>`_ and a Spark DataFrame, similar to one created
with the :ref:`native Glow datasources <variant_data>`.

Create a Hail cluster
=====================

To use the Hail interoperation functions, you need Hail to be installed on the cluster.
On a Databricks cluster,
`install Hail with an environment variable <https://docs.databricks.com/applications/genomics/tertiary/hail.html#create-a-hail-cluster>`_.
See the `Hail installation documentation <https://hail.is/docs/0.2/getting_started.html>`_ to install Hail in other setups.
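
Once Hail is installed, initializing it against the running Spark session looks like the following sketch (this mirrors the doctest preamble used for this page; ``spark`` is the active Spark session):

.. code-block:: python

    import glow
    import hail as hl

    # Attach Hail to the existing Spark context; idempotent=True makes the call
    # safe to repeat if Hail has already been initialized on the cluster
    hl.init(spark.sparkContext, idempotent=True, quiet=True)
    glow.register(spark)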

Convert to a Glow DataFrame
===========================

Convert from a Hail MatrixTable to a Glow-compatible DataFrame with the function ``from_matrix_table``.

.. code-block:: python

from glow.hail import functions
df = functions.from_matrix_table(mt, include_sample_ids=True)

.. invisible-code-block: python

from pyspark.sql import Row
native_glow_df = spark.read.format('vcf').load(vcf).drop('splitFromMultiAllelic')
assert_rows_equal(df.head(), native_glow_df.head())

By default, the genotypes contain sample IDs. To remove the sample IDs, set the parameter ``include_sample_ids=False``.
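
As a minimal sketch (``mt`` is the MatrixTable imported above; one possible motivation for dropping sample IDs, assumed here rather than stated in this page, is keeping identifiers out of downstream data):

.. code-block:: python

    from glow.hail import functions

    # Convert without sample IDs in the genotypes array
    df_no_ids = functions.from_matrix_table(mt, include_sample_ids=False)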
1 change: 1 addition & 0 deletions docs/source/etl/index.rst
@@ -20,4 +20,5 @@ enabling seamless manipulation, filtering, quality control and transformation be
variant-normalization
variant-splitter
merge
+hail
utility-functions
2 changes: 1 addition & 1 deletion docs/source/etl/lift-over.rst
@@ -43,7 +43,7 @@ you can use to download the required file for liftOver from the b37 to the hg38
Coordinate liftOver
====================

-To perform liftOver for genomic coordinates, use the function ``lift_over_coordinates``. ``lift_over_coordinates``, which has
+To perform liftOver for genomic coordinates, use the function ``lift_over_coordinates``. ``lift_over_coordinates`` has
the following parameters.

- chromosome: ``string``
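
As a hedged sketch of calling this function through Spark SQL once ``glow.register(spark)`` has run (the rest of the parameter list is truncated in this excerpt; the input DataFrame, its column names, and the chain file path below are assumptions):

.. code-block:: python

    # input_df is assumed to have contigName, start and end columns; the chain
    # file path is an assumed local copy from the download step above
    lifted_df = input_df.selectExpr(
        "*",
        "lift_over_coordinates(contigName, start, end, '/tmp/b37ToHg38.over.chain') AS lifted")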
10 changes: 5 additions & 5 deletions python/glow/hail/functions.py
@@ -101,7 +101,7 @@ def _get_base_cols(row: StructExpression) -> List[Column]:
if 'rsid' in row and row.rsid.dtype == tstr:
names_elems.append("rsid")
names_col = fx.expr(
f"filter(nullif(array({','.join(names_elems)}), array()), n -> isnotnull(n))").alias("names")
f"nullif(filter(array({','.join(names_elems)}), n -> isnotnull(n)), array())").alias("names")

reference_allele_col = fx.element_at("alleles", 1).alias("referenceAllele")

@@ -123,11 +123,9 @@ def _get_other_cols(row: StructExpression) -> List[Column]:
if 'qual' in row and row.qual.dtype == tfloat64:
# -10 qual means missing
other_cols.append(fx.expr("if(qual = -10, null, qual)").alias("qual"))
-# null filters means missing, [] filters means PASS
+# [] filters means PASS, null filters means missing
if 'filters' in row and row.filters.dtype == tset(tstr):
-other_cols.append(
-  fx.expr("if(size(filters) = 0, array('PASS'), if(isnull(filters), array(), filters))").
-  alias("filters"))
+other_cols.append(fx.expr("if(size(filters) = 0, array('PASS'), filters)").alias("filters"))
# Rename info.* columns to INFO_*
if 'info' in row and isinstance(row.info.dtype, tstruct):
for f in row.info:
@@ -156,6 +154,8 @@ def from_matrix_table(mt: MatrixTable, include_sample_ids: bool = True) -> DataFrame:
"""
Converts a Hail MatrixTable to a Glow DataFrame.
Review comment (Contributor): Could you describe what the schema will look like? How is it translated from the Hail schema?


Requires that the MatrixTable rows contain locus and alleles fields.

Args:
mt : The Hail MatrixTable to convert
include_sample_ids : If true, include sample IDs in the Glow DataFrame
Review comment (Contributor): nit: could you document why you might want this to be false?
