Add documentation for Hail interoperation #310

Merged · 8 commits · Nov 25, 2020
18 changes: 14 additions & 4 deletions build.sbt
@@ -19,6 +19,12 @@ ThisBuild / sparkVersion := sys.env.getOrElse("SPARK_VERSION", spark3)
lazy val hailVersion = settingKey[String]("hailVersion")
ThisBuild / hailVersion := sys.env.getOrElse("HAIL_VERSION", "0.2.58")

// Paths containing Hail tests
lazy val hailTestPaths = Seq("python/glow/hail/", "docs/source/etl/hail.rst")
lazy val ignoreHailTestPathsOption = hailTestPaths.map { p =>
s"--ignore $p"
}.mkString(" ")

def majorVersion(version: String): String = {
StringUtils.ordinalIndexOf(version, ".", 1) match {
case StringUtils.INDEX_NOT_FOUND => version
@@ -223,7 +229,11 @@ ThisBuild / installHail := {

lazy val uninstallHail = taskKey[Unit]("Uninstall Hail")
ThisBuild / uninstallHail := {
"conda env remove --name hail" ### "rm -rf hail" !
Seq(
"/bin/bash",
"-c",
"conda env remove --name hail;" + "rm -rf hail"
) !
}

lazy val sparkClasspath = taskKey[String]("sparkClasspath")
@@ -293,7 +303,7 @@ lazy val python =
functionGenerationSettings,
test in Test := {
yapf.toTask(" --diff").value
pytest.toTask(" --doctest-modules --ignore=python/glow/hail python").value
pytest.toTask(s" --doctest-modules $ignoreHailTestPathsOption python").value
},
generatedFunctionsOutput := baseDirectory.value / "glow" / "functions.py",
functionsTemplate := baseDirectory.value / "glow" / "functions.py.TEMPLATE",
@@ -305,7 +315,7 @@ lazy val hail = (project in file("python/glow/hail"))
.settings(
pythonSettings,
test in Test := {
hailtest.toTask(" --doctest-modules python/glow/hail/").value
hailtest.toTask(s" --doctest-modules ${hailTestPaths.mkString(" ")}").value
}
)
.dependsOn(core % "test->test", python)
@@ -314,7 +324,7 @@ lazy val docs = (project in file("docs"))
.settings(
pythonSettings,
test in Test := {
pytest.toTask(" docs").value
pytest.toTask(s" $ignoreHailTestPathsOption docs").value
}
)
.dependsOn(core % "test->test", python)
7 changes: 7 additions & 0 deletions docs/source/api-docs/hail-functions.rst
@@ -0,0 +1,7 @@
Hail Interoperation Functions
-----------------------------

Glow includes functionality to enable interoperation with `Hail <https://hail.is/>`_.

.. automodule:: glow.hail.functions
   :members:
1 change: 1 addition & 0 deletions docs/source/api-docs/index.rst
@@ -8,3 +8,4 @@ Glow's Python API is designed to work seamlessly with PySpark and other tools in
toplevel-functions
pyspark-functions
glowgr
hail-functions
118 changes: 118 additions & 0 deletions docs/source/etl/hail.rst
@@ -0,0 +1,118 @@
===================
Hail Interoperation
===================

.. invisible-code-block: python

   import glow
   import hail as hl
   hl.init(spark.sparkContext, idempotent=True, quiet=True)
   glow.register(spark)

   vcf = 'test-data/NA12878_21_10002403.vcf'
   mt = hl.import_vcf(vcf)

Glow includes functionality to enable conversion between a
`Hail MatrixTable <https://hail.is/docs/0.2/overview/matrix_table.html>`_ and a Spark DataFrame, similar to one created
with the :ref:`native Glow datasources <variant_data>`.

Create a Hail cluster
=====================

To use the Hail interoperation functions, Hail must be installed on the cluster.
On a Databricks cluster,
`install Hail with an environment variable <https://docs.databricks.com/applications/genomics/tertiary/hail.html#create-a-hail-cluster>`_.
See the `Hail installation documentation <https://hail.is/docs/0.2/getting_started.html>`_ to install Hail in other setups.
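
Once Hail is available on the cluster, attach it to the existing Spark session and register Glow before converting. This is a minimal sketch that mirrors the initialization used for the examples on this page:

.. code-block:: python

   import glow
   import hail as hl

   # Attach Hail to the cluster's existing Spark context; idempotent=True makes repeated initialization a no-op
   hl.init(spark.sparkContext, idempotent=True, quiet=True)
   glow.register(spark)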

Convert to a Glow DataFrame
===========================

Convert from a Hail MatrixTable to a Glow-compatible DataFrame with the function ``from_matrix_table``.

.. code-block:: python

   from glow.hail import functions
   df = functions.from_matrix_table(mt, include_sample_ids=True)

.. invisible-code-block: python

   from pyspark.sql import Row
   native_glow_df = spark.read.format('vcf').load(vcf).drop('splitFromMultiAllelic')
   assert_rows_equal(df.head(), native_glow_df.head())

By default, the genotypes contain sample IDs. To remove the sample IDs, set the parameter ``include_sample_ids=False``.
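
For example, to convert the same MatrixTable without sample IDs (a minimal sketch; ``df_without_ids`` is just an illustrative variable name):

.. code-block:: python

   from glow.hail import functions

   # Omitting sample IDs reduces the size of each row, both in memory and on storage
   df_without_ids = functions.from_matrix_table(mt, include_sample_ids=False)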

Schema mapping
==============

The Glow DataFrame variant fields are derived from the Hail MatrixTable row fields.

.. list-table::
   :header-rows: 1

   * - Required
     - Glow DataFrame variant field
     - Hail MatrixTable row field
   * - Yes
     - ``contigName``
     - ``locus.contig``
   * - Yes
     - ``start``
     - ``locus.position - 1``
   * - Yes
     - ``end``
     - ``info.END`` or ``locus.position - 1 + len(alleles[0])``
   * - Yes
     - ``referenceAllele``
     - ``alleles[0]``
   * - No
     - ``alternateAlleles``
     - ``alleles[1:]``
   * - No
     - ``names``
     - ``[rsid, varid]``
   * - No
     - ``qual``
     - ``qual``
   * - No
     - ``filters``
     - ``filters``
   * - No
     - ``INFO_<ANY_FIELD>``
     - ``info.<ANY_FIELD>``
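
As a quick check, the mapped row fields appear as top-level variant columns of the converted DataFrame (a sketch assuming the ``df`` created above):

.. code-block:: python

   # Required variant fields derived from the MatrixTable row fields
   df.select('contigName', 'start', 'end', 'referenceAllele', 'alternateAlleles').show(1, vertical=True)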

The sample IDs in the Glow DataFrame genotypes are derived from the Hail MatrixTable column fields.

All of the other Glow DataFrame genotype fields are derived from the Hail MatrixTable entry fields.

.. list-table::
   :header-rows: 1

   * - Glow DataFrame genotype field
     - Hail MatrixTable entry field
   * - ``phased``
     - ``GT.phased``
   * - ``calls``
     - ``GT.alleles``
   * - ``depth``
     - ``DP``
   * - ``filters``
     - ``FT``
   * - ``genotypeLikelihoods``
     - ``GL``
   * - ``phredLikelihoods``
     - ``PL``
   * - ``posteriorProbabilities``
     - ``GP``
   * - ``conditionalQuality``
     - ``GQ``
   * - ``haplotypeQualities``
     - ``HQ``
   * - ``expectedAlleleCounts``
     - ``EC``
   * - ``mappingQuality``
     - ``MQ``
   * - ``alleleDepths``
     - ``AD``
   * - ``<ANY_FIELD>``
     - ``<ANY_FIELD>``
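
To see exactly how the entry fields of a particular MatrixTable were mapped, inspect the nested ``genotypes`` column of the converted DataFrame (a sketch assuming the ``df`` created above):

.. code-block:: python

   # Each element of the genotypes array contains the mapped entry fields (plus sampleId if included)
   df.select('genotypes').printSchema()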
1 change: 1 addition & 0 deletions docs/source/etl/index.rst
@@ -20,4 +20,5 @@ enabling seamless manipulation, filtering, quality control and transformation be
variant-normalization
variant-splitter
merge
hail
utility-functions
2 changes: 1 addition & 1 deletion docs/source/etl/lift-over.rst
@@ -43,7 +43,7 @@ you can use to download the required file for liftOver from the b37 to the hg38
Coordinate liftOver
====================

To perform liftOver for genomic coordinates, use the function ``lift_over_coordinates``. ``lift_over_coordinates``, which has
To perform liftOver for genomic coordinates, use the function ``lift_over_coordinates``. ``lift_over_coordinates`` has
the following parameters.

- chromosome: ``string``
2 changes: 1 addition & 1 deletion docs/source/etl/variant-data.rst
@@ -56,7 +56,7 @@ You can control the behavior of the VCF reader with a few parameters. All parame
+--------------------------+---------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Parameter | Type | Default | Description |
+==========================+=========+=============+=========================================================================================================================================================+
| ``includeSampleIds`` | boolean | ``true`` | If true, each genotype includes the name of the sample ID it belongs to. Sample names increases the size of each row, both in memory and on storage. |
| ``includeSampleIds`` | boolean | ``true`` | If true, each genotype includes the name of the sample ID it belongs to. Sample names increase the size of each row, both in memory and on storage. |
+--------------------------+---------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| ``flattenInfoFields`` | boolean | ``true`` | If true, each info field in the input VCF will be converted into a column in the output DataFrame with each column typed as specified in the VCF header.|
| | | | If false, all info fields will be contained in a single column with a string -> string map of info keys to values. |
17 changes: 10 additions & 7 deletions python/glow/hail/functions.py
Expand Up @@ -101,7 +101,7 @@ def _get_base_cols(row: StructExpression) -> List[Column]:
if 'rsid' in row and row.rsid.dtype == tstr:
names_elems.append("rsid")
names_col = fx.expr(
f"filter(nullif(array({','.join(names_elems)}), array()), n -> isnotnull(n))").alias("names")
f"nullif(filter(array({','.join(names_elems)}), n -> isnotnull(n)), array())").alias("names")

reference_allele_col = fx.element_at("alleles", 1).alias("referenceAllele")

@@ -123,11 +123,9 @@ def _get_other_cols(row: StructExpression) -> List[Column]:
if 'qual' in row and row.qual.dtype == tfloat64:
# -10 qual means missing
other_cols.append(fx.expr("if(qual = -10, null, qual)").alias("qual"))
# null filters means missing, [] filters means PASS
# [] filters means PASS, null filters means missing
if 'filters' in row and row.filters.dtype == tset(tstr):
other_cols.append(
fx.expr("if(size(filters) = 0, array('PASS'), if(isnull(filters), array(), filters))").
alias("filters"))
other_cols.append(fx.expr("if(size(filters) = 0, array('PASS'), filters)").alias("filters"))
# Rename info.* columns to INFO_*
if 'info' in row and isinstance(row.info.dtype, tstruct):
for f in row.info:
@@ -154,11 +152,16 @@ def _require_row_variant_w_struct_locus(mt: MatrixTable) -> NoReturn:

def from_matrix_table(mt: MatrixTable, include_sample_ids: bool = True) -> DataFrame:
"""
Converts a Hail MatrixTable to a Glow DataFrame.
Converts a Hail MatrixTable to a Glow DataFrame. The variant fields are derived from the Hail MatrixTable
row fields. The sample IDs are derived from the Hail MatrixTable column fields. All other genotype fields are
derived from the Hail MatrixTable entry fields.

Requires that the MatrixTable rows contain locus and alleles fields.

Args:
mt : The Hail MatrixTable to convert
include_sample_ids : If true, include sample IDs in the Glow DataFrame
include_sample_ids : If true (default), include sample IDs in the Glow DataFrame.
Sample names increase the size of each row, both in memory and on storage.

Returns:
Glow DataFrame converted from the MatrixTable.