Basic raster dataset import #795

Merged · 4 commits · Feb 27, 2023

2 changes: 2 additions & 0 deletions .gitignore
@@ -57,10 +57,12 @@ vendor/Brewfile.lock.json
 !/tests/data/conflicts
 !/tests/data/patches
 !/tests/data/point-cloud/
+!/tests/data/raster/
 !/tests/data/shapefiles/
 !/tests/data/upgrade
 /tests/data/conflicts/*/
 /tests/data/point-cloud/*/
+/tests/data/raster/*/
 /tests/data/shapefiles/*/
 /tests/data/upgrade/v0/*/
 /tests/data/upgrade/v1/*/
11 changes: 8 additions & 3 deletions kart/geometry.py
@@ -1,4 +1,5 @@
 import binascii
+import itertools
 import json
 import math
 import re
@@ -704,8 +705,13 @@ def geom_envelope(gpkg_geom, only_2d=False, calculate_if_missing=False):
     return envelope
 
 
-def ring_as_wkt(*points):
-    return "(" + ",".join(f"{x} {y}" for x, y in points) + ")"
+def ring_as_wkt(*points, repeat_first_point=True):
+    if repeat_first_point:
+        points_iter = itertools.chain(points, [points[0]])
+    else:
+        points_iter = points
+
+    return "(" + ",".join(f"{x} {y}" for x, y in points_iter) + ")"


@@ -716,7 +722,6 @@ def bbox_as_wkt_polygon(min_x, max_x, min_y, max_y):
             (max_x, min_y),
             (max_x, max_y),
             (min_x, max_y),
-            (min_x, min_y),
         )
         + ")"
     )
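
Since repeat_first_point defaults to True, ring_as_wkt now closes the ring itself, which is why the duplicated (min_x, min_y) vertex is deleted from bbox_as_wkt_polygon above. A minimal usage sketch of the new behaviour (assuming kart.geometry is importable):

from kart.geometry import ring_as_wkt, bbox_as_wkt_polygon

# The closing vertex is appended automatically:
assert ring_as_wkt((0, 0), (1, 0), (1, 1)) == "(0 0,1 0,1 1,0 0)"

# So bbox_as_wkt_polygon no longer repeats (min_x, min_y) itself:
assert bbox_as_wkt_polygon(0, 2, 0, 1) == "POLYGON((0 0,2 0,2 1,0 1,0 0))"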
12 changes: 12 additions & 0 deletions kart/import_sources.py
@@ -16,6 +16,7 @@ class ImportType(Enum):
     SQLALCHEMY_TABLE = auto()
     OGR_TABLE = auto()
     POINT_CLOUD = auto()
+    RASTER = auto()
 
     @property
     def import_cmd(self):
@@ -27,6 +28,10 @@ def import_cmd(self):
             from kart.point_cloud.import_ import point_cloud_import
 
             return point_cloud_import
+        elif self is self.RASTER:
+            from kart.raster.import_ import raster_import
+
+            return raster_import
 
     @property
     def import_source_class(self):
@@ -124,6 +129,13 @@ def import_source_class(self):
         ImportType.POINT_CLOUD,
         file_ext=(".las", ".laz"),
     ),
+    # Raster imports:
+    ImportSourceType(
+        "GeoTIFF",
+        "PATH.tif or PATH.tiff",
[Review thread on the "PATH.tif or PATH.tiff" line]

Member: this is a human-readable filename suggestion? normally everyone uses .tif, i'd just drop the .tiff one

Collaborator (Author): Not done - it's not hurting anyone and we do accept .tif or .tiff files

+        ImportType.RASTER,
+        file_ext=(".tif", ".tiff"),
+    ),
 ]

URI_SCHEME_TO_IMPORT_SOURCE_TYPE = {
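
The new GeoTIFF entry is what lets the generic import machinery recognise .tif and .tiff paths and dispatch them to raster_import via ImportType.RASTER. A hypothetical sketch of that extension-based lookup (guess_import_source_type is an illustrative helper, not this module's actual API):

from pathlib import Path

def guess_import_source_type(path, all_import_source_types):
    # Match a source path against each registered type's file extensions.
    ext = Path(path).suffix.lower()
    for source_type in all_import_source_types:
        if getattr(source_type, "file_ext", None) and ext in source_type.file_ext:
            return source_type
    return None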
10 changes: 1 addition & 9 deletions kart/point_cloud/import_.py
@@ -83,12 +83,6 @@
         "such a commit. This option bypasses the safety"
     ),
 )
-@click.option(
-    "--num-processes",
-    help="Parallel import using multiple processes. Not yet supported",
-    default=None,
-    hidden=True,
-)
 @click.option("--dataset-path", "--dataset", help="The dataset's path once imported")
 @click.argument(
     "args",
@@ -107,12 +101,10 @@ def point_cloud_import(
     delete,
     amend,
     allow_empty,
-    num_processes,
     args,
 ):
     """
-    Experimental command for importing point cloud datasets. Work-in-progress.
-    Will eventually be merged with the main `import` command.
+    Import a dataset of point-cloud tiles.
 
     SOURCES should be one or more LAZ or LAS files (or wildcards that match multiple LAZ or LAS files).
     """
165 changes: 165 additions & 0 deletions kart/raster/import_.py
@@ -0,0 +1,165 @@
import logging

import click

from kart.cli_util import StringFromFile, MutexOption, KartCommand
from kart.completion_shared import file_path_completer
from kart.parse_args import parse_import_sources_and_datasets
from kart.raster.metadata_util import rewrite_and_merge_metadata
from kart.raster.v1 import RasterV1
from kart.tile.importer import TileImporter

L = logging.getLogger(__name__)


@click.command("raster-import", hidden=True, cls=KartCommand)
@click.option(
    "--convert-to-cog/--no-convert-to-cog",
    " /--preserve-format",
    is_flag=True,
    default=False,
    help="Whether to convert all GeoTIFFs to COGs (Cloud Optimized GeoTIFFs), or to import all files in their native format.",
)
@click.pass_context
@click.option(
    "--message",
    "-m",
    type=StringFromFile(encoding="utf-8"),
    help="Commit message. By default this is auto-generated.",
)
@click.option(
    "--checkout/--no-checkout",
    "do_checkout",
    is_flag=True,
    default=True,
    help="Whether to create a working copy once the import is finished, if no working copy exists yet.",
)
@click.option(
    "--replace-existing",
    is_flag=True,
    cls=MutexOption,
    exclusive_with=["--delete", "--update-existing"],
    help="Replace existing dataset at the same path.",
)
@click.option(
    "--update-existing",
    is_flag=True,
    cls=MutexOption,
    exclusive_with=["--replace-existing"],
    help=(
        "Update existing dataset at the same path. "
        "Tiles will be replaced by source tiles with the same name. "
        "Tiles in the existing dataset which are not present in SOURCES will remain untouched."
    ),
)
@click.option(
    "--delete",
    type=StringFromFile(encoding="utf-8"),
    cls=MutexOption,
    exclusive_with=["--replace-existing"],
    multiple=True,
    help=("Deletes the given tile. Can be used multiple times."),
)
@click.option(
    "--amend",
    default=False,
    is_flag=True,
    help="Amend the previous commit instead of adding a new commit",
)
@click.option(
    "--allow-empty",
    is_flag=True,
    default=False,
    help=(
        "Usually recording a commit that has the exact same tree as its sole "
        "parent commit is a mistake, and the command prevents you from making "
        "such a commit. This option bypasses the safety"
    ),
)
@click.option("--dataset-path", "--dataset", help="The dataset's path once imported")
@click.argument(
    "args",
    nargs=-1,
    metavar="SOURCE [SOURCES...]",
    shell_complete=file_path_completer,
)
def raster_import(
    ctx,
    convert_to_cog,
    dataset_path,
    message,
    do_checkout,
    replace_existing,
    update_existing,
    delete,
    amend,
    allow_empty,
    args,
):
    """
    Experimental command for importing a dataset of raster tiles.

    SOURCES should be one or more GeoTIFF files (or wildcards that match multiple GeoTIFF files).
    """
    repo = ctx.obj.repo

    if convert_to_cog:
        raise NotImplementedError("Sorry, --convert-to-cog is not yet implemented")

    sources, datasets = parse_import_sources_and_datasets(args)
    if datasets:
        problem = " \n".join(datasets)
        raise click.UsageError(
            f"For raster import, every argument should be a GeoTIFF file:\n {problem}"
        )

    RasterImporter(repo, ctx, convert_to_cog).import_tiles(
        dataset_path=dataset_path,
        message=message,
        do_checkout=do_checkout,
        replace_existing=replace_existing,
        update_existing=update_existing,
        delete=delete,
        amend=amend,
        allow_empty=allow_empty,
        sources=sources,
    )
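
A usage sketch for the command declared above, via Click's test runner (the dataset path and tile filenames are illustrative, and a real invocation also needs a Kart repo attached as ctx.obj):

from click.testing import CliRunner

# Equivalent CLI: kart raster-import --dataset-path=aerial north.tif south.tif
runner = CliRunner()
result = runner.invoke(
    raster_import,
    ["--dataset-path=aerial", "--message=Initial import", "north.tif", "south.tif"],
)
print(result.output)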


class RasterImporter(TileImporter):

    DATASET_CLASS = RasterV1

    def __init__(self, repo, ctx, convert_to_cog):
        super().__init__(repo, ctx)
        self.convert_to_cog = convert_to_cog

    def get_default_message(self):
        return f"Importing {len(self.sources)} GeoTIFF tiles as {self.dataset_path}"

    def check_metadata_pre_convert(self):
        pass

    def check_metadata_post_convert(self):
        pass

    # These are all pretty simple since we don't do any conversions yet:

    def get_merged_source_metadata(self, all_metadata):
        return rewrite_and_merge_metadata(all_metadata)

    def get_predicted_merged_metadata(self, all_metadata):
        return rewrite_and_merge_metadata(all_metadata)

    def get_actual_merged_metadata(self, all_metadata):
        return rewrite_and_merge_metadata(all_metadata)

    def get_conversion_func(self, source_metadata):
        return None

    def existing_tile_matches_source(self, source_oid, existing_summary):
        """Check if the existing tile can be reused instead of reimporting."""
        if not source_oid.startswith("sha256:"):
            source_oid = "sha256:" + source_oid

        return existing_summary.get("oid") == source_oid
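
For illustration, existing_tile_matches_source accepts the source digest with or without its sha256: prefix; since the method doesn't touch self, it can be demonstrated unbound (the digests are made-up placeholders):

summary = {"oid": "sha256:abc123"}
assert RasterImporter.existing_tile_matches_source(None, "abc123", summary)
assert RasterImporter.existing_tile_matches_source(None, "sha256:abc123", summary)
assert not RasterImporter.existing_tile_matches_source(None, "def456", summary)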
129 changes: 129 additions & 0 deletions kart/raster/metadata_util.py
@@ -0,0 +1,129 @@
from kart.crs_util import normalise_wkt
from kart.geometry import ring_as_wkt
from kart.list_of_conflicts import ListOfConflicts
from kart.schema import Schema, ColumnSchema


def rewrite_and_merge_metadata(tile_metadata_list):
    """
    Given a list of tile metadata, merges the parts we expect to be homogenous into a single piece of tile metadata in
    the same format that describes the whole list.
    """
    # TODO - this will get more complicated as we add support for convert-to-COG.
    result = {}
    for tile_metadata in tile_metadata_list:
        _merge_metadata_field(result, "format", tile_metadata["format"])
        _merge_metadata_field(result, "schema", tile_metadata["schema"])
        _merge_metadata_field(result, "crs", tile_metadata["crs"])
    # Don't copy anything from "tile" to the result - these fields are tile specific and needn't be merged.
    return result


def _merge_metadata_field(output, key, value):
    if key not in output:
        output[key] = value
        return
    existing_value = output[key]
    if isinstance(existing_value, ListOfConflicts):
        if value not in existing_value:
            existing_value.append(value)
    elif existing_value != value:
        output[key] = ListOfConflicts([existing_value, value])
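
A small demonstration of the merge behaviour (the CRS strings are illustrative):

output = {}
_merge_metadata_field(output, "crs", "EPSG:2193")
_merge_metadata_field(output, "crs", "EPSG:2193")
assert output["crs"] == "EPSG:2193"  # homogenous values collapse to a single value

_merge_metadata_field(output, "crs", "EPSG:4326")
# Differing values are collected rather than overwritten:
# output["crs"] is now ListOfConflicts(["EPSG:2193", "EPSG:4326"])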


def extract_raster_tile_metadata(
    raster_tile_path,
    *,
    extract_schema=True,
):
    """
    Use gdalinfo to get any and all raster metadata we can make use of in Kart.
    This includes metadata that must be dataset-homogenous and would be stored in the dataset's /meta/ folder,
    along with other metadata that is tile-specific and would be stored in the tile's pointer file.

    Output:
    {
        "format": - Information about file format, as stored at meta/format.json (or some subset thereof).
        "tile": - Tile-specific (non-homogenous) information, as stored in individual tile pointer files.
        "schema": - Band schema, as stored in meta/schema.json
        "crs": - CRS as stored at meta/crs.wkt
    }

    Although any two raster tiles can differ in any way imaginable, we specifically constrain tiles in the
    same dataset to be homogenous enough that the meta items format.json, schema.json and crs.wkt
    describe *all* of the tiles in that dataset. The "tile" field is where we keep all information
    that can be different for every tile in the dataset, which is why it must be stored in pointer files.
    """
    from osgeo import gdal

    metadata = gdal.Info(raster_tile_path, options="-json")

    # NOTE: this format is still in early stages of design, and is subject to change.

    crs = metadata["coordinateSystem"]["wkt"]
    format_info = {"fileType": "image/tiff; application=geotiff"}

    cc = metadata["cornerCoordinates"]
    size_in_pixels = metadata["size"]
    tile_info = {
        "format": "geotiff",
        "crs84Extent": format_polygon(*metadata["wgs84Extent"]["coordinates"][0]),
        "nativeExtent": format_polygon(
            cc["upperLeft"], cc["lowerLeft"], cc["lowerRight"], cc["upperRight"]
        ),
        "dimensions": f"{size_in_pixels[0]}x{size_in_pixels[1]}",
    }

    result = {
        "format": format_info,
        "tile": tile_info,
        "crs": normalise_wkt(crs),
    }
    if extract_schema:
        result["schema"] = gdalinfo_bands_to_kart_schema(metadata["bands"])

    return result
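
For a hypothetical single-band GeoTIFF, the result might be shaped like this (all values are illustrative and abridged, not real gdalinfo output):

example_result = {
    "format": {"fileType": "image/tiff; application=geotiff"},
    "tile": {
        "format": "geotiff",
        "crs84Extent": "POLYGON((175.1 -36.8,...))",  # abridged
        "nativeExtent": "POLYGON((1746000 5927000,...))",  # abridged
        "dimensions": "512x512",
    },
    "crs": "PROJCS[...]",  # normalised WKT, abridged
    "schema": "...",  # a Schema with one ColumnSchema per band - see below
}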


def format_polygon(*points):
    # TODO - should we just store the axis-aligned extent?
    return "POLYGON(" + ring_as_wkt(*points) + ")"


def gdalinfo_bands_to_kart_schema(gdalinfo_bands):
    return Schema([gdalinfo_band_to_kart_columnschema(b) for b in gdalinfo_bands])


GDAL_TYPE_TO_KART_TYPE = {
    "Byte": {"dataType": "integer", "size": 8, "unsigned": True},
    "Int8": {"dataType": "integer", "size": 8},
    "Int16": {"dataType": "integer", "size": 16},
    "Int32": {"dataType": "integer", "size": 32},
    "Int64": {"dataType": "integer", "size": 64},
    "Float32": {"dataType": "float", "size": 32},
    "Float64": {"dataType": "float", "size": 64},
}


def gdalinfo_band_to_kart_columnschema(gdalinfo_band):
    # TODO - handle color tables and category tables.
    result = {}

    gdal_type = gdalinfo_band["type"]
    if gdal_type.startswith("UInt"):
        gdal_type = gdal_type[1:]
        result["unsigned"] = True
    elif gdal_type.startswith("CInt") or gdal_type.startswith("CFloat"):
        gdal_type = gdal_type[1:]
        result["complex"] = True

    kart_type_info = GDAL_TYPE_TO_KART_TYPE.get(gdal_type)
    if kart_type_info is None:
        raise RuntimeError(f"Unrecognized GDAL type: {gdal_type}")

    result.update(kart_type_info)

    if "colorInterpretation" in gdalinfo_band:
        result["interpretation"] = gdalinfo_band["colorInterpretation"].lower()

    return ColumnSchema(result)
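
For example, an unsigned 16-bit grey band from gdalinfo (the input dict is illustrative) has its leading U stripped so that Int16 is looked up, with unsignedness recorded separately:

band = {"type": "UInt16", "colorInterpretation": "Gray"}
column = gdalinfo_band_to_kart_columnschema(band)
# -> {"unsigned": True, "dataType": "integer", "size": 16, "interpretation": "gray"}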