Skip to content

Commit aa06d3f

Browse files
committed
feat: more refactoring
1 parent 7bc1bc3 commit aa06d3f

14 files changed

+317
-239
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,5 +241,5 @@ cython_debug/
241241
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
242242
#.idea/
243243

244-
output/
244+
output*/
245245

.idea/parquet-nuts.iml

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coverages/__init__.py

Whitespace-only changes.

coverages/nuts.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
from pathlib import Path
2+
3+
import geopandas as gpd
4+
from pandas import DataFrame, Series
5+
from shapely import box
6+
7+
from datasets.worldcover import (
8+
create_directories,
9+
get_tile_keys,
10+
process,
11+
land_cover_names
12+
)
13+
from inject_metadata import inject_metadata
14+
15+
# FlatGeobuf metadata contract for the NUTS output layers: maps the generic
# metadata roles (identifier/name/level/children) onto NUTS column names and
# exposes the land-cover class columns as attributes.
nuts_worldcover_metadata = dict(
    identifierKey="NUTS_ID",
    nameKey="NUTS_NAME",
    levelKey="LEVL_CODE",
    childrenKey="children",
    attributeKeys=land_cover_names,
)
22+
23+
24+
def nuts_level_func(stats_df: DataFrame, level: int) -> Series:
    """Return a boolean mask selecting rows whose NUTS LEVL_CODE equals *level*."""
    mask = stats_df["LEVL_CODE"].eq(level)
    return Series(mask)
26+
27+
28+
def foo(row, df):
    """Return the comma-separated 'children' value of *row* as a list.

    An empty/falsy 'children' value yields an empty list.
    NOTE(review): *df* is unused and this function appears unreferenced in
    this module — confirm before removing it.
    """
    children = row["children"]
    return children.split(",") if children else []
33+
34+
35+
def nuts_children(df: DataFrame, nuts_id: str) -> Series:
    """Return a boolean mask marking the rows of *df* that are direct children of *nuts_id*.

    Looks up the comma-separated 'children' string of the row whose NUTS_ID
    equals *nuts_id*; raises IndexError if *nuts_id* is not present.
    """
    parent_children = df.loc[df["NUTS_ID"] == nuts_id, "children"].iloc[0]
    child_ids = parent_children.split(",")
    return Series(df["NUTS_ID"].isin(child_ids))
39+
40+
41+
def nuts_intersections(ds_bbox: box, stats_df: DataFrame, level: int):
    """Return leaf regions (empty 'children') whose geometry intersects the raster bbox.

    *level* is unused here but kept to satisfy the IntersectionFunc signature.
    """
    is_leaf = stats_df["children"] == ""
    overlaps_bbox = stats_df.geometry.intersects(ds_bbox)
    return stats_df[overlaps_bbox & is_leaf]
45+
46+
47+
def main():
    """Compute WorldCover land-cover statistics for NUTS regions.

    Reads the NUTS geometries, processes the intersecting WorldCover tiles
    down to level 4, writes per-level FlatGeobuf outputs, and injects the
    NUTS metadata contract into each output file.
    """
    create_directories()

    nuts_df = gpd.read_file("input/NUTS_with_children.fgb", engine="pyogrio")
    tile_keys = get_tile_keys(nuts_df, 0, nuts_level_func)

    output_files = process(
        tile_keys,
        nuts_df,
        bottom_level=4,
        level_fn=nuts_level_func,
        child_fn=nuts_children,
        intersection_fn=nuts_intersections,
        code_column_name="NUTS_ID",
        output_path=Path("output/worldcover-stats-nuts.fgb"),
    )

    for output_file in output_files:
        inject_metadata(output_file, nuts_worldcover_metadata)


if __name__ == "__main__":
    main()

coverages/watersheds.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
from pathlib import Path
2+
3+
import geopandas as gpd
4+
from pandas import DataFrame, Series
5+
from shapely import box
6+
7+
from datasets.worldcover import (
8+
create_directories,
9+
get_tile_keys,
10+
process,
11+
land_cover_names,
12+
)
13+
from inject_metadata import inject_metadata
14+
15+
# FlatGeobuf metadata contract for the HydroBASINS output layers. HYBAS_ID
# doubles as both identifier and display name because basins have no names.
watersheds_worldcover_metadata = dict(
    identifierKey="HYBAS_ID",
    nameKey="HYBAS_ID",
    levelKey="level",
    childrenKey="children",
    attributeKeys=land_cover_names,
)
22+
23+
24+
def hydrosheds_level_func(stats_df: DataFrame, level: int) -> Series:
    """Return a mask for basins at Pfafstetter *level*.

    A level-n basin has an n+1 digit PFAF_ID, so the test is on digit count.
    """
    digit_counts = stats_df["PFAF_ID"].astype(str).str.len()
    return Series(digit_counts == level + 1)
26+
27+
28+
def hydrosheds_intersections(ds_bbox: box, stats_df: DataFrame, level: int) -> DataFrame:
    """Return basins at the bottom level (level - 1) whose geometry intersects the raster bbox."""
    at_bottom_level = hydrosheds_level_func(stats_df, level - 1)
    overlaps_bbox = stats_df.geometry.intersects(ds_bbox)
    return stats_df[overlaps_bbox & at_bottom_level]
30+
31+
32+
def hydrosheds_children(stats_df: DataFrame, pfaf_id: int) -> Series:
    """Return a boolean Series marking the direct children of *pfaf_id*.

    A child appends one digit to its parent's PFAF_ID, so children of p are
    exactly the ids in the inclusive range [p*10, p*10 + 9].
    """
    lower_bound = pfaf_id * 10
    return Series(stats_df["PFAF_ID"].between(lower_bound, lower_bound + 9))
40+
41+
42+
def main():
    """Compute WorldCover land-cover statistics for HydroBASINS watersheds.

    Reads the basin geometries, processes the intersecting WorldCover tiles
    down to level 12, writes per-level FlatGeobuf outputs, and injects the
    watershed metadata contract into each output file.
    """
    create_directories()

    basins_df = gpd.read_file("input/hybas_eu_lev01-12_v1c.fgb", engine="pyogrio")
    tile_keys = get_tile_keys(basins_df, 2, hydrosheds_level_func)

    output_files = process(
        tile_keys,
        basins_df,
        bottom_level=12,
        level_fn=hydrosheds_level_func,
        child_fn=hydrosheds_children,
        intersection_fn=hydrosheds_intersections,
        code_column_name="PFAF_ID",
        output_path=Path("output/worldcover-stats-watersheds.fgb"),
    )

    for output_file in output_files:
        inject_metadata(output_file, watersheds_worldcover_metadata)


if __name__ == "__main__":
    main()

datasets/__init__.py

Whitespace-only changes.

worldcover.py renamed to datasets/worldcover.py

Lines changed: 65 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
from pathlib import Path
44

55
import geopandas as gpd
6-
import humanize
76
import psutil
87
from pandas import DataFrame, Series
98
from pyproj import Geod
@@ -12,6 +11,12 @@
1211
from rasterio import features, DatasetReader
1312
import numpy as np
1413
from shapely import box
14+
import boto3
15+
import humanize
16+
from botocore import UNSIGNED
17+
from botocore.config import Config
18+
import rasterio
19+
1520

1621
# Land cover classification mapping
1722
LAND_COVER_CLASSES = {
@@ -39,6 +44,7 @@
3944

4045
type LevelFunc = Callable[[DataFrame, int], Series]
4146
type ChildFunc = Callable[[DataFrame, any], Series]
47+
type IntersectionFunc = Callable[[box, DataFrame, int], DataFrame]
4248

4349

4450
def get_process_memory_use():
@@ -47,13 +53,18 @@ def get_process_memory_use():
4753
return mem_info.rss
4854

4955

50-
def output_by_level(max_level: int, stats_df: DataFrame, level_fn: LevelFunc, file_path: Path):
56+
def output_by_level(max_level: int, stats_df: DataFrame, level_fn: LevelFunc, file_path: Path) -> list[Path]:
    """Write one file per level (0 .. max_level-1) and return the paths written.

    Each level's rows are selected with *level_fn* and saved to a sibling of
    *file_path* whose suffix is prefixed with ".levelNN" (e.g. x.level03.fgb).

    :param max_level: number of levels to export (exclusive upper bound).
    :param stats_df: full statistics frame covering all levels.
    :param level_fn: predicate returning a boolean mask for a given level.
    :param file_path: template output path; per-level names derive from it.
    :return: the per-level file paths, in level order.
    """
    output_file_names = []
    suffix = file_path.suffix  # loop-invariant; hoisted out of the loop

    for i in trange(max_level, desc=f"Saving to {str(file_path.parent)}"):
        level = stats_df[level_fn(stats_df, i)]
        file_name = file_path.with_suffix(f".level{i:02}{suffix}")
        output_file_names.append(file_name)
        level.to_file(file_name)

    return output_file_names
67+
5768

5869
def calculate_total_area(statistics: DataFrame):
5970
areas = []
@@ -84,13 +95,12 @@ def calculate_values(source_raster: DatasetReader, intersections: DataFrame) ->
8495
max_memory_usage = max(max_memory_usage, get_process_memory_use())
8596
current_memory_usage = get_process_memory_use()
8697
geometry_progress_bar.set_postfix_str(f"{humanize.naturalsize(current_memory_usage)}")
87-
geometry_progress_bar.set_description_str(f"{src_file_name.name} <- {region["HYBAS_ID"]}")
98+
geometry_progress_bar.set_description_str(f"{src_file_name.name}")
8899
geom = region.geometry
89100

90101
try:
91102
window = features.geometry_window(source_raster, [geom])
92103
except WindowError as e:
93-
print(f"Error: {e} - skipping raster")
94104
continue
95105

96106
window_xform = source_raster.window_transform(window)
@@ -138,7 +148,7 @@ def create_directories() -> tuple[Path, Path]:
138148

139149
def get_tile_keys(geom_df: DataFrame, level: int, level_fn: LevelFunc) -> list[str]:
140150
countries = geom_df[level_fn(geom_df, level)]
141-
tile_index_df = gpd.read_file("data/esa_worldcover_grid.fgb", engine="pyogrio")
151+
tile_index_df = gpd.read_file("input/esa_worldcover_grid.fgb", engine="pyogrio")
142152
intersected_tiles = gpd.sjoin(tile_index_df, countries, how="inner")
143153
unique_tiles = intersected_tiles.drop_duplicates(subset="ll_tile").copy()
144154

@@ -166,3 +176,53 @@ def sum_children(stats_df: DataFrame, bottom_level: int, level_fn: LevelFunc, ch
166176
continue
167177

168178
stats_df.loc[index, land_cover_names] = children[land_cover_names].sum()
179+
180+
181+
def process(tif_file_names: list[str],
            stats_df: DataFrame,
            bottom_level: int,
            level_fn: LevelFunc,
            child_fn: ChildFunc,
            intersection_fn: IntersectionFunc,
            code_column_name: str,
            output_path: Path
            ) -> list[Path]:
    """Accumulate per-region land-cover statistics over a set of WorldCover tiles.

    Downloads each tile from S3 (cached under ./.cache), computes statistics
    for the bottom-level regions it intersects, rolls the results up through
    the hierarchy, and writes one output file per level.

    :param tif_file_names: S3 object keys of the WorldCover GeoTIFF tiles.
    :param stats_df: region geometries; statistic columns are added in place.
    :param bottom_level: deepest hierarchy level to process.
    :param level_fn: predicate selecting rows of a given level.
    :param child_fn: predicate selecting the children of a given region.
    :param intersection_fn: selects regions intersecting a tile's bbox.
    :param code_column_name: column holding the region identifier.
    :param output_path: template path for the per-level output files.
    :return: the per-level output file paths.
    """
    # Initialise the accumulator columns for every land-cover class.
    for name in land_cover_names:
        stats_df[name] = 0.0

    stats_df["Unknown"] = 0.0
    stats_df["total_area"] = 0.0

    # Unsigned config: the WorldCover bucket allows anonymous access.
    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))

    print("Processing")
    tiff_progress_bar = trange(len(tif_file_names))
    for idx in tiff_progress_bar:
        # NOTE(review): max_memory_usage is a module-level global updated in
        # calculate_values — confirm it is initialised before the first tile.
        tiff_progress_bar.set_postfix_str(f"{humanize.naturalsize(max_memory_usage)}")
        src_file_name = Path("./.cache/" + Path(tif_file_names[idx]).name)

        if not src_file_name.exists():
            s3.download_file(s3_bucket, tif_file_names[idx], str(src_file_name.resolve()))

        # Context manager guarantees the raster is closed even when
        # calculate_values raises (the original leaked the handle on
        # exception) and removes the duplicated manual close() calls.
        with rasterio.open(src_file_name) as ds:
            ds_bbox = box(*ds.bounds)
            intersections = intersection_fn(ds_bbox, stats_df, bottom_level)

            if len(intersections) == 0:
                continue

            results = calculate_values(ds, intersections)
            stats_df.loc[results.index] = results

    sum_children(stats_df, bottom_level, level_fn, child_fn, code_column_name)
    calculate_total_area(stats_df)

    file_names = output_by_level(bottom_level, stats_df, level_fn, output_path)
    print("Done")

    return file_names

datasets/worldsoils.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# https://gui.world-soils.com/mapserver/Europe?SERVICE=WCS&VERSION=2.0.1&REQUEST=GetCoverage&COVERAGEID=soc_0-5cm_mean_europe_2020-2022&FORMAT=image/tiff&OUTPUTCRS=http://www.opengis.net/def/crs/EPSG/0/4326&SUBSET=long(8,10)&SUBSET=lat(50,52)&SUBSETTINGCRS=http://www.opengis.net/def/crs/EPSG/0/4326
2+
from collections.abc import Callable
3+
4+
from pandas import DataFrame, Series
5+
import geopandas as gpd
6+
7+
type LevelFunc = Callable[[DataFrame, int], Series]
8+
9+
10+
def get_tile_keys(geom_df: DataFrame, level: int, level_fn: LevelFunc) -> list[str]:
    """Return the S3 keys of WorldCover tiles intersecting the level-*level* regions.

    Spatially joins the tile grid against the selected regions, deduplicates
    by tile id, writes the selected tiles (with their keys) to
    output/worldcover-tiles.fgb, and returns the list of tile keys.
    """
    regions = geom_df[level_fn(geom_df, level)]

    grid = gpd.read_file("input/esa_worldcover_grid.fgb", engine="pyogrio")
    joined = gpd.sjoin(grid, regions, how="inner")
    tiles = joined.drop_duplicates(subset="ll_tile").copy()

    keys = [
        f"v200/2021/map/ESA_WorldCover_10m_2021_v200_{tile_name.strip()}_Map.tif"
        for tile_name in tiles["ll_tile"].tolist()
    ]
    tiles["tile_key"] = keys

    tiles = tiles[["ll_tile", "tile_key", "geometry"]]
    tiles.to_file("output/worldcover-tiles.fgb")

    return keys

inject_metadata.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import json
2+
from pathlib import Path
3+
4+
from osgeo import ogr
5+
from pydantic import BaseModel, ValidationError, field_serializer
6+
7+
8+
class MissingColumn(Exception):
    """Raised when a required column is absent from a FlatGeobuf layer."""
10+
11+
12+
class MetadataIn(BaseModel):
    """Caller-supplied metadata: the *names* of columns in the FlatGeobuf layer."""
    # Column holding the unique region identifier (e.g. NUTS_ID, HYBAS_ID).
    identifierKey: str
    # Column holding the human-readable name.
    nameKey: str
    # Column holding the hierarchy level.
    levelKey: str
    # Column holding the comma-separated child identifiers.
    childrenKey: str
    # Statistic columns to expose (e.g. the land-cover class names).
    attributeKeys: list[str]
18+
19+
20+
class MetadataOut(BaseModel):
    """Metadata as written to the file: attributeKeys become field *indices*."""
    identifierKey: str
    nameKey: str
    levelKey: str
    childrenKey: str
    # Field indices (resolved from the names in MetadataIn.attributeKeys).
    attributeKeys: list[int]

    # FGB metadata fields can only be strings.
    @field_serializer("attributeKeys")
    def serialize_attribute_keys(self, keys, _):
        # Emit the index list as a single comma-separated string.
        return ",".join([str(v) for v in keys])
31+
32+
33+
def inject_metadata(fgb_file_path: Path, metadata: dict):
    """Validate *metadata* against the layer and write it into the FlatGeobuf file.

    Checks that every referenced column exists, converts the attribute column
    names to field indices, then rewrites the file with the metadata attached.

    :param fgb_file_path: path to an existing .fgb file (overwritten in place).
    :param metadata: dict matching the MetadataIn schema.
    :raises pydantic.ValidationError: if *metadata* does not match MetadataIn.
    :raises FileNotFoundError: if the input file does not exist.
    :raises ValueError: if the input file is not a .fgb file.
    :raises MissingColumn: if any referenced column is absent from the layer.
    """
    # Raises ValidationError on bad input; the original wrapped this in a
    # no-op try/except that only re-raised.
    metadata = MetadataIn.model_validate(metadata)

    if not fgb_file_path.exists():
        raise FileNotFoundError(f"Input file '{fgb_file_path}' not found")
    if fgb_file_path.suffix != ".fgb":
        raise ValueError(f"Input file '{fgb_file_path}' is not a FlatGeobuf file")

    # GDAL's Python bindings expect a string path, not a pathlib.Path.
    src_ds = ogr.Open(str(fgb_file_path))
    src_layer = src_ds.GetLayer()

    # Check columns exist in the input file. The original duplicated this
    # check four times and labelled childrenKey as "Level column".
    defn = src_layer.GetLayerDefn()

    required_columns = {
        "Index": metadata.identifierKey,
        "Name": metadata.nameKey,
        "Level": metadata.levelKey,
        "Children": metadata.childrenKey,
    }
    for label, column in required_columns.items():
        if defn.GetFieldIndex(column) == -1:
            raise MissingColumn(f"{label} column '{column}' not found in {fgb_file_path}")

    attribute_column_indices = []

    for column in metadata.attributeKeys:
        column_index = defn.GetFieldIndex(column)
        if column_index == -1:
            raise MissingColumn(f"Column '{column}' not found in {fgb_file_path}")
        attribute_column_indices.append(column_index)

    metadata_out = MetadataOut(
        identifierKey=metadata.identifierKey,
        nameKey=metadata.nameKey,
        levelKey=metadata.levelKey,
        attributeKeys=attribute_column_indices,
        childrenKey=metadata.childrenKey
    )

    # Overwrite the input file.
    # NOTE(review): this creates a new datasource over the same path while
    # src_ds still holds the original open — confirm GDAL's FlatGeobuf driver
    # tolerates this; copying to a temp file and renaming would be safer.
    driver = ogr.GetDriverByName("FlatGeobuf")
    dst_ds = driver.CreateDataSource(str(fgb_file_path))
    dst_ds.CopyLayer(src_layer, src_layer.GetName())

    print(f"Metadata injected into {fgb_file_path}")
    print(f"{metadata_out}")

    # We serialize the metadata to a string to take advantage of Pydantic's
    # field serializer, then load it back up as a regular Python object so
    # OGR can set the metadata.
    dst_ds.SetMetadata(json.loads(metadata_out.model_dump_json()))

    # Dereference to flush and close the GDAL datasets.
    dst_ds = None
    src_ds = None
Binary file not shown.

0 commit comments

Comments
 (0)