Skip to content
This repository was archived by the owner on Feb 18, 2023. It is now read-only.

Commit 5654135

Browse files
committed
utils overhaul
1 parent aae1261 commit 5654135

File tree

6 files changed

+179
-153
lines changed

6 files changed

+179
-153
lines changed

preprocessing.ipynb

Lines changed: 51 additions & 68 deletions
Large diffs are not rendered by default.

utils/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,4 @@
44
import utils.geo
55
import utils.img
66
import utils.coco
7-
import utils.viz
87

utils/coco.py

Lines changed: 46 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,30 +6,34 @@
66
import itertools
77

88
from shapely.geometry import Polygon, MultiPolygon
9-
import utils.other
109
import numpy as np
10+
import matplotlib.pyplot as plt
11+
from matplotlib.collections import PatchCollection
12+
from descartes import PolygonPatch
13+
from PIL import Image as pilimage
14+
15+
import utils.other
1116

1217

13-
def train_test_split_coco(chips_stats: Dict) -> Tuple[List, List]:
14-
chips_list = list(chips_stats.keys())
18+
def train_test_split(chip_dfs: Dict, test_size=0.2, seed: int = 1) -> Tuple[Dict, Dict]:
    """Split chips into a training and a validation ("val") set.

    The chip keys are shuffled reproducibly and the first ``round(len(chip_dfs) * test_size)``
    shuffled keys become the validation set; all remaining keys form the training set.

    Args:
        chip_dfs: Dictionary keyed by chip name. Values are passed through unchanged.
        test_size: Fraction of the chips assigned to the validation set.
        seed: Seed of the shuffle. The default of 1 reproduces the split of earlier versions.

    Returns:
        Tuple ``(train_chip_dfs, val_chip_dfs)``. Both dictionaries are ordered by their original
        (sorted) key; in the validation dictionary every 'train' in a key is replaced by 'val'.
    """
    chips_list = list(chip_dfs.keys())
    # A private generator yields the same permutation as `random.seed(seed); random.shuffle(...)`
    # but leaves the state of the process-wide random number generator untouched.
    random.Random(seed).shuffle(chips_list)

    split_idx = round(len(chips_list) * test_size)
    val_split = chips_list[:split_idx]
    train_split = chips_list[split_idx:]

    train_chip_dfs = {k: chip_dfs[k] for k in sorted(train_split)}
    val_chip_dfs = {k.replace('train', 'val'): chip_dfs[k] for k in sorted(val_split)}

    return train_chip_dfs, val_chip_dfs
2631

2732

28-
def format_coco(set_: Dict, chip_width: int, chip_height: int):
29-
"""
30-
Format extracted chip geometries to COCO json format.
33+
def format_coco(chip_dfs: Dict, chip_width: int, chip_height: int):
34+
"""Format train and test chip geometries to COCO json format.
3135
32-
Coco train/val have specific ids, formatting requires the split data..
36+
COCO train and val set have specific ids.
3337
"""
3438
cocojson = {
3539
"info": {},
@@ -38,7 +42,7 @@ def format_coco(set_: Dict, chip_width: int, chip_height: int):
3842
'id': 1, # id needs to match category_id.
3943
'name': 'agfields_singleclass'}]}
4044

41-
for key_idx, key in enumerate(set_.keys()):
45+
for key_idx, key in enumerate(chip_dfs.keys()):
4246
if 'train' in key:
4347
chip_id = int(key[21:])
4448
elif 'val' in key:
@@ -50,16 +54,16 @@ def format_coco(set_: Dict, chip_width: int, chip_height: int):
5054
"width": chip_height})
5155
cocojson.setdefault('images', []).append(key_image)
5256

53-
for row_idx, row in set_[key]['chip_df'].iterrows():
57+
for row_idx, row in chip_dfs[key]['chip_df'].iterrows():
5458
# Convert geometry to COCO segmentation format:
5559
# From shapely POLYGON ((x y, x1 y2, ..)) to COCO [[x, y, x1, y1, ..]].
5660
# COCO stores single objects (iscrowd=0) as polygon lists; RLE is only used for crowd regions (iscrowd=1).
5761
coco_xy = list(itertools.chain.from_iterable((x, y) for x, y in zip(*row.geometry.exterior.coords.xy)))
58-
coco_xy = [round(xy, 2) for xy in coco_xy]
62+
coco_xy = [round(coords, 2) for coords in coco_xy]
5963
# Add COCO bbox in format [minx, miny, width, height]
6064
bounds = row.geometry.bounds # COCO bbox
6165
coco_bbox = [bounds[0], bounds[1], bounds[2] - bounds[0], bounds[3] - bounds[1]]
62-
coco_bbox = [round(xy, 2) for xy in coco_bbox]
66+
coco_bbox = [round(coords, 2) for coords in coco_bbox]
6367

6468
key_annotation = {"id": key_idx,
6569
"image_id": int(chip_id),
@@ -77,7 +81,12 @@ def format_coco(set_: Dict, chip_width: int, chip_height: int):
7781

7882

7983
def move_coco_val_images(val_chips_list, path_train_folder):
80-
"""Move val chip images to val folder, applies train/val split on images"""
84+
"""Move validation chip images to val folder (applies train/val split on images)
85+
86+
Args:
87+
val_chips_list: List of validation image key names
88+
path_train_folder: Filepath to the training COCO image chip "train" folder
89+
"""
8190
out_folder = path_train_folder.parent / 'val2016'
8291
Path(out_folder).mkdir(parents=True, exist_ok=True)
8392
for chip in val_chips_list:
@@ -86,16 +95,15 @@ def move_coco_val_images(val_chips_list, path_train_folder):
8695

8796
def coco_to_shapely(fp_coco_json: Union[Path, str],
8897
categories: List[int]=None) -> Dict:
89-
"""
90-
Transforms coco json annotations to shapely format.
98+
"""Transforms COCO annotations to shapely geometry format.
9199
92100
Args:
93101
fp_coco_json: Input filepath coco json file.
94102
categories: Categories will filter to specific categories and images that contain at least one
95103
annotation of that category.
96104
97105
Returns:
98-
Dictionary of image key and shapely Multipolygon
106+
Dictionary of image key and shapely Multipolygon.
99107
"""
100108

101109
data = utils.other.load_saved(fp_coco_json, file_format='json')
@@ -110,8 +118,9 @@ def coco_to_shapely(fp_coco_json: Union[Path, str],
110118
extracted_geometries = {}
111119
for image_id, file_name in zip(image_ids, file_names):
112120
annotations = [x for x in data['annotations'] if x['image_id'] == image_id]
113-
# Filter to annotations of the selected category.
114-
annotations = [x for x in annotations if x['category_id'] in categories]
121+
if categories is not None:
122+
annotations = [x for x in annotations if x['category_id'] in categories]
123+
115124
segments = [segment['segmentation'][0] for segment in annotations] # format [x,y,x1,y1,...]
116125

117126
# Create shapely Multipolygons from COCO format polygons.
@@ -121,3 +130,19 @@ def coco_to_shapely(fp_coco_json: Union[Path, str],
121130
return extracted_geometries
122131

123132

133+
def plot_coco(in_json, chip_img_folder, start=0, end=2):
    """Plot image chips with their COCO annotations drawn as red polygon outlines.

    The annotations are read with `coco_to_shapely`; one matplotlib figure is shown per image key.

    Args:
        in_json: Filepath to the COCO json annotation file.
        chip_img_folder: Folder containing the image chips. Each image is opened as
            ``chip_img_folder / key``, where key is an image key returned by `coco_to_shapely`.
        start: Position (in sorted key order) of the first image to plot.
        end: Position (exclusive, in sorted key order) at which plotting stops.
    """
    extracted = coco_to_shapely(in_json)

    for key in sorted(extracted.keys())[start:end]:
        print(key)
        plt.figure(figsize=(5, 5))
        plt.axis('off')

        # pathlib joins with the separator of the running platform; the previous hard-coded
        # backslash only produced a valid path on Windows. The context manager closes the file.
        with pilimage.open(Path(chip_img_folder) / key) as chip_img:
            img = np.asarray(chip_img)
        plt.imshow(img, interpolation='none')

        mp = extracted[key]
        # `.geoms` is the supported way to iterate the parts of a multi-part geometry.
        patches = [PolygonPatch(p, ec='r', fill=False, alpha=1, lw=0.7, zorder=1) for p in mp.geoms]
        plt.gca().add_collection(PatchCollection(patches, match_original=True))
        plt.show()

utils/geo.py

Lines changed: 78 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
11
# geo.py
22

33
import warnings
4-
from typing import Union, Dict, Tuple, List
5-
import random
6-
import itertools
4+
from typing import Union, Dict
75

86
import numpy as np
9-
import geopandas as gpd
107
from geopandas import GeoDataFrame as GDF
118
from pandas import DataFrame as DF
129
import shapely
13-
from shapely.geometry import Polygon, MultiPolygon
10+
from shapely.geometry import Polygon
1411
import rasterio.crs
15-
from pathlib import Path
12+
import geopandas as gpd
13+
from tqdm import tqdm
14+
15+
import utils.img
1616

1717

1818
def buffer_zero(in_geo: Union[GDF, Polygon]) -> Union[GDF, Polygon]:
@@ -54,6 +54,25 @@ def set_crs(df: GDF, epsg_code: Union[int, str]) -> GDF:
5454
return df
5555

5656

57+
def explode_mp(df: GDF) -> GDF:
    """Explode all multi-polygon geometries in a geodataframe into individual polygon geometries.

    Rows with a Polygon geometry are kept as they are. Every row with a MultiPolygon geometry is
    replaced by one row per part polygon, each carrying a copy of the row's other column values.
    The exploded rows are added at the end of the geodataframe and the index is reset. Rows of
    any other geometry type are not part of the result.

    Args:
        df: Input geodataframe. The geometries are read from its 'geometry' column.

    Returns:
        Geodataframe containing only Polygon geometries, with a fresh 0..n-1 index.
    """
    import pandas as pd  # local import: this module only imports `DataFrame` from pandas.

    geom_types = df.geom_type
    outdf = df[geom_types == 'Polygon']
    df_mp = df[geom_types == 'MultiPolygon']

    # Collect all exploded rows first and concatenate once, instead of appending to the
    # dataframe inside the loop (which copies the whole dataframe on every iteration).
    exploded_rows = []
    for _, row in df_mp.iterrows():
        attributes = row.to_dict()
        # `.geoms` is the supported accessor for the parts of a multi-part geometry.
        for part in row.geometry.geoms:
            exploded_rows.append({**attributes, 'geometry': part})

    if exploded_rows:
        df_parts = gpd.GeoDataFrame(exploded_rows, columns=df_mp.columns, crs=df.crs)
        outdf = pd.concat([outdf, df_parts], ignore_index=True)

    return outdf.reset_index(drop=True)
74+
75+
5776
def clip(df: GDF,
5877
clip_poly: Polygon,
5978
explode_mp_: bool = False,
@@ -102,7 +121,7 @@ def reclassify_col(df: Union[GDF, DF],
102121
) -> Union[GDF, DF]:
103122
"""Reclassify class label and class ids in a dataframe column.
104123
105-
# TODO: Make more efficient!
124+
# TODO: Simplify & make more efficient!
106125
Args:
107126
df: input geodataframe.
108127
rcl_scheme: Reclassification scheme, e.g. {'springcereal': [1,2,3], 'wintercereal': [10,11]}
@@ -126,6 +145,7 @@ def reclassify_col(df: Union[GDF, DF],
126145

127146
df[f'rcl_{col_classlabels}'] = df[col_classids].copy().map(rcl_dict) # map name first, id second!
128147
df[f'rcl_{col_classids}'] = df[col_classids].map(rcl_dict_id)
148+
129149
return df
130150

131151

@@ -250,4 +270,54 @@ def _invert_y_axis(poly: Polygon=ingeo, reference_height=reference_height):
250270
return _invert_y_axis(poly=ingeo, reference_height=reference_height)
251271
elif isinstance(ingeo, GDF):
252272
ingeo.geometry = ingeo.geometry.apply(lambda _p: _invert_y_axis(poly=_p, reference_height=reference_height))
253-
return ingeo
273+
return ingeo
274+
275+
276+
def cut_chip_geometries(vector_df, raster_width, raster_height, raster_transform, chip_width=128, chip_height=128,
                        min_area=5000, area_scale=10 * 10):
    """Workflow to cut a vector geodataframe to chip geometries.

    For every chip window produced by `utils.img.get_chip_windows` (partial chips are skipped) the
    geometries are clipped to the chip polygon, simplified, filtered by area, transformed to chip
    pixel coordinates and flipped along the y-axis. Chips without remaining geometries are skipped.

    Args:
        vector_df: Geodataframe containing the geometries to be cut to chip geometries.
        raster_width: rasterio meta['width']
        raster_height: rasterio meta['height']
        raster_transform: rasterio meta['transform']
        chip_width: Desired pixel width.
        chip_height: Desired pixel height.
        min_area: A geometry is kept only if ``area * area_scale > min_area``. The default
            reproduces the previously hard-coded threshold (commented as "5000 sqm in UTM").
        area_scale: Factor applied to the geometry area before comparing it with `min_area`.
            NOTE(review): presumably the ground area of one pixel (10 m x 10 m) - confirm.

    Returns:
        Dictionary keyed by chip name ``COCO_train2016_000000<100000 + i>``, where i is the position
        of the chip window in the generator (skipped chips leave gaps in the numbering). Each value
        is a dictionary with the keys 'chip_df', 'chip_window', 'chip_transform' and 'chip_poly'.
    """
    generator_window_bounds = utils.img.get_chip_windows(raster_width=raster_width,
                                                         raster_height=raster_height,
                                                         raster_transform=raster_transform,
                                                         chip_width=chip_width,
                                                         chip_height=chip_height,
                                                         skip_partial_chips=True)

    all_chip_dfs = {}
    for i, (chip_window, chip_transform, chip_poly) in enumerate(tqdm(generator_window_bounds)):

        # Clip geometry to chip. Sibling functions of this module are called directly rather than
        # through `utils.geo.`, which this module never imports itself.
        chip_df = vector_df.pipe(clip, clip_poly=chip_poly, keep_biggest_poly_=True)
        # `.all()` is also True for a selection without rows, so chips without any geometry are skipped.
        if chip_df.geometry.is_empty.all():
            continue
        chip_df.geometry = chip_df.simplify(1, preserve_topology=True)

        # Drop small geometries.
        chip_df = chip_df[chip_df.geometry.area * area_scale > min_area]
        if chip_df.geometry.is_empty.all():
            continue

        # Transform to chip pixelcoordinates and invert y-axis for COCO format.
        chip_df = chip_df.pipe(to_pixelcoords, reference_bounds=chip_poly.bounds, scale=True,
                               ncols=chip_width, nrows=chip_height)
        chip_df = chip_df.pipe(invert_y_axis, reference_height=chip_height)

        chip_name = f'COCO_train2016_000000{100000+i}'
        all_chip_dfs[chip_name] = {'chip_df': chip_df,
                                   'chip_window': chip_window,
                                   'chip_transform': chip_transform,
                                   'chip_poly': chip_poly}
    return all_chip_dfs

utils/img.py

Lines changed: 2 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# img.py
22

3-
from typing import Tuple, Generator, Union
3+
from typing import Tuple, Generator
44

55
import rasterio.windows
66
from rasterio.windows import Window
@@ -9,7 +9,6 @@
99
import warnings
1010
from pathlib import Path
1111

12-
import utils.geo
1312
import itertools
1413
import numpy as np
1514
import rasterio
@@ -63,56 +62,6 @@ def get_chip_windows(raster_width: int,
6362
yield (chip_window, chip_transform, chip_poly)
6463

6564

66-
def cut_chip_geometries(vector_df, raster_width, raster_height, raster_transform, chip_width=128, chip_height=128,):
67-
"""Workflow to cut a vector geodataframe to chip geometries.
68-
69-
Filters small polygons and skips empty chips.
70-
71-
Args:
72-
vector_df: Geodataframe containing the geometries to be cut to chip geometries.
73-
raster_width: rasterio meta['width']
74-
raster_height: rasterio meta['height']
75-
raster_transform: rasterio meta['transform']
76-
chip_width: Desired pixel width.
77-
chip_height: Desired pixel height.
78-
79-
Returns: Dictionary containing the final chip_df, chip_window, chip_transform, chip_poly objects.
80-
"""
81-
82-
generator_window_bounds = get_chip_windows(raster_width=raster_width,
83-
raster_height=raster_height,
84-
raster_transform=raster_transform,
85-
chip_width=chip_width,
86-
chip_height=chip_height,
87-
skip_partial_chips=True)
88-
89-
all_chip_dfs = {}
90-
for i, (chip_window, chip_transform, chip_poly) in enumerate(tqdm(generator_window_bounds)):
91-
92-
# # Clip geometry to chip
93-
chip_df = vector_df.pipe(utils.geo.clip, clip_poly=chip_poly, keep_biggest_poly_=True)
94-
if not all(chip_df.geometry.is_empty):
95-
chip_df.geometry = chip_df.simplify(1, preserve_topology=True)
96-
else:
97-
continue
98-
# Drop small geometries
99-
chip_df = chip_df[chip_df.geometry.area * (10 * 10) > 5000] #5000 sqm in UTM
100-
# Transform to chip pixelcoordinates and invert y-axis for COCO format.
101-
if not all(chip_df.geometry.is_empty):
102-
chip_df = chip_df.pipe(utils.geo.to_pixelcoords, reference_bounds=chip_poly.bounds, scale=True,
103-
ncols=chip_width, nrows=chip_height)
104-
chip_df = chip_df.pipe(utils.geo.invert_y_axis, reference_height=chip_height)
105-
else:
106-
continue
107-
108-
chip_name = f'COCO_train2016_000000{100000+i}' # _{clip_minX}_{clip_minY}_{clip_maxX}_{clip_maxY}'
109-
all_chip_dfs[chip_name] = {'chip_df': chip_df,
110-
'chip_window': chip_window,
111-
'chip_transform': chip_transform,
112-
'chip_poly': chip_poly}
113-
return all_chip_dfs
114-
115-
11665
def cut_chip_images(img_path, chip_windows, out_folder, bands=[3, 2, 1]):
11766
"""Cuts image chips and exports them
11867
@@ -128,7 +77,7 @@ def cut_chip_images(img_path, chip_windows, out_folder, bands=[3, 2, 1]):
12877
src = rasterio.open(img_path)
12978

13079
all_chip_stats = {}
131-
for i, chip_window in enumerate(chip_windows):
80+
for i, chip_window in enumerate(tqdm(chip_windows)):
13281
img_array = np.dstack(list(src.read(bands, window=chip_window)))
13382
img_array = exposure.rescale_intensity(img_array, in_range=(0, 2200)) # Sentinel2 range.
13483
with warnings.catch_warnings():

utils/other.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# other.py
22

3-
from typing import Union, Any, Callable, Tuple, Dict, Iterable, List
3+
from typing import Union, Any, Callable, Dict
44
from pathlib import Path
55
import pickle
66

@@ -40,7 +40,7 @@ def load_or_new_save(path: Path,
4040
"""Write data to new pickle/json file or load pickle/json if that file already exists.
4141
4242
Example:
43-
df = cgeo.other.load_or_new_save(path=Path('output\preprocessed_marker_small.pkl'),
43+
df = utils.other.load_or_new_save(path=Path('output/preprocessed_marker_small.pkl'),
4444
default_data=preprocess_vector,
4545
callable_args={'inpath': fp_fields, 'meta': meta})
4646
Args:

0 commit comments

Comments
 (0)