Skip to content

Performance optimization for permute_dims #393

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 30, 2025
167 changes: 121 additions & 46 deletions bench/ndarray/transpose.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@
"import blosc2\n",
"import time\n",
"import plotly.express as px\n",
"import pandas as pd"
"import pandas as pd\n",
"\n",
"from blosc2 import NDArray\n",
"from typing import Any\n",
"\n",
"import builtins"
],
"id": "55765646130156ef",
"outputs": [],
Expand All @@ -18,35 +23,104 @@
"metadata": {},
"cell_type": "code",
"source": [
"sizes = [(100, 100), (500, 500), (500, 1000), (1000, 1000), (2000, 2000), (3000, 3000), (4000, 4000), (5000, 5000)]\n",
"sizes_mb = [(np.prod(size) * 8) / 2**20 for size in sizes] # Convert to MB\n",
"results = {\"numpy\": [], \"blosc2\": []}"
"def new_permute_dims(arr: NDArray, axes: tuple[int] | list[int] | None = None, **kwargs: Any) -> NDArray:\n",
" if np.isscalar(arr) or arr.ndim < 2:\n",
" return arr\n",
"\n",
" ndim = arr.ndim\n",
" if axes is None:\n",
" axes = tuple(range(ndim))[::-1]\n",
" else:\n",
" axes = tuple(axis if axis >= 0 else ndim + axis for axis in axes)\n",
" if sorted(axes) != list(range(ndim)):\n",
" raise ValueError(f\"axes {axes} is not a valid permutation of {ndim} dimensions\")\n",
"\n",
" new_shape = tuple(arr.shape[axis] for axis in axes)\n",
" if \"chunks\" not in kwargs or kwargs[\"chunks\"] is None:\n",
" kwargs[\"chunks\"] = tuple(arr.chunks[axis] for axis in axes)\n",
"\n",
" result = blosc2.empty(shape=new_shape, dtype=arr.dtype, **kwargs)\n",
"\n",
" # Precomputar info por dimensión\n",
" chunks = arr.chunks\n",
" shape = arr.shape\n",
"\n",
" for info in arr.iterchunks_info():\n",
" coords = info.coords\n",
" start_stop = [\n",
" (coord * chunk, builtins.min(chunk * (coord + 1), dim))\n",
" for coord, chunk, dim in zip(coords, chunks, shape)\n",
" ]\n",
"\n",
" src_slice = tuple(slice(start, stop) for start, stop in start_stop)\n",
" dst_slice = tuple(slice(start_stop[ax][0], start_stop[ax][1]) for ax in axes)\n",
"\n",
" transposed = np.transpose(arr[src_slice], axes=axes)\n",
" result[dst_slice] = np.ascontiguousarray(transposed)\n",
"\n",
" return result"
],
"id": "1cfb7daa6eee1401",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"metadata": {
"jupyter": {
"is_executing": true
}
},
"cell_type": "code",
"source": [
"for method in [\"numpy\", \"blosc2\"]:\n",
" for size in sizes:\n",
" arr = np.random.rand(*size)\n",
" arr_b2 = blosc2.asarray(arr)\n",
"def validate_results(result_orig, result_new, shape):\n",
" if not np.allclose(result_orig[:], result_new[:]):\n",
" raise ValueError(f\"Mismatch found for shape {shape}\")\n",
"\n",
"shapes = [\n",
" (100, 100), (2000, 2000), (3000, 3000), (4000, 4000), (3000, 7000),\n",
" (5000, 5000), (6000, 6000), (7000, 7000), (8000, 8000), (6000, 12000),\n",
" (9000, 9000), (10000, 10000), (10500, 10500), (11000, 11000), (11500, 11500),\n",
" (12000, 12000), (12500, 12500), (13000, 13000), (13500, 13500), (14000, 14000),\n",
" (14500, 14500), (15000, 15000), (16000, 16000), (16500, 16500), (17000, 17000),\n",
" (17500, 17500), (18000, 18000)\n",
"]\n",
"\n",
"sizes = []\n",
"time_total = []\n",
"chunk_labels = []\n",
"\n",
" start_time = time.perf_counter()\n",
"def numpy_permute(arr: np.ndarray, axes: tuple[int] | list[int] | None = None) -> np.ndarray:\n",
" if axes is None:\n",
" axes = range(arr.ndim)[::-1]\n",
" return np.transpose(arr, axes=axes).copy()\n",
"\n",
" if method == \"numpy\":\n",
" np.transpose(arr).copy()\n",
" elif method == \"blosc2\":\n",
" blosc2.transpose(arr_b2)\n",
"for shape in shapes:\n",
" size_mb = (np.prod(shape) * 8) / (2 ** 20)\n",
"\n",
" # NumPy transpose\n",
" matrix_numpy = np.linspace(0, 1, np.prod(shape)).reshape(shape)\n",
" t0 = time.perf_counter()\n",
" result_numpy = numpy_permute(matrix_numpy)\n",
" t1 = time.perf_counter()\n",
" time_total.append(t1 - t0)\n",
" sizes.append(size_mb)\n",
" chunk_labels.append(\"numpy.transpose()\")\n",
"\n",
" end_time = time.perf_counter()\n",
" time_b = end_time - start_time\n",
" # New permute dims (optimized)\n",
" matrix_blosc2 = blosc2.linspace(0, 1, np.prod(shape), shape=shape)\n",
" t0 = time.perf_counter()\n",
" result_new_perm = new_permute_dims(matrix_blosc2)\n",
" t1 = time.perf_counter()\n",
" time_total.append(t1 - t0)\n",
" sizes.append(size_mb)\n",
" chunk_labels.append(\"blosc2.permute_dims()\")\n",
"\n",
" try:\n",
" validate_results(result_new_perm, result_numpy, shape)\n",
" except ValueError as e:\n",
" print(e)\n",
"\n",
" print(f\"{method}: shape={size}, Performance = {time_b:.6f} s\")\n",
" results[method].append(time_b)"
" print(f\"Shape={shape}, Chunk={matrix_blosc2.chunks}: permute_dims={time_total[-2]:.6f}s, numpy={time_total[-1]:.6f}s\")"
],
"id": "384d0ad7983a8d26",
"outputs": [],
Expand All @@ -57,21 +131,21 @@
"cell_type": "code",
"source": [
"df = pd.DataFrame({\n",
" \"Matrix Size (MB)\": sizes_mb,\n",
" \"NumPy Time (s)\": results[\"numpy\"],\n",
" \"Blosc2 Time (s)\": results[\"blosc2\"]\n",
" \"Matrix Size (MB)\": sizes,\n",
" \"Time (s)\": time_total,\n",
" \"Implementation\": chunk_labels\n",
"})\n",
"\n",
"fig = px.line(df,\n",
" x=\"Matrix Size (MB)\",\n",
" y=[\"NumPy Time (s)\", \"Blosc2 Time (s)\"],\n",
" title=\"Performance of Matrix Transposition (NumPy vs Blosc2)\",\n",
" labels={\"value\": \"Time (s)\", \"variable\": \"Method\"},\n",
" y=\"Time (s)\",\n",
" color=\"Implementation\",\n",
" title=\"Performance: NumPy vs Blosc2\",\n",
" width=1000, height=600,\n",
" markers=True)\n",
"\n",
"fig.show()"
],
"id": "c71ffb39eb28992c",
"id": "786b8b7b5ea95225",
"outputs": [],
"execution_count": null
},
Expand All @@ -81,15 +155,14 @@
"source": [
"%%time\n",
"shapes = [\n",
" (100, 100), (2000, 2000), (3000, 3000), (4000, 4000), (3000, 7000)\n",
" # (5000, 5000), (6000, 6000), (7000, 7000), (8000, 8000), (6000, 12000),\n",
" # (9000, 9000), (10000, 10000),\n",
" # (10500, 10500), (11000, 11000), (11500, 11500), (12000, 12000),\n",
" # (12500, 12500), (13000, 13000), (13500, 13500), (14000, 14000),\n",
" # (14500, 14500), (15000, 15000), (15500, 15500), (16000, 16000),\n",
" # (16500, 16500), (17000, 17000)\n",
" (100, 100), (1000, 1000), (2000, 2000), (3000, 3000), (4000, 4000),\n",
" (5000, 5000), (6000, 6000), (7000, 7000), (8000, 8000), (9000, 9000),\n",
" (9500, 9500), (10000, 10000), (10500, 10500), (11000, 11000), (11500, 11500),\n",
" (12000, 12000), (12500, 12500), (13000, 13000), (13500, 13500), (14000, 14000),\n",
" (14500, 14500), (15000, 15000), (16000, 16000), (16500, 16500), (17000, 17000)\n",
"]\n",
"chunkshapes = [None, (150, 300), (200, 500), (500, 200), (1000, 1000)]\n",
"\n",
"chunkshapes = [None, (150, 300), (1000, 1000), (4000, 4000)]\n",
"\n",
"sizes = []\n",
"time_total = []\n",
Expand All @@ -111,18 +184,27 @@
" print(f\"NumPy: Shape={shape}, Time = {numpy_time:.6f} s\")\n",
"\n",
" for chunk in chunkshapes:\n",
" matrix_blosc2 = blosc2.asarray(matrix_np, chunks=chunk)\n",
" matrix_blosc2 = blosc2.asarray(matrix_np)\n",
" matrix_blosc2 = blosc2.linspace(0, 1, np.prod(shape), shape=shape)\n",
"\n",
" t0 = time.perf_counter()\n",
" result_blosc2 = blosc2.transpose(matrix_blosc2)\n",
" result_blosc2 = new_permute_dims(matrix_blosc2, chunks=chunk)\n",
" blosc2_time = time.perf_counter() - t0\n",
"\n",
" sizes.append(size_mb)\n",
" time_total.append(blosc2_time)\n",
" chunk_labels.append(f\"{chunk[0]}x{chunk[1]}\" if chunk else \"Auto\")\n",
"\n",
" print(f\"Blosc2: Shape={shape}, Chunks = {matrix_blosc2.chunks}, Time = {blosc2_time:.6f} s\")\n",
"\n",
" print(f\"Blosc2: Shape={shape}, Chunks = {result_blosc2.chunks}, Time = {blosc2_time:.6f} s\")"
],
"id": "bcdd8aa5f65df561",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"df = pd.DataFrame({\n",
" \"Matrix Size (MB)\": sizes,\n",
" \"Time (s)\": time_total,\n",
Expand All @@ -135,17 +217,10 @@
" color=\"Chunk Shape\",\n",
" title=\"Performance of Matrix Transposition (Blosc2 vs NumPy)\",\n",
" labels={\"value\": \"Time (s)\", \"variable\": \"Metric\"},\n",
" width=1000, height=600,\n",
" markers=True)\n",
"fig.show()"
],
"id": "bcdd8aa5f65df561",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": "",
"id": "1d2f48f370ba7e7a",
"outputs": [],
"execution_count": null
Expand Down
47 changes: 25 additions & 22 deletions src/blosc2/ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -3876,11 +3876,11 @@ def matmul(x1: NDArray, x2: NDArray, **kwargs: Any) -> NDArray:
r = x2.chunks[-1]

for row in range(0, n, p):
row_end = (row + p) if (row + p) < n else n
row_end = builtins.min(row + p, n)
for col in range(0, m, q):
col_end = (col + q) if (col + q) < m else m
col_end = builtins.min(col + q, m)
for aux in range(0, k, r):
aux_end = (aux + r) if (aux + r) < k else k
aux_end = builtins.min(aux + r, k)
bx1 = x1[row:row_end, aux:aux_end]
bx2 = x2[aux:aux_end, col:col_end]
result[row:row_end, col:col_end] += np.matmul(bx1, bx2)
Expand Down Expand Up @@ -3951,6 +3951,7 @@ def permute_dims(arr: NDArray, axes: tuple[int] | list[int] | None = None, **kwa
[[13, 14, 15, 16],
[17, 18, 19, 20],
[21, 22, 23, 24]]])

>>> at = blosc2.permute_dims(a, axes=(1, 0, 2))
>>> at[:]
array([[[ 1, 2, 3, 4],
Expand All @@ -3960,37 +3961,39 @@ def permute_dims(arr: NDArray, axes: tuple[int] | list[int] | None = None, **kwa
[[ 9, 10, 11, 12],
[21, 22, 23, 24]]])
"""

if np.isscalar(arr) or arr.ndim < 2:
return arr

ndim = arr.ndim

if axes is None:
axes = range(arr.ndim)[::-1]
axes = tuple(range(ndim))[::-1]
else:
axes_transformed = tuple(axis if axis >= 0 else arr.ndim + axis for axis in axes)
if sorted(axes_transformed) != list(range(arr.ndim)):
raise ValueError(f"axes {axes} is not a valid permutation of {arr.ndim} dimensions")
axes = axes_transformed
axes = tuple(axis if axis >= 0 else ndim + axis for axis in axes)
if sorted(axes) != list(range(ndim)):
raise ValueError(f"axes {axes} is not a valid permutation of {ndim} dimensions")

new_shape = tuple(arr.shape[axis] for axis in axes)
if "chunks" not in kwargs:
if "chunks" not in kwargs or kwargs["chunks"] is None:
kwargs["chunks"] = tuple(arr.chunks[axis] for axis in axes)

result = blosc2.empty(shape=new_shape, dtype=arr.dtype, **kwargs)

chunk_slices = [
[slice(start, builtins.min(dim, start + chunk)) for start in range(0, dim, chunk)]
for dim, chunk in zip(arr.shape, arr.chunks, strict=False)
]
chunks = arr.chunks
shape = arr.shape

for info in arr.iterchunks_info():
coords = info.coords
start_stop = [
(coord * chunk, builtins.min(chunk * (coord + 1), dim))
for coord, chunk, dim in zip(coords, chunks, shape, strict=False)
]

block_counts = [len(s) for s in chunk_slices]
grid = np.indices(block_counts).reshape(len(block_counts), -1).T
src_slice = tuple(slice(start, stop) for start, stop in start_stop)
dst_slice = tuple(slice(start_stop[ax][0], start_stop[ax][1]) for ax in axes)

for idx in grid:
block_slices = tuple(chunk_slices[axis][i] for axis, i in enumerate(idx))
block = arr[block_slices]
target_slices = tuple(block_slices[axis] for axis in axes)
result[target_slices] = np.transpose(block, axes=axes).copy()
transposed = np.transpose(arr[src_slice], axes=axes)
result[dst_slice] = np.ascontiguousarray(transposed)

return result

Expand Down Expand Up @@ -4024,7 +4027,7 @@ def transpose(x, **kwargs: Any) -> NDArray:
stacklevel=2,
)

# If arguments are dimension < 2 they are returned
# If arguments are dimension < 2, they are returned
if np.isscalar(x) or x.ndim < 2:
return x

Expand Down
Loading