Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions pyiceberg/io/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,11 +392,17 @@ def __init__(self, properties: Properties = EMPTY_DICT):
super().__init__(properties=properties)

@staticmethod
def parse_location(location: str) -> Tuple[str, str, str]:
"""Return the path without the scheme."""
def parse_location(location: str, properties: Properties = EMPTY_DICT) -> Tuple[str, str, str]:
"""Return (scheme, netloc, path) for the given location.

Uses DEFAULT_SCHEME and DEFAULT_NETLOC if scheme/netloc are missing.
"""
uri = urlparse(location)

if not uri.scheme:
return "file", uri.netloc, os.path.abspath(location)
default_scheme = properties.get("DEFAULT_SCHEME", "file")
default_netloc = properties.get("DEFAULT_NETLOC", "")
return default_scheme, default_netloc, os.path.abspath(location)
elif uri.scheme in ("hdfs", "viewfs"):
return uri.scheme, uri.netloc, uri.path
else:
Expand Down Expand Up @@ -614,7 +620,7 @@ def new_input(self, location: str) -> PyArrowFile:
Returns:
PyArrowFile: A PyArrowFile instance for the given location.
"""
scheme, netloc, path = self.parse_location(location)
scheme, netloc, path = self.parse_location(location, self.properties)
return PyArrowFile(
fs=self.fs_by_scheme(scheme, netloc),
location=location,
Expand All @@ -631,7 +637,7 @@ def new_output(self, location: str) -> PyArrowFile:
Returns:
PyArrowFile: A PyArrowFile instance for the given location.
"""
scheme, netloc, path = self.parse_location(location)
scheme, netloc, path = self.parse_location(location, self.properties)
return PyArrowFile(
fs=self.fs_by_scheme(scheme, netloc),
location=location,
Expand All @@ -652,7 +658,7 @@ def delete(self, location: Union[str, InputFile, OutputFile]) -> None:
an AWS error code 15.
"""
str_location = location.location if isinstance(location, (InputFile, OutputFile)) else location
scheme, netloc, path = self.parse_location(str_location)
scheme, netloc, path = self.parse_location(str_location, self.properties)
fs = self.fs_by_scheme(scheme, netloc)

try:
Expand Down
26 changes: 26 additions & 0 deletions tests/io/test_pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2785,3 +2785,29 @@ def _expected_batch(unit: str) -> pa.RecordBatch:
)

assert _expected_batch("ns" if format_version > 2 else "us").equals(actual_result)


def test_parse_location_defaults() -> None:
"""Test that parse_location uses defaults."""

from pyiceberg.io.pyarrow import PyArrowFileIO

# if no default scheme or netloc is provided, use file scheme and empty netloc
scheme, netloc, path = PyArrowFileIO.parse_location("/foo/bar")
assert scheme == "file"
assert netloc == ""
assert path == "/foo/bar"

scheme, netloc, path = PyArrowFileIO.parse_location(
"/foo/bar", properties={"DEFAULT_SCHEME": "scheme", "DEFAULT_NETLOC": "netloc:8000"}
)
assert scheme == "scheme"
assert netloc == "netloc:8000"
assert path == "/foo/bar"

scheme, netloc, path = PyArrowFileIO.parse_location(
"/foo/bar", properties={"DEFAULT_SCHEME": "hdfs", "DEFAULT_NETLOC": "netloc:8000"}
)
assert scheme == "hdfs"
assert netloc == "netloc:8000"
assert path == "/foo/bar"