Skip to content

Commit 5d278dc

Browse files
committed
Update archive and file source to accept objref
1 parent 1bd1322 commit 5d278dc

File tree

3 files changed

+45
-2
lines changed

3 files changed

+45
-2
lines changed

llmstack/data/sources/files/archive.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from llmstack.data.sources.base import BaseSource, DataDocument
1515
from llmstack.data.sources.utils import (
1616
create_source_document_asset,
17+
get_document_data_uri_from_objref,
1718
get_source_document_asset_by_objref,
1819
)
1920

@@ -74,10 +75,15 @@ def provider_slug(cls):
7475
return "promptly"
7576

7677
def get_data_documents(self, **kwargs) -> List[DataDocument]:
78+
archive_file = self.file
79+
# If objref:// is present, get the data URI from the objref
80+
if archive_file and archive_file.startswith("objref://"):
81+
archive_file = get_document_data_uri_from_objref(archive_file, datasource_uuid=kwargs["datasource_uuid"])
82+
7783
if self.split_files:
78-
files = extract_archive_files(*validate_parse_data_uri(self.file))
84+
files = extract_archive_files(*validate_parse_data_uri(archive_file))
7985
else:
80-
files = [self.file]
86+
files = [archive_file]
8187

8288
documents = []
8389
for file in files:

llmstack/data/sources/files/file.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from llmstack.data.sources.base import BaseSource, DataDocument
1111
from llmstack.data.sources.utils import (
1212
create_source_document_asset,
13+
get_document_data_uri_from_objref,
1314
get_source_document_asset_by_objref,
1415
)
1516

@@ -48,6 +49,21 @@ def provider_slug(cls):
4849

4950
def get_data_documents(self, **kwargs) -> List[DataDocument]:
5051
# Resolve every "|"-separated entry of self.file. Entries that start with
# "objref://" are exchanged for the value returned by
# get_document_data_uri_from_objref; all other entries are kept unchanged.
# The previous version passed an undefined name (`file_objref`) instead of
# the entry being processed, which raised NameError for any objref entry.
# Entries whose objref resolves to None are dropped.
files = [
    resolved_entry
    for resolved_entry in (
        (
            get_document_data_uri_from_objref(entry, datasource_uuid=kwargs["datasource_uuid"])
            if entry.startswith("objref://")
            else entry
        )
        for entry in self.file.split("|")
    )
    if resolved_entry is not None
]
5167
documents = []
5268
for file in files:
5369
file_id = str(uuid.uuid4())

llmstack/data/sources/utils.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,3 +71,24 @@ def get_source_document_asset_by_objref(objref):
7171
pass
7272

7373
return asset
74+
75+
76+
def get_document_data_uri_from_objref(objref, datasource_uuid):
    """Resolve an objref string to the data URI of the asset it points at.

    The objref is split on "//"; the part after the first "//" must consist
    of exactly two "/"-separated segments, the second of which is used as
    the uuid of a ``DataSourceEntryFiles`` row.

    Args:
        objref: Reference string to resolve. Falsy values yield ``None``.
        datasource_uuid: Value that must equal the ``datasource_uuid`` entry
            of the asset's metadata for the asset to be returned.

    Returns:
        The result of ``DataSourceEntryFiles.get_asset_data_uri(asset,
        include_name=True)``, or ``None`` when the objref is empty or
        malformed, the asset's metadata names a different datasource, or
        the lookup fails.
    """
    import logging

    logger = logging.getLogger(__name__)

    if not objref:
        return None

    # Parse first so malformed references are rejected without touching the
    # database. AttributeError covers truthy non-string values, which the
    # previous blanket handler also turned into None.
    try:
        _, asset_uuid = objref.strip().split("//")[1].split("/")
    except (AttributeError, IndexError, ValueError):
        logger.warning("Ignoring malformed objref: %r", objref)
        return None

    from llmstack.data.models import DataSourceEntryFiles

    try:
        asset_obj = DataSourceEntryFiles.objects.get(uuid=asset_uuid)

        # Refuse to hand out assets whose metadata names another datasource.
        if asset_obj.metadata.get("datasource_uuid") != datasource_uuid:
            return None

        return DataSourceEntryFiles.get_asset_data_uri(asset_obj, include_name=True)
    except Exception:
        # Resolution stays best-effort (callers receive None), but the
        # failure is now logged instead of being silently discarded.
        logger.exception("Failed to resolve objref %r", objref)
        return None

0 commit comments

Comments
 (0)