Skip to content

Commit da86719

Browse files
pjbull and jayqi
authored
Implement content type propagation (#226)
* Implement content type propagation * add to changelog * random filenames * fix for custom endpoint * check class and live before updating kwargs * update header level * Move content type to base client; add docstring * move other client settings to its own page * update version and changelog * Correct docstring error * fix typo in history Co-authored-by: Jay Qi <[email protected]> * set release date Co-authored-by: Jay Qi <[email protected]> Co-authored-by: Jay Qi <[email protected]>
1 parent 85268c8 commit da86719

File tree

13 files changed

+258
-43
lines changed

13 files changed

+258
-43
lines changed

HISTORY.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
# cloudpathlib Changelog
22

3-
## v0.7.2 (UNRELEASED)
3+
## v0.8.0 (2022-05-19)
44

55
- Fixed pickling of `CloudPath` objects not working. ([Issue #223](https://github.com/drivendataorg/cloudpathlib/issues/223), [PR #224](https://github.com/drivendataorg/cloudpathlib/pull/224))
6+
- Added functionality to push the MIME (media) type to the content type property on cloud providers by default. ([Issue #222](https://github.com/drivendataorg/cloudpathlib/issues/222), [PR #226](https://github.com/drivendataorg/cloudpathlib/pull/226))
67

78
## v0.7.1 (2022-04-06)
89

cloudpathlib/azure/azblobclient.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from datetime import datetime
2+
import mimetypes
23
import os
34
from pathlib import Path, PurePosixPath
4-
from typing import Any, Dict, Iterable, Optional, Tuple, Union
5+
from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union
56

67

78
from ..client import Client, register_client_class
@@ -12,7 +13,7 @@
1213

1314
try:
1415
from azure.core.exceptions import ResourceNotFoundError
15-
from azure.storage.blob import BlobServiceClient, BlobProperties
16+
from azure.storage.blob import BlobServiceClient, BlobProperties, ContentSettings
1617
except ModuleNotFoundError:
1718
implementation_registry["azure"].dependencies_loaded = False
1819

@@ -32,6 +33,7 @@ def __init__(
3233
connection_string: Optional[str] = None,
3334
blob_service_client: Optional["BlobServiceClient"] = None,
3435
local_cache_dir: Optional[Union[str, os.PathLike]] = None,
36+
content_type_method: Optional[Callable] = mimetypes.guess_type,
3537
):
3638
"""Class constructor. Sets up a [`BlobServiceClient`](
3739
https://docs.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobserviceclient?view=azure-python).
@@ -68,6 +70,8 @@ def __init__(
6870
https://docs.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobserviceclient?view=azure-python).
6971
local_cache_dir (Optional[Union[str, os.PathLike]]): Path to directory to use as cache
7072
for downloaded files. If None, will use a temporary directory.
73+
content_type_method (Optional[Callable]): Function to call to guess media type (mimetype) when
74+
writing a file to the cloud. Defaults to `mimetypes.guess_type`. Must return a tuple (content type, content encoding).
7175
"""
7276
if connection_string is None:
7377
connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING", None)
@@ -86,14 +90,16 @@ def __init__(
8690
"Credentials are required; see docs for options."
8791
)
8892

89-
super().__init__(local_cache_dir=local_cache_dir)
93+
super().__init__(local_cache_dir=local_cache_dir, content_type_method=content_type_method)
9094

9195
def _get_metadata(self, cloud_path: AzureBlobPath) -> Union["BlobProperties", Dict[str, Any]]:
9296
blob = self.service_client.get_blob_client(
9397
container=cloud_path.container, blob=cloud_path.blob
9498
)
9599
properties = blob.get_blob_properties()
96100

101+
properties["content_type"] = properties.content_settings.content_type
102+
97103
return properties
98104

99105
def _download_file(
@@ -220,7 +226,18 @@ def _upload_file(
220226
container=cloud_path.container, blob=cloud_path.blob
221227
)
222228

223-
blob.upload_blob(Path(local_path).read_bytes(), overwrite=True) # type: ignore
229+
extra_args = {}
230+
if self.content_type_method is not None:
231+
content_type, content_encoding = self.content_type_method(str(local_path))
232+
233+
if content_type is not None:
234+
extra_args["content_type"] = content_type
235+
if content_encoding is not None:
236+
extra_args["content_encoding"] = content_encoding
237+
238+
content_settings = ContentSettings(**extra_args)
239+
240+
blob.upload_blob(Path(local_path).read_bytes(), overwrite=True, content_settings=content_settings) # type: ignore
224241

225242
return cloud_path
226243

cloudpathlib/client.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import abc
2+
import mimetypes
23
import os
34
from pathlib import Path
45
from tempfile import TemporaryDirectory
@@ -25,7 +26,11 @@ class Client(abc.ABC, Generic[BoundedCloudPath]):
2526
_cloud_meta: CloudImplementation
2627
_default_client = None
2728

28-
def __init__(self, local_cache_dir: Optional[Union[str, os.PathLike]] = None):
29+
def __init__(
30+
self,
31+
local_cache_dir: Optional[Union[str, os.PathLike]] = None,
32+
content_type_method: Optional[Callable] = mimetypes.guess_type,
33+
):
2934
self._cloud_meta.validate_completeness()
3035
# setup caching and local versions of file and track if it is a tmp dir
3136
self._cache_tmp_dir = None
@@ -34,6 +39,7 @@ def __init__(self, local_cache_dir: Optional[Union[str, os.PathLike]] = None):
3439
local_cache_dir = self._cache_tmp_dir.name
3540

3641
self._local_cache_dir = Path(local_cache_dir)
42+
self.content_type_method = content_type_method
3743

3844
def __del__(self) -> None:
3945
# make sure temp is cleaned up if we created it

cloudpathlib/gs/gsclient.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from datetime import datetime
2+
import mimetypes
23
import os
34
from pathlib import Path, PurePosixPath
4-
from typing import Any, Dict, Iterable, Optional, TYPE_CHECKING, Tuple, Union
5+
from typing import Any, Callable, Dict, Iterable, Optional, TYPE_CHECKING, Tuple, Union
56

67
from ..client import Client, register_client_class
78
from ..cloudpath import implementation_registry
@@ -34,6 +35,7 @@ def __init__(
3435
project: Optional[str] = None,
3536
storage_client: Optional["StorageClient"] = None,
3637
local_cache_dir: Optional[Union[str, os.PathLike]] = None,
38+
content_type_method: Optional[Callable] = mimetypes.guess_type,
3739
):
3840
"""Class constructor. Sets up a [`Storage
3941
Client`](https://googleapis.dev/python/storage/latest/client.html).
@@ -65,6 +67,8 @@ def __init__(
6567
https://googleapis.dev/python/storage/latest/client.html).
6668
local_cache_dir (Optional[Union[str, os.PathLike]]): Path to directory to use as cache
6769
for downloaded files. If None, will use a temporary directory.
70+
content_type_method (Optional[Callable]): Function to call to guess media type (mimetype) when
71+
writing a file to the cloud. Defaults to `mimetypes.guess_type`. Must return a tuple (content type, content encoding).
6872
"""
6973
if application_credentials is None:
7074
application_credentials = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
@@ -81,7 +85,7 @@ def __init__(
8185
except DefaultCredentialsError:
8286
self.client = StorageClient.create_anonymous_client()
8387

84-
super().__init__(local_cache_dir=local_cache_dir)
88+
super().__init__(local_cache_dir=local_cache_dir, content_type_method=content_type_method)
8589

8690
def _get_metadata(self, cloud_path: GSPath) -> Optional[Dict[str, Any]]:
8791
bucket = self.client.bucket(cloud_path.bucket)
@@ -94,6 +98,7 @@ def _get_metadata(self, cloud_path: GSPath) -> Optional[Dict[str, Any]]:
9498
"etag": blob.etag,
9599
"size": blob.size,
96100
"updated": blob.updated,
101+
"content_type": blob.content_type,
97102
}
98103

99104
def _download_file(self, cloud_path: GSPath, local_path: Union[str, os.PathLike]) -> Path:
@@ -207,7 +212,12 @@ def _upload_file(self, local_path: Union[str, os.PathLike], cloud_path: GSPath)
207212
bucket = self.client.bucket(cloud_path.bucket)
208213
blob = bucket.blob(cloud_path.blob)
209214

210-
blob.upload_from_filename(str(local_path))
215+
extra_args = {}
216+
if self.content_type_method is not None:
217+
content_type, _ = self.content_type_method(str(local_path))
218+
extra_args["content_type"] = content_type
219+
220+
blob.upload_from_filename(str(local_path), **extra_args)
211221
return cloud_path
212222

213223

cloudpathlib/local/localclient.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import atexit
22
from hashlib import md5
3+
import mimetypes
34
import os
45
from pathlib import Path, PurePosixPath
56
import shutil
67
from tempfile import TemporaryDirectory
7-
from typing import Iterable, List, Optional, Tuple, Union
8+
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
89

910
from ..client import Client
1011
from .localpath import LocalPath
@@ -21,14 +22,15 @@ def __init__(
2122
*args,
2223
local_cache_dir: Optional[Union[str, os.PathLike]] = None,
2324
local_storage_dir: Optional[Union[str, os.PathLike]] = None,
25+
content_type_method: Optional[Callable] = mimetypes.guess_type,
2426
**kwargs,
2527
):
2628
# setup caching and local versions of file. use default temp dir if not provided
2729
if local_storage_dir is None:
2830
local_storage_dir = self.get_default_storage_dir()
2931
self._local_storage_dir = Path(local_storage_dir)
3032

31-
super().__init__(local_cache_dir=local_cache_dir)
33+
super().__init__(local_cache_dir=local_cache_dir, content_type_method=content_type_method)
3234

3335
@classmethod
3436
def get_default_storage_dir(cls) -> Path:
@@ -132,6 +134,17 @@ def _upload_file(
132134
shutil.copy(local_path, dst)
133135
return cloud_path
134136

137+
def _get_metadata(self, cloud_path: "LocalPath") -> Dict:
138+
# content_type is the only metadata we test currently
139+
if self.content_type_method is None:
140+
content_type_method = lambda x: (None, None)
141+
else:
142+
content_type_method = self.content_type_method
143+
144+
return {
145+
"content_type": content_type_method(str(self._cloud_path_to_local(cloud_path)))[0],
146+
}
147+
135148

136149
_temp_dirs_to_clean: List[TemporaryDirectory] = []
137150

cloudpathlib/s3/s3client.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1+
import mimetypes
12
import os
23
from pathlib import Path, PurePosixPath
3-
from typing import Any, Dict, Iterable, Optional, Tuple, Union
4+
from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union
45

56

67
from ..client import Client, register_client_class
@@ -35,6 +36,7 @@ def __init__(
3536
local_cache_dir: Optional[Union[str, os.PathLike]] = None,
3637
endpoint_url: Optional[str] = None,
3738
boto3_transfer_config: Optional["TransferConfig"] = None,
39+
content_type_method: Optional[Callable] = mimetypes.guess_type,
3840
):
3941
"""Class constructor. Sets up a boto3 [`Session`](
4042
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html).
@@ -63,6 +65,8 @@ def __init__(
6365
Parameterize it to access a customly deployed S3-compatible object store such as MinIO, Ceph or any other.
6466
boto3_transfer_config (Optional[dict]): Instantiated TransferConfig for managing s3 transfers.
6567
(https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.TransferConfig)
68+
content_type_method (Optional[Callable]): Function to call to guess media type (mimetype) when
69+
writing a file to the cloud. Defaults to `mimetypes.guess_type`. Must return a tuple (content type, content encoding).
6670
"""
6771
endpoint_url = endpoint_url or os.getenv("AWS_ENDPOINT_URL")
6872
if boto3_session is not None:
@@ -93,7 +97,7 @@ def __init__(
9397

9498
self.boto3_transfer_config = boto3_transfer_config
9599

96-
super().__init__(local_cache_dir=local_cache_dir)
100+
super().__init__(local_cache_dir=local_cache_dir, content_type_method=content_type_method)
97101

98102
def _get_metadata(self, cloud_path: S3Path) -> Dict[str, Any]:
99103
data = self.s3.ObjectSummary(cloud_path.bucket, cloud_path.key).get()
@@ -102,7 +106,7 @@ def _get_metadata(self, cloud_path: S3Path) -> Dict[str, Any]:
102106
"last_modified": data["LastModified"],
103107
"size": data["ContentLength"],
104108
"etag": data["ETag"],
105-
"mime": data["ContentType"],
109+
"content_type": data["ContentType"],
106110
"extra": data["Metadata"],
107111
}
108112

@@ -250,7 +254,16 @@ def _remove(self, cloud_path: S3Path) -> None:
250254
def _upload_file(self, local_path: Union[str, os.PathLike], cloud_path: S3Path) -> S3Path:
251255
obj = self.s3.Object(cloud_path.bucket, cloud_path.key)
252256

253-
obj.upload_file(str(local_path), Config=self.boto3_transfer_config)
257+
extra_args = {}
258+
259+
if self.content_type_method is not None:
260+
content_type, content_encoding = self.content_type_method(str(local_path))
261+
if content_type is not None:
262+
extra_args["ContentType"] = content_type
263+
if content_encoding is not None:
264+
extra_args["ContentEncoding"] = content_encoding
265+
266+
obj.upload_file(str(local_path), Config=self.boto3_transfer_config, ExtraArgs=extra_args)
254267
return cloud_path
255268

256269

docs/docs/other_client_settings.md

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Other `Client` settings
2+
3+
## Content type guessing (`content_type_method`)
4+
5+
All of the clients support passing a `content_type_method` when they are instantiated.
6+
This is a method that is used to guess the [MIME (media) type](https://en.wikipedia.org/wiki/Media_type)
7+
(often called the "content type") of the file and set that on the cloud provider.
8+
9+
By default, `content_type_method` uses the Python built-in
10+
[`guess_type`](https://docs.python.org/3/library/mimetypes.html#mimetypes.guess_type)
11+
to set this content type. This guesses based on the file extension, and may not always get the correct type.
12+
In these cases, you can set `content_type_method` to your own function that gets the proper type; for example, by
13+
reading the file content or by looking it up in a dictionary of filename-to-media-type mappings that you maintain.
14+
15+
If you set a custom method, it should follow the signature of `guess_type` and return a tuple of the form:
16+
`(content_type, content_encoding)`; for example, `("text/css", None)`.
17+
18+
If you set `content_type_method` to `None`, no content type is guessed and the default behavior of the cloud provider's SDK applies. This
19+
varies from provider to provider.
20+
21+
Here is an example of using a custom `content_type_method`.
22+
23+
```python
24+
import mimetypes
25+
from pathlib import Path
26+
27+
from cloudpathlib import S3Client, CloudPath
28+
29+
def my_content_type(path):
30+
# do lookup for content types I define; fallback to
31+
# guess_type for anything else
32+
return {
33+
".potato": ("application/potato", None),
34+
}.get(Path(path).suffix, mimetypes.guess_type(path))
35+
36+
37+
# create a client with my custom content type
38+
client = S3Client(content_type_method=my_content_type)
39+
40+
# To use this same method for every cloud path, set our client as the default.
41+
# This is optional, and you could use client.CloudPath to create paths instead.
42+
client.set_as_default_client()
43+
44+
# create a cloud path
45+
cp1 = CloudPath("s3://cloudpathlib-test-bucket/i_am_a.potato")
46+
cp1.write_text("hello")
47+
48+
# check content type with boto3
49+
print(client.s3.Object(cp1.bucket, cp1.key).content_type)
50+
#> application/potato
51+
```

docs/mkdocs.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ nav:
2020
- Authentication: "authentication.md"
2121
- Caching: "caching.ipynb"
2222
- AnyPath: "anypath-polymorphism.md"
23+
- Other Client settings: "other_client_settings.md"
2324
- Testing code that uses cloudpathlib: "testing_mocked_cloudpathlib.ipynb"
2425
- Integrations: "integrations.md"
2526
- Changelog: "changelog.md"

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,5 +60,5 @@ def load_requirements(path: Path):
6060
"Source Code": "https://github.com/drivendataorg/cloudpathlib",
6161
},
6262
url="https://github.com/drivendataorg/cloudpathlib",
63-
version="0.7.1",
63+
version="0.8.0",
6464
)

0 commit comments

Comments
 (0)