Add from_pretrained telemetry (huggingface#1461)

anton-l · web-flow · commit bb2d7cacc038 · 2022-12-07T11:56:21.000+01:00
* Add from_pretrained usage logging

* Add classes

* add a telemetry notice

* macos
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
@@ -120,3 +120,24 @@ git pull
 ```
 
 Your Python environment will find the `main` version of 🤗 Diffusers on the next run.
+
+## Notice on telemetry logging
+
+Our library gathers telemetry information during `from_pretrained()` requests.
+This data includes the version of Diffusers and PyTorch/Flax, the requested model or pipeline class,
+and the path to a pretrained checkpoint if it is hosted on the Hub.
+This usage data helps us debug issues and prioritize new features.
+No private data, such as paths to models saved locally on disk, is ever collected.
+
+We understand that not everyone wants to share additional information, and we respect your privacy,
+so you can disable telemetry collection by setting the `DISABLE_TELEMETRY` environment variable from your terminal:
+
+On Linux/macOS:
+```bash
+export DISABLE_TELEMETRY=YES
+```
+
+On Windows (Command Prompt):
+```bash
+set DISABLE_TELEMETRY=YES
+```
diff --git a/src/diffusers/hub_utils.py b/src/diffusers/hub_utils.py
@@ -20,10 +20,11 @@
 from typing import Dict, Optional, Union
 from uuid import uuid4
 
+import requests
 from huggingface_hub import HfFolder, whoami
 
 from . import __version__
-from .utils import ENV_VARS_TRUE_VALUES, logging
+from .utils import ENV_VARS_TRUE_VALUES, HUGGINGFACE_CO_RESOLVE_ENDPOINT, logging
 from .utils.import_utils import (
     _flax_version,
     _jax_version,
@@ -45,7 +46,9 @@
 
 MODEL_CARD_TEMPLATE_PATH = Path(__file__).parent / "utils" / "model_card_template.md"
 SESSION_ID = uuid4().hex
+HF_HUB_OFFLINE = os.getenv("HF_HUB_OFFLINE", "").upper() in ENV_VARS_TRUE_VALUES
 DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", "").upper() in ENV_VARS_TRUE_VALUES
+HUGGINGFACE_CO_TELEMETRY = HUGGINGFACE_CO_RESOLVE_ENDPOINT + "/api/telemetry/"
 
 
 def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
@@ -72,6 +75,27 @@ def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
     return ua
 
 
+def send_telemetry(data: Dict, name: str):
+    """
+    Sends logs to the Hub telemetry endpoint. Does nothing when `DISABLE_TELEMETRY` or `HF_HUB_OFFLINE` is set.
+
+    Args:
+        data: the fields to track, e.g. {"example_name": "dreambooth"}
+        name: a unique name to differentiate the telemetry logs, e.g. "diffusers_examples" or "diffusers_notebooks"
+    """
+    if DISABLE_TELEMETRY or HF_HUB_OFFLINE:
+        return  # opted out or offline: make no request at all
+
+    headers = {"user-agent": http_user_agent(data)}
+    endpoint = HUGGINGFACE_CO_TELEMETRY + name
+    try:
+        r = requests.head(endpoint, headers=headers)
+        r.raise_for_status()
+    except Exception:
+        # We don't want to error in case of connection errors of any kind.
+        pass
+
+
 def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
     if token is None:
         token = HfFolder.get_token()
diff --git a/src/diffusers/modeling_flax_utils.py b/src/diffusers/modeling_flax_utils.py
@@ -28,6 +28,7 @@
 from requests import HTTPError
 
 from . import __version__, is_torch_available
+from .hub_utils import send_telemetry
 from .modeling_flax_pytorch_utils import convert_pytorch_state_dict_to_flax
 from .utils import (
     CONFIG_NAME,
@@ -339,6 +340,10 @@ def from_pretrained(
                     f"Error no file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME} found in directory "
                     f"{pretrained_path_with_subfolder}."
                 )
+            send_telemetry(
+                {"model_class": cls.__name__, "model_path": "local", "framework": "flax"},
+                name="diffusers_from_pretrained",
+            )
         else:
             try:
                 model_file = hf_hub_download(
@@ -354,6 +359,10 @@ def from_pretrained(
                     subfolder=subfolder,
                     revision=revision,
                 )
+                send_telemetry(
+                    {"model_class": cls.__name__, "model_path": "hub", "framework": "flax"},
+                    name="diffusers_from_pretrained",
+                )
 
             except RepositoryNotFoundError:
                 raise EnvironmentError(
diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py
@@ -26,6 +26,7 @@
 from requests import HTTPError
 
 from . import __version__
+from .hub_utils import send_telemetry
 from .utils import (
     CONFIG_NAME,
     DIFFUSERS_CACHE,
@@ -400,7 +401,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         model_file = None
         if is_safetensors_available():
             try:
-                model_file = _get_model_file(
+                model_file = cls._get_model_file(
                     pretrained_model_name_or_path,
                     weights_name=SAFETENSORS_WEIGHTS_NAME,
                     cache_dir=cache_dir,
@@ -416,7 +417,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
             except:
                 pass
         if model_file is None:
-            model_file = _get_model_file(
+            model_file = cls._get_model_file(
                 pretrained_model_name_or_path,
                 weights_name=WEIGHTS_NAME,
                 cache_dir=cache_dir,
@@ -531,6 +532,100 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
 
         return model
 
+    @classmethod
+    def _get_model_file(  # Resolve `weights_name` to a file path: local directory first, else Hub download/cache.
+        cls,
+        pretrained_model_name_or_path,
+        *,  # everything after this must be passed by keyword
+        weights_name,
+        subfolder,
+        cache_dir,
+        force_download,
+        proxies,
+        resume_download,
+        local_files_only,
+        use_auth_token,
+        user_agent,
+        revision,
+    ):
+        pretrained_model_name_or_path = str(pretrained_model_name_or_path)  # normalize to str for os.path/hub calls
+        if os.path.isdir(pretrained_model_name_or_path):
+            if os.path.isfile(os.path.join(pretrained_model_name_or_path, weights_name)):
+                # Weights file sits directly inside the given directory
+                model_file = os.path.join(pretrained_model_name_or_path, weights_name)
+            elif subfolder is not None and os.path.isfile(  # otherwise look under `subfolder`
+                os.path.join(pretrained_model_name_or_path, subfolder, weights_name)
+            ):
+                model_file = os.path.join(pretrained_model_name_or_path, subfolder, weights_name)
+            else:
+                raise EnvironmentError(
+                    f"Error no file named {weights_name} found in directory {pretrained_model_name_or_path}."
+                )
+            send_telemetry(  # only the literal "local" is reported, never the directory path
+                {"model_class": cls.__name__, "model_path": "local", "framework": "pytorch"},
+                name="diffusers_from_pretrained",
+            )
+            return model_file
+        else:
+            try:
+                # Load from URL or cache if already cached
+                model_file = hf_hub_download(
+                    pretrained_model_name_or_path,
+                    filename=weights_name,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    resume_download=resume_download,
+                    local_files_only=local_files_only,
+                    use_auth_token=use_auth_token,
+                    user_agent=user_agent,
+                    subfolder=subfolder,
+                    revision=revision,
+                )
+                send_telemetry(  # runs only after hf_hub_download returned without raising
+                    {"model_class": cls.__name__, "model_path": "hub", "framework": "pytorch"},
+                    name="diffusers_from_pretrained",
+                )
+                return model_file
+
+            except RepositoryNotFoundError:  # handlers below re-raise as EnvironmentError with a user-facing hint
+                raise EnvironmentError(
+                    f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier "
+                    "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a "
+                    "token having permission to this repo with `use_auth_token` or log in with `huggingface-cli "
+                    "login`."
+                )
+            except RevisionNotFoundError:
+                raise EnvironmentError(
+                    f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for "
+                    "this model name. Check the model page at "
+                    f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
+                )
+            except EntryNotFoundError:
+                raise EnvironmentError(
+                    f"{pretrained_model_name_or_path} does not appear to have a file named {weights_name}."
+                )
+            except HTTPError as err:
+                raise EnvironmentError(
+                    "There was a specific connection error when trying to load"
+                    f" {pretrained_model_name_or_path}:\n{err}"
+                )
+            except ValueError:
+                raise EnvironmentError(
+                    f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
+                    f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
+                    f" directory containing a file named {weights_name} or"
+                    " \nCheckout your internet connection or see how to run the library in"
+                    " offline mode at 'https://huggingface.co/docs/diffusers/installation#offline-mode'."
+                )
+            except EnvironmentError:  # keep last: alias of OSError, so it acts as the catch-all
+                raise EnvironmentError(
+                    f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from "
+                    "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
+                    f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
+                    f"containing a file named {weights_name}"
+                )
+
     @classmethod
     def _load_pretrained_model(
         cls,
diff --git a/src/diffusers/pipeline_flax_utils.py b/src/diffusers/pipeline_flax_utils.py
@@ -29,7 +29,7 @@
 from tqdm.auto import tqdm
 
 from .configuration_utils import ConfigMixin
-from .hub_utils import http_user_agent
+from .hub_utils import http_user_agent, send_telemetry
 from .modeling_flax_utils import FLAX_WEIGHTS_NAME, FlaxModelMixin
 from .schedulers.scheduling_utils_flax import SCHEDULER_CONFIG_NAME, FlaxSchedulerMixin
 from .utils import CONFIG_NAME, DIFFUSERS_CACHE, BaseOutput, is_transformers_available, logging
@@ -346,8 +346,16 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 ignore_patterns=ignore_patterns,
                 user_agent=user_agent,
             )
+            send_telemetry(
+                {"pipeline_class": requested_pipeline_class, "pipeline_path": "hub", "framework": "flax"},
+                name="diffusers_from_pretrained",
+            )
         else:
             cached_folder = pretrained_model_name_or_path
+            send_telemetry(
+                {"pipeline_class": cls.__name__, "pipeline_path": "local", "framework": "flax"},
+                name="diffusers_from_pretrained",
+            )
 
         config_dict = cls.load_config(cached_folder)
 
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
@@ -33,7 +33,7 @@
 
 from .configuration_utils import ConfigMixin
 from .dynamic_modules_utils import get_class_from_dynamic_module
-from .hub_utils import http_user_agent
+from .hub_utils import http_user_agent, send_telemetry
 from .modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT
 from .schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from .utils import (
@@ -477,7 +477,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
             else:
                 requested_pipeline_class = config_dict.get("_class_name", cls.__name__)
             user_agent = {"pipeline_class": requested_pipeline_class}
-            if custom_pipeline is not None:
+            if custom_pipeline is not None and not custom_pipeline.endswith(".py"):
                 user_agent["custom_pipeline"] = custom_pipeline
 
             user_agent = http_user_agent(user_agent)
@@ -504,8 +504,16 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 ignore_patterns=ignore_patterns,
                 user_agent=user_agent,
             )
+            send_telemetry(
+                {"pipeline_class": requested_pipeline_class, "pipeline_path": "hub", "framework": "pytorch"},
+                name="diffusers_from_pretrained",
+            )
         else:
             cached_folder = pretrained_model_name_or_path
+            send_telemetry(
+                {"pipeline_class": cls.__name__, "pipeline_path": "local", "framework": "pytorch"},
+                name="diffusers_from_pretrained",
+            )
 
         config_dict = cls.load_config(cached_folder)