Add log_table API to log a dictionary or pandas DataFrame as a table artifact in an MLflow run (mlflow#8467)

sunishsheth2009 · web-flow · commit 375052561d25 · 2023-05-23T13:18:19.000-07:00
Signed-off-by: Sunish Sheth <sunishsheth2009@gmail.com>
diff --git a/mlflow/__init__.py b/mlflow/__init__.py
@@ -145,6 +145,7 @@
     log_dict,
     log_image,
     log_figure,
+    log_table,
     active_run,
     get_run,
     start_run,
@@ -200,6 +201,7 @@
     "log_text",
     "log_dict",
     "log_figure",
+    "log_table",
     "log_image",
     "active_run",
     "start_run",
diff --git a/mlflow/tracking/client.py b/mlflow/tracking/client.py
@@ -3,6 +3,7 @@
 and model versions. This is a lower level API than the :py:mod:`mlflow.tracking.fluent` module,
 and is exposed in the :py:mod:`mlflow.tracking` module.
 """
+import mlflow
 import contextlib
 import logging
 import json
@@ -41,8 +42,11 @@
     _validate_model_alias_name,
     _validate_model_version,
 )
+from mlflow.utils.mlflow_tags import MLFLOW_LOGGED_ARTIFACTS
+from mlflow.utils.annotations import experimental
 
 if TYPE_CHECKING:
+    import pandas  # pylint: disable=unused-import
     import matplotlib  # pylint: disable=unused-import
     import plotly  # pylint: disable=unused-import
     import numpy  # pylint: disable=unused-import
@@ -1386,6 +1390,102 @@ def _normalize_to_uint8(x):
             else:
                 raise TypeError("Unsupported image object type: '{}'".format(type(image)))
 
+    @experimental
+    def log_table(
+        self,
+        run_id: str,
+        data: Union[Dict[str, Any], "pandas.DataFrame"],
+        artifact_file: str,
+    ) -> None:
+        """
+        Log a table to MLflow Tracking as a JSON artifact. If ``artifact_file`` already exists
+        in the run, the new data is appended to the existing table.
+
+        :param run_id: String ID of the run.
+        :param data: Dictionary or pandas.DataFrame to log.
+        :param artifact_file: The run-relative artifact file path in posixpath format to which
+                                the table is saved (e.g. "dir/file.json").
+        :return: None
+        :raises MlflowException: with ``INVALID_PARAMETER_VALUE`` if ``data`` is neither a
+                                 dictionary nor a pandas.DataFrame.
+
+        .. test-code-block:: python
+            :caption: Dictionary Example
+
+            import mlflow
+            from mlflow import MlflowClient
+
+            table_dict = {
+                "inputs": ["What is MLflow?", "What is Databricks?"],
+                "outputs": ["MLflow is ...", "Databricks is ..."],
+                "toxicity": [0.0, 0.0],
+            }
+
+            client = MlflowClient()
+            run = client.create_run(experiment_id="0")
+            client.log_table(
+                run.info.run_id, data=table_dict, artifact_file="qabot_eval_results.json"
+            )
+
+        .. test-code-block:: python
+            :caption: Pandas DF Example
+
+            import mlflow
+            import pandas as pd
+            from mlflow import MlflowClient
+
+            table_dict = {
+                "inputs": ["What is MLflow?", "What is Databricks?"],
+                "outputs": ["MLflow is ...", "Databricks is ..."],
+                "toxicity": [0.0, 0.0],
+            }
+            df = pd.DataFrame.from_dict(table_dict)
+
+            client = MlflowClient()
+            run = client.create_run(experiment_id="0")
+            client.log_table(run.info.run_id, data=df, artifact_file="qabot_eval_results.json")
+
+        """
+        import pandas as pd
+
+        if not isinstance(data, (pd.DataFrame, dict)):
+            raise MlflowException.invalid_parameter_value(
+                "data must be a pandas.DataFrame or a dictionary"
+            )
+
+        data = pd.DataFrame(data)
+        # Look the artifact up by its normalized path: the listing below is taken from the
+        # normalized directory, so a spelling such as "./t.json" must still count as existing.
+        norm_path = posixpath.normpath(artifact_file)
+        artifact_dir = posixpath.dirname(norm_path) or None
+        artifacts = [f.path for f in self.list_artifacts(run_id, path=artifact_dir)]
+        if norm_path in artifacts:
+            with tempfile.TemporaryDirectory() as tmpdir:
+                downloaded_artifact_path = mlflow.artifacts.download_artifacts(
+                    run_id=run_id, artifact_path=norm_path, dst_path=tmpdir
+                )
+                existing_predictions = pd.read_json(downloaded_artifact_path, orient="split")
+            data = pd.concat([existing_predictions, data], ignore_index=True)
+            _logger.info(
+                "Appending new table to already existing artifact "
+                f"{artifact_file} for run {run_id}."
+            )
+        else:
+            _logger.info(f"Creating a new {artifact_file} for run {run_id}.")
+
+        with self._log_artifact_helper(run_id, artifact_file) as artifact_path:
+            data.to_json(artifact_path, orient="split", index=False)
+
+        # Record the table in the logged-artifacts tag (a JSON list) under its normalized path
+        run = self.get_run(run_id)
+        current_tag_value = json.loads(run.data.tags.get(MLFLOW_LOGGED_ARTIFACTS, "[]"))
+        tag_value = {"path": norm_path, "type": "table"}
+
+        # Only rewrite the tag when this table is not already listed
+        if tag_value not in current_tag_value:
+            current_tag_value.append(tag_value)
+            self.set_tag(run_id, MLFLOW_LOGGED_ARTIFACTS, json.dumps(current_tag_value))
+
     def _record_logged_model(self, run_id, mlflow_model):
         """
         Record logged model info with the tracking server.
diff --git a/mlflow/tracking/fluent.py b/mlflow/tracking/fluent.py
@@ -41,6 +41,7 @@
 from mlflow.utils.validation import _validate_run_id, _validate_experiment_id_type
 from mlflow.utils.time_utils import get_current_time_millis
 from mlflow.utils.databricks_utils import is_in_databricks_runtime
+from mlflow.utils.annotations import experimental
 
 
 if TYPE_CHECKING:
@@ -988,6 +989,56 @@ def log_image(image: Union["numpy.ndarray", "PIL.Image.Image"], artifact_file: s
     MlflowClient().log_image(run_id, image, artifact_file)
 
 
+@experimental
+def log_table(
+    data: Union[Dict[str, Any], "pandas.DataFrame"],
+    artifact_file: str,
+) -> None:
+    """
+    Log a table to MLflow Tracking as a JSON artifact. When ``artifact_file`` already exists
+    in the run, the new data is appended to that artifact.
+
+    :param data: The table to log, given as a dictionary or a pandas.DataFrame.
+    :param artifact_file: Run-relative path of the artifact, in posixpath format, that the
+                          table is written to (e.g. "dir/file.json").
+    :return: None
+
+    .. test-code-block:: python
+        :caption: Dictionary Example
+
+        import mlflow
+
+        table_dict = {
+            "inputs": ["What is MLflow?", "What is Databricks?"],
+            "outputs": ["MLflow is ...", "Databricks is ..."],
+            "toxicity": [0.0, 0.0],
+        }
+
+        with mlflow.start_run():
+            # Log the dictionary as a table
+            mlflow.log_table(data=table_dict, artifact_file="qabot_eval_results.json")
+
+    .. test-code-block:: python
+        :caption: Pandas DF Example
+
+        import mlflow
+        import pandas as pd
+
+        table_dict = {
+            "inputs": ["What is MLflow?", "What is Databricks?"],
+            "outputs": ["MLflow is ...", "Databricks is ..."],
+            "toxicity": [0.0, 0.0],
+        }
+        df = pd.DataFrame.from_dict(table_dict)
+
+        with mlflow.start_run():
+            # Log the df as a table
+            mlflow.log_table(data=df, artifact_file="qabot_eval_results.json")
+    """
+    active_run_id = _get_or_start_run().info.run_id
+    MlflowClient().log_table(run_id=active_run_id, data=data, artifact_file=artifact_file)
+
+
 def _record_logged_model(mlflow_model):
     run_id = _get_or_start_run().info.run_id
     MlflowClient()._record_logged_model(run_id, mlflow_model)
diff --git a/mlflow/utils/mlflow_tags.py b/mlflow/utils/mlflow_tags.py
@@ -26,6 +26,8 @@
 MLFLOW_DOCKER_IMAGE_ID = "mlflow.docker.image.id"
 # Indicates that an MLflow run was created by an autologging integration
 MLFLOW_AUTOLOGGING = "mlflow.autologging"
+# JSON list recording the type and path of each artifact logged to the run
+MLFLOW_LOGGED_ARTIFACTS = "mlflow.loggedArtifacts"
 
 MLFLOW_DATABRICKS_NOTEBOOK_ID = "mlflow.databricks.notebookID"
 MLFLOW_DATABRICKS_NOTEBOOK_PATH = "mlflow.databricks.notebookPath"
diff --git a/tests/tracking/test_tracking.py b/tests/tracking/test_tracking.py
@@ -1,3 +1,4 @@
+import ast
 import pathlib
 from collections import namedtuple
 import filecmp
@@ -847,3 +848,124 @@ def test_search_runs_multiple_experiments():
     assert len(MlflowClient().search_runs(experiment_ids, "metrics.m_1 > 0", ViewType.ALL)) == 1
     assert len(MlflowClient().search_runs(experiment_ids, "metrics.m_2 = 2", ViewType.ALL)) == 1
     assert len(MlflowClient().search_runs(experiment_ids, "metrics.m_3 < 4", ViewType.ALL)) == 1
+
+
+@pytest.mark.skipif(
+    "MLFLOW_SKINNY" in os.environ,
+    reason="Skinny client does not support the np or pandas dependencies",
+)
+def test_log_table():
+    """Check log_table input validation, creation, appending, and the logged-artifacts tag."""
+    import json
+    import pandas as pd
+
+    table_dict = {
+        "inputs": ["What is MLflow?", "What is Databricks?"],
+        "outputs": ["MLflow is ...", "Databricks is ..."],
+        "toxicity": [0.0, 0.0],
+    }
+    artifact_file = "qabot_eval_results.json"
+    TAG_NAME = "mlflow.loggedArtifacts"
+
+    # Anything other than a dictionary or DataFrame must be rejected
+    with pytest.raises(
+        MlflowException, match="data must be a pandas.DataFrame or a dictionary"
+    ) as e:
+        with mlflow.start_run() as run:
+            # Log the incorrect data format as a table
+            mlflow.log_table(data="incorrect-data-format", artifact_file=artifact_file)
+    assert e.value.error_code == ErrorCode.Name(INVALID_PARAMETER_VALUE)
+
+    with mlflow.start_run() as run:
+        # Log the dictionary as a table
+        mlflow.log_table(data=table_dict, artifact_file=artifact_file)
+        run_id = run.info.run_id
+
+    run = mlflow.get_run(run_id)
+    artifact_path = mlflow.artifacts.download_artifacts(run_id=run_id, artifact_path=artifact_file)
+    table_data = pd.read_json(artifact_path, orient="split")
+    assert table_data.shape == (2, 3)
+
+    # The tag value is written with json.dumps, so parse it as JSON
+    current_tag_value = json.loads(run.data.tags.get(TAG_NAME, "[]"))
+    assert {"path": artifact_file, "type": "table"} in current_tag_value
+    assert len(current_tag_value) == 1
+
+    table_df = pd.DataFrame.from_dict(table_dict)
+    with mlflow.start_run(run_id=run_id):
+        # Log the dataframe as a table
+        mlflow.log_table(data=table_df, artifact_file=artifact_file)
+
+    run = mlflow.get_run(run_id)
+    artifact_path = mlflow.artifacts.download_artifacts(run_id=run_id, artifact_path=artifact_file)
+    table_data = pd.read_json(artifact_path, orient="split")
+    # Logging to the same file again appends the new rows
+    assert table_data.shape == (4, 3)
+    # Re-logging the same file must not duplicate its tag entry
+    current_tag_value = json.loads(run.data.tags.get(TAG_NAME, "[]"))
+    assert {"path": artifact_file, "type": "table"} in current_tag_value
+    assert len(current_tag_value) == 1
+
+    # A different artifact file starts a fresh table and adds a second tag entry
+    artifact_file_new = "qabot_eval_results_new.json"
+    with mlflow.start_run(run_id=run_id):
+        # Log the dataframe as a table to new artifact file
+        mlflow.log_table(data=table_df, artifact_file=artifact_file_new)
+
+    run = mlflow.get_run(run_id)
+    artifact_path = mlflow.artifacts.download_artifacts(
+        run_id=run_id, artifact_path=artifact_file_new
+    )
+    table_data = pd.read_json(artifact_path, orient="split")
+    assert table_data.shape == (2, 3)
+    current_tag_value = json.loads(run.data.tags.get(TAG_NAME, "[]"))
+    assert {"path": artifact_file_new, "type": "table"} in current_tag_value
+    assert len(current_tag_value) == 2
+
+
+@pytest.mark.skipif(
+    "MLFLOW_SKINNY" in os.environ,
+    reason="Skinny client does not support the np or pandas dependencies",
+)
+def test_log_table_with_subdirectory():
+    """Check that log_table creates, appends to, and tags a table stored in a subdirectory."""
+    import json
+    import pandas as pd
+
+    table_dict = {
+        "inputs": ["What is MLflow?", "What is Databricks?"],
+        "outputs": ["MLflow is ...", "Databricks is ..."],
+        "toxicity": [0.0, 0.0],
+    }
+    artifact_file = "dir/foo.json"
+    TAG_NAME = "mlflow.loggedArtifacts"
+
+    with mlflow.start_run() as run:
+        # Log the dictionary as a table
+        mlflow.log_table(data=table_dict, artifact_file=artifact_file)
+        run_id = run.info.run_id
+
+    run = mlflow.get_run(run_id)
+    artifact_path = mlflow.artifacts.download_artifacts(run_id=run_id, artifact_path=artifact_file)
+    table_data = pd.read_json(artifact_path, orient="split")
+    assert table_data.shape == (2, 3)
+
+    # The tag value is written with json.dumps, so parse it as JSON
+    current_tag_value = json.loads(run.data.tags.get(TAG_NAME, "[]"))
+    assert {"path": artifact_file, "type": "table"} in current_tag_value
+    assert len(current_tag_value) == 1
+
+    table_df = pd.DataFrame.from_dict(table_dict)
+    with mlflow.start_run(run_id=run_id):
+        # Log the dataframe as a table
+        mlflow.log_table(data=table_df, artifact_file=artifact_file)
+
+    run = mlflow.get_run(run_id)
+    artifact_path = mlflow.artifacts.download_artifacts(run_id=run_id, artifact_path=artifact_file)
+    table_data = pd.read_json(artifact_path, orient="split")
+    # Logging to the same file again appends the new rows
+    assert table_data.shape == (4, 3)
+    # Re-logging the same file must not duplicate its tag entry
+    current_tag_value = json.loads(run.data.tags.get(TAG_NAME, "[]"))
+    assert {"path": artifact_file, "type": "table"} in current_tag_value
+    assert len(current_tag_value) == 1