feat: add new aggregation checks and improve validation summary #33

Merged: 11 commits, May 17, 2025
6 changes: 4 additions & 2 deletions README.md
@@ -93,8 +93,10 @@ Alternatively, if you're using uv, a fast and modern Python package manager:
uv add sparkdq
```

The framework supports Python 3.10+ and is fully tested with PySpark 3.5.x. No additional Spark installation
is required when running inside environments like Databricks, AWS Glue, or EMR.
The framework supports Python 3.10+ and is fully tested with PySpark 3.5.x. If you're running SparkDQ outside
of managed platforms like Databricks, AWS Glue, or EMR, make sure Spark is installed and properly
configured on your system. You can install it via your package manager or by following the official
[Installation Guide](https://spark.apache.org/docs/latest/api/python/getting_started/install.html).
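
For a quick start outside those platforms, note that the PyPI distribution of PySpark bundles its own Spark runtime, so a plain pip install is usually sufficient (the version pin below is illustrative, and a compatible JDK must already be available):

```
pip install "pyspark>=3.5,<3.6"
```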

## Why SparkDQ?

22 changes: 16 additions & 6 deletions docs/source/built_in_checks/aggregate.rst
@@ -6,20 +6,30 @@ Aggregate Checks
:caption: Built-in Checks
:hidden:

checks/completeness/columns_are_complete_check
checks/schema/column_presence_check
checks/completeness/completeness_ratio_check
checks/count/count_between_check
checks/count/count_exact_check
checks/count/count_min_check
checks/count/count_max_check
checks/uniqueness/distinct_ratio_check
checks/schema/schema_check
checks/uniqueness/unique_ratio_check
checks/uniqueness/unique_rows_check

.. csv-table::
:header: "Check", "Description"
:widths: 20, 80

":ref:`count-min-check` ", "Ensures that the DataFrame contains at least a defined minimum number of rows."
":ref:`count-max-check` ", "Ensures that the DataFrame does not exceed a defined maximum number of rows."
":ref:`count-between-check` ", "Ensures that the number of rows in the dataset falls within a defined inclusive range."
":ref:`count-exact-check` ", "Ensures that the dataset contains exactly the specified number of rows."
":ref:`column-presence-check` ", "Verifies the existence of required columns in the DataFrame, independent of their data types."
":ref:`schema-check` ", "Ensures that a DataFrame matches an expected schema by verifying column names and data types, with optional strict enforcement against unexpected columns."
":ref:`columns-are-complete-check`", "Validates that a set of columns are fully populated. If any nulls are detected in the specified columns, the entire DataFrame is marked as invalid."
":ref:`column-presence-check` ", "Verifies the existence of required columns in the DataFrame, independent of their data types."
":ref:`completeness-ratio-check`", "Validates that the ratio of non-null values in a column meets a minimum threshold, enabling soft completeness validation and early detection of partially missing data."
":ref:`count-min-check` ", "Ensures that the DataFrame contains at least a defined minimum number of rows."
":ref:`count-max-check` ", "Ensures that the DataFrame does not exceed a defined maximum number of rows."
":ref:`count-between-check` ", "Ensures that the number of rows in the dataset falls within a defined inclusive range."
":ref:`count-exact-check` ", "Ensures that the dataset contains exactly the specified number of rows."
":ref:`distinct-ratio-check`", "Validates that the ratio of distinct non-null values in a column exceeds a defined threshold, helping to detect overly uniform or low-cardinality fields."
":ref:`schema-check` ", "Ensures that a DataFrame matches an expected schema by verifying column names and data types, with optional strict enforcement against unexpected columns."
":ref:`unique-ratio-check`", "Validates that a specified column maintains a minimum ratio of unique (non-null) values, helping to detect excessive duplication and assess data entropy or feature distinctiveness."
":ref:`unique-rows-check`", "Validates that all rows in a DataFrame are unique, either across all columns or a defined subset, helping to detect unintended duplication and enforce row-level uniqueness."
44 changes: 44 additions & 0 deletions docs/source/built_in_checks/checks/completeness/columns_are_complete_check.rst
@@ -0,0 +1,44 @@
.. _columns-are-complete-check:

Columns Are Complete
====================

**Check**: ``columns-are-complete-check``

**Purpose**:
Ensures that all specified columns are **fully populated** (i.e. contain no null values).
If null values are found in any of the specified columns, the **entire dataset is considered invalid**.
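
To illustrate the semantics in plain PySpark (a sketch only, not SparkDQ's internal implementation; ``df`` and the column names are assumed):

.. code-block:: python

    from pyspark.sql import functions as F

    required = ["trip_id", "pickup_time"]
    # One null count per required column; the dataset is valid only if
    # every count is zero (assumes a non-empty DataFrame).
    counts = df.select(
        [F.sum(F.col(c).isNull().cast("int")).alias(c) for c in required]
    ).first()
    dataset_is_valid = all(counts[c] == 0 for c in required)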

Python Configuration
--------------------

.. code-block:: python

from sparkdq.checks import ColumnsAreCompleteCheckConfig
from sparkdq.core import Severity

ColumnsAreCompleteCheckConfig(
check_id="required_fields_check",
columns=["trip_id", "pickup_time"],
severity=Severity.CRITICAL
)

Declarative Configuration
-------------------------

.. code-block:: yaml

- check: columns-are-complete-check
check-id: required_fields_check
columns:
- trip_id
- pickup_time
severity: critical

Typical Use Cases
-----------------

* ✅ Enforce critical business fields to be complete (e.g. primary keys, timestamps).
* ✅ Detect corruption or data loss caused by ETL errors or schema mismatches.
* ✅ Ensure key fields required for downstream processing or analytics are intact.
* ✅ Use as a hard fail condition to quarantine incomplete datasets early in the pipeline.
44 changes: 44 additions & 0 deletions docs/source/built_in_checks/checks/completeness/completeness_ratio_check.rst
@@ -0,0 +1,44 @@
.. _completeness-ratio-check:

Completeness Ratio
==================

**Check**: ``completeness-ratio-check``

**Purpose**:
Validates that the ratio of non-null values in a specified column meets or exceeds a defined threshold (``min_ratio``).
This allows for soft validation of column completeness without enforcing strict non-null constraints.
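
The ratio the check evaluates can be expressed in plain PySpark as follows (a sketch of the semantics, not SparkDQ's implementation; ``df`` is an assumed DataFrame):

.. code-block:: python

    from pyspark.sql import functions as F

    total = df.count()  # assumes a non-empty DataFrame
    non_null = df.filter(F.col("tpep_pickup_datetime").isNotNull()).count()
    passes = (non_null / total) >= 0.95  # min_ratio from the example below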

Python Configuration
--------------------

.. code-block:: python

from sparkdq.checks import CompletenessRatioCheckConfig
from sparkdq.core import Severity

CompletenessRatioCheckConfig(
check_id="pickup-time-mostly-complete",
column="tpep_pickup_datetime",
min_ratio=0.95,
severity=Severity.WARNING
)

Declarative Configuration
-------------------------

.. code-block:: yaml

- check: completeness-ratio-check
check-id: pickup-time-mostly-complete
column: tpep_pickup_datetime
min-ratio: 0.95
severity: warning

Typical Use Cases
-----------------

* ✅ Detect columns with unexpectedly high proportions of missing values.
* ✅ Enforce soft completeness thresholds on optional or partially-populated fields.
* ✅ Ensure minimum data quality for downstream analytics or feature generation.
* ✅ Provide early signals for upstream data loss or extraction failures.
44 changes: 44 additions & 0 deletions docs/source/built_in_checks/checks/uniqueness/distinct_ratio_check.rst
@@ -0,0 +1,44 @@
.. _distinct-ratio-check:

Distinct Ratio
==============

**Check**: ``distinct-ratio-check``

**Purpose**:
Validates that the ratio of distinct (non-null) values in a specified column exceeds a minimum threshold.
The dataset fails the check if the actual ratio of distinct values falls below the configured ``min_ratio``.
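
In plain PySpark, the evaluated ratio looks roughly like this (a sketch only; whether the denominator counts null rows is an assumption here, and ``df`` is an assumed DataFrame):

.. code-block:: python

    from pyspark.sql import functions as F

    total = df.count()  # assumed denominator: all rows, non-empty DataFrame
    # countDistinct ignores null values by design.
    distinct = df.select(F.countDistinct("passenger_count")).first()[0]
    passes = (distinct / total) >= 0.8  # min_ratio from the example below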

Python Configuration
--------------------

.. code-block:: python

from sparkdq.checks import DistinctRatioCheckConfig
from sparkdq.core import Severity

DistinctRatioCheckConfig(
check_id="passenger-count-uniqueness",
column="passenger_count",
min_ratio=0.8,
severity=Severity.CRITICAL
)

Declarative Configuration
-------------------------

.. code-block:: yaml

- check: distinct-ratio-check
check-id: passenger-count-uniqueness
column: passenger_count
min-ratio: 0.8
severity: critical

Typical Use Cases
-----------------

* ✅ Ensure that a column has a sufficiently high number of distinct (non-null) values.
* ✅ Detect columns that may have too much repetition or lack of variability.
* ✅ Identify potential issues such as constants, default-filled fields, or data entry errors.
* ✅ Enforce entropy or uniqueness expectations for features used in ML models or analytics.
54 changes: 54 additions & 0 deletions docs/source/built_in_checks/checks/uniqueness/unique_ratio_check.rst
@@ -0,0 +1,54 @@
.. _unique-ratio-check:

Unique Ratio
============

**Check**: ``unique-ratio-check``

**Purpose**:
Checks whether the ratio of unique (non-null) values in a specified column meets or exceeds a configured threshold.
Individual rows do not fail directly; instead, the entire dataset is considered invalid if the proportion of unique values is too low.

.. note::

* If the configured ``min-ratio`` is not met, the check fails.
* Null values are excluded from the uniqueness calculation.
* The total number of rows is used as the denominator (including nulls).
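
To make the semantics concrete, here is a plain PySpark sketch of the same computation. It is an illustration only, not SparkDQ's implementation, and it assumes that "unique" means non-null values occurring exactly once (the reading that distinguishes this check from the distinct-ratio check); ``df`` is an assumed DataFrame:

.. code-block:: python

    from pyspark.sql import functions as F

    total = df.count()  # denominator includes nulls, per the note above
    # Non-null values that occur exactly once (assumed reading of "unique").
    unique = (
        df.filter(F.col("VendorID").isNotNull())
        .groupBy("VendorID")
        .count()
        .filter(F.col("count") == 1)
        .count()
    )
    passes = (unique / total) >= 0.7  # min_ratio from the example below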

Python Configuration
--------------------

.. code-block:: python

from sparkdq.checks import UniqueRatioCheckConfig
from sparkdq.core import Severity

UniqueRatioCheckConfig(
check_id="vendor-id-uniqueness",
column="VendorID",
min_ratio=0.7,
severity=Severity.CRITICAL
)

Declarative Configuration
-------------------------

.. code-block:: yaml

- check: unique-ratio-check
check-id: vendor-id-uniqueness
column: VendorID
min-ratio: 0.7
severity: critical

Typical Use Cases
-----------------

* ✅ Ensure that a column intended to be mostly unique (e.g., IDs, hashes) behaves as expected.
* ✅ Detect issues where only a few values are repeated frequently, reducing feature usefulness.
* ✅ Prevent downstream errors due to low-entropy or non-discriminative values.
* ✅ Support feature quality checks in ML preprocessing pipelines.
48 changes: 48 additions & 0 deletions docs/source/built_in_checks/checks/uniqueness/unique_rows_check.rst
@@ -0,0 +1,48 @@
.. _unique-rows-check:

Unique Rows
===========

**Check**: ``unique-rows-check``

**Purpose**:
Ensures that all rows in the dataset are unique, either across all columns or a specified subset.
This check helps detect unintended data duplication and enforces row-level uniqueness constraints.

.. note::

If no subset is provided, the check considers all columns to determine uniqueness.
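
As a plain PySpark illustration of the semantics (not SparkDQ's internal implementation; ``df`` and the column names are assumed):

.. code-block:: python

    # The check fails as soon as dropping duplicates removes any row.
    subset = ["trip_id", "pickup_time"]  # omit to compare entire rows
    has_duplicates = df.count() != df.dropDuplicates(subset).count()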

Python Configuration
--------------------

.. code-block:: python

from sparkdq.checks import UniqueRowsCheckConfig
from sparkdq.core import Severity

UniqueRowsCheckConfig(
check_id="no_duplicate_rows",
subset_columns=["trip_id", "pickup_time"],
severity=Severity.CRITICAL
)

Declarative Configuration
-------------------------

.. code-block:: yaml

- check: unique-rows-check
check-id: no_duplicate_rows
subset-columns:
- trip_id
- pickup_time
severity: critical

Typical Use Cases
-----------------

* ✅ Enforce uniqueness on primary key–like columns (e.g., ``trip_id``, ``user_id``)
* ✅ Detect duplicated records caused by faulty joins, reprocessing, or ingestion errors
* ✅ Ensure referential integrity before merging datasets or writing to transactional stores
* ✅ Validate correctness of deduplication logic in preprocessing pipelines
6 changes: 6 additions & 0 deletions docs/source/examples.rst
@@ -0,0 +1,6 @@
Examples
========

The `examples <https://github.com/sparkdq-community/sparkdq/tree/main/examples>`_ folder contains practical
examples that demonstrate how to use the SparkDQ framework. These notebooks and scripts showcase common use
cases, from simple checks to more complex validation scenarios, and help you get started quickly.
2 changes: 1 addition & 1 deletion docs/source/getting_started/applying_validation.rst
@@ -46,7 +46,7 @@ Implementation Tip

.. code-block:: python

if result.summary().failed_records > 0:
if not result.summary().all_passed:
raise RuntimeError("Critical checks failed — stopping pipeline.")

Quarantine Strategy
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -49,3 +49,4 @@ SparkDQ — Data Quality Validation
:hidden:

api_docs
examples
10 changes: 10 additions & 0 deletions sparkdq/checks/__init__.py
@@ -1,9 +1,14 @@
from .aggregate.completeness_checks.columns_are_complete_check import ColumnsAreCompleteCheckConfig
from .aggregate.completeness_checks.completeness_ratio_check import CompletenessRatioCheckConfig
from .aggregate.count_checks.count_between_check import RowCountBetweenCheckConfig
from .aggregate.count_checks.count_exact_check import RowCountExactCheckConfig
from .aggregate.count_checks.count_max_check import RowCountMaxCheckConfig
from .aggregate.count_checks.count_min_check import RowCountMinCheckConfig
from .aggregate.schema_checks.column_presence_check import ColumnPresenceCheckConfig
from .aggregate.schema_checks.schema_check import SchemaCheckConfig
from .aggregate.uniqueness_checks.distinct_ratio_check import DistinctRatioCheckConfig
from .aggregate.uniqueness_checks.unique_ratio_check import UniqueRatioCheckConfig
from .aggregate.uniqueness_checks.unique_rows_check import UniqueRowsCheckConfig
from .row_level.columns_comparison_checks.column_less_than import ColumnLessThanCheckConfig
from .row_level.contained_checks.is_contained_in_check import IsContainedInCheckConfig
from .row_level.contained_checks.is_not_contained_in_check import IsNotContainedInCheckConfig
@@ -50,4 +55,9 @@
"StringMaxLengthCheckConfig",
"StringLengthBetweenCheckConfig",
"RegexMatchCheckConfig",
"UniqueRowsCheckConfig",
"UniqueRatioCheckConfig",
"CompletenessRatioCheckConfig",
"ColumnsAreCompleteCheckConfig",
"DistinctRatioCheckConfig",
]