@@ -1,5 +1,6 @@
 import io
 import json
+import os
 from typing import TYPE_CHECKING
 from urllib.parse import urlparse
 
@@ -10,7 +11,7 @@
 from feast.staging.storage_client import get_staging_client
 
 try:
-    from great_expectations.core import ExpectationSuite
+    from great_expectations.core import ExpectationConfiguration, ExpectationSuite
     from great_expectations.dataset import PandasDataset
 except ImportError:
     raise ImportError(
@@ -41,7 +42,28 @@ def __init__(self, name: str, pickled_code: bytes):
         self.pickled_code = pickled_code
 
 
-def create_validation_udf(name: str, expectations: ExpectationSuite) -> ValidationUDF:
+def drop_feature_table_prefix(
+    expectation_configuration: ExpectationConfiguration, prefix
+):
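+    # Feast's historical retrieval prefixes feature columns with the table
+    # name ("<table>__<feature>"), while ingestion sees raw column names, so
+    # the prefix must be stripped before the suite can be applied there.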
+    kwargs = expectation_configuration.kwargs
+    for arg_name in ("column", "column_A", "column_B"):
+        if arg_name not in kwargs:
+            continue
+
+        if kwargs[arg_name].startswith(prefix):
+            kwargs[arg_name] = kwargs[arg_name][len(prefix) :]
+
+
+def prepare_expectations(suite: ExpectationSuite, feature_table: "FeatureTable"):
+    for expectation in suite.expectations:
+        drop_feature_table_prefix(expectation, f"{feature_table.name}__")
+
+    return suite
+
+
+def create_validation_udf(
+    name: str, expectations: ExpectationSuite, feature_table: "FeatureTable",
+) -> ValidationUDF:
4567 """
4668 Wraps your expectations into Spark UDF.
4769
@@ -60,10 +82,25 @@ def create_validation_udf(name: str, expectations: ExpectationSuite) -> Validati
 
     :param name
     :param expectations: collection of expectations gathered on the training dataset
+    :param feature_table
     :return: ValidationUDF with serialized code
     """
 
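+    # Rewrite column references in-place so the suite pickled into the UDF
+    # matches the unprefixed columns of the ingested dataframe.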
+    expectations = prepare_expectations(expectations, feature_table)
+
     def udf(df: pd.DataFrame) -> pd.Series:
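+        # Imported inside the UDF so the dependency is resolved on the Spark
+        # executor when the pickled function is deserialized there.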
+        from datadog.dogstatsd import DogStatsd
+
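+        # Report to the statsd endpoint from the job environment when set;
+        # otherwise DogStatsd falls back to its defaults (localhost:8125).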
+        reporter = (
+            DogStatsd(
+                host=os.environ["STATSD_HOST"],
+                port=int(os.environ["STATSD_PORT"]),
+                telemetry_min_flush_interval=0,
+            )
+            if os.getenv("STATSD_HOST") and os.getenv("STATSD_PORT")
+            else DogStatsd()
+        )
+
         ds = PandasDataset.from_dataset(df)
         result = ds.validate(expectations, result_format="COMPLETE")
         valid_rows = pd.Series([True] * df.shape[0])
@@ -72,6 +109,32 @@ def udf(df: pd.DataFrame) -> pd.Series:
             if check.success:
                 continue
 
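+            # Column-level checks report how many rows failed; checks without
+            # row-level results count the whole batch as unexpected.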
+            unexpected_count = (
+                check.result["unexpected_count"]
+                if "unexpected_count" in check.result
+                else df.shape[0]
+            )
+
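+            # Derive a stable check name from the expectation type plus its
+            # scalar kwargs; result_format is bookkeeping, not identity.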
+            check_kwargs = check.expectation_config.kwargs
+            check_kwargs.pop("result_format", None)
+            check_name = "_".join(
+                [check.expectation_config.expectation_type]
+                + [
+                    str(v)
+                    for v in check_kwargs.values()
+                    if isinstance(v, (str, int, float))
+                ]
+            )
+
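+            # One counter per failed check, tagged so failures can be sliced
+            # by feature table and by check in the metrics backend.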
+            reporter.increment(
+                "feast_feature_validation_check_failed",
+                value=unexpected_count,
+                tags=[
+                    f"feature_table:{os.getenv('FEAST_INGESTION_FEATURE_TABLE', 'unknown')}",
+                    f"check:{check_name}",
+                ],
+            )
+
             if check.exception_info["raised_exception"]:
                 # ToDo: probably we should mark all rows as invalid
                 continue
@@ -106,7 +169,7 @@ def apply_validation(
     staging_client = get_staging_client(staging_scheme, client._config)
 
     pickled_code_fp = io.BytesIO(udf.pickled_code)
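+    # Namespacing staged pickles by feature table keeps UDFs for different
+    # tables from overwriting each other at a shared staging location.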
-    remote_path = f"{staging_location}/udfs/{udf.name}.pickle"
+    remote_path = f"{staging_location}/udfs/{feature_table.name}/{udf.name}.pickle"
     staging_client.upload_fileobj(
         pickled_code_fp, f"{udf.name}.pickle", remote_uri=urlparse(remote_path)
     )
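
For context, here is a minimal sketch of how the new create_validation_udf signature would be called. The client setup, table name, and expectation below are illustrative assumptions, not part of this change:

    from feast import Client
    from great_expectations.core import ExpectationConfiguration, ExpectationSuite

    client = Client()  # assumes an already configured Feast deployment
    feature_table = client.get_feature_table("driver_stats")  # hypothetical table

    # A suite gathered on a training dataset, where retrieved columns still
    # carry the "driver_stats__" prefix that prepare_expectations() strips.
    suite = ExpectationSuite(
        expectation_suite_name="driver_stats_checks",
        expectations=[
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_not_be_null",
                kwargs={"column": "driver_stats__trips_today"},
            )
        ],
    )

    udf = create_validation_udf("driver_stats_checks", suite, feature_table)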