|
12 | 12 |
|
13 | 13 | """
|
14 | 14 | import logging
|
15 |
| -from collections import namedtuple |
16 | 15 | import pprint
|
| 16 | +from collections import namedtuple |
17 | 17 |
|
18 | 18 | import numpy as np
|
19 |
| - |
| 19 | +import pyspark.sql.functions as F |
| 20 | +from pyspark import sql |
20 | 21 | from pyspark.sql.types import LongType, FloatType, IntegerType, StringType, DoubleType, BooleanType, ShortType, \
|
21 | 22 | TimestampType, DateType, DecimalType, ByteType, BinaryType, StructType, ArrayType, DataType, MapType
|
22 | 23 |
|
23 |
| -from pyspark import sql |
24 |
| -import pyspark.sql.functions as F |
25 |
| - |
26 |
| -from .utils import strip_margins, json_value_from_path |
27 |
| -from .spark_singleton import SparkSingleton |
28 | 24 | from .html_utils import HtmlUtils
|
| 25 | +from .spark_singleton import SparkSingleton |
| 26 | +from .utils import strip_margins, json_value_from_path |
29 | 27 |
|
30 | 28 |
|
31 | 29 | class DataAnalyzer:
|
@@ -148,12 +146,9 @@ def _addMeasureToSummary(self, measureName, summaryExpr="''", fieldExprs=None, d
|
148 | 146 | # add measures for fields
|
149 | 147 | exprs.extend(fieldExprs)
|
150 | 148 |
|
151 |
| - if dfSummary is not None: |
152 |
| - dfResult = dfSummary.union(dfData.selectExpr(*exprs).limit(rowLimit)) |
153 |
| - else: |
154 |
| - dfResult = dfData.selectExpr(*exprs).limit(rowLimit) |
| 149 | + dfMeasure = dfData.selectExpr(*exprs).limit(rowLimit) if rowLimit is not None else dfData.selectExpr(*exprs) |
155 | 150 |
|
156 |
| - return dfResult |
| 151 | + return dfSummary.union(dfMeasure) if dfSummary is not None else dfMeasure |
157 | 152 |
|
158 | 153 | @staticmethod
|
159 | 154 | def _is_numeric_type(dtype):
|
@@ -223,6 +218,112 @@ def _compute_pattern_match_clauses(self):
|
223 | 218 | result = stmts # "\n".join(stmts)
|
224 | 219 | return result
|
225 | 220 |
|
def generateTextFeatures(self, sourceDf):
    """ Compute text features for every analyzed column of a dataframe

    For each entry in ``self.columnsInfo`` a SQL ``named_struct`` expression is built that describes
    the string representation of the column value: its full printed length plus counts of regular
    expression matches (words, whitespace runs, digits, punctuation, ``@``, periods, leading
    ``http://`` / ``https://``, letters, lower / upper case letters and hex digits).

    All match counts are taken over the leftmost 4096 characters of the string representation only;
    ``print_len`` is the length of the complete string representation.

    NOTE(review): the counts are computed as ``size(regexp_extract_all(...))``. For NULL column values
    this may evaluate to -1 under default (non-ANSI) Spark settings - confirm that this is the intended
    input for the min / avg summaries.

    :param sourceDf: Source dataframe
    :return: Dataframe of text features, as produced by `_addMeasureToSummary` for the
             `text_features` measure with no row limit applied
    """
    # Regular expressions used for the match counts.
    # Backslashes are doubled on purpose: one level of escaping is lost when the pattern is
    # expanded inside the SQL string literal.
    wordPattern = r"\\b\\w+\\b"
    spacePattern = r"\\s+"
    digitPattern = r"\\d"
    punctuationPattern = r"[\\?\\.\\;\\,\\!\\{\\}\\[\\]\\(\\)\\>\\<]"
    atPattern = r"\\@"
    periodPattern = r"\\."
    httpPattern = r"^http[s]?\\:\\/\\/"
    alphaPattern = r"[a-zA-Z]"
    alphaUpperPattern = r"[A-Z]"
    alphaLowerPattern = r"[a-z]"
    hexPattern = r"[0-9a-fA-F]"

    def featureStruct(name):
        # SQL expression for the leftmost 4096 characters of the column's string representation
        text = f"left(string({name}), 4096)"

        return strip_margins(
            f"""named_struct(
                | 'print_len', length(string({name})),
                | 'word_count', size(regexp_extract_all({text}, '{wordPattern}',0)),
                | 'space_count', size(regexp_extract_all({text}, '{spacePattern}',0)),
                | 'digit_count', size(regexp_extract_all({text}, '{digitPattern}',0)),
                | 'punctuation_count', size(regexp_extract_all({text}, '{punctuationPattern}',0)),
                | 'at_count', size(regexp_extract_all({text}, '{atPattern}',0)),
                | 'period_count', size(regexp_extract_all({text}, '{periodPattern}',0)),
                | 'http_count', size(regexp_extract_all({text}, '{httpPattern}',0)),
                | 'alpha_count', size(regexp_extract_all({text}, '{alphaPattern}',0)),
                | 'alpha_lower_count', size(regexp_extract_all({text}, '{alphaLowerPattern}',0)),
                | 'alpha_upper_count', size(regexp_extract_all({text}, '{alphaUpperPattern}',0)),
                | 'hex_digit_count', size(regexp_extract_all({text}, '{hexPattern}',0))
                | )
                | as {name}""", marginChar="|")

    # one named struct expression per analyzed column, aliased to the column name
    structExprs = [featureStruct(colInfo.name) for colInfo in self.columnsInfo]

    return self._addMeasureToSummary(
        'text_features',
        fieldExprs=structExprs,
        dfData=sourceDf,
        dfSummary=None,
        rowLimit=None)
| 278 | + |
def _summarizeTextFeatures(self, textFeaturesDf):
    """
    Aggregate the per-row text features into a summary

    For each entry in ``self.columnsInfo`` the corresponding struct column of `textFeaturesDf` is
    reduced to a struct whose fields each hold an array of ``min``, ``max`` and ``avg`` (in that
    order) of the matching text feature. The struct is serialized with ``to_json`` and aliased to
    the column name.

    :param textFeaturesDf: Text features dataframe
    :return: dataframe of summary text features, as produced by `_addMeasureToSummary` for the
             `summary_text_features` measure with a row limit of 1
    """
    assert textFeaturesDf is not None, "textFeaturesDf must be specified"

    # TODO: use json syntax asin:print_len when migrating to 10.4 LTS as minimum version

    def summaryJson(cname):
        # JSON encoded struct of [min, max, avg] arrays for a single column's text features
        return strip_margins(
            f"""to_json(named_struct(
                | 'print_len', array(min({cname}.print_len), max({cname}.print_len), avg({cname}.print_len)),
                | 'word_count', array(min({cname}.word_count), max({cname}.word_count), avg({cname}.word_count)),
                | 'space_count',array(min({cname}.space_count), max({cname}.space_count), avg({cname}.space_count)),
                | 'digit_count', array(min({cname}.digit_count), max({cname}.digit_count), avg({cname}.digit_count)),
                | 'punctuation_count', array(min({cname}.punctuation_count), max({cname}.punctuation_count),
                | avg({cname}.punctuation_count)),
                | 'at_count', array(min({cname}.at_count), max({cname}.at_count), avg({cname}.at_count)),
                | 'period_count', array(min({cname}.period_count), max({cname}.period_count),
                | avg({cname}.period_count)),
                | 'http_count', array(min({cname}.http_count), max({cname}.http_count), avg({cname}.http_count)),
                | 'alpha_count', array(min({cname}.alpha_count), max({cname}.alpha_count), avg({cname}.alpha_count)),
                | 'alpha_lower_count', array(min({cname}.alpha_lower_count), max({cname}.alpha_lower_count),
                | avg({cname}.alpha_lower_count)),
                | 'alpha_upper_count', array(min({cname}.alpha_upper_count), max({cname}.alpha_upper_count),
                | avg({cname}.alpha_upper_count)),
                | 'hex_digit_count', array(min({cname}.hex_digit_count), max({cname}.hex_digit_count),
                | avg({cname}.hex_digit_count))
                | ))
                | as {cname}""", marginChar="|")

    # one summary expression per analyzed column
    summaryExprs = [summaryJson(colInfo.name) for colInfo in self.columnsInfo]

    return self._addMeasureToSummary(
        'summary_text_features',
        fieldExprs=summaryExprs,
        dfData=textFeaturesDf,
        dfSummary=None,
        rowLimit=1)
| 326 | + |
226 | 327 | def summarizeToDF(self):
|
227 | 328 | """ Generate summary analysis of data set as dataframe
|
228 | 329 |
|
@@ -368,6 +469,14 @@ def summarizeToDF(self):
|
368 | 469 | dfData=df_under_analysis,
|
369 | 470 | dfSummary=dfDataSummary)
|
370 | 471 |
|
| 472 | + logger.info("Analyzing text features") |
| 473 | + dfTextFeatures = self.generateTextFeatures(self._getExpandedSourceDf()) |
| 474 | + |
| 475 | + logger.info("Summarizing text features") |
| 476 | + dfTextFeaturesSummary = self._summarizeTextFeatures(dfTextFeatures) |
| 477 | + |
| 478 | + dfDataSummary = dfDataSummary.union(dfTextFeaturesSummary) |
| 479 | + |
371 | 480 | return dfDataSummary
|
372 | 481 |
|
373 | 482 | def summarize(self, suppressOutput=False):
|
|
0 commit comments