41 | 41 | is_magic_lib_available = False
42 | 42 |
43 | 43 |
| 44 | +# Base mapping for most SQL engines (Hive, Impala, SparkSQL)
| 45 | +SQL_TYPE_BASE_MAP = {
| 46 | +  # signed ints
| 47 | +  "Int8": "TINYINT",
| 48 | +  "Int16": "SMALLINT",
| 49 | +  "Int32": "INT",
| 50 | +  "Int64": "BIGINT",
| 51 | +  # unsigned ints: mapped to the same-size signed type by default (Hive/Impala/SparkSQL)
| 52 | +  "UInt8": "TINYINT",
| 53 | +  "UInt16": "SMALLINT",
| 54 | +  "UInt32": "INT",
| 55 | +  "UInt64": "BIGINT",
| 56 | +  # floats & decimal
| 57 | +  "Float32": "FLOAT",
| 58 | +  "Float64": "DOUBLE",
| 59 | +  "Decimal": "DECIMAL",  # Hive/Impala/SparkSQL use DECIMAL(precision, scale)
| 60 | +  # boolean, string, binary
| 61 | +  "Boolean": "BOOLEAN",
| 62 | +  "Utf8": "STRING",  # STRING as the default; VARCHAR/CHAR remain available per engine
| 63 | +  "String": "STRING",
| 64 | +  "Categorical": "STRING",
| 65 | +  "Enum": "STRING",
| 66 | +  "Binary": "BINARY",
| 67 | +  # temporal
| 68 | +  "Date": "DATE",
| 69 | +  "Time": "TIMESTAMP",  # Hive/Impala/SparkSQL have no standalone TIME type
| 70 | +  "Datetime": "TIMESTAMP",
| 71 | +  "Duration": "INTERVAL DAY TO SECOND",
| 72 | +  # nested & other
| 73 | +  "Array": "ARRAY",
| 74 | +  "List": "ARRAY",
| 75 | +  "Struct": "STRUCT",
| 76 | +  "Object": "STRING",
| 77 | +  "Null": "STRING",  # no SQL NULL type; use STRING or handle as a special case
| 78 | +  "Unknown": "STRING",
| 79 | +}
| 80 | +
| 81 | +# Per-dialect overrides for the few differences
| 82 | +SQL_TYPE_DIALECT_OVERRIDES = {
| 83 | +  "hive": {},
| 84 | +  "impala": {},
| 85 | +  "sparksql": {},
| 86 | +  "trino": {
| 87 | +    "Int32": "INTEGER",
| 88 | +    "UInt32": "INTEGER",
| 89 | +    "Utf8": "VARCHAR",
| 90 | +    "String": "VARCHAR",
| 91 | +    "Binary": "VARBINARY",
| 92 | +    "Float32": "REAL",
| 93 | +    "Struct": "ROW",
| 94 | +    "Object": "JSON",
| 95 | +    "Duration": "INTERVAL DAY TO SECOND",  # explicit SQL syntax
| 96 | +  },
| 97 | +  "phoenix": {
| 98 | +    **{f"UInt{b}": f"UNSIGNED_{t}" for b, t in [(8, "TINYINT"), (16, "SMALLINT"), (32, "INT"), (64, "LONG")]},
| 99 | +    "Utf8": "VARCHAR",
| 100 | +    "String": "VARCHAR",
| 101 | +    "Binary": "VARBINARY",
| 102 | +    "Duration": "STRING",  # Phoenix treats durations as strings
| 103 | +    "Struct": "STRING",  # no native STRUCT type
| 104 | +    "Object": "VARCHAR",
| 105 | +    "Time": "TIME",  # Phoenix has its own TIME type
| 106 | +    "Decimal": "DECIMAL",  # up to precision 38
| 107 | +  },
| 108 | +}
| 109 | +
| 110 | +
44 | 111 | def local_file_upload(upload_file, username: str) -> Dict[str, str]:
45 | 112 |   """Uploads a local file to a temporary directory with a unique filename.
46 | 113 |
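
A note on the "phoenix" override block above: the inlined dict comprehension is compact but easy to misread. It expands to exactly four entries, as this standalone snippet shows:

    # Equivalent literal form of the comprehension in SQL_TYPE_DIALECT_OVERRIDES["phoenix"]
    expanded = {f"UInt{b}": f"UNSIGNED_{t}" for b, t in [(8, "TINYINT"), (16, "SMALLINT"), (32, "INT"), (64, "LONG")]}
    assert expanded == {
        "UInt8": "UNSIGNED_TINYINT",
        "UInt16": "UNSIGNED_SMALLINT",
        "UInt32": "UNSIGNED_INT",
        "UInt64": "UNSIGNED_LONG",
    }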
@@ -95,8 +162,7 @@ def local_file_upload(upload_file, username: str) -> Dict[str, str]:
95 | 162 |
96 | 163 |
97 | 164 | def guess_file_metadata(file_path: str, import_type: str, fs=None) -> Dict[str, Any]:
98 | | -  """
99 | | -  Guess the metadata of a file based on its content or extension.
| 165 | +  """Guess the metadata of a file based on its content or extension.
100 | 166 |
101 | 167 |   Args:
102 | 168 |     file_path: Path to the file to analyze
@@ -177,8 +243,7 @@ def preview_file(
177 | 243 |   fs=None,
178 | 244 |   preview_rows: int = 50,
179 | 245 | ) -> Dict[str, Any]:
180 | | -  """
181 | | -  Generate a preview of a file's content with column type mapping.
| 246 | +  """Generate a preview of a file's content with column type mapping.
182 | 247 |
183 | 248 |   This method reads a file and returns a preview of its contents, along with
184 | 249 |   column information and metadata for creating tables or further processing.
@@ -268,8 +333,7 @@ def preview_file(
268 | 333 |
269 | 334 |
270 | 335 | def _detect_file_type(file_sample: bytes) -> str:
271 | | -  """
272 | | -  Detect the file type based on its content.
| 336 | +  """Detect the file type based on its content.
273 | 337 |
274 | 338 |   Args:
275 | 339 |     file_sample: Binary sample of the file content
@@ -308,8 +372,7 @@ def _detect_file_type(file_sample: bytes) -> str:
308 | 372 |
309 | 373 |
310 | 374 | def _get_excel_metadata(fh: BinaryIO) -> Dict[str, Any]:
311 | | -  """
312 | | -  Extract metadata for Excel files (.xlsx, .xls).
| 375 | +  """Extract metadata for Excel files (.xlsx, .xls).
313 | 376 |
314 | 377 |   Args:
315 | 378 |     fh: File handle for the Excel file
@@ -371,8 +434,7 @@ def _get_sheet_names_xlsx(fh: BinaryIO) -> List[str]:
371 | 434 |
372 | 435 |
373 | 436 | def _get_delimited_metadata(file_sample: Union[bytes, str], file_type: str) -> Dict[str, Any]:
374 | | -  """
375 | | -  Extract metadata for delimited files (CSV, TSV, etc.).
| 437 | +  """Extract metadata for delimited files (CSV, TSV, etc.).
376 | 438 |
377 | 439 |   Args:
378 | 440 |     file_sample: Binary or string sample of the file content
|
543 | 605 |
|
544 | 606 |
|
545 | 607 | def guess_file_header(file_path: str, file_type: str, import_type: str, sheet_name: Optional[str] = None, fs=None) -> bool:
|
546 |
| - """ |
547 |
| - Guess whether a file has a header row. |
| 608 | + """Guess whether a file has a header row. |
548 | 609 |
|
549 | 610 | This function analyzes a file to determine if it contains a header row based on the
|
550 | 611 | content pattern. It works for both Excel files and delimited text files (CSV, TSV, etc.).
|
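
A usage sketch under stated assumptions (the "local", "csv", and "excel" values and the sheet name are illustrative, not confirmed by this hunk):

    # Hypothetical: delimited file on local disk
    has_header = guess_file_header("/tmp/example.csv", file_type="csv", import_type="local")
    # Hypothetical: Excel files can additionally name the sheet to inspect
    has_header = guess_file_header("/tmp/example.xlsx", file_type="excel", import_type="local", sheet_name="Sheet1")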
@@ -633,98 +694,43 @@ def guess_file_header(file_path: str, file_type: str, import_type: str, sheet_na
633 | 694 |     fh.close()
634 | 695 |
635 | 696 |
636 | | -def _map_polars_dtype_to_sql_type(dialect: str, polars_type: str) -> str:
637 | | -  """
638 | | -  Map a Polars dtype to the corresponding SQL type for a given dialect.
| 697 | +def get_sql_type_mapping(dialect: str) -> Dict[str, str]:
| 698 | +  """Get all type mappings from Polars dtypes to SQL types for a given SQL dialect.
639 | 699 |
640 | | -  Supports all Polars dtypes as listed in the Polars docs:
641 | | -  Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64,
642 | | -  Float32, Float64, Decimal, Boolean, Utf8/String, Categorical, Enum,
643 | | -  Binary, Date, Time, Datetime, Duration, Array, List, Struct,
644 | | -  Object, Null, Unknown.
| 700 | +  This function returns a dictionary mapping all Polars data types to their
| 701 | +  corresponding SQL types for the given dialect.
645 | 702 |
646 | 703 |   Args:
647 | 704 |     dialect: One of "hive", "impala", "trino", "phoenix", "sparksql".
648 | | -    polars_type: Polars dtype name as string.
649 | 705 |
650 | 706 |   Returns:
651 | | -    A string representing the SQL type.
| 707 | +    A dict mapping Polars dtype names to SQL type names.
652 | 708 |
653 | 709 |   Raises:
654 | | -    ValueError: If the dialect or polars_type is not supported.
| 710 | +    ValueError: If the dialect is not supported.
655 | 711 |   """
656 | | -  # Base mapping for most engines (Hive, Impala, SparkSQL)
657 | | -  base_map = {
658 | | -    # signed ints
659 | | -    "Int8": "TINYINT",
660 | | -    "Int16": "SMALLINT",
661 | | -    "Int32": "INT",
662 | | -    "Int64": "BIGINT",
663 | | -    # unsigned ints: same size signed by default (Hive/Impala/SparkSQL)
664 | | -    "UInt8": "TINYINT",
665 | | -    "UInt16": "SMALLINT",
666 | | -    "UInt32": "INT",
667 | | -    "UInt64": "BIGINT",
668 | | -    # floats & decimal
669 | | -    "Float32": "FLOAT",
670 | | -    "Float64": "DOUBLE",
671 | | -    "Decimal": "DECIMAL",  # Hive/Impala/SparkSQL use DECIMAL(precision,scale)
672 | | -    # boolean, string, binary
673 | | -    "Boolean": "BOOLEAN",
674 | | -    "Utf8": "STRING",  # STRING covers Hive/VARCHAR/CHAR for default
675 | | -    "String": "STRING",
676 | | -    "Categorical": "STRING",
677 | | -    "Enum": "STRING",
678 | | -    "Binary": "BINARY",
679 | | -    # temporal
680 | | -    "Date": "DATE",
681 | | -    "Time": "TIMESTAMP",  # Hive/Impala/SparkSQL have no pure TIME type
682 | | -    "Datetime": "TIMESTAMP",
683 | | -    "Duration": "INTERVAL DAY TO SECOND",
684 | | -    # nested & other
685 | | -    "Array": "ARRAY",
686 | | -    "List": "ARRAY",
687 | | -    "Struct": "STRUCT",
688 | | -    "Object": "STRING",
689 | | -    "Null": "STRING",  # no SQL NULL type—use STRING or handle as special case
690 | | -    "Unknown": "STRING",
691 | | -  }
692 | | -
693 | | -  # Per-dialect overrides for the few differences
694 | | -  overrides = {
695 | | -    "hive": {},
696 | | -    "impala": {},
697 | | -    "sparksql": {},
698 | | -    "trino": {
699 | | -      "Int32": "INTEGER",
700 | | -      "UInt32": "INTEGER",
701 | | -      "Utf8": "VARCHAR",
702 | | -      "String": "VARCHAR",
703 | | -      "Binary": "VARBINARY",
704 | | -      "Float32": "REAL",
705 | | -      "Struct": "ROW",
706 | | -      "Object": "JSON",
707 | | -      "Duration": "INTERVAL DAY TO SECOND",  # explicit SQL syntax
708 | | -    },
709 | | -    "phoenix": {
710 | | -      **{f"UInt{b}": f"UNSIGNED_{t}" for b, t in [(8, "TINYINT"), (16, "SMALLINT"), (32, "INT"), (64, "LONG")]},
711 | | -      "Utf8": "VARCHAR",
712 | | -      "String": "VARCHAR",
713 | | -      "Binary": "VARBINARY",
714 | | -      "Duration": "STRING",  # Phoenix treats durations as strings
715 | | -      "Struct": "STRING",  # no native STRUCT type
716 | | -      "Object": "VARCHAR",
717 | | -      "Time": "TIME",  # Phoenix has its own TIME type
718 | | -      "Decimal": "DECIMAL",  # up to precision 38
719 | | -    },
720 | | -  }
721 | | -
722 | 712 |   dl = dialect.lower()
723 | | -  if dl not in overrides:
| 713 | +  if dl not in SQL_TYPE_DIALECT_OVERRIDES:
724 | 714 |     raise ValueError(f"Unsupported dialect: {dialect}")
725 | 715 |
726 | | -  # Merge base_map and overrides[dl] into a new dict, giving precedence to any overlapping keys in overrides[dl]
| 716 | +  # Merge the base map with the dialect overrides, giving precedence to overlapping keys from the overrides
727 | | -  mapping = {**base_map, **overrides[dl]}
| 717 | +  return {**SQL_TYPE_BASE_MAP, **SQL_TYPE_DIALECT_OVERRIDES[dl]}
| 718 | +
| 719 | +
| 720 | +def _map_polars_dtype_to_sql_type(dialect: str, polars_type: str) -> str:
| 721 | +  """Map a Polars dtype to the corresponding SQL type for a given dialect.
| 722 | +
| 723 | +  Args:
| 724 | +    dialect: One of "hive", "impala", "trino", "phoenix", "sparksql".
| 725 | +    polars_type: Polars dtype name as string.
| 726 | +
| 727 | +  Returns:
| 728 | +    A string representing the SQL type.
| 729 | +
| 730 | +  Raises:
| 731 | +    ValueError: If the dialect or polars_type is not supported.
| 732 | +  """
| 733 | +  mapping = get_sql_type_mapping(dialect)
728 | 734 |
729 | 735 |   if polars_type not in mapping:
730 | 736 |     raise ValueError(f"No mapping for Polars dtype {polars_type} in dialect {dialect}")
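
Both functions above are fully visible in this hunk, so their behavior can be demonstrated directly. The merge gives dialect overrides precedence over the base map, and unknown dialects or dtypes raise ValueError:

    mapping = get_sql_type_mapping("trino")
    assert mapping["Utf8"] == "VARCHAR"  # overridden for Trino
    assert mapping["Int64"] == "BIGINT"  # inherited from SQL_TYPE_BASE_MAP

    assert _map_polars_dtype_to_sql_type("hive", "Utf8") == "STRING"
    assert _map_polars_dtype_to_sql_type("phoenix", "UInt64") == "UNSIGNED_LONG"

    # Raises ValueError("Unsupported dialect: mysql"):
    # _map_polars_dtype_to_sql_type("mysql", "Utf8")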