@@ -7,7 +7,7 @@

 """Parse DDL statements"""

-import re
+import re, textwrap, json
 from collections import OrderedDict
 from enum import IntEnum
@@ -74,16 +74,19 @@ def get_name(self, name_case=DdlParseBase.NAME_CASE.original):
 class DdlParseColumn(DdlParseTableColumnBase):
     """Column define info"""

-    def __init__(self, name, data_type_array, constraint=None, source_database=None):
+    def __init__(self, name, data_type_array, array_brackets=None, constraint=None, source_database=None):
         """
         :param data_type_array[]: Column data type ['data type name'] or ['data type name', '(length)'] or ['data type name', '(precision, scale)']
+        :param array_brackets: Column array brackets string '[]' or '[][]...'
         :param constraint: Column constraint string
         :param source_database: enum DdlParse.DATABASE
         """
+
         super().__init__(source_database)
         self._name = name
         self._set_data_type(data_type_array)
         self.constraint = constraint
+        self._array_dimensional = 0 if array_brackets is None else array_brackets.count('[]')

     @property
     def data_type(self):
@@ -109,11 +112,12 @@ def _set_data_type(self, data_type_array):
         if len(data_type_array) < 2:
             return

-        matches = re.findall(r"(\d+)\s*,*\s*(\d*)", data_type_array[1])
+        matches = re.findall(r"(\d+)\s*,*\s*(\d*)", data_type_array[-1])
         if len(matches) > 0:
             self._length = int(matches[0][0])
             self._scale = None if len(matches[0]) < 2 or matches[0][1] == "" or int(matches[0][1]) == 0 else int(matches[0][1])
-        else:
+
+        if re.search(r"^\D+", data_type_array[1]):
             self._data_type += " {}".format(data_type_array[1])
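For orientation: with the VARYING keyword added to the grammar further down, a two-word type now tokenizes with its length token at the end, so the digit match has to come from the last element rather than the second. A sketch of the two shapes, assuming the grammar groups tokens this way:

    # hypothetical token arrays handed to _set_data_type()
    ['NUMERIC', '(10, 2)']            # [1] and [-1] are the same token
    ['CHARACTER', 'VARYING', '(20)']  # length only at [-1]; the non-numeric
                                      # [1] is folded into the type name

Hence data_type_array[-1] for the digits, plus the separate ^\D+ check that appends a keyword like VARYING to the stored type, where the old else: treated the two cases as mutually exclusive.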
@@ -136,6 +140,11 @@ def constraint(self, constraint):
         self._pk = False if self._constraint is None or not re.search("PRIMARY KEY", self._constraint) else True
         self._unique = False if self._constraint is None or not re.search("UNIQUE", self._constraint) else True

+    @property
+    def array_dimensional(self):
+        """array dimensional number"""
+        return self._array_dimensional
+
     @property
     def not_null(self):
         return self._not_null
@@ -166,7 +175,7 @@ def bigquery_data_type(self):

     # BigQuery data type = {source_database: [data type, ...], ...}
     BQ_DATA_TYPE_DIC = OrderedDict()
-    BQ_DATA_TYPE_DIC["STRING"] = {None: [re.compile(r"(CHAR|TEXT|CLOB)")]}
+    BQ_DATA_TYPE_DIC["STRING"] = {None: [re.compile(r"(CHAR|TEXT|CLOB|JSON)")]}
     BQ_DATA_TYPE_DIC["INTEGER"] = {None: [re.compile(r"INT|SERIAL|YEAR")]}
     BQ_DATA_TYPE_DIC["FLOAT"] = {None: [re.compile(r"(FLOAT|DOUBLE)"), "REAL", "MONEY"]}
     BQ_DATA_TYPE_DIC["DATETIME"] = {
@@ -231,12 +240,48 @@ def bigquery_standard_data_type(self):
     def bigquery_mode(self):
         """Get BigQuery constraint"""

-        return "REQUIRED" if self.not_null else "NULLABLE"
+        if self.array_dimensional > 0:
+            return "REPEATED"
+        elif self.not_null:
+            return "REQUIRED"
+        else:
+            return "NULLABLE"

     def to_bigquery_field(self, name_case=DdlParseBase.NAME_CASE.original):
         """Generate BigQuery JSON field define"""

-        return '{{"name": "{}", "type": "{}", "mode": "{}"}}'.format(self.get_name(name_case), self.bigquery_data_type, self.bigquery_mode)
+        col_name = self.get_name(name_case)
+        mode = self.bigquery_mode
+
+        if self.array_dimensional <= 1:
+            # no or one dimensional array data type
+            type = self.bigquery_legacy_data_type
+
+        else:
+            # multiple dimensional array data type
+            type = "RECORD"
+
+            fields = OrderedDict()
+            fields_cur = fields
+
+            for i in range(1, self.array_dimensional):
+                is_last = True if i == self.array_dimensional - 1 else False
+
+                fields_cur['fields'] = [OrderedDict()]
+                fields_cur = fields_cur['fields'][0]
+
+                fields_cur['name'] = "dimension_{}".format(i)
+                fields_cur['type'] = self.bigquery_legacy_data_type if is_last else "RECORD"
+                fields_cur['mode'] = self.bigquery_mode if is_last else "REPEATED"
+
+        col = OrderedDict()
+        col['name'] = col_name
+        col['type'] = type
+        col['mode'] = mode
+        if self.array_dimensional > 1:
+            col['fields'] = fields['fields']
+
+        return json.dumps(col)


 class DdlParseColumnDict(OrderedDict, DdlParseBase):
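To make the new field layout concrete: for a hypothetical column declared as Col_X INTEGER[][] (two dimensions), the loop above nests one REPEATED RECORD per dimension beyond the first, with the innermost level carrying the real type, so to_bigquery_field would emit roughly:

    {"name": "Col_X", "type": "RECORD", "mode": "REPEATED",
     "fields": [{"name": "dimension_1", "type": "INTEGER", "mode": "REPEATED"}]}

Only the dimension_1 name comes from the code; the column name and the INTEGER mapping are illustrative.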
@@ -258,11 +303,11 @@ def __getitem__(self, key):
     def __setitem__(self, key, value):
         super().__setitem__(key.lower(), value)

-    def append(self, column_name, data_type_array=None, constraint=None, source_database=None):
+    def append(self, column_name, data_type_array=None, array_brackets=None, constraint=None, source_database=None):
         if source_database is None:
             source_database = self.source_database

-        column = DdlParseColumn(column_name, data_type_array, constraint, source_database)
+        column = DdlParseColumn(column_name, data_type_array, array_brackets, constraint, source_database)
         self.__setitem__(column_name, column)
         return column
@@ -366,18 +411,42 @@ def to_bigquery_ddl(self, name_case=DdlParseBase.NAME_CASE.original):
         else:
             dataset = self.schema

-        cols_def = []
+        cols_defs = []
         for col in self.columns.values():
-            cols_def.append("{name} {type}{not_null}".format(
-                name=col.get_name(name_case),
-                type=col.bigquery_standard_data_type,
-                not_null=" NOT NULL" if col.not_null else "",
+            col_name = col.get_name(name_case)
+
+            if col.array_dimensional < 1:
+                # no array data type
+                type = col.bigquery_standard_data_type
+                not_null = " NOT NULL" if col.not_null else ""
+
+            else:
+                # one or multiple dimensional array data type
+                type_front = "ARRAY<"
+                type_back = ">"
+                for i in range(1, col.array_dimensional):
+                    type_front += "STRUCT<dimension_{} ARRAY<".format(i)
+                    type_back += ">>"
+
+                type = "{}{}{}".format(type_front, col.bigquery_standard_data_type, type_back)
+                not_null = ""
+
+            cols_defs.append("{name} {type}{not_null}".format(
+                name=col_name,
+                type=type,
+                not_null=not_null,
             ))

-        return "#standardSQL\nCREATE TABLE `project.{dataset}.{table}`\n(\n{colmns_define}\n)".format(
+        return textwrap.dedent(
+            """\
+            #standardSQL
+            CREATE TABLE `project.{dataset}.{table}`
+            (
+            {colmns_define}
+            )""").format(
             dataset=dataset,
             table=self.get_name(name_case),
-            colmns_define=",\n".join(cols_def),
+            colmns_define=",\n".join(cols_defs),
         )
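For the same hypothetical Col_X INTEGER[][], the bracket assembly above builds the Standard SQL type from the outside in, one STRUCT<dimension_N ARRAY<...>> wrapper per dimension beyond the first, and drops NOT NULL because the top level is already an ARRAY. Assuming the standard-SQL mapping renders INTEGER as INT64, the column definition appended to cols_defs would be:

    Col_X ARRAY<STRUCT<dimension_1 ARRAY<INT64>>>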
@@ -411,10 +480,11 @@ class DdlParse(DdlParseBase):
                 + Optional(_SUPPRESS_QUOTE) + Word(alphanums + "_")("name") + Optional(_SUPPRESS_QUOTE)
                 + Group(
                     Word(alphanums + "_")
-                    + Optional(CaselessKeyword("WITHOUT TIME ZONE") ^ CaselessKeyword("WITH TIME ZONE") ^ CaselessKeyword("PRECISION"))
+                    + Optional(CaselessKeyword("WITHOUT TIME ZONE") ^ CaselessKeyword("WITH TIME ZONE") ^ CaselessKeyword("PRECISION") ^ CaselessKeyword("VARYING"))
                     + Optional(_LPAR + Regex(r"\d+\s*,*\s*\d*") + Optional(Suppress(_CHAR_SEMANTICS | _BYTE_SEMANTICS)) + _RPAR)
                 )("type")
-                + Optional(Word(alphanums + "_': -"))("constraint")
+                + Optional(Word("[]"))("array_brackets")
+                + Optional(Word(alphanums + "_': -."))("constraint")
             )("column")
         )
     )("columns")
@@ -483,7 +553,8 @@ def parse(self, ddl=None, source_database=None):
             # add column
             col = self._table.columns.append(
                 column_name=ret_col["name"],
-                data_type_array=ret_col["type"])
+                data_type_array=ret_col["type"],
+                array_brackets=ret_col['array_brackets'] if "array_brackets" in ret_col else None)

             if "constraint" in ret_col:
                 col.constraint = ret_col["constraint"]
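End to end, a minimal usage sketch (the table and column names are made up; DdlParse().parse() and the to_bigquery_* methods are the entry points this diff touches):

    from ddlparse import DdlParse

    sample_ddl = """
    CREATE TABLE Sample_Table (
      Col_01 CHAR(100) NOT NULL,
      Col_02 INTEGER[] NOT NULL,
      Col_03 NUMERIC(10, 2)[][]
    );
    """

    table = DdlParse().parse(sample_ddl)

    for col in table.columns.values():
        # e.g. Col_02: array_dimensional == 1, so bigquery_mode == "REPEATED"
        print(col.name, col.array_dimensional, col.bigquery_mode)
        print(col.to_bigquery_field())

    print(table.to_bigquery_ddl())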