meerschaum.utils.dtypes.sql
Utility functions for working with SQL data types.
NUMERIC_PRECISION_FLAVORS: Dict[str, Tuple[int, int]] =
{'mariadb': (38, 20), 'mysql': (38, 20), 'mssql': (28, 10)}
NUMERIC_AS_TEXT_FLAVORS =
{'duckdb', 'sqlite'}
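Flavors in this set store numeric values in TEXT columns (see the 'numeric' rows in the mapping tables below), so Decimal values are serialized to and from strings. A minimal sketch of how calling code might branch on this set; the serialize_numeric helper and the str() serialization are illustrative assumptions, not part of this module:

from decimal import Decimal
from meerschaum.utils.dtypes.sql import NUMERIC_AS_TEXT_FLAVORS

def serialize_numeric(value: Decimal, flavor: str):
    # For flavors without a usable NUMERIC column, write the value as text.
    return str(value) if flavor in NUMERIC_AS_TEXT_FLAVORS else value

serialize_numeric(Decimal('1.50'), 'sqlite')      # '1.50'
serialize_numeric(Decimal('1.50'), 'postgresql')  # Decimal('1.50')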
TIMEZONE_NAIVE_FLAVORS =
{'oracle', 'mysql', 'mariadb'}
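Flavors in this set map datetimes to timezone-naive column types (DATETIME, TIMESTAMP(9)) in the tables below. A minimal sketch of the kind of normalization a caller might apply before inserting timezone-aware timestamps; the normalize_datetimes helper and the convert-to-UTC policy are assumptions, not part of this module:

import pandas as pd
from meerschaum.utils.dtypes.sql import TIMEZONE_NAIVE_FLAVORS

def normalize_datetimes(series: pd.Series, flavor: str) -> pd.Series:
    # Convert tz-aware timestamps to UTC and drop the tzinfo for naive-only flavors.
    if flavor in TIMEZONE_NAIVE_FLAVORS and series.dt.tz is not None:
        return series.dt.tz_convert('UTC').dt.tz_localize(None)
    return series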
DB_FLAVORS_CAST_DTYPES =
{'mariadb': {'BIGINT': 'DECIMAL', 'TINYINT': 'SIGNED INT', 'TEXT': 'CHAR(10000) CHARACTER SET utf8', 'BOOL': 'SIGNED INT', 'BOOLEAN': 'SIGNED INT', 'DOUBLE PRECISION': 'DECIMAL', 'DOUBLE': 'DECIMAL', 'FLOAT': 'DECIMAL', 'NUMERIC': 'NUMERIC(38, 20)', 'DECIMAL': 'DECIMAL(38, 20)'}, 'mysql': {'BIGINT': 'DECIMAL', 'TINYINT': 'SIGNED INT', 'TEXT': 'CHAR(10000) CHARACTER SET utf8', 'INT': 'SIGNED INT', 'INTEGER': 'SIGNED INT', 'BOOL': 'SIGNED INT', 'BOOLEAN': 'SIGNED INT', 'DOUBLE PRECISION': 'DECIMAL', 'DOUBLE': 'DECIMAL', 'FLOAT': 'DECIMAL', 'NUMERIC': 'NUMERIC(38, 20)', 'DECIMAL': 'DECIMAL(38, 20)'}, 'sqlite': {'BOOLEAN': 'INTEGER', 'REAL': 'FLOAT'}, 'oracle': {'NVARCHAR(2000)': 'NVARCHAR2(2000)', 'NVARCHAR': 'NVARCHAR2(2000)', 'NVARCHAR2': 'NVARCHAR2(2000)', 'CHAR': 'CHAR(36)'}, 'mssql': {'NVARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)', 'NVARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)', 'VARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)', 'VARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)', 'NVARCHAR': 'NVARCHAR(MAX)', 'NUMERIC': 'NUMERIC(28, 10)', 'DECIMAL': 'DECIMAL(28, 10)'}}
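These substitutions are applied inside CAST() expressions for flavors with casting quirks; MySQL and MariaDB, for instance, do not allow casting to BIGINT, so DECIMAL is substituted. A minimal sketch of how such a lookup might be used; the build_cast helper is illustrative and not part of this module:

from meerschaum.utils.dtypes.sql import DB_FLAVORS_CAST_DTYPES

def build_cast(expression: str, db_type: str, flavor: str) -> str:
    # Substitute the flavor-specific cast type when one is registered.
    cast_type = DB_FLAVORS_CAST_DTYPES.get(flavor, {}).get(db_type, db_type)
    return f"CAST({expression} AS {cast_type})"

build_cast('val', 'BIGINT', 'mysql')       # 'CAST(val AS DECIMAL)'
build_cast('val', 'BIGINT', 'postgresql')  # 'CAST(val AS BIGINT)'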
DB_TO_PD_DTYPES: Dict[str, Union[str, Dict[str, str]]] =
{'FLOAT': 'float64[pyarrow]', 'REAL': 'float64[pyarrow]', 'DOUBLE_PRECISION': 'float64[pyarrow]', 'DOUBLE': 'float64[pyarrow]', 'DECIMAL': 'numeric', 'BIGINT': 'int64[pyarrow]', 'INT': 'int64[pyarrow]', 'INTEGER': 'int64[pyarrow]', 'NUMBER': 'numeric', 'NUMERIC': 'numeric', 'TIMESTAMP': 'datetime64[ns]', 'TIMESTAMP WITHOUT TIMEZONE': 'datetime64[ns]', 'TIMESTAMP WITH TIMEZONE': 'datetime64[ns, UTC]', 'TIMESTAMP WITH TIME ZONE': 'datetime64[ns, UTC]', 'TIMESTAMPTZ': 'datetime64[ns, UTC]', 'DATE': 'datetime64[ns]', 'DATETIME': 'datetime64[ns]', 'DATETIME2': 'datetime64[ns]', 'DATETIMEOFFSET': 'datetime64[ns, UTC]', 'TEXT': 'string[pyarrow]', 'VARCHAR': 'string[pyarrow]', 'CLOB': 'string[pyarrow]', 'BOOL': 'bool[pyarrow]', 'BOOLEAN': 'bool[pyarrow]', 'BOOLEAN()': 'bool[pyarrow]', 'TINYINT': 'bool[pyarrow]', 'TINYINT(1)': 'bool[pyarrow]', 'BIT': 'bool[pyarrow]', 'BIT(1)': 'bool[pyarrow]', 'JSON': 'json', 'JSONB': 'json', 'UUID': 'uuid', 'UNIQUEIDENTIFIER': 'uuid', 'BYTEA': 'bytes', 'BLOB': 'bytes', 'VARBINARY': 'bytes', 'VARBINARY(MAX)': 'bytes', 'substrings': {'CHAR': 'string[pyarrow]', 'TIMESTAMP': 'datetime64[ns]', 'TIME': 'datetime64[ns]', 'DATE': 'datetime64[ns]', 'DOUBLE': 'double[pyarrow]', 'DECIMAL': 'numeric', 'NUMERIC': 'numeric', 'NUMBER': 'numeric', 'INT': 'int64[pyarrow]', 'BOOL': 'bool[pyarrow]', 'JSON': 'json', 'BYTE': 'bytes', 'LOB': 'bytes', 'BINARY': 'bytes'}, 'default': 'object'}
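Exact type names (with any parenthesized arguments stripped) are matched first; the nested 'substrings' mapping catches vendor-specific names that merely contain a known token, and 'default' covers everything else. This is the lookup order get_pd_type_from_db_type performs below. For example:

>>> from meerschaum.utils.dtypes.sql import DB_TO_PD_DTYPES
>>> DB_TO_PD_DTYPES['DATETIMEOFFSET']
'datetime64[ns, UTC]'
>>> DB_TO_PD_DTYPES['substrings']['INT']   # substring match, e.g. for SMALLINT or MEDIUMINT
'int64[pyarrow]'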
PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] =
{
    'int': {'timescaledb': 'BIGINT', 'postgresql': 'BIGINT', 'mariadb': 'BIGINT', 'mysql': 'BIGINT', 'mssql': 'BIGINT', 'oracle': 'INT', 'sqlite': 'BIGINT', 'duckdb': 'BIGINT', 'citus': 'BIGINT', 'cockroachdb': 'BIGINT', 'default': 'INT'},
    'float': {'timescaledb': 'DOUBLE PRECISION', 'postgresql': 'DOUBLE PRECISION', 'mariadb': 'DOUBLE PRECISION', 'mysql': 'DOUBLE PRECISION', 'mssql': 'FLOAT', 'oracle': 'FLOAT', 'sqlite': 'FLOAT', 'duckdb': 'DOUBLE PRECISION', 'citus': 'DOUBLE PRECISION', 'cockroachdb': 'DOUBLE PRECISION', 'default': 'DOUBLE'},
    'double': {'timescaledb': 'DOUBLE PRECISION', 'postgresql': 'DOUBLE PRECISION', 'mariadb': 'DOUBLE PRECISION', 'mysql': 'DOUBLE PRECISION', 'mssql': 'FLOAT', 'oracle': 'FLOAT', 'sqlite': 'FLOAT', 'duckdb': 'DOUBLE PRECISION', 'citus': 'DOUBLE PRECISION', 'cockroachdb': 'DOUBLE PRECISION', 'default': 'DOUBLE'},
    'datetime64[ns]': {'timescaledb': 'TIMESTAMP', 'postgresql': 'TIMESTAMP', 'mariadb': 'DATETIME', 'mysql': 'DATETIME', 'mssql': 'DATETIME2', 'oracle': 'TIMESTAMP(9)', 'sqlite': 'DATETIME', 'duckdb': 'TIMESTAMP', 'citus': 'TIMESTAMP', 'cockroachdb': 'TIMESTAMP', 'default': 'DATETIME'},
    'datetime64[ns, UTC]': {'timescaledb': 'TIMESTAMPTZ', 'postgresql': 'TIMESTAMPTZ', 'mariadb': 'DATETIME', 'mysql': 'DATETIME', 'mssql': 'DATETIMEOFFSET', 'oracle': 'TIMESTAMP(9)', 'sqlite': 'TIMESTAMP', 'duckdb': 'TIMESTAMPTZ', 'citus': 'TIMESTAMPTZ', 'cockroachdb': 'TIMESTAMPTZ', 'default': 'TIMESTAMPTZ'},
    'datetime': {'timescaledb': 'TIMESTAMPTZ', 'postgresql': 'TIMESTAMPTZ', 'mariadb': 'DATETIME', 'mysql': 'DATETIME', 'mssql': 'DATETIMEOFFSET', 'oracle': 'TIMESTAMP(9)', 'sqlite': 'TIMESTAMP', 'duckdb': 'TIMESTAMPTZ', 'citus': 'TIMESTAMPTZ', 'cockroachdb': 'TIMESTAMPTZ', 'default': 'TIMESTAMPTZ'},
    'datetimetz': {'timescaledb': 'TIMESTAMPTZ', 'postgresql': 'TIMESTAMPTZ', 'mariadb': 'DATETIME', 'mysql': 'DATETIME', 'mssql': 'DATETIMEOFFSET', 'oracle': 'TIMESTAMP(9)', 'sqlite': 'TIMESTAMP', 'duckdb': 'TIMESTAMPTZ', 'citus': 'TIMESTAMPTZ', 'cockroachdb': 'TIMESTAMPTZ', 'default': 'TIMESTAMPTZ'},
    'bool': {'timescaledb': 'BOOLEAN', 'postgresql': 'BOOLEAN', 'mariadb': 'BOOLEAN', 'mysql': 'BOOLEAN', 'mssql': 'BIT', 'oracle': 'INTEGER', 'sqlite': 'FLOAT', 'duckdb': 'BOOLEAN', 'citus': 'BOOLEAN', 'cockroachdb': 'BOOLEAN', 'default': 'BOOLEAN'},
    'object': {'timescaledb': 'TEXT', 'postgresql': 'TEXT', 'mariadb': 'TEXT', 'mysql': 'TEXT', 'mssql': 'NVARCHAR(MAX)', 'oracle': 'NVARCHAR2(2000)', 'sqlite': 'TEXT', 'duckdb': 'TEXT', 'citus': 'TEXT', 'cockroachdb': 'TEXT', 'default': 'TEXT'},
    'string': {'timescaledb': 'TEXT', 'postgresql': 'TEXT', 'mariadb': 'TEXT', 'mysql': 'TEXT', 'mssql': 'NVARCHAR(MAX)', 'oracle': 'NVARCHAR2(2000)', 'sqlite': 'TEXT', 'duckdb': 'TEXT', 'citus': 'TEXT', 'cockroachdb': 'TEXT', 'default': 'TEXT'},
    'unicode': {'timescaledb': 'TEXT', 'postgresql': 'TEXT', 'mariadb': 'TEXT', 'mysql': 'TEXT', 'mssql': 'NVARCHAR(MAX)', 'oracle': 'NVARCHAR2(2000)', 'sqlite': 'TEXT', 'duckdb': 'TEXT', 'citus': 'TEXT', 'cockroachdb': 'TEXT', 'default': 'TEXT'},
    'json': {'timescaledb': 'JSONB', 'postgresql': 'JSONB', 'mariadb': 'TEXT', 'mysql': 'TEXT', 'mssql': 'NVARCHAR(MAX)', 'oracle': 'NVARCHAR2(2000)', 'sqlite': 'TEXT', 'duckdb': 'TEXT', 'citus': 'JSONB', 'cockroachdb': 'JSONB', 'default': 'TEXT'},
    'numeric': {'timescaledb': 'NUMERIC', 'postgresql': 'NUMERIC', 'mariadb': 'DECIMAL(38, 20)', 'mysql': 'DECIMAL(38, 20)', 'mssql': 'NUMERIC(28, 10)', 'oracle': 'NUMBER', 'sqlite': 'TEXT', 'duckdb': 'TEXT', 'citus': 'NUMERIC', 'cockroachdb': 'NUMERIC', 'default': 'NUMERIC'},
    'uuid': {'timescaledb': 'UUID', 'postgresql': 'UUID', 'mariadb': 'CHAR(36)', 'mysql': 'CHAR(36)', 'mssql': 'UNIQUEIDENTIFIER', 'oracle': 'CHAR(36)', 'sqlite': 'TEXT', 'duckdb': 'VARCHAR', 'citus': 'UUID', 'cockroachdb': 'UUID', 'default': 'TEXT'},
    'bytes': {'timescaledb': 'BYTEA', 'postgresql': 'BYTEA', 'mariadb': 'BLOB', 'mysql': 'BLOB', 'mssql': 'VARBINARY(MAX)', 'oracle': 'BLOB', 'sqlite': 'BLOB', 'duckdb': 'BLOB', 'citus': 'BYTEA', 'cockroachdb': 'BYTEA', 'default': 'BLOB'},
}
PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] =
{
    'int': {'timescaledb': 'BigInteger', 'postgresql': 'BigInteger', 'mariadb': 'BigInteger', 'mysql': 'BigInteger', 'mssql': 'BigInteger', 'oracle': 'BigInteger', 'sqlite': 'BigInteger', 'duckdb': 'BigInteger', 'citus': 'BigInteger', 'cockroachdb': 'BigInteger', 'default': 'BigInteger'},
    'float': {'timescaledb': 'Float', 'postgresql': 'Float', 'mariadb': 'Float', 'mysql': 'Float', 'mssql': 'Float', 'oracle': 'Float', 'sqlite': 'Float', 'duckdb': 'Float', 'citus': 'Float', 'cockroachdb': 'Float', 'default': 'Float'},
    'datetime': {'timescaledb': 'DateTime(timezone=True)', 'postgresql': 'DateTime(timezone=True)', 'mariadb': 'DateTime(timezone=True)', 'mysql': 'DateTime(timezone=True)', 'mssql': 'sqlalchemy.dialects.mssql.DATETIMEOFFSET', 'oracle': 'sqlalchemy.dialects.oracle.TIMESTAMP(timezone=True)', 'sqlite': 'DateTime(timezone=True)', 'duckdb': 'DateTime(timezone=True)', 'citus': 'DateTime(timezone=True)', 'cockroachdb': 'DateTime(timezone=True)', 'default': 'DateTime(timezone=True)'},
    'datetime64[ns]': {'timescaledb': 'DateTime', 'postgresql': 'DateTime', 'mariadb': 'DateTime', 'mysql': 'DateTime', 'mssql': 'sqlalchemy.dialects.mssql.DATETIME2', 'oracle': 'DateTime', 'sqlite': 'DateTime', 'duckdb': 'DateTime', 'citus': 'DateTime', 'cockroachdb': 'DateTime', 'default': 'DateTime'},
    'datetime64[ns, UTC]': {'timescaledb': 'DateTime(timezone=True)', 'postgresql': 'DateTime(timezone=True)', 'mariadb': 'DateTime(timezone=True)', 'mysql': 'DateTime(timezone=True)', 'mssql': 'sqlalchemy.dialects.mssql.DATETIMEOFFSET', 'oracle': 'sqlalchemy.dialects.oracle.TIMESTAMP(timezone=True)', 'sqlite': 'DateTime(timezone=True)', 'duckdb': 'DateTime(timezone=True)', 'citus': 'DateTime(timezone=True)', 'cockroachdb': 'DateTime(timezone=True)', 'default': 'DateTime(timezone=True)'},
    'bool': {'timescaledb': 'Boolean', 'postgresql': 'Boolean', 'mariadb': 'Integer', 'mysql': 'Integer', 'mssql': 'sqlalchemy.dialects.mssql.BIT', 'oracle': 'Integer', 'sqlite': 'Float', 'duckdb': 'Boolean', 'citus': 'Boolean', 'cockroachdb': 'Boolean', 'default': 'Boolean'},
    'object': {'timescaledb': 'UnicodeText', 'postgresql': 'UnicodeText', 'mariadb': 'UnicodeText', 'mysql': 'UnicodeText', 'mssql': 'UnicodeText', 'oracle': 'UnicodeText', 'sqlite': 'UnicodeText', 'duckdb': 'UnicodeText', 'citus': 'UnicodeText', 'cockroachdb': 'UnicodeText', 'default': 'UnicodeText'},
    'string': {'timescaledb': 'UnicodeText', 'postgresql': 'UnicodeText', 'mariadb': 'UnicodeText', 'mysql': 'UnicodeText', 'mssql': 'UnicodeText', 'oracle': 'UnicodeText', 'sqlite': 'UnicodeText', 'duckdb': 'UnicodeText', 'citus': 'UnicodeText', 'cockroachdb': 'UnicodeText', 'default': 'UnicodeText'},
    'json': {'timescaledb': 'sqlalchemy.dialects.postgresql.JSONB', 'postgresql': 'sqlalchemy.dialects.postgresql.JSONB', 'mariadb': 'UnicodeText', 'mysql': 'UnicodeText', 'mssql': 'UnicodeText', 'oracle': 'UnicodeText', 'sqlite': 'UnicodeText', 'duckdb': 'TEXT', 'citus': 'sqlalchemy.dialects.postgresql.JSONB', 'cockroachdb': 'sqlalchemy.dialects.postgresql.JSONB', 'default': 'UnicodeText'},
    'numeric': {'timescaledb': 'Numeric', 'postgresql': 'Numeric', 'mariadb': 'Numeric', 'mysql': 'Numeric', 'mssql': 'Numeric', 'oracle': 'Numeric', 'sqlite': 'UnicodeText', 'duckdb': 'Numeric', 'citus': 'Numeric', 'cockroachdb': 'Numeric', 'default': 'Numeric'},
    'uuid': {'timescaledb': 'Uuid', 'postgresql': 'Uuid', 'mariadb': 'sqlalchemy.dialects.mysql.CHAR(36)', 'mysql': 'sqlalchemy.dialects.mysql.CHAR(36)', 'mssql': 'Uuid', 'oracle': 'sqlalchemy.dialects.oracle.CHAR(36)', 'sqlite': 'UnicodeText', 'duckdb': 'UnicodeText', 'citus': 'Uuid', 'cockroachdb': 'Uuid', 'default': 'Uuid'},
    'bytes': {'timescaledb': 'LargeBinary', 'postgresql': 'LargeBinary', 'mariadb': 'LargeBinary', 'mysql': 'LargeBinary', 'mssql': 'LargeBinary', 'oracle': 'LargeBinary', 'sqlite': 'LargeBinary', 'duckdb': 'LargeBinary', 'citus': 'LargeBinary', 'cockroachdb': 'LargeBinary', 'default': 'LargeBinary'},
}
AUTO_INCREMENT_COLUMN_FLAVORS: Dict[str, str] =
{'timescaledb': 'GENERATED BY DEFAULT AS IDENTITY', 'postgresql': 'GENERATED BY DEFAULT AS IDENTITY', 'mariadb': 'AUTO_INCREMENT', 'mysql': 'AUTO_INCREMENT', 'mssql': 'IDENTITY(1,1)', 'oracle': 'GENERATED BY DEFAULT ON NULL AS IDENTITY', 'sqlite': 'AUTOINCREMENT', 'duckdb': 'GENERATED BY DEFAULT', 'citus': 'GENERATED BY DEFAULT', 'cockroachdb': 'GENERATED BY DEFAULT AS IDENTITY', 'default': 'GENERATED BY DEFAULT AS IDENTITY'}
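These clauses are intended to be spliced into a column definition when generating CREATE TABLE statements. A minimal sketch; the column name, BIGINT type, and PRIMARY KEY placement are assumptions, and some flavors (e.g. SQLite's AUTOINCREMENT) impose extra restrictions on where the clause may appear:

from meerschaum.utils.dtypes.sql import AUTO_INCREMENT_COLUMN_FLAVORS

def id_column_ddl(flavor: str, col: str = 'id') -> str:
    # Append the flavor's auto-increment clause to a BIGINT primary-key column.
    clause = AUTO_INCREMENT_COLUMN_FLAVORS.get(flavor, AUTO_INCREMENT_COLUMN_FLAVORS['default'])
    return f"{col} BIGINT {clause} PRIMARY KEY"

id_column_ddl('mssql')       # 'id BIGINT IDENTITY(1,1) PRIMARY KEY'
id_column_ddl('postgresql')  # 'id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY'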
def get_pd_type_from_db_type(db_type: str, allow_custom_dtypes: bool = True) -> str:
def get_pd_type_from_db_type(db_type: str, allow_custom_dtypes: bool = True) -> str:
    """
    Parse a database type to a pandas data type.

    Parameters
    ----------
    db_type: str
        The database type, e.g. `DATETIME`, `BIGINT`, etc.

    allow_custom_dtypes: bool, default True
        If `True`, allow for custom data types like `json` and `str`.

    Returns
    -------
    The equivalent datatype for a pandas DataFrame.
    """
    from meerschaum.utils.dtypes import are_dtypes_equal
    def parse_custom(_pd_type: str, _db_type: str) -> str:
        if 'json' in _db_type.lower():
            return 'json'
        if are_dtypes_equal(_pd_type, 'numeric') and _pd_type != 'object':
            precision, scale = get_numeric_precision_scale(None, dtype=_db_type.upper())
            if precision and scale:
                return f"numeric[{precision},{scale}]"
        return _pd_type

    pd_type = DB_TO_PD_DTYPES.get(db_type.upper().split('(', maxsplit=1)[0].strip(), None)
    if pd_type is not None:
        return (
            parse_custom(pd_type, db_type)
            if allow_custom_dtypes
            else pd_type
        )
    for db_t, pd_t in DB_TO_PD_DTYPES['substrings'].items():
        if db_t in db_type.upper():
            return (
                parse_custom(pd_t, db_t)
                if allow_custom_dtypes
                else pd_t
            )
    return DB_TO_PD_DTYPES['default']
Parse a database type to a pandas data type.
Parameters
- db_type (str): The database type, e.g. DATETIME, BIGINT, etc.
- allow_custom_dtypes (bool, default True): If True, allow for custom data types like json and str.
Returns
- The equivalent datatype for a pandas DataFrame.
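For example (illustrative calls based on DB_TO_PD_DTYPES and the numeric precision parsing above; exact-match lookups strip any parenthesized arguments before checking the table):

>>> from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
>>> get_pd_type_from_db_type('DATETIMEOFFSET')
'datetime64[ns, UTC]'
>>> get_pd_type_from_db_type('NUMERIC(28, 10)')
'numeric[28,10]'
>>> get_pd_type_from_db_type('NUMERIC(28, 10)', allow_custom_dtypes=False)
'numeric'
>>> get_pd_type_from_db_type('SOME_VENDOR_TYPE')
'object'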
def get_db_type_from_pd_type(pd_type: str, flavor: str = 'default', as_sqlalchemy: bool = False) -> Union[str, 'sqlalchemy.sql.visitors.TraversibleType']:
def get_db_type_from_pd_type(
    pd_type: str,
    flavor: str = 'default',
    as_sqlalchemy: bool = False,
) -> Union[str, 'sqlalchemy.sql.visitors.TraversibleType']:
    """
    Parse a Pandas data type into a flavor's database type.

    Parameters
    ----------
    pd_type: str
        The Pandas datatype. This must be a string, not the actual dtype object.

    flavor: str, default 'default'
        The flavor of the database to be mapped to.

    as_sqlalchemy: bool, default False
        If `True`, return a type from `sqlalchemy.types`.

    Returns
    -------
    The database data type for the incoming Pandas data type.
    If nothing can be found, a warning will be thrown and 'TEXT' will be returned.
    """
    from meerschaum.utils.warnings import warn
    from meerschaum.utils.packages import attempt_import
    from meerschaum.utils.dtypes import are_dtypes_equal, MRSM_ALIAS_DTYPES
    from meerschaum.utils.misc import parse_arguments_str
    sqlalchemy_types = attempt_import('sqlalchemy.types', lazy=False)

    types_registry = (
        PD_TO_DB_DTYPES_FLAVORS
        if not as_sqlalchemy
        else PD_TO_SQLALCHEMY_DTYPES_FLAVORS
    )

    precision, scale = None, None
    og_pd_type = pd_type
    if pd_type in MRSM_ALIAS_DTYPES:
        pd_type = MRSM_ALIAS_DTYPES[pd_type]

    ### Check whether we are able to match this type (e.g. pyarrow support).
    found_db_type = False
    if pd_type not in types_registry and not pd_type.startswith('numeric['):
        for mapped_pd_type in types_registry:
            if are_dtypes_equal(mapped_pd_type, pd_type):
                pd_type = mapped_pd_type
                found_db_type = True
                break
    elif pd_type.startswith('numeric['):
        og_pd_type = pd_type
        pd_type = 'numeric'
        precision, scale = get_numeric_precision_scale(flavor, og_pd_type)
        found_db_type = True
    else:
        found_db_type = True

    if not found_db_type:
        warn(f"Unknown Pandas data type '{pd_type}'. Falling back to 'TEXT'.", stacklevel=3)
        return (
            'TEXT'
            if not as_sqlalchemy
            else sqlalchemy_types.UnicodeText
        )
    flavor_types = types_registry.get(
        pd_type,
        {
            'default': (
                'TEXT'
                if not as_sqlalchemy
                else 'UnicodeText'
            ),
        },
    )
    default_flavor_type = flavor_types.get(
        'default',
        (
            'TEXT'
            if not as_sqlalchemy
            else 'UnicodeText'
        ),
    )
    if flavor not in flavor_types:
        warn(f"Unknown flavor '{flavor}'. Falling back to '{default_flavor_type}' (default).")
    db_type = flavor_types.get(flavor, default_flavor_type)
    if not as_sqlalchemy:
        if precision is not None and scale is not None:
            db_type_bare = db_type.split('(', maxsplit=1)[0]
            return f"{db_type_bare}({precision},{scale})"
        return db_type

    if db_type.startswith('sqlalchemy.dialects'):
        dialect, typ_class_name = db_type.replace('sqlalchemy.dialects.', '').split('.', maxsplit=2)
        cls_args, cls_kwargs = None, None
        if '(' in typ_class_name:
            typ_class_name, args_str = typ_class_name.split('(', maxsplit=1)
            args_str = args_str.rstrip(')')
            cls_args, cls_kwargs = parse_arguments_str(args_str)
        sqlalchemy_dialects_flavor_module = attempt_import(f'sqlalchemy.dialects.{dialect}')
        cls = getattr(sqlalchemy_dialects_flavor_module, typ_class_name)
        if cls_args is None:
            return cls
        return cls(*cls_args, **cls_kwargs)

    if 'numeric' in db_type.lower():
        if precision is None or scale is None:
            return sqlalchemy_types.Numeric
        return sqlalchemy_types.Numeric(precision, scale)

    cls_args, cls_kwargs = None, None
    typ_class_name = db_type
    if '(' in db_type:
        typ_class_name, args_str = db_type.split('(', maxsplit=1)
        args_str = args_str.rstrip(')')
        cls_args, cls_kwargs = parse_arguments_str(args_str)

    cls = getattr(sqlalchemy_types, typ_class_name)
    if cls_args is None:
        return cls
    return cls(*cls_args, **cls_kwargs)
Parse a Pandas data type into a flavor's database type.
Parameters
- pd_type (str): The Pandas datatype. This must be a string, not the actual dtype object.
- flavor (str, default 'default'): The flavor of the database to be mapped to.
- as_sqlalchemy (bool, default False): If True, return a type from sqlalchemy.types.
Returns
- The database data type for the incoming Pandas data type. If nothing can be found, a warning will be thrown and 'TEXT' will be returned.
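For example (illustrative calls based on the flavor tables above; the as_sqlalchemy call assumes SQLAlchemy is importable, as the function itself requires):

>>> from meerschaum.utils.dtypes.sql import get_db_type_from_pd_type
>>> get_db_type_from_pd_type('datetime64[ns, UTC]', flavor='postgresql')
'TIMESTAMPTZ'
>>> get_db_type_from_pd_type('numeric[20,6]', flavor='mssql')
'NUMERIC(20,6)'
>>> get_db_type_from_pd_type('uuid', flavor='mssql')
'UNIQUEIDENTIFIER'
>>> get_db_type_from_pd_type('json', flavor='postgresql', as_sqlalchemy=True).__name__
'JSONB'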
def get_numeric_precision_scale(flavor: str, dtype: Optional[str] = None) -> Union[Tuple[int, int], Tuple[None, None]]:
def get_numeric_precision_scale(
    flavor: str,
    dtype: Optional[str] = None,
) -> Union[Tuple[int, int], Tuple[None, None]]:
    """
    Return the precision and scale to use for a numeric column for a given database flavor.

    Parameters
    ----------
    flavor: str
        The database flavor for which to return the precision and scale.

    dtype: Optional[str], default None
        If provided, return the precision and scale provided in the dtype (if applicable).
        If all caps, treat this as a DB type.

    Returns
    -------
    A tuple of ints or a tuple of Nones.
    """
    if not dtype:
        return None, None

    lbracket = '[' if '[' in dtype else '('
    rbracket = ']' if lbracket == '[' else ')'
    if lbracket in dtype and dtype.count(',') == 1 and dtype.endswith(rbracket):
        try:
            parts = dtype.split(lbracket, maxsplit=1)[-1].rstrip(rbracket).split(',', maxsplit=1)
            return int(parts[0].strip()), int(parts[1].strip())
        except Exception:
            pass

    return NUMERIC_PRECISION_FLAVORS.get(flavor, (None, None))
Return the precision and scale to use for a numeric column for a given database flavor.
Parameters
- flavor (str): The database flavor for which to return the precision and scale.
- dtype (Optional[str], default None): If provided, return the precision and scale provided in the dtype (if applicable). If all caps, treat this as a DB type.
Returns
- A tuple of ints or a tuple of Nones.
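For example (illustrative calls; an explicit precision and scale embedded in the dtype take precedence, otherwise the flavor's entry in NUMERIC_PRECISION_FLAVORS, if any, is returned):

>>> from meerschaum.utils.dtypes.sql import get_numeric_precision_scale
>>> get_numeric_precision_scale('postgresql', 'numeric[20,6]')
(20, 6)
>>> get_numeric_precision_scale('mssql', 'NUMERIC(28, 10)')
(28, 10)
>>> get_numeric_precision_scale('mssql', 'numeric')
(28, 10)
>>> get_numeric_precision_scale('postgresql', 'numeric')
(None, None)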