meerschaum.utils.dtypes.sql

Utility functions for working with SQL data types.

  1#! /usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3# vim:fenc=utf-8
  4
  5"""
  6Utility functions for working with SQL data types.
  7"""
  8
  9from __future__ import annotations
 10from meerschaum.utils.typing import Dict, Union, Tuple, Optional
 11
 12NUMERIC_PRECISION_FLAVORS: Dict[str, Tuple[int, int]] = {
 13    'mariadb': (38, 20),
 14    'mysql': (38, 20),
 15    'mssql': (28, 10),
 16}
 17NUMERIC_AS_TEXT_FLAVORS = {'sqlite', 'duckdb'}
 18TIMEZONE_NAIVE_FLAVORS = {'oracle', 'mysql', 'mariadb'}
 19
 20### MySQL doesn't allow for casting as BIGINT, so this is a workaround.
 21DB_FLAVORS_CAST_DTYPES = {
 22    'mariadb': {
 23        'BIGINT': 'DECIMAL',
 24        'TINYINT': 'SIGNED INT',
 25        'TEXT': 'CHAR(10000) CHARACTER SET utf8',
 26        'BOOL': 'SIGNED INT',
 27        'BOOLEAN': 'SIGNED INT',
 28        'DOUBLE PRECISION': 'DECIMAL',
 29        'DOUBLE': 'DECIMAL',
 30        'FLOAT': 'DECIMAL',
 31    },
 32    'mysql': {
 33        'BIGINT': 'DECIMAL',
 34        'TINYINT': 'SIGNED INT',
 35        'TEXT': 'CHAR(10000) CHARACTER SET utf8',
 36        'INT': 'SIGNED INT',
 37        'INTEGER': 'SIGNED INT',
 38        'BOOL': 'SIGNED INT',
 39        'BOOLEAN': 'SIGNED INT',
 40        'DOUBLE PRECISION': 'DECIMAL',
 41        'DOUBLE': 'DECIMAL',
 42        'FLOAT': 'DECIMAL',
 43    },
 44    'sqlite': {
 45        'BOOLEAN': 'INTEGER',
 46        'REAL': 'FLOAT',
 47    },
 48    'oracle': {
 49        'NVARCHAR(2000)': 'NVARCHAR2(2000)',
 50        'NVARCHAR': 'NVARCHAR2(2000)',
 51        'NVARCHAR2': 'NVARCHAR2(2000)',
 52        'CHAR': 'CHAR(36)',  # UUID columns
 53    },
 54    'mssql': {
 55        'NVARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)',
 56        'NVARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)',
 57        'VARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)',
 58        'VARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)',
 59        'NVARCHAR': 'NVARCHAR(MAX)',
 60    },
 61}
 62for _flavor, (_precision, _scale) in NUMERIC_PRECISION_FLAVORS.items():
 63    if _flavor not in DB_FLAVORS_CAST_DTYPES:
 64        DB_FLAVORS_CAST_DTYPES[_flavor] = {}
 65    DB_FLAVORS_CAST_DTYPES[_flavor].update({
 66        'NUMERIC': f"NUMERIC({_precision}, {_scale})",
 67        'DECIMAL': f"DECIMAL({_precision}, {_scale})",
 68    })
 69
 70DB_TO_PD_DTYPES: Dict[str, Union[str, Dict[str, str]]] = {
 71    'FLOAT': 'float64[pyarrow]',
 72    'REAL': 'float64[pyarrow]',
 73    'DOUBLE_PRECISION': 'float64[pyarrow]',
 74    'DOUBLE': 'float64[pyarrow]',
 75    'DECIMAL': 'numeric',
 76    'BIGINT': 'int64[pyarrow]',
 77    'INT': 'int64[pyarrow]',
 78    'INTEGER': 'int64[pyarrow]',
 79    'NUMBER': 'numeric',
 80    'NUMERIC': 'numeric',
 81    'TIMESTAMP': 'datetime64[ns]',
 82    'TIMESTAMP WITHOUT TIMEZONE': 'datetime64[ns]',
 83    'TIMESTAMP WITH TIMEZONE': 'datetime64[ns, UTC]',
 84    'TIMESTAMP WITH TIME ZONE': 'datetime64[ns, UTC]',
 85    'TIMESTAMPTZ': 'datetime64[ns, UTC]',
 86    'DATE': 'datetime64[ns]',
 87    'DATETIME': 'datetime64[ns]',
 88    'DATETIME2': 'datetime64[ns]',
 89    'DATETIMEOFFSET': 'datetime64[ns, UTC]',
 90    'TEXT': 'string[pyarrow]',
 91    'VARCHAR': 'string[pyarrow]',
 92    'CLOB': 'string[pyarrow]',
 93    'BOOL': 'bool[pyarrow]',
 94    'BOOLEAN': 'bool[pyarrow]',
 95    'BOOLEAN()': 'bool[pyarrow]',
 96    'TINYINT': 'bool[pyarrow]',
 97    'TINYINT(1)': 'bool[pyarrow]',
 98    'BIT': 'bool[pyarrow]',
 99    'BIT(1)': 'bool[pyarrow]',
100    'JSON': 'json',
101    'JSONB': 'json',
102    'UUID': 'uuid',
103    'UNIQUEIDENTIFIER': 'uuid',
104    'BYTEA': 'bytes',
105    'BLOB': 'bytes',
106    'VARBINARY': 'bytes',
107    'VARBINARY(MAX)': 'bytes',
108    'substrings': {
109        'CHAR': 'string[pyarrow]',
110        'TIMESTAMP': 'datetime64[ns]',
111        'TIME': 'datetime64[ns]',
112        'DATE': 'datetime64[ns]',
113        'DOUBLE': 'double[pyarrow]',
114        'DECIMAL': 'numeric',
115        'NUMERIC': 'numeric',
116        'NUMBER': 'numeric',
117        'INT': 'int64[pyarrow]',
118        'BOOL': 'bool[pyarrow]',
119        'JSON': 'json',
120        'BYTE': 'bytes',
121        'LOB': 'bytes',
122        'BINARY': 'bytes',
123    },
124    'default': 'object',
125}
126### Map pandas dtypes to flavor-specific dtypes.
127PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
128    'int': {
129        'timescaledb': 'BIGINT',
130        'postgresql': 'BIGINT',
131        'mariadb': 'BIGINT',
132        'mysql': 'BIGINT',
133        'mssql': 'BIGINT',
134        'oracle': 'INT',
135        'sqlite': 'BIGINT',
136        'duckdb': 'BIGINT',
137        'citus': 'BIGINT',
138        'cockroachdb': 'BIGINT',
139        'default': 'INT',
140    },
141    'float': {
142        'timescaledb': 'DOUBLE PRECISION',
143        'postgresql': 'DOUBLE PRECISION',
144        'mariadb': 'DOUBLE PRECISION',
145        'mysql': 'DOUBLE PRECISION',
146        'mssql': 'FLOAT',
147        'oracle': 'FLOAT',
148        'sqlite': 'FLOAT',
149        'duckdb': 'DOUBLE PRECISION',
150        'citus': 'DOUBLE PRECISION',
151        'cockroachdb': 'DOUBLE PRECISION',
152        'default': 'DOUBLE',
153    },
154    'double': {
155        'timescaledb': 'DOUBLE PRECISION',
156        'postgresql': 'DOUBLE PRECISION',
157        'mariadb': 'DOUBLE PRECISION',
158        'mysql': 'DOUBLE PRECISION',
159        'mssql': 'FLOAT',
160        'oracle': 'FLOAT',
161        'sqlite': 'FLOAT',
162        'duckdb': 'DOUBLE PRECISION',
163        'citus': 'DOUBLE PRECISION',
164        'cockroachdb': 'DOUBLE PRECISION',
165        'default': 'DOUBLE',
166    },
167    'datetime64[ns]': {
168        'timescaledb': 'TIMESTAMP',
169        'postgresql': 'TIMESTAMP',
170        'mariadb': 'DATETIME',
171        'mysql': 'DATETIME',
172        'mssql': 'DATETIME2',
173        'oracle': 'TIMESTAMP(9)',
174        'sqlite': 'DATETIME',
175        'duckdb': 'TIMESTAMP',
176        'citus': 'TIMESTAMP',
177        'cockroachdb': 'TIMESTAMP',
178        'default': 'DATETIME',
179    },
180    'datetime64[ns, UTC]': {
181        'timescaledb': 'TIMESTAMPTZ',
182        'postgresql': 'TIMESTAMPTZ',
183        'mariadb': 'DATETIME',
184        'mysql': 'DATETIME',
185        'mssql': 'DATETIMEOFFSET',
186        'oracle': 'TIMESTAMP(9)',
187        'sqlite': 'TIMESTAMP',
188        'duckdb': 'TIMESTAMPTZ',
189        'citus': 'TIMESTAMPTZ',
190        'cockroachdb': 'TIMESTAMPTZ',
191        'default': 'TIMESTAMPTZ',
192    },
193    'datetime': {
194        'timescaledb': 'TIMESTAMPTZ',
195        'postgresql': 'TIMESTAMPTZ',
196        'mariadb': 'DATETIME',
197        'mysql': 'DATETIME',
198        'mssql': 'DATETIMEOFFSET',
199        'oracle': 'TIMESTAMP(9)',
200        'sqlite': 'TIMESTAMP',
201        'duckdb': 'TIMESTAMPTZ',
202        'citus': 'TIMESTAMPTZ',
203        'cockroachdb': 'TIMESTAMPTZ',
204        'default': 'TIMESTAMPTZ',
205    },
206    'datetimetz': {
207        'timescaledb': 'TIMESTAMPTZ',
208        'postgresql': 'TIMESTAMPTZ',
209        'mariadb': 'DATETIME',
210        'mysql': 'DATETIME',
211        'mssql': 'DATETIMEOFFSET',
212        'oracle': 'TIMESTAMP(9)',
213        'sqlite': 'TIMESTAMP',
214        'duckdb': 'TIMESTAMPTZ',
215        'citus': 'TIMESTAMPTZ',
216        'cockroachdb': 'TIMESTAMPTZ',
217        'default': 'TIMESTAMPTZ',
218    },
219    'bool': {
220        'timescaledb': 'BOOLEAN',
221        'postgresql': 'BOOLEAN',
222        'mariadb': 'BOOLEAN',
223        'mysql': 'BOOLEAN',
224        'mssql': 'BIT',
225        'oracle': 'INTEGER',
226        'sqlite': 'FLOAT',
227        'duckdb': 'BOOLEAN',
228        'citus': 'BOOLEAN',
229        'cockroachdb': 'BOOLEAN',
230        'default': 'BOOLEAN',
231    },
232    'object': {
233        'timescaledb': 'TEXT',
234        'postgresql': 'TEXT',
235        'mariadb': 'TEXT',
236        'mysql': 'TEXT',
237        'mssql': 'NVARCHAR(MAX)',
238        'oracle': 'NVARCHAR2(2000)',
239        'sqlite': 'TEXT',
240        'duckdb': 'TEXT',
241        'citus': 'TEXT',
242        'cockroachdb': 'TEXT',
243        'default': 'TEXT',
244    },
245    'string': {
246        'timescaledb': 'TEXT',
247        'postgresql': 'TEXT',
248        'mariadb': 'TEXT',
249        'mysql': 'TEXT',
250        'mssql': 'NVARCHAR(MAX)',
251        'oracle': 'NVARCHAR2(2000)',
252        'sqlite': 'TEXT',
253        'duckdb': 'TEXT',
254        'citus': 'TEXT',
255        'cockroachdb': 'TEXT',
256        'default': 'TEXT',
257    },
258    'unicode': {
259        'timescaledb': 'TEXT',
260        'postgresql': 'TEXT',
261        'mariadb': 'TEXT',
262        'mysql': 'TEXT',
263        'mssql': 'NVARCHAR(MAX)',
264        'oracle': 'NVARCHAR2(2000)',
265        'sqlite': 'TEXT',
266        'duckdb': 'TEXT',
267        'citus': 'TEXT',
268        'cockroachdb': 'TEXT',
269        'default': 'TEXT',
270    },
271    'json': {
272        'timescaledb': 'JSONB',
273        'postgresql': 'JSONB',
274        'mariadb': 'TEXT',
275        'mysql': 'TEXT',
276        'mssql': 'NVARCHAR(MAX)',
277        'oracle': 'NVARCHAR2(2000)',
278        'sqlite': 'TEXT',
279        'duckdb': 'TEXT',
280        'citus': 'JSONB',
281        'cockroachdb': 'JSONB',
282        'default': 'TEXT',
283    },
284    'numeric': {
285        'timescaledb': 'NUMERIC',
286        'postgresql': 'NUMERIC',
287        'mariadb': f'DECIMAL{NUMERIC_PRECISION_FLAVORS["mariadb"]}',
288        'mysql': f'DECIMAL{NUMERIC_PRECISION_FLAVORS["mysql"]}',
289        'mssql': f'NUMERIC{NUMERIC_PRECISION_FLAVORS["mssql"]}',
290        'oracle': 'NUMBER',
291        'sqlite': 'TEXT',
292        'duckdb': 'TEXT',
293        'citus': 'NUMERIC',
294        'cockroachdb': 'NUMERIC',
295        'default': 'NUMERIC',
296    },
297    'uuid': {
298        'timescaledb': 'UUID',
299        'postgresql': 'UUID',
300        'mariadb': 'CHAR(36)',
301        'mysql': 'CHAR(36)',
302        'mssql': 'UNIQUEIDENTIFIER',
303        ### I know this is too much space, but erring on the side of caution.
304        'oracle': 'CHAR(36)',
305        'sqlite': 'TEXT',
306        'duckdb': 'VARCHAR',
307        'citus': 'UUID',
308        'cockroachdb': 'UUID',
309        'default': 'TEXT',
310    },
311    'bytes': {
312        'timescaledb': 'BYTEA',
313        'postgresql': 'BYTEA',
314        'mariadb': 'BLOB',
315        'mysql': 'BLOB',
316        'mssql': 'VARBINARY(MAX)',
317        'oracle': 'BLOB',
318        'sqlite': 'BLOB',
319        'duckdb': 'BLOB',
320        'citus': 'BYTEA',
321        'cockroachdb': 'BYTEA',
322        'default': 'BLOB',
323    },
324}
325PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
326    'int': {
327        'timescaledb': 'BigInteger',
328        'postgresql': 'BigInteger',
329        'mariadb': 'BigInteger',
330        'mysql': 'BigInteger',
331        'mssql': 'BigInteger',
332        'oracle': 'BigInteger',
333        'sqlite': 'BigInteger',
334        'duckdb': 'BigInteger',
335        'citus': 'BigInteger',
336        'cockroachdb': 'BigInteger',
337        'default': 'BigInteger',
338    },
339    'float': {
340        'timescaledb': 'Float',
341        'postgresql': 'Float',
342        'mariadb': 'Float',
343        'mysql': 'Float',
344        'mssql': 'Float',
345        'oracle': 'Float',
346        'sqlite': 'Float',
347        'duckdb': 'Float',
348        'citus': 'Float',
349        'cockroachdb': 'Float',
350        'default': 'Float',
351    },
352    'datetime': {
353        'timescaledb': 'DateTime(timezone=True)',
354        'postgresql': 'DateTime(timezone=True)',
355        'mariadb': 'DateTime(timezone=True)',
356        'mysql': 'DateTime(timezone=True)',
357        'mssql': 'sqlalchemy.dialects.mssql.DATETIMEOFFSET',
358        'oracle': 'sqlalchemy.dialects.oracle.TIMESTAMP(timezone=True)',
359        'sqlite': 'DateTime(timezone=True)',
360        'duckdb': 'DateTime(timezone=True)',
361        'citus': 'DateTime(timezone=True)',
362        'cockroachdb': 'DateTime(timezone=True)',
363        'default': 'DateTime(timezone=True)',
364    },
365    'datetime64[ns]': {
366        'timescaledb': 'DateTime',
367        'postgresql': 'DateTime',
368        'mariadb': 'DateTime',
369        'mysql': 'DateTime',
370        'mssql': 'sqlalchemy.dialects.mssql.DATETIME2',
371        'oracle': 'DateTime',
372        'sqlite': 'DateTime',
373        'duckdb': 'DateTime',
374        'citus': 'DateTime',
375        'cockroachdb': 'DateTime',
376        'default': 'DateTime',
377    },
378    'datetime64[ns, UTC]': {
379        'timescaledb': 'DateTime(timezone=True)',
380        'postgresql': 'DateTime(timezone=True)',
381        'mariadb': 'DateTime(timezone=True)',
382        'mysql': 'DateTime(timezone=True)',
383        'mssql': 'sqlalchemy.dialects.mssql.DATETIMEOFFSET',
384        'oracle': 'sqlalchemy.dialects.oracle.TIMESTAMP(timezone=True)',
385        'sqlite': 'DateTime(timezone=True)',
386        'duckdb': 'DateTime(timezone=True)',
387        'citus': 'DateTime(timezone=True)',
388        'cockroachdb': 'DateTime(timezone=True)',
389        'default': 'DateTime(timezone=True)',
390    },
391    'bool': {
392        'timescaledb': 'Boolean',
393        'postgresql': 'Boolean',
394        'mariadb': 'Integer',
395        'mysql': 'Integer',
396        'mssql': 'sqlalchemy.dialects.mssql.BIT',
397        'oracle': 'Integer',
398        'sqlite': 'Float',
399        'duckdb': 'Boolean',
400        'citus': 'Boolean',
401        'cockroachdb': 'Boolean',
402        'default': 'Boolean',
403    },
404    'object': {
405        'timescaledb': 'UnicodeText',
406        'postgresql': 'UnicodeText',
407        'mariadb': 'UnicodeText',
408        'mysql': 'UnicodeText',
409        'mssql': 'UnicodeText',
410        'oracle': 'UnicodeText',
411        'sqlite': 'UnicodeText',
412        'duckdb': 'UnicodeText',
413        'citus': 'UnicodeText',
414        'cockroachdb': 'UnicodeText',
415        'default': 'UnicodeText',
416    },
417    'string': {
418        'timescaledb': 'UnicodeText',
419        'postgresql': 'UnicodeText',
420        'mariadb': 'UnicodeText',
421        'mysql': 'UnicodeText',
422        'mssql': 'UnicodeText',
423        'oracle': 'UnicodeText',
424        'sqlite': 'UnicodeText',
425        'duckdb': 'UnicodeText',
426        'citus': 'UnicodeText',
427        'cockroachdb': 'UnicodeText',
428        'default': 'UnicodeText',
429    },
430    'json': {
431        'timescaledb': 'sqlalchemy.dialects.postgresql.JSONB',
432        'postgresql': 'sqlalchemy.dialects.postgresql.JSONB',
433        'mariadb': 'UnicodeText',
434        'mysql': 'UnicodeText',
435        'mssql': 'UnicodeText',
436        'oracle': 'UnicodeText',
437        'sqlite': 'UnicodeText',
438        'duckdb': 'TEXT',
439        'citus': 'sqlalchemy.dialects.postgresql.JSONB',
440        'cockroachdb': 'sqlalchemy.dialects.postgresql.JSONB',
441        'default': 'UnicodeText',
442    },
443    'numeric': {
444        'timescaledb': 'Numeric',
445        'postgresql': 'Numeric',
446        'mariadb': 'Numeric',
447        'mysql': 'Numeric',
448        'mssql': 'Numeric',
449        'oracle': 'Numeric',
450        'sqlite': 'UnicodeText',
451        'duckdb': 'Numeric',
452        'citus': 'Numeric',
453        'cockroachdb': 'Numeric',
454        'default': 'Numeric',
455    },
456    'uuid': {
457        'timescaledb': 'Uuid',
458        'postgresql': 'Uuid',
459        'mariadb': 'sqlalchemy.dialects.mysql.CHAR(36)',
460        'mysql': 'sqlalchemy.dialects.mysql.CHAR(36)',
461        'mssql': 'Uuid',
462        'oracle': 'sqlalchemy.dialects.oracle.CHAR(36)',
463        'sqlite': 'UnicodeText',
464        'duckdb': 'UnicodeText',
465        'citus': 'Uuid',
466        'cockroachdb': 'Uuid',
467        'default': 'Uuid',
468    },
469    'bytes': {
470        'timescaledb': 'LargeBinary',
471        'postgresql': 'LargeBinary',
472        'mariadb': 'LargeBinary',
473        'mysql': 'LargeBinary',
474        'mssql': 'LargeBinary',
475        'oracle': 'LargeBinary',
476        'sqlite': 'LargeBinary',
477        'duckdb': 'LargeBinary',
478        'citus': 'LargeBinary',
479        'cockroachdb': 'LargeBinary',
480        'default': 'LargeBinary',
481    },
482}
483
484AUTO_INCREMENT_COLUMN_FLAVORS: Dict[str, str] = {
485    'timescaledb': 'GENERATED BY DEFAULT AS IDENTITY',
486    'postgresql': 'GENERATED BY DEFAULT AS IDENTITY',
487    'mariadb': 'AUTO_INCREMENT',
488    'mysql': 'AUTO_INCREMENT',
489    'mssql': 'IDENTITY(1,1)',
490    'oracle': 'GENERATED BY DEFAULT ON NULL AS IDENTITY',
491    'sqlite': 'AUTOINCREMENT',
492    'duckdb': 'GENERATED BY DEFAULT',
493    'citus': 'GENERATED BY DEFAULT',
494    'cockroachdb': 'GENERATED BY DEFAULT AS IDENTITY',
495    'default': 'GENERATED BY DEFAULT AS IDENTITY',
496}
497
498
499def get_pd_type_from_db_type(db_type: str, allow_custom_dtypes: bool = True) -> str:
500    """
501    Parse a database type to a pandas data type.
502
503    Parameters
504    ----------
505    db_type: str
506        The database type, e.g. `DATETIME`, `BIGINT`, etc.
507
508    allow_custom_dtypes: bool, default False
509        If `True`, allow for custom data types like `json` and `str`.
510
511    Returns
512    -------
513    The equivalent datatype for a pandas DataFrame.
514    """
515    from meerschaum.utils.dtypes import are_dtypes_equal
516    def parse_custom(_pd_type: str, _db_type: str) -> str:
517        if 'json' in _db_type.lower():
518            return 'json'
519        if are_dtypes_equal(_pd_type, 'numeric') and _pd_type != 'object':
520            precision, scale = get_numeric_precision_scale(None, dtype=_db_type.upper())
521            if precision and scale:
522                return f"numeric[{precision},{scale}]"
523        return _pd_type
524
525    pd_type = DB_TO_PD_DTYPES.get(db_type.upper().split('(', maxsplit=1)[0].strip(), None)
526    if pd_type is not None:
527        return (
528            parse_custom(pd_type, db_type)
529            if allow_custom_dtypes
530            else pd_type
531        )
532    for db_t, pd_t in DB_TO_PD_DTYPES['substrings'].items():
533        if db_t in db_type.upper():
534            return (
535                parse_custom(pd_t, db_t)
536                if allow_custom_dtypes
537                else pd_t
538            )
539    return DB_TO_PD_DTYPES['default']
540
541
542def get_db_type_from_pd_type(
543    pd_type: str,
544    flavor: str = 'default',
545    as_sqlalchemy: bool = False,
546) -> Union[str, 'sqlalchemy.sql.visitors.TraversibleType']:
547    """
548    Parse a Pandas data type into a flavor's database type.
549
550    Parameters
551    ----------
552    pd_type: str
553        The Pandas datatype. This must be a string, not the actual dtype object.
554
555    flavor: str, default 'default'
556        The flavor of the database to be mapped to.
557
558    as_sqlalchemy: bool, default False
559        If `True`, return a type from `sqlalchemy.types`.
560
561    Returns
562    -------
563    The database data type for the incoming Pandas data type.
564    If nothing can be found, a warning will be thrown and 'TEXT' will be returned.
565    """
566    from meerschaum.utils.warnings import warn
567    from meerschaum.utils.packages import attempt_import
568    from meerschaum.utils.dtypes import are_dtypes_equal, MRSM_ALIAS_DTYPES
569    from meerschaum.utils.misc import parse_arguments_str
570    sqlalchemy_types = attempt_import('sqlalchemy.types', lazy=False)
571
572    types_registry = (
573        PD_TO_DB_DTYPES_FLAVORS
574        if not as_sqlalchemy
575        else PD_TO_SQLALCHEMY_DTYPES_FLAVORS
576    )
577
578    precision, scale = None, None
579    og_pd_type = pd_type
580    if pd_type in MRSM_ALIAS_DTYPES:
581        pd_type = MRSM_ALIAS_DTYPES[pd_type]
582
583    ### Check whether we are able to match this type (e.g. pyarrow support).
584    found_db_type = False
585    if pd_type not in types_registry and not pd_type.startswith('numeric['):
586        for mapped_pd_type in types_registry:
587            if are_dtypes_equal(mapped_pd_type, pd_type):
588                pd_type = mapped_pd_type
589                found_db_type = True
590                break
591    elif pd_type.startswith('numeric['):
592        og_pd_type = pd_type
593        pd_type = 'numeric'
594        precision, scale = get_numeric_precision_scale(flavor, og_pd_type)
595        found_db_type = True
596    else:
597        found_db_type = True
598
599    if not found_db_type:
600        warn(f"Unknown Pandas data type '{pd_type}'. Falling back to 'TEXT'.", stacklevel=3)
601        return (
602            'TEXT'
603            if not as_sqlalchemy
604            else sqlalchemy_types.UnicodeText
605        )
606    flavor_types = types_registry.get(
607        pd_type,
608        {
609            'default': (
610                'TEXT'
611                if not as_sqlalchemy
612                else 'UnicodeText'
613            ),
614        },
615    )
616    default_flavor_type = flavor_types.get(
617        'default',
618        (
619            'TEXT'
620            if not as_sqlalchemy
621            else 'UnicodeText'
622        ),
623    )
624    if flavor not in flavor_types:
625        warn(f"Unknown flavor '{flavor}'. Falling back to '{default_flavor_type}' (default).")
626    db_type = flavor_types.get(flavor, default_flavor_type)
627    if not as_sqlalchemy:
628        if precision is not None and scale is not None:
629            db_type_bare = db_type.split('(', maxsplit=1)[0]
630            return f"{db_type_bare}({precision},{scale})"
631        return db_type
632
633    if db_type.startswith('sqlalchemy.dialects'):
634        dialect, typ_class_name = db_type.replace('sqlalchemy.dialects.', '').split('.', maxsplit=2)
635        cls_args, cls_kwargs = None, None
636        if '(' in typ_class_name:
637            typ_class_name, args_str = typ_class_name.split('(', maxsplit=1)
638            args_str = args_str.rstrip(')')
639            cls_args, cls_kwargs = parse_arguments_str(args_str)
640        sqlalchemy_dialects_flavor_module = attempt_import(f'sqlalchemy.dialects.{dialect}')
641        cls = getattr(sqlalchemy_dialects_flavor_module, typ_class_name)
642        if cls_args is None:
643            return cls
644        return cls(*cls_args, **cls_kwargs)
645
646    if 'numeric' in db_type.lower():
647        if precision is None or scale is None:
648            return sqlalchemy_types.Numeric
649        return sqlalchemy_types.Numeric(precision, scale)
650
651    cls_args, cls_kwargs = None, None
652    typ_class_name = db_type
653    if '(' in db_type:
654        typ_class_name, args_str = db_type.split('(', maxsplit=1)
655        args_str = args_str.rstrip(')')
656        cls_args, cls_kwargs = parse_arguments_str(args_str)
657
658    cls = getattr(sqlalchemy_types, typ_class_name)
659    if cls_args is None:
660        return cls
661    return cls(*cls_args, **cls_kwargs)
662
663
664def get_numeric_precision_scale(
665    flavor: str,
666    dtype: Optional[str] = None,
667) -> Union[Tuple[int, int], Tuple[None, None]]:
668    """
669    Return the precision and scale to use for a numeric column for a given database flavor.
670
671    Parameters
672    ----------
673    flavor: str
674        The database flavor for which to return the precision and scale.
675    
676    dtype: Optional[str], default None
677        If provided, return the precision and scale provided in the dtype (if applicable).
678        If all caps, treat this as a DB type.
679
680    Returns
681    -------
682    A tuple of ints or a tuple of Nones.
683    """
684    if not dtype:
685        return None, None
686
687    lbracket = '[' if '[' in dtype else '('
688    rbracket = ']' if lbracket == '[' else ')'
689    if lbracket in dtype and dtype.count(',') == 1 and dtype.endswith(rbracket):
690        try:
691            parts = dtype.split(lbracket, maxsplit=1)[-1].rstrip(rbracket).split(',', maxsplit=1)
692            return int(parts[0].strip()), int(parts[1].strip())
693        except Exception:
694            pass
695
696    return NUMERIC_PRECISION_FLAVORS.get(flavor, (None, None))
NUMERIC_PRECISION_FLAVORS: Dict[str, Tuple[int, int]] = {'mariadb': (38, 20), 'mysql': (38, 20), 'mssql': (28, 10)}
NUMERIC_AS_TEXT_FLAVORS = {'duckdb', 'sqlite'}
TIMEZONE_NAIVE_FLAVORS = {'oracle', 'mysql', 'mariadb'}
DB_FLAVORS_CAST_DTYPES = {'mariadb': {'BIGINT': 'DECIMAL', 'TINYINT': 'SIGNED INT', 'TEXT': 'CHAR(10000) CHARACTER SET utf8', 'BOOL': 'SIGNED INT', 'BOOLEAN': 'SIGNED INT', 'DOUBLE PRECISION': 'DECIMAL', 'DOUBLE': 'DECIMAL', 'FLOAT': 'DECIMAL', 'NUMERIC': 'NUMERIC(38, 20)', 'DECIMAL': 'DECIMAL(38, 20)'}, 'mysql': {'BIGINT': 'DECIMAL', 'TINYINT': 'SIGNED INT', 'TEXT': 'CHAR(10000) CHARACTER SET utf8', 'INT': 'SIGNED INT', 'INTEGER': 'SIGNED INT', 'BOOL': 'SIGNED INT', 'BOOLEAN': 'SIGNED INT', 'DOUBLE PRECISION': 'DECIMAL', 'DOUBLE': 'DECIMAL', 'FLOAT': 'DECIMAL', 'NUMERIC': 'NUMERIC(38, 20)', 'DECIMAL': 'DECIMAL(38, 20)'}, 'sqlite': {'BOOLEAN': 'INTEGER', 'REAL': 'FLOAT'}, 'oracle': {'NVARCHAR(2000)': 'NVARCHAR2(2000)', 'NVARCHAR': 'NVARCHAR2(2000)', 'NVARCHAR2': 'NVARCHAR2(2000)', 'CHAR': 'CHAR(36)'}, 'mssql': {'NVARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)', 'NVARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)', 'VARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)', 'VARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)', 'NVARCHAR': 'NVARCHAR(MAX)', 'NUMERIC': 'NUMERIC(28, 10)', 'DECIMAL': 'DECIMAL(28, 10)'}}
DB_TO_PD_DTYPES: Dict[str, Union[str, Dict[str, str]]] = {'FLOAT': 'float64[pyarrow]', 'REAL': 'float64[pyarrow]', 'DOUBLE_PRECISION': 'float64[pyarrow]', 'DOUBLE': 'float64[pyarrow]', 'DECIMAL': 'numeric', 'BIGINT': 'int64[pyarrow]', 'INT': 'int64[pyarrow]', 'INTEGER': 'int64[pyarrow]', 'NUMBER': 'numeric', 'NUMERIC': 'numeric', 'TIMESTAMP': 'datetime64[ns]', 'TIMESTAMP WITHOUT TIMEZONE': 'datetime64[ns]', 'TIMESTAMP WITH TIMEZONE': 'datetime64[ns, UTC]', 'TIMESTAMP WITH TIME ZONE': 'datetime64[ns, UTC]', 'TIMESTAMPTZ': 'datetime64[ns, UTC]', 'DATE': 'datetime64[ns]', 'DATETIME': 'datetime64[ns]', 'DATETIME2': 'datetime64[ns]', 'DATETIMEOFFSET': 'datetime64[ns, UTC]', 'TEXT': 'string[pyarrow]', 'VARCHAR': 'string[pyarrow]', 'CLOB': 'string[pyarrow]', 'BOOL': 'bool[pyarrow]', 'BOOLEAN': 'bool[pyarrow]', 'BOOLEAN()': 'bool[pyarrow]', 'TINYINT': 'bool[pyarrow]', 'TINYINT(1)': 'bool[pyarrow]', 'BIT': 'bool[pyarrow]', 'BIT(1)': 'bool[pyarrow]', 'JSON': 'json', 'JSONB': 'json', 'UUID': 'uuid', 'UNIQUEIDENTIFIER': 'uuid', 'BYTEA': 'bytes', 'BLOB': 'bytes', 'VARBINARY': 'bytes', 'VARBINARY(MAX)': 'bytes', 'substrings': {'CHAR': 'string[pyarrow]', 'TIMESTAMP': 'datetime64[ns]', 'TIME': 'datetime64[ns]', 'DATE': 'datetime64[ns]', 'DOUBLE': 'double[pyarrow]', 'DECIMAL': 'numeric', 'NUMERIC': 'numeric', 'NUMBER': 'numeric', 'INT': 'int64[pyarrow]', 'BOOL': 'bool[pyarrow]', 'JSON': 'json', 'BYTE': 'bytes', 'LOB': 'bytes', 'BINARY': 'bytes'}, 'default': 'object'}
PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {'int': {'timescaledb': 'BIGINT', 'postgresql': 'BIGINT', 'mariadb': 'BIGINT', 'mysql': 'BIGINT', 'mssql': 'BIGINT', 'oracle': 'INT', 'sqlite': 'BIGINT', 'duckdb': 'BIGINT', 'citus': 'BIGINT', 'cockroachdb': 'BIGINT', 'default': 'INT'}, 'float': {'timescaledb': 'DOUBLE PRECISION', 'postgresql': 'DOUBLE PRECISION', 'mariadb': 'DOUBLE PRECISION', 'mysql': 'DOUBLE PRECISION', 'mssql': 'FLOAT', 'oracle': 'FLOAT', 'sqlite': 'FLOAT', 'duckdb': 'DOUBLE PRECISION', 'citus': 'DOUBLE PRECISION', 'cockroachdb': 'DOUBLE PRECISION', 'default': 'DOUBLE'}, 'double': {'timescaledb': 'DOUBLE PRECISION', 'postgresql': 'DOUBLE PRECISION', 'mariadb': 'DOUBLE PRECISION', 'mysql': 'DOUBLE PRECISION', 'mssql': 'FLOAT', 'oracle': 'FLOAT', 'sqlite': 'FLOAT', 'duckdb': 'DOUBLE PRECISION', 'citus': 'DOUBLE PRECISION', 'cockroachdb': 'DOUBLE PRECISION', 'default': 'DOUBLE'}, 'datetime64[ns]': {'timescaledb': 'TIMESTAMP', 'postgresql': 'TIMESTAMP', 'mariadb': 'DATETIME', 'mysql': 'DATETIME', 'mssql': 'DATETIME2', 'oracle': 'TIMESTAMP(9)', 'sqlite': 'DATETIME', 'duckdb': 'TIMESTAMP', 'citus': 'TIMESTAMP', 'cockroachdb': 'TIMESTAMP', 'default': 'DATETIME'}, 'datetime64[ns, UTC]': {'timescaledb': 'TIMESTAMPTZ', 'postgresql': 'TIMESTAMPTZ', 'mariadb': 'DATETIME', 'mysql': 'DATETIME', 'mssql': 'DATETIMEOFFSET', 'oracle': 'TIMESTAMP(9)', 'sqlite': 'TIMESTAMP', 'duckdb': 'TIMESTAMPTZ', 'citus': 'TIMESTAMPTZ', 'cockroachdb': 'TIMESTAMPTZ', 'default': 'TIMESTAMPTZ'}, 'datetime': {'timescaledb': 'TIMESTAMPTZ', 'postgresql': 'TIMESTAMPTZ', 'mariadb': 'DATETIME', 'mysql': 'DATETIME', 'mssql': 'DATETIMEOFFSET', 'oracle': 'TIMESTAMP(9)', 'sqlite': 'TIMESTAMP', 'duckdb': 'TIMESTAMPTZ', 'citus': 'TIMESTAMPTZ', 'cockroachdb': 'TIMESTAMPTZ', 'default': 'TIMESTAMPTZ'}, 'datetimetz': {'timescaledb': 'TIMESTAMPTZ', 'postgresql': 'TIMESTAMPTZ', 'mariadb': 'DATETIME', 'mysql': 'DATETIME', 'mssql': 'DATETIMEOFFSET', 'oracle': 'TIMESTAMP(9)', 'sqlite': 'TIMESTAMP', 'duckdb': 'TIMESTAMPTZ', 'citus': 'TIMESTAMPTZ', 'cockroachdb': 'TIMESTAMPTZ', 'default': 'TIMESTAMPTZ'}, 'bool': {'timescaledb': 'BOOLEAN', 'postgresql': 'BOOLEAN', 'mariadb': 'BOOLEAN', 'mysql': 'BOOLEAN', 'mssql': 'BIT', 'oracle': 'INTEGER', 'sqlite': 'FLOAT', 'duckdb': 'BOOLEAN', 'citus': 'BOOLEAN', 'cockroachdb': 'BOOLEAN', 'default': 'BOOLEAN'}, 'object': {'timescaledb': 'TEXT', 'postgresql': 'TEXT', 'mariadb': 'TEXT', 'mysql': 'TEXT', 'mssql': 'NVARCHAR(MAX)', 'oracle': 'NVARCHAR2(2000)', 'sqlite': 'TEXT', 'duckdb': 'TEXT', 'citus': 'TEXT', 'cockroachdb': 'TEXT', 'default': 'TEXT'}, 'string': {'timescaledb': 'TEXT', 'postgresql': 'TEXT', 'mariadb': 'TEXT', 'mysql': 'TEXT', 'mssql': 'NVARCHAR(MAX)', 'oracle': 'NVARCHAR2(2000)', 'sqlite': 'TEXT', 'duckdb': 'TEXT', 'citus': 'TEXT', 'cockroachdb': 'TEXT', 'default': 'TEXT'}, 'unicode': {'timescaledb': 'TEXT', 'postgresql': 'TEXT', 'mariadb': 'TEXT', 'mysql': 'TEXT', 'mssql': 'NVARCHAR(MAX)', 'oracle': 'NVARCHAR2(2000)', 'sqlite': 'TEXT', 'duckdb': 'TEXT', 'citus': 'TEXT', 'cockroachdb': 'TEXT', 'default': 'TEXT'}, 'json': {'timescaledb': 'JSONB', 'postgresql': 'JSONB', 'mariadb': 'TEXT', 'mysql': 'TEXT', 'mssql': 'NVARCHAR(MAX)', 'oracle': 'NVARCHAR2(2000)', 'sqlite': 'TEXT', 'duckdb': 'TEXT', 'citus': 'JSONB', 'cockroachdb': 'JSONB', 'default': 'TEXT'}, 'numeric': {'timescaledb': 'NUMERIC', 'postgresql': 'NUMERIC', 'mariadb': 'DECIMAL(38, 20)', 'mysql': 'DECIMAL(38, 20)', 'mssql': 'NUMERIC(28, 10)', 'oracle': 'NUMBER', 'sqlite': 'TEXT', 'duckdb': 'TEXT', 'citus': 'NUMERIC', 'cockroachdb': 'NUMERIC', 'default': 'NUMERIC'}, 'uuid': {'timescaledb': 'UUID', 'postgresql': 'UUID', 'mariadb': 'CHAR(36)', 'mysql': 'CHAR(36)', 'mssql': 'UNIQUEIDENTIFIER', 'oracle': 'CHAR(36)', 'sqlite': 'TEXT', 'duckdb': 'VARCHAR', 'citus': 'UUID', 'cockroachdb': 'UUID', 'default': 'TEXT'}, 'bytes': {'timescaledb': 'BYTEA', 'postgresql': 'BYTEA', 'mariadb': 'BLOB', 'mysql': 'BLOB', 'mssql': 'VARBINARY(MAX)', 'oracle': 'BLOB', 'sqlite': 'BLOB', 'duckdb': 'BLOB', 'citus': 'BYTEA', 'cockroachdb': 'BYTEA', 'default': 'BLOB'}}
PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {'int': {'timescaledb': 'BigInteger', 'postgresql': 'BigInteger', 'mariadb': 'BigInteger', 'mysql': 'BigInteger', 'mssql': 'BigInteger', 'oracle': 'BigInteger', 'sqlite': 'BigInteger', 'duckdb': 'BigInteger', 'citus': 'BigInteger', 'cockroachdb': 'BigInteger', 'default': 'BigInteger'}, 'float': {'timescaledb': 'Float', 'postgresql': 'Float', 'mariadb': 'Float', 'mysql': 'Float', 'mssql': 'Float', 'oracle': 'Float', 'sqlite': 'Float', 'duckdb': 'Float', 'citus': 'Float', 'cockroachdb': 'Float', 'default': 'Float'}, 'datetime': {'timescaledb': 'DateTime(timezone=True)', 'postgresql': 'DateTime(timezone=True)', 'mariadb': 'DateTime(timezone=True)', 'mysql': 'DateTime(timezone=True)', 'mssql': 'sqlalchemy.dialects.mssql.DATETIMEOFFSET', 'oracle': 'sqlalchemy.dialects.oracle.TIMESTAMP(timezone=True)', 'sqlite': 'DateTime(timezone=True)', 'duckdb': 'DateTime(timezone=True)', 'citus': 'DateTime(timezone=True)', 'cockroachdb': 'DateTime(timezone=True)', 'default': 'DateTime(timezone=True)'}, 'datetime64[ns]': {'timescaledb': 'DateTime', 'postgresql': 'DateTime', 'mariadb': 'DateTime', 'mysql': 'DateTime', 'mssql': 'sqlalchemy.dialects.mssql.DATETIME2', 'oracle': 'DateTime', 'sqlite': 'DateTime', 'duckdb': 'DateTime', 'citus': 'DateTime', 'cockroachdb': 'DateTime', 'default': 'DateTime'}, 'datetime64[ns, UTC]': {'timescaledb': 'DateTime(timezone=True)', 'postgresql': 'DateTime(timezone=True)', 'mariadb': 'DateTime(timezone=True)', 'mysql': 'DateTime(timezone=True)', 'mssql': 'sqlalchemy.dialects.mssql.DATETIMEOFFSET', 'oracle': 'sqlalchemy.dialects.oracle.TIMESTAMP(timezone=True)', 'sqlite': 'DateTime(timezone=True)', 'duckdb': 'DateTime(timezone=True)', 'citus': 'DateTime(timezone=True)', 'cockroachdb': 'DateTime(timezone=True)', 'default': 'DateTime(timezone=True)'}, 'bool': {'timescaledb': 'Boolean', 'postgresql': 'Boolean', 'mariadb': 'Integer', 'mysql': 'Integer', 'mssql': 'sqlalchemy.dialects.mssql.BIT', 'oracle': 'Integer', 'sqlite': 'Float', 'duckdb': 'Boolean', 'citus': 'Boolean', 'cockroachdb': 'Boolean', 'default': 'Boolean'}, 'object': {'timescaledb': 'UnicodeText', 'postgresql': 'UnicodeText', 'mariadb': 'UnicodeText', 'mysql': 'UnicodeText', 'mssql': 'UnicodeText', 'oracle': 'UnicodeText', 'sqlite': 'UnicodeText', 'duckdb': 'UnicodeText', 'citus': 'UnicodeText', 'cockroachdb': 'UnicodeText', 'default': 'UnicodeText'}, 'string': {'timescaledb': 'UnicodeText', 'postgresql': 'UnicodeText', 'mariadb': 'UnicodeText', 'mysql': 'UnicodeText', 'mssql': 'UnicodeText', 'oracle': 'UnicodeText', 'sqlite': 'UnicodeText', 'duckdb': 'UnicodeText', 'citus': 'UnicodeText', 'cockroachdb': 'UnicodeText', 'default': 'UnicodeText'}, 'json': {'timescaledb': 'sqlalchemy.dialects.postgresql.JSONB', 'postgresql': 'sqlalchemy.dialects.postgresql.JSONB', 'mariadb': 'UnicodeText', 'mysql': 'UnicodeText', 'mssql': 'UnicodeText', 'oracle': 'UnicodeText', 'sqlite': 'UnicodeText', 'duckdb': 'TEXT', 'citus': 'sqlalchemy.dialects.postgresql.JSONB', 'cockroachdb': 'sqlalchemy.dialects.postgresql.JSONB', 'default': 'UnicodeText'}, 'numeric': {'timescaledb': 'Numeric', 'postgresql': 'Numeric', 'mariadb': 'Numeric', 'mysql': 'Numeric', 'mssql': 'Numeric', 'oracle': 'Numeric', 'sqlite': 'UnicodeText', 'duckdb': 'Numeric', 'citus': 'Numeric', 'cockroachdb': 'Numeric', 'default': 'Numeric'}, 'uuid': {'timescaledb': 'Uuid', 'postgresql': 'Uuid', 'mariadb': 'sqlalchemy.dialects.mysql.CHAR(36)', 'mysql': 'sqlalchemy.dialects.mysql.CHAR(36)', 'mssql': 'Uuid', 'oracle': 'sqlalchemy.dialects.oracle.CHAR(36)', 'sqlite': 'UnicodeText', 'duckdb': 'UnicodeText', 'citus': 'Uuid', 'cockroachdb': 'Uuid', 'default': 'Uuid'}, 'bytes': {'timescaledb': 'LargeBinary', 'postgresql': 'LargeBinary', 'mariadb': 'LargeBinary', 'mysql': 'LargeBinary', 'mssql': 'LargeBinary', 'oracle': 'LargeBinary', 'sqlite': 'LargeBinary', 'duckdb': 'LargeBinary', 'citus': 'LargeBinary', 'cockroachdb': 'LargeBinary', 'default': 'LargeBinary'}}
AUTO_INCREMENT_COLUMN_FLAVORS: Dict[str, str] = {'timescaledb': 'GENERATED BY DEFAULT AS IDENTITY', 'postgresql': 'GENERATED BY DEFAULT AS IDENTITY', 'mariadb': 'AUTO_INCREMENT', 'mysql': 'AUTO_INCREMENT', 'mssql': 'IDENTITY(1,1)', 'oracle': 'GENERATED BY DEFAULT ON NULL AS IDENTITY', 'sqlite': 'AUTOINCREMENT', 'duckdb': 'GENERATED BY DEFAULT', 'citus': 'GENERATED BY DEFAULT', 'cockroachdb': 'GENERATED BY DEFAULT AS IDENTITY', 'default': 'GENERATED BY DEFAULT AS IDENTITY'}
def get_pd_type_from_db_type(db_type: str, allow_custom_dtypes: bool = True) -> str:
500def get_pd_type_from_db_type(db_type: str, allow_custom_dtypes: bool = True) -> str:
501    """
502    Parse a database type to a pandas data type.
503
504    Parameters
505    ----------
506    db_type: str
507        The database type, e.g. `DATETIME`, `BIGINT`, etc.
508
509    allow_custom_dtypes: bool, default False
510        If `True`, allow for custom data types like `json` and `str`.
511
512    Returns
513    -------
514    The equivalent datatype for a pandas DataFrame.
515    """
516    from meerschaum.utils.dtypes import are_dtypes_equal
517    def parse_custom(_pd_type: str, _db_type: str) -> str:
518        if 'json' in _db_type.lower():
519            return 'json'
520        if are_dtypes_equal(_pd_type, 'numeric') and _pd_type != 'object':
521            precision, scale = get_numeric_precision_scale(None, dtype=_db_type.upper())
522            if precision and scale:
523                return f"numeric[{precision},{scale}]"
524        return _pd_type
525
526    pd_type = DB_TO_PD_DTYPES.get(db_type.upper().split('(', maxsplit=1)[0].strip(), None)
527    if pd_type is not None:
528        return (
529            parse_custom(pd_type, db_type)
530            if allow_custom_dtypes
531            else pd_type
532        )
533    for db_t, pd_t in DB_TO_PD_DTYPES['substrings'].items():
534        if db_t in db_type.upper():
535            return (
536                parse_custom(pd_t, db_t)
537                if allow_custom_dtypes
538                else pd_t
539            )
540    return DB_TO_PD_DTYPES['default']

Parse a database type to a pandas data type.

Parameters
  • db_type (str): The database type, e.g. DATETIME, BIGINT, etc.
  • allow_custom_dtypes (bool, default False): If True, allow for custom data types like json and str.
Returns
  • The equivalent datatype for a pandas DataFrame.
def get_db_type_from_pd_type( pd_type: str, flavor: str = 'default', as_sqlalchemy: bool = False) -> "Union[str, 'sqlalchemy.sql.visitors.TraversibleType']":
543def get_db_type_from_pd_type(
544    pd_type: str,
545    flavor: str = 'default',
546    as_sqlalchemy: bool = False,
547) -> Union[str, 'sqlalchemy.sql.visitors.TraversibleType']:
548    """
549    Parse a Pandas data type into a flavor's database type.
550
551    Parameters
552    ----------
553    pd_type: str
554        The Pandas datatype. This must be a string, not the actual dtype object.
555
556    flavor: str, default 'default'
557        The flavor of the database to be mapped to.
558
559    as_sqlalchemy: bool, default False
560        If `True`, return a type from `sqlalchemy.types`.
561
562    Returns
563    -------
564    The database data type for the incoming Pandas data type.
565    If nothing can be found, a warning will be thrown and 'TEXT' will be returned.
566    """
567    from meerschaum.utils.warnings import warn
568    from meerschaum.utils.packages import attempt_import
569    from meerschaum.utils.dtypes import are_dtypes_equal, MRSM_ALIAS_DTYPES
570    from meerschaum.utils.misc import parse_arguments_str
571    sqlalchemy_types = attempt_import('sqlalchemy.types', lazy=False)
572
573    types_registry = (
574        PD_TO_DB_DTYPES_FLAVORS
575        if not as_sqlalchemy
576        else PD_TO_SQLALCHEMY_DTYPES_FLAVORS
577    )
578
579    precision, scale = None, None
580    og_pd_type = pd_type
581    if pd_type in MRSM_ALIAS_DTYPES:
582        pd_type = MRSM_ALIAS_DTYPES[pd_type]
583
584    ### Check whether we are able to match this type (e.g. pyarrow support).
585    found_db_type = False
586    if pd_type not in types_registry and not pd_type.startswith('numeric['):
587        for mapped_pd_type in types_registry:
588            if are_dtypes_equal(mapped_pd_type, pd_type):
589                pd_type = mapped_pd_type
590                found_db_type = True
591                break
592    elif pd_type.startswith('numeric['):
593        og_pd_type = pd_type
594        pd_type = 'numeric'
595        precision, scale = get_numeric_precision_scale(flavor, og_pd_type)
596        found_db_type = True
597    else:
598        found_db_type = True
599
600    if not found_db_type:
601        warn(f"Unknown Pandas data type '{pd_type}'. Falling back to 'TEXT'.", stacklevel=3)
602        return (
603            'TEXT'
604            if not as_sqlalchemy
605            else sqlalchemy_types.UnicodeText
606        )
607    flavor_types = types_registry.get(
608        pd_type,
609        {
610            'default': (
611                'TEXT'
612                if not as_sqlalchemy
613                else 'UnicodeText'
614            ),
615        },
616    )
617    default_flavor_type = flavor_types.get(
618        'default',
619        (
620            'TEXT'
621            if not as_sqlalchemy
622            else 'UnicodeText'
623        ),
624    )
625    if flavor not in flavor_types:
626        warn(f"Unknown flavor '{flavor}'. Falling back to '{default_flavor_type}' (default).")
627    db_type = flavor_types.get(flavor, default_flavor_type)
628    if not as_sqlalchemy:
629        if precision is not None and scale is not None:
630            db_type_bare = db_type.split('(', maxsplit=1)[0]
631            return f"{db_type_bare}({precision},{scale})"
632        return db_type
633
634    if db_type.startswith('sqlalchemy.dialects'):
635        dialect, typ_class_name = db_type.replace('sqlalchemy.dialects.', '').split('.', maxsplit=2)
636        cls_args, cls_kwargs = None, None
637        if '(' in typ_class_name:
638            typ_class_name, args_str = typ_class_name.split('(', maxsplit=1)
639            args_str = args_str.rstrip(')')
640            cls_args, cls_kwargs = parse_arguments_str(args_str)
641        sqlalchemy_dialects_flavor_module = attempt_import(f'sqlalchemy.dialects.{dialect}')
642        cls = getattr(sqlalchemy_dialects_flavor_module, typ_class_name)
643        if cls_args is None:
644            return cls
645        return cls(*cls_args, **cls_kwargs)
646
647    if 'numeric' in db_type.lower():
648        if precision is None or scale is None:
649            return sqlalchemy_types.Numeric
650        return sqlalchemy_types.Numeric(precision, scale)
651
652    cls_args, cls_kwargs = None, None
653    typ_class_name = db_type
654    if '(' in db_type:
655        typ_class_name, args_str = db_type.split('(', maxsplit=1)
656        args_str = args_str.rstrip(')')
657        cls_args, cls_kwargs = parse_arguments_str(args_str)
658
659    cls = getattr(sqlalchemy_types, typ_class_name)
660    if cls_args is None:
661        return cls
662    return cls(*cls_args, **cls_kwargs)

Parse a Pandas data type into a flavor's database type.

Parameters
  • pd_type (str): The Pandas datatype. This must be a string, not the actual dtype object.
  • flavor (str, default 'default'): The flavor of the database to be mapped to.
  • as_sqlalchemy (bool, default False): If True, return a type from sqlalchemy.types.
Returns
  • The database data type for the incoming Pandas data type.
  • If nothing can be found, a warning will be thrown and 'TEXT' will be returned.
def get_numeric_precision_scale( flavor: str, dtype: Optional[str] = None) -> Union[Tuple[int, int], Tuple[NoneType, NoneType]]:
665def get_numeric_precision_scale(
666    flavor: str,
667    dtype: Optional[str] = None,
668) -> Union[Tuple[int, int], Tuple[None, None]]:
669    """
670    Return the precision and scale to use for a numeric column for a given database flavor.
671
672    Parameters
673    ----------
674    flavor: str
675        The database flavor for which to return the precision and scale.
676    
677    dtype: Optional[str], default None
678        If provided, return the precision and scale provided in the dtype (if applicable).
679        If all caps, treat this as a DB type.
680
681    Returns
682    -------
683    A tuple of ints or a tuple of Nones.
684    """
685    if not dtype:
686        return None, None
687
688    lbracket = '[' if '[' in dtype else '('
689    rbracket = ']' if lbracket == '[' else ')'
690    if lbracket in dtype and dtype.count(',') == 1 and dtype.endswith(rbracket):
691        try:
692            parts = dtype.split(lbracket, maxsplit=1)[-1].rstrip(rbracket).split(',', maxsplit=1)
693            return int(parts[0].strip()), int(parts[1].strip())
694        except Exception:
695            pass
696
697    return NUMERIC_PRECISION_FLAVORS.get(flavor, (None, None))

Return the precision and scale to use for a numeric column for a given database flavor.

Parameters
  • flavor (str): The database flavor for which to return the precision and scale.
  • dtype (Optional[str], default None): If provided, return the precision and scale provided in the dtype (if applicable). If all caps, treat this as a DB type.
Returns
  • A tuple of ints or a tuple of Nones.