meerschaum.utils.dtypes.sql

Utility functions for working with SQL data types.

  1#! /usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3# vim:fenc=utf-8
  4
  5"""
  6Utility functions for working with SQL data types.
  7"""
  8
  9from __future__ import annotations
 10from meerschaum.utils.typing import Dict, Union, Tuple
 11
 12NUMERIC_PRECISION_FLAVORS: Dict[str, Tuple[int, int]] = {
 13    'mariadb': (38, 20),
 14    'mysql': (38, 20),
 15    'mssql': (28, 10),
 16    'duckdb': (15, 4),
 17    'sqlite': (15, 4),
 18}
 19
 20### MySQL doesn't allow for casting as BIGINT, so this is a workaround.
 21DB_FLAVORS_CAST_DTYPES = {
 22    'mariadb': {
 23        'BIGINT': 'DECIMAL',
 24        'TINYINT': 'SIGNED INT',
 25        'TEXT': 'CHAR(10000) CHARACTER SET utf8',
 26        'BOOL': 'SIGNED INT',
 27        'BOOLEAN': 'SIGNED INT',
 28        'DOUBLE PRECISION': 'DECIMAL',
 29        'DOUBLE': 'DECIMAL',
 30        'FLOAT': 'DECIMAL',
 31    },
 32    'mysql': {
 33        'BIGINT': 'DECIMAL',
 34        'TINYINT': 'SIGNED INT',
 35        'TEXT': 'CHAR(10000) CHARACTER SET utf8',
 36        'INT': 'SIGNED INT',
 37        'INTEGER': 'SIGNED INT',
 38        'BOOL': 'SIGNED INT',
 39        'BOOLEAN': 'SIGNED INT',
 40        'DOUBLE PRECISION': 'DECIMAL',
 41        'DOUBLE': 'DECIMAL',
 42        'FLOAT': 'DECIMAL',
 43    },
 44    'sqlite': {
 45        'BOOLEAN': 'INTEGER',
 46        'REAL': 'FLOAT',
 47    },
 48    'oracle': {
 49        'NVARCHAR(2000)': 'NVARCHAR2(2000)',
 50        'NVARCHAR': 'NVARCHAR2(2000)',
 51        'NVARCHAR2': 'NVARCHAR2(2000)',
 52    },
 53    'mssql': {
 54        'NVARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)',
 55        'NVARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)',
 56        'VARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)',
 57        'VARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)',
 58        'NVARCHAR': 'NVARCHAR(MAX)',
 59    },
 60}
 61for _flavor, (_precision, _scale) in NUMERIC_PRECISION_FLAVORS.items():
 62    if _flavor not in DB_FLAVORS_CAST_DTYPES:
 63        DB_FLAVORS_CAST_DTYPES[_flavor] = {}
 64    DB_FLAVORS_CAST_DTYPES[_flavor].update({
 65        'NUMERIC': f"NUMERIC({_precision}, {_scale})",
 66        'DECIMAL': f"DECIMAL({_precision}, {_scale})",
 67    })
 68
 69DB_TO_PD_DTYPES: Dict[str, Union[str, Dict[str, str]]] = {
 70    'FLOAT': 'float64[pyarrow]',
 71    'REAL': 'float64[pyarrow]',
 72    'DOUBLE_PRECISION': 'float64[pyarrow]',
 73    'DOUBLE': 'float64[pyarrow]',
 74    'DECIMAL': 'numeric',
 75    'BIGINT': 'int64[pyarrow]',
 76    'INT': 'int64[pyarrow]',
 77    'INTEGER': 'int64[pyarrow]',
 78    'NUMBER': 'numeric',
 79    'NUMERIC': 'numeric',
 80    'TIMESTAMP': 'datetime64[ns]',
 81    'TIMESTAMP WITH TIMEZONE': 'datetime64[ns, UTC]',
 82    'TIMESTAMPTZ': 'datetime64[ns, UTC]',
 83    'DATE': 'datetime64[ns]',
 84    'DATETIME': 'datetime64[ns]',
 85    'DATETIME2': 'datetime64[ns]',
 86    'DATETIMEOFFSET': 'datetime64[ns, UTC]',
 87    'TEXT': 'string[pyarrow]',
 88    'VARCHAR': 'string[pyarrow]',
 89    'CLOB': 'string[pyarrow]',
 90    'BOOL': 'bool[pyarrow]',
 91    'BOOLEAN': 'bool[pyarrow]',
 92    'BOOLEAN()': 'bool[pyarrow]',
 93    'TINYINT': 'bool[pyarrow]',
 94    'TINYINT(1)': 'bool[pyarrow]',
 95    'BIT': 'bool[pyarrow]',
 96    'BIT(1)': 'bool[pyarrow]',
 97    'JSON': 'json',
 98    'JSONB': 'json',
 99    'UUID': 'uuid',
100    'UNIQUEIDENTIFIER': 'uuid',
101    'substrings': {
102        'CHAR': 'string[pyarrow]',
103        'TIMESTAMP': 'datetime64[ns]',
104        'TIME': 'datetime64[ns]',
105        'DATE': 'datetime64[ns]',
106        'DOUBLE': 'double[pyarrow]',
107        'DECIMAL': 'numeric',
108        'NUMERIC': 'numeric',
109        'NUMBER': 'numeric',
110        'INT': 'int64[pyarrow]',
111        'BOOL': 'bool[pyarrow]',
112        'JSON': 'json',
113    },
114    'default': 'object',
115}
116### Map pandas dtypes to flavor-specific dtypes.
117PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
118    'int': {
119        'timescaledb': 'BIGINT',
120        'postgresql': 'BIGINT',
121        'mariadb': 'BIGINT',
122        'mysql': 'BIGINT',
123        'mssql': 'BIGINT',
124        'oracle': 'INT',
125        'sqlite': 'BIGINT',
126        'duckdb': 'BIGINT',
127        'citus': 'BIGINT',
128        'cockroachdb': 'BIGINT',
129        'default': 'INT',
130    },
131    'float': {
132        'timescaledb': 'DOUBLE PRECISION',
133        'postgresql': 'DOUBLE PRECISION',
134        'mariadb': 'DOUBLE PRECISION',
135        'mysql': 'DOUBLE PRECISION',
136        'mssql': 'FLOAT',
137        'oracle': 'FLOAT',
138        'sqlite': 'FLOAT',
139        'duckdb': 'DOUBLE PRECISION',
140        'citus': 'DOUBLE PRECISION',
141        'cockroachdb': 'DOUBLE PRECISION',
142        'default': 'DOUBLE',
143    },
144    'double': {
145        'timescaledb': 'DOUBLE PRECISION',
146        'postgresql': 'DOUBLE PRECISION',
147        'mariadb': 'DOUBLE PRECISION',
148        'mysql': 'DOUBLE PRECISION',
149        'mssql': 'FLOAT',
150        'oracle': 'FLOAT',
151        'sqlite': 'FLOAT',
152        'duckdb': 'DOUBLE PRECISION',
153        'citus': 'DOUBLE PRECISION',
154        'cockroachdb': 'DOUBLE PRECISION',
155        'default': 'DOUBLE',
156    },
157    'datetime64[ns]': {
158        'timescaledb': 'TIMESTAMP',
159        'postgresql': 'TIMESTAMP',
160        'mariadb': 'DATETIME',
161        'mysql': 'DATETIME',
162        'mssql': 'DATETIME2',
163        'oracle': 'DATE',
164        'sqlite': 'DATETIME',
165        'duckdb': 'TIMESTAMP',
166        'citus': 'TIMESTAMP',
167        'cockroachdb': 'TIMESTAMP',
168        'default': 'DATETIME',
169    },
170    'datetime64[ns, UTC]': {
171        'timescaledb': 'TIMESTAMP',
172        'postgresql': 'TIMESTAMP',
173        'mariadb': 'TIMESTAMP',
174        'mysql': 'TIMESTAMP',
175        'mssql': 'DATETIMEOFFSET',
176        'oracle': 'TIMESTAMP',
177        'sqlite': 'TIMESTAMP',
178        'duckdb': 'TIMESTAMP',
179        'citus': 'TIMESTAMP',
180        'cockroachdb': 'TIMESTAMP',
181        'default': 'TIMESTAMP',
182    },
183    'bool': {
184        'timescaledb': 'BOOLEAN',
185        'postgresql': 'BOOLEAN',
186        'mariadb': 'BOOLEAN',
187        'mysql': 'BOOLEAN',
188        'mssql': 'INTEGER',
189        'oracle': 'INTEGER',
190        'sqlite': 'FLOAT',
191        'duckdb': 'BOOLEAN',
192        'citus': 'BOOLEAN',
193        'cockroachdb': 'BOOLEAN',
194        'default': 'BOOLEAN',
195    },
196    'object': {
197        'timescaledb': 'TEXT',
198        'postgresql': 'TEXT',
199        'mariadb': 'TEXT',
200        'mysql': 'TEXT',
201        'mssql': 'NVARCHAR(MAX)',
202        'oracle': 'NVARCHAR2(2000)',
203        'sqlite': 'TEXT',
204        'duckdb': 'TEXT',
205        'citus': 'TEXT',
206        'cockroachdb': 'TEXT',
207        'default': 'TEXT',
208    },
209    'string': {
210        'timescaledb': 'TEXT',
211        'postgresql': 'TEXT',
212        'mariadb': 'TEXT',
213        'mysql': 'TEXT',
214        'mssql': 'NVARCHAR(MAX)',
215        'oracle': 'NVARCHAR2(2000)',
216        'sqlite': 'TEXT',
217        'duckdb': 'TEXT',
218        'citus': 'TEXT',
219        'cockroachdb': 'TEXT',
220        'default': 'TEXT',
221    },
222    'json': {
223        'timescaledb': 'JSONB',
224        'postgresql': 'JSONB',
225        'mariadb': 'TEXT',
226        'mysql': 'TEXT',
227        'mssql': 'NVARCHAR(MAX)',
228        'oracle': 'NVARCHAR2(2000)',
229        'sqlite': 'TEXT',
230        'duckdb': 'TEXT',
231        'citus': 'JSONB',
232        'cockroachdb': 'JSONB',
233        'default': 'TEXT',
234    },
235    'numeric': {
236        'timescaledb': 'NUMERIC',
237        'postgresql': 'NUMERIC',
238        'mariadb': f'DECIMAL{NUMERIC_PRECISION_FLAVORS["mariadb"]}',
239        'mysql': f'DECIMAL{NUMERIC_PRECISION_FLAVORS["mysql"]}',
240        'mssql': f'NUMERIC{NUMERIC_PRECISION_FLAVORS["mssql"]}',
241        'oracle': 'NUMBER',
242        'sqlite': f'DECIMAL{NUMERIC_PRECISION_FLAVORS["sqlite"]}',
243        'duckdb': 'NUMERIC',
244        'citus': 'NUMERIC',
245        'cockroachdb': 'NUMERIC',
246        'default': 'NUMERIC',
247    },
248    'uuid': {
249        'timescaledb': 'UUID',
250        'postgresql': 'UUID',
251        'mariadb': 'CHAR(36)',
252        'mysql': 'CHAR(36)',
253        'mssql': 'UNIQUEIDENTIFIER',
254        ### I know this is too much space, but erring on the side of caution.
255        'oracle': 'NVARCHAR(2000)',
256        'sqlite': 'TEXT',
257        'duckdb': 'VARCHAR',
258        'citus': 'UUID',
259        'cockroachdb': 'UUID',
260        'default': 'TEXT',
261    },
262}
263PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
264    'int': {
265        'timescaledb': 'BigInteger',
266        'postgresql': 'BigInteger',
267        'mariadb': 'BigInteger',
268        'mysql': 'BigInteger',
269        'mssql': 'BigInteger',
270        'oracle': 'BigInteger',
271        'sqlite': 'BigInteger',
272        'duckdb': 'BigInteger',
273        'citus': 'BigInteger',
274        'cockroachdb': 'BigInteger',
275        'default': 'BigInteger',
276    },
277    'float': {
278        'timescaledb': 'Float',
279        'postgresql': 'Float',
280        'mariadb': 'Float',
281        'mysql': 'Float',
282        'mssql': 'Float',
283        'oracle': 'Float',
284        'sqlite': 'Float',
285        'duckdb': 'Float',
286        'citus': 'Float',
287        'cockroachdb': 'Float',
288        'default': 'Float',
289    },
290    'datetime64[ns]': {
291        'timescaledb': 'DateTime',
292        'postgresql': 'DateTime',
293        'mariadb': 'DateTime',
294        'mysql': 'DateTime',
295        'mssql': 'sqlalchemy.dialects.mssql.DATETIME2',
296        'oracle': 'DateTime',
297        'sqlite': 'DateTime',
298        'duckdb': 'DateTime',
299        'citus': 'DateTime',
300        'cockroachdb': 'DateTime',
301        'default': 'DateTime',
302    },
303    'datetime64[ns, UTC]': {
304        'timescaledb': 'DateTime',
305        'postgresql': 'DateTime',
306        'mariadb': 'DateTime',
307        'mysql': 'DateTime',
308        'mssql': 'sqlalchemy.dialects.mssql.DATETIMEOFFSET',
309        'oracle': 'DateTime',
310        'sqlite': 'DateTime',
311        'duckdb': 'DateTime',
312        'citus': 'DateTime',
313        'cockroachdb': 'DateTime',
314        'default': 'DateTime',
315    },
316    'bool': {
317        'timescaledb': 'Boolean',
318        'postgresql': 'Boolean',
319        'mariadb': 'Integer',
320        'mysql': 'Integer',
321        'mssql': 'Integer',
322        'oracle': 'Integer',
323        'sqlite': 'Float',
324        'duckdb': 'Boolean',
325        'citus': 'Boolean',
326        'cockroachdb': 'Boolean',
327        'default': 'Boolean',
328    },
329    'object': {
330        'timescaledb': 'UnicodeText',
331        'postgresql': 'UnicodeText',
332        'mariadb': 'UnicodeText',
333        'mysql': 'UnicodeText',
334        'mssql': 'UnicodeText',
335        'oracle': 'UnicodeText',
336        'sqlite': 'UnicodeText',
337        'duckdb': 'UnicodeText',
338        'citus': 'UnicodeText',
339        'cockroachdb': 'UnicodeText',
340        'default': 'UnicodeText',
341    },
342    'string': {
343        'timescaledb': 'UnicodeText',
344        'postgresql': 'UnicodeText',
345        'mariadb': 'UnicodeText',
346        'mysql': 'UnicodeText',
347        'mssql': 'UnicodeText',
348        'oracle': 'UnicodeText',
349        'sqlite': 'UnicodeText',
350        'duckdb': 'UnicodeText',
351        'citus': 'UnicodeText',
352        'cockroachdb': 'UnicodeText',
353        'default': 'UnicodeText',
354    },
355    'json': {
356        'timescaledb': 'sqlalchemy.dialects.postgresql.JSONB',
357        'postgresql': 'sqlalchemy.dialects.postgresql.JSONB',
358        'mariadb': 'UnicodeText',
359        'mysql': 'UnicodeText',
360        'mssql': 'UnicodeText',
361        'oracle': 'UnicodeText',
362        'sqlite': 'UnicodeText',
363        'duckdb': 'TEXT',
364        'citus': 'sqlalchemy.dialects.postgresql.JSONB',
365        'cockroachdb': 'sqlalchemy.dialects.postgresql.JSONB',
366        'default': 'UnicodeText',
367    },
368    'numeric': {
369        'timescaledb': 'Numeric',
370        'postgresql': 'Numeric',
371        'mariadb': 'Numeric',
372        'mysql': 'Numeric',
373        'mssql': 'Numeric',
374        'oracle': 'Numeric',
375        'sqlite': 'Numeric',
376        'duckdb': 'Numeric',
377        'citus': 'Numeric',
378        'cockroachdb': 'Numeric',
379        'default': 'Numeric',
380    },
381    'uuid': {
382        'timescaledb': 'Uuid',
383        'postgresql': 'Uuid',
384        'mariadb': 'sqlalchemy.dialects.mysql.CHAR(36)',
385        'mysql': 'sqlalchemy.dialects.mysql.CHAR(36)',
386        'mssql': 'Uuid',
387        'oracle': 'UnicodeText',
388        'sqlite': 'UnicodeText',
389        'duckdb': 'UnicodeText',
390        'citus': 'Uuid',
391        'cockroachdb': 'Uuid',
392        'default': 'Uuid',
393    },
394}
395
396
397def get_pd_type_from_db_type(db_type: str, allow_custom_dtypes: bool = False) -> str:
398    """
399    Parse a database type to a pandas data type.
400
401    Parameters
402    ----------
403    db_type: str
404        The database type, e.g. `DATETIME`, `BIGINT`, etc.
405
406    allow_custom_dtypes: bool, default False
407        If `True`, allow for custom data types like `json` and `str`.
408
409    Returns
410    -------
411    The equivalent datatype for a pandas DataFrame.
412    """
413    def parse_custom(_pd_type: str, _db_type: str) -> str:
414        if 'json' in _db_type.lower():
415            return 'json'
416        return _pd_type
417
418    pd_type = DB_TO_PD_DTYPES.get(db_type.upper(), None)
419    if pd_type is not None:
420        return (
421            parse_custom(pd_type, db_type)
422            if allow_custom_dtypes
423            else pd_type
424        )
425    for db_t, pd_t in DB_TO_PD_DTYPES['substrings'].items():
426        if db_t in db_type.upper():
427            return (
428                parse_custom(pd_t, db_t)
429                if allow_custom_dtypes
430                else pd_t
431            )
432    return DB_TO_PD_DTYPES['default']
433
434
435def get_db_type_from_pd_type(
436    pd_type: str,
437    flavor: str = 'default',
438    as_sqlalchemy: bool = False,
439) -> Union[str, 'sqlalchemy.sql.visitors.TraversibleType']:
440    """
441    Parse a Pandas data type into a flavor's database type.
442
443    Parameters
444    ----------
445    pd_type: str
446        The Pandas datatype. This must be a string, not the actual dtype object.
447
448    flavor: str, default 'default'
449        The flavor of the database to be mapped to.
450
451    as_sqlalchemy: bool, default False
452        If `True`, return a type from `sqlalchemy.types`.
453
454    Returns
455    -------
456    The database data type for the incoming Pandas data type.
457    If nothing can be found, a warning will be thrown and 'TEXT' will be returned.
458    """
459    import ast
460    from meerschaum.utils.warnings import warn
461    from meerschaum.utils.packages import attempt_import
462    from meerschaum.utils.dtypes import are_dtypes_equal
463    sqlalchemy_types = attempt_import('sqlalchemy.types')
464
465    types_registry = (
466        PD_TO_DB_DTYPES_FLAVORS
467        if not as_sqlalchemy
468        else PD_TO_SQLALCHEMY_DTYPES_FLAVORS
469    )
470
471    ### Check whether we are able to match this type (e.g. pyarrow support).
472    found_db_type = False
473    if pd_type not in types_registry:
474        for mapped_pd_type in types_registry:
475            if are_dtypes_equal(mapped_pd_type, pd_type):
476                pd_type = mapped_pd_type
477                found_db_type = True
478                break
479    else:
480        found_db_type = True
481
482    if not found_db_type:
483        warn(f"Unknown Pandas data type '{pd_type}'. Falling back to 'TEXT'.")
484        return (
485            'TEXT'
486            if not as_sqlalchemy
487            else sqlalchemy_types.UnicodeText
488        )
489    flavor_types = types_registry.get(
490        pd_type,
491        {
492            'default': (
493                'TEXT'
494                if not as_sqlalchemy
495                else 'UnicodeText'
496            ),
497        },
498    )
499    default_flavor_type = flavor_types.get(
500        'default',
501        (
502            'TEXT'
503            if not as_sqlalchemy
504            else 'UnicodeText'
505        ),
506    )
507    if flavor not in flavor_types:
508        warn(f"Unknown flavor '{flavor}'. Falling back to '{default_flavor_type}' (default).")
509    db_type = flavor_types.get(flavor, default_flavor_type)
510    if not as_sqlalchemy:
511        return db_type
512
513    if db_type.startswith('sqlalchemy.dialects'):
514        dialect, typ_class_name = db_type.replace('sqlalchemy.dialects.', '').split('.', maxsplit=2)
515        arg = None
516        if '(' in typ_class_name:
517            typ_class_name, arg_str = typ_class_name.split('(', maxsplit=1)
518            arg = ast.literal_eval(arg_str.rstrip(')'))
519        sqlalchemy_dialects_flavor_module = attempt_import(f'sqlalchemy.dialects.{dialect}')
520        cls = getattr(sqlalchemy_dialects_flavor_module, typ_class_name)
521        if arg is None:
522            return cls
523        return cls(arg)
524
525    if 'numeric' in db_type.lower():
526        numeric_type_str = PD_TO_DB_DTYPES_FLAVORS['numeric'].get(flavor, 'NUMERIC')
527        if flavor not in NUMERIC_PRECISION_FLAVORS:
528            return sqlalchemy_types.Numeric
529        precision, scale = NUMERIC_PRECISION_FLAVORS[flavor]
530        return sqlalchemy_types.Numeric(precision, scale)
531    return getattr(sqlalchemy_types, db_type)
NUMERIC_PRECISION_FLAVORS: Dict[str, Tuple[int, int]] = {'mariadb': (38, 20), 'mysql': (38, 20), 'mssql': (28, 10), 'duckdb': (15, 4), 'sqlite': (15, 4)}
DB_FLAVORS_CAST_DTYPES = {'mariadb': {'BIGINT': 'DECIMAL', 'TINYINT': 'SIGNED INT', 'TEXT': 'CHAR(10000) CHARACTER SET utf8', 'BOOL': 'SIGNED INT', 'BOOLEAN': 'SIGNED INT', 'DOUBLE PRECISION': 'DECIMAL', 'DOUBLE': 'DECIMAL', 'FLOAT': 'DECIMAL', 'NUMERIC': 'NUMERIC(38, 20)', 'DECIMAL': 'DECIMAL(38, 20)'}, 'mysql': {'BIGINT': 'DECIMAL', 'TINYINT': 'SIGNED INT', 'TEXT': 'CHAR(10000) CHARACTER SET utf8', 'INT': 'SIGNED INT', 'INTEGER': 'SIGNED INT', 'BOOL': 'SIGNED INT', 'BOOLEAN': 'SIGNED INT', 'DOUBLE PRECISION': 'DECIMAL', 'DOUBLE': 'DECIMAL', 'FLOAT': 'DECIMAL', 'NUMERIC': 'NUMERIC(38, 20)', 'DECIMAL': 'DECIMAL(38, 20)'}, 'sqlite': {'BOOLEAN': 'INTEGER', 'REAL': 'FLOAT', 'NUMERIC': 'NUMERIC(15, 4)', 'DECIMAL': 'DECIMAL(15, 4)'}, 'oracle': {'NVARCHAR(2000)': 'NVARCHAR2(2000)', 'NVARCHAR': 'NVARCHAR2(2000)', 'NVARCHAR2': 'NVARCHAR2(2000)'}, 'mssql': {'NVARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)', 'NVARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)', 'VARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)', 'VARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)', 'NVARCHAR': 'NVARCHAR(MAX)', 'NUMERIC': 'NUMERIC(28, 10)', 'DECIMAL': 'DECIMAL(28, 10)'}, 'duckdb': {'NUMERIC': 'NUMERIC(15, 4)', 'DECIMAL': 'DECIMAL(15, 4)'}}
DB_TO_PD_DTYPES: Dict[str, Union[str, Dict[str, str]]] = {'FLOAT': 'float64[pyarrow]', 'REAL': 'float64[pyarrow]', 'DOUBLE_PRECISION': 'float64[pyarrow]', 'DOUBLE': 'float64[pyarrow]', 'DECIMAL': 'numeric', 'BIGINT': 'int64[pyarrow]', 'INT': 'int64[pyarrow]', 'INTEGER': 'int64[pyarrow]', 'NUMBER': 'numeric', 'NUMERIC': 'numeric', 'TIMESTAMP': 'datetime64[ns]', 'TIMESTAMP WITH TIMEZONE': 'datetime64[ns, UTC]', 'TIMESTAMPTZ': 'datetime64[ns, UTC]', 'DATE': 'datetime64[ns]', 'DATETIME': 'datetime64[ns]', 'DATETIME2': 'datetime64[ns]', 'DATETIMEOFFSET': 'datetime64[ns, UTC]', 'TEXT': 'string[pyarrow]', 'VARCHAR': 'string[pyarrow]', 'CLOB': 'string[pyarrow]', 'BOOL': 'bool[pyarrow]', 'BOOLEAN': 'bool[pyarrow]', 'BOOLEAN()': 'bool[pyarrow]', 'TINYINT': 'bool[pyarrow]', 'TINYINT(1)': 'bool[pyarrow]', 'BIT': 'bool[pyarrow]', 'BIT(1)': 'bool[pyarrow]', 'JSON': 'json', 'JSONB': 'json', 'UUID': 'uuid', 'UNIQUEIDENTIFIER': 'uuid', 'substrings': {'CHAR': 'string[pyarrow]', 'TIMESTAMP': 'datetime64[ns]', 'TIME': 'datetime64[ns]', 'DATE': 'datetime64[ns]', 'DOUBLE': 'double[pyarrow]', 'DECIMAL': 'numeric', 'NUMERIC': 'numeric', 'NUMBER': 'numeric', 'INT': 'int64[pyarrow]', 'BOOL': 'bool[pyarrow]', 'JSON': 'json'}, 'default': 'object'}
PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {'int': {'timescaledb': 'BIGINT', 'postgresql': 'BIGINT', 'mariadb': 'BIGINT', 'mysql': 'BIGINT', 'mssql': 'BIGINT', 'oracle': 'INT', 'sqlite': 'BIGINT', 'duckdb': 'BIGINT', 'citus': 'BIGINT', 'cockroachdb': 'BIGINT', 'default': 'INT'}, 'float': {'timescaledb': 'DOUBLE PRECISION', 'postgresql': 'DOUBLE PRECISION', 'mariadb': 'DOUBLE PRECISION', 'mysql': 'DOUBLE PRECISION', 'mssql': 'FLOAT', 'oracle': 'FLOAT', 'sqlite': 'FLOAT', 'duckdb': 'DOUBLE PRECISION', 'citus': 'DOUBLE PRECISION', 'cockroachdb': 'DOUBLE PRECISION', 'default': 'DOUBLE'}, 'double': {'timescaledb': 'DOUBLE PRECISION', 'postgresql': 'DOUBLE PRECISION', 'mariadb': 'DOUBLE PRECISION', 'mysql': 'DOUBLE PRECISION', 'mssql': 'FLOAT', 'oracle': 'FLOAT', 'sqlite': 'FLOAT', 'duckdb': 'DOUBLE PRECISION', 'citus': 'DOUBLE PRECISION', 'cockroachdb': 'DOUBLE PRECISION', 'default': 'DOUBLE'}, 'datetime64[ns]': {'timescaledb': 'TIMESTAMP', 'postgresql': 'TIMESTAMP', 'mariadb': 'DATETIME', 'mysql': 'DATETIME', 'mssql': 'DATETIME2', 'oracle': 'DATE', 'sqlite': 'DATETIME', 'duckdb': 'TIMESTAMP', 'citus': 'TIMESTAMP', 'cockroachdb': 'TIMESTAMP', 'default': 'DATETIME'}, 'datetime64[ns, UTC]': {'timescaledb': 'TIMESTAMP', 'postgresql': 'TIMESTAMP', 'mariadb': 'TIMESTAMP', 'mysql': 'TIMESTAMP', 'mssql': 'DATETIMEOFFSET', 'oracle': 'TIMESTAMP', 'sqlite': 'TIMESTAMP', 'duckdb': 'TIMESTAMP', 'citus': 'TIMESTAMP', 'cockroachdb': 'TIMESTAMP', 'default': 'TIMESTAMP'}, 'bool': {'timescaledb': 'BOOLEAN', 'postgresql': 'BOOLEAN', 'mariadb': 'BOOLEAN', 'mysql': 'BOOLEAN', 'mssql': 'INTEGER', 'oracle': 'INTEGER', 'sqlite': 'FLOAT', 'duckdb': 'BOOLEAN', 'citus': 'BOOLEAN', 'cockroachdb': 'BOOLEAN', 'default': 'BOOLEAN'}, 'object': {'timescaledb': 'TEXT', 'postgresql': 'TEXT', 'mariadb': 'TEXT', 'mysql': 'TEXT', 'mssql': 'NVARCHAR(MAX)', 'oracle': 'NVARCHAR2(2000)', 'sqlite': 'TEXT', 'duckdb': 'TEXT', 'citus': 'TEXT', 'cockroachdb': 'TEXT', 'default': 'TEXT'}, 'string': {'timescaledb': 'TEXT', 'postgresql': 'TEXT', 'mariadb': 'TEXT', 'mysql': 'TEXT', 'mssql': 'NVARCHAR(MAX)', 'oracle': 'NVARCHAR2(2000)', 'sqlite': 'TEXT', 'duckdb': 'TEXT', 'citus': 'TEXT', 'cockroachdb': 'TEXT', 'default': 'TEXT'}, 'json': {'timescaledb': 'JSONB', 'postgresql': 'JSONB', 'mariadb': 'TEXT', 'mysql': 'TEXT', 'mssql': 'NVARCHAR(MAX)', 'oracle': 'NVARCHAR2(2000)', 'sqlite': 'TEXT', 'duckdb': 'TEXT', 'citus': 'JSONB', 'cockroachdb': 'JSONB', 'default': 'TEXT'}, 'numeric': {'timescaledb': 'NUMERIC', 'postgresql': 'NUMERIC', 'mariadb': 'DECIMAL(38, 20)', 'mysql': 'DECIMAL(38, 20)', 'mssql': 'NUMERIC(28, 10)', 'oracle': 'NUMBER', 'sqlite': 'DECIMAL(15, 4)', 'duckdb': 'NUMERIC', 'citus': 'NUMERIC', 'cockroachdb': 'NUMERIC', 'default': 'NUMERIC'}, 'uuid': {'timescaledb': 'UUID', 'postgresql': 'UUID', 'mariadb': 'CHAR(36)', 'mysql': 'CHAR(36)', 'mssql': 'UNIQUEIDENTIFIER', 'oracle': 'NVARCHAR(2000)', 'sqlite': 'TEXT', 'duckdb': 'VARCHAR', 'citus': 'UUID', 'cockroachdb': 'UUID', 'default': 'TEXT'}}
PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {'int': {'timescaledb': 'BigInteger', 'postgresql': 'BigInteger', 'mariadb': 'BigInteger', 'mysql': 'BigInteger', 'mssql': 'BigInteger', 'oracle': 'BigInteger', 'sqlite': 'BigInteger', 'duckdb': 'BigInteger', 'citus': 'BigInteger', 'cockroachdb': 'BigInteger', 'default': 'BigInteger'}, 'float': {'timescaledb': 'Float', 'postgresql': 'Float', 'mariadb': 'Float', 'mysql': 'Float', 'mssql': 'Float', 'oracle': 'Float', 'sqlite': 'Float', 'duckdb': 'Float', 'citus': 'Float', 'cockroachdb': 'Float', 'default': 'Float'}, 'datetime64[ns]': {'timescaledb': 'DateTime', 'postgresql': 'DateTime', 'mariadb': 'DateTime', 'mysql': 'DateTime', 'mssql': 'sqlalchemy.dialects.mssql.DATETIME2', 'oracle': 'DateTime', 'sqlite': 'DateTime', 'duckdb': 'DateTime', 'citus': 'DateTime', 'cockroachdb': 'DateTime', 'default': 'DateTime'}, 'datetime64[ns, UTC]': {'timescaledb': 'DateTime', 'postgresql': 'DateTime', 'mariadb': 'DateTime', 'mysql': 'DateTime', 'mssql': 'sqlalchemy.dialects.mssql.DATETIMEOFFSET', 'oracle': 'DateTime', 'sqlite': 'DateTime', 'duckdb': 'DateTime', 'citus': 'DateTime', 'cockroachdb': 'DateTime', 'default': 'DateTime'}, 'bool': {'timescaledb': 'Boolean', 'postgresql': 'Boolean', 'mariadb': 'Integer', 'mysql': 'Integer', 'mssql': 'Integer', 'oracle': 'Integer', 'sqlite': 'Float', 'duckdb': 'Boolean', 'citus': 'Boolean', 'cockroachdb': 'Boolean', 'default': 'Boolean'}, 'object': {'timescaledb': 'UnicodeText', 'postgresql': 'UnicodeText', 'mariadb': 'UnicodeText', 'mysql': 'UnicodeText', 'mssql': 'UnicodeText', 'oracle': 'UnicodeText', 'sqlite': 'UnicodeText', 'duckdb': 'UnicodeText', 'citus': 'UnicodeText', 'cockroachdb': 'UnicodeText', 'default': 'UnicodeText'}, 'string': {'timescaledb': 'UnicodeText', 'postgresql': 'UnicodeText', 'mariadb': 'UnicodeText', 'mysql': 'UnicodeText', 'mssql': 'UnicodeText', 'oracle': 'UnicodeText', 'sqlite': 'UnicodeText', 'duckdb': 'UnicodeText', 'citus': 'UnicodeText', 'cockroachdb': 'UnicodeText', 'default': 'UnicodeText'}, 'json': {'timescaledb': 'sqlalchemy.dialects.postgresql.JSONB', 'postgresql': 'sqlalchemy.dialects.postgresql.JSONB', 'mariadb': 'UnicodeText', 'mysql': 'UnicodeText', 'mssql': 'UnicodeText', 'oracle': 'UnicodeText', 'sqlite': 'UnicodeText', 'duckdb': 'TEXT', 'citus': 'sqlalchemy.dialects.postgresql.JSONB', 'cockroachdb': 'sqlalchemy.dialects.postgresql.JSONB', 'default': 'UnicodeText'}, 'numeric': {'timescaledb': 'Numeric', 'postgresql': 'Numeric', 'mariadb': 'Numeric', 'mysql': 'Numeric', 'mssql': 'Numeric', 'oracle': 'Numeric', 'sqlite': 'Numeric', 'duckdb': 'Numeric', 'citus': 'Numeric', 'cockroachdb': 'Numeric', 'default': 'Numeric'}, 'uuid': {'timescaledb': 'Uuid', 'postgresql': 'Uuid', 'mariadb': 'sqlalchemy.dialects.mysql.CHAR(36)', 'mysql': 'sqlalchemy.dialects.mysql.CHAR(36)', 'mssql': 'Uuid', 'oracle': 'UnicodeText', 'sqlite': 'UnicodeText', 'duckdb': 'UnicodeText', 'citus': 'Uuid', 'cockroachdb': 'Uuid', 'default': 'Uuid'}}
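The dictionaries above can be consulted directly when only a lookup is needed. A minimal sketch (the values shown simply follow the mappings listed above):

    >>> from meerschaum.utils.dtypes.sql import (
    ...     NUMERIC_PRECISION_FLAVORS,
    ...     DB_FLAVORS_CAST_DTYPES,
    ...     PD_TO_DB_DTYPES_FLAVORS,
    ... )
    >>> NUMERIC_PRECISION_FLAVORS['mssql']          # (precision, scale)
    (28, 10)
    >>> DB_FLAVORS_CAST_DTYPES['mssql']['NUMERIC']  # populated by the precision loop above
    'NUMERIC(28, 10)'
    >>> PD_TO_DB_DTYPES_FLAVORS['uuid']['postgresql']
    'UUID'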
def get_pd_type_from_db_type(db_type: str, allow_custom_dtypes: bool = False) -> str:

Parse a database type to a pandas data type.

Parameters
  • db_type (str): The database type, e.g. DATETIME, BIGINT, etc.
  • allow_custom_dtypes (bool, default False): If True, allow for custom data types like json and str.
Returns
  • The equivalent datatype for a pandas DataFrame.
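A short usage sketch (return values follow the DB_TO_PD_DTYPES mapping above; the pyarrow-backed dtype strings assume a pandas installation with pyarrow available, and 'SOME_UNKNOWN_TYPE' is a hypothetical placeholder):

    >>> from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
    >>> get_pd_type_from_db_type('BIGINT')
    'int64[pyarrow]'
    >>> get_pd_type_from_db_type('TIMESTAMPTZ')
    'datetime64[ns, UTC]'
    >>> get_pd_type_from_db_type('NVARCHAR(100)')   # falls through to the 'CHAR' substring match
    'string[pyarrow]'
    >>> get_pd_type_from_db_type('SOME_UNKNOWN_TYPE')
    'object'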
def get_db_type_from_pd_type(pd_type: str, flavor: str = 'default', as_sqlalchemy: bool = False) -> Union[str, 'sqlalchemy.sql.visitors.TraversibleType']:

Parse a Pandas data type into a flavor's database type.

Parameters
  • pd_type (str): The Pandas datatype. This must be a string, not the actual dtype object.
  • flavor (str, default 'default'): The flavor of the database to be mapped to.
  • as_sqlalchemy (bool, default False): If True, return a type from sqlalchemy.types.
Returns
  • The database data type for the incoming Pandas data type.
  • If nothing can be found, a warning will be thrown and 'TEXT' will be returned.
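A short usage sketch (values follow the flavor tables above; with as_sqlalchemy=True a SQLAlchemy type is returned instead of a string, and the exact repr may vary with the installed SQLAlchemy version):

    >>> from meerschaum.utils.dtypes.sql import get_db_type_from_pd_type
    >>> get_db_type_from_pd_type('datetime64[ns]', flavor='mssql')
    'DATETIME2'
    >>> get_db_type_from_pd_type('json', flavor='postgresql')
    'JSONB'
    >>> get_db_type_from_pd_type('numeric', flavor='mariadb')
    'DECIMAL(38, 20)'
    >>> get_db_type_from_pd_type('int', flavor='sqlite', as_sqlalchemy=True)
    <class 'sqlalchemy.sql.sqltypes.BigInteger'>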