meerschaum.utils.dtypes.sql

Utility functions for working with SQL data types.

  1#! /usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3# vim:fenc=utf-8
  4
  5"""
  6Utility functions for working with SQL data types.
  7"""
  8
  9from __future__ import annotations
 10from meerschaum.utils.typing import Dict, Union, Tuple, List
 11
 12NUMERIC_PRECISION_FLAVORS: Dict[str, Tuple[int, int]] = {
 13    'mariadb': (38, 20),
 14    'mysql': (38, 20),
 15    'mssql': (28, 10),
 16    'duckdb': (15, 3),
 17    'sqlite': (15, 4),
 18}
 19TIMEZONE_NAIVE_FLAVORS = {'oracle', 'mysql', 'mariadb'}
 20
 21### MySQL doesn't allow for casting as BIGINT, so this is a workaround.
 22DB_FLAVORS_CAST_DTYPES = {
 23    'mariadb': {
 24        'BIGINT': 'DECIMAL',
 25        'TINYINT': 'SIGNED INT',
 26        'TEXT': 'CHAR(10000) CHARACTER SET utf8',
 27        'BOOL': 'SIGNED INT',
 28        'BOOLEAN': 'SIGNED INT',
 29        'DOUBLE PRECISION': 'DECIMAL',
 30        'DOUBLE': 'DECIMAL',
 31        'FLOAT': 'DECIMAL',
 32    },
 33    'mysql': {
 34        'BIGINT': 'DECIMAL',
 35        'TINYINT': 'SIGNED INT',
 36        'TEXT': 'CHAR(10000) CHARACTER SET utf8',
 37        'INT': 'SIGNED INT',
 38        'INTEGER': 'SIGNED INT',
 39        'BOOL': 'SIGNED INT',
 40        'BOOLEAN': 'SIGNED INT',
 41        'DOUBLE PRECISION': 'DECIMAL',
 42        'DOUBLE': 'DECIMAL',
 43        'FLOAT': 'DECIMAL',
 44    },
 45    'sqlite': {
 46        'BOOLEAN': 'INTEGER',
 47        'REAL': 'FLOAT',
 48    },
 49    'oracle': {
 50        'NVARCHAR(2000)': 'NVARCHAR2(2000)',
 51        'NVARCHAR': 'NVARCHAR2(2000)',
 52        'NVARCHAR2': 'NVARCHAR2(2000)',
 53        'CHAR': 'CHAR(36)',  # UUID columns
 54    },
 55    'mssql': {
 56        'NVARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)',
 57        'NVARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)',
 58        'VARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)',
 59        'VARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)',
 60        'NVARCHAR': 'NVARCHAR(MAX)',
 61    },
 62}
 63for _flavor, (_precision, _scale) in NUMERIC_PRECISION_FLAVORS.items():
 64    if _flavor not in DB_FLAVORS_CAST_DTYPES:
 65        DB_FLAVORS_CAST_DTYPES[_flavor] = {}
 66    DB_FLAVORS_CAST_DTYPES[_flavor].update({
 67        'NUMERIC': f"NUMERIC({_precision}, {_scale})",
 68        'DECIMAL': f"DECIMAL({_precision}, {_scale})",
 69    })
 70
 71DB_TO_PD_DTYPES: Dict[str, Union[str, Dict[str, str]]] = {
 72    'FLOAT': 'float64[pyarrow]',
 73    'REAL': 'float64[pyarrow]',
 74    'DOUBLE_PRECISION': 'float64[pyarrow]',
 75    'DOUBLE': 'float64[pyarrow]',
 76    'DECIMAL': 'numeric',
 77    'BIGINT': 'int64[pyarrow]',
 78    'INT': 'int64[pyarrow]',
 79    'INTEGER': 'int64[pyarrow]',
 80    'NUMBER': 'numeric',
 81    'NUMERIC': 'numeric',
 82    'TIMESTAMP': 'datetime64[ns]',
 83    'TIMESTAMP WITHOUT TIMEZONE': 'datetime64[ns]',
 84    'TIMESTAMP WITH TIMEZONE': 'datetime64[ns, UTC]',
 85    'TIMESTAMP WITH TIME ZONE': 'datetime64[ns, UTC]',
 86    'TIMESTAMPTZ': 'datetime64[ns, UTC]',
 87    'DATE': 'datetime64[ns]',
 88    'DATETIME': 'datetime64[ns]',
 89    'DATETIME2': 'datetime64[ns]',
 90    'DATETIMEOFFSET': 'datetime64[ns, UTC]',
 91    'TEXT': 'string[pyarrow]',
 92    'VARCHAR': 'string[pyarrow]',
 93    'CLOB': 'string[pyarrow]',
 94    'BOOL': 'bool[pyarrow]',
 95    'BOOLEAN': 'bool[pyarrow]',
 96    'BOOLEAN()': 'bool[pyarrow]',
 97    'TINYINT': 'bool[pyarrow]',
 98    'TINYINT(1)': 'bool[pyarrow]',
 99    'BIT': 'bool[pyarrow]',
100    'BIT(1)': 'bool[pyarrow]',
101    'JSON': 'json',
102    'JSONB': 'json',
103    'UUID': 'uuid',
104    'UNIQUEIDENTIFIER': 'uuid',
105    'substrings': {
106        'CHAR': 'string[pyarrow]',
107        'TIMESTAMP': 'datetime64[ns]',
108        'TIME': 'datetime64[ns]',
109        'DATE': 'datetime64[ns]',
110        'DOUBLE': 'double[pyarrow]',
111        'DECIMAL': 'numeric',
112        'NUMERIC': 'numeric',
113        'NUMBER': 'numeric',
114        'INT': 'int64[pyarrow]',
115        'BOOL': 'bool[pyarrow]',
116        'JSON': 'json',
117    },
118    'default': 'object',
119}
120### Map pandas dtypes to flavor-specific dtypes.
121PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
122    'int': {
123        'timescaledb': 'BIGINT',
124        'postgresql': 'BIGINT',
125        'mariadb': 'BIGINT',
126        'mysql': 'BIGINT',
127        'mssql': 'BIGINT',
128        'oracle': 'INT',
129        'sqlite': 'BIGINT',
130        'duckdb': 'BIGINT',
131        'citus': 'BIGINT',
132        'cockroachdb': 'BIGINT',
133        'default': 'INT',
134    },
135    'float': {
136        'timescaledb': 'DOUBLE PRECISION',
137        'postgresql': 'DOUBLE PRECISION',
138        'mariadb': 'DOUBLE PRECISION',
139        'mysql': 'DOUBLE PRECISION',
140        'mssql': 'FLOAT',
141        'oracle': 'FLOAT',
142        'sqlite': 'FLOAT',
143        'duckdb': 'DOUBLE PRECISION',
144        'citus': 'DOUBLE PRECISION',
145        'cockroachdb': 'DOUBLE PRECISION',
146        'default': 'DOUBLE',
147    },
148    'double': {
149        'timescaledb': 'DOUBLE PRECISION',
150        'postgresql': 'DOUBLE PRECISION',
151        'mariadb': 'DOUBLE PRECISION',
152        'mysql': 'DOUBLE PRECISION',
153        'mssql': 'FLOAT',
154        'oracle': 'FLOAT',
155        'sqlite': 'FLOAT',
156        'duckdb': 'DOUBLE PRECISION',
157        'citus': 'DOUBLE PRECISION',
158        'cockroachdb': 'DOUBLE PRECISION',
159        'default': 'DOUBLE',
160    },
161    'datetime64[ns]': {
162        'timescaledb': 'TIMESTAMP',
163        'postgresql': 'TIMESTAMP',
164        'mariadb': 'DATETIME',
165        'mysql': 'DATETIME',
166        'mssql': 'DATETIME2',
167        'oracle': 'TIMESTAMP',
168        'sqlite': 'DATETIME',
169        'duckdb': 'TIMESTAMP',
170        'citus': 'TIMESTAMP',
171        'cockroachdb': 'TIMESTAMP',
172        'default': 'DATETIME',
173    },
174    'datetime64[ns, UTC]': {
175        'timescaledb': 'TIMESTAMPTZ',
176        'postgresql': 'TIMESTAMPTZ',
177        'mariadb': 'DATETIME',
178        'mysql': 'DATETIME',
179        'mssql': 'DATETIMEOFFSET',
180        'oracle': 'TIMESTAMP',
181        'sqlite': 'TIMESTAMP',
182        'duckdb': 'TIMESTAMPTZ',
183        'citus': 'TIMESTAMPTZ',
184        'cockroachdb': 'TIMESTAMPTZ',
185        'default': 'TIMESTAMPTZ',
186    },
187    'datetime': {
188        'timescaledb': 'TIMESTAMPTZ',
189        'postgresql': 'TIMESTAMPTZ',
190        'mariadb': 'DATETIME',
191        'mysql': 'DATETIME',
192        'mssql': 'DATETIMEOFFSET',
193        'oracle': 'TIMESTAMP',
194        'sqlite': 'TIMESTAMP',
195        'duckdb': 'TIMESTAMPTZ',
196        'citus': 'TIMESTAMPTZ',
197        'cockroachdb': 'TIMESTAMPTZ',
198        'default': 'TIMESTAMPTZ',
199    },
200    'bool': {
201        'timescaledb': 'BOOLEAN',
202        'postgresql': 'BOOLEAN',
203        'mariadb': 'BOOLEAN',
204        'mysql': 'BOOLEAN',
205        'mssql': 'BIT',
206        'oracle': 'INTEGER',
207        'sqlite': 'FLOAT',
208        'duckdb': 'BOOLEAN',
209        'citus': 'BOOLEAN',
210        'cockroachdb': 'BOOLEAN',
211        'default': 'BOOLEAN',
212    },
213    'object': {
214        'timescaledb': 'TEXT',
215        'postgresql': 'TEXT',
216        'mariadb': 'TEXT',
217        'mysql': 'TEXT',
218        'mssql': 'NVARCHAR(MAX)',
219        'oracle': 'NVARCHAR2(2000)',
220        'sqlite': 'TEXT',
221        'duckdb': 'TEXT',
222        'citus': 'TEXT',
223        'cockroachdb': 'TEXT',
224        'default': 'TEXT',
225    },
226    'string': {
227        'timescaledb': 'TEXT',
228        'postgresql': 'TEXT',
229        'mariadb': 'TEXT',
230        'mysql': 'TEXT',
231        'mssql': 'NVARCHAR(MAX)',
232        'oracle': 'NVARCHAR2(2000)',
233        'sqlite': 'TEXT',
234        'duckdb': 'TEXT',
235        'citus': 'TEXT',
236        'cockroachdb': 'TEXT',
237        'default': 'TEXT',
238    },
239    'json': {
240        'timescaledb': 'JSONB',
241        'postgresql': 'JSONB',
242        'mariadb': 'TEXT',
243        'mysql': 'TEXT',
244        'mssql': 'NVARCHAR(MAX)',
245        'oracle': 'NVARCHAR2(2000)',
246        'sqlite': 'TEXT',
247        'duckdb': 'TEXT',
248        'citus': 'JSONB',
249        'cockroachdb': 'JSONB',
250        'default': 'TEXT',
251    },
252    'numeric': {
253        'timescaledb': 'NUMERIC',
254        'postgresql': 'NUMERIC',
255        'mariadb': f'DECIMAL{NUMERIC_PRECISION_FLAVORS["mariadb"]}',
256        'mysql': f'DECIMAL{NUMERIC_PRECISION_FLAVORS["mysql"]}',
257        'mssql': f'NUMERIC{NUMERIC_PRECISION_FLAVORS["mssql"]}',
258        'oracle': 'NUMBER',
259        'sqlite': f'DECIMAL{NUMERIC_PRECISION_FLAVORS["sqlite"]}',
260        'duckdb': 'NUMERIC',
261        'citus': 'NUMERIC',
262        'cockroachdb': 'NUMERIC',
263        'default': 'NUMERIC',
264    },
265    'uuid': {
266        'timescaledb': 'UUID',
267        'postgresql': 'UUID',
268        'mariadb': 'CHAR(36)',
269        'mysql': 'CHAR(36)',
270        'mssql': 'UNIQUEIDENTIFIER',
271        ### I know this is too much space, but erring on the side of caution.
272        'oracle': 'CHAR(36)',
273        'sqlite': 'TEXT',
274        'duckdb': 'VARCHAR',
275        'citus': 'UUID',
276        'cockroachdb': 'UUID',
277        'default': 'TEXT',
278    },
279}
280PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
281    'int': {
282        'timescaledb': 'BigInteger',
283        'postgresql': 'BigInteger',
284        'mariadb': 'BigInteger',
285        'mysql': 'BigInteger',
286        'mssql': 'BigInteger',
287        'oracle': 'BigInteger',
288        'sqlite': 'BigInteger',
289        'duckdb': 'BigInteger',
290        'citus': 'BigInteger',
291        'cockroachdb': 'BigInteger',
292        'default': 'BigInteger',
293    },
294    'float': {
295        'timescaledb': 'Float',
296        'postgresql': 'Float',
297        'mariadb': 'Float',
298        'mysql': 'Float',
299        'mssql': 'Float',
300        'oracle': 'Float',
301        'sqlite': 'Float',
302        'duckdb': 'Float',
303        'citus': 'Float',
304        'cockroachdb': 'Float',
305        'default': 'Float',
306    },
307    'datetime': {
308        'timescaledb': 'DateTime(timezone=True)',
309        'postgresql': 'DateTime(timezone=True)',
310        'mariadb': 'DateTime(timezone=True)',
311        'mysql': 'DateTime(timezone=True)',
312        'mssql': 'sqlalchemy.dialects.mssql.DATETIMEOFFSET',
313        'oracle': 'sqlalchemy.dialects.oracle.TIMESTAMP(timezone=True)',
314        'sqlite': 'DateTime(timezone=True)',
315        'duckdb': 'DateTime(timezone=True)',
316        'citus': 'DateTime(timezone=True)',
317        'cockroachdb': 'DateTime(timezone=True)',
318        'default': 'DateTime(timezone=True)',
319    },
320    'datetime64[ns]': {
321        'timescaledb': 'DateTime',
322        'postgresql': 'DateTime',
323        'mariadb': 'DateTime',
324        'mysql': 'DateTime',
325        'mssql': 'sqlalchemy.dialects.mssql.DATETIME2',
326        'oracle': 'DateTime',
327        'sqlite': 'DateTime',
328        'duckdb': 'DateTime',
329        'citus': 'DateTime',
330        'cockroachdb': 'DateTime',
331        'default': 'DateTime',
332    },
333    'datetime64[ns, UTC]': {
334        'timescaledb': 'DateTime(timezone=True)',
335        'postgresql': 'DateTime(timezone=True)',
336        'mariadb': 'DateTime(timezone=True)',
337        'mysql': 'DateTime(timezone=True)',
338        'mssql': 'sqlalchemy.dialects.mssql.DATETIMEOFFSET',
339        'oracle': 'sqlalchemy.dialects.oracle.TIMESTAMP(timezone=True)',
340        'sqlite': 'DateTime(timezone=True)',
341        'duckdb': 'DateTime(timezone=True)',
342        'citus': 'DateTime(timezone=True)',
343        'cockroachdb': 'DateTime(timezone=True)',
344        'default': 'DateTime(timezone=True)',
345    },
346    'bool': {
347        'timescaledb': 'Boolean',
348        'postgresql': 'Boolean',
349        'mariadb': 'Integer',
350        'mysql': 'Integer',
351        'mssql': 'sqlalchemy.dialects.mssql.BIT',
352        'oracle': 'Integer',
353        'sqlite': 'Float',
354        'duckdb': 'Boolean',
355        'citus': 'Boolean',
356        'cockroachdb': 'Boolean',
357        'default': 'Boolean',
358    },
359    'object': {
360        'timescaledb': 'UnicodeText',
361        'postgresql': 'UnicodeText',
362        'mariadb': 'UnicodeText',
363        'mysql': 'UnicodeText',
364        'mssql': 'UnicodeText',
365        'oracle': 'UnicodeText',
366        'sqlite': 'UnicodeText',
367        'duckdb': 'UnicodeText',
368        'citus': 'UnicodeText',
369        'cockroachdb': 'UnicodeText',
370        'default': 'UnicodeText',
371    },
372    'string': {
373        'timescaledb': 'UnicodeText',
374        'postgresql': 'UnicodeText',
375        'mariadb': 'UnicodeText',
376        'mysql': 'UnicodeText',
377        'mssql': 'UnicodeText',
378        'oracle': 'UnicodeText',
379        'sqlite': 'UnicodeText',
380        'duckdb': 'UnicodeText',
381        'citus': 'UnicodeText',
382        'cockroachdb': 'UnicodeText',
383        'default': 'UnicodeText',
384    },
385    'json': {
386        'timescaledb': 'sqlalchemy.dialects.postgresql.JSONB',
387        'postgresql': 'sqlalchemy.dialects.postgresql.JSONB',
388        'mariadb': 'UnicodeText',
389        'mysql': 'UnicodeText',
390        'mssql': 'UnicodeText',
391        'oracle': 'UnicodeText',
392        'sqlite': 'UnicodeText',
393        'duckdb': 'TEXT',
394        'citus': 'sqlalchemy.dialects.postgresql.JSONB',
395        'cockroachdb': 'sqlalchemy.dialects.postgresql.JSONB',
396        'default': 'UnicodeText',
397    },
398    'numeric': {
399        'timescaledb': 'Numeric',
400        'postgresql': 'Numeric',
401        'mariadb': 'Numeric',
402        'mysql': 'Numeric',
403        'mssql': 'Numeric',
404        'oracle': 'Numeric',
405        'sqlite': 'Numeric',
406        'duckdb': 'Numeric',
407        'citus': 'Numeric',
408        'cockroachdb': 'Numeric',
409        'default': 'Numeric',
410    },
411    'uuid': {
412        'timescaledb': 'Uuid',
413        'postgresql': 'Uuid',
414        'mariadb': 'sqlalchemy.dialects.mysql.CHAR(36)',
415        'mysql': 'sqlalchemy.dialects.mysql.CHAR(36)',
416        'mssql': 'Uuid',
417        'oracle': 'sqlalchemy.dialects.oracle.CHAR(36)',
418        'sqlite': 'UnicodeText',
419        'duckdb': 'UnicodeText',
420        'citus': 'Uuid',
421        'cockroachdb': 'Uuid',
422        'default': 'Uuid',
423    },
424}
425
426AUTO_INCREMENT_COLUMN_FLAVORS: Dict[str, str] = {
427    'timescaledb': 'GENERATED BY DEFAULT AS IDENTITY',
428    'postgresql': 'GENERATED BY DEFAULT AS IDENTITY',
429    'mariadb': 'AUTO_INCREMENT',
430    'mysql': 'AUTO_INCREMENT',
431    'mssql': 'IDENTITY(1,1)',
432    'oracle': 'GENERATED BY DEFAULT ON NULL AS IDENTITY',
433    'sqlite': 'AUTOINCREMENT',
434    'duckdb': 'GENERATED BY DEFAULT',
435    'citus': 'GENERATED BY DEFAULT',
436    'cockroachdb': 'GENERATED BY DEFAULT AS IDENTITY',
437    'default': 'GENERATED BY DEFAULT AS IDENTITY',
438}
439
440
441def get_pd_type_from_db_type(db_type: str, allow_custom_dtypes: bool = False) -> str:
442    """
443    Parse a database type to a pandas data type.
444
445    Parameters
446    ----------
447    db_type: str
448        The database type, e.g. `DATETIME`, `BIGINT`, etc.
449
450    allow_custom_dtypes: bool, default False
451        If `True`, allow for custom data types like `json` and `str`.
452
453    Returns
454    -------
455    The equivalent datatype for a pandas DataFrame.
456    """
457    def parse_custom(_pd_type: str, _db_type: str) -> str:
458        if 'json' in _db_type.lower():
459            return 'json'
460        return _pd_type
461
462    pd_type = DB_TO_PD_DTYPES.get(db_type.upper(), None)
463    if pd_type is not None:
464        return (
465            parse_custom(pd_type, db_type)
466            if allow_custom_dtypes
467            else pd_type
468        )
469    for db_t, pd_t in DB_TO_PD_DTYPES['substrings'].items():
470        if db_t in db_type.upper():
471            return (
472                parse_custom(pd_t, db_t)
473                if allow_custom_dtypes
474                else pd_t
475            )
476    return DB_TO_PD_DTYPES['default']
477
478
479def get_db_type_from_pd_type(
480    pd_type: str,
481    flavor: str = 'default',
482    as_sqlalchemy: bool = False,
483) -> Union[str, 'sqlalchemy.sql.visitors.TraversibleType']:
484    """
485    Parse a Pandas data type into a flavor's database type.
486
487    Parameters
488    ----------
489    pd_type: str
490        The Pandas datatype. This must be a string, not the actual dtype object.
491
492    flavor: str, default 'default'
493        The flavor of the database to be mapped to.
494
495    as_sqlalchemy: bool, default False
496        If `True`, return a type from `sqlalchemy.types`.
497
498    Returns
499    -------
500    The database data type for the incoming Pandas data type.
501    If nothing can be found, a warning will be thrown and 'TEXT' will be returned.
502    """
503    from meerschaum.utils.warnings import warn
504    from meerschaum.utils.packages import attempt_import
505    from meerschaum.utils.dtypes import are_dtypes_equal
506    from meerschaum.utils.misc import parse_arguments_str
507    sqlalchemy_types = attempt_import('sqlalchemy.types')
508
509    types_registry = (
510        PD_TO_DB_DTYPES_FLAVORS
511        if not as_sqlalchemy
512        else PD_TO_SQLALCHEMY_DTYPES_FLAVORS
513    )
514
515    ### Check whether we are able to match this type (e.g. pyarrow support).
516    found_db_type = False
517    if pd_type not in types_registry:
518        for mapped_pd_type in types_registry:
519            if are_dtypes_equal(mapped_pd_type, pd_type):
520                pd_type = mapped_pd_type
521                found_db_type = True
522                break
523    else:
524        found_db_type = True
525
526    if not found_db_type:
527        warn(f"Unknown Pandas data type '{pd_type}'. Falling back to 'TEXT'.")
528        return (
529            'TEXT'
530            if not as_sqlalchemy
531            else sqlalchemy_types.UnicodeText
532        )
533    flavor_types = types_registry.get(
534        pd_type,
535        {
536            'default': (
537                'TEXT'
538                if not as_sqlalchemy
539                else 'UnicodeText'
540            ),
541        },
542    )
543    default_flavor_type = flavor_types.get(
544        'default',
545        (
546            'TEXT'
547            if not as_sqlalchemy
548            else 'UnicodeText'
549        ),
550    )
551    if flavor not in flavor_types:
552        warn(f"Unknown flavor '{flavor}'. Falling back to '{default_flavor_type}' (default).")
553    db_type = flavor_types.get(flavor, default_flavor_type)
554    if not as_sqlalchemy:
555        return db_type
556
557    if db_type.startswith('sqlalchemy.dialects'):
558        dialect, typ_class_name = db_type.replace('sqlalchemy.dialects.', '').split('.', maxsplit=2)
559        cls_args, cls_kwargs = None, None
560        if '(' in typ_class_name:
561            typ_class_name, args_str = typ_class_name.split('(', maxsplit=1)
562            args_str = args_str.rstrip(')')
563            cls_args, cls_kwargs = parse_arguments_str(args_str)
564        sqlalchemy_dialects_flavor_module = attempt_import(f'sqlalchemy.dialects.{dialect}')
565        cls = getattr(sqlalchemy_dialects_flavor_module, typ_class_name)
566        if cls_args is None:
567            return cls
568        return cls(*cls_args, **cls_kwargs)
569
570    if 'numeric' in db_type.lower():
571        numeric_type_str = PD_TO_DB_DTYPES_FLAVORS['numeric'].get(flavor, 'NUMERIC')
572        if flavor not in NUMERIC_PRECISION_FLAVORS:
573            return sqlalchemy_types.Numeric
574        precision, scale = NUMERIC_PRECISION_FLAVORS[flavor]
575        return sqlalchemy_types.Numeric(precision, scale)
576
577    cls_args, cls_kwargs = None, None
578    typ_class_name = db_type
579    if '(' in db_type:
580        typ_class_name, args_str = db_type.split('(', maxsplit=1)
581        args_str = args_str.rstrip(')')
582        cls_args, cls_kwargs = parse_arguments_str(args_str)
583
584    cls = getattr(sqlalchemy_types, typ_class_name)
585    if cls_args is None:
586        return cls
587    return cls(*cls_args, **cls_kwargs)
NUMERIC_PRECISION_FLAVORS: Dict[str, Tuple[int, int]] = {'mariadb': (38, 20), 'mysql': (38, 20), 'mssql': (28, 10), 'duckdb': (15, 3), 'sqlite': (15, 4)}
TIMEZONE_NAIVE_FLAVORS = {'oracle', 'mysql', 'mariadb'}
DB_FLAVORS_CAST_DTYPES = {'mariadb': {'BIGINT': 'DECIMAL', 'TINYINT': 'SIGNED INT', 'TEXT': 'CHAR(10000) CHARACTER SET utf8', 'BOOL': 'SIGNED INT', 'BOOLEAN': 'SIGNED INT', 'DOUBLE PRECISION': 'DECIMAL', 'DOUBLE': 'DECIMAL', 'FLOAT': 'DECIMAL', 'NUMERIC': 'NUMERIC(38, 20)', 'DECIMAL': 'DECIMAL(38, 20)'}, 'mysql': {'BIGINT': 'DECIMAL', 'TINYINT': 'SIGNED INT', 'TEXT': 'CHAR(10000) CHARACTER SET utf8', 'INT': 'SIGNED INT', 'INTEGER': 'SIGNED INT', 'BOOL': 'SIGNED INT', 'BOOLEAN': 'SIGNED INT', 'DOUBLE PRECISION': 'DECIMAL', 'DOUBLE': 'DECIMAL', 'FLOAT': 'DECIMAL', 'NUMERIC': 'NUMERIC(38, 20)', 'DECIMAL': 'DECIMAL(38, 20)'}, 'sqlite': {'BOOLEAN': 'INTEGER', 'REAL': 'FLOAT', 'NUMERIC': 'NUMERIC(15, 4)', 'DECIMAL': 'DECIMAL(15, 4)'}, 'oracle': {'NVARCHAR(2000)': 'NVARCHAR2(2000)', 'NVARCHAR': 'NVARCHAR2(2000)', 'NVARCHAR2': 'NVARCHAR2(2000)', 'CHAR': 'CHAR(36)'}, 'mssql': {'NVARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)', 'NVARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)', 'VARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)', 'VARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)', 'NVARCHAR': 'NVARCHAR(MAX)', 'NUMERIC': 'NUMERIC(28, 10)', 'DECIMAL': 'DECIMAL(28, 10)'}, 'duckdb': {'NUMERIC': 'NUMERIC(15, 3)', 'DECIMAL': 'DECIMAL(15, 3)'}}
DB_TO_PD_DTYPES: Dict[str, Union[str, Dict[str, str]]] = {'FLOAT': 'float64[pyarrow]', 'REAL': 'float64[pyarrow]', 'DOUBLE_PRECISION': 'float64[pyarrow]', 'DOUBLE': 'float64[pyarrow]', 'DECIMAL': 'numeric', 'BIGINT': 'int64[pyarrow]', 'INT': 'int64[pyarrow]', 'INTEGER': 'int64[pyarrow]', 'NUMBER': 'numeric', 'NUMERIC': 'numeric', 'TIMESTAMP': 'datetime64[ns]', 'TIMESTAMP WITHOUT TIMEZONE': 'datetime64[ns]', 'TIMESTAMP WITH TIMEZONE': 'datetime64[ns, UTC]', 'TIMESTAMP WITH TIME ZONE': 'datetime64[ns, UTC]', 'TIMESTAMPTZ': 'datetime64[ns, UTC]', 'DATE': 'datetime64[ns]', 'DATETIME': 'datetime64[ns]', 'DATETIME2': 'datetime64[ns]', 'DATETIMEOFFSET': 'datetime64[ns, UTC]', 'TEXT': 'string[pyarrow]', 'VARCHAR': 'string[pyarrow]', 'CLOB': 'string[pyarrow]', 'BOOL': 'bool[pyarrow]', 'BOOLEAN': 'bool[pyarrow]', 'BOOLEAN()': 'bool[pyarrow]', 'TINYINT': 'bool[pyarrow]', 'TINYINT(1)': 'bool[pyarrow]', 'BIT': 'bool[pyarrow]', 'BIT(1)': 'bool[pyarrow]', 'JSON': 'json', 'JSONB': 'json', 'UUID': 'uuid', 'UNIQUEIDENTIFIER': 'uuid', 'substrings': {'CHAR': 'string[pyarrow]', 'TIMESTAMP': 'datetime64[ns]', 'TIME': 'datetime64[ns]', 'DATE': 'datetime64[ns]', 'DOUBLE': 'double[pyarrow]', 'DECIMAL': 'numeric', 'NUMERIC': 'numeric', 'NUMBER': 'numeric', 'INT': 'int64[pyarrow]', 'BOOL': 'bool[pyarrow]', 'JSON': 'json'}, 'default': 'object'}
PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {'int': {'timescaledb': 'BIGINT', 'postgresql': 'BIGINT', 'mariadb': 'BIGINT', 'mysql': 'BIGINT', 'mssql': 'BIGINT', 'oracle': 'INT', 'sqlite': 'BIGINT', 'duckdb': 'BIGINT', 'citus': 'BIGINT', 'cockroachdb': 'BIGINT', 'default': 'INT'}, 'float': {'timescaledb': 'DOUBLE PRECISION', 'postgresql': 'DOUBLE PRECISION', 'mariadb': 'DOUBLE PRECISION', 'mysql': 'DOUBLE PRECISION', 'mssql': 'FLOAT', 'oracle': 'FLOAT', 'sqlite': 'FLOAT', 'duckdb': 'DOUBLE PRECISION', 'citus': 'DOUBLE PRECISION', 'cockroachdb': 'DOUBLE PRECISION', 'default': 'DOUBLE'}, 'double': {'timescaledb': 'DOUBLE PRECISION', 'postgresql': 'DOUBLE PRECISION', 'mariadb': 'DOUBLE PRECISION', 'mysql': 'DOUBLE PRECISION', 'mssql': 'FLOAT', 'oracle': 'FLOAT', 'sqlite': 'FLOAT', 'duckdb': 'DOUBLE PRECISION', 'citus': 'DOUBLE PRECISION', 'cockroachdb': 'DOUBLE PRECISION', 'default': 'DOUBLE'}, 'datetime64[ns]': {'timescaledb': 'TIMESTAMP', 'postgresql': 'TIMESTAMP', 'mariadb': 'DATETIME', 'mysql': 'DATETIME', 'mssql': 'DATETIME2', 'oracle': 'TIMESTAMP', 'sqlite': 'DATETIME', 'duckdb': 'TIMESTAMP', 'citus': 'TIMESTAMP', 'cockroachdb': 'TIMESTAMP', 'default': 'DATETIME'}, 'datetime64[ns, UTC]': {'timescaledb': 'TIMESTAMPTZ', 'postgresql': 'TIMESTAMPTZ', 'mariadb': 'DATETIME', 'mysql': 'DATETIME', 'mssql': 'DATETIMEOFFSET', 'oracle': 'TIMESTAMP', 'sqlite': 'TIMESTAMP', 'duckdb': 'TIMESTAMPTZ', 'citus': 'TIMESTAMPTZ', 'cockroachdb': 'TIMESTAMPTZ', 'default': 'TIMESTAMPTZ'}, 'datetime': {'timescaledb': 'TIMESTAMPTZ', 'postgresql': 'TIMESTAMPTZ', 'mariadb': 'DATETIME', 'mysql': 'DATETIME', 'mssql': 'DATETIMEOFFSET', 'oracle': 'TIMESTAMP', 'sqlite': 'TIMESTAMP', 'duckdb': 'TIMESTAMPTZ', 'citus': 'TIMESTAMPTZ', 'cockroachdb': 'TIMESTAMPTZ', 'default': 'TIMESTAMPTZ'}, 'bool': {'timescaledb': 'BOOLEAN', 'postgresql': 'BOOLEAN', 'mariadb': 'BOOLEAN', 'mysql': 'BOOLEAN', 'mssql': 'BIT', 'oracle': 'INTEGER', 'sqlite': 'FLOAT', 'duckdb': 'BOOLEAN', 'citus': 'BOOLEAN', 'cockroachdb': 'BOOLEAN', 'default': 'BOOLEAN'}, 'object': {'timescaledb': 'TEXT', 'postgresql': 'TEXT', 'mariadb': 'TEXT', 'mysql': 'TEXT', 'mssql': 'NVARCHAR(MAX)', 'oracle': 'NVARCHAR2(2000)', 'sqlite': 'TEXT', 'duckdb': 'TEXT', 'citus': 'TEXT', 'cockroachdb': 'TEXT', 'default': 'TEXT'}, 'string': {'timescaledb': 'TEXT', 'postgresql': 'TEXT', 'mariadb': 'TEXT', 'mysql': 'TEXT', 'mssql': 'NVARCHAR(MAX)', 'oracle': 'NVARCHAR2(2000)', 'sqlite': 'TEXT', 'duckdb': 'TEXT', 'citus': 'TEXT', 'cockroachdb': 'TEXT', 'default': 'TEXT'}, 'json': {'timescaledb': 'JSONB', 'postgresql': 'JSONB', 'mariadb': 'TEXT', 'mysql': 'TEXT', 'mssql': 'NVARCHAR(MAX)', 'oracle': 'NVARCHAR2(2000)', 'sqlite': 'TEXT', 'duckdb': 'TEXT', 'citus': 'JSONB', 'cockroachdb': 'JSONB', 'default': 'TEXT'}, 'numeric': {'timescaledb': 'NUMERIC', 'postgresql': 'NUMERIC', 'mariadb': 'DECIMAL(38, 20)', 'mysql': 'DECIMAL(38, 20)', 'mssql': 'NUMERIC(28, 10)', 'oracle': 'NUMBER', 'sqlite': 'DECIMAL(15, 4)', 'duckdb': 'NUMERIC', 'citus': 'NUMERIC', 'cockroachdb': 'NUMERIC', 'default': 'NUMERIC'}, 'uuid': {'timescaledb': 'UUID', 'postgresql': 'UUID', 'mariadb': 'CHAR(36)', 'mysql': 'CHAR(36)', 'mssql': 'UNIQUEIDENTIFIER', 'oracle': 'CHAR(36)', 'sqlite': 'TEXT', 'duckdb': 'VARCHAR', 'citus': 'UUID', 'cockroachdb': 'UUID', 'default': 'TEXT'}}
PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {'int': {'timescaledb': 'BigInteger', 'postgresql': 'BigInteger', 'mariadb': 'BigInteger', 'mysql': 'BigInteger', 'mssql': 'BigInteger', 'oracle': 'BigInteger', 'sqlite': 'BigInteger', 'duckdb': 'BigInteger', 'citus': 'BigInteger', 'cockroachdb': 'BigInteger', 'default': 'BigInteger'}, 'float': {'timescaledb': 'Float', 'postgresql': 'Float', 'mariadb': 'Float', 'mysql': 'Float', 'mssql': 'Float', 'oracle': 'Float', 'sqlite': 'Float', 'duckdb': 'Float', 'citus': 'Float', 'cockroachdb': 'Float', 'default': 'Float'}, 'datetime': {'timescaledb': 'DateTime(timezone=True)', 'postgresql': 'DateTime(timezone=True)', 'mariadb': 'DateTime(timezone=True)', 'mysql': 'DateTime(timezone=True)', 'mssql': 'sqlalchemy.dialects.mssql.DATETIMEOFFSET', 'oracle': 'sqlalchemy.dialects.oracle.TIMESTAMP(timezone=True)', 'sqlite': 'DateTime(timezone=True)', 'duckdb': 'DateTime(timezone=True)', 'citus': 'DateTime(timezone=True)', 'cockroachdb': 'DateTime(timezone=True)', 'default': 'DateTime(timezone=True)'}, 'datetime64[ns]': {'timescaledb': 'DateTime', 'postgresql': 'DateTime', 'mariadb': 'DateTime', 'mysql': 'DateTime', 'mssql': 'sqlalchemy.dialects.mssql.DATETIME2', 'oracle': 'DateTime', 'sqlite': 'DateTime', 'duckdb': 'DateTime', 'citus': 'DateTime', 'cockroachdb': 'DateTime', 'default': 'DateTime'}, 'datetime64[ns, UTC]': {'timescaledb': 'DateTime(timezone=True)', 'postgresql': 'DateTime(timezone=True)', 'mariadb': 'DateTime(timezone=True)', 'mysql': 'DateTime(timezone=True)', 'mssql': 'sqlalchemy.dialects.mssql.DATETIMEOFFSET', 'oracle': 'sqlalchemy.dialects.oracle.TIMESTAMP(timezone=True)', 'sqlite': 'DateTime(timezone=True)', 'duckdb': 'DateTime(timezone=True)', 'citus': 'DateTime(timezone=True)', 'cockroachdb': 'DateTime(timezone=True)', 'default': 'DateTime(timezone=True)'}, 'bool': {'timescaledb': 'Boolean', 'postgresql': 'Boolean', 'mariadb': 'Integer', 'mysql': 'Integer', 'mssql': 'sqlalchemy.dialects.mssql.BIT', 'oracle': 'Integer', 'sqlite': 'Float', 'duckdb': 'Boolean', 'citus': 'Boolean', 'cockroachdb': 'Boolean', 'default': 'Boolean'}, 'object': {'timescaledb': 'UnicodeText', 'postgresql': 'UnicodeText', 'mariadb': 'UnicodeText', 'mysql': 'UnicodeText', 'mssql': 'UnicodeText', 'oracle': 'UnicodeText', 'sqlite': 'UnicodeText', 'duckdb': 'UnicodeText', 'citus': 'UnicodeText', 'cockroachdb': 'UnicodeText', 'default': 'UnicodeText'}, 'string': {'timescaledb': 'UnicodeText', 'postgresql': 'UnicodeText', 'mariadb': 'UnicodeText', 'mysql': 'UnicodeText', 'mssql': 'UnicodeText', 'oracle': 'UnicodeText', 'sqlite': 'UnicodeText', 'duckdb': 'UnicodeText', 'citus': 'UnicodeText', 'cockroachdb': 'UnicodeText', 'default': 'UnicodeText'}, 'json': {'timescaledb': 'sqlalchemy.dialects.postgresql.JSONB', 'postgresql': 'sqlalchemy.dialects.postgresql.JSONB', 'mariadb': 'UnicodeText', 'mysql': 'UnicodeText', 'mssql': 'UnicodeText', 'oracle': 'UnicodeText', 'sqlite': 'UnicodeText', 'duckdb': 'TEXT', 'citus': 'sqlalchemy.dialects.postgresql.JSONB', 'cockroachdb': 'sqlalchemy.dialects.postgresql.JSONB', 'default': 'UnicodeText'}, 'numeric': {'timescaledb': 'Numeric', 'postgresql': 'Numeric', 'mariadb': 'Numeric', 'mysql': 'Numeric', 'mssql': 'Numeric', 'oracle': 'Numeric', 'sqlite': 'Numeric', 'duckdb': 'Numeric', 'citus': 'Numeric', 'cockroachdb': 'Numeric', 'default': 'Numeric'}, 'uuid': {'timescaledb': 'Uuid', 'postgresql': 'Uuid', 'mariadb': 'sqlalchemy.dialects.mysql.CHAR(36)', 'mysql': 'sqlalchemy.dialects.mysql.CHAR(36)', 'mssql': 'Uuid', 'oracle': 'sqlalchemy.dialects.oracle.CHAR(36)', 'sqlite': 'UnicodeText', 'duckdb': 'UnicodeText', 'citus': 'Uuid', 'cockroachdb': 'Uuid', 'default': 'Uuid'}}
AUTO_INCREMENT_COLUMN_FLAVORS: Dict[str, str] = {'timescaledb': 'GENERATED BY DEFAULT AS IDENTITY', 'postgresql': 'GENERATED BY DEFAULT AS IDENTITY', 'mariadb': 'AUTO_INCREMENT', 'mysql': 'AUTO_INCREMENT', 'mssql': 'IDENTITY(1,1)', 'oracle': 'GENERATED BY DEFAULT ON NULL AS IDENTITY', 'sqlite': 'AUTOINCREMENT', 'duckdb': 'GENERATED BY DEFAULT', 'citus': 'GENERATED BY DEFAULT', 'cockroachdb': 'GENERATED BY DEFAULT AS IDENTITY', 'default': 'GENERATED BY DEFAULT AS IDENTITY'}
def get_pd_type_from_db_type(db_type: str, allow_custom_dtypes: bool = False) -> str:
442def get_pd_type_from_db_type(db_type: str, allow_custom_dtypes: bool = False) -> str:
443    """
444    Parse a database type to a pandas data type.
445
446    Parameters
447    ----------
448    db_type: str
449        The database type, e.g. `DATETIME`, `BIGINT`, etc.
450
451    allow_custom_dtypes: bool, default False
452        If `True`, allow for custom data types like `json` and `str`.
453
454    Returns
455    -------
456    The equivalent datatype for a pandas DataFrame.
457    """
458    def parse_custom(_pd_type: str, _db_type: str) -> str:
459        if 'json' in _db_type.lower():
460            return 'json'
461        return _pd_type
462
463    pd_type = DB_TO_PD_DTYPES.get(db_type.upper(), None)
464    if pd_type is not None:
465        return (
466            parse_custom(pd_type, db_type)
467            if allow_custom_dtypes
468            else pd_type
469        )
470    for db_t, pd_t in DB_TO_PD_DTYPES['substrings'].items():
471        if db_t in db_type.upper():
472            return (
473                parse_custom(pd_t, db_t)
474                if allow_custom_dtypes
475                else pd_t
476            )
477    return DB_TO_PD_DTYPES['default']

Parse a database type to a pandas data type.

Parameters
  • db_type (str): The database type, e.g. DATETIME, BIGINT, etc.
  • allow_custom_dtypes (bool, default False): If True, allow for custom data types like json and str.
Returns
  • The equivalent datatype for a pandas DataFrame.
def get_db_type_from_pd_type( pd_type: str, flavor: str = 'default', as_sqlalchemy: bool = False) -> "Union[str, 'sqlalchemy.sql.visitors.TraversibleType']":
480def get_db_type_from_pd_type(
481    pd_type: str,
482    flavor: str = 'default',
483    as_sqlalchemy: bool = False,
484) -> Union[str, 'sqlalchemy.sql.visitors.TraversibleType']:
485    """
486    Parse a Pandas data type into a flavor's database type.
487
488    Parameters
489    ----------
490    pd_type: str
491        The Pandas datatype. This must be a string, not the actual dtype object.
492
493    flavor: str, default 'default'
494        The flavor of the database to be mapped to.
495
496    as_sqlalchemy: bool, default False
497        If `True`, return a type from `sqlalchemy.types`.
498
499    Returns
500    -------
501    The database data type for the incoming Pandas data type.
502    If nothing can be found, a warning will be thrown and 'TEXT' will be returned.
503    """
504    from meerschaum.utils.warnings import warn
505    from meerschaum.utils.packages import attempt_import
506    from meerschaum.utils.dtypes import are_dtypes_equal
507    from meerschaum.utils.misc import parse_arguments_str
508    sqlalchemy_types = attempt_import('sqlalchemy.types')
509
510    types_registry = (
511        PD_TO_DB_DTYPES_FLAVORS
512        if not as_sqlalchemy
513        else PD_TO_SQLALCHEMY_DTYPES_FLAVORS
514    )
515
516    ### Check whether we are able to match this type (e.g. pyarrow support).
517    found_db_type = False
518    if pd_type not in types_registry:
519        for mapped_pd_type in types_registry:
520            if are_dtypes_equal(mapped_pd_type, pd_type):
521                pd_type = mapped_pd_type
522                found_db_type = True
523                break
524    else:
525        found_db_type = True
526
527    if not found_db_type:
528        warn(f"Unknown Pandas data type '{pd_type}'. Falling back to 'TEXT'.")
529        return (
530            'TEXT'
531            if not as_sqlalchemy
532            else sqlalchemy_types.UnicodeText
533        )
534    flavor_types = types_registry.get(
535        pd_type,
536        {
537            'default': (
538                'TEXT'
539                if not as_sqlalchemy
540                else 'UnicodeText'
541            ),
542        },
543    )
544    default_flavor_type = flavor_types.get(
545        'default',
546        (
547            'TEXT'
548            if not as_sqlalchemy
549            else 'UnicodeText'
550        ),
551    )
552    if flavor not in flavor_types:
553        warn(f"Unknown flavor '{flavor}'. Falling back to '{default_flavor_type}' (default).")
554    db_type = flavor_types.get(flavor, default_flavor_type)
555    if not as_sqlalchemy:
556        return db_type
557
558    if db_type.startswith('sqlalchemy.dialects'):
559        dialect, typ_class_name = db_type.replace('sqlalchemy.dialects.', '').split('.', maxsplit=2)
560        cls_args, cls_kwargs = None, None
561        if '(' in typ_class_name:
562            typ_class_name, args_str = typ_class_name.split('(', maxsplit=1)
563            args_str = args_str.rstrip(')')
564            cls_args, cls_kwargs = parse_arguments_str(args_str)
565        sqlalchemy_dialects_flavor_module = attempt_import(f'sqlalchemy.dialects.{dialect}')
566        cls = getattr(sqlalchemy_dialects_flavor_module, typ_class_name)
567        if cls_args is None:
568            return cls
569        return cls(*cls_args, **cls_kwargs)
570
571    if 'numeric' in db_type.lower():
572        numeric_type_str = PD_TO_DB_DTYPES_FLAVORS['numeric'].get(flavor, 'NUMERIC')
573        if flavor not in NUMERIC_PRECISION_FLAVORS:
574            return sqlalchemy_types.Numeric
575        precision, scale = NUMERIC_PRECISION_FLAVORS[flavor]
576        return sqlalchemy_types.Numeric(precision, scale)
577
578    cls_args, cls_kwargs = None, None
579    typ_class_name = db_type
580    if '(' in db_type:
581        typ_class_name, args_str = db_type.split('(', maxsplit=1)
582        args_str = args_str.rstrip(')')
583        cls_args, cls_kwargs = parse_arguments_str(args_str)
584
585    cls = getattr(sqlalchemy_types, typ_class_name)
586    if cls_args is None:
587        return cls
588    return cls(*cls_args, **cls_kwargs)

Parse a Pandas data type into a flavor's database type.

Parameters
  • pd_type (str): The Pandas datatype. This must be a string, not the actual dtype object.
  • flavor (str, default 'default'): The flavor of the database to be mapped to.
  • as_sqlalchemy (bool, default False): If True, return a type from sqlalchemy.types.
Returns
  • The database data type for the incoming Pandas data type.
  • If nothing can be found, a warning will be thrown and 'TEXT' will be returned.