meerschaum.utils.dtypes.sql

Utility functions for working with SQL data types.

  1#! /usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3# vim:fenc=utf-8
  4
  5"""
  6Utility functions for working with SQL data types.
  7"""
  8
  9from __future__ import annotations
 10from meerschaum.utils.typing import Dict, Union, Tuple
 11
 12NUMERIC_PRECISION_FLAVORS: Dict[str, Tuple[int, int]] = {
 13    'mariadb': (38, 20),
 14    'mysql': (38, 20),
 15    'mssql': (28, 10),
 16    'duckdb': (15, 4),
 17    'sqlite': (15, 4),
 18}
 19
 20### MySQL doesn't allow for casting as BIGINT, so this is a workaround.
 21DB_FLAVORS_CAST_DTYPES = {
 22    'mariadb': {
 23        'BIGINT': 'DECIMAL',
 24        'TINYINT': 'SIGNED INT',
 25        'TEXT': 'CHAR(10000) CHARACTER SET utf8',
 26        'BOOL': 'SIGNED INT',
 27        'BOOLEAN': 'SIGNED INT',
 28        'DOUBLE PRECISION': 'DECIMAL',
 29        'DOUBLE': 'DECIMAL',
 30        'FLOAT': 'DECIMAL',
 31    },
 32    'mysql': {
 33        'BIGINT': 'DECIMAL',
 34        'TINYINT': 'SIGNED INT',
 35        'TEXT': 'CHAR(10000) CHARACTER SET utf8',
 36        'INT': 'SIGNED INT',
 37        'INTEGER': 'SIGNED INT',
 38        'BOOL': 'SIGNED INT',
 39        'BOOLEAN': 'SIGNED INT',
 40        'DOUBLE PRECISION': 'DECIMAL',
 41        'DOUBLE': 'DECIMAL',
 42        'FLOAT': 'DECIMAL',
 43    },
 44    'sqlite': {
 45        'BOOLEAN': 'INTEGER',
 46        'REAL': 'FLOAT',
 47    },
 48    'oracle': {
 49        'NVARCHAR(2000)': 'NVARCHAR2(2000)',
 50        'NVARCHAR': 'NVARCHAR2(2000)',
 51        'NVARCHAR2': 'NVARCHAR2(2000)',
 52    },
 53    'mssql': {
 54        'NVARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)',
 55        'NVARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)',
 56        'VARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)',
 57        'VARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)',
 58    },
 59}
 60for _flavor, (_precision, _scale) in NUMERIC_PRECISION_FLAVORS.items():
 61    if _flavor not in DB_FLAVORS_CAST_DTYPES:
 62        DB_FLAVORS_CAST_DTYPES[_flavor] = {}
 63    DB_FLAVORS_CAST_DTYPES[_flavor].update({
 64        'NUMERIC': f"NUMERIC({_precision}, {_scale})",
 65        'DECIMAL': f"DECIMAL({_precision}, {_scale})",
 66    })
 67
 68DB_TO_PD_DTYPES: Dict[str, Union[str, Dict[str, str]]] = {
 69    'FLOAT': 'float64[pyarrow]',
 70    'REAL': 'float64[pyarrow]',
 71    'DOUBLE_PRECISION': 'float64[pyarrow]',
 72    'DOUBLE': 'float64[pyarrow]',
 73    'DECIMAL': 'numeric',
 74    'BIGINT': 'int64[pyarrow]',
 75    'INT': 'int64[pyarrow]',
 76    'INTEGER': 'int64[pyarrow]',
 77    'NUMBER': 'numeric',
 78    'NUMERIC': 'numeric',
 79    'TIMESTAMP': 'datetime64[ns]',
 80    'TIMESTAMP WITH TIMEZONE': 'datetime64[ns, UTC]',
 81    'TIMESTAMPTZ': 'datetime64[ns, UTC]',
 82    'DATE': 'datetime64[ns]',
 83    'DATETIME': 'datetime64[ns]',
 84    'TEXT': 'string[pyarrow]',
 85    'CLOB': 'string[pyarrow]',
 86    'BOOL': 'bool[pyarrow]',
 87    'BOOLEAN': 'bool[pyarrow]',
 88    'BOOLEAN()': 'bool[pyarrow]',
 89    'TINYINT': 'bool[pyarrow]',
 90    'TINYINT(1)': 'bool[pyarrow]',
 91    'BIT': 'bool[pyarrow]',
 92    'BIT(1)': 'bool[pyarrow]',
 93    'JSON': 'json',
 94    'JSONB': 'json',
 95    'substrings': {
 96        'CHAR': 'string[pyarrow]',
 97        'TIMESTAMP': 'datetime64[ns]',
 98        'TIME': 'datetime64[ns]',
 99        'DATE': 'datetime64[ns]',
100        'DOUBLE': 'double[pyarrow]',
101        'DECIMAL': 'numeric',
102        'NUMERIC': 'numeric',
103        'NUMBER': 'numeric',
104        'INT': 'int64[pyarrow]',
105        'BOOL': 'bool[pyarrow]',
106        'JSON': 'json',
107    },
108    'default': 'object',
109}
110### Map pandas dtypes to flavor-specific dtypes.
111PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
112    'int': {
113        'timescaledb': 'BIGINT',
114        'postgresql': 'BIGINT',
115        'mariadb': 'BIGINT',
116        'mysql': 'BIGINT',
117        'mssql': 'BIGINT',
118        'oracle': 'INT',
119        'sqlite': 'BIGINT',
120        'duckdb': 'BIGINT',
121        'citus': 'BIGINT',
122        'cockroachdb': 'BIGINT',
123        'default': 'INT',
124    },
125    'float': {
126        'timescaledb': 'DOUBLE PRECISION',
127        'postgresql': 'DOUBLE PRECISION',
128        'mariadb': 'DOUBLE PRECISION',
129        'mysql': 'DOUBLE PRECISION',
130        'mssql': 'FLOAT',
131        'oracle': 'FLOAT',
132        'sqlite': 'FLOAT',
133        'duckdb': 'DOUBLE PRECISION',
134        'citus': 'DOUBLE PRECISION',
135        'cockroachdb': 'DOUBLE PRECISION',
136        'default': 'DOUBLE',
137    },
138    'double': {
139        'timescaledb': 'DOUBLE PRECISION',
140        'postgresql': 'DOUBLE PRECISION',
141        'mariadb': 'DOUBLE PRECISION',
142        'mysql': 'DOUBLE PRECISION',
143        'mssql': 'FLOAT',
144        'oracle': 'FLOAT',
145        'sqlite': 'FLOAT',
146        'duckdb': 'DOUBLE PRECISION',
147        'citus': 'DOUBLE PRECISION',
148        'cockroachdb': 'DOUBLE PRECISION',
149        'default': 'DOUBLE',
150    },
151    'datetime64[ns]': {
152        'timescaledb': 'TIMESTAMP',
153        'postgresql': 'TIMESTAMP',
154        'mariadb': 'DATETIME',
155        'mysql': 'DATETIME',
156        'mssql': 'DATETIME',
157        'oracle': 'DATE',
158        'sqlite': 'DATETIME',
159        'duckdb': 'TIMESTAMP',
160        'citus': 'TIMESTAMP',
161        'cockroachdb': 'TIMESTAMP',
162        'default': 'DATETIME',
163    },
164    'datetime64[ns, UTC]': {
165        'timescaledb': 'TIMESTAMP',
166        'postgresql': 'TIMESTAMP',
167        'mariadb': 'TIMESTAMP',
168        'mysql': 'TIMESTAMP',
169        'mssql': 'TIMESTAMP',
170        'oracle': 'TIMESTAMP',
171        'sqlite': 'TIMESTAMP',
172        'duckdb': 'TIMESTAMP',
173        'citus': 'TIMESTAMP',
174        'cockroachdb': 'TIMESTAMP',
175        'default': 'TIMESTAMP',
176    },
177    'bool': {
178        'timescaledb': 'BOOLEAN',
179        'postgresql': 'BOOLEAN',
180        'mariadb': 'BOOLEAN',
181        'mysql': 'BOOLEAN',
182        'mssql': 'INTEGER',
183        'oracle': 'INTEGER',
184        'sqlite': 'FLOAT',
185        'duckdb': 'BOOLEAN',
186        'citus': 'BOOLEAN',
187        'cockroachdb': 'BOOLEAN',
188        'default': 'BOOLEAN',
189    },
190    'object': {
191        'timescaledb': 'TEXT',
192        'postgresql': 'TEXT',
193        'mariadb': 'TEXT',
194        'mysql': 'TEXT',
195        'mssql': 'NVARCHAR(MAX)',
196        'oracle': 'NVARCHAR2(2000)',
197        'sqlite': 'TEXT',
198        'duckdb': 'TEXT',
199        'citus': 'TEXT',
200        'cockroachdb': 'TEXT',
201        'default': 'TEXT',
202    },
203    'string': {
204        'timescaledb': 'TEXT',
205        'postgresql': 'TEXT',
206        'mariadb': 'TEXT',
207        'mysql': 'TEXT',
208        'mssql': 'NVARCHAR(MAX)',
209        'oracle': 'NVARCHAR2(2000)',
210        'sqlite': 'TEXT',
211        'duckdb': 'TEXT',
212        'citus': 'TEXT',
213        'cockroachdb': 'TEXT',
214        'default': 'TEXT',
215    },
216    'json': {
217        'timescaledb': 'JSONB',
218        'postgresql': 'JSONB',
219        'mariadb': 'TEXT',
220        'mysql': 'TEXT',
221        'mssql': 'NVARCHAR(MAX)',
222        'oracle': 'NVARCHAR2(2000)',
223        'sqlite': 'TEXT',
224        'duckdb': 'TEXT',
225        'citus': 'JSONB',
226        'cockroachdb': 'JSONB',
227        'default': 'TEXT',
228    },
229    'numeric': {
230        'timescaledb': 'NUMERIC',
231        'postgresql': 'NUMERIC',
232        'mariadb': f'DECIMAL{NUMERIC_PRECISION_FLAVORS["mariadb"]}',
233        'mysql': f'DECIMAL{NUMERIC_PRECISION_FLAVORS["mysql"]}',
234        'mssql': f'NUMERIC{NUMERIC_PRECISION_FLAVORS["mssql"]}',
235        'oracle': 'NUMBER',
236        'sqlite': f'DECIMAL{NUMERIC_PRECISION_FLAVORS["sqlite"]}',
237        'duckdb': 'NUMERIC',
238        'citus': 'NUMERIC',
239        'cockroachdb': 'NUMERIC',
240        'default': 'NUMERIC',
241    },
242}
243PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {
244    'int': {
245        'timescaledb': 'BigInteger',
246        'postgresql': 'BigInteger',
247        'mariadb': 'BigInteger',
248        'mysql': 'BigInteger',
249        'mssql': 'BigInteger',
250        'oracle': 'BigInteger',
251        'sqlite': 'BigInteger',
252        'duckdb': 'BigInteger',
253        'citus': 'BigInteger',
254        'cockroachdb': 'BigInteger',
255        'default': 'BigInteger',
256    },
257    'float': {
258        'timescaledb': 'Float',
259        'postgresql': 'Float',
260        'mariadb': 'Float',
261        'mysql': 'Float',
262        'mssql': 'Float',
263        'oracle': 'Float',
264        'sqlite': 'Float',
265        'duckdb': 'Float',
266        'citus': 'Float',
267        'cockroachdb': 'Float',
268        'default': 'Float',
269    },
270    'datetime64[ns]': {
271        'timescaledb': 'DateTime',
272        'postgresql': 'DateTime',
273        'mariadb': 'DateTime',
274        'mysql': 'DateTime',
275        'mssql': 'DateTime',
276        'oracle': 'DateTime',
277        'sqlite': 'DateTime',
278        'duckdb': 'DateTime',
279        'citus': 'DateTime',
280        'cockroachdb': 'DateTime',
281        'default': 'DateTime',
282    },
283    'datetime64[ns, UTC]': {
284        'timescaledb': 'DateTime',
285        'postgresql': 'DateTime',
286        'mariadb': 'DateTime',
287        'mysql': 'DateTime',
288        'mssql': 'DateTime',
289        'oracle': 'DateTime',
290        'sqlite': 'DateTime',
291        'duckdb': 'DateTime',
292        'citus': 'DateTime',
293        'cockroachdb': 'DateTime',
294        'default': 'DateTime',
295    },
296    'bool': {
297        'timescaledb': 'Boolean',
298        'postgresql': 'Boolean',
299        'mariadb': 'Integer',
300        'mysql': 'Integer',
301        'mssql': 'Integer',
302        'oracle': 'Integer',
303        'sqlite': 'Float',
304        'duckdb': 'Boolean',
305        'citus': 'Boolean',
306        'cockroachdb': 'Boolean',
307        'default': 'Boolean',
308    },
309    'object': {
310        'timescaledb': 'UnicodeText',
311        'postgresql': 'UnicodeText',
312        'mariadb': 'UnicodeText',
313        'mysql': 'UnicodeText',
314        'mssql': 'UnicodeText',
315        'oracle': 'UnicodeText',
316        'sqlite': 'UnicodeText',
317        'duckdb': 'UnicodeText',
318        'citus': 'UnicodeText',
319        'cockroachdb': 'UnicodeText',
320        'default': 'UnicodeText',
321    },
322    'string': {
323        'timescaledb': 'UnicodeText',
324        'postgresql': 'UnicodeText',
325        'mariadb': 'UnicodeText',
326        'mysql': 'UnicodeText',
327        'mssql': 'UnicodeText',
328        'oracle': 'UnicodeText',
329        'sqlite': 'UnicodeText',
330        'duckdb': 'UnicodeText',
331        'citus': 'UnicodeText',
332        'cockroachdb': 'UnicodeText',
333        'default': 'UnicodeText',
334    },
335    'json': {
336        'timescaledb': 'JSONB',
337        'postgresql': 'JSONB',
338        'mariadb': 'UnicodeText',
339        'mysql': 'UnicodeText',
340        'mssql': 'UnicodeText',
341        'oracle': 'UnicodeText',
342        'sqlite': 'UnicodeText',
343        'duckdb': 'TEXT',
344        'citus': 'JSONB',
345        'cockroachdb': 'JSONB',
346        'default': 'UnicodeText',
347    },
348    'numeric': {
349        'timescaledb': 'Numeric',
350        'postgresql': 'Numeric',
351        'mariadb': 'Numeric',
352        'mysql': 'Numeric',
353        'mssql': 'Numeric',
354        'oracle': 'Numeric',
355        'sqlite': 'Numeric',
356        'duckdb': 'Numeric',
357        'citus': 'Numeric',
358        'cockroachdb': 'Numeric',
359        'default': 'Numeric',
360    },
361}
362
363
364def get_pd_type_from_db_type(db_type: str, allow_custom_dtypes: bool = False) -> str:
365    """
366    Parse a database type to a pandas data type.
367
368    Parameters
369    ----------
370    db_type: str
371        The database type, e.g. `DATETIME`, `BIGINT`, etc.
372
373    allow_custom_dtypes: bool, default False
374        If `True`, allow for custom data types like `json` and `str`.
375
376    Returns
377    -------
378    The equivalent datatype for a pandas DataFrame.
379    """
380    def parse_custom(_pd_type: str, _db_type: str) -> str:
381        if 'json' in _db_type.lower():
382            return 'json'
383        return _pd_type
384
385    pd_type = DB_TO_PD_DTYPES.get(db_type.upper(), None)
386    if pd_type is not None:
387        return (
388            parse_custom(pd_type, db_type)
389            if allow_custom_dtypes
390            else pd_type
391        )
392    for db_t, pd_t in DB_TO_PD_DTYPES['substrings'].items():
393        if db_t in db_type.upper():
394            return (
395                parse_custom(pd_t, db_t)
396                if allow_custom_dtypes
397                else pd_t
398            )
399    return DB_TO_PD_DTYPES['default']
400
401
402def get_db_type_from_pd_type(
403    pd_type: str,
404    flavor: str = 'default',
405    as_sqlalchemy: bool = False,
406) -> Union[str, 'sqlalchemy.sql.visitors.TraversibleType']:
407    """
408    Parse a Pandas data type into a flavor's database type.
409
410    Parameters
411    ----------
412    pd_type: str
413        The Pandas datatype. This must be a string, not the actual dtype object.
414
415    flavor: str, default 'default'
416        The flavor of the database to be mapped to.
417
418    as_sqlalchemy: bool, default False
419        If `True`, return a type from `sqlalchemy.types`.
420
421    Returns
422    -------
423    The database data type for the incoming Pandas data type.
424    If nothing can be found, a warning will be thrown and 'TEXT' will be returned.
425    """
426    from meerschaum.utils.warnings import warn
427    from meerschaum.utils.packages import attempt_import
428    from meerschaum.utils.dtypes import are_dtypes_equal
429    sqlalchemy_types = attempt_import('sqlalchemy.types')
430
431    types_registry = (
432        PD_TO_DB_DTYPES_FLAVORS
433        if not as_sqlalchemy
434        else PD_TO_SQLALCHEMY_DTYPES_FLAVORS
435    )
436
437    ### Check whether we are able to match this type (e.g. pyarrow support).
438    found_db_type = False
439    if pd_type not in types_registry:
440        for mapped_pd_type in types_registry:
441            if are_dtypes_equal(mapped_pd_type, pd_type):
442                pd_type = mapped_pd_type
443                found_db_type = True
444                break
445    else:
446        found_db_type = True
447
448    if not found_db_type:
449        warn(f"Unknown Pandas data type '{pd_type}'. Falling back to 'TEXT'.")
450        return (
451            'TEXT'
452            if not as_sqlalchemy
453            else sqlalchemy_types.UnicodeText
454        )
455    flavor_types = types_registry.get(
456        pd_type,
457        {
458            'default': (
459                'TEXT'
460                if not as_sqlalchemy
461                else 'UnicodeText'
462            ),
463        },
464    )
465    default_flavor_type = flavor_types.get(
466        'default',
467        (
468            'TEXT'
469            if not as_sqlalchemy
470            else 'UnicodeText'
471        ),
472    )
473    if flavor not in flavor_types:
474        warn(f"Unknown flavor '{flavor}'. Falling back to '{default_flavor_type}' (default).")
475    db_type = flavor_types.get(flavor, default_flavor_type)
476    if not as_sqlalchemy:
477        return db_type
478    if db_type == 'JSONB':
479        sqlalchemy_dialects_postgresql = attempt_import('sqlalchemy.dialects.postgresql')
480        return sqlalchemy_dialects_postgresql.JSONB
481    if 'numeric' in db_type.lower():
482        numeric_type_str = PD_TO_DB_DTYPES_FLAVORS['numeric'].get(flavor, 'NUMERIC')
483        if flavor not in NUMERIC_PRECISION_FLAVORS:
484            return sqlalchemy_types.Numeric
485        precision, scale = NUMERIC_PRECISION_FLAVORS[flavor]
486        return sqlalchemy_types.Numeric(precision, scale)
487    return getattr(sqlalchemy_types, db_type)
NUMERIC_PRECISION_FLAVORS: Dict[str, Tuple[int, int]] = {'mariadb': (38, 20), 'mysql': (38, 20), 'mssql': (28, 10), 'duckdb': (15, 4), 'sqlite': (15, 4)}
DB_FLAVORS_CAST_DTYPES = {'mariadb': {'BIGINT': 'DECIMAL', 'TINYINT': 'SIGNED INT', 'TEXT': 'CHAR(10000) CHARACTER SET utf8', 'BOOL': 'SIGNED INT', 'BOOLEAN': 'SIGNED INT', 'DOUBLE PRECISION': 'DECIMAL', 'DOUBLE': 'DECIMAL', 'FLOAT': 'DECIMAL', 'NUMERIC': 'NUMERIC(38, 20)', 'DECIMAL': 'DECIMAL(38, 20)'}, 'mysql': {'BIGINT': 'DECIMAL', 'TINYINT': 'SIGNED INT', 'TEXT': 'CHAR(10000) CHARACTER SET utf8', 'INT': 'SIGNED INT', 'INTEGER': 'SIGNED INT', 'BOOL': 'SIGNED INT', 'BOOLEAN': 'SIGNED INT', 'DOUBLE PRECISION': 'DECIMAL', 'DOUBLE': 'DECIMAL', 'FLOAT': 'DECIMAL', 'NUMERIC': 'NUMERIC(38, 20)', 'DECIMAL': 'DECIMAL(38, 20)'}, 'sqlite': {'BOOLEAN': 'INTEGER', 'REAL': 'FLOAT', 'NUMERIC': 'NUMERIC(15, 4)', 'DECIMAL': 'DECIMAL(15, 4)'}, 'oracle': {'NVARCHAR(2000)': 'NVARCHAR2(2000)', 'NVARCHAR': 'NVARCHAR2(2000)', 'NVARCHAR2': 'NVARCHAR2(2000)'}, 'mssql': {'NVARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)', 'NVARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)', 'VARCHAR COLLATE "SQL Latin1 General CP1 CI AS"': 'NVARCHAR(MAX)', 'VARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS"': 'NVARCHAR(MAX)', 'NUMERIC': 'NUMERIC(28, 10)', 'DECIMAL': 'DECIMAL(28, 10)'}, 'duckdb': {'NUMERIC': 'NUMERIC(15, 4)', 'DECIMAL': 'DECIMAL(15, 4)'}}
DB_TO_PD_DTYPES: Dict[str, Union[str, Dict[str, str]]] = {'FLOAT': 'float64[pyarrow]', 'REAL': 'float64[pyarrow]', 'DOUBLE_PRECISION': 'float64[pyarrow]', 'DOUBLE': 'float64[pyarrow]', 'DECIMAL': 'numeric', 'BIGINT': 'int64[pyarrow]', 'INT': 'int64[pyarrow]', 'INTEGER': 'int64[pyarrow]', 'NUMBER': 'numeric', 'NUMERIC': 'numeric', 'TIMESTAMP': 'datetime64[ns]', 'TIMESTAMP WITH TIMEZONE': 'datetime64[ns, UTC]', 'TIMESTAMPTZ': 'datetime64[ns, UTC]', 'DATE': 'datetime64[ns]', 'DATETIME': 'datetime64[ns]', 'TEXT': 'string[pyarrow]', 'CLOB': 'string[pyarrow]', 'BOOL': 'bool[pyarrow]', 'BOOLEAN': 'bool[pyarrow]', 'BOOLEAN()': 'bool[pyarrow]', 'TINYINT': 'bool[pyarrow]', 'TINYINT(1)': 'bool[pyarrow]', 'BIT': 'bool[pyarrow]', 'BIT(1)': 'bool[pyarrow]', 'JSON': 'json', 'JSONB': 'json', 'substrings': {'CHAR': 'string[pyarrow]', 'TIMESTAMP': 'datetime64[ns]', 'TIME': 'datetime64[ns]', 'DATE': 'datetime64[ns]', 'DOUBLE': 'double[pyarrow]', 'DECIMAL': 'numeric', 'NUMERIC': 'numeric', 'NUMBER': 'numeric', 'INT': 'int64[pyarrow]', 'BOOL': 'bool[pyarrow]', 'JSON': 'json'}, 'default': 'object'}
PD_TO_DB_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {'int': {'timescaledb': 'BIGINT', 'postgresql': 'BIGINT', 'mariadb': 'BIGINT', 'mysql': 'BIGINT', 'mssql': 'BIGINT', 'oracle': 'INT', 'sqlite': 'BIGINT', 'duckdb': 'BIGINT', 'citus': 'BIGINT', 'cockroachdb': 'BIGINT', 'default': 'INT'}, 'float': {'timescaledb': 'DOUBLE PRECISION', 'postgresql': 'DOUBLE PRECISION', 'mariadb': 'DOUBLE PRECISION', 'mysql': 'DOUBLE PRECISION', 'mssql': 'FLOAT', 'oracle': 'FLOAT', 'sqlite': 'FLOAT', 'duckdb': 'DOUBLE PRECISION', 'citus': 'DOUBLE PRECISION', 'cockroachdb': 'DOUBLE PRECISION', 'default': 'DOUBLE'}, 'double': {'timescaledb': 'DOUBLE PRECISION', 'postgresql': 'DOUBLE PRECISION', 'mariadb': 'DOUBLE PRECISION', 'mysql': 'DOUBLE PRECISION', 'mssql': 'FLOAT', 'oracle': 'FLOAT', 'sqlite': 'FLOAT', 'duckdb': 'DOUBLE PRECISION', 'citus': 'DOUBLE PRECISION', 'cockroachdb': 'DOUBLE PRECISION', 'default': 'DOUBLE'}, 'datetime64[ns]': {'timescaledb': 'TIMESTAMP', 'postgresql': 'TIMESTAMP', 'mariadb': 'DATETIME', 'mysql': 'DATETIME', 'mssql': 'DATETIME', 'oracle': 'DATE', 'sqlite': 'DATETIME', 'duckdb': 'TIMESTAMP', 'citus': 'TIMESTAMP', 'cockroachdb': 'TIMESTAMP', 'default': 'DATETIME'}, 'datetime64[ns, UTC]': {'timescaledb': 'TIMESTAMP', 'postgresql': 'TIMESTAMP', 'mariadb': 'TIMESTAMP', 'mysql': 'TIMESTAMP', 'mssql': 'TIMESTAMP', 'oracle': 'TIMESTAMP', 'sqlite': 'TIMESTAMP', 'duckdb': 'TIMESTAMP', 'citus': 'TIMESTAMP', 'cockroachdb': 'TIMESTAMP', 'default': 'TIMESTAMP'}, 'bool': {'timescaledb': 'BOOLEAN', 'postgresql': 'BOOLEAN', 'mariadb': 'BOOLEAN', 'mysql': 'BOOLEAN', 'mssql': 'INTEGER', 'oracle': 'INTEGER', 'sqlite': 'FLOAT', 'duckdb': 'BOOLEAN', 'citus': 'BOOLEAN', 'cockroachdb': 'BOOLEAN', 'default': 'BOOLEAN'}, 'object': {'timescaledb': 'TEXT', 'postgresql': 'TEXT', 'mariadb': 'TEXT', 'mysql': 'TEXT', 'mssql': 'NVARCHAR(MAX)', 'oracle': 'NVARCHAR2(2000)', 'sqlite': 'TEXT', 'duckdb': 'TEXT', 'citus': 'TEXT', 'cockroachdb': 'TEXT', 'default': 'TEXT'}, 'string': {'timescaledb': 'TEXT', 'postgresql': 'TEXT', 'mariadb': 'TEXT', 'mysql': 'TEXT', 'mssql': 'NVARCHAR(MAX)', 'oracle': 'NVARCHAR2(2000)', 'sqlite': 'TEXT', 'duckdb': 'TEXT', 'citus': 'TEXT', 'cockroachdb': 'TEXT', 'default': 'TEXT'}, 'json': {'timescaledb': 'JSONB', 'postgresql': 'JSONB', 'mariadb': 'TEXT', 'mysql': 'TEXT', 'mssql': 'NVARCHAR(MAX)', 'oracle': 'NVARCHAR2(2000)', 'sqlite': 'TEXT', 'duckdb': 'TEXT', 'citus': 'JSONB', 'cockroachdb': 'JSONB', 'default': 'TEXT'}, 'numeric': {'timescaledb': 'NUMERIC', 'postgresql': 'NUMERIC', 'mariadb': 'DECIMAL(38, 20)', 'mysql': 'DECIMAL(38, 20)', 'mssql': 'NUMERIC(28, 10)', 'oracle': 'NUMBER', 'sqlite': 'DECIMAL(15, 4)', 'duckdb': 'NUMERIC', 'citus': 'NUMERIC', 'cockroachdb': 'NUMERIC', 'default': 'NUMERIC'}}
PD_TO_SQLALCHEMY_DTYPES_FLAVORS: Dict[str, Dict[str, str]] = {'int': {'timescaledb': 'BigInteger', 'postgresql': 'BigInteger', 'mariadb': 'BigInteger', 'mysql': 'BigInteger', 'mssql': 'BigInteger', 'oracle': 'BigInteger', 'sqlite': 'BigInteger', 'duckdb': 'BigInteger', 'citus': 'BigInteger', 'cockroachdb': 'BigInteger', 'default': 'BigInteger'}, 'float': {'timescaledb': 'Float', 'postgresql': 'Float', 'mariadb': 'Float', 'mysql': 'Float', 'mssql': 'Float', 'oracle': 'Float', 'sqlite': 'Float', 'duckdb': 'Float', 'citus': 'Float', 'cockroachdb': 'Float', 'default': 'Float'}, 'datetime64[ns]': {'timescaledb': 'DateTime', 'postgresql': 'DateTime', 'mariadb': 'DateTime', 'mysql': 'DateTime', 'mssql': 'DateTime', 'oracle': 'DateTime', 'sqlite': 'DateTime', 'duckdb': 'DateTime', 'citus': 'DateTime', 'cockroachdb': 'DateTime', 'default': 'DateTime'}, 'datetime64[ns, UTC]': {'timescaledb': 'DateTime', 'postgresql': 'DateTime', 'mariadb': 'DateTime', 'mysql': 'DateTime', 'mssql': 'DateTime', 'oracle': 'DateTime', 'sqlite': 'DateTime', 'duckdb': 'DateTime', 'citus': 'DateTime', 'cockroachdb': 'DateTime', 'default': 'DateTime'}, 'bool': {'timescaledb': 'Boolean', 'postgresql': 'Boolean', 'mariadb': 'Integer', 'mysql': 'Integer', 'mssql': 'Integer', 'oracle': 'Integer', 'sqlite': 'Float', 'duckdb': 'Boolean', 'citus': 'Boolean', 'cockroachdb': 'Boolean', 'default': 'Boolean'}, 'object': {'timescaledb': 'UnicodeText', 'postgresql': 'UnicodeText', 'mariadb': 'UnicodeText', 'mysql': 'UnicodeText', 'mssql': 'UnicodeText', 'oracle': 'UnicodeText', 'sqlite': 'UnicodeText', 'duckdb': 'UnicodeText', 'citus': 'UnicodeText', 'cockroachdb': 'UnicodeText', 'default': 'UnicodeText'}, 'string': {'timescaledb': 'UnicodeText', 'postgresql': 'UnicodeText', 'mariadb': 'UnicodeText', 'mysql': 'UnicodeText', 'mssql': 'UnicodeText', 'oracle': 'UnicodeText', 'sqlite': 'UnicodeText', 'duckdb': 'UnicodeText', 'citus': 'UnicodeText', 'cockroachdb': 'UnicodeText', 'default': 'UnicodeText'}, 'json': {'timescaledb': 'JSONB', 'postgresql': 'JSONB', 'mariadb': 'UnicodeText', 'mysql': 'UnicodeText', 'mssql': 'UnicodeText', 'oracle': 'UnicodeText', 'sqlite': 'UnicodeText', 'duckdb': 'TEXT', 'citus': 'JSONB', 'cockroachdb': 'JSONB', 'default': 'UnicodeText'}, 'numeric': {'timescaledb': 'Numeric', 'postgresql': 'Numeric', 'mariadb': 'Numeric', 'mysql': 'Numeric', 'mssql': 'Numeric', 'oracle': 'Numeric', 'sqlite': 'Numeric', 'duckdb': 'Numeric', 'citus': 'Numeric', 'cockroachdb': 'Numeric', 'default': 'Numeric'}}
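
The constants above drive how the module maps types between pandas and each database flavor. As a minimal sketch (not part of the module), the hypothetical helper below consults DB_FLAVORS_CAST_DTYPES to render a flavor-aware CAST expression, falling back to the original type when no workaround is registered for that flavor:

    from meerschaum.utils.dtypes.sql import DB_FLAVORS_CAST_DTYPES

    def build_cast(column: str, db_type: str, flavor: str) -> str:
        """Hypothetical helper: apply the flavor's cast workaround if one exists."""
        cast_type = DB_FLAVORS_CAST_DTYPES.get(flavor, {}).get(db_type.upper(), db_type)
        return f"CAST({column} AS {cast_type})"

    build_cast('price', 'DOUBLE PRECISION', 'mysql')       # 'CAST(price AS DECIMAL)'
    build_cast('price', 'DOUBLE PRECISION', 'postgresql')  # 'CAST(price AS DOUBLE PRECISION)'
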
def get_pd_type_from_db_type(db_type: str, allow_custom_dtypes: bool = False) -> str:

Parse a database type to a pandas data type.

Parameters
  • db_type (str): The database type, e.g. DATETIME, BIGINT, etc.
  • allow_custom_dtypes (bool, default False): If True, allow for custom data types like json and str.
Returns
  • The equivalent datatype for a pandas DataFrame.
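
A short usage sketch; the values in the comments are the results implied by the DB_TO_PD_DTYPES mapping above (exact matches first, then substring matches, then the default):

    from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type

    get_pd_type_from_db_type('BIGINT')        # 'int64[pyarrow]'
    get_pd_type_from_db_type('TIMESTAMPTZ')   # 'datetime64[ns, UTC]'
    get_pd_type_from_db_type('VARCHAR(255)')  # 'string[pyarrow]' (matched via the 'CHAR' substring)
    get_pd_type_from_db_type('GEOMETRY')      # 'object' (no match, falls back to the default)
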
def get_db_type_from_pd_type(pd_type: str, flavor: str = 'default', as_sqlalchemy: bool = False) -> Union[str, 'sqlalchemy.sql.visitors.TraversibleType']:

Parse a Pandas data type into a flavor's database type.

Parameters
  • pd_type (str): The Pandas datatype. This must be a string, not the actual dtype object.
  • flavor (str, default 'default'): The flavor of the database to be mapped to.
  • as_sqlalchemy (bool, default False): If True, return a type from sqlalchemy.types.
Returns
  • The database data type for the incoming Pandas data type. If nothing can be found, a warning will be thrown and 'TEXT' will be returned.
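
A short usage sketch; the comments show the results implied by the mappings and matching logic above, and the last call returns a sqlalchemy Numeric built from the mssql entry in NUMERIC_PRECISION_FLAVORS:

    from meerschaum.utils.dtypes.sql import get_db_type_from_pd_type

    get_db_type_from_pd_type('int', flavor='postgresql')         # 'BIGINT'
    get_db_type_from_pd_type('datetime64[ns]', flavor='oracle')  # 'DATE'
    get_db_type_from_pd_type('json', flavor='mssql')             # 'NVARCHAR(MAX)'
    get_db_type_from_pd_type('int64[pyarrow]', flavor='sqlite')  # 'BIGINT' (normalized via are_dtypes_equal)
    get_db_type_from_pd_type('numeric', flavor='mssql', as_sqlalchemy=True)  # sqlalchemy Numeric(28, 10)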