meerschaum.utils.dtypes

Utility functions for working with data types.

  1#! /usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3# vim:fenc=utf-8
  4
  5"""
  6Utility functions for working with data types.
  7"""
  8
  9import traceback
 10from decimal import Decimal, Context, InvalidOperation
 11from meerschaum.utils.typing import Dict, Union, Any
 12from meerschaum.utils.warnings import warn
 13
 14MRSM_PD_DTYPES: Dict[str, str] = {
 15    'json': 'object',
 16    'numeric': 'object',
 17    'datetime': 'datetime64[ns]',
 18    'bool': 'bool[pyarrow]',
 19    'int': 'Int64',
 20    'int8': 'Int8',
 21    'int16': 'Int16',
 22    'int32': 'Int32',
 23    'int64': 'Int64',
 24    'str': 'string[python]',
 25}
 26
 27
 28def to_pandas_dtype(dtype: str) -> str:
 29    """
 30    Cast a supported Meerschaum dtype to a Pandas dtype.
 31    """
 32    known_dtype = MRSM_PD_DTYPES.get(dtype, None)
 33    if known_dtype is not None:
 34        return known_dtype
 35
 36    ### NOTE: Kind of a hack, but if the first word of the given dtype is in all caps,
 37    ### treat it as a SQL db type.
 38    if dtype.split(' ')[0].isupper():
 39        from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
 40        return get_pd_type_from_db_type(dtype)
 41
 42    from meerschaum.utils.packages import attempt_import
 43    pandas = attempt_import('pandas', lazy=False)
 44
 45    try:
 46        return str(pandas.api.types.pandas_dtype(dtype))
 47    except Exception as e:
 48        warn(
 49            f"Invalid dtype '{dtype}', will use 'object' instead:\n"
 50            + f"{traceback.format_exc()}",
 51            stack = False,
 52        )
 53    
 54    return 'object'
 55
 56
 57def are_dtypes_equal(
 58        ldtype: Union[str, Dict[str, str]],
 59        rdtype: Union[str, Dict[str, str]],
 60    ) -> bool:
 61    """
 62    Determine whether two dtype strings may be considered
 63    equivalent to avoid unnecessary conversions.
 64
 65    Parameters
 66    ----------
 67    ldtype: Union[str, Dict[str, str]]
 68        The left dtype to compare.
 69        May also provide a dtypes dictionary.
 70
 71    rdtype: Union[str, Dict[str, str]]
 72        The right dtype to compare.
 73        May also provide a dtypes dictionary.
 74
 75    Returns
 76    -------
 77    A `bool` indicating whether the two dtypes are to be considered equivalent.
 78    """
 79    if isinstance(ldtype, dict) and isinstance(rdtype, dict):
 80        lkeys = sorted([str(k) for k in ldtype.keys()])
 81        rkeys = sorted([str(k) for k in rdtype.keys()])
 82        for lkey, rkey in zip(lkeys, rkeys):
 83            if lkey != rkey:
 84                return False
 85            ltype = ldtype[lkey]
 86            rtype = rdtype[rkey]
 87            if not are_dtypes_equal(ltype, rtype):
 88                return False
 89        return True
 90
 91    try:
 92        if ldtype == rdtype:
 93            return True
 94    except Exception as e:
 95        warn(f"Exception when comparing dtypes, returning False:\n{traceback.format_exc()}")
 96        return False
 97
 98    ### Sometimes pandas dtype objects are passed.
 99    ldtype = str(ldtype)
100    rdtype = str(rdtype)
101
102    json_dtypes = ('json', 'object')
103    if ldtype in json_dtypes and rdtype in json_dtypes:
104        return True
105
106    numeric_dtypes = ('numeric', 'object')
107    if ldtype in numeric_dtypes and rdtype in numeric_dtypes:
108        return True
109
110    ldtype_clean = ldtype.split('[')[0]
111    rdtype_clean = rdtype.split('[')[0]
112
113    if ldtype_clean.lower() == rdtype_clean.lower():
114        return True
115
116    datetime_dtypes = ('datetime', 'timestamp')
117    ldtype_found_dt_prefix = False
118    rdtype_found_dt_prefix = False
119    for dt_prefix in datetime_dtypes:
120        ldtype_found_dt_prefix = (dt_prefix in ldtype.lower()) or ldtype_found_dt_prefix
121        rdtype_found_dt_prefix = (dt_prefix in rdtype.lower()) or rdtype_found_dt_prefix
122    if ldtype_found_dt_prefix and rdtype_found_dt_prefix:
123        return True
124
125    string_dtypes = ('str', 'string', 'object')
126    if ldtype_clean in string_dtypes and rdtype_clean in string_dtypes:
127        return True
128
129    int_dtypes = ('int', 'int64', 'int32', 'int16', 'int8')
130    if ldtype_clean.lower() in int_dtypes and rdtype_clean.lower() in int_dtypes:
131        return True
132
133    float_dtypes = ('float', 'float64', 'float32', 'float16', 'float128', 'double')
134    if ldtype_clean.lower() in float_dtypes and rdtype_clean.lower() in float_dtypes:
135        return True
136
137    bool_dtypes = ('bool', 'boolean')
138    if ldtype_clean in bool_dtypes and rdtype_clean in bool_dtypes:
139        return True
140
141    return False
142
143
def is_dtype_numeric(dtype: str) -> bool:
    """
    Determine whether a given `dtype` string
    should be considered compatible with the Meerschaum dtype `numeric`.

    Parameters
    ----------
    dtype: str
        The pandas-like dtype string.
        Non-string dtype objects are accepted and compared by their string form.

    Returns
    -------
    A bool indicating the dtype is compatible with `numeric`.
    """
    ### Normalize through `str()` so that dtype objects do not raise on `.lower()`.
    dtype_lower = str(dtype).lower()

    ### NOTE: This is a case-insensitive substring match, so e.g. `uint8` and `Float64` qualify.
    acceptable_substrings = ('numeric', 'float', 'double', 'int')
    return any(substring in dtype_lower for substring in acceptable_substrings)
166
167
def attempt_cast_to_numeric(value: Any) -> Any:
    """
    Try to convert a value into a `Decimal`.

    Existing `Decimal` values are returned untouched,
    null-like values (per `value_is_null()`) become `Decimal('NaN')`,
    and anything else is parsed from its string form.
    If the conversion raises, the original value is returned unchanged.
    """
    if isinstance(value, Decimal):
        return value

    try:
        if value_is_null(value):
            return Decimal('NaN')
        return Decimal(str(value))
    except Exception:
        ### Best-effort: leave values which cannot be parsed as they were.
        return value
182
183
def value_is_null(value: Any) -> bool:
    """
    Check whether a value's string form (case-insensitive) is a known null marker.
    """
    null_markers = ('none', 'nan', 'na', 'nat', '', '<na>')
    normalized = str(value).lower()
    return normalized in null_markers
189
190
def none_if_null(value: Any) -> Any:
    """
    Replace a null-like value (per `value_is_null()`) with `None`;
    any other value is passed through unchanged.
    """
    if value_is_null(value):
        return None
    return value
196
197
def quantize_decimal(x: Decimal, scale: int, precision: int) -> Decimal:
    """
    Round a `Decimal` to a fixed number of decimal places
    within a maximum number of total digits.

    Parameters
    ----------
    x: Decimal
        The `Decimal` to be quantized.

    scale: int
        The total number of significant digits (used as the context's `prec`).

    precision: int
        The number of significant digits after the decimal point.

    Returns
    -------
    The quantized `Decimal`, or `x` itself if quantizing
    raises `InvalidOperation` (e.g. the result needs more than `scale` digits).
    """
    ### Only the exponent of this template matters to `quantize()`.
    integer_digits = '1' * scale
    fractional_digits = '1' * precision
    template = Decimal(integer_digits + '.' + fractional_digits)

    try:
        quantized = x.quantize(template, context=Context(prec=scale))
    except InvalidOperation:
        return x

    return quantized
### Lookup table from Meerschaum dtype names to pandas dtype strings.
### NOTE: Both `json` and `numeric` map to `object`, and `int` shares `Int64` with `int64`.
MRSM_PD_DTYPES: Dict[str, str] = dict(
    json='object',
    numeric='object',
    datetime='datetime64[ns]',
    bool='bool[pyarrow]',
    int='Int64',
    int8='Int8',
    int16='Int16',
    int32='Int32',
    int64='Int64',
    str='string[python]',
)
def to_pandas_dtype(dtype: str) -> str:
def to_pandas_dtype(dtype: str) -> str:
    """
    Translate a dtype string into the pandas dtype string used to represent it.

    Parameters
    ----------
    dtype: str
        A Meerschaum dtype (a key of `MRSM_PD_DTYPES`), a database type
        (first word in all caps), or anything `pandas_dtype()` can parse.

    Returns
    -------
    The corresponding pandas dtype string.
    Falls back to `'object'` (after emitting a warning) if pandas rejects the dtype.
    """
    mapped_dtype = MRSM_PD_DTYPES.get(dtype)
    if mapped_dtype is not None:
        return mapped_dtype

    ### NOTE: Heuristic: an all-caps first word means the dtype is a SQL database type.
    leading_word = dtype.split(' ')[0]
    if leading_word.isupper():
        from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
        return get_pd_type_from_db_type(dtype)

    from meerschaum.utils.packages import attempt_import
    pd = attempt_import('pandas', lazy=False)

    try:
        resolved_dtype = str(pd.api.types.pandas_dtype(dtype))
    except Exception:
        warn(
            f"Invalid dtype '{dtype}', will use 'object' instead:\n"
            + f"{traceback.format_exc()}",
            stack = False,
        )
        resolved_dtype = 'object'

    return resolved_dtype

Cast a supported Meerschaum dtype to a Pandas dtype.

def are_dtypes_equal( ldtype: Union[str, Dict[str, str]], rdtype: Union[str, Dict[str, str]]) -> bool:
 58def are_dtypes_equal(
 59        ldtype: Union[str, Dict[str, str]],
 60        rdtype: Union[str, Dict[str, str]],
 61    ) -> bool:
 62    """
 63    Determine whether two dtype strings may be considered
 64    equivalent to avoid unnecessary conversions.
 65
 66    Parameters
 67    ----------
 68    ldtype: Union[str, Dict[str, str]]
 69        The left dtype to compare.
 70        May also provide a dtypes dictionary.
 71
 72    rdtype: Union[str, Dict[str, str]]
 73        The right dtype to compare.
 74        May also provide a dtypes dictionary.
 75
 76    Returns
 77    -------
 78    A `bool` indicating whether the two dtypes are to be considered equivalent.
 79    """
 80    if isinstance(ldtype, dict) and isinstance(rdtype, dict):
 81        lkeys = sorted([str(k) for k in ldtype.keys()])
 82        rkeys = sorted([str(k) for k in rdtype.keys()])
 83        for lkey, rkey in zip(lkeys, rkeys):
 84            if lkey != rkey:
 85                return False
 86            ltype = ldtype[lkey]
 87            rtype = rdtype[rkey]
 88            if not are_dtypes_equal(ltype, rtype):
 89                return False
 90        return True
 91
 92    try:
 93        if ldtype == rdtype:
 94            return True
 95    except Exception as e:
 96        warn(f"Exception when comparing dtypes, returning False:\n{traceback.format_exc()}")
 97        return False
 98
 99    ### Sometimes pandas dtype objects are passed.
100    ldtype = str(ldtype)
101    rdtype = str(rdtype)
102
103    json_dtypes = ('json', 'object')
104    if ldtype in json_dtypes and rdtype in json_dtypes:
105        return True
106
107    numeric_dtypes = ('numeric', 'object')
108    if ldtype in numeric_dtypes and rdtype in numeric_dtypes:
109        return True
110
111    ldtype_clean = ldtype.split('[')[0]
112    rdtype_clean = rdtype.split('[')[0]
113
114    if ldtype_clean.lower() == rdtype_clean.lower():
115        return True
116
117    datetime_dtypes = ('datetime', 'timestamp')
118    ldtype_found_dt_prefix = False
119    rdtype_found_dt_prefix = False
120    for dt_prefix in datetime_dtypes:
121        ldtype_found_dt_prefix = (dt_prefix in ldtype.lower()) or ldtype_found_dt_prefix
122        rdtype_found_dt_prefix = (dt_prefix in rdtype.lower()) or rdtype_found_dt_prefix
123    if ldtype_found_dt_prefix and rdtype_found_dt_prefix:
124        return True
125
126    string_dtypes = ('str', 'string', 'object')
127    if ldtype_clean in string_dtypes and rdtype_clean in string_dtypes:
128        return True
129
130    int_dtypes = ('int', 'int64', 'int32', 'int16', 'int8')
131    if ldtype_clean.lower() in int_dtypes and rdtype_clean.lower() in int_dtypes:
132        return True
133
134    float_dtypes = ('float', 'float64', 'float32', 'float16', 'float128', 'double')
135    if ldtype_clean.lower() in float_dtypes and rdtype_clean.lower() in float_dtypes:
136        return True
137
138    bool_dtypes = ('bool', 'boolean')
139    if ldtype_clean in bool_dtypes and rdtype_clean in bool_dtypes:
140        return True
141
142    return False

Determine whether two dtype strings may be considered equivalent to avoid unnecessary conversions.

Parameters
  • ldtype (Union[str, Dict[str, str]]): The left dtype to compare. May also provide a dtypes dictionary.
  • rdtype (Union[str, Dict[str, str]]): The right dtype to compare. May also provide a dtypes dictionary.
Returns
  • A bool indicating whether the two dtypes are to be considered equivalent.
def is_dtype_numeric(dtype: str) -> bool:
def is_dtype_numeric(dtype: str) -> bool:
    """
    Determine whether a given `dtype` string
    should be considered compatible with the Meerschaum dtype `numeric`.

    Parameters
    ----------
    dtype: str
        The pandas-like dtype string.
        Non-string dtype objects are accepted and compared by their string form.

    Returns
    -------
    A bool indicating the dtype is compatible with `numeric`.
    """
    ### Normalize through `str()` so that dtype objects do not raise on `.lower()`.
    dtype_lower = str(dtype).lower()

    ### NOTE: This is a case-insensitive substring match, so e.g. `uint8` and `Float64` qualify.
    acceptable_substrings = ('numeric', 'float', 'double', 'int')
    return any(substring in dtype_lower for substring in acceptable_substrings)

Determine whether a given dtype string should be considered compatible with the Meerschaum dtype numeric.

Parameters
  • dtype (str): The pandas-like dtype string.
Returns
  • A bool indicating the dtype is compatible with numeric.
def attempt_cast_to_numeric(value: Any) -> Any:
def attempt_cast_to_numeric(value: Any) -> Any:
    """
    Try to convert a value into a `Decimal`.

    Existing `Decimal` values are returned untouched,
    null-like values (per `value_is_null()`) become `Decimal('NaN')`,
    and anything else is parsed from its string form.
    If the conversion raises, the original value is returned unchanged.
    """
    if isinstance(value, Decimal):
        return value

    try:
        if value_is_null(value):
            return Decimal('NaN')
        return Decimal(str(value))
    except Exception:
        ### Best-effort: leave values which cannot be parsed as they were.
        return value

Given a value, attempt to coerce it into a numeric (Decimal).

def value_is_null(value: Any) -> bool:
def value_is_null(value: Any) -> bool:
    """
    Check whether a value's string form (case-insensitive) is a known null marker.
    """
    null_markers = ('none', 'nan', 'na', 'nat', '', '<na>')
    normalized = str(value).lower()
    return normalized in null_markers

Determine if a value is a null-like string.

def none_if_null(value: Any) -> Any:
def none_if_null(value: Any) -> Any:
    """
    Replace a null-like value (per `value_is_null()`) with `None`;
    any other value is passed through unchanged.
    """
    if value_is_null(value):
        return None
    return value

Return None if a value is a null-like string.

def quantize_decimal(x: decimal.Decimal, scale: int, precision: int) -> decimal.Decimal:
def quantize_decimal(x: Decimal, scale: int, precision: int) -> Decimal:
    """
    Round a `Decimal` to a fixed number of decimal places
    within a maximum number of total digits.

    Parameters
    ----------
    x: Decimal
        The `Decimal` to be quantized.

    scale: int
        The total number of significant digits (used as the context's `prec`).

    precision: int
        The number of significant digits after the decimal point.

    Returns
    -------
    The quantized `Decimal`, or `x` itself if quantizing
    raises `InvalidOperation` (e.g. the result needs more than `scale` digits).
    """
    ### Only the exponent of this template matters to `quantize()`.
    integer_digits = '1' * scale
    fractional_digits = '1' * precision
    template = Decimal(integer_digits + '.' + fractional_digits)

    try:
        quantized = x.quantize(template, context=Context(prec=scale))
    except InvalidOperation:
        return x

    return quantized

Quantize a given Decimal to a known scale and precision.

Parameters
  • x (Decimal): The Decimal to be quantized.
  • scale (int): The total number of significant digits.
  • precision (int): The number of significant digits after the decimal point.
Returns
  • A Decimal quantized to the specified scale and precision.