meerschaum.utils.dtypes

Utility functions for working with data types.

  1#! /usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3# vim:fenc=utf-8
  4
  5"""
  6Utility functions for working with data types.
  7"""
  8
  9from decimal import Decimal, Context, InvalidOperation
 10from meerschaum.utils.typing import Dict, Union, Any
 11
### Mapping of Meerschaum dtype names to Pandas dtype strings.
### `to_pandas_dtype()` consults this table first and returns the mapped value on a hit.
### NOTE: 'json' and 'numeric' both map to 'object', and 'int' maps to the same
### nullable 'Int64' dtype as 'int64'.
MRSM_PD_DTYPES: Dict[str, str] = {
    'json': 'object',
    'numeric': 'object',
    'datetime': 'datetime64[ns]',
    'bool': 'bool[pyarrow]',
    'int': 'Int64',
    'int8': 'Int8',
    'int16': 'Int16',
    'int32': 'Int32',
    'int64': 'Int64',
    'str': 'string[python]',
}
 24
 25
 26def to_pandas_dtype(dtype: str) -> str:
 27    """
 28    Cast a supported Meerschaum dtype to a Pandas dtype.
 29    """
 30    known_dtype = MRSM_PD_DTYPES.get(dtype, None)
 31    if known_dtype is not None:
 32        return known_dtype
 33
 34    ### NOTE: Kind of a hack, but if the first word of the given dtype is in all caps,
 35    ### treat it as a SQL db type.
 36    if dtype.split(' ')[0].isupper():
 37        from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
 38        return get_pd_type_from_db_type(dtype)
 39
 40    import traceback
 41    from meerschaum.utils.packages import attempt_import
 42    from meerschaum.utils.warnings import warn
 43    pandas = attempt_import('pandas', lazy=False)
 44
 45    try:
 46        return str(pandas.api.types.pandas_dtype(dtype))
 47    except Exception as e:
 48        warn(
 49            f"Invalid dtype '{dtype}', will use 'object' instead:\n"
 50            + f"{traceback.format_exc()}",
 51            stack = False,
 52        )
 53    
 54    return 'object'
 55
 56
 57def are_dtypes_equal(
 58        ldtype: Union[str, Dict[str, str]],
 59        rdtype: Union[str, Dict[str, str]],
 60    ) -> bool:
 61    """
 62    Determine whether two dtype strings may be considered
 63    equivalent to avoid unnecessary conversions.
 64
 65    Parameters
 66    ----------
 67    ldtype: Union[str, Dict[str, str]]
 68        The left dtype to compare.
 69        May also provide a dtypes dictionary.
 70
 71    rdtype: Union[str, Dict[str, str]]
 72        The right dtype to compare.
 73        May also provide a dtypes dictionary.
 74
 75    Returns
 76    -------
 77    A `bool` indicating whether the two dtypes are to be considered equivalent.
 78    """
 79    if isinstance(ldtype, dict) and isinstance(rdtype, dict):
 80        lkeys = sorted([str(k) for k in ldtype.keys()])
 81        rkeys = sorted([str(k) for k in rdtype.keys()])
 82        for lkey, rkey in zip(lkeys, rkeys):
 83            if lkey != rkey:
 84                return False
 85            ltype = ldtype[lkey]
 86            rtype = rdtype[rkey]
 87            if not are_dtypes_equal(ltype, rtype):
 88                return False
 89        return True
 90
 91    if ldtype == rdtype:
 92        return True
 93
 94    ### Sometimes pandas dtype objects are passed.
 95    ldtype = str(ldtype)
 96    rdtype = str(rdtype)
 97
 98    json_dtypes = ('json', 'object')
 99    if ldtype in json_dtypes and rdtype in json_dtypes:
100        return True
101
102    numeric_dtypes = ('numeric', 'object')
103    if ldtype in numeric_dtypes and rdtype in numeric_dtypes:
104        return True
105
106    ldtype_clean = ldtype.split('[')[0]
107    rdtype_clean = rdtype.split('[')[0]
108
109    if ldtype_clean.lower() == rdtype_clean.lower():
110        return True
111
112    datetime_dtypes = ('datetime', 'timestamp')
113    ldtype_found_dt_prefix = False
114    rdtype_found_dt_prefix = False
115    for dt_prefix in datetime_dtypes:
116        ldtype_found_dt_prefix = (dt_prefix in ldtype.lower()) or ldtype_found_dt_prefix
117        rdtype_found_dt_prefix = (dt_prefix in rdtype.lower()) or rdtype_found_dt_prefix
118    if ldtype_found_dt_prefix and rdtype_found_dt_prefix:
119        return True
120
121    string_dtypes = ('str', 'string', 'object')
122    if ldtype_clean in string_dtypes and rdtype_clean in string_dtypes:
123        return True
124
125    int_dtypes = ('int', 'int64', 'int32', 'int16', 'int8')
126    if ldtype_clean.lower() in int_dtypes and rdtype_clean.lower() in int_dtypes:
127        return True
128
129    float_dtypes = ('float', 'float64', 'float32', 'float16', 'float128', 'double')
130    if ldtype_clean.lower() in float_dtypes and rdtype_clean.lower() in float_dtypes:
131        return True
132
133    bool_dtypes = ('bool', 'boolean')
134    if ldtype_clean in bool_dtypes and rdtype_clean in bool_dtypes:
135        return True
136
137    return False
138
139
def is_dtype_numeric(dtype: str) -> bool:
    """
    Determine whether a given `dtype` string
    should be considered compatible with the Meerschaum dtype `numeric`.

    Parameters
    ----------
    dtype: str
        The pandas-like dtype string.
        Dtype objects are also accepted and are converted with `str()`.

    Returns
    -------
    A bool indicating the dtype is compatible with `numeric`.
    """
    ### Stringify first because dtype objects (which have no `.lower()`) may be passed.
    dtype_lower = str(dtype).lower()

    ### NOTE: This is a substring match, so e.g. 'Int64' and 'double precision' qualify.
    acceptable_substrings = ('numeric', 'float', 'double', 'int')
    return any(substring in dtype_lower for substring in acceptable_substrings)
162
163
def attempt_cast_to_numeric(value: Any) -> Any:
    """
    Try to convert a value into a `Decimal`.

    Parameters
    ----------
    value: Any
        The value to convert.

    Returns
    -------
    The value itself if it is already a `Decimal`,
    `Decimal('NaN')` if the value is null-like,
    otherwise the `Decimal` parsed from the value's string form.
    If the conversion fails, the original value is returned unchanged.
    """
    if isinstance(value, Decimal):
        return value

    try:
        if value_is_null(value):
            return Decimal('NaN')
        return Decimal(str(value))
    except Exception:
        ### Best-effort: hand back whatever could not be parsed.
        return value
178
179
def value_is_null(value: Any) -> bool:
    """
    Determine whether a value's string representation is null-like.

    Parameters
    ----------
    value: Any
        The value to inspect. It is converted with `str()` and lower-cased
        before being compared.

    Returns
    -------
    `True` if the lower-cased string is one of
    `'none'`, `'nan'`, `'na'`, `'nat'`, `'<na>'`, or the empty string.
    """
    null_strings = ('none', 'nan', 'na', 'nat', '', '<na>')
    return str(value).lower() in null_strings
185
186
def none_if_null(value: Any) -> Any:
    """
    Replace a null-like value with `None`.

    Parameters
    ----------
    value: Any
        The value to check with `value_is_null()`.

    Returns
    -------
    `None` if the value is null-like, otherwise the value unchanged.
    """
    if value_is_null(value):
        return None
    return value
192
193
def quantize_decimal(x: Decimal, scale: int, precision: int) -> Decimal:
    """
    Quantize a given `Decimal` to a known scale and precision.

    NOTE: The parameter names are the reverse of the SQL `NUMERIC(precision, scale)`
    convention: here `scale` bounds the total digits and `precision` the fractional digits.

    Parameters
    ----------
    x: Decimal
        The `Decimal` to be quantized.

    scale: int
        The total number of significant digits.

    precision: int
        The number of digits after the decimal point.
        Negative values are treated as zero.

    Returns
    -------
    A `Decimal` quantized to the specified scale and precision.
    If `x` cannot be represented within `scale` digits (or `scale` is less than 1),
    `x` is returned unchanged.
    """
    ### A context requires at least one digit of precision (`Context(prec=0)` raises),
    ### so there is nothing meaningful to quantize to.
    if scale < 1:
        return x

    ### `quantize()` only reads the exponent of its operand,
    ### so build `1E-precision` directly rather than a string of ones.
    quantum = Decimal((0, (1,), -max(precision, 0)))
    try:
        return x.quantize(quantum, context=Context(prec=scale))
    except InvalidOperation:
        ### Raised when the result needs more than `scale` digits (or `x` is infinite).
        return x
MRSM_PD_DTYPES: Dict[str, str] = {'json': 'object', 'numeric': 'object', 'datetime': 'datetime64[ns]', 'bool': 'bool[pyarrow]', 'int': 'Int64', 'int8': 'Int8', 'int16': 'Int16', 'int32': 'Int32', 'int64': 'Int64', 'str': 'string[python]'}
def to_pandas_dtype(dtype: str) -> str:
def to_pandas_dtype(dtype: str) -> str:
    """
    Translate a Meerschaum dtype into the name of a Pandas dtype.

    Parameters
    ----------
    dtype: str
        A Meerschaum dtype (a key of `MRSM_PD_DTYPES`), a SQL database type
        (first word in all caps), or a dtype string which Pandas understands.

    Returns
    -------
    The Pandas dtype as a string.
    Falls back to `'object'` (with a warning) if the dtype cannot be resolved.
    """
    mapped_dtype = MRSM_PD_DTYPES.get(dtype)
    if mapped_dtype is not None:
        return mapped_dtype

    ### NOTE: Kind of a hack, but if the first word of the given dtype is in all caps,
    ### treat it as a SQL db type.
    first_word = dtype.split(' ')[0]
    if first_word.isupper():
        from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
        return get_pd_type_from_db_type(dtype)

    import traceback
    from meerschaum.utils.packages import attempt_import
    from meerschaum.utils.warnings import warn
    pandas = attempt_import('pandas', lazy=False)

    ### Let Pandas itself parse anything that wasn't recognized above.
    try:
        resolved_dtype = pandas.api.types.pandas_dtype(dtype)
        return str(resolved_dtype)
    except Exception:
        message = (
            f"Invalid dtype '{dtype}', will use 'object' instead:\n"
            + f"{traceback.format_exc()}"
        )
        warn(message, stack=False)
        return 'object'

Cast a supported Meerschaum dtype to a Pandas dtype.

def are_dtypes_equal( ldtype: Union[str, Dict[str, str]], rdtype: Union[str, Dict[str, str]]) -> bool:
 58def are_dtypes_equal(
 59        ldtype: Union[str, Dict[str, str]],
 60        rdtype: Union[str, Dict[str, str]],
 61    ) -> bool:
 62    """
 63    Determine whether two dtype strings may be considered
 64    equivalent to avoid unnecessary conversions.
 65
 66    Parameters
 67    ----------
 68    ldtype: Union[str, Dict[str, str]]
 69        The left dtype to compare.
 70        May also provide a dtypes dictionary.
 71
 72    rdtype: Union[str, Dict[str, str]]
 73        The right dtype to compare.
 74        May also provide a dtypes dictionary.
 75
 76    Returns
 77    -------
 78    A `bool` indicating whether the two dtypes are to be considered equivalent.
 79    """
 80    if isinstance(ldtype, dict) and isinstance(rdtype, dict):
 81        lkeys = sorted([str(k) for k in ldtype.keys()])
 82        rkeys = sorted([str(k) for k in rdtype.keys()])
 83        for lkey, rkey in zip(lkeys, rkeys):
 84            if lkey != rkey:
 85                return False
 86            ltype = ldtype[lkey]
 87            rtype = rdtype[rkey]
 88            if not are_dtypes_equal(ltype, rtype):
 89                return False
 90        return True
 91
 92    if ldtype == rdtype:
 93        return True
 94
 95    ### Sometimes pandas dtype objects are passed.
 96    ldtype = str(ldtype)
 97    rdtype = str(rdtype)
 98
 99    json_dtypes = ('json', 'object')
100    if ldtype in json_dtypes and rdtype in json_dtypes:
101        return True
102
103    numeric_dtypes = ('numeric', 'object')
104    if ldtype in numeric_dtypes and rdtype in numeric_dtypes:
105        return True
106
107    ldtype_clean = ldtype.split('[')[0]
108    rdtype_clean = rdtype.split('[')[0]
109
110    if ldtype_clean.lower() == rdtype_clean.lower():
111        return True
112
113    datetime_dtypes = ('datetime', 'timestamp')
114    ldtype_found_dt_prefix = False
115    rdtype_found_dt_prefix = False
116    for dt_prefix in datetime_dtypes:
117        ldtype_found_dt_prefix = (dt_prefix in ldtype.lower()) or ldtype_found_dt_prefix
118        rdtype_found_dt_prefix = (dt_prefix in rdtype.lower()) or rdtype_found_dt_prefix
119    if ldtype_found_dt_prefix and rdtype_found_dt_prefix:
120        return True
121
122    string_dtypes = ('str', 'string', 'object')
123    if ldtype_clean in string_dtypes and rdtype_clean in string_dtypes:
124        return True
125
126    int_dtypes = ('int', 'int64', 'int32', 'int16', 'int8')
127    if ldtype_clean.lower() in int_dtypes and rdtype_clean.lower() in int_dtypes:
128        return True
129
130    float_dtypes = ('float', 'float64', 'float32', 'float16', 'float128', 'double')
131    if ldtype_clean.lower() in float_dtypes and rdtype_clean.lower() in float_dtypes:
132        return True
133
134    bool_dtypes = ('bool', 'boolean')
135    if ldtype_clean in bool_dtypes and rdtype_clean in bool_dtypes:
136        return True
137
138    return False

Determine whether two dtype strings may be considered equivalent to avoid unnecessary conversions.

Parameters
  • ldtype (Union[str, Dict[str, str]]): The left dtype to compare. May also provide a dtypes dictionary.
  • rdtype (Union[str, Dict[str, str]]): The right dtype to compare. May also provide a dtypes dictionary.
Returns
  • A bool indicating whether the two dtypes are to be considered equivalent.
def is_dtype_numeric(dtype: str) -> bool:
def is_dtype_numeric(dtype: str) -> bool:
    """
    Determine whether a given `dtype` string
    should be considered compatible with the Meerschaum dtype `numeric`.

    Parameters
    ----------
    dtype: str
        The pandas-like dtype string.
        Dtype objects are also accepted and are converted with `str()`.

    Returns
    -------
    A bool indicating the dtype is compatible with `numeric`.
    """
    ### Stringify first because dtype objects (which have no `.lower()`) may be passed.
    dtype_lower = str(dtype).lower()

    ### NOTE: This is a substring match, so e.g. 'Int64' and 'double precision' qualify.
    acceptable_substrings = ('numeric', 'float', 'double', 'int')
    return any(substring in dtype_lower for substring in acceptable_substrings)

Determine whether a given dtype string should be considered compatible with the Meerschaum dtype numeric.

Parameters
  • dtype (str): The pandas-like dtype string.
Returns
  • A bool indicating the dtype is compatible with numeric.
def attempt_cast_to_numeric(value: Any) -> Any:
def attempt_cast_to_numeric(value: Any) -> Any:
    """
    Try to convert a value into a `Decimal`.

    Parameters
    ----------
    value: Any
        The value to convert.

    Returns
    -------
    The value itself if it is already a `Decimal`,
    `Decimal('NaN')` if the value is null-like,
    otherwise the `Decimal` parsed from the value's string form.
    If the conversion fails, the original value is returned unchanged.
    """
    if isinstance(value, Decimal):
        return value

    try:
        if value_is_null(value):
            return Decimal('NaN')
        return Decimal(str(value))
    except Exception:
        ### Best-effort: hand back whatever could not be parsed.
        return value

Given a value, attempt to coerce it into a numeric (Decimal).

def value_is_null(value: Any) -> Any:
def value_is_null(value: Any) -> bool:
    """
    Determine whether a value's string representation is null-like.

    Parameters
    ----------
    value: Any
        The value to inspect. It is converted with `str()` and lower-cased
        before being compared.

    Returns
    -------
    `True` if the lower-cased string is one of
    `'none'`, `'nan'`, `'na'`, `'nat'`, `'<na>'`, or the empty string.
    """
    null_strings = ('none', 'nan', 'na', 'nat', '', '<na>')
    return str(value).lower() in null_strings

Determine whether a value's string representation is null-like (i.e. it reads as None, NaN, NA, NaT, <NA>, or an empty string, ignoring case).

def none_if_null(value: Any) -> Any:
def none_if_null(value: Any) -> Any:
    """
    Replace a null-like value with `None`.

    Parameters
    ----------
    value: Any
        The value to check with `value_is_null()`.

    Returns
    -------
    `None` if the value is null-like, otherwise the value unchanged.
    """
    if value_is_null(value):
        return None
    return value

Return None if a value is a null-like string.

def quantize_decimal(x: decimal.Decimal, scale: int, precision: int) -> decimal.Decimal:
def quantize_decimal(x: Decimal, scale: int, precision: int) -> Decimal:
    """
    Quantize a given `Decimal` to a known scale and precision.

    NOTE: The parameter names are the reverse of the SQL `NUMERIC(precision, scale)`
    convention: here `scale` bounds the total digits and `precision` the fractional digits.

    Parameters
    ----------
    x: Decimal
        The `Decimal` to be quantized.

    scale: int
        The total number of significant digits.

    precision: int
        The number of digits after the decimal point.
        Negative values are treated as zero.

    Returns
    -------
    A `Decimal` quantized to the specified scale and precision.
    If `x` cannot be represented within `scale` digits (or `scale` is less than 1),
    `x` is returned unchanged.
    """
    ### A context requires at least one digit of precision (`Context(prec=0)` raises),
    ### so there is nothing meaningful to quantize to.
    if scale < 1:
        return x

    ### `quantize()` only reads the exponent of its operand,
    ### so build `1E-precision` directly rather than a string of ones.
    quantum = Decimal((0, (1,), -max(precision, 0)))
    try:
        return x.quantize(quantum, context=Context(prec=scale))
    except InvalidOperation:
        ### Raised when the result needs more than `scale` digits (or `x` is infinite).
        return x

Quantize a given Decimal to a known scale and precision.

Parameters
  • x (Decimal): The Decimal to be quantized.
  • scale (int): The total number of significant digits.
  • precision (int): The number of significant digits after the decimal point.
Returns
  • A Decimal quantized to the specified scale and precision.