meerschaum.utils.dtypes

Utility functions for working with data types.

  1#! /usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3# vim:fenc=utf-8
  4
  5"""
  6Utility functions for working with data types.
  7"""
  8
  9import traceback
 10import uuid
 11from datetime import timezone
 12from decimal import Decimal, Context, InvalidOperation
 13
 14import meerschaum as mrsm
 15from meerschaum.utils.typing import Dict, Union, Any
 16from meerschaum.utils.warnings import warn
 17
### Lookup table from Meerschaum dtype names to Pandas dtype strings.
### `json`, `numeric`, and `uuid` have no dedicated Pandas dtype here and map to `object`.
### `int` and `int64` both map to the nullable `Int64` dtype.
MRSM_PD_DTYPES: Dict[str, str] = {
    'json': 'object',
    'numeric': 'object',
    'uuid': 'object',
    'datetime': 'datetime64[ns, UTC]',
    'bool': 'bool[pyarrow]',
    'int': 'Int64',
    'int8': 'Int8',
    'int16': 'Int16',
    'int32': 'Int32',
    'int64': 'Int64',
    'str': 'string[python]',
}
 31
 32
 33def to_pandas_dtype(dtype: str) -> str:
 34    """
 35    Cast a supported Meerschaum dtype to a Pandas dtype.
 36    """
 37    known_dtype = MRSM_PD_DTYPES.get(dtype, None)
 38    if known_dtype is not None:
 39        return known_dtype
 40
 41    ### NOTE: Kind of a hack, but if the first word of the given dtype is in all caps,
 42    ### treat it as a SQL db type.
 43    if dtype.split(' ')[0].isupper():
 44        from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
 45        return get_pd_type_from_db_type(dtype)
 46
 47    from meerschaum.utils.packages import attempt_import
 48    pandas = attempt_import('pandas', lazy=False)
 49
 50    try:
 51        return str(pandas.api.types.pandas_dtype(dtype))
 52    except Exception:
 53        warn(
 54            f"Invalid dtype '{dtype}', will use 'object' instead:\n"
 55            + f"{traceback.format_exc()}",
 56            stack=False,
 57        )
 58    return 'object'
 59
 60
 61def are_dtypes_equal(
 62    ldtype: Union[str, Dict[str, str]],
 63    rdtype: Union[str, Dict[str, str]],
 64) -> bool:
 65    """
 66    Determine whether two dtype strings may be considered
 67    equivalent to avoid unnecessary conversions.
 68
 69    Parameters
 70    ----------
 71    ldtype: Union[str, Dict[str, str]]
 72        The left dtype to compare.
 73        May also provide a dtypes dictionary.
 74
 75    rdtype: Union[str, Dict[str, str]]
 76        The right dtype to compare.
 77        May also provide a dtypes dictionary.
 78
 79    Returns
 80    -------
 81    A `bool` indicating whether the two dtypes are to be considered equivalent.
 82    """
 83    if isinstance(ldtype, dict) and isinstance(rdtype, dict):
 84        lkeys = sorted([str(k) for k in ldtype.keys()])
 85        rkeys = sorted([str(k) for k in rdtype.keys()])
 86        for lkey, rkey in zip(lkeys, rkeys):
 87            if lkey != rkey:
 88                return False
 89            ltype = ldtype[lkey]
 90            rtype = rdtype[rkey]
 91            if not are_dtypes_equal(ltype, rtype):
 92                return False
 93        return True
 94
 95    try:
 96        if ldtype == rdtype:
 97            return True
 98    except Exception as e:
 99        warn(f"Exception when comparing dtypes, returning False:\n{traceback.format_exc()}")
100        return False
101
102    ### Sometimes pandas dtype objects are passed.
103    ldtype = str(ldtype)
104    rdtype = str(rdtype)
105
106    json_dtypes = ('json', 'object')
107    if ldtype in json_dtypes and rdtype in json_dtypes:
108        return True
109
110    numeric_dtypes = ('numeric', 'object')
111    if ldtype in numeric_dtypes and rdtype in numeric_dtypes:
112        return True
113
114    uuid_dtypes = ('uuid', 'object')
115    if ldtype in uuid_dtypes and rdtype in uuid_dtypes:
116        return True
117
118    ldtype_clean = ldtype.split('[', maxsplit=1)[0]
119    rdtype_clean = rdtype.split('[', maxsplit=1)[0]
120
121    if ldtype_clean.lower() == rdtype_clean.lower():
122        return True
123
124    datetime_dtypes = ('datetime', 'timestamp')
125    ldtype_found_dt_prefix = False
126    rdtype_found_dt_prefix = False
127    for dt_prefix in datetime_dtypes:
128        ldtype_found_dt_prefix = (dt_prefix in ldtype.lower()) or ldtype_found_dt_prefix
129        rdtype_found_dt_prefix = (dt_prefix in rdtype.lower()) or rdtype_found_dt_prefix
130    if ldtype_found_dt_prefix and rdtype_found_dt_prefix:
131        return True
132
133    string_dtypes = ('str', 'string', 'object')
134    if ldtype_clean in string_dtypes and rdtype_clean in string_dtypes:
135        return True
136
137    int_dtypes = ('int', 'int64', 'int32', 'int16', 'int8')
138    if ldtype_clean.lower() in int_dtypes and rdtype_clean.lower() in int_dtypes:
139        return True
140
141    float_dtypes = ('float', 'float64', 'float32', 'float16', 'float128', 'double')
142    if ldtype_clean.lower() in float_dtypes and rdtype_clean.lower() in float_dtypes:
143        return True
144
145    bool_dtypes = ('bool', 'boolean')
146    if ldtype_clean in bool_dtypes and rdtype_clean in bool_dtypes:
147        return True
148
149    return False
150
151
def is_dtype_numeric(dtype: str) -> bool:
    """
    Check whether a `dtype` string is compatible with the Meerschaum dtype `numeric`.

    The check is a case-insensitive substring match, so any dtype whose name
    contains `numeric`, `float`, `double`, or `int` is accepted.

    Parameters
    ----------
    dtype: str
        The pandas-like dtype string.

    Returns
    -------
    A bool indicating the dtype is compatible with `numeric`.
    """
    lowered = dtype.lower()
    return any(
        token in lowered
        for token in ('numeric', 'float', 'double', 'int')
    )
174
175
def attempt_cast_to_numeric(value: Any) -> Any:
    """
    Try to convert a value into a `Decimal`.

    `Decimal` inputs are returned as-is, null-like values become `Decimal('NaN')`,
    and any value which cannot be parsed is returned unchanged.
    """
    if isinstance(value, Decimal):
        return value

    try:
        if value_is_null(value):
            return Decimal('NaN')
        return Decimal(str(value))
    except Exception:
        ### Best-effort cast: hand back the original value on failure.
        return value
190
191
def attempt_cast_to_uuid(value: Any) -> Any:
    """
    Try to convert a value into a `uuid.UUID`.

    `uuid.UUID` inputs are returned as-is, null-like values become `None`,
    and any value which cannot be parsed is returned unchanged.
    """
    if isinstance(value, uuid.UUID):
        return value

    try:
        if value_is_null(value):
            return None
        return uuid.UUID(str(value))
    except Exception:
        ### Best-effort cast: hand back the original value on failure.
        return value
206
207
def value_is_null(value: Any) -> bool:
    """
    Check whether a value's string form (case-insensitive) is null-like,
    i.e. one of `none`, `nan`, `na`, `nat`, `<na>`, or the empty string.
    """
    null_strings = ('none', 'nan', 'na', 'nat', '', '<na>')
    lowered = str(value).lower()
    return lowered in null_strings
213
214
def none_if_null(value: Any) -> Any:
    """
    Replace a null-like value (per `value_is_null()`) with `None`;
    any other value is returned unchanged.
    """
    if value_is_null(value):
        return None
    return value
220
221
def quantize_decimal(x: Decimal, scale: int, precision: int) -> Decimal:
    """
    Quantize a given `Decimal` to a known scale and precision.

    Parameters
    ----------
    x: Decimal
        The `Decimal` to be quantized.

    scale: int
        The maximum number of significant digits permitted in the result
        (used as the precision of the quantization context).

    precision: int
        The number of digits to keep after the decimal point.

    Returns
    -------
    A `Decimal` quantized to the specified scale and precision.
    If quantizing raises `InvalidOperation` (e.g. the result would need more
    than `scale` digits), `x` is returned unchanged.
    """
    whole_digits = '1' * scale
    fractional_digits = '1' * precision
    ### Only the exponent of this template matters to `quantize()`.
    template = Decimal(whole_digits + '.' + fractional_digits)
    try:
        limited_context = Context(prec=scale)
        return x.quantize(template, context=limited_context)
    except InvalidOperation:
        return x
246
247
def coerce_timezone(
    dt: Any,
    strip_utc: bool = False,
) -> Any:
    """
    Given a `datetime`, pandas `Timestamp` or `Series` of `Timestamp`,
    return the value in terms of UTC.

    Parameters
    ----------
    dt: Any
        The value to coerce. `None` and `int` values are returned unchanged,
        and strings are first parsed with `dateutil.parser`.

    strip_utc: bool, default False
        If `True`, return naive values (timezone information removed).
        Otherwise return timezone-aware values.

    Returns
    -------
    The coerced datetime or series.
    A datetime series whose awareness already matches `strip_utc`
    is returned as-is, and a naive `datetime` is assumed to already be in UTC.
    """
    if dt is None or isinstance(dt, int):
        return dt

    if isinstance(dt, str):
        dt = mrsm.attempt_import('dateutil.parser').parse(dt)

    ### Anything with both `dtype` and `__module__` is treated as a (pandas or dask) series.
    if hasattr(dt, 'dtype') and hasattr(dt, '__module__'):
        uses_dask = 'dask' in dt.__module__
        pandas = mrsm.attempt_import('pandas', lazy=False)
        dd = mrsm.attempt_import('dask.dataframe') if uses_dask else None

        ### Nothing to do if the series' awareness already matches what was requested.
        if pandas.api.types.is_datetime64_any_dtype(dt):
            is_aware = dt.dt.tz is not None
            if is_aware != bool(strip_utc):
                return dt

        to_datetime = pandas.to_datetime if dd is None else dd.to_datetime
        converted = to_datetime(dt, utc=True, format='ISO8601')
        if strip_utc and converted.dt.tz is not None:
            converted = converted.dt.tz_localize(None)
        return converted

    if dt.tzinfo is None:
        return dt if strip_utc else dt.replace(tzinfo=timezone.utc)

    as_utc = dt.astimezone(timezone.utc)
    return as_utc.replace(tzinfo=None) if strip_utc else as_utc
### Lookup table from Meerschaum dtype names to Pandas dtype strings.
MRSM_PD_DTYPES: Dict[str, str] = {'json': 'object', 'numeric': 'object', 'uuid': 'object', 'datetime': 'datetime64[ns, UTC]', 'bool': 'bool[pyarrow]', 'int': 'Int64', 'int8': 'Int8', 'int16': 'Int16', 'int32': 'Int32', 'int64': 'Int64', 'str': 'string[python]'}
def to_pandas_dtype(dtype: str) -> str:
def to_pandas_dtype(dtype: str) -> str:
    """
    Translate a Meerschaum dtype into the name of a Pandas dtype.

    Parameters
    ----------
    dtype: str
        A key of `MRSM_PD_DTYPES`, a SQL database type (first word in all caps),
        or any other dtype string to be resolved by Pandas.

    Returns
    -------
    The Pandas dtype as a string.
    If Pandas cannot resolve the dtype, a warning is emitted and `'object'` is returned.
    """
    mapped_dtype = MRSM_PD_DTYPES.get(dtype)
    if mapped_dtype is not None:
        return mapped_dtype

    ### NOTE: Heuristic: an all-caps first word means the dtype is a SQL database type.
    first_word = dtype.split(' ')[0]
    if first_word.isupper():
        from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
        return get_pd_type_from_db_type(dtype)

    from meerschaum.utils.packages import attempt_import
    pandas = attempt_import('pandas', lazy=False)

    try:
        resolved_dtype = pandas.api.types.pandas_dtype(dtype)
        return str(resolved_dtype)
    except Exception:
        message = (
            f"Invalid dtype '{dtype}', will use 'object' instead:\n"
            + f"{traceback.format_exc()}"
        )
        warn(message, stack=False)
    return 'object'

Cast a supported Meerschaum dtype to a Pandas dtype.

def are_dtypes_equal( ldtype: Union[str, Dict[str, str]], rdtype: Union[str, Dict[str, str]]) -> bool:
 62def are_dtypes_equal(
 63    ldtype: Union[str, Dict[str, str]],
 64    rdtype: Union[str, Dict[str, str]],
 65) -> bool:
 66    """
 67    Determine whether two dtype strings may be considered
 68    equivalent to avoid unnecessary conversions.
 69
 70    Parameters
 71    ----------
 72    ldtype: Union[str, Dict[str, str]]
 73        The left dtype to compare.
 74        May also provide a dtypes dictionary.
 75
 76    rdtype: Union[str, Dict[str, str]]
 77        The right dtype to compare.
 78        May also provide a dtypes dictionary.
 79
 80    Returns
 81    -------
 82    A `bool` indicating whether the two dtypes are to be considered equivalent.
 83    """
 84    if isinstance(ldtype, dict) and isinstance(rdtype, dict):
 85        lkeys = sorted([str(k) for k in ldtype.keys()])
 86        rkeys = sorted([str(k) for k in rdtype.keys()])
 87        for lkey, rkey in zip(lkeys, rkeys):
 88            if lkey != rkey:
 89                return False
 90            ltype = ldtype[lkey]
 91            rtype = rdtype[rkey]
 92            if not are_dtypes_equal(ltype, rtype):
 93                return False
 94        return True
 95
 96    try:
 97        if ldtype == rdtype:
 98            return True
 99    except Exception as e:
100        warn(f"Exception when comparing dtypes, returning False:\n{traceback.format_exc()}")
101        return False
102
103    ### Sometimes pandas dtype objects are passed.
104    ldtype = str(ldtype)
105    rdtype = str(rdtype)
106
107    json_dtypes = ('json', 'object')
108    if ldtype in json_dtypes and rdtype in json_dtypes:
109        return True
110
111    numeric_dtypes = ('numeric', 'object')
112    if ldtype in numeric_dtypes and rdtype in numeric_dtypes:
113        return True
114
115    uuid_dtypes = ('uuid', 'object')
116    if ldtype in uuid_dtypes and rdtype in uuid_dtypes:
117        return True
118
119    ldtype_clean = ldtype.split('[', maxsplit=1)[0]
120    rdtype_clean = rdtype.split('[', maxsplit=1)[0]
121
122    if ldtype_clean.lower() == rdtype_clean.lower():
123        return True
124
125    datetime_dtypes = ('datetime', 'timestamp')
126    ldtype_found_dt_prefix = False
127    rdtype_found_dt_prefix = False
128    for dt_prefix in datetime_dtypes:
129        ldtype_found_dt_prefix = (dt_prefix in ldtype.lower()) or ldtype_found_dt_prefix
130        rdtype_found_dt_prefix = (dt_prefix in rdtype.lower()) or rdtype_found_dt_prefix
131    if ldtype_found_dt_prefix and rdtype_found_dt_prefix:
132        return True
133
134    string_dtypes = ('str', 'string', 'object')
135    if ldtype_clean in string_dtypes and rdtype_clean in string_dtypes:
136        return True
137
138    int_dtypes = ('int', 'int64', 'int32', 'int16', 'int8')
139    if ldtype_clean.lower() in int_dtypes and rdtype_clean.lower() in int_dtypes:
140        return True
141
142    float_dtypes = ('float', 'float64', 'float32', 'float16', 'float128', 'double')
143    if ldtype_clean.lower() in float_dtypes and rdtype_clean.lower() in float_dtypes:
144        return True
145
146    bool_dtypes = ('bool', 'boolean')
147    if ldtype_clean in bool_dtypes and rdtype_clean in bool_dtypes:
148        return True
149
150    return False

Determine whether two dtype strings may be considered equivalent to avoid unnecessary conversions.

Parameters
  • ldtype (Union[str, Dict[str, str]]): The left dtype to compare. May also provide a dtypes dictionary.
  • rdtype (Union[str, Dict[str, str]]): The right dtype to compare. May also provide a dtypes dictionary.
Returns
  • A bool indicating whether the two dtypes are to be considered equivalent.
def is_dtype_numeric(dtype: str) -> bool:
def is_dtype_numeric(dtype: str) -> bool:
    """
    Check whether a `dtype` string is compatible with the Meerschaum dtype `numeric`.

    The check is a case-insensitive substring match, so any dtype whose name
    contains `numeric`, `float`, `double`, or `int` is accepted.

    Parameters
    ----------
    dtype: str
        The pandas-like dtype string.

    Returns
    -------
    A bool indicating the dtype is compatible with `numeric`.
    """
    lowered = dtype.lower()
    return any(
        token in lowered
        for token in ('numeric', 'float', 'double', 'int')
    )

Determine whether a given dtype string should be considered compatible with the Meerschaum dtype numeric.

Parameters
  • dtype (str): The pandas-like dtype string.
Returns
  • A bool indicating the dtype is compatible with numeric.
def attempt_cast_to_numeric(value: Any) -> Any:
def attempt_cast_to_numeric(value: Any) -> Any:
    """
    Try to convert a value into a `Decimal`.

    `Decimal` inputs are returned as-is, null-like values become `Decimal('NaN')`,
    and any value which cannot be parsed is returned unchanged.
    """
    if isinstance(value, Decimal):
        return value

    try:
        if value_is_null(value):
            return Decimal('NaN')
        return Decimal(str(value))
    except Exception:
        ### Best-effort cast: hand back the original value on failure.
        return value

Given a value, attempt to coerce it into a numeric (Decimal).

def attempt_cast_to_uuid(value: Any) -> Any:
def attempt_cast_to_uuid(value: Any) -> Any:
    """
    Try to convert a value into a `uuid.UUID`.

    `uuid.UUID` inputs are returned as-is, null-like values become `None`,
    and any value which cannot be parsed is returned unchanged.
    """
    if isinstance(value, uuid.UUID):
        return value

    try:
        if value_is_null(value):
            return None
        return uuid.UUID(str(value))
    except Exception:
        ### Best-effort cast: hand back the original value on failure.
        return value

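Given a value, attempt to coerce it into a UUID (a uuid.UUID object). Null-like values become None, and values which cannot be parsed are returned unchanged.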

def value_is_null(value: Any) -> bool:
def value_is_null(value: Any) -> bool:
    """
    Check whether a value's string form (case-insensitive) is null-like,
    i.e. one of `none`, `nan`, `na`, `nat`, `<na>`, or the empty string.
    """
    null_strings = ('none', 'nan', 'na', 'nat', '', '<na>')
    lowered = str(value).lower()
    return lowered in null_strings

Determine if a value is a null-like string.

def none_if_null(value: Any) -> Any:
def none_if_null(value: Any) -> Any:
    """
    Replace a null-like value (per `value_is_null()`) with `None`;
    any other value is returned unchanged.
    """
    if value_is_null(value):
        return None
    return value

Return None if a value is a null-like string.

def quantize_decimal(x: decimal.Decimal, scale: int, precision: int) -> decimal.Decimal:
def quantize_decimal(x: Decimal, scale: int, precision: int) -> Decimal:
    """
    Quantize a given `Decimal` to a known scale and precision.

    Parameters
    ----------
    x: Decimal
        The `Decimal` to be quantized.

    scale: int
        The maximum number of significant digits permitted in the result
        (used as the precision of the quantization context).

    precision: int
        The number of digits to keep after the decimal point.

    Returns
    -------
    A `Decimal` quantized to the specified scale and precision.
    If quantizing raises `InvalidOperation` (e.g. the result would need more
    than `scale` digits), `x` is returned unchanged.
    """
    whole_digits = '1' * scale
    fractional_digits = '1' * precision
    ### Only the exponent of this template matters to `quantize()`.
    template = Decimal(whole_digits + '.' + fractional_digits)
    try:
        limited_context = Context(prec=scale)
        return x.quantize(template, context=limited_context)
    except InvalidOperation:
        return x

Quantize a given Decimal to a known scale and precision.

Parameters
  • x (Decimal): The Decimal to be quantized.
  • scale (int): The total number of significant digits.
  • precision (int): The number of significant digits after the decimal point.
Returns
  • A Decimal quantized to the specified scale and precision.
def coerce_timezone(dt: Any, strip_utc: bool = False) -> Any:
def coerce_timezone(
    dt: Any,
    strip_utc: bool = False,
) -> Any:
    """
    Given a `datetime`, pandas `Timestamp` or `Series` of `Timestamp`,
    return the value in terms of UTC.

    Parameters
    ----------
    dt: Any
        The value to coerce. `None` and `int` values are returned unchanged,
        and strings are first parsed with `dateutil.parser`.

    strip_utc: bool, default False
        If `True`, return naive values (timezone information removed).
        Otherwise return timezone-aware values.

    Returns
    -------
    The coerced datetime or series.
    A datetime series whose awareness already matches `strip_utc`
    is returned as-is, and a naive `datetime` is assumed to already be in UTC.
    """
    if dt is None or isinstance(dt, int):
        return dt

    if isinstance(dt, str):
        dt = mrsm.attempt_import('dateutil.parser').parse(dt)

    ### Anything with both `dtype` and `__module__` is treated as a (pandas or dask) series.
    if hasattr(dt, 'dtype') and hasattr(dt, '__module__'):
        uses_dask = 'dask' in dt.__module__
        pandas = mrsm.attempt_import('pandas', lazy=False)
        dd = mrsm.attempt_import('dask.dataframe') if uses_dask else None

        ### Nothing to do if the series' awareness already matches what was requested.
        if pandas.api.types.is_datetime64_any_dtype(dt):
            is_aware = dt.dt.tz is not None
            if is_aware != bool(strip_utc):
                return dt

        to_datetime = pandas.to_datetime if dd is None else dd.to_datetime
        converted = to_datetime(dt, utc=True, format='ISO8601')
        if strip_utc and converted.dt.tz is not None:
            converted = converted.dt.tz_localize(None)
        return converted

    if dt.tzinfo is None:
        return dt if strip_utc else dt.replace(tzinfo=timezone.utc)

    as_utc = dt.astimezone(timezone.utc)
    return as_utc.replace(tzinfo=None) if strip_utc else as_utc

Given a datetime, pandas Timestamp or Series of Timestamp, return it in terms of UTC: timezone-aware by default, or naive when strip_utc is True.