meerschaum.utils.dtypes
Utility functions for working with data types.
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# vim:fenc=utf-8

"""
Utility functions for working with data types.
"""

import traceback
import uuid
from datetime import timezone
from decimal import Decimal, Context, InvalidOperation

import meerschaum as mrsm
from meerschaum.utils.typing import Dict, Union, Any
from meerschaum.utils.warnings import warn

### Maps each supported Meerschaum dtype name to the Pandas dtype string used for it.
MRSM_PD_DTYPES: Dict[str, str] = {
    'json': 'object',
    'numeric': 'object',
    'uuid': 'object',
    'datetime': 'datetime64[ns, UTC]',
    'bool': 'bool[pyarrow]',
    'int': 'Int64',
    'int8': 'Int8',
    'int16': 'Int16',
    'int32': 'Int32',
    'int64': 'Int64',
    'str': 'string[python]',
}


def to_pandas_dtype(dtype: str) -> str:
    """
    Cast a supported Meerschaum dtype to a Pandas dtype.

    Parameters
    ----------
    dtype: str
        A Meerschaum dtype (a key of `MRSM_PD_DTYPES`), a SQL database type
        (first word in all caps), or a Pandas dtype string.

    Returns
    -------
    The corresponding Pandas dtype string, or `'object'` (with a warning)
    if Pandas does not recognize `dtype`.
    """
    known_dtype = MRSM_PD_DTYPES.get(dtype, None)
    if known_dtype is not None:
        return known_dtype

    ### NOTE: Kind of a hack, but if the first word of the given dtype is in all caps,
    ### treat it as a SQL db type.
    if dtype.split(' ')[0].isupper():
        from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
        return get_pd_type_from_db_type(dtype)

    from meerschaum.utils.packages import attempt_import
    pandas = attempt_import('pandas', lazy=False)

    try:
        return str(pandas.api.types.pandas_dtype(dtype))
    except Exception:
        warn(
            f"Invalid dtype '{dtype}', will use 'object' instead:\n"
            + f"{traceback.format_exc()}",
            stack=False,
        )
    return 'object'


def are_dtypes_equal(
    ldtype: Union[str, Dict[str, str]],
    rdtype: Union[str, Dict[str, str]],
) -> bool:
    """
    Determine whether two dtype strings may be considered
    equivalent to avoid unnecessary conversions.

    Parameters
    ----------
    ldtype: Union[str, Dict[str, str]]
        The left dtype to compare.
        May also provide a dtypes dictionary.

    rdtype: Union[str, Dict[str, str]]
        The right dtype to compare.
        May also provide a dtypes dictionary.

    Returns
    -------
    A `bool` indicating whether the two dtypes are to be considered equivalent.
    Two dictionaries are equivalent only if they have the same (stringified) keys
    and every pair of corresponding dtypes is equivalent.
    """
    if isinstance(ldtype, dict) and isinstance(rdtype, dict):
        ### Key by the stringified column name so non-string keys may still be looked up.
        ltypes = {str(key): val for key, val in ldtype.items()}
        rtypes = {str(key): val for key, val in rdtype.items()}

        ### NOTE: Compare the complete key sets. Zipping the sorted keys would
        ### silently ignore the extra columns of the longer dictionary.
        if ltypes.keys() != rtypes.keys():
            return False

        return all(
            are_dtypes_equal(ltype, rtypes[key])
            for key, ltype in ltypes.items()
        )

    try:
        if ldtype == rdtype:
            return True
    except Exception:
        warn(f"Exception when comparing dtypes, returning False:\n{traceback.format_exc()}")
        return False

    ### Sometimes pandas dtype objects are passed.
    ldtype = str(ldtype)
    rdtype = str(rdtype)

    ### JSON, numeric, and UUID values are all stored as 'object'.
    json_dtypes = ('json', 'object')
    if ldtype in json_dtypes and rdtype in json_dtypes:
        return True

    numeric_dtypes = ('numeric', 'object')
    if ldtype in numeric_dtypes and rdtype in numeric_dtypes:
        return True

    uuid_dtypes = ('uuid', 'object')
    if ldtype in uuid_dtypes and rdtype in uuid_dtypes:
        return True

    ### Drop bracketed parameters, e.g. 'datetime64[ns, UTC]' -> 'datetime64'.
    ldtype_clean = ldtype.split('[', maxsplit=1)[0]
    rdtype_clean = rdtype.split('[', maxsplit=1)[0]

    if ldtype_clean.lower() == rdtype_clean.lower():
        return True

    datetime_dtypes = ('datetime', 'timestamp')
    ldtype_found_dt_prefix = False
    rdtype_found_dt_prefix = False
    for dt_prefix in datetime_dtypes:
        ldtype_found_dt_prefix = (dt_prefix in ldtype.lower()) or ldtype_found_dt_prefix
        rdtype_found_dt_prefix = (dt_prefix in rdtype.lower()) or rdtype_found_dt_prefix
    if ldtype_found_dt_prefix and rdtype_found_dt_prefix:
        return True

    string_dtypes = ('str', 'string', 'object')
    if ldtype_clean in string_dtypes and rdtype_clean in string_dtypes:
        return True

    int_dtypes = ('int', 'int64', 'int32', 'int16', 'int8')
    if ldtype_clean.lower() in int_dtypes and rdtype_clean.lower() in int_dtypes:
        return True

    float_dtypes = ('float', 'float64', 'float32', 'float16', 'float128', 'double')
    if ldtype_clean.lower() in float_dtypes and rdtype_clean.lower() in float_dtypes:
        return True

    bool_dtypes = ('bool', 'boolean')
    if ldtype_clean in bool_dtypes and rdtype_clean in bool_dtypes:
        return True

    return False


def is_dtype_numeric(dtype: str) -> bool:
    """
    Determine whether a given `dtype` string
    should be considered compatible with the Meerschaum dtype `numeric`.

    Parameters
    ----------
    dtype: str
        The pandas-like dtype string.

    Returns
    -------
    A bool indicating the dtype is compatible with `numeric`.
    """
    dtype_lower = dtype.lower()

    ### A case-insensitive substring match is all that is required.
    acceptable_substrings = ('numeric', 'float', 'double', 'int')
    for substring in acceptable_substrings:
        if substring in dtype_lower:
            return True

    return False


def attempt_cast_to_numeric(value: Any) -> Any:
    """
    Given a value, attempt to coerce it into a numeric (Decimal).

    Null-like values (see `value_is_null()`) become `Decimal('NaN')`.
    If the value cannot be parsed, it is returned unchanged.
    """
    if isinstance(value, Decimal):
        return value
    try:
        return (
            Decimal(str(value))
            if not value_is_null(value)
            else Decimal('NaN')
        )
    except Exception:
        ### Best effort: leave unparsable values as they are.
        return value


def attempt_cast_to_uuid(value: Any) -> Any:
    """
    Given a value, attempt to coerce it into a `uuid.UUID`.

    Null-like values (see `value_is_null()`) become `None`.
    If the value cannot be parsed, it is returned unchanged.
    """
    if isinstance(value, uuid.UUID):
        return value
    try:
        return (
            uuid.UUID(str(value))
            if not value_is_null(value)
            else None
        )
    except Exception:
        ### Best effort: leave unparsable values as they are.
        return value


def value_is_null(value: Any) -> bool:
    """
    Determine if a value's lowercased string form is null-like
    (`'none'`, `'nan'`, `'na'`, `'nat'`, `'<na>'`, or the empty string).
    """
    return str(value).lower() in ('none', 'nan', 'na', 'nat', '', '<na>')


def none_if_null(value: Any) -> Any:
    """
    Return `None` if a value is a null-like string.
    Otherwise return the value unchanged.
    """
    return (None if value_is_null(value) else value)


def quantize_decimal(x: Decimal, scale: int, precision: int) -> Decimal:
    """
    Quantize a given `Decimal` to a known scale and precision.

    Parameters
    ----------
    x: Decimal
        The `Decimal` to be quantized.

    scale: int
        The total number of significant digits.

    precision: int
        The number of significant digits after the decimal point.

    Returns
    -------
    A `Decimal` quantized to the specified scale and precision,
    or `x` unchanged if quantizing raises `InvalidOperation`.
    """
    ### Only the exponent of this template matters to `quantize()`.
    precision_decimal = Decimal((('1' * scale) + '.' + ('1' * precision)))
    try:
        return x.quantize(precision_decimal, context=Context(prec=scale))
    except InvalidOperation:
        return x


def coerce_timezone(
    dt: Any,
    strip_utc: bool = False,
) -> Any:
    """
    Given a `datetime`, pandas `Timestamp` or `Series` of `Timestamp`,
    return the value in terms of UTC.

    Parameters
    ----------
    dt: Any
        The value to coerce. `None` and integers are returned unchanged,
        and strings are parsed with `dateutil`.

    strip_utc: bool, default False
        If `True`, return naive values (no `tzinfo`).
        Otherwise return timezone-aware values.

    Returns
    -------
    The coerced `datetime` or series.
    """
    if dt is None:
        return None

    if isinstance(dt, int):
        return dt

    if isinstance(dt, str):
        dateutil_parser = mrsm.attempt_import('dateutil.parser')
        dt = dateutil_parser.parse(dt)

    ### Duck-type check for Pandas / Dask series.
    dt_is_series = hasattr(dt, 'dtype') and hasattr(dt, '__module__')

    if dt_is_series:
        is_dask = 'dask' in dt.__module__
        pandas = mrsm.attempt_import('pandas', lazy=False)
        dd = mrsm.attempt_import('dask.dataframe') if is_dask else None

        ### Nothing to do if the series' awareness already matches `strip_utc`.
        if (
            pandas.api.types.is_datetime64_any_dtype(dt) and (
                (dt.dt.tz is not None and not strip_utc)
                or
                (dt.dt.tz is None and strip_utc)
            )
        ):
            return dt

        dt_series = (
            pandas.to_datetime(dt, utc=True, format='ISO8601')
            if dd is None
            else dd.to_datetime(dt, utc=True, format='ISO8601')
        )
        if strip_utc:
            if dt_series.dt.tz is not None:
                dt_series = dt_series.dt.tz_localize(None)

        return dt_series

    ### Naive datetimes are taken to already be in UTC.
    if dt.tzinfo is None:
        if strip_utc:
            return dt
        return dt.replace(tzinfo=timezone.utc)

    utc_dt = dt.astimezone(timezone.utc)
    if strip_utc:
        return utc_dt.replace(tzinfo=None)
    return utc_dt
### Maps each supported Meerschaum dtype name to the Pandas dtype string used for it.
MRSM_PD_DTYPES: Dict[str, str] = {'json': 'object', 'numeric': 'object', 'uuid': 'object', 'datetime': 'datetime64[ns, UTC]', 'bool': 'bool[pyarrow]', 'int': 'Int64', 'int8': 'Int8', 'int16': 'Int16', 'int32': 'Int32', 'int64': 'Int64', 'str': 'string[python]'}
def
to_pandas_dtype(dtype: str) -> str:
def to_pandas_dtype(dtype: str) -> str:
    """
    Translate a dtype string into the Pandas dtype string used to store it.

    Parameters
    ----------
    dtype: str
        A Meerschaum dtype (a key of `MRSM_PD_DTYPES`), a SQL database type
        (recognized by its first word being in all caps), or a Pandas dtype string.

    Returns
    -------
    The matching Pandas dtype string.
    Falls back to `'object'` (and emits a warning) when Pandas rejects `dtype`.
    """
    mapped_dtype = MRSM_PD_DTYPES.get(dtype)
    if mapped_dtype is not None:
        return mapped_dtype

    ### NOTE: An all-caps first word is the (admittedly hacky) signal for a SQL db type.
    leading_word = dtype.split(' ')[0]
    if leading_word.isupper():
        from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
        return get_pd_type_from_db_type(dtype)

    from meerschaum.utils.packages import attempt_import
    pandas = attempt_import('pandas', lazy=False)

    try:
        pd_dtype = pandas.api.types.pandas_dtype(dtype)
        return str(pd_dtype)
    except Exception:
        message = (
            f"Invalid dtype '{dtype}', will use 'object' instead:\n"
            + f"{traceback.format_exc()}"
        )
        warn(message, stack=False)

    return 'object'
Cast a supported Meerschaum dtype to a Pandas dtype.
def
are_dtypes_equal( ldtype: Union[str, Dict[str, str]], rdtype: Union[str, Dict[str, str]]) -> bool:
def are_dtypes_equal(
    ldtype: Union[str, Dict[str, str]],
    rdtype: Union[str, Dict[str, str]],
) -> bool:
    """
    Determine whether two dtype strings may be considered
    equivalent to avoid unnecessary conversions.

    Parameters
    ----------
    ldtype: Union[str, Dict[str, str]]
        The left dtype to compare.
        May also provide a dtypes dictionary.

    rdtype: Union[str, Dict[str, str]]
        The right dtype to compare.
        May also provide a dtypes dictionary.

    Returns
    -------
    A `bool` indicating whether the two dtypes are to be considered equivalent.
    Two dictionaries are equivalent only if they have the same (stringified) keys
    and every pair of corresponding dtypes is equivalent.
    """
    if isinstance(ldtype, dict) and isinstance(rdtype, dict):
        ### Key by the stringified column name so non-string keys may still be looked up.
        ltypes = {str(key): val for key, val in ldtype.items()}
        rtypes = {str(key): val for key, val in rdtype.items()}

        ### NOTE: Compare the complete key sets. Zipping the sorted keys would
        ### silently ignore the extra columns of the longer dictionary.
        if ltypes.keys() != rtypes.keys():
            return False

        return all(
            are_dtypes_equal(ltype, rtypes[key])
            for key, ltype in ltypes.items()
        )

    try:
        if ldtype == rdtype:
            return True
    except Exception:
        warn(f"Exception when comparing dtypes, returning False:\n{traceback.format_exc()}")
        return False

    ### Sometimes pandas dtype objects are passed.
    ldtype = str(ldtype)
    rdtype = str(rdtype)

    ### JSON, numeric, and UUID values are all stored as 'object'.
    json_dtypes = ('json', 'object')
    if ldtype in json_dtypes and rdtype in json_dtypes:
        return True

    numeric_dtypes = ('numeric', 'object')
    if ldtype in numeric_dtypes and rdtype in numeric_dtypes:
        return True

    uuid_dtypes = ('uuid', 'object')
    if ldtype in uuid_dtypes and rdtype in uuid_dtypes:
        return True

    ### Drop bracketed parameters, e.g. 'datetime64[ns, UTC]' -> 'datetime64'.
    ldtype_clean = ldtype.split('[', maxsplit=1)[0]
    rdtype_clean = rdtype.split('[', maxsplit=1)[0]

    if ldtype_clean.lower() == rdtype_clean.lower():
        return True

    ### Any two datetime / timestamp flavors are treated as interchangeable.
    datetime_dtypes = ('datetime', 'timestamp')
    ldtype_lower, rdtype_lower = ldtype.lower(), rdtype.lower()
    ldtype_found_dt_prefix = any(dt_prefix in ldtype_lower for dt_prefix in datetime_dtypes)
    rdtype_found_dt_prefix = any(dt_prefix in rdtype_lower for dt_prefix in datetime_dtypes)
    if ldtype_found_dt_prefix and rdtype_found_dt_prefix:
        return True

    string_dtypes = ('str', 'string', 'object')
    if ldtype_clean in string_dtypes and rdtype_clean in string_dtypes:
        return True

    int_dtypes = ('int', 'int64', 'int32', 'int16', 'int8')
    if ldtype_clean.lower() in int_dtypes and rdtype_clean.lower() in int_dtypes:
        return True

    float_dtypes = ('float', 'float64', 'float32', 'float16', 'float128', 'double')
    if ldtype_clean.lower() in float_dtypes and rdtype_clean.lower() in float_dtypes:
        return True

    bool_dtypes = ('bool', 'boolean')
    if ldtype_clean in bool_dtypes and rdtype_clean in bool_dtypes:
        return True

    return False
Determine whether two dtype strings may be considered equivalent to avoid unnecessary conversions.
Parameters
- ldtype (Union[str, Dict[str, str]]): The left dtype to compare. May also provide a dtypes dictionary.
- rdtype (Union[str, Dict[str, str]]): The right dtype to compare. May also provide a dtypes dictionary.
Returns
- A `bool` indicating whether the two dtypes are to be considered equivalent.
def
is_dtype_numeric(dtype: str) -> bool:
def is_dtype_numeric(dtype: str) -> bool:
    """
    Check whether a `dtype` string is compatible with the Meerschaum dtype `numeric`.

    Parameters
    ----------
    dtype: str
        The pandas-like dtype string.

    Returns
    -------
    `True` if the lowercased dtype contains any of
    `'numeric'`, `'float'`, `'double'`, or `'int'`, otherwise `False`.
    """
    lowered = dtype.lower()
    return any(
        token in lowered
        for token in ('numeric', 'float', 'double', 'int')
    )
Determine whether a given `dtype` string should be considered compatible with the Meerschaum dtype `numeric`.
Parameters
- dtype (str): The pandas-like dtype string.
Returns
- A bool indicating the dtype is compatible with `numeric`.
def
attempt_cast_to_numeric(value: Any) -> Any:
def attempt_cast_to_numeric(value: Any) -> Any:
    """
    Try to convert a value into a `Decimal`.

    Existing `Decimal` objects are returned as-is, and null-like values
    (per `value_is_null()`) become `Decimal('NaN')`.
    If the conversion fails, the original value is handed back unchanged.
    """
    if isinstance(value, Decimal):
        return value

    try:
        if value_is_null(value):
            return Decimal('NaN')
        return Decimal(str(value))
    except Exception:
        ### Best effort: leave unparsable values alone.
        return value
Given a value, attempt to coerce it into a numeric (Decimal).
def
attempt_cast_to_uuid(value: Any) -> Any:
def attempt_cast_to_uuid(value: Any) -> Any:
    """
    Try to convert a value into a `uuid.UUID`.

    Existing `uuid.UUID` objects are returned as-is, and null-like values
    (per `value_is_null()`) become `None`.
    If the conversion fails, the original value is handed back unchanged.
    """
    if isinstance(value, uuid.UUID):
        return value

    try:
        if value_is_null(value):
            return None
        return uuid.UUID(str(value))
    except Exception:
        ### Best effort: leave unparsable values alone.
        return value
Given a value, attempt to coerce it into a UUID (`uuid.UUID`).
def
value_is_null(value: Any) -> bool:
def value_is_null(value: Any) -> bool:
    """
    Report whether a value's string form reads as a null.

    The value is stringified and lowercased, then compared against
    `'none'`, `'nan'`, `'na'`, `'nat'`, `'<na>'`, and the empty string.
    """
    null_tokens = ('none', 'nan', 'na', 'nat', '', '<na>')
    normalized = str(value).lower()
    return normalized in null_tokens
Determine if a value is a null-like string.
def
none_if_null(value: Any) -> Any:
def none_if_null(value: Any) -> Any:
    """
    Swap null-like values (per `value_is_null()`) for `None`.
    Any other value is passed through unchanged.
    """
    if value_is_null(value):
        return None
    return value
Return `None` if a value is a null-like string.
def
quantize_decimal(x: decimal.Decimal, scale: int, precision: int) -> decimal.Decimal:
def quantize_decimal(x: Decimal, scale: int, precision: int) -> Decimal:
    """
    Round a `Decimal` to a fixed number of total and fractional digits.

    Parameters
    ----------
    x: Decimal
        The `Decimal` to be quantized.

    scale: int
        The total number of significant digits
        (used as the `prec` of the quantizing context).

    precision: int
        The number of significant digits after the decimal point.

    Returns
    -------
    The quantized `Decimal`, or `x` itself if quantizing raises
    `InvalidOperation` (e.g. the result would not fit within `scale` digits).
    """
    ### Build a template such as '11111.11'; `quantize()` only uses its exponent.
    whole_digits = '1' * scale
    fractional_digits = '1' * precision
    template = Decimal(whole_digits + '.' + fractional_digits)
    quantize_context = Context(prec=scale)

    try:
        quantized = x.quantize(template, context=quantize_context)
    except InvalidOperation:
        return x
    return quantized
Quantize a given `Decimal` to a known scale and precision.
Parameters
- x (Decimal): The `Decimal` to be quantized.
- scale (int): The total number of significant digits.
- precision (int): The number of significant digits after the decimal point.
Returns
- A `Decimal` quantized to the specified scale and precision.
def
coerce_timezone(dt: Any, strip_utc: bool = False) -> Any:
def coerce_timezone(
    dt: Any,
    strip_utc: bool = False,
) -> Any:
    """
    Normalize a `datetime`, pandas `Timestamp`, or `Series` of `Timestamp` to UTC.

    Parameters
    ----------
    dt: Any
        The value to coerce. `None` and integers are returned unchanged,
        and strings are first parsed with `dateutil`.
        Objects with both `dtype` and `__module__` attributes are handled as series.

    strip_utc: bool, default False
        If `True`, return naive values (no `tzinfo`).
        Otherwise return timezone-aware values.

    Returns
    -------
    The coerced `datetime` or series.
    """
    if dt is None:
        return None

    if isinstance(dt, int):
        return dt

    if isinstance(dt, str):
        parser = mrsm.attempt_import('dateutil.parser')
        dt = parser.parse(dt)

    looks_like_series = hasattr(dt, 'dtype') and hasattr(dt, '__module__')

    if not looks_like_series:
        ### Naive datetimes are taken to already be in UTC.
        if dt.tzinfo is None:
            return dt if strip_utc else dt.replace(tzinfo=timezone.utc)

        as_utc = dt.astimezone(timezone.utc)
        return as_utc.replace(tzinfo=None) if strip_utc else as_utc

    uses_dask = 'dask' in dt.__module__
    pandas = mrsm.attempt_import('pandas', lazy=False)
    dd = mrsm.attempt_import('dask.dataframe') if uses_dask else None

    ### Nothing to do if the series' awareness already matches `strip_utc`.
    if (
        pandas.api.types.is_datetime64_any_dtype(dt)
        and (dt.dt.tz is None) == bool(strip_utc)
    ):
        return dt

    to_datetime = pandas.to_datetime if dd is None else dd.to_datetime
    converted = to_datetime(dt, utc=True, format='ISO8601')

    if strip_utc and converted.dt.tz is not None:
        converted = converted.dt.tz_localize(None)

    return converted
Given a `datetime`, pandas `Timestamp` or `Series` of `Timestamp`, return a datetime in terms of UTC (timezone-aware by default, or naive if `strip_utc` is `True`).