meerschaum.utils.dtypes
Utility functions for working with data types.
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# vim:fenc=utf-8

"""
Utility functions for working with data types.
"""

import traceback
from decimal import Decimal, Context, InvalidOperation
from meerschaum.utils.typing import Dict, Union, Any
from meerschaum.utils.warnings import warn

### Meerschaum dtype names mapped to the Pandas dtype strings returned by `to_pandas_dtype()`.
MRSM_PD_DTYPES: Dict[str, str] = {
    'json': 'object',
    'numeric': 'object',
    'datetime': 'datetime64[ns]',
    'bool': 'bool[pyarrow]',
    'int': 'Int64',
    'int8': 'Int8',
    'int16': 'Int16',
    'int32': 'Int32',
    'int64': 'Int64',
    'str': 'string[python]',
}


def to_pandas_dtype(dtype: str) -> str:
    """
    Cast a supported Meerschaum dtype to a Pandas dtype.

    Parameters
    ----------
    dtype: str
        A Meerschaum dtype (a key of `MRSM_PD_DTYPES`), a database type
        whose first word is upper-case, or a string Pandas can parse.

    Returns
    -------
    The Pandas dtype string, or `'object'` (after a warning) if `dtype`
    cannot be resolved.
    """
    known_dtype = MRSM_PD_DTYPES.get(dtype, None)
    if known_dtype is not None:
        return known_dtype

    ### NOTE: Kind of a hack, but if the first word of the given dtype is in all caps,
    ### treat it as a SQL db type.
    if dtype.split(' ')[0].isupper():
        from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
        return get_pd_type_from_db_type(dtype)

    from meerschaum.utils.packages import attempt_import
    pandas = attempt_import('pandas', lazy=False)

    try:
        return str(pandas.api.types.pandas_dtype(dtype))
    except Exception:
        warn(
            f"Invalid dtype '{dtype}', will use 'object' instead:\n"
            + f"{traceback.format_exc()}",
            stack = False,
        )

    return 'object'


def are_dtypes_equal(
        ldtype: Union[str, Dict[str, str]],
        rdtype: Union[str, Dict[str, str]],
    ) -> bool:
    """
    Determine whether two dtype strings may be considered
    equivalent to avoid unnecessary conversions.

    Parameters
    ----------
    ldtype: Union[str, Dict[str, str]]
        The left dtype to compare.
        May also provide a dtypes dictionary.

    rdtype: Union[str, Dict[str, str]]
        The right dtype to compare.
        May also provide a dtypes dictionary.

    Returns
    -------
    A `bool` indicating whether the two dtypes are to be considered equivalent.
    When two dictionaries are given, they are equivalent only if they have
    the same (stringified) keys and every pair of values is equivalent.
    """
    if isinstance(ldtype, dict) and isinstance(rdtype, dict):
        ### Index by stringified key so that non-string keys may still be looked up.
        ltypes = {str(key): val for key, val in ldtype.items()}
        rtypes = {str(key): val for key, val in rdtype.items()}

        ### The key sets must match exactly
        ### (pairing sorted keys with `zip` would ignore the extra keys of the longer dict).
        if ltypes.keys() != rtypes.keys():
            return False

        return all(
            are_dtypes_equal(ltype, rtypes[key])
            for key, ltype in ltypes.items()
        )

    try:
        if ldtype == rdtype:
            return True
    except Exception:
        warn(f"Exception when comparing dtypes, returning False:\n{traceback.format_exc()}")
        return False

    ### Sometimes pandas dtype objects are passed.
    ldtype = str(ldtype)
    rdtype = str(rdtype)

    json_dtypes = ('json', 'object')
    if ldtype in json_dtypes and rdtype in json_dtypes:
        return True

    numeric_dtypes = ('numeric', 'object')
    if ldtype in numeric_dtypes and rdtype in numeric_dtypes:
        return True

    ### Drop bracketed suffixes, e.g. 'string[python]' -> 'string'.
    ldtype_clean = ldtype.split('[')[0]
    rdtype_clean = rdtype.split('[')[0]

    if ldtype_clean.lower() == rdtype_clean.lower():
        return True

    datetime_dtypes = ('datetime', 'timestamp')
    ldtype_is_dt = any(dt_prefix in ldtype.lower() for dt_prefix in datetime_dtypes)
    rdtype_is_dt = any(dt_prefix in rdtype.lower() for dt_prefix in datetime_dtypes)
    if ldtype_is_dt and rdtype_is_dt:
        return True

    string_dtypes = ('str', 'string', 'object')
    if ldtype_clean in string_dtypes and rdtype_clean in string_dtypes:
        return True

    int_dtypes = ('int', 'int64', 'int32', 'int16', 'int8')
    if ldtype_clean.lower() in int_dtypes and rdtype_clean.lower() in int_dtypes:
        return True

    float_dtypes = ('float', 'float64', 'float32', 'float16', 'float128', 'double')
    if ldtype_clean.lower() in float_dtypes and rdtype_clean.lower() in float_dtypes:
        return True

    bool_dtypes = ('bool', 'boolean')
    if ldtype_clean in bool_dtypes and rdtype_clean in bool_dtypes:
        return True

    return False


def is_dtype_numeric(dtype: str) -> bool:
    """
    Determine whether a given `dtype` string
    should be considered compatible with the Meerschaum dtype `numeric`.

    Parameters
    ----------
    dtype: str
        The pandas-like dtype string.

    Returns
    -------
    A bool indicating the dtype is compatible with `numeric`.
    """
    dtype_lower = dtype.lower()

    ### NOTE: This is a case-insensitive substring match.
    acceptable_substrings = ('numeric', 'float', 'double', 'int')
    return any(substring in dtype_lower for substring in acceptable_substrings)


def attempt_cast_to_numeric(value: Any) -> Any:
    """
    Given a value, attempt to coerce it into a numeric (Decimal).

    Null-like values (see `value_is_null()`) become `Decimal('NaN')`.
    If the coercion fails, the value is returned unchanged.
    """
    if isinstance(value, Decimal):
        return value
    try:
        return (
            Decimal(str(value))
            if not value_is_null(value)
            else Decimal('NaN')
        )
    except Exception:
        return value


def value_is_null(value: Any) -> bool:
    """
    Determine if a value is a null-like string.

    The value is stringified and lower-cased before being compared.
    """
    return str(value).lower() in ('none', 'nan', 'na', 'nat', '', '<na>')


def none_if_null(value: Any) -> Any:
    """
    Return `None` if a value is a null-like string.
    Otherwise return the value unchanged.
    """
    return (None if value_is_null(value) else value)


def quantize_decimal(x: Decimal, scale: int, precision: int) -> Decimal:
    """
    Quantize a given `Decimal` to a known scale and precision.

    Parameters
    ----------
    x: Decimal
        The `Decimal` to be quantized.

    scale: int
        The total number of significant digits.

    precision: int
        The number of significant digits after the decimal point.

    Returns
    -------
    A `Decimal` quantized to the specified scale and precision.
    If quantizing raises `InvalidOperation`, `x` is returned unchanged.
    """
    ### Only the exponent of this template (i.e. `precision` decimal places) affects the result.
    precision_decimal = Decimal((('1' * scale) + '.' + ('1' * precision)))
    try:
        return x.quantize(precision_decimal, context=Context(prec=scale))
    except InvalidOperation:
        return x
### Meerschaum dtype names mapped to their Pandas dtype strings.
MRSM_PD_DTYPES: Dict[str, str] = {
    'json': 'object',
    'numeric': 'object',
    'datetime': 'datetime64[ns]',
    'bool': 'bool[pyarrow]',
    'int': 'Int64',
    'int8': 'Int8',
    'int16': 'Int16',
    'int32': 'Int32',
    'int64': 'Int64',
    'str': 'string[python]',
}
def
to_pandas_dtype(dtype: str) -> str:
def to_pandas_dtype(dtype: str) -> str:
    """
    Translate a Meerschaum dtype into the corresponding Pandas dtype string.

    Resolution order:
    1. A key of `MRSM_PD_DTYPES` returns the mapped value.
    2. A dtype whose first space-separated word is upper-case is handed to
       `get_pd_type_from_db_type()` as a SQL database type.
    3. Anything else is parsed with `pandas.api.types.pandas_dtype()`.

    Returns `'object'` (after emitting a warning) when Pandas cannot parse the dtype.
    """
    mapped_dtype = MRSM_PD_DTYPES.get(dtype)
    if mapped_dtype is not None:
        return mapped_dtype

    ### NOTE: Heuristic: an all-caps first word marks a SQL database type.
    first_word = dtype.split(' ')[0]
    if first_word.isupper():
        from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
        return get_pd_type_from_db_type(dtype)

    from meerschaum.utils.packages import attempt_import
    pd = attempt_import('pandas', lazy=False)

    try:
        resolved_dtype = str(pd.api.types.pandas_dtype(dtype))
    except Exception:
        warning_msg = (
            f"Invalid dtype '{dtype}', will use 'object' instead:\n"
            + f"{traceback.format_exc()}"
        )
        warn(warning_msg, stack=False)
        return 'object'

    return resolved_dtype
Cast a supported Meerschaum dtype to a Pandas dtype.
def
are_dtypes_equal( ldtype: Union[str, Dict[str, str]], rdtype: Union[str, Dict[str, str]]) -> bool:
def are_dtypes_equal(
        ldtype: Union[str, Dict[str, str]],
        rdtype: Union[str, Dict[str, str]],
    ) -> bool:
    """
    Determine whether two dtype strings may be considered
    equivalent to avoid unnecessary conversions.

    Parameters
    ----------
    ldtype: Union[str, Dict[str, str]]
        The left dtype to compare.
        May also provide a dtypes dictionary.

    rdtype: Union[str, Dict[str, str]]
        The right dtype to compare.
        May also provide a dtypes dictionary.

    Returns
    -------
    A `bool` indicating whether the two dtypes are to be considered equivalent.
    When two dictionaries are given, they are equivalent only if they have
    the same (stringified) keys and every pair of values is equivalent.
    """
    if isinstance(ldtype, dict) and isinstance(rdtype, dict):
        ### Index by stringified key so that non-string keys may still be looked up.
        ltypes = {str(key): val for key, val in ldtype.items()}
        rtypes = {str(key): val for key, val in rdtype.items()}

        ### The key sets must match exactly
        ### (pairing sorted keys with `zip` would ignore the extra keys of the longer dict).
        if ltypes.keys() != rtypes.keys():
            return False

        return all(
            are_dtypes_equal(ltype, rtypes[key])
            for key, ltype in ltypes.items()
        )

    try:
        if ldtype == rdtype:
            return True
    except Exception:
        warn(f"Exception when comparing dtypes, returning False:\n{traceback.format_exc()}")
        return False

    ### Sometimes pandas dtype objects are passed.
    ldtype = str(ldtype)
    rdtype = str(rdtype)

    json_dtypes = ('json', 'object')
    if ldtype in json_dtypes and rdtype in json_dtypes:
        return True

    numeric_dtypes = ('numeric', 'object')
    if ldtype in numeric_dtypes and rdtype in numeric_dtypes:
        return True

    ### Drop bracketed suffixes, e.g. 'string[python]' -> 'string'.
    ldtype_clean = ldtype.split('[')[0]
    rdtype_clean = rdtype.split('[')[0]

    if ldtype_clean.lower() == rdtype_clean.lower():
        return True

    datetime_dtypes = ('datetime', 'timestamp')
    ldtype_is_dt = any(dt_prefix in ldtype.lower() for dt_prefix in datetime_dtypes)
    rdtype_is_dt = any(dt_prefix in rdtype.lower() for dt_prefix in datetime_dtypes)
    if ldtype_is_dt and rdtype_is_dt:
        return True

    string_dtypes = ('str', 'string', 'object')
    if ldtype_clean in string_dtypes and rdtype_clean in string_dtypes:
        return True

    int_dtypes = ('int', 'int64', 'int32', 'int16', 'int8')
    if ldtype_clean.lower() in int_dtypes and rdtype_clean.lower() in int_dtypes:
        return True

    float_dtypes = ('float', 'float64', 'float32', 'float16', 'float128', 'double')
    if ldtype_clean.lower() in float_dtypes and rdtype_clean.lower() in float_dtypes:
        return True

    bool_dtypes = ('bool', 'boolean')
    if ldtype_clean in bool_dtypes and rdtype_clean in bool_dtypes:
        return True

    return False
Determine whether two dtype strings may be considered equivalent to avoid unnecessary conversions.
Parameters
- ldtype (Union[str, Dict[str, str]]): The left dtype to compare. May also provide a dtypes dictionary.
- rdtype (Union[str, Dict[str, str]]): The right dtype to compare. May also provide a dtypes dictionary.
Returns
- A `bool` indicating whether the two dtypes are to be considered equivalent.
def
is_dtype_numeric(dtype: str) -> bool:
def is_dtype_numeric(dtype: str) -> bool:
    """
    Check whether a `dtype` string is compatible with the Meerschaum dtype `numeric`.

    Parameters
    ----------
    dtype: str
        The pandas-like dtype string.

    Returns
    -------
    `True` if the lower-cased dtype contains any of the substrings
    `'numeric'`, `'float'`, `'double'`, or `'int'`, otherwise `False`.
    """
    numeric_markers = ('numeric', 'float', 'double', 'int')
    lowered = dtype.lower()
    return any(marker in lowered for marker in numeric_markers)
Determine whether a given `dtype` string should be considered compatible with the Meerschaum dtype `numeric`.
Parameters
- dtype (str): The pandas-like dtype string.
Returns
- A bool indicating the dtype is compatible with `numeric`.
def
attempt_cast_to_numeric(value: Any) -> Any:
def attempt_cast_to_numeric(value: Any) -> Any:
    """
    Try to coerce a value into a `Decimal`.

    - A `Decimal` is returned as-is.
    - A null-like value (per `value_is_null()`) becomes `Decimal('NaN')`.
    - Anything else is stringified and parsed as a `Decimal`.

    If any step raises, the original value is returned unchanged.
    """
    if isinstance(value, Decimal):
        return value

    try:
        if value_is_null(value):
            return Decimal('NaN')
        return Decimal(str(value))
    except Exception:
        ### Best-effort: hand back the value untouched if it cannot be parsed.
        return value
Given a value, attempt to coerce it into a numeric (Decimal).
def
value_is_null(value: Any) -> bool:
def value_is_null(value: Any) -> bool:
    """
    Check whether a value's string form is null-like.

    The value is stringified and lower-cased, then compared against
    `'none'`, `'nan'`, `'na'`, `'nat'`, `''`, and `'<na>'`.
    """
    null_strings = {'none', 'nan', 'na', 'nat', '', '<na>'}
    text = str(value).lower()
    return text in null_strings
Determine if a value is a null-like string.
def
none_if_null(value: Any) -> Any:
def none_if_null(value: Any) -> Any:
    """
    Replace a null-like value (per `value_is_null()`) with `None`.
    Any other value is returned unchanged.
    """
    if value_is_null(value):
        return None
    return value
Return `None` if a value is a null-like string.
def
quantize_decimal(x: decimal.Decimal, scale: int, precision: int) -> decimal.Decimal:
def quantize_decimal(x: Decimal, scale: int, precision: int) -> Decimal:
    """
    Quantize a `Decimal` to a known scale and precision.

    Parameters
    ----------
    x: Decimal
        The `Decimal` to be quantized.

    scale: int
        The total number of significant digits
        (used as the context precision while quantizing).

    precision: int
        The number of digits to keep after the decimal point.

    Returns
    -------
    The quantized `Decimal`, or `x` unchanged if quantizing raises
    `InvalidOperation`.
    """
    ### Template value: `scale` ones, a decimal point, then `precision` ones.
    template = Decimal(f"{'1' * scale}.{'1' * precision}")
    try:
        quantized = x.quantize(template, context=Context(prec=scale))
    except InvalidOperation:
        return x
    return quantized
Quantize a given `Decimal` to a known scale and precision.
Parameters
- x (Decimal): The `Decimal` to be quantized.
- scale (int): The total number of significant digits.
- precision (int): The number of significant digits after the decimal point.
Returns
- A `Decimal` quantized to the specified scale and precision.