meerschaum.utils.dtypes

Utility functions for working with data types.

  1#! /usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3# vim:fenc=utf-8
  4
  5"""
  6Utility functions for working with data types.
  7"""
  8
  9import traceback
 10import json
 11import uuid
 12from datetime import timezone, datetime
 13from decimal import Decimal, Context, InvalidOperation, ROUND_HALF_UP
 14
 15import meerschaum as mrsm
 16from meerschaum.utils.typing import Dict, Union, Any, Optional, Tuple
 17from meerschaum.utils.warnings import warn
 18
### Alternate spellings of a dtype mapped to the canonical Meerschaum dtype name.
### Lookups are case-sensitive, which is why both 'decimal' and 'Decimal' are listed.
MRSM_ALIAS_DTYPES: Dict[str, str] = {
    'decimal': 'numeric',
    'Decimal': 'numeric',
    'number': 'numeric',
    'jsonl': 'json',
    'JSON': 'json',
    'binary': 'bytes',
    'blob': 'bytes',
    'varbinary': 'bytes',
    'bytea': 'bytes',
    'guid': 'uuid',
    'UUID': 'uuid',
    'geom': 'geometry',
    'geog': 'geography',
}
### Canonical Meerschaum dtype mapped to the pandas dtype string used to store it.
### Dtypes without a native pandas representation (and `None`) are stored as 'object'.
MRSM_PD_DTYPES: Dict[Union[str, None], str] = {
    'json': 'object',
    'numeric': 'object',
    'geometry': 'object',
    'geography': 'object',
    'uuid': 'object',
    'datetime': 'datetime64[ns, UTC]',
    'bool': 'bool[pyarrow]',
    'int': 'Int64',
    'int8': 'Int8',
    'int16': 'Int16',
    'int32': 'Int32',
    'int64': 'Int64',
    'str': 'string[python]',
    'bytes': 'object',
    None: 'object',
}
 51
 52
 53def to_pandas_dtype(dtype: str) -> str:
 54    """
 55    Cast a supported Meerschaum dtype to a Pandas dtype.
 56    """
 57    known_dtype = MRSM_PD_DTYPES.get(dtype, None)
 58    if known_dtype is not None:
 59        return known_dtype
 60
 61    alias_dtype = MRSM_ALIAS_DTYPES.get(dtype, None)
 62    if alias_dtype is not None:
 63        return MRSM_PD_DTYPES[alias_dtype]
 64
 65    if dtype.startswith('numeric'):
 66        return MRSM_PD_DTYPES['numeric']
 67
 68    if dtype.startswith('geometry'):
 69        return MRSM_PD_DTYPES['geometry']
 70
 71    if dtype.startswith('geography'):
 72        return MRSM_PD_DTYPES['geography']
 73
 74    ### NOTE: Kind of a hack, but if the first word of the given dtype is in all caps,
 75    ### treat it as a SQL db type.
 76    if dtype.split(' ')[0].isupper():
 77        from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
 78        return get_pd_type_from_db_type(dtype)
 79
 80    from meerschaum.utils.packages import attempt_import
 81    _ = attempt_import('pyarrow', lazy=False)
 82    pandas = attempt_import('pandas', lazy=False)
 83
 84    try:
 85        return str(pandas.api.types.pandas_dtype(dtype))
 86    except Exception:
 87        warn(
 88            f"Invalid dtype '{dtype}', will use 'object' instead:\n"
 89            + f"{traceback.format_exc()}",
 90            stack=False,
 91        )
 92    return 'object'
 93
 94
 95def are_dtypes_equal(
 96    ldtype: Union[str, Dict[str, str]],
 97    rdtype: Union[str, Dict[str, str]],
 98) -> bool:
 99    """
100    Determine whether two dtype strings may be considered
101    equivalent to avoid unnecessary conversions.
102
103    Parameters
104    ----------
105    ldtype: Union[str, Dict[str, str]]
106        The left dtype to compare.
107        May also provide a dtypes dictionary.
108
109    rdtype: Union[str, Dict[str, str]]
110        The right dtype to compare.
111        May also provide a dtypes dictionary.
112
113    Returns
114    -------
115    A `bool` indicating whether the two dtypes are to be considered equivalent.
116    """
117    if isinstance(ldtype, dict) and isinstance(rdtype, dict):
118        lkeys = sorted([str(k) for k in ldtype.keys()])
119        rkeys = sorted([str(k) for k in rdtype.keys()])
120        for lkey, rkey in zip(lkeys, rkeys):
121            if lkey != rkey:
122                return False
123            ltype = ldtype[lkey]
124            rtype = rdtype[rkey]
125            if not are_dtypes_equal(ltype, rtype):
126                return False
127        return True
128
129    try:
130        if ldtype == rdtype:
131            return True
132    except Exception:
133        warn(f"Exception when comparing dtypes, returning False:\n{traceback.format_exc()}")
134        return False
135
136    ### Sometimes pandas dtype objects are passed.
137    ldtype = str(ldtype).split('[', maxsplit=1)[0]
138    rdtype = str(rdtype).split('[', maxsplit=1)[0]
139
140    if ldtype in MRSM_ALIAS_DTYPES:
141        ldtype = MRSM_ALIAS_DTYPES[ldtype]
142
143    if rdtype in MRSM_ALIAS_DTYPES:
144        rdtype = MRSM_ALIAS_DTYPES[rdtype]
145
146    json_dtypes = ('json', 'object')
147    if ldtype in json_dtypes and rdtype in json_dtypes:
148        return True
149
150    numeric_dtypes = ('numeric', 'object')
151    if ldtype in numeric_dtypes and rdtype in numeric_dtypes:
152        return True
153
154    uuid_dtypes = ('uuid', 'object')
155    if ldtype in uuid_dtypes and rdtype in uuid_dtypes:
156        return True
157
158    bytes_dtypes = ('bytes', 'object')
159    if ldtype in bytes_dtypes and rdtype in bytes_dtypes:
160        return True
161
162    geometry_dtypes = ('geometry', 'object', 'geography')
163    if ldtype in geometry_dtypes and rdtype in geometry_dtypes:
164        return True
165
166    if ldtype.lower() == rdtype.lower():
167        return True
168
169    datetime_dtypes = ('datetime', 'timestamp')
170    ldtype_found_dt_prefix = False
171    rdtype_found_dt_prefix = False
172    for dt_prefix in datetime_dtypes:
173        ldtype_found_dt_prefix = (dt_prefix in ldtype.lower()) or ldtype_found_dt_prefix
174        rdtype_found_dt_prefix = (dt_prefix in rdtype.lower()) or rdtype_found_dt_prefix
175    if ldtype_found_dt_prefix and rdtype_found_dt_prefix:
176        return True
177
178    string_dtypes = ('str', 'string', 'object')
179    if ldtype in string_dtypes and rdtype in string_dtypes:
180        return True
181
182    int_dtypes = ('int', 'int64', 'int32', 'int16', 'int8')
183    if ldtype.lower() in int_dtypes and rdtype.lower() in int_dtypes:
184        return True
185
186    float_dtypes = ('float', 'float64', 'float32', 'float16', 'float128', 'double')
187    if ldtype.lower() in float_dtypes and rdtype.lower() in float_dtypes:
188        return True
189
190    bool_dtypes = ('bool', 'boolean')
191    if ldtype in bool_dtypes and rdtype in bool_dtypes:
192        return True
193
194    return False
195
196
def is_dtype_numeric(dtype: str) -> bool:
    """
    Check whether a `dtype` string is compatible with the Meerschaum dtype `numeric`.

    Parameters
    ----------
    dtype: str
        The pandas-like dtype string.

    Returns
    -------
    `True` if the lower-cased `dtype` contains any of the substrings
    `numeric`, `float`, `double` or `int`, otherwise `False`.
    """
    lowered = dtype.lower()
    numeric_markers = ('numeric', 'float', 'double', 'int')
    return any(marker in lowered for marker in numeric_markers)
219
220
def attempt_cast_to_numeric(
    value: Any,
    quantize: bool = False,
    precision: Optional[int] = None,
    scale: Optional[int] = None,
)-> Any:
    """
    Try to convert a value into a `Decimal`, returning the input on failure.

    Parameters
    ----------
    value: Any
        The value to be cast to a Decimal.

    quantize: bool, default False
        If `True`, quantize the decimal to the specified precision and scale.
        Quantization only happens when `precision` and `scale` are both truthy.

    precision: Optional[int], default None
        If `quantize` is `True`, use this precision.

    scale: Optional[int], default None
        If `quantize` is `True`, use this scale.

    Returns
    -------
    A `Decimal` if possible (`Decimal('NaN')` for null-like values), or `value`.
    """
    if isinstance(value, Decimal):
        ### Errors from quantizing an existing Decimal are not suppressed.
        if not (quantize and precision and scale):
            return value
        return quantize_decimal(value, precision, scale)

    try:
        if value_is_null(value):
            return Decimal('NaN')

        parsed = Decimal(str(value))
        if quantize and precision and scale:
            return quantize_decimal(parsed, precision, scale)
        return parsed
    except Exception:
        ### Best effort: anything that cannot be parsed is handed back untouched.
        return value
262
263
def attempt_cast_to_uuid(value: Any) -> Any:
    """
    Try to convert a value into a `uuid.UUID`.

    Existing `UUID` objects are returned as-is, null-like values become `None`,
    and values which cannot be parsed are returned unchanged.
    """
    if isinstance(value, uuid.UUID):
        return value

    try:
        if value_is_null(value):
            return None
        return uuid.UUID(str(value))
    except Exception:
        return value
278
279
def attempt_cast_to_bytes(value: Any) -> Any:
    """
    Try to convert a value into a bytestring.

    Existing `bytes` are returned as-is, null-like values become `None`,
    and other values are stringified and decoded with `deserialize_bytes_string()`.
    Values which cannot be decoded are returned unchanged.
    """
    if isinstance(value, bytes):
        return value

    try:
        if value_is_null(value):
            return None
        return deserialize_bytes_string(str(value))
    except Exception:
        return value
294
295
def attempt_cast_to_geometry(value: Any) -> Any:
    """
    Try to convert a value into a `shapely` geometry object.

    Values whose type name mentions `shapely` are returned as-is.
    Dictionaries and lists are parsed as GeoJSON, strings as WKT or hex WKB,
    and bytes as WKB. Anything which cannot be parsed is returned unchanged.
    """
    shapely, shapely_wkt, shapely_wkb = mrsm.attempt_import(
        'shapely',
        'shapely.wkt',
        'shapely.wkb',
        lazy=False,
    )
    already_geometry = 'shapely' in str(type(value))
    if already_geometry:
        return value

    if isinstance(value, (dict, list)):
        try:
            geojson_text = json.dumps(value)
            return shapely.from_geojson(geojson_text)
        except Exception:
            return value

    ### `None` means the value is neither WKT nor WKB, so there is nothing to parse.
    is_wkt = geometry_is_wkt(value)
    if is_wkt is None:
        return value

    try:
        if is_wkt:
            return shapely_wkt.loads(value)
        return shapely_wkb.loads(value)
    except Exception:
        return value
327
328
def geometry_is_wkt(value: Union[str, bytes]) -> Union[bool, None]:
    """
    Determine whether an input value should be treated as WKT or WKB geometry data.

    Parameters
    ----------
    value: Union[str, bytes]
        The input data to be parsed into geometry data.

    Returns
    -------
    A `bool` (`True` if `value` is WKT and `False` if it should be treated as WKB).
    Return `None` if `value` should be parsed as neither
    (e.g. not a string or bytes, an empty string, or unrecognized text).
    """
    import re
    if not isinstance(value, (str, bytes)):
        return None

    ### Raw bytes can only be binary geometry data.
    if isinstance(value, bytes):
        return False

    ### A geometry keyword, an optional Z / M / ZM dimension tag,
    ### then either a parenthesized body or the keyword `EMPTY`.
    wkt_pattern = (
        r'^\s*'
        r'(POINT|LINESTRING|POLYGON|MULTIPOINT|MULTILINESTRING|MULTIPOLYGON|GEOMETRYCOLLECTION)'
        r'(\s+(ZM|Z|M))?'
        r'\s*(\(.*\)|EMPTY)\s*$'
    )
    ### `DOTALL` allows the coordinates to be wrapped across several lines.
    if re.match(wkt_pattern, value, re.IGNORECASE | re.DOTALL):
        return True

    ### Hex-encoded WKB: a non-empty string with an even number of hex digits.
    ### The emptiness check matters because `all()` is `True` for an empty string.
    if value and len(value) % 2 == 0 and all(c in '0123456789ABCDEFabcdef' for c in value):
        return False

    return None
358
359
def value_is_null(value: Any) -> bool:
    """
    Report whether the string form of a value is a null-like marker.

    The comparison is case-insensitive and matches
    `none`, `nan`, `na`, `nat`, `<na>` and the empty string.
    """
    null_markers = {'none', 'nan', 'na', 'nat', '', '<na>'}
    normalized = str(value).lower()
    return normalized in null_markers
365
366
def none_if_null(value: Any) -> Any:
    """
    Replace a null-like value (as judged by `value_is_null()`) with `None`;
    any other value is returned unchanged.
    """
    if value_is_null(value):
        return None
    return value
372
373
def quantize_decimal(x: Decimal, precision: int, scale: int) -> Decimal:
    """
    Quantize a given `Decimal` to a known scale and precision.

    Parameters
    ----------
    x: Decimal
        The `Decimal` to be quantized.

    precision: int
        The total number of significant digits.

    scale: int
        The number of significant digits after the decimal point.

    Returns
    -------
    A `Decimal` quantized to the specified scale and precision (rounding half up).

    Raises
    ------
    ValueError
        If `x` cannot be represented within `precision` digits.
        The underlying `InvalidOperation` is attached as the cause.
    """
    ### Only the exponent of this template matters to `quantize()`;
    ### the digits themselves are placeholders.
    integer_digits = '1' * (precision - scale)
    fractional_digits = '1' * scale
    precision_decimal = Decimal(integer_digits + '.' + fractional_digits)

    try:
        return x.quantize(
            precision_decimal,
            context=Context(prec=precision),
            rounding=ROUND_HALF_UP,
        )
    except InvalidOperation as e:
        ### Chain the original error so the reason is not lost.
        raise ValueError(f"Cannot quantize value '{x}' to {precision=}, {scale=}.") from e
400
401
def serialize_decimal(
    x: Any,
    quantize: bool = False,
    precision: Optional[int] = None,
    scale: Optional[int] = None,
) -> Any:
    """
    Render a `Decimal` as a fixed-point string, optionally quantizing it first.

    Parameters
    ----------
    x: Any
        The potential decimal to be serialized.

    quantize: bool, default False
        If `True`, quantize the incoming Decimal to the specified scale and precision
        before serialization. Both `scale` and `precision` must be truthy.

    precision: Optional[int], default None
        The precision of the decimal to be quantized.

    scale: Optional[int], default None
        The scale of the decimal to be quantized.

    Returns
    -------
    A string of the input decimal, `None` for a null-like Decimal (e.g. `NaN`),
    or the input itself if it is not a Decimal.
    """
    if not isinstance(x, Decimal):
        return x

    if value_is_null(x):
        return None

    should_quantize = quantize and scale and precision
    to_render = quantize_decimal(x, precision, scale) if should_quantize else x

    ### The `f` format avoids scientific notation.
    return f"{to_render:f}"
440
441
def coerce_timezone(
    dt: Any,
    strip_utc: bool = False,
) -> Any:
    """
    Given a `datetime`, pandas `Timestamp` or `Series` of `Timestamp`,
    return a UTC timestamp (strip the timezone if `strip_utc` is `True`).

    Parameters
    ----------
    dt: Any
        The value to coerce. `None` and integers are returned unchanged,
        and strings are first parsed with `dateutil`.

    strip_utc: bool, default False
        If `True`, return timezone-naive values; otherwise timezone-aware UTC values.

    Returns
    -------
    The coerced datetime value (or series of values).
    """
    if dt is None:
        return None

    if isinstance(dt, int):
        return dt

    if isinstance(dt, str):
        dateutil_parser = mrsm.attempt_import('dateutil.parser')
        dt = dateutil_parser.parse(dt)

    ### Series-like objects expose both `dtype` and `__module__`.
    looks_like_series = hasattr(dt, 'dtype') and hasattr(dt, '__module__')

    if not looks_like_series:
        if dt.tzinfo is None:
            ### Naive values are taken to already be in UTC.
            return dt if strip_utc else dt.replace(tzinfo=timezone.utc)

        as_utc = dt.astimezone(timezone.utc)
        return as_utc.replace(tzinfo=None) if strip_utc else as_utc

    pandas = mrsm.attempt_import('pandas', lazy=False)

    if pandas.api.types.is_datetime64_any_dtype(dt):
        ### Nothing to do when the awareness of the series already matches the request.
        series_is_naive = dt.dt.tz is None
        if series_is_naive == bool(strip_utc):
            return dt

    coerced = to_datetime(dt, coerce_utc=False)
    if strip_utc:
        try:
            if coerced.dt.tz is not None:
                coerced = coerced.dt.tz_localize(None)
        except Exception:
            ### Best effort: leave the series as-is if it has no datetime accessor.
            pass

    return coerced
493
494
def to_datetime(dt_val: Any, as_pydatetime: bool = False, coerce_utc: bool = True) -> Any:
    """
    Wrap `pd.to_datetime()` and add support for out-of-bounds values.

    Parameters
    ----------
    dt_val: Any
        The value (or series of values) to parse.
        Objects from a `dask` module are parsed with `dask.dataframe.to_datetime()`.

    as_pydatetime: bool, default False
        If `True`, call `to_pydatetime()` on the parsed result.
        Only applies when the initial `to_datetime()` call succeeds.

    coerce_utc: bool, default True
        If `True`, pass values parsed by the `dateutil` fallback through `coerce_timezone()`.

    Returns
    -------
    The parsed datetime value(s).
    In the fallback path, values which `dateutil` cannot parse are returned unchanged.
    """
    pandas, dateutil_parser = mrsm.attempt_import('pandas', 'dateutil.parser', lazy=False)
    is_dask = 'dask' in getattr(dt_val, '__module__', '')
    dd = mrsm.attempt_import('dask.dataframe') if is_dask else None
    dt_is_series = hasattr(dt_val, 'dtype') and hasattr(dt_val, '__module__')
    pd = pandas if dd is None else dd

    try:
        new_dt_val = pd.to_datetime(dt_val, utc=True, format='ISO8601')
        if as_pydatetime:
            return new_dt_val.to_pydatetime()
        return new_dt_val
    ### NOTE: Look the exception up on `pandas` itself: `pd` may be `dask.dataframe`,
    ### which is not guaranteed to expose an `errors` namespace.
    except (pandas.errors.OutOfBoundsDatetime, ValueError):
        pass

    def parse(x: Any) -> Any:
        """Parse a single value with `dateutil`, returning it unchanged on failure."""
        try:
            return dateutil_parser.parse(x)
        except Exception:
            return x

    if dt_is_series:
        new_series = dt_val.apply(parse)
        if coerce_utc:
            return coerce_timezone(new_series)
        return new_series

    new_dt_val = parse(dt_val)
    if not coerce_utc:
        return new_dt_val
    return coerce_timezone(new_dt_val)
529
530
def serialize_bytes(data: bytes) -> str:
    """
    Encode bytes as a base64 string (decoded as UTF-8 text).

    A null-like value which is not `bytes` is returned unchanged.
    """
    from base64 import b64encode

    is_null_placeholder = (not isinstance(data, bytes)) and value_is_null(data)
    if is_null_placeholder:
        return data

    encoded = b64encode(data)
    return encoded.decode('utf-8')
539
540
def serialize_geometry(
    geom: Any,
    geometry_format: str = 'wkb_hex',
    as_wkt: bool = False,
) -> Union[str, Dict[str, Any], None]:
    """
    Serialize geometry data (by default as a hex-encoded well-known-binary string).

    Parameters
    ----------
    geom: Any
        The potential geometry data to be serialized.

    geometry_format: str, default 'wkb_hex'
        The serialization format for geometry data.
        Accepted formats are `wkb_hex` (well-known binary hex string),
        `wkt` (well-known text), and `geojson`.
        Any value other than `wkb_hex` or `geojson` produces well-known text.

    as_wkt: bool, default False
        Accepted but not read by this function.

    Returns
    -------
    A string containing the geometry data, a dictionary for `geojson`,
    or `None` if `geom` is null-like.
    Objects without a `wkb_hex` attribute are returned as `str(geom)`.
    """
    if value_is_null(geom):
        return None

    shapely = mrsm.attempt_import('shapely', lazy=False)
    if geometry_format == 'geojson':
        return json.loads(shapely.to_geojson(geom))

    if not hasattr(geom, 'wkb_hex'):
        return str(geom)

    if geometry_format == 'wkb_hex':
        return geom.wkb_hex
    return geom.wkt
574
575
def deserialize_geometry(geom_wkb: Union[str, bytes]):
    """
    Deserialize a WKB string into a shapely geometry object.

    Parameters
    ----------
    geom_wkb: Union[str, bytes]
        The well-known-binary data to parse.

    Returns
    -------
    The geometry object produced by `shapely.wkb.loads()`.
    """
    ### Request the submodule by name, the same way `shapely.wkb` is imported
    ### elsewhere in this module.
    shapely_wkb = mrsm.attempt_import('shapely.wkb', lazy=False)
    return shapely_wkb.loads(geom_wkb)
582
583
def deserialize_bytes_string(data: Optional[str], force_hex: bool = False) -> Union[bytes, None]:
    """
    Decode a serialized ASCII string back into the bytes it represents.
    The input data may either be base64- or hex-encoded.

    Parameters
    ----------
    data: str | None
        The string to be deserialized into bytes.
        May be base64- or hex-encoded (prefixed with `'\\x'`).

    force_hex: bool = False
        If `True`, treat the input string as hex-encoded.
        Set this when hex-encoded `data` does not begin with the prefix `'\\x'`.
        A leading `'\\x'` prefix is still stripped if present.

    Returns
    -------
    The original bytes used to produce the encoded string `data`.
    A null-like value which is not a string is returned unchanged.
    """
    if not isinstance(data, str) and value_is_null(data):
        return data

    from base64 import b64decode
    from binascii import unhexlify

    has_hex_prefix = data.startswith('\\x')
    if not (force_hex or has_hex_prefix):
        return b64decode(data)

    hex_digits = data[2:] if has_hex_prefix else data
    return unhexlify(hex_digits)
618
619
def deserialize_base64(data: str) -> bytes:
    """
    Decode a base64-encoded string back into the original bytestring.
    """
    from base64 import b64decode
    return b64decode(data)
626
627
def encode_bytes_for_bytea(data: bytes, with_prefix: bool = True) -> Union[str, None]:
    """
    Return the given bytes as a hex string for PostgreSQL's `BYTEA` type.

    Parameters
    ----------
    data: bytes
        The bytes to encode.

    with_prefix: bool, default True
        If `True`, prepend the `'\\x'` prefix to the hex digits.

    Returns
    -------
    The hex-encoded string.
    A null-like value which is not `bytes` is returned unchanged.
    """
    import binascii
    if not isinstance(data, bytes) and value_is_null(data):
        return data

    prefix = '\\x' if with_prefix else ''
    return prefix + binascii.hexlify(data).decode('utf-8')
636
637
def serialize_datetime(dt: datetime) -> Union[str, None]:
    """
    Serialize a datetime object into an ISO format string suitable for JSON.

    Timezone-naive datetimes are suffixed with `Z`;
    timezone-aware datetimes keep the offset produced by `isoformat()`.
    Returns `None` if `dt` is not a `datetime`.

    Examples
    --------
    >>> import json
    >>> from datetime import datetime
    >>> json.dumps({'a': datetime(2022, 1, 1)}, default=serialize_datetime)
    '{"a": "2022-01-01T00:00:00Z"}'

    """
    if not isinstance(dt, datetime):
        return None

    iso_text = dt.isoformat()
    if dt.tzinfo is None:
        return iso_text + 'Z'
    return iso_text + ''
654
655
def json_serialize_value(x: Any, default_to_str: bool = True) -> str:
    """
    Convert a value into something JSON can represent.
    Accounts for pipes, connectors, datetimes, bytes, decimals, and geometries.

    Parameters
    ----------
    x: Any
        The value to serialize.

    default_to_str: bool, default True
        If `True`, return a string of `x` if x is not a designated type.
        Otherwise return x.

    Returns
    -------
    A serialized version of x, `None` for null-like values, or x.
    """
    if isinstance(x, (mrsm.Pipe, mrsm.connectors.Connector)):
        return x.meta

    ### Anything with a `tzinfo` attribute is handled as a datetime.
    if hasattr(x, 'tzinfo'):
        return serialize_datetime(x)

    if isinstance(x, bytes):
        return serialize_bytes(x)

    if isinstance(x, Decimal):
        return serialize_decimal(x)

    is_geometry = 'shapely' in str(type(x))
    if is_geometry:
        return serialize_geometry(x)

    if value_is_null(x):
        return None

    if default_to_str:
        return str(x)
    return x
692
693
def get_geometry_type_srid(
    dtype: str = 'geometry',
    default_type: str = 'geometry',
    default_srid: int = 4326,
) -> Union[Tuple[str, int], Tuple[str, None]]:
    """
    Given the specified geometry `dtype`, return a tuple in the form (type, SRID).

    Parameters
    ----------
    dtype: str, default 'geometry'
        Optionally provide a specific `geometry` syntax (e.g. `geometry[MultiLineString, 4326]`).
        PostGIS-style parentheses (e.g. `geometry(Point, 4326)`) are also accepted.
        You may specify a supported `shapely` geometry type and an SRID in the dtype modifier:

        - `Point`
        - `LineString`
        - `LinearRing`
        - `Polygon`
        - `MultiPoint`
        - `MultiLineString`
        - `MultiPolygon`
        - `GeometryCollection`

    default_type: str, default 'geometry'
        The type to return when the modifier does not name one.

    default_srid: int, default 4326
        The SRID to return when the modifier does not include one.

    Returns
    -------
    A tuple in the form (type, SRID).
    Defaults to `(default_type, default_srid)`.
    The type is returned exactly as written in the modifier (its case is not changed).

    Examples
    --------
    >>> from meerschaum.utils.dtypes import get_geometry_type_srid
    >>> get_geometry_type_srid()
    ('geometry', 4326)
    >>> get_geometry_type_srid('geometry[]')
    ('geometry', 4326)
    >>> get_geometry_type_srid('geometry[Point, 0]')
    ('Point', 0)
    >>> get_geometry_type_srid('geometry[0, Point]')
    ('Point', 0)
    >>> get_geometry_type_srid('geometry[0]')
    ('geometry', 0)
    >>> get_geometry_type_srid('geometry[MULTILINESTRING, 4326]')
    ('MULTILINESTRING', 4326)
    >>> get_geometry_type_srid('geography')
    ('geometry', 4326)
    >>> get_geometry_type_srid('geography[POINT]')
    ('POINT', 4326)
    """
    from meerschaum.utils.misc import is_int
    ### NOTE: PostGIS syntax must also be parsed.
    dtype = dtype.replace('(', '[').replace(')', ']')
    bare_dtype = dtype.split('[', maxsplit=1)[0]
    ### Slice off the bare dtype rather than splitting on it:
    ### `str.split('')` raises a `ValueError` when the bare dtype is empty.
    modifier = dtype[len(bare_dtype):].lstrip('[').rstrip(']')
    if not modifier:
        return default_type, default_srid

    ### Accept `key=value` parts by keeping only the value.
    parts = [
        part.split('=')[-1].strip()
        for part in modifier.split(',')
    ]
    ### Skip empty parts (e.g. `geometry[,4326]`) so they cannot become the type.
    ### NOTE(review): assumes `is_int()` is only `True` for strings `int()` can parse.
    parts_casted = [
        (int(part) if is_int(part) else part)
        for part in parts
        if part
    ]

    ### The first integer is the SRID and the first string is the type, in any order.
    srid = next(
        (part for part in parts_casted if isinstance(part, int)),
        default_srid,
    )
    geometry_type = next(
        (part for part in parts_casted if isinstance(part, str)),
        default_type,
    )

    return geometry_type, srid
### Alternate spellings of a dtype mapped to the canonical Meerschaum dtype name (case-sensitive).
MRSM_ALIAS_DTYPES: Dict[str, str] = {'decimal': 'numeric', 'Decimal': 'numeric', 'number': 'numeric', 'jsonl': 'json', 'JSON': 'json', 'binary': 'bytes', 'blob': 'bytes', 'varbinary': 'bytes', 'bytea': 'bytes', 'guid': 'uuid', 'UUID': 'uuid', 'geom': 'geometry', 'geog': 'geography'}
### Canonical Meerschaum dtype mapped to the pandas dtype string used to store it.
MRSM_PD_DTYPES: Dict[Optional[str], str] = {'json': 'object', 'numeric': 'object', 'geometry': 'object', 'geography': 'object', 'uuid': 'object', 'datetime': 'datetime64[ns, UTC]', 'bool': 'bool[pyarrow]', 'int': 'Int64', 'int8': 'Int8', 'int16': 'Int16', 'int32': 'Int32', 'int64': 'Int64', 'str': 'string[python]', 'bytes': 'object', None: 'object'}
def to_pandas_dtype(dtype: str) -> str:
def to_pandas_dtype(dtype: str) -> str:
    """
    Translate a Meerschaum dtype into the name of the pandas dtype used to store it.

    Parameters
    ----------
    dtype: str
        A Meerschaum dtype, one of its aliases, a parametrized dtype
        (e.g. beginning with `numeric`, `geometry` or `geography`),
        a SQL database type, or a pandas dtype string.

    Returns
    -------
    The pandas dtype string.
    Falls back to `'object'` (with a warning) if pandas does not recognize `dtype`.
    """
    direct_match = MRSM_PD_DTYPES.get(dtype)
    if direct_match is not None:
        return direct_match

    canonical_name = MRSM_ALIAS_DTYPES.get(dtype)
    if canonical_name is not None:
        return MRSM_PD_DTYPES[canonical_name]

    ### Parametrized dtypes share the pandas dtype of their base name.
    for base_name in ('numeric', 'geometry', 'geography'):
        if dtype.startswith(base_name):
            return MRSM_PD_DTYPES[base_name]

    ### NOTE: Kind of a hack, but an all-caps first word is treated as a SQL db type.
    leading_word = dtype.split(' ')[0]
    if leading_word.isupper():
        from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
        return get_pd_type_from_db_type(dtype)

    from meerschaum.utils.packages import attempt_import
    ### pyarrow is imported before pandas; its return value is not needed here.
    _ = attempt_import('pyarrow', lazy=False)
    pandas = attempt_import('pandas', lazy=False)

    try:
        resolved = pandas.api.types.pandas_dtype(dtype)
        return str(resolved)
    except Exception:
        warn(
            f"Invalid dtype '{dtype}', will use 'object' instead:\n"
            + f"{traceback.format_exc()}",
            stack=False,
        )
    return 'object'

Cast a supported Meerschaum dtype to a Pandas dtype.

def are_dtypes_equal( ldtype: Union[str, Dict[str, str]], rdtype: Union[str, Dict[str, str]]) -> bool:
 96def are_dtypes_equal(
 97    ldtype: Union[str, Dict[str, str]],
 98    rdtype: Union[str, Dict[str, str]],
 99) -> bool:
100    """
101    Determine whether two dtype strings may be considered
102    equivalent to avoid unnecessary conversions.
103
104    Parameters
105    ----------
106    ldtype: Union[str, Dict[str, str]]
107        The left dtype to compare.
108        May also provide a dtypes dictionary.
109
110    rdtype: Union[str, Dict[str, str]]
111        The right dtype to compare.
112        May also provide a dtypes dictionary.
113
114    Returns
115    -------
116    A `bool` indicating whether the two dtypes are to be considered equivalent.
117    """
118    if isinstance(ldtype, dict) and isinstance(rdtype, dict):
119        lkeys = sorted([str(k) for k in ldtype.keys()])
120        rkeys = sorted([str(k) for k in rdtype.keys()])
121        for lkey, rkey in zip(lkeys, rkeys):
122            if lkey != rkey:
123                return False
124            ltype = ldtype[lkey]
125            rtype = rdtype[rkey]
126            if not are_dtypes_equal(ltype, rtype):
127                return False
128        return True
129
130    try:
131        if ldtype == rdtype:
132            return True
133    except Exception:
134        warn(f"Exception when comparing dtypes, returning False:\n{traceback.format_exc()}")
135        return False
136
137    ### Sometimes pandas dtype objects are passed.
138    ldtype = str(ldtype).split('[', maxsplit=1)[0]
139    rdtype = str(rdtype).split('[', maxsplit=1)[0]
140
141    if ldtype in MRSM_ALIAS_DTYPES:
142        ldtype = MRSM_ALIAS_DTYPES[ldtype]
143
144    if rdtype in MRSM_ALIAS_DTYPES:
145        rdtype = MRSM_ALIAS_DTYPES[rdtype]
146
147    json_dtypes = ('json', 'object')
148    if ldtype in json_dtypes and rdtype in json_dtypes:
149        return True
150
151    numeric_dtypes = ('numeric', 'object')
152    if ldtype in numeric_dtypes and rdtype in numeric_dtypes:
153        return True
154
155    uuid_dtypes = ('uuid', 'object')
156    if ldtype in uuid_dtypes and rdtype in uuid_dtypes:
157        return True
158
159    bytes_dtypes = ('bytes', 'object')
160    if ldtype in bytes_dtypes and rdtype in bytes_dtypes:
161        return True
162
163    geometry_dtypes = ('geometry', 'object', 'geography')
164    if ldtype in geometry_dtypes and rdtype in geometry_dtypes:
165        return True
166
167    if ldtype.lower() == rdtype.lower():
168        return True
169
170    datetime_dtypes = ('datetime', 'timestamp')
171    ldtype_found_dt_prefix = False
172    rdtype_found_dt_prefix = False
173    for dt_prefix in datetime_dtypes:
174        ldtype_found_dt_prefix = (dt_prefix in ldtype.lower()) or ldtype_found_dt_prefix
175        rdtype_found_dt_prefix = (dt_prefix in rdtype.lower()) or rdtype_found_dt_prefix
176    if ldtype_found_dt_prefix and rdtype_found_dt_prefix:
177        return True
178
179    string_dtypes = ('str', 'string', 'object')
180    if ldtype in string_dtypes and rdtype in string_dtypes:
181        return True
182
183    int_dtypes = ('int', 'int64', 'int32', 'int16', 'int8')
184    if ldtype.lower() in int_dtypes and rdtype.lower() in int_dtypes:
185        return True
186
187    float_dtypes = ('float', 'float64', 'float32', 'float16', 'float128', 'double')
188    if ldtype.lower() in float_dtypes and rdtype.lower() in float_dtypes:
189        return True
190
191    bool_dtypes = ('bool', 'boolean')
192    if ldtype in bool_dtypes and rdtype in bool_dtypes:
193        return True
194
195    return False

Determine whether two dtype strings may be considered equivalent to avoid unnecessary conversions.

Parameters
  • ldtype (Union[str, Dict[str, str]]): The left dtype to compare. May also provide a dtypes dictionary.
  • rdtype (Union[str, Dict[str, str]]): The right dtype to compare. May also provide a dtypes dictionary.
Returns
  • A bool indicating whether the two dtypes are to be considered equivalent.
def is_dtype_numeric(dtype: str) -> bool:
def is_dtype_numeric(dtype: str) -> bool:
    """
    Check whether a `dtype` string is compatible with the Meerschaum dtype `numeric`.

    Parameters
    ----------
    dtype: str
        The pandas-like dtype string.

    Returns
    -------
    `True` if the lower-cased `dtype` contains any of the substrings
    `numeric`, `float`, `double` or `int`, otherwise `False`.
    """
    lowered = dtype.lower()
    numeric_markers = ('numeric', 'float', 'double', 'int')
    return any(marker in lowered for marker in numeric_markers)

Determine whether a given dtype string should be considered compatible with the Meerschaum dtype numeric.

Parameters
  • dtype (str): The pandas-like dtype string.
Returns
  • A bool indicating the dtype is compatible with numeric.
def attempt_cast_to_numeric( value: Any, quantize: bool = False, precision: Optional[int] = None, scale: Optional[int] = None) -> Any:
def attempt_cast_to_numeric(
    value: Any,
    quantize: bool = False,
    precision: Optional[int] = None,
    scale: Optional[int] = None,
)-> Any:
    """
    Try to convert a value into a `Decimal`, returning the input on failure.

    Parameters
    ----------
    value: Any
        The value to be cast to a Decimal.

    quantize: bool, default False
        If `True`, quantize the decimal to the specified precision and scale.
        Quantization only happens when `precision` and `scale` are both truthy.

    precision: Optional[int], default None
        If `quantize` is `True`, use this precision.

    scale: Optional[int], default None
        If `quantize` is `True`, use this scale.

    Returns
    -------
    A `Decimal` if possible (`Decimal('NaN')` for null-like values), or `value`.
    """
    if isinstance(value, Decimal):
        ### Errors from quantizing an existing Decimal are not suppressed.
        if not (quantize and precision and scale):
            return value
        return quantize_decimal(value, precision, scale)

    try:
        if value_is_null(value):
            return Decimal('NaN')

        parsed = Decimal(str(value))
        if quantize and precision and scale:
            return quantize_decimal(parsed, precision, scale)
        return parsed
    except Exception:
        ### Best effort: anything that cannot be parsed is handed back untouched.
        return value

Given a value, attempt to coerce it into a numeric (Decimal).

Parameters
  • value (Any): The value to be cast to a Decimal.
  • quantize (bool, default False): If True, quantize the decimal to the specified precision and scale.
  • precision (Optional[int], default None): If quantize is True, use this precision.
  • scale (Optional[int], default None): If quantize is True, use this scale.
Returns
  • A Decimal if possible, or value.
def attempt_cast_to_uuid(value: Any) -> Any:
def attempt_cast_to_uuid(value: Any) -> Any:
    """
    Try to convert `value` into a `uuid.UUID`.

    Existing `UUID` objects are returned as-is and null-like values become `None`.
    If the string form of `value` cannot be parsed, `value` is returned unchanged.
    """
    if isinstance(value, uuid.UUID):
        return value

    try:
        if value_is_null(value):
            return None
        return uuid.UUID(str(value))
    except Exception:
        return value

Given a value, attempt to coerce it into a UUID (uuid4).

def attempt_cast_to_bytes(value: Any) -> Any:
def attempt_cast_to_bytes(value: Any) -> Any:
    """
    Try to convert `value` into a bytestring.

    Existing `bytes` are returned as-is and null-like values become `None`.
    Otherwise the string form of `value` is passed to `deserialize_bytes_string()`,
    and `value` is returned unchanged if that fails.
    """
    if isinstance(value, bytes):
        return value

    try:
        if value_is_null(value):
            return None
        return deserialize_bytes_string(str(value))
    except Exception:
        return value

Given a value, attempt to coerce it into a bytestring.

def attempt_cast_to_geometry(value: Any) -> Any:
def attempt_cast_to_geometry(value: Any) -> Any:
    """
    Try to convert `value` into a `shapely` geometry object.

    Values whose type name already mentions `shapely` are returned as-is.
    Dictionaries and lists are treated as GeoJSON, and strings / bytes are parsed
    as WKT or WKB according to `geometry_is_wkt()`.
    If `value` cannot be parsed, it is returned unchanged.
    """
    shapely, shapely_wkt, shapely_wkb = mrsm.attempt_import(
        'shapely',
        'shapely.wkt',
        'shapely.wkb',
        lazy=False,
    )
    if 'shapely' in str(type(value)):
        return value

    if isinstance(value, (dict, list)):
        try:
            return shapely.from_geojson(json.dumps(value))
        except Exception:
            return value

    is_wkt = geometry_is_wkt(value)
    if is_wkt is None:
        ### Neither WKT nor WKB, so there is nothing to parse.
        return value

    loader = shapely_wkt if is_wkt else shapely_wkb
    try:
        return loader.loads(value)
    except Exception:
        return value

Given a value, attempt to coerce it into a shapely (geometry) object.

def geometry_is_wkt(value: Union[str, bytes]) -> Optional[bool]:
def geometry_is_wkt(value: Union[str, bytes]) -> Union[bool, None]:
    """
    Determine whether an input value should be treated as WKT or WKB geometry data.

    Parameters
    ----------
    value: Union[str, bytes]
        The input data to be parsed into geometry data.

    Returns
    -------
    A `bool` (`True` if `value` is WKT and `False` if it should be treated as WKB).
    Return `None` if `value` should be parsed as neither
    (e.g. non-string input, an empty string, or arbitrary text).
    """
    import re

    ### Raw bytes are always treated as WKB.
    if isinstance(value, bytes):
        return False

    if not isinstance(value, str):
        return None

    wkt_pattern = (
        r'^\s*(POINT|LINESTRING|POLYGON|MULTIPOINT|MULTILINESTRING|MULTIPOLYGON|GEOMETRYCOLLECTION)'
        r'\s*\(.*\)\s*$'
    )
    if re.match(wkt_pattern, value, re.IGNORECASE):
        return True

    ### Hex-encoded WKB must consist of one or more complete byte pairs.
    ### Requiring at least one pair keeps the empty string from being reported as WKB.
    if re.fullmatch(r'(?:[0-9A-Fa-f]{2})+', value):
        return False

    return None

Determine whether an input value should be treated as WKT or WKB geometry data.

Parameters
  • value (Union[str, bytes]): The input data to be parsed into geometry data.
Returns
  • A bool (True if value is WKT and False if it should be treated as WKB).
  • Return None if value should be parsed as neither.
def value_is_null(value: Any) -> bool:
def value_is_null(value: Any) -> bool:
    """
    Check whether the string form of `value` is a recognized null marker.

    The comparison is case-insensitive and matches
    `none`, `nan`, `na`, `nat`, `<na>`, and the empty string.
    """
    null_markers = {'none', 'nan', 'na', 'nat', '', '<na>'}
    normalized = str(value).lower()
    return normalized in null_markers

Determine if a value is a null-like string.

def none_if_null(value: Any) -> Any:
def none_if_null(value: Any) -> Any:
    """
    Replace null-like values (as judged by `value_is_null()`) with `None`.
    All other values are returned unchanged.
    """
    if value_is_null(value):
        return None
    return value

Return None if a value is a null-like string.

def quantize_decimal(x: decimal.Decimal, precision: int, scale: int) -> decimal.Decimal:
def quantize_decimal(x: Decimal, precision: int, scale: int) -> Decimal:
    """
    Round a `Decimal` to a fixed precision and scale (rounding half up).

    Parameters
    ----------
    x: Decimal
        The `Decimal` to be quantized.

    precision: int
        The total number of significant digits.

    scale: int
        The number of digits after the decimal point.

    Returns
    -------
    A `Decimal` quantized to the specified scale and precision.

    Raises
    ------
    A `ValueError` if `x` cannot be represented with the given precision and scale.
    """
    ### Only the exponent of this template matters to `quantize()`:
    ### e.g. precision=5, scale=2 yields `111.11`.
    integer_digits = '1' * (precision - scale)
    fractional_digits = '1' * scale
    exponent_template = Decimal(f"{integer_digits}.{fractional_digits}")

    try:
        return x.quantize(
            exponent_template,
            rounding=ROUND_HALF_UP,
            context=Context(prec=precision),
        )
    except InvalidOperation:
        pass

    raise ValueError(f"Cannot quantize value '{x}' to {precision=}, {scale=}.")

Quantize a given Decimal to a known scale and precision.

Parameters
  • x (Decimal): The Decimal to be quantized.
  • precision (int): The total number of significant digits.
  • scale (int): The number of significant digits after the decimal point.
Returns
  • A Decimal quantized to the specified scale and precision.
def serialize_decimal( x: Any, quantize: bool = False, precision: Optional[int] = None, scale: Optional[int] = None) -> Any:
def serialize_decimal(
    x: Any,
    quantize: bool = False,
    precision: Optional[int] = None,
    scale: Optional[int] = None,
) -> Any:
    """
    Format a `Decimal` as a fixed-point string, optionally quantizing it first.

    Parameters
    ----------
    x: Any
        The potential decimal to be serialized.

    quantize: bool, default False
        If `True` (and both `scale` and `precision` are truthy),
        quantize the incoming `Decimal` before serialization.

    precision: Optional[int], default None
        The precision of the decimal to be quantized.

    scale: Optional[int], default None
        The scale of the decimal to be quantized.

    Returns
    -------
    A fixed-point string of the input decimal, `None` for null-like decimals
    (e.g. `NaN`), or the input itself if it is not a `Decimal`.
    """
    if not isinstance(x, Decimal):
        return x

    if value_is_null(x):
        return None

    to_format = (
        quantize_decimal(x, precision, scale)
        if (quantize and scale and precision)
        else x
    )
    ### The `f` format spec avoids scientific notation.
    return f"{to_format:f}"

Return a quantized string of an input decimal.

Parameters
  • x (Any): The potential decimal to be serialized.
  • quantize (bool, default False): If True, quantize the incoming Decimal to the specified scale and precision before serialization.
  • precision (Optional[int], default None): The precision of the decimal to be quantized.
  • scale (Optional[int], default None): The scale of the decimal to be quantized.
Returns
  • A string of the input decimal or the input if not a Decimal.
def coerce_timezone(dt: Any, strip_utc: bool = False) -> Any:
def coerce_timezone(
    dt: Any,
    strip_utc: bool = False,
) -> Any:
    """
    Given a `datetime`, pandas `Timestamp` or `Series` of `Timestamp`,
    return a UTC timestamp (strip the timezone if `strip_utc` is `True`).

    Parameters
    ----------
    dt: Any
        The value to coerce. `None` and integers are returned unchanged,
        and strings are first parsed with `dateutil`.

    strip_utc: bool, default False
        If `True`, return timezone-naive values.

    Returns
    -------
    The coerced value. A datetime-typed series which is already timezone-aware
    (when keeping the timezone) or already naive (when stripping) is returned as-is.
    """
    if dt is None:
        return None

    ### Integers are passed through untouched.
    if isinstance(dt, int):
        return dt

    if isinstance(dt, str):
        dateutil_parser = mrsm.attempt_import('dateutil.parser')
        dt = dateutil_parser.parse(dt)

    looks_like_series = hasattr(dt, 'dtype') and hasattr(dt, '__module__')

    if not looks_like_series:
        if dt.tzinfo is None:
            return dt if strip_utc else dt.replace(tzinfo=timezone.utc)

        as_utc = dt.astimezone(timezone.utc)
        return as_utc.replace(tzinfo=None) if strip_utc else as_utc

    pandas = mrsm.attempt_import('pandas', lazy=False)

    if pandas.api.types.is_datetime64_any_dtype(dt):
        is_naive = dt.dt.tz is None
        if (is_naive and strip_utc) or (not is_naive and not strip_utc):
            return dt

    parsed_series = to_datetime(dt, coerce_utc=False)
    if not strip_utc:
        return parsed_series

    try:
        if parsed_series.dt.tz is not None:
            parsed_series = parsed_series.dt.tz_localize(None)
    except Exception:
        ### Best-effort: leave the series as parsed if the timezone cannot be stripped.
        pass

    return parsed_series

Given a datetime, pandas Timestamp or Series of Timestamp, return a UTC timestamp (strip the timezone if strip_utc is True).

def to_datetime(dt_val: Any, as_pydatetime: bool = False, coerce_utc: bool = True) -> Any:
def to_datetime(dt_val: Any, as_pydatetime: bool = False, coerce_utc: bool = True) -> Any:
    """
    Wrap `pd.to_datetime()` and add support for out-of-bounds values.

    Parameters
    ----------
    dt_val: Any
        The scalar or series to be parsed.
        Dask objects are parsed with `dask.dataframe` instead of `pandas`.

    as_pydatetime: bool, default False
        If `True`, call `to_pydatetime()` on the result of the primary conversion.
        This flag is not consulted on the `dateutil` fallback path.

    coerce_utc: bool, default True
        If `True`, pass values parsed by the `dateutil` fallback through `coerce_timezone()`.

    Returns
    -------
    The parsed value(s). Elements which cannot be parsed by the fallback are left unchanged.
    """
    pandas, dateutil_parser = mrsm.attempt_import('pandas', 'dateutil.parser', lazy=False)
    module_name = getattr(dt_val, '__module__', '')
    dask_dataframe = (
        mrsm.attempt_import('dask.dataframe')
        if 'dask' in module_name
        else None
    )
    backend = dask_dataframe if dask_dataframe is not None else pandas

    try:
        converted = backend.to_datetime(dt_val, utc=True, format='ISO8601')
        return converted.to_pydatetime() if as_pydatetime else converted
    except (backend.errors.OutOfBoundsDatetime, ValueError):
        ### Fall back to `dateutil` below.
        pass

    def _parse_or_keep(candidate: Any) -> Any:
        try:
            return dateutil_parser.parse(candidate)
        except Exception:
            return candidate

    is_series = hasattr(dt_val, 'dtype') and hasattr(dt_val, '__module__')
    fallback = dt_val.apply(_parse_or_keep) if is_series else _parse_or_keep(dt_val)
    return coerce_timezone(fallback) if coerce_utc else fallback

Wrap pd.to_datetime() and add support for out-of-bounds values.

def serialize_bytes(data: bytes) -> str:
def serialize_bytes(data: bytes) -> str:
    """
    Encode the given bytes as a base64 string (decoded as UTF-8).

    Null-like values which are not `bytes` are returned unchanged.
    """
    import base64

    if isinstance(data, bytes) or not value_is_null(data):
        return base64.b64encode(data).decode('utf-8')

    return data

Return the given bytes as a base64-encoded string.

def serialize_geometry( geom: Any, geometry_format: str = 'wkb_hex', as_wkt: bool = False) -> Union[str, Dict[str, Any], NoneType]:
def serialize_geometry(
    geom: Any,
    geometry_format: str = 'wkb_hex',
    as_wkt: bool = False,
) -> Union[str, Dict[str, Any], None]:
    """
    Serialize geometry data in the requested format.

    Parameters
    ----------
    geom: Any
        The potential geometry data to be serialized.

    geometry_format: str, default 'wkb_hex'
        The serialization format for geometry data.
        Accepted formats are `wkb_hex` (well-known binary hex string),
        `wkt` (well-known text), and `geojson`.
        Any value other than `wkb_hex` or `geojson` produces WKT.

    as_wkt: bool, default False
        Accepted for compatibility; this function does not read it.

    Returns
    -------
    A string containing the geometry data, a dictionary for `geojson`,
    or `None` if `geom` is null-like.
    Objects without a `wkb_hex` attribute are returned as `str(geom)`.
    """
    if value_is_null(geom):
        return None

    shapely = mrsm.attempt_import('shapely', lazy=False)
    if geometry_format == 'geojson':
        return json.loads(shapely.to_geojson(geom))

    if not hasattr(geom, 'wkb_hex'):
        return str(geom)

    if geometry_format == 'wkb_hex':
        return geom.wkb_hex

    return geom.wkt

Serialize geometry data as a hex-encoded well-known-binary string.

Parameters
  • geom (Any): The potential geometry data to be serialized.
  • geometry_format (str, default 'wkb_hex'): The serialization format for geometry data. Accepted formats are wkb_hex (well-known binary hex string), wkt (well-known text), and geojson.
Returns
  • A string containing the geometry data.
def deserialize_geometry(geom_wkb: Union[str, bytes]):
def deserialize_geometry(geom_wkb: Union[str, bytes]):
    """
    Deserialize WKB data into a shapely geometry object.

    Parameters
    ----------
    geom_wkb: Union[str, bytes]
        The well-known binary data, passed directly to `shapely.wkb.loads()`.

    Returns
    -------
    The geometry object returned by `shapely.wkb.loads()`.
    """
    ### NOTE: The module name must be passed explicitly to `attempt_import()`.
    ### Import `shapely.wkb` directly (as in `attempt_cast_to_geometry()`)
    ### so the submodule is guaranteed to be loaded.
    shapely_wkb = mrsm.attempt_import('shapely.wkb', lazy=False)
    return shapely_wkb.loads(geom_wkb)

Deserialize a WKB string into a shapely geometry object.

def deserialize_bytes_string(data: Optional[str], force_hex: bool = False) -> Optional[bytes]:
def deserialize_bytes_string(data: Optional[str], force_hex: bool = False) -> Union[bytes, None]:
    """
    Decode a serialized ASCII string back into the bytes it represents.
    The input may be either base64- or hex-encoded.

    Parameters
    ----------
    data: str | None
        The string to be deserialized into bytes.
        May be base64- or hex-encoded (prefixed with `'\\x'`).

    force_hex: bool = False
        If `True`, treat the input string as hex-encoded.
        Set this when hex-encoded `data` does not begin with the prefix `'\\x'`.
        A leading `'\\x'` prefix is still stripped if present.

    Returns
    -------
    The original bytes used to produce the encoded string `data`.
    Null-like values which are not strings are returned unchanged.
    """
    if not isinstance(data, str) and value_is_null(data):
        return data

    import binascii
    import base64

    hex_prefix = '\\x'
    has_prefix = data.startswith(hex_prefix)

    if not (force_hex or has_prefix):
        return base64.b64decode(data)

    hex_digits = data[len(hex_prefix):] if has_prefix else data
    return binascii.unhexlify(hex_digits)

Given a serialized ASCII string of bytes data, return the original bytes. The input data may either be base64- or hex-encoded.

Parameters
  • data (str | None): The string to be deserialized into bytes. May be base64- or hex-encoded (prefixed with '\x').
  • force_hex (bool = False): If True, treat the input string as hex-encoded. If data does not begin with the prefix '\x', set force_hex to True. This will still strip the leading '\x' prefix if present.
Returns
  • The original bytes used to produce the encoded string data.
def deserialize_base64(data: str) -> bytes:
def deserialize_base64(data: str) -> bytes:
    """
    Decode a base64-encoded string into the bytestring it represents.
    """
    from base64 import b64decode
    return b64decode(data)

Return the original bytestring from the given base64-encoded string.

def encode_bytes_for_bytea(data: bytes, with_prefix: bool = True) -> str | None:
629def encode_bytes_for_bytea(data: bytes, with_prefix: bool = True) -> str | None:
630    """
631    Return the given bytes as a hex string for PostgreSQL's `BYTEA` type.
632    """
633    import binascii
634    if not isinstance(data, bytes) and value_is_null(data):
635        return data
636    return ('\\x' if with_prefix else '') + binascii.hexlify(data).decode('utf-8')

Return the given bytes as a hex string for PostgreSQL's BYTEA type.

def serialize_datetime(dt: datetime.datetime) -> Optional[str]:
def serialize_datetime(dt: datetime) -> Union[str, None]:
    """
    Serialize a datetime object into JSON (ISO format string).

    Timezone-naive datetimes receive a trailing `Z`,
    and timezone-aware datetimes keep the offset emitted by `isoformat()`.
    Values which are not `datetime` instances yield `None`.

    Examples
    --------
    >>> import json
    >>> from datetime import datetime
    >>> json.dumps({'a': datetime(2022, 1, 1)}, default=serialize_datetime)
    '{"a": "2022-01-01T00:00:00Z"}'

    """
    if isinstance(dt, datetime):
        iso_string = dt.isoformat()
        return iso_string if dt.tzinfo is not None else iso_string + 'Z'

    return None

Serialize a datetime object into JSON (ISO format string).

Examples
>>> import json
>>> from datetime import datetime
>>> json.dumps({'a': datetime(2022, 1, 1)}, default=serialize_datetime)
'{"a": "2022-01-01T00:00:00Z"}'
def json_serialize_value(x: Any, default_to_str: bool = True) -> str:
def json_serialize_value(x: Any, default_to_str: bool = True) -> str:
    """
    Convert the given value into something JSON can represent.
    Accounts for pipes, connectors, datetimes, bytes, decimals, and geometries.

    Parameters
    ----------
    x: Any
        The value to serialize.

    default_to_str: bool, default True
        If `True`, return a string of `x` if x is not a designated type.
        Otherwise return x.

    Returns
    -------
    A serialized version of x, or x.
    Null-like values are returned as `None`,
    and pipes and connectors are returned as their `meta` attribute.
    """
    if isinstance(x, (mrsm.Pipe, mrsm.connectors.Connector)):
        serialized = x.meta
    elif hasattr(x, 'tzinfo'):
        ### Anything exposing `tzinfo` is handed to the datetime serializer.
        serialized = serialize_datetime(x)
    elif isinstance(x, bytes):
        serialized = serialize_bytes(x)
    elif isinstance(x, Decimal):
        serialized = serialize_decimal(x)
    elif 'shapely' in str(type(x)):
        serialized = serialize_geometry(x)
    elif value_is_null(x):
        serialized = None
    elif default_to_str:
        serialized = str(x)
    else:
        serialized = x

    return serialized

Serialize the given value to a JSON value. Accounts for datetimes, bytes, decimals, etc.

Parameters
  • x (Any): The value to serialize.
  • default_to_str (bool, default True): If True, return a string of x if x is not a designated type. Otherwise return x.
Returns
  • A serialized version of x, or x.
def get_geometry_type_srid( dtype: str = 'geometry', default_type: str = 'geometry', default_srid: int = 4326) -> Union[Tuple[str, int], Tuple[str, NoneType]]:
def get_geometry_type_srid(
    dtype: str = 'geometry',
    default_type: str = 'geometry',
    default_srid: int = 4326,
) -> Union[Tuple[str, int], Tuple[str, None]]:
    """
    Given the specified geometry `dtype`, return a tuple in the form (type, SRID).

    Parameters
    ----------
    dtype: str, default 'geometry'
        Optionally provide a specific `geometry` syntax (e.g. `geometry[MultiLineString, 4326]`).
        PostGIS-style parentheses (e.g. `geometry(Point, 4326)`) are also accepted.
        You may specify a supported `shapely` geometry type and an SRID in the dtype modifier:

        - `Point`
        - `LineString`
        - `LinearRing`
        - `Polygon`
        - `MultiPoint`
        - `MultiLineString`
        - `MultiPolygon`
        - `GeometryCollection`

        The types above are matched case-insensitively and returned in the casing shown.
        Unrecognized type names are returned as written.

    default_type: str, default 'geometry'
        The type to return when the modifier does not name one.

    default_srid: int, default 4326
        The SRID to return when the modifier does not contain one.

    Returns
    -------
    A tuple in the form (type, SRID).
    Defaults to `(default_type, default_srid)`.

    Examples
    --------
    >>> from meerschaum.utils.dtypes import get_geometry_type_srid
    >>> get_geometry_type_srid()
    ('geometry', 4326)
    >>> get_geometry_type_srid('geometry[]')
    ('geometry', 4326)
    >>> get_geometry_type_srid('geometry[Point, 0]')
    ('Point', 0)
    >>> get_geometry_type_srid('geometry[0, Point]')
    ('Point', 0)
    >>> get_geometry_type_srid('geometry[0]')
    ('geometry', 0)
    >>> get_geometry_type_srid('geometry[MULTILINESTRING, 4326]')
    ('MultiLineString', 4326)
    >>> get_geometry_type_srid('geography')
    ('geometry', 4326)
    >>> get_geometry_type_srid('geography[POINT]')
    ('Point', 4326)
    """
    from meerschaum.utils.misc import is_int

    canonical_types = {
        type_name.lower(): type_name
        for type_name in (
            'Point',
            'LineString',
            'LinearRing',
            'Polygon',
            'MultiPoint',
            'MultiLineString',
            'MultiPolygon',
            'GeometryCollection',
        )
    }

    ### NOTE: PostGIS syntax must also be parsed.
    dtype = dtype.replace('(', '[').replace(')', ']')

    ### `partition()` tolerates an empty or bracket-leading dtype,
    ### whereas splitting on an empty bare dtype raises a `ValueError`.
    _, _, raw_modifier = dtype.partition('[')
    modifier = raw_modifier.lstrip('[').rstrip(']')
    if not modifier:
        return default_type, default_srid

    ### Accept `key=value` parts by keeping only the value,
    ### and skip blank parts (e.g. from a stray comma) so they are not mistaken for a type.
    parts = [
        part.split('=')[-1].strip()
        for part in modifier.split(',')
    ]
    parts_casted = [
        (
            int(part)
            if is_int(part)
            else part
        )
        for part in parts
        if part
    ]

    srid = default_srid
    geometry_type = default_type

    ### The first integer is the SRID and the first string is the type, in either order.
    for part in parts_casted:
        if isinstance(part, int):
            srid = part
            break

    for part in parts_casted:
        if isinstance(part, str):
            geometry_type = canonical_types.get(part.lower(), part)
            break

    return geometry_type, srid

Given the specified geometry dtype, return a tuple in the form (type, SRID).

Parameters
  • dtype (str, default 'geometry'): Optionally provide a specific geometry syntax (e.g. geometry[MultiLineString, 4326]). You may specify a supported shapely geometry type and an SRID in the dtype modifier:

    • Point
    • LineString
    • LinearRing
    • Polygon
    • MultiPoint
    • MultiLineString
    • MultiPolygon
    • GeometryCollection
Returns
  • A tuple in the form (type, SRID).
  • Defaults to (default_type, default_srid).
Examples
>>> from meerschaum.utils.dtypes import get_geometry_type_srid
>>> get_geometry_type_srid()
('geometry', 4326)
>>> get_geometry_type_srid('geometry[]')
('geometry', 4326)
>>> get_geometry_type_srid('geometry[Point, 0]')
('Point', 0)
>>> get_geometry_type_srid('geometry[0, Point]')
('Point', 0)
>>> get_geometry_type_srid('geometry[0]')
('geometry', 0)
>>> get_geometry_type_srid('geometry[MULTILINESTRING, 4326]')
('MultiLineString', 4326)
>>> get_geometry_type_srid('geography')
('geometry', 4326)
>>> get_geometry_type_srid('geography[POINT]')
('Point', 4326)