meerschaum
Meerschaum Python API
Welcome to the Meerschaum Python API technical documentation! Here you can find information about the classes and functions provided by the meerschaum
package. Visit meerschaum.io for general usage documentation.
Root Module
For your convenience, the following classes and functions may be imported from the root meerschaum
namespace:
Classes
Examples
Build a Connector
Get existing connectors or build a new one in-memory with the meerschaum.get_connector()
factory function:
import meerschaum as mrsm
sql_conn = mrsm.get_connector(
'sql:temp',
flavor='sqlite',
database='/tmp/tmp.db',
)
df = sql_conn.read("SELECT 1 AS foo")
print(df)
# foo
# 0 1
sql_conn.to_sql(df, 'foo')
print(sql_conn.read('foo'))
# foo
# 0 1
Create a Custom Connector Class
Decorate your connector classes with meerschaum.make_connector()
to designate it as a custom connector:
from datetime import datetime, timezone
from random import randint
import meerschaum as mrsm
from meerschaum.utils.misc import round_time
@mrsm.make_connector
class FooConnector(mrsm.Connector):
REQUIRED_ATTRIBUTES = ['username', 'password']
def fetch(
self,
begin: datetime | None = None,
end: datetime | None = None,
):
now = begin or round_time(datetime.now(timezone.utc))
return [
{'ts': now, 'id': 1, 'vl': randint(1, 100)},
{'ts': now, 'id': 2, 'vl': randint(1, 100)},
{'ts': now, 'id': 3, 'vl': randint(1, 100)},
]
foo_conn = mrsm.get_connector(
'foo:bar',
username='foo',
password='bar',
)
docs = foo_conn.fetch()
Build a Pipe
Build a meerschaum.Pipe
in-memory:
from datetime import datetime
import meerschaum as mrsm
pipe = mrsm.Pipe(
foo_conn, 'demo',
instance=sql_conn,
columns={'datetime': 'ts', 'id': 'id'},
tags=['production'],
)
pipe.sync(begin=datetime(2024, 1, 1))
df = pipe.get_data()
print(df)
# ts id vl
# 0 2024-01-01 1 97
# 1 2024-01-01 2 18
# 2 2024-01-01 3 96
Add temporary=True
to skip registering the pipe in the pipes table.
Get Registered Pipes
The meerschaum.get_pipes()
function returns a dictionary hierarchy of pipes by connector, metric, and location:
import meerschaum as mrsm
pipes = mrsm.get_pipes(instance='sql:temp')
pipe = pipes['foo:bar']['demo'][None]
Add as_list=True
to flatten the hierarchy:
import meerschaum as mrsm
pipes = mrsm.get_pipes(
tags=['production'],
instance=sql_conn,
as_list=True,
)
print(pipes)
# [Pipe('foo:bar', 'demo', instance='sql:temp')]
Import Plugins
You can import a plugin's module through meerschaum.Plugin.module
:
import meerschaum as mrsm
plugin = mrsm.Plugin('noaa')
with mrsm.Venv(plugin):
noaa = plugin.module
If your plugin has submodules, use meerschaum.plugins.from_plugin_import
:
from meerschaum.plugins import from_plugin_import
get_defined_pipes = from_plugin_import('compose.utils.pipes', 'get_defined_pipes')
Import multiple plugins with meerschaum.plugins.import_plugins
:
from meerschaum.plugins import import_plugins
noaa, compose = import_plugins('noaa', 'compose')
Create a Job
Create a meerschaum.Job
with name
and sysargs
:
import meerschaum as mrsm
job = mrsm.Job('syncing-engine', 'sync pipes --loop')
success, msg = job.start()
Pass executor_keys
as the connectors keys of an API instance to create a remote job:
import meerschaum as mrsm
job = mrsm.Job(
'foo',
'sync pipes -s daily',
executor_keys='api:main',
)
Import from a Virtual Environment
Use the meerschaum.Venv
context manager to activate a virtual environment:
import meerschaum as mrsm
with mrsm.Venv('noaa'):
import requests
print(requests.__file__)
# /home/bmeares/.config/meerschaum/venvs/noaa/lib/python3.12/site-packages/requests/__init__.py
To import packages which may not be installed, use meerschaum.attempt_import()
:
import meerschaum as mrsm
requests = mrsm.attempt_import('requests', venv='noaa')
print(requests.__file__)
# /home/bmeares/.config/meerschaum/venvs/noaa/lib/python3.12/site-packages/requests/__init__.py
Run Actions
Run sysargs
with meerschaum.entry()
:
import meerschaum as mrsm
success, msg = mrsm.entry('show pipes + show version : x2')
Use meerschaum.actions.get_action()
to access an action function directly:
from meerschaum.actions import get_action
show_pipes = get_action(['show', 'pipes'])
success, msg = show_pipes(connector_keys=['plugin:noaa'])
Get a dictionary of available subactions with meerschaum.actions.get_subactions()
:
from meerschaum.actions import get_subactions
subactions = get_subactions('show')
success, msg = subactions['pipes']()
Create a Plugin
Run bootstrap plugin
to create a new plugin:
mrsm bootstrap plugin example
This will create example.py
in your plugins directory (default ~/.config/meerschaum/plugins/
, Windows: %APPDATA%\Meerschaum\plugins
). You may paste the example code from the "Create a Custom Action" example below.
Open your plugin with edit plugin
:
mrsm edit plugin example
Run edit plugin
and paste the example code below to try out the features.
See the writing plugins guide for more in-depth documentation.
Create a Custom Action
Decorate a function with meerschaum.actions.make_action
to designate it as an action. Subactions will be automatically detected if not decorated:
from meerschaum.actions import make_action
@make_action
def sing():
print('What would you like me to sing?')
return True, "Success"
def sing_tune():
return False, "I don't know that song!"
def sing_song():
print('Hello, World!')
return True, "Success"
Use meerschaum.plugins.add_plugin_argument()
to create new parameters for your action:
from meerschaum.plugins import make_action, add_plugin_argument
add_plugin_argument(
'--song', type=str, help='What song to sing.',
)
@make_action
def sing_melody(action=None, song=None):
to_sing = action[0] if action else song
if not to_sing:
return False, "Please tell me what to sing!"
return True, f'~I am singing {to_sing}~'
mrsm sing melody lalala
mrsm sing melody --song do-re-mi
Add a Page to the Web Dashboard
Use the decorators meerschaum.plugins.dash_plugin()
and meerschaum.plugins.web_page()
to add new pages to the web dashboard:
from meerschaum.plugins import dash_plugin, web_page
@dash_plugin
def init_dash(dash_app):
import dash.html as html
import dash_bootstrap_components as dbc
from dash import Input, Output, no_update
### Routes to '/dash/my-page'
@web_page('/my-page', login_required=False)
def my_page():
return dbc.Container([
html.H1("Hello, World!"),
dbc.Button("Click me", id='my-button'),
html.Div(id="my-output-div"),
])
@dash_app.callback(
Output('my-output-div', 'children'),
Input('my-button', 'n_clicks'),
)
def my_button_click(n_clicks):
if not n_clicks:
return no_update
return html.P(f'You clicked {n_clicks} times!')
Submodules
meerschaum.actions
Access functions for actions and subactions.
meerschaum.actions.actions
meerschaum.actions.get_action()
meerschaum.actions.get_completer()
meerschaum.actions.get_main_action_name()
meerschaum.actions.get_subactions()
meerschaum.config
Read and write the Meerschaum configuration registry.
meerschaum.config.get_config()
meerschaum.config.get_plugin_config()
meerschaum.config.write_config()
meerschaum.config.write_plugin_config()
meerschaum.connectors
Build connectors to interact with databases and fetch data.
meerschaum.connectors.get_connector()
meerschaum.connectors.make_connector()
meerschaum.connectors.is_connected()
meerschaum.connectors.poll.retry_connect()
meerschaum.connectors.Connector
meerschaum.connectors.sql.SQLConnector
meerschaum.connectors.api.APIConnector
meerschaum.connectors.valkey.ValkeyConnector
meerschaum.jobs
Start background jobs.
meerschaum.jobs.Job
meerschaum.jobs.Executor
meerschaum.jobs.systemd.SystemdExecutor
meerschaum.jobs.get_jobs()
meerschaum.jobs.get_filtered_jobs()
meerschaum.jobs.get_running_jobs()
meerschaum.jobs.get_stopped_jobs()
meerschaum.jobs.get_paused_jobs()
meerschaum.jobs.get_restart_jobs()
meerschaum.jobs.make_executor()
meerschaum.jobs.check_restart_jobs()
meerschaum.jobs.start_check_jobs_thread()
meerschaum.jobs.stop_check_jobs_thread()
meerschaum.plugins
Access plugin modules and other API utilities.
meerschaum.plugins.Plugin
meerschaum.plugins.api_plugin()
meerschaum.plugins.dash_plugin()
meerschaum.plugins.import_plugins()
meerschaum.plugins.reload_plugins()
meerschaum.plugins.get_plugins()
meerschaum.plugins.get_data_plugins()
meerschaum.plugins.add_plugin_argument()
meerschaum.plugins.pre_sync_hook()
meerschaum.plugins.post_sync_hook()
meerschaum.utils
Utility functions are available in several submodules:
meerschaum.utils.daemon.daemon_entry()
meerschaum.utils.daemon.daemon_action()
meerschaum.utils.daemon.get_daemons()
meerschaum.utils.daemon.get_daemon_ids()
meerschaum.utils.daemon.get_running_daemons()
meerschaum.utils.daemon.get_paused_daemons()
meerschaum.utils.daemon.get_stopped_daemons()
meerschaum.utils.daemon.get_filtered_daemons()
meerschaum.utils.daemon.run_daemon()
meerschaum.utils.daemon.Daemon
meerschaum.utils.daemon.FileDescriptorInterceptor
meerschaum.utils.daemon.RotatingFile
meerschaum.utils.daemon
Manage background jobs.
meerschaum.utils.dataframe.add_missing_cols_to_df()
meerschaum.utils.dataframe.df_is_chunk_generator()
meerschaum.utils.dataframe.enforce_dtypes()
meerschaum.utils.dataframe.filter_unseen_df()
meerschaum.utils.dataframe.get_datetime_bound_from_df()
meerschaum.utils.dataframe.get_first_valid_dask_partition()
meerschaum.utils.dataframe.get_json_cols()
meerschaum.utils.dataframe.get_numeric_cols()
meerschaum.utils.dataframe.get_unhashable_cols()
meerschaum.utils.dataframe.parse_df_datetimes()
meerschaum.utils.dataframe.query_df()
meerschaum.utils.dataframe.to_json()
meerschaum.utils.dataframe
Manipulate dataframes.
meerschaum.utils.dtypes.are_dtypes_equal()
meerschaum.utils.dtypes.attempt_cast_to_numeric()
meerschaum.utils.dtypes.is_dtype_numeric()
meerschaum.utils.dtypes.none_if_null()
meerschaum.utils.dtypes.quantize_decimal()
meerschaum.utils.dtypes.to_pandas_dtype()
meerschaum.utils.dtypes.value_is_null()
meerschaum.utils.dtypes.sql.get_pd_type_from_db_type()
meerschaum.utils.dtypes.sql.get_db_type_from_pd_type()
meerschaum.utils.dtypes
Work with data types.
meerschaum.utils.formatting.colored()
meerschaum.utils.formatting.extract_stats_from_message()
meerschaum.utils.formatting.fill_ansi()
meerschaum.utils.formatting.get_console()
meerschaum.utils.formatting.highlight_pipes()
meerschaum.utils.formatting.make_header()
meerschaum.utils.formatting.pipe_repr()
meerschaum.utils.formatting.pprint()
meerschaum.utils.formatting.pprint_pipes()
meerschaum.utils.formatting.print_options()
meerschaum.utils.formatting.print_pipes_results()
meerschaum.utils.formatting.print_tuple()
meerschaum.utils.formatting.translate_rich_to_termcolor()
meerschaum.utils.formatting
Format output text.
meerschaum.utils.misc.items_str()
meerschaum.utils.misc.round_time()
meerschaum.utils.misc.is_int()
meerschaum.utils.misc.interval_str()
meerschaum.utils.misc.filter_keywords()
meerschaum.utils.misc.generate_password()
meerschaum.utils.misc.string_to_dict()
meerschaum.utils.misc.iterate_chunks()
meerschaum.utils.misc.timed_input()
meerschaum.utils.misc.replace_pipes_in_dict()
meerschaum.utils.misc.is_valid_email()
meerschaum.utils.misc.string_width()
meerschaum.utils.misc.replace_password()
meerschaum.utils.misc.parse_config_substitution()
meerschaum.utils.misc.edit_file()
meerschaum.utils.misc.get_in_ex_params()
meerschaum.utils.misc.separate_negation_values()
meerschaum.utils.misc.flatten_list()
meerschaum.utils.misc.make_symlink()
meerschaum.utils.misc.is_symlink()
meerschaum.utils.misc.wget()
meerschaum.utils.misc.add_method_to_class()
meerschaum.utils.misc.is_pipe_registered()
meerschaum.utils.misc.get_cols_lines()
meerschaum.utils.misc.sorted_dict()
meerschaum.utils.misc.flatten_pipes_dict()
meerschaum.utils.misc.dict_from_od()
meerschaum.utils.misc.remove_ansi()
meerschaum.utils.misc.get_connector_labels()
meerschaum.utils.misc.json_serialize_datetime()
meerschaum.utils.misc.async_wrap()
meerschaum.utils.misc.is_docker_available()
meerschaum.utils.misc.is_android()
meerschaum.utils.misc.is_bcp_available()
meerschaum.utils.misc.truncate_string_sections()
meerschaum.utils.misc.safely_extract_tar()
meerschaum.utils.misc
Miscellaneous utility functions.
meerschaum.utils.packages.attempt_import()
meerschaum.utils.packages.get_module_path()
meerschaum.utils.packages.manually_import_module()
meerschaum.utils.packages.get_install_no_version()
meerschaum.utils.packages.determine_version()
meerschaum.utils.packages.need_update()
meerschaum.utils.packages.get_pip()
meerschaum.utils.packages.pip_install()
meerschaum.utils.packages.pip_uninstall()
meerschaum.utils.packages.completely_uninstall_package()
meerschaum.utils.packages.run_python_package()
meerschaum.utils.packages.lazy_import()
meerschaum.utils.packages.pandas_name()
meerschaum.utils.packages.import_pandas()
meerschaum.utils.packages.import_rich()
meerschaum.utils.packages.import_dcc()
meerschaum.utils.packages.import_html()
meerschaum.utils.packages.get_modules_from_package()
meerschaum.utils.packages.import_children()
meerschaum.utils.packages.reload_package()
meerschaum.utils.packages.reload_meerschaum()
meerschaum.utils.packages.is_installed()
meerschaum.utils.packages.venv_contains_package()
meerschaum.utils.packages.package_venv()
meerschaum.utils.packages.ensure_readline()
meerschaum.utils.packages.get_prerelease_dependencies()
meerschaum.utils.packages
Manage Python packages.
meerschaum.utils.sql.build_where()
meerschaum.utils.sql.clean()
meerschaum.utils.sql.dateadd_str()
meerschaum.utils.sql.test_connection()
meerschaum.utils.sql.get_distinct_col_count()
meerschaum.utils.sql.sql_item_name()
meerschaum.utils.sql.pg_capital()
meerschaum.utils.sql.oracle_capital()
meerschaum.utils.sql.truncate_item_name()
meerschaum.utils.sql.table_exists()
meerschaum.utils.sql.get_table_cols_types()
meerschaum.utils.sql.get_update_queries()
meerschaum.utils.sql.get_null_replacement()
meerschaum.utils.sql.get_db_version()
meerschaum.utils.sql.get_rename_table_queries()
meerschaum.utils.sql.get_create_table_query()
meerschaum.utils.sql.wrap_query_with_cte()
meerschaum.utils.sql.format_cte_subquery()
meerschaum.utils.sql.session_execute()
meerschaum.utils.sql
Build SQL queries.
meerschaum.utils.venv.Venv
meerschaum.utils.venv.activate_venv()
meerschaum.utils.venv.deactivate_venv()
meerschaum.utils.venv.get_module_venv()
meerschaum.utils.venv.get_venvs()
meerschaum.utils.venv.init_venv()
meerschaum.utils.venv.inside_venv()
meerschaum.utils.venv.is_venv_active()
meerschaum.utils.venv.venv_exec()
meerschaum.utils.venv.venv_executable()
meerschaum.utils.venv.venv_exists()
meerschaum.utils.venv.venv_target_path()
meerschaum.utils.venv.verify_venv()
meerschaum.utils.venv
Manage virtual environments.
meerschaum.utils.warnings
Print warnings, errors, info, and debug messages.
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8

"""
Copyright 2023 Bennett Meares

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import atexit
from meerschaum.utils.typing import SuccessTuple
from meerschaum.utils.packages import attempt_import
from meerschaum.core.Pipe import Pipe
from meerschaum.plugins import Plugin
from meerschaum.utils.venv import Venv
from meerschaum.jobs import Job, make_executor
from meerschaum.connectors import get_connector, Connector, make_connector
from meerschaum.utils import get_pipes
from meerschaum.utils.formatting import pprint
from meerschaum._internal.docs import index as __doc__
from meerschaum.config import __version__, get_config
from meerschaum._internal.entry import entry
from meerschaum.__main__ import _close_pools

# Clean up any worker pools when the interpreter exits.
atexit.register(_close_pools)

# Hide non-public submodules from the pdoc-generated documentation.
__pdoc__ = {'gui': False, 'api': False, 'core': False, '_internal': False}

# Public API exposed from the root `meerschaum` namespace.
# (Fixed: "Plugin" was previously listed twice.)
__all__ = (
    "get_pipes",
    "get_connector",
    "get_config",
    "Pipe",
    "Plugin",
    "Venv",
    "Job",
    "pprint",
    "attempt_import",
    "actions",
    "config",
    "connectors",
    "jobs",
    "plugins",
    "utils",
    "SuccessTuple",
    "Connector",
    "make_connector",
    "entry",
)
def get_pipes(
    connector_keys: Union[str, List[str], None] = None,
    metric_keys: Union[str, List[str], None] = None,
    location_keys: Union[str, List[str], None] = None,
    tags: Optional[List[str]] = None,
    params: Optional[Dict[str, Any]] = None,
    mrsm_instance: Union[str, InstanceConnector, None] = None,
    instance: Union[str, InstanceConnector, None] = None,
    as_list: bool = False,
    method: str = 'registered',
    debug: bool = False,
    **kw: Any
) -> Union[PipesDict, List[mrsm.Pipe]]:
    """
    Return a dictionary or list of `meerschaum.Pipe` objects.

    Parameters
    ----------
    connector_keys: Union[str, List[str], None], default None
        String or list of connector keys.
        If omitted or is `'*'`, fetch all possible keys.
        If a string begins with `'_'`, select keys that do NOT match the string.

    metric_keys: Union[str, List[str], None], default None
        String or list of metric keys. See `connector_keys` for formatting.

    location_keys: Union[str, List[str], None], default None
        String or list of location keys. See `connector_keys` for formatting.

    tags: Optional[List[str]], default None
        If provided, only include pipes with these tags.

    params: Optional[Dict[str, Any]], default None
        Dictionary of additional parameters to search by.
        Params are parsed into a SQL WHERE clause.
        E.g. `{'a': 1, 'b': 2}` equates to `'WHERE a = 1 AND b = 2'`

    mrsm_instance: Union[str, InstanceConnector, None], default None
        Connector keys for the Meerschaum instance of the pipes.
        Must be a `meerschaum.connectors.sql.SQLConnector.SQLConnector` or
        `meerschaum.connectors.api.APIConnector.APIConnector`.

    as_list: bool, default False
        If `True`, return pipes in a list instead of a hierarchical dictionary.
        `False` : `{connector_keys: {metric_key: {location_key: Pipe}}}`
        `True`  : `[Pipe]`

    method: str, default 'registered'
        Available options: `['registered', 'explicit', 'all']`
        If `'registered'` (default), create pipes based on registered keys in the connector's pipes table
        (API or SQL connector, depends on mrsm_instance).
        If `'explicit'`, create pipes from provided connector_keys, metric_keys, and location_keys
        instead of consulting the pipes table. Useful for creating non-existent pipes.
        If `'all'`, create pipes from predefined metrics and locations. Requires `connector_keys`.
        **NOTE:** Method `'all'` is not implemented!

    **kw: Any:
        Keyword arguments to pass to the `meerschaum.Pipe` constructor.


    Returns
    -------
    A dictionary of dictionaries and `meerschaum.Pipe` objects
    in the connector, metric, location hierarchy.
    If `as_list` is `True`, return a list of `meerschaum.Pipe` objects.

    Examples
    --------
    ```
    >>> ### Manual definition:
    >>> pipes = {
    ...     <connector_keys>: {
    ...         <metric_key>: {
    ...             <location_key>: Pipe(
    ...                 <connector_keys>,
    ...                 <metric_key>,
    ...                 <location_key>,
    ...             ),
    ...         },
    ...     },
    ... },
    >>> ### Accessing a single pipe:
    >>> pipes['sql:main']['weather'][None]
    >>> ### Return a list instead:
    >>> get_pipes(as_list=True)
    [sql_main_weather]
    >>>
    ```
    """
    from meerschaum.config import get_config
    from meerschaum.utils.warnings import error
    from meerschaum.utils.misc import filter_keywords

    ### Normalize the key filters: `None` means "no filter",
    ### and a bare string is treated as a single-element list.
    if connector_keys is None:
        connector_keys = []
    if metric_keys is None:
        metric_keys = []
    if location_keys is None:
        location_keys = []
    if params is None:
        params = {}
    if tags is None:
        tags = []

    if isinstance(connector_keys, str):
        connector_keys = [connector_keys]
    if isinstance(metric_keys, str):
        metric_keys = [metric_keys]
    if isinstance(location_keys, str):
        location_keys = [location_keys]

    ### Get SQL or API connector (keys come from `connector.fetch_pipes_keys()`).
    ### `instance` is an alias for the legacy `mrsm_instance` parameter;
    ### fall back to the configured default instance when neither is given.
    if mrsm_instance is None:
        mrsm_instance = instance
    if mrsm_instance is None:
        mrsm_instance = get_config('meerschaum', 'instance', patch=True)
    if isinstance(mrsm_instance, str):
        from meerschaum.connectors.parse import parse_instance_keys
        connector = parse_instance_keys(keys=mrsm_instance, debug=debug)
    else:
        ### An explicit connector object was passed; verify it is an instance connector.
        from meerschaum.connectors import instance_types
        valid_connector = False
        if hasattr(mrsm_instance, 'type'):
            if mrsm_instance.type in instance_types:
                valid_connector = True
        if not valid_connector:
            error(f"Invalid instance connector: {mrsm_instance}")
        connector = mrsm_instance
    if debug:
        from meerschaum.utils.debug import dprint
        dprint(f"Using instance connector: {connector}")
    if not connector:
        error(f"Could not create connector from keys: '{mrsm_instance}'")

    ### Get a list of tuples for the keys needed to build pipes.
    result = fetch_pipes_keys(
        method,
        connector,
        connector_keys=connector_keys,
        metric_keys=metric_keys,
        location_keys=location_keys,
        tags=tags,
        params=params,
        debug=debug,
    )
    if result is None:
        ### Fixed: removed a pointless f-string prefix (no placeholders).
        error("Unable to build pipes!")

    ### Populate the `pipes` dictionary with Pipes based on the keys
    ### obtained from the chosen `method`.
    from meerschaum import Pipe
    pipes = {}
    for ck, mk, lk in result:
        pipes.setdefault(ck, {}).setdefault(mk, {})[lk] = Pipe(
            ck, mk, lk,
            mrsm_instance=connector,
            debug=debug,
            **filter_keywords(Pipe, **kw)
        )

    if not as_list:
        return pipes
    from meerschaum.utils.misc import flatten_pipes_dict
    return flatten_pipes_dict(pipes)
Return a dictionary or list of meerschaum.Pipe
objects.
Parameters
- connector_keys (Union[str, List[str], None], default None):
String or list of connector keys.
If omitted or is
'*'
, fetch all possible keys. If a string begins with'_'
, select keys that do NOT match the string. - metric_keys (Union[str, List[str], None], default None):
String or list of metric keys. See
connector_keys
for formatting. - location_keys (Union[str, List[str], None], default None):
String or list of location keys. See
connector_keys
for formatting. - tags (Optional[List[str]], default None): If provided, only include pipes with these tags.
- params (Optional[Dict[str, Any]], default None):
Dictionary of additional parameters to search by.
Params are parsed into a SQL WHERE clause.
E.g.
{'a': 1, 'b': 2}
equates to'WHERE a = 1 AND b = 2'
- mrsm_instance (Union[str, InstanceConnector, None], default None):
Connector keys for the Meerschaum instance of the pipes.
Must be a
meerschaum.connectors.sql.SQLConnector.SQLConnector
ormeerschaum.connectors.api.APIConnector.APIConnector
. - as_list (bool, default False):
If
True
, return pipes in a list instead of a hierarchical dictionary.False
:{connector_keys: {metric_key: {location_key: Pipe}}}
True
:[Pipe]
- method (str, default 'registered'):
Available options:
['registered', 'explicit', 'all']
If'registered'
(default), create pipes based on registered keys in the connector's pipes table (API or SQL connector, depends on mrsm_instance). If'explicit'
, create pipes from provided connector_keys, metric_keys, and location_keys instead of consulting the pipes table. Useful for creating non-existent pipes. If'all'
, create pipes from predefined metrics and locations. Requires connector_keys
. NOTE: Method'all'
is not implemented! - **kw (Any):
Keyword arguments to pass to the
meerschaum.Pipe
constructor.
Returns
- A dictionary of dictionaries and
meerschaum.Pipe
objects - in the connector, metric, location hierarchy.
- If
as_list
isTrue
, return a list ofmeerschaum.Pipe
objects.
Examples
>>> ### Manual definition:
>>> pipes = {
... <connector_keys>: {
... <metric_key>: {
... <location_key>: Pipe(
... <connector_keys>,
... <metric_key>,
... <location_key>,
... ),
... },
... },
... },
>>> ### Accessing a single pipe:
>>> pipes['sql:main']['weather'][None]
>>> ### Return a list instead:
>>> get_pipes(as_list=True)
[sql_main_weather]
>>>
def get_connector(
    type: str = None,
    label: str = None,
    refresh: bool = False,
    debug: bool = False,
    **kw: Any
) -> Connector:
    """
    Return existing connector or create new connection and store for reuse.

    You can create new connectors if enough parameters are provided for the given type and flavor.


    Parameters
    ----------
    type: Optional[str], default None
        Connector type (sql, api, etc.).
        Defaults to the type of the configured `instance_connector`.

    label: Optional[str], default None
        Connector label (e.g. main). Defaults to `'main'`.

    refresh: bool, default False
        Refresh the Connector instance / construct new object. Defaults to `False`.

    kw: Any
        Other arguments to pass to the Connector constructor.
        If the Connector has already been constructed and new arguments are provided,
        `refresh` is set to `True` and the old Connector is replaced.

    Returns
    -------
    A new Meerschaum connector (e.g. `meerschaum.connectors.api.APIConnector`,
    `meerschaum.connectors.sql.SQLConnector`).

    Examples
    --------
    The following parameters would create a new
    `meerschaum.connectors.sql.SQLConnector` that isn't in the configuration file.

    ```
    >>> conn = get_connector(
    ...     type = 'sql',
    ...     label = 'newlabel',
    ...     flavor = 'sqlite',
    ...     database = '/file/path/to/database.db'
    ... )
    >>>
    ```

    """
    from meerschaum.connectors.parse import parse_instance_keys
    from meerschaum.config import get_config
    from meerschaum.config.static import STATIC_CONFIG
    from meerschaum.utils.warnings import warn
    global _loaded_plugin_connectors
    ### Allow combined keys like 'sql:main' in place of separate type/label args.
    if isinstance(type, str) and not label and ':' in type:
        type, label = type.split(':', maxsplit=1)

    ### Lazily load plugin-provided connector classes exactly once (guarded by a lock).
    with _locks['_loaded_plugin_connectors']:
        if not _loaded_plugin_connectors:
            load_plugin_connectors()
            _load_builtin_custom_connectors()
            _loaded_plugin_connectors = True

    if type is None and label is None:
        default_instance_keys = get_config('meerschaum', 'instance', patch=True)
        ### recursive call to get_connector
        return parse_instance_keys(default_instance_keys)

    ### NOTE: the default instance connector may not be main.
    ### Only fall back to 'main' if the type is provided but the label is omitted.
    label = label if label is not None else STATIC_CONFIG['connectors']['default_label']

    ### type might actually be a label. Check if so and raise a warning.
    if type not in connectors:
        possibilities, poss_msg = [], ""
        ### Search every configured connector type for a label matching `type`
        ### so the warning can suggest "did you mean 'sql:<type>'?" style fixes.
        for _type in get_config('meerschaum', 'connectors'):
            if type in get_config('meerschaum', 'connectors', _type):
                possibilities.append(f"{_type}:{type}")
        if len(possibilities) > 0:
            poss_msg = " Did you mean"
            for poss in possibilities[:-1]:
                poss_msg += f" '{poss}',"
            if poss_msg.endswith(','):
                poss_msg = poss_msg[:-1]
            if len(possibilities) > 1:
                poss_msg += " or"
            poss_msg += f" '{possibilities[-1]}'?"

        warn(f"Cannot create Connector of type '{type}'." + poss_msg, stack=False)
        return None

    ### Populate the type -> class registry on first use (guarded by a lock).
    if 'sql' not in types:
        from meerschaum.connectors.plugin import PluginConnector
        from meerschaum.connectors.valkey import ValkeyConnector
        with _locks['types']:
            types.update({
                'api': APIConnector,
                'sql': SQLConnector,
                'plugin': PluginConnector,
                'valkey': ValkeyConnector,
            })

    ### determine if we need to call the constructor
    if not refresh:
        ### see if any user-supplied arguments differ from the existing instance
        if label in connectors[type]:
            warning_message = None
            for attribute, value in kw.items():
                if attribute not in connectors[type][label].meta:
                    ### Unknown attribute: only warn if the constructor wouldn't accept it either.
                    import inspect
                    cls = connectors[type][label].__class__
                    cls_init_signature = inspect.signature(cls)
                    cls_init_params = cls_init_signature.parameters
                    if attribute not in cls_init_params:
                        warning_message = (
                            f"Received new attribute '{attribute}' not present in connector " +
                            f"{connectors[type][label]}.\n"
                        )
                elif connectors[type][label].__dict__[attribute] != value:
                    ### Known attribute but the value differs from the cached connector.
                    warning_message = (
                        f"Mismatched values for attribute '{attribute}' in connector "
                        + f"'{connectors[type][label]}'.\n" +
                        f"  - Keyword value: '{value}'\n" +
                        f"  - Existing value: '{connectors[type][label].__dict__[attribute]}'\n"
                    )
            if warning_message is not None:
                warning_message += (
                    "\nSetting `refresh` to True and recreating connector with type:"
                    + f" '{type}' and label '{label}'."
                )
                refresh = True
                warn(warning_message)
        else: ### connector doesn't yet exist
            refresh = True

    ### only create an object if refresh is True
    ### (can be manually specified, otherwise determined above)
    if refresh:
        with _locks['connectors']:
            try:
                ### will raise an error if configuration is incorrect / missing
                conn = types[type](label=label, **kw)
                connectors[type][label] = conn
            except InvalidAttributesError as ie:
                warn(
                    f"Incorrect attributes for connector '{type}:{label}'.\n"
                    + str(ie),
                    stack = False,
                )
                conn = None
            except Exception as e:
                ### Unexpected constructor failure: print the traceback (if rich is
                ### available) and fall through to returning None.
                from meerschaum.utils.formatting import get_console
                console = get_console()
                if console:
                    console.print_exception()
                warn(
                    f"Exception when creating connector '{type}:{label}'.\n" + str(e),
                    stack = False,
                )
                conn = None
        if conn is None:
            return None

    ### Return the cached (possibly freshly rebuilt) connector.
    return connectors[type][label]
Return existing connector or create new connection and store for reuse.
You can create new connectors if enough parameters are provided for the given type and flavor.
Parameters
- type (Optional[str], default None):
Connector type (sql, api, etc.).
Defaults to the type of the configured
instance_connector
. - label (Optional[str], default None):
Connector label (e.g. main). Defaults to
'main'
. - refresh (bool, default False):
Refresh the Connector instance / construct new object. Defaults to
False
. - kw (Any):
Other arguments to pass to the Connector constructor.
If the Connector has already been constructed and new arguments are provided,
refresh
is set toTrue
and the old Connector is replaced.
Returns
- A new Meerschaum connector (e.g.
meerschaum.connectors.api.APIConnector
, meerschaum.connectors.sql.SQLConnector
).
Examples
The following parameters would create a new
meerschaum.connectors.sql.SQLConnector
that isn't in the configuration file.
>>> conn = get_connector(
... type = 'sql',
... label = 'newlabel',
... flavor = 'sqlite',
... database = '/file/path/to/database.db'
... )
>>>
def get_config(
    *keys: str,
    patch: bool = True,
    substitute: bool = True,
    sync_files: bool = True,
    write_missing: bool = True,
    as_tuple: bool = False,
    warn: bool = True,
    debug: bool = False
) -> Any:
    """
    Return the Meerschaum configuration dictionary.
    If positional arguments are provided, index by the keys.
    Raises a warning if invalid keys are provided.

    Parameters
    ----------
    keys: str
        List of strings to index.

    patch: bool, default True
        If `True`, patch missing default keys into the config directory.

    substitute: bool, default True
        If `True`, substitute 'MRSM{}' values.

    sync_files: bool, default True
        If `True`, sync files if needed.

    write_missing: bool, default True
        If `True`, write default values when the main config files are missing.

    as_tuple: bool, default False
        If `True`, return a tuple of type (success, value).

    warn: bool, default True
        If `True`, raise a warning when invalid keys are requested.

    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    The value in the configuration directory, indexed by the provided keys.

    Examples
    --------
    >>> get_config('meerschaum', 'instance')
    'sql:main'
    >>> get_config('does', 'not', 'exist')
    UserWarning: Invalid keys in config: ('does', 'not', 'exist')
    """
    import json

    symlinks_key = STATIC_CONFIG['config']['symlinks_key']
    if debug:
        from meerschaum.utils.debug import dprint
        dprint(f"Indexing keys: {keys}", color=False)

    ### No keys: return the whole (lazily loaded) configuration dictionary.
    if len(keys) == 0:
        _rc = _config(substitute=substitute, sync_files=sync_files, write_missing=write_missing)
        if as_tuple:
            return True, _rc
        return _rc

    ### Weird threading issues, only import if substitute is True.
    if substitute:
        from meerschaum.config._read_config import search_and_substitute_config
        ### Invalidate the cache if it was read before with substitute=False
        ### but there still exist substitutions.
        if (
            config is not None and substitute and keys[0] != symlinks_key
            and 'MRSM{' in json.dumps(config.get(keys[0]))
        ):
            try:
                _subbed = search_and_substitute_config({keys[0]: config[keys[0]]})
            except Exception:
                import traceback
                traceback.print_exc()
                _subbed = None
            ### BUGFIX: only patch the substitution back in if it succeeded.
            ### Previously `_subbed` was referenced even after an exception,
            ### raising a `NameError` and masking the original error.
            if _subbed is not None:
                config[keys[0]] = _subbed[keys[0]]
                if symlinks_key in _subbed:
                    if symlinks_key not in config:
                        config[symlinks_key] = {}
                    if keys[0] not in config[symlinks_key]:
                        config[symlinks_key][keys[0]] = {}
                    config[symlinks_key][keys[0]] = apply_patch_to_config(
                        _subbed,
                        config[symlinks_key][keys[0]]
                    )

    from meerschaum.config._sync import sync_files as _sync_files
    if config is None:
        ### NOTE: `_config()` populates the module-level `config` as a side effect.
        _config(*keys, sync_files=sync_files)

    invalid_keys = False
    if keys[0] not in config and keys[0] != symlinks_key:
        ### The top-level key hasn't been loaded yet; read just that file.
        single_key_config = read_config(
            keys=[keys[0]], substitute=substitute, write_missing=write_missing
        )
        if keys[0] not in single_key_config:
            invalid_keys = True
        else:
            config[keys[0]] = single_key_config.get(keys[0], None)
            if symlinks_key in single_key_config and keys[0] in single_key_config[symlinks_key]:
                if symlinks_key not in config:
                    config[symlinks_key] = {}
                config[symlinks_key][keys[0]] = single_key_config[symlinks_key][keys[0]]

        if sync_files:
            _sync_files(keys=[keys[0]])

    ### Walk down the requested key path.
    c = config
    if len(keys) > 0:
        for k in keys:
            try:
                c = c[k]
            except Exception:
                invalid_keys = True
                break

    if invalid_keys:
        ### Check if the keys are in the default configuration.
        from meerschaum.config._default import default_config
        in_default = True
        patched_default_config = (
            search_and_substitute_config(default_config)
            if substitute else copy.deepcopy(default_config)
        )
        _c = patched_default_config
        for k in keys:
            try:
                _c = _c[k]
            except Exception:
                in_default = False
        if in_default:
            c = _c
            invalid_keys = False
        warning_msg = f"Invalid keys in config: {keys}"
        if not in_default:
            try:
                if warn:
                    from meerschaum.utils.warnings import warn as _warn
                    _warn(warning_msg, stacklevel=3, color=False)
            except Exception:
                if warn:
                    print(warning_msg)
            if as_tuple:
                return False, None
            return None

        ### Don't write keys that we haven't yet loaded into memory.
        not_loaded_keys = [k for k in patched_default_config if k not in config]
        for k in not_loaded_keys:
            patched_default_config.pop(k, None)

        set_config(
            apply_patch_to_config(
                patched_default_config,
                config,
            )
        )
        if patch and keys[0] != symlinks_key:
            if write_missing:
                write_config(config, debug=debug)

    if as_tuple:
        return (not invalid_keys), c
    return c
Return the Meerschaum configuration dictionary. If positional arguments are provided, index by the keys. Raises a warning if invalid keys are provided.
Parameters
- keys (str): List of strings to index.
- patch (bool, default True): If `True`, patch missing default keys into the config directory.
- sync_files (bool, default True): If `True`, sync files if needed.
- write_missing (bool, default True): If `True`, write default values when the main config files are missing.
- substitute (bool, default True): If `True`, substitute 'MRSM{}' values.
- as_tuple (bool, default False): If `True`, return a tuple of type (success, value).
Returns
- The value in the configuration directory, indexed by the provided keys.
Examples
>>> get_config('meerschaum', 'instance')
'sql:main'
>>> get_config('does', 'not', 'exist')
UserWarning: Invalid keys in config: ('does', 'not', 'exist')
class Pipe:
    """
    Access Meerschaum pipes via Pipe objects.

    Pipes are identified by the following:

    1. Connector keys (e.g. `'sql:main'`)
    2. Metric key (e.g. `'weather'`)
    3. Location (optional; e.g. `None`)

    A pipe's connector keys correspond to a data source, and when the pipe is synced,
    its `fetch` definition is evaluated and executed to produce new data.

    Alternatively, new data may be directly synced via `pipe.sync()`:

    ```
    >>> from meerschaum import Pipe
    >>> pipe = Pipe('csv', 'weather')
    >>>
    >>> import pandas as pd
    >>> df = pd.read_csv('weather.csv')
    >>> pipe.sync(df)
    ```
    """

    from ._fetch import (
        fetch,
        get_backtrack_interval,
    )
    from ._data import (
        get_data,
        get_backtrack_data,
        get_rowcount,
        _get_data_as_iterator,
        get_chunk_interval,
        get_chunk_bounds,
    )
    from ._register import register
    from ._attributes import (
        attributes,
        parameters,
        columns,
        indices,
        indexes,
        dtypes,
        get_columns,
        get_columns_types,
        get_indices,
        tags,
        get_id,
        id,
        get_val_column,
        parents,
        children,
        target,
        _target_legacy,
        guess_datetime,
    )
    from ._show import show
    from ._edit import edit, edit_definition, update
    from ._sync import (
        sync,
        get_sync_time,
        exists,
        filter_existing,
        _get_chunk_label,
        get_num_workers,
        _persist_new_json_columns,
        _persist_new_numeric_columns,
        _persist_new_uuid_columns,
    )
    from ._verify import (
        verify,
        get_bound_interval,
        get_bound_time,
    )
    from ._delete import delete
    from ._drop import drop
    from ._clear import clear
    from ._deduplicate import deduplicate
    from ._bootstrap import bootstrap
    from ._dtypes import enforce_dtypes, infer_dtypes
    from ._copy import copy_to

    def __init__(
        self,
        connector: str = '',
        metric: str = '',
        location: Optional[str] = None,
        parameters: Optional[Dict[str, Any]] = None,
        columns: Union[Dict[str, str], List[str], None] = None,
        indices: Optional[Dict[str, Union[str, List[str]]]] = None,
        tags: Optional[List[str]] = None,
        target: Optional[str] = None,
        dtypes: Optional[Dict[str, str]] = None,
        instance: Optional[Union[str, InstanceConnector]] = None,
        temporary: bool = False,
        mrsm_instance: Optional[Union[str, InstanceConnector]] = None,
        cache: bool = False,
        debug: bool = False,
        connector_keys: Optional[str] = None,
        metric_key: Optional[str] = None,
        location_key: Optional[str] = None,
        indexes: Union[Dict[str, str], List[str], None] = None,
    ):
        """
        Parameters
        ----------
        connector: str
            Keys for the pipe's source connector, e.g. `'sql:main'`.

        metric: str
            Label for the pipe's contents, e.g. `'weather'`.

        location: str, default None
            Label for the pipe's location. Defaults to `None`.

        parameters: Optional[Dict[str, Any]], default None
            Optionally set a pipe's parameters from the constructor,
            e.g. columns and other attributes.
            You can edit these parameters with `edit pipes`.

        columns: Union[Dict[str, str], List[str], None], default None
            Set the `columns` dictionary of `parameters`.
            If `parameters` is also provided, this dictionary is added under the `'columns'` key.

        indices: Optional[Dict[str, Union[str, List[str]]]], default None
            Set the `indices` dictionary of `parameters`.
            If `parameters` is also provided, this dictionary is added under the `'indices'` key.

        tags: Optional[List[str]], default None
            A list of strings to be added under the `'tags'` key of `parameters`.
            You can select pipes with certain tags using `--tags`.

        target: Optional[str], default None
            Set the `target` value of `parameters` (the table name on the instance).

        dtypes: Optional[Dict[str, str]], default None
            Set the `dtypes` dictionary of `parameters`.
            If `parameters` is also provided, this dictionary is added under the `'dtypes'` key.

        mrsm_instance: Optional[Union[str, InstanceConnector]], default None
            Connector for the Meerschaum instance where the pipe resides.
            Defaults to the preconfigured default instance (`'sql:main'`).

        instance: Optional[Union[str, InstanceConnector]], default None
            Alias for `mrsm_instance`. If `mrsm_instance` is supplied, this value is ignored.

        temporary: bool, default False
            If `True`, prevent instance tables (pipes, users, plugins) from being created.

        cache: bool, default False
            If `True`, cache fetched data into a local database file.
            Defaults to `False`.

        connector_keys, metric_key, location_key, indexes:
            Legacy aliases for `connector`, `metric`, `location`, and `indices`.
        """
        from meerschaum.utils.warnings import error, warn
        if (not connector and not connector_keys) or (not metric and not metric_key):
            error(
                "Please provide strings for the connector and metric\n "
                + "(first two positional arguments)."
            )

        ### Fall back to legacy `location_key` just in case.
        if not location:
            location = location_key

        if not connector:
            connector = connector_keys

        if not metric:
            metric = metric_key

        if location in ('[None]', 'None'):
            location = None

        from meerschaum.config.static import STATIC_CONFIG
        negation_prefix = STATIC_CONFIG['system']['fetch_pipes_keys']['negation_prefix']
        for k in (connector, metric, location, *(tags or [])):
            if str(k).startswith(negation_prefix):
                error(f"A pipe's keys and tags cannot start with the prefix '{negation_prefix}'.")

        self.connector_keys = str(connector)
        self.connector_key = self.connector_keys ### Alias
        self.metric_key = metric
        self.location_key = location
        self.temporary = temporary

        self._attributes = {
            'connector_keys': self.connector_keys,
            'metric_key': self.metric_key,
            'location_key': self.location_key,
            'parameters': {},
        }

        ### only set parameters if values are provided
        if isinstance(parameters, dict):
            self._attributes['parameters'] = parameters
        else:
            if parameters is not None:
                warn(f"The provided parameters are of invalid type '{type(parameters)}'.")
            self._attributes['parameters'] = {}

        columns = columns or self._attributes.get('parameters', {}).get('columns', {})
        if isinstance(columns, list):
            columns = {str(col): str(col) for col in columns}
        if isinstance(columns, dict):
            self._attributes['parameters']['columns'] = columns
        elif columns is not None:
            warn(f"The provided columns are of invalid type '{type(columns)}'.")

        indices = (
            indices
            or indexes
            or self._attributes.get('parameters', {}).get('indices', None)
            or self._attributes.get('parameters', {}).get('indexes', None)
        ) or columns
        if isinstance(indices, dict):
            indices_key = (
                'indexes'
                if 'indexes' in self._attributes['parameters']
                else 'indices'
            )
            self._attributes['parameters'][indices_key] = indices

        if isinstance(tags, (list, tuple)):
            self._attributes['parameters']['tags'] = tags
        elif tags is not None:
            warn(f"The provided tags are of invalid type '{type(tags)}'.")

        if isinstance(target, str):
            self._attributes['parameters']['target'] = target
        elif target is not None:
            warn(f"The provided target is of invalid type '{type(target)}'.")

        if isinstance(dtypes, dict):
            self._attributes['parameters']['dtypes'] = dtypes
        elif dtypes is not None:
            warn(f"The provided dtypes are of invalid type '{type(dtypes)}'.")

        ### NOTE: The parameters dictionary is {} by default.
        ### A Pipe may be registered without parameters, then edited,
        ### or a Pipe may be registered with parameters set in-memory first.
        _mrsm_instance = mrsm_instance if mrsm_instance is not None else instance
        if _mrsm_instance is None:
            _mrsm_instance = get_config('meerschaum', 'instance', patch=True)

        if not isinstance(_mrsm_instance, str):
            self._instance_connector = _mrsm_instance
            self.instance_keys = str(_mrsm_instance)
        else: ### NOTE: must be a SQL or API Connector for this to work
            self.instance_keys = _mrsm_instance

        self._cache = cache and get_config('system', 'experimental', 'cache')

    @property
    def meta(self):
        """
        Return the four keys needed to reconstruct this pipe.
        """
        return {
            'connector': self.connector_keys,
            'metric': self.metric_key,
            'location': self.location_key,
            'instance': self.instance_keys,
        }

    def keys(self) -> Dict[str, Any]:
        """
        Return this pipe's identifying keys (connector, metric, location) as a dictionary.

        NOTE: Despite the name, this returns a dictionary, not a list;
        the previous `List[str]` annotation was incorrect.
        """
        return {
            key: val
            for key, val in self.meta.items()
            if key != 'instance'
        }

    @property
    def instance_connector(self) -> Union[InstanceConnector, None]:
        """
        The connector to where this pipe resides.
        May either be of type `meerschaum.connectors.sql.SQLConnector` or
        `meerschaum.connectors.api.APIConnector`.
        """
        if '_instance_connector' not in self.__dict__:
            from meerschaum.connectors.parse import parse_instance_keys
            conn = parse_instance_keys(self.instance_keys)
            if conn:
                self._instance_connector = conn
            else:
                return None
        return self._instance_connector

    @property
    def connector(self) -> Union[meerschaum.connectors.Connector, None]:
        """
        The connector to the data source.
        """
        if '_connector' not in self.__dict__:
            from meerschaum.connectors.parse import parse_instance_keys
            import warnings
            ### Suppress any warnings emitted while parsing the keys;
            ### an unparsable connector simply yields `None`.
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                try:
                    conn = parse_instance_keys(self.connector_keys)
                except Exception:
                    conn = None
            if conn:
                self._connector = conn
            else:
                return None
        return self._connector

    @property
    def cache_connector(self) -> Union[meerschaum.connectors.sql.SQLConnector, None]:
        """
        If the pipe was created with `cache=True`, return the connector to the pipe's
        SQLite database for caching.
        """
        if not self._cache:
            return None

        if '_cache_connector' not in self.__dict__:
            from meerschaum.connectors import get_connector
            from meerschaum.config._paths import DUCKDB_RESOURCES_PATH, SQLITE_RESOURCES_PATH
            _resources_path = SQLITE_RESOURCES_PATH
            self._cache_connector = get_connector(
                'sql', '_cache_' + str(self),
                flavor='sqlite',
                database=str(_resources_path / ('_cache_' + str(self) + '.db')),
            )

        return self._cache_connector

    @property
    def cache_pipe(self) -> Union['meerschaum.Pipe', None]:
        """
        If the pipe was created with `cache=True`, return another `meerschaum.Pipe` used to
        manage the local data.
        """
        if self.cache_connector is None:
            return None
        if '_cache_pipe' not in self.__dict__:
            from meerschaum.config._patch import apply_patch_to_config
            from meerschaum.utils.sql import sql_item_name
            _parameters = copy.deepcopy(self.parameters)
            ### SQL instances fetch via a definition query; otherwise copy the keys.
            _fetch_patch = {
                'fetch': ({
                    'definition': (
                        "SELECT * FROM "
                        + sql_item_name(
                            str(self.target),
                            self.instance_connector.flavor,
                            self.instance_connector.get_pipe_schema(self),
                        )
                    ),
                }) if self.instance_connector.type == 'sql' else ({
                    'connector_keys': self.connector_keys,
                    'metric_key': self.metric_key,
                    'location_key': self.location_key,
                })
            }
            _parameters = apply_patch_to_config(_parameters, _fetch_patch)
            self._cache_pipe = Pipe(
                self.instance_keys,
                (self.connector_keys + '_' + self.metric_key + '_cache'),
                self.location_key,
                mrsm_instance = self.cache_connector,
                parameters = _parameters,
                cache = False,
                temporary = True,
            )

        return self._cache_pipe

    def __str__(self, ansi: bool = False):
        return pipe_repr(self, ansi=ansi)

    def __eq__(self, other):
        try:
            return (
                isinstance(self, type(other))
                and self.connector_keys == other.connector_keys
                and self.metric_key == other.metric_key
                and self.location_key == other.location_key
                and self.instance_keys == other.instance_keys
            )
        except Exception:
            return False

    def __hash__(self):
        ### Using an esoteric separator to avoid collisions.
        sep = "[\"']"
        return hash(
            str(self.connector_keys) + sep
            + str(self.metric_key) + sep
            + str(self.location_key) + sep
            + str(self.instance_keys) + sep
        )

    def __repr__(self, ansi: bool = True, **kw) -> str:
        ### Only colorize in an interactive session.
        if not hasattr(sys, 'ps1'):
            ansi = False

        return pipe_repr(self, ansi=ansi, **kw)

    def __pt_repr__(self):
        from meerschaum.utils.packages import attempt_import
        prompt_toolkit_formatted_text = attempt_import('prompt_toolkit.formatted_text', lazy=False)
        return prompt_toolkit_formatted_text.ANSI(pipe_repr(self, ansi=True))

    def __getstate__(self) -> Dict[str, Any]:
        """
        Define the state dictionary (pickling).
        """
        return {
            'connector': self.connector_keys,
            'metric': self.metric_key,
            'location': self.location_key,
            'parameters': self.parameters,
            'instance': self.instance_keys,
        }

    def __setstate__(self, _state: Dict[str, Any]):
        """
        Read the state (unpickling).
        """
        self.__init__(**_state)

    def __getitem__(self, key: str) -> Any:
        """
        Index the pipe's attributes.
        If the `key` cannot be found, return `None`.
        """
        if key in self.attributes:
            return self.attributes.get(key, None)

        aliases = {
            'connector': 'connector_keys',
            'connector_key': 'connector_keys',
            'metric': 'metric_key',
            'location': 'location_key',
        }
        aliased_key = aliases.get(key, None)
        if aliased_key is not None:
            return self.attributes.get(aliased_key, None)

        property_aliases = {
            'instance': 'instance_keys',
            'instance_key': 'instance_keys',
        }
        aliased_key = property_aliases.get(key, None)
        if aliased_key is not None:
            key = aliased_key
        return getattr(self, key, None)
Access Meerschaum pipes via Pipe objects.
Pipes are identified by the following:
- Connector keys (e.g.
'sql:main'
) - Metric key (e.g.
'weather'
) - Location (optional; e.g.
None
)
A pipe's connector keys correspond to a data source, and when the pipe is synced,
its fetch
definition is evaluated and executed to produce new data.
Alternatively, new data may be directly synced via pipe.sync()
:
>>> from meerschaum import Pipe
>>> pipe = Pipe('csv', 'weather')
>>>
>>> import pandas as pd
>>> df = pd.read_csv('weather.csv')
>>> pipe.sync(df)
def __init__(
    self,
    connector: str = '',
    metric: str = '',
    location: Optional[str] = None,
    parameters: Optional[Dict[str, Any]] = None,
    columns: Union[Dict[str, str], List[str], None] = None,
    indices: Optional[Dict[str, Union[str, List[str]]]] = None,
    tags: Optional[List[str]] = None,
    target: Optional[str] = None,
    dtypes: Optional[Dict[str, str]] = None,
    instance: Optional[Union[str, InstanceConnector]] = None,
    temporary: bool = False,
    mrsm_instance: Optional[Union[str, InstanceConnector]] = None,
    cache: bool = False,
    debug: bool = False,
    connector_keys: Optional[str] = None,
    metric_key: Optional[str] = None,
    location_key: Optional[str] = None,
    indexes: Union[Dict[str, str], List[str], None] = None,
):
    """
    Parameters
    ----------
    connector: str
        Keys for the pipe's source connector, e.g. `'sql:main'`.

    metric: str
        Label for the pipe's contents, e.g. `'weather'`.

    location: str, default None
        Label for the pipe's location. Defaults to `None`.

    parameters: Optional[Dict[str, Any]], default None
        Optionally set a pipe's parameters from the constructor,
        e.g. columns and other attributes.
        You can edit these parameters with `edit pipes`.

    columns: Union[Dict[str, str], List[str], None], default None
        Set the `columns` dictionary of `parameters`.
        If `parameters` is also provided, this dictionary is added under the `'columns'` key.

    indices: Optional[Dict[str, Union[str, List[str]]]], default None
        Set the `indices` dictionary of `parameters`.
        If `parameters` is also provided, this dictionary is added under the `'indices'` key.

    tags: Optional[List[str]], default None
        A list of strings to be added under the `'tags'` key of `parameters`.
        You can select pipes with certain tags using `--tags`.

    target: Optional[str], default None
        Set the `'target'` key of `parameters` (ignored with a warning unless a string).

    dtypes: Optional[Dict[str, str]], default None
        Set the `dtypes` dictionary of `parameters`.
        If `parameters` is also provided, this dictionary is added under the `'dtypes'` key.

    mrsm_instance: Optional[Union[str, InstanceConnector]], default None
        Connector for the Meerschaum instance where the pipe resides.
        Defaults to the preconfigured default instance (`'sql:main'`).

    instance: Optional[Union[str, InstanceConnector]], default None
        Alias for `mrsm_instance`. If `mrsm_instance` is supplied, this value is ignored.

    temporary: bool, default False
        If `True`, prevent instance tables (pipes, users, plugins) from being created.

    cache: bool, default False
        If `True`, cache fetched data into a local database file.
        Defaults to `False`.

    debug: bool, default False
        Verbosity toggle (accepted for API symmetry; not read in this constructor).

    connector_keys, metric_key, location_key, indexes:
        Legacy aliases for `connector`, `metric`, `location`, and `indices`.
    """
    from meerschaum.utils.warnings import error, warn
    # Either the modern or legacy name must be supplied for connector and metric.
    if (not connector and not connector_keys) or (not metric and not metric_key):
        error(
            "Please provide strings for the connector and metric\n "
            + "(first two positional arguments)."
        )

    ### Fall back to legacy `location_key` just in case.
    if not location:
        location = location_key

    if not connector:
        connector = connector_keys

    if not metric:
        metric = metric_key

    # Normalize stringified "no location" markers to a real None.
    if location in ('[None]', 'None'):
        location = None

    from meerschaum.config.static import STATIC_CONFIG
    # Keys/tags may not begin with the negation prefix, which is reserved
    # for exclusion filters when selecting pipes.
    negation_prefix = STATIC_CONFIG['system']['fetch_pipes_keys']['negation_prefix']
    for k in (connector, metric, location, *(tags or [])):
        if str(k).startswith(negation_prefix):
            error(f"A pipe's keys and tags cannot start with the prefix '{negation_prefix}'.")

    self.connector_keys = str(connector)
    self.connector_key = self.connector_keys ### Alias
    self.metric_key = metric
    self.location_key = location
    self.temporary = temporary

    self._attributes = {
        'connector_keys': self.connector_keys,
        'metric_key': self.metric_key,
        'location_key': self.location_key,
        'parameters': {},
    }

    ### only set parameters if values are provided
    if isinstance(parameters, dict):
        self._attributes['parameters'] = parameters
    else:
        if parameters is not None:
            warn(f"The provided parameters are of invalid type '{type(parameters)}'.")
        self._attributes['parameters'] = {}

    # A list of column names is coerced into an identity mapping.
    columns = columns or self._attributes.get('parameters', {}).get('columns', {})
    if isinstance(columns, list):
        columns = {str(col): str(col) for col in columns}
    if isinstance(columns, dict):
        self._attributes['parameters']['columns'] = columns
    elif columns is not None:
        warn(f"The provided columns are of invalid type '{type(columns)}'.")

    # Resolve indices from the explicit arguments, then existing parameters,
    # finally falling back to the columns mapping.
    indices = (
        indices
        or indexes
        or self._attributes.get('parameters', {}).get('indices', None)
        or self._attributes.get('parameters', {}).get('indexes', None)
    ) or columns
    if isinstance(indices, dict):
        # Honor the legacy 'indexes' spelling if parameters already use it.
        indices_key = (
            'indexes'
            if 'indexes' in self._attributes['parameters']
            else 'indices'
        )
        self._attributes['parameters'][indices_key] = indices

    if isinstance(tags, (list, tuple)):
        self._attributes['parameters']['tags'] = tags
    elif tags is not None:
        warn(f"The provided tags are of invalid type '{type(tags)}'.")

    if isinstance(target, str):
        self._attributes['parameters']['target'] = target
    elif target is not None:
        warn(f"The provided target is of invalid type '{type(target)}'.")

    if isinstance(dtypes, dict):
        self._attributes['parameters']['dtypes'] = dtypes
    elif dtypes is not None:
        warn(f"The provided dtypes are of invalid type '{type(dtypes)}'.")

    ### NOTE: The parameters dictionary is {} by default.
    ### A Pipe may be registered without parameters, then edited,
    ### or a Pipe may be registered with parameters set in-memory first.
    _mrsm_instance = mrsm_instance if mrsm_instance is not None else instance
    if _mrsm_instance is None:
        _mrsm_instance = get_config('meerschaum', 'instance', patch=True)

    if not isinstance(_mrsm_instance, str):
        # A connector object was passed directly; cache it and keep its keys.
        self._instance_connector = _mrsm_instance
        self.instance_keys = str(_mrsm_instance)
    else: ### NOTE: must be a SQL or API Connector for this to work
        self.instance_keys = _mrsm_instance

    # Caching is opt-in and additionally gated by the experimental config flag.
    self._cache = cache and get_config('system', 'experimental', 'cache')
Parameters
- connector (str): Keys for the pipe's source connector, e.g. `'sql:main'`.
- metric (str): Label for the pipe's contents, e.g. `'weather'`.
- location (str, default None): Label for the pipe's location. Defaults to `None`.
- parameters (Optional[Dict[str, Any]], default None): Optionally set a pipe's parameters from the constructor, e.g. columns and other attributes. You can edit these parameters with `edit pipes`.
- columns (Union[Dict[str, str], List[str], None], default None): Set the `columns` dictionary of `parameters`. If `parameters` is also provided, this dictionary is added under the `'columns'` key.
- indices (Optional[Dict[str, Union[str, List[str]]]], default None): Set the `indices` dictionary of `parameters`. If `parameters` is also provided, this dictionary is added under the `'indices'` key.
- tags (Optional[List[str]], default None): A list of strings to be added under the `'tags'` key of `parameters`. You can select pipes with certain tags using `--tags`.
- dtypes (Optional[Dict[str, str]], default None): Set the `dtypes` dictionary of `parameters`. If `parameters` is also provided, this dictionary is added under the `'dtypes'` key.
- mrsm_instance (Optional[Union[str, InstanceConnector]], default None): Connector for the Meerschaum instance where the pipe resides. Defaults to the preconfigured default instance (`'sql:main'`).
- instance (Optional[Union[str, InstanceConnector]], default None): Alias for `mrsm_instance`. If `mrsm_instance` is supplied, this value is ignored.
- temporary (bool, default False): If `True`, prevent instance tables (pipes, users, plugins) from being created.
- cache (bool, default False): If `True`, cache fetched data into a local database file. Defaults to `False`.
313 @property 314 def meta(self): 315 """ 316 Return the four keys needed to reconstruct this pipe. 317 """ 318 return { 319 'connector': self.connector_keys, 320 'metric': self.metric_key, 321 'location': self.location_key, 322 'instance': self.instance_keys, 323 }
Return the four keys needed to reconstruct this pipe.
326 def keys(self) -> List[str]: 327 """ 328 Return the ordered keys for this pipe. 329 """ 330 return { 331 key: val 332 for key, val in self.meta.items() 333 if key != 'instance' 334 }
Return the ordered keys for this pipe.
337 @property 338 def instance_connector(self) -> Union[InstanceConnector, None]: 339 """ 340 The connector to where this pipe resides. 341 May either be of type `meerschaum.connectors.sql.SQLConnector` or 342 `meerschaum.connectors.api.APIConnector`. 343 """ 344 if '_instance_connector' not in self.__dict__: 345 from meerschaum.connectors.parse import parse_instance_keys 346 conn = parse_instance_keys(self.instance_keys) 347 if conn: 348 self._instance_connector = conn 349 else: 350 return None 351 return self._instance_connector
The connector to where this pipe resides.
May either be of type meerschaum.connectors.sql.SQLConnector
or
meerschaum.connectors.api.APIConnector
.
353 @property 354 def connector(self) -> Union[meerschaum.connectors.Connector, None]: 355 """ 356 The connector to the data source. 357 """ 358 if '_connector' not in self.__dict__: 359 from meerschaum.connectors.parse import parse_instance_keys 360 import warnings 361 with warnings.catch_warnings(): 362 warnings.simplefilter('ignore') 363 try: 364 conn = parse_instance_keys(self.connector_keys) 365 except Exception as e: 366 conn = None 367 if conn: 368 self._connector = conn 369 else: 370 return None 371 return self._connector
The connector to the data source.
374 @property 375 def cache_connector(self) -> Union[meerschaum.connectors.sql.SQLConnector, None]: 376 """ 377 If the pipe was created with `cache=True`, return the connector to the pipe's 378 SQLite database for caching. 379 """ 380 if not self._cache: 381 return None 382 383 if '_cache_connector' not in self.__dict__: 384 from meerschaum.connectors import get_connector 385 from meerschaum.config._paths import DUCKDB_RESOURCES_PATH, SQLITE_RESOURCES_PATH 386 _resources_path = SQLITE_RESOURCES_PATH 387 self._cache_connector = get_connector( 388 'sql', '_cache_' + str(self), 389 flavor='sqlite', 390 database=str(_resources_path / ('_cache_' + str(self) + '.db')), 391 ) 392 393 return self._cache_connector
If the pipe was created with cache=True
, return the connector to the pipe's
SQLite database for caching.
396 @property 397 def cache_pipe(self) -> Union['meerschaum.Pipe', None]: 398 """ 399 If the pipe was created with `cache=True`, return another `meerschaum.Pipe` used to 400 manage the local data. 401 """ 402 if self.cache_connector is None: 403 return None 404 if '_cache_pipe' not in self.__dict__: 405 from meerschaum.config._patch import apply_patch_to_config 406 from meerschaum.utils.sql import sql_item_name 407 _parameters = copy.deepcopy(self.parameters) 408 _fetch_patch = { 409 'fetch': ({ 410 'definition': ( 411 f"SELECT * FROM " 412 + sql_item_name( 413 str(self.target), 414 self.instance_connector.flavor, 415 self.instance_connector.get_pipe_schema(self), 416 ) 417 ), 418 }) if self.instance_connector.type == 'sql' else ({ 419 'connector_keys': self.connector_keys, 420 'metric_key': self.metric_key, 421 'location_key': self.location_key, 422 }) 423 } 424 _parameters = apply_patch_to_config(_parameters, _fetch_patch) 425 self._cache_pipe = Pipe( 426 self.instance_keys, 427 (self.connector_keys + '_' + self.metric_key + '_cache'), 428 self.location_key, 429 mrsm_instance = self.cache_connector, 430 parameters = _parameters, 431 cache = False, 432 temporary = True, 433 ) 434 435 return self._cache_pipe
If the pipe was created with cache=True
, return another meerschaum.Pipe
used to
manage the local data.
def fetch(
    self,
    begin: Union[datetime, str, None] = '',
    end: Optional[datetime] = None,
    check_existing: bool = True,
    sync_chunks: bool = False,
    debug: bool = False,
    **kw: Any
) -> Union['pd.DataFrame', Iterator['pd.DataFrame'], None]:
    """
    Fetch a Pipe's latest data from its connector.

    Parameters
    ----------
    begin: Union[datetime, str, None], default ''
        If provided, only fetch data newer than or equal to `begin`.

    end: Optional[datetime], default None
        If provided, only fetch data older than or equal to `end`.

    check_existing: bool, default True
        If `False`, do not apply the backtrack interval.

    sync_chunks: bool, default False
        If `True` and the pipe's connector is of type `'sql'`, begin syncing chunks while fetching
        loads chunks into memory.

    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    A `pd.DataFrame` of the newest unseen data.
    """
    # A connector without a fetch() implementation cannot produce data.
    if 'fetch' not in dir(self.connector):
        warn(f"No `fetch()` function defined for connector '{self.connector}'")
        return None

    from meerschaum.connectors import custom_types, get_connector_plugin
    from meerschaum.utils.debug import dprint, _checkpoint
    from meerschaum.utils.misc import filter_arguments

    # Pop the caller's chunk hook (if any) so it isn't passed twice below.
    _chunk_hook = kw.pop('chunk_hook', None)
    kw['workers'] = self.get_num_workers(kw.get('workers', None))
    if sync_chunks and _chunk_hook is None:

        def _chunk_hook(chunk, **_kw) -> SuccessTuple:
            """
            Wrap `Pipe.sync()` with a custom chunk label prepended to the message.
            """
            from meerschaum.config._patch import apply_patch_to_config
            # Merge the per-chunk kwargs over the outer fetch kwargs.
            kwargs = apply_patch_to_config(kw, _kw)
            chunk_success, chunk_message = self.sync(chunk, **kwargs)
            chunk_label = self._get_chunk_label(chunk, self.columns.get('datetime', None))
            if chunk_label:
                chunk_message = '\n' + chunk_label + '\n' + chunk_message
            return chunk_success, chunk_message

    # Run the connector's fetch inside its plugin virtual environment,
    # passing only the keyword arguments its signature accepts.
    with mrsm.Venv(get_connector_plugin(self.connector)):
        _args, _kwargs = filter_arguments(
            self.connector.fetch,
            self,
            begin=_determine_begin(
                self,
                begin,
                check_existing=check_existing,
                debug=debug,
            ),
            end=end,
            chunk_hook=_chunk_hook,
            debug=debug,
            **kw
        )
        df = self.connector.fetch(*_args, **_kwargs)
    return df
Fetch a Pipe's latest data from its connector.
Parameters
- begin (Union[datetime, str, None], default ''):
If provided, only fetch data newer than or equal to
begin
. - end (Optional[datetime], default None):
If provided, only fetch data older than or equal to
end
. - check_existing (bool, default True):
If
False
, do not apply the backtrack interval. - sync_chunks (bool, default False):
If
True
and the pipe's connector is of type `'sql'`
, begin syncing chunks while the fetch loads chunks into memory. - debug (bool, default False): Verbosity toggle.
Returns
- A
pd.DataFrame
of the newest unseen data.
def get_backtrack_interval(
    self,
    check_existing: bool = True,
    debug: bool = False,
) -> Union[timedelta, int]:
    """
    Get the backtrack interval to use for this pipe.

    Parameters
    ----------
    check_existing: bool, default True
        If `False`, return a backtrack_interval of 0 minutes.

    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    The backtrack interval (`timedelta` or `int`) to use with this pipe's `datetime` axis.
    """
    default_backtrack_minutes = get_config('pipes', 'parameters', 'fetch', 'backtrack_minutes')
    configured_backtrack_minutes = self.parameters.get('fetch', {}).get('backtrack_minutes', None)
    ### A pipe-level setting overrides the global default; skip backtracking entirely
    ### when not checking existing data.
    backtrack_minutes = (
        configured_backtrack_minutes
        if configured_backtrack_minutes is not None
        else default_backtrack_minutes
    ) if check_existing else 0

    backtrack_interval = timedelta(minutes=backtrack_minutes)
    dt_col = self.columns.get('datetime', None)
    if dt_col is None:
        return backtrack_interval

    ### An integer datetime axis gets a plain `int` of minutes instead of a timedelta.
    dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns]')
    if 'int' in dt_dtype.lower():
        return backtrack_minutes

    return backtrack_interval
Get the backtrack interval to use for this pipe.
Parameters
- check_existing (bool, default True):
If
False
, return a backtrack_interval of 0 minutes.
Returns
- The backtrack interval (
timedelta
orint
) to use with this pipe'sdatetime
axis.
def get_data(
    self,
    select_columns: Optional[List[str]] = None,
    omit_columns: Optional[List[str]] = None,
    begin: Union[datetime, int, None] = None,
    end: Union[datetime, int, None] = None,
    params: Optional[Dict[str, Any]] = None,
    as_iterator: bool = False,
    as_chunks: bool = False,
    as_dask: bool = False,
    chunk_interval: Union[timedelta, int, None] = None,
    order: Optional[str] = 'asc',
    limit: Optional[int] = None,
    fresh: bool = False,
    debug: bool = False,
    **kw: Any
) -> Union['pd.DataFrame', Iterator['pd.DataFrame'], None]:
    """
    Get a pipe's data from the instance connector.

    Parameters
    ----------
    select_columns: Optional[List[str]], default None
        If provided, only select these given columns.
        Otherwise select all available columns (i.e. `SELECT *`).

    omit_columns: Optional[List[str]], default None
        If provided, remove these columns from the selection.

    begin: Union[datetime, int, None], default None
        Lower bound datetime to begin searching for data (inclusive).
        Translates to a `WHERE` clause like `WHERE datetime >= begin`.
        Defaults to `None`.

    end: Union[datetime, int, None], default None
        Upper bound datetime to stop searching for data (inclusive).
        Translates to a `WHERE` clause like `WHERE datetime < end`.
        Defaults to `None`.

    params: Optional[Dict[str, Any]], default None
        Filter the retrieved data by a dictionary of parameters.
        See `meerschaum.utils.sql.build_where` for more details.

    as_iterator: bool, default False
        If `True`, return a generator of chunks of pipe data.

    as_chunks: bool, default False
        Alias for `as_iterator`.

    as_dask: bool, default False
        If `True`, return a `dask.DataFrame`
        (which may be loaded into a Pandas DataFrame with `df.compute()`).

    chunk_interval: Union[timedelta, int, None], default None
        If `as_iterator`, then return chunks with `begin` and `end` separated by this interval.
        This may be set under `pipe.parameters['verify']['chunk_minutes']`.
        By default, use a timedelta of 1440 minutes (1 day).
        If `chunk_interval` is an integer and the `datetime` axis a timestamp,
        then use a timedelta with the number of minutes configured to this value.
        If the `datetime` axis is an integer, default to the configured chunksize.
        If `chunk_interval` is a `timedelta` and the `datetime` axis an integer,
        use the number of minutes in the `timedelta`.

    order: Optional[str], default 'asc'
        If `order` is not `None`, sort the resulting dataframe by indices.

    limit: Optional[int], default None
        If provided, cap the dataframe to this many rows.

    fresh: bool, default False
        If `True`, skip local cache and directly query the instance connector.

    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    A `pd.DataFrame` for the pipe's data corresponding to the provided parameters.
    """
    from meerschaum.utils.warnings import warn
    from meerschaum.utils.venv import Venv
    from meerschaum.connectors import get_connector_plugin
    from meerschaum.utils.misc import iterate_chunks, items_str
    from meerschaum.utils.dtypes import to_pandas_dtype
    from meerschaum.utils.dataframe import add_missing_cols_to_df, df_is_chunk_generator
    from meerschaum.utils.packages import attempt_import
    ### Only import dask when it's actually requested.
    dd = attempt_import('dask.dataframe') if as_dask else None
    dask = attempt_import('dask') if as_dask else None

    ### Normalize the column-selection arguments into lists (or None for "all").
    if select_columns == '*':
        select_columns = None
    elif isinstance(select_columns, str):
        select_columns = [select_columns]

    if isinstance(omit_columns, str):
        omit_columns = [omit_columns]

    as_iterator = as_iterator or as_chunks

    def _sort_df(_df):
        ### Sort by the datetime column first, then the remaining index columns.
        if df_is_chunk_generator(_df):
            return _df
        dt_col = self.columns.get('datetime', None)
        indices = [] if dt_col not in _df.columns else [dt_col]
        non_dt_cols = [
            col
            for col_ix, col in self.columns.items()
            if col_ix != 'datetime' and col in _df.columns
        ]
        indices.extend(non_dt_cols)
        ### Dask dataframes do not support in-place sorting.
        if 'dask' not in _df.__module__:
            _df.sort_values(
                by=indices,
                inplace=True,
                ascending=(str(order).lower() == 'asc'),
            )
            _df.reset_index(drop=True, inplace=True)
        else:
            _df = _df.sort_values(
                by=indices,
                ascending=(str(order).lower() == 'asc'),
            )
            _df = _df.reset_index(drop=True)
        if limit is not None and len(_df) > limit:
            return _df.head(limit)
        return _df

    if as_iterator or as_chunks:
        df = self._get_data_as_iterator(
            select_columns=select_columns,
            omit_columns=omit_columns,
            begin=begin,
            end=end,
            params=params,
            chunk_interval=chunk_interval,
            limit=limit,
            order=order,
            fresh=fresh,
            debug=debug,
        )
        return _sort_df(df)

    if as_dask:
        ### Build one delayed `get_data()` call per chunk and assemble them lazily.
        from multiprocessing.pool import ThreadPool
        dask_pool = ThreadPool(self.get_num_workers())
        dask.config.set(pool=dask_pool)
        chunk_interval = self.get_chunk_interval(chunk_interval, debug=debug)
        bounds = self.get_chunk_bounds(
            begin=begin,
            end=end,
            bounded=False,
            chunk_interval=chunk_interval,
            debug=debug,
        )
        dask_chunks = [
            dask.delayed(self.get_data)(
                select_columns=select_columns,
                omit_columns=omit_columns,
                begin=chunk_begin,
                end=chunk_end,
                params=params,
                chunk_interval=chunk_interval,
                order=order,
                limit=limit,
                fresh=fresh,
                debug=debug,
            )
            for (chunk_begin, chunk_end) in bounds
        ]
        dask_meta = {
            col: to_pandas_dtype(typ)
            for col, typ in self.dtypes.items()
        }
        return _sort_df(dd.from_delayed(dask_chunks, meta=dask_meta))

    if not self.exists(debug=debug):
        return None

    if self.cache_pipe is not None:
        if not fresh:
            ### Refresh the local cache first; on failure, fall through to a fresh query.
            _sync_cache_tuple = self.cache_pipe.sync(
                begin=begin,
                end=end,
                params=params,
                debug=debug,
                **kw
            )
            if not _sync_cache_tuple[0]:
                warn(f"Failed to sync cache for {self}:\n" + _sync_cache_tuple[1])
                fresh = True
            else: ### Successfully synced cache.
                return self.enforce_dtypes(
                    self.cache_pipe.get_data(
                        select_columns=select_columns,
                        omit_columns=omit_columns,
                        begin=begin,
                        end=end,
                        params=params,
                        order=order,
                        limit=limit,
                        debug=debug,
                        fresh=True,
                        **kw
                    ),
                    debug=debug,
                )

    with Venv(get_connector_plugin(self.instance_connector)):
        df = self.instance_connector.get_pipe_data(
            pipe=self,
            select_columns=select_columns,
            omit_columns=omit_columns,
            begin=begin,
            end=end,
            params=params,
            limit=limit,
            order=order,
            debug=debug,
            **kw
        )
    if df is None:
        return df

    if not select_columns:
        select_columns = [col for col in df.columns]

    ### The connector may not support column projection; enforce it here and warn.
    cols_to_omit = [
        col
        for col in df.columns
        if (
            col in (omit_columns or [])
            or
            col not in (select_columns or [])
        )
    ]
    cols_to_add = [
        col
        for col in select_columns
        if col not in df.columns
    ]
    if cols_to_omit:
        warn(
            (
                f"Received {len(cols_to_omit)} omitted column"
                + ('s' if len(cols_to_omit) != 1 else '')
                + f" for {self}. "
                + "Consider adding `select_columns` and `omit_columns` support to "
                + f"'{self.instance_connector.type}' connectors to improve performance."
            ),
            stack=False,
        )
        _cols_to_select = [col for col in df.columns if col not in cols_to_omit]
        df = df[_cols_to_select]

    if cols_to_add:
        warn(
            (
                f"Specified columns {items_str(cols_to_add)} were not found on {self}. "
                + "Adding these to the DataFrame as null columns."
            ),
            stack=False,
        )
        df = add_missing_cols_to_df(df, {col: 'string' for col in cols_to_add})

    enforced_df = self.enforce_dtypes(df, debug=debug)

    if order:
        return _sort_df(enforced_df)
    return enforced_df
Get a pipe's data from the instance connector.
Parameters
- select_columns (Optional[List[str]], default None):
If provided, only select these given columns.
Otherwise select all available columns (i.e.
SELECT *
). - omit_columns (Optional[List[str]], default None): If provided, remove these columns from the selection.
- begin (Union[datetime, int, None], default None):
Lower bound datetime to begin searching for data (inclusive).
Translates to a
WHERE
clause likeWHERE datetime >= begin
. Defaults toNone
. - end (Union[datetime, int, None], default None):
Upper bound datetime to stop searching for data (inclusive).
Translates to a
WHERE
clause likeWHERE datetime < end
. Defaults toNone
. - params (Optional[Dict[str, Any]], default None):
Filter the retrieved data by a dictionary of parameters.
See
meerschaum.utils.sql.build_where
for more details. - as_iterator (bool, default False):
If
True
, return a generator of chunks of pipe data. - as_chunks (bool, default False):
Alias for
as_iterator
. - as_dask (bool, default False):
If
True
, return adask.DataFrame
(which may be loaded into a Pandas DataFrame withdf.compute()
). - chunk_interval (Union[timedelta, int, None], default None):
If
as_iterator
, then return chunks withbegin
andend
separated by this interval. This may be set underpipe.parameters['chunk_minutes']
. By default, use a timedelta of 1440 minutes (1 day). Ifchunk_interval
is an integer and thedatetime
axis a timestamp, then use a timedelta with the number of minutes configured to this value. If the `datetime`
axis is an integer, default to the configured chunksize. Ifchunk_interval
is atimedelta
and thedatetime
axis an integer, use the number of minutes in thetimedelta
. - order (Optional[str], default 'asc'):
If
order
is notNone
, sort the resulting dataframe by indices. - limit (Optional[int], default None): If provided, cap the dataframe to this many rows.
- fresh (bool, default False):
If `True`, skip local cache and directly query the instance connector. Defaults to `False`
. - debug (bool, default False):
Verbosity toggle.
Defaults to
False
.
Returns
- A
pd.DataFrame
for the pipe's data corresponding to the provided parameters.
def get_backtrack_data(
    self,
    backtrack_minutes: Optional[int] = None,
    begin: Union[datetime, int, None] = None,
    params: Optional[Dict[str, Any]] = None,
    limit: Optional[int] = None,
    fresh: bool = False,
    debug: bool = False,
    **kw: Any
) -> Optional['pd.DataFrame']:
    """
    Get the most recent data from the instance connector as a Pandas DataFrame.

    Parameters
    ----------
    backtrack_minutes: Optional[int], default None
        How many minutes from `begin` to select from.
        If `None`, use `pipe.parameters['fetch']['backtrack_minutes']`.

    begin: Union[datetime, int, None], default None
        The starting point to search for data.
        If begin is `None` (default), use the most recent observed datetime
        (AKA sync_time).

        ```
        E.g. begin = 02:00

        Search this region.           Ignore this, even if there's data.
        / / / / / / / / / |
        -----|----------|----------|----------|----------|----------|
        00:00      01:00      02:00      03:00      04:00      05:00

        ```

    params: Optional[Dict[str, Any]], default None
        The standard Meerschaum `params` query dictionary.

    limit: Optional[int], default None
        If provided, cap the number of rows to be returned.

    fresh: bool, default False
        If `True`, ignore local cache and pull directly from the instance connector.
        Only comes into effect if a pipe was created with `cache=True`.

    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    A `pd.DataFrame` for the pipe's data corresponding to the provided parameters.
    Backtrack data is a convenient way to get a pipe's data "backtracked"
    from the most recent datetime.
    """
    from meerschaum.utils.warnings import warn
    from meerschaum.utils.venv import Venv
    from meerschaum.connectors import get_connector_plugin

    if not self.exists(debug=debug):
        return None

    ### Fold `order` into `kw` instead of passing it explicitly below.
    ### Previously `order=kw.get('order', 'desc')` was passed alongside `**kw`,
    ### which raised `TypeError: got multiple values for keyword argument 'order'`
    ### whenever a caller supplied `order` themselves.
    kw['order'] = kw.get('order', 'desc')

    backtrack_interval = self.get_backtrack_interval(debug=debug)
    ### `get_backtrack_interval()` returns an `int` when the datetime axis is an integer.
    if backtrack_minutes is None:
        backtrack_minutes = (
            (backtrack_interval.total_seconds() / 60)
            if isinstance(backtrack_interval, timedelta)
            else backtrack_interval
        )

    if self.cache_pipe is not None:
        if not fresh:
            ### Refresh the local cache first; fall back to the instance on failure.
            _sync_cache_tuple = self.cache_pipe.sync(begin=begin, params=params, debug=debug, **kw)
            if not _sync_cache_tuple[0]:
                warn(f"Failed to sync cache for {self}:\n" + _sync_cache_tuple[1])
                fresh = True
            else: ### Successfully synced cache.
                return self.enforce_dtypes(
                    self.cache_pipe.get_backtrack_data(
                        fresh=True,
                        begin=begin,
                        backtrack_minutes=backtrack_minutes,
                        params=params,
                        limit=limit,
                        debug=debug,
                        **kw
                    ),
                    debug=debug,
                )

    ### Prefer a connector-native implementation when one exists.
    if hasattr(self.instance_connector, 'get_backtrack_data'):
        with Venv(get_connector_plugin(self.instance_connector)):
            return self.enforce_dtypes(
                self.instance_connector.get_backtrack_data(
                    pipe=self,
                    begin=begin,
                    backtrack_minutes=backtrack_minutes,
                    params=params,
                    limit=limit,
                    debug=debug,
                    **kw
                ),
                debug=debug,
            )

    ### Fallback: shift `begin` back by the backtrack interval and reuse `get_data()`.
    if begin is None:
        begin = self.get_sync_time(params=params, debug=debug)

    backtrack_interval = (
        timedelta(minutes=backtrack_minutes)
        if isinstance(begin, datetime)
        else backtrack_minutes
    )
    if begin is not None:
        begin = begin - backtrack_interval

    return self.get_data(
        begin=begin,
        params=params,
        debug=debug,
        limit=limit,
        **kw
    )
Get the most recent data from the instance connector as a Pandas DataFrame.
Parameters
- backtrack_minutes (Optional[int], default None):
How many minutes from
begin
to select from. IfNone
, usepipe.parameters['fetch']['backtrack_minutes']
. begin (Optional[datetime], default None): The starting point to search for data. If begin is
None
(default), use the most recent observed datetime (AKA sync_time).E.g. begin = 02:00 Search this region. Ignore this, even if there's data. / / / / / / / / / | -----|----------|----------|----------|----------|----------| 00:00 01:00 02:00 03:00 04:00 05:00
params (Optional[Dict[str, Any]], default None): The standard Meerschaum
params
query dictionary.- limit (Optional[int], default None): If provided, cap the number of rows to be returned.
- fresh (bool, default False):
If
True
, Ignore local cache and pull directly from the instance connector. Only comes into effect if a pipe was created withcache=True
. - debug (bool default False): Verbosity toggle.
Returns
- A
pd.DataFrame
for the pipe's data corresponding to the provided parameters. Backtrack data is a convenient way to get a pipe's data "backtracked" from the most recent datetime.
def get_rowcount(
    self,
    begin: Union[datetime, int, None] = None,
    end: Union[datetime, int, None] = None,
    params: Optional[Dict[str, Any]] = None,
    remote: bool = False,
    debug: bool = False
) -> int:
    """
    Get a Pipe's instance or remote rowcount.

    Parameters
    ----------
    begin: Optional[datetime], default None
        Count rows where datetime > begin.

    end: Optional[datetime], default None
        Count rows where datetime < end.

    remote: bool, default False
        Count rows from a pipe's remote source.
        **NOTE**: This is experimental!

    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    An `int` of the number of rows in the pipe corresponding to the provided parameters.
    Returned 0 if the pipe does not exist.
    """
    from meerschaum.utils.warnings import warn
    from meerschaum.utils.venv import Venv
    from meerschaum.connectors import get_connector_plugin

    ### Remote rowcounts query the source connector; otherwise ask the instance.
    connector = self.connector if remote else self.instance_connector
    try:
        with Venv(get_connector_plugin(connector)):
            rowcount = connector.get_pipe_rowcount(
                self,
                begin=begin,
                end=end,
                params=params,
                remote=remote,
                debug=debug,
            )
    except AttributeError as e:
        ### The connector does not implement `get_pipe_rowcount()`.
        warn(e)
        if not remote:
            warn(f"Failed to get a rowcount for {self}.")
        return 0

    return rowcount if rowcount is not None else 0
Get a Pipe's instance or remote rowcount.
Parameters
- begin (Optional[datetime], default None): Count rows where datetime > begin.
- end (Optional[datetime], default None): Count rows where datetime < end.
- remote (bool, default False): Count rows from a pipe's remote source. NOTE: This is experimental!
- debug (bool, default False): Verbosity toggle.
Returns
- An
int
of the number of rows in the pipe corresponding to the provided parameters. Returns 0 if the pipe does not exist.
def get_chunk_interval(
    self,
    chunk_interval: Union[timedelta, int, None] = None,
    debug: bool = False,
) -> Union[timedelta, int]:
    """
    Get the chunk interval to use for this pipe.

    Parameters
    ----------
    chunk_interval: Union[timedelta, int, None], default None
        If provided, coerce this value into the correct type.
        For example, if the datetime axis is an integer, then
        return the number of minutes.

    Returns
    -------
    The chunk interval (`timedelta` or `int`) to use with this pipe's `datetime` axis.
    """
    default_chunk_minutes = get_config('pipes', 'parameters', 'verify', 'chunk_minutes')
    configured_chunk_minutes = self.parameters.get('verify', {}).get('chunk_minutes', None)

    ### Resolve the interval to a number of minutes:
    ### pipe-level config wins, then the global default;
    ### an explicit argument is used as-is (converted from timedelta if needed).
    if chunk_interval is None:
        chunk_minutes = configured_chunk_minutes or default_chunk_minutes
    elif isinstance(chunk_interval, int):
        chunk_minutes = chunk_interval
    else:
        chunk_minutes = int(chunk_interval.total_seconds() / 60)

    ### An integer datetime axis works in raw minutes; otherwise return a timedelta.
    dt_col = self.columns.get('datetime', None)
    if dt_col is not None:
        dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns]')
        if 'int' in dt_dtype.lower():
            return chunk_minutes

    return timedelta(minutes=chunk_minutes)
Get the chunk interval to use for this pipe.
Parameters
- chunk_interval (Union[timedelta, int, None], default None): If provided, coerce this value into the correct type. For example, if the datetime axis is an integer, then return the number of minutes.
Returns
- The chunk interval (
timedelta
orint
) to use with this pipe'sdatetime
axis.
def get_chunk_bounds(
    self,
    begin: Union[datetime, int, None] = None,
    end: Union[datetime, int, None] = None,
    bounded: bool = False,
    chunk_interval: Union[timedelta, int, None] = None,
    debug: bool = False,
) -> List[
    Tuple[
        Union[datetime, int, None],
        Union[datetime, int, None],
    ]
]:
    """
    Return a list of datetime bounds for iterating over the pipe's `datetime` axis.

    Parameters
    ----------
    begin: Union[datetime, int, None], default None
        If provided, do not select less than this value.
        Otherwise the first chunk will be unbounded.

    end: Union[datetime, int, None], default None
        If provided, do not select greater than or equal to this value.
        Otherwise the last chunk will be unbounded.

    bounded: bool, default False
        If `True`, do not include `None` in the first chunk.

    chunk_interval: Union[timedelta, int, None], default None
        If provided, use this interval for the size of chunk boundaries.
        The default value for this pipe may be set
        under `pipe.parameters['verify']['chunk_minutes']`.

    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    A list of chunk bounds (datetimes or integers).
    If unbounded, the first and last chunks will include `None`.
    Returns an empty list if `end` is earlier than `begin`.
    """
    include_less_than_begin = not bounded and begin is None
    include_greater_than_end = not bounded and end is None
    if begin is None:
        begin = self.get_sync_time(newest=False, debug=debug)
    if end is None:
        end = self.get_sync_time(newest=True, debug=debug)
    if begin is None and end is None:
        return [(None, None)]
    if begin is None or end is None:
        ### Only one bound could be determined (e.g. an empty pipe with one
        ### explicit bound); a single open chunk is the best we can do.
        ### Previously this case raised a TypeError in the loop below.
        return [(begin, end)]

    ### Set the chunk interval under `pipe.parameters['verify']['chunk_minutes']`.
    chunk_interval = self.get_chunk_interval(chunk_interval, debug=debug)

    ### Build a list of tuples containing the chunk boundaries
    ### so that we can sync multiple chunks in parallel.
    ### Run `verify pipes --workers 1` to sync chunks in series.
    chunk_bounds = []
    begin_cursor = begin
    while begin_cursor < end:
        end_cursor = begin_cursor + chunk_interval
        chunk_bounds.append((begin_cursor, end_cursor))
        begin_cursor = end_cursor

    ### The chunk interval might be too large.
    if not chunk_bounds and end >= begin:
        chunk_bounds = [(begin, end)]

    ### Guard the adjustments below: `chunk_bounds` may be empty when `end < begin`
    ### (previously this raised an IndexError on `chunk_bounds[-1]`).
    if chunk_bounds:
        ### Truncate the last chunk to the end timestamp.
        if chunk_bounds[-1][1] > end:
            chunk_bounds[-1] = (chunk_bounds[-1][0], end)

        ### Pop the last chunk if its bounds are equal.
        if chunk_bounds[-1][0] == chunk_bounds[-1][1]:
            chunk_bounds = chunk_bounds[:-1]

    if include_less_than_begin:
        chunk_bounds = [(None, begin)] + chunk_bounds
    if include_greater_than_end:
        chunk_bounds = chunk_bounds + [(end, None)]

    return chunk_bounds
Return a list of datetime bounds for iterating over the pipe's datetime
axis.
Parameters
- begin (Union[datetime, int, None], default None): If provided, do not select less than this value. Otherwise the first chunk will be unbounded.
- end (Union[datetime, int, None], default None): If provided, do not select greater than or equal to this value. Otherwise the last chunk will be unbounded.
- bounded (bool, default False):
If
True
, do not includeNone
in the first chunk. - chunk_interval (Union[timedelta, int, None], default None):
If provided, use this interval for the size of chunk boundaries.
The default value for this pipe may be set
under
pipe.parameters['verify']['chunk_minutes']
. - debug (bool, default False): Verbosity toggle.
Returns
- A list of chunk bounds (datetimes or integers).
- If unbounded, the first and last chunks will include
None
.
def register(
    self,
    debug: bool = False,
    **kw: Any
) -> SuccessTuple:
    """
    Register a new Pipe along with its attributes.

    Parameters
    ----------
    debug: bool, default False
        Verbosity toggle.

    kw: Any
        Keyword arguments to pass to `instance_connector.register_pipe()`.

    Returns
    -------
    A `SuccessTuple` of success, message.
    """
    if self.temporary:
        return False, "Cannot register pipes created with `temporary=True` (read-only)."

    from meerschaum.utils.formatting import get_console
    from meerschaum.utils.venv import Venv
    from meerschaum.connectors import get_connector_plugin, custom_types
    from meerschaum.config._patch import apply_patch_to_config

    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        try:
            ### Resolving the connector may fail (e.g. a missing plugin);
            ### treat any failure as "no connector" and continue registering.
            _conn = self.connector
        except Exception as e:
            _conn = None

    ### Plugin and custom connectors may define a `register()` hook that
    ### returns a parameters dictionary to merge into this pipe's parameters.
    if (
        _conn is not None
        and
        (_conn.type == 'plugin' or _conn.type in custom_types)
        and
        getattr(_conn, 'register', None) is not None
    ):
        try:
            with Venv(get_connector_plugin(_conn), debug=debug):
                params = self.connector.register(self)
        except Exception as e:
            get_console().print_exception()
            params = None
        params = {} if params is None else params
        if not isinstance(params, dict):
            from meerschaum.utils.warnings import warn
            warn(
                f"Invalid parameters returned from `register()` in connector {self.connector}:\n"
                + f"{params}"
            )
        else:
            ### Locally-set parameters are patched over the hook's values.
            self.parameters = apply_patch_to_config(params, self.parameters)

    ### Ensure at least a minimal columns skeleton before registering.
    if not self.parameters:
        cols = self.columns if self.columns else {'datetime': None, 'id': None}
        self.parameters = {
            'columns': cols,
        }

    with Venv(get_connector_plugin(self.instance_connector)):
        return self.instance_connector.register_pipe(self, debug=debug, **kw)
Register a new Pipe along with its attributes.
Parameters
- debug (bool, default False): Verbosity toggle.
- kw (Any):
Keyword arguments to pass to
instance_connector.register_pipe()
.
Returns
- A
SuccessTuple
of success, message.
@property
def attributes(self) -> Dict[str, Any]:
    """
    Return a dictionary of a pipe's keys and parameters.
    These values are reflected directly from the pipes table of the instance.
    """
    import time
    from meerschaum.config import get_config
    from meerschaum.config._patch import apply_patch_to_config
    from meerschaum.utils.venv import Venv
    from meerschaum.connectors import get_connector_plugin

    ### How long (in seconds) the in-memory attributes stay valid before re-fetching.
    timeout_seconds = get_config('pipes', 'attributes', 'local_cache_timeout_seconds')

    if '_attributes' not in self.__dict__:
        self._attributes = {}

    ### `perf_counter()` is monotonic, so the timeout is immune to clock changes.
    now = time.perf_counter()
    last_refresh = self.__dict__.get('_attributes_sync_time', None)
    timed_out = (
        last_refresh is None
        or
        (timeout_seconds is not None and (now - last_refresh) >= timeout_seconds)
    )
    ### Temporary pipes never touch the instance's pipes table.
    if not self.temporary and timed_out:
        self._attributes_sync_time = now
        local_attributes = self.__dict__.get('_attributes', {})
        with Venv(get_connector_plugin(self.instance_connector)):
            instance_attributes = self.instance_connector.get_pipe_attributes(self)
        ### Patch locally-set attributes over the instance's stored values.
        self._attributes = apply_patch_to_config(instance_attributes, local_attributes)
    return self._attributes
Return a dictionary of a pipe's keys and parameters. These values are reflected directly from the pipes table of the instance.
@property
def parameters(self) -> Optional[Dict[str, Any]]:
    """
    Return the parameters dictionary of the pipe.

    A missing `'parameters'` key is initialized to an empty dictionary
    so callers may mutate the returned value in place.
    """
    attrs = self.attributes
    if 'parameters' not in attrs:
        attrs['parameters'] = {}
    return attrs['parameters']
Return the parameters dictionary of the pipe.
@property
def columns(self) -> Union[Dict[str, str], None]:
    """
    Return the `columns` dictionary defined in `meerschaum.Pipe.parameters`.

    A missing or malformed (non-dict) value is replaced with an empty
    dictionary, which is also written back to the parameters.
    """
    params = self.parameters
    cols = params.get('columns', None)
    if not isinstance(cols, dict):
        cols = {}
        params['columns'] = cols
    return cols
Return the columns
dictionary defined in meerschaum.Pipe.parameters
.
@property
def indices(self) -> Union[Dict[str, Union[str, List[str]]], None]:
    """
    Return the `indices` dictionary defined in `meerschaum.Pipe.parameters`.

    The pipe's `columns` are merged in, with explicitly configured
    indices taking precedence on key collisions.
    """
    ### Honor the legacy `indexes` spelling when it exists in the parameters.
    key = 'indexes' if 'indexes' in self.parameters else 'indices'
    _indices = self.parameters.get(key, None)
    if not isinstance(_indices, dict):
        _indices = {}
    self.parameters[key] = _indices
    return {**self.columns, **_indices}
Return the indices
dictionary defined in meerschaum.Pipe.parameters
.
@property
def indexes(self) -> Union[Dict[str, Union[str, List[str]]], None]:
    """
    Alias for `meerschaum.Pipe.indices`.
    """
    ### Convenience alias: both spellings resolve to the same dictionary.
    return self.indices
Alias for meerschaum.Pipe.indices
.
@property
def dtypes(self) -> Union[Dict[str, Any], None]:
    """
    If defined, return the `dtypes` dictionary defined in `meerschaum.Pipe.parameters`.

    Configured dtypes are patched on top of the dtypes inferred
    from the remote source, so explicit settings always win.
    """
    from meerschaum.config._patch import apply_patch_to_config
    configured = self.parameters.get('dtypes', {})
    inferred = self.infer_dtypes(persist=False)
    return apply_patch_to_config(inferred, configured)
If defined, return the dtypes
dictionary defined in meerschaum.Pipe.parameters
.
def get_columns(self, *args: str, error: bool = False) -> Union[str, Tuple[str]]:
    """
    Check if the requested columns are defined.

    Parameters
    ----------
    *args: str
        The column names to be retrieved.
        If empty, all configured column keys are checked.

    error: bool, default False
        If `True`, raise an `Exception` if the specified column is not defined.

    Returns
    -------
    A tuple of the same size of `args` or a `str` if `args` is a single argument.

    Examples
    --------
    >>> pipe = mrsm.Pipe('test', 'test')
    >>> pipe.columns = {'datetime': 'dt', 'id': 'id'}
    >>> pipe.get_columns('datetime', 'id')
    ('dt', 'id')
    >>> pipe.get_columns('value', error=True)
    Exception: 🛑 Missing 'value' column for Pipe('test', 'test').
    """
    from meerschaum.utils.warnings import error as _error, warn
    if not args:
        args = tuple(self.columns.keys())
    col_names = []
    for col in args:
        col_name = None
        try:
            col_name = self.columns[col]
            ### NOTE(review): `_error()` raises, so this "Please define" message is
            ### caught by the `except` below and replaced by the generic "Missing"
            ### message — confirm whether that shadowing is intentional.
            if col_name is None and error:
                _error(f"Please define the name of the '{col}' column for {self}.")
        except Exception as e:
            col_name = None
        if col_name is None and error:
            _error(f"Missing '{col}'" + f" column for {self}.")
        col_names.append(col_name)
    ### A single requested column is returned as a bare string, not a 1-tuple.
    if len(col_names) == 1:
        return col_names[0]
    return tuple(col_names)
Check if the requested columns are defined.
Parameters
- *args (str): The column names to be retrieved.
- error (bool, default False):
If
True
, raise anException
if the specified column is not defined.
Returns
- A tuple of the same size of
args
or astr
ifargs
is a single argument.
Examples
>>> pipe = mrsm.Pipe('test', 'test')
>>> pipe.columns = {'datetime': 'dt', 'id': 'id'}
>>> pipe.get_columns('datetime', 'id')
('dt', 'id')
>>> pipe.get_columns('value', error=True)
Exception: 🛑 Missing 'value' column for Pipe('test', 'test').
def get_columns_types(self, debug: bool = False) -> Union[Dict[str, str], None]:
    """
    Get a dictionary of a pipe's column names and their types.

    Parameters
    ----------
    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    A dictionary of column names (`str`) to column types (`str`).

    Examples
    --------
    >>> pipe.get_columns_types()
    {
      'dt': 'TIMESTAMP WITHOUT TIMEZONE',
      'id': 'BIGINT',
      'val': 'DOUBLE PRECISION',
    }
    >>>
    """
    from meerschaum.utils.venv import Venv
    from meerschaum.connectors import get_connector_plugin

    ### Delegate to the instance connector inside its virtual environment.
    connector = self.instance_connector
    with Venv(get_connector_plugin(connector)):
        return connector.get_pipe_columns_types(self, debug=debug)
Get a dictionary of a pipe's column names and their types.
Parameters
- debug (bool, default False): Verbosity toggle.
Returns
- A dictionary of column names (
str
) to column types (str
).
Examples
>>> pipe.get_columns_types()
{
'dt': 'TIMESTAMP WITHOUT TIMEZONE',
'id': 'BIGINT',
'val': 'DOUBLE PRECISION',
}
>>>
def get_indices(self) -> Dict[str, str]:
    """
    Return a dictionary mapping index keys to their names on the database.

    The name is rendered from the `index_template` parameter
    (default `"IX_{target}_{column_names}"`), which may reference
    `target`, `column_names`, `connector_keys`, `metric_key`, and `location_key`.

    Returns
    -------
    A dictionary of index keys to index names.
    """
    _parameters = self.parameters
    _index_template = _parameters.get('index_template', "IX_{target}_{column_names}")
    _indices = self.indices
    _target = self.target
    ### Join multi-column indices with underscores; skip falsy (undefined) entries.
    _column_names = {
        ix: (
            '_'.join(cols)
            if isinstance(cols, (list, tuple))
            else str(cols)
        )
        for ix, cols in _indices.items()
        if cols
    }
    _index_names = {
        ix: (
            _index_template.format(
                target=_target,
                column_names=column_names,
                connector_keys=self.connector_keys,
                ### BUGFIX: previously passed `self.connector_key` (nonexistent attribute /
                ### wrong value) for the `metric_key` template placeholder.
                metric_key=self.metric_key,
                location_key=self.location_key,
            )
        )
        for ix, column_names in _column_names.items()
    }
    return _index_names
Return a dictionary mapping index keys to their names on the database.
Returns
- A dictionary of index keys to column names.
def get_id(self, **kw: Any) -> Union[int, None]:
    """
    Fetch a pipe's ID from its instance connector.
    Temporary pipes are never registered, so return `None` for them,
    as well as for pipes which do not exist.
    """
    if self.temporary:
        return None

    from meerschaum.utils.venv import Venv
    from meerschaum.connectors import get_connector_plugin

    connector = self.instance_connector
    with Venv(get_connector_plugin(connector)):
        return connector.get_pipe_id(self, **kw)
Fetch a pipe's ID from its instance connector.
If the pipe does not exist, return None
.
@property
def id(self) -> Union[int, None]:
    """
    Fetch and cache a pipe's ID.
    A falsy cached value (e.g. `None`) triggers a re-fetch on the next access.
    """
    cached = self.__dict__.get('_id', None)
    if not cached:
        self._id = self.get_id()
    return self._id
Fetch and cache a pipe's ID.
def get_val_column(self, debug: bool = False) -> Union[str, None]:
    """
    Return the name of the value column if it's defined, otherwise make an educated guess.
    If not set in the `columns` dictionary, return the first numeric column that is not
    an ID or datetime column.
    If none may be found, return `None`.

    Parameters
    ----------
    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    Either a string or `None`.
    """
    from meerschaum.utils.debug import dprint
    if debug:
        dprint('Attempting to determine the value column...')

    ### An explicitly defined value column takes precedence over any guess.
    try:
        val_name = self.get_columns('value')
    except Exception:
        val_name = None
    if val_name is not None:
        if debug:
            dprint(f"Value column: {val_name}")
        return val_name

    cols = self.columns
    if cols is None:
        if debug:
            dprint('No columns could be determined. Returning...')
        return None

    try:
        dt_name = self.get_columns('datetime', error=False)
    except Exception:
        dt_name = None
    ### BUGFIX: previously passed the invalid keyword `errors=False` (the parameter is
    ### `error`), which raised a silent TypeError so `id_name` was always `None` and
    ### the ID column was never excluded from the candidate pool below.
    try:
        id_name = self.get_columns('id', error=False)
    except Exception:
        id_name = None

    if debug:
        dprint(f"dt_name: {dt_name}")
        dprint(f"id_name: {id_name}")

    cols_types = self.get_columns_types(debug=debug)
    if cols_types is None:
        return None
    if debug:
        dprint(f"cols_types: {cols_types}")

    ### Exclude the datetime and ID columns from consideration.
    if dt_name is not None:
        cols_types.pop(dt_name, None)
    if id_name is not None:
        cols_types.pop(id_name, None)

    ### Collect at most one column per numeric type keyword.
    candidates = []
    candidate_keywords = {'float', 'double', 'precision', 'int', 'numeric',}
    for search_term in candidate_keywords:
        for col, typ in cols_types.items():
            if search_term in typ.lower():
                candidates.append(col)
                break

    if not candidates:
        if debug:
            dprint("No value column could be determined.")
        return None

    return candidates[0]
Return the name of the value column if it's defined, otherwise make an educated guess.
If not set in the columns
dictionary, return the first numeric column that is not
an ID or datetime column.
If none may be found, return None
.
Parameters
- debug (bool, default False): Verbosity toggle.
Returns
- Either a string or
None
.
@property
def parents(self) -> 'List[meerschaum.Pipe]':
    """
    Return a list of `meerschaum.Pipe` objects to be designated as parents.
    Keys which fail to build a pipe are skipped with a warning.
    """
    if 'parents' not in self.parameters:
        return []
    from meerschaum.utils.warnings import warn
    _parents_keys = self.parameters['parents']
    if not isinstance(_parents_keys, list):
        warn(
            f"Please ensure the parents for {self} are defined as a list of keys.",
            stacklevel = 4
        )
        return []
    from meerschaum import Pipe
    built = []
    for keys in _parents_keys:
        try:
            built.append(Pipe(**keys))
        except Exception as e:
            warn(f"Unable to build parent with keys '{keys}' for {self}:\n{e}")
    return built
Return a list of meerschaum.Pipe
objects to be designated as parents.
@property
def children(self) -> 'List[meerschaum.Pipe]':
    """
    Return a list of `meerschaum.Pipe` objects to be designated as children.
    Keys which fail to build a pipe are skipped with a warning.
    """
    if 'children' not in self.parameters:
        return []
    from meerschaum.utils.warnings import warn
    _children_keys = self.parameters['children']
    if not isinstance(_children_keys, list):
        warn(
            f"Please ensure the children for {self} are defined as a list of keys.",
            stacklevel = 4
        )
        return []
    from meerschaum import Pipe
    _children = []
    for keys in _children_keys:
        try:
            p = Pipe(**keys)
        except Exception as e:
            ### BUGFIX: this warning was copy-pasted from `parents` and said "parent".
            warn(f"Unable to build child with keys '{keys}' for {self}:\n{e}")
            continue
        _children.append(p)
    return _children
Return a list of meerschaum.Pipe
objects to be designated as children.
@property
def target(self) -> str:
    """
    The target table name.
    You can set the target name under one of the following keys
    (checked in this order):
    - `target`
    - `target_name`
    - `target_table`
    - `target_table_name`
    """
    ### Only derive a target when one isn't already set under 'target'.
    if 'target' not in self.parameters:
        ### Start from the legacy default, then check the alternate keys in priority order.
        target = self._target_legacy()
        potential_keys = ('target_name', 'target_table', 'target_table_name')
        for k in potential_keys:
            if k in self.parameters:
                target = self.parameters[k]
                break

        ### SQL flavors may cap identifier lengths, so truncate if necessary.
        if self.instance_connector.type == 'sql':
            from meerschaum.utils.sql import truncate_item_name
            truncated_target = truncate_item_name(target, self.instance_connector.flavor)
            if truncated_target != target:
                ### NOTE(review): `warn` is presumably imported at module level — not
                ### visible in this chunk; confirm.
                warn(
                    f"The target '{target}' is too long for '{self.instance_connector.flavor}', "
                    + f"will use {truncated_target} instead."
                )
                target = truncated_target

        ### Persist the derived name via the `target` setter (defined elsewhere),
        ### which writes it into `self.parameters['target']`.
        self.target = target
    return self.parameters['target']
The target table name. You can set the target name under one of the following keys (checked in this order):
target
target_name
target_table
target_table_name
def guess_datetime(self) -> Union[str, None]:
    """
    Try to determine a pipe's datetime column.
    Return the first column whose dtype is datetime-like, or `None`.
    """
    known_dtypes = self.dtypes

    ### An explicit `None` under 'datetime' means the user disallows a datetime index.
    if 'datetime' in known_dtypes and known_dtypes['datetime'] is None:
        return None

    from meerschaum.utils.dtypes import are_dtypes_equal
    for col, typ in known_dtypes.items():
        if are_dtypes_equal(typ, 'datetime'):
            return col
    return None
Try to determine a pipe's datetime column.
def show(
    self,
    nopretty: bool = False,
    debug: bool = False,
    **kw
) -> SuccessTuple:
    """
    Show attributes of a Pipe.

    Parameters
    ----------
    nopretty: bool, default False
        If `True`, simply print the JSON of the pipe's attributes.

    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    A `SuccessTuple` of success, message.
    """
    import json
    from meerschaum.utils.formatting import (
        pprint, make_header, ANSI, highlight_pipes, fill_ansi, get_console,
    )
    from meerschaum.utils.packages import import_rich, attempt_import
    from meerschaum.utils.warnings import info

    attributes_json = json.dumps(self.attributes)

    ### Raw JSON output for scripting.
    if nopretty:
        print(attributes_json)
        return True, "Success"

    header = f"Attributes for {self}:"
    if not ANSI:
        print(header)
        return True, "Success"

    ### ANSI terminal: colorized header plus rich-rendered JSON.
    print(fill_ansi(highlight_pipes(make_header(header)), 'magenta'))
    rich = import_rich()
    rich_json = attempt_import('rich.json')
    get_console().print(rich_json.JSON(attributes_json))
    return True, "Success"
Show attributes of a Pipe.
Parameters
- nopretty (bool, default False):
If
True
, simply print the JSON of the pipe's attributes. - debug (bool, default False): Verbosity toggle.
Returns
- A
SuccessTuple
of success, message.
def edit(
    self,
    patch: bool = False,
    interactive: bool = False,
    debug: bool = False,
    **kw: Any
) -> SuccessTuple:
    """
    Edit a Pipe's configuration.

    Parameters
    ----------
    patch: bool, default False
        If `patch` is True, update parameters by cascading rather than overwriting.
    interactive: bool, default False
        If `True`, open an editor for the user to make changes to the pipe's YAML file.
    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    A `SuccessTuple` of success, message.

    """
    from meerschaum.utils.venv import Venv
    from meerschaum.connectors import get_connector_plugin

    ### Temporary pipes are never registered and may not be edited.
    if self.temporary:
        return False, "Cannot edit pipes created with `temporary=True` (read-only)."

    ### Non-interactive: push the in-memory parameters straight to the instance connector.
    if not interactive:
        with Venv(get_connector_plugin(self.instance_connector)):
            return self.instance_connector.edit_pipe(self, patch=patch, debug=debug, **kw)

    from meerschaum.config._paths import PIPES_CACHE_RESOURCES_PATH
    from meerschaum.utils.misc import edit_file
    parameters_filename = str(self) + '.yaml'
    parameters_path = PIPES_CACHE_RESOURCES_PATH / parameters_filename

    from meerschaum.utils.yaml import yaml

    ### Build a banner comment for the top of the YAML file.
    edit_text = f"Edit the parameters for {self}"
    edit_top = '#' * (len(edit_text) + 4)
    edit_header = edit_top + f'\n# {edit_text} #\n' + edit_top + '\n\n'

    ### Merge the configured default parameters with this pipe's parameters.
    from meerschaum.config import get_config
    parameters = dict(get_config('pipes', 'parameters', patch=True))
    from meerschaum.config._patch import apply_patch_to_config
    parameters = apply_patch_to_config(parameters, self.parameters)

    ### write parameters to yaml file
    with open(parameters_path, 'w+') as f:
        f.write(edit_header)
        yaml.dump(parameters, stream=f, sort_keys=False)

    ### only quit editing if yaml is valid
    editing = True
    while editing:
        edit_file(parameters_path)
        try:
            with open(parameters_path, 'r') as f:
                file_parameters = yaml.load(f.read())
        except Exception as e:
            from meerschaum.utils.warnings import warn
            warn(f"Invalid format defined for '{self}':\n\n{e}")
            input(f"Press [Enter] to correct the configuration for '{self}': ")
        else:
            editing = False

    ### `file_parameters` is guaranteed bound here: the loop only exits via the
    ### `else` branch after a successful parse.
    self.parameters = file_parameters

    if debug:
        from meerschaum.utils.formatting import pprint
        pprint(self.parameters)

    ### Persist the edited parameters to the instance connector.
    with Venv(get_connector_plugin(self.instance_connector)):
        return self.instance_connector.edit_pipe(self, patch=patch, debug=debug, **kw)
Edit a Pipe's configuration.
Parameters
- patch (bool, default False):
If
patch
is True, update parameters by cascading rather than overwriting. - interactive (bool, default False):
If
True
, open an editor for the user to make changes to the pipe's YAML file. - debug (bool, default False): Verbosity toggle.
Returns
- A
SuccessTuple
of success, message.
def edit_definition(
    self,
    yes: bool = False,
    noask: bool = False,
    force: bool = False,
    debug : bool = False,
    **kw : Any
) -> SuccessTuple:
    """
    Edit a pipe's definition file and update its configuration.
    **NOTE:** This function is interactive and should not be used in automated scripts!

    Returns
    -------
    A `SuccessTuple` of success, message.

    """
    ### Temporary pipes are never registered and may not be edited.
    if self.temporary:
        return False, "Cannot edit pipes created with `temporary=True` (read-only)."

    ### Fall back to the generic YAML editor for connectors without a
    ### dedicated definition format.
    from meerschaum.connectors import instance_types
    if (self.connector is None) or self.connector.type not in instance_types:
        return self.edit(interactive=True, debug=debug, **kw)

    import json
    from meerschaum.utils.warnings import info, warn
    from meerschaum.utils.debug import dprint
    from meerschaum.config._patch import apply_patch_to_config
    from meerschaum.utils.misc import edit_file

    _parameters = self.parameters
    if 'fetch' not in _parameters:
        _parameters['fetch'] = {}

    ### Interactive editor for 'api' connectors: prompt for the source pipe's keys
    ### and optional filter parameters.
    def _edit_api():
        from meerschaum.utils.prompt import prompt, yes_no
        info(
            f"Please enter the keys of the source pipe from '{self.connector}'.\n" +
            "Type 'None' for None, or empty when there is no default. Press [CTRL+C] to skip."
        )

        ### Pre-populate defaults from the existing fetch configuration.
        _keys = { 'connector_keys' : None, 'metric_key' : None, 'location_key' : None }
        for k in _keys:
            _keys[k] = _parameters['fetch'].get(k, None)

        for k, v in _keys.items():
            try:
                _keys[k] = prompt(k.capitalize().replace('_', ' ') + ':', icon=True, default=v)
            except KeyboardInterrupt:
                continue
            if _keys[k] in ('', 'None', '\'None\'', '[None]'):
                _keys[k] = None

        _parameters['fetch'] = apply_patch_to_config(_parameters['fetch'], _keys)

        info("You may optionally specify additional filter parameters as JSON.")
        print(" Parameters are translated into a 'WHERE x AND y' clause, and lists are IN clauses.")
        print(" For example, the following JSON would correspond to 'WHERE x = 1 AND y IN (2, 3)':")
        print(json.dumps({'x': 1, 'y': [2, 3]}, indent=2, separators=(',', ': ')))
        if force or yes_no(
            "Would you like to add additional filter parameters?",
            yes=yes, noask=noask
        ):
            from meerschaum.config._paths import PIPES_CACHE_RESOURCES_PATH
            definition_filename = str(self) + '.json'
            definition_path = PIPES_CACHE_RESOURCES_PATH / definition_filename
            ### Seed the scratch file with the current params for editing.
            try:
                definition_path.touch()
                with open(definition_path, 'w+') as f:
                    json.dump(_parameters.get('fetch', {}).get('params', {}), f, indent=2)
            except Exception as e:
                return False, f"Failed writing file '{definition_path}':\n" + str(e)

            ### Loop until the edited file parses as JSON or the user gives up.
            _params = None
            while True:
                edit_file(definition_path)
                try:
                    with open(definition_path, 'r') as f:
                        _params = json.load(f)
                except Exception as e:
                    warn(f'Failed to read parameters JSON:\n{e}', stack=False)
                    if force or yes_no(
                        "Would you like to try again?\n "
                        + "If not, the parameters JSON file will be ignored.",
                        noask=noask, yes=yes
                    ):
                        continue
                    _params = None
                break
            if _params is not None:
                if 'fetch' not in _parameters:
                    _parameters['fetch'] = {}
                _parameters['fetch']['params'] = _params

        self.parameters = _parameters
        return True, "Success"

    ### Interactive editor for 'sql' connectors: edit the SQL definition in a scratch file.
    def _edit_sql():
        import pathlib, os, textwrap
        from meerschaum.config._paths import PIPES_CACHE_RESOURCES_PATH
        from meerschaum.utils.misc import edit_file
        definition_filename = str(self) + '.sql'
        definition_path = PIPES_CACHE_RESOURCES_PATH / definition_filename

        sql_definition = _parameters['fetch'].get('definition', None)
        if sql_definition is None:
            sql_definition = ''
        sql_definition = textwrap.dedent(sql_definition).lstrip()

        try:
            definition_path.touch()
            with open(definition_path, 'w+') as f:
                f.write(sql_definition)
        except Exception as e:
            return False, f"Failed writing file '{definition_path}':\n" + str(e)

        edit_file(definition_path)
        try:
            with open(definition_path, 'r') as f:
                file_definition = f.read()
        except Exception as e:
            return False, f"Failed reading file '{definition_path}':\n" + str(e)

        if sql_definition == file_definition:
            return False, f"No changes made to definition for {self}."

        ### Sanity check: a real SQL statement contains at least one space.
        if ' ' not in file_definition:
            return False, f"Invalid SQL definition for {self}."

        if debug:
            dprint("Read SQL definition:\n\n" + file_definition)
        _parameters['fetch']['definition'] = file_definition
        self.parameters = _parameters
        return True, "Success"

    ### Dispatch to `_edit_api` or `_edit_sql` by connector type.
    ### NOTE(review): the SuccessTuple returned by the helper is discarded here,
    ### so a failed/aborted edit still proceeds to `self.edit()` — confirm intended.
    locals()['_edit_' + str(self.connector.type)]()
    return self.edit(interactive=False, debug=debug, **kw)
Edit a pipe's definition file and update its configuration. NOTE: This function is interactive and should not be used in automated scripts!
Returns
- A
SuccessTuple
of success, message.
def update(self, *args, **kw) -> 'SuccessTuple':
    """
    Update a pipe's parameters in its instance.

    This is a thin alias for `Pipe.edit()` with `interactive=False` forced.
    """
    kw.update({'interactive': False})
    return self.edit(*args, **kw)
Update a pipe's parameters in its instance.
def sync(
    self,
    df: Union[
        pd.DataFrame,
        Dict[str, List[Any]],
        List[Dict[str, Any]],
        InferFetch
    ] = InferFetch,
    begin: Union[datetime, int, str, None] = '',
    end: Union[datetime, int, None] = None,
    force: bool = False,
    retries: int = 10,
    min_seconds: int = 1,
    check_existing: bool = True,
    blocking: bool = True,
    workers: Optional[int] = None,
    callback: Optional[Callable[[Tuple[bool, str]], Any]] = None,
    error_callback: Optional[Callable[[Exception], Any]] = None,
    chunksize: Optional[int] = -1,
    sync_chunks: bool = True,
    debug: bool = False,
    _inplace: bool = True,
    **kw: Any
) -> SuccessTuple:
    """
    Fetch new data from the source and update the pipe's table with new data.

    Get new remote data via fetch, get existing data in the same time period,
    and merge the two, only keeping the unseen data.

    Parameters
    ----------
    df: Union[None, pd.DataFrame, Dict[str, List[Any]]], default None
        An optional DataFrame to sync into the pipe. Defaults to `None`.

    begin: Union[datetime, int, str, None], default ''
        Optionally specify the earliest datetime to search for data.

    end: Union[datetime, int, str, None], default None
        Optionally specify the latest datetime to search for data.

    force: bool, default False
        If `True`, keep trying to sync until `retries` attempts.

    retries: int, default 10
        If `force`, how many attempts to try syncing before declaring failure.

    min_seconds: Union[int, float], default 1
        If `force`, how many seconds to sleep between retries. Defaults to `1`.

    check_existing: bool, default True
        If `True`, pull and diff with existing data from the pipe.

    blocking: bool, default True
        If `True`, wait for sync to finish and return its result, otherwise
        asynchronously sync and return success. Defaults to `True`.
        Only intended for specific scenarios.

    workers: Optional[int], default None
        If provided and the instance connector is thread-safe
        (`pipe.instance_connector.IS_THREAD_SAFE is True`),
        limit concurrent sync to this many threads.

    callback: Optional[Callable[[Tuple[bool, str]], Any]], default None
        Callback function which expects a SuccessTuple as input.
        Only applies when `blocking=False`.

    error_callback: Optional[Callable[[Exception], Any]], default None
        Callback function which expects an Exception as input.
        Only applies when `blocking=False`.

    chunksize: int, default -1
        Specify the number of rows to sync per chunk.
        If `-1`, resort to system configuration (default is `900`).
        A `chunksize` of `None` will sync all rows in one transaction.

    sync_chunks: bool, default True
        If possible, sync chunks while fetching them into memory.

    debug: bool, default False
        Verbosity toggle. Defaults to False.

    Returns
    -------
    A `SuccessTuple` of success (`bool`) and message (`str`).
    """
    from meerschaum.utils.debug import dprint, _checkpoint
    from meerschaum.connectors import custom_types
    from meerschaum.plugins import Plugin
    from meerschaum.utils.formatting import get_console
    from meerschaum.utils.venv import Venv
    from meerschaum.connectors import get_connector_plugin
    from meerschaum.utils.misc import df_is_chunk_generator, filter_keywords, filter_arguments
    from meerschaum.utils.pool import get_pool
    from meerschaum.config import get_config

    if (callback is not None or error_callback is not None) and blocking:
        warn("Callback functions are only executed when blocking = False. Ignoring...")

    _checkpoint(_total=2, **kw)

    ### A chunksize of 0 means "no chunking at all".
    if chunksize == 0:
        chunksize = None
        sync_chunks = False

    ### Pack the explicit arguments back into `kw` so they propagate
    ### through recursive `_sync()` calls and connector methods.
    kw.update({
        'begin': begin,
        'end': end,
        'force': force,
        'retries': retries,
        'min_seconds': min_seconds,
        'check_existing': check_existing,
        'blocking': blocking,
        'workers': workers,
        'callback': callback,
        'error_callback': error_callback,
        'sync_chunks': sync_chunks,
        'chunksize': chunksize,
    })

    ### NOTE: Invalidate `_exists` cache before and after syncing.
    self._exists = None

    def _sync(
        p: 'meerschaum.Pipe',
        df: Union[
            'pd.DataFrame',
            Dict[str, List[Any]],
            List[Dict[str, Any]],
            InferFetch
        ] = InferFetch,
    ) -> SuccessTuple:
        ### `None` is an explicit error; omit `df` entirely to infer fetching.
        if df is None:
            p._exists = None
            return (
                False,
                f"You passed `None` instead of data into `sync()` for {p}.\n"
                + "Omit the DataFrame to infer fetching.",
            )
        ### Ensure that Pipe is registered.
        if not p.temporary and p.get_id(debug=debug) is None:
            ### NOTE: This may trigger an interactive session for plugins!
            register_success, register_msg = p.register(debug=debug)
            if not register_success:
                if 'already' not in register_msg:
                    p._exists = None
                    return register_success, register_msg

        ### If connector is a plugin with a `sync()` method, return that instead.
        ### If the plugin does not have a `sync()` method but does have a `fetch()` method,
        ### use that instead.
        ### NOTE: The DataFrame must be omitted for the plugin sync method to apply.
        ### If a DataFrame is provided, continue as expected.
        if hasattr(df, 'MRSM_INFER_FETCH'):
            try:
                if p.connector is None:
                    if ':' not in p.connector_keys:
                        return True, f"{p} does not support fetching; nothing to do."

                    msg = f"{p} does not have a valid connector."
                    if p.connector_keys.startswith('plugin:'):
                        msg += f"\n Perhaps {p.connector_keys} has a syntax error?"
                    p._exists = None
                    return False, msg
            except Exception:
                p._exists = None
                return False, f"Unable to create the connector for {p}."

        ### Sync in place if this is a SQL pipe.
        if (
            str(self.connector) == str(self.instance_connector)
            and
            hasattr(self.instance_connector, 'sync_pipe_inplace')
            and
            _inplace
            and
            get_config('system', 'experimental', 'inplace_sync')
        ):
            with Venv(get_connector_plugin(self.instance_connector)):
                p._exists = None
                _args, _kwargs = filter_arguments(
                    p.instance_connector.sync_pipe_inplace,
                    p,
                    debug=debug,
                    **kw
                )
                return self.instance_connector.sync_pipe_inplace(
                    *_args,
                    **_kwargs
                )

        ### Activate and invoke `sync(pipe)` for plugin connectors with `sync` methods.
        try:
            if getattr(p.connector, 'sync', None) is not None:
                with Venv(get_connector_plugin(p.connector), debug=debug):
                    _args, _kwargs = filter_arguments(
                        p.connector.sync,
                        p,
                        debug=debug,
                        **kw
                    )
                    return_tuple = p.connector.sync(*_args, **_kwargs)
                p._exists = None
                if not isinstance(return_tuple, tuple):
                    return_tuple = (
                        False,
                        f"Plugin '{p.connector.label}' returned non-tuple value: {return_tuple}"
                    )
                return return_tuple

        except Exception as e:
            get_console().print_exception()
            msg = f"Failed to sync {p} with exception: '" + str(e) + "'"
            if debug:
                ### NOTE(review): `error` is presumably imported at module level — not
                ### visible in this chunk; confirm.
                error(msg, silent=False)
            p._exists = None
            return False, msg

        ### Fetch the dataframe from the connector's `fetch()` method.
        try:
            with Venv(get_connector_plugin(p.connector), debug=debug):
                df = p.fetch(
                    **filter_keywords(
                        p.fetch,
                        debug=debug,
                        **kw
                    )
                )
        except Exception as e:
            get_console().print_exception(
                suppress=[
                    'meerschaum/core/Pipe/_sync.py',
                    'meerschaum/core/Pipe/_fetch.py',
                ]
            )
            msg = f"Failed to fetch data from {p.connector}:\n {e}"
            df = None

        if df is None:
            p._exists = None
            return False, f"No data were fetched for {p}."

        if isinstance(df, list):
            if len(df) == 0:
                return True, f"No new rows were returned for {p}."

            ### May be a chunk hook results list.
            if isinstance(df[0], tuple):
                success = all([_success for _success, _ in df])
                message = '\n'.join([_message for _, _message in df])
                return success, message

        ### TODO: Depreciate async?
        if df is True:
            p._exists = None
            return True, f"{p} is being synced in parallel."

        ### CHECKPOINT: Retrieved the DataFrame.
        _checkpoint(**kw)

        ### Allow for dataframe generators or iterables.
        if df_is_chunk_generator(df):
            kw['workers'] = p.get_num_workers(kw.get('workers', None))
            dt_col = p.columns.get('datetime', None)
            pool = get_pool(workers=kw.get('workers', 1))
            if debug:
                dprint(f"Received {type(df)}. Attempting to sync first chunk...")

            try:
                chunk = next(df)
            except StopIteration:
                return True, "Received an empty generator; nothing to do."

            ### Sync the first chunk synchronously to fail fast before fanning out.
            chunk_success, chunk_msg = _sync(p, chunk)
            chunk_msg = '\n' + self._get_chunk_label(chunk, dt_col) + '\n' + chunk_msg
            if not chunk_success:
                return chunk_success, f"Unable to sync initial chunk for {p}:\n{chunk_msg}"
            if debug:
                dprint("Successfully synced the first chunk, attemping the rest...")

            failed_chunks = []
            def _process_chunk(_chunk):
                ### Records failures in `failed_chunks` (closure) for a later retry pass.
                try:
                    _chunk_success, _chunk_msg = _sync(p, _chunk)
                except Exception as e:
                    _chunk_success, _chunk_msg = False, str(e)
                if not _chunk_success:
                    failed_chunks.append(_chunk)
                return (
                    _chunk_success,
                    (
                        '\n'
                        + self._get_chunk_label(_chunk, dt_col)
                        + '\n'
                        + _chunk_msg
                    )
                )

            ### Nested generators (chunks of chunks) are processed serially;
            ### flat generators are fanned out over the pool.
            results = sorted(
                [(chunk_success, chunk_msg)] + (
                    list(pool.imap(_process_chunk, df))
                    if not df_is_chunk_generator(chunk)
                    else [
                        _process_chunk(_child_chunks)
                        for _child_chunks in df
                    ]
                )
            )
            chunk_messages = [chunk_msg for _, chunk_msg in results]
            success_bools = [chunk_success for chunk_success, _ in results]
            success = all(success_bools)
            msg = '\n'.join(chunk_messages)

            ### If some chunks succeeded, retry the failures.
            retry_success = True
            if not success and any(success_bools):
                if debug:
                    dprint("Retrying failed chunks...")
                chunks_to_retry = [c for c in failed_chunks]
                failed_chunks = []
                for chunk in chunks_to_retry:
                    chunk_success, chunk_msg = _process_chunk(chunk)
                    msg += f"\n\nRetried chunk:\n{chunk_msg}\n"
                    retry_success = retry_success and chunk_success

            success = success and retry_success
            return success, msg

        ### Cast to a dataframe and ensure datatypes are what we expect.
        df = self.enforce_dtypes(df, chunksize=chunksize, debug=debug)

        ### Capture `numeric`, `uuid`, and `json` columns.
        self._persist_new_json_columns(df, debug=debug)
        self._persist_new_numeric_columns(df, debug=debug)
        self._persist_new_uuid_columns(df, debug=debug)

        if debug:
            dprint(
                "DataFrame to sync:\n"
                + (
                    str(df)[:255]
                    + '...'
                    if len(str(df)) >= 256
                    else str(df)
                ),
                **kw
            )

        ### if force, continue to sync until success
        return_tuple = False, f"Did not sync {p}."
        run = True
        _retries = 1
        while run:
            with Venv(get_connector_plugin(self.instance_connector)):
                return_tuple = p.instance_connector.sync_pipe(
                    pipe=p,
                    df=df,
                    debug=debug,
                    **kw
                )
            _retries += 1
            run = (not return_tuple[0]) and force and _retries <= retries
            if run and debug:
                dprint(f"Syncing failed for {p}. Attempt ( {_retries} / {retries} )", **kw)
                dprint(f"Sleeping for {min_seconds} seconds...", **kw)
                ### NOTE(review): sleeping only when `debug` is set looks suspicious —
                ### the retry backoff presumably should apply whenever `run` is True; confirm.
                time.sleep(min_seconds)
            if _retries > retries:
                warn(
                    f"Unable to sync {p} within {retries} attempt" +
                    ("s" if retries != 1 else "") + "!"
                )

        ### CHECKPOINT: Finished syncing. Handle caching.
        _checkpoint(**kw)
        if self.cache_pipe is not None:
            if debug:
                dprint("Caching retrieved dataframe.", **kw)
            _sync_cache_tuple = self.cache_pipe.sync(df, debug=debug, **kw)
            if not _sync_cache_tuple[0]:
                warn(f"Failed to sync local cache for {self}.")

        self._exists = None
        return return_tuple

    ### Blocking mode: run the sync inline and return its result.
    if blocking:
        self._exists = None
        return _sync(self, df = df)

    ### Non-blocking mode: spawn a thread and return immediately.
    from meerschaum.utils.threading import Thread
    def default_callback(result_tuple: SuccessTuple):
        dprint(f"Asynchronous result from {self}: {result_tuple}", **kw)

    def default_error_callback(x: Exception):
        dprint(f"Error received for {self}: {x}", **kw)

    if callback is None and debug:
        callback = default_callback
    if error_callback is None and debug:
        error_callback = default_error_callback
    try:
        thread = Thread(
            target=_sync,
            args=(self,),
            kwargs={'df': df},
            daemon=False,
            callback=callback,
            error_callback=error_callback,
        )
        thread.start()
    except Exception as e:
        self._exists = None
        return False, str(e)

    self._exists = None
    return True, f"Spawned asyncronous sync for {self}."
Fetch new data from the source and update the pipe's table with new data.
Get new remote data via fetch, get existing data in the same time period, and merge the two, only keeping the unseen data.
Parameters
- df (Union[None, pd.DataFrame, Dict[str, List[Any]]], default None):
An optional DataFrame to sync into the pipe. Defaults to
None
. - begin (Union[datetime, int, str, None], default ''): Optionally specify the earliest datetime to search for data.
- end (Union[datetime, int, str, None], default None): Optionally specify the latest datetime to search for data.
- force (bool, default False): If `True`, keep trying to sync until `retries`
attempts. - retries (int, default 10):
If
force
, how many attempts to try syncing before declaring failure. - min_seconds (Union[int, float], default 1):
If
force
, how many seconds to sleep between retries. Defaults to1
. - check_existing (bool, default True):
If
True
, pull and diff with existing data from the pipe. - blocking (bool, default True):
If
True
, wait for sync to finish and return its result, otherwise asynchronously sync (oxymoron?) and return success. Defaults to `True`
. Only intended for specific scenarios. - workers (Optional[int], default None):
If provided and the instance connector is thread-safe
(
pipe.instance_connector.IS_THREAD_SAFE is True
), limit concurrent sync to this many threads. - callback (Optional[Callable[[Tuple[bool, str]], Any]], default None):
Callback function which expects a SuccessTuple as input.
Only applies when
blocking=False
. - error_callback (Optional[Callable[[Exception], Any]], default None):
Callback function which expects an Exception as input.
Only applies when
blocking=False
. - chunksize (int, default -1):
Specify the number of rows to sync per chunk.
If
-1
, resort to system configuration (default is900
). Achunksize
ofNone
will sync all rows in one transaction. - sync_chunks (bool, default True): If possible, sync chunks while fetching them into memory.
- debug (bool, default False): Verbosity toggle. Defaults to False.
Returns
- A
SuccessTuple
of success (bool
) and message (str
).
def get_sync_time(
    self,
    params: Optional[Dict[str, Any]] = None,
    newest: bool = True,
    apply_backtrack_interval: bool = False,
    round_down: bool = False,
    debug: bool = False
) -> Union['datetime', None]:
    """
    Return the most recent (or oldest) datetime value for a Pipe.

    Parameters
    ----------
    params: Optional[Dict[str, Any]], default None
        Dictionary to build a WHERE clause for a specific column.
        See `meerschaum.utils.sql.build_where`.

    newest: bool, default True
        If `True`, get the most recent datetime (honoring `params`).
        If `False`, get the oldest datetime (`ASC` instead of `DESC`).

    apply_backtrack_interval: bool, default False
        If `True`, subtract the backtrack interval from the sync time.

    round_down: bool, default False
        If `True`, round down the datetime value to the nearest minute.

    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    A `datetime` object if the pipe exists, otherwise `None`.
    """
    from meerschaum.utils.venv import Venv
    from meerschaum.connectors import get_connector_plugin
    from meerschaum.utils.misc import round_time

    ### Delegate to the instance connector inside its plugin's virtual environment.
    with Venv(get_connector_plugin(self.instance_connector)):
        result = self.instance_connector.get_sync_time(
            self,
            params=params,
            newest=newest,
            debug=debug,
        )

    ### Rounding only applies to datetime axes (integer axes pass through untouched).
    if round_down and isinstance(result, datetime):
        result = round_time(result, timedelta(minutes=1))

    if not apply_backtrack_interval or result is None:
        return result

    ### Subtracting may fail (e.g. type mismatch); warn and fall back to the raw value.
    backtrack_interval = self.get_backtrack_interval(debug=debug)
    try:
        result -= backtrack_interval
    except Exception as e:
        warn(f"Failed to apply backtrack interval:\n{e}")

    return result
Get the most recent datetime value for a Pipe.
Parameters
- params (Optional[Dict[str, Any]], default None):
Dictionary to build a WHERE clause for a specific column.
See
meerschaum.utils.sql.build_where
. - newest (bool, default True):
If
True
, get the most recent datetime (honoringparams
). IfFalse
, get the oldest datetime (ASC
instead ofDESC
). - apply_backtrack_interval (bool, default False):
If
True
, subtract the backtrack interval from the sync time. - round_down (bool, default False):
If
True
, round down the datetime value to the nearest minute. - debug (bool, default False): Verbosity toggle.
Returns
- A
datetime
object if the pipe exists, otherwiseNone
.
def exists(
    self,
    debug: bool = False
) -> bool:
    """
    See if a Pipe's table exists.

    Parameters
    ----------
    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    A `bool` corresponding to whether a pipe's underlying table exists.
    """
    import time
    from meerschaum.utils.venv import Venv
    from meerschaum.connectors import get_connector_plugin
    from meerschaum.config import STATIC_CONFIG
    from meerschaum.utils.debug import dprint

    now = time.perf_counter()
    timeout_seconds = STATIC_CONFIG['pipes']['exists_timeout_seconds']

    ### Serve from the cache only while a positive (truthy) result is still fresh;
    ### a cached `False` always falls through to a real check.
    cached_exists = self.__dict__.get('_exists', None)
    if cached_exists:
        cached_at = self.__dict__.get('_exists_timestamp', None)
        if cached_at is not None:
            delta = now - cached_at
            if delta < timeout_seconds:
                if debug:
                    dprint(f"Returning cached `exists` for {self} ({round(delta, 2)} seconds old).")
                return cached_exists

    with Venv(get_connector_plugin(self.instance_connector)):
        table_exists = self.instance_connector.pipe_exists(pipe=self, debug=debug)

    ### Refresh the cache (stored on __dict__ to avoid property machinery).
    self.__dict__['_exists'] = table_exists
    self.__dict__['_exists_timestamp'] = now
    return table_exists
See if a Pipe's table exists.
Parameters
- debug (bool, default False): Verbosity toggle.
Returns
- A
bool
corresponding to whether a pipe's underlying table exists.
def filter_existing(
    self,
    df: 'pd.DataFrame',
    safe_copy: bool = True,
    date_bound_only: bool = False,
    include_unchanged_columns: bool = False,
    chunksize: Optional[int] = -1,
    debug: bool = False,
    **kw
) -> Tuple['pd.DataFrame', 'pd.DataFrame', 'pd.DataFrame']:
    """
    Inspect a dataframe and filter out rows which already exist in the pipe.

    Parameters
    ----------
    df: 'pd.DataFrame'
        The dataframe to inspect and filter.

    safe_copy: bool, default True
        If `True`, create a copy before comparing and modifying the dataframes.
        Setting to `False` may mutate the DataFrames.
        See `meerschaum.utils.dataframe.filter_unseen_df`.

    date_bound_only: bool, default False
        If `True`, only use the datetime index to fetch the sample dataframe.

    include_unchanged_columns: bool, default False
        If `True`, include the backtrack columns which haven't changed in the update dataframe.
        This is useful if you can't update individual keys.

    chunksize: Optional[int], default -1
        The `chunksize` used when fetching existing data.

    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    A tuple of three pandas DataFrames: unseen, update, and delta.
    """
    from meerschaum.utils.warnings import warn
    from meerschaum.utils.debug import dprint
    from meerschaum.utils.packages import attempt_import, import_pandas
    from meerschaum.utils.misc import round_time
    from meerschaum.utils.dataframe import (
        filter_unseen_df,
        add_missing_cols_to_df,
        get_unhashable_cols,
    )
    from meerschaum.utils.dtypes import (
        to_pandas_dtype,
        none_if_null,
    )
    from meerschaum.config import get_config
    pd = import_pandas()
    pandas = attempt_import('pandas')
    ### Coerce non-DataFrame inputs (dicts, lists of dicts) into a typed DataFrame.
    if 'dataframe' not in str(type(df)).lower():
        df = self.enforce_dtypes(df, chunksize=chunksize, debug=debug)
    is_dask = 'dask' in df.__module__
    if is_dask:
        dd = attempt_import('dask.dataframe')
        merge = dd.merge
        NA = pandas.NA
    else:
        merge = pd.merge
        NA = pd.NA

    def get_empty_df():
        ### Empty frame carrying the pipe's (and input's) column dtypes.
        empty_df = pd.DataFrame([])
        dtypes = dict(df.dtypes) if df is not None else {}
        dtypes.update(self.dtypes)
        pd_dtypes = {
            col: to_pandas_dtype(str(typ))
            for col, typ in dtypes.items()
        }
        return add_missing_cols_to_df(empty_df, pd_dtypes)

    if df is None:
        empty_df = get_empty_df()
        return empty_df, empty_df, empty_df

    if (df.empty if not is_dask else len(df) == 0):
        return df, df, df

    ### begin is the oldest data in the new dataframe
    begin, end = None, None
    dt_col = self.columns.get('datetime', None)
    dt_type = self.dtypes.get(dt_col, 'datetime64[ns]') if dt_col else None
    try:
        min_dt_val = df[dt_col].min(skipna=True) if dt_col else None
        if is_dask and min_dt_val is not None:
            min_dt_val = min_dt_val.compute()
        min_dt = (
            pandas.to_datetime(min_dt_val).to_pydatetime()
            if min_dt_val is not None and 'datetime' in str(dt_type)
            else min_dt_val
        )
    except Exception:
        min_dt = None
    ### Normalize invalid minimums (NaT, non-datetime, non-int) to None.
    if not ('datetime' in str(type(min_dt))) or str(min_dt) == 'NaT':
        if 'int' not in str(type(min_dt)).lower():
            min_dt = None

    if isinstance(min_dt, datetime):
        ### Pad one minute below so boundary rows are included in the backtrack fetch.
        begin = (
            round_time(
                min_dt,
                to='down'
            ) - timedelta(minutes=1)
        )
    elif dt_type and 'int' in dt_type.lower():
        begin = min_dt
    elif dt_col is None:
        begin = None

    ### end is the newest data in the new dataframe
    try:
        max_dt_val = df[dt_col].max(skipna=True) if dt_col else None
        if is_dask and max_dt_val is not None:
            max_dt_val = max_dt_val.compute()
        max_dt = (
            pandas.to_datetime(max_dt_val).to_pydatetime()
            if max_dt_val is not None and 'datetime' in str(dt_type)
            else max_dt_val
        )
    except Exception:
        import traceback
        traceback.print_exc()
        max_dt = None

    ### Fixed: this guard previously tested `str(min_dt) == 'NaT'` (copy-paste from the
    ### min block above); `min_dt` is already normalized by then, so the intended
    ### NaT check on `max_dt` was missing.
    if ('datetime' not in str(type(max_dt))) or str(max_dt) == 'NaT':
        if 'int' not in str(type(max_dt)).lower():
            max_dt = None

    if isinstance(max_dt, datetime):
        ### Pad one minute above; `end` bounds are exclusive downstream.
        end = (
            round_time(
                max_dt,
                to='down'
            ) + timedelta(minutes=1)
        )
    elif dt_type and 'int' in dt_type.lower():
        end = max_dt + 1

    if max_dt is not None and min_dt is not None and min_dt > max_dt:
        warn("Detected minimum datetime greater than maximum datetime.")

    if begin is not None and end is not None and begin > end:
        if isinstance(begin, datetime):
            begin = end - timedelta(minutes=1)
        ### We might be using integers for the datetime axis.
        else:
            begin = end - 1

    ### Restrict the backtrack fetch to the index values present in the new data,
    ### unless there are too many distinct values (see `filter_params_index_limit`).
    unique_index_vals = {
        col: df[col].unique()
        for col in self.columns
        if col in df.columns and col != dt_col
    } if not date_bound_only else {}
    filter_params_index_limit = get_config('pipes', 'sync', 'filter_params_index_limit')
    _ = kw.pop('params', None)
    params = {
        col: [
            none_if_null(val)
            for val in unique_vals
        ]
        for col, unique_vals in unique_index_vals.items()
        if len(unique_vals) <= filter_params_index_limit
    } if not date_bound_only else {}

    if debug:
        dprint(f"Looking at data between '{begin}' and '{end}':", **kw)

    backtrack_df = self.get_data(
        begin=begin,
        end=end,
        chunksize=chunksize,
        params=params,
        debug=debug,
        **kw
    )
    if backtrack_df is None:
        ### Nothing exists in this window: everything is unseen, nothing to update.
        if debug:
            dprint(f"No backtrack data was found for {self}.")
        return df, get_empty_df(), df

    if debug:
        dprint(f"Existing data for {self}:\n" + str(backtrack_df), **kw)
        dprint(f"Existing dtypes for {self}:\n" + str(backtrack_df.dtypes))

    ### Separate new rows from changed ones.
    on_cols = [
        col for col_key, col in self.columns.items()
        if (
            col
            and
            col_key != 'value'
            and col in backtrack_df.columns
        )
    ]
    self_dtypes = self.dtypes
    on_cols_dtypes = {
        col: to_pandas_dtype(typ)
        for col, typ in self_dtypes.items()
        if col in on_cols
    }

    ### Detect changes between the old target and new source dataframes.
    delta_df = add_missing_cols_to_df(
        filter_unseen_df(
            backtrack_df,
            df,
            dtypes={
                col: to_pandas_dtype(typ)
                for col, typ in self_dtypes.items()
            },
            safe_copy=safe_copy,
            debug=debug
        ),
        on_cols_dtypes,
    )

    ### Cast dicts or lists to strings so we can merge.
    ### NOTE: `default=str` stringifies any non-JSON value rather than raising.
    serializer = functools.partial(json.dumps, sort_keys=True, separators=(',', ':'), default=str)

    def deserializer(x):
        return json.loads(x) if isinstance(x, str) else x

    unhashable_delta_cols = get_unhashable_cols(delta_df)
    unhashable_backtrack_cols = get_unhashable_cols(backtrack_df)
    for col in unhashable_delta_cols:
        delta_df[col] = delta_df[col].apply(serializer)
    for col in unhashable_backtrack_cols:
        backtrack_df[col] = backtrack_df[col].apply(serializer)
    casted_cols = set(unhashable_delta_cols + unhashable_backtrack_cols)

    ### Left-join delta rows against existing rows; `_merge == 'left_only'` marks new rows.
    joined_df = merge(
        delta_df.infer_objects(copy=False).fillna(NA),
        backtrack_df.infer_objects(copy=False).fillna(NA),
        how='left',
        on=on_cols,
        indicator=True,
        suffixes=('', '_old'),
    ) if on_cols else delta_df
    for col in casted_cols:
        if col in joined_df.columns:
            joined_df[col] = joined_df[col].apply(deserializer)
        if col in delta_df.columns:
            delta_df[col] = delta_df[col].apply(deserializer)

    ### Determine which rows are completely new.
    new_rows_mask = (joined_df['_merge'] == 'left_only') if on_cols else None
    cols = list(delta_df.columns)

    unseen_df = (
        joined_df
        .where(new_rows_mask)
        .dropna(how='all')[cols]
        .reset_index(drop=True)
    ) if on_cols else delta_df

    ### Rows that have already been inserted but values have changed.
    update_df = (
        joined_df
        .where(~new_rows_mask)
        .dropna(how='all')[cols]
        .reset_index(drop=True)
    ) if on_cols else get_empty_df()

    if include_unchanged_columns and on_cols:
        unchanged_backtrack_cols = [
            col
            for col in backtrack_df.columns
            if col in on_cols or col not in update_df.columns
        ]
        update_df = merge(
            backtrack_df[unchanged_backtrack_cols],
            update_df,
            how='inner',
            on=on_cols,
        )

    return unseen_df, update_df, delta_df
Inspect a dataframe and filter out rows which already exist in the pipe.
Parameters
- df ('pd.DataFrame'): The dataframe to inspect and filter.
- safe_copy (bool, default True):
If
True
, create a copy before comparing and modifying the dataframes. Setting toFalse
may mutate the DataFrames. Seemeerschaum.utils.dataframe.filter_unseen_df
. - date_bound_only (bool, default False):
If
True
, only use the datetime index to fetch the sample dataframe. - include_unchanged_columns (bool, default False):
If
True
, include the backtrack columns which haven't changed in the update dataframe. This is useful if you can't update individual keys. - chunksize (Optional[int], default -1):
The
chunksize
used when fetching existing data. - debug (bool, default False): Verbosity toggle.
Returns
- A tuple of three pandas DataFrames: unseen, update, and delta.
def get_num_workers(self, workers: Optional[int] = None) -> int:
    """
    Get the number of workers to use for concurrent syncs.

    Parameters
    ----------
    workers: Optional[int], default None
        The number of workers passed via `--workers`.

    Returns
    -------
    The number of workers, capped for safety.
    """
    ### Non-thread-safe connectors must sync serially.
    is_thread_safe = getattr(self.instance_connector, 'IS_THREAD_SAFE', False)
    if not is_thread_safe:
        return 1

    ### SQL connectors are limited by the engine's connection pool size.
    if self.instance_connector.type == 'sql':
        engine_pool_size = self.instance_connector.engine.pool.size()
    else:
        engine_pool_size = None

    current_num_threads = threading.active_count()
    if engine_pool_size is not None:
        current_num_connections = self.instance_connector.engine.pool.checkedout()
        desired_workers = min(workers or engine_pool_size, engine_pool_size)
    else:
        current_num_connections = current_num_threads
        desired_workers = workers

    if desired_workers is None:
        desired_workers = multiprocessing.cpu_count() if is_thread_safe else 1

    ### Leave headroom for the threads / connections already in flight.
    return max(desired_workers - current_num_connections, 1)
Get the number of workers to use for concurrent syncs.
Parameters
- The number of workers passed via
--workers
.
Returns
- The number of workers, capped for safety.
def verify(
    self,
    begin: Union[datetime, int, None] = None,
    end: Union[datetime, int, None] = None,
    params: Optional[Dict[str, Any]] = None,
    chunk_interval: Union[timedelta, int, None] = None,
    bounded: Optional[bool] = None,
    deduplicate: bool = False,
    workers: Optional[int] = None,
    debug: bool = False,
    **kwargs: Any
) -> SuccessTuple:
    """
    Verify the contents of the pipe by resyncing its interval.

    Parameters
    ----------
    begin: Union[datetime, int, None], default None
        If specified, only verify rows greater than or equal to this value.

    end: Union[datetime, int, None], default None
        If specified, only verify rows less than this value.

    chunk_interval: Union[timedelta, int, None], default None
        If provided, use this as the size of the chunk boundaries.
        Default to the value set in `pipe.parameters['chunk_minutes']` (1440).

    bounded: Optional[bool], default None
        If `True`, do not verify older than the oldest sync time or newer than the newest.
        If `False`, verify unbounded syncs outside of the new and old sync times.
        The default behavior (`None`) is to bound only if a bound interval is set
        (e.g. `pipe.parameters['verify']['bound_days']`).

    deduplicate: bool, default False
        If `True`, deduplicate the pipe's table after the verification syncs.

    workers: Optional[int], default None
        If provided, limit the verification to this many threads.
        Use a value of `1` to sync chunks in series.

    debug: bool, default False
        Verbosity toggle.

    kwargs: Any
        All keyword arguments are passed to `pipe.sync()`.

    Returns
    -------
    A SuccessTuple indicating whether the pipe was successfully resynced.
    """
    from meerschaum.utils.pool import get_pool
    from meerschaum.utils.misc import interval_str
    workers = self.get_num_workers(workers)

    ### Skip configured bounding in parameters
    ### if `bounded` is explicitly `False`.
    bound_time = (
        self.get_bound_time(debug=debug)
        if bounded is not False
        else None
    )
    if bounded is None:
        bounded = bound_time is not None

    if bounded and begin is None:
        begin = (
            bound_time
            if bound_time is not None
            else self.get_sync_time(newest=False, debug=debug)
        )
    if bounded and end is None:
        end = self.get_sync_time(newest=True, debug=debug)

    ### `end` bounds are exclusive, so pad one unit to include the newest row.
    if bounded and end is not None:
        end += (
            timedelta(minutes=1)
            if isinstance(end, datetime)
            else 1
        )

    ### If the table doesn't exist yet, a single plain sync is all we can do.
    cannot_determine_bounds = not self.exists(debug=debug)

    if cannot_determine_bounds:
        sync_success, sync_msg = self.sync(
            begin=begin,
            end=end,
            params=params,
            workers=workers,
            debug=debug,
            **kwargs
        )
        if not sync_success:
            return sync_success, sync_msg
        if deduplicate:
            return self.deduplicate(
                begin=begin,
                end=end,
                params=params,
                workers=workers,
                debug=debug,
                **kwargs
            )
        return sync_success, sync_msg

    chunk_interval = self.get_chunk_interval(chunk_interval, debug=debug)
    chunk_bounds = self.get_chunk_bounds(
        begin=begin,
        end=end,
        chunk_interval=chunk_interval,
        bounded=bounded,
        debug=debug,
    )

    ### Consider it a success if no chunks need to be verified.
    if not chunk_bounds:
        if deduplicate:
            return self.deduplicate(
                begin=begin,
                end=end,
                params=params,
                workers=workers,
                debug=debug,
                **kwargs
            )
        return True, f"Could not determine chunks between '{begin}' and '{end}'; nothing to do."

    begin_to_print = (
        begin
        if begin is not None
        else (
            chunk_bounds[0][0]
            if bounded
            else chunk_bounds[0][1]
        )
    )
    end_to_print = (
        end
        if end is not None
        else (
            chunk_bounds[-1][1]
            if bounded
            else chunk_bounds[-1][0]
        )
    )

    info(
        f"Syncing {len(chunk_bounds)} chunk" + ('s' if len(chunk_bounds) != 1 else '')
        + f" ({'un' if not bounded else ''}bounded)"
        + f" of size '{interval_str(chunk_interval)}'"
        + f" between '{begin_to_print}' and '{end_to_print}'."
    )

    pool = get_pool(workers=workers)

    ### Dictionary of the form bounds -> success_tuple, e.g.:
    ### {
    ###     (2023-01-01, 2023-01-02): (True, "Success")
    ### }
    bounds_success_tuples = {}
    def process_chunk_bounds(
        chunk_begin_and_end: Tuple[
            Union[int, datetime],
            Union[int, datetime]
        ]
    ):
        ### Skip chunks which have already been synced (e.g. the first chunk).
        if chunk_begin_and_end in bounds_success_tuples:
            return chunk_begin_and_end, bounds_success_tuples[chunk_begin_and_end]

        chunk_begin, chunk_end = chunk_begin_and_end
        return chunk_begin_and_end, self.sync(
            begin=chunk_begin,
            end=chunk_end,
            params=params,
            workers=workers,
            debug=debug,
            **kwargs
        )

    ### If we have more than one chunk, attempt to sync the first one and return if it fails.
    if len(chunk_bounds) > 1:
        first_chunk_bounds = chunk_bounds[0]
        (
            (first_begin, first_end),
            (first_success, first_msg)
        ) = process_chunk_bounds(first_chunk_bounds)
        if not first_success:
            return (
                first_success,
                f"\n{first_begin} - {first_end}\n"
                + f"Failed to sync first chunk:\n{first_msg}"
            )
        bounds_success_tuples[first_chunk_bounds] = (first_success, first_msg)

    bounds_success_tuples.update(dict(pool.map(process_chunk_bounds, chunk_bounds)))
    bounds_success_bools = {bounds: tup[0] for bounds, tup in bounds_success_tuples.items()}

    message_header = f"{begin_to_print} - {end_to_print}"
    if all(bounds_success_bools.values()):
        msg = get_chunks_success_message(bounds_success_tuples, header=message_header)
        if deduplicate:
            deduplicate_success, deduplicate_msg = self.deduplicate(
                begin=begin,
                end=end,
                params=params,
                workers=workers,
                debug=debug,
                **kwargs
            )
            return deduplicate_success, msg + '\n\n' + deduplicate_msg
        return True, msg

    ### Fixed: this previously zipped `chunk_bounds` against the dict itself,
    ### which iterates its *keys* (bounds tuples, always truthy), so no failed
    ### chunk was ever selected for a retry and failures were reported as success.
    chunk_bounds_to_resync = [
        bounds
        for bounds, success in bounds_success_bools.items()
        if not success
    ]
    bounds_to_print = [
        f"{bounds[0]} - {bounds[1]}"
        for bounds in chunk_bounds_to_resync
    ]
    if bounds_to_print:
        warn(
            f"Will resync the following failed chunks:\n    "
            + '\n    '.join(bounds_to_print),
            stack=False,
        )

    retry_bounds_success_tuples = dict(pool.map(process_chunk_bounds, chunk_bounds_to_resync))
    bounds_success_tuples.update(retry_bounds_success_tuples)
    retry_bounds_success_bools = {
        bounds: tup[0]
        for bounds, tup in retry_bounds_success_tuples.items()
    }

    if all(retry_bounds_success_bools.values()):
        message = (
            get_chunks_success_message(bounds_success_tuples, header=message_header)
            + f"\nRetried {len(chunk_bounds_to_resync)} chunks."
        )
        if deduplicate:
            deduplicate_success, deduplicate_msg = self.deduplicate(
                begin=begin,
                end=end,
                params=params,
                workers=workers,
                debug=debug,
                **kwargs
            )
            return deduplicate_success, message + '\n\n' + deduplicate_msg
        return True, message

    message = get_chunks_success_message(bounds_success_tuples, header=message_header)
    if deduplicate:
        deduplicate_success, deduplicate_msg = self.deduplicate(
            begin=begin,
            end=end,
            params=params,
            workers=workers,
            debug=debug,
            **kwargs
        )
        return deduplicate_success, message + '\n\n' + deduplicate_msg
    return False, message
Verify the contents of the pipe by resyncing its interval.
Parameters
- begin (Union[datetime, int, None], default None): If specified, only verify rows greater than or equal to this value.
- end (Union[datetime, int, None], default None): If specified, only verify rows less than this value.
- chunk_interval (Union[timedelta, int, None], default None):
If provided, use this as the size of the chunk boundaries.
Default to the value set in
pipe.parameters['chunk_minutes']
(1440). - bounded (Optional[bool], default None):
If
True
, do not verify older than the oldest sync time or newer than the newest. IfFalse
, verify unbounded syncs outside of the new and old sync times. The default behavior (None
) is to bound only if a bound interval is set (e.g.pipe.parameters['verify']['bound_days']
). - deduplicate (bool, default False):
If
True
, deduplicate the pipe's table after the verification syncs. - workers (Optional[int], default None):
If provided, limit the verification to this many threads.
Use a value of
1
to sync chunks in series. - debug (bool, default False): Verbosity toggle.
- kwargs (Any):
All keyword arguments are passed to
pipe.sync()
.
Returns
- A SuccessTuple indicating whether the pipe was successfully resynced.
def get_bound_interval(self, debug: bool = False) -> Union[timedelta, int, None]:
    """
    Return the interval used to determine the bound time (limit for verification syncs).
    If the datetime axis is an integer, just return its value.

    Below are the supported keys for the bound interval:

    - `pipe.parameters['verify']['bound_minutes']`
    - `pipe.parameters['verify']['bound_hours']`
    - `pipe.parameters['verify']['bound_days']`
    - `pipe.parameters['verify']['bound_weeks']`
    - `pipe.parameters['verify']['bound_years']`
    - `pipe.parameters['verify']['bound_seconds']`

    If multiple keys are present, the first on this priority list will be used.

    Returns
    -------
    A `timedelta` or `int` value to be used to determine the bound time.
    """
    verify_params = self.parameters.get('verify', {})
    prefix = 'bound_'
    suffixes_to_check = ('minutes', 'hours', 'days', 'weeks', 'years', 'seconds')

    ### Fixed: iterate the priority list (not the parameters dict) so that the
    ### documented priority order wins when multiple `bound_*` keys are present.
    ### Previously the dict's insertion order decided the winner.
    bound_time_key, bound_time_value = None, None
    for suffix in suffixes_to_check:
        key = prefix + suffix
        if key in verify_params:
            bound_time_key = key
            bound_time_value = verify_params[key]
            break

    if bound_time_value is None:
        return bound_time_value

    dt_col = self.columns.get('datetime', None)
    if not dt_col:
        return bound_time_value

    ### Integer datetime axes use raw integer bounds rather than timedeltas.
    dt_typ = self.dtypes.get(dt_col, 'datetime64[ns]')
    if 'int' in dt_typ.lower():
        return int(bound_time_value)

    interval_type = bound_time_key.replace(prefix, '')
    return timedelta(**{interval_type: bound_time_value})
Return the interval used to determine the bound time (limit for verification syncs). If the datetime axis is an integer, just return its value.
Below are the supported keys for the bound interval:
- `pipe.parameters['verify']['bound_minutes']`
- `pipe.parameters['verify']['bound_hours']`
- `pipe.parameters['verify']['bound_days']`
- `pipe.parameters['verify']['bound_weeks']`
- `pipe.parameters['verify']['bound_years']`
- `pipe.parameters['verify']['bound_seconds']`
If multiple keys are present, the first on this priority list will be used.
Returns
- A
timedelta
orint
value to be used to determine the bound time.
def get_bound_time(self, debug: bool = False) -> Union[datetime, int, None]:
    """
    Return the limit at which long-running verification syncs should stop,
    or `None` if verification syncs should be unbounded.

    Like deriving a backtrack time from `pipe.get_sync_time()`,
    the bound time is the newest sync time minus a large window (e.g. 366 days),
    and it only applies when it falls after the oldest sync time.

    Returns
    -------
    A `datetime` or `int` corresponding to the
    `begin` bound for verification and deduplication syncs.
    """
    ### No configured bound interval means no bound at all.
    window = self.get_bound_interval(debug=debug)
    if window is None:
        return None

    ### Likewise, an empty pipe has no sync time to bound against.
    newest_sync_time = self.get_sync_time(debug=debug)
    if newest_sync_time is None:
        return None

    candidate_bound_time = newest_sync_time - window
    oldest_sync_time = self.get_sync_time(newest=False, debug=debug)

    ### Only bound when the candidate falls strictly inside the synced interval.
    if candidate_bound_time > oldest_sync_time:
        return candidate_bound_time
    return None
The bound time is the limit at which long-running verification syncs should stop.
A value of None
means verification syncs should be unbounded.
Like deriving a backtrack time from pipe.get_sync_time()
,
the bound time is the sync time minus a large window (e.g. 366 days).
Unbound verification syncs (i.e. bound_time is None
)
if the oldest sync time is less than the bound interval.
Returns
- A
datetime
orint
corresponding to the begin
bound for verification and deduplication syncs.
def delete(
    self,
    drop: bool = True,
    debug: bool = False,
    **kw
) -> SuccessTuple:
    """
    Call the Pipe's instance connector's `delete_pipe()` method.

    Parameters
    ----------
    drop: bool, default True
        If `True`, drop the pipe's target table.

    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    A `SuccessTuple` of success (`bool`), message (`str`).
    """
    import os
    import pathlib
    from meerschaum.utils.warnings import warn
    from meerschaum.utils.venv import Venv
    from meerschaum.connectors import get_connector_plugin

    ### Temporary pipes are never registered, so there is nothing to delete.
    if self.temporary:
        return (
            False,
            "Cannot delete pipes created with `temporary=True` (read-only). "
            + "You may want to call `pipe.drop()` instead."
        )

    ### Best-effort cleanup of the local cache: drop its table and,
    ### for SQLite caches, remove the database file as well.
    if self.cache_pipe is not None:
        cache_drop_result = self.cache_pipe.drop(debug=debug, **kw)
        if not cache_drop_result[0]:
            warn(cache_drop_result[1])
        if getattr(self.cache_connector, 'flavor', None) == 'sqlite':
            _cache_db_path = pathlib.Path(self.cache_connector.database)
            try:
                os.remove(_cache_db_path)
            except Exception as e:
                warn(f"Could not delete cache file '{_cache_db_path}' for {self}:\n{e}")

    ### Dropping the target table is best-effort; deletion proceeds regardless.
    if drop:
        drop_success, drop_msg = self.drop(debug=debug)
        if not drop_success:
            warn(f"Failed to drop {self}:\n{drop_msg}")

    with Venv(get_connector_plugin(self.instance_connector)):
        result = self.instance_connector.delete_pipe(self, debug=debug, **kw)

    if not isinstance(result, tuple):
        return False, f"Received an unexpected result from '{self.instance_connector}': {result}"

    ### On success, invalidate the cached registration ID.
    if result[0]:
        self.__dict__.pop('_id', None)

    return result
Call the Pipe's instance connector's delete_pipe()
method.
Parameters
- drop (bool, default True):
If
True
, drop the pipe's target table. - debug (bool, default False): Verbosity toggle.
Returns
- A
SuccessTuple
of success (bool
), message (str
).
def drop(
    self,
    debug: bool = False,
    **kw: Any
) -> SuccessTuple:
    """
    Call the Pipe's instance connector's `drop_pipe()` method.

    Parameters
    ----------
    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    A `SuccessTuple` of success, message.
    """
    ### Invalidate the cached `exists` state up front.
    self._exists = False

    from meerschaum.utils.warnings import warn
    from meerschaum.utils.venv import Venv
    from meerschaum.connectors import get_connector_plugin

    ### Drop the local cache's table first, warning (not failing) on error.
    if self.cache_pipe is not None:
        cache_drop_result = self.cache_pipe.drop(debug=debug, **kw)
        if not cache_drop_result[0]:
            warn(cache_drop_result[1])

    with Venv(get_connector_plugin(self.instance_connector)):
        return self.instance_connector.drop_pipe(self, debug=debug, **kw)
Call the Pipe's instance connector's drop_pipe()
method.
Parameters
- debug (bool, default False): Verbosity toggle.
Returns
- A
SuccessTuple
of success, message.
def clear(
    self,
    begin: Optional[datetime] = None,
    end: Optional[datetime] = None,
    params: Optional[Dict[str, Any]] = None,
    debug: bool = False,
    **kwargs: Any
) -> SuccessTuple:
    """
    Call the Pipe's instance connector's `clear_pipe` method.

    Parameters
    ----------
    begin: Optional[datetime], default None
        If provided, only remove rows newer than this datetime value.

    end: Optional[datetime], default None
        If provided, only remove rows older than this datetime value (not including end).

    params: Optional[Dict[str, Any]], default None
        See `meerschaum.utils.sql.build_where`.

    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    A `SuccessTuple` corresponding to whether this procedure completed successfully.

    Examples
    --------
    >>> pipe = mrsm.Pipe('test', 'test', columns={'datetime': 'dt'}, instance='sql:local')
    >>> pipe.sync({'dt': [datetime(2020, 1, 1, 0, 0)]})
    >>> pipe.sync({'dt': [datetime(2021, 1, 1, 0, 0)]})
    >>> pipe.sync({'dt': [datetime(2022, 1, 1, 0, 0)]})
    >>>
    >>> pipe.clear(begin=datetime(2021, 1, 1, 0, 0))
    >>> pipe.get_data()
              dt
    0 2020-01-01

    """
    from meerschaum.utils.warnings import warn
    from meerschaum.utils.venv import Venv
    from meerschaum.connectors import get_connector_plugin

    ### Keep the local cache consistent with the instance table,
    ### warning (not failing) if the cache clear fails.
    if self.cache_pipe is not None:
        cache_success, cache_msg = self.cache_pipe.clear(
            begin=begin,
            end=end,
            params=params,
            debug=debug,
            **kwargs
        )
        if not cache_success:
            warn(cache_msg)

    with Venv(get_connector_plugin(self.instance_connector)):
        return self.instance_connector.clear_pipe(
            self,
            begin=begin,
            end=end,
            params=params,
            debug=debug,
            **kwargs
        )
Call the Pipe's instance connector's clear_pipe
method.
Parameters
- begin (Optional[datetime], default None:): If provided, only remove rows newer than this datetime value.
- end (Optional[datetime], default None:): If provided, only remove rows older than this datetime column (not including end).
- params (Optional[Dict[str, Any]], default None):
See
meerschaum.utils.sql.build_where
. - debug (bool, default False): Verbosity toggle.
Returns
- A
SuccessTuple
corresponding to whether this procedure completed successfully.
Examples
>>> pipe = mrsm.Pipe('test', 'test', columns={'datetime': 'dt'}, instance='sql:local')
>>> pipe.sync({'dt': [datetime(2020, 1, 1, 0, 0)]})
>>> pipe.sync({'dt': [datetime(2021, 1, 1, 0, 0)]})
>>> pipe.sync({'dt': [datetime(2022, 1, 1, 0, 0)]})
>>>
>>> pipe.clear(begin=datetime(2021, 1, 1, 0, 0))
>>> pipe.get_data()
dt
0 2020-01-01
def deduplicate(
    self,
    begin: Union[datetime, int, None] = None,
    end: Union[datetime, int, None] = None,
    params: Optional[Dict[str, Any]] = None,
    chunk_interval: Union[datetime, int, None] = None,
    bounded: Optional[bool] = None,
    workers: Optional[int] = None,
    debug: bool = False,
    _use_instance_method: bool = True,
    **kwargs: Any
) -> SuccessTuple:
    """
    Call the Pipe's instance connector's `delete_duplicates` method to delete duplicate rows.

    Parameters
    ----------
    begin: Union[datetime, int, None], default None
        If provided, only deduplicate rows newer than this datetime value.

    end: Union[datetime, int, None], default None
        If provided, only deduplicate rows older than this datetime column (not including end).

    params: Optional[Dict[str, Any]], default None
        Restrict deduplication to this filter (for multiplexed data streams).
        See `meerschaum.utils.sql.build_where`.

    chunk_interval: Union[timedelta, int, None], default None
        If provided, use this for the chunk bounds.
        Defaults to the value set in `pipe.parameters['chunk_minutes']` (1440).

    bounded: Optional[bool], default None
        Only check outside the oldest and newest sync times if bounded is explicitly `False`.

    workers: Optional[int], default None
        If the instance connector is thread-safe, limit concurrent syncs to this many threads.

    debug: bool, default False
        Verbosity toggle.

    kwargs: Any
        All other keyword arguments are passed to
        `pipe.sync()`, `pipe.clear()`, and `pipe.get_data()`.

    Returns
    -------
    A `SuccessTuple` corresponding to whether all of the chunks were successfully deduplicated.
    """
    from meerschaum.utils.warnings import warn, info
    from meerschaum.utils.misc import interval_str, items_str
    from meerschaum.utils.venv import Venv
    from meerschaum.connectors import get_connector_plugin
    from meerschaum.utils.pool import get_pool

    ### Deduplicate the cache pipe first (best-effort: warn on failure, don't abort).
    if self.cache_pipe is not None:
        success, msg = self.cache_pipe.deduplicate(
            begin=begin,
            end=end,
            params=params,
            bounded=bounded,
            debug=debug,
            _use_instance_method=_use_instance_method,
            **kwargs
        )
        if not success:
            warn(msg)

    workers = self.get_num_workers(workers=workers)
    pool = get_pool(workers=workers)

    ### Prefer the connector's native implementation when available.
    if _use_instance_method:
        with Venv(get_connector_plugin(self.instance_connector)):
            if hasattr(self.instance_connector, 'deduplicate_pipe'):
                return self.instance_connector.deduplicate_pipe(
                    self,
                    begin=begin,
                    end=end,
                    params=params,
                    bounded=bounded,
                    debug=debug,
                    **kwargs
                )

    ### Only unbound if explicitly False.
    if bounded is None:
        bounded = True
    chunk_interval = self.get_chunk_interval(chunk_interval, debug=debug)

    bound_time = self.get_bound_time(debug=debug)
    if bounded and begin is None:
        begin = (
            bound_time
            if bound_time is not None
            else self.get_sync_time(newest=False, debug=debug)
        )
    if bounded and end is None:
        end = self.get_sync_time(newest=True, debug=debug)

    ### Extend the upper bound so the newest row is included (end is exclusive).
    if bounded and end is not None:
        end += (
            timedelta(minutes=1)
            if isinstance(end, datetime)
            else 1
        )

    chunk_bounds = self.get_chunk_bounds(
        bounded=bounded,
        begin=begin,
        end=end,
        chunk_interval=chunk_interval,
        debug=debug,
    )

    indices = [col for col in self.columns.values() if col]
    if not indices:
        return False, "Cannot deduplicate without index columns."

    def process_chunk_bounds(bounds) -> Tuple[
        Tuple[
            Union[datetime, int, None],
            Union[datetime, int, None]
        ],
        SuccessTuple
    ]:
        """Deduplicate a single chunk and return its bounds alongside a status tuple."""
        ### Only selecting the index values here to keep bandwidth down.
        chunk_begin, chunk_end = bounds
        chunk_df = self.get_data(
            select_columns=indices,
            begin=chunk_begin,
            end=chunk_end,
            params=params,
            debug=debug,
        )
        if chunk_df is None:
            return bounds, (True, "")
        existing_chunk_len = len(chunk_df)
        deduped_chunk_df = chunk_df.drop_duplicates(keep='last')
        deduped_chunk_len = len(deduped_chunk_df)

        ### Nothing to do if the index columns contain no duplicates.
        if existing_chunk_len == deduped_chunk_len:
            return bounds, (True, "")

        chunk_msg_header = f"\n{chunk_begin} - {chunk_end}"
        chunk_msg_body = ""

        full_chunk = self.get_data(
            begin=chunk_begin,
            end=chunk_end,
            params=params,
            debug=debug,
        )
        if full_chunk is None or len(full_chunk) == 0:
            return bounds, (True, f"{chunk_msg_header}\nChunk is empty, skipping...")

        chunk_indices = [ix for ix in indices if ix in full_chunk.columns]
        if not chunk_indices:
            return bounds, (False, f"None of {items_str(indices)} were present in chunk.")
        try:
            full_chunk = full_chunk.drop_duplicates(
                subset=chunk_indices,
                keep='last'
            ).reset_index(
                drop=True,
            )
        except Exception as e:
            return (
                bounds,
                (False, f"Failed to deduplicate chunk on {items_str(chunk_indices)}:\n({e})")
            )

        ### Replace the chunk: clear the old rows, then sync back the deduplicated frame.
        clear_success, clear_msg = self.clear(
            begin=chunk_begin,
            end=chunk_end,
            params=params,
            debug=debug,
        )
        if not clear_success:
            chunk_msg_body += f"Failed to clear chunk while deduplicating:\n{clear_msg}\n"
            warn(chunk_msg_body)

        sync_success, sync_msg = self.sync(full_chunk, debug=debug)
        if not sync_success:
            chunk_msg_body += f"Failed to sync chunk while deduplicating:\n{sync_msg}\n"

        ### Finally check if the deduplication worked.
        chunk_rowcount = self.get_rowcount(
            begin=chunk_begin,
            end=chunk_end,
            params=params,
            debug=debug,
        )
        if chunk_rowcount != deduped_chunk_len:
            return bounds, (
                False, (
                    chunk_msg_header + "\n"
                    + chunk_msg_body + ("\n" if chunk_msg_body else '')
                    + "Chunk rowcounts still differ ("
                    + f"{chunk_rowcount} rowcount vs {deduped_chunk_len} chunk length)."
                )
            )

        return bounds, (
            True, (
                chunk_msg_header + "\n"
                + chunk_msg_body + ("\n" if chunk_msg_body else '')
                + f"Deduplicated chunk from {existing_chunk_len} to {chunk_rowcount} rows."
            )
        )

    info(
        f"Deduplicating {len(chunk_bounds)} chunk"
        + ('s' if len(chunk_bounds) != 1 else '')
        + f" ({'un' if not bounded else ''}bounded)"
        + f" of size '{interval_str(chunk_interval)}'"
        + f" on {self}."
    )
    bounds_success_tuples = dict(pool.map(process_chunk_bounds, chunk_bounds))
    bounds_successes = {
        bounds: success_tuple
        for bounds, success_tuple in bounds_success_tuples.items()
        if success_tuple[0]
    }
    bounds_failures = {
        bounds: success_tuple
        for bounds, success_tuple in bounds_success_tuples.items()
        if not success_tuple[0]
    }

    ### No need to retry if everything failed.
    if len(bounds_failures) > 0 and len(bounds_successes) == 0:
        return (
            False,
            (
                f"Failed to deduplicate {len(bounds_failures)} chunk"
                + ('s' if len(bounds_failures) != 1 else '')
                + ".\n"
                + "\n".join([msg for _, (_, msg) in bounds_failures.items() if msg])
            )
        )

    retry_bounds = [bounds for bounds in bounds_failures]
    if not retry_bounds:
        return (
            True,
            (
                f"Successfully deduplicated {len(bounds_successes)} chunk"
                + ('s' if len(bounds_successes) != 1 else '')
                + ".\n"
                + "\n".join([msg for _, (_, msg) in bounds_successes.items() if msg])
            ).rstrip('\n')
        )

    info(f"Retrying {len(retry_bounds)} chunks for {self}...")
    retry_bounds_success_tuples = dict(pool.map(process_chunk_bounds, retry_bounds))
    ### BUG FIX: filter the retry results (`retry_bounds_success_tuples`),
    ### not the first-pass results (`bounds_success_tuples`),
    ### otherwise the retry outcomes were silently discarded.
    retry_bounds_successes = {
        bounds: success_tuple
        for bounds, success_tuple in retry_bounds_success_tuples.items()
        if success_tuple[0]
    }
    retry_bounds_failures = {
        bounds: success_tuple
        for bounds, success_tuple in retry_bounds_success_tuples.items()
        if not success_tuple[0]
    }

    bounds_successes.update(retry_bounds_successes)
    if not retry_bounds_failures:
        return (
            True,
            (
                f"Successfully deduplicated {len(bounds_successes)} chunk"
                + ('s' if len(bounds_successes) != 1 else '')
                + f" ({len(retry_bounds_successes)} retried):\n"
                + "\n".join([msg for _, (_, msg) in bounds_successes.items() if msg])
            ).rstrip('\n')
        )

    ### BUG FIX: count the chunks which are still failing after the retry,
    ### not the pre-retry failures.
    return (
        False,
        (
            f"Failed to deduplicate {len(retry_bounds_failures)} chunk"
            + ('s' if len(retry_bounds_failures) != 1 else '')
            + ".\n"
            + "\n".join([msg for _, (_, msg) in retry_bounds_failures.items() if msg])
        ).rstrip('\n')
    )
Call the Pipe's instance connector's delete_duplicates
method to delete duplicate rows.
Parameters
- begin (Union[datetime, int, None], default None:): If provided, only deduplicate rows newer than this datetime value.
- end (Union[datetime, int, None], default None:): If provided, only deduplicate rows older than this datetime column (not including end).
- params (Optional[Dict[str, Any]], default None):
Restrict deduplication to this filter (for multiplexed data streams).
See
meerschaum.utils.sql.build_where
. - chunk_interval (Union[timedelta, int, None], default None):
If provided, use this for the chunk bounds.
Defaults to the value set in
pipe.parameters['chunk_minutes']
(1440). - bounded (Optional[bool], default None):
Only check outside the oldest and newest sync times if bounded is explicitly
False
. - workers (Optional[int], default None): If the instance connector is thread-safe, limit concurrent syncs to this many threads.
- debug (bool, default False): Verbosity toggle.
- kwargs (Any):
All other keyword arguments are passed to
pipe.sync()
,pipe.clear()
, and pipe.get_data().
Returns
- A
SuccessTuple
corresponding to whether all of the chunks were successfully deduplicated.
def bootstrap(
    self,
    debug: bool = False,
    yes: bool = False,
    force: bool = False,
    noask: bool = False,
    shell: bool = False,
    **kw
) -> SuccessTuple:
    """
    Prompt the user to create a pipe's requirements all from one method.
    This method shouldn't be used in any automated scripts because it interactively
    prompts the user and therefore may hang.

    Parameters
    ----------
    debug: bool, default False
        Verbosity toggle.

    yes: bool, default False
        Print the questions and automatically agree.

    force: bool, default False
        Skip the questions and agree anyway.

    noask: bool, default False
        Print the questions but go with the default answer.

    shell: bool, default False
        Used to determine if we are in the interactive shell.

    Returns
    -------
    A `SuccessTuple` corresponding to the success of this procedure.
    """
    from meerschaum.utils.warnings import info
    from meerschaum.utils.prompt import prompt, yes_no
    from meerschaum.utils.formatting import pprint, print_tuple
    from meerschaum.config import get_config
    from meerschaum.utils.formatting._shell import clear_screen
    from meerschaum.actions import actions
    from meerschaum.utils.venv import Venv
    from meerschaum.connectors import get_connector_plugin

    _clear = get_config('shell', 'clear_screen', patch=True)

    ### An already-registered pipe must be deleted before re-bootstrapping.
    if self.get_id(debug=debug) is not None:
        delete_tuple = self.delete(debug=debug)
        if not delete_tuple[0]:
            return delete_tuple

    if _clear:
        clear_screen(debug=debug)

    _parameters = _get_parameters(self, debug=debug)
    self.parameters = _parameters
    pprint(self.parameters)
    try:
        prompt(
            f"\n    Press [Enter] to register {self} with the above configuration:",
            icon=False,
        )
    except KeyboardInterrupt:
        return False, f"Aborting bootstrapping {self}."

    with Venv(get_connector_plugin(self.instance_connector)):
        register_tuple = self.instance_connector.register_pipe(self, debug=debug)

    if not register_tuple[0]:
        return register_tuple

    if _clear:
        clear_screen(debug=debug)

    try:
        if yes_no(
            f"Would you like to edit the definition for {self}?",
            yes=yes,
            noask=noask,
            default='n',
        ):
            edit_tuple = self.edit_definition(debug=debug)
            if not edit_tuple[0]:
                return edit_tuple

        if yes_no(
            f"Would you like to try syncing {self} now?",
            yes=yes,
            noask=noask,
            default='n',
        ):
            sync_tuple = actions['sync'](
                ['pipes'],
                connector_keys=[self.connector_keys],
                metric_keys=[self.metric_key],
                location_keys=[self.location_key],
                mrsm_instance=str(self.instance_connector),
                debug=debug,
                shell=shell,
            )
            if not sync_tuple[0]:
                return sync_tuple
    except Exception as e:
        return False, f"Failed to bootstrap {self}:\n" + str(e)

    print_tuple((True, f"Finished bootstrapping {self}!"))
    info(
        f"You can edit this pipe later with `edit pipes` "
        + "or set the definition with `edit pipes definition`.\n"
        + "    To sync data into your pipe, run `sync pipes`."
    )

    return True, "Success"
Prompt the user to create a pipe's requirements all from one method. This method shouldn't be used in any automated scripts because it interactively prompts the user and therefore may hang.
Parameters
- debug (bool, default False:): Verbosity toggle.
- yes (bool, default False:): Print the questions and automatically agree.
- force (bool, default False:): Skip the questions and agree anyway.
- noask (bool, default False:): Print the questions but go with the default answer.
- shell (bool, default False:): Used to determine if we are in the interactive shell.
Returns
- A
SuccessTuple
corresponding to the success of this procedure.
def enforce_dtypes(
    self,
    df: 'pd.DataFrame',
    chunksize: Optional[int] = -1,
    safe_copy: bool = True,
    debug: bool = False,
) -> 'pd.DataFrame':
    """
    Cast the input dataframe to the pipe's registered data types.
    If the pipe does not exist and dtypes are not set, return the dataframe.

    Parameters
    ----------
    df: pd.DataFrame
        The dataframe (or a JSON string to be read into one) to cast.

    chunksize: Optional[int], default -1
        Chunk size passed to `meerschaum.utils.dataframe.parse_df_datetimes`.

    safe_copy: bool, default True
        Passed to `meerschaum.utils.dataframe.enforce_dtypes`.

    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    A `pd.DataFrame` with this pipe's dtypes enforced,
    `None` if the input could not be parsed as a dataframe,
    or the input unchanged when it is `None` or no dtypes are registered.
    """
    import traceback
    from meerschaum.utils.warnings import warn
    from meerschaum.utils.debug import dprint
    from meerschaum.utils.dataframe import parse_df_datetimes, enforce_dtypes as _enforce_dtypes
    from meerschaum.utils.packages import import_pandas
    pd = import_pandas(debug=debug)
    if df is None:
        if debug:
            dprint(
                "Received None instead of a DataFrame.\n"
                + " Skipping dtype enforcement..."
            )
        return df

    pipe_dtypes = self.dtypes

    ### Skip datetime parsing for columns with explicitly non-datetime dtypes.
    ### (Consolidated from two near-identical branches differing only in the input.)
    non_dt_cols = [
        col
        for col, dtype in pipe_dtypes.items()
        if 'datetime' not in str(dtype)
    ]
    try:
        df_to_parse = (
            pd.read_json(StringIO(df))
            if isinstance(df, str)
            else df
        )
        df = parse_df_datetimes(
            df_to_parse,
            ignore_cols=non_dt_cols,
            chunksize=chunksize,
            debug=debug,
        )
    except Exception as e:
        warn(f"Unable to cast incoming data as a DataFrame...:\n{e}\n\n{traceback.format_exc()}")
        return None

    if not pipe_dtypes:
        if debug:
            dprint(
                f"Could not find dtypes for {self}.\n"
                + " Skipping dtype enforcement..."
            )
        return df

    return _enforce_dtypes(df, pipe_dtypes, safe_copy=safe_copy, debug=debug)
Cast the input dataframe to the pipe's registered data types. If the pipe does not exist and dtypes are not set, return the dataframe.
def infer_dtypes(self, persist: bool = False, debug: bool = False) -> Dict[str, Any]:
    """
    If `dtypes` is not set in `meerschaum.Pipe.parameters`,
    infer the data types from the underlying table if it exists.

    Parameters
    ----------
    persist: bool, default False
        If `True`, persist the inferred data types to `meerschaum.Pipe.parameters`.

    Returns
    -------
    A dictionary of strings containing the pandas data types for this Pipe.
    """
    if not self.exists(debug=debug):
        ### No table to inspect: fall back to the datetime axis, if one is set
        ### and no dtype has been registered for it yet.
        if not self.columns:
            return {}
        inferred = {}
        dt_col = self.columns.get('datetime', None)
        if dt_col and not self.parameters.get('dtypes', {}).get(dt_col, None):
            inferred[dt_col] = 'datetime64[ns]'
        return inferred

    from meerschaum.utils.sql import get_pd_type
    from meerschaum.utils.misc import to_pandas_dtype
    columns_types = self.get_columns_types(debug=debug)

    ### NOTE: get_columns_types() may return either the types as
    ### PostgreSQL- or Pandas-style.
    dtypes = {}
    if columns_types:
        for col, db_type in columns_types.items():
            dtypes[col] = (
                get_pd_type(db_type, allow_custom_dtypes=True)
                if str(db_type).isupper()
                else to_pandas_dtype(db_type)
            )
    if persist:
        self.dtypes = dtypes
        self.edit(interactive=False, debug=debug)
    return dtypes
If dtypes
is not set in meerschaum.Pipe.parameters
,
infer the data types from the underlying table if it exists.
Parameters
- persist (bool, default False):
If
True
, persist the inferred data types tomeerschaum.Pipe.parameters
.
Returns
- A dictionary of strings containing the pandas data types for this Pipe.
def copy_to(
    self,
    instance_keys: str,
    sync: bool = True,
    begin: Union[datetime, int, None] = None,
    end: Union[datetime, int, None] = None,
    params: Optional[Dict[str, Any]] = None,
    chunk_interval: Union[timedelta, int, None] = None,
    debug: bool = False,
    **kwargs: Any
) -> SuccessTuple:
    """
    Copy a pipe to another instance.

    Parameters
    ----------
    instance_keys: str
        The instance to which to copy this pipe.

    sync: bool, default True
        If `True`, sync the source pipe's documents.
        If `False`, only copy the pipe's registration (attributes).

    begin: Union[datetime, int, None], default None
        Beginning datetime value to pass to `Pipe.get_data()`.

    end: Union[datetime, int, None], default None
        End datetime value to pass to `Pipe.get_data()`.

    params: Optional[Dict[str, Any]], default None
        Parameters filter to pass to `Pipe.get_data()`.

    chunk_interval: Union[timedelta, int, None], default None
        The size of chunks to retrieve from `Pipe.get_data()` for syncing.

    kwargs: Any
        Additional flags to pass to `Pipe.get_data()` and `Pipe.sync()`, e.g. `workers`.

    Returns
    -------
    A SuccessTuple indicating success.
    """
    if str(instance_keys) == self.instance_keys:
        return False, f"Cannot copy {self} to instance '{instance_keys}'."

    new_pipe = mrsm.Pipe(
        self.connector_keys,
        self.metric_key,
        self.location_key,
        parameters=self.parameters.copy(),
        instance=instance_keys,
    )

    ### Update the target's registration if it exists; otherwise register anew.
    new_pipe_is_registered = new_pipe.get_id() is not None
    metadata_method = new_pipe.edit if new_pipe_is_registered else new_pipe.register
    metadata_success, metadata_msg = metadata_method(debug=debug)
    if not metadata_success:
        return metadata_success, metadata_msg

    ### BUG FIX: honor the documented `sync` flag
    ### (previously it was accepted but ignored).
    if not sync:
        return True, f"Successfully copied the attributes of {self} to '{instance_keys}'."

    if not self.exists(debug=debug):
        return True, f"{self} does not exist; nothing to sync."

    ### Request an iterator of chunks, restoring the caller's flag afterwards.
    original_as_iterator = kwargs.get('as_iterator', None)
    kwargs['as_iterator'] = True

    chunk_generator = self.get_data(
        begin=begin,
        end=end,
        params=params,
        chunk_interval=chunk_interval,
        debug=debug,
        **kwargs
    )

    if original_as_iterator is None:
        _ = kwargs.pop('as_iterator', None)
    else:
        kwargs['as_iterator'] = original_as_iterator

    sync_success, sync_msg = new_pipe.sync(
        chunk_generator,
        begin=begin,
        end=end,
        params=params,
        debug=debug,
        **kwargs
    )
    msg = (
        f"Successfully synced {new_pipe}:\n{sync_msg}"
        if sync_success
        else f"Failed to sync {new_pipe}:\n{sync_msg}"
    )
    return sync_success, msg
Copy a pipe to another instance.
Parameters
- instance_keys (str): The instance to which to copy this pipe.
- sync (bool, default True):
If
True
, sync the source pipe's documents - begin (Union[datetime, int, None], default None):
Beginning datetime value to pass to
Pipe.get_data()
. - end (Union[datetime, int, None], default None):
End datetime value to pass to
Pipe.get_data()
. - params (Optional[Dict[str, Any]], default None):
Parameters filter to pass to
Pipe.get_data()
. - chunk_interval (Union[timedelta, int, None], default None):
The size of chunks to retrieve from
Pipe.get_data()
for syncing. - kwargs (Any):
Additional flags to pass to
Pipe.get_data()
andPipe.sync()
, e.g.workers
.
Returns
- A SuccessTuple indicating success.
37class Plugin: 38 """Handle packaging of Meerschaum plugins.""" 39 def __init__( 40 self, 41 name: str, 42 version: Optional[str] = None, 43 user_id: Optional[int] = None, 44 required: Optional[List[str]] = None, 45 attributes: Optional[Dict[str, Any]] = None, 46 archive_path: Optional[pathlib.Path] = None, 47 venv_path: Optional[pathlib.Path] = None, 48 repo_connector: Optional['mrsm.connectors.api.APIConnector'] = None, 49 repo: Union['mrsm.connectors.api.APIConnector', str, None] = None, 50 ): 51 from meerschaum.config.static import STATIC_CONFIG 52 sep = STATIC_CONFIG['plugins']['repo_separator'] 53 _repo = None 54 if sep in name: 55 try: 56 name, _repo = name.split(sep) 57 except Exception as e: 58 error(f"Invalid plugin name: '{name}'") 59 self._repo_in_name = _repo 60 61 if attributes is None: 62 attributes = {} 63 self.name = name 64 self.attributes = attributes 65 self.user_id = user_id 66 self._version = version 67 if required: 68 self._required = required 69 self.archive_path = ( 70 archive_path if archive_path is not None 71 else PLUGINS_ARCHIVES_RESOURCES_PATH / f"{self.name}.tar.gz" 72 ) 73 self.venv_path = ( 74 venv_path if venv_path is not None 75 else VIRTENV_RESOURCES_PATH / self.name 76 ) 77 self._repo_connector = repo_connector 78 self._repo_keys = repo 79 80 81 @property 82 def repo_connector(self): 83 """ 84 Return the repository connector for this plugin. 85 NOTE: This imports the `connectors` module, which imports certain plugin modules. 86 """ 87 if self._repo_connector is None: 88 from meerschaum.connectors.parse import parse_repo_keys 89 90 repo_keys = self._repo_keys or self._repo_in_name 91 if self._repo_in_name and self._repo_keys and self._repo_keys != self._repo_in_name: 92 error( 93 f"Received inconsistent repos: '{self._repo_in_name}' and '{self._repo_keys}'." 
94 ) 95 repo_connector = parse_repo_keys(repo_keys) 96 self._repo_connector = repo_connector 97 return self._repo_connector 98 99 100 @property 101 def version(self): 102 """ 103 Return the plugin's module version is defined (`__version__`) if it's defined. 104 """ 105 if self._version is None: 106 try: 107 self._version = self.module.__version__ 108 except Exception as e: 109 self._version = None 110 return self._version 111 112 113 @property 114 def module(self): 115 """ 116 Return the Python module of the underlying plugin. 117 """ 118 if '_module' not in self.__dict__ or self.__dict__.get('_module', None) is None: 119 if self.__file__ is None: 120 return None 121 from meerschaum.plugins import import_plugins 122 self._module = import_plugins(str(self), warn=False) 123 return self._module 124 125 126 @property 127 def __file__(self) -> Union[str, None]: 128 """ 129 Return the file path (str) of the plugin if it exists, otherwise `None`. 130 """ 131 if self.__dict__.get('_module', None) is not None: 132 return self.module.__file__ 133 134 potential_dir = PLUGINS_RESOURCES_PATH / self.name 135 if ( 136 potential_dir.exists() 137 and potential_dir.is_dir() 138 and (potential_dir / '__init__.py').exists() 139 ): 140 return str((potential_dir / '__init__.py').as_posix()) 141 142 potential_file = PLUGINS_RESOURCES_PATH / (self.name + '.py') 143 if potential_file.exists() and not potential_file.is_dir(): 144 return str(potential_file.as_posix()) 145 146 return None 147 148 149 @property 150 def requirements_file_path(self) -> Union[pathlib.Path, None]: 151 """ 152 If a file named `requirements.txt` exists, return its path. 153 """ 154 if self.__file__ is None: 155 return None 156 path = pathlib.Path(self.__file__).parent / 'requirements.txt' 157 if not path.exists(): 158 return None 159 return path 160 161 162 def is_installed(self, **kw) -> bool: 163 """ 164 Check whether a plugin is correctly installed. 
165 166 Returns 167 ------- 168 A `bool` indicating whether a plugin exists and is successfully imported. 169 """ 170 return self.__file__ is not None 171 172 173 def make_tar(self, debug: bool = False) -> pathlib.Path: 174 """ 175 Compress the plugin's source files into a `.tar.gz` archive and return the archive's path. 176 177 Parameters 178 ---------- 179 debug: bool, default False 180 Verbosity toggle. 181 182 Returns 183 ------- 184 A `pathlib.Path` to the archive file's path. 185 186 """ 187 import tarfile, pathlib, subprocess, fnmatch 188 from meerschaum.utils.debug import dprint 189 from meerschaum.utils.packages import attempt_import 190 pathspec = attempt_import('pathspec', debug=debug) 191 192 if not self.__file__: 193 from meerschaum.utils.warnings import error 194 error(f"Could not find file for plugin '{self}'.") 195 if '__init__.py' in self.__file__ or os.path.isdir(self.__file__): 196 path = self.__file__.replace('__init__.py', '') 197 is_dir = True 198 else: 199 path = self.__file__ 200 is_dir = False 201 202 old_cwd = os.getcwd() 203 real_parent_path = pathlib.Path(os.path.realpath(path)).parent 204 os.chdir(real_parent_path) 205 206 default_patterns_to_ignore = [ 207 '.pyc', 208 '__pycache__/', 209 'eggs/', 210 '__pypackages__/', 211 '.git', 212 ] 213 214 def parse_gitignore() -> 'Set[str]': 215 gitignore_path = pathlib.Path(path) / '.gitignore' 216 if not gitignore_path.exists(): 217 return set(default_patterns_to_ignore) 218 with open(gitignore_path, 'r', encoding='utf-8') as f: 219 gitignore_text = f.read() 220 return set(pathspec.PathSpec.from_lines( 221 pathspec.patterns.GitWildMatchPattern, 222 default_patterns_to_ignore + gitignore_text.splitlines() 223 ).match_tree(path)) 224 225 patterns_to_ignore = parse_gitignore() if is_dir else set() 226 227 if debug: 228 dprint(f"Patterns to ignore:\n{patterns_to_ignore}") 229 230 with tarfile.open(self.archive_path, 'w:gz') as tarf: 231 if not is_dir: 232 tarf.add(f"{self.name}.py") 233 else: 234 
for root, dirs, files in os.walk(self.name): 235 for f in files: 236 good_file = True 237 fp = os.path.join(root, f) 238 for pattern in patterns_to_ignore: 239 if pattern in str(fp) or f.startswith('.'): 240 good_file = False 241 break 242 if good_file: 243 if debug: 244 dprint(f"Adding '{fp}'...") 245 tarf.add(fp) 246 247 ### clean up and change back to old directory 248 os.chdir(old_cwd) 249 250 ### change to 775 to avoid permissions issues with the API in a Docker container 251 self.archive_path.chmod(0o775) 252 253 if debug: 254 dprint(f"Created archive '{self.archive_path}'.") 255 return self.archive_path 256 257 258 def install( 259 self, 260 skip_deps: bool = False, 261 force: bool = False, 262 debug: bool = False, 263 ) -> SuccessTuple: 264 """ 265 Extract a plugin's tar archive to the plugins directory. 266 267 This function checks if the plugin is already installed and if the version is equal or 268 greater than the existing installation. 269 270 Parameters 271 ---------- 272 skip_deps: bool, default False 273 If `True`, do not install dependencies. 274 275 force: bool, default False 276 If `True`, continue with installation, even if required packages fail to install. 277 278 debug: bool, default False 279 Verbosity toggle. 280 281 Returns 282 ------- 283 A `SuccessTuple` of success (bool) and a message (str). 284 285 """ 286 if self.full_name in _ongoing_installations: 287 return True, f"Already installing plugin '{self}'." 
288 _ongoing_installations.add(self.full_name) 289 from meerschaum.utils.warnings import warn, error 290 if debug: 291 from meerschaum.utils.debug import dprint 292 import tarfile 293 import re 294 import ast 295 from meerschaum.plugins import sync_plugins_symlinks 296 from meerschaum.utils.packages import attempt_import, determine_version, reload_meerschaum 297 from meerschaum.utils.venv import init_venv 298 from meerschaum.utils.misc import safely_extract_tar 299 old_cwd = os.getcwd() 300 old_version = '' 301 new_version = '' 302 temp_dir = PLUGINS_TEMP_RESOURCES_PATH / self.name 303 temp_dir.mkdir(exist_ok=True) 304 305 if not self.archive_path.exists(): 306 return False, f"Missing archive file for plugin '{self}'." 307 if self.version is not None: 308 old_version = self.version 309 if debug: 310 dprint(f"Found existing version '{old_version}' for plugin '{self}'.") 311 312 if debug: 313 dprint(f"Extracting '{self.archive_path}' to '{temp_dir}'...") 314 315 try: 316 with tarfile.open(self.archive_path, 'r:gz') as tarf: 317 safely_extract_tar(tarf, temp_dir) 318 except Exception as e: 319 warn(e) 320 return False, f"Failed to extract plugin '{self.name}'." 
321 322 ### search for version information 323 files = os.listdir(temp_dir) 324 325 if str(files[0]) == self.name: 326 is_dir = True 327 elif str(files[0]) == self.name + '.py': 328 is_dir = False 329 else: 330 error(f"Unknown format encountered for plugin '{self}'.") 331 332 fpath = temp_dir / files[0] 333 if is_dir: 334 fpath = fpath / '__init__.py' 335 336 init_venv(self.name, debug=debug) 337 with open(fpath, 'r', encoding='utf-8') as f: 338 init_lines = f.readlines() 339 new_version = None 340 for line in init_lines: 341 if '__version__' not in line: 342 continue 343 version_match = re.search(r'__version__(\s?)=', line.lstrip().rstrip()) 344 if not version_match: 345 continue 346 new_version = ast.literal_eval(line.split('=')[1].lstrip().rstrip()) 347 break 348 if not new_version: 349 warn( 350 f"No `__version__` defined for plugin '{self}'. " 351 + "Assuming new version...", 352 stack = False, 353 ) 354 355 packaging_version = attempt_import('packaging.version') 356 try: 357 is_new_version = (not new_version and not old_version) or ( 358 packaging_version.parse(old_version) < packaging_version.parse(new_version) 359 ) 360 is_same_version = new_version and old_version and ( 361 packaging_version.parse(old_version) == packaging_version.parse(new_version) 362 ) 363 except Exception as e: 364 is_new_version, is_same_version = True, False 365 366 ### Determine where to permanently store the new plugin. 367 plugin_installation_dir_path = PLUGINS_DIR_PATHS[0] 368 for path in PLUGINS_DIR_PATHS: 369 files_in_plugins_dir = os.listdir(path) 370 if ( 371 self.name in files_in_plugins_dir 372 or 373 (self.name + '.py') in files_in_plugins_dir 374 ): 375 plugin_installation_dir_path = path 376 break 377 378 success_msg = ( 379 f"Successfully installed plugin '{self}'" 380 + ("\n (skipped dependencies)" if skip_deps else "") 381 + "." 
382 ) 383 success, abort = None, None 384 385 if is_same_version and not force: 386 success, msg = True, ( 387 f"Plugin '{self}' is up-to-date (version {old_version}).\n" + 388 " Install again with `-f` or `--force` to reinstall." 389 ) 390 abort = True 391 elif is_new_version or force: 392 for src_dir, dirs, files in os.walk(temp_dir): 393 if success is not None: 394 break 395 dst_dir = str(src_dir).replace(str(temp_dir), str(plugin_installation_dir_path)) 396 if not os.path.exists(dst_dir): 397 os.mkdir(dst_dir) 398 for f in files: 399 src_file = os.path.join(src_dir, f) 400 dst_file = os.path.join(dst_dir, f) 401 if os.path.exists(dst_file): 402 os.remove(dst_file) 403 404 if debug: 405 dprint(f"Moving '{src_file}' to '{dst_dir}'...") 406 try: 407 shutil.move(src_file, dst_dir) 408 except Exception as e: 409 success, msg = False, ( 410 f"Failed to install plugin '{self}': " + 411 f"Could not move file '{src_file}' to '{dst_dir}'" 412 ) 413 print(msg) 414 break 415 if success is None: 416 success, msg = True, success_msg 417 else: 418 success, msg = False, ( 419 f"Your installed version of plugin '{self}' ({old_version}) is higher than " 420 + f"attempted version {new_version}." 421 ) 422 423 shutil.rmtree(temp_dir) 424 os.chdir(old_cwd) 425 426 ### Reload the plugin's module. 427 sync_plugins_symlinks(debug=debug) 428 if '_module' in self.__dict__: 429 del self.__dict__['_module'] 430 init_venv(venv=self.name, force=True, debug=debug) 431 reload_meerschaum(debug=debug) 432 433 ### if we've already failed, return here 434 if not success or abort: 435 _ongoing_installations.remove(self.full_name) 436 return success, msg 437 438 ### attempt to install dependencies 439 dependencies_installed = skip_deps or self.install_dependencies(force=force, debug=debug) 440 if not dependencies_installed: 441 _ongoing_installations.remove(self.full_name) 442 return False, f"Failed to install dependencies for plugin '{self}'." 
443 444 ### handling success tuple, bool, or other (typically None) 445 setup_tuple = self.setup(debug=debug) 446 if isinstance(setup_tuple, tuple): 447 if not setup_tuple[0]: 448 success, msg = setup_tuple 449 elif isinstance(setup_tuple, bool): 450 if not setup_tuple: 451 success, msg = False, ( 452 f"Failed to run post-install setup for plugin '{self}'." + '\n' + 453 f"Check `setup()` in '{self.__file__}' for more information " + 454 f"(no error message provided)." 455 ) 456 else: 457 success, msg = True, success_msg 458 elif setup_tuple is None: 459 success = True 460 msg = ( 461 f"Post-install for plugin '{self}' returned None. " + 462 f"Assuming plugin successfully installed." 463 ) 464 warn(msg) 465 else: 466 success = False 467 msg = ( 468 f"Post-install for plugin '{self}' returned unexpected value " + 469 f"of type '{type(setup_tuple)}': {setup_tuple}" 470 ) 471 472 _ongoing_installations.remove(self.full_name) 473 module = self.module 474 return success, msg 475 476 477 def remove_archive( 478 self, 479 debug: bool = False 480 ) -> SuccessTuple: 481 """Remove a plugin's archive file.""" 482 if not self.archive_path.exists(): 483 return True, f"Archive file for plugin '{self}' does not exist." 484 try: 485 self.archive_path.unlink() 486 except Exception as e: 487 return False, f"Failed to remove archive for plugin '{self}':\n{e}" 488 return True, "Success" 489 490 491 def remove_venv( 492 self, 493 debug: bool = False 494 ) -> SuccessTuple: 495 """Remove a plugin's virtual environment.""" 496 if not self.venv_path.exists(): 497 return True, f"Virtual environment for plugin '{self}' does not exist." 498 try: 499 shutil.rmtree(self.venv_path) 500 except Exception as e: 501 return False, f"Failed to remove virtual environment for plugin '{self}':\n{e}" 502 return True, "Success" 503 504 505 def uninstall(self, debug: bool = False) -> SuccessTuple: 506 """ 507 Remove a plugin, its virtual environment, and archive file. 
508 """ 509 from meerschaum.utils.packages import reload_meerschaum 510 from meerschaum.plugins import sync_plugins_symlinks 511 from meerschaum.utils.warnings import warn, info 512 warnings_thrown_count: int = 0 513 max_warnings: int = 3 514 515 if not self.is_installed(): 516 info( 517 f"Plugin '{self.name}' doesn't seem to be installed.\n " 518 + "Checking for artifacts...", 519 stack = False, 520 ) 521 else: 522 real_path = pathlib.Path(os.path.realpath(self.__file__)) 523 try: 524 if real_path.name == '__init__.py': 525 shutil.rmtree(real_path.parent) 526 else: 527 real_path.unlink() 528 except Exception as e: 529 warn(f"Could not remove source files for plugin '{self.name}':\n{e}", stack=False) 530 warnings_thrown_count += 1 531 else: 532 info(f"Removed source files for plugin '{self.name}'.") 533 534 if self.venv_path.exists(): 535 success, msg = self.remove_venv(debug=debug) 536 if not success: 537 warn(msg, stack=False) 538 warnings_thrown_count += 1 539 else: 540 info(f"Removed virtual environment from plugin '{self.name}'.") 541 542 success = warnings_thrown_count < max_warnings 543 sync_plugins_symlinks(debug=debug) 544 self.deactivate_venv(force=True, debug=debug) 545 reload_meerschaum(debug=debug) 546 return success, ( 547 f"Successfully uninstalled plugin '{self}'." if success 548 else f"Failed to uninstall plugin '{self}'." 549 ) 550 551 552 def setup(self, *args: str, debug: bool = False, **kw: Any) -> Union[SuccessTuple, bool]: 553 """ 554 If exists, run the plugin's `setup()` function. 555 556 Parameters 557 ---------- 558 *args: str 559 The positional arguments passed to the `setup()` function. 560 561 debug: bool, default False 562 Verbosity toggle. 563 564 **kw: Any 565 The keyword arguments passed to the `setup()` function. 566 567 Returns 568 ------- 569 A `SuccessTuple` or `bool` indicating success. 
570 571 """ 572 from meerschaum.utils.debug import dprint 573 import inspect 574 _setup = None 575 for name, fp in inspect.getmembers(self.module): 576 if name == 'setup' and inspect.isfunction(fp): 577 _setup = fp 578 break 579 580 ### assume success if no setup() is found (not necessary) 581 if _setup is None: 582 return True 583 584 sig = inspect.signature(_setup) 585 has_debug, has_kw = ('debug' in sig.parameters), False 586 for k, v in sig.parameters.items(): 587 if '**' in str(v): 588 has_kw = True 589 break 590 591 _kw = {} 592 if has_kw: 593 _kw.update(kw) 594 if has_debug: 595 _kw['debug'] = debug 596 597 if debug: 598 dprint(f"Running setup for plugin '{self}'...") 599 try: 600 self.activate_venv(debug=debug) 601 return_tuple = _setup(*args, **_kw) 602 self.deactivate_venv(debug=debug) 603 except Exception as e: 604 return False, str(e) 605 606 if isinstance(return_tuple, tuple): 607 return return_tuple 608 if isinstance(return_tuple, bool): 609 return return_tuple, f"Setup for Plugin '{self.name}' did not return a message." 610 if return_tuple is None: 611 return False, f"Setup for Plugin '{self.name}' returned None." 612 return False, f"Unknown return value from setup for Plugin '{self.name}': {return_tuple}" 613 614 615 def get_dependencies( 616 self, 617 debug: bool = False, 618 ) -> List[str]: 619 """ 620 If the Plugin has specified dependencies in a list called `required`, return the list. 621 622 **NOTE:** Dependecies which start with `'plugin:'` are Meerschaum plugins, not pip packages. 623 Meerschaum plugins may also specify connector keys for a repo after `'@'`. 624 625 Parameters 626 ---------- 627 debug: bool, default False 628 Verbosity toggle. 629 630 Returns 631 ------- 632 A list of required packages and plugins (str). 633 634 """ 635 if '_required' in self.__dict__: 636 return self._required 637 638 ### If the plugin has not yet been imported, 639 ### infer the dependencies from the source text. 
640 ### This is not super robust, and it doesn't feel right 641 ### having multiple versions of the logic. 642 ### This is necessary when determining the activation order 643 ### without having import the module. 644 ### For consistency's sake, the module-less method does not cache the requirements. 645 if self.__dict__.get('_module', None) is None: 646 file_path = self.__file__ 647 if file_path is None: 648 return [] 649 with open(file_path, 'r', encoding='utf-8') as f: 650 text = f.read() 651 652 if 'required' not in text: 653 return [] 654 655 ### This has some limitations: 656 ### It relies on `required` being manually declared. 657 ### We lose the ability to dynamically alter the `required` list, 658 ### which is why we've kept the module-reliant method below. 659 import ast, re 660 ### NOTE: This technically would break 661 ### if `required` was the very first line of the file. 662 req_start_match = re.search(r'\nrequired(:\s*)?.*=', text) 663 if not req_start_match: 664 return [] 665 req_start = req_start_match.start() 666 equals_sign = req_start + text[req_start:].find('=') 667 668 ### Dependencies may have brackets within the strings, so push back the index. 
669 first_opening_brace = equals_sign + 1 + text[equals_sign:].find('[') 670 if first_opening_brace == -1: 671 return [] 672 673 next_closing_brace = equals_sign + 1 + text[equals_sign:].find(']') 674 if next_closing_brace == -1: 675 return [] 676 677 start_ix = first_opening_brace + 1 678 end_ix = next_closing_brace 679 680 num_braces = 0 681 while True: 682 if '[' not in text[start_ix:end_ix]: 683 break 684 num_braces += 1 685 start_ix = end_ix 686 end_ix += text[end_ix + 1:].find(']') + 1 687 688 req_end = end_ix + 1 689 req_text = ( 690 text[(first_opening_brace-1):req_end] 691 .lstrip() 692 .replace('=', '', 1) 693 .lstrip() 694 .rstrip() 695 ) 696 try: 697 required = ast.literal_eval(req_text) 698 except Exception as e: 699 warn( 700 f"Unable to determine requirements for plugin '{self.name}' " 701 + "without importing the module.\n" 702 + " This may be due to dynamically setting the global `required` list.\n" 703 + f" {e}" 704 ) 705 return [] 706 return required 707 708 import inspect 709 self.activate_venv(dependencies=False, debug=debug) 710 required = [] 711 for name, val in inspect.getmembers(self.module): 712 if name == 'required': 713 required = val 714 break 715 self._required = required 716 self.deactivate_venv(dependencies=False, debug=debug) 717 return required 718 719 720 def get_required_plugins(self, debug: bool=False) -> List[meerschaum.plugins.Plugin]: 721 """ 722 Return a list of required Plugin objects. 
723 """ 724 from meerschaum.utils.warnings import warn 725 from meerschaum.config import get_config 726 from meerschaum.config.static import STATIC_CONFIG 727 plugins = [] 728 _deps = self.get_dependencies(debug=debug) 729 sep = STATIC_CONFIG['plugins']['repo_separator'] 730 plugin_names = [ 731 _d[len('plugin:'):] for _d in _deps 732 if _d.startswith('plugin:') and len(_d) > len('plugin:') 733 ] 734 default_repo_keys = get_config('meerschaum', 'default_repository') 735 for _plugin_name in plugin_names: 736 if sep in _plugin_name: 737 try: 738 _plugin_name, _repo_keys = _plugin_name.split(sep) 739 except Exception as e: 740 _repo_keys = default_repo_keys 741 warn( 742 f"Invalid repo keys for required plugin '{_plugin_name}'.\n " 743 + f"Will try to use '{_repo_keys}' instead.", 744 stack = False, 745 ) 746 else: 747 _repo_keys = default_repo_keys 748 plugins.append(Plugin(_plugin_name, repo=_repo_keys)) 749 return plugins 750 751 752 def get_required_packages(self, debug: bool=False) -> List[str]: 753 """ 754 Return the required package names (excluding plugins). 755 """ 756 _deps = self.get_dependencies(debug=debug) 757 return [_d for _d in _deps if not _d.startswith('plugin:')] 758 759 760 def activate_venv(self, dependencies: bool=True, debug: bool=False, **kw) -> bool: 761 """ 762 Activate the virtual environments for the plugin and its dependencies. 763 764 Parameters 765 ---------- 766 dependencies: bool, default True 767 If `True`, activate the virtual environments for required plugins. 768 769 Returns 770 ------- 771 A bool indicating success. 
772 """ 773 from meerschaum.utils.venv import venv_target_path 774 from meerschaum.utils.packages import activate_venv 775 from meerschaum.utils.misc import make_symlink, is_symlink 776 from meerschaum.config._paths import PACKAGE_ROOT_PATH 777 778 if dependencies: 779 for plugin in self.get_required_plugins(debug=debug): 780 plugin.activate_venv(debug=debug, **kw) 781 782 vtp = venv_target_path(self.name, debug=debug, allow_nonexistent=True) 783 venv_meerschaum_path = vtp / 'meerschaum' 784 785 try: 786 success, msg = True, "Success" 787 if is_symlink(venv_meerschaum_path): 788 if pathlib.Path(os.path.realpath(venv_meerschaum_path)) != PACKAGE_ROOT_PATH: 789 venv_meerschaum_path.unlink() 790 success, msg = make_symlink(venv_meerschaum_path, PACKAGE_ROOT_PATH) 791 except Exception as e: 792 success, msg = False, str(e) 793 if not success: 794 warn(f"Unable to create symlink {venv_meerschaum_path} to {PACKAGE_ROOT_PATH}:\n{msg}") 795 796 return activate_venv(self.name, debug=debug, **kw) 797 798 799 def deactivate_venv(self, dependencies: bool=True, debug: bool = False, **kw) -> bool: 800 """ 801 Deactivate the virtual environments for the plugin and its dependencies. 802 803 Parameters 804 ---------- 805 dependencies: bool, default True 806 If `True`, deactivate the virtual environments for required plugins. 807 808 Returns 809 ------- 810 A bool indicating success. 811 """ 812 from meerschaum.utils.packages import deactivate_venv 813 success = deactivate_venv(self.name, debug=debug, **kw) 814 if dependencies: 815 for plugin in self.get_required_plugins(debug=debug): 816 plugin.deactivate_venv(debug=debug, **kw) 817 return success 818 819 820 def install_dependencies( 821 self, 822 force: bool = False, 823 debug: bool = False, 824 ) -> bool: 825 """ 826 If specified, install dependencies. 827 828 **NOTE:** Dependencies that start with `'plugin:'` will be installed as 829 Meerschaum plugins from the same repository as this Plugin. 
830 To install from a different repository, add the repo keys after `'@'` 831 (e.g. `'plugin:foo@api:bar'`). 832 833 Parameters 834 ---------- 835 force: bool, default False 836 If `True`, continue with the installation, even if some 837 required packages fail to install. 838 839 debug: bool, default False 840 Verbosity toggle. 841 842 Returns 843 ------- 844 A bool indicating success. 845 846 """ 847 from meerschaum.utils.packages import pip_install, venv_contains_package 848 from meerschaum.utils.debug import dprint 849 from meerschaum.utils.warnings import warn, info 850 from meerschaum.connectors.parse import parse_repo_keys 851 _deps = self.get_dependencies(debug=debug) 852 if not _deps and self.requirements_file_path is None: 853 return True 854 855 plugins = self.get_required_plugins(debug=debug) 856 for _plugin in plugins: 857 if _plugin.name == self.name: 858 warn(f"Plugin '{self.name}' cannot depend on itself! Skipping...", stack=False) 859 continue 860 _success, _msg = _plugin.repo_connector.install_plugin( 861 _plugin.name, debug=debug, force=force 862 ) 863 if not _success: 864 warn( 865 f"Failed to install required plugin '{_plugin}' from '{_plugin.repo_connector}'" 866 + f" for plugin '{self.name}':\n" + _msg, 867 stack = False, 868 ) 869 if not force: 870 warn( 871 "Try installing with the `--force` flag to continue anyway.", 872 stack = False, 873 ) 874 return False 875 info( 876 "Continuing with installation despite the failure " 877 + "(careful, things might be broken!)...", 878 icon = False 879 ) 880 881 882 ### First step: parse `requirements.txt` if it exists. 
883 if self.requirements_file_path is not None: 884 if not pip_install( 885 requirements_file_path=self.requirements_file_path, 886 venv=self.name, debug=debug 887 ): 888 warn( 889 f"Failed to resolve 'requirements.txt' for plugin '{self.name}'.", 890 stack = False, 891 ) 892 if not force: 893 warn( 894 "Try installing with `--force` to continue anyway.", 895 stack = False, 896 ) 897 return False 898 info( 899 "Continuing with installation despite the failure " 900 + "(careful, things might be broken!)...", 901 icon = False 902 ) 903 904 905 ### Don't reinstall packages that are already included in required plugins. 906 packages = [] 907 _packages = self.get_required_packages(debug=debug) 908 accounted_for_packages = set() 909 for package_name in _packages: 910 for plugin in plugins: 911 if venv_contains_package(package_name, plugin.name): 912 accounted_for_packages.add(package_name) 913 break 914 packages = [pkg for pkg in _packages if pkg not in accounted_for_packages] 915 916 ### Attempt pip packages installation. 917 if packages: 918 for package in packages: 919 if not pip_install(package, venv=self.name, debug=debug): 920 warn( 921 f"Failed to install required package '{package}'" 922 + f" for plugin '{self.name}'.", 923 stack = False, 924 ) 925 if not force: 926 warn( 927 "Try installing with `--force` to continue anyway.", 928 stack = False, 929 ) 930 return False 931 info( 932 "Continuing with installation despite the failure " 933 + "(careful, things might be broken!)...", 934 icon = False 935 ) 936 return True 937 938 939 @property 940 def full_name(self) -> str: 941 """ 942 Include the repo keys with the plugin's name. 
943 """ 944 from meerschaum.config.static import STATIC_CONFIG 945 sep = STATIC_CONFIG['plugins']['repo_separator'] 946 return self.name + sep + str(self.repo_connector) 947 948 949 def __str__(self): 950 return self.name 951 952 953 def __repr__(self): 954 return f"Plugin('{self.name}', repo='{self.repo_connector}')" 955 956 957 def __del__(self): 958 pass
Handle packaging of Meerschaum plugins.
def __init__(
    self,
    name: str,
    version: Optional[str] = None,
    user_id: Optional[int] = None,
    required: Optional[List[str]] = None,
    attributes: Optional[Dict[str, Any]] = None,
    archive_path: Optional[pathlib.Path] = None,
    venv_path: Optional[pathlib.Path] = None,
    repo_connector: Optional['mrsm.connectors.api.APIConnector'] = None,
    repo: Union['mrsm.connectors.api.APIConnector', str, None] = None,
):
    """
    Parse the plugin name (optionally `name{sep}repo`) and store the plugin's attributes.
    """
    from meerschaum.config.static import STATIC_CONFIG
    separator = STATIC_CONFIG['plugins']['repo_separator']

    ### Repo keys may be embedded in the name (e.g. 'foo@api:bar').
    repo_in_name = None
    if separator in name:
        try:
            name, repo_in_name = name.split(separator)
        except Exception as e:
            error(f"Invalid plugin name: '{name}'")
    self._repo_in_name = repo_in_name

    self.name = name
    self.attributes = attributes if attributes is not None else {}
    self.user_id = user_id
    self._version = version
    if required:
        self._required = required

    default_archive_path = PLUGINS_ARCHIVES_RESOURCES_PATH / f"{self.name}.tar.gz"
    self.archive_path = archive_path if archive_path is not None else default_archive_path

    default_venv_path = VIRTENV_RESOURCES_PATH / self.name
    self.venv_path = venv_path if venv_path is not None else default_venv_path

    ### The connector is resolved lazily by the `repo_connector` property.
    self._repo_connector = repo_connector
    self._repo_keys = repo
@property
def repo_connector(self):
    """
    Return the repository connector for this plugin.
    NOTE: This imports the `connectors` module, which imports certain plugin modules.
    """
    if self._repo_connector is not None:
        return self._repo_connector

    from meerschaum.connectors.parse import parse_repo_keys

    ### Prefer the explicit `repo` argument; fall back to keys parsed from the name.
    repo_keys = self._repo_keys or self._repo_in_name
    if self._repo_in_name and self._repo_keys and self._repo_keys != self._repo_in_name:
        error(
            f"Received inconsistent repos: '{self._repo_in_name}' and '{self._repo_keys}'."
        )
    self._repo_connector = parse_repo_keys(repo_keys)
    return self._repo_connector
Return the repository connector for this plugin.
NOTE: This imports the `connectors` module, which imports certain plugin modules.
@property
def version(self):
    """
    Return the plugin's `__version__` string if it's defined, else `None`.
    """
    if self._version is None:
        try:
            ### Lazily read `__version__` from the plugin's module.
            ### `self.module` may be `None` or lack `__version__`;
            ### either failure leaves the version as `None`.
            self._version = self.module.__version__
        except Exception as e:
            self._version = None
    return self._version
Return the plugin's `__version__` if it's defined.
@property
def module(self):
    """
    Return the Python module of the underlying plugin.
    """
    cached = self.__dict__.get('_module', None)
    if cached is not None:
        return cached
    ### No module file means the plugin isn't installed; nothing to import.
    if self.__file__ is None:
        return None
    from meerschaum.plugins import import_plugins
    self._module = import_plugins(str(self), warn=False)
    return self._module
Return the Python module of the underlying plugin.
@property
def requirements_file_path(self) -> Union[pathlib.Path, None]:
    """
    If a file named `requirements.txt` exists, return its path.
    """
    if self.__file__ is None:
        return None
    requirements_path = pathlib.Path(self.__file__).parent / 'requirements.txt'
    return requirements_path if requirements_path.exists() else None
If a file named `requirements.txt` exists, return its path.
def is_installed(self, **kw) -> bool:
    """
    Check whether a plugin is correctly installed.

    Returns
    -------
    A `bool` indicating whether a plugin exists and is successfully imported.
    """
    ### The `__file__` property is `None` when the plugin cannot be located.
    has_module_file = self.__file__ is not None
    return has_module_file
Check whether a plugin is correctly installed.
Returns: A `bool` indicating whether a plugin exists and is successfully imported.
def make_tar(self, debug: bool = False) -> pathlib.Path:
    """
    Compress the plugin's source files into a `.tar.gz` archive and return the archive's path.

    Parameters
    ----------
    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    The `pathlib.Path` of the archive file.
    """
    ### NOTE: dropped unused imports (`subprocess`, `fnmatch`) from the original.
    import tarfile
    import pathlib
    from meerschaum.utils.debug import dprint
    from meerschaum.utils.packages import attempt_import
    pathspec = attempt_import('pathspec', debug=debug)

    if not self.__file__:
        from meerschaum.utils.warnings import error
        error(f"Could not find file for plugin '{self}'.")

    ### A package plugin (directory with `__init__.py`) is archived as a tree;
    ### a single-module plugin is archived as one file.
    if '__init__.py' in self.__file__ or os.path.isdir(self.__file__):
        path = self.__file__.replace('__init__.py', '')
        is_dir = True
    else:
        path = self.__file__
        is_dir = False

    old_cwd = os.getcwd()
    real_parent_path = pathlib.Path(os.path.realpath(path)).parent
    os.chdir(real_parent_path)

    default_patterns_to_ignore = [
        '.pyc',
        '__pycache__/',
        'eggs/',
        '__pypackages__/',
        '.git',
    ]

    def parse_gitignore() -> 'Set[str]':
        """Match the plugin tree against `.gitignore` plus the default ignore patterns."""
        gitignore_path = pathlib.Path(path) / '.gitignore'
        if not gitignore_path.exists():
            return set(default_patterns_to_ignore)
        with open(gitignore_path, 'r', encoding='utf-8') as f:
            gitignore_text = f.read()
        return set(pathspec.PathSpec.from_lines(
            pathspec.patterns.GitWildMatchPattern,
            default_patterns_to_ignore + gitignore_text.splitlines()
        ).match_tree(path))

    patterns_to_ignore = parse_gitignore() if is_dir else set()

    if debug:
        dprint(f"Patterns to ignore:\n{patterns_to_ignore}")

    with tarfile.open(self.archive_path, 'w:gz') as tarf:
        if not is_dir:
            tarf.add(f"{self.name}.py")
        else:
            for root, dirs, files in os.walk(self.name):
                for f in files:
                    good_file = True
                    fp = os.path.join(root, f)
                    for pattern in patterns_to_ignore:
                        ### Skip ignored patterns and hidden files.
                        if pattern in str(fp) or f.startswith('.'):
                            good_file = False
                            break
                    if good_file:
                        if debug:
                            dprint(f"Adding '{fp}'...")
                        tarf.add(fp)

    ### clean up and change back to old directory
    os.chdir(old_cwd)

    ### change to 775 to avoid permissions issues with the API in a Docker container
    self.archive_path.chmod(0o775)

    if debug:
        dprint(f"Created archive '{self.archive_path}'.")
    return self.archive_path
Compress the plugin's source files into a `.tar.gz` archive and return the archive's path.
Parameters
- debug (bool, default False): Verbosity toggle.
Returns: A `pathlib.Path` to the archive file's path.
def install(
    self,
    skip_deps: bool = False,
    force: bool = False,
    debug: bool = False,
) -> SuccessTuple:
    """
    Extract a plugin's tar archive to the plugins directory.

    This function checks if the plugin is already installed and if the version is equal or
    greater than the existing installation.

    Parameters
    ----------
    skip_deps: bool, default False
        If `True`, do not install dependencies.

    force: bool, default False
        If `True`, continue with installation, even if required packages fail to install.

    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    A `SuccessTuple` of success (bool) and a message (str).
    """
    if self.full_name in _ongoing_installations:
        return True, f"Already installing plugin '{self}'."
    _ongoing_installations.add(self.full_name)
    from meerschaum.utils.warnings import warn, error
    if debug:
        from meerschaum.utils.debug import dprint
    import tarfile
    import re
    import ast
    from meerschaum.plugins import sync_plugins_symlinks
    ### NOTE: dropped unused `determine_version` import from the original.
    from meerschaum.utils.packages import attempt_import, reload_meerschaum
    from meerschaum.utils.venv import init_venv
    from meerschaum.utils.misc import safely_extract_tar
    old_cwd = os.getcwd()
    old_version = ''
    new_version = ''
    temp_dir = PLUGINS_TEMP_RESOURCES_PATH / self.name
    temp_dir.mkdir(exist_ok=True)

    if not self.archive_path.exists():
        ### BUGFIX: clear the in-progress marker before this early return;
        ### previously it leaked, making every later install return "Already installing".
        _ongoing_installations.remove(self.full_name)
        return False, f"Missing archive file for plugin '{self}'."
    if self.version is not None:
        old_version = self.version
        if debug:
            dprint(f"Found existing version '{old_version}' for plugin '{self}'.")

    if debug:
        dprint(f"Extracting '{self.archive_path}' to '{temp_dir}'...")

    try:
        with tarfile.open(self.archive_path, 'r:gz') as tarf:
            safely_extract_tar(tarf, temp_dir)
    except Exception as e:
        warn(e)
        ### BUGFIX: also clear the marker on extraction failure (same leak as above).
        _ongoing_installations.remove(self.full_name)
        return False, f"Failed to extract plugin '{self.name}'."

    ### Search for version information.
    ### The archive root is either a package directory or a single `<name>.py` module.
    files = os.listdir(temp_dir)

    if str(files[0]) == self.name:
        is_dir = True
    elif str(files[0]) == self.name + '.py':
        is_dir = False
    else:
        error(f"Unknown format encountered for plugin '{self}'.")

    fpath = temp_dir / files[0]
    if is_dir:
        fpath = fpath / '__init__.py'

    init_venv(self.name, debug=debug)
    with open(fpath, 'r', encoding='utf-8') as f:
        init_lines = f.readlines()
    new_version = None
    for line in init_lines:
        if '__version__' not in line:
            continue
        version_match = re.search(r'__version__(\s?)=', line.lstrip().rstrip())
        if not version_match:
            continue
        ### Parse the literal on the right-hand side of `__version__ = ...`.
        new_version = ast.literal_eval(line.split('=')[1].lstrip().rstrip())
        break
    if not new_version:
        warn(
            f"No `__version__` defined for plugin '{self}'. "
            + "Assuming new version...",
            stack = False,
        )

    packaging_version = attempt_import('packaging.version')
    try:
        ### Treat an unparseable version pair as "new" (e.g. one side is '').
        is_new_version = (not new_version and not old_version) or (
            packaging_version.parse(old_version) < packaging_version.parse(new_version)
        )
        is_same_version = new_version and old_version and (
            packaging_version.parse(old_version) == packaging_version.parse(new_version)
        )
    except Exception as e:
        is_new_version, is_same_version = True, False

    ### Determine where to permanently store the new plugin:
    ### reuse the directory of an existing installation, else the first plugins dir.
    plugin_installation_dir_path = PLUGINS_DIR_PATHS[0]
    for path in PLUGINS_DIR_PATHS:
        files_in_plugins_dir = os.listdir(path)
        if (
            self.name in files_in_plugins_dir
            or
            (self.name + '.py') in files_in_plugins_dir
        ):
            plugin_installation_dir_path = path
            break

    success_msg = (
        f"Successfully installed plugin '{self}'"
        + ("\n (skipped dependencies)" if skip_deps else "")
        + "."
    )
    success, abort = None, None

    if is_same_version and not force:
        success, msg = True, (
            f"Plugin '{self}' is up-to-date (version {old_version}).\n" +
            " Install again with `-f` or `--force` to reinstall."
        )
        abort = True
    elif is_new_version or force:
        ### Move the extracted tree file-by-file over the installation directory.
        for src_dir, dirs, files in os.walk(temp_dir):
            if success is not None:
                break
            dst_dir = str(src_dir).replace(str(temp_dir), str(plugin_installation_dir_path))
            if not os.path.exists(dst_dir):
                os.mkdir(dst_dir)
            for f in files:
                src_file = os.path.join(src_dir, f)
                dst_file = os.path.join(dst_dir, f)
                if os.path.exists(dst_file):
                    os.remove(dst_file)

                if debug:
                    dprint(f"Moving '{src_file}' to '{dst_dir}'...")
                try:
                    shutil.move(src_file, dst_dir)
                except Exception as e:
                    success, msg = False, (
                        f"Failed to install plugin '{self}': " +
                        f"Could not move file '{src_file}' to '{dst_dir}'"
                    )
                    print(msg)
                    break
        if success is None:
            success, msg = True, success_msg
    else:
        success, msg = False, (
            f"Your installed version of plugin '{self}' ({old_version}) is higher than "
            + f"attempted version {new_version}."
        )

    shutil.rmtree(temp_dir)
    os.chdir(old_cwd)

    ### Reload the plugin's module.
    sync_plugins_symlinks(debug=debug)
    if '_module' in self.__dict__:
        del self.__dict__['_module']
    init_venv(venv=self.name, force=True, debug=debug)
    reload_meerschaum(debug=debug)

    ### if we've already failed, return here
    if not success or abort:
        _ongoing_installations.remove(self.full_name)
        return success, msg

    ### attempt to install dependencies
    dependencies_installed = skip_deps or self.install_dependencies(force=force, debug=debug)
    if not dependencies_installed:
        _ongoing_installations.remove(self.full_name)
        return False, f"Failed to install dependencies for plugin '{self}'."

    ### handling success tuple, bool, or other (typically None)
    setup_tuple = self.setup(debug=debug)
    if isinstance(setup_tuple, tuple):
        if not setup_tuple[0]:
            success, msg = setup_tuple
    elif isinstance(setup_tuple, bool):
        if not setup_tuple:
            success, msg = False, (
                f"Failed to run post-install setup for plugin '{self}'." + '\n' +
                f"Check `setup()` in '{self.__file__}' for more information " +
                f"(no error message provided)."
            )
        else:
            success, msg = True, success_msg
    elif setup_tuple is None:
        success = True
        msg = (
            f"Post-install for plugin '{self}' returned None. " +
            f"Assuming plugin successfully installed."
        )
        warn(msg)
    else:
        success = False
        msg = (
            f"Post-install for plugin '{self}' returned unexpected value " +
            f"of type '{type(setup_tuple)}': {setup_tuple}"
        )

    _ongoing_installations.remove(self.full_name)
    ### Touch the module property to trigger the (re)import of the new code.
    module = self.module
    return success, msg
Extract a plugin's tar archive to the plugins directory.
This function checks if the plugin is already installed and if the version is equal or greater than the existing installation.
Parameters
- skip_deps (bool, default False): If `True`, do not install dependencies.
- force (bool, default False): If `True`, continue with installation, even if required packages fail to install.
- debug (bool, default False): Verbosity toggle.
Returns: A `SuccessTuple` of success (bool) and a message (str).
def remove_archive(
    self,
    debug: bool = False
) -> SuccessTuple:
    """Remove a plugin's archive file."""
    archive_exists = self.archive_path.exists()
    if not archive_exists:
        return True, f"Archive file for plugin '{self}' does not exist."
    try:
        self.archive_path.unlink()
        return True, "Success"
    except Exception as e:
        return False, f"Failed to remove archive for plugin '{self}':\n{e}"
Remove a plugin's archive file.
def remove_venv(
    self,
    debug: bool = False
) -> SuccessTuple:
    """Remove a plugin's virtual environment."""
    venv_exists = self.venv_path.exists()
    if not venv_exists:
        return True, f"Virtual environment for plugin '{self}' does not exist."
    try:
        shutil.rmtree(self.venv_path)
        return True, "Success"
    except Exception as e:
        return False, f"Failed to remove virtual environment for plugin '{self}':\n{e}"
Remove a plugin's virtual environment.
def uninstall(self, debug: bool = False) -> SuccessTuple:
    """
    Remove a plugin, its virtual environment, and archive file.
    """
    ### NOTE(review): despite the docstring, this body never removes
    ### `self.archive_path` — confirm whether `remove_archive()` should be called here.
    from meerschaum.utils.packages import reload_meerschaum
    from meerschaum.plugins import sync_plugins_symlinks
    from meerschaum.utils.warnings import warn, info
    ### Success is judged below by how many removal steps warned.
    ### NOTE(review): only two code paths increment the counter, so
    ### `warnings_thrown_count < 3` can never be False — confirm the threshold.
    warnings_thrown_count: int = 0
    max_warnings: int = 3

    if not self.is_installed():
        info(
            f"Plugin '{self.name}' doesn't seem to be installed.\n "
            + "Checking for artifacts...",
            stack = False,
        )
    else:
        ### Resolve symlinks so we delete the real source, not the symlinked copy.
        real_path = pathlib.Path(os.path.realpath(self.__file__))
        try:
            ### Package plugins live in a directory; single-module plugins are one file.
            if real_path.name == '__init__.py':
                shutil.rmtree(real_path.parent)
            else:
                real_path.unlink()
        except Exception as e:
            warn(f"Could not remove source files for plugin '{self.name}':\n{e}", stack=False)
            warnings_thrown_count += 1
        else:
            info(f"Removed source files for plugin '{self.name}'.")

    if self.venv_path.exists():
        success, msg = self.remove_venv(debug=debug)
        if not success:
            warn(msg, stack=False)
            warnings_thrown_count += 1
        else:
            info(f"Removed virtual environment from plugin '{self.name}'.")

    success = warnings_thrown_count < max_warnings
    ### Rebuild the plugin symlinks and reload meerschaum so the removal takes effect.
    sync_plugins_symlinks(debug=debug)
    self.deactivate_venv(force=True, debug=debug)
    reload_meerschaum(debug=debug)
    return success, (
        f"Successfully uninstalled plugin '{self}'." if success
        else f"Failed to uninstall plugin '{self}'."
    )
Remove a plugin, its virtual environment, and archive file.
def setup(self, *args: str, debug: bool = False, **kw: Any) -> Union[SuccessTuple, bool]:
    """
    If exists, run the plugin's `setup()` function.

    Parameters
    ----------
    *args: str
        The positional arguments passed to the `setup()` function.

    debug: bool, default False
        Verbosity toggle.

    **kw: Any
        The keyword arguments passed to the `setup()` function.

    Returns
    -------
    A `SuccessTuple` or `bool` indicating success.
    """
    from meerschaum.utils.debug import dprint
    import inspect
    _setup = None
    for name, fp in inspect.getmembers(self.module):
        if name == 'setup' and inspect.isfunction(fp):
            _setup = fp
            break

    ### assume success if no setup() is found (not necessary)
    if _setup is None:
        return True

    ### Only forward `debug` / **kw if the plugin's setup() accepts them.
    sig = inspect.signature(_setup)
    has_debug, has_kw = ('debug' in sig.parameters), False
    for k, v in sig.parameters.items():
        if '**' in str(v):
            has_kw = True
            break

    _kw = {}
    if has_kw:
        _kw.update(kw)
    if has_debug:
        _kw['debug'] = debug

    if debug:
        dprint(f"Running setup for plugin '{self}'...")
    try:
        self.activate_venv(debug=debug)
        return_tuple = _setup(*args, **_kw)
    except Exception as e:
        return False, str(e)
    finally:
        ### BUGFIX: always deactivate the venv, even when setup() raises
        ### (previously the venv was left active on the exception path).
        self.deactivate_venv(debug=debug)

    if isinstance(return_tuple, tuple):
        return return_tuple
    if isinstance(return_tuple, bool):
        return return_tuple, f"Setup for Plugin '{self.name}' did not return a message."
    if return_tuple is None:
        return False, f"Setup for Plugin '{self.name}' returned None."
    return False, f"Unknown return value from setup for Plugin '{self.name}': {return_tuple}"
If exists, run the plugin's setup()
function.
Parameters
- *args (str):
The positional arguments passed to the
setup()
function. - debug (bool, default False): Verbosity toggle.
- **kw (Any):
The keyword arguments passed to the
setup()
function.
Returns
- A
SuccessTuple
or bool
indicating success.
def get_dependencies(
    self,
    debug: bool = False,
) -> List[str]:
    """
    If the Plugin has specified dependencies in a list called `required`, return the list.

    **NOTE:** Dependencies which start with `'plugin:'` are Meerschaum plugins, not pip packages.
    Meerschaum plugins may also specify connector keys for a repo after `'@'`.

    Parameters
    ----------
    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    A list of required packages and plugins (str).
    """
    ### Return the cached value once the module has been inspected.
    if '_required' in self.__dict__:
        return self._required

    ### If the plugin has not yet been imported,
    ### infer the dependencies from the source text.
    ### This is not super robust, and it doesn't feel right
    ### having multiple versions of the logic.
    ### This is necessary when determining the activation order
    ### without having to import the module.
    ### For consistency's sake, the module-less method does not cache the requirements.
    if self.__dict__.get('_module', None) is None:
        file_path = self.__file__
        if file_path is None:
            return []
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        ### Cheap short-circuit before any parsing.
        if 'required' not in text:
            return []

        ### This has some limitations:
        ### It relies on `required` being manually declared.
        ### We lose the ability to dynamically alter the `required` list,
        ### which is why we've kept the module-reliant method below.
        import ast
        import re

        ### Find the (optionally annotated) top-level `required` assignment.
        ### NOTE: This technically would break
        ### if `required` was the very first line of the file.
        req_start_match = re.search(r'\nrequired(:\s*)?.*=', text)
        if req_start_match is None:
            return []
        req_start = req_start_match.start()

        equals_ix = text.find('=', req_start)
        if equals_ix == -1:
            return []

        open_ix = text.find('[', equals_ix)
        if open_ix == -1:
            return []

        ### Scan for the matching closing bracket, tracking nesting depth so
        ### that brackets inside dependency strings (e.g. `'pkg[extra]>=1.0'`)
        ### don't end the scan early.
        ### NOTE: the previous offset-then-compare logic could never detect a
        ### missing bracket, and it stripped a `'='` from inside version
        ### specifiers (corrupting e.g. `'pkg>=1.0'` into `'pkg>1.0'`).
        depth, close_ix = 0, -1
        for ix in range(open_ix, len(text)):
            char = text[ix]
            if char == '[':
                depth += 1
            elif char == ']':
                depth -= 1
                if depth == 0:
                    close_ix = ix
                    break
        if close_ix == -1:
            return []

        req_text = text[open_ix:(close_ix + 1)]
        try:
            required = ast.literal_eval(req_text)
        except Exception as e:
            warn(
                f"Unable to determine requirements for plugin '{self.name}' "
                + "without importing the module.\n"
                + " This may be due to dynamically setting the global `required` list.\n"
                + f" {e}"
            )
            return []
        return required

    ### Module is importable: read `required` directly from the module
    ### (inside its venv) and cache the result.
    import inspect
    self.activate_venv(dependencies=False, debug=debug)
    required = []
    for name, val in inspect.getmembers(self.module):
        if name == 'required':
            required = val
            break
    self._required = required
    self.deactivate_venv(dependencies=False, debug=debug)
    return required
If the Plugin has specified dependencies in a list called required
, return the list.
NOTE: Dependencies which start with 'plugin:'
are Meerschaum plugins, not pip packages.
Meerschaum plugins may also specify connector keys for a repo after '@'
.
Parameters
- debug (bool, default False): Verbosity toggle.
Returns
- A list of required packages and plugins (str).
def get_required_plugins(self, debug: bool = False) -> List[meerschaum.plugins.Plugin]:
    """
    Return a list of required Plugin objects.
    """
    from meerschaum.utils.warnings import warn
    from meerschaum.config import get_config
    from meerschaum.config.static import STATIC_CONFIG

    sep = STATIC_CONFIG['plugins']['repo_separator']
    default_repo_keys = get_config('meerschaum', 'default_repository')
    prefix = 'plugin:'

    ### Keep only 'plugin:'-prefixed dependencies (with a non-empty name).
    required_names = [
        dependency[len(prefix):]
        for dependency in self.get_dependencies(debug=debug)
        if dependency.startswith(prefix) and len(dependency) > len(prefix)
    ]

    plugins = []
    for required_name in required_names:
        repo_keys = default_repo_keys
        if sep in required_name:
            ### A name like 'foo@api:bar' pins the plugin to a repository.
            try:
                required_name, repo_keys = required_name.split(sep)
            except Exception:
                repo_keys = default_repo_keys
                warn(
                    f"Invalid repo keys for required plugin '{required_name}'.\n "
                    + f"Will try to use '{repo_keys}' instead.",
                    stack=False,
                )
        plugins.append(Plugin(required_name, repo=repo_keys))
    return plugins
Return a list of required Plugin objects.
def get_required_packages(self, debug: bool = False) -> List[str]:
    """
    Return the required package names (excluding plugins).

    Parameters
    ----------
    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    The pip-installable dependency strings from `required`
    (entries prefixed with `'plugin:'` are omitted).
    """
    plugin_prefix = 'plugin:'
    return [
        dependency
        for dependency in self.get_dependencies(debug=debug)
        if not dependency.startswith(plugin_prefix)
    ]
Return the required package names (excluding plugins).
def activate_venv(self, dependencies: bool = True, debug: bool = False, **kw) -> bool:
    """
    Activate the virtual environments for the plugin and its dependencies.

    Parameters
    ----------
    dependencies: bool, default True
        If `True`, activate the virtual environments for required plugins.

    Returns
    -------
    A bool indicating success.
    """
    from meerschaum.utils.venv import venv_target_path
    from meerschaum.utils.packages import activate_venv
    from meerschaum.utils.misc import make_symlink, is_symlink
    from meerschaum.config._paths import PACKAGE_ROOT_PATH

    ### Dependencies' venvs come first so their packages are importable.
    if dependencies:
        for required_plugin in self.get_required_plugins(debug=debug):
            required_plugin.activate_venv(debug=debug, **kw)

    venv_meerschaum_path = venv_target_path(
        self.name, debug=debug, allow_nonexistent=True
    ) / 'meerschaum'

    ### Ensure the venv's `meerschaum` symlink points at this installation,
    ### relinking if it currently resolves elsewhere.
    success, msg = True, "Success"
    try:
        if is_symlink(venv_meerschaum_path):
            resolved_target = pathlib.Path(os.path.realpath(venv_meerschaum_path))
            if resolved_target != PACKAGE_ROOT_PATH:
                venv_meerschaum_path.unlink()
                success, msg = make_symlink(venv_meerschaum_path, PACKAGE_ROOT_PATH)
    except Exception as e:
        success, msg = False, str(e)
    if not success:
        warn(f"Unable to create symlink {venv_meerschaum_path} to {PACKAGE_ROOT_PATH}:\n{msg}")

    return activate_venv(self.name, debug=debug, **kw)
Activate the virtual environments for the plugin and its dependencies.
Parameters
- dependencies (bool, default True):
If
True
, activate the virtual environments for required plugins.
Returns
- A bool indicating success.
def deactivate_venv(self, dependencies: bool = True, debug: bool = False, **kw) -> bool:
    """
    Deactivate the virtual environments for the plugin and its dependencies.

    Parameters
    ----------
    dependencies: bool, default True
        If `True`, deactivate the virtual environments for required plugins.

    Returns
    -------
    A bool indicating success.
    """
    from meerschaum.utils.packages import deactivate_venv

    ### Deactivate this plugin's venv first, then (optionally) its dependencies'.
    was_deactivated = deactivate_venv(self.name, debug=debug, **kw)
    if not dependencies:
        return was_deactivated

    for required_plugin in self.get_required_plugins(debug=debug):
        required_plugin.deactivate_venv(debug=debug, **kw)
    return was_deactivated
Deactivate the virtual environments for the plugin and its dependencies.
Parameters
- dependencies (bool, default True):
If
True
, deactivate the virtual environments for required plugins.
Returns
- A bool indicating success.
def install_dependencies(
    self,
    force: bool = False,
    debug: bool = False,
) -> bool:
    """
    If specified, install dependencies.

    **NOTE:** Dependencies that start with `'plugin:'` will be installed as
    Meerschaum plugins from the same repository as this Plugin.
    To install from a different repository, add the repo keys after `'@'`
    (e.g. `'plugin:foo@api:bar'`).

    Parameters
    ----------
    force: bool, default False
        If `True`, continue with the installation, even if some
        required packages fail to install.

    debug: bool, default False
        Verbosity toggle.

    Returns
    -------
    A bool indicating success.
    """
    from meerschaum.utils.packages import pip_install, venv_contains_package
    # NOTE(review): `dprint` and `parse_repo_keys` appear unused in this body.
    from meerschaum.utils.debug import dprint
    from meerschaum.utils.warnings import warn, info
    from meerschaum.connectors.parse import parse_repo_keys

    _deps = self.get_dependencies(debug=debug)
    # Nothing to install: no `required` list and no bundled `requirements.txt`.
    if not _deps and self.requirements_file_path is None:
        return True

    # Install required Meerschaum plugins (dependencies prefixed with
    # 'plugin:') from their repositories before any pip packages.
    plugins = self.get_required_plugins(debug=debug)
    for _plugin in plugins:
        if _plugin.name == self.name:
            warn(f"Plugin '{self.name}' cannot depend on itself! Skipping...", stack=False)
            continue
        _success, _msg = _plugin.repo_connector.install_plugin(
            _plugin.name, debug=debug, force=force
        )
        if not _success:
            warn(
                f"Failed to install required plugin '{_plugin}' from '{_plugin.repo_connector}'"
                + f" for plugin '{self.name}':\n" + _msg,
                stack=False,
            )
            # Without `force`, any failed plugin install aborts the process.
            if not force:
                warn(
                    "Try installing with the `--force` flag to continue anyway.",
                    stack=False,
                )
                return False
            info(
                "Continuing with installation despite the failure "
                + "(careful, things might be broken!)...",
                icon=False
            )

    ### First step: parse `requirements.txt` if it exists.
    if self.requirements_file_path is not None:
        if not pip_install(
            requirements_file_path=self.requirements_file_path,
            venv=self.name, debug=debug
        ):
            warn(
                f"Failed to resolve 'requirements.txt' for plugin '{self.name}'.",
                stack=False,
            )
            if not force:
                warn(
                    "Try installing with `--force` to continue anyway.",
                    stack=False,
                )
                return False
            info(
                "Continuing with installation despite the failure "
                + "(careful, things might be broken!)...",
                icon=False
            )

    ### Don't reinstall packages that are already included in required plugins.
    packages = []
    _packages = self.get_required_packages(debug=debug)
    accounted_for_packages = set()
    for package_name in _packages:
        for plugin in plugins:
            # A package already provided by a required plugin's venv is skipped.
            if venv_contains_package(package_name, plugin.name):
                accounted_for_packages.add(package_name)
                break
    packages = [pkg for pkg in _packages if pkg not in accounted_for_packages]

    ### Attempt pip packages installation.
    if packages:
        for package in packages:
            if not pip_install(package, venv=self.name, debug=debug):
                warn(
                    f"Failed to install required package '{package}'"
                    + f" for plugin '{self.name}'.",
                    stack=False,
                )
                if not force:
                    warn(
                        "Try installing with `--force` to continue anyway.",
                        stack=False,
                    )
                    return False
                info(
                    "Continuing with installation despite the failure "
                    + "(careful, things might be broken!)...",
                    icon=False
                )
    return True
If specified, install dependencies.
NOTE: Dependencies that start with 'plugin:'
will be installed as
Meerschaum plugins from the same repository as this Plugin.
To install from a different repository, add the repo keys after '@'
(e.g. 'plugin:foo@api:bar'
).
Parameters
- force (bool, default False):
If
True
, continue with the installation, even if some required packages fail to install. - debug (bool, default False): Verbosity toggle.
Returns
- A bool indicating success.
@property
def full_name(self) -> str:
    """
    Include the repo keys with the plugin's name.
    """
    from meerschaum.config.static import STATIC_CONFIG
    separator = STATIC_CONFIG['plugins']['repo_separator']
    return f"{self.name}{separator}{self.repo_connector}"
Include the repo keys with the plugin's name.
class Venv:
    """
    Manage a virtual environment's activation status.

    Examples
    --------
    >>> from meerschaum.plugins import Plugin
    >>> with Venv('mrsm') as venv:
    ...     import pandas
    >>> with Venv(Plugin('noaa')) as venv:
    ...     import requests
    >>> venv = Venv('mrsm')
    >>> venv.activate()
    True
    >>> venv.deactivate()
    True
    >>>
    """

    def __init__(
        self,
        venv: Union[str, 'meerschaum.plugins.Plugin', None] = 'mrsm',
        debug: bool = False,
    ) -> None:
        from meerschaum.utils.venv import activate_venv, deactivate_venv, active_venvs
        ### For some weird threading issue,
        ### we can't use `isinstance` here.
        if 'meerschaum.plugins._Plugin' in str(type(venv)):
            # A Plugin was passed: delegate to its own venv methods so its
            # dependencies' venvs are handled too.
            self._venv = venv.name
            self._activate = venv.activate_venv
            self._deactivate = venv.deactivate_venv
            self._kwargs = {}
        else:
            # A plain venv name (or None for the base environment).
            self._venv = venv
            self._activate = activate_venv
            self._deactivate = deactivate_venv
            self._kwargs = {'venv': venv}
        self._debug = debug
        ### In case someone calls `deactivate()` before `activate()`.
        self._kwargs['previously_active_venvs'] = copy.deepcopy(active_venvs)


    def activate(self, debug: bool = False) -> bool:
        """
        Activate this virtual environment.
        If a `meerschaum.plugins.Plugin` was provided, its dependent virtual environments
        will also be activated.
        """
        from meerschaum.utils.venv import active_venvs
        # Snapshot the currently-active venvs so deactivate() can restore them.
        self._kwargs['previously_active_venvs'] = copy.deepcopy(active_venvs)
        return self._activate(debug=(debug or self._debug), **self._kwargs)


    def deactivate(self, debug: bool = False) -> bool:
        """
        Deactivate this virtual environment.
        If a `meerschaum.plugins.Plugin` was provided, its dependent virtual environments
        will also be deactivated.
        """
        return self._deactivate(debug=(debug or self._debug), **self._kwargs)


    @property
    def target_path(self) -> pathlib.Path:
        """
        Return the target site-packages path for this virtual environment.
        A `meerschaum.utils.venv.Venv` may have one virtual environment per minor Python version
        (e.g. Python 3.10 and Python 3.7).
        """
        from meerschaum.utils.venv import venv_target_path
        return venv_target_path(venv=self._venv, allow_nonexistent=True, debug=self._debug)


    @property
    def root_path(self) -> pathlib.Path:
        """
        Return the top-level path for this virtual environment.
        """
        from meerschaum.config._paths import VIRTENV_RESOURCES_PATH
        # The base environment (None) has no directory of its own;
        # fall back to the parent of the target site-packages path.
        if self._venv is None:
            return self.target_path.parent
        return VIRTENV_RESOURCES_PATH / self._venv


    def __enter__(self) -> None:
        self.activate(debug=self._debug)


    def __exit__(self, exc_type, exc_value, exc_traceback) -> None:
        self.deactivate(debug=self._debug)


    def __str__(self) -> str:
        # Quote the name unless this is the base environment (None).
        quote = "'" if self._venv is not None else ""
        return "Venv(" + quote + str(self._venv) + quote + ")"


    def __repr__(self) -> str:
        return self.__str__()
Manage a virtual environment's activation status.
Examples
>>> from meerschaum.plugins import Plugin
>>> with Venv('mrsm') as venv:
... import pandas
>>> with Venv(Plugin('noaa')) as venv:
... import requests
>>> venv = Venv('mrsm')
>>> venv.activate()
True
>>> venv.deactivate()
True
>>>
37 def __init__( 38 self, 39 venv: Union[str, 'meerschaum.plugins.Plugin', None] = 'mrsm', 40 debug: bool = False, 41 ) -> None: 42 from meerschaum.utils.venv import activate_venv, deactivate_venv, active_venvs 43 ### For some weird threading issue, 44 ### we can't use `isinstance` here. 45 if 'meerschaum.plugins._Plugin' in str(type(venv)): 46 self._venv = venv.name 47 self._activate = venv.activate_venv 48 self._deactivate = venv.deactivate_venv 49 self._kwargs = {} 50 else: 51 self._venv = venv 52 self._activate = activate_venv 53 self._deactivate = deactivate_venv 54 self._kwargs = {'venv': venv} 55 self._debug = debug 56 ### In case someone calls `deactivate()` before `activate()`. 57 self._kwargs['previously_active_venvs'] = copy.deepcopy(active_venvs)
60 def activate(self, debug: bool = False) -> bool: 61 """ 62 Activate this virtual environment. 63 If a `meerschaum.plugins.Plugin` was provided, its dependent virtual environments 64 will also be activated. 65 """ 66 from meerschaum.utils.venv import active_venvs 67 self._kwargs['previously_active_venvs'] = copy.deepcopy(active_venvs) 68 return self._activate(debug=(debug or self._debug), **self._kwargs)
Activate this virtual environment.
If a meerschaum.plugins.Plugin
was provided, its dependent virtual environments
will also be activated.
71 def deactivate(self, debug: bool = False) -> bool: 72 """ 73 Deactivate this virtual environment. 74 If a `meerschaum.plugins.Plugin` was provided, its dependent virtual environments 75 will also be deactivated. 76 """ 77 return self._deactivate(debug=(debug or self._debug), **self._kwargs)
Deactivate this virtual environment.
If a meerschaum.plugins.Plugin
was provided, its dependent virtual environments
will also be deactivated.
80 @property 81 def target_path(self) -> pathlib.Path: 82 """ 83 Return the target site-packages path for this virtual environment. 84 A `meerschaum.utils.venv.Venv` may have one virtual environment per minor Python version 85 (e.g. Python 3.10 and Python 3.7). 86 """ 87 from meerschaum.utils.venv import venv_target_path 88 return venv_target_path(venv=self._venv, allow_nonexistent=True, debug=self._debug)
Return the target site-packages path for this virtual environment.
A meerschaum.utils.venv.Venv
may have one virtual environment per minor Python version
(e.g. Python 3.10 and Python 3.7).
91 @property 92 def root_path(self) -> pathlib.Path: 93 """ 94 Return the top-level path for this virtual environment. 95 """ 96 from meerschaum.config._paths import VIRTENV_RESOURCES_PATH 97 if self._venv is None: 98 return self.target_path.parent 99 return VIRTENV_RESOURCES_PATH / self._venv
Return the top-level path for this virtual environment.
50class Job: 51 """ 52 Manage a `meerschaum.utils.daemon.Daemon`, locally or remotely via the API. 53 """ 54 55 def __init__( 56 self, 57 name: str, 58 sysargs: Union[List[str], str, None] = None, 59 env: Optional[Dict[str, str]] = None, 60 executor_keys: Optional[str] = None, 61 delete_after_completion: bool = False, 62 _properties: Optional[Dict[str, Any]] = None, 63 _rotating_log=None, 64 _stdin_file=None, 65 _status_hook: Optional[Callable[[], str]] = None, 66 _result_hook: Optional[Callable[[], SuccessTuple]] = None, 67 _externally_managed: bool = False, 68 ): 69 """ 70 Create a new job to manage a `meerschaum.utils.daemon.Daemon`. 71 72 Parameters 73 ---------- 74 name: str 75 The name of the job to be created. 76 This will also be used as the Daemon ID. 77 78 sysargs: Union[List[str], str, None], default None 79 The sysargs of the command to be executed, e.g. 'start api'. 80 81 env: Optional[Dict[str, str]], default None 82 If provided, set these environment variables in the job's process. 83 84 executor_keys: Optional[str], default None 85 If provided, execute the job remotely on an API instance, e.g. 'api:main'. 86 87 delete_after_completion: bool, default False 88 If `True`, delete this job when it has finished executing. 89 90 _properties: Optional[Dict[str, Any]], default None 91 If provided, use this to patch the daemon's properties. 92 """ 93 from meerschaum.utils.daemon import Daemon 94 for char in BANNED_CHARS: 95 if char in name: 96 raise ValueError(f"Invalid name: ({char}) is not allowed.") 97 98 if isinstance(sysargs, str): 99 sysargs = shlex.split(sysargs) 100 101 and_key = STATIC_CONFIG['system']['arguments']['and_key'] 102 escaped_and_key = STATIC_CONFIG['system']['arguments']['escaped_and_key'] 103 if sysargs: 104 sysargs = [ 105 (arg if arg != escaped_and_key else and_key) 106 for arg in sysargs 107 ] 108 109 ### NOTE: 'local' and 'systemd' executors are being coalesced. 
110 if executor_keys is None: 111 from meerschaum.jobs import get_executor_keys_from_context 112 executor_keys = get_executor_keys_from_context() 113 114 self.executor_keys = executor_keys 115 self.name = name 116 try: 117 self._daemon = ( 118 Daemon(daemon_id=name) 119 if executor_keys == 'local' 120 else None 121 ) 122 except Exception: 123 self._daemon = None 124 125 ### Handle any injected dependencies. 126 if _rotating_log is not None: 127 self._rotating_log = _rotating_log 128 if self._daemon is not None: 129 self._daemon._rotating_log = _rotating_log 130 131 if _stdin_file is not None: 132 self._stdin_file = _stdin_file 133 if self._daemon is not None: 134 self._daemon._stdin_file = _stdin_file 135 self._daemon._blocking_stdin_file_path = _stdin_file.blocking_file_path 136 137 if _status_hook is not None: 138 self._status_hook = _status_hook 139 140 if _result_hook is not None: 141 self._result_hook = _result_hook 142 143 self._externally_managed = _externally_managed 144 self._properties_patch = _properties or {} 145 if _externally_managed: 146 self._properties_patch.update({'externally_managed': _externally_managed}) 147 148 if env: 149 self._properties_patch.update({'env': env}) 150 151 if delete_after_completion: 152 self._properties_patch.update({'delete_after_completion': delete_after_completion}) 153 154 daemon_sysargs = ( 155 self._daemon.properties.get('target', {}).get('args', [None])[0] 156 if self._daemon is not None 157 else None 158 ) 159 160 if daemon_sysargs and sysargs and daemon_sysargs != sysargs: 161 warn("Given sysargs differ from existing sysargs.") 162 163 self._sysargs = [ 164 arg 165 for arg in (daemon_sysargs or sysargs or []) 166 if arg not in ('-d', '--daemon') 167 ] 168 for restart_flag in RESTART_FLAGS: 169 if restart_flag in self._sysargs: 170 self._properties_patch.update({'restart': True}) 171 break 172 173 @staticmethod 174 def from_pid(pid: int, executor_keys: Optional[str] = None) -> Job: 175 """ 176 Build a `Job` from the 
PID of a running Meerschaum process. 177 178 Parameters 179 ---------- 180 pid: int 181 The PID of the process. 182 183 executor_keys: Optional[str], default None 184 The executor keys to assign to the job. 185 """ 186 from meerschaum.config.paths import DAEMON_RESOURCES_PATH 187 188 psutil = mrsm.attempt_import('psutil') 189 try: 190 process = psutil.Process(pid) 191 except psutil.NoSuchProcess as e: 192 warn(f"Process with PID {pid} does not exist.", stack=False) 193 raise e 194 195 command_args = process.cmdline() 196 is_daemon = command_args[1] == '-c' 197 198 if is_daemon: 199 daemon_id = command_args[-1].split('daemon_id=')[-1].split(')')[0].replace("'", '') 200 root_dir = process.environ().get(STATIC_CONFIG['environment']['root'], None) 201 if root_dir is None: 202 from meerschaum.config.paths import ROOT_DIR_PATH 203 root_dir = ROOT_DIR_PATH 204 jobs_dir = root_dir / DAEMON_RESOURCES_PATH.name 205 daemon_dir = jobs_dir / daemon_id 206 pid_file = daemon_dir / 'process.pid' 207 208 if pid_file.exists(): 209 with open(pid_file, 'r', encoding='utf-8') as f: 210 daemon_pid = int(f.read()) 211 212 if pid != daemon_pid: 213 raise EnvironmentError(f"Differing PIDs: {pid=}, {daemon_pid=}") 214 else: 215 raise EnvironmentError(f"Is job '{daemon_id}' running?") 216 217 return Job(daemon_id, executor_keys=executor_keys) 218 219 from meerschaum._internal.arguments._parse_arguments import parse_arguments 220 from meerschaum.utils.daemon import get_new_daemon_name 221 222 mrsm_ix = 0 223 for i, arg in enumerate(command_args): 224 if 'mrsm' in arg or 'meerschaum' in arg.lower(): 225 mrsm_ix = i 226 break 227 228 sysargs = command_args[mrsm_ix+1:] 229 kwargs = parse_arguments(sysargs) 230 name = kwargs.get('name', get_new_daemon_name()) 231 return Job(name, sysargs, executor_keys=executor_keys) 232 233 def start(self, debug: bool = False) -> SuccessTuple: 234 """ 235 Start the job's daemon. 
236 """ 237 if self.executor is not None: 238 if not self.exists(debug=debug): 239 return self.executor.create_job( 240 self.name, 241 self.sysargs, 242 properties=self.daemon.properties, 243 debug=debug, 244 ) 245 return self.executor.start_job(self.name, debug=debug) 246 247 if self.is_running(): 248 return True, f"{self} is already running." 249 250 success, msg = self.daemon.run( 251 keep_daemon_output=(not self.delete_after_completion), 252 allow_dirty_run=True, 253 ) 254 if not success: 255 return success, msg 256 257 return success, f"Started {self}." 258 259 def stop(self, timeout_seconds: Optional[int] = None, debug: bool = False) -> SuccessTuple: 260 """ 261 Stop the job's daemon. 262 """ 263 if self.executor is not None: 264 return self.executor.stop_job(self.name, debug=debug) 265 266 if self.daemon.status == 'stopped': 267 if not self.restart: 268 return True, f"{self} is not running." 269 elif self.stop_time is not None: 270 return True, f"{self} will not restart until manually started." 271 272 quit_success, quit_msg = self.daemon.quit(timeout=timeout_seconds) 273 if quit_success: 274 return quit_success, f"Stopped {self}." 275 276 warn( 277 f"Failed to gracefully quit {self}.", 278 stack=False, 279 ) 280 kill_success, kill_msg = self.daemon.kill(timeout=timeout_seconds) 281 if not kill_success: 282 return kill_success, kill_msg 283 284 return kill_success, f"Killed {self}." 285 286 def pause(self, timeout_seconds: Optional[int] = None, debug: bool = False) -> SuccessTuple: 287 """ 288 Pause the job's daemon. 289 """ 290 if self.executor is not None: 291 return self.executor.pause_job(self.name, debug=debug) 292 293 pause_success, pause_msg = self.daemon.pause(timeout=timeout_seconds) 294 if not pause_success: 295 return pause_success, pause_msg 296 297 return pause_success, f"Paused {self}." 298 299 def delete(self, debug: bool = False) -> SuccessTuple: 300 """ 301 Delete the job and its daemon. 
302 """ 303 if self.executor is not None: 304 return self.executor.delete_job(self.name, debug=debug) 305 306 if self.is_running(): 307 stop_success, stop_msg = self.stop() 308 if not stop_success: 309 return stop_success, stop_msg 310 311 cleanup_success, cleanup_msg = self.daemon.cleanup() 312 if not cleanup_success: 313 return cleanup_success, cleanup_msg 314 315 return cleanup_success, f"Deleted {self}." 316 317 def is_running(self) -> bool: 318 """ 319 Determine whether the job's daemon is running. 320 """ 321 return self.status == 'running' 322 323 def exists(self, debug: bool = False) -> bool: 324 """ 325 Determine whether the job exists. 326 """ 327 if self.executor is not None: 328 return self.executor.get_job_exists(self.name, debug=debug) 329 330 return self.daemon.path.exists() 331 332 def get_logs(self) -> Union[str, None]: 333 """ 334 Return the output text of the job's daemon. 335 """ 336 if self.executor is not None: 337 return self.executor.get_logs(self.name) 338 339 return self.daemon.log_text 340 341 def monitor_logs( 342 self, 343 callback_function: Callable[[str], None] = partial(print, end=''), 344 input_callback_function: Optional[Callable[[], str]] = None, 345 stop_callback_function: Optional[Callable[[SuccessTuple], None]] = None, 346 stop_event: Optional[asyncio.Event] = None, 347 stop_on_exit: bool = False, 348 strip_timestamps: bool = False, 349 accept_input: bool = True, 350 debug: bool = False, 351 ): 352 """ 353 Monitor the job's log files and execute a callback on new lines. 354 355 Parameters 356 ---------- 357 callback_function: Callable[[str], None], default partial(print, end='') 358 The callback to execute as new data comes in. 359 Defaults to printing the output directly to `stdout`. 360 361 input_callback_function: Optional[Callable[[], str]], default None 362 If provided, execute this callback when the daemon is blocking on stdin. 363 Defaults to `sys.stdin.readline()`. 
364 365 stop_callback_function: Optional[Callable[[SuccessTuple]], str], default None 366 If provided, execute this callback when the daemon stops. 367 The job's SuccessTuple will be passed to the callback. 368 369 stop_event: Optional[asyncio.Event], default None 370 If provided, stop monitoring when this event is set. 371 You may instead raise `meerschaum.jobs.StopMonitoringLogs` 372 from within `callback_function` to stop monitoring. 373 374 stop_on_exit: bool, default False 375 If `True`, stop monitoring when the job stops. 376 377 strip_timestamps: bool, default False 378 If `True`, remove leading timestamps from lines. 379 380 accept_input: bool, default True 381 If `True`, accept input when the daemon blocks on stdin. 382 """ 383 def default_input_callback_function(): 384 return sys.stdin.readline() 385 386 if input_callback_function is None: 387 input_callback_function = default_input_callback_function 388 389 if self.executor is not None: 390 self.executor.monitor_logs( 391 self.name, 392 callback_function, 393 input_callback_function=input_callback_function, 394 stop_callback_function=stop_callback_function, 395 stop_on_exit=stop_on_exit, 396 accept_input=accept_input, 397 strip_timestamps=strip_timestamps, 398 debug=debug, 399 ) 400 return 401 402 monitor_logs_coroutine = self.monitor_logs_async( 403 callback_function=callback_function, 404 input_callback_function=input_callback_function, 405 stop_callback_function=stop_callback_function, 406 stop_event=stop_event, 407 stop_on_exit=stop_on_exit, 408 strip_timestamps=strip_timestamps, 409 accept_input=accept_input, 410 ) 411 return asyncio.run(monitor_logs_coroutine) 412 413 async def monitor_logs_async( 414 self, 415 callback_function: Callable[[str], None] = partial(print, end='', flush=True), 416 input_callback_function: Optional[Callable[[], str]] = None, 417 stop_callback_function: Optional[Callable[[SuccessTuple], None]] = None, 418 stop_event: Optional[asyncio.Event] = None, 419 stop_on_exit: bool 
= False, 420 strip_timestamps: bool = False, 421 accept_input: bool = True, 422 _logs_path: Optional[pathlib.Path] = None, 423 _log=None, 424 _stdin_file=None, 425 debug: bool = False, 426 ): 427 """ 428 Monitor the job's log files and await a callback on new lines. 429 430 Parameters 431 ---------- 432 callback_function: Callable[[str], None], default partial(print, end='') 433 The callback to execute as new data comes in. 434 Defaults to printing the output directly to `stdout`. 435 436 input_callback_function: Optional[Callable[[], str]], default None 437 If provided, execute this callback when the daemon is blocking on stdin. 438 Defaults to `sys.stdin.readline()`. 439 440 stop_callback_function: Optional[Callable[[SuccessTuple]], str], default None 441 If provided, execute this callback when the daemon stops. 442 The job's SuccessTuple will be passed to the callback. 443 444 stop_event: Optional[asyncio.Event], default None 445 If provided, stop monitoring when this event is set. 446 You may instead raise `meerschaum.jobs.StopMonitoringLogs` 447 from within `callback_function` to stop monitoring. 448 449 stop_on_exit: bool, default False 450 If `True`, stop monitoring when the job stops. 451 452 strip_timestamps: bool, default False 453 If `True`, remove leading timestamps from lines. 454 455 accept_input: bool, default True 456 If `True`, accept input when the daemon blocks on stdin. 
457 """ 458 def default_input_callback_function(): 459 return sys.stdin.readline() 460 461 if input_callback_function is None: 462 input_callback_function = default_input_callback_function 463 464 if self.executor is not None: 465 await self.executor.monitor_logs_async( 466 self.name, 467 callback_function, 468 input_callback_function=input_callback_function, 469 stop_callback_function=stop_callback_function, 470 stop_on_exit=stop_on_exit, 471 strip_timestamps=strip_timestamps, 472 accept_input=accept_input, 473 debug=debug, 474 ) 475 return 476 477 from meerschaum.utils.formatting._jobs import strip_timestamp_from_line 478 479 events = { 480 'user': stop_event, 481 'stopped': asyncio.Event(), 482 } 483 combined_event = asyncio.Event() 484 emitted_text = False 485 stdin_file = _stdin_file if _stdin_file is not None else self.daemon.stdin_file 486 487 async def check_job_status(): 488 nonlocal emitted_text 489 stopped_event = events.get('stopped', None) 490 if stopped_event is None: 491 return 492 493 sleep_time = 0.1 494 while sleep_time < 60: 495 if self.status == 'stopped': 496 if not emitted_text: 497 await asyncio.sleep(sleep_time) 498 sleep_time = round(sleep_time * 1.1, 2) 499 continue 500 501 if stop_callback_function is not None: 502 try: 503 if asyncio.iscoroutinefunction(stop_callback_function): 504 await stop_callback_function(self.result) 505 else: 506 stop_callback_function(self.result) 507 except asyncio.exceptions.CancelledError: 508 break 509 except Exception: 510 warn(traceback.format_exc()) 511 512 if stop_on_exit: 513 events['stopped'].set() 514 515 break 516 await asyncio.sleep(0.1) 517 518 async def check_blocking_on_input(): 519 while True: 520 if not emitted_text or not self.is_blocking_on_stdin(): 521 try: 522 await asyncio.sleep(0.1) 523 except asyncio.exceptions.CancelledError: 524 break 525 continue 526 527 if not self.is_running(): 528 break 529 530 await emit_latest_lines() 531 532 try: 533 print('', end='', flush=True) 534 if 
asyncio.iscoroutinefunction(input_callback_function): 535 data = await input_callback_function() 536 else: 537 data = input_callback_function() 538 except KeyboardInterrupt: 539 break 540 if not data.endswith('\n'): 541 data += '\n' 542 543 stdin_file.write(data) 544 await asyncio.sleep(0.1) 545 546 async def combine_events(): 547 event_tasks = [ 548 asyncio.create_task(event.wait()) 549 for event in events.values() 550 if event is not None 551 ] 552 if not event_tasks: 553 return 554 555 try: 556 done, pending = await asyncio.wait( 557 event_tasks, 558 return_when=asyncio.FIRST_COMPLETED, 559 ) 560 for task in pending: 561 task.cancel() 562 except asyncio.exceptions.CancelledError: 563 pass 564 finally: 565 combined_event.set() 566 567 check_job_status_task = asyncio.create_task(check_job_status()) 568 check_blocking_on_input_task = asyncio.create_task(check_blocking_on_input()) 569 combine_events_task = asyncio.create_task(combine_events()) 570 571 log = _log if _log is not None else self.daemon.rotating_log 572 lines_to_show = get_config('jobs', 'logs', 'lines_to_show') 573 574 async def emit_latest_lines(): 575 nonlocal emitted_text 576 lines = log.readlines() 577 for line in lines[(-1 * lines_to_show):]: 578 if stop_event is not None and stop_event.is_set(): 579 return 580 581 if strip_timestamps: 582 line = strip_timestamp_from_line(line) 583 584 try: 585 if asyncio.iscoroutinefunction(callback_function): 586 await callback_function(line) 587 else: 588 callback_function(line) 589 emitted_text = True 590 except StopMonitoringLogs: 591 return 592 except Exception: 593 warn(f"Error in logs callback:\n{traceback.format_exc()}") 594 595 await emit_latest_lines() 596 597 tasks = ( 598 [check_job_status_task] 599 + ([check_blocking_on_input_task] if accept_input else []) 600 + [combine_events_task] 601 ) 602 try: 603 _ = asyncio.gather(*tasks, return_exceptions=True) 604 except asyncio.exceptions.CancelledError: 605 raise 606 except Exception: 607 warn(f"Failed to 
run async checks:\n{traceback.format_exc()}") 608 609 watchfiles = mrsm.attempt_import('watchfiles') 610 async for changes in watchfiles.awatch( 611 _logs_path or LOGS_RESOURCES_PATH, 612 stop_event=combined_event, 613 ): 614 for change in changes: 615 file_path_str = change[1] 616 file_path = pathlib.Path(file_path_str) 617 latest_subfile_path = log.get_latest_subfile_path() 618 if latest_subfile_path != file_path: 619 continue 620 621 await emit_latest_lines() 622 623 await emit_latest_lines() 624 625 def is_blocking_on_stdin(self, debug: bool = False) -> bool: 626 """ 627 Return whether a job's daemon is blocking on stdin. 628 """ 629 if self.executor is not None: 630 return self.executor.get_job_is_blocking_on_stdin(self.name, debug=debug) 631 632 return self.is_running() and self.daemon.blocking_stdin_file_path.exists() 633 634 def write_stdin(self, data): 635 """ 636 Write to a job's daemon's `stdin`. 637 """ 638 self.daemon.stdin_file.write(data) 639 640 @property 641 def executor(self) -> Union[Executor, None]: 642 """ 643 If the job is remote, return the connector to the remote API instance. 644 """ 645 return ( 646 mrsm.get_connector(self.executor_keys) 647 if self.executor_keys != 'local' 648 else None 649 ) 650 651 @property 652 def status(self) -> str: 653 """ 654 Return the running status of the job's daemon. 655 """ 656 if '_status_hook' in self.__dict__: 657 return self._status_hook() 658 659 if self.executor is not None: 660 return self.executor.get_job_status(self.name) 661 662 return self.daemon.status 663 664 @property 665 def pid(self) -> Union[int, None]: 666 """ 667 Return the PID of the job's dameon. 668 """ 669 if self.executor is not None: 670 return self.executor.get_job_metadata(self.name).get('daemon', {}).get('pid', None) 671 672 return self.daemon.pid 673 674 @property 675 def restart(self) -> bool: 676 """ 677 Return whether to restart a stopped job. 
678 """ 679 if self.executor is not None: 680 return self.executor.get_job_metadata(self.name).get('restart', False) 681 682 return self.daemon.properties.get('restart', False) 683 684 @property 685 def result(self) -> SuccessTuple: 686 """ 687 Return the `SuccessTuple` when the job has terminated. 688 """ 689 if self.is_running(): 690 return True, f"{self} is running." 691 692 if '_result_hook' in self.__dict__: 693 return self._result_hook() 694 695 if self.executor is not None: 696 return ( 697 self.executor.get_job_metadata(self.name) 698 .get('result', (False, "No result available.")) 699 ) 700 701 _result = self.daemon.properties.get('result', None) 702 if _result is None: 703 return False, "No result available." 704 705 return tuple(_result) 706 707 @property 708 def sysargs(self) -> List[str]: 709 """ 710 Return the sysargs to use for the Daemon. 711 """ 712 if self._sysargs: 713 return self._sysargs 714 715 if self.executor is not None: 716 return self.executor.get_job_metadata(self.name).get('sysargs', []) 717 718 target_args = self.daemon.target_args 719 if target_args is None: 720 return [] 721 self._sysargs = target_args[0] if len(target_args) > 0 else [] 722 return self._sysargs 723 724 @property 725 def daemon(self) -> 'Daemon': 726 """ 727 Return the daemon which this job manages. 
728 """ 729 from meerschaum.utils.daemon import Daemon 730 if self._daemon is not None and self.executor is None and self._sysargs: 731 return self._daemon 732 733 remote_properties = ( 734 {} 735 if self.executor is None 736 else self.executor.get_job_properties(self.name) 737 ) 738 properties = {**remote_properties, **self._properties_patch} 739 740 self._daemon = Daemon( 741 target=entry, 742 target_args=[self._sysargs], 743 target_kw={}, 744 daemon_id=self.name, 745 label=shlex.join(self._sysargs), 746 properties=properties, 747 ) 748 if '_rotating_log' in self.__dict__: 749 self._daemon._rotating_log = self._rotating_log 750 751 if '_stdin_file' in self.__dict__: 752 self._daemon._stdin_file = self._stdin_file 753 self._daemon._blocking_stdin_file_path = self._stdin_file.blocking_file_path 754 755 return self._daemon 756 757 @property 758 def began(self) -> Union[datetime, None]: 759 """ 760 The datetime when the job began running. 761 """ 762 if self.executor is not None: 763 began_str = self.executor.get_job_began(self.name) 764 if began_str is None: 765 return None 766 return ( 767 datetime.fromisoformat(began_str) 768 .astimezone(timezone.utc) 769 .replace(tzinfo=None) 770 ) 771 772 began_str = self.daemon.properties.get('process', {}).get('began', None) 773 if began_str is None: 774 return None 775 776 return datetime.fromisoformat(began_str) 777 778 @property 779 def ended(self) -> Union[datetime, None]: 780 """ 781 The datetime when the job stopped running. 
782 """ 783 if self.executor is not None: 784 ended_str = self.executor.get_job_ended(self.name) 785 if ended_str is None: 786 return None 787 return ( 788 datetime.fromisoformat(ended_str) 789 .astimezone(timezone.utc) 790 .replace(tzinfo=None) 791 ) 792 793 ended_str = self.daemon.properties.get('process', {}).get('ended', None) 794 if ended_str is None: 795 return None 796 797 return datetime.fromisoformat(ended_str) 798 799 @property 800 def paused(self) -> Union[datetime, None]: 801 """ 802 The datetime when the job was suspended while running. 803 """ 804 if self.executor is not None: 805 paused_str = self.executor.get_job_paused(self.name) 806 if paused_str is None: 807 return None 808 return ( 809 datetime.fromisoformat(paused_str) 810 .astimezone(timezone.utc) 811 .replace(tzinfo=None) 812 ) 813 814 paused_str = self.daemon.properties.get('process', {}).get('paused', None) 815 if paused_str is None: 816 return None 817 818 return datetime.fromisoformat(paused_str) 819 820 @property 821 def stop_time(self) -> Union[datetime, None]: 822 """ 823 Return the timestamp when the job was manually stopped. 824 """ 825 if self.executor is not None: 826 return self.executor.get_job_stop_time(self.name) 827 828 if not self.daemon.stop_path.exists(): 829 return None 830 831 stop_data = self.daemon._read_stop_file() 832 if not stop_data: 833 return None 834 835 stop_time_str = stop_data.get('stop_time', None) 836 if not stop_time_str: 837 warn(f"Could not read stop time for {self}.") 838 return None 839 840 return datetime.fromisoformat(stop_time_str) 841 842 @property 843 def hidden(self) -> bool: 844 """ 845 Return a bool indicating whether this job should be displayed. 846 """ 847 return ( 848 self.name.startswith('_') 849 or self.name.startswith('.') 850 or self._is_externally_managed 851 ) 852 853 def check_restart(self) -> SuccessTuple: 854 """ 855 If `restart` is `True` and the daemon is not running, 856 restart the job. 
857 Do not restart if the job was manually stopped. 858 """ 859 if self.is_running(): 860 return True, f"{self} is running." 861 862 if not self.restart: 863 return True, f"{self} does not need to be restarted." 864 865 if self.stop_time is not None: 866 return True, f"{self} was manually stopped." 867 868 return self.start() 869 870 @property 871 def label(self) -> str: 872 """ 873 Return the job's Daemon label (joined sysargs). 874 """ 875 from meerschaum._internal.arguments import compress_pipeline_sysargs 876 sysargs = compress_pipeline_sysargs(self.sysargs) 877 return shlex.join(sysargs).replace(' + ', '\n+ ') 878 879 @property 880 def _externally_managed_file(self) -> pathlib.Path: 881 """ 882 Return the path to the externally managed file. 883 """ 884 return self.daemon.path / '.externally-managed' 885 886 def _set_externally_managed(self): 887 """ 888 Set this job as externally managed. 889 """ 890 self._externally_managed = True 891 try: 892 self._externally_managed_file.parent.mkdir(exist_ok=True, parents=True) 893 self._externally_managed_file.touch() 894 except Exception as e: 895 warn(e) 896 897 @property 898 def _is_externally_managed(self) -> bool: 899 """ 900 Return whether this job is externally managed. 901 """ 902 return self.executor_keys in (None, 'local') and ( 903 self._externally_managed or self._externally_managed_file.exists() 904 ) 905 906 @property 907 def env(self) -> Dict[str, str]: 908 """ 909 Return the environment variables to set for the job's process. 
910 """ 911 if '_env' in self.__dict__: 912 return self.__dict__['_env'] 913 914 _env = self.daemon.properties.get('env', {}) 915 default_env = { 916 'PYTHONUNBUFFERED': '1', 917 'LINES': str(get_config('jobs', 'terminal', 'lines')), 918 'COLUMNS': str(get_config('jobs', 'terminal', 'columns')), 919 } 920 self._env = {**default_env, **_env} 921 return self._env 922 923 @property 924 def delete_after_completion(self) -> bool: 925 """ 926 Return whether this job is configured to delete itself after completion. 927 """ 928 if '_delete_after_completion' in self.__dict__: 929 return self.__dict__.get('_delete_after_completion', False) 930 931 self._delete_after_completion = self.daemon.properties.get('delete_after_completion', False) 932 return self._delete_after_completion 933 934 def __str__(self) -> str: 935 sysargs = self.sysargs 936 sysargs_str = shlex.join(sysargs) if sysargs else '' 937 job_str = f'Job("{self.name}"' 938 if sysargs_str: 939 job_str += f', "{sysargs_str}"' 940 941 job_str += ')' 942 return job_str 943 944 def __repr__(self) -> str: 945 return str(self) 946 947 def __hash__(self) -> int: 948 return hash(self.name)
Manage a `meerschaum.utils.daemon.Daemon`, locally or remotely via the API.
    def __init__(
        self,
        name: str,
        sysargs: Union[List[str], str, None] = None,
        env: Optional[Dict[str, str]] = None,
        executor_keys: Optional[str] = None,
        delete_after_completion: bool = False,
        _properties: Optional[Dict[str, Any]] = None,
        _rotating_log=None,
        _stdin_file=None,
        _status_hook: Optional[Callable[[], str]] = None,
        _result_hook: Optional[Callable[[], SuccessTuple]] = None,
        _externally_managed: bool = False,
    ):
        """
        Create a new job to manage a `meerschaum.utils.daemon.Daemon`.

        Parameters
        ----------
        name: str
            The name of the job to be created.
            This will also be used as the Daemon ID.

        sysargs: Union[List[str], str, None], default None
            The sysargs of the command to be executed, e.g. 'start api'.

        env: Optional[Dict[str, str]], default None
            If provided, set these environment variables in the job's process.

        executor_keys: Optional[str], default None
            If provided, execute the job remotely on an API instance, e.g. 'api:main'.

        delete_after_completion: bool, default False
            If `True`, delete this job when it has finished executing.

        _properties: Optional[Dict[str, Any]], default None
            If provided, use this to patch the daemon's properties.
        """
        from meerschaum.utils.daemon import Daemon
        ### Job names double as daemon IDs (and thus directory names),
        ### so reject characters which would break paths.
        for char in BANNED_CHARS:
            if char in name:
                raise ValueError(f"Invalid name: ({char}) is not allowed.")

        if isinstance(sysargs, str):
            sysargs = shlex.split(sysargs)

        ### Unescape the pipeline 'and' separator back to its canonical form.
        and_key = STATIC_CONFIG['system']['arguments']['and_key']
        escaped_and_key = STATIC_CONFIG['system']['arguments']['escaped_and_key']
        if sysargs:
            sysargs = [
                (arg if arg != escaped_and_key else and_key)
                for arg in sysargs
            ]

        ### NOTE: 'local' and 'systemd' executors are being coalesced.
        if executor_keys is None:
            from meerschaum.jobs import get_executor_keys_from_context
            executor_keys = get_executor_keys_from_context()

        self.executor_keys = executor_keys
        self.name = name
        try:
            ### Only local jobs get an eager Daemon; remote jobs build one lazily.
            self._daemon = (
                Daemon(daemon_id=name)
                if executor_keys == 'local'
                else None
            )
        except Exception:
            self._daemon = None

        ### Handle any injected dependencies.
        if _rotating_log is not None:
            self._rotating_log = _rotating_log
            if self._daemon is not None:
                self._daemon._rotating_log = _rotating_log

        if _stdin_file is not None:
            self._stdin_file = _stdin_file
            if self._daemon is not None:
                self._daemon._stdin_file = _stdin_file
                self._daemon._blocking_stdin_file_path = _stdin_file.blocking_file_path

        if _status_hook is not None:
            self._status_hook = _status_hook

        if _result_hook is not None:
            self._result_hook = _result_hook

        self._externally_managed = _externally_managed
        self._properties_patch = _properties or {}
        if _externally_managed:
            self._properties_patch.update({'externally_managed': _externally_managed})

        if env:
            self._properties_patch.update({'env': env})

        if delete_after_completion:
            self._properties_patch.update({'delete_after_completion': delete_after_completion})

        ### Prefer the sysargs persisted on an existing daemon over the given ones.
        daemon_sysargs = (
            self._daemon.properties.get('target', {}).get('args', [None])[0]
            if self._daemon is not None
            else None
        )

        if daemon_sysargs and sysargs and daemon_sysargs != sysargs:
            warn("Given sysargs differ from existing sysargs.")

        ### Strip daemonization flags; the Job manages the daemon itself.
        self._sysargs = [
            arg
            for arg in (daemon_sysargs or sysargs or [])
            if arg not in ('-d', '--daemon')
        ]
        for restart_flag in RESTART_FLAGS:
            if restart_flag in self._sysargs:
                self._properties_patch.update({'restart': True})
                break
Create a new job to manage a meerschaum.utils.daemon.Daemon
.
Parameters
- name (str): The name of the job to be created. This will also be used as the Daemon ID.
- sysargs (Union[List[str], str, None], default None): The sysargs of the command to be executed, e.g. 'start api'.
- env (Optional[Dict[str, str]], default None): If provided, set these environment variables in the job's process.
- executor_keys (Optional[str], default None): If provided, execute the job remotely on an API instance, e.g. 'api:main'.
- delete_after_completion (bool, default False): If `True`, delete this job when it has finished executing.
- _properties (Optional[Dict[str, Any]], default None): If provided, use this to patch the daemon's properties.
173 @staticmethod 174 def from_pid(pid: int, executor_keys: Optional[str] = None) -> Job: 175 """ 176 Build a `Job` from the PID of a running Meerschaum process. 177 178 Parameters 179 ---------- 180 pid: int 181 The PID of the process. 182 183 executor_keys: Optional[str], default None 184 The executor keys to assign to the job. 185 """ 186 from meerschaum.config.paths import DAEMON_RESOURCES_PATH 187 188 psutil = mrsm.attempt_import('psutil') 189 try: 190 process = psutil.Process(pid) 191 except psutil.NoSuchProcess as e: 192 warn(f"Process with PID {pid} does not exist.", stack=False) 193 raise e 194 195 command_args = process.cmdline() 196 is_daemon = command_args[1] == '-c' 197 198 if is_daemon: 199 daemon_id = command_args[-1].split('daemon_id=')[-1].split(')')[0].replace("'", '') 200 root_dir = process.environ().get(STATIC_CONFIG['environment']['root'], None) 201 if root_dir is None: 202 from meerschaum.config.paths import ROOT_DIR_PATH 203 root_dir = ROOT_DIR_PATH 204 jobs_dir = root_dir / DAEMON_RESOURCES_PATH.name 205 daemon_dir = jobs_dir / daemon_id 206 pid_file = daemon_dir / 'process.pid' 207 208 if pid_file.exists(): 209 with open(pid_file, 'r', encoding='utf-8') as f: 210 daemon_pid = int(f.read()) 211 212 if pid != daemon_pid: 213 raise EnvironmentError(f"Differing PIDs: {pid=}, {daemon_pid=}") 214 else: 215 raise EnvironmentError(f"Is job '{daemon_id}' running?") 216 217 return Job(daemon_id, executor_keys=executor_keys) 218 219 from meerschaum._internal.arguments._parse_arguments import parse_arguments 220 from meerschaum.utils.daemon import get_new_daemon_name 221 222 mrsm_ix = 0 223 for i, arg in enumerate(command_args): 224 if 'mrsm' in arg or 'meerschaum' in arg.lower(): 225 mrsm_ix = i 226 break 227 228 sysargs = command_args[mrsm_ix+1:] 229 kwargs = parse_arguments(sysargs) 230 name = kwargs.get('name', get_new_daemon_name()) 231 return Job(name, sysargs, executor_keys=executor_keys)
Build a `Job` from the PID of a running Meerschaum process.
Parameters
- pid (int): The PID of the process.
- executor_keys (Optional[str], default None): The executor keys to assign to the job.
233 def start(self, debug: bool = False) -> SuccessTuple: 234 """ 235 Start the job's daemon. 236 """ 237 if self.executor is not None: 238 if not self.exists(debug=debug): 239 return self.executor.create_job( 240 self.name, 241 self.sysargs, 242 properties=self.daemon.properties, 243 debug=debug, 244 ) 245 return self.executor.start_job(self.name, debug=debug) 246 247 if self.is_running(): 248 return True, f"{self} is already running." 249 250 success, msg = self.daemon.run( 251 keep_daemon_output=(not self.delete_after_completion), 252 allow_dirty_run=True, 253 ) 254 if not success: 255 return success, msg 256 257 return success, f"Started {self}."
Start the job's daemon.
259 def stop(self, timeout_seconds: Optional[int] = None, debug: bool = False) -> SuccessTuple: 260 """ 261 Stop the job's daemon. 262 """ 263 if self.executor is not None: 264 return self.executor.stop_job(self.name, debug=debug) 265 266 if self.daemon.status == 'stopped': 267 if not self.restart: 268 return True, f"{self} is not running." 269 elif self.stop_time is not None: 270 return True, f"{self} will not restart until manually started." 271 272 quit_success, quit_msg = self.daemon.quit(timeout=timeout_seconds) 273 if quit_success: 274 return quit_success, f"Stopped {self}." 275 276 warn( 277 f"Failed to gracefully quit {self}.", 278 stack=False, 279 ) 280 kill_success, kill_msg = self.daemon.kill(timeout=timeout_seconds) 281 if not kill_success: 282 return kill_success, kill_msg 283 284 return kill_success, f"Killed {self}."
Stop the job's daemon.
286 def pause(self, timeout_seconds: Optional[int] = None, debug: bool = False) -> SuccessTuple: 287 """ 288 Pause the job's daemon. 289 """ 290 if self.executor is not None: 291 return self.executor.pause_job(self.name, debug=debug) 292 293 pause_success, pause_msg = self.daemon.pause(timeout=timeout_seconds) 294 if not pause_success: 295 return pause_success, pause_msg 296 297 return pause_success, f"Paused {self}."
Pause the job's daemon.
299 def delete(self, debug: bool = False) -> SuccessTuple: 300 """ 301 Delete the job and its daemon. 302 """ 303 if self.executor is not None: 304 return self.executor.delete_job(self.name, debug=debug) 305 306 if self.is_running(): 307 stop_success, stop_msg = self.stop() 308 if not stop_success: 309 return stop_success, stop_msg 310 311 cleanup_success, cleanup_msg = self.daemon.cleanup() 312 if not cleanup_success: 313 return cleanup_success, cleanup_msg 314 315 return cleanup_success, f"Deleted {self}."
Delete the job and its daemon.
317 def is_running(self) -> bool: 318 """ 319 Determine whether the job's daemon is running. 320 """ 321 return self.status == 'running'
Determine whether the job's daemon is running.
323 def exists(self, debug: bool = False) -> bool: 324 """ 325 Determine whether the job exists. 326 """ 327 if self.executor is not None: 328 return self.executor.get_job_exists(self.name, debug=debug) 329 330 return self.daemon.path.exists()
Determine whether the job exists.
332 def get_logs(self) -> Union[str, None]: 333 """ 334 Return the output text of the job's daemon. 335 """ 336 if self.executor is not None: 337 return self.executor.get_logs(self.name) 338 339 return self.daemon.log_text
Return the output text of the job's daemon.
    def monitor_logs(
        self,
        callback_function: Callable[[str], None] = partial(print, end=''),
        input_callback_function: Optional[Callable[[], str]] = None,
        stop_callback_function: Optional[Callable[[SuccessTuple], None]] = None,
        stop_event: Optional[asyncio.Event] = None,
        stop_on_exit: bool = False,
        strip_timestamps: bool = False,
        accept_input: bool = True,
        debug: bool = False,
    ):
        """
        Monitor the job's log files and execute a callback on new lines.

        Parameters
        ----------
        callback_function: Callable[[str], None], default partial(print, end='')
            The callback to execute as new data comes in.
            Defaults to printing the output directly to `stdout`.

        input_callback_function: Optional[Callable[[], str]], default None
            If provided, execute this callback when the daemon is blocking on stdin.
            Defaults to `sys.stdin.readline()`.

        stop_callback_function: Optional[Callable[[SuccessTuple]], str], default None
            If provided, execute this callback when the daemon stops.
            The job's SuccessTuple will be passed to the callback.

        stop_event: Optional[asyncio.Event], default None
            If provided, stop monitoring when this event is set.
            You may instead raise `meerschaum.jobs.StopMonitoringLogs`
            from within `callback_function` to stop monitoring.

        stop_on_exit: bool, default False
            If `True`, stop monitoring when the job stops.

        strip_timestamps: bool, default False
            If `True`, remove leading timestamps from lines.

        accept_input: bool, default True
            If `True`, accept input when the daemon blocks on stdin.
        """
        def default_input_callback_function():
            return sys.stdin.readline()

        if input_callback_function is None:
            input_callback_function = default_input_callback_function

        ### Remote jobs delegate log monitoring to the executor (blocking call).
        ### NOTE(review): `stop_event` is not forwarded here (unlike the async
        ### variant) — confirm whether the executor supports it.
        if self.executor is not None:
            self.executor.monitor_logs(
                self.name,
                callback_function,
                input_callback_function=input_callback_function,
                stop_callback_function=stop_callback_function,
                stop_on_exit=stop_on_exit,
                accept_input=accept_input,
                strip_timestamps=strip_timestamps,
                debug=debug,
            )
            return

        ### Local jobs run the async implementation to completion.
        monitor_logs_coroutine = self.monitor_logs_async(
            callback_function=callback_function,
            input_callback_function=input_callback_function,
            stop_callback_function=stop_callback_function,
            stop_event=stop_event,
            stop_on_exit=stop_on_exit,
            strip_timestamps=strip_timestamps,
            accept_input=accept_input,
        )
        return asyncio.run(monitor_logs_coroutine)
Monitor the job's log files and execute a callback on new lines.
Parameters
- callback_function (Callable[[str], None], default partial(print, end='')):
The callback to execute as new data comes in.
Defaults to printing the output directly to
stdout
. - input_callback_function (Optional[Callable[[], str]], default None):
If provided, execute this callback when the daemon is blocking on stdin.
Defaults to
sys.stdin.readline()
. - stop_callback_function (Optional[Callable[[SuccessTuple]], str], default None): If provided, execute this callback when the daemon stops. The job's SuccessTuple will be passed to the callback.
- stop_event (Optional[asyncio.Event], default None):
If provided, stop monitoring when this event is set.
You may instead raise
meerschaum.jobs.StopMonitoringLogs
from within `callback_function` to stop monitoring. - stop_on_exit (bool, default False):
to stop monitoring. - stop_on_exit (bool, default False):
If
True
, stop monitoring when the job stops. - strip_timestamps (bool, default False):
If
True
, remove leading timestamps from lines. - accept_input (bool, default True):
If
True
, accept input when the daemon blocks on stdin.
    async def monitor_logs_async(
        self,
        callback_function: Callable[[str], None] = partial(print, end='', flush=True),
        input_callback_function: Optional[Callable[[], str]] = None,
        stop_callback_function: Optional[Callable[[SuccessTuple], None]] = None,
        stop_event: Optional[asyncio.Event] = None,
        stop_on_exit: bool = False,
        strip_timestamps: bool = False,
        accept_input: bool = True,
        _logs_path: Optional[pathlib.Path] = None,
        _log=None,
        _stdin_file=None,
        debug: bool = False,
    ):
        """
        Monitor the job's log files and await a callback on new lines.

        Parameters
        ----------
        callback_function: Callable[[str], None], default partial(print, end='')
            The callback to execute as new data comes in.
            Defaults to printing the output directly to `stdout`.

        input_callback_function: Optional[Callable[[], str]], default None
            If provided, execute this callback when the daemon is blocking on stdin.
            Defaults to `sys.stdin.readline()`.

        stop_callback_function: Optional[Callable[[SuccessTuple]], str], default None
            If provided, execute this callback when the daemon stops.
            The job's SuccessTuple will be passed to the callback.

        stop_event: Optional[asyncio.Event], default None
            If provided, stop monitoring when this event is set.
            You may instead raise `meerschaum.jobs.StopMonitoringLogs`
            from within `callback_function` to stop monitoring.

        stop_on_exit: bool, default False
            If `True`, stop monitoring when the job stops.

        strip_timestamps: bool, default False
            If `True`, remove leading timestamps from lines.

        accept_input: bool, default True
            If `True`, accept input when the daemon blocks on stdin.
        """
        def default_input_callback_function():
            return sys.stdin.readline()

        if input_callback_function is None:
            input_callback_function = default_input_callback_function

        ### Remote jobs delegate the whole monitoring loop to the executor.
        if self.executor is not None:
            await self.executor.monitor_logs_async(
                self.name,
                callback_function,
                input_callback_function=input_callback_function,
                stop_callback_function=stop_callback_function,
                stop_on_exit=stop_on_exit,
                strip_timestamps=strip_timestamps,
                accept_input=accept_input,
                debug=debug,
            )
            return

        from meerschaum.utils.formatting._jobs import strip_timestamp_from_line

        ### Monitoring stops when either the user-supplied event or the
        ### internal 'stopped' event fires (combined below).
        events = {
            'user': stop_event,
            'stopped': asyncio.Event(),
        }
        combined_event = asyncio.Event()
        emitted_text = False
        stdin_file = _stdin_file if _stdin_file is not None else self.daemon.stdin_file

        async def check_job_status():
            ### Poll the job's status; once it stops (and after some output
            ### has been emitted), fire the stop callback and optionally
            ### end monitoring.
            nonlocal emitted_text
            stopped_event = events.get('stopped', None)
            if stopped_event is None:
                return

            sleep_time = 0.1
            while sleep_time < 60:
                if self.status == 'stopped':
                    if not emitted_text:
                        ### Wait (with backoff, capped by the loop condition)
                        ### for the first output line before reporting the stop.
                        await asyncio.sleep(sleep_time)
                        sleep_time = round(sleep_time * 1.1, 2)
                        continue

                    if stop_callback_function is not None:
                        try:
                            if asyncio.iscoroutinefunction(stop_callback_function):
                                await stop_callback_function(self.result)
                            else:
                                stop_callback_function(self.result)
                        except asyncio.exceptions.CancelledError:
                            break
                        except Exception:
                            warn(traceback.format_exc())

                    if stop_on_exit:
                        events['stopped'].set()

                    break
                await asyncio.sleep(0.1)

        async def check_blocking_on_input():
            ### When the daemon blocks on stdin, read from the input callback
            ### and forward the data to the daemon's stdin file.
            while True:
                if not emitted_text or not self.is_blocking_on_stdin():
                    try:
                        await asyncio.sleep(0.1)
                    except asyncio.exceptions.CancelledError:
                        break
                    continue

                if not self.is_running():
                    break

                await emit_latest_lines()

                try:
                    ### Flush stdout so any prompt text is visible before blocking.
                    print('', end='', flush=True)
                    if asyncio.iscoroutinefunction(input_callback_function):
                        data = await input_callback_function()
                    else:
                        data = input_callback_function()
                except KeyboardInterrupt:
                    break
                if not data.endswith('\n'):
                    data += '\n'

                stdin_file.write(data)
                await asyncio.sleep(0.1)

        async def combine_events():
            ### Set `combined_event` as soon as any of the stop events fires,
            ### which in turn ends the file-watcher loop below.
            event_tasks = [
                asyncio.create_task(event.wait())
                for event in events.values()
                if event is not None
            ]
            if not event_tasks:
                return

            try:
                done, pending = await asyncio.wait(
                    event_tasks,
                    return_when=asyncio.FIRST_COMPLETED,
                )
                for task in pending:
                    task.cancel()
            except asyncio.exceptions.CancelledError:
                pass
            finally:
                combined_event.set()

        check_job_status_task = asyncio.create_task(check_job_status())
        check_blocking_on_input_task = asyncio.create_task(check_blocking_on_input())
        combine_events_task = asyncio.create_task(combine_events())

        log = _log if _log is not None else self.daemon.rotating_log
        lines_to_show = get_config('jobs', 'logs', 'lines_to_show')

        async def emit_latest_lines():
            ### Pass the last `lines_to_show` log lines to the callback.
            nonlocal emitted_text
            lines = log.readlines()
            for line in lines[(-1 * lines_to_show):]:
                if stop_event is not None and stop_event.is_set():
                    return

                if strip_timestamps:
                    line = strip_timestamp_from_line(line)

                try:
                    if asyncio.iscoroutinefunction(callback_function):
                        await callback_function(line)
                    else:
                        callback_function(line)
                    emitted_text = True
                except StopMonitoringLogs:
                    return
                except Exception:
                    warn(f"Error in logs callback:\n{traceback.format_exc()}")

        await emit_latest_lines()

        tasks = (
            [check_job_status_task]
            + ([check_blocking_on_input_task] if accept_input else [])
            + [combine_events_task]
        )
        try:
            ### NOTE(review): `gather` is not awaited — the tasks were already
            ### scheduled via `create_task`, so this only aggregates them and
            ### the `except` clauses below cannot fire; confirm whether an
            ### `await` was intended here.
            _ = asyncio.gather(*tasks, return_exceptions=True)
        except asyncio.exceptions.CancelledError:
            raise
        except Exception:
            warn(f"Failed to run async checks:\n{traceback.format_exc()}")

        ### Watch the logs directory and emit new lines whenever the log's
        ### latest subfile changes, until `combined_event` is set.
        watchfiles = mrsm.attempt_import('watchfiles')
        async for changes in watchfiles.awatch(
            _logs_path or LOGS_RESOURCES_PATH,
            stop_event=combined_event,
        ):
            for change in changes:
                file_path_str = change[1]
                file_path = pathlib.Path(file_path_str)
                latest_subfile_path = log.get_latest_subfile_path()
                if latest_subfile_path != file_path:
                    continue

                await emit_latest_lines()

        ### Flush any remaining lines after the watcher exits.
        await emit_latest_lines()
Monitor the job's log files and await a callback on new lines.
Parameters
- callback_function (Callable[[str], None], default partial(print, end='')):
The callback to execute as new data comes in.
Defaults to printing the output directly to
stdout
. - input_callback_function (Optional[Callable[[], str]], default None):
If provided, execute this callback when the daemon is blocking on stdin.
Defaults to
sys.stdin.readline()
. - stop_callback_function (Optional[Callable[[SuccessTuple]], str], default None): If provided, execute this callback when the daemon stops. The job's SuccessTuple will be passed to the callback.
- stop_event (Optional[asyncio.Event], default None):
If provided, stop monitoring when this event is set.
You may instead raise
meerschaum.jobs.StopMonitoringLogs
from within callback_function
to stop monitoring. - stop_on_exit (bool, default False):
If
True
, stop monitoring when the job stops. - strip_timestamps (bool, default False):
If
True
, remove leading timestamps from lines. - accept_input (bool, default True):
If
True
, accept input when the daemon blocks on stdin.
625 def is_blocking_on_stdin(self, debug: bool = False) -> bool: 626 """ 627 Return whether a job's daemon is blocking on stdin. 628 """ 629 if self.executor is not None: 630 return self.executor.get_job_is_blocking_on_stdin(self.name, debug=debug) 631 632 return self.is_running() and self.daemon.blocking_stdin_file_path.exists()
Return whether a job's daemon is blocking on stdin.
634 def write_stdin(self, data): 635 """ 636 Write to a job's daemon's `stdin`. 637 """ 638 self.daemon.stdin_file.write(data)
Write to a job's daemon's stdin
.
640 @property 641 def executor(self) -> Union[Executor, None]: 642 """ 643 If the job is remote, return the connector to the remote API instance. 644 """ 645 return ( 646 mrsm.get_connector(self.executor_keys) 647 if self.executor_keys != 'local' 648 else None 649 )
If the job is remote, return the connector to the remote API instance.
651 @property 652 def status(self) -> str: 653 """ 654 Return the running status of the job's daemon. 655 """ 656 if '_status_hook' in self.__dict__: 657 return self._status_hook() 658 659 if self.executor is not None: 660 return self.executor.get_job_status(self.name) 661 662 return self.daemon.status
Return the running status of the job's daemon.
664 @property 665 def pid(self) -> Union[int, None]: 666 """ 667 Return the PID of the job's dameon. 668 """ 669 if self.executor is not None: 670 return self.executor.get_job_metadata(self.name).get('daemon', {}).get('pid', None) 671 672 return self.daemon.pid
Return the PID of the job's daemon.
674 @property 675 def restart(self) -> bool: 676 """ 677 Return whether to restart a stopped job. 678 """ 679 if self.executor is not None: 680 return self.executor.get_job_metadata(self.name).get('restart', False) 681 682 return self.daemon.properties.get('restart', False)
Return whether to restart a stopped job.
684 @property 685 def result(self) -> SuccessTuple: 686 """ 687 Return the `SuccessTuple` when the job has terminated. 688 """ 689 if self.is_running(): 690 return True, f"{self} is running." 691 692 if '_result_hook' in self.__dict__: 693 return self._result_hook() 694 695 if self.executor is not None: 696 return ( 697 self.executor.get_job_metadata(self.name) 698 .get('result', (False, "No result available.")) 699 ) 700 701 _result = self.daemon.properties.get('result', None) 702 if _result is None: 703 return False, "No result available." 704 705 return tuple(_result)
Return the SuccessTuple
when the job has terminated.
707 @property 708 def sysargs(self) -> List[str]: 709 """ 710 Return the sysargs to use for the Daemon. 711 """ 712 if self._sysargs: 713 return self._sysargs 714 715 if self.executor is not None: 716 return self.executor.get_job_metadata(self.name).get('sysargs', []) 717 718 target_args = self.daemon.target_args 719 if target_args is None: 720 return [] 721 self._sysargs = target_args[0] if len(target_args) > 0 else [] 722 return self._sysargs
Return the sysargs to use for the Daemon.
724 @property 725 def daemon(self) -> 'Daemon': 726 """ 727 Return the daemon which this job manages. 728 """ 729 from meerschaum.utils.daemon import Daemon 730 if self._daemon is not None and self.executor is None and self._sysargs: 731 return self._daemon 732 733 remote_properties = ( 734 {} 735 if self.executor is None 736 else self.executor.get_job_properties(self.name) 737 ) 738 properties = {**remote_properties, **self._properties_patch} 739 740 self._daemon = Daemon( 741 target=entry, 742 target_args=[self._sysargs], 743 target_kw={}, 744 daemon_id=self.name, 745 label=shlex.join(self._sysargs), 746 properties=properties, 747 ) 748 if '_rotating_log' in self.__dict__: 749 self._daemon._rotating_log = self._rotating_log 750 751 if '_stdin_file' in self.__dict__: 752 self._daemon._stdin_file = self._stdin_file 753 self._daemon._blocking_stdin_file_path = self._stdin_file.blocking_file_path 754 755 return self._daemon
Return the daemon which this job manages.
757 @property 758 def began(self) -> Union[datetime, None]: 759 """ 760 The datetime when the job began running. 761 """ 762 if self.executor is not None: 763 began_str = self.executor.get_job_began(self.name) 764 if began_str is None: 765 return None 766 return ( 767 datetime.fromisoformat(began_str) 768 .astimezone(timezone.utc) 769 .replace(tzinfo=None) 770 ) 771 772 began_str = self.daemon.properties.get('process', {}).get('began', None) 773 if began_str is None: 774 return None 775 776 return datetime.fromisoformat(began_str)
The datetime when the job began running.
778 @property 779 def ended(self) -> Union[datetime, None]: 780 """ 781 The datetime when the job stopped running. 782 """ 783 if self.executor is not None: 784 ended_str = self.executor.get_job_ended(self.name) 785 if ended_str is None: 786 return None 787 return ( 788 datetime.fromisoformat(ended_str) 789 .astimezone(timezone.utc) 790 .replace(tzinfo=None) 791 ) 792 793 ended_str = self.daemon.properties.get('process', {}).get('ended', None) 794 if ended_str is None: 795 return None 796 797 return datetime.fromisoformat(ended_str)
The datetime when the job stopped running.
799 @property 800 def paused(self) -> Union[datetime, None]: 801 """ 802 The datetime when the job was suspended while running. 803 """ 804 if self.executor is not None: 805 paused_str = self.executor.get_job_paused(self.name) 806 if paused_str is None: 807 return None 808 return ( 809 datetime.fromisoformat(paused_str) 810 .astimezone(timezone.utc) 811 .replace(tzinfo=None) 812 ) 813 814 paused_str = self.daemon.properties.get('process', {}).get('paused', None) 815 if paused_str is None: 816 return None 817 818 return datetime.fromisoformat(paused_str)
The datetime when the job was suspended while running.
820 @property 821 def stop_time(self) -> Union[datetime, None]: 822 """ 823 Return the timestamp when the job was manually stopped. 824 """ 825 if self.executor is not None: 826 return self.executor.get_job_stop_time(self.name) 827 828 if not self.daemon.stop_path.exists(): 829 return None 830 831 stop_data = self.daemon._read_stop_file() 832 if not stop_data: 833 return None 834 835 stop_time_str = stop_data.get('stop_time', None) 836 if not stop_time_str: 837 warn(f"Could not read stop time for {self}.") 838 return None 839 840 return datetime.fromisoformat(stop_time_str)
Return the timestamp when the job was manually stopped.
853 def check_restart(self) -> SuccessTuple: 854 """ 855 If `restart` is `True` and the daemon is not running, 856 restart the job. 857 Do not restart if the job was manually stopped. 858 """ 859 if self.is_running(): 860 return True, f"{self} is running." 861 862 if not self.restart: 863 return True, f"{self} does not need to be restarted." 864 865 if self.stop_time is not None: 866 return True, f"{self} was manually stopped." 867 868 return self.start()
If restart
is True
and the daemon is not running,
restart the job.
Do not restart if the job was manually stopped.
870 @property 871 def label(self) -> str: 872 """ 873 Return the job's Daemon label (joined sysargs). 874 """ 875 from meerschaum._internal.arguments import compress_pipeline_sysargs 876 sysargs = compress_pipeline_sysargs(self.sysargs) 877 return shlex.join(sysargs).replace(' + ', '\n+ ')
Return the job's Daemon label (joined sysargs).
906 @property 907 def env(self) -> Dict[str, str]: 908 """ 909 Return the environment variables to set for the job's process. 910 """ 911 if '_env' in self.__dict__: 912 return self.__dict__['_env'] 913 914 _env = self.daemon.properties.get('env', {}) 915 default_env = { 916 'PYTHONUNBUFFERED': '1', 917 'LINES': str(get_config('jobs', 'terminal', 'lines')), 918 'COLUMNS': str(get_config('jobs', 'terminal', 'columns')), 919 } 920 self._env = {**default_env, **_env} 921 return self._env
Return the environment variables to set for the job's process.
923 @property 924 def delete_after_completion(self) -> bool: 925 """ 926 Return whether this job is configured to delete itself after completion. 927 """ 928 if '_delete_after_completion' in self.__dict__: 929 return self.__dict__.get('_delete_after_completion', False) 930 931 self._delete_after_completion = self.daemon.properties.get('delete_after_completion', False) 932 return self._delete_after_completion
Return whether this job is configured to delete itself after completion.
10def pprint( 11 *args, 12 detect_password: bool = True, 13 nopretty: bool = False, 14 **kw 15 ) -> None: 16 """Pretty print an object according to the configured ANSI and UNICODE settings. 17 If detect_password is True (default), search and replace passwords with '*' characters. 18 Does not mutate objects. 19 """ 20 from meerschaum.utils.packages import attempt_import, import_rich 21 from meerschaum.utils.formatting import ANSI, UNICODE, get_console, print_tuple 22 from meerschaum.utils.warnings import error 23 from meerschaum.utils.misc import replace_password, dict_from_od, filter_keywords 24 from collections import OrderedDict 25 import copy, json 26 27 if ( 28 len(args) == 1 29 and 30 isinstance(args[0], tuple) 31 and 32 len(args[0]) == 2 33 and 34 isinstance(args[0][0], bool) 35 and 36 isinstance(args[0][1], str) 37 ): 38 return print_tuple(args[0]) 39 40 modify = True 41 rich_pprint = None 42 if ANSI and not nopretty: 43 rich = import_rich() 44 if rich is not None: 45 rich_pretty = attempt_import('rich.pretty') 46 if rich_pretty is not None: 47 def _rich_pprint(*args, **kw): 48 _console = get_console() 49 _kw = filter_keywords(_console.print, **kw) 50 _console.print(*args, **_kw) 51 rich_pprint = _rich_pprint 52 elif not nopretty: 53 pprintpp = attempt_import('pprintpp', warn=False) 54 try: 55 _pprint = pprintpp.pprint 56 except Exception as e: 57 import pprint as _pprint_module 58 _pprint = _pprint_module.pprint 59 60 func = ( 61 _pprint if rich_pprint is None else rich_pprint 62 ) if not nopretty else print 63 64 try: 65 args_copy = copy.deepcopy(args) 66 except Exception as e: 67 args_copy = args 68 modify = False 69 _args = [] 70 for a in args: 71 c = a 72 ### convert OrderedDict into dict 73 if isinstance(a, OrderedDict) or issubclass(type(a), OrderedDict): 74 c = dict_from_od(copy.deepcopy(c)) 75 _args.append(c) 76 args = _args 77 78 _args = list(args) 79 if detect_password and modify: 80 _args = [] 81 for a in args: 82 c = a 83 if isinstance(c, dict): 
84 c = replace_password(copy.deepcopy(c)) 85 if nopretty: 86 try: 87 c = json.dumps(c) 88 is_json = True 89 except Exception as e: 90 is_json = False 91 if not is_json: 92 try: 93 c = str(c) 94 except Exception as e: 95 pass 96 _args.append(c) 97 98 ### filter out unsupported keywords 99 func_kw = filter_keywords(func, **kw) if not nopretty else {} 100 error_msg = None 101 try: 102 func(*_args, **func_kw) 103 except Exception as e: 104 error_msg = e 105 if error_msg is not None: 106 error(error_msg)
Pretty print an object according to the configured ANSI and UNICODE settings. If detect_password is True (default), search and replace passwords with '*' characters. Does not mutate objects.
1222def attempt_import( 1223 *names: str, 1224 lazy: bool = True, 1225 warn: bool = True, 1226 install: bool = True, 1227 venv: Optional[str] = 'mrsm', 1228 precheck: bool = True, 1229 split: bool = True, 1230 check_update: bool = False, 1231 check_pypi: bool = False, 1232 check_is_installed: bool = True, 1233 allow_outside_venv: bool = True, 1234 color: bool = True, 1235 debug: bool = False 1236) -> Any: 1237 """ 1238 Raise a warning if packages are not installed; otherwise import and return modules. 1239 If `lazy` is `True`, return lazy-imported modules. 1240 1241 Returns tuple of modules if multiple names are provided, else returns one module. 1242 1243 Parameters 1244 ---------- 1245 names: List[str] 1246 The packages to be imported. 1247 1248 lazy: bool, default True 1249 If `True`, lazily load packages. 1250 1251 warn: bool, default True 1252 If `True`, raise a warning if a package cannot be imported. 1253 1254 install: bool, default True 1255 If `True`, attempt to install a missing package into the designated virtual environment. 1256 If `check_update` is True, install updates if available. 1257 1258 venv: Optional[str], default 'mrsm' 1259 The virtual environment in which to search for packages and to install packages into. 1260 1261 precheck: bool, default True 1262 If `True`, attempt to find module before importing (necessary for checking if modules exist 1263 and retaining lazy imports), otherwise assume lazy is `False`. 1264 1265 split: bool, default True 1266 If `True`, split packages' names on `'.'`. 1267 1268 check_update: bool, default False 1269 If `True` and `install` is `True`, install updates if the required minimum version 1270 does not match. 1271 1272 check_pypi: bool, default False 1273 If `True` and `check_update` is `True`, check PyPI when determining whether 1274 an update is required. 1275 1276 check_is_installed: bool, default True 1277 If `True`, check if the package is contained in the virtual environment. 
1278 1279 allow_outside_venv: bool, default True 1280 If `True`, search outside of the specified virtual environment 1281 if the package cannot be found. 1282 Setting to `False` will reinstall the package into a virtual environment, even if it 1283 is installed outside. 1284 1285 color: bool, default True 1286 If `False`, do not print ANSI colors. 1287 1288 Returns 1289 ------- 1290 The specified modules. If they're not available and `install` is `True`, it will first 1291 download them into a virtual environment and return the modules. 1292 1293 Examples 1294 -------- 1295 >>> pandas, sqlalchemy = attempt_import('pandas', 'sqlalchemy') 1296 >>> pandas = attempt_import('pandas') 1297 1298 """ 1299 1300 import importlib.util 1301 1302 ### to prevent recursion, check if parent Meerschaum package is being imported 1303 if names == ('meerschaum',): 1304 return _import_module('meerschaum') 1305 1306 if venv == 'mrsm' and _import_hook_venv is not None: 1307 if debug: 1308 print(f"Import hook for virtual environment '{_import_hook_venv}' is active.") 1309 venv = _import_hook_venv 1310 1311 _warnings = _import_module('meerschaum.utils.warnings') 1312 warn_function = _warnings.warn 1313 1314 def do_import(_name: str, **kw) -> Union['ModuleType', None]: 1315 with Venv(venv=venv, debug=debug): 1316 ### determine the import method (lazy vs normal) 1317 from meerschaum.utils.misc import filter_keywords 1318 import_method = ( 1319 _import_module if not lazy 1320 else lazy_import 1321 ) 1322 try: 1323 mod = import_method(_name, **(filter_keywords(import_method, **kw))) 1324 except Exception as e: 1325 if warn: 1326 import traceback 1327 traceback.print_exception(type(e), e, e.__traceback__) 1328 warn_function( 1329 f"Failed to import module '{_name}'.\nException:\n{e}", 1330 ImportWarning, 1331 stacklevel = (5 if lazy else 4), 1332 color = False, 1333 ) 1334 mod = None 1335 return mod 1336 1337 modules = [] 1338 for name in names: 1339 ### Check if package is a declared 
dependency. 1340 root_name = name.split('.')[0] if split else name 1341 install_name = _import_to_install_name(root_name) 1342 1343 if install_name is None: 1344 install_name = root_name 1345 if warn and root_name != 'plugins': 1346 warn_function( 1347 f"Package '{root_name}' is not declared in meerschaum.utils.packages.", 1348 ImportWarning, 1349 stacklevel = 3, 1350 color = False 1351 ) 1352 1353 ### Determine if the package exists. 1354 if precheck is False: 1355 found_module = ( 1356 do_import( 1357 name, debug=debug, warn=False, venv=venv, color=color, 1358 check_update=False, check_pypi=False, split=split, 1359 ) is not None 1360 ) 1361 else: 1362 if check_is_installed: 1363 with _locks['_is_installed_first_check']: 1364 if not _is_installed_first_check.get(name, False): 1365 package_is_installed = is_installed( 1366 name, 1367 venv = venv, 1368 split = split, 1369 allow_outside_venv = allow_outside_venv, 1370 debug = debug, 1371 ) 1372 _is_installed_first_check[name] = package_is_installed 1373 else: 1374 package_is_installed = _is_installed_first_check[name] 1375 else: 1376 package_is_installed = _is_installed_first_check.get( 1377 name, 1378 venv_contains_package(name, venv=venv, split=split, debug=debug) 1379 ) 1380 found_module = package_is_installed 1381 1382 if not found_module: 1383 if install: 1384 if not pip_install( 1385 install_name, 1386 venv = venv, 1387 split = False, 1388 check_update = check_update, 1389 color = color, 1390 debug = debug 1391 ) and warn: 1392 warn_function( 1393 f"Failed to install '{install_name}'.", 1394 ImportWarning, 1395 stacklevel = 3, 1396 color = False, 1397 ) 1398 elif warn: 1399 ### Raise a warning if we can't find the package and install = False. 1400 warn_function( 1401 (f"\n\nMissing package '{name}' from virtual environment '{venv}'; " 1402 + "some features will not work correctly." 
1403 + f"\n\nSet install=True when calling attempt_import.\n"), 1404 ImportWarning, 1405 stacklevel = 3, 1406 color = False, 1407 ) 1408 1409 ### Do the import. Will be lazy if lazy=True. 1410 m = do_import( 1411 name, debug=debug, warn=warn, venv=venv, color=color, 1412 check_update=check_update, check_pypi=check_pypi, install=install, split=split, 1413 ) 1414 modules.append(m) 1415 1416 modules = tuple(modules) 1417 if len(modules) == 1: 1418 return modules[0] 1419 return modules
Raise a warning if packages are not installed; otherwise import and return modules.
If lazy
is True
, return lazy-imported modules.
Returns tuple of modules if multiple names are provided, else returns one module.
Parameters
- names (List[str]): The packages to be imported.
- lazy (bool, default True):
If
True
, lazily load packages. - warn (bool, default True):
If
True
, raise a warning if a package cannot be imported. - install (bool, default True):
If
True
, attempt to install a missing package into the designated virtual environment. If check_update
is True, install updates if available. - venv (Optional[str], default 'mrsm'): The virtual environment in which to search for packages and to install packages into.
- precheck (bool, default True):
If
True
, attempt to find module before importing (necessary for checking if modules exist and retaining lazy imports), otherwise assume lazy is False
. - split (bool, default True):
If
True
, split packages' names on '.'
. - check_update (bool, default False):
If
True
and install
is True
, install updates if the required minimum version does not match. - check_pypi (bool, default False):
If
True
and check_update
is True
, check PyPI when determining whether an update is required. - check_is_installed (bool, default True):
If
True
, check if the package is contained in the virtual environment. - allow_outside_venv (bool, default True):
If
True
, search outside of the specified virtual environment if the package cannot be found. Setting to False
will reinstall the package into a virtual environment, even if it is installed outside. - color (bool, default True):
If
False
, do not print ANSI colors.
Returns
- The specified modules. If they're not available and
install
is True
, it will first - download them into a virtual environment and return the modules.
Examples
>>> pandas, sqlalchemy = attempt_import('pandas', 'sqlalchemy')
>>> pandas = attempt_import('pandas')
20class Connector(metaclass=abc.ABCMeta): 21 """ 22 The base connector class to hold connection attributes. 23 """ 24 def __init__( 25 self, 26 type: Optional[str] = None, 27 label: Optional[str] = None, 28 **kw: Any 29 ): 30 """ 31 Set the given keyword arguments as attributes. 32 33 Parameters 34 ---------- 35 type: str 36 The `type` of the connector (e.g. `sql`, `api`, `plugin`). 37 38 label: str 39 The `label` for the connector. 40 41 42 Examples 43 -------- 44 Run `mrsm edit config` and to edit connectors in the YAML file: 45 46 ```yaml 47 meerschaum: 48 connections: 49 {type}: 50 {label}: 51 ### attributes go here 52 ``` 53 54 """ 55 self._original_dict = copy.deepcopy(self.__dict__) 56 self._set_attributes(type=type, label=label, **kw) 57 58 ### NOTE: Override `REQUIRED_ATTRIBUTES` if `uri` is set. 59 self.verify_attributes( 60 ['uri'] 61 if 'uri' in self.__dict__ 62 else getattr(self, 'REQUIRED_ATTRIBUTES', None) 63 ) 64 65 def _reset_attributes(self): 66 self.__dict__ = self._original_dict 67 68 def _set_attributes( 69 self, 70 *args, 71 inherit_default: bool = True, 72 **kw: Any 73 ): 74 from meerschaum.config.static import STATIC_CONFIG 75 from meerschaum.utils.warnings import error 76 77 self._attributes = {} 78 79 default_label = STATIC_CONFIG['connectors']['default_label'] 80 81 ### NOTE: Support the legacy method of explicitly passing the type. 82 label = kw.get('label', None) 83 if label is None: 84 if len(args) == 2: 85 label = args[1] 86 elif len(args) == 0: 87 label = None 88 else: 89 label = args[0] 90 91 if label == 'default': 92 error( 93 f"Label cannot be 'default'. 
Did you mean '{default_label}'?", 94 InvalidAttributesError, 95 ) 96 self.__dict__['label'] = label 97 98 from meerschaum.config import get_config 99 conn_configs = copy.deepcopy(get_config('meerschaum', 'connectors')) 100 connector_config = copy.deepcopy(get_config('system', 'connectors')) 101 102 ### inherit attributes from 'default' if exists 103 if inherit_default: 104 inherit_from = 'default' 105 if self.type in conn_configs and inherit_from in conn_configs[self.type]: 106 _inherit_dict = copy.deepcopy(conn_configs[self.type][inherit_from]) 107 self._attributes.update(_inherit_dict) 108 109 ### load user config into self._attributes 110 if self.type in conn_configs and self.label in conn_configs[self.type]: 111 self._attributes.update(conn_configs[self.type][self.label] or {}) 112 113 ### load system config into self._sys_config 114 ### (deep copy so future Connectors don't inherit changes) 115 if self.type in connector_config: 116 self._sys_config = copy.deepcopy(connector_config[self.type]) 117 118 ### add additional arguments or override configuration 119 self._attributes.update(kw) 120 121 ### finally, update __dict__ with _attributes. 122 self.__dict__.update(self._attributes) 123 124 def verify_attributes( 125 self, 126 required_attributes: Optional[List[str]] = None, 127 debug: bool = False, 128 ) -> None: 129 """ 130 Ensure that the required attributes have been met. 131 132 The Connector base class checks the minimum requirements. 133 Child classes may enforce additional requirements. 134 135 Parameters 136 ---------- 137 required_attributes: Optional[List[str]], default None 138 Attributes to be verified. If `None`, default to `['label']`. 139 140 debug: bool, default False 141 Verbosity toggle. 142 143 Returns 144 ------- 145 Don't return anything. 146 147 Raises 148 ------ 149 An error if any of the required attributes are missing. 
150 """ 151 from meerschaum.utils.warnings import error, warn 152 from meerschaum.utils.debug import dprint 153 from meerschaum.utils.misc import items_str 154 if required_attributes is None: 155 required_attributes = ['label'] 156 157 missing_attributes = set() 158 for a in required_attributes: 159 if a not in self.__dict__: 160 missing_attributes.add(a) 161 if len(missing_attributes) > 0: 162 error( 163 ( 164 f"Missing {items_str(list(missing_attributes))} " 165 + f"for connector '{self.type}:{self.label}'." 166 ), 167 InvalidAttributesError, 168 silent=True, 169 stack=False 170 ) 171 172 173 def __str__(self): 174 """ 175 When cast to a string, return type:label. 176 """ 177 return f"{self.type}:{self.label}" 178 179 def __repr__(self): 180 """ 181 Represent the connector as type:label. 182 """ 183 return str(self) 184 185 @property 186 def meta(self) -> Dict[str, Any]: 187 """ 188 Return the keys needed to reconstruct this Connector. 189 """ 190 _meta = { 191 key: value 192 for key, value in self.__dict__.items() 193 if not str(key).startswith('_') 194 } 195 _meta.update({ 196 'type': self.type, 197 'label': self.label, 198 }) 199 return _meta 200 201 202 @property 203 def type(self) -> str: 204 """ 205 Return the type for this connector. 206 """ 207 _type = self.__dict__.get('type', None) 208 if _type is None: 209 import re 210 is_executor = self.__class__.__name__.lower().endswith('executor') 211 suffix_regex = ( 212 r'connector$' 213 if not is_executor 214 else r'executor$' 215 ) 216 _type = re.sub(suffix_regex, '', self.__class__.__name__.lower()) 217 self.__dict__['type'] = _type 218 return _type 219 220 221 @property 222 def label(self) -> str: 223 """ 224 Return the label for this connector. 225 """ 226 _label = self.__dict__.get('label', None) 227 if _label is None: 228 from meerschaum.config.static import STATIC_CONFIG 229 _label = STATIC_CONFIG['connectors']['default_label'] 230 self.__dict__['label'] = _label 231 return _label
The base connector class to hold connection attributes.
24 def __init__( 25 self, 26 type: Optional[str] = None, 27 label: Optional[str] = None, 28 **kw: Any 29 ): 30 """ 31 Set the given keyword arguments as attributes. 32 33 Parameters 34 ---------- 35 type: str 36 The `type` of the connector (e.g. `sql`, `api`, `plugin`). 37 38 label: str 39 The `label` for the connector. 40 41 42 Examples 43 -------- 44 Run `mrsm edit config` and to edit connectors in the YAML file: 45 46 ```yaml 47 meerschaum: 48 connections: 49 {type}: 50 {label}: 51 ### attributes go here 52 ``` 53 54 """ 55 self._original_dict = copy.deepcopy(self.__dict__) 56 self._set_attributes(type=type, label=label, **kw) 57 58 ### NOTE: Override `REQUIRED_ATTRIBUTES` if `uri` is set. 59 self.verify_attributes( 60 ['uri'] 61 if 'uri' in self.__dict__ 62 else getattr(self, 'REQUIRED_ATTRIBUTES', None) 63 )
124 def verify_attributes( 125 self, 126 required_attributes: Optional[List[str]] = None, 127 debug: bool = False, 128 ) -> None: 129 """ 130 Ensure that the required attributes have been met. 131 132 The Connector base class checks the minimum requirements. 133 Child classes may enforce additional requirements. 134 135 Parameters 136 ---------- 137 required_attributes: Optional[List[str]], default None 138 Attributes to be verified. If `None`, default to `['label']`. 139 140 debug: bool, default False 141 Verbosity toggle. 142 143 Returns 144 ------- 145 Don't return anything. 146 147 Raises 148 ------ 149 An error if any of the required attributes are missing. 150 """ 151 from meerschaum.utils.warnings import error, warn 152 from meerschaum.utils.debug import dprint 153 from meerschaum.utils.misc import items_str 154 if required_attributes is None: 155 required_attributes = ['label'] 156 157 missing_attributes = set() 158 for a in required_attributes: 159 if a not in self.__dict__: 160 missing_attributes.add(a) 161 if len(missing_attributes) > 0: 162 error( 163 ( 164 f"Missing {items_str(list(missing_attributes))} " 165 + f"for connector '{self.type}:{self.label}'." 166 ), 167 InvalidAttributesError, 168 silent=True, 169 stack=False 170 )
Ensure that the required attributes have been met.
The Connector base class checks the minimum requirements. Child classes may enforce additional requirements.
Parameters
- required_attributes (Optional[List[str]], default None):
Attributes to be verified. If
None
, default to ['label']
. - debug (bool, default False): Verbosity toggle.
Returns
- Don't return anything.
Raises
- An error if any of the required attributes are missing.
185 @property 186 def meta(self) -> Dict[str, Any]: 187 """ 188 Return the keys needed to reconstruct this Connector. 189 """ 190 _meta = { 191 key: value 192 for key, value in self.__dict__.items() 193 if not str(key).startswith('_') 194 } 195 _meta.update({ 196 'type': self.type, 197 'label': self.label, 198 }) 199 return _meta
Return the keys needed to reconstruct this Connector.
202 @property 203 def type(self) -> str: 204 """ 205 Return the type for this connector. 206 """ 207 _type = self.__dict__.get('type', None) 208 if _type is None: 209 import re 210 is_executor = self.__class__.__name__.lower().endswith('executor') 211 suffix_regex = ( 212 r'connector$' 213 if not is_executor 214 else r'executor$' 215 ) 216 _type = re.sub(suffix_regex, '', self.__class__.__name__.lower()) 217 self.__dict__['type'] = _type 218 return _type
Return the type for this connector.
221 @property 222 def label(self) -> str: 223 """ 224 Return the label for this connector. 225 """ 226 _label = self.__dict__.get('label', None) 227 if _label is None: 228 from meerschaum.config.static import STATIC_CONFIG 229 _label = STATIC_CONFIG['connectors']['default_label'] 230 self.__dict__['label'] = _label 231 return _label
Return the label for this connector.
290def make_connector(cls, _is_executor: bool = False): 291 """ 292 Register a class as a `Connector`. 293 The `type` will be the lower case of the class name, without the suffix `connector`. 294 295 Parameters 296 ---------- 297 instance: bool, default False 298 If `True`, make this connector type an instance connector. 299 This requires implementing the various pipes functions and lots of testing. 300 301 Examples 302 -------- 303 >>> import meerschaum as mrsm 304 >>> from meerschaum.connectors import make_connector, Connector 305 >>> 306 >>> @make_connector 307 >>> class FooConnector(Connector): 308 ... REQUIRED_ATTRIBUTES: list[str] = ['username', 'password'] 309 ... 310 >>> conn = mrsm.get_connector('foo:bar', username='dog', password='cat') 311 >>> print(conn.username, conn.password) 312 dog cat 313 >>> 314 """ 315 import re 316 suffix_regex = ( 317 r'connector$' 318 if not _is_executor 319 else r'executor$' 320 ) 321 typ = re.sub(suffix_regex, '', cls.__name__.lower()) 322 with _locks['types']: 323 types[typ] = cls 324 with _locks['custom_types']: 325 custom_types.add(typ) 326 with _locks['connectors']: 327 if typ not in connectors: 328 connectors[typ] = {} 329 if getattr(cls, 'IS_INSTANCE', False): 330 with _locks['instance_types']: 331 if typ not in instance_types: 332 instance_types.append(typ) 333 334 return cls
Register a class as a Connector
.
The type
will be the lower case of the class name, without the suffix connector
.
Parameters
- instance (bool, default False):
If
True
, make this connector type an instance connector. This requires implementing the various pipes functions and lots of testing.
Examples
>>> import meerschaum as mrsm
>>> from meerschaum.connectors import make_connector, Connector
>>>
>>> @make_connector
>>> class FooConnector(Connector):
... REQUIRED_ATTRIBUTES: list[str] = ['username', 'password']
...
>>> conn = mrsm.get_connector('foo:bar', username='dog', password='cat')
>>> print(conn.username, conn.password)
dog cat
>>>
def entry(
    sysargs: Optional[List[str]] = None,
    _patch_args: Optional[Dict[str, Any]] = None,
) -> SuccessTuple:
    """
    Parse arguments and launch a Meerschaum action.

    Parameters
    ----------
    sysargs: Optional[List[str]], default None
        The command-line arguments to parse. A string is split with `shlex`;
        `None` is treated as an empty argument list.

    _patch_args: Optional[Dict[str, Any]], default None
        Additional keyword arguments patched over the parsed arguments
        before dispatching each step.

    Returns
    -------
    A `SuccessTuple` indicating success.
    """
    import shlex
    import json
    from meerschaum.utils.formatting import make_header
    from meerschaum._internal.arguments import (
        parse_arguments,
        split_chained_sysargs,
        split_pipeline_sysargs,
        sysargs_has_api_executor_keys,
        get_pipeline_sysargs,
    )
    from meerschaum.config.static import STATIC_CONFIG

    if sysargs is None:
        sysargs = []
    if not isinstance(sysargs, list):
        sysargs = shlex.split(sysargs)

    pipeline_key = STATIC_CONFIG['system']['arguments']['pipeline_key']
    escaped_pipeline_key = STATIC_CONFIG['system']['arguments']['escaped_pipeline_key']
    sysargs, pipeline_args = split_pipeline_sysargs(sysargs)

    has_daemon = '-d' in sysargs or '--daemon' in sysargs
    has_start_job = sysargs[:2] == ['start', 'job']
    pipeline_has_api_executor_keys = sysargs_has_api_executor_keys(pipeline_args)

    # Daemon, `start job`, and API-executor pipelines run as a single unit;
    # otherwise split chained commands into one argument list per step.
    chained_sysargs = (
        [sysargs]
        if has_daemon or has_start_job or pipeline_has_api_executor_keys
        else split_chained_sysargs(sysargs)
    )
    if pipeline_args:
        chained_sysargs = [get_pipeline_sysargs(sysargs, pipeline_args, _patch_args=_patch_args)]

    results: List[SuccessTuple] = []

    for _sysargs in chained_sysargs:
        # Un-escape the pipeline key so nested pipeline markers are restored.
        if escaped_pipeline_key in _sysargs:
            _sysargs = [
                pipeline_key if _arg == escaped_pipeline_key else _arg
                for _arg in _sysargs
            ]

        args = parse_arguments(_sysargs)
        if _patch_args:
            args.update(_patch_args)

        argparse_exception = args.get(
            STATIC_CONFIG['system']['arguments']['failure_key'],
            None,
        )
        if argparse_exception is not None:
            args_text = args.get('text', '')
            # `show arguments` deliberately inspects parse failures, so don't abort.
            if not args_text.startswith('show arguments'):
                return (
                    False,
                    (
                        "Invalid arguments:"
                        + (f"\n{args_text}" if args_text else '')
                        + f"\n {argparse_exception}"
                    )
                )

        entry_success, entry_msg = entry_with_args(_patch_args=_patch_args, **args)
        results.append((entry_success, entry_msg))

        # Stop the chain at the first failing step.
        if not entry_success:
            break

    # Defensive guard: an empty chain produces no results, and the reporting
    # below indexes `results[-1]` / `results[0]` (previously an IndexError).
    if not results:
        return True, "No actions to execute."

    success = all(_success for _success, _ in results)
    any_success = any(_success for _success, _ in results)
    success_messages = [_msg for _success, _msg in results if _success]

    successes_msg = (
        success_messages[0]
        if len(success_messages) and len(results) == 1
        else (
            (
                'Successfully c'
                if success
                else (
                    'Failed pipeline after '
                    + f"{len(success_messages)} step"
                    + ('s' if len(success_messages) != 1 else '')
                    + '.\n\nC'
                )
            ) + 'ompleted step'
            + ('s' if len(success_messages) != 1 else '')
            + ':\n\n'
            + '\n'.join(
                [
                    (
                        make_header(shlex.join(_step_sysargs))
                        + '\n ' + _msg + '\n'
                    )
                    # Successful steps are a prefix of the chain (the loop
                    # breaks on first failure), so zip aligns correctly.
                    for _msg, _step_sysargs in zip(success_messages, chained_sysargs)
                ]
            )
        )
    )

    # Use a single falsy test for the last step's status. The original
    # checked `is False` here but `not results[-1][0]` when building
    # `fails_msg`, which could leave `fail_sysargs` as None and crash
    # `shlex.join` for falsy-but-not-False success values.
    has_fail = not results[-1][0]
    fail_ix = len(results) - 1
    fail_sysargs = chained_sysargs[fail_ix] if has_fail else None
    fail_msg = results[-1][1] if has_fail else ''
    fails_msg = (
        'Failed to complete step:\n\n'
        + make_header(shlex.join(fail_sysargs))
        + '\n '
        + fail_msg
    ) if has_fail else ''

    msg = (
        successes_msg
        + ('\n\n' if any_success else '')
        + fails_msg
    ).rstrip() if len(chained_sysargs) > 1 else results[0][1]

    # When running under systemd, persist (or clean up) the job result.
    # NOTE(review): `_systemd_result_path` and `_job_name` are module-level
    # names defined elsewhere in this file.
    if _systemd_result_path:
        from meerschaum.utils.warnings import warn
        import meerschaum as mrsm

        job = mrsm.Job(_job_name, executor_keys='systemd')
        if job.delete_after_completion:
            delete_success, delete_msg = job.delete()
            mrsm.pprint((delete_success, delete_msg))
        else:
            try:
                # Best-effort: only write if the parent directory still exists.
                if _systemd_result_path.parent.exists():
                    with open(_systemd_result_path, 'w+', encoding='utf-8') as f:
                        json.dump((success, msg), f)
            except Exception as e:
                warn(f"Failed to write job result:\n{e}")

    return success, msg