execlib/execlog/syncers/router.py

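'''
Differ and Syncer for reconciling a ``co3`` database with files on disk by replaying
synthetic file events through an attached ``PathRouter``.
'''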

import logging
from pathlib import Path
from concurrent.futures import as_completed

from tqdm import tqdm
from colorama import Fore, Back, Style
from inotify_simple import flags as iflags

from co3.resources import DiskResource
from co3 import Differ, Syncer, Database

from execlog.event import Event
from execlog.routers import PathRouter
from execlog.util.generic import color_text


logger = logging.getLogger(__name__)


class PathDiffer(Differ[Path]):
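    '''
    Differ comparing files on disk (via ``DiskResource``) against their records in
    the attached ``Database``.
    '''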
    def __init__(
        self,
        database: Database,
    ):
        super().__init__(DiskResource(), database)

    def l_transform(self, item):
        '''
        Transform ``(path, head)`` tuple from ``DiskResource``.
        '''
        return Path(*item)


class PathRouterSyncer(Syncer[Path]):
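    '''
    Syncer that converts disk/database differences into synthetic ``Event`` objects
    and dispatches them through an attached ``PathRouter``.

    Disk-only paths yield ``CREATE`` events, database-only paths yield ``DELETE``
    events, and out-of-sync intersection paths yield ``MODIFY`` events, so the
    router's registered callbacks can process sync results just as they would live
    inotify events.
    '''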
    def __init__(
        self,
        differ: PathDiffer,
        router: PathRouter,
    ):
        super().__init__(differ)
        self.router = router

    def _construct_event(
        self,
        fpath: str | Path,
        endpoint: str | Path,
        action: bytes,
    ):
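        '''
        Build a synthetic ``Event`` for ``fpath`` under ``endpoint``, carrying the
        given inotify-style ``action`` so the event matches any flag filters attached
        to router callbacks.
        '''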
        return Event(
            endpoint=str(endpoint),
            name=str(Path(fpath).relative_to(endpoint)),
            action=[action],  # synthetic action to match any flag filters
        )
    def handle_l_excl(self, path: Path, disk_pairs: list):
        '''
        Handle disk-exclusive paths (i.e., those added to disk since the last sync).
        '''
        return [
            self._construct_event(str(path), endpoint, iflags.CREATE)
            for endpoint, _ in disk_pairs
        ]
    def handle_r_excl(self, path: Path, db_vals: list):
        '''
        Handle database-exclusive paths (i.e., those deleted from disk since the last
        sync). Searches for matching endpoints under the attached router and creates
        corresponding events.

        .. admonition:: On lack of endpoints

            This method handles database-exclusive items, i.e., paths no longer on
            disk but still in the database. For typical Router designs, it is not
            important to preserve the possible endpoints of origin for this kind of
            event; what matters is the absolute path of the file to be removed. In
            general, file events are associated solely with a path, but in some cases
            we may be sensitive to the base path seen to be "triggering" that file
            event, as router methods can hook in to specific endpoints. This has
            somewhat dubious effects: multiple events (with the same action) are
            dispatched for the same file, purely to observe the Router convention of
            endpoints and to allow independent trajectories through the execution
            sequence.

            One concern here is that you might, in theory, want to respond to the same
            file deletion event in different ways under different endpoints. This is
            accessible when picking up such an event live, as endpoints are grouped by
            watch descriptor and can all be triggered from the single file event. This
            is the one case where we can't *really* simulate the event taking place
            with the available data, and instead have to peer into the router to see
            what root paths the file could theoretically trigger. Most of the time,
            this won't be too problematic, since we'll be watching the same paths and
            can tell where a deleted file would've been. But there are cases where a
            watch path endpoint may be abandoned, and thus no callback will be there
            to receive the DELETE event. *Routers should heavily consider implementing
            a global DELETE handler to prevent these cases if it's critical to respond
            to deletions.* Otherwise, we still make an attempt to propagate under the
            appropriate endpoints, allowing for possible "deconstructor-like" behavior
            for specific filetypes (e.g., cleaning up auxiliary elements, writing to a
            log, creating a backup, etc).
        '''
        return [
            self._construct_event(str(path), str(endpoint), iflags.DELETE)
            for endpoint in self.router.routemap
            if Path(path).is_relative_to(Path(endpoint))
        ]
    def handle_lr_int(self, path: Path, path_tuples: tuple[list, list]):
        '''
        Handle paths reflected both in the database and on disk.

        Paths only reach this method if still present after being passed through
        ``filter_diff_sets``, which will filter out those files that are up-to-date in
        the database.
        '''
        return [
            self._construct_event(str(path), endpoint, iflags.MODIFY)
            for endpoint, _ in path_tuples[0]
        ]

    def filter_diff_sets(self, l_excl, r_excl, lr_int):
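        '''
        Drop intersection paths whose database records are already current (disk
        mtime not newer than stored mtime), then log a summary of pending additions,
        modifications, and removals.
        '''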
        total_disk_files = len(l_excl) + len(lr_int)
        total_joint_files = len(lr_int)
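
        # a joint path is considered out of sync when its on-disk mtime is newer
        # than the mtime recorded in its database entry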
        def file_out_of_sync(p):
            _, db_el = lr_int[p]
            db_mtime = float(db_el[0].get('mtime', '0'))
            disk_mtime = Path(p).stat().st_mtime

            return disk_mtime > db_mtime

        lr_int = {p: v for p, v in lr_int.items() if file_out_of_sync(p)}

        # compute out-of-sync details
        oos_count = len(l_excl) + len(lr_int)
        oos_prcnt = oos_count / max(total_disk_files, 1) * 100
        logger.info(color_text(f'{len(l_excl)} new files to add', Fore.GREEN))
        logger.info(color_text(f'{len(lr_int)} modified files [{total_joint_files - len(lr_int)} up-to-date]', Fore.YELLOW))
        logger.info(color_text(f'{len(r_excl)} files to remove', Fore.RED))
        logger.info(color_text(f'({oos_prcnt:.2f}%) of disk files out-of-sync', Style.DIM))

        return l_excl, r_excl, lr_int
    def process_chunk(self, event_sets):
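        '''
        Flatten the chunk's event sets, submit the synthetic events through the
        attached router, and gather results as the submitted futures complete.
        '''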
        chunk_events = [e for event_set in event_sets for e in event_set]

        # 1) flush synthetic events for the batch through the chained router
        # 2) block until completed and sweep up the collected inserts
        event_futures = self.router.submit(chunk_events)

        # note: we structure this future waiting like this for the TQDM view
        results = []
        for future in tqdm(
            as_completed(event_futures),
            total=len(event_futures),
            desc=f'Awaiting chunk futures [submitted {len(event_futures)}]',
        ):
            try:
                if not future.cancelled():
                    results.append(future.result())
            except Exception as e:
                logger.warning(f'Sync job failed with exception {e}')

        return results
    def shutdown(self):
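        '''
        Shut down the syncer, then the attached router.
        '''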
        super().shutdown()
        self.router.shutdown()
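
if __name__ == '__main__':
    # Minimal wiring sketch, not part of the module's public surface: shows how the
    # pieces above might be assembled. The ``Database`` argument and the ``sync()``
    # entry point are assumptions about the co3 interface rather than verified calls;
    # a real setup would also register endpoint callbacks on the router.
    db = Database('sqlite:///sync.db')                 # hypothetical connection target
    router = PathRouter()                              # endpoints/callbacks registered elsewhere
    syncer = PathRouterSyncer(PathDiffer(db), router)

    try:
        syncer.sync()                                  # assumed co3 ``Syncer`` entry point
    finally:
        syncer.shutdown()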