co3/co3/collector.py

128 lines
4.6 KiB
Python

'''
Defines the Collector base class.
This module is the critical "middleware" connecting the primitive object definitions and
their representations in the database. It operates with full knowledge of how both are
defined, and abstracts away both the prep work for DB insertions as well as updates
trickling down the primitive hierarchy.
The `src` format target is re-used for both canonical tables/primitives, as well as
<prim>_conversion_matter tables in tables/conversions under the `src` format. The latter
is meant to extend those attributes that are format-specific (i.e., would change when, say,
converting to `html5`), and thus need to be broken across the format dimension.
Note:
Despite the structure of the database module, this class does not currently inherit
from a super class in localsys.db (like the accessors and managers, for instance).
This will likely ultimately be the model that's embraced, but until FTS (or other
groups) need a collector, this will remain an independent class. It is, however,
named like a concrete subclass, taking on the "Core" prefix.
'''
from pathlib import Path
from collections import defaultdict
import logging
from uuid import uuid4
import sqlalchemy as sa
from co3 import util
from co3.schema import Schema
from co3.component import Component
logger = logging.getLogger(__name__)
class Collector[C: Component]:
def __init__(self, schema: Schema[C]):
self.schema = schema
self._inserts = defaultdict(lambda: defaultdict(list))
@property
def inserts(self):
return self._inserts_from_receipts()
def _inserts_from_receipts(self, receipts: list=None, pop=False):
'''
Group up added inserts by Component, often to be used directly for bulk insertion.
Optionally provide a list of `receipts` to group up only the corresponding subset of
inserts, and `pop` to remove encountered receipts from the internal store.
'''
inserts = defaultdict(list)
if receipts is None:
receipts = list(self._inserts.keys())
for receipt in receipts:
if pop:
receipt_tuple = self._inserts.pop(receipt, None)
else:
receipt_tuple = self._inserts.get(receipt, None)
if receipt_tuple is not None:
component, insert_data = receipt_tuple
inserts[component].append(insert_data)
return dict(inserts)
def _reset_session(self):
self._inserts = defaultdict(lambda: defaultdict(list))
def _generate_unique_receipt(self):
receipt = str(uuid4())
while receipt in self._inserts:
receipt = str(uuid4())
return receipt
def add_insert(
self,
component : C,
insert_data : dict,
receipts : list | None = None,
):
'''
Parameters:
component: Component from registered schema
insert_data: dict with (possibly raw/incomplete) insert data
receipts: optional list to which generated receipt should be appended.
Accommodates the common receipt list aggregation pattern.
'''
if component not in self.schema:
#logger.debug(f'Inserts provided for non-existent table {table_name}')
return None
receipt = self._generate_unique_receipt()
self._inserts[receipt] = (
component,
component.prepare_insert_data(insert_data),
)
if receipts is not None:
receipts.append(receipt)
return receipt
def collect_inserts(self, receipts=None):
'''
Collect insert-ready dictionaries for the core primitive schema. This method is
effectively a light wrapper around the File and Note-based collection logic
elsewhere in the class.
The overall collection scheme embraces a session-like sequential update model to
an internal insert tracker. The sequence of insert methods is ordered according to
the schema hierarchy, and higher level inserts dictate the scope for lower level
inserts (all methods check and populate the same `inserts` dictionary). Calling
this method flushes any existing inserts, ensuring a re-scan takes place across
calls (or "sessions").
Parameters:
skip_updated: whether to ignore primitives with existing up-to-date
database entries
Returns:
Table name-indexed dictionary of insert lists (of column name-indexed dicts)
'''
return self._inserts_from_receipts(receipts, pop=True)