clean up general DB interfaces, make minor docstring revisions

parent dffc538fba
commit 06eb7b1047

.gitignore (vendored)
@@ -3,6 +3,7 @@ __pycache__/
 .pytest_cache/
 localsys.egg-info/
 .ipynb_checkpoints/
+.pytest_cache/
 .python-version

 # vendor and build files
@@ -37,3 +37,8 @@ co3/util/__init__.py
 co3/util/db.py
 co3/util/regex.py
 co3/util/types.py
+tests/test_co3.py
+tests/test_database.py
+tests/test_imports.py
+tests/test_mapper.py
+tests/test_schema.py
@@ -1,10 +1,9 @@
 '''
-Accessor
-
 Provides access to an underlying schema through a supported set of operations. Class
 methods could be general, high-level SQL wrappers, or convenience functions for common
 schema-specific queries.
 '''
+import time
 import inspect
 from pathlib import Path
 from collections import defaultdict
@@ -23,7 +22,16 @@ class Accessor[C: Component](metaclass=ABCMeta):
     Parameters:
         engine: SQLAlchemy engine to use for queries. Engine is initialized dynamically as
                 a property (based on the config) if not provided
+
+    Instance variables:
+        access_log: time-indexed log of access queries performed
     '''
+    def __init__(self):
+        self.access_log = {}
+
+    def log_access(self, stmt):
+        self.access_log[time.time()] = f'{stmt}'
+
     @abstractmethod
     def raw_select(
         self,
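The ``access_log`` / ``log_access`` pair added above gives every ``Accessor`` a time-indexed record of the statements it has run. A minimal sketch of the pattern, using a hypothetical ``DummyAccessor`` stand-in (its no-op ``raw_select`` is invented purely for illustration):

.. code-block:: python

    import time

    class DummyAccessor:
        '''Stands in for an Accessor[C] subclass; only the logging behavior matters here.'''
        def __init__(self):
            self.access_log = {}                       # timestamp -> stringified statement

        def log_access(self, stmt):
            self.access_log[time.time()] = f'{stmt}'

        def raw_select(self, connection, sql: str):
            self.log_access(sql)                       # record the query before running it
            return []                                  # a real subclass would execute `sql` here

    acc = DummyAccessor()
    acc.raw_select(None, 'SELECT 1')
    assert len(acc.access_log) == 1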
@@ -36,7 +44,7 @@ class Accessor[C: Component](metaclass=ABCMeta):
     def select(
         self,
         connection,
-        component: C,
+        component: str | C,
         *args,
         **kwargs
     ):
@@ -2,25 +2,37 @@ import sqlalchemy as sa

 from co3 import util
 from co3.accessor import Accessor
+from co3.accessors.sql import SQLAccessor


 class FTSAccessor(Accessor):
-    def search(
+    '''
+    Perform queries on efficient full-text search (FTS) tables.
+
+    Note how this class doesn't inherit from ``SQLAccessor``, or even
+    ``RelationalAccessor``. We don't look at FTS tables as Relations, due to restrictions
+    on their JOIN capabilities, and avoid embracing what composability they may have to
+    prevent accidentally introducing inefficiencies. Instead, just single FTS tables can
+    be selected from via this Accessor, and the FTSTable type, despite the use of the word
+    "table", is a direct child of the Component type.
+    '''
+    def __init__(self):
+        self.sql_accessor = SQLAccessor()
+        self.access_log = sql_accessor.access_log
+
+    def select(
         self,
+        connection,
         table_name  : str,
         select_cols : str | list | None = '*',
         search_cols : str | None        = None,
-        q           : str | None        = None,
-        colq        : str | None        = None,
+        query       : str | None        = None,
+        col_query   : str | None        = None,
         snip_col    : int | None        = 0,
         hl_col      : int | None        = 0,
         limit       : int | None        = 100,
         snip        : int | None        = 64,
         tokenizer   : str | None        = 'unicode61',
-        group_by    : str | None        = None,
-        agg_cols    : list | None       = None,
-        wherein_dict: dict | None       = None,
-        unique_on   : dict | None       = None,
     ):
        '''
        Execute a search query against an indexed FTS table for specific primitives. This
@@ -33,23 +45,6 @@ class FTSAccessor(Accessor):
        ``MATCH``-based score for the text & column queries. Results are (a list of) fully
        expanded dictionaries housing column-value pairs.

-       Note:
-           GROUP BY cannot be paired with SQLITE FTS extensions; thus, we perform manual
-           group checks on the result set in Python before response
-
-       Analysis:
-           The returned JSON structure has been (loosely) optimized for speed on the client
-           side. Fully forming individual dictionary based responses saves time in
-           Javascript, as the JSON parser is expected to be able to create the objects
-           faster than post-hoc construction in JS. This return structure was compared
-           against returning an array of arrays (all ordered in the same fashion), along with
-           a column list to be associated with each of the result values. While this saves
-           some size on the payload (the same column names don't have to be transmitted for
-           each result), the size of the returned content massively outweighs the
-           predominantly short column names. The only way this structure would be viable is
-           if a significant amount was saved on transfer compared to the slow down in JS
-           object construction; this is (almost) never the case.
-
        Parameters:
            table_name  : name of FTS table to search
            search_cols : space separated string of columns to use for primary queries
@@ -62,28 +57,25 @@ class FTSAccessor(Accessor):
            limit       : maximum number of results to return in the SQL query
            snip        : snippet length (max: 64)
            tokenizer   : tokenizer to use (assumes relevant FTS table has been built)
-           ...
-           wherein_dict: (col-name, value-list) pairs to match result set against, via
-                         WHERE ... IN clauses

        Returns:
            Dictionary with search results (list of column indexed dictionaries) and relevant
            metadata.
        '''
-        search_q = ''
+        search_query = ''

        if type(select_cols) is list:
            select_cols = ', '.join(select_cols)

        # construct main search query
-        if search_cols and q:
-            search_q = f'{{{search_cols}}} : {q}'
+        if search_cols and query:
+            search_query = f'{{{search_cols}}} : {query}'

        # add auxiliary search constraints
-        if colq:
-            search_q += f' {colq}'
+        if col_query:
+            search_query += f' {col_query}'

-        search_q = search_q.strip()
+        search_query = search_query.strip()

        hl_start = '<b><mark>'
        hl_end   = '</mark></b>'
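For concreteness, a small sketch of the FTS5 ``MATCH`` string that the renamed ``query`` / ``col_query`` parameters produce; the column and term values below are invented:

.. code-block:: python

    search_cols = 'title content'
    query       = 'database'
    col_query   = 'title:schema'

    # mirrors the construction in FTSAccessor.select above
    search_query  = f'{{{search_cols}}} : {query}'   # '{title content} : database'
    search_query += f' {col_query}'                  # '{title content} : database title:schema'
    search_query  = search_query.strip()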
@@ -99,8 +91,8 @@ class FTSAccessor(Accessor):
        '''

        where_clauses = []
-        if search_q:
-            where_clauses.append(f"{fts_table_name} MATCH '{search_q}'\n")
+        if search_query:
+            where_clauses.append(f"{fts_table_name} MATCH '{search_query}'\n")

        if wherein_dict:
            for col, vals in wherein_dict.items():
@@ -112,36 +104,6 @@ class FTSAccessor(Accessor):

        sql += f'ORDER BY rank LIMIT {limit};'

-        row_dicts, cols = self.raw_select(sql, include_cols=True)
+        row_dicts, cols = self.sql_accessor.raw_select(connection, sql, include_cols=True)

-        if group_by is None:
-            return row_dicts, cols
-
-        if agg_cols is None:
-            agg_cols = []
-
-        # "group by" block ID and wrangle the links into a list
-        # note we can't perform native GROUP BYs with FTS results
-        group_by_idx = {}
-        for row in row_dicts:
-            group_by_attr = row.get(group_by)
-
-            # add new entries
-            for agg_col in agg_cols:
-                row[f'{agg_col}_agg'] = set()
-
-            if group_by_attr is None:
-                continue
-
-            if group_by_attr not in group_by_idx:
-                group_by_idx[group_by_attr] = row
-
-            for agg_col in agg_cols:
-                if agg_col in row:
-                    group_by_idx[group_by_attr][f'{agg_col}_agg'].add(row[agg_col])
-
-        return {
-            'results'     : group_by_idx,
-            'columns'     : cols,
-            'num_results' : len(row_dicts),
-        }
+        return row_dicts, cols
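With the grouping and aggregation pass removed from ``select``, any equivalent post-processing now belongs to the caller. A rough sketch of what that might look like on the returned ``row_dicts`` (the ``group_by`` / ``agg_cols`` names simply echo the removed parameters and are not part of the API):

.. code-block:: python

    def group_rows(row_dicts, group_by, agg_cols=()):
        '''Collapse rows on `group_by`, collecting `agg_cols` values into sets.'''
        grouped = {}
        for row in row_dicts:
            key = row.get(group_by)
            if key is None:
                continue
            entry = grouped.setdefault(key, {**row, **{f'{c}_agg': set() for c in agg_cols}})
            for c in agg_cols:
                if c in row:
                    entry[f'{c}_agg'].add(row[c])
        return grouped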
@@ -1,62 +1,58 @@
 '''
-Design proposal: variable backends
+.. admonition:: Design proposal: variable backends

 One particular feature not supported by the current type hierarchy is the possible use of
 different backends to implement a general interface like SQLAccessor. One could imagine,
 for instance, using ``sqlalchemy`` or ``sqlite`` to define the same methods laid out in a
 parent class blueprint. It's not too difficult to imagine cases where both of these may be
 useful, but for now it is outside the development scope. Should it ever enter the scope,
 however, we might consider a simple ``backend`` argument on instantiation, keeping just the
 SQLAccessor exposed rather than a whole set of backend-specific types:

 .. code-block:: python

     class SQLAlchemyAccessor(RelationalAccessor): # may also inherit from a dedicated interface parent
         def select(...):
             ...

     class SQLiteAccessor(RelationalAccessor):
         def select(...):
             ...

     class SQLAccessor(RelationalAccessor):
         backends = {
             'sqlalchemy': SQLAlchemyAccessor,
             'sqlite':     SQLteAccessor,
         }

         def __init__(self, backend: str):
             self.backend = self.backends.get(backend)

         def select(...):
             return self.backend.select(...)


 For now, we can look at SQLAccessor (and equivalents in other type hierarchies, like
 SQLManagers) as being SQLAlchemyAccessors and not supporting any backend swapping. But in
 theory, to make the above change, we'd just need to rename it and wrap it up.
 '''

-from pathlib import Path
-from collections.abc import Iterable
 import inspect
-from functools import cache
+from pathlib import Path

 import sqlalchemy as sa

-from co3 import util
 from co3.engines import SQLEngine
 from co3.accessor import Accessor
 from co3.components import Relation, SQLTable


 class RelationalAccessor[R: Relation](Accessor[R]):
+    @staticmethod
     def raw_select(
         self,
         connection,
         text: str
     ):
-        #connection.exec
         raise NotImplementedError

     def select(
@@ -110,6 +106,7 @@ class SQLAccessor(RelationalAccessor[SQLTable]):
            bind_params=bind_params,
            include_cols=include_cols
        )
+        self.log_access(sql)

        if mappings:
            return res.mappings().all()
@@ -165,6 +162,7 @@ class SQLAccessor(RelationalAccessor[SQLTable]):
            statement = statement.limit(limit)

        res = SQLEngine.execute(connection, statement, include_cols=include_cols)
+        self.log_access(statement)

        if mappings:
            return res.mappings().all()
@@ -181,11 +179,15 @@ class SQLAccessor(RelationalAccessor[SQLTable]):
        returned dictionaries. This information is not available under CursorResults and thus
        must be provided separately. This will yield results like the following:

-        [..., {'table1.col':<value>, 'table2.col':<value>, ...}, ...]
+        .. code-block:: python
+
+            [..., {'table1.col':<value>, 'table2.col':<value>, ...}, ...]

        Instead of the automatic mapping names:

-        [..., {'col':<value>, 'col_1':<value>, ...}, ...]
+        .. code-block:: python
+
+            [..., {'col':<value>, 'col_1':<value>, ...}, ...]

        which can make accessing certain results a little more intuitive.
        '''
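A runnable illustration of the naming difference this docstring describes, under the assumption of a plain SQLAlchemy Core join (the table and column names are invented):

.. code-block:: python

    import sqlalchemy as sa

    md    = sa.MetaData()
    files = sa.Table('files', md,
                     sa.Column('id', sa.Integer, primary_key=True),
                     sa.Column('name', sa.Text))
    notes = sa.Table('notes', md,
                     sa.Column('file_id', sa.Integer, sa.ForeignKey('files.id')),
                     sa.Column('name', sa.Text))

    stmt   = sa.select(files.c.name, notes.c.name).join_from(files, notes)
    engine = sa.create_engine('sqlite://')
    md.create_all(engine)

    with engine.connect() as conn:
        # automatic mapping names: the duplicate column is disambiguated, e.g. 'name', 'name_1'
        auto = conn.execute(stmt).mappings().all()

        # table-qualified names built from the statement's own column metadata
        cols      = [f'{c.table.name}.{c.name}' for c in stmt.selected_columns]
        qualified = [dict(zip(cols, row)) for row in conn.execute(stmt).fetchall()]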
@@ -1,10 +1,9 @@
+import time
 import pickle
 import logging
 from pathlib import Path
-import time

 import sqlalchemy as sa
-#from sentence_transformers import SentenceTransformer, util

 from co3.accessor import Accessor

@@ -55,10 +54,11 @@ class VSSAccessor(Accessor):
            normalize_embeddings = True
        )

-    def search(
+    def select(
        self,
-        query      : str,
+        connection,
        index_name : str,
+        query      : str,
        limit      : int = 10,
        score_threshold = 0.5,
    ):
@@ -111,6 +111,9 @@ class SQLTable(Relation[SQLTableLike]):
            self.obj.join(_with.obj, on, isouter=outer)
        )

+class FTSTable(Relation[SQLTableLike]):
+    pass
+
 # key-value stores
 class Dictionary(Relation[dict]):
     def get_attributes(self):
@@ -1,12 +1,11 @@
-from collections import defaultdict
-import logging
 import time
+import logging
+from collections import defaultdict

 from tqdm import tqdm
 import sqlalchemy as sa

 from co3 import util

 from co3.manager import Manager
 from co3.accessors.sql import SQLAccessor

@@ -14,154 +13,24 @@ from co3.accessors.sql import SQLAccessor
 logger = logging.getLogger(__name__)

 class FTSManager(Manager):
-    def __init__(self, database, engine=None):
-        super().__init__(database, engine)
+    def __init__(self):
+        self.sql_accessor = SQLAccessor()

-        self.accessor = SQLAccessor(engine)
-        self.composer = CoreComposer()
-
-    def recreate_simple(self):
-        select_cols = [
-            tables.files.c.rpath, tables.files.c.ftype,
-
-            tables.notes.c.title, tables.notes.c.link,
-
-            tables.note_conversion_matter.c.type,
-
-            tables.blocks.c.name, tables.blocks.c.header,
-            tables.blocks.c.elem_pos,
-
-            tables.block_conversion_matter.c.content,
-
-            tables.links.c.target,
-            #tables.links.c.target.label('link_target'),
-        ]
-
+    def recreate_from_table(self, table: str, cols):
        inserts, res_cols = self.accessor.select(
-            'leftward_link_conversions',
-            cols=select_cols,
+            table,
+            cols=cols,
            include_cols=True,
        )

        util.db.populate_fts5(
            self.engine,
            ['search'],
-            #columns=select_cols,
            columns=res_cols,
            inserts=inserts,
        )

-    def recreate_pivot(self):
-        block_cols = [
-            tables.files.c.rpath, tables.files.c.ftype,
-
-            tables.notes.c.title, tables.notes.c.link,
-
-            tables.note_conversion_matter.c.type,
-
-            tables.blocks.c.name, tables.blocks.c.header,
-            tables.blocks.c.elem_pos,
-
-            #tables.block_conversion_matter.c.content,
-
-            #tables.links.c.target,
-
-            #tables.links.c.target.label('link_target'),
-        ]
-
-        lcp = self.accessor.link_content_pivot(cols=block_cols)
-
-        start = time.time()
-        inserts, cols = self.accessor.select(
-            lcp,
-            #cols=select_cols
-            include_cols=True,
-        )
-        logger.info(f'FTS recreate: pre-index SELECT took {time.time()-start:.2f}s')
-
-        return inserts
-
-        #util.db.populate_fts5(
-        #    self.engine,
-        #    ['search'],
-        #    #columns=select_cols,
-        #    columns=res_cols,
-        #    inserts=inserts,
-        #)
-
     def recreate(self):
-        select_cols = [
-            tables.links.c.id,
-
-            tables.files.c.rpath, tables.files.c.ftype,
-
-            tables.files.c.title, tables.files.c.link,
-
-            tables.notes.c.type,
-
-            tables.elements.c.name, tables.blocks.c.header,
-            tables.elements.c.elem_pos,
-
-            tables.element_conversions.c.format,
-            tables.element_conversions.c.content,
-
-            tables.links.c.target,
-            tables.links.c.type.label('link_type'),
-            #tables.links.c.target.label('link_target'),
-        ]
-
-        start = time.time()
-
-        fts_basis = self.composer.get_table('leftward_link_conversions')
-        inserts = self.accessor.select(
-            fts_basis,
-            cols=select_cols,
-        )
-        logger.info(f'FTS recreate: pre-index SELECT took {time.time()-start:.2f}s')
-
-        start = time.time()
-        flattened_blocks = defaultdict(lambda:defaultdict(lambda:None))
-        other_results = []
-
-        # flatten conversions across supported formats
-        # i.e. put "src" and "html" into columns within a single row
-        for row in inserts:
-            name = row['name']
-
-            # remove pure identifier rows
-            link_id = row.pop('id')
-            _format = row.pop('format')
-            content = row.pop('content')
-
-            # add new derived rows
-            row['src'] = None
-            row['html'] = None
-
-            if name is None:
-                other_results.append(row)
-                continue
-
-            if flattened_blocks[name][link_id] is None:
-                flattened_blocks[name][link_id] = row
-
-            nrow = flattened_blocks[name][link_id]
-            if _format == 'src':
-                nrow['src'] = content
-            elif _format == 'html5':
-                nrow['html'] = content
-
-        inserts = other_results
-        for name, link_dict in flattened_blocks.items():
-            for link_id, row in link_dict.items():
-                inserts.append(row)
-
-        # should have at least one result to index
-        assert len(inserts) > 0
-        cols = inserts[0].keys()
-
-        # all keys should match
-        assert all([cols == d.keys() for d in inserts])
-
        logger.info(f'FTS recreate: insert post-processing took {time.time()-start:.2f}s')

        util.db.populate_fts5(
@@ -177,40 +46,3 @@ class FTSManager(Manager):

     def migrate(self): pass

-
-
-    def recreate_old(self):
-        full_table = schema.leftward_block_conversions()
-        primitive_tables = schema.primitive_tables()
-
-        virtual_inserts = []
-        for ptype, table in primitive_tables.items():
-            table_rows = db.util.sa_exec_dicts(self.engine, sa.select(table))
-            for row in table_rows:
-                pdict = util.db.prepare_insert(full_table, row)
-                pdict['ptype'] = ptype
-
-                virtual_inserts.append(pdict)
-
-        select_cols = [
-            tables.blocks.c.name, tables.blocks.c.content,
-            tables.blocks.c.html, tables.blocks.c.header,
-            tables.blocks.c.block_pos, tables.blocks.c.note_name,
-
-            tables.links.c.to_file,
-
-            tables.notes.c.link,
-
-            tables.note_conversion_matter.c.type,
-            tables.note_conversion_matter.c.summary,
-
-            'ptype',
-        ]
-
-        util.db.populate_fts5(
-            self.engine,
-            'search',
-            columns=select_cols,
-            inserts=virtual_inserts,
-        )
-
@@ -1,5 +1,6 @@
 '''
-Note: Common on insert behavior
+.. admonition:: Common on insert behavior
+
 - Tables with unique constraints have been equipped with ``sqlite_on_conflict_unique``
   flags, enabling conflicting bulk inserts to replace conflicting rows gracefully. No
   need to worry about explicitly handling upserts.
@@ -9,7 +10,8 @@ Note: Common on insert behavior
   bulk calls AND allow update/replacements on conflict; the setting above allows this
   bulk usage to continue as is.

-Note: Options for insert/update model
+.. admonition:: Options for insert/update model
+
 1. Collect all possible update objects, and use a SQLite bulk insert that only updates
    if needed (based on modified time, for example). This sounds on the surface like
    the right approach since we defer as much to bulk SQL logic, but it's far and away
|
|||||||
from co3.manager import Manager
|
from co3.manager import Manager
|
||||||
from co3.components import Relation, SQLTable
|
from co3.components import Relation, SQLTable
|
||||||
|
|
||||||
#from localsys.reloader.router._base import ChainRouter, Event
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -76,24 +76,21 @@ class SQLManager(RelationalManager[SQLTable]):
|
|||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
'''
|
'''
|
||||||
The insert lock is a _reentrant lock_, meaning the same thread can acquire the
|
The insert lock is a _reentrant lock_, meaning the same thread can acquire the
|
||||||
lock again with out deadlocking (simplifying across methods of this class that
|
lock again without deadlocking (simplifying across methods of this class that
|
||||||
need it).
|
need it).
|
||||||
'''
|
'''
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
self.routers = []
|
|
||||||
|
|
||||||
self._router = None
|
|
||||||
self._insert_lock = threading.RLock()
|
self._insert_lock = threading.RLock()
|
||||||
|
|
||||||
@property
|
def update(self):
|
||||||
def router(self):
|
pass
|
||||||
if self._router is None:
|
|
||||||
self._router = ChainRouter(self.routers)
|
|
||||||
return self._router
|
|
||||||
|
|
||||||
def add_router(self, router):
|
def migrate(self):
|
||||||
self.routers.append(router)
|
pass
|
||||||
|
|
||||||
|
def sync(self):
|
||||||
|
pass
|
||||||
|
|
||||||
def recreate(
|
def recreate(
|
||||||
self,
|
self,
|
||||||
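A minimal illustration of why the reentrant lock matters here: one locked public method can call another locked helper on the same thread without deadlocking (the ``TinyManager`` class is invented for the example):

.. code-block:: python

    import threading

    class TinyManager:
        def __init__(self):
            self._insert_lock = threading.RLock()

        def insert(self, rows):
            with self._insert_lock:           # acquired once here...
                return self._insert_many(rows)

        def _insert_many(self, rows):
            with self._insert_lock:           # ...and again on the same thread; a plain
                return len(rows)              # threading.Lock would deadlock at this point

    TinyManager().insert([{'a': 1}])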
@@ -112,8 +109,6 @@ class SQLManager(RelationalManager[SQLTable]):
        metadata.drop_all(engine.manager)
        metadata.create_all(engine.manager, checkfirst=True)

-    def update(self): pass
-
    def insert(
        self,
        connection,
@@ -167,246 +162,3 @@ class SQLManager(RelationalManager[SQLTable]):

        return res_list

-    def _file_sync_bools(self):
-        synced_bools = []
-        fpaths = utils.paths.iter_nested_paths(self.collector.basepath, no_dir=True)
-        db_rpath_index = self.database.files.group_by('rpath')()
-
-        for fpath in fpaths:
-            file = File(fpath, self.collector.basepath)
-            db_file = db_rpath_index.get(file.rpath)
-
-            file_in_sync = db_file and float(db_file.get('mtime','0')) >= file.mtime
-            synced_bools.append(file_in_sync)
-
-        return fpaths, synced_bools
-
-    def sync(
-        self,
-        limit=0,
-        chunk_time=0,
-        dry=False,
-        chunk_size_cap=1000,
-        print_report=True,
-    ):
-        '''
-        Note:
-            Could be dangerous if going from fast file processing to note processing.
-            Imagine estimating 1000 iter/sec, then transferring that to the next batch
-            when it's more like 0.2 iter/sec. We would lose any chunking. (Luckily, in
-            practice, turns out that notes are almost always processed before the huge set
-            of nested files lurking and we don't experience this issue.)
-        '''
-        #filepaths, update_bools = self.collector.file_inserts(dry_run=True)
-        #fpaths_to_update = [f for (f, b) in zip(filepaths, update_bools) if not b]
-        filepaths, update_bools = self._file_sync_bools()
-        fpaths_to_update = [f for (f, b) in zip(filepaths, update_bools) if not b]
-
-        def calc_chunk_size(sec_per_file):
-            '''
-            Chunk cap only applied here; if manually set elsewhere in the method, it's
-            intentional e.g., if chunk_time <= 0 and we just want one big iteration.
-            '''
-            return min(max(int(chunk_time / sec_per_file), 1), chunk_size_cap)
-
-        # nothing to do
-        if not fpaths_to_update:
-            logger.info('Sync has nothing to do, exiting')
-            return None
-
-        ood_count = len(fpaths_to_update)
-        total_fps = len(filepaths)
-        ood_prcnt = ood_count/total_fps*100
-        logger.info(
-            f'Pre-sync scan yielded {ood_count}/{total_fps} ({ood_prcnt:.2f}%) out-of-date files'
-        )
-
-        if limit <= 0: limit = ood_count
-
-        if chunk_time <= 0:
-            chunk_size = ood_count
-        else:
-            chunk_size = calc_chunk_size(5)
-
-        dry_inserts = None
-        remaining = limit
-
-        chunk_time_used = 0
-        pbar = tqdm(
-            desc=f'Adaptive chunked sync for [limit {limit}]',
-            total=remaining,
-        )
-
-        report = []
-        def collect_report():
-            report.append({
-                'chunk_size': chunk_limit,
-                'chunk_time_used': chunk_time_used,
-            })
-
-        with pbar as _:
-            while remaining > 0:
-                time_pcnt = chunk_time_used / max(chunk_time,1) * 100
-                pbar.set_description(
-                    f'Adaptive chunked sync [size {chunk_size} (max {chunk_size_cap})] '
-                    f'[prev chunk {chunk_time_used:.2f}s/{chunk_time}s ({time_pcnt:.2f}%)]'
-                )
-
-                chunk_limit = min(remaining, chunk_size)
-                start_idx = limit - remaining
-                chunk_fpaths = fpaths_to_update[start_idx:start_idx+chunk_limit]
-
-                chunk_time_start = time.time()
-                #inserts = self.collector.collect_inserts(
-                #    chunk_fpaths,
-                #    skip_updated=False, # saves on DB check given provided list
-                #)
-
-                # 1) flush synthetic events for the batch through the chained router
-                # 2) block until completed and sweep up the collected inserts
-                futures = self.router.submit([
-                    Event(
-                        endpoint=str(self.collector.basepath),
-                        name=str(path.relative_to(self.collector.basepath)),
-                        action=[0xffffffff], # synthetic action to match any flag filters
-                    ) for path in chunk_fpaths
-                ])
-
-                for future in tqdm(
-                    as_completed(futures),
-                    total=chunk_size,
-                    desc=f'Awaiting chunk futures [submitted {len(futures)}/{chunk_size}]'
-                ):
-                    #print('Finished future', cfuture)
-                    try:
-                        result = future.result()
-                    except Exception as e:
-                        logger.warning(f"Sync job failed with exception {e}")
-
-                #wait(futures)#, timeout=10)
-                inserts = self.collector.collect_inserts()
-
-                chunk_time_used = time.time() - chunk_time_start
-
-                collect_report()
-                #if not inserts: break
-
-                if dry:
-                    if dry_inserts is None:
-                        dry_inserts = inserts
-                    else:
-                        for ik, iv in inserts.items():
-                            dry_inserts[ik].extend(iv)
-                else:
-                    self.insert(inserts)
-
-                remaining -= chunk_limit
-
-                # re-calculate chunk size
-                sec_per_file = chunk_time_used / chunk_size
-                chunk_size = calc_chunk_size(sec_per_file)
-
-                #pbar.update(n=limit-remaining)
-                pbar.update(n=chunk_limit)
-
-        if print_report:
-            num_chunks = len(report)
-            total_ins = sum(map(lambda c:c['chunk_size'], report))
-            avg_chunk = total_ins / num_chunks
-            total_time = sum(map(lambda c:c['chunk_time_used'], report))
-            avg_time = total_time / num_chunks
-            time_match = avg_time / max(chunk_time*100, 1)
-
-            print( 'Sync report: ')
-            print(f'  Total chunks         : {num_chunks} ')
-            print(f'  Total file inserts   : {total_ins} / {limit} ')
-            print(f'  Average chunk size   : {avg_chunk:.2f} ')
-            print(f'  Total time spent     : {total_time:.2f}s ')
-            print(f'  Average time / chunk : {avg_time:.2f}s / {chunk_time}s')
-            print(f'  Percent time match   : {time_match:.2f}% ')
-
-        if dry: return dry_inserts
-
-    def migrate(self):
-        # create MD object representation current DB state
-        pre_metadata = sa.MetaData()
-        pre_metadata.reflect(bind=self.engine)
-
-        table_rows = {}
-        for table_name, table in pre_metadata.tables.items():
-            # pre-migration rows
-            select_results = utils.db.sa_execute(self.engine, sa.select(table))
-            named_results = utils.db.named_results(table, select_results)
-
-            table_rows[table_name] = named_results
-
-
-        #table_update_map = {
-        #    'files': (
-        #        lambda r: File(Path(NOTEPATH, r['name']), NOTEPATH).get_table_data()
-        #    ),
-        #    'notes': (
-        #        lambda r: Note(Path(NOTEPATH, r['name']), NOTEPATH).get_table_data()
-        #    ),
-        #    'note_conversion_matter': (
-        #        lambda r: Note(Path(NOTEPATH, r['name']), NOTEPATH).get_table_data()
-        #    ),
-        #}
-
-        # update schema
-        self.recreate()
-
-        with self.engine.connect() as connection:
-            for table_name, table in tables.metadata.tables.items():
-                if table_name not in table_rows: continue
-
-                logger.info(f'Migrating table "{table_name}"')
-
-                if table_name == 'files':
-                    update_note_rows = []
-
-                    for i,note_row in enumerate(table_rows['files']):
-                        file = File(Path(NOTEPATH, note_row['name']), NOTEPATH)
-                        u_note_row = utils.db.prepare_insert(
-                            tables.files,
-                            file.get_table_data()
-                        )
-                        note_row.update({k:v for k,v in u_note_row.items() if v})
-                        update_note_rows.append(note_row)
-
-                    table_rows['files'] = update_note_rows
-                elif table_name == 'notes':
-                    update_note_rows = []
-
-                    for note_row in table_rows['notes']:
-                        note = Note(Path(NOTEPATH, note_row['name']), NOTEPATH)
-                        u_note_row = utils.db.prepare_insert(
-                            tables.notes,
-                            note.get_table_data()
-                        )
-                        note_row.update({k:v for k,v in u_note_row.items() if v})
-                        update_note_rows.append(note_row)
-
-                    table_rows['notes'] = update_note_rows
-
-                # generic update blueprint for later; now doesn't matter as conversion based
-                # inserts require re-converting...so you may as well just do a full re-build.
-                #update_rows = []
-                #if table_name in table_update_map:
-                #    for i,row in enumerate(table_rows['files']):
-                #        update_obj = table_update_map[table_name](row)
-
-                #        u_row = utils.db.prepare_insert(
-                #            table,
-                #            note.get_table_data()
-                #        )
-                #        note_row.update({k:v for k,v in u_note_row.items() if v})
-                #        update_note_rows.append(note_row)
-
-                connection.execute(
-                    sa.insert(table),
-                    table_rows[table_name]
-                )
-
-                connection.commit()
-
@@ -1,142 +1,16 @@
-from collections import defaultdict
 import logging
-import pickle
-import time
-from pathlib import Path
-
-import numpy as np
-from tqdm import tqdm
-import sqlalchemy as sa

 from co3.manager import Manager
-from co3 import util


 logger = logging.getLogger(__name__)

 class VSSManager(Manager):
-    def __init__(self, database, engine=None):
-        super().__init__(database, engine)
-
-        from localsys.db.databases.core import CoreDatabase
-
-        self.core_db = CoreDatabase()
-
-    def recreate(self):
-        start = time.time()
-
-        chunks = []
-        chunk_ids = []
-        block_map = {}
-        for note_name, note_groupby in self.core_db.blocks().items():
-            blocks = note_groupby.get('aggregates', [])
-            for block in blocks:
-                if block.get('format') != 'src': continue
-
-                block_name = block.get('name', '')
-                block_content = block.get('content', '')
-                block_map[block_name] = block_content
-
-                block_chunks, chunk_ranges = utils.embed.split_block(block_content)
-
-                chunks.extend(block_chunks)
-                chunk_ids.extend([f'{block_name}@{r[0]}:{r[1]}' for r in chunk_ranges])
-
-        logger.info(f'VSS pre-index SELECT took {time.time()-start:.2f}s')
-        start = time.time()
-
-        # one-time batched embedding
-        chunk_embeddings = self.database.access.embed_chunks(chunks)
-
-        logger.info(f'VSS embeddings took {time.time()-start:.2f}s')
-        start = time.time()
-
-        # aggregate embeddings up the content hierarchy
-        zero_embed = lambda: np.atleast_2d(
-            np.zeros(self.database.access._embedding_size, dtype='float32')
-        )
-        block_embedding_map = defaultdict(zero_embed)
-        note_embedding_map = defaultdict(zero_embed)
-
-        for i, chunk_id in enumerate(chunk_ids):
-            at_split = chunk_id.split('@')
-            note_name = at_split[0]
-            block_name = '@'.join(at_split[:3])
-
-            block_embedding_map[block_name] += chunk_embeddings[i]
-            note_embedding_map[note_name] += chunk_embeddings[i]
-
-        # re-normalize aggregated embeddings
-        # > little inefficient but gobbled up asymptotically by chunk processing
-        # > could o/w first convert each map to a full 2D np array, then back
-        for embed_map in [block_embedding_map, note_embedding_map]:
-            for k, agg_embedding in embed_map.items():
-                embed_map[k] = utils.embed.unit_row_wise(agg_embedding)
-
-        logger.info(f'VSS recreate: pre-index SELECT took {time.time()-start:.2f}s')
-
-        # cols = ['rowid', 'chunk_embedding']
-        # def pack_inserts(embeddings):
-        #     return [
-        #         { cols[0]: i, cols[1]: embedding.tobytes() }
-        #         for i, embedding in enumerate(embeddings)
-        #     ]
-
-        # chunk_inserts = pack_inserts(chunk_embeddings)
-        # block_inserts = pack_inserts(block_embedding_map.values())
-        # note_inserts = pack_inserts(note_embedding_map.values())
-
-        chunk_ids = chunk_ids
-        block_ids = list(block_embedding_map.keys())
-        note_ids = list(note_embedding_map.keys())
-
-        block_embeddings = np.vstack(list(block_embedding_map.values()))
-        note_embeddings = np.vstack(list(note_embedding_map.values()))
-
-        blocks = [block_map[b] for b in block_ids]
-        notes = note_ids
-
-        self.database.access.write_embeddings({
-            'chunks': (chunk_ids, chunk_embeddings, chunks),
-            'blocks': (block_ids, block_embeddings, blocks),
-            'notes' : (note_ids, note_embeddings, notes),
-        })
-
-        logger.info(f'Post-embedding parsing took {time.time()-start:.2f}s')
-
-        # utils.db.create_vss0(
-        #     self.engine,
-        #     'chunks',
-        #     columns=list(cols),
-        #     inserts=chunk_inserts,
-        #     populate=True,
-        #     reset=True,
-        #     embedding_size=self._embedding_size,
-        # )
-
-    def update(self): pass
+    def recreate(self): pass
+
+    def insert(self): pass

     def sync(self): pass

     def migrate(self): pass
-
-    def _archive_note_embed(self):
-        for block in blocks:
-            content = block.get('content')
-            if not content: continue
-
-            block_chunks = utils.embed.split_block(content)
-            chunks.extend(block_chunks)
-
-        embeddings = utils.embed.embed_chunks(chunks)
-        block_embeddings = utils.embed.map_groups(embeddings, embed_block_map)
-
-        self.convert_map['vect'] = block_embeddings
-
-        note_embedding = utils.embed.unit_row_wise(
-            embeddings.sum(axis=0).reshape(1, -1)
-        )
-
-        return {
-            'content': note_embedding.tobytes(),
-        }
@@ -59,6 +59,7 @@ class Mapper[C: Component]:
    inserts (hence why we tie Mappers to Schemas one-to-one).

    .. admonition:: Dev note
+
        the Composer needs reconsideration, or at least its positioning directly in this
        class. It may be more appropriate to have at the Schema level, or even just
        dissolved altogether if arbitrary named Components can be attached to schemas.