initial commit

This commit is contained in:
Sam G. 2024-03-28 23:11:30 -07:00
commit 9218f4a404
199 changed files with 10051 additions and 0 deletions

1
.python-version Normal file

@ -0,0 +1 @@
co4

0
MANIFEST.in Normal file

21
README.md Normal file

@ -0,0 +1,21 @@
# Overview

`co3` is a package for file conversion and associated database operations. The `CO3` base class
provides a standard interface for performing conversions, preparing inserts, and
interacting with database schemas that mirror the class hierarchy.

Simplified description of the operational model:

**Goal**: interact with a storage medium (database, pickled structure, VSS framework) with
a known schema.

- **Accessor** to provide access to stored items
- **Composer** to compose common access points (e.g., JOINed tables)
- **Indexer** to index/cache access queries
- **Manager** to manage storage state (e.g., supported inserts, database syncs)
- **Collector** to collect data for updating storage state
- **Database** to wrap a storage medium's access and management under one interface
- **Mapper** to map CO3 object hierarchies onto their schema components
- **Relation** to model the storage structures themselves (e.g., tables)

**CO3** is an abstract base class that makes it easy to integrate this model with object
hierarchies that mirror a storage schema.
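
A minimal sketch of that last point (class and attribute names below are illustrative, not
part of the package; the `collate` decorator lives in `co3.co3`):

```py
from co3 import CO3
from co3.co3 import collate

class Note(CO3):                      # hypothetical CO3 subtype mirroring a `notes` table
    def __init__(self, name, content):
        self.name = name
        self.content = content

    @collate('html5')                 # register a conversion action under the key 'html5'
    def to_html(self):
        return f'<p>{self.content}</p>'

note = Note('note-a', 'hello')
note.collate('html5')                 # dispatches to the registered 'html5' action
note.attributes                       # insert-ready attribute dict for the type's table
```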

48 new one-line files (file names are not shown in this view), each containing a single absolute path:

/home/smgr/Documents/projects/ontolog/co3/co3/__init__.py
/home/smgr/Documents/projects/ontolog/co3/co3/accessor.py
/home/smgr/Documents/projects/ontolog/co3/co3/accessors/__init__.py
/home/smgr/Documents/projects/ontolog/co3/co3/accessors/fts.py
/home/smgr/Documents/projects/ontolog/co3/co3/accessors/sql.py
/home/smgr/Documents/projects/ontolog/co3/co3/accessors/vss.py
/home/smgr/Documents/projects/ontolog/co3/co3/co3.py
/home/smgr/Documents/projects/ontolog/co3/co3/collector.py
/home/smgr/Documents/projects/ontolog/co3/co3/composer.py
/home/smgr/Documents/projects/ontolog/co3/co3/database.py
/home/smgr/Documents/projects/ontolog/co3/co3/databases/__init__.py
/home/smgr/Documents/projects/ontolog/co3/co3/databases/fts.py
/home/smgr/Documents/projects/ontolog/co3/co3/databases/sql.py
/home/smgr/Documents/projects/ontolog/co3/co3/databases/vss.py
/home/smgr/Documents/projects/ontolog/co3/co3/indexer.py
/home/smgr/Documents/projects/ontolog/co3/co3/manager.py
/home/smgr/Documents/projects/ontolog/co3/co3/managers/__init__.py
/home/smgr/Documents/projects/ontolog/co3/co3/managers/fts.py
/home/smgr/Documents/projects/ontolog/co3/co3/managers/sql.py
/home/smgr/Documents/projects/ontolog/co3/co3/managers/vss.py
/home/smgr/Documents/projects/ontolog/co3/co3/mapper.py
/home/smgr/Documents/projects/ontolog/co3/co3/relation.py
/home/smgr/Documents/projects/ontolog/co3/co3/relations/__init__.py
/home/smgr/Documents/projects/ontolog/co3/co3/util/__init__.py
/home/smgr/Documents/projects/ontolog/co3/co3/util/db.py
/home/smgr/Documents/projects/ontolog/co3/co3/util/regex.py
/home/smgr/Documents/projects/ontolog/co4/co4/__init__.py
/home/smgr/Documents/projects/ontolog/co4/co4/accessor.py
/home/smgr/Documents/projects/ontolog/co4/co4/accessors/__init__.py
/home/smgr/Documents/projects/ontolog/co4/co4/accessors/fts.py
/home/smgr/Documents/projects/ontolog/co4/co4/accessors/table.py
/home/smgr/Documents/projects/ontolog/co4/co4/accessors/vss.py
/home/smgr/Documents/projects/ontolog/co4/co4/co4.py
/home/smgr/Documents/projects/ontolog/co4/co4/collector.py
/home/smgr/Documents/projects/ontolog/co4/co4/composer.py
/home/smgr/Documents/projects/ontolog/co4/co4/databases/__init__.py
/home/smgr/Documents/projects/ontolog/co4/co4/databases/_base.py
/home/smgr/Documents/projects/ontolog/co4/co4/databases/core.py
/home/smgr/Documents/projects/ontolog/co4/co4/databases/fts.py
/home/smgr/Documents/projects/ontolog/co4/co4/databases/vss.py
/home/smgr/Documents/projects/ontolog/co4/co4/indexer.py
/home/smgr/Documents/projects/ontolog/co4/co4/manager.py
/home/smgr/Documents/projects/ontolog/co4/co4/managers/__init__.py
/home/smgr/Documents/projects/ontolog/co4/co4/managers/core.py
/home/smgr/Documents/projects/ontolog/co4/co4/managers/fts.py
/home/smgr/Documents/projects/ontolog/co4/co4/managers/vss.py
/home/smgr/Documents/projects/ontolog/co4/co4/utils/db.py
/home/smgr/Documents/projects/ontolog/co4/co4/utils/paths.py

109
co3/__init__.py Normal file

@ -0,0 +1,109 @@
'''
Database submodule
- `db`: contains SQLAlchemy-based schema definitions
- `accessors`: convenience methods for accessing database entries
- `populate`: convenience methods for populating database tables
The `accessors` and `populate` submodules are each split into `schema` and `fts` method
groups. The former concerns methods relating to the actual database schema, the latter to
their SQLite FTS counterparts.
Note: Subpackage organization
Subpackages are broken up by inheritance. Within a given submodule, you have a
`_base.py` file defining the base class associated with that submodule's title, along
with concrete subclasses of that base in their own files. Deeper inheritance would
recursively extend this structure. The `__init__.py` for a given submodule then
exposes the concrete instances, leaving the base hidden. For example,
    accessors/
        _base.py
        core.py
        fts.py
`core` and `fts` house the `CoreAccessor` and `FTSAccessor` classes, respectively,
and are the direct subclasses of the `Accessor` parent found in the `_base`. This base
class _could_ be placed outside of the submodule in the parent directory (imported
with something like `from db import accessor` instead of `from db.accessors import
_base`). This is entirely valid, but I tend to prefer when the base class is among its
direct children, as
- In this case at least, the base doesn't need to be exposed
- The base class is being stowed away under an appropriately named submodule; having a
separate `accessor.py` and `accessors/` file/directory can feel a little cluttered.
- It makes imports across the accessors feel standardized:
```py
from localsys.db.accessors._base import Accessor
from localsys.db.accessors.core import CoreAccessor
```
Both have the same level of nesting to reach the class.
Frankly, both means of organization are perfectly fine, and as far as I can tell,
semantically sound in their own right. This particular scheme is just a preference in
the moment, and so long as I keep things consistent, choosing one over the other
shouldn't matter.
Additionally, note how `__init__.py`s are typically set up when providing wider access
to internal modules. The `init` typically pulls out classes from sibling modules
(i.e., files), but will import subpackages at the topmost level. For example, for the
structure
```
db/
    __init__.py
    accessors/
        __init__.py
        _base.py
        core.py
        fts.py
```
we have
```db/__init__.py
from localsys.db import accessors
```
which just imports the subpackage `accessors`. However, within the subpackage:
```db/accessors/__init__.py
from localsys.db.accessors.core import CoreAccessor
```
we don't just import the submodule `core`; we dig into the file to grab the relevant
class and pull it into the outer namespace. Overarching point: `__init__.py` files
typically reach into the sibling files (submodules) and pull out classes. Given that
this behavior is recursive, `__init__.py` files then respect subpackages (nested
directories), importing them at the top level and expecting an internal `__init__.py`
to have managed access appropriately.
Note: Organization for inheritance over composition
At a glance, the organization of subpackages here feels like it clashes with those
seen in `localsys.primitives`. `note_components`, for instance, houses the components
for the outer `note` module. Contrast this with how the `core` submodule looks: it's
composing `*/core.py` files across subpackages `accessors` and `managers`, rather than
a single subpackage like `note`. This seems inconsistent, but the subpackages here are
actually still organized in the same way: by inheritance. It just happens that all
of the note components inherit from the same base class, and are thus confined to
a single subpackage. This aside, the subpackages themselves are still created around
inheritance, wrapping up a base and direct subclasses.
'''
from co3.accessor import Accessor
from co3.co3 import CO3
from co3.collector import Collector
from co3.composer import Composer
from co3.database import Database
from co3.indexer import Indexer
from co3.manager import Manager
from co3.mapper import Mapper
from co3.relation import Relation
from co3 import accessors
from co3 import databases
from co3 import managers
from co3 import relations
from co3 import util
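
As a quick illustration of the convention described in the docstring above (a sketch;
it only assumes the package is importable as `co3`):

```py
from co3 import CO3, Database          # classes pulled up from sibling modules
from co3 import accessors              # subpackages imported at the topmost level
from co3.accessors import SQLAccessor  # concrete classes exposed by the subpackage __init__
```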

17 binary files not shown

28
co3/accessor.py Normal file

@ -0,0 +1,28 @@
'''
Accessor

Provides access to an underlying schema through a supported set of operations. Class
methods could be general, high-level SQL wrappers, or convenience functions for common
schema-specific queries.
'''
import inspect
from pathlib import Path
from collections import defaultdict

import sqlalchemy as sa

#from co3.database import Database


class Accessor[D: 'Database']:
    '''
    Access wrapper class for complex queries and easy integration with Composer tables.
    Implements high-level access to things like common constrained SELECT queries.

    Parameters:
        database: Database instance to wrap; provides the resource handles (e.g., the
                  engine) used when executing queries
    '''
    def __init__(self, database: D):
        self.database = database

24
co3/accessors/__init__.py Normal file

@ -0,0 +1,24 @@
'''
Note that subclasses in this subpackage are split differently from other subpackages in the
DB. Instead of being split by table group, corresponding to a Composer (which defines that
table group), Accessors are split by a separate dimension: table "type". This is why we
have a "TableAccessor" and an "FTSAccessor": the former exposes access operations
available to generic tables, the latter to FTS tables (instead of being designed
specifically around "core" and "fts" groups, for instance).
Seeing as FTS tables are "generic" tables, it seems inconsistent not to have FTSAccessor
inherit from TableAccessor. While this would work fine, the model we're working with
doesn't really need it; you can instead think of the FTSAccessor as defining _only_
FTS-specific operations. Given that you have a Composer for your desired table group, you
can then wrap it with your desired set of "access actions," available in separate Accessor
subclasses.
For instance, you could wrap an FTSComposer in either a TableAccessor or FTSAccessor. The
former will treat the tables in the composer like regular tables, exposing methods like
`.select` and `.select_one`, whereas the latter defines FTS-specific actions like
`.search`.
'''
from co3.accessors.sql import SQLAccessor
from co3.accessors.fts import FTSAccessor
from co3.accessors.vss import VSSAccessor
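
A sketch of the idea above: the same database can be wrapped by different accessor
"types" depending on the desired access actions (here `database` stands in for any
existing co3 `Database` instance):

```py
from co3.accessors import SQLAccessor, FTSAccessor

sql_access = SQLAccessor(database)   # generic table operations: select, select_one, ...
fts_access = FTSAccessor(database)   # FTS-specific operations: search
```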

9 binary files not shown

147
co3/accessors/fts.py Normal file

@ -0,0 +1,147 @@
import sqlalchemy as sa
from co3 import util
from co3.accessor import Accessor
class FTSAccessor(Accessor):
def search(
self,
table_name : str,
select_cols : str | list | None = '*',
search_cols : str | None = None,
q : str | None = None,
colq : str | None = None,
snip_col : int | None = 0,
hl_col : int | None = 0,
limit : int | None = 100,
snip : int | None = 64,
tokenizer : str | None = 'unicode61',
group_by : str | None = None,
agg_cols : list | None = None,
wherein_dict: dict | None = None,
unique_on : dict | None = None,
):
'''
Execute a search query against an indexed FTS table for specific primitives. This
method is mostly a generic FTS handler, capable of handling queries to any available
FTS table with a matching naming scheme (`<table_name>_fts_<tokenizer>`). The current
intention is to support all tokenizers, for file, note, block, and link primitives.
Search results include all FTS table columns, as well as SQLite-supported `snippet`s
and `highlight`s for matches. Matches are filtered and ordered by SQLite's
`MATCH`-based score for the text & column queries. Results are (a list of) fully
expanded dictionaries housing column-value pairs.
Note:
GROUP BY cannot be paired with SQLITE FTS extensions; thus, we perform manual
group checks on the result set in Python before response
Analysis:
The returned JSON structure has been (loosely) optimized for speed on the client
side. Fully forming individual dictionary based responses saves time in
Javascript, as the JSON parser is expected to be able to create the objects
faster than post-hoc construction in JS. This return structure was compared
against returning an array of arrays (all ordered in the same fashion), along with
a column list to be associated with each of the result values. While this saves
some size on the payload (the same column names don't have to be transmitted for
each result), the size of the returned content massively outweighs the
predominantly short column names. The only way this structure would be viable is
if a significant amount was saved on transfer compared to the slow down in JS
object construction; this is (almost) never the case.
Parameters:
table_name : name of FTS table to search
search_cols : space separated string of columns to use for primary queries
q : search query
colq : column constraint string; must conform to SQLite standards (e.g.,
`<col>:<text>`)
snip_col : table column to use for snippets (default: 1; source content column)
hl_col : table column to use for highlights (default: 2; format column, applied
to HTML targets)
limit : maximum number of results to return in the SQL query
snip : snippet length (max: 64)
tokenizer : tokenizer to use (assumes relevant FTS table has been built)
...
wherein_dict: (col-name, value-list) pairs to match result set against, via
WHERE ... IN clauses
Returns:
Dictionary with search results (list of column indexed dictionaries) and relevant
metadata.
'''
search_q = ''
if type(select_cols) is list:
select_cols = ', '.join(select_cols)
# construct main search query
if search_cols and q:
search_q = f'{{{search_cols}}} : {q}'
# add auxiliary search constraints
if colq:
search_q += f' {colq}'
search_q = search_q.strip()
hl_start = '<b><mark>'
hl_end = '</mark></b>'
fts_table_name = f'{table_name}_fts_{tokenizer}'
sql = f'''
SELECT
{select_cols},
snippet({fts_table_name}, {snip_col}, '{hl_start}', '{hl_end}', '...', {snip}) AS snippet,
highlight({fts_table_name}, {hl_col}, '{hl_start}', '{hl_end}') AS highlight
FROM {fts_table_name}
'''
where_clauses = []
if search_q:
where_clauses.append(f"{fts_table_name} MATCH '{search_q}'\n")
if wherein_dict:
for col, vals in wherein_dict.items():
where_clauses.append(f'{col} IN {tuple(vals)}\n')
if where_clauses:
where_str = " AND ".join(where_clauses)
sql += f'WHERE {where_str}'
sql += f'ORDER BY rank LIMIT {limit};'
row_dicts, cols = self.raw_select(sql, include_cols=True)
if group_by is None:
return row_dicts, cols
if agg_cols is None:
agg_cols = []
# "group by" block ID and wrangle the links into a list
# note we can't perform native GROUP BYs with FTS results
group_by_idx = {}
for row in row_dicts:
group_by_attr = row.get(group_by)
# add new entries
for agg_col in agg_cols:
row[f'{agg_col}_agg'] = set()
if group_by_attr is None:
continue
if group_by_attr not in group_by_idx:
group_by_idx[group_by_attr] = row
for agg_col in agg_cols:
if agg_col in row:
group_by_idx[group_by_attr][f'{agg_col}_agg'].add(row[agg_col])
return {
'results' : group_by_idx,
'columns' : cols,
'num_results' : len(row_dicts),
}
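
A hedged usage sketch for `FTSAccessor.search`; it assumes a `Database` instance
`database` whose SQLite schema already contains an FTS5 table named
`notes_fts_unicode61` (the `<table_name>_fts_<tokenizer>` scheme above) with `name`,
`content`, and `format` columns:

```py
from co3.accessors.fts import FTSAccessor

fts = FTSAccessor(database)
results = fts.search(
    table_name  = 'notes',
    search_cols = 'name content',   # columns targeted by the MATCH query
    q           = 'graph theory',
    group_by    = 'name',           # manual post-query grouping (no SQL GROUP BY for FTS)
    agg_cols    = ['format'],
    limit       = 25,
)
# with `group_by` set, `results` is a dict holding 'results', 'columns', 'num_results'
```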

96
co3/accessors/sql.py Normal file

@ -0,0 +1,96 @@
from pathlib import Path
from collections.abc import Iterable
import inspect
from functools import cache
import sqlalchemy as sa
from co3 import util
from co3.accessor import Accessor
from co3.relation import Relation
#from co3.databases.sql import RelationalDatabase, TabularDatabase, SQLDatabase
from co3.relations import TabularRelation, SQLTable
class RelationalAccessor[D: 'RelationalDatabase', R: Relation](Accessor[D]):
pass
class TabularAccessor[D: 'TabularDatabase', R: TabularRelation](RelationalAccessor[D, R]):
pass
class SQLAccessor(TabularAccessor['SQLDatabase', SQLTable]):
def raw_select(
self,
sql,
bind_params=None,
mappings=False,
include_cols=False,
):
res_method = util.db.sa_exec_dicts
if mappings:
res_method = util.db.sa_exec_mappings
return res_method(self.database.engine, sa.text(sql), bind_params=bind_params, include_cols=include_cols)
def select(
self,
table: sa.Table | sa.Subquery | sa.Join,
cols = None,
where = None,
distinct_on = None,
order_by = None,
limit = 0,
mappings = False,
include_cols = False,
):
'''
Perform a SELECT query against the provided table-like object (see
`check_table()`).
Deprecated: String aliases
String aliases for tables are no longer supported. This method no longer checks
against any specific schema table-maps or Composers. Instead, this should be
done outside the Accessor.
Parameters:
distinct_on: column(s) to collapse results on; for now serves as a proxy for DISTINCT
(implemented via GROUP BY; no aggregation methods accepted)
order_by: column to order results by (can use <col>.desc() to order
by descending)
'''
if where is None:
where = sa.true()
res_method = util.db.sa_exec_dicts
if mappings:
res_method = util.db.sa_exec_mappings
stmt = sa.select(table).where(where)
if cols is not None:
stmt = sa.select(*cols).select_from(table).where(where)
if distinct_on is not None:
stmt = stmt.group_by(distinct_on)
if order_by is not None:
stmt = stmt.order_by(order_by)
if limit > 0:
stmt = stmt.limit(limit)
return res_method(self.database.engine, stmt, include_cols=include_cols)
def select_one(self, table, cols=None, where=None, mappings=False, include_cols=False):
res = self.select(table, cols=cols, where=where, limit=1, mappings=mappings, include_cols=include_cols)
if include_cols and len(res[0]) > 0:
return res[0][0], res[1]
if len(res) > 0:
return res[0]
return None
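
A usage sketch for the accessor above; the table definition is illustrative, and
`database` stands in for an `SQLDatabase`-like instance that provides an engine:

```py
import sqlalchemy as sa
from co3.accessors.sql import SQLAccessor

metadata = sa.MetaData()
notes = sa.Table(
    'notes', metadata,
    sa.Column('id', sa.Integer, primary_key=True),
    sa.Column('name', sa.String),
    sa.Column('format', sa.String),
)

accessor = SQLAccessor(database)
rows = accessor.select(
    notes,
    cols     = [notes.c.name, notes.c.format],
    where    = notes.c.format == 'html5',
    order_by = notes.c.name,
    limit    = 10,
)
one = accessor.select_one(notes, where=notes.c.name == 'index')
```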

100
co3/accessors/vss.py Normal file

@ -0,0 +1,100 @@
import pickle
import logging
from pathlib import Path
import time
import sqlalchemy as sa
from sentence_transformers import SentenceTransformer, util
from co3.accessor import Accessor
logger = logging.getLogger(__name__)
class VSSAccessor(Accessor):
def __init__(self, cache_path):
super().__init__()
self._model = None
self._embeddings = None
self._embedding_size = 384
self.embedding_path = Path(cache_path, 'embeddings.pkl')
def write_embeddings(self, embedding_dict):
self.embedding_path.write_bytes(pickle.dumps(embedding_dict))
def read_embeddings(self):
if not self.embedding_path.exists():
logger.warning(
f'Attempting to access non-existent embeddings at {self.embedding_path}'
)
return None
return pickle.loads(self.embedding_path.read_bytes())
@property
def model(self):
if self._model is None:
# model trained with 128 token seqs
self._model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
return self._model
@property
def embeddings(self):
if self._embeddings is None:
self._embeddings = self.read_embeddings()
return self._embeddings
def embed_chunks(self, chunks, batch_size=64, show_prog=True):
return self.model.encode(
chunks,
batch_size = batch_size,
show_progress_bar = show_prog,
convert_to_numpy = True,
normalize_embeddings = True
)
def search(
self,
query : str,
index_name : str,
limit : int = 10,
score_threshold = 0.5,
):
'''
Parameters:
index_name: one of ['chunks','blocks','notes']
'''
if not query:
return None
if index_name not in self.embeddings:
logger.warning(
f'Index "{index_name}" does not exist'
)
return None
start = time.time()
query_embedding = self.embed_chunks(query, show_prog=False)
index_ids, index_embeddings, index_items = self.embeddings[index_name]
hits = util.semantic_search(
query_embedding,
index_embeddings,
top_k=limit,
score_function=util.dot_score
)[0]
hits = [hit for hit in hits if hit['score'] >= score_threshold]
for hit in hits:
idx = hit['corpus_id']
hit['group_name'] = index_ids[idx]
hit['item'] = index_items[idx]
logger.info(f'{len(hits)} hits in {time.time()-start:.2f}s')
return hits
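
A hedged usage sketch: `vss` is assumed to be a `VSSAccessor` whose cache directory
already contains an `embeddings.pkl` written via `write_embeddings`, keyed by index
name ('chunks', 'blocks', or 'notes'):

```py
# embed a few text chunks (normalized vectors, numpy output)
vectors = vss.embed_chunks(
    ['first paragraph of a note', 'second paragraph of a note'],
    show_prog=False,
)

# query an existing index; each hit carries 'score', 'group_name', and 'item'
hits = vss.search('semantic similarity in notes', index_name='chunks', limit=5)
for hit in hits:
    print(hit['score'], hit['group_name'])
```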

106
co3/co3.py Normal file

@ -0,0 +1,106 @@
'''
CO3
CO3 is an abstract base class for scaffolding object hierarchies and managing operations
with associated database schemas. It facilitates something like a "lightweight ORM" for
classes/tables/states with fixed transformations of interest. The canonical use case is
managing hierarchical document relations, format conversions, and syntactical components.
'''
import inspect
import logging
from functools import wraps, partial
#from localsys.db.schema import tables
logger = logging.getLogger(__name__)
#def register_format(_format):
# def decorator(func):
# self.collate.format_map[_format] = func
#
# @wraps(func)
# def register(*args, **kwargs):
# return func(*args, **kwargs)
#
# return register
# return decorator
def collate(action_key, action_groups=None):
    if action_groups is None:
        action_groups = [None]

    def decorator(func):
        func._action_data = (action_key, action_groups)
        return func
    return decorator
class FormatRegistryMeta(type):
def __new__(cls, name, bases, attrs):
action_registry = {}
# add registered superclass methods; iterate over bases (usually just one), then
# that base's chain down (reversed), then methods from each subclass
for base in bases:
for _class in reversed(base.mro()):
methods = inspect.getmembers(_class, predicate=inspect.isfunction)
for _, method in methods:
if hasattr(method, '_action_data'):
action_key, action_groups = method._action_data
action_registry[action_key] = (method, action_groups)
# add final registered formats for the current class, overwriting any found in
# superclass chain
for attr_name, attr_value in attrs.items():
if hasattr(attr_value, '_action_data'):
action_key, action_groups = attr_value._action_data
action_registry[action_key] = (attr_value, action_groups)
attrs['action_map'] = action_registry
return super().__new__(cls, name, bases, attrs)
class CO3(metaclass=FormatRegistryMeta):
'''
CO3: COllate, COllect, COmpose - conversion & DB insertion base
- Collate: organize and transform conversion outputs, possibly across class components
- Collect: gather core attributes, conversion data, and subcomponents for DB insertion
- Compose: construct object-associated DB table references through the class hierarchy
Note: on action groups
Group keys are simply named collections to make it easy for storage components to
be attached to action subsets. They do _not_ augment the action registration
namespace, meaning the action key should still be unique; the group key is purely
auxiliary.
Action methods can also be attached to several groups, in case there is
overlapping utility within or across schemas or storage media. In this case, it
becomes particularly critical to ensure registered `collate` methods really are
just "gathering results" from possibly heavy-duty operations, rather than
performing them when called, so as to reduce wasted computation.
'''
@property
def attributes(self):
'''
Method to define how a subtype's inserts should be handled under `collect` for
canonical attributes, i.e., inserts to the type's table.
'''
return vars(self)
@property
def components(self):
'''
Method to define how a subtype's inserts should be handled under `collect` for
constituent components that need handling.
'''
return []
def collate(self, action_key, *action_args, **action_kwargs):
if action_key not in self.action_map:
logger.debug(f'Collation for {action_key} not supported')
return None
else:
return self.action_map[action_key][0](self, *action_args, **action_kwargs)
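
A sketch of how registered actions propagate down a CO3 hierarchy via
`FormatRegistryMeta` (class names here are illustrative only):

```py
from co3.co3 import CO3, collate

class Note(CO3):
    def __init__(self, name, content):
        self.name = name
        self.content = content

    @collate('src')
    def to_src(self):
        return self.content

class HTMLNote(Note):
    @collate('html5', action_groups=['conversions'])
    def to_html(self):
        return f'<p>{self.content}</p>'

# FormatRegistryMeta aggregates actions from the full superclass chain, so
# HTMLNote.action_map contains both 'src' (inherited) and 'html5' (its own)
note = HTMLNote('note-a', 'hello')
note.collate('html5')   # -> '<p>hello</p>'
```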

109
co3/collector.py Normal file

@ -0,0 +1,109 @@
'''
Defines the Collector base class.
This module is the critical "middleware" connecting the primitive object definitions and
their representations in the database. It operates with full knowledge of how both are
defined, and abstracts away both the prep work for DB insertions as well as updates
trickling down the primitive hierarchy.
The `src` format target is re-used for both canonical tables/primitives, as well as
<prim>_conversion_matter tables in tables/conversions under the `src` format. The latter
is meant to extend those attributes that are format-specific (i.e., would change when, say,
converting to `html5`), and thus need to be broken across the format dimension.
Note:
Despite the structure of the database module, this class does not currently inherit
from a super class in localsys.db (like the accessors and managers, for instance).
This will likely ultimately be the model that's embraced, but until FTS (or other
groups) need a collector, this will remain an independent class. It is, however,
named like a concrete subclass, taking on the "Core" prefix.
'''
from pathlib import Path
from collections import defaultdict
import logging
import importlib
import subprocess
from uuid import uuid4
import sqlalchemy as sa
from co3 import util
#from localsys.db.schema import tables
logger = logging.getLogger(__name__)
class Collector:
def __init__(self):
self._inserts = defaultdict(lambda: defaultdict(list))
@property
def inserts(self):
return self._inserts_from_receipts()
def _inserts_from_receipts(self, receipts=None, pop=False):
inserts = defaultdict(list)
if receipts is None:
receipts = list(self._inserts.keys())
for receipt in receipts:
if pop: insert_dict = self._inserts.pop(receipt, {})
else: insert_dict = self._inserts[receipt]
for table, insert_list in insert_dict.items():
inserts[table].extend(insert_list)
return dict(inserts)
def _reset_session(self):
self._inserts = defaultdict(lambda: defaultdict(list))
def _generate_unique_receipt(self):
return str(uuid4())
def add_insert(self, table_name, insert_dict, receipts=None):
'''
TODO: formalize table_name mapping; at class level provide a `table_map`, or provide
the table object itself to this method
'''
if table_name not in tables.table_map:
#logger.debug(f'Inserts provided for non-existent table {table_name}')
return None
receipt = self._generate_unique_receipt()
self._inserts[receipt][table_name].append(
util.db.prepare_insert(
tables.table_map[table_name],
insert_dict
)
)
if receipts is not None:
receipts.append(receipt)
return receipt
def collect_inserts(self, receipts=None):
'''
Collect insert-ready dictionaries for the core primitive schema. This method is
effectively a light wrapper around the File and Note-based collection logic
elsewhere in the class.
The overall collection scheme embraces a session-like sequential update model to
an internal insert tracker. The sequence of insert methods is ordered according to
the schema hierarchy, and higher level inserts dictate the scope for lower level
inserts (all methods check and populate the same `inserts` dictionary). Calling
this method flushes any existing inserts, ensuring a re-scan takes place across
calls (or "sessions").
Parameters:
receipts: optional list of receipts to restrict collection to; when omitted, all
pending inserts are collected (inserts are popped from the session either way)
Returns:
Table name-indexed dictionary of insert lists (of column name-indexed dicts)
'''
return self._inserts_from_receipts(receipts, pop=True)
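
A sketch of the intended session flow (note that `add_insert` still references a
module-level `tables` map, per the TODO above, so the table names here are purely
illustrative):

```py
from co3 import Collector

collector = Collector()
receipts  = []

# queue inserts; the accumulated receipts identify these contributions
collector.add_insert('notes', {'name': 'note-a', 'format': 'src'}, receipts=receipts)
collector.add_insert('files', {'name': 'note-a.md'}, receipts=receipts)

# flush just those receipts into table-indexed insert lists, ready for a Manager
inserts = collector.collect_inserts(receipts)
```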

89
co3/composer.py Normal file

@ -0,0 +1,89 @@
'''
Composer
Base for manually defining table compositions outside those natural to the schema
hierarchy (i.e., constructable by a `CO3.compose()` call).
Example: suppose we have a simple object hierarchy A(CO3) -> B -> C. C's in-built
`compose()` method may not always be desirable when constructing composite tables and
running related queries. In this case, a custom Composer can be used to make needed
composite tables easier to reference; in the case below, we define the "BC" composite
table.
```py
class ExampleComposer(Composer):
    @register_table()
    def BC(self):
        full_B = B.compose(full=True)
        full_C = C.compose(full=True)

        return full_B.join(
            full_C,
            full_B.c.name == full_C.c.name, # TODO: is this fine? or do we need base table refs
            outer=True
        )
```
'''
from pathlib import Path
from co3.mapper import Mapper
def register_table(table_name=None):
'''
Registry decorator for defined composer classes. Decorating a class method simply
attaches a `table_name` attribute to it, setting it to either a provided value or the
name of the method itself. Methods with a `table_name` attribute are later swept up at
the class level and placed in the `table_map`.
'''
    def decorator(func):
        func.table_name = table_name if table_name is not None else func.__name__
        return func
    return decorator
class Composer[M: Mapper]:
'''
Base composer wrapper for table groupings.
The schema is centered around a connected group of tables (via foreign keys). Thus,
most operations need to be coordinated across tables. The `accessors` submodules
are mostly intended to provide a "secondary layer" over the base set of tables in the
schema, exposing common higher level table compositions (i.e., chained JOINs). See
concrete instances (e.g., CoreAccessor, FTSAccessor) for actual implementations of these
tables; the base class does not expose any tables itself.
Tables in subclasses are registered with the `register_table` decorator, automatically
indexing them under the provided name and making them available via the `table_map`.
'''
def __init__(self):
self._set_tables()
def _set_tables(self):
'''
Skip properties (so appropriate delays can be used), and set the table registry at
the class level. This only takes place during the first
instantiation of the class, and makes it possible to definitively tie methods to
composed tables during lookup with `get_table()`.
'''
cls = self.__class__
# in case the class has already been instantiated
if hasattr(cls, 'table_map'): return
table_map = {}
for key, value in cls.__dict__.items():
if isinstance(value, property):
continue # Skip properties
if callable(value) and hasattr(value, 'table_name'):
table_map[value.table_name] = value(self)
cls.table_map = table_map
def get_table(self, table_name):
'''
Retrieve the named table composition, if defined.
'''
return self.table_map.get(table_name)
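
A small usage sketch (the tables here are illustrative):

```py
import sqlalchemy as sa
from co3.composer import Composer, register_table

metadata = sa.MetaData()
files = sa.Table('files', metadata, sa.Column('name', sa.String, primary_key=True))
notes = sa.Table(
    'notes', metadata,
    sa.Column('name', sa.String, primary_key=True),
    sa.Column('format', sa.String),
)

class NoteComposer(Composer):
    @register_table()
    def note_files(self):
        # composed access point: notes joined onto their backing files
        return files.join(notes, files.c.name == notes.c.name)

composer   = NoteComposer()                     # sweeps decorated methods into `table_map`
note_files = composer.get_table('note_files')   # -> the composed Join object
```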

87
co3/database.py Normal file

@ -0,0 +1,87 @@
'''
Database
Central object for defining storage protocol-specific interfaces. The database wraps up
central items for interacting with database resources, namely the Accessor and Manager
objects.
The Database type hierarchy attempts to be exceedingly general; SQL-derivatives should
subclass from the RelationalDatabase subtype, for example, which itself becomes a new
generic via type dependence on Relation.
'''
import logging
from typing import Self
from co3.accessor import Accessor
from co3.composer import Composer
from co3.manager import Manager
from co3.indexer import Indexer
logger = logging.getLogger(__name__)
class Database:
accessor: type[Accessor[Self]] = Accessor
manager: type[Manager[Self]] = Manager
def __init__(self, resource):
'''
Variables:
_local_cache: a database-local property store for ad-hoc CacheBlock-esque
methods, that are nevertheless _not_ query/group-by responses to
pass on to the Indexer. Dependent properties should write to the
this cache and check for existence of stored results; the cache
state must be managed globally.
'''
self.resource = resource
self._access = self.accessor(self)
self._manage = self.manager(self)
self._index = Indexer(self._access)
self._local_cache = {}
self.reset_cache = False
@property
def engine(self):
'''
Database property to provide a singleton engine for DB interaction, initializing
the database if it doesn't already exist.
TODO: figure out thread safety across engines and/or connection. Any issue with
hanging on to the same engine instance for the Database instance?
'''
raise NotImplementedError
def connect(self):
self.engine.connect()
@property
def access(self):
return self._access
@property
def compose(self):
return self._compose
@property
def index(self):
if self.reset_cache:
self._index.cache_clear()
self.reset_cache = False
return self._index
@property
def manage(self):
'''
Accessing `.manage` queues a cache clear on the external index, as well as wipes the
local index.
'''
self.reset_cache = True
self._local_cache = {}
return self._manage
def populate_indexes(self): pass
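
A minimal sketch of a concrete subclass, assuming a SQLite file path as the resource
(the subclass name and engine handling are illustrative, not part of the package):

```py
import sqlalchemy as sa

from co3.database import Database
from co3.accessors.sql import SQLAccessor
from co3.managers.sql import SQLManager

class SQLiteFileDatabase(Database):
    accessor = SQLAccessor
    manager  = SQLManager

    def __init__(self, resource):
        super().__init__(resource)
        self._engine = None

    @property
    def engine(self):
        # lazily create and reuse a single engine for this Database instance
        if self._engine is None:
            self._engine = sa.create_engine(f'sqlite:///{self.resource}')
        return self._engine

db = SQLiteFileDatabase('co3.sqlite')
db.access   # SQLAccessor bound to this database
db.manage   # SQLManager bound to this database (also queues an index cache clear)
```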

3
co3/databases/__init__.py Normal file

@ -0,0 +1,3 @@
from co3.databases.sql import *
from co3.databases.fts import FTSDatabase
from co3.databases.vss import VSSDatabase

8 binary files not shown

8
co3/databases/fts.py Normal file

@ -0,0 +1,8 @@
from co3.database import Database
from co3.accessors.fts import FTSAccessor
from co3.managers.fts import FTSManager
class FTSDatabase(Database):
    accessor = FTSAccessor
    manager = FTSManager

34
co3/databases/sql.py Normal file

@ -0,0 +1,34 @@
from typing import Self
from co3.database import Database
from co3.accessors.sql import RelationalAccessor, TabularAccessor, SQLAccessor
from co3.managers.sql import RelationalManager, TabularManager, SQLManager
from co3.relation import Relation
from co3.relations import TabularRelation, SQLTable
class RelationalDatabase[R: Relation](Database):
    accessor: type[RelationalAccessor[Self, R]] = RelationalAccessor[Self, R]
    manager: type[RelationalManager[Self, R]] = RelationalManager[Self, R]

class TabularDatabase[R: TabularRelation](RelationalDatabase[R]):
    '''
    accessor/manager assignments satisfy the supertype's type settings;
    `TabularAccessor[Self, R]` is of type `type[RelationalAccessor[Self, R]]`
    (and yes, `type[]` specifies that the variable is itself being set to a type or a
    class, rather than a satisfying _instance_)
    '''
    accessor: type[TabularAccessor[Self, R]] = TabularAccessor[Self, R]
    manager: type[TabularManager[Self, R]] = TabularManager[Self, R]

class SQLDatabase[R: SQLTable](TabularDatabase[R]):
    accessor = SQLAccessor
    manager = SQLManager

class SQLiteDatabase(SQLDatabase[SQLTable]):
    pass

9
co3/databases/vss.py Normal file

@ -0,0 +1,9 @@
from co3.database import Database
from co3.accessors.vss import VSSAccessor
from co3.managers.vss import VSSManager
class VSSDatabase(Database):
    accessor = VSSAccessor
    manager = VSSManager

433
co3/indexer.py Normal file

@ -0,0 +1,433 @@
import time
import logging
import threading
from collections import defaultdict
from collections.abc import Iterable
import sqlalchemy as sa
logger = logging.getLogger(__name__)
class Indexer:
'''
Indexer class
Provides restricted access to an underlying Accessor to enable more efficient, superficial
caching.
Cache clearing is to be handled by a wrapper class, like the Database.
Caching occurs at the class level, with indexes prefixed by table's origin Composer.
This means that cached selects/group-bys will be available regardless of the provided
Accessors so long as the same Composer is used under the hood.
'''
_cls_select_cache = {}
_cls_groupby_cache = defaultdict(dict)
def __init__(self, accessor, cache_select=True, cache_groupby=True):
self.accessor = accessor
# set instance caches; if remains None, methods can't index
self._select_cache = None
self._groupby_cache = None
if cache_groupby and not cache_select:
raise ValueError('cannot cache groupbys without select caching enabled')
if cache_select:
self._select_cache = self._cls_select_cache
if cache_groupby:
self._groupby_cache = self._cls_groupby_cache
self._access_lock = threading.Lock()
def cache_clear(self, group_by_only=False):
self._groupby_cache.clear()
if not group_by_only:
self._select_cache.clear()
def cache_block(
self,
table,
**kwargs,
):
'''
Provide a user-friendly, dynamically re-indexable CacheBlock around this Indexer and the given table.
'''
return CacheBlock(
indexer = self,
table = table,
**kwargs,
)
def cached_query(
self,
table,
cols = None,
where = None,
distinct_on = None,
order_by = None,
limit = 0,
group_by = None,
agg_on = None,
index_on = None,
):
'''
Like `group_by`, but makes a full query against the Accessor's table `table` and
caches the results. The processing performed by the GROUP BY is also cached.
Update: `cached_select` and `cached_group_by` now unified by a single
`cached_query` method. This allows better defined GROUP BY caches, that are
reactive to the full set of parameters returning the result set (and not just the
table, requiring a full query).
Note: on cache keys
Cache keys are now fully stringified, as many objects are now allowed to be
native SQLAlchemy objects. Indexing these objects works, but doing so will
condition the cache on their memory addresses, which isn't what we want.
SQLAlchemy converts most join/column/table-like objects to reasonable strings,
which will look the same regardless of instance.
Context: this became a clear issue when passing in
`order_by=<col>.desc()`. The `desc()` causes the index to store the column in
an instance-specific way, rather than an easily re-usable, canonical column
reference. Each time the CoreDatabase.files() was being called, for instance,
that @property would be re-evaluated, causing `desc()` to be re-initialized,
and thus look different to the cache. Stringifying everything prevents this
(although this could well be an indication that only a single `cache_block`
should ever be returned by database properties).
Note: on access locks
A double-checked locking scheme is employed before both of the stages (select
and manual group by), using the same lock. This resolves the common scenario
where many threads need to look up a query in the cache, experience a cache
miss, and try to do the work. This non-linearly explodes the total time to
wait in my experience, so doing this only when needed saves tons of time,
especially in high-congestion moments.
'''
start = time.time()
cache_key = tuple(map(str, (table, cols, where, distinct_on, order_by, limit)))
# apparently this is the double-check locking scheme (didn't realize when implementing)
if self._select_cache is None or cache_key not in self._select_cache:
# cache re-compute possible, acquire lock to continue. A later thread may
# acquire this after work has been done by an earlier thread, so re-eval the
# condition below before actually performing a DB read. If access results in a
# cache hit, locking isn't needed.
with self._access_lock:
if self._select_cache is None or cache_key not in self._select_cache:
results = self.accessor.select(
table,
cols=cols,
where=where,
distinct_on=distinct_on,
order_by=order_by,
limit=limit,
mappings=True
)
# cache results if select_cache is defined
if self._select_cache is not None:
self._select_cache[cache_key] = results
logger.debug(
f'Indexer "select" cache miss for table "{table}": access in {time.time()-start:.4f}s'
)
else:
results = self._select_cache[cache_key]
logger.debug(
f'Indexer "select" cache hit for table "{table}": access in {time.time()-start:.4f}s'
)
else:
results = self._select_cache[cache_key]
logger.debug(
f'Indexer "select" cache hit for table "{table}": access in {time.time()-start:.4f}s'
)
start = time.time()
cache_key = (*cache_key, group_by, agg_on, index_on)
if group_by is not None:
if self._groupby_cache is None or cache_key not in self._groupby_cache:
with self._access_lock:
if self._groupby_cache is None or cache_key not in self._groupby_cache:
results = self.group_by(
results,
group_by = group_by,
agg_on = agg_on,
index_on = index_on,
return_index = True,
)
if self._groupby_cache is not None:
self._groupby_cache[cache_key] = results
logger.debug(
f'Indexer "group_by" cache miss for table "{table}": access in {time.time()-start:.4f}s'
)
else:
results = self._groupby_cache[cache_key]
logger.debug(
f'Indexer "group_by" cache hit for table "{table}": access in {time.time()-start:.4f}s'
)
else:
results = self._groupby_cache[cache_key]
logger.debug(
f'Indexer "group_by" cache hit for table "{table}": access in {time.time()-start:.4f}s'
)
return results
@classmethod
def group_by(
cls,
rows,
group_by,
agg_on=None,
index_on=None,
return_index=False,
):
'''
Post-query "group by"-like aggregation. Creates an index over a set of columns
(`group_by_cols`), and aggregates values from `agg_cols` under the groups.
Rows can be dicts or mappings, and columns can be strings or SQLAlchemy columns.
To ensure the right columns are being used for the operation, it's best to pass in
mappings and use SQA columns if you aren't sure exactly how the keys look in your
results (dicts can have ambiguous keys across tables with the same columns and/or
different labeling schemes altogether).
TODO: add a flag that handles None's as distinct. That is, for the group_by
column(s) of interest, if rows in the provided query set have NULL values for
these columns, treat all such rows as their "own group" and return them alongside
the grouped/aggregated ones. This is behavior desired by something like
FTSManager.recreate(), which wants to bundle up conversions for blocks
(effectively grouping by blocks.name and link.id, aggregating on
block_conversions.format, then flattening). You could either do this, or as the
caller just make sure to first filter the result set before grouping (e.g.,
splitting the NULL-valued rows from those that are well-defined), and then
stitching the two sets back together afterward.
Multi-dim update:
- group_by: can be a tuple of tuples of columns. Each inner tuple is a nested
"group by index" in the group by index
-
'''
if not rows:
return {} if return_index else []
rows_are_mappings = not isinstance(rows[0], dict)
if not rows_are_mappings:
if isinstance(group_by, sa.Column):
group_by = group_by.name
else:
group_by = str(group_by)
#if group_by is None: group_by = []
#elif not isinstance(group_by, Iterable): group_by = [group_by]
if agg_on is None: agg_on = []
elif not isinstance(agg_on, Iterable): agg_on = [agg_on]
if index_on is None: index_on = []
elif not isinstance(index_on, Iterable): index_on = [index_on]
agg_on_names = []
for agg in agg_on:
# if a SQA column, can either use `.name` or `str(c)`. The latter includes the
# table name, the former doesn't; ambiguity can be introduced here.
if isinstance(agg, sa.Column):
agg_on_names.append(agg.name)
else:
agg_on_names.append(str(agg))
index_on_names = []
for index in index_on:
# if a SQA column, can either use `.name` or `str(c)`. The latter includes the
# table name, the former doesn't; ambiguity can be introduced here.
if isinstance(index, sa.Column):
index_on_names.append(index.name)
else:
index_on_names.append(str(index))
# when rows are dicts, use columns' string names
if not rows_are_mappings:
agg_on = agg_on_names
index_on = index_on_names
#print(f'rows_are_mappings: {rows_are_mappings}')
#print(f'group_by: {group_by}')
#print(f'agg_on: {agg_on}')
#print(f'agg_on_names: {agg_on_names}')
#print(f'index_on: {index_on}')
#print(f'index_on_names: {index_on_names}')
# "group by" block ID and wrangle the links into a list
group_by_idx = {}
for row in rows:
# generic get
group_by_attr = row.get(group_by)
# wrap possible mapping dict
row_dict = dict(row)
# add new entries; standardize
#aggregates = {}
#for agg_name in agg_on_names:
# aggregates[agg_name] = []
#row_dict['aggregates'] = aggregates
row_dict['aggregates'] = []
indexes = {}
for index_name in index_on_names:
indexes[index_name] = {}
row_dict['indexes'] = indexes
if group_by_attr is None:
continue
if group_by_attr not in group_by_idx:
group_by_idx[group_by_attr] = row_dict
# actually include all agg cols, even if None, so agg array indexes align
agg_dict = {
agg_on_names[i] : row.get(agg_col)
for i, agg_col in enumerate(agg_on)
}
#aggregates = group_by_idx[group_by_attr]['aggregates']
#for agg_key, agg_val in agg_dict.items():
# aggregates[agg_key].append(agg_val)
aggregates = group_by_idx[group_by_attr]['aggregates']
aggregates.append(agg_dict)
indexes = group_by_idx[group_by_attr]['indexes']
for i, index_col in enumerate(index_on):
index_name = index_on_names[i]
indexes[index_name][row[index_col]] = agg_dict
if return_index:
return group_by_idx
return list(group_by_idx.values())
class CacheBlock:
'''
CacheBlock class
Wraps up a set of query parameters for a specific entity, and provides cached access
to different types of "re-queries" via an associated Indexer.
The goal here is to help build/define entities as the possibly complex transformations
on the base schema that they are. For example, the Note primitive (entity)
incorporates details across `files`, `notes`, `note_conversions`, and
`note_conversion_matter` tables (defined in a single endpoint by a Composer), often
needs to be selected in particular ways (via an Accessor), and results stored for fast
access later on (handled by an Indexer). This pipeline can be daunting and requires
too many moving parts to be handled explicitly everywhere. CacheBlocks wrap up a set
of query "preferences," exposing a simpler interface for downstream access to
entities. It still allows for low-level control over re-grouping/indexing, raw hits to
the actual DB, etc, but keeps things tighter and well-behaved for the Indexer.
You can think of these as the Indexer's "fingers"; they're deployable mini-Indexes
that "send back" results to the class cache, which is "broadcast" to all other
instances for use when necessary.
Note: Example usage
```py
cb = CacheBlock()
# Set up cached queries with chained params or via call:
cb.where(t.notes.c.name=="name").group_by(t.note_conversions.c.format)
cb() # get results
# - OR - # (use strings when known)
cb.where(t.notes.c.name=="name").group_by('format')
cb() # get results
# - OR - # (use kwargs in the call; results returned right away)
cb(
where=(t.notes.c.name=="name"),
group_by='format'
)
```
'''
def __init__(
self,
indexer,
table,
cols = None,
where = None,
distinct_on = None,
order_by = None,
limit = 0,
group_by = None,
agg_on = None,
index_on = None,
):
self.indexer = indexer
self.query_args = {
'table' : table,
'cols' : cols,
'where' : where,
'distinct_on' : distinct_on,
'order_by' : order_by,
'limit' : limit,
'group_by' : group_by,
'agg_on' : agg_on,
'index_on' : index_on,
}
def _query(self, **kwargs):
'''Make cached query with defaults, override with those provided'''
return self.indexer.cached_query(**{
k : (v if k not in kwargs else kwargs[k])
for k,v in self.query_args.items()
})
def __call__(self, **kwargs):
'''
TODO: overload this for the queries, i.e. getting keys or returning aggregates
'''
return self._query(**kwargs)
def where(self, where):
self.query_args['where'] = where
return self
#return self._query(where=where)
def distinct_on(self, distinct_on):
self.query_args['distinct_on'] = distinct_on
return self
#return self._query(distinct_on=distinct_on)
def order_by(self, order_by):
self.query_args['order_by'] = order_by
return self
#return self._query(order_by=order_by)
def limit(self, limit):
self.query_args['limit'] = limit
return self
def group_by(self, group_by):
self.query_args['group_by'] = group_by
return self
#return self._query(group_by=group_by)
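
A small, self-contained example of the post-query grouping performed by
`Indexer.group_by` (plain dict rows; `agg_on`/`index_on` given as lists):

```py
from co3.indexer import Indexer

rows = [
    {'name': 'note-a', 'format': 'html5', 'link': 1},
    {'name': 'note-a', 'format': 'src',   'link': 2},
    {'name': 'note-b', 'format': 'html5', 'link': 3},
]

grouped = Indexer.group_by(rows, group_by='name', agg_on=['format'], index_on=['link'])

# two grouped rows ('note-a', 'note-b'); the 'note-a' row carries
#   'aggregates': [{'format': 'html5'}, {'format': 'src'}]
#   'indexes':    {'link': {1: {'format': 'html5'}, 2: {'format': 'src'}}}
```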

Some files were not shown because too many files have changed in this diff.