initial commit

Sam G. 2024-03-28 23:11:30 -07:00
commit 057e20163d
206 changed files with 10154 additions and 0 deletions

1
.python-version Normal file
@@ -0,0 +1 @@
co4

0
MANIFEST.in Normal file

16
Makefile Normal file
@@ -0,0 +1,16 @@
PYTHON=/home/smgr/.pyenv/versions/co4/bin/python
BASH=/usr/bin/bash
## ------------------ docs ------------------ ##
docs-build:
sphinx-apidoc --module-first -o docs/_autoref/ co4
make -C docs/ html
docs-serve:
cd docs/_build/html && python -m http.server 9090
docs-clean:
make -C docs/ clean
## ------------------------------------------ ##

21
README.md Normal file
@@ -0,0 +1,21 @@
# Overview
`co3` is a package for file conversion and associated database operations. The `CO3` base class
provides a standard interface for performing conversions, preparing inserts, and
interacting with database schemas that mirror the class hierarchy.
Simplified description of the operational model:
**Goal**: interact with a storage medium (database, pickled structure, VSS framework) with
a known schema.
- **Accessor** to provide access to stored items
- **Composer** to compose common access points (e.g., JOINed tables)
- **Indexer** to index/cache access queries
- **Manager** to manage storage state (e.g., supported inserts, database syncs)
- **Collector** to collect data for updating storage state
- **Database** to unify access and management of a storage resource under one interface
- **Mapper** to map CO3 object types to their corresponding schema components
- **Relation** to model the stored structures themselves (e.g., tables)
**CO3** is an abstract base class that makes it easy to integrate this model with object
hierarchies that mirror a storage schema.
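As a rough sketch of how the pieces are meant to fit together (the `Document` type, its
columns, and the conversion action below are illustrative, not part of the package):

```py
from co3 import CO3
from co3.co3 import collate

class Document(CO3):
    '''Hypothetical CO3 subtype mirroring a `documents` table.'''
    def __init__(self, name, text):
        self.name = name
        self.text = text

    @property
    def attributes(self):
        # row content destined for the type's canonical table
        return {'name': self.name, 'text': self.text}

    @collate('html')
    def to_html(self):
        return f'<p>{self.text}</p>'

doc = Document('readme', 'co3 overview')
doc.collate('html')   # dispatch a registered conversion action
```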

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/__init__.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/accessor.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/accessors/__init__.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/accessors/fts.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/accessors/sql.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/accessors/vss.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/co3.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/collector.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/composer.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/database.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/databases/__init__.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/databases/fts.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/databases/sql.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/databases/vss.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/indexer.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/manager.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/managers/__init__.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/managers/fts.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/managers/sql.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/managers/vss.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/mapper.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/relation.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/relations/__init__.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/util/__init__.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/util/db.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co3/co3/util/regex.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/__init__.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/accessor.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/accessors/__init__.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/accessors/fts.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/accessors/table.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/accessors/vss.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/co4.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/collector.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/composer.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/databases/__init__.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/databases/_base.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/databases/core.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/databases/fts.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/databases/vss.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/indexer.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/manager.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/managers/__init__.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/managers/core.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/managers/fts.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/managers/vss.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/utils/db.py

@@ -0,0 +1 @@
/home/smgr/Documents/projects/ontolog/co4/co4/utils/paths.py

30
co3.egg-info/PKG-INFO Normal file
@@ -0,0 +1,30 @@
Metadata-Version: 2.1
Name: co3
Version: 0.1.1
Summary: Lightweight ORM
Author-email: Sam Griesemer <samgriesemer@gmail.com>
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Requires-Dist: tqdm
# Overview
`co4` is a package for file conversion and associated database operations. The `CO4` base class
provides a standard interface for performing conversions, preparing inserts, and
interacting with database schemas that mirror the class hierarchy.
Simplified description of the operational model:
**Goal**: interact with a storage medium (database, pickled structure, VSS framework) with
a known schema.
- **Accessor** to provide access to stored items
- **Composer** to compose common access points (e.g., JOINed tables)
- **Indexer** to index/cache access queries
- **Manager** to manage storage state (e.g., supported inserts, database syncs)
- **Collector** to collect data for updating storage state
**CO4** is an abstract base class that makes it easy to integrate this model with object
hierarchies that mirror a storage schema.

34
co3.egg-info/SOURCES.txt Normal file
@@ -0,0 +1,34 @@
MANIFEST.in
README.md
pyproject.toml
co3/__init__.py
co3/accessor.py
co3/co3.py
co3/collector.py
co3/composer.py
co3/database.py
co3/indexer.py
co3/manager.py
co3/mapper.py
co3/relation.py
co3.egg-info/PKG-INFO
co3.egg-info/SOURCES.txt
co3.egg-info/dependency_links.txt
co3.egg-info/requires.txt
co3.egg-info/top_level.txt
co3/accessors/__init__.py
co3/accessors/fts.py
co3/accessors/sql.py
co3/accessors/vss.py
co3/databases/__init__.py
co3/databases/fts.py
co3/databases/sql.py
co3/databases/vss.py
co3/managers/__init__.py
co3/managers/fts.py
co3/managers/sql.py
co3/managers/vss.py
co3/relations/__init__.py
co3/util/__init__.py
co3/util/db.py
co3/util/regex.py

@@ -0,0 +1 @@


@@ -0,0 +1 @@
tqdm

@@ -0,0 +1 @@
co3

109
co3/__init__.py Normal file
@@ -0,0 +1,109 @@
'''
Database submodule
- `db`: contains SQLAlchemy-based schema definitions
- `accessors`: convenience methods for accessing database entries
- `populate`: convenience methods for populating database tables
The `accessors` and `populate` submodules are each split into `schema` and `fts` method
groups. The former concerns methods relating to the actual database schema, the latter to
their SQLite FTS counterparts.
Note: Subpackage organization
Subpackages are broken up by inheritance. Within a given submodule, you have a
`_base.py` file defining the base class associated with that submodule's title, along
with concrete subclasses of that base in their own files. Deeper inheritance would
recursively extend this structure. The `__init__.py` for a given submodule then
exposes the concrete instances, leaving the base hidden. For example,
accessors/
_base.py
core.py
fts.py
`core` and `fts` house the `CoreAccessor` and `FTSAccessor` classes, respectively,
and are the direct subclasses of the `Accessor` parent found in the `_base`. This base
class _could_ be placed outside of the submodule in the parent directory (imported
with something like `from db import accessor` instead of `from db.accessor import
_base`). This is entirely valid, but I tend to prefer when the base class is among its
direct children, as
- In this case at least, the base doesn't need to be exposed
- The base class is being stowed away under an appropriately named submodule; having a
separate `accessor.py` and `accessors/` file/directory can feel a little cluttered.
- It makes imports across the accessors feel standardized:
```py
from localsys.db.accessors._base import Accessor
from localsys.db.accessors.core import CoreAccessor
```
Both have the same level of nesting to reach the class.
Frankly, both means of organization are perfectly fine, and as far as I can tell,
semantically sound in their own right. This particular scheme is just a preference in
the moment, and so long as I keep things consistent, choosing one over the other
shouldn't matter.
Additionally, note how `__init__.py`s are typically set up when providing wider access
to internal modules. The `init` typically pulls out classes from sibling modules
(i.e., files), but will import subpackages at the topmost level. For example, for the
structure
```
db/
__init__.py
accessors/
__init__.py
_base.py
core.py
fts.py
```
we have
```db/__init__.py
from localsys.db import accessors
```
which just imports the subpackage `accessors`. However, within the subpackage:
```db/accessors/__init__.py
from localsys.db.accessors.core import CoreAccessor
```
we don't just import the submodule `core`; we dig into the file to grab the relevant
class and pull it into the outer namespace. Overarching point: `__init__.py` files
typically reach into the sibling files (submodules) and pull out classes. Given that
this behavior is recursive, `__init__.py` files then respect subpackages (nested
directories), importing them at the top level and expecting an internal `__init__.py`
to have managed access appropriately.
Note: Organization for inheritance over composition
At a glance, the organization of subpackages here feels like it clashes with those
seen in `localsys.primitives`. `note_components`, for instance, houses the components
for the outer `note` module. Contrast this with how the `core` submodule looks: it's
composing `*/core.py` files across subpackages `accessors` and `managers`, rather than
a single subpackage like `note`. This seems inconsistent, but the subpackages here are
actually still organized in the same way: by inheritance. It just happens that
all of the note components inherit from the same base class, and are thus confined to
a single subpackage. This aside, the subpackages themselves are still created around
inheritance, wrapping up a base and direct subclasses.
'''
from co3.accessor import Accessor
from co3.co3 import CO3
from co3.collector import Collector
from co3.composer import Composer
from co3.database import Database
from co3.indexer import Indexer
from co3.manager import Manager
from co3.mapper import Mapper
from co3.relation import Relation
from co3 import accessors
from co3 import databases
from co3 import managers
from co3 import relations
from co3 import util

28
co3/accessor.py Normal file
@@ -0,0 +1,28 @@
'''
Accessor
Provides access to an underlying schema through a supported set of operations. Class
methods could be general, high-level SQL wrappers, or convenience functions for common
schema-specific queries.
'''
import inspect
from pathlib import Path
from collections import defaultdict
import sqlalchemy as sa
#from co3.database import Database
class Accessor[D: 'Database']:
'''
Access wrapper class for complex queries and easy integration with Composer tables.
Implements high-level access to things like common constrained SELECT queries.
Parameters:
database: Database instance to wrap; provides the engine ultimately used for
queries
'''
def __init__(self, database: D):
self.database = database

24
co3/accessors/__init__.py Normal file
@@ -0,0 +1,24 @@
'''
Note that subclasses in this subpackage are split differently to other subpackages in the
DB. Instead of being split by table group, corresponding to a Composer (which defines that
table group), Accessors are split by a separate dimension: table "type". This is why we
have a "TableAccessor" and an "FTSAccessor": the former exposes access operations
available to generic tables, the latter to FTS tables (instead of being designed
specifically around "core" and "fts" groups, for instance).
Seeing as FTS tables are "generic" tables, it seems inconsistent not to have FTSAccessor
inherit from TableAccessor. While this would work fine, the model we're working with
doesn't really need it; you can instead think of the FTSAccessor as defining _only_
FTS-specific operations. Given that you have a Composer for your desired table group, you
can then wrap it with your desired set of "access actions," available in separate Accessor
subclasses.
For instance, you could wrap an FTSComposer in either a TableAccessor or FTSAccessor. The
former will treat the tables in the composer like regular tables, exposing methods like
`.select` and `.select_one`, whereas the latter defines FTS-specific actions like
`.search`.
'''
from co3.accessors.sql import SQLAccessor
from co3.accessors.fts import FTSAccessor
from co3.accessors.vss import VSSAccessor

147
co3/accessors/fts.py Normal file
@@ -0,0 +1,147 @@
import sqlalchemy as sa
from co3 import util
from co3.accessor import Accessor
class FTSAccessor(Accessor):
def search(
self,
table_name : str,
select_cols : str | list | None = '*',
search_cols : str | None = None,
q : str | None = None,
colq : str | None = None,
snip_col : int | None = 0,
hl_col : int | None = 0,
limit : int | None = 100,
snip : int | None = 64,
tokenizer : str | None = 'unicode61',
group_by : str | None = None,
agg_cols : list | None = None,
wherein_dict: dict | None = None,
unique_on : dict | None = None,
):
'''
Execute a search query against an indexed FTS table for specific primitives. This
method is mostly a generic FTS handler, capable of handling queries to any available
FTS table with a matching naming scheme (`<table_name>_fts_<tokenizer>`). The current
intention is to support all tokenizers, for file, note, block, and link primitives.
Search results include all FTS table columns, as well as SQLite-supported `snippet`s
and `highlight`s for matches. Matches are filtered and ordered by SQLite's
`MATCH`-based score for the text & column queries. Results are (a list of) fully
expanded dictionaries housing column-value pairs.
Note:
GROUP BY cannot be paired with SQLite FTS extensions; thus, we perform manual
group checks on the result set in Python before responding.
Analysis:
The returned JSON structure has been (loosely) optimized for speed on the client
side. Fully forming individual dictionary based responses saves time in
Javascript, as the JSON parser is expected to be able to create the objects
faster than post-hoc construction in JS. This return structure was compared
against returning an array of arrays (all ordered in the same fashion), along with
a column list to be associated with each of the result values. While this saves
some size on the payload (the same column names don't have to be transmitted for
each result), the size of the returned content massively outweighs the
predominantly short column names. The only way this structure would be viable is
if a significant amount was saved on transfer compared to the slow down in JS
object construction; this is (almost) never the case.
Parameters:
table_name : name of FTS table to search
search_cols : space separated string of columns to use for primary queries
q : search query
colq : column constraint string; must conform to SQLite standards (e.g.,
`<col>:<text>`
snip_col : table column to use for snippets (default: 0; typically the source content column)
hl_col : table column to use for highlights (default: 0; typically the format column,
applied to HTML targets)
limit : maximum number of results to return in the SQL query
snip : snippet length (max: 64)
tokenizer : tokenizer to use (assumes relevant FTS table has been built)
...
wherein_dict: (col-name, value-list) pairs to match result set against, via
WHERE ... IN clauses
Returns:
Dictionary with search results (list of column indexed dictionaries) and relevant
metadata.
'''
search_q = ''
if type(select_cols) is list:
select_cols = ', '.join(select_cols)
# construct main search query
if search_cols and q:
search_q = f'{{{search_cols}}} : {q}'
# add auxiliary search constraints
if colq:
search_q += f' {colq}'
search_q = search_q.strip()
hl_start = '<b><mark>'
hl_end = '</mark></b>'
fts_table_name = f'{table_name}_fts_{tokenizer}'
sql = f'''
SELECT
{select_cols},
snippet({fts_table_name}, {snip_col}, '{hl_start}', '{hl_end}', '...', {snip}) AS snippet,
highlight({fts_table_name}, {hl_col}, '{hl_start}', '{hl_end}') AS highlight
FROM {fts_table_name}
'''
where_clauses = []
if search_q:
where_clauses.append(f"{fts_table_name} MATCH '{search_q}'\n")
if wherein_dict:
for col, vals in wherein_dict.items():
where_clauses.append(f'{col} IN {tuple(vals)}\n')
if where_clauses:
where_str = " AND ".join(where_clauses)
sql += f'WHERE {where_str}'
sql += f'ORDER BY rank LIMIT {limit};'
row_dicts, cols = self.raw_select(sql, include_cols=True)
if group_by is None:
return row_dicts, cols
if agg_cols is None:
agg_cols = []
# "group by" block ID and wrangle the links into a list
# note we can't perform native GROUP BYs with FTS results
group_by_idx = {}
for row in row_dicts:
group_by_attr = row.get(group_by)
# add new entries
for agg_col in agg_cols:
row[f'{agg_col}_agg'] = set()
if group_by_attr is None:
continue
if group_by_attr not in group_by_idx:
group_by_idx[group_by_attr] = row
for agg_col in agg_cols:
if agg_col in row:
group_by_idx[group_by_attr][f'{agg_col}_agg'].add(row[agg_col])
return {
'results' : group_by_idx,
'columns' : cols,
'num_results' : len(row_dicts),
}
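For example, a hypothetical call against a `notes` table indexed elsewhere as
`notes_fts_unicode61` (the table, its columns, and the `db` handle are assumptions, not
part of this module):

```py
fts = FTSAccessor(db)   # `db`: some SQL-backed Database instance (hypothetical)
results = fts.search(
    table_name  = 'notes',
    search_cols = 'title content',
    q           = 'database',
    group_by    = 'name',
    agg_cols    = ['link'],
    limit       = 20,
)
# with `group_by` set, `results` is a dict with 'results', 'columns', 'num_results'
print(results['num_results'])
```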

96
co3/accessors/sql.py Normal file
@@ -0,0 +1,96 @@
from pathlib import Path
from collections.abc import Iterable
import inspect
from functools import cache
import sqlalchemy as sa
from co3 import util
from co3.accessor import Accessor
from co3.relation import Relation
#from co3.databases.sql import RelationalDatabase, TabularDatabase, SQLDatabase
from co3.relations import TabularRelation, SQLTable
class RelationalAccessor[D: 'RelationalDatabase', R: Relation](Accessor[D]):
pass
class TabularAccessor[D: 'TabularDatabase', R: TabularRelation](RelationalAccessor[D, R]):
pass
class SQLAccessor(TabularAccessor['SQLDatabase', SQLTable]):
def raw_select(
self,
sql,
bind_params=None,
mappings=False,
include_cols=False,
):
res_method = util.db.sa_exec_dicts
if mappings:
res_method = util.db.sa_exec_mappings
return res_method(self.database.engine, sa.text(sql), bind_params=bind_params, include_cols=include_cols)
def select(
self,
table: sa.Table | sa.Subquery | sa.Join,
cols = None,
where = None,
distinct_on = None,
order_by = None,
limit = 0,
mappings = False,
include_cols = False,
):
'''
Perform a SELECT query against the provided table-like object (see
`check_table()`).
Deprecated: String aliases
String aliases for tables are no longer supported. This method no longer checks
against any specific schema table-maps or Composers. Instead, this should be
done outside the Accessor.
Parameters:
distinct_on: column(s) to group by; for now serves as a proxy for DISTINCT
(no aggregation methods accepted)
order_by: column to order results by (can use <col>.desc() to order
by descending)
'''
if where is None:
where = sa.true()
res_method = util.db.sa_exec_dicts
if mappings:
res_method = util.db.sa_exec_mappings
stmt = sa.select(table).where(where)
if cols is not None:
stmt = sa.select(*cols).select_from(table).where(where)
if distinct_on is not None:
stmt = stmt.group_by(distinct_on)
if order_by is not None:
stmt = stmt.order_by(order_by)
if limit > 0:
stmt = stmt.limit(limit)
return res_method(self.database.engine, stmt, include_cols=include_cols)
def select_one(self, table, cols=None, where=None, mappings=False, include_cols=False):
res = self.select(table, cols=cols, where=where, mappings=mappings, include_cols=include_cols, limit=1)
if include_cols and len(res[0]) > 0:
return res[0][0], res[1]
if len(res) > 0:
return res[0]
return None
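A usage sketch, under the assumption that `db` is a concrete `SQLDatabase`-like object
and `notes` is an ordinary SQLAlchemy table (both hypothetical here):

```py
import sqlalchemy as sa

metadata = sa.MetaData()
notes = sa.Table(
    'notes', metadata,
    sa.Column('name',    sa.String, primary_key=True),
    sa.Column('content', sa.String),
)

accessor = SQLAccessor(db)
rows = accessor.select(
    notes,
    cols     = [notes.c.name],
    where    = notes.c.content.like('%sqlite%'),
    order_by = notes.c.name,
    limit    = 10,
)
first = accessor.select_one(notes, where=notes.c.name == 'readme')
```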

100
co3/accessors/vss.py Normal file
@@ -0,0 +1,100 @@
import pickle
import logging
from pathlib import Path
import time
import sqlalchemy as sa
#from sentence_transformers import SentenceTransformer, util
from co3.accessor import Accessor
logger = logging.getLogger(__name__)
class VSSAccessor(Accessor):
def __init__(self, database, cache_path):
# the base Accessor requires the database handle
super().__init__(database)
self._model = None
self._embeddings = None
self._embedding_size = 384
self.embedding_path = Path(cache_path, 'embeddings.pkl')
def write_embeddings(self, embedding_dict):
self.embedding_path.write_bytes(pickle.dumps(embedding_dict))
def read_embeddings(self):
if not self.embedding_path.exists():
logger.warning(
f'Attempting to access non-existent embeddings at {self.embedding_path}'
)
return None
return pickle.loads(self.embedding_path.read_bytes())
@property
def model(self):
if self._model is None:
# lazy import; the top-level import is commented out to keep the dependency optional
from sentence_transformers import SentenceTransformer
# model trained with 128 token seqs
self._model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
return self._model
@property
def embeddings(self):
if self._embeddings is None:
self._embeddings = self.read_embeddings()
return self._embeddings
def embed_chunks(self, chunks, batch_size=64, show_prog=True):
return self.model.encode(
chunks,
batch_size = batch_size,
show_progress_bar = show_prog,
convert_to_numpy = True,
normalize_embeddings = True
)
def search(
self,
query : str,
index_name : str,
limit : int = 10,
score_threshold = 0.5,
):
'''
Parameters:
index_name: one of ['chunks','blocks','notes']
'''
if not query:
return None
if index_name not in self.embeddings:
logger.warning(
f'Index "{index_name}" does not exist'
)
return None
start = time.time()
query_embedding = self.embed_chunks(query, show_prog=False)
index_ids, index_embeddings, index_items = self.embeddings[index_name]
# `util` here is sentence_transformers.util; imported lazily like the model above
from sentence_transformers import util
hits = util.semantic_search(
query_embedding,
index_embeddings,
top_k=limit,
score_function=util.dot_score
)[0]
hits = [hit for hit in hits if hit['score'] >= score_threshold]
for hit in hits:
idx = hit['corpus_id']
hit['group_name'] = index_ids[idx]
hit['item'] = index_items[idx]
logger.info(f'{len(hits)} hits in {time.time()-start:.2f}s')
return hits
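A hypothetical search pass, assuming embeddings were previously computed and written to
the cache as `{'chunks': (ids, embedding_matrix, items)}`; the index name, cache path,
and `db` handle are illustrative:

```py
vss = VSSAccessor(db, cache_path='/tmp/co3-cache')
hits = vss.search('how are notes linked together?', index_name='chunks', limit=5)
for hit in hits or []:
    print(f"{hit['score']:.3f}", hit['group_name'])
```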

106
co3/co3.py Normal file
@@ -0,0 +1,106 @@
'''
CO3
CO3 is an abstract base class for scaffolding object hierarchies and managing operations
with associated database schemas. It facilitates something like a "lightweight ORM" for
classes/tables/states with fixed transformations of interest. The canonical use case is
managing hierarchical document relations, format conversions, and syntactical components.
'''
import inspect
import logging
from functools import wraps, partial
#from localsys.db.schema import tables
logger = logging.getLogger(__name__)
#def register_format(_format):
# def decorator(func):
# self.collate.format_map[_format] = func
#
# @wraps(func)
# def register(*args, **kwargs):
# return func(*args, **kwargs)
#
# return register
# return decorator
def collate(action_key, action_groups=None):
def decorator(func):
# avoid rebinding `action_groups` inside the closure; doing so would raise
# UnboundLocalError when it is checked
groups = action_groups if action_groups is not None else [None]
func._action_data = (action_key, groups)
return func
return decorator
class FormatRegistryMeta(type):
def __new__(cls, name, bases, attrs):
action_registry = {}
# add registered superclass methods; iterate over bases (usually just one), then
# that base's chain down (reversed), then methods from each subclass
for base in bases:
for _class in reversed(base.mro()):
methods = inspect.getmembers(_class, predicate=inspect.isfunction)
for _, method in methods:
if hasattr(method, '_action_data'):
action_key, action_groups = method._action_data
action_registry[action_key] = (method, action_groups)
# add final registered formats for the current class, overwriting any found in
# superclass chain
for attr_name, attr_value in attrs.items():
if hasattr(attr_value, '_action_data'):
action_key, action_groups = attr_value._action_data
action_registry[action_key] = (attr_value, action_groups)
attrs['action_map'] = action_registry
return super().__new__(cls, name, bases, attrs)
class CO3(metaclass=FormatRegistryMeta):
'''
CO3: COllate, COllect, COmpose - conversion & DB insertion base
- Collate: organize and transform conversion outputs, possibly across class components
- Collect: gather core attributes, conversion data, and subcomponents for DB insertion
- Compose: construct object-associated DB table references through the class hierarchy
Note: on action groups
Group keys are simply named collections to make it easy for storage components to
be attached to action subsets. They do _not_ augment the action registration
namespace, meaning the action key should still be unique; the group key is purely
auxiliary.
Action methods can also be attached to several groups, in case there is
overlapping utility within or across schemas or storage media. In this case, it
becomes particularly critical to ensure registered `collate` methods really are
just "gathering results" from possibly heavy-duty operations, rather than
performing them when called, so as to reduce wasted computation.
'''
@property
def attributes(self):
'''
Method to define how a subtype's inserts should be handled under `collect` for
canonical attributes, i.e., inserts to the type's table.
'''
return vars(self)
@property
def components(self):
'''
Method to define how a subtype's inserts should be handled under `collect` for
constituent components that need handling.
'''
return []
def collate(self, action_key, *action_args, **action_kwargs):
if action_key not in self.action_map:
logger.debug(f'Collation for {action_key} not supported')
return None
# action_map values are (method, action_groups) pairs
method, _ = self.action_map[action_key]
return method(self, *action_args, **action_kwargs)
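To make the action-group behavior concrete, a hedged sketch using the `CO3` and `collate`
definitions above (the `Text` type and its actions are hypothetical):

```py
class Text(CO3):
    def __init__(self, content):
        self.content = content

    @collate('markdown', action_groups=['formats'])
    def to_markdown(self):
        return self.content

    @collate('html', action_groups=['formats', 'web'])
    def to_html(self):
        return f'<p>{self.content}</p>'

# the metaclass sweeps decorated methods into the class-level action_map, e.g.
# Text.action_map == {'markdown': (to_markdown, ['formats']),
#                     'html':     (to_html,     ['formats', 'web'])}
Text('hi').collate('html')    # -> '<p>hi</p>'
```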

109
co3/collector.py Normal file
@@ -0,0 +1,109 @@
'''
Defines the Collector base class.
This module is the critical "middleware" connecting the primitive object definitions and
their representations in the database. It operates with full knowledge of how both are
defined, and abstracts away both the prep work for DB insertions as well as updates
trickling down the primitive hierarchy.
The `src` format target is re-used for both canonical tables/primitives, as well as
<prim>_conversion_matter tables in tables/conversions under the `src` format. The latter
is meant to extend those attributes that are format-specific (i.e., would change when, say,
converting to `html5`), and thus need to be broken across the format dimension.
Note:
Despite the structure of the database module, this class does not currently inherit
from a super class in localsys.db (like the accessors and managers, for instance).
This will likely ultimately be the model that's embraced, but until FTS (or other
groups) need a collector, this will remain an independent class. It is, however,
named like a concrete subclass, taking on the "Core" prefix.
'''
from pathlib import Path
from collections import defaultdict
import logging
import importlib
import subprocess
from uuid import uuid4
import sqlalchemy as sa
from co3 import util
#from localsys.db.schema import tables
logger = logging.getLogger(__name__)
class Collector:
def __init__(self):
self._inserts = defaultdict(lambda: defaultdict(list))
@property
def inserts(self):
return self._inserts_from_receipts()
def _inserts_from_receipts(self, receipts=None, pop=False):
inserts = defaultdict(list)
if receipts is None:
receipts = list(self._inserts.keys())
for receipt in receipts:
if pop: insert_dict = self._inserts.pop(receipt, {})
else: insert_dict = self._inserts[receipt]
for table, insert_list in insert_dict.items():
inserts[table].extend(insert_list)
return dict(inserts)
def _reset_session(self):
self._inserts = defaultdict(lambda: defaultdict(list))
def _generate_unique_receipt(self):
return str(uuid4())
def add_insert(self, table_name, insert_dict, receipts=None):
'''
TODO: formalize table_name mapping; at class level provide a `table_map`, or provide
the table object itself to this method
'''
if table_name not in tables.table_map:
#logger.debug(f'Inserts provided for non-existent table {table_name}')
return None
receipt = self._generate_unique_receipt()
self._inserts[receipt][table_name].append(
util.db.prepare_insert(
tables.table_map[table_name],
insert_dict
)
)
if receipts is not None:
receipts.append(receipt)
return receipt
def collect_inserts(self, receipts=None):
'''
Collect insert-ready dictionaries for the core primitive schema. This method is
effectively a light wrapper around the File and Note-based collection logic
elsewhere in the class.
The overall collection scheme embraces a session-like sequential update model to
an internal insert tracker. The sequence of insert methods is ordered according to
the schema hierarchy, and higher level inserts dictate the scope for lower level
inserts (all methods check and populate the same `inserts` dictionary). Calling
this method flushes any existing inserts, ensuring a re-scan takes place across
calls (or "sessions").
Parameters:
receipts: optional list of receipts restricting which tracked inserts are
collected; defaults to all outstanding receipts
Returns:
Table name-indexed dictionary of insert lists (of column name-indexed dicts)
'''
return self._inserts_from_receipts(receipts, pop=True)
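A sketch of the intended receipt-based flow (table names and rows are illustrative, and
`add_insert` as written still expects an external `tables.table_map` to resolve them):

```py
collector = Collector()

receipts = []
collector.add_insert('files', {'path': '/tmp/readme.md', 'size': 1024}, receipts=receipts)
collector.add_insert('notes', {'name': 'readme', 'format': 'md'}, receipts=receipts)

# gather (and pop) everything tracked under the collected receipts,
# grouped by table name and ready to hand off for a database sync
inserts = collector.collect_inserts(receipts)
# -> {'files': [...], 'notes': [...]}
```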

89
co3/composer.py Normal file
@@ -0,0 +1,89 @@
'''
Composer
Base for manually defining table compositions outside those natural to the schema
hierarchy (i.e., constructable by a `CO3.compose()` call).
Example: suppose we have a simple object hierarchy A(CO3) -> B -> C. C's in-built
`compose()` method may not always be desirable when constructing composite tables and
running related queries. In this case, a custom Composer can be used to make needed
composite tables easier to reference; in the case below, we define the "BC" composite
table.
```
class ExampleComposer(Composer):
@register_table()
def BC(self):
full_B = B.compose(full=True)
full_C = C.compose(full=True)
return full_B.join(
full_C,
full_B.c.name == full_C.c.name, # TODO: is this fine? or do we need base table refs
outer=True
)
'''
from pathlib import Path
from co3.mapper import Mapper
def register_table(table_name=None):
'''
Registry decorator for defined composer classes. Decorating a class method simply
attaches a `table_name` attribute to it, setting it to either a provided value or the
name of the method itself. Methods with a `table_name` attribute are later swept up at
the class level and placed in the `table_map`.
'''
def decorator(func):
# avoid rebinding `table_name` inside the closure; doing so would raise
# UnboundLocalError when it is checked
func.table_name = table_name if table_name is not None else func.__name__
return func
return decorator
class Composer[M: Mapper]:
'''
Base composer wrapper for table groupings.
The schema is centered around a connected group of tables (via foreign keys). Thus,
most operations need to be coordinated across tables. The `accessors` submodules
are mostly intended to provide a "secondary layer" over the base set of tables in the
schema, exposing common higher level table compositions (i.e., chained JOINs). See
concrete instances (e.g., CoreAccessor, FTSAccessor) for actual implementations over these
tables; the base class does not expose any tables itself.
Tables in subclasses are registered with the `register_table` decorator, automatically
indexing them under the provided name and making them available via the `table_map`.
'''
def __init__(self):
self._set_tables()
def _set_tables(self):
'''
Skip properties (so appropriate delays can be used), and set the table registry
at the class level. This only takes place during the first
instantiation of the class, and makes it possible to definitively tie methods to
composed tables during lookup with `get_table()`.
'''
cls = self.__class__
# in case the class has already been instantiated
if hasattr(cls, 'table_map'): return
table_map = {}
for key, value in cls.__dict__.items():
if isinstance(value, property):
continue # Skip properties
if callable(value) and hasattr(value, 'table_name'):
table_map[value.table_name] = value(self)
cls.table_map = table_map
def get_table(self, table_name):
'''
Retrieve the named table composition, if defined.
'''
return self.table_map.get(table_name)
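To illustrate registration end to end (the tables here are hypothetical SQLAlchemy
objects; only the decorator and `table_map` mechanics come from this module):

```py
import sqlalchemy as sa

metadata = sa.MetaData()
notes  = sa.Table('notes', metadata, sa.Column('name', sa.String, primary_key=True))
blocks = sa.Table(
    'blocks', metadata,
    sa.Column('id', sa.Integer, primary_key=True),
    sa.Column('note_name', sa.String, sa.ForeignKey('notes.name')),
)

class NoteComposer(Composer):
    @register_table()
    def note_blocks(self):
        return notes.join(blocks, notes.c.name == blocks.c.note_name)

composer = NoteComposer()
composer.get_table('note_blocks')   # -> a sa.Join, ready to hand to an Accessor SELECT
```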

87
co3/database.py Normal file
@@ -0,0 +1,87 @@
'''
Database
Central object for defining storage protocol-specific interfaces. The database wraps up
central items for interacting with database resources, namely the Accessor and Manager
objects.
The Database type hierarchy attempts to be exceedingly general; SQL-derivatives should
subclass from the RelationalDatabase subtype, for example, which itself becomes a new
generic via type dependence on Relation.
'''
import logging
from typing import Self
from co3.accessor import Accessor
from co3.composer import Composer
from co3.manager import Manager
from co3.indexer import Indexer
logger = logging.getLogger(__name__)
class Database:
accessor: type[Accessor[Self]] = Accessor
manager: type[Manager[Self]] = Manager
def __init__(self, resource):
'''
Variables:
_local_cache: a database-local property store for ad-hoc CacheBlock-esque
methods, that are nevertheless _not_ query/group-by responses to
pass on to the Indexer. Dependent properties should write to the
this cache and check for existence of stored results; the cache
state must be managed globally.
'''
self.resource = resource
self._access = self.accessor(self)
self._manage = self.manager(self)
self._index = Indexer(self._access)
self._local_cache = {}
self.reset_cache = False
@property
def engine(self):
'''
Database property to provide a singleton engine for DB interaction, initializing
the database if it doesn't already exist.
TODO: figure out thread safety across engines and/or connection. Any issue with
hanging on to the same engine instance for the Database instance?
'''
raise NotImplementedError
def connect(self):
self.engine.connect()
@property
def access(self):
return self._access
@property
def compose(self):
return self._compose
@property
def index(self):
if self.reset_cache:
self._index.cache_clear()
self.reset_cache = False
return self._index
@property
def manage(self):
'''
Accessing `.manage` queues a cache clear on the external index, and wipes the
local cache.
'''
self.reset_cache = True
self._local_cache = {}
return self._manage
def populate_indexes(self): pass
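A minimal sketch of a concrete subclass, assuming an SQLite file resource and SQLAlchemy
for the engine (neither is mandated by the base class, and real deployments would point
`accessor`/`manager` at concrete subclasses):

```py
import sqlalchemy as sa

class SQLiteDatabase(Database):
    accessor = Accessor   # stand-ins; swap for concrete Accessor/Manager subclasses
    manager  = Manager

    def __init__(self, resource):
        super().__init__(resource)
        self._engine = None

    @property
    def engine(self):
        # lazily created singleton engine per Database instance
        if self._engine is None:
            self._engine = sa.create_engine(f'sqlite:///{self.resource}')
        return self._engine

db = SQLiteDatabase('co3.sqlite')
db.connect()
db.access    # Accessor bound to this database
db.manage    # flags the index cache to be cleared on next access
```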

@@ -0,0 +1,3 @@
from co3.databases.sql import *
from co3.databases.fts import FTSDatabase
from co3.databases.vss import VSSDatabase

Some files were not shown because too many files have changed in this diff.