From c2e09b2c10afb09c702ac449a73fea315ad92f86 Mon Sep 17 00:00:00 2001 From: "Sam G." Date: Wed, 15 May 2024 19:56:01 -0700 Subject: [PATCH] clean up repo files, README, auxiliary files (pre-BFG) --- .gitignore | 6 ++++- LICENSE | 22 ++++++++++++++++ README.md | 60 ++++++++++++++++++++++---------------------- co3/accessors/vss.py | 11 +++----- docs/Makefile | 20 +++++++++++++++ pyproject.toml | 48 +++++++++++++++++++++++++---------- 6 files changed, 116 insertions(+), 51 deletions(-) create mode 100644 LICENSE create mode 100644 docs/Makefile diff --git a/.gitignore b/.gitignore index d8bede4..2511f64 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ # generic py __pycache__/ .pytest_cache/ -localsys.egg-info/ +*.egg-info/ .ipynb_checkpoints/ .pytest_cache/ .python-version @@ -12,3 +12,7 @@ build/ docs/_autoref/ docs/_autosummary/ docs/_build/ + +# local +notebooks/ +/Makefile diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..93f98c5 --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +MIT License + +Copyright (c) 2024 Sam Griesemer + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/README.md b/README.md index 45c7486..480168f 100644 --- a/README.md +++ b/README.md @@ -1,35 +1,35 @@ # Overview -`co3` is a package for file conversion and associated database operations. The `CO3` base class -provides a standard interface for performing conversions, preparing inserts, and -interacting with database schemas that mirror the class hierarchy. +`co3` is a lightweight Python ORM for hierarchical storage management. It implements a +general type system for defining database components like relations, schemas, engines, +etc. Objects inheriting from the `CO3` base class can then define data transformations +that connect to database components, and can be automatically collected for coordinated +database insertion. -Simplified description of the operational model: +`co3` attempts to provide a general interface for interacting with a storage medium (e.g., +database, pickled objects, VSS framework, in-memory key-value stores, etc.). The following +top-level classes capture the bulk of the operational model: -**Goal**: interact with a storage medium (database, pickled structure, VSS framework) with -a known schema. +- **Database**: reference to a storage medium, with an `Accessor` for accessing data, + `Manager` for managing database state, and an `Engine` for managing connections and + external operations. 
+- **Accessor**: provides access to stored items in a `Database`, typically via a supported + `select` operation over known `Component` types +- **Manager**: manages database storage state (e.g., supported inserts or database sync + operations) +- **Mapper**: associates `CO3` types with `Schema` components, and provides automatic + collection and composition operations for supported items +- **Collector**: collects data from defined `CO3` type transformations and prepares for + `Database` insert operations +- **Component**: atomic storage groups for databases (i.e., generalized notion of a + "relation" in relational algebra). +- **Indexer**: automatic caching of supported access queries to a `Database` +- **Schema**: general schema analog for grouping related `Component` sets +- **Differ**: facilitates set operations on results from selectable resources (e.g., + automatic comparison between file data on disk and file rows in a SQL database) +- **Syncer**: generalized syncing procedure for items between data resources (e.g., + syncing new, modified, and deleted files from disk to a SQL database that stores file + metadata). -- **Accessor** to provide access to stored items -- **Composer** to compose common access points (e.g., JOINed tables) -- **Indexer** to index/cache access queries -- **Manager** to manage storage state (e.g., supported inserts, database syncs) -- **Collector** to collect data for updating storage state -- **Database** to collect data for updating storage state -- **Mapper** to collect data for updating storage state -- **Component** to collect data for updating storage state +The **CO3** abstract base class then makes it easy to integrate this model with regular +Python object hierarchies that can be mapped to a storage schema. -**CO3** is an abstract base class that makes it easy to integrate this model with object -hierarchies that mirror a storage schema. 
- -# Detailed structural breakdown -There are a few pillars of the CO3 model that meaningfully group up functionality: - -- Database: generic to a Component type, provides basic connection to a database at a - specific address/location. The explicit Component type makes it easy to hook into - appropriately typed functional objects: - * Manager: generic to a Component and Database type, provides a supported set of - state-modifying operations to a constituent database - * Accessor: generic to a Component and Database type, provides a supported set of - state inspection operations on a constituent database - * Indexer: -- Mapper: generic to a Component, serves as the fundamental connective component between - types in the data representation hierarchy (CO3 subclasses) and database Components. diff --git a/co3/accessors/vss.py b/co3/accessors/vss.py index 6b49882..e80db7b 100644 --- a/co3/accessors/vss.py +++ b/co3/accessors/vss.py @@ -11,6 +11,8 @@ from co3.accessor import Accessor logger = logging.getLogger(__name__) class VSSAccessor(Accessor): + _model_cls = None + def __init__(self, cache_path): super().__init__() @@ -35,8 +37,7 @@ class VSSAccessor(Accessor): @property def model(self): if self._model is None: - # model trained with 128 token seqs - self._model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2') + self._model = self._model_cls() return self._model @property @@ -60,12 +61,8 @@ class VSSAccessor(Accessor): index_name : str, query : str, limit : int = 10, - score_threshold = 0.5, + score_threshold = 0.5, ): - ''' - Parameters: - index_name: one of ['chunks','blocks','notes'] - ''' if not query: return None diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. 
+SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/pyproject.toml b/pyproject.toml index c5e7a78..712cef8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,27 +1,49 @@ [build-system] -requires = ["setuptools"] +requires = ["setuptools", "wheel", "setuptools-git-versioning>=2.0,<3"] build-backend = "setuptools.build_meta" [project] name = "co3" -version = "0.1.1" -authors = [ - { name="Sam Griesemer", email="samgriesemer@gmail.com" }, -] -description = "Lightweight ORM" +description = "Lightweight Python ORM for hierarchical storage management" readme = "README.md" -requires-python = ">=3.11" +requires-python = ">=3.12" +dynamic = ["version"] +#license = {file = "LICENSE"} +authors = [ + { name="Sam Griesemer", email="samgriesemer+git@gmail.com" }, +] +keywords = ["database", "orm"] classifiers = [ - "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.12", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", ] dependencies = [ - "tqdm" + "tqdm", + "wcmatch", + "numpy", + "sqlalchemy", + "colorama", ] -[tool.setuptools.packages.find] -#where = ["localsys"] # this is too deeply nested, need to remain at root level & use -#include; by default, `where` is `.` -include = ["co3*"] # pattern to match package names +[project.optional-dependencies] +tests = ["pytest"] +docs = [ + "sphinx", + "sphinx-togglebutton", + "sphinx-autodoc-typehints", + "furo", + "myst-parser", +] +[project.urls] +Homepage = 
"https://doc.olog.io/co3" +Documentation = "https://doc.olog.io/co3" +Repository = "https://git.olog.io/olog/co3" +Issues = "https://git.olog.io/olog/co3/issues" + + +[tool.setuptools.packages.find] +include = ["co3*"] # pattern to match package names