From d48cc8fbc8b7a9b462fecad195202fed072cbd78 Mon Sep 17 00:00:00 2001
From: "Sam G." <samgriesemer@gmail.com>
Date: Wed, 15 May 2024 20:36:13 -0700
Subject: [PATCH] initial commit

---
 .gitignore                             |  18 +++
 LICENSE                                |  22 +++
 README.md                              |   3 +
 objectlib/__init__.py                  |   3 +
 objectlib/combinatorics/__init__.py    |   1 +
 objectlib/combinatorics/counting.py    | 143 +++++++++++++++++++
 objectlib/evolution/__init__.py        |   6 +
 objectlib/evolution/candidate.py       | 146 ++++++++++++++++++++
 objectlib/evolution/crossover.py       |  20 +++
 objectlib/evolution/evolutionary.py    |  59 ++++++++
 objectlib/evolution/genetic.py         |  60 ++++++++
 objectlib/evolution/mutation.py        |  41 ++++++
 objectlib/evolution/selection.py       |   6 +
 objectlib/probability/__init__.py      |   2 +
 objectlib/probability/distributions.py | 181 +++++++++++++++++++++++++
 objectlib/probability/sampling.py      |  23 ++++
 objectlib/utils/__init__.py            |   0
 objectlib/utils/dataloader.py          |  11 ++
 objectlib/utils/generator.py           |  60 ++++++++
 objectlib/utils/naming.py              |   2 +
 objectlib/utils/options.py             |  75 ++++++++++
 objectlib/utils/selection.py           |  84 ++++++++++++
 objectlib/utils/timing.py              |   0
 pyproject.toml                         |  49 +++++++
 24 files changed, 1015 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 objectlib/__init__.py
 create mode 100644 objectlib/combinatorics/__init__.py
 create mode 100644 objectlib/combinatorics/counting.py
 create mode 100644 objectlib/evolution/__init__.py
 create mode 100644 objectlib/evolution/candidate.py
 create mode 100644 objectlib/evolution/crossover.py
 create mode 100644 objectlib/evolution/evolutionary.py
 create mode 100644 objectlib/evolution/genetic.py
 create mode 100644 objectlib/evolution/mutation.py
 create mode 100644 objectlib/evolution/selection.py
 create mode 100644 objectlib/probability/__init__.py
 create mode 100644 objectlib/probability/distributions.py
 create mode 100644 objectlib/probability/sampling.py
 create mode 100644 objectlib/utils/__init__.py
 create mode 100644 objectlib/utils/dataloader.py
 create mode 100644 objectlib/utils/generator.py
 create mode 100644 objectlib/utils/naming.py
 create mode 100644 objectlib/utils/options.py
 create mode 100644 objectlib/utils/selection.py
 create mode 100644 objectlib/utils/timing.py
 create mode 100644 pyproject.toml

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2511f64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,18 @@
+# generic py
+__pycache__/
+.pytest_cache/
+*.egg-info/
+.ipynb_checkpoints/
+.pytest_cache/
+.python-version
+
+# vendor and build files
+dist/
+build/
+docs/_autoref/
+docs/_autosummary/
+docs/_build/
+
+# local
+notebooks/
+/Makefile
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..93f98c5
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,22 @@
+MIT License
+
+Copyright (c) 2024 Sam Griesemer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b03cc46
--- /dev/null
+++ b/README.md
@@ -0,0 +1,3 @@
+# objectlib
+A Python library for misc object types (primarily probabilistic and algorithmic) and
+serial streaming utilities
diff --git a/objectlib/__init__.py b/objectlib/__init__.py
new file mode 100644
index 0000000..0cb54b0
--- /dev/null
+++ b/objectlib/__init__.py
@@ -0,0 +1,3 @@
+from . import combinatorics
+from . import evolution
+from . import probability
diff --git a/objectlib/combinatorics/__init__.py b/objectlib/combinatorics/__init__.py
new file mode 100644
index 0000000..0788707
--- /dev/null
+++ b/objectlib/combinatorics/__init__.py
@@ -0,0 +1 @@
+from . import counting
\ No newline at end of file
diff --git a/objectlib/combinatorics/counting.py b/objectlib/combinatorics/counting.py
new file mode 100644
index 0000000..e93fdf5
--- /dev/null
+++ b/objectlib/combinatorics/counting.py
@@ -0,0 +1,143 @@
+from math import factorial
+import itertools
+import random
+
+class Product:
+    '''Cartesian product of iterables'''
+    def __init__(self, *data, repeat=1):
+        '''
+        TODO: address repeat variable in more efficient manner (i.e. dont repeat explicitly in memory)
+        params: any number of sized, indexable iterables (len() and random.choice are applied to each)
+        -> data: tuple of all passed iterables, repeated `repeat` times
+        '''
+        self.data = data*repeat
+
+    def count(self):
+        '''Compute number of elements in product'''
+        count = 1
+        for datum in self.data: count *= len(datum)
+        return count
+
+    def generate(self):
+        '''Return iterator over every element of the Cartesian product'''
+        return itertools.product(*self.data)
+
+    def sample(self, m=1):
+        '''Randomly generate m samples (with replacement) from the product'''
+        for _ in range(m):
+            yield tuple(random.choice(exp) for exp in self.data)
+
+    def sample_without_replacement(self, m=1):
+        '''
+        Randomly generate m unique samples from the product. If m is
+        negative or exceeds the number of possible samples, yield every
+        element once (in random order) and stop.
+        TODO: can implement approaches described in blog analysis.
+        Iterative resampling is cheap early, but expensive later.
+        Can attempt to dynamically implement both strategies for
+        optimal performance.
+        '''
+        total = self.count()
+        # cap the target so the rejection loop below always terminates
+        target = total if m < 0 or m > total else m
+        generated = set()
+        while len(generated) < target:
+            gtuple = next(self.sample(1))
+            if gtuple not in generated:
+                generated.add(gtuple)
+                yield gtuple
+
+class Permutation:
+    '''Permutations of iterables'''
+    def __init__(self, data):
+        self.data = data
+        self.n = len(data)
+
+    @staticmethod
+    def nPk(n, k):
+        '''Compute nPk exactly (integer division, so large n neither overflows nor rounds)'''
+        return factorial(n) // factorial(n-k)
+
+    def count(self, k=None):
+        '''Compute nPk explicitly on object data; None when k exceeds the data length'''
+        if k is None: k = self.n
+        if k > self.n: return None
+        return Permutation.nPk(self.n, k)
+
+    def generate(self, k=None):
+        '''Return generator over all permutations of object data'''
+        if k is None: k = self.n
+        return itertools.permutations(self.data, k)
+
+    def generate_with_repetition(self, k=None):
+        pass
+
+    def sample(self, k=None, m=1):
+        '''Return generator over m random samples from k-permutations of object data'''
+        if k is None: k = self.n
+        if k > self.n: return None
+        for _ in range(m):
+            yield tuple(random.sample(self.data, k))
+
+    def sample_without_replacement(self, m=1):
+        '''Randomly generate m unique full-length permutations of object data'''
+        if m > self.count(): return None
+        generated = []
+        while len(generated) < m:
+            gtuple = next(self.sample(m=1))  # m=1, not k=1: draw one full-length permutation
+            if gtuple not in generated:
+                generated.append(gtuple)
+                yield gtuple
+
+    def duplicates(self, k=None):
+        if k is None: k = self.n
+        if k > self.n: return None
+        return set(self.generate(k))
+
+class Combination:
+    '''Combinations of iterables'''
+    def __init__(self, data):
+        self.data = data
+        self.n = len(data)
+
+    @staticmethod
+    def nCk(n, k):
+        '''Compute nCk exactly (integer division, so large n neither overflows nor rounds)'''
+        return factorial(n) // (factorial(k)*factorial(n-k))
+
+    def count(self, k=None):
+        '''Compute nCk implicitly on object data; None when k exceeds the data length'''
+        if k is None: k = self.n
+        if k > self.n: return None
+        return Combination.nCk(self.n, k)
+
+    def generate(self, k=None):
+        '''Return generator over all combinations of object data'''
+        if k is None: k = self.n
+        return itertools.combinations(self.data, k)
+
+    def generate_with_repetition(self, k=None):
+        pass
+
+    def sample(self, k=None, m=1):
+        '''Return generator over m random samples from k-combinations of object data'''
+        if k is None: k = self.n
+        if k > self.n: return None
+        for _ in range(m):
+            indices = sorted(random.sample(range(self.n), k))
+            yield tuple(self.data[i] for i in indices)
+
+    def sample_without_replacement(self, m=1):
+        '''Randomly generate m unique full-length combinations of object data'''
+        if m > self.count(): return None
+        generated = []
+        while len(generated) < m:
+            gtuple = next(self.sample(m=1))  # m=1, not k=1: size must match count()
+            if gtuple not in generated:
+                generated.append(gtuple)
+                yield gtuple
+
+    def duplicates(self, k=None):
+        if k is None: k = self.n
+        if k > self.n: return None
+        return set(self.generate(k))
diff --git a/objectlib/evolution/__init__.py b/objectlib/evolution/__init__.py
new file mode 100644
index 0000000..d93fd81
--- /dev/null
+++ b/objectlib/evolution/__init__.py
@@ -0,0 +1,6 @@
+from . import candidate
+from . import crossover
+from . import evolutionary
+from . import genetic
+from . import mutation
+from . import selection
\ No newline at end of file
diff --git a/objectlib/evolution/candidate.py b/objectlib/evolution/candidate.py
new file mode 100644
index 0000000..1c4ec01
--- /dev/null
+++ b/objectlib/evolution/candidate.py
@@ -0,0 +1,146 @@
+import string
+
+from ..sim.agent import Agent
+from ..combinatorics import counting
+from ..ml import nn
+
+class Candidate(Agent):
+    '''
+    Base candidate class for evolutionary algorithms
+    NOTE: can always consider switching the constructor to
+    by default take random attributes and yield a stochastic
+    candidate. This might make sense if we are to strictly
+    follow what is most commonly used. However, it's not
+    obvious how to go from specified genotype into a constructor
+    expecting parameters for random generation; this would certainly
+    be more sloppy than the current simple entry point constructor.
+
+    Want to separate candidate from agent. Candidates dont need to be
+    defined in the context of a gym environment. They just hold a genotype
+    and inherit the basic functions seen in base
+
+    UPDATE: candidates ARE agents. They NEED to be defined in the context
+    of a gym environment, as there must be a way of evaluating the candidates
+    in an objective manner. This environment can be completely static, but
+    the point is that it provides context for evaluating fitness. Candidates
+    are to inherit the same methods as any agent, but have the additional
+    `genotype` attribute which holds their internal representation in the
+    context of a genetic algorithm process.
+    '''
+    def __init__(self, genotype):
+        super().__init__()
+        self.genotype = genotype
+
+    def __str__(self):
+        return str(self.epigenesis())  # __str__ must return a str, not a bound method
+
+    @classmethod
+    def random(cls, genotype):
+        '''
+        Alternate constructor for random candidate construction, to be
+        implemented by subclassing type if stochastic construction.
+
+        NOTE: This is currently the official way of creating random objects:
+        define a standard constructor that sets internal variables based on
+        given arguments. Then define class methods which take their own set
+        of parameters and construct the main object by creating values and
+        sending them to the constructor. This is, so far, the cleanest and
+        most extendible approach to constructor overloading I've seen, and
+        has since worked very well.
+        '''
+        pass
+
+    def epigenesis(self):
+        '''Process of turning genotype into phenotype'''
+        return self.genotype
+
+class AlphaString(Candidate):
+    '''Candidate whose genotype is a list of characters'''
+    @classmethod
+    def random(cls, length, alphabet=string.printable):
+        '''
+        Build an AlphaString from `length` characters randomly drawn from `alphabet`
+
+        alphastr = AlphaString.random(length)
+        alphastr = AlphaString.random(length, 'abc')
+
+        :genotype: list (mutable)
+        :phenotype: conversion to string
+        '''
+        slots = [alphabet]*length
+        draw = next(counting.Product(*slots).sample())
+        return cls(list(draw))
+
+    def epigenesis(self):
+        return ''.join(self.genotype)
+
+class BitString(AlphaString):
+    '''AlphaString restricted to the binary alphabet "01"'''
+    @classmethod
+    def random(cls, length):
+        return super().random(length, alphabet='01')
+
+class NeuralNetwork(Candidate):
+    '''
+    Candidate for neuroevolution implementations. Its
+    genotype is the underlying network structure and
+    weights; its phenotype is the behavior observed
+    when inference is run on the network built from
+    that genotype. As with every candidate, evolution
+    operators (crossover, mutation) act on the genotype
+    only.
+
+    :phenotype: output from inference and resulting behavior
+    :genotype: internal network structure and weight values
+    '''
+    def __init__(self, genotype):
+        '''
+        Store the genotype, expected to have the form of the
+        `.weights` attribute of the NeuralNetwork class (i.e. a
+        list of NumPy arrays), and start the `time_alive`
+        counter at zero.
+        '''
+        super().__init__(genotype)
+        self.time_alive = 0
+
+    @classmethod
+    def random(cls, layers, rng=1):
+        '''
+        Build a network with the given layer structure, passing
+        `rng` as its `epsilon`, and wrap the resulting weights
+        in a new candidate.
+
+        :layers: list of network layer size
+        :rng: weights generated from [-rng, +rng]
+        '''
+        seed_net = nn.NeuralNetwork(layers, epsilon=rng)
+        return cls(seed_net.weights)
+
+    def epigenesis(self):
+        '''
+        Turn the genotype into its phenotype: a live network
+        rebuilt from the stored weights via `from_weights`.
+        Observable actions then come from running inference
+        on the returned network with some data point.
+
+        TODO: consider how this is being done; should a nn object
+        be kept in memory at all times and modifications be made
+        directly to its weights so come inference time everything is
+        ready to go? This seems a little bulky but may end up being
+        more efficient. Initializing a network each time from weights
+        though has a tiny overhead; it just sets the nn object's weights
+        and no additional computation is needed.
+        Also how are we going to pass the incoming data to the network
+        for the actual inference procedure? Should the data be set to
+        the network itself, passed to the function, or set under the
+        candidate object?
+        '''
+        phenotype = nn.NeuralNetwork.from_weights(self.genotype)
+        return phenotype
+
+    def update(self):
+        self.time_alive += 1
+
+    def act(self):
+        brain = self.epigenesis()
+        return brain.predict(self.state)[0]
diff --git a/objectlib/evolution/crossover.py b/objectlib/evolution/crossover.py
new file mode 100644
index 0000000..c001af7
--- /dev/null
+++ b/objectlib/evolution/crossover.py
@@ -0,0 +1,20 @@
+import random
+
+def single_point(parent1, parent2):
+    '''
+    Segment crossover: copy parent1's genotype and overwrite a random
+    slice [start, stop) with parent2's genes; cuts range over 0..len
+    '''
+    child = parent1.genotype.copy()
+    begin = random.randint(0, len(child))  # inclusive of len so the last gene can cross
+    end = random.randint(0, len(child))
+    start, stop = min(begin, end), max(begin, end)
+    child[start:stop] = parent2.genotype[start:stop]
+    return child
+
+def multipoint(parent1, parent2):
+    '''Generalizes single point crossover, could make redundant'''
+    pass
+
+def weight_slice(net1, net2):
+    return net1
\ No newline at end of file
diff --git a/objectlib/evolution/evolutionary.py b/objectlib/evolution/evolutionary.py
new file mode 100644
index 0000000..7250a3a
--- /dev/null
+++ b/objectlib/evolution/evolutionary.py
@@ -0,0 +1,59 @@
+import random
+
+class Evolutionary:
+    '''Skeleton shared by evolutionary algorithms; subclasses supply the operators'''
+    def __init__(self, population_size, num_generations, mutation_params, candidate, cand_params, num_offspring=1, gym=None):
+        self.population = []
+        self.action = []
+        self.candidate = candidate
+        self.cand_params = cand_params
+        self.mutation_params = mutation_params
+        self.population_size = population_size
+        self.num_generations = num_generations
+        self.num_offspring = num_offspring
+        self.gym = gym
+
+    def fitness(self, candidate):
+        '''Score a candidate's quality; must be provided by the subclass'''
+        raise NotImplementedError
+
+    def selection(self, population):
+        '''Pick a parent from the population; must be provided by the subclass'''
+        raise NotImplementedError
+
+    def crossover(self, parent1, parent2):
+        '''Combine two parents into offspring; must be provided by the subclass'''
+        raise NotImplementedError
+
+    def mutation(self, candidate):
+        '''Randomly perturb a candidate; must be provided by the subclass'''
+        raise NotImplementedError
+
+    def termination(self, population):
+        '''
+        Early-stop test for the simulation. The default
+        answer is always False, which lets the simulation
+        run through every generation
+        '''
+        return False
+
+    def create_population(self):
+        '''Append population_size random candidates, registering each with the gym if set'''
+        for _ in range(self.population_size):
+            # build a random candidate from the stored constructor kwargs
+            member = self.candidate.random(**self.cand_params)
+            self.population.append(member)
+
+            # gym is optional; only register when one was supplied
+            if self.gym:
+                self.gym.register_agent(member)
+
+    def run(self):
+        '''
+        Entry point for the simulation once setup is complete; left to
+        the subclass. The expected shape is a loop that evaluates, selects,
+        breeds and mutates the population until the termination condition
+        holds, ideally written as a generator that yields details for
+        each generation.
+        '''
+        raise NotImplementedError
diff --git a/objectlib/evolution/genetic.py b/objectlib/evolution/genetic.py
new file mode 100644
index 0000000..a17232d
--- /dev/null
+++ b/objectlib/evolution/genetic.py
@@ -0,0 +1,60 @@
+from . import evolutionary
+
+class GeneticAlgorithm(evolutionary.Evolutionary):
+    '''
+    Standard genetic algorithm (in a way, the genetic algo is itself
+    an agent, taking states, maintaining internal representation,
+    reacting and responding to the environment)
+    '''
+    def run(self):
+        '''
+        Yield per-generation stats until num_generations pass or termination() holds
+        '''
+        # gym is optional (base-class default is None), so every gym call is guarded
+        self.create_population()
+        if self.gym: self.gym.start()
+
+        # begin generation loop
+        for gen in range(self.num_generations):
+            # execute actions and get new gym state
+            if self.gym: self.gym.tick()
+
+            # rank individuals based on current fitness
+            self.population.sort(key=self.fitness, reverse=True)
+
+            # balance population size
+            self.population = self.population[:self.population_size]
+
+            # maintain gym agent registry
+            if self.gym:
+                self.gym.update_agents(self.population)
+                self.gym.refresh_state()
+
+            # yield generation specific details
+            top_candidate = self.population[0]
+            bot_candidate = self.population[-1]
+            yield {'generation'    : gen,
+                   'best_candidate': str(top_candidate.epigenesis()),
+                   'best_fitness'  : self.fitness(top_candidate),
+                   'worst_fitness' : self.fitness(bot_candidate),}
+
+            # check termination condition
+            if self.termination(self.population):
+                return self.population[0]
+
+            # consider multiple offspring per generation
+            for _ in range(self.num_offspring):
+                # stochastically select parent candidates
+                parent1 = self.selection(self.population)
+                parent2 = self.selection(self.population)
+
+                # create child candidate via crossover
+                child_genotype = self.crossover(parent1, parent2)
+                child = self.candidate(child_genotype)
+
+                # perform (possible) mutations on child
+                self.mutation(child, **self.mutation_params)
+
+                # add child to population, gym for next round eval
+                self.population.append(child)
+                if self.gym: self.gym.register_agent(child)
diff --git a/objectlib/evolution/mutation.py b/objectlib/evolution/mutation.py
new file mode 100644
index 0000000..fa0afb3
--- /dev/null
+++ b/objectlib/evolution/mutation.py
@@ -0,0 +1,41 @@
+import random
+
+def mutation_decorator(mutate):
+    def wrapper(candidate, rate, **kwargs):
+        # apply the mutation only when a uniform draw falls below `rate`
+        if random.random() < rate: mutate(candidate, **kwargs)
+    return wrapper
+
+def class_mutation_decorator(mutate):
+    def wrapper(self, candidate, rate, **kwargs):
+        # method flavour: forwards `self` and only mutates when the draw falls below `rate`
+        if random.random() < rate: mutate(self, candidate, **kwargs)
+    return wrapper
+
+@mutation_decorator
+def bitflip(candidate):
+    '''in-place flip of one randomly chosen bit in the genotype bit-array'''
+    gene = candidate.genotype
+    spot = random.randrange(len(gene))
+    gene[spot] = str(1 ^ int(gene[spot]))
+
+@mutation_decorator
+def alterchar(candidate):
+    '''in-place shift of one randomly chosen character to an adjacent code point'''
+    gene = candidate.genotype
+    spot = random.randrange(len(gene))
+    gene[spot] = chr(ord(gene[spot]) + random.choice([-1, 1]))
+
+@mutation_decorator
+def alter_weight(candidate, rng):
+    '''
+    Nudge one randomly chosen entry of one randomly chosen weight
+    array in place by an amount drawn uniformly from [-rng, rng]
+    '''
+    weights = candidate.genotype
+    layer = random.randrange(len(weights))
+    dims = weights[layer].shape
+    i, j = random.randrange(dims[0]), random.randrange(dims[1])
+    weights[layer][i,j] += random.uniform(-rng, rng)
+
+    
\ No newline at end of file
diff --git a/objectlib/evolution/selection.py b/objectlib/evolution/selection.py
new file mode 100644
index 0000000..97fc076
--- /dev/null
+++ b/objectlib/evolution/selection.py
@@ -0,0 +1,6 @@
+import random
+
+def roulette(population):
+    # the product of two uniform draws is skewed toward 0, favouring early entries
+    skewed = random.random()*random.random()
+    return population[int(skewed*len(population))]
diff --git a/objectlib/probability/__init__.py b/objectlib/probability/__init__.py
new file mode 100644
index 0000000..85651f6
--- /dev/null
+++ b/objectlib/probability/__init__.py
@@ -0,0 +1,2 @@
+from . import distributions
+from . import sampling
\ No newline at end of file
diff --git a/objectlib/probability/distributions.py b/objectlib/probability/distributions.py
new file mode 100644
index 0000000..f87b8f5
--- /dev/null
+++ b/objectlib/probability/distributions.py
@@ -0,0 +1,181 @@
+import random
+import math
+
+from ..combinatorics.counting import Combination
+
+class Distribution:
+    '''
+    Common interface for probability distributions. Splitting it into
+    Continuous and Discrete subclasses is worth considering, since a few
+    API details differ between the two (e.g. "pdf" vs "pmf")
+    '''
+    def __init__(self, *params):
+        self.params = params
+
+    def pdf(self, x):
+        '''return pdf(x) = density(x)'''
+        return None
+
+    def cdf(self, x):
+        '''return cdf(x) = Pr(X <= x)'''
+        return None
+
+    def quantile(self, x):
+        '''return cdf^{-1}(x), i.e. the value whose cdf equals probability x'''
+        return None
+
+    def sample(self, n=1):
+        '''
+        n: number of samples; n == -1 means an endless stream, each
+        item taken from a fresh one-sample call to self.sample()
+
+        Dev note: should consider returning something like a "Sample" object, which wraps
+        the samples and provides convenient empirical estimates e.g. MLE of parameters
+        '''
+        while n == -1:
+            yield next(self.sample())
+
+    '''common moments (consider making @property)'''
+    def mean(self):
+        '''distribution mean'''
+        return None
+
+    def variance(self):
+        '''distribution variance'''
+        return None
+
+class DiscreteDistribution(Distribution):
+    '''
+    Discrete distribution base class
+    '''
+    pass
+
+class ContinuousDistribution(Distribution):
+    '''
+    Continuous distribution base class
+    '''
+    pass
+
+class Bernoulli(DiscreteDistribution):
+    def __init__(self, p):
+        self.p = p  # success probability, Pr(X = 1)
+
+    def pdf(self, x):
+        return self.p**x * (1-self.p)**(1-x)  # pmf, meaningful for x in {0, 1}
+
+    def cdf(self, x):
+        # Pr(X <= x): 0 below the support, 1-p on [0, 1), 1 from 1 upward
+        return 0.0 if x < 0 else (1.0 if x >= 1 else 1-self.p)
+
+    def sample(self, n=1):
+        for _ in range(n): yield 1 if random.random() < self.p else 0
+
+    def mean(self):
+        return self.p
+
+    def variance(self):
+        return self.p*(1-self.p)
+
+class Binomial(DiscreteDistribution):
+    '''
+    consider indexing probabilities at different
+    values for later access. precompute could be cheap
+    and save time later, or could be expensive and never
+    really used. have to compare the options
+    '''
+
+    def __init__(self, n, p):
+        self.n = n
+        self.p = p
+        self._cdf = {}
+
+    def pdf(self, x):
+        return Combination.nCk(self.n, x)*self.p**x*(1-self.p)**(self.n-x)
+
+    def cdf(self, x, index=False):
+        '''iteratively (naively) compute P(X <= x), clamping x to the support
+        [0, n]; with index=True each partial sum is cached in self._cdf'''
+        p = 0
+        for i in range(min(math.floor(x), self.n)+1):
+            if i == self.n: p = 1
+            else: p += self.pdf(i)
+            if index:
+                self._cdf[i] = p
+        return p
+
+    def sample(self, n=1):
+        '''
+        naive inverse-cdf sampling over the cached cdf table. meant
+        to be used with relatively small n. consider poisson sampling
+        for sufficiently large n. n=-1 yields samples indefinitely
+        '''
+        yield from super().sample(n)  # handles n == -1 (endless stream)
+        # index entire cdf
+        if self.n not in self._cdf:
+            self.cdf(self.n, index=True)
+
+        for _ in range(n):
+            r = random.random()
+            for x in self._cdf:
+                if self._cdf[x] >= r: break
+            yield x
+
+    def mean(self):
+        return self.n*self.p
+
+    def variance(self):
+        return self.n* self.p*(1-self.p)
+
+class Exponential(ContinuousDistribution): pass
+
+class Normal(ContinuousDistribution): pass
+
+class Poisson(DiscreteDistribution):
+    def __init__(self, lmda):
+        self.lmda = lmda  # rate parameter; equals both mean and variance
+
+    def pdf(self, k):
+        return self.lmda**k*math.e**(-self.lmda)/math.factorial(k)
+
+    def cdf(self, x):
+        return sum(self.pdf(k) for k in range(math.floor(x)+1))  # Pr(X <= x), 0 for x < 0
+
+    def quantile(self, x):
+        pass
+
+    def sample(self, n=1):
+        for _ in range(n):
+            yield None  # TODO: placeholder, no sampler implemented yet
+
+    def mean(self):
+        return self.lmda
+
+    def variance(self):
+        return self.lmda
+
+class Uniform(ContinuousDistribution):
+    def __init__(self, a, b):
+        self.a = a
+        self.b = b
+        self.lower = min(a,b)
+        self.width = abs(a-b)  # must be non-zero: pdf and cdf divide by it
+
+    def pdf(self, x):
+        return 1 / self.width if 0 <= x-self.lower <= self.width else 0  # zero off-support
+
+    def cdf(self, x):
+        return min(1, max(0, (x-self.lower) / self.width))  # clamped to [0, 1]
+
+    def quantile(self, x):
+        return self.lower + x*self.width  # inverse cdf, for x in [0, 1]
+
+    def sample(self, n=1):
+        yield from super().sample(n)
+        for _ in range(n):
+            yield random.random()*self.width+self.lower
+
+    def mean(self):
+        return (self.a+self.b)/2
+
+    def variance(self):
+        return (self.b-self.a)**2/12
diff --git a/objectlib/probability/sampling.py b/objectlib/probability/sampling.py
new file mode 100644
index 0000000..b7fa54f
--- /dev/null
+++ b/objectlib/probability/sampling.py
@@ -0,0 +1,23 @@
+import math
+import random
+
+def sample(population, k=1):
+    '''Yield up to k items drawn without replacement; `population` itself is left untouched'''
+    pool = list(population)  # work on a copy so the caller's sequence is not consumed
+    for _ in range(min(k, len(pool))):
+        yield pool.pop(math.floor(random.random()*len(pool)))
+
+def inverse_transform(inv_cdf):
+    # feed one uniform draw from [0, 1) through the supplied inverse cdf
+    return inv_cdf(random.random())
+
+#def discrete_inverse_transform(cdf):
+#  '''general naive implementation'''
+#  def inv(p):
+#    x = 0
+#    for i in range(x):
+#      p +=  
+    
+def rejection_sampling(): pass
+def importance_sampling(): pass
+
diff --git a/objectlib/utils/__init__.py b/objectlib/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/objectlib/utils/dataloader.py b/objectlib/utils/dataloader.py
new file mode 100644
index 0000000..d59f053
--- /dev/null
+++ b/objectlib/utils/dataloader.py
@@ -0,0 +1,11 @@
+class DataLoader:
+  def __init__(self, path, batch_size):
+    self.path = path
+    self.batch_size = batch_size
+
+    self.load()
+
+  def load(self):
+    '''TODO: return iterator over dataset files (currently a no-op)'''
+    pass
+    
diff --git a/objectlib/utils/generator.py b/objectlib/utils/generator.py
new file mode 100644
index 0000000..90bb289
--- /dev/null
+++ b/objectlib/utils/generator.py
@@ -0,0 +1,60 @@
+def exhaust(gen, func=None, interval=1, verbose=True, last=True):
+    '''
+    Exhaust given generator, applying given function (func) to items
+    along the way (and printing them if verbose=True) at the specified
+    interval. Return the final element, or None if the generator is empty.
+    '''
+    i, item = None, None
+    for i, item in enumerate(gen):
+        if i % interval == 0:
+            if func: func(item)
+            if verbose: print(item)
+
+    # perform same actions for last item (skipped when nothing was produced)
+    if last and i is not None and i % interval != 0:
+        if func: func(item)
+        if verbose: print(item)
+    return item
+
+async def async_exhaust(gen, func=None, interval=1, verbose=True, last=True):
+    '''
+    Exhaust given (synchronous) generator, awaiting the coroutine function
+    (func) on items along the way (and printing them if verbose=True) at the
+    specified interval. Return the final element, or None if gen is empty.
+    '''
+    i, item = None, None
+    for i, item in enumerate(gen):
+        if i % interval == 0:
+            if func: await func(item)
+            if verbose: print(item)
+
+    # perform same actions for last item (skipped when nothing was produced)
+    if last and i is not None and i % interval != 0:
+        if func: await func(item)
+        if verbose: print(item)
+    return item
+
+def chunk(gen, n, last=True):
+    '''
+    Regroup the items of <gen> into lists of <n> consecutive
+    elements, yielded lazily in the original order. The
+    final list may hold fewer than <n> items; pass
+    last=False to drop that partial list instead of
+    yielding it.
+
+    TODO: consider adding a time parameter as well,
+    such that if <n> items have not arrived from the
+    original generator in <t> seconds, return the
+    current chunk. Could protect against long running,
+    async generator processes (and may be useful for
+    time dependent physics sims)
+    '''
+    batch = []
+    for item in gen:
+        batch.append(item)
+        if len(batch) != n:
+            continue
+        yield batch
+        batch = []
+    if batch and last:
+        yield batch
diff --git a/objectlib/utils/naming.py b/objectlib/utils/naming.py
new file mode 100644
index 0000000..b58d337
--- /dev/null
+++ b/objectlib/utils/naming.py
@@ -0,0 +1,2 @@
+def camel_to_snake(s):  # 'CamelCase' -> 'camel_case'; each uppercase char gets its own '_' prefix
+    return ''.join(f'_{c.lower()}' if c.isupper() else c for c in s).lstrip('_')
diff --git a/objectlib/utils/options.py b/objectlib/utils/options.py
new file mode 100644
index 0000000..50ce65c
--- /dev/null
+++ b/objectlib/utils/options.py
@@ -0,0 +1,75 @@
+class Opt(dict):
+    '''
+    Dict of base option values that absorbs a target dict according to per-key patterns.
+
+    TODO: may want to be able to set values using standard dict API, so would have
+    to redirect options set to the value dict
+
+    TODO: could maybe throw specialized errors for patterns like require, but a key error
+    will be thrown either way, which might be good enough
+
+    Default pattern value is 'optional', as it seems to make the least assumptions
+    about the nature of the parameter. When the base has some set values, but no pattern
+    is given, we simply recover the default `update` behaviour of standard dicts. Here we
+    iterate over the target keys, and if the key has no pattern, we set it directly to the
+    base. This takes care of both keys that are in the base but we've left them to be taken
+    care of optionally by default, and any other keys unknown to both the base and to
+    patterns. The subset of the target keys without a pattern is the only group of unprocessed
+    keys at that point in time.
+
+    Note that dicts can use their `update` method on an Opt object, and of course vice versa.
+    There is no point in using an Opt object if no pattern is specified, as all patterns
+    not specified are assumed to be optional, which is exactly what regular dicts do.
+
+    Why is "ignore" needed? If you don't want your defaults changed, why not just leave them
+    out? Well this is a valid point, but in the case you are using the Opt object to set your
+    class attributes, there are some you want to ensure _don't_ get set (which will overwrite
+    your defaults you may have set outside the Opt object).
+
+    Permission options include:
+    - require: require provided dict specifies key, no base value needed
+    - optional: key is optional in provided dict, will be used instead of any base values
+    - merge: merge provided values with base values (dicts are unioned, other values use `+`)
+    - ignore: ignore provided values under this key in preference for base (if defined, doesn't need to be)
+    '''
+    def __init__(self, d):
+        super().__init__(d)  # dict(d) rather than dict(**d), so keys need not be strings
+        self.pattern = {}  # key -> pattern name; empty until set_pattern() is called
+
+    def set_pattern(self, pattern):
+        '''Set the key -> pattern ('require' | 'optional' | 'merge' | 'ignore') dict used by update.'''
+        self.pattern = pattern
+
+    def update(self, target):
+        '''
+        Main purpose of this class. Update base values in place with `target` values according
+        to the patterns set. Raises KeyError if a 'require' key is missing from `target`.
+        '''
+        # execute update pattern
+        for key, pattern in self.pattern.items():
+            if pattern == 'require':
+                self[key] = target[key]
+            elif pattern == 'optional':
+                self[key] = target.get(key, self.get(key, None))
+            elif pattern == 'merge':
+                self[key] = self.merge(self.get(key), target.get(key))
+
+        # Copy over target keys that have no pattern (whether or not base already has them).
+        # Keys with a pattern were handled above; an 'ignore' key is deliberately skipped here,
+        # so if base had no entry for it, it intentionally stays absent.
+        for key, value in target.items():
+            if self.pattern.get(key) is None:
+                self[key] = value
+
+    def merge(self, val1, val2):
+        '''Combine base val1 with provided val2; raises if both are None or their types differ.'''
+        if val1 is None and val2 is None:
+            raise Exception('no values provided to merge')
+        if val1 is None: return val2
+        if val2 is None: return val1
+        if type(val1) != type(val2):
+            raise Exception('mismatching types on merge')
+        if isinstance(val1, dict):  # also covers dict subclasses, which do not support `+`
+            return {**val1, **val2}  # provided values win on key clashes
+        return val1 + val2
+
diff --git a/objectlib/utils/selection.py b/objectlib/utils/selection.py
new file mode 100644
index 0000000..96b8279
--- /dev/null
+++ b/objectlib/utils/selection.py
@@ -0,0 +1,84 @@
+class Collection():
+    """
+    Keyed join between a list of data items and the objects created from them.
+
+    `append` creates one object per datum and records the pair in `state`; `data` binds a
+    new data list and indexes data and state by key, so that `enter` and `exit` can report
+    data without an object yet and objects whose datum is gone.
+    """
+
+    def __init__(self, objs=None, data=None):
+        # None rather than `[]` defaults: a default list is shared by every instance, and
+        # `append` mutates `self.objs` in place
+        self.objs = [] if objs is None else objs
+        self._data = [] if data is None else data
+        self.state = []  # [{'dat': datum, 'obj': object}, ...], filled by append()
+
+        self.state_map = {}  # key -> object, rebuilt by data()
+        self.data_map = {}  # key -> datum, rebuilt by data()
+
+        self.key = lambda d,i: i  # default key: position in the list
+
+    def data(self, data, key=None):
+        '''
+        Bind a new data list (and optionally a new key function `key(datum, index)`), then
+        rebuild both key maps from scratch so keys from earlier calls cannot linger.
+        '''
+        self._data = data
+        if key is not None:
+            self.key = key
+
+        self.data_map = {self.key(d, i): d for i, d in enumerate(self._data)}
+        self.state_map = {self.key(o['dat'], i): o['obj'] for i, o in enumerate(self.state)}
+
+    def enter(self):
+        '''
+        check keys across data to state dicts; those keys in the data dict not in the
+        state dict are in the enter selection (returned as a new Collection of data)
+        '''
+        entering = [v for k, v in self.data_map.items() if k not in self.state_map]
+        return Collection(data=entering)
+
+    def merge(self):
+        '''Placeholder: currently returns this collection unchanged.'''
+        return self
+
+    def update(self):
+        '''
+        Intended to select keys present in both the state dict and the data dict (the update
+        selection). Not implemented yet: currently returns this collection unchanged.
+        '''
+        return self
+
+    def exit(self):
+        '''
+        check keys across data to state dicts; those keys in the state dict not in the
+        data dict are in the exit selection (returned as a new Collection of objects)
+        '''
+        exiting = [v for k, v in self.state_map.items() if k not in self.data_map]
+        return Collection(objs=exiting)
+
+    def append(self, func):
+        '''
+        Call func(datum, index) for each bound datum; add the result to `objs` and record
+        the datum/object pair in `state`.
+        '''
+        for i, d in enumerate(self._data):
+            obj = func(d,i)
+            self.objs.append(obj)
+            self.state.append({'dat': d, 'obj': obj})
+
+#.select('group')
+#.data([])
+#.enter().append()
+#.update()
+#.exit().remove()
+
+#groups = {
+#    'group': Collection([1,2,3,4,5])
+#}
+#
+#col = groups['group']
+#col.data([1,2,3])
+#col.enter()
+#col.exit()
diff --git a/objectlib/utils/timing.py b/objectlib/utils/timing.py
new file mode 100644
index 0000000..e69de29
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..3b03a01
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,49 @@
+[build-system]
+requires = ["setuptools", "wheel", "setuptools-git-versioning>=2.0,<3"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools-git-versioning]
+enabled = true  # presumably what supplies the dynamic "version" declared under [project]
+
+[project]
+name = "objectlib"
+description = "Object serialization and streaming utilities"  # NOTE(review): this commit adds combinatorics/evolution/probability modules; confirm wording
+readme = "README.md"
+requires-python = ">=3.12"
+dynamic = ["version"]  # no static version field; resolved at build time
+#license = {file = "LICENSE"}  # NOTE(review): disabled, although the MIT classifier below is set
+authors = [
+  { name="Sam Griesemer", email="samgriesemer+git@gmail.com" },
+]
+keywords = [""]  # TODO: placeholder (a single empty string); fill in or remove
+classifiers = [
+    "Programming Language :: Python :: 3.12",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Development Status :: 2 - Pre-Alpha",
+    "Intended Audience :: Developers",
+]
+dependencies = [
+    "numpy",
+    "colorama",
+]
+
+[project.optional-dependencies]
+tests = ["pytest"]
+docs = [
+    "sphinx",
+    "sphinx-togglebutton",
+    "sphinx-autodoc-typehints",
+    "furo",
+    "myst-parser",
+]
+
+[project.urls]
+Homepage = "https://doc.olog.io/objectlib"
+Documentation = "https://doc.olog.io/objectlib"
+Repository = "https://git.olog.io/olog/objectlib"
+Issues = "https://git.olog.io/olog/objectlib/issues"
+
+
+[tool.setuptools.packages.find]
+include = ["objectlib*"] # pattern to match package names