From 70779fa0ad7d0c921aa5a34f089a2b46142219fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=2E=20Fernando=20S=C3=A1nchez?= Date: Fri, 15 Oct 2021 17:50:24 +0200 Subject: [PATCH] First commit --- .gitignore | 5 + LICENSE | 201 +++++++++++++++++++ MANIFEST.in | 6 + README.md | 91 +++++++++ requirements.txt | 0 setup.cfg | 4 + setup.py | 57 ++++++ test-requirements.txt | 1 + tests/test_history.py | 227 +++++++++++++++++++++ tests/test_main.py | 79 ++++++++ tsih/__init__.py | 444 ++++++++++++++++++++++++++++++++++++++++++ tsih/serialization.py | 89 +++++++++ tsih/utils.py | 87 +++++++++ 13 files changed, 1291 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 MANIFEST.in create mode 100644 README.md create mode 100644 requirements.txt create mode 100644 setup.cfg create mode 100644 setup.py create mode 100644 test-requirements.txt create mode 100644 tests/test_history.py create mode 100644 tests/test_main.py create mode 100644 tsih/__init__.py create mode 100644 tsih/serialization.py create mode 100644 tsih/utils.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..675225c --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.* +*.pyc +__pycache__ +build +dist \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..e12e234 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2016 Jesús Manuel Sánchez Martínez - Grupo de Sistemas Inteligentes (GSI) DIT UPM + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..98cf8eb --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,6 @@ +include requirements.txt +include test-requirements.txt +include README.md +graft tsih +global-exclude __pycache__ +global-exclude *.py[co] diff --git a/README.md b/README.md new file mode 100644 index 0000000..5541a54 --- /dev/null +++ b/README.md @@ -0,0 +1,91 @@ +# TSIH - A dict with a HISTory + +`tsih.Dict` is a type of `UserDict` that allows versioning, backed up by a `sqlite3` database. + +* Transparent operation +* Only changes (deltas) are stored. +* Forward-filling of values. A value is reused in future versions, unless it changes. +* Auto-versioning option (off by default), to produce a new version every time a value change happens. +* Ability to store related entries as separate dictionaries. Each `tsih.Dict` has a `dict_name` that is used in the database to identify the dictionary. +* Tuple-based indexing. 
Get and set values by `dict_name`, `version` and `key`.

## Usage and examples

`tsih.Dict` objects can be used just like regular dictionaries:

```python
>>> from tsih import Dict
>>> a = Dict()
>>> a['test'] = True
>>> a
{'test': True}
>>> a.get('missing', 5)
5
>>> a['missing']
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
KeyError: 'missing'
```

But at any point, new versions can be produced:

```python
>>> a.version
0
>>> a['start'] = 'now'
>>> a
{'test': True, 'start': 'now'}
>>> a.version = 1
>>> a['start'] = 'one version ago'
>>> a
{'test': True, 'start': 'one version ago'}
```

Previous values can be accessed using tuple keys, i.e., (version, key):

```python
>>> a[(0, 'start')]
'now'
>>> a[(1, 'start')]
'one version ago'
```

Each version only "records" changes, but later versions (even if they don't exist yet) inherit unchanged values from the previous ones:

```python
>>> a[(5, 'start')]
'one version ago'
>>> a.version = 5
>>> # Until the value is changed
>>> a['start'] = '4 versions ago'
>>> a[(5, 'start')]
'4 versions ago'
```

You can access *every* state of the Dict using `None` in place of the version and/or the key.
In that case, we get an iterable `Records` object, which we can turn into a list explicitly or with the `.value()` method.

For example, here we get all the changes to the `start` key:

```python
>>> a[(None, 'start')].value()
[(0.0, 'now'), (1.0, 'one version ago'), (5.0, '4 versions ago')]
```

Similarly, to get the keys and values at a specific version:

```python
>>> list(a[(0, None)])
[('start', 'now'), ('test', True)]
```

Or, we can combine both to get the keys and values at every version:

```python
>>> a[(None, None)].value()
[(0.0, 'start', 'now'), (1.0, 'start', 'one version ago'), (5.0, 'start', '4 versions ago'), (0.0, 'test', True), (1.0, 'test', True), (5.0, 'test', True)]
```

## Use cases

Tsih was originally part of the [Soil](https://github.com/gsi-upm/soil) Agent-Based Social Simulation framework, where both the environment and the agents need to keep track of state (i.e., attribute) changes.
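
## More examples

Versioning can also happen automatically, and a named dictionary can be persisted to (and recovered from) a sqlite file. The following sketch mirrors the behavior exercised in `tests/test_main.py`; the file name `robot.sqlite` is only illustrative:

```python
>>> d = Dict(auto_version=True)
>>> d['text'] = 'hello'
>>> d.version  # bumped automatically by the assignment
1
>>> d['text'] = 'world'
>>> d[(1, 'text')]
'hello'
>>> d[(2, 'text')]
'world'
```

```python
>>> d = Dict(name='robot', db_path='robot.sqlite')
>>> d['text'] = 'hello'
>>> del d  # closing the Dict flushes pending writes to the database
>>> recovered = Dict(name='robot', db_path='robot.sqlite')
>>> recovered[(0, 'text')]
'hello'
```

The underlying `History` class can also be used directly to store and query `(dict_id, t_step, key, value)` tuples, as done in `tests/test_history.py`:

```python
>>> from tsih import History
>>> h = History()  # backed by a temporary sqlite file
>>> h.save_tuples([('a_1', 0, 'id', 'v'), ('a_1', 1, 'id', 'a')])
>>> h['a_1', 0, 'id']
'v'
```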
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e69de29
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..970eadf
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,4 @@
+[aliases]
+test=pytest
+[tool:pytest]
+addopts = --verbose
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..7843823
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,57 @@
+import os
+import re
+from setuptools import setup
+from pathlib import Path
+
+this_directory = Path(__file__).parent
+long_description = (this_directory / "README.md").read_text()
+
+version = ""
+with open(os.path.join('tsih', '__init__.py')) as f:
+    version = re.search(
+        r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', f.read(), re.MULTILINE
+    ).group(1)
+    assert version
+
+
+def parse_requirements(filename):
+    """ load requirements from a pip requirements file """
+    with open(filename, 'r') as f:
+        lineiter = list(line.strip() for line in f)
+    return [line for line in lineiter if line and not line.startswith("#")]
+
+
+install_reqs = parse_requirements("requirements.txt")
+test_reqs = parse_requirements("test-requirements.txt")
+extras_require = {}
+extras_require['all'] = [dep for package in extras_require.values() for dep in package]
+
+
+setup(
+    name='tsih',
+    packages=['tsih'],  # this must be the same as the name above
+    version=version,
+    description=("A lightweight library to store an object's history into a SQL database"),
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    author='J. Fernando Sanchez',
+    author_email='jf.sanchez@upm.es',
+    url='https://github.com/balkian/tsih',  # use the URL to the github repo
+    download_url='https://github.com/balkian/tsih/archive/{}.tar.gz'.format(
+        version),
+    keywords=['history', 'sql', 'records'],
+    classifiers=[
+        'Development Status :: 4 - Beta',
+        'Environment :: Console',
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: Apache Software License',
+        'Operating System :: MacOS :: MacOS X',
+        'Operating System :: Microsoft :: Windows',
+        'Operating System :: POSIX',
+        'Programming Language :: Python :: 3'],
+    install_requires=install_reqs,
+    extras_require=extras_require,
+    tests_require=test_reqs,
+    setup_requires=['pytest-runner', ],
+    include_package_data=True,
+)
diff --git a/test-requirements.txt b/test-requirements.txt
new file mode 100644
index 0000000..e079f8a
--- /dev/null
+++ b/test-requirements.txt
@@ -0,0 +1 @@
+pytest
diff --git a/tests/test_history.py b/tests/test_history.py
new file mode 100644
index 0000000..2665972
--- /dev/null
+++ b/tests/test_history.py
@@ -0,0 +1,227 @@
+from unittest import TestCase
+
+import os
+import shutil
+from glob import glob
+
+from tsih import *
+from tsih import utils
+
+
+ROOT = os.path.abspath(os.path.dirname(__file__))
+DBROOT = os.path.join(ROOT, 'testdb')
+
+
+class TestHistory(TestCase):
+
+    def setUp(self):
+        if not os.path.exists(DBROOT):
+            os.makedirs(DBROOT)
+
+    def tearDown(self):
+        if os.path.exists(DBROOT):
+            shutil.rmtree(DBROOT)
+
+    def test_history(self):
+        """
+        Stored values should be retrievable, and steps without an explicit value
+        should be forward-filled from the last known one.
+        """
+        tuples = (
+            ('a_0', 0, 'id', 'h'),
+            ('a_0', 1, 'id', 'e'),
+            ('a_0', 2, 'id', 'l'),
+            ('a_0', 3, 'id', 'l'),
+            ('a_0', 4, 'id', 'o'),
+            ('a_1', 0, 'id', 'v'),
+            ('a_1', 1, 'id', 'a'),
+            ('a_1', 2, 'id', 'l'),
+            ('a_1', 3, 'id', 'u'),
+            ('a_1', 4, 'id', 'e'),
+            ('env', 1, 'prob', 1),
+            ('env', 3, 'prob', 2),
+            ('env', 5, 'prob', 3),
+            ('a_2', 7, 'finished', True),
+        )
+        h = History()
+        h.save_tuples(tuples)
+        # assert h['env', 0, 'prob'] == 0
+        for i in range(1, 7):
+            assert h['env', i, 'prob'] == ((i-1)//2)+1
+
+        for i, k in zip(range(5), 'hello'):
+            assert h['a_0', i, 'id'] == k
+        for record, value in zip(h['a_0', None, 'id'], 'hello'):
+            t_step, val = record
+            assert val == value
+
+        for i, k in zip(range(5), 'value'):
+            assert h['a_1', i, 'id'] == k
+        for i in range(5, 8):
+            assert h['a_1', i, 'id'] == 'e'
+        for i in range(7):
+            assert h['a_2', i, 'finished'] == False
+        assert h['a_2', 7, 'finished']
+
+    def test_history_gen(self):
+        """
+        Partial keys (with None components) should return iterators over every
+        matching record.
+        """
+        tuples = (
+            ('a_1', 0, 'id', 'v'),
+            ('a_1', 1, 'id', 'a'),
+            ('a_1', 2, 'id', 'l'),
+            ('a_1', 3, 'id', 'u'),
+            ('a_1', 4, 'id', 'e'),
+            ('env', 1, 'prob', 1),
+            ('env', 2, 'prob', 2),
+            ('env', 3, 'prob', 3),
+            ('a_2', 7, 'finished', True),
+        )
+        h = History()
+        h.save_tuples(tuples)
+        for t_step, key, value in h['env', None, None]:
+            assert t_step == value
+            assert key == 'prob'
+
+        records = list(h[None, 7, None])
+        assert len(records) == 3
+        for i in records:
+            agent_id, key, value = i
+            if agent_id == 'a_1':
+                assert key == 'id'
+                assert value == 'e'
+            elif agent_id == 'a_2':
+                assert key == 'finished'
+                assert value
+            else:
+                assert key == 'prob'
+                assert value == 3
+
+        records = h['a_1', 7, None]
+        assert records['id'] == 'e'
+
+    def test_history_file(self):
+        """
+        History should be saved to a file
+        """
+        tuples = (
+            ('a_1', 0, 'id', 'v'),
+            ('a_1', 1, 'id', 'a'),
+            ('a_1', 2, 'id', 'l'),
+            ('a_1', 3, 'id', 'u'),
+            ('a_1', 4, 'id', 'e'),
+            ('env', 1, 'prob', 1),
+            ('env', 2, 'prob', 2),
+            ('env', 3, 'prob', 3),
+            ('a_2', 7, 'finished', True),
+        )
+        db_path = os.path.join(DBROOT, 'test')
+        h = History(db_path=db_path)
+        h.save_tuples(tuples)
+        h.flush_cache()
+        assert os.path.exists(db_path)
+
+        # Recover the data
+        recovered = History(db_path=db_path)
+        assert recovered['a_1', 0, 'id'] == 'v'
+        assert recovered['a_1', 4, 'id'] == 'e'
+
+        # Using backup=True should create a backup copy, and initialize an empty history
+        newhistory = History(db_path=db_path, backup=True)
+        backuppaths = glob(db_path + '.backup*.sqlite')
+        assert len(backuppaths) == 1
+        backuppath = backuppaths[0]
+        assert newhistory.db_path == h.db_path
+        assert os.path.exists(backuppath)
+        assert len(newhistory[None, None, None]) == 0
+
+    def test_interpolation(self):
+        """
+        Values for a key are valid until a new value is introduced at a later version
+        """
+        tuples = (
+            ('a_1', 0, 'id', 'a'),
+            ('a_1', 4, 'id', 'b'),
+        )
+        db_path = os.path.join(DBROOT, 'test')
+        h = History(db_path=db_path)
+        h.save_tuples(tuples)
+        h.flush_cache()
+        assert os.path.exists(db_path)
+
+        assert h['a_1', 2, 'id'] == 'a'
+        # Recover the data
+        recovered = History(db_path=db_path)
+        assert recovered['a_1', 0, 'id'] == 'a'
+        assert recovered['a_1', 4, 'id'] == 'b'
+        assert recovered['a_1', 2, 'id'] == 'a'
+
+    def test_history_tuples(self):
+        """
+        The data recovered should be equal to the one recorded.
+        """
+        tuples = (
+            ('a_1', 0, 'id', 'v'),
+            ('a_1', 1, 'id', 'a'),
+            ('a_1', 2, 'id', 'l'),
+            ('a_1', 3, 'id', 'u'),
+            ('a_1', 4, 'id', 'e'),
+            ('env', 1, 'prob', 1),
+            ('env', 2, 'prob', 2),
+            ('env', 3, 'prob', 3),
+            ('a_2', 7, 'finished', True),
+        )
+        h = History()
+        h.save_tuples(tuples)
+        recovered = list(h.to_tuples())
+        assert recovered
+        for i in recovered:
+            assert i in tuples
+
+    def test_stats(self):
+        """
+        Stats should be saved and recovered, with column types inferred from the
+        values.
+        """
+        tuples = (
+            ('a_1', 0, 'id', 'v'),
+            ('a_1', 1, 'id', 'a'),
+            ('a_1', 2, 'id', 'l'),
+            ('a_1', 3, 'id', 'u'),
+            ('a_1', 4, 'id', 'e'),
+            ('env', 1, 'prob', 1),
+            ('env', 2, 'prob', 2),
+            ('env', 3, 'prob', 3),
+            ('a_2', 7, 'finished', True),
+        )
+        stat_tuples = [
+            {'num_infected': 5, 'runtime': 0.2},
+            {'num_infected': 5, 'runtime': 0.2},
+            {'new': '40'},
+        ]
+        h = History()
+        h.save_tuples(tuples)
+        for stat in stat_tuples:
+            h.save_stats(stat)
+        recovered = h.get_stats()
+        assert recovered
+        assert recovered[0]['num_infected'] == 5
+        assert recovered[1]['runtime'] == 0.2
+        assert recovered[2]['new'] == '40'
+
+    def test_unflatten(self):
+        ex = {'count.neighbors.3': 4,
+              'count.times.2': 4,
+              'count.total.4': 4,
+              'mean.neighbors': 3,
+              'mean.times': 2,
+              'mean.total': 4,
+              't_step': 2,
+              'trial_id': 'exporter_sim_trial_1605817956-4475424'}
+        res = utils.unflatten_dict(ex)
+
+        assert 'count' in res
+        assert all(x in res['count'] for x in ['times', 'total', 'neighbors'])
+        assert res['count']['times']['2'] == 4
+        assert 'mean' in res
+        assert all(x in res['mean'] for x in ['times', 'total', 'neighbors'])
+        assert 't_step' in res
+        assert 'trial_id' in res
diff --git a/tests/test_main.py b/tests/test_main.py
new file mode 100644
index 0000000..1d38190
--- /dev/null
+++ b/tests/test_main.py
@@ -0,0 +1,79 @@
+from unittest import TestCase
+import os
+import shutil
+import pathlib
+
+from tsih import Dict
+
+
+ROOT = pathlib.Path(os.path.abspath(os.path.dirname(__file__)))
+DBROOT = ROOT / 'testdb'
+
+
+class TestTsih(TestCase):
+    def setUp(self):
+        if not os.path.exists(DBROOT):
+            os.makedirs(DBROOT)
+
+    def tearDown(self):
+        if os.path.exists(DBROOT):
+            shutil.rmtree(DBROOT)
+
+    def test_basic(self):
+        '''The data stored in each version should be retrievable'''
+        d = Dict()
+        d['text'] = 'hello'
+        d.version = 1
+        d['text'] = 'world'
+        assert d[(0, 'text')] == 'hello'
+        assert d[(1, 'text')] == 'world'
+
+    def test_auto_version(self):
+        '''Changing a value when `auto_version` is on should produce a new version automatically'''
+        d = Dict(version=0, auto_version=True)
+        d['text'] = 'hello'
+        d['text'] = 'world'
+        assert d[(1, 'text')] == 'hello'
+        assert d[(2, 'text')] == 'world'
+
+    def test_serialized(self):
+        '''
+        Using the same database should enable retrieving the values of a previous
+        dictionary.
+        '''
+        d = Dict(name='robot', db_path=DBROOT / 'basic.sqlite')
+        d['text'] = 'hello'
+        d.version = 25
+        d['text'] = 'world'
+        assert d[(0, 'text')] == 'hello'
+        assert d[(24, 'text')] == 'hello'
+        assert d[(25, 'text')] == 'world'
+        del d
+
+        recovered = Dict(name='robot', db_path=DBROOT / 'basic.sqlite')
+        assert recovered[(0, 'text')] == 'hello'
+        assert recovered[(24, 'text')] == 'hello'
+        assert recovered[(25, 'text')] == 'world'
+
+    def test_custom(self):
+        '''
+        Inheriting from the Dict class should not change the behavior.
+        '''
+
+        class CustomDict(Dict):
+            def __init__(self, *args, **kwargs):
+                super().__init__(*args, db_path=DBROOT / 'custom.sqlite', **kwargs)
+
+        d = CustomDict(name='robot')
+        d['text'] = 'hello'
+        d.version = 25
+        d['text'] = 'world'
+        assert d[(0, 'text')] == 'hello'
+        assert d[(24, 'text')] == 'hello'
+        assert d[(25, 'text')] == 'world'
+        del d
+
+        recovered = CustomDict(name='robot')
+        assert recovered[(0, 'text')] == 'hello'
+        assert recovered[(24, 'text')] == 'hello'
+        assert recovered[(26, 'text')] == 'world'  # version 26 inherits the value set at 25
diff --git a/tsih/__init__.py b/tsih/__init__.py
new file mode 100644
index 0000000..8074b1c
--- /dev/null
+++ b/tsih/__init__.py
@@ -0,0 +1,444 @@
+import time
+import os
+import pandas as pd
+import sqlite3
+import copy
+import uuid
+import logging
+import pathlib
+import tempfile
+
+logger = logging.getLogger(__name__)
+
+__version__ = '0.1.4'
+
+from collections import UserDict, namedtuple
+
+from . import serialization
+from .utils import open_or_reuse, unflatten_dict
+
+
+class Dict(UserDict):
+
+    def __init__(self, name=None, db_name=None, db_path=None, backup=False, readonly=False, version=0, auto_version=False):
+        super().__init__()
+        self.dict_name = name or 'anonymous_{}'.format(uuid.uuid1())
+        self._history = History(name=db_name, db_path=db_path, backup=backup, readonly=readonly)
+        self.version = version
+        self.auto_version = auto_version
+
+    def __delitem__(self, key):
+        if isinstance(key, tuple):
+            raise ValueError('Cannot remove past entries')
+        if self.auto_version:
+            self.version += 1
+        # Deletions are not recorded in the history; only the current value is cleared
+        self.data[key] = None
+
+    def __getitem__(self, key):
+        if isinstance(key, tuple):
+            if len(key) < 3:
+                key = tuple([self.dict_name] + list(key))
+            self._history.flush_cache()
+            return self._history[key]
+
+        return self.data[key]
+
+    def __del__(self):
+        self._history.close()
+
+    def __setcurrent(self, key, value):
+        if self.auto_version:
+            self.version += 1
+        self.data[key] = value
+        self._history.save_record(dict_id=self.dict_name,
+                                  t_step=float(self.version),
+                                  key=key,
+                                  value=value)
+
+    def __setitem__(self, key, value):
+        if not isinstance(key, tuple):
+            self.__setcurrent(key, value)
+        else:
+            if len(key) < 3:
+                key = tuple([self.dict_name] + list(key))
+            k = Key(*key)
+            if k.t_step == self.version and k.dict_id == self.dict_name:
+                # Writing to the current version also updates the in-memory value
+                return self.__setcurrent(k.key, value)
+            self._history.save_record(*k, value=value)
+
+
+class History:
+    """
+    Store and retrieve values from a sqlite database.
+    """
+
+    def __init__(self, name=None, db_path=None, backup=False, readonly=False):
+        if readonly and (not os.path.exists(db_path)):
+            raise Exception('The DB file does not exist. Cannot open in read-only mode')
+
+        self._db = None
+        self._temp = db_path is None
+        self._stats_columns = None
+        self.readonly = readonly
+
+        if self._temp:
+            if not name:
+                name = time.time()
+            # The file will be deleted as soon as it's closed
+            # Normally, that will be on destruction
+            db_path = tempfile.NamedTemporaryFile(suffix='{}.sqlite'.format(name)).name
+
+        if backup and os.path.exists(db_path):
+            newname = db_path + '.backup{}.sqlite'.format(time.time())
+            os.rename(db_path, newname)
+
+        self.db_path = db_path
+        self.db = db_path
+        self._dtypes = {}
+        self._tups = []
+
+        if self.readonly:
+            return
+
+        with self.db:
+            logger.debug('Creating database {}'.format(self.db_path))
+            self.db.execute('''CREATE TABLE IF NOT EXISTS history (dict_id text, t_step real, key text, value text)''')
+            self.db.execute('''CREATE TABLE IF NOT EXISTS value_types (key text, value_type text)''')
+            self.db.execute('''CREATE TABLE IF NOT EXISTS stats (stat_id text)''')
+            self.db.execute('''CREATE UNIQUE INDEX IF NOT EXISTS idx_history ON history (dict_id, t_step, key);''')
+
+    @property
+    def db(self):
+        try:
+            self._db.cursor()
+        except (sqlite3.ProgrammingError, AttributeError):
+            self.db = None  # Reset the database
+        return self._db
+
+    @db.setter
+    def db(self, db_path=None):
+        self._close()
+        db_path = db_path or self.db_path
+        if isinstance(db_path, str) or isinstance(db_path, pathlib.Path):
+            logger.debug('Connecting to database {}'.format(db_path))
+            self._db = sqlite3.connect(db_path)
+            self._db.row_factory = sqlite3.Row
+        else:
+            self._db = db_path
+
+    def __del__(self):
+        self._close()
+
+    def close(self):
+        self._close()
+
+    def _close(self):
+        if self._db is None:
+            return
+        self.flush_cache()
+        self._db.close()
+        self._db = None
+
+    def save_stats(self, stat):
+        if self.readonly:
+            logger.warning('DB in readonly mode. Not saving stats')
+            return
+        if not stat:
+            return
+        with self.db:
+            if not self._stats_columns:
+                self._stats_columns = list(c['name'] for c in self.db.execute('PRAGMA table_info(stats)'))
+
+            for column, value in stat.items():
+                if column in self._stats_columns:
+                    continue
+                dtype = 'text'
+                if not isinstance(value, str):
+                    try:
+                        float(value)
+                        dtype = 'real'
+                        # Only use an integer column when the value is a whole number
+                        if int(value) == value:
+                            dtype = 'int'
+                    except (ValueError, TypeError, OverflowError):
+                        pass
+                self.db.execute('ALTER TABLE stats ADD "{}" "{}"'.format(column, dtype))
+                self._stats_columns.append(column)
+
+            columns = ", ".join(map(lambda x: '"{}"'.format(x), stat.keys()))
+            values = ", ".join(['"{0}"'.format(col) for col in stat.values()])
+            query = "INSERT INTO stats ({columns}) VALUES ({values})".format(
+                columns=columns,
+                values=values
+            )
+            self.db.execute(query)
+
+    def get_stats(self, unflatten=True):
+        rows = self.db.execute("select * from stats").fetchall()
+        res = []
+        for row in rows:
+            d = {}
+            for k in row.keys():
+                if row[k] is None:
+                    continue
+                d[k] = row[k]
+            if unflatten:
+                d = unflatten_dict(d)
+            res.append(d)
+        return res
+
+    @property
+    def dtypes(self):
+        self._read_types()
+        return {k: v[0] for k, v in self._dtypes.items()}
+
+    def save_tuples(self, tuples):
+        '''
+        Save a series of tuples, converting them to records if necessary
+        '''
+        self.save_records(Record(*tup) for tup in tuples)
+
+    def save_records(self, records):
+        '''
+        Save a collection of records
+        '''
+        for record in records:
+            if not isinstance(record, Record):
+                record = Record(*record)
+            self.save_record(*record)
+
+    def save_record(self, dict_id, t_step, key, value):
+        '''
+        Save a single record to the database.
+        Database writes are cached.
+        '''
+        if self.readonly:
+            raise Exception('DB in readonly mode')
+        if key not in self._dtypes:
+            self._read_types()
+            if key not in self._dtypes:
+                name = serialization.name(value)
+                serializer = serialization.serializer(name)
+                deserializer = serialization.deserializer(name)
+                self._dtypes[key] = (name, serializer, deserializer)
+                with self.db:
+                    self.db.execute("replace into value_types (key, value_type) values (?, ?)", (key, name))
+        value = self._dtypes[key][1](value)
+
+        self._tups.append(Record(dict_id=dict_id,
+                                 t_step=t_step,
+                                 key=key,
+                                 value=value))
+
+    def flush_cache(self):
+        '''
+        Writes are buffered in a cache to avoid opening a new session for every
+        change. The cache is flushed when the history is closed and whenever it
+        is accessed.
+        '''
+        if self.readonly:
+            if self._tups:
+                raise Exception('DB in readonly mode')
+            return
+        logger.debug('Flushing cache {}'.format(self.db_path))
+        with self.db:
+            self.db.executemany("replace into history(dict_id, t_step, key, value) values (?, ?, ?, ?)", self._tups)
+        self._tups.clear()
+
+    def to_tuples(self):
+        self.flush_cache()
+        with self.db:
+            res = self.db.execute("select dict_id, t_step, key, value from history").fetchall()
+        for r in res:
+            dict_id, t_step, key, value = r
+            if key not in self._dtypes:
+                self._read_types()
+                if key not in self._dtypes:
+                    raise ValueError("Unknown datatype for {} and {}".format(key, value))
+            value = self._dtypes[key][2](value)
+            yield dict_id, t_step, key, value
+
+    def _read_types(self):
+        with self.db:
+            res = self.db.execute("select key, value_type from value_types").fetchall()
+        for k, v in res:
+            serializer = serialization.serializer(v)
+            deserializer = serialization.deserializer(v)
+            self._dtypes[k] = (v, serializer, deserializer)
+
+    def __getitem__(self, key):
+        self.flush_cache()
+        key = Key(*key)
+        dict_ids = [key.dict_id] if key.dict_id is not None else []
+        t_steps = [key.t_step] if key.t_step is not None else []
+        keys = [key.key] if key.key is not None else []
+
+        df = self.read_sql(dict_ids=dict_ids,
+                           t_steps=t_steps,
+                           keys=keys)
+        r = Records(df, filter=key, dtypes=self._dtypes)
+        if r.resolved:
+            return r.value()
+        return r
+
+    def read_sql(self, keys=None, dict_ids=None, not_dict_ids=None, t_steps=None, convert_types=False, limit=-1):
+
+        self._read_types()
+
+        def escape_and_join(v):
+            if v is None:
+                return
+            return ",".join(map(lambda x: "'{}'".format(x), v))
+
+        filters = [("key in ({})".format(escape_and_join(keys)), keys),
+                   ("dict_id in ({})".format(escape_and_join(dict_ids)), dict_ids),
+                   ("dict_id not in ({})".format(escape_and_join(not_dict_ids)), not_dict_ids)
+                   ]
+        filters = list(k[0] for k in filters if k[1])
+
+        last_df = None
+        if t_steps:
+            # Convert negative indices into positive
+            if any(x < 0 for x in t_steps):
+                max_t = int(self.db.execute("select max(t_step) from history").fetchone()[0])
+                t_steps = [t if t > 0 else max_t + 1 + t for t in t_steps]
+
+            # We will be doing ffill interpolation, so we need to look for
+            # the last value before the minimum step in the query
+            min_step = min(t_steps)
+            last_filters = ['t_step < {}'.format(min_step), ]
+            last_filters = last_filters + filters
+            condition = ' and '.join(last_filters)
+
+            last_query = '''
+            select h1.*
+            from history h1
+            inner join (
+                select dict_id, key, max(t_step) as t_step
+                from history
+                where {condition}
+                group by dict_id, key
+            ) h2
+            on h1.dict_id = h2.dict_id and
+               h1.key = h2.key and
+               h1.t_step = h2.t_step
+            '''.format(condition=condition)
+            last_df = pd.read_sql_query(last_query, self.db)
+
+            filters.append("t_step >= '{}' and t_step <= '{}'".format(min_step, max(t_steps)))
+
+        condition = ''
+        if filters:
+            condition = 'where {} '.format(' and '.join(filters))
+        query = 'select * from history {} limit {}'.format(condition, limit)
+        df = pd.read_sql_query(query, self.db)
+        if last_df is not None:
+            df = pd.concat([df, last_df])
+
+        df_p = df.pivot_table(values='value', index=['t_step'],
+                              columns=['key', 'dict_id'],
+                              aggfunc='first')
+
+        for k, v in self._dtypes.items():
+            if k in df_p:
+                dtype, _, deserial = v
+                try:
+                    df_p[k] = df_p[k].fillna(method='ffill').astype(dtype)
+                except (TypeError, ValueError):
+                    # Avoid forward-filling unknown/incompatible types
+                    continue
+        if t_steps:
+            df_p = df_p.reindex(t_steps, method='ffill')
+        return df_p.ffill()
+
+    def __getstate__(self):
+        state = dict(**self.__dict__)
+        del state['_db']
+        del state['_dtypes']
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__ = state
+        self._dtypes = {}
+        self._db = None
+
+    def dump(self, f):
+        self._close()
+        for line in open_or_reuse(self.db_path, 'rb'):
+            f.write(line)
+
+
+class Records():
+
+    def __init__(self, df, filter=None, dtypes=None):
+        if not filter:
+            filter = Key(dict_id=None,
+                         t_step=None,
+                         key=None)
+        self._df = df
+        self._filter = filter
+        self.dtypes = dtypes or {}
+        super().__init__()
+
+    def mask(self, tup):
+        res = ()
+        for i, k in zip(tup[:-1], self._filter):
+            if k is None:
+                res = res + (i,)
+        res = res + (tup[-1],)
+        return res
+
+    def filter(self, newKey):
+        f = list(self._filter)
+        for ix, i in enumerate(f):
+            if i is None:
+                f[ix] = newKey
+        self._filter = Key(*f)
+
+    @property
+    def resolved(self):
+        return sum(1 for i in self._filter if i is not None) == 3
+
+    def __iter__(self):
+        for column, series in self._df.iteritems():
+            key, dict_id = column
+            for t_step, value in series.iteritems():
+                r = Record(t_step=t_step,
+                           dict_id=dict_id,
+                           key=key,
+                           value=value)
+                yield self.mask(r)
+
+    def value(self):
+        if self.resolved:
+            f = self._filter
+            try:
+                i = self._df[f.key][str(f.dict_id)]
+                ix = i.index.get_loc(f.t_step, method='ffill')
+                return i.iloc[ix]
+            except KeyError as ex:
+                # Fall back to the default value for this type
+                return self.dtypes[f.key][2]()
+        return list(self)
+
+    def df(self):
+        return self._df
+
+    def __getitem__(self, k):
+        n = copy.copy(self)
+        n.filter(k)
+        if n.resolved:
+            return n.value()
+        return n
+
+    def __len__(self):
+        return len(self._df)
+
+    def __str__(self):
+        if self.resolved:
+            return str(self.value())
+        return '<Records for {}>'.format(self._filter)
+
+
+Key = namedtuple('Key', ['dict_id', 't_step', 'key'])
+Record = namedtuple('Record', 'dict_id t_step key value')
+
+Stat = namedtuple('Stat', 'stat_id text')
diff --git a/tsih/serialization.py b/tsih/serialization.py
new file mode 100644
index 0000000..7dfec3d
--- /dev/null
+++ b/tsih/serialization.py
@@ -0,0 +1,89 @@
+import os
+import logging
+import ast
+import sys
+import importlib
+from itertools import product, chain
+
+
+logger = logging.getLogger(__name__)
+
+
+builtins = importlib.import_module('builtins')
+
+
+def name(value, known_modules=[]):
+    '''Return a name that can be imported, to serialize/deserialize an object'''
+    if value is None:
+        return 'None'
+    if not isinstance(value, type):  # Get the class name first
+        value = type(value)
+    tname = value.__name__
+    if hasattr(builtins, tname):
+        return tname
+    modname = value.__module__
+    if modname == '__main__':
+        return tname
+    if known_modules and modname in known_modules:
+        return tname
+    for kmod in known_modules:
+        if not kmod:
+            continue
+        module = importlib.import_module(kmod)
+        if hasattr(module, tname):
+            return tname
+    return '{}.{}'.format(modname, tname)
+
+
+def serializer(type_):
+    if type_ != 'str' and hasattr(builtins, type_):
+        return repr
+    return lambda x: x
+
+
+def serialize(v, known_modules=[]):
+    '''Get a text representation of an object.'''
+    tname = name(v, known_modules=known_modules)
+    func = serializer(tname)
+    return func(v), tname
+
+
+def deserializer(type_, known_modules=[]):
+    if type(type_) != str:  # Already deserialized
+        return type_
+    if type_ == 'str':
+        return lambda x='': x
+    if type_ == 'None':
+        return lambda x=None: None
+    if hasattr(builtins, type_):  # Check if it's a builtin type
+        cls = getattr(builtins, type_)
+        return lambda x=None: ast.literal_eval(x) if x is not None else cls()
+    # Otherwise, see if we can find the module and the class
+    modules = known_modules or []
+    options = []
+
+    for mod in modules:
+        if mod:
+            options.append((mod, type_))
+
+    if '.' in type_:  # Fully qualified module
+        module, type_ = type_.rsplit(".", 1)
+        options.append((module, type_))
+
+    errors = []
+    for modname, tname in options:
+        try:
+            module = importlib.import_module(modname)
+            cls = getattr(module, tname)
+            return getattr(cls, 'deserialize', cls)
+        except (ImportError, AttributeError) as ex:
+            errors.append((modname, tname, ex))
+    raise Exception('Could not find type {}. Tried: {}'.format(type_, errors))
+
+
+def deserialize(type_, value=None, **kwargs):
+    '''Get an object from a text representation'''
+    if not isinstance(type_, str):
+        return type_
+    des = deserializer(type_, **kwargs)
+    if value is None:
+        return des
+    return des(value)
diff --git a/tsih/utils.py b/tsih/utils.py
new file mode 100644
index 0000000..22ee024
--- /dev/null
+++ b/tsih/utils.py
@@ -0,0 +1,87 @@
+import logging
+import time
+import os
+
+from shutil import copyfile
+
+from contextlib import contextmanager
+
+logger = logging.getLogger(__name__)
+
+
+@contextmanager
+def timer(name='task', pre="", function=logger.info, to_object=None):
+    start = time.time()
+    function('{}Starting {} at {}.'.format(pre, name,
+                                           time.strftime("%X", time.gmtime(start))))
+    yield start
+    end = time.time()
+    function('{}Finished {} at {} in {} seconds'.format(pre, name,
+                                                        time.strftime("%X", time.gmtime(end)),
+                                                        str(end-start)))
+    if to_object:
+        to_object.start = start
+        to_object.end = end
+
+
+def safe_open(path, mode='r', backup=True, **kwargs):
+    outdir = os.path.dirname(path)
+    if outdir and not os.path.exists(outdir):
+        os.makedirs(outdir)
+    if backup and 'w' in mode and os.path.exists(path):
+        creation = os.path.getctime(path)
+        stamp = time.strftime('%Y-%m-%d_%H.%M.%S', time.localtime(creation))
+
+        backup_dir = os.path.join(outdir, 'backup')
+        if not os.path.exists(backup_dir):
+            os.makedirs(backup_dir)
+        newpath = os.path.join(backup_dir, '{}@{}'.format(os.path.basename(path),
+                                                          stamp))
+        copyfile(path, newpath)
+    return open(path, mode=mode, **kwargs)
+
+
+def open_or_reuse(f, *args, **kwargs):
+    try:
+        return safe_open(f, *args, **kwargs)
+    except (AttributeError, TypeError):
+        return f
+
+
+def flatten_dict(d):
+    if not isinstance(d, dict):
+        return d
+    return dict(_flatten_dict(d))
+
+
+def _flatten_dict(d, prefix=''):
+    if not isinstance(d, dict):
+        # Leaf value: emit the accumulated dotted key
+        yield prefix, d
+        return
+    if prefix:
+        prefix = prefix + '.'
+    for k, v in d.items():
+        yield from _flatten_dict(v, prefix='{}{}'.format(prefix, k))
+
+
+def unflatten_dict(d):
+    out = {}
+    for k, v in d.items():
+        target = out
+        if not isinstance(k, str):
+            target[k] = v
+            continue
+        tokens = k.split('.')
+        if len(tokens) < 2:
+            target[k] = v
+            continue
+        for token in tokens[:-1]:
+            if token not in target:
+                target[token] = {}
+            target = target[token]
+        target[tokens[-1]] = v
+    return out
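+
+
+# Usage sketch (illustrative only): `flatten_dict` turns nested dictionaries
+# into dot-separated keys, and `unflatten_dict` reverses it for string keys:
+#
+#   >>> flatten_dict({'count': {'times': 4}, 't_step': 2})
+#   {'count.times': 4, 't_step': 2}
+#   >>> unflatten_dict({'count.times': 4, 't_step': 2})
+#   {'count': {'times': 4}, 't_step': 2}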