commit 70779fa0ad7d0c921aa5a34f089a2b46142219fa Author: J. Fernando Sánchez Date: Fri Oct 15 17:50:24 2021 +0200 First commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..675225c --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.* +*.pyc +__pycache__ +build +dist \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..e12e234 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. 
We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2016 Jesús Manuel Sánchez Martínez - Grupo de Sistemas Inteligentes (GSI) DIT UPM + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..98cf8eb --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,6 @@ +include requirements.txt +include test-requirements.txt +include README.md +graft tsih +global-exclude __pycache__ +global-exclude *.py[co] diff --git a/README.md b/README.md new file mode 100644 index 0000000..5541a54 --- /dev/null +++ b/README.md @@ -0,0 +1,91 @@ +# TSIH - A dict with a HISTory + +`tsih.Dict` is a type of `UserDict` that allows versioning, backed up by a `sqlite3` database. + +* Transparent operation +* Only changes (deltas) are stored. +* Forward-filling of values. A value is reused in future versions, unless it changes. +* Auto-versioning option (off by default), to produce a new version every time a value change happens. +* Ability to store related entries as separate dictionaries. Each `tsih.Dict` has a `dict_name` that is used in the database to identify the dictionary. +* Tuple-based indexing. Get and set values by `dict_name`, `version` and `key`. + +## Usage and examples + +`tsih.Dict` objects can be used just like regular dictionaries: + +```python +>>> from tsih import Dict +>>> a = Dict() +>>> a['test'] = True +>>> a +{'test': True} +>>> a.get('missing', 5) +5 +>>> a['missing'] +Traceback (most recent call last): + File "", line 1, in +KeyError: 'missing' +``` + +But at any point, new versions can be produced: + +```python +>>> a.version +0 +>>> a['start'] = 'now' +>>> a +{'test': True, 'start': 'now'} +>>> a.version = 1 +>>> a['start'] = 'one version ago' +>>> a +{'test': True, 'start': 'one version ago'} +``` + +Previous values can be accessed using tuple keys, i.e., (version, key): + +```python +>>> a[(0, 'start')] +'now' +>>> a[(1, 'start')] +'one version ago' +``` + +Each version only "records" changes, but later versions (even if they don't exist yet) inherit unchanged values from the previous ones: + +```python +>>> a[(5, 'start')] +'one version ago' +>>> a.version = 5 +>>> # Until the value is changed +>>> a['start'] = '4 versions ago' +>>> a[(5, 'start')] +'4 versions ago' +``` + +You can access *every* state of the Dict using `None` in place of the version and/or the key. +In that case, we will get an iterator, which we can turn into a list explicitly or with the `.value` method. 
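+
+(The iterator is a `Records` object backed by a pandas DataFrame; see `tsih/__init__.py` below.)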
+ +For example, here we get all the changes to the `start` key: + +```python +>>> a[(None, 'start')].value() # +[(0.0, 'now'), (1.0, 'one version ago'), (5.0, '4 versions ago')] +``` + +Similarly, to get the keys and values at a specific version: + +```python +>>> list(a[(0, None)]) +[('start', 'now'), ('test', True)] +``` + +Or, we can combine both to get the keys and values at every version: + +```python +>>> a[(None, None)].value() +[(0.0, 'start', 'now'), (1.0, 'start', 'one version ago'), (5.0, 'start', '4 versions ago'), (0.0, 'test', True), (1.0, 'test', True), (5.0, 'test', True)] +``` + +## Use cases + +Tsih was originally part of the [Soil](https://github.com/gsi-upm/soil) Agent-Based Social Simulation framework, where both the environment and the agents need to keep track of state (i.e., attribute) changes. diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e69de29 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..970eadf --- /dev/null +++ b/setup.cfg @@ -0,0 +1,4 @@ +[aliases] +test=pytest +[tool:pytest] +addopts = --verbose \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..7843823 --- /dev/null +++ b/setup.py @@ -0,0 +1,57 @@ +import os +import re +from setuptools import setup +from pathlib import Path + +this_directory = Path(__file__).parent +long_description = (this_directory / "README.md").read_text() + +version = "" +with open(os.path.join('tsih', '__init__.py')) as f: + version = re.search( + r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', f.read(), re.MULTILINE + ).group(1) + assert version + + +def parse_requirements(filename): + """ load requirements from a pip requirements file """ + with open(filename, 'r') as f: + lineiter = list(line.strip() for line in f) + return [line for line in lineiter if line and not line.startswith("#")] + + +install_reqs = parse_requirements("requirements.txt") +test_reqs = parse_requirements("test-requirements.txt") +extras_require={} +extras_require['all'] = [dep for package in extras_require.values() for dep in package] + + +setup( + name='tsih', + packages=['tsih'], # this must be the same as the name above + version=version, + description=("A lightweight library to store an object's history into a SQL database"), + long_description=long_description, + long_description_content_type='text/markdown', + author='J. 
Fernando Sanchez', + author_email='jf.sanchez@upm.es', + url='https://github.com/balkian/tsih', # use the URL to the github repo + download_url='https://github.com/balkian/tsih/archive/{}.tar.gz'.format( + version), + keywords=['history', 'sql', 'records'], + classifiers=[ + 'Development Status :: 4 - Beta', + 'Environment :: Console', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: MacOS :: MacOS X', + 'Operating System :: Microsoft :: Windows', + 'Operating System :: POSIX', + 'Programming Language :: Python :: 3'], + install_requires=install_reqs, + extras_require=extras_require, + tests_require=test_reqs, + setup_requires=['pytest-runner', ], + include_package_data=True, +) diff --git a/test-requirements.txt b/test-requirements.txt new file mode 100644 index 0000000..e079f8a --- /dev/null +++ b/test-requirements.txt @@ -0,0 +1 @@ +pytest diff --git a/tests/test_history.py b/tests/test_history.py new file mode 100644 index 0000000..2665972 --- /dev/null +++ b/tests/test_history.py @@ -0,0 +1,227 @@ +from unittest import TestCase + +import os +import shutil +from glob import glob + +from tsih import * +from tsih import utils + + +ROOT = os.path.abspath(os.path.dirname(__file__)) +DBROOT = os.path.join(ROOT, 'testdb') + + +class TestHistory(TestCase): + + def setUp(self): + if not os.path.exists(DBROOT): + os.makedirs(DBROOT) + + def tearDown(self): + if os.path.exists(DBROOT): + shutil.rmtree(DBROOT) + + def test_history(self): + """ + """ + tuples = ( + ('a_0', 0, 'id', 'h'), + ('a_0', 1, 'id', 'e'), + ('a_0', 2, 'id', 'l'), + ('a_0', 3, 'id', 'l'), + ('a_0', 4, 'id', 'o'), + ('a_1', 0, 'id', 'v'), + ('a_1', 1, 'id', 'a'), + ('a_1', 2, 'id', 'l'), + ('a_1', 3, 'id', 'u'), + ('a_1', 4, 'id', 'e'), + ('env', 1, 'prob', 1), + ('env', 3, 'prob', 2), + ('env', 5, 'prob', 3), + ('a_2', 7, 'finished', True), + ) + h = History() + h.save_tuples(tuples) + # assert h['env', 0, 'prob'] == 0 + for i in range(1, 7): + assert h['env', i, 'prob'] == ((i-1)//2)+1 + + + for i, k in zip(range(5), 'hello'): + assert h['a_0', i, 'id'] == k + for record, value in zip(h['a_0', None, 'id'], 'hello'): + t_step, val = record + assert val == value + + for i, k in zip(range(5), 'value'): + assert h['a_1', i, 'id'] == k + for i in range(5, 8): + assert h['a_1', i, 'id'] == 'e' + for i in range(7): + assert h['a_2', i, 'finished'] == False + assert h['a_2', 7, 'finished'] + + def test_history_gen(self): + """ + """ + tuples = ( + ('a_1', 0, 'id', 'v'), + ('a_1', 1, 'id', 'a'), + ('a_1', 2, 'id', 'l'), + ('a_1', 3, 'id', 'u'), + ('a_1', 4, 'id', 'e'), + ('env', 1, 'prob', 1), + ('env', 2, 'prob', 2), + ('env', 3, 'prob', 3), + ('a_2', 7, 'finished', True), + ) + h = History() + h.save_tuples(tuples) + for t_step, key, value in h['env', None, None]: + assert t_step == value + assert key == 'prob' + + records = list(h[None, 7, None]) + assert len(records) == 3 + for i in records: + agent_id, key, value = i + if agent_id == 'a_1': + assert key == 'id' + assert value == 'e' + elif agent_id == 'a_2': + assert key == 'finished' + assert value + else: + assert key == 'prob' + assert value == 3 + + records = h['a_1', 7, None] + assert records['id'] == 'e' + + def test_history_file(self): + """ + History should be saved to a file + """ + tuples = ( + ('a_1', 0, 'id', 'v'), + ('a_1', 1, 'id', 'a'), + ('a_1', 2, 'id', 'l'), + ('a_1', 3, 'id', 'u'), + ('a_1', 4, 'id', 'e'), + ('env', 1, 'prob', 1), + ('env', 2, 'prob', 2), + ('env', 3, 'prob', 3), + ('a_2', 7, 
'finished', True), + ) + db_path = os.path.join(DBROOT, 'test') + h = History(db_path=db_path) + h.save_tuples(tuples) + h.flush_cache() + assert os.path.exists(db_path) + + # Recover the data + recovered = History(db_path=db_path) + assert recovered['a_1', 0, 'id'] == 'v' + assert recovered['a_1', 4, 'id'] == 'e' + + # Using backup=True should create a backup copy, and initialize an empty history + newhistory = History(db_path=db_path, backup=True) + backuppaths = glob(db_path + '.backup*.sqlite') + assert len(backuppaths) == 1 + backuppath = backuppaths[0] + assert newhistory.db_path == h.db_path + assert os.path.exists(backuppath) + assert len(newhistory[None, None, None]) == 0 + + def test_interpolation(self): + """ + Values for a key are valid until a new value is introduced at a later version + """ + tuples = ( + ('a_1', 0, 'id', 'a'), + ('a_1', 4, 'id', 'b'), + ) + db_path = os.path.join(DBROOT, 'test') + h = History(db_path=db_path) + h.save_tuples(tuples) + h.flush_cache() + assert os.path.exists(db_path) + + assert h['a_1', 2, 'id'] == 'a' + # Recover the data + recovered = History(db_path=db_path) + assert recovered['a_1', 0, 'id'] == 'a' + assert recovered['a_1', 4, 'id'] == 'b' + assert recovered['a_1', 2, 'id'] == 'a' + + def test_history_tuples(self): + """ + The data recovered should be equal to the one recorded. + """ + tuples = ( + ('a_1', 0, 'id', 'v'), + ('a_1', 1, 'id', 'a'), + ('a_1', 2, 'id', 'l'), + ('a_1', 3, 'id', 'u'), + ('a_1', 4, 'id', 'e'), + ('env', 1, 'prob', 1), + ('env', 2, 'prob', 2), + ('env', 3, 'prob', 3), + ('a_2', 7, 'finished', True), + ) + h = History() + h.save_tuples(tuples) + recovered = list(h.to_tuples()) + assert recovered + for i in recovered: + assert i in tuples + + def test_stats(self): + """ + The data recovered should be equal to the one recorded. 
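+        Stats are flat dicts stored in a separate `stats` table, one column
+        per key (see `History.save_stats` and `History.get_stats`).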
+ """ + tuples = ( + ('a_1', 0, 'id', 'v'), + ('a_1', 1, 'id', 'a'), + ('a_1', 2, 'id', 'l'), + ('a_1', 3, 'id', 'u'), + ('a_1', 4, 'id', 'e'), + ('env', 1, 'prob', 1), + ('env', 2, 'prob', 2), + ('env', 3, 'prob', 3), + ('a_2', 7, 'finished', True), + ) + stat_tuples = [ + {'num_infected': 5, 'runtime': 0.2}, + {'num_infected': 5, 'runtime': 0.2}, + {'new': '40'}, + ] + h = History() + h.save_tuples(tuples) + for stat in stat_tuples: + h.save_stats(stat) + recovered = h.get_stats() + assert recovered + assert recovered[0]['num_infected'] == 5 + assert recovered[1]['runtime'] == 0.2 + assert recovered[2]['new'] == '40' + + def test_unflatten(self): + ex = {'count.neighbors.3': 4, + 'count.times.2': 4, + 'count.total.4': 4, + 'mean.neighbors': 3, + 'mean.times': 2, + 'mean.total': 4, + 't_step': 2, + 'trial_id': 'exporter_sim_trial_1605817956-4475424'} + res = utils.unflatten_dict(ex) + + assert 'count' in res + assert all(x in res['count'] for x in ['times', 'total', 'neighbors']) + assert res['count']['times']['2'] == 4 + assert 'mean' in res + assert all(x in res['mean'] for x in ['times', 'total', 'neighbors']) + assert 't_step' in res + assert 'trial_id' in res diff --git a/tests/test_main.py b/tests/test_main.py new file mode 100644 index 0000000..1d38190 --- /dev/null +++ b/tests/test_main.py @@ -0,0 +1,79 @@ +from unittest import TestCase +import os +import shutil +import pathlib + +from tsih import Dict + + +ROOT = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) +DBROOT = ROOT / 'testdb' + + +class TestTsih(TestCase): + def setUp(self): + if not os.path.exists(DBROOT): + os.makedirs(DBROOT) + + def tearDown(self): + if os.path.exists(DBROOT): + shutil.rmtree(DBROOT) + + def test_basic(self): + '''The data stored in each version should be retrievable''' + d = Dict() + d['text'] = 'hello' + d.version = 1 + d['text'] = 'world' + assert d[(0, 'text')] == 'hello' + assert d[(1, 'text')] == 'world' + + def test_auto_version(self): + '''Changing a value when `auto_version` is on should produce a new version automatically''' + d = Dict(version=0, auto_version=True) + d['text'] = 'hello' + d['text'] = 'world' + assert d[(1, 'text')] == 'hello' + assert d[(2, 'text')] == 'world' + + def test_serialized(self): + ''' + Using the same database should enable retrieving the values of a previous + dictionary. + ''' + d = Dict(name='robot', db_path=DBROOT / 'basic.sqlite') + d['text'] = 'hello' + d.version = 25 + d['text'] = 'world' + assert d[(0, 'text')] == 'hello' + assert d[(24, 'text')] == 'hello' + assert d[(25, 'text')] == 'world' + del d + + recovered = Dict(name='robot', db_path=DBROOT / 'basic.sqlite') + assert recovered[(0, 'text')] == 'hello' + assert recovered[(24, 'text')] == 'hello' + assert recovered[(25, 'text')] == 'world' + + def test_custom(self): + ''' + Inheriting from the Dict class should not change the behavior. 
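+        A subclass that fixes its own db_path should still version and
+        recover values like the base class does.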
+ ''' + + class CustomDict(Dict): + def __init__(self, *args, **kwargs): + super().__init__(*args, db_path=DBROOT / 'custom.sqlite', **kwargs) + + d = CustomDict(name='robot') + d['text'] = 'hello' + d.version = 25 + d['text'] = 'world' + assert d[(0, 'text')] == 'hello' + assert d[(24, 'text')] == 'hello' + assert d[(25, 'text')] == 'world' + del d + + recovered = CustomDict(name='robot') + assert recovered[(0, 'text')] == 'hello' + assert recovered[(24, 'text')] == 'hello' + assert recovered[(26, 'text')] == 'world' diff --git a/tsih/__init__.py b/tsih/__init__.py new file mode 100644 index 0000000..8074b1c --- /dev/null +++ b/tsih/__init__.py @@ -0,0 +1,444 @@ +import time +import os +import pandas as pd +import sqlite3 +import copy +import uuid +import logging +import pathlib +import tempfile + +logger = logging.getLogger(__name__) + +__version__ = '0.1.4' + +from collections import UserDict, namedtuple + +from . import serialization +from .utils import open_or_reuse, unflatten_dict + + + +class Dict(UserDict): + + def __init__(self, name=None, db_name=None, db_path=None, backup=False, readonly=False, version=0, auto_version=False): + super().__init__() + self.dict_name = name or 'anonymous_{}'.format(uuid.uuid1()) + self._history = History(name=db_name, db_path=db_path, backup=backup, readonly=readonly) + self.version = version + self.auto_version = auto_version + + def __delitem__(self, key): + if isinstance(key, tuple): + raise ValueError('Cannot remove past entries') + if self.auto_version: + self.version += 1 + self.data[key] = None + + def __getitem__(self, key): + if isinstance(key, tuple): + if len(key) < 3: + key = tuple([self.dict_name] + list(key)) + self._history.flush_cache() + return self._history[key] + + return self.data[key] + + def __del__(self): + self._history.close() + + def __setcurrent(self, key, value): + if self.auto_version: + self.version += 1 + self.data[key] = value + self._history.save_record(dict_id=self.dict_name, + t_step=float(self.version), + key=key, + value=value) + + def __setitem__(self, key, value): + if not isinstance(key, tuple): + self.__setcurrent(key, value) + else: + if len(key) < 3: + key = tuple([self.dict_name] + list(key)) + k = history.Key(*key) + if k.t_step == version and k.dict_id == self.dict_name: + return self.__setcurrent(key.key, key.value) + self._history.save_record(*k, + value=value) + + +class History: + """ + Store and retrieve values from a sqlite database. + """ + + def __init__(self, name=None, db_path=None, backup=False, readonly=False): + if readonly and (not os.path.exists(db_path)): + raise Exception('The DB file does not exist. 
Cannot open in read-only mode')
+
+        self._db = None
+        self._temp = db_path is None
+        self._stats_columns = None
+        self.readonly = readonly
+
+        if self._temp:
+            if not name:
+                name = time.time()
+            # The file will be deleted as soon as it's closed
+            # Normally, that will be on destruction
+            db_path = tempfile.NamedTemporaryFile(suffix='{}.sqlite'.format(name)).name
+
+        if backup and os.path.exists(db_path):
+            newname = db_path + '.backup{}.sqlite'.format(time.time())
+            os.rename(db_path, newname)
+
+        self.db_path = db_path
+
+        self.db = db_path
+        self._dtypes = {}
+        self._tups = []
+
+        if self.readonly:
+            return
+
+        with self.db:
+            logger.debug('Creating database {}'.format(self.db_path))
+            self.db.execute('''CREATE TABLE IF NOT EXISTS history (dict_id text, t_step real, key text, value text)''')
+            self.db.execute('''CREATE TABLE IF NOT EXISTS value_types (key text, value_type text)''')
+            self.db.execute('''CREATE TABLE IF NOT EXISTS stats (stat_id text)''')
+            self.db.execute('''CREATE UNIQUE INDEX IF NOT EXISTS idx_history ON history (dict_id, t_step, key);''')
+
+    @property
+    def db(self):
+        try:
+            self._db.cursor()
+        except (sqlite3.ProgrammingError, AttributeError):
+            self.db = None  # Reset the database
+        return self._db
+
+    @db.setter
+    def db(self, db_path=None):
+        self._close()
+        db_path = db_path or self.db_path
+        if isinstance(db_path, str) or isinstance(db_path, pathlib.Path):
+            logger.debug('Connecting to database {}'.format(db_path))
+            self._db = sqlite3.connect(db_path)
+            self._db.row_factory = sqlite3.Row
+        else:
+            self._db = db_path
+
+    def __del__(self):
+        self._close()
+
+    def close(self):
+        self._close()
+
+    def _close(self):
+        if self._db is None:
+            return
+        self.flush_cache()
+        self._db.close()
+        self._db = None
+
+    def save_stats(self, stat):
+        if self.readonly:
+            logger.warning('DB in readonly mode')
+            return
+        if not stat:
+            return
+        with self.db:
+            if not self._stats_columns:
+                self._stats_columns = list(c['name'] for c in self.db.execute('PRAGMA table_info(stats)'))
+
+            for column, value in stat.items():
+                if column in self._stats_columns:
+                    continue
+                dtype = 'text'
+                if not isinstance(value, str):
+                    try:
+                        float(value)
+                        dtype = 'real'
+                        # int(str(...)) raises for values like 0.2, so floats
+                        # are not mistakenly given an int column
+                        int(str(value))
+                        dtype = 'int'
+                    except (ValueError, OverflowError):
+                        pass
+                self.db.execute('ALTER TABLE stats ADD "{}" "{}"'.format(column, dtype))
+                self._stats_columns.append(column)
+
+            # Parameterized insert: values are bound instead of being quoted
+            # into the SQL string
+            columns = ", ".join('"{}"'.format(x) for x in stat.keys())
+            placeholders = ", ".join('?' for _ in stat)
+            query = "INSERT INTO stats ({columns}) VALUES ({values})".format(
+                columns=columns,
+                values=placeholders
+            )
+            self.db.execute(query, list(stat.values()))
+
+    def get_stats(self, unflatten=True):
+        rows = self.db.execute("select * from stats").fetchall()
+        res = []
+        for row in rows:
+            d = {}
+            for k in row.keys():
+                if row[k] is None:
+                    continue
+                d[k] = row[k]
+            if unflatten:
+                d = unflatten_dict(d)
+            res.append(d)
+        return res
+
+    @property
+    def dtypes(self):
+        self._read_types()
+        return {k: v[0] for k, v in self._dtypes.items()}
+
+    def save_tuples(self, tuples):
+        '''
+        Save a series of tuples, converting them to records if necessary
+        '''
+        self.save_records(Record(*tup) for tup in tuples)
+
+    def save_records(self, records):
+        '''
+        Save a collection of records
+        '''
+        for record in records:
+            if not isinstance(record, Record):
+                record = Record(*record)
+            self.save_record(*record)
+
+    def save_record(self, dict_id, t_step, key, value):
+        '''
+        Save a single record to the database.
+        Database writes are cached.
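+        Cached writes are flushed automatically whenever the history is
+        read, and when it is closed.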
+        '''
+        if self.readonly:
+            raise Exception('DB in readonly mode')
+        if key not in self._dtypes:
+            self._read_types()
+            if key not in self._dtypes:
+                name = serialization.name(value)
+                serializer = serialization.serializer(name)
+                deserializer = serialization.deserializer(name)
+                self._dtypes[key] = (name, serializer, deserializer)
+                with self.db:
+                    self.db.execute("replace into value_types (key, value_type) values (?, ?)", (key, name))
+        value = self._dtypes[key][1](value)
+
+        self._tups.append(Record(dict_id=dict_id,
+                                 t_step=t_step,
+                                 key=key,
+                                 value=value))
+
+    def flush_cache(self):
+        '''
+        Use a cache to save state changes to avoid opening a session for every change.
+        The cache is flushed whenever the history is accessed, and when it is closed.
+        '''
+        if self.readonly:
+            # Reads also trigger a flush; only complain if there are
+            # pending writes that cannot be persisted
+            if self._tups:
+                raise Exception('DB in readonly mode')
+            return
+        logger.debug('Flushing cache {}'.format(self.db_path))
+        with self.db:
+            self.db.executemany("replace into history(dict_id, t_step, key, value) values (?, ?, ?, ?)", self._tups)
+            self._tups.clear()
+
+    def to_tuples(self):
+        self.flush_cache()
+        with self.db:
+            res = self.db.execute("select dict_id, t_step, key, value from history ").fetchall()
+            for r in res:
+                dict_id, t_step, key, value = r
+                if key not in self._dtypes:
+                    self._read_types()
+                if key not in self._dtypes:
+                    raise ValueError("Unknown datatype for {} and {}".format(key, value))
+                value = self._dtypes[key][2](value)
+                yield dict_id, t_step, key, value
+
+    def _read_types(self):
+        with self.db:
+            res = self.db.execute("select key, value_type from value_types ").fetchall()
+            for k, v in res:
+                serializer = serialization.serializer(v)
+                deserializer = serialization.deserializer(v)
+                self._dtypes[k] = (v, serializer, deserializer)
+
+    def __getitem__(self, key):
+        self.flush_cache()
+        key = Key(*key)
+        dict_ids = [key.dict_id] if key.dict_id is not None else []
+        t_steps = [key.t_step] if key.t_step is not None else []
+        keys = [key.key] if key.key is not None else []
+
+        df = self.read_sql(dict_ids=dict_ids,
+                           t_steps=t_steps,
+                           keys=keys)
+        r = Records(df, filter=key, dtypes=self._dtypes)
+        if r.resolved:
+            return r.value()
+        return r
+
+    def read_sql(self, keys=None, dict_ids=None, not_dict_ids=None, t_steps=None, convert_types=False, limit=-1):
+
+        self._read_types()
+
+        def escape_and_join(v):
+            if v is None:
+                return
+            return ",".join(map(lambda x: "\'{}\'".format(x), v))
+
+        filters = [("key in ({})".format(escape_and_join(keys)), keys),
+                   ("dict_id in ({})".format(escape_and_join(dict_ids)), dict_ids),
+                   ("dict_id not in ({})".format(escape_and_join(not_dict_ids)), not_dict_ids)
+                   ]
+        filters = list(k[0] for k in filters if k[1])
+
+        last_df = None
+        if t_steps:
+            # Convert negative indices into positive
+            if any(x < 0 for x in t_steps):
+                max_t = int(self.db.execute("select max(t_step) from history").fetchone()[0])
+                t_steps = [t if t > 0 else max_t+1+t for t in t_steps]
+
+            # We will be doing ffill interpolation, so we need to look for
+            # the last value before the minimum step in the query
+            min_step = min(t_steps)
+            last_filters = ['t_step < {}'.format(min_step),]
+            last_filters = last_filters + filters
+            condition = ' and '.join(last_filters)
+
+            last_query = '''
+            select h1.*
+            from history h1
+            inner join (
+                select dict_id, key, max(t_step) as t_step
+                from history
+                where {condition}
+                group by dict_id, key
+            ) h2
+            on h1.dict_id = h2.dict_id and
+               h1.key = h2.key and
+               h1.t_step = h2.t_step
+            '''.format(condition=condition)
+            last_df = pd.read_sql_query(last_query, self.db)
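+            # last_df holds, for each (dict_id, key) pair, the newest row strictly
+            # before min(t_steps); it seeds the forward-fill for the requested window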
+            filters.append("t_step >= '{}' and t_step <= '{}'".format(min_step, max(t_steps)))
+
+        condition = ''
+        if filters:
+            condition = 'where {} '.format(' and '.join(filters))
+        query = 'select * from history {} limit {}'.format(condition, limit)
+        df = pd.read_sql_query(query, self.db)
+        if last_df is not None:
+            df = pd.concat([df, last_df])
+
+        df_p = df.pivot_table(values='value', index=['t_step'],
+                              columns=['key', 'dict_id'],
+                              aggfunc='first')
+
+        for k, v in self._dtypes.items():
+            if k in df_p:
+                dtype, _, deserial = v
+                try:
+                    df_p[k] = df_p[k].fillna(method='ffill').astype(dtype)
+                except (TypeError, ValueError):
+                    # Avoid forward-filling unknown/incompatible types
+                    continue
+        if t_steps:
+            df_p = df_p.reindex(t_steps, method='ffill')
+        return df_p.ffill()
+
+    def __getstate__(self):
+        state = dict(**self.__dict__)
+        del state['_db']
+        del state['_dtypes']
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__ = state
+        self._dtypes = {}
+        self._db = None
+
+    def dump(self, f):
+        self._close()
+        for line in open_or_reuse(self.db_path, 'rb'):
+            f.write(line)
+
+
+class Records():
+
+    def __init__(self, df, filter=None, dtypes=None):
+        if not filter:
+            filter = Key(dict_id=None,
+                         t_step=None,
+                         key=None)
+        self._df = df
+        self._filter = filter
+        self.dtypes = dtypes or {}
+        super().__init__()
+
+    def mask(self, tup):
+        res = ()
+        for i, k in zip(tup[:-1], self._filter):
+            if k is None:
+                res = res + (i,)
+        res = res + (tup[-1],)
+        return res
+
+    def filter(self, newKey):
+        f = list(self._filter)
+        for ix, i in enumerate(f):
+            if i is None:
+                f[ix] = newKey
+        self._filter = Key(*f)
+
+    @property
+    def resolved(self):
+        return sum(1 for i in self._filter if i is not None) == 3
+
+    def __iter__(self):
+        for column, series in self._df.iteritems():
+            key, dict_id = column
+            for t_step, value in series.iteritems():
+                r = Record(t_step=t_step,
+                           dict_id=dict_id,
+                           key=key,
+                           value=value)
+                yield self.mask(r)
+
+    def value(self):
+        if self.resolved:
+            f = self._filter
+            try:
+                i = self._df[f.key][str(f.dict_id)]
+                ix = i.index.get_loc(f.t_step, method='ffill')
+                return i.iloc[ix]
+            except KeyError:
+                # No earlier value: fall back to the default for this type
+                return self.dtypes[f.key][2]()
+        return list(self)
+
+    def df(self):
+        return self._df
+
+    def __getitem__(self, k):
+        n = copy.copy(self)
+        n.filter(k)
+        if n.resolved:
+            return n.value()
+        return n
+
+    def __len__(self):
+        return len(self._df)
+
+    def __str__(self):
+        if self.resolved:
+            return str(self.value())
+        return '<Records {}>'.format(self._filter)
+
+
+Key = namedtuple('Key', ['dict_id', 't_step', 'key'])
+Record = namedtuple('Record', 'dict_id t_step key value')
+
+Stat = namedtuple('Stat', 'stat_id text')
diff --git a/tsih/serialization.py b/tsih/serialization.py
new file mode 100644
index 0000000..7dfec3d
--- /dev/null
+++ b/tsih/serialization.py
@@ -0,0 +1,89 @@
+import os
+import logging
+import ast
+import sys
+import importlib
+from itertools import product, chain
+
+
+logger = logging.getLogger('soil')
+
+
+builtins = importlib.import_module('builtins')
+
+def name(value, known_modules=[]):
+    '''Return a name that can be imported, to serialize/deserialize an object'''
+    if value is None:
+        return 'None'
+    if not isinstance(value, type):  # Get the class name first
+        value = type(value)
+    tname = value.__name__
+    if hasattr(builtins, tname):
+        return tname
+    modname = value.__module__
+    if modname == '__main__':
+        return tname
+    if known_modules and modname in known_modules:
+        return tname
+    for kmod in known_modules:
+        if not kmod:
+            continue
+        module = 
importlib.import_module(kmod) + if hasattr(module, tname): + return tname + return '{}.{}'.format(modname, tname) + + +def serializer(type_): + if type_ != 'str' and hasattr(builtins, type_): + return repr + return lambda x: x + + +def serialize(v, known_modules=[]): + '''Get a text representation of an object.''' + tname = name(v, known_modules=known_modules) + func = serializer(tname) + return func(v), tname + +def deserializer(type_, known_modules=[]): + if type(type_) != str: # Already deserialized + return type_ + if type_ == 'str': + return lambda x='': x + if type_ == 'None': + return lambda x=None: None + if hasattr(builtins, type_): # Check if it's a builtin type + cls = getattr(builtins, type_) + return lambda x=None: ast.literal_eval(x) if x is not None else cls() + # Otherwise, see if we can find the module and the class + modules = known_modules or [] + options = [] + + for mod in modules: + if mod: + options.append((mod, type_)) + + if '.' in type_: # Fully qualified module + module, type_ = type_.rsplit(".", 1) + options.append ((module, type_)) + + errors = [] + for modname, tname in options: + try: + module = importlib.import_module(modname) + cls = getattr(module, tname) + return getattr(cls, 'deserialize', cls) + except (ImportError, AttributeError) as ex: + errors.append((modname, tname, ex)) + raise Exception('Could not find type {}. Tried: {}'.format(type_, errors)) + + +def deserialize(type_, value=None, **kwargs): + '''Get an object from a text representation''' + if not isinstance(type_, str): + return type_ + des = deserializer(type_, **kwargs) + if value is None: + return des + return des(value) diff --git a/tsih/utils.py b/tsih/utils.py new file mode 100644 index 0000000..22ee024 --- /dev/null +++ b/tsih/utils.py @@ -0,0 +1,87 @@ +import logging +import time +import os + +from shutil import copyfile + +from contextlib import contextmanager + +logger = logging.getLogger('soil') +# logging.basicConfig() +# logger.setLevel(logging.INFO) + + +@contextmanager +def timer(name='task', pre="", function=logger.info, to_object=None): + start = time.time() + function('{}Starting {} at {}.'.format(pre, name, + time.strftime("%X", time.gmtime(start)))) + yield start + end = time.time() + function('{}Finished {} at {} in {} seconds'.format(pre, name, + time.strftime("%X", time.gmtime(end)), + str(end-start))) + if to_object: + to_object.start = start + to_object.end = end + + +def safe_open(path, mode='r', backup=True, **kwargs): + outdir = os.path.dirname(path) + if outdir and not os.path.exists(outdir): + os.makedirs(outdir) + if backup and 'w' in mode and os.path.exists(path): + creation = os.path.getctime(path) + stamp = time.strftime('%Y-%m-%d_%H.%M.%S', time.localtime(creation)) + + backup_dir = os.path.join(outdir, 'backup') + if not os.path.exists(backup_dir): + os.makedirs(backup_dir) + newpath = os.path.join(backup_dir, '{}@{}'.format(os.path.basename(path), + stamp)) + copyfile(path, newpath) + return open(path, mode=mode, **kwargs) + + +def open_or_reuse(f, *args, **kwargs): + try: + return safe_open(f, *args, **kwargs) + except (AttributeError, TypeError): + return f + +def flatten_dict(d): + if not isinstance(d, dict): + return d + return dict(_flatten_dict(d)) + +def _flatten_dict(d, prefix=''): + if not isinstance(d, dict): + # print('END:', prefix, d) + yield prefix, d + return + if prefix: + prefix = prefix + '.' 
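+    # Recurse into nested values, emitting dotted keys: {'count': {'times': 4}}
+    # becomes {'count.times': 4}. unflatten_dict below reverses the process.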
+ for k, v in d.items(): + # print(k, v) + res = list(_flatten_dict(v, prefix='{}{}'.format(prefix, k))) + # print('RES:', res) + yield from res + + +def unflatten_dict(d): + out = {} + for k, v in d.items(): + target = out + if not isinstance(k, str): + target[k] = v + continue + tokens = k.split('.') + if len(tokens) < 2: + target[k] = v + continue + for token in tokens[:-1]: + if token not in target: + target[token] = {} + target = target[token] + target[tokens[-1]] = v + return out
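
The README lists auto-versioning but only the tests exercise it. A minimal sketch of that behaviour, assuming the `tsih` package from this commit is installed (the `sensor` name and values are illustrative):

```python
from tsih import Dict

# auto_version bumps the version before every write (versions start at 0)
d = Dict(name='sensor', auto_version=True)
d['temp'] = 20.1   # recorded as version 1
d['temp'] = 20.5   # recorded as version 2
assert d[(1, 'temp')] == 20.1
assert d[(2, 'temp')] == 20.5
# Unchanged values are forward-filled into later versions
assert d[(10, 'temp')] == 20.5
```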