Mirror of https://github.com/gsi-upm/soil, synced 2025-09-14 12:12:21 +00:00
Now the environment does not deal with history directly; it delegates that responsibility to a dedicated class. The analysis also uses history instances instead of querying the database directly or creating a proxy environment, which should make it easier to change the implementation in the future. The change was motivated by the large size of the CSV files produced by previous versions: the new implementation only stores deltas, and fills in any missing values on demand.
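For instance, a minimal sketch of loading results through this module (assuming it is imported as part of the soil package; the output folder name soil_output/sim_1 is hypothetical):

    from soil import analysis

    # One (config_file, df, config) tuple per *.db.sqlite trial database
    for config_file, df, config in analysis.read_data('soil_output/sim_1'):
        print(config['name'], df.shape)

    # Or aggregate all trials into a single dataframe with
    # mean/min/max/std columns per key
    grouped = analysis.read_data('soil_output/sim_1', group=True)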
import pandas as pd

import glob
import yaml
from os.path import join

from . import utils, history


def read_data(*args, group=False, **kwargs):
    iterable = _read_data(*args, **kwargs)
    if group:
        return group_trials(iterable)
    else:
        return list(iterable)


def _read_data(pattern, *args, from_csv=False, process_args=None, **kwargs):
    if not process_args:
        process_args = {}
    for folder in glob.glob(pattern):
        config_file = glob.glob(join(folder, '*.yml'))[0]
        # Use safe_load and close the file handle explicitly
        with open(config_file) as f:
            config = yaml.safe_load(f)
        df = None
        if from_csv:
            for trial_data in sorted(glob.glob(join(folder,
                                                    '*.environment.csv'))):
                df = read_csv(trial_data, **kwargs)
                yield config_file, df, config
        else:
            for trial_data in sorted(glob.glob(join(folder, '*.db.sqlite'))):
                df = read_sql(trial_data, **kwargs)
                yield config_file, df, config
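
# Sketch of the per-simulation folder layout that _read_data expects
# (file names are hypothetical; only the extensions matter):
#
#   soil_output/sim_1/
#       sim_1.yml                       # simulation configuration
#       sim_1_trial_0.db.sqlite         # one history database per trial
#       sim_1_trial_0.environment.csv   # only used when from_csv=True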


def read_sql(db, *args, **kwargs):
    h = history.History(db, backup=False)
    df = h.read_sql(*args, **kwargs)
    return df


def read_csv(filename, keys=None, convert_types=False, **kwargs):
    '''
    Read a CSV in canonical form: ::

        <agent_id, t_step, key, value, value_type>

    '''
    df = pd.read_csv(filename)
    if convert_types:
        df = convert_types_slow(df)
    if keys:
        df = df[df['key'].isin(keys)]
    df = process_one(df)
    return df
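
# A hypothetical canonical CSV, as consumed by read_csv:
#
#   agent_id,t_step,key,value,value_type
#   a0,0,state,neutral,str
#   a1,0,state,neutral,str
#   a0,1,state,infected,str
#   env,0,temperature,20,int
#
# read_csv('trial_0.environment.csv', keys=['state']) would keep only the
# 'state' rows and pivot them into one column per agent via process_one.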


def convert_row(row):
    row['value'] = utils.convert(row['value'], row['value_type'])
    return row


def convert_types_slow(df):
    '''Convert each value to the type declared in its value_type column.
    This is a slow operation.'''
    # Convert row by row; a vectorized astype per key would assign to a
    # slice copy and have no effect on the original dataframe.
    df = df.apply(convert_row, axis=1)
    return df


def split_df(df):
    '''
    Split a dataframe in two dataframes: one with the history of the
    environment, and one with the history of the agents, returned in
    that order.
    '''
    envmask = (df['agent_id'] == 'env')
    n_env = envmask.sum()
    if n_env == len(df):
        return df, None
    elif n_env == 0:
        return None, df
    # groupby on the boolean mask yields the False group (agents) first
    agents, env = [x for _, x in df.groupby(envmask)]
    return env, agents


def process(df, **kwargs):
    '''
    Process a dataframe in canonical form ``(t_step, agent_id, key, value, value_type)`` into
    two dataframes with a column per key: one with the history of the environment, and one
    with the history of the agents.
    '''
    env, agents = split_df(df)
    return process_one(env, **kwargs), process_one(agents, **kwargs)


def get_types(df):
    dtypes = df.groupby(by=['key'])['value_type'].unique()
    return {k: v[0] for k, v in dtypes.items()}


def process_one(df, *keys, columns=['key', 'agent_id'], values='value',
                fill=True, index=['t_step',],
                aggfunc='first', **kwargs):
    '''
    Process a dataframe in canonical form ``(t_step, agent_id, key, value, value_type)`` into
    a dataframe with a column per key
    '''
    if df is None:
        return df
    if keys:
        df = df[df['key'].isin(keys)]

    df = df.pivot_table(values=values, index=index, columns=columns,
                        aggfunc=aggfunc, **kwargs)
    if fill:
        df = fillna(df)
    return df
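
# Sketch of the pivot performed by process_one on the hypothetical canonical
# rows above: one column per (key, agent_id) pair, indexed by t_step:
#
#   key          state     state  temperature
#   agent_id        a0        a1          env
#   t_step
#   0          neutral   neutral           20
#   1         infected   neutral           20   <- gap forward-filled by fillna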


def get_count(df, *keys):
    if keys:
        df = df[list(keys)]
    counts = pd.DataFrame()
    for key in df.columns.levels[0]:
        # Count how many agents hold each value of this key at every step
        g = df[key].apply(pd.Series.value_counts, axis=1).fillna(0)
        for value, series in g.items():
            counts[key, value] = series
    counts.columns = pd.MultiIndex.from_tuples(counts.columns)
    return counts


def get_value(df, *keys, aggfunc='sum'):
    if keys:
        df = df[list(keys)]
    return df.groupby(axis=1, level=0).agg(aggfunc)
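
# Example calls on a dataframe returned by process_one (key names are
# hypothetical):
#
#   counts = get_count(df, 'state')   # columns ('state', 'neutral'),
#                                     # ('state', 'infected'), ...: how many
#                                     # agents hold each value at every t_step
#   wealth = get_value(df, 'wealth')  # 'wealth' summed across agents per step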


def plot_all(*args, **kwargs):
    '''
    Read all the trial data and plot the result of applying a function on them.
    '''
    dfs = do_all(*args, **kwargs)
    ps = []
    for line in dfs:
        f, df, config = line
        df.plot(title=config['name'])
        ps.append(df)
    return ps


def do_all(pattern, func, *keys, include_env=False, **kwargs):
    for config_file, df, config in read_data(pattern, keys=keys):
        p = func(df, *keys, **kwargs)
        p.plot(title=config['name'])
        yield config_file, p, config


def group_trials(trials, aggfunc=['mean', 'min', 'max', 'std']):
    trials = list(trials)
    # Accept either bare dataframes or (config_file, df, config) tuples
    trials = list(map(lambda x: x[1] if isinstance(x, tuple) else x, trials))
    return pd.concat(trials).groupby(level=0).agg(aggfunc).reorder_levels([2, 0, 1], axis=1)
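
# With the default aggfunc list, the grouped dataframe ends up with one column
# per (aggfunc, key, agent_id) combination, e.g. ('mean', 'state', 'a0'), where
# each statistic is computed across trials at every t_step.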


def fillna(df):
    # Forward-fill: values persist until a new delta changes them
    new_df = df.ffill(axis=0)
    return new_df