import pandas as pd import glob import yaml from os.path import join from . import utils def read_data(*args, group=False, **kwargs): iterable = _read_data(*args, **kwargs) if group: return group_trials(iterable) else: return list(iterable) def _read_data(pattern, keys=None, convert_types=False, process=None, from_csv=False, **kwargs): for folder in glob.glob(pattern): config_file = glob.glob(join(folder, '*.yml'))[0] config = yaml.load(open(config_file)) df = None if from_csv: for trial_data in sorted(glob.glob(join(folder, '*.environment.csv'))): df = read_csv(trial_data, convert_types=convert_types) if process: df = process(df, **kwargs) yield config_file, df, config else: for trial_data in sorted(glob.glob(join(folder, '*.db.sqlite'))): df = read_sql(trial_data, convert_types=convert_types, keys=keys) if process: df = process(df, **kwargs) yield config_file, df, config def read_csv(filename, keys=None, convert_types=False, **kwargs): ''' Read a CSV in canonical form: :: ''' df = pd.read_csv(filename) if convert_types: df = convert_types_slow(df) if keys: df = df[df['key'].isin(keys)] return df def read_sql(filename, keys=None, convert_types=False, limit=-1): condition = '' if keys: k = map(lambda x: "\'{}\'".format(x), keys) condition = 'where key in ({})'.format(','.join(k)) query = 'select * from history {} limit {}'.format(condition, limit) df = pd.read_sql_query(query, 'sqlite:///{}'.format(filename)) if convert_types: df = convert_types_slow(df) return df def convert_row(row): row['value'] = utils.convert(row['value'], row['value_type']) return row def convert_types_slow(df): '''This is a slow operation.''' dtypes = get_types(df) for k, v in dtypes.items(): t = df[df['key']==k] t['value'] = t['value'].astype(v) df = df.apply(convert_row, axis=1) return df def split_df(df): ''' Split a dataframe in two dataframes: one with the history of agents, and one with the environment history ''' envmask = (df['agent_id'] == 'env') n_env = envmask.sum() if n_env == len(df): return df, None elif n_env == 0: return None, df agents, env = [x for _, x in df.groupby(envmask)] return env, agents def process(df, **kwargs): ''' Process a dataframe in canonical form ``(t_step, agent_id, key, value, value_type)`` into two dataframes with a column per key: one with the history of the agents, and one for the history of the environment. ''' env, agents = split_df(df) return process_one(env, **kwargs), process_one(agents, **kwargs) def get_types(df): dtypes = df.groupby(by=['key'])['value_type'].unique() return {k:v[0] for k,v in dtypes.iteritems()} def process_one(df, *keys, columns=['key'], values='value', index=['t_step', 'agent_id'], aggfunc='first', **kwargs): ''' Process a dataframe in canonical form ``(t_step, agent_id, key, value, value_type)`` into a dataframe with a column per key ''' if df is None: return df if keys: df = df[df['key'].isin(keys)] dtypes = get_types(df) df = df.pivot_table(values=values, index=index, columns=columns, aggfunc=aggfunc, **kwargs) df = df.fillna(0).astype(dtypes) return df def get_count_processed(df, *keys): if keys: df = df[list(keys)] # p = df.groupby(level=0).apply(pd.Series.value_counts) p = df.unstack().apply(pd.Series.value_counts, axis=1) return p def get_count(df, *keys): if keys: df = df[df['key'].isin(keys)] p = df.groupby(by=['t_step', 'key', 'value']).size().unstack(level=[1,2]).fillna(0) return p def get_value(df, *keys, aggfunc='sum'): if keys: df = df[df['key'].isin(keys)] p = process_one(df, *keys) p = p.groupby(level='t_step').agg(aggfunc) return p def plot_all(*args, **kwargs): ''' Read all the trial data and plot the result of applying a function on them. ''' dfs = do_all(*args, **kwargs) ps = [] for line in dfs: f, df, config = line df.plot(title=config['name']) ps.append(df) return ps def do_all(pattern, func, *keys, include_env=False, **kwargs): for config_file, df, config in read_data(pattern, keys=keys): p = func(df, *keys, **kwargs) p.plot(title=config['name']) yield config_file, p, config def group_trials(trials, aggfunc=['mean', 'min', 'max', 'std']): trials = list(trials) trials = list(map(lambda x: x[1] if isinstance(x, tuple) else x, trials)) return pd.concat(trials).groupby(level=0).agg(aggfunc).reorder_levels([2, 0,1] ,axis=1)