Source code for rubin_sim.maf.run_comparison.gather_summaries

__all__ = ("combine_result_dbs", "gather_summaries")

import argparse
import glob
import logging
import os
import sqlite3

import pandas as pd

logger = logging.getLogger(__name__)


def combine_result_dbs(run_dirs, dbfilename="resultsDb_sqlite.db"):
    """Helper function for gather_summaries.

    Parameters
    ----------
    run_dirs : `list` [`str`]
        A list of directories to search for MAF result databases.
    dbfilename : `str`
        The database filename to look for (default: resultsDb_sqlite.db).

    Returns
    -------
    result_df : `pandas.DataFrame`
        One row per unique run name, one column per summary statistic.
    """
    # Query to grab all the summary stats
    sql_q = "SELECT summarystats.summary_value, "
    sql_q += "metrics.metric_name, metrics.metric_info_label, "
    sql_q += "metrics.slicer_name, summarystats.summary_name, metrics.run_name "
    sql_q += "FROM summarystats INNER JOIN metrics ON metrics.metric_id=summarystats.metric_id"

    all_summaries = []
    for rdir in run_dirs:
        fname = os.path.join(rdir, dbfilename)
        if not os.path.isfile(fname):
            logger.warning(f"No resultsDb database in {rdir}; skipping.")
            continue
        con = sqlite3.connect(fname)
        temp_df = pd.read_sql(sql_q, con)
        con.close()

        # Build a unique, readable column name for each summary statistic
        def make_summary_name(x):
            summary_name = " ".join(
                [
                    x.summary_name.strip(),
                    x.metric_name.strip(),
                    x.metric_info_label.strip(),
                    x.slicer_name.strip(),
                ]
            )
            # Collapse the double space left when a component is empty
            summary_name = summary_name.replace("  ", " ")
            return summary_name

        temp_df["summary_names"] = temp_df.apply(make_summary_name, axis=1)
        all_summaries.append(temp_df[["summary_names", "summary_value", "run_name"]])

    # Make one big dataframe
    all_summaries = pd.concat(all_summaries)
    # Group by run name and summary name, keeping the last value of any duplicates
    g = all_summaries.groupby(["run_name", "summary_names"]).agg({"summary_value": "last"})
    # Convert to one row with all summary stats per run
    result_df = g.reset_index("summary_names").pivot(columns="summary_names")
    # That ended up as a MultiIndex which we didn't need, so fix and rename
    result_df.columns = result_df.columns.droplevel(0).rename("metric")
    return result_df
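

# Usage sketch (illustrative, not part of the module): assuming two
# hypothetical MAF output directories "run_a/" and "run_b/", each containing
# a resultsDb_sqlite.db, the combined table can be built and inspected with:
#
#     from rubin_sim.maf.run_comparison import combine_result_dbs
#
#     summary_df = combine_result_dbs(["run_a", "run_b"])
#     print(summary_df.head())  # one row per run_name, one column per stat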


def gather_summaries():
    """Find resultsDbs in a series of directories and gather up their
    summary stats into a single CSV or hdf5 file.
    Outputs one row per unique run name.
    """
    parser = argparse.ArgumentParser(
        description="Find resultsDbs in a series of directories and "
        "gather up their summary stats into a single CSV or hdf5 file. "
        "Intended to run on a set of metrics run on multiple "
        "simulations, so that each results_db has similar summary "
        "statistics."
    )
    parser.add_argument(
        "--base_dir",
        type=str,
        default=".",
        help="Root directory from which to search for MAF (sub)directories.",
    )
    parser.add_argument(
        "--outfile",
        type=str,
        default="summary",
        help="Output file name. Default: summary.",
    )
    parser.add_argument(
        "--to_csv",
        dest="to_csv",
        action="store_true",
        help="Create a .csv file, instead of the default hdf file.",
    )
    parser.add_argument(
        "--to_hdf",
        dest="to_hdf",
        action="store_true",
        help="Create a .h5 file (the default behavior).",
    )
    parser.add_argument(
        "--dirs",
        type=str,
        default=None,
        help="Comma-separated list of directories to use. "
        "Default None (search base_dir for subdirectories).",
    )
    args = parser.parse_args()

    if args.dirs is None:
        run_dirs = glob.glob(args.base_dir + "/*/")
    else:
        run_dirs = args.dirs.split(",")

    # Create output file name if needed
    if args.to_csv:
        outfile = args.outfile + ".csv"
    else:
        outfile = args.outfile + ".h5"

    result_df = combine_result_dbs(run_dirs)

    # Save summary statistics
    if args.to_csv:
        result_df.to_csv(outfile)
    else:
        # Save to hdf5 (requires pytables)
        result_df.to_hdf(outfile, key="stats")
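

# Command-line sketch (hypothetical paths; assumes the package registers
# gather_summaries as a console entry point, otherwise call the function
# directly from Python):
#
#     gather_summaries --base_dir ./maf_runs --outfile my_summary --to_csv
#
# This would search ./maf_runs/*/ for resultsDb_sqlite.db files and write
# my_summary.csv with one row per run_name.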