riga · July 11, 2023 09:09 · Jul 5, 2023 · Jul 5, 2023
diff --git a/evaluate_klub.py b/evaluate_klub.py
@@ -93,7 +93,6 @@ def evaluate_sample(
 
     # potentially run in parallel
     if n_parallel > 1:
-        # run in parallel
         with ProcessPool(n_parallel) as pool:
             list(tqdm(
                 pool.imap(_evaluate_file_mp, evaluation_args),
@@ -109,7 +108,7 @@ def evaluate_sample(
 
 def evaluate_file(input_file_path: str, output_file_path: str) -> None:
     # prepare expressions
-    expressions = klub_index_columns + klub_index_columns
+    expressions = klub_index_columns + klub_input_columns
 
     # load the klub array
     f = uproot.open(input_file_path)

diff --git a/evaluate_klub.py b/evaluate_klub.py
@@ -0,0 +1,147 @@
+# coding: utf-8
+
+from __future__ import annotations
+
+import os
+from fnmatch import fnmatch
+from multiprocessing import Pool as ProcessPool
+from typing import Any
+
+from tqdm import tqdm
+import numpy as np
+import awkward as ak
+import uproot
+
+
+#
+# configurations
+#
+
+# mass points and spin hypotheses; neither list is referenced by the functions
+# visible in this file (units are presumably GeV - TODO confirm)
+masses = [
+    250, 260, 270, 280, 300,
+    320, 350, 400, 450, 500,
+    550, 600, 650, 700, 750,
+    800, 850, 900, 1000, 1250,
+    1500, 1750, 2000, 2500, 3000,
+]
+spins = [0, 2]
+
+# event selection, handed as the "cut" expression to uproot when reading the
+# klub tree; all individual requirements are combined with a logical AND
+baseline_selection = " & ".join([
+    "isLeptrigger",
+    "((pairType == 0) | (pairType == 1) | (pairType == 2))",
+    "(nleps == 0)",
+    "(nbjetscand > 1)",
+])
+
+# branches that identify an event; they are copied into the evaluation output
+klub_index_columns = ["EventNumber", "RunNumber", "lumi"]
+
+# branches needed as inputs for the evaluation
+# TODO: add the full list of required klub input branches here
+klub_input_columns = []
+
+
+#
+# NN evaluation
+#
+
+def evaluate_events(events: ak.Array) -> ak.Array:
+    """
+    Placeholder for the network evaluation of *events*.
+
+    Currently returns an awkward array with a single field ``dnn_output`` that
+    holds the value 1.0 once per entry of *events*.
+    """
+    # TODO: add actual evaluation and return an ak array with results
+    n_events = len(events)
+    dummy_output = np.ones(n_events)
+    return ak.zip({"dnn_output": dummy_output})
+
+
+#
+# high-level evaluation functions
+#
+
+def evaluate_samples(
+    skim_directory: str,
+    output_directory: str,
+    n_parallel: int = 1,
+) -> None:
+    """
+    Run :py:func:`evaluate_sample` for every sample found in *skim_directory*.
+
+    A sample is a sub directory of *skim_directory* that contains a file named
+    "output_0.root". *output_directory* and *n_parallel* are forwarded unchanged
+    to each per-sample evaluation.
+    """
+    def is_sample(name: str) -> bool:
+        # a sample is a directory holding at least the first output file
+        path = os.path.join(skim_directory, name)
+        return os.path.isdir(path) and os.path.exists(os.path.join(path, "output_0.root"))
+
+    # get a list of all sample names in the klub directory
+    sample_names = [name for name in os.listdir(skim_directory) if is_sample(name)]
+
+    # start the evaluation, one sample after the other
+    print(f"evaluating {len(sample_names)} samples")
+    for name in sample_names:
+        evaluate_sample(skim_directory, output_directory, name, n_parallel=n_parallel)
+
+
+def evaluate_sample(
+    skim_directory: str,
+    output_directory: str,
+    sample_name: str,
+    n_parallel: int = 1,
+) -> None:
+    """
+    Evaluate all files of the sample *sample_name* located in *skim_directory*.
+
+    Every file matching "output_*.root" in ``<skim_directory>/<sample_name>`` is
+    passed to :py:func:`evaluate_file` and the result is written to a file with
+    the same name in ``<output_directory>/<sample_name>``. User and environment
+    variables in the output path are expanded and the directory is created when
+    missing. When *n_parallel* is larger than 1, files are processed by a pool of
+    that many processes, and sequentially otherwise.
+    """
+    print(f"evaluate {sample_name} ...")
+
+    # ensure that the output directory exists; exist_ok avoids the race between
+    # checking and creating when several jobs target the same directory
+    output_sample_dir = os.path.join(output_directory, sample_name)
+    output_sample_dir = os.path.expandvars(os.path.expanduser(output_sample_dir))
+    os.makedirs(output_sample_dir, exist_ok=True)
+
+    # determine all file names to load, sorted since the order returned by
+    # os.listdir is arbitrary
+    input_sample_dir = os.path.join(skim_directory, sample_name)
+    evaluation_args = [
+        (os.path.join(input_sample_dir, file_name), os.path.join(output_sample_dir, file_name))
+        for file_name in sorted(os.listdir(input_sample_dir))
+        if fnmatch(file_name, "output_*.root")
+    ]
+
+    # potentially run in parallel
+    # (list() drains the lazy iterators so that all files are actually processed)
+    if n_parallel > 1:
+        # run in parallel
+        with ProcessPool(n_parallel) as pool:
+            list(tqdm(
+                pool.imap(_evaluate_file_mp, evaluation_args),
+                total=len(evaluation_args),
+            ))
+    else:
+        list(tqdm(
+            map(_evaluate_file_mp, evaluation_args),
+            total=len(evaluation_args),
+        ))
+    print("done")
+
+
+def evaluate_file(input_file_path: str, output_file_path: str) -> None:
+    """
+    Evaluate the events of a single klub file and store the results.
+
+    Reads the index and input columns of the tree "HTauTauTree" in
+    *input_file_path* for events passing ``baseline_selection``, passes them to
+    :py:func:`evaluate_events`, attaches the index columns to the result and
+    writes all fields to a tree named "evaluation" in a newly created file at
+    *output_file_path*.
+    """
+    # prepare expressions: index columns for bookkeeping plus the actual inputs
+    expressions = klub_index_columns + klub_input_columns
+
+    # load the klub array, closing the input file once the data is in memory
+    with uproot.open(input_file_path) as f:
+        input_array = f["HTauTauTree"].arrays(expressions=expressions, cut=baseline_selection)
+
+    # run the evaluation
+    output_array = evaluate_events(input_array)
+
+    # add index columns
+    for c in klub_index_columns:
+        output_array = ak.with_field(output_array, input_array[c], c)
+
+    # save the output as root, one branch per field, and close the file afterwards
+    with uproot.recreate(output_file_path) as output_file:
+        output_file["evaluation"] = dict(zip(output_array.fields, ak.unzip(output_array)))
+
+
+def _evaluate_file_mp(args: Any) -> None:
+    """
+    Single-argument adapter around :py:func:`evaluate_file` for use with ``map``
+    and ``Pool.imap``; *args* is unpacked into its positional arguments.
+    """
+    result = evaluate_file(*args)
+    return result
+
+
+# entry hook: only runs when the file is executed as a script
+if __name__ == "__main__":
+    # alternative, currently disabled: evaluate all samples of the skim directory
+    # evaluate_samples(
+    #     skim_directory="/eos/user/t/tokramer/hhbbtautau/skims/2017",
+    #     output_directory="/eos/user/m/mrieger/hhres_dnn_datacards/nn/2017",
+    #     n_parallel=1,
+    # )
+
+    # evaluate a single, hard-coded sample; n_parallel=1 processes its files
+    # sequentially in this process
+    evaluate_sample(
+        skim_directory="/eos/user/t/tokramer/hhbbtautau/skims/2017",
+        output_directory="/eos/user/m/mrieger/hhres_dnn_datacards/nn/2017",
+        sample_name="SKIM_ggF_Radion_m900",
+        n_parallel=1,
+    )