OpenHands · adityasoni9998 · Nov 7, 2025 · Nov 7, 2025 · Nov 7, 2025 · Nov 7, 2025
diff --git a/benchmarks/agentic_code_search/README.md b/benchmarks/agentic_code_search/README.md
@@ -0,0 +1,5 @@
+## Agentic Code Search
+
+Benchmarking code that evaluates LLMs on their ability to localize the code in a Python repository that must be edited to resolve an issue described in natural language.
+
+- NOTE: The JSONL file for the ground truth is prepared using [this code](https://github.com/adityasoni9998/LocAgent/blob/master/util/benchmark/gen_oracle_locations.py).
diff --git a/benchmarks/agentic_code_search/__init__.py b/benchmarks/agentic_code_search/__init__.py
diff --git a/benchmarks/agentic_code_search/eval_infer.py b/benchmarks/agentic_code_search/eval_infer.py
@@ -0,0 +1,44 @@
+import json
+from argparse import ArgumentParser
+
+
+def main(args):
+    """Print average localization F1 scores and run statistics for a results file.
+
+    ``args.results_file`` is a JSONL path; every record must have a ``test_result`` dict.
+    Blank lines are skipped; a missing or None ``reward`` adds 0 to each F1 total.
+    """
+    # Printed label -> record key: rewards come from test_result["reward"], stats from test_result.
+    reward_keys = {"File": "file_reward", "Module": "module_reward", "Function": "entity_reward"}
+    stat_keys = {
+        "# of steps": "num_steps",
+        "# of tool calls": "num_tool_calls",
+        "wall time (s)": "wall_time_seconds",
+    }
+    totals = dict.fromkeys([*reward_keys.values(), *stat_keys.values()], 0)
+    cnt = 0
+    with open(args.results_file, "r") as f:
+        for line in f:
+            if not line.strip():
+                continue  # tolerate blank or trailing empty lines in the JSONL file
+            test_result = json.loads(line)["test_result"]
+            cnt += 1
+            reward_dict = test_result.get("reward") or {}
+            for key in reward_keys.values():
+                totals[key] += reward_dict.get(key, 0)
+            for key in stat_keys.values():
+                totals[key] += test_result.get(key, 0)
+    if cnt == 0:
+        print(f"No samples found in {args.results_file}")  # avoids ZeroDivisionError below
+        return
+    for label, key in reward_keys.items():
+        print(f"Average {label} F1 score: {totals[key] / cnt:.4f} over {cnt} samples")
+    for label, key in stat_keys.items():
+        print(f"Average {label}: {totals[key] / cnt:.4f} over {cnt} samples")
+
+
+if __name__ == "__main__":
+    # Script entry point: --results_file is the path that main() opens and summarizes.
+    cli = ArgumentParser()
+    cli.add_argument("--results_file", type=str, required=True)
+    main(cli.parse_args())
diff --git a/benchmarks/agentic_code_search/prompts/file_module.j2 b/benchmarks/agentic_code_search/prompts/file_module.j2
@@ -0,0 +1,30 @@
+I have access to a python code repository in the directory {{ working_dir }} . Consider the following issue description:
+
+<issue_description>
+{{ problem_statement }}
+</issue_description>
+
+Act as a code search agent and localize the specific files, classes, or functions that need modification to resolve the issue in <issue_description>.
+
+NOTE: You do not need to solve the issue, all you need to do is localize relevant code from the repository. Your output will be used to guide another agent to solve the issue.
+
+Your final output should list the locations requiring modification, wrapped with triple backticks ```
+Each location should include the file path, class name (if applicable), and function name. Here is an example output:
+```
+full_path1/file1.py
+class: MyClass1
+function: my_function1
+
+full_path2/file2.py
+function: MyClass2.my_function2
+
+full_path3/file3.py
+function: my_function3
+```
+
+IMPORTANT: Your output MUST follow the rules below:
+1. The final output must be returned in the message parameter of the Finish tool wrapped within ```, and there should be NO text outside these triple backticks (```).
+2. Every file path must be RELATIVE to the {{ working_dir }} directory, WITHOUT any leading "./" in the output.
+3. For each localized code output, you MUST always include the file path and the function name. If the function is within a class you MUST also include the class name.
+4. Only include those locations in your output that need modification to resolve the issue in <issue_description>. Do NOT include any locations that do not need modification.
+