diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 9004309fb..337fc1d8f 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -103,10 +103,15 @@ async def check_title_appearance_in_start_concurrent(structure, page_list, model def toc_detector_single_page(content, model=None): prompt = f""" - Your job is to detect if there is a table of content provided in the given text. + Your job is to detect if there is a table of content provided in the given single-page text. Given text: {content} + Guidance for single-page documents: + - Text with actual document content, including policies, rules, regulations, memos, or numbered sections, is not a table of contents just because it is structured. + - A true table of contents primarily lists references to content elsewhere in the document, usually as navigation entries for later sections or pages. + - If the given page contains the content itself instead of references to other content, answer "no". + return the following JSON format: {{ "thinking": @@ -1151,4 +1156,4 @@ def validate_and_truncate_physical_indices(toc_with_page_number, page_list_lengt if truncated_items: print(f"Truncated {len(truncated_items)} TOC items that exceeded document length") - return toc_with_page_number \ No newline at end of file + return toc_with_page_number diff --git a/tests/test_toc_detector.py b/tests/test_toc_detector.py new file mode 100644 index 000000000..1b8cb5c62 --- /dev/null +++ b/tests/test_toc_detector.py @@ -0,0 +1,32 @@ +import importlib + + +page_index = importlib.import_module("pageindex.page_index") + + +def test_single_page_toc_prompt_distinguishes_content_from_toc(monkeypatch): + captured = {} + + def fake_completion(model, prompt): + captured["model"] = model + captured["prompt"] = prompt + return '{"thinking": "structured content, not a toc", "toc_detected": "no"}' + + monkeypatch.setattr(page_index, "llm_completion", fake_completion) + + result = page_index.toc_detector_single_page( + "1. Scope\nThis policy applies to all staff.\n\n" + "2. Requirements\nUsers must rotate secrets.\n\n" + "3. Exceptions\nExceptions require approval.", + model="test-model", + ) + + assert result == "no" + assert captured["model"] == "test-model" + + prompt = captured["prompt"].lower() + assert "single-page" in prompt + assert "actual document content" in prompt + assert "numbered sections" in prompt + assert "true table of contents" in prompt + assert "references to content elsewhere" in prompt