Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions pageindex/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,14 +119,14 @@ def _load_workspace(self):
if loaded:
print(f"Loaded {loaded} document(s) from workspace.")

def get_document(self, doc_id: str) -> str:
"""Return document metadata JSON."""
def get_document(self, doc_id) -> str:
    """Look up document metadata and return it as a JSON string.

    doc_id may be a single document id (str) or a list of ids, in which
    case metadata for every listed document is returned.
    """
    docs = self.documents
    return get_document(docs, doc_id)

def get_document_structure(self, doc_id: str) -> str:
"""Return document tree structure JSON (without text fields)."""
def get_document_structure(self, doc_id) -> str:
    """Return the document's tree structure as JSON, with text fields omitted.

    doc_id may be a single document id (str) or a list of ids.
    """
    docs = self.documents
    return get_document_structure(docs, doc_id)

def get_page_content(self, doc_id: str, pages: str) -> str:
"""Return page content for the given pages string (e.g. '5-7', '3,8', '12')."""
def get_page_content(self, doc_id, pages: str) -> str:
    """Return the content of the requested pages as JSON.

    pages is a spec string such as '5-7', '3,8', or '12'.
    doc_id may be a single document id (str) or a list of ids.
    """
    docs = self.documents
    return get_page_content(docs, doc_id, pages)
86 changes: 39 additions & 47 deletions pageindex/page_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,8 @@ def toc_detector_single_page(content, model=None):
Please note: abstract,summary, notation list, figure list, table list, etc. are not table of contents."""

response = llm_completion(model=model, prompt=prompt)
# print('response', response)
json_content = extract_json(response)
return json_content['toc_detected']
return json_content.get('toc_detected', 'no')


def check_if_toc_extraction_is_complete(content, toc, model=None):
Expand All @@ -137,7 +136,7 @@ def check_if_toc_extraction_is_complete(content, toc, model=None):
prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc
response = llm_completion(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content['completed']
return json_content.get('completed', 'no')


def check_if_toc_transformation_is_complete(content, toc, model=None):
Expand All @@ -155,7 +154,7 @@ def check_if_toc_transformation_is_complete(content, toc, model=None):
prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc
response = llm_completion(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content['completed']
return json_content.get('completed', 'no')

def extract_toc_content(content, model=None):
prompt = f"""
Expand All @@ -175,27 +174,19 @@ def extract_toc_content(content, model=None):
{"role": "user", "content": prompt},
{"role": "assistant", "content": response},
]
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
new_response, finish_reason = llm_completion(model=model, prompt=prompt, chat_history=chat_history, return_finish_reason=True)
response = response + new_response
if_complete = check_if_toc_transformation_is_complete(content, response, model)
continue_prompt = "please continue the generation of table of contents, directly output the remaining part of the structure"

attempt = 0
max_attempts = 5

while not (if_complete == "yes" and finish_reason == "finished"):
attempt += 1
if attempt > max_attempts:
raise Exception('Failed to complete table of contents after maximum retries')

chat_history = [
{"role": "user", "content": prompt},
{"role": "assistant", "content": response},
]
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
new_response, finish_reason = llm_completion(model=model, prompt=prompt, chat_history=chat_history, return_finish_reason=True)
for attempt in range(max_attempts):
new_response, finish_reason = llm_completion(model=model, prompt=continue_prompt, chat_history=chat_history, return_finish_reason=True)
response = response + new_response
chat_history.append({"role": "user", "content": continue_prompt})
chat_history.append({"role": "assistant", "content": new_response})
if_complete = check_if_toc_transformation_is_complete(content, response, model)
if if_complete == "yes" and finish_reason == "finished":
break
else:
raise Exception('Failed to complete table of contents extraction after maximum retries')

return response

Expand All @@ -217,7 +208,7 @@ def detect_page_index(toc_content, model=None):

response = llm_completion(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content['page_index_given_in_toc']
return json_content.get('page_index_given_in_toc', 'no')

def toc_extractor(page_list, toc_page_list, model):
def transform_dots_to_colon(text):
Expand Down Expand Up @@ -296,43 +287,41 @@ def toc_transformer(toc_content, model=None):
if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)
if if_complete == "yes" and finish_reason == "finished":
last_complete = extract_json(last_complete)
cleaned_response=convert_page_to_int(last_complete['table_of_contents'])
cleaned_response = convert_page_to_int(last_complete.get('table_of_contents', []))
return cleaned_response

last_complete = get_json_content(last_complete)
attempt = 0
max_attempts = 5
while not (if_complete == "yes" and finish_reason == "finished"):
attempt += 1
if attempt > max_attempts:
raise Exception('Failed to complete toc transformation after maximum retries')
position = last_complete.rfind('}')
if position != -1:
last_complete = last_complete[:position+2]
prompt = f"""
Your task is to continue the table of contents json structure, directly output the remaining part of the json structure.
The response should be in the following JSON format:

The raw table of contents json structure is:
{toc_content}
chat_history = [
{"role": "user", "content": prompt},
{"role": "assistant", "content": last_complete},
]
continue_prompt = "Please continue the table of contents JSON structure from where you left off. Directly output only the remaining part."

The incomplete transformed table of contents json structure is:
{last_complete}
position = last_complete.rfind('}')
if position != -1:
last_complete = last_complete[:position+2]

Please continue the json structure, directly output the remaining part of the json structure."""
max_attempts = 5
for attempt in range(max_attempts):

new_complete, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True)
new_complete, finish_reason = llm_completion(model=model, prompt=continue_prompt, chat_history=chat_history, return_finish_reason=True)

if new_complete.startswith('```json'):
new_complete = get_json_content(new_complete)
last_complete = last_complete+new_complete
new_complete = get_json_content(new_complete)
last_complete = last_complete + new_complete

chat_history.append({"role": "user", "content": continue_prompt})
chat_history.append({"role": "assistant", "content": new_complete})

if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)

if if_complete == "yes" and finish_reason == "finished":
break
else:
raise Exception('Failed to complete TOC transformation after maximum retries')

last_complete = extract_json(last_complete)

cleaned_response=convert_page_to_int(last_complete['table_of_contents'])
cleaned_response = convert_page_to_int(last_complete.get('table_of_contents', []))
return cleaned_response


Expand Down Expand Up @@ -753,7 +742,10 @@ async def single_toc_item_index_fixer(section_title, content, model=None):
prompt = toc_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content
response = await llm_acompletion(model=model, prompt=prompt)
json_content = extract_json(response)
return convert_physical_index_to_int(json_content['physical_index'])
physical_index = json_content.get('physical_index')
if physical_index is None:
return None
return convert_physical_index_to_int(physical_index)



Expand Down
69 changes: 57 additions & 12 deletions pageindex/retrieve.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,29 @@ def _traverse(nodes):

# ── Tool functions ────────────────────────────────────────────────────────────

def get_document(documents: dict, doc_id: str) -> str:
"""Return JSON with document metadata: doc_id, doc_name, doc_description, type, status, page_count (PDF) or line_count (Markdown)."""
def get_document(documents: dict, doc_id) -> str:
"""Return JSON with document metadata. doc_id can be a string or a list of strings."""
if isinstance(doc_id, list):
results = {}
for d_id in doc_id:
doc_info = documents.get(d_id)
if not doc_info:
results[d_id] = {'error': f'Document {d_id} not found'}
continue
res = {
'doc_id': d_id,
'doc_name': doc_info.get('doc_name', ''),
'doc_description': doc_info.get('doc_description', ''),
'type': doc_info.get('type', ''),
'status': 'completed',
}
if doc_info.get('type') == 'pdf':
res['page_count'] = _count_pages(doc_info)
else:
res['line_count'] = _count_pages(doc_info)
results[d_id] = res
return json.dumps(results, ensure_ascii=False)

doc_info = documents.get(doc_id)
if not doc_info:
return json.dumps({'error': f'Document {doc_id} not found'})
Expand All @@ -99,8 +120,19 @@ def get_document(documents: dict, doc_id: str) -> str:
return json.dumps(result)


def get_document_structure(documents: dict, doc_id: str) -> str:
"""Return tree structure JSON with text fields removed (saves tokens)."""
def get_document_structure(documents: dict, doc_id) -> str:
"""Return tree structure JSON with text fields removed. doc_id can be a string or a list of strings."""
if isinstance(doc_id, list):
results = {}
for d_id in doc_id:
doc_info = documents.get(d_id)
if not doc_info:
results[d_id] = {'error': f'Document {d_id} not found'}
continue
structure = doc_info.get('structure', [])
results[d_id] = remove_fields(structure, fields=['text'])
return json.dumps(results, ensure_ascii=False)

doc_info = documents.get(doc_id)
if not doc_info:
return json.dumps({'error': f'Document {doc_id} not found'})
Expand All @@ -109,16 +141,29 @@ def get_document_structure(documents: dict, doc_id: str) -> str:
return json.dumps(structure_no_text, ensure_ascii=False)


def get_page_content(documents: dict, doc_id: str, pages: str) -> str:
def get_page_content(documents: dict, doc_id, pages: str) -> str:
"""
Retrieve page content for a document.

pages format: '5-7', '3,8', or '12'
For PDF: pages are physical page numbers (1-indexed).
For Markdown: pages are line numbers corresponding to node headers.

Returns JSON list of {'page': int, 'content': str}.
Retrieve page content for document(s). doc_id can be a string or a list of strings.
... (rest of originally provided docstring)
"""
if isinstance(doc_id, list):
results = {}
for d_id in doc_id:
doc_info = documents.get(d_id)
if not doc_info:
results[d_id] = {'error': f'Document {d_id} not found'}
continue
try:
page_nums = _parse_pages(pages)
if doc_info.get('type') == 'pdf':
content = _get_pdf_page_content(doc_info, page_nums)
else:
content = _get_md_page_content(doc_info, page_nums)
results[d_id] = content
except Exception as e:
results[d_id] = {'error': f'Failed to read page content for {d_id}: {e}'}
return json.dumps(results, ensure_ascii=False)

doc_info = documents.get(doc_id)
if not doc_info:
return json.dumps({'error': f'Document {doc_id} not found'})
Expand Down
Loading