From ff73e6f84b269be82110f459c6a424a6afe830f4 Mon Sep 17 00:00:00 2001 From: Ovtcharov Date: Thu, 28 May 2026 13:24:15 -0700 Subject: [PATCH 1/6] feat(rag): add PowerPoint (.pptx) extraction with VLM image support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PPTX files were explicitly excluded from GAIA's RAG pipeline — users had to save as PDF first. This adds native extraction of text (shapes, tables, speaker notes) and embedded images (via VLM when available), mirroring the existing PDF pipeline pattern. New module pptx_utils.py handles slide-level extraction with group shape recursion and markdown table formatting. A zip bomb guard rejects files whose uncompressed size exceeds 500 MB before handing them to python-pptx. --- setup.py | 2 + src/gaia/rag/pptx_utils.py | 265 ++++++++++++++ src/gaia/rag/sdk.py | 291 ++++++++++++++- src/gaia/ui/routers/files.py | 8 +- src/gaia/ui/utils.py | 16 +- tests/unit/chat/ui/test_server.py | 7 +- tests/unit/rag/test_pptx_extraction.py | 470 +++++++++++++++++++++++++ 7 files changed, 1041 insertions(+), 18 deletions(-) create mode 100644 src/gaia/rag/pptx_utils.py create mode 100644 tests/unit/rag/test_pptx_extraction.py diff --git a/setup.py b/setup.py index 6e68e774e..760571eae 100644 --- a/setup.py +++ b/setup.py @@ -153,6 +153,7 @@ "numpy>=1.24.0", "pymupdf>=1.24.0", "pypdf", + "python-pptx>=0.6.21", "sentence-transformers", "safetensors", # torch is pinned lower-bound only. The "audio" extra caps @@ -227,6 +228,7 @@ "numpy>=1.24.0", "pymupdf>=1.24.0", "pypdf", + "python-pptx>=0.6.21", "sentence-transformers", ], "lint": [ diff --git a/src/gaia/rag/pptx_utils.py b/src/gaia/rag/pptx_utils.py new file mode 100644 index 000000000..951a618cc --- /dev/null +++ b/src/gaia/rag/pptx_utils.py @@ -0,0 +1,265 @@ +#!/usr/bin/env python3 +# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. 
+# SPDX-License-Identifier: MIT
+
+"""
+PowerPoint (.pptx) extraction utilities for multi-modal RAG.
+
+Extracts text, tables, speaker notes, and embedded images from PPTX slides.
+Image processing mirrors ``pdf_utils.py`` (resize, compress, same dict format).
+"""
+
+import io
+import logging
+from typing import List, Tuple
+
+logger = logging.getLogger(__name__)
+
+# Shared constants (same as pdf_utils.py)
+MAX_DIMENSION = 1600
+MAX_SIZE_KB = 300
+MAX_ITERATIONS = 5
+MAX_GROUP_DEPTH = 5
+
+
+def extract_images_from_slide(slide, slide_num: int) -> List[dict]:
+    """
+    Extract embedded images from a PPTX slide.
+
+    Iterates picture shapes, extracts image blobs, and processes with PIL
+    (resize / compress) identically to ``pdf_utils.extract_images_from_page_pymupdf``.
+
+    Args:
+        slide: A ``pptx.slide.Slide`` object.
+        slide_num: Slide number (1-indexed, for logging).
+
+    Returns:
+        List of image dicts::
+
+            [{"image_bytes": bytes, "width": int, "height": int,
+              "format": "png", "size_kb": float}, ...]
+    """
+    images: List[dict] = []
+
+    try:
+        from PIL import Image
+    except ImportError:
+        logger.error("Pillow not installed. Install: uv pip install Pillow")
+        return images
+
+    for shape_index, shape in enumerate(_iter_shapes(slide.shapes)):
+        try:
+            # hasattr() runs the getter; linked pictures raise ValueError there.
+            if not hasattr(shape, "image"):
+                continue
+            image_blob = shape.image.blob
+
+            img = Image.open(io.BytesIO(image_blob))
+
+            width, height = img.size
+            size_kb = len(image_blob) / 1024
+
+            # Convert to RGB if needed
+            if img.mode not in ("RGB", "RGBA"):
+                logger.debug("Converting %s to RGB", img.mode)
+                img = img.convert("RGB")
+
+            # Resize if too large
+            if width > MAX_DIMENSION or height > MAX_DIMENSION:
+                scale = min(MAX_DIMENSION / width, MAX_DIMENSION / height)
+                new_width = int(width * scale)
+                new_height = int(height * scale)
+                logger.info(
+                    " Resizing: %dx%d -> %dx%d", width, height, new_width, new_height
+                )
+                img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
+
+            # Save as optimized PNG
+            png_buffer = io.BytesIO()
+            img.save(png_buffer, format="PNG", optimize=True, compress_level=6)
+            png_bytes = png_buffer.getvalue()
+            size_kb = len(png_bytes) / 1024
+
+            # Iteratively compress until target size is reached
+            compression_iterations = 0
+            while size_kb > MAX_SIZE_KB and compression_iterations < MAX_ITERATIONS:
+                compression_iterations += 1
+                logger.info(
+                    " Compressing (iteration %d): %.0fKB -> <%dKB",
+                    compression_iterations,
+                    size_kb,
+                    MAX_SIZE_KB,
+                )
+                img = img.resize(
+                    (img.width // 2, img.height // 2), Image.Resampling.LANCZOS
+                )
+                png_buffer = io.BytesIO()
+                img.save(png_buffer, format="PNG", optimize=True, compress_level=9)
+                png_bytes = png_buffer.getvalue()
+                size_kb = len(png_bytes) / 1024
+
+            if size_kb <= MAX_SIZE_KB:
+                logger.info(
+                    " Compressed to %.0fKB (%dx%d) in %d iteration(s)",
+                    size_kb,
+                    img.width,
+                    img.height,
+                    compression_iterations,
+                )
+            else:
+                logger.warning(
+                    " Could not compress below %dKB after %d iterations (final: %.0fKB)",
+                    MAX_SIZE_KB,
+                    MAX_ITERATIONS,
+                    size_kb,
+                )
+
+            images.append(
+                {
+                    "image_bytes": png_bytes,
+                    "width": img.width,
+                    "height": img.height,
+                    "format": "png",
+                    "size_kb": size_kb,
+                }
+            )
+
+            logger.debug(
+                "Extracted image %d from slide %d: %dx%d, %.1fKB",
+                shape_index + 1,
+                slide_num,
+                img.width,
+                img.height,
+                size_kb,
+            )
+
+        except Exception as e:
+            # Linked pictures, WMF/EMF metafiles, corrupt blobs — skip with warning.
+            logger.warning(
+                "Failed to extract image %d from slide %d: %s",
+                shape_index + 1,
+                slide_num,
+                e,
+            )
+            continue
+
+    return images
+
+
+def count_images_in_slide(slide) -> Tuple[bool, int]:
+    """
+    Fast count of picture shapes on a slide, without extracting image data.
+
+    Args:
+        slide: A ``pptx.slide.Slide`` object.
+
+    Returns:
+        ``(has_images, count)`` tuple; linked pictures are counted as well.
+    """
+    # Look ``image`` up on the class: hasattr() on the instance runs the property
+    # getter, which python-pptx lets raise (e.g. for linked pictures).
+    shapes = _iter_shapes(slide.shapes)
+    count = sum(1 for shape in shapes if hasattr(type(shape), "image"))
+    return (count > 0, count)
+
+
+def extract_text_from_slide(
+    slide, slide_num: int  # pylint: disable=unused-argument
+) -> str:
+    """
+    Extract all native text from a PPTX slide.
+
+    Handles text frames, tables (formatted as markdown), and group shapes
+    (recursed up to :data:`MAX_GROUP_DEPTH` levels).
+
+    Args:
+        slide: A ``pptx.slide.Slide`` object.
+        slide_num: Slide number (1-indexed, for logging).
+
+    Returns:
+        Concatenated slide text with paragraph boundaries.
+    """
+    parts: List[str] = []
+
+    for shape in _iter_shapes(slide.shapes):
+        # Table shapes — format as markdown
+        if shape.has_table:
+            table_md = _table_to_markdown(shape.table)
+            if table_md:
+                parts.append(table_md)
+        # Text-bearing shapes (text boxes, titles, subtitles, etc.)
+        elif shape.has_text_frame:
+            text = _text_frame_to_str(shape.text_frame)
+            if text:
+                parts.append(text)
+
+    return "\n\n".join(parts)
+
+
+def extract_notes_from_slide(slide) -> str:
+    """
+    Extract speaker notes from a slide.
+
+    Args:
+        slide: A ``pptx.slide.Slide`` object.
+
+    Returns:
+        Notes text, or empty string if none.
+    """
+    try:
+        if not slide.has_notes_slide:
+            return ""
+        notes_slide = slide.notes_slide
+        if notes_slide and notes_slide.notes_text_frame:
+            text = notes_slide.notes_text_frame.text
+            return text.strip() if text else ""
+    except Exception as e:
+        logger.debug("Could not extract notes from slide: %s", e)
+    return ""
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+
+def _iter_shapes(shapes, depth: int = 0):
+    """Yield all leaf shapes, recursing into group shapes up to *MAX_GROUP_DEPTH*.
+
+    Group shapes themselves are never yielded — only their children. When the
+    depth limit is reached the group is silently skipped (its children are not
+    visited) to prevent stack overflow on pathological files.
+    """
+    from pptx.enum.shapes import MSO_SHAPE_TYPE  # lazy import
+
+    for shape in shapes:
+        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
+            if depth < MAX_GROUP_DEPTH:
+                yield from _iter_shapes(shape.shapes, depth + 1)
+            # else: skip the group entirely at the depth limit
+        else:
+            yield shape
+
+
+def _text_frame_to_str(text_frame) -> str:
+    """Join paragraphs from a text frame, preserving line breaks."""
+    lines = [p.text.strip() for p in text_frame.paragraphs if p.text.strip()]
+    return "\n".join(lines)
+
+
+def _table_to_markdown(table) -> str:
+    """Convert a ``pptx.table.Table`` to a markdown table string."""
+    rows = []
+    for row in table.rows:
+        # Collapse whitespace: a newline inside a cell would split the row.
+        cells = [" ".join(c.text.split()).replace("|", "\\|") for c in row.cells]
+        rows.append("| " + " | ".join(cells) + " |")
+    if not rows:
+        return ""
+
+    # Insert header separator after first row
+    num_cols = len(table.rows[0].cells)
+    separator = "| " + " | ".join(["---"] * num_cols) + " |"
+    rows.insert(1, separator)
+
+    return "\n".join(rows)
diff --git a/src/gaia/rag/sdk.py b/src/gaia/rag/sdk.py
index 5accbdd53..88b7cbf96 100644
--- a/src/gaia/rag/sdk.py
+++ b/src/gaia/rag/sdk.py
@@ -850,6 
+850,281 @@ def _extract_text_from_pdf(self, pdf_path: str) -> tuple:
             self.log.error(f"Error reading PDF {pdf_path}: {e}")
             raise
 
+    def _extract_text_from_pptx(self, pptx_path: str) -> tuple:
+        """
+        Extract text from PowerPoint (.pptx) file with VLM for embedded images.
+
+        Mirrors :meth:`_extract_text_from_pdf`. Raises ``ImportError`` when
+        python-pptx is missing, ``ValueError`` for oversized/unreadable/empty files.
+
+        Returns:
+            ``(text, num_slides, metadata)`` tuple where metadata contains:
+
+            - num_slides: int
+            - vlm_slides: int (slides enhanced with VLM)
+            - total_images: int (total images processed)
+            - vlm_checked: bool
+            - vlm_available: bool
+            - pptx_status: str (always ``"readable"``; unreadable files raise)
+        """
+        import time as time_module  # pylint: disable=reimported
+
+        file_name = Path(pptx_path).name
+
+        # Step 0: Import python-pptx lazily so it is only required when a
+        # .pptx file is actually indexed.
+        try:
+            from pptx import Presentation  # pylint: disable=import-outside-toplevel
+        except ImportError as e:
+            raise ImportError(
+                "python-pptx is required for PowerPoint processing. "
+                "Install it with: uv pip install python-pptx"
+            ) from e
+
+        # Guard against zip bombs: .pptx is a ZIP container. Check that
+        # the total uncompressed size is sane before handing it to python-pptx.
+        import zipfile
+
+        try:
+            with zipfile.ZipFile(pptx_path, "r") as zf:
+                total_uncompressed = sum(info.file_size for info in zf.infolist())
+            max_uncompressed = 500 * 1024 * 1024  # 500 MB
+            if total_uncompressed > max_uncompressed:
+                msg = (
+                    f"PowerPoint file too large after decompression: {file_name}\n"
+                    f"Uncompressed size: {total_uncompressed / (1024*1024):.0f} MB "
+                    f"(limit: {max_uncompressed / (1024*1024):.0f} MB)\n"
+                    "The file may be a zip bomb or contain very large embedded media.\n"
+                    "Suggestions:\n"
+                    " 1. Remove unnecessary images/media to reduce file size\n"
+                    " 2. Save as PDF and index the PDF instead"
+                )
+                self.log.error(
+                    f"PPTX zip bomb guard: {pptx_path} ({total_uncompressed} bytes uncompressed)"
+                )
+                raise ValueError(msg)
+        except zipfile.BadZipFile as e:
+            msg = (
+                f"Could not read PowerPoint file: {file_name}\n"
+                f"Reason: {e}\n"
+                "The file appears to be corrupted or not a valid .pptx file.\n"
+                "Suggestions:\n"
+                " 1. Re-download or re-export the presentation\n"
+                " 2. Try opening the file in PowerPoint to confirm it is readable\n"
+                " 3. Save as PDF and index the PDF instead"
+            )
+            self.log.error(f"Corrupted PPTX (bad zip): {pptx_path}: {e}")
+            raise ValueError(msg) from e
+
+        try:
+            prs = Presentation(pptx_path)
+        except Exception as e:
+            msg = (
+                f"Could not read PowerPoint file: {file_name}\n"
+                f"Reason: {e}\n"
+                "The file appears to be corrupted or not a valid .pptx file.\n"
+                "Suggestions:\n"
+                " 1. Re-download or re-export the presentation\n"
+                " 2. Try opening the file in PowerPoint to confirm it is readable\n"
+                " 3. Save as PDF and index the PDF instead"
+            )
+            self.log.error(f"Corrupted PPTX {pptx_path}: {e}")
+            raise ValueError(msg) from e
+
+        try:
+            extract_start = time_module.time()
+            total_slides = len(prs.slides)
+            self.log.info(f"📊 Extracting text from {total_slides} slides...")
+
+            # Needed for text extraction even when the VLM setup below fails.
+            from gaia.rag.pptx_utils import (  # pylint: disable=import-outside-toplevel
+                count_images_in_slide,
+                extract_images_from_slide,
+                extract_notes_from_slide,
+                extract_text_from_slide,
+            )
+            # Initialize VLM client (auto-enabled if available)
+            vlm = None
+            vlm_available = False
+            try:
+                from gaia.llm import (  # pylint: disable=import-outside-toplevel
+                    VLMClient,
+                )
+                vlm = VLMClient(
+                    vlm_model=self.config.vlm_model, base_url=self.config.base_url
+                )
+                vlm_available = vlm.check_availability()
+
+                if vlm_available and self.config.show_stats:
+                    print(" 🔍 VLM enabled: Will extract text from slide images")
+                elif not vlm_available and self.config.show_stats:
+                    print(" ⚠️ VLM not available - images will not be processed")
+                    print(" 📥 To enable VLM image extraction:")
+                    print(
+                        " 1. Open Lemonade Model Manager (http://localhost:13305)"
+                    )
+                    print(f" 2. Download model: {self.config.vlm_model}")
+
+            except Exception as vlm_error:
+                if self.config.show_stats:
+                    print(f" ⚠️ VLM initialization failed: {vlm_error}")
+                self.log.warning(f"VLM initialization failed: {vlm_error}")
+                vlm_available = False
+
+            if self.config.show_stats:
+                print(f"\n{'='*60}")
+                print(" 📊 COMPUTE INTENSIVE: PowerPoint Text Extraction")
+                print(f" 📊 Total slides: {total_slides}")
+                print(f" ⏱️ Estimated time: {total_slides * 0.2:.1f} seconds")
+                if vlm_available:
+                    print(" 🖼️ VLM: Enabled for image text extraction")
+                else:
+                    print(" 🖼️ VLM: Disabled (text-only extraction)")
+                print(f"{'='*60}")
+
+            pages_data = []
+            vlm_slides_count = 0
+            total_images_processed = 0
+
+            for i, slide in enumerate(prs.slides, 1):
+                page_start = time_module.time()
+
+                # Step 1: Extract native text from shapes / tables
+                slide_text = extract_text_from_slide(slide, slide_num=i)
+
+                # Step 2: Extract speaker notes
+                notes_text = extract_notes_from_slide(slide)
+                if notes_text:
+                    slide_text = (
+                        slide_text + "\n\n**Speaker Notes:**\n" + notes_text
+                        if slide_text
+                        else "**Speaker Notes:**\n" + notes_text
+                    )
+
+                # Step 3: Check for images
+                has_imgs = False
+                num_imgs = 0
+                if vlm_available:
+                    try:
+                        has_imgs, num_imgs = count_images_in_slide(slide)
+                    except Exception as err:  # pylint: disable=broad-except
+                        self.log.warning(f"Image count failed on slide {i}: {err}")
+
+                # Step 4: Extract from images if present
+                image_texts = []
+                if has_imgs and vlm_available:
+                    try:
+                        images = extract_images_from_slide(slide, slide_num=i)
+                        if images:
+                            image_texts = vlm.extract_from_page_images(
+                                images, page_num=i
+                            )
+                            if image_texts:
+                                vlm_slides_count += 1
+                                total_images_processed += len(image_texts)
+                    except Exception as img_error:
+                        self.log.warning(
+                            f"Image extraction failed on slide {i}: {img_error}"
+                        )
+
+                # Step 5: Merge native text + VLM image texts
+                merged_text = self._merge_page_texts(
+                    slide_text, image_texts, page_num=i
+                )
+
+                pages_data.append(
+                    {
+                        "page": i,
+                        "text": merged_text,
+                        "has_images": has_imgs,
+                        "num_images": num_imgs,
+                        "vlm_used": len(image_texts) > 0,
+                    }
+                )
+
+                page_duration = time_module.time() - page_start
+
+                if self.config.show_stats:
+                    progress_pct = (i / total_slides) * 100
+                    avg_time = (time_module.time() - extract_start) / i
+                    eta = avg_time * (total_slides - i)
+                    vlm_indicator = " 🖼️" if len(image_texts) > 0 else ""
+                    print(
+                        f" 📊 Slide {i}/{total_slides} ({progress_pct:.0f}%){vlm_indicator} | "
+                        f"⏱️ {page_duration:.2f}s | ETA: {eta:.1f}s" + " " * 10,
+                        end="\r",
+                        flush=True,
+                    )
+
+            # Cleanup VLM
+            if vlm_available and vlm:
+                try:
+                    vlm.cleanup()
+                except Exception as err:  # pylint: disable=broad-except
+                    self.log.warning(f"VLM cleanup failed: {err}")
+
+            extract_duration = time_module.time() - extract_start
+
+            # Build full text — uses [Page N] markers for downstream compatibility
+            # with chunking/retrieval code that parses [Page N] patterns.
+            full_text = "\n\n".join(
+                [f"[Page {p['page']}]\n{p['text']}" for p in pages_data]
+            )
+
+            if self.config.show_stats:
+                print(
+                    f"\n ✅ Extracted {len(full_text):,} characters from {total_slides} slides"
+                )
+                slides_per_sec = (
+                    total_slides / extract_duration
+                    if extract_duration > 0
+                    else float("inf")
+                )
+                print(
+                    f" ⏱️ Total extraction time: {extract_duration:.2f}s ({slides_per_sec:.1f} slides/sec)"
+                )
+                print(f" 💾 Text size: {len(full_text) / 1024:.1f} KB")
+                if vlm_slides_count > 0:
+                    print(
+                        f" 🖼️ VLM enhanced: {vlm_slides_count} slides, {total_images_processed} images"
+                    )
+                print(f"{'='*60}\n")
+
+            self.log.info(
+                f"📝 Extracted {len(full_text):,} characters in {extract_duration:.2f}s (VLM: {vlm_slides_count} slides)"
+            )
+
+            # Check for empty presentation
+            has_any_content = any((p["text"] or "").strip() for p in pages_data)
+            if not has_any_content:
+                msg = (
+                    f"No extractable text in PowerPoint: {file_name}\n"
+                    f"The file has {total_slides} slide(s) but none contained text.\n"
+                    "Suggestions:\n"
+                    " 1. Ensure the presentation has text content (not just images)\n"
+                    " 2. Enable VLM image extraction by downloading "
+                    f"{self.config.vlm_model} in Lemonade\n"
+                    " 3. Save as PDF and index the PDF instead"
+                )
+                self.log.error(f"Empty PPTX (no text): {pptx_path}")
+                raise ValueError(msg)
+
+            metadata = {
+                "num_slides": total_slides,
+                "vlm_slides": vlm_slides_count,
+                "total_images": total_images_processed,
+                "vlm_checked": True,
+                "vlm_available": vlm_available,
+                "pptx_status": "readable",
+            }
+
+            return full_text, total_slides, metadata
+        except ValueError:
+            raise
+        except Exception as e:
+            self.log.error(f"Error reading PPTX {pptx_path}: {e}")
+            raise
+
     def _merge_page_texts(
         self, pypdf_text: str, image_texts: list, page_num: int
     ) -> str:
@@ -1217,9 +1492,9 @@ def _extract_text_from_file(self, file_path: str) -> tuple:
 
         Returns:
             (text, metadata_dict) tuple where metadata_dict contains:
-            - num_pages: int (for PDFs) or None
-            - vlm_pages: int (for PDFs with VLM) or None
-            - total_images: int (for PDFs with VLM) or None
+            - num_pages: int (for PDFs/PPTX) or None
+            - vlm_pages: int (for PDFs/PPTX with VLM) or None
+            - total_images: int (for PDFs/PPTX with VLM) or None
         """
         file_type = self._get_file_type(file_path)
         metadata = {"num_pages": None, "vlm_pages": None, "total_images": None}
@@ -1232,6 +1507,14 @@ def _extract_text_from_file(self, file_path: str) -> tuple:
             metadata["total_images"] = pdf_metadata.get("total_images", 0)
             return text, metadata
 
+        # PowerPoint files
+        elif file_type == ".pptx":
+            text, num_slides, pptx_metadata = self._extract_text_from_pptx(file_path)
+            metadata["num_pages"] = num_slides
+            metadata["vlm_pages"] = pptx_metadata.get("vlm_slides", 0)
+            metadata["total_images"] = pptx_metadata.get("total_images", 0)
+            return text, metadata
+
         # Text-based files
         elif file_type in [".txt", ".md", ".markdown", ".rst", ".log"]:
             return self._extract_text_from_text_file(file_path), metadata
@@ -1909,7 +2192,7 @@ def 
index_document(self, file_path: str) -> Dict[str, Any]: Index a document for retrieval. Supports: - - Documents: PDF, TXT, MD, CSV, JSON + - Documents: PDF, PPTX, TXT, MD, CSV, JSON - Backend Code: Python, Java, C/C++, Go, Rust, Ruby, PHP, Swift, Kotlin, Scala - Web Code: JavaScript/TypeScript, HTML, CSS/SCSS/SASS/LESS, Vue, Svelte, Astro - Config: YAML, XML, TOML, INI, ENV, Properties diff --git a/src/gaia/ui/routers/files.py b/src/gaia/ui/routers/files.py index d065710dc..503387c85 100644 --- a/src/gaia/ui/routers/files.py +++ b/src/gaia/ui/routers/files.py @@ -72,7 +72,7 @@ async def upload_file(file: UploadFile = File(...)): - Allowed types: common images (png, jpg, jpeg, gif, webp, bmp, svg) and the document types listed in ALLOWED_EXTENSIONS (pdf, txt, md, csv, json, xlsx, html, xml, yaml, and code files). Legacy Office - formats (.doc/.docx/.ppt/.pptx/.xls) are NOT allowed — GAIA does + formats (.doc/.docx/.ppt/.xls) are NOT allowed — GAIA does not currently ship extractors for them. Returns: @@ -91,9 +91,9 @@ async def upload_file(file: UploadFile = File(...)): detail=( f"File type '{ext}' is not allowed. " f"Supported types: images (png, jpg, jpeg, gif, webp, bmp, svg) " - f"and documents (pdf, txt, md, csv, json, xlsx, html, code files, etc.). " - f"Microsoft Word/PowerPoint and legacy .xls are not yet supported — " - f"save as PDF or .xlsx." + f"and documents (pdf, pptx, txt, md, csv, json, xlsx, html, code files, etc.). " + f"Microsoft Word (.doc/.docx), legacy PowerPoint (.ppt), and legacy Excel (.xls) " + f"are not yet supported — save as PDF, .pptx, or .xlsx." ), ) diff --git a/src/gaia/ui/utils.py b/src/gaia/ui/utils.py index dd93f4384..2bd78933b 100644 --- a/src/gaia/ui/utils.py +++ b/src/gaia/ui/utils.py @@ -46,13 +46,15 @@ # etc.) and poisons retrieval quality. 
# # Notable intentional exclusions (no parser ships with GAIA today): -# .doc/.docx/.ppt/.pptx — need python-docx / python-pptx -# .xls (legacy BIFF) — openpyxl only handles .xlsx; would raise on open +# .doc/.docx/.ppt — need python-docx / python-pptx (legacy .ppt) +# .xls (legacy BIFF) — openpyxl only handles .xlsx; would raise on open +# NOTE: .pptx IS supported — python-pptx ships with GAIA since v0.21. # See ``_UNSUPPORTED_CATEGORIES`` below for the user-facing rejection hint. # Image formats are tracked by issue #730 (VLM-based indexing). ALLOWED_EXTENSIONS = frozenset( { ".pdf", + ".pptx", ".txt", ".md", ".csv", @@ -285,12 +287,12 @@ def sanitize_document_path(user_path: str) -> Path: "Tip: Export your data to CSV or JSON format, then index those files.", ), "office": ( - {".doc", ".docx", ".ppt", ".pptx", ".xls"}, - "Microsoft Word, PowerPoint, and legacy Excel (.xls) files " + {".doc", ".docx", ".ppt", ".xls"}, + "Microsoft Word, legacy PowerPoint (.ppt), and legacy Excel (.xls) files " "are not yet supported — GAIA does not currently ship the " "parsers needed to extract text from these formats. " - "Tip: Save as PDF from Word/PowerPoint, or re-save .xls as " - ".xlsx — GAIA supports PDFs and modern .xlsx workbooks.", + "Tip: Save as PDF from Word, or re-save .xls as " + ".xlsx — GAIA supports PDFs, PPTX, and modern .xlsx workbooks.", ), } @@ -307,7 +309,7 @@ def sanitize_document_path(user_path: str) -> Path: detail = ( f"{hint} " - f"Supported formats: PDF, TXT, MD, CSV, JSON, XLSX, " + f"Supported formats: PDF, PPTX, TXT, MD, CSV, JSON, XLSX, " f"HTML, XML, YAML, and 30+ code file formats. " f"Want support for {category + ' files' if category else 'this file type'}? 
" f"Request it at https://github.com/amd/gaia/issues/new?title=[Feature]%20Support%20{ext}%20file%20indexing" diff --git a/tests/unit/chat/ui/test_server.py b/tests/unit/chat/ui/test_server.py index c9f41763f..a3f8fd26d 100644 --- a/tests/unit/chat/ui/test_server.py +++ b/tests/unit/chat/ui/test_server.py @@ -1447,11 +1447,12 @@ def test_allows_document_extensions(self): def test_rejects_legacy_office_extensions(self): """Office formats without extractors must be rejected, not silently indexed as binary garbage. Regression test for the allowlist cleanup - that removed .doc/.docx/.ppt/.pptx/.xls — GAIA does not currently - ship python-docx/python-pptx/xlrd so these would produce garbage.""" + that removed .doc/.docx/.ppt/.xls — GAIA does not currently + ship python-docx/xlrd so these would produce garbage. + NOTE: .pptx IS supported (python-pptx ships with GAIA).""" from pathlib import Path - for ext in [".doc", ".docx", ".ppt", ".pptx", ".xls"]: + for ext in [".doc", ".docx", ".ppt", ".xls"]: with pytest.raises(Exception) as exc_info: _validate_file_path(Path(f"/home/user/file{ext}").resolve()) assert exc_info.value.status_code == 400 diff --git a/tests/unit/rag/test_pptx_extraction.py b/tests/unit/rag/test_pptx_extraction.py new file mode 100644 index 000000000..aca010e5c --- /dev/null +++ b/tests/unit/rag/test_pptx_extraction.py @@ -0,0 +1,470 @@ +# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT +""" +Unit tests for PowerPoint (.pptx) extraction in gaia.rag.sdk. + +Covers: +- Text extraction from shapes, titles, tables, speaker notes +- Multi-slide [Page N] marker generation +- Image extraction with VLM (mocked) +- Graceful handling of empty / corrupted presentations +- Metadata structure + +Fixtures are built programmatically with python-pptx so the tests remain +hermetic and don't require committing binary fixture files to the repo. 
+""" + +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +pptx = pytest.importorskip("pptx") + +from pptx import Presentation # noqa: E402 +from pptx.util import Inches # noqa: E402 + +from gaia.rag.sdk import RAGSDK, RAGConfig # noqa: E402 + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def rag(tmp_path: Path) -> RAGSDK: + """ + A RAGSDK instance scoped to tmp_path with heavy ML deps stubbed out. + + Same pattern as test_pdf_extraction_errors.py — stubs sentence-transformers, + faiss, VLM, and chat/LLM initialization. + """ + config = RAGConfig( + cache_dir=str(tmp_path / ".gaia"), + show_stats=False, + use_local_llm=False, + ) + + fake_vlm = MagicMock(name="VLMClient") + fake_vlm.check_availability.return_value = False + + with ( + patch.object(RAGSDK, "_check_dependencies", return_value=None), + patch("gaia.rag.sdk.AgentSDK", autospec=True) as mock_agent_sdk, + patch("gaia.llm.VLMClient", return_value=fake_vlm), + ): + mock_agent_sdk.return_value = MagicMock(name="AgentSDK") + instance = RAGSDK(config=config) + + instance._test_vlm_patch = patch("gaia.llm.VLMClient", return_value=fake_vlm) + instance._test_vlm_patch.start() + yield instance + instance._test_vlm_patch.stop() + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _create_pptx(path: Path, slides: list) -> None: + """Create a test .pptx programmatically. 
+ + Each dict in *slides* may have keys: + + - ``title`` (str): slide title text + - ``body`` (str): body placeholder text + - ``notes`` (str): speaker notes + - ``table`` (list[list[str]]): rows for a table (first row is header) + """ + prs = Presentation() + for content in slides: + slide_layout = prs.slide_layouts[1] # Title + Content + slide = prs.slides.add_slide(slide_layout) + + if "title" in content and slide.shapes.title: + slide.shapes.title.text = content["title"] + + if "body" in content: + # Use the content placeholder (index 1) + for ph in slide.placeholders: + if ph.placeholder_format.idx == 1: + ph.text = content["body"] + break + + if "notes" in content: + slide.notes_slide.notes_text_frame.text = content["notes"] + + if "table" in content: + rows_data = content["table"] + num_rows = len(rows_data) + num_cols = len(rows_data[0]) if rows_data else 0 + if num_rows and num_cols: + table_shape = slide.shapes.add_table( + num_rows, num_cols, Inches(1), Inches(3), Inches(6), Inches(2) + ) + table = table_shape.table + for r_idx, row in enumerate(rows_data): + for c_idx, cell_text in enumerate(row): + table.cell(r_idx, c_idx).text = cell_text + + prs.save(str(path)) + + +def _create_pptx_with_image(path: Path, image_bytes: bytes) -> None: + """Create a .pptx with a single slide containing an embedded image.""" + import io + + prs = Presentation() + slide = prs.slides.add_slide(prs.slide_layouts[5]) # Blank layout + slide.shapes.add_picture(io.BytesIO(image_bytes), Inches(1), Inches(1)) + prs.save(str(path)) + + +def _make_red_png() -> bytes: + """Generate a minimal 10x10 red PNG image.""" + from PIL import Image + + img = Image.new("RGB", (10, 10), color="red") + import io + + buf = io.BytesIO() + img.save(buf, format="PNG") + return buf.getvalue() + + +# --------------------------------------------------------------------------- +# Tests — Text Extraction +# --------------------------------------------------------------------------- + + +class 
TestPptxTextExtraction: + """Tests for _extract_text_from_pptx text handling.""" + + def test_basic_text_extraction(self, rag, tmp_path): + """Single slide with title + body — both appear in output.""" + pptx_path = tmp_path / "basic.pptx" + _create_pptx(pptx_path, [{"title": "Hello World", "body": "Test content here"}]) + + text, num_slides, metadata = rag._extract_text_from_pptx(str(pptx_path)) + + assert "Hello World" in text + assert "Test content here" in text + assert num_slides == 1 + + def test_multi_slide_page_markers(self, rag, tmp_path): + """Multiple slides produce [Page 1], [Page 2], [Page 3] markers.""" + pptx_path = tmp_path / "multi.pptx" + _create_pptx( + pptx_path, + [ + {"title": "Slide One", "body": "Content 1"}, + {"title": "Slide Two", "body": "Content 2"}, + {"title": "Slide Three", "body": "Content 3"}, + ], + ) + + text, num_slides, _ = rag._extract_text_from_pptx(str(pptx_path)) + + assert num_slides == 3 + assert "[Page 1]" in text + assert "[Page 2]" in text + assert "[Page 3]" in text + assert "Slide One" in text + assert "Slide Two" in text + assert "Slide Three" in text + + def test_speaker_notes_extraction(self, rag, tmp_path): + """Speaker notes appear in extracted text.""" + pptx_path = tmp_path / "notes.pptx" + _create_pptx( + pptx_path, + [{"title": "Presentation", "notes": "Remember to mention the deadline"}], + ) + + text, _, _ = rag._extract_text_from_pptx(str(pptx_path)) + + assert "Speaker Notes:" in text + assert "Remember to mention the deadline" in text + + def test_table_extraction(self, rag, tmp_path): + """Tables are extracted as markdown.""" + pptx_path = tmp_path / "table.pptx" + _create_pptx( + pptx_path, + [ + { + "title": "Data Table", + "table": [ + ["Name", "Age", "City"], + ["Alice", "30", "Boston"], + ["Bob", "25", "Denver"], + ], + } + ], + ) + + text, _, _ = rag._extract_text_from_pptx(str(pptx_path)) + + assert "Alice" in text + assert "Bob" in text + assert "Boston" in text + assert "Denver" in text + # 
Should have markdown table formatting + assert "|" in text + assert "---" in text + + +# --------------------------------------------------------------------------- +# Tests — Edge Cases +# --------------------------------------------------------------------------- + + +class TestPptxEdgeCases: + """Tests for empty, blank, and corrupted presentations.""" + + def test_empty_presentation_raises(self, rag, tmp_path): + """PPTX with no slides raises ValueError with guidance.""" + pptx_path = tmp_path / "empty.pptx" + prs = Presentation() + prs.save(str(pptx_path)) + + # Empty presentation has 0 slides — should not crash + # but the result may have no content. The method checks + # has_any_content after the loop. + # With 0 slides, pages_data is empty, has_any_content is False + # unless total_slides==0 means any() on empty list returns False + with pytest.raises(ValueError, match="No extractable text"): + rag._extract_text_from_pptx(str(pptx_path)) + + def test_blank_slide(self, rag, tmp_path): + """PPTX with one completely blank slide raises ValueError.""" + pptx_path = tmp_path / "blank.pptx" + prs = Presentation() + prs.slides.add_slide(prs.slide_layouts[6]) # Blank layout + prs.save(str(pptx_path)) + + with pytest.raises(ValueError, match="No extractable text"): + rag._extract_text_from_pptx(str(pptx_path)) + + def test_corrupted_file_raises(self, rag, tmp_path): + """Garbage bytes with .pptx extension raises ValueError.""" + pptx_path = tmp_path / "corrupt.pptx" + pptx_path.write_bytes(b"this is not a pptx file at all") + + with pytest.raises(ValueError, match="corrupted|not a valid"): + rag._extract_text_from_pptx(str(pptx_path)) + + +# --------------------------------------------------------------------------- +# Tests — Metadata +# --------------------------------------------------------------------------- + + +class TestPptxMetadata: + """Tests for metadata structure from _extract_text_from_pptx.""" + + def test_metadata_keys(self, rag, tmp_path): + 
"""Metadata contains expected keys and values.""" + pptx_path = tmp_path / "meta.pptx" + _create_pptx(pptx_path, [{"title": "Test", "body": "Content"}]) + + _, num_slides, metadata = rag._extract_text_from_pptx(str(pptx_path)) + + assert num_slides == 1 + assert metadata["num_slides"] == 1 + assert metadata["vlm_slides"] == 0 + assert metadata["total_images"] == 0 + assert metadata["vlm_checked"] is True + assert metadata["vlm_available"] is False + assert metadata["pptx_status"] == "readable" + + +# --------------------------------------------------------------------------- +# Tests — VLM Integration (mocked) +# --------------------------------------------------------------------------- + + +class TestPptxVLMIntegration: + """Tests for VLM image extraction from slides.""" + + def test_vlm_not_available_skips_images(self, rag, tmp_path): + """When VLM is unavailable, embedded images are not processed. + + We add text alongside the image so the empty-content check doesn't + fire — the test is specifically about VLM skipping, not about empty + presentations. 
+ """ + PIL = pytest.importorskip("PIL") # noqa: N806, F841 + import io as _io + + pptx_path = tmp_path / "with_image.pptx" + prs = Presentation() + slide = prs.slides.add_slide(prs.slide_layouts[5]) + slide.shapes.add_picture(_io.BytesIO(_make_red_png()), Inches(1), Inches(1)) + # Add a text box so the slide is not considered empty + txBox = slide.shapes.add_textbox(Inches(3), Inches(1), Inches(3), Inches(1)) + txBox.text_frame.text = "Slide with image and text" + prs.save(str(pptx_path)) + + text, _, metadata = rag._extract_text_from_pptx(str(pptx_path)) + + # VLM is mocked as unavailable in the fixture + assert metadata["total_images"] == 0 + assert metadata["vlm_slides"] == 0 + assert "Slide with image and text" in text + + def test_vlm_processes_images(self, rag, tmp_path): + """When VLM is available, extracted image text appears in output.""" + PIL = pytest.importorskip("PIL") # noqa: N806, F841 + import io as _io + + pptx_path = tmp_path / "vlm_test.pptx" + # Create slide with both an image and text (so empty-check doesn't fire) + prs = Presentation() + slide = prs.slides.add_slide(prs.slide_layouts[5]) + slide.shapes.add_picture(_io.BytesIO(_make_red_png()), Inches(1), Inches(1)) + txBox = slide.shapes.add_textbox(Inches(3), Inches(1), Inches(3), Inches(1)) + txBox.text_frame.text = "Slide with VLM image" + prs.save(str(pptx_path)) + + # Patch VLM to be available and return text + fake_vlm = MagicMock(name="VLMClient") + fake_vlm.check_availability.return_value = True + fake_vlm.extract_from_page_images.return_value = [ + { + "image_num": 1, + "text": "Extracted text from slide image", + "dimensions": "10x10", + "size_kb": 0.5, + } + ] + + # Need to re-patch VLM for this specific test + rag._test_vlm_patch.stop() + with patch("gaia.llm.VLMClient", return_value=fake_vlm): + text, _, metadata = rag._extract_text_from_pptx(str(pptx_path)) + + # Re-start the original patch for fixture cleanup + rag._test_vlm_patch.start() + + assert "Extracted text from slide 
image" in text + assert metadata["total_images"] == 1 + assert metadata["vlm_slides"] == 1 + + +# --------------------------------------------------------------------------- +# Tests — Integration with _extract_text_from_file dispatcher +# --------------------------------------------------------------------------- + + +class TestPptxDispatcher: + """Tests for .pptx routing through _extract_text_from_file.""" + + def test_pptx_dispatches_correctly(self, rag, tmp_path): + """_extract_text_from_file routes .pptx to _extract_text_from_pptx.""" + pptx_path = tmp_path / "dispatch.pptx" + _create_pptx(pptx_path, [{"title": "Dispatch Test", "body": "Works"}]) + + text, metadata = rag._extract_text_from_file(str(pptx_path)) + + assert "Dispatch Test" in text + assert "Works" in text + assert metadata["num_pages"] == 1 # num_slides mapped to num_pages + + +# --------------------------------------------------------------------------- +# Tests — pptx_utils module +# --------------------------------------------------------------------------- + + +class TestPptxUtils: + """Direct tests for pptx_utils helper functions.""" + + def test_extract_text_from_slide(self): + """extract_text_from_slide captures text frame content.""" + from gaia.rag.pptx_utils import extract_text_from_slide + + prs = Presentation() + slide = prs.slides.add_slide(prs.slide_layouts[1]) + slide.shapes.title.text = "My Title" + for ph in slide.placeholders: + if ph.placeholder_format.idx == 1: + ph.text = "Body text" + break + + text = extract_text_from_slide(slide, slide_num=1) + assert "My Title" in text + assert "Body text" in text + + def test_extract_notes_from_slide(self): + """extract_notes_from_slide returns speaker notes.""" + from gaia.rag.pptx_utils import extract_notes_from_slide + + prs = Presentation() + slide = prs.slides.add_slide(prs.slide_layouts[1]) + slide.notes_slide.notes_text_frame.text = "These are my notes" + + notes = extract_notes_from_slide(slide) + assert notes == "These are my 
notes" + + def test_extract_notes_empty(self): + """extract_notes_from_slide returns empty string when no notes.""" + from gaia.rag.pptx_utils import extract_notes_from_slide + + prs = Presentation() + slide = prs.slides.add_slide(prs.slide_layouts[6]) # Blank layout + + notes = extract_notes_from_slide(slide) + assert notes == "" + + def test_count_images_in_slide_no_images(self): + """count_images_in_slide returns (False, 0) for text-only slides.""" + from gaia.rag.pptx_utils import count_images_in_slide + + prs = Presentation() + slide = prs.slides.add_slide(prs.slide_layouts[1]) + slide.shapes.title.text = "No images here" + + has_images, count = count_images_in_slide(slide) + assert has_images is False + assert count == 0 + + def test_count_images_in_slide_with_image(self): + """count_images_in_slide detects embedded images.""" + PIL = pytest.importorskip("PIL") # noqa: N806, F841 + import io + + from gaia.rag.pptx_utils import count_images_in_slide + + prs = Presentation() + slide = prs.slides.add_slide(prs.slide_layouts[5]) + slide.shapes.add_picture(io.BytesIO(_make_red_png()), Inches(1), Inches(1)) + + has_images, count = count_images_in_slide(slide) + assert has_images is True + assert count >= 1 + + def test_table_to_markdown(self): + """_table_to_markdown produces valid markdown table.""" + from gaia.rag.pptx_utils import _table_to_markdown + + prs = Presentation() + slide = prs.slides.add_slide(prs.slide_layouts[5]) + table_shape = slide.shapes.add_table( + 2, 3, Inches(1), Inches(1), Inches(6), Inches(2) + ) + table = table_shape.table + table.cell(0, 0).text = "Name" + table.cell(0, 1).text = "Age" + table.cell(0, 2).text = "City" + table.cell(1, 0).text = "Alice" + table.cell(1, 1).text = "30" + table.cell(1, 2).text = "Boston" + + md = _table_to_markdown(table) + assert "| Name | Age | City |" in md + assert "| --- | --- | --- |" in md + assert "| Alice | 30 | Boston |" in md From 956e3226c28ac0bbb9143309dfb2b3345b6863b4 Mon Sep 17 00:00:00 2001 
From: Ovtcharov Date: Thu, 28 May 2026 17:59:07 -0700 Subject: [PATCH 2/6] =?UTF-8?q?fix(rag):=20address=20PPTX=20review=20feedb?= =?UTF-8?q?ack=20=E2=80=94=20import=20ordering,=20docs,=20logging?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/guides/chat.mdx | 13 +++++++------ docs/sdk/sdks/rag.mdx | 2 +- src/gaia/rag/sdk.py | 15 +++++++++------ 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/docs/guides/chat.mdx b/docs/guides/chat.mdx index e2a66bde9..c919d96df 100644 --- a/docs/guides/chat.mdx +++ b/docs/guides/chat.mdx @@ -102,7 +102,7 @@ gaia chat --query "Hello" --show-stats ## Document Q&A (RAG) -RAG (Retrieval-Augmented Generation) enables chatting with PDF documents using semantic search and context retrieval. +RAG (Retrieval-Augmented Generation) enables chatting with PDF and PowerPoint (.pptx) documents using semantic search and context retrieval. ### CLI with RAG @@ -110,15 +110,16 @@ RAG (Retrieval-Augmented Generation) enables chatting with PDF documents using s ```bash - # Chat with single document + # Chat with a PDF or PowerPoint document gaia chat --index manual.pdf + gaia chat --index slides.pptx ``` ```bash - # Chat with multiple documents - gaia chat --index doc1.pdf doc2.pdf doc3.pdf + # Chat with multiple documents (PDF and PPTX supported) + gaia chat --index doc1.pdf doc2.pdf slides.pptx ``` @@ -131,7 +132,7 @@ RAG (Retrieval-Augmented Generation) enables chatting with PDF documents using s ```bash - # Auto-index every PDF in a folder, and any new ones dropped in later + # Auto-index every PDF/PPTX in a folder, and any new ones dropped in later gaia chat --watch ./docs ``` @@ -152,7 +153,7 @@ RAG (Retrieval-Augmented Generation) enables chatting with PDF documents using s - **PDF Indexing Requirements:** Processing PDFs with images requires a Vision Language Model (VLM). GAIA uses `Qwen3-VL-4B-Instruct-GGUF` by default for extracting text from images in PDFs. 
+ **Document Indexing Requirements:** Processing PDFs and PPTX files with images requires a Vision Language Model (VLM). GAIA uses `Qwen3-VL-4B-Instruct-GGUF` by default for extracting text from images in documents. To download all models needed for chat (including VLM): ```bash diff --git a/docs/sdk/sdks/rag.mdx b/docs/sdk/sdks/rag.mdx index 646f0b92e..cc3236f24 100644 --- a/docs/sdk/sdks/rag.mdx +++ b/docs/sdk/sdks/rag.mdx @@ -62,7 +62,7 @@ This happens once when you index documents: ```mermaid %%{init: {'theme':'base', 'themeVariables': { 'primaryColor':'#ED1C24', 'primaryTextColor':'#fff', 'primaryBorderColor':'#C8171E', 'lineColor':'#F4484D', 'secondaryColor':'#2d2d2d', 'tertiaryColor':'#f5f5f5', 'fontFamily': 'system-ui, -apple-system, sans-serif'}}}%% flowchart TD - A(["PDF Document"]) --> B(["EXTRACT TEXT"]) + A(["PDF / PPTX Document"]) --> B(["EXTRACT TEXT"]) B --> C(["SPLIT INTO CHUNKS"]) C --> D(["GENERATE EMBEDDINGS"]) D --> E[("STORE IN FAISS")] diff --git a/src/gaia/rag/sdk.py b/src/gaia/rag/sdk.py index 88b7cbf96..13dde1ca8 100644 --- a/src/gaia/rag/sdk.py +++ b/src/gaia/rag/sdk.py @@ -936,6 +936,11 @@ def _extract_text_from_pptx(self, pptx_path: str) -> tuple: total_slides = len(prs.slides) self.log.info(f"📊 Extracting text from {total_slides} slides...") + from gaia.rag.pptx_utils import ( # pylint: disable=import-outside-toplevel + extract_notes_from_slide, + extract_text_from_slide, + ) + # Initialize VLM client (auto-enabled if available) vlm = None vlm_available = False @@ -946,8 +951,6 @@ def _extract_text_from_pptx(self, pptx_path: str) -> tuple: from gaia.rag.pptx_utils import ( # pylint: disable=import-outside-toplevel count_images_in_slide, extract_images_from_slide, - extract_notes_from_slide, - extract_text_from_slide, ) vlm = VLMClient( @@ -1007,8 +1010,8 @@ def _extract_text_from_pptx(self, pptx_path: str) -> tuple: if vlm_available: try: has_imgs, num_imgs = count_images_in_slide(slide) - except Exception: # pylint: 
disable=broad-except - pass + except Exception as e: # pylint: disable=broad-except + self.log.debug("count_images_in_slide failed on slide %d: %s", i, e) # Step 4: Extract from images if present image_texts = [] @@ -1060,8 +1063,8 @@ def _extract_text_from_pptx(self, pptx_path: str) -> tuple: if vlm_available and vlm: try: vlm.cleanup() - except Exception: # pylint: disable=broad-except - pass + except Exception as e: # pylint: disable=broad-except + self.log.debug("VLM cleanup failed: %s", e) extract_duration = time_module.time() - extract_start From f961999befd8558d3cd443c4cf8fdee8f199766b Mon Sep 17 00:00:00 2001 From: Ovtcharov Date: Thu, 28 May 2026 18:22:53 -0700 Subject: [PATCH 3/6] style(rag): fix Black formatting in PPTX debug logging --- src/gaia/rag/sdk.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gaia/rag/sdk.py b/src/gaia/rag/sdk.py index 13dde1ca8..70e37e7b6 100644 --- a/src/gaia/rag/sdk.py +++ b/src/gaia/rag/sdk.py @@ -1011,7 +1011,9 @@ def _extract_text_from_pptx(self, pptx_path: str) -> tuple: try: has_imgs, num_imgs = count_images_in_slide(slide) except Exception as e: # pylint: disable=broad-except - self.log.debug("count_images_in_slide failed on slide %d: %s", i, e) + self.log.debug( + "count_images_in_slide failed on slide %d: %s", i, e + ) # Step 4: Extract from images if present image_texts = [] From 2ae0fd0e6c56393e5425194810fd23850c146822 Mon Sep 17 00:00:00 2001 From: Ovtcharov Date: Thu, 28 May 2026 20:24:46 -0700 Subject: [PATCH 4/6] fix(rag): address remaining PPTX review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move `import zipfile` to module-level, surface `pptx_status` in index_document stats, harden test VLM patch with try/finally, use modern type hints, add logger.debug for slide_num, warn on group shape depth limit, and fix txBox → tx_box naming. 
--- src/gaia/rag/pptx_utils.py | 22 ++++++++++++--------- src/gaia/rag/sdk.py | 4 +++- tests/unit/rag/test_pptx_extraction.py | 27 +++++++++++++------------- 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/src/gaia/rag/pptx_utils.py b/src/gaia/rag/pptx_utils.py index 951a618cc..0f10db41e 100644 --- a/src/gaia/rag/pptx_utils.py +++ b/src/gaia/rag/pptx_utils.py @@ -9,9 +9,10 @@ Image processing mirrors ``pdf_utils.py`` (resize, compress, same dict format). """ +from __future__ import annotations + import io import logging -from typing import List, Tuple logger = logging.getLogger(__name__) @@ -22,7 +23,7 @@ MAX_GROUP_DEPTH = 5 -def extract_images_from_slide(slide, slide_num: int) -> List[dict]: +def extract_images_from_slide(slide, slide_num: int) -> list[dict]: """ Extract embedded images from a PPTX slide. @@ -39,7 +40,7 @@ def extract_images_from_slide(slide, slide_num: int) -> List[dict]: [{"image_bytes": bytes, "width": int, "height": int, "format": "png", "size_kb": float}, ...] """ - images: List[dict] = [] + images: list[dict] = [] try: from PIL import Image @@ -146,7 +147,7 @@ def extract_images_from_slide(slide, slide_num: int) -> List[dict]: return images -def count_images_in_slide(slide) -> Tuple[bool, int]: +def count_images_in_slide(slide) -> tuple[bool, int]: """ Fast check for embedded image presence without extraction. @@ -163,9 +164,7 @@ def count_images_in_slide(slide) -> Tuple[bool, int]: return (count > 0, count) -def extract_text_from_slide( - slide, slide_num: int # pylint: disable=unused-argument -) -> str: +def extract_text_from_slide(slide, slide_num: int) -> str: """ Extract all native text from a PPTX slide. @@ -179,7 +178,8 @@ def extract_text_from_slide( Returns: Concatenated slide text with paragraph boundaries. 
""" - parts: List[str] = [] + logger.debug("Extracting text from slide %d", slide_num) + parts: list[str] = [] for shape in _iter_shapes(slide.shapes): # Table shapes — format as markdown @@ -236,7 +236,11 @@ def _iter_shapes(shapes, depth: int = 0): if shape.shape_type == MSO_SHAPE_TYPE.GROUP: if depth < MAX_GROUP_DEPTH: yield from _iter_shapes(shape.shapes, depth + 1) - # else: skip the group entirely at the depth limit + else: + logger.warning( + "Group shape nesting exceeds MAX_GROUP_DEPTH (%d); skipping children", + MAX_GROUP_DEPTH, + ) else: yield shape diff --git a/src/gaia/rag/sdk.py b/src/gaia/rag/sdk.py index 70e37e7b6..f469a775b 100644 --- a/src/gaia/rag/sdk.py +++ b/src/gaia/rag/sdk.py @@ -15,6 +15,7 @@ import secrets import threading import time +import zipfile from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, List, Optional @@ -883,7 +884,6 @@ def _extract_text_from_pptx(self, pptx_path: str) -> tuple: # Guard against zip bombs: .pptx is a ZIP container. Check that # the total uncompressed size is sane before handing it to python-pptx. 
- import zipfile try: with zipfile.ZipFile(pptx_path, "r") as zf: @@ -2678,6 +2678,8 @@ def index_document(self, file_path: str) -> Dict[str, Any]: stats["total_chunks"] = len(self.chunks) if file_type == ".pdf": stats["pdf_status"] = "readable" + elif file_type == ".pptx": + stats["pptx_status"] = "readable" return stats except PDFExtractionError as e: diff --git a/tests/unit/rag/test_pptx_extraction.py b/tests/unit/rag/test_pptx_extraction.py index aca010e5c..e441b2e98 100644 --- a/tests/unit/rag/test_pptx_extraction.py +++ b/tests/unit/rag/test_pptx_extraction.py @@ -305,8 +305,8 @@ def test_vlm_not_available_skips_images(self, rag, tmp_path): slide = prs.slides.add_slide(prs.slide_layouts[5]) slide.shapes.add_picture(_io.BytesIO(_make_red_png()), Inches(1), Inches(1)) # Add a text box so the slide is not considered empty - txBox = slide.shapes.add_textbox(Inches(3), Inches(1), Inches(3), Inches(1)) - txBox.text_frame.text = "Slide with image and text" + tx_box = slide.shapes.add_textbox(Inches(3), Inches(1), Inches(3), Inches(1)) + tx_box.text_frame.text = "Slide with image and text" prs.save(str(pptx_path)) text, _, metadata = rag._extract_text_from_pptx(str(pptx_path)) @@ -326,8 +326,8 @@ def test_vlm_processes_images(self, rag, tmp_path): prs = Presentation() slide = prs.slides.add_slide(prs.slide_layouts[5]) slide.shapes.add_picture(_io.BytesIO(_make_red_png()), Inches(1), Inches(1)) - txBox = slide.shapes.add_textbox(Inches(3), Inches(1), Inches(3), Inches(1)) - txBox.text_frame.text = "Slide with VLM image" + tx_box = slide.shapes.add_textbox(Inches(3), Inches(1), Inches(3), Inches(1)) + tx_box.text_frame.text = "Slide with VLM image" prs.save(str(pptx_path)) # Patch VLM to be available and return text @@ -344,15 +344,16 @@ def test_vlm_processes_images(self, rag, tmp_path): # Need to re-patch VLM for this specific test rag._test_vlm_patch.stop() - with patch("gaia.llm.VLMClient", return_value=fake_vlm): - text, _, metadata = 
rag._extract_text_from_pptx(str(pptx_path)) - - # Re-start the original patch for fixture cleanup - rag._test_vlm_patch.start() - - assert "Extracted text from slide image" in text - assert metadata["total_images"] == 1 - assert metadata["vlm_slides"] == 1 + try: + with patch("gaia.llm.VLMClient", return_value=fake_vlm): + text, _, metadata = rag._extract_text_from_pptx(str(pptx_path)) + + assert "Extracted text from slide image" in text + assert metadata["total_images"] == 1 + assert metadata["vlm_slides"] == 1 + finally: + # Re-start the original patch for fixture cleanup + rag._test_vlm_patch.start() # --------------------------------------------------------------------------- From 97c60a66ca62adb0d161c90596e30b31b865913d Mon Sep 17 00:00:00 2001 From: Ovtcharov Date: Fri, 29 May 2026 01:36:06 -0700 Subject: [PATCH 5/6] feat(rag): PPTX to PDF conversion via PowerPoint COM for full-fidelity extraction When PowerPoint is installed (Windows), automatically converts PPTX to PDF and feeds it through the existing PDF pipeline -- capturing charts, SmartArt, and visual layout that python-pptx cannot access. Speaker notes are still extracted via python-pptx since they do not appear in PDF renders. Falls back gracefully to python-pptx native extraction when PowerPoint is not available (Linux, Mac, or no Office installed). 
--- src/gaia/rag/pptx_utils.py | 93 ++++++++++++++++++++++++++ src/gaia/rag/sdk.py | 70 +++++++++++++++++++ tests/unit/rag/test_pptx_extraction.py | 84 +++++++++++++++++++++++ 3 files changed, 247 insertions(+) diff --git a/src/gaia/rag/pptx_utils.py b/src/gaia/rag/pptx_utils.py index 0f10db41e..a42db42c0 100644 --- a/src/gaia/rag/pptx_utils.py +++ b/src/gaia/rag/pptx_utils.py @@ -13,6 +13,9 @@ import io import logging +import os +import platform +import subprocess logger = logging.getLogger(__name__) @@ -267,3 +270,93 @@ def _table_to_markdown(table) -> str: rows.insert(1, separator) return "\n".join(rows) + + +# --------------------------------------------------------------------------- +# PPTX → PDF conversion (Windows only, requires PowerPoint) +# --------------------------------------------------------------------------- + + +def convert_pptx_to_pdf(pptx_path: str, output_dir: str) -> str | None: + """Convert a PPTX file to PDF using PowerPoint COM automation. + + Only works on Windows with Microsoft PowerPoint installed. Returns the + path to the generated PDF on success, or ``None`` if conversion is not + possible (wrong OS, PowerPoint not installed, timeout, etc.). + + This function never raises — the caller should fall back to python-pptx + native extraction when ``None`` is returned. + + Args: + pptx_path: Absolute path to the ``.pptx`` file. + output_dir: Directory where the PDF will be written. + + Returns: + Absolute path to the generated PDF, or ``None``. + """ + if platform.system() != "Windows": + logger.debug( + "PPTX→PDF conversion requires Windows (current: %s)", platform.system() + ) + return None + + from pathlib import Path # local import; only reached on the Windows path + + pptx_abs = str(Path(pptx_path).resolve()) + pdf_name = Path(pptx_path).stem + ".pdf" + pdf_abs = str(Path(output_dir).resolve() / pdf_name) + + # PowerShell script using PowerPoint COM. Single-quoted paths handle + # spaces, not apostrophes. 
MsoTriState values are raw integers to avoid + # needing the Office interop assembly. + # msoTrue = -1, msoFalse = 0, ppSaveAsPDF = 32 + ps_script = ( + "$ErrorActionPreference = 'Stop'; " + "$ppt = New-Object -ComObject PowerPoint.Application; " + "try { " + f" $pres = $ppt.Presentations.Open('{pptx_abs}', " + " [int]-1, " # ReadOnly = msoTrue + " [int]0, " # Untitled = msoFalse + " [int]0" # WithWindow = msoFalse + " ); " + f" $pres.SaveAs('{pdf_abs}', 32); " # 32 = ppSaveAsPDF + " $pres.Close(); " + "} finally { " + " $ppt.Quit(); " + "}" + ) + + try: + result = subprocess.run( + ["powershell", "-NoProfile", "-Command", ps_script], + capture_output=True, + text=True, + timeout=120, + ) + + if result.returncode == 0 and os.path.exists(pdf_abs): + logger.info("PPTX→PDF conversion succeeded: %s", pdf_abs) + return pdf_abs + + logger.debug( + "PPTX→PDF conversion failed (rc=%d): %s", + result.returncode, + result.stderr.strip()[:200] if result.stderr else "(no stderr)", + ) + return None + + except subprocess.TimeoutExpired: + logger.warning("PPTX→PDF conversion timed out after 120s") + # Kill any orphaned PowerPoint process spawned by COM + subprocess.run( + ["taskkill", "/f", "/im", "POWERPNT.EXE"], + capture_output=True, + timeout=10, + ) + return None + except FileNotFoundError: + logger.debug("PowerShell not found on PATH") + return None + except Exception as e: + logger.debug("PPTX→PDF conversion failed: %s", e) + return None diff --git a/src/gaia/rag/sdk.py b/src/gaia/rag/sdk.py index f469a775b..93d187c84 100644 --- a/src/gaia/rag/sdk.py +++ b/src/gaia/rag/sdk.py @@ -937,10 +937,80 @@ def _extract_text_from_pptx(self, pptx_path: str) -> tuple: self.log.info(f"📊 Extracting text from {total_slides} slides...") from gaia.rag.pptx_utils import ( # pylint: disable=import-outside-toplevel + convert_pptx_to_pdf, extract_notes_from_slide, extract_text_from_slide, ) + # ---------------------------------------------------------- + # Fast path: PPTX → PDF via PowerPoint 
COM, then existing + # PDF pipeline. Captures charts, SmartArt, and visual + # layout that python-pptx cannot access. + # ---------------------------------------------------------- + pdf_conversion_path = None + tmp_dir = None + try: + import tempfile as _tempfile + + tmp_dir = _tempfile.mkdtemp(prefix="gaia_pptx_") + pdf_conversion_path = convert_pptx_to_pdf( + str(Path(pptx_path).resolve()), tmp_dir + ) + except Exception as conv_err: + self.log.debug("PPTX→PDF conversion not available: %s", conv_err) + + if pdf_conversion_path: + try: + if self.config.show_stats: + print( + " 📊 Using PowerPoint→PDF conversion for full-fidelity extraction" + ) + self.log.info("Using PowerPoint→PDF conversion for %s", file_name) + pdf_text, pdf_pages, pdf_metadata = self._extract_text_from_pdf( + pdf_conversion_path + ) + + # Append speaker notes (not in PDF render) + notes_parts = [] + for i, slide in enumerate(prs.slides, 1): + notes = extract_notes_from_slide(slide) + if notes: + notes_parts.append( + f"\n[Page {i}] **Speaker Notes:**\n{notes}" + ) + if notes_parts: + pdf_text += "\n\n" + "\n".join(notes_parts) + + metadata = { + "num_slides": pdf_pages, + "vlm_slides": pdf_metadata.get("vlm_pages", 0), + "total_images": pdf_metadata.get("total_images", 0), + "vlm_checked": pdf_metadata.get("vlm_checked", False), + "vlm_available": pdf_metadata.get("vlm_available", False), + "pptx_status": "readable", + "conversion": "powerpoint_com", + } + + extract_duration = time_module.time() - extract_start + self.log.info( + f"📝 Extracted {len(pdf_text):,} characters via PDF conversion in {extract_duration:.2f}s" + ) + return pdf_text, pdf_pages, metadata + except Exception as pdf_err: + self.log.warning( + "PDF extraction from converted PPTX failed, falling back: %s", + pdf_err, + ) + finally: + import shutil + + if tmp_dir: + shutil.rmtree(tmp_dir, ignore_errors=True) + + # ---------------------------------------------------------- + # Fallback: python-pptx native extraction + # 
---------------------------------------------------------- + # Initialize VLM client (auto-enabled if available) vlm = None vlm_available = False diff --git a/tests/unit/rag/test_pptx_extraction.py b/tests/unit/rag/test_pptx_extraction.py index e441b2e98..5f72ded13 100644 --- a/tests/unit/rag/test_pptx_extraction.py +++ b/tests/unit/rag/test_pptx_extraction.py @@ -469,3 +469,87 @@ def test_table_to_markdown(self): assert "| Name | Age | City |" in md assert "| --- | --- | --- |" in md assert "| Alice | 30 | Boston |" in md + + +# --------------------------------------------------------------------------- +# Tests — PowerPoint COM → PDF conversion path +# --------------------------------------------------------------------------- + + +class TestPptxPdfConversion: + """Tests for the PPTX → PDF → existing pipeline fast path.""" + + def test_pdf_conversion_used_when_available(self, rag, tmp_path): + """When convert_pptx_to_pdf returns a PDF, _extract_text_from_pdf is used.""" + pptx_path = tmp_path / "conv.pptx" + _create_pptx( + pptx_path, + [ + {"title": "Converted Slide", "body": "Body text", "notes": "My notes"}, + ], + ) + + fake_pdf_text = "[Page 1]\nConverted slide content from PDF" + fake_pdf_metadata = { + "num_pages": 1, + "vlm_pages": 0, + "total_images": 0, + "vlm_checked": True, + "vlm_available": False, + "pdf_status": "readable", + } + + with ( + patch( + "gaia.rag.pptx_utils.convert_pptx_to_pdf", + return_value="/fake/output.pdf", + ), + patch.object( + rag, + "_extract_text_from_pdf", + return_value=(fake_pdf_text, 1, fake_pdf_metadata), + ) as mock_pdf, + ): + text, num_slides, metadata = rag._extract_text_from_pptx(str(pptx_path)) + + mock_pdf.assert_called_once_with("/fake/output.pdf") + assert "Converted slide content from PDF" in text + assert "My notes" in text # Notes appended from python-pptx + assert metadata["conversion"] == "powerpoint_com" + + def test_pdf_conversion_fallback_on_none(self, rag, tmp_path): + """When convert_pptx_to_pdf 
returns None, python-pptx fallback runs.""" + pptx_path = tmp_path / "fallback.pptx" + _create_pptx( + pptx_path, [{"title": "Fallback Slide", "body": "Fallback content"}] + ) + + with patch("gaia.rag.pptx_utils.convert_pptx_to_pdf", return_value=None): + text, num_slides, metadata = rag._extract_text_from_pptx(str(pptx_path)) + + assert "Fallback Slide" in text + assert "Fallback content" in text + assert "conversion" not in metadata # No conversion key in fallback path + + def test_pdf_conversion_fallback_on_exception(self, rag, tmp_path): + """When PDF extraction from converted file fails, python-pptx fallback runs.""" + pptx_path = tmp_path / "exc.pptx" + _create_pptx(pptx_path, [{"title": "Exception Slide", "body": "Safe content"}]) + + with ( + patch( + "gaia.rag.pptx_utils.convert_pptx_to_pdf", + return_value="/fake/output.pdf", + ), + patch.object( + rag, + "_extract_text_from_pdf", + side_effect=RuntimeError("PDF parsing exploded"), + ), + ): + text, num_slides, metadata = rag._extract_text_from_pptx(str(pptx_path)) + + # Should have fallen back to python-pptx + assert "Exception Slide" in text + assert "Safe content" in text + assert "conversion" not in metadata From 4921fd6fccffe9380ad48c62c17808228c19009f Mon Sep 17 00:00:00 2001 From: Ovtcharov Date: Fri, 29 May 2026 02:12:24 -0700 Subject: [PATCH 6/6] test(rag): add PPTX end-to-end integration test with real LLM Builds a test PPTX programmatically (no sensitive files), indexes it through the full pipeline (COM conversion, PDF extraction, VLM, embeddings), then verifies LLM answers contain expected facts (cost, location, team members, speaker notes). Skips when Lemonade is not running. 
--- tests/integration/test_pptx_rag_e2e.py | 195 +++++++++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 tests/integration/test_pptx_rag_e2e.py diff --git a/tests/integration/test_pptx_rag_e2e.py b/tests/integration/test_pptx_rag_e2e.py new file mode 100644 index 000000000..f10d2a998 --- /dev/null +++ b/tests/integration/test_pptx_rag_e2e.py @@ -0,0 +1,195 @@ +# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT +""" +End-to-end integration test for PPTX extraction through the full RAG + LLM +pipeline. + +Creates a test PPTX programmatically (no sensitive files), indexes it via +RAGSDK, queries via rag.query() (real LLM on Lemonade), and verifies the +answers contain expected content. + +Skips when Lemonade is not reachable. +""" + +from __future__ import annotations + +import os +import tempfile +import time +from pathlib import Path +from urllib.error import URLError +from urllib.request import urlopen + +import pytest + + +def _lemonade_available(base_url="http://localhost:13305"): + """Check if Lemonade server is reachable.""" + try: + with urlopen(f"{base_url}/api/v1/models", timeout=3) as r: + return r.status == 200, "ok" + except (URLError, OSError) as e: + return False, f"Lemonade not reachable: {e}" + + +def _create_test_pptx(path: Path) -> None: + """Build a multi-slide PPTX with known content for Q&A verification.""" + from pptx import Presentation + from pptx.util import Inches + + prs = Presentation() + + # Slide 1: Title + slide1 = prs.slides.add_slide(prs.slide_layouts[0]) + slide1.shapes.title.text = "Project Aurora: Next-Gen Solar Panel Technology" + slide1.placeholders[1].text = "Q3 2026 Technical Review" + + # Slide 2: Problem statement + slide2 = prs.slides.add_slide(prs.slide_layouts[1]) + slide2.shapes.title.text = "Current Challenges" + slide2.placeholders[1].text = ( + "Traditional silicon solar panels have reached a theoretical efficiency " + "ceiling of 29%. 
Manufacturing costs remain high at $0.30 per watt. " + "Degradation rates of 0.5% per year reduce lifetime output by 12% over " + "25 years. Project Aurora addresses these limitations with perovskite " + "tandem cell technology." + ) + + # Slide 3: Solution with specific data + slide3 = prs.slides.add_slide(prs.slide_layouts[1]) + slide3.shapes.title.text = "Aurora Perovskite Solution" + slide3.placeholders[1].text = ( + "The Aurora tandem cell combines perovskite and silicon layers to achieve " + "33.7% efficiency in lab conditions. Manufacturing cost target is $0.18 " + "per watt using roll-to-roll printing. The perovskite layer absorbs " + "blue and green wavelengths while silicon handles red and infrared. " + "Field trials in Phoenix, Arizona showed 31.2% real-world efficiency." + ) + + # Slide 4: Timeline with speaker notes + slide4 = prs.slides.add_slide(prs.slide_layouts[1]) + slide4.shapes.title.text = "Project Timeline" + slide4.placeholders[1].text = ( + "Phase 1 (Q1 2026): Lab prototype validation. " + "Phase 2 (Q3 2026): Pilot manufacturing line in Dresden, Germany. " + "Phase 3 (Q1 2027): Commercial production at 500MW annual capacity. " + "Total investment: $47 million across all phases." + ) + slide4.notes_slide.notes_text_frame.text = ( + "The Dresden facility was chosen for its existing semiconductor " + "supply chain and skilled workforce. EU Green Deal subsidies cover " + "30% of Phase 2 costs." + ) + + # Slide 5: Team with a table + slide5 = prs.slides.add_slide(prs.slide_layouts[5]) # Title Only + tx = slide5.shapes.add_textbox(Inches(0.5), Inches(0.3), Inches(8), Inches(0.5)) + tx.text_frame.text = "Core Team" + table_shape = slide5.shapes.add_table( + 4, 3, Inches(0.5), Inches(1), Inches(8), Inches(3) + ) + table = table_shape.table + for c, text in enumerate(["Name", "Role", "Location"]): + table.cell(0, c).text = text + for r, row in enumerate( + [ + ["Dr. 
Elena Vasquez", "Chief Scientist", "MIT"], + ["Marcus Chen", "Manufacturing Lead", "Dresden"], + ["Aisha Patel", "Field Testing Director", "Phoenix"], + ], + 1, + ): + for c, text in enumerate(row): + table.cell(r, c).text = text + + prs.save(str(path)) + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +@pytest.mark.integration +class TestPptxRagE2E: + """End-to-end PPTX → RAG → LLM tests against real Lemonade.""" + + @pytest.fixture(autouse=True) + def _skip_without_lemonade(self): + ok, reason = _lemonade_available() + if not ok: + pytest.skip(f"Skipping E2E: {reason}") + + @pytest.fixture + def pptx_path(self, tmp_path): + path = tmp_path / "aurora_solar.pptx" + _create_test_pptx(path) + return str(path) + + @pytest.fixture + def rag(self, tmp_path, pptx_path): + from gaia.rag.sdk import RAGSDK, RAGConfig + + config = RAGConfig( + cache_dir=str(tmp_path / ".rag_cache"), + show_stats=False, + vlm_model="Gemma-4-E4B-it-GGUF", + ) + rag = RAGSDK(config=config) + stats = rag.index_document(pptx_path) + assert stats["success"], f"Indexing failed: {stats.get('error')}" + assert stats["num_chunks"] > 0 + return rag + + def test_index_creates_chunks(self, rag): + """Indexing a PPTX creates a non-empty vector index.""" + assert len(rag.chunks) >= 1 + + def test_retrieval_finds_efficiency_data(self, rag): + """Semantic search retrieves the correct chunk for efficiency questions.""" + chunks, scores = rag._retrieve_chunks( + "What efficiency does the Aurora panel achieve?" 
+ ) + assert chunks, "No chunks retrieved" + top = chunks[0].lower() + assert "33.7" in top or "efficiency" in top + + def test_llm_answers_cost_question(self, rag): + """Full LLM Q&A returns the manufacturing cost from the slides.""" + start = time.time() + response = rag.query("What is the manufacturing cost target for Aurora?") + latency = time.time() - start + + answer = response.text if hasattr(response, "text") else str(response) + answer_lower = answer.lower() + + # The slides say "$0.18 per watt" + assert ( + "$0.18" in answer_lower or "0.18" in answer_lower + ), f"Expected '$0.18' in answer but got: {answer[:300]}" + assert latency < 120, f"LLM query took too long: {latency:.1f}s" + + def test_llm_answers_location_question(self, rag): + """LLM correctly identifies the manufacturing location.""" + response = rag.query("Where is the pilot manufacturing facility located?") + answer = response.text if hasattr(response, "text") else str(response) + + assert ( + "dresden" in answer.lower() + ), f"Expected 'Dresden' in answer but got: {answer[:300]}" + + def test_llm_answers_team_question(self, rag): + """LLM retrieves team member info from the table.""" + response = rag.query("Who is the Chief Scientist on the project?") + answer = response.text if hasattr(response, "text") else str(response) + + assert ( + "vasquez" in answer.lower() or "elena" in answer.lower() + ), f"Expected 'Vasquez' or 'Elena' in answer but got: {answer[:300]}" + + def test_speaker_notes_indexed(self, rag): + """Speaker notes content is retrievable.""" + chunks, _ = rag._retrieve_chunks("EU Green Deal subsidies") + assert chunks, "No chunks retrieved for speaker notes query" + found = any("green deal" in c.lower() or "subsid" in c.lower() for c in chunks) + assert found, "Speaker notes content not found in any chunk"