From 841ca2db63f899bfcee156fe481caee34608b96e Mon Sep 17 00:00:00 2001 From: Mubashir Rahim Date: Wed, 3 Jun 2026 14:57:01 +0500 Subject: [PATCH] fix: keep word boundaries when a chunk token repeats the final token RecursiveCharacterTextSplitter re-appends the separator stripped by str.split() to every part except the last one. The check used `part != splits[-1]`, comparing by value instead of by index. When an earlier token had the same value as the final token (e.g. "cat dog bird cat"), its trailing separator was dropped and it was merged into the next token ("catdog"), corrupting chunk content and degrading retrieval. Compare by index instead, matching the already-correct fallback splitter in core/utils/fast_ops.py. Adds regression tests that fail before and pass after the fix. Co-Authored-By: Claude Opus 4.8 --- core/parser/morphik_parser.py | 5 +- .../unit/test_recursive_text_splitter.py | 112 ++++++++++++++++++ 2 files changed, 115 insertions(+), 2 deletions(-) create mode 100644 core/tests/unit/test_recursive_text_splitter.py diff --git a/core/parser/morphik_parser.py b/core/parser/morphik_parser.py index 7e781f7e..0e79100f 100644 --- a/core/parser/morphik_parser.py +++ b/core/parser/morphik_parser.py @@ -76,8 +76,9 @@ def _split_recursive(self, text: str, separators: list[str]) -> list[str]: splits = list(text) chunks = [] current = "" - for part in splits: - add_part = part + (sep if sep and part != splits[-1] else "") + last_index = len(splits) - 1 + for index, part in enumerate(splits): + add_part = part + (sep if sep and index != last_index else "") if self.length_function(current + add_part) > self.chunk_size: if current: chunks.append(current) diff --git a/core/tests/unit/test_recursive_text_splitter.py b/core/tests/unit/test_recursive_text_splitter.py new file mode 100644 index 00000000..5ecd4dd2 --- /dev/null +++ b/core/tests/unit/test_recursive_text_splitter.py @@ -0,0 +1,112 @@ +"""Unit tests for the RecursiveCharacterTextSplitter used by StandardChunker. + +The splitter is defined inside ``core.parser.morphik_parser`` which imports heavy +optional dependencies (docling, openpyxl, ...). We stub those modules so the pure +chunking logic can be exercised in isolation. +""" + +import sys +import types + +import pytest + + +MODULE_UNDER_TEST = "core.parser.morphik_parser" + + +def _drop_module(name): + sys.modules.pop(name, None) + if "." not in name: + return + parent_name, attr_name = name.rsplit(".", 1) + parent = sys.modules.get(parent_name) + if parent is not None and hasattr(parent, attr_name): + delattr(parent, attr_name) + + +def _stub_module(monkeypatch, name): + module = types.ModuleType(name) + module.__path__ = [] + monkeypatch.setitem(sys.modules, name, module) + if "." in name: + parent_name, attr_name = name.rsplit(".", 1) + parent = sys.modules.get(parent_name) + if parent is not None: + monkeypatch.setattr(parent, attr_name, module, raising=False) + return module + + +def _install_dependency_stubs(monkeypatch): + _stub_module(monkeypatch, "docling") + _stub_module(monkeypatch, "docling.datamodel") + docling_base_models = _stub_module(monkeypatch, "docling.datamodel.base_models") + docling_base_models.InputFormat = types.SimpleNamespace(PDF="pdf") + + docling_pipeline_options = _stub_module(monkeypatch, "docling.datamodel.pipeline_options") + docling_pipeline_options.PdfPipelineOptions = type("PdfPipelineOptions", (), {}) + docling_pipeline_options.EasyOcrOptions = type("EasyOcrOptions", (), {}) + docling_pipeline_options.TableStructureOptions = type("TableStructureOptions", (), {}) + + docling_document_converter = _stub_module(monkeypatch, "docling.document_converter") + docling_document_converter.DocumentConverter = type( + "DocumentConverter", (), {"__init__": lambda self, *a, **kw: None} + ) + docling_document_converter.PdfFormatOption = type( + "PdfFormatOption", (), {"__init__": lambda self, *a, **kw: None} + ) + + assemblyai = _stub_module(monkeypatch, "assemblyai") + assemblyai.settings = types.SimpleNamespace(api_key=None) + assemblyai.Transcript = type("Transcript", (), {}) + assemblyai.TranscriptionConfig = type("TranscriptionConfig", (), {"__init__": lambda self, *a, **kw: None}) + assemblyai.Transcriber = type("Transcriber", (), {"__init__": lambda self, *a, **kw: None}) + + cv2 = _stub_module(monkeypatch, "cv2") + cv2.CAP_PROP_FPS = 0 + cv2.CAP_PROP_FRAME_COUNT = 1 + cv2.VideoCapture = lambda path: None + + _stub_module(monkeypatch, "litellm") + _stub_module(monkeypatch, "openpyxl") + + filetype = _stub_module(monkeypatch, "filetype") + filetype.guess = lambda content: None + + +@pytest.fixture +def splitter_cls(monkeypatch): + _drop_module(MODULE_UNDER_TEST) + _install_dependency_stubs(monkeypatch) + + from core.parser.morphik_parser import RecursiveCharacterTextSplitter + + yield RecursiveCharacterTextSplitter + + _drop_module(MODULE_UNDER_TEST) + + +def test_duplicate_token_keeps_word_boundary(splitter_cls): + """A token that also equals the final token must not lose its trailing separator. + + Regression: the old code compared ``part != splits[-1]`` by value, so the first + "cat" had its separator dropped and got merged into the next word ("catdog"). + """ + splitter = splitter_cls(chunk_size=8, chunk_overlap=0, separators=[" ", ""]) + chunks = [chunk.content for chunk in splitter.split_text("cat dog bird cat")] + + rejoined = "".join(chunks) + assert "catdog" not in rejoined + assert "cat dog" in rejoined + # The reconstructed text must contain every original word with boundaries intact. + for word in ("cat", "dog", "bird"): + assert word in rejoined.split() + + +def test_no_separator_loss_on_repeated_tokens(splitter_cls): + """Repeated tokens elsewhere in the text keep their separators too.""" + splitter = splitter_cls(chunk_size=12, chunk_overlap=0, separators=[" ", ""]) + chunks = [chunk.content for chunk in splitter.split_text("the quick fox the")] + + rejoined = "".join(chunks) + assert "thequick" not in rejoined + assert rejoined.split() == ["the", "quick", "fox", "the"]