-
Notifications
You must be signed in to change notification settings - Fork 398
feat(logosdb): add LogosDB vector database integration #782
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
a9ec9c6
01e0fd0
812eea6
b932872
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| from typing import Annotated, Unpack | ||
|
|
||
| import click | ||
|
|
||
| from vectordb_bench.backend.clients import DB | ||
| from vectordb_bench.cli.cli import ( | ||
| CommonTypedDict, | ||
| cli, | ||
| click_parameter_decorators_from_typed_dict, | ||
| run, | ||
| ) | ||
|
|
||
| DBTYPE = DB.LogosDB | ||
|
|
||
|
|
||
class LogosDBTypedDict(CommonTypedDict):
    """CLI parameters for the LogosDB command: the common options plus ``--uri``."""

    # Location of the LogosDB directory; optional on the command line and
    # falls back to the default below.
    uri: Annotated[
        str,
        click.option(
            "--uri",
            default="/tmp/vectordbbench_logosdb",
            show_default=True,
            required=False,
            type=str,
            help="Path to LogosDB directory (local embedded DB)",
        ),
    ]
|
|
||
|
|
||
@cli.command()
@click_parameter_decorators_from_typed_dict(LogosDBTypedDict)
def LogosDB(**parameters: Unpack[LogosDBTypedDict]):
    """Run the benchmark against a local LogosDB directory (concurrent search disabled)."""
    # Imported lazily so the config module is only loaded when this command runs.
    from .config import LogosDBConfig, LogosDBIndexConfig

    # LogosDB documents one DB directory as single-process, while the
    # concurrent-search stage starts several worker processes against the same
    # --uri. Force the stage off regardless of the inherited default or any
    # value given on the command line.
    parameters["search_concurrent"] = False

    run(
        db=DBTYPE,
        db_config=LogosDBConfig(uri=parameters["uri"]),
        db_case_config=LogosDBIndexConfig(),
        **parameters,
    )
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,29 @@ | ||
| from pydantic import BaseModel | ||
|
|
||
| from ..api import DBCaseConfig, DBConfig, MetricType | ||
|
|
||
|
|
||
class LogosDBConfig(DBConfig):
    """Connection settings for LogosDB: just the path of the database directory."""

    uri: str = "/tmp/vectordbbench_logosdb"

    def to_dict(self) -> dict:
        """Return the settings as a plain dict holding the single ``uri`` key."""
        return dict(uri=self.uri)
|
|
||
|
|
||
class LogosDBIndexConfig(BaseModel, DBCaseConfig):
    """Case configuration for LogosDB; the only setting is an optional metric type."""

    metric_type: MetricType | None = None

    def parse_metric(self) -> int:
        """Translate ``metric_type`` into the matching ``logosdb`` distance constant.

        ``MetricType.L2`` and ``MetricType.IP`` select ``DIST_L2`` and
        ``DIST_IP``; every other value, including ``None``, selects
        ``DIST_COSINE``.
        """
        # Imported here so the logosdb package is only required when a metric
        # is actually resolved.
        import logosdb

        metric = self.metric_type
        if metric == MetricType.L2:
            chosen = logosdb.DIST_L2
        elif metric == MetricType.IP:
            chosen = logosdb.DIST_IP
        else:
            chosen = logosdb.DIST_COSINE
        return chosen

    def index_param(self) -> dict:
        """Return the index-build parameters, which is always an empty dict."""
        return dict()

    def search_param(self) -> dict:
        """Return the search parameters, which is always an empty dict."""
        return dict()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,85 @@ | ||
| import logging | ||
| import os | ||
| import shutil | ||
| from collections.abc import Iterable | ||
| from contextlib import contextmanager | ||
|
|
||
| import numpy as np | ||
|
|
||
| from ..api import VectorDB | ||
| from .config import LogosDBIndexConfig | ||
|
|
||
| log = logging.getLogger(__name__) | ||
|
|
||
|
|
||
class LogosDB(VectorDB):
    """VectorDBBench client backed by a LogosDB database stored in a local directory."""

    def __init__(
        self,
        dim: int,
        db_config: dict,
        db_case_config: LogosDBIndexConfig,
        collection_name: str = "LogosDBCollection",
        drop_old: bool = False,
        name: str = "LogosDB",
        **kwargs,
    ):
        """Record the configuration and open the database once.

        Args:
            dim: Vector dimensionality passed to ``logosdb.DB``.
            db_config: Mapping that must contain the ``"uri"`` directory path.
            db_case_config: Case config; its ``parse_metric()`` supplies the distance.
            collection_name: Accepted for interface compatibility; not used here.
            drop_old: When true, delete an existing directory at ``uri`` first.
            name: Label used as the prefix of log messages.
            **kwargs: Accepted and ignored.
        """
        self.name = name
        self.db_config = db_config
        self.case_config = db_case_config
        self.dim = dim
        self.uri = db_config["uri"]
        self.db = None

        # NOTE(review): in the PR thread a collaborator marked the drop_old
        # check below as "must-change" (the rest of that comment is cut off in
        # this capture); the author replied "ok thanks a lot, fixed here:
        # 01e0fd0". The requested change is not visible here, so the logic is
        # kept as shown.
        if drop_old and os.path.exists(self.uri):
            log.info(f"{self.name} drop_old: removing {self.uri}")
            shutil.rmtree(self.uri)

        # Imported lazily so the logosdb package is only needed once a client
        # is actually constructed.
        import logosdb as _logosdb

        # Open the database once and drop the handle right away; init()
        # reopens it for the actual work.
        distance = self.case_config.parse_metric()
        db = _logosdb.DB(self.uri, dim=self.dim, distance=distance)
        log.info(f"{self.name} initialized at {self.uri} dim={dim} distance={distance}")
        del db

    def _require_db(self):
        """Return the open database handle, or raise if ``init()`` is not active.

        Raises:
            RuntimeError: If no handle is open. An explicit raise is used
                because an ``assert`` would be stripped under ``python -O``.
        """
        if self.db is None:
            raise RuntimeError(f"{self.name} is not initialized; use it inside `with init():`")
        return self.db

    @contextmanager
    def init(self):
        """Open the database for the duration of the ``with`` block.

        ``self.db`` holds the handle inside the block and is reset to ``None``
        on exit, including when the block raises.
        """
        import logosdb as _logosdb

        distance = self.case_config.parse_metric()
        self.db = _logosdb.DB(self.uri, dim=self.dim, distance=distance)
        try:
            yield
        finally:
            # Rebinding drops this object's reference to the handle; a
            # separate `del` beforehand is unnecessary.
            self.db = None

    def insert_embeddings(
        self,
        embeddings: Iterable[list[float]],
        metadata: list[int],
        **kwargs,
    ) -> tuple[int, Exception | None]:
        """Insert a batch of vectors, storing each metadata id as the row's text.

        Args:
            embeddings: Vectors to insert; converted to a float32 array.
            metadata: Ids for the vectors; each is stored as ``str(id)``.
            **kwargs: Accepted and ignored.

        Returns:
            ``(len(metadata), None)`` on success, or ``(0, error)`` when the
            insert raised.

        Raises:
            RuntimeError: If called outside ``init()``.
        """
        db = self._require_db()
        try:
            vectors = np.array(list(embeddings), dtype=np.float32)
            # Ids travel as text and are parsed back to int in search_embedding.
            texts = [str(m) for m in metadata]
            db.put_batch(vectors, texts=texts)
            return len(metadata), None
        except Exception as e:
            # Failures are reported through the return value rather than raised.
            log.warning(f"{self.name} insert_embeddings error: {e}")
            return 0, e

    def search_embedding(
        self,
        query: list[float],
        k: int = 100,
        filters: dict | None = None,
        timeout: int | None = None,
    ) -> list[int]:
        """Return the ids of the top ``k`` hits for ``query``.

        Args:
            query: Query vector; converted to a float32 array.
            k: Number of hits requested.
            filters: Accepted for interface compatibility; not used here.
            timeout: Accepted for interface compatibility; not used here.

        Returns:
            The hits' stored text values converted back to ``int``.

        Raises:
            RuntimeError: If called outside ``init()``.
        """
        db = self._require_db()
        q = np.array(query, dtype=np.float32)
        hits = db.search(q, top_k=k)
        return [int(h.text) for h in hits]

    def optimize(self, data_size: int | None = None):
        """Log that no explicit optimize step is needed; nothing else is done."""
        log.info(f"{self.name} optimize: HNSW index is built incrementally, no explicit step needed")
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,6 +14,7 @@ | |
| from ..backend.clients.endee.cli import Endee | ||
| from ..backend.clients.hologres.cli import HologresHGraph | ||
| from ..backend.clients.lancedb.cli import LanceDB | ||
| from ..backend.clients.logosdb.cli import LogosDB | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
must-change: this import placement fails ruff
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
fixed here, possibly: 812eea6
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
pls review again or merge @XuanYang-cn |
||
| from ..backend.clients.lindorm.cli import LindormHNSW, LindormIVFBQ, LindormIVFPQ | ||
| from ..backend.clients.mariadb.cli import MariaDBHNSW | ||
| from ..backend.clients.memorydb.cli import MemoryDB | ||
|
|
@@ -97,6 +98,7 @@ | |
| cli.add_command(PolarDBHNSWPQ) | ||
| cli.add_command(PolarDBHNSWSQ) | ||
| cli.add_command(SeekDBHNSW) | ||
| cli.add_command(LogosDB) | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
must-change:
`LogosDB` inherits `search_concurrent=True` from `CommonTypedDict`, but LogosDB documents one DB directory as single-process, while VDBBench concurrent search starts multiple `ProcessPoolExecutor` workers against the same `--uri`. The default command can fail or report invalid concurrent-search results after loading. Set `parameters["search_concurrent"] = False` or reject `--search-concurrent` for LogosDB until a supported single-process concurrent runner exists.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for catching this. Fixed in the latest commit by hard-setting `parameters["search_concurrent"] = False` in the CLI handler.

Quick note: I did test multi-process concurrent reads empirically (4 `Pool` workers opening the same DB path and running 50 searches each) and all succeeded without errors (LogosDB's memory-mapped storage appears safe for concurrent readers). That said, since the official docs declare it single-process, disabling concurrent search is the right conservative call for now. Can revisit if/when LogosDB formally documents multi-reader support.

Fixed here: b932872