From fee90ddd9dc35709352e11b28f03219fd587168b Mon Sep 17 00:00:00 2001 From: jack Date: Wed, 26 Feb 2025 14:41:10 +0800 Subject: [PATCH] add support for the single document and custom chunks method --- lightrag/lightrag.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 46638243..b1d347e1 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -487,7 +487,7 @@ class LightRAG: input: str | list[str], split_by_character: str | None = None, split_by_character_only: bool = False, - ids: list[str] | None = None, + ids: str | list[str] | None = None, ) -> None: """Sync Insert documents with checkpoint support @@ -496,7 +496,7 @@ class LightRAG: split_by_character: if split_by_character is not None, split the string by character, if chunk longer than split_by_character_only: if split_by_character_only is True, split the string by character only, when split_by_character is None, this parameter is ignored. 
- ids: list of unique document IDs, if not provided, MD5 hash IDs will be generated + ids: a single document ID string or a list of unique document IDs; if not provided, MD5 hash IDs will be generated """ loop = always_get_an_event_loop() loop.run_until_complete( @@ -508,7 +508,7 @@ class LightRAG: input: str | list[str], split_by_character: str | None = None, split_by_character_only: bool = False, - ids: list[str] | None = None, + ids: str | list[str] | None = None, ) -> None: """Async Insert documents with checkpoint support @@ -524,12 +524,12 @@ class LightRAG: split_by_character, split_by_character_only ) - def insert_custom_chunks(self, full_text: str, text_chunks: list[str]) -> None: + def insert_custom_chunks(self, full_text: str, text_chunks: list[str], doc_id: str | None = None) -> None: loop = always_get_an_event_loop() - loop.run_until_complete(self.ainsert_custom_chunks(full_text, text_chunks)) + loop.run_until_complete(self.ainsert_custom_chunks(full_text, text_chunks, doc_id)) async def ainsert_custom_chunks( - self, full_text: str, text_chunks: list[str] + self, full_text: str, text_chunks: list[str], doc_id: str | None = None ) -> None: update_storage = False try: @@ -538,7 +538,10 @@ class LightRAG: text_chunks = [self.clean_text(chunk) for chunk in text_chunks] # Process cleaned texts - doc_key = compute_mdhash_id(full_text, prefix="doc-") + if doc_id is None: + doc_key = compute_mdhash_id(full_text, prefix="doc-") + else: + doc_key = doc_id new_docs = {doc_key: {"content": full_text}} _add_doc_keys = await self.full_docs.filter_keys({doc_key}) @@ -594,6 +597,8 @@ class LightRAG: """ if isinstance(input, str): input = [input] + if isinstance(ids, str): + ids = [ids] # 1. Validate ids if provided or generate MD5 hash IDs if ids is not None: