From 845e914f1bcc8d8cd1543146b319f31318d359a6 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Sun, 23 Feb 2025 15:46:47 +0800
Subject: [PATCH] fix: make ids parameter optional and optimize input text cleaning

- Add default None value for ids parameter
- Move text cleaning into else branch
- Only clean text when auto-generating ids
- Preserve original text with custom ids
- Improve code readability
---
 lightrag/lightrag.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 67ef3aab..efc49c2a 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -577,7 +577,7 @@ class LightRAG:
         await self._insert_done()
 
     async def apipeline_enqueue_documents(
-        self, input: str | list[str], ids: list[str] | None
+        self, input: str | list[str], ids: list[str] | None = None
     ) -> None:
         """
         Pipeline for Processing Documents
@@ -591,9 +591,6 @@ class LightRAG:
         if isinstance(input, str):
             input = [input]
 
-        # Clean input text and remove duplicates
-        input = list(set(self.clean_text(doc) for doc in input))
-
         # 1. Validate ids if provided or generate MD5 hash IDs
         if ids is not None:
             # Check if the number of IDs matches the number of documents
@@ -607,6 +604,8 @@ class LightRAG:
             # Generate contents dict of IDs provided by user and documents
             contents = {id_: doc for id_, doc in zip(ids, input)}
         else:
+            # Clean input text and remove duplicates
+            input = list(set(self.clean_text(doc) for doc in input))
             # Generate contents dict of MD5 hash IDs and documents
             contents = {compute_mdhash_id(doc, prefix="doc-"): doc for doc in input}
 