fix: make ids parameter optional and optimize input text cleaning

- Add default None value for ids parameter
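- Move text cleaning and de-duplication into the else branch
- Only clean and de-duplicate text when auto-generating ids
- Preserve original text, count, and order when custom ids are provided, so each id stays paired with its own document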
- Improve code readability
This commit is contained in:
yangdx
2025-02-23 15:46:47 +08:00
parent 411782797b
commit 845e914f1b

View File

@@ -577,7 +577,7 @@ class LightRAG:
await self._insert_done()
async def apipeline_enqueue_documents(
self, input: str | list[str], ids: list[str] | None
self, input: str | list[str], ids: list[str] | None = None
) -> None:
"""
Pipeline for Processing Documents
@@ -591,9 +591,6 @@ class LightRAG:
if isinstance(input, str):
input = [input]
# Clean input text and remove duplicates
input = list(set(self.clean_text(doc) for doc in input))
# 1. Validate ids if provided or generate MD5 hash IDs
if ids is not None:
# Check if the number of IDs matches the number of documents
@@ -607,6 +604,8 @@ class LightRAG:
# Generate contents dict of IDs provided by user and documents
contents = {id_: doc for id_, doc in zip(ids, input)}
else:
# Clean input text and remove duplicates
input = list(set(self.clean_text(doc) for doc in input))
# Generate contents dict of MD5 hash IDs and documents
contents = {compute_mdhash_id(doc, prefix="doc-"): doc for doc in input}