From d70d7ff20e875402b2259a9614ad29b465b8eb1a Mon Sep 17 00:00:00 2001
From: Yannick Stephan
Date: Sun, 9 Feb 2025 20:05:59 +0100
Subject: [PATCH] added at call check

---
 lightrag/lightrag.py | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index bf395e29..bf03447e 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -538,16 +538,6 @@ class LightRAG:
             logger.info("All documents have been processed or are duplicates")
             return

-        to_process_docs_ids = set(to_process_docs.keys())
-
-        # Get allready processed documents (text chunks and full docs)
-        text_chunks_processed_doc_ids = await self.text_chunks.filter_keys(
-            to_process_docs_ids
-        )
-        full_docs_processed_doc_ids = await self.full_docs.filter_keys(
-            to_process_docs_ids
-        )
-
         # 2. split docs into chunks, insert chunks, update doc status
         batch_size = self.addon_params.get("insert_batch_size", 10)
         batch_docs_list = [
@@ -597,14 +587,15 @@ class LightRAG:
                     await self._process_entity_relation_graph(chunks)

                     tasks[id_doc] = []
+
                     # Check if document already processed the doc
-                    if id_doc not in full_docs_processed_doc_ids:
+                    if await self.full_docs.get_by_id(id_doc) is None:
                         tasks[id_doc].append(
                             self.full_docs.upsert({id_doc: {"content": status_doc.content}})
                         )

                     # Check if chunks already processed the doc
-                    if id_doc not in text_chunks_processed_doc_ids:
+                    if await self.text_chunks.get_by_id(id_doc) is None:
                         tasks[id_doc].append(self.text_chunks.upsert(chunks))

                     # Process document (text chunks and full docs) in parallel
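
Note on the change: the removed block pre-fetched the sets of already-processed ids with filter_keys before the batch loop; the replacement asks the storage backend at the point of use, via get_by_id, whether the document is already stored before scheduling an upsert. The sketch below illustrates that check-at-call-time pattern in isolation; KVStore and enqueue_doc are hypothetical stand-ins written for this example, not LightRAG's actual storage classes.

import asyncio


class KVStore:
    """Toy in-memory async KV store; get_by_id and upsert mirror the calls used in the diff."""

    def __init__(self):
        self._data: dict = {}

    async def get_by_id(self, key: str):
        return self._data.get(key)

    async def upsert(self, data: dict):
        self._data.update(data)


async def enqueue_doc(doc_id: str, content: str, full_docs: KVStore) -> bool:
    # Check at call time instead of consulting a pre-fetched set of processed ids.
    if await full_docs.get_by_id(doc_id) is None:
        await full_docs.upsert({doc_id: {"content": content}})
        return True   # newly inserted
    return False      # already present, skipped


async def main():
    store = KVStore()
    print(await enqueue_doc("doc-1", "hello world", store))  # True: first insert
    print(await enqueue_doc("doc-1", "hello world", store))  # False: skipped on re-run


asyncio.run(main())

The trade-off mirrored here is one extra lookup per document at write time in exchange for not materializing the full set of processed ids up front.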