From 4acf92dfd97b665d2e498a8289b205d10ecf00ba Mon Sep 17 00:00:00 2001
From: Yannick Stephan
Date: Sun, 9 Feb 2025 11:35:31 +0100
Subject: [PATCH] cleaned code

---
 lightrag/lightrag.py | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 819a33e1..f33427cf 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -487,14 +487,13 @@ class LightRAG:
         }
 
         # 3. Filter out already processed documents
-        _add_doc_keys: set[str] = set()
+        add_doc_keys: set[str] = set()
         for doc_id in new_docs.keys():
             current_doc = await self.doc_status.get_by_id(doc_id)
 
-            if not current_doc or current_doc["status"] == DocStatus.FAILED:
-                _add_doc_keys.add(doc_id)
+            add_doc_keys.add(doc_id)
 
-        new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
+        new_docs = {k: v for k, v in new_docs.items() if k in add_doc_keys}
 
         if not new_docs:
             logger.info("All documents have been processed or are duplicates")
@@ -503,7 +502,7 @@ class LightRAG:
         # 4. Store original document
         for doc_id, doc in new_docs.items():
             await self.full_docs.upsert(
-                {doc_id: {"content": doc["content"], "status": DocStatus.PENDING}}
+                {doc_id: doc}
             )
         logger.info(f"Stored {len(new_docs)} new unique documents")
 
@@ -610,7 +609,23 @@ class LightRAG:
             continue
 
     async def apipeline_process_extract_graph(self):
-        """Get pendding or failed chunks, extract entities and relationships from each chunk"""
+        """
+        Process pending or failed chunks to extract entities and relationships.
+
+        This method retrieves all chunks that are currently marked as pending or have previously failed.
+        It then extracts entities and relationships from each chunk and updates the status accordingly.
+
+        Steps:
+        1. Retrieve all pending and failed chunks.
+        2. For each chunk, attempt to extract entities and relationships.
+        3. Update the chunk's status to processed if successful, or failed if an error occurs.
+
+        Raises:
+            Exception: If there is an error during the extraction process.
+
+        Returns:
+            None
+        """
         # 1. get all pending and failed chunks
         to_process_doc_keys: list[str] = []
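
Below is a minimal, self-contained sketch of the pending/failed retry loop that the new docstring describes: fetch everything still pending or previously failed, attempt extraction, then record success or failure. The `Status` enum, the in-memory `statuses` dict, and `extract_entities` are hypothetical stand-ins for illustration, not LightRAG's actual storage or extraction APIs.

```python
import asyncio
from enum import Enum


class Status(Enum):
    # Hypothetical stand-in for a chunk/document status enum.
    PENDING = "pending"
    PROCESSED = "processed"
    FAILED = "failed"


async def extract_entities(chunk: str) -> list[str]:
    # Hypothetical extraction step; the real pipeline would call an LLM here.
    return [word.strip(".,") for word in chunk.split() if word.istitle()]


async def process_pending_chunks(chunks: dict[str, str], statuses: dict[str, Status]) -> None:
    # 1. Retrieve all pending and failed chunks.
    to_process = [k for k, s in statuses.items() if s in (Status.PENDING, Status.FAILED)]

    # 2./3. Extract from each chunk and mark it processed or failed.
    for chunk_id in to_process:
        try:
            entities = await extract_entities(chunks[chunk_id])
            statuses[chunk_id] = Status.PROCESSED
            print(f"{chunk_id}: extracted {entities}")
        except Exception as exc:
            statuses[chunk_id] = Status.FAILED
            print(f"{chunk_id}: failed ({exc})")


if __name__ == "__main__":
    chunks = {"doc-1#0": "Alice met Bob in Paris.", "doc-1#1": "They founded Acme Corp."}
    statuses = {chunk_id: Status.PENDING for chunk_id in chunks}
    asyncio.run(process_pending_chunks(chunks, statuses))
```

Because failed items stay visible to the next run, the loop is idempotent across retries, which is the property the docstring's step 3 relies on.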