From f8779cb193986e8f615c496cc1ce1d06b2c45cb8 Mon Sep 17 00:00:00 2001
From: Yannick Stephan
Date: Sun, 9 Feb 2025 14:32:48 +0100
Subject: [PATCH] updated naming

---
 lightrag/lightrag.py | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 985fd329..5d608208 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -396,7 +396,7 @@ class LightRAG:
             split_by_character is None, this parameter is ignored.
         """
-        await self.apipeline_process_documents(string_or_strings)
-        await self.apipeline_process_chunks(split_by_character, split_by_character_only)
+        await self.apipeline_enqueue_documents(string_or_strings)
+        await self.apipeline_process_enqueue_documents(split_by_character, split_by_character_only)
 
     def insert_custom_chunks(self, full_text: str, text_chunks: list[str]):
         loop = always_get_an_event_loop()
@@ -465,7 +465,7 @@ class LightRAG:
         if update_storage:
             await self._insert_done()
 
-    async def apipeline_process_documents(self, string_or_strings: str | list[str]):
+    async def apipeline_enqueue_documents(self, string_or_strings: str | list[str]):
         """Pipeline process documents
 
         1. Remove duplicate contents from the list
@@ -505,7 +505,7 @@ class LightRAG:
             logger.info("All documents have been processed or are duplicates")
             return
 
-        # 4. Store original document
+        # 4. Store new document statuses
         await self.doc_status.upsert(new_docs)
         logger.info(f"Stored {len(new_docs)} new unique documents")
 
@@ -529,23 +529,21 @@ class LightRAG:
 
         return to_process_doc_keys
 
-    async def apipeline_process_chunks(
+    async def apipeline_process_enqueue_documents(
         self,
         split_by_character: str | None = None,
         split_by_character_only: bool = False,
     ) -> None:
-        """Pipeline process chunks
-
-        1. Get pending documents
-        2. Split documents into chunks
-        3. Insert chunks
-
-        Args:
-            split_by_character (str | None): If not None, split the string by character, if chunk longer than
-            chunk_size, split the sub chunk by token size.
-            split_by_character_only (bool): If split_by_character_only is True, split the string by character only,
-            when split_by_character is None, this parameter is ignored.
-        """
+        """
+        Process pending documents by splitting them into chunks, running
+        entity and relation extraction on each chunk, and updating the
+        document status.
+
+        1. Get all pending and failed documents
+        2. Split document content into chunks
+        3. Process each chunk for entity and relation extraction
+        4. Update the document status
+        """
         # 1. get all pending and failed documents
         pending_doc_ids = await self._get_pending_documents()
 
@@ -612,10 +610,11 @@ class LightRAG:
                 self.full_docs.upsert({doc_id: {"content": status_doc["content"]}})
             )
 
-            # check if chunks already processed the doc
+            # Check if chunks already processed the doc
             if doc_id not in text_chunks_processed_doc_ids:
                 tasks[doc_id].append(self.text_chunks.upsert(chunks))
 
+        # Process each document's tasks (text chunks and full docs) in parallel
         for doc_id, task in tasks.items():
             try:
                 await asyncio.gather(*task)
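
Reviewer note: a minimal sketch of how the two renamed entry points compose after this patch. Only the two `apipeline_*` method names and their parameters come from the diff; the `from lightrag import LightRAG` import path and the `working_dir`-only constructor are assumptions for brevity (a real instance also needs LLM and embedding functions configured).

```python
import asyncio

from lightrag import LightRAG  # assumed public import path

async def main() -> None:
    # Constructor arguments are illustrative; real setups also pass
    # LLM and embedding functions.
    rag = LightRAG(working_dir="./rag_storage")

    # Phase 1 (renamed from apipeline_process_documents): de-duplicate
    # the inputs and enqueue them as status documents.
    await rag.apipeline_enqueue_documents(["first document", "second document"])

    # Phase 2 (renamed from apipeline_process_chunks): chunk the pending
    # and failed documents, run entity/relation extraction per chunk,
    # and update each document's status.
    await rag.apipeline_process_enqueue_documents(
        split_by_character=None,        # optionally split on a character first
        split_by_character_only=False,  # ignored when split_by_character is None
    )

asyncio.run(main())
```

Keeping enqueueing separate from processing lets callers batch ingestion and retry failed documents without re-inserting them, which is what the new names are meant to signal.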